[med-svn] [plink2] 02/03: Imported Upstream version 2.00~a-170717

Dylan Aïssi bob.dybian-guest at moszumanska.debian.org
Tue Jul 18 21:35:55 UTC 2017


This is an automated email from the git hooks/post-receive script.

bob.dybian-guest pushed a commit to branch master
in repository plink2.

commit 9bddd9cc7752d2d055aa39b95d8c5dd85cff62fd
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date:   Tue Jul 18 23:32:10 2017 +0200

    Imported Upstream version 2.00~a-170717
---
 LICENSE                               |   674 ++
 Python/ReadMe.md                      |     6 +
 Python/pgenlib.pyx                    |  1405 +++
 Python/python_api.txt                 |   216 +
 Python/setup.py                       |    26 +
 SFMT.c                                |   591 ++
 SFMT.h                                |   330 +
 bgzf.c                                |  1195 +++
 bgzf.h                                |   323 +
 build_dynamic/Makefile                |   128 +
 build_win/Makefile                    |    70 +
 hfile.c                               |   584 ++
 hfile.h                               |   216 +
 hfile_internal.h                      |    76 +
 hts.h                                 |   456 +
 hts_defs.h                            |    56 +
 khash.h                               |   621 ++
 pgen_compress.cpp                     |   242 +
 pgenlib_internal.cpp                  |  9604 +++++++++++++++++++
 pgenlib_internal.h                    |  2269 +++++
 pgenlib_python_support.cpp            |   501 +
 pgenlib_python_support.h              |    75 +
 plink2.cpp                            |  7728 ++++++++++++++++
 plink2_adjust.cpp                     |    41 +
 plink2_adjust.h                       |    61 +
 plink2_common.cpp                     |  6459 +++++++++++++
 plink2_common.h                       |  2508 +++++
 plink2_compress_stream.cpp            |   128 +
 plink2_compress_stream.h              |   103 +
 plink2_data.cpp                       | 15907 ++++++++++++++++++++++++++++++++
 plink2_data.h                         |   176 +
 plink2_decompress.cpp                 |   194 +
 plink2_decompress.h                   |    89 +
 plink2_filter.cpp                     |  2884 ++++++
 plink2_filter.h                       |    98 +
 plink2_glm.cpp                        |  6494 +++++++++++++
 plink2_glm.h                          |   112 +
 plink2_help.cpp                       |  1717 ++++
 plink2_help.h                         |    35 +
 plink2_ld.cpp                         |  1488 +++
 plink2_ld.h                           |    52 +
 plink2_matrix.cpp                     |  1256 +++
 plink2_matrix.h                       |   178 +
 plink2_matrix_calc.cpp                |  4221 +++++++++
 plink2_matrix_calc.h                  |   145 +
 plink2_misc.cpp                       |  3317 +++++++
 plink2_misc.h                         |   202 +
 plink2_psam.cpp                       |  1416 +++
 plink2_psam.h                         |    91 +
 plink2_pvar.cpp                       |  1584 ++++
 plink2_pvar.h                         |    76 +
 plink2_random.cpp                     |   118 +
 plink2_random.h                       |    46 +
 plink2_set.cpp                        |   408 +
 plink2_set.h                          |    39 +
 plink2_stats.cpp                      |  2102 +++++
 plink2_stats.h                        |    61 +
 zstd/lib/common/bitstream.h           |   417 +
 zstd/lib/common/entropy_common.c      |   221 +
 zstd/lib/common/entropy_common.o      |   Bin 0 -> 2816 bytes
 zstd/lib/common/error_private.c       |    43 +
 zstd/lib/common/error_private.h       |    76 +
 zstd/lib/common/error_private.o       |   Bin 0 -> 1868 bytes
 zstd/lib/common/fse.h                 |   694 ++
 zstd/lib/common/fse_decompress.c      |   328 +
 zstd/lib/common/fse_decompress.o      |   Bin 0 -> 5492 bytes
 zstd/lib/common/huf.h                 |   260 +
 zstd/lib/common/mem.h                 |   372 +
 zstd/lib/common/pool.c                |   194 +
 zstd/lib/common/pool.h                |    56 +
 zstd/lib/common/threading.c           |    80 +
 zstd/lib/common/threading.h           |   104 +
 zstd/lib/common/xxhash.c              |   869 ++
 zstd/lib/common/xxhash.h              |   305 +
 zstd/lib/common/xxhash.o              |   Bin 0 -> 6136 bytes
 zstd/lib/common/zstd_common.c         |    73 +
 zstd/lib/common/zstd_common.o         |   Bin 0 -> 1832 bytes
 zstd/lib/common/zstd_errors.h         |    74 +
 zstd/lib/common/zstd_internal.h       |   283 +
 zstd/lib/compress/fse_compress.c      |   857 ++
 zstd/lib/compress/fse_compress.o      |   Bin 0 -> 12312 bytes
 zstd/lib/compress/huf_compress.c      |   684 ++
 zstd/lib/compress/huf_compress.o      |   Bin 0 -> 12560 bytes
 zstd/lib/compress/zstd_compress.c     |  3387 +++++++
 zstd/lib/compress/zstd_compress.o     |   Bin 0 -> 200156 bytes
 zstd/lib/compress/zstd_opt.h          |   919 ++
 zstd/lib/compress/zstdmt_compress.c   |   739 ++
 zstd/lib/compress/zstdmt_compress.h   |    78 +
 zstd/lib/decompress/huf_decompress.c  |   885 ++
 zstd/lib/decompress/huf_decompress.o  |   Bin 0 -> 20100 bytes
 zstd/lib/decompress/zstd_decompress.c |  2480 +++++
 zstd/lib/decompress/zstd_decompress.o |   Bin 0 -> 34896 bytes
 zstd/lib/zstd.h                       |   765 ++
 zstd/zlibWrapper/BUCK                 |    22 +
 zstd/zlibWrapper/Makefile             |   107 +
 zstd/zlibWrapper/README.md            |   163 +
 zstd/zlibWrapper/example              |   Bin 0 -> 518704 bytes
 zstd/zlibWrapper/fitblk               |   Bin 0 -> 503724 bytes
 zstd/zlibWrapper/gzclose.c            |    28 +
 zstd/zlibWrapper/gzclose.o            |   Bin 0 -> 712 bytes
 zstd/zlibWrapper/gzcompatibility.h    |    67 +
 zstd/zlibWrapper/gzguts.h             |   227 +
 zstd/zlibWrapper/gzlib.c              |   640 ++
 zstd/zlibWrapper/gzlib.o              |   Bin 0 -> 5716 bytes
 zstd/zlibWrapper/gzread.c             |   670 ++
 zstd/zlibWrapper/gzread.o             |   Bin 0 -> 6608 bytes
 zstd/zlibWrapper/gzwrite.c            |   668 ++
 zstd/zlibWrapper/gzwrite.o            |   Bin 0 -> 7072 bytes
 zstd/zlibWrapper/minigzip             |   Bin 0 -> 518696 bytes
 zstd/zlibWrapper/zstd_zlibwrapper.c   |  1082 +++
 zstd/zlibWrapper/zstd_zlibwrapper.h   |    91 +
 zstd/zlibWrapper/zstd_zlibwrapper.o   |   Bin 0 -> 19464 bytes
 zstd/zlibWrapper/zwrapbench           |   Bin 0 -> 526244 bytes
 113 files changed, 100206 insertions(+)

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/Python/ReadMe.md b/Python/ReadMe.md
new file mode 100644
index 0000000..97f2e2c
--- /dev/null
+++ b/Python/ReadMe.md
@@ -0,0 +1,6 @@
+This provides a basic Python API for pgenlib; see python_api.txt for details.
+Cython and NumPy must be installed.
+
+Build this with e.g.:
+  python setup.py build_ext
+  [sudo] python setup.py install
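
For orientation, here is a minimal usage sketch of the Python API imported
above, based only on what is visible in the pgenlib.pyx excerpt below: the
PgenReader constructor takes a bytes filename plus optional raw_sample_ct,
variant_ct, and sample_subset arguments, and get_raw_sample_ct() reports the
sample count from the file header. The filename "example.pgen" is a
placeholder, and the per-variant read methods (documented in python_api.txt)
are not part of this excerpt, so none are called here.

    import numpy as np
    import pgenlib

    # restrict reads to samples 0, 5, and 7; indices must be 0-based uint32
    # and strictly increasing (enforced by set_sample_subset_internal below)
    subset = np.array([0, 5, 7], dtype=np.uint32)
    reader = pgenlib.PgenReader(b"example.pgen", sample_subset=subset)
    print(reader.get_raw_sample_ct())  # samples in the file, not the subset

The class also defines __enter__, so with-statement usage appears to be
intended, though __exit__ falls outside this excerpt.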
diff --git a/Python/pgenlib.pyx b/Python/pgenlib.pyx
new file mode 100644
index 0000000..a3860e5
--- /dev/null
+++ b/Python/pgenlib.pyx
@@ -0,0 +1,1405 @@
+# from libc.stdlib cimport malloc, free
+from libc.stdint cimport int64_t, uintptr_t, uint32_t, int32_t, uint16_t, uint8_t, int8_t
+from cpython.mem cimport PyMem_Malloc, PyMem_Free
+# from cpython.view cimport array as cvarray
+import numpy as np
+cimport numpy as np
+import sys
+
+cdef extern from "../pgenlib_python_support.h":
+    # macros aren't namespaced
+    uintptr_t DIV_UP(uintptr_t val, uintptr_t divisor)
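+    # DIV_UP(val, divisor) is ceiling division: (val + divisor - 1) / divisor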
+
+cdef extern from "../pgenlib_python_support.h" namespace "plink2":
+    ctypedef uint32_t boolerr_t
+    ctypedef enum pglerr_t:
+        kPglRetSuccess
+        kPglRetSkipped
+        kPglRetNomem
+        kPglRetOpenFail
+        kPglRetReadFail
+        kPglRetWriteFail
+        kPglRetMalformedInput
+        kPglRetInconsistentInput
+        kPglRetInvalidCmdline
+        kPglRetHelp
+        kPglRetThreadCreateFail
+        kPglRetNetworkFail
+        kPglRetSampleMajorBed
+        kPglRetImproperFunctionCall
+        kPglRetNotYetSupported
+        kPglRetLongLine
+        kPglRetEmptyFile
+
+    boolerr_t cachealigned_malloc(uintptr_t size, void* aligned_pp)
+    void aligned_free(void* aligned_ptr)
+
+    void fill_ulong_zero(uintptr_t entry_ct, uintptr_t* ularr)
+    # void bitvec_and(const uintptr_t* arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec)
+    void fill_interleaved_mask_vec(const uintptr_t* subset_mask, uint32_t base_vec_ct, uintptr_t* interleaved_mask_vec)
+    void fill_cumulative_popcounts(const uintptr_t* subset_mask, uint32_t word_ct, uint32_t* cumulative_popcounts)
+
+    ctypedef uintptr_t vul_t
+    void transpose_quaterblock(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, vul_t* vecaligned_buf)
+    void transpose_bitblock(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, vul_t* vecaligned_buf)
+
+    void genovec_invert_unsafe(uint32_t sample_ct, uintptr_t* genovec)
+    void biallelic_dosage16_invert(uint32_t dosage_ct, uint16_t* dosage_vals)
+
+    void genoarr_to_bytes_minus9(const uintptr_t* genoarr, uint32_t sample_ct, int8_t* genobytes)
+    void genoarr_to_int32s_minus9(const uintptr_t* genoarr, uint32_t sample_ct, int32_t* geno_int32)
+    void genoarr_to_int64s_minus9(const uintptr_t* genoarr, uint32_t sample_ct, int64_t* geno_int64)
+    void genoarr_to_allele_codes(const uintptr_t* genoarr, uint32_t sample_ct, int32_t* allele_codes)
+    void genoarr_phased_to_allele_codes(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, uint32_t sample_ct, uint32_t phasepresent_ct, unsigned char* phasebytes, int32_t* allele_codes)
+    void genoarr_phased_to_hap_codes(const uintptr_t* genoarr, const uintptr_t* phaseinfo, uint32_t variant_batch_size, int32_t* hap0_codes_iter, int32_t* hap1_codes_iter)
+    void dosage16_to_floats_minus9(const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_vals, uint32_t sample_ct, uint32_t dosage_ct, float* geno_float)
+    void dosage16_to_doubles_minus9(const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_vals, uint32_t sample_ct, uint32_t dosage_ct, double* geno_double)
+    void bytes_to_bits_unsafe(const uint8_t* boolbytes, uint32_t sample_ct, uintptr_t* bitarr)
+    void bytes_to_genoarr_unsafe(const int8_t* genobytes, uint32_t sample_ct, uintptr_t* genoarr)
+    void allele_codes_to_genoarr_unsafe(const int32_t* allele_codes, const unsigned char* phasepresent_bytes, uint32_t sample_ct, uintptr_t* genoarr, uintptr_t* phasepresent, uintptr_t* phaseinfo)
+    void floats_to_dosage16(const float* floatarr, uint32_t sample_ct, uint32_t hard_call_halfdist, uintptr_t* genoarr, uintptr_t* dosage_present, uint16_t* dosage_vals, uint32_t* dosage_ct_ptr)
+    void doubles_to_dosage16(const double* doublearr, uint32_t sample_ct, uint32_t hard_call_halfdist, uintptr_t* genoarr, uintptr_t* dosage_present, uint16_t* dosage_vals, uint32_t* dosage_ct_ptr)
+
+    cdef enum:
+        k1LU
+    cdef enum:
+        kCacheline
+    cdef enum:
+        kBitsPerWord
+    cdef enum:
+        kBitsPerWordD2
+    cdef enum:
+        kBitsPerVec
+    cdef enum:
+        kBitsPerCacheline
+    cdef enum:
+        kQuatersPerVec
+    cdef enum:
+        kQuatersPerCacheline
+    cdef enum:
+        kBytesPerVec
+    cdef enum:
+        kInt32PerVec
+    cdef enum:
+        kInt32PerCacheline
+    cdef enum:
+        kWordsPerVec
+    cdef enum:
+        kPglErrstrBufBlen
+
+    cdef enum:
+        kPglQuaterTransposeBatch
+    cdef enum:
+        kPglQuaterTransposeWords
+    cdef enum:
+        kPglQuaterTransposeBufbytes
+    cdef enum:
+        kPglBitTransposeBufbytes
+
+    ctypedef uint32_t pgen_global_flags_t
+    cdef enum:
+        kfPgenGlobal0
+    cdef enum:
+        kfPgenGlobalHardcallPhasePresent
+    cdef enum:
+        kfPgenGlobalDosagePresent
+
+    cdef cppclass pgen_file_info_t:
+        uint32_t raw_variant_ct
+        uint32_t raw_sample_ct        
+        unsigned char* vrtypes
+        uint32_t gflags
+
+    ctypedef uint32_t pgen_header_ctrl_t
+
+    void pgfi_preinit(pgen_file_info_t* pgfip)
+    
+    pglerr_t pgfi_init_phase1(const char* fname, uint32_t raw_variant_ct, uint32_t raw_sample_ct, uint32_t use_mmap, pgen_header_ctrl_t* header_ctrl_ptr, pgen_file_info_t* pgfip, uintptr_t* pgfi_alloc_cacheline_ct_ptr, char* errstr_buf)
+
+    pglerr_t pgfi_init_phase2(pgen_header_ctrl_t header_ctrl, uint32_t allele_cts_already_loaded, uint32_t nonref_flags_already_loaded, uint32_t use_blockload, uint32_t vblock_idx_start, uint32_t vidx_end, uint32_t* max_vrec_width_ptr, pgen_file_info_t* pgfip, unsigned char* pgfi_alloc, uintptr_t* pgr_alloc_cacheline_ct_ptr, char* errstr_buf)
+    
+    cdef cppclass pgen_reader_t:
+        pgen_file_info_t fi
+        unsigned char* fread_buf
+
+    void pgr_preinit(pgen_reader_t* pgrp)
+
+    pglerr_t pgr_init(const char* fname, uint32_t max_vrec_width, pgen_file_info_t* pgfip, pgen_reader_t* pgrp, unsigned char* pgr_alloc)
+
+    pglerr_t pgr_read_allele_countvec_subset_unsafe(const uintptr_t* sample_include, const uint32_t* sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, pgen_reader_t* pgrp, uintptr_t* allele_countvec)
+
+    pglerr_t pgr_read_refalt1_genovec_hphase_subset_unsafe(const uintptr_t* sample_include, const uint32_t* sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* genovec, uintptr_t* phasepresent, uintptr_t* phaseinfo, uint32_t* phasepresent_ct_ptr)
+
+    pglerr_t pgr_read_refalt1_genovec_dosage16_subset_unsafe(const uintptr_t* sample_include, const uint32_t* sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* genovec, uintptr_t* dosage_present, uint16_t* dosage_vals, uint32_t* dosage_ct_ptr, uint32_t* is_explicit_alt1_ptr)
+    
+    pglerr_t pgr_get_refalt1_genotype_counts(const uintptr_t* sample_include, const uintptr_t* sample_include_interleaved_vec, const uint32_t* sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uint32_t* genocounts)
+    
+    boolerr_t pgfi_cleanup(pgen_file_info_t* pgfip)
+    boolerr_t pgr_cleanup(pgen_reader_t* pgrp)
+
+    cdef cppclass pgen_writer_common_t:
+        uint32_t variant_ct
+        uint32_t sample_ct
+        uintptr_t* allele_idx_offsets
+        uint32_t vidx
+
+    cdef cppclass st_pgen_writer_t:
+        pgen_writer_common_t pwc
+
+    pglerr_t spgw_init_phase1(const char* fname, uintptr_t* allele_idx_offsets, uintptr_t* explicit_nonref_flags, uint32_t variant_ct, uint32_t sample_ct, pgen_global_flags_t phase_dosage_gflags, uint32_t nonref_flags_storage, st_pgen_writer_t* spgwp, uintptr_t* alloc_cacheline_ct_ptr, uint32_t* max_vrec_len_ptr)
+
+    void spgw_init_phase2(uint32_t max_vrec_len, st_pgen_writer_t* spgwp, unsigned char* spgw_alloc)
+
+    pglerr_t spgw_append_biallelic_genovec(const uintptr_t* genovec, st_pgen_writer_t* spgwp)
+    
+    pglerr_t spgw_append_biallelic_genovec_hphase(const uintptr_t* genovec, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, st_pgen_writer_t* spgwp)
+
+    pglerr_t spgw_append_biallelic_genovec_dosage16(const uintptr_t* genovec, const uintptr_t* dosage_present, const uint16_t* dosage_vals, uint32_t dosage_ct, st_pgen_writer_t* spgwp)
+    
+    pglerr_t spgw_finish(st_pgen_writer_t* spgwp)
+    
+    boolerr_t spgw_cleanup(st_pgen_writer_t* spgwp)
+
+    
+cdef class PgenReader:
+    # todo: nonref_flags, multiallelic variant support
+    cdef pgen_file_info_t* _info_ptr
+    cdef pgen_reader_t* _state_ptr
+    cdef uintptr_t* _subset_include_vec
+    cdef uintptr_t* _subset_include_interleaved_vec
+    cdef uint32_t* _subset_cumulative_popcounts
+    cdef uint32_t _subset_size
+    # preallocate buffers we'll use repeatedly
+    cdef uintptr_t* _genovec
+    cdef uintptr_t* _phasepresent
+    cdef uintptr_t* _phaseinfo
+    cdef uintptr_t* _dosage_present
+    cdef uint16_t* _dosage_vals
+    cdef vul_t* _transpose_batch_buf
+    # for multi-variant load-and-transpose, we load up to
+    # kPglQuaterTransposeBatch (= 256) variants at a time, and then transpose
+    cdef uintptr_t* _multivar_vmaj_geno_buf
+    cdef uintptr_t* _multivar_vmaj_phasepresent_buf
+    cdef uintptr_t* _multivar_vmaj_phaseinfo_buf
+    cdef uintptr_t* _multivar_smaj_geno_batch_buf
+    cdef uintptr_t* _multivar_smaj_phaseinfo_batch_buf
+    cdef uintptr_t* _multivar_smaj_phasepresent_batch_buf
+
+    cdef set_sample_subset_internal(self, np.ndarray[np.uint32_t,mode="c",ndim=1] sample_subset):
+        cdef uint32_t raw_sample_ct = self._info_ptr[0].raw_sample_ct
+        cdef uint32_t raw_sample_ctv = DIV_UP(raw_sample_ct, kBitsPerVec)
+        cdef uint32_t raw_sample_ctaw = raw_sample_ctv * kWordsPerVec
+        cdef uintptr_t* sample_include = self._subset_include_vec
+        fill_ulong_zero(raw_sample_ctaw, sample_include)
+        cdef uint32_t subset_size = sample_subset.size
+        if subset_size == 0:
+            raise RuntimeError("Empty sample_subset is not currently permitted.")
+        # typed as uint32_t so the word-index arithmetic below uses C integer
+        # division rather than Python object semantics
+        cdef uint32_t sample_uidx = sample_subset[0]
+        cdef uint32_t idx = 0
+        cdef uint32_t next_uidx
+        while True:
+            if sample_uidx >= raw_sample_ct:
+                raise RuntimeError("0-based sample idx too large (" + str(sample_uidx) + "; only " + str(raw_sample_ct) + " in file).")
+            sample_include[sample_uidx / kBitsPerWord] |= k1LU << (sample_uidx % kBitsPerWord)
+            idx += 1
+            if idx == subset_size:
+                break
+            next_uidx = sample_subset[idx]
+
+            # prohibit this since it implies that the caller expects genotypes
+            # to be returned in a different order
+            if next_uidx <= sample_uidx:
+                raise RuntimeError("sample_subset is not in strictly increasing order.")
+            
+            sample_uidx = next_uidx
+
+        fill_interleaved_mask_vec(sample_include, raw_sample_ctv, self._subset_include_interleaved_vec)
+        
+        cdef uint32_t raw_sample_ctl = DIV_UP(raw_sample_ct, kBitsPerWord)
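+        # cumulative_popcounts[i] counts the set bits in words 0..i-1 of
+        # sample_include, so a raw sample index can be mapped to its position
+        # within the subset without rescanning the mask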
+        fill_cumulative_popcounts(sample_include, raw_sample_ctl, self._subset_cumulative_popcounts)
+
+        self._subset_size = subset_size
+        return
+
+    
+    def __cinit__(self, bytes filename, object raw_sample_ct = None,
+                  object variant_ct = None, object sample_subset = None):
+        self._info_ptr = <pgen_file_info_t*>PyMem_Malloc(sizeof(pgen_file_info_t))
+        if not self._info_ptr:
+            raise MemoryError()
+        pgfi_preinit(self._info_ptr)
+        # this depends on pgenlib_internal implementation.  could save
+        # pgfi_alloc and pgr_alloc instead.
+        self._info_ptr[0].vrtypes = NULL
+        cdef uint32_t cur_sample_ct = 0xffffffffU
+        if raw_sample_ct is not None:
+            cur_sample_ct = raw_sample_ct
+        cdef uint32_t cur_variant_ct = 0xffffffffU
+        if variant_ct is not None:
+            cur_variant_ct = variant_ct
+        cdef const char* fname = <const char*>filename
+        cdef pgen_header_ctrl_t header_ctrl
+        cdef uintptr_t pgfi_alloc_cacheline_ct
+        cdef char errstr_buf[kPglErrstrBufBlen]
+        if pgfi_init_phase1(fname, cur_variant_ct, cur_sample_ct, 0, &header_ctrl, self._info_ptr, &pgfi_alloc_cacheline_ct, errstr_buf) != kPglRetSuccess:
+            raise RuntimeError(errstr_buf[7:])
+        assert (header_ctrl & 0x30) == 0 # no alt allele counts
+        assert (header_ctrl & 0xc0) != 0xc0 # no explicit nonref_flags
+        cdef uint32_t file_sample_ct = self._info_ptr[0].raw_sample_ct
+        assert file_sample_ct != 0
+        cdef unsigned char* pgfi_alloc = NULL
+        if pgfi_alloc_cacheline_ct != 0:
+            if cachealigned_malloc(pgfi_alloc_cacheline_ct * kCacheline, &pgfi_alloc):
+                raise MemoryError()
+        cdef uint32_t max_vrec_width
+        cdef uintptr_t pgr_alloc_cacheline_ct
+        if pgfi_init_phase2(header_ctrl, 1, 1, 0, 0, self._info_ptr[0].raw_variant_ct, &max_vrec_width, self._info_ptr, pgfi_alloc, &pgr_alloc_cacheline_ct, errstr_buf):
+            if pgfi_alloc and not self._info_ptr[0].vrtypes:
+                aligned_free(pgfi_alloc)
+            raise RuntimeError(errstr_buf[7:])
+
+        self._state_ptr = <pgen_reader_t*>PyMem_Malloc(sizeof(pgen_reader_t))
+        if not self._state_ptr:
+            raise MemoryError()
+        pgr_preinit(self._state_ptr)
+        self._state_ptr[0].fread_buf = NULL
+        cdef uintptr_t pgr_alloc_main_byte_ct = pgr_alloc_cacheline_ct * kCacheline
+        cdef uintptr_t sample_subset_byte_ct = DIV_UP(file_sample_ct, kBitsPerVec) * kBytesPerVec
+        cdef uintptr_t cumulative_popcounts_byte_ct = DIV_UP(file_sample_ct, kBitsPerWord * kInt32PerVec) * kBytesPerVec
+        cdef uintptr_t genovec_byte_ct = DIV_UP(file_sample_ct, kQuatersPerVec) * kBytesPerVec
+        cdef uintptr_t dosage_vals_byte_ct = DIV_UP(file_sample_ct, (2 * kInt32PerVec)) * kBytesPerVec
+        cdef unsigned char* pgr_alloc
+        if cachealigned_malloc(pgr_alloc_main_byte_ct + (2 * kPglQuaterTransposeBatch + 5) * sample_subset_byte_ct + cumulative_popcounts_byte_ct + (1 + kPglQuaterTransposeBatch) * genovec_byte_ct + dosage_vals_byte_ct + kPglBitTransposeBufbytes + 4 * (kPglQuaterTransposeBatch * kPglQuaterTransposeBatch / 8), &pgr_alloc):
+            raise MemoryError()
+        cdef pglerr_t reterr = pgr_init(fname, max_vrec_width, self._info_ptr, self._state_ptr, pgr_alloc)
+        if reterr != kPglRetSuccess:
+            if not self._state_ptr[0].fread_buf:
+                aligned_free(pgr_alloc)
+            raise RuntimeError("pgl_init() error " + str(reterr))
+        cdef unsigned char* pgr_alloc_iter = &(pgr_alloc[pgr_alloc_main_byte_ct])
+        self._subset_include_vec = <uintptr_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[sample_subset_byte_ct])
+        self._subset_include_interleaved_vec = <uintptr_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[sample_subset_byte_ct])
+
+        # assumes kWordsPerVec <= 2
+        self._subset_include_interleaved_vec[-1] = 0
+
+        self._subset_cumulative_popcounts = <uint32_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[cumulative_popcounts_byte_ct])
+        self._genovec = <uintptr_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[genovec_byte_ct])
+        self._phasepresent = <uintptr_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[sample_subset_byte_ct])
+        self._phaseinfo = <uintptr_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[sample_subset_byte_ct])
+        self._dosage_present = <uintptr_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[sample_subset_byte_ct])
+        self._dosage_vals = <uint16_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[dosage_vals_byte_ct])
+        if sample_subset is not None:
+            self.set_sample_subset_internal(sample_subset)
+        else:
+            self._subset_size = file_sample_ct
+        self._transpose_batch_buf = <vul_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[kPglBitTransposeBufbytes])
+        self._multivar_vmaj_geno_buf = <uintptr_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[kPglQuaterTransposeBatch * genovec_byte_ct])
+        self._multivar_vmaj_phasepresent_buf = <uintptr_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[kPglQuaterTransposeBatch * sample_subset_byte_ct])
+        self._multivar_vmaj_phaseinfo_buf = <uintptr_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[kPglQuaterTransposeBatch * sample_subset_byte_ct])
+        self._multivar_smaj_geno_batch_buf = <uintptr_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[kPglQuaterTransposeBatch * kPglQuaterTransposeBatch / 4])
+        self._multivar_smaj_phaseinfo_batch_buf = <uintptr_t*>pgr_alloc_iter
+        pgr_alloc_iter = &(pgr_alloc_iter[kPglQuaterTransposeBatch * kPglQuaterTransposeBatch / 8])
+        self._multivar_smaj_phasepresent_batch_buf = <uintptr_t*>pgr_alloc_iter
+        # pgr_alloc_iter = &(pgr_alloc_iter[kPglQuaterTransposeBatch * kPglQuaterTransposeBatch / 8])
+        return
+
+
+    cpdef __enter__(self):
+        return self
+
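+    # Minimal usage sketch (assumptions: the enclosing class is the
+    # PgenReader documented in python_api.txt, and __exit__()/close() are
+    # defined later in this file):
+    #
+    #   import numpy as np
+    #   with PgenReader(b"test.pgen") as reader:
+    #       buf = np.empty(reader.get_raw_sample_ct(), np.int8)
+    #       reader.read(0, buf)  # hardcalls for variant 0; missing -> -9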
+
+    cpdef get_raw_sample_ct(self):
+        return self._info_ptr[0].raw_sample_ct
+
+
+    cpdef get_variant_ct(self):
+        return self._info_ptr[0].raw_variant_ct
+
+
+    cpdef hardcall_phase_present(self):
+        return ((self._info_ptr[0].gflags & kfPgenGlobalHardcallPhasePresent) != 0)
+
+
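+    # read() fills geno_int_out with per-sample counts of allele_idx (0/1/2
+    # for diploid hardcalls); missing genotypes are written as -9, per the
+    # genoarr_to_*_minus9 converters below.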
+    cpdef read(self, uint32_t variant_idx, np.ndarray geno_int_out, uint32_t allele_idx = 1):
+        if variant_idx >= self._info_ptr[0].raw_variant_ct:
+            # could have an unsafe mode which doesn't perform this check, but
+            # let's default to at least this much bounds-checking
+            raise RuntimeError("read() variant_idx too large (" + str(variant_idx) + "; only " + str(self._info_ptr[0].raw_variant_ct) + " in file)")
+        if not geno_int_out.flags["C_CONTIGUOUS"]:
+            raise RuntimeError("read() requires geno_int_out to be C-contiguous.")
+        # for full genotype info for multiallelic variants, use read_phased()
+        # instead
+        cdef pglerr_t reterr = pgr_read_allele_countvec_subset_unsafe(self._subset_include_vec, self._subset_cumulative_popcounts, self._subset_size, variant_idx, allele_idx, self._state_ptr, self._genovec)
+        if reterr != kPglRetSuccess:
+            raise RuntimeError("read() error " + str(reterr))
+        cdef int8_t* data8_ptr
+        cdef int32_t* data32_ptr
+        cdef int64_t* data64_ptr
+        if geno_int_out.dtype == np.int8:
+            data8_ptr = <int8_t*>geno_int_out.data
+            genoarr_to_bytes_minus9(self._genovec, self._subset_size, data8_ptr)
+        elif geno_int_out.dtype == np.int32:
+            data32_ptr = <int32_t*>geno_int_out.data
+            genoarr_to_int32s_minus9(self._genovec, self._subset_size, data32_ptr)
+        elif geno_int_out.dtype == np.int64:
+            data64_ptr = <int64_t*>geno_int_out.data
+            genoarr_to_int64s_minus9(self._genovec, self._subset_size, data64_ptr)
+        else:
+            raise RuntimeError("Invalid read() geno_int_out array element type (int8, int32, or int64 expected).")
+        return
+
+
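+    # Dosages are stored on disk as 16-bit fixed-point values; the
+    # dosage16_to_*_minus9 converters below rescale them to floating-point
+    # expected allele counts in [0, 2], again emitting -9 for missing.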
+    cpdef read_dosages(self, uint32_t variant_idx, np.ndarray floatarr_out, uint32_t allele_idx = 1):
+        if variant_idx >= self._info_ptr[0].raw_variant_ct:
+            raise RuntimeError("read_dosages() variant_idx too large (" + str(variant_idx) + "; only " + str(self._info_ptr[0].raw_variant_ct) + " in file)")
+        if not floatarr_out.flags["C_CONTIGUOUS"]:
+            raise RuntimeError("read_dosages() requires floatarr_out to be C-contiguous.")
+        # todo: change this when pgenlib_internal supports multiallelic
+        # variants
+        cdef uint32_t dosage_ct
+        cdef uint32_t is_explicit_alt1
+        cdef pglerr_t reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(self._subset_include_vec, self._subset_cumulative_popcounts, self._subset_size, variant_idx, self._state_ptr, self._genovec, self._dosage_present, self._dosage_vals, &dosage_ct, &is_explicit_alt1)
+        if reterr != kPglRetSuccess:
+            raise RuntimeError("read_dosages() error " + str(reterr))
+        if allele_idx == 0:
+            genovec_invert_unsafe(self._subset_size, self._genovec)
+            biallelic_dosage16_invert(dosage_ct, self._dosage_vals)
+        cdef float* data32_ptr
+        cdef double* data64_ptr
+        if floatarr_out.dtype == np.float32:
+            data32_ptr = <float*>floatarr_out.data
+            dosage16_to_floats_minus9(self._genovec, self._dosage_present, self._dosage_vals, self._subset_size, dosage_ct, data32_ptr)
+        elif floatarr_out.dtype == np.float64:
+            data64_ptr = <double*>floatarr_out.data
+            dosage16_to_doubles_minus9(self._genovec, self._dosage_present, self._dosage_vals, self._subset_size, dosage_ct, data64_ptr)
+        else:
+            raise RuntimeError("Invalid read_dosages() floatarr_out array element type (float32 or float64 expected).")
+        return
+
+
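+    # read_alleles() writes two allele codes per sample (entries 2k and 2k+1
+    # of the output); for phased genotypes the two codes appear in haplotype
+    # order, otherwise in an unspecified canonical order.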
+    cpdef read_alleles(self, uint32_t variant_idx, np.ndarray[np.int32_t,mode="c",ndim=1] allele_int32_out):
+        if variant_idx >= self._info_ptr[0].raw_variant_ct:
+            # could have an unsafe mode which doesn't perform this check, but
+            # let's default to at least this much bounds-checking
+            raise RuntimeError("read_alleles() variant_idx too large (" + str(variant_idx) + "; only " + str(self._info_ptr[0].raw_variant_ct) + " in file)")
+        cdef uint32_t phasepresent_ct
+        # upgrade to multiallelic version of this function in the future
+        cdef pglerr_t reterr = pgr_read_refalt1_genovec_hphase_subset_unsafe(self._subset_include_vec, self._subset_cumulative_popcounts, self._subset_size, variant_idx, self._state_ptr, self._genovec, self._phasepresent, self._phaseinfo, &phasepresent_ct)
+        if reterr != kPglRetSuccess:
+            raise RuntimeError("read_alleles() error " + str(reterr))
+        cdef int32_t* main_data_ptr = <int32_t*>(&(allele_int32_out[0]))
+        genoarr_phased_to_allele_codes(self._genovec, self._phasepresent, self._phaseinfo, self._subset_size, phasepresent_ct, NULL, main_data_ptr)
+        return
+
+
+    cpdef read_alleles_and_phasepresent(self, uint32_t variant_idx, np.ndarray[np.int32_t,mode="c",ndim=1] allele_int32_out, np.ndarray[np.uint8_t,mode="c",cast=True] phasepresent_out):
+        if variant_idx >= self._info_ptr[0].raw_variant_ct:
+            # could have an unsafe mode which doesn't perform this check, but
+            # let's default to at least this much bounds-checking
+            raise RuntimeError("read_alleles_and_phasepresent() variant_idx too large (" + str(variant_idx) + "; only " + str(self._info_ptr[0].raw_variant_ct) + " in file)")
+        cdef uint32_t phasepresent_ct
+        # upgrade to multiallelic version of this function in the future
+        cdef pglerr_t reterr = pgr_read_refalt1_genovec_hphase_subset_unsafe(self._subset_include_vec, self._subset_cumulative_popcounts, self._subset_size, variant_idx, self._state_ptr, self._genovec, self._phasepresent, self._phaseinfo, &phasepresent_ct)
+        if reterr != kPglRetSuccess:
+            raise RuntimeError("read_alleles_and_phasepresent() error " + str(reterr))
+        cdef int32_t* main_data_ptr = <int32_t*>(&(allele_int32_out[0]))
+        cdef unsigned char* phasepresent_data_ptr = <unsigned char*>(&(phasepresent_out[0]))
+        genoarr_phased_to_allele_codes(self._genovec, self._phasepresent, self._phaseinfo, self._subset_size, phasepresent_ct, phasepresent_data_ptr, main_data_ptr)
+        return
+
+
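+    # Sample-major reads work in tiles: load up to kPglQuaterTransposeBatch
+    # variants in variant-major 2-bit form, transpose each
+    # (variant batch) x (sample batch) tile with transpose_quaterblock(),
+    # then decode one sample row at a time.  The three type-specialized
+    # internals below differ only in the final genoarr_to_*_minus9 call.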
+    cdef read_range_internal8(self, uint32_t variant_idx_start, uint32_t variant_idx_end, np.ndarray[np.int8_t,mode="c",ndim=2] geno_int8_out, uint32_t allele_idx = 1, bint sample_maj = 0):
+        if variant_idx_end > self._info_ptr[0].raw_variant_ct:
+            raise RuntimeError("read_range() variant_idx_end too large (" + str(variant_idx_end) + "; only " + str(self._info_ptr[0].raw_variant_ct) + " in file)")
+        cdef const uintptr_t* subset_include_vec = self._subset_include_vec
+        cdef const uint32_t* subset_cumulative_popcounts = self._subset_cumulative_popcounts
+        cdef pgen_reader_t* pgrp = self._state_ptr
+        cdef uintptr_t* genovec = self._genovec
+        cdef uint32_t variant_idx_ct = variant_idx_end - variant_idx_start
+        cdef uint32_t subset_size = self._subset_size
+        cdef int8_t* data_ptr
+        cdef uint32_t variant_idx
+        cdef pglerr_t reterr
+        if sample_maj == 0:
+            if geno_int8_out.shape[0] < variant_idx_ct:
+                raise RuntimeError("Variant-major read_range() geno_int_out buffer has too few rows (" + str(geno_int8_out.shape[0]) + "; (variant_idx_end - variant_idx_start) is " + str(variant_idx_ct) + ")")
+            if geno_int8_out.shape[1] < subset_size:
+                raise RuntimeError("Variant-major read_range() geno_int_out buffer has too few columns (" + str(geno_int8_out.shape[1]) + "; current sample subset has size " + str(subset_size) + ")")
+            for variant_idx in range(variant_idx_start, variant_idx_end):
+                reterr = pgr_read_allele_countvec_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, variant_idx, allele_idx, pgrp, genovec)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_range() error " + str(reterr))
+                data_ptr = &(geno_int8_out[(variant_idx - variant_idx_start), 0])
+                genoarr_to_bytes_minus9(genovec, subset_size, data_ptr)
+            return
+        if variant_idx_start >= variant_idx_end:
+            raise RuntimeError("read_range() variant_idx_start >= variant_idx_end (" + str(variant_idx_start) + ", " + str(variant_idx_end) + ")")
+        if geno_int8_out.shape[0] < subset_size:
+            raise RuntimeError("Sample-major read_range() geno_int_out buffer has too few rows (" + str(geno_int8_out.shape[0]) + "; current sample subset has size " + str(subset_size) + ")")
+        if geno_int8_out.shape[1] < variant_idx_ct:
+            raise RuntimeError("Sample-major read_range() geno_int_out buffer has too few columns (" + str(geno_int8_out.shape[1]) + "; (variant_idx_end - variant_idx_start) is " + str(variant_idx_ct) + ")")
+        cdef uint32_t variant_batch_ct = DIV_UP(variant_idx_ct, kPglQuaterTransposeBatch)
+        cdef uint32_t variant_batch_size = kPglQuaterTransposeBatch
+        cdef uint32_t variant_idx_offset = variant_idx_start
+        cdef uint32_t sample_ctaw2 = kWordsPerVec * DIV_UP(subset_size, kBitsPerWordD2)
+        cdef uint32_t sample_batch_ct = DIV_UP(subset_size, kPglQuaterTransposeBatch)
+        cdef vul_t* transpose_batch_buf = self._transpose_batch_buf
+        cdef uintptr_t* multivar_vmaj_geno_buf = self._multivar_vmaj_geno_buf
+        cdef uintptr_t* multivar_smaj_geno_batch_buf = self._multivar_smaj_geno_batch_buf
+        cdef uintptr_t* vmaj_iter
+        cdef uintptr_t* smaj_iter
+        cdef uint32_t variant_batch_idx
+        cdef uint32_t sample_batch_size
+        cdef uint32_t sample_batch_idx
+        cdef uint32_t uii
+        for variant_batch_idx in range(variant_batch_ct):
+            if variant_batch_idx == (variant_batch_ct - 1):
+                variant_batch_size = 1 + <uint32_t>((variant_idx_ct - 1) % kPglQuaterTransposeBatch)
+            vmaj_iter = multivar_vmaj_geno_buf
+            for uii in range(variant_batch_size):
+                reterr = pgr_read_allele_countvec_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, uii + variant_idx_offset, allele_idx, pgrp, vmaj_iter)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_range() error " + str(reterr))
+                vmaj_iter = &(vmaj_iter[sample_ctaw2])
+            sample_batch_size = kPglQuaterTransposeBatch
+            vmaj_iter = multivar_vmaj_geno_buf
+            for sample_batch_idx in range(sample_batch_ct):
+                if sample_batch_idx == sample_batch_ct - 1:
+                    sample_batch_size = 1 + <uint32_t>((subset_size - 1) % kPglQuaterTransposeBatch)
+                smaj_iter = multivar_smaj_geno_batch_buf
+                transpose_quaterblock(vmaj_iter, sample_ctaw2, kPglQuaterTransposeWords, variant_batch_size, sample_batch_size, smaj_iter, transpose_batch_buf)
+                for uii in range(sample_batch_size):
+                    data_ptr = &(geno_int8_out[uii + sample_batch_idx * kPglQuaterTransposeBatch, variant_batch_idx * kPglQuaterTransposeBatch])
+                    genoarr_to_bytes_minus9(smaj_iter, variant_batch_size, data_ptr)
+                    smaj_iter = &(smaj_iter[kPglQuaterTransposeWords])
+                vmaj_iter = &(vmaj_iter[kPglQuaterTransposeWords])
+            variant_idx_offset += kPglQuaterTransposeBatch
+        return
+
+    cdef read_range_internal32(self, uint32_t variant_idx_start, uint32_t variant_idx_end, np.ndarray[np.int32_t,mode="c",ndim=2] geno_int32_out, uint32_t allele_idx = 1, bint sample_maj = 0):
+        if variant_idx_end > self._info_ptr[0].raw_variant_ct:
+            raise RuntimeError("read_range() variant_idx_end too large (" + str(variant_idx_end) + "; only " + str(self._info_ptr[0].raw_variant_ct) + " in file)")
+        cdef const uintptr_t* subset_include_vec = self._subset_include_vec
+        cdef const uint32_t* subset_cumulative_popcounts = self._subset_cumulative_popcounts
+        cdef pgen_reader_t* pgrp = self._state_ptr
+        cdef uintptr_t* genovec = self._genovec
+        cdef uint32_t variant_idx_ct = variant_idx_end - variant_idx_start
+        cdef uint32_t subset_size = self._subset_size
+        cdef int32_t* data_ptr
+        cdef uint32_t variant_idx
+        cdef pglerr_t reterr
+        if sample_maj == 0:
+            if geno_int32_out.shape[0] < variant_idx_ct:
+                raise RuntimeError("Variant-major read_range() geno_int_out buffer has too few rows (" + str(geno_int32_out.shape[0]) + "; (variant_idx_end - variant_idx_start) is " + str(variant_idx_ct) + ")")
+            if geno_int32_out.shape[1] < subset_size:
+                raise RuntimeError("Variant-major read_range() geno_int_out buffer has too few columns (" + str(geno_int32_out.shape[1]) + "; current sample subset has size " + str(subset_size) + ")")
+            for variant_idx in range(variant_idx_start, variant_idx_end):
+                reterr = pgr_read_allele_countvec_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, variant_idx, allele_idx, pgrp, genovec)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_range() error " + str(reterr))
+                data_ptr = <int32_t*>(&(geno_int32_out[(variant_idx - variant_idx_start), 0]))
+                genoarr_to_int32s_minus9(genovec, subset_size, data_ptr)
+            return
+        if variant_idx_start >= variant_idx_end:
+            raise RuntimeError("read_range() variant_idx_start >= variant_idx_end (" + str(variant_idx_start) + ", " + str(variant_idx_end) + ")")
+        if geno_int32_out.shape[0] < subset_size:
+            raise RuntimeError("Sample-major read_range() geno_int_out buffer has too few rows (" + str(geno_int32_out.shape[0]) + "; current sample subset has size " + str(subset_size) + ")")
+        if geno_int32_out.shape[1] < variant_idx_ct:
+            raise RuntimeError("Sample-major read_range() geno_int_out buffer has too few columns (" + str(geno_int32_out.shape[1]) + "; (variant_idx_end - variant_idx_start) is " + str(variant_idx_ct) + ")")
+        cdef uint32_t variant_batch_ct = DIV_UP(variant_idx_ct, kPglQuaterTransposeBatch)
+        cdef uint32_t variant_batch_size = kPglQuaterTransposeBatch
+        cdef uint32_t variant_idx_offset = variant_idx_start
+        cdef uint32_t sample_ctaw2 = kWordsPerVec * DIV_UP(subset_size, kBitsPerWordD2)
+        cdef uint32_t sample_batch_ct = DIV_UP(subset_size, kPglQuaterTransposeBatch)
+        cdef vul_t* transpose_batch_buf = self._transpose_batch_buf
+        cdef uintptr_t* multivar_vmaj_geno_buf = self._multivar_vmaj_geno_buf
+        cdef uintptr_t* multivar_smaj_geno_batch_buf = self._multivar_smaj_geno_batch_buf
+        cdef uintptr_t* vmaj_iter
+        cdef uintptr_t* smaj_iter
+        cdef uint32_t variant_batch_idx
+        cdef uint32_t sample_batch_size
+        cdef uint32_t sample_batch_idx
+        cdef uint32_t uii
+        for variant_batch_idx in range(variant_batch_ct):
+            if variant_batch_idx == (variant_batch_ct - 1):
+                variant_batch_size = 1 + <uint32_t>((variant_idx_ct - 1) % kPglQuaterTransposeBatch)
+            vmaj_iter = multivar_vmaj_geno_buf
+            for uii in range(variant_batch_size):
+                reterr = pgr_read_allele_countvec_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, uii + variant_idx_offset, allele_idx, pgrp, vmaj_iter)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_range() error " + str(reterr))
+                vmaj_iter = &(vmaj_iter[sample_ctaw2])
+            sample_batch_size = kPglQuaterTransposeBatch
+            vmaj_iter = multivar_vmaj_geno_buf
+            for sample_batch_idx in range(sample_batch_ct):
+                if sample_batch_idx == sample_batch_ct - 1:
+                    sample_batch_size = 1 + <uint32_t>((subset_size - 1) % kPglQuaterTransposeBatch)
+                smaj_iter = multivar_smaj_geno_batch_buf
+                transpose_quaterblock(vmaj_iter, sample_ctaw2, kPglQuaterTransposeWords, variant_batch_size, sample_batch_size, smaj_iter, transpose_batch_buf)
+                for uii in range(sample_batch_size):
+                    data_ptr = <int32_t*>(&(geno_int32_out[uii + sample_batch_idx * kPglQuaterTransposeBatch, variant_batch_idx * kPglQuaterTransposeBatch]))
+                    genoarr_to_int32s_minus9(smaj_iter, variant_batch_size, data_ptr)
+                    smaj_iter = &(smaj_iter[kPglQuaterTransposeWords])
+                vmaj_iter = &(vmaj_iter[kPglQuaterTransposeWords])
+            variant_idx_offset += kPglQuaterTransposeBatch
+        return
+
+    cdef read_range_internal64(self, uint32_t variant_idx_start, uint32_t variant_idx_end, np.ndarray[np.int64_t,mode="c",ndim=2] geno_int64_out, uint32_t allele_idx = 1, bint sample_maj = 0):
+        if variant_idx_end > self._info_ptr[0].raw_variant_ct:
+            raise RuntimeError("read_range() variant_idx_end too large (" + str(variant_idx_end) + "; only " + str(self._info_ptr[0].raw_variant_ct) + " in file)")
+        cdef const uintptr_t* subset_include_vec = self._subset_include_vec
+        cdef const uint32_t* subset_cumulative_popcounts = self._subset_cumulative_popcounts
+        cdef pgen_reader_t* pgrp = self._state_ptr
+        cdef uintptr_t* genovec = self._genovec
+        cdef uint32_t variant_idx_ct = variant_idx_end - variant_idx_start
+        cdef uint32_t subset_size = self._subset_size
+        cdef int64_t* data_ptr
+        cdef uint32_t variant_idx
+        cdef pglerr_t reterr
+        if sample_maj == 0:
+            if geno_int64_out.shape[0] < variant_idx_ct:
+                raise RuntimeError("Variant-major read_range() geno_int_out buffer has too few rows (" + str(geno_int64_out.shape[0]) + "; (variant_idx_end - variant_idx_start) is " + str(variant_idx_ct) + ")")
+            if geno_int64_out.shape[1] < subset_size:
+                raise RuntimeError("Variant-major read_range() geno_int_out buffer has too few columns (" + str(geno_int64_out.shape[1]) + "; current sample subset has size " + str(subset_size) + ")")
+            for variant_idx in range(variant_idx_start, variant_idx_end):
+                reterr = pgr_read_allele_countvec_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, variant_idx, allele_idx, pgrp, genovec)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_range() error " + str(reterr))
+                data_ptr = &(geno_int64_out[(variant_idx - variant_idx_start), 0])
+                genoarr_to_int64s_minus9(genovec, subset_size, data_ptr)
+            return
+        if variant_idx_start >= variant_idx_end:
+            raise RuntimeError("read_range() variant_idx_start >= variant_idx_end (" + str(variant_idx_start) + ", " + str(variant_idx_end) + ")")
+        if geno_int64_out.shape[0] < subset_size:
+            raise RuntimeError("Sample-major read_range() geno_int_out buffer has too few rows (" + str(geno_int64_out.shape[0]) + "; current sample subset has size " + str(subset_size) + ")")
+        if geno_int64_out.shape[1] < variant_idx_ct:
+            raise RuntimeError("Sample-major read_range() geno_int_out buffer has too few columns (" + str(geno_int64_out.shape[1]) + "; (variant_idx_end - variant_idx_start) is " + str(variant_idx_ct) + ")")
+        cdef uint32_t variant_batch_ct = DIV_UP(variant_idx_ct, kPglQuaterTransposeBatch)
+        cdef uint32_t variant_batch_size = kPglQuaterTransposeBatch
+        cdef uint32_t variant_idx_offset = variant_idx_start
+        cdef uint32_t sample_ctaw2 = kWordsPerVec * DIV_UP(subset_size, kBitsPerWordD2)
+        cdef uint32_t sample_batch_ct = DIV_UP(subset_size, kPglQuaterTransposeBatch)
+        cdef vul_t* transpose_batch_buf = self._transpose_batch_buf
+        cdef uintptr_t* multivar_vmaj_geno_buf = self._multivar_vmaj_geno_buf
+        cdef uintptr_t* multivar_smaj_geno_batch_buf = self._multivar_smaj_geno_batch_buf
+        cdef uintptr_t* vmaj_iter
+        cdef uintptr_t* smaj_iter
+        cdef uint32_t variant_batch_idx
+        cdef uint32_t sample_batch_size
+        cdef uint32_t sample_batch_idx
+        cdef uint32_t uii
+        for variant_batch_idx in range(variant_batch_ct):
+            if variant_batch_idx == (variant_batch_ct - 1):
+                variant_batch_size = 1 + <uint32_t>((variant_idx_ct - 1) % kPglQuaterTransposeBatch)
+            vmaj_iter = multivar_vmaj_geno_buf
+            for uii in range(variant_batch_size):
+                reterr = pgr_read_allele_countvec_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, uii + variant_idx_offset, allele_idx, pgrp, vmaj_iter)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_range() error " + str(reterr))
+                vmaj_iter = &(vmaj_iter[sample_ctaw2])
+            sample_batch_size = kPglQuaterTransposeBatch
+            vmaj_iter = multivar_vmaj_geno_buf
+            for sample_batch_idx in range(sample_batch_ct):
+                if sample_batch_idx == sample_batch_ct - 1:
+                    sample_batch_size = 1 + <uint32_t>((subset_size - 1) % kPglQuaterTransposeBatch)
+                smaj_iter = multivar_smaj_geno_batch_buf
+                transpose_quaterblock(vmaj_iter, sample_ctaw2, kPglQuaterTransposeWords, variant_batch_size, sample_batch_size, smaj_iter, transpose_batch_buf)
+                for uii in range(sample_batch_size):
+                    data_ptr = &(geno_int64_out[uii + sample_batch_idx * kPglQuaterTransposeBatch, variant_batch_idx * kPglQuaterTransposeBatch])
+                    genoarr_to_int64s_minus9(smaj_iter, variant_batch_size, data_ptr)
+                    smaj_iter = &(smaj_iter[kPglQuaterTransposeWords])
+                vmaj_iter = &(vmaj_iter[kPglQuaterTransposeWords])
+            variant_idx_offset += kPglQuaterTransposeBatch
+        return
+
+    cpdef read_range(self, uint32_t variant_idx_start, uint32_t variant_idx_end, np.ndarray geno_int_out, uint32_t allele_idx = 1, bint sample_maj = 0):
+        # C-contiguity checked by read_range_internal8(), etc.
+        if geno_int_out.dtype == np.int8:
+            self.read_range_internal8(variant_idx_start, variant_idx_end, geno_int_out, allele_idx, sample_maj)
+        elif geno_int_out.dtype == np.int32:
+            self.read_range_internal32(variant_idx_start, variant_idx_end, geno_int_out, allele_idx, sample_maj)
+        elif geno_int_out.dtype == np.int64:
+            self.read_range_internal64(variant_idx_start, variant_idx_end, geno_int_out, allele_idx, sample_maj)
+        else:
+            raise RuntimeError("Invalid read_range() geno_int_out array element type (int8, int32, or int64 expected).")
+        return
+
+
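+    # read_list() is the scattered-index analogue of read_range(): variant
+    # indices come from an explicit uint32 array (any order; duplicates are
+    # not rejected), and each index is bounds-checked before the unsafe
+    # reader call.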
+    cdef read_list_internal8(self, np.ndarray[np.uint32_t] variant_idxs, np.ndarray[np.int8_t,mode="c",ndim=2] geno_int8_out, uint32_t allele_idx = 1, bint sample_maj = 0):
+        cdef uint32_t raw_variant_ct = self._info_ptr[0].raw_variant_ct
+        cdef const uintptr_t* subset_include_vec = self._subset_include_vec
+        cdef const uint32_t* subset_cumulative_popcounts = self._subset_cumulative_popcounts
+        cdef pgen_reader_t* pgrp = self._state_ptr
+        cdef uintptr_t* genovec = self._genovec
+        cdef uint32_t variant_idx_ct = <uint32_t>variant_idxs.shape[0]
+        cdef uint32_t subset_size = self._subset_size
+        cdef int8_t* data_ptr
+        cdef uint32_t variant_list_idx
+        cdef uint32_t variant_idx
+        cdef pglerr_t reterr
+        if sample_maj == 0:
+            if geno_int8_out.shape[0] < variant_idx_ct:
+                raise RuntimeError("Variant-major read_list() geno_int_out buffer has too few rows (" + str(geno_int8_out.shape[0]) + "; variant_idxs length is " + str(variant_idx_ct) + ")")
+            if geno_int8_out.shape[1] < subset_size:
+                raise RuntimeError("Variant-major read_list() geno_int_out buffer has too few columns (" + str(geno_int8_out.shape[1]) + "; current sample subset has size " + str(subset_size) + ")")
+            for variant_list_idx in range(variant_idx_ct):
+                variant_idx = variant_idxs[variant_list_idx]
+                if variant_idx >= raw_variant_ct:
+                    raise RuntimeError("read_list() variant index too large (" + str(variant_idx) + "; only " + str(raw_variant_ct) + " in file)")
+                reterr = pgr_read_allele_countvec_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, variant_idx, allele_idx, pgrp, genovec)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_range() error " + str(reterr))
+                data_ptr = &(geno_int8_out[variant_list_idx, 0])
+                genoarr_to_bytes_minus9(genovec, subset_size, data_ptr)
+            return
+        if geno_int8_out.shape[0] < subset_size:
+            raise RuntimeError("Sample-major read_list() geno_int_out buffer has too few rows (" + str(geno_int8_out.shape[0]) + "; current sample subset has size " + str(subset_size) + ")")
+        if geno_int8_out.shape[1] < variant_idx_ct:
+            raise RuntimeError("Sample-major read_list() geno_int_out buffer has too few columns (" + str(geno_int8_out.shape[1]) + "; variant_idxs length is " + str(variant_idx_ct) + ")")
+        cdef uint32_t variant_batch_ct = DIV_UP(variant_idx_ct, kPglQuaterTransposeBatch)
+        cdef uint32_t variant_batch_size = kPglQuaterTransposeBatch
+        cdef uint32_t sample_ctaw2 = kWordsPerVec * DIV_UP(subset_size, kBitsPerWordD2)
+        cdef uint32_t sample_batch_ct = DIV_UP(subset_size, kPglQuaterTransposeBatch)
+        cdef vul_t* transpose_batch_buf = self._transpose_batch_buf
+        cdef uintptr_t* multivar_vmaj_geno_buf = self._multivar_vmaj_geno_buf
+        cdef uintptr_t* multivar_smaj_geno_batch_buf = self._multivar_smaj_geno_batch_buf
+        cdef uintptr_t* vmaj_iter
+        cdef uintptr_t* smaj_iter
+        cdef uint32_t variant_batch_idx
+        cdef uint32_t sample_batch_size
+        cdef uint32_t sample_batch_idx
+        cdef uint32_t uii
+        variant_list_idx = 0
+        for variant_batch_idx in range(variant_batch_ct):
+            if variant_batch_idx == (variant_batch_ct - 1):
+                variant_batch_size = 1 + <uint32_t>((variant_idx_ct - 1) % kPglQuaterTransposeBatch)
+            vmaj_iter = multivar_vmaj_geno_buf
+            for uii in range(variant_batch_size):
+                variant_idx = variant_idxs[uii + variant_list_idx]
+                if variant_idx >= raw_variant_ct:
+                    raise RuntimeError("read_list() variant index too large (" + str(variant_idx) + "; only " + str(raw_variant_ct) + " in file)")
+                reterr = pgr_read_allele_countvec_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, variant_idx, allele_idx, pgrp, vmaj_iter)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_list() error " + str(reterr))
+                vmaj_iter = &(vmaj_iter[sample_ctaw2])
+            sample_batch_size = kPglQuaterTransposeBatch
+            vmaj_iter = multivar_vmaj_geno_buf
+            for sample_batch_idx in range(sample_batch_ct):
+                if sample_batch_idx == sample_batch_ct - 1:
+                    sample_batch_size = 1 + <uint32_t>((subset_size - 1) % kPglQuaterTransposeBatch)
+                smaj_iter = multivar_smaj_geno_batch_buf
+                transpose_quaterblock(vmaj_iter, sample_ctaw2, kPglQuaterTransposeWords, variant_batch_size, sample_batch_size, smaj_iter, transpose_batch_buf)
+                for uii in range(sample_batch_size):
+                    data_ptr = &(geno_int8_out[uii + sample_batch_idx * kPglQuaterTransposeBatch, variant_batch_idx * kPglQuaterTransposeBatch])
+                    genoarr_to_bytes_minus9(smaj_iter, variant_batch_size, data_ptr)
+                    smaj_iter = &(smaj_iter[kPglQuaterTransposeWords])
+                vmaj_iter = &(vmaj_iter[kPglQuaterTransposeWords])
+            variant_list_idx += kPglQuaterTransposeBatch
+        return
+
+    cdef read_list_internal32(self, np.ndarray[np.uint32_t] variant_idxs, np.ndarray[np.int32_t,mode="c",ndim=2] geno_int32_out, uint32_t allele_idx = 1, bint sample_maj = 0):
+        cdef uint32_t raw_variant_ct = self._info_ptr[0].raw_variant_ct
+        cdef const uintptr_t* subset_include_vec = self._subset_include_vec
+        cdef const uint32_t* subset_cumulative_popcounts = self._subset_cumulative_popcounts
+        cdef pgen_reader_t* pgrp = self._state_ptr
+        cdef uintptr_t* genovec = self._genovec
+        cdef uint32_t variant_idx_ct = <uint32_t>variant_idxs.shape[0]
+        cdef uint32_t subset_size = self._subset_size
+        cdef int32_t* data_ptr
+        cdef uint32_t variant_list_idx
+        cdef uint32_t variant_idx
+        cdef pglerr_t reterr
+        if sample_maj == 0:
+            if geno_int32_out.shape[0] < variant_idx_ct:
+                raise RuntimeError("Variant-major read_list() geno_int_out buffer has too few rows (" + str(geno_int32_out.shape[0]) + "; variant_idxs length is " + str(variant_idx_ct) + ")")
+            if geno_int32_out.shape[1] < subset_size:
+                raise RuntimeError("Variant-major read_list() geno_int_out buffer has too few columns (" + str(geno_int32_out.shape[1]) + "; current sample subset has size " + str(subset_size) + ")")
+            for variant_list_idx in range(variant_idx_ct):
+                variant_idx = variant_idxs[variant_list_idx]
+                if variant_idx >= raw_variant_ct:
+                    raise RuntimeError("read_list() variant index too large (" + str(variant_idx) + "; only " + str(raw_variant_ct) + " in file)")
+                reterr = pgr_read_allele_countvec_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, variant_idx, allele_idx, pgrp, genovec)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_range() error " + str(reterr))
+                data_ptr = <int32_t*>(&(geno_int32_out[variant_list_idx, 0]))
+                genoarr_to_int32s_minus9(genovec, subset_size, data_ptr)
+            return
+        if geno_int32_out.shape[0] < subset_size:
+            raise RuntimeError("Sample-major read_list() geno_int_out buffer has too few rows (" + str(geno_int32_out.shape[0]) + "; current sample subset has size " + str(subset_size) + ")")
+        if geno_int32_out.shape[1] < variant_idx_ct:
+            raise RuntimeError("Sample-major read_list() geno_int_out buffer has too few columns (" + str(geno_int32_out.shape[1]) + "; variant_idxs length is " + str(variant_idx_ct) + ")")
+        cdef uint32_t variant_batch_ct = DIV_UP(variant_idx_ct, kPglQuaterTransposeBatch)
+        cdef uint32_t variant_batch_size = kPglQuaterTransposeBatch
+        cdef uint32_t sample_ctaw2 = kWordsPerVec * DIV_UP(subset_size, kBitsPerWordD2)
+        cdef uint32_t sample_batch_ct = DIV_UP(subset_size, kPglQuaterTransposeBatch)
+        cdef vul_t* transpose_batch_buf = self._transpose_batch_buf
+        cdef uintptr_t* multivar_vmaj_geno_buf = self._multivar_vmaj_geno_buf
+        cdef uintptr_t* multivar_smaj_geno_batch_buf = self._multivar_smaj_geno_batch_buf
+        cdef uintptr_t* vmaj_iter
+        cdef uintptr_t* smaj_iter
+        cdef uint32_t variant_batch_idx
+        cdef uint32_t sample_batch_size
+        cdef uint32_t sample_batch_idx
+        cdef uint32_t uii
+        variant_list_idx = 0
+        for variant_batch_idx in range(variant_batch_ct):
+            if variant_batch_idx == (variant_batch_ct - 1):
+                variant_batch_size = 1 + <uint32_t>((variant_idx_ct - 1) % kPglQuaterTransposeBatch)
+            vmaj_iter = multivar_vmaj_geno_buf
+            for uii in range(variant_batch_size):
+                variant_idx = variant_idxs[uii + variant_list_idx]
+                if variant_idx >= raw_variant_ct:
+                    raise RuntimeError("read_list() variant index too large (" + str(variant_idx) + "; only " + str(raw_variant_ct) + " in file)")
+                reterr = pgr_read_allele_countvec_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, variant_idx, allele_idx, pgrp, vmaj_iter)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_list() error " + str(reterr))
+                vmaj_iter = &(vmaj_iter[sample_ctaw2])
+            sample_batch_size = kPglQuaterTransposeBatch
+            vmaj_iter = multivar_vmaj_geno_buf
+            for sample_batch_idx in range(sample_batch_ct):
+                if sample_batch_idx == sample_batch_ct - 1:
+                    sample_batch_size = 1 + <uint32_t>((subset_size - 1) % kPglQuaterTransposeBatch)
+                smaj_iter = multivar_smaj_geno_batch_buf
+                transpose_quaterblock(vmaj_iter, sample_ctaw2, kPglQuaterTransposeWords, variant_batch_size, sample_batch_size, smaj_iter, transpose_batch_buf)
+                for uii in range(sample_batch_size):
+                    data_ptr = <int32_t*>(&(geno_int32_out[uii + sample_batch_idx * kPglQuaterTransposeBatch, variant_batch_idx * kPglQuaterTransposeBatch]))
+                    genoarr_to_int32s_minus9(smaj_iter, variant_batch_size, data_ptr)
+                    smaj_iter = &(smaj_iter[kPglQuaterTransposeWords])
+                vmaj_iter = &(vmaj_iter[kPglQuaterTransposeWords])
+            variant_list_idx += kPglQuaterTransposeBatch
+        return
+
+    cdef read_list_internal64(self, np.ndarray[np.uint32_t] variant_idxs, np.ndarray[np.int64_t,mode="c",ndim=2] geno_int64_out, uint32_t allele_idx = 1, bint sample_maj = 0):
+        cdef uint32_t raw_variant_ct = self._info_ptr[0].raw_variant_ct
+        cdef const uintptr_t* subset_include_vec = self._subset_include_vec
+        cdef const uint32_t* subset_cumulative_popcounts = self._subset_cumulative_popcounts
+        cdef pgen_reader_t* pgrp = self._state_ptr
+        cdef uintptr_t* genovec = self._genovec
+        cdef uint32_t variant_idx_ct = <uint32_t>variant_idxs.shape[0]
+        cdef uint32_t subset_size = self._subset_size
+        cdef int64_t* data_ptr
+        cdef uint32_t variant_list_idx
+        cdef uint32_t variant_idx
+        cdef pglerr_t reterr
+        if sample_maj == 0:
+            if geno_int64_out.shape[0] < variant_idx_ct:
+                raise RuntimeError("Variant-major read_list() geno_int_out buffer has too few rows (" + str(geno_int64_out.shape[0]) + "; variant_idxs length is " + str(variant_idx_ct) + ")")
+            if geno_int64_out.shape[1] < subset_size:
+                raise RuntimeError("Variant-major read_list() geno_int_out buffer has too few columns (" + str(geno_int64_out.shape[1]) + "; current sample subset has size " + str(subset_size) + ")")
+            for variant_list_idx in range(variant_idx_ct):
+                variant_idx = variant_idxs[variant_list_idx]
+                if variant_idx >= raw_variant_ct:
+                    raise RuntimeError("read_list() variant index too large (" + str(variant_idx) + "; only " + str(raw_variant_ct) + " in file)")
+                reterr = pgr_read_allele_countvec_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, variant_idx, allele_idx, pgrp, genovec)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_range() error " + str(reterr))
+                data_ptr = &(geno_int64_out[variant_list_idx, 0])
+                genoarr_to_int64s_minus9(genovec, subset_size, data_ptr)
+            return
+        if geno_int64_out.shape[0] < subset_size:
+            raise RuntimeError("Sample-major read_list() geno_int_out buffer has too few rows (" + str(geno_int64_out.shape[0]) + "; current sample subset has size " + str(subset_size) + ")")
+        if geno_int64_out.shape[1] < variant_idx_ct:
+            raise RuntimeError("Sample-major read_list() geno_int_out buffer has too few columns (" + str(geno_int64_out.shape[1]) + "; variant_idxs length is " + str(variant_idx_ct) + ")")
+        cdef uint32_t variant_batch_ct = DIV_UP(variant_idx_ct, kPglQuaterTransposeBatch)
+        cdef uint32_t variant_batch_size = kPglQuaterTransposeBatch
+        cdef uint32_t sample_ctaw2 = kWordsPerVec * DIV_UP(subset_size, kBitsPerWordD2)
+        cdef uint32_t sample_batch_ct = DIV_UP(subset_size, kPglQuaterTransposeBatch)
+        cdef vul_t* transpose_batch_buf = self._transpose_batch_buf
+        cdef uintptr_t* multivar_vmaj_geno_buf = self._multivar_vmaj_geno_buf
+        cdef uintptr_t* multivar_smaj_geno_batch_buf = self._multivar_smaj_geno_batch_buf
+        cdef uintptr_t* vmaj_iter
+        cdef uintptr_t* smaj_iter
+        cdef uint32_t variant_batch_idx
+        cdef uint32_t sample_batch_size
+        cdef uint32_t sample_batch_idx
+        cdef uint32_t uii
+        variant_list_idx = 0
+        for variant_batch_idx in range(variant_batch_ct):
+            if variant_batch_idx == (variant_batch_ct - 1):
+                variant_batch_size = 1 + <uint32_t>((variant_idx_ct - 1) % kPglQuaterTransposeBatch)
+            vmaj_iter = multivar_vmaj_geno_buf
+            for uii in range(variant_batch_size):
+                variant_idx = variant_idxs[uii + variant_list_idx]
+                if variant_idx >= raw_variant_ct:
+                    raise RuntimeError("read_list() variant index too large (" + str(variant_idx) + "; only " + str(raw_variant_ct) + " in file)")
+                reterr = pgr_read_allele_countvec_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, variant_idx, allele_idx, pgrp, vmaj_iter)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_list() error " + str(reterr))
+                vmaj_iter = &(vmaj_iter[sample_ctaw2])
+            sample_batch_size = kPglQuaterTransposeBatch
+            vmaj_iter = multivar_vmaj_geno_buf
+            for sample_batch_idx in range(sample_batch_ct):
+                if sample_batch_idx == sample_batch_ct - 1:
+                    sample_batch_size = 1 + <uint32_t>((subset_size - 1) % kPglQuaterTransposeBatch)
+                smaj_iter = multivar_smaj_geno_batch_buf
+                transpose_quaterblock(vmaj_iter, sample_ctaw2, kPglQuaterTransposeWords, variant_batch_size, sample_batch_size, smaj_iter, transpose_batch_buf)
+                for uii in range(sample_batch_size):
+                    data_ptr = &(geno_int64_out[uii + sample_batch_idx * kPglQuaterTransposeBatch, variant_batch_idx * kPglQuaterTransposeBatch])
+                    genoarr_to_int64s_minus9(smaj_iter, variant_batch_size, data_ptr)
+                    smaj_iter = &(smaj_iter[kPglQuaterTransposeWords])
+                vmaj_iter = &(vmaj_iter[kPglQuaterTransposeWords])
+            variant_list_idx += kPglQuaterTransposeBatch
+        return
+
+    cpdef read_list(self, np.ndarray[np.uint32_t] variant_idxs, np.ndarray geno_int_out, uint32_t allele_idx = 1, bint sample_maj = 0):
+        if geno_int_out.dtype == np.int8:
+            self.read_list_internal8(variant_idxs, geno_int_out, allele_idx, sample_maj)
+        elif geno_int_out.dtype == np.int32:
+            self.read_list_internal32(variant_idxs, geno_int_out, allele_idx, sample_maj)
+        elif geno_int_out.dtype == np.int64:
+            self.read_list_internal64(variant_idxs, geno_int_out, allele_idx, sample_maj)
+        else:
+            raise RuntimeError("Invalid read_list() geno_int_out array element type (int8, int32, or int64 expected).")
+        return
+
+
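+    # With hap_maj=1, each sample contributes two consecutive output rows
+    # (one per haplotype).  Genotypes are transposed in the same tiled
+    # fashion as read_range(), with a parallel bit-transpose of the phase
+    # orientation bits feeding genoarr_phased_to_hap_codes().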
+    cpdef read_alleles_range(self, uint32_t variant_idx_start, uint32_t variant_idx_end, np.ndarray[np.int32_t,mode="c",ndim=2] allele_int32_out, bint hap_maj = 0):
+        # if hap_maj == False, allele_int32_out must have at least
+        #   variant_idx_ct rows, 2 * sample_ct columns
+        # if hap_maj == True, allele_int32_out must have at least 2 * sample_ct
+        #   rows, variant_idx_ct columns
+        if variant_idx_end > self._info_ptr[0].raw_variant_ct:
+            raise RuntimeError("read_alleles_range() variant_idx_end too large (" + str(variant_idx_end) + "; only " + str(self._info_ptr[0].raw_variant_ct) + " in file)")
+        cdef const uintptr_t* subset_include_vec = self._subset_include_vec
+        cdef const uint32_t* subset_cumulative_popcounts = self._subset_cumulative_popcounts
+        cdef pgen_reader_t* pgrp = self._state_ptr
+        cdef uintptr_t* genovec = self._genovec
+        cdef uintptr_t* phasepresent = self._phasepresent
+        cdef uintptr_t* phaseinfo = self._phaseinfo
+        cdef uint32_t variant_idx_ct = variant_idx_end - variant_idx_start
+        cdef uint32_t subset_size = self._subset_size
+        cdef int32_t* main_data_ptr
+        cdef uint32_t variant_idx
+        cdef uint32_t phasepresent_ct
+        cdef pglerr_t reterr
+        if hap_maj == 0:
+            if allele_int32_out.shape[0] < variant_idx_ct:
+                raise RuntimeError("Variant-major read_alleles_range() allele_int32_out buffer has too few rows (" + str(allele_int32_out.shape[0]) + "; (variant_idx_end - variant_idx_start) is " + str(variant_idx_ct) + ")")
+            if allele_int32_out.shape[1] < 2 * subset_size:
+                raise RuntimeError("Variant-major read_alleles_range() allele_int32_out buffer has too few columns (" + str(allele_int32_out.shape[1]) + "; current sample subset has size " + str(subset_size) + ", and column count should be twice that)")
+            for variant_idx in range(variant_idx_start, variant_idx_end):
+                # upgrade to multiallelic version of this function later
+                reterr = pgr_read_refalt1_genovec_hphase_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, variant_idx, pgrp, genovec, phasepresent, phaseinfo, &phasepresent_ct)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_alleles_range() error " + str(reterr))
+                main_data_ptr = <int32_t*>(&(allele_int32_out[(variant_idx - variant_idx_start), 0]))
+                genoarr_phased_to_allele_codes(genovec, phasepresent, phaseinfo, subset_size, phasepresent_ct, NULL, main_data_ptr)
+            return
+        if variant_idx_start >= variant_idx_end:
+            raise RuntimeError("read_alleles_range() variant_idx_start >= variant_idx_end (" + str(variant_idx_start) + ", " + str(variant_idx_end) + ")")
+        if allele_int32_out.shape[0] < 2 * subset_size:
+            raise RuntimeError("Haplotype-major read_alleles_range() allele_int32_out buffer has too few rows (" + str(allele_int32_out.shape[0]) + "; current sample subset has size " + str(subset_size) + ", and row count should be twice that)")
+        if allele_int32_out.shape[1] < variant_idx_ct:
+            raise RuntimeError("Haplotype-major read_alleles_range() allele_int32_out buffer has too few columns (" + str(allele_int32_out.shape[1]) + "; (variant_idx_end - variant_idx_start) is " + str(variant_idx_ct) + ")")
+        cdef uint32_t variant_batch_ct = DIV_UP(variant_idx_ct, kPglQuaterTransposeBatch)
+        cdef uint32_t variant_batch_size = kPglQuaterTransposeBatch
+        cdef uint32_t variant_idx_offset = variant_idx_start
+        cdef uint32_t sample_ctaw2 = kWordsPerVec * DIV_UP(subset_size, kBitsPerWordD2)
+        cdef uint32_t sample_ctaw = kWordsPerVec * DIV_UP(subset_size, kBitsPerWord)
+        cdef uint32_t sample_batch_ct = DIV_UP(subset_size, kPglQuaterTransposeBatch)
+        cdef vul_t* transpose_batch_buf = self._transpose_batch_buf
+        cdef uintptr_t* multivar_vmaj_geno_buf = self._multivar_vmaj_geno_buf
+        cdef uintptr_t* multivar_vmaj_phaseinfo_buf = self._multivar_vmaj_phaseinfo_buf
+        cdef uintptr_t* multivar_smaj_geno_batch_buf = self._multivar_smaj_geno_batch_buf
+        cdef uintptr_t* multivar_smaj_phaseinfo_batch_buf = self._multivar_smaj_phaseinfo_batch_buf
+        cdef uintptr_t* vmaj_geno_iter
+        cdef uintptr_t* vmaj_phaseinfo_iter
+        cdef uintptr_t* smaj_geno_iter
+        cdef uintptr_t* smaj_phaseinfo_iter
+        cdef int32_t* main_data1_ptr
+        cdef uint32_t variant_batch_idx
+        cdef uint32_t sample_batch_size
+        cdef uint32_t sample_batch_idx
+        cdef uint32_t uii
+        for variant_batch_idx in range(variant_batch_ct):
+            if variant_batch_idx == (variant_batch_ct - 1):
+                variant_batch_size = 1 + <uint32_t>((variant_idx_ct - 1) % kPglQuaterTransposeBatch)
+            vmaj_geno_iter = multivar_vmaj_geno_buf
+            vmaj_phaseinfo_iter = multivar_vmaj_phaseinfo_buf
+            for uii in range(variant_batch_size):
+                reterr = pgr_read_refalt1_genovec_hphase_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, uii + variant_idx_offset, pgrp, vmaj_geno_iter, phasepresent, vmaj_phaseinfo_iter, &phasepresent_ct)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_alleles_range() error " + str(reterr))
+                if phasepresent_ct == 0:
+                    fill_ulong_zero(sample_ctaw, vmaj_phaseinfo_iter)
+                # else:
+                    # bitvec_and(phasepresent, sample_ctaw, vmaj_phaseinfo_iter)
+                vmaj_geno_iter = &(vmaj_geno_iter[sample_ctaw2])
+                vmaj_phaseinfo_iter = &(vmaj_phaseinfo_iter[sample_ctaw])
+            sample_batch_size = kPglQuaterTransposeBatch
+            vmaj_geno_iter = multivar_vmaj_geno_buf
+            vmaj_phaseinfo_iter = multivar_vmaj_phaseinfo_buf
+            for sample_batch_idx in range(sample_batch_ct):
+                if sample_batch_idx == sample_batch_ct - 1:
+                    sample_batch_size = 1 + <uint32_t>((subset_size - 1) % kPglQuaterTransposeBatch)
+                smaj_geno_iter = multivar_smaj_geno_batch_buf
+                smaj_phaseinfo_iter = multivar_smaj_phaseinfo_batch_buf
+                transpose_quaterblock(vmaj_geno_iter, sample_ctaw2, kPglQuaterTransposeWords, variant_batch_size, sample_batch_size, smaj_geno_iter, transpose_batch_buf)
+                # todo: skip bitblock transpose when all phasepresent_ct values
+                #       are zero, etc.
+                transpose_bitblock(vmaj_phaseinfo_iter, sample_ctaw, <uint32_t>(kPglQuaterTransposeWords / 2), variant_batch_size, sample_batch_size, smaj_phaseinfo_iter, transpose_batch_buf)
+                for uii in range(sample_batch_size):
+                    main_data_ptr = <int32_t*>(&(allele_int32_out[2 * (uii + sample_batch_idx * kPglQuaterTransposeWords), variant_batch_idx * kPglQuaterTransposeBatch]))
+                    main_data1_ptr = <int32_t*>(&(allele_int32_out[2 * (uii + sample_batch_idx * kPglQuaterTransposeWords) + 1, variant_batch_idx * kPglQuaterTransposeBatch]))
+                    genoarr_phased_to_hap_codes(smaj_geno_iter, smaj_phaseinfo_iter, variant_batch_size, main_data_ptr, main_data1_ptr)
+                    smaj_geno_iter = &(smaj_geno_iter[kPglQuaterTransposeWords])
+                    smaj_phaseinfo_iter = &(smaj_phaseinfo_iter[kPglQuaterTransposeWords / 2])
+                vmaj_geno_iter = &(vmaj_geno_iter[kPglQuaterTransposeWords])
+                vmaj_phaseinfo_iter = &(vmaj_phaseinfo_iter[kPglQuaterTransposeWords / 2])
+            variant_idx_offset += kPglQuaterTransposeBatch
+        return
+
+
+    cpdef read_alleles_list(self, np.ndarray[np.uint32_t] variant_idxs, np.ndarray[np.int32_t,mode="c",ndim=2] allele_int32_out, bint hap_maj = 0):
+        # if hap_maj == False, allele_int32_out must have at least
+        #   variant_idx_ct rows, 2 * sample_ct columns
+        # if hap_maj == True, allele_int32_out must have at least 2 * sample_ct
+        #   rows, variant_idx_ct columns
+        cdef uint32_t raw_variant_ct = self._info_ptr[0].raw_variant_ct
+        cdef const uintptr_t* subset_include_vec = self._subset_include_vec
+        cdef const uint32_t* subset_cumulative_popcounts = self._subset_cumulative_popcounts
+        cdef pgen_reader_t* pgrp = self._state_ptr
+        cdef uintptr_t* genovec = self._genovec
+        cdef uintptr_t* phasepresent = self._phasepresent
+        cdef uintptr_t* phaseinfo = self._phaseinfo
+        cdef uint32_t variant_idx_ct = <uint32_t>variant_idxs.shape[0]
+        cdef uint32_t subset_size = self._subset_size
+        cdef int32_t* main_data_ptr
+        cdef uint32_t variant_list_idx
+        cdef uint32_t variant_idx
+        cdef uint32_t phasepresent_ct
+        cdef pglerr_t reterr
+        if hap_maj == 0:
+            if allele_int32_out.shape[0] < variant_idx_ct:
+                raise RuntimeError("Variant-major read_alleles_list() allele_int32_out buffer has too few rows (" + str(allele_int32_out.shape[0]) + "; variant_idxs length is " + str(variant_idx_ct) + ")")
+            if allele_int32_out.shape[1] < 2 * subset_size:
+                raise RuntimeError("Variant-major read_alleles_list() allele_int32_out buffer has too few columns (" + str(allele_int32_out.shape[1]) + "; current sample subset has size " + str(subset_size) + ", and column count should be twice that)")
+            for variant_list_idx in range(variant_idx_ct):
+                variant_idx = variant_idxs[variant_list_idx]
+                if variant_idx >= raw_variant_ct:
+                    raise RuntimeError("read_alleles_list() variant index too large (" + str(variant_idx) + "; only " + str(raw_variant_ct) + " in file)")
+                # upgrade to multiallelic version of this function later
+                reterr = pgr_read_refalt1_genovec_hphase_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, variant_idx, pgrp, genovec, phasepresent, phaseinfo, &phasepresent_ct)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_alleles_list() error " + str(reterr))
+                main_data_ptr = <int32_t*>(&(allele_int32_out[variant_list_idx, 0]))
+                genoarr_phased_to_allele_codes(genovec, phasepresent, phaseinfo, subset_size, phasepresent_ct, NULL, main_data_ptr)
+            return
+        if allele_int32_out.shape[0] < 2 * subset_size:
+            raise RuntimeError("Haplotype-major read_alleles_list() allele_int32_out buffer has too few rows (" + str(allele_int32_out.shape[0]) + "; current sample subset has size " + str(subset_size) + ", and row count should be twice that)")
+        if allele_int32_out.shape[1] < variant_idx_ct:
+            raise RuntimeError("Haplotype-major read_alleles_list() allele_int32_out buffer has too few columns (" + str(allele_int32_out.shape[1]) + "; variant_idxs length is " + str(variant_idx_ct) + ")")
+        cdef uint32_t variant_batch_ct = DIV_UP(variant_idx_ct, kPglQuaterTransposeBatch)
+        cdef uint32_t variant_batch_size = kPglQuaterTransposeBatch
+        cdef uint32_t sample_ctaw2 = kWordsPerVec * DIV_UP(subset_size, kBitsPerWordD2)
+        cdef uint32_t sample_ctaw = kWordsPerVec * DIV_UP(subset_size, kBitsPerWord)
+        cdef uint32_t sample_batch_ct = DIV_UP(subset_size, kPglQuaterTransposeBatch)
+        cdef vul_t* transpose_batch_buf = self._transpose_batch_buf
+        cdef uintptr_t* multivar_vmaj_geno_buf = self._multivar_vmaj_geno_buf
+        cdef uintptr_t* multivar_vmaj_phaseinfo_buf = self._multivar_vmaj_phaseinfo_buf
+        cdef uintptr_t* multivar_smaj_geno_batch_buf = self._multivar_smaj_geno_batch_buf
+        cdef uintptr_t* multivar_smaj_phaseinfo_batch_buf = self._multivar_smaj_phaseinfo_batch_buf
+        cdef uintptr_t* vmaj_geno_iter
+        cdef uintptr_t* vmaj_phaseinfo_iter
+        cdef uintptr_t* smaj_geno_iter
+        cdef uintptr_t* smaj_phaseinfo_iter
+        cdef int32_t* main_data1_ptr
+        cdef uint32_t variant_batch_idx
+        cdef uint32_t sample_batch_size
+        cdef uint32_t sample_batch_idx
+        cdef uint32_t uii
+        for variant_batch_idx in range(variant_batch_ct):
+            if variant_batch_idx == (variant_batch_ct - 1):
+                variant_batch_size = 1 + <uint32_t>((variant_idx_ct - 1) % kPglQuaterTransposeBatch)
+            vmaj_geno_iter = multivar_vmaj_geno_buf
+            vmaj_phaseinfo_iter = multivar_vmaj_phaseinfo_buf
+            for variant_list_idx in range(variant_batch_idx * kPglQuaterTransposeBatch, variant_batch_idx * kPglQuaterTransposeBatch + variant_batch_size):
+                variant_idx = variant_idxs[variant_list_idx]
+                if variant_idx >= raw_variant_ct:
+                    raise RuntimeError("read_alleles_list() variant index too large (" + str(variant_idx) + "; only " + str(raw_variant_ct) + " in file)")
+                reterr = pgr_read_refalt1_genovec_hphase_subset_unsafe(subset_include_vec, subset_cumulative_popcounts, subset_size, variant_idx, pgrp, vmaj_geno_iter, phasepresent, vmaj_phaseinfo_iter, &phasepresent_ct)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("read_alleles_list() error " + str(reterr))
+                if phasepresent_ct == 0:
+                    fill_ulong_zero(sample_ctaw, vmaj_phaseinfo_iter)
+                # else:
+                    # bitvec_and(phasepresent, sample_ctaw, vmaj_phaseinfo_iter)
+                vmaj_geno_iter = &(vmaj_geno_iter[sample_ctaw2])
+                vmaj_phaseinfo_iter = &(vmaj_phaseinfo_iter[sample_ctaw])
+            sample_batch_size = kPglQuaterTransposeBatch
+            vmaj_geno_iter = multivar_vmaj_geno_buf
+            vmaj_phaseinfo_iter = multivar_vmaj_phaseinfo_buf
+            for sample_batch_idx in range(sample_batch_ct):
+                if sample_batch_idx == sample_batch_ct - 1:
+                    sample_batch_size = 1 + <uint32_t>((subset_size - 1) % kPglQuaterTransposeBatch)
+                smaj_geno_iter = multivar_smaj_geno_batch_buf
+                smaj_phaseinfo_iter = multivar_smaj_phaseinfo_batch_buf
+                transpose_quaterblock(vmaj_geno_iter, sample_ctaw2, kPglQuaterTransposeWords, variant_batch_size, sample_batch_size, smaj_geno_iter, transpose_batch_buf)
+                # todo: skip bitblock transpose when all phasepresent_ct values
+                #       are zero, etc.
+                transpose_bitblock(vmaj_phaseinfo_iter, sample_ctaw, <uint32_t>(kPglQuaterTransposeWords / 2), variant_batch_size, sample_batch_size, smaj_phaseinfo_iter, transpose_batch_buf)
+                for uii in range(sample_batch_size):
+                    main_data_ptr = <int32_t*>(&(allele_int32_out[2 * (uii + sample_batch_idx * kPglQuaterTransposeBatch), variant_batch_idx * kPglQuaterTransposeBatch]))
+                    main_data1_ptr = <int32_t*>(&(allele_int32_out[2 * (uii + sample_batch_idx * kPglQuaterTransposeBatch) + 1, variant_batch_idx * kPglQuaterTransposeBatch]))
+                    genoarr_phased_to_hap_codes(smaj_geno_iter, smaj_phaseinfo_iter, variant_batch_size, main_data_ptr, main_data1_ptr)
+                    smaj_geno_iter = &(smaj_geno_iter[kPglQuaterTransposeWords])
+                    smaj_phaseinfo_iter = &(smaj_phaseinfo_iter[kPglQuaterTransposeWords / 2])
+                vmaj_geno_iter = &(vmaj_geno_iter[kPglQuaterTransposeWords])
+                vmaj_phaseinfo_iter = &(vmaj_phaseinfo_iter[kPglQuaterTransposeWords / 2])
+        return
+
+
+    cpdef read_alleles_and_phasepresent_range(self, uint32_t variant_idx_start, uint32_t variant_idx_end, np.ndarray[np.int32_t,mode="c",ndim=2] allele_int32_out, np.ndarray[np.uint8_t,mode="c",cast=True,ndim=2] phasepresent_out, bint hap_maj = 0):
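+        # not implemented yet (see python_api.txt)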
+        pass
+
+
+    cpdef read_alleles_and_phasepresent_list(self, np.ndarray[np.uint32_t] variant_idxs, np.ndarray[np.int32_t,mode="c",ndim=2] allele_int32_out, np.ndarray[np.uint8_t,cast=True,mode="c",ndim=2] phasepresent_out, bint hap_maj = 0):
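+        # not implemented yet (see python_api.txt)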
+        pass
+
+    
+    cpdef count(self, uint32_t variant_idx, np.ndarray[np.uint32_t,mode="c"] genocount_uint32_out, object allele_idx = 1):
+        # todo: multiallelic variants
+        if allele_idx is None:
+            allele_idx = 1
+        cdef uint32_t* data_ptr = <uint32_t*>(&(genocount_uint32_out[0]))
+        cdef pglerr_t reterr = pgr_get_refalt1_genotype_counts(self._subset_include_vec, self._subset_include_interleaved_vec, self._subset_cumulative_popcounts, self._subset_size, variant_idx, self._state_ptr, data_ptr)
+        if reterr != kPglRetSuccess:
+            raise RuntimeError("count() error " + str(reterr))
+        if allele_idx != 0:
+            return
+        cdef uint32_t tmp = data_ptr[0]
+        data_ptr[0] = data_ptr[2]
+        data_ptr[2] = tmp
+        return
+
+    
+    cpdef change_sample_subset(self, object sample_subset = None):
+        if sample_subset is not None:
+            self.set_sample_subset_internal(sample_subset)
+        else:
+            self._subset_size = self._info_ptr[0].raw_sample_ct
+        return
+
+    
+    cpdef close(self):
+        # don't bother propagating file close errors for now
+        if self._info_ptr:
+            pgfi_cleanup(self._info_ptr)
+            if self._info_ptr[0].vrtypes:
+                aligned_free(self._info_ptr[0].vrtypes)
+                if self._state_ptr:
+                    pgr_cleanup(self._state_ptr)
+                    if self._state_ptr[0].fread_buf:
+                        aligned_free(self._state_ptr[0].fread_buf)
+                    PyMem_Free(self._state_ptr)
+                    self._state_ptr = NULL
+            PyMem_Free(self._info_ptr)
+            self._info_ptr = NULL
+        return
+
+    
+    cpdef __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+        return
+
+    
+    def __dealloc__(self):
+        if self._info_ptr:
+            pgfi_cleanup(self._info_ptr)
+            if self._info_ptr[0].vrtypes:
+                aligned_free(self._info_ptr[0].vrtypes)
+                if self._state_ptr:
+                    pgr_cleanup(self._state_ptr)
+                    if self._state_ptr[0].fread_buf:
+                        aligned_free(self._state_ptr[0].fread_buf)
+                    PyMem_Free(self._state_ptr)
+            PyMem_Free(self._info_ptr)
+        return
+
+
+
+cdef bytes_to_bits_internal(np.ndarray[np.uint8_t,mode="c",cast=True] boolbytes, uint32_t sample_ct, uintptr_t* bitarr):
+    bytes_to_bits_unsafe(boolbytes, sample_ct, bitarr)
+
+cdef class PgenWriter:
+    cdef st_pgen_writer_t* _state_ptr
+    cdef uintptr_t* _nonref_flags
+    # preallocate buffers we'll use repeatedly
+    cdef uintptr_t* _genovec
+    cdef uintptr_t* _phasepresent
+    cdef uintptr_t* _phaseinfo
+    cdef uintptr_t* _dosage_present
+    cdef uint16_t* _dosage_vals
+    
+    
+    def __cinit__(self, bytes filename, uint32_t sample_ct,
+                  uint32_t variant_ct, object nonref_flags,
+                  object allele_idx_offsets = None,
+                  bint hardcall_phase_present = False,
+                  bint dosage_present = False,
+                  bint dosage_phase_present = False):
+        if dosage_phase_present and not dosage_present:
+            raise RuntimeError("Invalid arguments for PgenWriter constructor (dosage_phase_present true but dosage_present false).")
+        if allele_idx_offsets is not None:
+            for uii in range(variant_ct + 1):
+                if allele_idx_offsets[uii] != uii * 2:
+                    raise RuntimeError("Multiallelic variants aren't supported by PgenWriter yet.")
+                
+        self._state_ptr = <st_pgen_writer_t*>PyMem_Malloc(sizeof(st_pgen_writer_t))
+        if not self._state_ptr:
+            raise MemoryError()
+        self._nonref_flags = NULL
+        cdef uint32_t nonref_flags_storage = 0
+        cdef uint32_t bitvec_cacheline_ct = DIV_UP(sample_ct, kBitsPerCacheline)
+        if nonref_flags is not None:
+            if isinstance(nonref_flags, bool):
+                if nonref_flags:
+                    nonref_flags_storage = 2
+                else:
+                    nonref_flags_storage = 1
+            else:
+                nonref_flags_storage = 3
+                if cachealigned_malloc(bitvec_cacheline_ct * kCacheline, &(self._nonref_flags)):
+                    raise MemoryError()
+                bytes_to_bits_internal(nonref_flags, sample_ct, self._nonref_flags)
+        cdef const char* fname = <const char*>filename
+        cdef pgen_global_flags_t phase_dosage_gflags = kfPgenGlobal0
+        if hardcall_phase_present:
+            phase_dosage_gflags |= kfPgenGlobalHardcallPhasePresent
+        if dosage_present:
+            phase_dosage_gflags |= kfPgenGlobalDosagePresent
+        assert not dosage_phase_present
+        cdef uintptr_t alloc_cacheline_ct
+        cdef uint32_t max_vrec_len
+        cdef pglerr_t reterr = spgw_init_phase1(fname, NULL, self._nonref_flags, variant_ct, sample_ct, phase_dosage_gflags, nonref_flags_storage, self._state_ptr, &alloc_cacheline_ct, &max_vrec_len)
+        if reterr != kPglRetSuccess:
+            raise RuntimeError("spgw_init_phase1() error " + str(reterr))
+        cdef uint32_t genovec_cacheline_ct = DIV_UP(sample_ct, kQuatersPerCacheline)
+        cdef uint32_t dosage_vals_cacheline_ct = DIV_UP(sample_ct, (2 * kInt32PerCacheline))
+        cdef unsigned char* spgw_alloc
+        if cachealigned_malloc((alloc_cacheline_ct + genovec_cacheline_ct + 3 * bitvec_cacheline_ct + dosage_vals_cacheline_ct) * kCacheline, &spgw_alloc):
+            raise MemoryError()
+        spgw_init_phase2(max_vrec_len, self._state_ptr, spgw_alloc)  
+        self._genovec = <uintptr_t*>(&(spgw_alloc[alloc_cacheline_ct * kCacheline]))
+        self._phasepresent = <uintptr_t*>(&(spgw_alloc[(alloc_cacheline_ct + genovec_cacheline_ct) * kCacheline]))
+        self._phaseinfo = <uintptr_t*>(&(spgw_alloc[(alloc_cacheline_ct + genovec_cacheline_ct + bitvec_cacheline_ct) * kCacheline]))
+        self._dosage_present = <uintptr_t*>(&(spgw_alloc[(alloc_cacheline_ct + genovec_cacheline_ct + 2 * bitvec_cacheline_ct) * kCacheline]))
+        self._dosage_vals = <uint16_t*>(&(spgw_alloc[(alloc_cacheline_ct + genovec_cacheline_ct + 3 * bitvec_cacheline_ct) * kCacheline]))
+        return
+
+    
+    cpdef __enter__(self):
+        return self
+
+
+    cpdef append_biallelic(self, np.ndarray[np.int8_t,mode="c"] geno_int8):
+        cdef int8_t* genobytes = &(geno_int8[0])
+        bytes_to_genoarr_unsafe(genobytes, self._state_ptr[0].pwc.sample_ct, self._genovec)
+        cdef pglerr_t reterr = spgw_append_biallelic_genovec(self._genovec, self._state_ptr)
+        if reterr != kPglRetSuccess:
+            raise RuntimeError("append_biallelic() error " + str(reterr))
+        return
+    
+
+    cpdef append_alleles(self, np.ndarray[np.int32_t,mode="c"] allele_int32, bint all_phased = False):
+        cdef int32_t* allele_codes = <int32_t*>(&(allele_int32[0]))
+        cdef uintptr_t* genovec = self._genovec
+        cdef pglerr_t reterr
+        if not all_phased:
+            allele_codes_to_genoarr_unsafe(allele_codes, NULL, self._state_ptr[0].pwc.sample_ct, genovec, NULL, NULL)
+            reterr = spgw_append_biallelic_genovec(genovec, self._state_ptr)
+        else:
+            allele_codes_to_genoarr_unsafe(allele_codes, NULL, self._state_ptr[0].pwc.sample_ct, genovec, self._phasepresent, self._phaseinfo)
+            reterr = spgw_append_biallelic_genovec_hphase(genovec, self._phasepresent, self._phaseinfo, self._state_ptr)
+        if reterr != kPglRetSuccess:
+            raise RuntimeError("append_alleles() error " + str(reterr))
+        return
+
+    
+    cpdef append_partially_phased(self, np.ndarray[np.int32_t,mode="c"] allele_int32, np.ndarray[np.uint8_t,cast=True] phasepresent):
+        cdef int32_t* allele_codes = <int32_t*>(&(allele_int32[0]))
+        cdef unsigned char* phasepresent_bytes = <unsigned char*>(&(phasepresent[0]))
+        cdef uintptr_t* genovec = self._genovec
+        cdef uintptr_t* phasepresent_buf = self._phasepresent
+        cdef uintptr_t* phaseinfo = self._phaseinfo
+        allele_codes_to_genoarr_unsafe(allele_codes, phasepresent_bytes, self._state_ptr[0].pwc.sample_ct, genovec, phasepresent_buf, phaseinfo)
+        cdef pglerr_t reterr = spgw_append_biallelic_genovec_hphase(genovec, phasepresent_buf, phaseinfo, self._state_ptr)
+        if reterr != kPglRetSuccess:
+            raise RuntimeError("append_partially_phased() error " + str(reterr))
+        return
+
+
+    cdef append_dosages_internal32(self, np.ndarray[np.float32_t,mode="c"] floatarr):
+        cdef uintptr_t* genovec = self._genovec
+        cdef uintptr_t* dosage_present = self._dosage_present
+        cdef uint16_t* dosage_vals = self._dosage_vals
+        cdef uint32_t dosage_ct
+        floats_to_dosage16(<float*>(&(floatarr[0])), self._state_ptr[0].pwc.sample_ct, 6554, genovec, dosage_present, dosage_vals, &dosage_ct)
+        cdef pglerr_t reterr = spgw_append_biallelic_genovec_dosage16(genovec, dosage_present, dosage_vals, dosage_ct, self._state_ptr)
+        if reterr != kPglRetSuccess:
+            raise RuntimeError("append_dosages() error " + str(reterr))
+        return
+
+    cdef append_dosages_internal64(self, np.ndarray[np.float64_t,mode="c"] doublearr):
+        cdef uintptr_t* genovec = self._genovec
+        cdef uintptr_t* dosage_present = self._dosage_present
+        cdef uint16_t* dosage_vals = self._dosage_vals
+        cdef uint32_t dosage_ct
+        doubles_to_dosage16(<double*>(&(doublearr[0])), self._state_ptr[0].pwc.sample_ct, 6554, genovec, dosage_present, dosage_vals, &dosage_ct)
+        cdef pglerr_t reterr = spgw_append_biallelic_genovec_dosage16(genovec, dosage_present, dosage_vals, dosage_ct, self._state_ptr)
+        if reterr != kPglRetSuccess:
+            raise RuntimeError("append_dosages() error " + str(reterr))
+        return
+
+    cpdef append_dosages(self, np.ndarray floatarr):
+        if floatarr.dtype == np.float32:
+            self.append_dosages_internal32(floatarr)
+        elif floatarr.dtype == np.float64:
+            self.append_dosages_internal64(floatarr)
+        else:
+            raise RuntimeError("Invalid append_dosages() dosage array element type (float32 or float64 expected).")
+        return
+
+
+    cpdef append_biallelic_batch(self, np.ndarray[np.int8_t,mode="c",ndim=2] geno_int8_batch):
+        cdef uint32_t batch_size = <uint32_t>geno_int8_batch.shape[0]
+        cdef int8_t* genobytes
+        cdef uint32_t uii
+        cdef pglerr_t reterr
+        for uii in range(batch_size):
+            genobytes = &(geno_int8_batch[uii, 0])
+            bytes_to_genoarr_unsafe(genobytes, self._state_ptr[0].pwc.sample_ct, self._genovec)
+            reterr = spgw_append_biallelic_genovec(self._genovec, self._state_ptr)
+            if reterr != kPglRetSuccess:
+                raise RuntimeError("append_biallelic_batch() error " + str(reterr))
+        return
+    
+
+    cpdef append_alleles_batch(self, np.ndarray[np.int32_t,mode="c",ndim=2] allele_int32_batch, bint all_phased = False):
+        cdef uint32_t batch_size = <uint32_t>allele_int32_batch.shape[0]
+        cdef uintptr_t* genovec = self._genovec
+        cdef int32_t* allele_codes
+        cdef uint32_t uii
+        cdef pglerr_t reterr
+        if not all_phased:
+            for uii in range(batch_size):
+                allele_codes = <int32_t*>(&(allele_int32_batch[uii, 0]))
+                allele_codes_to_genoarr_unsafe(allele_codes, NULL, self._state_ptr[0].pwc.sample_ct, genovec, NULL, NULL)
+                reterr = spgw_append_biallelic_genovec(genovec, self._state_ptr)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("append_alleles_batch() error " + str(reterr))
+        else:
+            for uii in range(batch_size):
+                allele_codes = <int32_t*>(&(allele_int32_batch[uii, 0]))
+                allele_codes_to_genoarr_unsafe(allele_codes, NULL, self._state_ptr[0].pwc.sample_ct, genovec, self._phasepresent, self._phaseinfo)
+                reterr = spgw_append_biallelic_genovec_hphase(genovec, self._phasepresent, self._phaseinfo, self._state_ptr)
+                if reterr != kPglRetSuccess:
+                    raise RuntimeError("append_alleles_batch() error " + str(reterr))
+        return
+
+    
+    cpdef append_partially_phased_batch(self, np.ndarray[np.int32_t,mode="c",ndim=2] allele_int32_batch, np.ndarray[np.uint8_t,mode="c",cast=True,ndim=2] phasepresent_batch):
+        cdef uint32_t batch_size = <uint32_t>allele_int32_batch.shape[0]
+        cdef uintptr_t* genovec = self._genovec
+        cdef uintptr_t* phasepresent_buf = self._phasepresent
+        cdef uintptr_t* phaseinfo = self._phaseinfo
+        cdef int32_t* allele_codes
+        cdef unsigned char* phasepresent_bytes
+        cdef uint32_t uii
+        cdef pglerr_t reterr
+        for uii in range(batch_size):
+            allele_codes = <int32_t*>(&(allele_int32_batch[uii, 0]))
+            phasepresent_bytes = <unsigned char*>(&(phasepresent_batch[uii, 0]))
+            allele_codes_to_genoarr_unsafe(allele_codes, phasepresent_bytes, self._state_ptr[0].pwc.sample_ct, genovec, phasepresent_buf, phaseinfo)
+            reterr = spgw_append_biallelic_genovec_hphase(genovec, phasepresent_buf, phaseinfo, self._state_ptr)
+            if reterr != kPglRetSuccess:
+                raise RuntimeError("append_partially_phased_batch() error " + str(reterr))
+        return
+
+
+    cdef append_dosages_batch_internal32(self, np.ndarray[np.float32_t,mode="c",ndim=2] floatarr_batch):
+        cdef uint32_t batch_size = <uint32_t>floatarr_batch.shape[0]
+        cdef uintptr_t* genovec = self._genovec
+        cdef uintptr_t* dosage_present = self._dosage_present
+        cdef uint16_t* dosage_vals = self._dosage_vals
+        cdef uint32_t dosage_ct
+        cdef uint32_t uii
+        cdef pglerr_t reterr
+        for uii in range(batch_size):
+            floats_to_dosage16(<float*>(&(floatarr_batch[uii, 0])), self._state_ptr[0].pwc.sample_ct, 6554, genovec, dosage_present, dosage_vals, &dosage_ct)
+            reterr = spgw_append_biallelic_genovec_dosage16(genovec, dosage_present, dosage_vals, dosage_ct, self._state_ptr)
+            if reterr != kPglRetSuccess:
+                raise RuntimeError("append_dosages_batch() error " + str(reterr))
+        return
+
+    cdef append_dosages_batch_internal64(self, np.ndarray[np.float64_t,mode="c",ndim=2] doublearr_batch):
+        cdef uint32_t batch_size = <uint32_t>doublearr_batch.shape[0]
+        cdef uintptr_t* genovec = self._genovec
+        cdef uintptr_t* dosage_present = self._dosage_present
+        cdef uint16_t* dosage_vals = self._dosage_vals
+        cdef uint32_t dosage_ct
+        cdef uint32_t uii
+        cdef pglerr_t reterr
+        for uii in range(batch_size):
+            doubles_to_dosage16(<double*>(&(doublearr_batch[uii, 0])), self._state_ptr[0].pwc.sample_ct, 6554, genovec, dosage_present, dosage_vals, &dosage_ct)
+            reterr = spgw_append_biallelic_genovec_dosage16(genovec, dosage_present, dosage_vals, dosage_ct, self._state_ptr)
+            if reterr != kPglRetSuccess:
+                raise RuntimeError("append_dosages_batch() error " + str(reterr))
+        return
+
+    cpdef append_dosages_batch(self, np.ndarray floatarr_batch):
+        if floatarr_batch.dtype == np.float32:
+            self.append_dosages_batch_internal32(floatarr_batch)
+        elif floatarr_batch.dtype == np.float64:
+            self.append_dosages_batch_internal64(floatarr_batch)
+        else:
+            raise RuntimeError("Invalid append_dosages_batch() dosage array element type (float32 or float64 expected).")
+        return
+    
+    
+    cpdef close(self):
+        if self._state_ptr:
+            if self._state_ptr[0].pwc.vidx != self._state_ptr[0].pwc.variant_ct:
+                raise RuntimeError("PgenWriter.close() called when number of written variants (" + str(self._state_ptr[0].pwc.vidx) + ") unequal to initially declared value (" + str(self._state_ptr[0].pwc.variant_ct) + ").")
+            spgw_finish(self._state_ptr)
+            if self._nonref_flags:
+                aligned_free(self._nonref_flags)
+            PyMem_Free(self._state_ptr)
+            self._state_ptr = NULL
+        return
+
+
+    cpdef __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+        return
+
+
+    def __dealloc__(self):
+        if self._state_ptr:
+            if self._state_ptr[0].pwc.vidx == self._state_ptr[0].pwc.variant_ct:
+                spgw_finish(self._state_ptr)
+            else:
+                spgw_cleanup(self._state_ptr)
+            if self._nonref_flags:
+                aligned_free(self._nonref_flags)
+            PyMem_Free(self._state_ptr)
+        return
diff --git a/Python/python_api.txt b/Python/python_api.txt
new file mode 100644
index 0000000..435432e
--- /dev/null
+++ b/Python/python_api.txt
@@ -0,0 +1,216 @@
+Pgenlib Python API specification, v0.7
+("import numpy as np" assumed)
+
+class PgenReader:
+* PgenReader(filename, raw_sample_ct = None, variant_ct = None,
+             sample_subset = None)
+  Constructor; opens the .pgen or .bed file.  Requires a filename (an
+  exception is thrown if the file doesn't exist or is invalid).
+  - raw_sample_ct is required for a .bed file (otherwise an exception will be
+    thrown), and optional for a .pgen file.  If it's provided for a .pgen
+    file, an exception is thrown when it does not match the value stored in
+    the .pgen.
+  - variant_ct is always optional.  An exception is thrown if it does not match
+    the value implied by the .bed file's size, or the explicitly stored value
+    in the .pgen.
+  - sample_subset is an optional numpy uint32 array of (0-based) indexes in
+    increasing order, telling the reader to only consider the specified samples
+    when loading or counting genotypes.  For example, if your .fam file looks
+    something like
+      40184_187545456 40184_187545456 0       0       1       -9
+      40195_187545457 40195_187545457 0       0       1       -9
+      40206_187545458 40206_187545458 0       0       1       -9
+      40217_187545459 40217_187545459 0       0       2       -9
+      40228_187545460 40228_187545460 0       0       1       -9
+      40239_187545461 40239_187545461 0       0       1       -9
+      40250_187520807 40250_187520807 0       0       1       -9
+      40261_187520806 40261_187520806 0       0       1       -9
+      40272_187520805 40272_187520805 0       0       2       -9
+      40283_187520804 40283_187520804 0       0       1       -9
+    then a [1, 4, 5, 6] array specifies samples 40195_187545457,
+    40228_187545460, 40239_187545461, and 40250_187520807.
+    None indicates that all samples should be included.  Otherwise, an
+    exception is thrown if any value is not an integer in 0..(raw_sample_ct -
+    1), or the values are not strictly increasing, or the array is empty.
+    (Empty array might be permitted in the future.)
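+  - Example sketch (the file name is illustrative; a bytes filename is
+    assumed, matching PgenWriter):
+      import numpy as np
+      import pgenlib
+      subset = np.array([1, 4, 5, 6], dtype=np.uint32)
+      with pgenlib.PgenReader(b"test.pgen", sample_subset=subset) as pr:
+          sample_ct = subset.shape[0]
+          variant_ct = pr.get_variant_ct()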
+
+* get_raw_sample_ct()
+  Returns the number of samples in the .pgen file.
+
+* get_variant_ct()
+  Returns the number of variants in the .pgen file.
+
+* hardcall_phase_present()
+  Returns True iff phased hardcalls may be present.
+
+* read(uint32_t variant_idx, np.ndarray[np.int{8,32,64}_t] geno_int_out,
+       uint32_t allele_idx = 1)
+  Takes a (0-based) variant index; fills geno_int_out, which must be a numpy
+  int8, int32, or int64 array with at least sample_ct entries, where sample_ct
+  (as distinguished from raw_sample_ct) is the size of the current sample
+  subset.  These values are in {0, 1, 2, -9}; by default, 0/1/2 represents the
+  alternate allele count.  Setting allele_idx = 0 causes reference allele
+  counts to be reported instead.  (If we ever use multiallelic variant records,
+  setting allele_idx = 2 causes alternate allele 2 counts, etc.)
+  - I tried the interface without the required outbuffer, and it could take
+    twice as long due to all the extra allocation/deallocation.  The buffer can
+    be allocated with
+      buf = np.empty(sample_ct, np.int{8,32,64})
+  - An exception is thrown if variant_idx or allele_idx is invalid (or there's
+    e.g. a file I/O error, or the file isn't open)
+  - Note that pgenlib is unaware of chromosome boundaries.  The caller is
+    responsible for chrX/chrY/chrM-specific corrections for now.
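+  - Example sketch (pr is an open PgenReader, as above; variant index 42 is
+    arbitrary):
+      geno_buf = np.empty(sample_ct, np.int8)
+      pr.read(42, geno_buf)      # alt1 allele counts for variant 42
+      pr.read(42, geno_buf, 0)   # reference allele counts instead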
+
+* read_dosages(uint32_t variant_idx,
+               np.ndarray[np.float{32,64}_t] floatarr_out,
+               uint32_t allele_idx = 1)
+  Takes a (0-based) variant index; fills floatarr_out, which must be a numpy
+  float32 or float64 array with at least sample_ct entries.  Missing entries
+  are encoded as -9; every other entry is a dosage in [0, 2] (more precisely,
+  a multiple of 2^{-14}).
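+  - Example sketch (pr is an open PgenReader):
+      dosage_buf = np.empty(sample_ct, np.float32)
+      pr.read_dosages(42, dosage_buf)
+      missing_mask = (dosage_buf == -9)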
+
+* read_alleles(uint32_t variant_idx, np.ndarray[np.int32_t] allele_int32_out)
+* read_alleles_and_phasepresent(uint32_t variant_idx,
+                                np.ndarray[np.int32_t] allele_int32_out,
+                                np.ndarray[np.uint8_t,cast=True] phasepresent_out)
+  Takes a (0-based) variant index.
+  - allele_int32_out must have space for at least (2 * sample_ct) allele
+    indexes, where elements 2n and (2n+1) correspond to sample n.  Both indexes
+    are -9 if the genotype is missing.  If the genotype is unphased, the lower
+    index appears first.
+  - For read_alleles_and_phasepresent(), phasepresent_out should be a numpy
+    bool_, int8, or uint8 array.  Element n is set to true iff the genotype for
+    sample n has known phase.  Most of these values will be true even when the
+    raw data is unphased, because homozygous genotypes always have known phase.
+    (Missing genotypes are considered to have unknown phase, of course.)
+  - If int32_t is problematically large, you probably want to work with a 2-bit
+    representation rather than int8_t.  That's outside the scope of this API;
+    you may instead want to adapt the appropriate pgenlib.pyx function.
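+  - Example sketch (pr is an open PgenReader):
+      allele_buf = np.empty(2 * sample_ct, np.int32)
+      phase_buf = np.empty(sample_ct, np.bool_)
+      pr.read_alleles_and_phasepresent(42, allele_buf, phase_buf)
+      # sample n's alleles are allele_buf[2*n] and allele_buf[2*n+1]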
+
+* read_range(uint32_t variant_idx_start, uint32_t variant_idx_end,
+             np.ndarray[np.int{8,32,64}_t,mode="c",ndim=2] geno_int_out,
+             uint32_t allele_idx = 1, bint sample_maj = 0)
+  read_list(np.ndarray[np.uint32_t] variant_idxs,
+            np.ndarray[np.int{8,32,64}_t,mode="c",ndim=2] geno_int_out,
+            uint32_t allele_idx = 1, bint sample_maj = 0)
+  read_alleles_range(uint32_t variant_idx_start, uint32_t variant_idx_end,
+                     np.ndarray[np.int32_t,mode="c",ndim=2] allele_int32_out,
+                     bint hap_maj = 0)
+  read_alleles_list(np.ndarray[np.uint32_t] variant_idxs,
+                    np.ndarray[np.int32_t,mode="c",ndim=2] allele_int32_out,
+                    bint hap_maj = 0)
+  read_alleles_and_phasepresent_range(uint32_t variant_idx_start,
+                                      uint32_t variant_idx_end,
+                                      np.ndarray[np.int32_t,mode="c",ndim=2] allele_int32_out,
+                                      np.ndarray[np.uint8_t,mode="c",cast=True,ndim=2] phasepresent_out,
+                                      bint hap_maj = 0)
+  read_alleles_and_phasepresent_list(np.ndarray[np.uint32_t] variant_idxs,
+                                     np.ndarray[np.int32_t,mode="c",ndim=2] allele_int32_out,
+                                     np.ndarray[np.uint8_t,mode="c",cast=True,ndim=2] phasepresent_out,
+                                     bint hap_maj = 0)
+  Read functions which handle multiple variants at once, saving data to 2D
+  arrays.  By default, the return arrays are variant-major (# of rows equals
+  the number of variants read, # of columns is based on sample_ct); set
+  sample_maj(/hap_maj) = True to make them sample(/haplotype)-major instead.
+  For the _range() functions, the interval is half-open (i.e. variant_idx_start
+  is included, variant_idx_end - 1 is included, variant_idx_end is not).  For
+  the _list() functions, it's currently okay for the variant indexes to be
+  unsorted, or for duplicates to be present, but that may not remain true.
+  (read_alleles_and_phasepresent_{range,list} not implemented yet.)
+  (Todo: read_dosages_range() and read_dosages_list().)
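+  Example sketch (pr is an open PgenReader; the block size of 100 is
+  arbitrary):
+      geno_block = np.empty((100, sample_ct), np.int8)
+      pr.read_range(0, 100, geno_block)
+      geno_smaj = np.empty((sample_ct, 100), np.int8)
+      pr.read_range(0, 100, geno_smaj, sample_maj = True)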
+
+* count(uint32_t variant_idx, np.ndarray[np.uint32_t] genocount_uint32_out,
+        uint32_t allele_idx = 1)
+  Takes a (0-based) variant index, and fills genocount_uint32_out (which must
+  have at least 4 slots).  The first element is the number of genotypes with
+  zero alt1 alleles, the second is the number with exactly one alt1 allele,
+  the third is the homozygous-alt1 count, and the fourth is the number of
+  missing genotypes.
+  - Setting allele_idx = 0 replaces "alt1" with "ref" above.
+  - Setting allele_idx = None is functionally equivalent to allele_idx = 1 on
+    all-biallelic data, but the length of the array is no longer limited to 4
+    for multiallelic variants; instead it has 1 + v(v+1)/2 elements where v is
+    the number of alleles, in 0/0, 0/1, 1/1, 0/2, 1/2, 2/2, 0/3, ...,
+    (v-1)/(v-1), ./. order.  (This matches VCF FORMAT:GL order.)
+  (Todo: check whether multi-variant analogues of read/read_phased/count have
+  any value.)
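+  Example sketch (pr is an open PgenReader, all variants biallelic):
+      counts = np.empty(4, np.uint32)
+      pr.count(42, counts)
+      # counts now holds [hom-ref, het, hom-alt1, missing]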
+
+* phased_samples() [not written yet]
+  If phase set data is present, or the entire dataset is unphased, returns a
+  numpy bool_ array where element n is true iff sample n has at least one phase
+  set.  Throws an exception if phased variants are present, but no phase set
+  data track is present.  (Note that the phase set data track is not
+  implemented yet, but it should be within the next few months.)
+
+* change_sample_subset(sample_subset = None)
+  Changes the subset of samples to read.
+  - sample_subset format is the same as for the constructor.  (This isn't
+    really optimized; it's assumed that you won't be calling this often.  If
+    you ever need a higher-performance version of this, let me know.)
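+  - Example sketch:
+      pr.change_sample_subset(np.array([0, 2, 3], dtype=np.uint32))
+      pr.change_sample_subset()  # revert to all samples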
+
+* close()
+  Closes the file, no further read operations can be performed.
+  - Not strictly necessary when e.g. a with-statement is used, but still a good
+    idea in longer scripts since otherwise object cleanup/file close may be
+    delayed.
+
+
+class PgenWriter:
+* PgenWriter(filename, sample_ct, variant_ct, nonref_flags,
+             allele_idx_offsets = None, hardcall_phase_present = False,
+             dosage_present = False, dosage_phase_present = False)
+  Constructor; creates a new .pgen file and writes a mostly-empty header
+  (which gets backfilled at the end).
+  - sample_ct and variant_ct must be positive (and less than about 2^31).
+  - nonref_flags should be True when the data is from an ordinary PLINK 1.x
+    .bed file (where the A2 allele is major rather than consistently
+    reference), False when the A2 allele *is* consistently reference, or a
+    numpy bool_ array of length variant_ct when this is mixed.  (Strictly
+    speaking, None is also permitted--that delegates tracking of nonref
+    information to the .pvar file--but it's discouraged, since .pgen+.bim+.fam
+    is a useful data representation with direct plink2 support.)
+  - allele_idx_offsets is an optional numpy intp array of length
+    (variant_ct+1), where allele_idx_offsets[0] = 0, and the number of alleles
+    for (0-based) variant n is (allele_idx_offsets[n+1]-allele_idx_offsets[n]).
+    # of alleles must be at least 2 for each variant.  If allele_idx_offsets
+    is None, all variants are assumed to be biallelic.
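+  Example sketch (the output file name is illustrative; nonref_flags = False
+  asserts that A2 is consistently reference):
+      with pgenlib.PgenWriter(b"out.pgen", sample_ct, variant_ct,
+                              nonref_flags = False) as pw:
+          ...  # append exactly variant_ct variants here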
+
+* append_biallelic(genobytes)
+  Takes a numpy int8 array of sample_ct elements in {0, 1, 2, -9}, and
+  appends it to the .pgen as a single variant.  It's OK to use 3 in place of
+  -9.
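+  - Example sketch (pw is an open PgenWriter with sample_ct = 4):
+      pw.append_biallelic(np.array([0, 1, 2, -9], np.int8))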
+
+* append_alleles(allele_codes, all_phased = False)
+  append_partially_phased(allele_codes, np.ndarray[np.uint8_t,cast=True] phasepresent)
+  Takes a numpy int32 array with (2 * sample_ct) allele codes (0 = ref,
+  1 = alt1, -9 = missing), and appends it to the .pgen.  -9s must occur in
+  pairs.
+  - With append_alleles(), all genotypes are treated as unphased by default.
+    Set all_phased to True to treat all genotypes as phased instead.
+  - With append_partially_phased(), phasepresent should be a numpy bool_ (or
+    equivalent int8_t/uint8_t; in that case, all values must be 0s and 1s)
+    array of length sample_ct.  Entries for non-heterozygous calls are ignored.
+  - It's fine for an unphased het call to be stored in 1/0 instead of 0/1
+    order.
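+  - Example sketch (pw is an open PgenWriter with sample_ct = 3; each call
+    appends one variant):
+      allele_codes = np.array([0, 1, 1, 1, -9, -9], np.int32)
+      pw.append_alleles(allele_codes)
+      phasepresent = np.array([1, 0, 0], np.uint8)
+      pw.append_partially_phased(allele_codes, phasepresent)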
+
+* append_dosages(floatarr)
+  Takes a numpy float32 or float64 array of dosages, and appends it to the
+  .pgen.
+  - The writer must have been initialized with dosage_present = True.
+  - Regular dosages are expected to be in [0, 2] (you'll want to multiply
+    haploid dosages by 2).
+  - Except for small (2^{-16}) tolerances around 0 and 2, any out-of-range
+    value (including -9) is interpreted as missing.
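+  - Example sketch (pw constructed with dosage_present = True, sample_ct = 3):
+      pw.append_dosages(np.array([0.0, 1.98, -9.0], np.float32))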
+
+* append_biallelic_batch(np.ndarray[np.int8_t,mode="c",ndim=2] genobytes_batch)
+  append_alleles_batch(np.ndarray[np.int32_t,mode="c",ndim=2] allele_codes_batch,
+                       all_phased = False)
+  append_partially_phased_batch(np.ndarray[np.int32_t,mode="c",ndim=2] allele_codes_batch,
+                                np.ndarray[np.{int8,uint8}_t,mode="c",ndim=2] phasepresent_batch)
+  append_dosages_batch(np.ndarray[np.float{32,64}_t,mode="c",ndim=2] floatarr_batch)
+  Multi-variant forms of append_biallelic(), append_alleles(),
+  append_partially_phased(), and append_dosages().  Input matrices must be
+  variant-major.
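+  Example sketch (pw is an open PgenWriter; the batch size of 100 is
+  arbitrary):
+      geno_batch = np.zeros((100, sample_ct), np.int8)
+      pw.append_biallelic_batch(geno_batch)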
+
+* close()
+  Backfills the header and closes the file; no further append operations can be
+  performed.  Throws an exception if the number of written variants is not
+  equal to the initially-declared variant_ct.
diff --git a/Python/setup.py b/Python/setup.py
new file mode 100644
index 0000000..ac887b9
--- /dev/null
+++ b/Python/setup.py
@@ -0,0 +1,26 @@
+#!/usr/bin/python
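+# Typical build command (assumes Cython and NumPy are installed):
+#   python setup.py build_ext --inplace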
+
+from distutils.core import setup
+from distutils.extension import Extension
+from Cython.Build import cythonize
+
+import numpy as np
+
+ext_modules = [
+    Extension('pgenlib',
+              sources = ['pgenlib.pyx', '../pgenlib_python_support.cpp', '../pgenlib_internal.cpp'],
+              language = "c++",
+              # do not compile as c++11, since cython doesn't yet support
+              # overload of uint32_t operator
+              # extra_compile_args = ["-std=c++11", "-Wno-unused-function"],
+              # extra_link_args = ["-std=c++11"],
+              extra_compile_args = ["-std=c++98", "-Wno-unused-function"],
+              extra_link_args = ["-std=c++98"],
+              include_dirs = [np.get_include()]
+              )
+    ]
+
+setup(name = 'Pgenlib',
+      version = '0.7',
+      description = "Wrapper for pgenlib's basic reader and writer.",
+      ext_modules = cythonize(ext_modules))
diff --git a/SFMT.c b/SFMT.c
new file mode 100644
index 0000000..5de9e89
--- /dev/null
+++ b/SFMT.c
@@ -0,0 +1,591 @@
+/**
+ * @file  SFMT.c
+ * @brief SIMD oriented Fast Mersenne Twister(SFMT)
+ *
+ * @author Mutsuo Saito (Hiroshima University)
+ * @author Makoto Matsumoto (Hiroshima University)
+ *
+ * Copyright (C) 2006, 2007 Mutsuo Saito, Makoto Matsumoto and Hiroshima
+ * University.
+ * Copyright (C) 2012 Mutsuo Saito, Makoto Matsumoto, Hiroshima
+ * University and The University of Tokyo.
+ * All rights reserved.
+ *
+ * The 3-clause BSD License is applied to this software:
+ *
+ * * * * * *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the names of Hiroshima University, The University of Tokyo nor the
+ *   names of its contributors may be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * * * * * *
+ */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <string.h>
+#include <assert.h>
+#include "SFMT.h"
+
+#ifndef __LP64__
+inline static void do_recursion(w128_t * r, w128_t * a, w128_t * b,
+				w128_t * c, w128_t * d);
+#endif
+
+#ifndef __LP64__
+  inline static void rshift128(w128_t *out,  w128_t const *in, int shift);
+  inline static void lshift128(w128_t *out,  w128_t const *in, int shift);
+  
+/**
+ * This function simulates SIMD 128-bit right shift by the standard C.
+ * The 128-bit integer given in in is shifted by (shift * 8) bits.
+ * This function simulates the LITTLE ENDIAN SIMD.
+ * @param out the output of this function
+ * @param in the 128-bit data to be shifted
+ * @param shift the shift value
+ */
+inline static void rshift128(w128_t *out, w128_t const *in, int shift)
+{
+    uint64_t th, tl, oh, ol;
+
+    th = ((uint64_t)in->u[3] << 32) | ((uint64_t)in->u[2]);
+    tl = ((uint64_t)in->u[1] << 32) | ((uint64_t)in->u[0]);
+
+    oh = th >> (shift * 8);
+    ol = tl >> (shift * 8);
+    ol |= th << (64 - shift * 8);
+    out->u[1] = (uint32_t)(ol >> 32);
+    out->u[0] = (uint32_t)ol;
+    out->u[3] = (uint32_t)(oh >> 32);
+    out->u[2] = (uint32_t)oh;
+}
+
+/**
+ * This function simulates SIMD 128-bit left shift by the standard C.
+ * The 128-bit integer given in in is shifted by (shift * 8) bits.
+ * This function simulates the LITTLE ENDIAN SIMD.
+ * @param out the output of this function
+ * @param in the 128-bit data to be shifted
+ * @param shift the shift value
+ */
+inline static void lshift128(w128_t *out, w128_t const *in, int shift)
+{
+    uint64_t th, tl, oh, ol;
+
+    th = ((uint64_t)in->u[3] << 32) | ((uint64_t)in->u[2]);
+    tl = ((uint64_t)in->u[1] << 32) | ((uint64_t)in->u[0]);
+
+    oh = th << (shift * 8);
+    ol = tl << (shift * 8);
+    oh |= tl >> (64 - shift * 8);
+    out->u[1] = (uint32_t)(ol >> 32);
+    out->u[0] = (uint32_t)ol;
+    out->u[3] = (uint32_t)(oh >> 32);
+    out->u[2] = (uint32_t)oh;
+}
+
+/**
+ * This function represents the recursion formula.
+ * @param r output
+ * @param a a 128-bit part of the internal state array
+ * @param b a 128-bit part of the internal state array
+ * @param c a 128-bit part of the internal state array
+ * @param d a 128-bit part of the internal state array
+ */
+inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b,
+				w128_t *c, w128_t *d)
+{
+    w128_t x;
+    w128_t y;
+
+    lshift128(&x, a, SFMT_SL2);
+    rshift128(&y, c, SFMT_SR2);
+    r->u[0] = a->u[0] ^ x.u[0] ^ ((b->u[0] >> SFMT_SR1) & SFMT_MSK1)
+	^ y.u[0] ^ (d->u[0] << SFMT_SL1);
+    r->u[1] = a->u[1] ^ x.u[1] ^ ((b->u[1] >> SFMT_SR1) & SFMT_MSK2)
+	^ y.u[1] ^ (d->u[1] << SFMT_SL1);
+    r->u[2] = a->u[2] ^ x.u[2] ^ ((b->u[2] >> SFMT_SR1) & SFMT_MSK3)
+	^ y.u[2] ^ (d->u[2] << SFMT_SL1);
+    r->u[3] = a->u[3] ^ x.u[3] ^ ((b->u[3] >> SFMT_SR1) & SFMT_MSK4)
+	^ y.u[3] ^ (d->u[3] << SFMT_SL1);
+}
+#endif
+
+/**
+ * parameters used by sse2.
+ */
+#ifdef __LP64__
+static const w128_t sse2_param_mask = {{SFMT_MSK1, SFMT_MSK2,
+					SFMT_MSK3, SFMT_MSK4}};
+#endif
+/*----------------
+  STATIC FUNCTIONS
+  ----------------*/
+inline static int idxof(int i);
+inline static void gen_rand_array(sfmt_t * sfmt, w128_t *array, int size);
+inline static uint32_t func1(uint32_t x);
+inline static uint32_t func2(uint32_t x);
+static void period_certification(sfmt_t * sfmt);
+
+#ifdef __LP64__
+inline static void mm_recursion(__m128i * r, __m128i a, __m128i b,
+				__m128i c, __m128i d);
+
+/**
+ * This function represents the recursion formula.
+ * @param r an output
+ * @param a a 128-bit part of the internal state array
+ * @param b a 128-bit part of the internal state array
+ * @param c a 128-bit part of the internal state array
+ * @param d a 128-bit part of the internal state array
+ */
+inline static void mm_recursion(__m128i * r, __m128i a, __m128i b,
+				__m128i c, __m128i d)
+{
+    __m128i v, x, y, z;
+
+    y = _mm_srli_epi32(b, SFMT_SR1);
+    z = _mm_srli_si128(c, SFMT_SR2);
+    v = _mm_slli_epi32(d, SFMT_SL1);
+    z = _mm_xor_si128(z, a);
+    z = _mm_xor_si128(z, v);
+    x = _mm_slli_si128(a, SFMT_SL2);
+    y = _mm_and_si128(y, sse2_param_mask.si);
+    z = _mm_xor_si128(z, x);
+    z = _mm_xor_si128(z, y);
+    *r = z;
+}
+
+/**
+ * This function fills the internal state array with pseudorandom
+ * integers.
+ * @param sfmt SFMT internal state
+ */
+void sfmt_gen_rand_all(sfmt_t * sfmt) {
+    int i;
+    __m128i r1, r2;
+    w128_t * pstate = sfmt->state;
+
+    r1 = pstate[SFMT_N - 2].si;
+    r2 = pstate[SFMT_N - 1].si;
+    for (i = 0; i < SFMT_N - SFMT_POS1; i++) {
+	mm_recursion(&pstate[i].si, pstate[i].si,
+		     pstate[i + SFMT_POS1].si, r1, r2);
+	r1 = r2;
+	r2 = pstate[i].si;
+    }
+    for (; i < SFMT_N; i++) {
+	mm_recursion(&pstate[i].si, pstate[i].si,
+		     pstate[i + SFMT_POS1 - SFMT_N].si,
+		     r1, r2);
+	r1 = r2;
+	r2 = pstate[i].si;
+    }
+}
+
+/**
+ * This function fills the user-specified array with pseudorandom
+ * integers.
+ * @param sfmt SFMT internal state.
+ * @param array a 128-bit array to be filled with pseudorandom numbers.
+ * @param size number of 128-bit pseudorandom numbers to be generated.
+ */
+static void gen_rand_array(sfmt_t * sfmt, w128_t * array, int size)
+{
+    int i, j;
+    __m128i r1, r2;
+    w128_t * pstate = sfmt->state;
+
+    r1 = pstate[SFMT_N - 2].si;
+    r2 = pstate[SFMT_N - 1].si;
+    for (i = 0; i < SFMT_N - SFMT_POS1; i++) {
+	mm_recursion(&array[i].si, pstate[i].si,
+		     pstate[i + SFMT_POS1].si, r1, r2);
+	r1 = r2;
+	r2 = array[i].si;
+    }
+    for (; i < SFMT_N; i++) {
+	mm_recursion(&array[i].si, pstate[i].si,
+		     array[i + SFMT_POS1 - SFMT_N].si, r1, r2);
+	r1 = r2;
+	r2 = array[i].si;
+    }
+    for (; i < size - SFMT_N; i++) {
+	mm_recursion(&array[i].si, array[i - SFMT_N].si,
+		     array[i + SFMT_POS1 - SFMT_N].si, r1, r2);
+	r1 = r2;
+	r2 = array[i].si;
+    }
+    for (j = 0; j < 2 * SFMT_N - size; j++) {
+	pstate[j] = array[j + size - SFMT_N];
+    }
+    for (; i < size; i++, j++) {
+	mm_recursion(&array[i].si, array[i - SFMT_N].si,
+		     array[i + SFMT_POS1 - SFMT_N].si, r1, r2);
+	r1 = r2;
+	r2 = array[i].si;
+	pstate[j] = array[i];
+    }
+}
+
+#endif
+
+/**
+ * This function simulates a 64-bit index of LITTLE ENDIAN
+ * in a BIG ENDIAN machine.
+ */
+inline static int idxof(int i) {
+    return i;
+}
+
+#ifndef __LP64__
+/**
+ * This function fills the user-specified array with pseudorandom
+ * integers.
+ *
+ * @param sfmt SFMT internal state
+ * @param array a 128-bit array to be filled with pseudorandom numbers.
+ * @param size number of 128-bit pseudorandom numbers to be generated.
+ */
+inline static void gen_rand_array(sfmt_t * sfmt, w128_t *array, int size) {
+    int i, j;
+    w128_t *r1, *r2;
+
+    r1 = &sfmt->state[SFMT_N - 2];
+    r2 = &sfmt->state[SFMT_N - 1];
+    for (i = 0; i < SFMT_N - SFMT_POS1; i++) {
+	do_recursion(&array[i], &sfmt->state[i], &sfmt->state[i + SFMT_POS1], r1, r2);
+	r1 = r2;
+	r2 = &array[i];
+    }
+    for (; i < SFMT_N; i++) {
+	do_recursion(&array[i], &sfmt->state[i],
+		     &array[i + SFMT_POS1 - SFMT_N], r1, r2);
+	r1 = r2;
+	r2 = &array[i];
+    }
+    for (; i < size - SFMT_N; i++) {
+	do_recursion(&array[i], &array[i - SFMT_N],
+		     &array[i + SFMT_POS1 - SFMT_N], r1, r2);
+	r1 = r2;
+	r2 = &array[i];
+    }
+    for (j = 0; j < 2 * SFMT_N - size; j++) {
+	sfmt->state[j] = array[j + size - SFMT_N];
+    }
+    for (; i < size; i++, j++) {
+	do_recursion(&array[i], &array[i - SFMT_N],
+		     &array[i + SFMT_POS1 - SFMT_N], r1, r2);
+	r1 = r2;
+	r2 = &array[i];
+	sfmt->state[j] = array[i];
+    }
+}
+#endif
+
+/**
+ * This function represents a function used in the initialization
+ * by init_by_array
+ * @param x 32-bit integer
+ * @return 32-bit integer
+ */
+static uint32_t func1(uint32_t x) {
+    return (x ^ (x >> 27)) * (uint32_t)1664525UL;
+}
+
+/**
+ * This function represents a function used in the initialization
+ * by init_by_array
+ * @param x 32-bit integer
+ * @return 32-bit integer
+ */
+static uint32_t func2(uint32_t x) {
+    return (x ^ (x >> 27)) * (uint32_t)1566083941UL;
+}
+
+/**
+ * This function certifies the period of 2^{MEXP}
+ * @param sfmt SFMT internal state
+ */
+static void period_certification(sfmt_t * sfmt) {
+    int inner = 0;
+    int i, j;
+    uint32_t work;
+    uint32_t *psfmt32 = &sfmt->state[0].u[0];
+    const uint32_t parity[4] = {SFMT_PARITY1, SFMT_PARITY2,
+				SFMT_PARITY3, SFMT_PARITY4};
+
+    for (i = 0; i < 4; i++)
+	inner ^= psfmt32[idxof(i)] & parity[i];
+    for (i = 16; i > 0; i >>= 1)
+	inner ^= inner >> i;
+    inner &= 1;
+    /* check OK */
+    if (inner == 1) {
+	return;
+    }
+    /* check NG, and modification */
+    for (i = 0; i < 4; i++) {
+	work = 1;
+	for (j = 0; j < 32; j++) {
+	    if ((work & parity[i]) != 0) {
+		psfmt32[idxof(i)] ^= work;
+		return;
+	    }
+	    work = work << 1;
+	}
+    }
+}
+
+/*----------------
+  PUBLIC FUNCTIONS
+  ----------------*/
+#define UNUSED_VARIABLE(x) (void)(x)
+/**
+ * This function returns the identification string.
+ * The string shows the word size, the Mersenne exponent,
+ * and all parameters of this generator.
+ * @param sfmt SFMT internal state
+ */
+const char *sfmt_get_idstring(sfmt_t * sfmt) {
+    UNUSED_VARIABLE(sfmt);
+    return SFMT_IDSTR;
+}
+
+/**
+ * This function returns the minimum size of array used for \b
+ * fill_array32() function.
+ * @param sfmt SFMT internal state
+ * @return minimum size of array used for fill_array32() function.
+ */
+int sfmt_get_min_array_size32(sfmt_t * sfmt) {
+    UNUSED_VARIABLE(sfmt);
+    return SFMT_N32;
+}
+
+/**
+ * This function returns the minimum size of array used for \b
+ * fill_array64() function.
+ * @param sfmt SFMT internal state
+ * @return minimum size of array used for fill_array64() function.
+ */
+int sfmt_get_min_array_size64(sfmt_t * sfmt) {
+    UNUSED_VARIABLE(sfmt);
+    return SFMT_N64;
+}
+
+#ifndef __LP64__
+/**
+ * This function fills the internal state array with pseudorandom
+ * integers.
+ * @param sfmt SFMT internal state
+ */
+void sfmt_gen_rand_all(sfmt_t * sfmt) {
+    int i;
+    w128_t *r1, *r2;
+
+    r1 = &sfmt->state[SFMT_N - 2];
+    r2 = &sfmt->state[SFMT_N - 1];
+    for (i = 0; i < SFMT_N - SFMT_POS1; i++) {
+	do_recursion(&sfmt->state[i], &sfmt->state[i],
+		     &sfmt->state[i + SFMT_POS1], r1, r2);
+	r1 = r2;
+	r2 = &sfmt->state[i];
+    }
+    for (; i < SFMT_N; i++) {
+	do_recursion(&sfmt->state[i], &sfmt->state[i],
+		     &sfmt->state[i + SFMT_POS1 - SFMT_N], r1, r2);
+	r1 = r2;
+	r2 = &sfmt->state[i];
+    }
+}
+#endif
+
+/**
+ * This function generates pseudorandom 32-bit integers in the
+ * specified array[] by one call. The number of pseudorandom integers
+ * is specified by the argument size, which must be at least 624 and a
+ * multiple of four.  The generation by this function is much faster
+ * than the following gen_rand function.
+ *
+ * For initialization, init_gen_rand or init_by_array must be called
+ * before the first call of this function. This function can not be
+ * used after calling gen_rand function, without initialization.
+ *
+ * @param sfmt SFMT internal state
+ * @param array an array where pseudorandom 32-bit integers are filled
+ * by this function.  The pointer to the array must be \b "aligned"
+ * (namely, must be a multiple of 16) in the SIMD version, since it
+ * refers to the address of a 128-bit integer.  In the standard C
+ * version, the pointer is arbitrary.
+ *
+ * @param size the number of 32-bit pseudorandom integers to be
+ * generated.  size must be a multiple of 4, and greater than or equal
+ * to (MEXP / 128 + 1) * 4.
+ *
+ * @note \b memalign or \b posix_memalign is available to get aligned
+ * memory. Mac OSX doesn't have these functions, but \b malloc of OSX
+ * returns the pointer to the aligned memory block.
+ */
+void sfmt_fill_array32(sfmt_t * sfmt, uint32_t *array, int size) {
+    assert(sfmt->idx == SFMT_N32);
+    assert(size % 4 == 0);
+    assert(size >= SFMT_N32);
+
+    gen_rand_array(sfmt, (w128_t *)array, size / 4);
+    sfmt->idx = SFMT_N32;
+}
+
+/**
+ * This function generates pseudorandom 64-bit integers in the
+ * specified array[] by one call. The number of pseudorandom integers
+ * is specified by the argument size, which must be at least 312 and a
+ * multiple of two.  The generation by this function is much faster
+ * than the following gen_rand function.
+ *
+ * @param sfmt SFMT internal state
+ * For initialization, init_gen_rand or init_by_array must be called
+ * before the first call of this function. This function can not be
+ * used after calling gen_rand function, without initialization.
+ *
+ * @param array an array where pseudorandom 64-bit integers are filled
+ * by this function.  The pointer to the array must be "aligned"
+ * (namely, must be a multiple of 16) in the SIMD version, since it
+ * refers to the address of a 128-bit integer.  In the standard C
+ * version, the pointer is arbitrary.
+ *
+ * @param size the number of 64-bit pseudorandom integers to be
+ * generated.  size must be a multiple of 2, and greater than or equal
+ * to (MEXP / 128 + 1) * 2
+ *
+ * @note \b memalign or \b posix_memalign is available to get aligned
+ * memory. Mac OSX doesn't have these functions, but \b malloc of OSX
+ * returns the pointer to the aligned memory block.
+ */
+void sfmt_fill_array64(sfmt_t * sfmt, uint64_t *array, int size) {
+    assert(sfmt->idx == SFMT_N32);
+    assert(size % 2 == 0);
+    assert(size >= SFMT_N64);
+
+    gen_rand_array(sfmt, (w128_t *)array, size / 2);
+    sfmt->idx = SFMT_N32;
+
+}
+
+/**
+ * This function initializes the internal state array with a 32-bit
+ * integer seed.
+ *
+ * @param sfmt SFMT internal state
+ * @param seed a 32-bit integer used as the seed.
+ */
+void sfmt_init_gen_rand(sfmt_t * sfmt, uint32_t seed) {
+    int i;
+
+    uint32_t *psfmt32 = &sfmt->state[0].u[0];
+
+    psfmt32[idxof(0)] = seed;
+    for (i = 1; i < SFMT_N32; i++) {
+	psfmt32[idxof(i)] = 1812433253UL * (psfmt32[idxof(i - 1)]
+					    ^ (psfmt32[idxof(i - 1)] >> 30))
+	    + i;
+    }
+    sfmt->idx = SFMT_N32;
+    period_certification(sfmt);
+}
+
+/**
+ * This function initializes the internal state array,
+ * with an array of 32-bit integers used as the seeds
+ * @param sfmt SFMT internal state
+ * @param init_key the array of 32-bit integers, used as a seed.
+ * @param key_length the length of init_key.
+ */
+void sfmt_init_by_array(sfmt_t * sfmt, uint32_t *init_key, int key_length) {
+    int i, j, count;
+    uint32_t r;
+    int lag;
+    int mid;
+    int size = SFMT_N * 4;
+    uint32_t *psfmt32 = &sfmt->state[0].u[0];
+
+    if (size >= 623) {
+	lag = 11;
+    } else if (size >= 68) {
+	lag = 7;
+    } else if (size >= 39) {
+	lag = 5;
+    } else {
+	lag = 3;
+    }
+    mid = (size - lag) / 2;
+
+    memset(sfmt, 0x8b, sizeof(sfmt_t));
+    if (key_length + 1 > SFMT_N32) {
+	count = key_length + 1;
+    } else {
+	count = SFMT_N32;
+    }
+    r = func1(psfmt32[idxof(0)] ^ psfmt32[idxof(mid)]
+	      ^ psfmt32[idxof(SFMT_N32 - 1)]);
+    psfmt32[idxof(mid)] += r;
+    r += key_length;
+    psfmt32[idxof(mid + lag)] += r;
+    psfmt32[idxof(0)] = r;
+
+    count--;
+    for (i = 1, j = 0; (j < count) && (j < key_length); j++) {
+	r = func1(psfmt32[idxof(i)] ^ psfmt32[idxof((i + mid) % SFMT_N32)]
+		  ^ psfmt32[idxof((i + SFMT_N32 - 1) % SFMT_N32)]);
+	psfmt32[idxof((i + mid) % SFMT_N32)] += r;
+	r += init_key[j] + i;
+	psfmt32[idxof((i + mid + lag) % SFMT_N32)] += r;
+	psfmt32[idxof(i)] = r;
+	i = (i + 1) % SFMT_N32;
+    }
+    for (; j < count; j++) {
+	r = func1(psfmt32[idxof(i)] ^ psfmt32[idxof((i + mid) % SFMT_N32)]
+		  ^ psfmt32[idxof((i + SFMT_N32 - 1) % SFMT_N32)]);
+	psfmt32[idxof((i + mid) % SFMT_N32)] += r;
+	r += i;
+	psfmt32[idxof((i + mid + lag) % SFMT_N32)] += r;
+	psfmt32[idxof(i)] = r;
+	i = (i + 1) % SFMT_N32;
+    }
+    for (j = 0; j < SFMT_N32; j++) {
+	r = func2(psfmt32[idxof(i)] + psfmt32[idxof((i + mid) % SFMT_N32)]
+		  + psfmt32[idxof((i + SFMT_N32 - 1) % SFMT_N32)]);
+	psfmt32[idxof((i + mid) % SFMT_N32)] ^= r;
+	r -= i;
+	psfmt32[idxof((i + mid + lag) % SFMT_N32)] ^= r;
+	psfmt32[idxof(i)] = r;
+	i = (i + 1) % SFMT_N32;
+    }
+
+    sfmt->idx = SFMT_N32;
+    period_certification(sfmt);
+}
+#if defined(__cplusplus)
+}
+#endif
diff --git a/SFMT.h b/SFMT.h
new file mode 100644
index 0000000..9be2bc6
--- /dev/null
+++ b/SFMT.h
@@ -0,0 +1,330 @@
+#pragma once
+/**
+ * @file SFMT.h
+ *
+ * @brief SIMD oriented Fast Mersenne Twister(SFMT) pseudorandom
+ * number generator using C structure.
+ *
+ * @author Mutsuo Saito (Hiroshima University)
+ * @author Makoto Matsumoto (The University of Tokyo)
+ *
+ * Copyright (C) 2006, 2007 Mutsuo Saito, Makoto Matsumoto and Hiroshima
+ * University.
+ * Copyright (C) 2012 Mutsuo Saito, Makoto Matsumoto, Hiroshima
+ * University and The University of Tokyo.
+ * All rights reserved.
+ *
+ * The 3-clause BSD License is applied to this software:
+ *
+ * * * * * *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the names of Hiroshima University, The University of Tokyo nor the
+ *   names of its contributors may be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * * * * * *
+ *
+ * @note We assume that your system has inttypes.h.  If your system
+ * doesn't have inttypes.h, you have to typedef uint32_t and uint64_t,
+ * and you have to define PRIu64 and PRIx64 in this file as follows:
+ * @verbatim
+ typedef unsigned int uint32_t
+ typedef unsigned long long uint64_t
+ #define PRIu64 "llu"
+ #define PRIx64 "llx"
+ @endverbatim
+ * uint32_t must be exactly 32-bit unsigned integer type (no more, no
+ * less), and uint64_t must be exactly 64-bit unsigned integer type.
+ * PRIu64 and PRIx64 are used by the printf function to print a 64-bit
+ * unsigned int in decimal and in hexadecimal format, respectively.
+ */
+
+#ifndef SFMTST_H
+#define SFMTST_H
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <assert.h>
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+  #include <inttypes.h>
+#elif defined(_MSC_VER) || defined(__BORLANDC__)
+  typedef unsigned int uint32_t;
+  typedef unsigned __int64 uint64_t;
+  #define inline __inline
+#else
+  #include <inttypes.h>
+  #if defined(__GNUC__)
+    #define inline __inline__
+  #endif
+#endif
+
+#ifndef PRIu64
+  #if defined(_MSC_VER) || defined(__BORLANDC__)
+    #define PRIu64 "I64u"
+    #define PRIx64 "I64x"
+  #else
+    #define PRIu64 "llu"
+    #define PRIx64 "llx"
+  #endif
+#endif
+
+
+  // fold in SFMT-params.h, etc. to reduce compilation time a bit
+#if !defined(SFMT_MEXP)
+  #define SFMT_MEXP 19937
+#endif
+/*-----------------
+  BASIC DEFINITIONS
+  -----------------*/
+/** Mersenne Exponent. The period of the sequence
+ *  is a multiple of 2^MEXP-1.
+ * #define SFMT_MEXP 19937 */
+/** SFMT generator has an internal state array of 128-bit integers,
+ * and N is its size. */
+#define SFMT_N (SFMT_MEXP / 128 + 1)
+/** N32 is the size of internal state array when regarded as an array
+ * of 32-bit integers.*/
+#define SFMT_N32 (SFMT_N * 4)
+/** N64 is the size of internal state array when regarded as an array
+ * of 64-bit integers.*/
+#define SFMT_N64 (SFMT_N * 2)
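+/* Editor's note: with the default SFMT_MEXP of 19937, SFMT_N == 156,
+ * SFMT_N32 == 624, and SFMT_N64 == 312. */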
+#define SFMT_POS1	122
+#define SFMT_SL1	18
+#define SFMT_SL2	1
+#define SFMT_SR1	11
+#define SFMT_SR2	1
+#define SFMT_MSK1	0xdfffffefU
+#define SFMT_MSK2	0xddfecb7fU
+#define SFMT_MSK3	0xbffaffffU
+#define SFMT_MSK4	0xbffffff6U
+#define SFMT_PARITY1	0x00000001U
+#define SFMT_PARITY2	0x00000000U
+#define SFMT_PARITY3	0x00000000U
+#define SFMT_PARITY4	0x13c9e684U
+#define SFMT_IDSTR	"SFMT-19937:122-18-1-11-1:dfffffef-ddfecb7f-bffaffff-bffffff6"
+
+
+/*------------------------------------------
+  128-bit SIMD-like data type for standard C
+  ------------------------------------------*/
+#ifdef __LP64__
+  #include <emmintrin.h>
+
+/** 128-bit data structure */
+union W128_T {
+    uint32_t u[4];
+    uint64_t u64[2];
+    __m128i si;
+};
+#else
+/** 128-bit data structure */
+union W128_T {
+    uint32_t u[4];
+    uint64_t u64[2];
+};
+#endif
+
+/** 128-bit data type */
+typedef union W128_T w128_t;
+
+/**
+ * SFMT internal state
+ */
+struct SFMT_T {
+    /** the 128-bit internal state array */
+    w128_t state[SFMT_N];
+    /** index counter to the 32-bit internal state array */
+    int idx;
+};
+
+typedef struct SFMT_T sfmt_t;
+
+void sfmt_fill_array32(sfmt_t * sfmt, uint32_t * array, int size);
+void sfmt_fill_array64(sfmt_t * sfmt, uint64_t * array, int size);
+void sfmt_init_gen_rand(sfmt_t * sfmt, uint32_t seed);
+void sfmt_init_by_array(sfmt_t * sfmt, uint32_t * init_key, int key_length);
+const char * sfmt_get_idstring(sfmt_t * sfmt);
+int sfmt_get_min_array_size32(sfmt_t * sfmt);
+int sfmt_get_min_array_size64(sfmt_t * sfmt);
+void sfmt_gen_rand_all(sfmt_t * sfmt);
+
+/**
+ * This function generates and returns a 32-bit pseudorandom number.
+ * sfmt_init_gen_rand() or sfmt_init_by_array() must be called before this
+ * function.
+ * @param sfmt SFMT internal state
+ * @return 32-bit pseudorandom number
+ */
+inline static uint32_t sfmt_genrand_uint32(sfmt_t * sfmt) {
+    uint32_t r;
+    uint32_t * psfmt32 = &sfmt->state[0].u[0];
+
+    if (sfmt->idx >= SFMT_N32) {
+	sfmt_gen_rand_all(sfmt);
+	sfmt->idx = 0;
+    }
+    r = psfmt32[sfmt->idx++];
+    return r;
+}
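+/* Illustrative usage (editor's sketch, not part of the upstream SFMT code):
+ *
+ *   sfmt_t sfmt;
+ *   sfmt_init_gen_rand(&sfmt, 4321);          // must seed before generating
+ *   uint32_t r = sfmt_genrand_uint32(&sfmt);  // one 32-bit draw
+ *   double u = sfmt_genrand_real2(&sfmt);     // uniform draw on [0,1)
+ */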
+/**
+ * This function generates and returns a 64-bit pseudorandom number.
+ * sfmt_init_gen_rand() or sfmt_init_by_array() must be called before this
+ * function.
+ * sfmt_genrand_uint64() should not be called after sfmt_genrand_uint32()
+ * unless the generator is reinitialized.
+ * @param sfmt SFMT internal state
+ * @return 64-bit pseudorandom number
+ */
+inline static uint64_t sfmt_genrand_uint64(sfmt_t * sfmt) {
+    uint64_t r;
+    uint64_t * psfmt64 = &sfmt->state[0].u64[0];
+    assert(sfmt->idx % 2 == 0);
+
+    if (sfmt->idx >= SFMT_N32) {
+	sfmt_gen_rand_all(sfmt);
+	sfmt->idx = 0;
+    }
+    r = psfmt64[sfmt->idx / 2];
+    sfmt->idx += 2;
+    return r;
+}
+
+/* =================================================
+   The following real versions are due to Isaku Wada
+   ================================================= */
+/**
+ * converts an unsigned 32-bit number to a double on [0,1]-real-interval.
+ * @param v 32-bit unsigned integer
+ * @return double on [0,1]-real-interval
+ */
+inline static double sfmt_to_real1(uint32_t v)
+{
+    return v * (1.0/4294967295.0);
+    /* divided by 2^32-1 */
+}
+
+/**
+ * generates a random number on [0,1]-real-interval
+ * @param sfmt SFMT internal state
+ * @return double on [0,1]-real-interval
+ */
+inline static double sfmt_genrand_real1(sfmt_t * sfmt)
+{
+    return sfmt_to_real1(sfmt_genrand_uint32(sfmt));
+}
+
+/**
+ * converts an unsigned 32-bit integer to a double on [0,1)-real-interval.
+ * @param v 32-bit unsigned integer
+ * @return double on [0,1)-real-interval
+ */
+inline static double sfmt_to_real2(uint32_t v)
+{
+    return v * (1.0/4294967296.0);
+    /* divided by 2^32 */
+}
+
+/**
+ * generates a random number on [0,1)-real-interval
+ * @param sfmt SFMT internal state
+ * @return double on [0,1)-real-interval
+ */
+inline static double sfmt_genrand_real2(sfmt_t * sfmt)
+{
+    return sfmt_to_real2(sfmt_genrand_uint32(sfmt));
+}
+
+/**
+ * converts an unsigned 32-bit integer to a double on (0,1)-real-interval.
+ * @param v 32-bit unsigned integer
+ * @return double on (0,1)-real-interval
+ */
+inline static double sfmt_to_real3(uint32_t v)
+{
+    return (((double)v) + 0.5)*(1.0/4294967296.0);
+    /* divided by 2^32 */
+}
+
+/**
+ * generates a random number on (0,1)-real-interval
+ * @param sfmt SFMT internal state
+ * @return double on (0,1)-real-interval
+ */
+inline static double sfmt_genrand_real3(sfmt_t * sfmt)
+{
+    return sfmt_to_real3(sfmt_genrand_uint32(sfmt));
+}
+
+/**
+ * converts an unsigned 64-bit integer to a double on [0,1)
+ * with 53-bit resolution.
+ * @param v 64-bit unsigned integer
+ * @return double on [0,1)-real-interval with 53-bit resolution.
+ */
+inline static double sfmt_to_res53(uint64_t v)
+{
+    return v * (1.0/18446744073709551616.0L);
+}
+
+/**
+ * generates a random number on [0,1) with 53-bit resolution
+ * @param sfmt SFMT internal state
+ * @return double on [0,1) with 53-bit resolution
+ */
+inline static double sfmt_genrand_res53(sfmt_t * sfmt)
+{
+    return sfmt_to_res53(sfmt_genrand_uint64(sfmt));
+}
+
+
+/* =================================================
+   The following functions were added by Saito.
+   ================================================= */
+/**
+ * generates a random number on [0,1) with 53-bit resolution from two
+ * 32-bit integers.
+ */
+inline static double sfmt_to_res53_mix(uint32_t x, uint32_t y)
+{
+    return sfmt_to_res53(x | ((uint64_t)y << 32));
+}
+
+/**
+ * generates a random number on [0,1) with 53-bit resolution
+ * using two 32-bit integers.
+ * @param sfmt SFMT internal state
+ * @return double on [0,1) with 53-bit resolution
+ */
+inline static double sfmt_genrand_res53_mix(sfmt_t * sfmt)
+{
+    uint32_t x, y;
+
+    x = sfmt_genrand_uint32(sfmt);
+    y = sfmt_genrand_uint32(sfmt);
+    return sfmt_to_res53_mix(x, y);
+}
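+/* Editor's note: sfmt_to_res53_mix() packs x into the low 32 bits and y into
+ * the high 32 bits of a 64-bit value before rescaling, so the result is
+ * distributed like sfmt_genrand_res53() while consuming two 32-bit outputs
+ * instead of one alignment-sensitive 64-bit output. */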
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/bgzf.c b/bgzf.c
new file mode 100644
index 0000000..cc0cf40
--- /dev/null
+++ b/bgzf.c
@@ -0,0 +1,1195 @@
+/* The MIT License
+
+   Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+                 2011, 2012 Attractive Chaos <attractor at live.co.uk>
+   Copyright (C) 2009, 2013, 2014 Genome Research Ltd
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#define _FILE_OFFSET_BITS 64
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <inttypes.h>
+
+#include "hts.h"
+#include "bgzf.h"
+#include "hfile.h"
+
+#define BGZF_CACHE
+
+#ifndef _WIN32
+  #define BGZF_MT
+#endif
+
+#define BLOCK_HEADER_LENGTH 18
+#define BLOCK_FOOTER_LENGTH 8
+
+
+/* BGZF/GZIP header (specialized from RFC 1952; little-endian):
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ | 31|139|  8|  4|              0|  0|255|      6| 66| 67|      2|BLK_LEN|
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+  BGZF extension:
+                ^                              ^   ^   ^
+                |                              |   |   |
+               FLG.EXTRA                     XLEN  B   C
+
+  The BGZF format is compatible with GZIP. It limits the size of each
+  compressed block to 2^16 bytes and adds an extra "BC" field in the gzip
+  header which records the compressed block size.
+
+*/
+static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
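+/* Editor's sketch (illustrative only): given a complete 18-byte BGZF block
+ * header, the total compressed block length is the little-endian BSIZE
+ * extra field at bytes 16-17, plus one:
+ *
+ *   int block_length = (header[16] | (header[17] << 8)) + 1;
+ */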
+
+#ifdef BGZF_CACHE
+typedef struct {
+    int size;
+    uint8_t *block;
+    int64_t end_offset;
+} cache_t;
+#include "khash.h"
+KHASH_MAP_INIT_INT64(cache, cache_t)
+#endif
+
+typedef struct
+{
+    uint64_t uaddr;  // offset w.r.t. uncompressed data
+    uint64_t caddr;  // offset w.r.t. compressed data
+}
+bgzidx1_t;
+
+struct __bgzidx_t
+{
+    int noffs, moffs;       // the size of the index, n:used, m:allocated
+    bgzidx1_t *offs;        // offsets
+    uint64_t ublock_addr;   // offset of the current block (uncompressed data)
+};
+
+void bgzf_index_destroy(BGZF *fp);
+int bgzf_index_add_block(BGZF *fp);
+
+static inline void packInt16(uint8_t *buffer, uint16_t value)
+{
+    buffer[0] = value;
+    buffer[1] = value >> 8;
+}
+
+static inline int unpackInt16(const uint8_t *buffer)
+{
+    return buffer[0] | buffer[1] << 8;
+}
+
+static inline void packInt32(uint8_t *buffer, uint32_t value)
+{
+    buffer[0] = value;
+    buffer[1] = value >> 8;
+    buffer[2] = value >> 16;
+    buffer[3] = value >> 24;
+}
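+/* Editor's note: these helpers pin the on-disk byte order to little-endian
+ * regardless of host endianness; e.g. packInt32(buf, 0x12345678) yields
+ * buf[0..3] == {0x78, 0x56, 0x34, 0x12}. */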
+
+static BGZF *bgzf_read_init(hFILE *hfpr)
+{
+    BGZF *fp;
+    uint8_t magic[18];
+    ssize_t n = hpeek(hfpr, magic, 18);
+    if (n < 0) return NULL;
+
+    fp = (BGZF*)calloc(1, sizeof(BGZF));
+    if (fp == NULL) return NULL;
+
+    fp->is_write = 0;
+    fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    fp->is_compressed = (n==18 && magic[0]==0x1f && magic[1]==0x8b) ? 1 : 0;
+    fp->is_gzip = ( !fp->is_compressed || ((magic[3]&4) && memcmp(&magic[12], "BC\2\0",4)==0) ) ? 0 : 1;
+#ifdef BGZF_CACHE
+    fp->cache = kh_init(cache);
+#endif
+    return fp;
+}
+
+// get the compress level from the mode string: compress_level==-1 for the default level, -2 for plain uncompressed
+static int mode2level(const char *__restrict mode)
+{
+    int i, compress_level = -1;
+    for (i = 0; mode[i]; ++i)
+        if (mode[i] >= '0' && mode[i] <= '9') break;
+    if (mode[i]) compress_level = (int)mode[i] - '0';
+    if (strchr(mode, 'u')) compress_level = -2;
+    return compress_level;
+}
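+/* Editor's note: e.g. mode2level("w9") == 9, mode2level("w") == -1 (zlib
+ * default), and mode2level("wu") == -2 (uncompressed). */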
+static BGZF *bgzf_write_init(const char *mode)
+{
+    BGZF *fp;
+    {
+	fp = (BGZF*)calloc(1, sizeof(BGZF));
+	if (!fp) {
+	  goto mem_fail1;
+	}
+	fp->is_write = 1;
+	int compress_level = mode2level(mode);
+	if ( compress_level==-2 )
+	{
+	    fp->is_compressed = 0;
+	    return fp;
+	}
+	fp->is_compressed = 1;
+	fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+	if (!fp->uncompressed_block) {
+	  goto mem_fail2;
+	}
+	fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+	if (!fp->compressed_block) {
+	  goto mem_fail3;
+	}
+	fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
+	if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
+	if ( strchr(mode,'g') )
+	{
+	    // gzip output
+	    fp->is_gzip = 1;
+	    fp->gz_stream = (z_stream*)calloc(1,sizeof(z_stream));
+	    if (!fp->gz_stream) {
+	      goto mem_fail4;
+	    }
+	    fp->gz_stream->zalloc = NULL;
+	    fp->gz_stream->zfree  = NULL;
+	    if ( deflateInit2(fp->gz_stream, fp->compress_level, Z_DEFLATED, 15|16, 8, Z_DEFAULT_STRATEGY)!=Z_OK ) return NULL;
+	}
+	return fp;
+    }
+  mem_fail4:
+    free(fp->compressed_block);
+  mem_fail3:
+    free(fp->uncompressed_block);
+  mem_fail2:
+    free(fp);
+  mem_fail1:
+    return NULL;
+}
+
+BGZF *bgzf_open(const char *path, const char *mode)
+{
+    BGZF *fp = 0;
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+    if (strchr(mode, 'r')) {
+        hFILE *fpr;
+        if ((fpr = hopen(path, mode)) == 0) return 0;
+        fp = bgzf_read_init(fpr);
+        if (fp == 0) { hclose_abruptly(fpr); return NULL; }
+        fp->fp = fpr;
+    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
+        hFILE *fpw;
+        if ((fpw = hopen(path, mode)) == 0) return 0;
+        fp = bgzf_write_init(mode);
+        fp->fp = fpw;
+    }
+    else { errno = EINVAL; return 0; }
+
+    fp->is_be = ed_is_big();
+    return fp;
+}
+
+BGZF *bgzf_dopen(int fd, const char *mode)
+{
+    BGZF *fp = 0;
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+    if (strchr(mode, 'r')) {
+        hFILE *fpr;
+        if ((fpr = hdopen(fd, mode)) == 0) return 0;
+        fp = bgzf_read_init(fpr);
+        if (fp == 0) { hclose_abruptly(fpr); return NULL; } // FIXME this closes fd
+        fp->fp = fpr;
+    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
+        hFILE *fpw;
+        if ((fpw = hdopen(fd, mode)) == 0) return 0;
+        fp = bgzf_write_init(mode);
+        fp->fp = fpw;
+    }
+    else { errno = EINVAL; return 0; }
+
+    fp->is_be = ed_is_big();
+    return fp;
+}
+
+BGZF *bgzf_hopen(hFILE *hfp, const char *mode)
+{
+    BGZF *fp = NULL;
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+    if (strchr(mode, 'r')) {
+        fp = bgzf_read_init(hfp);
+        if (fp == NULL) return NULL;
+    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
+        fp = bgzf_write_init(mode);
+    }
+    else { errno = EINVAL; return 0; }
+
+    fp->fp = hfp;
+    fp->is_be = ed_is_big();
+    return fp;
+}
+
+static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level)
+{
+    uint32_t crc;
+    z_stream zs;
+    uint8_t *dst = (uint8_t*)_dst;
+
+    // compress the body
+    zs.zalloc = NULL; zs.zfree = NULL;
+    zs.next_in  = (Bytef*)src;
+    zs.avail_in = slen;
+    zs.next_out = dst + BLOCK_HEADER_LENGTH;
+    zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+    if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer
+    if (deflate(&zs, Z_FINISH) != Z_STREAM_END) return -1;
+    if (deflateEnd(&zs) != Z_OK) return -1;
+    *dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+    // write the header
+    memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
+    packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes
+    // write the footer
+    crc = crc32(crc32(0L, NULL, 0L), (Bytef*)src, slen);
+    packInt32((uint8_t*)&dst[*dlen - 8], crc);
+    packInt32((uint8_t*)&dst[*dlen - 4], slen);
+    return 0;
+}
+
+static int bgzf_gzip_compress(BGZF *fp, void *_dst, int *dlen, void *src, int slen)
+{
+    uint8_t *dst = (uint8_t*)_dst;
+    z_stream *zs = fp->gz_stream;
+    int flush = slen ? Z_NO_FLUSH : Z_FINISH;
+    zs->next_in   = (Bytef*)src;
+    zs->avail_in  = slen;
+    zs->next_out  = dst;
+    zs->avail_out = *dlen;
+    if ( deflate(zs, flush) == Z_STREAM_ERROR ) return -1;
+    *dlen = *dlen - zs->avail_out;
+    return 0;
+}
+
+// Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length.
+static int deflate_block(BGZF *fp, int block_length)
+{
+    int comp_size = BGZF_MAX_BLOCK_SIZE;
+    int ret;
+    if ( !fp->is_gzip )
+        ret = bgzf_compress(fp->compressed_block, &comp_size, fp->uncompressed_block, block_length, fp->compress_level);
+    else
+        ret = bgzf_gzip_compress(fp, fp->compressed_block, &comp_size, fp->uncompressed_block, block_length);
+
+    if ( ret != 0 )
+    {
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    fp->block_offset = 0;
+    return comp_size;
+}
+
+// Inflate the block in fp->compressed_block into fp->uncompressed_block
+static int inflate_block(BGZF* fp, int block_length)
+{
+    z_stream zs;
+    zs.zalloc = NULL;
+    zs.zfree = NULL;
+    zs.next_in = (Bytef*)fp->compressed_block + 18;
+    zs.avail_in = block_length - 16;
+    zs.next_out = (Bytef*)fp->uncompressed_block;
+    zs.avail_out = BGZF_MAX_BLOCK_SIZE;
+
+    if (inflateInit2(&zs, -15) != Z_OK) {
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
+        inflateEnd(&zs);
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    if (inflateEnd(&zs) != Z_OK) {
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    return zs.total_out;
+}
+
+static int inflate_gzip_block(BGZF *fp, int cached)
+{
+    int ret = Z_OK;
+    do
+    {
+        if ( !cached && fp->gz_stream->avail_out!=0 )
+        {
+            fp->gz_stream->avail_in = hread(fp->fp, fp->compressed_block, BGZF_BLOCK_SIZE);
+            if ( fp->gz_stream->avail_in<=0 ) return fp->gz_stream->avail_in;
+            if ( fp->gz_stream->avail_in==0 ) break;
+            fp->gz_stream->next_in = (Bytef*)(fp->compressed_block);
+        }
+        else cached = 0;
+        do
+        {
+            fp->gz_stream->next_out = (Bytef*)fp->uncompressed_block + fp->block_offset;
+            fp->gz_stream->avail_out = BGZF_MAX_BLOCK_SIZE - fp->block_offset;
+            ret = inflate(fp->gz_stream, Z_NO_FLUSH);
+            if ( ret==Z_BUF_ERROR ) continue;   // non-critical error
+            if ( ret<0 ) return -1;
+            unsigned int have = BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out;
+            if ( have ) return have;
+        }
+        while ( fp->gz_stream->avail_out == 0 );
+    }
+    while (ret != Z_STREAM_END);
+    return BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out;
+}
+
+// Returns: 0 on success (BGZF header); -1 on non-BGZF GZIP header; -2 on error
+static int check_header(const uint8_t *header)
+{
+    if ( header[0] != 31 || header[1] != 139 || header[2] != 8 ) return -2;
+    return ((header[3] & 4) != 0
+            && unpackInt16((const uint8_t*)&header[10]) == 6
+            && header[12] == 'B' && header[13] == 'C'
+            && unpackInt16((const uint8_t*)&header[14]) == 2) ? 0 : -1;
+}
+
+#ifdef BGZF_CACHE
+static void free_cache(BGZF *fp)
+{
+    khint_t k;
+    khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+    if (fp->is_write) return;
+    for (k = kh_begin(h); k < kh_end(h); ++k)
+        if (kh_exist(h, k)) free(kh_val(h, k).block);
+    kh_destroy(cache, h);
+}
+
+static int load_block_from_cache(BGZF *fp, int64_t block_address)
+{
+    khint_t k;
+    cache_t *p;
+    khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+    k = kh_get(cache, h, block_address);
+    if (k == kh_end(h)) return 0;
+    p = &kh_val(h, k);
+    if (fp->block_length != 0) fp->block_offset = 0;
+    fp->block_address = block_address;
+    fp->block_length = p->size;
+    memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE);
+    if ( hseek(fp->fp, p->end_offset, SEEK_SET) < 0 )
+    {
+        // todo: move the error up
+#ifdef _WIN32
+        // patch for PRId64 unhappiness
+        fprintf(stderr, "Could not hseek to %I64d\n", p->end_offset);
+#else
+  #if defined(__LP64__) && !defined(__APPLE__)
+        fprintf(stderr, "Could not hseek to %ld\n", p->end_offset);
+  #else
+        fprintf(stderr, "Could not hseek to %lld\n", p->end_offset);
+  #endif
+#endif
+        exit(1);
+    }
+    return p->size;
+}
+
+static void cache_block(BGZF *fp, int size)
+{
+    int ret;
+    khint_t k;
+    cache_t *p;
+    khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+    if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return;
+    if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) {
+        /* A better way would be to remove the oldest block in the
+         * cache, but here we remove a random one for simplicity. This
+         * should not have a big impact on performance. */
+        for (k = kh_begin(h); k < kh_end(h); ++k)
+            if (kh_exist(h, k)) break;
+        if (k < kh_end(h)) {
+            free(kh_val(h, k).block);
+            kh_del(cache, h, k);
+        }
+    }
+    k = kh_put(cache, h, fp->block_address, &ret);
+    if (ret == 0) return; // if this happens, a bug!
+    p = &kh_val(h, k);
+    p->size = fp->block_length;
+    p->end_offset = fp->block_address + size;
+    p->block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE);
+    memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
+}
+#else
+static void free_cache(BGZF *fp) {}
+static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
+static void cache_block(BGZF *fp, int size) {}
+#endif
+
+int bgzf_read_block(BGZF *fp)
+{
+    uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
+    int count, size = 0, block_length, remaining;
+
+    // Reading an uncompressed file
+    if ( !fp->is_compressed )
+    {
+        count = hread(fp->fp, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
+        if ( count==0 )
+        {
+            fp->block_length = 0;
+            return 0;
+        }
+        if (fp->block_length != 0) fp->block_offset = 0;
+        fp->block_address += count;
+        fp->block_length = count;
+        return 0;
+    }
+
+    // Reading compressed file
+    int64_t block_address;
+    block_address = htell(fp->fp);
+    if ( fp->is_gzip && fp->gz_stream ) // is this an initialized gzip stream?
+    {
+        count = inflate_gzip_block(fp, 0);
+        if ( count<0 )
+        {
+            fp->errcode |= BGZF_ERR_ZLIB;
+            return -1;
+        }
+        fp->block_length = count;
+        fp->block_address = block_address;
+        return 0;
+    }
+    if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0;
+    count = hread(fp->fp, header, sizeof(header));
+    if (count == 0) { // no data read
+        fp->block_length = 0;
+        return 0;
+    }
+    int ret;
+    if ( count != sizeof(header) || (ret=check_header(header))==-2 )
+    {
+        fp->errcode |= BGZF_ERR_HEADER;
+        return -1;
+    }
+    if ( ret==-1 )
+    {
+        // GZIP, not BGZF
+        uint8_t *cblock = (uint8_t*)fp->compressed_block;
+        memcpy(cblock, header, sizeof(header));
+        count = hread(fp->fp, cblock+sizeof(header), BGZF_BLOCK_SIZE - sizeof(header)) + sizeof(header);
+        int nskip = 10;
+
+        // Check optional fields to skip: FLG.FNAME,FLG.FCOMMENT,FLG.FHCRC,FLG.FEXTRA
+        // Note: some of these fields are untested; appropriate test data was not available
+        if ( header[3] & 0x4 ) // FLG.FEXTRA
+        {
+            nskip += unpackInt16(&cblock[nskip]) + 2;
+        }
+        if ( header[3] & 0x8 ) // FLG.FNAME
+        {
+            while ( nskip<BGZF_BLOCK_SIZE && cblock[nskip] ) nskip++;
+            if ( nskip==BGZF_BLOCK_SIZE )
+            {
+                fp->errcode |= BGZF_ERR_HEADER;
+                return -1;
+            }
+            nskip++;
+        }
+        if ( header[3] & 0x10 ) // FLG.FCOMMENT
+        {
+            while ( nskip<BGZF_BLOCK_SIZE && cblock[nskip] ) nskip++;
+            if ( nskip==BGZF_BLOCK_SIZE )
+            {
+                fp->errcode |= BGZF_ERR_HEADER;
+                return -1;
+            }
+            nskip++;
+        }
+        if ( header[3] & 0x2 ) nskip += 2;  //  FLG.FHCRC
+
+        fp->is_gzip = 1;
+        fp->gz_stream = (z_stream*) calloc(1,sizeof(z_stream));
+        ret = inflateInit2(fp->gz_stream, -15);
+        if (ret != Z_OK)
+        {
+            fp->errcode |= BGZF_ERR_ZLIB;
+            return -1;
+        }
+        fp->gz_stream->avail_in = count - nskip;
+        fp->gz_stream->next_in  = cblock + nskip;
+        count = inflate_gzip_block(fp, 1);
+        if ( count<0 )
+        {
+            fp->errcode |= BGZF_ERR_ZLIB;
+            return -1;
+        }
+        fp->block_length = count;
+        fp->block_address = block_address;
+        if ( fp->idx_build_otf ) return -1; // cannot build index for gzip
+        return 0;
+    }
+    size = count;
+    block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
+    compressed_block = (uint8_t*)fp->compressed_block;
+    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+    remaining = block_length - BLOCK_HEADER_LENGTH;
+    count = hread(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
+    if (count != remaining) {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    size += count;
+    if ((count = inflate_block(fp, block_length)) < 0) return -1;
+    if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
+    fp->block_address = block_address;
+    fp->block_length = count;
+    if ( fp->idx_build_otf )
+    {
+        bgzf_index_add_block(fp);
+        fp->idx->ublock_addr += count;
+    }
+    cache_block(fp, size);
+    return 0;
+}
+
+ssize_t bgzf_read(BGZF *fp, void *data, size_t length)
+{
+    ssize_t bytes_read = 0;
+    uint8_t *output = (uint8_t*)data;
+    if (length <= 0) return 0;
+    assert(fp->is_write == 0);
+    // kludge to address signed vs. unsigned comparison warning
+    while (bytes_read < ((ssize_t)length)) {
+        int copy_length, available = fp->block_length - fp->block_offset;
+        uint8_t *buffer;
+        if (available <= 0) {
+            if (bgzf_read_block(fp) != 0) return -1;
+            available = fp->block_length - fp->block_offset;
+            if (available <= 0) break;
+        }
+        copy_length = ((ssize_t)(length - bytes_read)) < available? ((ssize_t)(length - bytes_read)) : available;
+        buffer = (uint8_t*)fp->uncompressed_block;
+        memcpy(output, buffer + fp->block_offset, copy_length);
+        fp->block_offset += copy_length;
+        output += copy_length;
+        bytes_read += copy_length;
+    }
+    if (fp->block_offset == fp->block_length) {
+        fp->block_address = htell(fp->fp);
+        fp->block_offset = fp->block_length = 0;
+    }
+    fp->uncompressed_address += bytes_read;
+    return bytes_read;
+}
+
+ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length)
+{
+    return hread(fp->fp, data, length);
+}
+
+#ifdef BGZF_MT
+
+typedef struct {
+    struct bgzf_mtaux_t *mt;
+    void *buf;
+    int i, errcode, toproc, compress_level;
+} worker_t;
+
+typedef struct bgzf_mtaux_t {
+    int n_threads, n_blks, curr, done;
+    volatile int proc_cnt;
+    void **blk;
+    int *len;
+    worker_t *w;
+    pthread_t *tid;
+    pthread_mutex_t lock;
+    pthread_cond_t cv;
+} mtaux_t;
+
+static int worker_aux(worker_t *w)
+{
+    int i, stop = 0;
+    // wait for condition: to process or all done
+    pthread_mutex_lock(&w->mt->lock);
+    while (!w->toproc && !w->mt->done)
+        pthread_cond_wait(&w->mt->cv, &w->mt->lock);
+    if (w->mt->done) stop = 1;
+    w->toproc = 0;
+    pthread_mutex_unlock(&w->mt->lock);
+    if (stop) return 1; // to quit the thread
+    w->errcode = 0;
+    for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) {
+        int clen = BGZF_MAX_BLOCK_SIZE;
+        if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->compress_level) != 0)
+            w->errcode |= BGZF_ERR_ZLIB;
+        memcpy(w->mt->blk[i], w->buf, clen);
+        w->mt->len[i] = clen;
+    }
+    __sync_fetch_and_add(&w->mt->proc_cnt, 1);
+    return 0;
+}
+
+static void *mt_worker(void *data)
+{
+    while (worker_aux((worker_t*)data) == 0);
+    return 0;
+}
+
+int bgzf_mt2(unsigned char* arena_top, int n_threads, int n_sub_blks, unsigned char** arena_bottom_ptr, BGZF* fp)
+{
+    int i;
+    mtaux_t *mt;
+    pthread_attr_t attr;
+    if (!fp->is_write || fp->mt || n_threads <= 1) return -1;
+    mt = (mtaux_t*)calloc(1, sizeof(mtaux_t));
+    if (!mt) {
+        goto mem_fail1;
+    }
+    mt->n_threads = n_threads;
+    mt->n_blks = n_threads * n_sub_blks;
+    mt->len = (int*)calloc(mt->n_blks, sizeof(int));
+    if (!mt->len) {
+        goto mem_fail2;
+    }
+    mt->blk = (void**)calloc(mt->n_blks, sizeof(void*));
+    if (!mt->blk) {
+        goto mem_fail3;
+    }
+    if ((mt->n_blks * ((size_t)2) * BGZF_MAX_BLOCK_SIZE) > ((size_t)(arena_top - (*arena_bottom_ptr)))) {
+        goto mem_fail4;
+    }
+    for (i = 0; i < mt->n_blks; ++i) {
+        mt->blk[i] = (void*)(*arena_bottom_ptr);
+	// assumes BGZF_MAX_BLOCK_SIZE is a multiple of 64
+	*arena_bottom_ptr += BGZF_MAX_BLOCK_SIZE;
+    }
+    mt->tid = (pthread_t*)calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master
+    if (!mt->tid) {
+        goto mem_fail4;
+    }
+    mt->w = (worker_t*)calloc(mt->n_threads, sizeof(worker_t));
+    if (!mt->w) {
+        goto mem_fail5;
+    }
+    for (i = 0; i < mt->n_threads; ++i) {
+        mt->w[i].i = i;
+        mt->w[i].mt = mt;
+        mt->w[i].compress_level = fp->compress_level;
+	mt->w[i].buf = (void*)(*arena_bottom_ptr);
+	*arena_bottom_ptr += BGZF_MAX_BLOCK_SIZE;
+    }
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+    pthread_mutex_init(&mt->lock, 0);
+    pthread_cond_init(&mt->cv, 0);
+    for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread
+        pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]);
+    fp->mt = mt;
+    return 0;
+  mem_fail5:
+    free(mt->tid);
+  mem_fail4:
+    free(mt->blk);
+  mem_fail3:
+    free(mt->len);
+  mem_fail2:
+    free(mt);
+  mem_fail1:
+    return -1;
+}
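+/* Illustrative call sequence (editor's sketch; plink2 normally carves these
+ * buffers out of its "bigstack" arena, simplified here to a single malloc):
+ *
+ *   size_t arena_size = (size_t)2 * n_threads * n_sub_blks * BGZF_MAX_BLOCK_SIZE;
+ *   unsigned char *arena_bottom = (unsigned char*)malloc(arena_size);
+ *   if (arena_bottom && bgzf_mt2(arena_bottom + arena_size, n_threads,
+ *                                n_sub_blks, &arena_bottom, fp) == 0) {
+ *     // fp now compresses blocks with n_threads worker threads
+ *   }
+ */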
+
+static void mt_destroy(mtaux_t *mt)
+{
+    int i;
+    // signal all workers to quit
+    pthread_mutex_lock(&mt->lock);
+    mt->done = 1; mt->proc_cnt = 0;
+    pthread_cond_broadcast(&mt->cv);
+    pthread_mutex_unlock(&mt->lock);
+    for (i = 1; i < mt->n_threads; ++i) pthread_join(mt->tid[i], 0); // worker 0 is effectively launched by the master thread
+    // free other data allocated on heap
+    // for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]);
+    // for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf);
+    free(mt->blk); free(mt->len); free(mt->w); free(mt->tid);
+    pthread_cond_destroy(&mt->cv);
+    pthread_mutex_destroy(&mt->lock);
+    free(mt);
+}
+
+static void mt_queue(BGZF *fp)
+{
+    mtaux_t *mt = fp->mt;
+    // assert(mt->curr < mt->n_blks); // guaranteed by the caller
+    memcpy(mt->blk[mt->curr], fp->uncompressed_block, fp->block_offset);
+    mt->len[mt->curr] = fp->block_offset;
+    fp->block_offset = 0;
+    ++mt->curr;
+}
+
+static int mt_flush_queue(BGZF *fp)
+{
+    int i;
+    mtaux_t *mt = fp->mt;
+    // signal all the workers to compress
+    pthread_mutex_lock(&mt->lock);
+    for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1;
+    mt->proc_cnt = 0;
+    pthread_cond_broadcast(&mt->cv);
+    pthread_mutex_unlock(&mt->lock);
+    // worker 0 is doing things here
+    worker_aux(&mt->w[0]);
+    // wait for all the threads to complete
+    while (mt->proc_cnt < mt->n_threads);
+    // dump data to disk
+    for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode;
+    for (i = 0; i < mt->curr; ++i)
+        if (hwrite(fp->fp, mt->blk[i], mt->len[i]) != mt->len[i]) {
+            fp->errcode |= BGZF_ERR_IO;
+            break;
+        }
+    mt->curr = 0;
+    return (fp->errcode == 0)? 0 : -1;
+}
+
+static int lazy_flush(BGZF *fp)
+{
+    if (fp->mt) {
+        if (fp->block_offset) mt_queue(fp);
+        return (fp->mt->curr < fp->mt->n_blks)? 0 : mt_flush_queue(fp);
+    }
+    else return bgzf_flush(fp);
+}
+
+#else  // ~ #ifdef BGZF_MT
+
+int bgzf_mt(__attribute__((unused)) BGZF *fp, __attribute__((unused)) int n_threads, __attribute__((unused)) int n_sub_blks)
+{
+    return 0;
+}
+
+static inline int lazy_flush(BGZF *fp)
+{
+    return bgzf_flush(fp);
+}
+
+#endif // ~ #ifdef BGZF_MT
+
+int bgzf_flush(BGZF *fp)
+{
+    if (!fp->is_write) return 0;
+#ifdef BGZF_MT
+    if (fp->mt) {
+        if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail
+        return mt_flush_queue(fp);
+    }
+#endif
+    while (fp->block_offset > 0) {
+        if ( fp->idx_build_otf )
+        {
+            bgzf_index_add_block(fp);
+            fp->idx->ublock_addr += fp->block_offset;
+        }
+        int block_length = deflate_block(fp, fp->block_offset);
+        if (block_length < 0) return -1;
+        if (hwrite(fp->fp, fp->compressed_block, block_length) != block_length) {
+            fp->errcode |= BGZF_ERR_IO; // possibly truncated file
+            return -1;
+        }
+        fp->block_address += block_length;
+    }
+    return 0;
+}
+
+int bgzf_flush_try(BGZF *fp, ssize_t size)
+{
+    if (fp->block_offset + size > BGZF_BLOCK_SIZE) return lazy_flush(fp);
+    return 0;
+}
+
+ssize_t bgzf_write(BGZF *fp, const void *data, size_t length)
+{
+    if ( !fp->is_compressed )
+        return hwrite(fp->fp, data, length);
+
+    const uint8_t *input = (const uint8_t*)data;
+    ssize_t remaining = length;
+    // assert(fp->is_write);
+    while (remaining > 0) {
+        uint8_t* buffer = (uint8_t*)fp->uncompressed_block;
+        int copy_length = BGZF_BLOCK_SIZE - fp->block_offset;
+        if (copy_length > remaining) copy_length = remaining;
+        memcpy(buffer + fp->block_offset, input, copy_length);
+        fp->block_offset += copy_length;
+        input += copy_length;
+        remaining -= copy_length;
+        if (fp->block_offset == BGZF_BLOCK_SIZE) {
+            if (lazy_flush(fp) != 0) return -1;
+        }
+    }
+    return length - remaining;
+}
+
+ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length)
+{
+    return hwrite(fp->fp, data, length);
+}
+
+int bgzf_close(BGZF* fp)
+{
+    int ret, block_length;
+    if (fp == 0) return -1;
+    if (fp->is_write && fp->is_compressed) {
+        if (bgzf_flush(fp) != 0) return -1;
+        fp->compress_level = -1;
+        block_length = deflate_block(fp, 0); // write an empty block
+        if (hwrite(fp->fp, fp->compressed_block, block_length) < 0
+            || hflush(fp->fp) != 0) {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+#ifdef BGZF_MT
+        if (fp->mt) mt_destroy(fp->mt);
+#endif
+    }
+    if ( fp->is_gzip )
+    {
+        if (!fp->is_write) (void)inflateEnd(fp->gz_stream);
+        else (void)deflateEnd(fp->gz_stream);
+        free(fp->gz_stream);
+    }
+    ret = hclose(fp->fp);
+    if (ret != 0) return -1;
+    bgzf_index_destroy(fp);
+    free(fp->uncompressed_block);
+    free(fp->compressed_block);
+    free_cache(fp);
+    free(fp);
+    return 0;
+}
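+/* Illustrative round trip (editor's sketch; error handling elided):
+ *
+ *   BGZF *out = bgzf_open("test.gz", "w5");  // BGZF output, zlib level 5
+ *   bgzf_write(out, "hello\n", 6);
+ *   bgzf_close(out);                         // flushes and appends the EOF block
+ *
+ *   char buf[6];
+ *   BGZF *in = bgzf_open("test.gz", "r");
+ *   bgzf_read(in, buf, 6);
+ *   bgzf_close(in);
+ */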
+
+void bgzf_set_cache_size(BGZF *fp, int cache_size)
+{
+    if (fp) fp->cache_size = cache_size;
+}
+
+int bgzf_check_EOF(BGZF *fp)
+{
+    uint8_t buf[28];
+    off_t offset = htell(fp->fp);
+    if (hseek(fp->fp, -28, SEEK_END) < 0) {
+        if (errno == ESPIPE) { hclearerr(fp->fp); return 2; }
+        else return -1;
+    }
+    if ( hread(fp->fp, buf, 28) != 28 ) return -1;
+    if ( hseek(fp->fp, offset, SEEK_SET) < 0 ) return -1;
+    return (memcmp("\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0", buf, 28) == 0)? 1 : 0;
+}
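+/* Editor's note: the 28 bytes compared above form a complete BGZF block with
+ * an empty payload (18-byte header, 2-byte empty deflate stream, 8-byte
+ * footer); bgzf_close() appends exactly this block as the EOF marker. */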
+
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+    int block_offset;
+    int64_t block_address;
+
+    if (fp->is_write || where != SEEK_SET) {
+        fp->errcode |= BGZF_ERR_MISUSE;
+        return -1;
+    }
+    block_offset = pos & 0xFFFF;
+    block_address = pos >> 16;
+    if (hseek(fp->fp, block_address, SEEK_SET) < 0) {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    fp->block_length = 0;  // indicates current block has not been loaded
+    fp->block_address = block_address;
+    fp->block_offset = block_offset;
+    return 0;
+}
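+/* Editor's note: a virtual offset packs (compressed block start << 16) |
+ * (offset within the uncompressed block), so a value saved from bgzf_tell()
+ * can later be replayed:
+ *
+ *   int64_t voffset = bgzf_tell(fp);
+ *   // ... read elsewhere ...
+ *   bgzf_seek(fp, voffset, SEEK_SET);  // back to the saved record
+ */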
+
+int bgzf_is_bgzf(const char *fn)
+{
+    uint8_t buf[16];
+    int n;
+    hFILE *fp;
+    if ((fp = hopen(fn, "r")) == 0) return 0;
+    n = hread(fp, buf, 16);
+    if ( hclose(fp) < 0 ) return -1;
+    if (n != 16) return 0;
+    return memcmp(g_magic, buf, 16) == 0? 1 : 0;
+}
+
+int bgzf_getc(BGZF *fp)
+{
+    int c;
+    if (fp->block_offset >= fp->block_length) {
+        if (bgzf_read_block(fp) != 0) return -2; /* error */
+        if (fp->block_length == 0) return -1; /* end-of-file */
+    }
+    c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
+    if (fp->block_offset == fp->block_length) {
+        fp->block_address = htell(fp->fp);
+        fp->block_offset = 0;
+        fp->block_length = 0;
+    }
+    fp->uncompressed_address++;
+    return c;
+}
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
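+/* Editor's note: kroundup32() rounds a 32-bit value up to the next power of
+ * two in place (powers of two are left unchanged), e.g. x == 5 becomes 8
+ * and x == 8 stays 8. */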
+
+int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
+{
+    int l, state = 0;
+    unsigned char *buf = (unsigned char*)fp->uncompressed_block;
+    str->l = 0;
+    do {
+        if (fp->block_offset >= fp->block_length) {
+            if (bgzf_read_block(fp) != 0) { state = -2; break; }
+            if (fp->block_length == 0) { state = -1; break; }
+        }
+        for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
+        if (l < fp->block_length) state = 1;
+        l -= fp->block_offset;
+        if (str->l + l + 1 >= str->m) {
+            str->m = str->l + l + 2;
+            kroundup32(str->m);
+            str->s = (char*)realloc(str->s, str->m);
+        }
+        memcpy(str->s + str->l, buf + fp->block_offset, l);
+        str->l += l;
+        fp->block_offset += l + 1;
+        if (fp->block_offset >= fp->block_length) {
+            fp->block_address = htell(fp->fp);
+            fp->block_offset = 0;
+            fp->block_length = 0;
+        }
+    } while (state == 0);
+    if (str->l == 0 && state < 0) return state;
+    fp->uncompressed_address += str->l;
+    if ( delim=='\n' && str->l>0 && str->s[str->l-1]=='\r' ) str->l--;
+    str->s[str->l] = 0;
+    return str->l;
+}
+
+void bgzf_index_destroy(BGZF *fp)
+{
+    if ( !fp->idx ) return;
+    free(fp->idx->offs);
+    free(fp->idx);
+    fp->idx = NULL;
+    fp->idx_build_otf = 0;
+}
+
+int bgzf_index_build_init(BGZF *fp)
+{
+    bgzf_index_destroy(fp);
+    fp->idx = (bgzidx_t*) calloc(1,sizeof(bgzidx_t));
+    if ( !fp->idx ) return -1;
+    fp->idx_build_otf = 1;  // build index on the fly
+    return 0;
+}
+
+int bgzf_index_add_block(BGZF *fp)
+{
+    fp->idx->noffs++;
+    if ( fp->idx->noffs > fp->idx->moffs )
+    {
+        fp->idx->moffs = fp->idx->noffs;
+        kroundup32(fp->idx->moffs);
+        fp->idx->offs = (bgzidx1_t*) realloc(fp->idx->offs, fp->idx->moffs*sizeof(bgzidx1_t));
+        if ( !fp->idx->offs ) return -1;
+    }
+    fp->idx->offs[ fp->idx->noffs-1 ].uaddr = fp->idx->ublock_addr;
+    fp->idx->offs[ fp->idx->noffs-1 ].caddr = fp->block_address;
+    return 0;
+}
+
+int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix)
+{
+    if (bgzf_flush(fp) != 0) return -1;
+
+    assert(fp->idx);
+    char *tmp = NULL;
+    if ( suffix )
+    {
+        int blen = strlen(bname);
+        int slen = strlen(suffix);
+        tmp = (char*) malloc(blen + slen + 1);
+        if ( !tmp ) return -1;
+        memcpy(tmp,bname,blen);
+        memcpy(tmp+blen,suffix,slen+1);
+    }
+
+    FILE *idx = fopen(tmp?tmp:bname,"wb");
+    if ( tmp ) free(tmp);
+    if ( !idx ) return -1;
+
+    // Note that the index contains one extra record when indexing files opened
+    // for reading. The terminating record is not present when opened for writing.
+    // This is not a bug.
+
+    int i;
+    if ( fp->is_be )
+    {
+        uint64_t x = fp->idx->noffs - 1;
+        fwrite(ed_swap_8p(&x), 1, sizeof(x), idx);
+        for (i=1; i<fp->idx->noffs; i++)
+        {
+            x = fp->idx->offs[i].caddr; fwrite(ed_swap_8p(&x), 1, sizeof(x), idx);
+            x = fp->idx->offs[i].uaddr; fwrite(ed_swap_8p(&x), 1, sizeof(x), idx);
+        }
+    }
+    else
+    {
+        uint64_t x = fp->idx->noffs - 1;
+        fwrite(&x, 1, sizeof(x), idx);
+        for (i=1; i<fp->idx->noffs; i++)
+        {
+            fwrite(&fp->idx->offs[i].caddr, 1, sizeof(fp->idx->offs[i].caddr), idx);
+            fwrite(&fp->idx->offs[i].uaddr, 1, sizeof(fp->idx->offs[i].uaddr), idx);
+        }
+    }
+    fclose(idx);
+    return 0;
+}
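+/* Editor's note on the layout written above: the index file is a uint64
+ * record count followed by that many (compressed offset, uncompressed
+ * offset) uint64 pairs, all little-endian (hence the ed_swap_8p() calls on
+ * big-endian hosts). */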
+
+
+int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix)
+{
+    char *tmp = NULL;
+    if ( suffix )
+    {
+        int blen = strlen(bname);
+        int slen = strlen(suffix);
+        tmp = (char*) malloc(blen + slen + 1);
+        if ( !tmp ) return -1;
+        memcpy(tmp,bname,blen);
+        memcpy(tmp+blen,suffix,slen+1);
+    }
+
+    FILE *idx = fopen(tmp?tmp:bname,"rb");
+    if ( tmp ) free(tmp);
+    if ( !idx ) return -1;
+
+    fp->idx = (bgzidx_t*) calloc(1,sizeof(bgzidx_t));
+    uint64_t x;
+    if ( fread(&x, 1, sizeof(x), idx) != sizeof(x) ) return -1;
+
+    fp->idx->noffs = fp->idx->moffs = 1 + (fp->is_be ? ed_swap_8(x) : x);
+    fp->idx->offs  = (bgzidx1_t*) malloc(fp->idx->moffs*sizeof(bgzidx1_t));
+    fp->idx->offs[0].caddr = fp->idx->offs[0].uaddr = 0;
+
+    int i;
+    if ( fp->is_be )
+    {
+        int ret = 0;
+        for (i=1; i<fp->idx->noffs; i++)
+        {
+            ret += fread(&x, 1, sizeof(x), idx); fp->idx->offs[i].caddr = ed_swap_8(x);
+            ret += fread(&x, 1, sizeof(x), idx); fp->idx->offs[i].uaddr = ed_swap_8(x);
+        }
+        if ( ret != ((ssize_t)(sizeof(x)*2*(fp->idx->noffs-1))) ) return -1;
+    }
+    else
+    {
+        int ret = 0;
+        for (i=1; i<fp->idx->noffs; i++)
+        {
+            ret += fread(&x, 1, sizeof(x), idx); fp->idx->offs[i].caddr = x;
+            ret += fread(&x, 1, sizeof(x), idx); fp->idx->offs[i].uaddr = x;
+        }
+        if ( ret != ((ssize_t)(sizeof(x)*2*(fp->idx->noffs-1))) ) return -1;
+    }
+    fclose(idx);
+    return 0;
+
+}
+
+/*
+int bgzf_useek(BGZF *fp, long uoffset, int where)
+{
+    if ( !fp->is_compressed )
+    {
+        if (hseek(fp->fp, uoffset, SEEK_SET) < 0)
+        {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+        fp->block_length = 0;  // indicates current block has not been loaded
+        fp->block_address = uoffset;
+        fp->block_offset = 0;
+        bgzf_read_block(fp);
+        fp->uncompressed_address = uoffset;
+        return 0;
+    }
+
+    if ( !fp->idx )
+    {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+
+    // binary search
+    int ilo = 0, ihi = fp->idx->noffs - 1;
+    while ( ilo<=ihi )
+    {
+        int i = (ilo+ihi)*0.5;
+        if ( uoffset < ((intptr_t)fp->idx->offs[i].uaddr) ) ihi = i - 1;
+        else if ( uoffset >= ((intptr_t)fp->idx->offs[i].uaddr) ) ilo = i + 1;
+        else break;
+    }
+    int i = ilo-1;
+    if (hseek(fp->fp, fp->idx->offs[i].caddr, SEEK_SET) < 0)
+    {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    fp->block_length = 0;  // indicates current block has not been loaded
+    fp->block_address = fp->idx->offs[i].caddr;
+    fp->block_offset = 0;
+    if ( bgzf_read_block(fp) < 0 ) return -1;
+    if ( uoffset - fp->idx->offs[i].uaddr > 0 )
+    {
+        fp->block_offset = uoffset - fp->idx->offs[i].uaddr;
+        assert( fp->block_offset <= fp->block_length );     // todo: skipped, unindexed, blocks
+    }
+    fp->uncompressed_address = uoffset;
+    return 0;
+}
+*/
+
+long bgzf_utell(BGZF *fp)
+{
+    return fp->uncompressed_address;    // currently maintained only when reading
+}
+
diff --git a/bgzf.h b/bgzf.h
new file mode 100644
index 0000000..c76e9b4
--- /dev/null
+++ b/bgzf.h
@@ -0,0 +1,323 @@
+/* The MIT License
+
+   Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+                 2011, 2012 Attractive Chaos <attractor at live.co.uk>
+   Copyright (C) 2009, 2013, 2014 Genome Research Ltd
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+/* The BGZF library was originally written by Bob Handsaker from the Broad
+ * Institute. It was later improved by the SAMtools developers. */
+
+#ifndef HTSLIB_BGZF_H
+#define HTSLIB_BGZF_H
+
+#include <stdint.h>
+#include <stdio.h>
+
+// do not use zstd wrapper here
+#ifdef STATIC_ZLIB
+  #include "../zlib-1.2.11/zlib.h"
+#else
+  #include <zlib.h>
+#endif
+
+#include <sys/types.h>
+
+#define BGZF_BLOCK_SIZE     0xff00 // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE
+#define BGZF_MAX_BLOCK_SIZE 0x10000
+
+#define BGZF_ERR_ZLIB   1
+#define BGZF_ERR_HEADER 2
+#define BGZF_ERR_IO     4
+#define BGZF_ERR_MISUSE 8
+
+struct hFILE;
+struct bgzf_mtaux_t;
+typedef struct __bgzidx_t bgzidx_t;
+
+struct BGZF {
+    int errcode:16, is_write:2, is_be:2, compress_level:9, is_compressed:2, is_gzip:1;
+    int cache_size;
+    int block_length, block_offset;
+    int64_t block_address, uncompressed_address;
+    void *uncompressed_block, *compressed_block;
+    void *cache; // a pointer to a hash table
+    struct hFILE *fp; // actual file handle
+    struct bgzf_mtaux_t *mt; // only used for multi-threading
+    bgzidx_t *idx;      // BGZF index
+    int idx_build_otf;  // build index on the fly, set by bgzf_index_build_init()
+    z_stream *gz_stream;// for gzip-compressed files
+};
+#ifndef HTS_BGZF_TYPEDEF
+typedef struct BGZF BGZF;
+#define HTS_BGZF_TYPEDEF
+#endif
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+    size_t l, m;
+    char *s;
+} kstring_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    /******************
+     * Basic routines *
+     ******************/
+
+    /**
+     * Open an existing file descriptor for reading or writing.
+     *
+     * @param fd    file descriptor
+     * @param mode  mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
+     *              writing, 'a' for appending, 'g' for gzip rather than BGZF
+     *              compression (with 'w' only), and a digit specifies the
+     *              zlib compression level.
+     *              Note that there is a distinction between 'u' and '0': the
+     *              former yields plain uncompressed output whereas the latter
+     *              outputs uncompressed data wrapped in the zlib format.
+     * @return      BGZF file handler; 0 on error
+     */
+    BGZF* bgzf_dopen(int fd, const char *mode);
+
+    #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility
+
+    /**
+     * Open the specified file for reading or writing.
+     */
+    BGZF* bgzf_open(const char* path, const char *mode);
+
+    /**
+     * Open an existing hFILE stream for reading or writing.
+     */
+    BGZF* bgzf_hopen(struct hFILE *fp, const char *mode);
+
+    /**
+     * Close the BGZF and free all associated resources.
+     *
+     * @param fp    BGZF file handler
+     * @return      0 on success and -1 on error
+     */
+    int bgzf_close(BGZF *fp);
+
+    /**
+     * Read up to _length_ bytes from the file storing into _data_.
+     *
+     * @param fp     BGZF file handler
+     * @param data   data array to read into
+     * @param length size of data to read
+     * @return       number of bytes actually read; 0 on end-of-file and -1 on error
+     */
+    ssize_t bgzf_read(BGZF *fp, void *data, size_t length);
+
+    /**
+     * Write _length_ bytes from _data_ to the file.  If no I/O errors occur,
+     * the complete _length_ bytes will be written (or queued for writing).
+     *
+     * @param fp     BGZF file handler
+     * @param data   data array to write
+     * @param length size of data to write
+     * @return       number of bytes written (i.e., _length_); negative on error
+     */
+    ssize_t bgzf_write(BGZF *fp, const void *data, size_t length);
+
+    /**
+     * Read up to _length_ bytes directly from the underlying stream without
+     * decompressing.  Bypasses BGZF blocking, so must be used with care in
+     * specialised circumstances only.
+     *
+     * @param fp     BGZF file handler
+     * @param data   data array to read into
+     * @param length number of raw bytes to read
+     * @return       number of bytes actually read; 0 on end-of-file and -1 on error
+     */
+    ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length);
+
+    /**
+     * Write _length_ bytes directly to the underlying stream without
+     * compressing.  Bypasses BGZF blocking, so must be used with care
+     * in specialised circumstances only.
+     *
+     * @param fp     BGZF file handler
+     * @param data   data array to write
+     * @param length number of raw bytes to write
+     * @return       number of bytes actually written; -1 on error
+     */
+    ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length);
+
+    /**
+     * Write the data in the buffer to the file.
+     */
+    int bgzf_flush(BGZF *fp);
+
+    /**
+     * Return a virtual file pointer to the current location in the file.
+     * No interpretation of the value should be made, other than a subsequent
+     * call to bgzf_seek can be used to position the file at the same point.
+     * Return value is non-negative on success.
+     */
+    #define bgzf_tell(fp) (((fp)->block_address << 16) | ((fp)->block_offset & 0xFFFF))
+
+    /**
+     * Set the file to read from the location specified by _pos_.
+     *
+     * @param fp     BGZF file handler
+     * @param pos    virtual file offset returned by bgzf_tell()
+     * @param whence must be SEEK_SET
+     * @return       0 on success and -1 on error
+     */
+    int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence);
+
+    /**
+     * Check if the BGZF end-of-file (EOF) marker is present
+     *
+     * @param fp    BGZF file handler opened for reading
+     * @return      1 if the EOF marker is present and correct;
+     *              2 if it can't be checked, e.g., because fp isn't seekable;
+     *              0 if the EOF marker is absent;
+     *              -1 (with errno set) on error
+     */
+    int bgzf_check_EOF(BGZF *fp);
+
+    /**
+     * Check if a file is in the BGZF format
+     *
+     * @param fn    file name
+     * @return      1 if _fn_ is BGZF; 0 if not or on I/O error
+     */
+     int bgzf_is_bgzf(const char *fn);
+
+    /*********************
+     * Advanced routines *
+     *********************/
+
+    /**
+     * Set the cache size. Only effective when compiled with -DBGZF_CACHE.
+     *
+     * @param fp    BGZF file handler
+     * @param size  size of cache in bytes; 0 to disable caching (default)
+     */
+    void bgzf_set_cache_size(BGZF *fp, int size);
+
+    /**
+     * Flush the file if the remaining buffer size is smaller than _size_
+     * @return      0 if flushing succeeded or was not needed; negative on error
+     */
+    int bgzf_flush_try(BGZF *fp, ssize_t size);
+
+    /**
+     * Read one byte from a BGZF file. It is faster than bgzf_read()
+     * @param fp     BGZF file handler
+     * @return       byte read; -1 on end-of-file; -2 on error
+     */
+    int bgzf_getc(BGZF *fp);
+
+    /**
+     * Read one line from a BGZF file. It is faster than bgzf_getc()
+     *
+     * @param fp     BGZF file handler
+     * @param delim  delimiter
+     * @param str    string to write to; must be initialized
+     * @return       length of the string; 0 on end-of-file; negative on error
+     */
+    int bgzf_getline(BGZF *fp, int delim, kstring_t *str);
+
+    /**
+     * Read the next BGZF block.
+     */
+    int bgzf_read_block(BGZF *fp);
+
+    /**
+     * Enable multi-threading (only effective on writing and when the
+     * library was compiled with -DBGZF_MT)
+     * Modified to use bigstack for main allocations.
+     *
+     * @param fp          BGZF file handler; must be opened for writing
+     * @param n_threads   #threads used for writing
+     * @param n_sub_blks  #blocks processed by each thread; a value of 64-256 is recommended
+     */
+    int bgzf_mt2(unsigned char* arena_top, int n_threads, int n_sub_blks, unsigned char** arena_bottom_ptr, BGZF* fp);
+
+
+    /*******************
+     * bgzidx routines *
+     *******************/
+
+    /**
+     *  Position BGZF at the uncompressed offset
+     *
+     *  @param fp           BGZF file handler; must be opened for reading
+     *  @param uoffset      file offset in the uncompressed data
+     *  @param where        only SEEK_SET is supported at present
+     *
+     *  Returns 0 on success and -1 on error.
+     */
+    // int bgzf_useek(BGZF *fp, long uoffset, int where);
+
+    /**
+     *  Position in uncompressed BGZF
+     *
+     *  @param fp           BGZF file handler; must be opened for reading
+     *
+     *  Returns the current offset on success and -1 on error.
+     */
+    long bgzf_utell(BGZF *fp);
+
+    /**
+     * Tell BGZF to build index while compressing.
+     *
+     * @param fp          BGZF file handler; can be opened for reading or writing.
+     *
+     * Returns 0 on success and -1 on error.
+     */
+    int bgzf_index_build_init(BGZF *fp);
+
+    /**
+     * Load BGZF index
+     *
+     * @param fp          BGZF file handler
+     * @param bname       base name
+     * @param suffix      suffix to add to bname (can be NULL)
+     *
+     * Returns 0 on success and -1 on error.
+     */
+    int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix);
+
+    /**
+     * Save BGZF index
+     *
+     * @param fp          BGZF file handler
+     * @param bname       base name
+     * @param suffix      suffix to add to bname (can be NULL)
+     *
+     * Returns 0 on success and -1 on error.
+     */
+    int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/build_dynamic/Makefile b/build_dynamic/Makefile
new file mode 100644
index 0000000..ff593ee
--- /dev/null
+++ b/build_dynamic/Makefile
@@ -0,0 +1,128 @@
+# Linux/OS X Makefile for PLINK 2.00.
+#
+# Compilation options (leave blank after "=" to disable, put "= 1" to enable):
+#   Do not use SSE 4.2 instructions: NO_SSE42
+#   Do not link to LAPACK: NO_LAPACK
+#   Use cblas_f77 instead of cblas: FORCE_CBLAS_F77
+#   Use only -O2 optimization for zstd (may be necessary for gcc 4.x): ZSTD_O2
+#   Statically link zlib: STATIC_ZLIB
+#   Link to MKL with 64-bit indexes (dynamically): DYNAMIC_MKL
+#     (this also requires MKLROOT and MKL_IOMP5_DIR to be defined, and
+#     LD_LIBRARY_PATH to include the appropriate directories)
+#   32-bit binary (also sets STATIC_ZLIB, ZSTD_O2, and NO_SSE42): FORCE_32BIT
+#     (warning: you may need to add a zconf.h symlink to make that work)
+NO_SSE42 =
+NO_LAPACK =
+FORCE_CBLAS_F77 =
+ZSTD_O2 = 1
+STATIC_ZLIB =
+DYNAMIC_MKL =
+MKLROOT = /home/ubuntu/intel/mkl
+MKL_IOMP5_DIR = /home/ubuntu/intel/compilers_and_libraries_2017.2.174/linux/compiler/lib/intel64
+FORCE_32BIT =
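+
+# Example (editor's note): the options above can also be set on the make
+# command line, which overrides the assignments in this file, e.g.:
+#   make NO_LAPACK=1 STATIC_ZLIB=1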
+
+BASEFLAGS=-Wall -Wextra
+# ***** end configuration *****
+
+LINKFLAGS=-lm -lpthread
+ZLIB=
+ARCH32=
+
+ifdef FORCE_32BIT
+  # this is targeted at Scientific Linux 6.
+  NO_SSE42 = 1
+  STATIC_ZLIB = 1
+  ZSTD_O2 = 1
+  ARCH32 = -m32 -march=i686
+  CXXFLAGS = -std=c++0x
+else
+  CXXFLAGS = -std=c++11
+endif
+BASEFLAGS += ${ARCH32}
+
+CFLAGS=-O2 -std=gnu99
+# zstd appears to be seriously targeted at -O3; see 26 Jul 2016 entry at
+# cbloom.com/rants.html
+ifdef ZSTD_O2
+  ZCFLAGS=-O2 -std=gnu99
+else
+  ZCFLAGS=-O3 -std=gnu99
+endif
+# this actually needs to be named "CXXFLAGS"
+CXXFLAGS += -O2
+
+ifndef NO_SSE42
+  BASEFLAGS += -msse4.2
+endif
+
+ifdef FORCE_CBLAS_F77
+  BASEFLAGS += -DFORCE_CBLAS_F77
+  BLASFLAGS=-llapack -lf77blas -latlas
+else
+  BLASFLAGS=-llapack -lblas -lcblas -latlas
+endif
+
+ifdef STATIC_ZLIB
+  BASEFLAGS += -DSTATIC_ZLIB
+  LINKFLAGS += -L. ../../zlib-1.2.11/libz.a
+else
+  LINKFLAGS += -lz
+endif
+
+UNAME := $(shell uname)
+ifeq ($(UNAME), Darwin)
+  ifdef FORCE_32BIT
+    $(error 32-bit OS X builds are not supported)
+  endif
+  ifdef DYNAMIC_MKL
+    $(error MKL is not currently supported on OS X)
+  endif
+  BLASFLAGS=-framework Accelerate
+else
+  ifdef DYNAMIC_MKL
+    ifdef NO_LAPACK
+      $(error DYNAMIC_MKL and NO_LAPACK conflict)
+    endif
+    ifdef FORCE_32BIT
+      $(error DYNAMIC_MKL + FORCE_32BIT not supported)
+    endif
+    BASEFLAGS += -DDYNAMIC_MKL -DLAPACK_ILP64 -I${MKLROOT}/include
+    BLASFLAGS = -L${MKLROOT}/lib/intel64 -L${MKL_IOMP5_DIR} -Wl,--no-as-needed -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core -liomp5
+    LINKFLAGS += -ldl
+  endif
+endif
+
+ifdef NO_LAPACK
+  BASEFLAGS += -DNOLAPACK
+  BLASFLAGS=
+endif
+
+ZSTD_INCLUDE = -I../zstd/lib -I../zstd/lib/common -I../zstd/zlibWrapper
+ZCFLAGS += ${ZSTD_INCLUDE}
+
+CFLAGS += ${BASEFLAGS}
+ZCFLAGS += ${BASEFLAGS}
+CXXFLAGS += ${BASEFLAGS}
+
+CSRC = ../SFMT.c ../hfile.c ../bgzf.c
+ZCSRC = ../zstd/zlibWrapper/zstd_zlibwrapper.c ../zstd/zlibWrapper/gzclose.c ../zstd/zlibWrapper/gzlib.c ../zstd/zlibWrapper/gzread.c ../zstd/zlibWrapper/gzwrite.c ../zstd/lib/common/entropy_common.c ../zstd/lib/common/zstd_common.c ../zstd/lib/common/error_private.c ../zstd/lib/common/xxhash.c ../zstd/lib/common/fse_decompress.c ../zstd/lib/compress/fse_compress.c ../zstd/lib/compress/huf_compress.c ../zstd/lib/compress/zstd_compress.c ../zstd/lib/decompress/huf_decompress.c ../zstd/lib [...]
+CPPSRC = ../pgenlib_internal.cpp ../plink2.cpp ../plink2_adjust.cpp ../plink2_common.cpp ../plink2_compress_stream.cpp ../plink2_data.cpp ../plink2_decompress.cpp ../plink2_filter.cpp ../plink2_glm.cpp ../plink2_help.cpp ../plink2_ld.cpp ../plink2_matrix.cpp ../plink2_matrix_calc.cpp ../plink2_misc.cpp ../plink2_psam.cpp ../plink2_pvar.cpp ../plink2_random.cpp ../plink2_set.cpp ../plink2_stats.cpp
+
+OBJ = SFMT.o hfile.o bgzf.o zstd_zlibwrapper.o gzclose.o gzlib.o gzread.o gzwrite.o entropy_common.o zstd_common.o error_private.o xxhash.o fse_decompress.o fse_compress.o huf_compress.o zstd_compress.o huf_decompress.o zstd_decompress.o pgenlib_internal.o plink2.o plink2_adjust.o plink2_common.o plink2_compress_stream.o plink2_data.o plink2_decompress.o plink2_filter.o plink2_glm.o plink2_help.o plink2_ld.o plink2_matrix.o plink2_matrix_calc.o plink2_misc.o plink2_psam.o plink2_pvar.o p [...]
+
+all: plink2 pgen_compress
+
+plink2: $(CSRC) $(ZCSRC) $(CPPSRC)
+	gcc $(CFLAGS) $(CSRC) -c
+	gcc $(ZCFLAGS) $(ZCSRC) -c
+	g++ $(CXXFLAGS) $(CPPSRC) -c
+	g++ $(OBJ) $(ARCH32) -o plink2 $(BLASFLAGS) $(LINKFLAGS)
+
+pgen_compress: ../pgenlib_internal.cpp ../pgen_compress.cpp
+	g++ $(CXXFLAGS) ../pgenlib_internal.cpp ../pgen_compress.cpp -o pgen_compress
+
+.PHONY: clean
+clean:
+	rm -f *.o
+	rm -f plink2
+	rm -f pgen_compress
diff --git a/build_win/Makefile b/build_win/Makefile
new file mode 100644
index 0000000..84a0c76
--- /dev/null
+++ b/build_win/Makefile
@@ -0,0 +1,70 @@
+# MinGW/MinGW-w64 Makefile for PLINK 2.00.
+#
+# Compilation options (leave blank after "=" to disable, put "= 1" to enable):
+#   Do not use SSE 4.2 instructions: NO_SSE42
+#   Do not link to OpenBLAS: NO_OPENBLAS
+#   Use only -O2 optimization for zstd: ZSTD_O2
+NO_SSE42 = 1
+NO_OPENBLAS =
+ZSTD_O2 = 1
+
+OPENBLAS_ROOT = ../../openblas
+ZLIB_STATIC = ../../zlib-1.2.11/libz.a
+
+BASEFLAGS=-Wall -Wextra
+# ***** end configuration *****
+
+BASEFLAGS += -DSTATIC_ZLIB -fno-exceptions
+LINKFLAGS=-lm -static-libgcc -L. ${ZLIB_STATIC}
+ifdef NO_OPENBLAS
+  BASEFLAGS += -DNOLAPACK
+  BLASFLAGS=
+else
+  BASEFLAGS += -DUSE_OPENBLAS
+  BLASFLAGS=-Wl,-Bstatic -L. ${OPENBLAS_ROOT}/lib/libopenblas.a
+endif
+
+CFLAGS=-O2 -std=gnu99
+CXXFLAGS=-O2 -std=gnu++11
+
+ifndef NO_SSE42
+  BASEFLAGS += -msse4.2
+endif
+
+ifdef ZSTD_O2
+  ZCFLAGS=-O2 -std=gnu99
+else
+  ZCFLAGS=-O3 -std=gnu99
+endif
+
+BASEFLAGS += -I${OPENBLAS_ROOT}/include
+
+CFLAGS += ${BASEFLAGS}
+ZCFLAGS += ${BASEFLAGS}
+CXXFLAGS += ${BASEFLAGS}
+
+ZSTD_INCLUDE = -I../zstd/lib -I../zstd/lib/common -I../zstd/zlibWrapper
+ZCFLAGS += ${ZSTD_INCLUDE}
+
+CSRC = ../SFMT.c ../hfile.c ../bgzf.c
+ZCSRC = ../zstd/zlibWrapper/zstd_zlibwrapper.c ../zstd/zlibWrapper/gzclose.c ../zstd/zlibWrapper/gzlib.c ../zstd/zlibWrapper/gzread.c ../zstd/zlibWrapper/gzwrite.c ../zstd/lib/common/entropy_common.c ../zstd/lib/common/zstd_common.c ../zstd/lib/common/error_private.c ../zstd/lib/common/xxhash.c ../zstd/lib/common/fse_decompress.c ../zstd/lib/compress/fse_compress.c ../zstd/lib/compress/huf_compress.c ../zstd/lib/compress/zstd_compress.c ../zstd/lib/decompress/huf_decompress.c ../zstd/lib [...]
+CPPSRC = ../pgenlib_internal.cpp ../plink2.cpp ../plink2_adjust.cpp ../plink2_common.cpp ../plink2_compress_stream.cpp ../plink2_data.cpp ../plink2_decompress.cpp ../plink2_filter.cpp ../plink2_glm.cpp ../plink2_help.cpp ../plink2_ld.cpp ../plink2_matrix.cpp ../plink2_matrix_calc.cpp ../plink2_misc.cpp ../plink2_psam.cpp ../plink2_pvar.cpp ../plink2_random.cpp ../plink2_set.cpp ../plink2_stats.cpp
+
+OBJ = SFMT.o hfile.o bgzf.o zstd_zlibwrapper.o gzclose.o gzlib.o gzread.o gzwrite.o entropy_common.o zstd_common.o error_private.o xxhash.o fse_decompress.o fse_compress.o huf_compress.o zstd_compress.o huf_decompress.o zstd_decompress.o pgenlib_internal.o plink2.o plink2_adjust.o plink2_common.o plink2_compress_stream.o plink2_data.o plink2_decompress.o plink2_filter.o plink2_glm.o plink2_help.o plink2_ld.o plink2_matrix.o plink2_matrix_calc.o plink2_misc.o plink2_psam.o plink2_pvar.o p [...]
+
+all: plink2 pgen_compress
+
+plink2: $(CSRC) $(ZCSRC) $(CPPSRC)
+	gcc $(CFLAGS) $(CSRC) -c
+	gcc $(ZCFLAGS) $(ZCSRC) -c
+	g++ $(CXXFLAGS) $(CPPSRC) -c
+	gfortran $(OBJ) -o plink2 $(BLASFLAGS) $(LINKFLAGS)
+
+pgen_compress: ../pgenlib_internal.cpp ../pgen_compress.cpp
+	g++ $(CXXFLAGS) ../pgenlib_internal.cpp ../pgen_compress.cpp -o pgen_compress
+
+.PHONY: clean
+clean:
+	rm -f *.o
+	rm -f plink2.exe
+	rm -f pgen_compress.exe
diff --git a/hfile.c b/hfile.c
new file mode 100644
index 0000000..33b0532
--- /dev/null
+++ b/hfile.c
@@ -0,0 +1,584 @@
+/*  hfile.c -- buffered low-level input/output streams.
+
+    Copyright (C) 2013-2015 Genome Research Ltd.
+
+    Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#define _FILE_OFFSET_BITS 64
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <pthread.h>
+
+#include "hfile.h"
+#include "hfile_internal.h"
+
+/* hFILE fields are used as follows:
+
+   char *buffer;     // Pointer to the start of the I/O buffer
+   char *begin;      // First not-yet-read character / unused position
+   char *end;        // First unfilled/unfillable position
+   char *limit;      // Pointer to the first position past the buffer
+
+   const hFILE_backend *backend;  // Methods to refill/flush I/O buffer
+
+   off_t offset;     // Offset within the stream of buffer position 0
+   int at_eof:1;     // For reading, whether EOF has been seen
+   int has_errno;    // Error number from the last failure on this stream
+
+For reading, begin is the first unread character in the buffer and end is the
+first unfilled position:
+
+   -----------ABCDEFGHIJKLMNO---------------
+   ^buffer    ^begin         ^end           ^limit
+
+For writing, begin is the first unused position and end is unused so remains
+equal to buffer:
+
+   ABCDEFGHIJKLMNOPQRSTUVWXYZ---------------
+   ^buffer                   ^begin         ^limit
+   ^end
+
+Thus if begin > end then there is a non-empty write buffer, if begin < end
+then there is a non-empty read buffer, and if begin == end then both buffers
+are empty.  In all cases, the stream's file position indicator corresponds
+to the position pointed to by begin.  */
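+
+/* Equivalently, as invariants: a reading stream maintains
+       buffer <= begin <= end <= limit
+   while a writing stream maintains
+       end == buffer && buffer <= begin <= limit.  */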
+
+hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity)
+{
+    hFILE *fp = (hFILE *) malloc(struct_size);
+    if (fp == NULL) goto error;
+
+    if (capacity == 0) capacity = 32768;
+    // FIXME For now, clamp input buffer sizes so mpileup doesn't eat memory
+    if (strchr(mode, 'r') && capacity > 32768) capacity = 32768;
+
+    fp->buffer = (char *) malloc(capacity);
+    if (fp->buffer == NULL) goto error;
+
+    fp->begin = fp->end = fp->buffer;
+    fp->limit = &fp->buffer[capacity];
+
+    fp->offset = 0;
+    fp->at_eof = 0;
+    fp->has_errno = 0;
+    return fp;
+
+error:
+    hfile_destroy(fp);
+    return NULL;
+}
+
+void hfile_destroy(hFILE *fp)
+{
+    int save = errno;
+    if (fp) free(fp->buffer);
+    free(fp);
+    errno = save;
+}
+
+static inline int writebuffer_is_nonempty(hFILE *fp)
+{
+    return fp->begin > fp->end;
+}
+
+/* Refills the read buffer from the backend (once, so may only partially
+   fill the buffer), returning the number of additional characters read
+   (which might be 0), or negative when an error occurred.  */
+static ssize_t refill_buffer(hFILE *fp)
+{
+    ssize_t n;
+
+    // Move any unread characters to the start of the buffer
+    if (fp->begin > fp->buffer) {
+        fp->offset += fp->begin - fp->buffer;
+        memmove(fp->buffer, fp->begin, fp->end - fp->begin);
+        fp->end = &fp->buffer[fp->end - fp->begin];
+        fp->begin = fp->buffer;
+    }
+
+    // Read into the available buffer space at fp->[end,limit)
+    if (fp->at_eof || fp->end == fp->limit) n = 0;
+    else {
+        n = fp->backend->read(fp, fp->end, fp->limit - fp->end);
+        if (n < 0) { fp->has_errno = errno; return n; }
+        else if (n == 0) fp->at_eof = 1;
+    }
+
+    fp->end += n;
+    return n;
+}
+
+/* Called only from hgetc(), when our buffer is empty.  */
+int hgetc2(hFILE *fp)
+{
+    return (refill_buffer(fp) > 0)? (unsigned char) *(fp->begin++) : EOF;
+}
+
+ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
+{
+    size_t n = fp->end - fp->begin;
+    while (n < nbytes) {
+        ssize_t ret = refill_buffer(fp);
+        if (ret < 0) return ret;
+        else if (ret == 0) break;
+        else n += ret;
+    }
+
+    if (n > nbytes) n = nbytes;
+    memcpy(buffer, fp->begin, n);
+    return n;
+}
+
+/* Called only from hread(); when called, our buffer is empty and nread bytes
+   have already been placed in the destination buffer.  */
+ssize_t hread2(hFILE *fp, void *destv, size_t nbytes, size_t nread)
+{
+    const size_t capacity = fp->limit - fp->buffer;
+    char *dest = (char *) destv;
+    dest += nread, nbytes -= nread;
+
+    // Read large requests directly into the destination buffer
+    while (nbytes * 2 >= capacity && !fp->at_eof) {
+        ssize_t n = fp->backend->read(fp, dest, nbytes);
+        if (n < 0) { fp->has_errno = errno; return n; }
+        else if (n == 0) fp->at_eof = 1;
+        fp->offset += n;
+        dest += n, nbytes -= n;
+        nread += n;
+    }
+
+    while (nbytes > 0 && !fp->at_eof) {
+        size_t n;
+        ssize_t ret = refill_buffer(fp);
+        if (ret < 0) return ret;
+
+        n = fp->end - fp->begin;
+        if (n > nbytes) n = nbytes;
+        memcpy(dest, fp->begin, n);
+        fp->begin += n;
+        dest += n, nbytes -= n;
+        nread += n;
+    }
+
+    return nread;
+}
+
+/* Flushes the write buffer, fp->[buffer,begin), out through the backend
+   returning 0 on success or negative if an error occurred.  */
+static ssize_t flush_buffer(hFILE *fp)
+{
+    const char *buffer = fp->buffer;
+    while (buffer < fp->begin) {
+        ssize_t n = fp->backend->write(fp, buffer, fp->begin - buffer);
+        if (n < 0) { fp->has_errno = errno; return n; }
+        buffer += n;
+        fp->offset += n;
+    }
+
+    fp->begin = fp->buffer;  // Leave the buffer empty
+    return 0;
+}
+
+int hflush(hFILE *fp)
+{
+    if (flush_buffer(fp) < 0) return EOF;
+    if (fp->backend->flush(fp) < 0) { fp->has_errno = errno; return EOF; }
+    return 0;
+}
+
+/* Called only from hputc(), when our buffer is already full.  */
+int hputc2(int c, hFILE *fp)
+{
+    if (flush_buffer(fp) < 0) return EOF;
+    *(fp->begin++) = c;
+    return c;
+}
+
+/* Called only from hwrite() and hputs2(); when called, our buffer is full and
+   ncopied bytes from the source have already been copied to our buffer.  */
+ssize_t hwrite2(hFILE *fp, const void *srcv, size_t totalbytes, size_t ncopied)
+{
+    const char *src = (const char *) srcv;
+    ssize_t ret;
+    const size_t capacity = fp->limit - fp->buffer;
+    size_t remaining = totalbytes - ncopied;
+    src += ncopied;
+
+    ret = flush_buffer(fp);
+    if (ret < 0) return ret;
+
+    // Write large blocks out directly from the source buffer
+    while (remaining * 2 >= capacity) {
+        ssize_t n = fp->backend->write(fp, src, remaining);
+        if (n < 0) { fp->has_errno = errno; return n; }
+        fp->offset += n;
+        src += n, remaining -= n;
+    }
+
+    // Just buffer any remaining characters
+    memcpy(fp->begin, src, remaining);
+    fp->begin += remaining;
+
+    return totalbytes;
+}
+
+/* Called only from hputs(), when our buffer is already full.  */
+int hputs2(const char *text, size_t totalbytes, size_t ncopied, hFILE *fp)
+{
+    return (hwrite2(fp, text, totalbytes, ncopied) >= 0)? 0 : EOF;
+}
+
+off_t hseek(hFILE *fp, off_t offset, int whence)
+{
+    off_t pos;
+
+    if (writebuffer_is_nonempty(fp)) {
+        int ret = flush_buffer(fp);
+        if (ret < 0) return ret;
+    }
+    else {
+        // Convert relative offsets from being relative to the hFILE's stream
+        // position (at begin) to being relative to the backend's physical
+        // stream position (at end, due to the buffering read-ahead).
+        if (whence == SEEK_CUR) offset -= fp->end - fp->begin;
+    }
+
+    pos = fp->backend->seek(fp, offset, whence);
+    if (pos < 0) { fp->has_errno = errno; return pos; }
+
+    // Seeking succeeded, so discard any non-empty read buffer
+    fp->begin = fp->end = fp->buffer;
+    fp->at_eof = 0;
+
+    fp->offset = pos;
+    return pos;
+}
+
+int hclose(hFILE *fp)
+{
+    int err = fp->has_errno;
+
+    if (writebuffer_is_nonempty(fp) && hflush(fp) < 0) err = fp->has_errno;
+    if (fp->backend->close(fp) < 0) err = errno;
+    hfile_destroy(fp);
+
+    if (err) {
+        errno = err;
+        return EOF;
+    }
+    else return 0;
+}
+
+void hclose_abruptly(hFILE *fp)
+{
+    int save = errno;
+    if (fp->backend->close(fp) < 0) { /* Ignore subsequent errors */ }
+    hfile_destroy(fp);
+    errno = save;
+}
+
+
+/***************************
+ * File descriptor backend *
+ ***************************/
+
+// #include <sys/socket.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+// #ifdef _WIN32
+// #define HAVE_CLOSESOCKET
+// #endif
+
+/* For Unix, it doesn't matter whether a file descriptor is a socket.
+   However Windows insists on send()/recv() and its own closesocket()
+   being used when fd happens to be a socket.  */
+
+typedef struct {
+    hFILE base;
+    int fd;
+  // int is_socket:1;
+} hFILE_fd;
+
+static ssize_t fd_read(hFILE *fpv, void *buffer, size_t nbytes)
+{
+    hFILE_fd *fp = (hFILE_fd *) fpv;
+    ssize_t n;
+    do {
+      /*
+        n = fp->is_socket? recv(fp->fd, buffer, nbytes, 0)
+                         : read(fp->fd, buffer, nbytes);
+      */
+        n = read(fp->fd, buffer, nbytes);
+    } while (n < 0 && errno == EINTR);
+    return n;
+}
+
+static ssize_t fd_write(hFILE *fpv, const void *buffer, size_t nbytes)
+{
+    hFILE_fd *fp = (hFILE_fd *) fpv;
+    ssize_t n;
+    do {
+      /*
+        n = fp->is_socket?  send(fp->fd, buffer, nbytes, 0)
+                         : write(fp->fd, buffer, nbytes);
+      */
+        n = write(fp->fd, buffer, nbytes);
+    } while (n < 0 && errno == EINTR);
+    return n;
+}
+
+static off_t fd_seek(hFILE *fpv, off_t offset, int whence)
+{
+    hFILE_fd *fp = (hFILE_fd *) fpv;
+    return lseek(fp->fd, offset, whence);
+}
+
+static int fd_flush(hFILE *fpv)
+{
+    hFILE_fd *fp = (hFILE_fd *) fpv;
+#ifdef _WIN32
+    // See the patch at
+    // https://lists.gnu.org/archive/html/bug-gnulib/2008-10/msg00004.html .
+    HANDLE hh = (HANDLE)_get_osfhandle(fp->fd);
+    DWORD err;
+    if (hh == INVALID_HANDLE_VALUE) {
+        errno = EBADF;
+        return -1;
+    }
+    if (!FlushFileBuffers(hh)) {
+        err = GetLastError();
+        switch (err) {
+        case ERROR_INVALID_HANDLE:
+            errno = EINVAL;
+            break;
+        default:
+            errno = EIO;
+        }
+        return -1;
+    }
+    return 0;
+#else
+    int ret;
+    do {
+        ret = fsync(fp->fd);
+        // Ignore invalid-for-fsync(2) errors due to being, e.g., a pipe,
+        // and operation-not-supported errors (Mac OS X)
+        if (ret < 0 && (errno == EINVAL || errno == ENOTSUP)) ret = 0;
+    } while (ret < 0 && errno == EINTR);
+    return ret;
+#endif
+}
+
+static int fd_close(hFILE *fpv)
+{
+    hFILE_fd *fp = (hFILE_fd *) fpv;
+    int ret;
+    do {
+#ifdef HAVE_CLOSESOCKET
+        ret = fp->is_socket? closesocket(fp->fd) : close(fp->fd);
+#else
+        ret = close(fp->fd);
+#endif
+    } while (ret < 0 && errno == EINTR);
+    return ret;
+}
+
+static const struct hFILE_backend fd_backend =
+{
+    fd_read, fd_write, fd_seek, fd_flush, fd_close
+};
+
+static size_t blksize(int fd)
+{
+    struct stat sbuf;
+    if (fstat(fd, &sbuf) != 0) return 0;
+#ifdef _WIN32
+    return 512;
+#else
+    return sbuf.st_blksize;
+#endif
+}
+
+static hFILE *hopen_fd(const char *filename, const char *mode)
+{
+    hFILE_fd *fp = NULL;
+    int fd = open(filename, hfile_oflags(mode), 0666);
+    if (fd < 0) goto error;
+
+    fp = (hFILE_fd *) hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
+    if (fp == NULL) goto error;
+
+    fp->fd = fd;
+    // fp->is_socket = 0;
+    fp->base.backend = &fd_backend;
+    return &fp->base;
+
+error:
+    if (fd >= 0) { int save = errno; (void) close(fd); errno = save; }
+    hfile_destroy((hFILE *) fp);
+    return NULL;
+}
+
+hFILE *hdopen(int fd, const char *mode)
+{
+    hFILE_fd *fp = (hFILE_fd*) hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
+    if (fp == NULL) return NULL;
+
+    fp->fd = fd;
+    // fp->is_socket = (strchr(mode, 's') != NULL);
+    fp->base.backend = &fd_backend;
+    return &fp->base;
+}
+
+static hFILE *hopen_fd_stdinout(const char *mode)
+{
+    int fd = (strchr(mode, 'r') != NULL)? STDIN_FILENO : STDOUT_FILENO;
+    // TODO Set binary mode (for Windows)
+    return hdopen(fd, mode);
+}
+
+int hfile_oflags(const char *mode)
+{
+    int rdwr = 0, flags = 0;
+    const char *s;
+    for (s = mode; *s; s++)
+        switch (*s) {
+        case 'r': rdwr = O_RDONLY;  break;
+        case 'w': rdwr = O_WRONLY; flags |= O_CREAT | O_TRUNC;  break;
+        case 'a': rdwr = O_WRONLY; flags |= O_CREAT | O_APPEND;  break;
+        case '+': rdwr = O_RDWR;  break;
+        default:  break;
+        }
+
+#ifdef O_BINARY
+    flags |= O_BINARY;
+#endif
+
+    return rdwr | flags;
+}
+
+
+/*********************
+ * In-memory backend *
+ *********************/
+
+typedef struct {
+    hFILE base;
+    const char *buffer;
+    size_t length, pos;
+} hFILE_mem;
+
+/*
+static ssize_t mem_read(hFILE *fpv, void *buffer, size_t nbytes)
+{
+    hFILE_mem *fp = (hFILE_mem *) fpv;
+    size_t avail = fp->length - fp->pos;
+    if (nbytes > avail) nbytes = avail;
+    memcpy(buffer, fp->buffer + fp->pos, nbytes);
+    fp->pos += nbytes;
+    return nbytes;
+}
+
+static off_t mem_seek(hFILE *fpv, off_t offset, int whence)
+{
+    hFILE_mem *fp = (hFILE_mem *) fpv;
+    size_t absoffset = (offset >= 0)? offset : -offset;
+    size_t origin;
+
+    switch (whence) {
+    case SEEK_SET: origin = 0; break;
+    case SEEK_CUR: origin = fp->pos; break;
+    case SEEK_END: origin = fp->length; break;
+    default: errno = EINVAL; return -1;
+    }
+
+    if ((offset  < 0 && absoffset > origin) ||
+        (offset >= 0 && absoffset > fp->length - origin)) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    fp->pos = origin + offset;
+    return fp->pos;
+}
+
+static int mem_close(hFILE *fpv)
+{
+    return 0;
+}
+
+static const struct hFILE_backend mem_backend =
+{
+    mem_read, NULL, mem_seek, NULL, mem_close
+};
+
+static hFILE *hopen_mem(const char *data, const char *mode)
+{
+    // TODO Implement write modes, which will require memory allocation
+    if (strchr(mode, 'r') == NULL) { errno = EINVAL; return NULL; }
+
+    hFILE_mem *fp = (hFILE_mem *) hfile_init(sizeof (hFILE_mem), mode, 0);
+    if (fp == NULL) return NULL;
+
+    fp->buffer = data;
+    fp->length = strlen(data);
+    fp->pos = 0;
+    fp->base.backend = &mem_backend;
+    return &fp->base;
+}
+*/
+
+
+/******************************
+ * hopen() backend dispatcher *
+ ******************************/
+
+hFILE *hopen(const char *fname, const char *mode)
+{
+  // if (strncmp(fname, "http://", 7) == 0 ||
+  //      strncmp(fname, "ftp://", 6) == 0) return hopen_net(fname, mode);
+#ifdef HAVE_IRODS
+  // else if (strncmp(fname, "irods:", 6) == 0) return hopen_irods(fname, mode);
+#endif
+  // else if (strncmp(fname, "data:", 5) == 0) return hopen_mem(fname + 5, mode);
+    if (strcmp(fname, "-") == 0) return hopen_fd_stdinout(mode);
+    else return hopen_fd(fname, mode);
+}
+
+/*
+int hisremote(const char *fname)
+{
+    // FIXME Make a new backend entry to return this
+    if (strncmp(fname, "http://", 7) == 0 ||
+        strncmp(fname, "https://", 8) == 0 ||
+        strncmp(fname, "ftp://", 6) == 0) return 1;
+#ifdef HAVE_IRODS
+    else if (strncmp(fname, "irods:", 6) == 0) return 1;
+#endif
+    else return 0;
+}
+*/
diff --git a/hfile.h b/hfile.h
new file mode 100644
index 0000000..63141eb
--- /dev/null
+++ b/hfile.h
@@ -0,0 +1,216 @@
+/*  hfile.h -- buffered low-level input/output streams.
+
+    Copyright (C) 2013-2015 Genome Research Ltd.
+
+    Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#ifndef HTSLIB_HFILE_H
+#define HTSLIB_HFILE_H
+
+#include <string.h>
+
+#include <sys/types.h>
+
+#include "hts_defs.h"
+
+#ifdef _WIN32
+  #include <windows.h>
+#endif
+
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
+
+/* These fields are declared here solely for the benefit of the inline functions
+   below.  They may change in future releases.  User code should not use them
+   directly; you should imagine that hFILE is an opaque incomplete type.  */
+struct hFILE_backend;
+typedef struct hFILE {
+    char *buffer, *begin, *end, *limit;
+    const struct hFILE_backend *backend;
+    off_t offset;
+    int at_eof:1;
+    int has_errno;
+} hFILE;
+
+/*!
+  @abstract  Open the named file or URL as a stream
+  @return    An hFILE pointer, or NULL (with errno set) if an error occurred.
+*/
+hFILE *hopen(const char *filename, const char *mode) HTS_RESULT_USED;
+
+/*!
+  @abstract  Associate a stream with an existing open file descriptor
+  @return    An hFILE pointer, or NULL (with errno set) if an error occurred.
+  @notes     For socket descriptors (on Windows), mode should contain 's'.
+*/
+hFILE *hdopen(int fd, const char *mode) HTS_RESULT_USED;
+
+/*!
+  @abstract  Report whether the file name or URL denotes remote storage
+  @return    0 if local, 1 if remote.
+  @notes     "Remote" means involving e.g. explicit network access, with the
+    implication that callers may wish to cache such files' contents locally.
+*/
+// int hisremote(const char *filename) HTS_RESULT_USED;
+
+/*!
+  @abstract  Flush (for output streams) and close the stream
+  @return    0 if successful, or EOF (with errno set) if an error occurred.
+*/
+int hclose(hFILE *fp) HTS_RESULT_USED;
+
+/*!
+  @abstract  Close the stream, without flushing or propagating errors
+  @notes     For use while cleaning up after an error only.  Preserves errno.
+*/
+void hclose_abruptly(hFILE *fp);
+
+/*!
+  @abstract  Return the stream's error indicator
+  @return    Non-zero (in fact, an errno value) if an error has occurred.
+  @notes     This would be called herror() and return true/false to parallel
+    ferror(3), but a networking-related herror(3) function already exists.  */
+static inline int herrno(hFILE *fp)
+{
+    return fp->has_errno;
+}
+
+/*!
+  @abstract  Clear the stream's error indicator
+*/
+static inline void hclearerr(hFILE *fp)
+{
+    fp->has_errno = 0;
+}
+
+/*!
+  @abstract  Reposition the read/write stream offset
+  @return    The resulting offset within the stream (as per lseek(2)),
+    or negative if an error occurred.
+*/
+off_t hseek(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED;
+
+/*!
+  @abstract  Report the current stream offset
+  @return    The offset within the stream, starting from zero.
+*/
+static inline off_t htell(hFILE *fp)
+{
+    return fp->offset + (fp->begin - fp->buffer);
+}
+
+/*!
+  @abstract  Read one character from the stream
+  @return    The character read, or EOF on end-of-file or error
+*/
+static inline int hgetc(hFILE *fp)
+{
+    extern int hgetc2(hFILE *);
+    return (fp->end > fp->begin)? (unsigned char) *(fp->begin++) : hgetc2(fp);
+}
+
+/*!
+  @abstract  Peek at characters to be read without removing them from buffers
+  @param fp      The file stream
+  @param buffer  The buffer to which the peeked bytes will be written
+  @param nbytes  The number of bytes to peek at; limited by the size of the
+    internal buffer, which could be as small as 4K.
+  @return    The number of bytes peeked, which may be less than nbytes if EOF
+    is encountered; or negative, if there was an I/O error.
+  @notes  The characters peeked at remain in the stream's internal buffer,
+    and will be returned by later hread() etc calls.
+*/
+ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) HTS_RESULT_USED;
+
+/*!
+  @abstract  Read a block of characters from the file
+  @return    The number of bytes read, or negative if an error occurred.
+  @notes     The full nbytes requested will be returned, except as limited
+    by EOF or I/O errors.
+*/
+static inline ssize_t HTS_RESULT_USED
+hread(hFILE *fp, void *buffer, size_t nbytes)
+{
+    extern ssize_t hread2(hFILE *, void *, size_t, size_t);
+
+    size_t n = fp->end - fp->begin;
+    if (n > nbytes) n = nbytes;
+    memcpy(buffer, fp->begin, n);
+    fp->begin += n;
+    return (n == nbytes)? (ssize_t) n : hread2(fp, buffer, nbytes, n);
+}
+
+/*!
+  @abstract  Write a character to the stream
+  @return    The character written, or EOF if an error occurred.
+*/
+static inline int hputc(int c, hFILE *fp)
+{
+    extern int hputc2(int, hFILE *);
+    if (fp->begin < fp->limit) *(fp->begin++) = c;
+    else c = hputc2(c, fp);
+    return c;
+}
+
+/*!
+  @abstract  Write a string to the stream
+  @return    0 if successful, or EOF if an error occurred.
+*/
+static inline int hputs(const char *text, hFILE *fp)
+{
+    extern int hputs2(const char *, size_t, size_t, hFILE *);
+
+    size_t nbytes = strlen(text), n = fp->limit - fp->begin;
+    if (n > nbytes) n = nbytes;
+    memcpy(fp->begin, text, n);
+    fp->begin += n;
+    return (n == nbytes)? 0 : hputs2(text, nbytes, n, fp);
+}
+
+/*!
+  @abstract  Write a block of characters to the file
+  @return    Either nbytes, or negative if an error occurred.
+  @notes     In the absence of I/O errors, the full nbytes will be written.
+*/
+static inline ssize_t HTS_RESULT_USED
+hwrite(hFILE *fp, const void *buffer, size_t nbytes)
+{
+    extern ssize_t hwrite2(hFILE *, const void *, size_t, size_t);
+
+    size_t n = fp->limit - fp->begin;
+    if (n > nbytes) n = nbytes;
+    memcpy(fp->begin, buffer, n);
+    fp->begin += n;
+    return (n==nbytes)? (ssize_t) n : hwrite2(fp, buffer, nbytes, n);
+}
+
+/*!
+  @abstract  For writing streams, flush buffered output to the underlying stream
+  @return    0 if successful, or EOF if an error occurred.
+*/
+int hflush(hFILE *fp) HTS_RESULT_USED;
+
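+/* Illustrative sketch (not part of this header): copying one stream to
+   another with this API.  File names and error handling are illustrative
+   assumptions only.
+
+       hFILE *in = hopen("input.bin", "r"), *out = hopen("output.bin", "w");
+       char buf[65536];
+       ssize_t n;
+       while ((n = hread(in, buf, sizeof buf)) > 0) {
+           if (hwrite(out, buf, n) != n) break;        // write error
+       }
+       // n < 0 here indicates a read error; herrno(in) reports the errno value
+       if (hclose(out) == EOF || hclose(in) == EOF)
+           perror("copy");
+*/
+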
+// #ifdef __cplusplus
+// }
+// #endif
+
+#endif
diff --git a/hfile_internal.h b/hfile_internal.h
new file mode 100644
index 0000000..0997705
--- /dev/null
+++ b/hfile_internal.h
@@ -0,0 +1,76 @@
+/*  hfile_internal.h -- internal parts of low-level input/output streams.
+
+    Copyright (C) 2013-2015 Genome Research Ltd.
+
+    Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#ifndef HFILE_INTERNAL_H
+#define HFILE_INTERNAL_H
+
+#include "hfile.h"
+
+struct hFILE_backend {
+    /* As per read(2), returning the number of bytes read (possibly 0) or
+       negative (and setting errno) on errors.  Front-end code will call this
+       repeatedly if necessary to attempt to get the desired byte count.  */
+    ssize_t (*read)(hFILE *fp, void *buffer, size_t nbytes) HTS_RESULT_USED;
+
+    /* As per write(2), returning the number of bytes written or negative (and
+       setting errno) on errors.  Front-end code will call this repeatedly if
+       necessary until the desired block is written or an error occurs.  */
+    ssize_t (*write)(hFILE *fp, const void *buffer, size_t nbytes)
+        HTS_RESULT_USED;
+
+    /* As per lseek(2), returning the resulting offset within the stream or
+       negative (and setting errno) on errors.  */
+    off_t (*seek)(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED;
+
+    /* Performs low-level flushing, if any, e.g., fsync(2); for writing streams
+       only.  Returns 0 for success or negative (and sets errno) on errors. */
+    int (*flush)(hFILE *fp) HTS_RESULT_USED;
+
+    /* Closes the underlying stream (for output streams, the buffer will
+       already have been flushed), returning 0 for success or negative (and
+       setting errno) on errors, as per close(2).  */
+    int (*close)(hFILE *fp) HTS_RESULT_USED;
+};
+
+/* These are called from the hopen() dispatcher, and should call hfile_init()
+   to malloc a struct "derived" from hFILE and initialise it appropriately,
+   including setting base.backend to their own backend vector.  */
+hFILE *hopen_irods(const char *filename, const char *mode);
+hFILE *hopen_net(const char *filename, const char *mode);
+
+/* May be called by hopen_*() functions to decode a fopen()-style mode into
+   open(2)-style flags.  */
+int hfile_oflags(const char *mode);
+
+/* Must be called by hopen_*() functions to allocate the hFILE struct and set
+   up its base.  Capacity is a suggested buffer size (e.g., via fstat(2))
+   or 0 for a default-sized buffer.  */
+hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity);
+
+/* May be called by hopen_*() functions to undo the effects of hfile_init()
+   in the event opening the stream subsequently fails.  (This is safe to use
+   even if fp is NULL.  This takes care to preserve errno.)  */
+void hfile_destroy(hFILE *fp);
+
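+/* Illustrative sketch (hypothetical "foo" backend): the pattern expected of
+   an hopen_*() function, mirroring hopen_fd() in hfile.c.
+
+       typedef struct { hFILE base; int handle; } hFILE_foo;
+
+       hFILE *hopen_foo(const char *filename, const char *mode)
+       {
+           hFILE_foo *fp = (hFILE_foo *) hfile_init(sizeof (hFILE_foo), mode, 0);
+           if (fp == NULL) return NULL;
+           // ... acquire fp->handle; on failure call hfile_destroy(&fp->base)
+           //     and return NULL ...
+           fp->base.backend = &foo_backend;  // read/write/seek/flush/close vector
+           return &fp->base;
+       }
+*/
+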
+#endif
diff --git a/hts.h b/hts.h
new file mode 100644
index 0000000..084c162
--- /dev/null
+++ b/hts.h
@@ -0,0 +1,456 @@
+/*  hts.h -- format-neutral I/O, indexing, and iterator API functions.
+
+    Copyright (C) 2012-2014 Genome Research Ltd.
+    Copyright (C) 2012 Broad Institute.
+
+    Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#ifndef HTSLIB_HTS_H
+#define HTSLIB_HTS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifndef HTS_BGZF_TYPEDEF
+typedef struct BGZF BGZF;
+#define HTS_BGZF_TYPEDEF
+#endif
+struct cram_fd;
+struct hFILE;
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+    size_t l, m;
+    char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
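+// (kroundup32 rounds a positive 32-bit value up to the next power of two in
+// place; a value that is already a power of two is left unchanged.)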
+
+/**
+ * hts_expand()  - expands the memory block pointed to by $ptr;
+ * hts_expand0() - ditto, and also zero-fills the newly allocated part.
+ *
+ * @param type_t  the element type
+ * @param n       requested number of elements
+ * @param m       lvalue holding the number of elements currently allocated
+ * @param ptr     the array to expand; may be reallocated
+ */
+#define hts_expand(type_t, n, m, ptr) if ((n) > (m)) { \
+        (m) = (n); kroundup32(m); \
+        (ptr) = (type_t*)realloc((ptr), (m) * sizeof(type_t)); \
+    }
+#define hts_expand0(type_t, n, m, ptr) if ((n) > (m)) { \
+        int t = (m); (m) = (n); kroundup32(m); \
+        (ptr) = (type_t*)realloc((ptr), (m) * sizeof(type_t)); \
+        memset(((type_t*)ptr)+t,0,sizeof(type_t)*((m)-t)); \
+    }
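+
+// Illustrative sketch: growing a dynamic array one element at a time; the
+// helper functions are hypothetical.
+//
+//     int n = 0, max = 0;
+//     uint32_t *a = NULL;
+//     while (have_more_items()) {
+//         hts_expand(uint32_t, n + 1, max, a);  // ensure room for element n
+//         a[n++] = next_item();
+//     }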
+
+/************
+ * File I/O *
+ ************/
+
+// Add new entries only at the end (but before the *_maximum entry)
+// of these enums, as their numbering is part of the htslib ABI.
+
+enum htsFormatCategory {
+    unknown_category,
+    sequence_data,    // Sequence data -- SAM, BAM, CRAM, etc
+    variant_data,     // Variant calling data -- VCF, BCF, etc
+    index_file,       // Index file associated with some data file
+    region_list,      // Coordinate intervals or regions -- BED, etc
+    category_maximum = 32767
+};
+
+enum htsExactFormat {
+    unknown_format,
+    binary_format, text_format,
+    sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed,
+    format_maximum = 32767
+};
+
+enum htsCompression {
+    no_compression, gzip, bgzf, custom,
+    compression_maximum = 32767
+};
+
+typedef struct htsFormat {
+    enum htsFormatCategory category;
+    enum htsExactFormat format;
+    struct { short major, minor; } version;
+    enum htsCompression compression;
+    short compression_level;  // currently unused
+    void *specific;  // currently unused
+} htsFormat;
+
+// Maintainers note htsFile cannot be an opaque structure because some of its
+// fields are part of libhts.so's ABI (hence these fields must not be moved):
+//  - fp is used in the public sam_itr_next()/etc macros
+//  - is_bin is used directly in samtools <= 1.1 and bcftools <= 1.1
+//  - is_write and is_cram are used directly in samtools <= 1.1
+//  - fp is used directly in samtools (up to and including current develop)
+//  - line is used directly in bcftools (up to and including current develop)
+typedef struct {
+    uint32_t is_bin:1, is_write:1, is_be:1, is_cram:1, dummy:28;
+    int64_t lineno;
+    kstring_t line;
+    char *fn, *fn_aux;
+    union {
+        BGZF *bgzf;
+        struct cram_fd *cram;
+        struct hFILE *hfile;
+        void *voidp;
+    } fp;
+    htsFormat format;
+} htsFile;
+
+// REQUIRED_FIELDS
+enum sam_fields {
+    SAM_QNAME = 0x00000001,
+    SAM_FLAG  = 0x00000002,
+    SAM_RNAME = 0x00000004,
+    SAM_POS   = 0x00000008,
+    SAM_MAPQ  = 0x00000010,
+    SAM_CIGAR = 0x00000020,
+    SAM_RNEXT = 0x00000040,
+    SAM_PNEXT = 0x00000080,
+    SAM_TLEN  = 0x00000100,
+    SAM_SEQ   = 0x00000200,
+    SAM_QUAL  = 0x00000400,
+    SAM_AUX   = 0x00000800,
+    SAM_RGAUX = 0x00001000,
+};
+
+enum cram_option {
+    CRAM_OPT_DECODE_MD,
+    CRAM_OPT_PREFIX,
+    CRAM_OPT_VERBOSITY,
+    CRAM_OPT_SEQS_PER_SLICE,
+    CRAM_OPT_SLICES_PER_CONTAINER,
+    CRAM_OPT_RANGE,
+    CRAM_OPT_VERSION,
+    CRAM_OPT_EMBED_REF,
+    CRAM_OPT_IGNORE_MD5,
+    CRAM_OPT_REFERENCE,
+    CRAM_OPT_MULTI_SEQ_PER_SLICE,
+    CRAM_OPT_NO_REF,
+    CRAM_OPT_USE_BZIP2,
+    CRAM_OPT_SHARED_REF,
+    CRAM_OPT_NTHREADS,
+    CRAM_OPT_THREAD_POOL,
+    CRAM_OPT_USE_LZMA,
+    CRAM_OPT_USE_RANS,
+    CRAM_OPT_REQUIRED_FIELDS,
+};
+
+/**********************
+ * Exported functions *
+ **********************/
+
+extern int hts_verbose;
+
+/*! @abstract Table for converting a nucleotide character to 4-bit encoding.
+The input character may be either an IUPAC ambiguity code, '=' for 0, or
+'0'/'1'/'2'/'3' for a result of 1/2/4/8.  The result is encoded as 1/2/4/8
+for A/C/G/T or combinations of these bits for ambiguous bases.
+*/
+extern const unsigned char seq_nt16_table[256];
+
+/*! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
+ambiguity code letter (or '=' when given 0).
+*/
+extern const char seq_nt16_str[];
+
+/*! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
+Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
+*/
+extern const int seq_nt16_int[];
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+  @abstract  Get the htslib version number
+  @return    For released versions, a string like "N.N[.N]"; or git describe
+  output if using a library built within a Git repository.
+*/
+const char *hts_version(void);
+
+/*!
+  @abstract    Determine format by peeking at the start of a file
+  @param fp    File opened for reading, positioned at the beginning
+  @param fmt   Format structure that will be filled out on return
+  @return      0 for success, or negative if an error occurred.
+*/
+int hts_detect_format(struct hFILE *fp, htsFormat *fmt);
+
+/*!
+  @abstract    Get a human-readable description of the file format
+  @return      Description string, to be freed by the caller after use.
+*/
+char *hts_format_description(const htsFormat *format);
+
+/*!
+  @abstract       Open a SAM/BAM/CRAM/VCF/BCF/etc file
+  @param fn       The file name or "-" for stdin/stdout
+  @param mode     Mode matching /[rwa][bcuz0-9]+/
+  @discussion
+      With 'r' opens for reading; any further format mode letters are ignored
+      as the format is detected by checking the first few bytes or BGZF blocks
+      of the file.  With 'w' or 'a' opens for writing or appending, with format
+      specifier letters:
+        b  binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
+        c  CRAM format
+        g  gzip compressed
+        u  uncompressed
+        z  bgzf compressed
+        [0-9]  zlib compression level
+      Note that there is a distinction between 'u' and '0': the first yields
+      plain uncompressed output whereas the latter outputs uncompressed data
+      wrapped in the zlib format.
+  @example
+      [rw]b .. compressed BCF, BAM, FAI
+      [rw]u .. uncompressed BCF
+      [rw]z .. compressed VCF
+      [rw]  .. uncompressed VCF
+*/
+htsFile *hts_open(const char *fn, const char *mode);
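+
+/* Illustrative sketch: open a BGZF-compressed file for writing ("wz") and
+   close it; the file name is hypothetical.
+
+       htsFile *fp = hts_open("out.vcf.gz", "wz");
+       if (fp == NULL) return -1;
+       // ... write records ...
+       if (hts_close(fp) < 0) return -1;
+*/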
+
+/*!
+  @abstract       Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
+  @param fp       The already-open file handle
+  @param fn       The file name associated with the stream
+  @param mode     Open mode, as per hts_open()
+*/
+htsFile *hts_hopen(struct hFILE *fp, const char *fn, const char *mode);
+
+/*!
+  @abstract  Close a file handle, flushing buffered data for output streams
+  @param fp  The file handle to be closed
+  @return    0 for success, or negative if an error occurred.
+*/
+int hts_close(htsFile *fp);
+
+/*!
+  @abstract  Returns the file's format information
+  @param fp  The file handle
+  @return    Read-only pointer to the file's htsFormat.
+*/
+const htsFormat *hts_get_format(htsFile *fp);
+
+/*!
+  @abstract  Sets a specified CRAM option on the open file handle.
+  @param fp  The file handle of the open file.
+  @param opt The CRAM_OPT_* option.
+  @param ... Optional arguments, dependent on the option used.
+  @return    0 for success, or negative if an error occurred.
+*/
+int hts_set_opt(htsFile *fp, enum cram_option opt, ...);
+
+int hts_getline(htsFile *fp, int delimiter, kstring_t *str);
+char **hts_readlines(const char *fn, int *_n);
+/*!
+    @abstract       Parse a comma-separated list or read a list from a file
+    @param fn       File name or comma-separated list
+    @param is_file  Non-zero if 'fn' names a file to read the list from;
+                    0 if 'fn' is itself the comma-separated list
+    @param _n       Size of the output array (number of items read)
+    @return         NULL on failure or pointer to newly allocated array of
+                    strings
+*/
+char **hts_readlist(const char *fn, int is_file, int *_n);
+
+/*!
+  @abstract  Create extra threads to aid compress/decompression for this file
+  @param fp  The file handle
+  @param n   The number of worker threads to create
+  @return    0 for success, or negative if an error occurred.
+  @notes     THIS THREADING API IS LIKELY TO CHANGE IN FUTURE.
+*/
+int hts_set_threads(htsFile *fp, int n);
+
+/*!
+  @abstract  Set .fai filename for a file opened for reading
+  @return    0 for success, negative on failure
+  @discussion
+      Called before *_hdr_read(), this provides the name of a .fai file
+      used to provide a reference list if the htsFile contains no @SQ headers.
+*/
+int hts_set_fai_filename(htsFile *fp, const char *fn_aux);
+
+#ifdef __cplusplus
+}
+#endif
+
+/************
+ * Indexing *
+ ************/
+
+/*!
+These HTS_IDX_* macros are used as special tid values for hts_itr_query()/etc,
+producing iterators operating as follows:
+ - HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file
+ - HTS_IDX_START  iterates over the entire file
+ - HTS_IDX_REST   iterates from the current position to the end of the file
+ - HTS_IDX_NONE   always returns "no more alignment records"
+When one of these special tid values is used, beg and end are ignored.
+When REST or NONE is used, idx is also ignored and may be NULL.
+*/
+#define HTS_IDX_NOCOOR (-2)
+#define HTS_IDX_START  (-3)
+#define HTS_IDX_REST   (-4)
+#define HTS_IDX_NONE   (-5)
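+
+/* Illustrative sketch: iterating over the whole file via the special tid
+   value (idx, fp, rec, data and readrec are hypothetical):
+
+       hts_itr_t *iter = hts_itr_query(idx, HTS_IDX_START, 0, 0, readrec);
+       while (hts_itr_next(fp, iter, &rec, data) >= 0)
+           ;                                    // process rec here
+       hts_itr_destroy(iter);
+*/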
+
+#define HTS_FMT_CSI 0
+#define HTS_FMT_BAI 1
+#define HTS_FMT_TBI 2
+#define HTS_FMT_CRAI 3
+
+struct __hts_idx_t;
+typedef struct __hts_idx_t hts_idx_t;
+
+typedef struct {
+    uint64_t u, v;
+} hts_pair64_t;
+
+typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end);
+
+typedef struct {
+    uint32_t read_rest:1, finished:1, dummy:29;
+    int tid, beg, end, n_off, i;
+    int curr_tid, curr_beg, curr_end;
+    uint64_t curr_off;
+    hts_pair64_t *off;
+    hts_readrec_func *readrec;
+    struct {
+        int n, m;
+        int *a;
+    } bins;
+} hts_itr_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    #define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7)
+    #define hts_bin_parent(l) (((l) - 1) >> 3)
+
+    hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls);
+    void hts_idx_destroy(hts_idx_t *idx);
+    int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped);
+    void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset);
+
+    void hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt);
+    hts_idx_t *hts_idx_load(const char *fn, int fmt);
+
+    uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta);
+    void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy);
+
+    int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped);
+    uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx);
+
+    const char *hts_parse_reg(const char *s, int *beg, int *end);
+    hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec);
+    void hts_itr_destroy(hts_itr_t *iter);
+
+    typedef int (*hts_name2id_f)(void*, const char*);
+    typedef const char *(*hts_id2name_f)(void*, int);
+    typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec);
+
+    hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec);
+    int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data);
+    const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values
+
+    /**
+     * hts_file_type() - Convenience function to determine file type
+     * DEPRECATED:  This function has been replaced by hts_detect_format().
+     * It and these FT_* macros will be removed in a future HTSlib release.
+     */
+    #define FT_UNKN   0
+    #define FT_GZ     1
+    #define FT_VCF    2
+    #define FT_VCF_GZ (FT_GZ|FT_VCF)
+    #define FT_BCF    (1<<2)
+    #define FT_BCF_GZ (FT_GZ|FT_BCF)
+    #define FT_STDIN  (1<<3)
+    int hts_file_type(const char *fname);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+static inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
+{
+    int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7;
+    for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l))
+        if (beg>>s == end>>s) return t + (beg>>s);
+    return 0;
+}
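+
+/* With the classical BAI parameters (min_shift=14, n_lvls=5) this matches the
+   SAM specification's reg2bin(); e.g. hts_reg2bin(0, 1<<14, 14, 5) yields
+   4681, the first level-5 (16kb) bin. */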
+
+static inline int hts_bin_bot(int bin, int n_lvls)
+{
+    int l, b;
+    for (l = 0, b = bin; b; ++l, b = hts_bin_parent(b)); // compute the level of bin
+    return (bin - hts_bin_first(l)) << (n_lvls - l) * 3;
+}
+
+/**************
+ * Endianness *
+ **************/
+
+static inline int ed_is_big(void)
+{
+    long one= 1;
+    return !(*((char *)(&one)));
+}
+static inline uint16_t ed_swap_2(uint16_t v)
+{
+    return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
+}
+static inline void *ed_swap_2p(void *x)
+{
+    *(uint16_t*)x = ed_swap_2(*(uint16_t*)x);
+    return x;
+}
+static inline uint32_t ed_swap_4(uint32_t v)
+{
+    v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
+    return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
+}
+static inline void *ed_swap_4p(void *x)
+{
+    *(uint32_t*)x = ed_swap_4(*(uint32_t*)x);
+    return x;
+}
+static inline uint64_t ed_swap_8(uint64_t v)
+{
+    v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
+    v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
+    return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
+}
+static inline void *ed_swap_8p(void *x)
+{
+    *(uint64_t*)x = ed_swap_8(*(uint64_t*)x);
+    return x;
+}
+
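+/* Illustrative sketch: writing a uint32_t in little-endian byte order
+   regardless of host endianness (f is a hypothetical FILE*).
+
+       uint32_t x = value;
+       if (ed_is_big()) ed_swap_4p(&x);
+       fwrite(&x, 4, 1, f);
+*/
+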
+#endif
diff --git a/hts_defs.h b/hts_defs.h
new file mode 100644
index 0000000..4a244da
--- /dev/null
+++ b/hts_defs.h
@@ -0,0 +1,56 @@
+/*  hts_defs.h -- Miscellaneous definitions.
+
+    Copyright (C) 2013-2015 Genome Research Ltd.
+
+    Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#ifndef HTSLIB_HTS_DEFS_H
+#define HTSLIB_HTS_DEFS_H
+
+#if (__GNUC__ >= 3) || \
+    (defined __clang__ && __clang_major__ >= 2)
+#define HTS_NORETURN __attribute__ ((__noreturn__))
+#else
+#define HTS_NORETURN
+#endif
+
+#if (defined __clang__ && __clang_major__ >= 3) || \
+    (defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__==4 && __GNUC_MINOR__ >= 5)))
+#define HTS_RESULT_USED __attribute__ ((__warn_unused_result__))
+#else
+#define HTS_RESULT_USED
+#endif
+
+#if defined __clang__ || \
+    (defined __GNUC__ && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95)))
+#define HTS_UNUSED __attribute__ ((__unused__))
+#else
+#define HTS_UNUSED
+#endif
+
+#if (defined __clang__ && (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 1))) || \
+    (defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)))
+#define HTS_DEPRECATED(x) __attribute__ ((__deprecated__(x))) 
+#else
+#define HTS_DEPRECATED(x)
+#endif
+
+#endif
diff --git a/khash.h b/khash.h
new file mode 100644
index 0000000..e900842
--- /dev/null
+++ b/khash.h
@@ -0,0 +1,621 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor at live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+	int ret, is_missing;
+	khiter_t k;
+	khash_t(32) *h = kh_init(32);
+	k = kh_put(32, h, 5, &ret);
+	kh_value(h, k) = 10;
+	k = kh_get(32, h, 10);
+	is_missing = (k == kh_end(h));
+	k = kh_get(32, h, 5);
+	kh_del(32, h, k);
+	for (k = kh_begin(h); k != kh_end(h); ++k)
+		if (kh_exist(h, k)) kh_value(h, k) = 1;
+	kh_destroy(32, h);
+	return 0;
+}
+*/
+
+/*
+  2013-05-02 (0.2.8):
+
+	* Use quadratic probing. When the capacity is a power of 2, the stepping
+	  function i*(i+1)/2 guarantees that each bucket is traversed. It is
+	  better than double hashing on cache performance and is more robust
+	  than linear probing.
+
+	  In theory, double hashing should be more robust than quadratic probing.
+	  However, my implementation is probably not suited to large hash tables,
+	  because the second hash function is closely tied to the first hash
+	  function, which reduces the effectiveness of double hashing.
+
+	Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
+
+  2011-12-29 (0.2.7):
+
+    * Minor code clean up; no actual effect.
+
+  2011-09-16 (0.2.6):
+
+	* The capacity is a power of 2. This seems to dramatically improve the
+	  speed for simple keys. Thanks to Zilong Tan for the suggestion. Reference:
+
+	   - http://code.google.com/p/ulib/
+	   - http://nothings.org/computer/judy/
+
+	* Optionally allow linear probing, which usually has better performance
+	  for random input. Double hashing is still the default as it
+	  is more robust to certain non-random input.
+
+	* Added Wang's integer hash function (not used by default). This hash
+	  function is more robust to certain non-random input.
+
+  2011-02-14 (0.2.5):
+
+    * Allow declaring global functions.
+
+  2009-09-26 (0.2.4):
+
+    * Improved portability
+
+  2008-09-19 (0.2.3):
+
+	* Corrected the example
+	* Improved interfaces
+
+  2008-09-11 (0.2.2):
+
+	* Improved speed a little in kh_put()
+
+  2008-09-10 (0.2.1):
+
+	* Added kh_clear()
+	* Fixed a compiling error
+
+  2008-09-02 (0.2.0):
+
+	* Changed to token concatenation which increases flexibility.
+
+  2008-08-31 (0.1.2):
+
+	* Fixed a bug in kh_get(), which has not been tested previously.
+
+  2008-08-31 (0.1.1):
+
+	* Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+  @header
+
+  Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compiler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
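+
+/* Each bucket has two flag bits, packed 16 buckets per khint32_t word: bit 1
+   of each pair means "empty" and bit 0 means "deleted", so __ac_fsize(m) is
+   the number of flag words needed for m buckets. */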
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+	typedef struct kh_##name##_s { \
+		khint_t n_buckets, size, n_occupied, upper_bound; \
+		khint32_t *flags; \
+		khkey_t *keys; \
+		khval_t *vals; \
+	} kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t)	 					\
+	extern kh_##name##_t *kh_init_##name(void);							\
+	extern void kh_destroy_##name(kh_##name##_t *h);					\
+	/* extern void kh_clear_##name(kh_##name##_t *h); */		\
+	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); 	\
+	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	SCOPE kh_##name##_t *kh_init_##name(void) {							\
+		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t));		\
+	}																	\
+	SCOPE void kh_destroy_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h) {														\
+			kfree((void *)h->keys); kfree(h->flags);					\
+			kfree((void *)h->vals);										\
+			kfree(h);													\
+		}																\
+	}																	\
+	/*
+	SCOPE void kh_clear_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h && h->flags) {											\
+			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+			h->size = h->n_occupied = 0;								\
+		}																\
+	}
+*/									\
+	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) 	\
+	{																	\
+		if (h->n_buckets) {												\
+			khint_t k, i, last, mask, step = 0; \
+			mask = h->n_buckets - 1;									\
+			k = __hash_func(key); i = k & mask;							\
+			last = i; \
+			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+				i = (i + (++step)) & mask; \
+				if (i == last) return h->n_buckets;						\
+			}															\
+			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
+		} else return 0;												\
+	}																	\
+	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+		khint32_t *new_flags = 0;										\
+		khint_t j = 1;													\
+		{																\
+			kroundup32(new_n_buckets); 									\
+			if (new_n_buckets < 4) new_n_buckets = 4;					\
+			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
+			else { /* hash table size to be changed (shrink or expand); rehash */ \
+				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t));	\
+				if (!new_flags) return -1;								\
+				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+				if (h->n_buckets < new_n_buckets) {	/* expand */		\
+					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+					if (!new_keys) { kfree(new_flags); return -1; }		\
+					h->keys = new_keys;									\
+					if (kh_is_map) {									\
+						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+						if (!new_vals) { kfree(new_flags); return -1; }	\
+						h->vals = new_vals;								\
+					}													\
+				} /* otherwise shrink */								\
+			}															\
+		}																\
+		if (j) { /* rehashing is needed */								\
+			for (j = 0; j != h->n_buckets; ++j) {						\
+				if (__ac_iseither(h->flags, j) == 0) {					\
+					khkey_t key = h->keys[j];							\
+					khval_t val;										\
+					khint_t new_mask;									\
+					new_mask = new_n_buckets - 1; 						\
+					if (kh_is_map) val = h->vals[j];					\
+					__ac_set_isdel_true(h->flags, j);					\
+					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+						khint_t k, i, step = 0; \
+						k = __hash_func(key);							\
+						i = k & new_mask;								\
+						while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
+						__ac_set_isempty_false(new_flags, i);			\
+						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+						} else { /* write the element and jump out of the loop */ \
+							h->keys[i] = key;							\
+							if (kh_is_map) h->vals[i] = val;			\
+							break;										\
+						}												\
+					}													\
+				}														\
+			}															\
+			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+				h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+				if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+			}															\
+			kfree(h->flags); /* free the working space */				\
+			h->flags = new_flags;										\
+			h->n_buckets = new_n_buckets;								\
+			h->n_occupied = h->size;									\
+			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+		}																\
+		return 0;														\
+	}																	\
+	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+	{																	\
+		khint_t x;														\
+		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+			if (h->n_buckets > (h->size<<1)) {							\
+				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+					*ret = -1; return h->n_buckets;						\
+				}														\
+			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+				*ret = -1; return h->n_buckets;							\
+			}															\
+		} /* TODO: implement automatic shrinking; resize() already supports shrinking */ \
+		{																\
+			khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
+			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */	\
+			else {														\
+				last = i; \
+				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+					if (__ac_isdel(h->flags, i)) site = i;				\
+					i = (i + (++step)) & mask; \
+					if (i == last) { x = site; break; }					\
+				}														\
+				if (x == h->n_buckets) {								\
+					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+					else x = i;											\
+				}														\
+			}															\
+		}																\
+		if (__ac_isempty(h->flags, x)) { /* not present at all */		\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size; ++h->n_occupied;									\
+			*ret = 1;													\
+		} else if (__ac_isdel(h->flags, x)) { /* deleted */				\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size;													\
+			*ret = 2;													\
+		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+		return x;														\
+	}																	\
+	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)				\
+	{																	\
+		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
+			__ac_set_isdel_true(h->flags, x);							\
+			--h->size;													\
+		}																\
+	}
+
+#define KHASH_DECLARE(name, khkey_t, khval_t)		 					\
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+  @abstract     Integer hash function
+  @param  key   The integer [khint32_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+  @abstract     Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     64-bit integer hash function
+  @param  key   The integer [khint64_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+  @abstract     64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     const char* hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+	khint_t h = (khint_t)*s;
+	if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
+	return h;
+}
+/*! @function
+  @abstract     Another interface to const char* hash function
+  @param  key   Pointer to a null terminated string [const char*]
+  @return       The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+  @abstract     Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+    key += ~(key << 15);
+    key ^=  (key >> 10);
+    key +=  (key << 3);
+    key ^=  (key >> 6);
+    key += ~(key << 11);
+    key ^=  (key >> 16);
+    return key;
+}
+#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+  @abstract Type of the hash table.
+  @param  name  Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+  @abstract     Initialize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @return       Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+  @abstract     Destroy a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+  @abstract     Reset a hash table without deallocating memory.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+  @abstract     Resize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  s     New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+  @abstract     Insert a key into the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @param  r     Extra return code: -1 if the operation failed;
+                0 if the key is present in the hash table;
+                1 if the bucket is empty (never used);
+                2 if the element in the bucket has been deleted [int*]
+  @return       Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
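+
+/*
+  A minimal kh_put() usage sketch, assuming a map instantiated with
+  KHASH_MAP_INIT_INT(32, int) and an existing table h:
+
+	int ret;
+	khiter_t k = kh_put(32, h, key, &ret);
+	if (ret < 0) abort();              // allocation failure
+	if (ret > 0) kh_value(h, k) = 0;   // fresh or previously-deleted bucket
+	++kh_value(h, k);                  // safe for all three success cases
+*/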
+
+/*! @function
+  @abstract     Retrieve a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+  @abstract     Remove a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in a segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Get the start iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+  @abstract     Get the number of buckets in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Iterate over the entries in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  kvar  Variable to which key will be assigned
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(kvar) = kh_key(h,__i);								\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
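+
+/*
+  A minimal kh_foreach() usage sketch; kvar and vvar must match the
+  instantiated key and value types:
+
+	khint32_t key;
+	int val;
+	kh_foreach(h, key, val, printf("%u -> %d\n", key, val));
+*/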
+
+/*! @function
+  @abstract     Iterate over the values in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/* More convenient interfaces */
+
+/*! @function
+  @abstract     Instantiate a hash set containing integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name)										\
+	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t)								\
+	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash set containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name)										\
+	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t)								\
+	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name)										\
+	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t)								\
+	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
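+
+/*
+  A minimal string-keyed map sketch. Note that khash stores only the key
+  pointer, so the string must outlive the table:
+
+	KHASH_MAP_INIT_STR(str, int)
+	...
+	khash_t(str) *h = kh_init(str);
+	int ret;
+	khiter_t k = kh_put(str, h, "apple", &ret);
+	if (ret) kh_value(h, k) = 0;
+	kh_destroy(str, h);
+*/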
+
+#endif /* __AC_KHASH_H */
diff --git a/pgen_compress.cpp b/pgen_compress.cpp
new file mode 100644
index 0000000..335c96b
--- /dev/null
+++ b/pgen_compress.cpp
@@ -0,0 +1,242 @@
+#include "pgenlib_internal.h"
+
+// #define SUBSET_TEST
+
+int32_t main(int32_t argc, char** argv) {
+#ifdef __cplusplus
+  using namespace plink2;
+#endif
+  pglerr_t reterr = kPglRetSuccess;
+  unsigned char* pgfi_alloc = nullptr;
+  unsigned char* pgr_alloc = nullptr;
+  unsigned char* spgw_alloc = nullptr;
+  uintptr_t* genovec = nullptr;
+  uintptr_t* raregeno = nullptr;
+  uintptr_t* sample_include = nullptr;
+  uint32_t* sample_include_cumulative_popcounts = nullptr;
+  uint32_t* difflist_sample_ids = nullptr;
+  FILE* outfile = nullptr;
+  pgen_header_ctrl_t header_ctrl;
+  st_pgen_writer_t spgw;
+  uint32_t write_sample_ct;
+  pgen_file_info_t pgfi;
+  pgen_reader_t pgr;
+  pgfi_preinit(&pgfi);
+  pgr_preinit(&pgr);
+  spgw_preinit(&spgw);
+  {
+    const uint32_t use_mmap = 0;
+    if ((argc < 3) || (argc > 5)) {
+      fputs(
+"Usage:\n"
+"pgen_compress [input .bed or .pgen] [output filename] {sample_ct}\n"
+"  (sample_ct is required when loading a .bed file)\n"
+"pgen_compress -u [input .pgen] [output .bed]\n"
+	    , stdout);
+      goto main_ret_INVALID_CMDLINE;
+    }
+    const uint32_t decompress = (argv[1][0] == '-') && (argv[1][1] == 'u') && (argv[1][2] == '\0');
+    uint32_t sample_ct = 0xffffffffU;
+    if (((uint32_t)argc) == 4 + decompress) {
+      if (scan_posint_defcap(argv[3 + decompress], &sample_ct)) {
+	goto main_ret_INVALID_CMDLINE;
+      }
+    }
+    char errstr_buf[kPglErrstrBufBlen];
+    uintptr_t cur_alloc_cacheline_ct;
+    reterr = pgfi_init_phase1(argv[1 + decompress], 0xffffffffU, sample_ct, use_mmap, &header_ctrl, &pgfi, &cur_alloc_cacheline_ct, errstr_buf);
+    if (reterr) {
+      fputs(errstr_buf, stderr);
+      goto main_ret_1;
+    }
+    sample_ct = pgfi.raw_sample_ct;
+    if (!sample_ct) {
+      fprintf(stderr, "error: sample_ct == 0\n");
+      goto main_ret_INVALID_CMDLINE;
+    }
+    const uint32_t variant_ct = pgfi.raw_variant_ct;
+    if (!variant_ct) {
+      fprintf(stderr, "error: variant_ct == 0\n");
+      goto main_ret_INVALID_CMDLINE;
+    }
+    if (cachealigned_malloc(cur_alloc_cacheline_ct * kCacheline, &pgfi_alloc)) {
+      goto main_ret_NOMEM;
+    }
+    uint32_t max_vrec_width;
+    // todo: test block-fread
+    reterr = pgfi_init_phase2(header_ctrl, 0, 0, 0, 0, variant_ct, &max_vrec_width, &pgfi, pgfi_alloc, &cur_alloc_cacheline_ct, errstr_buf);
+    if (reterr) {
+      fputs(errstr_buf, stderr);
+      goto main_ret_1;
+    }
+    if (cachealigned_malloc(cur_alloc_cacheline_ct * kCacheline, &pgr_alloc)) {
+      goto main_ret_NOMEM;
+    }
+
+    // modify this when trying block-fread
+    reterr = pgr_init(use_mmap? nullptr : argv[1 + decompress], max_vrec_width, &pgfi, &pgr, pgr_alloc);
+    if (reterr) {
+      fprintf(stderr, "pgr_init error %u\n", (uint32_t)reterr);
+      goto main_ret_1;
+    }
+
+    if (((uint32_t)argc) == 4 + decompress) {
+      printf("%u variant%s detected.\n", variant_ct, (variant_ct == 1)? "" : "s");
+    } else {
+      printf("%u variant%s and %u sample%s detected.\n", variant_ct, (variant_ct == 1)? "" : "s", sample_ct, (sample_ct == 1)? "" : "s");
+    }
+    if (cachealigned_malloc(QUATERCT_TO_VECCT(sample_ct) * kBytesPerVec, &genovec)) {
+      goto main_ret_NOMEM;
+    }
+    if (decompress) {
+      outfile = fopen(argv[3], FOPEN_WB);
+      if (!outfile) {
+	goto main_ret_OPEN_FAIL;
+      }
+      const uintptr_t final_mask = (k1LU << ((sample_ct % kBitsPerWordD2) * 2)) - k1LU;
+      const uint32_t final_widx = QUATERCT_TO_WORDCT(sample_ct) - 1;
+      const uint32_t variant_byte_ct = (sample_ct + 3) / 4;
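+      // PLINK 1 .bed magic bytes: 0x6c 0x1b, then 0x01 for variant-major mode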
+      fwrite("l\x1b\x01", 3, 1, outfile);
+      for (uint32_t vidx = 0; vidx < variant_ct;) {
+	reterr = pgr_read_refalt1_genovec_subset_unsafe(nullptr, nullptr, sample_ct, vidx, &pgr, genovec);
+	if (reterr) {
+	  fprintf(stderr, "\nread error %u, vidx=%u\n", (uint32_t)reterr, vidx);
+	  goto main_ret_1;
+	}
+	pgr_plink2_to_plink1_inplace_unsafe(sample_ct, genovec);
+	if (final_mask) {
+	  genovec[final_widx] &= final_mask;
+	}
+	fwrite(genovec, variant_byte_ct, 1, outfile);
+	++vidx;
+	if (!(vidx % 100000)) {
+	  printf("\r%u.%um variants decompressed.", vidx / 1000000, (vidx / 100000) % 10);
+	  fflush(stdout);
+	}
+      }
+      if (fclose_null(&outfile)) {
+	goto main_ret_WRITE_FAIL;
+      }
+      printf("\n");
+      goto main_ret_1;
+    }
+#ifdef SUBSET_TEST
+    // write_sample_ct = sample_ct - 3;
+    write_sample_ct = 3;
+#else
+    write_sample_ct = sample_ct;
+#endif
+    uint32_t max_vrec_len;
+    reterr = spgw_init_phase1(argv[2], nullptr, nullptr, variant_ct, write_sample_ct, kfPgenGlobal0, 2, &spgw, &cur_alloc_cacheline_ct, &max_vrec_len);
+    if (reterr) {
+      fprintf(stderr, "compression phase 1 error %u\n", (uint32_t)reterr);
+      goto main_ret_1;
+    }
+    if (cachealigned_malloc(cur_alloc_cacheline_ct * kCacheline, &spgw_alloc)) {
+      goto main_ret_NOMEM;
+    }
+    spgw_init_phase2(max_vrec_len, &spgw, spgw_alloc);
+
+    const uint32_t max_simple_difflist_len = sample_ct / kBitsPerWordD2;
+    const uint32_t max_returned_difflist_len = max_simple_difflist_len + (sample_ct / kPglMaxDifflistLenDivisor);
+    const uint32_t max_difflist_len = 2 * (write_sample_ct / kPglMaxDifflistLenDivisor);
+    if (cachealigned_malloc(round_up_pow2((max_returned_difflist_len + 3) / 4, kCacheline), &raregeno) ||
+        cachealigned_malloc(round_up_pow2((sample_ct + 7) / 8, kCacheline), &sample_include) ||
+	cachealigned_malloc(round_up_pow2((1 + (sample_ct / kBitsPerWord)) * sizeof(int32_t), kCacheline), &sample_include_cumulative_popcounts) ||
+	cachealigned_malloc(round_up_pow2((max_returned_difflist_len + 1) * sizeof(int32_t), kCacheline), &difflist_sample_ids)) {
+      goto main_ret_NOMEM;
+    }
+    fill_all_bits(sample_ct, sample_include);
+#ifdef SUBSET_TEST
+    fill_ulong_zero(BITCT_TO_WORDCT(sample_ct), sample_include);
+    set_bit(123, sample_include);
+    set_bit(127, sample_include);
+    set_bit(320, sample_include);
+    // clear_bit(123, sample_include);
+    // clear_bit(127, sample_include);
+    // clear_bit(320, sample_include);
+    fill_cumulative_popcounts(sample_include, 1 + (sample_ct / kBitsPerWord), sample_include_cumulative_popcounts);
+#endif
+    for (uint32_t vidx = 0; vidx < variant_ct;) {
+      uint32_t difflist_common_geno;
+      uint32_t difflist_len;
+      reterr = pgr_read_refalt1_difflist_or_genovec_subset_unsafe(sample_include, sample_include_cumulative_popcounts, write_sample_ct, max_simple_difflist_len, vidx, &pgr, genovec, &difflist_common_geno, raregeno, difflist_sample_ids, &difflist_len);
+      if (reterr) {
+	fprintf(stderr, "\nread error %u, vidx=%u\n", (uint32_t)reterr, vidx);
+	goto main_ret_1;
+      }
+      if (difflist_common_geno == 0xffffffffU) {
+        zero_trailing_bits(write_sample_ct * 2, genovec);
+	reterr = spgw_append_biallelic_genovec(genovec, &spgw);
+      } else if (difflist_len <= max_difflist_len) {
+	zero_trailing_bits(2 * difflist_len, raregeno);
+	difflist_sample_ids[difflist_len] = write_sample_ct;
+	reterr = spgw_append_biallelic_difflist_limited(raregeno, difflist_sample_ids, difflist_common_geno, difflist_len, &spgw);
+      } else {
+	pgr_difflist_to_genovec_unsafe(raregeno, difflist_sample_ids, difflist_common_geno, write_sample_ct, difflist_len, genovec);
+        zero_trailing_bits(write_sample_ct * 2, genovec);
+	reterr = spgw_append_biallelic_genovec(genovec, &spgw);
+      }
+      if (reterr) {
+	fprintf(stderr, "\ncompress/write error %u, vidx=%u\n", (uint32_t)reterr, vidx);
+	goto main_ret_1;
+      }
+      ++vidx;
+      if (!(vidx % 100000)) {
+	printf("\r%u.%um variants compressed.", vidx / 1000000, (vidx / 100000) % 10);
+	fflush(stdout);
+      }
+    }
+  }
+  printf("\n");  
+
+  spgw_finish(&spgw);
+  while (0) {
+  main_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  main_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  main_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  main_ret_INVALID_CMDLINE:
+    reterr = kPglRetInvalidCmdline;
+    break;
+  }
+ main_ret_1:
+  pgr_cleanup(&pgr);
+#ifndef NO_MMAP
+  pgfi_cleanup(&pgfi);
+#endif
+  spgw_cleanup(&spgw);
+  if (pgfi_alloc) {
+    aligned_free(pgfi_alloc);
+  }
+  if (pgr_alloc) {
+    aligned_free(pgr_alloc);
+  }
+  if (spgw_alloc) {
+    aligned_free(spgw_alloc);
+  }
+  if (genovec) {
+    aligned_free(genovec);
+  }
+  if (raregeno) {
+    aligned_free(raregeno);
+  }
+  if (sample_include) {
+    aligned_free(sample_include);
+  }
+  if (sample_include_cumulative_popcounts) {
+    aligned_free(sample_include_cumulative_popcounts);
+  }
+  if (difflist_sample_ids) {
+    aligned_free(difflist_sample_ids);
+  }
+  if (outfile) {
+    fclose(outfile);
+  }
+  return (uint32_t)reterr;
+}
diff --git a/pgenlib_internal.cpp b/pgenlib_internal.cpp
new file mode 100644
index 0000000..4bf237e
--- /dev/null
+++ b/pgenlib_internal.cpp
@@ -0,0 +1,9604 @@
+// This library is part of PLINK 2, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation; either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "pgenlib_internal.h"
+
+#ifndef NO_MMAP
+  #include <sys/types.h> // fstat()
+  #include <sys/stat.h> // open(), fstat()
+  #include <sys/mman.h> // mmap()
+  #include <fcntl.h> // open()
+  #include <unistd.h> // fstat()
+#endif
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+uintptr_t g_failed_alloc_attempt_size = 0;
+
+#if (__GNUC__ <= 4) && (__GNUC_MINOR__ < 7) && !defined(__APPLE__)
+boolerr_t pgl_malloc(uintptr_t size, void* pp) {
+  *((unsigned char**)pp) = (unsigned char*)malloc(size);
+  if (*((unsigned char**)pp)) {
+    return 0;
+  }
+  g_failed_alloc_attempt_size = size;
+  return 1;
+}
+#endif
+
+interr_t fwrite_checked(const void* buf, uintptr_t len, FILE* outfile) {
+  while (len > kMaxBytesPerIO) {
+    // OS X can't perform 2GB+ writes
+    // typical disk block size is 4kb, so 0x7ffff000 is the largest sensible
+    // write size
+    fwrite(buf, kMaxBytesPerIO, 1, outfile);
+    buf = &(((const unsigned char*)buf)[kMaxBytesPerIO]);
+    len -= kMaxBytesPerIO;
+  }
+  fwrite(buf, len, 1, outfile);
+  return ferror(outfile);
+}
+
+interr_t fread_checked2(void* buf, uintptr_t len, FILE* infile, uintptr_t* bytes_read_ptr) {
+  uintptr_t bytes_read = 0;
+  while (len > kMaxBytesPerIO) {
+    const uintptr_t cur_bytes_read = fread(buf, 1, kMaxBytesPerIO, infile);
+    bytes_read += cur_bytes_read;
+    if (cur_bytes_read != kMaxBytesPerIO) {
+      *bytes_read_ptr = bytes_read;
+      return ferror(infile);
+    }
+    buf = &(((char*)buf)[kMaxBytesPerIO]);
+    len -= kMaxBytesPerIO;
+  }
+  bytes_read += fread(buf, 1, len, infile);
+  *bytes_read_ptr = bytes_read;
+  return ferror(infile);
+}
+
+#ifdef __LP64__
+static inline boolerr_t scan_uint_capped_finish(const char* ss, uint64_t cap, uint32_t* valp) {
+  uint64_t val = *valp;
+  while (1) {
+    // a little bit of unrolling seems to help
+    const uint64_t cur_digit = (uint64_t)((unsigned char)(*ss++)) - 48;
+    if (cur_digit >= 10) {
+      break;
+    }
+    // val = val * 10 + cur_digit;
+    const uint64_t cur_digit2 = (uint64_t)((unsigned char)(*ss++)) - 48;
+    if (cur_digit2 >= 10) {
+      val = val * 10 + cur_digit;
+      if (val > cap) {
+	return 1;
+      }
+      break;
+    }
+    val = val * 100 + cur_digit * 10 + cur_digit2;
+    if (val > cap) {
+      return 1;
+    }
+  }
+  *valp = (uint32_t)val;
+  return 0;
+}
+
+boolerr_t scan_posint_capped(const char* ss, uint64_t cap, uint32_t* valp) {
+  // '0' has ascii code 48
+  assert(((unsigned char)ss[0]) > 32);
+  *valp = (uint32_t)((unsigned char)(*ss++)) - 48;
+  if (*valp >= 10) {
+    // permit leading '+' (ascii 43), but not '++' or '+-'
+    if (*valp != 0xfffffffbU) {
+      return 1;
+    }
+    *valp = (uint32_t)((unsigned char)(*ss++)) - 48;
+    if (*valp >= 10) {
+      return 1;
+    }
+  }
+  while (!(*valp)) {
+    *valp = (uint32_t)((unsigned char)(*ss++)) - 48;
+    if ((*valp) >= 10) {
+      return 1;
+    }
+  }
+  return scan_uint_capped_finish(ss, cap, valp);
+}
+
+boolerr_t scan_uint_capped(const char* ss, uint64_t cap, uint32_t* valp) {
+  // Reads an integer in [0, cap].  Assumes first character is nonspace. 
+  assert(((unsigned char)ss[0]) > 32);
+  uint32_t val = (uint32_t)((unsigned char)(*ss++)) - 48;
+  if (val >= 10) {
+    if (val != 0xfffffffbU) {
+      // '-' has ascii code 45, so unsigned 45 - 48 = 0xfffffffdU
+      if ((val != 0xfffffffdU) || (*ss != '0')) {
+	return 1;
+      }
+      // accept "-0", "-00", etc.
+      while (*(++ss) == '0');
+      *valp = 0;
+      return ((uint32_t)((unsigned char)(*ss)) - 48) < 10;      
+    }
+    // accept leading '+'
+    val = (uint32_t)((unsigned char)(*ss++)) - 48;
+    if (val >= 10) {
+      return 1;
+    }
+  }
+  *valp = val;
+  return scan_uint_capped_finish(ss, cap, valp);
+}
+
+boolerr_t scan_int_abs_bounded(const char* ss, uint64_t bound, int32_t* valp) {
+  // Reads an integer in [-bound, bound].  Assumes first character is nonspace.
+  assert(((unsigned char)ss[0]) > 32);
+  *valp = (uint32_t)((unsigned char)(*ss++)) - 48;
+  int32_t sign = 1;
+  if (((uint32_t)*valp) >= 10) {
+    if (*valp == -3) {
+      sign = -1;
+    } else if (*valp != -5) {
+      return 1;
+    }
+    *valp = (uint32_t)((unsigned char)(*ss++)) - 48;
+    if (((uint32_t)*valp) >= 10) {
+      return 1;
+    }
+  }
+  if (scan_uint_capped_finish(ss, bound, (uint32_t*)valp)) {
+    return 1;
+  }
+  *valp *= sign;
+  return 0;
+}
+#else // not __LP64__
+boolerr_t scan_posint_capped32(const char* ss, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
+  // '0' has ascii code 48
+  assert(((unsigned char)ss[0]) > 32);
+  uint32_t val = (uint32_t)((unsigned char)(*ss++)) - 48;
+  if (val >= 10) {
+    if (val != 0xfffffffbU) {
+      return 1;
+    }
+    val = (uint32_t)((unsigned char)(*ss++)) - 48;
+    if (val >= 10) {
+      return 1;
+    }
+  }
+  while (!val) {
+    val = (uint32_t)((unsigned char)(*ss++)) - 48;
+    if (val >= 10) {
+      return 1;
+    }
+  }
+  while (1) {
+    const uint32_t cur_digit = (uint32_t)((unsigned char)(*ss++)) - 48;
+    if (cur_digit >= 10) {
+      *valp = val;
+      return 0;
+    }
+    // avoid integer overflow in middle of computation
+    if ((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10))) {
+      return 1;
+    }
+    val = val * 10 + cur_digit;
+  }
+}
+
+boolerr_t scan_uint_capped32(const char* ss, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
+  // Reads an integer in [0, cap].  Assumes first character is nonspace. 
+  assert(((unsigned char)ss[0]) > 32);
+  uint32_t val = (uint32_t)((unsigned char)(*ss++)) - 48;
+  if (val >= 10) {
+    if (val != 0xfffffffbU) {
+      if ((val != 0xfffffffdU) || (*ss != '0')) {
+	return 1;
+      }
+      while (*(++ss) == '0');
+      *valp = 0;
+      return ((uint32_t)((unsigned char)(*ss)) - 48) < 10;
+    }
+    val = (uint32_t)((unsigned char)(*ss++)) - 48;
+    if (val >= 10) {
+      return 1;
+    }
+  }
+  while (1) {
+    const uint32_t cur_digit = (uint32_t)((unsigned char)(*ss++)) - 48;
+    if (cur_digit >= 10) {
+      *valp = val;
+      return 0;
+    }
+    if ((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10))) {
+      return 1;
+    }
+    val = val * 10 + cur_digit;
+  }
+}
+
+boolerr_t scan_int_abs_bounded32(const char* ss, uint32_t bound_div_10, uint32_t bound_mod_10, int32_t* valp) {
+  // Reads an integer in [-bound, bound].  Assumes first character is nonspace.
+  assert(((unsigned char)ss[0]) > 32);
+  uint32_t val = (uint32_t)((unsigned char)(*ss++)) - 48;
+  int32_t sign = 1;
+  if (val >= 10) {
+    if (val == 0xfffffffdU) {
+      sign = -1;
+    } else if (val != 0xfffffffbU) {
+      return 1;
+    }
+    val = (uint32_t)((unsigned char)(*ss++)) - 48;
+    if (val >= 10) {
+      return 1;
+    }
+  }
+  while (1) {
+    const uint32_t cur_digit = (uint32_t)((unsigned char)(*ss++)) - 48;
+    if (cur_digit >= 10) {
+      *valp = sign * ((int32_t)val);
+      return 0;
+    }
+    if ((val >= bound_div_10) && ((val > bound_div_10) || (cur_digit > bound_mod_10))) {
+      return 1;
+    }
+    val = val * 10 + cur_digit;
+  }
+}
+#endif
+
+boolerr_t aligned_malloc(uintptr_t size, uintptr_t alignment, void* aligned_pp) {
+  // Assumes malloc returns word-aligned addresses.
+  assert(alignment);
+  assert(!(alignment % kBytesPerWord));
+  uintptr_t malloc_addr;
+  if (pgl_malloc(size + alignment, &malloc_addr)) {
+    return 1;
+  }
+  assert(!(malloc_addr % kBytesPerWord));
+  uintptr_t** casted_aligned_pp = (uintptr_t**)aligned_pp;
+  *casted_aligned_pp = (uintptr_t*)round_down_pow2(malloc_addr + alignment, alignment);
+  (*casted_aligned_pp)[-1] = malloc_addr;
+  return 0;
+}
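+// Note: the word stashed at index [-1] above is the original malloc()
+// address; the matching aligned_free() (defined elsewhere in this commit)
+// presumably recovers and frees it along the lines of
+//   free((void*)(((uintptr_t*)aligned_ptr)[-1]));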
+
+void fill_all_bits(uintptr_t ct, uintptr_t* bitarr) {
+  // leaves bits beyond the end unset
+  // ok for ct == 0
+  uintptr_t quotient = ct / kBitsPerWord;
+  uintptr_t remainder = ct % kBitsPerWord;
+  fill_ulong_one(quotient, bitarr);
+  if (remainder) {
+    bitarr[quotient] = (k1LU << remainder) - k1LU;
+  }
+}
+
+void bitvec_and(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
+  // main_bitvec := main_bitvec AND arg_bitvec
+#ifdef __LP64__
+  vul_t* main_bitvvec_iter = (vul_t*)main_bitvec;
+  const vul_t* arg_bitvvec_iter = (const vul_t*)arg_bitvec;
+  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
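+  // peel off full_vec_ct % 4 vectors up front so the main loop can proceed
+  // four vectors at a time with no tail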
+  if (full_vec_ct & 1) {
+    *main_bitvvec_iter++ &= *arg_bitvvec_iter++;
+  }
+  if (full_vec_ct & 2) {
+    *main_bitvvec_iter++ &= *arg_bitvvec_iter++;
+    *main_bitvvec_iter++ &= *arg_bitvvec_iter++;
+  }
+  for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
+    *main_bitvvec_iter++ &= *arg_bitvvec_iter++;
+    *main_bitvvec_iter++ &= *arg_bitvvec_iter++;
+    *main_bitvvec_iter++ &= *arg_bitvvec_iter++;
+    *main_bitvvec_iter++ &= *arg_bitvvec_iter++;
+  }
+  #ifdef USE_AVX2
+  if (word_ct & 2) {
+    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
+    main_bitvec[base_idx] &= arg_bitvec[base_idx];
+    main_bitvec[base_idx + 1] &= arg_bitvec[base_idx + 1];
+  }
+  #endif
+  if (word_ct & 1) {
+    main_bitvec[word_ct - 1] &= arg_bitvec[word_ct - 1];
+  }
+#else
+  for (uintptr_t widx = 0; widx < word_ct; ++widx) {
+    main_bitvec[widx] &= arg_bitvec[widx];
+  }
+#endif
+}
+
+void bitvec_andnot(const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
+  // main_bitvec := main_bitvec ANDNOT exclude_bitvec
+  // note that this is the reverse of the _mm_andnot() operand order
+#ifdef __LP64__
+  vul_t* main_bitvvec_iter = (vul_t*)main_bitvec;
+  const vul_t* exclude_bitvvec_iter = (const vul_t*)exclude_bitvec;
+  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
+  if (full_vec_ct & 1) {
+    *main_bitvvec_iter++ &= ~(*exclude_bitvvec_iter++);
+  }
+  if (full_vec_ct & 2) {
+    *main_bitvvec_iter++ &= ~(*exclude_bitvvec_iter++);
+    *main_bitvvec_iter++ &= ~(*exclude_bitvvec_iter++);
+  }
+  for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
+    *main_bitvvec_iter++ &= ~(*exclude_bitvvec_iter++);
+    *main_bitvvec_iter++ &= ~(*exclude_bitvvec_iter++);
+    *main_bitvvec_iter++ &= ~(*exclude_bitvvec_iter++);
+    *main_bitvvec_iter++ &= ~(*exclude_bitvvec_iter++);
+  }
+  #ifdef USE_AVX2
+  if (word_ct & 2) {
+    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
+    main_bitvec[base_idx] &= ~exclude_bitvec[base_idx];
+    main_bitvec[base_idx + 1] &= ~exclude_bitvec[base_idx + 1];
+  }
+  #endif
+  if (word_ct & 1) {
+    main_bitvec[word_ct - 1] &= ~exclude_bitvec[word_ct - 1];
+  }
+#else
+  for (uintptr_t widx = 0; widx < word_ct; ++widx) {
+    main_bitvec[widx] &= ~exclude_bitvec[widx];
+  }
+#endif
+}
+
+uint32_t next_set_unsafe(const uintptr_t* bitarr, uint32_t loc) {
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / kBitsPerWord]);
+  uintptr_t ulii = (*bitarr_ptr) >> (loc % kBitsPerWord);
+  if (ulii) {
+    return loc + CTZLU(ulii);
+  }
+  do {
+    ulii = *(++bitarr_ptr);
+  } while (!ulii);
+  return (uint32_t)(((uintptr_t)(bitarr_ptr - bitarr)) * kBitsPerWord + CTZLU(ulii));
+}
+
+uint32_t next_unset_unsafe(const uintptr_t* bitarr, uint32_t loc) {
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / kBitsPerWord]);
+  uintptr_t ulii = (~(*bitarr_ptr)) >> (loc % kBitsPerWord);
+  if (ulii) {
+    return loc + CTZLU(ulii);
+  }
+  do {
+    ulii = *(++bitarr_ptr);
+  } while (ulii == ~k0LU);
+  return (uint32_t)(((uintptr_t)(bitarr_ptr - bitarr)) * kBitsPerWord + CTZLU(~ulii));
+}
+
+/*
+uint32_t next_nonmissing_unsafe(const uintptr_t* genoarr, uint32_t loc) {
+  const uintptr_t* genoarr_ptr = &(genoarr[loc / kBitsPerWordD2]);
+  uintptr_t ulii = (~(*genoarr_ptr)) >> (2 * (loc % kBitsPerWordD2));
+  if (ulii) {
+    return loc + (CTZLU(ulii) / 2);
+  }
+  do {
+    ulii = *(++genoarr_ptr);
+  } while (ulii == ~k0LU);
+  return ((uintptr_t)(genoarr_ptr - genoarr)) * kBitsPerWordD2 + (CTZLU(~ulii) / 2);
+}
+*/
+
+uint32_t next_set(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil) {
+  // safe version.
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / kBitsPerWord]);
+  uintptr_t ulii = (*bitarr_ptr) >> (loc % kBitsPerWord);
+  uint32_t rval;
+  if (ulii) {
+    rval = loc + CTZLU(ulii);
+    return MINV(rval, ceil);
+  }
+  const uintptr_t* bitarr_last = &(bitarr[(ceil - 1) / kBitsPerWord]);
+  do {
+    if (bitarr_ptr >= bitarr_last) {
+      return ceil;
+    }
+    ulii = *(++bitarr_ptr);
+  } while (!ulii);
+  rval = (uint32_t)(((uintptr_t)(bitarr_ptr - bitarr)) * kBitsPerWord + CTZLU(ulii));
+  return MINV(rval, ceil);
+}
+
+uint32_t prev_set_unsafe(const uintptr_t* bitarr, uint32_t loc) {
+  // unlike the next_{un}set family, this always returns a STRICTLY earlier
+  // position
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / kBitsPerWord]);
+  uint32_t remainder = loc % kBitsPerWord;
+  uintptr_t ulii;
+  if (remainder) {
+    ulii = (*bitarr_ptr) & ((k1LU << remainder) - k1LU);
+    if (ulii) {
+      return (loc | (kBitsPerWord - 1)) - CLZLU(ulii);
+    }
+  }
+  do {
+    ulii = *(--bitarr_ptr);
+  } while (!ulii);
+  return (uint32_t)(((uintptr_t)(bitarr_ptr - bitarr)) * kBitsPerWord + kBitsPerWord - 1 - CLZLU(ulii));
+}
+
+void copy_bitarr_subset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict subset_mask, uint32_t subset_size, uintptr_t* __restrict output_bitarr) {
+  // could try exploiting _pext_u64() intrinsic, but probably not worthwhile
+  // until 2020ish
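+  // Semantics: for each set bit of subset_mask, the corresponding bit of
+  // raw_bitarr is appended to output_bitarr, packed contiguously (i.e. a
+  // software pext).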
+  const uint32_t subset_size_lowbits = subset_size % kBitsPerWord;
+  uintptr_t* output_bitarr_iter = output_bitarr;
+  uintptr_t* output_bitarr_last = &(output_bitarr[subset_size / kBitsPerWord]);
+  uintptr_t cur_output_word = 0;
+  uint32_t read_widx = 0xffffffffU; // deliberate overflow
+  uint32_t write_idx_lowbits = 0;
+  while ((output_bitarr_iter != output_bitarr_last) || (write_idx_lowbits != subset_size_lowbits)) {
+    uintptr_t cur_mask_word;
+    // sparse subset_mask optimization
+    // guaranteed to terminate since there's at least one more set bit
+    do {
+      cur_mask_word = subset_mask[++read_widx];
+    } while (!cur_mask_word);
+    uintptr_t cur_masked_input_word = raw_bitarr[read_widx] & cur_mask_word;
+    const uint32_t cur_mask_popcount = popcount_long(cur_mask_word);
+    uintptr_t subsetted_input_word = 0;
+    if (cur_masked_input_word) {
+      const uintptr_t cur_inv_mask = ~cur_mask_word;
+      do {
+	const uint32_t read_uidx_nz_start_lowbits = CTZLU(cur_masked_input_word);
+	const uintptr_t cur_inv_mask_shifted = cur_inv_mask >> read_uidx_nz_start_lowbits;
+	if (!cur_inv_mask_shifted) {
+	  subsetted_input_word |= cur_masked_input_word >> (kBitsPerWord - cur_mask_popcount);
+	  break;
+	}
+	const uint32_t cur_read_end = CTZLU(cur_inv_mask_shifted) + read_uidx_nz_start_lowbits;
+	// this seems to optimize better than (k1LU << cur_read_end) - k1LU
+	// todo: check if/when that's true elsewhere
+        const uintptr_t lowmask = (~k0LU) >> (kBitsPerWord - cur_read_end);
+	const uintptr_t bits_to_copy = cur_masked_input_word & lowmask;
+	cur_masked_input_word -= bits_to_copy;
+	// todo: check if a less-popcounty implementation should be used in
+	// non-SSE4.2 case
+	const uint32_t cur_write_end = popcount_long(cur_mask_word & lowmask);
+	subsetted_input_word |= bits_to_copy >> (cur_read_end - cur_write_end);
+      } while (cur_masked_input_word);
+    }
+    cur_output_word |= subsetted_input_word << write_idx_lowbits;
+    const uint32_t new_write_idx_lowbits = write_idx_lowbits + cur_mask_popcount;
+    if (new_write_idx_lowbits >= kBitsPerWord) {
+      *output_bitarr_iter++ = cur_output_word;
+      // ...and these are the bits that fell off
+      // bugfix: unsafe to right-shift 64
+      if (write_idx_lowbits) {
+	cur_output_word = subsetted_input_word >> (kBitsPerWord - write_idx_lowbits);
+      } else {
+	cur_output_word = 0;
+      }
+    }
+    write_idx_lowbits = new_write_idx_lowbits % kBitsPerWord;
+  }
+  if (write_idx_lowbits) {
+    *output_bitarr_iter = cur_output_word;
+  }
+}
+
+void copy_quaterarr_nonempty_subset(const uintptr_t* __restrict raw_quaterarr, const uintptr_t* __restrict subset_mask, uint32_t raw_quaterarr_entry_ct, uint32_t subset_entry_ct, uintptr_t* __restrict output_quaterarr) {
+  // in plink 2.0, we probably want (0-based) bit raw_quaterarr_entry_ct of
+  // subset_mask to always be allocated and unset.  This would remove a few
+  // special cases re: iterating past the end of arrays.
+  assert(subset_entry_ct);
+  assert(raw_quaterarr_entry_ct >= subset_entry_ct);
+  uintptr_t cur_output_word = 0;
+  
+  uintptr_t* output_quaterarr_iter = output_quaterarr;
+
+  uintptr_t* output_quaterarr_last = &(output_quaterarr[subset_entry_ct / kBitsPerWordD2]);
+  const uint32_t word_write_halfshift_end = subset_entry_ct % kBitsPerWordD2;
+  uint32_t word_write_halfshift = 0;
+  // if <= 2/3-filled, use sparse copy algorithm
+  // (tried copy_bitarr_subset() approach, that actually worsened things)
+  if (subset_entry_ct * (3 * k1LU) <= raw_quaterarr_entry_ct * (2 * k1LU)) {
+    uint32_t subset_mask_widx = 0;
+    while (1) {
+      const uintptr_t cur_include_word = subset_mask[subset_mask_widx];
+      if (cur_include_word) {
+	uint32_t wordhalf_idx = 0;
+	uint32_t cur_include_halfword = (halfword_t)cur_include_word;
+	while (1) {
+	  if (cur_include_halfword) {
+	    uintptr_t raw_quaterarr_word = raw_quaterarr[subset_mask_widx * 2 + wordhalf_idx];
+	    do {
+	      uint32_t rqa_idx_lowbits = __builtin_ctz(cur_include_halfword);
+	      cur_output_word |= ((raw_quaterarr_word >> (rqa_idx_lowbits * 2)) & 3) << (word_write_halfshift * 2);
+	      if (++word_write_halfshift == kBitsPerWordD2) {
+		*output_quaterarr_iter++ = cur_output_word;
+		word_write_halfshift = 0;
+		cur_output_word = 0;
+	      }
+	      cur_include_halfword &= cur_include_halfword - 1;
+	    } while (cur_include_halfword);
+	  }
+	  if (wordhalf_idx) {
+	    break;
+	  }
+	  ++wordhalf_idx;
+	  cur_include_halfword = cur_include_word >> kBitsPerWordD2;
+	}
+	if (output_quaterarr_iter == output_quaterarr_last) {
+	  if (word_write_halfshift == word_write_halfshift_end) {
+            if (word_write_halfshift_end) {
+	      *output_quaterarr_last = cur_output_word;
+	    }
+	    return;
+	  }
+	}
+      }
+      ++subset_mask_widx;
+    }
+  }
+  // blocked copy
+  const uintptr_t* raw_quaterarr_iter = raw_quaterarr;
+  while (1) {
+    const uintptr_t cur_include_word = *subset_mask++;
+    uint32_t wordhalf_idx = 0;
+    uintptr_t cur_include_halfword = (halfword_t)cur_include_word;
+    while (1) {
+      uintptr_t raw_quaterarr_word = *raw_quaterarr_iter++;
+      while (cur_include_halfword) {
+	uint32_t rqa_idx_lowbits = CTZLU(cur_include_halfword);
+	uintptr_t halfword_invshifted = (~cur_include_halfword) >> rqa_idx_lowbits;
+	uintptr_t raw_quaterarr_curblock_unmasked = raw_quaterarr_word >> (rqa_idx_lowbits * 2);
+	uint32_t rqa_block_len = CTZLU(halfword_invshifted);
+	uint32_t block_len_limit = kBitsPerWordD2 - word_write_halfshift;
+	cur_output_word |= raw_quaterarr_curblock_unmasked << (2 * word_write_halfshift);
+	if (rqa_block_len < block_len_limit) {
+	  word_write_halfshift += rqa_block_len;
+	  cur_output_word &= (k1LU << (word_write_halfshift * 2)) - k1LU;
+	} else {
+	  // no need to mask, extra bits vanish off the high end
+	  *output_quaterarr_iter++ = cur_output_word;
+	  word_write_halfshift = rqa_block_len - block_len_limit;
+	  if (word_write_halfshift) {
+	    cur_output_word = (raw_quaterarr_curblock_unmasked >> (2 * block_len_limit)) & ((k1LU << (2 * word_write_halfshift)) - k1LU);
+	  } else {
+	    // avoid potential right-shift-[word length]
+	    cur_output_word = 0;
+	  }
+	}
+	cur_include_halfword &= (~(k1LU << (rqa_block_len + rqa_idx_lowbits))) + k1LU;
+      }
+      if (wordhalf_idx) {
+	break;
+      }
+      ++wordhalf_idx;
+      cur_include_halfword = cur_include_word >> kBitsPerWordD2;
+    }
+    if (output_quaterarr_iter == output_quaterarr_last) {
+      if (word_write_halfshift == word_write_halfshift_end) {
+	if (word_write_halfshift_end) {
+	  *output_quaterarr_last = cur_output_word;
+	}
+	return;
+      }
+    }
+  }
+}
+
+// Basic SSE2 implementation of Lauradoux/Walisch popcount.
+uintptr_t popcount_vecs(const vul_t* bit_vvec, uintptr_t vec_ct) {
+  // popcounts bit_vvec[0..(vec_ct-1)].  Assumes vec_ct is a multiple of 3 (0 ok).
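+  // Works in blocks of at most 30 vectors (10 inner iterations of 3): each
+  // iteration adds at most 24 to each 8-bit lane of acc, so ten iterations
+  // stay <= 240 and the lanes cannot overflow before being flushed into tot.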
+  assert(!(vec_ct % 3));
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t m2 = VCONST_UL(kMask3333);
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  const vul_t m8 = VCONST_UL(kMask00FF);
+  const vul_t* bit_vvec_iter = bit_vvec;
+  uintptr_t tot = 0;
+  while (1) {
+    univec_t acc;
+    acc.vi = vul_setzero();
+    const vul_t* bit_vvec_stop;
+    if (vec_ct < 30) {
+      if (!vec_ct) {
+	return tot;
+      }
+      bit_vvec_stop = &(bit_vvec_iter[vec_ct]);
+      vec_ct = 0;
+    } else {
+      bit_vvec_stop = &(bit_vvec_iter[30]);
+      vec_ct -= 30;
+    }
+    do {
+      vul_t count1 = *bit_vvec_iter++;
+      vul_t count2 = *bit_vvec_iter++;
+      vul_t half1 = *bit_vvec_iter++;
+      vul_t half2 = vul_rshift(half1, 1) & m1;
+      half1 = half1 & m1;
+      // Two bits can represent values from 0-3, so make each pair in count1
+      // count2 store a partial bitcount covering themselves AND another bit
+      // from elsewhere.
+      count1 = count1 - (vul_rshift(count1, 1) & m1);
+      count2 = count2 - (vul_rshift(count2, 1) & m1);
+      count1 = count1 + half1;
+      count2 = count2 + half2;
+      // Four bits represent 0-15, so we can safely add four 0-3 partial
+      // bitcounts together.
+      count1 = (count1 & m2) + (vul_rshift(count1, 2) & m2);
+      count1 = count1 + (count2 & m2) + (vul_rshift(count2, 2) & m2);
+      // Accumulator stores sixteen 0-255 counts in parallel.
+      // (32 in AVX2 case, 4 in 32-bit case)
+      acc.vi = acc.vi + (count1 & m4) + (vul_rshift(count1, 4) & m4);
+    } while (bit_vvec_iter < bit_vvec_stop);
+    acc.vi = (acc.vi & m8) + (vul_rshift(acc.vi, 8) & m8);
+    tot += univec_hsum_16bit(acc);
+  }
+}
+
+void count_2freq_3xvec(const vul_t* geno_vvec, uint32_t vec_ct, uint32_t* __restrict alt1_plus_bothset_ctp, uint32_t* __restrict bothset_ctp) {
+  assert(!(vec_ct % 3));
+  // Increments bothset_ct by the number of 0b11s in the current block, and
+  // alt1_plus_bothset_ct by twice the number of 0b10s plus the number of
+  // 0b01s and 0b11s.
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t m2 = VCONST_UL(kMask3333);
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  const vul_t* geno_vvec_iter = geno_vvec;
+  uint32_t alt1_plus_bothset_ct = 0;
+  uint32_t bothset_ct = 0;
+
+  while (1) {
+    univec_t acc_alt1_plus_bothset;
+    univec_t acc_bothset;
+    acc_alt1_plus_bothset.vi = vul_setzero();
+    acc_bothset.vi = vul_setzero();
+    const vul_t* geno_vvec_stop;
+    if (vec_ct < 30) {
+      if (!vec_ct) {
+	*alt1_plus_bothset_ctp = alt1_plus_bothset_ct;
+	*bothset_ctp = bothset_ct;
+	return;	
+      }
+      geno_vvec_stop = &(geno_vvec_iter[vec_ct]);
+      vec_ct = 0;
+    } else {
+      geno_vvec_stop = &(geno_vvec_iter[30]);
+      vec_ct -= 30;
+    }
+    do {
+      vul_t cur_geno_vword1 = *geno_vvec_iter++;
+      // process first two vwords simultaneously to minimize linear dependence
+      vul_t cur_geno_vword2 = *geno_vvec_iter++;
+      vul_t cur_geno_vword_low_lshifted1 = vul_lshift(cur_geno_vword1 & m1, 1);
+      vul_t cur_geno_vword_low_lshifted2 = vul_lshift(cur_geno_vword2 & m1, 1);
+      
+      // 00 -> 00; 01 -> 01; 10 -> 10; 11 -> 01
+      // note that _mm_andnot_si128 flips the *first* argument before the AND
+      // operation.
+      vul_t alt1_plus_bothset1 = (~cur_geno_vword_low_lshifted1) & cur_geno_vword1;
+      vul_t alt1_plus_bothset2 = (~cur_geno_vword_low_lshifted2) & cur_geno_vword2;
+
+      vul_t bothset1 = vul_rshift(cur_geno_vword_low_lshifted1 & cur_geno_vword1, 1);
+      vul_t bothset2 = vul_rshift(cur_geno_vword_low_lshifted2 & cur_geno_vword2, 1);
+      
+      cur_geno_vword1 = *geno_vvec_iter++;
+      alt1_plus_bothset1 = (alt1_plus_bothset1 & m2) + (vul_rshift(alt1_plus_bothset1, 2) & m2);
+      bothset2 = bothset1 + bothset2;
+      alt1_plus_bothset2 = (alt1_plus_bothset2 & m2) + (vul_rshift(alt1_plus_bothset2, 2) & m2);
+      cur_geno_vword_low_lshifted1 = vul_lshift(cur_geno_vword1 & m1, 1);
+      
+      alt1_plus_bothset2 = alt1_plus_bothset1 + alt1_plus_bothset2;
+      // alt1_plus_bothset2 now contains 4-bit values from 0-8, while bothset2
+      // contains 2-bit values from 0-2
+      // (todo: check whether this is faster if we use double_bothsetx
+      // variables instead of bothset1/bothset2)
+      bothset1 = vul_rshift(cur_geno_vword_low_lshifted1 & cur_geno_vword1, 1);
+      alt1_plus_bothset1 = (~cur_geno_vword_low_lshifted1) & cur_geno_vword1;
+      bothset2 = bothset1 + bothset2;
+      alt1_plus_bothset1 = (alt1_plus_bothset1 & m2) + (vul_rshift(alt1_plus_bothset1, 2) & m2);
+
+      bothset2 = (bothset2 & m2) + (vul_rshift(bothset2, 2) & m2);
+      alt1_plus_bothset2 = alt1_plus_bothset1 + alt1_plus_bothset2;
+      // alt1_plus_bothset2 now contains 4-bit values from 0-12, while bothset2
+      // contains 4-bit values from 0-6.  aggregate both into 8-bit values.
+      bothset2 = (bothset2 & m4) + (vul_rshift(bothset2, 4) & m4);
+      alt1_plus_bothset2 = (alt1_plus_bothset2 & m4) + (vul_rshift(alt1_plus_bothset2, 4) & m4);
+
+      acc_bothset.vi = acc_bothset.vi + bothset2;
+      acc_alt1_plus_bothset.vi = acc_alt1_plus_bothset.vi + alt1_plus_bothset2;
+    } while (geno_vvec_iter < geno_vvec_stop);
+    const vul_t m8 = VCONST_UL(kMask00FF);
+    acc_bothset.vi = (acc_bothset.vi + vul_rshift(acc_bothset.vi, 8)) & m8;
+    acc_alt1_plus_bothset.vi = (acc_alt1_plus_bothset.vi & m8) + (vul_rshift(acc_alt1_plus_bothset.vi, 8) & m8);
+    bothset_ct += univec_hsum_16bit(acc_bothset);
+    alt1_plus_bothset_ct += univec_hsum_16bit(acc_alt1_plus_bothset);
+  }
+}
+
+void count_3freq_6xvec(const vul_t* geno_vvec, uint32_t vec_ct, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict bothset_ctp) {
+  assert(!(vec_ct % 6));
+  // Sets even_ct to the number of set low bits in the current block, odd_ct
+  // to the number of set high bits, and bothset_ct to the number of 0b11s.
+  // Easy to adapt this to take a subset quatervec parameter.
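+  // As in popcount_vecs(), blocks of 60 vectors (10 iterations of 6) keep
+  // the 8-bit accumulator lanes at or below 240, preventing overflow.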
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t m2 = VCONST_UL(kMask3333);
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  const vul_t* geno_vvec_iter = geno_vvec;
+  uint32_t even_ct = 0;
+  uint32_t odd_ct = 0;
+  uint32_t bothset_ct = 0;
+  while (1) {
+    univec_t acc_even;
+    univec_t acc_odd;
+    univec_t acc_bothset;
+    acc_even.vi = vul_setzero();
+    acc_odd.vi = vul_setzero();
+    acc_bothset.vi = vul_setzero();
+    const vul_t* geno_vvec_stop;
+    if (vec_ct < 60) {
+      if (!vec_ct) {
+	*even_ctp = even_ct;
+	*odd_ctp = odd_ct;
+	*bothset_ctp = bothset_ct;
+	return;
+      }
+      geno_vvec_stop = &(geno_vvec_iter[vec_ct]);
+      vec_ct = 0;
+    } else {
+      geno_vvec_stop = &(geno_vvec_iter[60]);
+      vec_ct -= 60;
+    }
+    do {
+      // hmm, this seems to have more linear dependence than I'd want, but the
+      // reorderings I tried just made the code harder to read without helping,
+      // so I'll leave this alone
+      vul_t cur_geno_vword = *geno_vvec_iter++;
+      vul_t odd1 = m1 & vul_rshift(cur_geno_vword, 1);
+      vul_t even1 = m1 & cur_geno_vword;
+      vul_t bothset1 = odd1 & cur_geno_vword;
+      
+      cur_geno_vword = *geno_vvec_iter++;
+      vul_t cur_geno_vword_high = m1 & vul_rshift(cur_geno_vword, 1);
+      even1 = even1 + (m1 & cur_geno_vword);
+      odd1 = odd1 + cur_geno_vword_high;
+      bothset1 = bothset1 + (cur_geno_vword_high & cur_geno_vword);
+      
+      cur_geno_vword = *geno_vvec_iter++;
+      cur_geno_vword_high = m1 & vul_rshift(cur_geno_vword, 1);
+      even1 = even1 + (m1 & cur_geno_vword);
+      odd1 = odd1 + cur_geno_vword_high;
+      bothset1 = bothset1 + (cur_geno_vword_high & cur_geno_vword);
+
+      even1 = (even1 & m2) + (vul_rshift(even1, 2) & m2);
+      odd1 = (odd1 & m2) + (vul_rshift(odd1, 2) & m2);
+      bothset1 = (bothset1 & m2) + (vul_rshift(bothset1, 2) & m2);
+
+      cur_geno_vword = *geno_vvec_iter++;
+      vul_t odd2 = m1 & vul_rshift(cur_geno_vword, 1);
+      vul_t even2 = m1 & cur_geno_vword;
+      vul_t bothset2 = odd2 & cur_geno_vword;
+      
+      cur_geno_vword = *geno_vvec_iter++;
+      cur_geno_vword_high = m1 & vul_rshift(cur_geno_vword, 1);
+      even2 = even2 + (m1 & cur_geno_vword);
+      odd2 = odd2 + cur_geno_vword_high;
+      bothset2 = bothset2 + (cur_geno_vword_high & cur_geno_vword);
+      
+      cur_geno_vword = *geno_vvec_iter++;
+      cur_geno_vword_high = m1 & vul_rshift(cur_geno_vword, 1);
+      even2 = even2 + (m1 & cur_geno_vword);
+      odd2 = odd2 + cur_geno_vword_high;
+      bothset2 = bothset2 + (cur_geno_vword_high & cur_geno_vword);
+
+      even1 = even1 + (even2 & m2) + (vul_rshift(even2, 2) & m2);
+      odd1 = odd1 + (odd2 & m2) + (vul_rshift(odd2, 2) & m2);
+      bothset1 = bothset1 + (bothset2 & m2) + (vul_rshift(bothset2, 2) & m2);
+      // these now contain 4-bit values from 0-12
+
+      acc_even.vi = acc_even.vi + (even1 & m4) + (vul_rshift(even1, 4) & m4);
+      acc_odd.vi = acc_odd.vi + (odd1 & m4) + (vul_rshift(odd1, 4) & m4);
+      acc_bothset.vi = acc_bothset.vi + (bothset1 & m4) + (vul_rshift(bothset1, 4) & m4);
+    } while (geno_vvec_iter < geno_vvec_stop);
+    const vul_t m8 = VCONST_UL(kMask00FF);
+    acc_even.vi = (acc_even.vi & m8) + (vul_rshift(acc_even.vi, 8) & m8);
+    acc_odd.vi = (acc_odd.vi & m8) + (vul_rshift(acc_odd.vi, 8) & m8);
+    acc_bothset.vi = (acc_bothset.vi & m8) + (vul_rshift(acc_bothset.vi, 8) & m8);
+    even_ct += univec_hsum_16bit(acc_even);
+    odd_ct += univec_hsum_16bit(acc_odd);
+    bothset_ct += univec_hsum_16bit(acc_bothset);
+  }
+}
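+
+// Overflow note for the 6-vector loop above: each do-iteration consumes 6
+// vectors and adds at most 24 to every 8-bit accumulator lane (two nibbles,
+// each at most 12), so the 60-vector outer block caps each lane at 240 < 256
+// before the kMask00FF fold widens the accumulators to 16-bit lanes.
+// Illustrative scalar equivalent of the per-word decomposition (the same
+// computation the trailing-word loop in genovec_count_freqs_unsafe() below
+// performs):
+//   const uintptr_t geno_word_high = kMask5555 & (geno_word >> 1);
+//   even_ct += popcount01_long(geno_word & kMask5555);  // low bit of each code
+//   odd_ct += popcount01_long(geno_word_high);          // high bit of each code
+//   bothset_ct += popcount01_long(geno_word & geno_word_high);  // 0b11s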
+
+void count_subset_3freq_6xvec(const vul_t* __restrict geno_vvec, const vul_t* __restrict interleaved_mask_vvec, uint32_t vec_ct, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict bothset_ctp) {
+  assert(!(vec_ct % 6));
+  // Sets even_ct to the number of set low bits among the masked-in entries
+  // of the current block, odd_ct to the number of set high bits, and
+  // bothset_ct to the number of 0b11s.
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t m2 = VCONST_UL(kMask3333);
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  const vul_t* geno_vvec_iter = geno_vvec;
+  const vul_t* interleaved_mask_vvec_iter = interleaved_mask_vvec;
+  uint32_t even_ct = 0;
+  uint32_t odd_ct = 0;
+  uint32_t bothset_ct = 0;
+  while (1) {
+    univec_t acc_even;
+    univec_t acc_odd;
+    univec_t acc_bothset;
+    acc_even.vi = vul_setzero();
+    acc_odd.vi = vul_setzero();
+    acc_bothset.vi = vul_setzero();
+    const vul_t* geno_vvec_stop;
+    if (vec_ct < 60) {
+      if (!vec_ct) {
+	*even_ctp = even_ct;
+	*odd_ctp = odd_ct;
+	*bothset_ctp = bothset_ct;
+	return;
+      }
+      geno_vvec_stop = &(geno_vvec_iter[vec_ct]);
+      vec_ct = 0;
+    } else {
+      geno_vvec_stop = &(geno_vvec_iter[60]);
+      vec_ct -= 60;
+    }
+    do {
+      vul_t interleaved_mask_vword = *interleaved_mask_vvec_iter++;      
+      vul_t cur_geno_vword = *geno_vvec_iter++;
+      vul_t cur_mask = interleaved_mask_vword & m1;
+      vul_t odd1 = cur_mask & vul_rshift(cur_geno_vword, 1);
+      vul_t even1 = cur_mask & cur_geno_vword;
+      vul_t bothset1 = odd1 & cur_geno_vword;
+
+      cur_mask = vul_rshift(interleaved_mask_vword, 1) & m1;
+      cur_geno_vword = *geno_vvec_iter++;
+      vul_t cur_geno_vword_high_masked = cur_mask & vul_rshift(cur_geno_vword, 1);
+      even1 = even1 + (cur_mask & cur_geno_vword);
+      odd1 = odd1 + cur_geno_vword_high_masked;
+      bothset1 = bothset1 + (cur_geno_vword_high_masked & cur_geno_vword);
+
+      interleaved_mask_vword = *interleaved_mask_vvec_iter++;
+      cur_mask = interleaved_mask_vword & m1;
+      cur_geno_vword = *geno_vvec_iter++;
+      cur_geno_vword_high_masked = cur_mask & vul_rshift(cur_geno_vword, 1);
+      even1 = even1 + (cur_mask & cur_geno_vword);
+      odd1 = odd1 + cur_geno_vword_high_masked;
+      bothset1 = bothset1 + (cur_geno_vword_high_masked & cur_geno_vword);
+
+      even1 = (even1 & m2) + (vul_rshift(even1, 2) & m2);
+      odd1 = (odd1 & m2) + (vul_rshift(odd1, 2) & m2);
+      bothset1 = (bothset1 & m2) + (vul_rshift(bothset1, 2) & m2);
+
+      cur_mask = vul_rshift(interleaved_mask_vword, 1) & m1;
+      cur_geno_vword = *geno_vvec_iter++;
+      vul_t odd2 = cur_mask & vul_rshift(cur_geno_vword, 1);
+      vul_t even2 = cur_mask & cur_geno_vword;
+      vul_t bothset2 = odd2 & cur_geno_vword;
+
+      interleaved_mask_vword = *interleaved_mask_vvec_iter++;
+      cur_mask = interleaved_mask_vword & m1;
+      cur_geno_vword = *geno_vvec_iter++;
+      cur_geno_vword_high_masked = cur_mask & vul_rshift(cur_geno_vword, 1);
+      even2 = even2 + (cur_mask & cur_geno_vword);
+      odd2 = odd2 + cur_geno_vword_high_masked;
+      bothset2 = bothset2 + (cur_geno_vword_high_masked & cur_geno_vword);
+
+      cur_mask = vul_rshift(interleaved_mask_vword, 1) & m1;
+      cur_geno_vword = *geno_vvec_iter++;
+      cur_geno_vword_high_masked = cur_mask & vul_rshift(cur_geno_vword, 1);
+      even2 = even2 + (cur_mask & cur_geno_vword);
+      odd2 = odd2 + cur_geno_vword_high_masked;
+      bothset2 = bothset2 + (cur_geno_vword_high_masked & cur_geno_vword);
+
+      even1 = even1 + (even2 & m2) + (vul_rshift(even2, 2) & m2);
+      odd1 = odd1 + (odd2 & m2) + (vul_rshift(odd2, 2) & m2);
+      bothset1 = bothset1 + (bothset2 & m2) + (vul_rshift(bothset2, 2) & m2);
+      // these now contain 4-bit values from 0-12
+
+      acc_even.vi = acc_even.vi + (even1 & m4) + (vul_rshift(even1, 4) & m4);
+      acc_odd.vi = acc_odd.vi + (odd1 & m4) + (vul_rshift(odd1, 4) & m4);
+      acc_bothset.vi = acc_bothset.vi + (bothset1 & m4) + (vul_rshift(bothset1, 4) & m4);
+    } while (geno_vvec_iter < geno_vvec_stop);
+    const vul_t m8 = VCONST_UL(kMask00FF);
+    acc_even.vi = (acc_even.vi & m8) + (vul_rshift(acc_even.vi, 8) & m8);
+    acc_odd.vi = (acc_odd.vi & m8) + (vul_rshift(acc_odd.vi, 8) & m8);
+    acc_bothset.vi = (acc_bothset.vi & m8) + (vul_rshift(acc_bothset.vi, 8) & m8);
+    even_ct += univec_hsum_16bit(acc_even);
+    odd_ct += univec_hsum_16bit(acc_odd);
+    bothset_ct += univec_hsum_16bit(acc_bothset);
+  }
+}
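+
+// Mask-layout note: the loop above consumes one interleaved-mask vector per
+// two genotype vectors.  The mask's even bit positions (the & m1 extraction)
+// cover the first genotype vector's entries, and the same mask shifted right
+// by 1 covers the second's; fill_interleaved_mask_vec() below produces this
+// layout.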
+
+uint32_t count_01_vecs(const vul_t* geno_vvec, uint32_t vec_ct) {
+  assert(!(vec_ct % 6));
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t m2 = VCONST_UL(kMask3333);
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  const vul_t m8 = VCONST_UL(kMask00FF);
+  const vul_t* geno_vvec_iter = geno_vvec;
+  uint32_t tot = 0;
+  while (1) {
+    univec_t acc;
+    acc.vi = vul_setzero();
+    const vul_t* geno_vvec_stop;
+    if (vec_ct < 60) {
+      if (!vec_ct) {
+	return tot;
+      }
+      geno_vvec_stop = &(geno_vvec_iter[vec_ct]);
+      vec_ct = 0;
+    } else {
+      geno_vvec_stop = &(geno_vvec_iter[60]);
+      vec_ct -= 60;
+    }
+    do {
+      vul_t loader1 = *geno_vvec_iter++;
+      vul_t loader2 = *geno_vvec_iter++;
+      vul_t count1 = ((~vul_rshift(loader1, 1)) & loader1) & m1;
+      vul_t count2 = ((~vul_rshift(loader2, 1)) & loader2) & m1;
+
+      loader1 = *geno_vvec_iter++;
+      loader2 = *geno_vvec_iter++;
+      count1 = count1 + (((~vul_rshift(loader1, 1)) & loader1) & m1);
+      count2 = count2 + (((~vul_rshift(loader2, 1)) & loader2) & m1);
+
+      loader1 = *geno_vvec_iter++;
+      loader2 = *geno_vvec_iter++;
+      count1 = count1 + (((~vul_rshift(loader1, 1)) & loader1) & m1);
+      count2 = count2 + (((~vul_rshift(loader2, 1)) & loader2) & m1);
+
+      count1 = (count1 & m2) + (vul_rshift(count1, 2) & m2);
+      count1 = count1 + (count2 & m2) + (vul_rshift(count2, 2) & m2);
+      acc.vi = acc.vi + (count1 & m4) + (vul_rshift(count1, 4) & m4);
+    } while (geno_vvec_iter < geno_vvec_stop);
+    acc.vi = (acc.vi & m8) + (vul_rshift(acc.vi, 8) & m8);
+    tot += univec_hsum_16bit(acc);
+  }
+}
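+
+// count_01_vecs() counts exactly the 0b01 entries: within each 2-bit code,
+// (~(w >> 1)) & w & kMask5555 is nonzero iff the low bit is set and the high
+// bit is clear.  Scalar form (also used by genovec_count_01_unsafe() below):
+//   tot += popcount01_long(w & (~(w >> 1)) & kMask5555);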
+
+uintptr_t popcount_bytes(const unsigned char* bitarr, uintptr_t byte_ct) {
+  const uint32_t lead_byte_ct = ((uintptr_t)(-((uintptr_t)bitarr))) % kBytesPerVec;
+  uintptr_t tot = 0;
+  const uintptr_t* bitarr_iter;
+  uint32_t trail_byte_ct;
+  // bugfix: had wrong condition here
+  if (byte_ct >= lead_byte_ct) {
+#ifdef __LP64__
+    const uint32_t word_rem = lead_byte_ct % kBytesPerWord;
+    if (word_rem) {
+      uintptr_t cur_word = 0;
+      memcpy(&cur_word, bitarr, word_rem);
+      tot = popcount_long(cur_word);
+    }
+    bitarr_iter = (const uintptr_t*)(&(bitarr[word_rem]));
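+    // at most one full lead word can remain here, assuming kBytesPerVec ==
+    // 2 * kBytesPerWord (cf. the kWordsPerVec == 2 static_asserts below)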
+    if (lead_byte_ct / kBytesPerWord) {
+      tot += popcount_long(*bitarr_iter++);
+    }
+#else
+    if (lead_byte_ct) {
+      uintptr_t cur_word = 0;
+      memcpy(&cur_word, bitarr, lead_byte_ct);
+      tot = popcount_long(cur_word);
+    }
+    bitarr_iter = (const uintptr_t*)(&(bitarr[lead_byte_ct]));
+#endif
+    byte_ct -= lead_byte_ct;
+    const uintptr_t word_ct = byte_ct / kBytesPerWord;
+    // vec-alignment required here
+    tot += popcount_longs(bitarr_iter, word_ct);
+    bitarr_iter = &(bitarr_iter[word_ct]);
+    trail_byte_ct = byte_ct % kBytesPerWord;
+  } else {
+    bitarr_iter = (const uintptr_t*)bitarr;
+    // this may still be >= kBytesPerWord, so can't remove loop
+    trail_byte_ct = (uint32_t)byte_ct;
+  }
+  while (1) {
+    uintptr_t cur_word;
+    if (trail_byte_ct < kBytesPerWord) {
+      if (!trail_byte_ct) {
+	return tot;
+      }
+      cur_word = 0;
+      memcpy(&cur_word, bitarr_iter, trail_byte_ct);
+      trail_byte_ct = 0;
+    } else {
+      cur_word = *bitarr_iter++;
+      trail_byte_ct -= kBytesPerWord;
+    }
+    tot += popcount_long(cur_word);
+  }
+}
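+
+// Pointer-arithmetic note for popcount_bytes():
+// ((uintptr_t)(-((uintptr_t)bitarr))) % kBytesPerVec is the byte distance
+// from bitarr to the next vector boundary, and the memcpy-into-a-zeroed-word
+// loads ensure that neither the lead nor the trail handling reads past the
+// buffer, while the middle popcount_longs() call runs on vector-aligned
+// words.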
+
+uintptr_t popcount_bytes_masked(const unsigned char* bitarr, const uintptr_t* mask_arr, uintptr_t byte_ct) {
+  // could special-case the vector-aligned case, but that probably doesn't
+  // come up often enough to be worth the extra branch
+  const uintptr_t word_ct = byte_ct / kBytesPerWord;
+#ifdef USE_SSE42
+  const uintptr_t* bitarr_alias = (const uintptr_t*)bitarr;
+  uintptr_t tot = 0;
+  for (uintptr_t widx = 0; widx < word_ct; ++widx) {
+    tot += popcount_long(bitarr_alias[widx] & mask_arr[widx]);
+  }
+  const uint32_t trail_byte_ct = byte_ct % kBytesPerWord;
+  if (trail_byte_ct) {
+    uintptr_t cur_word = 0;
+    memcpy(&cur_word, &(bitarr_alias[word_ct]), trail_byte_ct);
+    tot += popcount_long(cur_word & mask_arr[word_ct]);
+  }
+  return tot;
+#else
+  const uintptr_t* bitarr_iter = (const uintptr_t*)bitarr;
+  const uintptr_t mainblock_word_ct = word_ct - (word_ct % (24 / kBytesPerWord));
+  const uintptr_t* bitarr_24b_end = &(bitarr_iter[mainblock_word_ct]);
+  const uintptr_t* mask_arr_iter = mask_arr;
+  uintptr_t tot = 0;
+  while (bitarr_iter < bitarr_24b_end) {
+    uintptr_t loader = (*bitarr_iter++) & (*mask_arr_iter++);
+    uintptr_t ulii = loader - ((loader >> 1) & kMask5555);
+    loader = (*bitarr_iter++) & (*mask_arr_iter++);
+    uintptr_t uljj = loader - ((loader >> 1) & kMask5555);
+    loader = (*bitarr_iter++) & (*mask_arr_iter++);
+    ulii += (loader >> 1) & kMask5555;
+    uljj += loader & kMask5555;
+    ulii = (ulii & kMask3333) + ((ulii >> 2) & kMask3333);
+    ulii += (uljj & kMask3333) + ((uljj >> 2) & kMask3333);
+    uintptr_t tmp_stor = (ulii & kMask0F0F) + ((ulii >> 4) & kMask0F0F);
+
+  #ifndef __LP64__
+    loader = (*bitarr_iter++) & (*mask_arr_iter++);
+    ulii = loader - ((loader >> 1) & kMask5555);
+    loader = (*bitarr_iter++) & (*mask_arr_iter++);
+    uljj = loader - ((loader >> 1) & kMask5555);
+    loader = (*bitarr_iter++) & (*mask_arr_iter++);
+    ulii += (loader >> 1) & kMask5555;
+    uljj += loader & kMask5555;
+    ulii = (ulii & kMask3333) + ((ulii >> 2) & kMask3333);
+    ulii += (uljj & kMask3333) + ((uljj >> 2) & kMask3333);
+    tmp_stor += (ulii & kMask0F0F) + ((ulii >> 4) & kMask0F0F);
+  #endif
+
+    // 32-bit case: each 8-bit slot stores a number in 0..48.  Multiplying by
+    // 0x01010101 is equivalent to the left-shifts and adds we need to sum
+    // those four 8-bit numbers in the high-order slot.
+    // 64-bit case: each 8-bit slot stores a number in 0..24.
+    tot += (tmp_stor * kMask0101) >> (kBitsPerWord - 8);
+  }
+  uint32_t trail_byte_ct = (uint32_t)(byte_ct - (mainblock_word_ct * kBytesPerWord));
+  while (1) {
+    uintptr_t cur_word;
+    if (trail_byte_ct < kBytesPerWord) {
+      if (!trail_byte_ct) {
+	return tot;
+      }
+      cur_word = 0;
+      memcpy(&cur_word, bitarr_iter, trail_byte_ct);
+      trail_byte_ct = 0;
+    } else {
+      cur_word = *bitarr_iter++;
+      trail_byte_ct -= kBytesPerWord;
+    }
+    tot += popcount_long(cur_word & (*mask_arr_iter++));
+  }
+#endif
+}
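+
+// Horizontal-sum-by-multiply example: when the byte lanes of tmp_stor are
+// small enough that their running sums stay below 256 (0..24 each on 64-bit,
+// 0..48 on 32-bit), tmp_stor * kMask0101 leaves the sum of all byte lanes in
+// the high-order byte, which >> (kBitsPerWord - 8) extracts.  E.g. on 64-bit,
+// (0x0102030405060708 * kMask0101) >> 56 == 36 == 1+2+3+4+5+6+7+8.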
+
+void fill_interleaved_mask_vec(const uintptr_t* __restrict subset_mask, uint32_t base_vec_ct, uintptr_t* interleaved_mask_vec) {
+#ifdef __LP64__
+  const uintptr_t* subset_mask_iter = subset_mask;
+  uintptr_t* interleaved_mask_vec_iter = interleaved_mask_vec;
+  #ifdef USE_AVX2
+  uintptr_t orig_word1 = 0;
+  uintptr_t orig_word3 = 0;
+  #endif
+  for (uint32_t vec_idx = 0; vec_idx < base_vec_ct; ++vec_idx) {
+  #ifdef USE_AVX2
+    // 0 128 1 129 2 130 ...
+    for (uint32_t widx = 0; widx < 4; ++widx) {
+      uintptr_t ww_even;
+      uintptr_t ww_odd;
+      if (!(widx % 2)) {
+	orig_word1 = subset_mask_iter[0];
+	orig_word3 = subset_mask_iter[2];
+	++subset_mask_iter;
+	ww_even = (uint32_t)orig_word1;
+	ww_odd = (uint32_t)orig_word3;
+      } else {
+	ww_even = orig_word1 >> 32;
+	ww_odd = orig_word3 >> 32;
+      }
+      ww_even = unpack_halfword_to_word(ww_even);
+      ww_odd = unpack_halfword_to_word(ww_odd);
+      *interleaved_mask_vec_iter++ = ww_even | (ww_odd << 1);
+    }
+    subset_mask_iter = &(subset_mask_iter[2]);
+  #else // not USE_AVX2
+    // 0 64 1 65 2 66 ...
+    const uintptr_t orig_word1 = *subset_mask_iter++;
+    const uintptr_t orig_word2 = *subset_mask_iter++;
+    for (uint32_t widx = 0; widx < 2; ++widx) {
+      uintptr_t ww_even;
+      uintptr_t ww_odd;
+      // todo: check if there's a better way to organize this loop
+      if (!widx) {
+	ww_even = (uint32_t)orig_word1;
+	ww_odd = (uint32_t)orig_word2;
+      } else {
+	ww_even = orig_word1 >> 32;
+	ww_odd = orig_word2 >> 32;
+      }
+      ww_even = unpack_halfword_to_word(ww_even);
+      ww_odd = unpack_halfword_to_word(ww_odd);
+      *interleaved_mask_vec_iter++ = ww_even | (ww_odd << 1);
+    }
+  #endif // not USE_AVX2
+  }
+#else
+  for (uint32_t widx = 0; widx < base_vec_ct; ++widx) {
+    const uintptr_t orig_word = subset_mask[widx];
+    uintptr_t ww_even = (uint16_t)orig_word;
+    uintptr_t ww_odd = orig_word >> 16;
+    ww_even = unpack_halfword_to_word(ww_even);
+    ww_odd = unpack_halfword_to_word(ww_odd);
+    interleaved_mask_vec[widx] = ww_even | (ww_odd << 1);
+  }
+#endif
+}
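+
+// Worked example for the 32-bit branch, assuming unpack_halfword_to_word()
+// spreads bit i of a halfword to bit 2i (consistent with the "0 64 1 65 ..."
+// layout comments above): orig_word == 0x00030001 sets sample 0 in the low
+// halfword and samples 16-17 in the high halfword; unpack(0x0001) == 0x1 and
+// unpack(0x0003) == 0x5, so the stored word is 0x1 | (0x5 << 1) == 0xb, with
+// sample 0's mask bit at an even position and samples 16-17's at odd ones.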
+
+void fill_cumulative_popcounts(const uintptr_t* subset_mask, uint32_t word_ct, uint32_t* cumulative_popcounts) {
+  assert(word_ct);
+  const uint32_t word_ct_m1 = word_ct - 1;
+  uint32_t cur_sum = 0;
+  for (uint32_t widx = 0; widx < word_ct_m1; ++widx) {
+    cumulative_popcounts[widx] = cur_sum;
+    cur_sum += popcount_long(subset_mask[widx]);
+  }
+  cumulative_popcounts[word_ct_m1] = cur_sum;
+}
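+
+// Example: subset_mask == {0b1011, 0b0110, ...} yields cumulative_popcounts
+// == {0, 3, ...}; entry w is the number of set bits strictly before word w.
+// raw_to_subsetted_pos() can then presumably combine this with a popcount of
+// the mask bits below the raw index within its own word to produce the
+// sample's rank within the subset.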
+
+void uidxs_to_idxs(const uintptr_t* subset_mask, const uint32_t* subset_cumulative_popcounts, const uint32_t idx_list_len, uint32_t* idx_list) {
+  uint32_t* idx_list_end = &(idx_list[idx_list_len]);
+  for (uint32_t* idx_list_iter = idx_list; idx_list_iter != idx_list_end; ++idx_list_iter) {
+    *idx_list_iter = raw_to_subsetted_pos(subset_mask, subset_cumulative_popcounts, *idx_list_iter);
+  }
+}
+
+void genovec_allele_cts_unsafe(const uintptr_t* genovec, uint32_t sample_ct, uint32_t* __restrict allele_cts, uint32_t* __restrict bothset_ctp) {
+  // assumes trailing bits of last genovec word are zeroed out.
+  // sets allele_cts[0] to the number of observed ref alleles, and
+  // allele_cts[1] to the number of observed alt1s.
+  const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+  uint32_t word_idx = sample_ctl2 - (sample_ctl2 % (3 * kWordsPerVec));
+  uint32_t alt1_plus_bothset_ct;
+  uint32_t bothset_ct;
+  assert(IS_VEC_ALIGNED(genovec));
+  count_2freq_3xvec((const vul_t*)genovec, word_idx / kWordsPerVec, &alt1_plus_bothset_ct, &bothset_ct);
+  for (; word_idx < sample_ctl2; ++word_idx) {
+    const uintptr_t cur_geno_word = genovec[word_idx];
+    const uintptr_t cur_geno_word_low_lshifted = (cur_geno_word & kMask5555) << 1;
+    alt1_plus_bothset_ct += popcount2_long((~cur_geno_word_low_lshifted) & cur_geno_word);
+    bothset_ct += popcount2_long(cur_geno_word_low_lshifted & cur_geno_word);
+  }
+  const uint32_t alt1_ct = alt1_plus_bothset_ct - bothset_ct;
+  allele_cts[0] = (sample_ct - bothset_ct) * 2 - alt1_ct;
+  allele_cts[1] = alt1_ct;
+  *bothset_ctp = bothset_ct;
+}
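+
+// Recovery algebra: every sample whose genotype is not 0b11 contributes
+// exactly two observed alleles, so allele_cts[0] + allele_cts[1] ==
+// 2 * (sample_ct - bothset_ct); the ref count is that total minus alt1_ct.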
+
+void genovec_count_freqs_unsafe(const uintptr_t* genovec, uint32_t sample_ct, uint32_t* genocounts) {
+  // fills genocounts[0] with the number of 00s, genocounts[1] with the number
+  // of 01s, etc.
+  // assumes trailing bits of last genovec word are zeroed out.
+  // sample_ct == 0 ok.
+  const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+  uint32_t even_ct;
+  uint32_t odd_ct;
+  uint32_t bothset_ct;
+  uint32_t word_idx = sample_ctl2 - (sample_ctl2 % (6 * kWordsPerVec));
+  assert(IS_VEC_ALIGNED(genovec));
+  count_3freq_6xvec((const vul_t*)genovec, word_idx / kWordsPerVec, &even_ct, &odd_ct, &bothset_ct);
+  for (; word_idx < sample_ctl2; ++word_idx) {
+    const uintptr_t cur_geno_word = genovec[word_idx];
+    const uintptr_t cur_geno_word_high = kMask5555 & (cur_geno_word >> 1);
+    even_ct += popcount01_long(cur_geno_word & kMask5555);
+    odd_ct += popcount01_long(cur_geno_word_high);
+    bothset_ct += popcount01_long(cur_geno_word & cur_geno_word_high);
+  }
+  genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
+  genocounts[1] = even_ct - bothset_ct;
+  genocounts[2] = odd_ct - bothset_ct;
+  genocounts[3] = bothset_ct;
+}
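+
+// Why the final algebra works: with 2-bit codes 0..3, even_ct counts codes 1
+// and 3 (low bit set), odd_ct counts codes 2 and 3 (high bit set), and
+// bothset_ct counts code 3 alone.  So genocounts[1] = even_ct - bothset_ct,
+// genocounts[2] = odd_ct - bothset_ct, genocounts[3] = bothset_ct, and
+// inclusion-exclusion leaves genocounts[0] = sample_ct + bothset_ct -
+// even_ct - odd_ct.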
+
+void genovec_count_subset_freqs(const uintptr_t* __restrict genovec, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t* genocounts) {
+  // fills genocounts[0] with the number of 00s, genocounts[1] with the number
+  // of 01s, etc.
+  // {raw_}sample_ct == 0 ok.
+  const uint32_t raw_sample_ctv2 = QUATERCT_TO_VECCT(raw_sample_ct);
+  uint32_t even_ct;
+  uint32_t odd_ct;
+  uint32_t bothset_ct;
+#ifdef __LP64__
+  uint32_t vec_idx = raw_sample_ctv2 - (raw_sample_ctv2 % 6);
+  assert(IS_VEC_ALIGNED(genovec));
+  count_subset_3freq_6xvec((const vul_t*)genovec, (const vul_t*)sample_include_interleaved_vec, vec_idx, &even_ct, &odd_ct, &bothset_ct);
+  const uintptr_t* genovec_iter = &(genovec[kWordsPerVec * vec_idx]);
+  const uintptr_t* interleaved_mask_iter = &(sample_include_interleaved_vec[vec_idx]);
+  #ifdef USE_AVX2
+  uintptr_t mask_base1 = 0;
+  uintptr_t mask_base2 = 0;
+  uintptr_t mask_base3 = 0;
+  uintptr_t mask_base4 = 0;
+  for (; vec_idx < raw_sample_ctv2; ++vec_idx) {
+    uintptr_t mask_word1;
+    uintptr_t mask_word2;
+    uintptr_t mask_word3;
+    uintptr_t mask_word4;
+    if (!(vec_idx % 2)) {
+      mask_base1 = *interleaved_mask_iter++;
+      mask_base2 = *interleaved_mask_iter++;
+      mask_base3 = *interleaved_mask_iter++;
+      mask_base4 = *interleaved_mask_iter++;
+      mask_word1 = mask_base1 & kMask5555;
+      mask_word2 = mask_base2 & kMask5555;
+      mask_word3 = mask_base3 & kMask5555;
+      mask_word4 = mask_base4 & kMask5555;
+    } else {
+      mask_word1 = (mask_base1 >> 1) & kMask5555;
+      mask_word2 = (mask_base2 >> 1) & kMask5555;
+      mask_word3 = (mask_base3 >> 1) & kMask5555;
+      mask_word4 = (mask_base4 >> 1) & kMask5555;
+    }
+    uint32_t uii = 0;
+    while (1) {
+      const uintptr_t cur_geno_word1 = *genovec_iter++;
+      const uintptr_t cur_geno_word2 = *genovec_iter++;
+      const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
+      const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
+      even_ct += popcount_long(((cur_geno_word1 & mask_word1) << 1) | (cur_geno_word2 & mask_word2));
+      odd_ct += popcount_long((cur_geno_word1_high_masked << 1) | cur_geno_word2_high_masked);
+      bothset_ct += popcount_long(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
+      if (uii) {
+	break;
+      }
+      ++uii;
+      mask_word1 = mask_word3;
+      mask_word2 = mask_word4;
+    }
+  }
+  #else // not USE_AVX2
+  uintptr_t mask_base1 = 0;
+  uintptr_t mask_base2 = 0;
+  for (; vec_idx < raw_sample_ctv2; ++vec_idx) {
+    uintptr_t mask_word1;
+    uintptr_t mask_word2;
+    if (!(vec_idx % 2)) {
+      mask_base1 = *interleaved_mask_iter++;
+      mask_base2 = *interleaved_mask_iter++;
+      mask_word1 = mask_base1 & kMask5555;
+      mask_word2 = mask_base2 & kMask5555;
+    } else {
+      mask_word1 = (mask_base1 >> 1) & kMask5555;
+      mask_word2 = (mask_base2 >> 1) & kMask5555;
+    }
+    const uintptr_t cur_geno_word1 = *genovec_iter++;
+    const uintptr_t cur_geno_word2 = *genovec_iter++;
+    const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
+    const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
+    #ifdef USE_SSE42
+    even_ct += popcount_long(((cur_geno_word1 & mask_word1) << 1) | (cur_geno_word2 & mask_word2));
+    odd_ct += popcount_long((cur_geno_word1_high_masked << 1) | cur_geno_word2_high_masked);
+    bothset_ct += popcount_long(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
+    #else
+    even_ct += popcount2_long((cur_geno_word1 & mask_word1) + (cur_geno_word2 & mask_word2));
+    odd_ct += popcount2_long(cur_geno_word1_high_masked + cur_geno_word2_high_masked);
+    bothset_ct += popcount2_long((cur_geno_word1 & cur_geno_word1_high_masked) + (cur_geno_word2 & cur_geno_word2_high_masked));
+    #endif
+  }
+  #endif // not USE_AVX2
+#else // not __LP64__
+  uint32_t word_idx = raw_sample_ctv2 - (raw_sample_ctv2 % 6);
+  count_subset_3freq_6xvec((const vul_t*)genovec, (const vul_t*)sample_include_interleaved_vec, word_idx, &even_ct, &odd_ct, &bothset_ct);
+  const uintptr_t* interleaved_mask_iter = &(sample_include_interleaved_vec[word_idx / 2]);
+  uintptr_t mask_base = 0;
+  for (; word_idx < raw_sample_ctv2; ++word_idx) {
+    uintptr_t mask_word;
+    if (!(word_idx % 2)) {
+      mask_base = *interleaved_mask_iter++;
+      mask_word = mask_base & kMask5555;
+    } else {
+      mask_word = (mask_base >> 1) & kMask5555;
+    }
+    const uintptr_t cur_geno_word = genovec[word_idx];
+    const uintptr_t cur_geno_word_high_masked = mask_word & (cur_geno_word >> 1);
+    even_ct += popcount01_long(cur_geno_word & mask_word);
+    odd_ct += popcount01_long(cur_geno_word_high_masked);
+    bothset_ct += popcount01_long(cur_geno_word & cur_geno_word_high_masked);
+  }
+#endif
+  genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
+  genocounts[1] = even_ct - bothset_ct;
+  genocounts[2] = odd_ct - bothset_ct;
+  genocounts[3] = bothset_ct;
+}
+
+uint32_t genovec_count_01_unsafe(const uintptr_t* genovec, uint32_t sample_ct) {
+  const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+  uint32_t word_idx = sample_ctl2 - (sample_ctl2 % (6 * kWordsPerVec));
+  assert(IS_VEC_ALIGNED(genovec));
+  uint32_t tot = count_01_vecs((const vul_t*)genovec, word_idx / kWordsPerVec);
+  for (; word_idx < sample_ctl2; ++word_idx) {
+    const uintptr_t cur_geno_word = genovec[word_idx];
+    tot += popcount01_long(cur_geno_word & (~(cur_geno_word >> 1)) & kMask5555);
+  }
+  return tot;
+}
+
+void small_genoarr_count_3freq_incr(const uintptr_t* genoarr_iter, uint32_t byte_ct, uint32_t* even_ctp, uint32_t* odd_ctp, uint32_t* bothset_ctp) {
+  while (1) {
+    uintptr_t cur_geno_word;
+    if (byte_ct < kBytesPerWord) {
+      if (!byte_ct) {
+	return;
+      }
+      cur_geno_word = 0;
+      memcpy(&cur_geno_word, genoarr_iter, byte_ct);
+      byte_ct = 0;
+    } else {
+      cur_geno_word = *genoarr_iter++;
+      byte_ct -= kBytesPerWord;
+    }
+    const uintptr_t cur_geno_word_high = kMask5555 & (cur_geno_word >> 1);
+    *even_ctp += popcount01_long(cur_geno_word & kMask5555);
+    *odd_ctp += popcount01_long(cur_geno_word_high);
+    *bothset_ctp += popcount01_long(cur_geno_word & cur_geno_word_high);
+  }
+}
+
+#ifdef __arm__
+  #error "Unaligned accesses in small_genoarr_count_3freq_incr()."
+#endif
+void genoarr_count_freqs(const unsigned char* genoarr, uint32_t sample_ct, uint32_t* genocounts) {
+  // does not read past the end of genoarr
+  uint32_t lead_byte_ct = ((uintptr_t)(-((uintptr_t)genoarr))) % kBytesPerVec;
+  uint32_t even_ct = 0;
+  uint32_t odd_ct = 0;
+  uint32_t bothset_ct = 0;
+  const uintptr_t* genoarr_iter;
+  uint32_t trail_ct;
+  if (sample_ct > lead_byte_ct * 4 + (6 * kQuatersPerVec)) {
+    const uint32_t remaining_sample_ct = sample_ct - 4 * lead_byte_ct;
+    // strictly speaking, this relies on undefined behavior: see e.g.
+    // http://pzemtsov.github.io/2016/11/06/bug-story-alignment-on-x86.html
+    // probably want to search out all instances of __arm__ and make the code
+    // standard-compliant, if that can be done without a speed penalty
+    small_genoarr_count_3freq_incr((const uintptr_t*)genoarr, lead_byte_ct, &even_ct, &odd_ct, &bothset_ct);
+    genoarr_iter = (const uintptr_t*)(&(genoarr[lead_byte_ct]));
+    const uint32_t remaining_full_vec_ct = remaining_sample_ct / kQuatersPerVec;
+    uint32_t even_ct_incr;
+    uint32_t odd_ct_incr;
+    uint32_t bothset_ct_incr;
+    const uint32_t vec_ct = remaining_full_vec_ct - (remaining_full_vec_ct % 6);
+    count_3freq_6xvec((const vul_t*)genoarr_iter, vec_ct, &even_ct_incr, &odd_ct_incr, &bothset_ct_incr);
+    even_ct += even_ct_incr;
+    odd_ct += odd_ct_incr;
+    bothset_ct += bothset_ct_incr;
+    genoarr_iter = &(genoarr_iter[kWordsPerVec * vec_ct]);
+    trail_ct = remaining_sample_ct - (vec_ct * kQuatersPerVec);
+  } else {
+    genoarr_iter = (const uintptr_t*)genoarr;
+    trail_ct = sample_ct;
+  }
+  const uint32_t trail_byte_ct = QUATERCT_TO_BYTECT(trail_ct);
+  small_genoarr_count_3freq_incr(genoarr_iter, trail_byte_ct, &even_ct, &odd_ct, &bothset_ct);
+  genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
+  genocounts[1] = even_ct - bothset_ct;
+  genocounts[2] = odd_ct - bothset_ct;
+  genocounts[3] = bothset_ct;
+}
+
+#ifdef __arm__
+  #error "Unaligned accesses in genoarr_count_subset_freqs()."
+#endif
+void genoarr_count_subset_freqs(const unsigned char* genoarr, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t* genocounts) {
+  // does not read past the end of genoarr
+  const uintptr_t* genoarr_iter = (const uintptr_t*)genoarr;
+  const uintptr_t* interleaved_mask_iter = (const uintptr_t*)sample_include_interleaved_vec;
+  const uint32_t raw_sample_ctv2 = QUATERCT_TO_VECCT(raw_sample_ct);
+  uint32_t even_ct = 0;
+  uint32_t odd_ct = 0;
+  uint32_t bothset_ct = 0;
+#ifdef USE_AVX2
+  const uint32_t halfvec_idx_trail = (raw_sample_ct + 3) / (kBitsPerVec / 4);
+  uintptr_t mask_base1 = 0;
+  uintptr_t mask_base2 = 0;
+  uintptr_t mask_base3 = 0;
+  uintptr_t mask_base4 = 0;
+  for (uint32_t vec_idx = 0; vec_idx < raw_sample_ctv2; ++vec_idx) {
+    uintptr_t mask_word1;
+    uintptr_t mask_word2;
+    uintptr_t mask_word3;
+    uintptr_t mask_word4;
+    if (!(vec_idx % 2)) {
+      mask_base1 = *interleaved_mask_iter++;
+      mask_base2 = *interleaved_mask_iter++;
+      mask_base3 = *interleaved_mask_iter++;
+      mask_base4 = *interleaved_mask_iter++;
+      mask_word1 = mask_base1 & kMask5555;
+      mask_word2 = mask_base2 & kMask5555;
+      mask_word3 = mask_base3 & kMask5555;
+      mask_word4 = mask_base4 & kMask5555;
+    } else {
+      mask_word1 = (mask_base1 >> 1) & kMask5555;
+      mask_word2 = (mask_base2 >> 1) & kMask5555;
+      mask_word3 = (mask_base3 >> 1) & kMask5555;
+      mask_word4 = (mask_base4 >> 1) & kMask5555;
+    }
+    uint32_t uii = 0;
+    while (1) {
+      uintptr_t cur_geno_word1;
+      uintptr_t cur_geno_word2;
+      if (2 * vec_idx + uii < halfvec_idx_trail) {
+	cur_geno_word1 = *genoarr_iter++;
+	cur_geno_word2 = *genoarr_iter++;
+      } else {
+	const uint32_t remaining_byte_ct = QUATERCT_TO_BYTECT(raw_sample_ct) % kBytesPerVec;
+	cur_geno_word2 = 0;
+	uii = 1; // todo: check if this harms usual-case loop efficiency
+	if (remaining_byte_ct <= kBytesPerWord) {
+	  cur_geno_word1 = 0;
+	  memcpy(&cur_geno_word1, genoarr_iter, remaining_byte_ct);
+	} else {
+	  cur_geno_word1 = *genoarr_iter++;
+	  memcpy(&cur_geno_word2, genoarr_iter, remaining_byte_ct - kBytesPerWord);
+	}
+      }
+      const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
+      const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
+      even_ct += popcount_long(((cur_geno_word1 & mask_word1) << 1) | (cur_geno_word2 & mask_word2));
+      odd_ct += popcount_long((cur_geno_word1_high_masked << 1) | cur_geno_word2_high_masked);
+      bothset_ct += popcount_long(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
+      if (uii) {
+	break;
+      }
+      ++uii;
+      mask_word1 = mask_word3;
+      mask_word2 = mask_word4;
+    }
+  }
+#else // not USE_AVX2
+  const uint32_t vec_idx_trail = (raw_sample_ct + 3) / kQuatersPerVec;
+  #ifdef __LP64__
+  uintptr_t mask_base1 = 0;
+  uintptr_t mask_base2 = 0;
+  for (uint32_t vec_idx = 0; vec_idx < raw_sample_ctv2; ++vec_idx) {
+    uintptr_t mask_word1;
+    uintptr_t mask_word2;
+    if (!(vec_idx % 2)) {
+      mask_base1 = *interleaved_mask_iter++;
+      mask_base2 = *interleaved_mask_iter++;
+      mask_word1 = mask_base1 & kMask5555;
+      mask_word2 = mask_base2 & kMask5555;
+    } else {
+      mask_word1 = (mask_base1 >> 1) & kMask5555;
+      mask_word2 = (mask_base2 >> 1) & kMask5555;
+    }
+    uintptr_t cur_geno_word1;
+    uintptr_t cur_geno_word2;
+    if (vec_idx < vec_idx_trail) {
+      cur_geno_word1 = *genoarr_iter++;
+      cur_geno_word2 = *genoarr_iter++;
+    } else {
+      const uint32_t remaining_byte_ct = QUATERCT_TO_BYTECT(raw_sample_ct) % kBytesPerVec;
+      cur_geno_word2 = 0;
+      if (remaining_byte_ct <= kBytesPerWord) {
+	cur_geno_word1 = 0;
+	memcpy(&cur_geno_word1, genoarr_iter, remaining_byte_ct);
+      } else {
+	cur_geno_word1 = *genoarr_iter++;
+	memcpy(&cur_geno_word2, genoarr_iter, remaining_byte_ct - kBytesPerWord);
+      }
+    }
+    const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
+    const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
+    #ifdef USE_SSE42
+    even_ct += popcount_long(((cur_geno_word1 & mask_word1) << 1) | (cur_geno_word2 & mask_word2));
+    odd_ct += popcount_long((cur_geno_word1_high_masked << 1) | cur_geno_word2_high_masked);
+    bothset_ct += popcount_long(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
+    #else
+    even_ct += popcount2_long((cur_geno_word1 & mask_word1) + (cur_geno_word2 & mask_word2));
+    odd_ct += popcount2_long(cur_geno_word1_high_masked + cur_geno_word2_high_masked);
+    bothset_ct += popcount2_long((cur_geno_word1 & cur_geno_word1_high_masked) + (cur_geno_word2 & cur_geno_word2_high_masked));
+    #endif
+  }
+  #else // not __LP64__
+  uintptr_t mask_base = 0;
+  for (uint32_t word_idx = 0; word_idx < raw_sample_ctv2; ++word_idx) {
+    uintptr_t mask_word;
+    if (!(word_idx % 2)) {
+      mask_base = *interleaved_mask_iter++;
+      mask_word = mask_base & kMask5555;
+    } else {
+      mask_word = (mask_base >> 1) & kMask5555;
+    }
+    uintptr_t cur_geno_word;
+    if (word_idx < vec_idx_trail) {
+      cur_geno_word = *genoarr_iter++;
+    } else {
+      const uint32_t remaining_byte_ct = QUATERCT_TO_BYTECT(raw_sample_ct) % kBytesPerVec;
+      cur_geno_word = 0;
+      memcpy(&cur_geno_word, genoarr_iter, remaining_byte_ct);
+    }
+    const uintptr_t cur_geno_word_high_masked = mask_word & (cur_geno_word >> 1);
+    even_ct += popcount01_long(cur_geno_word & mask_word);
+    odd_ct += popcount01_long(cur_geno_word_high_masked);
+    bothset_ct += popcount01_long(cur_geno_word & cur_geno_word_high_masked);
+  }
+  #endif // not __LP64__
+#endif // not USE_AVX2
+  genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
+  genocounts[1] = even_ct - bothset_ct;
+  genocounts[2] = odd_ct - bothset_ct;
+  genocounts[3] = bothset_ct;
+}
+
+void genoarr_count_subset_freqs2(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t* genocounts) {
+  // Slower version of genoarr_count_subset_freqs() which does not require
+  // sample_include_interleaved_vec to be precomputed.
+  // {raw_}sample_ct == 0 ok.
+  const uint32_t raw_sample_ctl2 = QUATERCT_TO_WORDCT(raw_sample_ct);
+  const uint32_t fullword_ct = raw_sample_ctl2 / 2;
+  uint32_t even_ct = 0;
+  uint32_t odd_ct = 0;
+  uint32_t bothset_ct = 0;
+  for (uint32_t widx = 0; widx < fullword_ct; ++widx) {
+    const uintptr_t mask_word = sample_include[widx];
+    if (mask_word) {
+      uintptr_t geno_word = genoarr[2 * widx];
+      uintptr_t geno_even = pack_word_to_halfword(geno_word & kMask5555);
+      uintptr_t geno_odd = pack_word_to_halfword((geno_word >> 1) & kMask5555);
+      geno_word = genoarr[2 * widx + 1];
+      geno_even |= ((uintptr_t)pack_word_to_halfword(geno_word & kMask5555)) << kBitsPerWordD2;
+      geno_odd |= ((uintptr_t)pack_word_to_halfword((geno_word >> 1) & kMask5555)) << kBitsPerWordD2;
+      const uintptr_t geno_even_masked = geno_even & mask_word;
+      even_ct += popcount_long(geno_even_masked);
+      odd_ct += popcount_long(geno_odd & mask_word);
+      bothset_ct += popcount_long(geno_odd & geno_even_masked);
+    }
+  }
+  if (raw_sample_ctl2 % 2) {
+    const uintptr_t mask_hw = sample_include[fullword_ct];
+    if (mask_hw) {
+      const uintptr_t geno_word = genoarr[2 * fullword_ct];
+      // todo: benchmark main loop unpack vs. pack
+      const uintptr_t mask_word = unpack_halfword_to_word(mask_hw);
+      const uintptr_t geno_word_shifted = geno_word >> 1;
+      const uintptr_t geno_word_masked = geno_word & mask_word;
+      even_ct += popcount01_long(geno_word_masked);
+      odd_ct += popcount01_long(geno_word_shifted & mask_word);
+      bothset_ct += popcount01_long(geno_word_masked & geno_word_shifted);
+    }
+  }
+  genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
+  genocounts[1] = even_ct - bothset_ct;
+  genocounts[2] = odd_ct - bothset_ct;
+  genocounts[3] = bothset_ct;
+}
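+
+// Design note: rather than expanding the mask to two bits per sample, this
+// variant packs each genotype word's low (even) and high (odd) bits down to
+// one bit per sample with pack_word_to_halfword(), so a single mask word
+// covers two genotype words, and the if (mask_word) test skips fully
+// excluded stretches.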
+
+void genoarr_count_subset_intersect_freqs(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict subset1, const uintptr_t* __restrict subset2, uint32_t raw_sample_ct, uint32_t* genocounts) {
+  // {raw_}sample_ct == 0 ok.
+  const uint32_t raw_sample_ctl2 = QUATERCT_TO_WORDCT(raw_sample_ct);
+  const uint32_t fullword_ct = raw_sample_ctl2 / 2;
+  uint32_t subset_intersect_ct = 0;
+  uint32_t even_ct = 0;
+  uint32_t odd_ct = 0;
+  uint32_t bothset_ct = 0;
+  for (uint32_t widx = 0; widx < fullword_ct; ++widx) {
+    const uintptr_t mask_word = subset1[widx] & subset2[widx];
+    if (mask_word) {
+      uintptr_t geno_word = genoarr[2 * widx];
+      uintptr_t geno_even = pack_word_to_halfword(geno_word & kMask5555);
+      uintptr_t geno_odd = pack_word_to_halfword((geno_word >> 1) & kMask5555);
+      geno_word = genoarr[2 * widx + 1];
+      geno_even |= ((uintptr_t)pack_word_to_halfword(geno_word & kMask5555)) << kBitsPerWordD2;
+      geno_odd |= ((uintptr_t)pack_word_to_halfword((geno_word >> 1) & kMask5555)) << kBitsPerWordD2;
+      const uintptr_t geno_even_masked = geno_even & mask_word;
+      subset_intersect_ct += popcount_long(mask_word);
+      even_ct += popcount_long(geno_even_masked);
+      odd_ct += popcount_long(geno_odd & mask_word);
+      bothset_ct += popcount_long(geno_odd & geno_even_masked);
+    }
+  }
+  if (raw_sample_ctl2 % 2) {
+    const uintptr_t mask_hw = subset1[fullword_ct] & subset2[fullword_ct];
+    if (mask_hw) {
+      const uintptr_t geno_word = genoarr[fullword_ct * 2];
+      const uintptr_t mask_word = unpack_halfword_to_word(mask_hw);
+      const uintptr_t geno_word_shifted = geno_word >> 1;
+      const uintptr_t geno_word_masked = geno_word & mask_word;
+      subset_intersect_ct += popcount01_long(mask_word);
+      even_ct += popcount01_long(geno_word_masked);
+      odd_ct += popcount01_long(geno_word_shifted & mask_word);
+      bothset_ct += popcount01_long(geno_word_masked & geno_word_shifted);
+    }
+  }
+  genocounts[0] = subset_intersect_ct + bothset_ct - even_ct - odd_ct;
+  genocounts[1] = even_ct - bothset_ct;
+  genocounts[2] = odd_ct - bothset_ct;
+  genocounts[3] = bothset_ct;
+}
+
+void genovec_count_freqs(const uintptr_t* genovec, uint32_t sample_ct, uint32_t* genocounts) {
+  // ok to read trailing genovec bytes, but must mask them out
+  const uint32_t sample_ct_remainder = sample_ct % kBitsPerWordD2;
+  genovec_count_freqs_unsafe(genovec, sample_ct - sample_ct_remainder, genocounts);
+  if (sample_ct_remainder) {
+    uintptr_t cur_geno_word = genovec[sample_ct / kBitsPerWordD2] & ((k1LU << (2 * sample_ct_remainder)) - k1LU);
+    const uintptr_t cur_geno_word_high = kMask5555 & (cur_geno_word >> 1);
+    const uint32_t even_ct = popcount01_long(cur_geno_word & kMask5555);
+    const uint32_t odd_ct = popcount01_long(cur_geno_word_high);
+    const uint32_t bothset_ct = popcount01_long(cur_geno_word & cur_geno_word_high);
+    genocounts[0] += sample_ct_remainder + bothset_ct - even_ct - odd_ct;
+    genocounts[1] += even_ct - bothset_ct;
+    genocounts[2] += odd_ct - bothset_ct;
+    genocounts[3] += bothset_ct;
+  }
+}
+
+void genovec_invert_unsafe(uint32_t sample_ct, uintptr_t* genovec) {
+  // flips 0 to 2 and vice versa.
+  // "unsafe" because trailing bits are not zeroed out.
+  const uint32_t vec_ct = QUATERCT_TO_VECCT(sample_ct);
+  assert(IS_VEC_ALIGNED(genovec));
+  const vul_t not_m1 = VCONST_UL(kMaskAAAA);
+  vul_t* vptr = (vul_t*)genovec;
+  for (uint32_t vidx = 0; vidx < vec_ct; ++vidx) {
+    vul_t cur_vec = vptr[vidx];
+    // flip high bit iff low bit is unset
+    vptr[vidx] = cur_vec ^ ((~vul_lshift(cur_vec, 1)) & not_m1);
+  }
+}
+
+void genovec_invert_copy_unsafe(const uintptr_t* __restrict genovec, uint32_t sample_ct, uintptr_t* __restrict genovec_inverted_copy) {
+  // flips 0 to 2 and vice versa.
+  // "unsafe" because trailing bits are not zeroed out.
+  const uint32_t vec_ct = QUATERCT_TO_VECCT(sample_ct);
+  assert(IS_VEC_ALIGNED(genovec));
+  const vul_t not_m1 = VCONST_UL(kMaskAAAA);
+  const vul_t* vin_ptr = (const vul_t*)genovec;
+  vul_t* vout_ptr = (vul_t*)genovec_inverted_copy;
+  for (uint32_t vidx = 0; vidx < vec_ct; ++vidx) {
+    vul_t cur_vec = vin_ptr[vidx];
+    // flip high bit iff low bit is unset
+    vout_ptr[vidx] = cur_vec ^ ((~vul_lshift(cur_vec, 1)) & not_m1);
+  }
+}
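+
+// Truth table for the two invert functions above: 0b00 <-> 0b10, while 0b01
+// (het) and 0b11 (missing) map to themselves; (~vul_lshift(cur_vec, 1)) &
+// kMaskAAAA is set exactly at the high bit of each pair whose low bit is
+// clear, so the xor flips only the hom-ref/hom-alt codes.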
+
+void genovec_nonmissing_to_zero_unsafe(uint32_t sample_ct, uintptr_t* genovec) {
+  // sets 1 and 2 to zero; leaves 3s untouched.
+  const uint32_t vec_ct = QUATERCT_TO_VECCT(sample_ct);
+  assert(IS_VEC_ALIGNED(genovec));
+  const vul_t m1 = VCONST_UL(kMask5555);
+  vul_t* vptr = (vul_t*)genovec;
+  for (uint32_t vidx = 0; vidx < vec_ct; ++vidx) {
+    vul_t cur_vec = vptr[vidx];
+    vul_t cur_vec_rshifted = vul_rshift(cur_vec, 1);
+    cur_vec = cur_vec & m1;
+    cur_vec = cur_vec & cur_vec_rshifted;
+    vptr[vidx] = cur_vec | vul_lshift(cur_vec, 1);
+  }
+}
+
+void genovec_nonzero_to_missing_unsafe(uint32_t sample_ct, uintptr_t* genovec) {
+  // converts 1s and 2s to 3s, leaves zeroes untouched.
+  const uint32_t vec_ct = QUATERCT_TO_VECCT(sample_ct);
+  assert(IS_VEC_ALIGNED(genovec));
+  const vul_t m1 = VCONST_UL(kMask5555);
+  vul_t* vptr = (vul_t*)genovec;
+  for (uint32_t vidx = 0; vidx < vec_ct; ++vidx) {
+    vul_t cur_vec = vptr[vidx];
+    vul_t cur_vec_rshifted = vul_rshift(cur_vec, 1);
+    cur_vec = cur_vec | cur_vec_rshifted;
+    cur_vec = cur_vec & m1;
+    vptr[vidx] = cur_vec | vul_lshift(cur_vec, 1);
+  }
+}
+
+void difflist_count_subset_freqs(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t common_geno, uint32_t difflist_len, uint32_t sample_ct, uint32_t* genocounts) {
+  fill_uint_zero(4, genocounts);
+  uint32_t common_geno_ct = sample_ct;
+  for (uint32_t difflist_idx = 0; difflist_idx < difflist_len; ++difflist_idx) {
+    const uint32_t raw_sample_idx = difflist_sample_ids[difflist_idx];
+    if (IS_SET(sample_include, raw_sample_idx)) {
+      genocounts[GET_QUATERARR_ENTRY(raregeno, difflist_idx)] += 1;
+      --common_geno_ct;
+    }
+  }
+  genocounts[common_geno] = common_geno_ct;
+}
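+
+// Difflist refresher: raregeno/difflist_sample_ids enumerate only the samples
+// whose genotype differs from common_geno, so the subsetted common_geno count
+// starts at sample_ct and is decremented once per masked-in exception,
+// regardless of which genotype that exception carries.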
+
+
+static_assert(kPglQuaterTransposeBatch == ((uint32_t)kQuatersPerCacheline), "transpose_quaterblock() needs to be updated.");
+#ifdef __LP64__
+static_assert(kWordsPerVec == 2, "transpose_quaterblock() needs to be updated.");
+#else
+static_assert(kWordsPerVec == 1, "transpose_quaterblock() needs to be updated.");
+#endif
+void transpose_quaterblock(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, vul_t* vecaligned_buf) {
+  // buf must be vector-aligned and have size 32k
+  const uint32_t initial_read_byte_ct = QUATERCT_TO_BYTECT(write_batch_size);
+  // fold the first 6 shuffles into the initial ingestion loop
+  const unsigned char* initial_read_iter = (const unsigned char*)read_iter;
+  const unsigned char* initial_read_end = &(initial_read_iter[initial_read_byte_ct]);
+  unsigned char* initial_target_iter = (unsigned char*)vecaligned_buf;
+  const uint32_t read_byte_stride = read_ul_stride * kBytesPerWord;
+  const uint32_t read_batch_rem = kQuatersPerCacheline - read_batch_size;
+  for (; initial_read_iter < initial_read_end; ++initial_read_iter) {
+    const unsigned char* read_iter_tmp = initial_read_iter;
+    for (uint32_t ujj = 0; ujj < read_batch_size; ++ujj) {
+      *initial_target_iter++ = *read_iter_tmp;
+      read_iter_tmp = &(read_iter_tmp[read_byte_stride]);
+    }
+    initial_target_iter = memseta(initial_target_iter, 0, read_batch_rem);
+  }
+
+  // second-to-last shuffle, 8 bit spacing -> 4
+  const vul_t* source_iter = vecaligned_buf;
+  uintptr_t* target_iter0 = (uintptr_t*)(&(vecaligned_buf[kPglQuaterTransposeBufwords / (2 * kWordsPerVec)]));
+#ifdef __LP64__
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  const vul_t m8 = VCONST_UL(kMask00FF);
+  const vul_t m16 = VCONST_UL(kMask0000FFFF);
+#endif
+  const uint32_t write_word_ct = QUATERCT_TO_WORDCT(read_batch_size);
+  const uint32_t penult_inner_loop_iter_ct = 2 * write_word_ct;
+  const uint32_t cur_write_skip = 2 * kWordsPerCacheline - penult_inner_loop_iter_ct;
+  // coincidentally, this also needs to run DIV_UP(write_batch_size, 4) times
+  for (uint32_t uii = 0; uii < initial_read_byte_ct; ++uii) {
+    uintptr_t* target_iter1 = &(target_iter0[kWordsPerCacheline * 2]);
+    for (uint32_t ujj = 0; ujj < penult_inner_loop_iter_ct; ++ujj) {
+#ifdef __LP64__
+      const vul_t loader = *source_iter++;
+      vul_t target0 = loader & m4;
+      vul_t target1 = (vul_rshift(loader, 4)) & m4;
+      target0 = (target0 | (vul_rshift(target0, 4))) & m8;
+      target1 = (target1 | (vul_rshift(target1, 4))) & m8;
+      target0 = (target0 | (vul_rshift(target0, 8))) & m16;
+      target1 = (target1 | (vul_rshift(target1, 8))) & m16;
+      univec_t target0u;
+      univec_t target1u;
+      target0u.vi = target0 | (vul_rshift(target0, 16));
+      target1u.vi = target1 | (vul_rshift(target1, 16));
+      *target_iter0++ = ((uint32_t)target0u.u8[0]) | (target0u.u8[1] << 32);
+      *target_iter1++ = ((uint32_t)target1u.u8[0]) | (target1u.u8[1] << 32);
+#else
+      const uintptr_t source_word_lo = (uintptr_t)(*source_iter++);
+      const uintptr_t source_word_hi = (uintptr_t)(*source_iter++);
+      uintptr_t target_word0_lo = source_word_lo & kMask0F0F;
+      uintptr_t target_word1_lo = (source_word_lo >> 4) & kMask0F0F;
+      uintptr_t target_word0_hi = source_word_hi & kMask0F0F;
+      uintptr_t target_word1_hi = (source_word_hi >> 4) & kMask0F0F;
+      target_word0_lo = (target_word0_lo | (target_word0_lo >> 4)) & kMask00FF;
+      target_word1_lo = (target_word1_lo | (target_word1_lo >> 4)) & kMask00FF;
+      target_word0_hi = (target_word0_hi | (target_word0_hi >> 4)) & kMask00FF;
+      target_word1_hi = (target_word1_hi | (target_word1_hi >> 4)) & kMask00FF;
+      target_word0_lo = target_word0_lo | (target_word0_lo >> kBitsPerWordD4);
+      target_word1_lo = target_word1_lo | (target_word1_lo >> kBitsPerWordD4);
+      target_word0_hi = target_word0_hi | (target_word0_hi >> kBitsPerWordD4);
+      target_word1_hi = target_word1_hi | (target_word1_hi >> kBitsPerWordD4);
+      *target_iter0++ = ((halfword_t)target_word0_lo) | (target_word0_hi << kBitsPerWordD2);
+      *target_iter1++ = ((halfword_t)target_word1_lo) | (target_word1_hi << kBitsPerWordD2);
+#endif
+    }
+#ifdef __LP64__
+    source_iter = &(source_iter[cur_write_skip]);
+#else
+    source_iter = &(source_iter[2 * cur_write_skip]);
+#endif
+    target_iter0 = &(target_iter1[cur_write_skip]);
+  }
+
+  // last shuffle, 4 bit spacing -> 2
+  source_iter = (&(vecaligned_buf[kPglQuaterTransposeBufwords / (2 * kWordsPerVec)]));
+  target_iter0 = write_iter;
+#ifdef __LP64__
+  const vul_t m2 = VCONST_UL(kMask3333);
+#endif
+  const uint32_t last_loop_iter_ct = DIV_UP(write_batch_size, 2);
+  for (uint32_t uii = 0; uii < last_loop_iter_ct; ++uii) {
+    uintptr_t* target_iter1 = &(target_iter0[write_ul_stride]);
+    for (uint32_t ujj = 0; ujj < write_word_ct; ++ujj) {
+#ifdef __LP64__
+      // in AVX2 case, use write_dword_ct instead of write_word_ct, etc.
+      const vul_t loader = *source_iter++;
+      vul_t target0 = loader & m2;
+      vul_t target1 = (vul_rshift(loader, 2)) & m2;
+      target0 = (target0 | (vul_rshift(target0, 2))) & m4;
+      target1 = (target1 | (vul_rshift(target1, 2))) & m4;
+      target0 = (target0 | (vul_rshift(target0, 4))) & m8;
+      target1 = (target1 | (vul_rshift(target1, 4))) & m8;
+      target0 = (target0 | (vul_rshift(target0, 8))) & m16;
+      target1 = (target1 | (vul_rshift(target1, 8))) & m16;
+      univec_t target0u;
+      univec_t target1u;
+      target0u.vi = target0 | (vul_rshift(target0, 16));
+      target1u.vi = target1 | (vul_rshift(target1, 16));
+      target_iter0[ujj] = ((uint32_t)target0u.u8[0]) | (target0u.u8[1] << 32);
+      target_iter1[ujj] = ((uint32_t)target1u.u8[0]) | (target1u.u8[1] << 32);
+#else
+      const uintptr_t source_word_lo = (uintptr_t)(*source_iter++);
+      const uintptr_t source_word_hi = (uintptr_t)(*source_iter++);
+      uintptr_t target_word0_lo = source_word_lo & kMask3333;
+      uintptr_t target_word1_lo = (source_word_lo >> 2) & kMask3333;
+      uintptr_t target_word0_hi = source_word_hi & kMask3333;
+      uintptr_t target_word1_hi = (source_word_hi >> 2) & kMask3333;
+      target_word0_lo = (target_word0_lo | (target_word0_lo >> 2)) & kMask0F0F;
+      target_word1_lo = (target_word1_lo | (target_word1_lo >> 2)) & kMask0F0F;
+      target_word0_hi = (target_word0_hi | (target_word0_hi >> 2)) & kMask0F0F;
+      target_word1_hi = (target_word1_hi | (target_word1_hi >> 2)) & kMask0F0F;
+      target_word0_lo = (target_word0_lo | (target_word0_lo >> 4)) & kMask00FF;
+      target_word1_lo = (target_word1_lo | (target_word1_lo >> 4)) & kMask00FF;
+      target_word0_hi = (target_word0_hi | (target_word0_hi >> 4)) & kMask00FF;
+      target_word1_hi = (target_word1_hi | (target_word1_hi >> 4)) & kMask00FF;
+      target_word0_lo = target_word0_lo | (target_word0_lo >> kBitsPerWordD4);
+      target_word1_lo = target_word1_lo | (target_word1_lo >> kBitsPerWordD4);
+      target_word0_hi = target_word0_hi | (target_word0_hi >> kBitsPerWordD4);
+      target_word1_hi = target_word1_hi | (target_word1_hi >> kBitsPerWordD4);
+      target_iter0[ujj] = ((halfword_t)target_word0_lo) | (target_word0_hi << kBitsPerWordD2);
+      target_iter1[ujj] = ((halfword_t)target_word1_lo) | (target_word1_hi << kBitsPerWordD2);
+#endif
+    }
+#ifdef __LP64__
+    source_iter = &(source_iter[kWordsPerCacheline - write_word_ct]);
+#else
+    source_iter = &(source_iter[2 * (kWordsPerCacheline - write_word_ct)]);
+#endif
+    target_iter0 = &(target_iter1[write_ul_stride]);
+  }
+}
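+
+// Staging summary for transpose_quaterblock(): the byte-granularity gather at
+// the top stands in for the first shuffles (one byte = four 2-bit entries per
+// source row), and each later pass halves the spacing between the bits
+// destined for a single output row (8-bit spacing -> 4, then 4 -> 2), using
+// the second half of vecaligned_buf as the intermediate staging area.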
+
+static_assert(kPglBitTransposeBatch == ((uint32_t)kBitsPerCacheline), "transpose_bitblock() needs to be updated.");
+#ifdef __LP64__
+static_assert(kWordsPerVec == 2, "transpose_bitblock() needs to be updated.");
+#else
+static_assert(kWordsPerVec == 1, "transpose_bitblock() needs to be updated.");
+#endif
+void transpose_bitblock(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, vul_t* vecaligned_buf) {
+  // buf must be vector-aligned and have size 64k
+  const uint32_t initial_read_byte_ct = DIV_UP(write_batch_size, CHAR_BIT);
+  // fold the first 6 shuffles into the initial ingestion loop
+  const unsigned char* initial_read_iter = (const unsigned char*)read_iter;
+  const unsigned char* initial_read_end = &(initial_read_iter[initial_read_byte_ct]);
+  unsigned char* initial_target_iter = (unsigned char*)vecaligned_buf;
+  const uint32_t read_byte_stride = read_ul_stride * kBytesPerWord;
+  const uint32_t read_batch_rem = kBitsPerCacheline - read_batch_size;
+  for (; initial_read_iter < initial_read_end; ++initial_read_iter) {
+    const unsigned char* read_iter_tmp = initial_read_iter;
+    for (uint32_t ujj = 0; ujj < read_batch_size; ++ujj) {
+      *initial_target_iter++ = *read_iter_tmp;
+      read_iter_tmp = &(read_iter_tmp[read_byte_stride]);
+    }
+    initial_target_iter = memseta(initial_target_iter, 0, read_batch_rem);
+  }
+
+  // third-to-last shuffle, 8 bit spacing -> 4
+  const vul_t* source_iter = vecaligned_buf;
+  uintptr_t* target_iter0 = (uintptr_t*)(&(vecaligned_buf[kPglBitTransposeBufwords / (2 * kWordsPerVec)]));
+#ifdef __LP64__
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  const vul_t m8 = VCONST_UL(kMask00FF);
+  const vul_t m16 = VCONST_UL(kMask0000FFFF);
+#endif
+  const uint32_t write_word_ct = BITCT_TO_WORDCT(read_batch_size);
+  const uint32_t first_inner_loop_iter_ct = 4 * write_word_ct;
+  uint32_t cur_write_skip = 4 * kWordsPerCacheline - first_inner_loop_iter_ct;
+  // coincidentally, this also needs to run DIV_UP(write_batch_size, CHAR_BIT)
+  // times
+  for (uint32_t uii = 0; uii < initial_read_byte_ct; ++uii) {
+    uintptr_t* target_iter1 = &(target_iter0[kWordsPerCacheline * 4]);
+    for (uint32_t ujj = 0; ujj < first_inner_loop_iter_ct; ++ujj) {
+#ifdef __LP64__
+      const vul_t loader = *source_iter++;
+      vul_t target0 = loader & m4;
+      vul_t target1 = (vul_rshift(loader, 4)) & m4;
+      target0 = (target0 | (vul_rshift(target0, 4))) & m8;
+      target1 = (target1 | (vul_rshift(target1, 4))) & m8;
+      target0 = (target0 | (vul_rshift(target0, 8))) & m16;
+      target1 = (target1 | (vul_rshift(target1, 8))) & m16;
+      univec_t target0u;
+      univec_t target1u;
+      target0u.vi = target0 | (vul_rshift(target0, 16));
+      target1u.vi = target1 | (vul_rshift(target1, 16));
+      *target_iter0++ = ((uint32_t)target0u.u8[0]) | (target0u.u8[1] << 32);
+      *target_iter1++ = ((uint32_t)target1u.u8[0]) | (target1u.u8[1] << 32);
+#else
+      const uintptr_t source_word_lo = (uintptr_t)(*source_iter++);
+      const uintptr_t source_word_hi = (uintptr_t)(*source_iter++);
+      uintptr_t target_word0_lo = source_word_lo & kMask0F0F;
+      uintptr_t target_word1_lo = (source_word_lo >> 4) & kMask0F0F;
+      uintptr_t target_word0_hi = source_word_hi & kMask0F0F;
+      uintptr_t target_word1_hi = (source_word_hi >> 4) & kMask0F0F;
+      target_word0_lo = (target_word0_lo | (target_word0_lo >> 4)) & kMask00FF;
+      target_word1_lo = (target_word1_lo | (target_word1_lo >> 4)) & kMask00FF;
+      target_word0_hi = (target_word0_hi | (target_word0_hi >> 4)) & kMask00FF;
+      target_word1_hi = (target_word1_hi | (target_word1_hi >> 4)) & kMask00FF;
+      target_word0_lo = target_word0_lo | (target_word0_lo >> kBitsPerWordD4);
+      target_word1_lo = target_word1_lo | (target_word1_lo >> kBitsPerWordD4);
+      target_word0_hi = target_word0_hi | (target_word0_hi >> kBitsPerWordD4);
+      target_word1_hi = target_word1_hi | (target_word1_hi >> kBitsPerWordD4);
+      *target_iter0++ = ((halfword_t)target_word0_lo) | (target_word0_hi << kBitsPerWordD2);
+      *target_iter1++ = ((halfword_t)target_word1_lo) | (target_word1_hi << kBitsPerWordD2);
+#endif
+    }
+#ifdef __LP64__
+    source_iter = &(source_iter[cur_write_skip]);
+#else
+    source_iter = &(source_iter[2 * cur_write_skip]);
+#endif
+    target_iter0 = &(target_iter1[cur_write_skip]);
+  }
+
+  // second-to-last shuffle, 4 bit spacing -> 2
+  source_iter = (&(vecaligned_buf[kPglBitTransposeBufwords / (2 * kWordsPerVec)]));
+  target_iter0 = (uintptr_t*)vecaligned_buf;
+#ifdef __LP64__
+  const vul_t m2 = VCONST_UL(kMask3333);
+#endif
+  const uint32_t second_outer_loop_iter_ct = DIV_UP(write_batch_size, 4);
+  const uint32_t second_inner_loop_iter_ct = 2 * write_word_ct;
+  cur_write_skip = 2 * kWordsPerCacheline - second_inner_loop_iter_ct;
+  for (uint32_t uii = 0; uii < second_outer_loop_iter_ct; ++uii) {
+    uintptr_t* target_iter1 = &(target_iter0[kWordsPerCacheline * 2]);
+    for (uint32_t ujj = 0; ujj < second_inner_loop_iter_ct; ++ujj) {
+#ifdef __LP64__
+      // in AVX2 case, use write_dword_ct instead of write_word_ct, etc.
+      const vul_t loader = *source_iter++;
+      vul_t target0 = loader & m2;
+      vul_t target1 = (vul_rshift(loader, 2)) & m2;
+      target0 = (target0 | (vul_rshift(target0, 2))) & m4;
+      target1 = (target1 | (vul_rshift(target1, 2))) & m4;
+      target0 = (target0 | (vul_rshift(target0, 4))) & m8;
+      target1 = (target1 | (vul_rshift(target1, 4))) & m8;
+      target0 = (target0 | (vul_rshift(target0, 8))) & m16;
+      target1 = (target1 | (vul_rshift(target1, 8))) & m16;
+      univec_t target0u;
+      univec_t target1u;
+      target0u.vi = target0 | (vul_rshift(target0, 16));
+      target1u.vi = target1 | (vul_rshift(target1, 16));
+      *target_iter0++ = ((uint32_t)target0u.u8[0]) | (target0u.u8[1] << 32);
+      *target_iter1++ = ((uint32_t)target1u.u8[0]) | (target1u.u8[1] << 32);
+#else
+      const uintptr_t source_word_lo = (uintptr_t)(*source_iter++);
+      const uintptr_t source_word_hi = (uintptr_t)(*source_iter++);
+      uintptr_t target_word0_lo = source_word_lo & kMask3333;
+      uintptr_t target_word1_lo = (source_word_lo >> 2) & kMask3333;
+      uintptr_t target_word0_hi = source_word_hi & kMask3333;
+      uintptr_t target_word1_hi = (source_word_hi >> 2) & kMask3333;
+      target_word0_lo = (target_word0_lo | (target_word0_lo >> 2)) & kMask0F0F;
+      target_word1_lo = (target_word1_lo | (target_word1_lo >> 2)) & kMask0F0F;
+      target_word0_hi = (target_word0_hi | (target_word0_hi >> 2)) & kMask0F0F;
+      target_word1_hi = (target_word1_hi | (target_word1_hi >> 2)) & kMask0F0F;
+      target_word0_lo = (target_word0_lo | (target_word0_lo >> 4)) & kMask00FF;
+      target_word1_lo = (target_word1_lo | (target_word1_lo >> 4)) & kMask00FF;
+      target_word0_hi = (target_word0_hi | (target_word0_hi >> 4)) & kMask00FF;
+      target_word1_hi = (target_word1_hi | (target_word1_hi >> 4)) & kMask00FF;
+      target_word0_lo = target_word0_lo | (target_word0_lo >> kBitsPerWordD4);
+      target_word1_lo = target_word1_lo | (target_word1_lo >> kBitsPerWordD4);
+      target_word0_hi = target_word0_hi | (target_word0_hi >> kBitsPerWordD4);
+      target_word1_hi = target_word1_hi | (target_word1_hi >> kBitsPerWordD4);
+      *target_iter0++ = ((halfword_t)target_word0_lo) | (target_word0_hi << kBitsPerWordD2);
+      *target_iter1++ = ((halfword_t)target_word1_lo) | (target_word1_hi << kBitsPerWordD2);
+#endif
+    }
+#ifdef __LP64__
+    source_iter = &(source_iter[cur_write_skip]);
+#else
+    source_iter = &(source_iter[2 * cur_write_skip]);
+#endif
+    target_iter0 = &(target_iter1[cur_write_skip]);
+  }
+  // last shuffle, 2 bit spacing -> 1
+  source_iter = vecaligned_buf;
+  target_iter0 = write_iter;
+#ifdef __LP64__
+  const vul_t m1 = VCONST_UL(kMask5555);
+#endif
+  const uint32_t last_loop_iter_ct = DIV_UP(write_batch_size, 2);
+  for (uint32_t uii = 0; uii < last_loop_iter_ct; ++uii) {
+    uintptr_t* target_iter1 = &(target_iter0[write_ul_stride]);
+    for (uint32_t ujj = 0; ujj < write_word_ct; ++ujj) {
+#ifdef __LP64__
+      // in AVX2 case, use write_dword_ct instead of write_word_ct, etc.
+      const vul_t loader = *source_iter++;
+      vul_t target0 = loader & m1;
+      vul_t target1 = (vul_rshift(loader, 1)) & m1;
+      target0 = (target0 | (vul_rshift(target0, 1))) & m2;
+      target1 = (target1 | (vul_rshift(target1, 1))) & m2;
+      target0 = (target0 | (vul_rshift(target0, 2))) & m4;
+      target1 = (target1 | (vul_rshift(target1, 2))) & m4;
+      target0 = (target0 | (vul_rshift(target0, 4))) & m8;
+      target1 = (target1 | (vul_rshift(target1, 4))) & m8;
+      target0 = (target0 | (vul_rshift(target0, 8))) & m16;
+      target1 = (target1 | (vul_rshift(target1, 8))) & m16;
+      univec_t target0u;
+      univec_t target1u;
+      target0u.vi = target0 | (vul_rshift(target0, 16));
+      target1u.vi = target1 | (vul_rshift(target1, 16));
+      target_iter0[ujj] = ((uint32_t)target0u.u8[0]) | (target0u.u8[1] << 32);
+      target_iter1[ujj] = ((uint32_t)target1u.u8[0]) | (target1u.u8[1] << 32);
+#else
+      const uintptr_t source_word_lo = (uintptr_t)(*source_iter++);
+      const uintptr_t source_word_hi = (uintptr_t)(*source_iter++);
+      uintptr_t target_word0_lo = source_word_lo & kMask5555;
+      uintptr_t target_word1_lo = (source_word_lo >> 1) & kMask5555;
+      uintptr_t target_word0_hi = source_word_hi & kMask5555;
+      uintptr_t target_word1_hi = (source_word_hi >> 1) & kMask5555;
+      target_word0_lo = (target_word0_lo | (target_word0_lo >> 1)) & kMask3333;
+      target_word1_lo = (target_word1_lo | (target_word1_lo >> 1)) & kMask3333;
+      target_word0_hi = (target_word0_hi | (target_word0_hi >> 1)) & kMask3333;
+      target_word1_hi = (target_word1_hi | (target_word1_hi >> 1)) & kMask3333;
+      target_word0_lo = (target_word0_lo | (target_word0_lo >> 2)) & kMask0F0F;
+      target_word1_lo = (target_word1_lo | (target_word1_lo >> 2)) & kMask0F0F;
+      target_word0_hi = (target_word0_hi | (target_word0_hi >> 2)) & kMask0F0F;
+      target_word1_hi = (target_word1_hi | (target_word1_hi >> 2)) & kMask0F0F;
+      target_word0_lo = (target_word0_lo | (target_word0_lo >> 4)) & kMask00FF;
+      target_word1_lo = (target_word1_lo | (target_word1_lo >> 4)) & kMask00FF;
+      target_word0_hi = (target_word0_hi | (target_word0_hi >> 4)) & kMask00FF;
+      target_word1_hi = (target_word1_hi | (target_word1_hi >> 4)) & kMask00FF;
+      target_word0_lo = target_word0_lo | (target_word0_lo >> kBitsPerWordD4);
+      target_word1_lo = target_word1_lo | (target_word1_lo >> kBitsPerWordD4);
+      target_word0_hi = target_word0_hi | (target_word0_hi >> kBitsPerWordD4);
+      target_word1_hi = target_word1_hi | (target_word1_hi >> kBitsPerWordD4);
+      target_iter0[ujj] = ((halfword_t)target_word0_lo) | (target_word0_hi << kBitsPerWordD2);
+      target_iter1[ujj] = ((halfword_t)target_word1_lo) | (target_word1_hi << kBitsPerWordD2);
+#endif
+    }
+#ifdef __LP64__
+    source_iter = &(source_iter[kWordsPerCacheline - write_word_ct]);
+#else
+    source_iter = &(source_iter[2 * (kWordsPerCacheline - write_word_ct)]);
+#endif
+    target_iter0 = &(target_iter1[write_ul_stride]);
+  }
+}
+
+void biallelic_dosage16_invert(uint32_t dosage_ct, uint16_t* dosage_vals) {
+  // replace each x with (32768 - x).
+  // uses vector operations, but does not require dosage_vals to be
+  // vec-aligned.
+  const vul_t subvec = VCONST_UL(32768 * kMask0001);
+  const uint32_t lead_usi_ct = (((uintptr_t)(-((uintptr_t)dosage_vals))) % kBytesPerVec) / sizeof(int16_t);
+  if (dosage_ct >= lead_usi_ct) {
+    for (uint32_t uii = 0; uii < lead_usi_ct; ++uii) {
+      *dosage_vals = 32768 - (*dosage_vals);
+      ++dosage_vals;
+    }
+    dosage_ct -= lead_usi_ct;
+    const uint32_t vec_ct = dosage_ct / (kBytesPerVec / sizeof(int16_t));
+    vul_t* dosage_vals_iter = (vul_t*)dosage_vals;
+    for (uint32_t vec_idx = 0; vec_idx < vec_ct; ++vec_idx) {
+      const vul_t cur_vec = *dosage_vals_iter;
+      *dosage_vals_iter++ = subvec - cur_vec;
+    }
+    dosage_vals = &(dosage_vals[vec_ct * (kBytesPerVec / sizeof(int16_t))]);
+    dosage_ct -= vec_ct * (kBytesPerVec / sizeof(int16_t));
+  }
+  for (uint32_t uii = 0; uii < dosage_ct; ++uii) {
+    dosage_vals[uii] = 32768 - dosage_vals[uii];
+  }
+}
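+// Example: under the 0..32768 dosage encoding (16384 == 1.0, 32768 == 2.0),
+// a dosage of 4096 (0.25) becomes 32768 - 4096 == 28672 (1.75); the
+// subtraction swaps the roles of the REF and ALT alleles.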
+
+void genovec_to_missingness_unsafe(const uintptr_t* __restrict genovec, uint32_t sample_ct, uintptr_t* __restrict missingness) {
+  const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+  halfword_t* missingness_alias = (halfword_t*)missingness;
+  for (uint32_t widx = 0; widx < sample_ctl2; ++widx) {
+    const uintptr_t cur_geno_word = genovec[widx];
+    missingness_alias[widx] = pack_word_to_halfword(cur_geno_word & (cur_geno_word >> 1) & kMask5555);
+  }
+  if (sample_ctl2 % 2) {
+    missingness_alias[sample_ctl2] = 0;
+  }
+}
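+// Example: for the packed genotype word 0b11100100 (entries 0b00, 0b01,
+// 0b10, 0b11 from low to high), w & (w >> 1) & kMask5555 == 0b01000000,
+// flagging only the 0b11 (missing) entry; pack_word_to_halfword() then
+// squeezes these per-entry flags down to one bit per sample.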
+
+
+void pgfi_preinit(pgen_file_info_t* pgfip) {
+  pgfip->shared_ff = nullptr;
+  pgfip->block_base = nullptr;
+}
+
+uint32_t count_pgfi_alloc_cachelines_required(uint32_t raw_variant_ct) {
+  // assumes variable-width variant records, otherwise pgfi.vrtypes and
+  // pgfi.var_fpos can just be nullptr.
+  
+  // vrtypes: 1 byte per entry, (raw_variant_ct + 1) entries
+  uint32_t cachelines_required = 1 + (raw_variant_ct / kCacheline);
+
+  // var_fpos: 8 bytes per entry, (raw_variant_ct + 1) entries
+  cachelines_required += 1 + (raw_variant_ct / kInt64PerCacheline);
+  return cachelines_required;
+}
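+// Example (assuming kCacheline == 64 and kInt64PerCacheline == 8):
+// raw_variant_ct == 100000 needs 1 + 100000/64 == 1563 vrtype cachelines
+// plus 1 + 100000/8 == 12501 var_fpos cachelines, 14064 in total.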
+
+uint32_t count_pgr_alloc_cachelines_required(uint32_t raw_sample_ct, pgen_global_flags_t gflags, uint32_t max_alt_allele_ct, uint32_t fread_buf_byte_ct) {
+  // workspace_vec: always needed, 2 bits per entry, up to raw_sample_ct
+  // entries
+  const uint32_t genovec_cacheline_req = QUATERCT_TO_CLCT(raw_sample_ct);
+  const uint32_t bitvec_cacheline_req = BITCT_TO_CLCT(raw_sample_ct);
+  uint32_t cachelines_required = genovec_cacheline_req;
+  // fread_buf.  fread_buf_byte_ct should be zero if mmap() is being used.
+  // DIV_UP() won't overflow since fread_buf_byte_ct requirement can't exceed
+  // kPglMaxBytesPerVariant, which is sufficiently far from 2^32.
+  cachelines_required += DIV_UP(fread_buf_byte_ct, kCacheline);
+
+  const uint32_t ld_compression_present = (gflags / kfPgenGlobalLdCompressionPresent) & 1;
+  if (gflags & kfPgenGlobalDifflistOrLdPresent) {
+    const uint32_t max_difflist_entry_ct_base = (raw_sample_ct / kPglMaxDifflistLenDivisor);
+    // const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present);
+    // workspace_raregeno_vec
+    cachelines_required += QUATERCT_TO_CLCT(max_difflist_entry_ct_base);
+    
+    // workspace_difflist_sample_ids
+    // bugfix: must add 1 since several routines add a terminator element
+    cachelines_required += 1 + (max_difflist_entry_ct_base / kInt32PerCacheline);
+
+    // workspace_raregeno_tmp_loadbuf
+    cachelines_required += QUATERCT_TO_CLCT(max_difflist_entry_ct_base);
+    
+    // workspace_difflist_sample_ids_tmp
+    cachelines_required += 1 + (max_difflist_entry_ct_base / kInt32PerCacheline);
+
+    if (ld_compression_present) {
+      // ldbase_genovec
+      cachelines_required += genovec_cacheline_req;
+      
+      // ldbase_raregeno
+      cachelines_required += QUATERCT_TO_CLCT(max_difflist_entry_ct_base);
+      
+      // ldbase_difflist_sample_ids
+      cachelines_required += 1 + (max_difflist_entry_ct_base / kInt32PerCacheline);
+    }
+  }
+  if (max_alt_allele_ct > 1) {
+    // workspace_aux1_nonmissing_vec
+    cachelines_required += bitvec_cacheline_req;
+    
+    // workspace_aux1_code_vec
+    // prepare for worst-case scenario for now.  todo: use loaded variant
+    // record lengths to bound this when appropriate
+    uintptr_t aux1_allele_bytect = get_aux1_allele_bytect(max_alt_allele_ct, raw_sample_ct);
+    if (aux1_allele_bytect > kPglMaxBytesPerVariant) {
+      // but assume the file isn't actually invalid
+      aux1_allele_bytect = kPglMaxBytesPerVariant;
+    }
+    
+    cachelines_required += DIV_UP(aux1_allele_bytect, kCacheline);
+    
+    // workspace_ambig_sample_ids
+    cachelines_required += INT32CT_TO_CLCT(raw_sample_ct);
+  }
+  if (gflags & kfPgenGlobalHardcallPhasePresent) {
+    // workspace_all_hets, possibly ldbase_all_hets
+    cachelines_required += bitvec_cacheline_req * (1 + ld_compression_present);
+  }
+  if (gflags & kfPgenGlobalDosagePresent) {
+    // aux track #3: usually bitarray tracking which samples have dosage info
+    // (may be stored on disk as a dosage list)
+    cachelines_required += bitvec_cacheline_req;
+    if (gflags & kfPgenGlobalDosagePhasePresent) {
+      // aux track #4: bitarray tracking which dosage entries are phased
+      cachelines_required += bitvec_cacheline_req;
+      
+      // phased aux track #5: max_alt_allele_ct * 4 bytes per sample
+      // (commented out since caller always provides this buffer for now)
+      // cachelines_required += DIV_UP(max_alt_allele_ct * 4 * k1LU * raw_sample_ct, kCacheline);
+    }
+    // unphased aux track #5: max_alt_allele_ct * 2 bytes per sample
+    // cachelines_required += DIV_UP(max_alt_allele_ct * 2 * k1LU * raw_sample_ct, kCacheline);
+  }
+  return cachelines_required;
+}
+
+static_assert(kPglMaxAltAlleleCt == 254, "Need to update pgfi_init_phase1().");
+pglerr_t pgfi_init_phase1(const char* fname, uint32_t raw_variant_ct, uint32_t raw_sample_ct, uint32_t use_mmap, pgen_header_ctrl_t* header_ctrl_ptr, pgen_file_info_t* pgfip, uintptr_t* pgfi_alloc_cacheline_ct_ptr, char* errstr_buf) {
+  pgfip->var_fpos = nullptr;
+  pgfip->vrtypes = nullptr;
+  pgfip->allele_idx_offsets = nullptr;
+  pgfip->nonref_flags = nullptr;
+
+  pgfip->max_alt_allele_ct = 1;
+  pgfip->max_dosage_alt_allele_ct = 0;
+
+  pgfip->block_base = nullptr;
+  // this should force overflow when value is uninitialized.
+  pgfip->block_offset = 1LLU << 63;
+  
+  uint64_t fsize;
+  const unsigned char* fread_ptr;
+  FILE* shared_ff = nullptr;
+  if (use_mmap) {
+    pgfip->shared_ff = nullptr; // this must be initialized before block_base
+#ifdef NO_MMAP
+    strcpy(errstr_buf, "Error: pgfi_init_phase1() use_mmap parameter is nonzero, but pgenlib was not compiled with mmap support.\n");
+    return kPglRetImproperFunctionCall;
+#else
+    int32_t file_handle = open(fname, O_RDONLY);
+    if (file_handle < 0) {
+      sprintf(errstr_buf, "Error: Failed to open %s.\n", fname);
+      return kPglRetOpenFail;
+    }
+    struct stat statbuf;
+    if (fstat(file_handle, &statbuf) < 0) {
+      sprintf(errstr_buf, "Error: Failed to open %s.\n", fname);
+      return kPglRetOpenFail;
+    }
+    fsize = statbuf.st_size;
+    pgfip->block_offset = 0;
+    pgfip->file_size = fsize;
+    pgfip->block_base = (const unsigned char*)mmap(0, pgfip->file_size, PROT_READ, MAP_SHARED, file_handle, 0);
+    if (((uintptr_t)pgfip->block_base) == (~k0LU)) {
+      pgfip->block_base = nullptr;
+      strcpy(errstr_buf, "Error: File read failure.\n");
+      return kPglRetReadFail;
+    }
+    // this provided less than a ~5% boost on OS X; mmap still took >80% longer
+    // than fread on an 85GB file there
+    // try MAP_POPULATE on Linux?
+    // madvise((unsigned char*)(pgfip->block_base), fsize, MADV_SEQUENTIAL);
+    close(file_handle);
+    if (fsize < 3) {
+      sprintf(errstr_buf, "Error: %s is too small to be a valid .pgen file.\n", fname);
+      return kPglRetMalformedInput;
+    }
+    fread_ptr = pgfip->block_base;
+#endif
+  } else {
+    shared_ff = fopen(fname, FOPEN_RB);
+    pgfip->shared_ff = shared_ff;
+    if (!shared_ff) {
+      sprintf(errstr_buf, "Error: Failed to open %s.\n", fname);
+      return kPglRetOpenFail;
+    }
+    if (fseeko(shared_ff, 0, SEEK_END)) {
+      strcpy(errstr_buf, "Error: File read failure.\n");
+      return kPglRetReadFail;
+    }
+    fsize = ftello(shared_ff);
+    if (fsize < 3) {
+      sprintf(errstr_buf, "Error: %s is too small to be a valid .pgen file.\n", fname);
+      return kPglRetMalformedInput;
+    }
+    rewind(shared_ff);
+    unsigned char small_readbuf[3];
+    if (!fread(small_readbuf, 3, 1, shared_ff)) {
+      strcpy(errstr_buf, "Error: File read failure.\n");
+      return kPglRetReadFail;
+    }
+    fread_ptr = small_readbuf;
+  }
+  if (memcmp(fread_ptr, "l\x1b", 2)) {
+    sprintf(errstr_buf, "Error: %s is not a .pgen file (first two bytes don't match the magic number).\n", fname);
+    return kPglRetMalformedInput;
+  }
+  const uint32_t file_type_code = fread_ptr[2];
+  *header_ctrl_ptr = 0;
+  if (file_type_code < 2) {
+    // plink 1 binary
+    if (!file_type_code) {
+      // sample-major.  validate file size here so we don't have to recheck it
+      if ((raw_sample_ct != 0xffffffffU) && (raw_variant_ct != 0xffffffffU)) {
+	const uint64_t fsize_expected = 3 + ((uint64_t)raw_sample_ct) * QUATERCT_TO_BYTECT(raw_variant_ct);
+	if (fsize != fsize_expected) {
+	  sprintf(errstr_buf, "Error: Unexpected PLINK 1 sample-major .bed file size (%" PRIu64 " bytes expected).\n", fsize_expected);
+	  return kPglRetMalformedInput;
+	}
+      }
+      strcpy(errstr_buf, "Error: pgenlib does not support sample-major PLINK 1 .bed files.\n");
+      return kPglRetSampleMajorBed;
+    }
+    if (raw_sample_ct == 0xffffffffU) {
+      // either .fam must be loaded first, or user must provide sample count
+      sprintf(errstr_buf, "Error: pgen_init_phase1() must be called with an accurate raw_sample_ct value, since %s is a PLINK 1 .bed file.\n", fname);
+      return kPglRetImproperFunctionCall;
+    }
+    const uint32_t const_vrec_width = QUATERCT_TO_BYTECT(raw_sample_ct);
+    if (raw_variant_ct == 0xffffffffU) {
+      if (!raw_sample_ct) {
+	raw_variant_ct = 0;
+      } else {
+	// allow raw_variant_ct to be inferred
+	uint64_t quotient = (fsize - 3) / const_vrec_width;
+        if ((quotient > 0x7fffffffU) || (quotient * const_vrec_width + 3 != fsize)) {
+          sprintf(errstr_buf, "Error: Unexpected PLINK 1 .bed file size (since raw_sample_ct was %u, [file size - 3] should be divisible by %u and the quotient should be smaller than 2^31).\n", raw_sample_ct, const_vrec_width);
+	  return kPglRetMalformedInput;
+	}
+	raw_variant_ct = (uint32_t)quotient;
+      }
+    } else {
+      if (((uint64_t)raw_variant_ct) * const_vrec_width + 3 != fsize) {
+        sprintf(errstr_buf, "Error: Unexpected PLINK 1 .bed file size (expected %" PRIu64 " bytes).\n", ((uint64_t)raw_variant_ct) * const_vrec_width + 3);
+	return kPglRetMalformedInput;
+      }
+    }
+    pgfip->raw_variant_ct = raw_variant_ct;
+    pgfip->raw_sample_ct = raw_sample_ct;
+    pgfip->const_fpos_offset = 3;
+
+    pgfip->const_vrtype = kPglVrtypePlink1;
+    pgfip->const_vrec_width = const_vrec_width;
+    pgfip->gflags = kfPgenGlobalAllNonref;
+    *pgfi_alloc_cacheline_ct_ptr = 0;
+    return kPglRetSuccess;
+  }
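+  // Example of the size checks above: with raw_sample_ct == 1000,
+  // const_vrec_width == QUATERCT_TO_BYTECT(1000) == 250, so a variant-major
+  // .bed with 5000 variants must be exactly 3 + 5000 * 250 == 1250003 bytes.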
+  
+  if (fsize < 12) {
+    sprintf(errstr_buf, "Error: %s is too small to be a valid .pgen file.\n", fname);
+    return kPglRetMalformedInput;
+  }
+#ifndef NO_MMAP
+  if (use_mmap) {
+    memcpy(&(pgfip->raw_variant_ct), &(fread_ptr[3]), sizeof(int32_t));
+    memcpy(&(pgfip->raw_sample_ct), &(fread_ptr[7]), sizeof(int32_t));
+    memcpy(header_ctrl_ptr, &(fread_ptr[11]), 1);
+  } else {
+#endif
+    if ((!fread(&(pgfip->raw_variant_ct), sizeof(int32_t), 1, shared_ff)) ||
+	(!fread(&(pgfip->raw_sample_ct), sizeof(int32_t), 1, shared_ff)) ||
+	(!fread(header_ctrl_ptr, 1, 1, shared_ff))) {
+      strcpy(errstr_buf, "Error: File read failure.\n");
+      return kPglRetReadFail;
+    }
+#ifndef NO_MMAP
+  }
+#endif
+  pgen_header_ctrl_t header_ctrl = *header_ctrl_ptr;
+  if (raw_variant_ct == 0xffffffffU) {
+    raw_variant_ct = pgfip->raw_variant_ct;
+  } else if (raw_variant_ct != pgfip->raw_variant_ct) {
+    sprintf(errstr_buf, "Error: pgen_init_phase1() was called with raw_variant_ct == %u, but %s contains %u variant%s.\n", raw_variant_ct, fname, pgfip->raw_variant_ct, (pgfip->raw_variant_ct == 1)? "" : "s");
+    return kPglRetInconsistentInput;
+  }
+  if (raw_sample_ct == 0xffffffffU) {
+    raw_sample_ct = pgfip->raw_sample_ct;
+  } else if (raw_sample_ct != pgfip->raw_sample_ct) {
+    sprintf(errstr_buf, "Error: pgen_init_phase1() was called with raw_sample_ct == %u, but %s contains %u sample%s.\n", raw_sample_ct, fname, pgfip->raw_sample_ct, (pgfip->raw_sample_ct == 1)? "" : "s");
+    return kPglRetInconsistentInput;
+  }
+  pgfip->gflags = kfPgenGlobal0;
+  pgfip->const_fpos_offset = 12;
+
+  // explicit storage of "is this reference allele untrusted?"
+  // need caller to allocate this
+  uint32_t nonref_flags_storage = header_ctrl >> 6;
+  if (nonref_flags_storage == 3) {
+    pgfip->const_fpos_offset += DIV_UP(raw_variant_ct, CHAR_BIT);
+  } else if (nonref_flags_storage == 2) {
+    pgfip->gflags |= kfPgenGlobalAllNonref;
+  }
+
+  if (file_type_code < 16) {
+    // plink 2 binary, single constant-width vrtype
+    if (file_type_code > 4) {
+      sprintf(errstr_buf, "Error: Third byte of %s does not correspond to a storage mode supported by this version of pgenlib.\n", fname);
+      return kPglRetNotYetSupported;
+    }
+    if (header_ctrl & 63) {
+      sprintf(errstr_buf, "Error: Third byte of %s corresponds to a fixed-width storage mode, but twelfth byte is only consistent with a variable-width mode.\n", fname);
+      return kPglRetMalformedInput;
+    }
+    uint32_t vrtype = 0;
+    uintptr_t const_vrec_width = QUATERCT_TO_BYTECT(raw_sample_ct);
+    if (file_type_code == 3) {
+      vrtype = 0x40;
+      const_vrec_width += raw_sample_ct * 2;
+      pgfip->gflags |= kfPgenGlobalDosagePresent;
+    } else if (file_type_code == 4) {
+      vrtype = 0xc0;
+      const_vrec_width += raw_sample_ct * 4;
+      pgfip->gflags |= kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent;
+    }
+    if (((uint64_t)raw_variant_ct) * const_vrec_width + pgfip->const_fpos_offset != fsize) {
+      sprintf(errstr_buf, "Error: Unexpected .pgen file size (expected %" PRIu64 " bytes).\n", ((uint64_t)raw_variant_ct) * const_vrec_width + pgfip->const_fpos_offset);
+      return kPglRetMalformedInput;
+    }
+    pgfip->const_vrtype = vrtype;
+    pgfip->const_vrec_width = (uint32_t)const_vrec_width;
+    *pgfi_alloc_cacheline_ct_ptr = 0;
+    return kPglRetSuccess;
+  }
+  if (file_type_code >= 0x11) {
+    // todo: 0x11 phase sets
+    sprintf(errstr_buf, "Error: Third byte of %s does not correspond to a storage mode supported by this version of pgenlib.\n", fname);
+    return kPglRetNotYetSupported;
+  }
+  // plink 2 binary, general-purpose
+  pgfip->const_vrtype = 0xffffffffU;
+  pgfip->const_vrec_width = 0;
+  const uintptr_t alt_allele_ct_byte_ct = (header_ctrl >> 4) & 3;
+  if (alt_allele_ct_byte_ct > 1) {
+    strcpy(errstr_buf, "Error: This version of pgenlib does not support >254 alternate alleles for a single variant.\n");
+    return kPglRetNotYetSupported;
+  }
+  
+  // 8 extra bytes per vblock, to support fast random access
+  const uintptr_t vblock_ct = DIV_UP(raw_variant_ct, kPglVblockSize);
+  
+  uint64_t vrtype_and_vrec_len_quarterbyte_cost;
+  if (header_ctrl & 8) {
+    const uint32_t header_ctrl_lowbits = header_ctrl & 15;
+    if (header_ctrl_lowbits > 9) {
+      strcpy(errstr_buf, "Error: Twelfth byte of %s does not correspond to a format supported by this version of pgenlib.\n");
+      return kPglRetNotYetSupported;
+    }
+    vrtype_and_vrec_len_quarterbyte_cost = header_ctrl_lowbits - 7;
+  } else {
+    // set this to *2* if true, 0 if false
+    const uint32_t phase_or_dosage_present = (header_ctrl >> 1) & 2;
+    // vrtype entries = 2 quarterbytes if no phase/dosage, 4 otherwise
+    // var_fpos entries = 4 + (4 * (header_ctrl & 3)) quarterbytes
+    vrtype_and_vrec_len_quarterbyte_cost = 6 + phase_or_dosage_present + 4 * (header_ctrl & 3);
+  }
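+  // Example: header_ctrl == 0x05 takes the else-branch with
+  // phase_or_dosage_present == 2 and 2-byte vrec_len entries, giving
+  // 6 + 2 + 4 * 1 == 12 quarterbytes, i.e. 3 bytes per variant (a full
+  // vrtype byte plus a 2-byte record length).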
+  pgfip->const_fpos_offset += (raw_variant_ct * vrtype_and_vrec_len_quarterbyte_cost + 3) / 4 + (raw_variant_ct * alt_allele_ct_byte_ct) + (8 * vblock_ct);
+  *pgfi_alloc_cacheline_ct_ptr = count_pgfi_alloc_cachelines_required(raw_variant_ct);
+  return kPglRetSuccess;
+}
+
+static_assert(kPglMaxAltAlleleCt == 254, "Need to update pgfi_init_phase2().");
+pglerr_t pgfi_init_phase2(pgen_header_ctrl_t header_ctrl, uint32_t allele_cts_already_loaded, uint32_t nonref_flags_already_loaded, uint32_t use_blockload, uint32_t vblock_idx_start, uint32_t vidx_end, uint32_t* max_vrec_width_ptr, pgen_file_info_t* pgfip, unsigned char* pgfi_alloc, uintptr_t* pgr_alloc_cacheline_ct_ptr, char* errstr_buf) {
+  // *max_vrec_width_ptr technically only needs to be set in single-variant
+  // fread() mode, but its computation is not currently optimized out in the
+  // other two modes.
+  
+  // possible todo: add option to skip validation when
+  // allele_cts/nonref_flags are already loaded.  but let's play it
+  // safe for now.
+  const uint32_t raw_variant_ct = pgfip->raw_variant_ct;
+  const uint32_t const_vrec_width = pgfip->const_vrec_width;
+  *pgr_alloc_cacheline_ct_ptr = 0;
+  unsigned char loadbuf[kPglVblockSize * 4];
+  uintptr_t* allele_idx_offsets_iter = pgfip->allele_idx_offsets;
+  uintptr_t prev_allele_idx_offset = 0;
+  if (allele_idx_offsets_iter) {
+    if (!allele_cts_already_loaded) {
+      *allele_idx_offsets_iter = 0;
+    } else {
+      prev_allele_idx_offset = *allele_idx_offsets_iter;
+    }
+    ++allele_idx_offsets_iter;
+  }
+  if (!raw_variant_ct) {
+    return kPglRetSuccess;
+  }
+  const uint32_t nonref_flags_stored = ((header_ctrl >> 6) == 3);
+  unsigned char* nonref_flags_iter = (unsigned char*)pgfip->nonref_flags;
+  const unsigned char* fread_ptr = nullptr; // maybe-uninitialized warning
+  FILE* shared_ff = pgfip->shared_ff;
+  if (const_vrec_width) {
+    // no allele counts to verify if fixed-width
+    // always need workspace_vec
+    *pgr_alloc_cacheline_ct_ptr = QUATERCT_TO_CLCT(pgfip->raw_sample_ct);
+    *max_vrec_width_ptr = const_vrec_width;
+#ifdef NO_MMAP
+    assert(shared_ff);
+#else
+    if (!shared_ff) {
+      if (use_blockload) {
+	strcpy(errstr_buf, "Error: pgfi_init_phase2() cannot be called with use_blockload set when pgfi_init_phase1() had use_mmap set.\n");
+	return kPglRetImproperFunctionCall;
+      }
+      if ((!(header_ctrl & 192)) || (pgfip->const_vrtype == kPglVrtypePlink1)) {
+	return kPglRetSuccess;
+      }
+      fread_ptr = &(pgfip->block_base[12]);
+      const uint32_t nonref_flags_byte_ct = DIV_UP(raw_variant_ct, CHAR_BIT);
+      if (!nonref_flags_already_loaded) {
+	if (nonref_flags_stored) {
+	  memcpy(nonref_flags_iter, fread_ptr, nonref_flags_byte_ct);
+	}
+	return kPglRetSuccess;
+      }
+      if (nonref_flags_stored) {
+	if (memcmp(nonref_flags_iter, fread_ptr, nonref_flags_byte_ct)) {
+	  strcpy(errstr_buf, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
+	  return kPglRetInconsistentInput;
+	}
+	return kPglRetSuccess;
+      }
+      if (header_ctrl & 64) {
+	// all ref
+	if (!are_all_words_zero(pgfip->nonref_flags, BITCT_TO_WORDCT(raw_variant_ct))) {
+	  strcpy(errstr_buf, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
+	  return kPglRetInconsistentInput;
+	}
+	return kPglRetSuccess;
+      }
+      // all nonref
+      if (!are_all_bits_one(pgfip->nonref_flags, raw_variant_ct)) {
+	strcpy(errstr_buf, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
+	return kPglRetInconsistentInput;
+      }
+      return kPglRetSuccess;
+    }
+#endif
+    if (!use_blockload) {
+      // using fread() single-variant-at-a-time, need pgr.fread_buf
+      *pgr_alloc_cacheline_ct_ptr += DIV_UP(const_vrec_width, kCacheline);
+    }
+    if ((!(header_ctrl & 192)) || (pgfip->const_vrtype == kPglVrtypePlink1)) {
+      return kPglRetSuccess;
+    }
+    if ((header_ctrl >> 6) == 1) {
+      // all ref
+      if (nonref_flags_already_loaded) {
+	if (!are_all_words_zero(pgfip->nonref_flags, BITCT_TO_WORDCT(raw_variant_ct))) {
+	  strcpy(errstr_buf, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
+	  return kPglRetInconsistentInput;
+	}
+      }
+      return kPglRetSuccess;
+    }
+    if ((header_ctrl >> 6) == 2) {
+      // all nonref
+      if (nonref_flags_already_loaded) {
+	if (!are_all_bits_one(pgfip->nonref_flags, raw_variant_ct)) {
+	  strcpy(errstr_buf, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
+	  return kPglRetInconsistentInput;
+	}
+      }
+      return kPglRetSuccess;
+    }
+    // _last more useful than _end iff we just refer to the number of elements
+    // in the block and have no use for a _stop pointer
+    unsigned char* nonref_flags_last = &(nonref_flags_iter[((raw_variant_ct - 1) / (kPglVblockSize * 32)) * (kPglVblockSize * 4)]);
+    uint32_t cur_byte_ct = kPglVblockSize * 4;
+    while (1) {
+      if (nonref_flags_iter >= nonref_flags_last) {
+	if (nonref_flags_iter > nonref_flags_last) {
+	  return kPglRetSuccess;
+	}
+	cur_byte_ct = 1 + ((raw_variant_ct - 1) % (kPglVblockSize * 32)) / CHAR_BIT;
+      }
+      unsigned char* loadptr = nonref_flags_already_loaded? loadbuf : nonref_flags_iter;
+      if (!fread(loadptr, cur_byte_ct, 1, shared_ff)) {
+	strcpy(errstr_buf, "Error: File read failure.\n");
+	return kPglRetReadFail;
+      }
+      if (nonref_flags_already_loaded) {
+	if (memcmp(nonref_flags_iter, loadbuf, cur_byte_ct)) {
+	  strcpy(errstr_buf, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
+	  return kPglRetInconsistentInput;
+	}
+      }
+      nonref_flags_iter = &(nonref_flags_iter[cur_byte_ct]);
+    }
+  }
+
+  const uint32_t raw_sample_ct = pgfip->raw_sample_ct;
+  unsigned char* vrtypes_iter = pgfi_alloc;
+  pgfip->vrtypes = vrtypes_iter;
+  uint64_t* var_fpos_iter = (uint64_t*)(&(vrtypes_iter[round_up_pow2(raw_variant_ct + 1, kCacheline)]));
+  pgfip->var_fpos = var_fpos_iter;
+  uint32_t vblock_ct_m1 = (raw_variant_ct - 1) / kPglVblockSize;
+  uint32_t max_vrec_width = 0;
+  uint64_t cur_fpos;
+#ifdef NO_MMAP
+  assert(shared_ff);
+#else
+  if (!shared_ff) {
+    if (use_blockload) {
+      strcpy(errstr_buf, "Error: pgfi_init_phase2() cannot be called with use_blockload set when pgfi_init_phase1() had use_mmap set.\n");
+      return kPglRetImproperFunctionCall;
+    }
+    fread_ptr = &(pgfip->block_base[12 + 8 * vblock_idx_start]);
+    memcpy(&cur_fpos, fread_ptr, sizeof(int64_t));
+    fread_ptr = &(fread_ptr[(vblock_ct_m1 + 1 - vblock_idx_start) * sizeof(int64_t)]);
+  } else {
+#endif
+    if (vblock_idx_start) {
+      if (fseeko(shared_ff, vblock_idx_start * sizeof(int64_t), SEEK_CUR)) {
+	strcpy(errstr_buf, "Error: File read failure.\n");
+	return kPglRetReadFail;
+      }
+    }
+    if (!fread(&cur_fpos, sizeof(int64_t), 1, shared_ff)) {
+      strcpy(errstr_buf, "Error: File read failure.\n");
+      return kPglRetReadFail;
+    }
+    // May also need to load the rest of these values in the future, if we want
+    // to support dynamic insertion into a memory-mapped file.  But skip them
+    // for now.
+    if (fseeko(shared_ff, (vblock_ct_m1 - vblock_idx_start) * sizeof(int64_t), SEEK_CUR)) {
+      strcpy(errstr_buf, "Error: File read failure.\n");
+      return kPglRetReadFail;
+    }
+#ifndef NO_MMAP
+  }
+#endif
+  const uint32_t vrtype_and_fpos_storage = header_ctrl & 15;
+  const uint32_t alt_allele_ct_byte_ct = (header_ctrl >> 4) & 3;
+  if (alt_allele_ct_byte_ct) {
+    assert(alt_allele_ct_byte_ct == 1);
+    if (!allele_idx_offsets_iter) {
+      strcpy(errstr_buf, "Error: pgfip->allele_idx_offsets must be allocated before pgfi_init_phase2() is called.\n");
+      return kPglRetImproperFunctionCall;
+    }
+  }
+  uint32_t vblock_idx = vblock_idx_start;
+  vblock_ct_m1 = (vidx_end - 1) / kPglVblockSize;
+  if (vblock_idx) {
+    uintptr_t header_vblock_byte_ct = kPglVblockSize * alt_allele_ct_byte_ct;
+    if (nonref_flags_stored) {
+      header_vblock_byte_ct += kPglVblockSize / CHAR_BIT;
+    }
+    if (vrtype_and_fpos_storage & 8) {
+      header_vblock_byte_ct += kPglVblockSize >> (10 - vrtype_and_fpos_storage);
+    } else {
+      if (!(vrtype_and_fpos_storage & 4)) {
+	header_vblock_byte_ct += kPglVblockSize / 2;
+      } else {
+	header_vblock_byte_ct += kPglVblockSize;
+      }
+      header_vblock_byte_ct += kPglVblockSize * (1 + (vrtype_and_fpos_storage & 3));
+    }
+#ifndef NO_MMAP
+    if (!shared_ff) {
+      fread_ptr = &(fread_ptr[header_vblock_byte_ct * ((uint64_t)vblock_idx)]);
+    } else {
+#endif
+      if (fseeko(shared_ff, header_vblock_byte_ct * ((uint64_t)vblock_idx), SEEK_CUR)) {
+	strcpy(errstr_buf, "Error: File read failure.\n");
+	return kPglRetReadFail;
+      }
+#ifndef NO_MMAP
+    }
+#endif
+  }
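+  // Example: with nonref flags stored, 1-byte alt allele counts, and
+  // vrtype_and_fpos_storage == 5 (full vrtype bytes, 2-byte vrec_lens), each
+  // complete header vblock spans kPglVblockSize * (1/8 + 1 + 1 + 2) bytes.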
+  uint32_t cur_vblock_variant_ct = kPglVblockSize;
+  uint32_t max_alt_allele_ct = 1;
+  while (1) {
+    if (vblock_idx >= vblock_ct_m1) {
+      if (vblock_idx > vblock_ct_m1) {
+	// finish up
+#ifndef NO_MMAP
+	// now > instead of != to allow additional information to be stored
+	// between header and first variant record
+	if (!shared_ff) {
+	  if ((uintptr_t)(fread_ptr - pgfip->block_base) > pgfip->var_fpos[0]) {
+            strcpy(errstr_buf, "Error: Invalid .pgen header.\n");
+	    return kPglRetMalformedInput;
+	  }
+	} else {
+#endif
+	  if ((uint64_t)ftello(shared_ff) > pgfip->var_fpos[0]) {
+            strcpy(errstr_buf, "Error: Invalid .pgen header.\n");
+	    return kPglRetMalformedInput;
+	  }
+#ifndef NO_MMAP
+	}
+#endif
+	pgfip->var_fpos[vidx_end] = cur_fpos;
+	pgfip->max_alt_allele_ct = max_alt_allele_ct;
+	// if difflist/LD might be present, scan for them in a way that's
+	// likely to terminate quickly
+	pgen_global_flags_t new_gflags = kfPgenGlobal0;
+	if (vrtype_and_fpos_storage < 8) {
+	  uintptr_t* vrtypes_alias_start = (uintptr_t*)pgfip->vrtypes;
+	  uintptr_t* vrtypes_alias_end = &(vrtypes_alias_start[DIV_UP(vidx_end, kBytesPerWord)]);
+	  if (vblock_idx_start) {
+	    vrtypes_alias_start = &(vrtypes_alias_start[vblock_idx_start * (kPglVblockSize / kBytesPerWord)]);
+	  }
+	  uintptr_t* vrtypes_alias_iter = vrtypes_alias_start;
+	  if (vidx_end & (kBytesPerWord - 1)) {
+	    vrtypes_alias_end[-1] &= (k1LU << ((vidx_end & (kBytesPerWord - 1)) * CHAR_BIT)) - k1LU;
+	  }
+	  for (; vrtypes_alias_iter < vrtypes_alias_end; ++vrtypes_alias_iter) {
+	    const uintptr_t cur_word = *vrtypes_alias_iter;
+	    const uintptr_t cur_word_shifted = cur_word >> 1;
+	    // check if any vrtype has bit 1 set and bit 2 clear
+	    if (cur_word & (~cur_word_shifted) & (2 * kMask0101)) {
+	      new_gflags |= kfPgenGlobalLdCompressionPresent | kfPgenGlobalDifflistOrLdPresent;
+	      break;
+	    }
+	    if (cur_word & (5 * kMask0101)) {
+	      // this catches onebit
+	      new_gflags |= kfPgenGlobalDifflistOrLdPresent;
+	    }
+	  }
+	  if (!(vrtype_and_fpos_storage & 3)) {
+	    // 1 byte per vrec_len entry, don't bother to determine true
+	    // maximum
+	    max_vrec_width = 255;
+	  }
+	  if (vrtype_and_fpos_storage & 4) {
+	    // likely for one of {hphase, dosage} to be present without the
+	    // other; make this scan faster in that case, at the cost of
+	    // failing to early-exit when both are present
+	    uintptr_t or_word = 0; // just bitwise-or everything together
+	    for (vrtypes_alias_iter = vrtypes_alias_start; vrtypes_alias_iter < vrtypes_alias_end; ++vrtypes_alias_iter) {
+	      or_word |= *vrtypes_alias_iter;
+	    }
+	    if (or_word & (0x10 * kMask0101)) {
+	      new_gflags |= kfPgenGlobalHardcallPhasePresent;
+	    }
+	    if (or_word & (0x60 * kMask0101)) {
+	      new_gflags |= kfPgenGlobalDosagePresent;
+	      if (or_word & (0x80 * kMask0101)) {
+		new_gflags |= kfPgenGlobalDosagePhasePresent;
+	      }
+	    }
+	  }
+	  pgfip->gflags |= new_gflags;
+	} else {
+	  // just assume worst case here.  the funny-looking
+	  // (vrtype_and_fpos_storage * 12) - 93 expression evaluates to 3 for
+	  // the 2-bit case and 15 for the 4-bit case.
+	  assert(vrtype_and_fpos_storage < 10);
+	  max_vrec_width = QUATERCT_TO_BYTECT(raw_sample_ct) + (vrtype_and_fpos_storage * 12) - 93;
+	}
+	*pgr_alloc_cacheline_ct_ptr = count_pgr_alloc_cachelines_required(raw_sample_ct, new_gflags, max_alt_allele_ct, (shared_ff && (!use_blockload))? max_vrec_width : 0);
+	*max_vrec_width_ptr = max_vrec_width;
+	return kPglRetSuccess;
+      }
+      cur_vblock_variant_ct = MOD_NZ(vidx_end, kPglVblockSize);
+    }
+    ++vblock_idx;
+    // 1. handle vrtypes and var_fpos.
+    if (vrtype_and_fpos_storage & 8) {
+      // vrtype_and_fpos_storage == 8 -> 2-bit storage -> right-shift 2
+      //                         == 9 -> 4-bit storage -> right-shift 1
+      const uint32_t log2_entry_bit_width = vrtype_and_fpos_storage - 7;
+      const uint32_t entry_bit_width = 1 << log2_entry_bit_width;
+      const uintptr_t entry_mask = (1 << entry_bit_width) - 1;
+      const uint32_t log2_entries_per_word = kBitsPerWordLog2 - log2_entry_bit_width;
+      const uintptr_t base_vrec_len = QUATERCT_TO_BYTECT(raw_sample_ct);
+      const uint32_t cur_byte_ct = 1 + ((cur_vblock_variant_ct - 1) >> (10 - vrtype_and_fpos_storage));
+      uint32_t block_len = 1 << log2_entries_per_word;
+      const uintptr_t* loadbuf_iter;
+#ifdef __arm__
+  #error "Unaligned accesses in pgfi_init_phase2()."
+#endif
+#ifndef NO_MMAP
+      if (!shared_ff) {
+	loadbuf_iter = (const uintptr_t*)fread_ptr;
+	fread_ptr = &(fread_ptr[cur_byte_ct]);
+      } else {
+#endif
+	if (!fread(loadbuf, cur_byte_ct, 1, shared_ff)) {
+	  strcpy(errstr_buf, "Error: File read failure.\n");
+	  return kPglRetReadFail;
+	}
+        loadbuf_iter = (const uintptr_t*)loadbuf;
+#ifndef NO_MMAP
+      }
+#endif
+      uint32_t cur_vblock_idx = 0;
+      uint32_t cur_vblock_idx_stop = 0;
+      while (1) {
+	cur_vblock_idx_stop += block_len;
+	if (cur_vblock_idx_stop > cur_vblock_variant_ct) {
+	  if (cur_vblock_idx == cur_vblock_variant_ct) {
+	    break;
+	  }
+	  cur_vblock_idx_stop = cur_vblock_variant_ct;
+	}
+	uintptr_t input_word = *loadbuf_iter++;
+	for (; cur_vblock_idx < cur_vblock_idx_stop; ++cur_vblock_idx) {
+	  const uintptr_t input_word_masked = input_word & entry_mask;
+	  *vrtypes_iter++ = input_word_masked? 8 : 0;
+	  *var_fpos_iter++ = cur_fpos;
+	  cur_fpos += base_vrec_len + input_word_masked;
+	  input_word >>= entry_bit_width;
+	}
+      }
+    } else {
+      if (!(vrtype_and_fpos_storage & 4)) {
+	// no phase or dosage present, 4-bit vrtypes
+	const uint32_t cur_byte_ct = DIV_UP(cur_vblock_variant_ct, 2);
+#ifndef NO_MMAP
+	if (shared_ff) {
+#endif
+	  if (!fread(loadbuf, cur_byte_ct, 1, shared_ff)) {
+	    strcpy(errstr_buf, "Error: File read failure.\n");
+	    return kPglRetReadFail;
+	  }
+	  fread_ptr = loadbuf;
+#ifndef NO_MMAP
+	}
+#endif
+	const uint32_t word_write_ct = DIV_UP(cur_vblock_variant_ct, kBytesPerWord);
+	uintptr_t* vrtypes_alias_fullword = (uintptr_t*)vrtypes_iter;
+	const halfword_t* loadbuf_alias_halfword = (const halfword_t*)fread_ptr;
+	for (uint32_t widx = 0; widx < word_write_ct; ++widx) {
+          uintptr_t ww = (uintptr_t)(loadbuf_alias_halfword[widx]);
+#ifdef __LP64__
+	  ww = (ww | (ww << 16)) & kMask0000FFFF;
+#endif
+	  ww = (ww | (ww << 8)) & kMask00FF;
+	  vrtypes_alias_fullword[widx] = (ww | (ww << 4)) & kMask0F0F;
+	}
+	const uint32_t last_word_byte_ct = cur_vblock_variant_ct % kBytesPerWord;
+	vrtypes_iter = &(vrtypes_iter[cur_vblock_variant_ct]);
+	if (last_word_byte_ct) {
+	  memset(vrtypes_iter, 0, kBytesPerWord - last_word_byte_ct);
+	} else {
+	  // must guarantee a trailing zero for is_ldbase check to work
+	  vrtypes_iter[0] = 0;
+	}
+#ifndef NO_MMAP
+	if (!shared_ff) {
+	  fread_ptr = &(fread_ptr[cur_byte_ct]);
+	}
+#endif
+      } else {
+	// phase and dosage
+#ifndef NO_MMAP
+	if (shared_ff) {
+#endif
+	  if (!fread(vrtypes_iter, cur_vblock_variant_ct, 1, shared_ff)) {
+	    strcpy(errstr_buf, "Error: File read failure.\n");
+	    return kPglRetReadFail;
+	  }
+#ifndef NO_MMAP
+	} else {
+	  memcpy(vrtypes_iter, fread_ptr, cur_vblock_variant_ct);
+	}
+#endif
+	const uint32_t last_word_byte_ct = cur_vblock_variant_ct % kBytesPerWord;
+	vrtypes_iter = &(vrtypes_iter[cur_vblock_variant_ct]);
+	if (last_word_byte_ct) {
+	  memset(vrtypes_iter, 0, kBytesPerWord - last_word_byte_ct);
+	} else {
+	  // must guarantee a trailing zero for is_ldbase check to work
+	  vrtypes_iter[0] = 0;
+	}
+#ifndef NO_MMAP
+	if (!shared_ff) {
+	  fread_ptr = &(fread_ptr[cur_vblock_variant_ct]);
+	}
+#endif
+      }
+      const uint32_t bytes_per_entry = 1 + (vrtype_and_fpos_storage & 3);
+      const uint32_t cur_byte_ct = cur_vblock_variant_ct * bytes_per_entry;
+#ifndef NO_MMAP
+      if (shared_ff) {
+#endif
+	if (!fread(loadbuf, cur_byte_ct, 1, shared_ff)) {
+	  strcpy(errstr_buf, "Error: File read failure.\n");
+	  return kPglRetReadFail;
+	}
+	fread_ptr = loadbuf;
+#ifndef NO_MMAP
+      }
+#endif
+      if (bytes_per_entry == 1) {
+	for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx < cur_vblock_variant_ct; ++cur_vblock_vidx) {
+	  var_fpos_iter[cur_vblock_vidx] = cur_fpos;
+	  uint32_t cur_vrec_len = fread_ptr[cur_vblock_vidx];
+	  cur_fpos += cur_vrec_len;
+	  // no need for correct max_vrec_width
+	}
+      } else if (bytes_per_entry == 2) {
+	for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx < cur_vblock_variant_ct; ++cur_vblock_vidx) {
+	  var_fpos_iter[cur_vblock_vidx] = cur_fpos;
+	  uint32_t cur_vrec_len = ((const uint16_t*)fread_ptr)[cur_vblock_vidx];
+	  cur_fpos += cur_vrec_len;
+	  if (cur_vrec_len > max_vrec_width) {
+	    // todo: check whether we're better off just assuming 2^16 - 1
+	    max_vrec_width = cur_vrec_len;
+	  }
+	}
+      } else if (bytes_per_entry == 3) {
+	for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx < cur_vblock_variant_ct; ++cur_vblock_vidx) {
+	  var_fpos_iter[cur_vblock_vidx] = cur_fpos;
+	  uint32_t cur_vrec_len = (*((const uint32_t*)(&(fread_ptr[cur_vblock_vidx * bytes_per_entry])))) & 0xffffff;
+	  cur_fpos += cur_vrec_len;
+	  if (cur_vrec_len > max_vrec_width) {
+	    max_vrec_width = cur_vrec_len;
+	  }
+	}
+      } else {
+	for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx < cur_vblock_variant_ct; ++cur_vblock_vidx) {
+	  var_fpos_iter[cur_vblock_vidx] = cur_fpos;
+	  uint32_t cur_vrec_len = ((const uint32_t*)fread_ptr)[cur_vblock_vidx];
+	  cur_fpos += cur_vrec_len;
+	  if (cur_vrec_len > max_vrec_width) {
+	    max_vrec_width = cur_vrec_len;
+	  }
+	}
+#ifdef __LP64__
+	if (max_vrec_width > kPglMaxBytesPerVariant) {
+	  strcpy(errstr_buf, "Error: Invalid .pgen header.\n");
+	  return kPglRetMalformedInput;
+	}
+#else
+	if (max_vrec_width > kMaxBytesPerIO) {
+	  strcpy(errstr_buf, "Error: Variant records too large for 32-bit pgenlib.\n");
+	  return kPglRetNomem;
+	}
+#endif
+      }
+      var_fpos_iter = &(var_fpos_iter[cur_vblock_variant_ct]);
+#ifndef NO_MMAP
+      if (!shared_ff) {
+	fread_ptr = &(fread_ptr[cur_byte_ct]);
+      }
+#endif
+    }
+    // 2. allele counts?
+    if (alt_allele_ct_byte_ct) {
+      assert(alt_allele_ct_byte_ct == 1);
+#ifndef NO_MMAP
+      if (shared_ff) {
+#endif
+	if (!fread(loadbuf, cur_vblock_variant_ct * alt_allele_ct_byte_ct, 1, shared_ff)) {
+	  strcpy(errstr_buf, "Error: File read failure.\n");
+	  return kPglRetReadFail;
+	}
+	fread_ptr = loadbuf;
+#ifndef NO_MMAP
+      }
+#endif
+      if (allele_cts_already_loaded) {	
+	for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx < cur_vblock_variant_ct; ++cur_vblock_vidx) {
+	  uintptr_t cur_allele_idx_offset = allele_idx_offsets_iter[cur_vblock_vidx];
+	  uint32_t cur_alt_allele_ct = fread_ptr[cur_vblock_vidx];
+	  if ((cur_allele_idx_offset - prev_allele_idx_offset) != (cur_alt_allele_ct + 1)) {
+	    strcpy(errstr_buf, "Error: Loaded allele_idx_offsets do not match values in .pgen file.\n");
+	    return kPglRetInconsistentInput;
+	  }
+	  prev_allele_idx_offset = cur_allele_idx_offset;
+	  if (cur_alt_allele_ct > max_alt_allele_ct) {
+	    max_alt_allele_ct = cur_alt_allele_ct;
+	  }
+	}
+      } else {
+	for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx < cur_vblock_variant_ct; ++cur_vblock_vidx) {
+	  uint32_t cur_alt_allele_ct = fread_ptr[cur_vblock_vidx];
+	  prev_allele_idx_offset += cur_alt_allele_ct + 1;
+	  allele_idx_offsets_iter[cur_vblock_vidx] = prev_allele_idx_offset;
+	  if (cur_alt_allele_ct > max_alt_allele_ct) {
+	    max_alt_allele_ct = cur_alt_allele_ct;
+	  }
+	}
+      }
+      allele_idx_offsets_iter = &(allele_idx_offsets_iter[cur_vblock_variant_ct]);
+#ifndef NO_MMAP
+      if (!shared_ff) {
+	fread_ptr = &(fread_ptr[cur_vblock_variant_ct * alt_allele_ct_byte_ct]);
+      }
+#endif
+    }
+    // 3. nonref flags?
+    if (nonref_flags_stored) {
+      const uint32_t cur_byte_ct = DIV_UP(cur_vblock_variant_ct, CHAR_BIT);
+#ifndef NO_MMAP
+      if (!shared_ff) {
+	if (nonref_flags_already_loaded) {
+	  if (memcmp(nonref_flags_iter, fread_ptr, cur_byte_ct)) {
+	    strcpy(errstr_buf, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
+	    return kPglRetInconsistentInput;
+	  }
+	} else {
+	  memcpy(nonref_flags_iter, fread_ptr, cur_byte_ct);
+	}
+	fread_ptr = &(fread_ptr[cur_byte_ct]);
+      } else {
+#endif
+	unsigned char* loadptr = nonref_flags_already_loaded? loadbuf : nonref_flags_iter;
+	if (!fread(loadptr, cur_byte_ct, 1, shared_ff)) {
+	  strcpy(errstr_buf, "Error: File read failure.\n");
+	  return kPglRetReadFail;
+	}
+	if (nonref_flags_already_loaded) {
+	  if (memcmp(nonref_flags_iter, loadbuf, cur_byte_ct)) {
+	    strcpy(errstr_buf, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
+	    return kPglRetInconsistentInput;
+	  }
+	}
+#ifndef NO_MMAP
+      }
+#endif
+      nonref_flags_iter = &(nonref_flags_iter[cur_byte_ct]);
+    }
+  }
+}
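+// Sketch of the intended two-phase initialization sequence (error handling
+// and workspace allocation omitted; pass 0xffffffffU for unknown counts):
+//   pgen_file_info_t pgfi;
+//   pgfi_preinit(&pgfi);
+//   pgen_header_ctrl_t header_ctrl;
+//   uintptr_t pgfi_alloc_cacheline_ct;
+//   pgfi_init_phase1(fname, 0xffffffffU, 0xffffffffU, 0, &header_ctrl,
+//                    &pgfi, &pgfi_alloc_cacheline_ct, errstr_buf);
+//   // allocate pgfi_alloc_cacheline_ct cachelines -> pgfi_alloc, plus
+//   // pgfi.allele_idx_offsets/pgfi.nonref_flags when the header calls for
+//   // them, then:
+//   uint32_t max_vrec_width;
+//   uintptr_t pgr_alloc_cacheline_ct;
+//   pgfi_init_phase2(header_ctrl, 0, 0, 0, 0, pgfi.raw_variant_ct,
+//                    &max_vrec_width, &pgfi, pgfi_alloc,
+//                    &pgr_alloc_cacheline_ct, errstr_buf);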
+
+uint32_t get_ldbase_vidx(const unsigned char* vrtypes, uint32_t cur_vidx) {
+  const uintptr_t* vrtypes_walias = (const uintptr_t*)vrtypes;
+  const uint32_t cur_vidx_orig_remainder = cur_vidx % kBytesPerWord;
+  uint32_t vidx_word_idx = (cur_vidx - 1) / kBytesPerWord;
+  uintptr_t cur_vrtypes_word = vrtypes_walias[vidx_word_idx];
+  if (cur_vidx_orig_remainder) {
+    // make sure we don't detect a byte after the current position.
+    cur_vrtypes_word &= (k1LU << (CHAR_BIT * cur_vidx_orig_remainder)) - k1LU;
+    cur_vrtypes_word |= (kMask0101 * 2) << (CHAR_BIT * cur_vidx_orig_remainder);
+  }
+  while (1) {
+    // ((bit 2) OR (NOT bit 1)) for each byte.  (possible experiment: see if
+    // the same assembly is generated if this expression is rewritten to use
+    // ands/nots.)
+    const uintptr_t detect_non_ld_word = ((cur_vrtypes_word >> 1) | (~cur_vrtypes_word)) & (kMask0101 * 2);
+    if (detect_non_ld_word) {
+      // find the highest-order set bit in detect_non_ld_word; this corresponds
+      // to the last non-LD-compressed byte (assuming little-endian).
+      const uint32_t new_ldbase_vidx_loworder = kBytesPerWord - 1 - (CLZLU(detect_non_ld_word) / CHAR_BIT);
+      return (vidx_word_idx * kBytesPerWord) + new_ldbase_vidx_loworder;
+    }
+    // everything LD-compressed in the current block.  move back 8 bytes in the
+    // array (or 4-bytes for 32-bit build).
+    cur_vrtypes_word = vrtypes_walias[--vidx_word_idx];
+  }
+}
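+// Example trace: with vrtypes bytes {..., 0x00, 0x02, 0x03, 0x02} ending at
+// cur_vidx, every 0x02/0x03 byte satisfies (vrtype & 6) == 2 (LD-compressed)
+// and is skipped; the scan returns the index of the 0x00 byte, the most
+// recent variant stored without LD compression.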
+
+uint64_t pgfi_multiread_get_cacheline_req(const uintptr_t* variant_include, const pgen_file_info_t* pgfip, uint32_t variant_ct, uint32_t block_size) {
+  // if block_size < kPglVblockSize, it should be a power of 2 (to avoid
+  // unnecessary vblock crossing), but that's not required.
+  const uint32_t raw_variant_ct = pgfip->raw_variant_ct;
+  if (variant_ct == raw_variant_ct) {
+    variant_include = nullptr;
+  }
+  uint32_t block_ct_m1 = 0;
+  if (raw_variant_ct < block_size) {
+    block_size = raw_variant_ct;
+  } else {
+    block_ct_m1 = (raw_variant_ct - 1) / block_size;
+  }
+  const uint64_t* var_fpos = pgfip->var_fpos;
+  if ((!variant_include) && (!var_fpos)) {
+    return DIV_UP(((uint64_t)pgfip->const_vrec_width) * block_size, kCacheline);
+  }
+  uint64_t max_block_byte_ct = 0;
+  uint32_t max_block_variant_ct = 0;
+  uint32_t block_idx = 0;
+  while (1) {
+    uint32_t variant_uidx_start = block_idx * block_size;
+    uint32_t variant_uidx_end = variant_uidx_start + block_size;
+    if (block_idx >= block_ct_m1) {
+      if (block_idx > block_ct_m1) {
+	break;
+      }
+      variant_uidx_end = raw_variant_ct;
+    }
+    if (variant_include) {
+      variant_uidx_start = next_set(variant_include, variant_uidx_start, variant_uidx_end);
+      if (variant_uidx_start == variant_uidx_end) {
+	++block_idx;
+	continue;
+      }
+      variant_uidx_end = 1 + prev_set_unsafe(variant_include, variant_uidx_end);
+    }
+    if (var_fpos) {
+      if (pgfip->vrtypes && ((pgfip->vrtypes[variant_uidx_start] & 6) == 2)) {
+	// need to start loading from LD-buddy
+	variant_uidx_start = get_ldbase_vidx(pgfip->vrtypes, variant_uidx_start);
+      }
+      uint64_t cur_block_byte_ct = var_fpos[variant_uidx_end] - var_fpos[variant_uidx_start];
+      if (cur_block_byte_ct > max_block_byte_ct) {
+	max_block_byte_ct = cur_block_byte_ct;
+      }
+    } else {
+      // no LD compression here
+      const uint32_t cur_block_variant_ct = variant_uidx_end - variant_uidx_start;
+      if (cur_block_variant_ct > max_block_variant_ct) {
+	max_block_variant_ct = cur_block_variant_ct;
+	if (cur_block_variant_ct == block_size) {
+	  // no larger value possible, terminate search
+	  break;
+	}
+      }
+    }
+    ++block_idx;
+  }
+  if (!var_fpos) {
+    max_block_byte_ct = max_block_variant_ct * ((uint64_t)pgfip->const_vrec_width);
+  }
+  return DIV_UP(max_block_byte_ct, kCacheline);
+}
+
+pglerr_t pgfi_multiread(const uintptr_t* variant_include, uint32_t variant_uidx_start, uint32_t variant_uidx_end, uint32_t load_variant_ct, pgen_file_info_t* pgfip) {
+  // we could permit 0, but that encourages lots of unnecessary thread wakeups
+  assert(load_variant_ct);
+  if (variant_include) {
+    next_set_unsafe_ck(variant_include, &variant_uidx_start);
+  }
+  assert(variant_uidx_start < pgfip->raw_variant_ct);
+  uint64_t block_offset;
+  if (pgfip->vrtypes && ((pgfip->vrtypes[variant_uidx_start] & 6) == 2)) {
+    // need to start loading from LD-buddy
+    // assume for now that we can't skip any variants between the LD-buddy and
+    // the actual first variant; should remove this assumption later
+    block_offset = pgfip->var_fpos[get_ldbase_vidx(pgfip->vrtypes, variant_uidx_start)];
+  } else {
+    block_offset = get_pgfi_fpos(pgfip, variant_uidx_start);
+  }
+  pgfip->block_offset = block_offset;
+  uint64_t next_read_start_fpos = block_offset;
+  // break this up into multiple freads whenever this lets us skip an entire
+  // disk block
+  // (possible todo: make the disk block size a parameter of this function)
+  do {
+    const uint64_t cur_read_start_fpos = next_read_start_fpos;
+    uint32_t cur_read_uidx_end;
+    uint64_t cur_read_end_fpos;
+    while (1) {
+      cur_read_uidx_end = variant_uidx_end;      
+      if (cur_read_uidx_end - variant_uidx_start == load_variant_ct) {
+	cur_read_end_fpos = get_pgfi_fpos(pgfip, cur_read_uidx_end);
+	load_variant_ct = 0;
+	break;
+      }
+      cur_read_uidx_end = next_unset_unsafe(variant_include, variant_uidx_start);
+      cur_read_end_fpos = get_pgfi_fpos(pgfip, cur_read_uidx_end);
+      load_variant_ct -= cur_read_uidx_end - variant_uidx_start;
+      if (!load_variant_ct) {
+	break;
+      }
+      variant_uidx_start = next_set_unsafe(variant_include, cur_read_uidx_end);
+      next_read_start_fpos = get_pgfi_fpos(pgfip, variant_uidx_start);
+      if (pgfip->vrtypes && ((pgfip->vrtypes[variant_uidx_start] & 6) == 2)) {
+	const uint32_t variant_read_uidx_start = get_ldbase_vidx(pgfip->vrtypes, variant_uidx_start);
+	if (variant_read_uidx_start <= cur_read_uidx_end) {
+	  continue;
+	}
+	next_read_start_fpos = pgfip->var_fpos[variant_read_uidx_start];
+      }
+      // bugfix: can't use do..while, since previous "continue" needs to skip
+      // this check
+      if (round_down_pow2_ull(cur_read_end_fpos + kDiskBlockSize - 1LLU, kDiskBlockSize) < round_down_pow2_ull(next_read_start_fpos, kDiskBlockSize)) {
+	// minor bugfix (7 Jul 2017): break, not continue
+	break;
+      }
+    }
+    if (fseeko(pgfip->shared_ff, cur_read_start_fpos, SEEK_SET)) {
+      return kPglRetReadFail;
+    }
+    uintptr_t len = (uintptr_t)(cur_read_end_fpos - cur_read_start_fpos);
+    // const_cast
+    if (fread_checked((unsigned char*)((uintptr_t)(&(pgfip->block_base[cur_read_start_fpos - block_offset]))), len, pgfip->shared_ff)) {
+      return kPglRetReadFail;
+    }
+  } while (load_variant_ct);
+  return kPglRetSuccess;
+}
+
+
+void pgr_preinit(pgen_reader_t* pgrp) {
+  pgrp->ff = nullptr;
+}
+
+pglerr_t pgr_init(const char* fname, uint32_t max_vrec_width, pgen_file_info_t* pgfip, pgen_reader_t* pgrp, unsigned char* pgr_alloc) {
+  // See count_pgr_alloc_cachelines_required().
+  // Could add a debug mode.
+
+  // Mode 1 (mmap): block_base initialized, shared_ff == nullptr.  fname must
+  //   be nullptr.
+  // Mode 2 (block-fread): block_base initialized, shared_ff != nullptr.  fname
+  //   must be nullptr.
+  // Mode 3 (per-variant fread): block_base == nullptr.  fname must be
+  //   non-null, though it isn't actually referenced during the first
+  //   pgen_reader_t initialization (instead shared_ff is moved).
+  unsigned char* pgr_alloc_iter = pgr_alloc;
+  if (pgfip->block_base != nullptr) {
+    if (fname != nullptr) {
+      return kPglRetImproperFunctionCall;
+    }
+    pgrp->ff = nullptr; // make sure pgr_cleanup() doesn't break
+  } else {
+    if (pgfip->shared_ff != nullptr) {
+      if (fname == nullptr) {
+	return kPglRetImproperFunctionCall;
+      }
+      // move instead of close/reopen.
+      pgrp->ff = pgfip->shared_ff;
+      pgfip->shared_ff = nullptr;
+    } else {
+      pgrp->ff = fopen(fname, FOPEN_RB);
+      if (!pgrp->ff) {
+	return kPglRetOpenFail;
+      }
+    }
+    // now that arbitrary info can be stored between header and first variant
+    // record, always seek.
+    uint64_t seek_pos;
+    if (pgfip->var_fpos) {
+      seek_pos = pgfip->var_fpos[0];
+    } else {
+      seek_pos = pgfip->const_fpos_offset;
+    }
+    if (fseeko(pgrp->ff, seek_pos, SEEK_SET)) {
+      return kPglRetReadFail;
+    }
+  }
+  pgrp->fi = *pgfip; // struct copy
+  if (fname) {
+    // Mode 3 per-reader load buffer
+    pgrp->fread_buf = pgr_alloc_iter;
+    pgr_alloc_iter = &(pgr_alloc_iter[round_up_pow2(max_vrec_width, kCacheline)]);
+  }
+  pgrp->fp_vidx = 0;
+  pgrp->ldbase_vidx = 0xffffffffU;
+  pgrp->ldbase_stypes = kfPgrLdcache0;
+  pgrp->ldbase_genovec = nullptr;
+  pgrp->ldbase_raregeno = nullptr;
+  pgrp->ldbase_difflist_sample_ids = nullptr;
+  
+  const pgen_global_flags_t gflags = pgrp->fi.gflags;
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  pgrp->workspace_vec = (uintptr_t*)pgr_alloc_iter;
+  const uint32_t genovec_bytes_req = QUATERCT_TO_CLCT(raw_sample_ct) * kCacheline;
+  pgr_alloc_iter = &(pgr_alloc_iter[genovec_bytes_req]);
+  const uint32_t bitvec_bytes_req = BITCT_TO_CLCT(raw_sample_ct) * kCacheline;
+  const uint32_t ld_compression_present = (gflags / kfPgenGlobalLdCompressionPresent) & 1;
+  if (gflags & kfPgenGlobalDifflistOrLdPresent) {
+    const uint32_t max_difflist_entry_ct_base = (raw_sample_ct / kPglMaxDifflistLenDivisor);
+    // const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present);
+    
+    pgrp->workspace_raregeno_vec = (uintptr_t*)pgr_alloc_iter;
+    pgr_alloc_iter = &(pgr_alloc_iter[QUATERCT_TO_CLCT(max_difflist_entry_ct_base) * kCacheline]);
+
+    pgrp->workspace_difflist_sample_ids = (uint32_t*)pgr_alloc_iter;
+    pgr_alloc_iter = &(pgr_alloc_iter[(1 + (max_difflist_entry_ct_base / kInt32PerCacheline)) * (kCacheline * k1LU)]);
+
+    pgrp->workspace_raregeno_tmp_loadbuf = (uintptr_t*)pgr_alloc_iter;
+    pgr_alloc_iter = &(pgr_alloc_iter[QUATERCT_TO_CLCT(max_difflist_entry_ct_base) * kCacheline]);
+    
+    pgrp->workspace_difflist_sample_ids_tmp = (uint32_t*)pgr_alloc_iter;
+    pgr_alloc_iter = &(pgr_alloc_iter[(1 + (max_difflist_entry_ct_base / kInt32PerCacheline)) * (kCacheline * k1LU)]);
+
+    if (ld_compression_present) {
+      pgrp->ldbase_genovec = (uintptr_t*)pgr_alloc_iter;
+      pgr_alloc_iter = &(pgr_alloc_iter[genovec_bytes_req]);
+
+      pgrp->ldbase_raregeno = (uintptr_t*)pgr_alloc_iter;
+      pgr_alloc_iter = &(pgr_alloc_iter[QUATERCT_TO_CLCT(max_difflist_entry_ct_base) * kCacheline]);
+
+      pgrp->ldbase_difflist_sample_ids = (uint32_t*)pgr_alloc_iter;
+      pgr_alloc_iter = &(pgr_alloc_iter[(1 + (max_difflist_entry_ct_base / kInt32PerCacheline)) * (kCacheline * k1LU)]);
+    }
+  } else {
+    pgrp->workspace_raregeno_vec = nullptr;
+    pgrp->workspace_difflist_sample_ids = nullptr;
+    pgrp->workspace_raregeno_tmp_loadbuf = nullptr;
+    pgrp->workspace_difflist_sample_ids_tmp = nullptr;
+  }
+  const uint32_t max_alt_allele_ct = pgrp->fi.max_alt_allele_ct;
+  if (max_alt_allele_ct > 1) {
+    pgrp->workspace_aux1_nonmissing_vec = (uintptr_t*)pgr_alloc_iter;
+    pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
+
+    uintptr_t aux1_allele_bytect = get_aux1_allele_bytect(max_alt_allele_ct, raw_sample_ct);
+    if (aux1_allele_bytect > kPglMaxBytesPerVariant) {
+      aux1_allele_bytect = kPglMaxBytesPerVariant;
+    }
+    pgrp->workspace_aux1_code_vec = (uintptr_t*)pgr_alloc_iter;
+    pgr_alloc_iter = &(pgr_alloc_iter[round_up_pow2(aux1_allele_bytect, kCacheline)]);
+
+    pgrp->workspace_ambig_sample_ids = (uint32_t*)pgr_alloc_iter;
+    pgr_alloc_iter = &(pgr_alloc_iter[INT32CT_TO_CLCT(raw_sample_ct) * (kCacheline * k1LU)]);
+  } else {
+    pgrp->workspace_aux1_nonmissing_vec = nullptr;
+    pgrp->workspace_aux1_code_vec = nullptr;
+    pgrp->workspace_ambig_sample_ids = nullptr;
+  }
+  pgrp->workspace_all_hets = nullptr;
+  pgrp->ldbase_all_hets = nullptr;
+  if (gflags & kfPgenGlobalHardcallPhasePresent) {
+    pgrp->workspace_all_hets = (uintptr_t*)pgr_alloc_iter;
+    pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
+    if (ld_compression_present) {
+      pgrp->ldbase_all_hets = (uintptr_t*)pgr_alloc_iter;
+      pgrp->ldbase_all_hets[(raw_sample_ct - 1) / kBitsPerWord] = 0;
+      pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
+    }
+  }
+  pgrp->workspace_dosage_present = nullptr;
+  pgrp->workspace_dosage_phased = nullptr;
+  if (gflags & kfPgenGlobalDosagePresent) {
+    pgrp->workspace_dosage_present = (uintptr_t*)pgr_alloc_iter;
+    pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
+    if (gflags & kfPgenGlobalDosagePhasePresent) {
+      pgrp->workspace_dosage_phased = (uintptr_t*)pgr_alloc_iter;
+    }
+    // pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
+  }
+  return kPglRetSuccess;
+}
+
+void pgr_plink1_to_plink2_inplace_unsafe(uint32_t sample_ct, uintptr_t* genovec) {
+  // 00 -> 10, 01 -> 11, 10 -> 01, 11 -> 00
+  // new low bit  = [old low] ^ [old high]
+  // new high bit = ~[old high]
+  // "unsafe" because trailing bits are not zeroed out.
+  const uint32_t vec_ct = QUATERCT_TO_VECCT(sample_ct);
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t not_m1 = VCONST_UL(kMaskAAAA);
+  vul_t* vptr = (vul_t*)genovec;
+  for (uint32_t vidx = 0; vidx < vec_ct; vidx++) {
+    const vul_t not_cur_vec_high = (~vptr[vidx]) & not_m1;
+    vptr[vidx] = (((~vptr[vidx]) & m1) ^ vul_rshift(not_cur_vec_high, 1)) | not_cur_vec_high;
+  }
+}
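+// Scalar single-word equivalent of the vector transform above (sketch):
+//   uintptr_t inv_high = (~w) & kMaskAAAA;                  // new high bits
+//   w = (((~w) & kMask5555) ^ (inv_high >> 1)) | inv_high;
+// e.g. the 2-bit entry 0b01 maps to 0b11, matching the table above.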
+
+void pgr_plink2_to_plink1_inplace_unsafe(uint32_t sample_ct, uintptr_t* genovec) {
+  // 00 -> 11, 01 -> 10, 10 -> 00, 11 -> 01
+  // new low bit  = [old low] ^ (~[old high])
+  // new high bit = ~[old high]
+  const uint32_t vec_ct = QUATERCT_TO_VECCT(sample_ct);
+  const vul_t not_m1 = VCONST_UL(kMaskAAAA);
+  vul_t* vptr = (vul_t*)genovec;
+  for (uint32_t vidx = 0; vidx < vec_ct; vidx++) {
+    vul_t cur_vec = vptr[vidx];
+    vul_t not_cur_vec_high = (~cur_vec) & not_m1;
+    vptr[vidx] = (((~not_m1) & cur_vec) ^ vul_rshift(not_cur_vec_high, 1)) | not_cur_vec_high;
+  }
+}
+
+pglerr_t parse_difflist_header(const unsigned char* fread_end, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* raregeno_buf, const unsigned char** difflist_group_info_ptr, uint32_t* difflist_len_ptr) {
+  // Can be used for deltalists as well: pass raregeno_buf == nullptr.
+  // Trailing bits of raregeno may not be zeroed out.
+  const uint32_t difflist_len = get_vint31(fread_end, fread_pp);
+  // moved here to address maybe-uninitialized warnings
+  *difflist_group_info_ptr = *fread_pp;
+  *difflist_len_ptr = difflist_len;
+  if (!difflist_len) {
+    return kPglRetSuccess;
+  }
+  if (difflist_len > raw_sample_ct / kPglMaxDifflistLenDivisor) {
+    // automatically catches get_vint31() failure
+    return kPglRetMalformedInput;
+  }
+  const uint32_t group_ct = DIV_UP(difflist_len, kPglDifflistGroupSize);
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  const uint32_t difflist_index_byte_ct = group_ct * (sample_id_byte_ct + 1) - 1;
+  if ((uintptr_t)(fread_end - (*fread_pp)) < difflist_index_byte_ct) {
+    return kPglRetMalformedInput;
+  }
+  *fread_pp += difflist_index_byte_ct;
+  if (!raregeno_buf) {
+    // for sample ID lists without 2-bit genotype info, used for sparse dosage
+    return kPglRetSuccess;
+  }
+  const uint32_t raregeno_byte_ct = QUATERCT_TO_BYTECT(difflist_len);
+  if ((uintptr_t)(fread_end - (*fread_pp)) < raregeno_byte_ct) {
+    return kPglRetMalformedInput;
+  }
+  const unsigned char* raregeno_end = &((*fread_pp)[raregeno_byte_ct]);
+  // possible todo: just return a pointer to the beginning of the raregeno
+  // segment, and let the caller perform this copy
+  memcpy(raregeno_buf, *fread_pp, raregeno_byte_ct);
+  *fread_pp = raregeno_end;
+  return kPglRetSuccess;
+}
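+// Example: difflist_len == 100 with raw_sample_ct == 1000000 gives
+// sample_id_byte_ct == 3 and group_ct == 2 (assuming kPglDifflistGroupSize ==
+// 64), so *fread_pp advances past 2 * 4 - 1 == 7 group-index bytes before
+// QUATERCT_TO_BYTECT(100) == 25 raregeno bytes are copied out.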
+
+pglerr_t parse_and_save_difflist(const unsigned char* fread_end, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* __restrict raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
+  // Appropriate when we need to iterate through the difflist multiple times.
+  // Other functions are more efficient if we only need to process the list
+  // once.
+  // Trailing bits of raregeno may not be zeroed out.
+  const unsigned char* group_info_iter;
+  pglerr_t reterr = parse_difflist_header(fread_end, raw_sample_ct, fread_pp, raregeno, &group_info_iter, difflist_len_ptr);
+  uint32_t difflist_remaining = *difflist_len_ptr;
+  // todo: check if difflist_len == 0 early exit is a net positive or negative
+  // on a few test datasets
+  if (reterr || (!difflist_remaining)) {
+    return reterr;
+  }
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  uint32_t* difflist_sample_ids_iter = difflist_sample_ids;
+  while (1) {
+    const uint32_t* difflist_sample_ids_stop;
+    if (difflist_remaining < kPglDifflistGroupSize) {
+      if (!difflist_remaining) {
+	return kPglRetSuccess;
+      }
+      difflist_sample_ids_stop = &(difflist_sample_ids_iter[difflist_remaining]);
+      difflist_remaining = 0;
+    } else {
+      difflist_sample_ids_stop = &(difflist_sample_ids_iter[kPglDifflistGroupSize]);
+      difflist_remaining -= kPglDifflistGroupSize;
+    }
+    // can't use uint32_t assignment trick for now since there's a corner case
+    // where that would read past the end of the mapped address range
+    uintptr_t raw_sample_idx = 0;
+    memcpy(&raw_sample_idx, group_info_iter, sample_id_byte_ct);
+    group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    while (1) {
+#ifndef __LP64__
+      // perform more frequent checks in 32-bit build since raw_sample_idx may
+      // overflow
+      // misses "small negative" malformed input, but it'll catch data
+      // corruption with very high probability
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+#endif
+      *difflist_sample_ids_iter++ = (uint32_t)raw_sample_idx;
+      if (difflist_sample_ids_iter == difflist_sample_ids_stop) {
+	break;
+      }
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+    }
+#ifdef __LP64__
+    if (raw_sample_idx >= raw_sample_ct) {
+      return kPglRetMalformedInput;
+    }
+#endif
+  }
+  return kPglRetSuccess;
+}
+
+void get_difflist_ambig_ids_unsafe(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t difflist_len, uint32_t* __restrict ambig_sample_ids, uint32_t* ambig_id_ct_ptr) {
+  // assumes trailing bits of raregeno are zeroed out
+  const uint32_t difflist_wct = QUATERCT_TO_WORDCT(difflist_len);
+  uint32_t ambig_id_ct = 0;
+  for (uint32_t widx = 0; widx < difflist_wct; ++widx) {
+    uintptr_t detect_11 = raregeno[widx] & (raregeno[widx] >> 1) & kMask5555;
+    // now detect_11 has a set bit iff the raregeno entry is 0b11
+    if (detect_11) {
+      const uint32_t* difflist_sample_ids_base = &(difflist_sample_ids[widx * kBitsPerWordD2]);
+      do {
+	uint32_t difflist_idx_lowbits = CTZLU(detect_11) / 2;
+	ambig_sample_ids[ambig_id_ct++] = difflist_sample_ids_base[difflist_idx_lowbits];
+	detect_11 &= detect_11 - 1;
+      } while (detect_11);
+    }
+  }
+  *ambig_id_ct_ptr = ambig_id_ct;
+}
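+
+// Worked micro-example of the detect_11 trick above: if a raregeno word ends
+// in ...11 01 11 00 (entries 3, 2, 1, 0), then w & (w >> 1) & kMask5555
+// leaves ...01 00 01 00, one set bit per 0b11 entry; CTZLU()/2 then yields
+// index 1, and index 3 after the lowest set bit is cleared.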
+
+pglerr_t parse_and_save_difflist_proper_subset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* __restrict raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr, uintptr_t* __restrict raregeno_workspace) {
+  // Requires a PROPER subset, and does not save ambig_sample_ids.  Use the
+  // more generic parse_and_save_difflist_subset() if the latter might be
+  // needed.
+  // Might want to just merge this with parse_and_save_difflist() and rename
+  // appropriately.
+  // Trailing bits of raregeno are zeroed out.
+  uint32_t raw_difflist_len;
+  const unsigned char* group_info_iter;
+  pglerr_t reterr = parse_difflist_header(fread_end, raw_sample_ct, fread_pp, raregeno_workspace, &group_info_iter, &raw_difflist_len);
+  if (reterr || (!raw_difflist_len)) {
+    *difflist_len_ptr = 0;
+    return reterr;
+  }
+  const uint32_t subgroup_idx_last = (raw_difflist_len - 1) / kBitsPerWordD2;
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  uintptr_t* raregeno_workspace_iter = raregeno_workspace;
+  uintptr_t* raregeno_iter = raregeno;
+  uint32_t* difflist_sample_ids_iter = difflist_sample_ids;
+
+  // technically doesn't need to be initialized, but I have principles
+  uintptr_t raw_sample_idx = 0;
+
+  uintptr_t raregeno_word = 0;
+  uint32_t subgroup_idx = 0;
+  uint32_t subgroup_len_m1 = kBitsPerWordD2 - 1;
+  uint32_t difflist_len_lowbits = 0;
+  while (1) {
+    if (subgroup_idx >= subgroup_idx_last) {
+      if (subgroup_idx > subgroup_idx_last) {
+	if (difflist_len_lowbits) {
+	  *raregeno_iter = raregeno_word;
+	}
+	*difflist_len_ptr = (uint32_t)((uintptr_t)(difflist_sample_ids_iter - difflist_sample_ids) + difflist_len_lowbits);
+	return kPglRetSuccess;
+      }
+      subgroup_len_m1 &= raw_difflist_len - 1;
+    }
+    // We need to consume a new rare genotype word every 32 entries, and pull a
+    // raw sample index from the difflist header every 64 entries.  So it's
+    // best to make the inner loop have a period of 32 (call this a "subgroup",
+    // where "group" refers to a set of 64 entries).
+    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
+#ifdef __LP64__
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+#endif
+      raw_sample_idx = 0;
+      memcpy(&raw_sample_idx, group_info_iter, sample_id_byte_ct);
+      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    } else {
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+    }
+    ++subgroup_idx;
+    uintptr_t raregeno_workspace_word = *raregeno_workspace_iter++;
+    uint32_t raw_difflist_idx_lowbits = 0;
+    while (1) {
+#ifndef __LP64__
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+#endif
+      if (IS_SET(sample_include, raw_sample_idx)) {
+	raregeno_word |= ((raregeno_workspace_word >> (2 * raw_difflist_idx_lowbits)) & 3) << (difflist_len_lowbits * 2);
+	difflist_sample_ids_iter[difflist_len_lowbits] = raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, (uint32_t)raw_sample_idx);
+	if (difflist_len_lowbits++ == (kBitsPerWordD2 - 1)) {
+	  *raregeno_iter++ = raregeno_word;
+	  raregeno_word = 0;
+	  difflist_len_lowbits = 0;
+	  difflist_sample_ids_iter = &(difflist_sample_ids_iter[kBitsPerWordD2]);
+	}
+      }
+      if (raw_difflist_idx_lowbits == subgroup_len_m1) {
+	break;
+      }
+      ++raw_difflist_idx_lowbits;
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+    }
+  }
+}
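+
+// Hedged sketch of the raw-index-to-subsetted-index mapping performed by
+// raw_to_subsetted_pos() above (assumed behavior, consistent with its uses
+// in this file; popcount_long() stands for this codebase's word-popcount
+// helper): the subsetted position is the rank of raw_idx among the set bits
+// of sample_include, computed via the per-word cumulative popcount table:
+//   uint32_t raw_to_subsetted_pos_sketch(const uintptr_t* sample_include,
+//       const uint32_t* cumulative_popcounts, uint32_t raw_idx) {
+//     const uint32_t widx = raw_idx / kBitsPerWord;
+//     const uintptr_t lowmask = (k1LU << (raw_idx % kBitsPerWord)) - k1LU;
+//     return cumulative_popcounts[widx] +
+//       (uint32_t)popcount_long(sample_include[widx] & lowmask);
+//   }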
+
+pglerr_t parse_and_save_difflist_subset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t raw_sample_ct, const unsigned char** fread_pp, uint32_t* __restrict ambig_sample_ids, uintptr_t* __restrict raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr, uint32_t* __restrict ambig_id_ct_ptr, uintptr_t* __restrict raregeno_workspace) {
+  // Generic interface.  sample_include should be nullptr if
+  // sample_ct == raw_sample_ct.
+  // Trailing bits of raregeno are zeroed out.
+  if (!ambig_sample_ids) {
+    if (!sample_include) {
+      return parse_and_save_difflist(fread_end, raw_sample_ct, fread_pp, raregeno, difflist_sample_ids, difflist_len_ptr);
+    }
+    return parse_and_save_difflist_proper_subset(fread_end, sample_include, sample_include_cumulative_popcounts, raw_sample_ct, fread_pp, raregeno, difflist_sample_ids, difflist_len_ptr, raregeno_workspace);
+  }
+  uint32_t raw_difflist_len;
+  const unsigned char* group_info_iter;
+  pglerr_t reterr = parse_difflist_header(fread_end, raw_sample_ct, fread_pp, raregeno_workspace, &group_info_iter, &raw_difflist_len);
+  if (reterr || (!raw_difflist_len)) {
+    *difflist_len_ptr = 0;
+    *ambig_id_ct_ptr = 0;
+    return reterr;
+  }
+  const uint32_t subgroup_idx_last = (raw_difflist_len - 1) / kBitsPerWordD2;
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  uintptr_t* raregeno_workspace_iter = raregeno_workspace;
+  uintptr_t* raregeno_iter = raregeno;
+  uint32_t* difflist_sample_ids_iter = difflist_sample_ids;
+
+  // technically doesn't need to be initialized, but I have principles
+  uintptr_t raw_sample_idx = 0;
+
+  uintptr_t raregeno_word = 0;
+  uint32_t ambig_id_ct = 0;
+  uint32_t subgroup_idx = 0;
+  uint32_t subgroup_len_m1 = kBitsPerWordD2 - 1;
+  uint32_t difflist_len_lowbits = 0;
+  while (1) {
+    if (subgroup_idx >= subgroup_idx_last) {
+      if (subgroup_idx > subgroup_idx_last) {
+	if (difflist_len_lowbits) {
+	  *raregeno_iter = raregeno_word;
+	}
+	*difflist_len_ptr = (uint32_t)((uintptr_t)(difflist_sample_ids_iter - difflist_sample_ids) + difflist_len_lowbits);
+	*ambig_id_ct_ptr = ambig_id_ct;
+	return kPglRetSuccess;
+      }
+      subgroup_len_m1 &= raw_difflist_len - 1;
+    }
+    // We need to consume a new rare genotype word every 32 entries, and pull a
+    // raw sample index from the difflist header every 64 entries.  So it's
+    // best to make the inner loop have a period of 32 (call this a "subgroup",
+    // where "group" refers to a set of 64 entries).
+    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
+#ifdef __LP64__
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+#endif
+      raw_sample_idx = 0;
+      memcpy(&raw_sample_idx, group_info_iter, sample_id_byte_ct);
+      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    } else {
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+    }
+    ++subgroup_idx;
+    uintptr_t raregeno_workspace_word = *raregeno_workspace_iter++;
+    uint32_t raw_difflist_idx_lowbits = 0;
+    while (1) {
+#ifndef __LP64__
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+#endif
+      const uintptr_t cur_geno = raregeno_workspace_word & 3;
+      if ((!sample_include) || IS_SET(sample_include, raw_sample_idx)) {
+        uint32_t sample_idx = (uint32_t)raw_sample_idx;
+	if (sample_include) {
+	  sample_idx = raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, (uint32_t)raw_sample_idx);
+	}
+	raregeno_word |= cur_geno << (difflist_len_lowbits * 2);
+	difflist_sample_ids_iter[difflist_len_lowbits] = sample_idx;
+	if (difflist_len_lowbits++ == (kBitsPerWordD2 - 1)) {
+	  *raregeno_iter++ = raregeno_word;
+	  raregeno_word = 0;
+	  difflist_len_lowbits = 0;
+	  difflist_sample_ids_iter = &(difflist_sample_ids_iter[kBitsPerWordD2]);
+	}
+      }
+      if (cur_geno == 3) {
+	ambig_sample_ids[ambig_id_ct++] = (uint32_t)raw_sample_idx;
+      }
+      if (raw_difflist_idx_lowbits == subgroup_len_m1) {
+	break;
+      }
+      ++raw_difflist_idx_lowbits;
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+      raregeno_workspace_word >>= 2;
+    }
+  }
+}
+
+pglerr_t parse_ld_and_merge_difflist_subset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict ldbase_raregeno, const uint32_t* __restrict ldbase_difflist_sample_ids, uint32_t ldbase_difflist_len, uintptr_t ldbase_common_geno, uint32_t raw_sample_ct, uint32_t sample_ct, const unsigned char** fread_pp, uint32_t* __restrict ambig_sample_ids, uintptr_t* __restrict merged_raregeno, uint32_t* __restrict merged_difflist_sample_ids, uint32_t* __restrict merged_difflist_len_ptr, uint32_t* __restrict ambig_id_ct_ptr, uintptr_t* __restrict diff_from_ldbase_raregeno_iter) {
+  // Used when the ldbase variant was saved as a difflist, and it's useful to
+  // process the current variant as a difflist.
+  // * If the current variant is multiallelic, 0b11 entries in ldbase_raregeno
+  //   do NOT have associated aux1 entries; only freshly loaded 0b11 values do.
+  //   ambig_sample_ids keeps track of this.
+  // * ambig_sample_ids should be nullptr if the current variant is not
+  //   multiallelic.  (Hence its positioning in the argument list: it's in/out,
+  //   everything after it is a pure outparameter.)
+  // * ambig_sample_ids is NOT subsetted; otherwise it wouldn't support
+  //   subsequent loading of the aux1 data track.
+  // * Assumes ldbase_difflist_sample_ids[ldbase_difflist_len]==sample_ct.
+  // * Assumes sample_include == nullptr if no subsetting needed.  (Otherwise,
+  //   it'll still work, but performance will be slower.)
+  // Trailing bits of merged_raregeno may not be zeroed out.
+  // Caller is responsible for inverting ldbase_common_geno and merged_raregeno
+  // afterward if necessary.
+  assert(ldbase_difflist_sample_ids[ldbase_difflist_len] == sample_ct);
+  uint32_t diff_from_ldbase_len;
+  const unsigned char* group_info_iter;
+  pglerr_t reterr = parse_difflist_header(fread_end, raw_sample_ct, fread_pp, diff_from_ldbase_raregeno_iter, &group_info_iter, &diff_from_ldbase_len);
+  if (reterr) {
+    return reterr;
+  }
+  if (!diff_from_ldbase_len) {
+    memcpy(merged_difflist_sample_ids, ldbase_difflist_sample_ids, ldbase_difflist_len * sizeof(int32_t));
+    *ambig_id_ct_ptr = 0;
+    *merged_difflist_len_ptr = ldbase_difflist_len;
+    copy_quaterarr(ldbase_raregeno, ldbase_difflist_len, merged_raregeno);
+    return kPglRetSuccess;
+  }
+  if (ldbase_common_geno == 3) {
+    ldbase_common_geno = 4; // force these to be saved
+  }
+  const uint32_t subgroup_idx_last = (diff_from_ldbase_len - 1) / kBitsPerWordD2;
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  uintptr_t* merged_raregeno_iter = merged_raregeno;
+  uint32_t* merged_difflist_sample_ids_iter = merged_difflist_sample_ids;
+  uintptr_t merged_raregeno_word = 0;
+  uintptr_t ldbase_raregeno_word = 0;
+  uintptr_t diff_from_ldbase_raregeno_word = 0;
+  uint32_t ldbase_sample_idx = ldbase_difflist_sample_ids[0];
+  uintptr_t raw_sample_idx = 0;
+  uintptr_t cur_geno = 0;
+  uint32_t sample_idx = 0;
+  uint32_t ldbase_difflist_idx = 0;
+  uint32_t ambig_id_ct = 0;
+  uint32_t done = 0;
+  uint32_t subgroup_idx = 0;
+  uint32_t subgroup_len_m1 = kBitsPerWordD2 - 1;
+  uint32_t merge_idx_lowbits = 0;
+  while (1) {
+    uint32_t diff_from_ldbase_idx_lowbits = 0;
+    if (subgroup_idx >= subgroup_idx_last) {
+      if (subgroup_idx > subgroup_idx_last) {
+	done = 1;
+	sample_idx = sample_ct;
+	goto parse_ld_and_merge_difflist_subset_finish;
+      }
+      subgroup_len_m1 &= diff_from_ldbase_len - 1;
+    }
+    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
+      raw_sample_idx = 0;
+      memcpy(&raw_sample_idx, group_info_iter, sample_id_byte_ct);
+      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    } else {
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+    }
+    diff_from_ldbase_raregeno_word = *diff_from_ldbase_raregeno_iter++;
+    ++subgroup_idx;
+    while (1) {
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+      cur_geno = diff_from_ldbase_raregeno_word & 3;
+      if ((!sample_include) || IS_SET(sample_include, raw_sample_idx)) {
+	sample_idx = sample_include? raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, (uint32_t)raw_sample_idx) : ((uint32_t)raw_sample_idx);
+      parse_ld_and_merge_difflist_subset_finish:
+	while (ldbase_sample_idx < sample_idx) {
+	  // replace with blocked copy?
+	  if (!(ldbase_difflist_idx % kBitsPerWordD2)) {
+	    ldbase_raregeno_word = ldbase_raregeno[ldbase_difflist_idx / kBitsPerWordD2];
+	  }
+	  *merged_difflist_sample_ids_iter++ = ldbase_sample_idx;
+	  merged_raregeno_word |= (ldbase_raregeno_word & 3) << (2 * merge_idx_lowbits);
+	  if (merge_idx_lowbits++ == (kBitsPerWordD2 - 1)) {
+	    *merged_raregeno_iter++ = merged_raregeno_word;
+	    merged_raregeno_word = 0;
+	    merge_idx_lowbits = 0;
+	  }
+	  ++ldbase_difflist_idx;
+	  ldbase_raregeno_word >>= 2;
+	  ldbase_sample_idx = ldbase_difflist_sample_ids[ldbase_difflist_idx];
+	}
+	if (ldbase_sample_idx == sample_idx) {
+	  if (done) {
+	    if (merge_idx_lowbits) {
+	      *merged_raregeno_iter = merged_raregeno_word;
+	    }
+	    *ambig_id_ct_ptr = ambig_id_ct;
+	    *merged_difflist_len_ptr = (uint32_t)((uintptr_t)(merged_difflist_sample_ids_iter - merged_difflist_sample_ids));
+	    return kPglRetSuccess;
+	  }
+	  if (!(ldbase_difflist_idx % kBitsPerWordD2)) {
+	    ldbase_raregeno_word = ldbase_raregeno[ldbase_difflist_idx / kBitsPerWordD2];
+	  }
+	  ++ldbase_difflist_idx;
+	  ldbase_raregeno_word >>= 2;
+	  ldbase_sample_idx = ldbase_difflist_sample_ids[ldbase_difflist_idx];
+	}
+	if (cur_geno != ldbase_common_geno) {
+	  *merged_difflist_sample_ids_iter++ = sample_idx;
+	  merged_raregeno_word |= cur_geno << (2 * merge_idx_lowbits);
+	  if (merge_idx_lowbits++ == (kBitsPerWordD2 - 1)) {
+	    *merged_raregeno_iter++ = merged_raregeno_word;
+	    merged_raregeno_word = 0;
+	    merge_idx_lowbits = 0;
+	  }
+	}
+      }
+      if (ambig_sample_ids && (cur_geno == 3)) {
+	ambig_sample_ids[ambig_id_ct++] = (uint32_t)raw_sample_idx;
+      }
+      if (diff_from_ldbase_idx_lowbits == subgroup_len_m1) {
+	break;
+      }
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+      ++diff_from_ldbase_idx_lowbits;
+      diff_from_ldbase_raregeno_word >>= 2;
+    }
+  }
+}
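+
+// Worked example of the merge semantics above (values invented): with
+// ldbase_common_geno == 0, ldbase difflist {2 -> 1, 5 -> 2}, and
+// diff-from-ldbase list {5 -> 0, 7 -> 3}, the merged difflist is
+// {2 -> 1, 7 -> 3}: sample 5's new value equals the common genotype so its
+// entry is dropped, while untouched ldbase entries are copied through.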
+
+void pgr_difflist_to_genovec_unsafe(const uintptr_t* __restrict raregeno, const uint32_t* difflist_sample_ids, uintptr_t difflist_common_geno, uint32_t sample_ct, uint32_t difflist_len, uintptr_t* __restrict genovec) {
+  // Ok for trailing bits of raregeno to be nonzero.  Does not zero out
+  // trailing bits of genovec.
+  const uint32_t vec_ct = QUATERCT_TO_VECCT(sample_ct);
+  // could just memset up to word boundary; this should be a bit more
+  // vector-instruction-friendly, though?
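+  // (multiplying the 2-bit code by 0x55 replicates it across each byte:
+  // e.g. 2 * 0x55 = 0xAA = 0b10101010, so every entry starts at the common
+  // genotype)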
+  memset(genovec, (unsigned int)(difflist_common_geno * 0x55), vec_ct * kBytesPerVec);
+  const uintptr_t* raregeno_incr = raregeno;
+  uint32_t difflist_idx = 0;
+  uint32_t difflist_idx_stop = 0;
+  if (!difflist_common_geno) {
+    // faster inner loop since there's no existing value to mask out
+    // todo: verify that memset with "unknown" parameter, set to zero, is only
+    // a tiny bit slower than hardcoded memset zero
+    // todo: check if this should just be deleted since the code bloat causes
+    // too many more cache misses
+    while (1) {
+      difflist_idx_stop += kBitsPerWordD2;
+      if (difflist_idx_stop > difflist_len) {
+	if (difflist_idx == difflist_len) {
+	  return;
+	}
+	difflist_idx_stop = difflist_len;
+      }
+      uintptr_t raregeno_word = *raregeno_incr++;
+      for (; difflist_idx < difflist_idx_stop; ++difflist_idx) {
+	const uint32_t cur_sample_idx = difflist_sample_ids[difflist_idx];
+	genovec[cur_sample_idx / kBitsPerWordD2] |= (raregeno_word & 3) << (2 * (cur_sample_idx % kBitsPerWordD2));
+	raregeno_word >>= 2;
+      }
+    }
+  }
+  while (1) {
+    difflist_idx_stop += kBitsPerWordD2;
+    if (difflist_idx_stop > difflist_len) {
+      if (difflist_idx == difflist_len) {
+	return;
+      }
+      difflist_idx_stop = difflist_len;
+    }
+    uintptr_t raregeno_word = *raregeno_incr++;
+    for (; difflist_idx < difflist_idx_stop; ++difflist_idx) {
+      const uint32_t cur_sample_idx = difflist_sample_ids[difflist_idx];
+      ASSIGN_QUATERARR_ENTRY(cur_sample_idx, raregeno_word & 3, genovec);
+      raregeno_word >>= 2;
+    }
+  }
+}
+
+/*
+void pruned_difflist_to_genovec_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t sample_ct, uint32_t difflist_common_geno, uint32_t difflist_len, uintptr_t* __restrict genovec) {
+  // Designed to be used after genovec subsetting.  Assumes all difflist
+  // entries are valid.  Ok for trailing bits of raregeno to be nonzero.  Does
+  // not zero out trailing bits of genovec.
+  const uint32_t vec_ct = QUATERCT_TO_VECCT(sample_ct);
+  memset(genovec, difflist_common_geno * 0x55, vec_ct * kBytesPerVec);
+  if (!difflist_len) {
+    return;
+  }
+  const uintptr_t* raregeno_incr = raregeno;
+  const uint32_t* difflist_sample_ids_iter = difflist_sample_ids;
+  const uint32_t* difflist_sample_ids_end = &(difflist_sample_ids[difflist_len]);
+  // don't think there's a point to separating out the
+  // difflist_common_geno == 0 case here, since the raw_to_subsetted_pos
+  // operation is a bit expensive
+  while (1) {
+    // er, get rid of this undefined behavior if we uncomment this function
+    const uint32_t* difflist_sample_ids_stop = &(difflist_sample_ids_iter[kBitsPerWordD2]);
+    uintptr_t raregeno_word = *raregeno_incr++;
+    if (difflist_sample_ids_stop > difflist_sample_ids_end) {
+      if (difflist_sample_ids_iter == difflist_sample_ids_end) {
+	return;
+      }
+      difflist_sample_ids_stop = difflist_sample_ids_end;
+    }
+    while (1) {
+      const uint32_t cur_sample_idx = *difflist_sample_ids_iter;
+      const uint32_t cur_subsetted_pos = raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, cur_sample_idx);
+      ASSIGN_QUATERARR_ENTRY(cur_subsetted_pos, raregeno_word & 3, genovec);
+      if (difflist_sample_ids_iter++ == difflist_sample_ids_stop) {
+	break;
+      }
+      raregeno_word >>= 2;
+    }
+  }
+}
+*/
+
+pglerr_t parse_and_apply_difflist(const unsigned char* fread_end, uint32_t multiallelic_relevant, const unsigned char** fread_pp, pgen_reader_t* pgrp, uintptr_t* __restrict genovec) {
+  // Side effects: uses pgr.workspace_raregeno_tmp_loadbuf.
+  // Cannot occur after genovec subsetting since the difflist sample indexes
+  // will be incorrect.
+  // If multiallelic_relevant is true, a list of sample indices with freshly
+  // loaded raregeno value 0b11 is saved to pgr.workspace_ambig_sample_ids, and
+  // pgr.workspace_ambig_id_ct is set to the length of the list.
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  uintptr_t* cur_raregeno_iter = pgrp->workspace_raregeno_tmp_loadbuf;
+  const unsigned char* group_info_iter;
+  uint32_t difflist_len;
+  pglerr_t reterr = parse_difflist_header(fread_end, raw_sample_ct, fread_pp, cur_raregeno_iter, &group_info_iter, &difflist_len);
+  if (reterr || (!difflist_len)) {
+    return reterr;
+  }
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
+  uint32_t* ambig_sample_ids = multiallelic_relevant? pgrp->workspace_ambig_sample_ids : nullptr;
+  uintptr_t raw_sample_idx = 0;
+  uint32_t ambig_id_ct = 0;
+  uint32_t subgroup_idx = 0;
+  while (1) {
+    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
+    if (subgroup_idx >= subgroup_idx_last) {
+      if (subgroup_idx > subgroup_idx_last) {
+	pgrp->workspace_ambig_id_ct = ambig_id_ct;
+	return kPglRetSuccess;
+      }
+      remaining_deltas_in_subgroup &= difflist_len - 1;
+    }
+    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
+      raw_sample_idx = 0;
+      memcpy(&raw_sample_idx, group_info_iter, sample_id_byte_ct);
+      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    } else {
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+    }
+    ++subgroup_idx;
+    uintptr_t cur_raregeno_word = *cur_raregeno_iter++;
+    // This loop tends to be the decompression bottleneck.  Tried to modify it
+    // to process 4 entries at a time, but that didn't end up helping.
+    while (1) {
+      // always check, since otherwise ASSIGN_QUATERARR_ENTRY() can scribble
+      // over arbitrary memory
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+      const uintptr_t cur_geno = cur_raregeno_word & 3;
+      ASSIGN_QUATERARR_ENTRY(raw_sample_idx, cur_geno, genovec);
+      if (multiallelic_relevant && (cur_geno == 3)) {
+	ambig_sample_ids[ambig_id_ct++] = (uint32_t)raw_sample_idx;
+      }
+      if (!remaining_deltas_in_subgroup) {
+	break;
+      }
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+      --remaining_deltas_in_subgroup;
+      cur_raregeno_word >>= 2;
+    }
+  }
+}
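+
+// Hedged sketch of what ASSIGN_QUATERARR_ENTRY(idx, newval, arr) expands to
+// (the macro is defined elsewhere in this codebase; its exact form may
+// differ): clear, then set, the idx-th 2-bit entry:
+//   const uint32_t shift = 2 * (idx % kBitsPerWordD2);
+//   arr[idx / kBitsPerWordD2] =
+//     (arr[idx / kBitsPerWordD2] & (~((3 * k1LU) << shift))) |
+//     (((uintptr_t)(newval)) << shift);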
+
+// could merge parse_and_apply_difflist() with this?
+pglerr_t parse_and_apply_difflist_subset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t multiallelic_relevant, const unsigned char** fread_pp, pgen_reader_t* pgrp, uintptr_t* __restrict genovec) {
+  // Side effects: uses pgr.workspace_raregeno_tmp_loadbuf.
+  // If multiallelic_relevant is true, a list of sample indices with freshly
+  // loaded raregeno value 0b11 is saved to pgr.workspace_ambig_sample_ids, and
+  // pgr.workspace_ambig_id_ct is set to the length of the list.
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  if (sample_ct == raw_sample_ct) {
+    return parse_and_apply_difflist(fread_end, multiallelic_relevant, fread_pp, pgrp, genovec);
+  }
+  uintptr_t* cur_raregeno_iter = pgrp->workspace_raregeno_tmp_loadbuf;
+  const unsigned char* group_info_iter;
+  uint32_t difflist_len;
+  pglerr_t reterr = parse_difflist_header(fread_end, raw_sample_ct, fread_pp, cur_raregeno_iter, &group_info_iter, &difflist_len);
+  if (reterr || (!difflist_len)) {
+    return reterr;
+  }
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
+  uint32_t* ambig_sample_ids = multiallelic_relevant? pgrp->workspace_ambig_sample_ids : nullptr;
+  uintptr_t raw_sample_idx = 0;
+  uint32_t ambig_id_ct = 0;
+  uint32_t subgroup_idx = 0;
+  while (1) {
+    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
+    if (subgroup_idx >= subgroup_idx_last) {
+      if (subgroup_idx > subgroup_idx_last) {
+	pgrp->workspace_ambig_id_ct = ambig_id_ct;
+	return kPglRetSuccess;
+      }
+      remaining_deltas_in_subgroup &= difflist_len - 1;
+    }
+    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
+      raw_sample_idx = 0;
+      memcpy(&raw_sample_idx, group_info_iter, sample_id_byte_ct);
+      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    } else {
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+    }
+    ++subgroup_idx;
+    uintptr_t cur_raregeno_word = *cur_raregeno_iter++;
+    // This loop tends to be the decompression bottleneck.  Tried to modify it
+    // to process 4 entries at a time, but that didn't end up helping.
+    while (1) {
+      // always check, since otherwise ASSIGN_QUATERARR_ENTRY() can scribble
+      // over arbitrary memory
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+      const uintptr_t cur_geno = cur_raregeno_word & 3;
+      if (IS_SET(sample_include, raw_sample_idx)) {
+	ASSIGN_QUATERARR_ENTRY(raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, (uint32_t)raw_sample_idx), cur_geno, genovec);
+      }
+      if (multiallelic_relevant && (cur_geno == 3)) {
+	ambig_sample_ids[ambig_id_ct++] = (uint32_t)raw_sample_idx;
+      }
+      if (!remaining_deltas_in_subgroup) {
+	break;
+      }
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+      --remaining_deltas_in_subgroup;
+      cur_raregeno_word >>= 2;
+    }
+  }
+}
+
+pglerr_t parse_onebit_unsafe(const unsigned char* fread_end, uint32_t difflist_ambig_ids_needed, const unsigned char** fread_pp, pgen_reader_t* pgrp, uintptr_t* __restrict genovec) {
+  // doesn't zero out trailing genovec bits
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t common2_and_bitarray_byte_ct = (raw_sample_ct + 15) / CHAR_BIT;
+  if ((uintptr_t)(fread_end - (*fread_pp)) < common2_and_bitarray_byte_ct) {
+    return kPglRetMalformedInput;
+  }
+  const unsigned char* fread_difflist_start = &((*fread_pp)[common2_and_bitarray_byte_ct]);
+  const uintptr_t common2_code = *((*fread_pp)++);
+  const uintptr_t word_base = (common2_code / 4) * kMask5555;
+  const uintptr_t common_code_delta = common2_code & 3;
+  const uint32_t genovec_widx_trail = (raw_sample_ct + 7) / kBitsPerWordD2;
+  const uint32_t genovec_widx_end = QUATERCT_TO_WORDCT(raw_sample_ct);
+  uint32_t genovec_widx = 0;
+#ifdef __arm__
+  #error "Unaligned accesses in parse_onebit_unsafe()."
+#endif
+  const halfword_t* fread_alias = (const halfword_t*)(*fread_pp);
+  while (1) {
+    uintptr_t ww;
+    if (genovec_widx >= genovec_widx_trail) {
+      // might want to modify to not go here if last read is an entire halfword
+      if (genovec_widx == genovec_widx_end) {
+	break;
+      }
+      ww = 0;
+      memcpy(&ww, &(fread_alias[genovec_widx_trail]), 1 + (((raw_sample_ct - 1) % kBitsPerWordD2) / CHAR_BIT));
+    } else {
+      ww = (uintptr_t)(fread_alias[genovec_widx]);
+    }
+    // apply middle-out operation
+    // 64-bit:
+    //   const uintptr_t middle_out_result = (ww | (ww << 31)) & kMask5555;
+    // 32-bit:
+    //   *genovec_iter++ = word_base + (ww & kMask5555) * common_code_delta;
+    //   *genovec_iter++ = word_base + ((ww >> 1) & kMask5555) * common_code_delta;
+    // (scrapped for now since the time savings don't seem to be worth the
+    // extra end-of-vector corner cases, apparently the extra operations here
+    // are sufficiently cheap)
+    ww = unpack_halfword_to_word(ww);
+    genovec[genovec_widx++] = word_base + ww * common_code_delta;
+  }
+  *fread_pp = fread_difflist_start;
+  return parse_and_apply_difflist(fread_end, difflist_ambig_ids_needed, fread_pp, pgrp, genovec);
+}
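+
+// Hedged sketch of unpack_halfword_to_word() (defined elsewhere; the real
+// implementation may use vector shuffles or pdep): the classic
+// interleave-with-zeros sequence, sending bit i of the halfword to bit 2i,
+// so the word_base + ww * common_code_delta line above yields
+// (common2_code / 4) + bit * (common2_code & 3) in each 2-bit entry.
+// 64-bit variant:
+//   uintptr_t unpack_halfword_to_word_sketch(uintptr_t hw) {
+//     hw = (hw | (hw << 16)) & 0x0000ffff0000ffffLLU;
+//     hw = (hw | (hw << 8)) & 0x00ff00ff00ff00ffLLU;
+//     hw = (hw | (hw << 4)) & 0x0f0f0f0f0f0f0f0fLLU;
+//     hw = (hw | (hw << 2)) & 0x3333333333333333LLU;
+//     return (hw | (hw << 1)) & kMask5555;
+//   }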
+
+void extract_genoarr_ambig_ids(const uintptr_t* genoarr, uint32_t raw_sample_ct, uint32_t* __restrict ambig_sample_ids, uint32_t* ambig_id_ct_ptr) {
+#ifdef __arm__
+  #error "Unaligned accesses in extract_genoarr_ambig_ids() (genoarr may not be aligned)."
+#endif
+  // does not read trailing bytes of genoarr
+  const uint32_t word_ct_trail = (raw_sample_ct + 3) / kBitsPerWordD2;
+  const uint32_t word_ct_end = QUATERCT_TO_WORDCT(raw_sample_ct);
+  uint32_t ambig_id_ct = 0;
+  uint32_t widx = 0;
+  while (1) {
+    uintptr_t detect_11;
+    if (widx >= word_ct_trail) {
+      if (widx == word_ct_end) {
+	*ambig_id_ct_ptr = ambig_id_ct;
+	return;
+      }
+      detect_11 = 0;
+      memcpy(&detect_11, &(genoarr[widx]), QUATERCT_TO_BYTECT(raw_sample_ct % kBitsPerWordD2));
+    } else {
+      detect_11 = genoarr[widx];
+    }
+    detect_11 = detect_11 & (detect_11 >> 1) & kMask5555;
+    // now detect_11 has a set bit iff the genoarr entry is 0b11
+    if (detect_11) {
+      const uint32_t sample_idx_base = widx * kBitsPerWordD2;
+      do {
+	const uint32_t sample_idx_lowbits = CTZLU(detect_11) / 2;
+	ambig_sample_ids[ambig_id_ct++] = sample_idx_base + sample_idx_lowbits;
+	detect_11 &= detect_11 - 1;
+      } while (detect_11);
+    }
+  }
+}
+
+pglerr_t parse_1or2bit_genovec_unsafe(const unsigned char* fread_end, uint32_t vrtype, uint32_t difflist_ambig_ids_needed, const unsigned char** fread_pp, pgen_reader_t* pgrp, uintptr_t* __restrict genovec) {
+  // Side effect: may use pgrp->workspace_raregeno_tmp_loadbuf.
+  // Does not update fp_vidx, does not rotate plink1-formatted data (since it's
+  // better to do that post-subsetting)
+  // If difflist_ambig_ids_needed is set, pgr.workspace_ambig_sample_ids and
+  // pgr.workspace_ambig_id_ct are filled with the sample IDs
+  // corresponding to aux track 1.
+  assert((!difflist_ambig_ids_needed) || vrtype_multiallelic(vrtype));
+  if (vrtype & 3) {
+    return parse_onebit_unsafe(fread_end, difflist_ambig_ids_needed, fread_pp, pgrp, genovec);
+  }
+  // uncompressed storage
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t genovec_byte_ct = QUATERCT_TO_BYTECT(raw_sample_ct);
+  if ((uintptr_t)(fread_end - (*fread_pp)) < genovec_byte_ct) {
+    return kPglRetMalformedInput;
+  }
+  const unsigned char* new_fread_ptr = &((*fread_pp)[genovec_byte_ct]);
+  memcpy(genovec, *fread_pp, genovec_byte_ct);
+  *fread_pp = new_fread_ptr;
+  if (difflist_ambig_ids_needed) {
+    extract_genoarr_ambig_ids(genovec, raw_sample_ct, pgrp->workspace_ambig_sample_ids, &(pgrp->workspace_ambig_id_ct));
+  }
+  return kPglRetSuccess;
+}
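+
+// vrtype dispatch summary, as exercised in this file (the low 3 bits select
+// the main track's representation): 0 = uncompressed 2-bit array; 1 = 1-bit
+// two-common-genotype track plus patch difflist (parse_onebit_unsafe());
+// 2/3 = LD-compressed difflist against the most recent non-LD variant, with
+// 3 additionally inverted; 4..7 = pure difflist whose common genotype is
+// (vrtype & 3).  Higher bits flag auxiliary tracks (e.g.
+// vrtype_multiallelic()).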
+
+pglerr_t parse_non_ld_genovec_subset_unsafe(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vrtype, uint32_t difflist_ambig_ids_needed, const unsigned char** fread_pp, pgen_reader_t* pgrp, uintptr_t* __restrict genovec) {
+  // Side effects:
+  //   may use pgrp->workspace_raregeno_tmp_loadbuf
+  //   may use pgrp->workspace_vec (subsetting)
+  // See comments on parse_1or2bit_genovec_unsafe().
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  if (!vrtype_difflist(vrtype)) {
+    const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+    uintptr_t* raw_genovec = subsetting_required? pgrp->workspace_vec : genovec;
+    pglerr_t reterr = parse_1or2bit_genovec_unsafe(fread_end, vrtype, difflist_ambig_ids_needed, fread_pp, pgrp, raw_genovec);
+    if ((!subsetting_required) || reterr) {
+      return reterr;
+    }
+    copy_quaterarr_nonempty_subset(raw_genovec, sample_include, raw_sample_ct, sample_ct, genovec);
+    return kPglRetSuccess;
+  }
+  assert((!difflist_ambig_ids_needed) || vrtype_multiallelic(vrtype));
+  const uint32_t vrtype_low2 = vrtype & 3;
+  const uint32_t vec_ct = QUATERCT_TO_VECCT(sample_ct);
+  memset(genovec, vrtype_low2 * 0x55, vec_ct * kBytesPerVec);
+  return parse_and_apply_difflist_subset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, difflist_ambig_ids_needed, fread_pp, pgrp, genovec);
+}
+
+pglerr_t init_read_ptrs(uint32_t vidx, pgen_reader_t* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp) {
+  const unsigned char* block_base = pgrp->fi.block_base;
+  if (block_base != nullptr) {
+    // possible todo: special handling of end of vblock
+    const uint64_t block_offset = pgrp->fi.block_offset;
+    *fread_pp = &(block_base[get_pgfi_fpos(&(pgrp->fi), vidx) - block_offset]);
+    *fread_endp = &(block_base[get_pgfi_fpos(&(pgrp->fi), vidx + 1) - block_offset]);
+    pgrp->fp_vidx = vidx + 1;
+    return kPglRetSuccess;
+  }
+  if (pgrp->fp_vidx != vidx) {
+    if (fseeko(pgrp->ff, get_pgfi_fpos(&(pgrp->fi), vidx), SEEK_SET)) {
+      return kPglRetReadFail;
+    }
+  }
+  const uintptr_t cur_vrec_width = get_pgfi_vrec_width(&(pgrp->fi), vidx);
+#ifdef __LP64__
+  if (fread_checked(pgrp->fread_buf, cur_vrec_width, pgrp->ff)) {
+    return kPglRetReadFail;
+  }
+#else
+  // cur_vrec_width < 2^31 since otherwise we error out on initialization
+  if (!fread(pgrp->fread_buf, cur_vrec_width, 1, pgrp->ff)) {
+    return kPglRetReadFail;
+  }
+#endif
+  *fread_pp = pgrp->fread_buf;
+  *fread_endp = &(pgrp->fread_buf[cur_vrec_width]);
+  pgrp->fp_vidx = vidx + 1;
+  return kPglRetSuccess;
+}
+
+uint32_t ld_load_necessary(uint32_t cur_vidx, pgen_reader_t* pgrp) {
+  // Determines whether LD base variant needs to be loaded (in addition to the
+  // current variant).  Updates pgrp->ldbase_vidx when necessary.
+  // possible todo: add the vidx_word_stop optimization to get_ldbase_vidx() so
+  // this can call it instead of duplicating code (this forces
+  // pgfi_block_load() to pass a fp_vidx parameter of zero).
+  const uint32_t fp_vidx = pgrp->ldbase_stypes? pgrp->fp_vidx : 0;
+  if (cur_vidx == fp_vidx) {
+    // ldbase variant guaranteed to be up-to-date if we didn't skip the last
+    // variant, and cache wasn't cleared
+    return 0;
+  }
+  // Find the last vrtypes[] value before vrtypes[cur_vidx] with bit 1 unset or
+  // bit 2 set.
+  const uintptr_t* vrtypes_walias = (const uintptr_t*)pgrp->fi.vrtypes;
+  const uint32_t cur_vidx_orig_remainder = cur_vidx % kBytesPerWord;
+  uint32_t vidx_word_idx = (cur_vidx - 1) / kBytesPerWord;
+  uintptr_t cur_vrtypes_word = vrtypes_walias[vidx_word_idx];
+  if (cur_vidx_orig_remainder) {
+    // make sure we don't detect a byte after the current position.
+    cur_vrtypes_word &= (k1LU << (CHAR_BIT * cur_vidx_orig_remainder)) - k1LU;
+    cur_vrtypes_word |= (kMask0101 * 2) << (CHAR_BIT * cur_vidx_orig_remainder);
+  }
+  const uint32_t vidx_word_stop = (fp_vidx < cur_vidx)? (fp_vidx / kBytesPerWord) : 0;
+  while (1) {
+    // ((bit 2) OR (NOT bit 1)) for each byte.  (possible experiment: see if
+    // the same assembly is generated if this expression is rewritten to use
+    // ands/nots.)
+    uintptr_t detect_non_ld_word = ((cur_vrtypes_word >> 1) | (~cur_vrtypes_word)) & (kMask0101 * 2);
+
+    if (detect_non_ld_word) {
+      // find the highest-order set bit in detect_non_ld_word; this corresponds
+      // to the last non-LD-compressed byte (assuming little-endian).
+      const uint32_t old_ldbase_vidx = pgrp->ldbase_vidx;
+      const uint32_t new_ldbase_vidx_loworder = kBytesPerWord - 1 - (CLZLU(detect_non_ld_word) / CHAR_BIT);
+      const uint32_t new_ldbase_vidx = (vidx_word_idx * kBytesPerWord) + new_ldbase_vidx_loworder;
+      pgrp->ldbase_vidx = new_ldbase_vidx;
+      return (old_ldbase_vidx != new_ldbase_vidx);
+    }
+    // everything LD-compressed in the current block.  move back 8 bytes in the
+    // array (or 4-bytes for 32-bit build).
+    if (vidx_word_idx == vidx_word_stop) {
+      return 0;
+    }
+    --vidx_word_idx;
+    cur_vrtypes_word = vrtypes_walias[vidx_word_idx];
+  }
+}
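+
+// Worked example (64-bit build, invented vrtypes): with cur_vidx == 5 and
+// vrtypes[0..5] == {0x00, 0x02, 0x05, 0x02, 0x03, ...}, bytes 3 and 4 are
+// LD-compressed (bit 1 set, bit 2 clear) while byte 2 is not (0x05 has bit 1
+// clear), so the scan above sets ldbase_vidx to 2.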
+
+// loads ldbase variant if necessary, unpacks to genovec if necessary (the
+// latter happens even if the variant was already loaded)
+// may use workspace_vec
+pglerr_t ld_load_genovec_subset_if_necessary(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp) {
+  if (ld_load_necessary(vidx, pgrp)) {
+    const uint32_t ldbase_vidx = pgrp->ldbase_vidx;
+    const unsigned char* fread_ptr;
+    const unsigned char* fread_end;
+    if (init_read_ptrs(ldbase_vidx, pgrp, &fread_ptr, &fread_end)) {
+      return kPglRetReadFail;
+    }
+    pgrp->ldbase_stypes = kfPgrLdcacheQuater;
+    return parse_non_ld_genovec_subset_unsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, pgrp->fi.vrtypes[ldbase_vidx], 0, &fread_ptr, pgrp, pgrp->ldbase_genovec);
+  }
+  if (!(pgrp->ldbase_stypes & kfPgrLdcacheQuater)) {
+    assert(pgrp->ldbase_stypes & kfPgrLdcacheDifflist);
+    pgr_difflist_to_genovec_unsafe(pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3, sample_ct, pgrp->ldbase_difflist_len, pgrp->ldbase_genovec);
+    pgrp->ldbase_stypes |= kfPgrLdcacheQuater;
+  }
+  return kPglRetSuccess;
+}
+
+// fread_pp should be non-null iff this is being called by an internal function
+// as part of a multiallelic variant read
+pglerr_t read_refalt1_genovec_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec) {
+  // Side effects:
+  //   may use pgr.workspace_vec iff subsetting required
+  //   may use pgr.workspace_raregeno_tmp_loadbuf (any difflist)
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  const uint32_t maintrack_vrtype = vrtype & 7;
+  const uint32_t multiallelic_relevant = fread_pp && vrtype_multiallelic(vrtype);
+  if (vrtype_ld_compressed(maintrack_vrtype)) {
+    // LD compression
+    pglerr_t reterr = ld_load_genovec_subset_if_necessary(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp);
+    if (reterr) {
+      return reterr;
+    }
+    const unsigned char* fread_ptr;
+    const unsigned char* fread_end;
+    if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+      return kPglRetReadFail;
+    }
+    copy_quaterarr(pgrp->ldbase_genovec, sample_ct, genovec);
+    reterr = parse_and_apply_difflist_subset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, multiallelic_relevant, &fread_ptr, pgrp, genovec);
+    if (reterr) {
+      return reterr;
+    }
+    if (maintrack_vrtype == 3) {
+      genovec_invert_unsafe(sample_ct, genovec);
+    }
+    if (fread_pp) {
+      *fread_pp = fread_ptr;
+      *fread_endp = fread_end;
+    }
+    return kPglRetSuccess;
+  }
+  const unsigned char* fread_ptr;
+  const unsigned char* fread_end = nullptr; // maybe-uninitialized warning
+  // tried inserting special-case code for the plink1 case to avoid a copy, and
+  // it was actually slower
+  if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+    return kPglRetReadFail;
+  }
+  // tried to add more sophisticated caching, but turns out it isn't worth it
+  pglerr_t reterr = parse_non_ld_genovec_subset_unsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, maintrack_vrtype, multiallelic_relevant, &fread_ptr, pgrp, genovec);
+  if (reterr) {
+    return reterr;
+  }
+  const uint32_t is_ldbase = pgrp->fi.vrtypes && vrtype_ld_compressed(pgrp->fi.vrtypes[vidx + 1]);
+  if (is_ldbase) {
+    copy_quaterarr(genovec, sample_ct, pgrp->ldbase_genovec);
+    pgrp->ldbase_vidx = vidx;
+    pgrp->ldbase_stypes = kfPgrLdcacheQuater;
+  }
+  if (vrtype == kPglVrtypePlink1) {
+    pgr_plink1_to_plink2_inplace_unsafe(sample_ct, genovec);
+  } else if (fread_pp) {
+    *fread_pp = fread_ptr;
+    *fread_endp = fread_end;
+  }
+  return kPglRetSuccess;
+}
+
+pglerr_t pgr_read_refalt1_genovec_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec) {
+  assert(vidx < pgrp->fi.raw_variant_ct);
+  if (!sample_ct) {
+    return kPglRetSuccess;
+  }
+  return read_refalt1_genovec_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, genovec);
+}
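+
+// Hedged usage sketch (allocation and error handling elided; 0/1/2/3 =
+// hom-ref/het/hom-alt1/missing is an assumption consistent with the plink1
+// remapping comments above):
+//   // genovec: QUATERCT_TO_WORDCT(sample_ct) words, vector-aligned
+//   for (uint32_t vidx = 0; vidx < variant_ct; ++vidx) {
+//     pglerr_t reterr = pgr_read_refalt1_genovec_subset_unsafe(
+//         sample_include, sample_include_cumulative_popcounts, sample_ct,
+//         vidx, pgrp, genovec);
+//     if (reterr) {
+//       break;  // kPglRetMalformedInput, kPglRetReadFail, ...
+//     }
+//     // ... inspect the 2-bit entries of genovec ...
+//   }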
+
+/*
+void copy_and_subset_difflist(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raw_raregeno, const uint32_t* __restrict raw_difflist_sample_ids, uint32_t raw_difflist_len, uintptr_t* __restrict new_raregeno, uint32_t* __restrict new_difflist_sample_ids, uint32_t* __restrict new_difflist_len_ptr) {
+  // Trailing bits of new_raregeno are zeroed out.
+  if (!raw_difflist_len) {
+    *new_difflist_len_ptr = 0;
+    return;
+  }
+  const uintptr_t* raw_raregeno_incr = raw_raregeno;
+  const uint32_t* raw_difflist_sample_ids_iter = raw_difflist_sample_ids;
+  const uint32_t* raw_difflist_sample_ids_last = &(raw_difflist_sample_ids[round_down_pow2(raw_difflist_len - 1, kBitsPerWordD2)]);
+  uintptr_t* new_raregeno_incr = new_raregeno;
+  uintptr_t new_raregeno_word = 0;
+  uint32_t new_difflist_len = 0;
+  uint32_t block_len_m1 = kBitsPerWordD2 - 1;
+  while (1) {
+    if (raw_difflist_sample_ids_iter >= raw_difflist_sample_ids_last) {
+      if (raw_difflist_sample_ids_iter > raw_difflist_sample_ids_last) {
+	if (new_difflist_len % kBitsPerWordD2) {
+	  *new_raregeno_incr = new_raregeno_word;
+	}
+	*new_difflist_len_ptr = new_difflist_len;
+	return;
+      }
+      block_len_m1 &= raw_difflist_len - 1;
+    }
+    uintptr_t raw_raregeno_word = *raw_raregeno_incr++;
+    uint32_t raw_difflist_idx_lowbits = 0;
+    while (1) {
+      const uint32_t raw_sample_idx = raw_difflist_sample_ids_iter[raw_difflist_idx_lowbits];
+      if (IS_SET(sample_include, raw_sample_idx)) {
+	new_difflist_sample_ids[new_difflist_len] = raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, raw_sample_idx);
+	new_raregeno_word |= ((raw_raregeno_word >> (2 * raw_difflist_idx_lowbits)) & 3) << (2 * (new_difflist_len % kBitsPerWordD2));
+	++new_difflist_len;
+	if (!(new_difflist_len % kBitsPerWordD2)) {
+	  *new_raregeno_incr++ = new_raregeno_word;
+	  new_raregeno_word = 0;
+	}
+      }
+      if (raw_difflist_idx_lowbits == block_len_m1) {
+	break;
+      }
+      ++raw_difflist_idx_lowbits;
+    }
+    raw_difflist_sample_ids_iter = &(raw_difflist_sample_ids_iter[kBitsPerWordD2]);
+  }
+}
+*/
+
+// populates pgrp->ldbase_genovec or
+// pgrp->ldbase_{raregeno,difflist_sample_ids,difflist_len}, depending on
+// storage type
+// requires workspace_vec
+pglerr_t ld_load_minimal_subset_if_necessary(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp) {
+  if (!ld_load_necessary(vidx, pgrp)) {
+    return kPglRetSuccess;
+  }
+  const uint32_t ldbase_vidx = pgrp->ldbase_vidx;
+  const uint64_t cur_vidx_fpos = pgrp->fi.var_fpos[ldbase_vidx];
+  const uint32_t ldbase_vrtype = pgrp->fi.vrtypes[ldbase_vidx];
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+  uintptr_t* raw_genovec = subsetting_required? pgrp->workspace_vec : pgrp->ldbase_genovec;
+  const unsigned char* fread_ptr;
+  const unsigned char* fread_end;
+  const unsigned char* block_base = pgrp->fi.block_base;
+  pglerr_t reterr = kPglRetSuccess;
+  if (block_base != nullptr) {
+    {
+      const uint64_t block_offset = pgrp->fi.block_offset;
+      fread_ptr = &(block_base[cur_vidx_fpos - block_offset]);
+      fread_end = &(block_base[pgrp->fi.var_fpos[ldbase_vidx + 1] - block_offset]);
+    }
+    if (!vrtype_difflist(ldbase_vrtype)) {
+      pgrp->ldbase_stypes = kfPgrLdcacheQuater;
+      reterr = parse_1or2bit_genovec_unsafe(fread_end, ldbase_vrtype, 0, &fread_ptr, pgrp, raw_genovec);
+    ld_load_minimal_subset_if_necessary_genovec_finish:
+      if ((!subsetting_required) || reterr) {
+	return reterr;
+      }
+      copy_quaterarr_nonempty_subset(raw_genovec, sample_include, raw_sample_ct, sample_ct, pgrp->ldbase_genovec);
+      return kPglRetSuccess;
+    }
+  } else {
+    if (fseeko(pgrp->ff, pgrp->fi.var_fpos[ldbase_vidx], SEEK_SET)) {
+      return kPglRetReadFail;
+    }
+    pgrp->ldbase_stypes = kfPgrLdcacheQuater;
+    if (!(ldbase_vrtype & 7)) {
+      // don't actually need to fread the whole record in this case
+      if (!fread(raw_genovec, QUATERCT_TO_BYTECT(raw_sample_ct), 1, pgrp->ff)) {
+	return kPglRetReadFail;
+      }
+      goto ld_load_minimal_subset_if_necessary_genovec_finish;
+    }
+    const uintptr_t cur_vrec_width = (uintptr_t)(pgrp->fi.var_fpos[ldbase_vidx + 1] - cur_vidx_fpos);
+    if (!fread(pgrp->fread_buf, cur_vrec_width, 1, pgrp->ff)) {
+      return kPglRetReadFail;
+    }
+    fread_ptr = pgrp->fread_buf;
+    fread_end = &(pgrp->fread_buf[cur_vrec_width]);
+    if (!vrtype_difflist(ldbase_vrtype)) {
+      reterr = parse_onebit_unsafe(fread_end, 0, &fread_ptr, pgrp, raw_genovec);
+      goto ld_load_minimal_subset_if_necessary_genovec_finish;
+    }
+  }
+  uint32_t ldbase_difflist_len;
+  if (!subsetting_required) {
+    reterr = parse_and_save_difflist(fread_end, raw_sample_ct, &fread_ptr, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, &ldbase_difflist_len);
+  } else {
+    reterr = parse_and_save_difflist_proper_subset(fread_end, sample_include, sample_include_cumulative_popcounts, raw_sample_ct, &fread_ptr, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, &ldbase_difflist_len, pgrp->workspace_raregeno_tmp_loadbuf);
+  }
+  if (reterr) {
+    return reterr;
+  }
+  pgrp->ldbase_difflist_len = ldbase_difflist_len;
+  pgrp->ldbase_difflist_sample_ids[ldbase_difflist_len] = sample_ct;
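+  // sentinel; lets parse_ld_and_merge_difflist_subset() walk this list
+  // without separate length checks (see the assert at its top)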
+  pgrp->ldbase_stypes = kfPgrLdcacheDifflist;
+  pgrp->fp_vidx = ldbase_vidx + 1;
+  return kPglRetSuccess;
+}
+
+pglerr_t read_refalt1_difflist_or_genovec_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, pgen_reader_t* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
+  assert(vidx < pgrp->fi.raw_variant_ct);
+  assert(sample_ct);
+  // Side effects:
+  //   may use pgr.workspace_vec or workspace_difflist_sample_ids_tmp iff
+  //     subsetting required.
+  //   may use pgr.workspace_raregeno_tmp_loadbuf
+  // Trailing bits of genovec/main_raregeno may not be zeroed out.
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  const uint32_t maintrack_vrtype = vrtype & 7;
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+  const uint32_t multiallelic_relevant = fread_pp && vrtype_multiallelic(vrtype);
+  if (vrtype_ld_compressed(maintrack_vrtype)) {
+    // LD compression
+
+    // note that this can currently load a difflist longer than
+    // max_simple_difflist_len
+    pglerr_t reterr = ld_load_minimal_subset_if_necessary(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp);
+    if (reterr) {
+      return reterr;
+    }
+    const unsigned char* fread_ptr;
+    const unsigned char* fread_end;
+    if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+      return kPglRetReadFail;
+    }
+    const uint32_t ld_invert = (maintrack_vrtype == 3);
+    if (pgrp->ldbase_stypes & kfPgrLdcacheDifflist) {
+      const uint32_t ldbase_common_geno = pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3;
+      // no need to branch on the LD difflist length here, since it's
+      // limited to 3/4 of the ldbase difflist length.
+      *difflist_common_geno_ptr = ldbase_common_geno;
+      reterr = parse_ld_and_merge_difflist_subset(fread_end, subsetting_required? sample_include : nullptr, sample_include_cumulative_popcounts, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->ldbase_difflist_len, ldbase_common_geno, raw_sample_ct, sample_ct, &fread_ptr, multiallelic_relevant? pgrp->workspace_ambig_sample_ids : nullptr, main_raregeno, difflist_sample_ids, difflist_len_ptr, &(pgrp->workspace_ambig_id_ct), pgrp->workspace_raregeno_tmp_loadbuf);
+      if (reterr) {
+	return reterr;
+      }
+      if (ld_invert) {
+	*difflist_common_geno_ptr = (6 - ldbase_common_geno) & 3;
+	genovec_invert_unsafe(*difflist_len_ptr, main_raregeno);
+      }
+      return kPglRetSuccess;
+    }
+    assert(pgrp->ldbase_stypes & kfPgrLdcacheQuater);
+    *difflist_common_geno_ptr = 0xffffffffU;
+    copy_quaterarr(pgrp->ldbase_genovec, sample_ct, genovec);
+    reterr = parse_and_apply_difflist_subset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, multiallelic_relevant, &fread_ptr, pgrp, genovec);
+    if (reterr) {
+      return reterr;
+    }
+    if (ld_invert) {
+      genovec_invert_unsafe(sample_ct, genovec);
+    }
+    if (fread_pp) {
+      *fread_pp = fread_ptr;
+      *fread_endp = fread_end;
+    }
+    return kPglRetSuccess;
+  }
+  const unsigned char* fread_ptr;
+  const unsigned char* fread_end = nullptr; // maybe-uninitialized warning
+  if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+    return kPglRetReadFail;
+  }
+  const uint32_t is_ldbase = pgrp->fi.vrtypes && vrtype_ld_compressed(pgrp->fi.vrtypes[vidx + 1]);
+  const uint32_t saved_difflist_len = vrtype_difflist(vrtype)? peek_vint31(fread_ptr, fread_end) : raw_sample_ct;
+  pgrp->ldbase_vidx = vidx;
+  // no limit is slightly better than /16 but substantially worse than /32 on
+  // the large test dataset (/64 is slightly worse than /32)
+  // no limit is best on the small test dataset
+  if (saved_difflist_len > max_simple_difflist_len) {
+    *difflist_common_geno_ptr = 0xffffffffU;
+    pglerr_t reterr = parse_non_ld_genovec_subset_unsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vrtype, multiallelic_relevant, &fread_ptr, pgrp, genovec);
+    if (reterr) {
+      return reterr;
+    }
+    if (is_ldbase) {
+      copy_quaterarr(genovec, sample_ct, pgrp->ldbase_genovec);
+      pgrp->ldbase_stypes = kfPgrLdcacheQuater;
+    }
+    if (vrtype == kPglVrtypePlink1) {
+      pgr_plink1_to_plink2_inplace_unsafe(sample_ct, genovec);
+    }
+    if (fread_pp) {
+      *fread_pp = fread_ptr;
+      *fread_endp = fread_end;
+    }
+    return kPglRetSuccess;
+  }
+  *difflist_common_geno_ptr = vrtype & 3;
+  if (parse_and_save_difflist_subset(fread_end, subsetting_required? sample_include : nullptr, sample_include_cumulative_popcounts, raw_sample_ct, &fread_ptr, multiallelic_relevant? pgrp->workspace_ambig_sample_ids : nullptr, main_raregeno, difflist_sample_ids, difflist_len_ptr, &(pgrp->workspace_ambig_id_ct), pgrp->workspace_raregeno_tmp_loadbuf)) {
+    return kPglRetMalformedInput;
+  }
+  if (is_ldbase) {
+    const uint32_t difflist_len = *difflist_len_ptr;
+    pgrp->ldbase_stypes = kfPgrLdcacheDifflist;
+    pgrp->ldbase_difflist_len = difflist_len;
+    copy_quaterarr(main_raregeno, difflist_len, pgrp->ldbase_raregeno);
+    memcpy(pgrp->ldbase_difflist_sample_ids, difflist_sample_ids, difflist_len * sizeof(int32_t));
+    pgrp->ldbase_difflist_sample_ids[difflist_len] = sample_ct;
+  }
+  if (fread_pp) {
+    *fread_pp = fread_ptr;
+    *fread_endp = fread_end;
+  }
+  return kPglRetSuccess;
+}
+
+pglerr_t pgr_read_refalt1_difflist_or_genovec_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
+  assert(vidx < pgrp->fi.raw_variant_ct);
+  if (!sample_ct) {
+    *difflist_common_geno_ptr = 0xffffffffU;
+    return kPglRetSuccess;
+  }
+  return read_refalt1_difflist_or_genovec_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, max_simple_difflist_len, vidx, pgrp, nullptr, nullptr, genovec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr);
+}
+
+pglerr_t ld_subset_adjust_genocounts(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict ldbase_genovec, uint32_t raw_sample_ct, const unsigned char** fread_pp, uint32_t* __restrict genocounts, uint32_t* __restrict ambig_sample_ids, uint32_t* __restrict ambig_id_ct_ptr, uint32_t* __restrict ambig_id_ct_filtered_ptr, uintptr_t* __restrict raregeno_workspace) {
+  // * Assumes genocounts[] is initialized to the proper values for the LD
+  //   reference variant (including subsetting).
+  // * Tried a hybrid implementation which allowed the base variant to be saved
+  //   as a difflist; turns out it's practically always better to unpack to a
+  //   genovec first.
+  // * ambig_sample_ids should be nullptr if it doesn't need to be filled.
+  //   Note that, for a multiallelic variant, we don't need ambig_sample_ids to
+  //   be filled unless we're also looking at a subset of the samples.  If we
+  //   skip filling ambig_sample_ids, ambig_id_ct will be zero, but
+  //   ambig_id_ct_filtered will contain the correct value.  (Strangely, the
+  //   function slows down substantially if I try to conditionally assign it to
+  //   ambig_id_ct instead of returning it separately.)
+  // * This is the main frequency-counting bottleneck.
+  uint32_t raw_difflist_len;
+  const unsigned char* group_info_iter;
+  pglerr_t reterr = parse_difflist_header(fread_end, raw_sample_ct, fread_pp, raregeno_workspace, &group_info_iter, &raw_difflist_len);
+  if (reterr || (!raw_difflist_len)) {
+    *ambig_id_ct_ptr = 0;
+    // assumes ambig_id_ct_filtered is initialized to zero
+    return reterr;
+  }
+  const uint32_t subgroup_idx_last = (raw_difflist_len - 1) / kBitsPerWordD2;
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  uintptr_t* raregeno_workspace_iter = raregeno_workspace;
+  uintptr_t raw_sample_idx = 0;
+  uint32_t subgroup_idx = 0;
+  uint32_t ambig_id_ct = 0;
+  uint32_t delta_counts[16];
+  fill_uint_zero(16, delta_counts);
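+  // delta_counts[new_geno * 4 + old_geno] counts samples whose genotype
+  // changed from old_geno (LD base) to new_geno (current variant); in the
+  // closing arithmetic below, genocounts[0] gains indices 1/2/3 (new geno 0)
+  // and loses indices 4/8/12 (old geno 0), and similarly for the others.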
+  while (1) {
+    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
+    if (subgroup_idx >= subgroup_idx_last) {
+      if (subgroup_idx > subgroup_idx_last) {
+	*ambig_id_ct_ptr = ambig_id_ct;
+	*ambig_id_ct_filtered_ptr = delta_counts[12] + delta_counts[13] + delta_counts[14] + delta_counts[15];
+	const int32_t incr0 = (int32_t)(delta_counts[1] + delta_counts[2] + delta_counts[3] - delta_counts[4] - delta_counts[8] - delta_counts[12]);
+	const int32_t incr1 = (int32_t)(delta_counts[4] + delta_counts[6] + delta_counts[7] - delta_counts[1] - delta_counts[9] - delta_counts[13]);
+	const int32_t incr2 = (int32_t)(delta_counts[8] + delta_counts[9] + delta_counts[11] - delta_counts[2] - delta_counts[6] - delta_counts[14]);
+	genocounts[0] += incr0;
+	genocounts[1] += incr1;
+	genocounts[2] += incr2;
+	genocounts[3] -= incr0 + incr1 + incr2;
+	return kPglRetSuccess;
+      }
+      remaining_deltas_in_subgroup &= raw_difflist_len - 1;
+    }
+    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
+#ifdef __LP64__
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+#endif
+      raw_sample_idx = 0;
+      memcpy(&raw_sample_idx, group_info_iter, sample_id_byte_ct);
+      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    } else {
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+    }
+    ++subgroup_idx;
+    uintptr_t cur_raregeno_word = *raregeno_workspace_iter++;
+    while (1) {
+#ifndef __LP64__
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+#endif
+      const uintptr_t cur_geno = cur_raregeno_word & 3;
+      if (!sample_include) {
+	delta_counts[cur_geno * 4 + GET_QUATERARR_ENTRY(ldbase_genovec, raw_sample_idx)] += 1;
+      } else if (IS_SET(sample_include, raw_sample_idx)) {
+	const uint32_t sample_idx = raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, (uint32_t)raw_sample_idx);
+	delta_counts[cur_geno * 4 + GET_QUATERARR_ENTRY(ldbase_genovec, sample_idx)] += 1;
+      }
+      if (ambig_sample_ids && (cur_geno == 3)) {
+	ambig_sample_ids[ambig_id_ct++] = (uint32_t)raw_sample_idx;
+      }
+      if (!remaining_deltas_in_subgroup) {
+	break;
+      }
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+      --remaining_deltas_in_subgroup;
+      cur_raregeno_word >>= 2;
+    }
+  }
+}
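+
+// [Editor's note: the following is an illustrative sketch, not part of
+// upstream pgenlib.  It restates the delta_counts[] bookkeeping above in
+// plain form: each difflist entry moves one sample from LD-base genotype
+// 'old' to genotype 'new', so tallying the 4x4 transition table lets us
+// adjust all four genotype counts at the end instead of recounting the
+// whole vector.  The unrolled incr0/incr1/incr2 expressions above are this
+// loop with the (cancelling) diagonal terms removed and genocounts[3]
+// computed as the residual.]
+static void adjust_genocounts_sketch(const unsigned char* old_genos, const unsigned char* new_genos, uint32_t transition_ct, uint32_t* genocounts) {
+  uint32_t delta_counts[16];
+  fill_uint_zero(16, delta_counts);
+  for (uint32_t uii = 0; uii < transition_ct; ++uii) {
+    // same indexing as above: the new genotype selects the row, the old
+    // genotype the column
+    delta_counts[new_genos[uii] * 4 + old_genos[uii]] += 1;
+  }
+  for (uint32_t geno = 0; geno < 4; ++geno) {
+    int32_t incr = 0;
+    for (uint32_t old_geno = 0; old_geno < 4; ++old_geno) {
+      incr += (int32_t)delta_counts[geno * 4 + old_geno];  // moved into geno
+      incr -= (int32_t)delta_counts[old_geno * 4 + geno];  // moved out of geno
+    }
+    genocounts[geno] = (uint32_t)(((int32_t)genocounts[geno]) + incr);
+  }
+}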
+
+uint32_t bytesum_arr(const unsigned char* bytearr, uint32_t byte_ct) {
+  // Assumes sum < 2^32.
+  // This is only slightly slower than SSE2 code, while tolerating an unaligned
+  // starting address.
+#ifdef __arm__
+  #error "Unaligned accesses in bytesum_arr()."
+#endif
+  const uint32_t word_ct = byte_ct / kBytesPerWord;
+  const uintptr_t* bytearr_alias_iter = (const uintptr_t*)bytearr;
+  // 128-word blocks keep the 16-bit accumulator lanes from overflowing even
+  // in the all-0xFF worst case (2 * 255 * 128 = 65280 < 65536); 256-word
+  // blocks can carry out of the lanes before the fold.
+  const uint32_t wordblock_idx_trail = word_ct / 128;
+  const uint32_t wordblock_idx_end = DIV_UP(word_ct, 128);
+  uint32_t wordblock_idx = 0;
+  uint32_t wordblock_len = 128;
+  uint32_t tot = 0;
+  while (1) {
+    if (wordblock_idx >= wordblock_idx_trail) {
+      if (wordblock_idx == wordblock_idx_end) {
+	byte_ct = byte_ct % kBytesPerWord;
+	const unsigned char* bytearr_alias_iter2 = (const unsigned char*)bytearr_alias_iter;
+	for (uint32_t uii = 0; uii < byte_ct; ++uii) {
+	  tot += bytearr_alias_iter2[uii];
+	}
+	return tot;
+      }
+      wordblock_len = word_ct % 128;
+    }
+    ++wordblock_idx;
+    const uintptr_t* bytearr_alias_stop = &(bytearr_alias_iter[wordblock_len]);
+    uintptr_t acc_even = 0;
+    uintptr_t acc_odd = 0;
+    do {
+      uintptr_t cur_word = *bytearr_alias_iter++;
+      acc_even += cur_word & kMask00FF;
+      acc_odd += (cur_word >> 8) & kMask00FF;
+    } while (bytearr_alias_iter < bytearr_alias_stop);
+    acc_even += acc_odd;
+#ifdef __LP64__
+    acc_even = (acc_even & kMask0000FFFF) + ((acc_even >> 16) & kMask0000FFFF);
+#endif
+    tot += ((halfword_t)acc_even) + (acc_even >> kBitsPerWordD2);
+  }
+}
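+
+// [Editor's note: standalone sketch of the SWAR accumulation used by
+// bytesum_arr() above, assuming 64-bit words.  Splitting each word into
+// even and odd byte lanes gives 16 bits of headroom per lane; with <= 128
+// words per block, even the sum of the two accumulators (2 * 255 * 128 =
+// 65280) still fits in a 16-bit lane, so the final folds are exact.]
+static uint32_t bytesum_block_sketch(const uint64_t* words, uint32_t word_ct) {
+  // precondition: word_ct <= 128
+  const uint64_t mask00ff = 0x00ff00ff00ff00ffLLU;
+  uint64_t acc_even = 0;
+  uint64_t acc_odd = 0;
+  for (uint32_t uii = 0; uii < word_ct; ++uii) {
+    acc_even += words[uii] & mask00ff;        // bytes 0, 2, 4, 6
+    acc_odd += (words[uii] >> 8) & mask00ff;  // bytes 1, 3, 5, 7
+  }
+  uint64_t acc = acc_even + acc_odd;
+  // fold four 16-bit lanes into two 32-bit lanes, then into one sum
+  acc = (acc & 0x0000ffff0000ffffLLU) + ((acc >> 16) & 0x0000ffff0000ffffLLU);
+  return (uint32_t)((acc & 0xffffffffLLU) + (acc >> 32));
+}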
+
+pglerr_t skip_difflist_ids(const unsigned char* fread_end, const unsigned char* group_info, uint32_t difflist_len, uint32_t raw_sample_ct, const unsigned char** fread_pp) {
+  assert(difflist_len);
+  // fread_pp is a pure output parameter here
+  const uint32_t group_ct = DIV_UP(difflist_len, kPglDifflistGroupSize);
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  const unsigned char* extra_byte_cts = &(group_info[group_ct * sample_id_byte_ct]);
+  const uint32_t extra_byte_tot = bytesum_arr(extra_byte_cts, group_ct - 1);
+
+  // (group_ct - 1) for extra_byte_cts
+  // (difflist_len + 3) / 4 for raregeno
+  // (group_ct - 1) * (kPglDifflistGroupSize - 1) + extra_byte_tot for
+  //   all but last ID block
+  // total = (group_ct - 1) * kPglDifflistGroupSize + extra_byte_tot +
+  //         (difflist_len + 3) / 4
+#ifdef __arm__
+  #error "Unaligned accesses in skip_difflist_ids()."
+#endif
+  const uintptr_t* fread_alias = (const uintptr_t*)(&(extra_byte_cts[(group_ct - 1) * kPglDifflistGroupSize + extra_byte_tot + QUATERCT_TO_BYTECT(difflist_len)]));
+  const uintptr_t* fread_alias_stop = (const uintptr_t*)(&(fread_end[-((int32_t)kBytesPerWord)]));
+  uint32_t remaining_id_ct = (difflist_len - 1) % kPglDifflistGroupSize;
+  while (remaining_id_ct >= kBytesPerWord) {
+    // scan a word at a time, count number of high bits set
+    if (fread_alias > fread_alias_stop) {
+      return kPglRetMalformedInput;
+    }
+#ifdef USE_SSE42
+    const uintptr_t ww = (*fread_alias++) & (0x80 * kMask0101);
+    remaining_id_ct -= kBytesPerWord - popcount_long(ww);
+#else
+    const uintptr_t ww = ((*fread_alias++) >> 7) & kMask0101;
+    remaining_id_ct -= kBytesPerWord - ((ww * kMask0101) >> (kBitsPerWord - 8));
+#endif
+  }
+  const unsigned char* fread_ptr = (const unsigned char*)fread_alias;
+  if (!remaining_id_ct) {
+    *fread_pp = fread_ptr;
+    return kPglRetSuccess;
+  }
+  --remaining_id_ct;
+  while (fread_ptr < fread_end) {
+    if ((*fread_ptr++) <= 127) {
+      if (!remaining_id_ct) {
+	*fread_pp = fread_ptr;
+	return kPglRetSuccess;
+      }
+      --remaining_id_ct;
+    }
+  }
+  return kPglRetMalformedInput;
+}
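+
+// [Editor's note: standalone sketch of the word-at-a-time varint skipping
+// above.  In the vint31 encoding, every byte of a value except its last has
+// the high bit set, so the number of sample IDs that *end* inside a 64-bit
+// word equals the number of bytes with a clear high bit.  The two variants
+// mirror the USE_SSE42 and fallback branches; __builtin_popcountll is the
+// gcc/clang builtin underlying popcount_long().]
+static uint32_t vint_ends_in_word_sketch(uint64_t ww) {
+  return 8 - (uint32_t)__builtin_popcountll(ww & 0x8080808080808080LLU);
+}
+
+static uint32_t vint_ends_in_word_nopopcount_sketch(uint64_t ww) {
+  // isolate a 1 in each byte lane whose high bit is clear, then use a
+  // multiply to sum the eight lanes into the top byte
+  const uint64_t terminator_bits = ((~ww) >> 7) & 0x0101010101010101LLU;
+  return (uint32_t)((terminator_bits * 0x0101010101010101LLU) >> 56);
+}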
+
+pglerr_t countparse_difflist_subset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, uint32_t common_geno, uint32_t raw_sample_ct, uint32_t sample_ct, const unsigned char** fread_pp, uint32_t* __restrict ambig_sample_ids, uint32_t* __restrict ambig_id_ct_ptr, uint32_t* __restrict genocounts, uintptr_t* __restrict raregeno_workspace) {
+  const unsigned char* group_info_iter;
+  uint32_t difflist_len;
+  pglerr_t reterr = parse_difflist_header(fread_end, raw_sample_ct, fread_pp, raregeno_workspace, &group_info_iter, &difflist_len);
+  *ambig_id_ct_ptr = 0;
+  fill_uint_zero(4, genocounts);
+  if (reterr || (!difflist_len)) {
+    genocounts[common_geno] = sample_ct;
+    return reterr;
+  }
+  if (raw_sample_ct == sample_ct) {
+    zero_trailing_quaters(difflist_len, raregeno_workspace);
+    genovec_count_freqs_unsafe(raregeno_workspace, difflist_len, genocounts);
+    if (ambig_sample_ids && genocounts[3]) {
+      // no need to update ambig_sample_ids[], but necessary to set ambig_id_ct
+      // and fread_pp to enable rarealt counting.
+      reterr = skip_difflist_ids(fread_end, group_info_iter, difflist_len, raw_sample_ct, fread_pp);
+      *ambig_id_ct_ptr = genocounts[3];
+    }
+    genocounts[common_geno] += sample_ct - difflist_len;
+    return kPglRetSuccess;
+  }
+  const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  uintptr_t* raregeno_workspace_iter = raregeno_workspace;
+  uintptr_t raw_sample_idx = 0;
+  uint32_t subgroup_idx = 0;
+  uint32_t ambig_id_ct = 0;
+  uint32_t common_decr = 0;
+  while (1) {
+    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
+    if (subgroup_idx >= subgroup_idx_last) {
+      if (subgroup_idx > subgroup_idx_last) {
+	*ambig_id_ct_ptr = ambig_id_ct;
+	genocounts[common_geno] = sample_ct - common_decr;
+	return kPglRetSuccess;
+      }
+      remaining_deltas_in_subgroup &= difflist_len - 1;
+    }
+    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
+#ifdef __LP64__
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+#endif
+      raw_sample_idx = 0;
+      memcpy(&raw_sample_idx, group_info_iter, sample_id_byte_ct);
+      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    } else {
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+    }
+    ++subgroup_idx;
+    uintptr_t cur_raregeno_word = *raregeno_workspace_iter++;
+    while (1) {
+#ifndef __LP64__
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+#endif
+      const uintptr_t cur_geno = cur_raregeno_word & 3;
+      if (IS_SET(sample_include, raw_sample_idx)) {
+	genocounts[cur_geno] += 1;
+	++common_decr;
+      }
+      if (ambig_sample_ids && (cur_geno == 3)) {
+	ambig_sample_ids[ambig_id_ct++] = (uint32_t)raw_sample_idx;
+      }
+      if (!remaining_deltas_in_subgroup) {
+	break;
+      }
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+      --remaining_deltas_in_subgroup;
+      cur_raregeno_word >>= 2;
+    }
+  }
+}
+
+// 1-bit, unsubsetted: count 1-bit array, then count raregeno
+// 1-bit, subsetted: count [1-bit array AND sample_include], iterate through
+//   difflist
+pglerr_t countparse_onebit_subset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, const unsigned char** fread_pp, uint32_t* __restrict ambig_sample_ids, uint32_t* __restrict ambig_id_ct_ptr, uint32_t* __restrict genocounts, uintptr_t* __restrict raregeno_workspace) {
+  const uint32_t initial_bitarray_byte_ct = DIV_UP(raw_sample_ct, CHAR_BIT);
+  if ((uintptr_t)(fread_end - (*fread_pp)) <= initial_bitarray_byte_ct) {
+    return kPglRetMalformedInput;
+  }
+  const unsigned char* fread_difflist_start = &((*fread_pp)[1 + initial_bitarray_byte_ct]);
+  const uint32_t common2_code = *((*fread_pp)++);
+  const uint32_t geno_code_low = common2_code / 4;
+  const uint32_t geno_code_high = (common2_code & 3) + geno_code_low;
+#ifdef __arm__
+  #error "Unaligned accesses in countparse_onebit_subset()."
+#endif
+  const uintptr_t* onebitarr = (const uintptr_t*)(*fread_pp);
+  uint32_t high_geno_ct;
+  if (raw_sample_ct == sample_ct) {
+    high_geno_ct = (uint32_t)popcount_bytes(*fread_pp, initial_bitarray_byte_ct);
+  } else {
+    high_geno_ct = (uint32_t)popcount_bytes_masked(*fread_pp, sample_include, initial_bitarray_byte_ct);
+  }
+  *fread_pp = fread_difflist_start;
+  const unsigned char* group_info_iter;
+  uint32_t difflist_len;
+  pglerr_t reterr = parse_difflist_header(fread_end, raw_sample_ct, fread_pp, raregeno_workspace, &group_info_iter, &difflist_len);
+  *ambig_id_ct_ptr = 0;
+  fill_uint_zero(4, genocounts);
+  if (reterr || (!difflist_len)) {
+    genocounts[geno_code_low] = sample_ct - high_geno_ct;
+    genocounts[geno_code_high] = high_geno_ct;
+    return reterr;
+  }
+  if (raw_sample_ct == sample_ct) {
+    zero_trailing_quaters(difflist_len, raregeno_workspace);
+    genovec_count_freqs_unsafe(raregeno_workspace, difflist_len, genocounts);
+    if (ambig_sample_ids && genocounts[3]) {
+      // no need to update ambig_sample_ids[], but necessary to set ambig_id_ct
+      // and fread_pp to enable rarealt counting.
+      reterr = skip_difflist_ids(fread_end, group_info_iter, difflist_len, raw_sample_ct, fread_pp);
+      *ambig_id_ct_ptr = genocounts[3];
+    }
+    genocounts[geno_code_low] += sample_ct - difflist_len - high_geno_ct;
+    genocounts[geno_code_high] += high_geno_ct;
+    return kPglRetSuccess;
+  }
+  const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  uintptr_t* raregeno_workspace_iter = raregeno_workspace;
+  uintptr_t raw_sample_idx = 0;
+  uint32_t subgroup_idx = 0;
+  uint32_t ambig_id_ct = 0;
+  uint32_t common_decr = 0;
+  while (1) {
+    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
+    if (subgroup_idx >= subgroup_idx_last) {
+      if (subgroup_idx > subgroup_idx_last) {
+	*ambig_id_ct_ptr = ambig_id_ct;
+	genocounts[geno_code_low] += sample_ct - common_decr - high_geno_ct;
+	genocounts[geno_code_high] += high_geno_ct;
+	return kPglRetSuccess;
+      }
+      remaining_deltas_in_subgroup &= difflist_len - 1;
+    }
+    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
+#ifdef __LP64__
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+#endif
+      raw_sample_idx = 0;
+      memcpy(&raw_sample_idx, group_info_iter, sample_id_byte_ct);
+      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    } else {
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+    }
+    ++subgroup_idx;
+    uintptr_t cur_raregeno_word = *raregeno_workspace_iter++;
+    while (1) {
+#ifndef __LP64__
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+#endif
+      const uintptr_t cur_geno = cur_raregeno_word & 3;
+      if (IS_SET(sample_include, raw_sample_idx)) {
+	genocounts[cur_geno] += 1;
+	++common_decr;
+	high_geno_ct -= IS_SET(onebitarr, raw_sample_idx);
+      }
+      if (ambig_sample_ids && (cur_geno == 3)) {
+	ambig_sample_ids[ambig_id_ct++] = (uint32_t)raw_sample_idx;
+      }
+      if (!remaining_deltas_in_subgroup) {
+	break;
+      }
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+      --remaining_deltas_in_subgroup;
+      cur_raregeno_word >>= 2;
+    }
+  }
+}
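+
+// [Editor's note: illustrative sketch of the 1-bit track's header byte, as
+// decoded near the top of countparse_onebit_subset().  The byte packs the
+// lower of the two common genotype codes in its high bits and the
+// (high - low) difference in its low two bits.]
+static void decode_common2_code_sketch(uint32_t common2_code, uint32_t* geno_code_low_ptr, uint32_t* geno_code_high_ptr) {
+  const uint32_t geno_code_low = common2_code / 4;
+  *geno_code_low_ptr = geno_code_low;
+  // e.g. a header byte of 0x02 denotes common codes {0, 2}: hom-ref and
+  // hom-alt1
+  *geno_code_high_ptr = (common2_code & 3) + geno_code_low;
+}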
+
+// fread_pp should be non-null iff this is being called by an internal
+// function that is also gathering rarealt counts and/or dosages
+pglerr_t get_refalt1_genotype_counts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uint32_t* genocounts) {
+  // genocounts[0] := ref/ref, genocounts[1] := ref/alt1,
+  // genocounts[2] := alt1/alt1, genocounts[3] := missing/other
+  assert(vidx < pgrp->fi.raw_variant_ct);
+  assert(sample_ct);
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+  const uint32_t multiallelic_relevant = fread_pp && vrtype_multiallelic(vrtype);
+  if (vrtype_ld_compressed(vrtype)) {
+    // LD compression
+    pglerr_t reterr = ld_load_genovec_subset_if_necessary(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp);
+    if (reterr) {
+      return reterr;
+    }
+    const unsigned char* fread_ptr;
+    const unsigned char* fread_end;
+    if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+      return kPglRetReadFail;
+    }
+    if (!(pgrp->ldbase_stypes & kfPgrLdcacheRefalt1Genocounts)) {
+      zero_trailing_quaters(sample_ct, pgrp->ldbase_genovec);
+      genovec_count_freqs_unsafe(pgrp->ldbase_genovec, sample_ct, pgrp->ldbase_refalt1_genocounts);
+      pgrp->ldbase_stypes |= kfPgrLdcacheRefalt1Genocounts;
+    }
+    memcpy(genocounts, pgrp->ldbase_refalt1_genocounts, 4 * sizeof(int32_t));
+    uint32_t ambig_id_ct_filtered = 0;
+    reterr = ld_subset_adjust_genocounts(fread_end, subsetting_required? sample_include : nullptr, sample_include_cumulative_popcounts, pgrp->ldbase_genovec, raw_sample_ct, &fread_ptr, genocounts, (subsetting_required && multiallelic_relevant)? pgrp->workspace_ambig_sample_ids : nullptr, &(pgrp->workspace_ambig_id_ct), &ambig_id_ct_filtered, pgrp->workspace_raregeno_tmp_loadbuf);
+    if (!subsetting_required) {
+      pgrp->workspace_ambig_id_ct = ambig_id_ct_filtered;
+    }
+    if (vrtype & 1) {
+      // inverted
+      const uint32_t tmpval = genocounts[0];
+      genocounts[0] = genocounts[2];
+      genocounts[2] = tmpval;
+    }
+    if (fread_pp) {
+      *fread_pp = fread_ptr;
+      *fread_endp = fread_end;
+    }
+    return reterr;
+  }
+  const unsigned char* fread_ptr;
+  const unsigned char* fread_end = nullptr; // maybe-uninitialized warning
+  if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+    return kPglRetReadFail;
+  }
+  const uint32_t is_ldbase = pgrp->fi.vrtypes && vrtype_ld_compressed(pgrp->fi.vrtypes[vidx + 1]);
+  if (is_ldbase) {
+    // Difflists look cheap to count directly (the sample IDs can be
+    // ignored when not subsetting), but in practice unpacking to a genovec
+    // first is almost always faster, so that's what we do here.
+    pgrp->ldbase_vidx = vidx;
+    // this may be slowed down by the LD caching change.
+    pglerr_t reterr = parse_non_ld_genovec_subset_unsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vrtype, multiallelic_relevant, &fread_ptr, pgrp, pgrp->ldbase_genovec);
+    if (reterr) {
+      return reterr;
+    }
+    zero_trailing_quaters(sample_ct, pgrp->ldbase_genovec);
+    genovec_count_freqs_unsafe(pgrp->ldbase_genovec, sample_ct, genocounts);
+    memcpy(pgrp->ldbase_refalt1_genocounts, genocounts, 4 * sizeof(int32_t));
+    pgrp->ldbase_stypes = kfPgrLdcacheQuater | kfPgrLdcacheRefalt1Genocounts;
+    if (fread_pp) {
+      *fread_pp = fread_ptr;
+      *fread_endp = fread_end;
+    }
+    return kPglRetSuccess;
+  }
+  if (vrtype_difflist(vrtype)) {
+    pglerr_t reterr = countparse_difflist_subset(fread_end, sample_include, vrtype & 3, raw_sample_ct, sample_ct, &fread_ptr, multiallelic_relevant? pgrp->workspace_ambig_sample_ids : nullptr, &(pgrp->workspace_ambig_id_ct), genocounts, pgrp->workspace_raregeno_tmp_loadbuf);
+    if (fread_pp) {
+      *fread_pp = fread_ptr;
+      *fread_endp = fread_end;
+    }
+    return reterr;
+  }
+  if (vrtype & 1) {
+    pglerr_t reterr = countparse_onebit_subset(fread_end, sample_include, raw_sample_ct, sample_ct, &fread_ptr, multiallelic_relevant? pgrp->workspace_ambig_sample_ids : nullptr, &(pgrp->workspace_ambig_id_ct), genocounts, pgrp->workspace_raregeno_tmp_loadbuf);
+    if (fread_pp) {
+      *fread_pp = fread_ptr;
+      *fread_endp = fread_end;
+    }
+    return reterr;
+  }
+  const uint32_t genovec_byte_ct = QUATERCT_TO_BYTECT(raw_sample_ct);
+  if ((uintptr_t)(fread_end - fread_ptr) < genovec_byte_ct) {
+    return kPglRetMalformedInput;
+  }
+  const unsigned char* fread_2bit_end = &(fread_ptr[genovec_byte_ct]);
+  const uint32_t fread_ptr_unaligned = ((uintptr_t)fread_ptr) & (kBytesPerVec - 1);
+  if (!subsetting_required) {
+    if (fread_ptr_unaligned) {
+      genoarr_count_freqs(fread_ptr, raw_sample_ct, genocounts);
+    } else {
+      genovec_count_freqs((const uintptr_t*)fread_ptr, raw_sample_ct, genocounts);
+    }
+  } else {
+    if (fread_ptr_unaligned) {
+      genoarr_count_subset_freqs(fread_ptr, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
+    } else {
+      genovec_count_subset_freqs((const uintptr_t*)fread_ptr, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
+    }
+  }
+  if (multiallelic_relevant) {
+    extract_genoarr_ambig_ids((const uintptr_t*)fread_ptr, raw_sample_ct, pgrp->workspace_ambig_sample_ids, &(pgrp->workspace_ambig_id_ct));
+    *fread_pp = fread_2bit_end;
+    *fread_endp = fread_end;
+  } else if (vrtype == kPglVrtypePlink1) {
+    // [3] -> [0]
+    // [2] -> [1]
+    // [1] -> [3]
+    // [0] -> [2]
+    const uint32_t save2 = genocounts[0];
+    const uint32_t save3 = genocounts[1];
+    genocounts[0] = genocounts[3];
+    genocounts[1] = genocounts[2];
+    genocounts[2] = save2;
+    genocounts[3] = save3;
+  }
+  return kPglRetSuccess;
+}
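+
+// [Editor's note: sketch of the kPglVrtypePlink1 count shuffle above.
+// PLINK 1 .bed 2-bit codes are 00 = hom A1, 01 = missing, 10 = het,
+// 11 = hom A2, while the pgen order is 0 = hom ref, 1 = het, 2 = hom alt,
+// 3 = missing; the [3]->[0], [2]->[1], [1]->[3], [0]->[2] remap is exactly
+// this permutation applied to the four counts.]
+static void plink1_to_pgen_genocounts_sketch(const uint32_t* plink1_counts, uint32_t* pgen_counts) {
+  pgen_counts[0] = plink1_counts[3];  // hom A2 -> hom ref
+  pgen_counts[1] = plink1_counts[2];  // het -> het
+  pgen_counts[2] = plink1_counts[0];  // hom A1 -> hom alt
+  pgen_counts[3] = plink1_counts[1];  // missing -> missing
+}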
+
+pglerr_t pgr_get_refalt1_genotype_counts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uint32_t* genocounts) {
+  assert(vidx < pgrp->fi.raw_variant_ct);
+  if (!sample_ct) {
+    fill_uint_zero(4, genocounts);
+    return kPglRetSuccess;
+  }
+  return get_refalt1_genotype_counts(sample_include, sample_include_interleaved_vec, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, genocounts);
+}
+
+
+pglerr_t parse_aux1(const unsigned char* fread_end, uint32_t alt_allele_ct, const unsigned char** fread_pp, pgen_reader_t* pgrp, uint32_t* aux1_nonmissing_ct_ptr) {
+  // Assumes pgr.workspace_ambig_id_ct has the correct entry count.
+  // Fills pgr.workspace_aux1_nonmissing_vec and (usually)
+  //   pgr.workspace_aux1_code_vec, zeroes trailing bits of the former.
+  // aux1_nonmissing_ct_ptr can be set to nullptr to skip past this track
+  //   instead of copying it into aux1_code_vec.
+  const uint32_t ambig_id_ct = pgrp->workspace_ambig_id_ct;
+  uintptr_t* aux1_nonmissing_vec = pgrp->workspace_aux1_nonmissing_vec;
+  const uint32_t aux1_nonmissing_byte_ct = DIV_UP(ambig_id_ct, CHAR_BIT);
+  const unsigned char* fread_ptr = *fread_pp;
+  if ((uintptr_t)(fread_end - fread_ptr) < aux1_nonmissing_byte_ct) {
+    return kPglRetMalformedInput;
+  }
+  memcpy(aux1_nonmissing_vec, fread_ptr, aux1_nonmissing_byte_ct);
+  zero_trailing_bits(ambig_id_ct, aux1_nonmissing_vec);
+  fread_ptr = &(fread_ptr[aux1_nonmissing_byte_ct]);
+  const uint32_t aux1_nonmissing_ct = (uint32_t)popcount_longs(aux1_nonmissing_vec, BITCT_TO_WORDCT(ambig_id_ct));
+  const uint32_t aux1_allele_entry_bytect = (uint32_t)get_aux1_allele_bytect(alt_allele_ct, aux1_nonmissing_ct);
+  if ((uintptr_t)(fread_end - fread_ptr) < aux1_allele_entry_bytect) {
+    return kPglRetMalformedInput;
+  }
+  if (aux1_nonmissing_ct_ptr) {
+    *aux1_nonmissing_ct_ptr = aux1_nonmissing_ct;
+    memcpy(pgrp->workspace_aux1_code_vec, fread_ptr, aux1_allele_entry_bytect);
+  }
+  *fread_pp = &(fread_ptr[aux1_allele_entry_bytect]);
+  return kPglRetSuccess;
+}
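+
+// [Editor's note: illustrative sketch of the aux1 track layout consumed by
+// parse_aux1() above: a nonmissing bitarray (one bit per ambiguous sample)
+// followed by packed allele-code entries.  get_aux1_allele_bytect() is the
+// production helper; this hypothetical stand-in just shows the size
+// arithmetic, assuming the code widths used throughout this file (2, 4, 8,
+// and 16 bits per entry for alt_allele_ct == 2, == 3, < 16, and <= 254
+// respectively) and byte-level rounding.]
+static uintptr_t aux1_allele_bytect_sketch(uint32_t alt_allele_ct, uint32_t aux1_nonmissing_ct) {
+  uint32_t code_bit_width;
+  if (alt_allele_ct == 2) {
+    code_bit_width = 2;
+  } else if (alt_allele_ct == 3) {
+    code_bit_width = 4;
+  } else if (alt_allele_ct < 16) {
+    code_bit_width = 8;
+  } else {
+    code_bit_width = 16;
+  }
+  // the nonmissing bitarray occupies DIV_UP(ambig_id_ct, CHAR_BIT) bytes;
+  // the code track rounds the packed entries up to whole bytes
+  return (((uintptr_t)aux1_nonmissing_ct) * code_bit_width + 7) / CHAR_BIT;
+}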
+
+static_assert(kPglMaxAltAlleleCt == 254, "Need to update aux1_update_allele_counts().");
+void aux1_update_allele_counts(uint32_t alt_allele_ct, uint32_t aux1_nonmissing_ct, uintptr_t* aux1_code_vec, uint32_t* allele_ct_buf) {
+  // aux1_code_vec not const since we might zero the trailing bits
+  // todo: validate?
+  fill_uint_zero(alt_allele_ct - 1, &(allele_ct_buf[2]));
+  if (!aux1_nonmissing_ct) {
+    return;
+  }
+  if (alt_allele_ct < 4) {
+    assert(alt_allele_ct >= 2);
+    uint32_t aux1_counts[4];
+    uint32_t code_vec_entry_ct = aux1_nonmissing_ct * (alt_allele_ct - 1);
+    zero_trailing_quaters(code_vec_entry_ct, aux1_code_vec);
+    genovec_count_freqs_unsafe(aux1_code_vec, code_vec_entry_ct, aux1_counts);
+    allele_ct_buf[0] += aux1_counts[0];
+    allele_ct_buf[1] += aux1_counts[1];
+    if (alt_allele_ct == 2) {
+      allele_ct_buf[2] += aux1_counts[0] + aux1_counts[1] + 2 * aux1_counts[2];
+    } else {
+      allele_ct_buf[2] += aux1_counts[2];
+      allele_ct_buf[3] += aux1_counts[3];
+    }
+    return;
+  }
+  const uintptr_t* aux1_code_vec_iter = aux1_code_vec;
+  const uint32_t aux1_nonmissing_allele_ct = 2 * aux1_nonmissing_ct;
+  // Slightly different code must be used for 256 <= alt_allele_ct < 4096, and
+  // 65536 <= alt_allele_ct < 2^24.
+  assert(alt_allele_ct <= kPglMaxAltAlleleCt);
+  uint32_t halfcode_bit_width;
+  uint32_t log2_halfcodes_per_word;
+  if (alt_allele_ct < 16) {
+    halfcode_bit_width = 4;
+    log2_halfcodes_per_word = kBitsPerWordLog2 - 2;
+  } else {
+    halfcode_bit_width = 8;
+    log2_halfcodes_per_word = kBitsPerWordLog2 - 3;
+  }
+  const uintptr_t* aux1_code_vec_last = &(aux1_code_vec[aux1_nonmissing_allele_ct >> log2_halfcodes_per_word]);
+  const uint32_t halfcode_mask = (1 << halfcode_bit_width) - 1;
+  uint32_t block_len_m1 = (1 << log2_halfcodes_per_word) - 1;
+  while (1) {
+    if (aux1_code_vec_iter >= aux1_code_vec_last) {
+      if (aux1_code_vec_iter > aux1_code_vec_last) {
+	return;
+      }
+      block_len_m1 &= aux1_nonmissing_allele_ct - 1;
+    }
+    uintptr_t cur_aux_word = *aux1_code_vec_iter++;
+    uint32_t aux_idx_lowbits = 0;
+    while (1) {
+      allele_ct_buf[cur_aux_word & halfcode_mask] += 1;
+      if (aux_idx_lowbits == block_len_m1) {
+	break;
+      }
+      ++aux_idx_lowbits;
+      cur_aux_word >>= halfcode_bit_width;
+    }
+  }
+}
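+
+// [Editor's note: minimal sketch of the halfcode scan above, specialized to
+// the 4-bit case (4 <= alt_allele_ct < 16).  Each allele index occupies 4
+// bits, so a 64-bit word carries 16 of them; allele_ct_buf must have 16
+// entries here.]
+static void count_4bit_halfcodes_sketch(const uint64_t* words, uint32_t word_ct, uint32_t* allele_ct_buf) {
+  for (uint32_t widx = 0; widx < word_ct; ++widx) {
+    uint64_t cur_word = words[widx];
+    for (uint32_t uii = 0; uii < 16; ++uii) {
+      allele_ct_buf[cur_word & 15] += 1;
+      cur_word >>= 4;
+    }
+  }
+}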
+
+static_assert(kPglMaxAltAlleleCt == 254, "Need to update aux1_subset_update_allele_counts().");
+void aux1_subset_update_allele_counts(const uint32_t* __restrict ambig_sample_ids, const uintptr_t* __restrict aux1_nonmissing_vec, const uintptr_t* __restrict aux1_code_vec, const uintptr_t* __restrict sample_include, uint32_t alt_allele_ct, uint32_t aux1_nonmissing_ct, uint32_t* allele_ct_buf) {
+  // todo: validate?
+  fill_uint_zero(alt_allele_ct - 1, &(allele_ct_buf[2]));
+  uint32_t ambig_idx = 0;
+  if (alt_allele_ct == 2) {
+    for (uint32_t aux_idx = 0; aux_idx < aux1_nonmissing_ct; ++aux_idx, ++ambig_idx) {
+      next_set_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+      uint32_t sample_idx = ambig_sample_ids[ambig_idx];
+      if (IS_SET(sample_include, sample_idx)) {
+	allele_ct_buf[GET_QUATERARR_ENTRY(aux1_code_vec, aux_idx)] += 1;
+      }
+    }
+    allele_ct_buf[2] += aux1_nonmissing_ct;
+    return;
+  }
+  assert(alt_allele_ct <= kPglMaxAltAlleleCt);
+  uint32_t log2_codes_per_word;
+  uint32_t halfcode_bit_width;
+  if (alt_allele_ct == 3) {
+    log2_codes_per_word = kBitsPerWordLog2 - 2;
+    halfcode_bit_width = 2;
+  } else if (alt_allele_ct < 16) {
+    log2_codes_per_word = kBitsPerWordLog2 - 3;
+    halfcode_bit_width = 4;
+  } else {
+    log2_codes_per_word = kBitsPerWordLog2 - 4;
+    halfcode_bit_width = 8;
+  }
+  const uint32_t idx_mask = (1 << log2_codes_per_word) - 1;
+  const uint32_t entry_bit_ct = halfcode_bit_width * 2;
+  const uint32_t halfcode_mask = (1 << halfcode_bit_width) - 1;
+  for (uint32_t aux_idx = 0; aux_idx < aux1_nonmissing_ct; ++aux_idx, ++ambig_idx) {
+    next_set_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+    uint32_t sample_idx = ambig_sample_ids[ambig_idx];
+    if (IS_SET(sample_include, sample_idx)) {
+      uint32_t cur_code_unmasked = (uint32_t)(aux1_code_vec[aux_idx >> log2_codes_per_word] >> (entry_bit_ct * (aux_idx & idx_mask)));
+      allele_ct_buf[cur_code_unmasked & halfcode_mask] += 1;
+      allele_ct_buf[(cur_code_unmasked >> halfcode_bit_width) & halfcode_mask] += 1;
+    }
+  }
+}
+
+void aux1_update_ref_or_alt1_countvec(const uint32_t* __restrict ambig_sample_ids, const uintptr_t* aux1_nonmissing_vec, const uintptr_t* __restrict aux1_code_vec, uint32_t alt_allele_ct, uint32_t aux1_nonmissing_ct, uint32_t allele_idx, uintptr_t* __restrict allele_countvec) {
+  if (!aux1_nonmissing_ct) {
+    return;
+  }
+  uint32_t log2_codes_per_word;
+  uint32_t code_bit_width;
+  if (alt_allele_ct == 2) {
+    log2_codes_per_word = kBitsPerWordLog2 - 1;
+    code_bit_width = 2;
+  } else if (alt_allele_ct == 3) {
+    log2_codes_per_word = kBitsPerWordLog2 - 2;
+    code_bit_width = 4;
+  } else if (alt_allele_ct < 16) {
+    log2_codes_per_word = kBitsPerWordLog2 - 3;
+    code_bit_width = 8;
+  } else {
+    log2_codes_per_word = kBitsPerWordLog2 - 4;
+    code_bit_width = 16;
+  }
+
+  // The "+ (code_bit_width == 2)" is needed to handle alt_allele_ct == 2
+  // correctly.
+  // This code may need to be changed when we increase the alt allele count
+  // limit, since a natural 256-4095 alt allele idx representation uses 12 bits
+  // per halfcode, which is not a nice power of two.  We might decide at that
+  // point that code simplicity is worth bloating this part of the file by 33%
+  // (i.e. use 16 bits per halfcode), but the necessary code isn't very
+  // complicated...
+  const uint32_t halfcode_mask = (1 << ((code_bit_width / 2) + (code_bit_width == 2))) - 1;
+  const uintptr_t* aux1_code_vec_iter = aux1_code_vec;
+  const uintptr_t* aux1_code_vec_last = &(aux1_code_vec[(aux1_nonmissing_ct - 1) >> log2_codes_per_word]);
+  uint32_t ambig_idx = 0;
+  uint32_t block_len_m1 = (1 << log2_codes_per_word) - 1;
+  while (1) {
+    if (aux1_code_vec_iter >= aux1_code_vec_last) {
+      if (aux1_code_vec_iter > aux1_code_vec_last) {
+	return;
+      }
+      block_len_m1 &= aux1_nonmissing_ct - 1;
+    }
+    uintptr_t aux1_code_word = *aux1_code_vec_iter++;
+    uint32_t aux_idx_lowbits = 0;
+    while (1) {
+      next_set_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+      const uint32_t sample_idx = ambig_sample_ids[ambig_idx];
+      const uintptr_t cur_allele_ct = ((aux1_code_word & halfcode_mask) == allele_idx);
+      ASSIGN_QUATERARR_ENTRY(sample_idx, cur_allele_ct, allele_countvec);
+      if (aux_idx_lowbits == block_len_m1) {
+	break;
+      }
+      ++aux_idx_lowbits;
+      aux1_code_word >>= code_bit_width;
+    }
+  }
+}
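+
+// [Editor's note: sketch of the halfcode_mask adjustment explained above.
+// When alt_allele_ct == 2, a full code is a single 2-bit entry (value 0 or
+// 1 names the ref/alt1 allele paired with alt2; value 2 is hom alt2), so
+// the "halfcode" compared against allele_idx must span both bits; without
+// the (code_bit_width == 2) term, a 1-bit mask would alias code 2 onto
+// allele_idx 0.]
+static uint32_t halfcode_mask_sketch(uint32_t code_bit_width) {
+  // code_bit_width is 2, 4, 8, or 16 in this file
+  return (1 << ((code_bit_width / 2) + (code_bit_width == 2))) - 1;
+}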
+
+void aux1_update_ref_or_alt1_countvec_subset(const uint32_t* __restrict ambig_sample_ids, const uintptr_t* aux1_nonmissing_vec, const uintptr_t* __restrict aux1_code_vec, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t alt_allele_ct, uint32_t aux1_nonmissing_ct, uint32_t allele_idx, uintptr_t* __restrict allele_countvec) {
+  uint32_t log2_codes_per_word;
+  uint32_t code_bit_width;
+  if (alt_allele_ct == 2) {
+    log2_codes_per_word = kBitsPerWordLog2 - 1;
+    code_bit_width = 2;
+  } else if (alt_allele_ct == 3) {
+    log2_codes_per_word = kBitsPerWordLog2 - 2;
+    code_bit_width = 4;
+  } else if (alt_allele_ct < 16) {
+    log2_codes_per_word = kBitsPerWordLog2 - 3;
+    code_bit_width = 8;
+  } else {
+    log2_codes_per_word = kBitsPerWordLog2 - 4;
+    code_bit_width = 16;
+  }
+  const uint32_t halfcode_mask = (1 << ((code_bit_width / 2) + (code_bit_width == 2))) - 1;
+  uint32_t ambig_idx = 0;
+  for (uint32_t aux_idx = 0; aux_idx < aux1_nonmissing_ct; ++aux_idx, ++ambig_idx) {
+    next_set_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+    const uint32_t sample_idx = ambig_sample_ids[ambig_idx];
+    if (IS_SET(sample_include, sample_idx)) {
+      const uint32_t cur_code = (aux1_code_vec[aux_idx >> log2_codes_per_word] >> ((code_bit_width * aux_idx) & (kBitsPerWord - 1))) & halfcode_mask;
+      const uintptr_t cur_allele_ct = (cur_code == allele_idx);
+      ASSIGN_QUATERARR_ENTRY(raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, sample_idx), cur_allele_ct, allele_countvec);
+    }
+  }
+}
+
+// "rarealt" = alt2/alt3/etc.
+void aux1_update_rarealt_countvec(const uint32_t* __restrict ambig_sample_ids, const uintptr_t* aux1_nonmissing_vec, const uintptr_t* __restrict aux1_code_vec, uint32_t alt_allele_ct, uint32_t aux1_nonmissing_ct, uintptr_t allele_idx, uintptr_t* __restrict allele_countvec) {
+  // todo: check whether promoting allele_idx to uintptr_t actually helps
+  if (!aux1_nonmissing_ct) {
+    return;
+  }
+  const uint32_t aux1_nonmissing_ct_m1 = aux1_nonmissing_ct - 1;
+  const uintptr_t* aux1_code_vec_iter = aux1_code_vec;
+  uint32_t ambig_idx = 0;
+  if (alt_allele_ct == 2) {
+    assert(allele_idx == 2);
+    const uintptr_t* aux1_code_vec_last = &(aux1_code_vec_iter[aux1_nonmissing_ct_m1 / kBitsPerWordD2]);
+    uint32_t block_len_m1 = kBitsPerWordD2 - 1;
+    while (1) {
+      if (aux1_code_vec_iter >= aux1_code_vec_last) {
+	if (aux1_code_vec_iter > aux1_code_vec_last) {
+	  return;
+	}
+	block_len_m1 &= aux1_nonmissing_ct_m1;
+      }
+      uintptr_t aux1_code_word = *aux1_code_vec_iter++;
+      uint32_t aux_idx_lowbits = 0;
+      while (1) {
+	next_set_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+	const uint32_t sample_idx = ambig_sample_ids[ambig_idx];
+	uintptr_t cur_allele_ct = 1 + ((aux1_code_word & 3) == allele_idx);
+	ASSIGN_QUATERARR_ENTRY(sample_idx, cur_allele_ct, allele_countvec);
+	if (aux_idx_lowbits == block_len_m1) {
+	  break;
+	}
+	++aux_idx_lowbits;
+	aux1_code_word >>= 2;
+      }
+    }
+  }
+  uint32_t log2_codes_per_word;
+  uint32_t halfcode_bit_width;
+  if (alt_allele_ct == 3) {
+    log2_codes_per_word = kBitsPerWordLog2 - 2;
+    halfcode_bit_width = 2;
+  } else if (alt_allele_ct < 16) {
+    log2_codes_per_word = kBitsPerWordLog2 - 3;
+    halfcode_bit_width = 4;
+  } else {
+    log2_codes_per_word = kBitsPerWordLog2 - 4;
+    halfcode_bit_width = 8;
+  }
+  const uint32_t code_bit_width = halfcode_bit_width * 2;
+  const uintptr_t halfcode_mask = (1 << halfcode_bit_width) - 1;
+  const uintptr_t* aux1_code_vec_last = &(aux1_code_vec_iter[aux1_nonmissing_ct_m1 >> log2_codes_per_word]);
+  uint32_t block_len_m1 = (1 << log2_codes_per_word) - 1;
+  while (1) {
+    if (aux1_code_vec_iter >= aux1_code_vec_last) {
+      if (aux1_code_vec_iter > aux1_code_vec_last) {
+	return;
+      }
+      block_len_m1 &= aux1_nonmissing_ct_m1;
+    }
+    uintptr_t aux1_code_word = *aux1_code_vec_iter++;
+    uint32_t aux_idx_lowbits = 0;
+    while (1) {
+      next_set_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+      const uint32_t sample_idx = ambig_sample_ids[ambig_idx];
+      const uintptr_t cur_allele_ct = ((aux1_code_word & halfcode_mask) == allele_idx) + (((aux1_code_word >> halfcode_bit_width) & halfcode_mask) == allele_idx);
+      ASSIGN_QUATERARR_ENTRY(sample_idx, cur_allele_ct, allele_countvec);
+      if (aux_idx_lowbits == block_len_m1) {
+	break;
+      }
+      ++aux_idx_lowbits;
+      aux1_code_word >>= code_bit_width;
+    }
+  }
+}
+
+void aux1_update_rarealt_countvec_subset(const uint32_t* __restrict ambig_sample_ids, const uintptr_t* aux1_nonmissing_vec, const uintptr_t* __restrict aux1_code_vec, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t alt_allele_ct, uint32_t aux1_nonmissing_ct, uint32_t allele_idx, uintptr_t* __restrict allele_countvec) {
+  uint32_t ambig_idx = 0;
+  if (alt_allele_ct == 2) {
+    assert(allele_idx == 2);
+    for (uint32_t aux_idx = 0; aux_idx < aux1_nonmissing_ct; ++aux_idx, ++ambig_idx) {
+      next_set_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+      const uint32_t sample_idx = ambig_sample_ids[ambig_idx];
+      if (IS_SET(sample_include, sample_idx)) {
+	const uint32_t cur_code = GET_QUATERARR_ENTRY(aux1_code_vec, aux_idx);
+	const uintptr_t cur_allele_ct = 1 + (cur_code == allele_idx);
+	ASSIGN_QUATERARR_ENTRY(raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, sample_idx), cur_allele_ct, allele_countvec);
+      }
+    }
+    return;
+  }
+  uint32_t log2_codes_per_word;
+  uint32_t halfcode_bit_width;
+  if (alt_allele_ct == 3) {
+    log2_codes_per_word = kBitsPerWordLog2 - 2;
+    halfcode_bit_width = 2;
+  } else if (alt_allele_ct < 16) {
+    log2_codes_per_word = kBitsPerWordLog2 - 3;
+    halfcode_bit_width = 4;
+  } else {
+    log2_codes_per_word = kBitsPerWordLog2 - 4;
+    halfcode_bit_width = 8;
+  }
+  const uint32_t code_bit_width = 2 * halfcode_bit_width;
+  const uint32_t halfcode_mask = (1 << halfcode_bit_width) - 1;
+  for (uint32_t aux_idx = 0; aux_idx < aux1_nonmissing_ct; ++aux_idx, ++ambig_idx) {
+    next_set_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+    const uint32_t sample_idx = ambig_sample_ids[ambig_idx];
+    if (IS_SET(sample_include, sample_idx)) {
+      const uint32_t cur_code_unmasked = (uint32_t)(aux1_code_vec[aux_idx >> log2_codes_per_word] >> ((code_bit_width * aux_idx) & (kBitsPerWord - 1)));
+      const uintptr_t cur_allele_ct = ((cur_code_unmasked & halfcode_mask) == allele_idx) + (((cur_code_unmasked >> halfcode_bit_width) & halfcode_mask) == allele_idx);
+      ASSIGN_QUATERARR_ENTRY(raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, sample_idx), cur_allele_ct, allele_countvec);
+    }
+  }
+}
+
+// See comments toward end of pgr_read_genovec_subset_then_common2().
+void aux1_update_genovec_match2_unsafe(const uint32_t* __restrict ambig_sample_ids, const uintptr_t* aux1_nonmissing_vec, const uintptr_t* __restrict aux1_code_vec, uint32_t alt_allele_ct, uint32_t most_common_idx, uint32_t second_most_common_idx, uint32_t aux1_nonmissing_ct, uintptr_t* __restrict genovec) {
+  assert(aux1_nonmissing_ct);
+  // One rarealt, one ref/alt1.  Only two codes to match, since
+  // homozygous-ref/alt1 calls are not stored in this data track.
+  // This code is separate from the ...match3() functions below since, if we
+  // iterate over a variable-length array, we probably don't get to take
+  // advantage of registers.  But maybe the compiler is actually smart enough;
+  // need to test the simple implementation...
+  const uint32_t rarealt_is_minor = (most_common_idx < 2);
+  uint32_t nonrare_idx;
+  uint32_t rarealt_idx;
+  if (rarealt_is_minor) {
+    nonrare_idx = most_common_idx;
+    rarealt_idx = second_most_common_idx;
+  } else {
+    nonrare_idx = second_most_common_idx;
+    rarealt_idx = most_common_idx;
+  }
+  uint32_t log2_codes_per_word;
+  uint32_t code_bit_width;
+  if (alt_allele_ct == 2) {
+    log2_codes_per_word = kBitsPerWordLog2 - 1;
+    code_bit_width = 2;
+  } else if (alt_allele_ct == 3) {
+    log2_codes_per_word = kBitsPerWordLog2 - 2;
+    code_bit_width = 4;
+  } else if (alt_allele_ct < 16) {
+    log2_codes_per_word = kBitsPerWordLog2 - 3;
+    code_bit_width = 8;
+  } else {
+    log2_codes_per_word = kBitsPerWordLog2 - 4;
+    code_bit_width = 16;
+  }
+  const uintptr_t* aux1_code_vec_iter = aux1_code_vec;
+  const uintptr_t* aux1_code_vec_last = &(aux1_code_vec[(aux1_nonmissing_ct - 1) >> log2_codes_per_word]);
+  const uint32_t halfcode_bit_width = (code_bit_width / 2) + (code_bit_width == 2);
+  const uint32_t code1 = (rarealt_idx << halfcode_bit_width) + nonrare_idx;
+  const uint32_t code02 = rarealt_idx * ((1 << halfcode_bit_width) + 1);
+  const uintptr_t store02 = rarealt_is_minor * 2;
+  assert(code_bit_width < 32);
+  const uint32_t code_mask = (1 << code_bit_width) - 1;
+  uint32_t ambig_idx = 0;
+  uint32_t block_len_m1 = (1 << log2_codes_per_word) - 1;
+  while (1) {
+    if (aux1_code_vec_iter >= aux1_code_vec_last) {
+      if (aux1_code_vec_iter > aux1_code_vec_last) {
+	return;
+      }
+      block_len_m1 &= aux1_nonmissing_ct - 1;
+    }
+    uintptr_t aux1_code_word = *aux1_code_vec_iter++;
+    uint32_t aux_idx_lowbits = 0;
+    while (1) {
+      next_set_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+      const uint32_t cur_code = aux1_code_word & code_mask;
+      const uint32_t match1 = (cur_code == code1);
+      if (match1 || (cur_code == code02)) {
+	const uint32_t sample_idx = ambig_sample_ids[ambig_idx];
+
+	// todo: check if there's a better way to perform this assignment
+	// e.g. (rarealt_is_minor ^ match1) + rarealt_is_minor
+	const uintptr_t new_geno = match1? 1 : store02;
+
+	ASSIGN_QUATERARR_ENTRY(sample_idx, new_geno, genovec);
+      }
+      if (aux_idx_lowbits == block_len_m1) {
+	break;
+      }
+      ++aux_idx_lowbits;
+      aux1_code_word >>= code_bit_width;
+    }
+  }
+}
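+
+// [Editor's note: worked example of the two match codes built above,
+// assuming 4-bit halfcodes (4 <= alt_allele_ct < 16) and that the het code
+// stores the rarealt in the high halfcode, as matched by the loop.  With
+// ref (0) and alt2 (2) as the two most common alleles: nonrare_idx = 0 and
+// rarealt_idx = 2, so
+//   code1  = (2 << 4) + 0   = 0x20  (het ref/alt2, written back as 1)
+//   code02 = 2 * (0x10 + 1) = 0x22  (hom alt2, written back as store02 = 2
+//                                    since the rarealt is "minor" here)
+// Hom ref/alt1 calls never appear in this track, so no other codes match.]
+static void match2_codes_sketch(uint32_t nonrare_idx, uint32_t rarealt_idx, uint32_t halfcode_bit_width, uint32_t* code1_ptr, uint32_t* code02_ptr) {
+  // het: one rarealt halfcode above one ref/alt1 halfcode
+  *code1_ptr = (rarealt_idx << halfcode_bit_width) + nonrare_idx;
+  // hom rarealt: the same index in both halfcodes
+  *code02_ptr = rarealt_idx * ((1u << halfcode_bit_width) + 1);
+}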
+
+void aux1_update_genovec_subset_match2(const uint32_t* __restrict ambig_sample_ids, const uintptr_t* aux1_nonmissing_vec, const uintptr_t* __restrict aux1_code_vec, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t alt_allele_ct, uint32_t most_common_idx, uint32_t second_most_common_idx, uint32_t aux1_nonmissing_ct, uintptr_t* __restrict genovec) {
+  const uint32_t rarealt_is_minor = (most_common_idx < 2);
+  uint32_t nonrare_idx;
+  uint32_t rarealt_idx;
+  if (rarealt_is_minor) {
+    nonrare_idx = most_common_idx;
+    rarealt_idx = second_most_common_idx;
+  } else {
+    nonrare_idx = second_most_common_idx;
+    rarealt_idx = most_common_idx;
+  }
+  uint32_t log2_codes_per_word;
+  uint32_t code_bit_width;
+  if (alt_allele_ct == 2) {
+    log2_codes_per_word = kBitsPerWordLog2 - 1;
+    code_bit_width = 2;
+  } else if (alt_allele_ct == 3) {
+    log2_codes_per_word = kBitsPerWordLog2 - 2;
+    code_bit_width = 4;
+  } else if (alt_allele_ct < 16) {
+    log2_codes_per_word = kBitsPerWordLog2 - 3;
+    code_bit_width = 8;
+  } else {
+    log2_codes_per_word = kBitsPerWordLog2 - 4;
+    code_bit_width = 16;
+  }
+  const uint32_t halfcode_bit_width = (code_bit_width / 2) + (code_bit_width == 2);
+  const uint32_t code1 = (rarealt_idx << halfcode_bit_width) + nonrare_idx;
+  const uint32_t code02 = rarealt_idx * ((1 << halfcode_bit_width) + 1);
+  const uintptr_t store02 = rarealt_is_minor * 2;
+  assert(code_bit_width < 32);
+  const uint32_t code_mask = (1 << code_bit_width) - 1;
+  uint32_t ambig_idx = 0;
+  for (uint32_t aux_idx = 0; aux_idx < aux1_nonmissing_ct; ++aux_idx, ++ambig_idx) {
+    next_set_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+    const uint32_t sample_idx = ambig_sample_ids[ambig_idx];
+    if (IS_SET(sample_include, sample_idx)) {
+      const uint32_t cur_code = (aux1_code_vec[aux_idx >> log2_codes_per_word] >> ((code_bit_width * aux_idx) & (kBitsPerWord - 1))) & code_mask;
+      const uint32_t match1 = (cur_code == code1);
+      if (match1 || (cur_code == code02)) {
+	const uintptr_t new_geno = match1? 1 : store02;
+	ASSIGN_QUATERARR_ENTRY(raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, sample_idx), new_geno, genovec);
+      }
+    }
+  }
+}
+
+void aux1_update_genovec_match3_unsafe(const uint32_t* __restrict ambig_sample_ids, const uintptr_t* aux1_nonmissing_vec, const uintptr_t* __restrict aux1_code_vec, uint32_t alt_allele_ct, uint32_t most_common_idx, uint32_t second_most_common_idx, uint32_t aux1_nonmissing_ct, uintptr_t* __restrict genovec) {
+  assert(aux1_nonmissing_ct);
+  // can't have two rarealts be the most common if there is only one rarealt...
+  assert(alt_allele_ct > 2);
+  uint32_t log2_codes_per_word;
+  uint32_t code_bit_width;
+  if (alt_allele_ct == 3) {
+    log2_codes_per_word = kBitsPerWordLog2 - 2;
+    code_bit_width = 4;
+  } else if (alt_allele_ct < 16) {
+    log2_codes_per_word = kBitsPerWordLog2 - 3;
+    code_bit_width = 8;
+  } else {
+    log2_codes_per_word = kBitsPerWordLog2 - 4;
+    code_bit_width = 16;
+  }
+  const uintptr_t* aux1_code_vec_iter = aux1_code_vec;
+  const uintptr_t* aux1_code_vec_last = &(aux1_code_vec[(aux1_nonmissing_ct - 1) >> log2_codes_per_word]);
+  const uint32_t halfcode_bit_width = code_bit_width / 2;
+  assert(halfcode_bit_width <= 16);
+  const uint32_t code0 = most_common_idx * ((1 << halfcode_bit_width) + 1);
+  const uint32_t code1 = (most_common_idx << halfcode_bit_width) + second_most_common_idx;
+  const uint32_t code2 = second_most_common_idx * ((1 << halfcode_bit_width) + 1);
+  assert(code_bit_width < 32);
+  const uint32_t code_mask = (1 << code_bit_width) - 1;
+  uint32_t ambig_idx = 0;
+  uint32_t block_len_m1 = (1 << log2_codes_per_word) - 1;
+  while (1) {
+    if (aux1_code_vec_iter >= aux1_code_vec_last) {
+      if (aux1_code_vec_iter > aux1_code_vec_last) {
+	return;
+      }
+      block_len_m1 &= aux1_nonmissing_ct - 1;
+    }
+    uintptr_t aux1_code_word = *aux1_code_vec_iter++;
+    uint32_t aux_idx_lowbits = 0;
+    while (1) {
+      next_set_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+      const uint32_t cur_code = aux1_code_word & code_mask;
+      const uint32_t match0 = (cur_code == code0);
+      const uint32_t match1 = (cur_code == code1);
+      if (match0 || match1 || (cur_code == code2)) {
+	const uint32_t sample_idx = ambig_sample_ids[ambig_idx];
+	const uintptr_t new_geno = match0? 0 : (2 - match1);
+	ASSIGN_QUATERARR_ENTRY(sample_idx, new_geno, genovec);
+      }
+      if (aux_idx_lowbits == block_len_m1) {
+	break;
+      }
+      ++aux_idx_lowbits;
+      aux1_code_word >>= code_bit_width;
+    }
+  }
+}
+
+void aux1_update_genovec_subset_match3(const uint32_t* __restrict ambig_sample_ids, const uintptr_t* aux1_nonmissing_vec, const uintptr_t* __restrict aux1_code_vec, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t alt_allele_ct, uint32_t most_common_idx, uint32_t second_most_common_idx, uint32_t aux1_nonmissing_ct, uintptr_t* __restrict genovec) {
+  assert(aux1_nonmissing_ct);
+  // can't have two rarealts be the most common if there is only one rarealt...
+  assert(alt_allele_ct > 2);
+  uint32_t log2_codes_per_word;
+  uint32_t code_bit_width;
+  if (alt_allele_ct == 3) {
+    log2_codes_per_word = kBitsPerWordLog2 - 2;
+    code_bit_width = 4;
+  } else if (alt_allele_ct < 16) {
+    log2_codes_per_word = kBitsPerWordLog2 - 3;
+    code_bit_width = 8;
+  } else {
+    log2_codes_per_word = kBitsPerWordLog2 - 4;
+    code_bit_width = 16;
+  }
+  const uint32_t halfcode_bit_width = code_bit_width / 2;
+  assert(halfcode_bit_width <= 16);
+  const uint32_t code0 = most_common_idx * ((1 << halfcode_bit_width) + 1);
+  const uint32_t code1 = (most_common_idx << halfcode_bit_width) + second_most_common_idx;
+  const uint32_t code2 = second_most_common_idx * ((1 << halfcode_bit_width) + 1);
+  assert(code_bit_width < 32);
+  const uint32_t code_mask = (1 << code_bit_width) - 1;
+  uint32_t ambig_idx = 0;
+  for (uint32_t aux_idx = 0; aux_idx < aux1_nonmissing_ct; ++aux_idx, ++ambig_idx) {
+    next_set_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+    const uint32_t sample_idx = ambig_sample_ids[ambig_idx];
+    if (IS_SET(sample_include, sample_idx)) {
+      const uint32_t cur_code = (aux1_code_vec[aux_idx >> log2_codes_per_word] >> ((code_bit_width * aux_idx) & (kBitsPerWord - 1))) & code_mask;
+      const uint32_t match0 = (cur_code == code0);
+      const uint32_t match1 = (cur_code == code1);
+      if (match0 || match1 || (cur_code == code2)) {
+	const uintptr_t new_geno = match0? 0 : (2 - match1);
+	ASSIGN_QUATERARR_ENTRY(raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, sample_idx), new_geno, genovec);
+      }
+    }
+  }
+}
+
+pglerr_t pgr_read_genovec_subset_then_common2(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uint32_t* __restrict maj_allele_idx_ptr, uint32_t* __restrict second_allele_idx_ptr, uint32_t* __restrict allele_ct_buf) {
+  assert(vidx < pgrp->fi.raw_variant_ct);
+  const uint32_t allele_ct = (uint32_t)(pgrp->fi.allele_idx_offsets[vidx + 1] - pgrp->fi.allele_idx_offsets[vidx]);
+  if (!sample_ct) {
+    *maj_allele_idx_ptr = 0;
+    *second_allele_idx_ptr = 0;
+    fill_uint_zero(allele_ct, allele_ct_buf);
+    return kPglRetSuccess;
+  }
+  // major allele corresponds to 0 bits, second-most-common allele corresponds
+  // to 1.
+  const unsigned char* fread_ptr;
+  const unsigned char* fread_end;
+  pglerr_t reterr = read_refalt1_genovec_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, &fread_ptr, &fread_end, genovec);
+  if (reterr) {
+    return reterr;
+  }
+  zero_trailing_quaters(sample_ct, genovec);
+  uint32_t bothset_ct;
+  genovec_allele_cts_unsafe(genovec, sample_ct, allele_ct_buf, &bothset_ct);
+  uint32_t second_most_common_idx;
+  if (allele_ct == 2) {
+  pgr_read_genovec_subset_then_common2_refalt1_finish:
+    second_most_common_idx = (allele_ct_buf[0] >= allele_ct_buf[1]);
+    *maj_allele_idx_ptr = 1 - second_most_common_idx;
+    *second_allele_idx_ptr = second_most_common_idx;
+    if (!second_most_common_idx) {
+      genovec_invert_unsafe(sample_ct, genovec);
+      zero_trailing_quaters(sample_ct, genovec);
+    }
+    return kPglRetSuccess;
+  }
+  const uint32_t ambig_id_ct = pgrp->workspace_ambig_id_ct;
+  second_most_common_idx = (allele_ct_buf[0] >= allele_ct_buf[1]);
+  uint32_t included_ambig_id_ct_threshold = DIV_UP(allele_ct_buf[second_most_common_idx], 2);
+  // avoid processing the aux1 data track if possible.
+  const uint32_t subsetting_required = (sample_ct != pgrp->fi.raw_sample_ct);
+  if (ambig_id_ct < included_ambig_id_ct_threshold) {
+    goto pgr_read_genovec_subset_then_common2_refalt1_finish;
+  }
+  if (subsetting_required) {
+    const uint32_t* __restrict ambig_sample_ids = pgrp->workspace_ambig_sample_ids;
+    uint32_t included_ambig_id_ct = 0;
+    uint32_t ambig_idx = 0;
+    // minor optimization: check included_ambig_id_ct halfway through; we
+    // might be able to skip the second half of the list.
+    const uint32_t half_ambig_id_ct = ambig_id_ct / 2;
+    for (; ambig_idx < half_ambig_id_ct; ++ambig_idx) {
+      included_ambig_id_ct += IS_SET(sample_include, ambig_sample_ids[ambig_idx]);
+    }
+    if (included_ambig_id_ct < included_ambig_id_ct_threshold) {
+      for (; ambig_idx < ambig_id_ct; ++ambig_idx) {
+	included_ambig_id_ct += IS_SET(sample_include, ambig_sample_ids[ambig_idx]);
+      }
+      if (included_ambig_id_ct < included_ambig_id_ct_threshold) {
+	goto pgr_read_genovec_subset_then_common2_refalt1_finish;
+      }
+    }
+  }
+  const uint32_t alt_allele_ct = allele_ct - 1;
+  uint32_t aux1_nonmissing_ct;
+  if (parse_aux1(fread_end, alt_allele_ct, &fread_ptr, pgrp, &aux1_nonmissing_ct)) {
+    return kPglRetMalformedInput;
+  }
+  if (subsetting_required) {
+    aux1_subset_update_allele_counts(pgrp->workspace_ambig_sample_ids, pgrp->workspace_aux1_nonmissing_vec, pgrp->workspace_aux1_code_vec, sample_include, alt_allele_ct, aux1_nonmissing_ct, allele_ct_buf);
+  } else {
+    aux1_update_allele_counts(alt_allele_ct, aux1_nonmissing_ct, pgrp->workspace_aux1_code_vec, allele_ct_buf);
+  }
+  uint32_t most_common_idx = 0;
+  uint32_t most_common_allele_ct = allele_ct_buf[0]; // ref count
+  second_most_common_idx = 1;
+  uint32_t second_most_common_allele_ct = 0;
+  for (uint32_t allele_idx = 1; allele_idx <= alt_allele_ct; ++allele_idx) {
+    uint32_t cur_allele_ct = allele_ct_buf[allele_idx];
+    if (cur_allele_ct > second_most_common_allele_ct) {
+      if (cur_allele_ct > most_common_allele_ct) {
+	second_most_common_allele_ct = most_common_allele_ct;
+	second_most_common_idx = most_common_idx;
+	most_common_allele_ct = cur_allele_ct;
+	most_common_idx = allele_idx;
+      } else {
+	second_most_common_allele_ct = cur_allele_ct;
+	second_most_common_idx = allele_idx;
+      }
+    }
+  }
+  if (most_common_idx + second_most_common_idx == 1) {
+    goto pgr_read_genovec_subset_then_common2_refalt1_finish;
+  }
+  uint32_t rarealt_is_minor = (most_common_idx < 2);
+  if (rarealt_is_minor || (second_most_common_idx < 2)) {
+    // One of the most common alleles is ref or alt1, and the other is a
+    // rarealt.  Suppose for clarity that ref and alt2 are the two most common.
+    // 1. Keep just the hom ref calls in the base genotype vector.  het
+    //    ref/alt1 and hom alt1 are converted to missing.
+    // 2. Search aux1_code_vec for het ref/alt2 and hom alt2 genotypes.  Update
+    //    the corresponding positions in genovec.
+    uint32_t ref_or_alt1_idx = rarealt_is_minor? most_common_idx : second_most_common_idx;
+    if (ref_or_alt1_idx == 1) {
+      genovec_invert_unsafe(sample_ct, genovec);
+    }
+    genovec_nonzero_to_missing_unsafe(sample_ct, genovec);
+    if (!rarealt_is_minor) {
+      genovec_invert_unsafe(sample_ct, genovec);
+    }
+    if (subsetting_required) {
+      aux1_update_genovec_subset_match2(pgrp->workspace_ambig_sample_ids, pgrp->workspace_aux1_nonmissing_vec, pgrp->workspace_aux1_code_vec, sample_include, sample_include_cumulative_popcounts, alt_allele_ct, most_common_idx, second_most_common_idx, aux1_nonmissing_ct, genovec);
+    } else {
+      aux1_update_genovec_match2_unsafe(pgrp->workspace_ambig_sample_ids, pgrp->workspace_aux1_nonmissing_vec, pgrp->workspace_aux1_code_vec, alt_allele_ct, most_common_idx, second_most_common_idx, aux1_nonmissing_ct, genovec);
+    }
+    return kPglRetSuccess;
+  }
+  // Both of the most common alleles are rarealts.  Supposing for clarity that
+  // they are alt2 and alt3,
+  // 1. Initialize genovec to all-missing.
+  // 2. Search aux1_code_vec for hom alt2, het alt2/alt3, and hom alt3
+  //    genotypes.  Update the corresponding positions in genovec.
+  fill_all_bits(sample_ct * 2, genovec);
+  if (subsetting_required) {
+    aux1_update_genovec_subset_match3(pgrp->workspace_ambig_sample_ids, pgrp->workspace_aux1_nonmissing_vec, pgrp->workspace_aux1_code_vec, sample_include, sample_include_cumulative_popcounts, alt_allele_ct, most_common_idx, second_most_common_idx, aux1_nonmissing_ct, genovec);
+  } else {
+    aux1_update_genovec_match3_unsafe(pgrp->workspace_ambig_sample_ids, pgrp->workspace_aux1_nonmissing_vec, pgrp->workspace_aux1_code_vec, alt_allele_ct, most_common_idx, second_most_common_idx, aux1_nonmissing_ct, genovec);
+  }
+  return kPglRetSuccess;
+}
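+
+// [Editor's note: standalone sketch of the 2-bit inversion performed by
+// genovec_invert_unsafe() in the function above (one way to implement it,
+// not necessarily the production code).  Swapping hom-ref (0) and hom-alt
+// (2) while leaving het (1) and missing (3) untouched only requires
+// flipping the high bit of each 2-bit entry whose low bit is clear.]
+static uint64_t genoword_invert_sketch(uint64_t geno_word) {
+  const uint64_t mask5555 = 0x5555555555555555LLU;
+  // entries with a clear low bit -> XOR their high bit
+  return geno_word ^ (((~geno_word) & mask5555) << 1);
+}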
+
+pglerr_t parse_difflist_just_ambig_ids(const unsigned char* fread_end, pgen_reader_t* pgrp, const unsigned char** fread_pp) {
+  // Side effects: uses pgr.workspace_raregeno_tmp_loadbuf.
+  uintptr_t* __restrict raregeno_iter = pgrp->workspace_raregeno_tmp_loadbuf;
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const unsigned char* group_info_iter;
+  uint32_t difflist_len;
+  pglerr_t reterr = parse_difflist_header(fread_end, pgrp->fi.raw_sample_ct, fread_pp, raregeno_iter, &group_info_iter, &difflist_len);
+  if (reterr || (!difflist_len)) {
+    pgrp->workspace_ambig_id_ct = 0;
+    return reterr;
+  }
+  // variant is guaranteed to be multiallelic, so little point in optimizing
+  // for sparsity.
+  const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  uint32_t* __restrict ambig_sample_ids = pgrp->workspace_ambig_sample_ids;
+  uint32_t subgroup_idx = 0;
+  uint32_t subgroup_len_m1 = kBitsPerWordD2 - 1;
+  uint32_t ambig_id_ct = 0;
+  uintptr_t raw_sample_idx = 0;
+  while (1) {
+    if (subgroup_idx >= subgroup_idx_last) {
+      if (subgroup_idx > subgroup_idx_last) {
+	pgrp->workspace_ambig_id_ct = ambig_id_ct;
+	return kPglRetSuccess;
+      }
+      subgroup_len_m1 &= difflist_len - 1;
+    }
+    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
+#ifdef __LP64__
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+#endif
+      raw_sample_idx = 0;
+      memcpy(&raw_sample_idx, group_info_iter, sample_id_byte_ct);
+      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    } else {
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+    }
+    ++subgroup_idx;
+    // invert so we can compare vs. zero
+    uintptr_t cur_raregeno_word_inv = ~(*raregeno_iter++);
+    uint32_t difflist_idx_lowbits = 0;
+    while (1) {
+#ifndef __LP64__
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+#endif
+      if (!(cur_raregeno_word_inv & 3)) {
+	ambig_sample_ids[ambig_id_ct++] = (uint32_t)raw_sample_idx;
+      }
+      if (difflist_idx_lowbits == subgroup_len_m1) {
+	break;
+      }
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+      ++difflist_idx_lowbits;
+      cur_raregeno_word_inv >>= 2;
+    }
+  }
+}
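+
+// [Editor's note: sketch of the inversion trick in the loop above.
+// Genotype code 3 marks an ambiguous (multiallelic or missing) entry;
+// after a bitwise NOT it becomes the only code equal to 0, so the inner
+// loop can use the cheap "!(word & 3)" test while shifting through the
+// raregeno word.]
+static uint32_t count_geno3_in_word_sketch(uint64_t raregeno_word, uint32_t entry_ct) {
+  // precondition: entry_ct <= 32 (two bits per entry in a 64-bit word)
+  uint64_t inverted = ~raregeno_word;
+  uint32_t ct = 0;
+  for (uint32_t uii = 0; uii < entry_ct; ++uii) {
+    ct += !(inverted & 3);
+    inverted >>= 2;
+  }
+  return ct;
+}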
+
+pglerr_t parse_just_ambig_ids(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, const unsigned char** fread_pp) {
+  assert(pgrp->fi.vrtypes);
+  // Just initializes pgr.workspace_ambig_sample_ids and
+  // pgr.workspace_ambig_id_ct.  Avoids some unnecessary work when we're just
+  // interested in rare alt(s).  May use pgrp->workspace_vec.
+  const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
+  assert(vrtype_multiallelic(vrtype));
+  const uint32_t maintrack_vrtype = vrtype & 7;
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+  const uint32_t is_ld_compressed = vrtype_ld_compressed(maintrack_vrtype);
+  const uint32_t is_ldbase = pgrp->fi.vrtypes && (!is_ld_compressed) && vrtype_ld_compressed(pgrp->fi.vrtypes[vidx + 1]);
+  if (!maintrack_vrtype) {
+    const uint32_t genovec_byte_ct = QUATERCT_TO_BYTECT(raw_sample_ct);
+    if ((uintptr_t)(fread_end - (*fread_pp)) < genovec_byte_ct) {
+      return kPglRetMalformedInput;
+    }
+    const unsigned char* fread_ptr_new = &((*fread_pp)[genovec_byte_ct]);
+#ifdef __arm__
+  #error "Unaligned accesses in parse_just_ambig_ids()."
+#endif
+    const uintptr_t* cur_genoarr = (const uintptr_t*)(*fread_pp);
+    if (is_ldbase) {
+      if (!subsetting_required) {
+	memcpy(pgrp->ldbase_genovec, cur_genoarr, genovec_byte_ct);
+	cur_genoarr = pgrp->ldbase_genovec; // may as well guarantee alignment
+      } else {
+	copy_quaterarr_nonempty_subset(cur_genoarr, sample_include, raw_sample_ct, sample_ct, pgrp->ldbase_genovec);
+      }
+    }
+    extract_genoarr_ambig_ids(cur_genoarr, raw_sample_ct, pgrp->workspace_ambig_sample_ids, &(pgrp->workspace_ambig_id_ct));
+    *fread_pp = fread_ptr_new;
+    return kPglRetSuccess;
+  }
+  if (is_ld_compressed) {
+    // LD compression
+    pglerr_t reterr = ld_load_minimal_subset_if_necessary(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp);
+    if (reterr) {
+      return reterr;
+    }
+  }
+  if (!is_ldbase) {
+    // difflist storage, unnecessary to save ldbase info
+    if (maintrack_vrtype == 1) {
+      *fread_pp += (raw_sample_ct + 15) / CHAR_BIT;
+    }
+    return parse_difflist_just_ambig_ids(fread_end, pgrp, fread_pp);
+  }
+  if (maintrack_vrtype == 1) {
+    uintptr_t* cur_genoarr = subsetting_required? pgrp->workspace_vec : pgrp->ldbase_genovec;
+    if (parse_onebit_unsafe(fread_end, 1, fread_pp, pgrp, cur_genoarr)) {
+      return kPglRetMalformedInput;
+    }
+    if (subsetting_required) {
+      copy_quaterarr_nonempty_subset(cur_genoarr, sample_include, raw_sample_ct, sample_ct, pgrp->ldbase_genovec);
+    }
+    return kPglRetSuccess;
+  }
+  pglerr_t reterr = parse_and_save_difflist_subset(fread_end, sample_include, sample_include_cumulative_popcounts, raw_sample_ct, fread_pp, pgrp->workspace_ambig_sample_ids, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, &(pgrp->ldbase_difflist_len), &(pgrp->workspace_ambig_id_ct), pgrp->workspace_vec);
+  if (reterr) {
+    return reterr;
+  }
+  pgrp->ldbase_difflist_sample_ids[pgrp->ldbase_difflist_len] = sample_ct;
+  return kPglRetSuccess;
+}
+
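+// Dispatch summary for parse_just_ambig_ids(), matching the branches above:
+//   maintrack_vrtype 0: dense 2-bit genotype array
+//   maintrack_vrtype 1: "onebit" track (one byte of common genotype codes
+//                       plus one selector bit per sample)
+//   other difflist vrtypes: sparse difference list vs. the common genotype
+// LD-compressed variants are first resolved against the cached ldbase
+// variant via ld_load_minimal_subset_if_necessary().
+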
+pglerr_t pgr_read_allele_countvec_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, pgen_reader_t* pgrp, uintptr_t* __restrict allele_countvec) {
+  if (!sample_ct) {
+    return kPglRetSuccess;
+  }
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+  if (allele_idx < 2) {
+    const uint32_t is_multiallelic = vrtype_multiallelic(vrtype);
+    const unsigned char* fread_ptr;
+    const unsigned char* fread_end;
+    pglerr_t reterr = read_refalt1_genovec_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, is_multiallelic? (&fread_ptr) : nullptr, &fread_end, allele_countvec);
+    if (reterr) {
+      return reterr;
+    }
+    if (!allele_idx) {
+      genovec_invert_unsafe(sample_ct, allele_countvec);
+    }
+    if (!is_multiallelic) {
+      return kPglRetSuccess;
+    }
+    const uint32_t alt_allele_ct = (uint32_t)(pgrp->fi.allele_idx_offsets[vidx + 1] - pgrp->fi.allele_idx_offsets[vidx]);
+    uint32_t aux1_nonmissing_ct;
+    if (parse_aux1(fread_end, alt_allele_ct, &fread_ptr, pgrp, &aux1_nonmissing_ct)) {
+      return kPglRetReadFail;
+    }
+    if (subsetting_required) {
+      aux1_update_ref_or_alt1_countvec_subset(pgrp->workspace_ambig_sample_ids, pgrp->workspace_aux1_nonmissing_vec, pgrp->workspace_aux1_code_vec, sample_include, sample_include_cumulative_popcounts, alt_allele_ct, aux1_nonmissing_ct, allele_idx, allele_countvec);
+    } else {
+      aux1_update_ref_or_alt1_countvec(pgrp->workspace_ambig_sample_ids, pgrp->workspace_aux1_nonmissing_vec, pgrp->workspace_aux1_code_vec, alt_allele_ct, aux1_nonmissing_ct, allele_idx, allele_countvec);
+    }
+    return kPglRetSuccess;
+  }
+  const unsigned char* fread_ptr;
+  const unsigned char* fread_end;
+  if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+    return kPglRetReadFail;
+  }
+  assert(vrtype_multiallelic(vrtype));
+  const uint32_t alt_allele_ct = (uint32_t)(pgrp->fi.allele_idx_offsets[vidx + 1] - pgrp->fi.allele_idx_offsets[vidx]);
+  assert(allele_idx <= alt_allele_ct);
+  uint32_t* ambig_sample_ids = pgrp->workspace_ambig_sample_ids;
+  if ((vrtype & 7) == 7) {
+    // most values missing, 0b11 entries
+    fill_ulong_one(QUATERCT_TO_WORDCT(sample_ct), allele_countvec);
+    uintptr_t* __restrict raregeno_vec = pgrp->workspace_raregeno_vec;
+    uint32_t* __restrict difflist_sample_ids = pgrp->workspace_difflist_sample_ids;
+    uint32_t difflist_len;
+    pglerr_t reterr = parse_and_save_difflist(fread_end, raw_sample_ct, &fread_ptr, raregeno_vec, difflist_sample_ids, &difflist_len);
+    if (reterr || (!difflist_len)) {
+      return reterr;
+    }
+    uintptr_t* raregeno_iter = raregeno_vec;
+    uintptr_t raregeno_word = *raregeno_iter++;
+    uint32_t difflist_idx = 0;
+    uint32_t ambig_id_ct = 0;
+    if (subsetting_required) {
+      while (1) {
+	const uint32_t cur_raregeno = raregeno_word & 3;
+	const uint32_t sample_idx = difflist_sample_ids[difflist_idx];
+	++difflist_idx;
+	if (cur_raregeno == 3) {
+	  ambig_sample_ids[ambig_id_ct++] = sample_idx;
+	} else if (IS_SET(sample_include, sample_idx)) {
+	  const uint32_t subsetted_pos = raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, sample_idx);
+	  CLEAR_QUATERARR_ENTRY(subsetted_pos, allele_countvec);
+	}
+	if (difflist_idx == difflist_len) {
+	  break;
+	}
+	raregeno_word >>= 2;
+	if (!(difflist_idx % kBitsPerWordD2)) {
+	  raregeno_word = *raregeno_iter++;
+	}
+      }
+    } else {
+      while (1) {
+	const uint32_t cur_raregeno = raregeno_word & 3;
+	const uint32_t sample_idx = difflist_sample_ids[difflist_idx];
+	++difflist_idx;
+	if (cur_raregeno == 3) {
+	  ambig_sample_ids[ambig_id_ct++] = sample_idx;
+	} else {
+	  CLEAR_QUATERARR_ENTRY(sample_idx, allele_countvec);
+	}
+	if (difflist_idx == difflist_len) {
+	  break;
+	}
+	raregeno_word >>= 2;
+	if (!(difflist_idx % kBitsPerWordD2)) {
+	  raregeno_word = *raregeno_iter++;
+	}
+      }
+    }
+    pgrp->workspace_ambig_id_ct = ambig_id_ct;
+  } else {
+    pglerr_t reterr = parse_just_ambig_ids(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, &fread_ptr);
+    if (reterr) {
+      return reterr;
+    }
+    if (vrtype_ld_compressed(vrtype)) {
+      // todo: optimize difflist case
+      if (!(pgrp->ldbase_stypes & kfPgrLdcacheQuater)) {
+	assert(pgrp->ldbase_stypes & kfPgrLdcacheDifflist);
+	pgr_difflist_to_genovec_unsafe(pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3, sample_ct, pgrp->ldbase_difflist_len, pgrp->ldbase_genovec);
+	pgrp->ldbase_stypes |= kfPgrLdcacheQuater;
+      }
+      copy_quaterarr(pgrp->ldbase_genovec, sample_ct, allele_countvec);
+      genovec_nonmissing_to_zero_unsafe(sample_ct, allele_countvec);
+    } else {
+      // most values nonmissing, 0b00 entries
+      fill_ulong_zero(QUATERCT_TO_WORDCT(sample_ct), allele_countvec);
+    }
+  }
+  uint32_t aux1_nonmissing_ct;
+  if (parse_aux1(fread_end, alt_allele_ct, &fread_ptr, pgrp, &aux1_nonmissing_ct)) {
+    return kPglRetReadFail;
+  }
+  uint32_t ambig_id_ct = pgrp->workspace_ambig_id_ct;
+  uint32_t ambig_missing_ct = ambig_id_ct - aux1_nonmissing_ct;
+  const uintptr_t* __restrict aux1_nonmissing_vec = pgrp->workspace_aux1_nonmissing_vec;
+  if (subsetting_required) {
+    uint32_t ambig_idx = 0;
+    for (uint32_t ambig_missing_idx = 0; ambig_missing_idx < ambig_missing_ct; ++ambig_missing_idx) {
+      next_unset_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+      ASSIGN_QUATERARR_ENTRY(raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, ambig_sample_ids[ambig_idx]), 3, allele_countvec);
+    }
+    aux1_update_rarealt_countvec_subset(ambig_sample_ids, aux1_nonmissing_vec, pgrp->workspace_aux1_code_vec, sample_include, sample_include_cumulative_popcounts, alt_allele_ct, aux1_nonmissing_ct, allele_idx, allele_countvec);
+  } else {
+    uint32_t ambig_idx = 0;
+    for (uint32_t ambig_missing_idx = 0; ambig_missing_idx < ambig_missing_ct; ++ambig_missing_idx) {
+      next_unset_unsafe_ck(aux1_nonmissing_vec, &ambig_idx);
+      ASSIGN_QUATERARR_ENTRY(ambig_sample_ids[ambig_idx], 3, allele_countvec);
+    }
+    aux1_update_rarealt_countvec(ambig_sample_ids, aux1_nonmissing_vec, pgrp->workspace_aux1_code_vec, alt_allele_ct, aux1_nonmissing_ct, allele_idx, allele_countvec);
+  }
+  return kPglRetSuccess;
+}
+
+void detect_genovec_hets_hw(const uintptr_t* __restrict genovec, uint32_t raw_sample_ctl2, halfword_t* all_hets_hw) {
+  // requires trailing bits of genovec to be zeroed out.  does not update last
+  // all_hets[] halfword if raw_sample_ctl2 is odd.
+  for (uint32_t widx = 0; widx < raw_sample_ctl2; ++widx) {
+    const uintptr_t cur_word = genovec[widx];
+    uintptr_t ww = (~(cur_word >> 1)) & cur_word & kMask5555; // low 1, high 0
+    all_hets_hw[widx] = pack_word_to_halfword(ww);
+  }
+}
+
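+// Sketch of the bit trick above, under this file's 2-bit encoding where
+// 0b01 = het: an entry is het exactly when its low bit is set and its high
+// bit is clear, so a shift, a complement, and two masks find every het in a
+// word at once.
+static inline uintptr_t het_mask_sketch(uintptr_t genoword) {
+  // width-agnostic local copy of the 0b0101...01 mask, for illustration
+  const uintptr_t mask5555 = (~(uintptr_t)0) / 3;
+  return (~(genoword >> 1)) & genoword & mask5555;
+}
+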
+pglerr_t parse_and_apply_difflist_hphase_subset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t multiallelic_relevant, const unsigned char** fread_pp, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict all_hets) {
+  // Side effects: uses pgr.workspace_raregeno_tmp_loadbuf.
+  // If multiallelic_relevant is true, a list of sample indices with freshly
+  // loaded raregeno value 0b11 is saved to pgr.workspace_ambig_sample_ids, and
+  // pgr.workspace_ambig_id_ct is set to the length of the list.
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  if (sample_ct == raw_sample_ct) {
+    pglerr_t reterr = parse_and_apply_difflist(fread_end, multiallelic_relevant, fread_pp, pgrp, genovec);
+    if (reterr) {
+      return reterr;
+    }
+    pgr_detect_genovec_hets(genovec, raw_sample_ct, all_hets);
+    return kPglRetSuccess;
+  }
+  uintptr_t* cur_raregeno_iter = pgrp->workspace_raregeno_tmp_loadbuf;
+  const unsigned char* group_info_iter;
+  uint32_t difflist_len;
+  pglerr_t reterr = parse_difflist_header(fread_end, raw_sample_ct, fread_pp, cur_raregeno_iter, &group_info_iter, &difflist_len);
+  if (reterr || (!difflist_len)) {
+    return reterr;
+  }
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
+  uint32_t* ambig_sample_ids = multiallelic_relevant? pgrp->workspace_ambig_sample_ids : nullptr;
+  uintptr_t raw_sample_idx = 0;
+  uint32_t ambig_id_ct = 0;
+  uint32_t subgroup_idx = 0;
+  while (1) {
+    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
+    if (subgroup_idx >= subgroup_idx_last) {
+      if (subgroup_idx > subgroup_idx_last) {
+	pgrp->workspace_ambig_id_ct = ambig_id_ct;
+	return kPglRetSuccess;
+      }
+      remaining_deltas_in_subgroup &= difflist_len - 1;
+    }
+    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
+      raw_sample_idx = 0;
+      memcpy(&raw_sample_idx, group_info_iter, sample_id_byte_ct);
+      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    } else {
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+    }
+    ++subgroup_idx;
+    uintptr_t cur_raregeno_word = *cur_raregeno_iter++;
+    // This loop tends to be the decompression bottleneck.  Tried to modify it
+    // to process 4 entries at a time, but that didn't end up helping.
+    while (1) {
+      // always check, since otherwise ASSIGN_QUATERARR_ENTRY() can scribble
+      // over arbitrary memory
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+      const uintptr_t cur_geno = cur_raregeno_word & 3;
+      if (IS_SET(sample_include, raw_sample_idx)) {
+	ASSIGN_QUATERARR_ENTRY(raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, (uint32_t)raw_sample_idx), cur_geno, genovec);
+      }
+      if (cur_geno == 1) {
+	SET_BIT(raw_sample_idx, all_hets);
+      } else {
+	CLEAR_BIT(raw_sample_idx, all_hets); // needed for LD decompression
+	if (multiallelic_relevant && (cur_geno == 3)) {
+	  ambig_sample_ids[ambig_id_ct++] = (uint32_t)raw_sample_idx;
+	}
+      }
+      if (!remaining_deltas_in_subgroup) {
+	break;
+      }
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+      --remaining_deltas_in_subgroup;
+      cur_raregeno_word >>= 2;
+    }
+  }
+}
+
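+// Sketch (helper names are illustrative, not upstream) of the
+// raw_to_subsetted_pos() mapping used above, assuming
+// cumulative_popcounts[w] holds the number of included samples in all words
+// before w, which matches how the table is consumed here: the subsetted
+// index of a raw sample index is the count of included samples before it.
+static inline uint32_t popcount_word_sketch(uintptr_t ww) {
+  uint32_t ct = 0;
+  for (; ww; ww &= ww - k1LU) {
+    // clears the lowest set bit each iteration
+    ++ct;
+  }
+  return ct;
+}
+
+static inline uint32_t raw_to_subsetted_pos_sketch(const uintptr_t* sample_include, const uint32_t* cumulative_popcounts, uint32_t raw_idx) {
+  const uint32_t widx = raw_idx / kBitsPerWord;
+  const uintptr_t low_bits_mask = (k1LU << (raw_idx % kBitsPerWord)) - k1LU;
+  return cumulative_popcounts[widx] + popcount_word_sketch(sample_include[widx] & low_bits_mask);
+}
+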
+pglerr_t parse_non_ld_genovec_hphase_subset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vrtype, uint32_t difflist_ambig_ids_needed, const unsigned char** fread_pp, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict all_hets) {
+  // If all_hets is nullptr, this is essentially identical to
+  // parse_non_ld_genovec_subset_unsafe().
+  // Side effects:
+  //   may use pgrp->workspace_raregeno_tmp_loadbuf
+  //   may use pgrp->workspace_vec (subsetting)
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  if (!vrtype_difflist(vrtype)) {
+    const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+    uintptr_t* raw_genovec = subsetting_required? pgrp->workspace_vec : genovec;
+    pglerr_t reterr = parse_1or2bit_genovec_unsafe(fread_end, vrtype, difflist_ambig_ids_needed, fread_pp, pgrp, raw_genovec);
+    // can technically avoid this return and just plow forward in error case,
+    // but that tiny optimization is not worth the associated maintenance
+    // problems
+    if (reterr) {
+      return reterr;
+    }
+    zero_trailing_quaters(raw_sample_ct, raw_genovec);
+    if (all_hets) {
+      pgr_detect_genovec_hets_unsafe(raw_genovec, QUATERCT_TO_WORDCT(raw_sample_ct), all_hets);
+    }
+    if (subsetting_required) {
+      copy_quaterarr_nonempty_subset(raw_genovec, sample_include, raw_sample_ct, sample_ct, genovec);
+    }
+    return kPglRetSuccess;
+  }
+  const uint32_t vrtype_low2 = vrtype & 3;
+  const uint32_t word_ct = QUATERCT_TO_WORDCT(sample_ct);
+  memset(genovec, vrtype_low2 * 0x55, word_ct * kBytesPerWord);
+  zero_trailing_quaters(sample_ct, genovec);
+  // common genotype can't be het
+  if (all_hets) {
+    fill_ulong_zero(BITCT_TO_WORDCT(raw_sample_ct), all_hets);
+    return parse_and_apply_difflist_hphase_subset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, difflist_ambig_ids_needed, fread_pp, pgrp, genovec, all_hets);
+  } else {
+    return parse_and_apply_difflist_subset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, difflist_ambig_ids_needed, fread_pp, pgrp, genovec);
+  }
+}
+
+// may use workspace_vec
+pglerr_t ld_load_genovec_hphase_subset_if_necessary(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp) {
+  // bugfix: this conditional was in the other order, which was wrong since
+  // we depend on ld_load_necessary() to set pgrp->ldbase_vidx as a side effect
+  // todo: make that explicit instead of a side effect...
+  if (ld_load_necessary(vidx, pgrp) || (!(pgrp->ldbase_stypes & kfPgrLdcacheAllHets))) {
+    const uint32_t ldbase_vidx = pgrp->ldbase_vidx;
+    const unsigned char* fread_ptr;
+    const unsigned char* fread_end;
+    if (init_read_ptrs(ldbase_vidx, pgrp, &fread_ptr, &fread_end)) {
+      return kPglRetReadFail;
+    }
+    pgrp->ldbase_stypes = kfPgrLdcacheQuater;
+    return parse_non_ld_genovec_hphase_subset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, pgrp->fi.vrtypes[ldbase_vidx], 0, &fread_ptr, pgrp, pgrp->ldbase_genovec, pgrp->ldbase_all_hets);
+  }
+  if (!(pgrp->ldbase_stypes & kfPgrLdcacheQuater)) {
+    assert(pgrp->ldbase_stypes & kfPgrLdcacheDifflist);
+    pgr_difflist_to_genovec_unsafe(pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3, sample_ct, pgrp->ldbase_difflist_len, pgrp->ldbase_genovec);
+    pgrp->ldbase_stypes |= kfPgrLdcacheQuater;
+  }
+  return kPglRetSuccess;
+}
+
+// "h" in hphase = "hardcall"
+// No need for fread_pp/fread_endp, since any function which needed them would
+// use a multiallelic variant-supporting phase loader.
+// Iff *phasepresent_ct_ptr is nonzero, returned phasepresent is guaranteed to
+// only have set bits for het calls where phase information is present.
+/*
+pglerr_t pgr_read_refalt1_genovec_hphase_raw_unsafe(uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict phaseraw, uint32_t* phasepresent_ct_ptr) {
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  if (!vrtype_hphase(vrtype)) {
+    // don't bother updating ldbase_all_hets, too much of a performance
+    // penalty, and too likely that we won't need it
+    *phasepresent_ct_ptr = 0;
+    return read_refalt1_genovec_subset_unsafe(nullptr, nullptr, raw_sample_ct, vidx, pgrp, nullptr, nullptr, genovec);
+  }
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  const uint32_t multiallelic_relevant = vrtype_multiallelic(vrtype);
+  const unsigned char* fread_ptr;
+  const unsigned char* fread_end;
+  uintptr_t* all_hets = pgrp->workspace_all_hets;
+  if (vrtype_ld_compressed(vrtype)) {
+    // ldbase_all_hets not needed in this case
+    pglerr_t reterr = ld_load_genovec_subset_if_necessary(nullptr, nullptr, raw_sample_ct, vidx, pgrp);
+    if (reterr) {
+      return reterr;
+    }
+    if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+      return kPglRetReadFail;
+    }
+    copy_quaterarr(pgrp->ldbase_genovec, raw_sample_ct, genovec);
+
+    reterr = parse_and_apply_difflist(fread_end, multiallelic_relevant, &fread_ptr, pgrp, genovec);
+    if (reterr) {
+      return reterr;
+    }
+    pgr_detect_genovec_hets(genovec, raw_sample_ct, all_hets);
+    if ((vrtype & 7) == 3) {
+      genovec_invert_unsafe(raw_sample_ct, genovec);
+    }
+  } else {
+    if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+      return kPglRetReadFail;
+    }
+    pglerr_t reterr = parse_non_ld_genovec_hphase_subset(fread_end, nullptr, nullptr, raw_sample_ct, vrtype, multiallelic_relevant, &fread_ptr, pgrp, genovec, all_hets);
+    if (reterr) {
+      return reterr;
+    }
+    const uint32_t is_ldbase = pgrp->fi.vrtypes && vrtype_ld_compressed(pgrp->fi.vrtypes[vidx + 1]);
+    if (is_ldbase) {
+      copy_quaterarr(genovec, raw_sample_ct, pgrp->ldbase_genovec);
+      pgrp->ldbase_vidx = vidx;
+      pgrp->ldbase_stypes = kfPgrLdcacheQuater;
+    }
+  }
+  if (multiallelic_relevant) {
+    // todo
+    // can't ignore multiallelic track, since it may contain additional het
+    // calls.  (these additional calls must *not* be saved to ldbase_all_hets.)
+    return kPglRetNotYetSupported;
+  }
+  const uint32_t het_ct = popcount_longs(all_hets, raw_sample_ctl);
+  if (!het_ct) {
+    // there shouldn't be a hphase track at all in this case
+    return kPglRetMalformedInput;
+  }
+  const uint32_t het_ctdl = het_ct / kBitsPerWord;
+  phaseraw[het_ctdl] = 0;
+  const uint32_t first_half_byte_ct = 1 + (het_ct / CHAR_BIT);
+  memcpy(phaseraw, fread_ptr, first_half_byte_ct);
+  if (!(fread_ptr[0] & 1)) {
+    // phase always present, phasepresent not stored
+    *phasepresent_ct_ptr = het_ct;
+    return kPglRetSuccess;
+  }
+  const uint32_t raw_phasepresent_ct = popcount_longs(phaseraw, het_ctdl + 1) - 1;
+  if (!raw_phasepresent_ct) {
+    // there shouldn't be a hphase track at all in this case, either
+    return kPglRetMalformedInput;
+  }
+  // put this in a phasepresent-independent location, to make things more
+  // convenient for the caller
+  memcpy(&(phaseraw[1 + (raw_sample_ct / kBitsPerWord)]), &(fread_ptr[first_half_byte_ct]), DIV_UP(raw_phasepresent_ct, CHAR_BIT));
+  *phasepresent_ct_ptr = raw_phasepresent_ct;
+  return kPglRetSuccess;
+}
+*/
+
+// If fread_pp/fread_endp are non-null, this always moves fread_ptr to the end
+// of aux2.  Set phasepresent/phaseinfo to nullptr when you don't actually care
+// about the contents of aux2.
+pglerr_t read_refalt1_genovec_hphase_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* phasepresent_ct_ptr) {
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  const uint32_t multiallelic_relevant = vrtype_multiallelic(vrtype);
+  if (!vrtype_hphase(vrtype)) {
+    // don't bother updating ldbase_all_hets, too much of a performance
+    // penalty, and too likely that we won't need it
+    *phasepresent_ct_ptr = 0;
+    pglerr_t reterr = read_refalt1_genovec_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, fread_pp, fread_endp, genovec);
+    if ((!multiallelic_relevant) || (!fread_pp) || reterr) {
+      return reterr;
+    }
+    return parse_aux1(*fread_endp, (uint32_t)(pgrp->fi.allele_idx_offsets[vidx + 1] - pgrp->fi.allele_idx_offsets[vidx] - 1), fread_pp, pgrp, nullptr);
+  }
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+  const unsigned char* fread_ptr;
+  const unsigned char* fread_end;
+  uintptr_t* all_hets = pgrp->workspace_all_hets;
+  if (vrtype_ld_compressed(vrtype)) {
+    pglerr_t reterr;
+    if (!subsetting_required) {
+      // ldbase_all_hets not needed in this case
+      reterr = ld_load_genovec_subset_if_necessary(nullptr, nullptr, sample_ct, vidx, pgrp);
+    } else {
+      reterr = ld_load_genovec_hphase_subset_if_necessary(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp);
+      memcpy(all_hets, pgrp->ldbase_all_hets, raw_sample_ctl * sizeof(intptr_t));
+    }
+    if (reterr) {
+      return reterr;
+    }
+    if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+      return kPglRetReadFail;
+    }
+    copy_quaterarr(pgrp->ldbase_genovec, sample_ct, genovec);
+    reterr = parse_and_apply_difflist_hphase_subset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, multiallelic_relevant, &fread_ptr, pgrp, genovec, all_hets);
+    if (reterr) {
+      return reterr;
+    }
+    if ((vrtype & 7) == 3) {
+      genovec_invert_unsafe(sample_ct, genovec);
+    }
+  } else {
+    if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+      return kPglRetReadFail;
+    }
+    pglerr_t reterr = parse_non_ld_genovec_hphase_subset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vrtype, multiallelic_relevant, &fread_ptr, pgrp, genovec, all_hets);
+    if (reterr) {
+      return reterr;
+    }
+    const uint32_t is_ldbase = pgrp->fi.vrtypes && vrtype_ld_compressed(pgrp->fi.vrtypes[vidx + 1]);
+    if (is_ldbase) {
+      copy_quaterarr(genovec, sample_ct, pgrp->ldbase_genovec);
+      pgrp->ldbase_vidx = vidx;
+      pgrp->ldbase_stypes = kfPgrLdcacheQuater;
+      if (subsetting_required) {
+	pgrp->ldbase_stypes |= kfPgrLdcacheAllHets;
+        memcpy(pgrp->ldbase_all_hets, all_hets, raw_sample_ctl * sizeof(intptr_t));
+      }
+    }
+  }
+  if (multiallelic_relevant) {
+    // todo
+    // can't ignore multiallelic track, since it may contain additional het
+    // calls.  (these additional calls must *not* be saved to ldbase_all_hets.)
+    return kPglRetNotYetSupported;
+  }
+  const uint32_t het_ct = (uint32_t)popcount_longs(all_hets, raw_sample_ctl);
+  if (!het_ct) {
+    // there shouldn't be a hphase track at all in this case; note that
+    // het_ct is not computed off a subset, so this check is safe
+    return kPglRetMalformedInput;
+  }
+  if (fread_pp) {
+    *fread_endp = fread_end;
+  }
+  const uint32_t het_ctdl = het_ct / kBitsPerWord;
+  const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+  if (!(fread_ptr[0] & 1)) {
+    // phase always present
+    if (fread_pp) {
+      *fread_pp = &(fread_ptr[1 + (het_ct / CHAR_BIT)]);
+      if (!phaseinfo) {
+	// for internal callers which just want to skip aux2
+	return kPglRetSuccess;
+      }
+    }
+    fill_ulong_zero(raw_sample_ctl, phaseinfo);
+    const uintptr_t* aux2_raw_phaseinfo_iter = (const uintptr_t*)fread_ptr;
+    uint32_t phaseinfo_widx = 0;
+    uint32_t phaseinfo_idx_lowbits = 1; // skip first bit
+    uint32_t loop_len = kBitsPerWord;
+    uint32_t sample_uidx = 0;
+    if (!subsetting_required) {
+      memcpy(phasepresent, all_hets, raw_sample_ctl * kBytesPerWord);
+      *phasepresent_ct_ptr = het_ct;
+      while (1) {
+	uintptr_t phaseinfo_word;
+	if (phaseinfo_widx >= het_ctdl) {
+	  if (phaseinfo_widx > het_ctdl) {
+	    return kPglRetSuccess;
+	  }
+	  loop_len = 1 + (het_ct % kBitsPerWord);
+	  phaseinfo_word = 0;
+	  // avoid possible segfault
+	  memcpy(&phaseinfo_word, &(aux2_raw_phaseinfo_iter[phaseinfo_widx]), DIV_UP(loop_len, CHAR_BIT));
+	} else {
+#ifdef __arm__
+  #error "Unaligned accesses in read_refalt1_genovec_hphase_subset_unsafe()."
+#endif
+	  phaseinfo_word = aux2_raw_phaseinfo_iter[phaseinfo_widx];
+	}
+	for (; phaseinfo_idx_lowbits < loop_len; ++phaseinfo_idx_lowbits, ++sample_uidx) {
+	  sample_uidx = next_set_unsafe(all_hets, sample_uidx);
+	  // bugfix: can't just use (phaseinfo_word & 1) and phaseinfo_word
+	  // >>= 1, since we skip the first bit on the first loop iteration
+	  if ((phaseinfo_word >> phaseinfo_idx_lowbits) & 1) {
+	    SET_BIT(sample_uidx, phaseinfo);
+	  }
+	}
+	phaseinfo_idx_lowbits = 0;
+	++phaseinfo_widx;
+      }
+    } else {
+      // we could drop the "phasepresent bit can only be set at hets" guarantee
+      // and speed up this case, but I doubt it's worth it
+      copy_bitarr_subset(all_hets, sample_include, sample_ct, phasepresent);
+      *phasepresent_ct_ptr = (uint32_t)popcount_longs(phasepresent, sample_ctl);
+      if (!(*phasepresent_ct_ptr)) {
+	return kPglRetSuccess;
+      }
+      while (1) {
+	uintptr_t phaseinfo_word;
+	if (phaseinfo_widx >= het_ctdl) {
+	  if (phaseinfo_widx > het_ctdl) {
+	    return kPglRetSuccess;
+	  }
+	  loop_len = 1 + (het_ct % kBitsPerWord);
+	  phaseinfo_word = 0;
+	  memcpy(&phaseinfo_word, &(aux2_raw_phaseinfo_iter[phaseinfo_widx]), DIV_UP(loop_len, CHAR_BIT));
+	} else {
+	  phaseinfo_word = aux2_raw_phaseinfo_iter[phaseinfo_widx];
+	}
+	for (; phaseinfo_idx_lowbits < loop_len; ++phaseinfo_idx_lowbits, ++sample_uidx) {
+	  sample_uidx = next_set_unsafe(all_hets, sample_uidx);
+	  if (((phaseinfo_word >> phaseinfo_idx_lowbits) & 1) && IS_SET(sample_include, sample_uidx)) {
+	    const uint32_t sample_idx = raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, sample_uidx);
+	    SET_BIT(sample_idx, phaseinfo);
+	  }
+	}
+	phaseinfo_idx_lowbits = 0;
+	++phaseinfo_widx;
+      }
+    }
+  }
+
+  // explicit phasepresent
+  const uintptr_t* aux2_first_half = (const uintptr_t*)fread_ptr;
+  uintptr_t* aux2_first_half_copy = pgrp->workspace_vec;
+  aux2_first_half_copy[het_ctdl] = 0;
+  memcpy(aux2_first_half_copy, aux2_first_half, 1 + (het_ct / CHAR_BIT));
+  const uint32_t raw_phasepresent_ct = (uint32_t)popcount_longs(aux2_first_half_copy, het_ctdl + 1) - 1;
+  if (!raw_phasepresent_ct) {
+    // there shouldn't be a hphase track at all in this case
+    return kPglRetMalformedInput;
+  }
+  if (fread_pp) {
+    *fread_pp = &(fread_ptr[1 + (het_ct / CHAR_BIT) + DIV_UP(raw_phasepresent_ct, CHAR_BIT)]);
+    if (!phaseinfo) {
+      return kPglRetSuccess;
+    }
+  }
+  fill_ulong_zero(sample_ctl, phasepresent);
+  fill_ulong_zero(sample_ctl, phaseinfo);
+  const uint32_t raw_phasepresent_ctl_m1 = BITCT_TO_WORDCT(raw_phasepresent_ct) - 1;
+  const uintptr_t* aux2_second_half = (const uintptr_t*)(&(fread_ptr[1 + (het_ct / CHAR_BIT)]));
+
+  uint32_t phasepresent_idx = 1;
+  uint32_t phaseinfo_widx = 0;
+  uint32_t loop_len = kBitsPerWord;
+  uint32_t sample_uidx = 0;
+  uint32_t phasepresent_ct = (1 - subsetting_required) * raw_phasepresent_ct;
+  while (1) {
+    uintptr_t phaseinfo_word;
+    if (phaseinfo_widx >= raw_phasepresent_ctl_m1) {
+      if (phaseinfo_widx > raw_phasepresent_ctl_m1) {
+	*phasepresent_ct_ptr = phasepresent_ct;
+	return kPglRetSuccess;
+      }
+      loop_len = MOD_NZ(raw_phasepresent_ct, kBitsPerWord);
+      phaseinfo_word = 0;
+      // avoid possible segfault
+      memcpy(&phaseinfo_word, &(aux2_second_half[phaseinfo_widx]), DIV_UP(loop_len, CHAR_BIT));
+    } else {
+      phaseinfo_word = aux2_second_half[phaseinfo_widx];
+    }
+    if (!subsetting_required) {
+      for (uint32_t phaseinfo_idx_lowbits = 0; phaseinfo_idx_lowbits < loop_len; ++phasepresent_idx, ++sample_uidx) {
+	// could conditionally use jump_forward_set_unsafe, if we need more
+	// efficient handling of merged datasets with only a few phased hets
+	sample_uidx = next_set_unsafe(all_hets, sample_uidx);
+	if (IS_SET(aux2_first_half_copy, phasepresent_idx)) {
+	  const uintptr_t new_bit = k1LU << (sample_uidx % kBitsPerWord);
+	  const uint32_t sample_widx = sample_uidx / kBitsPerWord;
+	  phasepresent[sample_widx] |= new_bit;
+	  phaseinfo[sample_widx] |= new_bit * (phaseinfo_word & 1);
+	  phaseinfo_word >>= 1;
+	  ++phaseinfo_idx_lowbits;
+	}
+      }
+    } else {
+      for (uint32_t phaseinfo_idx_lowbits = 0; phaseinfo_idx_lowbits < loop_len; ++phasepresent_idx, ++sample_uidx) {
+	sample_uidx = next_set_unsafe(all_hets, sample_uidx);
+	if (IS_SET(aux2_first_half_copy, phasepresent_idx)) {
+	  if (IS_SET(sample_include, sample_uidx)) {
+	    const uint32_t sample_idx = raw_to_subsetted_pos(sample_include, sample_include_cumulative_popcounts, sample_uidx);
+	    const uintptr_t new_bit = k1LU << (sample_idx % kBitsPerWord);
+	    const uint32_t sample_widx = sample_idx / kBitsPerWord;
+	    phasepresent[sample_widx] |= new_bit;
+	    phaseinfo[sample_widx] |= new_bit * (phaseinfo_word & 1);
+	    ++phasepresent_ct;
+	  }
+	  phaseinfo_word >>= 1;
+	  ++phaseinfo_idx_lowbits;
+	}
+      }
+    }
+    ++phaseinfo_widx;
+  }
+}
+
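+// aux2 (hardcall phase) track layout, as parsed above: bit 0 of the first
+// byte flags whether an explicit phasepresent bitarray is stored.  If
+// clear, every het is phased and bits 1..het_ct of the first half are
+// phaseinfo.  If set, bits 1..het_ct of the first half are phasepresent,
+// and a second half supplies one phaseinfo bit per set phasepresent bit
+// (both halves byte-rounded).
+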
+pglerr_t pgr_read_refalt1_genovec_hphase_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* phasepresent_ct_ptr) {
+  assert(vidx < pgrp->fi.raw_variant_ct);
+  if (!sample_ct) {
+    *phasepresent_ct_ptr = 0;
+    return kPglRetSuccess;
+  }
+  return read_refalt1_genovec_hphase_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, genovec, phasepresent, phaseinfo, phasepresent_ct_ptr);
+}
+
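+// Hypothetical caller sketch (not upstream code): load one variant's phased
+// hardcalls for a sample subset.  Assumes pgrp was fully initialized
+// elsewhere, and that genovec/phasepresent/phaseinfo were allocated by the
+// caller at the sizes the subsetted loaders above require.
+static pglerr_t load_phased_hardcalls_sketch(const uintptr_t* sample_include, const uint32_t* sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* genovec, uintptr_t* phasepresent, uintptr_t* phaseinfo) {
+  uint32_t phasepresent_ct;
+  pglerr_t reterr = pgr_read_refalt1_genovec_hphase_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, genovec, phasepresent, phaseinfo, &phasepresent_ct);
+  if (reterr) {
+    return reterr;
+  }
+  if (!phasepresent_ct) {
+    // no phased hardcalls for this variant; phasepresent/phaseinfo contents
+    // are unspecified in this case, per the guarantee documented above
+    return kPglRetSuccess;
+  }
+  // phasepresent/phaseinfo now describe the subsetted samples
+  return kPglRetSuccess;
+}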
+
+// similar to parse_and_save_difflist()
+pglerr_t parse_and_save_deltalist_as_bitarr(const unsigned char* fread_end, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* deltalist_include, uint32_t* deltalist_len_ptr) {
+  const unsigned char* group_info_iter;
+  pglerr_t reterr = parse_difflist_header(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, deltalist_len_ptr);
+  const uint32_t deltalist_len = *deltalist_len_ptr;
+  if (reterr || (!deltalist_len)) {
+    return reterr;
+  }
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(raw_sample_ct);
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  const uint32_t group_idx_last = (deltalist_len - 1) / kPglDifflistGroupSize;
+  fill_ulong_zero(raw_sample_ctl, deltalist_include);
+  uint32_t group_len_m1 = kPglDifflistGroupSize - 1;
+  uint32_t group_idx = 0;
+  while (1) {
+    if (group_idx >= group_idx_last) {
+      if (group_idx > group_idx_last) {
+	return kPglRetSuccess;
+      }
+      group_len_m1 &= deltalist_len - 1;
+    }
+    uintptr_t raw_sample_idx = 0;
+    memcpy(&raw_sample_idx, group_info_iter, sample_id_byte_ct);
+    group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    ++group_idx;
+    uint32_t raw_deltalist_idx_lowbits = 0;
+    while (1) {
+      // always check, otherwise we may scribble over arbitrary memory
+      if (raw_sample_idx >= raw_sample_ct) {
+	return kPglRetMalformedInput;
+      }
+      SET_BIT(raw_sample_idx, deltalist_include);
+      if (raw_deltalist_idx_lowbits == group_len_m1) {
+	break;
+      }
+      ++raw_deltalist_idx_lowbits;
+      raw_sample_idx += get_vint31(fread_end, fread_pp);
+    }
+  }
+}
+
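+// Sketch of the delta decoding these group loops rely on.  get_vint31() is
+// assumed here to be a little-endian base-128 varint reader capped at 31
+// bits (its definition lives elsewhere in this file); this sketch shows the
+// general shape only, without the production bounds checking against
+// fread_end or the 31-bit cap.
+static inline uint32_t read_base128_varint_sketch(const unsigned char** read_pp) {
+  uint32_t result = 0;
+  uint32_t shift = 0;
+  while (1) {
+    const uint32_t cur_byte = *((*read_pp)++);
+    result |= (cur_byte & 0x7f) << shift;
+    if (!(cur_byte & 0x80)) {
+      return result;
+    }
+    shift += 7;
+  }
+}
+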
+pglerr_t parse_dosage16(const unsigned char* fread_ptr, const unsigned char* fread_end, const uintptr_t* __restrict sample_include, uint32_t sample_ct, uint32_t vidx, uint32_t alt_allele_ct, pgen_reader_t* pgrp, uint32_t* dosage_ct_ptr, uintptr_t* __restrict dosage_present, uint16_t* dosage_vals) {
+  // Side effect: may use pgrp->workspace_dosage_present
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+  uintptr_t* raw_dosage_present = subsetting_required? pgrp->workspace_dosage_present : dosage_present;
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  const uint32_t is_unconditional_dosage = ((vrtype & 0x60) == 0x40);
+  uint32_t raw_dosage_ct;
+  if ((vrtype & 0x60) == 0x20) {
+    // case 1: dosage list
+    if (parse_and_save_deltalist_as_bitarr(fread_end, raw_sample_ct, &fread_ptr, raw_dosage_present, &raw_dosage_ct)) {
+      return kPglRetMalformedInput;
+    }
+  } else if (is_unconditional_dosage) {
+    // case 2: unconditional dosage.  handle separately from other two cases
+    // since missing values may be present.
+    fill_all_bits(raw_sample_ct, raw_dosage_present);
+    raw_dosage_ct = raw_sample_ct;
+  } else {
+    // case 3: dosage bitarray
+    raw_dosage_present[raw_sample_ctl - 1] = 0;
+    const uint32_t raw_sample_ctb = DIV_UP(raw_sample_ct, CHAR_BIT);
+    memcpy(raw_dosage_present, fread_ptr, raw_sample_ctb);
+    fread_ptr = &(fread_ptr[raw_sample_ctb]);
+    raw_dosage_ct = (uint32_t)popcount_longs(raw_dosage_present, raw_sample_ctl);
+  }
+  const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+  uint32_t dosage_ct;
+  if (subsetting_required) {
+    copy_bitarr_subset(raw_dosage_present, sample_include, sample_ct, dosage_present);
+    dosage_ct = (uint32_t)popcount_longs(dosage_present, sample_ctl);
+  } else {
+    dosage_ct = raw_dosage_ct;
+  }
+  if (dosage_ct_ptr) {
+    *dosage_ct_ptr = dosage_ct;
+  }
+  if (!dosage_ct) {
+    return kPglRetSuccess;
+  }
+#ifdef __arm__
+  #error "Unaligned accesses in parse_dosage16()."
+#endif
+  const uint16_t* dosage_vals_read_iter = (const uint16_t*)fread_ptr;
+  uint16_t* dosage_vals_write_iter = dosage_vals;
+  if (!(vrtype & 0x80)) {
+    if (alt_allele_ct == 1) {
+      if (dosage_ct == raw_dosage_ct) {
+	if (!is_unconditional_dosage) {
+	  memcpy(dosage_vals_write_iter, dosage_vals_read_iter, dosage_ct * sizeof(int16_t));
+	} else {
+	  for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	    const uint16_t cur_dosage = *dosage_vals_read_iter++;
+	    if (cur_dosage != 65535) {
+	      *dosage_vals_write_iter++ = cur_dosage;
+	    } else {
+	      CLEAR_BIT(sample_idx, raw_dosage_present);
+	    }
+	  }
+	  if (dosage_ct_ptr) {
+	    *dosage_ct_ptr = (uint32_t)((uintptr_t)(dosage_vals_write_iter - dosage_vals));
+	  }
+	}
+      } else {
+	uint32_t sample_uidx = 0;
+	// bugfix (22 May 2017): dosage_entry_idx needs to iterate up to
+	// raw_dosage_ct, not dosage_ct
+	for (uint32_t dosage_entry_idx = 0; dosage_entry_idx < raw_dosage_ct; ++dosage_entry_idx, ++sample_uidx, ++dosage_vals_read_iter) {
+	  next_set_unsafe_ck(raw_dosage_present, &sample_uidx);
+	  if (!IS_SET(sample_include, sample_uidx)) {
+	    continue;
+	  }
+	  *dosage_vals_write_iter++ = *dosage_vals_read_iter;
+	}
+      }
+    } else {
+      // todo: multiallelic dosage
+      // need to support downcode to ref/alt1 as well as raw load
+      // (dosage_ct_ptr should be nullptr iff we're doing a raw load)
+      return kPglRetNotYetSupported;
+    }
+    return kPglRetSuccess;
+  } else {
+    // todo: phased dosage
+    return kPglRetNotYetSupported;
+  }
+}
+
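+// Scale reference for the 16-bit dosages parsed above, per the constants
+// used throughout this file: 16384 corresponds to one alt allele copy,
+// 32768 to the diploid maximum, and 65535 is the missing-value sentinel.
+static inline double dosage16_to_double_sketch(uint16_t dosage_val) {
+  // caller is expected to have screened out the 65535 missing code first
+  return ((double)dosage_val) * (1.0 / 16384.0);
+}
+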
+pglerr_t pgr_read_refalt1_genovec_dosage16_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_vals, uint32_t* dosage_ct_ptr, uint32_t* is_explicit_alt1_ptr) {
+  assert(vidx < pgrp->fi.raw_variant_ct);
+  if (!sample_ct) {
+    *dosage_ct_ptr = 0;
+    return kPglRetSuccess;
+  }
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  if (!vrtype_dosage(vrtype)) {
+    pglerr_t reterr = read_refalt1_genovec_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, genovec);
+    *dosage_ct_ptr = 0;
+    return reterr;
+  }
+  const unsigned char* fread_ptr = nullptr;
+  const unsigned char* fread_end = nullptr;
+  uint32_t phasepresent_ct;
+  pglerr_t reterr = read_refalt1_genovec_hphase_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, &fread_ptr, &fread_end, genovec, nullptr, nullptr, &phasepresent_ct);
+  if (reterr) {
+    return reterr;
+  }
+  const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
+  const uint32_t alt_allele_ct = allele_idx_offsets? ((uint32_t)(allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx] - 1)) : 1;
+  *is_explicit_alt1_ptr = (alt_allele_ct > 1);
+  return parse_dosage16(fread_ptr, fread_end, sample_include, sample_ct, vidx, alt_allele_ct, pgrp, dosage_ct_ptr, dosage_present, dosage_vals);
+}
+
+uint64_t uint16_vec_sum(const uint16_t* __restrict uint16_vec, uint32_t entry_ct) {
+#ifdef __LP64__
+  // univec_hsum_32bit() could overflow once we exceed this
+  const uint32_t max_loop_len = (131072 / kInt32PerVec) - 1;
+
+  const vul_t m16 = VCONST_UL(kMask0000FFFF);
+  const vul_t* uint16_vvec_iter = (const vul_t*)uint16_vec;
+  uint32_t full_vecs_remaining = entry_ct / (kBytesPerVec / sizeof(int16_t));
+  uint64_t sum = 0;
+  while (1) {
+    univec_t acc_even;
+    univec_t acc_odd;
+    acc_even.vi = vul_setzero();
+    acc_odd.vi = vul_setzero();
+    const vul_t* uint16_vvec_stop;
+    if (full_vecs_remaining < max_loop_len) {
+      if (!full_vecs_remaining) {
+	const uint32_t trail_ct = entry_ct % (kBytesPerVec / sizeof(int16_t));
+	uint16_vec = (const uint16_t*)uint16_vvec_iter;
+	for (uint32_t uii = 0; uii < trail_ct; ++uii) {
+	  sum += uint16_vec[uii];
+	}
+	return sum;
+      }
+      uint16_vvec_stop = &(uint16_vvec_iter[full_vecs_remaining]);
+      full_vecs_remaining = 0;
+    } else {
+      uint16_vvec_stop = &(uint16_vvec_iter[max_loop_len]);
+      full_vecs_remaining -= max_loop_len;
+    }
+    do {
+      const vul_t cur_vec = *uint16_vvec_iter++;
+      acc_even.vi = acc_even.vi + (cur_vec & m16);
+      acc_odd.vi = acc_odd.vi + (vul_rshift(cur_vec, 16) & m16);
+    } while (uint16_vvec_iter < uint16_vvec_stop);
+    sum += univec_hsum_32bit(acc_even);
+    sum += univec_hsum_32bit(acc_odd);
+  }
+#else
+  uint64_t sum = 0;
+  for (uint32_t uii = 0; uii < entry_ct; ++uii) {
+    sum += uint16_vec[uii];
+  }
+  return sum;
+#endif
+}
+
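+// Worked bound for the flush interval above: each 32-bit accumulator lane
+// gains at most 65535 per vector iteration, so 2^32 / 65535 ~= 65538
+// iterations fit before a lane can overflow; (131072 / kInt32PerVec) - 1 is
+// comfortably below that for every supported vector width.  Scalar analogue
+// of the same flush-before-overflow pattern:
+static uint64_t uint16_sum_flushing_sketch(const uint16_t* vals, uint32_t entry_ct) {
+  uint64_t sum = 0;
+  uint32_t acc32 = 0;
+  uint32_t acc_used = 0;
+  for (uint32_t uii = 0; uii < entry_ct; ++uii) {
+    acc32 += vals[uii];
+    if (++acc_used == 65536) {
+      // 65536 * 65535 < 2^32, so the accumulator can't have overflowed yet
+      sum += acc32;
+      acc32 = 0;
+      acc_used = 0;
+    }
+  }
+  return sum + acc32;
+}
+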
+pglerr_t get_ref_nonref_genotype_counts_and_dosage16s(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, double* mach_r2_ptr, uint32_t* genocounts, uint64_t* all_dosages) {
+  // genocounts[0] := ref/ref, genocounts[1] := ref/altx,
+  // genocounts[2] := altx/alty, genocounts[3] := missing
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+  // to avoid LD cache thrashing, we either always keep a subsetted cache, or
+  // never do so.
+  // todo: can't take the shortcut in the multiallelic variant case
+  if ((!(pgrp->fi.gflags & (kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent))) || ((!(vrtype & 0x68)) && (!subsetting_required))) {
+    {
+      pglerr_t reterr = get_refalt1_genotype_counts(sample_include, sample_include_interleaved_vec, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, genocounts);
+      if (reterr) {
+	return reterr;
+      }
+    }
+  get_ref_nonref_genotype_counts_and_dosage16s_basic_finish:
+    all_dosages[0] = (genocounts[0] * 2 + genocounts[1]) * 16384LLU;
+    all_dosages[1] = (genocounts[2] * 2 + genocounts[1]) * 16384LLU;
+    if (!mach_r2_ptr) {
+      return kPglRetSuccess;
+    }
+    // yeah, it's sinful to implement mach-r2 here...
+    const uint32_t nm_sample_ct = sample_ct - genocounts[3];
+    double mach_r2 = 1.0;
+    if (nm_sample_ct) {
+      const uintptr_t dosage_sum = genocounts[2] * 2 + genocounts[1];
+      const int64_t dosage_ssq = (uint64_t)(dosage_sum + genocounts[2] * 2LLU);
+      const double dosage_sumd = dosage_sum;
+      const double dosage_avg = dosage_sumd / ((double)((int32_t)nm_sample_ct));
+      const double dosage_variance = dosage_ssq - dosage_sumd * dosage_avg;
+      mach_r2 = 2 * dosage_variance / (dosage_sumd * (2 - dosage_avg));
+    }
+    *mach_r2_ptr = mach_r2;
+    return kPglRetSuccess;
+  }
+  uintptr_t* tmp_genovec = pgrp->workspace_vec;
+  const unsigned char* fread_ptr;
+  const unsigned char* fread_end;
+  pglerr_t reterr = read_refalt1_genovec_subset_unsafe(nullptr, nullptr, raw_sample_ct, vidx, pgrp, &fread_ptr, &fread_end, tmp_genovec);
+  if (reterr) {
+    return reterr;
+  }
+  if (!subsetting_required) {
+    zero_trailing_quaters(raw_sample_ct, tmp_genovec);
+    genovec_count_freqs_unsafe(tmp_genovec, raw_sample_ct, genocounts);
+  } else {
+    genovec_count_subset_freqs(tmp_genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
+  }
+  if (!(vrtype & 0x68)) {
+    goto get_ref_nonref_genotype_counts_and_dosage16s_basic_finish;
+  }
+  if (vrtype & 8) {
+    // todo: multiallelic case
+    assert(0);
+    if (!(vrtype & 0x60)) {
+      return kPglRetSuccess;
+    }
+    // update raw_het_ct if hphase present
+  }
+  if (vrtype & 0x10) {
+    uint32_t raw_het_ct;
+    if (!subsetting_required) {
+      raw_het_ct = genocounts[1];
+    } else {
+      zero_trailing_quaters(raw_sample_ct, tmp_genovec);
+      raw_het_ct = genovec_count_01_unsafe(tmp_genovec, raw_sample_ct);
+    }
+    // skip phase info
+    // probably make this its own function...
+    // bugfix: need to use raw het ct, not subsetted
+    const uint32_t first_half_byte_ct = 1 + (raw_het_ct / CHAR_BIT);
+    const uint32_t explicit_phasepresent = fread_ptr[0] & 1;
+    if (explicit_phasepresent) {
+      // popcount_bytes(bitarr, byte_ct) counts the set bits in a byte array
+      const uint32_t raw_phasepresent_ct = (uint32_t)popcount_bytes(fread_ptr, first_half_byte_ct) - 1;
+      const uint32_t second_half_byte_ct = DIV_UP(raw_phasepresent_ct, CHAR_BIT);
+      fread_ptr = &(fread_ptr[first_half_byte_ct + second_half_byte_ct]);
+    } else {
+      fread_ptr = &(fread_ptr[first_half_byte_ct]);
+    }
+  }
+
+  // todo: phased dosage
+#ifndef NDEBUG
+  const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
+  const uint32_t alt_allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx] - 1) : 1;
+  assert(alt_allele_ct == 1);
+#endif
+  uint64_t alt1_dosage = 0;
+  uint64_t alt1_dosage_sq_sum = 0;
+  uint32_t dosage_ct = 0;
+  uint32_t replaced_genocounts[4];
+  if ((vrtype & 0x60) == 0x40) {
+    // unconditional dosage.  needs to be handled separately from the other
+    // cases due to possible presence of missing values.
+    // note that this code will also need to be adjusted when multiallelic
+    // support is added.
+    uint32_t sample_uidx = 0;
+#ifdef __arm__
+  #error "Unaligned accesses in get_ref_nonref_genotype_counts_and_dosage16s()."
+#endif
+    fill_uint_zero(4, replaced_genocounts);
+    const uint16_t* dosage_vals = (const uint16_t*)fread_ptr;
+    if (subsetting_required) {
+      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+	next_set_unsafe_ck(sample_include, &sample_uidx);
+	const uintptr_t cur_dosage_val = dosage_vals[sample_uidx];
+	if (cur_dosage_val != 65535) {
+	  alt1_dosage += cur_dosage_val;
+
+	  // todo: check if this is slow enough to justify removing it from the
+	  // main loop
+	  alt1_dosage_sq_sum += cur_dosage_val * cur_dosage_val;
+
+	  ++dosage_ct;
+	  const uint32_t hardcall_code = GET_QUATERARR_ENTRY(tmp_genovec, sample_uidx);
+	  replaced_genocounts[hardcall_code] += 1;
+	}
+      }
+    } else {
+      for (; sample_uidx < sample_ct; ++sample_uidx) {
+	const uintptr_t cur_dosage_val = dosage_vals[sample_uidx];
+	if (cur_dosage_val != 65535) {
+	  alt1_dosage += cur_dosage_val;
+	  alt1_dosage_sq_sum += cur_dosage_val * cur_dosage_val;
+	  ++dosage_ct;
+	  const uint32_t hardcall_code = GET_QUATERARR_ENTRY(tmp_genovec, sample_uidx);
+	  replaced_genocounts[hardcall_code] += 1;
+	}
+      }
+    }
+  } else {
+    uintptr_t* raw_dosage_present = pgrp->workspace_dosage_present;
+    uint32_t raw_dosage_ct;
+    if (!(vrtype & 0x40)) {
+      // dosage list
+      if (parse_and_save_deltalist_as_bitarr(fread_end, raw_sample_ct, &fread_ptr, raw_dosage_present, &raw_dosage_ct)) {
+	return kPglRetMalformedInput;
+      }
+    } else {
+      // dosage bitarray
+      const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+      raw_dosage_present[raw_sample_ctl - 1] = 0;
+      const uint32_t raw_sample_ctb = DIV_UP(raw_sample_ct, CHAR_BIT);
+      memcpy(raw_dosage_present, fread_ptr, raw_sample_ctb);
+      fread_ptr = &(fread_ptr[raw_sample_ctb]);
+      raw_dosage_ct = (uint32_t)popcount_longs(raw_dosage_present, raw_sample_ctl);
+    }
+    const uint16_t* dosage_vals_iter = (const uint16_t*)fread_ptr;
+    uint32_t sample_uidx = 0;
+    if (subsetting_required) {
+      for (uint32_t dosage_idx = 0; dosage_idx < raw_dosage_ct; ++dosage_idx, ++sample_uidx) {
+	next_set_unsafe_ck(raw_dosage_present, &sample_uidx);
+	if (IS_SET(sample_include, sample_uidx)) {
+	  const uintptr_t cur_dosage_val = dosage_vals_iter[dosage_idx];
+	  alt1_dosage += cur_dosage_val;
+	  alt1_dosage_sq_sum += cur_dosage_val * cur_dosage_val;
+	  ++dosage_ct;
+	}
+      }
+      genoarr_count_subset_intersect_freqs(tmp_genovec, raw_dosage_present, sample_include, raw_sample_ct, replaced_genocounts);
+    } else {
+      if (!mach_r2_ptr) {
+	for (uint32_t dosage_idx = 0; dosage_idx < raw_dosage_ct; ++dosage_idx) {
+	  alt1_dosage += dosage_vals_iter[dosage_idx];
+	}
+      } else {
+	for (uint32_t dosage_idx = 0; dosage_idx < raw_dosage_ct; ++dosage_idx) {
+	  const uintptr_t cur_dosage_val = dosage_vals_iter[dosage_idx];
+	  alt1_dosage += cur_dosage_val;
+	  alt1_dosage_sq_sum += cur_dosage_val * cur_dosage_val;
+	}
+      }
+      dosage_ct = raw_dosage_ct;
+      genoarr_count_subset_freqs2(tmp_genovec, raw_dosage_present, raw_sample_ct, raw_dosage_ct, replaced_genocounts);
+    }
+  }
+  const uint32_t replaced_ct = replaced_genocounts[0] + replaced_genocounts[1] + replaced_genocounts[2];
+  const uint32_t remaining_het_ct = genocounts[1] - replaced_genocounts[1];
+  const uint32_t remaining_hom_alt_ct = genocounts[2] - replaced_genocounts[2];
+  const uint32_t alt1_ct = 2 * remaining_hom_alt_ct + remaining_het_ct;
+  alt1_dosage += alt1_ct * 16384LLU;
+  all_dosages[1] = alt1_dosage;
+  const uint32_t nondosage_nm_ct = sample_ct - genocounts[3] - replaced_ct;
+  const uint32_t new_sample_nm_ct = dosage_ct + nondosage_nm_ct;
+  all_dosages[0] = new_sample_nm_ct * 32768LLU - alt1_dosage;
+  if (!mach_r2_ptr) {
+    return kPglRetSuccess;
+  }
+  double mach_r2 = 1.0;
+  if (new_sample_nm_ct) {
+    // each remaining het contributes 16384^2 = 0x10000000 to the sum of
+    // squares; each remaining hom-alt contributes 32768^2 = 0x40000000
+    alt1_dosage_sq_sum += remaining_het_ct * 0x10000000LLU + remaining_hom_alt_ct * 0x40000000LLU;
+    const double dosage_sumd = (int64_t)alt1_dosage;
+    const double dosage_avg = dosage_sumd / ((double)((int32_t)new_sample_nm_ct));
+    const double dosage_variance = ((int64_t)alt1_dosage_sq_sum) - dosage_sumd * dosage_avg;
+    mach_r2 = 2 * dosage_variance / (dosage_sumd * (32768 - dosage_avg));
+  }
+  *mach_r2_ptr = mach_r2;
+  return kPglRetSuccess;
+}
+
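+// The mach-r2 computed above reduces to (observed dosage variance) /
+// (expected variance 2p(1-p) under Hardy-Weinberg), with p the alt allele
+// frequency on a 0..2 dosage scale.  Double-precision restatement of the
+// same formula (the zero-sum guard is added for the sketch; the production
+// code handles that case differently):
+static double mach_r2_sketch(const double* dosages, uint32_t nm_sample_ct) {
+  double sum = 0.0;
+  double ssq = 0.0;
+  for (uint32_t uii = 0; uii < nm_sample_ct; ++uii) {
+    sum += dosages[uii];
+    ssq += dosages[uii] * dosages[uii];
+  }
+  if ((!nm_sample_ct) || (sum == 0.0)) {
+    return 1.0;
+  }
+  const double avg = sum / ((double)((int32_t)nm_sample_ct));
+  // n * Var = ssq - sum * avg; denominator sum * (2 - avg) = 2n * 2p(1-p)
+  return 2 * (ssq - sum * avg) / (sum * (2 - avg));
+}
+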
+pglerr_t pgr_get_ref_nonref_genotype_counts_and_dosage16s(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, double* mach_r2_ptr, uint32_t* genocounts, uint64_t* all_dosages) {
+  assert(vidx < pgrp->fi.raw_variant_ct);
+  if (!sample_ct) {
+    fill_uint_zero(4, genocounts);
+    const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
+    const uint32_t cur_allele_ct = allele_idx_offsets? ((uint32_t)(allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx])) : 2;
+    fill_ull_zero(cur_allele_ct, all_dosages);
+    if (mach_r2_ptr) {
+      *mach_r2_ptr = 1.0;
+    }
+    return kPglRetSuccess;
+  }
+  return get_ref_nonref_genotype_counts_and_dosage16s(sample_include, sample_include_interleaved_vec, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, mach_r2_ptr, genocounts, all_dosages);
+}
+
+pglerr_t pgr_read_refalt1_genovec_hphase_dosage16_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* phasepresent_ct_ptr, uintptr_t* __restrict dosage_present, uint16_t* dosage_vals, uint32_t* dosage_ct_ptr, uint32_t* is_explicit_alt1_ptr) {
+  assert(vidx < pgrp->fi.raw_variant_ct);
+  if (!sample_ct) {
+    *phasepresent_ct_ptr = 0;
+    *dosage_ct_ptr = 0;
+    return kPglRetSuccess;
+  }
+  const unsigned char* fread_ptr = nullptr;
+  const unsigned char* fread_end = nullptr;
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  const uint32_t dosage_is_present = vrtype_dosage(vrtype);
+  pglerr_t reterr = read_refalt1_genovec_hphase_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, dosage_is_present? (&fread_ptr) : nullptr, dosage_is_present? (&fread_end) : nullptr, genovec, phasepresent, phaseinfo, phasepresent_ct_ptr);
+  if (reterr || (!dosage_is_present)) {
+    *dosage_ct_ptr = 0;
+    return reterr;
+  }
+  const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
+  const uint32_t alt_allele_ct = allele_idx_offsets? ((uint32_t)(allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx] - 1)) : 1;
+  *is_explicit_alt1_ptr = (alt_allele_ct > 1);
+  return parse_dosage16(fread_ptr, fread_end, sample_include, sample_ct, vidx, alt_allele_ct, pgrp, dosage_ct_ptr, dosage_present, dosage_vals);
+}
+
+pglerr_t pgr_read_raw(uint32_t vidx, pgen_global_flags_t read_gflags, pgen_reader_t* pgrp, uintptr_t** loadbuf_iter_ptr, unsigned char* loaded_vrtype_ptr) {
+  // currently handles hardcall phase and unphased dosage
+  // todo: multiallelic variants, phased dosage
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  uintptr_t* genovec = (*loadbuf_iter_ptr);
+  uintptr_t* loadbuf_iter = &(genovec[QUATERCT_TO_ALIGNED_WORDCT(raw_sample_ct)]);
+  const uint32_t hphase_is_present = (vrtype / 0x10) & 1;
+  const uint32_t save_hphase = hphase_is_present && (read_gflags & kfPgenGlobalHardcallPhasePresent);
+  const uint32_t dosage_is_present = (vrtype & 0x60)? 1 : 0;
+  const uint32_t save_dosage = dosage_is_present && (read_gflags & kfPgenGlobalDosagePresent);
+  if (loaded_vrtype_ptr) {
+    *loaded_vrtype_ptr = save_hphase * 0x10 + save_dosage * 0x60;
+  }
+  if (!(save_hphase || save_dosage)) {
+    // don't bother updating ldbase_all_hets, too much of a performance
+    // penalty, and too likely that we won't need it
+    *loadbuf_iter_ptr = loadbuf_iter;
+    return read_refalt1_genovec_subset_unsafe(nullptr, nullptr, raw_sample_ct, vidx, pgrp, nullptr, nullptr, genovec);
+  }
+
+  // todo: main multiallelic track goes here
+  // (i) nonmissing bitarray
+  // (ii) appropriate-width allele codes
+
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  const uint32_t multiallelic_relevant = vrtype_multiallelic(vrtype);
+  const unsigned char* fread_ptr;
+  const unsigned char* fread_end;
+  uintptr_t* all_hets = hphase_is_present? pgrp->workspace_all_hets : nullptr;
+  if (vrtype_ld_compressed(vrtype)) {
+    // ldbase_all_hets not needed in this case
+    pglerr_t reterr = ld_load_genovec_subset_if_necessary(nullptr, nullptr, raw_sample_ct, vidx, pgrp);
+    if (reterr) {
+      return reterr;
+    }
+    if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+      return kPglRetReadFail;
+    }
+    copy_quaterarr(pgrp->ldbase_genovec, raw_sample_ct, genovec);
+
+    reterr = parse_and_apply_difflist(fread_end, multiallelic_relevant, &fread_ptr, pgrp, genovec);
+    if (reterr) {
+      return reterr;
+    }
+    if (all_hets) {
+      pgr_detect_genovec_hets(genovec, raw_sample_ct, all_hets);
+    }
+    if ((vrtype & 7) == 3) {
+      genovec_invert_unsafe(raw_sample_ct, genovec);
+    }
+  } else {
+    if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+      return kPglRetReadFail;
+    }
+    pglerr_t reterr = parse_non_ld_genovec_hphase_subset(fread_end, nullptr, nullptr, raw_sample_ct, vrtype, multiallelic_relevant, &fread_ptr, pgrp, genovec, all_hets);
+    if (reterr) {
+      return reterr;
+    }
+    const uint32_t is_ldbase = pgrp->fi.vrtypes && vrtype_ld_compressed(pgrp->fi.vrtypes[vidx + 1]);
+    if (is_ldbase) {
+      copy_quaterarr(genovec, raw_sample_ct, pgrp->ldbase_genovec);
+      pgrp->ldbase_vidx = vidx;
+      pgrp->ldbase_stypes = kfPgrLdcacheQuater;
+    }
+  }
+  if (multiallelic_relevant) {
+    // todo
+    return kPglRetNotYetSupported;
+  }
+  if (all_hets) {
+    const uint32_t het_ct = (uint32_t)popcount_longs(all_hets, raw_sample_ctl);
+    if (!het_ct) {
+      // there shouldn't be a hphase track at all in this case
+      return kPglRetMalformedInput;
+    }
+    const uint32_t het_ctdl = het_ct / kBitsPerWord;
+    uintptr_t* phaseraw = loadbuf_iter;
+    const uint32_t first_half_byte_ct = 1 + (het_ct / CHAR_BIT);
+    if (save_hphase) {
+      // this needs to be synced with phaseraw_word_ct in make_pgen_thread()
+      loadbuf_iter = &(loadbuf_iter[kWordsPerVec + round_down_pow2(raw_sample_ct / kBitsPerWordD2, kWordsPerVec)]);
+      phaseraw[het_ctdl] = 0;
+
+      memcpy(phaseraw, fread_ptr, first_half_byte_ct);
+    }
+    const uint32_t explicit_phasepresent = fread_ptr[0] & 1;
+    fread_ptr = &(fread_ptr[first_half_byte_ct]);
+    if (explicit_phasepresent) {
+      const uint32_t raw_phasepresent_ct = (uint32_t)popcount_longs(phaseraw, het_ctdl + 1) - 1;
+      if (!raw_phasepresent_ct) {
+	// there shouldn't be a hphase track at all in this case, either
+	return kPglRetMalformedInput;
+      }
+      const uint32_t second_half_byte_ct = DIV_UP(raw_phasepresent_ct, CHAR_BIT);
+      if (save_hphase) {
+	// put this in a phasepresent-independent location, to make things more
+	// convenient for the caller
+	memcpy(&(phaseraw[1 + (raw_sample_ct / kBitsPerWord)]), fread_ptr, second_half_byte_ct);
+      }
+      fread_ptr = &(fread_ptr[second_half_byte_ct]);
+    }
+  }
+  if (!save_dosage) {
+    *loadbuf_iter_ptr = loadbuf_iter;
+    return kPglRetSuccess;
+  }
+  uintptr_t* dosage_present = loadbuf_iter;
+  loadbuf_iter = &(loadbuf_iter[BITCT_TO_ALIGNED_WORDCT(raw_sample_ct)]);
+  uint16_t* dosage_vals = (uint16_t*)loadbuf_iter;
+  const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
+  const uint32_t alt_allele_ct = allele_idx_offsets? ((uint32_t)(allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx] - 1)) : 1;
+  *loadbuf_iter_ptr = &(loadbuf_iter[kWordsPerVec * DIV_UP(raw_sample_ct, (kBytesPerVec / sizeof(int16_t)))]);
+  return parse_dosage16(fread_ptr, fread_end, nullptr, raw_sample_ct, vidx, alt_allele_ct, pgrp, nullptr, dosage_present, dosage_vals);
+}
+
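+// Per-variant buffer layout for the raw loader above, in words, mirroring
+// its pointer arithmetic:
+//   genovec:        QUATERCT_TO_ALIGNED_WORDCT(raw_sample_ct)
+//   phaseraw:       kWordsPerVec + round_down_pow2(raw_sample_ct / kBitsPerWordD2, kWordsPerVec), iff hphase is saved
+//   dosage_present: BITCT_TO_ALIGNED_WORDCT(raw_sample_ct), iff dosage is saved
+//   dosage_vals:    kWordsPerVec * DIV_UP(raw_sample_ct, kBytesPerVec / sizeof(int16_t)), iff dosage is saved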
+
+// tried more custom code here; it turned out not to be worth it
+pglerr_t read_missingness(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict missingness, uintptr_t* __restrict hets, uintptr_t* __restrict genovec_buf) {
+  const unsigned char* fread_ptr;
+  const unsigned char* fread_end;
+  pglerr_t reterr = read_refalt1_genovec_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, &fread_ptr, &fread_end, genovec_buf);
+  zero_trailing_quaters(sample_ct, genovec_buf);
+  genovec_to_missingness_unsafe(genovec_buf, sample_ct, missingness);
+  if (hets) {
+    pgr_detect_genovec_hets_unsafe(genovec_buf, QUATERCT_TO_WORDCT(sample_ct), hets);
+  }
+  if (fread_pp) {
+    *fread_pp = fread_ptr;
+    *fread_endp = fread_end;
+  }
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  const uint32_t is_multiallelic = vrtype_multiallelic(vrtype);
+  if (reterr || (!is_multiallelic)) {
+    return reterr;
+  }
+  // todo: multiallelic case
+  assert(0);
+  return kPglRetSuccess;
+}
+
+pglerr_t pgr_read_missingness(const uintptr_t* __restrict sample_include, const uint32_t* sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict missingness, uintptr_t* __restrict genovec_buf) {
+  // may as well add a hets parameter?
+  assert(vidx < pgrp->fi.raw_variant_ct);
+  if (!sample_ct) {
+    return kPglRetSuccess;
+  }
+  return read_missingness(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, missingness, nullptr, genovec_buf);
+}
+
+/*
+pglerr_t pgr_read_missingness_dosage(const uintptr_t* __restrict sample_include, const uint32_t* sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict missingness, uintptr_t* __restrict genovec_buf) {
+  assert(vidx < pgrp->fi.raw_variant_ct);
+  if (!sample_ct) {
+    return kPglRetSuccess;
+  }
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  const uint32_t dosage_is_present = vrtype_dosage(vrtype);
+  const uint32_t need_to_skip_hphase = dosage_is_present && vrtype_hphase(vrtype);
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+  const unsigned char* fread_ptr = nullptr;
+  const unsigned char* fread_end = nullptr;
+  if (!need_to_skip_hphase) {
+    pglerr_t reterr = read_missingness(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, dosage_is_present? (&fread_ptr) : nullptr, dosage_is_present? (&fread_end) : nullptr, missingness, genovec_buf);
+    if (reterr || (!dosage_is_present)) {
+      return reterr;
+    }
+  } else {
+    uint32_t dummy;
+    pglerr_t reterr = read_refalt1_genovec_hphase_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, &fread_ptr, &fread_end, genovec_buf, nullptr, nullptr, &dummy);
+    if (reterr) {
+      return reterr;
+    }
+    zero_trailing_quaters(sample_ct, genovec_buf);
+    genovec_to_missingness_unsafe(genovec_buf, sample_ct, missingness);
+  }
+  // now perform bitwise andnot with dosage_present
+  if ((vrtype & 0x60) == 0x40) {
+    // unconditional dosage.  spot-check the appropriate entries for equality
+    // to 65535.
+#ifdef __arm__
+  #error "Unaligned accesses in pgr_read_missingness_dosage()."
+#endif
+    const uint16_t* dosage_vals = (const uint16_t*)fread_ptr;
+    uint32_t sample_uidx = 0;
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_unsafe_ck(sample_include, &sample_uidx);
+      if (!IS_SET(missingness, sample_idx)) {
+	continue;
+      }
+      if (dosage_vals[sample_uidx] != 65535) {
+	CLEAR_BIT(sample_idx, missingness);
+      }
+    }
+    return kPglRetSuccess;
+  }
+  uintptr_t* dosage_present = pgrp->workspace_dosage_present;
+  if ((vrtype & 0x60) == 0x20) {
+    // dosage list
+    uint32_t dummy;
+    if (parse_and_save_deltalist_as_bitarr(fread_end, raw_sample_ct, &fread_ptr, dosage_present, &dummy)) {
+      return kPglRetMalformedInput;
+    }
+  } else {
+    // dosage bitarray
+    dosage_present[raw_sample_ctl - 1] = 0;
+    const uint32_t raw_sample_ctb = DIV_UP(raw_sample_ct, CHAR_BIT);
+    memcpy(dosage_present, fread_ptr, raw_sample_ctb);
+  }
+  if (subsetting_required) {
+    copy_bitarr_subset(dosage_present, sample_include, sample_ct, pgrp->workspace_vec);
+    dosage_present = pgrp->workspace_vec;
+  }
+  bitvec_andnot(dosage_present, BITCT_TO_WORDCT(sample_ct), missingness);
+  return kPglRetSuccess;
+}
+*/
+
+pglerr_t pgr_read_missingness_multi(const uintptr_t* __restrict sample_include, const uint32_t* sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict missingness_hc, uintptr_t* __restrict missingness_dosage, uintptr_t* __restrict hets, uintptr_t* __restrict genovec_buf) {
+  // either missingness_hc or missingness_dosage must be non-null
+  assert(vidx < pgrp->fi.raw_variant_ct);
+  if (!sample_ct) {
+    return kPglRetSuccess;
+  }
+  const uint32_t vrtype = get_pgfi_vrtype(&(pgrp->fi), vidx);
+  const uint32_t dosage_is_relevant = missingness_dosage && vrtype_dosage(vrtype);
+  const uint32_t need_to_skip_hphase = dosage_is_relevant && vrtype_hphase(vrtype);
+  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+  const unsigned char* fread_ptr = nullptr;
+  const unsigned char* fread_end = nullptr;
+  uintptr_t* missingness_base = missingness_hc? missingness_hc : missingness_dosage;
+  if (!need_to_skip_hphase) {
+    pglerr_t reterr = read_missingness(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, dosage_is_relevant? (&fread_ptr) : nullptr, dosage_is_relevant? (&fread_end) : nullptr, missingness_base, hets, genovec_buf);
+    if (missingness_dosage && missingness_hc) {
+      memcpy(missingness_dosage, missingness_hc, BITCT_TO_WORDCT(sample_ct) * sizeof(intptr_t));
+    }
+    if (reterr || (!dosage_is_relevant)) {
+      return reterr;
+    }
+  } else {
+    uint32_t dummy;
+    // will need to switch to a different function when multiallelic variants
+    // are implemented.
+    pglerr_t reterr = read_refalt1_genovec_hphase_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, &fread_ptr, &fread_end, genovec_buf, nullptr, nullptr, &dummy);
+    if (reterr) {
+      return reterr;
+    }
+    zero_trailing_quaters(sample_ct, genovec_buf);
+    genovec_to_missingness_unsafe(genovec_buf, sample_ct, missingness_base);
+    if (hets) {
+      pgr_detect_genovec_hets_unsafe(genovec_buf, QUATERCT_TO_WORDCT(sample_ct), hets);
+    }
+    if (missingness_hc) {
+      memcpy(missingness_dosage, missingness_hc, BITCT_TO_WORDCT(sample_ct) * sizeof(intptr_t));
+    }
+  }
+  // now perform bitwise andnot with dosage_present
+  if ((vrtype & 0x60) == 0x40) {
+    // unconditional dosage.  spot-check the appropriate entries for equality
+    // to 65535.
+#ifdef __arm__
+  #error "Unaligned accesses in pgr_read_missingness_multi()."
+#endif
+    const uint16_t* dosage_vals = (const uint16_t*)fread_ptr;
+    uint32_t sample_uidx = 0;
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_unsafe_ck(sample_include, &sample_uidx);
+      if (!IS_SET(missingness_dosage, sample_idx)) {
+	continue;
+      }
+      if (dosage_vals[sample_uidx] != 65535) {
+	CLEAR_BIT(sample_idx, missingness_dosage);
+      }
+    }
+    return kPglRetSuccess;
+  }
+  uintptr_t* dosage_present = pgrp->workspace_dosage_present;
+  if ((vrtype & 0x60) == 0x20) {
+    // dosage list
+    uint32_t dummy;
+    if (parse_and_save_deltalist_as_bitarr(fread_end, raw_sample_ct, &fread_ptr, dosage_present, &dummy)) {
+      return kPglRetMalformedInput;
+    }
+  } else {
+    // dosage bitarray
+    dosage_present[raw_sample_ctl - 1] = 0;
+    const uint32_t raw_sample_ctb = DIV_UP(raw_sample_ct, CHAR_BIT);
+    memcpy(dosage_present, fread_ptr, raw_sample_ctb);
+  }
+  if (subsetting_required) {
+    copy_bitarr_subset(dosage_present, sample_include, sample_ct, pgrp->workspace_vec);
+    dosage_present = pgrp->workspace_vec;
+  }
+  bitvec_andnot(dosage_present, BITCT_TO_WORDCT(sample_ct), missingness_dosage);
+  return kPglRetSuccess;
+}
+
+static inline boolerr_t validate_vint31(const unsigned char* buf_end, const unsigned char** bufpp, uint32_t* val_ptr) {
+  if (buf_end <= (*bufpp)) {
+    return 1;
+  }
+  uint32_t vint32 = *((*bufpp)++);
+  if (vint32 <= 127) {
+    *val_ptr = vint32;
+    return 0;
+  }
+  vint32 &= 127;
+  for (uint32_t shift = 7; shift < 28; shift += 7) {
+    if (buf_end == (*bufpp)) {
+      return 1;
+    }
+    uint32_t uii = *((*bufpp)++);
+    vint32 |= (uii & 127) << shift;
+    if (uii <= 127) {
+      *val_ptr = vint32;
+      return 0;
+    }
+  }
+  if (buf_end == (*bufpp)) {
+    return 1;
+  }
+  uint32_t uii = *((*bufpp)++);
+  if (uii > 7) {
+    return 1;
+  }
+  vint32 |= uii << 28;
+  *val_ptr = vint32;
+  return 0;
+}
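+
+// vint31 encoding sketch (matches validate_vint31() above): values are stored
+// little-endian in 7-bit groups, with the high bit of each byte acting as a
+// continuation flag; a 5th byte, when present, may only contribute 3 more
+// bits.  Worked example: 300 = 0b10'0101100 is stored as 0xac 0x02, since
+// 0xac == (300 & 127) | 128 and 0x02 == 300 >> 7.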
+
+boolerr_t validate_difflist_header(const unsigned char* fread_end, uint32_t sample_ct, const unsigned char** fread_pp, uintptr_t* raregeno_buf, const unsigned char** difflist_group_info_ptr, uint32_t* difflist_len_ptr) {
+  // can be used for deltalists: pass raregeno_buf == nullptr.
+  if (validate_vint31(fread_end, fread_pp, difflist_len_ptr)) {
+    // todo: ensure fread_pp points to a problematic byte whenever a validate_
+    // function returns an error, so the error message can provide an accurate
+    // byte offset.
+    return 1;
+  }
+  const uint32_t difflist_len = *difflist_len_ptr;
+  *difflist_group_info_ptr = *fread_pp;
+  if (!difflist_len) {
+    return 0;
+  }
+  if (difflist_len > sample_ct / kPglMaxDifflistLenDivisor) {
+    return 1;
+  }
+  const uint32_t group_ct = DIV_UP(difflist_len, kPglDifflistGroupSize);
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(sample_ct);
+  const uint32_t difflist_index_byte_ct = group_ct * (sample_id_byte_ct + 1) - 1;
+  if ((uintptr_t)(fread_end - (*fread_pp)) < difflist_index_byte_ct) {
+    return 1;
+  }
+  *fread_pp += difflist_index_byte_ct;
+  if (!raregeno_buf) {
+    return 0;
+  }
+  const uint32_t raregeno_byte_ct = QUATERCT_TO_BYTECT(difflist_len);
+  if ((uintptr_t)(fread_end - (*fread_pp)) < raregeno_byte_ct) {
+    return 1;
+  }
+  const unsigned char* raregeno_end = &((*fread_pp)[raregeno_byte_ct]);
+  memcpy(raregeno_buf, *fread_pp, raregeno_byte_ct);
+  *fread_pp = raregeno_end;
+  const uint32_t difflist_len_mod4 = difflist_len % 4;
+  if (difflist_len_mod4) {
+    const uint32_t last_raregeno_byte = (uint32_t)((*fread_pp)[-1]);
+    if (last_raregeno_byte >> (2 * difflist_len_mod4)) {
+      return 1;
+    }
+  }
+  return 0;
+}
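+
+// Difflist/deltalist on-disk layout implied by the header validation above
+// (informal summary):
+//   1. vint31: difflist_len
+//   2. group_ct = DIV_UP(difflist_len, kPglDifflistGroupSize) starting sample
+//      IDs, sample_id_byte_ct bytes each
+//   3. group_ct - 1 per-group byte counts (stored value + 63 = actual group
+//      size; see the "+ 63" adjustments in the callers below)
+//   4. difflist only: QUATERCT_TO_BYTECT(difflist_len) bytes of 2-bit
+//      raregeno values
+//   5. vint31 sample-ID deltas for the non-first entries of each group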
+
+boolerr_t validate_and_apply_difflist(const unsigned char* fread_end, uint32_t common2_code, uint32_t multiallelic_relevant, const unsigned char** fread_pp, pgen_reader_t* pgrp, uintptr_t* __restrict genovec) {
+  // Side effects: uses pgr.workspace_raregeno_tmp_loadbuf.
+  // Similar to parse_and_apply_difflist(), but with exhaustive input
+  // validation.
+  // If multiallelic_relevant is true, a list of sample indices with freshly
+  // loaded raregeno value 0b11 is saved to pgr.workspace_ambig_sample_ids, and
+  // pgr.workspace_ambig_id_ct is set to the length of the list.
+  const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
+  uintptr_t* cur_raregeno_iter = pgrp->workspace_raregeno_tmp_loadbuf;
+  const unsigned char* group_info_iter;
+  uint32_t difflist_len;
+  if (validate_difflist_header(fread_end, sample_ct, fread_pp, cur_raregeno_iter, &group_info_iter, &difflist_len)) {
+    return 1;
+  }
+  if (!difflist_len) {
+    return 0;
+  }
+  const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
+  if (common2_code) {
+    // 1-bit format + list of exceptions.  In this case,
+    //   (i) the length of the exception list must be < (sample_ct / 16)
+    //   (ii) every raregeno entry must either be one of the two rare genotype
+    //        values, or involve a rare alt allele.
+    if (difflist_len >= (sample_ct / (2 * kPglMaxDifflistLenDivisor))) {
+      return 1;
+    }
+    const uintptr_t common_code_delta = common2_code & 3;
+    const uintptr_t inv_common_word1 = (3 - common2_code / 4) * kMask5555;
+    const uintptr_t inv_common_word2 = inv_common_word1 - (common_code_delta * kMask5555);
+    uint32_t subgroup_idx = 0;
+    while (1) {
+      uintptr_t cur_raregeno_word = cur_raregeno_iter[subgroup_idx];
+      uintptr_t match1 = cur_raregeno_word ^ inv_common_word1;
+      match1 = match1 & (match1 >> 1) & kMask5555;
+      uintptr_t match2 = cur_raregeno_word ^ inv_common_word2;
+      match2 = match2 & (match2 >> 1) & kMask5555;
+      if (subgroup_idx == subgroup_idx_last) {
+	// ignore trailing bits
+	const uint32_t lshift = (((uint32_t)(-difflist_len)) % kBitsPerWordD2) * 2;
+	if ((match1 << lshift) || (match2 << lshift)) {
+	  return 1;
+	}
+	break;
+      }
+      if (match1 || match2) {
+	// todo: if (multiallelic_relevant && (!inv_common_word2)), record
+	// might be fine; but we need to verify these are actually rare alt
+	// alleles.
+	return 1;
+      }
+      ++subgroup_idx;
+    }
+  }
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(sample_ct);
+  uint32_t* ambig_sample_ids = multiallelic_relevant? pgrp->workspace_ambig_sample_ids : nullptr;
+  const unsigned char* group_byte_cts_iter = &(group_info_iter[DIV_UP(difflist_len, kPglDifflistGroupSize) * sample_id_byte_ct]);
+  const unsigned char* prev_group_start = *fread_pp;
+  
+  uintptr_t sample_idx = 0;
+  uint32_t ambig_id_ct = 0;
+  uint32_t subgroup_idx = 0;
+  while (1) {
+    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
+    if (subgroup_idx >= subgroup_idx_last) {
+      if (subgroup_idx > subgroup_idx_last) {
+	pgrp->workspace_ambig_id_ct = ambig_id_ct;
+	return 0;
+      }
+      remaining_deltas_in_subgroup &= difflist_len - 1;
+    }
+    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
+      uintptr_t new_sample_idx_start = 0;
+      memcpy(&new_sample_idx_start, group_info_iter, sample_id_byte_ct);
+      if (subgroup_idx) {
+	if (sample_idx >= new_sample_idx_start) {
+	  return 1;
+	}
+	const uint32_t group_byte_ct = ((uint32_t)(*group_byte_cts_iter++)) + 63;
+	if ((uintptr_t)((*fread_pp) - prev_group_start) != group_byte_ct) {
+	  return 1;
+	}
+	prev_group_start = *fread_pp;
+      }
+      sample_idx = new_sample_idx_start;
+      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    } else {
+      uint32_t sample_idx_incr;
+      if (validate_vint31(fread_end, fread_pp, &sample_idx_incr) || (!sample_idx_incr)) {
+	return 1;
+      }
+      sample_idx += sample_idx_incr;
+    }
+    ++subgroup_idx;
+    uintptr_t cur_raregeno_word = *cur_raregeno_iter++;
+    while (1) {
+      if (sample_idx >= sample_ct) {
+	return 1;
+      }
+      const uintptr_t cur_geno = cur_raregeno_word & 3;
+      ASSIGN_QUATERARR_ENTRY(sample_idx, cur_geno, genovec);
+      if (multiallelic_relevant && (cur_geno == 3)) {
+	ambig_sample_ids[ambig_id_ct++] = (uint32_t)sample_idx;
+      }
+      if (!remaining_deltas_in_subgroup) {
+	break;
+      }
+      uint32_t sample_idx_incr;
+      if (validate_vint31(fread_end, fread_pp, &sample_idx_incr) || (!sample_idx_incr)) {
+	return 1;
+      }
+      sample_idx += sample_idx_incr;
+      --remaining_deltas_in_subgroup;
+      cur_raregeno_word >>= 2;
+    }
+  }
+}
+
+boolerr_t validate_onebit(const unsigned char* fread_end, uint32_t difflist_ambig_ids_needed, const unsigned char** fread_pp, pgen_reader_t* pgrp, uintptr_t* __restrict genovec) {
+  // parse_onebit_unsafe() with exhaustive input validation.
+  const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t common2_and_bitarray_byte_ct = (sample_ct + 15) / CHAR_BIT;
+  if ((uintptr_t)(fread_end - (*fread_pp)) < common2_and_bitarray_byte_ct) {
+    return 1;
+  }
+  const unsigned char* fread_difflist_start = &((*fread_pp)[common2_and_bitarray_byte_ct]);
+  const uintptr_t common2_code = *((*fread_pp)++);
+  const uintptr_t common_code_delta = common2_code & 3;
+  uintptr_t word_base = common2_code / 4;
+  if ((!common_code_delta) || (word_base + common_code_delta > 3)) {
+    return 1;
+  }
+  word_base *= kMask5555;
+  const uint32_t genovec_widx_trail = (sample_ct + 7) / kBitsPerWordD2;
+  const uint32_t genovec_widx_end = QUATERCT_TO_WORDCT(sample_ct);
+  uint32_t genovec_widx = 0;
+#ifdef __arm__
+  #error "Unaligned accesses in validate_onebit()."
+#endif
+  const halfword_t* fread_alias = (const halfword_t*)(*fread_pp);
+  while (1) {
+    uintptr_t ww;
+    if (genovec_widx >= genovec_widx_trail) {
+      if (genovec_widx == genovec_widx_end) {
+	break;
+      }
+      ww = 0;
+      const uint32_t nontrail_byte_ct = ((sample_ct - 1) % kBitsPerWordD2) / CHAR_BIT;
+      memcpy(&ww, &(fread_alias[genovec_widx_trail]), 1 + nontrail_byte_ct);
+      const uint32_t sample_ct_mod8 = sample_ct % 8;
+      if (sample_ct_mod8) {
+	if (ww >> (nontrail_byte_ct * 8 + sample_ct_mod8)) {
+	  return 1;
+	}
+      }
+    } else {
+      ww = (uintptr_t)(fread_alias[genovec_widx]);
+    }
+    ww = unpack_halfword_to_word(ww);
+    genovec[genovec_widx++] = word_base + ww * common_code_delta;
+  }
+  *fread_pp = fread_difflist_start;
+  return validate_and_apply_difflist(fread_end, (uint32_t)common2_code, difflist_ambig_ids_needed, fread_pp, pgrp, genovec);
+}
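+
+// One-bit representation sketch (what validate_onebit() accepts): the leading
+// byte packs the two common genotype values as
+//   common2_code = 4 * geno_low + (geno_high - geno_low),
+// the next DIV_UP(sample_ct, CHAR_BIT) bytes select geno_low (bit == 0) or
+// geno_high (bit == 1) per sample, and a difflist of explicit exceptions
+// follows.  E.g. common genotypes 0 and 2 give common2_code == 2, so each
+// bitarray bit b decodes to genotype 2b before exceptions are applied.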
+
+// assumes that we aren't dealing with the trivial fixed-width case.
+// saves main genotype array to workspace_vec.  does not zero out trailing
+// bits.
+boolerr_t validate_geno(const unsigned char* fread_end, uint32_t vidx, pgen_reader_t* pgrp, const unsigned char** fread_pp, char* errstr_buf) {
+  const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
+  const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t multiallelic_relevant = vrtype_multiallelic(vrtype);
+  uintptr_t* genovec = pgrp->workspace_vec;
+  if (vrtype_ld_compressed(vrtype)) {
+    copy_quaterarr(pgrp->ldbase_genovec, sample_ct, genovec);
+    if (validate_and_apply_difflist(fread_end, 0, multiallelic_relevant, fread_pp, pgrp, genovec)) {
+      sprintf(errstr_buf, "Error: Invalid LD difflist for (0-based) variant #%u.\n", vidx);
+      return 1;
+    }
+    if (vrtype & 1) {
+      // do we actually need this?
+      genovec_invert_unsafe(sample_ct, genovec);
+    }
+    return 0;
+  }
+  const uint32_t is_ldbase = vrtype_ld_compressed(pgrp->fi.vrtypes[vidx + 1]);
+  if (!vrtype_difflist(vrtype)) {
+    if (vrtype & 1) {
+      if (validate_onebit(fread_end, multiallelic_relevant, fread_pp, pgrp, genovec)) {
+	sprintf(errstr_buf, "Error: Invalid 1-bit genotype record for (0-based) variant #%u.\n", vidx);
+	return 1;
+      }
+    } else {
+      const uint32_t genovec_byte_ct = DIV_UP(sample_ct, 4);
+      if ((uintptr_t)(fread_end - (*fread_pp)) < genovec_byte_ct) {
+	sprintf(errstr_buf, "Error: Invalid 2-bit genotype record for (0-based) variant #%u.\n", vidx);
+	return 1;
+      }
+      memcpy(genovec, *fread_pp, genovec_byte_ct);
+      *fread_pp += genovec_byte_ct;
+      const uint32_t sample_ct_mod4 = sample_ct % 4;
+      if (sample_ct_mod4) {
+	const uint32_t last_geno_byte = (*fread_pp)[-1];
+	if (last_geno_byte >> (2 * sample_ct_mod4)) {
+	  sprintf(errstr_buf, "Error: Last genotype byte for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
+	  return 1;
+	}
+      }
+      if (vrtype_multiallelic(vrtype)) {
+	extract_genoarr_ambig_ids(genovec, sample_ct, pgrp->workspace_ambig_sample_ids, &(pgrp->workspace_ambig_id_ct));
+      }
+    }
+  } else {
+    const uint32_t vrtype_low2 = vrtype & 3;
+    const uint32_t vec_ct = QUATERCT_TO_VECCT(sample_ct);
+    memset(genovec, vrtype_low2 * 0x55, vec_ct * kBytesPerVec);
+    if (validate_and_apply_difflist(fread_end, 0, multiallelic_relevant, fread_pp, pgrp, genovec)) {
+      sprintf(errstr_buf, "Error: Invalid genotype difflist for (0-based) variant #%u.\n", vidx);
+      return 1;
+    }
+  }
+  if (is_ldbase) {
+    copy_quaterarr(genovec, sample_ct, pgrp->ldbase_genovec);
+  }
+  return 0;
+}
+
+boolerr_t validate_hphase(const unsigned char* fread_end, uint32_t vidx, pgen_reader_t* pgrp, const unsigned char** fread_pp, char* errstr_buf) {
+  const uintptr_t* all_hets = pgrp->workspace_all_hets;
+  const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+  const uint32_t het_ct = (uint32_t)popcount_longs(all_hets, sample_ctl);
+  if (!het_ct) {
+    sprintf(errstr_buf, "Error: Hardcall phase track present for (0-based) variant #%u, but there were no heterozygous calls.\n", vidx);
+    return 1;
+  }
+  const uint32_t aux2_first_part_byte_ct = 1 + (het_ct / CHAR_BIT);
+  const unsigned char* aux2_first_part = *fread_pp;
+  if ((uintptr_t)(fread_end - (*fread_pp)) < aux2_first_part_byte_ct) {
+    sprintf(errstr_buf, "Error: Invalid hardcall phase track present for (0-based) variant #%u.\n", vidx);
+    return 1;
+  }
+  *fread_pp += aux2_first_part_byte_ct;
+  const uint32_t het_ct_p1_mod8 = (het_ct + 1) % CHAR_BIT;
+  if (het_ct_p1_mod8) {
+    // verify trailing bits are zero
+    if ((*fread_pp)[-1] >> het_ct_p1_mod8) {
+      sprintf(errstr_buf, "Error: Hardcall phase track for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
+      return 1;
+    }
+  }
+  if (!((*aux2_first_part) & 1)) {
+    // phase always present, "first part" is only part
+    return 0;
+  }
+  const uint32_t phasepresent_ct = (uint32_t)popcount_bytes(aux2_first_part, aux2_first_part_byte_ct) - 1;
+  if (!phasepresent_ct) {
+    sprintf(errstr_buf, "Error: Hardcall phase track for (0-based) variant #%u does not have any actual phase information.\n", vidx);
+    return 1;
+  }
+  const uint32_t phaseinfo_byte_ct = DIV_UP(phasepresent_ct, CHAR_BIT);
+  if ((uintptr_t)(fread_end - (*fread_pp)) < phaseinfo_byte_ct) {
+    sprintf(errstr_buf, "Error: Invalid hardcall phase track present for (0-based) variant #%u.\n", vidx);
+    return 1;
+  }
+  *fread_pp += phaseinfo_byte_ct;
+  const uint32_t phasepresent_ct_mod8 = phasepresent_ct % 8;
+  if (phasepresent_ct_mod8) {
+    if ((*fread_pp)[-1] >> phasepresent_ct_mod8) {
+      sprintf(errstr_buf, "Error: Hardcall phase track for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
+      return 1;
+    }
+  }
+  return 0;
+}
+
+boolerr_t validate_and_count_deltalist(const unsigned char* fread_end, uint32_t sample_ct, const unsigned char** fread_pp, uint32_t* deltalist_len_ptr) {
+  // we only need to know the number of entries in the list, not the actual bit
+  // positions for now
+  // (if we do need the bit positions, copy
+  // parse_and_save_deltalist_as_bitarr().)
+  const unsigned char* group_info_iter;
+  if (validate_difflist_header(fread_end, sample_ct, fread_pp, nullptr, &group_info_iter, deltalist_len_ptr)) {
+    return 1;
+  }
+  const uint32_t deltalist_len = *deltalist_len_ptr;
+  if (!deltalist_len) {
+    return 0;
+  }
+  // not an appropriate error, since this is just a tuning parameter for the
+  // compressor; readers are expected to handle at least
+  // (sample_ct / kPglMaxDifflistLenDivisor) entries.
+  // if (deltalist_len > sample_ct / kPglMaxDeltalistLenDivisor) {
+  //   return 1;
+  // }
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(sample_ct);
+  const uint32_t group_idx_last = (deltalist_len - 1) / kPglDifflistGroupSize;
+  const unsigned char* group_byte_cts_iter = &(group_info_iter[DIV_UP(deltalist_len, kPglDifflistGroupSize) * sample_id_byte_ct]);
+  const unsigned char* prev_group_start = *fread_pp;
+  uint32_t group_len_m1 = kPglDifflistGroupSize - 1;
+  uint32_t group_idx = 0;
+  uintptr_t sample_idx = 0;
+  while (1) {
+    if (group_idx >= group_idx_last) {
+      if (group_idx > group_idx_last) {
+	return 0;
+      }
+      group_len_m1 &= deltalist_len - 1;
+    }
+    uintptr_t new_sample_idx = 0;
+    memcpy(&new_sample_idx, group_info_iter, sample_id_byte_ct);
+    if (group_idx) {
+      if (sample_idx >= new_sample_idx) {
+	return 1;
+      }
+      const uint32_t group_byte_ct = ((uint32_t)(*group_byte_cts_iter++)) + 63;
+      if ((uintptr_t)((*fread_pp) - prev_group_start) != group_byte_ct) {
+	return 1;
+      }
+      prev_group_start = *fread_pp;
+    }
+    sample_idx = new_sample_idx;
+    group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+    ++group_idx;
+    uint32_t deltalist_idx_lowbits = 0;
+    while (1) {
+      if (sample_idx >= sample_ct) {
+	return 1;
+      }
+      if (deltalist_idx_lowbits == group_len_m1) {
+	break;
+      }
+      ++deltalist_idx_lowbits;
+      uint32_t sample_idx_incr;
+      if (validate_vint31(fread_end, fread_pp, &sample_idx_incr) || (!sample_idx_incr)) {
+	return 1;
+      }
+      sample_idx += sample_idx_incr;
+    }
+  }
+}
+
+pglerr_t validate_dosage16(const unsigned char* fread_end, uint32_t vidx, pgen_reader_t* pgrp, const unsigned char** fread_pp, char* errstr_buf) {
+  // similar to parse_dosage16().  doesn't support multiallelic data yet.
+  const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
+  if (vrtype & 0x80) {
+    // this should be trivial, just multiply array lengths by 2...
+    strcpy(errstr_buf, "Error: Phased dosage validation is not implemented yet.\n");
+    return kPglRetNotYetSupported;
+  }
+  const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
+  if ((vrtype & 0x60) == 0x40) {
+    // unconditional dosage.  handle separately from other two cases since
+    // 65535 is valid.
+#ifdef __arm__
+  #error "Unaligned accesses in validate_dosage16()."
+#endif
+    if ((uintptr_t)(fread_end - (*fread_pp)) < sample_ct * sizeof(int16_t)) {
+      sprintf(errstr_buf, "Error: Invalid unconditional dosage track for (0-based) variant #%u.\n", vidx);
+      return kPglRetMalformedInput;
+    }
+    const uint16_t* dosage_vals_read_iter = (const uint16_t*)(*fread_pp);
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+      uint16_t cur_dosage_val_p1 = *dosage_vals_read_iter++;
+      cur_dosage_val_p1 += 1; // intentional overflow on 65535
+      if (cur_dosage_val_p1 > 32769) {
+	sprintf(errstr_buf, "Error: Invalid unconditional dosage track for (0-based) variant #%u (dosage is greater than 2).\n", vidx);
+	return kPglRetMalformedInput;
+      }
+    }
+    *fread_pp += sample_ct * sizeof(int16_t);
+    return kPglRetSuccess;
+  }
+  uint32_t dosage_ct;
+  if ((vrtype & 0x60) == 0x20) {
+    // dosage list
+    if (validate_and_count_deltalist(fread_end, sample_ct, fread_pp, &dosage_ct)) {
+      sprintf(errstr_buf, "Error: Invalid dosage list for (0-based) variant #%u.\n", vidx);
+      return kPglRetMalformedInput;
+    }
+  } else {
+    const uint32_t sample_ctb = DIV_UP(sample_ct, CHAR_BIT);
+    if ((uintptr_t)(fread_end - (*fread_pp)) < sample_ctb) {
+      sprintf(errstr_buf, "Error: Invalid dosage subset for (0-based) variant #%u.\n", vidx);
+      return kPglRetMalformedInput;
+    }
+    dosage_ct = (uint32_t)popcount_bytes(*fread_pp, sample_ctb);
+    *fread_pp += sample_ctb;
+    const uint32_t sample_ct_mod8 = sample_ct % 8;
+    if (sample_ct_mod8) {
+      if ((*fread_pp)[-1] >> sample_ct_mod8) {
+	sprintf(errstr_buf, "Error: Dosage subset bitarray for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
+	return kPglRetMalformedInput;
+      }
+    }
+  }
+  if ((uintptr_t)(fread_end - (*fread_pp)) < dosage_ct * sizeof(int16_t)) {
+    sprintf(errstr_buf, "Error: Invalid dosage track for (0-based) variant #%u.\n", vidx);
+    return kPglRetMalformedInput;
+  }
+  const uint16_t* dosage_vals_read_iter = (const uint16_t*)(*fread_pp);
+  for (uint32_t dosage_idx = 0; dosage_idx < dosage_ct; ++dosage_idx) {
+    if ((*dosage_vals_read_iter++) > 32768) {
+      sprintf(errstr_buf, "Error: Invalid dosage track for (0-based) variant #%u (dosage is greater than 2).\n", vidx);
+      return kPglRetMalformedInput;
+    }
+  }
+  *fread_pp += dosage_ct * sizeof(int16_t);
+  return kPglRetSuccess;
+}
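+
+// Dosage scale sketch, per the bounds checked above: one uint16 per entry,
+// 0..32768 maps linearly to dosage 0..2 (so 16384 corresponds to dosage 1.0),
+// and 65535 marks a missing value in the unconditional-dosage track.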
+
+static_assert(kPglVblockSize == 65536, "pgr_validate() needs to have an error message updated.");
+pglerr_t pgr_validate(pgen_reader_t* pgrp, char* errstr_buf) {
+  // Performs all validation which isn't done by pgfi_init_phase{1,2}() and
+  // pgr_init().
+  const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
+  const uint32_t variant_ct = pgrp->fi.raw_variant_ct;
+  const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
+  const uint32_t const_vrtype = pgrp->fi.const_vrtype;
+  if (const_vrtype != 0xffffffffU) {
+    if (allele_idx_offsets && (allele_idx_offsets[variant_ct] != 2 * variant_ct)) {
+      sprintf(errstr_buf, "Error: .pvar file contains multiallelic variant(s), but .%s file does not.\n", (const_vrtype == kPglVrtypePlink1)? "bed" : "pgen");
+      return kPglRetInconsistentInput;
+    }
+    // const uintptr_t const_vrec_width = pgrp->fi.const_vrec_width;
+    if ((!const_vrtype) || (const_vrtype == kPglVrtypePlink1)) {
+      // only thing that can go wrong is nonzero trailing bits
+      const uint32_t dbl_sample_ct_mod4 = 2 * (sample_ct % 4);
+      if (!dbl_sample_ct_mod4) {
+	return kPglRetSuccess;
+      }
+      for (uint32_t vidx = 0; vidx < variant_ct; ++vidx) {
+	const unsigned char* fread_ptr;
+	const unsigned char* fread_end = nullptr;
+	if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+	  strcpy(errstr_buf, "Error: File read failure.\n");
+	  return kPglRetReadFail;
+	}
+	const uint32_t last_byte_in_record = fread_end[-1];
+	if (last_byte_in_record >> dbl_sample_ct_mod4) {
+	  sprintf(errstr_buf, "Error: Last byte of (0-based) variant #%u has nonzero trailing bits.\n", vidx);
+	  return kPglRetMalformedInput;
+	}
+      }
+      return kPglRetSuccess;
+    }
+    // todo: 16-bit dosage entries can't be in [32769,65534]
+    strcpy(errstr_buf, "Error: Validation of fixed-width dosage formats is not implemented yet.\n");
+    return kPglRetNotYetSupported;
+  }
+  const unsigned char* vrtypes = pgrp->fi.vrtypes;
+  for (uint32_t vidx = 0; vidx < variant_ct; vidx += kPglVblockSize) {
+    if (vrtype_ld_compressed(vrtypes[vidx])) {
+      sprintf(errstr_buf, "Error: (0-based) variant #%u is LD-compressed; this is prohibited when the variant index is a multiple of 65536.\n", vidx);
+      return kPglRetMalformedInput;
+    }
+  }
+  // file size may not be validated yet.
+  uint64_t fsize;
+  FILE* ff = pgrp->ff;
+#ifndef NO_MMAP
+  if (ff == nullptr) {
+    // mmap case
+    fsize = pgrp->fi.file_size;
+  } else {
+#endif
+    if (fseeko(ff, 0, SEEK_END)) {
+      strcpy(errstr_buf, "Error: File read failure.\n");
+      return kPglRetReadFail;
+    }
+    fsize = ftello(ff);
+    pgrp->fp_vidx = 1; // force fseek when loading first variant
+#ifndef NO_MMAP
+  }
+#endif
+  // todo: modify this check when phase sets are implemented
+  const uint64_t expected_fsize = pgrp->fi.var_fpos[variant_ct];
+  if (expected_fsize != fsize) {
+    sprintf(errstr_buf, "Error: .pgen header indicates that file size should be %" PRIu64 " bytes, but actual file size is %" PRIu64 " bytes.\n", expected_fsize, fsize);
+    return kPglRetMalformedInput;
+  }
+  const uint32_t vblock_ct = DIV_UP(variant_ct, kPglVblockSize);
+  uint32_t header_ctrl = 0;
+#ifndef NO_MMAP
+  if (ff == nullptr) {
+  #ifdef __arm__
+    #error "Unaligned accesses in pgr_validate()."
+  #endif
+    memcpy(&header_ctrl, &(pgrp->fi.block_base[11]), 1);
+    // validate the random-access index.
+    const uint64_t* fpos_index = (const uint64_t*)(&(pgrp->fi.block_base[12]));
+    for (uint32_t vblock_idx = 0; vblock_idx < vblock_ct; ++vblock_idx) {
+      if (fpos_index[vblock_idx] != pgrp->fi.var_fpos[vblock_idx * kPglVblockSize]) {
+	strcpy(errstr_buf, "Error: .pgen header vblock-start index is inconsistent with variant record length index.\n");
+	return kPglRetMalformedInput;
+      }
+    }
+  } else {
+#endif
+    if (fseeko(ff, 11, SEEK_SET)) {
+      strcpy(errstr_buf, "Error: File read failure.\n");
+      return kPglRetReadFail;
+    }
+    header_ctrl = getc_unlocked(ff);  // EOF (-1) wraps above 255 in the check below
+    if (header_ctrl > 255) {
+      strcpy(errstr_buf, "Error: File read failure.\n");
+      return kPglRetReadFail;
+    }
+    for (uint32_t vblock_idx = 0; vblock_idx < vblock_ct; ++vblock_idx) {
+      uint64_t vblock_start_fpos;
+      if (!fread(&vblock_start_fpos, sizeof(int64_t), 1, ff)) {
+	return kPglRetReadFail;
+      }
+      if (vblock_start_fpos != pgrp->fi.var_fpos[vblock_idx * kPglVblockSize]) {
+	strcpy(errstr_buf, "Error: .pgen header vblock-start index is inconsistent with variant record length index.\n");
+	return kPglRetMalformedInput;
+      }
+    }
+#ifndef NO_MMAP
+  }
+#endif
+  const uint32_t vrtype_and_fpos_storage = header_ctrl & 15;
+  const uint32_t alt_allele_ct_byte_ct = (header_ctrl >> 4) & 3;
+  const uint32_t nonref_flags_stored = ((header_ctrl >> 6) == 3);
+
+  // does not include vrtypes yet
+  uint64_t vblock_index_byte_ct = kPglVblockSize * (1 + (vrtype_and_fpos_storage & 3) + alt_allele_ct_byte_ct);
+  if (nonref_flags_stored) {
+    vblock_index_byte_ct += kPglVblockSize / CHAR_BIT;
+  }
+  uint64_t last_vrtype_byte_offset = 0;
+  uint32_t trailing_shift = 4;
+  if (vrtype_and_fpos_storage & 8) {
+    vblock_index_byte_ct += kPglVblockSize >> (10 - vrtype_and_fpos_storage);
+    if (vrtype_and_fpos_storage == 8) {
+      const uint32_t variant_ct_mod4 = variant_ct % 4;
+      if (variant_ct_mod4) {
+	last_vrtype_byte_offset = 20 + (vblock_ct - 1) * (vblock_index_byte_ct + sizeof(int64_t));
+	trailing_shift = variant_ct_mod4 * 2;
+      }
+    } else {
+      assert(vrtype_and_fpos_storage == 9);
+      if (variant_ct % 2) {
+	last_vrtype_byte_offset = 20 + (vblock_ct - 1) * (vblock_index_byte_ct + sizeof(int64_t));
+      }
+    }
+  } else if (!(vrtype_and_fpos_storage & 4)) {
+    vblock_index_byte_ct += kPglVblockSize / 2;
+    if (variant_ct % 2) {
+      last_vrtype_byte_offset = 20 + (vblock_ct - 1) * (vblock_index_byte_ct + sizeof(int64_t));
+    }
+    /*
+  } else {
+    vblock_index_byte_ct += kPglVblockSize;
+    */
+  }
+  if (last_vrtype_byte_offset) {
+    uint32_t last_vrtype_byte = 0;
+#ifndef NO_MMAP
+    if (ff == nullptr) {
+      memcpy(&last_vrtype_byte, &(pgrp->fi.block_base[last_vrtype_byte_offset]), 1);
+    } else {
+#endif
+      if (fseeko(ff, last_vrtype_byte_offset, SEEK_SET)) {
+        strcpy(errstr_buf, "Error: File read failure.\n");
+	return kPglRetReadFail;
+      }
+      last_vrtype_byte = getc_unlocked(ff);  // EOF wraps above 255
+      if (last_vrtype_byte > 255) {
+        strcpy(errstr_buf, "Error: File read failure.\n");
+	return kPglRetReadFail;
+      }
+#ifndef NO_MMAP
+    }
+#endif
+    if (last_vrtype_byte >> trailing_shift) {
+      strcpy(errstr_buf, "Error: Nonzero trailing bits in last vrtype index byte.\n");
+      return kPglRetMalformedInput;
+    }
+  }
+  const uintptr_t* nonref_flags = pgrp->fi.nonref_flags;
+  if (nonref_flags) {
+    const uint32_t variant_ct_modl = variant_ct % kBitsPerWord;
+    if (variant_ct % CHAR_BIT) {
+      if (nonref_flags[variant_ct / kBitsPerWord] >> variant_ct_modl) {
+	strcpy(errstr_buf, "Error: Nonzero trailing bits in last nonref_flags byte.\n");
+	return kPglRetMalformedInput;
+      }
+    }
+  }
+  
+  // could move most of this into plink2_common and make it multithreaded, if
+  // speed is ever an issue.
+  for (uint32_t vidx = 0; vidx < variant_ct; ++vidx) {
+    const unsigned char* fread_ptr;
+    const unsigned char* fread_end;
+    if (init_read_ptrs(vidx, pgrp, &fread_ptr, &fread_end)) {
+      strcpy(errstr_buf, "Error: File read failure.\n");
+      return kPglRetReadFail;
+    }
+    if (validate_geno(fread_end, vidx, pgrp, &fread_ptr, errstr_buf)) {
+      return kPglRetMalformedInput;
+    }
+    const uint32_t vrtype = vrtypes[vidx];
+    if (vrtype_hphase(vrtype)) {
+      pgr_detect_genovec_hets(pgrp->workspace_vec, sample_ct, pgrp->workspace_all_hets);
+    }
+    if (vrtype_multiallelic(vrtype)) {
+      // todo
+      strcpy(errstr_buf, "Error: Validation of multiallelic data track is not implemented yet.\n");
+      return kPglRetNotYetSupported;
+    }
+    // don't need pgrp->workspace_vec to store main genotypes past this point.
+    if (vrtype_hphase(vrtype)) {
+      if (validate_hphase(fread_end, vidx, pgrp, &fread_ptr, errstr_buf)) {
+	return kPglRetMalformedInput;
+      }
+    }
+    if (vrtype & 0xe0) {
+      if ((vrtype & 0xe0) == 0x80) {
+	sprintf(errstr_buf, "Error: Invalid record type for (0-based) variant #%u (phased dosage bit set, but main dosage bits unset).\n", vidx);
+	return kPglRetMalformedInput;
+      }
+      pglerr_t reterr = validate_dosage16(fread_end, vidx, pgrp, &fread_ptr, errstr_buf);
+      if (reterr) {
+	return reterr;
+      }
+    }
+    if (fread_ptr != fread_end) {
+      // possible todo: tolerate this at the end of a vblock.
+      assert(fread_ptr < fread_end);
+      sprintf(errstr_buf, "Error: Extra byte(s) in (0-based) variant record #%u.\n", vidx);
+      return kPglRetMalformedInput;
+    }
+  }
+  return kPglRetSuccess;
+}
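+
+// Minimal pgr_validate() usage sketch (the errstr_buf sizing here is just an
+// assumption; the messages above are all short):
+//   char errstr_buf[256];
+//   pglerr_t reterr = pgr_validate(pgrp, errstr_buf);
+//   if (reterr != kPglRetSuccess) {
+//     fputs(errstr_buf, stderr);
+//   }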
+
+
+boolerr_t pgfi_cleanup(pgen_file_info_t* pgfip) {
+  // memory is the responsibility of the caller
+  if (pgfip->shared_ff) {
+    if (fclose_null(&pgfip->shared_ff)) {
+      return 1;
+    }
+#ifndef NO_MMAP
+  } else if (pgfip->block_base != nullptr) {
+    // const_cast
+    munmap((unsigned char*)((uintptr_t)pgfip->block_base), pgfip->file_size);
+#endif
+  }
+  return 0;
+}
+
+boolerr_t pgr_cleanup(pgen_reader_t* pgrp) {
+  // assume file is open if pgr.ff is not null
+  // memory is the responsibility of the caller for now
+  if (!pgrp->ff) {
+    return 0;
+  }
+  return fclose_null(&(pgrp->ff));
+}
+
+
+// ***** end pgen_reader_t, begin {st,mt}_pgen_writer_t *****
+
+
+void spgw_preinit(st_pgen_writer_t* spgwp) {
+  spgwp->pgen_outfile = nullptr;
+}
+
+pglerr_t pwc_init_phase1(const char* __restrict fname, const uintptr_t* __restrict allele_idx_offsets, uintptr_t* explicit_nonref_flags, uint32_t variant_ct, uint32_t sample_ct, pgen_global_flags_t phase_dosage_gflags, uint32_t nonref_flags_storage, uint32_t vrec_len_byte_ct, pgen_writer_common_t* pwcp, FILE** pgen_outfile_ptr) {
+  pwcp->allele_idx_offsets = allele_idx_offsets;
+  pwcp->explicit_nonref_flags = nullptr;
+  if (nonref_flags_storage == 3) {
+    if (!explicit_nonref_flags) {
+      return kPglRetImproperFunctionCall;
+    }
+    pwcp->explicit_nonref_flags = explicit_nonref_flags;
+  }
+  pwcp->variant_ct = variant_ct;
+  pwcp->sample_ct = sample_ct;
+  pwcp->phase_dosage_gflags = phase_dosage_gflags;
+#ifndef NDEBUG
+  pwcp->vblock_fpos = nullptr;
+  pwcp->vrec_len_buf = nullptr;
+  pwcp->vrtype_buf = nullptr;
+  pwcp->fwrite_buf = nullptr;
+  pwcp->fwrite_bufp = nullptr;
+  pwcp->genovec_invert_buf = nullptr;
+  pwcp->ldbase_genovec = nullptr;
+  pwcp->ldbase_raregeno = nullptr;
+  pwcp->ldbase_difflist_sample_ids = nullptr;
+#endif
+  pwcp->vidx = 0;
+
+  FILE* pgen_outfile = fopen(fname, FOPEN_WB);
+  *pgen_outfile_ptr = pgen_outfile;
+  if (!pgen_outfile) {
+    return kPglRetOpenFail;
+  }
+  fwrite("l\x1b\x10", 3, 1, pgen_outfile);
+  fwrite(&(pwcp->variant_ct), sizeof(int32_t), 1, pgen_outfile);
+  fwrite(&(pwcp->sample_ct), sizeof(int32_t), 1, pgen_outfile);
+  
+  const unsigned char control_byte = (vrec_len_byte_ct - 1) + (4 * (phase_dosage_gflags != 0)) + (nonref_flags_storage << 6);
+  pwcp->vrec_len_byte_ct = vrec_len_byte_ct;
+  fwrite(&control_byte, 1, 1, pgen_outfile);
+  const uint32_t vblock_ct = DIV_UP(variant_ct, kPglVblockSize);
+  uintptr_t header_bytes_left = vblock_ct * sizeof(int64_t) + variant_ct * vrec_len_byte_ct;
+  if (phase_dosage_gflags) {
+    // 8-bit vrtypes
+    header_bytes_left += variant_ct;
+  } else {
+    // 4-bit vrtypes
+    header_bytes_left += DIV_UP(variant_ct, 2);
+  }
+  if (nonref_flags_storage == 3) {
+    header_bytes_left += DIV_UP(variant_ct, CHAR_BIT);
+  }
+  
+  // this should be the position of the first variant
+  pwcp->vblock_fpos_offset = 12 + header_bytes_left;
+  
+  uintptr_t zeroed_cachelines_needed = DIV_UP(header_bytes_left, kCacheline);
+  if (zeroed_cachelines_needed > (kPglFwriteBlockSize / kCacheline)) {
+    zeroed_cachelines_needed = kPglFwriteBlockSize / kCacheline;
+  }
+  // could wait until fwrite_buf is allocated, and make sure it's aligned?
+  unsigned char zerobuf[kPglFwriteBlockSize];
+  memset(zerobuf, 0, zeroed_cachelines_needed * kCacheline);
+  while (header_bytes_left > kPglFwriteBlockSize) {
+    fwrite(zerobuf, kPglFwriteBlockSize, 1, pgen_outfile);
+    header_bytes_left -= kPglFwriteBlockSize;
+  }
+  if (fwrite_checked(zerobuf, header_bytes_left, pgen_outfile)) {
+    return kPglRetWriteFail;
+  }
+  return kPglRetSuccess;
+}
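+
+// Header bytes written by pwc_init_phase1() above (informal summary):
+//   [0..2]   magic number "l\x1b\x10"
+//   [3..6]   variant_ct (uint32)
+//   [7..10]  sample_ct (uint32)
+//   [11]     control byte: bits 0-1 = vrec_len_byte_ct - 1, bit 2 set iff
+//            8-bit vrtypes are used (any phase/dosage gflags), bits 6-7 =
+//            nonref_flags_storage
+//   [12...]  zero-filled vblock fpos index, vrec_len_buf, vrtype_buf, and
+//            (when nonref_flags_storage == 3) explicit nonref_flags;
+//            presumably backfilled when writing completes.  The first variant
+//            record then starts at vblock_fpos_offset.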
+
+uint32_t count_spgw_alloc_cachelines_required(uint32_t variant_ct, uint32_t sample_ct, pgen_global_flags_t phase_dosage_gflags, uint32_t max_vrec_len) {
+  // vblock_fpos
+  const uint32_t vblock_ct = DIV_UP(variant_ct, kPglVblockSize);
+  uint32_t cachelines_required = INT64CT_TO_CLCT(vblock_ct);
+
+  // vrec_len_buf
+  // overlapping uint32_t writes used, so (variant_ct * vrec_len_byte_ct) might
+  // not be enough
+  const uintptr_t vrec_len_byte_ct = bytes_to_represent_ui(max_vrec_len);
+  cachelines_required += DIV_UP((variant_ct - 1) * vrec_len_byte_ct + sizeof(int32_t), kCacheline);
+
+  // vrtype_buf
+  if (phase_dosage_gflags) {
+    cachelines_required += DIV_UP(variant_ct, kCacheline);
+  } else {
+    cachelines_required += DIV_UP(variant_ct, kCacheline * 2);
+  }
+  
+  // genovec_invert_buf, ldbase_genovec
+  cachelines_required += 2 * QUATERCT_TO_CLCT(sample_ct);
+
+  const uint32_t max_difflist_len = 2 * (sample_ct / kPglMaxDifflistLenDivisor);
+  // ldbase_raregeno
+  cachelines_required += QUATERCT_TO_CLCT(max_difflist_len);
+
+  // ldbase_difflist_sample_ids
+  cachelines_required += 1 + (max_difflist_len / kInt32PerCacheline);
+
+  // fwrite_buf
+  cachelines_required += DIV_UP(max_vrec_len + (kPglFwriteBlockSize - k1LU), kCacheline);
+  if (phase_dosage_gflags & kfPgenGlobalHardcallPhasePresent) {
+    // phasepresent, phaseinfo
+    cachelines_required += 2 * BITCT_TO_CLCT(sample_ct);
+  }
+  // possible todo: dosage (doesn't currently need an allocation, but that's
+  // unlikely to remain true--e.g. get_ref_nonref_genotype_counts_and_dosages
+  // tends to use workspace_vec when a function it calls doesn't use it...)
+  return cachelines_required;
+}
+
+static_assert(kPglMaxAltAlleleCt == 254, "Need to update spgw_init_phase1().");
+pglerr_t spgw_init_phase1(const char* __restrict fname, const uintptr_t* __restrict allele_idx_offsets, uintptr_t* __restrict explicit_nonref_flags, uint32_t variant_ct, uint32_t sample_ct, pgen_global_flags_t phase_dosage_gflags, uint32_t nonref_flags_storage, st_pgen_writer_t* spgwp, uintptr_t* alloc_cacheline_ct_ptr, uint32_t* max_vrec_len_ptr) {
+  assert(variant_ct);
+  assert(sample_ct);
+  
+  // separate from mpgw_init_phase1's version of this computation since the
+  // latter wants a better bound on the compressed size of an entire vblock
+  // than max_vrec_len * kPglVblockSize...
+  uint64_t max_vrec_len = QUATERCT_TO_BYTECT(sample_ct);
+  uintptr_t max_alt_ct_p1 = 2;
+  if (allele_idx_offsets && (allele_idx_offsets[variant_ct] != 2 * variant_ct)) {
+    assert(allele_idx_offsets[0] == 0);
+    assert(allele_idx_offsets[variant_ct] > 2 * variant_ct);
+    // could add this as a parameter, since caller should know...
+    max_alt_ct_p1 = 3;
+    uintptr_t prev_offset = 0;
+    for (uint32_t vidx = 1; vidx <= variant_ct; ++vidx) {
+      const uintptr_t cur_offset = allele_idx_offsets[vidx];
+      if (cur_offset - prev_offset > max_alt_ct_p1) {
+	max_alt_ct_p1 = cur_offset - prev_offset;
+      }
+      prev_offset = cur_offset;
+    }
+    // nonmissingness array
+    max_vrec_len += DIV_UP(sample_ct, CHAR_BIT) + get_aux1_allele_bytect((uint32_t)max_alt_ct_p1 - 1, sample_ct);
+    // try to permit uncompressed records to be larger than this, only error
+    // out when trying to write a larger compressed record?  (might not be
+    // worth it.)
+  }
+  if (phase_dosage_gflags & kfPgenGlobalHardcallPhasePresent) {
+    max_vrec_len += 2 * DIV_UP(sample_ct, CHAR_BIT);
+  }
+  if (phase_dosage_gflags & kfPgenGlobalDosagePresent) {
+    const uint32_t dosage_phase_gflag = (phase_dosage_gflags / kfPgenGlobalDosagePhasePresent) & 1;
+    // aux3, aux4
+    max_vrec_len += (1 + dosage_phase_gflag) * DIV_UP(sample_ct, 8);
+    // aux5
+    max_vrec_len += (2 + 2 * dosage_phase_gflag) * ((uint64_t)sample_ct) * (max_alt_ct_p1 - 1);
+
+  }
+  if (max_vrec_len >= kPglMaxBytesPerVariant) {
+#ifdef __LP64__
+    max_vrec_len = kPglMaxBytesPerVariant;
+#else
+    return kPglRetNomem;
+#endif
+  }
+  *max_vrec_len_ptr = (uint32_t)max_vrec_len;
+  const uintptr_t vrec_len_byte_ct = bytes_to_represent_ui((uint32_t)max_vrec_len);
+
+  pglerr_t reterr = pwc_init_phase1(fname, allele_idx_offsets, explicit_nonref_flags, variant_ct, sample_ct, phase_dosage_gflags, nonref_flags_storage, (uint32_t)vrec_len_byte_ct, &(spgwp->pwc), &(spgwp->pgen_outfile));
+  if (!reterr) {
+    *alloc_cacheline_ct_ptr = count_spgw_alloc_cachelines_required(variant_ct, sample_ct, phase_dosage_gflags, (uint32_t)max_vrec_len);
+  }
+  return reterr;
+}
+
+static_assert(kPglMaxAltAlleleCt == 254, "Need to update mpgw_init_phase1().");
+void mpgw_init_phase1(const uintptr_t* __restrict allele_idx_offsets, uint32_t variant_ct, uint32_t sample_ct, pgen_global_flags_t phase_dosage_gflags, uintptr_t* alloc_base_cacheline_ct_ptr, uint64_t* alloc_per_thread_cacheline_ct_ptr, uint32_t* vrec_len_byte_ct_ptr, uint64_t* vblock_cacheline_ct_ptr) {
+  assert(variant_ct);
+  assert(sample_ct);
+  // vblock_fpos
+  const uint32_t vblock_ct = DIV_UP(variant_ct, kPglVblockSize);
+  uint32_t alloc_base_cacheline_ct = INT64CT_TO_CLCT(vblock_ct);
+
+  // vrtype_buf
+  if (phase_dosage_gflags) {
+    alloc_base_cacheline_ct += DIV_UP(variant_ct, kCacheline);
+  } else {
+    alloc_base_cacheline_ct += DIV_UP(variant_ct, kCacheline * 2);
+  }
+
+  // pwcs
+  uint64_t alloc_per_thread_cacheline_ct = DIV_UP(sizeof(pgen_writer_common_t), kCacheline);
+  
+  // genovec_invert_buf, ldbase_genovec
+  alloc_per_thread_cacheline_ct += 2 * QUATERCT_TO_CLCT(sample_ct);
+
+  const uint32_t max_difflist_len = 2 * (sample_ct / kPglMaxDifflistLenDivisor);
+  // ldbase_raregeno
+  alloc_per_thread_cacheline_ct += QUATERCT_TO_CLCT(max_difflist_len);
+
+  // ldbase_difflist_sample_ids
+  alloc_per_thread_cacheline_ct += 1 + (max_difflist_len / kInt32PerCacheline);
+
+  uint64_t max_vrec_len = QUATERCT_TO_BYTECT(sample_ct);
+  if (phase_dosage_gflags & kfPgenGlobalHardcallPhasePresent) {
+    max_vrec_len += 2 * DIV_UP(sample_ct, CHAR_BIT);
+  }
+  const uint32_t dosage_gflag = (phase_dosage_gflags / kfPgenGlobalDosagePresent) & 1;
+  const uint32_t dosage_phase_gflag = (phase_dosage_gflags / kfPgenGlobalDosagePhasePresent) & 1;
+  if (dosage_gflag) {
+    max_vrec_len += ((1 + dosage_phase_gflag) * DIV_UP(sample_ct, CHAR_BIT)) + (2 + 2 * dosage_phase_gflag) * ((uint64_t)sample_ct);
+  }
+  const uint32_t max_vblock_size = MINV(variant_ct, kPglVblockSize);
+  uint64_t max_vblock_byte_ct = ((uint64_t)max_vrec_len) * max_vblock_size;
+  if (max_vrec_len >= kPglMaxBytesPerVariant) {
+    max_vrec_len = kPglMaxBytesPerVariant;
+    max_vblock_byte_ct = kPglMaxBytesPerVariant * ((uint64_t)max_vblock_size);
+  } else if (allele_idx_offsets && (allele_idx_offsets[variant_ct] != 2 * variant_ct)) {
+    assert(allele_idx_offsets[0] == 0);
+    assert(allele_idx_offsets[variant_ct] > 2 * variant_ct);
+    // When multiallelic variants are present, larger write buffers are
+    // needed; we compute the largest possible size here.
+    //
+    // For aux1, a nonmissingness array with (sample_ct + 7) / 8 bytes is
+    // always needed.  on top of that,
+    //   alt ct  additional bytes required
+    //   ------  -------------------------
+    //        2        (sample_ct + 3) / 4
+    //        3        (sample_ct + 1) / 2
+    //     4-15                  sample_ct
+    //   16-255              2 * sample_ct
+    //
+    // For aux5, (2 + 2 * dosage_phase_gflag) additional bytes are needed per
+    // sample x additional alt allele (yes, it isn't hard to exceed the ~4GB
+    // variant record size limit here).
+    //
+    // Between the two, we have a piecewise linear function with up to 5
+    // segments (last segment could correspond to the record size limit).
+    // Okay, the last segment means "out of memory" unless we have something
+    // like 256TB RAM, but still.
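+    //
+    // Worked example of the aux1 table above (illustrative, sample_ct ==
+    // 1000): the nonmissingness array is 125 bytes, and the additional cost
+    // is 250 bytes for alt_ct == 2, 500 for alt_ct == 3, 1000 for alt_ct in
+    // [4, 15], and 2000 for alt_ct >= 16.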
+    uintptr_t prev_offset = 0;
+    uint32_t vidx = 0;
+    const uint32_t extra_bytes_base = DIV_UP(sample_ct, CHAR_BIT);
+    const uint64_t extra_bytes_max = kPglMaxBytesPerVariant - max_vrec_len;
+    const uint64_t extra_dosage_bytes_per_alt = dosage_phase_gflag * (2 + 2 * dosage_phase_gflag) * ((uint64_t)sample_ct);
+    uint64_t extra_byte_cts[4];
+    uint32_t extra_alt_ceil = kPglMaxAltAlleleCt + 1;
+
+    // alt_ct == 2
+    uint64_t cur_extra_byte_ct = extra_bytes_base + DIV_UP(sample_ct, 4) + extra_dosage_bytes_per_alt;
+    extra_byte_cts[0] = cur_extra_byte_ct;
+    extra_byte_cts[1] = 0; // force initialization
+    extra_byte_cts[2] = 0;
+    extra_byte_cts[3] = 0;
+    if (cur_extra_byte_ct >= extra_bytes_max) {
+      extra_alt_ceil = 2;
+    } else {
+      // alt_ct == 3
+      cur_extra_byte_ct = extra_bytes_base + DIV_UP(sample_ct, 4) + 2 * extra_dosage_bytes_per_alt;
+      extra_byte_cts[1] = cur_extra_byte_ct;
+      if (cur_extra_byte_ct >= extra_bytes_max) {
+	extra_alt_ceil = 3;
+      } else {
+	// alt_ct in [4, 15]
+	cur_extra_byte_ct = extra_bytes_base + sample_ct + 3 * extra_dosage_bytes_per_alt;
+	extra_byte_cts[2] = cur_extra_byte_ct;
+	if (cur_extra_byte_ct >= extra_bytes_max) {
+	  extra_alt_ceil = 4;
+	} else if (cur_extra_byte_ct + 11 * extra_dosage_bytes_per_alt >= extra_bytes_max) {
+	  extra_alt_ceil = (uint32_t)(5 + (extra_bytes_max - cur_extra_byte_ct - 1) / extra_dosage_bytes_per_alt);
+	} else {
+	  // alt_ct in [16, 254]
+	  cur_extra_byte_ct = extra_bytes_base + 2 * sample_ct + 15 * extra_dosage_bytes_per_alt;
+	  extra_byte_cts[3] = cur_extra_byte_ct;
+	  if (cur_extra_byte_ct >= extra_bytes_max) {
+	    extra_alt_ceil = 16;
+	  } else if (cur_extra_byte_ct + 238 * extra_dosage_bytes_per_alt >= extra_bytes_max) {
+	    extra_alt_ceil = (uint32_t)(17 + (extra_bytes_max - cur_extra_byte_ct - 1) / extra_dosage_bytes_per_alt);
+	  }
+	}
+      }
+    }    
+    uint64_t extra_nonceil_altp1_total = 0;
+    uint32_t extra_alt_ceil_ct = 0;
+    const uint64_t uncompressed_biallelic_vrec_len = max_vrec_len;
+    uint32_t altx_seen_mask = 0;
+    uint32_t max_alt_ct_p1 = 3;
+    while (1) {
+      const uint32_t vblock_start = vidx;
+      uint32_t vblock_end = vidx + kPglVblockSize;
+      if (vblock_end > variant_ct) {
+	if (vidx == variant_ct) {
+	  break;
+	}
+	vblock_end = variant_ct;
+      }
+      uint32_t altx_seen[4];
+      fill_uint_zero(4, altx_seen);
+      for (; vidx < vblock_end;) {
+	const uintptr_t cur_offset = allele_idx_offsets[++vidx];
+	const uint32_t alt_ct_p1 = (uint32_t)(cur_offset - prev_offset);
+	if (alt_ct_p1 > 2) {
+	  if (alt_ct_p1 >= extra_alt_ceil) {
+	    ++extra_alt_ceil_ct;
+	  } else {
+	    // don't need to track this when we hit the ceiling
+	    if (alt_ct_p1 > max_alt_ct_p1) {
+	      max_alt_ct_p1 = alt_ct_p1;
+	    }
+
+	    extra_nonceil_altp1_total += alt_ct_p1;
+	    if (alt_ct_p1 < 5) {
+	      altx_seen[alt_ct_p1 - 3] += 1;
+	    } else {
+	      altx_seen[2 + (alt_ct_p1 >= 16)] += 1;
+	    }
+	  }
+	}
+	prev_offset = cur_offset;
+      }
+      // vidx == vblock_end here, so the variant count must come from
+      // vblock_start
+      uint64_t cur_vblock_byte_ct = uncompressed_biallelic_vrec_len * (vblock_end - vblock_start);
+      cur_vblock_byte_ct += extra_alt_ceil_ct * extra_bytes_max;
+      for (uint32_t uii = 0; uii < 4; ++uii) {
+	if (altx_seen[uii]) {
+	  const uint32_t cur_seen_ct = altx_seen[uii];
+	  altx_seen_mask |= 1 << uii;
+	  cur_vblock_byte_ct += cur_seen_ct * extra_byte_cts[uii];
+	}
+      }
+      if (dosage_gflag) {
+	cur_vblock_byte_ct += (extra_nonceil_altp1_total - altx_seen[0] * 3 - altx_seen[1] * 4 - altx_seen[2] * 5 - altx_seen[3] * 17) * extra_dosage_bytes_per_alt;
+      }
+      if (cur_vblock_byte_ct > max_vblock_byte_ct) {
+	max_vblock_byte_ct = cur_vblock_byte_ct;
+      }
+    }
+    if (extra_alt_ceil_ct) {
+      max_vrec_len = kPglMaxBytesPerVariant;
+    } else {
+      max_vrec_len = uncompressed_biallelic_vrec_len + extra_byte_cts[31 - __builtin_clz(altx_seen_mask)];
+      if (dosage_gflag && (max_alt_ct_p1 >= 6)) {
+	if (max_alt_ct_p1 >= 17) {
+	  max_vrec_len += (max_alt_ct_p1 - 17) * extra_dosage_bytes_per_alt;
+	} else {
+	  max_vrec_len += (max_alt_ct_p1 - 5) * extra_dosage_bytes_per_alt;
+	}
+      }
+    }
+  }
+  // vrec_len_buf
+  // previously used overlapping uint32_t writes-to-memory, but that was
+  // incompatible with multithreaded compression
+  *vrec_len_byte_ct_ptr = bytes_to_represent_ui((uint32_t)max_vrec_len);
+  *alloc_base_cacheline_ct_ptr = alloc_base_cacheline_ct + DIV_UP(((uintptr_t)variant_ct) * (*vrec_len_byte_ct_ptr), kCacheline);
+  
+  // main write buffer
+  *vblock_cacheline_ct_ptr = DIV_UP(max_vblock_byte_ct, kCacheline);
+  *alloc_per_thread_cacheline_ct_ptr = alloc_per_thread_cacheline_ct + (*vblock_cacheline_ct_ptr);
+}
+
+
+void pwc_init_phase2(uintptr_t fwrite_cacheline_ct, uint32_t thread_ct, pgen_writer_common_t** pwcs, unsigned char* pwc_alloc) {
+  const uint32_t variant_ct = pwcs[0]->variant_ct;
+  unsigned char* alloc_iter = pwc_alloc;
+  const uint32_t vblock_ct = DIV_UP(variant_ct, kPglVblockSize);
+  const pgen_global_flags_t phase_dosage_gflags = pwcs[0]->phase_dosage_gflags;
+  uint32_t vrtype_buf_bytes;
+  if (phase_dosage_gflags) {
+    vrtype_buf_bytes = (uint32_t)round_up_pow2(variant_ct, kCacheline);
+  } else {
+    vrtype_buf_bytes = DIV_UP(variant_ct, kCacheline * 2) * kCacheline;
+  }
+  pwcs[0]->vblock_fpos = (uint64_t*)alloc_iter;
+  alloc_iter = &(alloc_iter[INT64CT_TO_CLCT(vblock_ct) * kCacheline]);
+
+  pwcs[0]->vrec_len_buf = alloc_iter;
+  alloc_iter = &(alloc_iter[round_up_pow2(variant_ct * pwcs[0]->vrec_len_byte_ct, kCacheline)]);
+
+  pwcs[0]->vrtype_buf = (uintptr_t*)alloc_iter;
+  // spgw_append() assumes these bytes are zeroed out
+  memset(pwcs[0]->vrtype_buf, 0, vrtype_buf_bytes);
+  alloc_iter = &(alloc_iter[vrtype_buf_bytes]);
+
+  const uint32_t sample_ct = pwcs[0]->sample_ct;
+  const uint32_t genovec_byte_alloc = QUATERCT_TO_CLCT(sample_ct) * kCacheline;
+  const uint32_t max_difflist_len = 2 * (sample_ct / kPglMaxDifflistLenDivisor);
+  for (uint32_t tidx = 0; tidx < thread_ct; ++tidx) {
+    if (tidx) {
+      pwcs[tidx]->vblock_fpos = pwcs[0]->vblock_fpos;
+      pwcs[tidx]->vrec_len_buf = pwcs[0]->vrec_len_buf;
+      pwcs[tidx]->vrtype_buf = pwcs[0]->vrtype_buf;
+    }
+    pwcs[tidx]->genovec_invert_buf = (uintptr_t*)alloc_iter;
+    alloc_iter = &(alloc_iter[genovec_byte_alloc]);
+    pwcs[tidx]->ldbase_genovec = (uintptr_t*)alloc_iter;
+    alloc_iter = &(alloc_iter[genovec_byte_alloc]);
+
+    pwcs[tidx]->ldbase_raregeno = (uintptr_t*)alloc_iter;
+    alloc_iter = &(alloc_iter[QUATERCT_TO_CLCT(max_difflist_len) * kCacheline]);
+    pwcs[tidx]->ldbase_difflist_sample_ids = (uint32_t*)alloc_iter;
+    alloc_iter = &(alloc_iter[(1 + (max_difflist_len / kInt32PerCacheline)) * kCacheline]);
+
+    pwcs[tidx]->fwrite_buf = alloc_iter;
+    pwcs[tidx]->fwrite_bufp = alloc_iter;
+    alloc_iter = &(alloc_iter[fwrite_cacheline_ct * kCacheline]);
+  }
+}
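+
+// Note on the partitioning above: vblock_fpos, vrec_len_buf, and vrtype_buf
+// are shared across all writer threads (they index by variant, and each
+// thread is expected to touch disjoint variant ranges), while
+// genovec_invert_buf, the ldbase_* buffers, and the fwrite buffers are
+// per-thread scratch space.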
+
+void spgw_init_phase2(uint32_t max_vrec_len, st_pgen_writer_t* spgwp, unsigned char* spgw_alloc) {
+  uintptr_t fwrite_cacheline_ct = DIV_UP(max_vrec_len + kPglFwriteBlockSize - 1, kCacheline);
+  pgen_writer_common_t* pwcp = &(spgwp->pwc);
+  if (pwcp->phase_dosage_gflags & kfPgenGlobalHardcallPhasePresent) {
+    fwrite_cacheline_ct += 2 * BITCT_TO_CLCT(pwcp->sample_ct);
+  }
+  pwc_init_phase2(fwrite_cacheline_ct, 1, &pwcp, spgw_alloc);
+}
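+
+// Single-threaded writer setup sketch (the allocation step and the
+// kfPgenGlobal0 / nonref_flags_storage == 1 values are illustrative; any
+// cacheline-aligned allocator works):
+//   st_pgen_writer_t spgw;
+//   uintptr_t alloc_cacheline_ct;
+//   uint32_t max_vrec_len;
+//   spgw_preinit(&spgw);
+//   pglerr_t reterr = spgw_init_phase1("out.pgen", nullptr, nullptr,
+//       variant_ct, sample_ct, kfPgenGlobal0, 1, &spgw, &alloc_cacheline_ct,
+//       &max_vrec_len);
+//   if (!reterr) {
+//     unsigned char* spgw_alloc = ...;  // alloc_cacheline_ct * kCacheline bytes
+//     spgw_init_phase2(max_vrec_len, &spgw, spgw_alloc);
+//   }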
+
+pglerr_t mpgw_init_phase2(const char* __restrict fname, const uintptr_t* __restrict allele_idx_offsets, uintptr_t* __restrict explicit_nonref_flags, uint32_t variant_ct, uint32_t sample_ct, pgen_global_flags_t phase_dosage_gflags, uint32_t nonref_flags_storage, uint32_t vrec_len_byte_ct, uintptr_t vblock_cacheline_ct, uint32_t thread_ct, unsigned char* mpgw_alloc, mt_pgen_writer_t* mpgwp) {
+  assert(thread_ct);
+  const uintptr_t pwc_byte_ct = round_up_pow2(sizeof(pgen_writer_common_t), kCacheline);
+  for (uint32_t tidx = 0; tidx < thread_ct; ++tidx) {
+    mpgwp->pwcs[tidx] = (pgen_writer_common_t*)(&(mpgw_alloc[tidx * pwc_byte_ct]));
+  }
+  pglerr_t reterr = pwc_init_phase1(fname, allele_idx_offsets, explicit_nonref_flags, variant_ct, sample_ct, phase_dosage_gflags, nonref_flags_storage, vrec_len_byte_ct, mpgwp->pwcs[0], &(mpgwp->pgen_outfile));
+  if (!reterr) {
+    mpgwp->thread_ct = thread_ct;
+    if (thread_ct > 1) {
+      for (uint32_t tidx = 1; tidx < thread_ct; ++tidx) {
+	memcpy(mpgwp->pwcs[tidx], mpgwp->pwcs[0], sizeof(pgen_writer_common_t));
+	mpgwp->pwcs[tidx]->vidx = tidx * kPglVblockSize;
+      }
+    }
+    pwc_init_phase2(vblock_cacheline_ct, thread_ct, mpgwp->pwcs, &(mpgw_alloc[thread_ct * pwc_byte_ct]));
+  }
+  return reterr;
+}
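+
+// mpgw_alloc layout established above (a sketch reconstructed from the code,
+// not from a separate spec): thread_ct cacheline-aligned pgen_writer_common_t
+// structs, then the shared vblock_fpos / vrec_len_buf / vrtype_buf arrays,
+// then the per-thread genovec/difflist/fwrite buffers laid out by
+// pwc_init_phase2().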
+
+
+void count_ld_and_inverted_ld_diffs(const uintptr_t* __restrict ldbase_genovec, const uintptr_t* __restrict genovec, uint32_t sample_ct, uint32_t* ld_diff_ctp, uint32_t* ld_inv_diff_ctp) {
+  // Requires trailing bits to be zeroed out.
+  const uint32_t word_ct = QUATERCT_TO_WORDCT(sample_ct);
+  const uintptr_t* genovec_end = &(genovec[word_ct]);
+  uint32_t ld_diff_ct = 0;
+  uint32_t ld_inv_diff_ct = 0;
+  // construct the words we want to popcount_quatervec_01 on the fly
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t m2 = VCONST_UL(kMask3333);
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  const vul_t* ldbase_vvec_iter = (const vul_t*)ldbase_genovec;
+  const vul_t* geno_vvec_iter = (const vul_t*)genovec;
+  uint32_t full_vecs_left = 3 * (word_ct / (3 * kWordsPerVec));
+  univec_t acc_ld;
+  univec_t acc_ld_inv;
+  while (1) {
+    acc_ld.vi = vul_setzero();
+    acc_ld_inv.vi = vul_setzero();
+    const vul_t* geno_vvec_stop;
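+    // the inner loop below consumes 3 vectors from each array per iteration,
+    // so 60 vectors = 20 iterations; each iteration adds at most 12 to every
+    // byte-wide accumulator lane (20 * 12 = 240 < 256), so acc_ld and
+    // acc_ld_inv cannot overflow before the 8-bit fold at the bottom of the
+    // outer loop.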
+    if (full_vecs_left < 60) {
+      if (!full_vecs_left) {
+	break;
+      }
+      geno_vvec_stop = &(geno_vvec_iter[full_vecs_left]);
+      full_vecs_left = 0;
+    } else {
+      geno_vvec_stop = &(geno_vvec_iter[60]);
+      full_vecs_left -= 60;
+    }
+    do {
+      vul_t loader_ldbase1 = *ldbase_vvec_iter++;
+      vul_t loader_geno1 = *geno_vvec_iter++;
+      vul_t loader_ldbase2 = *ldbase_vvec_iter++;
+      vul_t loader_geno2 = *geno_vvec_iter++;
+      vul_t xor1 = loader_ldbase1 ^ loader_geno1;
+      vul_t xor2 = loader_ldbase2 ^ loader_geno2;
+      vul_t xor_shifted1 = vul_rshift(xor1, 1);
+      vul_t xor_shifted2 = vul_rshift(xor2, 1);
+      // xor(_low)  xor_shifted  loader_geno   result
+      //         1                                  1
+      //         0            0            0        1
+      //         0            0            1        0
+      //         0            1            0        0
+      //         0            1            1        1
+      // no obvious way to avoid the extra xor with loader_geno in the
+      // inverted case
+      vul_t count_ld_inv = (xor1 | (xor_shifted1 ^ loader_geno1 ^ m1)) & m1;
+      loader_ldbase1 = *ldbase_vvec_iter++;
+      vul_t count_ld = (xor1 | xor_shifted1) & m1;
+      loader_geno1 = *geno_vvec_iter++;
+      count_ld_inv = count_ld_inv + ((xor2 | (xor_shifted2 ^ loader_geno2 ^ m1)) & m1);
+      xor1 = loader_ldbase1 ^ loader_geno1;
+      count_ld = count_ld + ((xor2 | xor_shifted2) & m1);
+      xor_shifted1 = vul_rshift(xor1, 1);
+      count_ld_inv = count_ld_inv + ((xor1 | (xor_shifted1 ^ loader_geno1 ^ m1)) & m1);
+      count_ld = count_ld + ((xor1 | xor_shifted1) & m1);
+      // now count_ld and count_ld_inv each have 64 2-bit values from 0-3
+
+      count_ld_inv = (count_ld_inv & m2) + (vul_rshift(count_ld_inv, 2) & m2);
+      count_ld = (count_ld & m2) + (vul_rshift(count_ld, 2) & m2);
+      // now they have 32 4-bit values from 0-6
+
+      acc_ld_inv.vi = acc_ld_inv.vi + ((count_ld_inv + vul_rshift(count_ld_inv, 4)) & m4);
+      acc_ld.vi = acc_ld.vi + ((count_ld + vul_rshift(count_ld, 4)) & m4);
+    } while (geno_vvec_iter < geno_vvec_stop);
+    const vul_t m8 = VCONST_UL(kMask00FF);
+    acc_ld_inv.vi = (acc_ld_inv.vi & m8) + (vul_rshift(acc_ld_inv.vi, 8) & m8);
+    acc_ld.vi = (acc_ld.vi & m8) + (vul_rshift(acc_ld.vi, 8) & m8);
+    ld_inv_diff_ct += univec_hsum_16bit(acc_ld_inv);
+    ld_diff_ct += univec_hsum_16bit(acc_ld);
+  }
+  const uintptr_t* ldbase_iter = (const uintptr_t*)ldbase_vvec_iter;
+  const uintptr_t* genovec_iter = (const uintptr_t*)geno_vvec_iter;
+  while (genovec_iter < genovec_end) {
+    uintptr_t ldbase_word = *ldbase_iter++;
+    uintptr_t geno_word = *genovec_iter++;
+    uintptr_t xor_result = ldbase_word ^ geno_word;
+    uintptr_t xor_result_shifted = xor_result >> 1;
+    ld_diff_ct += popcount01_long((xor_result | xor_result_shifted) & kMask5555);
+    ld_inv_diff_ct += popcount01_long((xor_result | (xor_result_shifted ^ (~geno_word))) & kMask5555);
+  }
+  *ld_diff_ctp = ld_diff_ct;
+  // trailing entries in last word are always "different"
+  *ld_inv_diff_ctp = ld_inv_diff_ct - ((-sample_ct) & (kBitsPerWordD2 - 1));
+}
+
+uint32_t count_ld_and_inverted_ld_diffs_list(const uintptr_t* __restrict ldbase_raregeno, const uint32_t* __restrict ldbase_difflist_sample_ids, const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t ldbase_difflist_len, uint32_t difflist_len, uint32_t* ld_diff_ctp, uint32_t* ld_inv_diff_ctp) {
+  // assumes ldbase_difflist_sample_ids[ldbase_difflist_len] == sample_ct
+  // assumes variant isn't multiallelic
+  
+  // only the count(s) with aligned common_geno values are valid.  e.g. if
+  // ldbase_common_geno and difflist_common_geno are both zero, the ld_inv_diff
+  // return value can be anything, while if they're both three, ld_diff and
+  // ld_inv_diff are both accurate.
+  
+  // some similarities to parse_ld_and_merge_difflist_subset(), but much
+  // simpler.
+  // noticeably slower than count_ld_and_inverted_ld_diffs() when the lists
+  // aren't tiny.
+  // possible todo: take threshold into account?
+
+  uint32_t collision_ct = 0;
+  uint32_t ld_diff_ct = 0;
+  uint32_t ld_inv_diff_ct = 0;
+  uint32_t ldbase_sample_idx = ldbase_difflist_sample_ids[0];
+  uint32_t ldbase_difflist_idx = 1;
+  // this loop is a bit slow.  attempt to bail halfway through?
+  for (uint32_t difflist_idx = 0; difflist_idx < difflist_len; ++difflist_idx) {
+    const uint32_t raw_sample_idx = difflist_sample_ids[difflist_idx];
+    while (ldbase_sample_idx < raw_sample_idx) {
+      ldbase_sample_idx = ldbase_difflist_sample_ids[ldbase_difflist_idx++];
+    }
+    if (ldbase_sample_idx > raw_sample_idx) {
+      continue;
+    }
+    const uint32_t cur_raregeno = GET_QUATERARR_ENTRY(raregeno, difflist_idx);
+    const uint32_t cur_ldbase_raregeno = GET_QUATERARR_ENTRY(ldbase_raregeno, ldbase_difflist_idx - 1);
+    const uint32_t cur_inv_raregeno = (6 - cur_raregeno) & 3;
+    ld_diff_ct += (cur_ldbase_raregeno != cur_raregeno);
+    ldbase_sample_idx = ldbase_difflist_sample_ids[ldbase_difflist_idx++];
+    ++collision_ct;
+    ld_inv_diff_ct += (cur_ldbase_raregeno != cur_inv_raregeno);
+  }
+  // no more collisions, don't actually need to look at rest of
+  // ldbase_difflist
+  const uint32_t base_diff_ct = ldbase_difflist_len + difflist_len - 2 * collision_ct;
+  *ld_diff_ctp = base_diff_ct + ld_diff_ct;
+  *ld_inv_diff_ctp = base_diff_ct + ld_inv_diff_ct;
+  return 1;
+}
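+
+// Worked example for count_ld_and_inverted_ld_diffs_list(), assuming
+// ldbase_common_geno == difflist_common_geno so ld_diff is the valid count:
+// ldbase difflist {(sample 5, geno 2), (sample 17, geno 3)}, new difflist
+// {(sample 5, geno 2), (sample 23, geno 1)}.  Sample 5 collides with an equal
+// raregeno, so collision_ct = 1 and the collision loop leaves ld_diff_ct at
+// 0; base_diff_ct = 2 + 2 - 2*1 = 2 (samples 17 and 23 each differ from the
+// common genotype in exactly one of the two variants), so *ld_diff_ctp = 2.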
+
+uint32_t save_ld_difflist(const uintptr_t* __restrict genovec, const uintptr_t* __restrict ldbase_genovec, uintptr_t common_geno, uint32_t difflist_len, pgen_writer_common_t* pwcp) {
+  unsigned char* fwrite_bufp = pwcp->fwrite_bufp;
+  if (!difflist_len) {
+    *fwrite_bufp = 0;
+    pwcp->fwrite_bufp = &(fwrite_bufp[1]);
+    return 1;
+  }
+  unsigned char* fwrite_bufp_start = fwrite_bufp;
+  fwrite_bufp = vint32_append(difflist_len, fwrite_bufp);
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(pwcp->sample_ct);
+  const uintptr_t common_geno_word = common_geno * kMask5555;
+  const uint32_t group_ct = DIV_UP(difflist_len, kPglDifflistGroupSize);
+  unsigned char* group_first_sample_ids_iter = fwrite_bufp;
+  unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]);
+#ifdef __arm__
+  #error "Unaligned accesses in save_ld_difflist()."
+#endif
+  uintptr_t* raregeno_iter = (uintptr_t*)(&(extra_byte_cts_iter[group_ct - 1]));
+  fwrite_bufp = &(extra_byte_cts_iter[group_ct + (difflist_len - 1) / 4]);
+  unsigned char* last_group_vint_start = fwrite_bufp;
+  uintptr_t raregeno_word = 0;
+  uint32_t last_sample_idx = 0;
+  uint32_t difflist_idx = 0;
+  uint32_t widx = 0;
+  while (1) {
+    const uintptr_t cur_geno_word = genovec[widx];
+    uintptr_t xor_word = ldbase_genovec? ldbase_genovec[widx] : common_geno_word;
+    xor_word ^= cur_geno_word;
+    if (xor_word) {
+      const uint32_t sample_idx_base = widx * kBitsPerWordD2;
+      do {
+	const uint32_t sample_idx_lowbits = CTZLU(xor_word) / 2;
+	const uint32_t new_sample_idx = sample_idx_base + sample_idx_lowbits;
+	raregeno_word |= ((cur_geno_word >> (2 * sample_idx_lowbits)) & 3) << (2 * (difflist_idx % kBitsPerWordD2));
+	if (!(difflist_idx % kPglDifflistGroupSize)) {
+	  group_first_sample_ids_iter = (unsigned char*)memcpya(group_first_sample_ids_iter, &new_sample_idx, sample_id_byte_ct);
+	  if (difflist_idx) {
+	    *extra_byte_cts_iter++ = ((uintptr_t)(fwrite_bufp - last_group_vint_start)) - (kPglDifflistGroupSize - 1);
+	  }
+	  last_group_vint_start = fwrite_bufp;
+	} else {
+	  assert(new_sample_idx >= last_sample_idx + 1);
+	  fwrite_bufp = vint32_append(new_sample_idx - last_sample_idx, fwrite_bufp);
+	}
+	++difflist_idx;
+	last_sample_idx = new_sample_idx;
+	if (difflist_idx == difflist_len) {
+	  memcpy(raregeno_iter, &raregeno_word, 1 + (((difflist_len - 1) / 4) % sizeof(intptr_t)));
+	  pwcp->fwrite_bufp = fwrite_bufp;
+	  return (uint32_t)((uintptr_t)(fwrite_bufp - fwrite_bufp_start));
+	}
+	if (!(difflist_idx % kBitsPerWordD2)) {
+	  *raregeno_iter++ = raregeno_word;
+	  raregeno_word = 0;
+	}
+	xor_word &= (~(3 * k1LU)) << (2 * sample_idx_lowbits);
+      } while (xor_word);
+    }
+    ++widx;
+  }
+}
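+
+// On-disk record layout emitted by save_ld_difflist() (reconstructed from the
+// code above, not from a separate spec):
+//   1. vint: difflist_len (0 is stored as the single byte 0)
+//   2. group_ct * sample_id_byte_ct bytes: first sample ID of each group of
+//      kPglDifflistGroupSize entries
+//   3. group_ct - 1 bytes: per-group vint-region lengths, stored minus the
+//      (kPglDifflistGroupSize - 1)-byte minimum
+//   4. DIV_UP(difflist_len, 4) bytes: packed 2-bit raregeno values
+//   5. vints: sample ID deltas for each non-group-start entry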
+
+void onebit_preprocess_buf(const uintptr_t* __restrict genovec, uint32_t sample_ct, uint32_t common2_code, uintptr_t* __restrict genovec_buf) {
+  assert(sample_ct);
+  const uint32_t vec_ct = QUATERCT_TO_VECCT(sample_ct);
+  // todo: look for better ways to perform some of these operations
+  const vul_t* geno_vvec_iter = (const vul_t*)genovec;
+  const vul_t* geno_vvec_end = &(geno_vvec_iter[vec_ct]);
+  vul_t* write_iter = (vul_t*)genovec_buf;
+  const vul_t m1 = VCONST_UL(kMask5555);
+  if (common2_code < 5) {
+    if (common2_code == 1) {
+      // 11 -> 10, everything else unchanged
+      // todo: check if these loops are actually faster as simple while loops
+      // todo: check if it's better to unroll these loops to process 2 __m128is
+      //       at a time
+      do {
+	const vul_t cur_geno = *geno_vvec_iter++;
+	*write_iter++ = (~(m1 & vul_rshift(cur_geno, 1))) & cur_geno;
+      } while (geno_vvec_iter < geno_vvec_end);
+    } else if (common2_code == 3) {
+      // 00 -> 00, 01 -> 10, 10 -> 10, 11 -> 01
+      do {
+	const vul_t cur_geno = *geno_vvec_iter++;
+	const vul_t cur_geno_rshift = vul_rshift(cur_geno, 1);
+	const vul_t cur_geno_xor_masked = (cur_geno ^ cur_geno_rshift) & m1;
+	const vul_t cur_geno_or_masked = (cur_geno | cur_geno_rshift) & m1;
+	*write_iter++ = cur_geno_xor_masked + cur_geno_or_masked;
+      } while (geno_vvec_iter < geno_vvec_end);
+    } else {
+      assert(common2_code == 2);
+      // 00 -> 00, 01 -> 10, 10 -> 01, 11 -> 10
+      do {
+	const vul_t cur_geno = *geno_vvec_iter++;
+	const vul_t cur_geno_or_masked = (cur_geno | vul_rshift(cur_geno, 1)) & m1;
+	const vul_t cur_geno_lowbits = cur_geno & m1;
+	*write_iter++ = cur_geno_lowbits + cur_geno_or_masked;
+      } while (geno_vvec_iter < geno_vvec_end);
+    }
+  } else {
+    if (common2_code == 5) {
+      // 00 -> 10, 01 -> 00, 10 -> 01, 11 -> 10
+      do {
+	const vul_t cur_geno = *geno_vvec_iter++;
+	const vul_t cur_geno_rshift = vul_rshift(cur_geno, 1);
+	const vul_t cur_geno_not_xor_masked = (~(cur_geno ^ cur_geno_rshift)) & m1;
+	const vul_t cur_geno_rshift_masked = cur_geno_rshift & m1;
+	*write_iter++ = cur_geno_not_xor_masked + (cur_geno_not_xor_masked | cur_geno_rshift_masked);
+      } while (geno_vvec_iter < geno_vvec_end);
+    } else if (common2_code == 9) {
+      // 00 -> 10, 01 -> 10, 10 -> 00, 11 -> 01
+      const vul_t not_m1 = VCONST_UL(kMaskAAAA);
+      do {
+	const vul_t cur_geno = *geno_vvec_iter++;
+	*write_iter++ = (cur_geno ^ not_m1) - ((~not_m1) & ((~vul_rshift(cur_geno, 1)) & cur_geno));
+      } while (geno_vvec_iter < geno_vvec_end);
+    } else {
+      assert(common2_code == 6);
+      // 00 -> 10, 01 -> 00, 10 -> 10, 11 -> 01
+      do {
+	const vul_t cur_geno = *geno_vvec_iter++;
+	const vul_t cur_geno_not_lowbits = (~cur_geno) & m1;
+	const vul_t cur_geno_rshift_masked = vul_rshift(cur_geno, 1) & m1;
+	*write_iter++ = cur_geno_not_lowbits + (cur_geno_not_lowbits | cur_geno_rshift_masked);
+      } while (geno_vvec_iter < geno_vvec_end);
+    }
+  }
+}
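+
+// common2_code values produced by the callers (larger_common_geno +
+// 3 * smaller_common_geno, equivalently (smaller << 2) + (larger - smaller)):
+//   1 -> common genotypes {0, 1}    5 -> {1, 2}
+//   2 -> {0, 2}                     6 -> {1, 3}
+//   3 -> {0, 3}                     9 -> {2, 3}
+// In every case the preprocessed buffer has the even bit set iff the genotype
+// equals the larger common value, and the odd bit set iff the genotype is one
+// of the two uncommon values.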
+
+uint32_t save_onebit(const uintptr_t* __restrict genovec, uint32_t common2_code, uint32_t onebit_difflist_len, pgen_writer_common_t* pwcp) {
+  // Uses ldbase_genovec as a temporary buffer.
+  
+  // common2_code is expected to have the difference between the common
+  // genotype values in bits 0-1, and the smaller common genotype value in bits
+  // 2-3.
+  unsigned char* fwrite_bufp_start = pwcp->fwrite_bufp;
+  *fwrite_bufp_start = common2_code;
+  const uint32_t sample_ct = pwcp->sample_ct;
+  uintptr_t* __restrict genovec_buf = pwcp->ldbase_genovec;
+  // There's a 4-byte-interleaved format which is slightly more efficient for
+  // unsubsetted handling (~10 fewer compression/decompression operations per
+  // 32 genotypes), but that's only a 1-2% speedup, which probably isn't worth
+  // the more annoying subsetting.
+  //
+  // Any 10s and 11s are saved as 00 in this part.
+  // Similar logic is used to handle the other five possibilities (00/10,
+  // 00/11, 01/10, 01/11, 10/11); all of them should be expected to actually
+  // happen.  (E.g. 01/11 can happen at a high MAF variant when there's lots of
+  // missing data.)  To reduce branching, we preprocess genovec_buf to have
+  // even bit set iff the corresponding genotype is equal to the high common
+  // genotype value, and odd bit set iff the corresponding genotype is one of
+  // the two uncommon values.  (There may be a better way to do this, analogous
+  // to the simpler decompression algorithm.)
+  onebit_preprocess_buf(genovec, sample_ct, common2_code, genovec_buf);
+  zero_trailing_quaters(sample_ct, genovec_buf);
+  const uint32_t word_read_ct = QUATERCT_TO_WORDCT(sample_ct);
+#ifdef __arm__
+  #error "Unaligned accesses in save_onebit()."
+#endif
+  halfword_t* fwrite_bufp_alias_halfword = (halfword_t*)(&(fwrite_bufp_start[1]));
+  for (uint32_t widx = 0; widx < word_read_ct; ++widx) {
+    const uintptr_t cur_buf_word = genovec_buf[widx] & kMask5555;
+    fwrite_bufp_alias_halfword[widx] = pack_word_to_halfword(cur_buf_word);
+  }
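+  // 1 byte for common2_code + DIV_UP(sample_ct, 8) bitarray bytes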
+  const uint32_t onebit_block_len = (sample_ct + 15) / CHAR_BIT;
+  unsigned char* fwrite_bufp = vint32_append(onebit_difflist_len, &(fwrite_bufp_start[onebit_block_len]));
+  // the rest is almost identical to save_ld_difflist()
+  if (!onebit_difflist_len) {
+    pwcp->fwrite_bufp = fwrite_bufp;
+    return (onebit_block_len + 1);
+  }
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(pwcp->sample_ct);
+  const uint32_t group_ct = DIV_UP(onebit_difflist_len, kPglDifflistGroupSize);
+  unsigned char* group_first_sample_ids_iter = fwrite_bufp;
+  unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]);
+  uintptr_t* raregeno_iter = (uintptr_t*)(&(extra_byte_cts_iter[group_ct - 1]));
+  fwrite_bufp = &(extra_byte_cts_iter[group_ct + (onebit_difflist_len - 1) / 4]);
+  unsigned char* last_group_vint_start = fwrite_bufp;
+  uintptr_t raregeno_word = 0;
+  uint32_t last_sample_idx = 0;
+  uint32_t difflist_idx = 0;
+  uint32_t widx = 0;
+  while (1) {
+    uintptr_t xor_word = genovec_buf[widx] & kMaskAAAA;
+    if (xor_word) {
+      const uintptr_t cur_geno_word = genovec[widx];
+      const uint32_t sample_idx_base = widx * kBitsPerWordD2;
+
+      // enable stronger loop optimizations
+      const uint32_t difflist_idx_end = difflist_idx + popcount_long(xor_word);
+      while (1) {
+	const uint32_t sample_idx_lowbits = CTZLU(xor_word) / 2;
+	const uint32_t new_sample_idx = sample_idx_base + sample_idx_lowbits;
+	if (!(difflist_idx % kBitsPerWordD2)) {
+	  if (!(difflist_idx % kPglDifflistGroupSize)) {
+	    group_first_sample_ids_iter = (unsigned char*)memcpya(group_first_sample_ids_iter, &new_sample_idx, sample_id_byte_ct);
+	    if (difflist_idx) {
+	      *extra_byte_cts_iter++ = ((uintptr_t)(fwrite_bufp - last_group_vint_start)) - (kPglDifflistGroupSize - 1);
+	      *raregeno_iter++ = raregeno_word;
+	      raregeno_word = 0;
+	    }
+	    last_group_vint_start = fwrite_bufp;
+	    goto save_onebit_skip_delta_write;
+	  }
+	  *raregeno_iter++ = raregeno_word;
+	  raregeno_word = 0;
+	}
+	assert(new_sample_idx >= last_sample_idx + 1);
+	fwrite_bufp = vint32_append(new_sample_idx - last_sample_idx, fwrite_bufp);
+      save_onebit_skip_delta_write:
+	raregeno_word |= ((cur_geno_word >> (2 * sample_idx_lowbits)) & 3) << (2 * (difflist_idx % kBitsPerWordD2));
+	++difflist_idx;
+	last_sample_idx = new_sample_idx;
+	if (difflist_idx == difflist_idx_end) {
+	  break;
+	}
+	xor_word &= xor_word - 1;
+      }
+      // trailing bits of genovec_buf guaranteed to be zeroed out
+      if (difflist_idx == onebit_difflist_len) {
+	memcpy(raregeno_iter, &raregeno_word, 1 + (((onebit_difflist_len - 1) / 4) % sizeof(intptr_t)));
+	pwcp->fwrite_bufp = fwrite_bufp;
+	return (uint32_t)((uintptr_t)(fwrite_bufp - fwrite_bufp_start));
+      }
+    }
+    ++widx;
+  }
+}
+
+uint32_t pwc_append_biallelic_genovec_main(const uintptr_t* __restrict genovec, uint32_t vidx, pgen_writer_common_t* pwcp, uint32_t* het_ct_ptr, unsigned char* vrtype_ptr) {
+#ifndef NDEBUG
+  if (pwcp->allele_idx_offsets) {
+    assert(pwcp->allele_idx_offsets[vidx + 1] == pwcp->allele_idx_offsets[vidx] + 2);
+  }
+#endif
+  const uint32_t sample_ct = pwcp->sample_ct;
+  assert((!(sample_ct % kBitsPerWordD2)) || (!(genovec[sample_ct / kBitsPerWordD2] >> (2 * (sample_ct % kBitsPerWordD2)))));
+  uint32_t genocounts[4];
+  genovec_count_freqs_unsafe(genovec, sample_ct, genocounts);
+  *het_ct_ptr = genocounts[1];
+  uint32_t most_common_geno = (genocounts[1] > genocounts[0]);
+  uint32_t second_most_common_geno = 1 - most_common_geno;
+  uint32_t largest_geno_ct = genocounts[most_common_geno];
+  uint32_t second_largest_geno_ct = genocounts[second_most_common_geno];
+  for (uint32_t cur_geno = 2; cur_geno < 4; ++cur_geno) {
+    const uint32_t cur_geno_ct = genocounts[cur_geno];
+    if (cur_geno_ct > second_largest_geno_ct) {
+      if (cur_geno_ct > largest_geno_ct) {
+	second_largest_geno_ct = largest_geno_ct;
+	second_most_common_geno = most_common_geno;
+	largest_geno_ct = cur_geno_ct;
+	most_common_geno = cur_geno;
+      } else {
+	second_largest_geno_ct = cur_geno_ct;
+	second_most_common_geno = cur_geno;
+      }
+    }
+  }
+  const uint32_t difflist_len = sample_ct - largest_geno_ct;
+  const uint32_t rare_2_geno_ct_sum = difflist_len - second_largest_geno_ct;
+  // average of 10-11 bits per difflist entry
+  const uint32_t sample_ctd8 = sample_ct / 8;
+  const uint32_t sample_ctd64 = sample_ct / 64;
+  uint32_t max_difflist_len = sample_ctd8 - 2 * sample_ctd64 + rare_2_geno_ct_sum;
+  if (max_difflist_len > sample_ctd8) {
+    max_difflist_len = sample_ctd8;
+  }
+  const uint32_t difflist_viable = (most_common_geno != 1) && (difflist_len <= max_difflist_len);
+
+  uintptr_t* ldbase_genovec = pwcp->ldbase_genovec;
+  uint32_t* ldbase_genocounts = pwcp->ldbase_genocounts;
+  if (!(vidx % kPglVblockSize)) {
+    // beginning of a variant block.  save raw fpos in header; LD compression
+    // prohibited.
+
+    // note: this is a relative offset in the multithreaded case, since the
+    // absolute file position isn't known until the vblocks are stitched
+    // together
+    pwcp->vblock_fpos[vidx / kPglVblockSize] = pwcp->vblock_fpos_offset + (uintptr_t)(pwcp->fwrite_bufp - pwcp->fwrite_buf);
+  } else if (difflist_len > sample_ctd64) {
+    // do not use LD compression if there are at least this many differences.
+    // tune this threshold in the future.
+    const uint32_t ld_diff_threshold = difflist_viable? (difflist_len - sample_ctd64) : max_difflist_len;
+    // number of changes between current genovec and LD reference is bounded
+    // below by sum(|genocounts[x] - ldbase_genocounts[x]|) / 2
+    const int32_t count02_limit = 2 * ld_diff_threshold - abs_int32(genocounts[1] - ldbase_genocounts[1]) - abs_int32(genocounts[3] - ldbase_genocounts[3]);
+    if ((((int32_t)(abs_int32(genocounts[0] - ldbase_genocounts[0]) + abs_int32(genocounts[2] - ldbase_genocounts[2]))) < count02_limit) || (((int32_t)(abs_int32(genocounts[0] - ldbase_genocounts[2]) + abs_int32(genocounts[2] - ldbase_genocounts[0]))) < count02_limit)) {
+      uint32_t ld_diff_ct;
+      uint32_t ld_inv_diff_ct;
+      // okay, perform a brute-force diff
+      // (could check LD vs. inverted LD separately?)
+      if (pwcp->ldbase_common_geno < 4) {
+	// unpack to ldbase_genovec
+	pgr_difflist_to_genovec_unsafe(pwcp->ldbase_raregeno, pwcp->ldbase_difflist_sample_ids, pwcp->ldbase_common_geno, sample_ct, pwcp->ldbase_difflist_len, ldbase_genovec);
+	zero_trailing_quaters(sample_ct, ldbase_genovec);
+	pwcp->ldbase_common_geno = 0xffffffffU;
+      }
+      count_ld_and_inverted_ld_diffs(ldbase_genovec, genovec, sample_ct, &ld_diff_ct, &ld_inv_diff_ct);
+      if ((ld_diff_ct < ld_diff_threshold) || (ld_inv_diff_ct < ld_diff_threshold)) {
+	const uintptr_t invert_before_compressing = (ld_inv_diff_ct < ld_diff_ct);
+	*vrtype_ptr = 2 + invert_before_compressing;
+	if (invert_before_compressing) {
+	  genovec_invert_copy_unsafe(genovec, sample_ct, pwcp->genovec_invert_buf);
+	  ld_diff_ct = ld_inv_diff_ct;
+	}
+	return save_ld_difflist(invert_before_compressing? pwcp->genovec_invert_buf : genovec, ldbase_genovec, 0, ld_diff_ct, pwcp);
+      }
+    }
+  }
+  const uint32_t genovec_word_ct = QUATERCT_TO_WORDCT(sample_ct);
+  memcpy(ldbase_genocounts, genocounts, 4 * sizeof(int32_t));
+  pwcp->ldbase_common_geno = 0xffffffffU;
+  if ((!difflist_viable) && (rare_2_geno_ct_sum < sample_ct / (2 * kPglMaxDifflistLenDivisor))) {
+    *vrtype_ptr = 1;
+    uint32_t larger_common_geno = second_most_common_geno;
+    uint32_t smaller_common_geno = most_common_geno;
+    if (most_common_geno > second_most_common_geno) {
+      larger_common_geno = most_common_geno;
+      smaller_common_geno = second_most_common_geno;
+    }
+    const uint32_t vrec_len = save_onebit(genovec, larger_common_geno + (smaller_common_geno * 3), rare_2_geno_ct_sum, pwcp);
+    memcpy(ldbase_genovec, genovec, genovec_word_ct * sizeof(intptr_t));
+    return vrec_len;
+  }
+  memcpy(ldbase_genovec, genovec, genovec_word_ct * sizeof(intptr_t));
+  if (difflist_viable) {
+    *vrtype_ptr = 4 + most_common_geno;
+    return save_ld_difflist(genovec, nullptr, most_common_geno, difflist_len, pwcp);
+  }
+  *vrtype_ptr = 0;
+  const uint32_t vrec_len = QUATERCT_TO_BYTECT(sample_ct);
+  pwcp->fwrite_bufp = (unsigned char*)memcpya(pwcp->fwrite_bufp, genovec, vrec_len);
+  return vrec_len;
+}
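+
+// vrtype values assigned by pwc_append_biallelic_genovec_main() (low bits
+// only; append_hphase() later adds 0x10, and append_dosage16() adds
+// 0x20/0x40/0x60):
+//   0     uncompressed 2-bit genovec
+//   1     one-bit representation of two common genotypes + difflist of the
+//         remaining samples
+//   2/3   LD-compressed difflist vs. the (possibly inverted) previous variant
+//   4-7   difflist vs. constant common genotype 0-3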
+
+void pwc_append_biallelic_genovec(const uintptr_t* __restrict genovec, pgen_writer_common_t* pwcp) {
+  const uint32_t vidx = pwcp->vidx;
+  uint32_t het_ct; // dummy
+  unsigned char vrtype;
+  const uint32_t vrec_len = pwc_append_biallelic_genovec_main(genovec, vidx, pwcp, &het_ct, &vrtype);
+  const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+  pwcp->vidx += 1;
+  memcpy(&(pwcp->vrec_len_buf[vidx * pwcp->vrec_len_byte_ct]), &vrec_len, vrec_len_byte_ct);
+  // could have a single expression which branchlessly handles both cases, but
+  // doubt that's worthwhile
+  if (!pwcp->phase_dosage_gflags) {
+    pwcp->vrtype_buf[vidx / kBitsPerWordD4] |= ((uintptr_t)vrtype) << (4 * (vidx % kBitsPerWordD4));
+  } else {
+    ((unsigned char*)pwcp->vrtype_buf)[vidx] = vrtype;
+  }
+}
+
+pglerr_t spgw_append_biallelic_genovec(const uintptr_t* __restrict genovec, st_pgen_writer_t* spgwp) {
+  // flush write buffer if necessary
+  if (spgwp->pwc.fwrite_bufp >= &(spgwp->pwc.fwrite_buf[kPglFwriteBlockSize])) {
+    const uintptr_t cur_byte_ct = (uintptr_t)(spgwp->pwc.fwrite_bufp - spgwp->pwc.fwrite_buf);
+    if (fwrite_checked(spgwp->pwc.fwrite_buf, cur_byte_ct, spgwp->pgen_outfile)) {
+      return kPglRetWriteFail;
+    }
+    spgwp->pwc.vblock_fpos_offset += cur_byte_ct;
+    spgwp->pwc.fwrite_bufp = spgwp->pwc.fwrite_buf;
+  }
+  pwc_append_biallelic_genovec(genovec, &(spgwp->pwc));
+  return kPglRetSuccess;
+}
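+
+// Illustrative single-threaded usage (an editorial sketch, not upstream code;
+// spgw_init_phase1()'s exact signature is assumed from pgenlib_internal.h,
+// and allocation/error handling is elided):
+//
+//   st_pgen_writer_t spgw;
+//   uintptr_t alloc_cacheline_ct;
+//   uint32_t max_vrec_len;
+//   spgw_init_phase1("out.pgen", nullptr, nullptr, variant_ct, sample_ct,
+//                    kfPgenGlobal0, nonref_flags_storage, &spgw,
+//                    &alloc_cacheline_ct, &max_vrec_len);
+//   // spgw_alloc: alloc_cacheline_ct * kCacheline cacheline-aligned bytes
+//   spgw_init_phase2(max_vrec_len, &spgw, spgw_alloc);
+//   for (uint32_t vidx = 0; vidx < variant_ct; ++vidx) {
+//     // fill genovec with 2-bit genotypes; trailing bits must be zeroed
+//     if (spgw_append_biallelic_genovec(genovec, &spgw)) {
+//       // handle kPglRetWriteFail
+//     }
+//   }
+//   spgw_finish(&spgw);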
+
+uint32_t save_ld_two_list_delta(const uintptr_t* __restrict difflist_raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t ld_diff_ct, pgen_writer_common_t* pwcp) {
+  // assumes ldbase_difflist_sample_ids[ldbase_difflist_len] == sample_ct, and
+  // difflist_sample_ids[difflist_len] == sample_ct.
+  // assumes biallelic data.
+  
+  // similar to save_ld_difflist() and, to a lesser degree,
+  // parse_ld_and_merge_difflist_subset()
+  unsigned char* fwrite_bufp = pwcp->fwrite_bufp;
+  if (!ld_diff_ct) {
+    *fwrite_bufp = 0;
+    pwcp->fwrite_bufp = &(fwrite_bufp[1]);
+    return 1;
+  }
+  unsigned char* fwrite_bufp_start = fwrite_bufp;
+  fwrite_bufp = vint32_append(ld_diff_ct, fwrite_bufp);
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(pwcp->sample_ct);
+  const uint32_t group_ct = DIV_UP(ld_diff_ct, kPglDifflistGroupSize);
+  const uint32_t ldbase_common_geno = pwcp->ldbase_common_geno;
+  assert(ldbase_common_geno < 4);
+  const uintptr_t* __restrict ldbase_raregeno = pwcp->ldbase_raregeno;
+  const uint32_t* __restrict ldbase_sample_ids = pwcp->ldbase_difflist_sample_ids;
+  unsigned char* group_first_sample_ids_iter = fwrite_bufp;
+  unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]);
+#ifdef __arm__
+  #error "Unaligned accesses in save_ld_two_list_delta()."
+#endif
+  uintptr_t* raregeno_write_iter = (uintptr_t*)(&(extra_byte_cts_iter[group_ct - 1]));
+  fwrite_bufp = &(extra_byte_cts_iter[group_ct + (ld_diff_ct - 1) / 4]);
+  unsigned char* last_group_vint_start = fwrite_bufp;
+  uintptr_t ldbase_raregeno_word = 0;
+  uintptr_t difflist_raregeno_word = 0;
+  uintptr_t raregeno_write_word = 0;
+  uint32_t last_sample_idx = 0;
+
+  uint32_t next_ldbase_sample_idx = ldbase_sample_ids[0];
+  uint32_t next_difflist_sample_idx = difflist_sample_ids[0];
+  uint32_t ldbase_idx = 0;
+  uint32_t difflist_idx = 0;
+  uint32_t diff_written_ct = 0;
+  while (diff_written_ct < ld_diff_ct) {
+    uintptr_t cur_geno;
+    uint32_t new_sample_idx;
+    if (next_ldbase_sample_idx <= next_difflist_sample_idx) {
+      ldbase_raregeno_word >>= 2;
+      if (!(ldbase_idx % kBitsPerWordD2)) {
+	ldbase_raregeno_word = ldbase_raregeno[ldbase_idx / kBitsPerWordD2];
+      }
+      ++ldbase_idx;
+    }
+    if (next_difflist_sample_idx <= next_ldbase_sample_idx) {
+      difflist_raregeno_word >>= 2;
+      if (!(difflist_idx % kBitsPerWordD2)) {
+	difflist_raregeno_word = difflist_raregeno[difflist_idx / kBitsPerWordD2];
+      }
+      new_sample_idx = next_difflist_sample_idx;
+      ++difflist_idx;
+      cur_geno = difflist_raregeno_word & 3;
+      next_difflist_sample_idx = difflist_sample_ids[difflist_idx];
+      if (next_ldbase_sample_idx == new_sample_idx) {
+	next_ldbase_sample_idx = ldbase_sample_ids[ldbase_idx];
+	if (cur_geno == (ldbase_raregeno_word & 3)) {
+	  continue;
+	}
+      }
+    } else {
+      cur_geno = ldbase_common_geno;
+      new_sample_idx = next_ldbase_sample_idx;
+      next_ldbase_sample_idx = ldbase_sample_ids[ldbase_idx];
+    }
+    raregeno_write_word |= cur_geno << (2 * (diff_written_ct % kBitsPerWordD2));
+    if (!(diff_written_ct % kPglDifflistGroupSize)) {
+      group_first_sample_ids_iter = (unsigned char*)memcpya(group_first_sample_ids_iter, &new_sample_idx, sample_id_byte_ct);
+      if (diff_written_ct) {
+	*extra_byte_cts_iter++ = ((uintptr_t)(fwrite_bufp - last_group_vint_start)) - (kPglDifflistGroupSize - 1);
+      }
+      last_group_vint_start = fwrite_bufp;
+    } else {
+      fwrite_bufp = vint32_append(new_sample_idx - last_sample_idx, fwrite_bufp);
+    }
+    last_sample_idx = new_sample_idx;    
+    ++diff_written_ct;
+    if (!(diff_written_ct % kBitsPerWordD2)) {
+      *raregeno_write_iter++ = raregeno_write_word;
+      raregeno_write_word = 0;
+    }
+  }
+  if (diff_written_ct % kBitsPerWordD2) {
+    memcpy(raregeno_write_iter, &raregeno_write_word, 1 + (((ld_diff_ct - 1) / 4) % kBytesPerWord));
+  }
+  pwcp->fwrite_bufp = fwrite_bufp;
+  return (uint32_t)((uintptr_t)(fwrite_bufp - fwrite_bufp_start));
+}
+
+uint32_t save_ld_input_list(pgen_writer_common_t* pwcp) {
+  // simply "copies" ldbase_{raregeno,difflist_sample_ids,difflist_len} to the
+  // write buffer.
+  unsigned char* fwrite_bufp = pwcp->fwrite_bufp;
+  const uint32_t difflist_len = pwcp->ldbase_difflist_len;
+  if (!difflist_len) {
+    *fwrite_bufp = 0;
+    pwcp->fwrite_bufp = &(fwrite_bufp[1]);
+    return 1;
+  }
+  unsigned char* fwrite_bufp_start = fwrite_bufp;
+  fwrite_bufp = vint32_append(difflist_len, fwrite_bufp);
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(pwcp->sample_ct);
+  const uint32_t group_ct = DIV_UP(difflist_len, kPglDifflistGroupSize);
+  const uint32_t* __restrict difflist_sample_ids = pwcp->ldbase_difflist_sample_ids;
+  unsigned char* group_first_sample_ids_iter = fwrite_bufp;
+  unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]);
+  fwrite_bufp = (unsigned char*)memcpya(&(extra_byte_cts_iter[group_ct - 1]), pwcp->ldbase_raregeno, QUATERCT_TO_BYTECT(difflist_len));
+  unsigned char* last_group_vint_start = nullptr;
+  uint32_t last_sample_idx = 0;
+  for (uint32_t difflist_idx = 0; difflist_idx < difflist_len; ++difflist_idx) {
+    const uint32_t new_sample_idx = difflist_sample_ids[difflist_idx];
+    if (!(difflist_idx % kPglDifflistGroupSize)) {
+      group_first_sample_ids_iter = (unsigned char*)memcpya(group_first_sample_ids_iter, &new_sample_idx, sample_id_byte_ct);
+      if (difflist_idx) {
+	*extra_byte_cts_iter++ = ((uintptr_t)(fwrite_bufp - last_group_vint_start)) - (kPglDifflistGroupSize - 1);
+      }
+      last_group_vint_start = fwrite_bufp;
+    } else {
+      // assert(new_sample_idx >= last_sample_idx + 1);
+      fwrite_bufp = vint32_append(new_sample_idx - last_sample_idx, fwrite_bufp);
+    }
+    last_sample_idx = new_sample_idx;
+  }
+  pwcp->fwrite_bufp = fwrite_bufp;
+  return (uint32_t)((uintptr_t)(fwrite_bufp - fwrite_bufp_start));
+}
+
+uint32_t pwc_append_biallelic_difflist_limited_main(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t vidx, uint32_t difflist_common_geno, uint32_t difflist_len, pgen_writer_common_t* pwcp, unsigned char* vrtype_ptr) {
+  const uint32_t sample_ct = pwcp->sample_ct;
+  // caller's responsibility not to exceed this limit
+  assert(difflist_len <= 2 * (sample_ct / kPglMaxDifflistLenDivisor));
+
+  // trailing bits of raregeno must be zeroed out
+
+  assert(difflist_common_geno < 4);
+#ifndef NDEBUG
+  if (pwcp->allele_idx_offsets) {
+    assert(pwcp->allele_idx_offsets[vidx + 1] == pwcp->allele_idx_offsets[vidx] + 2);
+  }
+#endif
+  assert((!(difflist_len % kBitsPerWordD2)) || (!(raregeno[difflist_len / kBitsPerWordD2] >> (2 * (difflist_len % kBitsPerWordD2)))));
+  assert(difflist_sample_ids[difflist_len] == sample_ct);
+  uint32_t genocounts[4];
+  genovec_count_freqs_unsafe(raregeno, difflist_len, genocounts);
+  assert(!genocounts[difflist_common_geno]);
+  genocounts[difflist_common_geno] = sample_ct - difflist_len;
+  uint32_t second_most_common_geno = difflist_common_geno? 0 : 1;
+  uint32_t second_largest_geno_ct = genocounts[second_most_common_geno];
+  for (uint32_t cur_geno = second_most_common_geno + 1; cur_geno < 4; ++cur_geno) {
+    if (cur_geno == difflist_common_geno) {
+      continue;
+    }
+    const uint32_t cur_geno_ct = genocounts[cur_geno];
+    if (cur_geno_ct > second_largest_geno_ct) {
+      second_most_common_geno = cur_geno;
+      second_largest_geno_ct = cur_geno_ct;
+    }
+  }
+  const uint32_t rare_2_geno_ct_sum = difflist_len - second_largest_geno_ct;
+  const uint32_t sample_ctd8 = sample_ct / 8;
+  const uint32_t sample_ctd64 = sample_ct / 64;
+  uint32_t max_difflist_len = sample_ctd8 - 2 * sample_ctd64 + rare_2_geno_ct_sum;
+  if (max_difflist_len > sample_ctd8) {
+    max_difflist_len = sample_ctd8;
+  }
+  const uint32_t difflist_viable = (difflist_common_geno != 1) && (difflist_len <= max_difflist_len);
+  uint32_t* ldbase_genocounts = pwcp->ldbase_genocounts;
+  if (!(vidx % kPglVblockSize)) {
+    pwcp->vblock_fpos[vidx / kPglVblockSize] = pwcp->vblock_fpos_offset + (uintptr_t)(pwcp->fwrite_bufp - pwcp->fwrite_buf);
+  } else if (difflist_len > sample_ctd64) {
+    const uint32_t ld_diff_threshold = difflist_viable? (difflist_len - sample_ctd64) : max_difflist_len;
+    // number of changes between current genovec and LD reference is bounded
+    // below by sum(|genocounts[x] - ldbase_genocounts[x]|) / 2
+    const int32_t count02_limit = 2 * ld_diff_threshold - abs_int32(genocounts[1] - ldbase_genocounts[1]) - abs_int32(genocounts[3] - ldbase_genocounts[3]);
+    if ((((int32_t)(abs_int32(genocounts[0] - ldbase_genocounts[0]) + abs_int32(genocounts[2] - ldbase_genocounts[2]))) < count02_limit) || (((int32_t)(abs_int32(genocounts[0] - ldbase_genocounts[2]) + abs_int32(genocounts[2] - ldbase_genocounts[0]))) < count02_limit)) {
+      uint32_t ld_diff_ct;
+      uint32_t ld_inv_diff_ct;
+      if (pwcp->ldbase_common_geno < 4) {
+	pwcp->ldbase_difflist_sample_ids[pwcp->ldbase_difflist_len] = sample_ct;
+	if (count_ld_and_inverted_ld_diffs_list(pwcp->ldbase_raregeno, pwcp->ldbase_difflist_sample_ids, raregeno, difflist_sample_ids, pwcp->ldbase_difflist_len, difflist_len, &ld_diff_ct, &ld_inv_diff_ct)) {
+	  const uint32_t difflist_common_geno_inv = (6 - difflist_common_geno) & 3;
+	  if (pwcp->ldbase_common_geno != difflist_common_geno) {
+	    ld_diff_ct = ld_diff_threshold;
+	  }
+	  if (pwcp->ldbase_common_geno != difflist_common_geno_inv) {
+	    ld_inv_diff_ct = ld_diff_threshold;
+	  }
+	  if ((ld_diff_ct < ld_diff_threshold) || (ld_inv_diff_ct < ld_diff_threshold)) {
+	    const uintptr_t invert_before_compressing = (ld_inv_diff_ct < ld_diff_ct);
+	    *vrtype_ptr = 2 + invert_before_compressing;
+	    if (invert_before_compressing) {
+	      genovec_invert_copy_unsafe(raregeno, difflist_len, pwcp->genovec_invert_buf);
+	      // difflist_common_geno = difflist_common_geno_inv;
+	      ld_diff_ct = ld_inv_diff_ct;
+	    }
+	    return save_ld_two_list_delta(invert_before_compressing? pwcp->genovec_invert_buf : raregeno, difflist_sample_ids, ld_diff_ct, pwcp);
+	  }
+	}
+      } else {
+	uintptr_t* __restrict genobuf = pwcp->genovec_invert_buf;
+	pgr_difflist_to_genovec_unsafe(raregeno, difflist_sample_ids, difflist_common_geno, sample_ct, difflist_len, genobuf);
+	zero_trailing_quaters(sample_ct, genobuf);
+	count_ld_and_inverted_ld_diffs(pwcp->ldbase_genovec, genobuf, sample_ct, &ld_diff_ct, &ld_inv_diff_ct);
+	if ((ld_diff_ct < ld_diff_threshold) || (ld_inv_diff_ct < ld_diff_threshold)) {
+	  const uintptr_t invert_before_compressing = (ld_inv_diff_ct < ld_diff_ct);
+	  *vrtype_ptr = 2 + invert_before_compressing;
+	  if (invert_before_compressing) {
+	    genovec_invert_unsafe(sample_ct, genobuf);
+	    ld_diff_ct = ld_inv_diff_ct;
+	  }
+	  return save_ld_difflist(genobuf, pwcp->ldbase_genovec, 0, ld_diff_ct, pwcp);
+	}
+      }
+    }
+  }
+  memcpy(ldbase_genocounts, genocounts, 4 * sizeof(int32_t));
+  if (difflist_viable) {
+    *vrtype_ptr = 4 + difflist_common_geno;
+    memcpy(pwcp->ldbase_raregeno, raregeno, QUATERCT_TO_BYTECT(difflist_len));
+    memcpy(pwcp->ldbase_difflist_sample_ids, difflist_sample_ids, difflist_len * sizeof(int32_t));
+    // memcpy(pwcp->ldbase_difflist_sample_ids, difflist_sample_ids, (difflist_len + 1) * sizeof(int32_t));
+    pwcp->ldbase_common_geno = difflist_common_geno;
+    pwcp->ldbase_difflist_len = difflist_len;
+    return save_ld_input_list(pwcp);
+  }
+  pwcp->ldbase_common_geno = 0xffffffffU;
+  const uint32_t use_onebit = (rare_2_geno_ct_sum < sample_ct / (2 * kPglMaxDifflistLenDivisor));
+  uintptr_t* genobuf = use_onebit? pwcp->genovec_invert_buf : pwcp->ldbase_genovec;
+  pgr_difflist_to_genovec_unsafe(raregeno, difflist_sample_ids, difflist_common_geno, sample_ct, difflist_len, genobuf);
+  zero_trailing_quaters(sample_ct, genobuf);
+  *vrtype_ptr = use_onebit;
+  if (use_onebit) {
+    uint32_t larger_common_geno = second_most_common_geno;
+    uint32_t smaller_common_geno = difflist_common_geno;
+    if (difflist_common_geno > second_most_common_geno) {
+      larger_common_geno = difflist_common_geno;
+      smaller_common_geno = second_most_common_geno;
+    }
+    const uint32_t vrec_len = save_onebit(genobuf, larger_common_geno + (smaller_common_geno * 3), rare_2_geno_ct_sum, pwcp);
+    memcpy(pwcp->ldbase_genovec, genobuf, QUATERCT_TO_WORDCT(sample_ct) * sizeof(uintptr_t));
+    return vrec_len;
+  }
+  const uint32_t vrec_len = QUATERCT_TO_BYTECT(sample_ct);
+  pwcp->fwrite_bufp = (unsigned char*)memcpya(pwcp->fwrite_bufp, genobuf, vrec_len);
+  return vrec_len;
+}
+
+void pwc_append_biallelic_difflist_limited(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t difflist_common_geno, uint32_t difflist_len, pgen_writer_common_t* pwcp) {
+  const uint32_t vidx = pwcp->vidx;
+  unsigned char vrtype;
+  const uint32_t vrec_len = pwc_append_biallelic_difflist_limited_main(raregeno, difflist_sample_ids, vidx, difflist_common_geno, difflist_len, pwcp, &vrtype);
+  const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+  pwcp->vidx += 1;
+  memcpy(&(pwcp->vrec_len_buf[vidx * pwcp->vrec_len_byte_ct]), &vrec_len, vrec_len_byte_ct);
+  if (!pwcp->phase_dosage_gflags) {
+    pwcp->vrtype_buf[vidx / kBitsPerWordD4] |= ((uintptr_t)vrtype) << (4 * (vidx % kBitsPerWordD4));
+  } else {
+    ((unsigned char*)pwcp->vrtype_buf)[vidx] = vrtype;
+  }
+}
+
+pglerr_t spgw_append_biallelic_difflist_limited(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t difflist_common_geno, uint32_t difflist_len, st_pgen_writer_t* spgwp) {
+  // trailing bits of raregeno must be zeroed out
+
+  // flush write buffer if necessary
+  if (spgwp->pwc.fwrite_bufp >= &(spgwp->pwc.fwrite_buf[kPglFwriteBlockSize])) {
+    const uintptr_t cur_byte_ct = (uintptr_t)(spgwp->pwc.fwrite_bufp - spgwp->pwc.fwrite_buf);
+    if (fwrite_checked(spgwp->pwc.fwrite_buf, cur_byte_ct, spgwp->pgen_outfile)) {
+      return kPglRetWriteFail;
+    }
+    spgwp->pwc.vblock_fpos_offset += cur_byte_ct;
+    spgwp->pwc.fwrite_bufp = spgwp->pwc.fwrite_buf;
+  }
+  pwc_append_biallelic_difflist_limited(raregeno, difflist_sample_ids, difflist_common_geno, difflist_len, &(spgwp->pwc));
+  return kPglRetSuccess;
+}
+
+
+pglerr_t spgw_append_multiallelic_counts(__attribute__((unused)) const uintptr_t** __restrict alt_countvecs) {
+  // todo
+  return kPglRetNotYetSupported;
+}
+
+
+void append_hphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, uint32_t het_ct, uint32_t phasepresent_ct, pgen_writer_common_t* pwcp, unsigned char* vrtype_ptr, uint32_t* vrec_len_ptr) {
+  assert(phasepresent_ct);
+  const uint32_t sample_ct = pwcp->sample_ct;
+  *vrtype_ptr += 16;
+  const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+#ifdef __arm__
+  #error "Unaligned accesses in append_hphase()."
+#endif
+  uintptr_t* fwrite_bufp_alias = (uintptr_t*)pwcp->fwrite_bufp;
+  uintptr_t phaseinfo_write_word = 0;
+  uint32_t phaseinfo_write_idx_lowbits;
+  unsigned char* fwrite_bufp_final;
+  if (het_ct == phasepresent_ct) {
+    // no need to write phasepresent; just write phaseinfo directly to output
+    // buffer
+    phaseinfo_write_idx_lowbits = 1;
+    for (uint32_t widx = 0; widx < sample_ctl2; ++widx) {
+      const uintptr_t geno_word = genovec[widx];
+      uintptr_t geno_hets = (~(geno_word >> 1)) & geno_word & kMask5555;
+      if (geno_hets) {
+	const uint32_t phaseinfo_halfword = ((const halfword_t*)phaseinfo)[widx];
+	do {
+	  const uint32_t sample_idx_lowbits = CTZLU(geno_hets) / 2;
+	  phaseinfo_write_word |= ((uintptr_t)((phaseinfo_halfword >> sample_idx_lowbits) & k1LU)) << phaseinfo_write_idx_lowbits;
+	  if (++phaseinfo_write_idx_lowbits == kBitsPerWord) {
+	    *fwrite_bufp_alias++ = phaseinfo_write_word;
+	    phaseinfo_write_word = 0;
+	    phaseinfo_write_idx_lowbits = 0;
+	  }
+	  geno_hets &= geno_hets - k1LU;
+	} while (geno_hets);
+      }
+    }
+    fwrite_bufp_final = (unsigned char*)fwrite_bufp_alias;
+  } else {
+    uintptr_t* phaseinfo_tmp = pwcp->genovec_invert_buf;
+    uintptr_t* phaseinfo_tmp_iter = phaseinfo_tmp;
+    uint32_t phasepresent_write_idx_lowbits = 1;
+    phaseinfo_write_idx_lowbits = 0;
+    uintptr_t phasepresent_write_word = 1;
+    for (uint32_t widx = 0; widx < sample_ctl2; ++widx) {
+      const uintptr_t geno_word = genovec[widx];
+      uintptr_t geno_hets = (~(geno_word >> 1)) & geno_word & kMask5555;
+      if (geno_hets) {
+	const uint32_t phasepresent_halfword = ((const halfword_t*)phasepresent)[widx];
+	if (phasepresent_halfword) {
+	  const uint32_t phaseinfo_halfword = ((const halfword_t*)phaseinfo)[widx];
+	  do {
+	    const uint32_t sample_idx_lowbits = CTZLU(geno_hets) / 2;
+	    if ((phasepresent_halfword >> sample_idx_lowbits) & 1) {
+	      phasepresent_write_word |= k1LU << phasepresent_write_idx_lowbits;
+	      phaseinfo_write_word |= ((uintptr_t)((phaseinfo_halfword >> sample_idx_lowbits) & k1LU)) << phaseinfo_write_idx_lowbits;
+	      if (++phaseinfo_write_idx_lowbits == kBitsPerWord) {
+		*phaseinfo_tmp_iter++ = phaseinfo_write_word;
+		phaseinfo_write_word = 0;
+		phaseinfo_write_idx_lowbits = 0;
+	      }
+	    }
+	    if (++phasepresent_write_idx_lowbits == kBitsPerWord) {
+	      *fwrite_bufp_alias++ = phasepresent_write_word;
+	      phasepresent_write_word = 0;
+	      phasepresent_write_idx_lowbits = 0;
+	    }
+	    geno_hets &= geno_hets - k1LU;
+	  } while (geno_hets);
+	} else {
+	  phasepresent_write_idx_lowbits += popcount_long(geno_hets);
+	  if (phasepresent_write_idx_lowbits >= kBitsPerWord) {
+	    *fwrite_bufp_alias++ = phasepresent_write_word;
+	    phasepresent_write_word = 0;
+	    phasepresent_write_idx_lowbits -= kBitsPerWord;
+	  }
+	}
+      }
+    }
+    fwrite_bufp_final = (unsigned char*)fwrite_bufp_alias;
+    if (phasepresent_write_idx_lowbits) {
+      fwrite_bufp_final = (unsigned char*)memcpya(fwrite_bufp_final, &phasepresent_write_word, DIV_UP(phasepresent_write_idx_lowbits, CHAR_BIT));
+    }
+    fwrite_bufp_final = (unsigned char*)memcpya(fwrite_bufp_final, phaseinfo_tmp, sizeof(intptr_t) * (phaseinfo_tmp_iter - phaseinfo_tmp));
+  }
+  if (phaseinfo_write_idx_lowbits) {
+    fwrite_bufp_final = (unsigned char*)memcpya(fwrite_bufp_final, &phaseinfo_write_word, DIV_UP(phaseinfo_write_idx_lowbits, CHAR_BIT));
+  }
+#ifdef __LP64__
+  assert(((*vrec_len_ptr) + (uintptr_t)(fwrite_bufp_final - pwcp->fwrite_bufp)) <= kPglMaxBytesPerVariant);
+#endif
+  *vrec_len_ptr += fwrite_bufp_final - pwcp->fwrite_bufp;
+  pwcp->fwrite_bufp = fwrite_bufp_final;
+}
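+
+// Hardcall phase track layout produced by append_hphase(), reconstructed from
+// the code above: bit 0 is 1 iff an explicit phasepresent bitvector is
+// stored.  If it is 0 (every het is phased), the phaseinfo bits for the hets
+// are packed starting at bit 1.  If it is 1, phasepresent bits for each het
+// follow starting at bit 1, and then one phaseinfo bit per *phased* het
+// begins at the next byte boundary.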
+
+void pwc_append_biallelic_genovec_hphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, pgen_writer_common_t* pwcp) {
+  // assumes phase_dosage_gflags is nonzero
+  const uint32_t vidx = pwcp->vidx;
+  unsigned char* vrtype_dest = &(((unsigned char*)pwcp->vrtype_buf)[vidx]);
+  uint32_t het_ct;
+  uint32_t vrec_len = pwc_append_biallelic_genovec_main(genovec, vidx, pwcp, &het_ct, vrtype_dest);
+  const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+  const uint32_t sample_ct = pwcp->sample_ct;
+  const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+  pwcp->vidx += 1;
+  unsigned char* vrec_len_dest = &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]);
+  const uint32_t phasepresent_ct = phasepresent? ((uint32_t)popcount_longs(phasepresent, sample_ctl)) : het_ct;
+  if (phasepresent_ct) {
+    append_hphase(genovec, phasepresent, phaseinfo, het_ct, phasepresent_ct, pwcp, vrtype_dest, &vrec_len);
+  }
+  memcpy(vrec_len_dest, &vrec_len, vrec_len_byte_ct);
+}
+
+pglerr_t spgw_append_biallelic_genovec_hphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, st_pgen_writer_t* spgwp) {
+  // flush write buffer if necessary
+  if (spgwp->pwc.fwrite_bufp >= &(spgwp->pwc.fwrite_buf[kPglFwriteBlockSize])) {
+    const uintptr_t cur_byte_ct = (uintptr_t)(spgwp->pwc.fwrite_bufp - spgwp->pwc.fwrite_buf);
+    if (fwrite_checked(spgwp->pwc.fwrite_buf, cur_byte_ct, spgwp->pgen_outfile)) {
+      return kPglRetWriteFail;
+    }
+    spgwp->pwc.vblock_fpos_offset += cur_byte_ct;
+    spgwp->pwc.fwrite_bufp = spgwp->pwc.fwrite_buf;
+  }
+  pwc_append_biallelic_genovec_hphase(genovec, phasepresent, phaseinfo, &(spgwp->pwc));
+  return kPglRetSuccess;
+}
+
+
+uint32_t pwc_append_deltalist(const uintptr_t* delta_bitarr, uint32_t deltalist_len, pgen_writer_common_t* pwcp) {
+  assert(deltalist_len);
+  unsigned char* fwrite_bufp = pwcp->fwrite_bufp;
+  unsigned char* fwrite_bufp_start = fwrite_bufp;
+  fwrite_bufp = vint32_append(deltalist_len, fwrite_bufp);
+  const uint32_t sample_id_byte_ct = bytes_to_represent_ui(pwcp->sample_ct);
+  const uint32_t group_ct = DIV_UP(deltalist_len, kPglDifflistGroupSize);
+  unsigned char* group_first_sample_ids_iter = fwrite_bufp;
+  unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]);
+  fwrite_bufp = &(extra_byte_cts_iter[group_ct - 1]);
+  unsigned char* last_group_vint_start = nullptr;
+  uint32_t last_sample_idx = 0;
+  uint32_t new_sample_idx = 0;
+  for (uint32_t deltalist_idx = 0; deltalist_idx < deltalist_len; ++deltalist_idx, ++new_sample_idx) {
+    next_set_unsafe_ck(delta_bitarr, &new_sample_idx);
+    if (!(deltalist_idx % kPglDifflistGroupSize)) {
+      group_first_sample_ids_iter = (unsigned char*)memcpya(group_first_sample_ids_iter, &new_sample_idx, sample_id_byte_ct);
+      if (deltalist_idx) {
+	*extra_byte_cts_iter++ = ((uintptr_t)(fwrite_bufp - last_group_vint_start)) - (kPglDifflistGroupSize - 1);
+      }
+      last_group_vint_start = fwrite_bufp;
+    } else {
+      assert(new_sample_idx >= last_sample_idx + 1);
+      fwrite_bufp = vint32_append(new_sample_idx - last_sample_idx, fwrite_bufp);
+    }
+    last_sample_idx = new_sample_idx;
+  }
+  pwcp->fwrite_bufp = fwrite_bufp;
+  return (uint32_t)((uintptr_t)(fwrite_bufp - fwrite_bufp_start));
+}
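+
+// pwc_append_deltalist() emits the same group/vint encoding as
+// save_ld_difflist(), minus the 2-bit raregeno section: vint length, group
+// first sample IDs, extra-byte counts, then per-entry sample ID deltas.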
+
+void append_dosage16(const uintptr_t* __restrict dosage_present, const uint16_t* dosage_vals, uint32_t dosage_ct, pgen_writer_common_t* pwcp, unsigned char* vrtype_ptr, uint32_t* vrec_len_ptr) {
+  const uint32_t sample_ct = pwcp->sample_ct;
+  const uint32_t max_deltalist_entry_ct = sample_ct / kPglMaxDeltalistLenDivisor;
+  if (dosage_ct <= max_deltalist_entry_ct) {
+    // case 1: store dosage IDs as deltalist.
+    *vrec_len_ptr += pwc_append_deltalist(dosage_present, dosage_ct, pwcp);
+    *vrtype_ptr += 0x20;
+  } else if (dosage_ct == sample_ct) {
+    // case 2: fixed-width, no need to store dosage IDs at all.
+    // dosage_vals permitted to have 65535 = missing
+    *vrtype_ptr += 0x40;
+  } else {
+    // case 3: save dosage_present bitarray directly.
+    const uint32_t sample_ctb = DIV_UP(sample_ct, CHAR_BIT);
+    *vrec_len_ptr += sample_ctb;
+    pwcp->fwrite_bufp = (unsigned char*)memcpya(pwcp->fwrite_bufp, dosage_present, sample_ctb);
+    *vrtype_ptr += 0x60;
+  }
+  pwcp->fwrite_bufp = (unsigned char*)memcpya(pwcp->fwrite_bufp, dosage_vals, dosage_ct * sizeof(int16_t));
+  *vrec_len_ptr += dosage_ct * sizeof(int16_t);
+}
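+
+// Rough byte costs of the three cases above, for dosage_ct d and sample_ct n:
+// case 1 (deltalist) ~ d * (2 value bytes + ~1 vint delta byte) plus group
+// overhead; case 2 (fixed-width) = 2n value bytes; case 3 (bitarray) =
+// DIV_UP(n, 8) + 2d bytes.  The n / kPglMaxDeltalistLenDivisor threshold is
+// roughly the point where case 1 stops beating case 3.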
+
+void pwc_append_biallelic_genovec_dosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_vals, uint32_t dosage_ct, pgen_writer_common_t* pwcp) {
+  // safe to call this even when entire file has no phase/dosage info
+  const uint32_t vidx = pwcp->vidx;
+  unsigned char vrtype;
+  uint32_t het_ct; // dummy
+  uint32_t vrec_len = pwc_append_biallelic_genovec_main(genovec, vidx, pwcp, &het_ct, &vrtype);
+  const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+  pwcp->vidx += 1;
+  unsigned char* vrec_len_dest = &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]);
+  if (dosage_ct) {
+    append_dosage16(dosage_present, dosage_vals, dosage_ct, pwcp, &vrtype, &vrec_len);
+  }
+  memcpy(vrec_len_dest, &vrec_len, vrec_len_byte_ct);
+  if (!pwcp->phase_dosage_gflags) {
+    pwcp->vrtype_buf[vidx / kBitsPerWordD4] |= ((uintptr_t)vrtype) << (4 * (vidx % kBitsPerWordD4));
+  } else {
+    ((unsigned char*)pwcp->vrtype_buf)[vidx] = vrtype;
+  }
+}
+
+pglerr_t spgw_append_biallelic_genovec_dosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_vals, uint32_t dosage_ct, st_pgen_writer_t* spgwp) {
+  // flush write buffer if necessary
+  if (spgwp->pwc.fwrite_bufp >= &(spgwp->pwc.fwrite_buf[kPglFwriteBlockSize])) {
+    const uintptr_t cur_byte_ct = (uintptr_t)(spgwp->pwc.fwrite_bufp - spgwp->pwc.fwrite_buf);
+    if (fwrite_checked(spgwp->pwc.fwrite_buf, cur_byte_ct, spgwp->pgen_outfile)) {
+      return kPglRetWriteFail;
+    }
+    spgwp->pwc.vblock_fpos_offset += cur_byte_ct;
+    spgwp->pwc.fwrite_bufp = spgwp->pwc.fwrite_buf;
+  }
+  pwc_append_biallelic_genovec_dosage16(genovec, dosage_present, dosage_vals, dosage_ct, &(spgwp->pwc));
+  return kPglRetSuccess;
+}
+
+void pwc_append_biallelic_genovec_hphase_dosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_vals, uint32_t dosage_ct, pgen_writer_common_t* pwcp) {
+  // assumes there is phase and/or dosage data in output file, otherwise
+  // vrtype_dest needs to be replaced
+  
+  // this mostly overlaps with pwc_append_biallelic_genovec_hphase(); probably
+  // get rid of the latter
+  const uint32_t vidx = pwcp->vidx;
+  unsigned char* vrtype_dest = &(((unsigned char*)pwcp->vrtype_buf)[vidx]);
+  uint32_t het_ct;
+  uint32_t vrec_len = pwc_append_biallelic_genovec_main(genovec, vidx, pwcp, &het_ct, vrtype_dest);
+  const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+  const uint32_t sample_ct = pwcp->sample_ct;
+  const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+  pwcp->vidx += 1;
+  unsigned char* vrec_len_dest = &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]);
+  const uint32_t phasepresent_ct = phasepresent? ((uint32_t)popcount_longs(phasepresent, sample_ctl)) : het_ct;
+  if (phasepresent_ct) {
+    append_hphase(genovec, phasepresent, phaseinfo, het_ct, phasepresent_ct, pwcp, vrtype_dest, &vrec_len);
+  }
+  if (dosage_ct) {
+    append_dosage16(dosage_present, dosage_vals, dosage_ct, pwcp, vrtype_dest, &vrec_len);
+  }
+  memcpy(vrec_len_dest, &vrec_len, vrec_len_byte_ct);
+}
+
+pglerr_t spgw_append_biallelic_genovec_hphase_dosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_vals, uint32_t dosage_ct, st_pgen_writer_t* spgwp) {
+  // flush write buffer if necessary
+  if (spgwp->pwc.fwrite_bufp >= &(spgwp->pwc.fwrite_buf[kPglFwriteBlockSize])) {
+    const uintptr_t cur_byte_ct = (uintptr_t)(spgwp->pwc.fwrite_bufp - spgwp->pwc.fwrite_buf);
+    if (fwrite_checked(spgwp->pwc.fwrite_buf, cur_byte_ct, spgwp->pgen_outfile)) {
+      return kPglRetWriteFail;
+    }
+    spgwp->pwc.vblock_fpos_offset += cur_byte_ct;
+    spgwp->pwc.fwrite_bufp = spgwp->pwc.fwrite_buf;
+  }
+  pwc_append_biallelic_genovec_hphase_dosage16(genovec, phasepresent, phaseinfo, dosage_present, dosage_vals, dosage_ct, &(spgwp->pwc));
+  return kPglRetSuccess;
+}
+
+/*
+void append_dphase16(const uintptr_t* __restrict dosage_present, const uintptr_t* __restrict dphase_present, const uint16_t* __restrict dosage_vals, uint32_t dosage_ct, uint32_t dphase_ct, pgen_writer_common_t* pwcp, unsigned char* vrtype_ptr, uint32_t* vrec_len_ptr) {
+  if (!dphase_ct) {
+    append_dosage16(dosage_present, dosage_vals, dosage_ct, pwcp, vrtype_ptr, vrec_len_ptr);
+    return;
+  }
+  const uint32_t sample_ct = pwcp->sample_ct;
+  const uint32_t max_deltalist_entry_ct = sample_ct / kPglMaxDeltalistLenDivisor;
+  if (dosage_ct <= max_deltalist_entry_ct) {
+    // case 1: store dosage IDs as deltalist.
+    *vrec_len_ptr += pwc_append_deltalist(dosage_present, dosage_ct, pwcp);
+    *vrtype_ptr += 0x20;
+  } else if (dosage_ct == sample_ct) {
+    // case 2: fixed-width, no need to store dosage IDs at all.
+    // dosage_vals permitted to have 65535 = missing
+    *vrtype_ptr += 0x40;
+  } else {
+    // case 3: save dosage_present bitarray directly.
+    const uint32_t sample_ctb = DIV_UP(sample_ct, CHAR_BIT);
+    *vrec_len_ptr += sample_ctb;
+    pwcp->fwrite_bufp = (unsigned char*)memcpya(pwcp->fwrite_bufp, dosage_present, sample_ctb);
+    *vrtype_ptr += 0x60;
+  }
+  *vrtype_ptr += 0x80;
+  if (dosage_ct == dphase_ct) {
+    *(pwcp->fwrite_bufp)++ = 0;
+    pwcp->fwrite_bufp = (unsigned char*)memcpya(pwcp->fwrite_bufp, dosage_vals, dphase_ct * 2 * sizeof(int16_t));
+    *vrec_len_ptr += 1 + (dphase_ct * 2 * sizeof(int16_t));
+  } else {
+    uintptr_t* dphase_present_tmp_write_iter = pwcp->genovec_invert_buf;
+    const uint32_t dosage_ctp1b = 1 + (dosage_ct / CHAR_BIT);
+    const uint32_t widx_last = dosage_ct / kBitsPerWord;
+    uintptr_t dphase_present_write_word = 1;
+    uint32_t sample_idx = 0;
+    uint32_t dosage_idx_lowbits = 1;
+    uint32_t widx = 0;
+    uint32_t loop_end = kBitsPerWord;
+    while (1) {
+      if (widx >= widx_last) {
+	if (widx > widx_last) {
+	  break;
+	}
+	loop_end = 1 + (dosage_ct % kBitsPerWord);
+      }
+      for (; dosage_idx_lowbits < loop_end; ++dosage_idx_lowbits, ++sample_idx) {
+	next_set_unsafe_ck(dosage_present, &sample_idx);
+	if (IS_SET(dphase_present, sample_idx)) {
+	  dphase_present_write_word |= k1LU << dosage_idx_lowbits;
+	}
+      }
+      *dphase_present_tmp_write_iter++ = dphase_present_write_word;
+      dphase_present_write_word = 0;
+      dosage_idx_lowbits = 0;
+      ++widx;
+    }
+    char* cur_write_iter = memcpya(pwcp->fwrite_bufp, pwcp->genovec_invert_buf, dosage_ctp1b);
+    cur_write_iter = memcpya(cur_write_iter, dosage_vals, (dosage_ct + dphase_ct) * sizeof(int16_t));
+    *vrec_len_ptr += (uintptr_t)(cur_write_iter - pwcp->fwrite_bufp);
+    pwcp->fwrite_bufp = (unsigned char*)cur_write_iter;
+  }
+}
+
+void pwc_append_biallelic_genovec_dphase16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uintptr_t* __restrict dphase_present, const uint16_t* __restrict dosage_vals, uint32_t dosage_ct, uint32_t dphase_ct, pgen_writer_common_t* pwcp) {
+  // assumes there is phase and/or dosage data in output file, otherwise
+  // vrtype_dest needs to be replaced
+  const uint32_t vidx = pwcp->vidx;
+  unsigned char* vrtype_dest = &(((unsigned char*)pwcp->vrtype_buf)[vidx]);
+  uint32_t het_ct;
+  uint32_t vrec_len = pwc_append_biallelic_genovec_main(genovec, vidx, pwcp, &het_ct, vrtype_dest);
+  const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+  const uint32_t sample_ct = pwcp->sample_ct;
+  const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+  pwcp->vidx += 1;
+  unsigned char* vrec_len_dest = &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]);
+  const uint32_t phasepresent_ct = phasepresent? ((uint32_t)popcount_longs(phasepresent, sample_ctl)) : het_ct;
+  if (phasepresent_ct) {
+    append_hphase(genovec, phasepresent, phaseinfo, het_ct, phasepresent_ct, pwcp, vrtype_dest, &vrec_len);
+  }
+  if (dosage_ct) {
+    append_dphase16(dosage_present, dphase_present, dosage_vals, dosage_ct, dphase_ct, pwcp, vrtype_dest, &vrec_len);
+  }
+  memcpy(vrec_len_dest, &vrec_len, vrec_len_byte_ct);
+}
+
+pglerr_t spgw_append_biallelic_genovec_dphase16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uintptr_t* dphase_present, const uint16_t* dosage_vals, uint32_t dosage_ct, uint32_t dphase_ct, st_pgen_writer_t* spgwp) {
+  // flush write buffer if necessary
+  if (spgwp->pwc.fwrite_bufp >= &(spgwp->pwc.fwrite_buf[kPglFwriteBlockSize])) {
+    const uintptr_t cur_byte_ct = (uintptr_t)(spgwp->pwc.fwrite_bufp - spgwp->pwc.fwrite_buf);
+    if (fwrite_checked(spgwp->pwc.fwrite_buf, cur_byte_ct, spgwp->pgen_outfile)) {
+      return kPglRetWriteFail;
+    }
+    spgwp->pwc.vblock_fpos_offset += cur_byte_ct;
+    spgwp->pwc.fwrite_bufp = spgwp->pwc.fwrite_buf;
+  }
+  pwc_append_biallelic_genovec_dphase16(genovec, phasepresent, phaseinfo, dosage_present, dphase_present, dosage_vals, dosage_ct, dphase_ct, &(spgwp->pwc));
+  return kPglRetSuccess;
+}
+*/
+
+pglerr_t pwc_finish(pgen_writer_common_t* pwcp, FILE** pgen_outfile_ptr) {
+  const uint32_t variant_ct = pwcp->variant_ct;
+  assert(pwcp->vidx == variant_ct);
+  FILE* pgen_outfile = *pgen_outfile_ptr;
+  if (fseeko(pgen_outfile, 12, SEEK_SET)) {
+    return kPglRetWriteFail;
+  }
+  const uint32_t vblock_ct = DIV_UP(variant_ct, kPglVblockSize);
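+  // the raw fwrite() calls below are deliberately unchecked: the stream's
+  // error flag is sticky, and the only success return path goes through
+  // fclose_null(), which checks ferror()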
+  fwrite(pwcp->vblock_fpos, vblock_ct * sizeof(int64_t), 1, pgen_outfile);
+  const unsigned char* vrtype_buf_iter = (unsigned char*)pwcp->vrtype_buf;
+  const uint32_t vrec_len_byte_ct = (uint32_t)pwcp->vrec_len_byte_ct;
+  const unsigned char* vrec_len_buf_iter = pwcp->vrec_len_buf;
+  const pgen_global_flags_t phase_dosage_gflags = pwcp->phase_dosage_gflags;
+  uint32_t vrec_iter_incr = kPglVblockSize * vrec_len_byte_ct;
+  uint32_t vrtype_buf_iter_incr = phase_dosage_gflags? kPglVblockSize : (kPglVblockSize / 2);
+  uint32_t nonref_flags_write_byte_ct = kPglVblockSize / CHAR_BIT;
+  const unsigned char* vrec_len_buf_last = &(vrec_len_buf_iter[((uintptr_t)(vblock_ct - 1)) * vrec_iter_incr]);
+  uintptr_t* explicit_nonref_flags = pwcp->explicit_nonref_flags;
+  uintptr_t* explicit_nonref_flags_iter = explicit_nonref_flags;
+  while (1) {
+    if (vrec_len_buf_iter >= vrec_len_buf_last) {
+      if (vrec_len_buf_iter > vrec_len_buf_last) {
+	return fclose_null(pgen_outfile_ptr)? kPglRetWriteFail : kPglRetSuccess;
+      }
+      const uint32_t vblock_size = MOD_NZ(variant_ct, kPglVblockSize);
+      vrtype_buf_iter_incr = phase_dosage_gflags? vblock_size : DIV_UP(vblock_size, 2);
+      vrec_iter_incr = vblock_size * vrec_len_byte_ct;
+      nonref_flags_write_byte_ct = DIV_UP(vblock_size, CHAR_BIT);
+    }
+    // 4b(i): array of 4-bit or 1-byte vrtypes
+    fwrite(vrtype_buf_iter, vrtype_buf_iter_incr, 1, pgen_outfile);
+    vrtype_buf_iter = &(vrtype_buf_iter[vrtype_buf_iter_incr]);
+
+    // 4b(ii): array of variant record lengths
+    if (fwrite_checked(vrec_len_buf_iter, vrec_iter_incr, pgen_outfile)) {
+      return kPglRetWriteFail;
+    }
+    vrec_len_buf_iter = &(vrec_len_buf_iter[vrec_iter_incr]);
+
+    // 4b(iii): alt allele counts
+    // not yet supported
+
+    // 4b(iv): explicit nonref flags
+    if (explicit_nonref_flags) {
+      if (fwrite_checked(explicit_nonref_flags_iter, nonref_flags_write_byte_ct, pgen_outfile)) {
+	return kPglRetWriteFail;
+      }
+      explicit_nonref_flags_iter = &(explicit_nonref_flags_iter[kPglVblockSize / kBitsPerWord]);
+    }
+  }
+}
+
+pglerr_t spgw_finish(st_pgen_writer_t* spgwp) {
+  if (fwrite_checked(spgwp->pwc.fwrite_buf, spgwp->pwc.fwrite_bufp - spgwp->pwc.fwrite_buf, spgwp->pgen_outfile)) {
+    return kPglRetWriteFail;
+  }
+  return pwc_finish(&(spgwp->pwc), &(spgwp->pgen_outfile));
+}
+
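+// Writes each per-thread buffer (one variant block per thread) to the output
+// file, recording the vblock file offsets as it goes; on the final flush,
+// finishes the file via pwc_finish().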
+pglerr_t mpgw_flush(mt_pgen_writer_t* mpgwp) {
+  pgen_writer_common_t* pwcp = mpgwp->pwcs[0];
+  uint32_t vidx = (uint32_t)round_down_pow2(pwcp->vidx - 1, kPglVblockSize);
+  uint32_t thread_ct = mpgwp->thread_ct;
+  const uint32_t variant_ct = pwcp->variant_ct;
+  const uint32_t is_last_flush = ((vidx + thread_ct * kPglVblockSize) >= variant_ct);
+  if (is_last_flush) {
+    thread_ct = DIV_UP(variant_ct - vidx, kPglVblockSize);
+  }
+  uint64_t* vblock_fpos = pwcp->vblock_fpos;
+  FILE* pgen_outfile = mpgwp->pgen_outfile;
+  const uint32_t vidx_incr = (thread_ct - 1) * kPglVblockSize;
+  uint64_t cur_vblock_fpos = ftello(pgen_outfile);
+  for (uint32_t tidx = 0; tidx < thread_ct; ++tidx) {
+    vblock_fpos[(vidx / kPglVblockSize) + tidx] = cur_vblock_fpos;
+    pgen_writer_common_t* cur_pwcp = mpgwp->pwcs[tidx];
+    uintptr_t cur_vblock_byte_ct = (uintptr_t)(cur_pwcp->fwrite_bufp - cur_pwcp->fwrite_buf);
+    if (fwrite_checked(cur_pwcp->fwrite_buf, cur_vblock_byte_ct, pgen_outfile)) {
+      return kPglRetWriteFail;
+    }
+    cur_pwcp->vidx += vidx_incr;
+    cur_pwcp->fwrite_bufp = cur_pwcp->fwrite_buf;
+    cur_vblock_fpos += cur_vblock_byte_ct;
+  }
+  if (!is_last_flush) {
+    return kPglRetSuccess;
+  }
+  pwcp->vidx = variant_ct;
+  return pwc_finish(pwcp, &(mpgwp->pgen_outfile));
+}
+
+boolerr_t spgw_cleanup(st_pgen_writer_t* spgwp) {
+  // assume file is open if spgw.pgen_outfile is not null
+  // memory is the responsibility of the caller for now
+  if (!spgwp->pgen_outfile) {
+    return 0;
+  }
+  return fclose_null(&(spgwp->pgen_outfile));
+}
+
+boolerr_t mpgw_cleanup(mt_pgen_writer_t* mpgwp) {
+  if ((!mpgwp) || (!mpgwp->pgen_outfile)) {
+    return 0;
+  }
+  return fclose_null(&(mpgwp->pgen_outfile));
+}
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/pgenlib_internal.h b/pgenlib_internal.h
new file mode 100644
index 0000000..04d99f4
--- /dev/null
+++ b/pgenlib_internal.h
@@ -0,0 +1,2269 @@
+#ifndef __PGENLIB_INTERNAL_H__
+#define __PGENLIB_INTERNAL_H__
+
+// This library is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation; either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+// Low-level C99/C++03/C++11 library for reading .pgen (PLINK 2.0 binary) files
+// (designed to produce good lowest-common-denominator binaries across
+// Windows/OS X/Linux).  We try to benefit from as much C++ type safety as we
+// can without either breaking compatibility with C-only codebases or making
+// extension of pgenlib/plink2 code more difficult than the old type-unsafe
+// style.
+//
+// File format design:
+// - With the header loaded, it is possible to efficiently access a variant by
+//   its index.  Since records can now be variable-length, this sometimes
+//   requires storage of record lengths.
+// - Due to the power of LD-based compression, we permit a variant record to
+//   just store a list of differences from an earlier, fully stored variant.
+//   However, only short-range dependence is permitted; sequential processing
+//   of the file only requires caching of the most recent explicitly stored
+//   variant.
+// - Like the plink1 format, this is balanced for relatively easy reading and
+//   writing; in particular, the mode-0x10/0x11 header is not read-optimized:
+//   it passes up some obvious compression opportunities which would make it
+//   more difficult to write e.g. an efficient file merger.  This isn't a big
+//   deal if we don't have a huge number of one-sample .pgen files sharing a
+//   single .bim file (or equivalent).  (If they don't share the same .bim
+//   file, .bim overhead > .pgen overhead.)  If we ever do, we can define an
+//   additional mode to handle that case more efficiently.
+// - Building blocks are arrays of 1-bit, 2-bit, 4-bit, 1-byte, 2-byte, 3-byte,
+//   and 4-byte values.  3/5/6/7(/9...)-bit values don't play well with
+//   bitwise operations, and when it's important, there's usually a natural way
+//   to split them into power-of-2-bit components.
+//   (unsigned integers known to be smaller than 2^24, but not known to be
+//   smaller than 2^16, are stored as 3-byte values on disk and "decompressed"
+//   to uint32_t during loading.)
+// - Missing value is usually all-1s.  (Only exceptions right now: plink1
+//   backward compatibility mode; presence/absence of rare alts for variants
+//   with >2 alt alleles is an array of 1-bit values, where absence = 0; and
+//   presence/absence of phasing info is similar.)  Time to move away from 01
+//   nonsense.
+// - Individual variant records are prohibited from being >= 4GB, to reduce
+//   integer overflow issues.  (This may be reduced to 2GB later, but I'll
+//   attempt to handle the 2-4GB range properly for now since it's conceivable
+//   for multiallelic records in very large datasets to reach that size.)
+// - (later todo: include stuff like file creation command in .bim successor
+//   header; that doesn't really belong in a binary file.)
+
+// Parameter conventions:
+// - Input parameters, then in/out, then pure outputs, then temporary buffers.
+//   Reference-style input parameters tend to go in the very front, to make it
+//   more obvious that they aren't in/out.
+// - "bitarr" indicates a word-aligned, packed array of bits, while "bitvec"
+//   indicates vector-alignment in 64-bit builds.  ("vector" always means SIMD
+//   inputs/outputs here; C++ std::vector is not used in this codebase.)
+// - "quaterarr" indicates a word-aligned, packed array of 2-bit values, while
+//   "quatervec" is the vector-aligned equivalent.  Similarly, "hexadecarr"
+//   marks the much rarer case of a packed array of 4-bit values, etc.
+// - "quatervec_01" indicates a packed, vector-aligned array of 2-bit values
+//   where each value is zero or one.  This data structure was used quite a
+//   bit by plink 1.9 for operating on a subset of a 2-bit-genotype array.
+// - "genovec" indicates a quatervec containing genotype information.
+// - "interleaved_vec" is the plink 2.0 replacement for quatervec_01: we
+//   basically stack pairs of adjacent vectors on top of each other and unpack
+//   on the fly, since that tends to be faster than having to access twice as
+//   much memory.
+// - Most pointers are stationary; moving pointers have an _iter suffix.
+
+
+// 10000 * major + 100 * minor + patch
+// Exception to CONSTU31, since we want the preprocessor to have access to this
+// value.  Named with all caps as a consequence.
+#define PGENLIB_INTERNAL_VERNUM 602
+
+
+#define _FILE_OFFSET_BITS 64
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#define __STDC_FORMAT_MACROS 1
+#include <inttypes.h>
+#include <limits.h> // CHAR_BIT, PATH_MAX
+
+// #define NDEBUG
+#include <assert.h>
+
+#ifdef _WIN32
+  // needed for MEMORYSTATUSEX
+  #ifndef _WIN64
+    #define WINVER 0x0500
+  #else
+    #define __LP64__
+  #endif
+  #include <windows.h>
+#endif
+
+#ifdef __LP64__
+  #ifndef __SSE2__
+    // todo: remove this requirement, the 32-bit vul_t-using code does most of
+    // what we need
+    #error "64-bit builds currently require SSE2.  Try producing a 32-bit build instead."
+  #endif
+  #include <emmintrin.h>
+  #ifdef __SSE4_2__
+    #define USE_SSE42
+    #ifdef __AVX2__
+      #include <immintrin.h>
+      #define USE_AVX2
+    #endif
+  #endif
+#endif
+
+
+// done with #includes, can start C++ namespace
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+#ifdef __cplusplus
+  #define HEADER_INLINE inline
+  #if __cplusplus <= 199711L
+    #define static_assert(cond, msg)
+  #endif
+#else
+  #define HEADER_INLINE static inline
+  // _Static_assert() should work in gcc 4.6+
+  #if (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6)
+    #if defined(__APPLE__) && defined(__has_feature) && defined(__has_extension)
+      // clang
+      #if __has_feature(c_static_assert) || __has_extension(c_static_assert)
+        #define static_assert _Static_assert
+      #else
+        #define static_assert(cond, msg)
+      #endif
+    #else
+      #define static_assert(cond, msg)
+    #endif
+  #else
+    #define static_assert _Static_assert
+  #endif
+#endif
+
+#define __maybe_unused __attribute__((unused))
+
+// Error return types.  All of these evaluate to true on error and false on
+// success, but otherwise they have slightly different semantics:
+// * pglerr_t is the general-purpose enum.  Unlike an enum, implicit conversion
+//   *to* int, not just from int, is prevented by the C++11 compiler (and the
+//   C++11-compiler-validated code still works under C99).  (To achieve this
+//   additional safety, we engage in a bit of code duplication which would be
+//   unreasonable for flagsets.)
+//   Explicit cast to uint32_t, but not int32_t, is supported, to reflect the
+//   fact that all error codes are positive.
+// * boolerr_t allows implicit conversion from int, but conversion back to
+//   uint32_t requires an explicit cast.  (It should always be 0/1-valued, but
+//   this isn't enforced by the compiler.)
+// * interr_t allows implicit conversion from int, but conversion back to
+//   int32_t requires an explicit cast.  It mainly serves as a holding pen for
+//   C standard library error return values, which can be negative.
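+//
+// Usage sketch (hypothetical caller; all three types are truthy exactly when
+// an error occurred):
+//   pglerr_t reterr = spgw_finish(&spgw);
+//   if (reterr) {
+//     // propagate/report the error code here
+//   }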
+#if __cplusplus >= 201103L
+struct pglerr_t {
+  enum class ec
+#else
+typedef enum
+#endif
+  {
+  kPglRetSuccess,
+  kPglRetSkipped,
+  kPglRetNomem,
+  kPglRetOpenFail,
+  kPglRetReadFail,
+  kPglRetWriteFail,
+  // MalformedInput should be returned on low-level file format violations,
+  // while InconsistentInput should be returned for higher-level logical
+  // problems like mismatched files.
+  kPglRetMalformedInput,
+  kPglRetInconsistentInput,
+  kPglRetInvalidCmdline,
+  kPglRetHelp,
+  kPglRetThreadCreateFail,
+  kPglRetNetworkFail,
+  kPglRetSampleMajorBed = 32,
+  kPglRetWarningErrcode = 61,
+  kPglRetImproperFunctionCall = 62,
+  kPglRetNotYetSupported = 63,
+  kPglRetLongLine = 126,
+  kPglRetEmptyFile = 127}
+#if __cplusplus >= 201103L
+  ;
+
+  pglerr_t() {}
+  
+  pglerr_t(const pglerr_t& source) : value_(source.value_) {}
+
+  pglerr_t(ec source) : value_(source) {}
+
+  operator ec() const {
+    return value_;
+  }
+
+  explicit operator uint32_t() const {
+    return static_cast<uint32_t>(value_);
+  }
+  
+  explicit operator bool() const {
+    return (static_cast<uint32_t>(value_) != 0);
+  }
+
+private:
+  ec value_;
+};
+
+const pglerr_t kPglRetSuccess = pglerr_t::ec::kPglRetSuccess;
+const pglerr_t kPglRetSkipped = pglerr_t::ec::kPglRetSkipped;
+const pglerr_t kPglRetNomem = pglerr_t::ec::kPglRetNomem;
+const pglerr_t kPglRetOpenFail = pglerr_t::ec::kPglRetOpenFail;
+const pglerr_t kPglRetReadFail = pglerr_t::ec::kPglRetReadFail;
+const pglerr_t kPglRetWriteFail = pglerr_t::ec::kPglRetWriteFail;
+const pglerr_t kPglRetMalformedInput = pglerr_t::ec::kPglRetMalformedInput;
+const pglerr_t kPglRetInconsistentInput = pglerr_t::ec::kPglRetInconsistentInput;
+const pglerr_t kPglRetInvalidCmdline = pglerr_t::ec::kPglRetInvalidCmdline;
+const pglerr_t kPglRetHelp = pglerr_t::ec::kPglRetHelp;
+const pglerr_t kPglRetThreadCreateFail = pglerr_t::ec::kPglRetThreadCreateFail;
+const pglerr_t kPglRetNetworkFail = pglerr_t::ec::kPglRetNetworkFail;
+const pglerr_t kPglRetSampleMajorBed = pglerr_t::ec::kPglRetSampleMajorBed;
+const pglerr_t kPglRetWarningErrcode = pglerr_t::ec::kPglRetWarningErrcode;
+const pglerr_t kPglRetImproperFunctionCall = pglerr_t::ec::kPglRetImproperFunctionCall;
+const pglerr_t kPglRetNotYetSupported = pglerr_t::ec::kPglRetNotYetSupported;
+const pglerr_t kPglRetLongLine = pglerr_t::ec::kPglRetLongLine;
+const pglerr_t kPglRetEmptyFile = pglerr_t::ec::kPglRetEmptyFile;
+#else
+  pglerr_t;
+#endif
+
+#if __cplusplus >= 201103L
+// allow efficient arithmetic on these, but force them to require explicit
+// int32_t/uint32_t casts; only permit implicit assignment from
+// int32_t/uint32_t by default.
+// built-in bool type does too many things we don't want...
+
+// expected to be integer-valued, but not necessarily 0/1 or positive
+struct interr_t {
+  interr_t() {}
+  
+  interr_t(int32_t source) : value_(source) {}
+
+  explicit operator int32_t() const {
+    return static_cast<int32_t>(value_);
+  }
+  
+  explicit operator bool() const {
+    return (value_ != 0);
+  }
+  
+private:
+  int32_t value_;
+};
+
+// expected to be 0/1-valued
+struct boolerr_t {
+  boolerr_t() {}
+  
+  boolerr_t(uint32_t source) : value_(source) {}
+
+  explicit operator uint32_t() const {
+    return static_cast<uint32_t>(value_);
+  }
+  
+  explicit operator bool() const {
+    return (value_ != 0);
+  }
+  
+private:
+  uint32_t value_;
+};
+#else
+  typedef int32_t interr_t;
+  typedef uint32_t boolerr_t;
+#endif
+
+// make this work on 32-bit as well as 64-bit systems, across
+// Windows/OS X/Linux
+// (todo: clean this up a bit.  it's inherently a baling-wire-and-duct-tape
+// sort of thing, though...)
+#ifdef _WIN32
+  // must compile with -std=gnu++11, not c++11, on 32-bit Windows since
+  // otherwise fseeko64 not defined...
+  #define fseeko fseeko64
+  #define ftello ftello64
+  #define FOPEN_RB "rb"
+  #define FOPEN_WB "wb"
+  #define FOPEN_AB "ab"
+  #ifdef __LP64__
+    #define getc_unlocked _fgetc_nolock
+    #define putc_unlocked _fputc_nolock
+  #else
+    #define getc_unlocked getc
+    #define putc_unlocked putc
+  #endif
+  #if __cplusplus < 201103L
+    #define uint64_t unsigned long long
+    #define int64_t long long
+  #endif
+#else
+  #define FOPEN_RB "r"
+  #define FOPEN_WB "w"
+  #define FOPEN_AB "a"
+#endif
+
+#ifdef _WIN32
+  #define PRId64 "I64d"
+  #define PRIu64 "I64u"
+#else
+  #ifdef __cplusplus
+    #ifndef PRId64
+      #define PRId64 "lld"
+    #endif
+  #endif
+#endif
+
+#ifdef _WIN64
+  #define CTZLU __builtin_ctzll
+  #define CLZLU __builtin_clzll
+#else
+  #define CTZLU __builtin_ctzl
+  #define CLZLU __builtin_clzl
+  #ifndef __LP64__
+    // needed to prevent GCC 6 build failure
+    #if (__GNUC__ <= 4) && (__GNUC_MINOR__ < 8)
+      #if (__cplusplus < 201103L) && !defined(__APPLE__)
+	#ifndef uintptr_t
+	  #define uintptr_t unsigned long
+	#endif
+	#ifndef intptr_t
+	  #define intptr_t long
+	#endif
+      #endif
+    #endif
+  #endif
+#endif
+
+#ifdef __LP64__
+  #ifdef _WIN32 // i.e. Win64
+
+    #undef PRIuPTR
+    #undef PRIdPTR
+    #define PRIuPTR PRIu64
+    #define PRIdPTR PRId64
+    #define PRIxPTR2 "016I64x"
+
+  #else // not _WIN32
+
+    #ifndef PRIuPTR
+      #define PRIuPTR "lu"
+    #endif
+    #ifndef PRIdPTR
+      #define PRIdPTR "ld"
+    #endif
+    #define PRIxPTR2 "016lx"
+
+  #endif // Win64
+
+#else // not __LP64__
+
+  // without this, we get ridiculous warning spew...
+  // not 100% sure this is the right cutoff, but this has been tested on 4.7
+  // and 4.8 build machines, so it plausibly is.
+  #if (__GNUC__ <= 4) && (__GNUC_MINOR__ < 8) && (__cplusplus < 201103L)
+    #undef PRIuPTR
+    #undef PRIdPTR
+    #define PRIuPTR "lu"
+    #define PRIdPTR "ld"
+  #endif
+  
+  #define PRIxPTR2 "08lx"
+
+#endif
+
+#ifndef HAVE_NULLPTR
+  #ifndef __cplusplus
+    #define nullptr NULL
+  #else
+    #if __cplusplus <= 199711L
+      #ifndef nullptr
+        #define nullptr NULL
+      #endif
+    #endif
+  #endif
+#endif
+
+// Checked a bunch of alternatives to #define constants.  For integer constants
+// less than 2^31, enum {} avoids macro expansion issues that actually matter,
+// and that more than cancels out any tiny increase in binary size due to
+// additional debugger information (which has value, anyway).  However, we
+// don't want to use this under C++ due to enumeral/non-enumeral conditional
+// expression warnings, so this isn't one-size-fits-all; and plain old const
+// int has all the functionality we want under C++ (including internal linkage,
+// so it's fine to define them in header files).  Thus we wrap the
+// implementation in a macro.
+//
+// Otherwise, the macro expansion thing is still annoying but we suck it up due
+// to the need for too much duplicate C vs. C++ code ("initializer element is
+// not constant" when using const [type] in C99...)
+//
+// We start most global library-specific numeric constant names here with
+// "kPgl", which should have a vanishingly small chance of colliding with
+// anything in C99.  Note that stuff like kBytesPerWord is not considered
+// library-specific, so it's exempt from having "Pgl" in the name.  Also, the
+// few string literals here are of the FOPEN_WB sort, which have similar usage
+// patterns to e.g. PRIuPTR which shouldn't be renamed, so those remain
+// all-caps.
+#ifdef __cplusplus
+  #define CONSTU31(name, expr) const uint32_t name = (expr)
+#else
+  #define CONSTU31(name, expr) enum {name = (expr)}
+#endif
+
+// other configuration-ish values needed by plink2_common subset
+typedef unsigned char alt_allele_ct_t;
+// don't use CONSTU31 for this since it may need the 32nd bit in the future
+#define kPglMaxAltAlleleCt ((uint32_t)((alt_allele_ct_t)(-2)))
+
+// useful because of its bitwise complement: ~k0LU is a word with all 1 bits,
+// while ~0 is always 32 1 bits.
+// LLU is used over ULL for searchability (no conflict with NULL).
+static const uintptr_t k0LU = (uintptr_t)0;
+
+// mainly useful for bitshifts: (k1LU << 32) works in 64-bit builds, while
+// (1 << 32) is undefined.  also used as a quicker-to-type way of casting
+// numbers/expressions to uintptr_t (via multiplication).
+static const uintptr_t k1LU = (uintptr_t)1;
+
+
+#ifdef __LP64__
+  #ifdef USE_AVX2
+    CONSTU31(kBytesPerVec, 32);
+    CONSTU31(kBytesPerFVec, 32);
+    // bleah, have to define these here, vector_size doesn't see enum values
+    typedef uintptr_t vul_t __attribute__ ((vector_size (32)));
+    typedef float vf_t __attribute__ ((vector_size (32)));
+  #else
+    CONSTU31(kBytesPerVec, 16);
+    CONSTU31(kBytesPerFVec, 16);
+    typedef uintptr_t vul_t __attribute__ ((vector_size (16)));
+    typedef float vf_t __attribute__ ((vector_size (16)));
+  #endif
+  CONSTU31(kBitsPerWord, 64);
+  CONSTU31(kBitsPerWordLog2, 6);
+
+  typedef uint32_t halfword_t;
+  typedef uint16_t quarterword_t;
+
+  #ifdef USE_AVX2
+    #define VCONST_UL(xx) {xx, xx, xx, xx}
+    #define vul_setzero() (vul_t)_mm256_setzero_si256()
+    #define vul_rshift(vv, ct) ((vul_t)_mm256_srli_epi64((__m256i)(vv), ct))
+    #define vul_lshift(vv, ct) ((vul_t)_mm256_slli_epi64((__m256i)(vv), ct))
+  #else
+    #define VCONST_UL(xx) {xx, xx}
+    // vv = VCONST_UL(k0LU) doesn't work (only ok for initialization)
+    #define vul_setzero() (vul_t)_mm_setzero_si128()
+    // "vv >> ct" doesn't work, and Scientific Linux gcc 4.4 might not optimize
+    // VCONST_UL shift properly (todo: test this)
+    #define vul_rshift(vv, ct) ((vul_t)_mm_srli_epi64((__m128i)(vv), ct))
+    #define vul_lshift(vv, ct) ((vul_t)_mm_slli_epi64((__m128i)(vv), ct))
+  #endif
+#else // not __LP64__
+  CONSTU31(kBytesPerVec, 4);
+  CONSTU31(kBytesPerFVec, 4);
+  CONSTU31(kBitsPerWord, 32);
+  CONSTU31(kBitsPerWordLog2, 5);
+
+  typedef uint16_t halfword_t;
+  typedef uint8_t quarterword_t;
+
+  typedef uintptr_t vul_t;
+  typedef float vf_t;
+
+  #define VCONST_UL(xx) (xx)
+  #define vul_setzero() k0LU
+  #define vul_rshift(vv, ct) ((vv) >> (ct))
+  #define vul_lshift(vv, ct) ((vv) << (ct))
+#endif
+
+static const uintptr_t kMask5555 = (~((uintptr_t)0)) / 3;
+static const uintptr_t kMaskAAAA = ((~((uintptr_t)0)) / 3) * 2;
+static const uintptr_t kMask3333 = (~((uintptr_t)0)) / 5;
+static const uintptr_t kMask1111 = (~((uintptr_t)0)) / 15;
+static const uintptr_t kMask0F0F = (~((uintptr_t)0)) / 17;
+static const uintptr_t kMask0101 = (~((uintptr_t)0)) / 255;
+static const uintptr_t kMask00FF = (~((uintptr_t)0)) / 257;
+static const uintptr_t kMask0001 = (~((uintptr_t)0)) / 65535;
+static const uintptr_t kMask0000FFFF = (~((uintptr_t)0)) / 65537;
+static const uintptr_t kMask00000001 = (~((uintptr_t)0)) / 4294967295U;
+
+static const uintptr_t kMask000000FF = (~((uintptr_t)0)) / 16843009;
+static const uintptr_t kMask000F = (~((uintptr_t)0)) / 4369;
+static const uintptr_t kMask0303 = (~((uintptr_t)0)) / 85;
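+// (on 64-bit builds these expand to the usual repeating patterns, e.g.
+// kMask5555 == 0x5555555555555555 and kMask0F0F == 0x0f0f0f0f0f0f0f0f; the
+// division form keeps each definition correct on 32-bit builds as well)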
+
+CONSTU31(kBitsPerVec, kBytesPerVec * CHAR_BIT);
+CONSTU31(kQuatersPerVec, kBytesPerVec * 4);
+
+CONSTU31(kBitsPerWordD2, kBitsPerWord / 2);
+CONSTU31(kBitsPerWordD4, kBitsPerWord / 4);
+
+// number of bytes in a word
+CONSTU31(kBytesPerWord, kBitsPerWord / CHAR_BIT);
+
+static_assert(CHAR_BIT == 8, "pgenlib_internal requires CHAR_BIT == 8.");
+static_assert(sizeof(int32_t) == 4, "pgenlib_internal requires sizeof(int32_t) == 4.");
+static_assert(sizeof(int64_t) == 8, "pgenlib_internal requires sizeof(int64_t) == 8.");
+static_assert(sizeof(intptr_t) == kBytesPerWord, "pgenlib_internal requires sizeof(intptr_t) == kBytesPerWord.");
+
+CONSTU31(kWordsPerVec, kBytesPerVec / kBytesPerWord);
+CONSTU31(kInt32PerVec, kBytesPerVec / 4);
+
+CONSTU31(kCacheline, 64);
+
+CONSTU31(kBitsPerCacheline, kCacheline * CHAR_BIT);
+CONSTU31(kQuatersPerCacheline, kCacheline * 4);
+CONSTU31(kInt32PerCacheline, kCacheline / sizeof(int32_t));
+CONSTU31(kInt64PerCacheline, kCacheline / sizeof(int64_t));
+CONSTU31(kWordsPerCacheline, kCacheline / kBytesPerWord);
+CONSTU31(kDoublesPerCacheline, kCacheline / sizeof(double));
+CONSTU31(kVecsPerCacheline, kCacheline / kBytesPerVec);
+
+// could use ioctl, etc. to dynamically determine this later, and pass it as a
+// parameter to e.g. pgfi_multiread
+CONSTU31(kDiskBlockSize, 4096);
+
+// unsafe to fread or fwrite more bytes than this on e.g. OS X
+CONSTU31(kMaxBytesPerIO, 0x7ffff000);
+
+
+// note that this is NOT foolproof: see e.g.
+// http://insanecoding.blogspot.com/2007/11/pathmax-simply-isnt.html .  (This
+// is why I haven't bothered with OS-based #ifdefs here.)  But it should be
+// good enough in practice.  And PATH_MAX itself is still relevant due to use
+// of realpath().
+CONSTU31(kPglFnamesize, 4096);
+#if defined(PATH_MAX) && !defined(_WIN32)
+static_assert(kPglFnamesize >= PATH_MAX, "pgenlib_internal assumes PATH_MAX <= 4096.  (Safe to increase kPglFnamesize to address this, up to 131072.)");
+#endif
+
+// safe errstr_buf size for pgen_init_phase{1,2}()
+CONSTU31(kPglErrstrBufBlen, kPglFnamesize + 256);
+
+
+typedef union {
+  vul_t vi;
+  uintptr_t u8[kBitsPerVec / kBitsPerWord];
+  uint32_t u4[kBytesPerVec / sizeof(int32_t)];
+} univec_t;
+
+typedef union {
+  vf_t vf;
+  float f4[kBytesPerFVec / sizeof(float)];
+} univecf_t;
+
+// sum must fit in 16 bits
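+// (the kMask0001 multiply below adds every 16-bit lane into the topmost lane,
+// which the final shift extracts; hence the fit-in-16-bits requirement)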
+HEADER_INLINE uintptr_t univec_hsum_16bit(univec_t uv) {
+#ifdef __LP64__
+  #ifdef USE_AVX2
+  return ((uv.u8[0] + uv.u8[1] + uv.u8[2] + uv.u8[3]) * kMask0001) >> 48;
+  #else
+  return ((uv.u8[0] + uv.u8[1]) * kMask0001) >> 48;
+  #endif
+#else
+  return (uv.u8[0] * kMask0001) >> 16;
+#endif
+}
+
+// sum must fit in 32 bits
+HEADER_INLINE uintptr_t univec_hsum_32bit(univec_t uv) {
+#ifdef __LP64__
+  #ifdef USE_AVX2
+  return ((uv.u8[0] + uv.u8[1] + uv.u8[2] + uv.u8[3]) * kMask00000001) >> 32;
+  #else
+  return ((uv.u8[0] + uv.u8[1]) * kMask00000001) >> 32;
+  #endif
+#else
+  return uv.u8[0];
+#endif
+}
+
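+// Spreads the bits of a halfword across the even bit positions of a word (bit
+// k moves to bit 2k); pack_word_to_halfword() below is the inverse.  Worked
+// example: unpack_halfword_to_word(0xb) == 0x45.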
+HEADER_INLINE uintptr_t unpack_halfword_to_word(uintptr_t hw) {
+#ifdef __LP64__
+  hw = (hw | (hw << 16)) & kMask0000FFFF;
+#endif
+  hw = (hw | (hw << 8)) & kMask00FF;
+  hw = (hw | (hw << 4)) & kMask0F0F;
+  hw = (hw | (hw << 2)) & kMask3333;
+  return ((hw | (hw << 1)) & kMask5555);
+}
+
+HEADER_INLINE halfword_t pack_word_to_halfword(uintptr_t ww) {
+  // assumes only even bits of ww can be set
+  ww = (ww | (ww >> 1)) & kMask3333;
+  ww = (ww | (ww >> 2)) & kMask0F0F;
+  ww = (ww | (ww >> 4)) & kMask00FF;
+#ifdef __LP64__
+  ww = (ww | (ww >> 8)) & kMask0000FFFF;
+#endif
+  return (halfword_t)(ww | (ww >> kBitsPerWordD4));
+}
+
+// alignment must be a power of 2
+// tried splitting out round_down_pow2_ui() and _up_pow2_ui() functions, no
+// practical difference
+HEADER_INLINE uintptr_t round_down_pow2(uintptr_t val, uintptr_t alignment) {
+  const uintptr_t alignment_m1 = alignment - 1;
+  assert(!(alignment & alignment_m1));
+  return val & (~alignment_m1);
+}
+
+HEADER_INLINE uint64_t round_down_pow2_ull(uint64_t val, uint64_t alignment) {
+  const uint64_t alignment_m1 = alignment - 1;
+  assert(!(alignment & alignment_m1));
+  return val & (~alignment_m1);
+}
+
+HEADER_INLINE uintptr_t round_up_pow2(uintptr_t val, uintptr_t alignment) {
+  const uintptr_t alignment_m1 = alignment - 1;
+  assert(!(alignment & alignment_m1));
+  return (val + alignment_m1) & (~alignment_m1);
+}
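+// worked examples: round_down_pow2(37, 16) == 32, round_up_pow2(37, 16) == 48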
+
+
+// this is best when the divisor is constant (so (divisor - 1) can be
+// collapsed), and handles val == 0 properly.  if the divisor isn't constant
+// and val is guaranteed to be nonzero, go with explicit
+// "1 + (val - 1) / divisor".
+//
+// Thought about conditional use of constexpr here, but that has annoying
+// integer-widening effects.  Unless we split the use cases into DIV_UP,
+// DIVL_UP, and DIV64_UP; this may be worth doing at some point.
+// Note that this fails if (val + divisor - 1) overflows the widest integer
+// type on the left.
+#define DIV_UP(val, divisor) (((val) + (divisor) - 1) / (divisor))
+
+// "NZ" means nonzero in two ways:
+// * result is in [1, modulus], not [0, modulus - 1]
+// * val should not be zero (though this expression still works if val is zero
+//   and modulus is a hardcoded power of 2)
+#define MOD_NZ(val, modulus) (1 + (((val) - 1) % (modulus)))
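+// worked examples: DIV_UP(0, 4) == 0 and DIV_UP(9, 4) == 3, while
+// MOD_NZ(8, 4) == 4 (vs. 8 % 4 == 0)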
+
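+// branchless absolute value: neg_sign_bit below is all-ones iff ii is
+// negative, so the xor + subtract is a no-op for nonnegative ii and a two's
+// complement negation (~ii + 1) otherwise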
+HEADER_INLINE uint32_t abs_int32(int32_t ii) {
+  const uint32_t neg_sign_bit = -(((uint32_t)ii) >> 31);
+  return (((uint32_t)ii) ^ neg_sign_bit) - neg_sign_bit;
+}
+
+extern uintptr_t g_failed_alloc_attempt_size;
+
+#if (__GNUC__ <= 4) && (__GNUC_MINOR__ < 7) && !defined(__APPLE__)
+// putting this in the header file caused a bunch of gcc 4.4 strict-aliasing
+// warnings, while not doing so seems to inhibit some malloc-related compiler
+// optimizations, bleah
+// compromise: header-inline iff gcc version >= 4.7 (might not be the right
+// cutoff?)
+boolerr_t pgl_malloc(uintptr_t size, void* pp);
+#else
+HEADER_INLINE boolerr_t pgl_malloc(uintptr_t size, void* pp) {
+  *((unsigned char**)pp) = (unsigned char*)malloc(size);
+  if (*((unsigned char**)pp)) {
+    return 0;
+  }
+  g_failed_alloc_attempt_size = size;
+  return 1;
+}
+#endif
+
+// This must be used for all fwrite() calls where len could be >= 2^31, since
+// OS X raw fwrite() doesn't work in that case.
+static_assert(sizeof(size_t) == sizeof(intptr_t), "pgenlib_internal assumes size_t and intptr_t are synonymous.");
+interr_t fwrite_checked(const void* buf, uintptr_t len, FILE* outfile);
+
+interr_t fread_checked2(void* buf, uintptr_t len, FILE* infile, uintptr_t* bytes_read_ptr);
+
+HEADER_INLINE boolerr_t fread_checked(void* buf, uintptr_t len, FILE* infile) {
+  uintptr_t bytes_read;
+  if (fread_checked2(buf, len, infile, &bytes_read)) {
+    return 1;
+  }
+  return (bytes_read != len);
+}
+
+HEADER_INLINE boolerr_t fclose_null(FILE** fptr_ptr) {
+  int32_t ii = ferror(*fptr_ptr);
+  int32_t jj = fclose(*fptr_ptr);
+  *fptr_ptr = nullptr;
+  return ii || jj;
+}
+
+
+#ifdef __LP64__
+// Reads an integer in [1, cap].
+// * Errors out unless first character is a digit, or is '+' followed by a
+//   digit.  Initial whitespace is not permitted.
+// * Like atoi(), this considers the number to be terminated by *any*
+//   nondigit character.  E.g. "1000genomes" is treated as a valid instance of
+//   1000 rather than a nonnumeric token, and "98.6" is treated as 98.  (May
+//   want to write another function with strtol-like semantics, returning a
+//   pointer to the end of the string so the caller can decide whether to error
+//   out on a nonspace terminator.  Don't need the base parameter, though...)
+// * Errors out on overflow.
+boolerr_t scan_posint_capped(const char* ss, uint64_t cap, uint32_t* valp);
+
+// [0, cap]
+boolerr_t scan_uint_capped(const char* ss, uint64_t cap, uint32_t* valp);
+
+// [-bound, bound]
+boolerr_t scan_int_abs_bounded(const char* ss, uint64_t bound, int32_t* valp);
+#else // not __LP64__
+// Need to be more careful in 32-bit case due to overflow.
+// A funny-looking div_10/mod_10 interface is used since the cap will usually
+// be a constant, and we want the integer division/modulus to occur at compile
+// time.
+boolerr_t scan_posint_capped32(const char* ss, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp);
+
+boolerr_t scan_uint_capped32(const char* ss, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp);
+
+boolerr_t scan_int_abs_bounded32(const char* ss, uint32_t bound_div_10, uint32_t bound_mod_10, int32_t* valp);
+
+HEADER_INLINE boolerr_t scan_posint_capped(const char* ss, uint32_t cap, uint32_t* valp) {
+  return scan_posint_capped32(ss, cap / 10, cap % 10, valp);
+}
+
+HEADER_INLINE boolerr_t scan_uint_capped(const char* ss, uint32_t cap, uint32_t* valp) {
+  return scan_uint_capped32(ss, cap / 10, cap % 10, valp);
+}
+
+HEADER_INLINE boolerr_t scan_int_abs_bounded(const char* ss, uint32_t bound, int32_t* valp) {
+  return scan_int_abs_bounded32(ss, bound / 10, bound % 10, valp);
+}
+#endif
+
+
+// intentionally rejects -2^31 for now
+HEADER_INLINE boolerr_t scan_int32(const char* ss, int32_t* valp) {
+  return scan_int_abs_bounded(ss, 0x7fffffff, valp);
+}
+
+// default cap = 0x7ffffffe
+HEADER_INLINE boolerr_t scan_posint_defcap(const char* ss, uint32_t* valp) {
+  return scan_posint_capped(ss, 0x7ffffffe, valp);
+}
+
+HEADER_INLINE boolerr_t scan_uint_defcap(const char* ss, uint32_t* valp) {
+  return scan_uint_capped(ss, 0x7ffffffe, valp);
+}
+
+HEADER_INLINE boolerr_t scan_int_abs_defcap(const char* ss, int32_t* valp) {
+  return scan_int_abs_bounded(ss, 0x7ffffffe, valp);
+}
+
+HEADER_INLINE boolerr_t scan_uint_icap(const char* ss, uint32_t* valp) {
+  return scan_uint_capped(ss, 0x7fffffff, valp);
+}
+
+HEADER_INLINE unsigned char* memseta(void* target, unsigned char val, uintptr_t ct) {
+  memset(target, val, ct);
+  return &(((unsigned char*)target)[ct]);
+}
+
+HEADER_INLINE char* memcpya(void* __restrict target, const void* __restrict source, uintptr_t ct) {
+  memcpy(target, source, ct);
+  return &(((char*)target)[ct]);
+}
+
+#define BITCT_TO_VECCT(val) DIV_UP(val, kBitsPerVec)
+#define BITCT_TO_WORDCT(val) DIV_UP(val, kBitsPerWord)
+#define BITCT_TO_ALIGNED_WORDCT(val) (kWordsPerVec * BITCT_TO_VECCT(val))
+#define BITCT_TO_CLCT(val) DIV_UP(val, kBitsPerCacheline)
+
+// more verbose than (val + 3) / 4, but may as well make semantic meaning
+// obvious; any explicit DIV_UP(val, 4) expressions should have a different
+// meaning
+// (not needed for bitct -> bytect, DIV_UP(val, CHAR_BIT) is clear enough)
+#define QUATERCT_TO_BYTECT(val) DIV_UP(val, 4)
+
+#define QUATERCT_TO_VECCT(val) DIV_UP(val, kQuatersPerVec)
+#define QUATERCT_TO_WORDCT(val) DIV_UP(val, kBitsPerWordD2)
+#define QUATERCT_TO_ALIGNED_WORDCT(val) (kWordsPerVec * QUATERCT_TO_VECCT(val))
+#define QUATERCT_TO_CLCT(val) DIV_UP(val, kQuatersPerCacheline)
+
+
+#define INT32CT_TO_VECCT(val) DIV_UP(val, kInt32PerVec)
+#define INT32CT_TO_CLCT(val) DIV_UP(val, kInt32PerCacheline)
+
+#define WORDCT_TO_VECCT(val) DIV_UP(val, kWordsPerVec)
+#define WORDCT_TO_CLCT(val) DIV_UP(val, kWordsPerCacheline)
+
+#ifdef __LP64__
+  #define INT64CT_TO_VECCT(val) DIV_UP(val, kBytesPerVec / 8)
+#else
+  #define INT64CT_TO_VECCT(val) ((val) * 2)
+#endif
+#define INT64CT_TO_CLCT(val) DIV_UP(val, kInt64PerCacheline)
+#define DBLCT_TO_VECCT INT64CT_TO_VECCT
+
+#define VECCT_TO_CLCT(val) DIV_UP(val, kVecsPerCacheline)
+
+// C++11 standard guarantees std::min and std::max return leftmost minimum in
+// case of equality; best to adhere to that
+// We don't actually use std::min/max since casting one argument when comparing
+// e.g. a uint32_t with a uintptr_t is pointlessly verbose
+#define MAXV(aa, bb) (((bb) > (aa))? (bb) : (aa))
+#define MINV(aa, bb) (((bb) < (aa))? (bb) : (aa))
+
+#define GET_QUATERARR_ENTRY(ulptr, idx) (((ulptr)[(idx) / kBitsPerWordD2] >> (2 * ((idx) % kBitsPerWordD2))) & 3)
+#define ASSIGN_QUATERARR_ENTRY(idx, newval, ulptr) (ulptr)[(idx) / kBitsPerWordD2] = ((ulptr)[(idx) / kBitsPerWordD2] & (~((3 * k1LU) << (2 * ((idx) % kBitsPerWordD2))))) | (((uintptr_t)(newval)) << (2 * ((idx) % kBitsPerWordD2)))
+// todo: check if ASSIGN_QUATERARR_ENTRY optimizes newval=0 out
+#define CLEAR_QUATERARR_ENTRY(idx, ulptr) (ulptr)[(idx) / kBitsPerWordD2] &= ~((3 * k1LU) << (2 * ((idx) % kBitsPerWordD2)))
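+// worked example, 64-bit build (kBitsPerWordD2 == 32):
+//   GET_QUATERARR_ENTRY(ulptr, 35) expands to ((ulptr)[1] >> 6) & 3, i.e. it
+//   extracts bits 6..7 of the second word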
+
+#define GET_HEXADECARR_ENTRY(ulptr, idx) (((ulptr)[(idx) / kBitsPerWordD4] >> (4 * ((idx) % kBitsPerWordD4))) & 15)
+
+
+// don't use pglerr_t here since there's only one failure mode, it's
+// obvious what it is, and stacking multiple aligned_mallocs in a single
+// if-statement is useful.
+boolerr_t aligned_malloc(uintptr_t size, uintptr_t alignment, void* aligned_pp);
+
+// ok for ct == 0
+void fill_all_bits(uintptr_t ct, uintptr_t* bitarr);
+
+void bitvec_and(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);
+
+void bitvec_andnot(const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);
+
+uint32_t next_set_unsafe(const uintptr_t* bitarr, uint32_t loc);
+
+uint32_t next_unset_unsafe(const uintptr_t* bitarr, uint32_t loc);
+
+// uint32_t next_nonmissing_unsafe(const uintptr_t* genoarr, uint32_t loc);
+
+uint32_t next_set(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil);
+
+uint32_t prev_set_unsafe(const uintptr_t* bitarr, uint32_t loc);
+
+HEADER_INLINE uint32_t are_all_words_zero(const uintptr_t* word_arr, uintptr_t word_ct) {
+  while (word_ct--) {
+    if (*word_arr++) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+HEADER_INLINE uint32_t are_all_bits_one(const uintptr_t* bitarr, uintptr_t bit_ct) {
+  const uintptr_t fullword_ct = bit_ct / kBitsPerWord;
+  for (uintptr_t widx = 0; widx < fullword_ct; ++widx) {
+    if (~(bitarr[widx])) {
+      return 0;
+    }
+  }
+  const uint32_t trailing_bit_ct = bit_ct % kBitsPerWord;
+  return (!trailing_bit_ct) || (!((~(bitarr[fullword_ct])) << (kBitsPerWord - trailing_bit_ct)));
+}
+
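+// Returns the sum of val's 2-bit fields, each interpreted as an integer in
+// 0..3.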
+HEADER_INLINE uint32_t popcount2_long(uintptr_t val) {
+#ifdef USE_SSE42
+  return __builtin_popcountll(val) + __builtin_popcountll(val & kMaskAAAA);
+#else
+  val = (val & kMask3333) + ((val >> 2) & kMask3333);
+  return (((val + (val >> 4)) & kMask0F0F) * kMask0101) >> (kBitsPerWord - 8);
+#endif
+}
+
+HEADER_INLINE uint32_t popcount_long(uintptr_t val) {
+  // the simple version, good enough for all non-time-critical stuff
+  // (without SSE4.2, popcount_longs() tends to be >3x as fast on arrays.
+  // with SSE4.2, there's no noticeable difference.)
+#ifdef USE_SSE42
+  return __builtin_popcountll(val);
+#else
+  // sadly, this is still faster than the clang implementation of the intrinsic
+  // as of 2016
+  return popcount2_long(val - ((val >> 1) & kMask5555));
+#endif
+}
+
+#ifdef USE_SSE42
+HEADER_INLINE uint32_t popcount01_long(uintptr_t val) {
+  return popcount_long(val);
+}
+#else
+HEADER_INLINE uint32_t popcount01_long(uintptr_t val) {
+  return popcount2_long(val);
+}
+#endif
+
+HEADER_INLINE uint32_t popcount_2_longs(uintptr_t val0, uintptr_t val1) {
+  // the simple version, good enough for all non-time-critical stuff
+  // (without SSE4.2, popcount_longs() tends to be >3x as fast on arrays.
+  // with SSE4.2, there's no noticeable difference.)
+#ifdef USE_SSE42
+  return __builtin_popcountll(val0) + __builtin_popcountll(val1);
+#else
+  val0 -= (val0 >> 1) & kMask5555;
+  val1 -= (val1 >> 1) & kMask5555;
+  const uintptr_t four_bit = (val0 & kMask3333) + ((val0 >> 2) & kMask3333) + (val1 & kMask3333) + ((val1 >> 2) & kMask3333);
+  // up to 16 values in 0..12; sum fits in 8 bits
+  return (((four_bit & kMask0F0F) + ((four_bit >> 4) & kMask0F0F)) * kMask0101) >> (kBitsPerWord - 8);
+#endif
+}
+
+#ifndef __LP64__
+HEADER_INLINE uint32_t popcount_4_longs(uintptr_t val0, uintptr_t val1, uintptr_t val2, uintptr_t val3) {
+  val0 -= (val0 >> 1) & kMask5555;
+  val1 -= (val1 >> 1) & kMask5555;
+  val2 -= (val2 >> 1) & kMask5555;
+  val3 -= (val3 >> 1) & kMask5555;
+  const uintptr_t four_bit_0 = (val0 & kMask3333) + ((val0 >> 2) & kMask3333) + (val1 & kMask3333) + ((val1 >> 2) & kMask3333);
+  const uintptr_t four_bit_1 = (val2 & kMask3333) + ((val2 >> 2) & kMask3333) + (val3 & kMask3333) + ((val3 >> 2) & kMask3333);
+  return (((four_bit_0 & kMask0F0F) + ((four_bit_0 >> 4) & kMask0F0F) + (four_bit_1 & kMask0F0F) + ((four_bit_1 >> 4) & kMask0F0F)) * kMask0101) >> (kBitsPerWord - 8);
+}
+#endif
+
+// assumes vec_ct is a multiple of 3
+uintptr_t popcount_vecs(const vul_t* bit_vvec, uintptr_t vec_ct);
+
+#define IS_VEC_ALIGNED(addr) (!(((uintptr_t)(addr)) % kBytesPerVec))
+
+HEADER_INLINE uintptr_t popcount_longs(const uintptr_t* bitvec, uintptr_t word_ct) {
+  // Efficiently popcounts bitvec[0..(word_ct - 1)].  In the 64-bit case,
+  // bitvec[] must be 16-byte aligned.
+  // The popcount_longs_nzbase() wrapper takes care of starting from a later
+  // index.
+  // No need for a separate USE_SSE42 implementation, there's no noticeable
+  // speed difference.
+  uintptr_t tot = 0;
+  if (word_ct >= (3 * kWordsPerVec)) {
+    assert(IS_VEC_ALIGNED(bitvec));
+    const uintptr_t remainder = word_ct % (3 * kWordsPerVec);
+    const uintptr_t main_block_word_ct = word_ct - remainder;
+    tot = popcount_vecs((const vul_t*)bitvec, main_block_word_ct / kWordsPerVec);
+    word_ct = remainder;
+    bitvec = &(bitvec[main_block_word_ct]);
+  }
+  for (uintptr_t trailing_word_idx = 0; trailing_word_idx < word_ct; ++trailing_word_idx) {
+    tot += popcount_long(bitvec[trailing_word_idx]);
+  }
+  return tot;
+}
+
+// these don't read past the end of bitarr
+uintptr_t popcount_bytes(const unsigned char* bitarr, uintptr_t byte_ct);
+uintptr_t popcount_bytes_masked(const unsigned char* bitarr, const uintptr_t* mask_arr, uintptr_t byte_ct);
+
+// assumes subset_mask has trailing zeroes up to the next vector boundary
+void fill_interleaved_mask_vec(const uintptr_t* __restrict subset_mask, uint32_t base_vec_ct, uintptr_t* interleaved_mask_vec);
+
+// requires positive word_ct
+// stay agnostic a bit longer re: word_ct := DIV_UP(entry_ct, kBitsPerWord)
+// vs. word_ct := 1 + (entry_ct / kBitsPerWord)
+// (this is a source of bugs, though; interface should probably be changed to
+// use entry_ct once multiallelic/dosage implementation is done)
+void fill_cumulative_popcounts(const uintptr_t* subset_mask, uint32_t word_ct, uint32_t* cumulative_popcounts);
+
+void uidxs_to_idxs(const uintptr_t* subset_mask, const uint32_t* subset_cumulative_popcounts, const uint32_t idx_list_len, uint32_t* idx_list);
+
+
+HEADER_INLINE boolerr_t vecaligned_malloc(uintptr_t size, void* aligned_pp) {
+#if defined(__APPLE__) || !defined(__LP64__)
+  const boolerr_t ret_boolerr = pgl_malloc(size, aligned_pp);
+  assert(IS_VEC_ALIGNED(*((uintptr_t*)aligned_pp)));
+  return ret_boolerr;
+#else
+  return aligned_malloc(size, kBytesPerVec, aligned_pp);
+#endif
+}
+
+HEADER_INLINE boolerr_t cachealigned_malloc(uintptr_t size, void* aligned_pp) {
+  return aligned_malloc(size, kCacheline, aligned_pp);
+}
+
+HEADER_INLINE void aligned_free(void* aligned_ptr) {
+  free((uintptr_t*)(((uintptr_t*)aligned_ptr)[-1]));
+}
+
+HEADER_INLINE void aligned_free_cond(void* aligned_ptr) {
+  if (aligned_ptr) {
+    free((uintptr_t*)(((uintptr_t*)aligned_ptr)[-1]));
+  }
+}
+
+// C spec is a bit broken here
+HEADER_INLINE void free_const(const void* memptr) {
+  // const_cast
+  free((void*)((uintptr_t)memptr));
+}
+ 
+HEADER_INLINE void free_cond(const void* memptr) {
+  if (memptr) {
+    free_const(memptr);
+  }
+}
+
+#if defined(__APPLE__) || !defined(__LP64__)
+HEADER_INLINE void vecaligned_free(void* aligned_ptr) {
+  free(aligned_ptr);
+}
+
+HEADER_INLINE void vecaligned_free_cond(void* aligned_ptr) {
+  free_cond(aligned_ptr);
+}
+#else
+HEADER_INLINE void vecaligned_free(void* aligned_ptr) {
+  aligned_free(aligned_ptr);
+}
+
+HEADER_INLINE void vecaligned_free_cond(void* aligned_ptr) {
+  aligned_free_cond(aligned_ptr);
+}
+#endif
+
+// now compiling with gcc >= 4.4 (or clang equivalent) on all platforms, so
+// safe to use memset everywhere
+HEADER_INLINE void fill_uint_zero(uintptr_t entry_ct, uint32_t* uiarr) {
+  memset(uiarr, 0, entry_ct * sizeof(int32_t));
+}
+
+HEADER_INLINE void fill_ulong_zero(uintptr_t entry_ct, uintptr_t* ularr) {
+  memset(ularr, 0, entry_ct * sizeof(intptr_t));
+}
+
+HEADER_INLINE void fill_ull_zero(uintptr_t entry_ct, uint64_t* ullarr) {
+  memset(ullarr, 0, entry_ct * sizeof(int64_t));
+}
+
+HEADER_INLINE void fill_ulong_one(uintptr_t entry_ct, uintptr_t* ularr) {
+  for (uintptr_t idx = 0; idx < entry_ct; ++idx) {
+    ularr[idx] = ~k0LU;
+  }
+}
+
+#define IS_SET(ulptr, idx) (((ulptr)[(idx) / kBitsPerWord] >> ((idx) % kBitsPerWord)) & 1)
+
+#define SET_BIT(idx, arr) ((arr)[(idx) / kBitsPerWord] |= k1LU << ((idx) % kBitsPerWord))
+
+#define CLEAR_BIT(idx, arr) ((arr)[(idx) / kBitsPerWord] &= ~(k1LU << ((idx) % kBitsPerWord)))
+
+HEADER_INLINE void assign_bit(uintptr_t idx, uintptr_t newbit, uintptr_t* arr) {
+  const uintptr_t inv_mask = k1LU << (idx % kBitsPerWord);
+  uintptr_t* cur_word_ptr = &(arr[idx / kBitsPerWord]);
+  *cur_word_ptr = ((*cur_word_ptr) & (~inv_mask)) | (inv_mask * newbit);
+}
+
+HEADER_INLINE void next_set_unsafe_ck(const uintptr_t* __restrict bitarr, uint32_t* __restrict loc_ptr) {
+  if (!IS_SET(bitarr, *loc_ptr)) {
+    *loc_ptr = next_set_unsafe(bitarr, *loc_ptr);
+  }
+}
+
+HEADER_INLINE void next_unset_unsafe_ck(const uintptr_t* __restrict bitarr, uint32_t* __restrict loc_ptr) {
+  if (IS_SET(bitarr, *loc_ptr)) {
+    *loc_ptr = next_unset_unsafe(bitarr, *loc_ptr);
+  }
+}
+
+// todo: test this against extracting a nonmissing bitarr first
+/*
+HEADER_INLINE void next_nonmissing_unsafe_ck(const uintptr_t* __restrict genoarr, uint32_t* __restrict loc_ptr) {
+  if (GET_QUATERARR_ENTRY(genoarr, *loc_ptr) == 3) {
+    *loc_ptr = next_nonmissing_unsafe(genoarr, *loc_ptr);
+  }
+}
+*/
+
+HEADER_INLINE void copy_quaterarr(const uintptr_t* __restrict source_quaterarr, uint32_t quaterarr_entry_ct, uintptr_t* __restrict target_quaterarr) {
+  memcpy(target_quaterarr, source_quaterarr, QUATERCT_TO_WORDCT(quaterarr_entry_ct) * kBytesPerWord);
+}
+
+void copy_bitarr_subset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict subset_mask, uint32_t subset_size, uintptr_t* __restrict output_bitarr);
+
+// may want the bit past the end of subset_mask (i.e. position
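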
+// raw_quaterarr_entry_ct) to always be allocated and unset.  This removes the
+// need for some explicit end-of-bitarray checks.
+void copy_quaterarr_nonempty_subset(const uintptr_t* __restrict raw_quaterarr, const uintptr_t* __restrict subset_mask, uint32_t raw_quaterarr_entry_ct, uint32_t subset_entry_ct, uintptr_t* __restrict output_quaterarr);
+
+HEADER_INLINE uint32_t raw_to_subsetted_pos(const uintptr_t* subset_mask, const uint32_t* subset_cumulative_popcounts, uint32_t raw_idx) {
+  // this should be much better than keeping a uidx_to_idx array!
+  uint32_t raw_widx = raw_idx / kBitsPerWord;
+  return subset_cumulative_popcounts[raw_widx] + popcount_long(subset_mask[raw_widx] & ((k1LU << (raw_idx % kBitsPerWord)) - k1LU));
+}
+
+void genovec_count_freqs_unsafe(const uintptr_t* genovec, uint32_t sample_ct, uint32_t* counts);
+
+void genovec_count_subset_freqs(const uintptr_t* __restrict genovec, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t* genocounts);
+
+// slower genovec_count_subset_freqs() which does not require
+// sample_include_interleaved_vec to be precomputed (and incidentally doesn't
+// require vector alignment)
+void genoarr_count_subset_freqs2(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t* genocounts);
+
+void genoarr_count_subset_intersect_freqs(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict subset1, const uintptr_t* __restrict subset2, uint32_t raw_sample_ct, uint32_t* genocounts);
+
+void genovec_invert_unsafe(uint32_t sample_ct, uintptr_t* genovec);
+
+HEADER_INLINE uintptr_t invert_geno_word_unsafe(uintptr_t geno_word) {
+  return (geno_word ^ ((~(geno_word << 1)) & kMaskAAAA));
+}
+
+HEADER_INLINE void zero_trailing_bits(uintptr_t bit_ct, uintptr_t* bitarr) {
+  uintptr_t trail_ct = bit_ct % kBitsPerWord;
+  if (trail_ct) {
+    bitarr[bit_ct / kBitsPerWord] &= (k1LU << trail_ct) - k1LU;
+  }
+}
+
+// too easy to forget to multiply by 2
+HEADER_INLINE void zero_trailing_quaters(uintptr_t quater_ct, uintptr_t* bitarr) {
+  zero_trailing_bits(quater_ct * 2, bitarr);
+}
+
+
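+// Returns the number of bytes (1..4) needed to represent uii.  Assumes uii is
+// nonzero, since __builtin_clz(0) is undefined.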
+HEADER_INLINE uint32_t bytes_to_represent_ui(uint32_t uii) {
+  return (4 - (__builtin_clz(uii) / CHAR_BIT));
+}
+
+// A VINT is a sequence of bytes where each byte stores just 7 bits of an
+// integer, and the high bit is set when the integer has more nonzero bits.
+// See e.g.
+//   https://developers.google.com/protocol-buffers/docs/encoding#varints
+// (Note that protocol buffers used "group varints" at one point, but then
+// abandoned them.  I suspect they'd be simultaneously slower and less
+// compact here.)
+
+HEADER_INLINE unsigned char* vint32_append(uint32_t uii, unsigned char* buf) {
+  while (uii > 127) {
+    *buf++ = (uii & 127) + 128;
+    uii >>= 7;
+  }
+  *buf++ = uii;
+  return buf;
+}
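+// Worked example: vint32_append(300, buf) writes the two bytes 0xac 0x02,
+// since 300 = 0b10'0101100: the low 7 bits (0x2c) are emitted first with the
+// continuation bit (0x80) set, then the remaining bits (0x02) in a final byte
+// with the high bit clear.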
+
+// Returns 0x80000000U instead of 0xffffffffU so overflow check works properly
+// in 32-bit build.  Named "get_vint31" to make it more obvious that a 2^31
+// return value can't be legitimate.
+HEADER_INLINE uint32_t get_vint31(const unsigned char* buf_end, const unsigned char** bufpp) {
+  if (buf_end > (*bufpp)) {
+    uint32_t vint32 = *((*bufpp)++);
+    if (vint32 <= 127) {
+      return vint32;
+    }
+    vint32 &= 127;
+    uint32_t shift = 7;
+    while (buf_end > (*bufpp)) {
+      uint32_t uii = *((*bufpp)++);
+      vint32 |= (uii & 127) << shift;
+      if (uii <= 127) {
+	return vint32;
+      }
+      shift += 7;
+      // currently don't check for shift >= 32 (that's what validate_vint31()
+      // is for).
+    }
+  }
+  return 0x80000000U;
+}
+
+// Input must be validated, or bufp must be >= 5 characters before the end of
+// the read buffer.
+// todo: check if this has enough of a speed advantage over get_vint31() to
+// justify using this in the main loops and catching SIGSEGV.  (update: using
+// this over get_vint31() provides a ~3% speed advantage for
+// load-and-recompress on the big test dataset.)
+HEADER_INLINE uint32_t get_vint31_unsafe(const unsigned char** bufpp) {
+  uint32_t vint32 = *(*bufpp)++;
+  if (vint32 <= 127) {
+    return vint32;
+  }
+  vint32 &= 127;
+  for (uint32_t shift = 7; shift < 32; shift += 7) {
+    uint32_t uii = *(*bufpp)++;
+    vint32 |= (uii & 127) << shift;
+    if (uii <= 127) {
+      return vint32;
+    }
+  }
+  return 0x80000000U;
+}
+
+// Does not update buf_ptr.
+HEADER_INLINE uint32_t peek_vint31(const unsigned char* buf_ptr, const unsigned char* buf_end) {
+  if (buf_end > buf_ptr) {
+    uint32_t vint32 = *buf_ptr++;
+    if (vint32 <= 127) {
+      return vint32;
+    }
+    vint32 &= 127;
+    uint32_t shift = 7;
+    while (buf_end > buf_ptr) {
+      uint32_t uii = *buf_ptr++;
+      vint32 |= (uii & 127) << shift;
+      if (uii <= 127) {
+	return vint32;
+      }
+      shift += 7;
+    }
+  }
+  return 0x80000000U;
+}
+
+/*
+HEADER_INLINE uint32_t fget_vint31(FILE* ff) {
+  // Can't be used when multiple threads are reading from ff.
+  uint32_t vint32 = getc_unlocked(ff);
+  if (vint32 <= 127) {
+    return vint32;
+  }
+  vint32 &= 127;
+  for (uint32_t shift = 7; shift < 32; shift += 7) {
+    uint32_t uii = getc_unlocked(ff);
+    vint32 |= (uii & 127) << shift;
+    if (uii <= 127) {
+      return vint32;
+    }
+  }
+  return 0x80000000U;
+}
+
+HEADER_INLINE void fput_vint31(uint32_t uii, FILE* ff) {
+  // caller's responsibility to periodically check ferror
+  while (uii > 127) {
+    putc_unlocked((uii & 127) + 128, ff);
+    uii >>= 7;
+  }
+  putc_unlocked(uii, ff);
+}
+*/
+
+// main batch size
+CONSTU31(kPglQuaterTransposeBatch, kQuatersPerCacheline);
+
+// word width of each matrix row
+CONSTU31(kPglQuaterTransposeWords, kWordsPerCacheline);
+
+CONSTU31(kPglQuaterTransposeBufbytes, (kPglQuaterTransposeBatch * kPglQuaterTransposeBatch) / 2);
+CONSTU31(kPglQuaterTransposeBufwords, kPglQuaterTransposeBufbytes / kBytesPerWord);
+// up to 256x256; vecaligned_buf must have size 32k
+// write_iter must be allocated up to at least
+//   round_up_pow2(write_batch_size, 2) rows
+void transpose_quaterblock(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, vul_t* vecaligned_buf);
+
+CONSTU31(kPglBitTransposeBatch, kBitsPerCacheline);
+CONSTU31(kPglBitTransposeWords, kWordsPerCacheline);
+CONSTU31(kPglBitTransposeBufbytes, (kPglBitTransposeBatch * kPglBitTransposeBatch) / (CHAR_BIT / 2));
+CONSTU31(kPglBitTransposeBufwords, kPglBitTransposeBufbytes / kBytesPerWord);
+// up to 512x512; vecaligned_buf must have size 64k
+// write_iter must be allocated up to at least
+//   round_up_pow2(write_batch_size, 2) rows
+void transpose_bitblock(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, vul_t* vecaligned_buf);
+
+// replaces each x with (32768 - x)
+// okay for dosage_vals to be nullptr if dosage_ct == 0
+void biallelic_dosage16_invert(uint32_t dosage_ct, uint16_t* dosage_vals);
+
+void genovec_to_missingness_unsafe(const uintptr_t* __restrict genovec, uint32_t sample_ct, uintptr_t* __restrict missingness);
+
+// ----- end plink2_common subset -----
+
+// other configuration-ish values
+// this part of the specification is set in stone.
+
+CONSTU31(kPglVblockSize, 65536);
+
+// currently chosen so that it plus kPglFwriteBlockSize is < 2^32
+static const uint32_t kPglMaxBytesPerVariant = 0xfffdffc0U;
+// CONSTU31(kPglMaxBytesPerDataTrack, 0x7ffff000);
+// static_assert(kMaxBytesPerIO >= (int32_t)kPglMaxBytesPerDataTrack, "pgenlib_internal assumes a single variant data track always fits in one fread/fwrite operation.");
+
+// mmap is a horrible idea for 32-bit builds, and as long as we have non-mmap
+// code we may as well not worry about Win64 CreateFileMapping.
+
+// also, OS X mmap implementation seems to be crappy for large sequentially
+// accessed files, compared to Linux.
+
+// possible todo: SIGBUS handling?  do we ever want to try to recover from an
+// I/O error?
+#if defined(_WIN32) || !defined(__LP64__)
+  #define NO_MMAP
+#endif
+
+// currently must be power of 2, and multiple of (kBitsPerWord / 2)
+CONSTU31(kPglDifflistGroupSize, 64);
+
+// Flagset conventions:
+// * Each 32-bit and 64-bit flagset has its own type, which is guaranteed to be
+//   the appropriate width.  (Todo: verify that bit 31 works properly in 32-bit
+//   case.)
+// * Constant flag names start with "kf[CamelCase description]", followed by a
+//   description that shouldn't suck too badly.  The zero flagset is always
+//   named kf[CamelCase description]0.
+// * The type name is always of the form [snake_case description]_flags_t.
+// * To gain the desired level of type-checking under C++11 without pointless
+//   verbosity, &, |, ^, ~, &=, |=, and ^= operations are defined; [my_flags_t
+//   variable] |= [another my_flags_t variable] & [a my_flags_t constant] works
+//   without an explicit cast.  (Defining "struct my_flags_t" separately from
+//   the enum global-scope-constants container is necessary to make |= work
+//   without a cast.  inline is needed due to duplicate operator definitions
+//   across multiple files.)
+// * To slightly reduce the chance of breakage under C99/C++03, the enum is
+//   nameless; the flagset type is just a uint32_t/uint64_t alias.  This is
+//   because the C99 and C++03 specs do not provide enough control over the
+//   enum base type to make it safe for the enum to serve as the flagset type.
+// * Implicit conversion to int is not prevented for now, since I'm trying to
+//   keep pglerr_t-style code duplication to a minimum.
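+// * Example (hypothetical flagset following these conventions):
+//     FLAGSET_DEF_START()
+//       kfExample0,
+//       kfExampleFirst = (1 << 0),
+//       kfExampleSecond = (1 << 1)
+//     FLAGSET_DEF_END(example_flags_t);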
+#if __cplusplus >= 201103L
+
+  // could avoid the typedef here, but that leads to a bit more verbosity.
+  #define FLAGSET_DEF_START() typedef enum : uint32_t {
+  #define FLAGSET_DEF_END(tname) } tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ ; \
+  \
+inline tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ operator|(tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ aa, tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ bb) { \
+  return static_cast<tname ## _PGENLIB_INTERNAL_DO_NOT_USE__>(static_cast<uint32_t>(aa) | static_cast<uint32_t>(bb)); \
+} \
+  \
+inline tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ operator&(tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ aa, tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ bb) { \
+  return static_cast<tname ## _PGENLIB_INTERNAL_DO_NOT_USE__>(static_cast<uint32_t>(aa) & static_cast<uint32_t>(bb)); \
+} \
+  \
+inline tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ operator^(tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ aa, tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ bb) { \
+  return static_cast<tname ## _PGENLIB_INTERNAL_DO_NOT_USE__>(static_cast<uint32_t>(aa) ^ static_cast<uint32_t>(bb)); \
+} \
+  \
+inline tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ operator~(tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ aa) { \
+  return static_cast<tname ## _PGENLIB_INTERNAL_DO_NOT_USE__>(~static_cast<uint32_t>(aa)); \
+} \
+  \
+struct tname { \
+  tname() {} \
+  \
+  tname(const tname& source) : value_(source.value_) {} \
+  \
+  tname(const tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ source) : value_(static_cast<uint32_t>(source)) {} \
+  \
+  explicit tname(uint32_t source) : value_(source) {} \
+  \
+  operator tname ## _PGENLIB_INTERNAL_DO_NOT_USE__() const { \
+    return static_cast<tname ## _PGENLIB_INTERNAL_DO_NOT_USE__>(value_); \
+  } \
+  \
+  tname& operator|=(const tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ rhs) { \
+    value_ |= rhs; \
+    return *this; \
+  } \
+  \
+  tname& operator&=(const tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ rhs) { \
+    value_ &= rhs; \
+    return *this; \
+  } \
+  \
+  tname& operator^=(const tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ rhs) { \
+    value_ ^= rhs; \
+    return *this; \
+  } \
+  \
+private: \
+  uint32_t value_; \
+}
+
+  #define FLAGSET64_DEF_START() typedef enum : uint64_t {
+  #define FLAGSET64_DEF_END(tname) } tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ ; \
+  \
+inline tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ operator|(tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ aa, tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ bb) { \
+  return static_cast<tname ## _PGENLIB_INTERNAL_DO_NOT_USE__>(static_cast<uint64_t>(aa) | static_cast<uint64_t>(bb)); \
+} \
+  \
+inline tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ operator&(tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ aa, tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ bb) { \
+  return static_cast<tname ## _PGENLIB_INTERNAL_DO_NOT_USE__>(static_cast<uint64_t>(aa) & static_cast<uint64_t>(bb)); \
+} \
+  \
+inline tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ operator^(tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ aa, tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ bb) { \
+  return static_cast<tname ## _PGENLIB_INTERNAL_DO_NOT_USE__>(static_cast<uint64_t>(aa) ^ static_cast<uint64_t>(bb)); \
+} \
+  \
+inline tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ operator~(tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ aa) { \
+  return static_cast<tname ## _PGENLIB_INTERNAL_DO_NOT_USE__>(~static_cast<uint64_t>(aa)); \
+} \
+  \
+struct tname { \
+  tname() {} \
+  \
+  tname(const tname& source) : value_(source.value_) {} \
+  \
+  tname(const tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ source) : value_(static_cast<uint64_t>(source)) {} \
+  \
+  explicit tname(uint64_t source) : value_(source) {} \
+  \
+  operator tname ## _PGENLIB_INTERNAL_DO_NOT_USE__() const { \
+    return static_cast<tname ## _PGENLIB_INTERNAL_DO_NOT_USE__>(value_); \
+  } \
+  \
+  tname& operator|=(const tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ rhs) { \
+    value_ |= rhs; \
+    return *this; \
+  } \
+  \
+  tname& operator&=(const tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ rhs) { \
+    value_ &= rhs; \
+    return *this; \
+  } \
+  \
+  tname& operator^=(const tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ rhs) { \
+    value_ ^= rhs; \
+    return *this; \
+  } \
+  \
+private: \
+  uint64_t value_; \
+}
+
+  #define ENUM_U31_DEF_START() typedef enum : uint32_t {
+  #define ENUM_U31_DEF_END(tname) } tname
+
+#else
+
+  #define FLAGSET_DEF_START() enum {
+  #define FLAGSET_DEF_END(tname) } ; \
+typedef uint32_t tname
+
+  // don't use a nameless enum here, since we want to be able to static_assert
+  // the enum size.
+  // best to artificially add an element to the end for now to force width to
+  // 64-bit, otherwise gcc actually shrinks it even when the constants are
+  // defined with LLU.
+  #define FLAGSET64_DEF_START() typedef enum {
+  #define FLAGSET64_DEF_END(tname) , \
+  tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ALL_64_SET__ = ~(0LLU) } tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ ; \
+static_assert(sizeof(tname ## _PGENLIB_INTERNAL_DO_NOT_USE__) == 8, "64-bit flagset constants are not actually uint64_ts."); \
+typedef uint64_t tname
+
+  #define ENUM_U31_DEF_START() typedef enum {
+  #define ENUM_U31_DEF_END(tname) } tname ## _PGENLIB_INTERNAL_DO_NOT_USE__ ; \
+typedef uint32_t tname
+
+#endif
+
+FLAGSET_DEF_START()
+  kfPgenGlobal0,
+  kfPgenGlobalLdCompressionPresent = (1 << 0),
+  kfPgenGlobalDifflistOrLdPresent = (1 << 1),
+  kfPgenGlobalHardcallPhasePresent = (1 << 2),
+  kfPgenGlobalDosagePresent = (1 << 3),
+  kfPgenGlobalDosagePhasePresent = (1 << 4),
+  kfPgenGlobalAllNonref = (1 << 5)
+FLAGSET_DEF_END(pgen_global_flags_t);
+
+FLAGSET_DEF_START()
+  kfPgrLdcache0,
+  kfPgrLdcacheQuater = (1 << 0),
+  kfPgrLdcacheDifflist = (1 << 1),
+  kfPgrLdcacheAllHets = (1 << 2),
+  kfPgrLdcacheRefalt1Genocounts = (1 << 3)
+FLAGSET_DEF_END(pgr_ldcache_flags_t);
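+
+// Editor's illustrative sketch (not part of the library): with the macros
+// above, flagset values combine with the usual bitwise idioms, without
+// explicit casts under C++11 and as plain uint32_t arithmetic under
+// C99/C++03.
+//
+//   pgen_global_flags_t gflags = kfPgenGlobal0;
+//   gflags |= kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePresent;
+//   if (gflags & kfPgenGlobalDosagePresent) {
+//     gflags &= ~kfPgenGlobalDosagePresent;  // clear a single flag
+//   }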
+
+// difflist/LD compression should never involve more than
+//   raw_sample_ct / kPglMaxDifflistLenDivisor
+// entries.  (however, returned difflists can have up to twice as many entries,
+// when a variant is LD-compressed and the reference variant is
+// difflist-compressed.)
+CONSTU31(kPglMaxDifflistLenDivisor, 8);
+
+// threshold for using a deltalist to represent a bitarray on disk (currently
+// relevant for dosage data)
+CONSTU31(kPglMaxDeltalistLenDivisor, 9);
+
+// The actual format:
+// 1. 2 magic bytes 0x6c 0x1b.
+//
+// 2. Mode byte.
+//      0x01 = plink1 variant-major.
+//      0x02 = plink2 basic variant-major.  variant/sample counts in header,
+//             00 = hom ref, 01 = het, 10 = hom alt, 11 = missing.  (vrtype 0)
+//      0x03 = plink2 basic unphased dosage (vrtype 0x40)
+//      0x04 = plink2 basic phased dosage (vrtype 0xc0)
+//      These are designed to be easy to write.  Note that the dosage formats
+//      require hardcalls to be stored as well; however, you can just set them
+//      to all-missing and then use
+//        plink2 --hard-call-threshold [...] --make-pgen
+//      to populate them.
+//
+//      0x10 = variable-type and/or variable-length records present.
+//      0x11 = mode 0x10, but with phase set information at the end of the
+//             file.
+//      larger values, and 0x05..0x0f, reserved for now.
+//
+// 3. If not plink1-format,
+//    a. 4-byte # of variants; call this M.
+//    b. 4-byte # of samples, call this N.
+//    c. Additional 1-byte header "control" value (pgen_header_ctrl_t).  May be
+//       extended in the future.
+//       bits 0-3: Indicates vrtype and variant record length storage widths.
+//         If bit 3 is unset, bits 0-1 store (vrec_len_byte_ct - 1), while bit
+//         2 is set iff phase or dosage info is present (requiring 8 bits
+//         instead of 4 bits for vrtypes).
+//         If bit 3 is set, a specialized encoding is used which combines the
+//         two pieces of information (reducing the overhead for files with few
+//         samples).  The following encodings are currently defined:
+//         1000: No difflist/LD/onebit compression, 2-bit
+//               (vrec_len - ceil(sample_ct / 4)) value.  vrtype is zero if
+//               the entry is zero, and 8 (multiallelic) if the record has 1-3
+//               extra bytes.  Designed for single-sample files sharing a
+//               single .bim-like file (note that if they don't share a .bim,
+//               .bim size will dominate), but it's usable whenever there's no
+//               variant where >2 samples have a rare alternate allele
+//               (assuming <16 alt alleles).
+//         1001: No difflist/LD/onebit compression, 4-bit
+//               (vrec_len - ceil(sample_ct / 4)) value.  vrtype is zero if the
+//               entry is zero, and 8 if the record has 1-15 extra bytes.
+//       bits 4-5: alt allele count storage (00 = unstored, 01-11 = bytes per
+//                 count)
+//       bits 6-7: nonref flags info (00 = unstored, 01 = all ref/alt, 10 =
+//                 never ref/alt, 11 = explicitly stored)
+//       Bits 0-5 do not apply to the fixed-length modes (currently 0x02-0x04)
+//       and should be zeroed out in that case.
+//
+// 4. If mode 0x10/0x11,
+//    a. Array of 8-byte fpos values for the first variant in each vblock.
+//       (Note that this suggests a way to support in-place insertions: some
+//       unused space can be left between the vblocks.)
+//    b. Sequence of header blocks, each containing information about
+//       kPglVblockSize variants (except the last may be shorter).  All values
+//       are known-width, to allow e.g. plink2 --make-pgen/--pmerge to compress
+//       all variant records first, then fseek to the beginning of the output
+//       file and write the header.
+//         i. array of 4-bit or 1-byte vrtypes.
+//        ii. array of variant record lengths (each occupying vrec_len_byte_ct
+//            bytes, or 4 bits).
+//       iii. if bits 4-5 of {3c} aren't 00, array of alt allele counts.
+//        iv. nonref flags info, if explicitly stored
+//      (this representation allows more efficient random access)
+//    If mode 0x02-0x04, and nonref flags info explicitly stored, just that
+//    bitarray.
+//
+// 5. The variant records.  See below for details.
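+//
+// Worked example (editor's sketch, assuming little-endian count storage): a
+// mode-0x02 file with M=3 variants and N=5 samples, nonref flags unstored,
+// starts with the 12 header bytes
+//   6c 1b 02  03 00 00 00  05 00 00 00  00
+// (the control byte is zero: bits 0-5 must be zeroed for the fixed-length
+// modes, and bits 6-7 = 00 = unstored), followed by three variant records of
+// ceil(5/4) = 2 bytes each.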
+
+// Difflist format (used for onebit, sparse variant, and LD compression):
+//   a. [difflist_len VINT]
+//   If difflist_len is zero, that's it.  Otherwise, the difflist is organized
+//   into 64-element groups (the last group will usually be smaller), to make
+//   extraction of e.g. a single sample less painful.  Note that with 20k
+//   samples, a difflist is space-saving even with MAF 5%:
+//     ~1/400 hom alt + ~38/400 het = (~39/400) * 20k
+//                                  = ~1950 sample IDs.
+//     that's 31 groups, requiring about 2 + 62 + 30 + 488 + 1919 = 2501 bytes
+//     (can be slightly higher since a few ID deltas may be larger than 127);
+//     uncompressed storage requires 5000 bytes.
+//   b. [array of group start sample IDs, each of sample_id_byte_ct]
+//   c. [array of 1-byte [delta segment lengths minus 63], with last entry
+//      omitted]
+//   d. [array of 2-bit replacement genotype values]
+//   e. one "delta segment"/group: [array of [group size - 1] VINT values,
+//      each indicating the difference between the current and previous sample
+//      IDs; i.e. value is 1 for two consecutive samples]
+//   f. [if multiallelic, array of appropriate-length disambiguation values]
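+//
+// Worked example (editor's sketch): a biallelic difflist with replacement
+// genotype 0b10 at sample IDs {5, 130, 131} would be stored as
+//   a. VINT difflist_len = 3
+//   b. one group start sample ID: 5
+//   c. (empty: there is only one group, and the last entry is omitted)
+//   d. one byte of packed 2-bit values: 0b00101010
+//   e. two VINT deltas: 125 (= 130 - 5) and 1 (= 131 - 130)
+// assuming single-byte VINT encodings for these small values.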
+
+
+// pgen_file_info_t and pgen_reader_t are the main exported "classes".
+// Exported functions involving these data structures should all have
+// "pgfi"/"pgr" in their names.
+
+struct Pgen_file_info_struct {
+  // ----- Header information, constant after initialization -----
+  uint32_t raw_variant_ct;
+  uint32_t raw_sample_ct;
+
+  // 0 if variant records aren't all the same length.
+  // If they are (e.g. PLINK 1 encoding; or vrtype bits 0-5 unset), we just
+  // fseek to
+  //   const_fpos_offset + const_vrec_width * ((uint64_t)variant_idx).
+  uint64_t const_fpos_offset;
+
+  uint32_t const_vrec_width;
+  
+  // see below.  positioned here instead of slightly later due to struct
+  // packing behavior.
+  uint32_t const_vrtype; // 256 for plink 1 encoding, 0xffffffffU for nonconst
+
+  // size (raw_variant_ct + 1), so that the number of bytes of (zero-based)
+  // variant n is var_fpos[n+1] - var_fpos[n].  nullptr if
+  // const_vrec_width is nonzero.
+  // It's not difficult to save some memory here (e.g. unless we're dealing
+  // with >256 TB files, it's trivial to go from 8 bytes down to 6 bytes per
+  // entry), but I doubt that's worth the trouble; let's worry about
+  // O(mn)-or-worse stuff, and on-disk stuff, first.
+  uint64_t* var_fpos;
+
+  // representation type codes.
+  //
+  // bits 0-2:
+  //   000 = Simple 2-bit encoding.
+  //   100, 110, 111 = Simple difflist.  Low two bits store the base value.
+  //         for multiallelic variants, if the base value is 0b11 (missing),
+  //         auxiliary data track #1 only contains entries for explicitly
+  //         listed 0b11 values, the rest are assumed to be actual missing
+  //         data.  (101 should practically never happen--gross violation of
+  //         Hardy-Weinberg equilibrium--so it's reserved for future use.)
+  //   010 = Differences-from-earlier-variant encoding ("LD compression").  The
+  //         last variant without this type of encoding is the base.
+  //         To simplify random access logic, the first variant in each vblock
+  //         is prohibited from using this encoding.
+  //   011 = Inverted differences-from-earlier-variant encoding.  (This covers
+  //         the case where a reference allele is "wrong".)  When decoding, the
+  //         difflist should be processed first, then the entire genovec should
+  //         be flipped.
+  //   001 = 1-bit + difflist representation.  Suppose most calls are
+  //         hom ref or het (e.g. a 20% MAF variant with ~4% hom alt1, ~32%
+  //         het ref/alt1, ~64% hom ref), then the main datatrack has just the
+  //         low bits of the usual 2-bit codes.  This is followed by a difflist
+  //         containing the hom alt1 and missing genotypes.
+  //         The main datatrack is preceded by a single byte indicating what
+  //         the two common values are: 2 low bits = [set value - unset value],
+  //         next 2 bits = unset value (6 possibilities).  Top 4 bits are
+  //         reserved.  When the set value is 3, it does NOT represent
+  //         rarealts; those must be explicitly spelled out in the difflist.
+  // bit 3: more than 1 alt allele?
+  // bit 4: hardcall phased?  if yes, auxiliary data track #2 contains
+  //        phasing information for heterozygous calls.
+  //        The first *bit* of the track indicates whether an explicit
+  //        "phasepresent" bitarray is stored.  If it's set, the next het_ct
+  //        bits are 1-bit values, where 0 = no phasing info known, and 1 =
+  //        phasing info present.  If it's unset, phasing info is present for
+  //        every het call.
+  //        This is followed by a "phaseinfo" bitarray, where 0 = unswapped,
+  //        1 = swapped (e.g. "1|0" in VCF).
+  //        This track is normally unpacked into fixed-size bitarrays when
+  //        loaded, but a raw mode is also provided (which doesn't support
+  //        subsetting).
+  //        By default, entire chromosomes/contigs are assumed to be phased
+  //        together.  (Todo: support contiguous phase sets.)
+  //
+  // bits 5-6:
+  //   00 = no dosage data.
+  //   01 = dosage list.  auxiliary data track #3 contains a delta-encoded list
+  //        of sample IDs (like a difflist, but with no genotypes).  if dosage
+  //        is unphased, track #5 contains a 16-bit value (range 0..2^15; the
+  //        missing value 65535 is only permitted in the unconditional-dosage
+  //        case) for each allele except the last alt; if it's phased, it uses
+  //        the order
+  //          [hap1 ref prob] [hap2 ref prob] [hap1 alt1 prob] ...
+  //        where the values are in 0..2^14 (to minimize rounding headaches).
+  //        Note that this and the other dosage modes are in ADDITION to
+  //        hardcalls.  This increases filesize by up to 12.5%, but makes the
+  //        reader substantially simpler; --hard-call-threshold logic is nicely
+  //        compartmentalized.
+  //   10 = unconditional dosage (just track #5).
+  //   11 = dosage bitarray.  in this case, auxiliary data track #3 contains an
+  //        array of 1-bit values indicating which samples have dosages.
+  //   bgen 1.2 format no longer permits fractional missingness, so no good
+  //   reason for us to support it.
+  //   considered putting *all* dosage data at the end of the file (like I will
+  //   do for phase set info); this could actually be worthwhile for
+  //   unconditional dosages, but it doesn't work well when only some samples
+  //   have dosage data.
+  // bit 7: some dosages phased?  if yes, and dosages are not unconditionally
+  //        present, auxiliary data track #4 is either a single zero byte
+  //        (indicating that all dosages are phased), or a bitarray of length
+  //        (dosage_ct + 1) where the first bit is set, and the other bits
+  //        indicate whether phase info is present for that sample (unset = no
+  //        phasing info)
+  //        note that this is independent of bit 4; either can be set without
+  //        the other.
+  //
+  // Representation of variable ploidy (MT) was considered, but rejected since
+  // dosages should be at least as appropriate for MT.
+  // Oxford/VCF-style storage of separate probabilities for every possible
+  // genotype (e.g. P(AA), P(AB), P(BB) instead of just 2P(AA) + P(AB) and
+  // 2P(BB) + P(AB)) is tentatively rejected due to (i) lack of relevance to
+  // PLINK's analysis functions and (ii) high storage cost where we can afford
+  // it least.  However, this is subject to reevaluation if (i) changes.
+  //
+  //
+  // base pointer is null if mode is 0x01-0x04 (const_vrtype != 0xffffffffU).
+  // if not nullptr, required to be length >=
+  //   max(raw_variant_ct + 1, round_up_pow2(raw_variant_ct, kBytesPerWord))
+  unsigned char* vrtypes;
+
+  // alt allele counts.  if >1, auxiliary data track #1 disambiguates all the
+  // "missing or rare alt" explicit hardcalls.  genotype representation is:
+  //   low bits: smaller [1-based alt allele idx], 0 = ref
+  //   high bits: larger [1-based alt allele idx]
+  //   ...
+  //   2 alts: 1-bit array with 0 = missing, 1 = nonmissing.  Then, for the
+  //     nonmissing subset, 2 low bits.  The high bits entry is omitted because
+  //     the value has to be alt2; optimize the common case!
+  //   3 alts: 1-bit nonmissingness array, then 2 low bits, 2 high bits
+  //   4-15 alts: 1-bit nonmissingness array, then 4 low bits, 4 high bits
+  //   16-255 alts: 1-bit nonmissingness array; then 8 low bits, 8 high bits
+  //   (the following is also defined, but not implemented for now:
+  //   256-4095 alts: 1-bit nonmissingness array; 12 low bits, 12 high bits
+  //   4096-65535 alts: 1-bit nonmissingness array; 16 low bits, 16 high bits
+  //   65536-16777215 alts: 1-bit nonmissingness array; 24 low bits, 24 high
+  //   bits; the latter might be necessary in the most variable regions if we
+  //   use tiles...)
+  // This can be nullptr if all alt allele counts are 1.
+  // (actually, we store the allele index offsets, so
+  // (allele_idx_offsets[n+1] - allele_idx_offsets[n]) is the number of alleles
+  // for variant n.  Otherwise, we'd need another data structure to support
+  // fast allele name lookup.)
+  uintptr_t* allele_idx_offsets;
+
+  uintptr_t* nonref_flags;
+
+  // If nonref_flags is nullptr and kfPgenGlobalAllNonref is unset, all
+  // reference alleles are assumed to be correct.
+  pgen_global_flags_t gflags;
+  
+  uint32_t max_alt_allele_ct;
+  uint32_t max_dosage_alt_allele_ct;
+
+  // * nullptr if using mmap
+  // * if using per-variant fread(), this is non-null during Pgen_file_info
+  //   initialization, but it's then "moved" to the first Pgen_reader and set
+  //   to nullptr.
+  FILE* shared_ff;
+  
+  const unsigned char* block_base; // nullptr if using per-variant fread()
+  uint64_t block_offset; // 0 for mmap
+#ifndef NO_MMAP
+  uint64_t file_size;
+#endif
+};
+
+typedef struct Pgen_file_info_struct pgen_file_info_t;
+
+struct Pgen_reader_struct {
+  // would like to make this const, but that makes initialization really
+  // annoying in C99
+  struct Pgen_file_info_struct fi;
+  
+  // ----- Mutable state -----
+  // If we don't fseek, what's the next variant we'd read?  (Still relevant
+  // with mmap due to how LD decompression is implemented.)
+  uint32_t fp_vidx;
+
+  // ** per-variant fread()-only **
+  FILE* ff;
+  unsigned char* fread_buf;
+  // ** end per-variant fread()-only **
+  
+  // if LD compression is present, cache the last non-LD-compressed variant
+  uint32_t ldbase_vidx;
+
+  // flags indicating which base_variant buffers are populated
+  pgr_ldcache_flags_t ldbase_stypes;
+  
+  uint32_t ldbase_difflist_len;
+
+  // these should be treated as private after initial allocation.
+  // not currently guaranteed to have trailing zeroes.
+  uintptr_t* ldbase_genovec;
+  uintptr_t* ldbase_raregeno;
+
+  // when ldbase_difflist_sample_ids[] is initialized, element
+  // [ldbase_difflist_len] must be set to sample_ct.
+  uint32_t* ldbase_difflist_sample_ids;
+
+  uintptr_t* ldbase_all_hets;
+  
+  // common genotype can be looked up from vrtypes[]
+
+  uint32_t ldbase_refalt1_genocounts[4];
+  
+  uintptr_t* workspace_vec; // must hold raw_sample_ct entries
+  
+  // currently must hold (raw_sample_ct / kPglMaxDifflistLenDivisor)
+  // entries; may need to double the sizes later
+  // some top-level interface functions use these, so several lower-level
+  // functions cannot
+  uintptr_t* workspace_raregeno_vec;
+  uint32_t* workspace_difflist_sample_ids;
+
+  // must hold (raw_sample_ct / kPglMaxDifflistLenDivisor) entries
+  uintptr_t* workspace_raregeno_tmp_loadbuf;
+  uint32_t* workspace_difflist_sample_ids_tmp;
+
+  uintptr_t* workspace_aux1_nonmissing_vec;
+  uintptr_t* workspace_aux1_code_vec;
+
+  uintptr_t* workspace_all_hets;
+
+  uint32_t* workspace_ambig_sample_ids;
+  uint32_t workspace_ambig_id_ct;
+
+  uintptr_t* workspace_dosage_present;  
+  uintptr_t* workspace_dosage_phased;
+  
+  // phase set loading (mode 0x11) unimplemented for now; should be a sequence
+  // of (sample ID, [uint32_t phase set begin, set end), [set begin, set end),
+  // ...).
+};
+
+typedef struct Pgen_reader_struct pgen_reader_t;
+
+// might want this value to be typed...
+CONSTU31(kPglVrtypePlink1, 256);
+
+HEADER_INLINE uint32_t get_pgfi_vrtype(const pgen_file_info_t* pgfip, uint32_t vidx) {
+  if (pgfip->vrtypes) {
+    return pgfip->vrtypes[vidx];
+  }
+  return pgfip->const_vrtype;
+}
+
+HEADER_INLINE uint64_t get_pgfi_fpos(const pgen_file_info_t* pgfip, uintptr_t vidx) {
+  if (pgfip->var_fpos) {
+    return pgfip->var_fpos[vidx];
+  }
+  return pgfip->const_fpos_offset + pgfip->const_vrec_width * ((uint64_t)vidx);
+}
+
+HEADER_INLINE uint32_t get_pgfi_vrec_width(const pgen_file_info_t* pgfip, uint32_t vidx) {
+  if (pgfip->var_fpos) {
+    return (uint32_t)(pgfip->var_fpos[vidx + 1] - pgfip->var_fpos[vidx]);
+  }
+  return pgfip->const_vrec_width;
+}
+
+HEADER_INLINE uint32_t pgfi_is_simple_format(const pgen_file_info_t* pgfip) {
+  return (pgfip->const_vrtype != 0xffffffffU);
+}
+
+HEADER_INLINE uint32_t vrtype_difflist(uint32_t vrtype) {
+  return (vrtype & 4);
+}
+
+HEADER_INLINE uint32_t vrtype_ld_compressed(uint32_t vrtype) {
+  return (vrtype & 6) == 2;
+}
+
+HEADER_INLINE uint32_t vrtype_multiallelic(uint32_t vrtype) {
+  return (vrtype & 8);
+}
+
+HEADER_INLINE uint32_t vrtype_hphase(uint32_t vrtype) {
+  return (vrtype & 0x10);
+}
+
+HEADER_INLINE uint32_t vrtype_aux_tracks_present(uint32_t vrtype) {
+  return (vrtype & 0x78);
+}
+
+HEADER_INLINE uint32_t vrtype_variable_width(uint32_t vrtype) {
+  return (vrtype & 0x3e);
+}
+
+HEADER_INLINE uint32_t vrtype_dosage(uint32_t vrtype) {
+  return (vrtype & 0x60);
+}
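+
+// Decoding example (editor's note): vrtype 0x64 = 0b01100100 is a simple
+// difflist with base genotype 00 (bits 0-2 = 100), no multiallelic or
+// hardcall-phase tracks, and a dosage bitarray (bits 5-6 = 11); so
+// vrtype_difflist(0x64) and vrtype_dosage(0x64) are nonzero while
+// vrtype_ld_compressed(0x64) is zero.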
+
+HEADER_INLINE uintptr_t get_aux1_allele_bytect(uint32_t alt_allele_ct, uint32_t aux1_nonmissing_ct) {
+  assert(alt_allele_ct >= 2);
+  if (alt_allele_ct == 2) {
+    return DIV_UP(aux1_nonmissing_ct, 4);
+  }
+  if (alt_allele_ct == 3) {
+    return DIV_UP(aux1_nonmissing_ct, 2);
+  }
+  // one byte per entry for alt_allele_ct < 16, two bytes for 16..255
+  return ((alt_allele_ct >= 16) + 1) * aux1_nonmissing_ct;
+  // todo: alt_allele_ct > 255
+}
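+
+// Example (editor's note): with alt_allele_ct == 2 and 10 nonmissing aux1
+// entries, the packed 2-bit codes occupy DIV_UP(10, 4) = 3 bytes.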
+
+// pgen_file_info_t initialization is split into two phases, to decouple
+// plink2's arena allocator from this library.
+//
+// Phase 1: Open the .pgen; verify that the initial bytes are consistent with
+//   the file format; load/verify sample and variant counts, initialize
+//   pgfi.const_vrtype, pgfi.const_vrec_width, and pgfi.const_fpos_offset;
+//   determine initial memory allocation requirement.  first_alloc_cacheline_ct
+//   does not include allele counts and nonref flags, since it may be more
+//   appropriate to allocate those arrays earlier (during loading of a
+//   .bim-like file).
+//
+//   pgfi.var_fpos is set to nullptr if pgfi.const_vrec_width is nonzero.
+//   pgfi.vrtypes/var_allele_cts are set to nullptr in the plink1-format case.
+//
+//   raw_sample_ct and raw_variant_ct should be 0xffffffffU if not previously
+//   known.
+//
+// Intermission: Caller obtains a block of pgfi_alloc_cacheline_ct * 64 bytes,
+//   64-byte aligned.  The cachealigned_malloc() function can be used for this
+//   purpose.  If necessary, pgfi.allele_idx_offsets and pgfi.nonref_flags
+//   should be pointed at already-loaded data, or allocated so they can be
+//   loaded during phase 2.
+//
+// Phase 2: Initialize most pointers in the pgen_reader_t struct to appropriate
+//   positions in first_alloc.  For modes 0x10-0x11, load pgfi.var_fpos and
+//   pgfi.vrtypes, load/validate pgfi.allele_idx_offsets and pgfi.nonref_flags
+//   if appropriate, and initialize pgfi.gflags, pgfi.max_alt_allele_ct, and
+//   pgfi.max_dosage_alt_allele_ct.
+//
+// Finally, if block-fread mode is being used, pgfi.block_base must be
+//   initialized to point to a memory block large enough to handle the largest
+//   pgfi_block_read() operation that will be attempted.
+//   pgfi_blockload_get_cacheline_req() can be used to determine the necessary
+//   buffer size.
+
+// This type may change if we introduce a more read-optimized format in the
+// future.  Right now it just tracks the presence/absence of two optional
+// pieces of information: allele counts and nonref flags.
+typedef uint32_t pgen_header_ctrl_t;
+
+void pgfi_preinit(pgen_file_info_t* pgfip);
+
+// There are three modes of operation:
+// 1. mmaped file.  Appropriate for handling multiple queries across different
+//    parts of the genome in parallel.  Suboptimal for whole-genome queries.
+//    Doesn't currently run on Windows.
+// 2. fread block-load.  Block-load operations are single-threaded, while
+//    decompression/counting is multithreaded.  Appropriate for whole-genome
+//    queries, since even with an SSD, reading from multiple parts of a file
+//    simultaneously doesn't work well.
+// 3. fread single-variant-at-a-time.  Simpler interface than block-load, and
+//    able to handle multiple simultaneous queries (which block-load is not),
+//    but less performant for CPU-heavy operations on the whole genome.
+//
+// To specify mode 1, pass in use_mmap == 1 here.
+// To specify mode 2, pass in use_mmap == 0 here, and use_blockload == 1 during
+//   phase2.
+// To specify mode 3, pass in use_mmap == 0 here, and use_blockload == 0 during
+//   phase2.
+pglerr_t pgfi_init_phase1(const char* fname, uint32_t raw_variant_ct, uint32_t raw_sample_ct, uint32_t use_mmap, pgen_header_ctrl_t* header_ctrl_ptr, pgen_file_info_t* pgfip, uintptr_t* pgfi_alloc_cacheline_ct_ptr, char* errstr_buf);
+
+// If allele_cts_already_loaded is set, but they're present in the file,
+// they'll be validated; similarly for nonref_flags_already_loaded.
+pglerr_t pgfi_init_phase2(pgen_header_ctrl_t header_ctrl, uint32_t allele_cts_already_loaded, uint32_t nonref_flags_already_loaded, uint32_t use_blockload, uint32_t vblock_idx_start, uint32_t vidx_end, uint32_t* max_vrec_width_ptr, pgen_file_info_t* pgfip, unsigned char* pgfi_alloc, uintptr_t* pgr_alloc_cacheline_ct_ptr, char* errstr_buf);
+
+
+uint64_t pgfi_multiread_get_cacheline_req(const uintptr_t* variant_include, const pgen_file_info_t* pgfip, uint32_t variant_ct, uint32_t block_size);
+
+// variant_include can be nullptr; in that case, we simply load all the
+// variants (load_variant_ct must be variant_uidx_end - variant_uidx_start).
+// IMPORTANT: pgfi.block_offset must be manually copied to each reader for now.
+//   (todo: probably replace pgr.fi with a pointer.  when doing that, need to
+//   ensure multiple per-variant readers still works.)
+pglerr_t pgfi_multiread(const uintptr_t* variant_include, uint32_t variant_uidx_start, uint32_t variant_uidx_end, uint32_t load_variant_ct, pgen_file_info_t* pgfip);
+
+
+void pgr_preinit(pgen_reader_t* pgrp);
+
+// Before pgr_init() is called, the caller must obtain a block of
+// pgr_alloc_cacheline_ct * 64 bytes (this value is returned by
+// pgfi_init_phase2), 64-byte aligned; this is the pgr_alloc parameter.
+//
+// There's also a modal usage difference:
+//
+// * Modes 1-2 (mmap, block-fread): There is one pgen_file_info_t per file
+//   which doesn't belong to any reader.  After it's initialized, multiple
+//   pgen_reader_ts can be based off of it.  When the pgen_file_info_t is
+//   destroyed, those pgen_reader_ts are invalidated and should be destroyed if
+//   that hasn't already happened.
+//
+//   fname parameter must be nullptr.
+//
+// * Mode 3 (per-variant fread): Destruction of the original pgen_file_info_t
+//   struct does not invalidate any extant pgen_reader_t instances (at least
+//   from pgenlib_internal's perspective).  Instead, destruction of the
+//   corresponding memory block or allele_idx_offsets/nonref_flags invalidates
+//   the associated pgen_reader_ts.
+//
+//   The only difference between the first reader and later readers of the same
+//   file is that the first reader steals the shared_ff used to read the
+//   header.
+//
+//   fname parameter must be non-null.
+
+pglerr_t pgr_init(const char* fname, uint32_t max_vrec_width, pgen_file_info_t* pgfip, pgen_reader_t* pgrp, unsigned char* pgr_alloc);
+
+// practically all these functions require genovec to be allocated up to
+// vector, not word, boundary
+void pgr_plink1_to_plink2_inplace_unsafe(uint32_t sample_ct, uintptr_t* genovec);
+
+void pgr_plink2_to_plink1_inplace_unsafe(uint32_t sample_ct, uintptr_t* genovec);
+
+void pgr_difflist_to_genovec_unsafe(const uintptr_t* __restrict raregeno, const uint32_t* difflist_sample_ids, uintptr_t difflist_common_geno, uint32_t sample_ct, uint32_t difflist_len, uintptr_t* __restrict genovec);
+
+// This will normally extract only the genotype indexes corresponding to set
+// bits in sample_include.  Set sample_ct == raw_sample_ct if you don't want
+// any subsetting to occur (in this case sample_include is ignored and can be
+// nullptr).
+// Only the maintrack is loaded.  00 = hom ref, 01 = het ref/alt1,
+// 10 = hom alt1, 11 = missing or anything else.
+// If multiallelic_relevant is set, and the current variant is multiallelic,
+// pgr.workspace_ambig_sample_ids and pgr.workspace_ambig_id_ct are updated.
+// "unsafe": sample_ct cannot be zero.  Trailing bits of genovec are not zeroed
+// out.
+// Ok if genovec only has space for sample_ct values.
+pglerr_t pgr_read_refalt1_genovec_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec);
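+
+// Editor's usage sketch for mode 3 (per-variant fread); error handling is
+// abbreviated, the errstr_buf size is an assumption, and the 64-byte-aligned
+// allocation steps are paraphrased (e.g. via the cachealigned_malloc()
+// mentioned above):
+//
+//   pgen_file_info_t pgfi;
+//   pgen_header_ctrl_t header_ctrl;
+//   uintptr_t pgfi_alloc_cacheline_ct;
+//   char errstr_buf[256];
+//   pgfi_preinit(&pgfi);
+//   if (pgfi_init_phase1("in.pgen", 0xffffffffU, 0xffffffffU, 0, &header_ctrl,
+//                        &pgfi, &pgfi_alloc_cacheline_ct, errstr_buf)) {
+//     // handle error
+//   }
+//   // obtain a 64-byte-aligned block of pgfi_alloc_cacheline_ct * 64 bytes;
+//   // call it pgfi_alloc
+//   uint32_t max_vrec_width;
+//   uintptr_t pgr_alloc_cacheline_ct;
+//   if (pgfi_init_phase2(header_ctrl, 0, 0, 0, 0, pgfi.raw_variant_ct,
+//                        &max_vrec_width, &pgfi, pgfi_alloc,
+//                        &pgr_alloc_cacheline_ct, errstr_buf)) {
+//     // handle error
+//   }
+//   pgen_reader_t pgr;
+//   pgr_preinit(&pgr);
+//   // obtain a 64-byte-aligned block of pgr_alloc_cacheline_ct * 64 bytes;
+//   // call it pgr_alloc
+//   if (pgr_init("in.pgen", max_vrec_width, &pgfi, &pgr, pgr_alloc)) {
+//     // handle error
+//   }
+//   // genovec: vector-aligned, >= QUATERCT_TO_WORDCT(raw_sample_ct) words.
+//   // no subsetting: per the comment above, sample_include may be nullptr.
+//   pgr_read_refalt1_genovec_subset_unsafe(nullptr, nullptr,
+//       pgfi.raw_sample_ct, 0, &pgr, genovec);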
+
+// Loads the specified variant as a difflist if that's more efficient, setting
+// difflist_common_geno to the common genotype value in that case.  Otherwise,
+// genovec is populated and difflist_common_geno is set to 0xffffffffU.
+//
+// Note that the returned difflist_len can be much larger than
+// max_simple_difflist_len when the variant is LD-encoded; it's bounded by
+//   2 * (raw_sample_ct / kPglMaxDifflistLenDivisor).
+pglerr_t pgr_read_refalt1_difflist_or_genovec_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr);
+
+// This is necessary when changing sample_include, unless the new query is
+// iterating from the first variant.  (Which can almost never be assumed in
+// plink2 since variant_include[] may not include the first variant.)
+HEADER_INLINE void pgr_clear_ld_cache(pgen_reader_t* pgrp) {
+  pgrp->ldbase_stypes &= kfPgrLdcacheAllHets;
+
+  // bugfix, ld_load_necessary() was otherwise claiming that reload wasn't
+  // necessary in certain cases
+  pgrp->ldbase_vidx = 0x80000000U;
+}
+
+pglerr_t pgr_get_refalt1_genotype_counts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uint32_t* genocounts);
+
+// allele_idx is set to 0 for ref, 1 for alt1, 2 for alt2, etc.
+// frequencies are computed on the fly.  ties are broken in favor of the
+// lower-indexed allele.
+// possible todo: also provide ..._common2_then_subset() function.
+// better default than the functions above for machine learning/GWAS, since the
+// reference allele is "wrong" sometimes.
+pglerr_t pgr_read_genovec_subset_then_common2(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uint32_t* __restrict maj_allele_idx_ptr, uint32_t* __restrict second_allele_idx_ptr, uint32_t* __restrict allele_ct_buf);
+
+// Loads a quatervec with counts of a single allele (allele_idx 0 corresponds
+// to the reference allele, allele_idx 1 corresponds to alt1, etc.).  0b11 ==
+// missing call.
+// Note that calling this with allele_idx == 0 is similar to a plink1 load
+// (except with missing == 0b11, of course).
+// todo: provide a difflist interface once anyone wants it.
+pglerr_t pgr_read_allele_countvec_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, pgen_reader_t* pgrp, uintptr_t* __restrict allele_countvec);
+
+// todo: add functions which directly support MAF-based queries.  Note that
+// when the difflist representation is used, we can disqualify some low-MAF
+// variants without actually loading the genotype data, since the size of the
+// record puts an upper bound on the alt allele frequency.
+
+// requires trailing bits of genovec to be zeroed out, AND does not update high
+// bits of last word if raw_sample_ctl2 is odd.
+void detect_genovec_hets_hw(const uintptr_t* __restrict genovec, uint32_t raw_sample_ctl2, halfword_t* __restrict all_hets_hw);
+
+// requires trailing bits of genovec to be zeroed out.
+HEADER_INLINE void pgr_detect_genovec_hets_unsafe(const uintptr_t* __restrict genovec, uint32_t raw_sample_ctl2, uintptr_t* __restrict all_hets) {
+  halfword_t* all_hets_alias = (halfword_t*)all_hets;
+  detect_genovec_hets_hw(genovec, raw_sample_ctl2, all_hets_alias);
+  if (raw_sample_ctl2 % 2) {
+    all_hets_alias[raw_sample_ctl2] = 0;
+  }
+}
+
+HEADER_INLINE void pgr_detect_genovec_hets(const uintptr_t* __restrict genovec, uint32_t raw_sample_ct, uintptr_t* __restrict all_hets) {
+  detect_genovec_hets_hw(genovec, QUATERCT_TO_WORDCT(raw_sample_ct), (halfword_t*)all_hets);
+  zero_trailing_bits(raw_sample_ct, all_hets);
+}
+
+// pglerr_t pgr_read_refalt1_genovec_hphase_raw_unsafe(uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict phaseraw, uint32_t* phasepresent_ct_ptr);
+
+pglerr_t pgr_read_refalt1_genovec_hphase_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* phasepresent_ct_ptr);
+
+// ok for both dosage_present and dosage_vals to be nullptr when no dosage data
+// is present
+pglerr_t pgr_read_refalt1_genovec_dosage16_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_vals, uint32_t* dosage_ct_ptr, uint32_t* is_explicit_alt1_ptr);
+
+pglerr_t pgr_get_ref_nonref_genotype_counts_and_dosage16s(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, double* mach_r2_ptr, uint32_t* genocounts, uint64_t* all_dosages);
+
+// ok for both dosage_present and dosage_vals to be nullptr when no dosage data
+// is present
+pglerr_t pgr_read_refalt1_genovec_hphase_dosage16_subset_unsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* phasepresent_ct_ptr, uintptr_t* __restrict dosage_present, uint16_t* dosage_vals, uint32_t* dosage_ct_ptr, uint32_t* is_explicit_alt1_ptr);
+
+// interface used by --make-pgen, just performs basic LD/difflist decompression
+// (still needs multiallelic and dosage-phase extensions)
+pglerr_t pgr_read_raw(uint32_t vidx, pgen_global_flags_t read_gflags, pgen_reader_t* pgrp, uintptr_t** loadbuf_iter_ptr, unsigned char* loaded_vrtype_ptr);
+
+pglerr_t pgr_validate(pgen_reader_t* pgrp, char* errstr_buf);
+
+// missingness bit is set iff hardcall is not present (even if dosage info *is*
+// present)
+pglerr_t pgr_read_missingness(const uintptr_t* __restrict sample_include, const uint32_t* sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict missingness, uintptr_t* __restrict genovec_buf);
+
+// either missingness_hc (hardcall) or missingness_dosage must be non-null
+pglerr_t pgr_read_missingness_multi(const uintptr_t* __restrict sample_include, const uint32_t* sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, pgen_reader_t* pgrp, uintptr_t* __restrict missingness_hc, uintptr_t* __restrict missingness_dosage, uintptr_t* __restrict hets, uintptr_t* __restrict genovec_buf);
+
+
+// failure = kPglRetReadFail
+boolerr_t pgfi_cleanup(pgen_file_info_t* pgfip);
+
+boolerr_t pgr_cleanup(pgen_reader_t* pgrp);
+
+
+struct Pgen_writer_common_struct {
+  uint32_t variant_ct;
+  uint32_t sample_ct;
+  pgen_global_flags_t phase_dosage_gflags; // subset of gflags
+
+  // there should be a single copy of these arrays shared by all threads.
+  // allele_idx_offsets is read-only.
+  uint64_t* vblock_fpos;
+  unsigned char* vrec_len_buf;
+  uintptr_t* vrtype_buf;
+  const uintptr_t* allele_idx_offsets;
+  uintptr_t* explicit_nonref_flags; // usually nullptr
+
+  // you can get a ~1-2% speedup by writing directly to genovec and swapping
+  // it with ldbase_genovec when appropriate; don't think that's worth
+  // supporting, given the messier API.
+  // uintptr_t* genovec;
+  
+  uint32_t ldbase_genocounts[4];
+
+  // should match ftello() return value in singlethreaded case, but be set to
+  // zero in multithreaded case
+  uint64_t vblock_fpos_offset;
+  
+  // these must hold sample_ct entries (could be fewer if not subsetting, but
+  // let's play it safe)
+  // genovec_invert_buf also used as phaseinfo and dphase_present temporary
+  // storage
+  uintptr_t* genovec_invert_buf;
+  uintptr_t* ldbase_genovec;
+  
+  // these must hold 2 * (sample_ct / kPglMaxDifflistLenDivisor) entries
+  uintptr_t* ldbase_raregeno;
+  uint32_t* ldbase_difflist_sample_ids; // 1 extra entry, == sample_ct
+  
+  // this must fit 64k variants in multithreaded case
+  unsigned char* fwrite_buf;
+  unsigned char* fwrite_bufp;
+
+  uint32_t ldbase_common_geno; // 0xffffffffU if ldbase_genovec present
+  uint32_t ldbase_difflist_len;
+  
+  // I'll cache this for now
+  uintptr_t vrec_len_byte_ct;
+  
+  uint32_t vidx;
+};
+
+typedef struct Pgen_writer_common_struct pgen_writer_common_t;
+
+CONSTU31(kPglFwriteBlockSize, 131072);
+
+// Given packed arrays of unphased biallelic genotypes in uncompressed plink2
+// binary format (00 = hom ref, 01 = het ref/alt1, 10 = hom alt1, 11 =
+// missing), {Single,Multi}threaded_pgen_writer performs difflist (sparse
+// variant), one bit (mostly-two-value), and LD compression before writing to
+// disk, and backfills the header at the end.  CPRA -> CPR merging is under
+// development.
+// The major difference between the two interfaces is that
+// Multithreaded_pgen_writer forces you to process large blocks of variants at
+// a time (64k per thread).  So Singlethreaded_pgen_writer is still worth using
+// in some cases (memory is very limited, I/O is slow, no programmer time to
+// spare for the additional complexity).
+
+struct Singlethreaded_pgen_writer_struct {
+  struct Pgen_writer_common_struct pwc;
+  FILE* pgen_outfile;  
+};
+
+struct Multithreaded_pgen_writer_struct {
+  FILE* pgen_outfile;
+  uint32_t thread_ct;
+  struct Pgen_writer_common_struct* pwcs[];
+};
+
+typedef struct Singlethreaded_pgen_writer_struct st_pgen_writer_t;
+typedef struct Multithreaded_pgen_writer_struct mt_pgen_writer_t;
+
+void spgw_preinit(st_pgen_writer_t* spgwp);
+
+// nonref_flags_storage values:
+//   0 = no info stored
+//   1 = always trusted
+//   2 = always untrusted
+//   3 = use explicit_nonref_flags
+pglerr_t spgw_init_phase1(const char* __restrict fname, const uintptr_t* __restrict allele_idx_offsets, uintptr_t* __restrict explicit_nonref_flags, uint32_t variant_ct, uint32_t sample_ct, pgen_global_flags_t phase_dosage_gflags, uint32_t nonref_flags_storage, st_pgen_writer_t* spgwp, uintptr_t* alloc_cacheline_ct_ptr, uint32_t* max_vrec_len_ptr);
+
+void spgw_init_phase2(uint32_t max_vrec_len, st_pgen_writer_t* spgwp, unsigned char* spgw_alloc);
+
+// moderately likely that there isn't enough memory to use the maximum number
+// of threads, so this returns per-thread memory requirements before forcing
+// the caller to specify thread count
+// (eventually should write code which falls back on st_pgen_writer_t
+// when there isn't enough memory for even a single 64k variant block, at least
+// for the most commonly used plink 2.0 functions)
+void mpgw_init_phase1(const uintptr_t* __restrict allele_idx_offsets, uint32_t variant_ct, uint32_t sample_ct, pgen_global_flags_t phase_dosage_gflags, uintptr_t* alloc_base_cacheline_ct_ptr, uint64_t* alloc_per_thread_cacheline_ct_ptr, uint32_t* vrec_len_byte_ct_ptr, uint64_t* vblock_cacheline_ct_ptr);
+
+pglerr_t mpgw_init_phase2(const char* __restrict fname, const uintptr_t* __restrict allele_idx_offsets, uintptr_t* __restrict explicit_nonref_flags, uint32_t variant_ct, uint32_t sample_ct, pgen_global_flags_t phase_dosage_gflags, uint32_t nonref_flags_storage, uint32_t vrec_len_byte_ct, uintptr_t vblock_cacheline_ct, uint32_t thread_ct, unsigned char* mpgw_alloc, mt_pgen_writer_t* mpgwp);
+
+
+// trailing bits of genovec must be zeroed out
+void pwc_append_biallelic_genovec(const uintptr_t* __restrict genovec, pgen_writer_common_t* pwcp);
+
+pglerr_t spgw_append_biallelic_genovec(const uintptr_t* __restrict genovec, st_pgen_writer_t* spgwp);
+
+// trailing bits of raregeno must be zeroed out
+// all raregeno entries assumed to be unequal to difflist_common_geno; the
+// difflist should be compacted first if this isn't true (might be possible
+// with multiallelic projections?)
+// difflist_len must be <= 2 * (sample_ct / kPglMaxDifflistLenDivisor);
+// there's an assert checking this
+void pwc_append_biallelic_difflist_limited(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t difflist_common_geno, uint32_t difflist_len, pgen_writer_common_t* pwcp);
+
+pglerr_t spgw_append_biallelic_difflist_limited(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t difflist_common_geno, uint32_t difflist_len, st_pgen_writer_t* spgwp);
+
+// trailing bits of refalt1_genovec must be zeroed out
+// not implemented yet
+pglerr_t spgw_append_multiallelic_counts(const uintptr_t** __restrict alt_countvecs);
+
+// phasepresent == nullptr ok, that indicates that ALL heterozygous calls are
+// phased.  Caller should use e.g. pwc_append_biallelic_genovec() if it's known
+// in advance that no calls are phased.
+// Ok for phaseinfo to have bits set at non-het calls, NOT currently okay for
+//   phasepresent
+// void pwc_append_biallelic_genovec_hphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, pgen_writer_common_t* pwcp);
+
+// phasepresent == nullptr ok
+// ok for trailing bits of phaseinfo to not be zeroed out, NOT currently ok for
+//   phasepresent
+pglerr_t spgw_append_biallelic_genovec_hphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, st_pgen_writer_t* spgwp);
+
+// dosage_vals[] has length dosage_ct, not sample_ct
+void pwc_append_biallelic_genovec_dosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_vals, uint32_t dosage_ct, pgen_writer_common_t* pwcp);
+
+pglerr_t spgw_append_biallelic_genovec_dosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_vals, uint32_t dosage_ct, st_pgen_writer_t* spgwp);
+
+void pwc_append_biallelic_genovec_hphase_dosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_vals, uint32_t dosage_ct, pgen_writer_common_t* pwcp);
+
+pglerr_t spgw_append_biallelic_genovec_hphase_dosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_vals, uint32_t dosage_ct, st_pgen_writer_t* spgwp);
+
+// dphase_present can be nullptr if dosage_ct == dphase_ct
+// dosage_present cannot be null for nonzero dosage_ct
+// dosage_vals[] has length dosage_ct + dphase_ct
+// pglerr_t spgw_append_biallelic_genovec_dphase16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uintptr_t* dphase_present, const uint16_t* dosage_vals, uint32_t dosage_ct, uint32_t dphase_ct, st_pgen_writer_t* spgwp);
+
+
+// Backfills header info, then closes the file.
+pglerr_t spgw_finish(st_pgen_writer_t* spgwp);
+
+// Last flush automatically backfills header info and closes the file.
+// (caller should set mpgwp = nullptr after that)
+pglerr_t mpgw_flush(mt_pgen_writer_t* mpgwp);
+
+
+// these close the file if open, but do not free any memory
+// mpgw_cleanup() handles mpgwp == nullptr, since it shouldn't be allocated on
+// the stack
+boolerr_t spgw_cleanup(st_pgen_writer_t* spgwp);
+boolerr_t mpgw_cleanup(mt_pgen_writer_t* mpgwp);
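+
+// Editor's usage sketch for the single-threaded writer (error handling
+// abbreviated; the 64-byte-aligned allocation step is paraphrased):
+//
+//   st_pgen_writer_t spgw;
+//   uintptr_t alloc_cacheline_ct;
+//   uint32_t max_vrec_len;
+//   spgw_preinit(&spgw);
+//   // nullptr allele_idx_offsets: all variants biallelic; nonref_flags_
+//   // storage 1 = always trusted
+//   if (spgw_init_phase1("out.pgen", nullptr, nullptr, variant_ct, sample_ct,
+//                        kfPgenGlobal0, 1, &spgw, &alloc_cacheline_ct,
+//                        &max_vrec_len)) {
+//     // handle error
+//   }
+//   // obtain a 64-byte-aligned block of alloc_cacheline_ct * 64 bytes; call
+//   // it spgw_alloc
+//   spgw_init_phase2(max_vrec_len, &spgw, spgw_alloc);
+//   for (uint32_t vidx = 0; vidx < variant_ct; ++vidx) {
+//     // genovec: unphased biallelic hardcalls, trailing bits zeroed out
+//     if (spgw_append_biallelic_genovec(genovec, &spgw)) {
+//       // handle error
+//     }
+//   }
+//   if (spgw_finish(&spgw)) {
+//     // handle error
+//   }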
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+
+#endif // __PGENLIB_INTERNAL_H__
diff --git a/pgenlib_python_support.cpp b/pgenlib_python_support.cpp
new file mode 100644
index 0000000..260bc34
--- /dev/null
+++ b/pgenlib_python_support.cpp
@@ -0,0 +1,501 @@
+// This library is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation; either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "pgenlib_python_support.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+void genoarr_to_bytes_minus9(const uintptr_t* genoarr, uint32_t sample_ct, int8_t* genobytes) {
+  const uint32_t word_ct_m1 = (sample_ct - 1) / kBytesPerWord;
+  const quarterword_t* read_alias = (const quarterword_t*)genoarr;
+  uintptr_t* write_walias = (uintptr_t*)genobytes;
+  uint32_t widx = 0;
+  while (1) {
+    uintptr_t qw = read_alias[widx];
+#ifdef __LP64__
+    qw = (qw | (qw << 24)) & kMask000000FF;
+#endif
+    qw = (qw | (qw << 12)) & kMask000F;
+    qw = (qw | (qw << 6)) & kMask0303;
+    // now each byte is in {0, 1, 2, 3}.  Convert the 3s to -9s in a branchless
+    // manner.
+    // (-9) - 3 = -12, which is represented as 244 in a uint8_t
+    const uintptr_t geno_missing = qw & (qw >> 1) & kMask0101;
+    qw += geno_missing * 244;
+    if (widx == word_ct_m1) {
+      memcpy(&(write_walias[widx]), &qw, MOD_NZ(sample_ct, kBytesPerWord));
+      return;
+    }
+    write_walias[widx++] = qw;
+  }
+}
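+
+// Example (editor's note): packed 2-bit genotypes 0b11100100 (codes 0, 1, 2,
+// 3 for samples 0..3) become the bytes {0, 1, 2, -9}.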
+
+// could have a size-16 lookup table in 64-bit builds, etc.
+static const int32_t geno_to_int32[4] = {0, 1, 2, -9};
+
+void genoarr_to_int32s_minus9(const uintptr_t* genoarr, uint32_t sample_ct, int32_t* geno_int32) {
+  const uint32_t word_ct_m1 = (sample_ct - 1) / kBitsPerWordD2;
+  int32_t* write_iter = geno_int32;
+  uint32_t subgroup_len = kBitsPerWordD2;
+  uint32_t widx = 0;
+  while (1) {
+    if (widx >= word_ct_m1) {
+      if (widx > word_ct_m1) {
+	return;
+      }
+      subgroup_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t geno_word = genoarr[widx];
+    for (uint32_t uii = 0; uii < subgroup_len; ++uii) {
+      *write_iter++ = geno_to_int32[geno_word & 3];
+      geno_word >>= 2;
+    }
+    ++widx;
+  }
+}
+
+static const int64_t geno_to_int64[4] = {0, 1, 2, -9};
+
+void genoarr_to_int64s_minus9(const uintptr_t* genoarr, uint32_t sample_ct, int64_t* geno_int64) {
+  const uint32_t word_ct_m1 = (sample_ct - 1) / kBitsPerWordD2;
+  int64_t* write_iter = geno_int64;
+  uint32_t subgroup_len = kBitsPerWordD2;
+  uint32_t widx = 0;
+  while (1) {
+    if (widx >= word_ct_m1) {
+      if (widx > word_ct_m1) {
+	return;
+      }
+      subgroup_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t geno_word = genoarr[widx];
+    for (uint32_t uii = 0; uii < subgroup_len; ++uii) {
+      *write_iter++ = geno_to_int64[geno_word & 3];
+      geno_word >>= 2;
+    }
+    ++widx;
+  }
+}
+
+// missing = -9
+static const uint64_t geno_to_intcode_pair[4] = {0, 0x100000000LLU, 0x100000001LLU, 0xfffffff7fffffff7LLU};
+
+void genoarr_to_allele_codes(const uintptr_t* genoarr, uint32_t sample_ct, int32_t* allele_codes) {
+  const uint32_t word_ct_m1 = (sample_ct - 1) / kBitsPerWordD2;
+  uint64_t* write_iter = (uint64_t*)allele_codes;
+  uint32_t subgroup_len = kBitsPerWordD2;
+  uint32_t widx = 0;
+  while (1) {
+    if (widx >= word_ct_m1) {
+      if (widx > word_ct_m1) {
+	return;
+      }
+      subgroup_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t geno_word = genoarr[widx];
+    for (uint32_t uii = 0; uii < subgroup_len; ++uii) {
+      *write_iter++ = geno_to_intcode_pair[geno_word & 3];
+      geno_word >>= 2;
+    }
+    ++widx;
+  }
+}
+
+void genoarr_phased_to_allele_codes(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, uint32_t sample_ct, uint32_t phasepresent_ct, unsigned char* phasebytes, int32_t* allele_codes) {
+  // phasebytes can be nullptr
+  genoarr_to_allele_codes(genoarr, sample_ct, allele_codes);
+  uint64_t* allele_codes_alias64 = (uint64_t*)allele_codes;
+  uint32_t sample_uidx = 0;
+  if (!phasebytes) {
+    for (uint32_t phased_idx = 0; phased_idx < phasepresent_ct; ++phased_idx, ++sample_uidx) {
+      next_set_unsafe_ck(phasepresent, &sample_uidx);
+      if (IS_SET(phaseinfo, sample_uidx)) {
+	// 1|0
+	allele_codes_alias64[sample_uidx] = 1;
+      }
+    }
+    return;
+  }
+  // 0 and 2 = homozygous, automatically phased; otherwise patch in from
+  // phaseinfo if phasepresent_ct is nonzero
+  // so, start off by extracting low bit from each pair and flipping it
+  const uint32_t word_ct_m1 = (sample_ct - 1) / kBytesPerWord;
+  const quarterword_t* read_alias = (const quarterword_t*)genoarr;
+  uintptr_t* write_walias = (uintptr_t*)phasebytes;
+  uint32_t widx = 0;
+  while (1) {
+    uintptr_t qw = read_alias[widx];
+#ifdef __LP64__
+    qw = (qw | (qw << 24)) & kMask000000FF;
+#endif
+    qw = (qw | (qw << 12)) & kMask000F;
+    qw = (~(qw | (qw << 6))) & kMask0101;
+    if (widx == word_ct_m1) {
+      memcpy(&(write_walias[widx]), &qw, MOD_NZ(sample_ct, kBytesPerWord));
+      break;
+    }
+    write_walias[widx++] = qw;
+  }
+  for (uint32_t phased_idx = 0; phased_idx < phasepresent_ct; ++phased_idx, ++sample_uidx) {
+    next_set_unsafe_ck(phasepresent, &sample_uidx);
+    phasebytes[sample_uidx] = 1;
+    if (IS_SET(phaseinfo, sample_uidx)) {
+      allele_codes_alias64[sample_uidx] = 1;
+    }
+  }
+}
+
+// missing = -9
+static const int32_t geno_to_hap0_code[6] = {0, 0, 1, -9, 0, 1};
+static const int32_t geno_to_hap1_code[6] = {0, 1, 1, -9, 0, 0};
+
+// todo: write version of this which fills phasebytes
+void genoarr_phased_to_hap_codes(const uintptr_t* genoarr, const uintptr_t* phaseinfo, uint32_t variant_batch_size, int32_t* hap0_codes_iter, int32_t* hap1_codes_iter) {
+  // assumes genoarr and phaseinfo have already been transposed
+  const uint32_t word_ct_m1 = (variant_batch_size - 1) / kBitsPerWordD2;
+  const halfword_t* phaseinfo_alias = (const halfword_t*)phaseinfo;
+  uint32_t subgroup_len = kBitsPerWordD2;
+  uint32_t widx = 0;
+  while (1) {
+    if (widx >= word_ct_m1) {
+      if (widx > word_ct_m1) {
+	return;
+      }
+      subgroup_len = MOD_NZ(variant_batch_size, kBitsPerWordD2);
+    }
+    uintptr_t geno_word = genoarr[widx];
+    uintptr_t phaseinfo_hw = phaseinfo_alias[widx];
+    for (uint32_t uii = 0; uii < subgroup_len; ++uii) {
+      const uintptr_t cur_pgeno_code = (geno_word & 3) + 4 * (phaseinfo_hw & 1);
+      *hap0_codes_iter++ = geno_to_hap0_code[cur_pgeno_code];
+      *hap1_codes_iter++ = geno_to_hap1_code[cur_pgeno_code];
+      geno_word >>= 2;
+      phaseinfo_hw >>= 1;
+    }
+    ++widx;
+  }
+}
+
+static const float geno_to_float[4] = {0.0f, 1.0f, 2.0f, -9.0f};
+
+void dosage16_to_floats_minus9(const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_vals, uint32_t sample_ct, uint32_t dosage_ct, float* geno_float) {
+  const uint32_t word_ct_m1 = (sample_ct - 1) / kBitsPerWordD2;
+  float* write_iter = geno_float;
+  uint32_t subgroup_len = kBitsPerWordD2;
+  uint32_t widx = 0;
+  while (1) {
+    if (widx >= word_ct_m1) {
+      if (widx > word_ct_m1) {
+	break;
+      }
+      subgroup_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t geno_word = genoarr[widx];
+    for (uint32_t uii = 0; uii < subgroup_len; ++uii) {
+      *write_iter++ = geno_to_float[geno_word & 3];
+      geno_word >>= 2;
+    }
+    ++widx;
+  }
+  if (dosage_ct) {
+    const uint16_t* dosage_vals_iter = dosage_vals;
+    uint32_t sample_uidx = 0;
+    for (uint32_t dosage_idx = 0; dosage_idx < dosage_ct; ++dosage_idx, ++sample_uidx) {
+      next_set_unsafe_ck(dosage_present, &sample_uidx);
+      // multiply by 2^{-14}
+      geno_float[sample_uidx] = ((float)(*dosage_vals_iter++)) * 0.00006103515625f;
+    }
+  }
+}
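+
+// Example (editor's note): the 16-bit dosage value 16384 maps to 1.0f, and
+// the maximum in-range value 32768 maps to 2.0f.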
+
+static const double geno_to_double[4] = {0.0, 1.0, 2.0, -9.0};
+
+void dosage16_to_doubles_minus9(const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_vals, uint32_t sample_ct, uint32_t dosage_ct, double* geno_double) {
+  const uint32_t word_ct_m1 = (sample_ct - 1) / kBitsPerWordD2;
+  double* write_iter = geno_double;
+  uint32_t subgroup_len = kBitsPerWordD2;
+  uint32_t widx = 0;
+  while (1) {
+    if (widx >= word_ct_m1) {
+      if (widx > word_ct_m1) {
+	break;
+      }
+      subgroup_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t geno_word = genoarr[widx];
+    for (uint32_t uii = 0; uii < subgroup_len; ++uii) {
+      *write_iter++ = geno_to_double[geno_word & 3];
+      geno_word >>= 2;
+    }
+    ++widx;
+  }
+  if (dosage_ct) {
+    const uint16_t* dosage_vals_iter = dosage_vals;
+    uint32_t sample_uidx = 0;
+    for (uint32_t dosage_idx = 0; dosage_idx < dosage_ct; ++dosage_idx, ++sample_uidx) {
+      next_set_unsafe_ck(dosage_present, &sample_uidx);
+      geno_double[sample_uidx] = ((double)(*dosage_vals_iter++)) * 0.00006103515625;
+    }
+  }
+}
+
+void bytes_to_bits_unsafe(const uint8_t* boolbytes, uint32_t sample_ct, uintptr_t* bitarr) {
+  const uint32_t ull_ct_m1 = (sample_ct - 1) / 8;
+  const uint64_t* read_alias = (const uint64_t*)boolbytes;
+  unsigned char* write_alias = (unsigned char*)bitarr;
+  uint32_t ullidx = 0;
+  while (1) {
+    uint64_t cur_ull;
+    if (ullidx >= ull_ct_m1) {
+      if (ullidx > ull_ct_m1) {
+	return;
+      }
+      cur_ull = 0;
+      memcpy(&cur_ull, &(read_alias[ullidx]), MOD_NZ(sample_ct, 8));
+    } else {
+      cur_ull = read_alias[ullidx];
+    }
+    // assuming boolbytes is 0/1-valued, this multiply-and-shift maps binary
+    //  h0000000g0000000f... to binary hgfedcba.
+    //  ^       ^       ^
+    //  |       |       |
+    // 56      48      40
+    // (the constant has bits 0, 7, 14, 21, 28, 35, 42, and 49 set)
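+    // e.g. boolbytes {1,0,1,1,0,0,0,1} load as cur_ull 0x0100000001010001;
+    // since 8i + 7j is distinct for all i, j in 0..7, no two set bits
+    // collide, there are no carries, and the shift extracts
+    // 0b10001101 = 0x8d.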
+    write_alias[ullidx++] = (unsigned char)((cur_ull * 0x2040810204081LLU) >> 49);
+  }
+}
+
+void bytes_to_genoarr_unsafe(const int8_t* genobytes, uint32_t sample_ct, uintptr_t* genoarr) {
+  const uint32_t word_ct_m1 = (sample_ct - 1) / kBytesPerWord;
+  const uintptr_t* read_walias = (const uintptr_t*)genobytes;
+  quarterword_t* write_alias = (quarterword_t*)genoarr;
+  uint32_t widx = 0;
+  while (1) {
+    uintptr_t ww;
+    if (widx >= word_ct_m1) {
+      if (widx > word_ct_m1) {
+	return;
+      }
+      ww = 0;
+      memcpy(&ww, &(read_walias[widx]), MOD_NZ(sample_ct, kBytesPerWord));
+    } else {
+      ww = read_walias[widx];
+    }
+    ww &= kMask0303;
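+    // fold the byte-spaced 2-bit codes toward the bottom of the word: first
+    // into nibble pairs per 16 bits, then (on 64-bit builds) into bytes per
+    // 32 bits, until all codes are packed contiguously in one quarterword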
+    ww = (ww | (ww >> 6)) & kMask000F;
+#ifdef __LP64__
+    ww = (ww | (ww >> 12)) & kMask000000FF;
+    write_alias[widx] = (quarterword_t)(ww | (ww >> 24));
+#else
+    write_alias[widx] = (quarterword_t)(ww | (ww >> 12));
+#endif
+    ++widx;
+  }
+}
+
+void allele_codes_to_genoarr_unsafe(const int32_t* allele_codes, const unsigned char* phasepresent_bytes, uint32_t sample_ct, uintptr_t* genoarr, uintptr_t* phasepresent, uintptr_t* phaseinfo) {
+  // - If phasepresent_bytes is nullptr, phasepresent is not updated.  In this
+  //   case, phaseinfo is updated iff it's not nullptr.  It's okay for both
+  //   phasepresent and phaseinfo to be nullptr here.
+  // - Otherwise, phasepresent and phaseinfo are always updated; neither can be
+  //   nullptr.
+  const uint32_t word_ct_m1 = (sample_ct - 1) / kBitsPerWordD2;
+  uint32_t subgroup_len = kBitsPerWordD2;
+  uint32_t widx = 0;
+  const int32_t* read_alias = allele_codes;
+  halfword_t* phaseinfo_alias = (halfword_t*)phaseinfo;
+  if (!phasepresent_bytes) {
+    while (1) {
+      if (widx >= word_ct_m1) {
+	if (widx > word_ct_m1) {
+	  return;
+	}
+	subgroup_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+      }
+      uintptr_t geno_write_word = 0;
+      if (!phaseinfo) {
+	for (uint32_t uii = 0; uii < subgroup_len; ++uii) {
+	  // 0,0 -> 0
+	  // 0,1 or 1,0 -> 1
+	  // 1,1 -> 2
+	  // -9,-9 -> 3
+	  // undefined behavior on e.g. 0,2
+	  const uint32_t first_code = (uint32_t)(*read_alias++);
+	  const uint32_t second_code = (uint32_t)(*read_alias++);
+	  uintptr_t cur_geno;
+	  if (first_code <= 1) {
+	    cur_geno = first_code + second_code;
+	  } else {
+	    // todo: test whether branchless is better
+	    // (in practice, this will usually be predictable?)
+	    cur_geno = 3;
+	  }
+	  geno_write_word |= (cur_geno << (uii * 2));
+	}
+      } else {
+	halfword_t phaseinfo_write_hw = 0;
+	for (uint32_t uii = 0; uii < subgroup_len; ++uii) {
+	  // set phaseinfo_write_hw bit iff 1,0
+	  const uint32_t first_code = (uint32_t)(*read_alias++);
+	  const uint32_t second_code = (uint32_t)(*read_alias++);
+	  uintptr_t cur_geno;
+	  if (first_code <= 1) {
+	    cur_geno = first_code + second_code;
+	    phaseinfo_write_hw |= (cur_geno & first_code) << uii;
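+	    // (cur_geno & first_code) is 1 exactly for the 1,0 case, and 0
+	    // for 0,0 / 0,1 / 1,1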
+	  } else {
+	    // todo: test whether branchless is better
+	    // (in practice, this will usually be predictable?)
+	    cur_geno = 3;
+	  }
+	  geno_write_word |= (cur_geno << (uii * 2));
+	}
+	phaseinfo_alias[widx] = phaseinfo_write_hw;
+      }
+      genoarr[widx] = geno_write_word;
+      ++widx;
+    }
+  }
+  const unsigned char* phasepresent_bytes_iter = phasepresent_bytes;
+  halfword_t* phasepresent_alias = (halfword_t*)phasepresent;
+  while (1) {
+    if (widx >= word_ct_m1) {
+      if (widx > word_ct_m1) {
+	return;
+      }
+      subgroup_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t geno_write_word = 0;
+    halfword_t phasepresent_write_hw = 0;
+    halfword_t phaseinfo_write_hw = 0;
+    for (uint32_t uii = 0; uii < subgroup_len; ++uii) {
+      const uint32_t first_code = (uint32_t)(*read_alias++);
+      const uint32_t second_code = (uint32_t)(*read_alias++);
+      uintptr_t cur_geno;
+      if (first_code <= 1) {
+	cur_geno = first_code + second_code;
+	const uint32_t cur_phasepresent = cur_geno & phasepresent_bytes_iter[uii];
+	phasepresent_write_hw |= cur_phasepresent << uii;
+	phaseinfo_write_hw |= (cur_phasepresent & first_code) << uii;
+      } else {
+	cur_geno = 3;
+      }
+      geno_write_word |= (cur_geno << (uii * 2));
+    }
+    phasepresent_bytes_iter = &(phasepresent_bytes_iter[subgroup_len]);
+    phasepresent_alias[widx] = phasepresent_write_hw;
+    phaseinfo_alias[widx] = phaseinfo_write_hw;
+    genoarr[widx] = geno_write_word;
+    ++widx;
+  }
+}
+
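+// dosage_int is a 0..32768 fixed-point dosage (16384 = dosage 1.0); the
+// result is the distance from the nearest halfway point between adjacent
+// integer dosages, i.e. 8192 when dosage_int is exactly a multiple of 16384
+// and 0 when it is exactly halfway between two.  Callers below compare this
+// against hard_call_halfdist to decide whether a hardcall can be made.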
+static inline uint32_t biallelic_dosage_halfdist(uint32_t dosage_int) {
+  const uint32_t dosage_int_rem = dosage_int & 16383;
+  return abs_int32(((int32_t)dosage_int_rem) - 8192);
+}
+
+void floats_to_dosage16(const float* floatarr, uint32_t sample_ct, uint32_t hard_call_halfdist, uintptr_t* genoarr, uintptr_t* dosage_present, uint16_t* dosage_vals, uint32_t* dosage_ct_ptr) {
+  const uint32_t word_ct_m1 = (sample_ct - 1) / kBitsPerWordD2;
+  const float* read_iter = floatarr;
+  halfword_t* dosage_present_alias = (halfword_t*)dosage_present;
+  uint16_t* dosage_vals_iter = dosage_vals;
+  uint32_t subgroup_len = kBitsPerWordD2;
+  uint32_t widx = 0;
+  while (1) {
+    if (widx >= word_ct_m1) {
+      if (widx > word_ct_m1) {
+	break;
+      }
+      subgroup_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t geno_word = 0;
+    uint32_t dosage_present_hw = 0;
+    for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits < subgroup_len; ++sample_idx_lowbits) {
+      // 0..2 -> 0..32768
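+      // (+0.5 rounds to nearest; anything landing outside [0, 32768] after
+      // scaling is left as missing, i.e. cur_geno stays 3)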
+      const float fxx = (*read_iter++) * 16384 + 0.5;
+      uintptr_t cur_geno = 3;
+      if ((fxx >= 0.0) && (fxx < 32769)) {
+	uint32_t dosage_int = (int32_t)fxx;
+	const uint32_t cur_halfdist = biallelic_dosage_halfdist(dosage_int);
+	if (cur_halfdist >= hard_call_halfdist) {
+	  cur_geno = (dosage_int + (8192 * k1LU)) / 16384;
+	}
+	if (cur_halfdist != 8192) {
+	  dosage_present_hw |= 1U << sample_idx_lowbits;
+	  *dosage_vals_iter++ = dosage_int;
+	}
+      }
+      geno_word |= cur_geno << (2 * sample_idx_lowbits);
+    }
+    genoarr[widx] = geno_word;
+    dosage_present_alias[widx] = dosage_present_hw;
+    ++widx;
+  }
+  if (widx % 2) {
+    dosage_present_alias[widx] = 0;
+  }
+  *dosage_ct_ptr = (uint32_t)((uintptr_t)(dosage_vals_iter - dosage_vals));
+}
+
+void doubles_to_dosage16(const double* doublearr, uint32_t sample_ct, uint32_t hard_call_halfdist, uintptr_t* genoarr, uintptr_t* dosage_present, uint16_t* dosage_vals, uint32_t* dosage_ct_ptr) {
+  const uint32_t word_ct_m1 = (sample_ct - 1) / kBitsPerWordD2;
+  const double* read_iter = doublearr;
+  halfword_t* dosage_present_alias = (halfword_t*)dosage_present;
+  uint16_t* dosage_vals_iter = dosage_vals;
+  uint32_t subgroup_len = kBitsPerWordD2;
+  uint32_t widx = 0;
+  while (1) {
+    if (widx >= word_ct_m1) {
+      if (widx > word_ct_m1) {
+	break;
+      }
+      subgroup_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t geno_word = 0;
+    uint32_t dosage_present_hw = 0;
+    for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits < subgroup_len; ++sample_idx_lowbits) {
+      // 0..2 -> 0..32768
+      const double dxx = (*read_iter++) * 16384 + 0.5;
+      uintptr_t cur_geno = 3;
+      if ((dxx >= 0.0) && (dxx < 32769)) {
+	uint32_t dosage_int = (int32_t)dxx;
+	const uint32_t cur_halfdist = biallelic_dosage_halfdist(dosage_int);
+	if (cur_halfdist >= hard_call_halfdist) {
+	  cur_geno = (dosage_int + (8192 * k1LU)) / 16384;
+	}
+	if (cur_halfdist != 8192) {
+	  dosage_present_hw |= 1U << sample_idx_lowbits;
+	  *dosage_vals_iter++ = dosage_int;
+	}
+      }
+      geno_word |= cur_geno << (2 * sample_idx_lowbits);
+    }
+    genoarr[widx] = geno_word;
+    dosage_present_alias[widx] = dosage_present_hw;
+    ++widx;
+  }
+  if (widx % 2) {
+    dosage_present_alias[widx] = 0;
+  }
+  *dosage_ct_ptr = (uint32_t)((uintptr_t)(dosage_vals_iter - dosage_vals));
+}
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/pgenlib_python_support.h b/pgenlib_python_support.h
new file mode 100644
index 0000000..06cb0ce
--- /dev/null
+++ b/pgenlib_python_support.h
@@ -0,0 +1,75 @@
+#ifndef __PGENLIB_PYTHON_SUPPORT_H__
+#define __PGENLIB_PYTHON_SUPPORT_H__
+
+// This library is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation; either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library.  If not, see <http://www.gnu.org/licenses/>.
+
+#include "pgenlib_internal.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+// Could define a slightly-more-efficient version of this function which uses a
+// missing code of 3 instead of -9.  But let's play well with existing scripts
+// first.
+void genoarr_to_bytes_minus9(const uintptr_t* genoarr, uint32_t sample_ct, int8_t* genobytes);
+
+void genoarr_to_int32s_minus9(const uintptr_t* genoarr, uint32_t sample_ct, int32_t* geno_int32);
+
+void genoarr_to_int64s_minus9(const uintptr_t* genoarr, uint32_t sample_ct, int64_t* geno_int64);
+
+// For Python interface, allele_codes is always int32_t.  Python programmers
+// should not need to worry about whether pgenlib was compiled with 1-, 2-, or
+// 4-byte alt_allele_ct_t.
+void genoarr_to_allele_codes(const uintptr_t* genoarr, uint32_t sample_ct, int32_t* allele_codes);
+
+// phasebytes can be nullptr; if it isn't, entry is 1 iff genotype is an
+// explicitly phased het, OR genotype is homozygous
+// phasepresent cannot be nullptr
+void genoarr_phased_to_allele_codes(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, uint32_t sample_ct, uint32_t phasepresent_ct, unsigned char* phasebytes, int32_t* allele_codes);
+
+// assumes transposed genoarr, phaseinfo
+void genoarr_phased_to_hap_codes(const uintptr_t* genoarr, const uintptr_t* phaseinfo, uint32_t variant_batch_size, int32_t* hap0_codes_iter, int32_t* hap1_codes_iter);
+
+void dosage16_to_floats_minus9(const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_vals, uint32_t sample_ct, uint32_t dosage_ct, float* geno_float);
+
+void dosage16_to_doubles_minus9(const uintptr_t* genoarr, const uintptr_t* dosage_present, const uint16_t* dosage_vals, uint32_t sample_ct, uint32_t dosage_ct, double* geno_double);
+
+// Does not zero out trailing bits of bitarr.
+void bytes_to_bits_unsafe(const uint8_t* boolbytes, uint32_t sample_ct, uintptr_t* bitarr);
+
+// Bottom 2 bits are extracted from every byte.  Conveniently, -9 and 3 are
+// treated identically.
+// Does not zero out trailing bits of genoarr.
+void bytes_to_genoarr_unsafe(const int8_t* genobytes, uint32_t sample_ct, uintptr_t* genoarr);
+
+// - If phasepresent_bytes is nullptr, phasepresent is not updated.  In this
+//   case, phaseinfo is updated iff it's not nullptr.  It's okay for both
+//   phasepresent and phaseinfo to be nullptr here.
+// - Otherwise, phasepresent and phaseinfo are always updated; neither can be
+//   nullptr.
+void allele_codes_to_genoarr_unsafe(const int32_t* allele_codes, const unsigned char* phasepresent_bytes, uint32_t sample_ct, uintptr_t* genoarr, uintptr_t* phasepresent, uintptr_t* phaseinfo);
+
+void floats_to_dosage16(const float* floatarr, uint32_t sample_ct, uint32_t hard_call_halfdist, uintptr_t* genoarr, uintptr_t* dosage_present, uint16_t* dosage_vals, uint32_t* dosage_ct_ptr);
+
+void doubles_to_dosage16(const double* doublearr, uint32_t sample_ct, uint32_t hard_call_halfdist, uintptr_t* genoarr, uintptr_t* dosage_present, uint16_t* dosage_vals, uint32_t* dosage_ct_ptr);
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+
+#endif // __PGENLIB_PYTHON_SUPPORT_H__
diff --git a/plink2.cpp b/plink2.cpp
new file mode 100644
index 0000000..0430139
--- /dev/null
+++ b/plink2.cpp
@@ -0,0 +1,7728 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_data.h"
+#include "plink2_decompress.h"
+#include "plink2_filter.h"
+#include "plink2_glm.h"
+#include "plink2_ld.h"
+#include "plink2_matrix_calc.h"
+#include "plink2_misc.h"
+#include "plink2_psam.h"
+#include "plink2_pvar.h"
+#include "plink2_random.h"
+#include "plink2_set.h"
+
+// #include <locale.h>
+#include <time.h>
+#include <unistd.h> // getcwd(), gethostname(), sysconf(), unlink()
+
+#include "plink2_help.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+  
+static const char ver_str[] = "PLINK v2.00a"
+#ifdef NOLAPACK
+  "NL"
+#endif
+#ifdef __LP64__
+  #ifdef LAPACK_ILP64
+    "LM"
+  #endif
+  #ifdef USE_SSE42
+    #ifdef USE_AVX2
+      #error "version string code needs to be updated"
+    #endif
+    " SSE4.2"
+  #else
+    " 64-bit"
+  #endif
+#else
+  " 32-bit"
+#endif
+
+#ifdef USE_MKL
+  " Intel"
+#endif
+  " (17 Jul 2017)";
+static const char ver_str2[] =
+  // include leading space if day < 10, so character length stays the same
+  ""
+#ifndef LAPACK_ILP64
+  "  "
+#endif
+#ifndef USE_MKL
+  "      "
+#endif
+#ifndef NOLAPACK
+  "  "
+#endif
+  "    www.cog-genomics.org/plink/2.0/\n"
+  "(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3\n";
+static const char errstr_append[] = "For more info, try '" PROG_NAME_STR " --help [flag name]' or '" PROG_NAME_STR " --help | more'.\n";
+
+#ifndef NOLAPACK
+static const char notestr_null_calc2[] = "Commands include --make-bpgen, --export, --freq, --geno-counts, --missing,\n--hardy, --indep-pairwise, --make-king, --king-cutoff, --write-snplist,\n--make-grm-gz, --pca, --glm, --score, --genotyping-rate, --validate, and\n--zst-decompress.\n\n'" PROG_NAME_STR " --help | more' describes all functions.\n";
+#else
+static const char notestr_null_calc2[] = "Commands include --make-bpgen, --export, --freq, --geno-counts, --missing,\n--hardy, --indep-pairwise, --make-king, --king-cutoff, --write-snplist,\n--make-grm-gz, --glm, --score, --genotyping-rate, --validate, and\n--zst-decompress.\n\n'" PROG_NAME_STR " --help | more' describes all functions.\n";
+#endif
+
+static const char errstr_nomem[] = "Error: Out of memory.  The --memory flag may be helpful.\n";
+static const char errstr_write[] = "Error: File write failure.\n";
+static const char errstr_read[] = "Error: File read failure.\n";
+static const char errstr_thread_create[] = "Error: Failed to create thread.\n";
+
+#ifndef __LP64__
+  // 2047 seems to consistently fail on both OS X and Windows
+  #ifdef _WIN32
+CONSTU31(kMalloc32bitMbMax, 1760);
+  #else
+    #ifdef __APPLE__
+CONSTU31(kMalloc32bitMbMax, 1920);
+    #else
+CONSTU31(kMalloc32bitMbMax, 2047);
+    #endif
+  #endif
+#endif
+
+// assumes logfile is open
+void disp_exit_msg(pglerr_t reterr) {
+  if (reterr) {
+    if (reterr == kPglRetNomem) {
+      logprint("\n");
+      logerrprint(errstr_nomem);
+      if (g_failed_alloc_attempt_size) {
+	LOGERRPRINTF("Failed allocation size: %" PRIuPTR "\n", g_failed_alloc_attempt_size);
+      }
+    } else if (reterr == kPglRetReadFail) {
+      logprint("\n");
+      logerrprint(errstr_read);
+    } else if (reterr == kPglRetWriteFail) {
+      logprint("\n");
+      logerrprint(errstr_write);
+    } else if (reterr == kPglRetThreadCreateFail) {
+      logprint("\n");
+      logerrprint(errstr_thread_create);
+    }
+  }
+}
+
+// covar-variance-standardize + terminating null
+CONSTU31(kMaxFlagBlen, 27);
+
+FLAGSET_DEF_START()
+  kfLoadParams0,
+  kfLoadParamsPgen = (1 << 0),
+  kfLoadParamsPsam = (1 << 1),
+  kfLoadParamsPvar = (1 << 2),
+  kfLoadParamsPfileAll = (kfLoadParamsPgen | kfLoadParamsPsam | kfLoadParamsPvar)
+FLAGSET_DEF_END(load_params_t);
+
+FLAGSET_DEF_START()
+  kfXload0,
+  kfXloadVcf = (1 << 0),
+  kfXloadBcf = (1 << 1),
+  kfXloadOxSample = (1 << 2),
+  kfXloadOxGen = (1 << 3),
+  kfXloadOxBgen = (1 << 4),
+  kfXloadOxHaps = (1 << 5),
+  kfXloadOxLegend = (1 << 6),
+  kfXloadPlink1Dosage = (1 << 7),
+  kfXloadMap = (1 << 8),
+  kfXloadGenDummy = (1 << 9)
+FLAGSET_DEF_END(xload_t);
+
+// maximum number of usable cluster computers; this value is arbitrary,
+// though it shouldn't be larger than 2^32 - 1
+CONSTU31(kParallelMax, 32768);
+
+uint32_t realpath_identical(const char* outname, const char* read_realpath, char* write_realpath_buf) {
+#ifdef _WIN32
+  const uint32_t fname_slen = GetFullPathName(outname, kPglFnamesize, write_realpath_buf, nullptr);
+  return (fname_slen && (fname_slen <= kPglFnamesize) && (!strcmp(read_realpath, write_realpath_buf)));
+#else
+  return (realpath(outname, write_realpath_buf) && (!strcmp(read_realpath, write_realpath_buf)));
+#endif
+}
+
+
+// assume for now that .pgen must always be accompanied by both .pvar and .psam
+FLAGSET64_DEF_START()
+  kfFilter0,
+  kfFilterAllReq = (1 << 0),
+  kfFilterPvarReq = (1 << 1),
+  kfFilterPsamReq = (1 << 2),
+  kfFilterNoSplitChr = (1 << 3),
+  kfFilterExclFemales = (1 << 4),
+  kfFilterExclMales = (1 << 5),
+  kfFilterExclNosex = (1 << 6),
+  kfFilterExclFounders = (1 << 7),
+  kfFilterExclNonfounders = (1 << 8),
+  kfFilterSnpsOnly = (1 << 9),
+  kfFilterSnpsOnlyJustAcgt = (1 << 10)
+FLAGSET64_DEF_END(filter_flags_t);
+
+FLAGSET64_DEF_START()
+  kfCommand10,
+  kfCommand1MakePlink2 = (1 << 0),
+  kfCommand1Exportf = (1 << 1),
+  kfCommand1MakeKing = (1 << 2),
+  kfCommand1KingCutoff = (1 << 3),
+  kfCommand1MissingReport = (1 << 4),
+  kfCommand1WriteSnplist = (1 << 5),
+  kfCommand1AlleleFreq = (1 << 6),
+  kfCommand1GenoCounts = (1 << 7),
+  kfCommand1Hardy = (1 << 8),
+  kfCommand1LdPrune = (1 << 9),
+  kfCommand1Pca = (1 << 10),
+  kfCommand1Glm = (1 << 11),
+  kfCommand1MakeRel = (1 << 12),
+  kfCommand1Validate = (1 << 13),
+  kfCommand1GenotypingRate = (1 << 14),
+  kfCommand1Score = (1 << 15),
+  kfCommand1WriteCovar = (1 << 16)
+FLAGSET64_DEF_END(command1_flags_t);
+
+// this is a hybrid; only kfSortFileSid is actually a flag
+FLAGSET_DEF_START()
+  kfSort0,
+  kfSortNone = (1 << 0),
+  kfSortNatural = (1 << 1),
+  kfSortAscii = (1 << 2),
+  kfSortFile = (1 << 3),
+  kfSortFileSid = (1 << 4)
+FLAGSET_DEF_END(sort_flags_t);
+
+typedef struct plink2_cmdline_struct {
+  misc_flags_t misc_flags;
+  filter_flags_t filter_flags;
+  command1_flags_t command_flags1;
+  pvar_psam_t pvar_psam_modifier;
+  exportf_flags_t exportf_modifier;
+  sort_flags_t sample_sort_flags;
+  grm_flags_t grm_flags;
+  pca_flags_t pca_flags;
+  write_covar_flags_t write_covar_flags;
+  pheno_transform_flags_t pheno_transform_flags;
+  range_list_t snps_range_list;
+  range_list_t exclude_snps_range_list;
+  range_list_t pheno_range_list;
+  range_list_t covar_range_list;
+  fam_col_t fam_cols;
+  idpaste_t exportf_id_paste;
+  ld_info_t ld_info;
+  king_flags_t king_modifier;
+  double king_cutoff;
+  double king_table_filter;
+  allele_freq_t allele_freq_modifier;
+  missing_rpt_t missing_rpt_modifier;
+  geno_counts_t geno_counts_modifier;
+  hardy_flags_t hardy_modifier;
+  glm_info_t glm_info;
+  adjust_info_t adjust_info;
+  score_info_t score_info;
+  aperm_t aperm;
+  cmp_expr_t keep_if_expr;
+  cmp_expr_t remove_if_expr;
+  double ci_size;
+  float var_min_qual;
+  uint32_t splitpar_bound1;
+  uint32_t splitpar_bound2;
+  uint32_t new_variant_id_max_allele_slen;
+  uint32_t update_sex_colm2;
+
+  // maybe support BGEN v1.2-style variable-precision dosages later, at which
+  // point these should be floating-point numbers; but let's first see what we
+  // gain from v1.1 fixed-point arithmetic
+  uint32_t hard_call_thresh;
+  uint32_t dosage_erase_thresh;
+  
+  double pfilter;
+  double output_min_p;
+  double vif_thresh;
+  double mind_thresh;
+  double geno_thresh;
+  double hwe_thresh;
+  double mach_r2_min;
+  double mach_r2_max;
+  double min_maf;
+  double max_maf;
+  uint64_t min_allele_dosage;
+  uint64_t max_allele_dosage;
+  int32_t missing_pheno;
+  int32_t from_bp;
+  int32_t to_bp;
+  int32_t window_bp;
+  uint32_t pca_ct;
+  uint32_t xchr_model;
+  uint32_t max_thread_ct;
+  uint32_t parallel_idx;
+  uint32_t parallel_tot;
+  uint32_t exportf_bits;
+  uint32_t mwithin_val;
+  char exportf_id_delim;
+  
+  char* varid_template;
+  char* missing_varid_match;
+  char* varid_from;
+  char* varid_to;
+  char* varid_snp;
+  char* varid_exclude_snp;
+  char* pheno_fname;
+  char* covar_fname;
+  char* extract_fnames;
+  char* exclude_fnames;
+  char* update_sex_fname;
+  char* keep_fnames;
+  char* keepfam_fnames;
+  char* remove_fnames;
+  char* removefam_fnames;
+  char* sample_sort_fname;
+  char* freq_ref_binstr;
+  char* freq_alt1_binstr;
+  char* glm_local_covar_fname;
+  char* glm_local_pvar_fname;
+  char* glm_local_psam_fname;
+  char* read_freq_fname;
+  char* within_fname;
+  char* catpheno_name;
+  char* family_missing_catname;
+  char* keep_cats_fname;
+  char* keep_cat_names_flattened;
+  char* keep_cat_phenoname;
+  char* remove_cats_fname;
+  char* remove_cat_names_flattened;
+  char* remove_cat_phenoname;
+  char* split_cat_phenonames_flattened;
+  char* vstd_flattened;
+  char* quantnorm_flattened;
+  char* covar_quantnorm_flattened;
+} plink2_cmdline_t;
+
+uint32_t is_single_variant_loader_needed(const char* king_cutoff_fprefix, command1_flags_t command_flags1, make_plink2_t make_plink2_modifier) {
+  return (command_flags1 & (kfCommand1Exportf | kfCommand1MakeKing | kfCommand1GenoCounts | kfCommand1LdPrune | kfCommand1Validate | kfCommand1Pca | kfCommand1MakeRel | kfCommand1Glm | kfCommand1Score)) || ((command_flags1 & kfCommand1MakePlink2) && (make_plink2_modifier & kfMakePgen)) || ((command_flags1 & kfCommand1KingCutoff) && (!king_cutoff_fprefix));
+}
+
+uint32_t are_allele_freqs_needed(command1_flags_t command_flags1, double min_maf, double max_maf) {
+  return (command_flags1 & (kfCommand1AlleleFreq | kfCommand1LdPrune | kfCommand1Pca | kfCommand1MakeRel | kfCommand1Score)) || (min_maf != 0.0) || (max_maf != 1.0);
+}
+
+uint32_t are_maj_alleles_needed(command1_flags_t command_flags1) {
+  return (command_flags1 & (kfCommand1LdPrune | kfCommand1Pca | kfCommand1MakeRel));
+}
+
+uint32_t get_first_haploid_uidx(const chr_info_t* cip, unsorted_var_t vpos_sortstatus) {
+  // returns 0x7fffffff if no X/haploid chromosomes present
+  if (!(vpos_sortstatus & kfUnsortedVarSplitChr)) {
+    const uint32_t chr_ct = cip->chr_ct;
+    for (uint32_t chr_fo_idx = 0; chr_fo_idx < chr_ct; ++chr_fo_idx) {
+      const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+      if (IS_SET(cip->haploid_mask, chr_idx)) {
+	return cip->chr_fo_vidx_start[chr_fo_idx];
+      }
+    }
+  }
+  return 0x7fffffff;
+}
+
+uint32_t are_allele_dosages_needed(misc_flags_t misc_flags, make_plink2_t make_plink2_modifier, uint32_t afreq_needed, uint64_t min_allele_dosage, uint64_t max_allele_dosage) {
+  return (make_plink2_modifier & kfMakePlink2TrimAlts) || ((misc_flags & kfMiscNonfounders) && (afreq_needed || (misc_flags & kfMiscMajRef) || min_allele_dosage || (max_allele_dosage != (~0LLU))));
+}
+
+uint32_t are_founder_allele_dosages_needed(misc_flags_t misc_flags, uint32_t afreq_needed, uint64_t min_allele_dosage, uint64_t max_allele_dosage) {
+  return (afreq_needed || (misc_flags & kfMiscMajRef) || min_allele_dosage || (max_allele_dosage != (~0LLU))) && (!(misc_flags & kfMiscNonfounders));
+}
+
+uint32_t are_sample_missing_dosage_cts_needed(misc_flags_t misc_flags, uint32_t smaj_missing_geno_report_requested, double mind_thresh, missing_rpt_t missing_rpt_modifier) {
+  return ((mind_thresh != 1.0) && (misc_flags & kfMiscMindDosage)) || (smaj_missing_geno_report_requested && (missing_rpt_modifier & (kfMissingRptScolNmissDosage | kfMissingRptScolFmissDosage)));
+}
+
+uint32_t are_variant_missing_hc_cts_needed(command1_flags_t command_flags1, misc_flags_t misc_flags, double geno_thresh, missing_rpt_t missing_rpt_modifier) {
+  return ((command_flags1 & kfCommand1GenotypingRate) && (!(misc_flags & kfMiscGenotypingRateDosage))) || ((command_flags1 & kfCommand1MissingReport) && (missing_rpt_modifier & (kfMissingRptVcolNmiss | kfMissingRptVcolNmissHh | kfMissingRptVcolHethap | kfMissingRptVcolFmiss | kfMissingRptVcolFmissHh | kfMissingRptVcolFhethap))) || ((geno_thresh != 1.0) && (!(misc_flags & kfMiscGenoDosage)));
+}
+
+uint32_t are_variant_hethap_cts_needed(command1_flags_t command_flags1, misc_flags_t misc_flags, double geno_thresh, missing_rpt_t missing_rpt_modifier, uint32_t first_hap_uidx) {
+  return (first_hap_uidx != 0x7fffffff) && (((command_flags1 & kfCommand1MissingReport) && (missing_rpt_modifier & (kfMissingRptVcolNmissHh | kfMissingRptVcolHethap | kfMissingRptVcolFmissHh | kfMissingRptVcolFhethap))) || ((geno_thresh != 1.0) && (!(misc_flags & kfMiscGenoHhMissing))));
+}
+
+uint32_t are_variant_missing_dosage_cts_needed(command1_flags_t command_flags1, misc_flags_t misc_flags, double geno_thresh, missing_rpt_t missing_rpt_modifier) {
+  return ((command_flags1 & kfCommand1GenotypingRate) && (misc_flags & kfMiscGenotypingRateDosage)) || ((command_flags1 & kfCommand1MissingReport) && (!(missing_rpt_modifier & kfMissingRptSampleOnly)) && (missing_rpt_modifier & (kfMissingRptVcolNmissDosage | kfMissingRptVcolFmissDosage))) || ((geno_thresh != 1.0) && (misc_flags & kfMiscGenoDosage));
+}
+
+// can simplify --geno-counts all-biallelic case, but let's first make sure the
+// general case works for multiallelic variants
+uint32_t are_raw_geno_cts_needed(command1_flags_t command_flags1, misc_flags_t misc_flags, double hwe_thresh) {
+  return (command_flags1 & kfCommand1GenoCounts) || ((misc_flags & kfMiscNonfounders) && ((command_flags1 & kfCommand1Hardy) || (hwe_thresh != 1.0)));
+}
+
+uint32_t are_founder_raw_geno_cts_needed(command1_flags_t command_flags1, misc_flags_t misc_flags, double hwe_thresh) {
+  return (!(misc_flags & kfMiscNonfounders)) && ((command_flags1 & kfCommand1Hardy) || (hwe_thresh != 1.0));
+}
+
+uint32_t is_info_reload_needed(command1_flags_t command_flags1, pvar_psam_t pvar_psam_modifier, exportf_flags_t exportf_modifier) {
+  // add kfExportfBcf later
+  return ((command_flags1 & kfCommand1MakePlink2) && (pvar_psam_modifier & kfPvarColXinfo)) || ((command_flags1 & kfCommand1Exportf) && (exportf_modifier & kfExportfVcf));
+}
+
+uint32_t grm_keep_needed(command1_flags_t command_flags1, pca_flags_t pca_flags) {
+  return ((command_flags1 & kfCommand1Pca) && (!(pca_flags & kfPcaApprox)));
+}
+
+void report_genotyping_rate(const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_missing_cts, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t male_ct, uint32_t variant_ct, uint32_t is_dosage) {
+  // defined the same way as PLINK 1.x, to allow this to serve as a sanity
+  // check
+  // trivial to multithread this if it ever matters
+  uint64_t tot_nony_missing = 0;
+  uint64_t tot_y_missing = 0;
+  uint64_t cur_tot_missing = 0;
+  uint32_t y_start = 0xffffffffU;
+  uint32_t y_end = 0xffffffffU;
+  uint32_t variant_ct_y = 0;
+  int32_t y_code;
+  if (xymt_exists(cip, kChrOffsetY, &y_code)) {
+    const uint32_t y_chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)y_code];
+    y_start = cip->chr_fo_vidx_start[y_chr_fo_idx];
+    y_end = cip->chr_fo_vidx_start[y_chr_fo_idx + 1];
+    variant_ct_y = popcount_bit_idx(variant_include, y_start, y_end);
+  }
+  uint32_t y_thresh = y_start;
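+  // accumulate missingness over contiguous segments: whenever variant_uidx
+  // crosses y_thresh (the start, then the end, of chrY), cur_tot_missing is
+  // flushed into the non-Y or Y bucket and the threshold is advanced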
+  uint32_t variant_uidx = 0;
+  uint32_t is_y = 0;  
+  for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+    next_set_unsafe_ck(variant_include, &variant_uidx);
+    if (variant_uidx >= y_thresh) {
+      if (is_y) {
+	tot_y_missing = cur_tot_missing;
+      } else {
+	tot_nony_missing = cur_tot_missing;
+      }
+      is_y = (variant_uidx < y_end);
+      cur_tot_missing = 0;
+      if (is_y) {
+	y_thresh = y_end;
+      } else {
+	y_thresh = 0xffffffffU;
+      }
+    }
+    cur_tot_missing += variant_missing_cts[variant_uidx];
+  }
+  if (is_y) {
+    tot_y_missing = cur_tot_missing;
+  } else {
+    tot_nony_missing += cur_tot_missing;
+  }
+  if ((!tot_y_missing) && (!tot_nony_missing)) {
+    LOGPRINTF("Total (%s) genotyping rate %sis exactly 1.\n", is_dosage? "dosage" : "hardcall", (raw_sample_ct != sample_ct)? "in remaining samples " : "");
+    return;
+  }
+  double genotyping_rate;
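+  // chrY genotypes are only expected for males, so when both male_ct and
+  // variant_ct_y are nonzero the rate is computed as
+  //   ((nonY nonmissing obs) / sample_ct + (Y nonmissing obs) / male_ct)
+  //     / variant_ct
+  // otherwise chrY variants are simply excluded from the denominator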
+  if (male_ct && variant_ct_y) {
+    const uint64_t nony_possible_obs = (variant_ct - variant_ct_y) * ((uint64_t)sample_ct);
+    const uint64_t y_possible_obs = variant_ct_y * ((uint64_t)male_ct);
+    genotyping_rate = ((double)((int64_t)(nony_possible_obs - tot_nony_missing))) / ((double)((int32_t)sample_ct)) + ((double)((int64_t)(y_possible_obs - tot_y_missing))) / ((double)((int32_t)male_ct));
+    genotyping_rate /= (int32_t)variant_ct;
+  } else {
+    variant_ct -= variant_ct_y;
+    const uint64_t denom = variant_ct * ((uint64_t)sample_ct);
+    genotyping_rate = (double)((int64_t)(denom - tot_nony_missing)) / ((double)((int64_t)denom));
+  }
+  if (genotyping_rate >= 0.9999995) {
+    LOGPRINTF("Total (%s) genotyping rate %sis in [0.9999995, 1).\n", is_dosage? "dosage" : "hardcall", (raw_sample_ct != sample_ct)? "in remaining samples " : "");
+  } else {
+    LOGPRINTF("Total (%s) genotyping rate %sis %g.\n", is_dosage? "dosage" : "hardcall", (raw_sample_ct != sample_ct)? "in remaining samples " : "", genotyping_rate);
+  }
+}
+
+pglerr_t apply_variant_bp_filters(const char* extract_fnames, const char* exclude_fnames, const chr_info_t* cip, const uint32_t* variant_bps, int32_t from_bp, int32_t to_bp, uint32_t raw_variant_ct, misc_flags_t misc_flags, unsorted_var_t vpos_sortstatus, uintptr_t* variant_include, uint32_t* variant_ct_ptr) {
+  // handles --from-bp/--to-bp, '--extract range', and '--exclude range'
+  if ((from_bp != -1) || (to_bp != -1)) {
+    if (vpos_sortstatus & kfUnsortedVarBp) {
+      logerrprint("Error: --from-bp and --to-bp require a sorted .pvar/.bim.  Retry this command\nafter using e.g. plink 1.9 --make-bed to sort your data.\n");
+      return kPglRetInconsistentInput;
+    }
+    const uint32_t chr_idx = next_set(cip->chr_mask, 0, kChrRawEnd);
+
+    // this function shouldn't be called unless variant_ct is nonzero
+    assert(chr_idx != kChrRawEnd);
+
+    const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[chr_idx];
+    uint32_t variant_uidx_start = cip->chr_fo_vidx_start[chr_fo_idx];
+    uint32_t variant_uidx_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
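+    // variant_bps is sorted within this chromosome (the kfUnsortedVarBp case
+    // was rejected above), so two binary searches suffice: find the first
+    // variant with bp >= from_bp, and the first with bp > to_bp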
+    if (from_bp != -1) {
+      const uint32_t from_offset = uint32arr_greater_than(&(variant_bps[variant_uidx_start]), variant_uidx_end - variant_uidx_start, (uint32_t)from_bp);
+      variant_uidx_start += from_offset;
+    }
+    if ((to_bp != -1) && (variant_uidx_start < variant_uidx_end)) {
+      const uint32_t to_offset = uint32arr_greater_than(&(variant_bps[variant_uidx_start]), variant_uidx_end - variant_uidx_start, 1 + ((uint32_t)to_bp));
+      variant_uidx_end = variant_uidx_start + to_offset;
+    }
+    if (variant_uidx_start) {
+      clear_bits_nz(0, variant_uidx_start, variant_include);
+    }
+    if (variant_uidx_end < raw_variant_ct) {
+      clear_bits_nz(variant_uidx_end, raw_variant_ct, variant_include);
+    }
+    *variant_ct_ptr = popcount_bit_idx(variant_include, variant_uidx_start, variant_uidx_end);
+  }
+  if (extract_fnames && (misc_flags & kfMiscExtractRange)) {
+    if (vpos_sortstatus & kfUnsortedVarBp) {
+      logerrprint("Error: '--extract range' requires a sorted .pvar/.bim.  Retry this command\nafter using e.g. plink 1.9 --make-bed to sort your data.\n");
+      return kPglRetInconsistentInput;
+    }
+    pglerr_t reterr = extract_exclude_range(extract_fnames, cip, variant_bps, raw_variant_ct, 0, variant_include, variant_ct_ptr);
+    if (reterr) {
+      return reterr;
+    }
+  }
+  if (exclude_fnames && (misc_flags & kfMiscExcludeRange)) {
+    if (vpos_sortstatus & kfUnsortedVarBp) {
+      logerrprint("Error: '--exclude range' requires a sorted .pvar/.bim.  Retry this command\nafter using e.g. plink 1.9 --make-bed to sort your data.\n");
+      return kPglRetInconsistentInput;
+    }
+    pglerr_t reterr = extract_exclude_range(exclude_fnames, cip, variant_bps, raw_variant_ct, 1, variant_include, variant_ct_ptr);
+    if (reterr) {
+      return reterr;
+    }
+  }
+  return kPglRetSuccess;
+}
+
+void update_sample_subsets(const uintptr_t* sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t* founder_info, uint32_t* founder_ct_ptr, uintptr_t* sex_nm, uintptr_t* sex_male, uint32_t* male_ct_ptr, uint32_t* nosex_ct_ptr) {
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  bitvec_and(sample_include, raw_sample_ctl, founder_info);
+  *founder_ct_ptr = popcount_longs(founder_info, raw_sample_ctl);
+  bitvec_and(sample_include, raw_sample_ctl, sex_male);
+  *male_ct_ptr = popcount_longs(sex_male, raw_sample_ctl);
+  bitvec_and(sample_include, raw_sample_ctl, sex_nm);
+  *nosex_ct_ptr = sample_ct - popcount_longs(sex_nm, raw_sample_ctl);
+}
+
+// command_flags2 will probably be needed before we're done
+static_assert(kPglMaxAltAlleleCt == 254, "plink2() --maj-ref needs to be updated.");
+pglerr_t plink2_core(char* var_filter_exceptions_flattened, char* require_pheno_flattened, char* require_covar_flattened, const plink2_cmdline_t* pcp, make_plink2_t make_plink2_modifier, char* pgenname, char* psamname, char* pvarname, char* outname, char* outname_end, char* king_cutoff_fprefix, chr_info_t* cip) {
+  pheno_col_t* pheno_cols = nullptr;
+  pheno_col_t* covar_cols = nullptr;
+  char* pheno_names = nullptr;
+  char* covar_names = nullptr;
+  uint32_t pheno_ct = 0;
+  uint32_t covar_ct = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  pgen_file_info_t pgfi;
+  pgen_reader_t simple_pgr;
+  pgfi_preinit(&pgfi);
+  pgr_preinit(&simple_pgr);
+  {
+    // this predicate will need to exclude --merge-list special case later
+    uint32_t pvar_renamed = 0;
+    if ((make_plink2_modifier & (kfMakeBed | kfMakePgen)) || (pcp->exportf_modifier & kfExportfIndMajorBed)) {
+      uint32_t fname_slen;
+#ifdef _WIN32
+      fname_slen = GetFullPathName(pgenname, kPglFnamesize, g_textbuf, nullptr);
+      if ((!fname_slen) || (fname_slen > kPglFnamesize))
+#else
+      if (!realpath(pgenname, g_textbuf))
+#endif
+      {
+	LOGERRPRINTFWW(g_errstr_fopen, pgenname);
+	goto plink2_ret_OPEN_FAIL;
+      }
+      uint32_t pgen_rename = 0;
+      if (make_plink2_modifier & kfMakePgen) {
+        strcpy(outname_end, ".pgen");
+	pgen_rename = realpath_identical(outname, g_textbuf, &(g_textbuf[kPglFnamesize + 64]));
+      }
+      if ((!pgen_rename) && ((make_plink2_modifier & kfMakeBed) || (pcp->exportf_modifier & kfExportfIndMajorBed))) {
+	strcpy(outname_end, ".bed");
+	pgen_rename = realpath_identical(outname, g_textbuf, &(g_textbuf[kPglFnamesize + 64]));
+      }
+      if (pgen_rename) {
+	LOGPRINTF("Note: --make-%s input and output filenames match.  Appending '~' to input\nfilenames.\n", (make_plink2_modifier & kfMakeBed)? "bed" : ((make_plink2_modifier & kfMakePvar)? "pgen" : "bpgen"));
+	fname_slen = strlen(pgenname);
+	memcpy(g_textbuf, pgenname, fname_slen);
+	strcpy(&(pgenname[fname_slen]), "~");
+	if (rename(g_textbuf, pgenname)) {
+	  logerrprint("Error: Failed to append '~' to input .bed/.pgen filename.\n");
+	  goto plink2_ret_OPEN_FAIL;
+	}
+	fname_slen = strlen(pvarname);
+	memcpy(g_textbuf, pvarname, fname_slen);
+	strcpy(&(pvarname[fname_slen]), "~");
+	if (rename(g_textbuf, pvarname)) {
+	  logerrprint("Error: Failed to append '~' to input .bim/.pvar filename.\n");
+	  goto plink2_ret_OPEN_FAIL;
+	}
+	pvar_renamed = 1;
+	fname_slen = strlen(psamname);
+	memcpy(g_textbuf, psamname, fname_slen);
+	strcpy(&(psamname[fname_slen]), "~");
+	if (rename(g_textbuf, psamname)) {
+	  logerrprint("Error: Failed to append '~' to input .fam/.psam filename.\n");
+	  goto plink2_ret_OPEN_FAIL;
+	}
+      }
+    }
+    uintptr_t max_sample_id_blen = 4;
+    uintptr_t max_sid_blen = 0;
+    uintptr_t max_paternal_id_blen = 2;
+    uintptr_t max_maternal_id_blen = 2;
+    uint32_t raw_sample_ct = 0;
+    uintptr_t* sample_include = nullptr;
+    char* sample_ids = nullptr;
+    char* sids = nullptr;
+    char* paternal_ids = nullptr;
+    char* maternal_ids = nullptr;
+    uintptr_t* sex_nm = nullptr;
+    uintptr_t* sex_male = nullptr;
+    uintptr_t* founder_info = nullptr;
+    uintptr_t max_pheno_name_blen = 0;
+    uint32_t raw_sample_ctl = 0;
+    uint32_t sample_ct = 0;
+    if (psamname[0]) {
+      reterr = load_psam(psamname, pcp->pheno_fname? nullptr : &(pcp->pheno_range_list), pcp->fam_cols, (pcp->pheno_fname && pcp->pheno_range_list.name_ct)? 0 : 0x7fffffff, pcp->missing_pheno, (pcp->misc_flags / kfMiscAffection01) & 1, &max_sample_id_blen, &max_sid_blen, &max_paternal_id_blen, &max_maternal_id_blen, &sample_include, &sample_ids, &sids, &paternal_ids, &maternal_ids, &founder_info, &sex_nm, &sex_male, &pheno_cols, &pheno_names, &raw_sample_ct, &pheno_ct, &max_pheno_name_blen);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+      // todo: add option to discard loaded SIDs
+      raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+      sample_ct = popcount_longs(sample_include, raw_sample_ctl);
+      const uint32_t known_sex_ct = popcount_longs(sex_nm, raw_sample_ctl);
+      const uint32_t male_ct = popcount_longs(sex_male, raw_sample_ctl);
+      const uint32_t female_ct = known_sex_ct - male_ct;
+      const uint32_t founder_ct = popcount_longs(founder_info, raw_sample_ctl);
+      if (known_sex_ct == sample_ct) {
+        LOGPRINTFWW("%u sample%s (%u female%s, %u male%s; %u founder%s) loaded from %s.\n", sample_ct, (sample_ct == 1)? "" : "s", female_ct, (female_ct == 1)? "" : "s", male_ct, (male_ct == 1)? "" : "s", founder_ct, (founder_ct == 1)? "" : "s", psamname);
+      } else {
+	const uint32_t unknown_sex_ct = sample_ct - known_sex_ct;
+        LOGPRINTFWW("%u sample%s (%u female%s, %u male%s, %u ambiguous; %u founder%s) loaded from %s.\n", sample_ct, (sample_ct == 1)? "" : "s", female_ct, (female_ct == 1)? "" : "s", male_ct, (male_ct == 1)? "" : "s", unknown_sex_ct, founder_ct, (founder_ct == 1)? "" : "s", psamname);
+      }
+    }
+
+    uint32_t max_variant_id_slen = 1;
+    uint32_t info_reload_slen = is_info_reload_needed(pcp->command_flags1, pcp->pvar_psam_modifier, pcp->exportf_modifier);
+    uintptr_t* variant_allele_idxs = nullptr;
+    uint32_t raw_variant_ct = 0;
+    uint32_t variant_ct = 0;
+    char* xheader = nullptr;
+    uintptr_t xheader_blen = 0;
+    uintptr_t* variant_include = nullptr;
+    uint32_t* variant_bps = nullptr;
+    char** variant_ids = nullptr;
+    char** allele_storage = nullptr;
+    uintptr_t* pvar_qual_present = nullptr;
+    float* pvar_quals = nullptr;
+    uintptr_t* pvar_filter_present = nullptr;
+    uintptr_t* pvar_filter_npass = nullptr;
+    char** pvar_filter_storage = nullptr;
+    uintptr_t* nonref_flags = nullptr;
+    uint32_t xheader_info_pr = 0;
+    uint32_t max_allele_slen = 0;
+    uint32_t max_filter_slen = 0;
+    unsorted_var_t vpos_sortstatus = kfUnsortedVar0;
+    double* variant_cms = nullptr;
+    chr_idx_t* chr_idxs = nullptr; // split-chromosome case only
+    if (pvarname[0]) {
+      reterr = load_pvar(pvarname, var_filter_exceptions_flattened, pcp->varid_template, pcp->missing_varid_match, pcp->misc_flags, pcp->pvar_psam_modifier, pcp->exportf_modifier, pcp->var_min_qual, pcp->splitpar_bound1, pcp->splitpar_bound2, pcp->new_variant_id_max_allele_slen, (pcp->filter_flags / kfFilterSnpsOnly) & 3, !(pcp->filter_flags & kfFilterNoSplitChr), cip, &max_variant_id_slen, &info_reload_slen, &vpos_sortstatus, &xheader, &variant_include, &variant_bps, &variant_ids, &vari [...]
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+      if (variant_ct == raw_variant_ct) {
+	LOGPRINTFWW("%u variant%s loaded from %s.\n", variant_ct, (variant_ct == 1)? "" : "s", pvarname);
+      } else {
+	LOGPRINTFWW("%u out of %u variant%s loaded from %s.\n", variant_ct, raw_variant_ct, (raw_variant_ct == 1)? "" : "s", pvarname);
+      }
+      if (info_reload_slen && (make_plink2_modifier & (kfMakeBim | kfMakePvar)) && (!pvar_renamed)) {
+	// need to be careful with .pvar in this case
+	uint32_t fname_slen;
+#ifdef _WIN32
+	fname_slen = GetFullPathName(pvarname, kPglFnamesize, g_textbuf, nullptr);
+	if ((!fname_slen) || (fname_slen > kPglFnamesize))
+#else
+	if (!realpath(pvarname, g_textbuf))
+#endif
+	{
+	  LOGERRPRINTFWW(g_errstr_fopen, pvarname);
+	  goto plink2_ret_OPEN_FAIL;
+	}
+	if (make_plink2_modifier & kfMakeBim) {
+	  char* bimname_end = strcpya0(outname_end, ".bim");
+	  if (make_plink2_modifier & kfMakeBimZs) {
+	    strcpy(bimname_end, ".zst");
+	  }
+	  pvar_renamed = realpath_identical(outname, g_textbuf, &(g_textbuf[kPglFnamesize + 64]));
+	  if (pvar_renamed) {
+	    logprint("Note: .bim input and output filenames match.  Appending '~' to input filename.\n");
+	    fname_slen = strlen(pvarname);
+	    memcpy(g_textbuf, pvarname, fname_slen);
+	    strcpy(&(pvarname[fname_slen]), "~");
+	    if (rename(g_textbuf, pvarname)) {
+	      logerrprint("Error: Failed to append '~' to input .bim filename.\n");
+	      goto plink2_ret_OPEN_FAIL;
+	    }
+	  }
+	}
+	if ((!pvar_renamed) && (make_plink2_modifier & kfMakePvar)) {
+	  char* pvarname_end = strcpya0(outname_end, ".pvar");
+	  if (pcp->pvar_psam_modifier & kfPvarZs) {
+	    strcpy(pvarname_end, ".zst");
+	  }
+	  // pvar_renamed = realpath_identical();
+	  if (realpath_identical(outname, g_textbuf, &(g_textbuf[kPglFnamesize + 64]))) {
+	    logprint("Note: .pvar input and output filenames match.  Appending '~' to input filename.\n");
+	    fname_slen = strlen(pvarname);
+	    memcpy(g_textbuf, pvarname, fname_slen);
+	    strcpy(&(pvarname[fname_slen]), "~");
+	    if (rename(g_textbuf, pvarname)) {
+	      logerrprint("Error: Failed to append '~' to input .pvar filename.\n");
+	      goto plink2_ret_OPEN_FAIL;
+	    }
+	  }
+	}
+      }
+    }
+
+    const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    uintptr_t pgr_alloc_cacheline_ct = 0;
+    if (pgenname[0]) {
+      pgen_header_ctrl_t header_ctrl;
+      uintptr_t cur_alloc_cacheline_ct;
+      while (1) {
+	reterr = pgfi_init_phase1(pgenname, raw_variant_ct, raw_sample_ct, 0, &header_ctrl, &pgfi, &cur_alloc_cacheline_ct, g_logbuf);
+	if (!reterr) {
+	  break;
+	}
+	// detect and autoconvert plink 1 sample-major files, instead of
+	// failing (don't bother supporting plink 0.99 files any more)
+	if (reterr == kPglRetSampleMajorBed) {
+	  char* pgenname_end = memcpya(pgenname, outname, (uintptr_t)(outname_end - outname));
+	  pgenname_end = strcpya(pgenname_end, ".pgen");
+	  const uint32_t no_vmaj_ext = (pcp->command_flags1 & kfCommand1MakePlink2) && (!pcp->filter_flags) && ((make_plink2_modifier & (kfMakePgen | (kfMakePgenFormatBase * 3))) == kfMakePgen);
+	  if (no_vmaj_ext) {
+	    *pgenname_end = '\0';
+	    make_plink2_modifier &= ~kfMakePgen;
+	    // no --make-just-pgen command, so we'll never entirely skip the
+	    // make_plink2 operation
+	  } else {
+	    strcpy(pgenname_end, ".vmaj");
+	  }
+	  reterr = plink1_sample_major_to_pgen(pgenname, raw_variant_ct, raw_sample_ct, (pcp->misc_flags / kfMiscRealRefAlleles) & 1, pcp->max_thread_ct, pgfi.shared_ff);
+	  if (!reterr) {
+	    fclose(pgfi.shared_ff);
+	    pgfi.shared_ff = nullptr;
+	    continue;
+	  }
+	} else {
+	  if (reterr != kPglRetReadFail) {
+	    wordwrapb(0);
+	    logerrprintb();
+	  }
+	}
+	goto plink2_ret_1;
+      }
+      pgfi.allele_idx_offsets = variant_allele_idxs;
+      unsigned char* pgfi_alloc;
+      if (bigstack_alloc_uc(cur_alloc_cacheline_ct * kCacheline, &pgfi_alloc)) {
+	goto plink2_ret_NOMEM;
+      }
+      const uint32_t nonref_flags_already_loaded = (nonref_flags != nullptr);
+      if ((!nonref_flags) && ((header_ctrl & 192) == 192)) {
+	if (bigstack_alloc_ul(raw_variant_ctl, &nonref_flags)) {
+	  goto plink2_ret_NOMEM;
+	}
+      }
+      pgfi.nonref_flags = nonref_flags;
+      uint32_t max_vrec_width;
+      // only practical effect of setting use_blockload to zero here is that
+      // pgr_alloc_cacheline_ct is overestimated by
+      // DIV_UP(max_vrec_width, kCacheline).
+      reterr = pgfi_init_phase2(header_ctrl, 1, nonref_flags_already_loaded, 1, 0, raw_variant_ct, &max_vrec_width, &pgfi, pgfi_alloc, &pgr_alloc_cacheline_ct, g_logbuf);
+      if (reterr) {
+	if (reterr != kPglRetReadFail) {
+	  wordwrapb(0);
+	  logerrprintb();
+	}
+	goto plink2_ret_1;
+      }
+      if (pcp->misc_flags & kfMiscRealRefAlleles) {
+	if (nonref_flags && (!are_all_bits_one(nonref_flags, raw_variant_ct))) {
+	  // technically a lie, it's okay if a .bed is first converted to .pgen
+	  // without this flag, and then the user remembers the existence of
+	  // --real-ref-alleles later.  but to reduce the ease of
+	  // foot-shooting, we don't allow this to clobber arbitrary
+	  // nonref_flags arrays.
+	  logerrprint("Error: --real-ref-alleles must be used on a plink1 fileset.\n");
+	  goto plink2_ret_INCONSISTENT_INPUT;
+	}
+
+	// wasteful if nonref_flags was allocated, but nonref_flags isn't that
+	// large, and --real-ref-alleles + --make-pgen can be run separately
+	// from anything truly memory-limited
+	nonref_flags = nullptr;
+	pgfi.nonref_flags = nullptr;
+	
+	pgfi.gflags &= ~kfPgenGlobalAllNonref;
+      }
+      if (is_single_variant_loader_needed(king_cutoff_fprefix, pcp->command_flags1, make_plink2_modifier)) {
+	// ugly kludge, probably want to add pgenlib_internal support for this
+	// hybrid use pattern
+	FILE* shared_ff_copy = pgfi.shared_ff;
+	pgfi.shared_ff = nullptr;
+	unsigned char* simple_pgr_alloc;
+	if (bigstack_alloc_uc((pgr_alloc_cacheline_ct + DIV_UP(max_vrec_width, kCacheline)) * kCacheline, &simple_pgr_alloc)) {
+	  goto plink2_ret_NOMEM;
+	}
+	reterr = pgr_init(pgenname, max_vrec_width, &pgfi, &simple_pgr, simple_pgr_alloc);
+	if (reterr) {
+	  if (reterr == kPglRetOpenFail) {
+	    LOGERRPRINTF(g_errstr_fopen, pgenname);
+	  }
+	  // only other possibility is kPglRetReadFail
+	  goto plink2_ret_1;
+	}
+	pgfi.shared_ff = shared_ff_copy;
+	if (pcp->command_flags1 & kfCommand1Validate) {
+	  LOGPRINTFWW5("Validating %s... ", pgenname);
+	  fflush(stdout);
+	  reterr = pgr_validate(&simple_pgr, g_logbuf);
+	  if (reterr) {
+	    if (reterr != kPglRetReadFail) {
+	      logprint("\n");
+	      wordwrapb(0);
+	      logerrprintb();
+	    }
+	    goto plink2_ret_1;
+	  }
+	  logprint("done.\n");
+	  if (!(pcp->command_flags1 & (~kfCommand1Validate))) {
+	    goto plink2_ret_1;
+	  }
+	}
+      }
+      // any function using blockload must perform its own pgr_init(), etc.
+    }
+    if (pcp->pheno_fname) {
+      reterr = load_phenos(pcp->pheno_fname, &(pcp->pheno_range_list), sample_include, sample_ids, raw_sample_ct, sample_ct, max_sample_id_blen, pcp->missing_pheno, (pcp->misc_flags / kfMiscAffection01) & 1, &pheno_cols, &pheno_names, &pheno_ct, &max_pheno_name_blen);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+
+    // move processing of PLINK 1.x cluster-loading/filtering flags here, since
+    // they're now under the categorical-phenotype umbrella
+    if ((pcp->misc_flags & kfMiscCatPhenoFamily) || pcp->within_fname) {
+      reterr = plink1_cluster_import(pcp->within_fname, pcp->catpheno_name, pcp->family_missing_catname, sample_include, sample_ids, raw_sample_ct, sample_ct, max_sample_id_blen, pcp->mwithin_val, &pheno_cols, &pheno_names, &pheno_ct, &max_pheno_name_blen);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+
+    if (!pheno_ct) {
+      logprint("Note: No phenotype data present.\n");      
+    } else {
+      if (pheno_ct == 1) {
+	if (pheno_cols[0].type_code == kPhenoDtypeCc) {
+	  const uint32_t obs_ct = popcount_longs(pheno_cols[0].nonmiss, raw_sample_ctl);
+	  const uint32_t case_ct = popcount_longs(pheno_cols[0].data.cc, raw_sample_ctl);
+	  const uint32_t ctrl_ct = obs_ct - case_ct;
+	  LOGPRINTF("1 binary phenotype loaded (%u case%s, %u control%s).\n", case_ct, (case_ct == 1)? "" : "s", ctrl_ct, (ctrl_ct == 1)? "" : "s");
+	} else if (pheno_cols[0].type_code == kPhenoDtypeQt) {
+	  LOGPRINTF("1 quantitative phenotype loaded.\n");
+	} else {
+	  LOGPRINTF("1 categorical phenotype loaded.\n");
+	}
+      } else {
+	uint32_t cc_ct = 0;
+	uint32_t qt_ct = 0;
+	for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	  const pheno_dtype_t cur_type_code = pheno_cols[pheno_idx].type_code;
+	  if (pheno_cols[pheno_idx].type_code == kPhenoDtypeCc) {
+	    ++cc_ct;
+	  } else if (cur_type_code == kPhenoDtypeQt) {
+	    ++qt_ct;
+	  }
+	}
+	uint32_t cat_ct = pheno_ct - cc_ct - qt_ct;
+	// just brute force this for now
+	if (!cc_ct) {
+	  if (!qt_ct) {
+	    LOGPRINTF("%u categorical phenotypes loaded.\n", pheno_ct);
+	  } else if (!cat_ct) {
+	    LOGPRINTF("%u quantitative phenotypes loaded.\n", pheno_ct);
+	  } else {
+	    LOGPRINTF("%u phenotypes loaded (%u quantitative, %u categorical).\n", pheno_ct, qt_ct, cat_ct);
+	  }
+	} else if (!qt_ct) {
+	  if (!cat_ct) {
+	    LOGPRINTF("%u binary phenotypes loaded.\n", pheno_ct);
+	  } else {
+	    LOGPRINTF("%u phenotypes loaded (%u binary, %u categorical).\n", pheno_ct, cc_ct, cat_ct);
+	  }
+	} else if (!cat_ct) {
+	  LOGPRINTF("%u phenotypes loaded (%u binary, %u quantitative).\n", pheno_ct, cc_ct, qt_ct);
+	} else {
+	  LOGPRINTFWW("%u phenotypes loaded (%u binary, %u quantitative, %u categorical).\n", pheno_ct, cc_ct, qt_ct, cat_ct);
+	}
+      }
+    }
+    const uint32_t full_variant_id_htable_needed = variant_ct && (pcp->varid_from || pcp->varid_to || pcp->varid_snp || pcp->varid_exclude_snp || pcp->snps_range_list.name_ct || pcp->exclude_snps_range_list.name_ct);
+    if (variant_ct && (!full_variant_id_htable_needed)) {
+      reterr = apply_variant_bp_filters(pcp->extract_fnames, pcp->exclude_fnames, cip, variant_bps, pcp->from_bp, pcp->to_bp, raw_variant_ct, pcp->misc_flags, vpos_sortstatus, variant_include, &variant_ct);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+    if (variant_ct && (full_variant_id_htable_needed || (pcp->extract_fnames && (!(pcp->misc_flags & kfMiscExtractRange))) || (pcp->exclude_fnames && (!(pcp->misc_flags & kfMiscExcludeRange))))) {
+      // don't bother with having different allow_dups vs. no-allow_dups hash
+      // table structures; just check specific IDs for duplication in the
+      // no-duplicates-allowed cases
+      unsigned char* bigstack_mark = g_bigstack_base;
+      uint32_t* variant_id_htable = nullptr;
+      uint32_t* htable_dup_base = nullptr;
+      uint32_t variant_id_htable_size;
+      reterr = alloc_and_populate_id_htable_mt(variant_include, variant_ids, variant_ct, pcp->max_thread_ct, &variant_id_htable, &htable_dup_base, &variant_id_htable_size);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+      if (vpos_sortstatus & kfUnsortedVarBp) {
+	if (pcp->varid_from || pcp->varid_to) {
+	  logerrprint("Error: --from/--to require a sorted .pvar/.bim.  Retry this command after using\ne.g. plink 1.9 --make-bed to sort your data.\n");
+	  goto plink2_ret_INCONSISTENT_INPUT;
+	}
+	if (pcp->window_bp != -1) {
+	  logerrprint("Error: --window requires a sorted .pvar/.bim.  Retry this command\nafter using e.g. plink 1.9 --make-bed to sort your data.\n");
+	  goto plink2_ret_INCONSISTENT_INPUT;
+	}
+      }
+      if (pcp->varid_from || pcp->varid_to) {
+	reterr = from_to_flag(variant_ids, variant_id_htable, pcp->varid_from, pcp->varid_to, raw_variant_ct, max_variant_id_slen, variant_id_htable_size, variant_include, cip, &variant_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+      if (pcp->varid_snp) {
+	reterr = snp_flag(variant_bps, variant_ids, variant_id_htable, pcp->varid_snp, raw_variant_ct, max_variant_id_slen, variant_id_htable_size, 0, pcp->window_bp, variant_include, cip, &variant_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+      if (pcp->snps_range_list.name_ct) {
+	reterr = snps_flag(variant_ids, variant_id_htable, &(pcp->snps_range_list), raw_variant_ct, max_variant_id_slen, variant_id_htable_size, 0, variant_include, &variant_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+      if (pcp->varid_exclude_snp) {
+	reterr = snp_flag(variant_bps, variant_ids, variant_id_htable, pcp->varid_exclude_snp, raw_variant_ct, max_variant_id_slen, variant_id_htable_size, 1, pcp->window_bp, variant_include, cip, &variant_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+      if (pcp->exclude_snps_range_list.name_ct) {
+	reterr = snps_flag(variant_ids, variant_id_htable, &(pcp->exclude_snps_range_list), raw_variant_ct, max_variant_id_slen, variant_id_htable_size, 1, variant_include, &variant_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+
+      if (pcp->extract_fnames && (!(pcp->misc_flags & kfMiscExtractRange))) {
+	reterr = extract_exclude_flag_norange(variant_ids, variant_id_htable, pcp->extract_fnames, raw_variant_ct, max_variant_id_slen, variant_id_htable_size, 0, variant_include, &variant_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+      if (pcp->exclude_fnames && (!(pcp->misc_flags & kfMiscExcludeRange))) {
+	reterr = extract_exclude_flag_norange(variant_ids, variant_id_htable, pcp->exclude_fnames, raw_variant_ct, max_variant_id_slen, variant_id_htable_size, 1, variant_include, &variant_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+      bigstack_reset(bigstack_mark);
+      if (variant_ct && full_variant_id_htable_needed) {
+	reterr = apply_variant_bp_filters(pcp->extract_fnames, pcp->exclude_fnames, cip, variant_bps, pcp->from_bp, pcp->to_bp, raw_variant_ct, pcp->misc_flags, vpos_sortstatus, variant_include, &variant_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+    }
+    // xid_mode may vary between these operations in a single run, and
+    // sample-sort is relatively cheap, so we abandon plink 1.9's "construct
+    // sample ID map only once" optimization.
+    if (pcp->update_sex_fname) {
+      reterr = update_sample_sexes(pcp->update_sex_fname, sample_include, sample_ids, raw_sample_ct, sample_ct, max_sample_id_blen, pcp->update_sex_colm2, sex_nm, sex_male);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+    if (pcp->keepfam_fnames) {
+      reterr = keep_or_remove(pcp->keepfam_fnames, sample_ids, sids, raw_sample_ct, max_sample_id_blen, max_sid_blen, kfKeepFam, sample_include, &sample_ct);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+    if (pcp->keep_fnames) {
+      reterr = keep_or_remove(pcp->keep_fnames, sample_ids, sids, raw_sample_ct, max_sample_id_blen, max_sid_blen, (keep_flags_t)(kfKeepForceSid * ((pcp->misc_flags / kfMiscKeepfileSid) & 1)), sample_include, &sample_ct);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+    if (pcp->removefam_fnames) {
+      reterr = keep_or_remove(pcp->removefam_fnames, sample_ids, sids, raw_sample_ct, max_sample_id_blen, max_sid_blen, kfKeepRemove | kfKeepFam, sample_include, &sample_ct);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+    if (pcp->remove_fnames) {
+      reterr = keep_or_remove(pcp->remove_fnames, sample_ids, sids, raw_sample_ct, max_sample_id_blen, max_sid_blen, kfKeepRemove | ((keep_flags_t)(kfKeepForceSid * ((pcp->misc_flags / kfMiscRemovefileSid) & 1))), sample_include, &sample_ct);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+    uint32_t* sample_missing_dosage_cts = nullptr;
+    uint32_t* sample_missing_hc_cts = nullptr;
+    uint32_t* sample_hethap_cts = nullptr;
+    uintptr_t max_covar_name_blen = 0;
+    if (psamname[0]) {
+      if (pcp->misc_flags & kfMiscRequirePheno) {
+        reterr = require_pheno(pheno_cols, pheno_names, require_pheno_flattened, raw_sample_ct, pheno_ct, max_pheno_name_blen, 0, sample_include, &sample_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+      if (pcp->filter_flags & (kfFilterExclFemales | kfFilterExclMales | kfFilterExclNosex)) {
+	if (pcp->filter_flags & kfFilterExclFemales) {
+	  for (uint32_t widx = 0; widx < raw_sample_ctl; ++widx) {
+	    sample_include[widx] &= (~sex_nm[widx]) | sex_male[widx];
+	  }
+	}
+	if (pcp->filter_flags & kfFilterExclMales) {
+	  bitvec_andnot(sex_male, raw_sample_ctl, sample_include);
+	}
+	if (pcp->filter_flags & kfFilterExclNosex) {
+	  bitvec_and(sex_nm, raw_sample_ctl, sample_include);
+	}
+	const uint32_t old_sample_ct = sample_ct;
+	sample_ct = popcount_longs(sample_include, raw_sample_ctl);
+	const uint32_t removed_ct = old_sample_ct - sample_ct;
+	LOGPRINTF("%u sample%s removed due to sex filter(s).\n", removed_ct, (removed_ct == 1)? "" : "s");
+      }
+      if (pcp->filter_flags & (kfFilterExclFounders | kfFilterExclNonfounders)) {
+	const uint32_t keep_founders = (pcp->filter_flags / kfFilterExclNonfounders) & 1;
+	if (keep_founders) {
+	  bitvec_and(founder_info, raw_sample_ctl, sample_include);
+	} else {
+	  bitvec_andnot(founder_info, raw_sample_ctl, sample_include);
+	}
+	const uint32_t old_sample_ct = sample_ct;
+	sample_ct = popcount_longs(sample_include, raw_sample_ctl);
+	const uint32_t removed_ct = old_sample_ct - sample_ct;
+	LOGPRINTF("--keep-%sfounders: %u sample%s removed.\n", keep_founders? "" : "non", removed_ct, (removed_ct == 1)? "" : "s");
+      }
+      const uint32_t smaj_missing_geno_report_requested = (pcp->command_flags1 & kfCommand1MissingReport) && (!(pcp->missing_rpt_modifier & kfMissingRptVariantOnly));
+      if ((pcp->mind_thresh < 1.0) || smaj_missing_geno_report_requested) {
+	if (bigstack_alloc_ui(raw_sample_ct, &sample_missing_hc_cts) ||
+	    bigstack_alloc_ui(raw_sample_ct, &sample_hethap_cts)) {
+	  goto plink2_ret_NOMEM;
+	}
+	if (are_sample_missing_dosage_cts_needed(pcp->misc_flags, smaj_missing_geno_report_requested, pcp->mind_thresh, pcp->missing_rpt_modifier)) {
+	  if (pgfi.gflags & kfPgenGlobalDosagePresent) {
+	    if (bigstack_alloc_ui(raw_sample_ct, &sample_missing_dosage_cts)) {
+	      goto plink2_ret_NOMEM;
+	    }
+	  } else {
+	    sample_missing_dosage_cts = sample_missing_hc_cts;
+	  }
+	}
+	// could avoid this call and make load_allele_and_geno_counts() do
+	// double duty with --missing?
+	reterr = load_sample_missing_cts(sex_male, variant_include, cip, raw_variant_ct, variant_ct, raw_sample_ct, pcp->max_thread_ct, pgr_alloc_cacheline_ct, &pgfi, sample_missing_hc_cts, (pgfi.gflags & kfPgenGlobalDosagePresent)? sample_missing_dosage_cts : nullptr, sample_hethap_cts);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+	if (pcp->mind_thresh < 1.0) {
+	  uint32_t variant_ct_y = 0;
+	  int32_t y_code;
+	  if (xymt_exists(cip, kChrOffsetY, &y_code)) {
+	    variant_ct_y = count_chr_variants_unsafe(variant_include, cip, y_code);
+	  }
+	  reterr = mind_filter((pcp->misc_flags & kfMiscMindDosage)? sample_missing_dosage_cts : sample_missing_hc_cts, (pcp->misc_flags & kfMiscMindHhMissing)? sample_hethap_cts : nullptr, sample_ids, sids, raw_sample_ct, max_sample_id_blen, max_sid_blen, variant_ct, variant_ct_y, pcp->mind_thresh, sample_include, sex_male, &sample_ct, outname, outname_end);
+	  if (reterr) {
+	    goto plink2_ret_1;
+	  }
+	}
+	if (!smaj_missing_geno_report_requested) {
+	  bigstack_reset(sample_missing_hc_cts);
+	}
+	// this results in a small "memory leak" when a regular missingness
+	// report is requested, not a big deal
+      }
+      if (pcp->covar_fname || pcp->covar_range_list.name_ct) {
+	const char* cur_covar_fname = pcp->covar_fname? pcp->covar_fname : (pcp->pheno_fname? pcp->pheno_fname : psamname);
+	reterr = load_phenos(cur_covar_fname, &(pcp->covar_range_list), sample_include, sample_ids, raw_sample_ct, sample_ct, max_sample_id_blen, pcp->missing_pheno, 2, &covar_cols, &covar_names, &covar_ct, &max_covar_name_blen);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+	LOGPRINTF("%u covariate%s loaded from %s.\n", covar_ct, (covar_ct == 1)? "" : "s", cur_covar_fname);
+
+	// do we still want to clear some main phenotype values here if some
+	// covariate values are missing?  (don't think there's a point to
+	// preserving that behavior; let the regression functions do it to
+	// their local phenotype copies on their own.)
+      }
+
+      if (pcp->misc_flags & kfMiscRequireCovar) {
+        reterr = require_pheno(covar_cols, covar_names, require_covar_flattened, raw_sample_ct, covar_ct, max_covar_name_blen, 1, sample_include, &sample_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+      if (pcp->keep_if_expr.pheno_name) {
+	reterr = keep_remove_if(&(pcp->keep_if_expr), pheno_cols, pheno_names, covar_cols, covar_names, raw_sample_ct, pheno_ct, max_pheno_name_blen, covar_ct, max_covar_name_blen, (pcp->misc_flags / kfMiscAffection01) & 1, 0, sample_include, &sample_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+      if (pcp->remove_if_expr.pheno_name) {
+	reterr = keep_remove_if(&(pcp->remove_if_expr), pheno_cols, pheno_names, covar_cols, covar_names, raw_sample_ct, pheno_ct, max_pheno_name_blen, covar_ct, max_covar_name_blen, (pcp->misc_flags / kfMiscAffection01) & 1, 1, sample_include, &sample_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+      // meow
+      if (pcp->keep_cats_fname || pcp->keep_cat_names_flattened) {
+	reterr = keep_remove_cats(pcp->keep_cats_fname, pcp->keep_cat_names_flattened, pcp->keep_cat_phenoname, pheno_cols, pheno_names, covar_cols, covar_names, raw_sample_ct, pheno_ct, max_pheno_name_blen, covar_ct, max_covar_name_blen, 0, pcp->max_thread_ct, sample_include, &sample_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+      if (pcp->remove_cats_fname || pcp->remove_cat_names_flattened) {
+	reterr = keep_remove_cats(pcp->remove_cats_fname, pcp->remove_cat_names_flattened, pcp->remove_cat_phenoname, pheno_cols, pheno_names, covar_cols, covar_names, raw_sample_ct, pheno_ct, max_pheno_name_blen, covar_ct, max_covar_name_blen, 1, pcp->max_thread_ct, sample_include, &sample_ct);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+    }
+
+    const uint32_t nonfounders = (pcp->misc_flags / kfMiscNonfounders) & 1;
+    uint32_t founder_ct = 0;
+    uint32_t male_ct = 0;
+    uint32_t nosex_ct = 0;
+    if (psamname[0]) {
+      if ((!sample_ct) && (!(pcp->misc_flags & kfMiscAllowNoSamples))) {
+	logerrprint("Error: No samples remaining after main filters.  (Add --allow-no-samples to\npermit this.)\n");
+	goto plink2_ret_INCONSISTENT_INPUT;
+      }
+      update_sample_subsets(sample_include, raw_sample_ct, sample_ct, founder_info, &founder_ct, sex_nm, sex_male, &male_ct, &nosex_ct);
+      if (pcp->filter_flags) {
+	const uint32_t female_ct = sample_ct - male_ct - nosex_ct;
+	if (!nosex_ct) {
+	  LOGPRINTFWW("%u sample%s (%u female%s, %u male%s; %u founder%s) remaining after main filters.\n", sample_ct, (sample_ct == 1)? "" : "s", female_ct, (female_ct == 1)? "" : "s", male_ct, (male_ct == 1)? "" : "s", founder_ct, (founder_ct == 1)? "" : "s");
+	} else {
+	  LOGPRINTFWW("%u sample%s (%u female%s, %u male%s, %u ambiguous; %u founder%s) remaining after main filters.\n", sample_ct, (sample_ct == 1)? "" : "s", female_ct, (female_ct == 1)? "" : "s", male_ct, (male_ct == 1)? "" : "s", nosex_ct, founder_ct, (founder_ct == 1)? "" : "s");
+	}
+	if ((pheno_ct == 1) && (pheno_cols[0].type_code == kPhenoDtypeCc)) {
+	  const uint32_t obs_ct = popcount_longs_intersect(pheno_cols[0].nonmiss, sample_include, raw_sample_ctl);
+	  const uint32_t case_ct = popcount_longs_intersect(pheno_cols[0].data.cc, sample_include, raw_sample_ctl);
+	  const uint32_t ctrl_ct = obs_ct - case_ct;
+	  LOGPRINTF("%u case%s and %u control%s remaining after main filters.\n", case_ct, (case_ct == 1)? "" : "s", ctrl_ct, (ctrl_ct == 1)? "" : "s");
+	}
+      }
+    }
+    if (pcp->pheno_transform_flags & kfPhenoTransformSplitCat) {
+      reterr = split_cat_pheno(pcp->split_cat_phenonames_flattened, sample_include, raw_sample_ct, pcp->pheno_transform_flags, &pheno_cols, &pheno_names, &pheno_ct, &max_pheno_name_blen, &covar_cols, &covar_names, &covar_ct, &max_covar_name_blen);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+
+    // quantile-normalize before variance-standardizing: that order at least
+    // has a minor effect, whereas the reverse order is pointless
+    if (pcp->pheno_transform_flags & (kfPhenoTransformQuantnormPheno | kfPhenoTransformQuantnormAll)) {
+      reterr = pheno_quantile_normalize(pcp->quantnorm_flattened, sample_include, pheno_names, raw_sample_ct, sample_ct, pheno_ct, max_pheno_name_blen, 0, (pcp->pheno_transform_flags / kfPhenoTransformQuantnormPheno) & 1, pheno_cols);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+    if (pcp->pheno_transform_flags & (kfPhenoTransformQuantnormCovar | kfPhenoTransformQuantnormAll)) {
+      reterr = pheno_quantile_normalize((pcp->pheno_transform_flags & kfPhenoTransformQuantnormAll)? pcp->quantnorm_flattened : pcp->covar_quantnorm_flattened, sample_include, covar_names, raw_sample_ct, sample_ct, covar_ct, max_covar_name_blen, 1, (pcp->pheno_transform_flags / kfPhenoTransformQuantnormCovar) & 1, covar_cols);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+
+    if (pcp->pheno_transform_flags & (kfPhenoTransformVstdCovar | kfPhenoTransformVstdAll)) {
+      const uint32_t is_covar_flag = (pcp->pheno_transform_flags / kfPhenoTransformVstdCovar) & 1;
+      if (!is_covar_flag) {
+	reterr = pheno_variance_standardize(pcp->vstd_flattened, sample_include, pheno_names, raw_sample_ct, pheno_ct, max_pheno_name_blen, 0, 0, pheno_cols);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+      reterr = pheno_variance_standardize(pcp->vstd_flattened, sample_include, covar_names, raw_sample_ct, covar_ct, max_covar_name_blen, 1, is_covar_flag, covar_cols);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+
+    // dosages are currently in 32768ths
+    uint64_t* allele_dosages = nullptr; // same indexes as allele_storage
+    uint64_t* founder_allele_dosages = nullptr;
+    alt_allele_ct_t* maj_alleles = nullptr;
+    double* allele_freqs = nullptr;
+    uint32_t* raw_geno_cts = nullptr;
+    uint32_t* founder_raw_geno_cts = nullptr;
+    unsigned char* bigstack_mark_allele_dosages = g_bigstack_base;
+    unsigned char* bigstack_mark_founder_allele_dosages = g_bigstack_base;
+    if (pgenname[0]) {
+      if (are_allele_freqs_needed(pcp->command_flags1, pcp->min_maf, pcp->max_maf)) {
+	if (are_maj_alleles_needed(pcp->command_flags1)) {
+	  maj_alleles = (alt_allele_ct_t*)bigstack_alloc(raw_variant_ct * sizeof(alt_allele_ct_t));
+	  if (!maj_alleles) {
+	    goto plink2_ret_NOMEM;
+	  }
+	}
+	//   allele_freqs[variant_allele_idxs[variant_uidx] - variant_uidx]
+	// stores the frequency estimate for the reference allele; if there's
+	// more than 1 alt allele, next element stores alt1 freq, etc.  To save
+	// memory, we omit the last alt.
+	uintptr_t total_alt_allele_ct = raw_variant_ct;
+	if (variant_allele_idxs) {
+	  total_alt_allele_ct = variant_allele_idxs[raw_variant_ct] - raw_variant_ct;
+	}
+	if (bigstack_alloc_d(total_alt_allele_ct, &allele_freqs)) {
+	  goto plink2_ret_NOMEM;
+	}
+      }
+      uint32_t x_start = 0;
+      uint32_t x_len = 0;
+      uint32_t hwe_x_probs_needed = 0;
+      int32_t x_code;
+      if ((!(vpos_sortstatus & kfUnsortedVarSplitChr)) && xymt_exists(cip, kChrOffsetX, &x_code)) {
+	const uint32_t x_chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)x_code];
+	x_start = cip->chr_fo_vidx_start[x_chr_fo_idx];
+	const uint32_t x_end = cip->chr_fo_vidx_start[x_chr_fo_idx + 1];
+	x_len = x_end - x_start;
+	if (x_len && ((pcp->command_flags1 & kfCommand1Hardy) || (pcp->hwe_thresh != 1.0)) && (!are_all_bits_zero(variant_include, x_start, x_end))) {
+	  if (nonfounders) {
+	    hwe_x_probs_needed = (sample_ct > nosex_ct);
+	  } else {
+	    for (uint32_t widx = 0; widx < raw_sample_ctl; ++widx) {
+	      if (founder_info[widx] & sex_nm[widx]) {
+		// at least one founder with known sex
+		hwe_x_probs_needed = 1;
+		break;
+	      }
+	    }
+	  }
+	}
+      }
+      bigstack_mark_allele_dosages = g_bigstack_base;
+      const uint32_t first_hap_uidx = get_first_haploid_uidx(cip, vpos_sortstatus);
+      if (are_allele_dosages_needed(pcp->misc_flags, make_plink2_modifier, (allele_freqs != nullptr), pcp->min_allele_dosage, pcp->max_allele_dosage)) {
+	if (bigstack_alloc_ull(variant_allele_idxs? variant_allele_idxs[raw_variant_ct] : (2 * raw_variant_ct), &allele_dosages)) {
+	  goto plink2_ret_NOMEM;
+	}
+      }
+      bigstack_mark_founder_allele_dosages = g_bigstack_base;
+      if (are_founder_allele_dosages_needed(pcp->misc_flags, (allele_freqs != nullptr), pcp->min_allele_dosage, pcp->max_allele_dosage)) {
+	if ((founder_ct == sample_ct) && allele_dosages) {
+	  founder_allele_dosages = allele_dosages;
+	} else {
+	  if (bigstack_alloc_ull(variant_allele_idxs? variant_allele_idxs[raw_variant_ct] : (2 * raw_variant_ct), &founder_allele_dosages)) {
+	    goto plink2_ret_NOMEM;
+	  }
+	}
+      }
+      double* mach_r2_vals = nullptr;
+      if ((pcp->allele_freq_modifier & kfAlleleFreqColMachR2) || (pcp->mach_r2_max != 0.0)) {
+	if (bigstack_alloc_d(raw_variant_ct, &mach_r2_vals)) {
+	  goto plink2_ret_NOMEM;
+	}
+      }
+      
+      unsigned char* bigstack_mark_geno_cts = g_bigstack_base;
+      
+      // no longer includes hethaps by default
+      uint32_t* variant_missing_hc_cts = nullptr;
+      uint32_t* variant_hethap_cts = nullptr;
+      if (are_variant_missing_hc_cts_needed(pcp->command_flags1, pcp->misc_flags, pcp->geno_thresh, pcp->missing_rpt_modifier)) {
+	if (bigstack_alloc_ui(raw_variant_ct, &variant_missing_hc_cts)) {
+	  goto plink2_ret_NOMEM;
+	}
+	if (are_variant_hethap_cts_needed(pcp->command_flags1, pcp->misc_flags, pcp->geno_thresh, pcp->missing_rpt_modifier, first_hap_uidx)) {
+	  // first_hap_uidx offset can save an entire GB...
+	  if (bigstack_alloc_ui(raw_variant_ct - first_hap_uidx, &variant_hethap_cts)) {
+	    goto plink2_ret_NOMEM;
+	  }
+	}
+      }
+      uint32_t* variant_missing_dosage_cts = nullptr;
+      if (are_variant_missing_dosage_cts_needed(pcp->command_flags1, pcp->misc_flags, pcp->geno_thresh, pcp->missing_rpt_modifier)) {
+	if ((!variant_missing_hc_cts) || (pgfi.gflags & kfPgenGlobalDosagePresent)) {
+	  if (bigstack_alloc_ui(raw_variant_ct, &variant_missing_dosage_cts)) {
+	    goto plink2_ret_NOMEM;
+	  }
+	} else {
+	  variant_missing_dosage_cts = variant_missing_hc_cts;
+	}
+      }
+      uint32_t* x_male_geno_cts = nullptr;
+      uint32_t* founder_x_male_geno_cts = nullptr;
+      uint32_t* x_nosex_geno_cts = nullptr;
+      uint32_t* founder_x_nosex_geno_cts = nullptr;
+      // [3n] = homref ct, [3n+1] = het ref-altx total, [3n+2] = nonref diploid
+      //   total
+      // use unfiltered indexes, since we remove more variants later
+      if (are_raw_geno_cts_needed(pcp->command_flags1, pcp->misc_flags, pcp->hwe_thresh)) {
+	if (bigstack_alloc_ui((3 * k1LU) * raw_variant_ct, &raw_geno_cts)) {
+	  goto plink2_ret_NOMEM;
+	}
+	if (x_len) {
+	  if (male_ct) {
+	    if (bigstack_alloc_ui((3 * k1LU) * x_len, &x_male_geno_cts)) {
+	      goto plink2_ret_NOMEM;
+	    }
+	  }
+	  if (nosex_ct && hwe_x_probs_needed && nonfounders) {
+	    if (bigstack_alloc_ui((3 * k1LU) * x_len, &x_nosex_geno_cts)) {
+	      goto plink2_ret_NOMEM;
+	    }
+	  }
+	}
+      }
+      if (are_founder_raw_geno_cts_needed(pcp->command_flags1, pcp->misc_flags, pcp->hwe_thresh)) {
+	if ((founder_ct == sample_ct) && raw_geno_cts) {
+	  founder_raw_geno_cts = raw_geno_cts;
+	  founder_x_male_geno_cts = x_male_geno_cts;
+	} else {
+	  if (bigstack_alloc_ui((3 * k1LU) * raw_variant_ct, &founder_raw_geno_cts)) {
+	    goto plink2_ret_NOMEM;
+	  }
+	  if (x_len && male_ct) {
+	    const uint32_t founder_male_ct = popcount_longs_intersect(founder_info, sex_male, raw_sample_ctl);
+	    if (founder_male_ct) {
+	      if (bigstack_alloc_ui((3 * k1LU) * x_len, &founder_x_male_geno_cts)) {
+		goto plink2_ret_NOMEM;
+	      }
+	    }
+	  }
+	}
+	if (nosex_ct && hwe_x_probs_needed && (!nonfounders)) {
+	  const uint32_t founder_knownsex_ct = popcount_longs_intersect(founder_info, sex_nm, raw_sample_ctl);
+	  if (founder_knownsex_ct < founder_ct) {
+	    if ((founder_ct == sample_ct) && x_nosex_geno_cts) {
+	      // shouldn't be possible for now
+	      assert(0);
+	      // founder_x_nosex_geno_cts = x_nosex_geno_cts;
+	    } else {
+	      if (bigstack_alloc_ui((3 * k1LU) * x_len, &founder_x_nosex_geno_cts)) {
+		goto plink2_ret_NOMEM;
+	      }
+	    }
+	  }
+	}
+      }
+      if (allele_dosages || founder_allele_dosages || variant_missing_hc_cts || variant_missing_dosage_cts || variant_hethap_cts || raw_geno_cts || founder_raw_geno_cts || mach_r2_vals) {
+	// note that --geno depends on different handling of X/Y than --maf.
+
+	// possible todo: "free" these arrays early in some cases
+	// todo: oblig-missing
+	reterr = load_allele_and_geno_counts(sample_include, founder_info, sex_nm, sex_male, variant_include, cip, variant_allele_idxs, raw_sample_ct, sample_ct, founder_ct, male_ct, nosex_ct, raw_variant_ct, variant_ct, first_hap_uidx, pcp->max_thread_ct, pgr_alloc_cacheline_ct, &pgfi, allele_dosages, founder_allele_dosages, variant_missing_hc_cts, (pgfi.gflags & kfPgenGlobalDosagePresent)? variant_missing_dosage_cts : nullptr, variant_hethap_cts, raw_geno_cts, founder_raw_geno_cts, x_male_gen [...]
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+	if (pcp->command_flags1 & kfCommand1GenotypingRate) {
+	  const uint32_t is_dosage = (pcp->misc_flags / kfMiscGenotypingRateDosage) & 1;
+	  report_genotyping_rate(variant_include, cip, is_dosage? variant_missing_dosage_cts : variant_missing_hc_cts, raw_sample_ct, sample_ct, male_ct, variant_ct, is_dosage);
+	  if (!(pcp->command_flags1 & (~kfCommand1GenotypingRate))) {
+	    goto plink2_ret_1;
+	  }
+	}
+      }
+      if (allele_freqs) {
+	const uint32_t maf_succ = (pcp->misc_flags / kfMiscMafSucc) & 1;
+	compute_allele_freqs(variant_include, variant_allele_idxs, nonfounders? allele_dosages : founder_allele_dosages, variant_ct, maf_succ, allele_freqs);
+	if (pcp->read_freq_fname) {
+	  reterr = read_allele_freqs(variant_include, variant_ids, variant_allele_idxs, allele_storage, pcp->read_freq_fname, raw_variant_ct, variant_ct, pgfi.max_alt_allele_ct, max_variant_id_slen, max_allele_slen, maf_succ, pcp->max_thread_ct, allele_freqs);
+	  if (reterr) {
+	    goto plink2_ret_1;
+	  }
+	}
+	if (maj_alleles) {
+	  compute_maj_alleles(variant_include, variant_allele_idxs, allele_freqs, variant_ct, maj_alleles);
+	}
+      } else if (pcp->read_freq_fname) {
+	LOGERRPRINTF("Warning: Ignoring --read-freq since no command would use the frequencies.\n");
+      }
+
+      if (pcp->command_flags1 & kfCommand1AlleleFreq) {
+	reterr = write_allele_freqs(variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, nonfounders? allele_dosages : founder_allele_dosages, mach_r2_vals, pcp->freq_ref_binstr, pcp->freq_alt1_binstr, variant_ct, pgfi.max_alt_allele_ct, max_allele_slen, pcp->allele_freq_modifier, nonfounders, outname, outname_end);
+	if (reterr || (!(pcp->command_flags1 & (~(kfCommand1GenotypingRate | kfCommand1AlleleFreq))))) {
+	  goto plink2_ret_1;
+	}
+      }
+      if (pcp->command_flags1 & kfCommand1GenoCounts) {
+	reterr = write_geno_counts(sample_include, sex_male, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, raw_geno_cts, x_male_geno_cts, raw_sample_ct, sample_ct, male_ct, variant_ct, x_start, max_allele_slen, pcp->geno_counts_modifier, &simple_pgr, outname, outname_end);
+	if (reterr || (!(pcp->command_flags1 & (~(kfCommand1GenotypingRate | kfCommand1AlleleFreq | kfCommand1GenoCounts))))) {
+	  goto plink2_ret_1;
+	}
+      }
+      
+      if (pcp->command_flags1 & kfCommand1MissingReport) {
+	reterr = write_missingness_reports(sample_include, sex_male, sample_ids, sids, pheno_cols, pheno_names, sample_missing_hc_cts, sample_missing_dosage_cts, sample_hethap_cts, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, variant_missing_hc_cts, variant_missing_dosage_cts, variant_hethap_cts, sample_ct, male_ct, max_sample_id_blen, max_sid_blen, pheno_ct, max_pheno_name_blen, variant_ct, max_allele_slen, variant_hethap_cts? first_hap_uidx : 0x7fffffff [...]
+	if (reterr || (!(pcp->command_flags1 & (~(kfCommand1GenotypingRate | kfCommand1AlleleFreq | kfCommand1GenoCounts | kfCommand1MissingReport))))) {
+	  goto plink2_ret_1;
+	}
+      }
+
+      if (pcp->geno_thresh != 1.0) {
+	const uint32_t geno_hh_missing = (uint32_t)(pcp->misc_flags & kfMiscGenoHhMissing);
+	enforce_geno_thresh(cip, (pcp->misc_flags & kfMiscGenoDosage)? variant_missing_dosage_cts : variant_missing_hc_cts, geno_hh_missing? variant_hethap_cts : nullptr, sample_ct, male_ct, geno_hh_missing? first_hap_uidx : 0x7fffffff, pcp->geno_thresh, variant_include, &variant_ct);
+      }
+
+      double* hwe_x_pvals = nullptr;
+      uint32_t hwe_x_ct = 0;
+      if (hwe_x_probs_needed) {
+	hwe_x_ct = count_chr_variants_unsafe(variant_include, cip, cip->xymt_codes[kChrOffsetX]);
+	// hwe_x_ct == 0 possible, if --geno filters out all remaining chrX
+	// variants
+	// also support suppression of --hardy p column (with a gigantic
+	// dataset, maybe it's reasonable to stick to femalep, etc.)
+	if (hwe_x_ct && ((pcp->hwe_thresh != 1.0) || (pcp->hardy_modifier & kfHardyColP))) {
+	  uint32_t hwe_midp;
+	  if (pcp->command_flags1 & kfCommand1Hardy) {
+	    hwe_midp = (pcp->hardy_modifier / kfHardyMidp) & 1;
+	    if (pcp->hwe_thresh != 1.0) {
+	      const uint32_t hwe_midp2 = (pcp->misc_flags / kfMiscHweMidp) & 1;
+	      if (hwe_midp != hwe_midp2) {
+		// could support this efficiently, but why bother...
+		logerrprint("Error: --hardy and --hwe must have identical midp settings when chrX is\npresent.\n");
+		goto plink2_ret_INVALID_CMDLINE;
+	      }
+	    }
+	  } else {
+	    hwe_midp = (pcp->misc_flags / kfMiscHweMidp) & 1;
+	  }
+	  reterr = compute_hwe_x_pvals(variant_include, nonfounders? raw_geno_cts : founder_raw_geno_cts, nonfounders? x_male_geno_cts : founder_x_male_geno_cts, nonfounders? x_nosex_geno_cts : founder_x_nosex_geno_cts, x_start, hwe_x_ct, hwe_midp, pcp->max_thread_ct, &hwe_x_pvals);
+	  if (reterr) {
+	    goto plink2_ret_1;
+	  }
+	}
+      }
+      if (pcp->command_flags1 & kfCommand1Hardy) {
+	reterr = hardy_report(variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, nonfounders? raw_geno_cts : founder_raw_geno_cts, nonfounders? x_male_geno_cts : founder_x_male_geno_cts, nonfounders? x_nosex_geno_cts : founder_x_nosex_geno_cts, hwe_x_pvals, variant_ct, hwe_x_ct, max_allele_slen, pcp->output_min_p, pcp->hardy_modifier, nonfounders, outname, outname_end);
+	if (reterr || (!(pcp->command_flags1 & (~(kfCommand1GenotypingRate | kfCommand1AlleleFreq | kfCommand1GenoCounts | kfCommand1MissingReport | kfCommand1Hardy))))) {
+	  goto plink2_ret_1;
+	}
+      }
+      if (pcp->hwe_thresh != 1.0) {
+	// assumes no filtering between hwe_x_pvals[] computation and here
+	enforce_hwe_thresh(cip, nonfounders? raw_geno_cts : founder_raw_geno_cts, nonfounders? x_male_geno_cts : founder_x_male_geno_cts, nonfounders? x_nosex_geno_cts : founder_x_nosex_geno_cts, hwe_x_pvals, pcp->misc_flags, pcp->hwe_thresh, nonfounders, variant_include, &variant_ct);
+      }
+      // raw_geno_cts/founder_raw_geno_cts/hwe_x_pvals no longer needed
+      bigstack_reset(bigstack_mark_geno_cts);
+
+      if ((pcp->min_maf != 0.0) || (pcp->max_maf != 1.0) || pcp->min_allele_dosage || (pcp->max_allele_dosage != (~0LLU))) {
+	enforce_minor_freq_constraints(variant_allele_idxs, nonfounders? allele_dosages : founder_allele_dosages, allele_freqs, pcp->min_maf, pcp->max_maf, pcp->min_allele_dosage, pcp->max_allele_dosage, variant_include, &variant_ct);
+      }
+
+      if (mach_r2_vals) {
+	if (pcp->mach_r2_max != 0.0) {
+	  enforce_mach_r2_thresh(cip, mach_r2_vals, pcp->mach_r2_min, pcp->mach_r2_max, variant_include, &variant_ct);
+	}
+        bigstack_reset(mach_r2_vals);
+      }
+
+      if ((!variant_ct) && (!(pcp->misc_flags & kfMiscAllowNoVars))) {
+	logerrprint("Error: No variants remaining after main filters.  (Add --allow-no-vars to\npermit this.)\n");
+	goto plink2_ret_INCONSISTENT_INPUT;
+      }
+      if (pcp->filter_flags) {
+	LOGPRINTF("%u variant%s remaining after main filters.\n", variant_ct, (variant_ct == 1)? "" : "s");
+      }
+
+      if (pcp->command_flags1 & (kfCommand1MakeKing | kfCommand1KingCutoff)) {
+	uintptr_t* prev_sample_include = nullptr;
+	const uint32_t prev_sample_ct = sample_ct;
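+	// snapshot the pre-cutoff sample set, so the excluded-ID list can be
+	// derived by subtraction after filtering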
+	if (pcp->king_cutoff != -1) {
+	  if (bigstack_alloc_ul(raw_sample_ctl, &prev_sample_include)) {
+	    goto plink2_ret_NOMEM;
+	  }
+	  memcpy(prev_sample_include, sample_include, raw_sample_ctl * sizeof(intptr_t));
+	}
+	if (king_cutoff_fprefix) {
+	  reterr = king_cutoff_batch(sample_ids, sids, raw_sample_ct, max_sample_id_blen, max_sid_blen, pcp->king_cutoff, sample_include, king_cutoff_fprefix, &sample_ct);
+	} else {
+	  reterr = calc_king(sample_ids, sids, variant_include, cip, raw_sample_ct, max_sample_id_blen, max_sid_blen, raw_variant_ct, variant_ct, pcp->king_cutoff, pcp->king_table_filter, pcp->king_modifier, pcp->parallel_idx, pcp->parallel_tot, pcp->max_thread_ct, &simple_pgr, sample_include, &sample_ct, outname, outname_end);
+	}
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+	if (pcp->king_cutoff != -1) {
+	  strcpy(outname_end, ".king.cutoff.in");
+	  reterr = write_sample_ids(sample_include, sample_ids, sids, outname, sample_ct, max_sample_id_blen, max_sid_blen);
+	  if (reterr) {
+	    goto plink2_ret_1;
+	  }
+	  strcpy(&(outname_end[13]), "out");
+	  bitvec_andnot(sample_include, raw_sample_ctl, prev_sample_include);
+	  const uint32_t removed_sample_ct = prev_sample_ct - sample_ct;
+	  reterr = write_sample_ids(prev_sample_include, sample_ids, sids, outname, removed_sample_ct, max_sample_id_blen, max_sid_blen);
+	  if (reterr) {
+	    goto plink2_ret_1;
+	  }
+	  bigstack_reset(prev_sample_include);
+	  outname_end[13] = '\0';
+	  LOGPRINTFWW("--king-cutoff: Excluded sample ID%s written to %sout, and %u remaining sample ID%s written to %sin .\n", (removed_sample_ct == 1)? "" : "s", outname, sample_ct, (sample_ct == 1)? "" : "s", outname);
+	  update_sample_subsets(sample_include, raw_sample_ct, sample_ct, founder_info, &founder_ct, sex_nm, sex_male, &male_ct, &nosex_ct);
+	}
+      }
+    }
+    double* grm = nullptr;
+    const uint32_t keep_grm = grm_keep_needed(pcp->command_flags1, pcp->pca_flags);
+    if ((pcp->command_flags1 & kfCommand1MakeRel) || keep_grm) {
+      reterr = calc_grm(sample_include, sample_ids, sids, variant_include, cip, variant_allele_idxs, maj_alleles, allele_freqs, raw_sample_ct, sample_ct, max_sample_id_blen, max_sid_blen, raw_variant_ct, variant_ct, pcp->grm_flags, pcp->parallel_idx, pcp->parallel_tot, pcp->max_thread_ct, &simple_pgr, outname, outname_end, keep_grm? (&grm) : nullptr);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+      // don't bother with --rel-cutoff for now, since --king-cutoff seems to
+      // work better...
+      
+      // possible todo: unrelated heritability?
+    }
+#ifndef NOLAPACK
+    if (pcp->command_flags1 & kfCommand1Pca) {
+      // if the GRM is on the stack, this always frees it
+      reterr = calc_pca(sample_include, sample_ids, sids, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, maj_alleles, allele_freqs, raw_sample_ct, sample_ct, max_sample_id_blen, max_sid_blen, raw_variant_ct, variant_ct, max_allele_slen, pcp->pca_ct, pcp->pca_flags, pcp->max_thread_ct, &simple_pgr, grm, outname, outname_end);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+#endif
+    
+    if (pcp->command_flags1 & kfCommand1WriteSnplist) {
+      reterr = write_snplist(variant_include, variant_ids, variant_ct, (pcp->misc_flags / kfMiscWriteSnplistZs) & 1, outname, outname_end);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+
+    if (pcp->command_flags1 & (kfCommand1MakePlink2 | kfCommand1Exportf | kfCommand1WriteCovar)) {
+      // If non-null, this has (2 * raw_variant_ct) entries.  [2n] stores the
+      // index of the new ref allele for variant n, and [2n+1] stores the index
+      // of the new alt1 allele.  (0 = original ref, 1 = original alt1, etc.)
+      // If at all possible, operations which instantiate this
+      // (--ref-allele, --alt1-allele, ...) should only be usable with fileset
+      // creation commands.  no more pass-marker_reverse-to-everything
+      // nonsense.
+      // (Technically, I could also drop support for --export, but that would
+      // force too many real-world jobs to require two plink2 runs instead of
+      // one.)
+      unsigned char* bigstack_end_mark = g_bigstack_end;
+      alt_allele_ct_t* refalt1_select = nullptr;
+      if (pcp->misc_flags & kfMiscMajRef) {
+	// todo: also support updated version of --a2-allele, etc.
+	const uintptr_t refalt1_word_ct = DIV_UP(2 * raw_variant_ct * sizeof(alt_allele_ct_t), kBytesPerWord);
+	uintptr_t* refalt1_select_ul;
+	if (bigstack_end_alloc_ul(refalt1_word_ct, &refalt1_select_ul)) {
+	  goto plink2_ret_NOMEM;
+	}
+	const uintptr_t alt_allele_vals = (uintptr_t)(k1LU << (8 * sizeof(alt_allele_ct_t)));
+	const uintptr_t fill_word = ((~k0LU) / ((alt_allele_vals - 1) * (alt_allele_vals + 1))) * alt_allele_vals;
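+	// Bit trick: with 1-byte alt_allele_ct_t, alt_allele_vals is 256, so
+	// fill_word evaluates to 0x0100010001000100 on 64-bit systems -- every
+	// (ref, alt1) byte pair starts out as the identity mapping {0, 1}.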
+	for (uintptr_t widx = 0; widx < refalt1_word_ct; ++widx) {
+	  refalt1_select_ul[widx] = fill_word;
+	}
+	refalt1_select = (alt_allele_ct_t*)refalt1_select_ul;
+	if (pcp->misc_flags & kfMiscMajRef) {
+	  // possible todo: make this subscribe to maj_alleles[] instead?
+	  // might be pointless due to ALT1 computation, though.
+
+	  // todo: warning if this is specified without file write command, if
+	  // this is ever moved out of the file-write subblock
+	  const uint64_t* main_allele_dosages = nonfounders? allele_dosages : founder_allele_dosages;
+	  const uint32_t not_all_nonref = !(pgfi.gflags & kfPgenGlobalAllNonref);
+	  const uint32_t skip_real_ref = not_all_nonref && (!(pcp->misc_flags & kfMiscMajRefForce));
+	  if (skip_real_ref && (!nonref_flags)) {
+	    logerrprint("Warning: --maj-ref has no effect, since no provisional reference alleles are\npresent.  (Did you forget to add the 'force' modifier?)\n");
+	  } else {
+	    if (not_all_nonref && (!nonref_flags)) {
+	      if (bigstack_end_alloc_ul(raw_variant_ctl, &nonref_flags)) {
+		goto plink2_ret_NOMEM;
+	      }
+	      pgfi.nonref_flags = nonref_flags;
+	      fill_ulong_zero(raw_variant_ctl, nonref_flags);
+	    }
+	    uint32_t variant_uidx = 0;
+	    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+	      next_set_unsafe_ck(variant_include, &variant_uidx);
+	      if (skip_real_ref && IS_SET(nonref_flags, variant_uidx)) {
+		continue;
+	      }
+	      const uint64_t* cur_allele_dosages = &(main_allele_dosages[variant_allele_idxs? variant_allele_idxs[variant_uidx] : (2 * variant_uidx)]);
+	      const uint32_t alt_ct_p1 = variant_allele_idxs? (variant_allele_idxs[variant_uidx + 1] - variant_allele_idxs[variant_uidx]) : 2;
+	      if (alt_ct_p1 == 2) {
+		// optimize common case
+		if (cur_allele_dosages[1] > cur_allele_dosages[0]) {
+		  // assumes alt_allele_ct_t is unsigned char
+		  ((uint16_t*)refalt1_select)[variant_uidx] = 1;
+		  if (nonref_flags) {
+		    SET_BIT(variant_uidx, nonref_flags);
+		  }
+		}
+	      } else {
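+		// multiallelic case: one pass tracking the two highest-dosage
+		// alleles (new_ref_idx, new_alt1_idx)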
+		uint32_t new_ref_idx = (cur_allele_dosages[1] > cur_allele_dosages[0]);
+		uint32_t new_alt1_idx = 1 - new_ref_idx;
+		uint64_t ref_dosage = cur_allele_dosages[new_ref_idx];
+		uint64_t alt1_dosage = cur_allele_dosages[new_alt1_idx];
+		for (uint32_t alt_idx = 2; alt_idx < alt_ct_p1; ++alt_idx) {
+		  const uint64_t cur_alt_dosage = cur_allele_dosages[alt_idx];
+		  if (cur_alt_dosage > alt1_dosage) {
+		    if (cur_alt_dosage > ref_dosage) {
+		      alt1_dosage = ref_dosage;
+		      ref_dosage = cur_alt_dosage;
+		      new_alt1_idx = new_ref_idx;
+		      new_ref_idx = alt_idx;
+		    } else {
+		      alt1_dosage = cur_alt_dosage;
+		      new_alt1_idx = alt_idx;
+		    }
+		  }
+		}
+		if (new_ref_idx || (new_alt1_idx != 1)) {
+		  refalt1_select[2 * variant_uidx] = new_ref_idx;
+		  refalt1_select[2 * variant_uidx + 1] = new_alt1_idx;
+		  if (nonref_flags) {
+		    SET_BIT(variant_uidx, nonref_flags);
+		  }
+		}
+	      }
+	    }
+	  }
+	}
+      }
+      // founder_allele_dosages no longer needed
+      // allele_dosages only needed in trim-alts case
+      // todo: trim-alts does NOT need to be dosage-sensitive when we're
+      //   erasing dosage.  may want a bitarray to handle that case; and once
+      //   that's implemented, make dosage-preserving trim-alts also use that
+      //   (get rid of its allele_dosages[] dependency).
+      if (make_plink2_modifier & kfMakePlink2TrimAlts) {
+	bigstack_reset(bigstack_mark_founder_allele_dosages);
+      } else {
+        bigstack_reset(bigstack_mark_allele_dosages);
+      }
+
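+      // If a reordering is requested, new_sample_idx_to_old[i] will hold the
+      // original index of the sample written in position i.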
+      uint32_t* new_sample_idx_to_old = nullptr;
+      if (pcp->sample_sort_flags & (kfSortNatural | kfSortAscii | kfSortFile)) {
+	if (sample_ct < 2) {
+	  logerrprint("Warning: Skipping --sample-sort since <2 samples are present.\n");
+	} else {
+	  if (pcp->sample_sort_flags & kfSortFile) {
+	    reterr = sample_sort_file_map(sample_include, sample_ids, sids, pcp->sample_sort_fname, raw_sample_ct, sample_ct, max_sample_id_blen, max_sid_blen, pcp->sample_sort_flags & kfSortFileSid, &new_sample_idx_to_old);
+	    if (reterr) {
+	      goto plink2_ret_1;
+	    }
+	  } else {
+	    // probably more efficient to have --make-{bed,pgen,bpgen} perform
+	    // an unfiltered load?  but we should have compute power to spare
+	    // here, so keep the code simpler for now
+	    char* sorted_xidbox_tmp;
+	    uintptr_t max_xid_blen;
+	    reterr = sorted_xidbox_init_alloc(sample_include, sample_ids, sids, sample_ct, max_sample_id_blen, max_sid_blen, sids? kfXidModeFidiidSid : kfXidModeFidiid, (pcp->sample_sort_flags == kfSortNatural), &sorted_xidbox_tmp, &new_sample_idx_to_old, &max_xid_blen);
+	    if (reterr) {
+	      goto plink2_ret_1;
+	    }
+	    bigstack_reset(sorted_xidbox_tmp);
+	  }
+	  LOGPRINTF("--indiv-sort: %u samples reordered.\n", sample_ct);
+	}
+      }
+
+      if (covar_ct && ((pcp->command_flags1 & (kfCommand1Exportf | kfCommand1WriteCovar)) || ((pcp->command_flags1 & kfCommand1MakePlink2) && (make_plink2_modifier & (kfMakeBed | kfMakeFam | kfMakePgen | kfMakePsam))))) {
+	reterr = write_covar(sample_include, sample_ids, sids, paternal_ids, maternal_ids, sex_nm, sex_male, pheno_cols, pheno_names, covar_cols, covar_names, new_sample_idx_to_old, sample_ct, max_sample_id_blen, max_sid_blen, max_paternal_id_blen, max_maternal_id_blen, pheno_ct, max_pheno_name_blen, covar_ct, max_covar_name_blen, pcp->write_covar_flags, outname, outname_end);
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      } else if (pcp->command_flags1 & kfCommand1WriteCovar) {
+	logerrprint("Warning: Skipping --write-covar, since no covariates are loaded.\n");
+      }
+      
+      if (pcp->command_flags1 & kfCommand1MakePlink2) {
+	// todo: unsorted case (--update-chr, etc.)
+	if (vpos_sortstatus & kfUnsortedVarSplitChr) {
+	  logerrprint("Error: --make-bed/--make-{b}pgen variant sorting is under development.\n");
+	  reterr = kPglRetNotYetSupported;
+	  goto plink2_ret_1;
+	}
+	if (vpos_sortstatus & kfUnsortedVarBp) {
+	  logerrprint("Warning: --make-bed/--make-{b}pgen variant sorting is not implemented yet.\n");
+	}
+	reterr = make_plink2_no_vsort(xheader, sample_include, sample_ids, sids, paternal_ids, maternal_ids, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, allele_dosages, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, info_reload_slen? pvarname : nullptr, variant_cms, xheader_blen, xheader_info_pr, raw_sample_ct, sample_ct, max_sam [...]
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+        bigstack_reset(bigstack_mark_allele_dosages);
+      }
+
+      if (pcp->command_flags1 & kfCommand1Exportf) {
+	reterr = exportf(xheader, sample_include, sample_ids, sids, paternal_ids, maternal_ids, sex_nm, sex_male, pheno_cols, pheno_names, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, info_reload_slen? pvarname : nullptr, variant_cms, xheader_blen, xheader_info_pr, raw_sample_ct, sample_ct, max_sample_id_blen, max_sid_blen, max_paternal_id_blen, max [...]
+	if (reterr) {
+	  goto plink2_ret_1;
+	}
+      }
+      bigstack_end_reset(bigstack_end_mark);
+    }
+    bigstack_reset(bigstack_mark_allele_dosages);
+    
+    if (pcp->command_flags1 & kfCommand1LdPrune) {
+      if ((pcp->ld_info.prune_modifier & kfLdPruneWindowBp) && (vpos_sortstatus & kfUnsortedVarBp)) {
+	logerrprint("Error: When the window size is in kb units, LD-based pruning requires a sorted\n.pvar/.bim.  Retry this command after using e.g. plink 1.9 --make-bed to sort\nyour data.\n");
+	goto plink2_ret_INCONSISTENT_INPUT;
+      }
+      reterr = ld_prune(variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, maj_alleles, allele_freqs, founder_info, sex_male, &(pcp->ld_info), raw_variant_ct, variant_ct, raw_sample_ct, founder_ct, pcp->max_thread_ct, &simple_pgr, outname, outname_end);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+
+    if (pcp->command_flags1 & kfCommand1Score) {
+      reterr = score_report(sample_include, sample_ids, sids, sex_male, pheno_cols, pheno_names, variant_include, cip, variant_ids, variant_allele_idxs, allele_storage, allele_freqs, &(pcp->score_info), sample_ct, max_sample_id_blen, max_sid_blen, pheno_ct, max_pheno_name_blen, raw_variant_ct, variant_ct, max_variant_id_slen, pcp->xchr_model, pcp->max_thread_ct, &simple_pgr, outname, outname_end);
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+    // eventually check for nonzero pheno_ct here?
+    
+    if (pcp->command_flags1 & kfCommand1Glm) {
+      reterr = glm_main(sample_include, sample_ids, sids, sex_nm, sex_male, pheno_cols, pheno_names, covar_cols, covar_names, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, &(pcp->glm_info), &(pcp->adjust_info), &(pcp->aperm), pcp->glm_local_covar_fname, pcp->glm_local_pvar_fname, pcp->glm_local_psam_fname, raw_sample_ct, sample_ct, max_sample_id_blen, max_sid_blen, pheno_ct, max_pheno_name_blen, covar_ct, max_covar_name_blen, raw_variant_ct, variant [...]
+      if (reterr) {
+	goto plink2_ret_1;
+      }
+    }
+  }
+  while (0) {
+  plink2_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  plink2_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  plink2_ret_INVALID_CMDLINE:
+    reterr = kPglRetInvalidCmdline;
+    break;
+  plink2_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ plink2_ret_1:
+  cleanup_pheno_cols(covar_ct, covar_cols);
+  cleanup_pheno_cols(pheno_ct, pheno_cols);
+  free_cond(covar_names);
+  free_cond(pheno_names);
+  if (pgr_cleanup(&simple_pgr) && (!reterr)) {
+    reterr = kPglRetReadFail;
+  }
+  if (pgfi_cleanup(&pgfi) && (!reterr)) {
+    reterr = kPglRetReadFail;
+  }
+  // no bigstack_reset() needed?
+  return reterr;
+}
+
+pglerr_t zst_decompress(const char* in_fname, const char* out_fname) {
+  // Since this needs to be able to dump the decompressed data and nothing but
+  // the decompressed data to stdout, we have to duplicate a bit of
+  // plink2_common code and strip out printing/logging.
+
+  // Strictly speaking, this can decompress gzipped files too, but that's not
+  // its purpose.
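+  // (This assumes gzopen() resolves to the bundled zstd zlibWrapper version,
+  // which transparently reads both gzip and zstd streams.)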
+  gzFile gz_infile = gzopen(in_fname, FOPEN_RB);
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if (!gz_infile) {
+      fprintf(stderr, g_errstr_fopen, in_fname);
+      goto zst_decompress_ret_OPEN_FAIL;
+    }
+    if (gzbuffer(gz_infile, 131072)) {
+      goto zst_decompress_ret_NOMEM;
+    }
+    if (out_fname) {
+      outfile = fopen(out_fname, FOPEN_WB);
+      if (!outfile) {
+	fprintf(stderr, g_errstr_fopen, out_fname);
+	goto zst_decompress_ret_OPEN_FAIL;
+      }
+    } else {
+      outfile = stdout;
+    }
+    unsigned char* buf = (unsigned char*)g_textbuf;
+    while (1) {
+      const int32_t bytes_read = gzread(gz_infile, buf, kTextbufMainSize);
+      if (bytes_read <= 0) {
+	if (!bytes_read) {
+	  break;
+	}
+	goto zst_decompress_ret_READ_FAIL;
+      }
+      if (!fwrite(buf, bytes_read, 1, outfile)) {
+	goto zst_decompress_ret_WRITE_FAIL;
+      }
+      fflush(outfile);
+    }
+    if (gzclose_null(&gz_infile)) {
+      goto zst_decompress_ret_READ_FAIL;
+    }
+    if (out_fname) {
+      if (fclose_null(&outfile)) {
+	goto zst_decompress_ret_WRITE_FAIL;
+      }
+    }
+  }
+  // we exit from main() immediately, so we need to print nomem/read/write
+  // error messages here
+  while (0) {
+  zst_decompress_ret_NOMEM:
+    // in this exceedingly unlikely case, the --memory flag doesn't help, so
+    // print a different message
+    fputs("Error: Out of memory.\n", stderr);
+    reterr = kPglRetNomem;
+    break;
+  zst_decompress_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  zst_decompress_ret_READ_FAIL:
+    fputs(errstr_read, stderr);
+    reterr = kPglRetReadFail;
+    break;
+  zst_decompress_ret_WRITE_FAIL:
+    fputs(errstr_write, stderr);
+    reterr = kPglRetWriteFail;
+    break;
+  }
+  if (out_fname) {
+    fclose_cond(outfile);
+  }
+  gzclose_cond(gz_infile);
+  return reterr;
+}
+
+// useful when there's e.g. a filename and an optional modifier, and we want to
+// permit either parameter ordering
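+// (on success, *other_idx_ptr is left pointing at the non-modifier argument)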
+boolerr_t check_extra_param(char** argv, const char* permitted_modif, uint32_t* other_idx_ptr) {
+  const uint32_t idx_base = *other_idx_ptr;
+  if (!strcmp(argv[idx_base], permitted_modif)) {
+    *other_idx_ptr = idx_base + 1;
+  } else if (strcmp(argv[idx_base + 1], permitted_modif)) {
+    LOGERRPRINTF("Error: Invalid %s parameter sequence.\n", argv[0]);
+    return 1;
+  }
+  return 0;
+}
+
+char extract_char_param(const char* ss) {
+  // maps c, 'c', and "c" to c, and anything else to the null char.  This is
+  // intended to support e.g. always using '#' to designate a # parameter
+  // without worrying about differences between shells.
+  const char cc = ss[0];
+  if (((cc == '\'') || (cc == '"')) && (ss[1]) && (ss[2] == cc) && (!ss[3])) {
+    return ss[1];
+  }
+  if (cc && (!ss[1])) {
+    return cc;
+  }
+  return '\0';
+}
+
+pglerr_t cmdline_alloc_string(const char* source, const char* flag_name, uint32_t max_slen, char** sbuf_ptr) {
+  const uint32_t slen = strlen(source);
+  if (slen > max_slen) {
+    LOGERRPRINTF("Error: %s parameter too long.\n", flag_name);
+    return kPglRetInvalidCmdline;
+  }
+  const uint32_t blen = slen + 1;
+  if (pgl_malloc(blen, sbuf_ptr)) {
+    return kPglRetNomem;
+  }
+  memcpy(*sbuf_ptr, source, blen);
+  return kPglRetSuccess;
+}
+
+pglerr_t alloc_fname(const char* source, const char* flagname_p, uint32_t extra_size, char** fnbuf_ptr) {
+  const uint32_t blen = strlen(source) + 1;
+  if (blen > (kPglFnamesize - extra_size)) {
+    LOGERRPRINTF("Error: --%s filename too long.\n", flagname_p);
+    return kPglRetOpenFail;
+  }
+  if (pgl_malloc(blen + extra_size, fnbuf_ptr)) {
+    return kPglRetNomem;
+  }
+  memcpy(*fnbuf_ptr, source, blen);
+  return kPglRetSuccess;
+}
+
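+// Concatenates the source strings into a single newly-allocated buffer, each
+// string '\0'-terminated, with an extra trailing '\0' marking the end of the
+// list.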
+pglerr_t alloc_and_flatten(char** sources, uint32_t param_ct, uint32_t max_blen, char** flattened_buf_ptr) {
+  uintptr_t tot_blen = 1;
+  for (uint32_t param_idx = 0; param_idx < param_ct; ++param_idx) {
+    const uint32_t cur_blen = 1 + strlen(sources[param_idx]);
+    if (cur_blen > max_blen) {
+      return kPglRetInvalidCmdline;
+    }
+    tot_blen += cur_blen;
+  }
+  char* buf_iter;
+  if (pgl_malloc(tot_blen, &buf_iter)) {
+    return kPglRetNomem;
+  }
+  *flattened_buf_ptr = buf_iter;
+  for (uint32_t param_idx = 0; param_idx < param_ct; ++param_idx) {
+    buf_iter = strcpyax(buf_iter, sources[param_idx], '\0');
+  }
+  *buf_iter = '\0';
+  return kPglRetSuccess;
+}
+
+
+// may move these to plink2_common or plink2_filter
+char* parse_next_binary_op(char* expr_str, uint32_t expr_slen, char** op_start_ptr, cmp_binary_op_t* binary_op_ptr) {
+  // !=, <>: kCmpOperatorNoteq
+  // <: kCmpOperatorLe
+  // <=: kCmpOperatorLeq
+  // =, ==: kCmpOperatorEq
+  // >=: kCmpOperatorGeq
+  // >: kCmpOperatorGe
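+  // Returns a pointer to the character just past the operator, or nullptr if
+  // no operator is present; *op_start_ptr is set to the operator's first
+  // character.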
+  char* next_eq = (char*)memchr(expr_str, '=', expr_slen);
+  char* next_lt = (char*)memchr(expr_str, '<', expr_slen);
+  char* next_gt = (char*)memchr(expr_str, '>', expr_slen);
+  if (!next_eq) {
+    if (!next_lt) {
+      if (!next_gt) {
+	return nullptr;
+      }
+      *op_start_ptr = next_gt;
+      *binary_op_ptr = kCmpOperatorGe;
+      return &(next_gt[1]);
+    }
+    if (next_gt == (&(next_lt[1]))) {
+      *op_start_ptr = next_lt;
+      *binary_op_ptr = kCmpOperatorNoteq;
+      return &(next_lt[2]);
+    }
+    if ((!next_gt) || (next_gt > next_lt)) {
+      *op_start_ptr = next_lt;
+      *binary_op_ptr = kCmpOperatorLe;
+      return &(next_lt[1]);
+    }
+    *op_start_ptr = next_gt;
+    *binary_op_ptr = kCmpOperatorGe;
+    return &(next_gt[1]);
+  }
+  if ((!next_lt) || (next_lt > next_eq)) {
+    if ((!next_gt) || (next_gt > next_eq)) {
+      if ((next_eq != expr_str) && (next_eq[-1] == '!')) {
+	*op_start_ptr = &(next_eq[-1]);
+	*binary_op_ptr = kCmpOperatorNoteq;
+	return &(next_eq[1]);
+      }
+      *op_start_ptr = next_eq;
+      *binary_op_ptr = kCmpOperatorEq;
+      return (next_eq[1] == '=')? (&(next_eq[2])) : (&(next_eq[1]));
+    }
+    *op_start_ptr = next_gt;
+    if (next_eq == (&(next_gt[1]))) {
+      *binary_op_ptr = kCmpOperatorGeq;
+      return &(next_gt[2]);
+    }
+    *binary_op_ptr = kCmpOperatorGe;
+    return &(next_gt[1]);
+  }
+  if (next_gt == (&(next_lt[1]))) {
+    *op_start_ptr = next_lt;
+    *binary_op_ptr = kCmpOperatorNoteq;
+    return &(next_lt[2]);
+  }
+  if ((!next_gt) || (next_gt > next_lt)) {
+    *op_start_ptr = next_lt;
+    if (next_eq == (&(next_lt[1]))) {
+      *binary_op_ptr = kCmpOperatorLeq;
+      return &(next_lt[2]);
+    }
+    *binary_op_ptr = kCmpOperatorLe;
+    return &(next_lt[1]);
+  }
+  *op_start_ptr = next_gt;
+  if (next_eq == (&(next_gt[1]))) {
+    *binary_op_ptr = kCmpOperatorGeq;
+    return &(next_gt[2]);
+  }
+  *binary_op_ptr = kCmpOperatorGe;
+  return &(next_gt[1]);
+}
+
+pglerr_t validate_and_alloc_cmp_expr(char** sources, const char* flag_name, uint32_t param_ct, cmp_expr_t* cmp_expr_ptr) {
+  // restrict to [pheno/covar name] [operator] [pheno val] for now.  could
+  // support or/and, parentheses, etc. later.
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if ((param_ct != 1) && (param_ct != 3)) {
+      goto validate_and_alloc_cmp_expr_ret_INVALID_EXPR_GENERIC;
+    }
+    char* pheno_name_start = sources[0];
+    char* pheno_val_start;
+    uint32_t pheno_name_slen;
+    uint32_t pheno_val_slen;
+    if (param_ct == 3) {
+      pheno_name_slen = strlen(pheno_name_start);
+      char* op_str = sources[1];
+      uint32_t op_slen = strlen(op_str);
+      // ok to have single/double quotes around operator
+      if (op_slen > 2) {
+	const char cc = op_str[0];
+	if (((cc == '\'') || (cc == '"')) && (op_str[op_slen - 1] == cc)) {
+	  ++op_str;
+	  op_slen -= 2;
+	}
+      }
+      char* op_start;
+      char* op_end = parse_next_binary_op(op_str, op_slen, &op_start, &cmp_expr_ptr->binary_op);
+      if ((!op_end) || (*op_end) || (op_start != op_str)) {
+	goto validate_and_alloc_cmp_expr_ret_INVALID_EXPR_GENERIC;
+      }
+      pheno_val_start = sources[2];
+      pheno_val_slen = strlen(pheno_val_start);
+    } else {
+      // permit param_ct == 1 as long as tokens are unambiguous
+      uint32_t expr_slen = strlen(pheno_name_start);
+      char* op_start;
+      pheno_val_start = parse_next_binary_op(pheno_name_start, expr_slen, &op_start, &cmp_expr_ptr->binary_op);
+      if ((!pheno_val_start) || (!(*pheno_val_start)) || (op_start == pheno_name_start)) {
+        goto validate_and_alloc_cmp_expr_ret_INVALID_EXPR_GENERIC;
+      }
+      pheno_name_slen = (uintptr_t)(op_start - pheno_name_start);
+      pheno_val_slen = expr_slen - ((uintptr_t)(pheno_val_start - pheno_name_start));
+    }
+    if ((pheno_name_slen > kMaxIdSlen) || (pheno_val_slen > kMaxIdSlen)) {
+      LOGERRPRINTF("Error: ID too long in %s expression.\n", flag_name);
+      goto validate_and_alloc_cmp_expr_ret_INVALID_CMDLINE;
+    }
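+    // single allocation, laid out as [pheno/covar name] '\0' [value] '\0'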
+    char* new_pheno_name_buf;
+    if (pgl_malloc(2 + pheno_name_slen + pheno_val_slen, &new_pheno_name_buf)) {
+      goto validate_and_alloc_cmp_expr_ret_NOMEM;
+    }
+    memcpyx(new_pheno_name_buf, pheno_name_start, pheno_name_slen, '\0');
+    // pheno_val_start guaranteed to be null-terminated for now
+    memcpy(&(new_pheno_name_buf[pheno_name_slen + 1]), pheno_val_start, pheno_val_slen + 1);
+    cmp_expr_ptr->pheno_name = new_pheno_name_buf;
+  }
+  while (0) {
+  validate_and_alloc_cmp_expr_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  validate_and_alloc_cmp_expr_ret_INVALID_EXPR_GENERIC:
+    LOGERRPRINTF("Error: Invalid %s expression.\n", flag_name);
+  validate_and_alloc_cmp_expr_ret_INVALID_CMDLINE:
+    reterr = kPglRetInvalidCmdline;
+    break;
+  }
+  return reterr;
+}
+
+/*
+pglerr_t alloc_and_flatten_comma_delim(char** sources, uint32_t param_ct, char** flattened_buf_ptr) {
+  uint32_t totlen = 1;
+  for (uint32_t param_idx = 0; param_idx < param_ct; ++param_idx) {
+    const char* cur_param_iter = sources[param_idx];
+    while (1) {
+      while (*cur_param_iter == ',') {
+	++cur_param_iter;
+      }
+      const char* cur_token_end = strchr(cur_param_iter, ',');
+      if (!cur_token_end) {
+	break;
+      }
+      totlen += 1 + (uintptr_t)(cur_token_end - cur_param_iter);
+      cur_param_iter = &(cur_token_end[1]);
+    }
+    totlen += 1 + strlen(cur_param_iter);
+  }
+  char* write_iter;
+  if (pgl_malloc(totlen, &write_iter)) {
+    return kPglRetNomem;
+  }
+  *flattened_buf_ptr = write_iter;
+  for (uint32_t param_idx = 0; param_idx < param_ct; ++param_idx) {
+    const char* cur_param_iter = sources[param_idx];
+    while (1) {
+      while (*cur_param_iter == ',') {
+	++cur_param_iter;
+      }
+      const char* cur_token_end = strchr(cur_param_iter, ',');
+      if (!cur_token_end) {
+	break;
+      }
+      write_iter = memcpyax(write_iter, cur_param_iter, (uintptr_t)(cur_token_end - cur_param_iter), '\0');
+      cur_param_iter = &(cur_token_end[1]);
+    }
+    write_iter = strcpyax(write_iter, cur_param_iter, '\0');
+  }
+  *write_iter = '\0';
+  return kPglRetSuccess;
+}
+*/
+
+void invalid_arg(const char* cur_arg) {
+  LOGPREPRINTFWW("Error: Unrecognized flag ('%s').\n", cur_arg);
+}
+
+void print_ver() {
+  fputs(ver_str, stdout);
+  fputs(ver_str2, stdout);
+}
+
+pglerr_t rerun(uint32_t rerun_argv_pos, uint32_t rerun_parameter_present, int32_t* argc_ptr, uint32_t* first_arg_idx_ptr, char*** argv_ptr, char*** subst_argv_ptr, char** rerun_buf_ptr) {
+  // caller is responsible for freeing rerun_buf
+  char** subst_argv2 = nullptr;
+  uintptr_t line_idx = 1;
+  pglerr_t reterr = kPglRetSuccess;
+  gzFile gz_rerunfile;
+  {
+    char** argv = *argv_ptr;
+    gz_rerunfile = gzopen(rerun_parameter_present? argv[rerun_argv_pos + 1] : (PROG_NAME_STR ".log"), FOPEN_RB);
+    if (!gz_rerunfile) {
+      goto rerun_ret_OPEN_FAIL;
+    }
+    char* textbuf = g_textbuf;
+    textbuf[kMaxMediumLine - 1] = ' ';
+    if (!gzgets(gz_rerunfile, textbuf, kMaxMediumLine)) {
+      print_ver();
+      fputs("Error: Empty log file for --rerun.\n", stderr);
+      goto rerun_ret_MALFORMED_INPUT;
+    }
+    if (!textbuf[kMaxMediumLine - 1]) {
+      goto rerun_ret_LONG_LINE;
+    }
+    if (!gzgets(gz_rerunfile, textbuf, kMaxMediumLine)) {
+      print_ver();
+      fputs("Error: Only one line in --rerun log file.\n", stderr);
+      goto rerun_ret_MALFORMED_INPUT;
+    }
+    line_idx++;
+    if (!textbuf[kMaxMediumLine - 1]) {
+      goto rerun_ret_LONG_LINE;
+    }
+    // don't bother supporting "xx arguments: --aa bb --cc --dd" format
+    while (memcmp(textbuf, "Options in effect:", 18) || (textbuf[18] >= ' ')) {
+      line_idx++;
+      if (!gzgets(gz_rerunfile, textbuf, kMaxMediumLine)) {
+	print_ver();
+	fputs("Error: Invalid log file for --rerun.\n", stderr);
+	goto rerun_ret_MALFORMED_INPUT;
+      }
+    }
+    char* all_args_write_iter = textbuf;
+    char* textbuf_limit = &(textbuf[kMaxMediumLine]);
+    uint32_t loaded_arg_ct = 0;
+    // We load each of the option lines in sequence into textbuf, always
+    // overwriting the previous line's newline.  (Note that textbuf[] has
+    // size > 2 * kMaxMediumLine; this lets us avoid additional
+    // dynamic memory allocation as long as we impose the constraint that all
+    // lines combined add up to less than kMaxMediumLine.)
+    while (1) {
+      all_args_write_iter[kMaxMediumLine - 1] = ' ';
+      if (!gzgets(gz_rerunfile, all_args_write_iter, kMaxMediumLine)) {
+	break;
+      }
+      line_idx++;
+      if (!all_args_write_iter[kMaxMediumLine - 1]) {
+	goto rerun_ret_LONG_LINE;
+      }
+      char* arg_iter = skip_initial_spaces(all_args_write_iter);
+      if (is_eoln_kns(*arg_iter)) {
+	*all_args_write_iter = '\0';
+	break;
+      }
+      char* token_end;
+      do {
+	token_end = token_endnn(arg_iter);
+	loaded_arg_ct++;
+	arg_iter = skip_initial_spaces(token_end);
+      } while (!is_eoln_kns(*arg_iter));
+      all_args_write_iter = token_end;
+      if (all_args_write_iter >= textbuf_limit) {
+	print_ver();
+	fputs("Error: --rerun argument sequence too long.\n", stderr);
+	goto rerun_ret_MALFORMED_INPUT;
+      }
+    }
+    gzclose_null(&gz_rerunfile);
+    const uint32_t line_byte_ct = 1 + (uintptr_t)(all_args_write_iter - textbuf);
+    char* rerun_buf;
+    if (pgl_malloc(line_byte_ct, &rerun_buf)) {
+      goto rerun_ret_NOMEM;
+    }
+    *rerun_buf_ptr = rerun_buf;
+    memcpy(rerun_buf, textbuf, line_byte_ct);
+    const uint32_t argc = (uint32_t)(*argc_ptr);
+    const uint32_t first_arg_idx = *first_arg_idx_ptr;
+    char* rerun_first_token = skip_initial_spaces(rerun_buf);
+    char* arg_iter = rerun_first_token;
+    // now use textbuf as a lame bitfield
+    memset(textbuf, 1, loaded_arg_ct);
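+    // textbuf[i] == 1 iff loaded argument i is still live; entries overridden
+    // by explicit command-line flags are zeroed below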
+    uint32_t loaded_arg_idx = 0;
+    uint32_t duplicate_arg_ct = 0;
+    do {
+      if (no_more_tokens_kns(arg_iter)) {
+	print_ver();
+	fputs("Error: Line 2 of --rerun log file has fewer tokens than expected.\n", stderr);
+	goto rerun_ret_MALFORMED_INPUT;
+      }
+      char* flagname_p = is_flag_start(arg_iter);
+      if (flagname_p) {
+	const uint32_t slen = strlen_se(flagname_p);
+	uint32_t cmdline_arg_idx = first_arg_idx;
+	for (; cmdline_arg_idx < argc; cmdline_arg_idx++) {
+	  char* later_flagname_p = is_flag_start(argv[cmdline_arg_idx]);
+	  if (later_flagname_p) {
+	    const uint32_t slen2 = strlen(later_flagname_p);
+	    if ((slen == slen2) && (!memcmp(flagname_p, later_flagname_p, slen))) {
+	      cmdline_arg_idx = 0xffffffffU;
+	      break;
+	    }
+	  }
+	}
+	if (cmdline_arg_idx == 0xffffffffU) {
+	  // matching flag, override --rerun
+	  do {
+	    duplicate_arg_ct++;
+	    textbuf[loaded_arg_idx++] = 0;
+	    if (loaded_arg_idx == loaded_arg_ct) {
+	      break;
+	    }
+	    arg_iter = next_token(arg_iter);
+	  } while (!is_flag(arg_iter));
+	} else {
+	  loaded_arg_idx++;
+	  arg_iter = next_token(arg_iter);
+	}
+      } else {
+	loaded_arg_idx++;
+	arg_iter = next_token(arg_iter);
+      }
+    } while (loaded_arg_idx < loaded_arg_ct);
+    if (pgl_malloc((argc + loaded_arg_ct - duplicate_arg_ct - rerun_parameter_present - 1 - first_arg_idx) * sizeof(intptr_t), &subst_argv2)) {
+      goto rerun_ret_NOMEM;
+    }
+    uint32_t new_arg_idx = rerun_argv_pos - first_arg_idx;
+    memcpy(subst_argv2, &(argv[first_arg_idx]), new_arg_idx * sizeof(intptr_t));
+    arg_iter = rerun_first_token;
+    for (loaded_arg_idx = 0; loaded_arg_idx < loaded_arg_ct; ++loaded_arg_idx) {
+      arg_iter = skip_initial_spaces(arg_iter);
+      char* token_end = token_endnn(arg_iter);
+      if (textbuf[loaded_arg_idx]) {
+	subst_argv2[new_arg_idx++] = arg_iter;
+	*token_end = '\0';
+      }
+      arg_iter = &(token_end[1]);
+    }
+    const uint32_t final_copy_start_idx = rerun_argv_pos + rerun_parameter_present + 1;
+    memcpy(&(subst_argv2[new_arg_idx]), &(argv[final_copy_start_idx]), (argc - final_copy_start_idx) * sizeof(intptr_t));
+    *first_arg_idx_ptr = 0;
+    *argc_ptr = new_arg_idx + argc - final_copy_start_idx;
+    if (*subst_argv_ptr) {
+      free(*subst_argv_ptr);
+    }
+    *subst_argv_ptr = subst_argv2;
+    *argv_ptr = subst_argv2;
+    subst_argv2 = nullptr;
+  }
+  while (0) {
+  rerun_ret_NOMEM:
+    print_ver();
+    reterr = kPglRetNomem;
+    break;
+  rerun_ret_OPEN_FAIL:
+    print_ver();
+    reterr = kPglRetOpenFail;
+    break;
+  rerun_ret_LONG_LINE:
+    print_ver();
+    fprintf(stderr, "Error: Line %" PRIuPTR " of --rerun log file is pathologically long.\n", line_idx);
+  rerun_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+  free_cond(subst_argv2);
+  gzclose_cond(gz_rerunfile);
+  return reterr;
+}
+
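+// Returns 1 iff the command line restricts the run to exactly one
+// chromosome: either a single code is set in chr_mask, or exactly one
+// nonstandard chromosome name is on the include/exclude name stack.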
+uint32_t cmdline_single_chr(const chr_info_t* cip, misc_flags_t misc_flags) {
+  if ((misc_flags & (kfMiscAutosomeOnly | kfMiscAutosomePar)) || (!cip->is_include_stack)) {
+    return 0;
+  }
+  const uint32_t main_chr_ct = popcount_longs(cip->chr_mask, kChrExcludeWords) + popcount_long(cip->chr_mask[kChrMaskWords - 1]);
+  if (main_chr_ct > 1) {
+    return 0;
+  }
+  if (main_chr_ct == 1) {
+    return (cip->incl_excl_name_stack == nullptr);
+  }
+  return cip->incl_excl_name_stack && (!(cip->incl_excl_name_stack->next));
+}
+
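+// Column set descriptor syntax, in brief: either an explicit replacement
+// list ("cols=chrom,pos,ref") or a +/- edit of the default set
+// ("cols=+alt,-ref"); mixing the two forms is rejected below.  (The IDs
+// shown here are illustrative; each flag supplies its own supported_ids.)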
+pglerr_t parse_col_descriptor(const char* col_descriptor_iter, const char* supported_ids, const char* cur_flag_name, uint32_t first_col_shifted, uint32_t default_cols_mask, uint32_t prohibit_empty, void* result_ptr) {
+  // col_descriptor is usually a pointer to argv[...][5] (first five characters
+  // are "cols=").  supported_ids is a multistr.
+  // may need to switch first_col_shifted/default_cols_mask/result to uint64_t
+  pglerr_t reterr = kPglRetSuccess;
+  uint32_t* id_map = nullptr;
+  {
+    uint32_t max_id_blen = 0;
+    uint32_t id_ct = 0;
+
+    // keep a separate const iterator, since strchr() on a const char* is
+    // prototyped to return a plain char*
+    const char* supported_ids_iter = supported_ids;
+
+    // We could precompute this sorted index and avoid the dynamic
+    // allocations/deallocations, but this is cheap enough that it's better to
+    // keep the function easy to extend.
+    do {
+      const char* tok_end = (const char*)rawmemchr(supported_ids_iter, '\0');
+      const uint32_t slen = (uintptr_t)(tok_end - supported_ids_iter);
+      if (slen >= max_id_blen) {
+        max_id_blen = slen + 1;
+      }
+      ++id_ct;
+      supported_ids_iter = &(tok_end[1]);
+    } while (*supported_ids_iter);
+    // max_id_blen + 4 extra bytes at the end, to support a "maybe" search
+    // (yes, this can also be precomputed)
+    if (pgl_malloc((max_id_blen + 4) * (id_ct + 1), &id_map)) {
+      goto parse_col_descriptor_ret_NOMEM;
+    }
+    char* sorted_ids = (char*)(&(id_map[id_ct]));
+    supported_ids_iter = (const char*)supported_ids;
+    for (uint32_t id_idx = 0; id_idx < id_ct; ++id_idx) {
+      const uint32_t blen = strlen(supported_ids_iter) + 1;
+      memcpy(&(sorted_ids[id_idx * max_id_blen]), supported_ids_iter, blen);
+      id_map[id_idx] = id_idx;
+      supported_ids_iter = &(supported_ids_iter[blen]);
+    }
+    if (sort_strbox_indexed_malloc(id_ct, max_id_blen, sorted_ids, id_map)) {
+      goto parse_col_descriptor_ret_NOMEM;
+    }
+    uint32_t result = *((uint32_t*)result_ptr);
+    // might not want to bother splitting this into two loops
+    if ((col_descriptor_iter[0] == '+') || (col_descriptor_iter[0] == '-')) {
+      result |= default_cols_mask;
+      char* maybebuf = &(sorted_ids[max_id_blen * id_ct]);
+      memcpy(maybebuf, "maybe", 5);
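+      // maybebuf holds "maybe" followed by the current ID, so that e.g.
+      // "-sid" can also cancel a default "maybesid" column; see the special
+      // case below.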
+      while (1) {
+	const char* id_start = &(col_descriptor_iter[1]);
+	const char* tok_end = strchr(id_start, ',');
+	uint32_t slen;
+	if (!tok_end) {
+	  slen = strlen(id_start);
+	} else {
+	  slen = (uintptr_t)(tok_end - id_start);
+	}
+	int32_t alpha_idx = bsearch_str(id_start, sorted_ids, slen, max_id_blen, id_ct);
+	if (alpha_idx == -1) {
+	  char* write_iter = strcpya(g_logbuf, "Error: Unrecognized ID '");
+	  write_iter = memcpya(write_iter, id_start, slen);
+	  write_iter = strcpya(write_iter, "' in --");
+	  write_iter = strcpya(write_iter, cur_flag_name);
+	  write_iter = strcpya(write_iter, " column set descriptor.\n");
+	  goto parse_col_descriptor_ret_INVALID_CMDLINE_WW;
+	}
+	uint32_t shift = id_map[(uint32_t)alpha_idx];
+	if (col_descriptor_iter[0] == '+') {
+	  result |= first_col_shifted << shift;
+	} else {
+	  if (result & (first_col_shifted << shift)) {
+	    result -= first_col_shifted << shift;
+	  } else if (slen + 5 < max_id_blen) {
+	    // special case: if default column set includes e.g. "maybesid",
+	    // and user types "-sid", that should work
+	    memcpy(&(maybebuf[5]), id_start, slen);
+	    alpha_idx = bsearch_str(maybebuf, sorted_ids, slen + 5, max_id_blen, id_ct);
+	    if (alpha_idx != -1) {
+	      shift = id_map[(uint32_t)alpha_idx];
+	      result &= ~(first_col_shifted << shift);
+	    }
+	  }
+	}
+	if (!tok_end) {
+	  break;
+	}
+	col_descriptor_iter = &(tok_end[1]);
+	if ((col_descriptor_iter[0] != '+') && (col_descriptor_iter[0] != '-')) {
+	  goto parse_col_descriptor_ret_MIXED_SIGN;
+	}
+      }
+    } else if (*col_descriptor_iter) {
+      while (1) {
+	const char* tok_end = strchr(col_descriptor_iter, ',');
+	uint32_t slen;
+	if (!tok_end) {
+	  slen = strlen(col_descriptor_iter);
+	} else {
+	  slen = (uintptr_t)(tok_end - col_descriptor_iter);
+	}
+	int32_t alpha_idx = bsearch_str(col_descriptor_iter, sorted_ids, slen, max_id_blen, id_ct);
+	if (alpha_idx == -1) {
+	  char* write_iter = strcpya(g_logbuf, "Error: Unrecognized ID '");
+	  write_iter = memcpya(write_iter, col_descriptor_iter, slen);
+	  write_iter = strcpya(write_iter, "' in --");
+	  write_iter = strcpya(write_iter, cur_flag_name);
+	  write_iter = strcpya(write_iter, " column set descriptor.\n");
+	  goto parse_col_descriptor_ret_INVALID_CMDLINE_WW;
+	}
+	uint32_t shift = id_map[(uint32_t)alpha_idx];
+	result |= first_col_shifted << shift;
+	if (!tok_end) {
+	  break;
+	}
+	col_descriptor_iter = &(tok_end[1]);
+	if ((col_descriptor_iter[0] == '+') || (col_descriptor_iter[0] == '-')) {
+	  goto parse_col_descriptor_ret_MIXED_SIGN;
+	}
+      }
+    }
+    if (prohibit_empty && (!(result & (first_col_shifted * (0xffffffffU >> (32 - id_ct)))))) {
+      char* write_iter = strcpya(g_logbuf, "Error: All columns excluded by --");
+      write_iter = strcpya(write_iter, cur_flag_name);
+      write_iter = strcpya(write_iter, " column set descriptor.\n");
+      goto parse_col_descriptor_ret_INVALID_CMDLINE_WW;
+    }
+    *((uint32_t*)result_ptr) = result;
+  }
+  while (0) {
+  parse_col_descriptor_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  parse_col_descriptor_ret_MIXED_SIGN:
+    sprintf(g_logbuf, "Error: Invalid --%s column set descriptor (either all column set IDs must be preceded by +/-, or none of them can be).\n", cur_flag_name);
+  parse_col_descriptor_ret_INVALID_CMDLINE_WW:
+    wordwrapb(0);
+    logerrprintb();
+    reterr = kPglRetInvalidCmdline;
+    break;
+  }
+  free_cond(id_map);
+  return reterr;
+}
+
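+// Scans the --export/--recode parameter sequence for output format names.
+// For example (illustrative), "--export vcf bgen-1.2 id-paste=iid" sets the
+// kfExportfVcf and kfExportfBgen12 bits and marks parameters 1 and 2 in
+// *format_param_idxs_ptr, leaving "id-paste=iid" for the caller to parse.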
+void get_exportf_targets(char** argv, uint32_t param_ct, exportf_flags_t* exportf_modifier_ptr, idpaste_t* exportf_id_paste_ptr, uint32_t* format_param_idxs_ptr) {
+  // Does not error out if no format is present, since the --recode
+  // translation path needs that case.  Multiple formats may be specified
+  // simultaneously.
+  uint32_t format_param_idxs = 0;
+  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+    const char* cur_modif = argv[param_idx];
+    const char* cur_modif2 = &(cur_modif[1]);
+    exportf_flags_t cur_format = kfExportf0;
+    switch (*cur_modif) {
+    case '2':
+      if (!strcmp(cur_modif2, "3")) {
+	cur_format = kfExportf23;
+      }
+      break;
+    case 'A':
+    case 'a':
+      if (!cur_modif2[0]) {
+	cur_format = kfExportfA;
+      } else if (((cur_modif2[0] & 0xdf) == 'D') && (!cur_modif2[1])) {
+	cur_format = kfExportfAD;
+      } else if (!strcmp(cur_modif2, "-transpose")) {
+	cur_format = kfExportfATranspose;
+      }
+      break;
+    case 'b':
+      if (!strcmp(cur_modif2, "eagle")) {
+	cur_format = kfExportfBeagle;
+      } else if (!strcmp(cur_modif2, "eagle-nomap")) {
+	cur_format = kfExportfBeagleNomap;
+      } else if ((!strcmp(cur_modif2, "gen-1.1")) || (!strcmp(cur_modif2, "gen_1.1"))) {
+	cur_format = kfExportfBgen11;
+      } else if ((!strcmp(cur_modif2, "gen-1.2")) || (!strcmp(cur_modif2, "gen_1.2"))) {
+	cur_format = kfExportfBgen12;
+      } else if ((!strcmp(cur_modif2, "gen-1.3")) || (!strcmp(cur_modif2, "gen_1.3"))) {
+	cur_format = kfExportfBgen13;
+      } else if (!strcmp(cur_modif2, "imbam")) {
+	cur_format = kfExportfBimbam;
+      } else if (!strcmp(cur_modif2, "imbam-1chr")) {
+	cur_format = kfExportfBimbam1chr;
+      }
+      break;
+    case 'c':
+      if (!strcmp(cur_modif2, "ompound-genotypes")) {
+	cur_format = kfExportfCompound;
+      }
+      break;
+    case 'f':
+      if (!strcmp(cur_modif2, "astphase")) {
+	cur_format = kfExportfFastphase;
+      } else if (!strcmp(cur_modif2, "astphase-1chr")) {
+	cur_format = kfExportfFastphase1chr;
+      }
+      break;
+    case 'h':
+      if (!strcmp(cur_modif2, "aps")) {
+	cur_format = kfExportfHaps;
+	break;
+      } else if (!strcmp(cur_modif2, "apslegend")) {
+	cur_format = kfExportfHapsLegend;
+	break;
+      }
+      // fall through
+    case 'H':
+      if ((cur_modif2[0] & 0xdf) == 'V') {
+	if (!cur_modif2[1]) {
+	  cur_format = kfExportfHv;
+	} else if (!strcmp(&(cur_modif2[1]), "-1chr")) {
+	  cur_format = kfExportfHv1chr;
+	}
+      }
+      break;
+    case 'i':
+      if (!strcmp(cur_modif2, "nd-major-bed")) {
+	cur_format = kfExportfIndMajorBed;
+      }
+      break;
+    case 'l':
+      if (!strcmp(cur_modif2, "gen")) {
+	cur_format = kfExportfLgen;
+      } else if (!strcmp(cur_modif2, "gen-ref")) {
+	cur_format = kfExportfLgenRef;
+      } else if (!strcmp(cur_modif2, "ist")) {
+        cur_format = kfExportfList;
+      }
+      break;
+    case 'o':
+      if (!strcmp(cur_modif2, "xford")) {
+	cur_format = kfExportfOxGen;
+      }
+      break;
+    case 'p':
+      if (!strcmp(cur_modif2, "ed")) {
+	cur_format = kfExportfPed;
+      }
+      break;
+    case 'r':
+      if (!strcmp(cur_modif2, "list")) {
+	cur_format = kfExportfRlist;
+      }
+      break;
+    case 's':
+      if (!strcmp(cur_modif2, "tructure")) {
+	cur_format = kfExportfStructure;
+      }
+      break;
+    case 't':
+      if (!strcmp(cur_modif2, "ranspose")) {
+	cur_format = kfExportfTranspose;
+      }
+      break;
+    case 'v':
+      if ((cur_modif2[0] == 'c') && (cur_modif2[1] == 'f')) {
+	if (!cur_modif2[2]) {
+	  cur_format = kfExportfVcf;
+	} else if ((!strcmp(&(cur_modif2[2]), "-fid")) || (!strcmp(&(cur_modif2[2]), "-iid"))) {
+	  sprintf(g_logbuf, "Note: --export 'v%s' modifier is deprecated.  Use 'vcf' + 'id-paste=%s'.\n", cur_modif2, &(cur_modif2[3]));
+	  cur_format = kfExportfVcf;
+	  *exportf_id_paste_ptr = (cur_modif2[3] == 'f')? kfIdpasteFid : kfIdpasteIid;
+	}
+      }
+      break;
+    }
+    if (cur_format) {
+      format_param_idxs |= 1U << param_idx;
+      *exportf_modifier_ptr |= cur_format;
+    }
+  }
+  *format_param_idxs_ptr = format_param_idxs;
+}
+
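+// Illustrative examples: "@:#[b37]$r,$a" and "@:#:$1:$2" pass validation
+// below, while "@:#:$r:$1" is rejected since '$r'/'$a' cannot be mixed with
+// '$1'/'$2'.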
+uint32_t varid_template_is_valid(char* varid_str, const char* flagname_p) {
+  char* sptr = strchr(varid_str, '@');
+  char* sptr2 = strchr(varid_str, '#');
+  if ((!sptr) || (!sptr2) || strchr(&(sptr[1]), '@') || strchr(&(sptr2[1]), '#')) {
+    LOGERRPRINTFWW("Error: The --%s template string requires exactly one '@' and one '#'.\n", flagname_p);
+    return 0;
+  }
+  // snp/nonsnp is not sufficient for assigning unique IDs to unnamed 1000
+  // Genomes phase 3 variants (see e.g. chr22:18078898).  So we now allow the
+  // template string to include allele names, where '$r' = reference allele,
+  // '$a' = alt1, and '$1'/'$2' refer to ref/alt1 in ASCII-sort order
+  // (necessary for interoperation with plink1).
+  // For now, either '$' must be entirely absent from the template string, or
+  // '$r' and/or '$a' appear exactly once, or '$1' and '$2' both appear exactly
+  // once.
+  // probable todo: alternate naming scheme for long indels (e.g. first base,
+  //   middle length, last base, like "i18n")
+  // possible todo: some way to include alt2, etc. in name
+  sptr = strchr(varid_str, '$');
+  if (sptr) {
+    sptr2 = &(sptr[1]);
+    uint32_t first_allele_type_code = (unsigned char)(*sptr2);
+    if ((first_allele_type_code == '1') || (first_allele_type_code == '2')) {
+      sptr2 = strchr(sptr2, '$');
+      if ((!sptr2) || strchr(&(sptr2[1]), '$') || ((first_allele_type_code + ((unsigned char)sptr2[1])) != '1' + '2')) {
+      varid_template_is_valid_dollar_error:
+	LOGERRPRINTFWW("Error: The --%s template string requires either no instances of '$', exactly one instance of '$r' and/or '$a', or exactly one '$1' and one '$2'.\n", flagname_p);
+	return 0;
+      }
+    } else {
+      first_allele_type_code &= 0xdf; // uppercase
+      if ((first_allele_type_code != 'A') && (first_allele_type_code != 'R')) {
+	goto varid_template_is_valid_dollar_error;
+      }
+      sptr2 = strchr(sptr2, '$');
+      if (sptr2) {
+	const uint32_t second_allele_type_code = (uint32_t)((unsigned char)(*(++sptr2))) & 0xdf;
+	if (((first_allele_type_code + second_allele_type_code) != 'A' + 'R') || strchr(sptr2, '$')) {
+	  goto varid_template_is_valid_dollar_error;
+	}
+      }
+    }
+  }
+  return 1;
+}
+
+
+static_assert(sizeof(int) == sizeof(int32_t), "main() assumes int and int32_t are synonymous.");
+static_assert(!kChrOffsetX, "--autosome-num/--chr-set/--cow/etc. assume kChrOffsetX == 0.");
+static_assert(kChrOffsetY == 1, "--chr-set/--cow/... assume kChrOffsetY == 1.");
+static_assert(kChrOffsetXY == 2, "--chr-set/--cow/... assume kChrOffsetXY == 2.");
+static_assert(kChrOffsetMT == 3, "--chr-set/--cow/... assume kChrOffsetMT == 3.");
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+
+int main(int argc, char** argv) {
+#ifdef __cplusplus
+  using namespace plink2;
+#endif
+  // special case, since it may dump to stdout
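+  // e.g. "plink2 --zst-decompress in.pvar.zst out.pvar" decompresses
+  // in.pvar.zst to out.pvar; when the second (output) parameter is omitted,
+  // the result is written to stdout.  (Filenames here are illustrative.)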
+  if ((argc > 1) && ((!strcmp(argv[1], "--zst-decompress")) || (!strcmp(argv[1], "-zst-decompress")))) {
+    if (argc == 2) {
+      fprintf(stderr, "Error: Missing %s parameter.\n", argv[1]);
+      return (uint32_t)kPglRetInvalidCmdline;
+    }
+    for (int ii = 2; ii < argc; ++ii) {
+      if (is_flag(argv[(uint32_t)ii])) {
+	fprintf(stderr, "Error: %s cannot be used with other flags.\n", argv[1]);
+	return (uint32_t)kPglRetInvalidCmdline;
+      }
+    }
+    if (argc > 4) {
+      fprintf(stderr, "Error: %s accepts at most 2 parameters.\n", argv[1]);
+      return (uint32_t)kPglRetInvalidCmdline;
+    }
+    return (uint32_t)zst_decompress(argv[2], (argc == 4)? argv[3] : nullptr);
+  }
+  
+  unsigned char* bigstack_ua = nullptr;
+  char** subst_argv = nullptr;
+  char* script_buf = nullptr;
+  char* rerun_buf = nullptr;
+  char* flag_buf = nullptr;
+  char* flagname_p = nullptr;
+  uint32_t* flag_map = nullptr;
+  char* king_cutoff_fprefix = nullptr;
+  char* const_fid = nullptr;
+  char* var_filter_exceptions_flattened = nullptr;
+  char* require_pheno_flattened = nullptr;
+  char* require_covar_flattened = nullptr;
+  char* import_single_chr_str = nullptr;
+  char* ox_missing_code = nullptr;
+  char* vcf_dosage_import_field = nullptr;
+  FILE* scriptfile = nullptr;
+  uint32_t* rseeds = nullptr;
+  ll_str_t* file_delete_list = nullptr;
+  uint32_t arg_idx = 0;
+  uint32_t print_end_time = 0;
+  uint32_t warning_errcode = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  plink2_cmdline_t pc;
+  pc.filter_flags = kfFilter0;
+  pc.varid_template = nullptr;
+  pc.missing_varid_match = nullptr;
+  pc.varid_from = nullptr;
+  pc.varid_to = nullptr;
+  pc.varid_snp = nullptr;
+  pc.varid_exclude_snp = nullptr;
+  pc.pheno_fname = nullptr;
+  pc.covar_fname = nullptr;
+  pc.sample_sort_fname = nullptr;
+  pc.keep_fnames = nullptr;
+  pc.keepfam_fnames = nullptr;
+  pc.remove_fnames = nullptr;
+  pc.removefam_fnames = nullptr;
+  pc.extract_fnames = nullptr;
+  pc.exclude_fnames = nullptr;
+  pc.update_sex_fname = nullptr;
+  pc.freq_ref_binstr = nullptr;
+  pc.freq_alt1_binstr = nullptr;
+  pc.glm_local_covar_fname = nullptr;
+  pc.glm_local_pvar_fname = nullptr;
+  pc.glm_local_psam_fname = nullptr;
+  pc.read_freq_fname = nullptr;
+  pc.within_fname = nullptr;
+  pc.catpheno_name = nullptr;
+  pc.family_missing_catname = nullptr;
+  pc.keep_cats_fname = nullptr;
+  pc.keep_cat_names_flattened = nullptr;
+  pc.keep_cat_phenoname = nullptr;
+  pc.remove_cats_fname = nullptr;
+  pc.remove_cat_names_flattened = nullptr;
+  pc.remove_cat_phenoname = nullptr;
+  pc.split_cat_phenonames_flattened = nullptr;
+  pc.vstd_flattened = nullptr;
+  pc.quantnorm_flattened = nullptr;
+  pc.covar_quantnorm_flattened = nullptr;
+  init_range_list(&pc.snps_range_list);
+  init_range_list(&pc.exclude_snps_range_list);
+  init_range_list(&pc.pheno_range_list);
+  init_range_list(&pc.covar_range_list);
+  init_ld(&pc.ld_info);
+  init_glm(&pc.glm_info);
+  init_adjust(&pc.adjust_info);
+  init_score(&pc.score_info);
+  init_cmp_expr(&pc.keep_if_expr);
+  init_cmp_expr(&pc.remove_if_expr);
+  chr_info_t chr_info;
+  if (init_chr_info(&chr_info)) {
+    goto main_ret_NOMEM_NOLOG;
+  }
+  
+  {
+    // standardize strtod() behavior
+    // setlocale(LC_NUMERIC, "C");
+
+    uint32_t first_arg_idx = 1;
+    for (arg_idx = 1; arg_idx < (uint32_t)argc; ++arg_idx) {
+      if ((!strcmp("-script", argv[arg_idx])) || (!strcmp("--script", argv[arg_idx]))) {
+	const uint32_t param_ct = param_count(argv, argc, arg_idx);
+	if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	  print_ver();
+	  fputs(g_logbuf, stderr);
+	  fputs(errstr_append, stderr);
+	  goto main_ret_INVALID_CMDLINE;
+	}
+	for (uint32_t arg_idx2 = arg_idx + 2; arg_idx2 < (uint32_t)argc; ++arg_idx2) {
+	  if ((!strcmp("-script", argv[arg_idx2])) || (!strcmp("--script", argv[arg_idx2]))) {
+	    print_ver();
+	    fputs("Error: Multiple --script flags.  Merge the files into one.\n", stderr);
+	    fputs(errstr_append, stderr);
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	}
+	// logging not yet active, so don't use fopen_checked()
+	scriptfile = fopen(argv[arg_idx + 1], FOPEN_RB);
+	if (!scriptfile) {
+	  print_ver();
+	  fprintf(stderr, g_errstr_fopen, argv[arg_idx + 1]);
+	  goto main_ret_OPEN_FAIL;
+	}
+	if (fseeko(scriptfile, 0, SEEK_END)) {
+	  goto main_ret_READ_FAIL_NOLOG;
+	}
+	int64_t fsize = ftello(scriptfile);
+	if (fsize < 0) {
+	  goto main_ret_READ_FAIL_NOLOG;
+	}
+	if (fsize > 0x7ffffffe) {
+	  // could actually happen if user enters parameters in the wrong
+	  // order, so may as well catch it and print a somewhat informative
+	  // error message
+	  print_ver();
+	  fputs("Error: --script file too large.", stderr);
+	  goto main_ret_INVALID_CMDLINE;
+	}
+	rewind(scriptfile);
+	const uint32_t fsize_ui = (uint32_t)fsize;
+	if (pgl_malloc(fsize_ui + 1, &script_buf)) {
+	  goto main_ret_NOMEM_NOLOG;
+	}
+	if (!fread(script_buf, fsize_ui, 1, scriptfile)) {
+	  goto main_ret_READ_FAIL_NOLOG;
+	}
+	script_buf[fsize_ui] = '\0';
+	fclose_null(&scriptfile);
+	uint32_t num_script_params = 0;
+	char* script_buf_iter = script_buf;
+	uint32_t char_code;
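+	// Tokenizing convention: all characters in 1..32 count as whitespace,
+	// so "char_code_m1 < 32" skips separators, and char_code_m1 ==
+	// 0xffffffffU corresponds to the terminating null byte.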
+	do {
+	  uint32_t char_code_m1;
+	  do {
+	    char_code_m1 = ((uint32_t)((unsigned char)(*script_buf_iter++))) - 1;
+	  } while (char_code_m1 < 32);
+	  if (char_code_m1 == 0xffffffffU) {
+	    break;
+	  }
+	  ++num_script_params;
+	  do {
+	    char_code = (uint32_t)((unsigned char)(*script_buf_iter++));
+	  } while (char_code > 32);
+	} while (char_code);
+	if (script_buf_iter != (&(script_buf[fsize_ui + 1]))) {
+	  print_ver();
+	  fputs("Error: Null byte in --script file.\n", stderr);
+	  goto main_ret_INVALID_CMDLINE;
+	}
+	const uint32_t new_param_ct = num_script_params + argc - 3;
+	if (pgl_malloc(new_param_ct * sizeof(intptr_t), &subst_argv)) {
+	  goto main_ret_NOMEM_NOLOG;
+	}
+	memcpy(subst_argv, &(argv[1]), arg_idx * sizeof(intptr_t));
+	const uint32_t load_param_idx_end = arg_idx + num_script_params;
+	script_buf_iter = &(script_buf[-1]);
+	for (uint32_t param_idx = arg_idx; param_idx < load_param_idx_end; ++param_idx) {
+	  while (((unsigned char)(*(++script_buf_iter))) <= 32);
+	  subst_argv[param_idx] = script_buf_iter;
+	  while (((unsigned char)(*(++script_buf_iter))) > 32);
+	  // could enforce some sort of length limit here
+	  *script_buf_iter = '\0';
+	}
+	memcpy(&(subst_argv[load_param_idx_end]), &(argv[arg_idx + 2]), (argc - arg_idx - 2) * sizeof(intptr_t));
+	argc = new_param_ct;
+	first_arg_idx = 0;
+	argv = subst_argv;
+	break;
+      }
+    }
+    for (arg_idx = first_arg_idx; arg_idx < (uint32_t)argc; ++arg_idx) {
+      if ((!strcmp("-rerun", argv[arg_idx])) || (!strcmp("--rerun", argv[arg_idx]))) {
+	const uint32_t param_ct = param_count(argv, argc, arg_idx);
+	if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	  print_ver();
+	  fputs(g_logbuf, stderr);
+	  fputs(errstr_append, stderr);
+	  goto main_ret_INVALID_CMDLINE;
+	}
+	for (uint32_t arg_idx2 = arg_idx + param_ct + 1; arg_idx2 < (uint32_t)argc; ++arg_idx2) {
+	  if ((!strcmp("-rerun", argv[arg_idx2])) || (!strcmp("--rerun", argv[arg_idx2]))) {
+	    print_ver();
+	    fputs("Error: Duplicate --rerun flag.\n", stderr);
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	}
+	reterr = rerun(arg_idx, param_ct, &argc, &first_arg_idx, &argv, &subst_argv, &rerun_buf);
+	if (reterr) {
+	  goto main_ret_NOLOG;
+	}
+	break;
+      }
+    }
+    if ((first_arg_idx < (uint32_t)argc) && (!is_flag(argv[first_arg_idx]))) {
+      fputs("Error: First parameter must be a flag.\n", stderr);
+      fputs(errstr_append, stderr);
+      goto main_ret_INVALID_CMDLINE;
+    }
+    uint32_t flag_ct = 0;
+    uint32_t version_present = 0;
+    uint32_t silent_present = 0;
+    for (arg_idx = first_arg_idx; arg_idx < (uint32_t)argc; ++arg_idx) {
+      flagname_p = is_flag_start(argv[arg_idx]);
+      if (flagname_p) {
+	if (!strcmp("help", flagname_p)) {
+	  print_ver();
+	  if ((!first_arg_idx) || (arg_idx != 1) || subst_argv) {
+	    fputs("--help present, ignoring other flags.\n", stdout);
+	  }
+	  if ((arg_idx == ((uint32_t)argc) - 1) && flag_ct) {
+	    // make "plink [valid flags/parameters] --help" work, and skip the
+	    // parameters
+	    char** help_argv;
+	    if (pgl_malloc(flag_ct * sizeof(intptr_t), &help_argv)) {
+	      goto main_ret_NOMEM_NOLOG2;
+	    }
+	    uint32_t arg_idx2 = 0;
+	    for (uint32_t flag_idx = 0; flag_idx < flag_ct; ++flag_idx) {
+	      while (!is_flag_start(argv[++arg_idx2]));
+	      help_argv[flag_idx] = argv[arg_idx2];
+	    }
+	    reterr = disp_help(flag_ct, help_argv);
+	    free(help_argv);
+	  } else {
+	    reterr = disp_help(argc - arg_idx - 1, &(argv[arg_idx + 1]));
+	  }
+	  goto main_ret_1;
+	}
+	if ((!strcmp("h", flagname_p)) || (!strcmp("?", flagname_p))) {
+	  // these just act like the no-parameter case
+	  print_ver();
+	  if ((!first_arg_idx) || (arg_idx != 1) || subst_argv) {
+	    printf("-%c present, ignoring other flags.\n", *flagname_p);
+	  }
+	  fputs(g_cmdline_format_str, stdout);
+	  fputs(notestr_null_calc2, stdout);
+	  reterr = kPglRetHelp;
+	  goto main_ret_1;
+	}
+	if (!strcmp("version", flagname_p)) {
+	  version_present = 1;
+	} else if (!strcmp("silent", flagname_p)) {
+	  silent_present = 1;
+	}
+	if (strlen(flagname_p) >= kMaxFlagBlen) {
+	  print_ver();
+	  // shouldn't be possible for this to overflow the buffer...
+	  sprintf(g_logbuf, "Error: Unrecognized flag ('%s').\n", argv[arg_idx]);
+	  wordwrapb(0);
+	  fputs(g_logbuf, stderr);
+	  fputs(errstr_append, stderr);
+	  goto main_ret_INVALID_CMDLINE;
+	}
+	++flag_ct;
+      }
+    }
+    if (version_present) {
+      fputs(ver_str, stdout);
+      putc_unlocked('\n', stdout);
+      goto main_ret_1;
+    }
+    if (silent_present) {
+      if (!freopen("/dev/null", "w", stdout)) {
+	fputs("Warning: --silent failed.", stderr);
+	g_stderr_written_to = 1;
+      }
+    }
+    print_ver();
+    if (!flag_ct) {
+      goto main_ret_NULL_CALC_0;
+    }
+    if (pgl_malloc(flag_ct * kMaxFlagBlen, &flag_buf) ||
+	pgl_malloc(flag_ct * sizeof(int32_t), &flag_map)) {
+      goto main_ret_NOMEM_NOLOG2;
+    }
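+    // flag_buf is a matrix of flag_ct fixed-width (kMaxFlagBlen) strings
+    // holding canonicalized flag names, and flag_map[i] records which argv[]
+    // entry flag i came from; the two arrays are sorted together below so
+    // flags can be processed in alphabetical order.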
+    char* flagname_write_iter = flag_buf;
+    uint32_t cur_flag_idx = 0;
+    for (arg_idx = first_arg_idx; arg_idx < (uint32_t)argc; ++arg_idx) {
+      flagname_p = is_flag_start(argv[arg_idx]);
+      if (flagname_p) {
+	const uint32_t flag_slen = strlen(flagname_p);
+	switch (*flagname_p) {
+	case '\0':
+	  // special case, since we reserve empty names for preprocessed flags
+	  fputs("Error: Unrecognized flag ('--').\n", stderr);
+	  goto main_ret_INVALID_CMDLINE;
+	case 'a':
+	  if ((flag_slen == 3) && (!memcmp(flagname_p, "aec", 3))) {
+	    strcpy(flagname_write_iter, "allow-extra-chr");
+	  } else if ((flag_slen == 11) && (!memcmp(flagname_p, "autosome-xy", 11))) {
+	    strcpy(flagname_write_iter, "autosome-par");
+	  } else {
+	    goto main_flag_copy;
+	  }
+	  break;
+	case 'b':
+	  if ((flag_slen == 3) && (!memcmp(flagname_p, "bed", 3))) {
+	    strcpy(flagname_write_iter, "pgen");
+	  } else if ((flag_slen == 3) && (!memcmp(flagname_p, "bim", 3))) {
+	    strcpy(flagname_write_iter, "pvar");
+	  } else {
+	    goto main_flag_copy;
+	  }
+	  break;
+	case 'f':
+	  if ((flag_slen == 3) && (!memcmp(flagname_p, "fam", 3))) {
+	    strcpy(flagname_write_iter, "psam");
+	  } else if ((flag_slen == 12) && (!memcmp(flagname_p, "filter-males", 12))) {
+	    strcpy(flagname_write_iter, "keep-males");
+	  } else if ((flag_slen == 14) && (!memcmp(flagname_p, "filter-females", 14))) {
+	    fputs("Note: --filter-females flag deprecated.  Use --keep-females or --remove-males\ninstead.\n", stdout);
+	    strcpy(flagname_write_iter, "remove-males");
+	  } else if ((flag_slen == 15) && (!memcmp(flagname_p, "filter-founders", 15))) {
+	    strcpy(flagname_write_iter, "keep-founders");
+	  } else if ((flag_slen == 18) && (!memcmp(flagname_p, "filter-nonfounders", 18))) {
+	    strcpy(flagname_write_iter, "keep-nonfounders");
+	  } else {
+	    goto main_flag_copy;
+	  }
+	  break;
+	case 'h':
+	  if ((flag_slen == 5) && (!memcmp(flagname_p, "hound", 5))) {
+	    // the creature type should be Dog.
+	    strcpy(flagname_write_iter, "dog");
+	  } else {
+	    goto main_flag_copy;
+	  }
+	  break;
+	case 'k':
+	  if ((flag_slen == 13) && (!memcmp(flagname_p, "keep-clusters", 13))) {
+	    fputs("Note: --keep-clusters flag deprecated.  Use --keep-cats instead.\n", stdout);
+	    strcpy(flagname_write_iter, "keep-cats");
+	  } else if ((flag_slen == 18) && (!memcmp(flagname_p, "keep-cluster-names", 18))) {
+	    fputs("Note: --keep-cluster-names flag deprecated.  Use --keep-cat-names instead.\n", stdout);
+	    strcpy(flagname_write_iter, "keep-cat-names");
+	  } else {
+	    goto main_flag_copy;
+	  }
+	  break;
+	case 'l':
+	  if (((flag_slen == 6) && (!memcmp(flagname_p, "linear", 6))) || ((flag_slen == 8) && (!memcmp(flagname_p, "logistic", 8)))) {
+	    strcpy(flagname_write_iter, "glm");
+	  } else {
+	    goto main_flag_copy;
+	  }
+	  break;
+	case 'm':
+	  if ((flag_slen == 6) && (!memcmp(flagname_p, "min-ac", 6))) {
+	    strcpy(flagname_write_iter, "mac");
+	  } else if ((flag_slen == 6) && (!memcmp(flagname_p, "max-ac", 6))) {
+	    strcpy(flagname_write_iter, "max-mac");
+	  } else if ((flag_slen == 10) && (!memcmp(flagname_p, "make-bfile", 10))) {
+	    strcpy(flagname_write_iter, "make-bed");
+	  } else if ((flag_slen == 11) && (!memcmp(flagname_p, "make-bpfile", 11))) {
+	    strcpy(flagname_write_iter, "make-bpgen");
+	  } else if ((flag_slen == 10) && (!memcmp(flagname_p, "make-pfile", 10))) {
+	    strcpy(flagname_write_iter, "make-pgen");
+	  } else if ((flag_slen == 12) && (!memcmp(flagname_p, "missing_code", 12))) {
+	    strcpy(flagname_write_iter, "missing-code");
+	  } else {
+	    goto main_flag_copy;
+	  }
+	  break;
+	case 'n':
+	  if ((flag_slen == 11) && (!memcmp(flagname_p, "num_threads", 11))) {
+	    strcpy(flagname_write_iter, "threads");
+	  } else {
+	    goto main_flag_copy;
+	  }
+	  break;
+        case 'p':
+	  if ((flag_slen == 5) && (!memcmp(flagname_p, "prune", 5))) {
+	    strcpy(flagname_write_iter, "require-pheno");
+	  } else {
+	    goto main_flag_copy;
+	  }
+	  break;
+	case 'r':
+	  if ((flag_slen == 6) && (!memcmp(flagname_p, "recode", 6))) {
+	    // special case: translate to "export ped" if no format specified
+	    const uint32_t param_ct = param_count(argv, argc, arg_idx);
+	    if (param_ct > 4) {
+	      fputs("Error: --recode accepts at most 4 parameters.\n", stderr);
+	      goto main_ret_INVALID_CMDLINE;
+	    }
+	    exportf_flags_t dummy;
+	    idpaste_t dummy2;
+	    uint32_t format_param_idxs;
+	    get_exportf_targets(&(argv[arg_idx]), param_ct, &dummy, &dummy2, &format_param_idxs);
+	    if (!format_param_idxs) {
+	      strcpy(flagname_write_iter, "export ped");
+	    } else {
+	      strcpy(flagname_write_iter, "export");
+	    }
+	  } else if ((flag_slen == 15) && (!memcmp(flagname_p, "remove-founders", 15))) {
+	    strcpy(flagname_write_iter, "keep-founders");
+	  } else if ((flag_slen == 18) && (!memcmp(flagname_p, "remove-nonfounders", 18))) {
+	    strcpy(flagname_write_iter, "keep-nonfounders");
+	  } else if ((flag_slen == 15) && (!memcmp(flagname_p, "remove-clusters", 15))) {
+	    fputs("Note: --remove-clusters flag deprecated.  Use --remove-cats instead.\n", stdout);	    
+	    strcpy(flagname_write_iter, "remove-cats");
+	  } else if ((flag_slen == 20) && (!memcmp(flagname_p, "remove-cluster-names", 20))) {
+	    fputs("Note: --remove-cluster-names flag deprecated.  Use --remove-cat-names instead.\n", stdout);
+	    strcpy(flagname_write_iter, "remove-cat-names");
+	  } else {
+	    goto main_flag_copy;
+	  }
+	  break;
+	case 't':
+	  if ((flag_slen == 10) && (!memcmp(flagname_p, "thread-num", 10))) {
+	    strcpy(flagname_write_iter, "threads");
+	  } else {
+	    goto main_flag_copy;
+	  }
+	  break;
+	case 'v':
+	  if ((flag_slen == 10) && (!memcmp(flagname_p, "vcf-filter", 10))) {
+	    strcpy(flagname_write_iter, "var-filter");
+	  } else if ((flag_slen == 12) && (!memcmp(flagname_p, "vcf-min-qual", 12))) {
+	    strcpy(flagname_write_iter, "var-min-qual");
+	  } else {
+	    goto main_flag_copy;
+	  }
+	  break;
+	default:
+	main_flag_copy:
+	  memcpy(flagname_write_iter, flagname_p, flag_slen + 1);
+	}
+	flagname_write_iter = &(flagname_write_iter[kMaxFlagBlen]);
+	flag_map[cur_flag_idx++] = arg_idx;
+      }
+    }
+    reterr = sort_cmdline_flags(kMaxFlagBlen, flag_ct, flag_buf, flag_map);
+    if (reterr) {
+      if (reterr == kPglRetNomem) {
+	goto main_ret_NOMEM_NOLOG2;
+      }
+      goto main_ret_NOLOG;
+    }
+    char outname[kPglFnamesize];
+    memcpy(outname, "plink2", 6);
+    char* outname_end = nullptr;
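+    // flag_buf is sorted, so we can stop as soon as memcmp() returns <= 0;
+    // the 4-byte compare includes the null terminator, so only an exact
+    // "out" match succeeds.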
+    for (cur_flag_idx = 0; cur_flag_idx < flag_ct; ++cur_flag_idx) {
+      int32_t memcmp_out_result = memcmp("out", &(flag_buf[cur_flag_idx * kMaxFlagBlen]), 4);
+      if (!memcmp_out_result) {
+	arg_idx = flag_map[cur_flag_idx];
+	const uint32_t param_ct = param_count(argv, argc, arg_idx);
+	if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	  fputs(g_logbuf, stderr);
+	  fputs(errstr_append, stderr);
+	  goto main_ret_INVALID_CMDLINE;
+	}
+	if (strlen(argv[arg_idx + 1]) > (kPglFnamesize - kMaxOutfnameExtBlen)) {
+	  fflush(stdout);
+	  fputs("Error: --out parameter too long.\n", stderr);
+	  goto main_ret_OPEN_FAIL;
+	}
+	const uint32_t slen = strlen(argv[arg_idx + 1]);
+	memcpy(outname, argv[arg_idx + 1], slen + 1);
+	outname_end = &(outname[slen]);
+      }
+      if (memcmp_out_result <= 0) {
+	break;
+      }
+    }
+    if (init_logfile(0, outname, outname_end? outname_end : &(outname[6]))) {
+      goto main_ret_OPEN_FAIL;
+    }
+    logstr(ver_str);
+    logstr("\n");
+    logprint("Options in effect:\n");
+    for (cur_flag_idx = 0; cur_flag_idx < flag_ct; ++cur_flag_idx) {
+      logprint("  --");
+      logprint(&(flag_buf[cur_flag_idx * kMaxFlagBlen]));
+      arg_idx = flag_map[cur_flag_idx] + 1;
+      while ((arg_idx < (uint32_t)argc) && (!is_flag(argv[arg_idx]))) {
+	logprint(" ");
+	logprint(argv[arg_idx++]);
+      }
+      logprint("\n");
+    }
+    logprint("\n");
+
+#ifdef _WIN32
+    DWORD windows_dw = kTextbufSize;
+    if (GetComputerName(g_textbuf, &windows_dw))
+#else
+    if (gethostname(g_textbuf, kTextbufSize) != -1)
+#endif
+    {
+      logstr("Hostname: ");
+      logstr(g_textbuf);
+    }
+    logstr("\nWorking directory: ");
+    if (!getcwd(g_textbuf, kPglFnamesize)) {
+      goto main_ret_READ_FAIL;
+    }
+    logstr(g_textbuf);
+    logstr("\n");
+    logprint("Start time: ");
+    time_t rawtime;
+    time(&rawtime);
+    logprint(ctime(&rawtime));
+    // ctime string always has a newline at the end
+    logstr("\n");
+
+    int32_t known_procs;
+#ifdef _WIN32
+    SYSTEM_INFO sysinfo;
+    GetSystemInfo(&sysinfo);
+    pc.max_thread_ct = sysinfo.dwNumberOfProcessors;
+    known_procs = pc.max_thread_ct;
+#else
+    known_procs = sysconf(_SC_NPROCESSORS_ONLN);
+    pc.max_thread_ct = (known_procs == -1)? 1 : ((uint32_t)known_procs);
+#endif
+    // don't subtract 1 any more since, when max_thread_ct > 2, one of the
+    // (virtual) cores will be dedicated to I/O and have lots of downtime.
+    if (pc.max_thread_ct > kMaxThreads) {
+      pc.max_thread_ct = kMaxThreads;
+    }
+
+    char pgenname[kPglFnamesize];
+    char psamname[kPglFnamesize];
+    char pvarname[kPglFnamesize];
+    pgenname[0] = '\0';
+    psamname[0] = '\0';
+    pvarname[0] = '\0';
+    init_pheno();
+    cur_flag_idx = 0;
+    pc.command_flags1 = kfCommand10;
+    // uint64_t command_flags2 = 0;
+    pc.misc_flags = kfMisc0;
+    pc.pvar_psam_modifier = kfPvarPsam0;
+    pc.exportf_modifier = kfExportf0;
+    pc.sample_sort_flags = kfSort0;
+    pc.grm_flags = kfGrm0;
+    pc.pca_flags = kfPca0;
+    pc.write_covar_flags = kfWriteCovar0;
+    pc.pheno_transform_flags = kfPhenoTransform0;
+    pc.fam_cols = kfFamCol13456;
+    pc.exportf_id_paste = kfIdpaste0;
+    pc.king_modifier = kfKing0;
+    pc.king_cutoff = -1;
+    pc.king_table_filter = -DBL_MAX;
+    pc.allele_freq_modifier = kfAlleleFreq0;
+    pc.missing_rpt_modifier = kfMissingRpt0;
+    pc.geno_counts_modifier = kfGenoCounts0;
+    pc.hardy_modifier = kfHardy0;
+    pc.aperm.min = 6;
+    pc.aperm.max = 1000000;
+    pc.aperm.alpha = 0.0;
+    pc.aperm.beta = 0.0001;
+    pc.aperm.init_interval = 1.0;
+    pc.aperm.interval_slope = 0.001;
+    pc.ci_size = 0.0;
+    
+    // Default value is 1638 = 32768 / 20, and that's applied to imported
+    // dosages when --hard-call-threshold is not specified.
+    // However, when --make-{b}pgen is run on a dosage-containing dataset,
+    // explicit --hard-call-threshold will cause the hardcall set to be
+    // regenerated, and that won't happen without --hard-call-threshold.  So we
+    // need to distinguish between --hard-call-threshold 0.1 and no flag.
+    pc.hard_call_thresh = 0xffffffffU;
+    
+    pc.dosage_erase_thresh = 0;
+    pc.pfilter = 2.0; // make --pfilter 1 still filter out NAs
+    pc.output_min_p = 0.0;
+    pc.vif_thresh = 50.0;
+    pc.mind_thresh = 1.0;
+    pc.geno_thresh = 1.0;
+    pc.hwe_thresh = 1.0;
+    pc.mach_r2_min = 0.0;
+    pc.mach_r2_max = 0.0;
+    pc.min_maf = 0.0;
+    pc.max_maf = 1.0;
+    pc.min_allele_dosage = 0;
+    pc.max_allele_dosage = (~0LLU);
+    pc.var_min_qual = -1;
+    pc.update_sex_colm2 = 1;
+    pc.new_variant_id_max_allele_slen = 23;
+    pc.splitpar_bound1 = 0;
+    pc.splitpar_bound2 = 0;
+    pc.missing_pheno = -9;
+    pc.from_bp = -1;
+    pc.to_bp = -1;
+    pc.window_bp = -1;
+    pc.pca_ct = 0;
+    pc.xchr_model = 2;
+    pc.parallel_idx = 0;
+    pc.parallel_tot = 1;
+    pc.exportf_bits = 0;
+    pc.mwithin_val = 1;
+    pc.exportf_id_delim = '\0';
+    double import_dosage_certainty = 0.0;
+    int32_t vcf_min_gq = -1;
+    int32_t vcf_min_dp = -1;
+    intptr_t malloc_size_mb = 0;
+    load_params_t load_params = kfLoadParams0;
+    xload_t xload = kfXload0;
+    uint32_t rseed_ct = 0;
+    make_plink2_t make_plink2_modifier = kfMake0;
+    oxford_import_t oxford_import_flags = kfOxfordImport0;
+    vcf_half_call_t vcf_half_call = kVcfHalfCallDefault;
+    char range_delim = '-';
+    char id_delim = '\0';
+    char idspace_to = '\0';
+    char input_missing_geno_char = '0';
+    char output_missing_geno_char = '.';
+    uint32_t aperm_present = 0;
+    uint32_t notchr_present = 0;
+    uint32_t permit_multiple_inclusion_filters = 0;
+    uint32_t memory_require = 0;
+    gendummy_info_t gendummy_info;
+    init_gendummy(&gendummy_info);
+    plink1_dosage_info_t plink1_dosage_info;
+    init_plink1_dosage(&plink1_dosage_info);
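+    // Main flag-dispatch loop.  Flags arrive in alphabetical order, the
+    // outer switch keys on the first character, and each memcmp() below only
+    // needs to check the rest of the name (flagname_p2) plus its null
+    // terminator.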
+    do {
+      flagname_p = &(flag_buf[cur_flag_idx * kMaxFlagBlen]);
+      if (!(*flagname_p)) {
+	// preprocessed; not relevant now, but will need --d later
+	continue;
+      }
+      char* flagname_p2 = &(flagname_p[1]);
+      arg_idx = flag_map[cur_flag_idx];
+      uint32_t param_ct = param_count(argv, argc, arg_idx);
+      switch (*flagname_p) {
+      case '1':
+	if (*flagname_p2 == '\0') {
+	  pc.misc_flags |= kfMiscAffection01;
+	  goto main_param_zero;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+	
+      case 'a':
+	if (!memcmp(flagname_p2, "llow-extra-chr", 15)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct) {
+	    const char* cur_modif = argv[arg_idx + 1];
+	    if (memcmp("0", cur_modif, 2)) {
+	      sprintf(g_logbuf, "Error: Invalid --allow-extra-chr parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	    chr_info.zero_extra_chrs = 1;
+	  }
+	  pc.misc_flags |= kfMiscAllowExtraChrs;
+	} else if (!memcmp(flagname_p2, "utosome", 8)) {
+	  pc.misc_flags |= kfMiscAutosomeOnly;
+	  chr_info.is_include_stack = 1;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "utosome-par", 12)) {
+	  if (pc.misc_flags & kfMiscAutosomeOnly) {
+	    logerrprint("Error: --autosome-par cannot be used with --autosome.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  pc.misc_flags |= kfMiscAutosomePar;
+	  chr_info.is_include_stack = 1;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "llow-no-samples", 16)) {
+	  // er, lock these out until they're at least close to fully supported
+	  logerrprint("Error: --allow-no-samples is not implemented yet.\n");
+	  reterr = kPglRetNotYetSupported;
+	  goto main_ret_1;
+	  pc.misc_flags |= kfMiscAllowNoSamples;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "llow-no-vars", 13)) {
+	  logerrprint("Error: --allow-no-vars is not implemented yet.\n");
+	  reterr = kPglRetNotYetSupported;
+	  goto main_ret_1;
+	  pc.misc_flags |= kfMiscAllowNoVars;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "djust", 6)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 3)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 2) && (!memcmp(cur_modif, "gc", 2))) {
+	      pc.adjust_info.flags |= kfAdjustGc;
+	    } else if ((cur_modif_slen == 5) && (!memcmp(cur_modif, "log10", 5))) {
+	      pc.adjust_info.flags |= kfAdjustLog10;
+	    } else if ((cur_modif_slen > 5) && (!memcmp(cur_modif, "cols=", 5))) {
+	      if (pc.adjust_info.flags & kfAdjustColAll) {
+		logerrprint("Error: Multiple --adjust cols= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      reterr = parse_col_descriptor(&(cur_modif[5]), "chrom\0pos\0ref\0alt1\0alt\0unadj\0gc\0qq\0bonf\0holm\0sidakss\0sidaksd\0fdrbh\0fdrby\0", "adjust", kfAdjustColChrom, kfAdjustColDefault, 1, &pc.adjust_info.flags);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --adjust parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  if (!(pc.adjust_info.flags & kfAdjustColAll)) {
+	    pc.adjust_info.flags |= kfAdjustColDefault;
+	  }
+	} else if (!memcmp(flagname_p2, "perm", 5)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 6)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  if (scan_posint_defcap(cur_modif, &pc.aperm.min)) {
+	    sprintf(g_logbuf, "Error: Invalid --aperm min permutation count '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  ++pc.aperm.min;
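+	  // internal convention: aperm.min is stored as (user value + 1)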
+	  if (param_ct > 1) {
+	    cur_modif = argv[arg_idx + 2];
+	    if (scan_posint_capped(cur_modif, kApermMax, &pc.aperm.max)) {
+	      sprintf(g_logbuf, "Error: Invalid --aperm max permutation count '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  if (pc.aperm.min >= pc.aperm.max) {
+	    logerrprint("Error: --aperm min permutation count must be smaller than max.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  aperm_present = 1;
+	  if (param_ct > 2) {
+	    cur_modif = argv[arg_idx + 3];
+	    if (!scanadv_double(cur_modif, &pc.aperm.alpha)) {
+	      sprintf(g_logbuf, "Error: Invalid --aperm alpha threshold '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	    if (param_ct > 3) {
+	      cur_modif = argv[arg_idx + 4];
+	      if ((!scanadv_double(cur_modif, &pc.aperm.beta)) || (pc.aperm.beta <= 0.0)) {
+		sprintf(g_logbuf, "Error: Invalid --aperm beta '%s'.\n", cur_modif);
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	      if (param_ct > 4) {
+		cur_modif = argv[arg_idx + 5];
+		if (!scanadv_double(cur_modif, &pc.aperm.init_interval)) {
+		  sprintf(g_logbuf, "Error: Invalid --aperm initial pruning interval '%s'.\n", cur_modif);
+		  goto main_ret_INVALID_CMDLINE_WWA;
+		}
+		if ((pc.aperm.init_interval < 1.0) || (pc.aperm.init_interval > 1000000.0)) {
+		  sprintf(g_logbuf, "Error: Invalid --aperm initial pruning interval '%s'.\n", cur_modif);
+		  goto main_ret_INVALID_CMDLINE_WWA;
+		}
+		if (param_ct == 6) {
+		  cur_modif = argv[arg_idx + 6];
+		  if (!scanadv_double(cur_modif, &pc.aperm.interval_slope) || (pc.aperm.interval_slope < 0.0) || (pc.aperm.interval_slope > 1.0)) {
+		    sprintf(g_logbuf, "Error: Invalid --aperm pruning interval slope '%s'.\n", cur_modif);
+		    goto main_ret_INVALID_CMDLINE_WWA;
+		  }
+		}
+	      }
+	    }
+	  }
+	} else if (!memcmp(flagname_p2, "utosome-num", 12)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* cur_modif = argv[arg_idx + 1];
+	  uint32_t autosome_ct;
+	  if (scan_posint_capped(cur_modif, kMaxChrTextnum, &autosome_ct)) {
+	    sprintf(g_logbuf, "Error: Invalid --autosome-num parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  // see plink2_common finalize_chrset()
+	  chr_info.chrset_source = kChrsetSourceCmdline;
+	  chr_info.autosome_ct = autosome_ct;
+	  // assumes first code is X
+	  chr_info.xymt_codes[0] = autosome_ct + 1;
+	  for (uint32_t xymt_idx = 1; xymt_idx < kChrOffsetCt; ++xymt_idx) {
+	    // bugfix: this needs to be -2, not -1, for get_chr_code() to work
+	    // properly
+	    chr_info.xymt_codes[xymt_idx] = -2;
+	  }
+	  chr_info.haploid_mask[0] = 0;
+	  set_bit(autosome_ct + 1, chr_info.haploid_mask);
+	} else if (!memcmp(flagname_p2, "llow-no-sex", 12)) {
+	  logprint("Note: --allow-no-sex no longer has any effect.  (Missing-sex samples are\nautomatically excluded from association analysis when sex is a covariate, and\ntreated normally otherwise.)\n");
+	  goto main_param_zero;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+	
+      case 'b':
+	if (!memcmp(flagname_p2, "file", 5)) {
+	  if (xload) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t fname_modif_idx = 1;
+	  if (param_ct == 2) {
+	    if (check_extra_param(&(argv[arg_idx]), "vzs", &fname_modif_idx)) {
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	  }
+	  const char* fname_prefix = argv[arg_idx + fname_modif_idx];
+	  const uint32_t slen = strlen(fname_prefix);
+	  if (slen > (kPglFnamesize - 9)) {
+	    // could use kPglFnamesize - 2 - 3 * param_ct, but that's pointless
+	    logerrprint("Error: --bfile parameter too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  strcpy(memcpya(pgenname, fname_prefix, slen), ".bed");
+	  strcpy(memcpya(psamname, fname_prefix, slen), ".fam");
+	  char* bimname_end = memcpya(pvarname, fname_prefix, slen);
+	  bimname_end = strcpya0(bimname_end, ".bim");
+	  if (param_ct == 2) {
+	    strcpy(bimname_end, ".zst");
+	  }
+	  load_params |= kfLoadParamsPfileAll;
+	} else if (!memcmp(flagname_p2, "pfile", 6)) {
+	  if (load_params || xload) {
+	    // currently only possible with --bcf, --bfile
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t fname_modif_idx = 1;
+	  if (param_ct == 2) {
+	    if (check_extra_param(&(argv[arg_idx]), "vzs", &fname_modif_idx)) {
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	  }
+	  const char* fname_prefix = argv[arg_idx + fname_modif_idx];
+	  const uint32_t slen = strlen(fname_prefix);
+	  if (slen > (kPglFnamesize - 9)) {
+	    logerrprint("Error: --bpfile parameter too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  strcpy(memcpya(pgenname, fname_prefix, slen), ".pgen");
+	  strcpy(memcpya(psamname, fname_prefix, slen), ".fam");
+	  char* bimname_end = memcpya(pvarname, fname_prefix, slen);
+	  bimname_end = strcpya0(bimname_end, ".bim");
+	  if (param_ct == 2) {
+	    strcpy(bimname_end, ".zst");
+	  }
+	  load_params |= kfLoadParamsPfileAll;
+	} else if (!memcmp(flagname_p2, "iallelic-only", 14)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    if (!strcmp(cur_modif, "strict")) {
+	      pc.misc_flags |= kfMiscBiallelicOnlyStrict;
+	    } else if (!strcmp(cur_modif, "list")) {
+	      pc.misc_flags |= kfMiscBiallelicOnlyList;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --biallelic-only parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  pc.misc_flags |= kfMiscBiallelicOnly;
+	  logerrprint("Error: --biallelic-only is not implemented yet.\n");
+	  reterr = kPglRetNotYetSupported;
+	  goto main_ret_1;
+	} else if (!memcmp(flagname_p2, "cf", 3)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct == 2) {
+	    const char* cur_modif = argv[arg_idx + 2];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen < 8) || memcmp(cur_modif, "dosage=", 7)) {
+	      sprintf(g_logbuf, "Error: Invalid --bcf parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	    reterr = cmdline_alloc_string(&(cur_modif[7]), argv[arg_idx], 4095, &vcf_dosage_import_field);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	    if (!is_alphanumeric(vcf_dosage_import_field)) {
+	      logerrprint("Error: --bcf dosage= parameter is not alphanumeric.\n");
+	      goto main_ret_INVALID_CMDLINE;
+	    }
+	    if (!strcmp(vcf_dosage_import_field, "GT")) {
+	      logerrprint("Error: --bcf dosage= parameter cannot be 'GT'.\n");
+	      goto main_ret_INVALID_CMDLINE;
+	    }
+	  }
+	  const char* cur_modif = argv[arg_idx + 1];
+	  const uint32_t slen = strlen(cur_modif);
+	  if (slen > kPglFnamesize - 1) {
+	    logerrprint("Error: --bcf filename too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  memcpy(pgenname, cur_modif, slen + 1);
+	  xload = kfXloadBcf;
+	} else if (!memcmp(flagname_p2, "gen", 4)) {
+	  if (load_params || xload) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 3)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 2; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    if (!strcmp(cur_modif, "snpid-chr")) {
+	      oxford_import_flags |= kfOxfordImportBgenSnpIdChr;
+	    } else if (!strcmp(cur_modif, "ref-first")) {
+	      oxford_import_flags |= kfOxfordImportRefFirst;
+	    } else if (!strcmp(cur_modif, "ref-second")) {
+	      oxford_import_flags |= kfOxfordImportRefSecond;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --bgen parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  const char* cur_fname = argv[arg_idx + 1];
+	  const uint32_t slen = strlen(cur_fname);
+	  if (slen > kPglFnamesize - 1) {
+	    logerrprint("Error: --bgen filename too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  memcpy(pgenname, cur_fname, slen + 1);
+	  xload = kfXloadOxBgen;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+	
+      case 'c':
+	if (!memcmp(flagname_p2, "hr", 3)) {
+	  if (pc.misc_flags & (kfMiscAutosomePar | kfMiscAutosomeOnly)) {
+	    logerrprint("Error: --chr cannot be used with --autosome{-par}.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 0x7fffffff)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = parse_chr_ranges(flagname_p, errstr_append, param_ct, (pc.misc_flags / kfMiscAllowExtraChrs) & 1, 0, '-', &(argv[arg_idx]), &chr_info, chr_info.chr_mask);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  chr_info.is_include_stack = 1;
+	} else if (!memcmp(flagname_p2, "ovar", 5)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = alloc_fname(argv[arg_idx + 1], flagname_p, 0, &pc.covar_fname);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "ovar-name", 10)) {
+	  // can now be used without --covar
+	  reterr = parse_name_ranges(&(argv[arg_idx]), errstr_append, param_ct, 0, range_delim, &pc.covar_range_list);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "onst-fid", 9)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = cmdline_alloc_string(param_ct? argv[arg_idx + 1] : "0", argv[arg_idx], kMaxIdSlen, &const_fid);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	} else if (!memcmp(flagname_p2, "i", 2)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (!scanadv_double(argv[arg_idx + 1], &pc.ci_size)) {
+	    sprintf(g_logbuf, "Error: Invalid --ci parameter '%s'.\n", argv[arg_idx + 1]);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if ((pc.ci_size < 0.01) || (pc.ci_size >= 1.0)) {
+	    logerrprint("Error: --ci confidence interval size must be in [0.01, 1).\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	} else if (!memcmp(flagname_p2, "ondition", 9)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t fname_modif_idx = 1;
+	  if (param_ct == 2) {
+	    if (!strcmp("dominant", argv[arg_idx + 2])) {
+	      pc.glm_info.flags |= kfGlmConditionDominant;
+	    } else if (!strcmp("recessive", argv[arg_idx + 2])) {
+	      pc.glm_info.flags |= kfGlmConditionRecessive;
+	    } else {
+	      fname_modif_idx = 2;
+	      if (!strcmp("dominant", argv[arg_idx + 1])) {
+		pc.glm_info.flags |= kfGlmConditionDominant;
+	      } else if (!strcmp("recessive", argv[arg_idx + 1])) {
+		pc.glm_info.flags |= kfGlmConditionRecessive;
+	      } else {
+		logerrprint("Error: Invalid --condition parameter sequence.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	    }
+	  }
+	  reterr = cmdline_alloc_string(argv[arg_idx + fname_modif_idx], argv[arg_idx], kMaxIdSlen, &pc.glm_info.condition_varname);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	} else if (!memcmp(flagname_p2, "ondition-list", 14)) {
+	  if (pc.glm_info.condition_varname) {
+	    logerrprint("Error: --condition-list cannot be used with --condition.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t fname_modif_idx = 1;
+	  if (param_ct == 2) {
+	    if (!strcmp("dominant", argv[arg_idx + 2])) {
+	      pc.glm_info.flags |= kfGlmConditionDominant;
+	    } else if (!strcmp("recessive", argv[arg_idx + 2])) {
+	      pc.glm_info.flags |= kfGlmConditionRecessive;
+	    } else {
+	      fname_modif_idx = 2;
+	      if (!strcmp("dominant", argv[arg_idx + 1])) {
+		pc.glm_info.flags |= kfGlmConditionDominant;
+	      } else if (!strcmp("recessive", argv[arg_idx + 1])) {
+		pc.glm_info.flags |= kfGlmConditionRecessive;
+	      } else {
+		logerrprint("Error: Invalid --condition-list parameter sequence.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	    }
+	  }
+	  reterr = alloc_fname(argv[arg_idx + fname_modif_idx], flagname_p, 0, &pc.glm_info.condition_list_fname);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	} else if (!memcmp(flagname_p2, "ow", 3)) {
+	  if (chr_info.chrset_source) {
+	    logerrprint("Error: Conflicting chromosome-set flags.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  chr_info.chrset_source = kChrsetSourceCmdline;
+	  // initialize here instead of finalize_chrset(), to simplify
+	  // read_chrset_header_line()
+	  chr_info.autosome_ct = 29;
+	  chr_info.xymt_codes[0] = 30;
+	  chr_info.xymt_codes[1] = 31;
+	  chr_info.xymt_codes[2] = -2;
+	  chr_info.xymt_codes[3] = 33;
+	  chr_info.xymt_codes[4] = -2;
+	  chr_info.xymt_codes[5] = -2;
+	  chr_info.haploid_mask[0] = 0xc0000000U;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "hr-set", 7)) {
+	  if (chr_info.chrset_source) {
+	    logerrprint("Error: Conflicting chromosome-set flags.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 5)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* cur_modif = argv[arg_idx + 1];
+	  int32_t signed_autosome_ct;
+	  if (scan_int_abs_bounded(cur_modif, kMaxChrTextnum, &signed_autosome_ct) || (!signed_autosome_ct)) {
+	    sprintf(g_logbuf, "Error: Invalid --chr-set parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  // see plink2_common finalize_chrset()
+	  chr_info.chrset_source = kChrsetSourceCmdline;
+	  chr_info.haploid_mask[0] = 0;
+	  if (signed_autosome_ct < 0) {
+	    // haploid
+	    if (param_ct > 1) {
+	      logerrprint("Error: --chr-set does not accept multiple parameters in haploid mode.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    const uint32_t autosome_ct = -signed_autosome_ct;
+	    chr_info.autosome_ct = autosome_ct;
+	    for (uint32_t xymt_idx = 0; xymt_idx < kChrOffsetCt; ++xymt_idx) {
+	      chr_info.xymt_codes[xymt_idx] = -2;
+	    }
+	    fill_all_bits(autosome_ct + 1, chr_info.haploid_mask);
+	  } else {
+	    const uint32_t autosome_ct = signed_autosome_ct;
+	    chr_info.autosome_ct = autosome_ct;
+	    // assumes first four codes are x, y, xy, mt
+	    for (uint32_t xymt_idx = 0; xymt_idx < 4; ++xymt_idx) {
+	      chr_info.xymt_codes[xymt_idx] = autosome_ct + 1 + xymt_idx;
+	    }
+	    for (uint32_t xymt_idx = 4; xymt_idx < kChrOffsetCt; ++xymt_idx) {
+	      chr_info.xymt_codes[xymt_idx] = -2;
+	    }
+	    set_bit(autosome_ct + 1, chr_info.haploid_mask);
+	    set_bit(autosome_ct + 2, chr_info.haploid_mask);
+	    for (uint32_t param_idx = 2; param_idx <= param_ct; ++param_idx) {
+	      cur_modif = argv[arg_idx + param_idx];
+	      if (!strcmp(cur_modif, "no-x")) {
+		chr_info.xymt_codes[0] = -2;
+		clear_bit(autosome_ct + 1, chr_info.haploid_mask);
+	      } else if (!strcmp(cur_modif, "no-y")) {
+		chr_info.xymt_codes[1] = -2;
+		clear_bit(autosome_ct + 2, chr_info.haploid_mask);
+	      } else if (!strcmp(cur_modif, "no-xy")) {
+		chr_info.xymt_codes[2] = -2;
+	      } else if (!strcmp(cur_modif, "no-mt")) {
+		chr_info.xymt_codes[3] = -2;
+	      } else {
+		sprintf(g_logbuf, "Error: Invalid --chr-set parameter '%s'.\n", cur_modif);
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	    }
+	  }
+	} else if (!memcmp(flagname_p2, "hr-override", 12)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if (param_ct) {
+	    const char* cur_modif = argv[arg_idx + 1];
+	    if (!strcmp(cur_modif, "file")) {
+	      pc.misc_flags |= kfMiscChrOverrideFile;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --chr-override parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  } else {
+	    pc.misc_flags |= kfMiscChrOverrideCmdline;
+	  }
+	} else if (!memcmp(flagname_p2, "ovar-quantile-normalize", 24)) {
+	  if (param_ct) {
+	    reterr = alloc_and_flatten(&(argv[arg_idx + 1]), param_ct, 0x7fffffff, &pc.covar_quantnorm_flattened);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	  }
+	  pc.pheno_transform_flags |= kfPhenoTransformQuantnormCovar;
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "ovar-variance-standardize", 26)) {
+	  if (param_ct) {
+	    reterr = alloc_and_flatten(&(argv[arg_idx + 1]), param_ct, 0x7fffffff, &pc.vstd_flattened);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	  }
+	  pc.pheno_transform_flags |= kfPhenoTransformVstdCovar;
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+

+      case 'd':
+	if (!memcmp(flagname_p2, "ouble-id", 9)) {
+	  if (const_fid) {
+	    logerrprint("Error: --double-id cannot be used with --const-fid.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  pc.misc_flags |= kfMiscDoubleId;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "ebug", 5)) {
+	  g_debug_on = 1;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "ata", 4)) {
+	  if (load_params || (xload & (~kfXloadOxBgen))) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 3)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t is_gzs = 0;
+	  for (uint32_t param_idx = 2; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    if (!strcmp(cur_modif, "ref-first")) {
+	      oxford_import_flags |= kfOxfordImportRefFirst;
+	    } else if (!strcmp(cur_modif, "ref-second")) {
+	      oxford_import_flags |= kfOxfordImportRefSecond;
+	    } else if (!strcmp(cur_modif, "gzs")) {
+	      if (xload & kfXloadOxBgen) {
+		// may as well permit e.g. --data ref-first + --bgen
+		logerrprint("Error: --data 'gzs' modifier cannot be used with .bgen input.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      is_gzs = 1;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --data parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  const char* fname_prefix = argv[arg_idx + 1];
+	  const uint32_t slen = strlen(fname_prefix);
+	  if (slen > (kPglFnamesize - 9)) {
+	    logerrprint("Error: --data parameter too long.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  if (!(xload & kfXloadOxBgen)) {
+	    // allow --bgen to override this
+	    char* genname_end = memcpya(pgenname, fname_prefix, slen);
+	    genname_end = strcpya0(genname_end, ".gen");
+	    if (is_gzs) {
+	      strcpy(genname_end, ".zst");
+	    }
+	    xload |= kfXloadOxGen;
+	  }
+	  strcpy(memcpya(psamname, fname_prefix, slen), ".sample");
+	  xload |= kfXloadOxSample;
+	} else if (!memcmp(flagname_p2, "osage-erase-threshold", 22)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  double dosage_erase_frac;
+	  if ((!scanadv_double(cur_modif, &dosage_erase_frac)) || (dosage_erase_frac < 0.0) || (dosage_erase_frac >= (0.5 - kSmallEpsilon))) {
+	    sprintf(g_logbuf, "Error: Invalid --dosage-erase-threshold parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
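+	  // e.g. a 0.1 threshold maps to ~kDosageMid/10 dosage units; the
+	  // (1 + kSmallEpsilon) factor keeps exact decimal inputs from being
+	  // rounded down.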
+	  pc.dosage_erase_thresh = (int32_t)(dosage_erase_frac * ((1 + kSmallEpsilon) * kDosageMid));
+	} else if (!memcmp(flagname_p2, "ummy", 5)) {
+	  if (load_params || xload) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 2, 8)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  // todo: support --allow-no-samples/--allow-no-vars
+	  if (scan_posint_defcap(argv[arg_idx + 1], &gendummy_info.sample_ct)) {
+	    logerrprint("Error: Invalid --dummy sample count.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (scan_posint_defcap(argv[arg_idx + 2], &gendummy_info.variant_ct)) {
+	    logerrprint("Error: Invalid --dummy SNP count.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  uint32_t extra_numeric_param_ct = 0;
+	  for (uint32_t param_idx = 3; param_idx <= param_ct; ++param_idx) {
+	    char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 4) && match_upper_counted(cur_modif, "ACGT", 4)) {
+	      gendummy_info.flags |= kfGenDummyAcgt;
+	    } else if ((cur_modif_slen == 4) && (!memcmp(cur_modif, "1234", 4))) {
+	      gendummy_info.flags |= kfGenDummy1234;
+	    } else if ((cur_modif_slen == 2) && (!memcmp(cur_modif, "12", 2))) {
+	      gendummy_info.flags |= kfGenDummy12;
+	    } else if ((cur_modif_slen > 9) && (!memcmp(cur_modif, "pheno-ct=", 9))) {
+	      const char* pheno_ct_start = &(cur_modif[9]);
+	      if (scan_uint_capped(pheno_ct_start, kMaxPhenoCt, &gendummy_info.pheno_ct)) {
+		sprintf(g_logbuf, "Error: Invalid --dummy pheno-ct= parameter '%s'.\n", pheno_ct_start);
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	    } else if ((cur_modif_slen == 12) && (!memcmp(cur_modif, "scalar-pheno", 12))) {
+	      gendummy_info.flags |= kfGenDummyScalarPheno;
+	    } else if ((cur_modif_slen > 12) && (!memcmp(cur_modif, "dosage-freq=", 12))) {
+	      char* dosage_freq_start = &(cur_modif[12]);
+	      double dxx;
+	      if ((!scanadv_double(dosage_freq_start, &dxx)) || (dxx < 0.0) || (dxx > 1.0)) {
+		sprintf(g_logbuf, "Error: Invalid --dummy dosage-freq= parameter '%s'.\n", dosage_freq_start);
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	      gendummy_info.dosage_freq = dxx;
+	    } else {
+	      double dxx;
+	      if ((extra_numeric_param_ct == 2) || (!scanadv_double(cur_modif, &dxx)) || (dxx < 0.0) || (dxx > 1.0)) {
+		sprintf(g_logbuf, "Error: Invalid --dummy parameter '%s'.\n", cur_modif);
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	      if (!extra_numeric_param_ct) {
+		gendummy_info.geno_mfreq = dxx;
+	      } else {
+		gendummy_info.pheno_mfreq = dxx;
+	      }
+	      ++extra_numeric_param_ct;
+	    }
+	  }
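+	  // Illustrative invocation: "--dummy 100 1000 0.02 0.05 acgt"
+	  // requests 100 samples, 1000 variants, 2% missing genotypes, 5%
+	  // missing phenotypes, and ACGT allele codes.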
+	  const uint32_t mutually_exclusive_flags = gendummy_info.flags & (kfGenDummyAcgt | kfGenDummy1234 | kfGenDummy12);
+	  if (mutually_exclusive_flags & (mutually_exclusive_flags - 1)) {
+	    logerrprint("Error: --dummy 'acgt', '1234', and '12' modifiers are mutually exclusive.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  xload |= kfXloadGenDummy;
+	} else if (!memcmp(flagname_p2, "ummy-coding", 12)) {
+	  logerrprint("Error: --dummy-coding is retired.  Use --split-cat-pheno instead.\n");
+	  goto main_ret_INVALID_CMDLINE_A;
+	} else if (!memcmp(flagname_p2, "og", 3)) {
+	  if (chr_info.chrset_source) {
+	    logerrprint("Error: Conflicting chromosome-set flags.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  chr_info.chrset_source = kChrsetSourceCmdline;
+	  chr_info.autosome_ct = 38;
+	  chr_info.xymt_codes[0] = 39;
+	  chr_info.xymt_codes[1] = 40;
+	  chr_info.xymt_codes[2] = 41;
+	  chr_info.xymt_codes[3] = 42;
+	  chr_info.xymt_codes[4] = -2;
+	  chr_info.xymt_codes[5] = -2;
+#ifdef __LP64__
+	  chr_info.haploid_mask[0] = 0x18000000000LLU;
+#else
+	  chr_info.haploid_mask[0] = 0;
+	  chr_info.haploid_mask[1] = 0x180;
+#endif
+	  goto main_param_zero;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'e':
+	if (!memcmp(flagname_p2, "xtract", 7)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 0x7fffffff)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const uint32_t is_range = !strcmp(argv[arg_idx + 1], "range");
+	  if (is_range) {
+	    if (param_ct == 1) {
+	      logerrprint("Error: '--extract range' requires at least one filename.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    pc.misc_flags |= kfMiscExtractRange;
+	    pc.filter_flags |= kfFilterNoSplitChr;
+	  }
+	  reterr = alloc_and_flatten(&(argv[arg_idx + 1 + is_range]), param_ct - is_range, kPglFnamesize, &pc.extract_fnames);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPvarReq;
+	} else if (!memcmp(flagname_p2, "xclude", 7)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 0x7fffffff)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const uint32_t is_range = !strcmp(argv[arg_idx + 1], "range");
+	  if (is_range) {
+	    if (param_ct == 1) {
+	      logerrprint("Error: '--exclude range' requires at least one filename.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    pc.misc_flags |= kfMiscExcludeRange;
+	    pc.filter_flags |= kfFilterNoSplitChr;
+	  }
+	  reterr = alloc_and_flatten(&(argv[arg_idx + 1 + is_range]), param_ct - is_range, kPglFnamesize, &pc.exclude_fnames);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPvarReq;
+	} else if ((!memcmp(flagname_p2, "xport", 6)) || (!memcmp(flagname_p2, "xport ped", 10))) {
+	  // todo: determine actual limit
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 50)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t format_param_idxs = 0;
+	  if (!flagname_p2[5]) {
+	    get_exportf_targets(&(argv[arg_idx]), param_ct, &pc.exportf_modifier, &pc.exportf_id_paste, &format_param_idxs);
+	    if (!format_param_idxs) {
+	      logerrprint("Error: --export requires at least one output format.  (Did you forget 'ped' or\n'vcf'?)\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    logprintb();
+	  } else {
+	    pc.exportf_modifier = kfExportfPed;
+	  }
+	  // can't have e.g. bgen-1.1 and bgen-1.2 simultaneously, since they
+	  // have the same extension and different content.
+	  const uint64_t bgen_flags = (uint64_t)(pc.exportf_modifier & (kfExportfBgen11 | kfExportfBgen12 | kfExportfBgen13));
+	  if (bgen_flags & (bgen_flags - 1)) {
+	    logerrprint("Error: Multiple --export bgen versions.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  if ((pc.exportf_modifier & (kfExportfHaps | kfExportfHapsLegend)) == (kfExportfHaps | kfExportfHapsLegend)) {
+	    logerrprint("Error: 'haps' and 'hapslegend' formats cannot be exported simultaneously.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    // could use next_unset()...
+	    if ((format_param_idxs >> param_idx) & 1) {
+	      continue;
+	    }
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if (cur_modif_slen > 9) {
+	      if (!memcmp(cur_modif, "id-paste=", 9)) {
+		if (!(pc.exportf_modifier & (kfExportfVcf | kfExportfBgen12 | kfExportfBgen13))) {
+		  // todo: bcf
+		  logerrprint("Error: The 'id-paste' modifier only applies to --export's vcf, bgen-1.2, and\nbgen-1.3 output formats.\n");
+		  goto main_ret_INVALID_CMDLINE_A;
+		}
+		if (pc.exportf_id_paste) {
+		  logerrprint("Error: Multiple --export id-paste= modifiers.\n");
+		  goto main_ret_INVALID_CMDLINE;
+		}
+		reterr = parse_col_descriptor(&(cur_modif[9]), "fid\0iid\0maybesid\0sid\0", "export", kfIdpasteFid, kfIdpasteDefault, 1, &pc.exportf_id_paste);
+		if (reterr) {
+		  goto main_ret_1;
+		}
+	      } else if (!memcmp(cur_modif, "id-delim=", 9)) {
+		if (!(pc.exportf_modifier & (kfExportfVcf | kfExportfBgen12 | kfExportfBgen13))) {
+		  logerrprint("Error: The 'id-delim' modifier only applies to --export's vcf, bgen-1.2, and\nbgen-1.3 output formats.\n");
+		  goto main_ret_INVALID_CMDLINE_A;
+		}
+		if (pc.exportf_id_delim) {
+		  logerrprint("Error: Multiple --export id-delim= modifiers.\n");
+		  goto main_ret_INVALID_CMDLINE;
+		}
+		pc.exportf_id_delim = extract_char_param(&(cur_modif[9]));
+		if (!pc.exportf_id_delim) {
+		  logerrprint("Error: --export id-delim= value must be a single character.\n");
+		  goto main_ret_INVALID_CMDLINE_A;
+		}
+		if ((((unsigned char)pc.exportf_id_delim) < ' ') || (pc.exportf_id_delim == '0')) {
+		  logerrprint("Error: --export id-delim= value cannot be tab, newline, '0', or a nonprinting\ncharacter.\n");
+		  goto main_ret_INVALID_CMDLINE;
+		}
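+		// e.g. "--export vcf id-paste=fid,iid id-delim=_" would emit
+		// sample IDs of the form FID_IID (illustrative).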
+	      } else if (!memcmp(cur_modif, "vcf-dosage=", 11)) {
+		if (!(pc.exportf_modifier & kfExportfVcf)) {
+		  logerrprint("Error: The 'vcf-dosage' modifier only applies to --export's vcf output format.\n");
+		  goto main_ret_INVALID_CMDLINE_A;
+		}
+		if (pc.exportf_modifier & (kfExportfVcfDosageGp | kfExportfVcfDosageDs)) {
+		  logerrprint("Error: Multiple --export vcf-dosage= modifiers.\n");
+		  goto main_ret_INVALID_CMDLINE;
+		}
+		const char* vcf_dosage_start = &(cur_modif[11]);
+		if (!strcmp(vcf_dosage_start, "GP")) {
+		  pc.exportf_modifier |= kfExportfVcfDosageGp;
+		} else if (!strcmp(vcf_dosage_start, "DS")) {
+		  pc.exportf_modifier |= kfExportfVcfDosageDs;
+		} else {
+		  sprintf(g_logbuf, "Error: Invalid --export vcf-dosage= parameter '%s'.\n", vcf_dosage_start);
+		  goto main_ret_INVALID_CMDLINE_WWA;
+		}
+	      } else if (!memcmp(cur_modif, "bits=", 5)) {
+		if (!(pc.exportf_modifier & (kfExportfBgen12 | kfExportfBgen13))) {
+		  logerrprint("Error: The 'bits' modifier only applies to --export's bgen-1.2 and bgen-1.3\noutput formats.\n");
+		  goto main_ret_INVALID_CMDLINE_A;
+		}
+		if (pc.exportf_bits) {
+		  logerrprint("Error: Multiple --export bits= modifiers.\n");
+		  goto main_ret_INVALID_CMDLINE;
+		}
+		const char* bits_start = &(cur_modif[5]);
+		if (scan_posint_capped(bits_start, 32, &pc.exportf_bits)) {
+		  sprintf(g_logbuf, "Error: Invalid --export bits= parameter '%s'.\n", bits_start);
+		  goto main_ret_INVALID_CMDLINE_WWA;
+		}
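+		// e.g. 'bits=8' requests 8-bit genotype probabilities; the
+		// cap of 32 matches the bgen-1.2/1.3 range.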
+	      } else if ((cur_modif_slen == 11) && (!memcmp(cur_modif, "include-alt", 11))) {
+		if (!(pc.exportf_modifier & (kfExportfA | kfExportfAD))) {
+		  logerrprint("Error: The 'include-alt' modifier only applies to --export's A and AD output\nformats.\n");
+		  goto main_ret_INVALID_CMDLINE_A;
+		}
+		pc.exportf_modifier |= kfExportfIncludeAlt;
+	      } else if ((cur_modif_slen == 14) && (!memcmp(cur_modif, "omit-nonmale-y", 14))) {
+		if (!(pc.exportf_modifier & (kfExportfList | kfExportfRlist))) {
+		  logerrprint("Error: The 'omit-nonmale-y' modifier only applies to --export's list and rlist\noutput formats.\n");
+		  goto main_ret_INVALID_CMDLINE_A;
+		}
+		pc.exportf_modifier |= kfExportfOmitNonmaleY;
+	      }
+	    } else if ((cur_modif_slen == 2) && ((!memcmp(cur_modif, "01", 2)) || (!memcmp(cur_modif, "12", 2)))) {
+	      if (pc.exportf_modifier & (kfExportfA | kfExportfAD)) {
+		sprintf(g_logbuf, "Error: The '%s' modifier does not apply to --export's A and AD output formats.\n", cur_modif);
+		goto main_ret_INVALID_CMDLINE_2A;
+	      }
+	      if (pc.exportf_modifier & kfExportfVcf) {
+		logerrprint("Error: '01'/'12' cannot be used with --export's vcf output format.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      if (cur_modif[0] == '0') {
+		if (pc.exportf_modifier & kfExportf12) {
+		  logerrprint("Error: --export '01' and '12' cannot be used together.\n");
+		  goto main_ret_INVALID_CMDLINE;
+		}
+		pc.exportf_modifier |= kfExportf01;
+	      } else {
+		if (pc.exportf_modifier & kfExportf01) {
+		  logerrprint("Error: --export '01' and '12' cannot be used together.\n");
+		  goto main_ret_INVALID_CMDLINE;
+		}
+		pc.exportf_modifier |= kfExportf12;
+	      }
+	    } else if ((cur_modif_slen == 3) && (!memcmp(cur_modif, "bgz", 3))) {
+	      if (!(pc.exportf_modifier & kfExportfVcf)) {
+		logerrprint("Error: The 'bgz' modifier only applies to --export's vcf output format.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.exportf_modifier |= kfExportfBgz;
+	    } else if ((cur_modif_slen == 6) && (!memcmp(cur_modif, "spaces", 6))) {
+	      pc.exportf_modifier |= kfExportfSpaces;
+	    } else if ((cur_modif_slen == 9) && (!memcmp(cur_modif, "ref-first", 9))) {
+	      pc.exportf_modifier |= kfExportfRefFirst;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --export parameter '%s'.%s\n", cur_modif, ((param_idx == param_ct) && (!outname_end))? " (Did you forget '--out'?)" : "");
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  if (pc.exportf_modifier & (kfExportfVcf | kfExportfBgen12 | kfExportfBgen13)) {
+	    if (!pc.exportf_id_paste) {
+	      pc.exportf_id_paste = kfIdpasteDefault;
+	    }
+	  }
+	  pc.command_flags1 |= kfCommand1Exportf;
+	} else if (!memcmp(flagname_p2, "xclude-snp", 11)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = cmdline_alloc_string(argv[arg_idx + 1], argv[arg_idx], kMaxIdSlen, &pc.varid_exclude_snp);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPvarReq;
+	} else if (!memcmp(flagname_p2, "xclude-snps", 12)) {
+	  reterr = parse_name_ranges(&(argv[arg_idx]), errstr_append, param_ct, 0, range_delim, &pc.exclude_snps_range_list);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPvarReq;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'f':
+	if (!memcmp(flagname_p2, "req", 4)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 5)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t bins_only = 0;
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 2) && (!memcmp(cur_modif, "zs", 2))) {
+	      pc.allele_freq_modifier |= kfAlleleFreqZs;
+	    } else if ((cur_modif_slen == 6) && (!memcmp(cur_modif, "counts", 6))) {
+	      pc.allele_freq_modifier |= kfAlleleFreqCounts;
+	    } else if ((cur_modif_slen == 12) && (!memcmp(cur_modif, "case-control", 12))) {
+	      logerrprint("Error: --freq 'case-control' modifier has been retired.  Use\n--keep-if/--remove-if in conjunction with Unix text-processing utilities\ninstead.\n");
+	    } else if ((cur_modif_slen > 5) && (!memcmp(cur_modif, "cols=", 5))) {
+	      if (pc.allele_freq_modifier & kfAlleleFreqColAll) {
+		logerrprint("Error: Multiple --freq cols= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      reterr = parse_col_descriptor(&(cur_modif[5]), "chrom\0pos\0ref\0alt1\0alt\0reffreq\0alt1freq\0altfreq\0freq\0eq\0eqz\0alteq\0alteqz\0numeq\0altnumeq\0machr2\0nobs\0", "freq", kfAlleleFreqColChrom, kfAlleleFreqColDefault, 1, &pc.allele_freq_modifier);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	      const uint32_t mutually_exclusive_cols = pc.allele_freq_modifier & kfAlleleFreqColMutex;
+	      if (mutually_exclusive_cols & (mutually_exclusive_cols - 1)) {
+		logerrprint("Error: --freq's altfreq, freq, eq, eqz, alteq, alteqz, numeq, and altnumeq\ncolumns are mutually exclusive.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	    } else if ((cur_modif_slen == 9) && (!memcmp(cur_modif, "bins-only", 9))) {
+	      bins_only = 1;
+	    } else if (((cur_modif_slen > 8) && (!memcmp(cur_modif, "refbins=", 8))) || ((cur_modif_slen > 13) && (!memcmp(cur_modif, "refbins-file=", 13))) || ((cur_modif_slen > 9) && (!memcmp(cur_modif, "alt1bins=", 9))) || ((cur_modif_slen > 14) && (!memcmp(cur_modif, "alt1bins-file=", 14)))) {
+	      const uint32_t is_alt1 = (cur_modif[0] == 'a');
+	      char** binstr_ptr = is_alt1? (&pc.freq_alt1_binstr) : (&pc.freq_ref_binstr);
+	      if (*binstr_ptr) {
+		LOGERRPRINTF("Error: Multiple --freq %sbins{-file}= modifiers.\n", is_alt1? "alt1" : "ref");
+		goto main_ret_INVALID_CMDLINE;
+	      }
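+	      // '=' sits at offset 7 in "refbins=" and offset 8 in
+	      // "alt1bins=", so this test separates the inline bin list from
+	      // the refbins-file=/alt1bins-file= forms.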
+	      if (cur_modif[7 + is_alt1] == '=') {
+		reterr = cmdline_alloc_string(&(cur_modif[8 + is_alt1]), is_alt1? "--freq alt1bins=" : "--freq refbins=", 0x7fffffff, binstr_ptr);
+	      } else {
+		pc.allele_freq_modifier |= is_alt1? kfAlleleFreqBinsAlt1Fname : kfAlleleFreqBinsRefFname;
+		reterr = alloc_fname(&(cur_modif[13 + is_alt1]), is_alt1? "freq alt1bins-file=" : "freq refbins-file=", 0, binstr_ptr);
+	      }
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --freq parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  if (bins_only) {
+	    if ((!pc.freq_ref_binstr) && (!pc.freq_alt1_binstr)) {
+	      logerrprint("Error: --freq 'bins-only' must be used with 'refbins{-file}=' and/or\n'alt1bins{-file}='.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    if (pc.allele_freq_modifier & (kfAlleleFreqZs | kfAlleleFreqColAll)) {
+	      logerrprint("Error: --freq 'bins-only' cannot be used with 'zs' or 'cols=' (which only\naffect the main report).\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    pc.allele_freq_modifier |= kfAlleleFreqBinsOnly;
+	  }
+	  if (!(pc.allele_freq_modifier & kfAlleleFreqColAll)) {
+	    pc.allele_freq_modifier |= kfAlleleFreqColDefault;
+	  }
+	  pc.command_flags1 |= kfCommand1AlleleFreq;
+	} else if (!memcmp(flagname_p2, "rom", 4)) {
+	  if (chr_info.is_include_stack) {
+	    logerrprint("Error: --from/--to cannot be used with --autosome{-par} or --{not-}chr.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = cmdline_alloc_string(argv[arg_idx + 1], argv[arg_idx], kMaxIdSlen, &pc.varid_from);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPvarReq | kfFilterNoSplitChr;
+	} else if ((!memcmp(flagname_p2, "rom-bp", 7)) || (!memcmp(flagname_p2, "rom-kb", 7)) || (!memcmp(flagname_p2, "rom-mb", 7))) {
+	  if (!cmdline_single_chr(&chr_info, pc.misc_flags)) {
+	    logerrprint("Error: --from-bp/-kb/-mb and --to-bp/-kb/-mb must be used with --chr, and only\none chromosome.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (pc.from_bp != -1) {
+	    logerrprint("Error: Multiple --from-bp/-kb/-mb values.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  // permit negative numbers, to simplify shell script windowing logic
+	  char* cur_modif = argv[arg_idx + 1];
+	  double dxx;
+	  if (!scanadv_double(cur_modif, &dxx)) {
+	    sprintf(g_logbuf, "Error: Invalid --from-bp/-kb/-mb parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  const char unit_char = flagname_p2[4];
+	  if (unit_char == 'k') {
+	    dxx *= 1000;
+	  } else if (unit_char == 'm') {
+	    dxx *= 1000000;
+	  }
+	  if (dxx <= 0.0) {
+	    pc.from_bp = 0;
+	  } else {
+	    // logical to round up rather than down here (this is actually a
+	    // change from v1.9)
+	    // don't use ceil() since e.g. ceil(0.001015 * 1000000) is 1016
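+	    // e.g. --from-mb 0.001015: the product is stored as a double a
+	    // hair above 1015, so 1 + (int32_t)(dxx * (1 - kSmallEpsilon))
+	    // recovers 1015 where ceil() would return 1016.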
+	    if (dxx > 2147483646.0) {
+	      LOGERRPRINTF("Error: --from-bp/-kb/-mb parameter '%s' too large.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    pc.from_bp = 1 + (int32_t)(dxx * (1 - kSmallEpsilon));
+	  }
+	  pc.filter_flags |= kfFilterPvarReq | kfFilterNoSplitChr;
+	} else if (!memcmp(flagname_p2, "orce-intersect", 15)) {
+	  permit_multiple_inclusion_filters = 1;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "amily", 6)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct) {
+	    const char* cur_modif = argv[arg_idx + 1];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if (is_reserved_pheno_name(cur_modif, cur_modif_slen)) {
+	      sprintf(g_logbuf, "Error: '%s' cannot be used as a categorical phenotype name.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_2A;
+	    }
+	    reterr = cmdline_alloc_string(cur_modif, argv[arg_idx], kMaxIdSlen, &pc.catpheno_name);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	  }
+	  pc.misc_flags |= kfMiscCatPhenoFamily;
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "amily-missing-catname", 22)) {
+	  if (!(pc.misc_flags & kfMiscCatPhenoFamily)) {
+	    logerrprint("Error: --family-missing-catname must be used with --family.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = cmdline_alloc_string(argv[arg_idx + 1], argv[arg_idx], kMaxIdSlen, &pc.family_missing_catname);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	} else if ((!memcmp(flagname_p2, "ilter-cases", 12)) || (!memcmp(flagname_p2, "ilter-controls", 15))) {
+	  logerrprint("Error: --filter-cases and --filter-controls have been retired.  Use\n--keep-if/--remove-if instead.\n");
+	  goto main_ret_INVALID_CMDLINE_A;
+	} else if ((!memcmp(flagname_p2, "rqx", 4)) || (!memcmp(flagname_p2, "reqx", 5))) {
+	  logerrprint("Error: --freqx has been retired.  Use --geno-counts instead.\n");
+	  goto main_ret_INVALID_CMDLINE_A;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'g':
+	if (!memcmp(flagname_p2, "eno", 4)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t geno_thresh_present = 0;
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    char* cur_modif = argv[arg_idx + param_idx];
+	    if (!strcmp(cur_modif, "dosage")) {
+	      pc.misc_flags |= kfMiscGenoDosage;
+	    } else if (!strcmp(cur_modif, "hh-missing")) {
+	      pc.misc_flags |= kfMiscGenoHhMissing;
+	    } else if (geno_thresh_present) {
+	      logerrprint("Error: Invalid --geno parameter sequence.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    } else if (!scanadv_double(cur_modif, &pc.geno_thresh)) {
+	      sprintf(g_logbuf, "Error: Invalid --geno parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    } else if ((pc.geno_thresh < 0.0) || (pc.geno_thresh > 1.0)) {
+	      sprintf(g_logbuf, "Error: Invalid --geno parameter '%s' (must be in [0, 1]).\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    } else {
+	      geno_thresh_present = 1;
+	    }
+	  }
+	  if (!geno_thresh_present) {
+	    pc.geno_thresh = 0.1;
+	  }
+	  if (pc.geno_thresh < 1.0) {
+	    pc.filter_flags |= kfFilterAllReq | kfFilterNoSplitChr;
+	  }
+	} else if (!memcmp(flagname_p2, "eno-counts", 11)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 2) && (!memcmp(cur_modif, "zs", 2))) {
+	      pc.geno_counts_modifier |= kfGenoCountsZs;
+	    } else if ((cur_modif_slen > 5) && (!memcmp(cur_modif, "cols=", 5))) {
+	      if (pc.geno_counts_modifier & kfGenoCountsColAll) {
+		logerrprint("Error: Multiple --geno-counts cols= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      reterr = parse_col_descriptor(&(cur_modif[5]), "chrom\0pos\0ref\0alt1\0alt\0homref\0refalt1\0refalt\0homalt1\0altxy\0xy\0hapref\0hapalt1\0hapalt\0hap\0numeq\0missing\0nobs\0", "geno-counts", kfGenoCountsColChrom, kfGenoCountsColDefault, 1, &pc.geno_counts_modifier);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	      if ((pc.geno_counts_modifier & kfGenoCountsColPairex) == kfGenoCountsColPairex) {
+		logerrprint("Error: --geno-counts's hapaltx and hapx columns are mutually exclusive.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      const uint32_t mutually_exclusive_cols = pc.geno_counts_modifier & kfGenoCountsColMutex;
+	      if (mutually_exclusive_cols & (mutually_exclusive_cols - 1)) {
+		logerrprint("Error: --geno-counts's altxy, xy, and numeq columns are mutually exclusive.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --geno-counts parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  if (!(pc.geno_counts_modifier & kfGenoCountsColAll)) {
+	    pc.geno_counts_modifier |= kfGenoCountsColDefault;
+	  }
+	  pc.command_flags1 |= kfCommand1GenoCounts;
+	} else if (!memcmp(flagname_p2, "lm", 3)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 15)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 2) && (!memcmp(cur_modif, "zs", 2))) {
+	      pc.glm_info.flags |= kfGlmZs;
+	    } else if ((cur_modif_slen == 3) && (!memcmp(cur_modif, "sex", 3))) {
+	      pc.glm_info.flags |= kfGlmSex;
+	    } else if ((cur_modif_slen == 8) && (!memcmp(cur_modif, "no-x-sex", 8))) {
+	      pc.glm_info.flags |= kfGlmNoXSex;
+	    } else if ((cur_modif_slen == 9) && (!memcmp(cur_modif, "genotypic", 9))) {
+	      pc.glm_info.flags |= kfGlmGenotypic;
+	    } else if ((cur_modif_slen == 6) && (!memcmp(cur_modif, "hethom", 6))) {
+	      pc.glm_info.flags |= kfGlmHethom;
+	    } else if ((cur_modif_slen == 8) && (!memcmp(cur_modif, "dominant", 8))) {
+	      pc.glm_info.flags |= kfGlmDominant;
+	    } else if ((cur_modif_slen == 9) && (!memcmp(cur_modif, "recessive", 9))) {
+	      pc.glm_info.flags |= kfGlmRecessive;
+	    } else if ((cur_modif_slen == 11) && (!memcmp(cur_modif, "interaction", 11))) {
+	      pc.glm_info.flags |= kfGlmInteraction;
+	    } else if ((cur_modif_slen == 10) && (!memcmp(cur_modif, "hide-covar", 10))) {
+	      pc.glm_info.flags |= kfGlmHideCovar;
+	    } else if ((cur_modif_slen == 9) && (!memcmp(cur_modif, "intercept", 9))) {
+	      pc.glm_info.flags |= kfGlmIntercept;
+	    } else if ((cur_modif_slen == 14) && (!memcmp(cur_modif, "firth-fallback", 14))) {
+	      pc.glm_info.flags |= kfGlmFirthFallback;
+	    } else if ((cur_modif_slen == 5) && (!memcmp(cur_modif, "firth", 5))) {
+	      pc.glm_info.flags |= kfGlmFirth;
+	    } else if ((cur_modif_slen == 13) && (!memcmp(cur_modif, "standard-beta", 13))) {
+	      logerrprint("Error: --glm 'standard-beta' modifier has been retired.  Use\n--{covar-}variance-standardize instead.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    } else if ((cur_modif_slen == 4) && (!memcmp(cur_modif, "perm", 4))) {
+	      pc.glm_info.flags |= kfGlmPerm;
+	    } else if ((cur_modif_slen == 10) && (!memcmp(cur_modif, "perm-count", 10))) {
+	      pc.glm_info.flags |= kfGlmPermCount;
+	    } else if ((cur_modif_slen > 5) && (!memcmp(cur_modif, "cols=", 5))) {
+	      if (pc.glm_info.cols) {
+		logerrprint("Error: Multiple --glm cols= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      reterr = parse_col_descriptor(&(cur_modif[5]), "chrom\0pos\0ref\0alt1\0alt\0altcount\0totallele\0altcountcc\0totallelecc\0altfreq\0altfreqcc\0machr2\0firth\0test\0nobs\0beta\0orbeta\0se\0ci\0t\0p\0", flagname_p, kfGlmColChrom, kfGlmColDefault, 1, &pc.glm_info.cols);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	      if ((!(pc.glm_info.cols & (kfGlmColBeta | kfGlmColOrbeta))) && ((pc.glm_info.cols & kfGlmColSe) || ((pc.glm_info.cols & kfGlmColCi) && (pc.ci_size != 0)))) {
+		logerrprint("Error: --glm's 'se' and 'ci' columns require beta/orbeta to be included.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	    } else if ((cur_modif_slen >= 5) && (!memcmp(cur_modif, "mperm", 5))) {
+	      if ((cur_modif_slen < 7) || (cur_modif[5] != '=')) {
+		logerrprint("Error: Improper --glm mperm syntax.  (Use --glm mperm=[value]'.)\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      if (scan_posint_defcap(cur_modif, &pc.glm_info.mperm_ct)) {
+		sprintf(g_logbuf, "Error: Invalid --glm mperm parameter '%s'.\n", &(cur_modif[6]));
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	    } else if ((cur_modif_slen > 12) && (!memcmp(cur_modif, "local-covar=", 12))) {
+	      if (pc.glm_local_covar_fname) {
+		logerrprint("Error: Multiple --glm local-covar= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      reterr = alloc_fname(&(cur_modif[12]), "glm local-covar=", 0, &pc.glm_local_covar_fname);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	    } else if ((cur_modif_slen > 10) && ((!memcmp(cur_modif, "local-bim=", 10)) || ((cur_modif_slen > 11) && (!memcmp(cur_modif, "local-pvar=", 11))))) {
+	      if (pc.glm_local_pvar_fname) {
+		logerrprint("Error: Multiple --glm local-pvar= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      const uint32_t is_pvar = (cur_modif[6] == 'p');
+	      reterr = alloc_fname(&(cur_modif[10 + is_pvar]), "glm local-pvar=", 0, &pc.glm_local_pvar_fname);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	    } else if ((cur_modif_slen > 10) && ((!memcmp(cur_modif, "local-fam=", 10)) || ((cur_modif_slen > 11) && (!memcmp(cur_modif, "local-psam=", 11))))) {
+	      if (pc.glm_local_psam_fname) {
+		logerrprint("Error: Multiple --glm local-psam= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      const uint32_t is_psam = (cur_modif[6] == 'p');
+	      reterr = alloc_fname(&(cur_modif[10 + is_psam]), "glm local-psam=", 0, &pc.glm_local_psam_fname);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	    } else if ((cur_modif_slen == 15) && (!memcmp(cur_modif, "local-omit-last", 15))) {
+	      pc.glm_info.flags |= kfGlmLocalOmitLast;
+	    } else if ((cur_modif_slen > 11) && (!memcmp(cur_modif, "local-cats=", 11))) {
+	      if (pc.glm_info.local_cat_ct) {
+		logerrprint("Error: Multiple --glm local-cats= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      if (scan_posint_capped(&(cur_modif[11]), 4095, &pc.glm_info.local_cat_ct) || (pc.glm_info.local_cat_ct == 1)) {
+		logerrprint("Error: Invalid --glm local-cats= category count (must be in [2, 4095]).\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --glm parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  if (!pc.glm_info.cols) {
+	    pc.glm_info.cols = kfGlmColDefault;
+	  }
+	  if ((pc.glm_info.flags & (kfGlmSex | kfGlmNoXSex)) == (kfGlmSex | kfGlmNoXSex)) {
+	    logerrprint("Error: Conflicting --glm parameters.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if ((pc.glm_info.flags & kfGlmPerm) && pc.glm_info.mperm_ct) {
+	    logerrprint("Error: --glm 'perm' and 'mperm=' cannot be used together.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  uint32_t alternate_genotype_col_flags = (uint32_t)(pc.glm_info.flags & (kfGlmGenotypic | kfGlmHethom | kfGlmDominant | kfGlmRecessive));
+	  if (alternate_genotype_col_flags) {
+	    pc.xchr_model = 0;
+	    if (alternate_genotype_col_flags & (alternate_genotype_col_flags - 1)) {
+	      logerrprint("Error: Conflicting --glm parameters.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	  }
+	  if ((pc.glm_info.flags & kfGlmIntercept) && (!(pc.glm_info.cols & kfGlmColTest))) {
+	    logerrprint("Error: --glm 'intercept' modifier cannot be used with an omitted 'test' column.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (!pc.glm_local_covar_fname) {
+	    if (pc.glm_local_pvar_fname || pc.glm_local_psam_fname) {
+	      logerrprint("Error: Either all three --glm local-covar filenames must be specified, or none\nof them.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    if (pc.glm_info.flags & kfGlmLocalOmitLast) {
+	      logerrprint("Error: --glm 'local-omit-last' must be used with 'local-covar='.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    if (pc.glm_info.local_cat_ct) {
+	      logerrprint("Error: --glm 'local-cats=' must be used with 'local-covar='.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	  } else {
+	    if ((!pc.glm_local_pvar_fname) || (!pc.glm_local_psam_fname)) {
+	      logerrprint("Error: Either all three --glm local-covar filenames must be specified, or none\nof them.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	  }
+	  pc.command_flags1 |= kfCommand1Glm;
+	} else if (!memcmp(flagname_p2, "en", 3)) {
+	  if (load_params || (xload & (~kfXloadOxSample))) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct == 2) {
+	    const char* cur_modif = argv[arg_idx + 2];
+	    if (!strcmp(cur_modif, "ref-first")) {
+	      oxford_import_flags |= kfOxfordImportRefFirst;
+	    } else if (!strcmp(cur_modif, "ref-second")) {
+	      oxford_import_flags |= kfOxfordImportRefSecond;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --gen parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  const char* cur_fname = argv[arg_idx + 1];
+	  const uint32_t slen = strlen(cur_fname);
+	  if (slen > kPglFnamesize - 1) {
+	    logerrprint("Error: --gen filename too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  memcpy(pgenname, cur_fname, slen + 1);
+	  xload |= kfXloadOxGen;
+	} else if (!memcmp(flagname_p2, "enotyping-rate", 15)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct) {
+	    const char* cur_modif = argv[arg_idx + 1];
+	    if (strcmp("dosage", cur_modif)) {
+	      sprintf(g_logbuf, "Error: Invalid --genotyping-rate parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	    pc.misc_flags |= kfMiscGenotypingRateDosage;
+	  }
+	  pc.command_flags1 |= kfCommand1GenotypingRate;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'h':
+	if (!memcmp(flagname_p2, "ardy", 5)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 3)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    if (!strcmp(cur_modif, "zs")) {
+	      pc.hardy_modifier |= kfHardyZs;
+	    } else if (!strcmp(cur_modif, "midp")) {
+	      pc.hardy_modifier |= kfHardyMidp;
+	    } else if ((strlen(cur_modif) > 5) && (!memcmp(cur_modif, "cols=", 5))) {
+	      if (pc.hardy_modifier & kfHardyColAll) {
+		logerrprint("Error: Multiple --hardy cols= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      reterr = parse_col_descriptor(&(cur_modif[5]), "chrom\0pos\0ref\0alt1\0alt\0gcounts\0gcount1col\0hetfreq\0sexaf\0femalep\0p\0", "hardy", kfHardyColChrom, kfHardyColDefault, 1, &pc.hardy_modifier);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	      if ((pc.hardy_modifier & (kfHardyColGcounts | kfHardyColGcount1col)) == (kfHardyColGcounts | kfHardyColGcount1col)) {
+		logerrprint("Error: --hardy's gcounts and gcounts1col column sets are mutually exclusive.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --hardy parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  if (!(pc.hardy_modifier & kfHardyColAll)) {
+	    pc.hardy_modifier |= kfHardyColDefault;
+	  }
+	  pc.command_flags1 |= kfCommand1Hardy;
+	} else if (!memcmp(flagname_p2, "we", 3)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 3)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    char* cur_modif = argv[arg_idx + param_idx];
+	    if (!strcmp(cur_modif, "midp")) {
+	      pc.misc_flags |= kfMiscHweMidp;
+	    } else if (!strcmp(cur_modif, "keep-fewhet")) {
+	      pc.misc_flags |= kfMiscHweKeepFewhet;
+	    } else {
+	      if ((pc.hwe_thresh != 1.0) || (!scanadv_double(cur_modif, &pc.hwe_thresh))) {
+		logerrprint("Error: Invalid --hwe parameter sequence.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      if ((pc.hwe_thresh < 0.0) || (pc.hwe_thresh >= 1.0)) {
+		sprintf(g_logbuf, "Error: Invalid --hwe threshold '%s' (must be in [0, 1)).\n", cur_modif);
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	    }
+	  }
+	  if (pc.hwe_thresh == 1.0) {
+	    logerrprint("Error: --hwe requires a p-value threshold.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if ((pc.misc_flags & kfMiscHweMidp) && (pc.hwe_thresh >= 0.5)) {
+	    logerrprint("Error: --hwe threshold must be smaller than 0.5 when using mid-p adjustment.\n");
+	  }
+	  pc.filter_flags |= kfFilterAllReq | kfFilterNoSplitChr;
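+	  // e.g. "--hwe 1e-6 midp keep-fewhet" applies the mid-p adjusted
+	  // test and, per the keep-fewhet modifier, only filters
+	  // heterozygote-excess violations (illustrative).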
+	} else if (!memcmp(flagname_p2, "ard-call-threshold", 19)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  double hard_call_frac;
+	  if ((!scanadv_double(cur_modif, &hard_call_frac)) || (hard_call_frac < 0.0) || (hard_call_frac >= (0.5 - kSmallEpsilon))) {
+	    sprintf(g_logbuf, "Error: Invalid --hard-call-threshold parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
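+	  // same fixed-point conversion as --dosage-erase-threshold above:
+	  // e.g. 0.1 -> ~kDosageMid/10 dosage units.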
+	  pc.hard_call_thresh = (int32_t)(hard_call_frac * ((1 + kSmallEpsilon) * kDosageMid));
+	} else if (!memcmp(flagname_p2, "orse", 5)) {
+	  if (chr_info.chrset_source) {
+	    logerrprint("Error: Conflicting chromosome-set flags.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  chr_info.chrset_source = kChrsetSourceCmdline;
+	  chr_info.autosome_ct = 31;
+	  chr_info.xymt_codes[0] = 32;
+	  chr_info.xymt_codes[1] = 33;
+	  chr_info.xymt_codes[2] = -2;
+	  chr_info.xymt_codes[3] = -2;
+	  chr_info.xymt_codes[4] = -2;
+	  chr_info.xymt_codes[5] = -2;
+#ifdef __LP64__
+	  chr_info.haploid_mask[0] = 0x300000000LLU;
+#else
+	  chr_info.haploid_mask[0] = 0;
+	  chr_info.haploid_mask[1] = 3;
+#endif
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "uman", 5)) {
+	  if (chr_info.chrset_source) {
+	    logerrprint("Error: Conflicting chromosome-set flags.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  chr_info.chrset_source = kChrsetSourceCmdline;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "aps", 4)) {
+	  if (load_params || xload) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct == 2) {
+	    const char* cur_modif = argv[arg_idx + 2];
+	    if (!strcmp(cur_modif, "ref-first")) {
+	      oxford_import_flags |= kfOxfordImportRefFirst;
+	    } else if (!strcmp(cur_modif, "ref-second")) {
+	      oxford_import_flags |= kfOxfordImportRefSecond;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --haps parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  const char* cur_fname = argv[arg_idx + 1];
+	  const uint32_t slen = strlen(cur_fname);
+	  if (slen > kPglFnamesize - 1) {
+	    logerrprint("Error: --haps filename too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  memcpy(pgenname, cur_fname, slen + 1);
+	  xload |= kfXloadOxHaps;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'i':
+	if (!memcmp(flagname_p2, "ndiv-sort", 10)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 3)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* indiv_sort_mode_str = argv[arg_idx + 1];
+	  const char first_char_upcase_match = indiv_sort_mode_str[0] & 0xdf;
+	  const uint32_t is_short_name = (indiv_sort_mode_str[1] == '\0');
+	  if ((is_short_name && (indiv_sort_mode_str[0] == '0')) || (!strcmp(indiv_sort_mode_str, "none"))) {
+	    pc.sample_sort_flags = kfSortNone;
+	  } else if ((is_short_name && (first_char_upcase_match == 'N')) || (!strcmp(indiv_sort_mode_str, "natural"))) {
+	    pc.sample_sort_flags = kfSortNatural;
+	  } else if ((is_short_name && (first_char_upcase_match == 'A')) || (!strcmp(indiv_sort_mode_str, "ascii"))) {
+	    pc.sample_sort_flags = kfSortAscii;
+	  } else if ((is_short_name && (first_char_upcase_match == 'F')) || (!strcmp(indiv_sort_mode_str, "file"))) {
+	    if (param_ct == 1) {
+	      sprintf(g_logbuf, "Error: Missing '--indiv-sort %s' filename.\n", indiv_sort_mode_str);
+	      goto main_ret_INVALID_CMDLINE_2A;
+	    }
+	    pc.sample_sort_flags = kfSortFile;
+	    uint32_t fname_modif_idx = 2;
+	    if (param_ct == 3) {
+	      if (check_extra_param(&(argv[arg_idx]), "sid", &fname_modif_idx)) {
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.sample_sort_flags |= kfSortFileSid;
+	    }
+	    reterr = alloc_fname(argv[arg_idx + fname_modif_idx], flagname_p, 0, &pc.sample_sort_fname);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	  } else {
+	    sprintf(g_logbuf, "Error: '%s' is not a valid mode for --indiv-sort.\n", indiv_sort_mode_str);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if ((param_ct > 1) && (!(pc.sample_sort_flags & kfSortFile))) {
+	    sprintf(g_logbuf, "Error: '--indiv-sort %s' does not accept additional parameters.\n", indiv_sort_mode_str);
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	} else if (!memcmp(flagname_p2, "d-delim", 8)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct) {
+	    id_delim = extract_char_param(argv[arg_idx + 1]);
+	    if (!id_delim) {
+	      logerrprint("Error: --id-delim parameter must be a single character.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    if (((unsigned char)id_delim) < ' ') {
+	      logerrprint("Error: --id-delim parameter cannot be tab, newline, or a nonprinting character.\n");
+	      goto main_ret_INVALID_CMDLINE;
+	    }
+	  } else {
+	    id_delim = '_';
+	  }
+	} else if ((!memcmp(flagname_p2, "ndep-pairwise", 14)) || (!memcmp(flagname_p2, "ndep-pairphase", 15))) {
+	  if (pc.command_flags1 & kfCommand1LdPrune) {
+	    logerrprint("Error: Multiple LD pruning commands.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 2, 4)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
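+	  // Accepted shapes (illustrative): "--indep-pairwise 50 5 0.5",
+	  // "--indep-pairwise 500kb 0.5", "--indep-pairwise 500 kb 1 0.5".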
+	  char* cur_modif = argv[arg_idx + 1];
+	  double first_paramd;
+	  char* first_param_end = scanadv_double(cur_modif, &first_paramd);
+	  if ((!first_param_end) || (first_paramd < 0.0)) {
+	    sprintf(g_logbuf, "Error: Invalid --%s window size '%s'.\n", flagname_p, cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  uint32_t is_kb = 0;
+	  uint32_t next_param_idx = 2;
+	  if (match_upper_counted(first_param_end, "KB", 2) && (!first_param_end[2])) {
+	    is_kb = 1;
+	  } else if (match_upper_counted(argv[arg_idx + 2], "KB", 2) && (!argv[arg_idx + 2][2])) {
+	    is_kb = 1;
+	    next_param_idx = 3;
+	  }
+	  if (is_kb) {
+	    pc.ld_info.prune_modifier |= kfLdPruneWindowBp;
+	    if (first_paramd > 2147483.646) {
+	      pc.ld_info.prune_window_size = 2147483646;
+	    } else {
+	      pc.ld_info.prune_window_size = ((int32_t)(first_paramd * 1000 * (1 + kSmallEpsilon)));
+	      if (pc.ld_info.prune_window_size < 2) {
+		sprintf(g_logbuf, "Error: --%s window size cannot be smaller than 2.\n", flagname_p);
+		goto main_ret_INVALID_CMDLINE_2A;
+	      }
+	    }
+	  } else {
+	    if (first_paramd > 2147483647) {
+	      pc.ld_info.prune_window_size = 2147483647;
+	    } else {
+	      pc.ld_info.prune_window_size = ((int32_t)first_paramd);
+	    }
+	  }
+	  if (next_param_idx + 2 == param_ct) {
+	    sprintf(g_logbuf, "Error: Invalid --%s parameter sequence.\n", flagname_p);
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (next_param_idx < param_ct) {
+	    // explicit step size
+	    cur_modif = argv[arg_idx + next_param_idx];
+	    if (scan_posint_defcap(cur_modif, &pc.ld_info.prune_window_incr)) {
+	      sprintf(g_logbuf, "Error: Invalid --%s window-increment '%s'.\n", flagname_p, cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	    if (!is_kb) {
+	      if (pc.ld_info.prune_window_incr > pc.ld_info.prune_window_size) {
+	        sprintf(g_logbuf, "Error: --%s window-increment cannot be larger than window size.\n", flagname_p);
+	        goto main_ret_INVALID_CMDLINE_2A;
+	      }
+	    } else if (pc.ld_info.prune_window_incr != 1) {
+	      sprintf(g_logbuf, "Error: --%s window-increment must be 1 when window size is in\nkilobase units.\n", flagname_p);
+	      goto main_ret_INVALID_CMDLINE_2A;
+	    }
+	  } else {
+	    pc.ld_info.prune_window_incr = 1;
+	  }
+	  cur_modif = argv[arg_idx + param_ct];
+	  if ((!scanadv_double(cur_modif, &pc.ld_info.prune_last_param)) || (pc.ld_info.prune_last_param < 0.0) || (pc.ld_info.prune_last_param >= 1.0)) {
+	    sprintf(g_logbuf, "Error: Invalid --%s r^2 threshold '%s'.\n", flagname_p2, cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  pc.command_flags1 |= kfCommand1LdPrune;
+	  if (flagname_p2[9] == 'p') {
+	    pc.ld_info.prune_modifier |= kfLdPrunePairphase;
+	  } else {
+	    pc.ld_info.prune_modifier |= kfLdPrunePairwise;
+	  }
+	} else if (!memcmp(flagname_p2, "nput-missing-genotype", 22)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* cur_modif = argv[arg_idx + 1];
+	  input_missing_geno_char = extract_char_param(cur_modif);
+	  if (((unsigned char)input_missing_geno_char) <= ' ') {
+	    sprintf(g_logbuf, "Error: Invalid --input-missing-genotype parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	} else if (!memcmp(flagname_p2, "nput-missing-phenotype", 23)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  double dxx;
+	  if (scan_int32(cur_modif, &pc.missing_pheno) || ((pc.missing_pheno >= 0) && (pc.missing_pheno <= 2)) || (!scanadv_double(cur_modif, &dxx)) || (dxx != ((double)pc.missing_pheno))) {
+	    sprintf(g_logbuf, "Error: Invalid --input-missing-phenotype parameter '%s' (must be an integer in [-2147483647, -1] or [3, 2147483647]).\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
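+	  // e.g. "-9" and "999" are accepted; 0, 1, and 2 are rejected since
+	  // they collide with missing/control/case phenotype encodings.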
+	} else if (!memcmp(flagname_p2, "mport-dosage-certainty", 23)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  if ((!scanadv_double(cur_modif, &import_dosage_certainty)) || (import_dosage_certainty < 0.0) || (import_dosage_certainty > 1.0)) {
+	    sprintf(g_logbuf, "Error: Invalid --import-dosage-certainty parameter '%s' (must be in [0, 1]).\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
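+	  // shrink slightly so exact-boundary comparisons elsewhere don't
+	  // reject inputs due to floating-point noise (assumption).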
+	  import_dosage_certainty *= 1.0 - kSmallEpsilon;
+	} else if (!memcmp(flagname_p2, "mport-dosage", 13)) {
+	  if (load_params || xload) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 10)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t format_num_m1 = 3;
+	  for (uint32_t param_idx = 2; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 8) && (!memcmp(cur_modif, "noheader", 8))) {
+	      plink1_dosage_info.flags |= kfPlink1DosageNoheader;
+	    } else if ((cur_modif_slen > 6) && (!memcmp(cur_modif, "skip", 4)) && (cur_modif[4] >= '0') && (cur_modif[4] <= '2') && (cur_modif[5] == '=')) {
+	      const uint32_t skip_idx = (uint32_t)((unsigned char)cur_modif[4]) - 48;
+	      if (plink1_dosage_info.skips[skip_idx]) {
+		LOGERRPRINTF("Error: Multiple --import-dosage skip%u= modifiers.\n", skip_idx);
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      if (scan_uint_capped(&(cur_modif[6]), kMaxLongLine / 2, &(plink1_dosage_info.skips[skip_idx]))) {
+		sprintf(g_logbuf, "Error: Invalid --import-dosage skip%u= parameter '%s'.\n", skip_idx, &(cur_modif[6]));
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	    } else if ((cur_modif_slen == 5) && (!memcmp(cur_modif, "dose1", 5))) {
+	      plink1_dosage_info.flags |= kfPlink1DosageFormatSingle01;
+	    } else if ((cur_modif_slen == 8) && (!memcmp(cur_modif, "format=", 7))) {
+	      if (format_num_m1 != 3) {
+	        logerrprint("Error: Multiple --import-dosage format= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      format_num_m1 = (uint32_t)((unsigned char)cur_modif[7]) - 49;
+	      if (format_num_m1 >= 3) {
+		sprintf(g_logbuf, "Error: Invalid --import-dosage format= parameter '%c'.\n", cur_modif[7]);
+		goto main_ret_INVALID_CMDLINE_2A;
+	      }
+	    } else if ((cur_modif_slen == 9) && (!memcmp(cur_modif, "ref-first", 9))) {
+	      plink1_dosage_info.flags |= kfPlink1DosageRefFirst;
+	    } else if ((cur_modif_slen == 10) && (!memcmp(cur_modif, "ref-second", 10))) {
+	      plink1_dosage_info.flags |= kfPlink1DosageRefSecond;
+	    } else if ((cur_modif_slen > 11) && (!memcmp(cur_modif, "single-chr=", 11))) {
+	      if (import_single_chr_str) {
+		logerrprint("Error: Multiple --import-dosage single-chr= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      const char* chr_code = &(cur_modif[11]);
+	      if (!(pc.misc_flags & kfMiscAllowExtraChrs)) {
+		if (get_chr_code_raw(chr_code) < 0) {
+		  sprintf(g_logbuf, "Error: Invalid --import-dosage single-chr= chromosome code '%s'. (Did you forget --allow-extra-chr?)\n", chr_code);
+		  goto main_ret_INVALID_CMDLINE_WWA;
+		}
+	      }
+	      reterr = cmdline_alloc_string(chr_code, argv[arg_idx], kMaxIdSlen, &import_single_chr_str);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	    } else if ((cur_modif_slen > 12) && (!memcmp(cur_modif, "chr-col-num=", 12))) {
+	      if (plink1_dosage_info.chr_col_idx != 0xffffffffU) {
+		logerrprint("Error: Multiple --import-dosage chr-col-num= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      uint32_t uii;
+	      if (scan_posint_capped(&(cur_modif[12]), kMaxLongLine / 2, &uii)) {
+		sprintf(g_logbuf, "Error: Invalid --import-dosage chr-col-num= parameter '%s'.\n", &(cur_modif[12]));
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	      plink1_dosage_info.chr_col_idx = uii - 1;
+	    } else if ((cur_modif_slen > 12) && (!memcmp(cur_modif, "pos-col-num=", 12))) {
+	      if (plink1_dosage_info.pos_col_idx != 0xffffffffU) {
+		logerrprint("Error: Multiple --import-dosage pos-col-num= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      uint32_t uii;
+	      if (scan_posint_capped(&(cur_modif[12]), kMaxLongLine / 2, &uii)) {
+		sprintf(g_logbuf, "Error: Invalid --import-dosage pos-col-num= parameter '%s'.\n", &(cur_modif[12]));
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	      plink1_dosage_info.pos_col_idx = uii - 1;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --import-dosage parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+
+	  if (!format_num_m1) {
+	    plink1_dosage_info.flags |= kfPlink1DosageFormatSingle;
+	  } else {
+	    if (plink1_dosage_info.flags & kfPlink1DosageFormatSingle01) {
+	      logerrprint("Error: --import-dosage 'dose1' modifier must be used with 'format=1'.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    if (format_num_m1 == 2) {
+	      plink1_dosage_info.flags |= kfPlink1DosageFormatTriple;
+	    }
+	  }
+	  if ((plink1_dosage_info.flags & (kfPlink1DosageRefFirst | kfPlink1DosageRefSecond)) == (kfPlink1DosageRefFirst | kfPlink1DosageRefSecond)) {
+	    logerrprint("Error: --import-dosage 'ref-first' and 'ref-second' modifiers cannot be used\ntogether.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  const uint32_t id_col_idx = plink1_dosage_info.skips[0];
+	  const uint32_t a1_col_idx = id_col_idx + plink1_dosage_info.skips[1] + 1;
+	  const uint32_t data_col_idx = a1_col_idx + plink1_dosage_info.skips[2] + 2;
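+	  // e.g. with skip0=1, skip1=0, skip2=0: ID in column 1, A1 in
+	  // column 2, A2 in column 3, dosage data from column 4 on
+	  // (0-based, illustrative).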
+	  const uint32_t chr_col_idx = plink1_dosage_info.chr_col_idx;
+	  if (chr_col_idx != 0xffffffffU) {
+	    if (import_single_chr_str) {
+	      logerrprint("Error: --import-dosage 'single-chr=' and 'chr-col-num=' modifiers cannot be\nused together.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    if ((chr_col_idx == id_col_idx) || (chr_col_idx == a1_col_idx) || (chr_col_idx == a1_col_idx + 1)) {
+	      logerrprint("Error: --import-dosage chr-col-num= value collides with another column.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    } else if (chr_col_idx >= data_col_idx) {
+	      logerrprint("Error: --import-dosage chr-col-num= value too large.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	  }
+	  const uint32_t pos_col_idx = plink1_dosage_info.pos_col_idx;
+	  if (pos_col_idx != 0xffffffffU) {
+	    if ((pos_col_idx == id_col_idx) || (pos_col_idx == a1_col_idx) || (pos_col_idx == a1_col_idx + 1) || (pos_col_idx == chr_col_idx)) {
+	      logerrprint("Error: --import-dosage pos-col-num= value collides with another column.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    } else if (pos_col_idx >= data_col_idx) {
+	      logerrprint("Error: --import-dosage pos-col-num= value too large.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  const uint32_t slen = strlen(cur_modif);
+	  if (slen > kPglFnamesize - 1) {
+	    logerrprint("Error: --import-dosage filename too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  memcpy(pgenname, cur_modif, slen + 1);
+	  xload = kfXloadPlink1Dosage;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'k':
+	if (!memcmp(flagname_p2, "eep", 4)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 0x7fffffff)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const uint32_t sid_present = !strcmp(argv[arg_idx + 1], "sid");
+	  if (sid_present) {
+	    if (param_ct == 1) {
+	      logerrprint("Error: '--keep sid' requires at least one filename.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    pc.misc_flags |= kfMiscKeepfileSid;
+	  }
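+	  // alloc_and_flatten presumably joins the filename arguments into a
+	  // single NUL-delimited buffer, so multiple --keep files can sit behind
+	  // one pointer.  (Assumption based on the helper's name and usage.)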
+	  reterr = alloc_and_flatten(&(argv[arg_idx + 1 + sid_present]), param_ct - sid_present, kPglFnamesize, &pc.keep_fnames);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "eep-fam", 8)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 0x7fffffff)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = alloc_and_flatten(&(argv[arg_idx + 1]), param_ct, kPglFnamesize, &pc.keepfam_fnames);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "eep-autoconv", 13)) {
+	  pc.misc_flags |= kfMiscKeepAutoconv;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "eep-females", 12)) {
+	  pc.filter_flags |= kfFilterPsamReq | kfFilterExclMales | kfFilterExclNosex;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "eep-males", 10)) {
+	  pc.filter_flags |= kfFilterPsamReq | kfFilterExclFemales | kfFilterExclNosex;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "eep-nosex", 10)) {
+	  pc.filter_flags |= kfFilterPsamReq | kfFilterExclFemales | kfFilterExclMales;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "eep-founders", 13)) {
+	  pc.filter_flags |= kfFilterPsamReq | kfFilterExclNonfounders;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "eep-nonfounders", 16)) {
+	  if (pc.filter_flags & kfFilterExclNonfounders) {
+	    logerrprint("Error: --keep-nonfounders cannot be used with --keep-founders.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq | kfFilterExclFounders;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "ing-cutoff", 11)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct == 2) {
+	    // .king.id, .king.bin appended
+	    reterr = alloc_fname(argv[arg_idx + 1], flagname_p, 9, &king_cutoff_fprefix);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	  }
+	  char* cur_modif = argv[arg_idx + param_ct];
+	  if ((!scanadv_double(cur_modif, &pc.king_cutoff)) || (pc.king_cutoff < 0.0) || (pc.king_cutoff >= 0.5)) {
+	    sprintf(g_logbuf, "Error: Invalid --king-cutoff parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  pc.command_flags1 |= kfCommand1KingCutoff;
+	} else if (!memcmp(flagname_p2, "ing-table-filter", 17)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  if ((!scanadv_double(cur_modif, &pc.king_table_filter)) || (pc.king_table_filter > 0.5)) {
+	    sprintf(g_logbuf, "Error: Invalid --king-table-filter parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	} else if (!memcmp(flagname_p2, "eep-if", 7)) {
+	  reterr = validate_and_alloc_cmp_expr(&(argv[arg_idx + 1]), argv[arg_idx], param_ct, &pc.keep_if_expr);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "eep-cats", 9)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = alloc_fname(argv[arg_idx + 1], flagname_p, 0, &pc.keep_cats_fname);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "eep-cat-names", 14)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 0x7fffffff)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = alloc_and_flatten(&(argv[arg_idx + 1]), param_ct, kMaxIdBlen, &pc.keep_cat_names_flattened);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "eep-cat-pheno", 14)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = cmdline_alloc_string(argv[arg_idx + 1], argv[arg_idx], kMaxIdSlen, &pc.keep_cat_phenoname);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	} else if (!memcmp(flagname_p2, "eep-allele-order", 17)) {
+	  logprint("Note: --keep-allele-order no longer has any effect.\n");
+	  goto main_param_zero;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'l':
+	if (!memcmp(flagname_p2, "ambda", 6)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (!scanadv_double(argv[arg_idx + 1], &pc.adjust_info.lambda)) {
+	    sprintf(g_logbuf, "Error: Invalid --lambda parameter '%s'.\n", argv[arg_idx + 1]);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if (pc.adjust_info.lambda < 1.0) {
+	    logprint("Note: --lambda parameter set to 1.\n");
+	    pc.adjust_info.lambda = 1.0;
+	  }
+	} else if (!memcmp(flagname_p2, "egend", 6)) {
+	  if (load_params || (xload & (~kfXloadOxHaps))) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (!xload) {
+	    logerrprint("Error: --legend must be used with --haps.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 2, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* cur_fname = argv[arg_idx + 1];
+	  uint32_t slen = strlen(cur_fname);
+	  if (slen > kPglFnamesize - 1) {
+	    logerrprint("Error: --legend filename too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  memcpy(pvarname, cur_fname, slen + 1);
+	  const char* chr_code = argv[arg_idx + 2];
+	  if (!(pc.misc_flags & kfMiscAllowExtraChrs)) {
+	    if (get_chr_code_raw(chr_code) < 0) {
+	      sprintf(g_logbuf, "Error: Invalid --legend chromosome code '%s'. (Did you forget --allow-extra-chr?)\n", chr_code);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  reterr = cmdline_alloc_string(chr_code, argv[arg_idx], kMaxIdSlen, &import_single_chr_str);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  xload |= kfXloadOxLegend;
+	} else if (!memcmp(flagname_p2, "oop-assoc", 10)) {
+	  logerrprint("Error: --loop-assoc is retired.  Use --within + --split-cat-pheno instead.\n");
+	  goto main_ret_INVALID_CMDLINE_A;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'm':
+	if (!memcmp(flagname_p2, "emory", 6)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t mb_modif_idx = 1;
+	  if (param_ct == 2) {
+	    if (check_extra_param(&(argv[arg_idx]), "require", &mb_modif_idx)) {
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    memory_require = 1;
+	  }
+	  const char* mb_modif = argv[arg_idx + mb_modif_idx];
+	  if (scan_posintptr(mb_modif, (uintptr_t*)(&malloc_size_mb))) {
+	    sprintf(g_logbuf, "Error: Invalid --memory parameter '%s'.\n", mb_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if (malloc_size_mb < (intptr_t)kBigstackMinMb) {
+	    sprintf(g_logbuf, "Error: Invalid --memory parameter '%s' (minimum %u).\n", mb_modif, kBigstackMinMb);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+#ifndef __LP64__
+	  if (malloc_size_mb > (intptr_t)kMalloc32bitMbMax) {
+	    LOGERRPRINTF("Error: --memory parameter too large for 32-bit version (max %u).\n", kMalloc32bitMbMax);
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+#endif
+	} else if (!memcmp(flagname_p2, "ake-bed", 8)) {
+	  if (pc.exportf_modifier & kfExportfIndMajorBed) {
+	    logerrprint("Error: --make-bed cannot be used with --export ind-major-bed.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 3)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 3) && (!memcmp(cur_modif, "vzs", 3))) {
+	      make_plink2_modifier |= kfMakeBimZs;
+	    } else if ((cur_modif_slen == 9) && (!memcmp(cur_modif, "trim-alts", 9))) {
+	      make_plink2_modifier |= kfMakePlink2TrimAlts;
+	    } else if (((cur_modif_slen > 2) && (!memcmp(cur_modif, "m=", 2))) || ((cur_modif_slen > 14) && (!memcmp(cur_modif, "multiallelics=", 14)))) {
+	      if (make_plink2_modifier & kfMakePlink2MMask) {
+		logerrprint("Error: Multiple --make-bed multiallelics= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
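+	      // multiallelics= modes: '-' splits all multiallelic variants,
+	      // '-snps' splits only SNPs, '+'/'+both' merge both variant
+	      // classes, '+snps' merges only SNPs, and '+any' merges
+	      // unconditionally.  (Descriptions inferred from the mode names.)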
+	      const char* mode_start = (cur_modif[1] == '=')? (&(cur_modif[2])) : (&(cur_modif[14]));
+	      if (!strcmp(mode_start, "-")) {
+		make_plink2_modifier |= kfMakePlink2MSplitAll;
+	      } else if (!strcmp(mode_start, "-snps")) {
+		make_plink2_modifier |= kfMakePlink2MSplitSnps;
+	      } else if ((!strcmp(mode_start, "+")) || (!strcmp(mode_start, "+both"))) {
+		make_plink2_modifier |= kfMakePlink2MMergeBoth;
+	      } else if (!strcmp(mode_start, "+snps")) {
+		make_plink2_modifier |= kfMakePlink2MMergeSnps;
+	      } else if (!strcmp(mode_start, "+any")) {
+		make_plink2_modifier |= kfMakePlink2MMergeAny;
+	      } else {
+		sprintf(g_logbuf, "Error: Invalid --make-bed multiallelics= mode '%s'.\n", mode_start);
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	    } else {
+	      char* write_iter = strcpya(g_logbuf, "Error: Invalid --make-bed parameter '");
+	      write_iter = memcpya(write_iter, cur_modif, cur_modif_slen);
+	      write_iter = strcpya(write_iter, "'.");
+	      if ((param_idx == 1) && (!outname_end)) {
+		// the missing --out mistake is so common--I must have made it
+		// over a hundred times by now--that a custom error message is
+		// worthwhile.
+		write_iter = strcpya(write_iter, " (Did you forget '--out'?)");
+	      }
+	      write_iter = strcpya(write_iter, "\n");
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  make_plink2_modifier |= kfMakeBed | kfMakeBim | kfMakeFam;
+	  pc.command_flags1 |= kfCommand1MakePlink2;
+	} else if (!memcmp(flagname_p2, "ake-bpgen", 10)) {
+	  if (make_plink2_modifier & kfMakeBed) {
+	    logerrprint("Error: --make-bpgen cannot be used with --make-bed.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (pc.misc_flags & kfMiscKeepAutoconv) {
+	    logerrprint("Error: --make-bpgen cannot be used with --keep-autoconv.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 3) && (!memcmp(cur_modif, "vzs", 3))) {
+	      make_plink2_modifier |= kfMakeBimZs;
+	    } else if ((cur_modif_slen > 7) && (!memcmp(cur_modif, "format=", 7))) {
+	      if (make_plink2_modifier & (kfMakePgenFormatBase * 3)) {
+		logerrprint("Error: Multiple --make-bpgen format= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      const uint32_t fcode_minus_2 = ((uint32_t)((unsigned char)cur_modif[7])) - 50;
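+	      // '2' is ASCII 50, so the valid format digits 2..4 map to 0..2
+	      // here; any other character overflows past 2 and is rejected
+	      // below, as is any code longer than one digit.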
+	      if ((fcode_minus_2 > 2) || cur_modif[8]) {
+		sprintf(g_logbuf, "Error: Invalid --make-bpgen format code '%s'.\n", &(cur_modif[7]));
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	      if (fcode_minus_2) {
+		logerrprint("Error: --make-bpgen formats 3 and 4 (unphased/phased dosage) are not\nimplemented yet.\n");
+		reterr = kPglRetNotYetSupported;
+		goto main_ret_1;
+	      }
+	      make_plink2_modifier = (make_plink2_t)(make_plink2_modifier | (kfMakePgenFormatBase * (1 + fcode_minus_2)));
+	    } else if (((cur_modif_slen > 2) && (!memcmp(cur_modif, "m=", 2))) || ((cur_modif_slen > 14) && (!memcmp(cur_modif, "multiallelics=", 14)))) {
+	      if (make_plink2_modifier & kfMakePlink2MMask) {
+		logerrprint("Error: Multiple --make-bpgen multiallelics= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      const char* mode_start = (cur_modif[1] == '=')? (&(cur_modif[2])) : (&(cur_modif[14]));
+	      if (!strcmp(mode_start, "-")) {
+		make_plink2_modifier |= kfMakePlink2MSplitAll;
+	      } else if (!strcmp(mode_start, "-snps")) {
+		make_plink2_modifier |= kfMakePlink2MSplitSnps;
+	      } else if ((!strcmp(mode_start, "+")) || (!strcmp(mode_start, "+both"))) {
+		make_plink2_modifier |= kfMakePlink2MMergeBoth;
+	      } else if (!strcmp(mode_start, "+snps")) {
+		make_plink2_modifier |= kfMakePlink2MMergeSnps;
+	      } else if (!strcmp(mode_start, "+any")) {
+		make_plink2_modifier |= kfMakePlink2MMergeAny;
+	      } else {
+		sprintf(g_logbuf, "Error: Invalid --make-bpgen multiallelics= mode '%s'.\n", mode_start);
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	    } else if ((cur_modif_slen == 9) && (!memcmp(cur_modif, "trim-alts", 9))) {
+	      make_plink2_modifier |= kfMakePlink2TrimAlts;
+	    } else if ((cur_modif_slen == 11) && (!memcmp(cur_modif, "erase-alt2+", 11))) {
+	      make_plink2_modifier |= kfMakePgenEraseAlt2Plus;
+	    } else if ((cur_modif_slen == 11) && (!memcmp(cur_modif, "erase-phase", 11))) {
+	      make_plink2_modifier |= kfMakePgenErasePhase;
+	    } else if ((cur_modif_slen == 12) && (!memcmp(cur_modif, "erase-dosage", 12))) {
+	      make_plink2_modifier |= kfMakePgenEraseDosage;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --make-bpgen parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  make_plink2_modifier |= kfMakePgen | kfMakeBim | kfMakeFam;
+	  pc.command_flags1 |= kfCommand1MakePlink2;
+	} else if (!memcmp(flagname_p2, "ake-pgen", 9)) {
+	  if (make_plink2_modifier & (kfMakeBed | kfMakePgen)) {
+	    logerrprint("Error: --make-pgen cannot be used with --make-bed/--make-bpgen.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (make_plink2_modifier & (kfMakeBim | kfMakeFam | kfMakePvar | kfMakePsam)) {
+	    logerrprint("Error: --make-just-... cannot be used with --make-bed/--make-{b}pgen.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (pc.misc_flags & kfMiscKeepAutoconv) {
+	    logerrprint("Error: --make-pgen cannot be used with --keep-autoconv.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 4)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t explicit_pvar_cols = 0;
+	  uint32_t explicit_psam_cols = 0;
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 3) && (!memcmp(cur_modif, "vzs", 3))) {
+	      pc.pvar_psam_modifier |= kfPvarZs;
+	    } else if ((cur_modif_slen >= 10) && (!memcmp(cur_modif, "pvar-cols=", 10))) {
+	      if (explicit_pvar_cols) {
+		logerrprint("Error: Multiple --make-pgen pvar-cols= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      explicit_pvar_cols = 1;
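+	      // parse_col_descriptor takes a NUL-delimited list of recognized
+	      // column names; each name's position presumably selects the
+	      // corresponding flag bit starting at kfPvarColXheader.
+	      // (Assumption from the call signature.)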
+	      reterr = parse_col_descriptor(&(cur_modif[10]), "xheader\0maybequal\0qual\0maybefilter\0filter\0maybeinfo\0info\0maybecm\0cm\0", "--make-pgen pvar-cols", kfPvarColXheader, kfPvarColDefault, 0, &pc.pvar_psam_modifier);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	      if ((pc.pvar_psam_modifier & kfPvarColXinfo) && (!(pc.pvar_psam_modifier & kfPvarColXheader))) {
+		logerrprint("Error: --make-pgen pvar-cols= expression cannot exclude xheader when info is\npresent.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	    } else if ((cur_modif_slen > 7) && (!memcmp(cur_modif, "format=", 7))) {
+	      if (make_plink2_modifier & (kfMakePgenFormatBase * 3)) {
+		logerrprint("Error: Multiple --make-pgen format= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      const uint32_t fcode_minus_2 = ((uint32_t)((unsigned char)cur_modif[7])) - 50;
+	      if ((fcode_minus_2 > 2) || cur_modif[8]) {
+		sprintf(g_logbuf, "Error: Invalid --make-pgen format code '%s'.\n", &(cur_modif[7]));
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	      if (fcode_minus_2) {
+		logerrprint("Error: --make-pgen formats 3 and 4 (unphased/phased dosage) are not implemented\nyet.\n");
+		reterr = kPglRetNotYetSupported;
+		goto main_ret_1;
+	      }
+	      make_plink2_modifier = (make_plink2_t)(make_plink2_modifier | (kfMakePgenFormatBase * (1 + fcode_minus_2)));
+	    } else if (((cur_modif_slen > 2) && (!memcmp(cur_modif, "m=", 2))) || ((cur_modif_slen > 14) && (!memcmp(cur_modif, "multiallelics=", 14)))) {
+	      if (make_plink2_modifier & kfMakePlink2MMask) {
+		logerrprint("Error: Multiple --make-pgen multiallelics= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      const char* mode_start = (cur_modif[1] == '=')? (&(cur_modif[2])) : (&(cur_modif[14]));
+	      if (!strcmp(mode_start, "-")) {
+		make_plink2_modifier |= kfMakePlink2MSplitAll;
+	      } else if (!strcmp(mode_start, "-snps")) {
+		make_plink2_modifier |= kfMakePlink2MSplitSnps;
+	      } else if ((!strcmp(mode_start, "+")) || (!strcmp(mode_start, "+both"))) {
+		make_plink2_modifier |= kfMakePlink2MMergeBoth;
+	      } else if (!strcmp(mode_start, "+snps")) {
+		make_plink2_modifier |= kfMakePlink2MMergeSnps;
+	      } else if (!strcmp(mode_start, "+any")) {
+		make_plink2_modifier |= kfMakePlink2MMergeAny;
+	      } else {
+		sprintf(g_logbuf, "Error: Invalid --make-pgen multiallelics= mode '%s'.\n", mode_start);
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	    } else if ((cur_modif_slen == 9) && (!memcmp(cur_modif, "trim-alts", 9))) {
+	      make_plink2_modifier |= kfMakePlink2TrimAlts;
+	    } else if ((cur_modif_slen == 11) && (!memcmp(cur_modif, "erase-alt2+", 11))) {
+	      make_plink2_modifier |= kfMakePgenEraseAlt2Plus;
+	    } else if ((cur_modif_slen == 11) && (!memcmp(cur_modif, "erase-phase", 11))) {
+	      make_plink2_modifier |= kfMakePgenErasePhase;
+	    } else if ((cur_modif_slen == 12) && (!memcmp(cur_modif, "erase-dosage", 12))) {
+	      make_plink2_modifier |= kfMakePgenEraseDosage;
+	    } else if ((cur_modif_slen >= 10) && (!memcmp(cur_modif, "psam-cols=", 10))) {
+	      if (explicit_psam_cols) {
+		logerrprint("Error: Multiple --make-pgen psam-cols= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      explicit_psam_cols = 1;
+	      reterr = parse_col_descriptor(&(cur_modif[10]), "maybesid\0sid\0maybeparents\0parents\0sex\0pheno1\0phenos\0", "make-pgen psam-cols", kfPsamColMaybesid, kfPsamColDefault, 0, &pc.pvar_psam_modifier);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --make-pgen parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  if (!explicit_pvar_cols) {
+	    pc.pvar_psam_modifier |= kfPvarColDefault;
+	  }
+	  if (!explicit_psam_cols) {
+	    pc.pvar_psam_modifier |= kfPsamColDefault;
+	  }
+	  make_plink2_modifier |= kfMakePgen | kfMakePvar | kfMakePsam;
+	  pc.command_flags1 |= kfCommand1MakePlink2;
+	} else if (!memcmp(flagname_p2, "ake-just-bim", 13)) {
+	  if (make_plink2_modifier & (kfMakeBed | kfMakePgen)) {
+	    logerrprint("Error: --make-just-... cannot be used with --make-bed/--make-{b}pgen.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct) {
+	    const char* cur_modif = argv[arg_idx + 1];
+	    if (!strcmp(cur_modif, "zs")) {
+	      make_plink2_modifier |= kfMakeBimZs;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --make-just-bim parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  make_plink2_modifier |= kfMakeBim;
+	  pc.command_flags1 |= kfCommand1MakePlink2;
+	} else if (!memcmp(flagname_p2, "ake-just-fam", 13)) {
+	  if (make_plink2_modifier & (kfMakeBed | kfMakePgen)) {
+	    logerrprint("Error: --make-just-... cannot be used with --make-bed/--make-{b}pgen.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  make_plink2_modifier |= kfMakeFam;
+	  pc.command_flags1 |= kfCommand1MakePlink2;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "ake-just-pvar", 14)) {
+	  if (make_plink2_modifier & (kfMakeBed | kfMakePgen)) {
+	    logerrprint("Error: --make-just-... cannot be used with --make-bed/--make-{b}pgen.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t explicit_cols = 0;
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 2) && (!memcmp(cur_modif, "zs", 2))) {
+	      pc.pvar_psam_modifier |= kfPvarZs;
+	    } else if ((cur_modif_slen >= 5) && (!memcmp(cur_modif, "cols=", 5))) {
+	      if (explicit_cols) {
+		logerrprint("Error: Multiple --make-just-pvar cols= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      explicit_cols = 1;
+	      reterr = parse_col_descriptor(&(cur_modif[5]), "xheader\0maybequal\0qual\0maybefilter\0filter\0maybeinfo\0info\0maybecm\0cm\0", "make-just-pvar", kfPvarColXheader, kfPvarColDefault, 0, &pc.pvar_psam_modifier);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	      if ((pc.pvar_psam_modifier & kfPvarColXinfo) && (!(pc.pvar_psam_modifier & kfPvarColXheader))) {
+		logerrprint("Error: --make-just-pvar cols= expression cannot exclude xheader when info is\npresent.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --make-just-pvar parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  if (!explicit_cols) {
+	    pc.pvar_psam_modifier |= kfPvarColDefault;
+	  }
+	  make_plink2_modifier |= kfMakePvar;
+	  pc.command_flags1 |= kfCommand1MakePlink2;
+	} else if (!memcmp(flagname_p2, "ake-just-psam", 14)) {
+	  if (make_plink2_modifier & (kfMakeBed | kfMakePgen)) {
+	    logerrprint("Error: --make-just-... cannot be used with --make-bed/--make-{b}pgen.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct) {
+	    const char* cur_modif = argv[arg_idx + 1];
+	    if ((strlen(cur_modif) >= 5) && (!memcmp(cur_modif, "cols=", 5))) {
+	      reterr = parse_col_descriptor(&(cur_modif[5]), "maybesid\0sid\0maybeparents\0parents\0sex\0pheno1\0phenos\0", "make-just-psam", kfPsamColMaybesid, kfPsamColDefault, 0, &pc.pvar_psam_modifier);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --make-just-psam parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  } else {
+	    pc.pvar_psam_modifier |= kfPsamColDefault;
+	  }
+	  make_plink2_modifier |= kfMakePsam;
+	  pc.command_flags1 |= kfCommand1MakePlink2;
+	} else if (!memcmp(flagname_p2, "ake-king", 9)) {
+	  // may want to add options for handling X/Y/MT
+	  if (king_cutoff_fprefix) {
+	    logerrprint("Error: --make-king cannot be used with a --king-cutoff input fileset.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    if (!strcmp(cur_modif, "zs")) {
+	      if (pc.king_modifier & kfKingMatrixEncodemask) {
+		logerrprint("Error: Multiple --make-king encoding modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.king_modifier |= kfKingMatrixZs;
+	    } else if (!strcmp(cur_modif, "bin")) {
+	      if (pc.king_modifier & kfKingMatrixEncodemask) {
+		logerrprint("Error: Multiple --make-king encoding modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.king_modifier |= kfKingMatrixBin;
+	    } else if (!strcmp(cur_modif, "bin4")) {
+	      if (pc.king_modifier & kfKingMatrixEncodemask) {
+		logerrprint("Error: Multiple --make-king encoding modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.king_modifier |= kfKingMatrixBin4;
+	    } else if (!strcmp(cur_modif, "square")) {
+	      if (pc.king_modifier & kfKingMatrixShapemask) {
+		logerrprint("Error: Multiple --make-king shape modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.king_modifier |= kfKingMatrixSq;
+	    } else if (!strcmp(cur_modif, "square0")) {
+	      if (pc.king_modifier & kfKingMatrixShapemask) {
+		logerrprint("Error: Multiple --make-king shape modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.king_modifier |= kfKingMatrixSq0;
+	    } else if (!strcmp(cur_modif, "triangle")) {
+	      if (pc.king_modifier & kfKingMatrixShapemask) {
+		logerrprint("Error: Multiple --make-king shape modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.king_modifier |= kfKingMatrixTri;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --make-king parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  if (!(pc.king_modifier & kfKingMatrixShapemask)) {
+	    if (pc.king_modifier & (kfKingMatrixBin | kfKingMatrixBin4)) {
+	      pc.king_modifier |= kfKingMatrixSq;
+	    } else {
+	      pc.king_modifier |= kfKingMatrixTri;
+	    }
+	  }
+	  pc.command_flags1 |= kfCommand1MakeKing;
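+	  // e.g. 'plink2 ... --make-king square0 bin4' requests a zero-padded
+	  // square matrix of single-precision values.  (Illustrative invocation,
+	  // not part of this changeset.)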
+	} else if (!memcmp(flagname_p2, "ake-king-table", 15)) {
+	  if (king_cutoff_fprefix) {
+	    logerrprint("Error: --make-king-table cannot be used with a --king-cutoff input fileset.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 3)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    if (!strcmp(cur_modif, "zs")) {
+	      pc.king_modifier |= kfKingTableZs;
+	    } else if (!strcmp(cur_modif, "counts")) {
+	      pc.king_modifier |= kfKingCounts;
+	    } else if ((strlen(cur_modif) > 5) && (!memcmp(cur_modif, "cols=", 5))) {
+	      if (pc.king_modifier & kfKingColAll) {
+		logerrprint("Error: Multiple --make-king-table cols= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      reterr = parse_col_descriptor(&(cur_modif[5]), "id\0maybesid\0sid\0nsnp\0hethet\0ibs0\0ibs1\0kinship\0", "make-king-table", kfKingColId, kfKingColDefault, 1, &pc.king_modifier);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	      if ((pc.king_modifier & (kfKingColMaybesid | kfKingColSid)) && (!(pc.king_modifier & kfKingColId))) {
+		logerrprint("Error: Invalid --make-king-table column set descriptor ('maybesid' and 'sid'\nrequire 'id').\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --make-king-table parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  if (!(pc.king_modifier & kfKingColAll)) {
+	    pc.king_modifier |= kfKingColDefault;
+	  }
+	  pc.command_flags1 |= kfCommand1MakeKing;
+	} else if (!memcmp(flagname_p2, "issing", 7)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 4)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 2) && (!memcmp(cur_modif, "zs", 2))) {
+	      pc.missing_rpt_modifier |= kfMissingRptZs;
+	    } else if ((cur_modif_slen == 11) && (!memcmp(cur_modif, "sample-only", 11))) {
+	      if (pc.missing_rpt_modifier & kfMissingRptVariantOnly) {
+		logerrprint("Error: --missing 'sample-only' and 'variant-only' cannot be used together.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      pc.missing_rpt_modifier |= kfMissingRptSampleOnly;
+	    } else if ((cur_modif_slen == 12) && (!memcmp(cur_modif, "variant-only", 12))) {
+	      if (pc.missing_rpt_modifier & kfMissingRptSampleOnly) {
+		logerrprint("Error: --missing 'sample-only' and 'variant-only' cannot be used together.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      pc.missing_rpt_modifier |= kfMissingRptVariantOnly;
+	    } else if ((cur_modif_slen > 6) && (!memcmp(cur_modif, "scols=", 6))) {
+	      if (pc.missing_rpt_modifier & kfMissingRptScolAll) {
+		logerrprint("Error: Multiple --missing scols= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      reterr = parse_col_descriptor(&(cur_modif[6]), "maybesid\0sid\0misspheno1\0missphenos\0nmissdosage\0nmiss\0nmisshh\0hethap\0nobs\0fmissdosage\0fmiss\0fmisshh\0", "missing scols", kfMissingRptScolMaybesid, kfMissingRptScolDefault, 1, &pc.missing_rpt_modifier);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	    } else if ((cur_modif_slen > 6) && (!memcmp(cur_modif, "vcols=", 6))) {
+	      if (pc.missing_rpt_modifier & kfMissingRptVcolAll) {
+		logerrprint("Error: Multiple --missing vcols= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      reterr = parse_col_descriptor(&(cur_modif[6]), "chrom\0pos\0ref\0alt1\0alt\0nmissdosage\0nmiss\0nmisshh\0hethap\0nobs\0fmissdosage\0fmiss\0fmisshh\0fhethap\0", "missing vcols", kfMissingRptVcolChrom, kfMissingRptVcolDefault, 1, &pc.missing_rpt_modifier);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --missing parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  const uint32_t explicit_scols = pc.missing_rpt_modifier & kfMissingRptScolAll;
+	  if (pc.missing_rpt_modifier & kfMissingRptVariantOnly) {
+	    if (explicit_scols) {
+	      logerrprint("Error: --missing 'variant-only' and 'scols=' modifiers cannot be used together.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	  } else {
+	    pc.filter_flags |= kfFilterNoSplitChr;
+	    if (!explicit_scols) {
+	      pc.missing_rpt_modifier |= kfMissingRptScolDefault;
+	    }
+	  }
+	  const uint32_t explicit_vcols = pc.missing_rpt_modifier & kfMissingRptVcolAll;
+	  if (pc.missing_rpt_modifier & kfMissingRptSampleOnly) {
+	    if (explicit_vcols) {
+	      logerrprint("Error: --missing 'sample-only' and 'vcols=' modifiers cannot be used together.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	  } else if (!explicit_vcols) {
+	    pc.missing_rpt_modifier |= kfMissingRptVcolDefault;
+	  }
+	  pc.command_flags1 |= kfCommand1MissingReport;
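+	  // e.g. 'plink2 ... --missing scols=nmiss,fmiss vcols=chrom,pos,fmiss'
+	  // customizes both reports at once.  (Illustrative; the column-set
+	  // syntax is assumed to follow the usual comma-separated plink2
+	  // convention.)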
+	} else if (!memcmp(flagname_p2, "aj-ref", 7)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct) {
+	    const char* cur_modif = argv[arg_idx + 1];
+	    if (!strcmp(cur_modif, "force")) {
+	      pc.misc_flags |= kfMiscMajRefForce;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --maj-ref parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  pc.misc_flags |= kfMiscMajRef;
+	  pc.filter_flags |= kfFilterAllReq | kfFilterNoSplitChr;
+	} else if (!memcmp(flagname_p2, "af", 3)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct) {
+	    char* cur_modif = argv[arg_idx + 1];
+	    if (!scanadv_double(cur_modif, &pc.min_maf)) {
+	      sprintf(g_logbuf, "Error: Invalid --maf parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	    if (pc.min_maf < 0.0) {
+	      sprintf(g_logbuf, "Error: --maf parameter '%s' too small (must be >= 0).\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    } else if (pc.min_maf >= 1.0) {
+	      sprintf(g_logbuf, "Error: --maf parameter '%s' too large (must be < 1).\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  } else {
+	    pc.min_maf = 0.01;
+	  }
+	  if (pc.min_maf != 0.0) {
+	    pc.filter_flags |= kfFilterAllReq | kfFilterNoSplitChr;
+	  }
+	} else if (!memcmp(flagname_p2, "ax-maf", 7)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  if (!scanadv_double(cur_modif, &pc.max_maf)) {
+	    sprintf(g_logbuf, "Error: Invalid --max-maf parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if (pc.max_maf < pc.min_maf) {
+	    sprintf(g_logbuf, "Error: --max-maf parameter '%s' too small (must be >= %g).\n", cur_modif, pc.min_maf);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  } else if (pc.max_maf >= 1.0) {
+	    sprintf(g_logbuf, "Error: --max-maf parameter '%s' too large (must be < 1).\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  pc.filter_flags |= kfFilterAllReq | kfFilterNoSplitChr;
+	} else if (!memcmp(flagname_p2, "ac", 3)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  double dxx;
+	  if ((!scanadv_double(cur_modif, &dxx)) || (dxx < 0.0) || (dxx > 2147483646.0)) {
+	    sprintf(g_logbuf, "Error: Invalid --mac parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if (dxx > 0.0) {
+	    // round up, but keep as much precision as possible
+	    int32_t int_part = (int32_t)dxx;
+	    dxx -= int_part;
+	    pc.min_allele_dosage = int_part * ((uint64_t)kDosageMax);
+	    if (dxx > 0.0) {
+	      pc.min_allele_dosage += 1 + (dxx * (kDosageMax * (1 - kSmallEpsilon)));
+	    }
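+	    // e.g. with kDosageMax presumably 2^15: --mac 3 yields exactly
+	    // 3 * kDosageMax, while --mac 2.5 yields 2 * kDosageMax plus a
+	    // rounded-up half-unit, so fractional thresholds act as ceilings.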
+	    pc.filter_flags |= kfFilterAllReq | kfFilterNoSplitChr;
+	  }
+	} else if (!memcmp(flagname_p2, "ax-mac", 7)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  double dxx;
+	  if ((!scanadv_double(cur_modif, &dxx)) || (dxx < 0.0) || (dxx > 2147483646.0)) {
+	    sprintf(g_logbuf, "Error: Invalid --max-mac parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  // round down
+	  pc.max_allele_dosage = (int64_t)(dxx * kDosageMax);
+	  if (pc.max_allele_dosage < pc.min_allele_dosage) {
+	    // yeah, --mac 0.1 --max-mac 0.1 also isn't allowed
+	    logerrprint("Error: --max-mac parameter cannot be smaller than --mac parameter.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  pc.filter_flags |= kfFilterAllReq | kfFilterNoSplitChr;
+	} else if (!memcmp(flagname_p2, "ind", 4)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t mind_thresh_present = 0;
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    char* cur_modif = argv[arg_idx + param_idx];
+	    if (!strcmp(cur_modif, "dosage")) {
+	      pc.misc_flags |= kfMiscMindDosage;
+	    } else if (!strcmp(cur_modif, "hh-missing")) {
+	      pc.misc_flags |= kfMiscMindHhMissing;
+	    } else if (mind_thresh_present) {
+	      logerrprint("Error: Invalid --mind parameter sequence.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    } else if (!scanadv_double(cur_modif, &pc.mind_thresh)) {
+	      sprintf(g_logbuf, "Error: Invalid --mind parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    } else if ((pc.mind_thresh < 0.0) || (pc.mind_thresh > 1.0)) {
+	      sprintf(g_logbuf, "Error: Invalid --mind parameter '%s' (must be in [0, 1]).\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    } else {
+	      mind_thresh_present = 1;
+	    }
+	  }
+	  if (!mind_thresh_present) {
+	    pc.mind_thresh = 0.1;
+	  }
+	  if (pc.mind_thresh < 1.0) {
+	    pc.filter_flags |= kfFilterAllReq | kfFilterNoSplitChr;
+	  }
+	} else if (!memcmp(flagname_p2, "issing-var-code", 16)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = cmdline_alloc_string(argv[arg_idx + 1], argv[arg_idx], kMaxIdSlen, &pc.missing_varid_match);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	} else if (!memcmp(flagname_p2, "erge-par", 9)) {
+	  if (pc.exportf_modifier & kfExportfVcf) {
+	    logerrprint("Warning: --merge-par should not be used with VCF export.  (The VCF export\nroutine automatically converts PAR1/PAR2 chromosome codes to X, while using\nthe PAR boundaries to get male ploidy right; --merge-par causes VCF export to\nget male ploidy wrong.)\n");
+	  }
+	  pc.misc_flags |= kfMiscMergePar;
+	  pc.filter_flags |= kfFilterPvarReq | kfFilterNoSplitChr;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "af-succ", 8)) {
+	  pc.misc_flags |= kfMiscMafSucc;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "ax-corr", 8)) {
+	  if (!(pc.command_flags1 & kfCommand1Glm)) {
+	    logerrprint("Error: --max-corr must be used with --glm.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  if (!scanadv_double(cur_modif, &pc.glm_info.max_corr)) {
+	    sprintf(g_logbuf, "Error: Invalid --max-corr parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if ((pc.glm_info.max_corr < 0.0) || (pc.glm_info.max_corr > 1.0)) {
+	    sprintf(g_logbuf, "Error: Invalid --max-corr parameter '%s' (must be in [0, 1]).\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	} else if (!memcmp(flagname_p2, "ach-r2-filter", 14)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct) {
+	    char* cur_modif = argv[arg_idx + 1];
+	    if (!scanadv_double(cur_modif, &pc.mach_r2_min)) {
+	      sprintf(g_logbuf, "Error: Invalid --mach-r2-filter min parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	    if (pc.mach_r2_min < 0.0) {
+	      sprintf(g_logbuf, "Error: Invalid --mach-r2-filter min parameter '%s' (must be nonnegative).\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	    if (param_ct == 2) {
+	      cur_modif = argv[arg_idx + 2];
+	      if (!scanadv_double(cur_modif, &pc.mach_r2_max)) {
+		sprintf(g_logbuf, "Error: Invalid --mach-r2-filter max parameter '%s'.\n", cur_modif);
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	    } else {
+	      pc.mach_r2_max = 2.0;
+	    }
+	    if (pc.mach_r2_max < pc.mach_r2_min) {
+	      logerrprint("Error: --mach-r2-filter min parameter cannot be larger than max parameter.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	  } else {
+	    pc.mach_r2_min = 0.1;
+	  }
+	  pc.filter_flags |= kfFilterAllReq | kfFilterNoSplitChr;
+	} else if (!memcmp(flagname_p2, "issing-code", 12)) {
+	  if (!(xload & (kfXloadOxGen | kfXloadOxBgen))) {
+	    // could technically support pure .sample -> .fam/.psam, but let's
+	    // keep this simple
+	    logerrprint("Error: --missing-code must be used with --data/--gen/--bgen.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = cmdline_alloc_string(param_ct? argv[arg_idx + 1] : "", argv[arg_idx], 0x7fffffff, &ox_missing_code);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	} else if (!memcmp(flagname_p2, "issing-genotype", 16)) {
+	  logerrprint("Error: --missing-genotype flag retired.  Use --input-missing-genotype and/or\n--output-missing-genotype.\n");
+	  goto main_ret_INVALID_CMDLINE;
+	} else if (!memcmp(flagname_p2, "issing-phenotype", 17)) {
+	  logerrprint("Error: --missing-phenotype flag retired.  Use --input-missing-phenotype and/or\n--output-missing-phenotype.\n");
+	  goto main_ret_INVALID_CMDLINE;
+	} else if (!memcmp(flagname_p2, "issing-catname", 15)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  const uint32_t cur_modif_slen = strlen(cur_modif);
+	  double dxx;
+	  if (scanadv_double(cur_modif, &dxx) || is_nan_str(cur_modif, cur_modif_slen)) {
+	    logerrprint("Error: --missing-catname string cannot be 'NA' or start with a number.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (cur_modif_slen > 31) {
+	    logerrprint("Error: --missing-catname string too long (max 31 chars).\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  memcpy(g_missing_catname, cur_modif, cur_modif_slen + 1);
+	} else if (!memcmp(flagname_p2, "ouse", 5)) {
+	  if (chr_info.chrset_source) {
+	    logerrprint("Error: Conflicting chromosome-set flags.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  chr_info.chrset_source = kChrsetSourceCmdline;
+	  chr_info.autosome_ct = 19;
+	  chr_info.xymt_codes[0] = 20;
+	  chr_info.xymt_codes[1] = 21;
+	  chr_info.xymt_codes[2] = -2;
+	  chr_info.xymt_codes[3] = -2;
+	  chr_info.xymt_codes[4] = -2;
+	  chr_info.xymt_codes[5] = -2;
+	  chr_info.haploid_mask[0] = 0x300000;
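+	  // xymt_codes[] order is presumably X, Y, XY, MT, PAR1, PAR2, with -2
+	  // marking "not present"; 0x300000 sets bits 20 and 21, flagging the
+	  // mouse X and Y as haploid.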
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "ake-grm", 8)) {
+	  logerrprint("Error: --make-grm has been retired due to inconsistent meaning across GCTA\nversions.  Use --make-grm-gz or --make-grm-bin.\n");
+	  goto main_ret_INVALID_CMDLINE;
+	} else if (!memcmp(flagname_p2, "ake-grm-bin", 12)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    if (!strcmp(cur_modif, "cov")) {
+	      pc.grm_flags |= kfGrmCov;
+	    } else if (!strcmp(cur_modif, "meanimpute")) {
+	      pc.grm_flags |= kfGrmMeanimpute;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --make-grm-bin parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  pc.grm_flags |= kfGrmBin;
+	  pc.command_flags1 |= kfCommand1MakeRel;
+	} else if (!memcmp(flagname_p2, "ake-grm-gz", 11)) {
+	  if (pc.command_flags1 & kfCommand1MakeRel) {
+	    logerrprint("Error: --make-grm-gz cannot be used with --make-grm-bin.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 3)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t compress_stream_type = 0; // 1 = no-gz, 2 = zs
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    if (!strcmp(cur_modif, "cov")) {
+	      pc.grm_flags |= kfGrmCov;
+	    } else if (!strcmp(cur_modif, "meanimpute")) {
+	      pc.grm_flags |= kfGrmMeanimpute;
+	    } else if (!strcmp(cur_modif, "no-gz")) {
+	      if (compress_stream_type) {
+		logerrprint("Error: Multiple --make-grm-gz compression type modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      compress_stream_type = 1;
+	      pc.grm_flags |= kfGrmTableNoGz;
+	    } else if (!strcmp(cur_modif, "zs")) {
+	      if (compress_stream_type) {
+		logerrprint("Error: Multiple --make-grm-gz compression type modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      compress_stream_type = 2;
+	      pc.grm_flags |= kfGrmTableZs;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --make-grm-gz parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  if (!compress_stream_type) {
+	    pc.grm_flags |= kfGrmTableGz;
+	  }
+	  pc.command_flags1 |= kfCommand1MakeRel;
+	} else if (!memcmp(flagname_p2, "ake-rel", 8)) {
+	  if (pc.command_flags1 & kfCommand1MakeRel) {
+	    logerrprint("Error: --make-rel cannot be used with --make-grm-gz/--make-grm-bin.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 4)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    if (!strcmp(cur_modif, "cov")) {
+	      pc.grm_flags |= kfGrmCov;
+	    } else if (!strcmp(cur_modif, "meanimpute")) {
+	      pc.grm_flags |= kfGrmMeanimpute;
+	    } else if (!strcmp(cur_modif, "zs")) {
+	      if (pc.grm_flags & kfGrmMatrixEncodemask) {
+		logerrprint("Error: Multiple --make-rel encoding modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.grm_flags |= kfGrmMatrixZs;
+	    } else if (!strcmp(cur_modif, "bin")) {
+	      if (pc.grm_flags & kfGrmMatrixEncodemask) {
+		logerrprint("Error: Multiple --make-rel encoding modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.grm_flags |= kfGrmMatrixBin;
+	    } else if (!strcmp(cur_modif, "bin4")) {
+	      if (pc.grm_flags & kfGrmMatrixEncodemask) {
+		logerrprint("Error: Multiple --make-rel encoding modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.grm_flags |= kfGrmMatrixBin4;
+	    } else if (!strcmp(cur_modif, "square")) {
+	      if (pc.grm_flags & kfGrmMatrixShapemask) {
+		logerrprint("Error: Multiple --make-rel shape modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.grm_flags |= kfGrmMatrixSq;
+	    } else if (!strcmp(cur_modif, "square0")) {
+	      if (pc.grm_flags & kfGrmMatrixShapemask) {
+		logerrprint("Error: Multiple --make-rel shape modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.grm_flags |= kfGrmMatrixSq0;
+	    } else if (!strcmp(cur_modif, "triangle")) {
+	      if (pc.grm_flags & kfGrmMatrixShapemask) {
+		logerrprint("Error: Multiple --make-rel shape modifiers.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      pc.grm_flags |= kfGrmMatrixTri;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --make-rel parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  if (!(pc.grm_flags & kfGrmMatrixShapemask)) {
+	    if (pc.grm_flags & (kfGrmMatrixBin | kfGrmMatrixBin4)) {
+	      pc.grm_flags |= kfGrmMatrixSq;
+	    } else {
+	      pc.grm_flags |= kfGrmMatrixTri;
+	    }
+	  }
+	  pc.command_flags1 |= kfCommand1MakeRel;
+	} else if (!memcmp(flagname_p2, "ap", 3)) {
+	  if (load_params || (xload & (~kfXloadPlink1Dosage))) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* cur_modif = argv[arg_idx + 1];
+	  const uint32_t slen = strlen(cur_modif);
+	  if (slen > kPglFnamesize - 1) {
+	    logerrprint("Error: --map filename too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  memcpy(pvarname, cur_modif, slen + 1);
+	  xload |= kfXloadMap;
+	} else if (!memcmp(flagname_p2, "within", 7)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* cur_modif = argv[arg_idx + 1];
+	  if (scan_posint_capped(cur_modif, kMaxLongLine / 2, &pc.mwithin_val)) {
+	    sprintf(g_logbuf, "Error: Invalid --mwithin parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'n':
+	if (!memcmp(flagname_p2, "o-fid", 6)) {
+	  pc.fam_cols &= ~kfFamCol1;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "o-parents", 10)) {
+	  pc.fam_cols &= ~kfFamCol34;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "o-sex", 6)) {
+	  pc.fam_cols &= ~kfFamCol5;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "o-pheno", 8)) {
+	  pc.fam_cols &= ~kfFamCol6;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "onfounders", 11)) {
+	  pc.misc_flags |= kfMiscNonfounders;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "ot-chr", 7)) {
+	  if (pc.varid_from) {
+	    logerrprint("Error: --from/--to cannot be used with --autosome{-par} or --{not-}chr.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (pc.from_bp != -1) {
+	    logerrprint("Error: --from-bp/-kb/-mb and --to-bp/-kb/-mb cannot be used with --not-chr.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+
+	  // allowed:
+	  //   --allow-extra-chr --chr 5-22 bobs_chrom --not-chr 17
+	  // allowed:
+	  //   --allow-extra-chr --not-chr 12-17 bobs_chrom
+	  // does not make sense, disallowed:
+	  //   --allow-extra-chr --chr 5-22 --not-chr bobs_chrom
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 0x7fffffff)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+
+	  // --allow-extra-chr present, --chr/--autosome{-xy} not present
+	  const uint32_t aec_and_no_chr_include = ((pc.misc_flags / kfMiscAllowExtraChrs) & 1) && (!chr_info.is_include_stack);
+	  reterr = parse_chr_ranges(flagname_p, errstr_append, param_ct, aec_and_no_chr_include, kChrRawEnd - (kChrExcludeWords * kBitsPerWord), '-', &(argv[arg_idx]), &chr_info, chr_info.chr_exclude);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  notchr_present = 1;
+	  // remaining processing now postponed to finalize_chrset()
+	} else if (!memcmp(flagname_p2, "ew-id-max-allele-len", 21)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* cur_modif = argv[arg_idx + 1];
+	  if (scan_posint_capped(cur_modif, kMaxIdSlen - 2, &pc.new_variant_id_max_allele_slen)) {
+	    sprintf(g_logbuf, "Error: Invalid --new-id-max-allele-len length parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
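+	  // The optional second parameter picks the overflow policy for
+	  // over-long generated IDs: 'missing' emits the missing ID code,
+	  // 'truncate' clips, and the default 'error' aborts.  (Summary
+	  // inferred from the modifier names.)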
+	  if (param_ct == 2) {
+	    cur_modif = argv[arg_idx + 2];
+	    if (!strcmp(cur_modif, "missing")) {
+	      pc.misc_flags |= kfMiscNewVarIdOverflowMissing;
+	    } else if (!strcmp(cur_modif, "truncate")) {
+	      pc.misc_flags |= kfMiscNewVarIdOverflowTruncate;
+	    } else if (strcmp(cur_modif, "error")) {
+	      sprintf(g_logbuf, "Error: Invalid --new-id-max-allele-len parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'o':
+	if (!memcmp(flagname_p2, "utput-chr", 10)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* mt_code = argv[arg_idx + 1];
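+	  // The parameter is just the desired rendering of chr 26: each of the
+	  // seven accepted spellings selects numeric vs. M/MT naming and an
+	  // optional 'chr' prefix for every output chromosome code.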
+	  if (!strcmp(mt_code, "M")) {
+	    chr_info.output_encoding = kfChrOutputM;
+	  } else if (!strcmp(mt_code, "MT")) {
+	    chr_info.output_encoding = kfChrOutputMT;
+	  } else if (!strcmp(mt_code, "0M")) {
+	    chr_info.output_encoding = kfChrOutput0M;
+	  } else if (!strcmp(mt_code, "chr26")) {
+	    chr_info.output_encoding = kfChrOutputPrefix;
+	  } else if (!strcmp(mt_code, "chrM")) {
+	    chr_info.output_encoding = kfChrOutputPrefix | kfChrOutputM;
+	  } else if (!strcmp(mt_code, "chrMT")) {
+	    chr_info.output_encoding = kfChrOutputPrefix | kfChrOutputMT;
+	  } else if (!strcmp(mt_code, "26")) {
+	    chr_info.output_encoding = kfChrOutput0;
+	  } else {
+	    sprintf(g_logbuf, "Error: Invalid --output-chr parameter '%s'.\n", mt_code);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	} else if (!memcmp(flagname_p2, "utput-min-p", 12)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  if ((!scanadv_double(cur_modif, &pc.output_min_p)) || (!(pc.output_min_p >= 0.0)) || (pc.output_min_p >= 1.0)) {
+	    sprintf(g_logbuf, "Error: Invalid --output-min-p parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	} else if (!memcmp(flagname_p2, "xford-single-chr", 17)) {
+	  if (!(xload & kfXloadOxGen)) {
+	    if (xload & kfXloadOxBgen) {
+	      logerrprint("Error: --oxford-single-chr must be used with .gen input.  (Single-chromosome\n.bgen files do not require this, since they still contain chromosome codes.)\n");
+	    } else {
+	      logerrprint("Error: --oxford-single-chr must be used with .gen input.\n");
+	    }
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* cur_modif = argv[arg_idx + 1];
+	  if (!(pc.misc_flags & kfMiscAllowExtraChrs)) {
+	    if (get_chr_code_raw(cur_modif) < 0) {
+	      sprintf(g_logbuf, "Error: Invalid --oxford-single-chr chromosome code '%s'. (Did you forget --allow-extra-chr?)\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  reterr = cmdline_alloc_string(cur_modif, argv[arg_idx], kMaxIdSlen, &import_single_chr_str);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	} else if (!memcmp(flagname_p2, "utput-missing-genotype", 23)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* cur_modif = argv[arg_idx + 1];
+	  output_missing_geno_char = extract_char_param(cur_modif);
+	  if (((unsigned char)output_missing_geno_char) <= ' ') {
+	    sprintf(g_logbuf, "Error: Invalid --output-missing-genotype parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	} else if (!memcmp(flagname_p2, "utput-missing-phenotype", 24)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* cur_modif = argv[arg_idx + 1];
+	  const uint32_t cur_modif_slen = strlen(cur_modif);
+	  if (cur_modif_slen > 31) {
+	    logerrprint("Error: --output-missing-phenotype string too long (max 31 chars).\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  memcpy(g_output_missing_pheno, cur_modif, cur_modif_slen + 1);
+	} else if (memcmp(flagname_p2, "ut", 3)) {
+	  // --out is a special case due to logging
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'p':
+	if (!memcmp(flagname_p2, "file", 5)) {
+	  if (load_params || xload) {
+	    // currently only possible with --bcf, --bfile, --pfile
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t fname_modif_idx = 1;
+	  if (param_ct == 2) {
+	    if (check_extra_param(&(argv[arg_idx]), "vzs", &fname_modif_idx)) {
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	  }
+	  const char* fname_prefix = argv[arg_idx + fname_modif_idx];
+	  const uint32_t slen = strlen(fname_prefix);
+	  if (slen > (kPglFnamesize - 10)) {
+	    logerrprint("Error: --pfile parameter too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  strcpy(memcpya(pgenname, fname_prefix, slen), ".pgen");
+	  strcpy(memcpya(psamname, fname_prefix, slen), ".psam");
+	  char* pvarname_end = memcpya(pvarname, fname_prefix, slen);
+	  pvarname_end = strcpya0(pvarname_end, ".pvar");
+	  if (param_ct == 2) {
+	    strcpy(pvarname_end, ".zst");
+	  }
+	  load_params |= kfLoadParamsPfileAll;
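+	  // e.g. '--pfile toy' loads toy.pgen/toy.pvar/toy.psam; with the 'vzs'
+	  // modifier the variant file is expected as toy.pvar.zst instead.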
+	} else if (!memcmp(flagname_p2, "gen", 4)) {
+	  if (xload) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  load_params |= kfLoadParamsPgen;
+	  char* fname = argv[arg_idx + 1];
+	  const uint32_t slen = strlen(fname);
+	  if (slen > (kPglFnamesize - 1)) {
+	    logerrprint("Error: --pgen parameter too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  memcpy(pgenname, fname, slen + 1);
+	} else if (!memcmp(flagname_p2, "sam", 4)) {
+	  if (xload & (~(kfXloadVcf | kfXloadBcf | kfXloadPlink1Dosage | kfXloadMap))) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  load_params |= kfLoadParamsPsam;
+	  char* fname = argv[arg_idx + 1];
+	  const uint32_t slen = strlen(fname);
+	  if (slen > (kPglFnamesize - 1)) {
+	    logerrprint("Error: --psam parameter too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  memcpy(psamname, fname, slen + 1);
+	} else if (!memcmp(flagname_p2, "var", 4)) {
+	  if (xload) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  load_params |= kfLoadParamsPvar;
+	  char* fname = argv[arg_idx + 1];
+	  const uint32_t slen = strlen(fname);
+	  if (slen > (kPglFnamesize - 1)) {
+	    logerrprint("Error: --pvar parameter too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  memcpy(pvarname, fname, slen + 1);
+	} else if (!memcmp(flagname_p2, "heno", 5)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = alloc_fname(argv[arg_idx + 1], flagname_p, 0, &pc.pheno_fname);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "heno-name", 10)) {
+	  // can now be used without --pheno
+	  reterr = parse_name_ranges(&(argv[arg_idx]), errstr_append, param_ct, 0, range_delim, &pc.pheno_range_list);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	} else if (!memcmp(flagname_p2, "arallel", 8)) {
+	  if (pc.king_modifier & kfKingMatrixSq) {
+	    logerrprint("Error: --parallel cannot be used with '--make-king square'.  Use '--make-king\nsquare0' or plain --make-king instead.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if ((pc.king_cutoff != -1) && (!king_cutoff_fprefix)) {
+	    logerrprint("Error: --parallel cannot be used with --king-cutoff.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (pc.grm_flags & kfGrmMatrixSq) {
+	    logerrprint("Error: --parallel cannot be used with '--make-rel square'.  Use '--make-rel\nsquare0' or plain --make-rel instead.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 2, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (scan_posint_capped(argv[arg_idx + 1], kParallelMax, &pc.parallel_idx)) {
+	    sprintf(g_logbuf, "Error: Invalid --parallel job index '%s'.\n", argv[arg_idx + 1]);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if (scan_posint_capped(argv[arg_idx + 2], kParallelMax, &pc.parallel_tot) || (pc.parallel_tot == 1) || (pc.parallel_tot < pc.parallel_idx)) {
+	    sprintf(g_logbuf, "Error: Invalid --parallel total job count '%s'.\n", argv[arg_idx + 2]);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  --pc.parallel_idx; // internal 0..(n-1) indexing
+	} else if (!memcmp(flagname_p2, "arameters", 10)) {
+	  if (!(pc.command_flags1 & kfCommand1Glm)) {
+	    logerrprint("Error: --parameters must be used with --glm.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  reterr = parse_name_ranges(&(argv[arg_idx]), errstr_append, param_ct, 1, '-', &pc.glm_info.parameters_range_list);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	} else if (!memcmp(flagname_p2, "filter", 7)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  if (!scanadv_double(cur_modif, &pc.pfilter)) {
+	    sprintf(g_logbuf, "Error: Invalid --pfilter parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if ((pc.pfilter <= 0.0) || (pc.pfilter > 1.0)) {
+	    logerrprint("Error: --pfilter threshold must be in (0, 1].\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	} else if (!memcmp(flagname_p2, "ca", 3)) {
+#ifdef NOLAPACK
+	  logerrprint("Error: --pca requires " PROG_NAME_STR " to be built with LAPACK.\n");
+	  goto main_ret_INVALID_CMDLINE;
+#endif
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 6)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  uint32_t is_var_wts = 0;
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 6) && (!memcmp(cur_modif, "approx", 6))) {
+	      pc.pca_flags |= kfPcaApprox;
+	    } else if ((cur_modif_slen == 10) && (!memcmp(cur_modif, "meanimpute", 10))) {
+	      pc.pca_flags |= kfPcaMeanimpute;
+	    } else if ((cur_modif_slen == 3) && (!memcmp(cur_modif, "sid", 3))) {
+	      pc.pca_flags |= kfPcaSid;
+	    } else if ((cur_modif_slen == 7) && (!memcmp(cur_modif, "var-wts", 7))) {
+	      is_var_wts = 1;
+	    } else if ((cur_modif_slen == 3) && (!memcmp(cur_modif, "vzs", 3))) {
+	      pc.pca_flags |= kfPcaVarZs;
+	    } else if ((cur_modif_slen > 6) && (!memcmp(cur_modif, "vcols=", 6))) {
+	      if (pc.pca_flags & kfPcaVcolAll) {
+		logerrprint("Error: Multiple --pca vcols= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      reterr = parse_col_descriptor(&(cur_modif[6]), "chrom\0pos\0ref\0alt1\0alt\0maj\0nonmaj\0", "pca vcols", kfPcaVcolChrom, kfPcaVcolDefault, 1, &pc.pca_flags);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	    } else {
+	      if (pc.pca_ct || scan_posint_defcap(cur_modif, &pc.pca_ct)) {
+		logerrprint("Error: Invalid --pca parameter sequence.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      if (pc.pca_ct > 8000) {
+		// this slightly simplifies output buffering.
+		// lower limit for randomized algorithm?
+		// (just let memory allocation fail for now...)
+		logerrprint("Error: --pca does not support more than 8000 PCs.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	    }
+	  }
+	  if (pc.pca_flags & kfPcaApprox) {
+	    if (pc.pca_ct > 100) {
+	      // double-precision overflow too likely
+	      logerrprint("Error: --pca approx does not support more than 100 PCs.\n");
+	      goto main_ret_INVALID_CMDLINE;
+	    }
+	  } else {
+	    // todo: if --make-rel/--make-grm present, verify consistency
+	    if (pc.parallel_tot != 1) {
+	      logerrprint("Error: Non-approximate --pca cannot be used with --parallel.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
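+	    // flag values are single bits, so dividing by kfPcaMeanimpute
+	    // shifts that bit down to position 0, yielding 0 or 1.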
+	    const uint32_t pca_meanimpute = (pc.pca_flags / kfPcaMeanimpute) & 1;
+	    if (pc.command_flags1 & kfCommand1MakeRel) {
+	      if (((pc.grm_flags / kfGrmMeanimpute) & 1) != pca_meanimpute) {
+		logerrprint("Error: --make-rel/--make-grm-gz/--make-grm-bin meanimpute setting must match\n--pca meanimpute setting.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      if (pc.grm_flags & kfGrmCov) {
+		logerrprint("Error: --make-rel/--make-grm-gz/--make-grm-bin cannot be used to compute a\ncovariance matrix in the same run as non-approximate --pca.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	    } else {
+	      if (pca_meanimpute) {
+		pc.grm_flags |= kfGrmMeanimpute;
+	      }
+	    }
+	  }
+	  if (!pc.pca_ct) {
+	    pc.pca_ct = 10;
+	  }
+	  if (!(pc.pca_flags & kfPcaVcolAll)) {
+	    if (is_var_wts) {
+	      pc.pca_flags |= kfPcaVcolDefault;
+	    }
+	  } else if (!is_var_wts) {
+	    logerrprint("Error: --pca 'vcols=' has no effect without 'var-wts'.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  if (is_var_wts) {
+	    pc.pca_flags |= kfPcaVarWts;
+	  } else if (pc.pca_flags & kfPcaVarZs) {
+	    logerrprint("Error: --pca 'vzs' modifier has no effect without 'var-wts'.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  pc.command_flags1 |= kfCommand1Pca;
+	} else if (!memcmp(flagname_p2, "heno-quantile-normalize", 24)) {
+	  if (param_ct) {
+	    reterr = alloc_and_flatten(&(argv[arg_idx + 1]), param_ct, 0x7fffffff, &pc.quantnorm_flattened);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	  }
+	  pc.pheno_transform_flags |= kfPhenoTransformQuantnormPheno;
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'q':
+	if (!memcmp(flagname_p2, "uantile-normalize", 18)) {
+	  if (pc.pheno_transform_flags & (kfPhenoTransformQuantnormPheno | kfPhenoTransformQuantnormCovar)) {
+	    logerrprint("Error: --quantile-normalize cannot be used with --pheno-quantile-normalize or\n--covar-quantile-normalize.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  if (param_ct) {
+	    reterr = alloc_and_flatten(&(argv[arg_idx + 1]), param_ct, 0x7fffffff, &pc.quantnorm_flattened);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	  }
+	  pc.pheno_transform_flags |= kfPhenoTransformQuantnormAll;
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+	
+      case 'r':
+	if (!memcmp(flagname_p2, "eal-ref-alleles", 16)) {
+	  if (pc.misc_flags & kfMiscMajRef) {
+	    logerrprint("Error: --real-ref-alleles cannot be used with --maj-ref.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  pc.misc_flags |= kfMiscRealRefAlleles;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "emove", 6)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 0x7fffffff)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const uint32_t sid_present = !strcmp(argv[arg_idx + 1], "sid");
+	  if (sid_present) {
+	    if (param_ct == 1) {
+	      logerrprint("Error: '--remove sid' requires at least one filename.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    pc.misc_flags |= kfMiscRemovefileSid;
+	  }
+	  reterr = alloc_and_flatten(&(argv[arg_idx + 1 + sid_present]), param_ct - sid_present, kPglFnamesize, &pc.remove_fnames);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "emove-fam", 10)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 0x7fffffff)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = alloc_and_flatten(&(argv[arg_idx + 1]), param_ct, kPglFnamesize, &pc.removefam_fnames);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "emove-females", 14)) {
+	  pc.filter_flags |= kfFilterPsamReq | kfFilterExclFemales;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "emove-males", 12)) {
+	  pc.filter_flags |= kfFilterPsamReq | kfFilterExclMales;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "emove-nosex", 12)) {
+	  pc.filter_flags |= kfFilterPsamReq | kfFilterExclNosex;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "ead-freq", 9)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = alloc_fname(argv[arg_idx + 1], flagname_p, 0, &pc.read_freq_fname);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterAllReq;
+	} else if (!memcmp(flagname_p2, "equire-pheno", 13)) {
+	  if (param_ct) {
+	    reterr = alloc_and_flatten(&(argv[arg_idx + 1]), param_ct, 0x7fffffff, &require_pheno_flattened);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	  }
+	  pc.misc_flags |= kfMiscRequirePheno;
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "equire-covar", 13)) {
+	  if (param_ct) {
+	    reterr = alloc_and_flatten(&(argv[arg_idx + 1]), param_ct, 0x7fffffff, &require_covar_flattened);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	  }
+	  pc.misc_flags |= kfMiscRequireCovar;
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "emove-if", 9)) {
+	  reterr = validate_and_alloc_cmp_expr(&(argv[arg_idx + 1]), argv[arg_idx], param_ct, &pc.remove_if_expr);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "emove-cats", 11)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = alloc_fname(argv[arg_idx + 1], flagname_p, 0, &pc.remove_cats_fname);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "emove-cat-names", 16)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 0x7fffffff)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = alloc_and_flatten(&(argv[arg_idx + 1]), param_ct, kMaxIdBlen, &pc.remove_cat_names_flattened);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "emove-cat-pheno", 14)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = cmdline_alloc_string(argv[arg_idx + 1], argv[arg_idx], kMaxIdSlen, &pc.remove_cat_phenoname);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	} else if (!memcmp(flagname_p2, "ice", 4)) {
+	  if (chr_info.chrset_source) {
+	    logerrprint("Error: Conflicting chromosome-set flags.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  chr_info.chrset_source = kChrsetSourceCmdline;
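+	  // rice: 12 autosomes, all X/Y/XY/MT codes marked absent (-2);
+	  // haploid_mask 0x1fff sets bits 0-12, i.e. every chromosome code is
+	  // treated as haploid.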
+	  chr_info.autosome_ct = 12;
+	  chr_info.xymt_codes[0] = -2;
+	  chr_info.xymt_codes[1] = -2;
+	  chr_info.xymt_codes[2] = -2;
+	  chr_info.xymt_codes[3] = -2;
+	  chr_info.xymt_codes[4] = -2;
+	  chr_info.xymt_codes[5] = -2;
+	  chr_info.haploid_mask[0] = 0x1fff;
+	  goto main_param_zero;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+	
+      case 's':
+	if (!memcmp(flagname_p2, "eed", 4)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 0x7fffffff)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  rseed_ct = param_ct;
+	  if (pgl_malloc(param_ct * sizeof(int32_t), &rseeds)) {
+	    goto main_ret_NOMEM;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    if (scan_uint_capped(cur_modif, 0xffffffffU, &(rseeds[param_idx - 1]))) {
+	      sprintf(g_logbuf, "Error: Invalid --seed parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	} else if (!memcmp(flagname_p2, "plit-par", 9)) {
+	  if (pc.misc_flags & kfMiscMergePar) {
+	    logerrprint("Error: --split-par cannot be used with --merge-par.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct == 1) {
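+	    // single-parameter form: a recognized genome build code selects
+	    // that build's standard chrX PAR1-end / PAR2-start coordinates.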
+	    const char* build_code = argv[arg_idx + 1];
+	    if ((!strcmp(build_code, "b38")) || (!strcmp(build_code, "hg38"))) {
+	      pc.splitpar_bound1 = 2781479;
+	      pc.splitpar_bound2 = 155701383;
+	    } else if ((!strcmp(build_code, "b37")) || (!strcmp(build_code, "hg19"))) {
+	      pc.splitpar_bound1 = 2699520;
+	      pc.splitpar_bound2 = 154931044;
+	    } else if ((!strcmp(build_code, "b36")) || (!strcmp(build_code, "hg18"))) {
+	      pc.splitpar_bound1 = 2709521;
+	      pc.splitpar_bound2 = 154584237;
+	    } else {
+	      sprintf(g_logbuf, "Error: Unrecognized --split-par build code '%s'.\n", build_code);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  } else {
+	    if (scan_uint_defcap(argv[arg_idx + 1], &pc.splitpar_bound1)) {
+	      sprintf(g_logbuf, "Error: Invalid --split-par parameter '%s'.\n", argv[arg_idx + 1]);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	    if (scan_uint_defcap(argv[arg_idx + 2], &pc.splitpar_bound2) || (pc.splitpar_bound2 <= pc.splitpar_bound1)) {
+	      sprintf(g_logbuf, "Error: Invalid --split-par parameter '%s'.\n", argv[arg_idx + 2]);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  pc.filter_flags |= kfFilterPvarReq | kfFilterNoSplitChr;
+	} else if ((!memcmp(flagname_p2, "et-all-var-ids", 15)) || (!memcmp(flagname_p2, "et-missing-var-ids", 19))) {
+	  if (flagname_p2[3] == 'm') {
+	    if (pc.varid_template) {
+	      logerrprint("Error: --set-missing-var-ids cannot be used with --set-all-var-ids.\n");
+	      goto main_ret_INVALID_CMDLINE;
+	    }
+	    pc.misc_flags |= kfMiscSetMissingVarIds;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (!varid_template_is_valid(argv[arg_idx + 1], flagname_p)) {
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  reterr = cmdline_alloc_string(argv[arg_idx + 1], argv[arg_idx], kMaxIdSlen, &pc.varid_template);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPvarReq;
+	} else if (!memcmp(flagname_p2, "et-hh-missing", 14)) {
+	  if (!(pc.command_flags1 & kfCommand1MakePlink2)) {
+	    logerrprint("Error: --set-hh-missing must be used with --make-{b}pgen/--make-bed.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  make_plink2_modifier |= kfMakePlink2SetHhMissing;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "et-mixed-mt-missing", 20)) {
+	  if (!(pc.command_flags1 & kfCommand1MakePlink2)) {
+	    logerrprint("Error: --set-mixed-mt-missing must be used with --make-{b}pgen/--make-bed.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  make_plink2_modifier |= kfMakePlink2SetMixedMtMissing;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "ample", 6)) {
+	  if (load_params || (xload & (~(kfXloadOxGen | kfXloadOxBgen | kfXloadOxHaps | kfXloadOxLegend | kfXloadOxSample)))) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (!(xload & (kfXloadOxGen | kfXloadOxBgen | kfXloadOxHaps))) {
+	    logerrprint("Error: --sample must be used with --gen/--bgen/--data/--haps.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* cur_fname = argv[arg_idx + 1];
+	  const uint32_t slen = strlen(cur_fname);
+	  if (slen > kPglFnamesize - 1) {
+	    logerrprint("Error: --sample filename too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  memcpy(psamname, cur_fname, slen + 1);
+	  xload |= kfXloadOxSample;
+	} else if (!memcmp(flagname_p2, "heep", 5)) {
+	  if (chr_info.chrset_source) {
+	    logerrprint("Error: Conflicting chromosome-set flags.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  chr_info.chrset_source = kChrsetSourceCmdline;
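+	  // sheep: 26 autosomes, X = 27, Y = 28, no XY/MT codes; haploid_mask
+	  // 0x18000000 sets bits 27-28, so only X and Y are haploid.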
+	  chr_info.autosome_ct = 26;
+	  chr_info.xymt_codes[0] = 27;
+	  chr_info.xymt_codes[1] = 28;
+	  chr_info.xymt_codes[2] = -2;
+	  chr_info.xymt_codes[3] = -2;
+	  chr_info.xymt_codes[4] = -2;
+	  chr_info.xymt_codes[5] = -2;
+	  chr_info.haploid_mask[0] = 0x18000000;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "np", 3)) {
+	  if (pc.varid_exclude_snp) {
+	    // problematic due to --window
+	    logerrprint("Error: --snp cannot be used with --exclude-snp.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = cmdline_alloc_string(argv[arg_idx + 1], argv[arg_idx], kMaxIdSlen, &pc.varid_snp);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPvarReq;
+	} else if (!memcmp(flagname_p2, "nps", 4)) {
+	  reterr = parse_name_ranges(&(argv[arg_idx]), errstr_append, param_ct, 0, range_delim, &pc.snps_range_list);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPvarReq;
+	} else if (!memcmp(flagname_p2, "nps-only", 9)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct) {
+	    const char* cur_modif = argv[arg_idx + 1];
+	    if (!strcmp(cur_modif, "just-acgt")) {
+	      pc.filter_flags |= kfFilterSnpsOnlyJustAcgt;
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --snps-only parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  pc.filter_flags |= kfFilterPvarReq | kfFilterSnpsOnly;
+	} else if (!memcmp(flagname_p2, "core", 5)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 11)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = alloc_fname(argv[arg_idx + 1], flagname_p, 0, &pc.score_info.input_fname);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  uint32_t numeric_param_ct = 0;
+	  uint32_t score_cols[3];
+	  for (uint32_t param_idx = 2; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 6) && (!memcmp(cur_modif, "header", 6))) {
+	      pc.score_info.flags |= kfScoreHeaderIgnore;
+	    } else if ((cur_modif_slen == 11) && (!memcmp(cur_modif, "header-read", 11))) {
+	      pc.score_info.flags |= kfScoreHeaderRead;
+	    } else if ((cur_modif_slen == 18) && (!memcmp(cur_modif, "no-mean-imputation", 18))) {
+	      pc.score_info.flags |= kfScoreNoMeanimpute;
+	    } else if ((cur_modif_slen == 6) && (!memcmp(cur_modif, "center", 6))) {
+	      pc.score_info.flags |= kfScoreCenter;
+	    } else if ((cur_modif_slen == 20) && (!memcmp(cur_modif, "variance-standardize", 20))) {
+	      pc.score_info.flags |= kfScoreVarianceStandardize;
+	    } else if ((cur_modif_slen == 18) && (!memcmp(cur_modif, "variance-normalize", 18))) {
+	      logerrprint("Note: --score's 'variance-normalize' modifier has been renamed to the more\nprecise 'variance-standardize'.\n");
+	      pc.score_info.flags |= kfScoreVarianceStandardize;
+	    } else if ((cur_modif_slen == 2) && (!memcmp(cur_modif, "se", 2))) {
+	      pc.score_info.flags |= kfScoreSe;
+	    } else if ((cur_modif_slen == 2) && (!memcmp(cur_modif, "zs", 2))) {
+	      pc.score_info.flags |= kfScoreZs;
+	    } else if ((cur_modif_slen == 13) && (!memcmp(cur_modif, "list-variants", 13))) {
+	      pc.score_info.flags |= kfScoreListVariants;
+	    } else if ((cur_modif_slen == 16) && (!memcmp(cur_modif, "list-variants-zs", 16))) {
+	      pc.score_info.flags |= kfScoreListVariants | kfScoreListVariantsZs;
+	    } else if ((cur_modif_slen > 5) && (!memcmp(cur_modif, "cols=", 5))) {
+	      if (pc.score_info.flags & kfScoreColAll) {
+		logerrprint("Error: Multiple --score cols= modifiers.\n");
+		goto main_ret_INVALID_CMDLINE;
+	      }
+	      reterr = parse_col_descriptor(&(cur_modif[5]), "maybesid\0sid\0pheno1\0phenos\0nmissallele\0denom\0dosagesum\0scoreavgs\0scoresums\0", "score", kfScoreColMaybesid, kfScoreColDefault, 1, &pc.score_info.flags);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	    } else if (numeric_param_ct == 3) {
+	      logerrprint("Error: --score takes at most three numeric parameters.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    } else {
+	      if (scan_posint_capped(cur_modif, kMaxLongLine / 2, &(score_cols[numeric_param_ct]))) {
+		sprintf(g_logbuf, "Error: Invalid --score parameter '%s'.\n", cur_modif);
+		goto main_ret_INVALID_CMDLINE_WWA;
+	      }
+	      for (uint32_t uii = 0; uii < numeric_param_ct; ++uii) {
+		if (score_cols[uii] == score_cols[numeric_param_ct]) {
+		  logerrprint("Error: Identical --score column indexes.\n");
+		  goto main_ret_INVALID_CMDLINE_A;
+		}
+	      }
+	      ++numeric_param_ct;
+	    }
+	  }
+	  if ((pc.score_info.flags & (kfScoreHeaderIgnore | kfScoreHeaderRead)) == (kfScoreHeaderIgnore | kfScoreHeaderRead)) {
+	    logerrprint("Error: --score 'header' and 'header-read' modifiers cannot be used together.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (!(pc.score_info.flags & kfScoreColAll)) {
+	    pc.score_info.flags |= kfScoreColDefault;
+	  }
+	  if (numeric_param_ct) {
+	    pc.score_info.varid_col_p1 = score_cols[0];
+	  }
+	  if (numeric_param_ct > 1) {
+	    pc.score_info.allele_col_p1 = score_cols[1];
+	  } else {
+	    pc.score_info.allele_col_p1 = pc.score_info.varid_col_p1 + 1;
+	  }
+	  if (numeric_param_ct == 3) {
+	    // a bit artificial, but it works
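+	    // construct a single-element range list holding the decimal
+	    // column index, mimicking what --score-col-nums would have
+	    // produced for the same value.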
+	    const uint32_t col_idx = score_cols[2];
+	    const uint32_t col_idx_blen = 1 + int_slen(col_idx);
+	    char* new_buf;
+	    if (pgl_malloc(col_idx_blen + 1, &new_buf)) {
+	      goto main_ret_NOMEM;
+	    }
+	    pc.score_info.input_col_idx_range_list.names = new_buf;
+	    pc.score_info.input_col_idx_range_list.name_max_blen = col_idx_blen;
+	    pc.score_info.input_col_idx_range_list.name_ct = 1;
+	    uint32toa_x(col_idx, '\0', new_buf);
+	    new_buf[col_idx_blen] = '\0';
+	    pc.score_info.input_col_idx_range_list.starts_range = (unsigned char*)(&(new_buf[col_idx_blen]));
+	  }
+	  pc.command_flags1 |= kfCommand1Score;
+	} else if (!memcmp(flagname_p2, "core-col-nums", 14)) {
+	  if (!(pc.command_flags1 & kfCommand1Score)) {
+	    logerrprint("Error: --score-col-nums must be used with --score.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (pc.score_info.input_col_idx_range_list.name_ct) {
+	    logerrprint("Error: --score-col-nums cannot be used when three numeric parameters are\nprovided to --score.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  reterr = parse_name_ranges(&(argv[arg_idx]), errstr_append, param_ct, 1, '-', &pc.score_info.input_col_idx_range_list);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	} else if (!memcmp(flagname_p2, "plit-cat-pheno", 15)) {
+	  uint32_t first_phenoname_idx = 1;
+	  for (; first_phenoname_idx <= param_ct; ++first_phenoname_idx) {
+	    const char* cur_modif = argv[arg_idx + first_phenoname_idx];
+	    if (!strcmp(cur_modif, "omit-last")) {
+	      pc.pheno_transform_flags |= kfPhenoTransformSplitCatOmitLast;
+	    } else if (!strcmp(cur_modif, "covar-01")) {
+	      pc.pheno_transform_flags |= kfPhenoTransformSplitCatCovar01;
+	    } else {
+	      break;
+	    }
+	  }
+	  if (first_phenoname_idx <= param_ct) {
+	    reterr = alloc_and_flatten(&(argv[arg_idx + first_phenoname_idx]), param_ct + 1 - first_phenoname_idx, kMaxIdSlen - 1, &pc.split_cat_phenonames_flattened);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	    // may as well verify that no phenotype name has an '=' in it
+	    char* phenonames_iter = pc.split_cat_phenonames_flattened;
+	    do {
+	      const uint32_t cur_phenoname_slen = strlen(phenonames_iter);
+	      if (memchr(phenonames_iter, '=', cur_phenoname_slen)) {
+		logerrprint("Error: --split-cat-pheno phenotype names may not contain the '=' character.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	      phenonames_iter = &(phenonames_iter[cur_phenoname_slen + 1]);
+	    } while (*phenonames_iter);
+	  } else if (pc.pheno_transform_flags & kfPhenoTransformSplitCatCovar01) {
+	    logerrprint("Error: --split-cat-pheno 'covar-01' modifier cannot be used without any\nphenotype names.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  pc.pheno_transform_flags |= kfPhenoTransformSplitCat;
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "ort-vars", 9)) {
+	  logerrprint("Error: --sort-vars is not implemented yet.\n");
+	  reterr = kPglRetNotYetSupported;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+	
+      case 't':
+	if (!memcmp(flagname_p2, "hreads", 7)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (scan_posint_defcap(argv[arg_idx + 1], &pc.max_thread_ct)) {
+	    sprintf(g_logbuf, "Error: Invalid --threads parameter '%s'.\n", argv[arg_idx + 1]);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if (pc.max_thread_ct > kMaxThreads) {
+	    LOGPRINTF("Note: Reducing --threads parameter to %u.  (If this is not large enough,\nrecompile with a larger kMaxThreads setting.)\n", kMaxThreads);
+	    pc.max_thread_ct = kMaxThreads;
+	  } else if (known_procs == -1) {
+	    // trigger BLAS/LAPACK warning?
+	    known_procs = 0;
+	  }
+	} else if (!memcmp(flagname_p2, "o", 2)) {
+	  if (chr_info.is_include_stack || notchr_present) {
+	    logerrprint("Error: --from/--to cannot be used with --autosome{-par} or --{not-}chr.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = cmdline_alloc_string(argv[arg_idx + 1], argv[arg_idx], kMaxIdSlen, &pc.varid_to);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  pc.filter_flags |= kfFilterPvarReq | kfFilterNoSplitChr;
+	} else if ((!memcmp(flagname_p2, "o-bp", 5)) || (!memcmp(flagname_p2, "o-kb", 5)) || (!memcmp(flagname_p2, "o-mb", 5))) {
+	  if (!cmdline_single_chr(&chr_info, pc.misc_flags)) {
+	    logerrprint("Error: --from-bp/-kb/-mb and --to-bp/-kb/-mb must be used with --chr, and only\none chromosome.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (notchr_present) {
+	    logerrprint("Error: --from-bp/-kb/-mb and --to-bp/-kb/-mb cannot be used with --not-chr.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (pc.to_bp != -1) {
+	    logerrprint("Error: Multiple --to-bp/-kb/-mb values.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  double dxx;
+	  if (!scanadv_double(cur_modif, &dxx)) {
+	    sprintf(g_logbuf, "Error: Invalid --to-bp/-kb/-mb parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  const char unit_char = flagname_p2[2];
+	  if (unit_char == 'k') {
+	    dxx *= 1000;
+	  } else if (unit_char == 'm') {
+	    dxx *= 1000000;
+	  }
+	  if (dxx < 0) {
+	    LOGERRPRINTF("Error: --to-bp/-kb/-mb parameter '%s' too small.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_A;
+	  } else if (dxx >= 2147483646) {
+	    pc.to_bp = 0x7ffffffe;
+	  } else {
+	    // round down
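+	    // the (1 + kSmallEpsilon) factor nudges products sitting just
+	    // below an integer (from binary rounding error) back up before
+	    // the truncating cast.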
+	    pc.to_bp = (int32_t)(dxx * (1 + kSmallEpsilon));
+	  }
+	  if (pc.from_bp > pc.to_bp) {
+	    // (if we do permit this, rounding must be postponed)
+	    logerrprint("Error: --to-bp/-kb/-mb parameter is smaller than --from-bp/-kb/-mb parameter.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  pc.filter_flags |= kfFilterPvarReq;
+	} else if (!memcmp(flagname_p2, "ests", 5)) {
+	  if (!(pc.command_flags1 & kfCommand1Glm)) {
+	    logerrprint("Error: --tests must be used with --glm.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if ((param_ct == 1) && (!strcmp(argv[arg_idx + 1], "all"))) {
+	    pc.glm_info.flags |= kfGlmTestsAll;
+	  } else {
+	    reterr = parse_name_ranges(&(argv[arg_idx]), errstr_append, param_ct, 1, '-', &pc.glm_info.tests_range_list);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	  }
+	  logerrprint("Error: --tests is not implemented yet.\n");
+	  reterr = kPglRetNotYetSupported;
+	  goto main_ret_1;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'u':
+	if (!memcmp(flagname_p2, "pdate-sex", 10)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  reterr = alloc_fname(argv[arg_idx + 1], flagname_p, 0, &pc.update_sex_fname);
+	  if (reterr) {
+	    goto main_ret_1;
+	  }
+	  if (param_ct == 2) {
+	    const char* cur_modif = argv[arg_idx + 2];
+	    if (scan_posint_defcap(cur_modif, &pc.update_sex_colm2)) {
+	      sprintf(g_logbuf, "Error: Invalid --update-sex column parameter '%s'. (This must be a positive integer.)\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+	
+      case 'v':
+	if (!memcmp(flagname_p2, "ar-min-qual", 12)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (scan_float(argv[arg_idx + 1], &pc.var_min_qual) || (pc.var_min_qual < 0.0)) {
+	    sprintf(g_logbuf, "Error: Invalid --var-min-qual parameter '%s'.\n", argv[arg_idx + 1]);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
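+	  // scale down by epsilon, presumably so variants whose QUAL exactly
+	  // equals the requested threshold aren't excluded by floating-point
+	  // rounding.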
+	  pc.var_min_qual *= 1 - kSmallEpsilon;
+	  pc.filter_flags |= kfFilterPvarReq;
+	} else if (!memcmp(flagname_p2, "ar-filter", 10)) {
+	  if (param_ct) {
+	    reterr = alloc_and_flatten(&(argv[arg_idx + 1]), param_ct, 0x7fffffff, &var_filter_exceptions_flattened);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	  }
+	  pc.misc_flags |= kfMiscExcludePvarFilterFail;
+	  pc.filter_flags |= kfFilterPvarReq;
+        } else if (!memcmp(flagname_p2, "cf", 3)) {
+	  // permit accompanying .fam/.psam
+	  // IIDs must match VCF sample line order
+	  if ((load_params & (~kfLoadParamsPsam)) || xload) {
+	    goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct == 2) {
+	    const char* cur_modif = argv[arg_idx + 2];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen < 8) || memcmp(cur_modif, "dosage=", 7)) {
+	      sprintf(g_logbuf, "Error: Invalid --vcf parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	    reterr = cmdline_alloc_string(&(cur_modif[7]), argv[arg_idx], 4095, &vcf_dosage_import_field);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	    if (!is_alphanumeric(vcf_dosage_import_field)) {
+	      logerrprint("Error: --vcf dosage= parameter is not alphanumeric.\n");
+	      goto main_ret_INVALID_CMDLINE;
+	    }
+	    if (!strcmp(vcf_dosage_import_field, "GT")) {
+	      logerrprint("Error: --vcf dosage= parameter cannot be 'GT'.\n");
+	      goto main_ret_INVALID_CMDLINE;
+	    }
+	  }
+	  const char* cur_modif = argv[arg_idx + 1];
+	  const uint32_t slen = strlen(cur_modif);
+	  if (slen > kPglFnamesize - 1) {
+	    logerrprint("Error: --vcf filename too long.\n");
+	    goto main_ret_OPEN_FAIL;
+	  }
+	  memcpy(pgenname, cur_modif, slen + 1);
+	  xload = kfXloadVcf;
+        } else if (!memcmp(flagname_p2, "cf-min-gp", 10)) {
+	  logerrprint("Error: --vcf-min-gp is no longer supported.  Use --import-dosage-certainty\ninstead.\n");
+	  goto main_ret_INVALID_CMDLINE_A;
+	} else if ((!memcmp(flagname_p2, "cf-min-gq", 10)) || (!memcmp(flagname_p2, "cf-min-dp", 10))) {
+	  if (!(xload & kfXloadVcf)) {
+	    // todo: support BCF too
+	    LOGERRPRINTF("Error: --%s must be used with --vcf.\n", flagname_p);
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* cur_modif = argv[arg_idx + 1];
+	  uint32_t uii;
+	  if (scan_uint_defcap(cur_modif, &uii)) {
+	    sprintf(g_logbuf, "Error: Invalid --%s parameter '%s'.\n", flagname_p, cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if (flagname_p2[7] == 'g') {
+	    vcf_min_gq = uii;
+	  } else {
+	    vcf_min_dp = uii;
+	  }
+	} else if (!memcmp(flagname_p2, "cf-idspace-to", 14)) {
+	  if (!(xload & (kfXloadVcf | kfXloadBcf))) {
+	    logerrprint("Error: --vcf-idspace-to must be used with --vcf/--bcf.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  if (id_delim == ' ') {
+	    logerrprint("Error: --vcf-idspace-to cannot be used when the --id-delim character is space.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  idspace_to = extract_char_param(argv[arg_idx + 1]);
+	  if (!idspace_to) {
+	    logerrprint("Error: --vcf-idspace-to parameter must be a single character.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (((unsigned char)idspace_to) <= ' ') {
+	    logerrprint("Error: --vcf-idspace-to parameter must be a nonspace character.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	} else if (!memcmp(flagname_p2, "cf-half-call", 13)) {
+	  if (!(xload & kfXloadVcf)) {
+	    logerrprint("Error: --vcf-half-call must be used with --vcf.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* half_call_mode_str = argv[arg_idx + 1];
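+	  // masking with 0xdf clears the 0x20 bit, upcasing ASCII letters
+	  // ('h' -> 'H') for the single-character comparisons below.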
+	  const char first_char_upcase_match = half_call_mode_str[0] & 0xdf;
+	  const uint32_t is_short_name = (half_call_mode_str[1] == '\0');
+	  if ((is_short_name && (first_char_upcase_match == 'H')) || (!strcmp(half_call_mode_str, "haploid"))) {
+	    vcf_half_call = kVcfHalfCallHaploid;
+	  } else if ((is_short_name && (first_char_upcase_match == 'M')) || (!strcmp(half_call_mode_str, "missing"))) {
+	    vcf_half_call = kVcfHalfCallMissing;
+	  } else if ((is_short_name && (first_char_upcase_match == 'E')) || (!strcmp(half_call_mode_str, "error"))) {
+	    vcf_half_call = kVcfHalfCallError;
+	  } else if ((is_short_name && (first_char_upcase_match == 'R')) || (!strcmp(half_call_mode_str, "reference"))) {
+	    vcf_half_call = kVcfHalfCallReference;
+	  } else {
+	    sprintf(g_logbuf, "Error: '%s' is not a valid mode for --vcf-half-call.\n", half_call_mode_str);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	} else if (!memcmp(flagname_p2, "cf-require-gt", 14)) {
+	  if (!(xload & (kfXloadVcf | kfXloadBcf))) {
+	    logerrprint("Error: --vcf-require-gt must be used with --vcf/--bcf.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  pc.misc_flags |= kfMiscVcfRequireGt;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "if", 3)) {
+	  if (!(pc.command_flags1 & kfCommand1Glm)) {
+	    logerrprint("Error: --vif must be used with --glm/--epistasis.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  if (!scanadv_double(cur_modif, &pc.vif_thresh)) {
+	    sprintf(g_logbuf, "Error: Invalid --glm/--epistasis VIF threshold '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if (pc.vif_thresh < 1.0) {
+	    sprintf(g_logbuf, "Error: --glm/--epistasis VIF threshold '%s' too small (must be >= 1).\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	} else if (!memcmp(flagname_p2, "ariance-standardize", 20)) {
+	  if (pc.pheno_transform_flags & kfPhenoTransformVstdCovar) {
+	    logerrprint("Error: --variance-standardize cannot be used with --covar-variance-standardize.\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  }
+	  if (param_ct) {
+	    reterr = alloc_and_flatten(&(argv[arg_idx + 1]), param_ct, 0x7fffffff, &pc.vstd_flattened);
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	  }
+	  pc.pheno_transform_flags |= kfPhenoTransformVstdAll;
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "alidate", 8)) {
+	  pc.command_flags1 |= kfCommand1Validate;
+	  goto main_param_zero;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+	
+      case 'w':
+	if (!memcmp(flagname_p2, "rite-snplist", 13)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct) {
+	    const char* cur_modif = argv[arg_idx + 1];
+	    if (strcmp(cur_modif, "zs")) {
+	      sprintf(g_logbuf, "Error: Invalid --write-snplist parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	    pc.misc_flags |= kfMiscWriteSnplistZs;
+	  }
+	  pc.command_flags1 |= kfCommand1WriteSnplist;
+	} else if (!memcmp(flagname_p2, "indow", 6)) {
+	  if (!(pc.varid_snp || pc.varid_exclude_snp)) {
+	    logerrprint("Error: --window must be used with --snp or --exclude-snp.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  char* cur_modif = argv[arg_idx + 1];
+	  double dxx;
+	  if (!scanadv_double(cur_modif, &dxx) || (dxx < 0)) {
+	    sprintf(g_logbuf, "Error: Invalid --window parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
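+	  // --window takes a kb-denominated total width; *1000 converts to bp
+	  // and /2 yields the distance allowed on each side of the named
+	  // variant, hence the combined *500.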
+	  dxx *= 500 * (1 + kSmallEpsilon);
+	  if (dxx > 2147483646) {
+	    pc.window_bp = 0x7ffffffe;
+	  } else {
+	    pc.window_bp = (int32_t)dxx;
+	  }
+	  pc.filter_flags |= kfFilterNoSplitChr;
+	  // no need to set kfFilterPvarReq due to --snp/--exclude-snp req.
+	} else if (!memcmp(flagname_p2, "ithin", 6)) {
+	  if (pc.misc_flags & kfMiscCatPhenoFamily) {
+	    logerrprint("Error: --within cannot be used with --family.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 2)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  for (uint32_t param_idx = 1; param_idx <= param_ct; ++param_idx) {
+	    const char* cur_modif = argv[arg_idx + param_idx];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen == 7) && (!memcmp(cur_modif, "keep-", 5)) && match_upper_counted(&(cur_modif[5]), "NA", 2)) {
+	      logerrprint("Error: --within's keep-NA modifier has been retired.  Rename that category in\nthe input file if you wish to keep it.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    if (param_idx == 1) {
+	      reterr = alloc_fname(cur_modif, flagname_p, 0, &pc.within_fname);
+	    } else {
+	      if (is_reserved_pheno_name(cur_modif, cur_modif_slen)) {
+		sprintf(g_logbuf, "Error: '%s' cannot be used as a categorical phenotype name.\n", cur_modif);
+		goto main_ret_INVALID_CMDLINE_2A;
+	      }
+	      reterr = cmdline_alloc_string(cur_modif, argv[arg_idx], kMaxIdSlen, &pc.catpheno_name);
+	    }
+	    if (reterr) {
+	      goto main_ret_1;
+	    }
+	  }
+	  pc.filter_flags |= kfFilterPsamReq;
+	} else if (!memcmp(flagname_p2, "rite-covar", 11)) {
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 0, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (param_ct) {
+	    const char* cur_modif = argv[arg_idx + 1];
+	    const uint32_t cur_modif_slen = strlen(cur_modif);
+	    if ((cur_modif_slen >= 5) && (!memcmp(cur_modif, "cols=", 5))) {
+	      reterr = parse_col_descriptor(&(cur_modif[5]), "maybesid\0sid\0maybeparents\0parents\0sex\0pheno1\0phenos\0", "write-covar", kfWriteCovarColMaybesid, kfWriteCovarColDefault, 0, &pc.write_covar_flags);
+	      if (reterr) {
+		goto main_ret_1;
+	      }
+	    } else {
+	      sprintf(g_logbuf, "Error: Invalid --write-covar parameter '%s'.\n", cur_modif);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	  } else {
+	    pc.write_covar_flags |= kfWriteCovarColDefault;
+	  }
+	  pc.command_flags1 |= kfCommand1WriteCovar;
+	} else if (!memcmp(flagname_p2, "arning-errcode", 15)) {
+	  warning_errcode = 1;
+	  goto main_param_zero;
+	} else if (!memcmp(flagname_p2, "rite-cluster", 13)) {
+	  logerrprint("Error: --write-cluster is retired.  Use e.g. --make-just-psam.\n");
+	  goto main_ret_INVALID_CMDLINE_A;
+	} else if (!memcmp(flagname_p2, "ith-phenotype", 14)) {
+	  logerrprint("Error: --with-phenotype is retired.  Use --write-covar cols=... instead.\n");
+	  goto main_ret_INVALID_CMDLINE_A;
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+
+      case 'x':
+	if (!memcmp(flagname_p2, "chr-model", 10)) {
+	  if (!(pc.command_flags1 & (kfCommand1Glm | kfCommand1Score))) {
+	    logerrprint("Error: --xchr-model must be used with --glm or --score.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	  if (pc.glm_info.flags & (kfGlmGenotypic | kfGlmHethom | kfGlmDominant | kfGlmRecessive)) {
+	    sprintf(g_logbuf, "Error: --xchr-model cannot be used with --glm %s.\n", (pc.glm_info.flags & kfGlmGenotypic)? "genotypic" : ((pc.glm_info.flags & kfGlmHethom)? "hethom" : ((pc.glm_info.flags & kfGlmDominant)? "dominant" : "recessive")));
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  if (enforce_param_ct_range(argv[arg_idx], param_ct, 1, 1)) {
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  }
+	  const char* cur_modif = argv[arg_idx + 1];
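+	  // '0' is ASCII 48; a non-digit parameter (or extract_char_param()
+	  // failure) wraps to a large unsigned value and fails the check
+	  // below.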
+	  pc.xchr_model = ((uint32_t)extract_char_param(cur_modif)) - 48;
+	  if (pc.xchr_model > 2) {
+	    sprintf(g_logbuf, "Error: Invalid --xchr-model parameter '%s'.\n", cur_modif);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	} else {
+	  goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+	}
+	break;
+	
+      default:
+	goto main_ret_INVALID_CMDLINE_UNRECOGNIZED;
+      main_param_zero:
+	if (param_ct) {
+	  sprintf(g_logbuf, "Error: --%s doesn't accept parameters.\n", flagname_p);
+	  goto main_ret_INVALID_CMDLINE_2A;
+	}
+      }
+    } while ((++cur_flag_idx) < flag_ct);
+    if (!outname_end) {
+      outname_end = &(outname[6]);
+    }
+    
+    if ((!pc.command_flags1) && (!(xload & (kfXloadVcf | kfXloadBcf | kfXloadOxBgen | kfXloadOxHaps | kfXloadOxSample | kfXloadPlink1Dosage | kfXloadGenDummy)))) {
+      // add command_flags2 when needed
+      goto main_ret_NULL_CALC;
+    }
+    if (!(load_params || xload)) {
+      logerrprint("Error: No input dataset.\n");
+      goto main_ret_INVALID_CMDLINE_A;
+    }
+    if ((xload & kfXloadOxGen) && (!(xload & kfXloadOxSample))) {
+      // could permit .fam/.psam, but unless Oxford software supports that mode
+      // it's pointless
+      logerrprint("Error: --gen must be used with --sample or --data.\n");
+      goto main_ret_INVALID_CMDLINE_A;
+    }
+    if ((xload & kfXloadOxSample) && (pc.misc_flags & kfMiscAffection01)) {
+      // necessary for --data and --data --make-pgen to yield the same output
+      logerrprint("Error: --data/--sample cannot be used with --1.\n");
+      goto main_ret_INVALID_CMDLINE_A;
+    }
+    if ((pc.sample_sort_flags != kfSort0) && (!(pc.command_flags1 & (kfCommand1MakePlink2 | kfCommand1WriteCovar)))) {
+      // todo: permit merge
+      logerrprint("Error: --indiv-sort must be used with --make-{b}pgen/--make-bed/--write-covar\nor dataset merging.\n");
+      goto main_ret_INVALID_CMDLINE_A;
+    }
+    if ((make_plink2_modifier & (kfMakePlink2MMask | kfMakePlink2TrimAlts | kfMakePgenEraseAlt2Plus | kfMakePgenErasePhase | kfMakePgenEraseDosage)) && (pc.command_flags1 & (~kfCommand1MakePlink2))) {
+      logerrprint("Error: When the 'multiallelics=', 'trim-alts', and/or 'erase-...' modifier is\npresent, --make-bed/--make-{b}pgen cannot be combined with other commands.\n(Other filters are fine.)\n");
+      goto main_ret_INVALID_CMDLINE;
+    }
+    if (aperm_present && (pc.command_flags1 & kfCommand1Glm) && (!(pc.glm_info.flags & kfGlmPerm))) {
+      // If --aperm is present, and at least one association analysis command
+      // which supports adaptive permutation testing was also specified, but
+      // no actual adaptive permutation test is happening, the user is likely
+      // to be confused.  Produce a warning.  (Not an error, since a
+      // sophisticated user may want to use --script with different --aperm
+      // defaults.)
+      logerrprint("Warning: --aperm only controls the settings for adaptive permutation tests; it\ndoes not cause such a test to be performed.  (Did you forget to add the 'perm'\nmodifier to an association analysis flag?)\n");
+    }
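+    // hard_call_thresh == 0xffffffffU means --hard-call-threshold wasn't
+    // specified; kDosageMid / 10 corresponds to the default 0.1 threshold
+    // the error message below refers to.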
+    if ((pc.hard_call_thresh == 0xffffffffU) && (xload & (kfXloadVcf | kfXloadBcf | kfXloadOxGen | kfXloadOxBgen))) {
+      if (pc.dosage_erase_thresh > (kDosageMid / 10)) {
+	logerrprint("Error: --dosage-erase-threshold value cannot be larger than (default)\n--hard-call-threshold value.\n");
+	goto main_ret_INVALID_CMDLINE_A;
+      }
+    } else {
+      if (pc.dosage_erase_thresh > pc.hard_call_thresh) {
+	logerrprint("Error: --dosage-erase-threshold value cannot be larger than\n--hard-call-threshold value.\n");
+	goto main_ret_INVALID_CMDLINE_A;
+      }
+    }
+    if ((oxford_import_flags & (kfOxfordImportRefFirst | kfOxfordImportRefSecond)) == (kfOxfordImportRefFirst | kfOxfordImportRefSecond)) {
+      logerrprint("Error: --data/--{b}gen 'ref-first' and 'ref-second' modifiers cannot be used\ntogether.\n");
+      goto main_ret_INVALID_CMDLINE;
+    }
+    if (!strcmp(g_missing_catname, g_output_missing_pheno)) {
+      logerrprint("Error: --missing-catname and --output-missing-phenotype strings can't match.\n");
+      goto main_ret_INVALID_CMDLINE_A;
+    }
+    if ((pc.misc_flags & kfMiscChrOverrideCmdline) && (!chr_info.chrset_source)) {
+      logerrprint("Error: --chr-override requires an explicit chromosome set.\n");
+      goto main_ret_INVALID_CMDLINE_A;
+    }
+    if ((xload & kfXloadPlink1Dosage) && (!(load_params & kfLoadParamsPsam))) {
+      logerrprint("Error: --import-dosage requires a .fam file.\n");
+      goto main_ret_INVALID_CMDLINE_A;
+    }
+    if (!permit_multiple_inclusion_filters) {
+      // Permit only one position- or ID-based variant inclusion filter, since
+      // it's not immediately obvious whether the union or intersection should be
+      // taken with multiple inclusion filters.
+      // However, multiple exclusion filters are fine.  (Also,
+      // --autosome{-par}/--chr are exempted since it's more obvious how they
+      // interact with other filters.)
+      const uint32_t inclusion_filter_extract = (pc.extract_fnames != nullptr);
+      const uint32_t inclusion_filter_fromto_id = pc.varid_from || pc.varid_to;
+      const uint32_t inclusion_filter_fromto_bp = (pc.from_bp != -1) || (pc.to_bp != -1);
+      const uint32_t inclusion_filter_snpflag = (pc.varid_snp != nullptr);
+      const uint32_t inclusion_filter_snpsflag = !!pc.snps_range_list.name_ct;
+      if (inclusion_filter_extract + inclusion_filter_fromto_id + inclusion_filter_fromto_bp + inclusion_filter_snpflag + inclusion_filter_snpsflag > 1) {
+	logerrprint("Error: Multiple variant inclusion filters specified (--extract, --from/--to,\n--from-bp/--to-bp, --snp, --snps).  Add --force-intersect if you really want\nthe intersection of these sets.  (If your variant IDs are unique, you can\nextract the union by e.g. running --write-snplist for each set, followed by\n--extract on all the .snplist files.)\n");
+	goto main_ret_INVALID_CMDLINE_A;
+      }
+    }
+
+    free_cond(subst_argv);
+    free_cond(script_buf);
+    free_cond(rerun_buf);
+    free_cond(flag_buf);
+    free_cond(flag_map);
+    subst_argv = nullptr;
+    script_buf = nullptr;
+    rerun_buf = nullptr;
+    flag_buf = nullptr;
+    flag_map = nullptr;
+    if (!rseeds) {
+      uint32_t seed = (uint32_t)time(nullptr);
+      sprintf(g_logbuf, "Random number seed: %u\n", seed);
+      logstr(g_logbuf);
+      sfmt_init_gen_rand(&g_sfmt, seed);
+    } else {
+      if (rseed_ct == 1) {
+	sfmt_init_gen_rand(&g_sfmt, rseeds[0]);
+      } else {
+	sfmt_init_by_array(&g_sfmt, rseeds, rseed_ct);
+      }
+      free(rseeds);
+      rseeds = nullptr;
+    }
+    
+    uint64_t total_mb = detect_mb();
+    if (!malloc_size_mb) {
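+      // no explicit workspace size requested: default to half of detected
+      // RAM, floored at kBigstackMinMb, with kBigstackDefaultMb as the
+      // fallback when detection fails.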
+      if (!total_mb) {
+	malloc_size_mb = kBigstackDefaultMb;
+      } else if (total_mb < (kBigstackMinMb * 2)) {
+	malloc_size_mb = kBigstackMinMb;
+      } else {
+	malloc_size_mb = total_mb / 2;
+      }
+    }
+    assert(malloc_size_mb >= (intptr_t)kBigstackMinMb);
+#ifndef __LP64__
+    if (malloc_size_mb > (intptr_t)kMalloc32bitMbMax) {
+      malloc_size_mb = kMalloc32bitMbMax;
+    }
+#endif
+    if (total_mb) {
+      sprintf(g_logbuf, "%" PRIu64 " MB RAM detected; reserving %" PRIdPTR " MB for main workspace.\n", total_mb, malloc_size_mb);
+    } else {
+      sprintf(g_logbuf, "Failed to determine total system memory.  Attempting to reserve %" PRIuPTR " MB.\n", malloc_size_mb);
+    }
+    logprintb();
+    uintptr_t malloc_mb_final;
+    if (init_bigstack(malloc_size_mb, &malloc_mb_final, &bigstack_ua)) {
+      goto main_ret_NOMEM;
+    }
+    g_input_missing_geno_ptr = &(g_one_char_strs[2 * ((unsigned char)input_missing_geno_char)]);
+    g_output_missing_geno_ptr = &(g_one_char_strs[2 * ((unsigned char)output_missing_geno_char)]);
+    if (((uintptr_t)malloc_size_mb) != malloc_mb_final) {
+      if (memory_require) {
+	goto main_ret_NOMEM;
+      }
+      LOGPRINTF("Allocated %" PRIuPTR " MB successfully, after larger attempt(s) failed.\n", malloc_mb_final);
+    }
+
+#ifndef _WIN32
+    pthread_attr_init(&g_smallstack_thread_attr);
+    pthread_attr_setstacksize(&g_smallstack_thread_attr, kDefaultThreadStack);
+#endif
+    // pigz_init(pc.max_thread_ct);
+
+    print_end_time = 1;
+    if (0) {
+      // nonstandard cases (CNV, etc.) here
+    } else {
+      if (pc.filter_flags) {
+	if (!pc.command_flags1) {
+	  logerrprint("Error: Basic file conversions do not support regular filtering operations.\nRerun your command with --make-bed/--make-{b}pgen.\n");
+	  goto main_ret_INVALID_CMDLINE;
+	}
+      }
+      // print this here since some import functions are now multithreaded
+      if (pc.max_thread_ct > 8) {
+	LOGPRINTF("Using up to %u threads (change this with --threads).\n", pc.max_thread_ct);
+      } else {
+	// "1 compute thread" instead of "1 thread" since, when
+	// max_thread_ct == 2, some code will use one I/O thread and one
+	// compute thread.  Not worth the trouble of writing special-case code
+	// to avoid that.  (also, with 2 cores, the I/O thread isn't
+	// sufficiently busy to justify only 1 compute thread.)
+	LOGPRINTF("Using %s%u compute thread%s.\n", (pc.max_thread_ct > 1)? "up to " : "", pc.max_thread_ct, (pc.max_thread_ct == 1)? "" : "s");
+      }
+      if (xload) {
+	char* convname_end = outname_end;
+	if (pc.command_flags1) {
+	  if (pc.misc_flags & kfMiscKeepAutoconv) {
+	    if (pc.misc_flags & kfMiscAffection01) {
+	      logerrprint("Error: --1 cannot be used with --keep-autoconv.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    if ((output_missing_geno_char != '.') && (output_missing_geno_char != input_missing_geno_char)) {
+	      logerrprint("Error: --output-missing-genotype and --input-missing-genotype parameters cannot\nbe inconsistent when --keep-autoconv is specified.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    double dxx;
+	    const char* num_end = scanadv_double(g_output_missing_pheno, &dxx);
+	    if (num_end) {
+	      if (dxx != ((double)pc.missing_pheno)) {
+		logerrprint("Error: --output-missing-phenotype and --input-missing-phenotype parameters\ncannot be inconsistent when --keep-autoconv is specified.\n");
+		goto main_ret_INVALID_CMDLINE_A;
+	      }
+	    } else if (!is_nan_str(g_output_missing_pheno, strlen(g_output_missing_pheno))) {
+	      logerrprint("Error: --output-missing-phenotype parameter must be numeric or 'NA' when\n--keep-autoconv is specified.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	  } else {
+	    convname_end = strcpya0(convname_end, "-temporary");
+	  }
+	} else {
+	  pc.misc_flags |= kfMiscKeepAutoconv;
+	}
+	const uint32_t convname_slen = (uintptr_t)(convname_end - outname);
+	const uint32_t psam_specified = (load_params & kfLoadParamsPsam);
+	if (xload & kfXloadVcf) {
+	  reterr = vcf_to_pgen(pgenname, psam_specified? psamname : nullptr, const_fid, vcf_dosage_import_field, pc.misc_flags, pc.hard_call_thresh, pc.dosage_erase_thresh, import_dosage_certainty, id_delim, idspace_to, vcf_min_gq, vcf_min_dp, vcf_half_call, pc.fam_cols, outname, convname_end, &chr_info);
+	} else if (xload & kfXloadVcf) {
+	  logerrprint("Error: --bcf is not implemented yet.\n");
+	  reterr = kPglRetNotYetSupported;
+	} else if (xload & kfXloadOxGen) {
+	  reterr = ox_gen_to_pgen(pgenname, psamname, import_single_chr_str, ox_missing_code, pc.misc_flags, oxford_import_flags, pc.hard_call_thresh, pc.dosage_erase_thresh, import_dosage_certainty, outname, convname_end, &chr_info);
+	} else if (xload & kfXloadOxBgen) {
+	  reterr = ox_bgen_to_pgen(pgenname, psamname, const_fid, ox_missing_code, pc.misc_flags, oxford_import_flags, pc.hard_call_thresh, pc.dosage_erase_thresh, import_dosage_certainty, id_delim, idspace_to, pc.max_thread_ct, outname, convname_end, &chr_info);
+	} else if (xload & kfXloadOxHaps) {
+	  reterr = ox_hapslegend_to_pgen(pgenname, pvarname, psamname, import_single_chr_str, ox_missing_code, pc.misc_flags, oxford_import_flags, outname, convname_end, &chr_info);
+	} else if (xload & kfXloadPlink1Dosage) {
+	  reterr = plink1_dosage_to_pgen(pgenname, psamname, (xload & kfXloadMap)? pvarname : nullptr, import_single_chr_str, &plink1_dosage_info, pc.misc_flags, pc.fam_cols, pc.missing_pheno, pc.hard_call_thresh, pc.dosage_erase_thresh, import_dosage_certainty, pc.max_thread_ct, outname, convname_end, &chr_info);
+	} else if (xload & kfXloadGenDummy) {
+	  reterr = generate_dummy(&gendummy_info, pc.misc_flags, pc.hard_call_thresh, pc.dosage_erase_thresh, pc.max_thread_ct, outname, convname_end, &chr_info);
+	}
+	if (reterr || (!pc.command_flags1)) {
+	  goto main_ret_1;
+	}
+
+	// todo: we have to skip this when merging is involved
+	pc.hard_call_thresh = 0xffffffffU;
+	
+	strcpy(memcpya(pgenname, outname, convname_slen), ".pgen");
+	strcpy(memcpya(pvarname, outname, convname_slen), ".pvar");
+	if (!psam_specified) {
+	  strcpy(memcpya(psamname, outname, convname_slen), ".psam");
+	}
+	if (!(pc.misc_flags & kfMiscKeepAutoconv)) {
+	  if (push_llstr(pgenname, &file_delete_list) || push_llstr(pvarname, &file_delete_list)) {
+	    goto main_ret_NOMEM;
+	  }
+	  if (!psam_specified) {
+	    if (push_llstr(psamname, &file_delete_list)) {
+	      goto main_ret_NOMEM;
+	    }
+	  }
+	}
+	*outname_end = '\0';
+      }
+      const uint32_t calc_all_req = (pc.command_flags1 & (~(kfCommand1MakePlink2 | kfCommand1Validate | kfCommand1WriteSnplist | kfCommand1WriteCovar))) || ((pc.command_flags1 & kfCommand1MakePlink2) && (make_plink2_modifier & (kfMakeBed | kfMakePgen)));
+      if (calc_all_req || (pc.filter_flags & kfFilterAllReq)) {
+	if ((!xload) && (load_params != kfLoadParamsPfileAll)) {
+	  logerrprint("Error: A full fileset (.pgen/.bed + .pvar/.bim + .psam/.fam) is required for\nthis.\n");
+	  goto main_ret_INVALID_CMDLINE_A;
+	}
+      } else {
+	// no genotype file required
+	pgenname[0] = '\0';
+	
+	const uint32_t calc_pvar_req = (pc.command_flags1 & (~(kfCommand1MakePlink2 | kfCommand1WriteCovar))) || ((pc.command_flags1 & kfCommand1MakePlink2) && (make_plink2_modifier & (kfMakeBed | kfMakeBim | kfMakePgen | kfMakePvar)));
+	if (calc_pvar_req || (pc.filter_flags & kfFilterPvarReq)) {
+	  if ((!xload) && (!(load_params & kfLoadParamsPvar))) {
+	    logerrprint("Error: A .pvar/.bim file is required for this.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	} else {
+	  pvarname[0] = '\0';
+	}
+	const uint32_t calc_psam_req = (pc.command_flags1 & (~(kfCommand1MakePlink2 | kfCommand1WriteSnplist))) || ((pc.command_flags1 & kfCommand1MakePlink2) && (make_plink2_modifier & (kfMakeBed | kfMakeFam | kfMakePgen | kfMakePsam)));
+	if (calc_psam_req || (pc.filter_flags & kfFilterPsamReq)) {
+	  if ((!xload) && (!(load_params & kfLoadParamsPsam))) {
+	    logerrprint("Error: A .psam/.fam file is required for this.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
+	} else {
+	  psamname[0] = '\0';
+	}
+      }
+      if (pc.command_flags1 & (~(kfCommand1MakePlink2 | kfCommand1Validate | kfCommand1WriteSnplist | kfCommand1WriteCovar))) {
+	pc.filter_flags |= kfFilterNoSplitChr;
+      }
+
+      BLAS_SET_NUM_THREADS(1);
+      reterr = plink2_core(var_filter_exceptions_flattened, require_pheno_flattened, require_covar_flattened, &pc, make_plink2_modifier, pgenname, psamname, pvarname, outname, outname_end, king_cutoff_fprefix, &chr_info);
+    }    
+  }
+  while (0) {
+  main_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  main_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  main_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  main_ret_INVALID_CMDLINE_UNRECOGNIZED:
+    invalid_arg(argv[arg_idx]);
+    logerrprintb();
+    logerrprint(errstr_append);
+    reterr = kPglRetInvalidCmdline;
+    break;
+  main_ret_INVALID_CMDLINE_INPUT_CONFLICT:
+    LOGERRPRINTF("Error: --%s conflicts with another input flag.\n%s", flagname_p, errstr_append);
+    reterr = kPglRetInvalidCmdline;
+    break;
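+    // The next three labels deliberately fall through:
+    // main_ret_INVALID_CMDLINE_WWA word-wraps g_logbuf, ..._2A prints it, and
+    // ..._A appends errstr_append before the shared kPglRetInvalidCmdline
+    // assignment.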
+  main_ret_INVALID_CMDLINE_WWA:
+    wordwrapb(0);
+  main_ret_INVALID_CMDLINE_2A:
+    logerrprintb();
+  main_ret_INVALID_CMDLINE_A:
+    logerrprint(errstr_append);
+  main_ret_INVALID_CMDLINE:
+    reterr = kPglRetInvalidCmdline;
+    break;
+  main_ret_NULL_CALC:
+    if (pc.filter_flags) {
+      logerrprint("Warning: No output requested.  (Did you forget --make-bed/--make-{b}pgen?)\nExiting.\n");
+    } else {
+      logerrprint("Warning: No output requested.  Exiting.\n");
+    }
+  main_ret_NULL_CALC_0:
+    fputs(g_cmdline_format_str, stdout);
+    fputs(notestr_null_calc2, stdout);
+    reterr = kPglRetSkipped;
+    break;
+  }
+ main_ret_1:
+  disp_exit_msg(reterr);
+  while (0) {
+  main_ret_NOMEM_NOLOG:
+    print_ver();
+  main_ret_NOMEM_NOLOG2:
+    fputs(errstr_nomem, stderr);
+    if (g_failed_alloc_attempt_size) {
+      fprintf(stderr, "Failed allocation size: %" PRIuPTR "\n", g_failed_alloc_attempt_size);
+    }
+    reterr = kPglRetNomem;
+    break;
+  main_ret_READ_FAIL_NOLOG:
+    print_ver();
+    fputs(errstr_read, stderr);
+    reterr = kPglRetReadFail;
+    break;
+  }
+ main_ret_NOLOG:
+  fclose_cond(scriptfile);
+  free_cond(vcf_dosage_import_field);
+  free_cond(ox_missing_code);
+  free_cond(import_single_chr_str);
+  free_cond(const_fid);
+  free_cond(require_covar_flattened);
+  free_cond(require_pheno_flattened);
+  free_cond(var_filter_exceptions_flattened);
+  free_cond(rseeds);
+  free_cond(subst_argv);
+  free_cond(script_buf);
+  free_cond(rerun_buf);
+  free_cond(flag_buf);
+  free_cond(flag_map);
+  free_cond(king_cutoff_fprefix);
+  free_cond(pc.covar_quantnorm_flattened);
+  free_cond(pc.quantnorm_flattened);
+  free_cond(pc.vstd_flattened);
+  free_cond(pc.split_cat_phenonames_flattened);
+  free_cond(pc.remove_cat_phenoname);
+  free_cond(pc.remove_cat_names_flattened);
+  free_cond(pc.remove_cats_fname);
+  free_cond(pc.keep_cat_phenoname);
+  free_cond(pc.keep_cat_names_flattened);
+  free_cond(pc.keep_cats_fname);
+  free_cond(pc.family_missing_catname);
+  free_cond(pc.catpheno_name);
+  free_cond(pc.within_fname);
+  free_cond(pc.read_freq_fname);
+  free_cond(pc.glm_local_covar_fname);
+  free_cond(pc.glm_local_pvar_fname);
+  free_cond(pc.glm_local_psam_fname);
+  free_cond(pc.freq_alt1_binstr);
+  free_cond(pc.freq_ref_binstr);
+  free_cond(pc.update_sex_fname);
+  free_cond(pc.removefam_fnames);
+  free_cond(pc.remove_fnames);
+  free_cond(pc.keepfam_fnames);
+  free_cond(pc.keep_fnames);
+  free_cond(pc.exclude_fnames);
+  free_cond(pc.extract_fnames);
+  free_cond(pc.sample_sort_fname);
+  free_cond(pc.covar_fname);
+  free_cond(pc.pheno_fname);
+  free_cond(pc.varid_exclude_snp);
+  free_cond(pc.varid_snp);
+  free_cond(pc.varid_to);
+  free_cond(pc.varid_from);
+  free_cond(pc.missing_varid_match);
+  free_cond(pc.varid_template);
+  if (file_delete_list) {
+    do {
+      ll_str_t* llstr_ptr = file_delete_list->next;
+      unlink(file_delete_list->ss);
+      free(file_delete_list);
+      file_delete_list = llstr_ptr;
+    } while (file_delete_list);
+  }
+  cleanup_cmp_expr(&pc.remove_if_expr);
+  cleanup_cmp_expr(&pc.keep_if_expr);
+  cleanup_score(&pc.score_info);
+  cleanup_glm(&pc.glm_info);
+  cleanup_chr_info(&chr_info);
+  cleanup_ld(&pc.ld_info);
+  cleanup_range_list(&pc.covar_range_list);
+  cleanup_range_list(&pc.pheno_range_list);
+  cleanup_range_list(&pc.exclude_snps_range_list);
+  cleanup_range_list(&pc.snps_range_list);
+  if (warning_errcode && g_stderr_written_to && (!reterr)) {
+    logerrprint("--warning-errcode: One or more warnings in this run; exiting with code 61.\n");
+    reterr = kPglRetWarningErrcode;
+  }
+  if (cleanup_logfile(print_end_time) && (!reterr)) {
+    reterr = kPglRetWriteFail;
+  }
+  if (bigstack_ua) {
+    free(bigstack_ua);
+  }
+  return (uint32_t)reterr;
+}
diff --git a/plink2_adjust.cpp b/plink2_adjust.cpp
new file mode 100644
index 0000000..b8e6499
--- /dev/null
+++ b/plink2_adjust.cpp
@@ -0,0 +1,41 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#include "plink2_glm.h"
+#include "plink2_matrix.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+void init_adjust(adjust_info_t* adjust_info_ptr) {
+  adjust_info_ptr->flags = kfAdjust0;
+  adjust_info_ptr->lambda = 0.0;
+}
+
+typedef struct adjustable_assoc_result_struct {
+  double pval;
+  uint32_t variant_uidx;
+#ifdef __cplusplus
+  bool operator<(const struct adjustable_assoc_result_struct& rhs) const {
+    return pval < rhs.pval;
+  }
+#endif
+} adjustable_assoc_result_t;
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/plink2_adjust.h b/plink2_adjust.h
new file mode 100644
index 0000000..da545ea
--- /dev/null
+++ b/plink2_adjust.h
@@ -0,0 +1,61 @@
+#ifndef __PLINK2_ADJUST_H__
+#define __PLINK2_ADJUST_H__
+
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_common.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+FLAGSET_DEF_START()
+  kfAdjust0,
+  kfAdjustGc = (1 << 0),
+  kfAdjustLog10 = (1 << 1),
+
+  kfAdjustColChrom = (1 << 2),
+  kfAdjustColPos = (1 << 3),
+  kfAdjustColRef = (1 << 4),
+  kfAdjustColAlt1 = (1 << 5),
+  kfAdjustColAlt = (1 << 6),
+  kfAdjustColUnadj = (1 << 7),
+  kfAdjustColGc = (1 << 8),
+  kfAdjustColQq = (1 << 9),
+  kfAdjustColBonf = (1 << 10),
+  kfAdjustColHolm = (1 << 11),
+  kfAdjustColSidakss = (1 << 12),
+  kfAdjustColSidaksd = (1 << 13),
+  kfAdjustColFdrbh = (1 << 14),
+  kfAdjustColFdrby = (1 << 15),
+  kfAdjustColDefault = (kfAdjustColChrom | kfAdjustColUnadj | kfAdjustColGc | kfAdjustColBonf | kfAdjustColHolm | kfAdjustColSidakss | kfAdjustColSidaksd | kfAdjustColFdrbh | kfAdjustColFdrby),
+  kfAdjustColAll = ((kfAdjustColFdrby * 2) - kfAdjustColChrom)
+FLAGSET_DEF_END(adjust_flags_t);
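+// Note that kfAdjustColAll relies on the column bits being contiguous:
+// (kfAdjustColFdrby * 2) - kfAdjustColChrom == (1 << 16) - (1 << 2), i.e. all
+// of bits 2..15 set.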
+
+typedef struct adjust_info_struct {
+  adjust_flags_t flags;
+  double lambda;
+} adjust_info_t;
+
+void init_adjust(adjust_info_t* adjust_info_ptr);
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+ 
+#endif // __PLINK2_ADJUST_H__
diff --git a/plink2_common.cpp b/plink2_common.cpp
new file mode 100644
index 0000000..7ac74db
--- /dev/null
+++ b/plink2_common.cpp
@@ -0,0 +1,6459 @@
+// This library is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation; either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_common.h"
+
+#include <unistd.h> // sysconf()
+
+#ifdef __APPLE__
+  // needed for sysctl() call
+  #include <sys/sysctl.h>
+#endif
+
+#include <time.h> // cleanup_logfile()
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+const char g_errstr_fopen[] = "Error: Failed to open %s.\n";
+
+char g_textbuf[kTextbufSize];
+
+// now initialized by init_bigstack
+const char* g_one_char_strs = nullptr;
+// If one-base indels become sufficiently common, might want to predefine
+// g_two_char_strs[], and update allele string construction/destruction
+// accordingly.  (Though that should either be programmatically initialized, or
+// only cover a subset of the space; 192k is a lot to increase the binary image
+// size for a single simple table.)
+
+const char* g_input_missing_geno_ptr = nullptr; // in addition to '.'
+const char* g_output_missing_geno_ptr = nullptr; // now '.'
+
+FILE* g_logfile = nullptr;
+
+char g_logbuf[kMaxMediumLine * 2];
+
+uint32_t g_debug_on = 0;
+uint32_t g_log_failed = 0;
+uint32_t g_stderr_written_to = 0;
+
+boolerr_t push_llstr(const char* ss, ll_str_t** ll_stack_ptr) {
+  uintptr_t blen = strlen(ss) + 1;
+  ll_str_t* new_llstr;
+  if (pgl_malloc(sizeof(ll_str_t) + blen, &new_llstr)) {
+    return 1;
+  }
+  new_llstr->next = *ll_stack_ptr;
+  memcpy(new_llstr->ss, ss, blen);
+  *ll_stack_ptr = new_llstr;
+  return 0;
+}
+
+void logstr(const char* ss) {
+  if (!g_debug_on) {
+    fputs(ss, g_logfile);
+    if (ferror(g_logfile)) {
+      putchar('\n');
+      fflush(stdout);
+      fprintf(stderr, "Warning: Logging failure on:\n%s\nFurther logging will not be attempted in this run.\n", ss);
+      g_log_failed = 1;
+    }
+  } else {
+    if (g_log_failed) {
+      fflush(stdout);
+      fputs(ss, stderr);
+    } else {
+      fputs(ss, g_logfile);
+      if (ferror(g_logfile)) {
+	putchar('\n');
+	fflush(stdout);
+        fprintf(stderr, "Error: Debug logging failure.  Dumping to stderr:\n%s", ss);
+	g_log_failed = 1;
+	g_stderr_written_to = 1;
+      } else {
+	fflush(g_logfile);
+      }
+    }
+  }
+}
+
+void logprint(const char* ss) {
+  logstr(ss);
+  fputs(ss, stdout);
+}
+
+void logerrprint(const char* ss) {
+  logstr(ss);
+  fflush(stdout);
+  fputs(ss, stderr);
+  g_stderr_written_to = 1;
+}
+
+void logprintb() {
+  logstr(g_logbuf);
+  fputs(g_logbuf, stdout);
+}
+
+void logerrprintb() {
+  logstr(g_logbuf);
+  fflush(stdout);
+  fputs(g_logbuf, stderr);
+  g_stderr_written_to = 1;
+}
+
+void wordwrap(uint32_t suffix_len, char* ss) {
+  // Input: A null-terminated string with no intermediate newlines.  If
+  //        suffix_len is zero, there should be a terminating \n; otherwise,
+  //        the last character should be a space.  The allocation the string is
+  //        part of must include at least ~80 bytes past the string end.
+  // Effect: Spaces are replaced with newlines in a manner that plays well with
+  //         80 column terminal windows.  (Multi-space blocks are never
+  //         collapsed.)
+  char* token_start = ss;
+  char* line_end = &(ss[79]);
+  char* token_end;
+  while (1) {
+    while (*token_start == ' ') {
+      ++token_start;
+    }
+    if (token_start > line_end) {
+      do {
+	*line_end = '\n';
+	line_end = &(line_end[80]);
+      } while (token_start > line_end);
+    }
+    token_end = strchr(token_start, ' ');
+    if (!token_end) {
+      if (&(token_start[79]) == line_end) {
+	return;
+      }
+      token_end = (char*)rawmemchr(token_start, '\0');
+      if (!suffix_len) {
+	if (token_end <= &(line_end[1])) {
+	  // okay if end-of-string is one past the end, because function
+	  // assumes last character is \n in suffix_len == 0 case
+	  assert(token_end[-1] == '\n');
+	  return;
+	}
+      } else {
+        if (&(token_end[suffix_len]) <= line_end) {
+	  return;
+	}
+	// because of terminal space assumption, token_start actually points
+	// to the end of the string
+	assert(token_start[-1] == ' ');
+      }
+      token_start[-1] = '\n';
+      return;
+    }
+    if (token_end > line_end) {
+      if (&(token_start[79]) != line_end) {
+	token_start[-1] = '\n';
+        line_end = &(token_start[79]);
+	if (token_end > line_end) {
+	  // single really long token, can't do anything beyond putting it on
+	  // its own line
+          *token_end = '\n';
+	  line_end = &(token_end[80]);
+	}
+      } else {
+	// single really long token, *and* previous token was either
+	// nonexistent or long
+	*token_end = '\n';
+	line_end = &(token_end[80]);
+      }
+    }
+    token_start = &(token_end[1]);
+  }
+}
+
+void wordwrapb(uint32_t suffix_len) {
+  wordwrap(suffix_len, g_logbuf);
+}
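+
+// Illustrative usage (hypothetical message): with suffix_len == 0, the buffer
+// must already end in '\n', e.g.
+//   sprintf(g_logbuf, "Error: Duplicate sample ID '%s'.\n", sample_id);
+//   wordwrapb(0);
+//   logerrprintb();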
+
+
+boolerr_t fopen_checked(const char* fname, const char* mode, FILE** target_ptr) {
+  *target_ptr = fopen(fname, mode);
+  if (!(*target_ptr)) {
+    logprint("\n");
+    LOGERRPRINTFWW(g_errstr_fopen, fname);
+    return 1;
+  }
+  return 0;
+}
+
+interr_t fwrite_flush2(char* buf_flush, FILE* outfile, char** write_iter_ptr) {
+  char* buf = &(buf_flush[-((int32_t)kMaxMediumLine)]);
+  char* buf_end = *write_iter_ptr;
+  *write_iter_ptr = buf;
+  return fwrite_checked(buf, (uintptr_t)(buf_end - buf), outfile);
+}
+
+
+uint32_t int_slen(int32_t num) {
+  int32_t slen = 1;
+  uint32_t absnum;
+  if (num < 0) {
+    absnum = -num;
+    ++slen;
+  } else {
+    absnum = num;
+  }
+  while (absnum > 99) {
+    // division by a constant is faster for unsigned ints
+    absnum /= 100;
+    slen += 2;
+  }
+  if (absnum > 9) {
+    ++slen;
+  }
+  return slen;
+}
+
+int32_t strcmp_se(const char* s_read, const char* s_const, uint32_t s_const_len) {
+  return memcmp(s_read, s_const, s_const_len) || (!is_space_or_eoln(s_read[s_const_len]));
+}
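+
+// e.g. (illustrative) strcmp_se("chr1\t...", "chr1", 4) == 0, while
+// strcmp_se("chr10", "chr1", 4) is nonzero since '0' is not a space/eoln
+// terminator.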
+
+int32_t strcmp_casted(const void* s1, const void* s2) {
+  return strcmp((const char*)s1, (const char*)s2);
+}
+
+// PLINK 2's natural sort uses the following logic:
+// - All alphabetic characters act as if they are capitalized, except for
+// tiebreaking purposes (where ASCII is used).
+// - Numbers are compared by magnitude, with the exception of...
+// - Numbers with leading zero(es).  If you're putting extraneous zeroes in
+// front of IDs, we assume they're there to force particular items to be sorted
+// earlier, rather than just appearing at random.  So, unlike many natural sort
+// implementations, we sort 00200 < 021 < 20: all numbers with n leading zeroes
+// are sorted before all numbers with (n-1) leading zeroes; magnitude only
+// applies if the leading zero counts match.  This handles e.g. subbasement
+// room numbering properly.
+//
+// This won't always do what you want if your IDs have variable-length decimals
+// in them (e.g. it yields 0.99 < 0.101); if you don't want to fall back on
+// ASCII sort, enforce a fixed number of digits after the decimal point.  Also
+// note that ASCII sort is outright better for e.g. numbers represented in
+// hexadecimal or base 36.  In principle, it's possible to reliably autodetect
+// some of these cases (especially hexadecimal numbers beginning with "0x"),
+// but that'll never be perfect so we just let the user toggle the sort method.
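+//
+// Worked examples (illustrative): natural sort yields "id2" < "ID10" (ASCII
+// sort would put "ID10" first), "00200" < "021" < "20" as described above,
+// and "019" < "021" since magnitude applies once the leading-zero counts
+// match.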
+int32_t strcmp_natural_scan_forward(const unsigned char* s1, const unsigned char* s2) {
+  // assumes s1 and s2 currently point to the middle of a mismatching number,
+  // where s1 < s2.
+  unsigned char c1;
+  unsigned char c2;
+  do {
+    c1 = *(++s1);
+    c2 = *(++s2);
+    if (is_not_digit(c1)) {
+      return -1;
+    }
+  } while (is_digit(c2));
+  return 1;
+}
+
+// We have the following major states:
+//   0 (initial): strings perfectly match so far, last char (if any) is
+//                nonnumeric.
+//   1: strings perfectly match so far, last char is numeric.
+//   2: strings match except for capitalization, last char is nonnumeric.
+//   3: strings match except for capitalization, last char is numeric.
+// strcmp_natural_tiebroken() expresses the logic for states 2 and 3, while
+// strcmp_natural_uncasted() handles states 0 and 1.
+int32_t strcmp_natural_tiebroken(const unsigned char* s1, const unsigned char* s2) {
+  // assumes ties should be broken in favor of s2.
+  unsigned char c1 = *(++s1);
+  unsigned char c2 = *(++s2);
+  while (is_not_nzdigit(c1) && is_not_nzdigit(c2)) {
+    // state 2
+  strcmp_natural_tiebroken_state_2:
+    if (c1 != c2) {
+      if ((c1 >= 'a') && (c1 <= 'z')) {
+	c1 -= 32;
+      }
+      if ((c2 >= 'a') && (c2 <= 'z')) {
+	c2 -= 32;
+      }
+      if (c1 < c2) {
+	return -1;
+      }
+      if (c1 > c2) {
+	return 1;
+      }
+    } else if (!c1) {
+      return -1;
+    }
+    c1 = *(++s1);
+    c2 = *(++s2);
+  }
+  if (is_not_nzdigit(c1) || is_not_nzdigit(c2)) {
+    return (c1 < c2)? -1 : 1;
+  }
+  do {
+    // state 3
+    if (c1 != c2) {
+      if (is_digit(c2)) {
+	if (c1 < c2) {
+	  return strcmp_natural_scan_forward(s1, s2);
+	}
+	return -strcmp_natural_scan_forward(s2, s1);
+      }
+      return 1;
+    }
+    c1 = *(++s1);
+    c2 = *(++s2);
+  } while (is_digit(c1));
+  if (is_digit(c2)) {
+    return -1;
+  }
+  // skip the while (is_not_digit...) check
+  goto strcmp_natural_tiebroken_state_2;
+}
+
+static inline int32_t strcmp_natural_uncasted(const unsigned char* s1, const unsigned char* s2) {
+  unsigned char c1 = *s1;
+  unsigned char c2 = *s2;
+  while (is_not_nzdigit(c1) && is_not_nzdigit(c2)) {
+    // state 0
+  strcmp_natural_uncasted_state_0:
+    if (c1 != c2) {
+      if ((c1 >= 'a') && (c1 <= 'z')) {
+	if (c2 + 32 == c1) {
+	  return -strcmp_natural_tiebroken(s2, s1);
+	}
+	if ((c2 < 'a') || (c2 > 'z')) {
+	  c1 -= 32;
+	}
+      } else if ((c2 >= 'a') && (c2 <= 'z')) {
+	c2 -= 32;
+	if (c1 == c2) {
+	  return strcmp_natural_tiebroken(s1, s2);
+	}
+      }
+      return (c1 < c2)? -1 : 1;
+    }
+    if (!c1) {
+      return 0;
+    }
+    c1 = *(++s1);
+    c2 = *(++s2);
+  }
+  if (is_not_nzdigit(c1) || is_not_nzdigit(c2)) {
+    return (c1 < c2)? -1 : 1;
+  }
+  do {
+    // state 1
+    if (c1 != c2) {
+      if (is_digit(c2)) {
+	if (c1 < c2) {
+	  return strcmp_natural_scan_forward(s1, s2);
+	}
+	return -strcmp_natural_scan_forward(s2, s1);
+      }
+      return 1;
+    }
+    c1 = *(++s1);
+    c2 = *(++s2);
+  } while (is_digit(c1));
+  if (is_digit(c2)) {
+    return -1;
+  }
+  goto strcmp_natural_uncasted_state_0;
+}
+
+int32_t strcmp_natural(const void* s1, const void* s2) {
+  return strcmp_natural_uncasted((const unsigned char*)s1, (const unsigned char*)s2);
+}
+
+int32_t strcmp_deref(const void* s1, const void* s2) {
+  // const_cast
+  return strcmp(*(char**)((uintptr_t)s1), *(char**)((uintptr_t)s2));
+}
+
+int32_t strcmp_natural_deref(const void* s1, const void* s2) {
+  // const_cast
+  return strcmp_natural_uncasted(*(unsigned char**)((uintptr_t)s1), *(unsigned char**)((uintptr_t)s2));
+}
+
+int32_t float_cmp(const void* aa, const void* bb) {
+  const float fxx = *((const float*)aa);
+  const float fyy = *((const float*)bb);
+  if (fxx < fyy) {
+    return -1;
+  }
+  return (fxx > fyy);
+}
+
+int32_t double_cmp(const void* aa, const void* bb) {
+  const double dxx = *((const double*)aa);
+  const double dyy = *((const double*)bb);
+  if (dxx < dyy) {
+    return -1;
+  }
+  return (dxx > dyy);
+}
+
+int32_t double_cmp_decr(const void* aa, const void* bb) {
+  const double dxx = *((const double*)aa);
+  const double dyy = *((const double*)bb);
+  if (dxx > dyy) {
+    return -1;
+  }
+  return (dxx < dyy);
+}
+
+int32_t intcmp(const void* aa, const void* bb) {
+  return *((const int32_t*)aa) - *((const int32_t*)bb);
+}
+
+int32_t uint64cmp(const void* aa, const void* bb) {
+  const uint64_t ullaa = *((const uint64_t*)aa);
+  const uint64_t ullbb = *((const uint64_t*)bb);
+  if (ullaa < ullbb) {
+    return -1;
+  }
+  return (ullaa > ullbb);
+}
+
+#ifndef __cplusplus
+int32_t uint64cmp_decr(const void* aa, const void* bb) {
+  const uint64_t ullaa = *((const uint64_t*)aa);
+  const uint64_t ullbb = *((const uint64_t*)bb);
+  if (ullaa > ullbb) {
+    return -1;
+  }
+  return (ullaa < ullbb);
+}
+#endif
+
+#ifdef __cplusplus
+float destructive_get_fmedian(uintptr_t len, float* unsorted_arr) {
+  if (!len) {
+    return 0.0;
+  }
+  const uintptr_t len_d2 = len / 2;
+  std::nth_element(unsorted_arr, &(unsorted_arr[len_d2]), &(unsorted_arr[len]));
+  const float median_upper = unsorted_arr[len_d2];
+  if (len % 2) {
+    return median_upper;
+  }
+  return (get_fmax(len_d2, unsorted_arr) + median_upper) * 0.5f;
+}
+
+double destructive_get_dmedian(uintptr_t len, double* unsorted_arr) {
+  if (!len) {
+    return 0.0;
+  }
+  const uintptr_t len_d2 = len / 2;
+  std::nth_element(unsorted_arr, &(unsorted_arr[len_d2]), &(unsorted_arr[len]));
+  const double median_upper = unsorted_arr[len_d2];
+  if (len % 2) {
+    return median_upper;
+  }
+  return (get_dmax(len_d2, unsorted_arr) + median_upper) * 0.5;
+}
+#else
+// these will probably be used in __cplusplus case too
+float get_fmedian(const float* sorted_arr, uintptr_t len) {
+  if (!len) {
+    return 0.0f;
+  }
+  if (len % 2) {
+    return sorted_arr[len / 2];
+  }
+  return (sorted_arr[len / 2] + sorted_arr[(len / 2) - 1]) * 0.5f;
+}
+
+double get_dmedian(const double* sorted_arr, uintptr_t len) {
+  if (!len) {
+    return 0.0;
+  }
+  if (len % 2) {
+    return sorted_arr[len / 2];
+  }
+  return (sorted_arr[len / 2] + sorted_arr[(len / 2) - 1]) * 0.5;
+}
+
+float destructive_get_fmedian(uintptr_t len, float* unsorted_arr) {
+  // no, I'm not gonna bother reimplementing introselect just for folks who
+  // insist on compiling this as pure C instead of C++
+  qsort(unsorted_arr, len, sizeof(float), float_cmp);
+  return get_fmedian(unsorted_arr, len);
+}
+
+double destructive_get_dmedian(uintptr_t len, double* unsorted_arr) {
+  qsort(unsorted_arr, len, sizeof(double), double_cmp);
+  return get_dmedian(unsorted_arr, len);
+}
+#endif
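+
+// e.g. (illustrative) destructive_get_dmedian(4, arr) with arr =
+// {3.0, 1.0, 4.0, 2.0} returns (2.0 + 3.0) * 0.5 == 2.5, reordering arr in
+// the process.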
+
+// alas, qsort_r not available on some Linux distributions
+
+// note that this can be expected to have size 16 bytes, not 12, on 64-bit
+// systems
+typedef struct str_sort_indexed_deref_struct {
+  const char* strptr;
+  uint32_t orig_idx;
+#ifdef __cplusplus
+  bool operator<(const struct str_sort_indexed_deref_struct& rhs) const {
+    return (strcmp(strptr, rhs.strptr) < 0);
+  }
+#endif
+} str_sort_indexed_deref_t;
+
+#ifdef __cplusplus
+typedef struct strbuf36_ui_struct {
+  char strbuf[36];
+  uint32_t orig_idx;
+  bool operator<(const struct strbuf36_ui_struct& rhs) const {
+    return (strcmp_natural_uncasted((const unsigned char*)strbuf, (const unsigned char*)(rhs.strbuf)) < 0);
+  }
+} Strbuf36_ui;
+
+typedef struct strbuf60_ui_struct {
+  char strbuf[60];
+  uint32_t orig_idx;
+  bool operator<(const struct strbuf60_ui_struct& rhs) const {
+    return (strcmp_natural_uncasted((const unsigned char*)strbuf, (const unsigned char*)(rhs.strbuf)) < 0);
+  }
+} Strbuf60_ui;
+
+static_assert(sizeof(Strbuf36_ui) == 40, "Strbuf36_ui is not laid out as expected.");
+static_assert(offsetof(Strbuf36_ui, orig_idx) == 36, "Strbuf36_ui is not laid out as expected.");
+static_assert(sizeof(Strbuf60_ui) == 64, "Strbuf60_ui is not laid out as expected.");
+static_assert(offsetof(Strbuf60_ui, orig_idx) == 60, "Strbuf60_ui is not laid out as expected.");
+uintptr_t get_strboxsort_wentry_blen(uintptr_t max_str_blen) {
+  if (max_str_blen <= 36) {
+    return sizeof(Strbuf36_ui);
+  }
+  if (max_str_blen <= 60) {
+    return sizeof(Strbuf60_ui);
+  }
+  return max_str_blen;
+}
+
+typedef struct str_nsort_indexed_deref_struct {
+  char* strptr;
+  uint32_t orig_idx;
+  bool operator<(const struct str_nsort_indexed_deref_struct& rhs) const {
+    return (strcmp_natural_uncasted((unsigned char*)strptr, (unsigned char*)(rhs.strptr)) < 0);
+  }
+} str_nsort_indexed_deref_t;
+#else
+uintptr_t get_strboxsort_wentry_blen(uintptr_t max_str_blen) {
+  return MAXV(max_str_blen, sizeof(str_sort_indexed_deref_t));
+}
+#endif
+
+// assumed that sort_wkspace has size >= str_ct *
+// max(sizeof(str_sort_indexed_deref_t), max_str_blen)
+void sort_strbox_indexed2_fallback(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map, void* sort_wkspace) {
+  str_sort_indexed_deref_t* wkspace_alias = (str_sort_indexed_deref_t*)sort_wkspace;
+  for (uintptr_t str_idx = 0; str_idx < str_ct; ++str_idx) {
+    wkspace_alias[str_idx].strptr = &(strbox[str_idx * max_str_blen]);
+    wkspace_alias[str_idx].orig_idx = id_map[str_idx];
+  }
+  if (!use_nsort) {
+#ifdef __cplusplus
+    std::sort(wkspace_alias, &(wkspace_alias[str_ct]));
+#else
+    qsort(wkspace_alias, str_ct, sizeof(str_sort_indexed_deref_t), strcmp_deref);
+#endif
+  } else {
+#ifdef __cplusplus
+    str_nsort_indexed_deref_t* wkspace_alias2 = (str_nsort_indexed_deref_t*)wkspace_alias;
+    std::sort(wkspace_alias2, &(wkspace_alias2[str_ct]));
+#else
+    qsort(wkspace_alias, str_ct, sizeof(str_sort_indexed_deref_t), strcmp_natural_deref);
+#endif
+  }
+  for (uintptr_t str_idx = 0; str_idx < str_ct; ++str_idx) {
+    id_map[str_idx] = wkspace_alias[str_idx].orig_idx;
+  }
+#ifndef __cplusplus
+  if (max_str_blen < sizeof(str_sort_indexed_deref_t)) {
+    // actually better to use non-deref sort here, but just get this working
+    // properly for now
+    for (uint32_t new_idx = 0; new_idx < str_ct; ++new_idx) {
+      const char* strptr = wkspace_alias[new_idx].strptr;
+      strcpy(&(((char*)wkspace_alias)[new_idx * max_str_blen]), strptr);
+    }
+  } else {
+#endif
+    // bugfix: need to handle id_map[str_idx] != str_idx
+    uint32_t new_idx = str_ct;
+    do {
+      --new_idx;
+      const char* strptr = wkspace_alias[new_idx].strptr;
+      strcpy(&(((char*)wkspace_alias)[new_idx * max_str_blen]), strptr);
+    } while (new_idx);
+#ifndef __cplusplus
+  }
+#endif
+  memcpy(strbox, wkspace_alias, str_ct * max_str_blen);
+}
+
+#ifdef __cplusplus
+typedef struct word_cmp40b_struct {
+  uintptr_t words[40 / kBytesPerWord];
+  bool operator<(const struct word_cmp40b_struct& rhs) const {
+    uint32_t idx = 0;
+    do {
+      const uintptr_t cur_word = words[idx];
+      const uintptr_t rhs_word = rhs.words[idx];
+      if (cur_word != rhs_word) {
+	// could pre-reverse the strings?
+        const uintptr_t xor_word = cur_word ^ rhs_word;
+	const uint32_t lshift = (kBitsPerWord - 8) - (CTZLU(xor_word) & (kBitsPerWord - 8));
+	return (cur_word << lshift) < (rhs_word << lshift);
+      }
+    } while (++idx < (40 / kBytesPerWord));
+    return false;
+  }
+} word_cmp40b_t;
+
+typedef struct word_cmp64b_struct {
+  uintptr_t words[64 / kBytesPerWord];
+  bool operator<(const struct word_cmp64b_struct& rhs) const {
+    uint32_t idx = 0;
+    do {
+      const uintptr_t cur_word = words[idx];
+      const uintptr_t rhs_word = rhs.words[idx];
+      if (cur_word != rhs_word) {
+        const uintptr_t xor_word = cur_word ^ rhs_word;
+	const uint32_t lshift = (kBitsPerWord - 8) - (CTZLU(xor_word) & (kBitsPerWord - 8));
+	return (cur_word << lshift) < (rhs_word << lshift);
+      }
+    } while (++idx < (64 / kBytesPerWord));
+    return false;
+  }
+} word_cmp64b_t;
+
+static_assert(sizeof(word_cmp40b_t) == 40, "word_cmp40b_t does not have the expected size.");
+static_assert(sizeof(word_cmp64b_t) == 64, "word_cmp64b_t does not have the expected size.");
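+
+// The point of the fixed-size buffers above: for non-natural sorts, entries
+// can be compared a whole word at a time.  The comparators use CTZLU to find
+// the lowest-order differing byte (on a little-endian machine, the
+// lexicographically most significant differing byte) and shift both words so
+// that byte is compared first.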
+
+void sort_strbox_40b_finish(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, Strbuf36_ui* filled_wkspace, char* sorted_strbox, uint32_t* id_map) {
+  if (!use_nsort) {
+    word_cmp40b_t* wkspace_alias = (word_cmp40b_t*)filled_wkspace;
+    std::sort(wkspace_alias, &(wkspace_alias[str_ct]));
+  } else {
+    std::sort(filled_wkspace, &(filled_wkspace[str_ct]));
+  }
+  for (uintptr_t str_idx = 0; str_idx < str_ct; ++str_idx) {
+    strcpy(&(sorted_strbox[str_idx * max_str_blen]), filled_wkspace[str_idx].strbuf);
+    id_map[str_idx] = filled_wkspace[str_idx].orig_idx;
+  }
+}
+
+void sort_strbox_64b_finish(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, Strbuf60_ui* filled_wkspace, char* sorted_strbox, uint32_t* id_map) {
+  if (!use_nsort) {
+    word_cmp64b_t* wkspace_alias = (word_cmp64b_t*)filled_wkspace;
+    std::sort(wkspace_alias, &(wkspace_alias[str_ct]));
+  } else {
+    std::sort(filled_wkspace, &(filled_wkspace[str_ct]));
+  }
+  for (uintptr_t str_idx = 0; str_idx < str_ct; ++str_idx) {
+    strcpy(&(sorted_strbox[str_idx * max_str_blen]), filled_wkspace[str_idx].strbuf);
+    id_map[str_idx] = filled_wkspace[str_idx].orig_idx;
+  }
+}
+
+// Normally use sort_strbox_indexed(), but this version is necessary before
+// g_bigstack has been allocated.
+void sort_strbox_indexed2(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map, void* sort_wkspace) {
+  if (max_str_blen <= 36) {
+    Strbuf36_ui* wkspace_alias = (Strbuf36_ui*)sort_wkspace;
+    for (uintptr_t str_idx = 0; str_idx < str_ct; ++str_idx) {
+      const char* cur_str = &(strbox[str_idx * max_str_blen]);
+      strcpy(wkspace_alias[str_idx].strbuf, cur_str);
+      wkspace_alias[str_idx].orig_idx = id_map[str_idx];
+    }
+    sort_strbox_40b_finish(str_ct, max_str_blen, use_nsort, wkspace_alias, strbox, id_map);
+    return;
+  }
+  if (max_str_blen <= 60) {
+    Strbuf60_ui* wkspace_alias = (Strbuf60_ui*)sort_wkspace;
+    for (uintptr_t str_idx = 0; str_idx < str_ct; ++str_idx) {
+      const char* cur_str = &(strbox[str_idx * max_str_blen]);
+      strcpy(wkspace_alias[str_idx].strbuf, cur_str);
+      wkspace_alias[str_idx].orig_idx = id_map[str_idx];
+    }
+    sort_strbox_64b_finish(str_ct, max_str_blen, use_nsort, wkspace_alias, strbox, id_map);
+    return;
+  }
+  sort_strbox_indexed2_fallback(str_ct, max_str_blen, use_nsort, strbox, id_map, sort_wkspace);
+}
+#endif
+
+boolerr_t sort_strbox_indexed(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map) {
+  if (str_ct < 2) {
+    return 0;
+  }
+  unsigned char* bigstack_mark = g_bigstack_base;
+  const uintptr_t wkspace_entry_blen = get_strboxsort_wentry_blen(max_str_blen);
+  unsigned char* sort_wkspace;
+  if (bigstack_alloc_uc(str_ct * wkspace_entry_blen, &sort_wkspace)) {
+    return 1;
+  }
+  sort_strbox_indexed2(str_ct, max_str_blen, use_nsort, strbox, id_map, sort_wkspace);
+  bigstack_reset(bigstack_mark);
+  return 0;
+}
+
+boolerr_t sort_strbox_indexed_malloc(uintptr_t str_ct, uintptr_t max_str_blen, char* strbox, uint32_t* id_map) {
+  if (str_ct < 2) {
+    return 0;
+  }
+  const uintptr_t wkspace_entry_blen = get_strboxsort_wentry_blen(max_str_blen);
+  unsigned char* sort_wkspace;
+  if (pgl_malloc(str_ct * wkspace_entry_blen, &sort_wkspace)) {
+    return 1;
+  }
+  sort_strbox_indexed2(str_ct, max_str_blen, 0, strbox, id_map, sort_wkspace);
+  free(sort_wkspace);
+  return 0;
+}
+
+
+uint32_t copy_and_dedup_sorted_strptrs_to_strbox(char** sorted_strptrs, uintptr_t str_ct, uintptr_t max_str_blen, char* strbox) {
+  if (!str_ct) {
+    return 0;
+  }
+  char** sorted_strptrs_iter = sorted_strptrs;
+  char** sorted_strptrs_end = &(sorted_strptrs[str_ct]);
+  uintptr_t write_idx = 0;
+  uint32_t prev_slen = 0xffffffffU;
+  char* prev_str = nullptr;
+  do {
+    char* cur_str = *sorted_strptrs_iter++;
+    const uint32_t cur_slen = strlen(cur_str);
+    if ((cur_slen != prev_slen) || memcmp(cur_str, prev_str, prev_slen)) {
+      memcpy(&(strbox[write_idx * max_str_blen]), cur_str, cur_slen + 1);
+      ++write_idx;
+      prev_str = cur_str;
+    }
+  } while (sorted_strptrs_iter != sorted_strptrs_end);
+  return write_idx;
+}
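+
+// e.g. (illustrative) with sorted_strptrs = {"a", "a", "b"} and str_ct == 3,
+// this writes "a" and "b" to strbox and returns 2.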
+
+uint32_t uint32arr_greater_than(const uint32_t* sorted_uint32_arr, uint32_t arr_length, uint32_t uii) {
+  // (strangely, this seems to be equal to or better than std::lower_bound with
+  // -O2 optimization, but can become much slower with -O3?)
+  
+  // assumes arr_length is nonzero, and sorted_uint32_arr is in nondecreasing
+  // order.  (useful for searching marker_pos.)
+  // also assumes arr_length < 2^31.
+  // uii guaranteed to be larger than sorted_uint32_arr[min_idx - 1] if it
+  // exists, but NOT necessarily sorted_uint32_arr[min_idx].
+  int32_t min_idx = 0;
+  // similarly, uii guaranteed to be no greater than
+  // sorted_uint32_arr[max_idx + 1] if it exists, but not necessarily
+  // sorted_uint32_arr[max_idx].  Signed integer since it could become -1, and
+  // min_idx in turn is signed so comparisons are safe.
+  int32_t max_idx = arr_length - 1;
+  while (min_idx < max_idx) {
+    const uint32_t mid_idx = (((uint32_t)min_idx) + ((uint32_t)max_idx)) / 2;
+    if (uii > sorted_uint32_arr[mid_idx]) {
+      min_idx = mid_idx + 1;
+    } else {
+      max_idx = mid_idx - 1;
+    }
+  }
+  return min_idx + (uii > sorted_uint32_arr[((uint32_t)min_idx)]);
+}
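+
+// e.g. (illustrative) with sorted_uint32_arr = {2, 5, 5, 9}:
+//   uint32arr_greater_than(arr, 4, 5) == 1 (first index with value >= 5);
+//   uint32arr_greater_than(arr, 4, 6) == 3.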
+
+uintptr_t uint64arr_greater_than(const uint64_t* sorted_uint64_arr, uintptr_t arr_length, uint64_t ullii) {
+  intptr_t min_idx = 0;
+  intptr_t max_idx = arr_length - 1;
+  while (min_idx < max_idx) {
+    const uintptr_t mid_idx = (((uintptr_t)min_idx) + ((uintptr_t)max_idx)) / 2;
+    if (ullii > sorted_uint64_arr[mid_idx]) {
+      min_idx = mid_idx + 1;
+    } else {
+      max_idx = mid_idx - 1;
+    }
+  }
+  return min_idx + (ullii > sorted_uint64_arr[((uintptr_t)min_idx)]);
+}
+
+uintptr_t doublearr_greater_than(const double* sorted_dbl_arr, uintptr_t arr_length, double dxx) {
+  intptr_t min_idx = 0;
+  intptr_t max_idx = arr_length - 1;
+  while (min_idx < max_idx) {
+    const uintptr_t mid_idx = (((uintptr_t)min_idx) + ((uintptr_t)max_idx)) / 2;
+    if (dxx > sorted_dbl_arr[mid_idx]) {
+      min_idx = mid_idx + 1;
+    } else {
+      max_idx = mid_idx - 1;
+    }
+  }
+  return min_idx + (dxx > sorted_dbl_arr[((uintptr_t)min_idx)]);
+}
+
+uintptr_t uint64arr_geq(const uint64_t* sorted_uint64_arr, uintptr_t arr_length, uint64_t ullii) {
+  intptr_t min_idx = 0;
+  intptr_t max_idx = arr_length - 1;
+  while (min_idx < max_idx) {
+    const uintptr_t mid_idx = (((uintptr_t)min_idx) + ((uintptr_t)max_idx)) / 2;
+    if (ullii >= sorted_uint64_arr[mid_idx]) {
+      min_idx = mid_idx + 1;
+    } else {
+      max_idx = mid_idx - 1;
+    }
+  }
+  return min_idx + (ullii >= sorted_uint64_arr[((uintptr_t)min_idx)]);
+}
+
+uint32_t param_count(char** argv, uint32_t argc, uint32_t flag_idx) {
+  // Counts the number of optional parameters given to the flag at position
+  // flag_idx, treating any nonnumeric parameter beginning with "-" as
+  // optional.
+  ++flag_idx;
+  uint32_t cur_idx = flag_idx;
+  while ((cur_idx < argc) && (!is_flag(argv[cur_idx]))) {
+    ++cur_idx;
+  }
+  return cur_idx - flag_idx;
+}
+
+boolerr_t enforce_param_ct_range(const char* flag_name, uint32_t param_ct, uint32_t min_ct, uint32_t max_ct) {
+  if (param_ct > max_ct) {
+    if (max_ct > min_ct) {
+      sprintf(g_logbuf, "Error: %s accepts at most %u parameter%s.\n", flag_name, max_ct, (max_ct == 1)? "" : "s");
+    } else {
+      sprintf(g_logbuf, "Error: %s only accepts %u parameter%s.\n", flag_name, max_ct, (max_ct == 1)? "" : "s");
+    }
+    return 1;
+  }
+  if (param_ct >= min_ct) {
+    return 0;
+  }
+  if (min_ct == 1) {
+    sprintf(g_logbuf, "Error: Missing %s parameter.\n", flag_name);
+  } else {
+    sprintf(g_logbuf, "Error: %s requires %s%u parameters.\n", flag_name, (min_ct < max_ct)? "at least " : "", min_ct);
+  }
+  return 1;
+}
+
+pglerr_t sort_cmdline_flags(uint32_t max_flag_blen, uint32_t flag_ct, char* flag_buf, uint32_t* flag_map) {
+  // Assumes flag_ct is the number of flag (as opposed to value) parameters,
+  // flag_buf[] points to a rectangular char* array (width max_flag_blen) of
+  // flag names with leading dash(es) stripped, and flag_map[] maps flag_buf[]
+  // entries to argv[] entries.
+  // Lexicographically sorts flag_buf (updating flag_map in the process), and
+  // then checks for duplicates.
+  // Okay for flag_buf to contain entries with spaces (plink 1.9's alias
+  // resolution takes advantage of this).
+  assert(flag_ct); // this must be skipped if there are no flags at all
+  if (sort_strbox_indexed_malloc(flag_ct, max_flag_blen, flag_buf, flag_map)) {
+    return kPglRetNomem;
+  }
+  uint32_t prev_flag_len = strlen_se(flag_buf);
+  char* prev_flag_ptr = flag_buf;
+  for (uint32_t cur_flag_idx = 1; cur_flag_idx < flag_ct; ++cur_flag_idx) {
+    char* cur_flag_ptr = &(prev_flag_ptr[max_flag_blen]);
+    const uint32_t cur_flag_len = strlen_se(cur_flag_ptr);
+    if ((prev_flag_len == cur_flag_len) && (!memcmp(prev_flag_ptr, cur_flag_ptr, cur_flag_len))) {
+      cur_flag_ptr[cur_flag_len] = '\0'; // just in case of aliases
+      fflush(stdout);
+      fprintf(stderr, "Error: Duplicate --%s flag.\n", cur_flag_ptr);
+      // g_stderr_written_to = 1;
+      return kPglRetInvalidCmdline;
+    }
+    prev_flag_ptr = cur_flag_ptr;
+    prev_flag_len = cur_flag_len;
+  }
+  return kPglRetSuccess;
+}
+
+pglerr_t init_logfile(uint32_t always_stderr, char* outname, char* outname_end) {
+  strcpy(outname_end, ".log");
+  g_logfile = fopen(outname, "w");
+  if (!g_logfile) {
+    fflush(stdout);
+    fprintf(stderr, "Error: Failed to open %s for logging.\n", outname);
+    // g_stderr_written_to = 1;
+    return kPglRetOpenFail;
+  }
+  fprintf(always_stderr? stderr : stdout, "Logging to %s.\n", outname);
+  return kPglRetSuccess;
+}
+
+boolerr_t cleanup_logfile(uint32_t print_end_time) {
+  char* write_iter = strcpya(g_logbuf, "End time: ");
+  time_t rawtime;
+  time(&rawtime);
+  write_iter = strcpya0(write_iter, ctime(&rawtime)); // has trailing \n
+  if (print_end_time) {
+    fputs(g_logbuf, stdout);
+  }
+  boolerr_t ret_boolerr = 0;
+  if (g_logfile) {
+    if (!g_log_failed) {
+      logstr("\n");
+      logstr(g_logbuf);
+      if (fclose(g_logfile)) {
+	fflush(stdout);
+	fputs("Error: Failed to finish writing to log.\n", stderr);
+	ret_boolerr = 1;
+      }
+    } else {
+      fclose(g_logfile);
+    }
+    g_logfile = nullptr;
+  }
+  return ret_boolerr;
+}
+
+// manually managed, very large stack
+unsigned char* g_bigstack_base = nullptr;
+unsigned char* g_bigstack_end = nullptr;
+
+uintptr_t detect_mb() {
+  int64_t llxx;
+  // return zero if detection failed
+  // see e.g. http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system .
+#ifdef __APPLE__
+  int32_t mib[2];
+  mib[0] = CTL_HW;
+  mib[1] = HW_MEMSIZE;
+  llxx = 0;
+  size_t sztmp = sizeof(int64_t);
+  sysctl(mib, 2, &llxx, &sztmp, nullptr, 0);
+  llxx /= 1048576;
+#else
+#ifdef _WIN32
+  MEMORYSTATUSEX memstatus;
+  memstatus.dwLength = sizeof(memstatus);
+  GlobalMemoryStatusEx(&memstatus);
+  llxx = memstatus.ullTotalPhys / 1048576;
+#else
+  llxx = ((uint64_t)sysconf(_SC_PHYS_PAGES)) * ((size_t)sysconf(_SC_PAGESIZE)) / 1048576;
+#endif
+#endif
+  return llxx;
+}
+
+uintptr_t get_default_alloc_mb() {
+  const uintptr_t total_mb = detect_mb();
+  if (!total_mb) {
+    return kBigstackDefaultMb;
+  }
+  if (total_mb < (kBigstackMinMb * 2)) {
+    return kBigstackMinMb;
+  }
+  return (total_mb / 2);
+}
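+
+// e.g. (illustrative) if detect_mb() reports 8192 MiB, get_default_alloc_mb()
+// returns 4096; if detection fails, it falls back to kBigstackDefaultMb.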
+
+pglerr_t init_bigstack(uintptr_t malloc_size_mb, uintptr_t* malloc_mb_final_ptr, unsigned char** bigstack_ua_ptr) {
+  // guarantee contiguous malloc space outside of main workspace
+  unsigned char* bubble;
+  if (pgl_malloc(kNonBigstackMin, &bubble)) {
+    return kPglRetNomem;
+  }
+  assert(malloc_size_mb >= kBigstackMinMb);
+#ifndef __LP64__
+  assert(malloc_size_mb <= 2047);
+#endif
+  // don't use pgl_malloc here since we don't automatically want to set
+  // g_failed_alloc_attempt_size on failure
+  unsigned char* bigstack_ua = (unsigned char*)malloc(malloc_size_mb * 1048576 * sizeof(char));
+  // this is thwarted by overcommit, but still better than nothing...
+  while (!bigstack_ua) {
+    malloc_size_mb = (malloc_size_mb * 3) / 4;
+    if (malloc_size_mb < kBigstackMinMb) {
+      malloc_size_mb = kBigstackMinMb;
+    }
+    bigstack_ua = (unsigned char*)malloc(malloc_size_mb * 1048576 * sizeof(char));
+    if ((!bigstack_ua) && (malloc_size_mb == kBigstackMinMb)) {
+      // switch to "goto cleanup" pattern if any more exit points are needed
+      g_failed_alloc_attempt_size = kBigstackMinMb * 1048576;
+      free(bubble);
+      return kPglRetNomem;
+    }
+  }
+  // force 64-byte align to make cache line sensitivity work
+  unsigned char* bigstack_initial_base = (unsigned char*)round_up_pow2((uintptr_t)bigstack_ua, kCacheline);
+  g_bigstack_base = bigstack_initial_base;
+  // last 512 bytes now reserved for g_one_char_strs
+  g_bigstack_end = &(bigstack_initial_base[round_down_pow2(malloc_size_mb * 1048576 - 512 - (uintptr_t)(bigstack_initial_base - bigstack_ua), kCacheline)]);
+  free(bubble);
+  uintptr_t* one_char_iter = (uintptr_t*)g_bigstack_end;
+#ifdef __LP64__
+  // assumes little-endian
+  uintptr_t cur_word = 0x3000200010000LLU;
+  for (uint32_t uii = 0; uii < 64; ++uii) {
+    *one_char_iter++ = cur_word;
+    cur_word += 0x4000400040004LLU;
+  }
+#else
+  uintptr_t cur_word = 0x10000;
+  for (uint32_t uii = 0; uii < 128; ++uii) {
+    *one_char_iter++ = cur_word;
+    cur_word += 0x20002;
+  }
+#endif
+  g_one_char_strs = (const char*)g_bigstack_end;
+
+  // plink2 doesn't actually need these here, but short programs using
+  // plink2_common benefit from this
+  g_input_missing_geno_ptr = (const char*)(&(g_one_char_strs[96]));
+  g_output_missing_geno_ptr = (const char*)(&(g_one_char_strs[92]));
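+  // (Layout note: g_one_char_strs holds 256 two-byte strings, with the
+  // null-terminated one-character string for character c at offset 2 * c, so
+  // offset 92 is "." and offset 96 is "0".)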
+  
+  *malloc_mb_final_ptr = malloc_size_mb;
+  *bigstack_ua_ptr = bigstack_ua;
+  return kPglRetSuccess;
+}
+
+
+/*
+uint32_t match_upper(const char* ss, const char* fixed_str) {
+  char cc = *fixed_str++;
+  do {
+    if ((((unsigned char)(*ss++)) & 0xdf) != ((unsigned char)cc)) {
+      return 0;
+    }
+    cc = *fixed_str++;
+  } while (cc);
+  return !(*ss);
+}
+*/
+
+uint32_t match_upper_counted(const char* ss, const char* fixed_str, uint32_t ct) {
+  for (uint32_t uii = 0; uii < ct; ++uii) {
+    if ((((unsigned char)ss[uii]) & 0xdf) != ((unsigned char)fixed_str[uii])) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+/*
+void str_toupper(char* ss) {
+  while (1) {
+    const uint32_t uii = (unsigned char)(*ss);
+    if (!uii) {
+      return;
+    }
+    if (((uint32_t)(uii - 97)) < 26) {
+      // 'a' has ASCII code 97
+      *ss = uii - 32;
+    }
+    ++ss;
+  }
+}
+
+void buf_toupper(uint32_t slen, char* ss) {
+  for (uint32_t pos = 0; pos < slen; ++pos) {
+    const uint32_t uii = (unsigned char)(ss[pos]);
+    if (((uint32_t)(uii - 97)) < 26) {
+      ss[pos] = uii - 32;
+    }
+  }
+}
+
+void strcpy_toupper(char* target, const char* source) {
+  while (1) {
+    uint32_t uii = (unsigned char)(*source++);
+    if (!uii) {
+      return;
+    }
+    if (((uint32_t)(uii - 97)) < 26) {
+      uii -= 32;
+    }
+    *target++ = uii;
+  }
+}
+*/
+
+uint32_t is_alphanumeric(const char* ss) {
+  while (1) {
+    uint32_t uii = (unsigned char)(*ss++);
+    if (!uii) {
+      return 1;
+    }
+    if (((uii - 48) > 9) && (((uii & 0xffffffdfU) - 65) > 25)) {
+      return 0;
+    }
+  }
+}
+
+boolerr_t scan_posintptr(const char* ss, uintptr_t* valp) {
+  // Reads an integer in [1, 2^kBitsPerWord - 1].  Assumes first character is
+  // nonspace.
+  assert(((unsigned char)ss[0]) > 32);
+  uintptr_t val = (uintptr_t)((unsigned char)(*ss++)) - 48;
+  if (val >= 10) {
+#ifdef __LP64__
+    if (val != 0xfffffffffffffffbLLU) {
+      return 1;
+    }
+#else
+    if (val != 0xfffffffbU) {
+      return 1;
+    }
+#endif
+    val = (uintptr_t)((unsigned char)(*ss++)) - 48;
+    if (val >= 10) {
+      return 1;
+    }
+  }
+  while (!val) {
+    val = (uintptr_t)((unsigned char)(*ss++)) - 48;
+    if (val >= 10) {
+      return 1;
+    }
+  }
+#ifdef __LP64__
+  // limit is 20 digits, we've already read one
+  const char* ss_limit = &(ss[20]);
+#else
+  const char* ss_limit = &(ss[10]);
+#endif
+  while (1) {
+    const uintptr_t cur_digit = (uintptr_t)((unsigned char)(*ss++)) - 48;
+    if (cur_digit >= 10) {
+      *valp = val;
+      return 0;
+    }
+    const uintptr_t cur_digit2 = (uintptr_t)((unsigned char)(*ss++)) - 48;
+    if (ss == ss_limit) {
+      if ((cur_digit2 < 10) || ((val >= (~k0LU) / 10) && ((val > (~k0LU) / 10) || (cur_digit > (~k0LU) % 10)))) {
+	return 1;
+      }
+      *valp = val * 10 + cur_digit;
+      return 0;
+    }
+    if (cur_digit2 >= 10) {
+      *valp = val * 10 + cur_digit;
+      return 0;
+    }
+    val = val * 100 + cur_digit * 10 + cur_digit2;
+  }
+}
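+
+// e.g. (illustrative) scan_posintptr("0042\t", &val) sets val to 42 and
+// returns 0, while scan_posintptr("-3", &val) returns 1.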
+
+#ifdef __LP64__
+static inline boolerr_t scanadv_uint_capped_finish(uint64_t cap, char** ss_ptr, uint32_t* valp) {
+  unsigned char* ss = (unsigned char*)(*ss_ptr);
+  uint64_t val = *valp;
+  while (1) {
+    // a little bit of unrolling seems to help
+    const uint64_t cur_digit = (uint64_t)(*ss++) - 48;
+    if (cur_digit >= 10) {
+      break;
+    }
+    // val = val * 10 + cur_digit;
+    const uint64_t cur_digit2 = (uint64_t)(*ss++) - 48;
+    if (cur_digit2 >= 10) {
+      val = val * 10 + cur_digit;
+      if (val > cap) {
+	return 1;
+      }
+      break;
+    }
+    val = val * 100 + cur_digit * 10 + cur_digit2;
+    if (val > cap) {
+      return 1;
+    }
+  }
+  *valp = val;
+  *ss_ptr = (char*)(&(ss[-1]));
+  return 0;
+}
+
+boolerr_t scanadv_posint_capped(uint64_t cap, char** ss_ptr, uint32_t* valp) {
+  unsigned char* ss = (unsigned char*)(*ss_ptr);
+  *valp = (uint32_t)(*ss++) - 48;
+  if (*valp >= 10) {
+    if (*valp != 0xfffffffbU) {
+      return 1;
+    }
+    *valp = (uint32_t)(*ss++) - 48;
+    if (*valp >= 10) {
+      return 1;
+    }
+  }
+  while (!(*valp)) {
+    *valp = (uint32_t)(*ss++) - 48;
+    if ((*valp) >= 10) {
+      return 1;
+    }
+  }
+  *ss_ptr = (char*)ss;
+  return scanadv_uint_capped_finish(cap, ss_ptr, valp);
+}
+
+boolerr_t scanadv_uint_capped(uint64_t cap, char** ss_ptr, uint32_t* valp) {
+  unsigned char* ss = (unsigned char*)(*ss_ptr);
+  *valp = (uint32_t)(*ss++) - 48;
+  if (*valp >= 10) {
+    if (*valp != 0xfffffffbU) {
+      // '-' has ascii code 45, so unsigned 45 - 48 = 0xfffffffdU
+      if ((*valp != 0xfffffffdU) || (*ss != '0')) {
+	return 1;
+      }
+      // accept "-0", "-00", etc.
+      while (*(++ss) == '0');
+      *valp = 0;
+      *ss_ptr = (char*)ss;
+      return ((uint32_t)((unsigned char)(*ss)) - 48) < 10;
+    }
+    // accept leading '+'
+    *valp = (uint32_t)((unsigned char)(*ss++)) - 48;
+    if (*valp >= 10) {
+      return 1;
+    }
+  }
+  *ss_ptr = (char*)ss;
+  return scanadv_uint_capped_finish(cap, ss_ptr, valp);
+}
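+
+// e.g. (illustrative, 64-bit build) with ss pointing at "300 ",
+// scanadv_posint_capped(255, &ss, &val) returns 1 (cap exceeded); with ss
+// pointing at "-0 ", scanadv_uint_capped(255, &ss, &val) sets val to 0 and
+// advances ss to the trailing space.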
+#else
+boolerr_t scanadv_posint_capped32(uint32_t cap_div_10, uint32_t cap_mod_10, char** ss_ptr, uint32_t* valp) {
+  unsigned char* ss = (unsigned char*)(*ss_ptr);
+  uint32_t val = (uint32_t)(*ss++) - 48;
+  if (val >= 10) {
+    if (val != 0xfffffffbU) {
+      return 1;
+    }
+    val = (uint32_t)(*ss++) - 48;
+    if (val >= 10) {
+      return 1;
+    }
+  }
+  while (!val) {
+    val = (uint32_t)(*ss++) - 48;
+    if (val >= 10) {
+      return 1;
+    }
+  }
+  while (1) {
+    const uint32_t cur_digit = (uint32_t)(*ss++) - 48;
+    if (cur_digit >= 10) {
+      *valp = val;
+      *ss_ptr = (char*)(&(ss[-1]));
+      return 0;
+    }
+    if ((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10))) {
+      return 1;
+    }
+    val = val * 10 + cur_digit;
+  }
+}
+
+boolerr_t scanadv_uint_capped32(uint32_t cap_div_10, uint32_t cap_mod_10, char** ss_ptr, uint32_t* valp) {
+  unsigned char* ss = (unsigned char*)(*ss_ptr);
+  uint32_t val = (uint32_t)(*ss++) - 48;
+  if (val >= 10) {
+    if (val != 0xfffffffbU) {
+      if ((val != 0xfffffffdU) || (*ss != '0')) {
+	return 1;
+      }
+      while (*(++ss) == '0');
+      *valp = 0;
+      *ss_ptr = (char*)ss;
+      return ((uint32_t)((unsigned char)(*ss)) - 48) < 10;
+    }
+    val = (uint32_t)((unsigned char)(*ss++)) - 48;
+    if (val >= 10) {
+      return 1;
+    }
+  }
+  while (1) {
+    const uint32_t cur_digit = (uint32_t)(*ss++) - 48;
+    if (cur_digit >= 10) {
+      *valp = val;
+      *ss_ptr = (char*)(&(ss[-1]));
+      return 0;
+    }
+    if ((val >= cap_div_10) && ((val > cap_div_10) || (cur_digit > cap_mod_10))) {
+      return 1;
+    }
+    val = val * 10 + cur_digit;
+  }
+}
+#endif
+
+static const double kPositivePow10[16] = {1, 1.0e1, 1.0e2, 1.0e3, 1.0e4, 1.0e5, 1.0e6, 1.0e7, 1.0e8, 1.0e9, 1.0e10, 1.0e11, 1.0e12, 1.0e13, 1.0e14, 1.0e15};
+static const double kPositivePowTen16[16] = {1, 1.0e16, 1.0e32, 1.0e48, 1.0e64, 1.0e80, 1.0e96, 1.0e112, 1.0e128, 1.0e144, 1.0e160, 1.0e176, 1.0e192, 1.0e208, 1.0e224, 1.0e240};
+static const double kNegativePow10[16] = {1, 1.0e-1, 1.0e-2, 1.0e-3, 1.0e-4, 1.0e-5, 1.0e-6, 1.0e-7, 1.0e-8, 1.0e-9, 1.0e-10, 1.0e-11, 1.0e-12, 1.0e-13, 1.0e-14, 1.0e-15};
+static const double kNegativePowTen16[8] = {1, 1.0e-16, 1.0e-32, 1.0e-48, 1.0e-64, 1.0e-80, 1.0e-96, 1.0e-112};
+
+char* scanadv_double(char* ss, double* valp) {
+  // requires first character to be nonspace (to succeed; it fails without
+  //   segfaulting on space/eoln/null)
+  // don't care about hexadecimal
+  // ok to lose last ~2 bits of precision
+  // ok if this yields incorrect results on >1GB strings
+  // fail on nan/infinity/overflow instead of usual strtod behavior
+  uint32_t cur_char_code = (unsigned char)(*ss);
+  const uint32_t is_negative = (cur_char_code == 45);
+  if (is_negative || (cur_char_code == 43)) {
+    cur_char_code = (unsigned char)(*(++ss));
+  }
+  uint32_t cur_digit = cur_char_code - 48;
+  int32_t e10 = 0;
+  char* dot_ptr;
+  int64_t digits;
+#ifdef __LP64__
+  if (cur_digit < 10) {
+    // ok, we have at least one digit
+    digits = cur_digit;
+    // to check: best to skip leading zeroes and compare against 17 instead of
+    // 10^16?
+    do {
+      cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+      if (cur_digit >= 10) {
+	if (cur_digit == 0xfffffffeU) {
+	  dot_ptr = ss;
+	  goto scanadv_double_parse_decimal;
+	}
+	goto scanadv_double_parse_exponent;
+      }
+      digits = digits * 10 + cur_digit;
+      // this check should work differently in 32-bit version
+    } while (digits < 10000000000000000LL);
+    // we have 17 significant digits; count the rest, but don't worry about
+    // contents
+    // (could keep ~19 instead, but if we're systematically losing the last two
+    // bits of precision anyway...)
+    char* last_sig_fig_ptr = ss;
+    do {
+      cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+    } while (cur_digit < 10);
+    e10 = (int32_t)((uint32_t)((uintptr_t)(ss - last_sig_fig_ptr))) - 1;
+    if (cur_digit == 0xfffffffeU) {
+      do {
+	cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+      } while (cur_digit < 10);
+    }
+    goto scanadv_double_parse_exponent;
+  }
+  if (cur_digit != 0xfffffffeU) {
+    return nullptr;
+  }
+  // first (nonsign) character is dot, verify we have a digit after it
+  dot_ptr = ss;
+  cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+  if (cur_digit >= 10) {
+    return nullptr;
+  }
+  digits = cur_digit;
+ scanadv_double_parse_decimal:
+  while (1) {
+    cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+    if (cur_digit >= 10) {
+      e10 = 1 - (int32_t)((uint32_t)((uintptr_t)(ss - dot_ptr)));
+      break;
+    }
+    digits = digits * 10 + cur_digit;
+    if (digits >= 10000000000000000LL) {
+      e10 = -(int32_t)((uint32_t)((uintptr_t)(ss - dot_ptr)));
+      do {
+	cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+      } while (cur_digit < 10);
+      break;
+    }
+  }
+ scanadv_double_parse_exponent:
+  if ((cur_digit & 0xdf) == 21) { // 'E' - '0' is 21
+    cur_char_code = (unsigned char)(*(++ss));
+    const uint32_t exp_is_negative = (cur_char_code == 45);
+    if (exp_is_negative || (cur_char_code == 43)) {
+      cur_char_code = (unsigned char)(*(++ss));
+    }
+    cur_digit = cur_char_code - 48;
+    int32_t cur_exp = 0;
+    while (cur_digit < 10) {
+      if (cur_exp >= 107374182) {
+	// may as well guard against exponent overflow
+	if (!exp_is_negative) {
+	  return nullptr;
+	}
+	*valp = 0;
+	do {
+	  cur_digit = ((unsigned char)(*(++ss))) - 48;
+	} while (cur_digit < 10);
+	return ss;
+      }
+      cur_exp = cur_exp * 10 + cur_digit;
+      cur_digit = ((unsigned char)(*(++ss))) - 48;
+    }
+    if (exp_is_negative) {
+      cur_exp = -cur_exp;
+    }
+    e10 += cur_exp;
+  }
+#else // not __LP64__
+  int32_t digits_short;
+  if (cur_digit < 10) {
+    // ok, we have at least one digit
+    digits_short = cur_digit;
+    // to check: best to skip leading zeroes and compare against 17 instead of
+    // 10^16?
+    do {
+      cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+      if (cur_digit >= 10) {
+	if (cur_digit == 0xfffffffeU) {
+	  dot_ptr = ss;
+	  goto scanadv_double_parse_decimal;
+	}
+	digits = digits_short;
+	goto scanadv_double_parse_exponent;
+      }
+      digits_short = digits_short * 10 + cur_digit;
+    } while (digits_short < 100000000);
+    digits = digits_short;
+    do {
+      cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+      if (cur_digit >= 10) {
+	if (cur_digit == 0xfffffffeU) {
+	  dot_ptr = ss;
+	  goto scanadv_double_parse_decimal_long;
+	}
+	goto scanadv_double_parse_exponent;
+      }
+      digits = digits * 10 + cur_digit;
+    } while (digits < 10000000000000000LL);
+    // we have 17 significant digits; count the rest, but don't worry about
+    // contents
+    char* last_sig_fig_ptr = ss;
+    do {
+      cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+    } while (cur_digit < 10);
+    e10 = (int32_t)((uint32_t)((uintptr_t)(ss - last_sig_fig_ptr))) - 1;
+    if (cur_digit == 0xfffffffeU) {
+      do {
+	cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+      } while (cur_digit < 10);
+    }
+    goto scanadv_double_parse_exponent;
+  }
+  if (cur_digit != 0xfffffffeU) {
+    return nullptr;
+  }
+  // first (nonsign) character is dot, verify we have a digit after it
+  dot_ptr = ss;
+  cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+  if (cur_digit >= 10) {
+    return nullptr;
+  }
+  digits_short = cur_digit;
+ scanadv_double_parse_decimal:
+  while (1) {
+    cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+    if (cur_digit >= 10) {
+      e10 = 1 - (int32_t)((uint32_t)((uintptr_t)(ss - dot_ptr)));
+      digits = digits_short;
+      break;
+    }
+    digits_short = digits_short * 10 + cur_digit;
+    if (digits_short >= 100000000) {
+      digits = digits_short;
+    scanadv_double_parse_decimal_long:
+      while (1) {
+	cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+	if (cur_digit >= 10) {
+	  e10 = 1 - (int32_t)((uint32_t)((uintptr_t)(ss - dot_ptr)));
+	  goto scanadv_double_parse_exponent;
+	}
+	digits = digits * 10 + cur_digit;
+	if (digits >= 10000000000000000LL) {
+	  e10 = -(int32_t)((uint32_t)((uintptr_t)(ss - dot_ptr)));
+	  do {
+	    cur_digit = ((uint32_t)((unsigned char)(*(++ss)))) - 48;
+	  } while (cur_digit < 10);
+	  goto scanadv_double_parse_exponent;
+	}
+      }
+    }
+  }
+ scanadv_double_parse_exponent:
+  if ((cur_digit & 0xdf) == 21) { // 'E' - '0' is 21
+    cur_char_code = (unsigned char)(*(++ss));
+    const uint32_t exp_is_negative = (cur_char_code == 45);
+    if (exp_is_negative || (cur_char_code == 43)) {
+      cur_char_code = (unsigned char)(*(++ss));
+    }
+    cur_digit = cur_char_code - 48;
+    int32_t cur_exp = 0;
+    while (cur_digit < 10) {
+      if (cur_exp >= 107374182) {
+	// may as well guard against exponent overflow
+	if (!exp_is_negative) {
+	  return nullptr;
+	}
+	*valp = 0;
+	do {
+	  cur_digit = ((unsigned char)(*(++ss))) - 48;
+	} while (cur_digit < 10);
+	return ss;
+      }
+      cur_exp = cur_exp * 10 + cur_digit;
+      cur_digit = ((unsigned char)(*(++ss))) - 48;
+    }
+    if (exp_is_negative) {
+      cur_exp = -cur_exp;
+    }
+    e10 += cur_exp;
+  }
+#endif
+  if (digits == 0) {
+    *valp = 0;
+    return ss;
+  }
+  if (is_negative) {
+    digits = -digits;
+  }
+  double dxx = (double)digits;
+  if (e10) {
+    if (e10 < 0) {
+      uint32_t pos_exp = (uint32_t)(-e10);
+      dxx *= kNegativePow10[pos_exp & 15];
+      pos_exp /= 16;
+      if (pos_exp) {
+	dxx *= kNegativePowTen16[pos_exp & 7];
+	if (pos_exp > 7) {
+	  if (pos_exp > 23) {
+	    dxx = 0;
+	  } else if (pos_exp > 15) {
+	    dxx *= 1.0e-256;
+	  } else {
+	    dxx *= 1.0e-128;
+	  }
+	}
+      }
+    } else {
+      uint32_t pos_exp = (uint32_t)e10;
+      dxx *= kPositivePow10[pos_exp & 15];
+      pos_exp /= 16;
+      if (pos_exp) {
+	dxx *= kPositivePowTen16[pos_exp & 15];
+	if (pos_exp > 15) {
+	  // overflow check
+	  // last digits are "54" instead of "57" since that's the threshold
+	  // beyond which multiply-by-1e256 overflows
+	  if ((pos_exp > 31) || (dxx > 1.7976931348623154e52)) {
+	    return nullptr;
+	  }
+	  dxx *= 1.0e256;
+	}
+      }
+    }
+  }
+  *valp = dxx;
+  return ss;
+}
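+
+// Illustrative usage sketch (not upstream code; assumes, per the goto labels
+// above, that the enclosing parser is scanadv_double(char* ss, double* valp),
+// returning a pointer just past the parsed token, or nullptr on failure):
+//   char buf[] = "1.25e2\tnext";
+//   double dxx;
+//   char* end = scanadv_double(buf, &dxx);  // dxx == 125.0, *end == '\t'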
+
+void get_top_two_ui(const uint32_t* __restrict uint_arr, uintptr_t uia_size, uintptr_t* __restrict top_idx_ptr, uintptr_t* __restrict second_idx_ptr) {
+  assert(uia_size > 1);
+  uintptr_t top_idx = (uint_arr[1] > uint_arr[0])? 1 : 0;
+  uintptr_t second_idx = 1 ^ top_idx;
+  uint32_t top_val = uint_arr[top_idx];
+  uint32_t second_val = uint_arr[second_idx];
+  uintptr_t cur_idx;
+  uintptr_t cur_val;
+  for (cur_idx = 2; cur_idx < uia_size; ++cur_idx) {
+    cur_val = uint_arr[cur_idx];
+    if (cur_val > second_val) {
+      if (cur_val > top_val) {
+	second_val = top_val;
+	second_idx = top_idx;
+	top_val = cur_val;
+	top_idx = cur_idx;
+      } else {
+	second_val = cur_val;
+	second_idx = cur_idx;
+      }
+    }
+  }
+  *top_idx_ptr = top_idx;
+  *second_idx_ptr = second_idx;
+}
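+
+// e.g. for uint_arr = {3, 9, 4, 9}, this reports top_idx == 1 and
+// second_idx == 3: the strict comparisons above award ties for the top value
+// to the earlier index.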
+
+char* comma_or_space_next_token_mult(char* sptr, uint32_t ct, uint32_t comma_delim) {
+  assert(ct);
+  if (!comma_delim) {
+    return next_token_mult(sptr, ct);
+  }
+  if (!sptr) {
+    return nullptr;
+  }
+  // assumes initial spaces in current token have been skipped
+  // ok if we're at the end of the token
+  unsigned char ucc = *sptr;
+  assert(ucc != ' ');
+  while (1) {
+    // avoid strchr to keep "ASCII code < 32 == newline" consistent
+    // (tab handling is currently quirky--permitted at the beginning of a
+    // token, but treated as a newline later--but tabs should never appear
+    // here, so there's no point in e.g. adding an extra parameter to
+    // skip_initial_spaces(); we just need to keep the quirky behavior
+    // consistent.)
+    if (ucc < 32) {
+      return nullptr;
+    }
+    if (ucc == ',') {
+      do {
+	ucc = (unsigned char)(*(++sptr));
+      } while ((ucc == ' ') || (ucc == '\t'));
+      if (!(--ct)) {
+	return sptr;
+      }
+      continue;
+    }
+    ucc = (unsigned char)(*(++sptr));
+  }
+}
+
+uint32_t count_tokens(const char* bufptr) {
+  uint32_t token_ct = 0;
+  // skip_initial_spaces/token_endnn spelled out due to const qualifier
+  while ((*bufptr == ' ') || (*bufptr == '\t')) {
+    ++bufptr;
+  }
+  while (!is_eoln_kns(*bufptr)) {
+    ++token_ct;
+    while (!is_space_or_eoln(*(++bufptr)));
+    while ((*bufptr == ' ') || (*bufptr == '\t')) {
+      ++bufptr;
+    }
+  }
+  return token_ct;
+}
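+
+// e.g. count_tokens("  foo  bar\tbaz\n") returns 3: leading spaces/tabs are
+// skipped, and each maximal run of spaces/tabs between tokens counts as a
+// single separator.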
+
+/*
+uint32_t comma_or_space_count_tokens(const char* bufptr, uint32_t comma_delim) {
+  if (comma_delim) {
+    // assumes nonempty line (treats trailing empty string as a token).
+    uint32_t token_ct = 1;
+    unsigned char ucc = (unsigned char)(*bufptr++);
+    while (1) {
+      if (ucc < 32) {
+	return token_ct;
+      }
+      if (ucc == ',') {
+	// spelled out due to const qualifier
+	do {
+	  ucc = (unsigned char)(*bufptr++);
+	} while ((ucc == ' ') || (ucc == '\t'));
+	token_ct++;
+	continue;
+      }
+      ucc = (unsigned char)(*bufptr++);
+    }
+  }
+  return count_tokens(bufptr);
+}
+
+uint32_t count_and_measure_multistr(const char* multistr, uintptr_t* max_blen_ptr) {
+  // assumes multistr is nonempty
+  assert(multistr[0]);
+  uint32_t ct = 0;
+  uintptr_t max_blen = *max_blen_ptr;
+  do {
+    const uintptr_t blen = strlen(multistr) + 1;
+    if (blen > max_blen) {
+      max_blen = blen;
+    }
+    multistr = &(multistr[blen]);
+    ++ct;
+  } while (*multistr);
+  *max_blen_ptr = max_blen;
+  return ct;
+}
+*/
+
+boolerr_t count_and_measure_multistr_reverse_alloc(char* multistr, uintptr_t max_str_ct, uint32_t* str_ct_ptr, uintptr_t* max_blen_ptr, char*** strptr_arrp) {
+  // assumes multistr is nonempty
+  assert(multistr[0]);
+  uintptr_t ct = 0;
+  uintptr_t max_blen = *max_blen_ptr;
+  char** strptr_arr_iter = *strptr_arrp;
+  do {
+    if (++ct > max_str_ct) {
+      return 1;
+    }
+    const uintptr_t blen = strlen(multistr) + 1;
+    if (blen > max_blen) {
+      max_blen = blen;
+    }
+    *(--strptr_arr_iter) = multistr;
+    multistr = &(multistr[blen]);
+  } while (*multistr);
+  *str_ct_ptr = ct;
+  *max_blen_ptr = max_blen;
+  *strptr_arrp = strptr_arr_iter;
+  return 0;
+}
+
+// number-to-string encoders
+
+static const uint16_t kDigitPair[100] = {
+  0x3030, 0x3130, 0x3230, 0x3330, 0x3430, 0x3530, 0x3630, 0x3730, 0x3830, 0x3930,
+  0x3031, 0x3131, 0x3231, 0x3331, 0x3431, 0x3531, 0x3631, 0x3731, 0x3831, 0x3931,
+  0x3032, 0x3132, 0x3232, 0x3332, 0x3432, 0x3532, 0x3632, 0x3732, 0x3832, 0x3932,
+  0x3033, 0x3133, 0x3233, 0x3333, 0x3433, 0x3533, 0x3633, 0x3733, 0x3833, 0x3933,
+  0x3034, 0x3134, 0x3234, 0x3334, 0x3434, 0x3534, 0x3634, 0x3734, 0x3834, 0x3934,
+  0x3035, 0x3135, 0x3235, 0x3335, 0x3435, 0x3535, 0x3635, 0x3735, 0x3835, 0x3935,
+  0x3036, 0x3136, 0x3236, 0x3336, 0x3436, 0x3536, 0x3636, 0x3736, 0x3836, 0x3936,
+  0x3037, 0x3137, 0x3237, 0x3337, 0x3437, 0x3537, 0x3637, 0x3737, 0x3837, 0x3937,
+  0x3038, 0x3138, 0x3238, 0x3338, 0x3438, 0x3538, 0x3638, 0x3738, 0x3838, 0x3938,
+  0x3039, 0x3139, 0x3239, 0x3339, 0x3439, 0x3539, 0x3639, 0x3739, 0x3839, 0x3939};
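+
+// Each kDigitPair entry above packs the two ASCII digits of its index
+// (zero-padded) in little-endian byte order; e.g. kDigitPair[37] == 0x3733
+// stores bytes {'3', '7'}, so a 2-byte memcpy emits "37".  (Like the rest of
+// these encoders, this assumes a little-endian target.)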
+
+char* uint32toa(uint32_t uii, char* start) {
+  // Memory-efficient fast integer writer.  (You can do a bit better sometimes
+  // by using a larger lookup table, but on average I doubt that pays off.)
+  // Returns a pointer to the end of the integer (not null-terminated).
+  uint32_t quotient;
+  if (uii < 1000) {
+    if (uii < 10) {
+      *start++ = '0' + uii;
+      return start;
+    }
+    if (uii < 100) {
+      goto uint32toa_2;
+    }
+    quotient = uii / 100;
+    *start++ = '0' + quotient;
+  } else {
+    if (uii < 10000000) {
+      if (uii >= 100000) {
+	if (uii < 1000000) {
+	  goto uint32toa_6;
+	}
+	quotient = uii / 1000000;
+	*start++ = '0' + quotient;
+	goto uint32toa_6b;
+      }
+      if (uii < 10000) {
+	goto uint32toa_4;
+      }
+      quotient = uii / 10000;
+      *start++ = '0' + quotient;
+    } else {
+      if (uii >= 100000000) {
+	quotient = uii / 100000000;
+	if (uii >= 1000000000) {
+	  start = memcpya(start, &(kDigitPair[quotient]), 2);
+	} else {
+	  *start++ = '0' + quotient;
+	}
+	uii -= 100000000 * quotient;
+      }
+      quotient = uii / 1000000;
+      start = memcpya(start, &(kDigitPair[quotient]), 2);
+    uint32toa_6b:
+      uii -= 1000000 * quotient;
+    uint32toa_6:
+      quotient = uii / 10000;
+      start = memcpya(start, &(kDigitPair[quotient]), 2);
+    }
+    uii -= 10000 * quotient;
+  uint32toa_4:
+    // could make a uitoa_z4() call here, but that's slightly slower
+    quotient = uii / 100;
+    start = memcpya(start, &(kDigitPair[quotient]), 2);
+  }
+  uii -= 100 * quotient;
+ uint32toa_2:
+  return memcpya(start, &(kDigitPair[uii]), 2);
+}
+
+char* int32toa(int32_t ii, char* start) {
+  uint32_t uii = ii;
+  if (ii < 0) {
+    // -INT_MIN is undefined, but negating the unsigned int equivalent works
+    *start++ = '-';
+    uii = -uii;
+  }
+  return uint32toa(uii, start);
+}
+
+char* uitoa_z4(uint32_t uii, char* start) {
+  uint32_t quotient = uii / 100;
+  assert(quotient < 100);
+  uii -= 100 * quotient;
+  start = memcpya(start, &(kDigitPair[quotient]), 2);
+  return memcpya(start, &(kDigitPair[uii]), 2);
+}
+
+char* uitoa_z6(uint32_t uii, char* start) {
+  uint32_t quotient = uii / 10000;
+  start = memcpya(start, &(kDigitPair[quotient]), 2);
+  return uitoa_z4(uii - 10000 * quotient, start);
+}
+
+char* uitoa_z8(uint32_t uii, char* start) {
+  uint32_t quotient = uii / 1000000;
+  start = memcpya(start, &(kDigitPair[quotient]), 2);
+  return uitoa_z6(uii - 1000000 * quotient, start);
+}
+
+char* int64toa(int64_t llii, char* start) {
+  uint64_t ullii = llii;
+  uint64_t top_digits;
+  uint32_t bottom_eight;
+  uint32_t middle_eight;
+  if (llii < 0) {
+    *start++ = '-';
+    ullii = -ullii;
+  }
+  if (ullii <= 0xffffffffLLU) {
+    return uint32toa((uint32_t)ullii, start);
+  }
+  top_digits = ullii / 100000000;
+  bottom_eight = (uint32_t)(ullii - (top_digits * 100000000));
+  if (top_digits <= 0xffffffffLLU) {
+    start = uint32toa((uint32_t)top_digits, start);
+    return uitoa_z8(bottom_eight, start);
+  }
+  ullii = top_digits / 100000000;
+  middle_eight = (uint32_t)(top_digits - (ullii * 100000000));
+  start = uint32toa((uint32_t)ullii, start);
+  start = uitoa_z8(middle_eight, start);
+  return uitoa_z8(bottom_eight, start);
+}
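+
+// Illustrative usage sketch (not upstream code): the writers return the
+// write end rather than null-terminating, so calls chain naturally:
+//   char buf[32];
+//   char* write_iter = uint32toa(1234567, buf);
+//   *write_iter++ = '\t';
+//   write_iter = int64toa(-9876543210987LL, write_iter);
+//   *write_iter = '\0';  // buf now holds "1234567\t-9876543210987"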
+
+
+char* uitoa_trunc4(uint32_t uii, char* start) {
+  uint32_t quotient = uii / 100;
+  memcpy(start, &(kDigitPair[quotient]), 2);
+  uii -= 100 * quotient;
+  if (uii) {
+    start += 2;
+    memcpy(start, &(kDigitPair[uii]), 2);
+  }
+  if (start[1] != '0') {
+    return &(start[2]);
+  }
+  return &(start[1]);
+}
+
+static inline char* uitoa_trunc6(uint32_t uii, char* start) {
+  uint32_t quotient = uii / 10000;
+  memcpy(start, &(kDigitPair[quotient]), 2);
+  uii -= 10000 * quotient;
+  if (uii) {
+    quotient = uii / 100;
+    start += 2;
+    memcpy(start, &(kDigitPair[quotient]), 2);
+    uii -= 100 * quotient;
+    if (uii) {
+      start += 2;
+      memcpy(start, &(kDigitPair[uii]), 2);
+    }
+  }
+  if (start[1] != '0') {
+    return &(start[2]);
+  }
+  return &(start[1]);
+}
+
+static inline char* uitoa_trunc8(uint32_t uii, char* start) {
+  uint32_t quotient = uii / 1000000;
+  memcpy(start, &(kDigitPair[quotient]), 2);
+  uii -= 1000000 * quotient;
+  if (uii) {
+    quotient = uii / 10000;
+    start += 2;
+    memcpy(start, &(kDigitPair[quotient]), 2);
+    uii -= 10000 * quotient;
+    if (uii) {
+      quotient = uii / 100;
+      start += 2;
+      memcpy(start, &(kDigitPair[quotient]), 2);
+      uii -= 100 * quotient;
+      if (uii) {
+	start += 2;
+	memcpy(start, &(kDigitPair[uii]), 2);
+      }
+    }
+  }
+  if (start[1] != '0') {
+    return &(start[2]);
+  }
+  return &(start[1]);
+}
+
+static inline char* rtoa_p5(uint32_t remainder, char* start) {
+  if (!remainder) {
+    return start;
+  }
+  *start++ = '.';
+  uint32_t quotient = remainder / 1000;
+  memcpy(start, &(kDigitPair[quotient]), 2);
+  remainder -= 1000 * quotient;
+  if (remainder) {
+    quotient = remainder / 10;
+    start += 2;
+    memcpy(start, &(kDigitPair[quotient]), 2);
+    remainder -= 10 * quotient;
+    if (remainder) {
+      start[2] = '0' + remainder;
+      return &(start[3]);
+    }
+  }
+  if (start[1] != '0') {
+    return &(start[2]);
+  }
+  return &(start[1]);
+}
+
+static inline char* qrtoa_1p5(uint32_t quotient, uint32_t remainder, char* start) {
+  *start++ = '0' + quotient;
+  return rtoa_p5(remainder, start);
+}
+
+static inline char* qrtoa_1p7(uint32_t quotient, uint32_t remainder, char* start) {
+  *start++ = '0' + quotient;
+  if (!remainder) {
+    return start;
+  }
+  *start++ = '.';
+  quotient = remainder / 100000;
+  memcpy(start, &(kDigitPair[quotient]), 2);
+  remainder -= 100000 * quotient;
+  if (remainder) {
+    quotient = remainder / 1000;
+    start += 2;
+    memcpy(start, &(kDigitPair[quotient]), 2);
+    remainder -= 1000 * quotient;
+    if (remainder) {
+      quotient = remainder / 10;
+      start += 2;
+      memcpy(start, &(kDigitPair[quotient]), 2);
+      remainder -= 10 * quotient;
+      if (remainder) {
+	start[2] = '0' + remainder;
+	return &(start[3]);
+      }
+    }
+  }
+  if (start[1] != '0') {
+    return &(start[2]);
+  }
+  return &(start[1]);
+}
+
+// Okay, time to do banker's rounding when printing doubles.  14 digits of
+// precision are used in judging equality to 0.5 (actual precision of doubles
+// is 15-17 digits); the intention is to capture all directly loaded or exactly
+// computed edge cases (so enough tolerance is needed to survive the internal
+// multiplications by powers of 10, etc.), while rounding a negligible number
+// of honest-to-god 0.4999999s up and 0.5000001s down.
+// To avoid inadvertent printing of an extra digit, there's a deliberate gap
+// between the 99.9994999...-type bounds and the largest numbers that would
+// actually round down.
+static const double kBankerRound6[] = {0.4999995, 0.5000005};
+static const double kBankerRound8[] = {0.499999995, 0.500000005};
+
+static inline uint32_t double_bround(double dxx, const double* banker_round) {
+  uint32_t result = (int32_t)dxx;
+  return result + (int32_t)((dxx - ((int32_t)result)) + banker_round[result & 1]);
+}
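+
+// Worked example of the round-half-to-even behavior, using kBankerRound8:
+// double_bround(2.5, ...) evaluates 2 + (int32_t)(0.5 + 0.499999995) == 2,
+// while double_bround(3.5, ...) evaluates 3 + (int32_t)(0.5 + 0.500000005)
+// == 4; both ties land on the even integer.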
+
+// These are separate functions so the compiler can optimize the integer
+// divisions.
+static inline void double_bround1(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
+  dxx *= 10;
+  uint32_t remainder = (int32_t)dxx;
+  remainder += (int32_t)((dxx - ((int32_t)remainder)) + banker_round[remainder & 1]);
+  *quotientp = remainder / 10;
+  *remainderp = remainder - (*quotientp) * 10; 
+}
+
+static inline void double_bround2(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
+  dxx *= 100;
+  uint32_t remainder = (int32_t)dxx;
+  remainder += (int32_t)((dxx - ((int32_t)remainder)) + banker_round[remainder & 1]);
+  *quotientp = remainder / 100;
+  *remainderp = remainder - (*quotientp) * 100; 
+}
+
+static inline void double_bround3(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
+  dxx *= 1000;
+  uint32_t remainder = (int32_t)dxx;
+  remainder += (int32_t)((dxx - ((int32_t)remainder)) + banker_round[remainder & 1]);
+  *quotientp = remainder / 1000;
+  *remainderp = remainder - (*quotientp) * 1000; 
+}
+
+static inline void double_bround4(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
+  dxx *= 10000;
+  uint32_t remainder = (int32_t)dxx;
+  remainder += (int32_t)((dxx - ((int32_t)remainder)) + banker_round[remainder & 1]);
+  *quotientp = remainder / 10000;
+  *remainderp = remainder - (*quotientp) * 10000; 
+}
+
+static inline void double_bround5(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
+  dxx *= 100000;
+  uint32_t remainder = (int32_t)dxx;
+  remainder += (int32_t)((dxx - ((int32_t)remainder)) + banker_round[remainder & 1]);
+  *quotientp = remainder / 100000;
+  *remainderp = remainder - (*quotientp) * 100000; 
+}
+
+static inline void double_bround6(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
+  dxx *= 1000000;
+  uint32_t remainder = (int32_t)dxx;
+  remainder += (int32_t)((dxx - ((int32_t)remainder)) + banker_round[remainder & 1]);
+  *quotientp = remainder / 1000000;
+  *remainderp = remainder - (*quotientp) * 1000000; 
+}
+
+static inline void double_bround7(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
+  dxx *= 10000000;
+  uint32_t remainder = (int32_t)dxx;
+  remainder += (int32_t)((dxx - ((int32_t)remainder)) + banker_round[remainder & 1]);
+  *quotientp = remainder / 10000000;
+  *remainderp = remainder - (*quotientp) * 10000000; 
+}
+
+char* dtoa_so6(double dxx, char* start) {
+  // 6 sig fig number, 0.999995 <= dxx < 999999.5
+  // 'so' = "significand only"
+  // Just hardcoding all six cases, in the absence of a better approach...
+  uint32_t uii;
+  uint32_t quotient;
+  uint32_t remainder;
+  if (dxx < 99.999949999999) {
+    if (dxx < 9.9999949999999) {
+      double_bround5(dxx, kBankerRound8, &quotient, &remainder);
+      return qrtoa_1p5(quotient, remainder, start);
+    }
+    double_bround4(dxx, kBankerRound8, &quotient, &remainder);
+    start = memcpya(start, &(kDigitPair[quotient]), 2);
+    if (!remainder) {
+      return start;
+    }
+    *start++ = '.';
+    quotient = remainder / 100;
+    memcpy(start, &(kDigitPair[quotient]), 2);
+    remainder -= 100 * quotient;
+    if (remainder) {
+      start += 2;
+    dtoa_so6_pretail:
+      memcpy(start, &(kDigitPair[remainder]), 2);
+    }
+  dtoa_so6_tail:
+    if (start[1] != '0') {
+      return &(start[2]);
+    }
+    return &(start[1]);
+  }
+  if (dxx < 9999.9949999999) {
+    if (dxx < 999.99949999999) {
+      double_bround3(dxx, kBankerRound8, &uii, &remainder);
+      quotient = uii / 100;
+      *start++ = '0' + quotient;
+      quotient = uii - 100 * quotient;
+      start = memcpya(start, &(kDigitPair[quotient]), 2);
+      if (!remainder) {
+	return start;
+      }
+      *start++ = '.';
+      quotient = remainder / 10;
+      memcpy(start, &(kDigitPair[quotient]), 2);
+      remainder -= quotient * 10;
+      if (!remainder) {
+        goto dtoa_so6_tail;
+      }
+      start[2] = '0' + remainder;
+      return &(start[3]);
+    }
+    double_bround2(dxx, kBankerRound8, &uii, &remainder);
+    quotient = uii / 100;
+    start = memcpya(start, &(kDigitPair[quotient]), 2);
+    quotient = uii - (100 * quotient);
+    start = memcpya(start, &(kDigitPair[quotient]), 2);
+    if (!remainder) {
+      return start;
+    }
+    *start++ = '.';
+    goto dtoa_so6_pretail;
+  }
+  if (dxx >= 99999.949999999) {
+    return uitoa_z6(double_bround(dxx, kBankerRound8), start);
+  }
+  double_bround1(dxx, kBankerRound8, &uii, &remainder);
+  quotient = uii / 10000;
+  *start = '0' + quotient;
+  uii -= 10000 * quotient;
+  quotient = uii / 100;
+  start = memcpya(&(start[1]), &(kDigitPair[quotient]), 2);
+  uii = uii - 100 * quotient;
+  start = memcpya(start, &(kDigitPair[uii]), 2);
+  if (!remainder) {
+    return start;
+  }
+  *start++ = '.';
+  *start = '0' + remainder;
+  return &(start[1]);
+}
+
+char* dtoa_so8(double dxx, char* start) {
+  // 8 sig fig number, 0.99999995 <= dxx < 99999999.5
+  uint32_t uii;
+  uint32_t quotient;
+  uint32_t remainder;
+  if (dxx < 99.999999499999) {
+    if (dxx < 9.9999999499999) {
+      double_bround7(dxx, kBankerRound6, &quotient, &remainder);
+      return qrtoa_1p7(quotient, remainder, start);
+    }
+    double_bround6(dxx, kBankerRound6, &quotient, &remainder);
+    start = memcpya(start, &(kDigitPair[quotient]), 2);
+    if (!remainder) {
+      return start;
+    }
+    *start++ = '.';
+    quotient = remainder / 10000;
+    memcpy(start, &(kDigitPair[quotient]), 2);
+    remainder -= 10000 * quotient;
+    if (remainder) {
+      start += 2;
+    dtoa_so8_pretail4:
+      quotient = remainder / 100;
+      memcpy(start, &(kDigitPair[quotient]), 2);
+      remainder -= 100 * quotient;
+      if (remainder) {
+	start += 2;
+      dtoa_so8_pretail2:
+        memcpy(start, &(kDigitPair[remainder]), 2);
+      }
+    }
+  dtoa_so8_tail:
+    if (start[1] != '0') {
+      return &(start[2]);
+    }
+    return &(start[1]);
+  }
+  if (dxx < 9999.9999499999) {
+    if (dxx < 999.99999499999) {
+      double_bround5(dxx, kBankerRound6, &uii, &remainder);
+      quotient = uii / 100;
+      *start++ = '0' + quotient;
+      quotient = uii - 100 * quotient;
+      start = memcpya(start, &(kDigitPair[quotient]), 2);
+      if (!remainder) {
+	return start;
+      }
+      *start++ = '.';
+      quotient = remainder / 1000;
+      memcpy(start, &(kDigitPair[quotient]), 2);
+      remainder -= quotient * 1000;
+      if (!remainder) {
+        goto dtoa_so8_tail;
+      }
+      start += 2;
+    dtoa_so8_pretail3:
+      quotient = remainder / 10;
+      memcpy(start, &(kDigitPair[quotient]), 2);
+      remainder -= quotient * 10;
+      if (!remainder) {
+	goto dtoa_so8_tail;
+      }
+      start[2] = '0' + remainder;
+      return &(start[3]);
+    }
+    double_bround4(dxx, kBankerRound6, &uii, &remainder);
+    quotient = uii / 100;
+    start = memcpya(start, &(kDigitPair[quotient]), 2);
+    quotient = uii - (100 * quotient);
+    start = memcpya(start, &(kDigitPair[quotient]), 2);
+    if (!remainder) {
+      return start;
+    }
+    *start++ = '.';
+    goto dtoa_so8_pretail4;
+  }
+  if (dxx < 999999.99499999) {
+    if (dxx < 99999.999499999) {
+      double_bround3(dxx, kBankerRound6, &uii, &remainder);
+      quotient = uii / 10000;
+      *start = '0' + quotient;
+      uii -= 10000 * quotient;
+      quotient = uii / 100;
+      start = memcpya(&(start[1]), &(kDigitPair[quotient]), 2);
+      uii -= 100 * quotient;
+      start = memcpya(start, &(kDigitPair[uii]), 2);
+      if (!remainder) {
+	return start;
+      }
+      *start++ = '.';
+      goto dtoa_so8_pretail3;
+    }
+    double_bround2(dxx, kBankerRound6, &uii, &remainder);
+    quotient = uii / 10000;
+    start = memcpya(start, &(kDigitPair[quotient]), 2);
+    uii -= 10000 * quotient;
+    quotient = uii / 100;
+    start = memcpya(start, &(kDigitPair[quotient]), 2);
+    uii -= 100 * quotient;
+    start = memcpya(start, &(kDigitPair[uii]), 2);
+    if (!remainder) {
+      return start;
+    }
+    *start++ = '.';
+    goto dtoa_so8_pretail2;
+  }
+  if (dxx >= 9999999.9499999) {
+    return uitoa_z8(double_bround(dxx, kBankerRound6), start);
+  }
+  double_bround1(dxx, kBankerRound6, &uii, &remainder);
+  quotient = uii / 1000000;
+  *start = '0' + quotient;
+  uii -= 1000000 * quotient;
+  quotient = uii / 10000;
+  start = memcpya(&(start[1]), &(kDigitPair[quotient]), 2);
+  uii -= 10000 * quotient;
+  quotient = uii / 100;
+  start = memcpya(start, &(kDigitPair[quotient]), 2);
+  uii -= 100 * quotient;
+  start = memcpya(start, &(kDigitPair[uii]), 2);
+  if (!remainder) {
+    return start;
+  }
+  *start = '.';
+  start[1] = '0' + remainder;
+  return &(start[2]);
+}
+
+char* dtoa_g(double dxx, char* start) {
+  uint32_t xp10 = 0;
+  uint32_t quotient;
+  uint32_t remainder;
+  if (dxx != dxx) {
+    return memcpyl3a(start, "nan");
+  }
+  if (dxx < 0) {
+    *start++ = '-';
+    dxx = -dxx;
+  }
+  if (dxx < 9.9999949999999e-5) {
+    // 6 sig fig exponential notation, small
+    if (dxx < 9.9999949999999e-16) {
+      if (dxx < 9.9999949999999e-128) {
+	if (dxx == 0.0) {
+	  *start = '0';
+	  return &(start[1]);
+	}
+	if (dxx < 9.9999949999999e-256) {
+	  dxx *= 1.0e256;
+	  xp10 |= 256;
+	} else {
+	  dxx *= 1.0e128;
+	  xp10 |= 128;
+	}
+      }
+      if (dxx < 9.9999949999999e-64) {
+	dxx *= 1.0e64;
+	xp10 |= 64;
+      }
+      if (dxx < 9.9999949999999e-32) {
+	dxx *= 1.0e32;
+	xp10 |= 32;
+      }
+      if (dxx < 9.9999949999999e-16) {
+	dxx *= 1.0e16;
+	xp10 |= 16;
+      }
+    }
+    if (dxx < 9.9999949999999e-8) {
+      dxx *= 100000000;
+      xp10 |= 8;
+    }
+    if (dxx < 9.9999949999999e-4) {
+      dxx *= 10000;
+      xp10 |= 4;
+    }
+    if (dxx < 9.9999949999999e-2) {
+      dxx *= 100;
+      xp10 |= 2;
+    }
+    if (dxx < 9.9999949999999e-1) {
+      dxx *= 10;
+      ++xp10;
+    }
+    double_bround5(dxx, kBankerRound8, &quotient, &remainder);
+    start = memcpya(qrtoa_1p5(quotient, remainder, start), "e-", 2);
+    if (xp10 >= 100) {
+      quotient = xp10 / 100;
+      *start++ = '0' + quotient;
+      xp10 -= 100 * quotient;
+    }
+    return memcpya(start, &(kDigitPair[xp10]), 2);
+  }
+  if (dxx >= 999999.49999999) {
+    // 6 sig fig exponential notation, large
+    if (dxx >= 9.9999949999999e15) {
+      if (dxx >= 9.9999949999999e127) {
+	if (dxx > DBL_MAX) {
+	  return memcpyl3a(start, "inf");
+	}
+	if (dxx >= 9.9999949999999e255) {
+	  dxx *= 1.0e-256;
+	  xp10 |= 256;
+	} else {
+	  dxx *= 1.0e-128;
+	  xp10 |= 128;
+	}
+      }
+      if (dxx >= 9.9999949999999e63) {
+	dxx *= 1.0e-64;
+	xp10 |= 64;
+      }
+      if (dxx >= 9.9999949999999e31) {
+	dxx *= 1.0e-32;
+	xp10 |= 32;
+      }
+      if (dxx >= 9.9999949999999e15) {
+	dxx *= 1.0e-16;
+	xp10 |= 16;
+      }
+    }
+    if (dxx >= 9.9999949999999e7) {
+      dxx *= 1.0e-8;
+      xp10 |= 8;
+    }
+    if (dxx >= 9.9999949999999e3) {
+      dxx *= 1.0e-4;
+      xp10 |= 4;
+    }
+    if (dxx >= 9.9999949999999e1) {
+      dxx *= 1.0e-2;
+      xp10 |= 2;
+    }
+    if (dxx >= 9.9999949999999e0) {
+      dxx *= 1.0e-1;
+      xp10++;
+    }
+    double_bround5(dxx, kBankerRound8, &quotient, &remainder);
+    start = memcpya(qrtoa_1p5(quotient, remainder, start), "e+", 2);
+    if (xp10 >= 100) {
+      quotient = xp10 / 100;
+      *start++ = '0' + quotient;
+      xp10 -= 100 * quotient;
+    }
+    return memcpya(start, &(kDigitPair[xp10]), 2);
+  }
+  if (dxx >= 0.99999949999999) {
+    return dtoa_so6(dxx, start);
+  }
+  // 6 sig fig decimal, no less than ~0.0001
+  start = memcpya(start, "0.", 2);
+  if (dxx < 9.9999949999999e-3) {
+    dxx *= 100;
+    start = memcpya(start, "00", 2);
+  }
+  if (dxx < 9.9999949999999e-2) {
+    dxx *= 10;
+    *start++ = '0';
+  }
+  return uitoa_trunc6(double_bround(dxx * 1000000, kBankerRound8), start);
+}
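+
+// e.g. dtoa_g(0.000123456789, buf) reaches the final branch above: it writes
+// "0." followed by three '0' pads, then
+// uitoa_trunc6(double_bround(123456.789, kBankerRound8)) appends "123457",
+// yielding "0.000123457".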
+
+char* dtoa_g_p8(double dxx, char* start) {
+  uint32_t xp10 = 0;
+  char wbuf[16];
+  char* wpos = wbuf;
+  uint32_t quotient;
+  uint32_t remainder;
+  if (dxx != dxx) {
+    return memcpyl3a(start, "nan");
+  }
+  if (dxx < 0) {
+    *wpos++ = '-';
+    dxx = -dxx;
+  }
+  if (dxx < 9.9999999499999e-5) {
+    // 8 sig fig exponential notation, small
+    if (dxx < 9.9999999499999e-16) {
+      if (dxx < 9.9999999499999e-128) {
+	if (dxx == 0.0) {
+	  *start = '0';
+	  return &(start[1]);
+        }
+	if (dxx < 9.9999999499999e-256) {
+	  dxx *= 1.0e256;
+	  xp10 |= 256;
+	} else {
+	  dxx *= 1.0e128;
+	  xp10 |= 128;
+	}
+      }
+      if (dxx < 9.9999999499999e-64) {
+	dxx *= 1.0e64;
+	xp10 |= 64;
+      }
+      if (dxx < 9.9999999499999e-32) {
+	dxx *= 1.0e32;
+	xp10 |= 32;
+      }
+      if (dxx < 9.9999999499999e-16) {
+	dxx *= 1.0e16;
+	xp10 |= 16;
+      }
+    }
+    if (dxx < 9.9999999499999e-8) {
+      dxx *= 100000000;
+      xp10 |= 8;
+    }
+    if (dxx < 9.9999999499999e-4) {
+      dxx *= 10000;
+      xp10 |= 4;
+    }
+    if (dxx < 9.9999999499999e-2) {
+      dxx *= 100;
+      xp10 |= 2;
+    }
+    if (dxx < 9.9999999499999e-1) {
+      dxx *= 10;
+      ++xp10;
+    }
+    double_bround7(dxx, kBankerRound6, &quotient, &remainder);
+    wpos = qrtoa_1p7(quotient, remainder, wpos);
+    remainder = wpos - wbuf;
+    if (xp10 >= 100) {
+      start = memcpya(start, wbuf, remainder);
+      quotient = xp10 / 100;
+      start = memcpyax(start, "e-", 2, '0' + quotient);
+      xp10 -= 100 * quotient;
+    } else {
+      start = memcpya(start, wbuf, remainder);
+      start = memcpya(start, "e-", 2);
+    }
+    return memcpya(start, &(kDigitPair[xp10]), 2);
+  }
+  if (dxx >= 99999999.499999) {
+    // 8 sig fig exponential notation, large
+    if (dxx >= 9.9999999499999e15) {
+      if (dxx >= 9.9999999499999e127) {
+	if (dxx > DBL_MAX) {
+	  if (wpos == wbuf) {
+	    return memcpya(start, " inf", 4);
+	  }
+	  return memcpya(start, "-inf", 4);
+	}
+	if (dxx >= 9.9999999499999e255) {
+	  dxx *= 1.0e-256;
+	  xp10 |= 256;
+	} else {
+	  dxx *= 1.0e-128;
+	  xp10 |= 128;
+	}
+      }
+      if (dxx >= 9.9999999499999e63) {
+	dxx *= 1.0e-64;
+	xp10 |= 64;
+      }
+      if (dxx >= 9.9999999499999e31) {
+	dxx *= 1.0e-32;
+	xp10 |= 32;
+      }
+      if (dxx >= 9.9999999499999e15) {
+	dxx *= 1.0e-16;
+	xp10 |= 16;
+      }
+    }
+    if (dxx >= 9.9999999499999e7) {
+      dxx *= 1.0e-8;
+      xp10 |= 8;
+    }
+    if (dxx >= 9.9999999499999e3) {
+      dxx *= 1.0e-4;
+      xp10 |= 4;
+    }
+    if (dxx >= 9.9999999499999e1) {
+      dxx *= 1.0e-2;
+      xp10 |= 2;
+    }
+    if (dxx >= 9.9999999499999e0) {
+      dxx *= 1.0e-1;
+      ++xp10;
+    }
+    double_bround7(dxx, kBankerRound6, &quotient, &remainder);
+    wpos = qrtoa_1p7(quotient, remainder, wpos);
+    remainder = wpos - wbuf;
+    if (xp10 >= 100) {
+      start = memcpya(start, wbuf, remainder);
+      quotient = xp10 / 100;
+      start = memcpyax(start, "e+", 2, '0' + quotient);
+      xp10 -= 100 * quotient;
+    } else {
+      start = memcpya(start, wbuf, remainder);
+      start = memcpya(start, "e+", 2);
+    }
+    return memcpya(start, &(kDigitPair[xp10]), 2);
+  }
+  if (dxx >= 0.99999999499999) {
+    wpos = dtoa_so8(dxx, wpos);
+  } else {
+    // 8 sig fig decimal, no less than ~0.0001
+    wpos = memcpya(wpos, "0.", 2);
+    if (dxx < 9.9999999499999e-3) {
+      dxx *= 100;
+      wpos = memcpya(wpos, "00", 2);
+    }
+    if (dxx < 9.9999999499999e-2) {
+      dxx *= 10;
+      *wpos++ = '0';
+    }
+    wpos = uitoa_trunc8(double_bround(dxx * 100000000, kBankerRound6), wpos);
+  }
+  remainder = wpos - wbuf;
+  return memcpya(start, wbuf, remainder);
+}
+
+static_assert(kDosageMax == 32768, "print_dosage() needs to be updated.");
+char* print_dosage(uint64_t dosage, char* start) {
+  // 3 digit precision seems like the best compromise between accuracy and
+  // avoidance of rounding ugliness
+  // (Rounding ugliness is not actually hidden for e.g. 1000 Genomes phase 1,
+  // since there are lots of 0.05 and 0.1 dosages which all get rounded in the
+  // same direction; oh well.)
+
+  // +16 since we need to round .99951 up to 1
+  const uint64_t dosage_p16 = dosage + 16;
+  start = uint32toa(dosage_p16 / kDosageMax, start);
+  const uint32_t remainder_p16 = ((uint32_t)dosage_p16) & (kDosageMax - 1);
+  if (remainder_p16 < 33) {
+    return start;
+  }
+  // (1000 * remainder + 16384) / 32768
+  //   1/16 = .0625 -> print 0.062
+  //   3/16 = .1875 -> print 0.188
+  //   5/16 = .3125 -> print 0.312
+  // const uint32_t three_decimal_places = ((125 * remainder + 2048) / 4096) - ((remainder % 8192) == 2048);
+  // (remainder % 8192 == 2048, i.e. an even quotient tied at exactly x.5,
+  // corresponds to remainder_p16 % 8192 == 2064 after the +16 offset)
+  const uint32_t three_decimal_places = ((125 * remainder_p16 + 48) / 4096) - ((remainder_p16 % 8192) == 2064);
+  // three_decimal_places guaranteed to be nonzero here
+  *start++ = '.';
+  const uint32_t first_decimal_place = three_decimal_places / 100;
+  *start++ = '0' + first_decimal_place;
+  const uint32_t last_two_digits = three_decimal_places - first_decimal_place * 100;
+  if (last_two_digits) {
+    memcpy(start, &(kDigitPair[last_two_digits]), 2);
+    return &(start[1 + (start[1] != '0')]);
+  }
+  return start;
+}
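+
+// Worked example: dosage == 16384 (0.5 * kDosageMax) gives dosage_p16 ==
+// 16400, so the integer part is 0 and remainder_p16 == 16400;
+// (125 * 16400 + 48) / 4096 == 500 with no tie correction, the first decimal
+// place is '5', and the trailing "00" is suppressed, printing "0.5".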
+/*
+char* dtoa_f_p5_clipped(double dxx, char* start) {
+  if (dxx != dxx) {
+    return memcpyl3a(start, "nan");
+  }
+  if (dxx < 0.0) {
+    // note that "-0" will be printed for very small negative numbers; do we
+    // want this?
+    *start++ = '-';
+    dxx = -dxx;
+  }
+#ifdef __LP64__
+  if (dxx < 4294967295.999994) {
+    // We could use different levels of banker's rounding for different-size
+    // quotients, but that's overkill for now; revisit after basic dosage
+    // support is working.
+    dxx *= 100000;
+    uint64_t remainder = (int64_t)dxx;
+    remainder += (int64_t)((dxx - ((int64_t)remainder)) + kBankerRound6[remainder & 1]);
+    uint64_t quotient = remainder / 100000;
+    remainder = remainder - quotient * 100000;
+    start = uint32toa(quotient, start);
+    return rtoa_p5(remainder, start);
+  }
+#else
+  if (dxx < 2147483647.999994) {
+    // avoid 64-bit integer math in 32-bit build.
+    // (todo: a bit of benchmarking)
+    const uintptr_t quotient = (intptr_t)dxx;
+    const double remainder_d = (dxx - ((intptr_t)quotient)) * 100000;
+    const uint32_t remainder_d_trunc = (int32_t)remainder_d;
+    const uint32_t remainder = (int32_t)(remainder_d + kBankerRound6[remainder_d_trunc & 1]);
+    start = uint32toa(quotient, start);
+    return rtoa_p5(remainder, start);
+  }
+#endif
+  if (dxx == INFINITY) {
+    return memcpyl3a(start, "inf");
+  }
+  // just punt larger numbers to glibc for now, this isn't a bottleneck
+  start += sprintf(start, "%.5f", dxx);
+  // .5f doesn't strip trailing zeroes, do that manually
+  for (uint32_t uii = 0; uii < 5; ++uii) {
+    if (start[-1] != '0') {
+      return start;
+    }
+    --start;
+  }
+  return &(start[-1]); // strip the decimal point
+}
+*/
+
+
+// Briefly had banker's rounding for floats, but then I realized that the
+// only float-printing function calls are --make-grm related; they all
+// request 6-7 digits of precision, and at that point it's impossible to
+// distinguish exact 0.5-matches in the remainder.  So we just have generic
+// rounding functions here, with interfaces similar to the double-rounding
+// functions, to minimize the need for separate reasoning about this code.
+static inline uint32_t float_round(float fxx) {
+  return (uint32_t)((int32_t)(fxx + 0.5));
+}
+
+static inline void float_round1(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
+  uint32_t remainder = float_round(fxx * 10);
+  *quotientp = remainder / 10;
+  *remainderp = remainder - (*quotientp) * 10;
+}
+
+static inline void float_round2(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
+  uint32_t remainder = float_round(fxx * 100);
+  *quotientp = remainder / 100;
+  *remainderp = remainder - (*quotientp) * 100;
+}
+
+static inline void float_round3(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
+  uint32_t remainder = float_round(fxx * 1000);
+  *quotientp = remainder / 1000;
+  *remainderp = remainder - (*quotientp) * 1000;
+}
+
+static inline void float_round4(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
+  uint32_t remainder = float_round(fxx * 10000);
+  *quotientp = remainder / 10000;
+  *remainderp = remainder - (*quotientp) * 10000;
+}
+
+static inline void float_round5(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
+  uint32_t remainder = float_round(fxx * 100000);
+  *quotientp = remainder / 100000;
+  *remainderp = remainder - (*quotientp) * 100000;
+}
+
+char* ftoa_so6(float fxx, char* start) {
+  uint32_t uii;
+  uint32_t quotient;
+  uint32_t remainder;
+  // difference between consecutive floats near 10 can be as large as
+  // 10 * 2^{-23}, which is just under 1.2e-6.  So, to avoid printing an extra
+  // digit, we have to set this bound to be robust to an addition error of size
+  // 6e-7.
+  // (possible todo: just brute-force test this on all <2^32 possible floats
+  // and look for a better threshold)
+  if (fxx < 99.999944) {
+    if (fxx < 9.9999944) {
+      float_round5(fxx, &quotient, &remainder);
+      return qrtoa_1p5(quotient, remainder, start);
+    }
+    float_round4(fxx, &quotient, &remainder);
+    start = memcpya(start, &(kDigitPair[quotient]), 2);
+    if (!remainder) {
+      return start;
+    }
+    *start++ = '.';
+    quotient = remainder / 100;
+    memcpy(start, &(kDigitPair[quotient]), 2);
+    remainder -= 100 * quotient;
+    if (remainder) {
+      start += 2;
+    ftoa_so6_pretail:
+      memcpy(start, &(kDigitPair[remainder]), 2);
+    }
+  ftoa_so6_tail:
+    if (start[1] != '0') {
+      return &(start[2]);
+    }
+    return &(start[1]);
+  }
+  if (fxx < 9999.9944) {
+    if (fxx < 999.99944) {
+      float_round3(fxx, &uii, &remainder);
+      quotient = uii / 100;
+      *start = '0' + quotient;
+      quotient = uii - 100 * quotient;
+      start = memcpya(&(start[1]), &(kDigitPair[quotient]), 2);
+      if (!remainder) {
+	return start;
+      }
+      *start++ = '.';
+      quotient = remainder / 10;
+      memcpy(start, &(kDigitPair[quotient]), 2);
+      remainder -= quotient * 10;
+      if (!remainder) {
+        goto ftoa_so6_tail;
+      }
+      start[2] = '0' + remainder;
+      return &(start[3]);
+    }
+    float_round2(fxx, &uii, &remainder);
+    quotient = uii / 100;
+    start = memcpya(start, &(kDigitPair[quotient]), 2);
+    quotient = uii - (100 * quotient);
+    start = memcpya(start, &(kDigitPair[quotient]), 2);
+    if (!remainder) {
+      return start;
+    }
+    *start++ = '.';
+    goto ftoa_so6_pretail;
+  }
+  if (fxx >= 99999.944) {
+    return uitoa_z6(float_round(fxx), start);
+  }
+  float_round1(fxx, &uii, &remainder);
+  quotient = uii / 10000;
+  *start = '0' + quotient;
+  uii -= 10000 * quotient;
+  quotient = uii / 100;
+  start = memcpya(&(start[1]), &(kDigitPair[quotient]), 2);
+  uii = uii - 100 * quotient;
+  start = memcpya(start, &(kDigitPair[uii]), 2);
+  if (!remainder) {
+    return start;
+  }
+  *start = '.';
+  start[1] = '0' + remainder;
+  return &(start[2]);
+}
+
+char* ftoa_g(float fxx, char* start) {
+  uint32_t xp10 = 0;
+  uint32_t quotient;
+  uint32_t remainder;
+  if (fxx != fxx) {
+    return memcpyl3a(start, "nan");
+  }
+  if (fxx < 0) {
+    *start++ = '-';
+    fxx = -fxx;
+  }
+  if (fxx < 9.9999944e-5) {
+    if (fxx < 9.9999944e-16) {
+      if (fxx == 0.0) {
+	*start = '0';
+	return &(start[1]);
+      }
+      if (fxx < 9.9999944e-32) {
+	fxx *= 1.0e32;
+	xp10 |= 32;
+      } else {
+	fxx *= 1.0e16;
+	xp10 |= 16;
+      }
+    }
+    if (fxx < 9.9999944e-8) {
+      fxx *= 100000000;
+      xp10 |= 8;
+    }
+    if (fxx < 9.9999944e-4) {
+      fxx *= 10000;
+      xp10 |= 4;
+    }
+    if (fxx < 9.9999944e-2) {
+      fxx *= 100;
+      xp10 |= 2;
+    }
+    if (fxx < 9.9999944e-1) {
+      fxx *= 10;
+      ++xp10;
+    }
+    float_round5(fxx, &quotient, &remainder);
+    return memcpya(memcpya(qrtoa_1p5(quotient, remainder, start), "e-", 2), &(kDigitPair[xp10]), 2);
+  }
+  if (fxx >= 999999.44) {
+    if (fxx >= 9.9999944e15) {
+      if (fxx > FLT_MAX) {
+	return memcpyl3a(start, "inf");
+      }
+      if (fxx >= 9.9999944e31) {
+	fxx *= 1.0e-32;
+	xp10 |= 32;
+      } else {
+	fxx *= 1.0e-16;
+	xp10 |= 16;
+      }
+    }
+    if (fxx >= 9.9999944e7) {
+      fxx *= 1.0e-8;
+      xp10 |= 8;
+    }
+    if (fxx >= 9.9999944e3) {
+      fxx *= 1.0e-4;
+      xp10 |= 4;
+    }
+    if (fxx >= 9.9999944e1) {
+      fxx *= 1.0e-2;
+      xp10 |= 2;
+    }
+    if (fxx >= 9.9999944e0) {
+      fxx *= 1.0e-1;
+      ++xp10;
+    }
+    float_round5(fxx, &quotient, &remainder);
+    return memcpya(memcpya(qrtoa_1p5(quotient, remainder, start), "e+", 2), &(kDigitPair[xp10]), 2);
+  }
+  if (fxx >= 0.99999944) {
+    return ftoa_so6(fxx, start);
+  }
+  // 6 sig fig decimal, no less than ~0.0001
+  start = memcpya(start, "0.", 2);
+  if (fxx < 9.9999944e-3) {
+    fxx *= 100;
+    start = memcpya(start, "00", 2);
+  }
+  if (fxx < 9.9999944e-2) {
+    fxx *= 10;
+    *start++ = '0';
+  }
+  return uitoa_trunc6(float_round(fxx * 1000000), start);
+}
+
+
+/*
+void magic_num(uint32_t divisor, uint64_t* multp, uint32_t* __restrict pre_shiftp, uint32_t* __restrict post_shiftp, uint32_t* __restrict incrp) {
+  // Enables fast integer division by a constant not known until runtime.  See
+  // http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html .
+  // Assumes divisor is not zero, of course.
+  // May want to populate a struct instead.
+  // (May not need this any more?)
+  assert(divisor);
+  if (!(divisor & (divisor - 1))) {
+    // power of 2
+    *multp = 1;
+    *pre_shiftp = 0;
+    *post_shiftp = __builtin_ctz(divisor);
+    *incrp = 0;
+    return;
+  }
+  uint32_t quotient = 0x80000000U / divisor;
+  uint32_t remainder = 0x80000000U - (quotient * divisor);
+  const uint32_t ceil_log_2_d = 32 - __builtin_clz(divisor);
+  uint32_t down_multiplier = 0;
+  uint32_t down_exponent = 0;
+  uint32_t has_magic_down = 0;
+  uint32_t exponent;
+  for (exponent = 0; ; ++exponent) {
+    if (remainder >= divisor - remainder) {
+      quotient = quotient * 2 + 1;
+      remainder = remainder * 2 - divisor;
+    } else {
+      quotient = quotient * 2;
+      remainder = remainder * 2;
+    }
+    if ((exponent >= ceil_log_2_d) || (divisor - remainder) <= (1U << exponent)) {
+      break;
+    }
+    if ((!has_magic_down) && (remainder <= (1U << exponent))) {
+      has_magic_down = 1;
+      down_multiplier = quotient;
+      down_exponent = exponent;
+    }
+  }
+  if (exponent < ceil_log_2_d) {
+    *multp = quotient + 1;
+    *pre_shiftp = 0;
+    *post_shiftp = 32 + exponent;
+    *incrp = 0;
+    return;
+  }
+  if (divisor & 1) {
+    *multp = down_multiplier;
+    *pre_shiftp = 0;
+    *post_shiftp = 32 + down_exponent;
+    *incrp = 1;
+    return;
+  }
+  *pre_shiftp = __builtin_ctz(divisor);
+  uint32_t dummy;
+  magic_num(divisor >> (*pre_shiftp), multp, &dummy, post_shiftp, incrp);
+}
+*/
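+
+// For reference, on divisor == 3 the algorithm above reproduces the classic
+// result emitted by compilers: multiplier 0xAAAAAAAB (== ceil(2^33 / 3)),
+// pre_shift == 0, post_shift == 33, incr == 0, i.e.
+//   n / 3 == (uint32_t)(((uint64_t)n * 0xAAAAAAABLLU) >> 33)
+// for all uint32_t n.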
+
+
+void fill_bits_nz(uintptr_t start_idx, uintptr_t end_idx, uintptr_t* bitarr) {
+  assert(end_idx > start_idx);
+  uintptr_t maj_start = start_idx / kBitsPerWord;
+  uintptr_t maj_end = end_idx / kBitsPerWord;
+  uintptr_t minor;
+  if (maj_start == maj_end) {
+    bitarr[maj_start] |= (k1LU << (end_idx % kBitsPerWord)) - (k1LU << (start_idx % kBitsPerWord));
+  } else {
+    bitarr[maj_start] |= ~((k1LU << (start_idx % kBitsPerWord)) - k1LU);
+    fill_ulong_one(maj_end - maj_start - 1, &(bitarr[maj_start + 1]));
+    minor = end_idx % kBitsPerWord;
+    if (minor) {
+      bitarr[maj_end] |= (k1LU << minor) - k1LU;
+    }
+  }
+}
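+
+// e.g. on a 64-bit build, fill_bits_nz(3, 9, bitarr) takes the single-word
+// branch and ORs bitarr[0] with (1 << 9) - (1 << 3) == 0x1f8, setting bits
+// 3..8 (end_idx is exclusive).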
+
+void clear_bits_nz(uintptr_t start_idx, uintptr_t end_idx, uintptr_t* bitarr) {
+  assert(end_idx > start_idx);
+  uintptr_t maj_start = start_idx / kBitsPerWord;
+  uintptr_t maj_end = end_idx / kBitsPerWord;
+  uintptr_t minor;
+  if (maj_start == maj_end) {
+    bitarr[maj_start] &= ~((k1LU << (end_idx % kBitsPerWord)) - (k1LU << (start_idx % kBitsPerWord)));
+  } else {
+    bitarr[maj_start] &= ((k1LU << (start_idx % kBitsPerWord)) - k1LU);
+    fill_ulong_zero(maj_end - maj_start - 1, &(bitarr[maj_start + 1]));
+    minor = end_idx % kBitsPerWord;
+    if (minor) {
+      bitarr[maj_end] &= ~((k1LU << minor) - k1LU);
+    }
+  }
+}
+
+#ifdef __LP64__
+uintptr_t next_set_ul_unsafe(const uintptr_t* bitarr, uintptr_t loc) {
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / kBitsPerWord]);
+  uintptr_t ulii = (*bitarr_ptr) >> (loc % kBitsPerWord);
+  if (ulii) {
+    return loc + CTZLU(ulii);
+  }
+  do {
+    ulii = *(++bitarr_ptr);
+  } while (!ulii);
+  return ((uintptr_t)(bitarr_ptr - bitarr)) * kBitsPerWord + CTZLU(ulii);
+}
+#endif
+
+uint32_t next_unset(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil) {
+  assert(ceil >= 1);
+  const uintptr_t* bitarr_ptr = &(bitarr[loc / kBitsPerWord]);
+  uintptr_t ulii = (~(*bitarr_ptr)) >> (loc % kBitsPerWord);
+  if (ulii) {
+    loc += CTZLU(ulii);
+    return MINV(loc, ceil);
+  }
+  const uintptr_t* bitarr_last = &(bitarr[(ceil - 1) / kBitsPerWord]);
+  do {
+    if (bitarr_ptr >= bitarr_last) {
+      return ceil;
+    }
+    ulii = *(++bitarr_ptr);
+  } while (ulii == ~k0LU);
+  loc = ((uintptr_t)(bitarr_ptr - bitarr)) * kBitsPerWord + CTZLU(~ulii);
+  return MINV(loc, ceil);
+}
+
+
+boolerr_t bigstack_calloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr) {
+  *uc_arr_ptr = (unsigned char*)bigstack_alloc(ct);
+  if (!(*uc_arr_ptr)) {
+    return 1;
+  }
+  memset(*uc_arr_ptr, 0, ct);
+  return 0;
+}
+
+boolerr_t bigstack_calloc_d(uintptr_t ct, double** d_arr_ptr) {
+  *d_arr_ptr = (double*)bigstack_alloc(ct * sizeof(double));
+  if (!(*d_arr_ptr)) {
+    return 1;
+  }
+  fill_double_zero(ct, *d_arr_ptr);
+  return 0;
+}
+
+boolerr_t bigstack_calloc_f(uintptr_t ct, float** f_arr_ptr) {
+  *f_arr_ptr = (float*)bigstack_alloc(ct * sizeof(float));
+  if (!(*f_arr_ptr)) {
+    return 1;
+  }
+  fill_float_zero(ct, *f_arr_ptr);
+  return 0;
+}
+
+boolerr_t bigstack_calloc_usi(uintptr_t ct, uint16_t** usi_arr_ptr) {
+  *usi_arr_ptr = (uint16_t*)bigstack_alloc(ct * sizeof(int16_t));
+  if (!(*usi_arr_ptr)) {
+    return 1;
+  }
+  memset(*usi_arr_ptr, 0, ct * sizeof(int16_t));
+  return 0;
+}
+
+boolerr_t bigstack_calloc_ui(uintptr_t ct, uint32_t** ui_arr_ptr) {
+  *ui_arr_ptr = (uint32_t*)bigstack_alloc(ct * sizeof(int32_t));
+  if (!(*ui_arr_ptr)) {
+    return 1;
+  }
+  fill_uint_zero(ct, *ui_arr_ptr);
+  return 0;
+}
+
+boolerr_t bigstack_calloc_ul(uintptr_t ct, uintptr_t** ul_arr_ptr) {
+  *ul_arr_ptr = (uintptr_t*)bigstack_alloc(ct * sizeof(intptr_t));
+  if (!(*ul_arr_ptr)) {
+    return 1;
+  }
+  fill_ulong_zero(ct, *ul_arr_ptr);
+  return 0;
+}
+
+boolerr_t bigstack_calloc_ull(uintptr_t ct, uint64_t** ull_arr_ptr) {
+  *ull_arr_ptr = (uint64_t*)bigstack_alloc(ct * sizeof(int64_t));
+  if (!(*ull_arr_ptr)) {
+    return 1;
+  }
+  fill_ull_zero(ct, *ull_arr_ptr);
+  return 0;
+}
+
+boolerr_t bigstack_end_calloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr) {
+  *uc_arr_ptr = (unsigned char*)bigstack_end_alloc(ct);
+  if (!(*uc_arr_ptr)) {
+    return 1;
+  }
+  memset(*uc_arr_ptr, 0, ct);
+  return 0;
+}
+
+boolerr_t bigstack_end_calloc_d(uintptr_t ct, double** d_arr_ptr) {
+  *d_arr_ptr = (double*)bigstack_end_alloc(ct * sizeof(double));
+  if (!(*d_arr_ptr)) {
+    return 1;
+  }
+  fill_double_zero(ct, *d_arr_ptr);
+  return 0;
+}
+
+boolerr_t bigstack_end_calloc_f(uintptr_t ct, float** f_arr_ptr) {
+  *f_arr_ptr = (float*)bigstack_end_alloc(ct * sizeof(float));
+  if (!(*f_arr_ptr)) {
+    return 1;
+  }
+  fill_float_zero(ct, *f_arr_ptr);
+  return 0;
+}
+
+boolerr_t bigstack_end_calloc_ui(uintptr_t ct, uint32_t** ui_arr_ptr) {
+  *ui_arr_ptr = (uint32_t*)bigstack_end_alloc(ct * sizeof(int32_t));
+  if (!(*ui_arr_ptr)) {
+    return 1;
+  }
+  fill_uint_zero(ct, *ui_arr_ptr);
+  return 0;
+}
+
+boolerr_t bigstack_end_calloc_ul(uintptr_t ct, uintptr_t** ul_arr_ptr) {
+  *ul_arr_ptr = (uintptr_t*)bigstack_end_alloc(ct * sizeof(intptr_t));
+  if (!(*ul_arr_ptr)) {
+    return 1;
+  }
+  fill_ulong_zero(ct, *ul_arr_ptr);
+  return 0;
+}
+
+boolerr_t bigstack_end_calloc_ull(uintptr_t ct, uint64_t** ull_arr_ptr) {
+  *ull_arr_ptr = (uint64_t*)bigstack_end_alloc(ct * sizeof(int64_t));
+  if (!(*ull_arr_ptr)) {
+    return 1;
+  }
+  fill_ull_zero(ct, *ull_arr_ptr);
+  return 0;
+}
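+
+// Illustrative call-site pattern (not upstream code; the label name is
+// hypothetical): each wrapper returns 1 on out-of-memory, so allocations
+// chain in a single condition:
+//   uint32_t* counts;
+//   double* weights;
+//   if (bigstack_calloc_ui(sample_ct, &counts) ||
+//       bigstack_calloc_d(sample_ct, &weights)) {
+//     goto my_function_ret_NOMEM;
+//   }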
+
+
+void bitarr_invert(uintptr_t bit_ct, uintptr_t* bitarr) {
+  uintptr_t* bitarr_stop = &(bitarr[bit_ct / kBitsPerWord]);
+  while (bitarr < bitarr_stop) {
+    *bitarr = ~(*bitarr);
+    ++bitarr;
+  }
+  const uint32_t trailing_bit_ct = bit_ct % kBitsPerWord;
+  if (trailing_bit_ct) {
+    *bitarr = (~(*bitarr)) & ((k1LU << trailing_bit_ct) - k1LU);
+  }
+}
+
+void bitarr_invert_copy(const uintptr_t* __restrict source_bitarr, uintptr_t bit_ct, uintptr_t* __restrict target_bitarr) {
+  const uintptr_t* source_bitarr_stop = &(source_bitarr[bit_ct / kBitsPerWord]);
+  while (source_bitarr < source_bitarr_stop) {
+    *target_bitarr++ = ~(*source_bitarr++);
+  }
+  const uint32_t trailing_bit_ct = bit_ct % kBitsPerWord;
+  if (trailing_bit_ct) {
+    *target_bitarr = (~(*source_bitarr)) & ((k1LU << trailing_bit_ct) - k1LU);
+  }
+}
+
+void bitvec_and_copy(const uintptr_t* __restrict source1_bitvec, const uintptr_t* __restrict source2_bitvec, uintptr_t word_ct, uintptr_t* target_bitvec) {
+#ifdef __LP64__
+  vul_t* target_bitvvec = (vul_t*)target_bitvec;
+  const vul_t* source1_bitvvec = (const vul_t*)source1_bitvec;
+  const vul_t* source2_bitvvec = (const vul_t*)source2_bitvec;
+  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
+  for (uintptr_t ulii = 0; ulii < full_vec_ct; ++ulii) {
+    target_bitvvec[ulii] = source1_bitvvec[ulii] & source2_bitvvec[ulii];
+  }
+  #ifdef USE_AVX2
+  if (word_ct & 2) {
+    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
+    target_bitvec[base_idx] = source1_bitvec[base_idx] & source2_bitvec[base_idx];
+    target_bitvec[base_idx + 1] = source1_bitvec[base_idx + 1] & source2_bitvec[base_idx + 1];
+  }
+  #endif
+  if (word_ct & 1) {
+    target_bitvec[word_ct - 1] = source1_bitvec[word_ct - 1] & source2_bitvec[word_ct - 1];
+  }
+#else
+  for (uintptr_t widx = 0; widx < word_ct; ++widx) {
+    target_bitvec[widx] = source1_bitvec[widx] & source2_bitvec[widx];
+  }
+#endif
+}
+
+void bitvec_andnot_copy(const uintptr_t* __restrict source_bitvec, const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* target_bitvec) {
+  // target_bitvec := source_bitvec AND (~exclude_bitvec)
+#ifdef __LP64__
+  vul_t* target_bitvvec = (vul_t*)target_bitvec;
+  const vul_t* source_bitvvec = (const vul_t*)source_bitvec;
+  const vul_t* exclude_bitvvec = (const vul_t*)exclude_bitvec;
+  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
+  for (uintptr_t ulii = 0; ulii < full_vec_ct; ++ulii) {
+    target_bitvvec[ulii] = source_bitvvec[ulii] & (~exclude_bitvvec[ulii]);
+  }
+  #ifdef USE_AVX2
+  if (word_ct & 2) {
+    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
+    target_bitvec[base_idx] = source_bitvec[base_idx] & (~exclude_bitvec[base_idx]);
+    target_bitvec[base_idx + 1] = source_bitvec[base_idx + 1] & (~exclude_bitvec[base_idx + 1]);
+  }
+  #endif
+  if (word_ct & 1) {
+    target_bitvec[word_ct - 1] = source_bitvec[word_ct - 1] & (~exclude_bitvec[word_ct - 1]);
+  }
+#else
+  for (uintptr_t widx = 0; widx < word_ct; ++widx) {
+    target_bitvec[widx] = source_bitvec[widx] & (~exclude_bitvec[widx]);
+  }
+#endif
+}
+
+void bitvec_or(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec) {
+  // main_bitvec := main_bitvec OR arg_bitvec
+#ifdef __LP64__
+  vul_t* main_bitvvec_iter = (vul_t*)main_bitvec;
+  const vul_t* arg_bitvvec_iter = (const vul_t*)arg_bitvec;
+  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
+  if (full_vec_ct & 1) {
+    *main_bitvvec_iter++ |= (*arg_bitvvec_iter++);
+  }
+  if (full_vec_ct & 2) {
+    *main_bitvvec_iter++ |= (*arg_bitvvec_iter++);
+    *main_bitvvec_iter++ |= (*arg_bitvvec_iter++);
+  }
+  for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
+    *main_bitvvec_iter++ |= (*arg_bitvvec_iter++);
+    *main_bitvvec_iter++ |= (*arg_bitvvec_iter++);
+    *main_bitvvec_iter++ |= (*arg_bitvvec_iter++);
+    *main_bitvvec_iter++ |= (*arg_bitvvec_iter++);
+  }
+  #ifdef USE_AVX2
+  if (word_ct & 2) {
+    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
+    main_bitvec[base_idx] |= arg_bitvec[base_idx];
+    main_bitvec[base_idx + 1] |= arg_bitvec[base_idx + 1];
+  }
+  #endif
+  if (word_ct & 1) {
+    main_bitvec[word_ct - 1] |= arg_bitvec[word_ct - 1];
+  }
+#else
+  for (uintptr_t widx = 0; widx < word_ct; ++widx) {
+    main_bitvec[widx] |= arg_bitvec[widx];
+  }
+#endif
+}
+
+void bitvec_andnot2(const uintptr_t* __restrict include_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
+  // main_bitvec := (~main_bitvec) AND include_bitvec
+  // this corresponds _mm_andnot() operand order
+#ifdef __LP64__
+  vul_t* main_bitvvec_iter = (vul_t*)main_bitvec;
+  const vul_t* include_bitvvec_iter = (const vul_t*)include_bitvec;
+  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
+  if (full_vec_ct & 1) {
+    *main_bitvvec_iter = (~(*main_bitvvec_iter)) & (*include_bitvvec_iter++);
+    ++main_bitvvec_iter;
+  }
+  if (full_vec_ct & 2) {
+    *main_bitvvec_iter = (~(*main_bitvvec_iter)) & (*include_bitvvec_iter++);
+    ++main_bitvvec_iter;
+    *main_bitvvec_iter = (~(*main_bitvvec_iter)) & (*include_bitvvec_iter++);
+    ++main_bitvvec_iter;
+  }
+  for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
+    *main_bitvvec_iter = (~(*main_bitvvec_iter)) & (*include_bitvvec_iter++);
+    ++main_bitvvec_iter;
+    *main_bitvvec_iter = (~(*main_bitvvec_iter)) & (*include_bitvvec_iter++);
+    ++main_bitvvec_iter;
+    *main_bitvvec_iter = (~(*main_bitvvec_iter)) & (*include_bitvvec_iter++);
+    ++main_bitvvec_iter;
+    *main_bitvvec_iter = (~(*main_bitvvec_iter)) & (*include_bitvvec_iter++);
+    ++main_bitvvec_iter;
+  }
+  #ifdef USE_AVX2
+  if (word_ct & 2) {
+    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
+    main_bitvec[base_idx] = (~main_bitvec[base_idx]) & include_bitvec[base_idx];
+    main_bitvec[base_idx + 1] = (~main_bitvec[base_idx + 1]) & include_bitvec[base_idx + 1];
+  }
+  #endif
+  if (word_ct & 1) {
+    main_bitvec[word_ct - 1] = (~main_bitvec[word_ct - 1]) & include_bitvec[word_ct - 1];
+  }
+#else
+  for (uintptr_t widx = 0; widx < word_ct; ++widx) {
+    main_bitvec[widx] = (~main_bitvec[widx]) & include_bitvec[widx];
+  }
+#endif
+}
+
+void set_het_missing(uintptr_t word_ct, uintptr_t* genovec) {
+  // 01 -> 11, nothing else changes
+#ifdef __LP64__
+  const vul_t m1 = VCONST_UL(kMask5555);
+  vul_t* geno_vvec_iter = (vul_t*)genovec;
+  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
+  if (full_vec_ct & 1) {
+    const vul_t cur_geno_vword = *geno_vvec_iter;
+    const vul_t cur_geno_vword_low_lshifted = vul_lshift(cur_geno_vword & m1, 1);
+    *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
+  }
+  if (full_vec_ct & 2) {
+    vul_t cur_geno_vword = *geno_vvec_iter;
+    vul_t cur_geno_vword_low_lshifted = vul_lshift(cur_geno_vword & m1, 1);
+    *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
+    cur_geno_vword = *geno_vvec_iter;
+    cur_geno_vword_low_lshifted = vul_lshift(cur_geno_vword & m1, 1);
+    *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
+  }
+  for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
+    vul_t cur_geno_vword = *geno_vvec_iter;
+    vul_t cur_geno_vword_low_lshifted = vul_lshift(cur_geno_vword & m1, 1);
+    *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
+    cur_geno_vword = *geno_vvec_iter;
+    cur_geno_vword_low_lshifted = vul_lshift(cur_geno_vword & m1, 1);
+    *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
+    cur_geno_vword = *geno_vvec_iter;
+    cur_geno_vword_low_lshifted = vul_lshift(cur_geno_vword & m1, 1);
+    *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
+    cur_geno_vword = *geno_vvec_iter;
+    cur_geno_vword_low_lshifted = vul_lshift(cur_geno_vword & m1, 1);
+    *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
+  }
+  #ifdef USE_AVX2
+  if (word_ct & 2) {
+    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
+    uintptr_t geno_word = genovec[base_idx];
+    genovec[base_idx] = geno_word | ((geno_word & kMask5555) << 1);
+    geno_word = genovec[base_idx + 1];
+    genovec[base_idx + 1] = geno_word | ((geno_word & kMask5555) << 1);
+  }
+  #endif
+  if (word_ct & 1) {
+    const uintptr_t geno_word = genovec[word_ct - 1];
+    genovec[word_ct - 1] = geno_word | ((geno_word & kMask5555) << 1);
+  }
+#else
+  for (uintptr_t widx = 0; widx < word_ct; ++widx) {
+    const uintptr_t geno_word = genovec[widx];
+    genovec[widx] = geno_word | ((geno_word & kMask5555) << 1);
+  }
+#endif
+}
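+
+// Worked example of the 01 -> 11 transform on one byte of a genotype word:
+// 0xe4 holds {00, 01, 10, 11} in its four 2-bit slots; (0xe4 & kMask5555)
+// << 1 == 0x88, and 0xe4 | 0x88 == 0xec, so the lone 01 (het) entry becomes
+// 11 (missing) while the 00/10/11 slots are unchanged.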
+
+void genoarr_to_nonmissing(const uintptr_t* genoarr, uint32_t sample_ct, uintptr_t* nonmissing_bitarr) {
+  const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+  const uintptr_t* genoarr_iter = genoarr;
+  halfword_t* nonmissing_bitarr_iter = (halfword_t*)nonmissing_bitarr;
+  for (uint32_t widx = 0; widx < sample_ctl2; ++widx) {
+    uintptr_t ww = ~(*genoarr_iter++);
+    ww = (ww | (ww >> 1)) & kMask5555;
+    *nonmissing_bitarr_iter++ = pack_word_to_halfword(ww);
+  }
+  // zero trailing bits up to word boundary, in a way that doesn't create
+  // aliasing issues
+  // (if zeroing is needed up to vector boundary, that's the caller's
+  // responsibility)
+  const uint32_t trail_ct = sample_ct % kBitsPerWordD2;
+  if (trail_ct) {
+    nonmissing_bitarr_iter[-1] &= (1U << trail_ct) - 1;
+  }
+  if (sample_ctl2 % 2) {
+    *nonmissing_bitarr_iter = 0;
+  }
+}
+
+uint32_t genoarr_count_missing_notsubset_unsafe(const uintptr_t* genoarr, const uintptr_t* exclude_mask, uint32_t sample_ct) {
+  const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+  const uintptr_t* genoarr_iter = genoarr;
+  const halfword_t* exclude_alias_iter = (halfword_t*)exclude_mask;
+  uint32_t missing_ct = 0;
+  for (uint32_t widx = 0; widx < sample_ctl2; ++widx) {
+    uintptr_t ww = *genoarr_iter++;
+    ww = ww & (ww >> 1);
+    const uint32_t include_mask = ~(*exclude_alias_iter++);
+    missing_ct += popcount01_long(ww & unpack_halfword_to_word(include_mask));
+  }
+  return missing_ct;
+}
+
+
+int32_t get_variant_uidx_without_htable(const char* idstr, char** variant_ids, const uintptr_t* variant_include, uint32_t variant_ct) {
+  const uint32_t id_blen = strlen(idstr) + 1;
+  uint32_t variant_uidx = 0;
+  int32_t retval = -1;
+  for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+    next_set_unsafe_ck(variant_include, &variant_uidx);
+    if (!memcmp(idstr, variant_ids[variant_uidx], id_blen)) {
+      if (retval != -1) {
+	// duplicate
+	return -2;
+      }
+      retval = (int32_t)variant_uidx;
+    }
+  }
+  return retval;
+}
+
+// assumes 0b11 == missing
+// retired in favor of genoarr_to_nonmissing followed by a for loop?
+/*
+void copy_when_nonmissing(const uintptr_t* loadbuf, const void* source, uintptr_t elem_size, uintptr_t unfiltered_sample_ct, uintptr_t missing_ct, void* dest) {
+  // tried hardcoding elem_size == 4 and 8; that was only ~4% faster, so may
+  // as well use a single general-purpose function.
+  if (!missing_ct) {
+    memcpy(dest, source, unfiltered_sample_ct * elem_size);
+    return;
+  }
+  const uintptr_t* loadbuf_iter = loadbuf;
+  const uintptr_t* loadbuf_end = &(loadbuf[QUATERCT_TO_WORDCT(unfiltered_sample_ct)]);
+  const unsigned char* source_alias = (const unsigned char*)source;
+  char* dest_iter = (char*)dest;
+  uintptr_t copy_start_idx = 0;
+  uintptr_t sample_idx_offset = 0;
+  do {
+    uintptr_t cur_word = *loadbuf_iter++;
+    cur_word = cur_word & (cur_word >> 1) & kMask5555;
+    while (cur_word) {
+      const uintptr_t new_missing_idx = sample_idx_offset + (CTZLU(cur_word) / 2);
+      if (new_missing_idx != copy_start_idx) {
+        const uintptr_t diff = new_missing_idx - copy_start_idx;
+	dest_iter = memcpya(dest_iter, &(source_alias[copy_start_idx * elem_size]), diff * elem_size);
+      }
+      copy_start_idx = new_missing_idx + 1;
+      cur_word &= cur_word - 1;
+    }
+    sample_idx_offset += kBitsPerWordD2;
+  } while (loadbuf_iter < loadbuf_end);
+  const uintptr_t diff = unfiltered_sample_ct - copy_start_idx;
+  if (diff) {
+    memcpy(dest_iter, &(source_alias[copy_start_idx * elem_size]), diff * elem_size);
+  }
+}
+*/
+
+
+// MurmurHash3, from
+// https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
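+// (This is the 32-bit MurmurHash3_x86_32 variant, specialized to a fixed
+// zero seed; see the commented-out h1 initialization in murmurhash3_32().)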
+static inline uint32_t rotl32(uint32_t x, int8_t r) {
+  return (x << r) | (x >> (32 - r));
+}
+
+static inline uint32_t getblock32(const uint32_t* p, int i) {
+  return p[i];
+}
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+static inline uint32_t fmix32(uint32_t h) {
+  h ^= h >> 16;
+  h *= 0x85ebca6b;
+  h ^= h >> 13;
+  h *= 0xc2b2ae35;
+  h ^= h >> 16;
+
+  return h;
+}
+
+uint32_t murmurhash3_32(const void* key, uint32_t len) {
+  const uint8_t* data = (const uint8_t*)key;
+  const int32_t nblocks = len / 4;
+
+  uint32_t h1 = 0;
+  // uint32_t h1 = seed;
+
+  const uint32_t c1 = 0xcc9e2d51;
+  const uint32_t c2 = 0x1b873593;
+
+  //----------
+  // body
+
+  const uint32_t* blocks = (const uint32_t*)(data + nblocks*4);
+
+  int32_t i;
+  uint32_t k1;
+  for(i = -nblocks; i; i++) {
+      k1 = getblock32(blocks,i);
+
+      k1 *= c1;
+      k1 = rotl32(k1,15);
+      k1 *= c2;
+   
+      h1 ^= k1;
+      h1 = rotl32(h1,13);
+      h1 = h1*5+0xe6546b64;
+  }
+
+  //----------
+  // tail
+
+  const uint8_t* tail = (const uint8_t*)(data + nblocks*4);
+
+  k1 = 0;
+
+  switch(len & 3) {
+    case 3:
+      k1 ^= tail[2] << 16;
+      // fall through
+    case 2:
+      k1 ^= tail[1] << 8;
+      // fall through
+    case 1:
+      k1 ^= tail[0];
+      k1 *= c1;
+      k1 = rotl32(k1,15);
+      k1 *= c2;
+      h1 ^= k1;
+  }
+
+  //----------
+  // finalization
+
+  h1 ^= len;
+
+  return fmix32(h1);
+}
+
+
+uint32_t is_composite6(uintptr_t num) {
+  // assumes num is congruent to 1 or 5 mod 6.
+  // can speed this up by ~50% by hardcoding avoidance of multiples of 5/7,
+  // but this isn't currently a bottleneck so I'll keep this simple
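+  // The +2/+4 alternation below makes divisor visit 5, 7, 11, 13, 17, 19,
+  // ...: exactly the integers >= 5 that are congruent to 1 or 5 mod 6.  Any
+  // composite num coprime to 6 has such a factor <= sqrt(num), so checking
+  // these suffices.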
+  assert((num % 6 == 1) || (num % 6 == 5));
+  uintptr_t divisor = 5;
+  while (divisor * divisor <= num) {
+    if (!(num % divisor)) {
+      return 1;
+    }
+    divisor += 2;
+    if (!(num % divisor)) {
+      return 1;
+    }
+    divisor += 4;
+  }
+  return 0;
+}
+
+uintptr_t geqprime(uintptr_t floor) {
+  // assumes floor is odd and greater than 1.  Returns 5 if floor = 3,
+  // otherwise returns the first prime >= floor.
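+  // Implementation note: the loop below also steps by +2/+4 so that floor
+  // only visits values congruent to 5 or 1 mod 6; the goto enters mid-loop
+  // when the starting value is congruent to 1 mod 6.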
+  assert((floor % 2) && (floor > 1));
+  uintptr_t ulii = floor % 3;
+  if (!ulii) {
+    floor += 2;
+  } else if (ulii == 1) {
+    goto geqprime_1mod6;
+  }
+  while (is_composite6(floor)) {
+    floor += 2;
+  geqprime_1mod6:
+    if (!is_composite6(floor)) {
+      return floor;
+    }
+    floor += 4;
+  }
+  return floor;
+}
+
+uintptr_t leqprime(uintptr_t ceil) {
+  // assumes ceil is odd and greater than 4.  Returns the first prime <= ceil.
+  assert((ceil % 2) && (ceil > 4));
+  uintptr_t ulii = ceil % 3;
+  if (!ulii) {
+    ceil -= 2;
+  } else if (ulii == 2) {
+    goto leqprime_5mod6;
+  }
+  while (is_composite6(ceil)) {
+    ceil -= 2;
+  leqprime_5mod6:
+    if (!is_composite6(ceil)) {
+      return ceil;
+    }
+    ceil -= 4;
+  }
+  return ceil;
+}
+
+boolerr_t htable_good_size_alloc(uint32_t item_ct, uintptr_t bytes_avail, uint32_t** htable_ptr, uint32_t* htable_size_ptr) {
+  bytes_avail &= (~(kCacheline - k1LU));
+  uint32_t htable_size = get_htable_fast_size(item_ct);
+  if (htable_size > bytes_avail / sizeof(int32_t)) {
+    if (!bytes_avail) {
+      return 1;
+    }
+    htable_size = leqprime((bytes_avail / sizeof(int32_t)) - 1);
+    if (htable_size < item_ct * 2) {
+      return 1;
+    }
+  }
+  *htable_ptr = (uint32_t*)bigstack_alloc_raw_rd(htable_size * sizeof(int32_t));
+  *htable_size_ptr = htable_size;
+  return 0;
+}
+
+uint32_t populate_strbox_htable(const char* strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable) {
+  // may want subset_mask parameter later
+  fill_uint_one(str_htable_size, str_htable);
+  const char* strbox_iter = strbox;
+  for (uintptr_t str_idx = 0; str_idx < str_ct; ++str_idx) {
+    const uint32_t slen = strlen(strbox_iter);
+    uint32_t hashval = hashceil(strbox_iter, slen, str_htable_size);
+    // previously used quadratic probing, but turns out that that isn't
+    // meaningfully better than linear probing.
+    // uint32_t next_incr = 1;
+    while (1) {
+      const uint32_t cur_htable_entry = str_htable[hashval];
+      if (cur_htable_entry == 0xffffffffU) {
+	str_htable[hashval] = str_idx;
+	break;
+      }
+      if (!memcmp(strbox_iter, &(strbox[cur_htable_entry * max_str_blen]), slen + 1)) {
+	// guaranteed to be positive: str_idx 0 is inserted before any lookup
+	// can hit it, so a duplicate is only detectable at str_idx >= 1 (and
+	// a 0 return means no duplicates)
+	return str_idx;
+      }
+      if (++hashval == str_htable_size) {
+	hashval = 0;
+      }
+      /*
+      // defend against overflow
+      const uint32_t top_diff = str_htable_size - hashval;
+      if (top_diff > next_incr) {
+	hashval += next_incr;
+      } else {
+	hashval = next_incr - top_diff;
+      }
+      next_incr += 2;
+      */
+    }
+    strbox_iter = &(strbox_iter[max_str_blen]);
+  }
+  return 0;
+}
+
+// could merge this with non-subset case, but this isn't much code
+/*
+uint32_t populate_strbox_subset_htable(const uintptr_t* __restrict subset_mask, const char* strbox, uintptr_t raw_str_ct, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable) {
+  fill_uint_one(str_htable_size, str_htable);
+  uintptr_t str_uidx = 0;
+  for (uintptr_t str_idx = 0; str_idx < str_ct; ++str_idx, ++str_uidx) {
+    next_set_ul_unsafe_ck(subset_mask, &str_uidx);
+    const char* cur_str = &(strbox[str_uidx * max_str_blen]);
+    const uint32_t slen = strlen(cur_str);
+    uint32_t hashval = hashceil(cur_str, slen, str_htable_size);
+    while (1) {
+      const uint32_t cur_htable_entry = str_htable[hashval];
+      if (cur_htable_entry == 0xffffffffU) {
+	str_htable[hashval] = str_uidx;
+	break;
+      }
+      if (!memcmp(cur_str, &(strbox[cur_htable_entry * max_str_blen]), slen + 1)) {
+	// guaranteed to be positive
+	return str_uidx;
+      }
+      if (++hashval == str_htable_size) {
+	hashval = 0;
+      }
+    }
+  }
+  return 0;
+}
+*/
+
+uint32_t id_htable_find(const char* cur_id, char** item_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size) {
+  // returns 0xffffffffU on failure
+  uint32_t hashval = hashceil(cur_id, cur_id_slen, id_htable_size);
+  while (1) {
+    const uint32_t cur_htable_idval = id_htable[hashval];
+    if ((cur_htable_idval == 0xffffffffU) || (!strcmp(cur_id, item_ids[cur_htable_idval]))) {
+      return cur_htable_idval;
+    }
+    if (++hashval == id_htable_size) {
+      hashval = 0;
+    }
+  }
+}
+
+// assumes cur_id_slen < max_str_blen.
+// requires cur_id to be null-terminated.
+uint32_t strbox_htable_find(const char* cur_id, const char* strbox, const uint32_t* id_htable, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size) {
+  uint32_t hashval = hashceil(cur_id, cur_id_slen, id_htable_size);
+  const uint32_t cur_id_blen = cur_id_slen + 1;
+  while (1) {
+    const uint32_t cur_htable_idval = id_htable[hashval];
+    if ((cur_htable_idval == 0xffffffffU) || (!memcmp(cur_id, &(strbox[cur_htable_idval * max_str_blen]), cur_id_blen))) {
+      return cur_htable_idval;
+    }
+    if (++hashval == id_htable_size) {
+      hashval = 0;
+    }
+  }
+}
+
+uint32_t variant_id_dupflag_htable_find(const char* idbuf, char** variant_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen) {
+  // assumes duplicate variant IDs are flagged, but full variant_uidx linked
+  // lists are not stored
+  // idbuf does not need to be null-terminated (note that this is currently
+  // achieved in a way that forces variant_ids[] entries to not be too close
+  // to the end of bigstack, otherwise memcmp behavior is potentially
+  // undefined)
+  // returns 0xffffffffU on failure, value with bit 31 set on duplicate
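+  // e.g. a stored value of 0x80000007 means variant_ids[7] matches, but that
+  // ID appears more than once.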
+  if (cur_id_slen > max_id_slen) {
+    return 0xffffffffU;
+  }
+  uint32_t hashval = hashceil(idbuf, cur_id_slen, id_htable_size);
+  while (1) {
+    const uint32_t cur_htable_idval = id_htable[hashval];
+    if ((cur_htable_idval == 0xffffffffU) || ((!memcmp(idbuf, variant_ids[cur_htable_idval & 0x7fffffff], cur_id_slen)) && (!variant_ids[cur_htable_idval & 0x7fffffff][cur_id_slen]))) {
+      return cur_htable_idval;
+    }
+    if (++hashval == id_htable_size) {
+      hashval = 0;
+    }
+  }
+}
+
+uint32_t variant_id_dup_htable_find(const char* idbuf, char** variant_ids, const uint32_t* id_htable, const uint32_t* htable_dup_base, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen, uint32_t* llidx_ptr) {
+  // Permits duplicate entries.  Similar to plink 1.9
+  // extract_exclude_process_token().
+  // - Returns 0xffffffffU on failure (llidx currently unset in that case),
+  //   otherwise returns the index of the first match (which will have the
+  //   highest index, due to how the linked list is constructed)
+  // - Sets *llidx_ptr to 0xffffffffU if not a duplicate, otherwise it's the
+  //   position in htable_dup_base[] of the next {variant_uidx, next_llidx}
+  //   linked list entry.
+  // - idbuf does not need to be null-terminated.
+  if (cur_id_slen > max_id_slen) {
+    return 0xffffffffU;
+  }
+  uint32_t hashval = hashceil(idbuf, cur_id_slen, id_htable_size);
+  while (1) {
+    const uint32_t cur_htable_idval = id_htable[hashval];
+    const uint32_t cur_dup = cur_htable_idval >> 31;
+    uint32_t cur_llidx;
+    uint32_t variant_uidx;
+    if (cur_dup) {
+      // 0xffffffffU empty-entry code has high bit set, so only need to check
+      // here
+      if (cur_htable_idval == 0xffffffffU) {
+	return 0xffffffffU;
+      }
+      cur_llidx = cur_htable_idval << 1;
+      variant_uidx = htable_dup_base[cur_llidx];
+    } else {
+      cur_llidx = 0xffffffffU;
+      variant_uidx = cur_htable_idval;
+    }
+    const char* sptr = variant_ids[variant_uidx];
+    if ((!memcmp(idbuf, sptr, cur_id_slen)) && (!sptr[cur_id_slen])) {
+      *llidx_ptr = cur_llidx;
+      return variant_uidx;
+    }
+    if (++hashval == id_htable_size) {
+      hashval = 0;
+    }
+  }
+}
+
+char* scan_for_duplicate_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_blen) {
+  --id_ct;
+  for (uintptr_t id_idx = 0; id_idx < id_ct; ++id_idx) {
+    if (!strcmp(&(sorted_ids[id_idx * max_id_blen]), &(sorted_ids[(id_idx + 1) * max_id_blen]))) {
+      return &(sorted_ids[id_idx * max_id_blen]);
+    }
+  }
+  return nullptr;
+}
+
+uint32_t collapse_duplicate_ids(uintptr_t id_ct, uintptr_t max_id_blen, char* sorted_ids, uint32_t* id_starts) {
+  // Collapses array of sorted IDs to remove duplicates, and writes
+  // pre-collapse positions to id_starts (so e.g. duplication count of any
+  // sample ID can be determined via subtraction) if it isn't nullptr.
+  // Returns id_ct of collapsed array.
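+  // e.g. sorted_ids {"a", "a", "b"} collapses to {"a", "b"} with return
+  // value 2, and id_starts (if provided) is set to {0, 2}.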
+  if (!id_ct) {
+    return 0;
+  }
+  uintptr_t read_idx = 1;
+  uintptr_t write_idx;
+  if (id_starts) {
+    id_starts[0] = 0;
+    for (; read_idx < id_ct; ++read_idx) {
+      if (!strcmp(&(sorted_ids[(read_idx - 1) * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]))) {
+	break;
+      }
+      id_starts[read_idx] = read_idx;
+    }
+    write_idx = read_idx;
+    while (++read_idx < id_ct) {
+      if (strcmp(&(sorted_ids[(write_idx - 1) * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]))) {
+	strcpy(&(sorted_ids[write_idx * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]));
+	id_starts[write_idx++] = read_idx;
+      }
+    }
+  } else {
+    for (; read_idx < id_ct; ++read_idx) {
+      if (!strcmp(&(sorted_ids[(read_idx - 1) * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]))) {
+	break;
+      }
+    }
+    write_idx = read_idx;
+    while (++read_idx < id_ct) {
+      if (strcmp(&(sorted_ids[(write_idx - 1) * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]))) {
+	strcpy(&(sorted_ids[write_idx * max_id_blen]), &(sorted_ids[read_idx * max_id_blen]));
+	++write_idx;
+      }
+    }
+  }
+  return write_idx;
+}
+
+pglerr_t copy_sort_strbox_subset_noalloc(const uintptr_t* __restrict subset_mask, const char* __restrict orig_strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t allow_dups, uint32_t collapse_idxs, uint32_t use_nsort, char* __restrict sorted_strbox, uint32_t* __restrict id_map) {
+  // Stores a lexicographically sorted list of IDs in sorted_strbox and the raw
+  // positions of the corresponding markers/samples in id_map.  Does not
+  // include excluded markers/samples in the list.
+  // Assumes sorted_strbox and id_map have been allocated; use the
+  // copy_sort_strbox_subset() wrapper if they haven't been.
+  // Note that this DOES still perform a "stack" allocation (in the qsort_ext()
+  // call).
+  if (!str_ct) {
+    return kPglRetSuccess;
+  }
+  unsigned char* bigstack_mark = g_bigstack_base;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+#ifdef __cplusplus
+    if (max_str_blen <= 60) {
+      uintptr_t wkspace_entry_blen = (max_str_blen > 36)? sizeof(Strbuf60_ui) : sizeof(Strbuf36_ui);
+      char* sort_wkspace;
+      if (bigstack_alloc_c(str_ct * wkspace_entry_blen, &sort_wkspace)) {
+	goto copy_sort_strbox_subset_noalloc_ret_NOMEM;
+      }
+      uint32_t str_uidx = 0;
+      const uint32_t wkspace_entry_blen_m4 = wkspace_entry_blen - 4;
+      char* sort_wkspace_iter = sort_wkspace;
+      for (uint32_t str_idx = 0; str_idx < str_ct; ++str_idx, ++str_uidx) {
+	next_set_unsafe_ck(subset_mask, &str_uidx);
+	strcpy(sort_wkspace_iter, &(orig_strbox[str_uidx * max_str_blen]));
+	sort_wkspace_iter = &(sort_wkspace_iter[wkspace_entry_blen_m4]);
+	if (collapse_idxs) {
+	  *((uint32_t*)sort_wkspace_iter) = str_idx;
+	} else {
+	  *((uint32_t*)sort_wkspace_iter) = str_uidx;
+	}
+	sort_wkspace_iter = &(sort_wkspace_iter[sizeof(int32_t)]);
+      }
+      if (wkspace_entry_blen == 40) {
+	sort_strbox_40b_finish(str_ct, max_str_blen, use_nsort, (Strbuf36_ui*)sort_wkspace, sorted_strbox, id_map);
+      } else {
+	sort_strbox_64b_finish(str_ct, max_str_blen, use_nsort, (Strbuf60_ui*)sort_wkspace, sorted_strbox, id_map);
+      }
+    } else {
+#endif
+      str_sort_indexed_deref_t* sort_wkspace = (str_sort_indexed_deref_t*)bigstack_alloc(str_ct * sizeof(str_sort_indexed_deref_t));
+      if (!sort_wkspace) {
+	goto copy_sort_strbox_subset_noalloc_ret_NOMEM;
+      }
+      uint32_t str_uidx = 0;
+      for (uint32_t str_idx = 0; str_idx < str_ct; ++str_idx, ++str_uidx) {
+	next_set_unsafe_ck(subset_mask, &str_uidx);
+	sort_wkspace[str_idx].strptr = (const char*)(&(orig_strbox[str_uidx * max_str_blen]));
+	if (collapse_idxs) {
+	  sort_wkspace[str_idx].orig_idx = str_idx;
+	} else {
+	  sort_wkspace[str_idx].orig_idx = str_uidx;
+	}
+      }
+      if (!use_nsort) {
+#ifdef __cplusplus
+        std::sort(sort_wkspace, &(sort_wkspace[str_ct]));
+#else
+        qsort(sort_wkspace, str_ct, sizeof(str_sort_indexed_deref_t), strcmp_deref);
+#endif
+      } else {
+#ifdef __cplusplus
+	str_nsort_indexed_deref_t* wkspace_alias = (str_nsort_indexed_deref_t*)sort_wkspace;
+	std::sort(wkspace_alias, &(wkspace_alias[str_ct]));
+#else
+	qsort(sort_wkspace, str_ct, sizeof(str_sort_indexed_deref_t), strcmp_natural_deref);
+#endif
+      }
+      for (uintptr_t str_idx = 0; str_idx < str_ct; ++str_idx) {
+	strcpy(&(sorted_strbox[str_idx * max_str_blen]), sort_wkspace[str_idx].strptr);
+	id_map[str_idx] = sort_wkspace[str_idx].orig_idx;
+      }
+#ifdef __cplusplus
+    }
+#endif
+    if (!allow_dups) {
+      char* dup_id = scan_for_duplicate_ids(sorted_strbox, str_ct, max_str_blen);
+      if (dup_id) {
+	char* tptr = dup_id;
+	while (1) {
+	  tptr = strchr(tptr, '\t');
+	  if (!tptr) {
+	    break;
+	  }
+	  *tptr++ = ' ';
+	}
+	LOGERRPRINTFWW("Error: Duplicate ID '%s'.\n", dup_id);
+	goto copy_sort_strbox_subset_noalloc_ret_MALFORMED_INPUT;
+      }
+    }
+  }
+  while (0) {
+  copy_sort_strbox_subset_noalloc_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  copy_sort_strbox_subset_noalloc_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t copy_sort_strbox_subset(const uintptr_t* __restrict subset_mask, const char* __restrict orig_strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t allow_dups, uint32_t collapse_idxs, uint32_t use_nsort, char** sorted_strbox_ptr, uint32_t** id_map_ptr) {
+  // id_map on bottom because --indiv-sort frees *sorted_strbox_ptr
+  if (bigstack_alloc_ui(str_ct, id_map_ptr) ||
+      bigstack_alloc_c(str_ct * max_str_blen, sorted_strbox_ptr)) {
+    return kPglRetNomem;
+  }
+  return copy_sort_strbox_subset_noalloc(subset_mask, orig_strbox, str_ct, max_str_blen, allow_dups, collapse_idxs, use_nsort, *sorted_strbox_ptr, *id_map_ptr);
+}
+
+int32_t bsearch_str(const char* idbuf, const char* sorted_strbox, uintptr_t cur_id_slen, uintptr_t max_id_blen, uintptr_t end_idx) {
+  // does not assume null-terminated idbuf, or nonempty array.
+  if (cur_id_slen >= max_id_blen) {
+    return -1;
+  }
+  uintptr_t start_idx = 0;
+  while (start_idx < end_idx) {
+    const uintptr_t mid_idx = (start_idx + end_idx) / 2;
+    const int32_t ii = memcmp(idbuf, &(sorted_strbox[mid_idx * max_id_blen]), cur_id_slen);
+    if (ii > 0) {
+      start_idx = mid_idx + 1;
+    } else if ((ii < 0) || sorted_strbox[mid_idx * max_id_blen + cur_id_slen]) {
+      end_idx = mid_idx;
+    } else {
+      return ((uint32_t)mid_idx);
+    }
+  }
+  return -1;
+}
+
+int32_t bsearch_str_natural(const char* idbuf, const char* sorted_strbox, uintptr_t max_id_blen, uintptr_t end_idx) {
+  // unlike bsearch_str(), caller is responsible for slen >= max_id_blen check
+  // if appropriate here
+  uintptr_t start_idx = 0;
+  while (start_idx < end_idx) {
+    const uintptr_t mid_idx = (start_idx + end_idx) / 2;
+    const int32_t ii = strcmp_natural(idbuf, &(sorted_strbox[mid_idx * max_id_blen]));
+    if (ii > 0) {
+      start_idx = mid_idx + 1;
+    } else if (ii < 0) {
+      end_idx = mid_idx;
+    } else {
+      return ((uint32_t)mid_idx);
+    }
+  }
+  return -1;
+}
+
+uintptr_t bsearch_str_lb(const char* idbuf, const char* sorted_strbox, uintptr_t cur_id_slen, uintptr_t max_id_blen, uintptr_t end_idx) {
+  // returns number of elements in sorted_strbox[] less than idbuf.
+  if (cur_id_slen > max_id_blen) {
+    cur_id_slen = max_id_blen;
+  }
+  uintptr_t start_idx = 0;
+  while (start_idx < end_idx) {
+    const uintptr_t mid_idx = (start_idx + end_idx) / 2;
+    if (memcmp(idbuf, &(sorted_strbox[mid_idx * max_id_blen]), cur_id_slen) > 0) {
+      start_idx = mid_idx + 1;
+    } else {
+      end_idx = mid_idx;
+    }
+  }
+  return start_idx;
+}
+
+
+uint32_t sid_col_required(const uintptr_t* sample_include, const char* sids, uint32_t sample_ct, uint32_t max_sid_blen, uint32_t maybe_modifier) {
+  // note that MAYBESID and SID can both be set
+  if (maybe_modifier & 2) {
+    return 1;
+  }
+  if (sids && (maybe_modifier & 1)) {
+    uint32_t sample_uidx = 0;
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_unsafe_ck(sample_include, &sample_uidx);
+      if (memcmp(&(sids[sample_uidx * max_sid_blen]), "0", 2)) {
+	return 1;
+      }
+    }
+  }
+  return 0;
+}
+
+// sample_augid_map_ptr == nullptr ok
+pglerr_t augid_init_alloc(const uintptr_t* sample_include, const char* sample_ids, const char* sids, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t** sample_augid_map_ptr, char** sample_augids_ptr, uintptr_t* max_sample_augid_blen_ptr) {
+  if (!sids) {
+    max_sid_blen = 2;
+  }
+  const uintptr_t max_sample_augid_blen = max_sample_id_blen + max_sid_blen;
+  *max_sample_augid_blen_ptr = max_sample_augid_blen;
+  uint32_t* sample_augid_map = nullptr;
+  if (sample_augid_map_ptr) {
+    if (bigstack_alloc_ui(sample_ct, sample_augid_map_ptr)) {
+      return kPglRetNomem;
+    }
+    sample_augid_map = *sample_augid_map_ptr;
+  }
+  if (bigstack_alloc_c(max_sample_augid_blen * sample_ct, sample_augids_ptr)) {
+    return kPglRetNomem;
+  }
+  char* sample_augids_iter = *sample_augids_ptr;
+  uint32_t sample_uidx = 0;
+  for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+    next_set_unsafe_ck(sample_include, &sample_uidx);
+    char* write_iter = strcpyax(sample_augids_iter, &(sample_ids[sample_uidx * max_sample_id_blen]), '\t');
+    if (sids) {
+      strcpy(write_iter, &(sids[sample_uidx * max_sid_blen]));
+    } else {
+      strcpy(write_iter, "0");
+    }
+    sample_augids_iter = &(sample_augids_iter[max_sample_augid_blen]);
+    if (sample_augid_map) {
+      sample_augid_map[sample_idx] = sample_uidx;
+    }
+  }
+  return kPglRetSuccess;
+}
+
+pglerr_t sorted_xidbox_init_alloc(const uintptr_t* sample_include, const char* sample_ids, const char* sids, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, xid_mode_t xid_mode, uint32_t use_nsort, char** sorted_xidbox_ptr, uint32_t** xid_map_ptr, uintptr_t* max_xid_blen_ptr) {
+  if (!(xid_mode & kfXidModeFlagSid)) {
+    // two fields
+    *max_xid_blen_ptr = max_sample_id_blen;
+    return copy_sort_strbox_subset(sample_include, sample_ids, sample_ct, max_sample_id_blen, 0, 0, use_nsort, sorted_xidbox_ptr, xid_map_ptr);
+  }
+  // three fields
+  if (augid_init_alloc(sample_include, sample_ids, sids, sample_ct, max_sample_id_blen, max_sid_blen, xid_map_ptr, sorted_xidbox_ptr, max_xid_blen_ptr)) {
+    return kPglRetNomem;
+  }
+  if (sort_strbox_indexed(sample_ct, *max_xid_blen_ptr, use_nsort, *sorted_xidbox_ptr, *xid_map_ptr)) {
+    return kPglRetNomem;
+  }
+  char* dup_id = scan_for_duplicate_ids(*sorted_xidbox_ptr, sample_ct, *max_xid_blen_ptr);
+  if (dup_id) {
+    char* tptr = (char*)rawmemchr(dup_id, '\t');
+    *tptr = ' ';
+    tptr = (char*)rawmemchr(&(tptr[1]), '\t');
+    *tptr = ' ';
+    LOGERRPRINTFWW("Error: Duplicate ID '%s'.\n", dup_id);
+    return kPglRetMalformedInput;
+  }
+  return kPglRetSuccess;
+}
+
+boolerr_t sorted_xidbox_read_find(const char* __restrict sorted_xidbox, const uint32_t* __restrict xid_map, uintptr_t max_xid_blen, uintptr_t end_idx, uint32_t comma_delim, xid_mode_t xid_mode, char** read_pp, uint32_t* sample_uidx_ptr, char* __restrict idbuf) {
+  // idbuf = workspace
+  // sorted_xidbox = packed, sorted list of ID strings to search over.
+  //
+  // input *read_pp must point to beginning of FID; this is a change from plink
+  // 1.9.
+  //
+  // *read_pp is now set to point to the end of the last parsed token instead
+  // of the beginning of the next; this is another change from plink 1.9.
+  //
+  // returns 1 on missing token *or* if the sample ID is not present.  cases
+  // can be distinguished by checking whether *read_pp == nullptr.
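+  //
+  // e.g. given a whitespace-delimited line starting "fam1 ind7 sid3" and an
+  // xid_mode with kfXidModeFlagSid set, idbuf is filled with
+  // "fam1\tind7\tsid3" before the lookup.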
+  char* first_token_start = *read_pp;
+  uintptr_t blen_sid = 0;
+  char* token_iter;
+  char* iid_ptr;
+  char* sid_ptr = nullptr;
+  uintptr_t slen_fid;
+  uintptr_t slen_iid;
+  if (comma_delim) {
+    token_iter = first_token_start;
+    unsigned char ucc = (unsigned char)(*token_iter);
+    while (ucc != ',') {
+      if (ucc < 32) {
+	if (!(xid_mode & kfXidModeFlagOneTokenOk)) {
+	  *read_pp = nullptr;
+	  return 1;
+	}
+	slen_fid = (uintptr_t)(token_iter - first_token_start);
+	goto sorted_xidbox_read_find_comma_single_token;
+      }
+      ucc = (unsigned char)(*(++token_iter));
+    }
+    slen_fid = (uintptr_t)(token_iter - first_token_start);
+    if (xid_mode & kfXidModeFlagNeverFid) {
+    sorted_xidbox_read_find_comma_single_token:
+      iid_ptr = first_token_start;
+      slen_iid = slen_fid;
+    } else {
+      do {
+	ucc = (unsigned char)(*(++token_iter));
+      } while ((ucc == ' ') || (ucc == '\t'));
+      iid_ptr = token_iter;
+      while ((ucc >= 32) && (ucc != ',')) {
+	ucc = (unsigned char)(*(++token_iter));
+      }
+      slen_iid = (uintptr_t)(token_iter - iid_ptr);
+    }
+    // token_iter now points to comma/eoln at end of IID
+    if (xid_mode & kfXidModeFlagSid) {
+      if (*token_iter != ',') {
+	return 1;
+      }
+      do {
+	ucc = (unsigned char)(*(++token_iter));
+      } while ((ucc == ' ') || (ucc == '\t'));
+      sid_ptr = token_iter;
+      while ((ucc >= 32) && (ucc != ',')) {
+	ucc = (unsigned char)(*(++token_iter));
+      }
+      blen_sid = 1 + (uintptr_t)(token_iter - sid_ptr);
+      if (token_iter == sid_ptr) {
+	// special case: treat missing SID as '0'
+	blen_sid = 2;
+	// const_cast, since token_endnn doesn't return const pointer
+        // function is too long for me to be comfortable just turning off
+        // -Wcast-qual...
+        sid_ptr = (char*)((uintptr_t)(&(g_one_char_strs[96])));
+      }
+    }
+  } else {
+    assert(!is_eoln_kns(*first_token_start));
+    token_iter = token_endnn(first_token_start);
+    slen_fid = (uintptr_t)(token_iter - first_token_start);
+    if (xid_mode & kfXidModeFlagNeverFid) {
+    sorted_xidbox_read_find_space_single_token:
+      iid_ptr = first_token_start;
+      slen_iid = slen_fid;
+    } else {
+      token_iter = skip_initial_spaces(token_iter);
+      if (is_eoln_kns(*token_iter)) {
+	if (!(xid_mode & kfXidModeFlagOneTokenOk)) {
+	  *read_pp = nullptr;
+	  return 1;
+	}
+	// need to backtrack
+	token_iter = &(first_token_start[slen_fid]);
+	goto sorted_xidbox_read_find_space_single_token;
+      }
+      iid_ptr = token_iter;
+      token_iter = token_endnn(iid_ptr);
+      slen_iid = (uintptr_t)(token_iter - iid_ptr);
+    }
+    // token_iter now points to space/eoln at end of IID
+    if (xid_mode & kfXidModeFlagSid) {
+      token_iter = skip_initial_spaces(token_iter);
+      if (is_eoln_kns(*token_iter)) {
+	*read_pp = nullptr;
+	return 1;
+      }
+      sid_ptr = token_iter;
+      token_iter = token_endnn(sid_ptr);
+      blen_sid = 1 + (uintptr_t)(token_iter - sid_ptr);
+    }
+  }
+  *read_pp = token_iter;
+  uintptr_t slen_final = slen_fid + slen_iid + blen_sid + 1;
+  if (slen_final >= max_xid_blen) {
+    // avoid buffer overflow
+    return 1;
+  }
+  char* idbuf_end = memcpya(memcpyax(idbuf, first_token_start, slen_fid, '\t'), iid_ptr, slen_iid);
+  if (blen_sid) {
+    *idbuf_end++ = '\t';
+    memcpy(idbuf_end, sid_ptr, blen_sid - 1);
+  }
+  return sorted_idbox_find(idbuf, sorted_xidbox, xid_map, slen_final, max_xid_blen, end_idx, sample_uidx_ptr);
+}
+
+void init_range_list(range_list_t* range_list_ptr) {
+  range_list_ptr->names = nullptr;
+  range_list_ptr->starts_range = nullptr;
+  range_list_ptr->name_ct = 0;
+  range_list_ptr->name_max_blen = 0;
+}
+
+void cleanup_range_list(range_list_t* range_list_ptr) {
+  free_cond(range_list_ptr->names);
+  // starts_range now uses same allocation
+}
+
+boolerr_t numeric_range_list_to_bitarr(const range_list_t* range_list_ptr, uint32_t bitarr_size, uint32_t offset, uint32_t ignore_overflow, uintptr_t* bitarr) {
+  // bitarr assumed to be initialized (but not necessarily zero-initialized)
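+  // a nonzero starts_range[] entry means the corresponding name and the next
+  // one delimit an inclusive range; e.g. names {"3", "7"} with starts_range
+  // {1, 0} sets bits (3 - offset) through (7 - offset).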
+  const char* names = range_list_ptr->names;
+  const unsigned char* starts_range = range_list_ptr->starts_range;
+  const uint32_t name_ct = range_list_ptr->name_ct;
+  const uint32_t name_max_blen = range_list_ptr->name_max_blen;
+  const uint32_t idx_max = bitarr_size + offset;
+  for (uint32_t name_idx = 0; name_idx < name_ct; ++name_idx) {
+    uint32_t idx1;
+    if (scan_uint_capped(&(names[name_idx * name_max_blen]), idx_max, &idx1)) {
+      if (ignore_overflow) {
+	continue;
+      }
+      return 1;
+    }
+    if (starts_range[name_idx]) {
+      ++name_idx;
+      uint32_t idx2;
+      if (scan_uint_capped(&(names[name_idx * name_max_blen]), idx_max, &idx2)) {
+	if (!ignore_overflow) {
+	  return 1;
+	}
+        idx2 = idx_max - 1;
+      }
+      fill_bits_nz(idx1 - offset, (idx2 - offset) + 1, bitarr);
+    } else {
+      set_bit(idx1 - offset, bitarr);
+    }
+  }
+  return 0;
+}
+
+pglerr_t string_range_list_to_bitarr(char* header_line, const range_list_t* range_list_ptr, const char* __restrict sorted_ids, const uint32_t* __restrict id_map, const char* __restrict range_list_flag, const char* __restrict file_descrip, uint32_t token_ct, uint32_t fixed_len, uint32_t comma_delim, uintptr_t* bitarr, int32_t* __restrict seen_idxs) {
+  // bitarr assumed to be zero-initialized
+  // if fixed_len is zero, header_line is assumed to be a list of
+  // space-delimited unequal-length names
+  assert(token_ct);
+  assert(!popcount_longs(bitarr, BITCT_TO_WORDCT(token_ct)));
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    char* header_line_iter = header_line;
+    const uintptr_t name_ct = range_list_ptr->name_ct;
+    const uintptr_t max_id_blen = range_list_ptr->name_max_blen;
+    uint32_t item_idx = 0;
+    while (1) {
+      char* token_end = comma_or_space_token_end(header_line_iter, comma_delim);
+      uint32_t cmdline_pos;
+      if (!sorted_idbox_find(header_line_iter, sorted_ids, id_map, (uintptr_t)(token_end - header_line_iter), max_id_blen, name_ct, &cmdline_pos)) {
+	if (seen_idxs[cmdline_pos] != -1) {
+	  sprintf(g_logbuf, "Error: Duplicate --%s token in %s.\n", range_list_flag, file_descrip);
+	  goto string_range_list_to_bitarr_ret_MALFORMED_INPUT_2;
+	}
+	seen_idxs[cmdline_pos] = item_idx;
+	if (cmdline_pos && range_list_ptr->starts_range[cmdline_pos - 1]) {
+	  if (seen_idxs[cmdline_pos - 1] == -1) {
+	    LOGPREPRINTFWW("Error: Second element of --%s range appears before first element in %s.\n", range_list_flag, file_descrip);
+	    goto string_range_list_to_bitarr_ret_INVALID_CMDLINE_2;
+	  }
+	  fill_bits_nz(seen_idxs[cmdline_pos - 1], item_idx + 1, bitarr);
+	} else if (!(range_list_ptr->starts_range[cmdline_pos])) {
+	  SET_BIT(item_idx, bitarr);
+	}
+      }
+      if (++item_idx == token_ct) {
+	break;
+      }
+      if (fixed_len) {
+	header_line_iter = &(header_line_iter[fixed_len]);
+      } else {
+	header_line_iter = skip_initial_spaces(&(token_end[1]));
+      }
+    }
+    for (uint32_t cmdline_pos = 0; cmdline_pos < name_ct; ++cmdline_pos) {
+      if (seen_idxs[cmdline_pos] == -1) {
+	goto string_range_list_to_bitarr_ret_INVALID_CMDLINE_3;
+      }
+    }
+  }
+  while (0) {
+  string_range_list_to_bitarr_ret_INVALID_CMDLINE_3:
+    sprintf(g_logbuf, "Error: Missing --%s token in %s.\n", range_list_flag, file_descrip);
+  string_range_list_to_bitarr_ret_INVALID_CMDLINE_2:
+    logerrprintb();
+    reterr = kPglRetInvalidCmdline;
+    break;
+  string_range_list_to_bitarr_ret_MALFORMED_INPUT_2:
+    logerrprintb();
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+  return reterr;
+}
+
+pglerr_t string_range_list_to_bitarr_alloc(char* header_line, const range_list_t* range_list_ptr, const char* __restrict range_list_flag, const char* __restrict file_descrip, uint32_t token_ct, uint32_t fixed_len, uint32_t comma_delim, uintptr_t** bitarr_ptr) {
+  // wrapper for string_range_list_to_bitarr which allocates the bitfield and
+  // temporary buffers on the heap
+  uintptr_t token_ctl = BITCT_TO_WORDCT(token_ct);
+  uintptr_t name_ct = range_list_ptr->name_ct;
+  int32_t* seen_idxs;
+  char* sorted_ids;
+  uint32_t* id_map;
+  if (bigstack_calloc_ul(token_ctl, bitarr_ptr) ||
+      bigstack_alloc_i(name_ct, &seen_idxs)) {
+    return kPglRetNomem;
+  }
+  // kludge to use copy_sort_strbox_subset()
+  fill_all_bits(name_ct, (uintptr_t*)seen_idxs);
+  if (copy_sort_strbox_subset((uintptr_t*)seen_idxs, range_list_ptr->names, name_ct, range_list_ptr->name_max_blen, 0, 0, 0, &sorted_ids, &id_map)) {
+    return kPglRetNomem;
+  }
+  fill_int_one(name_ct, seen_idxs);
+  pglerr_t reterr = string_range_list_to_bitarr(header_line, range_list_ptr, sorted_ids, id_map, range_list_flag, file_descrip, token_ct, fixed_len, comma_delim, *bitarr_ptr, seen_idxs);
+  bigstack_reset(seen_idxs);
+  return reterr;
+}
+
+
+const char g_xymt_log_names[][5] = {"chrX", "chrY", "XY", "chrM", "PAR1", "PAR2"};
+
+static_assert(!(kChrRawEnd % kBytesPerVec), "kChrRawEnd must be a multiple of kBytesPerVec.");
+pglerr_t init_chr_info(chr_info_t* cip) {
+  // "constructor".  initializes with maximum capacity.  doesn't use bigstack.
+  // chr_mask, haploid_mask: bits
+  // chr_file_order, chr_idx_to_foidx: int32s
+  // chr_fo_vidx_start: int32s, with an extra trailing element
+  // nonstd_names: intptr_ts
+  // nonstd_id_htable: kChrHtableSize int32s
+
+  // this assumes kChrRawEnd is divisible by kBytesPerVec
+  const uintptr_t vecs_required = 2 * BITCT_TO_VECCT(kChrRawEnd) + 3 * (kChrRawEnd / kInt32PerVec) + 1 + (kChrRawEnd / kWordsPerVec) + INT32CT_TO_VECCT(kChrHtableSize);
+
+  // needed for proper cleanup
+  cip->name_ct = 0;
+  cip->incl_excl_name_stack = nullptr;
+  if (vecaligned_malloc(vecs_required * kBytesPerVec, &(cip->chr_mask))) {
+    return kPglRetNomem;
+  }
+  uintptr_t* alloc_iter = &(cip->chr_mask[BITCT_TO_VECCT(kChrRawEnd) * kWordsPerVec]);
+  cip->haploid_mask = alloc_iter;
+  alloc_iter = &(alloc_iter[BITCT_TO_VECCT(kChrRawEnd) * kWordsPerVec]);
+  cip->chr_file_order = (uint32_t*)alloc_iter;
+  alloc_iter = &(alloc_iter[(kChrRawEnd / kInt32PerVec) * kWordsPerVec]);
+  cip->chr_fo_vidx_start = (uint32_t*)alloc_iter;
+  alloc_iter = &(alloc_iter[((kChrRawEnd / kInt32PerVec) + 1) * kWordsPerVec]);
+  cip->chr_idx_to_foidx = (uint32_t*)alloc_iter;
+  alloc_iter = &(alloc_iter[(kChrRawEnd / kInt32PerVec) * kWordsPerVec]);
+  cip->nonstd_names = (char**)alloc_iter;
+  alloc_iter = &(alloc_iter[kChrRawEnd]);
+  cip->nonstd_id_htable = (uint32_t*)alloc_iter;
+  // alloc_iter = &(alloc_iter[((kChrHtableSize + (kInt32PerVec - 1)) / kInt32PerVec) * kWordsPerVec]);
+  // fill_uint_one(kChrHtableSize, cip->nonstd_id_htable);
+
+  fill_ulong_zero(kChrMaskWords, cip->chr_mask);
+  fill_ulong_zero(kChrExcludeWords, cip->chr_exclude);
+
+  // this is a change from plink 1.x.  'MT' is preferred over 'M' since the
+  // former matches Ensembl, while the latter doesn't match any major
+  // resource.  no "chr" prefix, to reduce file sizes and the impact of this
+  // change.
+  cip->output_encoding = kfChrOutputMT;
+  
+  cip->zero_extra_chrs = 0;
+  cip->is_include_stack = 0;
+  cip->chrset_source = kChrsetSourceDefault;
+  cip->autosome_ct = 22;
+  for (uint32_t xymt_idx = 0; xymt_idx < kChrOffsetCt; ++xymt_idx) {
+    cip->xymt_codes[xymt_idx] = 23 + xymt_idx;
+  }
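+  // bits 23 (chrX) and 24 (chrY)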
+  cip->haploid_mask[0] = 0x1800000;
+  fill_ulong_zero(kChrMaskWords - 1, &(cip->haploid_mask[1]));
+  return kPglRetSuccess;
+}
+
+// explicit plink 1.07 species (now initialized by command line parser):
+// human: 22, X, Y, XY, MT, PAR1, PAR2 (PAR1/PAR2 added, XY deprecated in plink
+//   2.0)
+// cow: 29, X, Y, MT
+// dog: 38, X, Y, XY, MT
+// horse: 31, X, Y
+// mouse: 19, X, Y
+// rice: 12
+// sheep: 26, X, Y
+
+// must be safe to call this twice.
+void finalize_chrset(misc_flags_t misc_flags, chr_info_t* cip) {
+  uint32_t autosome_ct = cip->autosome_ct;
+  uint32_t max_code = autosome_ct;
+  for (uint32_t xymt_idx_p1 = kChrOffsetCt; xymt_idx_p1; --xymt_idx_p1) {
+    if (cip->xymt_codes[xymt_idx_p1 - 1] >= 0) {
+      max_code = autosome_ct + xymt_idx_p1;
+      break;
+    }
+  }
+  
+  // could initialize haploid_mask bits (after the first) here, instead of
+  // earlier...
+  
+  cip->max_numeric_code = MINV(max_code, autosome_ct + 4);
+  cip->max_code = max_code;
+  uintptr_t* chr_mask = cip->chr_mask;
+  uintptr_t last_chr_mask_word = chr_mask[kChrMaskWords - 1];
+  int32_t* xymt_codes = cip->xymt_codes;
+  if (last_chr_mask_word) {
+    // avoids repeating some work if this is called twice
+    chr_mask[kChrMaskWords - 1] = 0;
+
+    uint32_t xymt_include = last_chr_mask_word >> (kBitsPerWord - kChrOffsetCt);
+    do {
+      const uint32_t xymt_idx = __builtin_ctz(xymt_include);
+      const int32_t cur_chr_code = xymt_codes[xymt_idx];
+      if (cur_chr_code >= 0) {
+	set_bit(cur_chr_code, chr_mask);
+      }
+      xymt_include &= xymt_include - 1;
+    } while (xymt_include);
+  } else if (are_all_words_zero(chr_mask, kChrExcludeWords) && (!cip->is_include_stack)) {
+    // init_default_chr_mask()
+    fill_all_bits(cip->autosome_ct + 1, chr_mask);
+    for (uint32_t xymt_idx = 0; xymt_idx < kChrOffsetCt; ++xymt_idx) {
+      const int32_t cur_chr_code = cip->xymt_codes[xymt_idx];
+      if (cur_chr_code >= 0) {
+	set_bit(cur_chr_code, chr_mask);
+      }
+    }
+  } else if (misc_flags & (kfMiscAutosomePar | kfMiscAutosomeOnly)) {
+    fill_bits_nz(1, cip->autosome_ct + 1, chr_mask);
+    clear_bits_nz(cip->autosome_ct + 1, kChrExcludeWords * kBitsPerWord, chr_mask);
+    if (misc_flags & kfMiscAutosomePar) {
+      int32_t par_chr_code = cip->xymt_codes[kChrOffsetXY];
+      if (par_chr_code >= 0) {
+	set_bit(par_chr_code, chr_mask);
+      }
+      par_chr_code = cip->xymt_codes[kChrOffsetPAR1];
+      if (par_chr_code >= 0) {
+	set_bit(par_chr_code, chr_mask);
+      }
+      par_chr_code = cip->xymt_codes[kChrOffsetPAR2];
+      if (par_chr_code >= 0) {
+	set_bit(par_chr_code, chr_mask);
+      }
+    }
+  }
+  
+  uintptr_t* chr_exclude = cip->chr_exclude;
+  uintptr_t last_chr_exclude_word = chr_exclude[kChrExcludeWords - 1];
+  uint32_t xymt_exclude = last_chr_exclude_word >> (kBitsPerWord - kChrOffsetCt);
+  last_chr_exclude_word &= (k1LU << (kBitsPerWord - kChrOffsetCt)) - k1LU;
+  for (uint32_t widx = 0; widx < kChrExcludeWords - 1; ++widx) {
+    chr_mask[widx] &= ~chr_exclude[widx];
+  }
+  chr_mask[kChrExcludeWords - 1] &= ~last_chr_exclude_word;
+  if (xymt_exclude) {
+    do {
+      const uint32_t xymt_idx = __builtin_ctz(xymt_exclude);
+      const int32_t cur_chr_code = xymt_codes[xymt_idx];
+      if (cur_chr_code >= 0) {
+	clear_bit(cur_chr_code, chr_mask);
+      }
+      xymt_exclude &= xymt_exclude - 1;
+    } while (xymt_exclude);
+  }
+  fill_uint_one(max_code + 1, cip->chr_idx_to_foidx);
+}
+
+void forget_extra_chr_names(uint32_t reinitialize, chr_info_t* cip) {
+  const uint32_t name_ct = cip->name_ct;
+  if (name_ct) {
+    char** nonstd_names = cip->nonstd_names;
+    const uint32_t chr_idx_last = cip->max_code + name_ct;
+    for (uint32_t chr_idx = cip->max_code + 1; chr_idx <= chr_idx_last; ++chr_idx) {
+      free(nonstd_names[chr_idx]);
+      nonstd_names[chr_idx] = nullptr;
+    }
+    if (reinitialize) {
+      // fill_uint_one(kChrHtableSize, cip->nonstd_id_htable);
+      cip->name_ct = 0;
+    }
+  }
+}
+
+// not currently called.  might want to do so in the future.
+pglerr_t finalize_chr_info(chr_info_t* cip) {
+  const uint32_t chr_ct = cip->chr_ct;
+  const uint32_t name_ct = cip->name_ct;
+  const uint32_t chr_code_end = cip->max_code + 1 + name_ct;
+  const uint32_t chr_code_bitvec_ct = BITCT_TO_VECCT(chr_code_end);
+  const uint32_t chr_ct_int32vec_ct = INT32CT_TO_VECCT(chr_ct);
+  const uint32_t chr_ct_p1_int32vec_ct = 1 + (chr_ct / kInt32PerVec);
+  const uint32_t chr_code_end_int32vec_ct = INT32CT_TO_VECCT(chr_code_end);
+  const uint32_t chr_code_end_wordvec_ct = WORDCT_TO_VECCT(chr_code_end);
+  uint32_t final_vecs_required = 2 * chr_code_bitvec_ct + chr_ct_int32vec_ct + chr_ct_p1_int32vec_ct + chr_code_end_int32vec_ct;
+  if (name_ct) {
+    final_vecs_required += chr_code_end_wordvec_ct + INT32CT_TO_VECCT(kChrHtableSize);
+  }
+  uintptr_t* new_alloc;
+  if (vecaligned_malloc(final_vecs_required * kBytesPerVec, &new_alloc)) {
+    return kPglRetNomem;
+  }
+  uintptr_t* old_alloc = cip->chr_mask;
+  uintptr_t* new_alloc_iter = new_alloc;
+
+  memcpy(new_alloc_iter, cip->chr_mask, chr_code_bitvec_ct * kBytesPerVec);
+  cip->chr_mask = new_alloc_iter;
+  new_alloc_iter = &(new_alloc_iter[chr_code_bitvec_ct * kWordsPerVec]);
+
+  memcpy(new_alloc_iter, cip->haploid_mask, chr_code_bitvec_ct * kBytesPerVec);
+  cip->haploid_mask = new_alloc_iter;
+  new_alloc_iter = &(new_alloc_iter[chr_code_bitvec_ct * kWordsPerVec]);
+
+  memcpy(new_alloc_iter, cip->chr_file_order, chr_ct_int32vec_ct * kBytesPerVec);
+  cip->chr_file_order = (uint32_t*)new_alloc_iter;
+  new_alloc_iter = &(new_alloc_iter[chr_ct_int32vec_ct * kWordsPerVec]);
+
+  memcpy(new_alloc_iter, cip->chr_fo_vidx_start, chr_ct_p1_int32vec_ct * kBytesPerVec);
+  cip->chr_fo_vidx_start = (uint32_t*)new_alloc_iter;
+  new_alloc_iter = &(new_alloc_iter[chr_ct_p1_int32vec_ct * kWordsPerVec]);
+
+  memcpy(new_alloc_iter, cip->chr_idx_to_foidx, chr_code_end_int32vec_ct * kBytesPerVec);
+  cip->chr_idx_to_foidx = (uint32_t*)new_alloc_iter;
+
+  if (!name_ct) {
+    cip->nonstd_names = nullptr;
+    cip->nonstd_id_htable = nullptr;
+  } else {
+    new_alloc_iter = &(new_alloc_iter[chr_code_end_int32vec_ct * kWordsPerVec]);
+
+    memcpy(new_alloc_iter, cip->nonstd_names, chr_code_end_wordvec_ct * kBytesPerVec);
+    cip->nonstd_names = (char**)new_alloc_iter;
+    new_alloc_iter = &(new_alloc_iter[chr_code_end_wordvec_ct * kWordsPerVec]);
+
+    memcpy(new_alloc_iter, cip->nonstd_id_htable, kChrHtableSize * sizeof(int32_t));
+    cip->nonstd_id_htable = (uint32_t*)new_alloc_iter;
+  }
+  vecaligned_free(old_alloc);
+  return kPglRetSuccess;
+}
+
+void cleanup_chr_info(chr_info_t* cip) {
+  if (cip->chr_mask) {
+    forget_extra_chr_names(0, cip);
+    vecaligned_free(cip->chr_mask);
+    cip->chr_mask = nullptr;
+  }
+  ll_str_t* llstr_ptr = cip->incl_excl_name_stack;
+  while (llstr_ptr) {
+    ll_str_t* next_ptr = llstr_ptr->next;
+    free(llstr_ptr);
+    llstr_ptr = next_ptr;
+  }
+  cip->incl_excl_name_stack = nullptr;
+}
+
+char* chr_name_std(const chr_info_t* cip, uint32_t chr_idx, char* buf) {
+  if (chr_idx > cip->max_numeric_code) {
+    // this is encoding-independent.  users who require all numbers should use
+    // 25 == XY instead.
+    // this code will probably need to be changed later if we add more standard
+    // nonnumeric codes.
+    memcpyl3(buf, "PAR");
+    buf[3] = '0' + (((int32_t)chr_idx) - cip->max_numeric_code);
+    return &(buf[4]);
+  }
+  const uint32_t output_encoding = cip->output_encoding;
+  if (output_encoding & (kfChrOutputPrefix | kfChrOutput0M)) {
+    if (output_encoding == kfChrOutput0M) {
+      // force two chars
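+      // e.g. chr5 renders as "05" and chrX as "0X"; XY keeps both letters.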
+      if (chr_idx <= cip->autosome_ct) {
+	buf = memcpya(buf, &(kDigitPair[chr_idx]), 2);
+      } else if ((int32_t)chr_idx == cip->xymt_codes[kChrOffsetXY]) {
+	buf = strcpya(buf, "XY");
+      } else {
+	*buf++ = '0';
+	if ((int32_t)chr_idx == cip->xymt_codes[kChrOffsetX]) {
+	  *buf++ = 'X';
+	} else {
+	  // assumes only X/Y/XY/MT defined
+	  *buf++ = ((int32_t)chr_idx == cip->xymt_codes[kChrOffsetY])? 'Y' : 'M';
+	}
+      }
+      return buf;
+    }
+    buf = memcpyl3a(buf, "chr");
+  }
+  if ((!(output_encoding & (kfChrOutputM | kfChrOutputMT))) || (chr_idx <= cip->autosome_ct)) {
+    return uint32toa(chr_idx, buf);
+  }
+  if ((int32_t)chr_idx == cip->xymt_codes[kChrOffsetX]) {
+    *buf++ = 'X';
+  } else if ((int32_t)chr_idx == cip->xymt_codes[kChrOffsetY]) {
+    *buf++ = 'Y';
+  } else if ((int32_t)chr_idx == cip->xymt_codes[kChrOffsetXY]) {
+    buf = strcpya(buf, "XY");
+  } else {
+    *buf++ = 'M';
+    if (output_encoding & kfChrOutputMT) {
+      *buf++ = 'T';
+    }
+  }
+  return buf;
+}
+
+char* chr_name_write(const chr_info_t* cip, uint32_t chr_idx, char* buf) {
+  // assumes chr_idx is valid
+  if (!chr_idx) {
+    *buf++ = '0';
+    return buf;
+  }
+  if (chr_idx <= cip->max_code) {
+    return chr_name_std(cip, chr_idx, buf);
+  }
+  if (cip->zero_extra_chrs) {
+    *buf++ = '0';
+    return buf;
+  }
+  return strcpya(buf, cip->nonstd_names[chr_idx]);
+}
+
+uint32_t get_max_chr_slen(const chr_info_t* cip) {
+  // does not include trailing null
+  // can be overestimate
+  // if more functions start calling this, it should just be built into
+  // load_bim() instead
+  if (cip->zero_extra_chrs) {
+    return 3 + kMaxChrTextnum;
+  }
+  const uint32_t chr_ct = cip->chr_ct;
+  const uint32_t max_code = cip->max_code;
+  uint32_t max_chr_slen = 3 + kMaxChrTextnum;
+  for (uint32_t chr_fo_idx = 0; chr_fo_idx < chr_ct; ++chr_fo_idx) {
+    const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+    if (!is_set(cip->chr_mask, chr_idx)) {
+      continue;
+    }
+    if (chr_idx > max_code) {
+      const uint32_t name_slen = strlen(cip->nonstd_names[chr_idx]);
+      if (name_slen > max_chr_slen) {
+	max_chr_slen = name_slen;
+      }
+    }
+  }
+  return max_chr_slen;
+}
+
+uint32_t haploid_chr_present(const chr_info_t* cip) {
+  const uintptr_t* chr_mask = cip->chr_mask;
+  const uintptr_t* haploid_mask = cip->haploid_mask;
+  // since we don't load haploid vs. diploid info from ##contig header lines,
+  // this is sufficient
+  for (uint32_t widx = 0; widx < kChrExcludeWords; ++widx) {
+    if (chr_mask[widx] & haploid_mask[widx]) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+static inline int32_t single_cap_letter_chrom(uint32_t cap_letter) {
+  if (cap_letter == 'X') {
+    return kChrRawX;
+  }
+  if (cap_letter == 'Y') {
+    return kChrRawY;
+  }
+  if (cap_letter == 'M') {
+    return kChrRawMT;
+  }
+  return -1;
+}
+
+static_assert(kMaxChrTextnumSlen == 2, "get_chr_code_raw() must be updated.");
+int32_t get_chr_code_raw(const char* sptr) {
+  // any character <= ' ' is considered a terminator
+  // note that char arithmetic tends to be compiled to int32 operations, so we
+  // mostly work with ints here
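+  // (c & 0xdf) clears ASCII bit 5, mapping lowercase letters to uppercase
+  // (e.g. 'x' = 0x78 -> 'X' = 0x58), so the letter comparisons below are
+  // case-insensitive.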
+  uint32_t first_char_code = (unsigned char)sptr[0];
+  uint32_t first_char_toi;
+  if (first_char_code < 58) {
+  get_chr_code_raw_digits:
+    first_char_toi = first_char_code - '0';
+    if (first_char_toi < 10) {
+      const uint32_t second_char_code = (unsigned char)sptr[1];
+      if (second_char_code <= ' ') {
+	return first_char_toi;
+      }
+      if (((unsigned char)sptr[2]) <= ' ') {
+	const uint32_t second_char_toi = second_char_code - '0';
+	if (second_char_toi < 10) {
+	  return first_char_toi * 10 + second_char_toi;
+	}
+	if (!first_char_toi) {
+	  // accept '0X', '0Y', '0M' emitted by Oxford software
+	  return single_cap_letter_chrom(second_char_code & 0xdf);
+	}
+      }
+    }
+    return -1;
+  }
+  first_char_code &= 0xdf;
+  uint32_t second_char_code = (unsigned char)sptr[1];
+  if (first_char_code == 'P') {
+    // chrPAR1 *not* supported; has to be PAR1 by itself.
+    // can't do uint16_t compare of multiple characters, since we could be
+    // dealing with a length-1 null-terminated string; that IS faster when it's
+    // safe, though
+    if (((second_char_code & 0xdf) == 'A') && ((((unsigned char)sptr[2]) & 0xdf) == 'R')) {
+      const uint32_t par_idx_m1 = ((unsigned char)sptr[3]) - '1';
+      if ((par_idx_m1 < 2) && (((unsigned char)sptr[4]) <= ' ')) {
+	return kChrRawPAR1 + par_idx_m1;
+      }
+    }
+    return -1;
+  }
+  if (first_char_code == 'C') {
+    if (((second_char_code & 0xdf) != 'H') || ((((unsigned char)sptr[2]) & 0xdf) != 'R')) {
+      return -1;
+    }
+    sptr = &(sptr[3]);
+    first_char_code = (unsigned char)sptr[0];
+    if (first_char_code < 58) {
+      goto get_chr_code_raw_digits;
+    }
+    first_char_code &= 0xdf;
+    second_char_code = (unsigned char)sptr[1];
+  }
+  if (second_char_code <= ' ') {
+    return single_cap_letter_chrom(first_char_code);
+  }
+  if (((unsigned char)sptr[2]) <= ' ') {
+    second_char_code &= 0xdf;
+    if ((first_char_code == 'X') && (second_char_code == 'Y')) {
+      return kChrRawXY;
+    } else if ((first_char_code == 'M') && (second_char_code == 'T')) {
+      return kChrRawMT;
+    }
+  }
+  return -1;
+}
+
+int32_t get_chr_code(const char* chr_name, const chr_info_t* cip, uint32_t name_slen) {
+  // requires chr_name to be null-terminated
+  // in practice, name_slen will usually already be known, may as well avoid
+  // redundant strlen() calls even though this uglifies the interface
+  // does not perform exhaustive error-checking
+  // -1 = --allow-extra-chr ok, -2 = total fail
+  int32_t chr_code_raw = get_chr_code_raw(chr_name);
+  if (((uint32_t)chr_code_raw) <= cip->max_code) {
+    return chr_code_raw;
+  }
+  if (chr_code_raw != -1) {
+    if (chr_code_raw >= ((int32_t)kMaxContigs)) {
+      return cip->xymt_codes[chr_code_raw - kMaxContigs];
+    }
+    return -2;
+  }
+  if (!cip->name_ct) {
+    return -1;
+  }
+  // 0xffffffffU gets cast to -1
+  return (int32_t)id_htable_find(chr_name, cip->nonstd_names, cip->nonstd_id_htable, name_slen, kChrHtableSize);
+}
+
+int32_t get_chr_code_counted(const chr_info_t* cip, uint32_t name_slen, char* chr_name) {
+  // when the chromosome name isn't null-terminated
+  char* s_end = &(chr_name[name_slen]);
+  const char tmpc = *s_end;
+  *s_end = '\0';
+  const int32_t chr_code = get_chr_code(chr_name, cip, name_slen);
+  *s_end = tmpc;
+  return chr_code;
+}
+
+void chr_error(const char* chr_name, const char* file_descrip, const chr_info_t* cip, uintptr_t line_idx, int32_t error_code) {
+  // assumes chr_name is null-terminated
+  const int32_t raw_code = get_chr_code_raw(chr_name);
+  logprint("\n");
+  if (line_idx) {
+    LOGERRPRINTFWW("Error: Invalid chromosome code '%s' on line %" PRIuPTR " of %s.\n", chr_name, line_idx, file_descrip);
+  } else {
+    LOGERRPRINTFWW("Error: Invalid chromosome code '%s' in %s.\n", chr_name, file_descrip);
+  }
+  if ((raw_code > ((int32_t)cip->max_code)) && ((raw_code <= (int32_t)(kMaxChrTextnum + kChrOffsetCt)) || (raw_code >= ((int32_t)kMaxContigs)))) {
+    if (cip->chrset_source == kChrsetSourceDefault) {
+      logerrprint("(This is disallowed for humans.  Check if the problem is with your data, or if\nyou forgot to define a different chromosome set with e.g. --chr-set.).\n");
+    } else if (cip->chrset_source == kChrsetSourceCmdline) {
+      logerrprint("(This is disallowed by your command-line flags.)\n");
+    } else {
+      // kChrsetSourceFile
+      logerrprint("(This is disallowed by the file's own ##chrSet header line.)\n");
+    }
+  } else if (error_code == -1) {
+    logerrprint("(Use --allow-extra-chr to force it to be accepted.)\n");
+  }
+}
+
+pglerr_t try_to_add_chr_name(const char* chr_name, const char* file_descrip, uintptr_t line_idx, uint32_t name_slen, uint32_t allow_extra_chrs, int32_t* chr_idx_ptr, chr_info_t* cip) {
+  // assumes chr_name is either nonstandard (i.e. not "2", "chr2", "chrX",
+  // etc.), or a rejected xymt.
+  // requires chr_name to be null-terminated
+  // assumes chr_idx currently has the return value of get_chr_code()
+  if ((!allow_extra_chrs) || ((*chr_idx_ptr) == -2)) {
+    chr_error(chr_name, file_descrip, cip, line_idx, *chr_idx_ptr);
+    return kPglRetMalformedInput;
+  }
+
+  // quasi-bugfix: remove redundant hash table check
+  
+  if (chr_name[0] == '#') {
+    // redundant with some of the comment-skipping loaders, but this isn't
+    // performance-critical
+    logprint("\n");
+    logerrprint("Error: Chromosome/contig names may not begin with '#'.\n");
+    return kPglRetMalformedInput;
+  }
+  if (name_slen > kMaxIdSlen) {
+    logprint("\n");
+    if (line_idx) {
+      LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s has an excessively long chromosome/contig name. (The " PROG_NAME_STR " limit is " MAX_ID_SLEN_STR " characters.)\n", line_idx, file_descrip);
+    } else {
+      LOGERRPRINTFWW("Error: Excessively long chromosome/contig name in %s. (The " PROG_NAME_STR " limit is " MAX_ID_SLEN_STR " characters.)\n", file_descrip);
+    }
+    return kPglRetMalformedInput;
+  }
+  const uint32_t max_code_p1 = cip->max_code + 1;
+  const uint32_t name_ct = cip->name_ct;
+  const uint32_t chr_code_end = max_code_p1 + name_ct;
+  if (chr_code_end == kMaxContigs) {
+    logprint("\n");
+    logerrprint("Error: Too many distinct nonstandard chromosome/contig names.\n");
+    return kPglRetMalformedInput;
+  }
+  if (!name_ct) {
+    // lazy initialization
+    fill_uint_one(kChrHtableSize, cip->nonstd_id_htable);
+  }
+  char** nonstd_names = cip->nonstd_names;
+  if (pgl_malloc(name_slen + 1, &(nonstd_names[chr_code_end]))) {
+    return kPglRetNomem;
+  }
+  ll_str_t* name_stack_ptr = cip->incl_excl_name_stack;
+  uint32_t in_name_stack = 0;
+  while (name_stack_ptr) {
+    // there shouldn't be many of these, so sorting is unimportant
+    if (!strcmp(chr_name, name_stack_ptr->ss)) {
+      in_name_stack = 1;
+      break;
+    }
+    name_stack_ptr = name_stack_ptr->next;
+  }
+  if ((in_name_stack && cip->is_include_stack) || ((!in_name_stack) && (!cip->is_include_stack))) {
+    SET_BIT(chr_code_end, cip->chr_mask);
+    if (cip->haploid_mask[0] & 1) {
+      SET_BIT(chr_code_end, cip->haploid_mask);
+    }
+  }
+  memcpy(nonstd_names[chr_code_end], chr_name, name_slen + 1);
+  *chr_idx_ptr = (int32_t)chr_code_end;
+  cip->name_ct = name_ct + 1;
+  uint32_t* id_htable = cip->nonstd_id_htable;
+  uint32_t hashval = hashceil(chr_name, name_slen, kChrHtableSize);
+  while (1) {
+    if (id_htable[hashval] == 0xffffffffU) {
+      id_htable[hashval] = chr_code_end;
+      return kPglRetSuccess;
+    }
+    if (++hashval == kChrHtableSize) {
+      hashval = 0;
+    }
+  }
+}
+
+uintptr_t popcount_bit_idx(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) {
+  uintptr_t start_idxl = start_idx / kBitsPerWord;
+  const uintptr_t start_idxlr = start_idx & (kBitsPerWord - 1);
+  const uintptr_t end_idxl = end_idx / kBitsPerWord;
+  const uintptr_t end_idxlr = end_idx & (kBitsPerWord - 1);
+  uintptr_t ct = 0;
+  if (start_idxl == end_idxl) {
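+    // mask selects bits [start_idxlr, end_idxlr) of the shared word; e.g.
+    // start_idxlr == 2, end_idxlr == 5 yields 0b11100.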
+    return popcount_long(bitvec[start_idxl] & ((k1LU << end_idxlr) - (k1LU << start_idxlr)));
+  }
+  if (start_idxlr) {
+    ct = popcount_long(bitvec[start_idxl++] >> start_idxlr);
+  }
+  if (end_idxl > start_idxl) {
+    ct += popcount_longs_nzbase(bitvec, start_idxl, end_idxl);
+  }
+  if (end_idxlr) {
+    ct += popcount_long(bitvec[end_idxl] & ((k1LU << end_idxlr) - k1LU));
+  }
+  return ct;
+}
+
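+// (Editorial note.)  The single-word fast path of popcount_bit_idx() relies
+// on
+//   (k1LU << end_idxlr) - (k1LU << start_idxlr)
+// selecting the half-open bit range [start_idxlr, end_idxlr); e.g. with
+// start_idxlr == 3 and end_idxlr == 7, (1 << 7) - (1 << 3) == 0b01111000,
+// i.e. exactly bits 3..6.
+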
+static inline uintptr_t popcount_vecs_intersect(const vul_t* __restrict vvec1_iter, const vul_t* __restrict vvec2_iter, uintptr_t vec_ct) {
+  // popcounts vvec1 AND vvec2[0..(vec_ct-1)].  vec_ct must be a multiple of
+  // 3.
+  assert(!(vec_ct % 3));
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t m2 = VCONST_UL(kMask3333);
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  const vul_t m8 = VCONST_UL(kMask00FF);
+  uintptr_t tot = 0;
+  while (1) {
+    univec_t acc;
+    acc.vi = vul_setzero();
+    const vul_t* vvec1_stop;
+    if (vec_ct < 30) {
+      if (!vec_ct) {
+	return tot;
+      }
+      vvec1_stop = &(vvec1_iter[vec_ct]);
+      vec_ct = 0;
+    } else {
+      vvec1_stop = &(vvec1_iter[30]);
+      vec_ct -= 30;
+    }
+    do {
+      vul_t count1 = (*vvec1_iter++) & (*vvec2_iter++);
+      vul_t count2 = (*vvec1_iter++) & (*vvec2_iter++);
+      vul_t half1 = (*vvec1_iter++) & (*vvec2_iter++);
+      const vul_t half2 = vul_rshift(half1, 1) & m1;
+      half1 = half1 & m1;
+      count1 = count1 - (vul_rshift(count1, 1) & m1);
+      count2 = count2 - (vul_rshift(count2, 1) & m1);
+      count1 = count1 + half1;
+      count2 = count2 + half2;
+      count1 = (count1 & m2) + (vul_rshift(count1, 2) & m2);
+      count1 = count1 + (count2 & m2) + (vul_rshift(count2, 2) & m2);
+      acc.vi = acc.vi + (count1 & m4) + (vul_rshift(count1, 4) & m4);
+    } while (vvec1_iter < vvec1_stop);
+    acc.vi = (acc.vi & m8) + (vul_rshift(acc.vi, 8) & m8);
+    tot += univec_hsum_16bit(acc);
+  }
+}
+
+uintptr_t popcount_longs_intersect(const uintptr_t* __restrict bitvec1_iter, const uintptr_t* __restrict bitvec2_iter, uintptr_t word_ct) {
+  uintptr_t tot = 0;
+  const uintptr_t* bitvec1_end = &(bitvec1_iter[word_ct]);
+  const uintptr_t trivec_ct = word_ct / (3 * kWordsPerVec);
+  tot += popcount_vecs_intersect((const vul_t*)bitvec1_iter, (const vul_t*)bitvec2_iter, trivec_ct * 3);
+  bitvec1_iter = &(bitvec1_iter[trivec_ct * (3 * kWordsPerVec)]);
+  bitvec2_iter = &(bitvec2_iter[trivec_ct * (3 * kWordsPerVec)]);
+  while (bitvec1_iter < bitvec1_end) {
+    tot += popcount_long((*bitvec1_iter++) & (*bitvec2_iter++));
+  }
+  return tot;
+}
+
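+// (Editorial sketch, not part of the upstream source.)  The vectorized
+// routines above must agree with this straightforward scalar loop;
+// popcount_longs_intersect_slow is a hypothetical name:
+//
+//   uintptr_t popcount_longs_intersect_slow(const uintptr_t* bitvec1,
+//                                           const uintptr_t* bitvec2,
+//                                           uintptr_t word_ct) {
+//     uintptr_t tot = 0;
+//     for (uintptr_t widx = 0; widx < word_ct; ++widx) {
+//       tot += popcount_long(bitvec1[widx] & bitvec2[widx]);
+//     }
+//     return tot;
+//   }
+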
+/*
+uintptr_t count_11_vecs(const vul_t* geno_vvec, uintptr_t vec_ct) {
+  // Counts number of aligned 11s in vptr[0..(vec_ct-1)].  Assumes vec_ct is a
+  // multiple of 6 (0 ok).
+  assert(!(vec_ct % 6));
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t m2 = VCONST_UL(kMask3333);
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  const vul_t m8 = VCONST_UL(kMask00FF);
+  const vul_t* geno_vvec_iter = geno_vvec;
+  const vul_t* geno_vvec_end = &(geno_vvec[vec_ct]);
+  uintptr_t tot = 0;
+
+  while (1) {
+    const vul_t* geno_vvec_stop = &(geno_vvec_iter[60]);
+
+    univec_t acc;
+    acc.vi = vul_setzero();
+    
+    if (geno_vvec_stop > geno_vvec_end) {
+      if (geno_vvec_iter == geno_vvec_end) {
+	return tot;
+      }
+      geno_vvec_stop = geno_vvec_end;
+    }
+    do {
+      vul_t cur_geno_vword = *geno_vvec_iter++;
+      vul_t count1 = cur_geno_vword & m1;
+      count1 = count1 & vul_rshift(cur_geno_vword, 1);
+      
+      cur_geno_vword = *geno_vvec_iter++;
+      vul_t cur_11 = cur_geno_vword & m1;
+      cur_11 = cur_11 & vul_rshift(cur_geno_vword, 1);
+      count1 = count1 + cur_11;
+
+      cur_geno_vword = *geno_vvec_iter++;
+      cur_11 = cur_geno_vword & m1;
+      cur_11 = cur_11 & vul_rshift(cur_geno_vword, 1);
+      count1 = count1 + cur_11;
+      count1 = (count1 & m2) + (vul_rshift(count1, 2) & m2);
+
+      cur_geno_vword = *geno_vvec_iter++;
+      vul_t count2 = cur_geno_vword & m1;
+      count2 = count2 & vul_rshift(cur_geno_vword, 1);
+      
+      cur_geno_vword = *geno_vvec_iter++;
+      cur_11 = cur_geno_vword & m1;
+      cur_11 = cur_11 & vul_rshift(cur_geno_vword, 1);
+      count2 = count2 + cur_11;
+
+      cur_geno_vword = *geno_vvec_iter++;
+      cur_11 = cur_geno_vword & m1;
+      cur_11 = cur_11 & vul_rshift(cur_geno_vword, 1);
+      count2 = count2 + cur_11;
+      count1 = count1 + (count2 & m2) + (vul_rshift(count2, 2) & m2);
+
+      acc.vi = acc.vi + (count1 & m4) + (vul_rshift(count1, 4) & m4);
+    } while (geno_vvec_iter < geno_vvec_stop);
+    acc.vi = (acc.vi & m8) + (vul_rshift(acc.vi, 8) & m8);
+    tot += univec_hsum_16bit(acc);
+  }
+}
+
+uintptr_t count_11_longs(const uintptr_t* genovec, uintptr_t word_ct) {
+  uintptr_t tot = 0;
+  if (word_ct >= (6 * kWordsPerVec)) {
+    assert(IS_VEC_ALIGNED(genovec));
+    const uintptr_t remainder = word_ct % (6 * kWordsPerVec);
+    const uintptr_t main_block_word_ct = word_ct - remainder;
+    tot = count_11_vecs((const vul_t*)genovec, main_block_word_ct / kWordsPerVec);
+    word_ct = remainder;
+    genovec = &(genovec[main_block_word_ct]);
+  }
+  for (uintptr_t trailing_word_idx = 0; trailing_word_idx < word_ct; ++trailing_word_idx) {
+    const uintptr_t cur_geno_word = genovec[trailing_word_idx];
+    tot += popcount01_long(cur_geno_word & (cur_geno_word >> 1) & kMask5555);
+  }
+  return tot;
+}
+*/
+
+uint32_t are_all_bits_zero(const uintptr_t* bitarr, uintptr_t start_idx, uintptr_t end_idx) {
+  uintptr_t start_idxl = start_idx / kBitsPerWord;
+  const uintptr_t start_idxlr = start_idx & (kBitsPerWord - 1);
+  const uintptr_t end_idxl = end_idx / kBitsPerWord;
+  const uintptr_t end_idxlr = end_idx & (kBitsPerWord - 1);
+  if (start_idxl == end_idxl) {
+    return !(bitarr[start_idxl] & ((k1LU << end_idxlr) - (k1LU << start_idxlr)));
+  }
+  if (start_idxlr && (bitarr[start_idxl++] >> start_idxlr)) {
+    return 0;
+  }
+  for (; start_idxl < end_idxl; ++start_idxl) {
+    if (bitarr[start_idxl]) {
+      return 0;
+    }
+  }
+  if (!end_idxlr) {
+    return 1;
+  }
+  return !(bitarr[end_idxl] & ((k1LU << end_idxlr) - k1LU));
+}
+
+void copy_bitarr_range(const uintptr_t* __restrict src_bitarr, uintptr_t src_start_bitidx, uintptr_t target_start_bitidx, uintptr_t len, uintptr_t* __restrict target_bitarr) {
+  // assumes len is positive, and relevant bits of target_bitarr are zero
+  const uintptr_t* src_bitarr_iter = &(src_bitarr[src_start_bitidx / kBitsPerWord]);
+  uint32_t src_rshift = src_start_bitidx % kBitsPerWord;
+  uintptr_t* target_bitarr_iter = &(target_bitarr[target_start_bitidx / kBitsPerWord]);
+  uint32_t target_initial_lshift = target_start_bitidx % kBitsPerWord;
+  uintptr_t cur_src_word;
+  if (target_initial_lshift) {
+    const uint32_t initial_copy_bitct = kBitsPerWord - target_initial_lshift;
+    if (len <= initial_copy_bitct) {
+      goto copy_bitarr_range_last_partial_word;
+    }
+    cur_src_word = (*src_bitarr_iter) >> src_rshift;
+    if (src_rshift >= target_initial_lshift) {
+      ++src_bitarr_iter;
+      cur_src_word |= (*src_bitarr_iter) << (kBitsPerWord - src_rshift);
+    }
+    *target_bitarr_iter++ |= cur_src_word << target_initial_lshift;
+    src_rshift = (src_rshift + initial_copy_bitct) % kBitsPerWord;
+    len -= initial_copy_bitct;
+  }
+  {
+    const uintptr_t fullword_ct = len / kBitsPerWord;
+    if (!src_rshift) {
+      memcpy(target_bitarr_iter, src_bitarr_iter, fullword_ct * sizeof(intptr_t));
+      target_bitarr_iter = &(target_bitarr_iter[fullword_ct]);
+      src_bitarr_iter = &(src_bitarr_iter[fullword_ct]);
+    } else {
+      const uint32_t src_lshift = kBitsPerWord - src_rshift;
+      cur_src_word = *src_bitarr_iter;
+      for (uintptr_t widx = 0; widx < fullword_ct; ++widx) {
+	const uintptr_t next_src_word = *(++src_bitarr_iter);
+	*target_bitarr_iter++ = (cur_src_word >> src_rshift) | (next_src_word << src_lshift);
+	cur_src_word = next_src_word;
+      }
+    }
+  }
+  len %= kBitsPerWord;
+  if (len) {
+    target_initial_lshift = 0;
+  copy_bitarr_range_last_partial_word:
+    cur_src_word = (*src_bitarr_iter) >> src_rshift;
+    if (len + src_rshift > kBitsPerWord) {
+      cur_src_word |= src_bitarr_iter[1] << (kBitsPerWord - src_rshift);
+    }
+    *target_bitarr_iter |= (cur_src_word & ((~k0LU) >> (kBitsPerWord - ((uint32_t)len)))) << target_initial_lshift;
+  }
+}
+
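+// (Editorial note.)  Example: copy_bitarr_range(src, 5, 3, 13, target)
+// copies src bits 5..17 into target bits 3..15; the caller must have
+// zero-initialized those target bits, since the routine only ORs into them.
+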
+void interleaved_mask_zero(const uintptr_t* __restrict interleaved_mask, uintptr_t vec_ct, uintptr_t* __restrict genovec) {
+  const uintptr_t twovec_ct = vec_ct / 2;
+#ifdef __LP64__
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t* interleaved_mask_iter = (const vul_t*)interleaved_mask;
+  vul_t* genovvec_iter = (vul_t*)genovec;
+  for (uintptr_t twovec_idx = 0; twovec_idx < twovec_ct; ++twovec_idx) {
+    const vul_t mask_vvec = *interleaved_mask_iter++;
+    vul_t mask_first = mask_vvec & m1;
+    mask_first = mask_first | vul_lshift(mask_first, 1);
+    vul_t mask_second = (~m1) & mask_vvec;
+    mask_second = mask_second | vul_rshift(mask_second, 1);
+    *genovvec_iter = (*genovvec_iter) & mask_first;
+    ++genovvec_iter;
+    *genovvec_iter = (*genovvec_iter) & mask_second;
+    ++genovvec_iter;
+  }
+  if (vec_ct & 1) {
+    vul_t mask_first = *interleaved_mask_iter;
+    mask_first = mask_first | vul_lshift(mask_first, 1);
+    *genovvec_iter = (*genovvec_iter) & mask_first;
+  }
+#else
+  const uintptr_t* interleaved_mask_iter = interleaved_mask;
+  uintptr_t* genovec_iter = genovec;
+  for (uintptr_t twovec_idx = 0; twovec_idx < twovec_ct; ++twovec_idx) {
+    const uintptr_t mask_word = *interleaved_mask_iter++;
+    *genovec_iter &= (mask_word & kMask5555) * 3;
+    ++genovec_iter;
+    *genovec_iter &= ((mask_word >> 1) & kMask5555) * 3;
+    ++genovec_iter;
+  }
+  if (vec_ct & 1) {
+    const uintptr_t mask_word = *interleaved_mask_iter;
+    *genovec_iter &= mask_word * 3;
+  }
+#endif
+}
+
+void interleaved_set_missing(const uintptr_t* __restrict interleaved_set, uintptr_t vec_ct, uintptr_t* __restrict genovec) {
+  const uintptr_t twovec_ct = vec_ct / 2;
+#ifdef __LP64__
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t* interleaved_set_iter = (const vul_t*)interleaved_set;
+  vul_t* genovvec_iter = (vul_t*)genovec;
+  for (uintptr_t twovec_idx = 0; twovec_idx < twovec_ct; ++twovec_idx) {
+    const vul_t set_vvec = *interleaved_set_iter++;
+    vul_t set_first = set_vvec & m1;
+    set_first = set_first | vul_lshift(set_first, 1);
+    vul_t set_second = (~m1) & set_vvec;
+    set_second = set_second | vul_rshift(set_second, 1);
+    *genovvec_iter = (*genovvec_iter) | set_first;
+    ++genovvec_iter;
+    *genovvec_iter = (*genovvec_iter) | set_second;
+    ++genovvec_iter;
+  }
+  if (vec_ct & 1) {
+    vul_t set_first = *interleaved_set_iter;
+    set_first = set_first | vul_lshift(set_first, 1);
+    *genovvec_iter = (*genovvec_iter) | set_first;
+  }
+#else
+  const uintptr_t* interleaved_set_iter = interleaved_set;
+  uintptr_t* genovec_iter = genovec;
+  for (uintptr_t twovec_idx = 0; twovec_idx < twovec_ct; ++twovec_idx) {
+    const uintptr_t set_word = *interleaved_set_iter++;
+    *genovec_iter |= (set_word & kMask5555) * 3;
+    ++genovec_iter;
+    *genovec_iter |= ((set_word >> 1) & kMask5555) * 3;
+    ++genovec_iter;
+  }
+  if (vec_ct & 1) {
+    const uintptr_t set_word = *interleaved_set_iter;
+    *genovec_iter |= set_word * 3;
+  }
+#endif
+}
+
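+// (Editorial note.)  Worked example of the interleaved encoding, using the
+// 32-bit fallback path above: if set_word == 0b0110, then
+//   (set_word & kMask5555) * 3        == 0b1100  (sample 1 of genovec word 2k
+//                                                 becomes 11 = missing)
+//   ((set_word >> 1) & kMask5555) * 3 == 0b0011  (sample 0 of word 2k+1)
+// so the even bits of each mask word drive one genovec word and the odd bits
+// drive the next; multiplying a 01-pattern by 3 widens each set bit to a
+// 2-bit 11.  interleaved_mask_zero() uses the same layout with AND.
+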
+void set_male_het_missing(const uintptr_t* __restrict sex_male_interleaved, uint32_t vec_ct, uintptr_t* __restrict genovec) {
+  const uint32_t twovec_ct = vec_ct / 2;
+#ifdef __LP64__
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t* sex_male_interleaved_iter = (const vul_t*)sex_male_interleaved;
+  vul_t* genovvec_iter = (vul_t*)genovec;
+  for (uint32_t twovec_idx = 0; twovec_idx < twovec_ct; ++twovec_idx) {
+    const vul_t sex_male_vvec = *sex_male_interleaved_iter++;
+    // we wish to bitwise-or with (sex_male_quatervec_01 & genovec) << 1
+    const vul_t sex_male_first = sex_male_vvec & m1;
+    const vul_t sex_male_second_shifted = (~m1) & sex_male_vvec;
+    vul_t cur_geno_vword = *genovvec_iter;
+    
+    const vul_t missing_male_vword = sex_male_first & cur_geno_vword;
+    
+    *genovvec_iter++ = cur_geno_vword | vul_lshift(missing_male_vword, 1);
+    cur_geno_vword = *genovvec_iter;
+    *genovvec_iter++ = cur_geno_vword | (sex_male_second_shifted & vul_lshift(cur_geno_vword, 1));
+  }
+  if (vec_ct & 1) {
+    const vul_t sex_male_first = (*sex_male_interleaved_iter) & m1;
+    const vul_t cur_geno_vword = *genovvec_iter;
+    const vul_t missing_male_vword = sex_male_first & cur_geno_vword;
+    *genovvec_iter = cur_geno_vword | vul_lshift(missing_male_vword, 1);
+  }
+#else
+  const uintptr_t* sex_male_interleaved_iter = sex_male_interleaved;
+  uintptr_t* genovec_iter = genovec;
+  for (uint32_t twovec_idx = 0; twovec_idx < twovec_ct; ++twovec_idx) {
+    const uintptr_t sex_male_word = *sex_male_interleaved_iter++;
+    uintptr_t cur_geno_word = *genovec_iter;
+    *genovec_iter++ = cur_geno_word | ((sex_male_word & kMask5555 & cur_geno_word) << 1);
+    cur_geno_word = *genovec_iter;
+    *genovec_iter++ = cur_geno_word | (sex_male_word & kMaskAAAA & (cur_geno_word << 1));
+  }
+  if (vec_ct & 1) {
+    const uintptr_t sex_male_word = *sex_male_interleaved_iter;
+    uintptr_t cur_geno_word = *genovec_iter;
+    *genovec_iter = cur_geno_word | ((sex_male_word & kMask5555 & cur_geno_word) << 1);
+  }
+#endif
+}
+
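+// (Editorial note.)  Per the in-function comment in set_male_het_missing():
+// OR-ing a male genotype g with (g & 01) << 1 maps 01 (het) to 11 (missing)
+// while leaving 00, 10, and 11 unchanged; the interleaved sex_male vector
+// just applies this two words at a time.
+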
+// Clears each bit in bitarr which doesn't correspond to a genovec het.
+// Assumes that either trailing bits of bitarr are already zero, or trailing
+// bits of genovec are zero.
+//
+// Similar to pgr_detect_genovec_hets_unsafe(). 
+void mask_genovec_hets_unsafe(const uintptr_t* __restrict genovec, uint32_t raw_sample_ctl2, uintptr_t* __restrict bitarr) {
+  halfword_t* bitarr_alias = (halfword_t*)bitarr;
+  for (uint32_t widx = 0; widx < raw_sample_ctl2; ++widx) {
+    const uintptr_t cur_word = genovec[widx];
+    uintptr_t ww = (~(cur_word >> 1)) & cur_word & kMask5555; // low 1, high 0
+    bitarr_alias[widx] &= pack_word_to_halfword(ww);
+  }
+}
+
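+// (Editorial note.)  Worked example: with the 2-bit encoding
+// {00 = hom ref, 01 = het, 10 = hom alt, 11 = missing} and
+// cur_word == 0b11100100 (samples 3..0 = missing, hom alt, het, hom ref),
+//   (~(cur_word >> 1)) & cur_word & kMask5555 == 0b00000100
+// leaves only the het sample's bit pair set; pack_word_to_halfword() then
+// squeezes each surviving 2-bit pair down to one bit of the halfword mask.
+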
+/*
+uint32_t chr_window_max(const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bp, uint32_t chr_fo_idx, uint32_t ct_max, uint32_t bp_max, uint32_t cur_window_max) {
+  if (cur_window_max >= ct_max) {
+    return ct_max;
+  }
+  const uint32_t chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+  uint32_t variant_uidx = next_set(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], chr_end);
+  const uint32_t variant_ct = popcount_bit_idx(variant_include, variant_uidx, chr_end);
+  if (variant_ct <= cur_window_max) {
+    return cur_window_max;
+  }
+  uint32_t window_idx_first = 0;
+  uint32_t window_uidx_first = variant_uidx;
+  uint32_t window_bp_first = variant_bp[variant_uidx];
+  for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_uidx, ++variant_idx) {
+    next_set_unsafe_ck(variant_include, &variant_uidx);
+    uint32_t variant_bp_thresh = variant_bp[variant_uidx];
+    if (variant_bp_thresh < bp_max) {
+      variant_bp_thresh = 0;
+    } else {
+      variant_bp_thresh -= bp_max;
+    }
+    if (variant_bp_thresh > window_bp_first) {
+      do {
+        ++window_uidx_first;
+        next_set_unsafe_ck(variant_include, &window_uidx_first);
+        window_bp_first = variant_bp[window_uidx_first];
+        ++window_idx_first;
+      } while (variant_bp_thresh > window_bp_first);
+    } else if (variant_idx - window_idx_first == cur_window_max) {
+      if (++cur_window_max == ct_max) {
+	return cur_window_max;
+      }
+    }
+  }
+  return cur_window_max;
+}
+*/
+
+// advances forward_ct set bits; forward_ct must be positive.  (stays put if
+// forward_ct == 1 and current bit is set.  may want to tweak this interface,
+// easy to introduce off-by-one bugs...)
+// In usual 64-bit case, also assumes bitvec is 16-byte aligned and the end of
+// the trailing 16-byte block can be safely read from.
+uintptr_t jump_forward_set_unsafe(const uintptr_t* bitvec, uintptr_t cur_pos, uintptr_t forward_ct) {
+  assert(forward_ct);
+  uintptr_t widx = cur_pos / kBitsPerWord;
+  uintptr_t ulii = cur_pos % kBitsPerWord;
+  const uintptr_t* bptr = &(bitvec[widx]);
+  uintptr_t uljj;
+  uintptr_t ulkk;
+#ifdef __LP64__
+  const vul_t* vptr;
+  assert(IS_VEC_ALIGNED(bitvec));
+#endif
+  if (ulii) {
+    uljj = (*bptr) >> ulii;
+    ulkk = popcount_long(uljj);
+    if (ulkk >= forward_ct) {
+    jump_forward_set_unsafe_finish:
+      ulkk = CTZLU(uljj);
+      while (--forward_ct) {
+        uljj &= uljj - 1;
+        ulkk = CTZLU(uljj);
+      }
+      return widx * kBitsPerWord + ulii + ulkk;
+    }
+    forward_ct -= ulkk;
+    ++widx;
+    ++bptr;
+  }
+  ulii = 0;
+#ifdef __LP64__
+  while (widx & (kWordsPerVec - k1LU)) {
+    uljj = *bptr;
+    ulkk = popcount_long(uljj);
+    if (ulkk >= forward_ct) {
+      goto jump_forward_set_unsafe_finish;
+    }
+    forward_ct -= ulkk;
+    ++widx;
+    ++bptr;
+  }
+  vptr = (const vul_t*)bptr;
+  while (forward_ct > kBitsPerWord * (3 * kWordsPerVec)) {
+    uljj = ((forward_ct - 1) / (kBitsPerWord * (3 * kWordsPerVec))) * 3;
+    ulkk = popcount_vecs(vptr, uljj);
+    vptr = &(vptr[uljj]);
+    forward_ct -= ulkk;
+  }
+  bptr = (const uintptr_t*)vptr;
+  while (forward_ct > kBitsPerWord) {
+    forward_ct -= popcount_long(*bptr++);
+  }
+#else
+  while (forward_ct > kBitsPerWord) {
+    uljj = (forward_ct - 1) / kBitsPerWord;
+    ulkk = popcount_longs(bptr, uljj);
+    bptr = &(bptr[uljj]);
+    forward_ct -= ulkk;
+  }
+#endif
+  while (1) {
+    uljj = *bptr;
+    ulkk = popcount_long(uljj);
+    if (ulkk >= forward_ct) {
+      widx = (uintptr_t)(bptr - bitvec);
+      goto jump_forward_set_unsafe_finish;
+    }
+    forward_ct -= ulkk;
+    ++bptr;
+  }
+}
+
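+// (Editorial note.)  Example of the interface convention: for
+// bitvec[0] == 0b10110 (bits 1, 2, and 4 set),
+//   jump_forward_set_unsafe(bitvec, 2, 1) == 2  (current bit set: stays put)
+//   jump_forward_set_unsafe(bitvec, 2, 2) == 4  (advances to the next set
+//                                                bit)
+// compute_uidx_start_partition() below depends on exactly this behavior.
+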
+void compute_uidx_start_partition(const uintptr_t* variant_include, uint64_t variant_ct, uint32_t thread_ct, uint32_t first_variant_uidx, uint32_t* variant_uidx_starts) {
+  assert(variant_ct);
+  uint32_t cur_variant_uidx_start = next_set_unsafe(variant_include, first_variant_uidx);
+  uint32_t cur_variant_idx_start = 0;
+  variant_uidx_starts[0] = cur_variant_uidx_start;
+  for (uint32_t tidx = 1; tidx < thread_ct; ++tidx) {
+    const uint32_t new_variant_idx_start = (tidx * variant_ct) / thread_ct;
+    if (new_variant_idx_start != cur_variant_idx_start) {
+      cur_variant_uidx_start = jump_forward_set_unsafe(variant_include, cur_variant_uidx_start + 1, new_variant_idx_start - cur_variant_idx_start);
+      cur_variant_idx_start = new_variant_idx_start;
+    }
+    variant_uidx_starts[tidx] = cur_variant_uidx_start;
+  }
+}
+
+uint32_t not_only_xymt(const uintptr_t* variant_include, const chr_info_t* cip, uint32_t raw_variant_ct, uint32_t xymt_offset) {
+  const uint32_t xymt_code = (uint32_t)cip->xymt_codes[xymt_offset];
+  const uint32_t cur_chr_fo_idx = cip->chr_idx_to_foidx[xymt_code];
+  const uint32_t chr_start = cip->chr_fo_vidx_start[cur_chr_fo_idx];
+  if (chr_start) {
+    const uint32_t first_uidx = next_set_unsafe(variant_include, 0);
+    if (first_uidx < chr_start) {
+      return 1;
+    }
+  }
+  const uint32_t chr_end = cip->chr_fo_vidx_start[cur_chr_fo_idx + 1];
+  return (chr_end < raw_variant_ct) && (next_set(variant_include, chr_end, raw_variant_ct) != raw_variant_ct);
+}
+
+uint32_t count_non_autosomal_variants(const uintptr_t* variant_include, const chr_info_t* cip, uint32_t count_x, uint32_t count_mt) {
+  // for backward compatibility, unplaced markers are considered to be
+  // autosomal here
+  uint32_t ct = 0;
+  if (count_x) {
+    int32_t x_code;
+    if (xymt_exists(cip, kChrOffsetX, &x_code)) {
+      ct += count_chr_variants_unsafe(variant_include, cip, x_code);
+    }
+  }
+  int32_t y_code;
+  if (xymt_exists(cip, kChrOffsetY, &y_code)) {
+    ct += count_chr_variants_unsafe(variant_include, cip, y_code);
+  }
+  if (count_mt) {
+    int32_t mt_code;
+    if (xymt_exists(cip, kChrOffsetMT, &mt_code)) {
+      ct += count_chr_variants_unsafe(variant_include, cip, mt_code);
+    }
+  }
+  return ct;
+}
+
+pglerr_t conditional_allocate_non_autosomal_variants(const chr_info_t* cip, const char* calc_descrip, uint32_t raw_variant_ct, uintptr_t** variant_include_ptr, uint32_t* variant_ct_ptr) {
+  const uint32_t non_autosomal_variant_ct = count_non_autosomal_variants(*variant_include_ptr, cip, 1, 1);
+  if (!non_autosomal_variant_ct) {
+    return kPglRetSuccess;
+  }
+  LOGPRINTF("Excluding %u variant%s on non-autosomes from %s.\n", non_autosomal_variant_ct, (non_autosomal_variant_ct == 1)? "" : "s", calc_descrip);
+  *variant_ct_ptr -= non_autosomal_variant_ct;
+  if (!(*variant_ct_ptr)) {
+    // this may not always be an error condition
+    LOGERRPRINTF("Error: No variants remaining for %s.\n", calc_descrip);
+    return kPglRetInconsistentInput;
+  }
+  const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+  uintptr_t* working_variant_include;
+  if (bigstack_alloc_ul(raw_variant_ctl, &working_variant_include)) {
+    return kPglRetNomem;
+  }
+  memcpy(working_variant_include, *variant_include_ptr, raw_variant_ctl * sizeof(intptr_t));
+  int32_t x_code;
+  if (xymt_exists(cip, kChrOffsetX, &x_code)) {
+    uint32_t chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)x_code];
+    clear_bits_nz(cip->chr_fo_vidx_start[chr_fo_idx], cip->chr_fo_vidx_start[chr_fo_idx + 1], working_variant_include);
+  }
+  int32_t y_code;
+  if (xymt_exists(cip, kChrOffsetY, &y_code)) {
+    uint32_t chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)y_code];
+    clear_bits_nz(cip->chr_fo_vidx_start[chr_fo_idx], cip->chr_fo_vidx_start[chr_fo_idx + 1], working_variant_include);
+  }
+  int32_t mt_code;
+  if (xymt_exists(cip, kChrOffsetMT, &mt_code)) {
+    uint32_t chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)mt_code];
+    clear_bits_nz(cip->chr_fo_vidx_start[chr_fo_idx], cip->chr_fo_vidx_start[chr_fo_idx + 1], working_variant_include);
+  }
+  *variant_include_ptr = working_variant_include;
+  return kPglRetSuccess;
+}
+
+void fill_subset_chr_fo_vidx_start(const uintptr_t* variant_include, const chr_info_t* cip, uint32_t* subset_chr_fo_vidx_start) {
+  const uint32_t chr_ct = cip->chr_ct;
+  subset_chr_fo_vidx_start[0] = 0;
+  uint32_t variant_uidx = 0;
+  uint32_t variant_idx = 0;
+  for (uint32_t chr_fo_idx = 1; chr_fo_idx <= chr_ct; ++chr_fo_idx) {
+    const uint32_t chr_end_variant_uidx = cip->chr_fo_vidx_start[chr_fo_idx];
+    variant_idx += popcount_bit_idx(variant_include, variant_uidx, chr_end_variant_uidx);
+    subset_chr_fo_vidx_start[chr_fo_idx] = variant_idx;
+    variant_uidx = chr_end_variant_uidx;
+  }
+}
+
+boolerr_t allele_set(const char* newval, uint32_t allele_slen, char** allele_ptr) {
+  char* newptr;
+  if (allele_slen == 1) {
+    // const_cast
+    newptr = (char*)((uintptr_t)(&(g_one_char_strs[((unsigned char)(*newval)) * 2])));
+  } else {
+    char* new_alloc;
+    if (pgl_malloc(allele_slen + 1, &new_alloc)) {
+      return 1;
+    }
+    memcpyx(new_alloc, newval, allele_slen, '\0');
+    newptr = new_alloc;
+  }
+  *allele_ptr = newptr;
+  return 0;
+}
+
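+// (Editorial note.)  g_one_char_strs packs all 256 single-character
+// null-terminated strings into one array, two bytes per entry, so
+// &g_one_char_strs[2 * c] is a shared "c\0" string.  This is why
+// single-character alleles above can skip pgl_malloc() entirely.
+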
+boolerr_t allele_reset(const char* newval, uint32_t allele_slen, char** allele_ptr) {
+  char* newptr;
+  if (allele_slen == 1) {
+    // const_cast
+    newptr = (char*)((uintptr_t)(&(g_one_char_strs[((unsigned char)(*newval)) * 2])));
+  } else {
+    char* new_alloc;
+    if (pgl_malloc(allele_slen + 1, &new_alloc)) {
+      return 1;
+    }
+    memcpyx(new_alloc, newval, allele_slen, '\0');
+    newptr = new_alloc;
+  }
+  const uintptr_t bigstack_end_addr = (uintptr_t)g_bigstack_end;
+  const uintptr_t maxdiff = ((uintptr_t)(&(g_one_char_strs[512]))) - bigstack_end_addr;
+  // take advantage of unsigned wraparound
+  if ((((uintptr_t)(*allele_ptr)) - bigstack_end_addr) >= maxdiff) {
+    free(*allele_ptr);
+  }
+  *allele_ptr = newptr;
+  return 0;
+}
+
+void cleanup_allele_storage(uint32_t max_allele_slen, uintptr_t allele_storage_entry_ct, char** allele_storage) {
+  // Now doesn't improperly free bigstack allocations (as long as they aren't
+  // past g_bigstack_end), and doesn't need to be called at all most of the
+  // time.
+  
+  // An alternative representation: have a separate bitarray which indicates
+  // whether the allele_storage[] element should be interpreted as a heap
+  // pointer or an in-place zero-terminated string (i.e. string length can be
+  // up to 7 on 64-bit systems).  I expect that to be more efficient for new
+  // datasets, but let's get the simple (and 1.9-codebase-compatible)
+  // implementation working first, and then benchmark the fancier code later.
+  if (allele_storage && (max_allele_slen > 1)) {
+    const uintptr_t bigstack_end_addr = (uintptr_t)g_bigstack_end;
+    const uintptr_t maxdiff = ((uintptr_t)(&(g_one_char_strs[512]))) - bigstack_end_addr;
+    for (uintptr_t idx = 0; idx < allele_storage_entry_ct; ++idx) {
+      char* cur_entry = allele_storage[idx];
+      assert(cur_entry);
+      // take advantage of unsigned wraparound
+      if ((((uintptr_t)cur_entry) - bigstack_end_addr) >= maxdiff) {
+	free(cur_entry);
+      }
+    }
+  }
+}
+
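+// (Editorial note.)  The "unsigned wraparound" check used above and in
+// allele_reset() works because, for unsigned addresses with base <= limit,
+//   (p - base) >= (limit - base)
+// holds exactly when p < base or p >= limit; a single comparison thus
+// exempts everything in [g_bigstack_end, g_one_char_strs + 512) -- bigstack
+// allocations and the shared one-character strings -- from free().
+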
+char g_missing_catname[kMaxMissingPhenostrBlen];
+char g_output_missing_pheno[kMaxMissingPhenostrBlen];
+char g_legacy_output_missing_pheno[kMaxMissingPhenostrBlen];
+
+void init_pheno() {
+  strcpy(g_missing_catname, "NONE");
+  strcpy(g_output_missing_pheno, "NA");
+  strcpy(g_legacy_output_missing_pheno, "-9");
+}
+
+uint32_t is_categorical_phenostr(const char* phenostr) {
+  uint32_t first_char_code = (unsigned char)(*phenostr++);
+  // allow leading +/-
+  if ((first_char_code == 43) || (first_char_code == 45)) {
+    first_char_code = (unsigned char)(*phenostr++);
+  }
+  if (((first_char_code - 48) < 10) || (first_char_code == 44) || (first_char_code < 32)) {
+    // the last two conditions are for detecting CSV empty strings
+    return 0;
+  }
+  if (first_char_code == 46) {
+    // decimal point.  classify based on whether next character is a digit.
+    const uint32_t second_char_code = (unsigned char)phenostr[0];
+    return ((second_char_code - 48) >= 10);
+  }
+  // allow any capitalization of "NA"/"nan", but not "inf"
+  if ((first_char_code & 0xdf) != 78) {
+    return 1;
+  }
+  const uint32_t second_char_code = (unsigned char)phenostr[0];
+  if ((second_char_code & 0xdf) != 65) {
+    return 1;
+  }
+  const uint32_t third_char_code = (unsigned char)phenostr[1];
+  if ((third_char_code & 0xdf) == 78) {
+    return (((unsigned char)phenostr[2]) > ' ');
+  }
+  return (third_char_code > 32);
+}
+
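+// (Editorial note.)  Example classifications for is_categorical_phenostr():
+// "ALS" -> 1 (categorical), "-3.1" -> 0, ".5" -> 0, "NA" -> 0, "nan" -> 0
+// (any capitalization), "NAcl" -> 1 (printable text after the NA prefix),
+// "" -> 0 (CSV empty string).
+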
+uint32_t is_categorical_phenostr_nocsv(const char* phenostr) {
+  uint32_t first_char_code = (unsigned char)(*phenostr++);
+  // allow leading +/-
+  if ((first_char_code == 43) || (first_char_code == 45)) {
+    first_char_code = (unsigned char)(*phenostr++);
+  }
+  if ((first_char_code - 48) < 10) {
+    return 0;
+  }
+  if (first_char_code == 46) {
+    // decimal point.  classify based on whether next character is a digit.
+    const uint32_t second_char_code = (unsigned char)phenostr[0];
+    return ((second_char_code - 48) >= 10);
+  }
+  // allow any capitalization of "NA"/"nan", but not "inf"
+  if ((first_char_code & 0xdf) != 78) {
+    return 1;
+  }
+  const uint32_t second_char_code = (unsigned char)phenostr[0];
+  if ((second_char_code & 0xdf) != 65) {
+    return 1;
+  }
+  const uint32_t third_char_code = (unsigned char)phenostr[1];
+  if ((third_char_code & 0xdf) == 78) {
+    return (((unsigned char)phenostr[2]) > ' ');
+  }
+  return (third_char_code > 32);
+}
+
+uint32_t first_cc_or_qt_pheno_idx(const pheno_col_t* pheno_cols, uint32_t pheno_ct) {
+  for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+    if (pheno_cols[pheno_idx].type_code < kPhenoDtypeCat) {
+      return pheno_idx;
+    }
+  }
+  return 0xffffffffU;
+}
+
+uint32_t is_const_covar(const pheno_col_t* covar_col, const uintptr_t* sample_include, uint32_t sample_ct) {
+  if (sample_ct < 2) {
+    return 1;
+  }
+  uint32_t sample_uidx = next_set_unsafe(sample_include, 0);
+  if (covar_col->type_code == kPhenoDtypeQt) {
+    const double* covar_vals = covar_col->data.qt;
+    const double first_covar_val = covar_vals[sample_uidx];
+    for (uint32_t sample_idx = 1; sample_idx < sample_ct; ++sample_idx) {
+      ++sample_uidx;
+      next_set_unsafe_ck(sample_include, &sample_uidx);
+      if (covar_vals[sample_uidx] != first_covar_val) {
+	return 0;
+      }
+    }
+    return 1;
+  }
+  assert(covar_col->type_code == kPhenoDtypeCat);
+  const uint32_t* covar_vals = covar_col->data.cat;
+  const uint32_t first_covar_val = covar_vals[sample_uidx];
+  for (uint32_t sample_idx = 1; sample_idx < sample_ct; ++sample_idx) {
+    ++sample_uidx;
+    next_set_unsafe_ck(sample_include, &sample_uidx);
+    if (covar_vals[sample_uidx] != first_covar_val) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+uint32_t identify_remaining_cats(const uintptr_t* sample_include, const pheno_col_t* covar_col, uint32_t sample_ct, uintptr_t* cat_covar_wkspace) {
+  // assumes covar_col->type_code == kPhenoDtypeCat
+  const uint32_t nonnull_cat_ct = covar_col->nonnull_category_ct;
+  const uint32_t* covar_vals = covar_col->data.cat;
+  const uint32_t word_ct = 1 + (nonnull_cat_ct / kBitsPerWord);
+  fill_ulong_zero(word_ct, cat_covar_wkspace);
+  uint32_t sample_uidx = 0;
+  for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+    next_set_unsafe_ck(sample_include, &sample_uidx);
+    set_bit(covar_vals[sample_uidx], cat_covar_wkspace);
+  }
+  return popcount_longs(cat_covar_wkspace, word_ct);
+}
+
+void cleanup_pheno_cols(uint32_t pheno_ct, pheno_col_t* pheno_cols) {
+  if (pheno_cols) {
+    for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+      vecaligned_free_cond(pheno_cols[pheno_idx].nonmiss);
+    }
+    free(pheno_cols);
+  }
+}
+
+boolerr_t parse_next_range(char** argv, uint32_t param_ct, char range_delim, uint32_t* cur_param_idx_ptr, char** cur_arg_pptr, char** range_start_ptr, uint32_t* rs_len_ptr, char** range_end_ptr, uint32_t* re_len_ptr) {
+  // Starts reading from argv[cur_param_idx][cur_pos].  If a valid range is
+  // next, range_start + rs_len + range_end + re_len are updated.  If only a
+  // single item is next, range_end is set to nullptr and range_start + rs_len
+  // are updated.  If there are no items left, range_start is set to nullptr.
+  // If the input is not well-formed, 1 is returned instead of 0.
+  uint32_t cur_param_idx = *cur_param_idx_ptr;
+  if (cur_param_idx > param_ct) {
+    *cur_arg_pptr = nullptr;
+    *range_start_ptr = nullptr;
+    return 0;
+  }
+  char* cur_arg_ptr = *cur_arg_pptr;
+  while (1) {
+    char cc = *cur_arg_ptr;
+    if (!cc) {
+      *cur_param_idx_ptr = ++cur_param_idx;
+      if (cur_param_idx > param_ct) {
+	*range_start_ptr = nullptr;
+	return 0;
+      }
+      cur_arg_ptr = argv[cur_param_idx];
+      cc = *cur_arg_ptr;
+    }
+    if (cc == range_delim) {
+      return 1;
+    }
+    if (cc != ',') {
+      break;
+    }
+    ++cur_arg_ptr;
+  }
+  *range_start_ptr = cur_arg_ptr;
+  char cc;
+  do {
+    cc = *(++cur_arg_ptr);
+    if ((!cc) || (cc == ',')) {
+      *rs_len_ptr = (uintptr_t)(cur_arg_ptr - (*range_start_ptr));
+      *cur_arg_pptr = cur_arg_ptr;
+      *range_end_ptr = nullptr;
+      return 0;
+    }
+  } while (cc != range_delim);
+  *rs_len_ptr = (uintptr_t)(cur_arg_ptr - (*range_start_ptr));
+  cc = *(++cur_arg_ptr);
+  if ((!cc) || (cc == ',') || (cc == range_delim)) {
+    return 1;
+  }
+  *range_end_ptr = cur_arg_ptr;
+  do {
+    cc = *(++cur_arg_ptr);
+    if (cc == range_delim) {
+      return 1;
+    }
+  } while (cc && (cc != ','));
+  *re_len_ptr = (uintptr_t)(cur_arg_ptr - (*range_end_ptr));
+  *cur_arg_pptr = cur_arg_ptr;
+  return 0;
+}
+
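+// (Editorial note.)  Walk-through: with argv == {"--chr", "1-4", "6"},
+// param_ct == 2, and range_delim == '-', successive parse_next_range() calls
+// yield
+//   1) range_start -> "1", rs_len == 1, range_end -> "4", re_len == 1
+//   2) range_start -> "6", rs_len == 1, range_end == nullptr
+//   3) range_start == nullptr  (no items left)
+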
+pglerr_t parse_chr_ranges(const char* flagname_p, const char* errstr_append, uint32_t param_ct, uint32_t allow_extra_chrs, uint32_t xymt_subtract, char range_delim, char** argv, chr_info_t* cip, uintptr_t* chr_mask) {
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    char* cur_arg_ptr = argv[1];
+    char* range_end = nullptr;
+    uint32_t cur_param_idx = 1;
+    uint32_t rs_len = 0;
+    uint32_t re_len = 0;
+    while (1) {
+      char* range_start;
+      if (parse_next_range(argv, param_ct, range_delim, &cur_param_idx, &cur_arg_ptr, &range_start, &rs_len, &range_end, &re_len)) {
+	sprintf(g_logbuf, "Error: Invalid --%s parameter '%s'.\n", flagname_p, argv[cur_param_idx]);
+	goto parse_chr_ranges_ret_INVALID_CMDLINE_WWA;
+      }
+      if (!range_start) {
+	break;
+      }
+      const char cc = range_start[rs_len];
+      range_start[rs_len] = '\0';
+      int32_t chr_code_start = get_chr_code_raw(range_start);
+      if (chr_code_start < 0) {
+	if (!allow_extra_chrs) {
+	  sprintf(g_logbuf, "Error: Invalid --%s chromosome code '%s'.\n", flagname_p, range_start);
+	  goto parse_chr_ranges_ret_INVALID_CMDLINE_WWA;
+	}
+	if (range_end) {
+	  goto parse_chr_ranges_ret_INVALID_CMDLINE_NONSTD;
+	}
+        if (push_llstr(range_start, &(cip->incl_excl_name_stack))) {
+	  goto parse_chr_ranges_ret_NOMEM;
+	}
+      } else {
+	if (chr_code_start >= ((int32_t)kMaxContigs)) {
+	  chr_code_start -= xymt_subtract;
+	}
+	if (range_end) {
+	  const char cc2 = range_end[re_len];
+	  range_end[re_len] = '\0';
+	  int32_t chr_code_end = get_chr_code_raw(range_end);
+	  if (chr_code_end < 0) {
+	    if (!allow_extra_chrs) {
+	      sprintf(g_logbuf, "Error: Invalid --%s chromosome code '%s'.\n", flagname_p, range_end);
+	      goto parse_chr_ranges_ret_INVALID_CMDLINE_WWA;
+	    }
+	    goto parse_chr_ranges_ret_INVALID_CMDLINE_NONSTD;
+	  }
+	  if (chr_code_end >= ((int32_t)kMaxContigs)) {
+	    // prohibit stuff like "--chr par1-par2", "--chr x-y", "--chr x-26"
+	    sprintf(g_logbuf, "Error: --%s chromosome code '%s' cannot be the end of a range.\n", flagname_p, range_end);
+	    goto parse_chr_ranges_ret_INVALID_CMDLINE_WWA;
+	  }
+	  if (chr_code_end <= chr_code_start) {
+	    sprintf(g_logbuf, "Error: --%s chromosome code '%s' is not greater than '%s'.\n", flagname_p, range_end, range_start);
+	    goto parse_chr_ranges_ret_INVALID_CMDLINE_WWA;
+	  }
+	  range_end[re_len] = cc2;
+	  fill_bits_nz(chr_code_start, chr_code_end + 1, chr_mask);
+	} else {
+          set_bit(chr_code_start, chr_mask);
+	}
+      }
+      range_start[rs_len] = cc;
+    }
+    // no compelling reason to prohibit "--not-chr ,"
+  }
+  while (0) {
+  parse_chr_ranges_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  parse_chr_ranges_ret_INVALID_CMDLINE_NONSTD:
+    logerrprint("Error: Chromosome ranges cannot include nonstandard names.\n");
+    reterr = kPglRetInvalidCmdline;
+    break;
+  parse_chr_ranges_ret_INVALID_CMDLINE_WWA:
+    wordwrapb(0);
+    logerrprintb();
+    logerrprint(errstr_append);
+    reterr = kPglRetInvalidCmdline;
+    break;
+  }
+  return reterr;
+}
+
+pglerr_t parse_name_ranges(char** argv, const char* errstr_append, uint32_t param_ct, uint32_t require_posint, char range_delim, range_list_t* range_list_ptr) {
+  uint32_t name_ct = 0;
+  uint32_t cur_param_idx = 1;
+  uint32_t name_max_blen = 0;
+  char* cur_arg_ptr;
+  char* range_start;
+  uint32_t rs_len;
+  char* range_end;
+  uint32_t re_len;
+  char* cur_name_str;
+  char* dup_check;
+  unsigned char* cur_name_starts_range;
+  uint32_t last_val;
+  uint32_t cur_val;
+  // two passes.  first pass: count parameters, determine name_max_blen;
+  // then allocate memory; then fill it.
+  if (param_ct) {
+    cur_arg_ptr = argv[1];
+    while (1) {
+      if (parse_next_range(argv, param_ct, range_delim, &cur_param_idx, &cur_arg_ptr, &range_start, &rs_len, &range_end, &re_len)) {
+	LOGERRPRINTFWW("Error: Invalid %s parameter '%s'.\n", argv[0], argv[cur_param_idx]);
+	logerrprint(errstr_append);
+        return kPglRetInvalidCmdline;
+      }
+      if (!range_start) {
+	break;
+      }
+      ++name_ct;
+      if (rs_len > name_max_blen) {
+	name_max_blen = rs_len; // does NOT include trailing null yet
+      }
+      if (range_end) {
+	++name_ct;
+	if (re_len > name_max_blen) {
+	  name_max_blen = re_len;
+	}
+      }
+    }
+  }
+  if (!name_ct) {
+    LOGERRPRINTF("Error: %s requires at least one value.\n%s", argv[0], errstr_append);
+    return kPglRetInvalidCmdline;
+  }
+  range_list_ptr->name_max_blen = ++name_max_blen;
+  range_list_ptr->name_ct = name_ct;
+  if (pgl_malloc(name_ct * (((uintptr_t)name_max_blen) + 1), &range_list_ptr->names)) {
+    return kPglRetNomem;
+  }
+  range_list_ptr->starts_range = (unsigned char*)(&(range_list_ptr->names[name_ct * ((uintptr_t)name_max_blen)]));
+  cur_name_str = range_list_ptr->names;
+  cur_name_starts_range = range_list_ptr->starts_range;
+  cur_param_idx = 1;
+  cur_arg_ptr = argv[1];
+  while (1) {
+    // second pass; this can't fail since we already validated
+    parse_next_range(argv, param_ct, range_delim, &cur_param_idx, &cur_arg_ptr, &range_start, &rs_len, &range_end, &re_len);
+    if (!range_start) {
+      if (require_posint) {
+	last_val = 0;
+	for (cur_param_idx = 0; cur_param_idx < name_ct; ++cur_param_idx) {
+	  cur_name_str = &(range_list_ptr->names[cur_param_idx * ((uintptr_t)name_max_blen)]);
+	  dup_check = cur_name_str; // actually a numeric check
+	  do {
+	    if (is_not_digit(*dup_check)) {
+	      LOGERRPRINTFWW("Error: Invalid %s parameter '%s'.\n", argv[0], cur_name_str);
+	      return kPglRetInvalidCmdline;
+	    }
+	  } while (*(++dup_check));
+	  if (scan_posint_defcap(cur_name_str, &cur_val)) {
+	    LOGERRPRINTFWW("Error: Invalid %s parameter '%s'.\n", argv[0], cur_name_str);
+	    return kPglRetInvalidCmdline;
+	  }
+	  if (range_list_ptr->starts_range[cur_param_idx]) {
+	    last_val = cur_val;
+	  } else {
+	    if (cur_val <= last_val) {
+	      LOGERRPRINTFWW("Error: Invalid %s range '%s-%s'.\n", argv[0], &(range_list_ptr->names[(cur_param_idx - 1) * name_max_blen]), cur_name_str);
+	      return kPglRetInvalidCmdline;
+	    }
+	    last_val = 0;
+	  }
+	}
+      }
+      return kPglRetSuccess;
+    }
+    memcpyx(cur_name_str, range_start, rs_len, 0);
+    dup_check = range_list_ptr->names;
+    while (dup_check < cur_name_str) {
+      if (!memcmp(dup_check, cur_name_str, rs_len + 1)) {
+	LOGERRPRINTFWW("Error: Duplicate %s parameter '%s'.\n", argv[0], cur_name_str);
+	return kPglRetInvalidCmdline;
+      }
+      dup_check = &(dup_check[name_max_blen]);
+    }
+    cur_name_str = &(cur_name_str[name_max_blen]);
+    if (range_end) {
+      *cur_name_starts_range++ = 1;
+      memcpyx(cur_name_str, range_end, re_len, 0);
+      dup_check = range_list_ptr->names;
+      while (dup_check < cur_name_str) {
+	if (!memcmp(dup_check, cur_name_str, re_len + 1)) {
+	  LOGERRPRINTFWW("Error: Duplicate %s parameter '%s'.\n", argv[0], cur_name_str);
+	  return kPglRetInvalidCmdline;
+	}
+        dup_check = &(dup_check[name_max_blen]);
+      }
+      cur_name_str = &(cur_name_str[name_max_blen]);
+      *cur_name_starts_range++ = 0;
+    } else {
+      *cur_name_starts_range++ = 0;
+    }
+  }
+}
+
+void join_threads(uint32_t ctp1, pthread_t* threads) {
+  if (!(--ctp1)) {
+    return;
+  }
+#ifdef _WIN32
+  WaitForMultipleObjects(ctp1, threads, 1, INFINITE);
+  for (uint32_t uii = 0; uii < ctp1; ++uii) {
+    // fix handle leak?
+    CloseHandle(threads[uii]);
+  }
+#else
+  for (uint32_t uii = 0; uii < ctp1; ++uii) {
+    pthread_join(threads[uii], nullptr);
+  }
+#endif
+}
+
+boolerr_t spawn_threads(THREAD_FUNCPTR_T(start_routine), uintptr_t ct, pthread_t* threads) {
+  uintptr_t ulii;
+  if (ct == 1) {
+    return 0;
+  }
+  for (ulii = 1; ulii < ct; ++ulii) {
+#ifdef _WIN32
+    threads[ulii - 1] = (HANDLE)_beginthreadex(nullptr, 4096, start_routine, (void*)ulii, 0, nullptr);
+    if (!threads[ulii - 1]) {
+      join_threads(ulii, threads);
+      return 1;
+    }
+#else
+    if (pthread_create(&(threads[ulii - 1]), nullptr, start_routine, (void*)ulii)) {
+      join_threads(ulii, threads);
+      return 1;
+    }
+#endif
+  }
+  return 0;
+}
+
+// Main plink 2.0 threading framework:
+// * On all operating systems, g_is_last_thread_block indicates whether all
+//   threads should terminate upon completion of the current block.
+// * On Linux and OS X, if we aren't dealing with the final block,
+//   spawn_threads2z() also reinitializes g_thread_active_ct.
+// * On Linux and OS X, spawn_threads2z() checks if g_thread_mutex_initialized
+//   is set.  If not, it is set; g_thread_sync_mutex,
+//   g_thread_cur_block_done_condvar, and g_thread_start_next_condvar are
+//   initialized; then threads are launched.
+//   If it has, pthread_cond_broadcast() acts on g_thread_start_next_condvar.
+// * On Windows, spawn_threads2z() checks if g_thread_mutex_initialized is set.
+//   If it has not been, it is set, g_thread_start_next_event[] and
+//   g_thread_cur_block_done_events[] are initialized, and the threads are
+//   launched.  If it has, SetEvent() acts on g_thread_start_next_event[].
+//   (It used to act on only one event; then I realized that safely dealing
+//   with a manual-reset event could be a pain if the first thread finishes
+//   before the last one wakes up...)
+// * Thread functions are expected to be of the form
+//     THREAD_FUNC_DECL function_name(void* arg) {
+//       uintptr_t tidx = (uintptr_t)arg;
+//       ...
+//       while (1) {
+//         ... // process current block
+//         if (g_is_last_thread_block) {
+//           THREAD_RETURN;
+//         }
+//         THREAD_BLOCK_FINISH(tidx);
+//       }
+//     }
+// * On Linux and OS X, THREAD_BLOCK_FINISH() acquires a mutex, decrements
+//   g_thread_active_ct, calls pthread_cond_signal() on
+//   g_thread_cur_block_done_condvar iff g_thread_active_ct is now zero, then
+//   unconditionally calls pthread_cond_wait on g_thread_start_next_condvar and
+//   the mutex.
+// * On Windows, THREAD_BLOCK_FINISH() calls SetEvent() on
+//   g_thread_cur_block_done_events[tidx], then waits on
+//   g_thread_start_next_event[tidx].
+// * If the termination variable is set, join_threads2z() waits for all threads
+//   to complete, then cleans up all multithreading objects.  Otherwise, on
+//   Linux and OS X, it acquires the mutex and calls pthread_cond_wait() on
+//   g_thread_cur_block_done_condvar and the mutex; and on Windows, it calls
+//   WaitForMultipleObjects() on g_thread_cur_block_done_events[].
+//   WaitForMultipleObjects has a 64 object limit, and for now it doesn't seem
+//   too important to use a for loop to handle more objects?... well, we can
+//   add that if anyone wants it, but for now the Windows thread limit is 64.
+
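+// (Editorial sketch, not part of the upstream source.)  The matching
+// caller-side pattern, simplified -- real callers overlap loading the next
+// block with computation on the current one, and worker_thread is a
+// hypothetical name:
+//
+//   uint32_t is_last_block = 0;
+//   do {
+//     // ... load the next block of input ...
+//     is_last_block = (nothing left after this block);
+//     if (spawn_threads2z(worker_thread, calc_thread_ct, is_last_block,
+//                         threads)) {
+//       // handle thread-creation failure
+//     }
+//     join_threads2z(calc_thread_ct, is_last_block, threads);
+//   } while (!is_last_block);
+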
+uintptr_t g_thread_spawn_ct;
+uint32_t g_is_last_thread_block = 0;
+#ifdef _WIN32
+HANDLE g_thread_start_next_event[kMaxThreads];
+HANDLE g_thread_cur_block_done_events[kMaxThreads];
+#else
+pthread_attr_t g_smallstack_thread_attr;
+static pthread_mutex_t g_thread_sync_mutex;
+static pthread_cond_t g_thread_cur_block_done_condvar;
+static pthread_cond_t g_thread_start_next_condvar;
+static uint32_t g_thread_active_ct;
+
+void THREAD_BLOCK_FINISH(__attribute__((unused)) uintptr_t tidx) {
+  const uintptr_t initial_spawn_ct = g_thread_spawn_ct;
+  pthread_mutex_lock(&g_thread_sync_mutex);
+  if (!(--g_thread_active_ct)) {
+    pthread_cond_signal(&g_thread_cur_block_done_condvar);
+  }
+  while (g_thread_spawn_ct == initial_spawn_ct) {
+    // spurious wakeup guard
+    pthread_cond_wait(&g_thread_start_next_condvar, &g_thread_sync_mutex);
+  }
+  pthread_mutex_unlock(&g_thread_sync_mutex);
+}
+#endif
+static uint32_t g_thread_mutex_initialized = 0;
+
+void join_threads2z(uint32_t ct, uint32_t is_last_block, pthread_t* threads) {
+#ifdef _WIN32
+  if (!is_last_block) {
+    WaitForMultipleObjects(ct, g_thread_cur_block_done_events, 1, INFINITE);
+  } else {
+    WaitForMultipleObjects(ct, threads, 1, INFINITE);
+    for (uint32_t uii = 0; uii < ct; ++uii) {
+      // fix handle leak?
+      CloseHandle(threads[uii]);
+      
+      CloseHandle(g_thread_start_next_event[uii]);
+      CloseHandle(g_thread_cur_block_done_events[uii]);
+    }
+    g_thread_mutex_initialized = 0;
+  }
+#else
+  if (!is_last_block) {
+    pthread_mutex_lock(&g_thread_sync_mutex);
+    while (g_thread_active_ct) {
+      pthread_cond_wait(&g_thread_cur_block_done_condvar, &g_thread_sync_mutex);
+    }
+    // keep mutex until next block loaded
+  } else {
+    for (uint32_t uii = 0; uii < ct; ++uii) {
+      pthread_join(threads[uii], nullptr);
+    }
+    // slightly inefficient if there are multiple multithreaded commands being
+    // run, but if different commands require different numbers of threads,
+    // optimizing this sort of thing away could introduce bugs...
+    pthread_mutex_destroy(&g_thread_sync_mutex);
+    pthread_cond_destroy(&g_thread_cur_block_done_condvar);
+    pthread_cond_destroy(&g_thread_start_next_condvar);
+    g_thread_mutex_initialized = 0;
+  }
+#endif
+}
+
+boolerr_t spawn_threads2z(THREAD_FUNCPTR_T(start_routine), uintptr_t ct, uint32_t is_last_block, pthread_t* threads) {
+  // start_routine() might need this
+  if (g_is_last_thread_block != is_last_block) {
+    // might save us an unnecessary memory write that confuses the cache
+    // coherency logic?
+    g_is_last_thread_block = is_last_block;
+  }
+#ifdef _WIN32
+  if (!g_thread_mutex_initialized) {
+    g_thread_spawn_ct = 0;
+    g_thread_mutex_initialized = 1;
+    for (uintptr_t ulii = 0; ulii < ct; ++ulii) {
+      g_thread_start_next_event[ulii] = CreateEvent(nullptr, FALSE, FALSE, nullptr);
+      g_thread_cur_block_done_events[ulii] = CreateEvent(nullptr, FALSE, FALSE, nullptr);
+    }
+    for (uintptr_t ulii = 0; ulii < ct; ++ulii) {
+      threads[ulii] = (HANDLE)_beginthreadex(nullptr, kDefaultThreadStack, start_routine, (void*)ulii, 0, nullptr);
+      if (!threads[ulii]) {
+	if (ulii) {
+	  join_threads2z(ulii, is_last_block, threads);
+	  if (!is_last_block) {
+	    // fix handle leak?
+	    for (uintptr_t uljj = 0; uljj < ulii; ++uljj) {
+	      CloseHandle(threads[uljj]);
+	    }
+	  }
+	}
+	if ((!is_last_block) || (!ulii)) {
+	  for (uint32_t uii = 0; uii < ct; ++uii) {
+	    CloseHandle(g_thread_start_next_event[uii]);
+	    CloseHandle(g_thread_cur_block_done_events[uii]);
+	  }
+	  g_thread_mutex_initialized = 0;
+	}
+	return 1;
+      }
+    }
+  } else {
+    g_thread_spawn_ct++;
+    for (uintptr_t ulii = 0; ulii < ct; ++ulii) {
+      SetEvent(g_thread_start_next_event[ulii]);
+    }
+  }
+#else
+  if (!is_last_block) {
+    g_thread_active_ct = ct;
+  }
+  if (!g_thread_mutex_initialized) {
+    g_thread_spawn_ct = 0; // tidx 0 may need to know modulus
+    g_thread_mutex_initialized = 1;
+    if (pthread_mutex_init(&g_thread_sync_mutex, nullptr) ||
+        pthread_cond_init(&g_thread_cur_block_done_condvar, nullptr) ||
+        pthread_cond_init(&g_thread_start_next_condvar, nullptr)) {
+      return 1;
+    }
+    for (uintptr_t ulii = 0; ulii < ct; ++ulii) {
+      if (pthread_create(&(threads[ulii]), nullptr, start_routine, (void*)ulii)) {
+	if (ulii) {
+	  join_threads2z(ulii, is_last_block, threads);
+	  if (!is_last_block) {
+	    // not worth the trouble of demanding that all callers handle
+	    // pthread_create() failure cleanly
+	    // (in contrast, error_cleanup_threads2z is relevant when an input
+	    // .pgen is malformed, which could happen a lot)
+	    for (uintptr_t uljj = 0; uljj < ulii; ++uljj) {
+	      pthread_cancel(threads[uljj]);
+	    }
+	  }
+	}
+	if ((!is_last_block) || (!ulii)) {
+	  pthread_mutex_destroy(&g_thread_sync_mutex);
+	  pthread_cond_destroy(&g_thread_cur_block_done_condvar);
+	  pthread_cond_destroy(&g_thread_start_next_condvar);
+	  g_thread_mutex_initialized = 0;
+	}
+	return 1;
+      }
+    }
+  } else {
+    ++g_thread_spawn_ct;
+    // still holding mutex
+    pthread_mutex_unlock(&g_thread_sync_mutex);
+    pthread_cond_broadcast(&g_thread_start_next_condvar);
+  }
+#endif
+  return 0;
+}
+
+void error_cleanup_threads2z(THREAD_FUNCPTR_T(start_routine), uintptr_t ct, pthread_t* threads) {
+  if (!spawn_threads2z(start_routine, ct, 1, threads)) {
+    join_threads2z(ct, 1, threads);
+  }
+}
+
+
+// multithread globals
+static const uintptr_t* g_subset_mask = nullptr;
+static char** g_item_ids;
+static uint32_t* g_id_htable = nullptr;
+
+// currently by item_idx, not item_uidx
+static uint32_t* g_item_id_hashes = nullptr;
+static uint32_t g_item_ct = 0;
+static uint32_t g_id_htable_size = 0;
+static uint32_t g_calc_thread_ct = 0;
+static uint32_t g_item_uidx_starts[16];
+
+THREAD_FUNC_DECL calc_id_hash_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uintptr_t* subset_mask = g_subset_mask;
+  char** item_ids = g_item_ids;
+  uint32_t* item_id_hashes = g_item_id_hashes;
+  const uint32_t id_htable_size = g_id_htable_size;
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  const uint32_t fill_start = round_down_pow2((id_htable_size * ((uint64_t)tidx)) / calc_thread_ct, kInt32PerCacheline);
+  uint32_t fill_end;
+  if (tidx + 1 < calc_thread_ct) {
+    fill_end = round_down_pow2((id_htable_size * (((uint64_t)tidx) + 1)) / calc_thread_ct, kInt32PerCacheline);
+  } else {
+    fill_end = id_htable_size;
+  }
+  fill_uint_one(fill_end - fill_start, &(g_id_htable[fill_start]));
+
+  const uint32_t item_ct = g_item_ct;
+  const uint32_t item_idx_end = (item_ct * (((uint64_t)tidx) + 1)) / calc_thread_ct;
+  uint32_t item_uidx = g_item_uidx_starts[tidx];
+  for (uint32_t item_idx = (item_ct * ((uint64_t)tidx)) / calc_thread_ct; item_idx < item_idx_end; ++item_idx, ++item_uidx) {
+    next_set_unsafe_ck(subset_mask, &item_uidx);
+    const char* sptr = item_ids[item_uidx];
+    const uint32_t slen = strlen(sptr);
+    item_id_hashes[item_idx] = hashceil(sptr, slen, id_htable_size);
+  }
+  THREAD_RETURN;
+}
+
+pglerr_t populate_id_htable_mt(const uintptr_t* subset_mask, char** item_ids, uintptr_t item_ct, uint32_t store_all_dups, uint32_t id_htable_size, uint32_t thread_ct, uint32_t* id_htable) {
+  // Change from plink 1.9: if store_all_dups is false, we don't error out on
+  // the first encountered duplicate ID; instead, we just flag it in the hash
+  // table.  So if '.' is the only duplicate ID, and it never appears in a
+  // variant ID list, plink2 never complains.
+  //
+  // When store_all_dups is true, additional linked lists are allocated past
+  // the end of id_htable to track all raw indexes of duplicate names.
+  if (!item_ct) {
+    return kPglRetSuccess;
+  }
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    // this seems to be a sweet spot
+    if (thread_ct > 16) {
+      thread_ct = 16;
+    }
+    if (thread_ct > item_ct / 65536) {
+      thread_ct = item_ct / 65536;
+      if (!thread_ct) {
+	thread_ct = 1;
+      }
+    }
+    if (bigstack_end_alloc_ui(item_ct, &g_item_id_hashes)) {
+      goto populate_id_htable_mt_ret_NOMEM;
+    }
+    g_subset_mask = subset_mask;
+    g_item_ids = item_ids;
+    g_id_htable = id_htable;
+    g_item_ct = item_ct;
+    g_id_htable_size = id_htable_size;
+    g_calc_thread_ct = thread_ct;
+    pthread_t threads[16];
+    {
+      uint32_t item_uidx = next_set_unsafe(subset_mask, 0);
+      uint32_t item_idx = 0;
+      g_item_uidx_starts[0] = item_uidx;
+      for (uintptr_t tidx = 1; tidx < thread_ct; ++tidx) {
+	const uint32_t item_idx_new = (item_ct * ((uint64_t)tidx)) / thread_ct;
+	item_uidx = jump_forward_set_unsafe(subset_mask, item_uidx + 1, item_idx_new - item_idx);
+	g_item_uidx_starts[tidx] = item_uidx;
+	item_idx = item_idx_new;
+      }
+    }
+    if (spawn_threads(calc_id_hash_thread, thread_ct, threads)) {
+      goto populate_id_htable_mt_ret_THREAD_CREATE_FAIL;
+    }
+    calc_id_hash_thread((void*)0);
+    join_threads(thread_ct, threads);
+    // could also partial-sort and actually fill the hash table in a
+    // multithreaded manner, but I'll postpone that for now since it's tricky
+    // to make that work with duplicate ID handling, and it also is a
+    // substantially smaller bottleneck than hash value computation.
+    uint32_t item_uidx = 0;
+    if (!store_all_dups) {
+      for (uint32_t item_idx = 0; item_idx < item_ct; ++item_uidx, ++item_idx) {
+	next_set_unsafe_ck(subset_mask, &item_uidx);
+	uint32_t hashval = g_item_id_hashes[item_idx];
+	uint32_t cur_htable_entry = id_htable[hashval];
+	if (cur_htable_entry == 0xffffffffU) {
+	  id_htable[hashval] = item_uidx;
+	} else {
+	  const char* sptr = item_ids[item_uidx];
+	  while (1) {
+	    // could also use memcmp, guaranteed to be safe due to where
+	    // variant IDs are allocated
+	    if (!strcmp(sptr, item_ids[cur_htable_entry & 0x7fffffff])) {
+	      if (!(cur_htable_entry >> 31)) {
+	        id_htable[hashval] |= 0x80000000U;
+	      }
+	      break;
+	    }
+	    if (++hashval == id_htable_size) {
+	      hashval = 0;
+	    }
+	    cur_htable_entry = id_htable[hashval];
+	    if (cur_htable_entry == 0xffffffffU) {
+	      id_htable[hashval] = item_uidx;
+	      break;
+	    }
+	  }
+	}
+      }
+    } else {
+      const uintptr_t cur_bigstack_left = bigstack_left();
+      uint32_t max_extra_alloc_m4;
+#ifdef __LP64__
+      if (cur_bigstack_left >= 0x400000000LLU) {
+	// the extra-allocation limit below can never be hit in this case
+	max_extra_alloc_m4 = 0xfffffffaU;
+      } else {
+#endif
+	if (cur_bigstack_left < 4 * sizeof(int32_t)) {
+	  goto populate_id_htable_mt_ret_NOMEM;
+	}
+	max_extra_alloc_m4 = (cur_bigstack_left / sizeof(int32_t)) - 4;
+#ifdef __LP64__
+      }
+#endif
+      uint32_t extra_alloc = 0;
+      uint32_t prev_llidx = 0;
+      // needs to be synced with extract_exclude_flag_norange()
+      // multithread this?
+      uint32_t* htable_dup_base = (uint32_t*)g_bigstack_base;
+      for (uint32_t item_idx = 0; item_idx < item_ct; ++item_uidx, ++item_idx) {
+	next_set_unsafe_ck(subset_mask, &item_uidx);
+	uint32_t hashval = g_item_id_hashes[item_idx];
+	uint32_t cur_htable_entry = id_htable[hashval];
+	if (cur_htable_entry == 0xffffffffU) {
+	  id_htable[hashval] = item_uidx;
+	} else {
+	  const char* sptr = item_ids[item_uidx];
+	  while (1) {
+	    const uint32_t cur_dup = cur_htable_entry >> 31;
+	    uint32_t prev_uidx;
+	    if (cur_dup) {
+	      prev_llidx = cur_htable_entry * 2;
+	      prev_uidx = htable_dup_base[prev_llidx];
+	    } else {
+	      prev_uidx = cur_htable_entry;
+	    }
+	    if (!strcmp(sptr, item_ids[prev_uidx])) {
+	      if (extra_alloc > max_extra_alloc_m4) {
+		goto populate_id_htable_mt_ret_NOMEM;
+	      }
+	      // point to linked list entry instead
+	      if (!cur_dup) {
+		htable_dup_base[extra_alloc] = cur_htable_entry;
+		htable_dup_base[extra_alloc + 1] = 0xffffffffU; // list end
+		prev_llidx = extra_alloc;
+		extra_alloc += 2;
+	      }
+	      htable_dup_base[extra_alloc] = item_uidx;
+	      htable_dup_base[extra_alloc + 1] = prev_llidx;
+	      id_htable[hashval] = 0x80000000U | (extra_alloc >> 1);
+	      extra_alloc += 2;
+	      break; // bugfix
+	    }
+	    if (++hashval == id_htable_size) {
+	      hashval = 0;
+	    }
+	    cur_htable_entry = id_htable[hashval];
+	    if (cur_htable_entry == 0xffffffffU) {
+	      id_htable[hashval] = item_uidx;
+	      break;
+	    }
+	  }
+	}
+      }
+      if (extra_alloc) {
+	// bugfix: forgot to align this
+	bigstack_alloc_raw_rd(extra_alloc * sizeof(int32_t));
+      }
+    }
+  }
+  while (0) {
+  populate_id_htable_mt_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  populate_id_htable_mt_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+  bigstack_end_reset(bigstack_end_mark);
+  return reterr;
+}
+
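+// (Editorial sketch, not part of the upstream source.)  Probing the table
+// built above in the store_all_dups == 0 case; id_htable_find is a
+// hypothetical name.  The high bit of an entry flags a duplicated ID, and
+// the low 31 bits still index a representative item:
+//
+//   // returns 0xffffffffU if absent, 0xfffffffeU if duplicated
+//   uint32_t id_htable_find(const char* query, char** item_ids,
+//                           const uint32_t* id_htable,
+//                           uint32_t id_htable_size, uint32_t query_slen) {
+//     uint32_t hashval = hashceil(query, query_slen, id_htable_size);
+//     while (1) {
+//       const uint32_t cur_entry = id_htable[hashval];
+//       if (cur_entry == 0xffffffffU) {
+//         return 0xffffffffU;
+//       }
+//       if (!strcmp(query, item_ids[cur_entry & 0x7fffffff])) {
+//         return (cur_entry >> 31)? 0xfffffffeU : cur_entry;
+//       }
+//       if (++hashval == id_htable_size) {
+//         hashval = 0;
+//       }
+//     }
+//   }
+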
+pglerr_t alloc_and_populate_id_htable_mt(const uintptr_t* subset_mask, char** item_ids, uintptr_t item_ct, uint32_t max_thread_ct, uint32_t** id_htable_ptr, uint32_t** htable_dup_base_ptr, uint32_t* id_htable_size_ptr) {
+  uint32_t id_htable_size = get_htable_fast_size(item_ct);
+  // 4 bytes per variant for hash buffer
+  // if store_all_dups, up to 8 bytes per variant in extra_alloc for duplicate
+  //   tracking
+  const uint32_t store_all_dups = (htable_dup_base_ptr != nullptr);
+  const uintptr_t nonhtable_alloc = round_up_pow2(item_ct * sizeof(int32_t), kCacheline) + store_all_dups * round_up_pow2(item_ct * 2 * sizeof(int32_t), kCacheline);
+  uintptr_t max_bytes = round_down_pow2(bigstack_left(), kCacheline);
+  // force max_bytes >= 5 so leqprime() doesn't fail
+  if (nonhtable_alloc + (item_ct + 6) * sizeof(int32_t) > max_bytes) {
+    return kPglRetNomem;
+  }
+  max_bytes -= nonhtable_alloc;
+  if (id_htable_size * sizeof(int32_t) > max_bytes) {
+    id_htable_size = leqprime((max_bytes / sizeof(int32_t)) - 1);
+    const uint32_t min_htable_size = get_htable_min_size(item_ct);
+    if (id_htable_size < min_htable_size) {
+      id_htable_size = min_htable_size;
+    }
+  }
+  *id_htable_ptr = (uint32_t*)bigstack_alloc_raw_rd(id_htable_size * sizeof(int32_t));
+  if (store_all_dups) {
+    *htable_dup_base_ptr = &((*id_htable_ptr)[round_up_pow2(id_htable_size, kInt32PerCacheline)]);
+  }
+  *id_htable_size_ptr = id_htable_size;
+  return populate_id_htable_mt(subset_mask, item_ids, item_ct, store_all_dups, id_htable_size, max_thread_ct, *id_htable_ptr);
+}
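+
+// Illustrative sketch (not part of the upstream code) of walking the
+// duplicate-tracking structure populated above.  A hash-table entry with the
+// top bit set stores (llidx >> 1); multiplying the full 32-bit entry by 2
+// wraps mod 2^32, dropping that top bit and recovering llidx.  Each
+// htable_dup_base node is a {item_uidx, prev_llidx} pair, and the chain is
+// terminated by a prev_llidx of 0xffffffffU.
+//   uint32_t entry = id_htable[hashval];
+//   if (entry != 0xffffffffU) {
+//     if (entry & 0x80000000U) {
+//       for (uint32_t llidx = entry * 2; llidx != 0xffffffffU;
+//            llidx = htable_dup_base[llidx + 1]) {
+//         handle_id(htable_dup_base[llidx]); // hypothetical per-ID handler
+//       }
+//     } else {
+//       handle_id(entry); // non-duplicate entries store item_uidx directly
+//     }
+//   }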
+
+pglerr_t multithread_load_init(const uintptr_t* variant_include, uint32_t sample_ct, uint32_t variant_ct, uintptr_t pgr_alloc_cacheline_ct, uintptr_t thread_xalloc_cacheline_ct, uintptr_t per_variant_xalloc_byte_ct, pgen_file_info_t* pgfip, uint32_t* calc_thread_ct_ptr, uintptr_t*** genovecs_ptr, uintptr_t*** dosage_present_ptr, dosage_t*** dosage_val_bufs_ptr, uint32_t* read_block_size_ptr, unsigned char** main_loadbufs, pthread_t** threads_ptr, pgen_reader_t*** pgr_pps, uint32_t** read [...]
+  uintptr_t cachelines_avail = bigstack_left() / kCacheline;
+  uint32_t read_block_size = kPglVblockSize;
+  uint64_t multiread_cacheline_ct;
+  while (1) {
+    multiread_cacheline_ct = pgfi_multiread_get_cacheline_req(variant_include, pgfip, variant_ct, read_block_size);
+    // limit each raw load buffer to 1/4 of remaining workspace
+    // if there's an additional per-variant allocation, put it in the same bin
+    // as the load buffers
+    if ((multiread_cacheline_ct + (((uint64_t)per_variant_xalloc_byte_ct) * read_block_size) / kCacheline) * 4 <= cachelines_avail) {
+      break;
+    }
+    // lots of callers require read_block_size to be either raw_variant_ct or a
+    // multiple of kBitsPerVec
+#ifdef __LP64__
+    if (read_block_size <= kBitsPerVec) {
+      return kPglRetNomem;
+    }
+#else
+    if (read_block_size <= kCacheline) {
+      return kPglRetNomem;
+    }
+#endif
+    read_block_size /= 2;
+  }
+#ifndef __LP64__
+  if (multiread_cacheline_ct > (kMaxBytesPerIO / kCacheline)) {
+    return kPglRetNomem;
+  }
+#endif
+  main_loadbufs[0] = bigstack_alloc_raw(multiread_cacheline_ct * kCacheline);
+  main_loadbufs[1] = bigstack_alloc_raw(multiread_cacheline_ct * kCacheline);
+  pgfip->block_base = main_loadbufs[0];
+  *read_block_size_ptr = read_block_size;
+  cachelines_avail -= 2 * (multiread_cacheline_ct + (((uint64_t)per_variant_xalloc_byte_ct) * read_block_size) / kCacheline);
+  // reduce calc_thread_ct if necessary
+  uint32_t calc_thread_ct = *calc_thread_ct_ptr;
+  if (calc_thread_ct > read_block_size) {
+    calc_thread_ct = read_block_size;
+    *calc_thread_ct_ptr = calc_thread_ct;
+  }
+
+  // cacheline budget: pgr_pps, threads_ptr, read_variant_uidx_starts_ptr,
+  //   the (*pgr_pps)[tidx] structs, and each thread's pgr_alloc buffer;
+  //   deliberately a slight overestimate
+  const uintptr_t pgr_struct_alloc = round_up_pow2(sizeof(pgen_reader_t), kCacheline);
+  uintptr_t thread_alloc_cacheline_ct = 1 + 1 + 1 + (pgr_struct_alloc / kCacheline) + pgr_alloc_cacheline_ct + thread_xalloc_cacheline_ct;
+
+  const uint32_t sample_ctcl2 = QUATERCT_TO_CLCT(sample_ct);
+  const uint32_t sample_ctcl = BITCT_TO_CLCT(sample_ct);
+  
+  // todo: increase in multiallelic case
+  const uintptr_t dosage_vals_cl = DIV_UP(sample_ct, (kCacheline / sizeof(dosage_t)));
+  if (genovecs_ptr) {
+    thread_alloc_cacheline_ct += 1 + sample_ctcl2;
+    if (dosage_present_ptr) {
+      assert(dosage_val_bufs_ptr);
+      thread_alloc_cacheline_ct += 2 + sample_ctcl + dosage_vals_cl;
+    }
+  }
+  if (thread_alloc_cacheline_ct * calc_thread_ct > cachelines_avail) {
+    if (thread_alloc_cacheline_ct > cachelines_avail) {
+      return kPglRetNomem;
+    }
+    calc_thread_ct = cachelines_avail / thread_alloc_cacheline_ct;
+    *calc_thread_ct_ptr = calc_thread_ct;
+  }
+
+  const uint32_t array_of_ptrs_alloc = round_up_pow2(calc_thread_ct * sizeof(intptr_t), kCacheline);
+  *pgr_pps = (pgen_reader_t**)bigstack_alloc_raw(array_of_ptrs_alloc);
+  *threads_ptr = (pthread_t*)bigstack_alloc_raw(array_of_ptrs_alloc);
+  *read_variant_uidx_starts_ptr = (uint32_t*)bigstack_alloc_raw_rd(calc_thread_ct * sizeof(int32_t));
+  for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+    (*pgr_pps)[tidx] = (pgen_reader_t*)bigstack_alloc_raw(pgr_struct_alloc);
+    // pgr_preinit(g_pgr_ptrs[tidx]);
+    unsigned char* pgr_alloc = bigstack_alloc_raw(pgr_alloc_cacheline_ct * kCacheline);
+
+    // shouldn't be possible for this to fail
+    pgr_init(nullptr, 0, pgfip, (*pgr_pps)[tidx], pgr_alloc);
+  }
+  if (genovecs_ptr) {
+    *genovecs_ptr = (uintptr_t**)bigstack_alloc_raw(array_of_ptrs_alloc);
+    if (dosage_present_ptr) {
+      *dosage_present_ptr = (uintptr_t**)bigstack_alloc_raw(array_of_ptrs_alloc);
+      *dosage_val_bufs_ptr = (dosage_t**)bigstack_alloc_raw(array_of_ptrs_alloc);
+    }
+    const uintptr_t genovec_alloc = sample_ctcl2 * kCacheline;
+    const uintptr_t dosage_present_alloc = sample_ctcl * kCacheline;
+    const uintptr_t dosage_vals_alloc = dosage_vals_cl * kCacheline;
+    for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+      (*genovecs_ptr)[tidx] = (uintptr_t*)bigstack_alloc_raw(genovec_alloc);
+      if (dosage_present_ptr) {
+	(*dosage_present_ptr)[tidx] = (uintptr_t*)bigstack_alloc_raw(dosage_present_alloc);
+	(*dosage_val_bufs_ptr)[tidx] = (dosage_t*)bigstack_alloc_raw(dosage_vals_alloc);
+      }
+    }
+  }
+  return kPglRetSuccess;
+}
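+
+// Illustrative call sketch (hypothetical variable names; assumes the
+// truncated final parameter above is read_variant_uidx_starts_ptr, which the
+// function body dereferences):
+//   uint32_t calc_thread_ct = max_thread_ct;
+//   uintptr_t** genovecs;
+//   uint32_t read_block_size;
+//   unsigned char* main_loadbufs[2];
+//   pthread_t* threads;
+//   pgen_reader_t** pgr_pps;
+//   uint32_t* read_variant_uidx_starts;
+//   if (multithread_load_init(variant_include, sample_ct, variant_ct,
+//           pgr_alloc_cacheline_ct, 0, 0, pgfip, &calc_thread_ct, &genovecs,
+//           nullptr, nullptr, &read_block_size, main_loadbufs, &threads,
+//           &pgr_pps, &read_variant_uidx_starts)) {
+//     goto my_function_ret_NOMEM; // hypothetical error label
+//   }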
+
+pglerr_t write_sample_ids(const uintptr_t* sample_include, const char* sample_ids, const char* sids, const char* outname, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen) {
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto write_sample_ids_ret_OPEN_FAIL;
+    }
+    char* textbuf = g_textbuf;
+    char* write_iter = textbuf;
+    char* textbuf_flush = &(textbuf[kMaxMediumLine]);
+    uintptr_t sample_uidx = 0;
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_ul_unsafe_ck(sample_include, &sample_uidx);
+      write_iter = strcpya(write_iter, &(sample_ids[sample_uidx * max_sample_id_blen]));
+      if (sids) {
+	*write_iter++ = '\t';
+	write_iter = strcpya(write_iter, &(sids[sample_uidx * max_sid_blen]));
+      }
+      append_binary_eoln(&write_iter);
+      if (write_iter >= textbuf_flush) {
+	if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	  goto write_sample_ids_ret_WRITE_FAIL;
+	}
+	write_iter = textbuf;
+      }
+    }
+    if (write_iter > textbuf) {
+      if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	goto write_sample_ids_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto write_sample_ids_ret_WRITE_FAIL;
+    }
+  }
+  while (0) {
+  write_sample_ids_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  write_sample_ids_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  }
+  fclose_cond(outfile);
+  return reterr;
+}
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/plink2_common.h b/plink2_common.h
new file mode 100644
index 0000000..6f598d5
--- /dev/null
+++ b/plink2_common.h
@@ -0,0 +1,2508 @@
+#ifndef __PLINK2_COMMON_H__
+#define __PLINK2_COMMON_H__
+
+// This library is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation, either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+// Resources needed across a variety of plink2 modules.  Now includes
+// initialization code (init_chr_info_human, init_logfile, init_bigstack) to
+// simplify inclusion in other programs.
+
+#include "pgenlib_internal.h"
+
+#include <math.h>
+#include <stddef.h>
+
+#ifndef _WIN32
+  #include <sys/stat.h>
+#endif
+
+#ifdef _WIN32
+  #include <process.h>
+#else
+  #include <pthread.h>
+#endif
+
+#ifdef __cplusplus
+  #include <algorithm>
+  #ifdef _WIN32
+    // Windows C++11 <algorithm> resets these values :(
+    #undef PRIu64
+    #undef PRId64
+    #define PRIu64 "I64u"
+    #define PRId64 "I64d"
+    #undef PRIuPTR
+    #undef PRIdPTR
+    #ifdef __LP64__
+      #define PRIuPTR PRIu64
+      #define PRIdPTR PRId64
+    #else
+      #if __cplusplus < 201103L
+	#define PRIuPTR "lu"
+	#define PRIdPTR "ld"
+      #else
+	#define PRIuPTR "u"
+	#define PRIdPTR "d"
+      #endif
+    #endif
+  #endif
+#endif
+
+#ifdef DYNAMIC_MKL
+  #define USE_MKL
+#endif
+
+#ifdef USE_MKL
+  #ifdef __APPLE__
+    #error "plink2 cannot currently use MKL on OS X."
+  #endif
+  #ifdef LAPACK_ILP64
+    #define MKL_ILP64
+  #endif
+  #ifdef DYNAMIC_MKL
+    #include <mkl_service.h>
+  #else
+    #include "/opt/intel/mkl/include/mkl_service.h"
+  #endif
+  #define USE_MTBLAS
+  #define BLAS_SET_NUM_THREADS mkl_set_num_threads
+#else
+  #ifdef USE_OPENBLAS
+    #ifdef __cplusplus
+extern "C" {
+    #endif
+      void openblas_set_num_threads(int num_threads);
+    #ifdef __cplusplus
+} // extern "C"
+    #endif
+    #define USE_MTBLAS
+    #define BLAS_SET_NUM_THREADS openblas_set_num_threads
+  #else
+    #define BLAS_SET_NUM_THREADS(num)
+  #endif
+#endif
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+#define PROG_NAME_STR "plink2"
+
+// leave the door open to 32-bit dosages (or 31?  24?)
+typedef uint16_t dosage_t;
+typedef uint32_t dosage_prod_t;
+#define kDosageMax (1U << (8 * sizeof(dosage_t) - 1))
+CONSTU31(kDosageMid, kDosageMax / 2);
+CONSTU31(kDosage4th, kDosageMax / 4);
+static const double kRecipDosageMax = 0.000030517578125;
+static const double kRecipDosageMid = 0.00006103515625;
+static const float kRecipDosageMidf = 0.00006103515625;
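+
+// Worked example: with dosage_t = uint16_t, kDosageMax = 1 << 15 = 32768, so
+// a diploid ALT dosage of 2.0 is stored as kDosageMax, 1.0 as kDosageMid
+// (16384), and 0.5 as kDosage4th (8192); converting back is one multiply:
+//   double dxx = ((int32_t)dosage_val) * kRecipDosageMid; // 16384 -> 1.0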
+
+// this is a bit arbitrary
+CONSTU31(kMaxPhenoCt, 524287);
+#define MAX_PHENO_CT_STR "524287"
+
+// It is unnecessary to write e.g. (1LLU << 0) below, since the FLAGSET64
+// macros should force the integer type to 64-bit.
+FLAGSET64_DEF_START()
+  kfMisc0,
+  kfMiscAffection01 = (1 << 0),
+  kfMiscAllowExtraChrs = (1 << 1),
+  kfMiscRealRefAlleles = (1 << 2),
+  kfMiscMajRef = (1 << 3),
+  kfMiscMajRefForce = (1 << 4),
+  kfMiscNonfounders = (1 << 5),
+  kfMiscExtractRange = (1 << 6),
+  kfMiscExcludeRange = (1 << 7),
+  kfMiscKeepfileSid = (1 << 8),
+  kfMiscRemovefileSid = (1 << 9),
+  kfMiscKeepAutoconv = (1 << 10),
+  kfMiscDoubleId = (1 << 11),
+  kfMiscBiallelicOnly = (1 << 12),
+  kfMiscBiallelicOnlyStrict = (1 << 13),
+  kfMiscBiallelicOnlyList = (1 << 14),
+  kfMiscExcludePvarFilterFail = (1 << 15),
+  kfMiscVcfRequireGt = (1 << 16),
+  kfMiscAutosomePar = (1 << 17),
+  kfMiscAutosomeOnly = (1 << 18),
+  kfMiscMergePar = (1 << 19),
+  kfMiscAllowNoSamples = (1 << 20),
+  kfMiscAllowNoVars = (1 << 21),
+  kfMiscHweMidp = (1 << 22),
+  kfMiscHweKeepFewhet = (1 << 23),
+  kfMiscWriteSnplistZs = (1 << 24),
+  kfMiscMafSucc = (1 << 25),
+  kfMiscGenoDosage = (1 << 26),
+  kfMiscGenoHhMissing = (1 << 27),
+  kfMiscMindDosage = (1 << 28),
+  kfMiscMindHhMissing = (1 << 29),
+  kfMiscGenotypingRateDosage = (1 << 30),
+  kfMiscSetMissingVarIds = (1LLU << 31),
+  kfMiscChrOverrideCmdline = (1LLU << 32),
+  kfMiscChrOverrideFile = (1LLU << 33),
+  kfMiscNewVarIdOverflowMissing = (1LLU << 34),
+  kfMiscNewVarIdOverflowTruncate = (1LLU << 35),
+  kfMiscRequirePheno = (1LLU << 36),
+  kfMiscRequireCovar = (1LLU << 37),
+  kfMiscCatPhenoFamily = (1LLU << 38)
+FLAGSET64_DEF_END(misc_flags_t);
+
+FLAGSET64_DEF_START()
+  kfExportf0,
+  kfExportf01 = (1 << 0),
+  kfExportf12 = (1 << 1),
+  kfExportfSpaces = (1 << 2),
+  kfExportfRefFirst = (1 << 3),
+  kfExportf23 = (1 << 4),
+  kfExportfA = (1 << 5),
+  kfExportfATranspose = (1 << 6),
+  kfExportfAD = (1 << 7),
+  kfExportfBeagle = (1 << 8),
+  kfExportfBeagleNomap = (1 << 9),
+  kfExportfBgen11 = (1 << 10),
+  kfExportfBgen12 = (1 << 11),
+  kfExportfBgen13 = (1 << 12),
+  kfExportfBimbam = (1 << 13),
+  kfExportfBimbam1chr = (1 << 14),
+  kfExportfFastphase = (1 << 15),
+  kfExportfFastphase1chr = (1 << 16),
+  kfExportfHaps = (1 << 17),
+  kfExportfHapsLegend = (1 << 18),
+  kfExportfHv = (1 << 19),
+  kfExportfHv1chr = (1 << 20),
+  kfExportfIndMajorBed = (1 << 21),
+  kfExportfLgen = (1 << 22),
+  kfExportfLgenRef = (1 << 23),
+  kfExportfList = (1 << 24),
+  kfExportfRlist = (1 << 25),
+  kfExportfOxGen = (1 << 26),
+  kfExportfPed = (1 << 27),
+  kfExportfCompound = (1 << 28),
+  kfExportfStructure = (1 << 29),
+  kfExportfTranspose = (1 << 30),
+  kfExportfVcf = (1U << 31),
+  kfExportfTypemask = (2 * kfExportfVcf) - kfExportf23,
+  kfExportfIncludeAlt = (1LLU << 32),
+  kfExportfBgz = (1LLU << 33),
+  kfExportfVcfDosageGp = (1LLU << 34),
+  kfExportfVcfDosageDs = (1LLU << 35),
+  kfExportfOmitNonmaleY = (1LLU << 36)
+FLAGSET64_DEF_END(exportf_flags_t);
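+
+// Illustrative usage (hypothetical variable name, not upstream code): flag
+// sets are plain 64-bit masks, so membership tests and format extraction are
+// bitwise operations.
+//   exportf_flags_t exportf_modifier = kfExportfVcf | kfExportfBgz;
+//   if (exportf_modifier & kfExportfVcf) {
+//     // VCF export requested
+//   }
+//   // isolate the format bits (kfExportf23 .. kfExportfVcf):
+//   uint64_t fmt = exportf_modifier & kfExportfTypemask;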
+
+#ifdef _WIN32
+  #define pthread_t HANDLE
+  #define THREAD_FUNC_DECL unsigned __stdcall
+  #define THREAD_FUNCPTR_T(func_ptr) unsigned (__stdcall *func_ptr)(void*)
+  // #define THREAD_FUNCPP_T(func_pp) unsigned (__stdcall **func_pp)(void*)
+  #define THREAD_RETURN return 0
+  #define EOLN_STR "\r\n"
+#else
+  #define THREAD_FUNC_DECL void*
+  #define THREAD_FUNCPTR_T(func_ptr) void* (*func_ptr)(void*)
+  // #define THREAD_FUNCPP_T(func_pp) void* (**func_pp)(void*)
+  #define THREAD_RETURN return nullptr
+  #define EOLN_STR "\n"
+#endif
+
+#ifndef _GNU_SOURCE
+  #define rawmemchr(ss, cc) memchr((ss), (cc), (0x80000000U - kBytesPerVec))
+#endif
+
+#ifdef _WIN32
+// if MAX_THREADS > 64, single WaitForMultipleObjects calls must be converted
+// into loops
+  CONSTU31(kMaxThreads, 64);
+#else
+// currently assumed to be less than 2^16 (otherwise some multiply overflows
+// are theoretically possible, at least in the 32-bit build)
+  CONSTU31(kMaxThreads, 512);
+#endif
+
+// asserts didn't seem to work properly with a setting much smaller than this
+CONSTU31(kDefaultThreadStack, 131072);
+
+// generic maximum line byte length.  .ped/.vcf/etc. lines can of course be
+// longer
+CONSTU31(kMaxMediumLine, 131072);
+
+// must be at least 2 * kMaxMediumLine + 2 to support generic token loader.
+CONSTU31(kTextbufSize, 2 * kMaxMediumLine + 256);
+
+// when g_textbuf is used as a generic I/O buffer, this is a convenient
+// power-of-2 size (must be <= kTextbufSize).
+CONSTU31(kTextbufMainSize, 2 * kMaxMediumLine);
+
+// "slen" is now used to indicate string lengths excluding terminating nulls,
+// while "blen" includes the terminator.
+
+// Maximum length of chromosome, variant, FID, IID, cluster, and set IDs (not
+// including terminating null).  This value supports up to 8 IDs per line
+// (maximum so far is 5, for e.g. --hom).
+// Assumed by plink2_pvar to be a multiple of 16.
+CONSTU31(kMaxIdSlen, 16000);
+CONSTU31(kMaxIdBlen, kMaxIdSlen + 1);
+#define MAX_ID_SLEN_STR "16000"
+
+// Maximum size of "dynamically" allocated line load buffer.  (This is the
+// limit that applies to .vcf and similar files.)  Inconvenient to go higher
+// since fgets() takes an int32_t size argument.
+CONSTU31(kMaxLongLine, 0x7fffffc0);
+static_assert(!(kMaxLongLine % kCacheline), "kMaxLongLine must be a multiple of kCacheline.");
+
+// allow extensions like .model.trend.fisher.set.score.adjusted
+CONSTU31(kMaxOutfnameExtBlen, 39);
+
+#ifdef __LP64__
+HEADER_INLINE uint64_t round_up_pow2_ull(uint64_t val, uint64_t alignment) {
+  return round_up_pow2(val, alignment);
+}
+#else
+HEADER_INLINE uint64_t round_up_pow2_ull(uint64_t val, uint64_t alignment) {
+  uint64_t alignment_m1 = alignment - 1;
+  assert(!(alignment & alignment_m1));
+  return (val + alignment_m1) & (~alignment_m1);
+}
+#endif
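+
+// e.g. round_up_pow2_ull(5, 4) == 8 and round_up_pow2_ull(8, 4) == 8;
+// alignment must be a power of 2.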
+
+typedef struct aperm_struct {
+  uint32_t min;
+  uint32_t max;
+  double alpha;
+  double beta;
+  double init_interval;
+  double interval_slope;
+} aperm_t;
+
+// (2^31 - 1000001) / 2
+CONSTU31(kApermMax, 1073241823);
+
+// file-scope string constants don't always have the g_ prefix, but multi-file
+// constants are always tagged.
+extern const char g_errstr_fopen[];
+// extern const char g_cmdline_format_str[];
+
+extern char g_textbuf[];
+
+extern const char* g_one_char_strs;
+
+// '.' is now unconditionally accepted as a missing genotype value; this
+// pointer designates an *additional* input missing code (default '0').
+extern const char* g_input_missing_geno_ptr;
+
+extern const char* g_output_missing_geno_ptr; // now defaults to '.'
+
+extern FILE* g_logfile;
+
+// mostly-safe sprintf buffer.  warning: do NOT put allele codes or
+// arbitrary-length lists in here.
+extern char g_logbuf[];
+
+extern uint32_t g_debug_on;
+extern uint32_t g_log_failed;
+
+// for --warning-errcode
+extern uint32_t g_stderr_written_to;
+
+
+typedef struct ll_str_struct {
+  struct ll_str_struct* next;
+  char ss[];
+} ll_str_t;
+
+boolerr_t push_llstr(const char* ss, ll_str_t** ll_stack_ptr);
+
+// warning: do NOT include allele codes (unless they're guaranteed to be SNPs)
+// in log strings; they can overflow the buffer.
+void logstr(const char* ss);
+
+void logprint(const char* ss);
+
+void logerrprint(const char* ss);
+
+void logprintb();
+
+void logerrprintb();
+
+#define LOGPRINTF(...) sprintf(g_logbuf, __VA_ARGS__); logprintb();
+
+#define LOGERRPRINTF(...) sprintf(g_logbuf, __VA_ARGS__); logerrprintb();
+
+// input for wordwrap/LOGPRINTFWW should have no intermediate '\n's.  If
+// suffix_len is 0, there should be a terminating \n.
+// void wordwrap(uint32_t suffix_len, char* ss);
+
+void wordwrapb(uint32_t suffix_len);
+
+#define LOGPREPRINTFWW(...) sprintf(g_logbuf, __VA_ARGS__); wordwrapb(0);
+
+#define LOGPRINTFWW(...) sprintf(g_logbuf, __VA_ARGS__); wordwrapb(0); logprintb();
+
+#define LOGERRPRINTFWW(...) sprintf(g_logbuf, __VA_ARGS__); wordwrapb(0); logerrprintb();
+
+// 5 = length of "done." suffix, which is commonly used
+#define LOGPRINTFWW5(...) sprintf(g_logbuf, __VA_ARGS__); wordwrapb(5); logprintb();
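+
+// Illustrative usage (not upstream code; note the terminating \n required
+// when suffix_len is 0):
+//   LOGPRINTFWW("--extract: %u variant%s remaining.\n", variant_ct, (variant_ct == 1)? "" : "s");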
+
+boolerr_t fopen_checked(const char* fname, const char* mode, FILE** target_ptr);
+
+HEADER_INLINE interr_t putc_checked(int32_t ii, FILE* outfile) {
+  putc_unlocked(ii, outfile);
+  return ferror(outfile);
+}
+
+HEADER_INLINE interr_t fputs_checked(const char* ss, FILE* outfile) {
+  fputs(ss, outfile);
+  return ferror(outfile);
+}
+
+interr_t fwrite_flush2(char* buf_flush, FILE* outfile, char** write_iter_ptr);
+
+HEADER_INLINE interr_t fwrite_ck(char* buf_flush, FILE* outfile, char** write_iter_ptr) {
+  if ((*write_iter_ptr) < buf_flush) {
+    return 0;
+  }
+  return fwrite_flush2(buf_flush, outfile, write_iter_ptr);
+}
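+
+// Illustrative usage of the flush-on-threshold idiom (mirrors
+// write_sample_ids() in plink2_common.cpp; hypothetical error label):
+//   char* write_iter = g_textbuf;
+//   char* textbuf_flush = &(g_textbuf[kMaxMediumLine]);
+//   for (uint32_t idx = 0; idx < item_ct; ++idx) {
+//     write_iter = strcpya(write_iter, item_ids[idx]);
+//     append_binary_eoln(&write_iter);
+//     if (fwrite_ck(textbuf_flush, outfile, &write_iter)) {
+//       goto my_function_ret_WRITE_FAIL;
+//     }
+//   }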
+
+// fclose_null defined in pgenlib_internal.h
+
+HEADER_INLINE void fclose_cond(FILE* fptr) {
+  if (fptr) {
+    fclose(fptr);
+  }
+}
+
+uint32_t int_slen(int32_t num);
+
+// assumes it's safe to read first s_const_len bytes of s_read
+int32_t strcmp_se(const char* s_read, const char* s_const, uint32_t s_const_len);
+
+int32_t strcmp_casted(const void* s1, const void* s2);
+
+int32_t strcmp_natural(const void* s1, const void* s2);
+
+int32_t strcmp_deref(const void* s1, const void* s2);
+
+int32_t strcmp_natural_deref(const void* s1, const void* s2);
+
+int32_t double_cmp(const void* aa, const void* bb);
+
+int32_t double_cmp_decr(const void* aa, const void* bb);
+
+// requires all elements to be within 2^31 - 1 of each other
+int32_t intcmp(const void* aa, const void* bb);
+
+int32_t uint64cmp(const void* aa, const void* bb);
+
+#ifndef __cplusplus
+int32_t uint64cmp_decr(const void* aa, const void* bb);
+#endif
+
+HEADER_INLINE uint32_t get_uimax(uintptr_t len, const uint32_t* unsorted_arr) {
+  const uint32_t* unsorted_arr_end = &(unsorted_arr[len]);
+#ifndef __cplusplus
+  const uint32_t* unsorted_arr_iter = unsorted_arr;
+  uint32_t uimax = *unsorted_arr_iter++;
+  while (unsorted_arr_iter < unsorted_arr_end) {
+    const uint32_t cur_val = *unsorted_arr_iter++;
+    if (cur_val > uimax) {
+      uimax = cur_val;
+    }
+  }
+  return uimax;
+#else
+  return *std::max_element(unsorted_arr, unsorted_arr_end);
+#endif
+}
+
+HEADER_INLINE float get_fmax(uintptr_t len, const float* unsorted_arr) {
+  const float* unsorted_arr_end = &(unsorted_arr[len]);
+#if defined(__APPLE__) || !defined(__cplusplus)
+  // std::max_element doesn't seem to be performant for floats/doubles on OS X
+  const float* unsorted_arr_iter = unsorted_arr;
+  float fmax = *unsorted_arr_iter++;
+  while (unsorted_arr_iter < unsorted_arr_end) {
+    const float cur_val = *unsorted_arr_iter++;
+    if (cur_val > fmax) {
+      fmax = cur_val;
+    }
+  }
+  return fmax;
+#else
+  return *std::max_element(unsorted_arr, unsorted_arr_end);
+#endif
+}
+
+HEADER_INLINE double get_dmax(uintptr_t len, const double* unsorted_arr) {
+  const double* unsorted_arr_end = &(unsorted_arr[len]);
+#if defined(__APPLE__) || !defined(__cplusplus)
+  const double* unsorted_arr_iter = unsorted_arr;
+  double dmax = *unsorted_arr_iter++;
+  while (unsorted_arr_iter < unsorted_arr_end) {
+    const double cur_val = *unsorted_arr_iter++;
+    if (cur_val > dmax) {
+      dmax = cur_val;
+    }
+  }
+  return dmax;
+#else
+  return *std::max_element(unsorted_arr, unsorted_arr_end);
+#endif
+}
+
+float destructive_get_fmedian(uintptr_t len, float* unsorted_arr);
+
+double destructive_get_dmedian(uintptr_t len, double* unsorted_arr);
+
+uintptr_t get_strboxsort_wentry_blen(uintptr_t max_str_blen);
+
+#ifdef __cplusplus
+typedef struct str_sort_deref_struct {
+  char* strptr;
+  bool operator<(const struct str_sort_deref_struct& rhs) const {
+    return (strcmp(strptr, rhs.strptr) < 0);
+  }
+} str_sort_deref_t;
+
+typedef struct str_nsort_deref_struct {
+  char* strptr;
+  bool operator<(const struct str_nsort_deref_struct& rhs) const {
+    return (strcmp_natural(strptr, rhs.strptr) < 0);
+  }
+} str_nsort_deref_t;
+
+HEADER_INLINE void strptr_arr_sort(uintptr_t ct, char** strptr_arr) {
+  std::sort((str_sort_deref_t*)strptr_arr, &(((str_sort_deref_t*)strptr_arr)[ct]));
+}
+
+HEADER_INLINE void strptr_arr_nsort(uintptr_t ct, char** strptr_arr) {
+  std::sort((str_nsort_deref_t*)strptr_arr, &(((str_nsort_deref_t*)strptr_arr)[ct]));
+}
+
+void sort_strbox_indexed2(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map, void* sort_wkspace);
+#else
+HEADER_INLINE void strptr_arr_sort(uintptr_t ct, char** strptr_arr) {
+  qsort(strptr_arr, ct, sizeof(intptr_t), strcmp_deref);
+}
+
+HEADER_INLINE void strptr_arr_nsort(uintptr_t ct, char** strptr_arr) {
+  qsort(strptr_arr, ct, sizeof(intptr_t), strcmp_natural_deref);
+}
+
+void sort_strbox_indexed2_fallback(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map, void* sort_wkspace);
+
+HEADER_INLINE void sort_strbox_indexed2(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map, void* sort_wkspace) {
+  sort_strbox_indexed2_fallback(str_ct, max_str_blen, use_nsort, strbox, id_map, sort_wkspace);
+}
+#endif
+
+// This makes a temporary g_bigstack allocation.
+boolerr_t sort_strbox_indexed(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map);
+
+// Uses malloc instead of bigstack.
+boolerr_t sort_strbox_indexed_malloc(uintptr_t str_ct, uintptr_t max_str_blen, char* strbox, uint32_t* id_map);
+
+// Returns dedup'd strbox entry count.
+uint32_t copy_and_dedup_sorted_strptrs_to_strbox(char** sorted_strptrs, uintptr_t str_ct, uintptr_t max_str_blen, char* strbox);
+
+/*
+void qsort_ext2(void* main_arr, uintptr_t arr_length, uintptr_t item_length, int(* comparator_deref)(const void*, const void*), void* secondary_arr, uintptr_t secondary_item_len, unsigned char* proxy_arr, uintptr_t proxy_len);
+
+// This makes a g_bigstack allocation, and returns -1 on alloc failure.
+int32_t qsort_ext(void* main_arr, uintptr_t arr_length, uintptr_t item_length, int(* comparator_deref)(const void*, const void*), void* secondary_arr, uintptr_t secondary_item_len);
+*/
+
+uint32_t uint32arr_greater_than(const uint32_t* sorted_uint32_arr, uint32_t arr_length, uint32_t uii);
+
+uintptr_t uint64arr_greater_than(const uint64_t* sorted_uint64_arr, uintptr_t arr_length, uint64_t ullii);
+
+uintptr_t doublearr_greater_than(const double* sorted_dbl_arr, uintptr_t arr_length, double dxx);
+
+
+uintptr_t uint64arr_geq(const uint64_t* sorted_uint64_arr, uintptr_t arr_length, uint64_t ullii);
+
+HEADER_INLINE uint32_t is_flag(const char* param) {
+  unsigned char ucc = param[1];
+  return ((*param == '-') && ((ucc > '9') || ((ucc < '0') && (ucc != '.') && (ucc != '\0'))));
+}
+
+HEADER_INLINE char* is_flag_start(char* param) {
+  unsigned char ucc = param[1];
+  if ((*param == '-') && ((ucc > '9') || ((ucc < '0') && (ucc != '.') && (ucc != '\0')))) {
+    return (ucc == '-')? (&(param[2])) : (&(param[1]));
+  }
+  return nullptr;
+}
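+
+// e.g. is_flag("--out") and is_flag("-maf") are true, while is_flag("-9") and
+// is_flag("-.5") are false (a leading "-<digit>" or "-." is treated as a
+// negative number, not a flag); is_flag_start("--out") returns a pointer to
+// the "out" suffix.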
+
+uint32_t param_count(char** argv, uint32_t argc, uint32_t flag_idx);
+
+boolerr_t enforce_param_ct_range(const char* flag_name, uint32_t param_ct, uint32_t min_ct, uint32_t max_ct);
+
+pglerr_t sort_cmdline_flags(uint32_t max_flag_len, uint32_t flag_ct, char* flag_buf, uint32_t* flag_map);
+
+pglerr_t init_logfile(uint32_t always_stderr, char* outname, char* outname_end);
+
+boolerr_t cleanup_logfile(uint32_t print_end_time);
+
+CONSTU31(kNonBigstackMin, 67108864);
+
+CONSTU31(kBigstackMinMb, 640);
+CONSTU31(kBigstackDefaultMb, 2048);
+
+static const double kPi = 3.1415926535897932;
+static const double kRecipE = 0.36787944117144233;
+static const double kRecip2m53 = 0.00000000000000011102230246251565404236316680908203125;
+static const double kRecip2m32 = 0.00000000023283064365386962890625;
+static const double k2m64 = 18446744073709551616.0;
+
+// 2^{-44}
+static const double kSmallEpsilon = 0.00000000000005684341886080801486968994140625;
+
+// 2^{-21}, must be >= sqrt(kSmallEpsilon)
+static const double kBigEpsilon = 0.000000476837158203125;
+
+// 2^{-83} bias to give exact tests maximum ability to determine tiny p-values.
+// (~2^{-53} is necessary to take advantage of denormalized small numbers, then
+// allow tail sum to be up to 2^30.)
+static const double kExactTestBias = 0.00000000000000000000000010339757656912845935892608650874535669572651386260986328125;
+
+// apparently these aren't always defined in float.h
+#ifndef DBL_MAX
+  #define DBL_MAX 1.7976931348623157e308
+#endif
+#ifndef FLT_MAX
+  #define FLT_MAX 3.40282347e38f
+#endif
+
+// probably time to flip arena_alloc and bigstack_alloc definitions...
+
+// manually managed, very large double-ended stack
+extern unsigned char* g_bigstack_base;
+extern unsigned char* g_bigstack_end;
+
+uintptr_t detect_mb();
+
+uintptr_t get_default_alloc_mb();
+ 
+// caller is responsible for freeing bigstack_ua
+pglerr_t init_bigstack(uintptr_t malloc_size_mb, uintptr_t* malloc_mb_final_ptr, unsigned char** bigstack_ua_ptr);
+
+
+HEADER_INLINE uintptr_t bigstack_left() {
+  return (((uintptr_t)g_bigstack_end) - ((uintptr_t)g_bigstack_base));
+}
+
+HEADER_INLINE unsigned char* bigstack_alloc_raw(uintptr_t size) {
+  // Assumes caller has already forced size to a multiple of
+  // kCacheline, and verified that enough space is available.
+  assert(!(size % kCacheline));
+  unsigned char* alloc_ptr = g_bigstack_base;
+  g_bigstack_base += size;
+  return alloc_ptr;
+}
+
+HEADER_INLINE unsigned char* bigstack_alloc_raw_rd(uintptr_t size) {
+  // Same as bigstack_alloc_raw(), except for rounding up size.
+  unsigned char* alloc_ptr = g_bigstack_base;
+  g_bigstack_base += round_up_pow2(size, kCacheline);
+  return alloc_ptr;
+}
+
+// Basic 64-byte-aligned allocation at bottom of stack.
+HEADER_INLINE unsigned char* bigstack_alloc(uintptr_t size) {
+  size = round_up_pow2(size, kCacheline);
+  if (bigstack_left() < size) {
+    g_failed_alloc_attempt_size = size;
+    return nullptr;
+  }
+  return bigstack_alloc_raw(size);
+}
+
+
+// Typesafe, return-0-iff-success interfaces.  (See also bigstack_calloc_...
+// further below.)
+HEADER_INLINE boolerr_t bigstack_alloc_c(uintptr_t ct, char** c_arr_ptr) {
+  *c_arr_ptr = (char*)bigstack_alloc(ct);
+  return !(*c_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_d(uintptr_t ct, double** d_arr_ptr) {
+  *d_arr_ptr = (double*)bigstack_alloc(ct * sizeof(double));
+  return !(*d_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_f(uintptr_t ct, float** f_arr_ptr) {
+  *f_arr_ptr = (float*)bigstack_alloc(ct * sizeof(float));
+  return !(*f_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_si(uintptr_t ct, int16_t** si_arr_ptr) {
+  *si_arr_ptr = (int16_t*)bigstack_alloc(ct * sizeof(int16_t));
+  return !(*si_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_i(uintptr_t ct, int32_t** i_arr_ptr) {
+  *i_arr_ptr = (int32_t*)bigstack_alloc(ct * sizeof(int32_t));
+  return !(*i_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr) {
+  *uc_arr_ptr = bigstack_alloc(ct);
+  return !(*uc_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_usi(uintptr_t ct, uint16_t** usi_arr_ptr) {
+  *usi_arr_ptr = (uint16_t*)bigstack_alloc(ct * sizeof(int16_t));
+  return !(*usi_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_dosage(uintptr_t ct, dosage_t** dosage_arr_ptr) {
+  *dosage_arr_ptr = (dosage_t*)bigstack_alloc(ct * sizeof(dosage_t));
+  return !(*dosage_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_ui(uintptr_t ct, uint32_t** ui_arr_ptr) {
+  *ui_arr_ptr = (uint32_t*)bigstack_alloc(ct * sizeof(int32_t));
+  return !(*ui_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_ul(uintptr_t ct, uintptr_t** ul_arr_ptr) {
+  *ul_arr_ptr = (uintptr_t*)bigstack_alloc(ct * sizeof(intptr_t));
+  return !(*ul_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_ll(uintptr_t ct, int64_t** ll_arr_ptr) {
+  *ll_arr_ptr = (int64_t*)bigstack_alloc(ct * sizeof(int64_t));
+  return !(*ll_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_ull(uintptr_t ct, uint64_t** ull_arr_ptr) {
+  *ull_arr_ptr = (uint64_t*)bigstack_alloc(ct * sizeof(int64_t));
+  return !(*ull_arr_ptr);
+}
+
+// some versions of gcc give aliasing warnings if we use bigstack_alloc_ul()
+// for everything
+// if sizeof(intptr_t) != sizeof(uintptr_t*), we're doomed anyway, so I won't
+// bother with that static assert...
+HEADER_INLINE boolerr_t bigstack_alloc_ulp(uintptr_t ct, uintptr_t*** ulp_arr_ptr) {
+  *ulp_arr_ptr = (uintptr_t**)bigstack_alloc(ct * sizeof(intptr_t));
+  return !(*ulp_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_cp(uintptr_t ct, char*** cp_arr_ptr) {
+  *cp_arr_ptr = (char**)bigstack_alloc(ct * sizeof(intptr_t));
+  return !(*cp_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_sip(uintptr_t ct, int16_t*** sip_arr_ptr) {
+  *sip_arr_ptr = (int16_t**)bigstack_alloc(ct * sizeof(intptr_t));
+  return !(*sip_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_ip(uintptr_t ct, int32_t*** ip_arr_ptr) {
+  *ip_arr_ptr = (int32_t**)bigstack_alloc(ct * sizeof(intptr_t));
+  return !(*ip_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_ucp(uintptr_t ct, unsigned char*** ucp_arr_ptr) {
+  *ucp_arr_ptr = (unsigned char**)bigstack_alloc(ct * sizeof(intptr_t));
+  return !(*ucp_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_usip(uintptr_t ct, uint16_t*** usip_arr_ptr) {
+  *usip_arr_ptr = (uint16_t**)bigstack_alloc(ct * sizeof(intptr_t));
+  return !(*usip_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_dosagep(uintptr_t ct, dosage_t*** dosagep_arr_ptr) {
+  *dosagep_arr_ptr = (dosage_t**)bigstack_alloc(ct * sizeof(intptr_t));
+  return !(*dosagep_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_uip(uintptr_t ct, uint32_t*** uip_arr_ptr) {
+  *uip_arr_ptr = (uint32_t**)bigstack_alloc(ct * sizeof(intptr_t));
+  return !(*uip_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_dp(uintptr_t ct, double*** dp_arr_ptr) {
+  *dp_arr_ptr = (double**)bigstack_alloc(ct * sizeof(intptr_t));
+  return !(*dp_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_vp(uintptr_t ct, vul_t*** vp_arr_ptr) {
+  *vp_arr_ptr = (vul_t**)bigstack_alloc(ct * sizeof(intptr_t));
+  return !(*vp_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_alloc_thread(uintptr_t ct, pthread_t** thread_arr_ptr) {
+  *thread_arr_ptr = (pthread_t*)bigstack_alloc(ct * sizeof(pthread_t));
+  return !(*thread_arr_ptr);
+}
+
+HEADER_INLINE void bigstack_reset(void* new_base) {
+  g_bigstack_base = (unsigned char*)new_base;
+}
+
+HEADER_INLINE void bigstack_end_reset(void* new_end) {
+  g_bigstack_end = (unsigned char*)new_end;
+}
+
+HEADER_INLINE void bigstack_double_reset(void* new_base, void* new_end) {
+  bigstack_reset(new_base);
+  bigstack_end_reset(new_end);
+}
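+
+// Illustrative usage of the mark/reset idiom (not upstream code):
+//   unsigned char* bigstack_mark = g_bigstack_base;
+//   uint32_t* tmp_arr;
+//   if (bigstack_alloc_ui(item_ct, &tmp_arr)) {
+//     goto my_function_ret_NOMEM; // hypothetical error label
+//   }
+//   // ... use tmp_arr ...
+//   bigstack_reset(bigstack_mark); // releases tmp_arr and anything after it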
+
+// assumes we've already been writing to ulptr and have previously performed
+// bounds-checking.
+HEADER_INLINE void bigstack_finalize_ul(__maybe_unused const uintptr_t* ulptr, uintptr_t ct) {
+  assert(ulptr == (const uintptr_t*)g_bigstack_base);
+  g_bigstack_base += round_up_pow2(ct * sizeof(intptr_t), kCacheline);
+  assert(g_bigstack_base <= g_bigstack_end);
+}
+
+HEADER_INLINE void bigstack_finalize_ui(__maybe_unused const uint32_t* uiptr, uintptr_t ct) {
+  assert(uiptr == (const uint32_t*)g_bigstack_base);
+  g_bigstack_base += round_up_pow2(ct * sizeof(int32_t), kCacheline);
+  assert(g_bigstack_base <= g_bigstack_end);
+}
+
+HEADER_INLINE void bigstack_finalize_c(__maybe_unused const char* cptr, uintptr_t ct) {
+  assert(cptr == (const char*)g_bigstack_base);
+  g_bigstack_base += round_up_pow2(ct, kCacheline);
+  assert(g_bigstack_base <= g_bigstack_end);
+}
+
+HEADER_INLINE void bigstack_finalize_cp(__maybe_unused char** cpptr, uintptr_t ct) {
+  assert(cpptr == (char**)g_bigstack_base);
+  g_bigstack_base += round_up_pow2(ct * sizeof(intptr_t), kCacheline);
+  assert(g_bigstack_base <= g_bigstack_end);
+}
+
+
+HEADER_INLINE void bigstack_shrink_top(const void* rebase, uintptr_t new_size) {
+  // could assert that this doesn't go in the wrong direction?
+  g_bigstack_base = (unsigned char*)round_up_pow2(((uintptr_t)rebase) + new_size, kCacheline);
+}
+
+// simpler to have these allocations automatically AVX2-aligned when the time
+// comes
+CONSTU31(kEndAllocAlign, MAXV(kBytesPerVec, 16));
+
+HEADER_INLINE void bigstack_end_set(const void* unaligned_end) {
+  g_bigstack_end = (unsigned char*)round_down_pow2((uintptr_t)unaligned_end, kEndAllocAlign);
+}
+
+// assumes size is divisible by kEndAllocAlign
+// assumes enough space is available
+HEADER_INLINE unsigned char* bigstack_end_alloc_raw(uintptr_t size) {
+  assert(!(size % kEndAllocAlign));
+  g_bigstack_end -= size;
+  return g_bigstack_end;
+}
+
+HEADER_INLINE unsigned char* bigstack_end_alloc_raw_rd(uintptr_t size) {
+  g_bigstack_end -= round_up_pow2(size, kEndAllocAlign);
+  return g_bigstack_end;
+}
+
+HEADER_INLINE unsigned char* bigstack_end_alloc_presized(uintptr_t size) {
+  uintptr_t cur_bigstack_left = bigstack_left();
+  if (size > cur_bigstack_left) {
+    g_failed_alloc_attempt_size = size;
+    return nullptr;
+  }
+  return bigstack_end_alloc_raw(size);
+}
+
+HEADER_INLINE unsigned char* bigstack_end_alloc(uintptr_t size) {
+  size = round_up_pow2(size, kEndAllocAlign);
+  return bigstack_end_alloc_presized(size);
+}
+
+HEADER_INLINE unsigned char* bigstack_end_aligned_alloc(uintptr_t size) {
+  return bigstack_end_alloc(size);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_c(uintptr_t ct, char** c_arr_ptr) {
+  *c_arr_ptr = (char*)bigstack_end_alloc(ct);
+  return !(*c_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_d(uintptr_t ct, double** d_arr_ptr) {
+  *d_arr_ptr = (double*)bigstack_end_alloc(ct * sizeof(double));
+  return !(*d_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_f(uintptr_t ct, float** f_arr_ptr) {
+  *f_arr_ptr = (float*)bigstack_end_alloc(ct * sizeof(float));
+  return !(*f_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_i(uintptr_t ct, int32_t** i_arr_ptr) {
+  *i_arr_ptr = (int32_t*)bigstack_end_alloc(ct * sizeof(int32_t));
+  return !(*i_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr) {
+  *uc_arr_ptr = bigstack_end_alloc(ct);
+  return !(*uc_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_dosage(uintptr_t ct, dosage_t** dosage_arr_ptr) {
+  *dosage_arr_ptr = (dosage_t*)bigstack_end_alloc(ct * sizeof(dosage_t));
+  return !(*dosage_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_ui(uintptr_t ct, uint32_t** ui_arr_ptr) {
+  *ui_arr_ptr = (uint32_t*)bigstack_end_alloc(ct * sizeof(int32_t));
+  return !(*ui_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_ul(uintptr_t ct, uintptr_t** ul_arr_ptr) {
+  *ul_arr_ptr = (uintptr_t*)bigstack_end_alloc(ct * sizeof(intptr_t));
+  return !(*ul_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_ll(uintptr_t ct, int64_t** ll_arr_ptr) {
+  *ll_arr_ptr = (int64_t*)bigstack_end_alloc(ct * sizeof(int64_t));
+  return !(*ll_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_ull(uintptr_t ct, uint64_t** ull_arr_ptr) {
+  *ull_arr_ptr = (uint64_t*)bigstack_end_alloc(ct * sizeof(int64_t));
+  return !(*ull_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_llstr(uintptr_t str_bytes, ll_str_t** llstr_arr_ptr) {
+  *llstr_arr_ptr = (ll_str_t*)bigstack_end_alloc(str_bytes + sizeof(ll_str_t));
+  return !(*llstr_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_cp(uintptr_t ct, char*** cp_arr_ptr) {
+  *cp_arr_ptr = (char**)bigstack_end_alloc(ct * sizeof(intptr_t));
+  return !(*cp_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_ucp(uintptr_t ct, unsigned char*** ucp_arr_ptr) {
+  *ucp_arr_ptr = (unsigned char**)bigstack_end_alloc(ct * sizeof(intptr_t));
+  return !(*ucp_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_alloc_thread(uintptr_t ct, pthread_t** thread_arr_ptr) {
+  *thread_arr_ptr = (pthread_t*)bigstack_end_alloc(ct * sizeof(pthread_t));
+  return !(*thread_arr_ptr);
+}
+
+
+// and here's the interface for a non-global arena (necessary for some
+// multithreaded code).
+HEADER_INLINE unsigned char* arena_alloc_raw(uintptr_t size, unsigned char** arena_bottom_ptr) {
+  assert(!(size % kCacheline));
+  unsigned char* alloc_ptr = *arena_bottom_ptr;
+  *arena_bottom_ptr = &(alloc_ptr[size]);
+  return alloc_ptr;
+}
+
+HEADER_INLINE unsigned char* arena_alloc_raw_rd(uintptr_t size, unsigned char** arena_bottom_ptr) {
+  unsigned char* alloc_ptr = *arena_bottom_ptr;
+  *arena_bottom_ptr = &(alloc_ptr[round_up_pow2(size, kCacheline)]);
+  return alloc_ptr;
+}
+
+HEADER_INLINE unsigned char* arena_alloc(unsigned char* arena_top, uintptr_t size, unsigned char** arena_bottom_ptr) {
+  size = round_up_pow2(size, kCacheline);
+  if (((uintptr_t)(arena_top - (*arena_bottom_ptr))) < size) {
+    g_failed_alloc_attempt_size = size;
+    return nullptr;
+  }
+  return arena_alloc_raw(size, arena_bottom_ptr);
+}
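+
+// Illustrative sketch (hypothetical sizes, not upstream code): carving a
+// private arena for a worker thread out of the main stack, so the thread can
+// allocate without touching g_bigstack_base.
+//   unsigned char* arena_bottom = bigstack_alloc_raw(arena_cl_ct * kCacheline);
+//   unsigned char* arena_top = &(arena_bottom[arena_cl_ct * kCacheline]);
+//   double* thread_buf;
+//   if (arena_alloc_d(arena_top, buf_ct, &arena_bottom, &thread_buf)) {
+//     // arena exhausted
+//   }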
+
+HEADER_INLINE boolerr_t arena_alloc_c(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, char** c_arr_ptr) {
+  *c_arr_ptr = (char*)arena_alloc(arena_top, ct, arena_bottom_ptr);
+  return !(*c_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_alloc_d(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, double** d_arr_ptr) {
+  *d_arr_ptr = (double*)arena_alloc(arena_top, ct * sizeof(double), arena_bottom_ptr);
+  return !(*d_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_alloc_f(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, float** f_arr_ptr) {
+  *f_arr_ptr = (float*)arena_alloc(arena_top, ct * sizeof(float), arena_bottom_ptr);
+  return !(*f_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_alloc_i(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, int32_t** i_arr_ptr) {
+  *i_arr_ptr = (int32_t*)arena_alloc(arena_top, ct * sizeof(int32_t), arena_bottom_ptr);
+  return !(*i_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_alloc_uc(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, unsigned char** uc_arr_ptr) {
+  *uc_arr_ptr = arena_alloc(arena_top, ct, arena_bottom_ptr);
+  return !(*uc_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_alloc_ui(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, uint32_t** ui_arr_ptr) {
+  *ui_arr_ptr = (uint32_t*)arena_alloc(arena_top, ct * sizeof(int32_t), arena_bottom_ptr);
+  return !(*ui_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_alloc_ul(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, uintptr_t** ul_arr_ptr) {
+  *ul_arr_ptr = (uintptr_t*)arena_alloc(arena_top, ct * sizeof(intptr_t), arena_bottom_ptr);
+  return !(*ul_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_alloc_ll(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, int64_t** ll_arr_ptr) {
+  *ll_arr_ptr = (int64_t*)arena_alloc(arena_top, ct * sizeof(int64_t), arena_bottom_ptr);
+  return !(*ll_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_alloc_ull(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, uint64_t** ull_arr_ptr) {
+  *ull_arr_ptr = (uint64_t*)arena_alloc(arena_top, ct * sizeof(int64_t), arena_bottom_ptr);
+  return !(*ull_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_alloc_cp(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, char*** cp_arr_ptr) {
+  *cp_arr_ptr = (char**)arena_alloc(arena_top, ct * sizeof(intptr_t), arena_bottom_ptr);
+  return !(*cp_arr_ptr);
+}
+
+HEADER_INLINE unsigned char* arena_end_alloc_raw(uintptr_t size, unsigned char** arena_top_ptr) {
+  assert(!(size % kEndAllocAlign));
+  unsigned char* alloc_ptr = *arena_top_ptr;
+  alloc_ptr -= size;
+  *arena_top_ptr = alloc_ptr;
+  return alloc_ptr;
+}
+
+HEADER_INLINE unsigned char* arena_end_alloc(unsigned char* arena_bottom, uintptr_t size, unsigned char** arena_top_ptr) {
+  size = round_up_pow2(size, kEndAllocAlign);
+  if (((uintptr_t)((*arena_top_ptr) - arena_bottom)) < size) {
+    g_failed_alloc_attempt_size = size;
+    return nullptr;
+  }
+  return arena_end_alloc_raw(size, arena_top_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_end_alloc_c(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, char** c_arr_ptr) {
+  *c_arr_ptr = (char*)arena_end_alloc(arena_bottom, ct, arena_top_ptr);
+  return !(*c_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_end_alloc_d(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, double** d_arr_ptr) {
+  *d_arr_ptr = (double*)arena_end_alloc(arena_bottom, ct * sizeof(double), arena_top_ptr);
+  return !(*d_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_end_alloc_f(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, float** f_arr_ptr) {
+  *f_arr_ptr = (float*)arena_end_alloc(arena_bottom, ct * sizeof(float), arena_top_ptr);
+  return !(*f_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_end_alloc_i(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, int32_t** i_arr_ptr) {
+  *i_arr_ptr = (int32_t*)arena_end_alloc(arena_bottom, ct * sizeof(int32_t), arena_top_ptr);
+  return !(*i_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_end_alloc_uc(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, unsigned char** uc_arr_ptr) {
+  *uc_arr_ptr = arena_end_alloc(arena_bottom, ct, arena_top_ptr);
+  return !(*uc_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_end_alloc_ui(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, uint32_t** ui_arr_ptr) {
+  *ui_arr_ptr = (uint32_t*)arena_end_alloc(arena_bottom, ct * sizeof(int32_t), arena_top_ptr);
+  return !(*ui_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_end_alloc_ul(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, uintptr_t** ul_arr_ptr) {
+  *ul_arr_ptr = (uintptr_t*)arena_end_alloc(arena_bottom, ct * sizeof(intptr_t), arena_top_ptr);
+  return !(*ul_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_end_alloc_ll(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, int64_t** ll_arr_ptr) {
+  *ll_arr_ptr = (int64_t*)arena_end_alloc(arena_bottom, ct * sizeof(int64_t), arena_top_ptr);
+  return !(*ll_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_end_alloc_ull(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, uint64_t** ull_arr_ptr) {
+  *ull_arr_ptr = (uint64_t*)arena_end_alloc(arena_bottom, ct * sizeof(int64_t), arena_top_ptr);
+  return !(*ull_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t arena_end_alloc_cp(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, char*** cp_arr_ptr) {
+  *cp_arr_ptr = (char**)arena_end_alloc(arena_bottom, ct * sizeof(intptr_t), arena_top_ptr);
+  return !(*cp_arr_ptr);
+}
+
+
+typedef struct l32str_struct {
+  uint32_t len;
+  char ss[];
+} l32str_t;
+
+
+HEADER_INLINE int32_t is_letter(unsigned char ucc) {
+  return (((ucc & 192) == 64) && (((ucc - 1) & 31) < 26));
+}
+
+// if we need the digit value, better to use (unsigned char)cc - '0'...
+HEADER_INLINE int32_t is_digit(unsigned char ucc) {
+  return (ucc <= '9') && (ucc >= '0');
+}
+
+HEADER_INLINE int32_t is_not_digit(unsigned char ucc) {
+  return (ucc > '9') || (ucc < '0');
+}
+
+HEADER_INLINE int32_t is_not_nzdigit(unsigned char ucc) {
+  return (ucc > '9') || (ucc <= '0');
+}
+
+// may as well treat all chars < 32, except tab, as eoln...
+// kns = "known non-space" (where tab counts as a space)
+/*
+HEADER_INLINE int32_t is_eoln_kns(unsigned char ucc) {
+  return (ucc < 32);
+}
+*/
+
+HEADER_INLINE int32_t is_space_or_eoln(unsigned char ucc) {
+  return (ucc <= 32);
+}
+
+HEADER_INLINE int32_t is_eoln_kns(unsigned char ucc) {
+  // could assert ucc is not a space/tab?
+  return (ucc <= 32);
+}
+
+HEADER_INLINE int32_t is_eoln_or_comment_kns(unsigned char ucc) {
+  return (ucc < 32) || (ucc == '#');
+}
+
+HEADER_INLINE int32_t no_more_tokens_kns(const char* sptr) {
+  return ((!sptr) || is_eoln_kns(*sptr));
+}
+
+HEADER_INLINE char* skip_initial_spaces(char* sptr) {
+  while ((*sptr == ' ') || (*sptr == '\t')) {
+    ++sptr;
+  }
+  return sptr;
+}
+
+// assumes we are currently in a token -- UNSAFE OTHERWISE
+HEADER_INLINE char* token_endnn(char* sptr) {
+  // assert(((unsigned char)(*sptr)) > 32);
+  while (!is_space_or_eoln(*(++sptr)));
+  return sptr;
+}
+
+HEADER_INLINE char* next_prespace(char* sptr) {
+  while (((unsigned char)(*(++sptr))) >= 32);
+  return sptr;
+}
+
+
+// length-zero tokens and non-leading spaces are permitted in the
+// comma-delimiter case
+HEADER_INLINE char* comma_or_space_token_end(char* token_iter, uint32_t comma_delim) {
+  if (comma_delim) {
+    unsigned char ucc = (unsigned char)(*token_iter);
+    while ((ucc >= ' ') && (ucc != ',')) {
+      ucc = *(++token_iter);
+    }
+    return token_iter;
+  }
+  return token_endnn(token_iter);
+}
+
+HEADER_INLINE char* comma_or_space_next_token(char* token_end_iter, uint32_t comma_delim) {
+  // assumes token_end_iter is non-null, returns nullptr if there are no more
+  // tokens
+  // assert(token_end_iter);
+  if (comma_delim) {
+    if ((*token_end_iter) != ',') {
+      return nullptr;
+    }
+    return skip_initial_spaces(&(token_end_iter[1]));
+  }
+  char* ss = skip_initial_spaces(token_end_iter);
+  return is_eoln_kns(*ss)? nullptr : ss;
+}
+
+
+// Returns whether uppercased ss matches nonempty fixed_str.  Assumes fixed_str
+// contains nothing but letters and a null terminator.
+// uint32_t match_upper(const char* ss, const char* fixed_str);
+
+uint32_t match_upper_counted(const char* ss, const char* fixed_str, uint32_t ct);
+
+/*
+void str_toupper(char* ss);
+
+void buf_toupper(uint32_t slen, char* ss);
+
+void strcpy_toupper(char* target, const char* source);
+*/
+
+uint32_t is_alphanumeric(const char* ss);
+
+// scan_posint_capped(), scan_uint_capped(), scan_int_abs_bounded(),
+// scan_int32(), scan_posint_defcap(), scan_uint_defcap(),
+// scan_int_abs_defcap(), scan_uint_icap() in pgenlib_internal
+
+boolerr_t scan_posintptr(const char* ss, uintptr_t* valp);
+
+#ifdef __LP64__
+boolerr_t scanadv_posint_capped(uint64_t cap, char** ss_ptr, uint32_t* valp);
+
+boolerr_t scanadv_uint_capped(uint64_t cap, char** ss_ptr, uint32_t* valp);
+#else
+boolerr_t scanadv_posint_capped32(uint32_t cap_div_10, uint32_t cap_mod_10, char** ss_ptr, uint32_t* valp);
+
+boolerr_t scanadv_uint_capped32(uint32_t cap_div_10, uint32_t cap_mod_10, char** ss_ptr, uint32_t* valp);
+
+HEADER_INLINE boolerr_t scanadv_posint_capped(uint32_t cap, char** ss_ptr, uint32_t* valp) {
+ return scanadv_posint_capped32(cap / 10, cap % 10, ss_ptr, valp);
+}
+
+HEADER_INLINE boolerr_t scanadv_uint_capped(uint32_t cap, char** ss_ptr, uint32_t* valp) {
+ return scanadv_uint_capped32(cap / 10, cap % 10, ss_ptr, valp);
+}
+#endif
+
+HEADER_INLINE boolerr_t scanadv_uint_defcap(char** ss_ptr, uint32_t* valp) {
+  return scanadv_uint_capped(0x7ffffffe, ss_ptr, valp);
+}
+
+// This has different semantics from scanadv_posint_capped(), etc.: integer
+// readers don't take much code, so it's fine to have a bunch of similar
+// functions optimized for slightly different use cases, but we only want one
+// floating-point reader.  Returns nullptr on parse failure.
+char* scanadv_double(char* ss, double* valp);
+
+HEADER_INLINE boolerr_t scan_float(const char* ss, float* valp) {
+  double dxx;
+  // const_cast
+  if (!scanadv_double((char*)((uintptr_t)ss), &dxx)) {
+    return 1;
+  }
+  if (fabs(dxx) > 3.4028235677973362e38) {
+    return 1;
+  }
+  *valp = (float)dxx;
+  return 0;
+}
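+
+// Illustrative usage (assumes scanadv_double() returns nullptr on parse
+// failure and otherwise a pointer just past the parsed number, consistent
+// with the scan_float() wrapper above):
+//   double dxx;
+//   char* num_end = scanadv_double(token_start, &dxx);
+//   if (!num_end) {
+//     goto my_function_ret_INVALID_VALUE; // hypothetical error label
+//   }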
+
+// memcpya(), memseta() defined in pgenlib_internal.h
+
+HEADER_INLINE char* memcpyax(void* __restrict target, const void* __restrict source, uint32_t ct, char extra_char) {
+  memcpy(target, source, ct);
+  ((char*)target)[ct] = extra_char;
+  return &(((char*)target)[ct + 1]);
+}
+
+HEADER_INLINE void memcpyx(void* __restrict target, const void* __restrict source, uint32_t ct, char extra_char) {
+  memcpy(target, source, ct);
+  ((char*)target)[ct] = extra_char;
+}
+
+HEADER_INLINE void memcpyl3(void* __restrict target, const void* __restrict source) {
+  // when it's safe to clobber the fourth character, this is faster
+  *((uint32_t*)target) = *((const uint32_t*)source);
+}
+
+HEADER_INLINE char* memcpyl3a(void* __restrict target, const void* __restrict source) {
+  memcpyl3(target, source);
+  return &(((char*)target)[3]);
+}
+
+HEADER_INLINE char* strcpya(char* __restrict target, const void* __restrict source) {
+  uintptr_t slen = strlen((const char*)source);
+  memcpy(target, source, slen);
+  return &(target[slen]);
+}
+
+HEADER_INLINE char* strcpyax(char* __restrict target, const void* __restrict source, char extra_char) {
+  uintptr_t slen = strlen((const char*)source);
+  memcpy(target, source, slen);
+  target[slen] = extra_char;
+  return &(target[slen + 1]);
+}
+
+// MinGW support for stpcpy is a mess, so I'll use a different name
+// ("strcpya0") which doesn't depend on MinGW knowing what it's doing
+#if defined(_GNU_SOURCE) || defined(__APPLE__) || (_POSIX_C_SOURCE >= 200809L)
+HEADER_INLINE char* strcpya0(char* __restrict target, const char* __restrict source) {
+  return stpcpy(target, source);
+}
+#else
+HEADER_INLINE char* strcpya0(char* __restrict target, const char* __restrict source) {
+  uintptr_t slen = strlen(source);
+  memcpy(target, source, slen + 1);
+  return &(target[slen]);
+}
+#endif
+
+HEADER_INLINE void append_binary_eoln(char** target_ptr) {
+#ifdef _WIN32
+  (*target_ptr)[0] = '\r';
+  (*target_ptr)[1] = '\n';
+  *target_ptr += 2;
+#else
+  **target_ptr = '\n';
+  *target_ptr += 1;
+#endif
+}
+
+HEADER_INLINE void decr_append_binary_eoln(char** target_ptr) {
+#ifdef _WIN32
+  (*target_ptr)[-1] = '\r';
+  (*target_ptr)[0] = '\n';
+  *target_ptr += 1;
+#else
+  (*target_ptr)[-1] = '\n';
+#endif
+}
+
+void get_top_two_ui(const uint32_t* __restrict uint_arr, uintptr_t uia_size, uintptr_t* __restrict top_idx_ptr, uintptr_t* __restrict second_idx_ptr);
+
+// safer than token_endnn(), since it handles length zero
+// "se" = space/eoln treated as terminators
+HEADER_INLINE uintptr_t strlen_se(const char* ss) {
+  const char* ss2 = ss;
+  while (!is_space_or_eoln(*ss2)) {
+    ss2++;
+  }
+  return (uintptr_t)(ss2 - ss);
+}
+
+// ok if sptr is at end of current token
+HEADER_INLINE char* next_token(char* sptr) {
+  if (!sptr) {
+    return nullptr;
+  }
+  unsigned char ucc = *sptr;
+  while (ucc > 32) {
+    ucc = *(++sptr);
+  }
+  while ((ucc == ' ') || (ucc == '\t')) {
+    ucc = *(++sptr);
+  }
+  return (ucc > 32)? sptr : nullptr;
+}
+
+HEADER_INLINE char* next_token_mult(char* sptr, uint32_t ct) {
+  // assert(ct);
+  if (!sptr) {
+    return nullptr;
+  }
+  unsigned char ucc = *sptr;
+  do {
+    while (ucc > 32) {
+      ucc = *(++sptr);
+    }
+    while ((ucc == ' ') || (ucc == '\t')) {
+      ucc = *(++sptr);
+    }
+    if (ucc <= 32) {
+      return nullptr;
+    }
+  } while (--ct);
+  return sptr;
+}
+
+HEADER_INLINE char* next_token_multz(char* sptr, uint32_t ct) {
+  // tried replacing this with ternary operator, but that actually seemed to
+  // slow things down a bit under gcc 4.2.1 (tail call optimization issue?).
+  // todo: recheck this under newer gcc/clang.
+  if (ct) {
+    return next_token_mult(sptr, ct);
+  }
+  return sptr;
+}
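+
+// Illustrative usage (not upstream code): grab the first and fourth tokens of
+// a line.
+//   char* first_token = skip_initial_spaces(line_start);
+//   char* fourth_token = next_token_mult(first_token, 3);
+//   if (no_more_tokens_kns(fourth_token)) {
+//     goto my_function_ret_MISSING_TOKENS; // hypothetical error label
+//   }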
+
+char* comma_or_space_next_token_mult(char* sptr, uint32_t ct, uint32_t comma_delim);
+
+uint32_t count_tokens(const char* bufptr);
+
+// uint32_t comma_or_space_count_tokens(const char* bufptr, uint32_t comma_delim);
+
+HEADER_INLINE char* fw_strcpyn(const char* source, uint32_t min_width, uint32_t source_len, char* dest) {
+  // right-justified strcpy with known source length
+  if (source_len < min_width) {
+    memcpy(memseta((unsigned char*)dest, ' ', min_width - source_len), source, source_len);
+    return &(dest[min_width]);
+  }
+  return (char*)memcpya(dest, source, source_len);
+}
+
+HEADER_INLINE char* fw_strcpy(const char* source, uint32_t min_width, char* dest) {
+  return fw_strcpyn(source, min_width, strlen(source), dest);
+}
+
+// uint32_t count_and_measure_multistr(const char* multistr, uintptr_t* max_blen_ptr);
+
+boolerr_t count_and_measure_multistr_reverse_alloc(char* multistr, uintptr_t max_str_ct, uint32_t* str_ct_ptr, uintptr_t* max_blen_ptr, char*** strptr_arrp);
+
+char* uint32toa(uint32_t uii, char* start);
+
+char* int32toa(int32_t ii, char* start);
+
+char* uitoa_z4(uint32_t uii, char* start);
+
+char* int64toa(int64_t llii, char* start);
+
+char* uitoa_trunc4(uint32_t uii, char* start);
+
+char* dtoa_g(double dxx, char* start);
+
+// We try to avoid micromanaging floating point printing and just use %g
+// everywhere, but occasionally we explicitly need more precision.
+//
+// dtoa_g_p8 provides generic 8-digit precision (instead of %g's 6-digit
+// default), while print_dosage provides up to 3 places after the decimal point
+// when dealing with dosages (which are internally represented as 32768ths).
+// (may want to replace _p8 with _p10 for perfect int32 handling.)
+char* dtoa_g_p8(double dxx, char* start);
+
+char* print_dosage(uint64_t dosage, char* start);
+
+// char* dtoa_f_p5_clipped(double dxx, char* start);
+
+char* ftoa_g(float fxx, char* start);
+
+HEADER_INLINE char* uint32toa_x(uint32_t uii, char extra_char, char* start) {
+  char* penult = uint32toa(uii, start);
+  *penult = extra_char;
+  return &(penult[1]);
+}
+
+// fill_uint_zero, fill_ulong_zero, fill_ull_zero, fill_ulong_one currently
+// defined in pgenlib_internal.h
+
+HEADER_INLINE void fill_vvec_zero(uintptr_t entry_ct, vul_t* vvec) {
+  memset(vvec, 0, entry_ct * kBytesPerVec);
+}
+
+HEADER_INLINE void fill_ull_one(uintptr_t entry_ct, uint64_t* ullarr) {
+  fill_ulong_one(entry_ct, (uintptr_t*)ullarr);
+}
+
+HEADER_INLINE void fill_int_zero(uintptr_t entry_ct, int32_t* iarr) {
+  memset(iarr, 0, entry_ct * sizeof(int32_t));
+}
+
+HEADER_INLINE void fill_int_one(uintptr_t entry_ct, int32_t* iarr) {
+  for (uintptr_t ulii = 0; ulii < entry_ct; ulii++) {
+    *iarr++ = -1;
+  }
+}
+
+HEADER_INLINE void fill_uint_one(uintptr_t entry_ct, uint32_t* uiarr) {
+  for (uintptr_t ulii = 0; ulii < entry_ct; ulii++) {
+    *uiarr++ = ~0U;
+  }
+}
+
+HEADER_INLINE void fill_float_zero(uintptr_t entry_ct, float* farr) {
+  for (uintptr_t ulii = 0; ulii < entry_ct; ulii++) {
+    *farr++ = 0.0;
+  }
+}
+
+HEADER_INLINE void fill_double_zero(uintptr_t entry_ct, double* darr) {
+  for (uintptr_t ulii = 0; ulii < entry_ct; ulii++) {
+    *darr++ = 0.0;
+  }
+}
+
+
+HEADER_INLINE void append_float_zero(uintptr_t entry_ct, float** farr_ptr) {
+  float* farr = *farr_ptr;
+  for (uintptr_t ulii = 0; ulii < entry_ct; ulii++) {
+    *farr++ = 0.0;
+  }
+  *farr_ptr = farr;
+}
+
+
+// void magic_num(uint32_t divisor, uint64_t* multp, uint32_t* __restrict pre_shiftp, uint32_t* __restrict post_shiftp, uint32_t* __restrict incrp);
+
+
+// fill_all_bits, IS_SET, SET_BIT, CLEAR_BIT, next_set_unsafe,
+// next_set_unsafe_ck, next_unset_unsafe, next_unset_unsafe_ck,
+// next_set, prev_set_unsafe, are_all_words_zero defined in pgenlib_internal.h
+
+// use this instead of IS_SET() for signed 32-bit integers
+HEADER_INLINE uint32_t is_set(const uintptr_t* bitarr, uint32_t loc) {
+  return (bitarr[loc / kBitsPerWord] >> (loc % kBitsPerWord)) & 1;
+}
+
+// useful for coercing int32_t loc to unsigned
+HEADER_INLINE void set_bit(uint32_t loc, uintptr_t* bitarr) {
+  bitarr[loc / kBitsPerWord] |= (k1LU << (loc % kBitsPerWord));
+}
+
+HEADER_INLINE void clear_bit(uint32_t loc, uintptr_t* bitarr) {
+  bitarr[loc / kBitsPerWord] &= ~(k1LU << (loc % kBitsPerWord));
+}
+
+#define FLIP_BIT(idx, arr) ((arr)[(idx) / kBitsPerWord] ^= k1LU << ((idx) % kBitsPerWord))
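+
+// Illustrative sketch of the signedness note above (some_signed_index is a
+// hypothetical variable): passing an int32_t to these wrappers converts it
+// to uint32_t once at the call boundary, so the / and % inside compile as
+// cheap unsigned operations.
+//
+//   int32_t loc = some_signed_index; // known to be nonnegative
+//   set_bit(loc, bitarr);            // implicit int32_t -> uint32_t
+//   if (is_set(bitarr, loc)) {
+//     clear_bit(loc, bitarr);
+//   }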
+
+// "_nz" added to names to make it obvious these require positive len
+void fill_bits_nz(uintptr_t start_idx, uintptr_t end_idx, uintptr_t* bitarr);
+void clear_bits_nz(uintptr_t start_idx, uintptr_t end_idx, uintptr_t* bitarr);
+
+#ifdef __LP64__
+uintptr_t next_set_ul_unsafe(const uintptr_t* bitarr, uintptr_t loc);
+#else
+HEADER_INLINE uintptr_t next_set_ul_unsafe(const uintptr_t* bitarr, uintptr_t loc) {
+  return (uintptr_t)next_set_unsafe(bitarr, loc);
+}
+#endif
+
+HEADER_INLINE void next_set_ul_unsafe_ck(const uintptr_t* __restrict bitarr, uintptr_t* __restrict loc_ptr) {
+  if (!IS_SET(bitarr, *loc_ptr)) {
+    *loc_ptr = next_set_ul_unsafe(bitarr, *loc_ptr);
+  }
+}
+
+uint32_t next_unset(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil);
+
+HEADER_INLINE uint32_t are_all_words_identical(const uintptr_t* word_arr1, const uintptr_t* word_arr2, uintptr_t word_ct) {
+  for (uintptr_t widx = 0; widx < word_ct; ++widx) {
+    if (word_arr1[widx] ^ word_arr2[widx]) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+
+boolerr_t bigstack_calloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr);
+
+boolerr_t bigstack_calloc_d(uintptr_t ct, double** d_arr_ptr);
+
+boolerr_t bigstack_calloc_f(uintptr_t ct, float** f_arr_ptr);
+
+boolerr_t bigstack_calloc_usi(uintptr_t ct, uint16_t** usi_arr_ptr);
+
+boolerr_t bigstack_calloc_ui(uintptr_t ct, uint32_t** ui_arr_ptr);
+
+boolerr_t bigstack_calloc_ul(uintptr_t ct, uintptr_t** ul_arr_ptr);
+ 
+boolerr_t bigstack_calloc_ull(uintptr_t ct, uint64_t** ull_arr_ptr);
+
+HEADER_INLINE boolerr_t bigstack_calloc_c(uintptr_t ct, char** c_arr_ptr) {
+  return bigstack_calloc_uc(ct, (unsigned char**)c_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_calloc_si(uintptr_t ct, int16_t** si_arr_ptr) {
+  return bigstack_calloc_usi(ct, (uint16_t**)si_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_calloc_i(uintptr_t ct, int32_t** i_arr_ptr) {
+  return bigstack_calloc_ui(ct, (uint32_t**)i_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_calloc_ll(uintptr_t ct, int64_t** ll_arr_ptr) {
+  return bigstack_calloc_ull(ct, (uint64_t**)ll_arr_ptr);
+}
+
+boolerr_t bigstack_end_calloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr);
+
+boolerr_t bigstack_end_calloc_d(uintptr_t ct, double** d_arr_ptr);
+
+boolerr_t bigstack_end_calloc_f(uintptr_t ct, float** f_arr_ptr);
+
+boolerr_t bigstack_end_calloc_ui(uintptr_t ct, uint32_t** ui_arr_ptr);
+
+boolerr_t bigstack_end_calloc_ul(uintptr_t ct, uintptr_t** ul_arr_ptr);
+
+boolerr_t bigstack_end_calloc_ull(uintptr_t ct, uint64_t** ull_arr_ptr);
+
+HEADER_INLINE boolerr_t bigstack_end_calloc_c(uintptr_t ct, char** c_arr_ptr) {
+  return bigstack_end_calloc_uc(ct, (unsigned char**)c_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_calloc_i(uintptr_t ct, int32_t** i_arr_ptr) {
+  return bigstack_end_calloc_ui(ct, (uint32_t**)i_arr_ptr);
+}
+
+HEADER_INLINE boolerr_t bigstack_end_calloc_ll(uintptr_t ct, int64_t** ll_arr_ptr) {
+  return bigstack_end_calloc_ull(ct, (uint64_t**)ll_arr_ptr);
+}
+
+
+// These ensure the trailing bits are zeroed out.
+void bitarr_invert(uintptr_t bit_ct, uintptr_t* bitarr);
+
+void bitarr_invert_copy(const uintptr_t* __restrict source_bitarr, uintptr_t bit_ct, uintptr_t* __restrict target_bitarr);
+
+// bitvec_and(), bitvec_andnot() in pgenlib_internal.h
+
+void bitvec_and_copy(const uintptr_t* __restrict source1_bitvec, const uintptr_t* __restrict source2_bitvec, uintptr_t word_ct, uintptr_t* target_bitvec);
+
+void bitvec_andnot_copy(const uintptr_t* __restrict source_bitvec, const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* target_bitvec);
+
+void bitvec_or(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec);
+
+void bitvec_andnot2(const uintptr_t* __restrict include_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);
+
+void set_het_missing(uintptr_t word_ct, uintptr_t* genovec);
+
+void genoarr_to_nonmissing(const uintptr_t* genoarr, uint32_t sample_ctl2, uintptr_t* nonmissing_bitarr);
+
+uint32_t genoarr_count_missing_notsubset_unsafe(const uintptr_t* genoarr, const uintptr_t* exclude_mask, uint32_t sample_ct);
+
+// dumb linear scan
+// returns -1 on failure to find, -2 if duplicate
+int32_t get_variant_uidx_without_htable(const char* idstr, char** variant_ids, const uintptr_t* variant_include, uint32_t variant_ct);
+
+// copy_subset() doesn't exist since a loop of the form
+//   uint32_t uidx = 0;
+//   for (uint32_t idx = 0; idx < subset_size; ++idx, ++uidx) {
+//     next_set_unsafe_ck(subset_mask, &uidx);
+//     *target_iter++ = source_arr[uidx];
+//   }
+// seems to compile better?
+
+// void copy_when_nonmissing(const uintptr_t* loadbuf, const void* source, uintptr_t elem_size, uintptr_t unfiltered_sample_ct, uintptr_t missing_ct, void* dest);
+
+
+// tried to replace this with a faster hash function, but it turns out to be
+// hard to meaningfully beat (and multithreading parts of hash table
+// construction solved most of the initialization time issue, anyway).
+// eventually want this to be a C++14 constexpr?
+uint32_t murmurhash3_32(const void* key, uint32_t len);
+
+// see http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+HEADER_INLINE uint32_t hashceil(const char* idstr, uint32_t idlen, uint32_t htable_size) {
+  return (((uint64_t)murmurhash3_32(idstr, idlen)) * htable_size) >> 32;
+}
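+
+// Worked example of the reduction above: murmurhash3_32() yields a roughly
+// uniform 32-bit value h, and ((uint64_t)h * htable_size) >> 32 maps it to
+// [0, htable_size) without a modulo.  With htable_size == 13 and
+// h == 0x80000000 (i.e. h / 2^32 == 0.5), the result is 6.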
+
+uintptr_t geqprime(uintptr_t floor);
+
+// assumes ceil is odd and greater than 4.  Returns the first prime <= ceil.
+uintptr_t leqprime(uintptr_t ceil);
+
+HEADER_INLINE uint32_t get_htable_min_size(uintptr_t item_ct) {
+  if (item_ct > 6) {
+    return geqprime(item_ct * 2 + 1);
+  }
+  return 13;
+}
+
+// load factor ~20% seems to yield the best speed/space tradeoff on my test
+// machines
+HEADER_INLINE uint32_t get_htable_fast_size(uint32_t item_ct) {
+  if (item_ct < 858993456) {
+    return geqprime(round_up_pow2(item_ct * 5, 2) + 1);
+  }
+  return 4294967291U;
+}
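+
+// Worked example of the sizing rule above: item_ct == 1000 yields
+// geqprime(5001), i.e. the first prime >= 5001, roughly 5x the item count
+// (the ~20% load factor).  4294967291 is the largest prime below 2^32, used
+// when item_ct * 5 would overflow a uint32_t.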
+
+boolerr_t htable_good_size_alloc(uint32_t item_ct, uintptr_t bytes_avail, uint32_t** htable_ptr, uint32_t* htable_size_ptr);
+
+// useful for duplicate detection: returns 0 if there are no duplicates, and
+// a positive index belonging to a duplicate pair if there is one
+uint32_t populate_strbox_htable(const char* strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable);
+
+// returned index in duplicate-pair case is unfiltered
+// uint32_t populate_strbox_subset_htable(const uintptr_t* __restrict subset_mask, const char* strbox, uintptr_t raw_str_ct, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable);
+
+// cur_id DOES need to be null-terminated
+uint32_t id_htable_find(const char* cur_id, char** item_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size);
+
+// assumes cur_id_slen < max_str_blen.
+// requires cur_id to be null-terminated.
+uint32_t strbox_htable_find(const char* cur_id, const char* strbox, const uint32_t* id_htable, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size);
+
+// last variant_ids entry must be at least kMaxIdBlen bytes before end of
+// bigstack
+uint32_t variant_id_dupflag_htable_find(const char* idbuf, char** variant_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen);
+
+uint32_t variant_id_dup_htable_find(const char* idbuf, char** variant_ids, const uint32_t* id_htable, const uint32_t* htable_dup_base, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen, uint32_t* llidx_ptr);
+
+char* scan_for_duplicate_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_blen);
+
+// Collapses array of sorted IDs to remove duplicates, and writes pre-collapse
+// positions to id_starts (so e.g. duplication count of any sample ID can be
+// determined via subtraction) if it isn't nullptr.
+// Returns id_ct of collapsed array.
+uint32_t collapse_duplicate_ids(uintptr_t id_ct, uintptr_t max_id_blen, char* sorted_ids, uint32_t* id_starts);
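+
+// Illustrative sketch of the id_starts bookkeeping above: if sorted_ids is
+// {"a", "a", "b"} with id_ct == 3, the collapsed array is {"a", "b"} and 2
+// is returned; id_starts records each ID's pre-collapse starting position
+// (0 for "a", 2 for "b"), so "a"'s duplication count follows by subtraction:
+// 2 - 0 == 2.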
+
+pglerr_t copy_sort_strbox_subset_noalloc(const uintptr_t* __restrict subset_mask, const char* __restrict orig_strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t allow_dups, uint32_t collapse_idxs, uint32_t use_nsort, char* __restrict sorted_strbox, uint32_t* __restrict id_map);
+
+pglerr_t copy_sort_strbox_subset(const uintptr_t* __restrict subset_mask, const char* __restrict orig_strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t allow_dups, uint32_t collapse_idxs, uint32_t use_nsort, char** sorted_strbox_ptr, uint32_t** id_map_ptr);
+
+
+// returns position of string, or -1 if not found.
+int32_t bsearch_str(const char* idbuf, const char* sorted_strbox, uintptr_t cur_id_slen, uintptr_t max_id_blen, uintptr_t end_idx);
+
+// requires null-terminated string
+int32_t bsearch_str_natural(const char* idbuf, const char* sorted_strbox, uintptr_t max_id_blen, uintptr_t end_idx);
+
+
+// returns number of elements in sorted_strbox[] less than idbuf.
+uintptr_t bsearch_str_lb(const char* idbuf, const char* sorted_strbox, uintptr_t cur_id_slen, uintptr_t max_id_blen, uintptr_t end_idx);
+
+// this is frequently preferable to bsearch_str(), since it's way too easy to
+// forget to convert the sorted-stringbox index to the final index.
+// id_map == nullptr is permitted; in this case the returned id is an index
+// into the sorted array
+HEADER_INLINE boolerr_t sorted_idbox_find(const char* idbuf, const char* sorted_idbox, const uint32_t* id_map, uintptr_t cur_id_slen, uintptr_t max_id_blen, uintptr_t end_idx, uint32_t* id_ptr) {
+  const int32_t ii = bsearch_str(idbuf, sorted_idbox, cur_id_slen, max_id_blen, end_idx);
+  if (ii == -1) {
+    return 1;
+  }
+  *id_ptr = id_map? id_map[(uint32_t)ii] : ((uint32_t)ii);
+  return 0;
+}
+
+uint32_t sid_col_required(const uintptr_t* sample_include, const char* sids, uint32_t sample_ct, uint32_t max_sid_blen, uint32_t maybe_modifier);
+
+// forced SID '0' if sids == nullptr
+// ok for sample_augid_map_ptr == nullptr
+pglerr_t augid_init_alloc(const uintptr_t* sample_include, const char* sample_ids, const char* sids, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t** sample_augid_map_ptr, char** sample_augids_ptr, uintptr_t* max_sample_augid_blen_ptr);
+
+HEADER_INLINE double get_nonmaj_freq(const double* cur_allele_freqs, uint32_t cur_allele_ct) {
+  double tot_nonlast_freq = cur_allele_freqs[0];
+  double max_freq = tot_nonlast_freq;
+  const uint32_t cur_allele_ct_m1 = cur_allele_ct - 1;
+  for (uint32_t allele_idx = 1; allele_idx < cur_allele_ct_m1; ++allele_idx) {
+    const double cur_alt_freq = cur_allele_freqs[allele_idx];
+    tot_nonlast_freq += cur_alt_freq;
+    if (cur_alt_freq > max_freq) {
+      max_freq = cur_alt_freq;
+    }
+  }
+  const double nonmajor_freq = 1.0 - max_freq;
+  return MINV(nonmajor_freq, tot_nonlast_freq);
+}
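+
+// Worked example for get_nonmaj_freq(): with cur_allele_ct == 3 and
+// cur_allele_freqs == {0.5, 0.3} (the final allele's 0.2 is implicit),
+// tot_nonlast_freq == 0.8 and max_freq == 0.5, so this returns
+// MINV(1.0 - 0.5, 0.8) == 0.5.  The MINV covers the case where the
+// untracked final allele is actually the major allele.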
+
+HEADER_INLINE double get_allele_freq(const double* cur_allele_freqs, uint32_t allele_idx, uint32_t cur_allele_ct) {
+  const uint32_t cur_allele_ct_m1 = cur_allele_ct - 1;
+  if (allele_idx < cur_allele_ct_m1) {
+    return cur_allele_freqs[allele_idx];
+  }
+  double last_freq = 1.0 - cur_allele_freqs[0];
+  for (uint32_t tmp_allele_idx = 1; tmp_allele_idx < cur_allele_ct_m1; ++tmp_allele_idx) {
+    last_freq -= cur_allele_freqs[tmp_allele_idx];
+  }
+  // possible todo: force this to be nonnegative?
+  return last_freq;
+}
+
+
+FLAGSET_DEF_START()
+  kfXidMode0,
+  
+  kfXidModeFlagOneTokenOk = (1 << 0),
+  kfXidModeFlagNeverFid = (1 << 1),
+  kfXidModeFlagSid = (1 << 2),
+
+  kfXidModeFidiid = 0,
+  kfXidModeFidiidOrIid = kfXidModeFlagOneTokenOk,
+  kfXidModeIid = (kfXidModeFlagOneTokenOk | kfXidModeFlagNeverFid),
+  kfXidModeFidiidSid = kfXidModeFlagSid,
+  kfXidModeIidSid = (kfXidModeFlagNeverFid | kfXidModeFlagSid)
+FLAGSET_DEF_END(xid_mode_t);
+
+// sample_xid_map allocated on bottom, to play well with --indiv-sort
+pglerr_t sorted_xidbox_init_alloc(const uintptr_t* sample_include, const char* sample_ids, const char* sids, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, xid_mode_t xid_mode, uint32_t use_nsort, char** sorted_xidbox_ptr, uint32_t** xid_map_ptr, uintptr_t* max_xid_blen_ptr);
+
+// returns 1 on missing token *or* if the sample ID is not present.  cases can
+// be distinguished by checking whether *read_pp == nullptr: if it is, a
+// missing-tokens error should probably be reported.
+// sample_id_map == nullptr is permitted
+// *read_pp is now set to point to the end of IID/SID instead of the beginning
+// of the next token; this is a change from plink 1.9.
+boolerr_t sorted_xidbox_read_find(const char* __restrict sorted_xidbox, const uint32_t* __restrict xid_map, uintptr_t max_xid_blen, uintptr_t end_idx, uint32_t comma_delim, xid_mode_t xid_mode, char** read_pp, uint32_t* sample_uidx_ptr, char* __restrict idbuf);
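+
+// Hedged usage sketch (the surrounding line parsing is illustrative): after
+// sorted_xidbox_init_alloc() succeeds,
+//
+//   char* read_ptr = first_token_of_line;
+//   uint32_t sample_uidx;
+//   if (sorted_xidbox_read_find(sorted_xidbox, xid_map, max_xid_blen,
+//                               sample_ct, 0, xid_mode, &read_ptr,
+//                               &sample_uidx, idbuf)) {
+//     if (!read_ptr) {
+//       // missing token(s): report a malformed-line error
+//     }
+//     // otherwise the ID isn't in the set; usually just skip the line
+//   }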
+
+ENUM_U31_DEF_START()
+  kSidDetectModeNotLoaded,
+  kSidDetectModeLoaded,
+  kSidDetectModeForce
+ENUM_U31_DEF_END(sid_detect_mode_t);
+
+
+typedef struct range_list_struct {
+  char* names;
+  unsigned char* starts_range;
+  uint32_t name_ct;
+  uint32_t name_max_blen;
+} range_list_t;
+
+void init_range_list(range_list_t* range_list_ptr);
+
+void cleanup_range_list(range_list_t* range_list_ptr);
+
+// bitarr assumed to be initialized (but not necessarily zero-initialized)
+boolerr_t numeric_range_list_to_bitarr(const range_list_t* range_list_ptr, uint32_t bitarr_size, uint32_t offset, uint32_t ignore_overflow, uintptr_t* bitarr);
+
+pglerr_t string_range_list_to_bitarr(char* header_line, const range_list_t* range_list_ptr, const char* __restrict sorted_ids, const uint32_t* __restrict id_map, const char* __restrict range_list_flag, const char* __restrict file_descrip, uint32_t token_ct, uint32_t fixed_len, uint32_t comma_delim, uintptr_t* bitarr, int32_t* __restrict seen_idxs);
+
+pglerr_t string_range_list_to_bitarr_alloc(char* header_line, const range_list_t* range_list_ptr, const char* __restrict range_list_flag, const char* __restrict file_descrip, uint32_t token_ct, uint32_t fixed_len, uint32_t comma_delim, uintptr_t** bitarr_ptr);
+
+
+HEADER_INLINE uint32_t realnum(double dd) {
+  return (dd == dd) && (dd != INFINITY) && (dd != -INFINITY);
+}
+
+// note that kMaxContigs is no longer divisible by 64
+CONSTU31(kMaxContigs, 65274);
+
+// change chr_idx_t to uint32_t if (kMaxContigs + kChrOffsetCt) > 65536
+typedef uint16_t chr_idx_t;
+
+// get_htable_min_size(kChrRawEnd) (use constexpr once sufficient
+// compiler support is available)
+// (not get_htable_fast_size since, an overwhelming majority of the time, we'll
+// have far fewer than 2^16 codes)
+CONSTU31(kChrHtableSize, 130579);
+
+// (note that n+1, n+2, n+3, and n+4 are reserved for X/Y/XY/MT)
+CONSTU31(kMaxChrTextnum, 95);
+
+// get_chr_code_raw() needs to be modified if this changes
+CONSTU31(kMaxChrTextnumSlen, 2);
+
+ENUM_U31_DEF_START()
+  kChrOffsetX,
+  kChrOffsetY,
+
+  // old way of representing pseudo-autosomal regions.  clumsy since this
+  // required changing chromosome order
+  kChrOffsetXY,
+  
+  kChrOffsetMT,
+
+  // plink 2.x pseudo-autosomal regions.
+  kChrOffsetPAR1,
+  kChrOffsetPAR2,
+  kChrOffsetCt
+ENUM_U31_DEF_END(xymt_offset_t);
+
+CONSTU31(kChrRawX, kMaxContigs + kChrOffsetX);
+CONSTU31(kChrRawY, kMaxContigs + kChrOffsetY);
+CONSTU31(kChrRawXY, kMaxContigs + kChrOffsetXY);
+CONSTU31(kChrRawMT, kMaxContigs + kChrOffsetMT);
+CONSTU31(kChrRawPAR1, kMaxContigs + kChrOffsetPAR1);
+CONSTU31(kChrRawPAR2, kMaxContigs + kChrOffsetPAR2);
+CONSTU31(kChrRawEnd, kMaxContigs + kChrOffsetCt);
+
+static_assert((!(kChrRawEnd % kBitsPerWord)), "kChrRawEnd expression must be updated.");
+CONSTU31(kChrMaskWords, kChrRawEnd / kBitsPerWord);
+
+#ifdef __LP64__
+CONSTU31(kChrExcludeWords, 2);
+#else
+CONSTU31(kChrExcludeWords, 4);
+#endif
+static_assert(kChrExcludeWords * kBitsPerWord >= kMaxChrTextnum + 2 * kChrOffsetCt + 1, "kChrExcludeWords must be updated.");
+
+ENUM_U31_DEF_START()
+  kChrsetSourceDefault,
+  kChrsetSourceCmdline,
+  kChrsetSourceFile
+ENUM_U31_DEF_END(chrset_source_t);
+
+FLAGSET_DEF_START()
+  kfChrOutput0,
+  kfChrOutputPrefix = (1 << 0),
+  kfChrOutputM = (1 << 1),
+  kfChrOutputMT = (1 << 2),
+  kfChrOutput0M = (1 << 3)
+FLAGSET_DEF_END(chr_output_t);
+
+typedef struct {
+  // Main dynamic block, intended to be allocated as a single aligned block
+  // of heap memory (freeable with vecaligned_free()), with chr_mask at the
+  // base.
+
+  uintptr_t* chr_mask; // which chromosomes aren't known to be absent?
+  // This is a misnomer--it includes X and excludes MT.  Underlying concept is
+  // "are some calls guaranteed to be homozygous (assuming >= 1 male)", which
+  // is no longer true for MT since heteroplasmy is a thing.  (Well, the real
+  // goal with MT is to enable dosage-based analysis, but until all pipelines
+  // have adapted, diploid data handling loses slightly less information than
+  // haploid.)
+  uintptr_t* haploid_mask;
+
+  // order of chromosomes in input files
+  // currently tolerates out-of-order chromosomes, as long as all variants for
+  // any given chromosome are together
+  uint32_t* chr_file_order;
+  
+  // if the second chromosome in the dataset is chr5, chr_file_order[1] == 5,
+  // the raw variant indexes for chr5 are in [chr_fo_vidx_start[1],
+  // chr_fo_vidx_start[2]), and chr_idx_to_foidx[5] == 1.
+  uint32_t* chr_fo_vidx_start;
+  uint32_t* chr_idx_to_foidx;
+
+  // --allow-extra-chr support
+  char** nonstd_names;
+  uint32_t* nonstd_id_htable;
+  // end main dynamic block
+
+  uint32_t chr_ct; // number of distinct chromosomes/contigs
+  chrset_source_t chrset_source;
+
+  uintptr_t chr_exclude[kChrExcludeWords];
+  int32_t xymt_codes[kChrOffsetCt]; // X, Y, XY...; -2 = not in chromosome set
+  uint32_t max_numeric_code;
+  uint32_t max_code; // no longer identical to max_numeric_code, with PARs
+
+  uint32_t autosome_ct;
+
+  // yet more --allow-extra-chr support
+  uint32_t zero_extra_chrs;
+  uint32_t name_ct;
+  ll_str_t* incl_excl_name_stack;
+  uint32_t is_include_stack;
+  chr_output_t output_encoding;
+} chr_info_t;
+
+extern const char g_xymt_log_names[][5];
+
+pglerr_t init_chr_info(chr_info_t* cip);
+
+void finalize_chrset(misc_flags_t misc_flags, chr_info_t* cip);
+
+HEADER_INLINE pglerr_t init_chr_info_human(chr_info_t* cip) {
+  // convenience wrapper
+  if (init_chr_info(cip)) {
+    return kPglRetNomem;
+  }
+  finalize_chrset(kfMisc0, cip);
+  return kPglRetSuccess;
+}
+
+void forget_extra_chr_names(uint32_t reinitialize, chr_info_t* cip);
+
+// in the usual case where the number of chromosomes/contigs is much less than
+// kMaxContigs, this reduces chr_info's memory consumption and improves
+// locality.
+pglerr_t finalize_chr_info(chr_info_t* cip);
+
+void cleanup_chr_info(chr_info_t* cip);
+
+char* chr_name_write(const chr_info_t* cip, uint32_t chr_idx, char* buf);
+
+uint32_t get_max_chr_slen(const chr_info_t* cip);
+
+uint32_t haploid_chr_present(const chr_info_t* cip);
+
+// any character <= ' ' is considered a terminator
+// maps chrX -> kChrRawX, etc.
+int32_t get_chr_code_raw(const char* sptr);
+
+// requires chr_name to be null-terminated
+// maps chrX -> xymt_codes[kChrOffsetX], etc.
+// error codes:
+//   -1 = --allow-extra-chr ok
+//   -2 = total fail
+int32_t get_chr_code(const char* chr_name, const chr_info_t* cip, uint32_t name_slen);
+
+// when the chromosome name isn't null-terminated
+// requires chr_name[name_slen] to be mutable
+int32_t get_chr_code_counted(const chr_info_t* cip, uint32_t name_slen, char* chr_name);
+
+HEADER_INLINE uint32_t get_variant_chr_fo_idx(const chr_info_t* cip, uintptr_t variant_uidx) {
+  return uint32arr_greater_than(&(cip->chr_fo_vidx_start[1]), cip->chr_ct, variant_uidx + 1);
+}
+
+HEADER_INLINE uint32_t get_variant_chr(const chr_info_t* cip, uintptr_t variant_uidx) {
+  return cip->chr_file_order[get_variant_chr_fo_idx(cip, variant_uidx)];
+}
+
+HEADER_INLINE uint32_t xymt_exists(const chr_info_t* cip, uint32_t xymt_offset, int32_t* xymt_code_ptr) {
+  // too easy to forget is_set(chr_mask) check if we don't use this
+  const int32_t xymt_code = cip->xymt_codes[xymt_offset];
+  *xymt_code_ptr = xymt_code;
+  return (xymt_code >= 0) && is_set(cip->chr_mask, xymt_code);
+}
+
+HEADER_INLINE void get_xymt_start_and_end(const chr_info_t* cip, uint32_t xymt_offset, uint32_t* xymt_start_ptr, uint32_t* xymt_end_ptr) {
+  int32_t xymt_code;
+  if (!xymt_exists(cip, xymt_offset, &xymt_code)) {
+    *xymt_start_ptr = 0;
+    *xymt_end_ptr = 0;
+    return;
+  }
+  const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)xymt_code];
+  *xymt_start_ptr = cip->chr_fo_vidx_start[chr_fo_idx];
+  *xymt_end_ptr = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+}
+
+HEADER_INLINE void get_xymt_code_start_and_end_unsafe(const chr_info_t* cip, uint32_t xymt_offset, int32_t* xymt_code_ptr, uint32_t* xymt_start_ptr, uint32_t* xymt_end_ptr) {
+  // assumes xymt_exists was previously called, and is true
+  const int32_t xymt_code = cip->xymt_codes[xymt_offset];
+  *xymt_code_ptr = xymt_code;
+  const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)xymt_code];
+  *xymt_start_ptr = cip->chr_fo_vidx_start[chr_fo_idx];
+  *xymt_end_ptr = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+}
+
+// now assumes chr_name is null-terminated
+pglerr_t try_to_add_chr_name(const char* chr_name, const char* file_descrip, uintptr_t line_idx, uint32_t name_slen, uint32_t allow_extra_chrs, int32_t* chr_idx_ptr, chr_info_t* cip);
+
+HEADER_INLINE pglerr_t get_or_add_chr_code(const char* chr_name, const char* file_descrip, uintptr_t line_idx, uint32_t name_slen, uint32_t allow_extra_chrs, chr_info_t* cip, int32_t* chr_idx_ptr) {
+  *chr_idx_ptr = get_chr_code(chr_name, cip, name_slen);
+  if (*chr_idx_ptr >= 0) {
+    return kPglRetSuccess;
+  }
+  return try_to_add_chr_name(chr_name, file_descrip, line_idx, name_slen, allow_extra_chrs, chr_idx_ptr, cip);
+}
+
+HEADER_INLINE pglerr_t get_or_add_chr_code_destructive(const char* file_descrip, uintptr_t line_idx, uint32_t allow_extra_chrs, char* chr_name, char* chr_name_end, chr_info_t* cip, int32_t* chr_idx_ptr) {
+  *chr_name_end = '\0';
+  return get_or_add_chr_code(chr_name, file_descrip, line_idx, (uintptr_t)(chr_name_end - chr_name), allow_extra_chrs, cip, chr_idx_ptr);
+}
+
+#ifdef __LP64__
+HEADER_INLINE uintptr_t popcount_longs_nzbase(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) {
+  uintptr_t prefix_ct = 0;
+  #ifdef USE_AVX2
+  while (start_idx & 3) {
+    if (end_idx == start_idx) {
+      return prefix_ct;
+    }
+    // this loop can execute up to 3 times, so accumulate rather than assign
+    prefix_ct += popcount_long(bitvec[start_idx++]);
+  }
+  #else
+  if (start_idx & 1) {
+    if (end_idx == start_idx) {
+      return 0;
+    }
+    prefix_ct = popcount_long(bitvec[start_idx++]);
+  }
+  #endif // USE_AVX2
+  return prefix_ct + popcount_longs(&(bitvec[start_idx]), end_idx - start_idx);
+}
+#else
+HEADER_INLINE uintptr_t popcount_longs_nzbase(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) {
+  return popcount_longs(&(bitvec[start_idx]), end_idx - start_idx);
+}
+#endif
+
+uintptr_t popcount_bit_idx(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx);
+
+uintptr_t popcount_longs_intersect(const uintptr_t* __restrict bitvec1_iter, const uintptr_t* __restrict bitvec2_iter, uintptr_t word_ct);
+
+// uintptr_t count_11_longs(const uintptr_t* genovec, uintptr_t word_ct);
+
+uint32_t are_all_bits_zero(const uintptr_t* bitarr, uintptr_t start_idx, uintptr_t end_idx);
+
+// assumes len is positive, and relevant bits of target_bitarr are zero
+void copy_bitarr_range(const uintptr_t* __restrict src_bitarr, uintptr_t src_start_bitidx, uintptr_t target_start_bitidx, uintptr_t len, uintptr_t* __restrict target_bitarr);
+
+// zeroes out samples not in the mask
+void interleaved_mask_zero(const uintptr_t* __restrict interleaved_mask, uintptr_t vec_ct, uintptr_t* __restrict genovec);
+
+// sets samples in the mask to missing (0b11)
+void interleaved_set_missing(const uintptr_t* __restrict interleaved_set, uintptr_t vec_ct, uintptr_t* __restrict genovec);
+
+void set_male_het_missing(const uintptr_t* __restrict sex_male_interleaved, uint32_t vec_ct, uintptr_t* __restrict genovec);
+
+// Clears each bit in bitarr which doesn't correspond to a genovec het.
+// Assumes that either trailing bits of bitarr are already zero, or trailing
+// bits of genovec are zero.
+void mask_genovec_hets_unsafe(const uintptr_t* __restrict genovec, uint32_t raw_sample_ctl2, uintptr_t* __restrict bitarr);
+
+// vertical popcount support
+#ifdef __LP64__
+static_assert(kBytesPerVec == 16, "scramble_2_4_8_32() assumes kBytesPerVec == 16.");
+HEADER_INLINE uint32_t scramble_2_4_8_32(uint32_t orig_idx) {
+  return (orig_idx & (~63)) + ((orig_idx & 1) * 32) + ((orig_idx & 2) * 8) + (orig_idx & 12) + ((orig_idx & 48) / 16);
+}
+#else
+// 2->4: 0 2 4 6 8 10 12 14 1 3 5 ...
+// 4->8: 0 4 8 12 2 6 10 14 1 5 9 ...
+// 8->32: 0 4 8 12 2 6 10 14 1 5 9 13 3 7 11 15
+HEADER_INLINE uint32_t scramble_2_4_8_32(uint32_t orig_idx) {
+  return (orig_idx & (~15)) + ((orig_idx & 1) * 8) + ((orig_idx & 2) * 2) + ((orig_idx & 12) / 4);
+}
+#endif
+
+// probable todo: switch to vul_t* parameters
+HEADER_INLINE void unroll_incr_2_4(const uintptr_t* acc2, uint32_t acc2_vec_ct, uintptr_t* acc4) {
+  const vul_t m2 = VCONST_UL(kMask3333);
+  const vul_t* acc2v_iter = (const vul_t*)acc2;
+  vul_t* acc4v_iter = (vul_t*)acc4;
+  for (uint32_t vidx = 0; vidx < acc2_vec_ct; ++vidx) {
+    vul_t loader = *acc2v_iter++;
+    *acc4v_iter = (*acc4v_iter) + (loader & m2);
+    ++acc4v_iter;
+    loader = vul_rshift(loader, 2);
+    *acc4v_iter = (*acc4v_iter) + (loader & m2);
+    ++acc4v_iter;
+  }
+}
+
+HEADER_INLINE void unroll_zero_incr_2_4(uint32_t acc2_vec_ct, uintptr_t* acc2, uintptr_t* acc4) {
+  const vul_t m2 = VCONST_UL(kMask3333);
+  vul_t* acc2v_iter = (vul_t*)acc2;
+  vul_t* acc4v_iter = (vul_t*)acc4;
+  for (uint32_t vidx = 0; vidx < acc2_vec_ct; ++vidx) {
+    vul_t loader = *acc2v_iter;
+    *acc2v_iter++ = vul_setzero();
+    *acc4v_iter = (*acc4v_iter) + (loader & m2);
+    ++acc4v_iter;
+    loader = vul_rshift(loader, 2);
+    *acc4v_iter = (*acc4v_iter) + (loader & m2);
+    ++acc4v_iter;
+  }
+}
+
+// er, should this just be the same function as unroll_incr_2_4 with extra
+// parameters?...
+HEADER_INLINE void unroll_incr_4_8(const uintptr_t* acc4, uint32_t acc4_vec_ct, uintptr_t* acc8) {
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  const vul_t* acc4v_iter = (const vul_t*)acc4;
+  vul_t* acc8v_iter = (vul_t*)acc8;
+  for (uint32_t vidx = 0; vidx < acc4_vec_ct; ++vidx) {
+    vul_t loader = *acc4v_iter++;
+    *acc8v_iter = (*acc8v_iter) + (loader & m4);
+    ++acc8v_iter;
+    loader = vul_rshift(loader, 4);
+    *acc8v_iter = (*acc8v_iter) + (loader & m4);
+    ++acc8v_iter;
+  }
+}
+
+HEADER_INLINE void unroll_zero_incr_4_8(uint32_t acc4_vec_ct, uintptr_t* acc4, uintptr_t* acc8) {
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  vul_t* acc4v_iter = (vul_t*)acc4;
+  vul_t* acc8v_iter = (vul_t*)acc8;
+  for (uint32_t vidx = 0; vidx < acc4_vec_ct; ++vidx) {
+    vul_t loader = *acc4v_iter;
+    *acc4v_iter++ = vul_setzero();
+    *acc8v_iter = (*acc8v_iter) + (loader & m4);
+    ++acc8v_iter;
+    loader = vul_rshift(loader, 4);
+    *acc8v_iter = (*acc8v_iter) + (loader & m4);
+    ++acc8v_iter;
+  }
+}
+
+HEADER_INLINE void unroll_incr_8_32(const uintptr_t* acc8, uint32_t acc8_vec_ct, uintptr_t* acc32) {
+  const vul_t m8x32 = VCONST_UL(kMask000000FF);
+  const vul_t* acc8v_iter = (const vul_t*)acc8;
+  vul_t* acc32v_iter = (vul_t*)acc32;
+  for (uint32_t vidx = 0; vidx < acc8_vec_ct; ++vidx) {
+    vul_t loader = *acc8v_iter++;
+    *acc32v_iter = (*acc32v_iter) + (loader & m8x32);
+    ++acc32v_iter;
+    loader = vul_rshift(loader, 8);
+    *acc32v_iter = (*acc32v_iter) + (loader & m8x32);
+    ++acc32v_iter;
+    loader = vul_rshift(loader, 8);
+    *acc32v_iter = (*acc32v_iter) + (loader & m8x32);
+    ++acc32v_iter;
+    loader = vul_rshift(loader, 8);
+    *acc32v_iter = (*acc32v_iter) + (loader & m8x32);
+    ++acc32v_iter;
+  }
+}
+
+HEADER_INLINE void unroll_zero_incr_8_32(uint32_t acc8_vec_ct, uintptr_t* acc8, uintptr_t* acc32) {
+  const vul_t m8x32 = VCONST_UL(kMask000000FF);
+  vul_t* acc8v_iter = (vul_t*)acc8;
+  vul_t* acc32v_iter = (vul_t*)acc32;
+  for (uint32_t vidx = 0; vidx < acc8_vec_ct; ++vidx) {
+    vul_t loader = *acc8v_iter;
+    *acc8v_iter++ = vul_setzero();
+    *acc32v_iter = (*acc32v_iter) + (loader & m8x32);
+    ++acc32v_iter;
+    loader = vul_rshift(loader, 8);
+    *acc32v_iter = (*acc32v_iter) + (loader & m8x32);
+    ++acc32v_iter;
+    loader = vul_rshift(loader, 8);
+    *acc32v_iter = (*acc32v_iter) + (loader & m8x32);
+    ++acc32v_iter;
+    loader = vul_rshift(loader, 8);
+    *acc32v_iter = (*acc32v_iter) + (loader & m8x32);
+    ++acc32v_iter;
+  }
+}
+
+#ifdef __LP64__
+static_assert(kBytesPerVec == 16, "scramble_1_4_8_32() assumes kBytesPerVec == 16.");
+HEADER_INLINE uint32_t scramble_1_4_8_32(uint32_t orig_idx) {
+  // 1->4: 0 4 8 12 16 20 24 28 32 ... 124 1 5 9 ...
+  // 4->8: 0 8 16 24 32 ... 120 4 12 20 ... 1 9 17 ...
+  // 8->32: 0 32 64 96 8 40 72 104 16 48 80 112 24 56 88 120 4 36 68 ... 1 33 ...
+  return (orig_idx & (~127)) + ((orig_idx & 3) * 32) + ((orig_idx & 4) * 4) + ((orig_idx & 24) / 2) + ((orig_idx & 96) / 32);
+}
+#else
+// 1->4: 0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29 2 6 10 ...
+// 4->8: 0 8 16 24 4 12 20 28 1 9 17 25 5 13 21 29 2 10 18 ...
+// 8->32: 0 8 16 24 4 12 20 28 1 9 17 25 5 13 21 29 2 10 18 ...
+HEADER_INLINE uint32_t scramble_1_4_8_32(uint32_t orig_idx) {
+  return (orig_idx & (~31)) + ((orig_idx & 3) * 8) + (orig_idx & 4) + ((orig_idx & 24) / 8);
+}
+#endif
+
+HEADER_INLINE void unroll_incr_1_4(const uintptr_t* acc1, uint32_t acc1_vec_ct, uintptr_t* acc4) {
+  const vul_t m1x4 = VCONST_UL(kMask1111);
+  const vul_t* acc1v_iter = (const vul_t*)acc1;
+  vul_t* acc4v_iter = (vul_t*)acc4;
+  for (uint32_t vidx = 0; vidx < acc1_vec_ct; ++vidx) {
+    vul_t loader = *acc1v_iter++;
+    *acc4v_iter = (*acc4v_iter) + (loader & m1x4);
+    ++acc4v_iter;
+    loader = vul_rshift(loader, 1);
+    *acc4v_iter = (*acc4v_iter) + (loader & m1x4);
+    ++acc4v_iter;
+    loader = vul_rshift(loader, 1);
+    *acc4v_iter = (*acc4v_iter) + (loader & m1x4);
+    ++acc4v_iter;
+    loader = vul_rshift(loader, 1);
+    *acc4v_iter = (*acc4v_iter) + (loader & m1x4);
+    ++acc4v_iter;
+  }
+}
+
+HEADER_INLINE void unroll_zero_incr_1_4(uint32_t acc1_vec_ct, uintptr_t* acc1, uintptr_t* acc4) {
+  const vul_t m1x4 = VCONST_UL(kMask1111);
+  vul_t* acc1v_iter = (vul_t*)acc1;
+  vul_t* acc4v_iter = (vul_t*)acc4;
+  for (uint32_t vidx = 0; vidx < acc1_vec_ct; ++vidx) {
+    vul_t loader = *acc1v_iter;
+    *acc1v_iter++ = vul_setzero();
+    *acc4v_iter = (*acc4v_iter) + (loader & m1x4);
+    ++acc4v_iter;
+    loader = vul_rshift(loader, 1);
+    *acc4v_iter = (*acc4v_iter) + (loader & m1x4);
+    ++acc4v_iter;
+    loader = vul_rshift(loader, 1);
+    *acc4v_iter = (*acc4v_iter) + (loader & m1x4);
+    ++acc4v_iter;
+    loader = vul_rshift(loader, 1);
+    *acc4v_iter = (*acc4v_iter) + (loader & m1x4);
+    ++acc4v_iter;
+  }
+}
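+
+// Hedged usage sketch of the vertical popcount pipeline above (flush periods
+// are illustrative; the safe bound is however many increments each lane
+// width can absorb before overflow, e.g. 15 one-bit adds per 4-bit lane):
+//
+//   unroll_zero_incr_1_4(acc1_vec_ct, acc1, acc4);   // 1-bit -> 4-bit lanes
+//   // ...after up to 15 flushes of acc1:
+//   unroll_zero_incr_4_8(acc4_vec_ct, acc4, acc8);   // 4-bit -> 8-bit lanes
+//   // ...and eventually:
+//   unroll_zero_incr_8_32(acc8_vec_ct, acc8, acc32); // 8-bit -> 32-bit
+//
+// scramble_1_4_8_32() (or scramble_2_4_8_32() for 2-bit input) maps an
+// original item index to its final position within acc32 after these
+// widening steps.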
+
+
+// uint32_t chr_window_max(const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_pos, uint32_t chr_fo_idx, uint32_t ct_max, uint32_t bp_max, uint32_t cur_window_max);
+
+// advances forward_ct set bits; forward_ct must be positive.  (stays put if
+// forward_ct == 1 and current bit is set.  may want to tweak this interface,
+// easy to introduce off-by-one bugs...)
+// In usual 64-bit case, also assumes bitvec is 16-byte aligned and the end of
+// the trailing 16-byte block can be safely read from.
+uintptr_t jump_forward_set_unsafe(const uintptr_t* bitvec, uintptr_t cur_pos, uintptr_t forward_ct);
+
+// ...and here's the obvious tweaked interface.
+HEADER_INLINE uint32_t idx_to_uidx_basic(const uintptr_t* bitvec, uint32_t idx) {
+  return jump_forward_set_unsafe(bitvec, 0, idx + 1);
+}
+
+// variant_ct must be positive, but can be smaller than thread_ct
+void compute_uidx_start_partition(const uintptr_t* variant_include, uint64_t variant_ct, uint32_t thread_ct, uint32_t first_uidx, uint32_t* variant_uidx_starts);
+
+HEADER_INLINE uint32_t count_chr_variants_unsafe(const uintptr_t* variant_include, const chr_info_t* cip, uint32_t chr_idx) {
+  assert(is_set(cip->chr_mask, chr_idx));
+  const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[chr_idx];
+  const uint32_t min_idx = cip->chr_fo_vidx_start[chr_fo_idx];
+  const uint32_t max_idx = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+  return popcount_bit_idx(variant_include, min_idx, max_idx);
+}
+
+HEADER_INLINE uint32_t chr_is_nonempty(const uintptr_t* variant_include, const chr_info_t* cip, uint32_t chr_idx) {
+  if (!is_set(cip->chr_mask, chr_idx)) {
+    return 0;
+  }
+  const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[chr_idx];
+  const uint32_t min_idx = cip->chr_fo_vidx_start[chr_fo_idx];
+  const uint32_t max_idx = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+  return !are_all_bits_zero(variant_include, min_idx, max_idx);
+}
+
+HEADER_INLINE uint32_t xymt_is_nonempty(const uintptr_t* variant_include, const chr_info_t* cip, uint32_t xymt_offset) {
+  const int32_t xymt_code = cip->xymt_codes[xymt_offset];
+  if ((xymt_code < 0) || (!is_set(cip->chr_mask, xymt_code))) {
+    return 0;
+  }
+  return chr_is_nonempty(variant_include, cip, (uint32_t)xymt_code);
+}
+
+// assumes there's at least one variant on specified chromosome
+uint32_t not_only_xymt(const uintptr_t* variant_include, const chr_info_t* cip, uint32_t raw_variant_ct, uint32_t xymt_offset);
+
+uint32_t count_non_autosomal_variants(const uintptr_t* variant_include, const chr_info_t* cip, uint32_t count_x, uint32_t count_mt);
+
+pglerr_t conditional_allocate_non_autosomal_variants(const chr_info_t* cip, const char* calc_descrip, uint32_t raw_variant_ct, uintptr_t** variant_include_ptr, uint32_t* variant_ct_ptr);
+
+void fill_subset_chr_fo_vidx_start(const uintptr_t* variant_include, const chr_info_t* cip, uint32_t* subset_chr_fo_vidx_start);
+
+HEADER_INLINE boolerr_t alloc_and_fill_subset_chr_fo_vidx_start(const uintptr_t* variant_include, const chr_info_t* cip, uint32_t** subset_chr_fo_vidx_start_ptr) {
+  const uint32_t chr_ct = cip->chr_ct;
+  if (bigstack_alloc_ui(chr_ct + 1, subset_chr_fo_vidx_start_ptr)) {
+    return 1;
+  }
+  fill_subset_chr_fo_vidx_start(variant_include, cip, *subset_chr_fo_vidx_start_ptr);
+  return 0;
+}
+
+// newval does not need to be null-terminated
+// assumes *allele_ptr is not initialized
+boolerr_t allele_set(const char* newval, uint32_t allele_slen, char** allele_ptr);
+
+// *allele_ptr must be initialized; frees *allele_ptr if necessary
+boolerr_t allele_reset(const char* newval, uint32_t allele_slen, char** allele_ptr);
+
+void cleanup_allele_storage(uint32_t max_allele_slen, uintptr_t allele_storage_entry_ct, char** allele_storage);
+
+CONSTU31(kMaxMissingPhenostrBlen, 32);
+// might want g_input_missing_catname and/or g_output_missing_catname later,
+// but let's start with the simplest implementation
+extern char g_missing_catname[]; // default "NONE", not changeable for now
+
+extern char g_output_missing_pheno[]; // default "NA"
+extern char g_legacy_output_missing_pheno[]; // default "-9"
+
+// don't care about kfUnsortedVarChrom
+FLAGSET_DEF_START()
+  kfUnsortedVar0,
+  kfUnsortedVarBp = (1 << 0),
+  kfUnsortedVarCm = (1 << 1),
+  kfUnsortedVarSplitChr = (1 << 2)
+FLAGSET_DEF_END(unsorted_var_t);
+
+FLAGSET_DEF_START()
+  kfFamCol0,
+  kfFamCol1 = (1 << 0),
+  kfFamCol34 = (1 << 1),
+  kfFamCol5 = (1 << 2),
+  kfFamCol6 = (1 << 3),
+  kfFamCol13456 = (kfFamCol1 | kfFamCol34 | kfFamCol5 | kfFamCol6)
+FLAGSET_DEF_END(fam_col_t);
+
+HEADER_INLINE char sexchar(const uintptr_t* sex_nm, const uintptr_t* sex_male, uintptr_t sample_uidx) {
+  if (is_set(sex_nm, sample_uidx)) {
+    return '2' - is_set(sex_male, sample_uidx);
+  }
+  return '0';
+}
+
+// kPhenoDtypeCc and kPhenoDtypeQt currently can't change
+// kPhenoDtypeOther currently used for --glm local covariates
+ENUM_U31_DEF_START()
+  kPhenoDtypeCc,
+  kPhenoDtypeQt,
+  kPhenoDtypeCat,
+  kPhenoDtypeOther
+ENUM_U31_DEF_END(pheno_dtype_t);
+
+typedef union {
+  uintptr_t* cc; // bitvector
+  double* qt;
+  uint32_t* cat; // always 0 for missing, nonmiss[] check unnecessary
+} phenodata_t;
+
+typedef struct {
+  // * If categorical phenotype, [0] points to g_missing_catname, while [1],
+  //   [2], etc. point to category names.  These are part of the same
+  //   allocation as nonmiss, so no separate free is needed.
+  //   Otherwise, this is nullptr.
+  // * When .sample categorical variables are imported, 'P' is added in front
+  //   of the integers.
+  char** category_names;
+  
+  uintptr_t* nonmiss; // bitvector
+
+  // essentially a tagged union; part of the same allocation as nonmiss
+  phenodata_t data;
+  pheno_dtype_t type_code;
+  
+  uint32_t nonnull_category_ct;
+} pheno_col_t;
+
+void init_pheno();
+
+
+uint32_t is_categorical_phenostr(const char* phenostr);
+
+uint32_t is_categorical_phenostr_nocsv(const char* phenostr);
+
+#ifdef __arm__
+  #error "Unaligned accesses in is_nan_str()."
+#endif
+// todo: check whether there's actually any point to the uint16_t type-pun
+HEADER_INLINE uint32_t is_nan_str(const char* ss, uint32_t slen) {
+  if ((slen > 3) || (slen == 1)) {
+    return 0;
+  }
+  if (!slen) {
+    return 1;
+  }
+  const uint32_t first_two_chars_code = ((const uint16_t*)ss)[0];
+  // assumes little-endian
+  if ((first_two_chars_code & 0xdfdf) != 0x414e) {
+    return 0;
+  }
+  return (slen == 2) || ((((unsigned char)ss[2]) & 0xdf) == 78);
+}
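+
+// Worked example of the type-pun above: on a little-endian machine "NA"
+// loads as the uint16_t 0x414e ('N' in the low byte), and masking with
+// 0xdfdf clears ASCII's lowercase bit in both bytes, so "na", "Na", and "nA"
+// compare equal too.  Accepted inputs are therefore "", any casing of "NA",
+// and any casing of "NaN" (78 == 'N').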
+
+// returns 0xffffffffU if none exists
+uint32_t first_cc_or_qt_pheno_idx(const pheno_col_t* pheno_cols, uint32_t pheno_ct);
+
+// "_covar" since this doesn't handle case/control
+uint32_t is_const_covar(const pheno_col_t* covar_col, const uintptr_t* sample_include, uint32_t sample_ct);
+
+uint32_t identify_remaining_cats(const uintptr_t* sample_include, const pheno_col_t* covar_col, uint32_t sample_ct, uintptr_t* cat_covar_wkspace);
+
+// pheno_names is also allocated on the heap, but it can be handled with a
+// simple free_cond().
+void cleanup_pheno_cols(uint32_t pheno_ct, pheno_col_t* pheno_cols);
+
+pglerr_t parse_chr_ranges(const char* flagname_p, const char* errstr_append, uint32_t param_ct, uint32_t allow_extra_chrs, uint32_t xymt_subtract, char range_delim, char** argv, chr_info_t* cip, uintptr_t* chr_mask);
+
+pglerr_t parse_name_ranges(char** argv, const char* errstr_append, uint32_t param_ct, uint32_t require_posint, char range_delim, range_list_t* range_list_ptr);
+
+
+// For pure computations, where the launcher thread joins in as thread 0.
+// threads[] is second rather than first parameter since, on Windows, we may
+// need to call CloseHandle.
+void join_threads(uint32_t ctp1, pthread_t* threads);
+
+boolerr_t spawn_threads(THREAD_FUNCPTR_T(start_routine), uintptr_t ct, pthread_t* threads);
+
+
+// For double-buffering workloads where we don't want to respawn/join the
+// threads on every block, and (unlike plink 1.9) the launcher thread does not
+// participate.  (Function names end with "2z" instead of "2" since launched
+// threads start with index 0 instead of 1.)
+extern uintptr_t g_thread_spawn_ct;
+extern uint32_t g_is_last_thread_block;
+
+#ifdef _WIN32
+extern HANDLE g_thread_start_next_event[];
+extern HANDLE g_thread_cur_block_done_events[];
+
+HEADER_INLINE void THREAD_BLOCK_FINISH(uintptr_t tidx) {
+  SetEvent(g_thread_cur_block_done_events[tidx]);
+  WaitForSingleObject(g_thread_start_next_event[tidx], INFINITE);
+}
+#else
+extern pthread_attr_t g_smallstack_thread_attr;
+
+void THREAD_BLOCK_FINISH(uintptr_t tidx);
+#endif
+
+void join_threads2z(uint32_t ct, uint32_t is_last_block, pthread_t* threads);
+
+boolerr_t spawn_threads2z(THREAD_FUNCPTR_T(start_routine), uintptr_t ct, uint32_t is_last_block, pthread_t* threads);
+
+// if a thread sets g_error_ret and is_last_block wasn't true, the caller
+// should initialize globals to tell the threads to stop, then call this
+// function
+void error_cleanup_threads2z(THREAD_FUNCPTR_T(start_routine), uintptr_t ct, pthread_t* threads);
+
+
+// this interface simplifies error handling.  (todo: put most of these
+// variables in a struct.)
+typedef struct threads_state_struct {
+  THREAD_FUNCPTR_T(thread_func_ptr);
+  pthread_t* threads;
+  uint32_t calc_thread_ct;
+  uint32_t is_last_block;
+  uint32_t is_unjoined;
+} threads_state_t;
+
+HEADER_INLINE void init_threads3z(threads_state_t* tsp) {
+  tsp->thread_func_ptr = nullptr;
+  tsp->threads = nullptr;
+  tsp->is_last_block = 0;
+  tsp->is_unjoined = 0;
+}
+
+HEADER_INLINE void reinit_threads3z(threads_state_t* tsp) {
+  assert(!tsp->is_unjoined);
+  tsp->is_last_block = 0;
+}
+
+HEADER_INLINE boolerr_t spawn_threads3z(uint32_t is_not_first_block, threads_state_t* tsp) {
+  if (spawn_threads2z(tsp->thread_func_ptr, tsp->calc_thread_ct, tsp->is_last_block, tsp->threads)) {
+    if (!is_not_first_block) {
+      tsp->thread_func_ptr = nullptr;
+    }
+    return 1;
+  }
+  tsp->is_unjoined = 1;
+  return 0;
+}
+
+HEADER_INLINE void join_threads3z(threads_state_t* tsp) {
+  join_threads2z(tsp->calc_thread_ct, tsp->is_last_block, tsp->threads);
+  tsp->is_unjoined = 0;
+  if (tsp->is_last_block) {
+    tsp->thread_func_ptr = nullptr;
+  }
+}
+
+HEADER_INLINE void stop_threads3z(threads_state_t* tsp, uint32_t* cur_block_sizep) {
+  assert(tsp->thread_func_ptr);
+  if (tsp->is_unjoined) {
+    join_threads2z(tsp->calc_thread_ct, tsp->is_last_block, tsp->threads);
+  }
+  tsp->is_unjoined = 0;
+  if (!tsp->is_last_block) {
+    tsp->is_last_block = 1;
+    if (cur_block_sizep) {
+      *cur_block_sizep = 0;
+    }
+    error_cleanup_threads2z(tsp->thread_func_ptr, tsp->calc_thread_ct, tsp->threads);
+  }
+  tsp->thread_func_ptr = nullptr;
+}
+
+HEADER_INLINE void threads3z_cleanup(threads_state_t* tsp, uint32_t* cur_block_sizep) {
+  if (tsp->thread_func_ptr) {
+    if (tsp->is_unjoined) {
+      join_threads2z(tsp->calc_thread_ct, tsp->is_last_block, tsp->threads);
+    }
+    if (!tsp->is_last_block) {
+      if (cur_block_sizep) {
+	*cur_block_sizep = 0;
+      }
+      error_cleanup_threads2z(tsp->thread_func_ptr, tsp->calc_thread_ct, tsp->threads);
+    }
+  }
+}
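+
+// Hedged usage sketch of the 3z interface (compute_thread, calc_thread_ct,
+// and cur_block_size are hypothetical names):
+//
+//   threads_state_t ts;
+//   init_threads3z(&ts);
+//   ts.thread_func_ptr = compute_thread;
+//   ts.calc_thread_ct = calc_thread_ct;
+//   ts.threads = threads; // preallocated pthread_t array
+//   // ...load block 0; set ts.is_last_block if it's also the final block...
+//   for (uint32_t is_not_first_block = 0; ; is_not_first_block = 1) {
+//     if (spawn_threads3z(is_not_first_block, &ts)) {
+//       threads3z_cleanup(&ts, &cur_block_size);
+//       return 1;
+//     }
+//     // ...load the next block here, overlapping with computation, and set
+//     // ts.is_last_block before the final spawn...
+//     join_threads3z(&ts);
+//     // ...consume the just-computed block's results...
+//     if (ts.is_last_block) {
+//       break;
+//     }
+//   }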
+
+
+pglerr_t populate_id_htable_mt(const uintptr_t* subset_mask, char** item_ids, uintptr_t item_ct, uint32_t store_all_dups, uint32_t id_htable_size, uint32_t thread_ct, uint32_t* id_htable);
+
+// pass in htable_dup_base_ptr == nullptr if not storing all duplicate IDs
+pglerr_t alloc_and_populate_id_htable_mt(const uintptr_t* subset_mask, char** item_ids, uintptr_t item_ct, uint32_t max_thread_ct, uint32_t** id_htable_ptr, uint32_t** htable_dup_base_ptr, uint32_t* id_htable_size_ptr);
+
+// sample_ct not relevant if genovecs_ptr == nullptr
+pglerr_t multithread_load_init(const uintptr_t* variant_include, uint32_t sample_ct, uint32_t variant_ct, uintptr_t pgr_alloc_cacheline_ct, uintptr_t thread_xalloc_cacheline_ct, uintptr_t per_variant_xalloc_byte_ct, pgen_file_info_t* pgfip, uint32_t* calc_thread_ct_ptr, uintptr_t*** genovecs_ptr, uintptr_t*** dosage_present_ptr, dosage_t*** dosage_val_bufs_ptr, uint32_t* read_block_size_ptr, unsigned char** main_loadbufs, pthread_t** threads_ptr, pgen_reader_t*** pgr_pps, uint32_t** read [...]
+
+pglerr_t write_sample_ids(const uintptr_t* sample_include, const char* sample_ids, const char* sids, const char* outname, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen);
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+ 
+#endif // __PLINK2_COMMON_H__
diff --git a/plink2_compress_stream.cpp b/plink2_compress_stream.cpp
new file mode 100644
index 0000000..922fe58
--- /dev/null
+++ b/plink2_compress_stream.cpp
@@ -0,0 +1,128 @@
+// This library is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation, either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_compress_stream.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+boolerr_t uncompressed_cswrite_init(const char* out_fname, uint32_t do_append, unsigned char* overflow_buf, compress_stream_state_t* css_ptr) {
+  css_ptr->z_outfile = nullptr;
+  // can't use fopen_checked since we need to be able to append
+  css_ptr->outfile = fopen(out_fname, do_append? FOPEN_AB : FOPEN_WB);
+  if (!css_ptr->outfile) {
+    logprint("\n");
+    LOGERRPRINTFWW(g_errstr_fopen, out_fname);
+    return 1;
+  }
+  css_ptr->overflow_buf = overflow_buf;
+  return 0;
+}
+
+boolerr_t zstd_cswrite_init(const char* out_fname, uint32_t do_append, unsigned char* overflow_buf, compress_stream_state_t* css_ptr) {
+  css_ptr->outfile = nullptr;
+  css_ptr->z_outfile = gzopen(out_fname, do_append? FOPEN_AB : FOPEN_WB);
+  if (!css_ptr->z_outfile) {
+    logprint("\n");
+    LOGERRPRINTFWW(g_errstr_fopen, out_fname);
+    return 1;
+  }
+  css_ptr->overflow_buf = overflow_buf;
+  return 0;
+}
+
+// possible todo: replace output_zst with an enum which permits gzipping
+boolerr_t cswrite_init(const char* out_fname, uint32_t do_append, uint32_t output_zst, unsigned char* overflow_buf, compress_stream_state_t* css_ptr) {
+  if (!output_zst) {
+    return uncompressed_cswrite_init(out_fname, do_append, overflow_buf, css_ptr);
+  } else {
+    return zstd_cswrite_init(out_fname, do_append, overflow_buf, css_ptr);
+  }
+}
+
+boolerr_t force_uncompressed_cswrite(compress_stream_state_t* css_ptr, char** writep_ptr) {
+  unsigned char* writep = (unsigned char*)(*writep_ptr);
+  if (css_ptr->overflow_buf != writep) {
+    if (!fwrite(css_ptr->overflow_buf, writep - css_ptr->overflow_buf, 1, css_ptr->outfile)) {
+      return 1;
+    }
+    *writep_ptr = (char*)(css_ptr->overflow_buf);
+  }
+  return 0;
+}
+
+boolerr_t force_compressed_cswrite(__attribute__((unused)) uint32_t write_min, compress_stream_state_t* css_ptr, char** writep_ptr) {
+  unsigned char* writep = (unsigned char*)(*writep_ptr);
+  if (css_ptr->overflow_buf != writep) {
+    if (!gzwrite(css_ptr->z_outfile, css_ptr->overflow_buf, writep - css_ptr->overflow_buf)) {
+      return 1;
+    }
+    *writep_ptr = (char*)(css_ptr->overflow_buf);
+  }
+  return 0;
+}
+
+boolerr_t csputs_std(const char* ss, uint32_t sslen, compress_stream_state_t* css_ptr, char** writep_ptr) {
+  unsigned char* writep = (unsigned char*)(*writep_ptr);
+  const unsigned char* readp = (const unsigned char*)ss;
+  uint32_t cur_write_space = 2 * kCompressStreamBlock - ((uintptr_t)(writep - css_ptr->overflow_buf));
+  while (sslen > cur_write_space) {
+    memcpy(writep, readp, cur_write_space);
+    if (is_uncompressed_cswrite(css_ptr)) {
+      if (!fwrite(css_ptr->overflow_buf, 2 * kCompressStreamBlock, 1, css_ptr->outfile)) {
+	return 1;
+      }
+    } else {
+      if (!gzwrite(css_ptr->z_outfile, css_ptr->overflow_buf, 2 * kCompressStreamBlock)) {
+	return 1;
+      }
+    }
+    writep = css_ptr->overflow_buf;
+    readp = &(readp[cur_write_space]);
+    sslen -= cur_write_space;
+    cur_write_space = 2 * kCompressStreamBlock;
+  }
+  memcpy(writep, readp, sslen);
+  *writep_ptr = (char*)(&(writep[sslen]));
+  return cswrite(css_ptr, writep_ptr);
+}
+
+boolerr_t uncompressed_cswrite_close_null(compress_stream_state_t* css_ptr, char* writep) {
+  force_uncompressed_cswrite(css_ptr, &writep);
+  css_ptr->overflow_buf = nullptr;
+  int32_t ii = ferror(css_ptr->outfile);
+  int32_t jj = fclose(css_ptr->outfile);
+  return ii || jj;
+}
+
+boolerr_t compressed_cswrite_close_null(compress_stream_state_t* css_ptr, char* writep) {
+  force_compressed_cswrite(0, css_ptr, &writep);
+  css_ptr->overflow_buf = nullptr;
+  return (gzclose(css_ptr->z_outfile) != Z_OK);
+}
+
+boolerr_t cswrite_close_null(compress_stream_state_t* css_ptr, char* writep) {
+  if (is_uncompressed_cswrite(css_ptr)) {
+    return uncompressed_cswrite_close_null(css_ptr, writep);
+  }
+  return compressed_cswrite_close_null(css_ptr, writep);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/plink2_compress_stream.h b/plink2_compress_stream.h
new file mode 100644
index 0000000..631de84
--- /dev/null
+++ b/plink2_compress_stream.h
@@ -0,0 +1,103 @@
+#ifndef __PLINK2_COMPRESS_STREAM_H__
+#define __PLINK2_COMPRESS_STREAM_H__
+
+// This library is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation, either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+// Successor to plink 1.9 pigz.h.  Provides a basic manually-buffered output
+// stream interface for zstd compression.  Not multithreaded yet, but the
+// interface is identical to the old multithreaded gzipper so we'll be able to
+// upgrade the backend later without making significant changes to other code.
+#include "plink2_decompress.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+// todo: test different values, may want to increase on at least OS X...
+CONSTU31(kCompressStreamBlock, 131072);
+
+typedef struct {
+  unsigned char* overflow_buf;
+  FILE* outfile;
+  gzFile z_outfile;
+} compress_stream_state_t;
+
+HEADER_INLINE uint32_t is_uncompressed_cswrite(const compress_stream_state_t* css_ptr) {
+  return (css_ptr->outfile != nullptr);
+}
+
+HEADER_INLINE void cswrite_init_null(compress_stream_state_t* css_ptr) {
+  css_ptr->overflow_buf = nullptr;
+}
+
+boolerr_t uncompressed_cswrite_init(const char* out_fname, uint32_t do_append, unsigned char* overflow_buf, compress_stream_state_t* css_ptr);
+
+// overflow_buf must have space for at least kCompressStreamBlock + [max bytes
+// added between cswrite() calls] bytes.
+boolerr_t cswrite_init(const char* out_fname, uint32_t do_append, uint32_t output_zst, unsigned char* overflow_buf, compress_stream_state_t* css_ptr);
+
+boolerr_t force_uncompressed_cswrite(compress_stream_state_t* css_ptr, char** writep_ptr);
+
+// may or may not need the write_min parameter; let's leave it until we
+// actually try to implement multithreaded zstd
+boolerr_t force_compressed_cswrite(uint32_t write_min, compress_stream_state_t* css_ptr, char** writep_ptr);
+
+HEADER_INLINE boolerr_t cswrite(compress_stream_state_t* css_ptr, char** writep_ptr) {
+  if ((uintptr_t)(((unsigned char*)(*writep_ptr)) - css_ptr->overflow_buf) >= kCompressStreamBlock + 1) {
+    if (is_uncompressed_cswrite(css_ptr)) {
+      return force_uncompressed_cswrite(css_ptr, writep_ptr);
+    } else {
+      return force_compressed_cswrite(kCompressStreamBlock + 1, css_ptr, writep_ptr);
+    }
+  }
+  return 0;
+}
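+
+// Hedged usage sketch of this interface (out_fname, output_zst, and
+// cur_token are illustrative; buffer sizing per the overflow_buf comment
+// above cswrite_init()):
+//
+//   compress_stream_state_t css;
+//   cswrite_init_null(&css);
+//   unsigned char* overflow_buf;
+//   if (bigstack_alloc_uc(kCompressStreamBlock + kMaxIdSlen + 256,
+//                         &overflow_buf)) {
+//     return kPglRetNomem;
+//   }
+//   if (cswrite_init(out_fname, 0, output_zst, overflow_buf, &css)) {
+//     return kPglRetOpenFail;
+//   }
+//   char* cswritep = (char*)overflow_buf;
+//   while (has_more_output) {
+//     cswritep = strcpya(cswritep, cur_token); // must fit the slack space
+//     append_binary_eoln(&cswritep);
+//     if (cswrite(&css, &cswritep)) {
+//       return kPglRetWriteFail;
+//     }
+//   }
+//   if (cswrite_close_null(&css, cswritep)) {
+//     return kPglRetWriteFail;
+//   }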
+
+// assumes overflow_buf has size >= 2 * kCompressStreamBlock.
+boolerr_t csputs_std(const char* ss, uint32_t sslen, compress_stream_state_t* css_ptr, char** writep_ptr);
+
+boolerr_t uncompressed_cswrite_close_null(compress_stream_state_t* css_ptr, char* writep);
+
+boolerr_t compressed_cswrite_close_null(compress_stream_state_t* css_ptr, char* writep);
+
+boolerr_t cswrite_close_null(compress_stream_state_t* css_ptr, char* writep);
+
+HEADER_INLINE void uncompressed_cswrite_close_cond(compress_stream_state_t* css_ptr, char* writep) {
+  if (css_ptr->overflow_buf) {
+    uncompressed_cswrite_close_null(css_ptr, writep);
+  }
+}
+
+HEADER_INLINE void compressed_cswrite_close_cond(compress_stream_state_t* css_ptr, char* writep) {
+  if (css_ptr->overflow_buf) {
+    compressed_cswrite_close_null(css_ptr, writep);
+  }
+}
+
+HEADER_INLINE void cswrite_close_cond(compress_stream_state_t* css_ptr, char* writep) {
+  if (css_ptr->overflow_buf) {
+    cswrite_close_null(css_ptr, writep);
+  }
+}
+
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+ 
+#endif // __PLINK2_COMPRESS_STREAM_H__
diff --git a/plink2_data.cpp b/plink2_data.cpp
new file mode 100644
index 0000000..0c7b4aa
--- /dev/null
+++ b/plink2_data.cpp
@@ -0,0 +1,15907 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_compress_stream.h"
+#include "plink2_data.h"
+#include "plink2_psam.h"
+#include "plink2_pvar.h"
+#include "plink2_random.h"
+
+#include "bgzf.h"
+#include "zstd/lib/zstd.h"
+
+#include <time.h>
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+void init_plink1_dosage(plink1_dosage_info_t* plink1_dosage_info_ptr) {
+  plink1_dosage_info_ptr->flags = kfPlink1Dosage0;
+  fill_uint_zero(3, plink1_dosage_info_ptr->skips);
+  plink1_dosage_info_ptr->chr_col_idx = 0xffffffffU;
+  plink1_dosage_info_ptr->pos_col_idx = 0xffffffffU;
+}
+
+void init_gendummy(gendummy_info_t* gendummy_info_ptr) {
+  gendummy_info_ptr->flags = kfGenDummy0;
+  gendummy_info_ptr->pheno_ct = 1;
+  gendummy_info_ptr->geno_mfreq = 0.0;
+  gendummy_info_ptr->pheno_mfreq = 0.0;
+  gendummy_info_ptr->dosage_freq = 0.0;
+}
+
+pglerr_t write_map_or_bim(const char* outname, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const uint64_t* allele_dosages, const alt_allele_ct_t* refalt1_select, const double* variant_cms, uint32_t variant_ct, uint32_t max_allele_slen, char delim, uint32_t output_zst) {
+  // set max_allele_slen to zero for .map
+  // allele_dosages must be nullptr unless we're trimming alt alleles
+  unsigned char* bigstack_mark = g_bigstack_base;
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  pglerr_t reterr = kPglRetSuccess;
+  cswrite_init_null(&css);
+  {
+    const uint32_t max_chr_blen = get_max_chr_slen(cip) + 1;
+    // includes trailing tab
+    char* chr_buf;
+
+    unsigned char* overflow_buf;
+    if (bigstack_alloc_c(max_chr_blen, &chr_buf) ||
+	bigstack_alloc_uc(kCompressStreamBlock + kMaxIdSlen + 512 + 2 * max_allele_slen, &overflow_buf)) {
+      goto write_map_or_bim_ret_NOMEM;
+    }
+    if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+      goto write_map_or_bim_ret_OPEN_FAIL;
+    }
+    cswritep = (char*)overflow_buf;
+
+    const char output_missing_geno_char = *g_output_missing_geno_ptr;
+    uint32_t variant_uidx = 0;
+    uint32_t chr_fo_idx = 0xffffffffU;
+    uint32_t chr_end = 0;
+    uint32_t chr_buf_blen = 0;
+    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      if (variant_uidx >= chr_end) {
+	do {
+	  ++chr_fo_idx;
+	  chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	} while (variant_uidx >= chr_end);
+	char* chr_name_end = chr_name_write(cip, cip->chr_file_order[chr_fo_idx], chr_buf);
+	*chr_name_end = delim;
+	chr_buf_blen = 1 + (uintptr_t)(chr_name_end - chr_buf);
+      }
+      cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
+      cswritep = strcpyax(cswritep, variant_ids[variant_uidx], delim);
+      if (!variant_cms) {
+	*cswritep++ = '0';
+      } else {
+	cswritep = dtoa_g_p8(variant_cms[variant_uidx], cswritep);
+      }
+      *cswritep++ = delim;
+      cswritep = uint32toa(variant_bps[variant_uidx], cswritep);
+      if (max_allele_slen) {
+	*cswritep++ = delim;
+	const uintptr_t variant_allele_idx_base = variant_allele_idxs? variant_allele_idxs[variant_uidx] : (variant_uidx * 2);
+	char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+	// note that VCF ref allele corresponds to A2, not A1
+	if (!refalt1_select) {
+	  // needs to be revised in multiallelic case
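+	  // when trimming alt alleles, a zero dosage means A1 was never
+	  // observed; write the missing genotype code instead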
+	  if ((!allele_dosages) || allele_dosages[1 + variant_allele_idx_base]) {
+	    cswritep = strcpya(cswritep, cur_alleles[1]);
+	  } else {
+	    *cswritep++ = output_missing_geno_char;
+	  }
+	  *cswritep++ = delim;
+	  cswritep = strcpya(cswritep, cur_alleles[0]);
+	} else {
+	  const alt_allele_ct_t* cur_refalt1_select = &(refalt1_select[variant_uidx * 2]);
+	  if ((!allele_dosages) || allele_dosages[cur_refalt1_select[1] + variant_allele_idx_base]) {
+	    cswritep = strcpya(cswritep, cur_alleles[cur_refalt1_select[1]]);
+	  } else {
+	    *cswritep++ = output_missing_geno_char;
+	  }
+	  *cswritep++ = delim;
+	  cswritep = strcpya(cswritep, cur_alleles[cur_refalt1_select[0]]);
+	}
+      }
+      append_binary_eoln(&cswritep);
+      if (cswrite(&css, &cswritep)) {
+	goto write_map_or_bim_ret_WRITE_FAIL;
+      }
+    }
+    if (cswrite_close_null(&css, cswritep)) {
+      goto write_map_or_bim_ret_WRITE_FAIL;
+    }
+  }
+  while (0) {
+  write_map_or_bim_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  write_map_or_bim_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  write_map_or_bim_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  }
+  cswrite_close_cond(&css, cswritep);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t pvar_info_reload_header(const char* pvar_info_reload, gzFile* gz_pvar_reload_ptr, char** loadbuf_ptr, uintptr_t* loadbuf_size_ptr, uint32_t* info_col_idx_ptr) {
+  pglerr_t reterr = gzopen_read_checked(pvar_info_reload, gz_pvar_reload_ptr);
+  if (reterr) {
+    return reterr;
+  }
+  uintptr_t loadbuf_size = bigstack_left();
+  if (loadbuf_size > kMaxLongLine) {
+    loadbuf_size = kMaxLongLine;
+  } else if (loadbuf_size <= kMaxMediumLine) {
+    return kPglRetNomem;
+  }
+  char* loadbuf = (char*)g_bigstack_base;
+  loadbuf[loadbuf_size - 1] = ' ';
+  char* loadbuf_iter;
+  do {
+    // this is a reload, so no need to validate
+    if (!gzgets(*gz_pvar_reload_ptr, loadbuf, loadbuf_size)) {
+      return kPglRetReadFail;
+    }
+    if (!loadbuf[loadbuf_size - 1]) {
+      if (loadbuf_size == kMaxLongLine) {
+	return kPglRetReadFail;
+      }
+      return kPglRetNomem;
+    }
+    loadbuf_iter = skip_initial_spaces(loadbuf);
+  } while (memcmp(loadbuf_iter, "#CHROM", 6));
+  uint32_t info_col_idx = 0;
+  do {
+    loadbuf_iter = next_token(loadbuf_iter);
+    ++info_col_idx;
+  } while (memcmp(loadbuf_iter, "INFO", 4) || (((unsigned char)loadbuf_iter[4]) > 32));
+  *loadbuf_ptr = loadbuf;
+  *loadbuf_size_ptr = loadbuf_size;
+  *info_col_idx_ptr = info_col_idx;
+  return kPglRetSuccess;
+}
+
+void pvar_info_write(char* info_token, uint32_t xheader_info_pr, uint32_t is_pr, char** write_iter_ptr) {
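+  // Copies the INFO token, appending a PR flag when is_pr is set and the
+  // token lacks one, and stripping an existing PR flag when is_pr is unset.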
+  char* info_token_end = token_endnn(info_token);
+  uint32_t info_token_slen = (uintptr_t)(info_token_end - info_token);
+  char* info_token_pr = nullptr;
+  if (xheader_info_pr) {
+    info_token_pr = pr_in_info_token(info_token_slen, info_token);
+  }
+  char* write_iter = *write_iter_ptr;
+  if (is_pr || (!info_token_pr)) {
+    write_iter = memcpya(write_iter, info_token, info_token_slen);
+    if (is_pr && (!info_token_pr)) {
+      if ((info_token_slen == 1) && (info_token[0] == '.')) {
+	write_iter[-1] = 'P';
+	*write_iter++ = 'R';
+      } else {
+	write_iter = memcpyl3a(write_iter, ";PR");
+      }
+    }
+  } else {
+    // currently only possible with --real-ref-alleles
+    if (info_token_pr == info_token) {
+      if (info_token_slen == 2) {
+	*write_iter++ = '.';
+      } else {
+	write_iter = memcpya(write_iter, &(info_token[3]), info_token_slen - 3);
+      }
+    } else {
+      write_iter = memcpya(write_iter, info_token, ((uintptr_t)(info_token_pr - info_token)) - 1);
+      char* pr_end = &(info_token_pr[2]);
+      write_iter = memcpya(write_iter, pr_end, (uintptr_t)(info_token_end - pr_end));
+    }
+  }
+  *write_iter_ptr = write_iter;
+}
+
+pglerr_t pvar_info_reload_and_write(uintptr_t loadbuf_size, uint32_t xheader_info_pr, uint32_t info_col_idx, uint32_t variant_uidx, uint32_t is_pr, gzFile gz_pvar_reload, char** write_iter_ptr, uint32_t* gz_variant_uidx_ptr, char* loadbuf) {
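+  // Advances the reload stream to the (0-based) variant_uidx-th variant line,
+  // skipping blank lines, and appends its INFO field via pvar_info_write().
+  // Assumes variant_uidx never decreases between calls, since the stream is
+  // only read forward.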
+  uint32_t gz_variant_uidx = *gz_variant_uidx_ptr;
+  char* loadbuf_first_token;
+  do {
+    do {
+      if (!gzgets(gz_pvar_reload, loadbuf, loadbuf_size)) {
+	return kPglRetReadFail;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	if (loadbuf_size == kMaxLongLine) {
+	  return kPglRetReadFail;
+	}
+	return kPglRetNomem;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+    } while (is_eoln_kns(*loadbuf_first_token));
+    ++gz_variant_uidx;
+  } while (gz_variant_uidx <= variant_uidx);
+  char* info_token = next_token_mult(loadbuf_first_token, info_col_idx);
+  *gz_variant_uidx_ptr = gz_variant_uidx;
+  pvar_info_write(info_token, xheader_info_pr, is_pr, write_iter_ptr);
+  return kPglRetSuccess;
+}
+
+void append_chrset_line(const chr_info_t* cip, char** write_iter_ptr) {
+  char* write_iter = strcpya(*write_iter_ptr, "##chrSet=<");
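+  // haploid_mask bit 0 marks a fully-haploid chromosome set; in that case the
+  // ##chrSet line reports haploidAutosomeCt instead of autosomePairCt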
+  if (!(cip->haploid_mask[0] & 1)) {
+    write_iter = strcpya(write_iter, "autosomePairCt=");
+    write_iter = uint32toa(cip->autosome_ct, write_iter);
+    if (cip->xymt_codes[kChrOffsetX] >= 0) {
+      write_iter = strcpya(write_iter, ",X");
+    }
+    if (cip->xymt_codes[kChrOffsetY] >= 0) {
+      write_iter = strcpya(write_iter, ",Y");
+    }
+    if (cip->xymt_codes[kChrOffsetXY] >= 0) {
+      write_iter = strcpya(write_iter, ",XY");
+    }
+    if (cip->xymt_codes[kChrOffsetMT] >= 0) {
+      write_iter = strcpya(write_iter, ",M");
+    }
+    if (cip->xymt_codes[kChrOffsetPAR1] >= 0) {
+      write_iter = strcpya(write_iter, ",PAR1");
+    }
+    if (cip->xymt_codes[kChrOffsetPAR2] >= 0) {
+      write_iter = strcpya(write_iter, ",PAR2");
+    }
+  } else {
+    write_iter = strcpya(write_iter, "haploidAutosomeCt=");
+    write_iter = uint32toa(cip->autosome_ct, write_iter);
+  }
+  *write_iter++ = '>';
+  *write_iter_ptr = write_iter;
+  append_binary_eoln(write_iter_ptr);
+}
+
+pglerr_t write_pvar(const char* outname, const char* xheader, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const uint64_t* allele_dosages, const alt_allele_ct_t* refalt1_select, const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, char** filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uintptr_t xheader_blen, uint32_t xheader_info_pr, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uintptr_t info_reload_slen, pvar_psam_t pvar_psam_modifier) {
+  // allele_dosages must be nullptr unless we're trimming alt alleles
+  unsigned char* bigstack_mark = g_bigstack_base;
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  gzFile gz_pvar_reload = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  cswrite_init_null(&css);
+  {
+    const uint32_t max_chr_blen = get_max_chr_slen(cip) + 1;
+    // includes trailing tab
+    char* chr_buf;
+
+    uintptr_t overflow_buf_size = kCompressStreamBlock + kMaxIdSlen + 512 + 2 * max_allele_slen + max_filter_slen + info_reload_slen;
+    if (overflow_buf_size < 2 * kCompressStreamBlock) {
+      overflow_buf_size = 2 * kCompressStreamBlock;
+    }
+    unsigned char* overflow_buf;
+    uintptr_t* allele_include;
+    if (bigstack_alloc_c(max_chr_blen, &chr_buf) ||
+	bigstack_alloc_uc(overflow_buf_size, &overflow_buf) ||
+	bigstack_alloc_ul(BITCT_TO_WORDCT(kPglMaxAltAlleleCt), &allele_include)) {
+      goto write_pvar_ret_NOMEM;
+    }
+    const uint32_t output_zst = (pvar_psam_modifier / kfPvarZs) & 1;
+    if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+      goto write_pvar_ret_OPEN_FAIL;
+    }
+    cswritep = (char*)overflow_buf;
+    const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    const uint32_t all_nonref = (nonref_flags_storage == 2);
+    uint32_t write_info_pr = all_nonref;
+    uint32_t write_info = (pvar_psam_modifier & kfPvarColInfo) || pvar_info_reload;
+    if (write_info && nonref_flags) {
+      uint32_t widx;
+      for (widx = 0; widx < raw_variant_ctl; ++widx) {
+	if (variant_include[widx] & nonref_flags[widx]) {
+	  break;
+	}
+      }
+      if (widx == raw_variant_ctl) {
+	write_info_pr = 0;
+      }
+    }
+    write_info_pr = write_info_pr && write_info;
+
+    char* loadbuf = nullptr;
+    uintptr_t loadbuf_size = 0;
+    uint32_t info_col_idx = 0; // could save this during first load instead
+    if (pvar_psam_modifier & kfPvarColXheader) {
+      if (csputs_std(xheader, xheader_blen, &css, &cswritep)) {
+	goto write_pvar_ret_WRITE_FAIL;
+      }
+      if (write_info_pr && (!xheader_info_pr)) {
+	cswritep = strcpya(cswritep, "##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\">" EOLN_STR);
+      }
+      if (pvar_info_reload) {
+	reterr = pvar_info_reload_header(pvar_info_reload, &gz_pvar_reload, &loadbuf, &loadbuf_size, &info_col_idx);
+	if (reterr) {
+	  goto write_pvar_ret_1;
+	}
+      }
+    }
+    if (cip->chrset_source) {
+      append_chrset_line(cip, &cswritep);
+    }
+    cswritep = strcpya(cswritep, "#CHROM\tPOS\tID\tREF\tALT");
+
+    uint32_t write_qual = 0;
+    if (pvar_psam_modifier & kfPvarColQual) {
+      write_qual = 1;
+    } else if ((pvar_psam_modifier & kfPvarColMaybequal) && qual_present) {
+      for (uint32_t widx = 0; widx < raw_variant_ctl; ++widx) {
+	if (variant_include[widx] & qual_present[widx]) {
+	  write_qual = 1;
+	  break;
+	}
+      }
+    }
+    if (write_qual) {
+      cswritep = strcpya(cswritep, "\tQUAL");
+    }
+    
+    uint32_t write_filter = 0;
+    if (pvar_psam_modifier & kfPvarColFilter) {
+      write_filter = 1;
+    } else if ((pvar_psam_modifier & kfPvarColMaybefilter) && filter_present) {
+      for (uint32_t widx = 0; widx < raw_variant_ctl; ++widx) {
+	if (variant_include[widx] & filter_present[widx]) {
+	  write_filter = 1;
+	  break;
+	}
+      }
+    }
+    if (write_filter) {
+      cswritep = strcpya(cswritep, "\tFILTER");
+    }
+
+    if (write_info) {
+      cswritep = strcpya(cswritep, "\tINFO");
+    }
+    
+    uint32_t write_cm = 0;
+    if (pvar_psam_modifier & kfPvarColCm) {
+      write_cm = 1;
+    } else if ((pvar_psam_modifier & kfPvarColMaybecm) && variant_cms) {
+      if (raw_variant_ct == variant_ct) {
+	// nonzero_cm_present check was performed
+	write_cm = 1;
+      } else {
+	uint32_t variant_uidx = 0;
+	for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+	  next_set_unsafe_ck(variant_include, &variant_uidx);
+	  if (variant_cms[variant_uidx] != 0.0) {
+	    write_cm = 1;
+	    break;
+	  }
+	}
+      }
+    }
+    if (write_cm) {
+      cswritep = memcpyl3a(cswritep, "\tCM");
+    }
+    append_binary_eoln(&cswritep);
+
+    const char output_missing_geno_char = *g_output_missing_geno_ptr;
+    uint32_t gz_variant_uidx = 0;
+    uint32_t variant_uidx = 0;
+    uint32_t chr_fo_idx = 0xffffffffU;
+    uint32_t chr_end = 0;
+    uint32_t chr_buf_blen = 0;
+    uint32_t ref_allele_idx = 0;
+    uint32_t alt1_allele_idx = 1;
+    uint32_t cur_allele_ct = 2;
+    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      if (variant_uidx >= chr_end) {
+	do {
+	  ++chr_fo_idx;
+	  chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	} while (variant_uidx >= chr_end);
+	char* chr_name_end = chr_name_write(cip, cip->chr_file_order[chr_fo_idx], chr_buf);
+	*chr_name_end = '\t';
+	chr_buf_blen = 1 + (uintptr_t)(chr_name_end - chr_buf);
+      }
+      cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
+      cswritep = uint32toa_x(variant_bps[variant_uidx], '\t', cswritep);
+      cswritep = strcpyax(cswritep, variant_ids[variant_uidx], '\t');
+      uintptr_t variant_allele_idx_base;
+      if (!variant_allele_idxs) {
+	variant_allele_idx_base = variant_uidx * 2;
+      } else {
+	variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+      }
+      char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+      if (refalt1_select) {
+	ref_allele_idx = refalt1_select[variant_uidx * 2];
+	alt1_allele_idx = refalt1_select[variant_uidx * 2 + 1];
+      }
+      cswritep = strcpyax(cswritep, cur_alleles[ref_allele_idx], '\t');
+      if ((!allele_dosages) || allele_dosages[variant_allele_idx_base + alt1_allele_idx]) {
+        cswritep = strcpya(cswritep, cur_alleles[alt1_allele_idx]);
+      } else {
+	*cswritep++ = output_missing_geno_char;
+      }
+      if (cswrite(&css, &cswritep)) {
+	goto write_pvar_ret_WRITE_FAIL;
+      }
+      if (cur_allele_ct > 2) {
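+	// list the remaining ALT alleles: set all cur_allele_ct bits, clear
+	// the REF and ALT1 slots, then write the survivors in index order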
+	fill_all_bits(cur_allele_ct, allele_include);
+	CLEAR_BIT(ref_allele_idx, allele_include);
+	CLEAR_BIT(alt1_allele_idx, allele_include);
+        uint32_t cur_allele_uidx = 0;
+	uint32_t alt_allele_idx = 2;
+	do {
+	  *cswritep++ = ',';
+	  next_set_unsafe_ck(allele_include, &cur_allele_uidx);
+	  cswritep = strcpya(cswritep, cur_alleles[cur_allele_uidx++]);
+	  if (cswrite(&css, &cswritep)) {
+	    goto write_pvar_ret_WRITE_FAIL;
+	  }
+	} while (++alt_allele_idx < cur_allele_ct);
+      }
+
+      if (write_qual) {
+	*cswritep++ = '\t';
+	if (!IS_SET(qual_present, variant_uidx)) {
+	  *cswritep++ = '.';
+	} else {
+	  cswritep = ftoa_g(quals[variant_uidx], cswritep);
+	}
+      }
+
+      if (write_filter) {
+	*cswritep++ = '\t';
+	if (!IS_SET(filter_present, variant_uidx)) {
+	  *cswritep++ = '.';
+	} else if (!IS_SET(filter_npass, variant_uidx)) {
+	  cswritep = strcpya(cswritep, "PASS");
+	} else {
+	  cswritep = strcpya(cswritep, filter_storage[variant_uidx]);
+	}
+      }
+
+      if (write_info) {
+	*cswritep++ = '\t';
+	const uint32_t is_pr = all_nonref || (nonref_flags && IS_SET(nonref_flags, variant_uidx));
+	if (gz_pvar_reload) {
+	  reterr = pvar_info_reload_and_write(loadbuf_size, xheader_info_pr, info_col_idx, variant_uidx, is_pr, gz_pvar_reload, &cswritep, &gz_variant_uidx, loadbuf);
+	  if (reterr) {
+	    goto write_pvar_ret_1;
+	  }
+	} else {
+	  if (is_pr) {
+	    cswritep = strcpya(cswritep, "PR");
+	  } else {
+	    *cswritep++ = '.';
+	  }
+	}
+      }
+      
+      if (write_cm) {
+        *cswritep++ = '\t';
+	if (!variant_cms) {
+	  *cswritep++ = '0';
+	} else {
+	  cswritep = dtoa_g_p8(variant_cms[variant_uidx], cswritep);
+	}
+      }
+      append_binary_eoln(&cswritep);
+    }
+    if (cswrite_close_null(&css, cswritep)) {
+      goto write_pvar_ret_WRITE_FAIL;
+    }
+  }
+  while (0) {
+  write_pvar_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  write_pvar_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  write_pvar_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  }
+ write_pvar_ret_1:
+  cswrite_close_cond(&css, cswritep);
+  gzclose_cond(gz_pvar_reload);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t write_fam(const char* outname, const uintptr_t* sample_include, const char* sample_ids, const char* paternal_ids, const char* maternal_ids, const uintptr_t* sex_nm, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_paternal_id_blen, uintptr_t max_maternal_id_blen, uint32_t pheno_ct, char delim) {
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto write_fam_ret_OPEN_FAIL;
+    }
+    uintptr_t* pheno_nm = nullptr;
+    uintptr_t* pheno_cc = nullptr;
+    double* pheno_qt = nullptr;
+    // .fam files don't support categorical phenotypes
+    const uint32_t pheno_idx = first_cc_or_qt_pheno_idx(pheno_cols, pheno_ct);
+    if (pheno_idx != 0xffffffffU) {
+      const pheno_dtype_t type_code = pheno_cols[pheno_idx].type_code;
+      pheno_nm = pheno_cols[pheno_idx].nonmiss;
+      if (type_code == kPhenoDtypeCc) {
+	pheno_cc = pheno_cols[pheno_idx].data.cc;
+      } else {
+	pheno_qt = pheno_cols[pheno_idx].data.qt;
+      }
+    }
+    const char* legacy_output_missing_pheno = g_legacy_output_missing_pheno;
+    const uint32_t lomp_slen = strlen(legacy_output_missing_pheno);
+
+    uintptr_t sample_uidx = 0;
+    uint32_t sample_uidx2 = 0;
+    char* textbuf = g_textbuf;
+    char* write_iter = textbuf;
+    char* textbuf_flush = &(textbuf[kMaxMediumLine]);
+    // not really necessary to make sample_uidx increment dependent on
+    // new_sample_idx_to_old == nullptr
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      if (!new_sample_idx_to_old) {
+	next_set_ul_unsafe_ck(sample_include, &sample_uidx);
+      } else {
+	do {
+	  sample_uidx = new_sample_idx_to_old[sample_uidx2++];
+	} while (!IS_SET(sample_include, sample_uidx));
+      }
+      const char* cur_sample_id = &(sample_ids[max_sample_id_blen * sample_uidx]);
+      if (delim == '\t') {
+	write_iter = strcpya(write_iter, cur_sample_id);
+      } else {
+	const char* fid_end = (const char*)rawmemchr(cur_sample_id, '\t');
+	write_iter = memcpyax(write_iter, cur_sample_id, (uintptr_t)(fid_end - cur_sample_id), delim);
+	write_iter = strcpya(write_iter, &(fid_end[1]));
+      }
+      *write_iter++ = delim;
+      write_iter = strcpyax(write_iter, &(paternal_ids[max_paternal_id_blen * sample_uidx]), delim);
+      write_iter = strcpyax(write_iter, &(maternal_ids[max_maternal_id_blen * sample_uidx]), delim);
+      *write_iter++ = sexchar(sex_nm, sex_male, sample_uidx);
+      *write_iter++ = delim;
+      if ((!pheno_nm) || (!IS_SET(pheno_nm, sample_uidx))) {
+	write_iter = memcpya(write_iter, legacy_output_missing_pheno, lomp_slen);
+      } else if (pheno_cc) {
+	// do we want to allow user to force 0/1 output?
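+	// (1 = control/unaffected, 2 = case/affected)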
+	*write_iter++ = '1' + IS_SET(pheno_cc, sample_uidx);
+      } else {
+	write_iter = dtoa_g(pheno_qt[sample_uidx], write_iter);
+      }
+      append_binary_eoln(&write_iter);
+      if (write_iter >= textbuf_flush) {
+	if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	  goto write_fam_ret_WRITE_FAIL;
+	}
+	write_iter = textbuf;
+      }
+    }
+    if (write_iter != textbuf) {
+      if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	goto write_fam_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto write_fam_ret_WRITE_FAIL;
+    }
+  }
+  while (0) {
+  write_fam_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  write_fam_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  }
+  fclose_cond(outfile);
+  return reterr;
+}
+
+uint32_t is_parental_info_present(const uintptr_t* sample_include, const char* paternal_ids, const char* maternal_ids, uint32_t sample_ct, uintptr_t max_paternal_id_blen, uintptr_t max_maternal_id_blen) {
+  uint32_t sample_uidx = 0;
+  for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+    next_set_unsafe_ck(sample_include, &sample_uidx);
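+    // the 2-byte memcmp ("0" plus terminating null) matches only IDs that
+    // are exactly "0", i.e. the missing-parent code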
+    if (memcmp(&(paternal_ids[sample_uidx * max_paternal_id_blen]), "0", 2) || memcmp(&(maternal_ids[sample_uidx * max_maternal_id_blen]), "0", 2)) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+char* append_pheno_str(const pheno_col_t* pheno_col, const char* output_missing_pheno, uint32_t omp_slen, uint32_t sample_uidx, char* write_iter) {
+  const pheno_dtype_t type_code = pheno_col->type_code;
+  if (type_code <= kPhenoDtypeQt) {
+    if (!is_set(pheno_col->nonmiss, sample_uidx)) {
+      write_iter = memcpya(write_iter, output_missing_pheno, omp_slen);
+    } else if (type_code == kPhenoDtypeCc) {
+      *write_iter++ = '1' + is_set(pheno_col->data.cc, sample_uidx);
+    } else {
+      write_iter = dtoa_g(pheno_col->data.qt[sample_uidx], write_iter);
+    }
+  } else {
+    write_iter = strcpya(write_iter, pheno_col->category_names[pheno_col->data.cat[sample_uidx]]);
+  }
+  return write_iter;
+}
+
+pglerr_t write_psam(const char* outname, const uintptr_t* sample_include, const char* sample_ids, const char* sids, const char* paternal_ids, const char* maternal_ids, const uintptr_t* sex_nm, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uintptr_t max_paternal_id_blen, uintptr_t max_maternal_id_blen, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, pvar_psam_t pvar_psam_modifier) {
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto write_psam_ret_OPEN_FAIL;
+    }
+    const char* output_missing_pheno = g_output_missing_pheno;
+    const uint32_t omp_slen = strlen(output_missing_pheno);
+    
+    char* textbuf = g_textbuf;
+    char* textbuf_flush = &(textbuf[kMaxMediumLine]);
+
+    const uint32_t write_sid = sid_col_required(sample_include, sids, sample_ct, max_sid_blen, pvar_psam_modifier / kfPsamColMaybesid);
+    uint32_t write_parents = 0;
+    if (pvar_psam_modifier & kfPsamColParents) {
+      write_parents = 1;
+    } else if (pvar_psam_modifier & kfPsamColMaybeparents) {
+      write_parents = is_parental_info_present(sample_include, paternal_ids, maternal_ids, sample_ct, max_paternal_id_blen, max_maternal_id_blen);
+    }
+    const uint32_t write_sex = (pvar_psam_modifier / kfPsamColSex) & 1;
+    const uint32_t write_empty_pheno = (pvar_psam_modifier & kfPsamColPheno1) && (!pheno_ct);
+    const uint32_t write_phenos = (pvar_psam_modifier & (kfPsamColPheno1 | kfPsamColPhenos)) && pheno_ct;
+    if (write_phenos && (!(pvar_psam_modifier & kfPsamColPhenos))) {
+      pheno_ct = 1;
+    }
+    char* write_iter = strcpya(textbuf, "#FID\tIID");
+    if (write_sid) {
+      write_iter = strcpya(write_iter, "\tSID");
+    }
+    if (write_parents) {
+      write_iter = strcpya(write_iter, "\tPAT\tMAT");
+    }
+    if (write_sex) {
+      write_iter = strcpya(write_iter, "\tSEX");
+    }
+    if (write_phenos) {
+      for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	*write_iter++ = '\t';
+	const char* cur_pheno_name = &(pheno_names[pheno_idx * max_pheno_name_blen]);
+	const uint32_t cur_pheno_name_slen = strlen(cur_pheno_name);
+	if ((cur_pheno_name_slen == 3) && (!memcmp(cur_pheno_name, "SEX", 3))) {
+	  if (write_sex) {
+	    logerrprint("Error: .psam file cannot have both a regular SEX column and a phenotype named\n'SEX'.  Exclude or rename one of these columns.\n");
+	    goto write_psam_ret_INCONSISTENT_INPUT;
+	  }
+	  // does this phenotype column conform to the SEX column format?
+	  // case/control is always ok, but quantitative or categorical needs
+	  // to be checked
+	  const pheno_col_t* sex_col = &(pheno_cols[pheno_idx]);
+	  if (sex_col->type_code != kPhenoDtypeCc) {
+	    // could bitwise-and sample_include and pheno_nm before the loop
+	    const uintptr_t* pheno_nm = sex_col->nonmiss;
+	    uint32_t sample_uidx = 0;
+	    if (sex_col->type_code == kPhenoDtypeQt) {
+	      const double* pheno_vals = sex_col->data.qt;
+	      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+		next_set_unsafe_ck(sample_include, &sample_uidx);
+		if (is_set(pheno_nm, sample_uidx)) {
+		  const double dxx = pheno_vals[sample_uidx];
+		  // tolerate '-9' and '0' as missing values, and anything in
+		  // [1, 2] (could be reasonable to represent XXY, etc. with
+		  // decimals).
+		  if (((dxx < 1.0) && (dxx != -9.0) && (dxx != 0.0)) || (dxx > 2.0)) {
+		    logerrprint("Error: .psam SEX values are expected to be in {-9, 0, 1, 2}.\n");
+		    goto write_psam_ret_INCONSISTENT_INPUT;
+		  }
+		}
+	      }
+	    } else {
+	      assert(sex_col->type_code == kPhenoDtypeCat);
+	      const uint32_t nonnull_cat_ct = sex_col->nonnull_category_ct;
+	      if (nonnull_cat_ct) {
+		char** cur_category_names = sex_col->category_names;
+		// tolerate 'M' and 'm' being present simultaneously, etc.
+		uint32_t male_cat_idx1 = 0;
+		uint32_t male_cat_idx2 = 0;
+		uint32_t female_cat_idx1 = 0;
+		uint32_t female_cat_idx2 = 0;
+		for (uint32_t cat_idx = 1; cat_idx <= nonnull_cat_ct; ++cat_idx) {
+		  const char* cur_cat_name = cur_category_names[cat_idx];
+		  if (!cur_cat_name[1]) {
+		    uint32_t first_char_code = (unsigned char)cur_cat_name[0];
+		    first_char_code &= 0xdf;
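+		    // clearing bit 5 uppercases ASCII letters, so 'f'/'F' ->
+		    // 70 and 'm'/'M' -> 77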
+		    if (first_char_code == 70) {
+		      if (!female_cat_idx1) {
+			female_cat_idx1 = cat_idx;
+		      } else {
+			female_cat_idx2 = cat_idx;
+		      }
+		    } else if (first_char_code == 77) {
+		      if (!male_cat_idx1) {
+			male_cat_idx1 = cat_idx;
+		      } else {
+			male_cat_idx2 = cat_idx;
+		      }
+		    }
+		  }
+		}
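+		// some category other than F/f/M/m exists; scan the actual
+		// values to confirm it is never used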
+		if ((uint32_t)((male_cat_idx1 != 0) + (male_cat_idx2 != 0) + (female_cat_idx1 != 0) + (female_cat_idx2 != 0)) < nonnull_cat_ct) {
+		  const uint32_t* pheno_vals = sex_col->data.cat;
+		  for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+		    next_set_unsafe_ck(sample_include, &sample_uidx);
+		    if (is_set(pheno_nm, sample_uidx)) {
+		      const uint32_t cur_cat_idx = pheno_vals[sample_uidx];
+		      if ((cur_cat_idx != male_cat_idx1) && (cur_cat_idx != female_cat_idx1) && (cur_cat_idx != male_cat_idx2) && (cur_cat_idx != female_cat_idx2)) {
+			logerrprint("Error: .psam SEX values are expected to be in {'F', 'f', 'M', 'm'}.\n");
+			goto write_psam_ret_INCONSISTENT_INPUT;
+		      }
+		    }
+		  }
+		}
+	      }
+	    }
+	  }
+	}
+	write_iter = memcpya(write_iter, cur_pheno_name, cur_pheno_name_slen);
+	if (write_iter >= textbuf_flush) {
+	  if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	    goto write_psam_ret_WRITE_FAIL;
+	  }
+	  write_iter = textbuf;
+	}
+      }
+    } else if (write_empty_pheno) {
+      write_iter = strcpya(write_iter, "\tPHENO1");
+    }
+    append_binary_eoln(&write_iter);
+
+    uintptr_t sample_uidx = 0;
+    uint32_t sample_uidx2 = 0;
+    // not really necessary to make sample_uidx increment dependent on
+    // new_sample_idx_to_old == nullptr
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      if (!new_sample_idx_to_old) {
+	next_set_ul_unsafe_ck(sample_include, &sample_uidx);
+      } else {
+	do {
+	  sample_uidx = new_sample_idx_to_old[sample_uidx2++];
+	} while (!IS_SET(sample_include, sample_uidx));
+      }
+      write_iter = strcpya(write_iter, &(sample_ids[max_sample_id_blen * sample_uidx]));
+      if (write_sid) {
+	*write_iter++ = '\t';
+	if (sids) {
+	  write_iter = strcpya(write_iter, &(sids[max_sid_blen * sample_uidx]));
+	} else {
+	  *write_iter++ = '0';
+	}
+      }
+      if (write_parents) {
+	*write_iter++ = '\t';
+	write_iter = strcpyax(write_iter, &(paternal_ids[max_paternal_id_blen * sample_uidx]), '\t');
+	write_iter = strcpya(write_iter, &(maternal_ids[max_maternal_id_blen * sample_uidx]));
+      }
+      if (write_sex) {
+	*write_iter++ = '\t';
+	if (IS_SET(sex_nm, sample_uidx)) {
+	  *write_iter++ = '2' - IS_SET(sex_male, sample_uidx);
+	} else {
+	  // this is better than '0' since it allows the raw column to be used
+	  // as --covar input
+	  // (can't do this for .fam export, though: not worth the
+	  // compatibility issues)
+	  write_iter = strcpya(write_iter, "NA");
+	}
+      }
+      if (write_phenos) {
+	for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	  *write_iter++ = '\t';
+	  write_iter = append_pheno_str(&(pheno_cols[pheno_idx]), output_missing_pheno, omp_slen, sample_uidx, write_iter);
+	  if (write_iter >= textbuf_flush) {
+	    if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	      goto write_psam_ret_WRITE_FAIL;
+	    }
+	    write_iter = textbuf;
+	  }
+	}
+      } else {
+	if (write_empty_pheno) {
+	  *write_iter++ = '\t';
+	  write_iter = memcpya(write_iter, output_missing_pheno, omp_slen);
+	}
+	if (write_iter >= textbuf_flush) {
+	  if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	    goto write_psam_ret_WRITE_FAIL;
+	  }
+	  write_iter = textbuf;
+	}	
+      }
+      append_binary_eoln(&write_iter);
+    }
+    if (write_iter != textbuf) {
+      if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	goto write_psam_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto write_psam_ret_WRITE_FAIL;
+    }
+  }
+  while (0) {
+  write_psam_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  write_psam_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  write_psam_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+  fclose_cond(outfile);
+  return reterr;
+}
+
+
+pglerr_t vcf_sample_line(const char* preexisting_psamname, const char* const_fid, uint32_t double_id, fam_col_t fam_cols, char id_delim, char idspace_to, char flag_char, char* sample_line_first_id, char* outname, char* outname_end, uintptr_t* sample_ct_ptr) {
+  gzFile gz_infile = nullptr;
+  FILE* outfile = nullptr;
+  uintptr_t line_idx = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    uintptr_t const_fid_len = 0;
+    if (const_fid) {
+      const_fid_len = strlen(const_fid);
+    } else if ((!double_id) && (!id_delim)) {
+      // default: --double-id + --id-delim
+      double_id = 1;
+      id_delim = '_';
+    }
+    const uint32_t double_or_const_fid = double_id || const_fid;
+    if (id_delim != ' ') {
+      char* sample_line_iter = strchr(sample_line_first_id, ' ');
+      if (sample_line_iter) {
+	if (!idspace_to) {
+	  logerrprint("Error: VCF/BCF2 sample ID contains space(s).  Use --idspace-to to convert them\nto another character, or \"--id-delim ' '\" to interpret the spaces as FID/IID\ndelimiters.\n");
+	  goto vcf_sample_line_ret_INCONSISTENT_INPUT;
+	}
+	do {
+	  *sample_line_iter = idspace_to;
+	  sample_line_iter = strchr(&(sample_line_iter[1]), ' ');
+	} while (sample_line_iter);
+      }
+    }
+    char* sample_line_iter = sample_line_first_id;
+    char* textbuf = g_textbuf;
+    uintptr_t sample_ct = 0;
+    if (!preexisting_psamname) {
+      strcpy(outname_end, ".psam");
+      if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+	goto vcf_sample_line_ret_OPEN_FAIL;
+      }
+      char* write_iter = strcpya(textbuf, "#FID\tIID");
+      uint32_t sid_present = 0;
+      if (id_delim) {
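+	// pre-scan the sample line: emit an SID column only if some sample ID
+	// contains two id_delim instances (FID-IID-SID form)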
+	while (((unsigned char)sample_line_iter[0]) >= ' ') {
+	  char* token_end = strchr(sample_line_iter, '\t');
+	  if (!token_end) {
+	    token_end = next_prespace(sample_line_iter);
+	  }
+	  char* first_delim = (char*)memchr(sample_line_iter, (unsigned char)id_delim, (uintptr_t)(token_end - sample_line_iter));
+	  if (first_delim) {
+	    sample_line_iter = &(first_delim[1]);
+	    if (memchr(sample_line_iter, (unsigned char)id_delim, (uintptr_t)(token_end - sample_line_iter)) != nullptr) {
+	      sid_present = 1;
+	      write_iter = strcpya(write_iter, "\tSID");
+	      break;
+	    }
+	  }
+	  if (*token_end != '\t') {
+	    break;
+	  }
+	  sample_line_iter = &(token_end[1]);
+	}
+	sample_line_iter = sample_line_first_id;
+      }
+      write_iter = strcpya(write_iter, "\tSEX");
+      append_binary_eoln(&write_iter);
+      char* textbuf_flush = &(textbuf[kMaxMediumLine]);
+      while (((unsigned char)sample_line_iter[0]) >= ' ') {
+	++sample_ct;
+	char* token_end = strchr(sample_line_iter, '\t');
+	if (!token_end) {
+	  token_end = next_prespace(sample_line_iter);
+	}
+	const uint32_t token_slen = (uintptr_t)(token_end - sample_line_iter);
+	if ((*sample_line_iter == '0') && (token_slen == 1)) {
+	  logerrprint("Error: Sample ID cannot be '0'.\n");
+	  goto vcf_sample_line_ret_MALFORMED_INPUT;
+	}
+	if (id_delim) {
+	  if (*sample_line_iter == id_delim) {
+	    sprintf(g_logbuf, "Error: '%c' at beginning of sample ID.\n", id_delim);
+	    goto vcf_sample_line_ret_INCONSISTENT_INPUT_2;
+	  }
+	  if (sample_line_iter[token_slen - 1] == id_delim) {
+	    sprintf(g_logbuf, "Error: '%c' at end of sample ID.\n", id_delim);
+	    goto vcf_sample_line_ret_INCONSISTENT_INPUT_2;
+	  }
+	  char* first_delim = (char*)memchr(sample_line_iter, (unsigned char)id_delim, token_slen);
+	  if (!first_delim) {
+	    if (double_or_const_fid) {
+	      goto vcf_sample_line_nopsam_one_id;
+	    }
+	    sprintf(g_logbuf, "Error: No '%c' in sample ID.\n", id_delim);
+	    goto vcf_sample_line_ret_INCONSISTENT_INPUT_2;
+	  }
+	  char* iid_start = &(first_delim[1]);
+	  char* iid_end = (char*)memchr(iid_start, (unsigned char)id_delim, (uintptr_t)(token_end - iid_start));
+	  const char* sid_start = &(g_one_char_strs[96]);
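+	  // &(g_one_char_strs[96]) is the null-terminated string "0"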
+	  uint32_t sid_slen = 1;
+	  if (iid_end) {
+	    if (iid_start == iid_end) {
+	      sprintf(g_logbuf, "Error: Consecutive instances of '%c' in sample ID.\n", id_delim);
+	      goto vcf_sample_line_ret_INCONSISTENT_INPUT_DELIM;
+	    }
+	    sid_start = &(iid_end[1]);
+	    sid_slen = (uintptr_t)(token_end - sid_start);
+	    if (memchr(sid_start, (unsigned char)id_delim, sid_slen)) {
+	      sprintf(g_logbuf, "Error: More than two instances of '%c' in sample ID.\n", id_delim);
+	      goto vcf_sample_line_ret_INCONSISTENT_INPUT_DELIM;
+	    }
+	    if (sid_slen > kMaxIdSlen) {
+	      logerrprint("Error: SIDs are limited to " MAX_ID_SLEN_STR " characters.\n");
+	      goto vcf_sample_line_ret_MALFORMED_INPUT;
+	    }
+	  } else {
+	    iid_end = token_end;
+	  }
+	  const uint32_t fid_slen = (uintptr_t)(first_delim - sample_line_iter);
+	  if (fid_slen > kMaxIdSlen) {
+	    // strictly speaking, you could have e.g. a 20k char ID which
+	    // splits into a valid FID/IID pair with the right delimiter, but
+	    // you're not supposed to have sample IDs anywhere near that length
+	    // so I'll classify this as MalformedInput.
+	    goto vcf_sample_line_ret_MALFORMED_INPUT_LONG_ID;
+	  }
+	  write_iter = memcpyax(write_iter, sample_line_iter, fid_slen, '\t');
+	  const uint32_t iid_slen = (uintptr_t)(iid_end - iid_start);
+	  if ((*iid_start == '0') && (iid_slen == 1)) {
+	    logerrprint("Error: Sample ID induces an invalid IID of '0'.\n");
+	    goto vcf_sample_line_ret_INCONSISTENT_INPUT;
+	  }
+	  if (iid_slen > kMaxIdSlen) {
+	    goto vcf_sample_line_ret_MALFORMED_INPUT_LONG_ID;
+	  }
+	  write_iter = memcpya(write_iter, iid_start, iid_slen);
+	  if (sid_present) {
+	    *write_iter++ = '\t';
+	    write_iter = memcpya(write_iter, sid_start, sid_slen);
+	  }
+	} else {
+	vcf_sample_line_nopsam_one_id:
+	  if (token_slen > kMaxIdSlen) {
+	    goto vcf_sample_line_ret_MALFORMED_INPUT_LONG_ID;
+	  }
+	  if (double_id) {
+	    write_iter = memcpya(write_iter, sample_line_iter, token_slen);
+	  } else {
+	    write_iter = memcpya(write_iter, const_fid, const_fid_len);
+	  }
+	  *write_iter++ = '\t';
+	  write_iter = memcpya(write_iter, sample_line_iter, token_slen);
+	  if (sid_present) {
+	    write_iter = strcpya(write_iter, "\t0");
+	  }
+	}
+	// PAT/MAT/PHENO1 not required in .psam file
+	// SEX now included, so that --vcf + --out has the same effect as --vcf
+	// + --make-pgen + --out
+	write_iter = memcpyl3a(write_iter, "\tNA");
+	append_binary_eoln(&write_iter);
+	if (write_iter >= textbuf_flush) {
+	  if (fwrite_checked(textbuf, (uintptr_t)(write_iter - textbuf), outfile)) {
+	    goto vcf_sample_line_ret_WRITE_FAIL;
+	  }
+	  write_iter = textbuf;
+	}
+	if (*token_end != '\t') {
+	  break;
+	}
+	sample_line_iter = &(token_end[1]);
+      }
+      if (write_iter != textbuf) {
+	if (fwrite_checked(textbuf, (uintptr_t)(write_iter - textbuf), outfile)) {
+	  goto vcf_sample_line_ret_WRITE_FAIL;
+	}
+      }
+      if (fclose_null(&outfile)) {
+	goto vcf_sample_line_ret_WRITE_FAIL;
+      }
+    } else {
+      // check consistency of IIDs between VCF and .psam file.
+      reterr = gzopen_read_checked(preexisting_psamname, &gz_infile);
+      if (reterr) {
+	goto vcf_sample_line_ret_1;
+      }
+      uintptr_t loadbuf_size = bigstack_left();
+      if (loadbuf_size > kMaxLongLine) {
+	loadbuf_size = kMaxLongLine;
+      } else if (loadbuf_size <= kMaxMediumLine) {
+	goto vcf_sample_line_ret_NOMEM;
+      }
+      char* loadbuf = (char*)g_bigstack_base;
+      // not formally allocated for now
+      loadbuf[loadbuf_size - 1] = ' ';
+      char* loadbuf_first_token;
+      do {
+	++line_idx;
+	if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	  if (!gzeof(gz_infile)) {
+	    goto vcf_sample_line_ret_READ_FAIL;
+	  }
+	  loadbuf_first_token = loadbuf;
+	  loadbuf_first_token[0] = '\0';
+	  break;
+	}
+	if (!loadbuf[loadbuf_size - 1]) {
+	  if (loadbuf_size == kMaxLongLine) {
+	    goto vcf_sample_line_ret_LONG_LINE;
+	  }
+	  goto vcf_sample_line_ret_NOMEM;
+	}
+	loadbuf_first_token = skip_initial_spaces(loadbuf);
+      } while (is_eoln_kns(*loadbuf_first_token) || ((loadbuf_first_token[0] == '#') && strcmp_se(&(loadbuf_first_token[1]), "FID", 3) && strcmp_se(&(loadbuf_first_token[1]), "IID", 3)));
+      uint32_t fid_present;
+      if (loadbuf_first_token[0] == '#') {
+	// only care about position of IID column
+	fid_present = (loadbuf_first_token[1] == 'F');
+      } else {
+	fid_present = fam_cols & kfFamCol1;
+      }
+      while (1) {
+	if (!is_eoln_kns(*loadbuf_first_token)) {
+	  char* psam_iid_start = loadbuf_first_token;
+	  if (fid_present) {
+	    psam_iid_start = skip_initial_spaces(token_endnn(psam_iid_start));
+	    if (is_eoln_kns(*psam_iid_start)) {
+	      goto vcf_sample_line_ret_MISSING_TOKENS;
+	    }
+	  }
+	  if (((unsigned char)sample_line_iter[0]) < ' ') {
+	    sprintf(g_logbuf, "Error: --%ccf file contains fewer sample IDs than %s.\n", flag_char, preexisting_psamname);
+	    goto vcf_sample_line_ret_INCONSISTENT_INPUT_WW;
+	  }
+	  ++sample_ct;
+	  char* sample_line_token_end = strchr(sample_line_iter, '\t');
+	  if (!sample_line_token_end) {
+	    sample_line_token_end = next_prespace(sample_line_iter);
+	  }
+	  uint32_t sample_line_token_slen = (uintptr_t)(sample_line_token_end - sample_line_iter);
+	  if ((*sample_line_iter == '0') && (sample_line_token_slen == 1)) {
+	    logerrprint("Error: Sample ID cannot be '0'.\n");
+	    goto vcf_sample_line_ret_MALFORMED_INPUT;
+	  }
+	  char* sample_line_iid_start = sample_line_iter;
+	  if (id_delim) {
+	    char* first_delim = (char*)memchr(sample_line_iter, (unsigned char)id_delim, sample_line_token_slen);
+	    if (!first_delim) {
+	      if (!double_or_const_fid) {
+		sprintf(g_logbuf, "Error: No '%c' in sample ID.\n", id_delim);
+		goto vcf_sample_line_ret_INCONSISTENT_INPUT_2;
+	      }
+	    } else {
+	      sample_line_iid_start = &(first_delim[1]);
+	      sample_line_token_slen = (uintptr_t)(sample_line_token_end - sample_line_iid_start);
+	      char* sample_line_iid_end = (char*)memchr(sample_line_iid_start, (unsigned char)id_delim, sample_line_token_slen);
+	      if (sample_line_iid_end) {
+		// don't bother erroring out on >2 instances of delimiter for
+		// now
+		sample_line_token_slen = (uintptr_t)(sample_line_iid_end - sample_line_iid_start);
+	      }
+	      if ((*sample_line_iid_start == '0') && (sample_line_token_slen == 1)) {
+		logerrprint("Error: Sample ID induces an invalid IID of '0'.\n");
+		goto vcf_sample_line_ret_INCONSISTENT_INPUT;
+	      }
+	    }
+	  }
+	  if (sample_line_token_slen > kMaxIdSlen) {
+	    goto vcf_sample_line_ret_MALFORMED_INPUT_LONG_ID;
+	  }
+	  if (memcmp(sample_line_iid_start, psam_iid_start, sample_line_token_slen) || (((unsigned char)psam_iid_start[sample_line_token_slen]) > 32)) {
+	    sprintf(g_logbuf, "Error: Mismatched IDs between --%ccf file and %s.\n", flag_char, preexisting_psamname);
+	    goto vcf_sample_line_ret_INCONSISTENT_INPUT_WW;
+	  }
+	  sample_line_iter = &(sample_line_token_end[1]);
+	}
+	++line_idx;
+	if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	  if (!gzeof(gz_infile)) {
+	    goto vcf_sample_line_ret_READ_FAIL;
+	  }
+	  break;
+	}
+	if (!loadbuf[loadbuf_size - 1]) {
+	  if (loadbuf_size == kMaxLongLine) {
+	    goto vcf_sample_line_ret_LONG_LINE;
+	  }
+	  goto vcf_sample_line_ret_NOMEM;
+	}
+	loadbuf_first_token = skip_initial_spaces(loadbuf);
+	if (loadbuf_first_token[0] == '#') {
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s starts with a '#'. (This is only permitted before the first nonheader line, and a #FID/IID header line is present it must denote the end of the header block.)\n", line_idx, preexisting_psamname);
+	  goto vcf_sample_line_ret_MALFORMED_INPUT_WW;
+	}
+      }
+      if (gzclose_null(&gz_infile)) {
+	goto vcf_sample_line_ret_READ_FAIL;
+      }
+      if (((unsigned char)sample_line_iter[0]) >= ' ') {
+	sprintf(g_logbuf, "Error: --%ccf file contains more sample IDs than %s.\n", flag_char, preexisting_psamname);
+	goto vcf_sample_line_ret_INCONSISTENT_INPUT_WW;
+      }
+    }
+    *sample_ct_ptr = sample_ct;
+  }
+  while (0) {
+  vcf_sample_line_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  vcf_sample_line_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  vcf_sample_line_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  vcf_sample_line_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  vcf_sample_line_ret_LONG_LINE:
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, preexisting_psamname);
+  vcf_sample_line_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  vcf_sample_line_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  vcf_sample_line_ret_MISSING_TOKENS:
+    LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, preexisting_psamname);
+    reterr = kPglRetMalformedInput;
+    break;
+  vcf_sample_line_ret_INCONSISTENT_INPUT_DELIM:
+    logerrprintb();
+    if (id_delim == '_') {
+      logerrprint("If you do not want '_' to be treated as a FID/IID delimiter, use --double-id or\n--const-fid to choose a different method of converting VCF sample IDs to PLINK\nIDs, or --id-delim to change the FID/IID delimiter.\n");
+    }
+    reterr = kPglRetInconsistentInput;
+    break;
+  vcf_sample_line_ret_MALFORMED_INPUT_LONG_ID:
+    logerrprint("Error: FIDs and IIDs are limited to " MAX_ID_SLEN_STR " characters.\n");
+    reterr = kPglRetMalformedInput;
+    break;
+  vcf_sample_line_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+  vcf_sample_line_ret_INCONSISTENT_INPUT_2:
+    logerrprintb();
+  vcf_sample_line_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ vcf_sample_line_ret_1:
+  gzclose_cond(gz_infile);
+  fclose_cond(outfile);
+  return reterr;
+}
+
+uint32_t vcf_is_het_short(const char* first_gchar_ptr, vcf_half_call_t vcf_half_call) {
+  // '.' == ascii 46, '0' == ascii 48
+  // if kVcfHalfCallReference, ./0 is not het, but ./1 is
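+  // e.g. 0/1 and 1|0 are always het; 1/1 and ./. never are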
+  const uint32_t first_gchar = (unsigned char)first_gchar_ptr[0];
+  const uint32_t second_gchar = (unsigned char)first_gchar_ptr[2];
+  return (first_gchar != second_gchar) && (((first_gchar != 46) && (second_gchar != 46)) || ((vcf_half_call == kVcfHalfCallReference) && ((first_gchar > 48) || (second_gchar > 48))));
+}
+
+uint32_t get_vcf_format_position(const char* __restrict needle, uint32_t needle_slen, char* format_start, char* format_end) {
+  *format_end = '\0';
+  // assumes first field is GT
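+  // returns the needle's 0-based field index (GT counts as field 0), or 0 if
+  // the needle is absent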
+  char* token_start = &(format_start[3]);
+  uint32_t field_idx = 0;
+  while (1) {
+    char* token_end = strchr(token_start, ':');
+    ++field_idx;
+    if (!token_end) {
+      if (((uintptr_t)(format_end - token_start) != needle_slen) || memcmp(token_start, needle, needle_slen)) {
+	field_idx = 0;
+      }
+      break;
+    }
+    if (((uintptr_t)(token_end - token_start) == needle_slen) && (!memcmp(token_start, needle, needle_slen))) {
+      break;
+    }
+    token_start = &(token_end[1]);
+  }
+  *format_end = '\t';
+  return field_idx;
+}
+
+uint32_t vcf_qual_scan_init(int32_t vcf_min_gq, int32_t vcf_min_dp, char* format_start, char* format_end, uint32_t* qual_field_skips, int32_t* qual_thresholds) {
+  uint32_t gq_field_idx = 0;
+  if (vcf_min_gq >= 0) {
+    gq_field_idx = get_vcf_format_position("GQ", 2, format_start, format_end);
+  }
+  uint32_t qual_field_ct = 0;
+  uint32_t dp_field_idx = 0;
+  if (vcf_min_dp >= 0) {
+    dp_field_idx = get_vcf_format_position("DP", 2, format_start, format_end);
+    if (dp_field_idx && ((!gq_field_idx) || (dp_field_idx < gq_field_idx))) {
+      qual_field_skips[0] = dp_field_idx;
+      qual_thresholds[0] = vcf_min_dp;
+      qual_field_ct = 1;
+      dp_field_idx = 0;
+    }
+  }
+  if (gq_field_idx) {
+    qual_field_skips[qual_field_ct] = gq_field_idx;
+    qual_thresholds[qual_field_ct++] = vcf_min_gq;
+    if (dp_field_idx) {
+      qual_field_skips[qual_field_ct] = dp_field_idx;
+      qual_thresholds[qual_field_ct++] = vcf_min_dp;
+    }
+  }
+  if (qual_field_ct == 2) {
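+    // convert the second absolute field index into an advance count relative
+    // to the first, matching vcf_field_advance()'s expectations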
+    qual_field_skips[1] -= qual_field_skips[0];
+  }
+  return qual_field_ct;
+}
+
+char* vcf_field_advance(char* gtext_iter, char* gtext_end, uint32_t advance_ct) {
+  // assumes advance_ct is nonzero
+  do {
+    char* field_end = (char*)memchr(gtext_iter, ':', gtext_end - gtext_iter);
+    if (!field_end) {
+      return nullptr;
+    }
+    gtext_iter = &(field_end[1]);
+  } while (--advance_ct);
+  return gtext_iter;
+}
+
+// returns 1 if a quality check failed
+// assumes either 1 or 2 qual fields, otherwise change this to a loop
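+// a missing or non-numeric field never fails the check; the thresholds only
+// apply to parseable values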
+uint32_t vcf_check_quals(const uint32_t* qual_field_skips, const int32_t* qual_thresholds, char* gtext_iter, char* gtext_end, uint32_t qual_field_ct) {
+  gtext_iter = vcf_field_advance(gtext_iter, gtext_end, qual_field_skips[0]);
+  if (!gtext_iter) {
+    return 0;
+  }
+  int32_t ii;
+  if ((!scan_int32(gtext_iter, &ii)) && (ii < qual_thresholds[0])) {
+    return 1;
+  }
+  if (qual_field_ct == 1) {
+    return 0;
+  }
+  gtext_iter = vcf_field_advance(gtext_iter, gtext_end, qual_field_skips[1]);
+  if (!gtext_iter) {
+    return 0;
+  }
+  return (!scan_int32(gtext_iter, &ii)) && (ii < qual_thresholds[1]);
+}
+
+boolerr_t parse_vcf_gp(char* gp_iter, uint32_t is_haploid, double import_dosage_certainty, uint32_t* is_missing_ptr, double* alt_dosage_ptr) {
+  // P(0/0), P(0/1), P(1/1), etc.
+  // assumes is_missing initialized to 0
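+  // e.g. a diploid GP of "0.1,0.2,0.7" yields
+  //   alt_dosage = (0.2 + 2 * 0.7) / (0.1 + 0.2 + 0.7) = 1.6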
+  double prob_0alt;
+  gp_iter = scanadv_double(gp_iter, &prob_0alt);
+  if ((!gp_iter) || (prob_0alt < 0.0) || (prob_0alt > 1.0) || (*gp_iter != ',')) {
+    return 1;
+  }
+  double prob_1alt;
+  gp_iter = scanadv_double(&(gp_iter[1]), &prob_1alt);
+  if ((!gp_iter) || (prob_1alt < 0.0) || (prob_1alt > 1.0)) {
+    return 1;
+  }
+  if (is_haploid) {
+    const double denom = prob_0alt + prob_1alt;
+    if (denom <= 2 * import_dosage_certainty) {
+      if ((prob_0alt <= import_dosage_certainty) && (prob_1alt <= import_dosage_certainty)) {
+	*is_missing_ptr = 1;
+	return 1;
+      }
+    }
+    *alt_dosage_ptr = 2 * prob_1alt / denom;
+    return 0;
+  }
+  double prob_2alt;
+  if ((*gp_iter != ',') || (!scanadv_double(&(gp_iter[1]), &prob_2alt)) || (prob_2alt < 0.0) || (prob_2alt > 1.0)) {
+    return 1;
+  }
+  const double denom = prob_0alt + prob_1alt + prob_2alt;
+  if (denom <= 3 * import_dosage_certainty) {
+    if ((prob_0alt <= import_dosage_certainty) && (prob_1alt <= import_dosage_certainty) && (prob_2alt <= import_dosage_certainty)) {
+      // treat as missing
+      // ok to use <= since we multiplied by (1 - epsilon)
+      // during command-line parsing.  this lets us avoid
+      // special-casing denom=0.
+      *is_missing_ptr = 1;
+      return 1; // not really an error
+    }
+  }
+  *alt_dosage_ptr = (prob_1alt + 2 * prob_2alt) / denom;
+  return 0;
+}
+
+boolerr_t parse_vcf_dosage(char* gtext_iter, char* gtext_end, uint32_t dosage_field_idx, uint32_t is_haploid, uint32_t dosage_is_gp, double import_dosage_certainty, uint32_t* is_missing_ptr, uint32_t* dosage_int_ptr) {
+  // assumes is_missing initialized to 0
+  // assumes dosage_field_idx != 0
+  // returns 1 if missing OR parsing error.
+  uint32_t field_idx = 0;
+  do {
+    char* field_end = (char*)memchr(gtext_iter, ':', gtext_end - gtext_iter);
+    if (!field_end) {
+      *is_missing_ptr = 1;
+      return 1;
+    }
+    gtext_iter = &(field_end[1]);
+  } while (++field_idx < dosage_field_idx);
+  if (((gtext_iter[0] == '.') || (gtext_iter[0] == '?')) && (((uint32_t)((unsigned char)gtext_iter[1])) - 48 >= 10)) {
+    // missing field (dot/'?' followed by non-digit)
+    // could enforce gtext_iter[1] == colon, comma, etc.?
+    *is_missing_ptr = 1;
+    return 1;
+  }
+  double alt_dosage;
+  if (dosage_is_gp) {
+    if (parse_vcf_gp(gtext_iter, is_haploid, import_dosage_certainty, is_missing_ptr, &alt_dosage)) {
+      return 1;
+    }
+  } else {
+    if ((!scanadv_double(gtext_iter, &alt_dosage)) || (alt_dosage < 0.0)) {
+      return 1;
+    }
+    if (is_haploid) {
+      // possible todo: allow this to be suppressed (maybe upstream of this
+      // function); 1000 Genomes phase 1 haploid dosages are still on 0..2
+      // scale
+      alt_dosage *= 2;
+    }
+    if (alt_dosage > 2.0) {
+      return 1;
+    }
+  }
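+  // scale the [0, 2] dosage to 16384ths, rounding to nearest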
+  *dosage_int_ptr = (int32_t)(alt_dosage * kDosageMid + 0.5);
+  return 0;
+}
+
+// dosage_int = 0..2 value in 16384ths
+// returns distance from 0.5 or 1.5 in 16384ths, whichever is closer
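+// e.g. assuming kDosageMid == 16384 and kDosage4th == 8192, dosage_int ==
+// 12288 (i.e. dosage 0.75) gives rem == 12288 and halfdist == 4096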
+static inline uint32_t biallelic_dosage_halfdist(uint32_t dosage_int) {
+  const uint32_t dosage_int_rem = dosage_int & (kDosageMid - 1);
+  return abs_int32(((int32_t)dosage_int_rem) - kDosage4th);
+}
+
+static_assert(!kVcfHalfCallReference, "vcf_to_pgen() assumes kVcfHalfCallReference == 0.");
+static_assert(kVcfHalfCallHaploid == 1, "vcf_to_pgen() assumes kVcfHalfCallHaploid == 1.");
+pglerr_t vcf_to_pgen(const char* vcfname, const char* preexisting_psamname, const char* const_fid, const char* dosage_import_field, misc_flags_t misc_flags, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, double import_dosage_certainty, char id_delim, char idspace_to, int32_t vcf_min_gq, int32_t vcf_min_dp, vcf_half_call_t vcf_half_call, fam_col_t fam_cols, char* outname, char* outname_end, chr_info_t* cip) {
+  // Now performs a 2-pass load.  Yes, this can be slower than plink 1.9, but
+  // it's currently necessary for use of the Pgen_writer classes, which must
+  // know upfront how many variants there are and whether phase/dosage data is
+  // present.
+  // preexisting_psamname should be nullptr if no such file was specified.
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  FILE* pvarfile = nullptr;
+  uintptr_t line_idx = 0;
+  const uint32_t vcf_half_call_explicit_error = (vcf_half_call == kVcfHalfCallError);
+  st_pgen_writer_t spgw;
+  pglerr_t reterr = kPglRetSuccess;
+  gzFile gz_infile;
+  spgw_preinit(&spgw);
+  {
+    // don't use gzopen_read_checked() since we want to customize the error
+    // message
+    gz_infile = gzopen(vcfname, FOPEN_RB);
+    if (!gz_infile) {
+      const uint32_t slen = strlen(vcfname);
+      if (((slen > 4) && (!memcmp(&(vcfname[slen - 4]), ".vcf", 4))) || ((slen > 7) && (!memcmp(&(vcfname[slen - 7]), ".vcf.gz", 7)))) {
+	LOGERRPRINTFWW(g_errstr_fopen, vcfname);
+      } else {
+	LOGERRPRINTFWW("Error: Failed to open %s. (--vcf expects a complete filename; did you forget '.vcf' at the end?)\n", vcfname);
+      }
+      goto vcf_to_pgen_ret_OPEN_FAIL;
+    }
+    if (gzbuffer(gz_infile, 131072)) {
+      goto vcf_to_pgen_ret_NOMEM;
+    }
+    uintptr_t loadbuf_size = bigstack_left() / 4;
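+    // use at most a quarter of the remaining workspace for the line buffer,
+    // leaving the rest for later allocations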
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else if (loadbuf_size <= kMaxMediumLine) {
+      goto vcf_to_pgen_ret_NOMEM;
+    } else {
+      loadbuf_size = round_up_pow2(loadbuf_size, kEndAllocAlign);
+    }
+    char* loadbuf = (char*)bigstack_end_alloc_raw(loadbuf_size);
+    loadbuf[loadbuf_size - 1] = ' ';
+    char* loadbuf_iter;
+    const uint32_t allow_extra_chrs = (misc_flags / kfMiscAllowExtraChrs) & 1;
+    uint32_t dosage_import_field_slen = 0;
+    if (dosage_import_field) {
+      dosage_import_field_slen = strlen(dosage_import_field);
+    }
+    const uint32_t dosage_is_gp = (dosage_import_field_slen == 2) && (!memcmp(dosage_import_field, "GP", 2));
+    uint32_t format_gt_present = 0;
+    uint32_t format_gq_relevant = 0;
+    uint32_t format_dp_relevant = 0;
+    uint32_t format_dosage_relevant = 0;
+    uint32_t info_pr_present = 0;
+    uint32_t info_nonpr_present = 0;
+    uint32_t chrset_present = 0;
+    while (1) {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto vcf_to_pgen_ret_READ_FAIL;
+	}
+	logerrprint("Error: No #CHROM header line or variant records in --vcf file.\n");
+	goto vcf_to_pgen_ret_MALFORMED_INPUT;
+      }
+      if ((line_idx == 1) && (!memcmp(loadbuf, "BCF", 3))) {
+	// this is more informative than "missing header line"...
+	if (loadbuf[3] == 2) {
+	  sprintf(g_logbuf, "Error: %s appears to be a BCF2 file. Try --bcf instead of --vcf.\n", vcfname);
+	  goto vcf_to_pgen_ret_MALFORMED_INPUT_WW;
+	}
+	if (loadbuf[3] == 4) {
+	  sprintf(g_logbuf, "Error: %s appears to be a BCF1 file. Use 'bcftools view' to convert it to a PLINK-readable VCF.\n", vcfname);
+	  goto vcf_to_pgen_ret_MALFORMED_INPUT_WW;
+	}
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	if (loadbuf_size == kMaxLongLine) {
+	  goto vcf_to_pgen_ret_LONG_LINE;
+	}
+	goto vcf_to_pgen_ret_NOMEM;
+      }
+      // don't tolerate leading spaces
+      loadbuf_iter = loadbuf;
+      if (*loadbuf_iter != '#') {
+	logerrprint("Error: No #CHROM header line in --vcf file.\n");
+	goto vcf_to_pgen_ret_MALFORMED_INPUT;
+      }
+      if (loadbuf_iter[1] != '#') {
+	break;
+      }
+      // Recognized header lines:
+      // ##fileformat: discard (regenerate; todo: conditionally error out)
+      // ##fileDate: discard (regenerate)
+      // ##source: discard (regenerate)
+      // ##contig: conditionally keep
+      // ##INFO: note presence of INFO:PR, note presence of at least one non-PR
+      //         field, keep data (though, if INFO:PR is the *only* field,
+      //         omit it from the .pvar for consistency with --make-pgen
+      //         default)
+      // ##FORMAT: note presence of FORMAT:GT and FORMAT:GP, discard
+      //           (regenerate)
+      // ##chrSet: if recognized, perform consistency check and/or update
+      //           chr_info
+      //
+      // Everything else (##FILTER, ##reference, etc.) is passed through
+      // unchanged.  FILTER values in the VCF body do not have to be mentioned
+      // in the header (since only BCF, not VCF, spec requires that).
+      //
+      // Because of how ##contig is handled (we only keep the lines which
+      // correspond to chromosomes/contigs actually present in the VCF, and not
+      // filtered out), we wait until second pass to write the .pvar.
+      if (!memcmp(&(loadbuf_iter[2]), "chrSet=<", 8)) {
+	if (chrset_present) {
+	  logerrprint("Error: Multiple ##chrSet header lines in --vcf file.\n");
+	  goto vcf_to_pgen_ret_MALFORMED_INPUT;
+	}
+	chrset_present = 1;
+	// .pvar loader will print a warning if necessary
+	reterr = read_chrset_header_line(&(loadbuf_iter[10]), "--vcf file", misc_flags, line_idx, cip);
+	if (reterr) {
+	  goto vcf_to_pgen_ret_1;
+	}
+      } else if (!memcmp(&(loadbuf_iter[2]), "FORMAT=<ID=GT,Number=", 21)) {
+	if (format_gt_present) {
+	  logerrprint("Error: Duplicate FORMAT:GT header line in --vcf file.\n");
+	  goto vcf_to_pgen_ret_MALFORMED_INPUT;
+	}
+	if (memcmp(&(loadbuf_iter[23]), "1,Type=String,Description=", 26)) {
+	  sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of --vcf file does not have expected FORMAT:GT format.\n", line_idx);
+	  goto vcf_to_pgen_ret_MALFORMED_INPUT_WW;
+	}
+	format_gt_present = 1;
+      } else if ((vcf_min_gq != -1) && (!memcmp(&(loadbuf_iter[2]), "FORMAT=<ID=GQ,Number=", 21))) {
+	// (only the prefix through "Number=" is validated for GQ/DP)
+	if (format_gq_relevant) {
+	  logerrprint("Error: Duplicate FORMAT:GQ header line in --vcf file.\n");
+	  goto vcf_to_pgen_ret_MALFORMED_INPUT;
+	}
+	format_gq_relevant = 1;
+      } else if ((vcf_min_dp != -1) && (!memcmp(&(loadbuf_iter[2]), "FORMAT=<ID=DP,Number=", 21))) {
+	if (format_dp_relevant) {
+	  logerrprint("Error: Duplicate FORMAT:DP header line in --vcf file.\n");
+	  goto vcf_to_pgen_ret_MALFORMED_INPUT;
+	}
+	format_dp_relevant = 1;
+      } else if (dosage_import_field && (!memcmp(&(loadbuf_iter[2]), "FORMAT=<ID=", 11)) && (!memcmp(&(loadbuf_iter[13]), dosage_import_field, dosage_import_field_slen)) && (loadbuf_iter[13 + dosage_import_field_slen] == ',')) {
+	if (format_dosage_relevant) {
+	  LOGERRPRINTFWW("Error: Duplicate FORMAT:%s header line in --vcf file.\n", dosage_import_field);
+	  goto vcf_to_pgen_ret_MALFORMED_INPUT_WW;
+	}
+	format_dosage_relevant = 1;
+      } else if (!memcmp(&(loadbuf_iter[2]), "INFO=<ID=", 9)) {
+	if (!memcmp(&(loadbuf_iter[11]), "PR,Number=", 10)) {
+	  if (info_pr_present) {
+	    logerrprint("Error: Duplicate INFO:PR header line in --vcf file.\n");
+	    goto vcf_to_pgen_ret_MALFORMED_INPUT;
+	  }
+	  if (memcmp(&(loadbuf_iter[21]), "0,Type=Flag,Description=", 24)) {
+	    sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of --vcf file does not have expected INFO:PR format.\n", line_idx);
+	    goto vcf_to_pgen_ret_MALFORMED_INPUT_WW;
+	  }
+	  info_pr_present = 1;
+	} else {
+	  info_nonpr_present = 1;
+	}
+      }
+    }
+    const uint32_t require_gt = (misc_flags / kfMiscVcfRequireGt) & 1;
+    if ((!format_gt_present) && require_gt) {
+      // todo: allow_no_variants exception
+      logerrprint("Error: No GT field in --vcf file header, when --vcf-require-gt was specified.\n");
+      goto vcf_to_pgen_ret_INCONSISTENT_INPUT;
+    }
+    if ((!format_gq_relevant) && (vcf_min_gq != -1)) {
+      logerrprint("Warning: No GQ field in --vcf file header.  --vcf-min-gq ignored.\n");
+      vcf_min_gq = -1;
+    }
+    if ((!format_dp_relevant) && (vcf_min_dp != -1)) {
+      logerrprint("Warning: No DP field in --vcf file header.  --vcf-min-dp ignored.\n");
+      vcf_min_dp = -1;
+    }
+    const uint32_t format_gq_or_dp_relevant = format_gq_relevant || format_dp_relevant;
+    if ((!format_dosage_relevant) && dosage_import_field) {
+      LOGERRPRINTFWW("Warning: No %s field in --vcf file header. Dosages will not be imported.\n", dosage_import_field);
+    }
+    finalize_chrset(misc_flags, cip);
+    // don't call finalize_chr_info here, since this may be followed by
+    // --pmerge, etc.
+    
+    if (memcmp(loadbuf_iter, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", 38)) {
+      sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of --vcf file does not have expected field sequence after #CHROM.\n", line_idx);
+      goto vcf_to_pgen_ret_MALFORMED_INPUT_WW;
+    }
+    loadbuf_iter = &(loadbuf_iter[38]);
+    const uint32_t double_id = (misc_flags / kfMiscDoubleId) & 1;
+    uintptr_t sample_ct = 0;
+    if (!memcmp(loadbuf_iter, "\tFORMAT\t", 8)) {
+      reterr = vcf_sample_line(preexisting_psamname, const_fid, double_id, fam_cols, id_delim, idspace_to, 'v', &(loadbuf_iter[8]), outname, outname_end, &sample_ct);
+      if (reterr) {
+	goto vcf_to_pgen_ret_1;
+      }
+      /*
+    } else if (allow_no_samples) {
+      strcpy(outname_end, ".psam");
+      if (fopen_checked(outname, "w", &outfile)) {
+	goto vcf_to_pgen_ret_OPEN_FAIL;
+      }
+      fputs("#FID\tIID\n", outfile);
+      if (fclose_null(&outfile)) {
+	goto vcf_to_pgen_ret_WRITE_FAIL;
+      }
+      */
+    }
+    // todo: allow_no_samples exception
+    if (!sample_ct) {
+      logerrprint("Error: No samples in --vcf file.\n");
+      goto vcf_to_pgen_ret_INCONSISTENT_INPUT;
+    }
+
+    uint32_t variant_ct = 0;
+    uint32_t max_alt_ct = 1;
+    uintptr_t* variant_allele_idxs = (uintptr_t*)g_bigstack_base;
+    uintptr_t max_variant_ct = (uintptr_t)(((uintptr_t*)g_bigstack_end) - variant_allele_idxs);
+    max_variant_ct -= BITCT_TO_ALIGNED_WORDCT(max_variant_ct) * kWordsPerVec;
+    if (format_dosage_relevant) {
+      max_variant_ct -= BITCT_TO_ALIGNED_WORDCT(max_variant_ct) * kWordsPerVec;
+    }
+    if (info_pr_present) {
+      max_variant_ct -= BITCT_TO_ALIGNED_WORDCT(max_variant_ct) * kWordsPerVec;
+    }
+#ifdef __LP64__
+    if (max_variant_ct > 0x7ffffffd) {
+      max_variant_ct = 0x7ffffffd;
+    }
+#endif
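+    // memory budget bookkeeping: variant_allele_idxs grows from the bigstack
+    // base at one word per variant, while each flag bitvector (phasing
+    // always; dosage/nonref only when relevant) later grows from the bigstack
+    // end at one bit per variant.  the subtractions above reserve
+    // kWordsPerVec times the aligned word count actually allocated below, so
+    // max_variant_ct errs on the safe side.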
+    uintptr_t base_chr_present[kChrExcludeWords];
+    fill_ulong_zero(kChrExcludeWords, base_chr_present);
+    
+    const uintptr_t header_line_ct = line_idx;
+    const uint32_t max_variant_ctaw = BITCT_TO_ALIGNED_WORDCT(max_variant_ct);
+    uintptr_t* phasing_flags = (uintptr_t*)bigstack_end_alloc_raw_rd(max_variant_ctaw * sizeof(intptr_t));
+    uintptr_t* phasing_flags_iter = phasing_flags;
+    uintptr_t* dosage_flags = nullptr;
+    if (format_dosage_relevant) {
+      dosage_flags = (uintptr_t*)bigstack_end_alloc_raw_rd(max_variant_ctaw * sizeof(intptr_t));
+    }
+    uintptr_t* dosage_flags_iter = dosage_flags;
+    uintptr_t* nonref_flags = nullptr;
+    if (info_pr_present) {
+      nonref_flags = (uintptr_t*)bigstack_end_alloc_raw_rd(max_variant_ctaw * sizeof(intptr_t));
+    }
+    uintptr_t* nonref_flags_iter = nonref_flags;
+    if (vcf_half_call == kVcfHalfCallDefault) {
+      vcf_half_call = kVcfHalfCallError;
+    }
+    uintptr_t variant_skip_ct = 0;
+    uintptr_t phasing_word = 0;
+    uintptr_t dosage_word = 0;
+    uintptr_t nonref_word = 0;
+    uintptr_t allele_idx_end = 0;
+    uint32_t max_allele_slen = 1;
+    uint32_t max_qualfilterinfo_slen = 6;
+    uint32_t qual_field_ct = 0;
+
+    const uint32_t dosage_erase_halfdist = kDosage4th - dosage_erase_thresh;
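+    // halfdist convention (see biallelic_dosage_halfdist): a dosage exactly
+    // on a hardcall (a multiple of kDosageMid) has the maximum halfdist of
+    // kDosage4th, and a dosage exactly midway between two hardcalls has
+    // halfdist 0.  so cur_halfdist < dosage_erase_halfdist means the dosage
+    // is more than dosage_erase_thresh away from the nearest hardcall, and
+    // must be stored explicitly rather than erased.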
+
+    // temporary kludge
+    uintptr_t multiallelic_skip_ct = 0;
+
+    while (1) {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto vcf_to_pgen_ret_READ_FAIL;
+	}
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	if (loadbuf_size == kMaxLongLine) {
+	  goto vcf_to_pgen_ret_LONG_LINE_N;
+	}
+	goto vcf_to_pgen_ret_NOMEM;
+      }
+      // do tolerate trailing newlines
+      if ((unsigned char)(*loadbuf) <= 32) {
+	if (*loadbuf == ' ') {
+	  sprintf(g_logbuf, "Error: Leading space on line %" PRIuPTR " of --vcf file.\n", line_idx);
+	  goto vcf_to_pgen_ret_MALFORMED_INPUT_2N;
+	}
+	continue;
+      }
+      loadbuf_iter = loadbuf;
+      char* chr_code_end = strchr(loadbuf, '\t');
+      if (!chr_code_end) {
+	goto vcf_to_pgen_ret_MISSING_TOKENS;
+      }
+      // QUAL/FILTER enforcement is now postponed till .pvar loading.  only
+      // other things we do during the scanning pass are (i) count alt alleles,
+      // and (ii) check whether any phased genotype calls are present.
+
+      char* pos_end = strchr(&(chr_code_end[1]), '\t');
+      if (!pos_end) {
+	goto vcf_to_pgen_ret_MISSING_TOKENS;
+      }
+      
+      // may as well check ID length here
+      // postpone POS validation till second pass so we only have to parse it
+      // once
+      char* id_end = strchr(&(pos_end[1]), '\t');
+      if (!id_end) {
+	goto vcf_to_pgen_ret_MISSING_TOKENS;
+      }
+      if ((uintptr_t)(id_end - pos_end) > kMaxIdBlen) {
+	sprintf(g_logbuf, "Error: Invalid ID on line %" PRIuPTR " of --vcf file (max " MAX_ID_SLEN_STR " chars).\n", line_idx);
+	goto vcf_to_pgen_ret_MALFORMED_INPUT_WW;
+      }
+      
+      // note REF length
+      char* ref_allele_start = &(id_end[1]);
+      loadbuf_iter = strchr(ref_allele_start, '\t');
+      if (!loadbuf_iter) {
+	goto vcf_to_pgen_ret_MISSING_TOKENS;
+      }
+      uint32_t cur_max_allele_slen = (uintptr_t)(loadbuf_iter - ref_allele_start);
+
+      uint32_t alt_ct = 1;
+      unsigned char ucc;
+      // treat ALT=. as if it were an actual allele for now
+      while (1) {
+	char* cur_allele_start = ++loadbuf_iter;
+	ucc = (unsigned char)(*loadbuf_iter);
+	if ((ucc <= ',') && (ucc != '*')) {
+	  sprintf(g_logbuf, "Error: Invalid alternate allele on line %" PRIuPTR " of --vcf file.\n", line_idx);
+	  goto vcf_to_pgen_ret_MALFORMED_INPUT_2N;
+	}
+	do {
+	  ucc = (unsigned char)(*(++loadbuf_iter));
+	  // allow GATK 3.4 <*:DEL> symbolic allele
+	} while ((ucc > ',') || (ucc == '*'));
+	const uint32_t cur_allele_slen = (uintptr_t)(loadbuf_iter - cur_allele_start);
+	if (cur_allele_slen > cur_max_allele_slen) {
+	  cur_max_allele_slen = cur_allele_slen;
+	}
+	if (ucc != ',') {
+	  break;
+	}
+	++alt_ct;
+      }
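+      // e.g. ALT "A,<*:DEL>" exits the loop above with alt_ct == 2 and
+      // cur_max_allele_slen >= 7; the ((ucc > ',') || (ucc == '*')) test is
+      // what lets '*' (0x2a, which sorts below ',') through.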
+
+      // temporary kludge
+      if (alt_ct > 1) {
+	++multiallelic_skip_ct;
+	continue;
+      }
+      
+      if (ucc != '\t') {
+	sprintf(g_logbuf, "Error: Malformed ALT field on line %" PRIuPTR " of --vcf file.\n", line_idx);
+	goto vcf_to_pgen_ret_MALFORMED_INPUT_2N;
+      }
+      if (alt_ct > max_alt_ct) {
+	max_alt_ct = alt_ct;
+      }
+
+      // skip QUAL, FILTER
+      char* qual_start_m1 = loadbuf_iter;
+      for (uint32_t uii = 0; uii < 2; ++uii) {
+	loadbuf_iter = strchr(&(loadbuf_iter[1]), '\t');
+	if (!loadbuf_iter) {
+	  goto vcf_to_pgen_ret_MISSING_TOKENS;
+	}
+      }
+      
+      // possibly check for FORMAT:GT before proceeding
+      char* info_start = &(loadbuf_iter[1]);
+      char* info_end = strchr(info_start, '\t');
+      if (!info_end) {
+	goto vcf_to_pgen_ret_MISSING_TOKENS;
+      }
+      loadbuf_iter = &(info_end[1]);
+      const uint32_t gt_missing = memcmp(loadbuf_iter, "GT", 2) || ((loadbuf_iter[2] != ':') && (loadbuf_iter[2] != '\t'));
+      if (require_gt && gt_missing) {
+	++variant_skip_ct;
+	continue;
+      }
+      const uint32_t cur_qualfilterinfo_slen = (uintptr_t)(info_end - qual_start_m1);
+
+      // all converters *do* respect chromosome filters
+      // wait till this point to apply it, since we don't want to
+      // add a contig name to the hash table unless at least one variant on
+      // that contig wasn't filtered out for other reasons.
+      int32_t cur_chr_code;
+      reterr = get_or_add_chr_code_destructive("--vcf file", line_idx, allow_extra_chrs, loadbuf, chr_code_end, cip, &cur_chr_code);
+      if (reterr) {
+	goto vcf_to_pgen_ret_1;
+      }
+      if (!is_set(cip->chr_mask, cur_chr_code)) {
+	++variant_skip_ct;
+	continue;
+      }
+      if (cur_max_allele_slen > max_allele_slen) {
+	max_allele_slen = cur_max_allele_slen;
+      }
+      if (cur_qualfilterinfo_slen > max_qualfilterinfo_slen) {
+	max_qualfilterinfo_slen = cur_qualfilterinfo_slen;
+      }
+      if ((uint32_t)cur_chr_code <= cip->max_code) {
+	set_bit(cur_chr_code, base_chr_present);
+      }
+
+      variant_allele_idxs[variant_ct] = allele_idx_end;
+      allele_idx_end += alt_ct + 1;
+      const uint32_t variant_idx_lowbits = variant_ct % kBitsPerWord;
+      if (info_pr_present) {
+	if (pr_in_info_token((uintptr_t)(info_end - info_start), info_start)) {
+	  nonref_word |= k1LU << variant_idx_lowbits;
+	}
+	if (variant_idx_lowbits == (kBitsPerWord - 1)) {
+	  *nonref_flags_iter++ = nonref_word;
+	  nonref_word = 0;
+	}
+      }
+      if (!gt_missing) {
+	// todo: sample_ct == 0 case
+	// possible todo: import dosages when GT missing
+	
+	// loadbuf_iter currently points to beginning of FORMAT field
+	char* format_end = strchr(loadbuf_iter, '\t');
+	if (!format_end) {
+	  goto vcf_to_pgen_ret_MISSING_TOKENS;
+	}
+
+	uint32_t qual_field_skips[2];
+	int32_t qual_thresholds[2];
+	if (format_gq_or_dp_relevant) {
+	  qual_field_ct = vcf_qual_scan_init(vcf_min_gq, vcf_min_dp, loadbuf_iter, format_end, qual_field_skips, qual_thresholds);
+	}
+        // 0 if the dosage field is absent; otherwise its 0-based FORMAT index
+        // (GT is required to be the first FORMAT field when present, so a
+        // present dosage field always has a nonzero index)
+	uint32_t dosage_field_idx = 0;
+	if (format_dosage_relevant) {
+	  dosage_field_idx = get_vcf_format_position(dosage_import_field, dosage_import_field_slen, loadbuf_iter, format_end);
+	}
+
+	// check if there's at least one phased het call, and/or at least one
+	// relevant dosage
+	if (alt_ct < 10) {
+	  // always check for a phased het
+	  char* phasescan_iter = format_end;
+	  while (1) {
+	    // this should quickly fail if there are no phased calls at all.
+	    phasescan_iter = strchr(&(phasescan_iter[1]), '|');
+	    if (!phasescan_iter) {
+	      break;
+	    }
+	    if (phasescan_iter[-2] != '\t') {
+	      // at least one other gdata field uses the '|' character.
+	      // switch to iterating over tabs.
+	      while (1) {
+		phasescan_iter = strchr(&(phasescan_iter[1]), '\t');
+		if (!phasescan_iter) {
+		  break;
+		}
+		if (phasescan_iter[2] == '|') {
+		  if (vcf_is_het_short(&(phasescan_iter[1]), vcf_half_call)) {
+		    if (qual_field_ct) {
+		      char* cur_gtext_end = strchr(phasescan_iter, '\t');
+		      if (!cur_gtext_end) {
+			cur_gtext_end = next_prespace(phasescan_iter);
+		      }
+		      if (vcf_check_quals(qual_field_skips, qual_thresholds, phasescan_iter, cur_gtext_end, qual_field_ct)) {
+			break;
+		      }
+		    }
+		    phasing_word |= k1LU << variant_idx_lowbits;
+		    break;
+		  }
+		}
+	      }
+	      break;
+	    }
+	    if (vcf_is_het_short(&(phasescan_iter[-1]), vcf_half_call)) {
+	      if (qual_field_ct) {
+		char* cur_gtext_end = strchr(phasescan_iter, '\t');
+		if (!cur_gtext_end) {
+		  cur_gtext_end = next_prespace(phasescan_iter);
+		}
+		if (vcf_check_quals(qual_field_skips, qual_thresholds, phasescan_iter, cur_gtext_end, qual_field_ct)) {
+		  break;
+		}
+	      }
+	      phasing_word |= k1LU << variant_idx_lowbits;
+	      break;
+	    }
+	    phasescan_iter = strchr(&(phasescan_iter[1]), '\t');
+	    if (!phasescan_iter) {
+	      break;
+	    }
+	  }
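+	  // dosage pre-scan: set this variant's dosage_flags bit iff at least
+	  // one sample has a dosage which passes the GQ/DP filters and can't
+	  // be erased to a pure hardcall (cur_halfdist <
+	  // dosage_erase_halfdist).  note that the scan stops early if a
+	  // sample fails the quality filters.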
+	  if (dosage_field_idx) {
+	    char* dosagescan_iter = format_end;
+	    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	      char* cur_gtext_start = ++dosagescan_iter;
+	      char* cur_gtext_end = strchr(dosagescan_iter, '\t');
+	      if (!cur_gtext_end) {
+		if (sample_idx + 1 != sample_ct) {
+		  goto vcf_to_pgen_ret_MISSING_TOKENS;
+		}
+		cur_gtext_end = next_prespace(dosagescan_iter);
+	      }
+	      dosagescan_iter = cur_gtext_end;
+	      if (qual_field_ct) {
+		if (vcf_check_quals(qual_field_skips, qual_thresholds, cur_gtext_start, cur_gtext_end, qual_field_ct)) {
+		  break;
+		}
+	      }
+	      const uint32_t is_haploid = (cur_gtext_start[1] != '/') && (cur_gtext_start[1] != '|');
+	      uint32_t is_missing = 0;
+	      uint32_t dosage_int;
+	      if (parse_vcf_dosage(cur_gtext_start, cur_gtext_end, dosage_field_idx, is_haploid, dosage_is_gp, import_dosage_certainty, &is_missing, &dosage_int)) {
+		if (is_missing) {
+		  continue;
+		}
+		goto vcf_to_pgen_ret_INVALID_DOSAGE;
+	      }
+	      const uint32_t cur_halfdist = biallelic_dosage_halfdist(dosage_int);
+	      if (cur_halfdist < dosage_erase_halfdist) {
+		goto vcf_to_pgen_dosagescan_hit;
+	      }
+	    }
+	  }
+	  if (0) {
+	  vcf_to_pgen_dosagescan_hit:
+	    dosage_word |= k1LU << variant_idx_lowbits;
+	  }
+	} else {
+	  // alt_ct >= 10
+	  // todo
+	}
+      }
+      if (variant_idx_lowbits == (kBitsPerWord - 1)) {
+	*phasing_flags_iter++ = phasing_word;
+	phasing_word = 0;
+	if (dosage_flags_iter) {
+	  *dosage_flags_iter++ = dosage_word;
+	  dosage_word = 0;
+	}
+      }
+      if (variant_ct++ == max_variant_ct) {
+#ifdef __LP64__
+	if (variant_ct == 0x7ffffffd) {
+	  logerrprint("Error: " PROG_NAME_STR " does not support more than 2^31 - 3 variants.  We recommend other\nsoftware, such as PLINK/SEQ, for very deep studies of small numbers of genomes.\n");
+	  goto vcf_to_pgen_ret_MALFORMED_INPUT;
+	}
+#endif
+	goto vcf_to_pgen_ret_NOMEM;
+      }
+      if (!(variant_ct % 1000)) {
+	printf("\r--vcf: %uk variants scanned.", variant_ct / 1000);
+	fflush(stdout);
+      }
+    }
+    if (variant_ct % kBitsPerWord) {
+      *phasing_flags_iter = phasing_word;
+      if (dosage_flags_iter) {
+	*dosage_flags_iter = dosage_word;
+      }
+      if (nonref_flags_iter) {
+	*nonref_flags_iter = nonref_word;
+      }
+    } else if (!variant_ct) {
+      // todo: allow_no_variants exception
+      logerrprint("Error: No variants in --vcf file.\n");
+      goto vcf_to_pgen_ret_INCONSISTENT_INPUT;
+    }
+    
+    putc_unlocked('\r', stdout);
+    if (!variant_skip_ct) {
+      LOGPRINTF("--vcf: %u variant%s scanned.\n", variant_ct, (variant_ct == 1)? "" : "s");
+    } else {
+      LOGPRINTF("--vcf: %u variant%s scanned (%" PRIuPTR " skipped).\n", variant_ct, (variant_ct == 1)? "" : "s", variant_skip_ct);
+    }
+
+    // temporary kludge
+    if (multiallelic_skip_ct) {
+      LOGERRPRINTFWW("Warning: %" PRIuPTR " multiallelic variant%s %sskipped (not yet supported).\n", multiallelic_skip_ct, (multiallelic_skip_ct == 1)? "" : "s", variant_skip_ct? "also " : "");
+    }
+    
+    if (gzrewind(gz_infile)) {
+      goto vcf_to_pgen_ret_READ_FAIL;
+    }
+    const uintptr_t line_ct = line_idx - 1;
+
+    if (allele_idx_end > 2 * variant_ct) {
+      variant_allele_idxs[variant_ct] = allele_idx_end;
+      bigstack_finalize_ul(variant_allele_idxs, variant_ct + 1);
+    } else {
+      variant_allele_idxs = nullptr;
+    }
+    
+    strcpy(outname_end, ".pvar");
+    if (fopen_checked(outname, FOPEN_WB, &pvarfile)) {
+      goto vcf_to_pgen_ret_OPEN_FAIL;
+    }
+    line_idx = 0;
+    while (1) {
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	goto vcf_to_pgen_ret_READ_FAIL;
+      }
+      if (++line_idx == header_line_ct) {
+	break;
+      }
+      // don't use textbuf here, since header line length could theoretically
+      // exceed kMaxMediumLine bytes
+      if ((!memcmp(loadbuf, "##fileformat=", 13)) || (!memcmp(loadbuf, "##fileDate=", 11)) || (!memcmp(loadbuf, "##source=", 9)) || (!memcmp(loadbuf, "##FORMAT=", 9)) || (!memcmp(loadbuf, "##chrSet=", 9))) {
+	continue;
+      }
+      const uint32_t line_slen = strlen(loadbuf);
+      if (!memcmp(loadbuf, "##contig=<ID=", 13)) {
+	char* contig_name_start = &(loadbuf[13]);
+	char* contig_name_end = strchr(contig_name_start, ',');
+	if (!contig_name_end) {
+	  // could search backwards from end of line in this case, but if
+	  // contig names are long enough for that to matter we have other
+	  // problems...
+	  contig_name_end = strchr(contig_name_start, '>');
+	  if (!contig_name_end) {
+	    sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of --vcf file does not have expected ##contig format.\n", line_idx);
+	    goto vcf_to_pgen_ret_MALFORMED_INPUT_WW;
+	  }
+	}
+        const int32_t cur_chr_code = get_chr_code_counted(cip, (uintptr_t)(contig_name_end - contig_name_start), contig_name_start);
+	if (cur_chr_code < 0) {
+	  continue;
+	}
+	if ((uint32_t)cur_chr_code <= cip->max_code) {
+	  if (!is_set(base_chr_present, cur_chr_code)) {
+	    continue;
+	  }
+	} else {
+	  if (!is_set(cip->chr_mask, cur_chr_code)) {
+	    continue;
+	  }
+	}
+	// Note that, when --output-chr is specified, we don't update the
+	// ##contig header line chromosome code in the .pvar file, since
+	// ##contig is not an explicit part of the .pvar specification; it's
+	// just another blob of text as far as the main body of plink2 is
+	// concerned.  However, the codes are brought in sync during VCF/BCF
+	// export.
+      }
+      // force OS-appropriate eoln
+      // don't need to check for \n since this is pass #2, we already validated
+      // file contents, and we can't be on the last line of a file lacking a
+      // final eoln since #CHROM is still to come
+      char* line_end = &(loadbuf[line_slen - 1]);
+      if (line_end[-1] == '\r') {
+	--line_end;
+      }
+      append_binary_eoln(&line_end);
+      if (fwrite_checked(loadbuf, (uintptr_t)(line_end - loadbuf), pvarfile)) {
+	goto vcf_to_pgen_ret_WRITE_FAIL;
+      }
+    }
+    char* write_iter = g_textbuf;
+    if (cip->chrset_source) {
+      append_chrset_line(cip, &write_iter);
+    }
+    write_iter = strcpya(write_iter, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER");
+    if (info_nonpr_present) {
+      write_iter = strcpya(write_iter, "\tINFO");
+    }
+    append_binary_eoln(&write_iter);
+    if (fwrite_checked(g_textbuf, write_iter - g_textbuf, pvarfile)) {
+      goto vcf_to_pgen_ret_WRITE_FAIL;
+    }
+    
+    const uint32_t variant_ctl = BITCT_TO_WORDCT(variant_ct);
+    pgen_global_flags_t phase_dosage_gflags = are_all_words_zero(phasing_flags, variant_ctl)? kfPgenGlobal0 : kfPgenGlobalHardcallPhasePresent;
+    if (format_dosage_relevant) {
+      if (are_all_words_zero(dosage_flags, variant_ctl)) {
+	format_dosage_relevant = 0;
+      } else {
+        phase_dosage_gflags |= kfPgenGlobalDosagePresent;
+      }
+    }
+    uint32_t nonref_flags_storage = 1;
+    if (nonref_flags) {
+      const uint32_t variant_ctl_m1 = variant_ctl - 1;
+      const uintptr_t last_nonref_flags_word = nonref_flags[variant_ctl_m1];
+      if (!last_nonref_flags_word) {
+	for (uint32_t widx = 0; widx < variant_ctl_m1; ++widx) {
+	  if (nonref_flags[widx]) {
+	    nonref_flags_storage = 3;
+	    break;
+	  }
+	}
+	// todo: replace kBitsPerWord - MOD_NZ...
+      } else if (!((~last_nonref_flags_word) << (kBitsPerWord - MOD_NZ(variant_ct, kBitsPerWord)))) {
+	nonref_flags_storage = 2;
+	for (uint32_t widx = 0; widx < variant_ctl_m1; ++widx) {
+	  if (~nonref_flags[widx]) {
+	    nonref_flags_storage = 3;
+	    break;
+	  }
+	}
+      } else {
+	nonref_flags_storage = 3;
+      }
+      if (nonref_flags_storage != 3) {
+	nonref_flags = nullptr;
+	bigstack_end_reset(phasing_flags);
+      }
+    }
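+    // nonref_flags_storage: 1 = all flags clear, 2 = all flags set, 3 =
+    // mixed (bitvector must be stored explicitly).  in the first two cases
+    // the bitvector itself is no longer needed, so it's freed above.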
+    strcpy(outname_end, ".pgen");
+    uintptr_t spgw_alloc_cacheline_ct;
+    uint32_t max_vrec_len;
+    reterr = spgw_init_phase1(outname, variant_allele_idxs, nonref_flags, variant_ct, sample_ct, phase_dosage_gflags, nonref_flags_storage, &spgw, &spgw_alloc_cacheline_ct, &max_vrec_len);
+    if (reterr) {
+      goto vcf_to_pgen_ret_1;
+    }
+    unsigned char* spgw_alloc;
+    if (bigstack_alloc_uc(spgw_alloc_cacheline_ct * kCacheline, &spgw_alloc)) {
+      goto vcf_to_pgen_ret_NOMEM;
+    }
+    spgw_init_phase2(max_vrec_len, &spgw, spgw_alloc);
+
+    const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+    uintptr_t* genovec;
+    // if we weren't using bigstack_alloc, this would need to be sample_ctaw2
+    if (bigstack_alloc_ul(sample_ctl2, &genovec)) {
+      goto vcf_to_pgen_ret_NOMEM;
+    }
+    // nothing should go wrong if trailing word is garbage, but keep an eye on
+    // this
+    // fill_ulong_zero(sample_ctaw2 - sample_ctl2, &(genovec[sample_ctl2]));
+    const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+    uintptr_t* phasepresent = nullptr;
+    uintptr_t* phaseinfo = nullptr;
+    if (phase_dosage_gflags & kfPgenGlobalHardcallPhasePresent) {
+      if (bigstack_alloc_ul(sample_ctl, &phasepresent) ||
+	  bigstack_alloc_ul(sample_ctl, &phaseinfo)) {
+	goto vcf_to_pgen_ret_NOMEM;
+      }
+    }
+    uintptr_t* dosage_present = nullptr;
+    dosage_t* dosage_vals = nullptr;
+    if (phase_dosage_gflags & kfPgenGlobalDosagePresent) {
+      if (bigstack_alloc_ul(sample_ctl, &dosage_present) ||
+	  bigstack_alloc_dosage(sample_ct, &dosage_vals)) {
+	goto vcf_to_pgen_ret_NOMEM;
+      }
+      dosage_present[sample_ctl - 1] = 0;
+    }
+
+    char* writebuf;
+    if (bigstack_alloc_c(2 * max_allele_slen + max_qualfilterinfo_slen + kCompressStreamBlock + kMaxIdSlen + 32, &writebuf)) {
+      goto vcf_to_pgen_ret_NOMEM;
+    }
+    write_iter = writebuf;
+    char* writebuf_flush = &(writebuf[kCompressStreamBlock]);
+
+    if (hard_call_thresh == 0xffffffffU) {
+      hard_call_thresh = kDosageMid / 10;
+    }
+    const uint32_t hard_call_halfdist = kDosage4th - hard_call_thresh;
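+    // e.g. with the usual kDosageMid == 16384 (dosage 1.0) scale, the default
+    // hard_call_thresh is 16384 / 10 = 1638, so hard_call_halfdist ==
+    // 8192 - 1638 == 6554: a genotype is hardcalled from dosage iff the
+    // dosage is within ~0.1 of an integer.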
+
+    uint32_t vidx = 0;
+    for (line_idx = header_line_ct + 1; line_idx <= line_ct; ++line_idx) {
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	goto vcf_to_pgen_ret_READ_FAIL;
+      }
+      if ((unsigned char)(*loadbuf) < 32) {
+	continue;
+      }
+      // 1. check if we skip this variant.  chromosome filter, require_gt, and
+      //    (temporarily) multiple alt alleles can cause this.
+      char* chr_code_end = (char*)rawmemchr(loadbuf, '\t');
+      int32_t chr_code_base = get_chr_code_raw(loadbuf);
+      if (chr_code_base == -1) {
+	// skip hash table lookup if we know we aren't skipping the variant
+	if (variant_skip_ct) {
+	  *chr_code_end = '\0';
+	  const uint32_t chr_code = id_htable_find(loadbuf, cip->nonstd_names, cip->nonstd_id_htable, (uintptr_t)(chr_code_end - loadbuf), kChrHtableSize);
+	  if ((chr_code == 0xffffffffU) || (!IS_SET(cip->chr_mask, chr_code))) {
+	    continue;
+	  }
+	  *chr_code_end = '\t';
+	}
+      } else {
+	if (chr_code_base >= ((int32_t)kMaxContigs)) {
+	  chr_code_base = cip->xymt_codes[chr_code_base - kMaxContigs];
+	}
+	if ((chr_code_base < 0) || (!is_set(base_chr_present, chr_code_base))) {
+	  assert(variant_skip_ct);
+	  continue;
+	}
+      }
+      // chr_code_base is now a proper numeric chromosome index for
+      // non-contigs, and -1 if it's a contig name
+      char* pos_str = &(chr_code_end[1]);
+      char* pos_str_end = (char*)rawmemchr(pos_str, '\t');
+      loadbuf_iter = pos_str_end;
+      // copy ID, REF verbatim
+      for (uint32_t uii = 0; uii < 2; ++uii) {
+	loadbuf_iter = (char*)rawmemchr(&(loadbuf_iter[1]), '\t');
+      }
+
+      // ALT, QUAL, FILTER, INFO
+      char* filter_end = loadbuf_iter;
+      for (uint32_t uii = 0; uii < 3; ++uii) {
+	filter_end = (char*)rawmemchr(&(filter_end[1]), '\t');
+      }
+      char* info_end = (char*)rawmemchr(&(filter_end[1]), '\t');
+      char* format_start = &(info_end[1]);
+      const uint32_t gt_missing = memcmp(format_start, "GT", 2) || ((format_start[2] != ':') && (format_start[2] != '\t'));
+      if (require_gt && gt_missing) {
+	continue;
+      }
+      
+      // make sure POS starts with an integer, apply --output-chr setting
+      uint32_t cur_bp;
+      if (scan_uint_defcap(pos_str, &cur_bp)) {
+	sprintf(g_logbuf, "Error: Invalid POS on line %" PRIuPTR " of --vcf file.\n", line_idx);
+	goto vcf_to_pgen_ret_MALFORMED_INPUT_2N;
+      }
+
+      // temporary kludge
+      char* write_line_start = write_iter;
+
+      if (chr_code_base == -1) {
+	write_iter = memcpya(write_iter, loadbuf, (uintptr_t)(chr_code_end - loadbuf));
+      } else {
+	write_iter = chr_name_write(cip, chr_code_base, write_iter);
+      }
+      *write_iter++ = '\t';
+      write_iter = uint32toa(cur_bp, write_iter);
+
+      uint32_t alt_ct = 1;
+      char* copy_start = pos_str_end;
+      while (1) {
+	++loadbuf_iter;
+	unsigned char ucc;
+	do {
+	  ucc = (unsigned char)(*(++loadbuf_iter));
+	  // allow GATK 3.4 <*:DEL> symbolic allele
+	} while ((ucc > ',') || (ucc == '*'));
+
+	// temporary kludge	
+	if (ucc == ',') {
+	  alt_ct = 2;
+	  break;
+	}
+	
+	write_iter = memcpya(write_iter, copy_start, (uintptr_t)(loadbuf_iter - copy_start));
+	// unsafe to flush for now due to multiallelic kludge
+	/*
+	if (write_iter >= writebuf_flush) {
+	  if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), pvarfile)) {
+	    goto vcf_to_pgen_ret_WRITE_FAIL;
+	  }
+	  write_iter = writebuf;
+	}
+	*/
+	if (ucc != ',') {
+	  break;
+	}
+	copy_start = loadbuf_iter;
+	++alt_ct;
+      }
+
+      // temporary kludge
+      if (alt_ct > 1) {
+	write_iter = write_line_start;
+	continue;
+      }
+      if (write_iter >= writebuf_flush) {
+	if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), pvarfile)) {
+	  goto vcf_to_pgen_ret_WRITE_FAIL;
+	}
+	write_iter = writebuf;
+      }
+
+      write_iter = memcpya(write_iter, loadbuf_iter, (uintptr_t)((info_nonpr_present ? info_end : filter_end) - loadbuf_iter));
+      append_binary_eoln(&write_iter);
+
+      if (gt_missing) {
+	fill_all_bits(2 * sample_ct, genovec);
+	if (spgw_append_biallelic_genovec(genovec, &spgw)) {
+	  goto vcf_to_pgen_ret_WRITE_FAIL;
+	}
+      } else {
+	loadbuf_iter = (char*)rawmemchr(format_start, '\t');
+	uint32_t qual_field_skips[2];
+	int32_t qual_thresholds[2];
+	if (format_gq_or_dp_relevant) {
+	  qual_field_ct = vcf_qual_scan_init(vcf_min_gq, vcf_min_dp, format_start, loadbuf_iter, qual_field_skips, qual_thresholds);
+	}
+
+	uint32_t dosage_field_idx = 0;
+	dosage_t* dosage_vals_iter = dosage_vals;
+	if (dosage_flags && IS_SET(dosage_flags, vidx)) {
+	  dosage_field_idx = get_vcf_format_position(dosage_import_field, dosage_import_field_slen, format_start, loadbuf_iter);
+	}
+
+	// todo: multiallelic variants
+	++loadbuf_iter;
+	const uint32_t sample_ctl2_m1 = sample_ctl2 - 1;
+	uint32_t inner_loop_last = kBitsPerWordD2 - 1;
+	uint32_t widx = 0;
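+	// two paths below: if the pass-1 scan found no phased het for this
+	// variant, use the simpler genotype-only loop; otherwise run the
+	// phase-aware loop which also fills the phasepresent/phaseinfo
+	// halfwords.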
+	if (!IS_SET(phasing_flags, vidx)) {
+	  while (1) {
+	    if (widx >= sample_ctl2_m1) {
+	      if (widx > sample_ctl2_m1) {
+		break;
+	      }
+	      inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	    }
+	    uintptr_t genovec_word = 0;
+	    uint32_t dosage_present_hw = 0;
+	    for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+	      char* cur_gtext_end = strchr(loadbuf_iter, '\t');
+	      if (!cur_gtext_end) {
+		if ((sample_idx_lowbits != inner_loop_last) || (widx != sample_ctl2_m1)) {
+		  goto vcf_to_pgen_ret_MISSING_TOKENS;
+		}
+		cur_gtext_end = next_prespace(loadbuf_iter);
+	      }
+	      uintptr_t cur_geno;
+	      if (qual_field_ct) {
+		if (vcf_check_quals(qual_field_skips, qual_thresholds, loadbuf_iter, cur_gtext_end, qual_field_ct)) {
+		  goto vcf_to_pgen_force_missing_1;
+		}
+	      }
+	      {
+		const uint32_t is_haploid = (loadbuf_iter[1] != '/') && (loadbuf_iter[1] != '|');
+		cur_geno = (unsigned char)(*loadbuf_iter) - 48;
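+		// after the ASCII '0' subtraction: '0'/'1' map to 0/1 and '.'
+		// wraps to (uintptr_t)(-2); anything else is rejected by the
+		// checks below.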
+		if (cur_geno <= 1) {
+		  if (is_haploid) {
+		    cur_geno *= 2;
+		  } else {
+		    const char cc = loadbuf_iter[3];
+		    if (((cc != '/') && (cc != '|')) || (loadbuf_iter[4] == '.')) {
+		      // code triploids, etc. as missing
+		      // might want to subject handling of 0/0/. to
+		      // --vcf-half-call control
+		      const uintptr_t second_allele_idx = ((unsigned char)loadbuf_iter[2]) - 48;
+		      if (second_allele_idx <= 1) {
+			cur_geno += second_allele_idx;
+		      } else if (second_allele_idx != (uintptr_t)((intptr_t)(-2))) {
+			// not '.'
+			goto vcf_to_pgen_ret_INVALID_GT;
+		      } else if (vcf_half_call == kVcfHalfCallMissing) {
+			cur_geno = 3;
+		      } else if (vcf_half_call == kVcfHalfCallError) {
+			goto vcf_to_pgen_ret_HALF_CALL_ERROR;
+		      } else {
+			// kVcfHalfCallHaploid, kVcfHalfCallReference
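+			// (shift idiom: assumes kVcfHalfCallReference == 0 and
+			// kVcfHalfCallHaploid == 1, so e.g. '1/.' stays a het
+			// under 'reference' and doubles to hom-alt under
+			// 'haploid')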
+			cur_geno <<= vcf_half_call;
+		      }
+		    }
+		  }
+		} else if (cur_geno != (uintptr_t)((intptr_t)(-2))) {
+		  // not '.'
+		  goto vcf_to_pgen_ret_INVALID_GT;
+		} else if (vcf_half_call != kVcfHalfCallMissing) {
+		  const char second_allele_char = loadbuf_iter[2];
+		  if ((second_allele_char != '.') && ((loadbuf_iter[1] == '/') || (loadbuf_iter[1] == '|'))) {
+		    cur_geno = ((unsigned char)second_allele_char) - 48;
+		    if (cur_geno > 1) {
+		      goto vcf_to_pgen_ret_INVALID_GT;
+		    }
+		    if (vcf_half_call == kVcfHalfCallError) {
+		      goto vcf_to_pgen_ret_HALF_CALL_ERROR;
+		    }
+		    // kVcfHalfCallHaploid, kVcfHalfCallReference
+		    cur_geno <<= vcf_half_call;
+		  } else {
+		    cur_geno = 3;
+		  }
+		} else {
+		  cur_geno = 3;
+		}
+		if (dosage_field_idx) {
+		  uint32_t is_missing = 0;
+		  uint32_t dosage_int;
+		  if (!parse_vcf_dosage(loadbuf_iter, cur_gtext_end, dosage_field_idx, is_haploid, dosage_is_gp, import_dosage_certainty, &is_missing, &dosage_int)) {
+		    const uint32_t cur_halfdist = biallelic_dosage_halfdist(dosage_int);
+		    if ((cur_geno == 3) && (cur_halfdist >= hard_call_halfdist)) {
+		      // only overwrite GT if (i) it was missing, and (ii) the
+		      // dosage's distance from the nearest hardcall doesn't
+		      // exceed the --hard-call-threshold value.
+		      // (possible todo: warn or error out if dosage and GT are
+		      // inconsistent)
+		      cur_geno = (dosage_int + (kDosage4th * k1LU)) / kDosageMid;
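+		      // (dosage_int + kDosage4th) / kDosageMid rounds to the
+		      // nearest hardcall (0, 1, or 2).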
+		    }
+		    if (cur_halfdist < dosage_erase_halfdist) {
+		      dosage_present_hw |= 1U << sample_idx_lowbits;
+		      *dosage_vals_iter++ = dosage_int;
+		    }
+		  } else if (!is_missing) {
+		    goto vcf_to_pgen_ret_INVALID_DOSAGE;
+		  }
+		}
+	      }
+	      while (0) {
+	      vcf_to_pgen_force_missing_1:
+		cur_geno = 3;
+	      }
+	      genovec_word |= cur_geno << (2 * sample_idx_lowbits);
+	      loadbuf_iter = &(cur_gtext_end[1]);
+	    }
+	    genovec[widx] = genovec_word;
+	    if (dosage_field_idx) {
+	      ((halfword_t*)dosage_present)[widx] = dosage_present_hw;
+	    }
+	    ++widx;
+	  }
+	  if (!dosage_field_idx) {
+	    if (spgw_append_biallelic_genovec(genovec, &spgw)) {
+	      goto vcf_to_pgen_ret_WRITE_FAIL;
+	    }
+	  } else {
+	    assert(dosage_vals_iter != dosage_vals);
+	    if (spgw_append_biallelic_genovec_dosage16(genovec, dosage_present, dosage_vals, dosage_vals_iter - dosage_vals, &spgw)) {
+	      goto vcf_to_pgen_ret_WRITE_FAIL;
+	    }
+	  }
+	} else {
+	  while (1) {
+	    if (widx >= sample_ctl2_m1) {
+	      if (widx > sample_ctl2_m1) {
+		break;
+	      }
+	      inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	    }
+	    uintptr_t genovec_word = 0;
+	    uint32_t phasepresent_halfword = 0;
+	    uint32_t phaseinfo_halfword = 0;
+	    uint32_t dosage_present_hw = 0;
+	    for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+	      char* cur_gtext_end = strchr(loadbuf_iter, '\t');
+	      if (!cur_gtext_end) {
+		if ((sample_idx_lowbits != inner_loop_last) || (widx != sample_ctl2_m1)) {
+		  goto vcf_to_pgen_ret_MISSING_TOKENS;
+		}
+		cur_gtext_end = next_prespace(loadbuf_iter);
+	      }
+	      uintptr_t cur_geno;
+	      if (qual_field_ct) {
+		if (vcf_check_quals(qual_field_skips, qual_thresholds, loadbuf_iter, cur_gtext_end, qual_field_ct)) {
+		  goto vcf_to_pgen_force_missing_2;
+		}
+	      }
+	      {
+		const uint32_t is_phased = (loadbuf_iter[1] == '|');
+		const uint32_t is_haploid = (!is_phased) && (loadbuf_iter[1] != '/');
+		cur_geno = (unsigned char)(*loadbuf_iter) - 48;
+		if (cur_geno <= 1) {
+		  if (is_haploid) {
+		    cur_geno *= 2;
+		  } else {
+		    const char cc = loadbuf_iter[3];
+		    if (((cc != '/') && (cc != '|')) || (loadbuf_iter[4] == '.')) {
+		      // code triploids, etc. as missing
+		      // might want to subject handling of 0/0/. to
+		      // --vcf-half-call control
+		      const uintptr_t second_allele_idx = ((unsigned char)loadbuf_iter[2]) - 48;
+		      if (second_allele_idx <= 1) {
+			cur_geno += second_allele_idx;
+			// todo: check if this should be less branchy
+			if (is_phased && (cur_geno == 1)) {
+			  const uint32_t shifted_bit = 1U << sample_idx_lowbits;
+			  phasepresent_halfword |= shifted_bit;
+			  if (!second_allele_idx) {
+			    // 1|0
+			    phaseinfo_halfword |= shifted_bit;
+			  }
+			}
+		      } else if (second_allele_idx != (uintptr_t)((intptr_t)(-2))) {
+			// not '.'
+			goto vcf_to_pgen_ret_INVALID_GT;
+		      } else if (vcf_half_call == kVcfHalfCallMissing) {
+			cur_geno = 3;
+		      } else if (vcf_half_call == kVcfHalfCallError) {
+			goto vcf_to_pgen_ret_HALF_CALL_ERROR;
+		      } else {
+			// kVcfHalfCallHaploid, kVcfHalfCallReference
+			cur_geno <<= vcf_half_call;
+		      }
+		    }
+		  }
+		} else if (cur_geno != (uintptr_t)((intptr_t)(-2))) {
+		  // not '.'
+		  goto vcf_to_pgen_ret_INVALID_GT;
+		} else if (vcf_half_call != kVcfHalfCallMissing) {
+		  const char second_allele_char = loadbuf_iter[2];
+		  if ((second_allele_char != '.') && ((loadbuf_iter[1] == '/') || (loadbuf_iter[1] == '|'))) {
+		    cur_geno = ((unsigned char)second_allele_char) - 48;
+		    if (cur_geno > 1) {
+		      goto vcf_to_pgen_ret_INVALID_GT;
+		    }
+		    if (vcf_half_call == kVcfHalfCallError) {
+		      goto vcf_to_pgen_ret_HALF_CALL_ERROR;
+		    }
+		    // kVcfHalfCallHaploid, kVcfHalfCallReference
+		    cur_geno <<= vcf_half_call;
+		  } else {
+		    cur_geno = 3;
+		  }
+		} else {
+		  cur_geno = 3;
+		}
+		if (dosage_field_idx) {
+		  uint32_t is_missing = 0;
+		  uint32_t dosage_int;
+		  if (!parse_vcf_dosage(loadbuf_iter, cur_gtext_end, dosage_field_idx, is_haploid, dosage_is_gp, import_dosage_certainty, &is_missing, &dosage_int)) {
+		    const uint32_t cur_halfdist = biallelic_dosage_halfdist(dosage_int);
+		    if ((cur_geno == 3) && (cur_halfdist >= hard_call_halfdist)) {
+		      cur_geno = (dosage_int + (kDosage4th * k1LU)) / kDosageMid;
+		    }
+		    if (cur_halfdist < dosage_erase_halfdist) {
+		      dosage_present_hw |= 1U << sample_idx_lowbits;
+		      *dosage_vals_iter++ = dosage_int;
+		    }
+		  } else if (!is_missing) {
+		    goto vcf_to_pgen_ret_INVALID_DOSAGE;
+		  }
+		}
+	      }
+	      while (0) {
+	      vcf_to_pgen_force_missing_2:
+		cur_geno = 3;
+	      }
+	      genovec_word |= cur_geno << (2 * sample_idx_lowbits);
+	      loadbuf_iter = &(cur_gtext_end[1]);
+	    }
+	    genovec[widx] = genovec_word;
+	    ((halfword_t*)phasepresent)[widx] = (halfword_t)phasepresent_halfword;
+	    ((halfword_t*)phaseinfo)[widx] = (halfword_t)phaseinfo_halfword;
+	    if (dosage_field_idx) {
+	      ((halfword_t*)dosage_present)[widx] = dosage_present_hw;
+	    }
+	    ++widx;
+	  }
+	  if (!dosage_field_idx) {
+	    if (spgw_append_biallelic_genovec_hphase(genovec, phasepresent, phaseinfo, &spgw)) {
+	      goto vcf_to_pgen_ret_WRITE_FAIL;
+	    }
+	  } else {
+	    if (spgw_append_biallelic_genovec_hphase_dosage16(genovec, phasepresent, phaseinfo, dosage_present, dosage_vals, dosage_vals_iter - dosage_vals, &spgw)) {
+	      goto vcf_to_pgen_ret_WRITE_FAIL;
+	    }
+	  }
+	}
+      }
+      if (!(++vidx % 1000)) {
+	printf("\r--vcf: %uk variants converted.", vidx / 1000);
+	fflush(stdout);
+      }
+    }
+    if (write_iter != writebuf) {
+      if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), pvarfile)) {
+	goto vcf_to_pgen_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&pvarfile)) {
+      goto vcf_to_pgen_ret_WRITE_FAIL;
+    }
+    spgw_finish(&spgw);
+    putc_unlocked('\r', stdout);
+    write_iter = strcpya(g_logbuf, "--vcf: ");
+    const uint32_t outname_base_slen = (uintptr_t)(outname_end - outname);
+    write_iter = memcpya(write_iter, outname, outname_base_slen + 5);
+    write_iter = memcpyl3a(write_iter, " + ");
+    write_iter = memcpya(write_iter, outname, outname_base_slen);
+    write_iter = strcpya(write_iter, ".pvar");
+    if (!preexisting_psamname) {
+      write_iter = memcpyl3a(write_iter, " + ");
+      write_iter = memcpya(write_iter, outname, outname_base_slen);
+      write_iter = strcpya(write_iter, ".psam");
+    }
+    strcpy(write_iter, " written.\n");
+    wordwrapb(0);
+    logprintb();
+  }
+  while (0) {
+  vcf_to_pgen_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  vcf_to_pgen_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  vcf_to_pgen_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  vcf_to_pgen_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  vcf_to_pgen_ret_HALF_CALL_ERROR:
+    putc_unlocked('\n', stdout);
+    LOGERRPRINTF("Error: Line %" PRIuPTR " of --vcf file has a GT half-call.\n", line_idx);
+    if (!vcf_half_call_explicit_error) {
+      logerrprint("Use --vcf-half-call to specify how these should be processed.\n");
+    }
+    reterr = kPglRetMalformedInput;
+    break;
+  vcf_to_pgen_ret_INVALID_GT:
+    putc_unlocked('\n', stdout);
+    LOGERRPRINTF("Error: Line %" PRIuPTR " of --vcf file has an invalid GT field.\n", line_idx);
+    reterr = kPglRetMalformedInput;
+    break;
+  vcf_to_pgen_ret_MISSING_TOKENS:
+    putc_unlocked('\n', stdout);
+    LOGERRPRINTF("Error: Line %" PRIuPTR " of --vcf file has fewer tokens than expected.\n", line_idx);
+    reterr = kPglRetMalformedInput;
+    break;
+  vcf_to_pgen_ret_LONG_LINE_N:
+    putc_unlocked('\n', stdout);
+  vcf_to_pgen_ret_LONG_LINE:
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --vcf file is pathologically long.\n", line_idx);
+  vcf_to_pgen_ret_MALFORMED_INPUT_2N:
+    logprint("\n");
+    logerrprintb();
+  vcf_to_pgen_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  vcf_to_pgen_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+    reterr = kPglRetMalformedInput;
+    break;
+  vcf_to_pgen_ret_INVALID_DOSAGE:
+    putc_unlocked('\n', stdout);
+    LOGERRPRINTFWW("Error: Line %" PRIuPTR " of --vcf file has an invalid %s field.\n", line_idx, dosage_import_field);
+  vcf_to_pgen_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ vcf_to_pgen_ret_1:
+  if (spgw_cleanup(&spgw) && (!reterr)) {
+    reterr = kPglRetWriteFail;
+  }
+  gzclose_cond(gz_infile);
+  fclose_cond(pvarfile);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  return reterr;
+}
+
+pglerr_t ox_sample_to_psam(const char* samplename, const char* ox_missing_code, misc_flags_t misc_flags, char* outname, char* outname_end, uint32_t* sample_ct_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  gzFile gz_infile = nullptr;
+  FILE* psamfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  uintptr_t loadbuf_size = 0;
+  uintptr_t line_idx = 0;
+  {
+    uint32_t omp_slen = 2;
+    char output_missing_pheno[kMaxMissingPhenostrBlen];
+    if (misc_flags & kfMiscKeepAutoconv) {
+      // must use --output-missing-phenotype parameter, which we've validated
+      // to be consistent with --input-missing-phenotype
+      omp_slen = strlen(g_output_missing_pheno);
+      memcpy(output_missing_pheno, g_output_missing_pheno, omp_slen);
+    } else {
+      // use "NA" since that's always safe
+      memcpy(output_missing_pheno, "NA", 2);
+    }
+    const char* missing_catname = g_missing_catname;
+    uint32_t missing_catname_slen = strlen(missing_catname);
+    
+    gz_infile = gzopen(samplename, FOPEN_RB);
+    if (!gz_infile) {
+      const uint32_t slen = strlen(samplename);
+      if (((slen > 7) && (!memcmp(&(samplename[slen - 7]), ".sample", 7))) || ((slen > 10) && (!memcmp(&(samplename[slen - 10]), ".sample.gz", 10)))) {
+	LOGERRPRINTFWW(g_errstr_fopen, samplename);
+      } else {
+	LOGERRPRINTFWW("Error: Failed to open %s. (--sample expects a complete filename; did you forget '.sample' at the end?)\n", samplename);
+      }
+      goto ox_sample_to_psam_ret_OPEN_FAIL;
+    }
+    if (gzbuffer(gz_infile, 131072)) {
+      goto ox_sample_to_psam_ret_NOMEM;
+    }
+    uint32_t mc_ct = 0;
+    uintptr_t max_mc_blen = 1;
+    char* sorted_mc = nullptr;
+    if (!ox_missing_code) {
+      if (bigstack_alloc_c(3, &sorted_mc)) {
+	goto ox_sample_to_psam_ret_NOMEM;
+      }
+      memcpy(sorted_mc, "NA", 3);
+      mc_ct = 1;
+      max_mc_blen = 3;
+    } else {
+      // er, this should use something like
+      // count_and_measure_multistr_reverse_alloc()...
+      const char* missing_code_iter = ox_missing_code;
+      while (*missing_code_iter) {
+	while (*missing_code_iter == ',') {
+	  ++missing_code_iter;
+	}
+	if (!(*missing_code_iter)) {
+	  break;
+	}
+	++mc_ct;
+	const char* token_end = strchr(missing_code_iter, ',');
+	if (!token_end) {
+	  token_end = (const char*)rawmemchr(missing_code_iter, '\0');
+	}
+	uintptr_t token_slen = (uintptr_t)(token_end - missing_code_iter);
+	if (token_slen >= max_mc_blen) {
+	  max_mc_blen = token_slen + 1;
+	}
+	missing_code_iter = token_end;
+      }
+      if (mc_ct) {
+	if (bigstack_alloc_c(mc_ct * max_mc_blen, &sorted_mc)) {
+	  goto ox_sample_to_psam_ret_NOMEM;
+	}
+	missing_code_iter = ox_missing_code;
+	for (uintptr_t mc_idx = 0; mc_idx < mc_ct; ++mc_idx) {
+	  while (*missing_code_iter == ',') {
+	    ++missing_code_iter;
+	  }
+	  const char* token_end = strchr(missing_code_iter, ',');
+	  if (!token_end) {
+	    token_end = (const char*)rawmemchr(missing_code_iter, '\0');
+	  }
+	  uintptr_t token_slen = (uintptr_t)(token_end - missing_code_iter);
+	  memcpyx(&(sorted_mc[mc_idx * max_mc_blen]), missing_code_iter, token_slen, '\0');
+	  missing_code_iter = token_end;
+	}
+	qsort(sorted_mc, mc_ct, max_mc_blen, strcmp_casted);
+      }
+    }
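+    // e.g. --missing-code NA,-9 produces mc_ct == 2, max_mc_blen == 3, and
+    // sorted_mc == {"-9", "NA"} after the qsort.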
+    loadbuf_size = bigstack_left() / 4;
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else if (loadbuf_size <= kMaxMediumLine) {
+      goto ox_sample_to_psam_ret_NOMEM;
+    } else {
+      loadbuf_size = round_up_pow2(loadbuf_size, kCacheline);
+    }
+    char* loadbuf = (char*)bigstack_alloc_raw(loadbuf_size);
+    loadbuf[loadbuf_size - 1] = ' ';
+    char* loadbuf_first_token;
+    do {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto ox_sample_to_psam_ret_READ_FAIL;
+	}
+	logerrprint("Error: Empty .sample file.\n");
+	goto ox_sample_to_psam_ret_MALFORMED_INPUT;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto ox_sample_to_psam_ret_LONG_LINE;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+    } while (is_eoln_kns(*loadbuf_first_token));
+    char* token_end = token_endnn(loadbuf_first_token);
+    if ((((uintptr_t)(token_end - loadbuf_first_token)) != 4) || memcmp(loadbuf_first_token, "ID_1", 4)) {
+      goto ox_sample_to_psam_ret_INVALID_SAMPLE_HEADER_1;
+    }
+    // currently accepts tab as delimiter, though .sample spec technically
+    // prohibits that
+    char* loadbuf_iter = skip_initial_spaces(token_end);
+    uint32_t token_slen = strlen_se(loadbuf_iter);
+    if ((token_slen != 4) || memcmp(loadbuf_iter, "ID_2", 4)) {
+      goto ox_sample_to_psam_ret_INVALID_SAMPLE_HEADER_1;      
+    }
+    loadbuf_iter = skip_initial_spaces(&(loadbuf_iter[token_slen]));
+    token_slen = strlen_se(loadbuf_iter);
+    if ((token_slen != 7) || (!match_upper_counted(loadbuf_iter, "MISSING", 7))) {
+      goto ox_sample_to_psam_ret_INVALID_SAMPLE_HEADER_1;
+    }
+    loadbuf_iter = skip_initial_spaces(&(loadbuf_iter[token_slen]));
+
+    strcpy(outname_end, ".psam");
+    if (fopen_checked(outname, FOPEN_WB, &psamfile)) {
+      goto ox_sample_to_psam_ret_OPEN_FAIL;
+    }
+    // categorical phenotypes are lengthened by 1 character ('C' added in
+    // front), so this needs to be 50% larger than loadbuf to handle worst case
+    char* writebuf = (char*)bigstack_alloc_raw_rd(loadbuf_size + (loadbuf_size / 2));
+    char* write_iter = strcpya(writebuf, "#FID\tIID\tSEX");
+    
+    // 0 = not present, otherwise zero-based index (this is fine since first
+    //     column has to be FID)
+    uint32_t sex_col = 0;
+
+    uint32_t col_ct = 3;
+    
+    while (!is_eoln_kns(*loadbuf_iter)) {      
+      token_end = token_endnn(loadbuf_iter);
+      token_slen = (uintptr_t)(token_end - loadbuf_iter);
+      if ((token_slen == 3) && match_upper_counted(loadbuf_iter, "SEX", 3)) {
+	if (sex_col) {
+	  logerrprint("Error: Multiple sex columns in .sample file.\n");
+	  goto ox_sample_to_psam_ret_MALFORMED_INPUT;
+	}
+	sex_col = col_ct;
+      }
+      ++col_ct;
+      loadbuf_iter = skip_initial_spaces(token_end);
+    }
+
+    do {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto ox_sample_to_psam_ret_READ_FAIL;
+	}
+	logerrprint("Error: Only one nonempty line in .sample file.\n");
+	goto ox_sample_to_psam_ret_MALFORMED_INPUT;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto ox_sample_to_psam_ret_LONG_LINE;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+    } while (is_eoln_kns(*loadbuf_first_token));
+
+    token_end = token_endnn(loadbuf_first_token);
+    if ((((uintptr_t)(token_end - loadbuf_first_token)) != 1) || (*loadbuf_first_token != '0')) {
+      goto ox_sample_to_psam_ret_INVALID_SAMPLE_HEADER_2;
+    }
+    loadbuf_iter = skip_initial_spaces(token_end);
+    token_slen = strlen_se(loadbuf_iter);
+    if ((token_slen != 1) || (*loadbuf_iter != '0')) {
+      goto ox_sample_to_psam_ret_INVALID_SAMPLE_HEADER_2;
+    }
+    loadbuf_iter = skip_initial_spaces(&(loadbuf_iter[1]));
+    token_slen = strlen_se(loadbuf_iter);
+    if ((token_slen != 1) || (*loadbuf_iter != '0')) {
+      goto ox_sample_to_psam_ret_INVALID_SAMPLE_HEADER_2;
+    }
+    loadbuf_iter = &(loadbuf_iter[1]);
+
+    const uint32_t col_ctl = BITCT_TO_WORDCT(col_ct);
+    uintptr_t* col_is_categorical = (uintptr_t*)bigstack_alloc_raw_rd(col_ctl * sizeof(intptr_t));
+    uintptr_t* col_is_qt = (uintptr_t*)bigstack_alloc_raw_rd(col_ctl * sizeof(intptr_t));
+    fill_ulong_zero(col_ctl, col_is_categorical);
+    fill_ulong_zero(col_ctl, col_is_qt);
+    uint32_t at_least_one_binary_pheno = 0;
+    for (uint32_t col_idx = 3; col_idx < col_ct; ++col_idx) {
+      loadbuf_iter = skip_initial_spaces(loadbuf_iter);
+      unsigned char col_type_char = *loadbuf_iter;
+      if (is_eoln_kns(col_type_char)) {
+	logerrprint("Error: Second .sample header line has fewer tokens than the first.\n");
+	goto ox_sample_to_psam_ret_MALFORMED_INPUT;
+      }
+      if (loadbuf_iter[1] > ' ') {
+	goto ox_sample_to_psam_ret_INVALID_SAMPLE_HEADER_2;
+      }
+      if (col_idx == sex_col) {
+	if (col_type_char != 'D') {
+	  logerrprint("Error: .sample sex column is not of type 'D'.\n");
+	  goto ox_sample_to_psam_ret_MALFORMED_INPUT;
+	}
+      } else {
+	if ((col_type_char == 'C') || (col_type_char == 'P')) {
+	  set_bit(col_idx, col_is_qt);
+	} else if (col_type_char == 'D') {
+	  set_bit(col_idx, col_is_categorical);
+	} else {
+	  at_least_one_binary_pheno = 1;
+	  if (col_type_char != 'B') {
+	    sprintf(g_logbuf, "Error: Unrecognized .sample variable type '%c'.\n", col_type_char);
+	    goto ox_sample_to_psam_ret_MALFORMED_INPUT_2;
+	  }
+	}
+      }
+      ++loadbuf_iter;
+    }
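+    // .sample phenotype type codes, per the Oxford convention: 'D' =
+    // discrete/categorical (required for the sex column), 'C' = continuous
+    // covariate and 'P' = continuous phenotype (both treated as quantitative
+    // here), and 'B' = binary case/control.  e.g. a second header line of
+    // "0 0 0 D P B" tags the first phenotype column as categorical, the
+    // second as quantitative, and the third as binary.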
+    if (at_least_one_binary_pheno) {
+      // check for pathological case
+      if ((bsearch_str("0", sorted_mc, 1, max_mc_blen, mc_ct) != -1) || (bsearch_str("1", sorted_mc, 1, max_mc_blen, mc_ct) != -1)) {
+	logerrprint("Error: '0' and '1' are unacceptable missing case/control phenotype codes.\n");
+	goto ox_sample_to_psam_ret_INCONSISTENT_INPUT;
+      }
+    }
+    // to make --data and --data --make-pgen consistent, we do a two-pass load,
+    // checking for empty phenotypes in the first pass.
+    uintptr_t* col_keep = (uintptr_t*)bigstack_alloc_raw(round_up_pow2(col_ct, kCacheline * CHAR_BIT) / CHAR_BIT);
+    col_keep[0] = 7;
+    fill_ulong_zero(col_ctl - 1, &(col_keep[1]));
+    uint32_t uncertain_col_ct = col_ct - 3;
+    if (sex_col) {
+      // we don't care if sex column is all-NA
+      set_bit(sex_col, col_keep);
+      --uncertain_col_ct;
+    }
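+    // first pass: a phenotype column is kept iff at least one of its values
+    // is not a missing code; columns still "uncertain" at EOF are all-missing
+    // and are omitted from the .psam in the second pass.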
+    while (uncertain_col_ct) {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto ox_sample_to_psam_ret_LONG_LINE;
+      }
+      loadbuf_iter = skip_initial_spaces(loadbuf);
+      if (is_eoln_kns(*loadbuf_iter)) {
+	continue;
+      }
+
+      const uint32_t old_uncertain_col_ct = uncertain_col_ct;
+      uint32_t old_col_uidx = 0;
+      uint32_t col_uidx = 0;
+      for (uint32_t uncertain_col_idx = 0; uncertain_col_idx < old_uncertain_col_ct; ++uncertain_col_idx, ++col_uidx) {
+	next_unset_unsafe_ck(col_keep, &col_uidx);
+	loadbuf_iter = next_token_mult(loadbuf_iter, col_uidx - old_col_uidx);
+	if (!loadbuf_iter) {
+	  goto ox_sample_to_psam_ret_MISSING_TOKENS;
+	}
+	token_end = token_endnn(loadbuf_iter);
+	token_slen = (uintptr_t)(token_end - loadbuf_iter);
+        if (bsearch_str(loadbuf_iter, sorted_mc, token_slen, max_mc_blen, mc_ct) == -1) {
+	  set_bit(col_uidx, col_keep);
+	  --uncertain_col_ct;
+	}
+	loadbuf_iter = token_end;
+	old_col_uidx = col_uidx;
+      }
+    }
+
+    if (gzrewind(gz_infile)) {
+      goto ox_sample_to_psam_ret_READ_FAIL;
+    }
+    line_idx = 0;
+    
+    uint32_t sample_ct_p2 = 0;
+    while (1) {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto ox_sample_to_psam_ret_LONG_LINE;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      if (is_eoln_kns(*loadbuf_first_token)) {
+	continue;
+      }
+      ++sample_ct_p2;
+      if (sample_ct_p2 < 3) {
+	// header lines
+	if (sample_ct_p2 == 1) {
+	  loadbuf_iter = next_token_mult(loadbuf_first_token, 3);
+	  for (uint32_t col_idx = 3; col_idx < col_ct; ++col_idx) {
+	    token_end = token_endnn(loadbuf_iter);
+	    if (is_set(col_keep, col_idx) && (col_idx != sex_col)) {
+	      *write_iter++ = '\t';
+	      write_iter = memcpya(write_iter, loadbuf_iter, (uintptr_t)(token_end - loadbuf_iter));
+	    }
+	    loadbuf_iter = skip_initial_spaces(token_end);
+	  }
+	  append_binary_eoln(&write_iter);
+	  if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), psamfile)) {
+	    goto ox_sample_to_psam_ret_WRITE_FAIL;
+	  }
+	  write_iter = writebuf;
+	}
+	continue;
+      }
+      if (sample_ct_p2 == 0x80000001U) {
+	logerrprint("Error: " PROG_NAME_STR " does not support more than 2^31 - 2 samples.\n");
+	goto ox_sample_to_psam_ret_MALFORMED_INPUT;
+      }
+
+      // FID
+      token_end = token_endnn(loadbuf_first_token);
+      write_iter = memcpyax(writebuf, loadbuf_first_token, (uintptr_t)(token_end - loadbuf_first_token), '\t');
+
+      // IID
+      loadbuf_iter = skip_initial_spaces(token_end);
+      if (is_eoln_kns(*loadbuf_iter)) {
+	goto ox_sample_to_psam_ret_MISSING_TOKENS;
+      }
+      token_end = token_endnn(loadbuf_iter);
+      write_iter = memcpya(write_iter, loadbuf_iter, (uintptr_t)(token_end - loadbuf_iter));
+      
+      // MISSING
+      loadbuf_iter = skip_initial_spaces(token_end);
+      if (is_eoln_kns(*loadbuf_iter)) {
+	goto ox_sample_to_psam_ret_MISSING_TOKENS;
+      }
+      token_end = token_endnn(loadbuf_iter);
+
+      // flush now since backfilled sex is variable-length ("NA" vs. "1"/"2")
+      if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), psamfile)) {
+	goto ox_sample_to_psam_ret_WRITE_FAIL;
+      }
+      char* cur_writebuf_start = writebuf;
+      write_iter = memcpyl3a(writebuf, "\tNA");
+      for (uint32_t col_idx = 3; col_idx < col_ct; ++col_idx) {
+	loadbuf_iter = skip_initial_spaces(token_end);
+	if (is_eoln_kns(*loadbuf_iter)) {
+	  goto ox_sample_to_psam_ret_MISSING_TOKENS;
+	}
+	token_end = token_endnn(loadbuf_iter);
+	if (!is_set(col_keep, col_idx)) {
+	  continue;
+	}
+	token_slen = (uintptr_t)(token_end - loadbuf_iter);
+	const uint32_t is_missing = (bsearch_str(loadbuf_iter, sorted_mc, token_slen, max_mc_blen, mc_ct) != -1);
+	if (col_idx == sex_col) {
+	  if (!is_missing) {
+	    const unsigned char sex_ucc = *loadbuf_iter;
+	    if ((token_slen == 1) && ((((uint32_t)sex_ucc) - 49) < 2)) {
+	      ++cur_writebuf_start;
+	      cur_writebuf_start[0] = '\t';
+	      cur_writebuf_start[1] = sex_ucc;
+	    } else if ((token_slen != 1) || (sex_ucc != '0')) {
+	      // tolerate '0' as a sex-only missing code even when not
+	      // explicitly specified
+	      *token_end = '\0';
+	      sprintf(g_logbuf, "Error: Invalid sex code '%s' on line %" PRIuPTR ", column %u of .sample file ('0', '1', '2', or --missing-code value expected).\n", loadbuf_iter, line_idx, col_idx + 1);
+	      goto ox_sample_to_psam_ret_INCONSISTENT_INPUT_WW;
+	    }
+	  }
+	} else {
+	  *write_iter++ = '\t';
+	  if (is_set(col_is_categorical, col_idx)) {
+	    if (!is_missing) {
+	      *write_iter++ = 'C';
+	      // .sample files are relatively small, so let's go ahead and
+	      // (i) validate we have a positive integer < 2^31
+	      // (ii) convert e.g. 9000000, 9000000., 9.0e6 all to 9000000
+	      double dxx = 0.0;
+	      char* num_end = scanadv_double(loadbuf_iter, &dxx);
+	      int32_t ii = (int32_t)dxx;
+	      if ((num_end != token_end) || (ii <= 0) || (((double)ii) != dxx)) {
+		*token_end = '\0';
+		sprintf(g_logbuf, "Error: Invalid categorical phenotype '%s' on line %" PRIuPTR ", column %u of .sample file (positive integer < 2^31 or --missing-code value expected).\n", loadbuf_iter, line_idx, col_idx + 1);
+		goto ox_sample_to_psam_ret_INCONSISTENT_INPUT_WW;
+	      }
+	      write_iter = uint32toa(ii, write_iter);
+	    } else {
+	      write_iter = memcpya(write_iter, missing_catname, missing_catname_slen);
+	    }
+	  } else if (!is_missing) {
+	    if (is_set(col_is_qt, col_idx)) {
+	      double dxx = 0.0;
+	      char* num_end = scanadv_double(loadbuf_iter, &dxx);
+	      if (num_end != token_end) {
+		*token_end = '\0';
+		sprintf(g_logbuf, "Error: Invalid quantitative phenotype '%s' on line %" PRIuPTR ", column %u of .sample file (non-infinite number or --missing-code value expected).\n", loadbuf_iter, line_idx, col_idx + 1);
+		goto ox_sample_to_psam_ret_INCONSISTENT_INPUT_WW;
+	      }
+	      // dtoa_g() is used instead of copying the token verbatim, so
+	      // that --data and --data --make-pgen produce identical output
+	      // (could be made conditional on keep_autoconv?)
+	      write_iter = dtoa_g(dxx, write_iter);
+	    } else {
+	      const uint32_t cc_char_m48 = (uint32_t)((unsigned char)(*loadbuf_iter)) - 48;
+	      if ((token_slen == 1) && (cc_char_m48 < 2)) {
+		*write_iter++ = cc_char_m48 + '1';
+	      } else {
+		*token_end = '\0';
+		sprintf(g_logbuf, "Error: Invalid binary phenotype value '%s' on line %" PRIuPTR ", column %u of .sample file ('0', '1', or --missing-code value expected).\n", loadbuf_iter, line_idx, col_idx + 1);
+		goto ox_sample_to_psam_ret_INCONSISTENT_INPUT_WW;
+	      }
+	    }
+	  } else {
+	    write_iter = memcpya(write_iter, output_missing_pheno, omp_slen);
+	  }
+	}
+      }
+      append_binary_eoln(&write_iter);
+      if (fwrite_checked(cur_writebuf_start, (uintptr_t)(write_iter - cur_writebuf_start), psamfile)) {
+	goto ox_sample_to_psam_ret_WRITE_FAIL;
+      }
+    }
+    if (!gzeof(gz_infile)) {
+      goto ox_sample_to_psam_ret_READ_FAIL;
+    }
+
+    // no final writebuf flush needed, since we didn't use the usual
+    // manual-streaming strategy here
+    if (fclose_null(&psamfile)) {
+      goto ox_sample_to_psam_ret_WRITE_FAIL;
+    }
+    const uint32_t sample_ct = sample_ct_p2 - 2;
+    if ((!sample_ct) && (!(misc_flags & kfMiscAllowNoSamples))) {
+      logerrprint("Error: No samples in .sample file.\n");
+      goto ox_sample_to_psam_ret_INCONSISTENT_INPUT;
+    }
+    LOGPRINTFWW("%u sample%s imported from .sample file to %s .\n", sample_ct, (sample_ct == 1)? "" : "s", outname);
+    *sample_ct_ptr = sample_ct;
+  }
+  while (0) {
+  ox_sample_to_psam_ret_LONG_LINE:
+    if (loadbuf_size == kMaxLongLine) {
+      LOGERRPRINTF("Error: Line %" PRIuPTR " of .sample file is pathologically long.\n", line_idx);
+      reterr = kPglRetMalformedInput;
+      break;
+    }
+  ox_sample_to_psam_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  ox_sample_to_psam_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  ox_sample_to_psam_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  ox_sample_to_psam_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  ox_sample_to_psam_ret_MISSING_TOKENS:
+    LOGERRPRINTF("Error: Line %" PRIuPTR " of .sample file has fewer tokens than expected.\n", line_idx);
+    reterr = kPglRetMalformedInput;
+    break;
+  ox_sample_to_psam_ret_MALFORMED_INPUT_2:
+    logerrprintb();
+  ox_sample_to_psam_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  ox_sample_to_psam_ret_INVALID_SAMPLE_HEADER_1:
+    logerrprint("Error: Invalid first header line in .sample file.\n");
+    reterr = kPglRetMalformedInput;
+    break;
+  ox_sample_to_psam_ret_INVALID_SAMPLE_HEADER_2:
+    logerrprint("Error: Invalid second header line in .sample file.\n");
+    reterr = kPglRetMalformedInput;
+    break;
+  ox_sample_to_psam_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  ox_sample_to_psam_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+  gzclose_cond(gz_infile);
+  fclose_cond(psamfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+uint32_t bgen11_dosage_import_check(uint32_t dosage_int_sum_thresh, uint32_t import_dosage_certainty_int, uint32_t dosage_erase_halfdist, uint32_t dosage_int0, uint32_t dosage_int1, uint32_t dosage_int2) {
+  const uint32_t dosage_int_sum = dosage_int0 + dosage_int1 + dosage_int2;
+  if ((dosage_int_sum <= dosage_int_sum_thresh) && (dosage_int0 < import_dosage_certainty_int) && (dosage_int1 < import_dosage_certainty_int) && (dosage_int2 < import_dosage_certainty_int)) {
+    return 0;
+  }
+  // ties realistically happen, use banker's rounding
+  // 1/65536 -> 0/32768
+  // 3/65536 -> 2/32768
+  // 5/65536 -> 2/32768
+  const dosage_prod_t write_dosage_int_numer = ((dosage_prod_t)kDosageMid) * dosage_int1 + ((dosage_prod_t)kDosageMax) * dosage_int2;
+  uint32_t write_dosage_int;
+  if (dosage_int_sum == kDosageMax) {
+    // optimize common case
+    write_dosage_int = ((write_dosage_int_numer + kDosageMid) / kDosageMax) - ((write_dosage_int_numer % (2 * ((dosage_prod_t)kDosageMax))) == kDosageMid);
+  } else {
+    write_dosage_int = (write_dosage_int_numer + (dosage_int_sum / 2)) / dosage_int_sum;
+    write_dosage_int -= (2 * (write_dosage_int_numer - write_dosage_int * dosage_int_sum) == dosage_int_sum) * (write_dosage_int % 2);
+  }
+  const uint32_t halfdist = biallelic_dosage_halfdist(write_dosage_int);
+  return (halfdist < dosage_erase_halfdist);
+}
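+
+// Worked example of the banker's rounding above (illustration only;
+// kDosageMid == 16384 and kDosageMax == 32768, as assumed throughout this
+// file).  Take dosage_int_sum == kDosageMax and dosage_int2 == 0:
+//   dosage_int1 == 1: numer == 16384, (16384 + 16384) / 32768 == 1, and
+//     16384 % 65536 == kDosageMid, so 1 is subtracted: 1/65536 -> 0/32768.
+//   dosage_int1 == 3: numer == 49152, (49152 + 16384) / 32768 == 2, and
+//     49152 % 65536 != kDosageMid: 3/65536 -> 2/32768.
+//   dosage_int1 == 5: numer == 81920, (81920 + 16384) / 32768 == 3, and
+//     81920 % 65536 == kDosageMid, so 1 is subtracted: 5/65536 -> 2/32768.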
+
+void bgen11_dosage_import_update(uint32_t dosage_int_sum_thresh, uint32_t import_dosage_certainty_int, uint32_t hard_call_halfdist, uint32_t dosage_erase_halfdist, uint32_t sample_idx_lowbits, uint32_t dosage_int0, uint32_t dosage_int1, uint32_t dosage_int2, uintptr_t* genovec_word_ptr, uint32_t* dosage_present_hw_ptr, dosage_t** dosage_vals_iterp) {
+  const uint32_t dosage_int_sum = dosage_int0 + dosage_int1 + dosage_int2;
+  if (dosage_int_sum <= dosage_int_sum_thresh) {
+    if ((dosage_int0 < import_dosage_certainty_int) && (dosage_int1 < import_dosage_certainty_int) && (dosage_int2 < import_dosage_certainty_int)) {
+      *genovec_word_ptr |= (3 * k1LU) << (2 * sample_idx_lowbits);
+      return;
+    }
+  }
+  const dosage_prod_t write_dosage_int_numer = ((dosage_prod_t)kDosageMid) * dosage_int1 + ((dosage_prod_t)kDosageMax) * dosage_int2;
+  uint32_t write_dosage_int;
+  if (dosage_int_sum == kDosageMax) {
+    write_dosage_int = ((write_dosage_int_numer + kDosageMid) / kDosageMax) - ((write_dosage_int_numer % (2 * ((dosage_prod_t)kDosageMax))) == kDosageMid);
+  } else {
+    write_dosage_int = (write_dosage_int_numer + (dosage_int_sum / 2)) / dosage_int_sum;
+    write_dosage_int -= (2 * (write_dosage_int_numer - write_dosage_int * dosage_int_sum) == dosage_int_sum) * (write_dosage_int % 2);
+  }
+  const uint32_t halfdist = biallelic_dosage_halfdist(write_dosage_int);
+  if (halfdist < hard_call_halfdist) {
+    *genovec_word_ptr |= (3 * k1LU) << (2 * sample_idx_lowbits);
+  } else {
+    *genovec_word_ptr |= ((write_dosage_int + (kDosage4th * k1LU)) / kDosageMid) << (2 * sample_idx_lowbits);
+    if (halfdist >= dosage_erase_halfdist) {
+      return;
+    }
+  }
+  *dosage_present_hw_ptr |= 1U << sample_idx_lowbits;
+  **dosage_vals_iterp = write_dosage_int;
+  *dosage_vals_iterp += 1;
+}
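+
+// Note on the 2-bit packing above (illustration; this matches the usual
+// pgenlib genovec encoding, where 0/1/2 count copies of the second allele
+// and 3 is a missing hardcall):
+// (write_dosage_int + kDosage4th) / kDosageMid maps [0, 8192) -> 0,
+// [8192, 24576) -> 1, and [24576, 32768] -> 2, i.e. the nearest hardcall.
+// The two halfdist checks then decide whether that hardcall and/or the
+// exact dosage value are retained.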
+
+static_assert(sizeof(dosage_t) == 2, "ox_gen_to_pgen() needs to be updated.");
+pglerr_t ox_gen_to_pgen(const char* genname, const char* samplename, const char* ox_single_chr_str, const char* ox_missing_code, misc_flags_t misc_flags, oxford_import_t oxford_import_flags, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, double import_dosage_certainty, char* outname, char* outname_end, chr_info_t* cip) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  gzFile gz_infile = nullptr;
+  FILE* pvarfile = nullptr;
+  st_pgen_writer_t spgw;
+  pglerr_t reterr = kPglRetSuccess;
+  uintptr_t loadbuf_size = 0;
+  uintptr_t line_idx = 0;
+  spgw_preinit(&spgw);
+  {
+    uint32_t sample_ct;
+    reterr = ox_sample_to_psam(samplename, ox_missing_code, misc_flags, outname, outname_end, &sample_ct);
+    if (reterr) {
+      goto ox_gen_to_pgen_ret_1;
+    }
+    if (sample_ct > (kMaxLongLine / 6)) {
+      // impossible for a valid .gen line to fit in maximum-length load buffer
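+      // (each sample's probability triplet occupies at least 6 bytes, e.g.
+      // "0 0 0 ", so more than kMaxLongLine / 6 of them can't fit)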
+      logerrprint("Error: Too many samples for .gen file converter.\n");
+      reterr = kPglRetNotYetSupported;
+      goto ox_gen_to_pgen_ret_1;
+    }
+    // Two passes:
+    // 1. Count # of (non-chromosome-filtered) variants, write .pvar file, and
+    //    check if at least one non-hardcall needs to be saved.
+    // 2. Write the .pgen.
+    gz_infile = gzopen(genname, FOPEN_RB);
+    if (!gz_infile) {
+      const uint32_t slen = strlen(genname);
+      if (((slen > 4) && (!memcmp(&(genname[slen - 4]), ".gen", 4))) || ((slen > 7) && (!memcmp(&(genname[slen - 7]), ".gen.gz", 7)))) {
+	LOGERRPRINTFWW(g_errstr_fopen, genname);
+      } else {
+	LOGERRPRINTFWW("Error: Failed to open %s. (--gen expects a complete filename; did you forget '.gen' at the end?)\n", genname);
+      }
+      goto ox_gen_to_pgen_ret_OPEN_FAIL;
+    }
+    if (gzbuffer(gz_infile, 131072)) {
+      goto ox_gen_to_pgen_ret_NOMEM;
+    }    
+    loadbuf_size = bigstack_left() / 4;
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else if (loadbuf_size <= kMaxMediumLine) {
+      goto ox_gen_to_pgen_ret_NOMEM;
+    } else {
+      loadbuf_size = round_up_pow2(loadbuf_size, kCacheline);
+    }
+    char* loadbuf = (char*)bigstack_alloc_raw(loadbuf_size);
+    loadbuf[loadbuf_size - 1] = ' ';
+    const uint32_t allow_extra_chrs = (misc_flags / kfMiscAllowExtraChrs) & 1;
+    finalize_chrset(misc_flags, cip);
+
+    char* writebuf = (char*)bigstack_alloc_raw(kCompressStreamBlock + loadbuf_size);
+    char* writebuf_flush = &(writebuf[kCompressStreamBlock]);
+
+    const char* single_chr_str = nullptr;
+    uint32_t single_chr_slen = 0;
+    if (ox_single_chr_str) {
+      int32_t chr_code_raw = get_chr_code_raw(ox_single_chr_str);
+      if (chr_code_raw == -1) {
+	// command-line parser guarantees that allow_extra_chrs is true here
+	single_chr_str = ox_single_chr_str;
+        single_chr_slen = strlen(ox_single_chr_str);
+      } else {
+	uint32_t chr_code = chr_code_raw;
+	if (chr_code > cip->max_code) {
+	  if (chr_code < kMaxContigs) {
+	    logerrprint("Error: --oxford-single-chr chromosome code is not in the chromosome set.\n");
+	    goto ox_gen_to_pgen_ret_INVALID_CMDLINE;
+	  }
+	  chr_code = cip->xymt_codes[chr_code - kMaxContigs];
+	  if (((int32_t)chr_code) < 0) {
+	    logerrprint("Error: --oxford-single-chr chromosome code is not in the chromosome set.\n");
+	    goto ox_gen_to_pgen_ret_INVALID_CMDLINE;
+	  }
+	}
+	if (!is_set(cip->chr_mask, chr_code)) {
+	  // could permit this in --allow-no-vars case, but it's silly
+	  logerrprint("Error: --oxford-single-chr chromosome code is excluded by chromosome filter.\n");
+	  goto ox_gen_to_pgen_ret_INVALID_CMDLINE;
+	}
+	char* chr_buf = (char*)bigstack_alloc_raw(kCacheline);
+	char* chr_name_end = chr_name_write(cip, chr_code, chr_buf);
+	single_chr_str = chr_buf;
+        single_chr_slen = (uintptr_t)(chr_name_end - chr_buf);
+      }
+    }
+
+    strcpy(outname_end, ".pvar");
+    if (fopen_checked(outname, FOPEN_WB, &pvarfile)) {
+      goto ox_gen_to_pgen_ret_OPEN_FAIL;
+    }
+    char* write_iter = writebuf;
+    if (cip->chrset_source) {
+      append_chrset_line(cip, &write_iter);
+    }
+    write_iter = strcpya(write_iter, "#CHROM\tPOS\tID\tREF\tALT" EOLN_STR);
+
+    // Explicit 32768 instead of kDosageMax since this is driven by the BGEN
+    // 1.1 format, not plink2's dosage representation.
+    // Note that command-line parser multiplies import_dosage_certainty by
+    // (1 - kSmallEpsilon), and we want import_dosage_certainty_int to be 1
+    // when import_dosage_certainty is zero.
+    uint32_t import_dosage_certainty_int = 1 + (int32_t)(import_dosage_certainty * 32768);
+    const uint32_t dosage_int_sum_thresh = 3 * (import_dosage_certainty_int - 1);
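+    // Illustrative arithmetic: --import-dosage-certainty 0.9 yields
+    // 1 + (int32_t)(0.9 * (1 - kSmallEpsilon) * 32768) == 29492, so (modulo
+    // the dosage_int_sum_thresh escape hatch) a genotype is treated as
+    // missing unless some probability reaches 29492/32768, i.e. strictly
+    // exceeds 0.9.  A value of zero yields exactly 1, which every nonzero
+    // probability triplet passes.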
+    
+    const uint32_t dosage_erase_halfdist = kDosage4th - dosage_erase_thresh;
+    const uint32_t prov_ref_allele_second = !(oxford_import_flags & kfOxfordImportRefFirst);
+    uint32_t dosage_is_present = 0;
+    uint32_t variant_ct = 0;
+    uintptr_t variant_skip_ct = 0;
+    while (1) {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto ox_gen_to_pgen_ret_READ_FAIL;
+	}
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto ox_gen_to_pgen_ret_LONG_LINE_N;
+      }
+      char* chr_code_str = skip_initial_spaces(loadbuf);
+      if (is_eoln_kns(*chr_code_str)) {
+	continue;
+      }
+      char* chr_code_end = token_endnn(chr_code_str);
+      char* variant_id_str = skip_initial_spaces(chr_code_end);
+      if (is_eoln_kns(*variant_id_str)) {
+	goto ox_gen_to_pgen_ret_MISSING_TOKENS;
+      }
+      
+      if (!single_chr_str) {
+	int32_t cur_chr_code;
+        reterr = get_or_add_chr_code_destructive(".gen file", line_idx, allow_extra_chrs, chr_code_str, chr_code_end, cip, &cur_chr_code);
+	if (reterr) {
+	  if ((((uintptr_t)(chr_code_end - chr_code_str)) == 3) && (!memcmp(chr_code_str, "---", 3))) {
+	    logerrprint("(Did you forget --oxford-single-chr?)\n");
+	  }
+	  goto ox_gen_to_pgen_ret_1;
+	}
+	if (!is_set(cip->chr_mask, cur_chr_code)) {
+	  ++variant_skip_ct;
+	  continue;
+	}
+	write_iter = chr_name_write(cip, cur_chr_code, write_iter);
+      } else {
+	write_iter = memcpya(write_iter, single_chr_str, single_chr_slen);
+      }
+      *write_iter++ = '\t';
+      ++variant_ct;
+
+      char* variant_id_end = token_endnn(variant_id_str);
+      char* pos_str = skip_initial_spaces(variant_id_end);
+      if (is_eoln_kns(*pos_str)) {
+	goto ox_gen_to_pgen_ret_MISSING_TOKENS;
+      }
+      char* pos_end = token_endnn(pos_str);
+      uint32_t cur_bp;
+      if (scan_uint_defcap(pos_str, &cur_bp)) {
+	putc_unlocked('\n', stdout);
+	sprintf(g_logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, genname);
+	goto ox_gen_to_pgen_ret_MALFORMED_INPUT_WW;
+      }
+      
+      write_iter = uint32toa_x(cur_bp, '\t', write_iter);
+      const uint32_t variant_id_slen = (uintptr_t)(variant_id_end - variant_id_str);
+      if (variant_id_slen > kMaxIdSlen) {
+	putc_unlocked('\n', stdout);
+	logerrprint("Error: Variant names are limited to " MAX_ID_SLEN_STR " characters.\n");
+	goto ox_gen_to_pgen_ret_MALFORMED_INPUT;
+      }
+      write_iter = memcpyax(write_iter, variant_id_str, variant_id_slen, '\t');
+
+      // .gen specification does not define which column should be expected to
+      // be the reference allele, and which the alternate.  plink 1.9 assumed
+      // alt was usually first, but the reverse seems to be more common now.
+      // So:
+      //   If 'ref-first' or 'ref-second' was specified, we know what to do.
+      //   If not, we treat the second allele as the provisional reference, for
+      //     backward compatibility.
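+      // As parsed here, a .gen data line looks like (values illustrative
+      // only):
+      //   1 rs123 12345 A G  0 1 0  0.02 0.98 0  ...
+      // i.e. chromosome code, variant ID, bp coordinate, two alleles, and
+      // then one probability triplet per sample.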
+      char* first_allele_str = skip_initial_spaces(pos_end);
+      if (is_eoln_kns(*first_allele_str)) {
+	goto ox_gen_to_pgen_ret_MISSING_TOKENS;
+      }
+      char* first_allele_end = token_endnn(first_allele_str);
+      char* second_allele_str = skip_initial_spaces(first_allele_end);
+      if (is_eoln_kns(*second_allele_str)) {
+	goto ox_gen_to_pgen_ret_MISSING_TOKENS;
+      }
+      char* loadbuf_iter = token_endnn(second_allele_str);
+      if (!prov_ref_allele_second) {
+	write_iter = memcpyax(write_iter, first_allele_str, (uintptr_t)(first_allele_end - first_allele_str), '\t');
+	write_iter = memcpya(write_iter, second_allele_str, (uintptr_t)(loadbuf_iter - second_allele_str));
+      } else {
+	write_iter = memcpyax(write_iter, second_allele_str, (uintptr_t)(loadbuf_iter - second_allele_str), '\t');
+	write_iter = memcpya(write_iter, first_allele_str, (uintptr_t)(first_allele_end - first_allele_str));
+      }
+      append_binary_eoln(&write_iter);
+      if (write_iter >= writebuf_flush) {
+	if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), pvarfile)) {
+	  goto ox_gen_to_pgen_ret_WRITE_FAIL;
+	}
+	write_iter = writebuf;
+      }
+      
+      if (!dosage_is_present) {
+	for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	  loadbuf_iter = skip_initial_spaces(loadbuf_iter);
+	  const char cc = *loadbuf_iter;
+	  if (is_eoln_kns(cc)) {
+	    goto ox_gen_to_pgen_ret_MISSING_TOKENS;
+	  }
+	  char cc2 = loadbuf_iter[1];
+	  if ((cc2 == ' ') || (cc2 == '\t')) {
+	    // fast path for pure-hardcall triplets "1 0 0 ", "0 1 0 ",
+	    // "0 0 1 ", "0 0 0 "; others fall through to the general parser
+	    cc2 = loadbuf_iter[3];
+	    if (((cc2 == ' ') || (cc2 == '\t')) && ((unsigned char)(loadbuf_iter[5]) <= 32)) {
+	      const uint32_t uii = ((uint32_t)((unsigned char)cc)) - 48;
+	      const uint32_t ujj = ((uint32_t)((unsigned char)loadbuf_iter[2])) - 48;
+	      const uint32_t ukk = ((uint32_t)((unsigned char)loadbuf_iter[4])) - 48;
+	      if (((uii | ujj | ukk) < 2) && (uii + ujj + ukk < 2)) {
+		loadbuf_iter = &(loadbuf_iter[5]);
+		continue;
+	      }
+	    }
+	  }
+	  double prob_0alt;
+	  char* first_dosage_str_end = scanadv_double(loadbuf_iter, &prob_0alt);
+	  if (!first_dosage_str_end) {
+	    // triple-NA, etc. ok; treat as missing value
+	    loadbuf_iter = next_token_mult(loadbuf_iter, 2);
+	    if (!loadbuf_iter) {
+	      goto ox_gen_to_pgen_ret_MISSING_TOKENS;
+	    }
+	    loadbuf_iter = token_endnn(loadbuf_iter);
+	    continue;
+	  }
+	  if ((unsigned char)(*first_dosage_str_end) > ' ') {
+	    goto ox_gen_to_pgen_ret_INVALID_DOSAGE;
+	  }
+	  loadbuf_iter = skip_initial_spaces(first_dosage_str_end);
+	  if (is_eoln_kns(*loadbuf_iter)) {
+	    goto ox_gen_to_pgen_ret_MISSING_TOKENS;
+	  }
+	  double prob_1alt;
+	  loadbuf_iter = scanadv_double(loadbuf_iter, &prob_1alt);
+	  if ((!loadbuf_iter) || ((unsigned char)(*loadbuf_iter) > ' ')) {
+	    goto ox_gen_to_pgen_ret_INVALID_DOSAGE;
+	  }
+	  loadbuf_iter = skip_initial_spaces(loadbuf_iter);
+	  if (is_eoln_kns(*loadbuf_iter)) {
+	    goto ox_gen_to_pgen_ret_MISSING_TOKENS;
+	  }
+	  double prob_2alt;
+	  loadbuf_iter = scanadv_double(loadbuf_iter, &prob_2alt);
+	  if ((!loadbuf_iter) || ((unsigned char)(*loadbuf_iter) > ' ')) {
+	    goto ox_gen_to_pgen_ret_INVALID_DOSAGE;
+	  }
+	  // bugfix: the "multiply by 32768" half of the "multiply by 32768
+	  // and round" .gen -> .bgen conversion was originally omitted
+	  prob_0alt *= 32768;
+	  prob_1alt *= 32768;
+	  prob_2alt *= 32768;
+	  
+	  // now treat this identically to bgen-1.1
+	  // Compare with 65535.4999999999 instead of 65535.5 since 0.5 +
+	  // [first floating point number below 65535.5] may evaluate to 65536.
+	  if ((prob_0alt < 0.0) || (prob_0alt >= 65535.4999999999) || (prob_1alt < 0.0) || (prob_1alt >= 65535.4999999999) || (prob_2alt < 0.0) || (prob_2alt >= 65535.4999999999)) {
+	    goto ox_gen_to_pgen_ret_INVALID_DOSAGE;
+	  }
+	  const uint32_t dosage_int0 = (int32_t)(prob_0alt + 0.5);
+	  const uint32_t dosage_int1 = (int32_t)(prob_1alt + 0.5);
+	  const uint32_t dosage_int2 = (int32_t)(prob_2alt + 0.5);
+	  dosage_is_present = bgen11_dosage_import_check(dosage_int_sum_thresh, import_dosage_certainty_int, dosage_erase_halfdist, dosage_int0, dosage_int1, dosage_int2);
+	  if (dosage_is_present) {
+	    break;
+	  }
+	}
+      }
+      if (!(variant_ct % 1000)) {
+	printf("\r--data/--gen: %uk variants scanned.", variant_ct / 1000);
+	fflush(stdout);
+      }
+    }
+    putc_unlocked('\r', stdout);
+    if (write_iter != writebuf) {
+      if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), pvarfile)) {
+	goto ox_gen_to_pgen_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&pvarfile)) {
+      goto ox_gen_to_pgen_ret_WRITE_FAIL;
+    }
+    if (!variant_ct) {
+      if (!variant_skip_ct) {
+	// permit this in --allow-no-vars case?
+	logerrprint("Error: Empty .gen file.\n");
+	goto ox_gen_to_pgen_ret_INCONSISTENT_INPUT;
+      }
+      LOGERRPRINTFWW("Error: All %" PRIuPTR " variant%s in .gen file skipped due to chromosome filter.\n", variant_skip_ct, (variant_skip_ct == 1)? "" : "s");
+      goto ox_gen_to_pgen_ret_INCONSISTENT_INPUT;
+    }
+    LOGPRINTF("--data/--gen: %u variant%s scanned%s.\n", variant_ct, (variant_ct == 1)? "" : "s", dosage_is_present? "" : " (all hardcalls)");
+
+    // second pass
+    bigstack_reset(writebuf);
+    if (gzrewind(gz_infile)) {
+      goto ox_gen_to_pgen_ret_READ_FAIL;
+    }
+    strcpy(outname_end, ".pgen");
+    uintptr_t spgw_alloc_cacheline_ct;
+    uint32_t max_vrec_len;
+    reterr = spgw_init_phase1(outname, nullptr, nullptr, variant_ct, sample_ct, dosage_is_present? kfPgenGlobalDosagePresent : kfPgenGlobal0, (oxford_import_flags & (kfOxfordImportRefFirst | kfOxfordImportRefSecond))? 1 : 2, &spgw, &spgw_alloc_cacheline_ct, &max_vrec_len);
+    if (reterr) {
+      goto ox_gen_to_pgen_ret_1;
+    }
+    unsigned char* spgw_alloc;
+    if (bigstack_alloc_uc(spgw_alloc_cacheline_ct * kCacheline, &spgw_alloc)) {
+      goto ox_gen_to_pgen_ret_NOMEM;
+    }
+    spgw_init_phase2(max_vrec_len, &spgw, spgw_alloc);
+
+    const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+    const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+    uintptr_t* genovec;
+    uintptr_t* dosage_present;
+    // if we weren't using bigstack_alloc, this would need to be sample_ctaw2
+    if (bigstack_alloc_ul(sample_ctl2, &genovec) ||
+	bigstack_alloc_ul(sample_ctl, &dosage_present)) {
+      goto ox_gen_to_pgen_ret_NOMEM;
+    }
+    dosage_t* dosage_vals = nullptr;
+    if (dosage_is_present) {
+      if (bigstack_alloc_dosage(sample_ct, &dosage_vals)) {
+	goto ox_gen_to_pgen_ret_NOMEM;
+      }
+    }
+    if (hard_call_thresh == 0xffffffffU) {
+      hard_call_thresh = kDosageMid / 10;
+    }
+    const uint32_t hard_call_halfdist = kDosage4th - hard_call_thresh;
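+    // e.g. with the 0xffffffffU default above, hard_call_thresh becomes
+    // kDosageMid / 10 == 1638 (~0.1 in genotype units), so a dosage is
+    // saved as a hardcall only when it's within ~0.1 of 0, 1, or 2, and is
+    // otherwise stored as a missing hardcall with an explicit dosage.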
+    const uint32_t sample_ctl2_m1 = sample_ctl2 - 1;
+    const uintptr_t line_ct = line_idx - 1;
+    uint32_t vidx = 0;
+    for (line_idx = 1; line_idx <= line_ct; ++line_idx) {
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	goto ox_gen_to_pgen_ret_READ_FAIL;
+      }
+      char* chr_code_str = skip_initial_spaces(loadbuf);
+      if (is_eoln_kns(*chr_code_str)) {
+	continue;
+      }
+      char* chr_code_end = token_endnn(chr_code_str);
+      char* loadbuf_iter = skip_initial_spaces(chr_code_end);
+      if (variant_skip_ct) {
+	*chr_code_end = '\0';
+	const uint32_t chr_code = get_chr_code(chr_code_str, cip, (uintptr_t)(chr_code_end - chr_code_str));
+	if (!is_set(cip->chr_mask, chr_code)) {
+	  continue;
+	}
+      }
+      loadbuf_iter = next_token_mult(loadbuf_iter, 4);
+      uint32_t inner_loop_last = kBitsPerWordD2 - 1;
+      uint32_t widx = 0;
+      dosage_t* dosage_vals_iter = dosage_vals;
+      while (1) {
+	if (widx >= sample_ctl2_m1) {
+	  if (widx > sample_ctl2_m1) {
+	    break;
+	  }
+	  inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	}
+	uintptr_t genovec_word = 0;
+	uint32_t dosage_present_hw = 0;
+	for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+	  loadbuf_iter = skip_initial_spaces(loadbuf_iter);
+	  const char cc = *loadbuf_iter;
+	  if (is_eoln_kns(cc)) {
+	    goto ox_gen_to_pgen_ret_MISSING_TOKENS;
+	  }
+	  char cc2 = loadbuf_iter[1];
+	  if ((cc2 == ' ') || (cc2 == '\t')) {
+	    cc2 = loadbuf_iter[3];
+	    if (((cc2 == ' ') || (cc2 == '\t')) && ((unsigned char)(loadbuf_iter[5]) <= 32)) {
+	      const uint32_t uii = ((uint32_t)((unsigned char)cc)) - 48;
+	      const uint32_t ujj = ((uint32_t)((unsigned char)loadbuf_iter[2])) - 48;
+	      const uint32_t ukk = ((uint32_t)((unsigned char)loadbuf_iter[4])) - 48;
+	      const uint32_t all_or = uii | ujj | ukk;
+	      if (all_or < 2) {
+		if (!all_or) {
+	          genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+		  loadbuf_iter = &(loadbuf_iter[5]);
+		  continue;
+		} else if (uii + ujj + ukk == 1) {
+		  uintptr_t cur_geno = ukk * 2 + ujj;
+		  genovec_word |= cur_geno << (2 * sample_idx_lowbits);
+		  loadbuf_iter = &(loadbuf_iter[5]);
+		  continue;
+		}
+	      }
+	    }
+	  }
+	  double prob_0alt;
+	  char* first_dosage_str_end = scanadv_double(loadbuf_iter, &prob_0alt);
+	  if (!first_dosage_str_end) {
+	    // ignore next two tokens if first token in triplet is not numeric,
+	    // since we treat this as missing regardless
+	    loadbuf_iter = next_token_mult(loadbuf_iter, 2);
+	    if (!loadbuf_iter) {
+	      goto ox_gen_to_pgen_ret_MISSING_TOKENS;
+	    }
+	    genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+	    loadbuf_iter = token_endnn(loadbuf_iter);
+	    continue;
+	  }
+	  if ((unsigned char)(*first_dosage_str_end) > ' ') {
+	    goto ox_gen_to_pgen_ret_INVALID_DOSAGE;
+	  }
+	  loadbuf_iter = skip_initial_spaces(first_dosage_str_end);
+	  if (is_eoln_kns(*loadbuf_iter)) {
+	    goto ox_gen_to_pgen_ret_MISSING_TOKENS;
+	  }
+	  double prob_1alt;
+	  loadbuf_iter = scanadv_double(loadbuf_iter, &prob_1alt);
+	  if ((!loadbuf_iter) || ((unsigned char)(*loadbuf_iter) > ' ')) {
+	    goto ox_gen_to_pgen_ret_INVALID_DOSAGE;
+	  }
+	  loadbuf_iter = skip_initial_spaces(loadbuf_iter);
+	  if (is_eoln_kns(*loadbuf_iter)) {
+	    goto ox_gen_to_pgen_ret_MISSING_TOKENS;
+	  }
+	  double prob_2alt;
+	  loadbuf_iter = scanadv_double(loadbuf_iter, &prob_2alt);
+	  if ((!loadbuf_iter) || ((unsigned char)(*loadbuf_iter) > ' ')) {
+	    goto ox_gen_to_pgen_ret_INVALID_DOSAGE;
+	  }
+	  // bugfix: same dosage-scaling correction as in the first pass
+	  prob_0alt *= 32768;
+	  prob_1alt *= 32768;
+	  prob_2alt *= 32768;
+	  
+	  if ((prob_0alt < 0.0) || (prob_0alt >= 65535.4999999999) || (prob_1alt < 0.0) || (prob_1alt >= 65535.4999999999) || (prob_2alt < 0.0) || (prob_2alt >= 65535.4999999999)) {
+	    goto ox_gen_to_pgen_ret_INVALID_DOSAGE;
+	  }
+	  const uint32_t dosage_int0 = (int32_t)(prob_0alt + 0.5);
+	  const uint32_t dosage_int1 = (int32_t)(prob_1alt + 0.5);
+	  const uint32_t dosage_int2 = (int32_t)(prob_2alt + 0.5);
+	  bgen11_dosage_import_update(dosage_int_sum_thresh, import_dosage_certainty_int, hard_call_halfdist, dosage_erase_halfdist, sample_idx_lowbits, dosage_int0, dosage_int1, dosage_int2, &genovec_word, &dosage_present_hw, &dosage_vals_iter);
+	}
+	genovec[widx] = genovec_word;
+	((halfword_t*)dosage_present)[widx] = (halfword_t)dosage_present_hw;
+	++widx;
+      }
+      if (prov_ref_allele_second) {
+	genovec_invert_unsafe(sample_ct, genovec);
+	zero_trailing_quaters(sample_ct, genovec);
+      }
+      if (dosage_vals_iter != dosage_vals) {
+	const uint32_t dosage_ct = (uintptr_t)(dosage_vals_iter - dosage_vals);
+	if (prov_ref_allele_second) {
+	  biallelic_dosage16_invert(dosage_ct, dosage_vals);
+	}
+	if (spgw_append_biallelic_genovec_dosage16(genovec, dosage_present, dosage_vals, dosage_ct, &spgw)) {
+	  goto ox_gen_to_pgen_ret_WRITE_FAIL;
+	}
+      } else {
+	if (spgw_append_biallelic_genovec(genovec, &spgw)) {
+	  goto ox_gen_to_pgen_ret_WRITE_FAIL;
+	}
+      }
+      ++vidx;
+      if (!(vidx % 1000)) {
+	printf("\r--data/--gen: %uk variants converted.", vidx / 1000);
+	fflush(stdout);
+      }
+    }
+    spgw_finish(&spgw);
+    putc_unlocked('\r', stdout);
+    write_iter = strcpya(g_logbuf, "--data/--gen: ");
+    const uint32_t outname_base_slen = (uintptr_t)(outname_end - outname);
+    write_iter = memcpya(write_iter, outname, outname_base_slen + 5);
+    write_iter = memcpyl3a(write_iter, " + ");
+    write_iter = memcpya(write_iter, outname, outname_base_slen);
+    write_iter = strcpya(write_iter, ".pvar");
+    strcpy(write_iter, " written.\n");
+    wordwrapb(0);
+    logprintb();
+  }
+  while (0) {
+  ox_gen_to_pgen_ret_LONG_LINE_N:
+    putc_unlocked('\n', stdout);
+    if (loadbuf_size == kMaxLongLine) {
+      LOGERRPRINTF("Error: Line %" PRIuPTR " of .gen file is pathologically long.\n", line_idx);
+      reterr = kPglRetMalformedInput;
+      break;
+    }
+  ox_gen_to_pgen_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  ox_gen_to_pgen_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  ox_gen_to_pgen_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  ox_gen_to_pgen_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  ox_gen_to_pgen_ret_INVALID_CMDLINE:
+    reterr = kPglRetInvalidCmdline;
+    break;
+  ox_gen_to_pgen_ret_INVALID_DOSAGE:
+    putc_unlocked('\n', stdout);
+    LOGERRPRINTF("Error: Line %" PRIuPTR " of .gen file has an invalid dosage value.\n", line_idx);
+    reterr = kPglRetMalformedInput;
+    break;
+  ox_gen_to_pgen_ret_MISSING_TOKENS:
+    putc_unlocked('\n', stdout);
+    LOGERRPRINTF("Error: Line %" PRIuPTR " of .gen file has fewer tokens than expected.\n", line_idx);
+    reterr = kPglRetMalformedInput;
+    break;
+  ox_gen_to_pgen_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  ox_gen_to_pgen_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  ox_gen_to_pgen_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ ox_gen_to_pgen_ret_1:
+  if (spgw_cleanup(&spgw) && (!reterr)) {
+    reterr = kPglRetWriteFail;
+  }
+  gzclose_cond(gz_infile);
+  fclose_cond(pvarfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+// a few multithread globals
+static uint16_t** g_bgen_geno_bufs = nullptr; // per-thread
+
+
+// per-variant (could make compressed_geno_starts per-thread)
+static unsigned char** g_compressed_geno_starts[2] = {nullptr, nullptr};
+static uintptr_t* g_write_genovecs[2] = {nullptr, nullptr};
+static uint32_t* g_write_dosage_cts[2] = {nullptr, nullptr};
+static uintptr_t* g_write_dosage_presents[2] = {nullptr, nullptr};
+static dosage_t* g_write_dosage_val_bufs[2] = {nullptr, nullptr};
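+
+// These buffers are double-buffered: while the worker threads process one
+// parity's block, the next block is (presumably) staged into the other
+// parity's buffers by the I/O thread; each worker flips parity after
+// THREAD_BLOCK_FINISH().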
+
+static uint32_t g_sample_ct = 0;
+static uint32_t g_calc_thread_ct = 0;
+static uint32_t g_cur_block_write_ct = 0;
+static uint32_t g_hard_call_halfdist = 0;
+static uint32_t g_dosage_erase_halfdist = 0;
+static uint32_t g_import_dosage_certainty_int = 0;
+static uint32_t g_compression_mode = 0;
+static uint32_t g_dosage_is_present = 0;
+static uint32_t g_prov_ref_allele_second = 0;
+static pglerr_t g_error_ret = kPglRetSuccess;
+
+// static uint32_t* g_error_vidxs = nullptr; // per-thread
+
+THREAD_FUNC_DECL bgen11_dosage_scan_thread(void* arg) {
+  // This bails as soon as a single non-hardcall is detected.  It is still
+  // multithreaded due to the low speed of the uncompress() calls, the
+  // practical value of handling the all-hardcall case efficiently, and
+  // reduced code complexity (locally more complex, but globally cleaner due
+  // to the overlap with bgen11_geno_to_pgen_thread()).
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uint32_t sample_ct = g_sample_ct;
+  uint16_t* bgen_geno_buf = g_bgen_geno_bufs[tidx];
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  // hard_call_halfdist irrelevant here
+  const uint32_t dosage_erase_halfdist = g_dosage_erase_halfdist;
+  const uint32_t import_dosage_certainty_int = g_import_dosage_certainty_int;
+  const uint32_t dosage_int_sum_thresh = 3 * (import_dosage_certainty_int - 1);
+  const uint32_t compression_mode = g_compression_mode;
+  // uint32_t vidx_base = 0;
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uintptr_t cur_block_write_ct = g_cur_block_write_ct;
+    const uint32_t vidx_end = ((tidx + 1) * cur_block_write_ct) / calc_thread_ct;
+    uint32_t vidx = (tidx * cur_block_write_ct) / calc_thread_ct;
+    unsigned char* compressed_geno_iter = g_compressed_geno_starts[parity][vidx];
+    uint16_t* bgen_probs = bgen_geno_buf;
+    for (; vidx < vidx_end; ++vidx) {
+      if (compression_mode) {
+	uint32_t compressed_block_byte_ct;
+	memcpy(&compressed_block_byte_ct, compressed_geno_iter, 4);
+	compressed_geno_iter = &(compressed_geno_iter[4]);
+	uLongf zlib_ulongf = 6 * sample_ct;
+	if (uncompress((Bytef*)bgen_probs, &zlib_ulongf, (Bytef*)compressed_geno_iter, compressed_block_byte_ct) != Z_OK) {
+	  break;
+	}
+	compressed_geno_iter = &(compressed_geno_iter[compressed_block_byte_ct]);
+      } else {
+	bgen_probs = (uint16_t*)compressed_geno_iter;
+	compressed_geno_iter = &(compressed_geno_iter[6 * sample_ct]);
+      }
+      const uint16_t* bgen_probs_iter = bgen_probs;
+      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	const uint32_t dosage_int0 = (uint32_t)(*bgen_probs_iter++);
+	const uint32_t dosage_int1 = (uint32_t)(*bgen_probs_iter++);
+	const uint32_t dosage_int2 = (uint32_t)(*bgen_probs_iter++);
+	const uint32_t dosage_int_sum = dosage_int0 + dosage_int1 + dosage_int2;
+	if ((dosage_int_sum > dosage_int_sum_thresh) || (dosage_int0 >= import_dosage_certainty_int) || (dosage_int1 >= import_dosage_certainty_int) || (dosage_int2 >= import_dosage_certainty_int)) {
+	  // ties realistically happen, use banker's rounding
+	  // 1/65536 -> 0/32768
+	  // 3/65536 -> 2/32768
+	  // 5/65536 -> 2/32768
+	  const dosage_prod_t write_dosage_int_numer = ((dosage_prod_t)kDosageMid) * dosage_int1 + ((dosage_prod_t)kDosageMax) * dosage_int2;
+	  uint32_t write_dosage_int;
+	  if (dosage_int_sum == kDosageMax) {
+	    // optimize common case
+	    write_dosage_int = ((write_dosage_int_numer + kDosageMid) / kDosageMax) - ((write_dosage_int_numer % (2 * ((dosage_prod_t)kDosageMax))) == kDosageMid);
+	  } else {
+	    write_dosage_int = (write_dosage_int_numer + (dosage_int_sum / 2)) / dosage_int_sum;
+	    write_dosage_int -= (2 * (write_dosage_int_numer - write_dosage_int * dosage_int_sum) == dosage_int_sum) * (write_dosage_int % 2);
+	  }
+	  const uint32_t halfdist = biallelic_dosage_halfdist(write_dosage_int);
+	  if (halfdist < dosage_erase_halfdist) {
+	    goto bgen11_dosage_scan_thread_dosage_found;
+	  }
+	}
+      }
+    }
+    if (vidx != vidx_end) {
+      // g_error_vidxs[tidx] = vidx + vidx_base;
+      g_error_ret = kPglRetMalformedInput;
+    }
+    while (0) {
+    bgen11_dosage_scan_thread_dosage_found:
+      g_dosage_is_present = 1;
+    }
+    // vidx_base += cur_block_write_ct;
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+static_assert(sizeof(dosage_t) == 2, "bgen11_geno_to_pgen_thread() needs to be updated.");
+THREAD_FUNC_DECL bgen11_geno_to_pgen_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uintptr_t sample_ct = g_sample_ct;
+  uint16_t* bgen_geno_buf = g_bgen_geno_bufs[tidx];
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  const uint32_t hard_call_halfdist = g_hard_call_halfdist;
+  const uint32_t dosage_erase_halfdist = g_dosage_erase_halfdist;
+  const uint32_t import_dosage_certainty_int = g_import_dosage_certainty_int;
+  const uint32_t dosage_int_sum_thresh = 3 * (import_dosage_certainty_int - 1);
+  const uint32_t compression_mode = g_compression_mode;
+  const uint32_t prov_ref_allele_second = g_prov_ref_allele_second;
+  const uintptr_t sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
+  const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
+  const uintptr_t sample_ctaw = BITCT_TO_ALIGNED_WORDCT(sample_ct);
+  // uint32_t vidx_base = 0;
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uintptr_t cur_block_write_ct = g_cur_block_write_ct;
+    uint32_t vidx = (tidx * cur_block_write_ct) / calc_thread_ct;
+    const uint32_t vidx_end = ((tidx + 1) * cur_block_write_ct) / calc_thread_ct;
+    unsigned char* compressed_geno_iter = g_compressed_geno_starts[parity][vidx];
+    uintptr_t* write_genovec_iter = &(g_write_genovecs[parity][vidx * sample_ctaw2]);
+    uint32_t* write_dosage_ct_iter = &(g_write_dosage_cts[parity][vidx]);
+    uintptr_t* write_dosage_present_iter = &(g_write_dosage_presents[parity][vidx * sample_ctaw]);
+    dosage_t* write_dosage_vals_iter = &(g_write_dosage_val_bufs[parity][vidx * sample_ct]);
+    uint16_t* bgen_probs = bgen_geno_buf;
+    for (; vidx < vidx_end; ++vidx) {
+      if (compression_mode) {
+	uint32_t compressed_block_byte_ct;
+	memcpy(&compressed_block_byte_ct, compressed_geno_iter, 4);
+	compressed_geno_iter = &(compressed_geno_iter[4]);
+	uLongf zlib_ulongf = 6 * sample_ct;
+	if (uncompress((Bytef*)bgen_probs, &zlib_ulongf, (Bytef*)compressed_geno_iter, compressed_block_byte_ct) != Z_OK) {
+	  break;
+	}
+	compressed_geno_iter = &(compressed_geno_iter[compressed_block_byte_ct]);
+      } else {
+	bgen_probs = (uint16_t*)compressed_geno_iter;
+	compressed_geno_iter = &(compressed_geno_iter[6 * sample_ct]);
+      }
+      const uint16_t* bgen_probs_iter = bgen_probs;
+      dosage_t* cur_dosage_vals_iter = write_dosage_vals_iter;
+      uint32_t inner_loop_last = kBitsPerWordD2 - 1;
+      uint32_t widx = 0;
+      while (1) {
+	if (widx >= sample_ctl2_m1) {
+	  if (widx > sample_ctl2_m1) {
+	    break;
+	  }
+	  inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	}
+	uintptr_t genovec_word = 0;
+	uint32_t dosage_present_hw = 0;
+	for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+	  const uint32_t dosage_int0 = (uint32_t)(*bgen_probs_iter++);
+	  const uint32_t dosage_int1 = (uint32_t)(*bgen_probs_iter++);
+	  const uint32_t dosage_int2 = (uint32_t)(*bgen_probs_iter++);
+	  const uint32_t dosage_int_sum = dosage_int0 + dosage_int1 + dosage_int2;
+	  if (dosage_int_sum <= dosage_int_sum_thresh) {
+	    if ((dosage_int0 < import_dosage_certainty_int) && (dosage_int1 < import_dosage_certainty_int) && (dosage_int2 < import_dosage_certainty_int)) {
+	      genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+	      continue;
+	    }
+	  }
+	  const dosage_prod_t write_dosage_int_numer = ((dosage_prod_t)kDosageMid) * dosage_int1 + ((dosage_prod_t)kDosageMax) * dosage_int2;
+	  uint32_t write_dosage_int;
+	  if (dosage_int_sum == kDosageMax) {
+	    write_dosage_int = ((write_dosage_int_numer + kDosageMid) / kDosageMax) - ((write_dosage_int_numer % (2 * ((dosage_prod_t)kDosageMax))) == kDosageMid);
+	  } else {
+	    write_dosage_int = (write_dosage_int_numer + (dosage_int_sum / 2)) / dosage_int_sum;
+	    write_dosage_int -= (2 * (write_dosage_int_numer - write_dosage_int * dosage_int_sum) == dosage_int_sum) * (write_dosage_int % 2);
+	  }
+	  const uint32_t halfdist = biallelic_dosage_halfdist(write_dosage_int);
+	  if (halfdist < hard_call_halfdist) {
+	    genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+	  } else {
+	    genovec_word |= ((write_dosage_int + (kDosage4th * k1LU)) / kDosageMid) << (2 * sample_idx_lowbits);
+	    if (halfdist >= dosage_erase_halfdist) {
+	      continue;
+	    }
+	  }
+	  dosage_present_hw |= 1U << sample_idx_lowbits;
+	  *cur_dosage_vals_iter++ = write_dosage_int;
+	}
+	write_genovec_iter[widx] = genovec_word;
+	((halfword_t*)write_dosage_present_iter)[widx] = (halfword_t)dosage_present_hw;
+	++widx;
+      }
+      const uint32_t dosage_ct = (uintptr_t)(cur_dosage_vals_iter - write_dosage_vals_iter);
+      if (prov_ref_allele_second) {
+	genovec_invert_unsafe(sample_ct, write_genovec_iter);
+	zero_trailing_quaters(sample_ct, write_genovec_iter);
+	if (dosage_ct) {
+	  biallelic_dosage16_invert(dosage_ct, write_dosage_vals_iter);
+	}
+      }
+      *write_dosage_ct_iter++ = dosage_ct;
+      write_genovec_iter = &(write_genovec_iter[sample_ctaw2]);
+      write_dosage_present_iter = &(write_dosage_present_iter[sample_ctaw]);
+      write_dosage_vals_iter = &(write_dosage_vals_iter[sample_ct]);
+    }
+    if (vidx != vidx_end) {
+      // g_error_vidxs[tidx] = vidx + vidx_base;
+      g_error_ret = kPglRetMalformedInput;
+    }
+    // vidx_base += cur_block_write_ct;
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+static unsigned char** g_thread_wkspaces = nullptr;
+static uint32_t* g_thread_bidxs[2] = {nullptr, nullptr};
+static uint16_t* g_bgen_allele_cts[2] = {nullptr, nullptr};
+static uint32_t* g_uncompressed_genodata_byte_cts[2] = {nullptr, nullptr};
+
+// for each bit precision level, how large must
+//   max(numerators, 2^{bit_precision} - 1 - [sum of numerators])
+// be to avoid throwing out the genotype?
+static uint32_t* g_bgen_import_dosage_certainty_thresholds = nullptr;
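+// (Illustrative arithmetic only, since the initialization isn't shown here:
+// with --import-dosage-certainty 0.9 and bit_precision == 8, the threshold
+// should be on the order of 0.9 * 255 ~= 230, and a sample is kept only
+// when some numerator, or the implicit remainder 255 - sum, reaches it.)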
+
+// Reliably fast division by constants of the form 2^n - 1; see
+//   http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
+// The general case also requires a preshift parameter, but it's always zero
+// for the odd .bgen denominators.
+typedef struct bgen_magic_num_struct {
+  uint64_t totq_magic;
+  uint32_t totq_postshift;
+  uint32_t totq_incr;
+} bgen_magic_num_t;
+
+static const bgen_magic_num_t kBgenMagicNums[25] = {
+  {0, 0, 0},
+  {1, 0, 0},
+  {2863311531U, 33, 0},
+  {1227133513U, 33, 1},
+  {2290649225U, 35, 0},
+  {1108378657U, 35, 1},
+  {1090785345U, 36, 1},
+  {270549121, 35, 1},
+  {2155905153U, 39, 0},
+  {134480385, 36, 1},
+  {1074791425U, 40, 1},
+  {4196353, 33, 1},
+  {16781313, 36, 1},
+  {67117057, 39, 1},
+  {268451841, 42, 1},
+  {1073774593U, 45, 1},
+  {2147516417U, 47, 0}
+  // todo: check whether something similar works for 17-32 bit cases
+  /*
+  ,{131073, 34, 1},
+  {262145, 36, 1},
+  {524289, 38, 1},
+  {1048577, 40, 1},
+  {2097153, 42, 1},
+  {4194305, 44, 1},
+  {8388609, 46, 1},
+  {16777217, 48, 1},
+  {33554433, 50, 1},
+  {67108865, 52, 1},
+  {134217729, 54, 1},
+  {268435457, 56, 1},
+  {536870913, 58, 1},
+  {1073741825U, 60, 1},
+  {2147483649U, 62, 1},
+  {2147483649U, 63, 0}
+  */
+};
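+
+// Spot check of the 8-bit entry (illustration): the denominator is
+// 2^8 - 1 == 255, and 255 * 2155905153 == 2^39 + 127, i.e. totq_magic is
+// ceil(2^39 / 255).  Hence (v * totq_magic) >> 39 == v / 255 for every
+// dividend v below 2^39 / 127, which comfortably covers all 32-bit values;
+// e.g. v == 255 * 32768 yields exactly 32768.
+//
+// Only indices 1..16 are reachable, since bit_precision > 16 is routed to
+// the not-yet-supported path below; the trailing zero-initialized entries
+// are never read, and enabling the commented 17-32 bit entries would also
+// require enlarging the array to 33 entries.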
+
+static_assert(sizeof(dosage_t) == 2, "bgen13_dosage_or_phase_scan_thread() needs to be updated.");
+THREAD_FUNC_DECL bgen13_dosage_or_phase_scan_thread(void* arg) {
+  // This bails as soon as a single phased or dosage call is detected.  We
+  // provisionally assume e.g. phased calls are also present when dosages are,
+  // and clean up the relevant header bytes when the assumption is untrue.
+  // (Well, that's how it'll work after phased dosages are implemented,
+  // anyway.)
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uint32_t sample_ct = g_sample_ct;
+  const uint32_t dosage_erase_halfdist = g_dosage_erase_halfdist;
+  const uint32_t* bgen_import_dosage_certainty_thresholds = g_bgen_import_dosage_certainty_thresholds;
+  const uint32_t compression_mode = g_compression_mode;
+  const unsigned char* cur_uncompressed_geno = nullptr;
+  if (compression_mode) {
+    cur_uncompressed_geno = g_thread_wkspaces[tidx];
+  }
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+
+    // this is just used as a no-error flag
+    const uintptr_t cur_block_write_ct = g_cur_block_write_ct;
+    if (cur_block_write_ct) {
+      const uint32_t bidx_end = g_thread_bidxs[parity][tidx + 1];
+      uint32_t bidx = g_thread_bidxs[parity][tidx];
+      unsigned char** compressed_geno_starts = g_compressed_geno_starts[parity];
+      const uint16_t* bgen_allele_cts = g_bgen_allele_cts[parity];
+      const uint32_t* uncompressed_genodata_byte_cts = g_uncompressed_genodata_byte_cts[parity];
+      for (; bidx < bidx_end; ++bidx) {
+	const unsigned char* compressed_geno_start = compressed_geno_starts[bidx];
+	const unsigned char* compressed_geno_end = compressed_geno_starts[bidx + 1];
+	uint32_t compressed_byte_ct = (uintptr_t)(compressed_geno_end - compressed_geno_start);
+	uint32_t uncompressed_byte_ct;
+	if (compression_mode) {
+	  uncompressed_byte_ct = uncompressed_genodata_byte_cts[bidx];
+	  if (compression_mode == 1) {
+	    uLongf zlib_ulongf = uncompressed_byte_ct;
+	    // const_cast
+	    if (uncompress((Bytef*)((uintptr_t)cur_uncompressed_geno), &zlib_ulongf, (const Bytef*)compressed_geno_start, compressed_byte_ct) != Z_OK) {
+	      // possible todo: report variant index
+	      goto bgen13_dosage_or_phase_scan_thread_malformed;
+	    }
+	  } else {
+	    // const_cast
+	    const uintptr_t extracted_byte_ct = ZSTD_decompress((void*)((uintptr_t)cur_uncompressed_geno), uncompressed_byte_ct, compressed_geno_start, compressed_byte_ct);
+	    if (extracted_byte_ct != uncompressed_byte_ct) {
+	      // possible todo: inspect error code
+	      goto bgen13_dosage_or_phase_scan_thread_malformed;
+	    }
+	  }
+	} else {
+	  cur_uncompressed_geno = compressed_geno_start;
+	  uncompressed_byte_ct = compressed_byte_ct;
+	}
+	// 4 bytes: sample_ct
+	// 2 bytes: # of alleles, must match bgen_allele_cts[bidx]
+	// 1 byte: min ploidy
+	// 1 byte: max ploidy
+	// sample_ct bytes: low 6 bits = ploidy, top bit = missingness
+	// 1 byte: 1 if phased, 0 if not
+	// 1 byte: # of bits of probability precision (we just support 8 and 16
+	//         for now, add others later)
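+	// (equivalently, the uncompressed block begins with a packed header:
+	//  uint32_t sample_ct; uint16_t allele_ct; uint8_t min_ploidy,
+	//  max_ploidy; uint8_t missing_and_ploidy[sample_ct]; uint8_t
+	//  is_phased, bit_precision; with the probability data following)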
+	if ((uncompressed_byte_ct < 10 + sample_ct) || memcmp(&sample_ct, cur_uncompressed_geno, 4)) {
+	  goto bgen13_dosage_or_phase_scan_thread_malformed;
+	}
+	const uint32_t cur_allele_ct = bgen_allele_cts[bidx];
+	if (*((const uint16_t*)(&(cur_uncompressed_geno[4]))) != cur_allele_ct) {
+	  goto bgen13_dosage_or_phase_scan_thread_malformed;
+	}
+	const uint32_t min_ploidy = cur_uncompressed_geno[6];
+	const uint32_t max_ploidy = cur_uncompressed_geno[7];
+	if ((min_ploidy > max_ploidy) || (max_ploidy > 63)) {
+	  goto bgen13_dosage_or_phase_scan_thread_malformed;
+	}
+	if (max_ploidy > 2) {
+	  goto bgen13_dosage_or_phase_scan_thread_not_yet_supported;
+	}
+	const unsigned char* missing_and_ploidy_info = &(cur_uncompressed_geno[8]);
+	const unsigned char* uncompressed_geno_iter = &(cur_uncompressed_geno[8 + sample_ct]);
+	const uint32_t is_phased = *uncompressed_geno_iter++;
+	if (is_phased > 1) {
+	  goto bgen13_dosage_or_phase_scan_thread_malformed;
+	}
+	const uint32_t bit_precision = *uncompressed_geno_iter++;
+	if ((!bit_precision) || (bit_precision > 32)) {
+	  goto bgen13_dosage_or_phase_scan_thread_malformed;
+	}
+	if (bit_precision > 16) {
+	  goto bgen13_dosage_or_phase_scan_thread_not_yet_supported;
+	}
+	const uint64_t totq_magic = kBgenMagicNums[bit_precision].totq_magic;
+	const uint32_t totq_postshift = kBgenMagicNums[bit_precision].totq_postshift;
+	uint32_t totq_incr = kBgenMagicNums[bit_precision].totq_incr;
+	const uint32_t bytes_per_prob = DIV_UP(bit_precision, CHAR_BIT);
+
+	// also equal to denominator
+	const uintptr_t numer_mask = (1U << bit_precision) - 1;
+
+        // diploid (haploid is identical except b is always zero):
+        //   round((32768a + 16384b)/(2^{bit precision} - 1))
+	//   floor((32768a + 16384b)/(2^{bit_precision} - 1) + 0.5)
+	// = floor((32768a + 16384b + 2^{bit_precision - 1})
+        //     / (2^{bit_precision} - 1))
+	// = (totq_magic * (32768a + 16384b + 2^{bits-1} + totq_incr))
+        //     >> totq_postshift
+	//
+	// This works fine for bit_precision <= 16, anyway.  There are two
+	// issues which come up with higher precision:
+	// 1. The ridiculous_fish magic numbers assume a 32-bit dividend.  Our
+	//    dividend is guaranteed to be divisible by 2^14, but it can be as
+	//    large as
+	//      (2^{bits} - 1) * 2^15 + 2^{bits-1}.
+	//    I would not be surprised if a similar approach still works with
+	//    bits > 16, but I'm pretty sure the magic-number-generating
+	//    function would need to be different.
+	// 2. Relatedly, the current sequence of operations multiplies
+	//    totq_magic by (dividend + totq_incr) (where totq_incr is zero or
+	//    one); this intermediate result must not overflow a uint64_t.
+        //
+        // Meanwhile, idempotence is not possible for --import-dosage-certainty
+	// anyway, so we apply that check to the pre-conversion numerators.
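+	// Worked 16-bit example (illustration): numer_aa == 65535 (a certain
+	// hom-alt call) and numer_ab == 0 give dividend
+	// 32768 * 65535 + 2^15 == 2^31; with totq_magic == 2147516417 ==
+	// 2^31 + 32769 and totq_postshift == 47, (2^31 * totq_magic) >> 47
+	// == 32768 == kDosageMax, matching round(32768 * 65535 / 65535).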
+	totq_incr += 1U << (bit_precision - 1);
+	uint32_t numer_certainty_min = 0;
+	if (bgen_import_dosage_certainty_thresholds) {
+	  numer_certainty_min = bgen_import_dosage_certainty_thresholds[bit_precision];
+	}
+
+	if (is_phased) {
+	  // todo
+	  goto bgen13_dosage_or_phase_scan_thread_not_yet_supported;
+	} else {
+	  if (cur_allele_ct == 2) {
+	    if (min_ploidy == max_ploidy) {
+	      // faster handling of common cases (no need to keep checking if
+	      // we've read past the end)
+	      if (uncompressed_byte_ct != (1 + bytes_per_prob * (max_ploidy * k1LU)) * sample_ct + 10) {
+		goto bgen13_dosage_or_phase_scan_thread_malformed;
+	      }
+	      if (max_ploidy == 2) {
+		for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+		  const uint32_t missing_and_ploidy = missing_and_ploidy_info[sample_idx];
+		  // treat anything else as missing
+		  if (missing_and_ploidy == 2) {
+		    const unsigned char* sample_probs_start = &(uncompressed_geno_iter[sample_idx * 2 * bytes_per_prob]);
+#ifdef __arm__
+  #error "Unaligned accesses in bgen13_dosage_or_phase_scan_thread()."
+#endif
+		    // this can read slightly past the end of the buffer
+		    const uintptr_t numer_aa = (*((const uint32_t*)sample_probs_start)) & numer_mask;
+		    const uintptr_t numer_ab = (*((const uint32_t*)(&(sample_probs_start[bytes_per_prob])))) & numer_mask;
+		    if ((numer_aa < numer_certainty_min) && (numer_ab < numer_certainty_min) && (numer_mask - numer_certainty_min < numer_aa + numer_ab)) {
+		      // treat as missing
+		      continue;
+		    }
+		    const uint32_t write_dosage_int = (totq_magic * (kDosageMax * ((uint64_t)numer_aa) + kDosageMid * ((uint64_t)numer_ab) + totq_incr)) >> totq_postshift;
+		    const uint32_t halfdist = biallelic_dosage_halfdist(write_dosage_int);
+		    if (halfdist < dosage_erase_halfdist) {
+		      goto bgen13_dosage_or_phase_scan_thread_found;
+		    }
+		  }
+		}
+	      } else if (max_ploidy == 1) {
+		for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+		  const uint32_t missing_and_ploidy = missing_and_ploidy_info[sample_idx];
+		  if (missing_and_ploidy == 1) {
+		    const unsigned char* sample_probs_start = &(uncompressed_geno_iter[sample_idx * bytes_per_prob]);
+		    const uintptr_t numer_a = (*((const uint32_t*)sample_probs_start)) & numer_mask;
+		    if ((numer_a < numer_certainty_min) && (numer_mask - numer_certainty_min < numer_a)) {
+		      continue;
+		    }
+		    const uint32_t write_dosage_int = (totq_magic * (kDosageMax * ((uint64_t)numer_a) + totq_incr)) >> totq_postshift;
+		    const uint32_t halfdist = biallelic_dosage_halfdist(write_dosage_int);
+		    if (halfdist < dosage_erase_halfdist) {
+		      goto bgen13_dosage_or_phase_scan_thread_found;
+		    }
+		  }
+		}
+	      }
+	      // don't need to do anything in all-ploidy-0 case
+	    } else {
+	      const unsigned char* uncompressed_geno_end = &(cur_uncompressed_geno[uncompressed_byte_ct]);
+	      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+		if (uncompressed_geno_iter > uncompressed_geno_end) {
+		  goto bgen13_dosage_or_phase_scan_thread_malformed;
+		}
+		uint32_t missing_and_ploidy = missing_and_ploidy_info[sample_idx];
+		if (missing_and_ploidy == 2) {
+		  const uintptr_t numer_aa = (*((const uint32_t*)uncompressed_geno_iter)) & numer_mask;
+		  const uintptr_t numer_ab = (*((const uint32_t*)(&(uncompressed_geno_iter[bytes_per_prob])))) & numer_mask;
+		  uncompressed_geno_iter = &(uncompressed_geno_iter[2 * bytes_per_prob]);
+		  if ((numer_aa < numer_certainty_min) && (numer_ab < numer_certainty_min) && (numer_mask - numer_certainty_min < numer_aa + numer_ab)) {
+		    // treat as missing
+		    continue;
+		  }
+		  const uint32_t write_dosage_int = (totq_magic * (kDosageMax * ((uint64_t)numer_aa) + kDosageMid * ((uint64_t)numer_ab) + totq_incr)) >> totq_postshift;
+		  const uint32_t halfdist = biallelic_dosage_halfdist(write_dosage_int);
+		  if (halfdist < dosage_erase_halfdist) {
+		    goto bgen13_dosage_or_phase_scan_thread_found;
+		  }
+		} else if (missing_and_ploidy == 1) {
+		  const uintptr_t numer_a = (*((const uint32_t*)uncompressed_geno_iter)) & numer_mask;
+		  uncompressed_geno_iter = &(uncompressed_geno_iter[bytes_per_prob]);
+		  if ((numer_a < numer_certainty_min) && (numer_mask - numer_certainty_min < numer_a)) {
+		    continue;
+		  }
+		  const uint32_t write_dosage_int = (totq_magic * (kDosageMax * ((uint64_t)numer_a) + totq_incr)) >> totq_postshift;
+		  const uint32_t halfdist = biallelic_dosage_halfdist(write_dosage_int);
+		  if (halfdist < dosage_erase_halfdist) {
+		    goto bgen13_dosage_or_phase_scan_thread_found;
+		  }
+		} else {
+		  // treat as missing
+		  missing_and_ploidy &= 127;
+		  if (missing_and_ploidy > 2) {
+		    goto bgen13_dosage_or_phase_scan_thread_malformed;
+		  }
+		  uncompressed_geno_iter = &(uncompressed_geno_iter[missing_and_ploidy * bytes_per_prob]);
+		}
+	      }
+	    }
+	  } else {
+	    // todo: unphased multiallelic variants
+	    // (shouldn't currently be possible to reach here, I/O thread skips
+	    // multiallelics for now)
+	    assert(0);
+	    goto bgen13_dosage_or_phase_scan_thread_not_yet_supported;
+	  }
+	}
+      }
+    }
+    while (0) {
+    bgen13_dosage_or_phase_scan_thread_malformed:
+      g_error_ret = kPglRetMalformedInput;
+      break;
+    bgen13_dosage_or_phase_scan_thread_not_yet_supported:
+      g_error_ret = kPglRetNotYetSupported;
+      break;
+    bgen13_dosage_or_phase_scan_thread_found:
+      g_dosage_is_present = 1;
+      break;
+    }
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+static uintptr_t* g_write_phasepresents[2] = {nullptr, nullptr};
+static uintptr_t* g_write_phaseinfos[2] = {nullptr, nullptr};
+static uintptr_t* g_write_dphase_presents[2] = {nullptr, nullptr};
+static uint32_t* g_write_dphase_cts[2] = {nullptr, nullptr};
+
+static_assert(sizeof(dosage_t) == 2, "bgen13_geno_to_pgen_thread() needs to be updated.");
+THREAD_FUNC_DECL bgen13_geno_to_pgen_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uintptr_t sample_ct = g_sample_ct;
+  const uint32_t hard_call_halfdist = g_hard_call_halfdist;
+  const uint32_t dosage_erase_halfdist = g_dosage_erase_halfdist;
+  const uint32_t* bgen_import_dosage_certainty_thresholds = g_bgen_import_dosage_certainty_thresholds;
+  const uint32_t compression_mode = g_compression_mode;
+  const uint32_t prov_ref_allele_second = g_prov_ref_allele_second;
+  const uintptr_t sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
+  const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
+  const uintptr_t sample_ctaw = BITCT_TO_ALIGNED_WORDCT(sample_ct);
+  const unsigned char* cur_uncompressed_geno = nullptr;
+  if (compression_mode) {
+    cur_uncompressed_geno = g_thread_wkspaces[tidx];
+  }
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+
+    // this is just used as a no-error flag
+    const uintptr_t cur_block_write_ct = g_cur_block_write_ct;
+    if (cur_block_write_ct) {
+      const uint32_t bidx_end = g_thread_bidxs[parity][tidx + 1];
+      uint32_t bidx = g_thread_bidxs[parity][tidx];
+      unsigned char** compressed_geno_starts = g_compressed_geno_starts[parity];
+      uintptr_t* write_genovec_iter = &(g_write_genovecs[parity][bidx * sample_ctaw2]);
+      uint32_t* write_dosage_ct_iter = &(g_write_dosage_cts[parity][bidx]);
+      uint32_t* write_dphase_ct_iter = &(g_write_dphase_cts[parity][bidx]);
+      uintptr_t* write_dosage_present_iter = &(g_write_dosage_presents[parity][bidx * sample_ctaw]);
+      uintptr_t* write_dphase_present_iter = &(g_write_dphase_presents[parity][bidx * sample_ctaw]);
+      dosage_t* write_dosage_vals_iter = &(g_write_dosage_val_bufs[parity][bidx * sample_ct * 2]);
+      const uint16_t* bgen_allele_ct_iter = &(g_bgen_allele_cts[parity][bidx]);
+      const uint32_t* uncompressed_genodata_byte_ct_iter = &(g_uncompressed_genodata_byte_cts[parity][bidx]);
+      for (; bidx < bidx_end; ++bidx) {
+	const unsigned char* compressed_geno_start = compressed_geno_starts[bidx];
+	const unsigned char* compressed_geno_end = compressed_geno_starts[bidx + 1];
+	uint32_t compressed_byte_ct = (uintptr_t)(compressed_geno_end - compressed_geno_start);
+	uint32_t uncompressed_byte_ct;
+	if (compression_mode) {
+	  uncompressed_byte_ct = *uncompressed_genodata_byte_ct_iter++;
+	  if (compression_mode == 1) {
+	    uLongf zlib_ulongf = uncompressed_byte_ct;
+	    // const_cast
+	    if (uncompress((Bytef*)((uintptr_t)cur_uncompressed_geno), &zlib_ulongf, (const Bytef*)compressed_geno_start, compressed_byte_ct) != Z_OK) {
+	      // possible todo: report variant index
+	      goto bgen13_geno_to_pgen_thread_malformed;
+	    }
+	  } else {
+            // const_cast
+	    const uintptr_t extracted_byte_ct = ZSTD_decompress((void*)((uintptr_t)cur_uncompressed_geno), uncompressed_byte_ct, compressed_geno_start, compressed_byte_ct);
+	    if (extracted_byte_ct != uncompressed_byte_ct) {
+	      // possible todo: inspect error code
+	      goto bgen13_geno_to_pgen_thread_malformed;
+	    }
+	  }
+	} else {
+	  cur_uncompressed_geno = compressed_geno_start;
+	  uncompressed_byte_ct = compressed_byte_ct;
+	}
+	// 4 bytes: sample_ct
+	// 2 bytes: # of alleles, must match bgen_allele_cts[bidx]
+	// 1 byte: min ploidy
+	// 1 byte: max ploidy
+	// sample_ct bytes: low 6 bits = ploidy, top bit = missingness
+	// 1 byte: 1 if phased, 0 if not
+	// 1 byte: # of bits of probability precision (we just support 8 and 16
+	//         for now, add others later)
+	if ((uncompressed_byte_ct < 10 + sample_ct) || memcmp(&sample_ct, cur_uncompressed_geno, 4)) {
+	  goto bgen13_geno_to_pgen_thread_malformed;
+	}
+	const uint32_t cur_allele_ct = *bgen_allele_ct_iter++;
+	if (*((const uint16_t*)(&(cur_uncompressed_geno[4]))) != cur_allele_ct) {
+	  goto bgen13_geno_to_pgen_thread_malformed;
+	}
+	const uint32_t min_ploidy = cur_uncompressed_geno[6];
+	const uint32_t max_ploidy = cur_uncompressed_geno[7];
+	if ((min_ploidy > max_ploidy) || (max_ploidy > 63)) {
+	  goto bgen13_geno_to_pgen_thread_malformed;
+	}
+	if (max_ploidy > 2) {
+	  goto bgen13_geno_to_pgen_thread_not_yet_supported;
+	}
+	const unsigned char* missing_and_ploidy_iter = &(cur_uncompressed_geno[8]);
+	const unsigned char* uncompressed_geno_iter = &(cur_uncompressed_geno[8 + sample_ct]);
+	const uint32_t is_phased = *uncompressed_geno_iter++;
+	if (is_phased > 1) {
+	  goto bgen13_geno_to_pgen_thread_malformed;
+	}
+	const uint32_t bit_precision = *uncompressed_geno_iter++;
+	if ((!bit_precision) || (bit_precision > 32)) {
+	  goto bgen13_geno_to_pgen_thread_malformed;
+	}
+	if (bit_precision > 16) {
+	  goto bgen13_geno_to_pgen_thread_not_yet_supported;
+	}
+	const uint64_t totq_magic = kBgenMagicNums[bit_precision].totq_magic;
+	const uint32_t totq_postshift = kBgenMagicNums[bit_precision].totq_postshift;
+	uint32_t totq_incr = kBgenMagicNums[bit_precision].totq_incr;
+	const uint32_t bytes_per_prob = DIV_UP(bit_precision, CHAR_BIT);
+
+	// also equal to denominator
+	const uintptr_t numer_mask = (1U << bit_precision) - 1;
+
+	totq_incr += 1U << (bit_precision - 1);
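+	// A reading of the code, not a spec statement: the multiply/shift
+	// below evaluates round((kDosageMax * aa + kDosageMid * ab) /
+	// numer_mask) without a division.  totq_magic/totq_postshift encode
+	// a fixed-point reciprocal of numer_mask, and the 1U <<
+	// (bit_precision - 1) folded into totq_incr supplies
+	// round-to-nearest.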
+	uint32_t numer_certainty_min = 0;
+	if (bgen_import_dosage_certainty_thresholds) {
+	  numer_certainty_min = bgen_import_dosage_certainty_thresholds[bit_precision];
+	}
+
+	dosage_t* cur_dosage_vals_iter = write_dosage_vals_iter;
+	uint32_t inner_loop_last = kBitsPerWordD2 - 1;
+	uint32_t widx = 0;
+	if (is_phased) {
+	  // todo
+	  goto bgen13_geno_to_pgen_thread_not_yet_supported;
+	} else {
+	  // fill_ulong_zero(sample_ctaw, write_dphase_present_iter);
+	  if (cur_allele_ct == 2) {
+	    if (min_ploidy == max_ploidy) {
+	      // faster handling of common cases (no need to keep checking if
+	      // we've read past the end)
+	      if (uncompressed_byte_ct != (bytes_per_prob * (max_ploidy * k1LU) + 1) * sample_ct + 10) {
+		goto bgen13_geno_to_pgen_thread_malformed;
+	      }
+	      if (max_ploidy == 2) {
+		while (1) {
+		  if (widx >= sample_ctl2_m1) {
+		    if (widx > sample_ctl2_m1) {
+		      break;
+		    }
+		    inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+		  }
+		  uintptr_t genovec_word = 0;
+		  uint32_t dosage_present_hw = 0;
+		  for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits, uncompressed_geno_iter = &(uncompressed_geno_iter[2 * bytes_per_prob])) {
+		    const uint32_t missing_and_ploidy = *missing_and_ploidy_iter++;
+		    if (missing_and_ploidy == 2) {
+#ifdef __arm__
+  #error "Unaligned accesses in bgen13_geno_to_pgen_thread()."
+#endif
+		      const uintptr_t numer_aa = (*((const uint32_t*)uncompressed_geno_iter)) & numer_mask;
+		      const uintptr_t numer_ab = (*((const uint32_t*)(&(uncompressed_geno_iter[bytes_per_prob])))) & numer_mask;
+		      if (numer_aa + numer_ab > numer_mask) {
+			goto bgen13_geno_to_pgen_thread_malformed;
+		      }
+		      if ((numer_aa < numer_certainty_min) && (numer_ab < numer_certainty_min) && (numer_mask - numer_certainty_min < numer_aa + numer_ab)) {
+			// missing due to --import-dosage-certainty
+			goto bgen13_geno_to_pgen_thread_diploid_missing;
+		      }
+		      const uint32_t write_dosage_int = (totq_magic * (kDosageMax * ((uint64_t)numer_aa) + kDosageMid * ((uint64_t)numer_ab) + totq_incr)) >> totq_postshift;
+		      const uint32_t halfdist = biallelic_dosage_halfdist(write_dosage_int);
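+		      // halfdist grows as write_dosage_int approaches an
+		      // integer genotype.  Too far from every hardcall
+		      // (halfdist < hard_call_halfdist): store a missing
+		      // genotype but keep the explicit dosage.  Close enough
+		      // (halfdist >= dosage_erase_halfdist): store the
+		      // rounded hardcall and drop the dosage.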
+		      if (halfdist < hard_call_halfdist) {
+			genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+		      } else {
+			genovec_word |= ((write_dosage_int + (kDosage4th * k1LU)) / kDosageMid) << (2 * sample_idx_lowbits);
+			if (halfdist >= dosage_erase_halfdist) {
+			  continue;
+			}
+		      }
+		      dosage_present_hw |= 1U << sample_idx_lowbits;
+		      *cur_dosage_vals_iter++ = write_dosage_int;
+		    } else {
+		      // (could also validate that missing_and_ploidy == 130)
+		    bgen13_geno_to_pgen_thread_diploid_missing:
+		      genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+		    }
+		  }
+		  write_genovec_iter[widx] = genovec_word;
+		  ((halfword_t*)write_dosage_present_iter)[widx] = (halfword_t)dosage_present_hw;
+		  ++widx;
+		}
+	      } else if (max_ploidy == 1) {
+		while (1) {
+		  if (widx >= sample_ctl2_m1) {
+		    if (widx > sample_ctl2_m1) {
+		      break;
+		    }
+		    inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+		  }
+		  uintptr_t genovec_word = 0;
+		  uint32_t dosage_present_hw = 0;
+		  for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits, uncompressed_geno_iter = &(uncompressed_geno_iter[bytes_per_prob])) {
+		    const uint32_t missing_and_ploidy = *missing_and_ploidy_iter++;
+		    if (missing_and_ploidy == 1) {
+		      const uintptr_t numer_a = (*((const uint32_t*)uncompressed_geno_iter)) & numer_mask;
+		      if ((numer_a < numer_certainty_min) && (numer_mask - numer_certainty_min < numer_a)) {
+			goto bgen13_geno_to_pgen_thread_haploid_missing;
+		      }
+		      const uint32_t write_dosage_int = (totq_magic * (kDosageMax * ((uint64_t)numer_a) + totq_incr)) >> totq_postshift;
+		      const uint32_t halfdist = biallelic_dosage_halfdist(write_dosage_int);
+		      if (halfdist < hard_call_halfdist) {
+			genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+		      } else {
+			genovec_word |= ((write_dosage_int + (kDosage4th * k1LU)) / kDosageMid) << (2 * sample_idx_lowbits);
+			if (halfdist >= dosage_erase_halfdist) {
+			  continue;
+			}
+		      }
+		      dosage_present_hw |= 1U << sample_idx_lowbits;
+		      *cur_dosage_vals_iter++ = write_dosage_int;
+		    } else {
+		    bgen13_geno_to_pgen_thread_haploid_missing:
+		      genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+		    }
+		  }
+		  write_genovec_iter[widx] = genovec_word;
+		  ((halfword_t*)write_dosage_present_iter)[widx] = (halfword_t)dosage_present_hw;
+		  ++widx;
+		}
+	      }
+	      // don't need to do anything in all-ploidy-0 case
+	    } else {
+	      const unsigned char* uncompressed_geno_end = &(cur_uncompressed_geno[uncompressed_byte_ct]);
+	      while (1) {
+		if (widx >= sample_ctl2_m1) {
+		  if (widx > sample_ctl2_m1) {
+		    break;
+		  }
+		  inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+		}
+		uintptr_t genovec_word = 0;
+		uint32_t dosage_present_hw = 0;
+		for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		  if (uncompressed_geno_iter > uncompressed_geno_end) {
+		    goto bgen13_geno_to_pgen_thread_malformed;
+		  }
+		  uint32_t missing_and_ploidy = *missing_and_ploidy_iter++;
+		  uint32_t write_dosage_int;
+		  if (missing_and_ploidy == 2) {
+		    const uintptr_t numer_aa = (*((const uint32_t*)uncompressed_geno_iter)) & numer_mask;
+		    const uintptr_t numer_ab = (*((const uint32_t*)(&(uncompressed_geno_iter[bytes_per_prob])))) & numer_mask;
+		    uncompressed_geno_iter = &(uncompressed_geno_iter[2 * bytes_per_prob]);
+		    if (numer_aa + numer_ab > numer_mask) {
+		      goto bgen13_geno_to_pgen_thread_malformed;
+		    }
+		    if ((numer_aa < numer_certainty_min) && (numer_ab < numer_certainty_min) && (numer_mask - numer_certainty_min < numer_aa + numer_ab)) {
+		      // missing due to --import-dosage-certainty
+		      goto bgen13_geno_to_pgen_thread_generic_missing;
+		    }
+		    write_dosage_int = (totq_magic * (kDosageMax * ((uint64_t)numer_aa) + kDosageMid * ((uint64_t)numer_ab) + totq_incr)) >> totq_postshift;
+		  } else if (missing_and_ploidy == 1) {
+		    const uintptr_t numer_a = (*((const uint32_t*)uncompressed_geno_iter)) & numer_mask;
+		    uncompressed_geno_iter = &(uncompressed_geno_iter[bytes_per_prob]);
+		    if ((numer_a < numer_certainty_min) && (numer_mask - numer_certainty_min < numer_a)) {
+		      goto bgen13_geno_to_pgen_thread_generic_missing;
+		    }
+		    write_dosage_int = (totq_magic * (kDosageMax * ((uint64_t)numer_a) + totq_incr)) >> totq_postshift;
+		  } else {
+		    missing_and_ploidy &= 127;
+		    if (missing_and_ploidy > 2) {
+		      goto bgen13_geno_to_pgen_thread_malformed;
+		    }
+		    uncompressed_geno_iter = &(uncompressed_geno_iter[missing_and_ploidy * bytes_per_prob]);
+		  bgen13_geno_to_pgen_thread_generic_missing:
+		    genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+		    continue;
+		  }
+		  const uint32_t halfdist = biallelic_dosage_halfdist(write_dosage_int);
+		  if (halfdist < hard_call_halfdist) {
+		    genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+		  } else {
+		    genovec_word |= ((write_dosage_int + (kDosage4th * k1LU)) / kDosageMid) << (2 * sample_idx_lowbits);
+		    if (halfdist >= dosage_erase_halfdist) {
+		      continue;
+		    }
+		  }
+		  dosage_present_hw |= 1U << sample_idx_lowbits;
+		  *cur_dosage_vals_iter++ = write_dosage_int;
+		}
+		write_genovec_iter[widx] = genovec_word;
+		((halfword_t*)write_dosage_present_iter)[widx] = (halfword_t)dosage_present_hw;
+		++widx;
+	      }
+	    }
+	    const uint32_t dosage_ct = (uintptr_t)(cur_dosage_vals_iter - write_dosage_vals_iter);
+	    // note that this is inverted from bgen-1.1
+	    if (!prov_ref_allele_second) {
+	      genovec_invert_unsafe(sample_ct, write_genovec_iter);
+	      zero_trailing_quaters(sample_ct, write_genovec_iter);
+	      if (dosage_ct) {
+		biallelic_dosage16_invert(dosage_ct, write_dosage_vals_iter);
+	      }
+	    }
+	    *write_dosage_ct_iter++ = dosage_ct;
+	    *write_dphase_ct_iter++ = 0;
+	    write_genovec_iter = &(write_genovec_iter[sample_ctaw2]);
+	    write_dosage_present_iter = &(write_dosage_present_iter[sample_ctaw]);
+	    write_dphase_present_iter = &(write_dphase_present_iter[sample_ctaw]);
+	    write_dosage_vals_iter = &(write_dosage_vals_iter[sample_ct * 2]);
+	  } else {
+	    // todo: unphased multiallelic variants
+	    assert(0);
+	    goto bgen13_geno_to_pgen_thread_not_yet_supported;
+	  }
+	}
+      }
+    }
+    while (0) {
+    bgen13_geno_to_pgen_thread_malformed:
+      g_error_ret = kPglRetMalformedInput;
+      break;
+    bgen13_geno_to_pgen_thread_not_yet_supported:
+      g_error_ret = kPglRetNotYetSupported;
+      break;
+    }
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+static_assert(sizeof(dosage_t) == 2, "ox_bgen_to_pgen() needs to be updated.");
+pglerr_t ox_bgen_to_pgen(const char* bgenname, const char* samplename, const char* const_fid, const char* ox_missing_code, misc_flags_t misc_flags, oxford_import_t oxford_import_flags, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, double import_dosage_certainty, char id_delim, char idspace_to, uint32_t max_thread_ct, char* outname, char* outname_end, chr_info_t* cip) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  FILE* bgenfile = nullptr;
+
+  // only if no sample file specified, and .bgen has sample IDs.  (possible
+  // todo: consistency check when both sources of sample IDs are present?)
+  FILE* psamfile = nullptr;
+  
+  FILE* pvarfile = nullptr;
+  threads_state_t ts;
+  init_threads3z(&ts);
+  st_pgen_writer_t spgw;
+  pglerr_t reterr = kPglRetSuccess;
+  spgw_preinit(&spgw);
+  {
+    // Pass 1: Determine whether at least one non-hardcall needs to be
+    //         saved, and if a chromosome filter was specified, count the
+    //         number of variants which pass the filter.
+    //         For bgen-1.2/1.3, the .pvar is also written in this pass.
+    //         For bgen-1.1, we can usually early-bail when no chromosome
+    //         filter is involved, so .pvar writing is postponed till the
+    //         second pass.
+    // Pass 2: Write .pgen file.
+
+    if (fopen_checked(bgenname, FOPEN_RB, &bgenfile)) {
+      goto ox_bgen_to_pgen_ret_OPEN_FAIL;
+    }
+    uint32_t initial_uints[5];
+    if (!fread(initial_uints, 20, 1, bgenfile)) {
+      // this could be malformed input as well; could distinguish later?
+      goto ox_bgen_to_pgen_ret_READ_FAIL;
+    }
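+    // BGEN header prefix, per the v1.x spec: initial_uints[0] = offset of
+    // the first variant data block (relative to byte 4), [1] = header block
+    // length (so the flags field sits at file offset initial_uints[1]),
+    // [2] = variant count, [3] = sample count, [4] = "bgen" magic
+    // (0x6e656762), which older files may leave zero.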
+    if (initial_uints[1] > initial_uints[0]) {
+      logerrprint("Error: Invalid .bgen header.\n");
+      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+    }
+    const uint32_t raw_variant_ct = initial_uints[2];
+    if (!raw_variant_ct) {
+      // permit this in --allow-no-vars case?
+      logerrprint("Error: Empty .bgen file.\n");
+      goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT;
+    }
+    const uint32_t sample_ct = initial_uints[3];
+    if (initial_uints[4] && (initial_uints[4] != 0x6e656762)) {
+      logerrprint("Error: Invalid .bgen magic number.\n");
+      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+    }
+
+    if (fseeko(bgenfile, initial_uints[1], SEEK_SET)) {
+      goto ox_bgen_to_pgen_ret_READ_FAIL;
+    }
+    uint32_t header_flags;
+    if (!fread(&header_flags, 4, 1, bgenfile)) {
+      goto ox_bgen_to_pgen_ret_READ_FAIL;
+    }
+    const uint32_t compression_mode = header_flags & 3;
+    const uint32_t layout = (header_flags >> 2) & 15;
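+    // Flag bits, as consumed below: bits 0-1 = compression (0 = none,
+    // 1 = zlib, 2 = zstd), bits 2-5 = layout (1 = bgen-1.1,
+    // 2 = bgen-1.2/1.3), bit 31 = sample IDs stored in file.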
+    if (!layout) {
+      logerrprint("Error: BGEN v1.0 files are not supported by " PROG_NAME_STR ".\n");
+      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+    }
+    if ((compression_mode == 3) || (layout > 2)) {
+      logerrprint("Error: Unrecognized BGEN version.  Use gen-convert or a similar tool to\ndowncode to BGEN v1.3 if you want to process this data with " PROG_NAME_STR ".\n");
+      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+    }
+    if ((compression_mode == 2) && (layout == 1)) {
+      logerrprint("Error: Invalid .bgen header.\n");
+      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+    }
+    LOGPRINTF("--bgen: %u variant%s detected, format v1.%c.\n", raw_variant_ct, (raw_variant_ct == 1)? "" : "s", (layout == 1)? '1' : ((compression_mode == 2)? '3' : '2'));
+    if (samplename[0]) {
+      uint32_t sfile_sample_ct;
+      reterr = ox_sample_to_psam(samplename, ox_missing_code, misc_flags, outname, outname_end, &sfile_sample_ct);
+      if (reterr) {
+	goto ox_bgen_to_pgen_ret_1;
+      }
+      if (sfile_sample_ct != sample_ct) {
+	LOGERRPRINTF("Error: .sample file has %u sample%s, while .bgen file has %u.\n", sfile_sample_ct, (sfile_sample_ct == 1)? "" : "s", sample_ct);
+	goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT;
+      }
+      if (header_flags >> 31) {
+	uint32_t sample_id_block_byte_ct;
+	uint32_t sample_id_block_entry_ct;
+	if ((!fread(&sample_id_block_byte_ct, 4, 1, bgenfile)) ||
+	    (!fread(&sample_id_block_entry_ct, 4, 1, bgenfile))) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	if ((((uint64_t)sample_id_block_byte_ct) + initial_uints[1] > initial_uints[0]) ||
+	    (sample_id_block_entry_ct != sample_ct)) {
+	  logerrprint("Error: Invalid .bgen header.\n");
+	  goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	}
+      }
+    } else {
+      if (!(header_flags >> 31)) {
+	logerrprint("Error: .bgen file does not contain sample IDs, and no .sample file was\nspecified.\n");
+	goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT;
+      }
+      // possible todo: optionally error out if sample IDs aren't consistent
+      // between .bgen and .sample
+
+      // see vcf_sample_line()
+      // probable todo: wrap much of this in its own function
+      uint32_t double_id = (misc_flags / kfMiscDoubleId) & 1;
+      uintptr_t const_fid_len = 0;
+      if (const_fid) {
+	const_fid_len = strlen(const_fid);
+      } else if ((!double_id) && (!id_delim)) {
+	// default: --double-id + --id-delim
+	double_id = 1;
+	id_delim = '_';
+      }
+      const uint32_t double_or_const_fid = double_id || const_fid;
+      uint32_t sample_id_block_byte_ct;
+      uint32_t sample_id_block_entry_ct;
+      if ((!fread(&sample_id_block_byte_ct, 4, 1, bgenfile)) ||
+	  (!fread(&sample_id_block_entry_ct, 4, 1, bgenfile))) {
+	goto ox_bgen_to_pgen_ret_READ_FAIL;
+      }
+      if ((sample_id_block_byte_ct < 8) ||
+	  (((uint64_t)sample_id_block_byte_ct) + initial_uints[1] > initial_uints[0]) ||
+	  (sample_id_block_entry_ct != sample_ct)) {
+	logerrprint("Error: Invalid .bgen header.\n");
+	goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+      }
+      sample_id_block_byte_ct -= 8;
+      unsigned char* sample_id_block_main = bigstack_alloc(sample_id_block_byte_ct);
+      if (!sample_id_block_main) {
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+      unsigned char* sample_id_block_end = &(sample_id_block_main[sample_id_block_byte_ct]);
+      if (fread_checked(sample_id_block_main, sample_id_block_byte_ct, bgenfile)) {
+	goto ox_bgen_to_pgen_ret_READ_FAIL;
+      }
+      
+      // high 16 bits always zero
+      // we don't just use a uint16_t since we add 2
+      uint32_t input_id_slen = 0;
+      
+      // always check if any tab/eoln characters are present, and error out if
+      // so
+      // if id_delim != ' ', also check if spaces are present; if so, replace
+      // with --idspace-to character or error out
+      unsigned char* sample_id_block_iter = sample_id_block_main;
+      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	memcpy(&input_id_slen, sample_id_block_iter, 2);
+
+	// need to check this to avoid read-past-the-end undefined behavior
+	if ((uintptr_t)(sample_id_block_end - sample_id_block_iter) < input_id_slen + 2) {
+	  logerrprint("Error: Invalid .bgen header.\n");
+	  goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	}
+	unsigned char* sample_id_iter = &(sample_id_block_iter[2]);
+	unsigned char* sample_id_end = &(sample_id_iter[input_id_slen]);
+        uint32_t char_code_min = 32 + (id_delim != ' ');
+	for (; sample_id_iter != sample_id_end; ++sample_id_iter) {
+	  const uint32_t char_code = *sample_id_iter;
+	  if (char_code < char_code_min) {
+	    if (char_code < 32) {
+	      logerrprint("Error: .bgen sample ID contains tabs, newlines, and/or nonprinting characters.\n");
+	      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	    }
+	    if (!idspace_to) {
+	      logerrprint("Error: .bgen sample ID contains space(s).  Use --idspace-to to convert them to\nanother character, or \"--id-delim ' '\" to interpret the spaces as FID/IID\ndelimiters.\n");
+	      goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT;
+	    }
+	    *sample_id_iter = idspace_to;
+	  }
+	}
+	sample_id_block_iter = sample_id_end;
+      }
+      strcpy(outname_end, ".psam");
+      if (fopen_checked(outname, FOPEN_WB, &psamfile)) {
+	goto ox_bgen_to_pgen_ret_OPEN_FAIL;
+      }
+      char* textbuf = g_textbuf;
+      char* write_iter = strcpya(textbuf, "#FID\tIID");
+      uint32_t sid_present = 0;
+      if (id_delim) {
+	// check if three-part IDs are present
+	sample_id_block_iter = sample_id_block_main;
+	for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	  memcpy(&input_id_slen, sample_id_block_iter, 2);
+	  if ((uintptr_t)(sample_id_block_end - sample_id_block_iter) < input_id_slen + 2) {
+	    logerrprint("Error: Invalid .bgen header.\n");
+	    goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	  }
+	  unsigned char* sample_id_start = &(sample_id_block_iter[2]);
+	  unsigned char* sample_id_end = &(sample_id_start[input_id_slen]);
+	  unsigned char* first_delim = (unsigned char*)memchr(sample_id_start, (unsigned char)id_delim, (uintptr_t)(sample_id_end - sample_id_start));
+	  if (first_delim) {
+	    unsigned char* iid_start = &(first_delim[1]);
+	    if (memchr(iid_start, (unsigned char)id_delim, (uintptr_t)(sample_id_end - iid_start)) != nullptr) {
+	      sid_present = 1;
+	      write_iter = strcpya(write_iter, "\tSID");
+	      break;
+	    }
+	  }
+	  sample_id_block_iter = sample_id_end;
+	}
+      }
+      write_iter = strcpya(write_iter, "\tSEX");
+      append_binary_eoln(&write_iter);
+      char* textbuf_flush = &(textbuf[kMaxMediumLine]);
+      sample_id_block_iter = sample_id_block_main;
+      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	memcpy(&input_id_slen, sample_id_block_iter, 2);
+	if ((uintptr_t)(sample_id_block_end - sample_id_block_iter) < input_id_slen + 2) {
+	  logerrprint("Error: Invalid .bgen header.\n");
+	  goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	}
+	unsigned char* sample_id_start = &(sample_id_block_iter[2]);
+	if (input_id_slen <= 1) {
+	  if (!input_id_slen) {
+	    logerrprint("Error: Empty sample ID in .bgen file.\n");
+	    goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	  }
+	  if (*sample_id_start == '0') {
+	    logerrprint("Error: Sample ID cannot be '0'.\n");
+	    goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	  }
+	}
+	unsigned char* sample_id_end = &(sample_id_start[input_id_slen]);
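+	// Splitting sketch, assuming id_delim = '_': "fam1_ind1" -> FID
+	// "fam1" + IID "ind1"; "fam1_ind1_s1" additionally yields SID "s1";
+	// an ID with no delimiter is only accepted under --double-id or
+	// --const-fid (see the one_sample_id branch below).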
+	if (id_delim) {
+	  if (*sample_id_start == id_delim) {
+	    sprintf(g_logbuf, "Error: '%c' at beginning of sample ID.\n", id_delim);
+	    goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT_2;
+	  }
+	  unsigned char* first_delim = (unsigned char*)memchr(sample_id_start, (unsigned char)id_delim, input_id_slen);
+	  if (!first_delim) {
+	    if (double_or_const_fid) {
+	      goto ox_bgen_to_pgen_one_sample_id;
+	    }
+	    sprintf(g_logbuf, "Error: No '%c' in sample ID.\n", id_delim);
+	    goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT_2;
+	  }
+	  unsigned char* iid_start = &(first_delim[1]);
+	  unsigned char* iid_end = (unsigned char*)memchr(iid_start, (unsigned char)id_delim, (uintptr_t)(sample_id_end - iid_start));
+	  const unsigned char* sid_start = (const unsigned char*)(&(g_one_char_strs[96]));
+	  uint32_t sid_slen = 1;
+	  if (iid_end) {
+	    if (iid_start == iid_end) {
+	      sprintf(g_logbuf, "Error: Consecutive instances of '%c' in sample ID.\n", id_delim);
+	      goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT_DELIM;
+	    }
+	    sid_start = &(iid_end[1]);
+	    sid_slen = (uintptr_t)(sample_id_end - sid_start);
+	    if (memchr(sid_start, (unsigned char)id_delim, sid_slen)) {
+	      sprintf(g_logbuf, "Error: More than two instances of '%c' in sample ID.\n", id_delim);
+	      goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT_DELIM;
+	    }
+	  } else {
+	    iid_end = sample_id_end;
+	  }
+	  const uint32_t fid_slen = (uintptr_t)(first_delim - sample_id_start);
+	  if (fid_slen > kMaxIdSlen) {
+	    goto ox_bgen_to_pgen_ret_MALFORMED_INPUT_LONG_ID;
+	  }
+	  write_iter = memcpyax(write_iter, sample_id_start, fid_slen, '\t');
+	  const uint32_t iid_slen = (uintptr_t)(iid_end - iid_start);
+	  if ((*iid_start == '0') && (iid_slen == 1)) {
+	    logerrprint("Error: Sample ID induces an invalid IID of '0'.\n");
+	    goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT;
+	  }
+	  if (iid_slen > kMaxIdSlen) {
+	    goto ox_bgen_to_pgen_ret_MALFORMED_INPUT_LONG_ID;
+	  }
+	  write_iter = memcpya(write_iter, iid_start, iid_slen);
+	  if (sid_present) {
+	    *write_iter++ = '\t';
+	    write_iter = memcpya(write_iter, sid_start, sid_slen);
+	  }
+	} else {
+	ox_bgen_to_pgen_one_sample_id:
+	  if (input_id_slen > kMaxIdSlen) {
+	    goto ox_bgen_to_pgen_ret_MALFORMED_INPUT_LONG_ID;
+	  }
+	  if (double_id) {
+	    write_iter = memcpya(write_iter, sample_id_start, input_id_slen);
+	  } else {
+	    write_iter = memcpya(write_iter, const_fid, const_fid_len);
+	  }
+	  *write_iter++ = '\t';
+	  write_iter = memcpya(write_iter, sample_id_start, input_id_slen);
+	  if (sid_present) {
+	    write_iter = strcpya(write_iter, "\t0");
+	  }
+	}
+	// SEX
+	write_iter = memcpyl3a(write_iter, "\tNA");
+	append_binary_eoln(&write_iter);
+	if (write_iter >= textbuf_flush) {
+	  if (fwrite_checked(textbuf, (uintptr_t)(write_iter - textbuf), psamfile)) {
+	    goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	  }
+	  write_iter = textbuf;
+	}
+	sample_id_block_iter = sample_id_end;
+      }
+      if (sample_id_block_iter != &(sample_id_block_main[sample_id_block_byte_ct])) {
+	logerrprint("Error: Invalid .bgen header.\n");
+	goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+      }
+      if (write_iter != textbuf) {
+	if (fwrite_checked(textbuf, (uintptr_t)(write_iter - textbuf), psamfile)) {
+	  goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	}
+      }
+      bigstack_reset(sample_id_block_main);
+      if (fclose_null(&psamfile)) {
+	goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+      }
+      LOGPRINTFWW("--bgen: %u sample ID%s written to %s .\n", sample_ct, (sample_ct == 1)? "" : "s", outname);
+    }
+    if (fseeko(bgenfile, initial_uints[0] + 4, SEEK_SET)) {
+      goto ox_bgen_to_pgen_ret_READ_FAIL;
+    }
+    const uint32_t allow_extra_chrs = (misc_flags / kfMiscAllowExtraChrs) & 1;
+    finalize_chrset(misc_flags, cip);
+    const uint32_t autosome_ct_p1 = cip->autosome_ct + 1;
+    uint32_t chr_filter_present = (popcount_bit_idx(cip->chr_mask, 0, autosome_ct_p1) != autosome_ct_p1) || (allow_extra_chrs && (cip->is_include_stack || cip->incl_excl_name_stack));
+    if (!chr_filter_present) {
+      for (uint32_t xymt_idx = 0; xymt_idx < kChrOffsetCt; ++xymt_idx) {
+	if (cip->xymt_codes[xymt_idx] >= 0) {
+	  if (!is_set(cip->chr_mask, autosome_ct_p1 + xymt_idx)) {
+	    chr_filter_present = 1;
+	    break;
+	  }
+	}
+      }
+    }
+    
+    char* writebuf = (char*)bigstack_alloc_raw(kMaxMediumLine + kCompressStreamBlock + kCacheline);
+    char* writebuf_flush = &(writebuf[kCompressStreamBlock]);
+    strcpy(outname_end, ".pvar");
+    if (fopen_checked(outname, FOPEN_WB, &pvarfile)) {
+      goto ox_bgen_to_pgen_ret_OPEN_FAIL;
+    }
+    char* write_iter = writebuf;
+    if (cip->chrset_source) {
+      append_chrset_line(cip, &write_iter);
+    }
+    write_iter = strcpya(write_iter, "#CHROM\tPOS\tID\tREF\tALT" EOLN_STR);
+
+    const uint32_t snpid_chr = (oxford_import_flags & kfOxfordImportBgenSnpIdChr);
+
+    // true for both provisional-reference and real-reference second
+    const uint32_t prov_ref_allele_second = !(oxford_import_flags & kfOxfordImportRefFirst);
+
+    if (hard_call_thresh == 0xffffffffU) {
+      hard_call_thresh = kDosageMid / 10;
+    }
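+    // 0xffffffffU is the "not specified" sentinel; judging by the constant
+    // names, kDosageMid / 10 corresponds to the usual 0.1 hard-call
+    // threshold (kDosageMid = dosage 1.0 in internal units).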
+    const uint32_t sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
+    const uint32_t sample_ctaw = BITCT_TO_ALIGNED_WORDCT(sample_ct);
+    uint32_t dosage_is_present = 0;
+    g_sample_ct = sample_ct;
+    g_hard_call_halfdist = kDosage4th - hard_call_thresh;
+    g_dosage_erase_halfdist = kDosage4th - dosage_erase_thresh;
+    g_compression_mode = compression_mode;
+    g_prov_ref_allele_second = prov_ref_allele_second;
+    g_error_ret = kPglRetSuccess;
+    g_dosage_is_present = 0;
+    if (layout == 1) {
+      // v1.1
+      uintptr_t loadbuf_size = round_down_pow2(bigstack_left() / 4, kCacheline);
+#ifdef __LP64__
+      if (loadbuf_size > kMaxLongLine) {
+	loadbuf_size = kMaxLongLine;
+      }
+#endif
+      // must have enough space for chromosome and variant IDs
+      if (loadbuf_size < 2 * 65536) {
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+      unsigned char* loadbuf = (unsigned char*)bigstack_alloc_raw(loadbuf_size);
+      g_import_dosage_certainty_int = 1 + (int32_t)(import_dosage_certainty * 32768);
+      uintptr_t bgen_geno_max_byte_ct = 6LU * sample_ct;
+      if (compression_mode) {
+        bgen_geno_max_byte_ct = compressBound(bgen_geno_max_byte_ct);
+      }
+      if (bgen_geno_max_byte_ct > 0xffffffffU) {
+	logerrprint("Error: Too many samples for .bgen format.\n");
+	goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+      }
+      bgen_geno_max_byte_ct += compression_mode * 4;
+      // thread-count-independent:
+      //   (everything after "2 *" rounded up to cacheline)
+      //   compressed_geno_bufs: 2 * bgen_geno_max_byte_ct * main_block_size
+      //   g_compressed_geno_starts: 2 * sizeof(intptr_t) * main_block_size
+      //   g_write_genovecs: 2 * sample_ctaw2 * sizeof(intptr_t) *
+      //                     main_block_size
+      //   g_write_dosage_cts: 2 * sizeof(int32_t) * main_block_size
+      //   g_write_dosage_presents: 2 * sample_ctaw * sizeof(intptr_t) *
+      //                            main_block_size
+      //   g_write_dosage_val_bufs (main bottleneck): 2 * sample_ct *
+      //                                              sizeof(dosage_t)
+      // additional requirement per thread:
+      //   g_bgen_geno_bufs: sample_ct * 3 * sizeof(int16_t)
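+      // Rough worked example (illustrative only): with 500k samples and
+      // zlib, bgen_geno_max_byte_ct ~= compressBound(3MB) ~= 3MB, the
+      // genovec rows ~125KB, the dosage-present rows ~62.5KB, and the
+      // dosage-value rows 1MB, so each in-flight variant costs roughly
+      // 2 * 4.2MB and main_block_size lands in the low thousands on a
+      // 32GB workspace.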
+
+      uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+      if ((!compression_mode) && (calc_thread_ct > 2)) {
+	// computation doesn't seem to saturate past 2 threads when
+	// decompression isn't involved
+	calc_thread_ct = 2;
+      }
+      if (bigstack_alloc_thread(calc_thread_ct, &ts.threads) ||
+	  bigstack_alloc_usip(calc_thread_ct, &g_bgen_geno_bufs)) {
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+      const uint32_t sample_ct_x3 = sample_ct * 3;
+      for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	if (bigstack_alloc_usi(sample_ct_x3, &(g_bgen_geno_bufs[tidx]))) {
+	  goto ox_bgen_to_pgen_ret_NOMEM;
+	}
+      }
+      uintptr_t cachelines_avail_m12 = bigstack_left() / kCacheline;
+      // reserve 1/8 of remaining memory for writer
+      cachelines_avail_m12 -= cachelines_avail_m12 / 8;
+      if (cachelines_avail_m12 < 12) {
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+      // we're making 12 allocations; be pessimistic re: rounding
+      cachelines_avail_m12 -= 12;
+      const uintptr_t bytes_req_per_in_block_variant = 2 * (bgen_geno_max_byte_ct + sizeof(intptr_t) + sample_ctaw2 * sizeof(intptr_t) + sizeof(int32_t) + sample_ctaw * sizeof(intptr_t) + sample_ct * sizeof(dosage_t));
+      uintptr_t main_block_size = (cachelines_avail_m12 * kCacheline) / bytes_req_per_in_block_variant;
+      if (main_block_size > 65536) {
+	main_block_size = 65536;
+      } else if (main_block_size < 8) {
+	// this threshold is arbitrary
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+      if (calc_thread_ct > main_block_size / 8) {
+	calc_thread_ct = main_block_size / 8;
+      }
+      ts.calc_thread_ct = calc_thread_ct;
+      g_calc_thread_ct = calc_thread_ct;
+      unsigned char* compressed_geno_bufs[2];
+      if (bigstack_alloc_uc(bgen_geno_max_byte_ct * main_block_size, &(compressed_geno_bufs[0])) ||
+	  bigstack_alloc_uc(bgen_geno_max_byte_ct * main_block_size, &(compressed_geno_bufs[1])) ||
+	  bigstack_alloc_ucp(main_block_size, &(g_compressed_geno_starts[0])) ||
+	  bigstack_alloc_ucp(main_block_size, &(g_compressed_geno_starts[1])) ||
+	  bigstack_alloc_ul(sample_ctaw2 * main_block_size, &(g_write_genovecs[0])) ||
+	  bigstack_alloc_ul(sample_ctaw2 * main_block_size, &(g_write_genovecs[1])) ||
+	  bigstack_alloc_ui(main_block_size, &(g_write_dosage_cts[0])) ||
+	  bigstack_alloc_ui(main_block_size, &(g_write_dosage_cts[1])) ||
+	  bigstack_alloc_ul(sample_ctaw * main_block_size, &(g_write_dosage_presents[0])) ||
+	  bigstack_alloc_ul(sample_ctaw * main_block_size, &(g_write_dosage_presents[1])) ||
+	  bigstack_alloc_dosage(sample_ct * main_block_size, &(g_write_dosage_val_bufs[0])) ||
+	  bigstack_alloc_dosage(sample_ct * main_block_size, &(g_write_dosage_val_bufs[1]))) {
+	// this should be impossible
+	assert(0);
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+
+      // likely cases are (i) non-hardcall near top of the file, and (ii) no
+      // non-hardcalls at all.  to handle the first case efficiently, we want
+      // the first blocks to be small so we bail quickly; to handle the second
+      // case efficiently, we want large blocks on average.  so we start with
+      // a minimal block size and then repeatedly double.
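+      // e.g. with calc_thread_ct = 4, cur_block_size runs 4, 8, 16, ...
+      // until it reaches main_block_size.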
+      uint32_t variant_ct = 0;
+      uint32_t block_vidx = 0;
+      uint32_t cur_block_size = calc_thread_ct;
+      uint32_t parity = 0;
+      uintptr_t compressed_block_byte_ct = 6LU * sample_ct;
+      unsigned char** compressed_geno_starts = g_compressed_geno_starts[0];
+      unsigned char* bgen_geno_iter = compressed_geno_bufs[0];
+      for (uint32_t variant_uidx = 0; variant_uidx < raw_variant_ct; ) {
+	uint32_t uii;
+	if (!fread(&uii, 4, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	if (uii != sample_ct) {
+	  logprint("\n");
+	  logerrprint("Error: Unexpected number of samples specified in SNP block header.\n");
+	  goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	}
+	uint16_t snpid_slen;
+	if (!fread(&snpid_slen, 2, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	if (!snpid_chr) {
+	  if (fseeko(bgenfile, snpid_slen, SEEK_CUR)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	} else {
+	  if (!snpid_slen) {
+	    logprint("\n");
+	    logerrprint("Error: Length-0 SNP ID in .bgen file.\n");
+	    goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT;
+	  }
+	  if (!fread(loadbuf, snpid_slen, 1, bgenfile)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	  loadbuf[snpid_slen] = '\0';
+	}
+	uint16_t rsid_slen;
+	if (!fread(&rsid_slen, 2, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	if (fseeko(bgenfile, rsid_slen, SEEK_CUR)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	uint16_t chr_name_slen;
+	if (!fread(&chr_name_slen, 2, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	if (!snpid_chr) {
+	  if (!chr_name_slen) {
+	    logprint("\n");
+	    logerrprint("Error: Length-0 chromosome ID in .bgen file.\n");
+	    goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT;
+	  }
+	  if (!fread(loadbuf, chr_name_slen, 1, bgenfile)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	  if ((chr_name_slen == 2) && (!memcmp(loadbuf, "NA", 2))) {
+	    strcpy((char*)loadbuf, "0");
+	    chr_name_slen = 1;
+	  } else {
+	    loadbuf[chr_name_slen] = '\0';
+	  }
+	} else {
+	  if (fseeko(bgenfile, chr_name_slen, SEEK_CUR)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	  chr_name_slen = snpid_slen;
+	}
+	int32_t cur_chr_code;
+	reterr = get_or_add_chr_code_destructive("--bgen file", 0, allow_extra_chrs, (char*)loadbuf, (char*)(&(loadbuf[chr_name_slen])), cip, &cur_chr_code);
+	if (reterr) {
+	  goto ox_bgen_to_pgen_ret_1;
+	}
+	const uint32_t skip = !is_set(cip->chr_mask, cur_chr_code);
+
+	uint32_t cur_bp; // ignore in this pass
+	if (!fread(&cur_bp, 4, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+
+	// allele count always 2 and not stored when layout=1
+	for (uint32_t allele_idx = 0; allele_idx < 2; ++allele_idx) {
+	  uint32_t allele_slen;
+	  if (!fread(&allele_slen, 4, 1, bgenfile)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	  if (fseeko(bgenfile, allele_slen, SEEK_CUR)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	}
+
+	if (compression_mode) {
+#ifdef __LP64__
+	  compressed_block_byte_ct = 0;
+#endif
+	  if (!fread(&compressed_block_byte_ct, 4, 1, bgenfile)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	}
+	++variant_uidx;
+	if (!(variant_uidx % 1000)) {
+	  printf("\r--bgen: %uk variants scanned.", variant_uidx / 1000);
+	  fflush(stdout);
+	}
+	if (dosage_is_present || skip) {
+	  if (fseeko(bgenfile, compressed_block_byte_ct, SEEK_CUR)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	  // bugfix (25 Jun 2017): block_vidx should be left unchanged here
+	  variant_ct += 1 - skip;
+	  continue;
+	}
+	compressed_geno_starts[block_vidx] = bgen_geno_iter;
+	if (compression_mode) {
+	  memcpy(bgen_geno_iter, &compressed_block_byte_ct, 4);
+	  bgen_geno_iter = &(bgen_geno_iter[4]);
+	}
+	if (fread_checked(bgen_geno_iter, compressed_block_byte_ct, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	bgen_geno_iter = &(bgen_geno_iter[compressed_block_byte_ct]);
+	++block_vidx;
+	if (block_vidx == cur_block_size) {
+	  parity = 1 - parity;
+	  if (ts.thread_func_ptr) {
+	    // process *previous* block results
+	    join_threads3z(&ts);
+	    reterr = g_error_ret;
+	    if (reterr) {
+	      logprint("\n");
+	      logerrprint("Error: Invalid compressed SNP block in .bgen file.\n");
+	      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	    }
+	    dosage_is_present = g_dosage_is_present;
+	    if (dosage_is_present) {
+	      // don't need to scan for any more dosages
+	      stop_threads3z(&ts, &g_cur_block_write_ct);
+	      if (!chr_filter_present) {
+		break;
+	      }
+	      continue;
+	    }
+	  }
+	  g_cur_block_write_ct = cur_block_size;
+	  ts.thread_func_ptr = bgen11_dosage_scan_thread;
+	  if (spawn_threads3z(variant_ct, &ts)) {
+	    goto ox_bgen_to_pgen_ret_THREAD_CREATE_FAIL;
+	  }
+	  compressed_geno_starts = g_compressed_geno_starts[parity];
+	  bgen_geno_iter = compressed_geno_bufs[parity];
+	  block_vidx = 0;
+	  variant_ct += cur_block_size;
+	  if (cur_block_size < main_block_size) {
+	    cur_block_size *= 2;
+	    if (cur_block_size > main_block_size) {
+	      cur_block_size = main_block_size;
+	    }
+	  }
+	}
+      }
+
+      if (!chr_filter_present) {
+	variant_ct = raw_variant_ct;
+      } else {
+	variant_ct += block_vidx;
+        if (!variant_ct) {
+	  logprint("\n");
+	  LOGERRPRINTFWW("Error: All %u variant%s in .bgen file skipped due to chromosome filter.\n", raw_variant_ct, (raw_variant_ct == 1)? "" : "s");
+	  goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT;
+	}
+      }
+      if (ts.thread_func_ptr) {
+	join_threads3z(&ts);
+	reterr = g_error_ret;
+	if (reterr) {
+	  logprint("\n");
+	  logerrprint("Error: Invalid compressed SNP block in .bgen file.\n");
+	  goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	}
+	if (block_vidx && (!g_dosage_is_present)) {
+	  g_cur_block_write_ct = block_vidx;
+	} else {
+	  g_cur_block_write_ct = 0;
+	}
+	ts.is_last_block = 1;
+	if (spawn_threads3z(1, &ts)) {
+	  goto ox_bgen_to_pgen_ret_THREAD_CREATE_FAIL;
+	}
+	join_threads3z(&ts);
+	dosage_is_present = g_dosage_is_present;
+      }
+
+      if (fseeko(bgenfile, initial_uints[0] + 4, SEEK_SET)) {
+	goto ox_bgen_to_pgen_ret_READ_FAIL;
+      }
+      strcpy(outname_end, ".pgen");
+      uintptr_t spgw_alloc_cacheline_ct;
+      uint32_t max_vrec_len;
+      reterr = spgw_init_phase1(outname, nullptr, nullptr, variant_ct, sample_ct, dosage_is_present? kfPgenGlobalDosagePresent : kfPgenGlobal0, (oxford_import_flags & (kfOxfordImportRefFirst | kfOxfordImportRefSecond))? 1 : 2, &spgw, &spgw_alloc_cacheline_ct, &max_vrec_len);
+      if (reterr) {
+	goto ox_bgen_to_pgen_ret_1;
+      }
+      unsigned char* spgw_alloc;
+      if (bigstack_alloc_uc(spgw_alloc_cacheline_ct * kCacheline, &spgw_alloc)) {
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+      spgw_init_phase2(max_vrec_len, &spgw, spgw_alloc);
+
+      // Main workflow:
+      // 1. Set n=0, load genotype data for first main_block_size variants
+      //    while writing .pvar
+      //
+      // 2. Spawn threads processing batch n genotype data
+      // 3. If n>0, write results for block (n-1)
+      // 4. Increment n by 1
+      // 5. Load/write-.pvar for batch (n+1) unless eof
+      // 6. Join threads
+      // 7. Goto step 2 unless eof
+      //
+      // 8. Write results for last block
+      //
+      // (May be better to change this to use one output buffer instead of 2.)
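+      // Net effect: while worker threads convert batch n, the main thread
+      // reads batch n+1 from the .bgen (emitting .pvar lines as it goes)
+      // and flushes batch n-1 through the .pgen writer, so I/O overlaps
+      // computation via the two parity-indexed buffer sets.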
+      uint32_t vidx_start = 0;
+      uint32_t prev_block_write_ct = 0;
+      parity = 0;
+      reinit_threads3z(&ts);
+      while (1) {
+	uint32_t cur_block_write_ct = 0;
+	if (!ts.is_last_block) {
+	  cur_block_write_ct = MINV(variant_ct - vidx_start, main_block_size);
+	  compressed_geno_starts = g_compressed_geno_starts[parity];
+          bgen_geno_iter = compressed_geno_bufs[parity];
+	  for (block_vidx = 0; block_vidx < cur_block_write_ct;) {
+	    uint32_t uii;
+	    if (!fread(&uii, 4, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    if (uii != sample_ct) {
+	      logprint("\n");
+	      logerrprint("Error: Unexpected number of samples specified in SNP block header.\n");
+	      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	    }
+	    uint16_t snpid_slen;
+	    if (!fread(&snpid_slen, 2, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    char* rsid_start = (char*)loadbuf;
+	    if (!snpid_chr) {
+	      if (fseeko(bgenfile, snpid_slen, SEEK_CUR)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	    } else {
+	      if (!snpid_slen) {
+		logprint("\n");
+		logerrprint("Error: Length-0 SNP ID in .bgen file.\n");
+		goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT;
+	      }
+	      if (!fread(loadbuf, snpid_slen, 1, bgenfile)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	      loadbuf[snpid_slen] = '\0';
+	      rsid_start = (char*)(&(loadbuf[snpid_slen + 1]));
+	    }
+	    uint16_t rsid_slen;
+	    if (!fread(&rsid_slen, 2, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    if (!rsid_slen) {
+	      logprint("\n");
+	      logerrprint("Error: Length-0 rsID in .bgen file.\n");
+	      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	    }
+	    if (!fread(rsid_start, rsid_slen, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    char* loadbuf_iter = &(rsid_start[rsid_slen]);
+	    char* chr_name_start = loadbuf_iter;
+	    uint16_t chr_name_slen;
+	    if (!fread(&chr_name_slen, 2, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    if (!snpid_chr) {
+	      if (!chr_name_slen) {
+		logprint("\n");
+		logerrprint("Error: Length-0 chromosome ID in .bgen file.\n");
+		goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT;
+	      }
+	      if (!fread(chr_name_start, chr_name_slen, 1, bgenfile)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	      if ((chr_name_slen == 2) && (!memcmp(chr_name_start, "NA", 2))) {
+		strcpy(chr_name_start, "0");
+		chr_name_slen = 1;
+	      } else {
+		chr_name_start[chr_name_slen] = '\0';
+	      }
+	    } else {
+	      if (fseeko(bgenfile, chr_name_slen, SEEK_CUR)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	      chr_name_start = (char*)loadbuf;
+	      chr_name_slen = snpid_slen;
+	    }
+	    int32_t cur_chr_code;
+	    reterr = get_or_add_chr_code_destructive("--bgen file", 0, allow_extra_chrs, (char*)chr_name_start, &(chr_name_start[chr_name_slen]), cip, &cur_chr_code);
+	    if (reterr) {
+	      goto ox_bgen_to_pgen_ret_1;
+	    }
+	    const uint32_t skip = !is_set(cip->chr_mask, cur_chr_code);
+
+	    uint32_t cur_bp;
+	    uint32_t a1_slen;
+	    if ((!fread(&cur_bp, 4, 1, bgenfile)) ||
+		(!fread(&a1_slen, 4, 1, bgenfile))) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    if (skip) {
+	      uint32_t a2_slen;
+	      if (fseeko(bgenfile, a1_slen, SEEK_CUR) ||
+		  (!fread(&a2_slen, 4, 1, bgenfile)) ||
+		  fseeko(bgenfile, a2_slen, SEEK_CUR)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	      if (compression_mode) {
+#ifdef __LP64__
+		compressed_block_byte_ct = 0;
+#endif
+		if (!fread(&compressed_block_byte_ct, 4, 1, bgenfile)) {
+		  goto ox_bgen_to_pgen_ret_READ_FAIL;
+		}
+	      }
+	      if (fseeko(bgenfile, compressed_block_byte_ct, SEEK_CUR)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	      continue;
+	    }
+	    char* a1_ptr = loadbuf_iter;
+	    if (!a1_slen) {
+	      logprint("\n");
+	      logerrprint("Error: Empty allele code in .bgen file.\n");
+	      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	    }
+	    if (a1_slen > 1000000000) {
+	      logprint("\n");
+	      logerrprint("Error: Allele code in .bgen file has more than 1 billion characters.\n");
+	      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	    }
+	    if (a1_slen + (uintptr_t)(a1_ptr - ((char*)loadbuf)) > loadbuf_size) {
+	      goto ox_bgen_to_pgen_ret_NOMEM;
+	    }
+	    if (!fread(a1_ptr, a1_slen, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    char* a2_ptr = &(a1_ptr[a1_slen]);
+	    uint32_t a2_slen;
+	    if (!fread(&a2_slen, 4, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    if (!a2_slen) {
+	      logprint("\n");
+	      logerrprint("Error: Empty allele code in .bgen file.\n");
+	      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	    }
+	    if (a2_slen > 1000000000) {
+	      logprint("\n");
+	      logerrprint("Error: Allele code in .bgen file has more than 1 billion characters.\n");
+	      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	    }
+	    if (a2_slen + (uintptr_t)(a2_ptr - ((char*)loadbuf)) > loadbuf_size) {
+	      goto ox_bgen_to_pgen_ret_NOMEM;
+	    }
+	    if (!fread(a2_ptr, a2_slen, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    if (compression_mode) {
+#ifdef __LP64__
+	      compressed_block_byte_ct = 0;
+#endif
+	      if (!fread(&compressed_block_byte_ct, 4, 1, bgenfile)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	    }
+	    write_iter = chr_name_write(cip, cur_chr_code, write_iter);
+	    *write_iter++ = '\t';
+	    if (cur_bp > 0x7ffffffe) {
+	      logprint("\n");
+	      logerrprint("Error: Invalid bp coordinate (> 2^31 - 2) in .bgen file\n");
+	      goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	    }
+	    write_iter = uint32toa_x(cur_bp, '\t', write_iter);
+	    write_iter = memcpyax(write_iter, rsid_start, rsid_slen, '\t');
+	    if (prov_ref_allele_second) {
+	      uint32_t swap_slen = a1_slen;
+	      a1_slen = a2_slen;
+	      a2_slen = swap_slen;
+	      char* swap_ptr = a1_ptr;
+	      a1_ptr = a2_ptr;
+	      a2_ptr = swap_ptr;
+	    }
+	    if ((write_iter >= writebuf_flush) || (a1_slen >= kMaxMediumLine)) {
+	      if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), pvarfile)) {
+		goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	      }
+	      write_iter = writebuf;
+	    }
+	    if (a1_slen < kMaxMediumLine) {
+	      write_iter = memcpya(write_iter, a1_ptr, a1_slen);
+	    } else {
+	      if (fwrite_checked(a1_ptr, a1_slen, pvarfile)) {
+		goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	      }
+	    }
+	    *write_iter++ = '\t';
+	    if ((write_iter >= writebuf_flush) || (a2_slen >= kMaxMediumLine)) {
+	      if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), pvarfile)) {
+		goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	      }
+	      write_iter = writebuf;
+	    }
+	    if (a2_slen < kMaxMediumLine) {
+	      write_iter = memcpya(write_iter, a2_ptr, a2_slen);
+	    } else {
+	      if (fwrite_checked(a2_ptr, a2_slen, pvarfile)) {
+		goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	      }
+	    }
+	    append_binary_eoln(&write_iter);
+
+	    compressed_geno_starts[block_vidx] = bgen_geno_iter;
+	    if (compression_mode) {
+	      memcpy(bgen_geno_iter, &compressed_block_byte_ct, 4);
+	      bgen_geno_iter = &(bgen_geno_iter[4]);
+	    }
+	    if (fread_checked(bgen_geno_iter, compressed_block_byte_ct, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    bgen_geno_iter = &(bgen_geno_iter[compressed_block_byte_ct]);
+	    ++block_vidx;
+	  }
+	}
+	if (vidx_start) {
+	  join_threads3z(&ts);
+	  reterr = g_error_ret;
+	  if (reterr) {
+	    logprint("\n");
+	    logerrprint("Error: Invalid compressed SNP block in .bgen file.\n");
+	    goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	  }
+	}
+	if (!ts.is_last_block) {
+	  g_cur_block_write_ct = cur_block_write_ct;
+	  ts.is_last_block = (vidx_start + cur_block_write_ct == variant_ct);
+	  ts.thread_func_ptr = bgen11_geno_to_pgen_thread;
+	  if (spawn_threads3z(vidx_start, &ts)) {
+	    goto ox_bgen_to_pgen_ret_THREAD_CREATE_FAIL;
+	  }
+	}
+	parity = 1 - parity;
+	if (vidx_start) {
+	  // write *previous* block results
+	  uintptr_t* write_genovec_iter = g_write_genovecs[parity];
+	  uint32_t* write_dosage_ct_iter = g_write_dosage_cts[parity];
+	  uintptr_t* write_dosage_present_iter = g_write_dosage_presents[parity];
+	  dosage_t* write_dosage_vals_iter = g_write_dosage_val_bufs[parity];
+	  for (uint32_t vidx = vidx_start - prev_block_write_ct; vidx < vidx_start; ++vidx) {
+	    const uint32_t cur_dosage_ct = *write_dosage_ct_iter++;
+	    if (!cur_dosage_ct) {
+	      if (spgw_append_biallelic_genovec(write_genovec_iter, &spgw)) {
+		goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	      }
+	    } else {
+	      if (spgw_append_biallelic_genovec_dosage16(write_genovec_iter, write_dosage_present_iter, write_dosage_vals_iter, cur_dosage_ct, &spgw)) {
+		goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	      }
+	    }
+            write_genovec_iter = &(write_genovec_iter[sample_ctaw2]);
+	    write_dosage_present_iter = &(write_dosage_present_iter[sample_ctaw]);
+	    write_dosage_vals_iter = &(write_dosage_vals_iter[sample_ct]);
+	  }
+	}
+	if (vidx_start == variant_ct) {
+	  break;
+	}
+	if (vidx_start) {
+	  printf("\r--bgen: %uk variants converted.", vidx_start / 1000);
+	  if (vidx_start <= main_block_size) {
+	    fputs("    \b\b\b\b", stdout);
+	  }
+	  fflush(stdout);
+	}
+	vidx_start += cur_block_write_ct;
+	prev_block_write_ct = cur_block_write_ct;
+      }
+    } else {
+      // v1.2-1.3
+
+      uintptr_t* allele_idx_offsets;
+      if (bigstack_end_alloc_ul(raw_variant_ct + 1, &allele_idx_offsets)) {
+	logerrprint("error path 1\n");
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+
+      g_bgen_import_dosage_certainty_thresholds = nullptr;
+      if (import_dosage_certainty > (1.0 - kSmallEpsilon) / 3.0) {
+	g_bgen_import_dosage_certainty_thresholds = (uint32_t*)bigstack_alloc_raw_rd(25 * sizeof(int32_t));
+	for (uint32_t bit_precision = 1; bit_precision <= 16; ++bit_precision) {
+	  const uint32_t denom = (1U << bit_precision) - 1;
+	  g_bgen_import_dosage_certainty_thresholds[bit_precision] = 1 + (int32_t)(import_dosage_certainty * ((int32_t)denom));
+	}
+      }
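+      // e.g. --import-dosage-certainty 0.9 with 8-bit probabilities yields
+      // 1 + (int32_t)(0.9 * 255) = 230, so a genotype is honored only when
+      // some probability numerator reaches 230/255.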
+      // bugfix (2 Jul 2017): if max_thread_ct == 1 but there's >12GB memory,
+      //   limit to 1 thread rather than (max_thread_ct - 1)...
+      const uint32_t calc_thread_ct_limit = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+      if (bigstack_alloc_thread(calc_thread_ct_limit, &ts.threads)) {
+	logerrprint("error path 2\n");
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+
+      g_thread_wkspaces = (unsigned char**)bigstack_alloc_raw_rd(calc_thread_ct_limit * sizeof(intptr_t));
+      g_thread_bidxs[0] = (uint32_t*)bigstack_alloc_raw_rd((calc_thread_ct_limit + 1) * sizeof(int32_t));
+      g_thread_bidxs[1] = (uint32_t*)bigstack_alloc_raw_rd((calc_thread_ct_limit + 1) * sizeof(int32_t));
+      // ***** all allocations from this point on are reset before pass 2 *****
+      uintptr_t main_block_size = 65536;
+      if (bigstack_alloc_usi(main_block_size, &(g_bgen_allele_cts[0])) ||
+	  bigstack_alloc_usi(main_block_size, &(g_bgen_allele_cts[1])) ||
+	  bigstack_alloc_ucp(main_block_size + 1, &(g_compressed_geno_starts[0])) ||
+	  bigstack_alloc_ucp(main_block_size + 1, &(g_compressed_geno_starts[1]))) {
+	logerrprint("error path 3\n");
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+      if (compression_mode) {
+	if (bigstack_alloc_ui(main_block_size, &(g_uncompressed_genodata_byte_cts[0])) ||
+	    bigstack_alloc_ui(main_block_size, &(g_uncompressed_genodata_byte_cts[1]))) {
+	  logerrprint("error path 4\n");
+	  goto ox_bgen_to_pgen_ret_NOMEM;
+	}
+      } else {
+	// defensive
+	g_uncompressed_genodata_byte_cts[0] = nullptr;
+	g_uncompressed_genodata_byte_cts[1] = nullptr;
+      }
+
+      // ploidy >2 is not supported by PLINK 2.  (A future build may have code
+      // to treat those calls as missing instead of erroring out, as is done
+      // with VCF ploidy >2.  But I'll wait until this case actually comes up
+      // in the wild...)
+      // But even without that, the diploid worst case of 65535 alleles with
+      // unphased 32-bit probabilities blows past the 4GB uncompressed-record
+      // size limit with just 1 sample!  Consequences:
+      // * A simple way to avoid unnecessary NOMEM errors is to give each
+      //   thread 4GB of decompression workspace on the first pass.  This may
+      //   greatly reduce the number of decompression worker threads we can
+      //   deploy, but for the first pass that's acceptable: the worker threads
+      //   will usually all exit almost immediately (since we just need to
+      //   determine whether *any* phase/dosage info needs to be saved).
+      // * Even 1 thread x 4GB won't always be available, especially since we
+      //   have a double-buffering workflow which requires additional
+      //   allocations summing to more than twice the decompression workspace.
+      //   So we need to be able to fall back to a smaller decompression
+      //   workspace size, and throw NOMEM when it's insufficient.
+      // * Of course, records will almost always be far smaller than 4GB.
+      //   During the first pass, we'll see every uncompressed record size
+      //   (even if the decompression worker threads terminate early), so we
+      //   can usually increase the number of worker threads before the second
+      //   pass.
+      // Overall memory allocation for first pass:
+      //   loadbuf_size (~1/7, up to 2GB) : Chromosome code/variant ID/allele
+      //                                    code load buffer.
+      //   mainbuf_size (~2/7) : Compressed genotype data buffer 0, up to 4GB
+      //                         per decompression thread
+      //   mainbuf_size        : Compressed genotype data buffer 1
+      //   mainbuf_size        : Decompression thread workspace(s)
+      // Second pass:
+      //   mainbuf_size (~1/6) : Decompression thread workspaces.
+      //   16K                 : .bgen chromosome code load buffer.
+      //   remainder (~5/6)    : Compressed genotype data buffers, writer, and
+      //                         write buffers.
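+      // Worked instance of the 4GB bound above: unphased diploid
+      // probabilities for K alleles take K(K+1)/2 - 1 stored values per
+      // sample, so K = 65535 at 4 bytes/value is ~2^31 values, i.e. ~8.6GB
+      // for a single sample.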
+      uintptr_t loadbuf_size = round_down_pow2(bigstack_left() / 7, kCacheline);
+      if (loadbuf_size > kMaxLongLine) {
+	loadbuf_size = kMaxLongLine;
+      } else if (loadbuf_size < 2 * 65536) {
+	// don't want to worry about chromosome/variant ID buffer space checks
+	// in inner loop
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+      unsigned char* loadbuf = bigstack_alloc_raw(loadbuf_size);
+
+      uintptr_t mainbuf_size = round_down_pow2(bigstack_left() / 3, kCacheline);
+      uint32_t calc_thread_ct = 1;
+      uintptr_t thread_wkspace_size;
+#ifdef __LP64__
+      // hard compressed and uncompressed record length limits of 2^31 - 1
+      // bytes, since these are represented as uint32s in the file.
+      if (mainbuf_size > 0x100000000LLU) {
+	thread_wkspace_size = 0x100000000LLU;
+	mainbuf_size &= 0xffffffff00000000LLU;
+	calc_thread_ct = mainbuf_size >> 32;
+	if (calc_thread_ct > calc_thread_ct_limit) {
+	  calc_thread_ct = calc_thread_ct_limit;
+	  mainbuf_size = ((uintptr_t)calc_thread_ct_limit) << 32;
+	}
+      } else {
+	thread_wkspace_size = mainbuf_size;
+      }
+#else
+      thread_wkspace_size = mainbuf_size;
+#endif
+      // note that thread_wkspace_size is the size limit for a compressed
+      // variant record *and* the uncompressed form
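+      // e.g. if mainbuf_size starts at ~21GB, it rounds down to 20GB and
+      // calc_thread_ct becomes 5, each decompression thread getting a 4GB
+      // workspace (still subject to calc_thread_ct_limit).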
+
+      if (main_block_size > raw_variant_ct + calc_thread_ct - 1) {
+	main_block_size = raw_variant_ct + calc_thread_ct - 1;
+      }
+      uint32_t per_thread_block_limit = main_block_size / calc_thread_ct;
+      // may as well guarantee divisibility
+      main_block_size = per_thread_block_limit * calc_thread_ct;
+      ts.calc_thread_ct = calc_thread_ct;
+      g_calc_thread_ct = calc_thread_ct;
+      unsigned char* compressed_geno_bufs[2];
+      compressed_geno_bufs[0] = bigstack_alloc_raw(mainbuf_size);
+      compressed_geno_bufs[1] = bigstack_alloc_raw(mainbuf_size);
+      for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	g_thread_wkspaces[tidx] = bigstack_alloc_raw(thread_wkspace_size);
+      }
+
+      uint32_t variant_ct = 0;
+
+      uint32_t block_vidx = 0;
+
+      // bgen-1.2 and -1.3 records can vary wildly in size, so we're a bit more
+      // careful with load balancing here.
+      uint32_t cur_per_thread_block_limit = 1;
+      uint32_t cur_thread_block_vidx_limit = 1;
+      uint32_t cur_thread_fill_idx = 0;
+
+      uint32_t parity = 0;
+      uint32_t* thread_bidxs = g_thread_bidxs[0];
+      uint16_t* bgen_allele_cts = g_bgen_allele_cts[0];
+      unsigned char** compressed_geno_starts = g_compressed_geno_starts[0];
+      uint32_t* uncompressed_genodata_byte_cts = g_uncompressed_genodata_byte_cts[0];
+      unsigned char* bgen_geno_iter = compressed_geno_bufs[0];
+      unsigned char* cur_geno_buf_end = &(bgen_geno_iter[thread_wkspace_size]);
+      thread_bidxs[0] = 0;
+      compressed_geno_starts[0] = bgen_geno_iter;
+      uintptr_t* allele_idx_offsets_iter = allele_idx_offsets;
+      uintptr_t tot_allele_ct = 0;
+      uint32_t max_geno_blen = 0;
+      uint32_t uncompressed_genodata_byte_ct = 0;
+
+      // temporary kludge
+      uint32_t multiallelic_skip_ct = 0;
+
+      g_cur_block_write_ct = 1; // just used as a flag
+
+      for (uint32_t variant_uidx = 0; variant_uidx < raw_variant_ct; ) {
+	// format is mostly identical to bgen-1.1, but there's no sample count
+	// and there is an allele count.  logic is more similar to the second
+	// bgen-1.1 pass, since we write the .pvar here.
+	uint16_t snpid_slen;
+	if (!fread(&snpid_slen, 2, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	char* rsid_start = (char*)loadbuf;
+	if (!snpid_chr) {
+	  if (fseeko(bgenfile, snpid_slen, SEEK_CUR)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	} else {
+	  if (!snpid_slen) {
+	    logprint("\n");
+	    logerrprint("Error: Length-0 SNP ID in .bgen file.\n");
+	    goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT;
+	  }
+	  if (!fread(loadbuf, snpid_slen, 1, bgenfile)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	  loadbuf[snpid_slen] = '\0';
+	  rsid_start = (char*)(&(loadbuf[snpid_slen + 1]));
+	}
+	uint16_t rsid_slen;
+	if (!fread(&rsid_slen, 2, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	if (!rsid_slen) {
+	  logprint("\n");
+	  logerrprint("Error: Length-0 rsID in .bgen file.\n");
+	  goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	}
+	if (!fread(rsid_start, rsid_slen, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	char* loadbuf_iter = &(rsid_start[rsid_slen]);
+	char* chr_name_start = loadbuf_iter;
+	uint16_t chr_name_slen;
+	if (!fread(&chr_name_slen, 2, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	if (!snpid_chr) {
+	  if (!chr_name_slen) {
+	    logprint("\n");
+	    logerrprint("Error: Length-0 chromosome ID in .bgen file.\n");
+	    goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT;
+	  }
+	  if (!fread(chr_name_start, chr_name_slen, 1, bgenfile)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	  if ((chr_name_slen == 2) && (!memcmp(chr_name_start, "NA", 2))) {
+	    strcpy(chr_name_start, "0");
+	    chr_name_slen = 1;
+	  } else {
+	    chr_name_start[chr_name_slen] = '\0';
+	  }
+	} else {
+	  if (fseeko(bgenfile, chr_name_slen, SEEK_CUR)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	  chr_name_start = (char*)loadbuf;
+	  chr_name_slen = snpid_slen;
+	}
+	// chromosome ID length restriction enforced here, so we don't check
+	// earlier
+	int32_t cur_chr_code;
+	reterr = get_or_add_chr_code_destructive("--bgen file", 0, allow_extra_chrs, (char*)chr_name_start, &(chr_name_start[chr_name_slen]), cip, &cur_chr_code);
+	if (reterr) {
+	  goto ox_bgen_to_pgen_ret_1;
+	}
+	uint32_t skip = !is_set(cip->chr_mask, cur_chr_code);
+
+	uint32_t cur_bp;
+	uint32_t cur_allele_ct = 0;
+	if ((!fread(&cur_bp, 4, 1, bgenfile)) ||
+	    (!fread(&cur_allele_ct, 2, 1, bgenfile))) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	if (cur_allele_ct < 2) {
+	  // this is undefined in the 1.3 standard; prohibit for now
+	  logprint("\n");
+	  logerrprint("Error: .bgen variant has fewer than two alleles.\n");
+	  goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	}
+	++variant_uidx;
+	if (!(variant_uidx % 1000)) {
+	  printf("\r--bgen: %uk variants scanned.", variant_uidx / 1000);
+	  fflush(stdout);
+	}
+
+	// the "cur_allele_ct > 2" part is a temporary kludge
+	if (skip || (cur_allele_ct > 2)) {
+	  if (!skip) {
+	    ++multiallelic_skip_ct;
+	  }
+	  for (uint32_t allele_idx = 0; allele_idx < cur_allele_ct; ++allele_idx) {
+	    uint32_t cur_allele_slen;
+	    if ((!fread(&cur_allele_slen, 4, 1, bgenfile)) ||
+		fseeko(bgenfile, cur_allele_slen, SEEK_CUR)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	  }
+	  uint32_t genodata_byte_ct;
+	  if ((!fread(&genodata_byte_ct, 4, 1, bgenfile)) ||
+	      fseeko(bgenfile, genodata_byte_ct, SEEK_CUR)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	  continue;
+	}
+	if (rsid_slen > kMaxIdSlen) {
+	  // enforce this iff we aren't skipping
+	  logprint("\n");
+	  logerrprint("Error: Variant names are limited to " MAX_ID_SLEN_STR " characters.\n");
+	  goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	}
+	// special handling of first two alleles since either may be
+	// reference, so we may need to swap order
+	char* a1_ptr = loadbuf_iter;
+	uint32_t a1_slen;
+	if (!fread(&a1_slen, 4, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	if (!a1_slen) {
+	  logprint("\n");
+	  logerrprint("Error: Empty allele code in .bgen file.\n");
+	  goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	}
+	if (a1_slen > 1000000000) {
+	  logprint("\n");
+	  logerrprint("Error: Allele code in .bgen file has more than 1 billion characters.\n");
+	  goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	}
+	if (a1_slen + (uintptr_t)(a1_ptr - ((char*)loadbuf)) > loadbuf_size) {
+	  goto ox_bgen_to_pgen_ret_NOMEM;
+	}
+	if (!fread(a1_ptr, a1_slen, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	char* a2_ptr = &(a1_ptr[a1_slen]);
+	uint32_t a2_slen;
+	if (!fread(&a2_slen, 4, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	if (!a2_slen) {
+	  logprint("\n");
+	  logerrprint("Error: Empty allele code in .bgen file.\n");
+	  goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	}
+	if (a2_slen > 1000000000) {
+	  logprint("\n");
+	  logerrprint("Error: Allele code in .bgen file has more than 1 billion characters.\n");
+	  goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	}
+	if (a2_slen + (uintptr_t)(a2_ptr - ((char*)loadbuf)) > loadbuf_size) {
+	  goto ox_bgen_to_pgen_ret_NOMEM;
+	}
+	if (!fread(a2_ptr, a2_slen, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	write_iter = chr_name_write(cip, cur_chr_code, write_iter);
+	*write_iter++ = '\t';
+	if (cur_bp > 0x7ffffffe) {
+	  logprint("\n");
+	  logerrprint("Error: Invalid bp coordinate (> 2^31 - 2) in .bgen file\n");
+	  goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	}
+	write_iter = uint32toa_x(cur_bp, '\t', write_iter);
+	write_iter = memcpyax(write_iter, rsid_start, rsid_slen, '\t');
+	if (prov_ref_allele_second) {
+	  const uint32_t swap_slen = a1_slen;
+	  a1_slen = a2_slen;
+	  a2_slen = swap_slen;
+	  char* swap_ptr = a1_ptr;
+	  a1_ptr = a2_ptr;
+	  a2_ptr = swap_ptr;
+	}
+	// allele codes may be too large for write buffer, so we special-case
+	// this instead of using fwrite_ck()
+	if ((write_iter >= writebuf_flush) || (a1_slen >= kMaxMediumLine)) {
+	  if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), pvarfile)) {
+	    goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	  }
+	  write_iter = writebuf;
+	}
+	if (a1_slen < kMaxMediumLine) {
+	  write_iter = memcpya(write_iter, a1_ptr, a1_slen);
+	} else {
+	  if (fwrite_checked(a1_ptr, a1_slen, pvarfile)) {
+	    goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	  }
+	}
+	*write_iter++ = '\t';
+	if ((write_iter >= writebuf_flush) || (a2_slen >= kMaxMediumLine)) {
+	  if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), pvarfile)) {
+	    goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	  }
+	  write_iter = writebuf;
+	}
+	if (a2_slen < kMaxMediumLine) {
+	  write_iter = memcpya(write_iter, a2_ptr, a2_slen);
+	} else {
+	  if (fwrite_checked(a2_ptr, a2_slen, pvarfile)) {
+	    goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	  }
+	}
+	for (uint32_t allele_idx = 2; allele_idx < cur_allele_ct; ++allele_idx) {
+	  // (can't actually reach here yet since we're skipping multiallelics
+	  // for now)
+	  // safe to use entire loadbuf for this
+	  assert(0);
+	  uint32_t cur_allele_slen;
+	  if (!fread(&cur_allele_slen, 4, 1, bgenfile)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	  if (!cur_allele_slen) {
+	    logprint("\n");
+	    logerrprint("Error: Empty allele code in .bgen file.\n");
+	    goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	  }
+	  if (cur_allele_slen > 1000000000) {
+	    logprint("\n");
+	    logerrprint("Error: Allele code in .bgen file has more than 1 billion characters.\n");
+	    goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	  }
+	  if (cur_allele_slen > loadbuf_size) {
+	    goto ox_bgen_to_pgen_ret_NOMEM;
+	  }
+	  if (!fread(loadbuf, cur_allele_slen, 1, bgenfile)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	  *write_iter++ = ',';
+	  if ((write_iter >= writebuf_flush) || (cur_allele_slen >= kMaxMediumLine)) {
+	    if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), pvarfile)) {
+	      goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	    }
+	    write_iter = writebuf;
+	  }
+	  if (cur_allele_slen < kMaxMediumLine) {
+	    write_iter = memcpya(write_iter, loadbuf, cur_allele_slen);
+	  } else {
+	    if (fwrite_checked(loadbuf, cur_allele_slen, pvarfile)) {
+	      goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+	    }
+	  }
+	}
+
+	append_binary_eoln(&write_iter);
+	*allele_idx_offsets_iter++ = tot_allele_ct;
+	tot_allele_ct += cur_allele_ct;
+	uint32_t genodata_byte_ct;
+	if (!fread(&genodata_byte_ct, 4, 1, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	if (genodata_byte_ct > max_geno_blen) {
+	  max_geno_blen = genodata_byte_ct;
+	}
+	if (uncompressed_genodata_byte_cts) {
+	  if (genodata_byte_ct < 4) {
+	    logerrprint("Error: Invalid compressed block length in .bgen file.\n");
+	    goto ox_bgen_to_pgen_ret_MALFORMED_INPUT;
+	  }
+	  if (!fread(&uncompressed_genodata_byte_ct, 4, 1, bgenfile)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	  if (uncompressed_genodata_byte_ct > max_geno_blen) {
+	    max_geno_blen = uncompressed_genodata_byte_ct;
+	  }
+	  genodata_byte_ct -= 4;
+	}
+	if (dosage_is_present) {
+	  if (fseeko(bgenfile, genodata_byte_ct, SEEK_CUR)) {
+	    goto ox_bgen_to_pgen_ret_READ_FAIL;
+	  }
+	  ++variant_ct;
+	  continue;
+	}
+
+	if ((block_vidx == cur_thread_block_vidx_limit) || ((uintptr_t)(cur_geno_buf_end - bgen_geno_iter) < genodata_byte_ct)) {
+	  if (!block_vidx) {
+	    goto ox_bgen_to_pgen_ret_NOMEM;
+	  }
+	  thread_bidxs[++cur_thread_fill_idx] = block_vidx;
+	  if (cur_thread_fill_idx == calc_thread_ct) {
+	    parity = 1 - parity;
+	    if (ts.thread_func_ptr) {
+	      // process *previous* block results
+	      join_threads3z(&ts);
+	      reterr = g_error_ret;
+	      if (reterr) {
+		goto ox_bgen_to_pgen_ret_bgen13_thread_fail;
+	      }
+	      dosage_is_present = g_dosage_is_present;
+	      if (dosage_is_present) {
+		// don't need to scan for any more dosages
+		stop_threads3z(&ts, &g_cur_block_write_ct);
+
+		// however, unlike bgen-1.1 case, we can never do full
+		// early-exit since we have to scan for multiallelic variants:
+		// writer must be initialized with (i) an accurate variant
+		// count, which is affected by skipped multiallelic variants,
+		// and (ii) when we no longer skip them, the PgenWriter
+		// constructor still needs a maximum allele count so it can
+		// allocate properly-sized buffers.
+		if (fseeko(bgenfile, genodata_byte_ct, SEEK_CUR)) {
+		  goto ox_bgen_to_pgen_ret_READ_FAIL;
+		}
+		++variant_ct;
+		continue;
+	      }
+	    }
+	    ts.thread_func_ptr = bgen13_dosage_or_phase_scan_thread;
+	    if (spawn_threads3z(variant_ct, &ts)) {
+	      goto ox_bgen_to_pgen_ret_THREAD_CREATE_FAIL;
+	    }
+	    compressed_geno_starts = g_compressed_geno_starts[parity];
+	    uncompressed_genodata_byte_cts = g_uncompressed_genodata_byte_cts[parity];
+	    thread_bidxs = g_thread_bidxs[parity];
+	    bgen_allele_cts = g_bgen_allele_cts[parity];
+	    bgen_geno_iter = compressed_geno_bufs[parity];
+	    thread_bidxs[0] = 0;
+	    compressed_geno_starts[0] = bgen_geno_iter;
+	    variant_ct += block_vidx;
+	    block_vidx = 0;
+	    if (cur_per_thread_block_limit < per_thread_block_limit) {
+	      cur_per_thread_block_limit *= 2;
+	      if (cur_per_thread_block_limit > per_thread_block_limit) {
+		cur_per_thread_block_limit = per_thread_block_limit;
+	      }
+	    }
+	    cur_thread_block_vidx_limit = 0;
+	    cur_thread_fill_idx = 0;
+	  }
+	  cur_geno_buf_end = &(bgen_geno_iter[thread_wkspace_size]);
+	  cur_thread_block_vidx_limit += cur_per_thread_block_limit;
+	}
+	bgen_allele_cts[block_vidx] = cur_allele_ct;
+	if (uncompressed_genodata_byte_cts) {
+	  uncompressed_genodata_byte_cts[block_vidx] = uncompressed_genodata_byte_ct;
+	}
+	if (fread_checked(bgen_geno_iter, genodata_byte_ct, bgenfile)) {
+	  goto ox_bgen_to_pgen_ret_READ_FAIL;
+	}
+	bgen_geno_iter = &(bgen_geno_iter[genodata_byte_ct]);
+	compressed_geno_starts[++block_vidx] = bgen_geno_iter;
+      }
+      variant_ct += block_vidx;
+      if (multiallelic_skip_ct) {
+	logprint("\n");
+	LOGERRPRINTFWW("Warning: %u multiallelic variant%s skipped (not yet supported).\n", multiallelic_skip_ct, (multiallelic_skip_ct == 1)? "" : "s");
+      }
+      if (!variant_ct) {
+	logprint("\n");
+	LOGERRPRINTF("Error: All %u variant%s in .bgen file skipped.\n", raw_variant_ct, (raw_variant_ct == 1)? "" : "s");
+	goto ox_bgen_to_pgen_ret_INCONSISTENT_INPUT;
+      }
+      if (variant_ct == block_vidx) {
+	// with multiple threads, there's no guarantee that even the first
+	// decompression job has launched (e.g. there's only 1 variant on the
+	// relevant chromosome in the entire .bgen, and calc_thread_ct == 2).
+	// (this is not an issue with the bgen-1.1 converter because we error
+	// out on variant_ct == 0, and the first block size is 1.)
+	thread_bidxs[cur_thread_fill_idx + 1] = block_vidx;
+	ts.thread_func_ptr = bgen13_dosage_or_phase_scan_thread;
+	if (spawn_threads3z(variant_ct, &ts)) {
+	  goto ox_bgen_to_pgen_ret_THREAD_CREATE_FAIL;
+	}
+	block_vidx = 0;
+      }
+      chr_filter_present = (variant_ct + multiallelic_skip_ct != raw_variant_ct);
+      if (ts.thread_func_ptr) {
+	join_threads3z(&ts);
+	reterr = g_error_ret;
+	if (reterr) {
+	  goto ox_bgen_to_pgen_ret_bgen13_thread_fail;
+	}
+	if ((!block_vidx) || g_dosage_is_present) {
+	  // ignore thread_bidxs[] in this case
+	  g_cur_block_write_ct = 0;
+	} else {
+	  for (; cur_thread_fill_idx < calc_thread_ct; ) {
+	    // save endpoint for current thread, and tell any leftover threads
+	    // to do nothing
+	    thread_bidxs[++cur_thread_fill_idx] = block_vidx;
+	  }
+	}
+	ts.is_last_block = 1;
+	if (spawn_threads3z(1, &ts)) {
+	  goto ox_bgen_to_pgen_ret_THREAD_CREATE_FAIL;
+	}
+	join_threads3z(&ts);
+	dosage_is_present = g_dosage_is_present;
+      }
+
+      if (tot_allele_ct == variant_ct * 2) {
+	allele_idx_offsets = nullptr;
+	bigstack_end_reset(bigstack_end_mark);
+      } else {
+	// not yet possible
+	assert(0);
+	*allele_idx_offsets_iter = tot_allele_ct;
+      }
+      if (fseeko(bgenfile, initial_uints[0] + 4, SEEK_SET)) {
+	goto ox_bgen_to_pgen_ret_READ_FAIL;
+      }
+      strcpy(outname_end, ".pgen");
+      uintptr_t spgw_alloc_cacheline_ct;
+      uint32_t max_vrec_len;
+      reterr = spgw_init_phase1(outname, allele_idx_offsets, nullptr, variant_ct, sample_ct, dosage_is_present? (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent) : kfPgenGlobal0, (oxford_import_flags & (kfOxfordImportRefFirst | kfOxfordImportRefSecond))? 1 : 2, &spgw, &spgw_alloc_cacheline_ct, &max_vrec_len);
+      if (reterr) {
+	goto ox_bgen_to_pgen_ret_1;
+      }
+
+      bigstack_reset(g_bgen_allele_cts[0]);
+
+      // only needs to fit chromosome codes in second pass
+      loadbuf = bigstack_alloc_raw_rd(kMaxIdBlen);
+      unsigned char* spgw_alloc;
+      if (bigstack_alloc_uc(spgw_alloc_cacheline_ct * kCacheline, &spgw_alloc)) {
+	logerrprint("error path 5\n");
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+      // Now that we know max_geno_blen, try to increase calc_thread_ct, and
+      // resize g_thread_wkspaces[tidx] (and also resize compressed_geno_bufs[]
+      // in next step).
+      // The extra factor of 6 in the denominator limits these allocations
+      // to 1/6 of the remaining workspace.
+      thread_wkspace_size = round_up_pow2(max_geno_blen, kCacheline);
+      // bugfix (16 Jul 2017): was computing cachelines_avail, not bytes_avail
+      uintptr_t bytes_avail = round_down_pow2(bigstack_left() / 6, kCacheline);
+      if (calc_thread_ct_limit * thread_wkspace_size <= bytes_avail) {
+	calc_thread_ct = calc_thread_ct_limit;
+      } else {
+	calc_thread_ct = bytes_avail / thread_wkspace_size;
+	if (!calc_thread_ct) {
+	  goto ox_bgen_to_pgen_ret_NOMEM;
+	}
+      }
+      ts.calc_thread_ct = calc_thread_ct;
+      for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	g_thread_wkspaces[tidx] = bigstack_alloc_raw(thread_wkspace_size);
+      }
+      bytes_avail -= thread_wkspace_size * calc_thread_ct;
+      // Per-write-buffer-variant allocations:
+      //   g_bgen_allele_cts: 2 * sizeof(int16_t)
+      //   g_compressed_geno_starts: 2 * sizeof(intptr_t)
+      //   g_uncompressed_genodata_byte_cts: 2 * sizeof(int32_t)
+      //     (unless compression_mode == 0)
+      //   g_write_genovecs: 2 * sample_ctaw2 * sizeof(intptr_t)
+      //   g_write_phasepresents: 2 * sample_ctaw * sizeof(intptr_t)
+      //   g_write_phaseinfos: 2 * sample_ctaw * sizeof(intptr_t)
+      //   g_write_dosage_presents: 2 * sample_ctaw * sizeof(intptr_t)
+      //   g_write_dphase_presents: 2 * sample_ctaw * sizeof(intptr_t)
+      //   g_write_dosage_cts: 2 * sizeof(int32_t)
+      //   g_write_dphase_cts: 2 * sizeof(int32_t)
+      //   g_write_dosage_val_bufs (the big one): 2 * sample_ct * 2 *
+      //                                          sizeof(dosage_t)
+      //     additional factor of 2 here is due to phased-dosage support.  (not
+      //     actually implemented yet, but will be soon.)
+      //   g_compressed_geno_bufs (the other big one): up to 2 * max_geno_blen
+      //     The "up to" here is due to the possibility that a few variants
+      //     require much more space than the rest; unlikely now, but will be
+      //     important when multiallelic support is added.  To defend against
+      //     that possibility, we limit g_compressed_geno_bufs[] to 50% of the
+      //     total allocation here.
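+      // On an LP64 build (dosage_t == uint16_t), the fixed part of this
+      // works out to 2 * (18 + 8 * sample_ctaw2 + 32 * sample_ctaw +
+      // 4 * sample_ct) bytes per variant, before the compression-mode and
+      // compressed-genotype terms added below.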
+      uintptr_t cachelines_avail_m24 = bigstack_left() / kCacheline;
+      if (cachelines_avail_m24 < 24) {
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+      // we're making up to 24 allocations; be pessimistic re: rounding
+      // (g_compressed_geno_starts has +1, but we have enough room for error)
+      cachelines_avail_m24 -= 24;
+      uintptr_t bytes_req_per_in_block_variant = 2 * (sizeof(int16_t) + sizeof(intptr_t) + sample_ctaw2 * sizeof(intptr_t) + sample_ctaw * 4 * sizeof(intptr_t) + 2 * sizeof(int32_t) + sample_ct * 2 * sizeof(dosage_t));
+      if (compression_mode) {
+	bytes_req_per_in_block_variant += 2 * sizeof(int32_t);
+      }
+      // 50% cap
+      mainbuf_size = MINV(max_geno_blen, bytes_req_per_in_block_variant);
+      // bugfix (16 Jul 2017): forgot to include this term
+      // (17 Jul 2017): forgot to multiply by 2
+      bytes_req_per_in_block_variant += 2 * mainbuf_size;
+      main_block_size = (cachelines_avail_m24 * kCacheline) / bytes_req_per_in_block_variant;
+      if (main_block_size > 65536) {
+	main_block_size = 65536;
+      }
+      if (main_block_size > raw_variant_ct + calc_thread_ct - 1) {
+	main_block_size = raw_variant_ct + calc_thread_ct - 1;
+      }
+      per_thread_block_limit = main_block_size / calc_thread_ct;
+      // may as well guarantee divisibility
+      main_block_size = per_thread_block_limit * calc_thread_ct;
+      mainbuf_size *= main_block_size;
+      if (mainbuf_size < max_geno_blen) {
+	// bugfix (2 Jul 2017): don't error out here if the entire .bgen has
+	// e.g. only one variant
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+      if (bigstack_alloc_usi(main_block_size, &(g_bgen_allele_cts[0])) ||
+	  bigstack_alloc_usi(main_block_size, &(g_bgen_allele_cts[1])) ||
+	  bigstack_alloc_ucp(main_block_size + 1, &(g_compressed_geno_starts[0])) ||
+	  bigstack_alloc_ucp(main_block_size + 1, &(g_compressed_geno_starts[1])) ||
+	  bigstack_alloc_uc(mainbuf_size, &(compressed_geno_bufs[0])) ||
+	  bigstack_alloc_uc(mainbuf_size, &(compressed_geno_bufs[1])) ||
+	  bigstack_alloc_ul(sample_ctaw2 * main_block_size, &(g_write_genovecs[0])) ||
+	  bigstack_alloc_ul(sample_ctaw2 * main_block_size, &(g_write_genovecs[1])) ||
+	  bigstack_alloc_ul(sample_ctaw * main_block_size, &(g_write_phasepresents[0])) ||
+	  bigstack_alloc_ul(sample_ctaw * main_block_size, &(g_write_phasepresents[1])) ||
+	  bigstack_alloc_ul(sample_ctaw * main_block_size, &(g_write_phaseinfos[0])) ||
+	  bigstack_alloc_ul(sample_ctaw * main_block_size, &(g_write_phaseinfos[1])) ||
+	  bigstack_alloc_ul(sample_ctaw * main_block_size, &(g_write_dosage_presents[0])) ||
+	  bigstack_alloc_ul(sample_ctaw * main_block_size, &(g_write_dosage_presents[1])) ||
+	  bigstack_alloc_ul(sample_ctaw * main_block_size, &(g_write_dphase_presents[0])) ||
+	  bigstack_alloc_ul(sample_ctaw * main_block_size, &(g_write_dphase_presents[1])) ||
+	  bigstack_alloc_ui(main_block_size, &(g_write_dosage_cts[0])) ||
+	  bigstack_alloc_ui(main_block_size, &(g_write_dosage_cts[1])) ||
+	  bigstack_alloc_ui(main_block_size, &(g_write_dphase_cts[0])) ||
+	  bigstack_alloc_ui(main_block_size, &(g_write_dphase_cts[1])) ||
+	  bigstack_alloc_dosage(sample_ct * 2 * main_block_size, &(g_write_dosage_val_bufs[0])) ||
+	  bigstack_alloc_dosage(sample_ct * 2 * main_block_size, &(g_write_dosage_val_bufs[1]))) {
+	// this should be impossible
+	logerrprint("error path 6\n");
+	LOGERRPRINTF("main_block_size: %" PRIuPTR "\n", main_block_size);
+	LOGERRPRINTF("mainbuf_size: %" PRIuPTR "\n", mainbuf_size);
+	LOGERRPRINTF("sample_ctaw: %u\n", sample_ctaw);
+	LOGERRPRINTF("cachelines_avail_m24: %" PRIuPTR "\n", cachelines_avail_m24);
+	LOGERRPRINTF("bytes_req_per_in_block_variant: %" PRIuPTR "\n", bytes_req_per_in_block_variant);
+	assert(0);
+	goto ox_bgen_to_pgen_ret_NOMEM;
+      }
+      if (compression_mode) {
+	if (bigstack_alloc_ui(main_block_size, &(g_uncompressed_genodata_byte_cts[0])) ||
+	    bigstack_alloc_ui(main_block_size, &(g_uncompressed_genodata_byte_cts[1]))) {
+	  logerrprint("error path 7\n");
+	  assert(0);
+	  goto ox_bgen_to_pgen_ret_NOMEM;
+	}
+      }
+      spgw_init_phase2(max_vrec_len, &spgw, spgw_alloc);
+
+      // Main workflow:
+      // 1. Set n=0, load genotype data for first main_block_size variants
+      //    (the .pvar was already written during the scanning pass above)
+      //
+      // 2. Spawn threads processing batch n genotype data
+      // 3. If n>0, write results for block (n-1)
+      // 4. Increment n by 1
+      // 5. Load genotype data for batch (n+1) unless eof
+      // 6. Join threads
+      // 7. Goto step 2 unless eof
+      //
+      // 8. Write results for last block
+      //
+      // (May be better to change this to use one output buffer instead of
+      // two, due to the high memory requirement.)
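+      // Pipeline sketch for three batches (L = load, C = compute,
+      // W = write):  L0 | C0+L1 | W0+C1+L2 | W1+C2 | W2.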
+      uint32_t vidx_start = 0;
+      uint32_t prev_block_write_ct = 0;
+      uint32_t prev_genodata_byte_ct = 0;
+      uint32_t prev_allele_ct = 0;
+      uint32_t skip = 0;
+      parity = 0;
+      reinit_threads3z(&ts);
+      g_cur_block_write_ct = 1;
+      while (1) {
+	uint32_t cur_block_write_ct = 0;
+	if (!ts.is_last_block) {
+	  const uint32_t block_vidx_limit = variant_ct - vidx_start;
+	  cur_thread_block_vidx_limit = MINV(block_vidx_limit, per_thread_block_limit);
+	  cur_thread_fill_idx = 0;
+	  thread_bidxs = g_thread_bidxs[parity];
+	  bgen_allele_cts = g_bgen_allele_cts[parity];
+	  compressed_geno_starts = g_compressed_geno_starts[parity];
+	  uncompressed_genodata_byte_cts = g_uncompressed_genodata_byte_cts[parity];
+	  bgen_geno_iter = compressed_geno_bufs[parity];
+	  cur_geno_buf_end = &(bgen_geno_iter[thread_wkspace_size]);
+	  thread_bidxs[0] = 0;
+	  compressed_geno_starts[0] = bgen_geno_iter;
+	  block_vidx = 0;
+	  // strictly speaking, prev_genodata_byte_ct and genodata_byte_ct
+	  // can be collapsed into one variable, as well as
+	  // {block_vidx, cur_block_write_ct}, but not a big deal if the
+	  // compiler fails to see this
+	  uint32_t genodata_byte_ct = prev_genodata_byte_ct;
+	  uint32_t cur_allele_ct = prev_allele_ct;
+	  if (!genodata_byte_ct) {
+	    goto ox_bgen_to_pgen_load13_start;
+	  }
+	  // we may stop before main_block_size due to insufficient space in
+	  // compressed_geno_buf.  if so, the file pointer is right before the
+	  // genotype data, rather than at the beginning of a variant record.
+	  while (1) {
+	    bgen_allele_cts[block_vidx] = cur_allele_ct;
+	    if (uncompressed_genodata_byte_cts) {
+	      uncompressed_genodata_byte_cts[block_vidx] = uncompressed_genodata_byte_ct;
+	    }
+	    if (fread_checked(bgen_geno_iter, genodata_byte_ct, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    bgen_geno_iter = &(bgen_geno_iter[genodata_byte_ct]);
+	    compressed_geno_starts[++block_vidx] = bgen_geno_iter;
+
+	    uint16_t snpid_slen;
+	    // true iff this is the last variant we're keeping in the entire
+	    // file
+	    if (block_vidx == block_vidx_limit) {
+	      for (; cur_thread_fill_idx < calc_thread_ct; ) {
+		// save endpoint for current thread, and tell any leftover
+		// threads to do nothing
+		thread_bidxs[++cur_thread_fill_idx] = block_vidx;
+	      }
+	      break;
+	    }
+	  ox_bgen_to_pgen_load13_start:
+	    if (!fread(&snpid_slen, 2, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+
+	    if (!snpid_chr) {
+	      if (fseeko(bgenfile, snpid_slen, SEEK_CUR)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	    } else {
+	      if (!fread(loadbuf, snpid_slen, 1, bgenfile)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	      loadbuf[snpid_slen] = '\0';
+	    }
+	    uint16_t rsid_slen;
+	    if (!fread(&rsid_slen, 2, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    if (fseeko(bgenfile, rsid_slen, SEEK_CUR)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    uint16_t chr_name_slen;
+	    if (!fread(&chr_name_slen, 2, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    if (!snpid_chr) {
+	      if (!fread(loadbuf, chr_name_slen, 1, bgenfile)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	      if ((chr_name_slen == 2) && (!memcmp(loadbuf, "NA", 2))) {
+		strcpy((char*)loadbuf, "0");
+		chr_name_slen = 1;
+	      } else {
+		loadbuf[chr_name_slen] = '\0';
+	      }
+	    } else {
+	      if (fseeko(bgenfile, chr_name_slen, SEEK_CUR)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	      chr_name_slen = snpid_slen;
+	    }
+	    if (chr_filter_present) {
+	      const int32_t cur_chr_code = get_chr_code((char*)loadbuf, cip, chr_name_slen);
+	      assert(cur_chr_code >= 0); // we scanned all the variants
+	      skip = !is_set(cip->chr_mask, cur_chr_code);
+	    }
+
+	    uint32_t cur_bp; // ignore in this pass
+	    if (!fread(&cur_bp, 4, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+
+	    cur_allele_ct = 0;
+	    if (!fread(&cur_allele_ct, 2, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+	    for (uint32_t allele_idx = 0; allele_idx < cur_allele_ct; ++allele_idx) {
+	      uint32_t allele_slen;
+	      if (!fread(&allele_slen, 4, 1, bgenfile)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	      if (fseeko(bgenfile, allele_slen, SEEK_CUR)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	    }
+	    if (!fread(&genodata_byte_ct, 4, 1, bgenfile)) {
+	      goto ox_bgen_to_pgen_ret_READ_FAIL;
+	    }
+
+	    // "cur_allele_ct > 2" is temporary kludge
+	    if (skip || (cur_allele_ct > 2)) {
+	      if (fseeko(bgenfile, genodata_byte_ct, SEEK_CUR)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	      goto ox_bgen_to_pgen_load13_start;
+	    }
+	    if (uncompressed_genodata_byte_cts) {
+	      if (!fread(&uncompressed_genodata_byte_ct, 4, 1, bgenfile)) {
+		goto ox_bgen_to_pgen_ret_READ_FAIL;
+	      }
+	      genodata_byte_ct -= 4;
+	    }
+
+	    if ((block_vidx == cur_thread_block_vidx_limit) || ((uintptr_t)(cur_geno_buf_end - bgen_geno_iter) < genodata_byte_ct)) {
+	      thread_bidxs[++cur_thread_fill_idx] = block_vidx;
+	      if (cur_thread_fill_idx == calc_thread_ct) {
+		prev_allele_ct = cur_allele_ct;
+		prev_genodata_byte_ct = genodata_byte_ct;
+		break;
+	      }
+	      cur_geno_buf_end = &(bgen_geno_iter[thread_wkspace_size]);
+	      cur_thread_block_vidx_limit = MINV(cur_thread_block_vidx_limit + per_thread_block_limit, block_vidx_limit);
+	    }
+	  }
+	  cur_block_write_ct = block_vidx;
+	}
+	if (vidx_start) {
+	  join_threads3z(&ts);
+	  reterr = g_error_ret;
+	  if (reterr) {
+	    goto ox_bgen_to_pgen_ret_bgen13_thread_fail;
+	  }
+	}
+	if (!ts.is_last_block) {
+	  ts.is_last_block = (vidx_start + cur_block_write_ct == variant_ct);
+	  ts.thread_func_ptr = bgen13_geno_to_pgen_thread;
+	  if (spawn_threads3z(vidx_start, &ts)) {
+	    goto ox_bgen_to_pgen_ret_THREAD_CREATE_FAIL;
+	  }
+	}
+	parity = 1 - parity;
+	if (vidx_start) {
+	  // write *previous* block results
+	  const uintptr_t* write_genovec_iter = g_write_genovecs[parity];
+	  // const uintptr_t* write_phasepresents = g_write_phasepresents[parity];
+	  // const uintptr_t* write_phaseinfos = g_write_phaseinfos[parity];
+	  const uintptr_t* write_dosage_presents = g_write_dosage_presents[parity];
+	  // const uintptr_t* write_dphase_presents = g_write_dphase_presents[parity];
+	  const uint32_t* write_dosage_cts = g_write_dosage_cts[parity];
+	  const uint32_t* write_dphase_cts = g_write_dphase_cts[parity];
+	  const dosage_t* write_dosage_val_bufs = g_write_dosage_val_bufs[parity];
+	  for (uintptr_t block_vidx = 0; block_vidx < prev_block_write_ct; ++block_vidx) {
+	    const uint32_t cur_dosage_ct = write_dosage_cts[block_vidx];
+	    const uint32_t cur_dphase_ct = write_dphase_cts[block_vidx];
+	    if (!cur_dphase_ct) {
+	      if (!cur_dosage_ct) {
+		if (spgw_append_biallelic_genovec(write_genovec_iter, &spgw)) {
+		  goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+		}
+	      } else {
+		if (spgw_append_biallelic_genovec_dosage16(write_genovec_iter, &(write_dosage_presents[block_vidx * sample_ctaw]), &(write_dosage_val_bufs[block_vidx * 2 * sample_ct]), cur_dosage_ct, &spgw)) {
+		  goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+		}
+	      }
+	    } else {
+	      // todo
+	    }
+	    write_genovec_iter = &(write_genovec_iter[sample_ctaw2]);
+	  }
+	}
+	if (vidx_start == variant_ct) {
+	  break;
+	}
+	if (vidx_start) {
+	  printf("\r--bgen: %uk variants converted.", vidx_start / 1000);
+	  if (vidx_start <= main_block_size) {
+	    fputs("    \b\b\b\b", stdout);
+	  }
+	  fflush(stdout);
+	}
+	vidx_start += cur_block_write_ct;
+	prev_block_write_ct = cur_block_write_ct;
+      }
+    }
+    if (write_iter != writebuf) {
+      if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), pvarfile)) {
+	goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&pvarfile)) {
+      goto ox_bgen_to_pgen_ret_WRITE_FAIL;
+    }
+
+    spgw_finish(&spgw);
+    putc_unlocked('\r', stdout);
+    write_iter = strcpya(g_logbuf, "--bgen: ");
+    const uint32_t outname_base_slen = (uintptr_t)(outname_end - outname);
+    write_iter = memcpya(write_iter, outname, outname_base_slen + 5);
+    write_iter = memcpyl3a(write_iter, " + ");
+    write_iter = memcpya(write_iter, outname, outname_base_slen);
+    write_iter = strcpya(write_iter, ".pvar");
+    write_iter = strcpya(write_iter, " written");
+    if (!dosage_is_present) {
+      write_iter = strcpya(write_iter, " (only hardcalls)");
+    }
+    strcpy(write_iter, ".\n");
+    wordwrapb(0);
+    logprintb();
+  }
+  while (0) {
+  ox_bgen_to_pgen_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  ox_bgen_to_pgen_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  ox_bgen_to_pgen_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  ox_bgen_to_pgen_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  ox_bgen_to_pgen_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  ox_bgen_to_pgen_ret_INCONSISTENT_INPUT_DELIM:
+    logerrprintb();
+    if (id_delim == '_') {
+      logerrprint("If you do not want '_' to be treated as a FID/IID delimiter, use --double-id or\n--const-fid to choose a different method of converting .bgen sample IDs to\nPLINK IDs, or --id-delim to change the FID/IID delimiter.\n");
+    }
+    reterr = kPglRetInconsistentInput;
+    break;
+  ox_bgen_to_pgen_ret_MALFORMED_INPUT_LONG_ID:
+    logerrprint("Error: FIDs and IIDs are limited to " MAX_ID_SLEN_STR " characters.\n");
+    reterr = kPglRetMalformedInput;
+    break;
+  ox_bgen_to_pgen_ret_INCONSISTENT_INPUT_2:
+    logerrprintb();
+  ox_bgen_to_pgen_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  ox_bgen_to_pgen_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  ox_bgen_to_pgen_ret_bgen13_thread_fail:
+    if (reterr == kPglRetMalformedInput) {
+      logprint("\n");
+      logerrprint("Error: Invalid compressed SNP block in .bgen file.\n");
+    } else if (reterr == kPglRetNotYetSupported) {
+      logprint("\n");
+      logerrprint("Error: BGEN import doesn't currently support phased variants, >16-bit\nprobability precision, or ploidy > 2.\n");
+    }
+  }
+ ox_bgen_to_pgen_ret_1:
+  if (spgw_cleanup(&spgw) && (!reterr)) {
+    reterr = kPglRetWriteFail;
+  }
+  threads3z_cleanup(&ts, &g_cur_block_write_ct);
+  fclose_cond(bgenfile);
+  fclose_cond(psamfile);
+  fclose_cond(pvarfile);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  return reterr;
+}
+
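+// Appends the POS/ID/REF/ALT portion of one .pvar line from a .legend-style
+// "id pos a1 a2" token sequence; e.g. "rs123 10583 A G" yields
+// "\t10583\trs123\tA\tG" (allele order swapped when prov_ref_allele_second
+// is set).  The caller has already written the chromosome column.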
+boolerr_t import_legend_cols(const char* fname, uintptr_t line_idx, uint32_t prov_ref_allele_second, char** loadbuf_iter_ptr, char** write_iter_ptr, uint32_t* variant_ct_ptr) {
+  {
+    if (*variant_ct_ptr == 0x7ffffffd) {
+      logerrprint("Error: " PROG_NAME_STR " does not support more than 2^31 - 3 variants.  We recommend other\nsoftware, such as PLINK/SEQ, for very deep studies of small numbers of genomes.\n");
+      return 1;
+    }
+    *variant_ct_ptr += 1;
+    char* write_iter = *write_iter_ptr;
+    *write_iter++ = '\t';
+    char* id_start = *loadbuf_iter_ptr;
+    char* id_end = token_endnn(id_start);
+    const uint32_t id_slen = (uintptr_t)(id_end - id_start);
+    if (id_slen > kMaxIdSlen) {
+      logerrprint("Error: Variant names are limited to " MAX_ID_SLEN_STR " characters.\n");
+      return 1;
+    }
+    char* pos_str = skip_initial_spaces(id_end);
+    // skip_initial_spaces() never returns nullptr; check for end-of-line
+    // instead, as with the allele columns below
+    if (is_eoln_kns(*pos_str)) {
+      goto import_legend_cols_ret_MISSING_TOKENS;
+    }
+    char* pos_end = token_endnn(pos_str);
+    uint32_t cur_bp;
+    if (scan_uint_defcap(pos_str, &cur_bp)) {
+      LOGPREPRINTFWW("Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, fname);
+      return 1;
+    }
+    write_iter = uint32toa_x(cur_bp, '\t', write_iter);
+    write_iter = memcpyax(write_iter, id_start, id_slen, '\t');
+    char* first_allele_str = skip_initial_spaces(pos_end);
+    if (is_eoln_kns(*first_allele_str)) {
+      goto import_legend_cols_ret_MISSING_TOKENS;
+    }
+    char* first_allele_end = token_endnn(first_allele_str);
+    char* second_allele_str = skip_initial_spaces(first_allele_end);
+    if (is_eoln_kns(*second_allele_str)) {
+      goto import_legend_cols_ret_MISSING_TOKENS;
+    }
+    char* second_allele_end = token_endnn(second_allele_str);
+    if (!prov_ref_allele_second) {
+      write_iter = memcpyax(write_iter, first_allele_str, (uintptr_t)(first_allele_end - first_allele_str), '\t');
+      write_iter = memcpya(write_iter, second_allele_str, (uintptr_t)(second_allele_end - second_allele_str));
+    } else {
+      write_iter = memcpyax(write_iter, second_allele_str, (uintptr_t)(second_allele_end - second_allele_str), '\t');
+      write_iter = memcpya(write_iter, first_allele_str, (uintptr_t)(first_allele_end - first_allele_str));
+    }
+    *write_iter_ptr = write_iter;
+    append_binary_eoln(write_iter_ptr);
+    *loadbuf_iter_ptr = second_allele_end;
+    return 0;
+  }
+  {
+  import_legend_cols_ret_MISSING_TOKENS:
+    LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, fname);
+    return 1;
+  }
+}
+
+pglerr_t scan_haps_for_het(char* loadbuf_iter, const char* hapsname, uint32_t sample_ct, uint32_t is_haploid_or_mt, uintptr_t line_idx_haps, uint32_t* at_least_one_het_ptr) {
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+      const uint32_t first_hap_char_code = (uint32_t)((unsigned char)(*loadbuf_iter));
+      const uint32_t first_hap_int = first_hap_char_code - 48;
+      // will .haps files ever support triallelic variants?  don't worry about
+      // that for now
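+      // ASCII arithmetic: codes <= 32 are whitespace/EOLN, 45 is '-', and
+      // 48/49 are '0'/'1', so first_hap_int < 2 iff the call is 0 or 1.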
+      char* post_first_hap = &(loadbuf_iter[1]);
+      if ((first_hap_int >= 2) || ((unsigned char)(*post_first_hap) > 32)) {
+	if (first_hap_char_code <= 32) {
+	  goto scan_haps_for_het_ret_MISSING_TOKENS;
+	}
+	goto scan_haps_for_het_ret_INVALID_TOKEN;
+      }
+      char* second_hap = skip_initial_spaces(post_first_hap);
+      char* post_second_hap = &(second_hap[1]);
+      const uint32_t second_hap_char_code = (uint32_t)((unsigned char)(*second_hap));
+      const uint32_t second_hap_int = second_hap_char_code - 48;
+      const uint32_t post_second_hap_char_code = (uint32_t)((unsigned char)(*post_second_hap));
+      if ((second_hap_int >= 2) || (post_second_hap_char_code > 32)) {
+	// if haploid or MT, permit '-' in second column
+	if ((!is_haploid_or_mt) || (second_hap_char_code != 45)) {
+	  if (second_hap_char_code <= 32) {
+	    goto scan_haps_for_het_ret_MISSING_TOKENS;
+	  }
+	  if ((second_hap_char_code == 45) && (post_second_hap_char_code <= 32)) {
+	    goto scan_haps_for_het_ret_HAPLOID_TOKEN;
+	  }
+	  goto scan_haps_for_het_ret_INVALID_TOKEN;
+	}
+      } else if (first_hap_int != second_hap_int) {
+	*at_least_one_het_ptr = 1;
+	break;
+      }
+      loadbuf_iter = skip_initial_spaces(post_second_hap);
+    }
+  }
+  while (0) {
+  scan_haps_for_het_ret_HAPLOID_TOKEN:
+    sprintf(g_logbuf, "Error: Haploid/MT-only token on line %" PRIuPTR " of %s.\n", line_idx_haps, hapsname);
+    reterr = kPglRetInconsistentInput;
+    break;
+  scan_haps_for_het_ret_INVALID_TOKEN:
+    sprintf(g_logbuf, "Error: Invalid token on line %" PRIuPTR " of %s.\n", line_idx_haps, hapsname);
+    reterr = kPglRetMalformedInput;
+    break;
+  scan_haps_for_het_ret_MISSING_TOKENS:
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx_haps, hapsname);
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+  return reterr;
+}
+
+#ifdef __arm__
+  #error "Unaligned accesses in ox_hapslegend_to_pgen()."
+#endif
+pglerr_t ox_hapslegend_to_pgen(const char* hapsname, const char* legendname, const char* samplename, const char* ox_single_chr_str, const char* ox_missing_code, misc_flags_t misc_flags, oxford_import_t oxford_import_flags, char* outname, char* outname_end, chr_info_t* cip) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  gzFile gz_hapsfile = nullptr;
+  gzFile gz_legendfile = nullptr;
+  FILE* outfile = nullptr;
+  uintptr_t line_idx_haps = 0;
+  uintptr_t line_idx_legend = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  st_pgen_writer_t spgw;
+  uintptr_t loadbuf_size;
+  spgw_preinit(&spgw);
+  {
+    uint32_t sfile_sample_ct = 0;
+    if (samplename[0]) {
+      reterr = ox_sample_to_psam(samplename, ox_missing_code, misc_flags, outname, outname_end, &sfile_sample_ct);
+      if (reterr) {
+	goto ox_hapslegend_to_pgen_ret_1;
+      }
+      if (sfile_sample_ct > (kMaxLongLine / 4)) {
+	logerrprint("Error: Too many samples for .haps file converter.\n");
+	reterr = kPglRetNotYetSupported;
+	goto ox_hapslegend_to_pgen_ret_1;
+      }
+    }
+
+    reterr = gzopen_read_checked(hapsname, &gz_hapsfile);
+    if (reterr) {
+      goto ox_hapslegend_to_pgen_ret_1;
+    }
+    uintptr_t writebuf_size = bigstack_left() / 2;
+    if (writebuf_size < kMaxMediumLine + kCompressStreamBlock + kCacheline) {
+      // take the error path so gz_hapsfile is closed during cleanup
+      goto ox_hapslegend_to_pgen_ret_NOMEM;
+#ifdef __LP64__
+      // in 32-bit case, kMaxLongLine + kCompressStreamBlock overflows
+    } else if (writebuf_size > kMaxLongLine + kCompressStreamBlock) {
+      writebuf_size = kMaxLongLine + kCompressStreamBlock;
+#endif
+    } else {
+      writebuf_size &= ~(kCacheline - 1);
+    }
+    loadbuf_size = writebuf_size - kCompressStreamBlock;
+    char* loadbuf = (char*)bigstack_alloc_raw(loadbuf_size);
+    loadbuf[loadbuf_size - 1] = ' ';
+    char* writebuf = (char*)bigstack_alloc_raw(writebuf_size);
+    char* writebuf_flush = &(writebuf[kCompressStreamBlock]);
+    strcpy(outname_end, ".pvar");
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto ox_hapslegend_to_pgen_ret_OPEN_FAIL;
+    }
+    char* write_iter = strcpya(writebuf, "#CHROM\tPOS\tID\tREF\tALT" EOLN_STR);
+    char* loadbuf_first_token;
+    do {
+      ++line_idx_haps;
+      if (!gzgets(gz_hapsfile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_hapsfile)) {
+	  goto ox_hapslegend_to_pgen_ret_READ_FAIL;
+	}
+	sprintf(g_logbuf, "Error: %s is empty.\n", hapsname);
+	goto ox_hapslegend_to_pgen_ret_INCONSISTENT_INPUT_WW;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto ox_hapslegend_to_pgen_ret_LONG_LINE_HAP;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+    } while (is_eoln_kns(*loadbuf_first_token));
+    const uint32_t token_ct = count_tokens(loadbuf_first_token);
+    // pass 1: count variants, write .pvar file, may as well also verify
+    // there's at least one heterozygous call
+    finalize_chrset(misc_flags, cip);
+    const uint32_t allow_extra_chrs = (misc_flags / kfMiscAllowExtraChrs) & 1;
+    const int32_t mt_code = cip->xymt_codes[kChrOffsetMT];
+    const uint32_t prov_ref_allele_second = !(oxford_import_flags & kfOxfordImportRefFirst);
+    uint32_t at_least_one_het = 0;
+    uintptr_t variant_skip_ct = 0;
+    uint32_t variant_ct = 0;
+    uint32_t is_haploid_or_mt = 0;
+    uint32_t sample_ct;
+    // support both .haps + .legend (.haps expected to contain no header
+    // columns), and pure .haps
+    if (legendname[0]) {
+      assert(ox_single_chr_str);
+      if (token_ct % 2) {
+	sprintf(g_logbuf, "Error: %s has an odd number of tokens in the first line. (With --haps + --legend, the .haps file is expected to have no header columns.)\n", hapsname);
+	goto ox_hapslegend_to_pgen_ret_MALFORMED_INPUT_WW;
+      }
+      sample_ct = token_ct / 2;
+      if (sfile_sample_ct && (sfile_sample_ct != sample_ct)) {
+	sprintf(g_logbuf, "Error: .sample file has %u sample%s, while %s has %u.\n", sfile_sample_ct, (sfile_sample_ct == 1)? "" : "s", hapsname, sample_ct);
+	goto ox_hapslegend_to_pgen_ret_INCONSISTENT_INPUT_WW;
+      }
+      if (gzrewind(gz_hapsfile)) {
+	goto ox_hapslegend_to_pgen_ret_READ_FAIL;
+      }
+      line_idx_haps = 0;
+      int32_t chr_code_raw = get_chr_code_raw(ox_single_chr_str);
+      const char* single_chr_str = nullptr;
+      uint32_t single_chr_slen;
+      char chr_buf[8]; // nothing longer than e.g. "chrMT" for now
+      if (chr_code_raw == -1) {
+	// command-line parser guarantees that allow_extra_chrs is true here
+	single_chr_str = ox_single_chr_str;
+	single_chr_slen = strlen(ox_single_chr_str);
+      } else {
+	uint32_t chr_code = chr_code_raw;
+	if (chr_code > cip->max_code) {
+	  if (chr_code < kMaxContigs) {
+	    logerrprint("Error: --legend chromosome code is not in the chromosome set.\n");
+	    goto ox_hapslegend_to_pgen_ret_INVALID_CMDLINE;
+	  }
+	  chr_code = cip->xymt_codes[chr_code - kMaxContigs];
+	  if (((int32_t)chr_code) < 0) {
+	    logerrprint("Error: --legend chromosome code is not in the chromosome set.\n");
+	    goto ox_hapslegend_to_pgen_ret_INVALID_CMDLINE;
+	  }
+	}
+	if (!is_set(cip->chr_mask, chr_code)) {
+	  logerrprint("Error: --legend chromosome code is excluded by chromosome filter.\n");
+	  goto ox_hapslegend_to_pgen_ret_INVALID_CMDLINE;
+	}
+	is_haploid_or_mt = is_set(cip->haploid_mask, chr_code) || (((int32_t)chr_code) == mt_code);
+	char* chr_name_end = chr_name_write(cip, chr_code, chr_buf);
+	single_chr_str = chr_buf;
+	single_chr_slen = (uintptr_t)(chr_name_end - chr_buf);
+      }
+      reterr = gzopen_read_checked(legendname, &gz_legendfile);
+      if (reterr) {
+	goto ox_hapslegend_to_pgen_ret_1;
+      }
+      do {
+	++line_idx_legend;
+	if (!gzgets(gz_legendfile, loadbuf, loadbuf_size)) {
+	  if (!gzeof(gz_legendfile)) {
+	    goto ox_hapslegend_to_pgen_ret_READ_FAIL;
+	  }
+	  sprintf(g_logbuf, "Error: %s is empty.\n", legendname);
+	  goto ox_hapslegend_to_pgen_ret_MALFORMED_INPUT_WW;
+	}
+	if (!loadbuf[loadbuf_size - 1]) {
+	  goto ox_hapslegend_to_pgen_ret_LONG_LINE_LEGEND;
+	}
+	loadbuf_first_token = skip_initial_spaces(loadbuf);
+      } while (is_eoln_kns(*loadbuf_first_token));
+      // require at least 4 columns, in ID/pos/A1/A2 order; header text is
+      // permitted to vary.  tolerate and ignore extra columns.
+      if (!next_token_mult(loadbuf_first_token, 3)) {
+	goto ox_hapslegend_to_pgen_ret_MISSING_TOKENS_LEGEND;
+      }
+      while (1) {
+	++line_idx_legend;
+	if (!gzgets(gz_legendfile, loadbuf, loadbuf_size)) {
+	  if (!gzeof(gz_legendfile)) {
+	    goto ox_hapslegend_to_pgen_ret_READ_FAIL;
+	  }
+	  break;
+	}
+	if (!loadbuf[loadbuf_size - 1]) {
+	  goto ox_hapslegend_to_pgen_ret_LONG_LINE_LEGEND;
+	}
+	loadbuf_first_token = skip_initial_spaces(loadbuf);
+	if (is_eoln_kns(*loadbuf_first_token)) {
+	  continue;
+	}
+	char* loadbuf_iter = loadbuf_first_token;
+	write_iter = memcpya(write_iter, single_chr_str, single_chr_slen);
+	if (import_legend_cols(legendname, line_idx_legend, prov_ref_allele_second, &loadbuf_iter, &write_iter, &variant_ct)) {
+	  putc_unlocked('\n', stdout);
+	  goto ox_hapslegend_to_pgen_ret_MALFORMED_INPUT;
+	}
+	if (write_iter >= writebuf_flush) {
+	  if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	    goto ox_hapslegend_to_pgen_ret_WRITE_FAIL;
+	  }
+	  write_iter = writebuf;
+	}
+	if (!at_least_one_het) {
+	  do {
+	    ++line_idx_haps;
+	    if (!gzgets(gz_hapsfile, loadbuf, loadbuf_size)) {
+	      if (!gzeof(gz_hapsfile)) {
+		goto ox_hapslegend_to_pgen_ret_READ_FAIL;
+	      }
+	      sprintf(g_logbuf, "Error: %s has fewer nonheader lines than %s.\n", hapsname, legendname);
+	      goto ox_hapslegend_to_pgen_ret_INCONSISTENT_INPUT_WW;
+	    }
+	    if (!loadbuf[loadbuf_size - 1]) {
+	      goto ox_hapslegend_to_pgen_ret_LONG_LINE_HAP;
+	    }
+	    loadbuf_first_token = skip_initial_spaces(loadbuf);
+	  } while (is_eoln_kns(*loadbuf_first_token));
+	  reterr = scan_haps_for_het(loadbuf_first_token, hapsname, sample_ct, is_haploid_or_mt, line_idx_haps, &at_least_one_het);
+	  if (reterr) {
+	    putc_unlocked('\n', stdout);
+	    wordwrapb(0);
+	    logerrprintb();
+	    goto ox_hapslegend_to_pgen_ret_1;
+	  }
+	}
+      }
+      if (gzrewind(gz_legendfile)) {
+	goto ox_hapslegend_to_pgen_ret_READ_FAIL;
+      }
+    } else {
+      assert(!legendname[0]);
+      if ((token_ct < 7) || (!(token_ct % 2))) {
+	sprintf(g_logbuf, "Error: Unexpected token count in line %" PRIuPTR " of %s (should be odd, >5).\n", line_idx_haps, hapsname);
+	goto ox_hapslegend_to_pgen_ret_MALFORMED_INPUT_WW;
+      }
+      sample_ct = (token_ct - 5) / 2;
+      if (sfile_sample_ct && (sfile_sample_ct != sample_ct)) {
+	sprintf(g_logbuf, "Error: .sample file has %u sample%s, while %s has %u.\n", sfile_sample_ct, (sfile_sample_ct == 1)? "" : "s", hapsname, sample_ct);
+	goto ox_hapslegend_to_pgen_ret_INCONSISTENT_INPUT_WW;
+      }
+      while (1) {
+	if (!is_eoln_kns(*loadbuf_first_token)) {
+	  char* chr_code_end = token_endnn(loadbuf_first_token);
+	  char* loadbuf_iter = skip_initial_spaces(chr_code_end);
+	  if (is_eoln_kns(*loadbuf_iter)) {
+	    goto ox_hapslegend_to_pgen_ret_MISSING_TOKENS_HAPS;
+	  }
+	  int32_t cur_chr_code;
+	  reterr = get_or_add_chr_code_destructive("--haps file", line_idx_haps, allow_extra_chrs, loadbuf_first_token, chr_code_end, cip, &cur_chr_code);
+	  if (reterr) {
+	    goto ox_hapslegend_to_pgen_ret_1;
+	  }
+	  if (!is_set(cip->chr_mask, cur_chr_code)) {
+	    ++variant_skip_ct;
+	  } else {
+	    is_haploid_or_mt = is_set(cip->haploid_mask, cur_chr_code) || (cur_chr_code == mt_code);
+	    write_iter = chr_name_write(cip, cur_chr_code, write_iter);
+	    if (import_legend_cols(hapsname, line_idx_haps, prov_ref_allele_second, &loadbuf_iter, &write_iter, &variant_ct)) {
+	      putc_unlocked('\n', stdout);
+	      goto ox_hapslegend_to_pgen_ret_MALFORMED_INPUT;
+	    }
+	    if (write_iter >= writebuf_flush) {
+	      if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+		goto ox_hapslegend_to_pgen_ret_WRITE_FAIL;
+	      }
+	      write_iter = writebuf;
+	    }
+	    if (!at_least_one_het) {
+	      loadbuf_iter = skip_initial_spaces(loadbuf_iter);
+	      if (scan_haps_for_het(loadbuf_iter, hapsname, sample_ct, is_haploid_or_mt, line_idx_haps, &at_least_one_het)) {
+		// override InconsistentInput return code since chromosome info
+		// was also gathered from .haps file
+		goto ox_hapslegend_to_pgen_ret_MALFORMED_INPUT_WW;
+	      }
+	    }
+	  }
+	}
+	++line_idx_haps;
+	if (!gzgets(gz_hapsfile, loadbuf, loadbuf_size)) {
+	  if (!gzeof(gz_hapsfile)) {
+	    goto ox_hapslegend_to_pgen_ret_READ_FAIL;
+	  }
+	  break;
+	}
+	if (!loadbuf[loadbuf_size - 1]) {
+	  goto ox_hapslegend_to_pgen_ret_LONG_LINE_HAP;
+	}
+	loadbuf_first_token = skip_initial_spaces(loadbuf);
+      }
+      if (!variant_ct) {
+	sprintf(g_logbuf, "Error: All %" PRIuPTR " variant%s in %s skipped due to chromosome filter.\n", variant_skip_ct, (variant_skip_ct == 1)? "" : "s", hapsname);
+	goto ox_hapslegend_to_pgen_ret_INCONSISTENT_INPUT_WW;
+      }
+    }
+    if (write_iter != writebuf) {
+      if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	goto ox_hapslegend_to_pgen_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto ox_hapslegend_to_pgen_ret_WRITE_FAIL;
+    }
+    if (!sfile_sample_ct) {
+      // create a dummy .psam file with "per0/per0", "per1/per1", etc. IDs,
+      // matching --dummy
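+      // e.g. sample 0's line is "per0\tper0\tNA"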
+      strcpy(outname_end, ".psam");
+      if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+	goto ox_hapslegend_to_pgen_ret_OPEN_FAIL;
+      }
+      write_iter = strcpya(writebuf, "#FID\tIID\tSEX" EOLN_STR);
+      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	write_iter = memcpyl3a(write_iter, "per");
+	write_iter = uint32toa(sample_idx, write_iter);
+	write_iter = strcpya(write_iter, "\tper");
+	write_iter = uint32toa(sample_idx, write_iter);
+	write_iter = strcpya(write_iter, "\tNA" EOLN_STR);
+	if (write_iter >= writebuf_flush) {
+	  if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	    goto ox_hapslegend_to_pgen_ret_WRITE_FAIL;
+	  }
+	  write_iter = writebuf;
+	}
+      }
+      if (write_iter != writebuf) {
+	if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	  goto ox_hapslegend_to_pgen_ret_WRITE_FAIL;
+	}
+      }
+      if (fclose_null(&outfile)) {
+	goto ox_hapslegend_to_pgen_ret_WRITE_FAIL;
+      }
+    }
+    if (gzrewind(gz_hapsfile)) {
+      goto ox_hapslegend_to_pgen_ret_READ_FAIL;
+    }
+    line_idx_haps = 0;
+    bigstack_reset(writebuf);
+    putc_unlocked('\r', stdout);
+    LOGPRINTF("--haps%s: %u variant%s scanned.\n", legendname[0]? " + --legend" : "", variant_ct, (variant_ct == 1)? "" : "s");
+    strcpy(outname_end, ".pgen");
+    uintptr_t spgw_alloc_cacheline_ct;
+    uint32_t max_vrec_len;
+    reterr = spgw_init_phase1(outname, nullptr, nullptr, variant_ct, sample_ct, at_least_one_het? kfPgenGlobalHardcallPhasePresent : kfPgenGlobal0, (oxford_import_flags & (kfOxfordImportRefFirst | kfOxfordImportRefSecond))? 1 : 2, &spgw, &spgw_alloc_cacheline_ct, &max_vrec_len);
+    if (reterr) {
+      goto ox_hapslegend_to_pgen_ret_1;
+    }
+    unsigned char* spgw_alloc;
+    if (bigstack_alloc_uc(spgw_alloc_cacheline_ct * kCacheline, &spgw_alloc)) {
+      goto ox_hapslegend_to_pgen_ret_NOMEM;
+    }
+    spgw_init_phase2(max_vrec_len, &spgw, spgw_alloc);
+    const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+    const uint32_t sample_ctl2_m1 = sample_ctl2 - 1;
+    const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+    const uint32_t phaseinfo_match_4char = prov_ref_allele_second? 0x20312030 : 0x20302031;
+    const uint32_t phaseinfo_match = 1 + prov_ref_allele_second;
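+    // For a het, first_hap_int + 2 * second_hap_int is 1 for "1 0" and 2
+    // for "0 1"; phaseinfo_match selects the orientation consistent with
+    // the REF/ALT convention chosen above.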
+    uintptr_t* genovec;
+    uintptr_t* phaseinfo;
+    if (bigstack_alloc_ul(sample_ctl2, &genovec) ||
+	bigstack_alloc_ul(sample_ctl, &phaseinfo)) {
+      goto ox_hapslegend_to_pgen_ret_NOMEM;
+    }
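+    // genovec packs one 2-bit genotype per sample (allele count 0/1/2;
+    // code 3 = missing, which a well-formed .haps line never produces),
+    // while phaseinfo packs one phase bit per sample.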
+
+    for (uint32_t vidx = 0; vidx < variant_ct;) {
+      ++line_idx_haps;
+      if (!gzgets(gz_hapsfile, loadbuf, loadbuf_size)) {
+	if ((!gzeof(gz_hapsfile)) || (!legendname[0])) {
+	  goto ox_hapslegend_to_pgen_ret_READ_FAIL;
+	}
+	sprintf(g_logbuf, "Error: %s has fewer nonheader lines than %s.\n", hapsname, legendname);
+	goto ox_hapslegend_to_pgen_ret_INCONSISTENT_INPUT_WW;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto ox_hapslegend_to_pgen_ret_LONG_LINE_HAP;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      if (is_eoln_kns(*loadbuf_first_token)) {
+	continue;
+      }
+      char* loadbuf_iter = loadbuf_first_token;
+      if (!legendname[0]) {
+	char* chr_code_end = token_endnn(loadbuf_first_token);
+	const int32_t cur_chr_code = get_chr_code_counted(cip, (uintptr_t)(chr_code_end - loadbuf_first_token), loadbuf_first_token);
+	if (!is_set(cip->chr_mask, cur_chr_code)) {
+	  continue;
+	}
+	is_haploid_or_mt = is_set(cip->haploid_mask, cur_chr_code) || (cur_chr_code == mt_code);
+	loadbuf_iter = next_token_mult(skip_initial_spaces(chr_code_end), 4);
+	if (!loadbuf_iter) {
+	  goto ox_hapslegend_to_pgen_ret_MISSING_TOKENS_HAPS;
+	}
+      }
+      uintptr_t genovec_word_or = 0;
+      uint32_t inner_loop_last = kBitsPerWordD2 - 1;
+      uint32_t widx = 0;
+      // optimize common case: autosomal diploid, always exactly one space
+      // this loop is time-critical; all my attempts to merge in the
+      // haploid/MT case have caused >10% slowdowns
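+      // Fast path: each sample's pair of calls plus separators occupies
+      // exactly 4 bytes ("0 1 " etc.), read little-endian as a uint32
+      // ("1 0 " == 0x20302031).  Masking with 0xfffefffe clears the low bit
+      // of both digit bytes, so any '0'/'1' pair passes the 0x20302030
+      // test, and (word + (word >> 16)) & 3 sums the two digits.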
+      if ((!is_haploid_or_mt) && ((unsigned char)loadbuf_iter[sample_ct * 4 - 1] < 32)) {
+	loadbuf_iter[sample_ct * 4 - 1] = ' ';
+	const uint32_t* loadbuf_alias32_iter = (const uint32_t*)loadbuf_iter;
+	while (1) {
+	  if (widx >= sample_ctl2_m1) {
+	    if (widx > sample_ctl2_m1) {
+	      break;
+	    }
+	    inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	  }
+	  uintptr_t genovec_word = 0;
+	  uint32_t phaseinfo_halfword = 0;
+	  for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+	    // assumes little-endian
+	    uint32_t cur_hap_4char = *loadbuf_alias32_iter++;
+	    if ((cur_hap_4char & 0xfffefffeU) != 0x20302030) {
+	      if ((cur_hap_4char & 0xfffffffeU) == 0x202d2030) {
+		// "0 - ", "1 - "
+		goto ox_hapslegend_to_pgen_ret_HAPLOID_TOKEN;
+	      }
+	      // any character < 32?
+	      if ((((cur_hap_4char & 0xe0e0e0e0U) * 7) & 0x80808080U) != 0x80808080U) {
+		goto ox_hapslegend_to_pgen_ret_MISSING_TOKENS_HAPS;
+	      }
+	      sprintf(g_logbuf, "Error: Invalid token on line %" PRIuPTR " of %s.\n", line_idx_haps, hapsname);
+	      goto ox_hapslegend_to_pgen_ret_MALFORMED_INPUT_WW;
+	    }
+	    const uintptr_t new_geno = (cur_hap_4char + (cur_hap_4char >> 16)) & 3;
+	    genovec_word |= new_geno << (2 * sample_idx_lowbits);
+	    if (cur_hap_4char == phaseinfo_match_4char) {
+	      phaseinfo_halfword |= 1U << sample_idx_lowbits;
+	    }
+	  }
+	  genovec[widx] = genovec_word;
+	  genovec_word_or |= genovec_word;
+	  ((halfword_t*)phaseinfo)[widx] = phaseinfo_halfword;
+	  ++widx;
+	}
+      } else {
+	while (1) {
+	  if (widx >= sample_ctl2_m1) {
+	    if (widx > sample_ctl2_m1) {
+	      break;
+	    }
+	    inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	  }
+	  uintptr_t genovec_word = 0;
+	  uint32_t phaseinfo_halfword = 0;
+	  for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+	    const uint32_t first_hap_char_code = (uint32_t)((unsigned char)(*loadbuf_iter));
+	    const uint32_t first_hap_int = first_hap_char_code - 48;
+	    char* post_first_hap = &(loadbuf_iter[1]);
+	    if ((first_hap_int >= 2) || ((unsigned char)(*post_first_hap) > 32)) {
+	      if (first_hap_char_code <= 32) {
+		goto ox_hapslegend_to_pgen_ret_MISSING_TOKENS_HAPS;
+	      }
+	      sprintf(g_logbuf, "Error: Invalid token on line %" PRIuPTR " of %s.\n", line_idx_haps, hapsname);
+	      goto ox_hapslegend_to_pgen_ret_MALFORMED_INPUT_WW;
+	    }
+	    char* second_hap = skip_initial_spaces(post_first_hap);
+	    char* post_second_hap = &(second_hap[1]);
+	    const uint32_t second_hap_char_code = (uint32_t)((unsigned char)(*second_hap));
+	    const uint32_t post_second_hap_char_code = (uint32_t)((unsigned char)(*post_second_hap));
+	    uint32_t second_hap_int = second_hap_char_code - 48;
+	    if ((second_hap_int >= 2) || (post_second_hap_char_code > 32)) {
+	      if (is_haploid_or_mt && (second_hap_char_code == 45)) {
+		// could require --sample, and require this sample to be male
+		// in this case?
+		second_hap_int = first_hap_int;
+	      } else {
+		if (second_hap_char_code <= 32) {
+		  goto ox_hapslegend_to_pgen_ret_MISSING_TOKENS_HAPS;
+		}
+		if ((second_hap_char_code == 45) && (post_second_hap_char_code <= 32)) {
+		  goto ox_hapslegend_to_pgen_ret_HAPLOID_TOKEN;
+		}
+		sprintf(g_logbuf, "Error: Invalid token on line %" PRIuPTR " of %s.\n", line_idx_haps, hapsname);
+		goto ox_hapslegend_to_pgen_ret_MALFORMED_INPUT_WW;
+	      }
+	    }
+	    genovec_word |= ((uintptr_t)(first_hap_int + second_hap_int)) << (2 * sample_idx_lowbits);
+	    if (first_hap_int + 2 * second_hap_int == phaseinfo_match) {
+	      phaseinfo_halfword |= 1U << sample_idx_lowbits;
+	    }
+	    loadbuf_iter = skip_initial_spaces(post_second_hap);
+	  }
+	  genovec[widx] = genovec_word;
+	  genovec_word_or |= genovec_word;
+	  ((halfword_t*)phaseinfo)[widx] = phaseinfo_halfword;
+	  ++widx;
+	}
+      }
+      if (prov_ref_allele_second) {
+	genovec_invert_unsafe(sample_ct, genovec);
+	zero_trailing_quaters(sample_ct, genovec);
+      }
+      if (genovec_word_or & kMask5555) {
+	if (spgw_append_biallelic_genovec_hphase(genovec, nullptr, phaseinfo, &spgw)) {
+	  goto ox_hapslegend_to_pgen_ret_WRITE_FAIL;
+	}
+      } else {
+	if (spgw_append_biallelic_genovec(genovec, &spgw)) {
+	  goto ox_hapslegend_to_pgen_ret_WRITE_FAIL;
+	}
+      }
+      if (!(++vidx % 1000)) {
+	printf("\r--haps%s: %uk variants converted.", legendname[0]? " + --legend" : "", vidx / 1000);
+	fflush(stdout);
+      }
+    }
+    spgw_finish(&spgw);
+    putc_unlocked('\r', stdout);
+    write_iter = strcpya(g_logbuf, "--haps");
+    if (legendname[0]) {
+      write_iter = strcpya(write_iter, " + --legend");
+    }
+    write_iter = strcpya(write_iter, ": ");
+    const uint32_t outname_base_slen = (uintptr_t)(outname_end - outname);
+    write_iter = memcpya(write_iter, outname, outname_base_slen + 5);
+    write_iter = memcpyl3a(write_iter, " + ");
+    if (!sfile_sample_ct) {
+      write_iter = memcpya(write_iter, outname, outname_base_slen);
+      write_iter = strcpya(write_iter, ".psam + ");
+    }
+    write_iter = memcpya(write_iter, outname, outname_base_slen);
+    strcpy(write_iter, ".pvar written.\n");
+    wordwrapb(0);
+    logprintb();
+  }
+  while (0) {
+  ox_hapslegend_to_pgen_ret_LONG_LINE_HAP:
+    if (loadbuf_size == kMaxLongLine) {
+      sprintf(g_logbuf, "Line %" PRIuPTR " of %s is pathologically long.\n", line_idx_haps, hapsname);
+      goto ox_hapslegend_to_pgen_ret_MALFORMED_INPUT_WW;
+    }
+  ox_hapslegend_to_pgen_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  ox_hapslegend_to_pgen_ret_LONG_LINE_LEGEND:
+    if (loadbuf_size == kMaxLongLine) {
+      sprintf(g_logbuf, "Line %" PRIuPTR " of %s is pathologically long.\n", line_idx_legend, legendname);
+      goto ox_hapslegend_to_pgen_ret_MALFORMED_INPUT_WW;
+    }
+    reterr = kPglRetNomem;
+    break;
+  ox_hapslegend_to_pgen_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  ox_hapslegend_to_pgen_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  ox_hapslegend_to_pgen_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  ox_hapslegend_to_pgen_ret_INVALID_CMDLINE:
+    reterr = kPglRetInvalidCmdline;
+    break;
+  ox_hapslegend_to_pgen_ret_MISSING_TOKENS_HAPS:
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx_haps, hapsname);
+  ox_hapslegend_to_pgen_ret_MALFORMED_INPUT_WW:
+    putc_unlocked('\n', stdout);
+    wordwrapb(0);
+    logerrprintb();
+  ox_hapslegend_to_pgen_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  ox_hapslegend_to_pgen_ret_MISSING_TOKENS_LEGEND:
+    putc_unlocked('\n', stdout);
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx_legend, legendname);
+    wordwrapb(0);
+    logerrprintb();
+    reterr = kPglRetMalformedInput;
+    break;
+  ox_hapslegend_to_pgen_ret_HAPLOID_TOKEN:
+    putc_unlocked('\n', stdout);
+    sprintf(g_logbuf, "Error: Haploid/MT-only token on line %" PRIuPTR " of %s.\n", line_idx_haps, hapsname);
+    wordwrapb(0);
+    logerrprintb();
+    reterr = legendname[0]? kPglRetInconsistentInput : kPglRetMalformedInput;
+    break;
+  ox_hapslegend_to_pgen_ret_INCONSISTENT_INPUT_WW:
+    putc_unlocked('\n', stdout);
+    wordwrapb(0);
+    logerrprintb();
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ ox_hapslegend_to_pgen_ret_1:
+  spgw_cleanup(&spgw);
+  fclose_cond(outfile);
+  gzclose_cond(gz_legendfile);
+  gzclose_cond(gz_hapsfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+
+// could add an option to load_pvar() to not require allele columns, but .map
+// is easy enough to write a separate loader for...
+CONSTU31(kLoadMapBlockSize, 65536);
+
+// assumes finalize_chrset() has already been called.
+static_assert(kMaxContigs <= 65536, "load_map() needs to be updated.");
+pglerr_t load_map(const char* mapname, misc_flags_t misc_flags, chr_info_t* cip, uint32_t* max_variant_id_slen_ptr, uint16_t** variant_chr_codes_ptr, uint32_t** variant_bps_ptr, char*** variant_ids_ptr, double** variant_cms_ptr, uint32_t* variant_ct_ptr) {
+  // caller should call forget_extra_chr_names(1, cip) after finishing import.
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  gzFile gz_infile = nullptr;
+  uintptr_t loadbuf_size = 0;
+  uintptr_t line_idx = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    reterr = gzopen_read_checked(mapname, &gz_infile);
+    if (reterr) {
+      goto load_map_ret_1;
+    }
+    // Workspace used as follows:
+    // |--loadbuf--|--temp-->----|----<- variant IDs --|
+    //            1/4                                 end
+    // loadbuf is overwritten with the main return arrays at the end.
+    loadbuf_size = round_down_pow2(bigstack_left() / 4, kCacheline);
+    char* loadbuf = (char*)g_bigstack_base;
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else if (loadbuf_size <= kMaxMediumLine) {
+      goto load_map_ret_NOMEM;
+    }
+    loadbuf[loadbuf_size - 1] = ' ';
+    char* loadbuf_first_token;
+    do {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto load_map_ret_READ_FAIL;
+	}
+	logerrprint("Error: Empty .map file.\n");
+	goto load_map_ret_INCONSISTENT_INPUT;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto load_map_ret_LONG_LINE;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+    } while (is_eoln_kns(*loadbuf_first_token) || (*loadbuf_first_token == '#'));
+    char* loadbuf_iter = next_token_mult(loadbuf_first_token, 2);
+    if (!loadbuf_iter) {
+      goto load_map_ret_MISSING_TOKENS;
+    }
+    uint32_t map_cols = 3;
+    loadbuf_iter = next_token(loadbuf_iter);
+    if (loadbuf_iter) {
+      loadbuf_iter = next_token(loadbuf_iter);
+      if (!loadbuf_iter) {
+	map_cols = 4;
+      } else {
+	loadbuf_iter = next_token(loadbuf_iter);
+	if (loadbuf_iter) {
+	  if (next_token(loadbuf_iter)) {
+	    // do NOT permit >6 columns, .bim is ok but .pvar is not
+	    // (pointless to support .pvar for legacy formats)
+	    sprintf(g_logbuf, "Error: %s is not a .map/.bim file (too many columns).\n", mapname);
+	    goto load_map_ret_MALFORMED_INPUT_WW;
+	  }
+	  map_cols = 4;
+	}
+      }
+    }
+
+    const uint32_t allow_extra_chrs = (misc_flags / kfMiscAllowExtraChrs) & 1;
+    uint32_t max_variant_id_slen = *max_variant_id_slen_ptr;
+    unsigned char* tmp_alloc_base = (unsigned char*)(&(loadbuf[loadbuf_size]));
+    unsigned char* tmp_alloc_end = bigstack_end_mark;
+    uint16_t* cur_chr_codes = nullptr;
+    uint32_t* cur_bps = nullptr;
+    char** cur_ids = nullptr;
+    double* cur_cms = nullptr;
+    double cur_cm = 0.0;
+    uint32_t at_least_one_nzero_cm = 0;
+    uint32_t variant_ct = 0;
+    while (1) {
+      if (!is_eoln_kns(*loadbuf_first_token)) {
+	// chrom, id, (cm?), pos
+	char* loadbuf_iter = token_endnn(loadbuf_first_token);
+	if (!(*loadbuf_iter)) {
+	  goto load_map_ret_MISSING_TOKENS;
+	}
+	int32_t cur_chr_code;
+	reterr = get_or_add_chr_code_destructive(".map file", line_idx, allow_extra_chrs, loadbuf_first_token, loadbuf_iter, cip, &cur_chr_code);
+	if (reterr) {
+	  goto load_map_ret_1;
+	}
+	if (!is_set(cip->chr_mask, cur_chr_code)) {
+	  goto load_map_skip_variant;
+	}
+        loadbuf_iter = skip_initial_spaces(&(loadbuf_iter[1]));
+        if (is_eoln_kns(*loadbuf_iter)) {
+	  goto load_map_ret_MISSING_TOKENS;
+	}
+	char* token_end = token_endnn(loadbuf_iter);
+	uint32_t id_slen = (uintptr_t)(token_end - loadbuf_iter);
+	if (id_slen > max_variant_id_slen) {
+	  max_variant_id_slen = id_slen;
+	}
+	tmp_alloc_end -= id_slen + 1;
+	if (tmp_alloc_end < tmp_alloc_base) {
+	  goto load_map_ret_NOMEM;
+	}
+	memcpyx(tmp_alloc_end, loadbuf_iter, id_slen, '\0');
+	loadbuf_iter = skip_initial_spaces(token_end);
+	if (is_eoln_kns(*loadbuf_iter)) {
+	  goto load_map_ret_MISSING_TOKENS;
+	}
+
+	if (map_cols == 4) {
+	  char* cm_end = scanadv_double(loadbuf_iter, &cur_cm);
+	  if (!cm_end) {
+	    sprintf(g_logbuf, "Error: Invalid centimorgan position on line %" PRIuPTR " of %s.\n", line_idx, mapname);
+	    goto load_map_ret_MALFORMED_INPUT_WW;
+	  }
+	  // |=, not =: a single nonzero cM anywhere in the file must trigger
+	  // allocation of the cM array below
+	  at_least_one_nzero_cm |= (cur_cm != 0.0);
+	  loadbuf_iter = next_token(cm_end);
+	  if (!loadbuf_iter) {
+	    goto load_map_ret_MISSING_TOKENS;
+	  }
+	}
+	int32_t cur_bp;
+	if (scan_int_abs_defcap(loadbuf_iter, &cur_bp)) {
+	  sprintf(g_logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, mapname);
+	  goto load_map_ret_MALFORMED_INPUT_WW;
+	}
+	if (cur_bp < 0) {
+	  goto load_map_skip_variant;
+	}
+
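+	// stage records in blocks of kLoadMapBlockSize: each block is four
+	// parallel arrays (chr codes, bps, ID pointers, cMs) carved from the
+	// low end of the workspace, while ID strings grow down from the high
+	// end; everything is consolidated after the last line is read.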
+	const uint32_t variant_idx_lowbits = variant_ct % kLoadMapBlockSize;
+	if (!variant_idx_lowbits) {
+	  if ((uintptr_t)(tmp_alloc_end - tmp_alloc_base) <= kLoadMapBlockSize * (sizeof(int16_t) + sizeof(int32_t) + sizeof(intptr_t) + sizeof(double))) {
+	    goto load_map_ret_NOMEM;
+	  }
+	  cur_chr_codes = (uint16_t*)tmp_alloc_base;
+	  tmp_alloc_base = (unsigned char*)(&(cur_chr_codes[kLoadMapBlockSize]));
+	  cur_bps = (uint32_t*)tmp_alloc_base;
+	  tmp_alloc_base = (unsigned char*)(&(cur_bps[kLoadMapBlockSize]));
+	  cur_ids = (char**)tmp_alloc_base;
+	  tmp_alloc_base = (unsigned char*)(&(cur_ids[kLoadMapBlockSize]));
+	  cur_cms = (double*)tmp_alloc_base;
+	  tmp_alloc_base = (unsigned char*)(&(cur_cms[kLoadMapBlockSize]));
+	}
+	cur_chr_codes[variant_idx_lowbits] = (uint32_t)cur_chr_code;
+	cur_ids[variant_idx_lowbits] = (char*)tmp_alloc_end;
+	cur_cms[variant_idx_lowbits] = cur_cm;
+	cur_bps[variant_idx_lowbits] = (uint32_t)cur_bp;
+	++variant_ct;
+      }
+    load_map_skip_variant:
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto load_map_ret_READ_FAIL;
+	}
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto load_map_ret_LONG_LINE;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      if (loadbuf_first_token[0] == '#') {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s starts with a '#'. (This is only permitted before the first nonheader line.)\n", line_idx, mapname);
+	goto load_map_ret_MALFORMED_INPUT_WW;
+      }
+    }
+    if (max_variant_id_slen > kMaxIdSlen) {
+      logerrprint("Error: Variant names are limited to " MAX_ID_SLEN_STR " characters.\n");
+      goto load_map_ret_MALFORMED_INPUT;
+    }
+
+    if (!variant_ct) {
+      logerrprint("Error: All variants in .map/.bim file skipped due to chromosome filter.\n");
+      goto load_map_ret_INCONSISTENT_INPUT;
+    }
+    tmp_alloc_base = (unsigned char*)(&(loadbuf[loadbuf_size]));
+    // true requirement is weaker, but whatever
+    g_bigstack_end = tmp_alloc_base;
+
+    if (bigstack_alloc_usi(variant_ct, variant_chr_codes_ptr) ||
+	bigstack_alloc_ui(variant_ct, variant_bps_ptr) ||
+	bigstack_alloc_cp(variant_ct, variant_ids_ptr)) {
+      goto load_map_ret_NOMEM;
+    }
+    uint16_t* variant_chr_codes = *variant_chr_codes_ptr;
+    uint32_t* variant_bps = *variant_bps_ptr;
+    char** variant_ids = *variant_ids_ptr;
+    double* variant_cms = nullptr;
+    if (at_least_one_nzero_cm) {
+      if (bigstack_alloc_d(variant_ct, variant_cms_ptr)) {
+        goto load_map_ret_NOMEM;
+      }
+      variant_cms = *variant_cms_ptr;
+    } else {
+      *variant_cms_ptr = nullptr;
+    }
+    *max_variant_id_slen_ptr = max_variant_id_slen;
+    *variant_ct_ptr = variant_ct;
+    const uint32_t full_block_ct = variant_ct / kLoadMapBlockSize;
+    bigstack_mark = g_bigstack_base;
+    bigstack_end_set(tmp_alloc_end);
+    bigstack_end_mark = g_bigstack_end;
+
+    unsigned char* read_iter = tmp_alloc_base;
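+    // walk the staged blocks in order, copying each component array into its
+    // final contiguous destination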
+    for (uint32_t block_idx = 0; block_idx < full_block_ct; ++block_idx) {
+      memcpy(&(variant_chr_codes[block_idx * kLoadMapBlockSize]), read_iter, kLoadMapBlockSize * sizeof(int16_t));
+      read_iter = &(read_iter[kLoadMapBlockSize * sizeof(int16_t)]);
+      memcpy(&(variant_bps[block_idx * kLoadMapBlockSize]), read_iter, kLoadMapBlockSize * sizeof(int32_t));
+      read_iter = &(read_iter[kLoadMapBlockSize * sizeof(int32_t)]);
+      memcpy(&(variant_ids[block_idx * kLoadMapBlockSize]), read_iter, kLoadMapBlockSize * sizeof(intptr_t));
+      read_iter = &(read_iter[kLoadMapBlockSize * sizeof(intptr_t)]);
+      if (at_least_one_nzero_cm) {
+	memcpy(&(variant_cms[block_idx * kLoadMapBlockSize]), read_iter, kLoadMapBlockSize * sizeof(double));
+      }
+      read_iter = &(read_iter[kLoadMapBlockSize * sizeof(double)]);
+    }
+    const uint32_t variant_ct_lowbits = variant_ct % kLoadMapBlockSize;
+    memcpy(&(variant_chr_codes[full_block_ct * kLoadMapBlockSize]), read_iter, variant_ct_lowbits * sizeof(int16_t));
+    read_iter = &(read_iter[kLoadMapBlockSize * sizeof(int16_t)]);
+    memcpy(&(variant_bps[full_block_ct * kLoadMapBlockSize]), read_iter, variant_ct_lowbits * sizeof(int32_t));
+    read_iter = &(read_iter[kLoadMapBlockSize * sizeof(int32_t)]);
+    memcpy(&(variant_ids[full_block_ct * kLoadMapBlockSize]), read_iter, variant_ct_lowbits * sizeof(intptr_t));
+    if (at_least_one_nzero_cm) {
+      read_iter = &(read_iter[kLoadMapBlockSize * sizeof(intptr_t)]);
+      memcpy(&(variant_cms[full_block_ct * kLoadMapBlockSize]), read_iter, variant_ct_lowbits * sizeof(double));
+    }
+  }
+  while (0) {
+  load_map_ret_LONG_LINE:
+    if (loadbuf_size == kMaxLongLine) {
+      LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, mapname);
+      reterr = kPglRetMalformedInput;
+      break;
+    }
+  load_map_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  load_map_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  load_map_ret_MISSING_TOKENS:
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, mapname);
+  load_map_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  load_map_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  load_map_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ load_map_ret_1:
+  // forget_extra_chr_names(1, cip);
+  gzclose_cond(gz_infile);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  return reterr;
+}
+
+static_assert(sizeof(dosage_t) == 2, "plink1_dosage_to_pgen() needs to be updated.");
+pglerr_t plink1_dosage_to_pgen(const char* dosagename, const char* famname, const char* mapname, const char* import_single_chr_str, const plink1_dosage_info_t* pdip, misc_flags_t misc_flags, fam_col_t fam_cols, int32_t missing_pheno, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, double import_dosage_certainty, uint32_t max_thread_ct, char* outname, char* outname_end, chr_info_t* cip) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+
+  // these are not allocated on bigstack, and must be explicitly freed
+  pheno_col_t* pheno_cols = nullptr;
+  char* pheno_names = nullptr;
+  uint32_t pheno_ct = 0;
+
+  gzFile gz_infile = nullptr;
+  FILE* outfile = nullptr;
+  uintptr_t loadbuf_size = 0;
+  uintptr_t line_idx = 0;
+  st_pgen_writer_t spgw;
+  pglerr_t reterr = kPglRetSuccess;
+  spgw_preinit(&spgw);
+  {
+    // 1. Read .fam file.  (May as well support most .psam files too, since
+    //    it's the same driver function.  However, unless 'noheader' modifier
+    //    is present, SID field cannot be used for disambiguation.)
+    uintptr_t max_sample_id_blen = 4;
+    uintptr_t max_sid_blen = 0;
+    uintptr_t max_paternal_id_blen = 2;
+    uintptr_t max_maternal_id_blen = 2;
+    uint32_t raw_sample_ct = 0;
+    uintptr_t* sample_include = nullptr;
+    char* sample_ids = nullptr;
+    char* sids = nullptr;
+    char* paternal_ids = nullptr;
+    char* maternal_ids = nullptr;
+    uintptr_t* sex_nm = nullptr;
+    uintptr_t* sex_male = nullptr;
+    uintptr_t* founder_info = nullptr;
+    uintptr_t max_pheno_name_blen = 0;
+    reterr = load_psam(famname, nullptr, fam_cols, 0x7fffffff, missing_pheno, (misc_flags / kfMiscAffection01) & 1, &max_sample_id_blen, &max_sid_blen, &max_paternal_id_blen, &max_maternal_id_blen, &sample_include, &sample_ids, &sids, &paternal_ids, &maternal_ids, &founder_info, &sex_nm, &sex_male, &pheno_cols, &pheno_names, &raw_sample_ct, &pheno_ct, &max_pheno_name_blen);
+    if (reterr) {
+      goto plink1_dosage_to_pgen_ret_1;
+    }
+
+    // 2. Read dosage-file header line if it exists, then write new .psam.
+    reterr = gzopen_read_checked(dosagename, &gz_infile);
+    if (reterr) {
+      goto plink1_dosage_to_pgen_ret_1;
+    }
+
+    const uint32_t first_data_col_idx = pdip->skips[0] + pdip->skips[1] + pdip->skips[2] + 3;
+    uint32_t sample_ct = 0;
+    uint32_t* dosage_sample_idx_to_fam_uidx;
+    if (bigstack_end_alloc_ui(raw_sample_ct, &dosage_sample_idx_to_fam_uidx)) {
+      goto plink1_dosage_to_pgen_ret_NOMEM;
+    }
+    const plink1_dosage_flags_t flags = pdip->flags;
+    if (flags & kfPlink1DosageNoheader) {
+      sample_ct = raw_sample_ct;
+      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	dosage_sample_idx_to_fam_uidx[sample_idx] = sample_idx;
+      }
+    } else {
+      fill_ulong_zero(BITCT_TO_WORDCT(raw_sample_ct), sample_include);
+      const uint32_t tmp_htable_size = get_htable_fast_size(raw_sample_ct);
+      uint32_t* htable_tmp;
+      char* idbuf;
+      if (bigstack_end_alloc_ui(tmp_htable_size, &htable_tmp) ||
+	  bigstack_end_alloc_c(max_sample_id_blen, &idbuf)) {
+	goto plink1_dosage_to_pgen_ret_NOMEM;
+      }
+      const uint32_t duplicate_idx = populate_strbox_htable(sample_ids, raw_sample_ct, max_sample_id_blen, tmp_htable_size, htable_tmp);
+      if (duplicate_idx) {
+	char* duplicate_sample_id = &(sample_ids[duplicate_idx * max_sample_id_blen]);
+	char* duplicate_fid_end = (char*)rawmemchr(duplicate_sample_id, '\t');
+	*duplicate_fid_end = ' ';
+	sprintf(g_logbuf, "Error: Duplicate sample ID '%s' in .fam file.\n", duplicate_sample_id);
+	goto plink1_dosage_to_pgen_ret_MALFORMED_INPUT_WW;
+      }
+
+      loadbuf_size = bigstack_left();
+      if (loadbuf_size > kMaxLongLine) {
+	loadbuf_size = kMaxLongLine;
+      } else if (loadbuf_size <= kMaxMediumLine) {
+	goto plink1_dosage_to_pgen_ret_NOMEM;
+      }
+      // not formally allocated
+      char* loadbuf = (char*)g_bigstack_base;
+      loadbuf[loadbuf_size - 1] = ' ';
+      char* loadbuf_first_token;
+      do {
+	++line_idx;
+	if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	  if (!gzeof(gz_infile)) {
+	    goto plink1_dosage_to_pgen_ret_READ_FAIL;
+	  }
+	  sprintf(g_logbuf, "Error: %s is empty.\n", dosagename);
+	  goto plink1_dosage_to_pgen_ret_INCONSISTENT_INPUT_WW;
+	}
+	if (!loadbuf[loadbuf_size - 1]) {
+	  goto plink1_dosage_to_pgen_ret_LONG_LINE;
+	}
+	loadbuf_first_token = skip_initial_spaces(loadbuf);
+      } while (is_eoln_kns(*loadbuf_first_token));
+      char* loadbuf_iter = next_token_mult(loadbuf_first_token, first_data_col_idx);
+      if (!loadbuf_iter) {
+	goto plink1_dosage_to_pgen_ret_MISSING_TOKENS;
+      }
+      do {
+	char* fid_end = token_endnn(loadbuf_iter);
+	char* iid_start = skip_initial_spaces(fid_end);
+	if (is_eoln_kns(*iid_start)) {
+	  goto plink1_dosage_to_pgen_ret_MISSING_TOKENS;
+	}
+	char* iid_end = token_endnn(iid_start);
+	const uint32_t fid_slen = (uintptr_t)(fid_end - loadbuf_iter);
+	const uint32_t iid_slen = (uintptr_t)(iid_end - iid_start);
+	const uint32_t cur_id_slen = fid_slen + iid_slen + 1;
+	if (cur_id_slen >= max_sample_id_blen) {
+	  logerrprint("Error: .fam file does not contain all sample IDs in dosage file.\n");
+	  goto plink1_dosage_to_pgen_ret_INCONSISTENT_INPUT;
+	}
+	char* idbuf_iid = memcpyax(idbuf, loadbuf_iter, fid_slen, '\t');
+	memcpyx(idbuf_iid, iid_start, iid_slen, '\0');
+	uint32_t sample_uidx = strbox_htable_find(idbuf, sample_ids, htable_tmp, max_sample_id_blen, cur_id_slen, tmp_htable_size);
+	if (sample_uidx == 0xffffffffU) {
+	  logerrprint("Error: .fam file does not contain all sample IDs in dosage file.\n");
+	  goto plink1_dosage_to_pgen_ret_INCONSISTENT_INPUT;
+	}
+	if (is_set(sample_include, sample_uidx)) {
+	  idbuf_iid[-1] = ' ';
+	  sprintf(g_logbuf, "Error: Duplicate sample ID '%s' in dosage file.\n", idbuf);
+	  goto plink1_dosage_to_pgen_ret_MALFORMED_INPUT_WW;
+	}
+	set_bit(sample_uidx, sample_include);
+	dosage_sample_idx_to_fam_uidx[sample_ct++] = sample_uidx;
+	loadbuf_iter = skip_initial_spaces(iid_end);
+      } while (!is_eoln_kns(*loadbuf_iter));
+    }
+
+    strcpy(outname_end, ".psam");
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto plink1_dosage_to_pgen_ret_OPEN_FAIL;
+    }
+    char* writebuf = g_textbuf;
+    char* writebuf_flush = &(writebuf[kMaxMediumLine]);
+    char* write_iter = strcpya(writebuf, "#FID\tIID");
+    const uint32_t write_sid = sid_col_required(sample_include, sids, sample_ct, max_sid_blen, 1);
+    if (write_sid) {
+      write_iter = strcpya(write_iter, "\tSID");
+    }
+    const uint32_t write_parents = is_parental_info_present(sample_include, paternal_ids, maternal_ids, sample_ct, max_paternal_id_blen, max_maternal_id_blen);
+    if (write_parents) {
+      write_iter = strcpya(write_iter, "\tPAT\tMAT");
+    }
+    write_iter = strcpya(write_iter, "\tSEX");
+    for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+      *write_iter++ = '\t';
+      write_iter = strcpya(write_iter, &(pheno_names[pheno_idx * max_pheno_name_blen]));
+      if (write_iter >= writebuf_flush) {
+	if (fwrite_checked(writebuf, write_iter - writebuf, outfile)) {
+	  goto plink1_dosage_to_pgen_ret_WRITE_FAIL;
+	}
+	write_iter = writebuf;
+      }
+    }
+    append_binary_eoln(&write_iter);
+    uint32_t omp_slen = 2;
+    char output_missing_pheno[kMaxMissingPhenostrBlen];
+    if (misc_flags & kfMiscKeepAutoconv) {
+      omp_slen = strlen(g_output_missing_pheno);
+      memcpy(output_missing_pheno, g_output_missing_pheno, omp_slen);
+    } else {
+      memcpy(output_missing_pheno, "NA", 2);
+    }
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+      const uint32_t sample_uidx = dosage_sample_idx_to_fam_uidx[sample_idx];
+      write_iter = strcpya(write_iter, &(sample_ids[sample_uidx * max_sample_id_blen]));
+      if (write_sid) {
+	*write_iter++ = '\t';
+	write_iter = strcpya(write_iter, &(sids[sample_uidx * max_sid_blen]));
+      }
+      if (write_parents) {
+	*write_iter++ = '\t';
+	write_iter = strcpyax(write_iter, &(paternal_ids[max_paternal_id_blen * sample_uidx]), '\t');
+	write_iter = strcpya(write_iter, &(maternal_ids[max_maternal_id_blen * sample_uidx]));
+      }
+      *write_iter++ = '\t';
+      if (is_set(sex_nm, sample_uidx)) {
+	*write_iter++ = '2' - is_set(sex_male, sample_uidx);
+      } else {
+	write_iter = strcpya(write_iter, "NA");
+      }
+      for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	if (write_iter >= writebuf_flush) {
+	  if (fwrite_checked(writebuf, write_iter - writebuf, outfile)) {
+	    goto plink1_dosage_to_pgen_ret_WRITE_FAIL;
+	  }
+	  write_iter = writebuf;
+	}
+	*write_iter++ = '\t';
+	write_iter = append_pheno_str(&(pheno_cols[pheno_idx]), output_missing_pheno, omp_slen, sample_uidx, write_iter);
+      }
+      append_binary_eoln(&write_iter);
+      if (write_iter >= writebuf_flush) {
+	if (fwrite_checked(writebuf, write_iter - writebuf, outfile)) {
+	  goto plink1_dosage_to_pgen_ret_WRITE_FAIL;
+	}
+	write_iter = writebuf;
+      }
+    }
+    if (write_iter != writebuf) {
+      if (fwrite_checked(writebuf, write_iter - writebuf, outfile)) {
+	goto plink1_dosage_to_pgen_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto plink1_dosage_to_pgen_ret_WRITE_FAIL;
+    }
+    // Don't need sample info any more.
+    bigstack_end_reset(bigstack_end_mark);
+
+    // 3. Read .map file if it exists.
+    uint32_t max_variant_id_slen = 1;
+    uint16_t* variant_chr_codes = nullptr;
+    uint32_t* variant_bps = nullptr;
+    char** variant_ids = nullptr;
+    double* variant_cms = nullptr;
+    uint32_t* variant_id_htable = nullptr;
+    uintptr_t* variant_already_seen = nullptr;
+    uint32_t variant_id_htable_size = 0;
+    uint32_t map_variant_ct = 0;
+    finalize_chrset(misc_flags, cip);
+    if (mapname) {
+      reterr = load_map(mapname, misc_flags, cip, &max_variant_id_slen, &variant_chr_codes, &variant_bps, &variant_ids, &variant_cms, &map_variant_ct);
+      if (reterr) {
+	goto plink1_dosage_to_pgen_ret_1;
+      }
+      const uint32_t map_variant_ctl = BITCT_TO_WORDCT(map_variant_ct);
+      if (bigstack_alloc_ul(map_variant_ctl, &variant_already_seen)) {
+	goto plink1_dosage_to_pgen_ret_NOMEM;
+      }
+      fill_all_bits(map_variant_ct, variant_already_seen);
+      unsigned char* bigstack_end_mark2 = g_bigstack_end;
+      g_bigstack_end = &(g_bigstack_base[round_down_pow2(bigstack_left() / 2, kEndAllocAlign)]); // allow hash table to only use half of available memory
+      reterr = alloc_and_populate_id_htable_mt(variant_already_seen, variant_ids, map_variant_ct, max_thread_ct, &variant_id_htable, nullptr, &variant_id_htable_size);
+      if (reterr) {
+	goto plink1_dosage_to_pgen_ret_1;
+      }
+      g_bigstack_end = bigstack_end_mark2;
+      fill_ulong_zero(map_variant_ctl, variant_already_seen);
+    }
+
+    // 4. Dosage file pass 1: count variants, check whether any decimal dosages
+    //    need to be saved, write .pvar.
+    //
+    // Lots of overlap with ox_gen_to_pgen().
+    loadbuf_size = bigstack_left() / 2;
+    if (loadbuf_size <= kMaxMediumLine) {
+      goto plink1_dosage_to_pgen_ret_NOMEM;
+    }
+    loadbuf_size -= kMaxMediumLine;
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else {
+      loadbuf_size = round_up_pow2(loadbuf_size, kCacheline);
+    }
+    char* loadbuf = (char*)bigstack_alloc_raw(loadbuf_size);
+    loadbuf[loadbuf_size - 1] = ' ';
+    writebuf = (char*)bigstack_alloc_raw(kMaxMediumLine + loadbuf_size);
+    writebuf_flush = &(writebuf[kMaxMediumLine]);
+    const uint32_t allow_extra_chrs = (misc_flags / kfMiscAllowExtraChrs) & 1;
+    const char* single_chr_str = nullptr;
+    uint32_t single_chr_slen = 0;
+    const uint32_t chr_col_idx = pdip->chr_col_idx;
+    const uint32_t check_chr_col = (chr_col_idx != 0xffffffffU);
+    if (!check_chr_col) {
+      if (import_single_chr_str) {
+	int32_t chr_code_raw = get_chr_code_raw(import_single_chr_str);
+	if (chr_code_raw == -1) {
+	  // command-line parser guarantees that allow_extra_chrs is true here
+	  single_chr_str = import_single_chr_str;
+	  single_chr_slen = strlen(import_single_chr_str);
+	} else {
+	  uint32_t chr_code = chr_code_raw;
+	  if (chr_code > cip->max_code) {
+	    if (chr_code < kMaxContigs) {
+	      logerrprint("Error: --import-dosage single-chr= code is not in the chromosome set.\n");
+	      goto plink1_dosage_to_pgen_ret_INVALID_CMDLINE;
+	    }
+	    chr_code = cip->xymt_codes[chr_code - kMaxContigs];
+	    if (((int32_t)chr_code) < 0) {
+	      logerrprint("Error: --import-dosage single-chr= code is not in the chromosome set.\n");
+	      goto plink1_dosage_to_pgen_ret_INVALID_CMDLINE;
+	    }
+	  }
+	  if (!is_set(cip->chr_mask, chr_code)) {
+	    // could permit this in --allow-no-vars case, but it's silly
+	    logerrprint("Error: --import-dosage single-chr= code is excluded by chromosome filter.\n");
+	    goto plink1_dosage_to_pgen_ret_INVALID_CMDLINE;
+	  }
+	  char* chr_buf = (char*)bigstack_alloc_raw(kCacheline);
+	  char* chr_name_end = chr_name_write(cip, chr_code, chr_buf);
+	  single_chr_str = chr_buf;
+	  single_chr_slen = (uintptr_t)(chr_name_end - chr_buf);
+	}
+      } else {
+	// default to "chr0"
+	if (!is_set(cip->chr_mask, 0)) {
+	  logerrprint("Error: No --import-dosage chromosome information specified, and chr0 excluded.\n");
+	  goto plink1_dosage_to_pgen_ret_INVALID_CMDLINE;
+	}
+	char* chr_buf = (char*)bigstack_alloc_raw(kCacheline);
+	char* chr_name_end = chr_name_write(cip, 0, chr_buf);
+	single_chr_str = chr_buf;
+	single_chr_slen = (uintptr_t)(chr_name_end - chr_buf);
+      }
+    }
+    strcpy(outname_end, ".pvar");
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto plink1_dosage_to_pgen_ret_OPEN_FAIL;
+    }
+    write_iter = strcpya(writebuf, "#CHROM\tPOS\tID\tREF\tALT");
+    if (variant_cms) {
+      write_iter = memcpyl3a(write_iter, "\tCM");
+    }
+    append_binary_eoln(&write_iter);
+    // types:
+    // 0 = #CHROM
+    // 1 = POS
+    // 2 = ID
+    // 3 = REF
+    // 4 = ALT
+    // 5 = first data column
+    // (command-line parser verifies that CHROM/POS don't collide with
+    // anything else)
+    uint64_t parse_table[6];
+    // high bits = col index, low bits = col type
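+    // sorting the packed values orders the relevant columns by file position
+    // while the low bits remember each column's meaning; the col_skips[]
+    // fixup below then converts absolute positions into relative skips for a
+    // single left-to-right scan of each line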
+    const uint32_t id_col_idx = pdip->skips[0];
+    const uint32_t prov_ref_allele_second = !(flags & kfPlink1DosageRefFirst);
+    uint32_t ref_col_idx = id_col_idx + pdip->skips[1] + 1;
+    const uint32_t alt_col_idx = ref_col_idx + (!prov_ref_allele_second);
+    ref_col_idx += prov_ref_allele_second;
+    parse_table[0] = (((uint64_t)id_col_idx) << 32) + 2;
+    parse_table[1] = (((uint64_t)ref_col_idx) << 32) + 3;
+    parse_table[2] = (((uint64_t)alt_col_idx) << 32) + 4;
+    uint32_t relevant_initial_col_ct = 3;
+    if (check_chr_col) {
+      parse_table[relevant_initial_col_ct++] = ((uint64_t)chr_col_idx) << 32;
+    }
+    const uint32_t check_pos_col = (pdip->pos_col_idx != 0xffffffffU);
+    if (check_pos_col) {
+      parse_table[relevant_initial_col_ct++] = (((uint64_t)(pdip->pos_col_idx)) << 32) + 1;
+    }
+    qsort(parse_table, relevant_initial_col_ct, sizeof(int64_t), uint64cmp);
+    uint32_t col_skips[6];
+    uint32_t col_types[6];
+    for (uint32_t uii = 0; uii < relevant_initial_col_ct; ++uii) {
+      const uint64_t parse_table_entry = parse_table[uii];
+      col_skips[uii] = parse_table_entry >> 32;
+      col_types[uii] = (uint32_t)parse_table_entry;
+    }
+    col_skips[relevant_initial_col_ct] = first_data_col_idx;
+    col_types[relevant_initial_col_ct++] = 5;
+    for (uint32_t uii = relevant_initial_col_ct - 1; uii; --uii) {
+      col_skips[uii] -= col_skips[uii - 1];
+    }
+
+    double dosage_multiplier = kDosageMid;
+    double dosage_ceil = 32767.5 / 16384.0;
+    if (flags & kfPlink1DosageFormatSingle01) {
+      dosage_multiplier = kDosageMax;
+      dosage_ceil = 32767.5 / 32768.0;
+    }
+    const uint32_t format_triple = (flags / kfPlink1DosageFormatTriple) & 1;
+    const uint32_t dosage_erase_halfdist = kDosage4th - dosage_erase_thresh;
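+    // halfdist measures how close a dosage lies to the nearest hardcall:
+    // kDosage4th when exactly on one, 0 when exactly halfway between two.
+    // halfdist >= dosage_erase_halfdist thus means the dosage is within
+    // dosage_erase_thresh of a hardcall and the hardcall alone suffices.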
+    uint32_t dosage_is_present = 0;
+    uint32_t variant_ct = 0;
+    uintptr_t variant_skip_ct = 0;
+    uint32_t variant_uidx = 0;
+    while (1) {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto plink1_dosage_to_pgen_ret_READ_FAIL;
+	}
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto plink1_dosage_to_pgen_ret_LONG_LINE_N;
+      }
+      char* loadbuf_iter = skip_initial_spaces(loadbuf);
+      if (is_eoln_kns(*loadbuf_iter)) {
+	continue;
+      }
+      char* token_ptrs[6];
+      uint32_t token_slens[6];
+      for (uint32_t ric_col_idx = 0; ric_col_idx < relevant_initial_col_ct; ++ric_col_idx) {
+	const uint32_t cur_col_type = col_types[ric_col_idx];
+	loadbuf_iter = next_token_multz(loadbuf_iter, col_skips[ric_col_idx]);
+	if (!loadbuf_iter) {
+	  goto plink1_dosage_to_pgen_ret_MISSING_TOKENS;
+	}
+	token_ptrs[cur_col_type] = loadbuf_iter;
+	char* token_end = token_endnn(loadbuf_iter);
+	token_slens[cur_col_type] = (uintptr_t)(token_end - loadbuf_iter);
+	loadbuf_iter = token_end;
+      }
+      // ID
+      const char* variant_id = token_ptrs[2];
+      const uint32_t variant_id_slen = token_slens[2];
+      if (map_variant_ct) {
+	variant_uidx = variant_id_dupflag_htable_find(variant_id, variant_ids, variant_id_htable, variant_id_slen, variant_id_htable_size, max_variant_id_slen);
+	if (variant_uidx >> 31) {
+	  if (variant_uidx == 0xffffffffU) {
+	    ++variant_skip_ct;
+	    continue;
+	  }
+	  sprintf(g_logbuf, "Error: Variant ID '%s' appears multiple times in .map file.\n", variant_ids[variant_uidx & 0x7fffffff]);
+	  goto plink1_dosage_to_pgen_ret_MALFORMED_INPUT_WW;
+	}
+	if (is_set(variant_already_seen, variant_uidx)) {
+	  sprintf(g_logbuf, "Error: Variant ID '%s' appears multiple times in --import-dosage file.\n", variant_ids[variant_uidx]);
+	  goto plink1_dosage_to_pgen_ret_MALFORMED_INPUT_WW;
+	}
+	// already performed chromosome filtering
+	write_iter = chr_name_write(cip, (uint32_t)variant_chr_codes[variant_uidx], write_iter);
+	*write_iter++ = '\t';
+	write_iter = uint32toa_x(variant_bps[variant_uidx], '\t', write_iter);
+	write_iter = memcpya(write_iter, variant_id, variant_id_slen);
+      } else {
+	if (variant_id_slen > kMaxIdSlen) {
+	  putc_unlocked('\n', stdout);
+	  logerrprint("Error: Variant names are limited to " MAX_ID_SLEN_STR " characters.\n");
+	  goto plink1_dosage_to_pgen_ret_MALFORMED_INPUT;
+	}
+	// #CHROM
+	if (check_chr_col) {
+	  char* chr_code_str = token_ptrs[0];
+	  char* chr_code_end = &(chr_code_str[token_slens[0]]);
+	  int32_t cur_chr_code;
+	  reterr = get_or_add_chr_code_destructive("--import-dosage file", line_idx, allow_extra_chrs, chr_code_str, chr_code_end, cip, &cur_chr_code);
+	  if (reterr) {
+	    goto plink1_dosage_to_pgen_ret_1;
+	  }
+	  if (!is_set(cip->chr_mask, cur_chr_code)) {
+	    ++variant_skip_ct;
+	    continue;
+	  }
+	  write_iter = chr_name_write(cip, cur_chr_code, write_iter);
+	} else {
+	  write_iter = memcpya(write_iter, single_chr_str, single_chr_slen);
+	}
+	*write_iter++ = '\t';
+	// POS
+	if (check_pos_col) {
+	  char* pos_str = token_ptrs[1];
+	  // no need to support negative values here
+	  uint32_t cur_bp;
+	  if (scan_uint_defcap(pos_str, &cur_bp)) {
+	    sprintf(g_logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, dosagename);
+	    goto plink1_dosage_to_pgen_ret_MALFORMED_INPUT_WW;
+	  }
+	  write_iter = uint32toa(cur_bp, write_iter);
+	} else {
+	  *write_iter++ = '0';
+	}
+	*write_iter++ = '\t';
+        write_iter = memcpya(write_iter, variant_id, variant_id_slen);
+      }
+      ++variant_ct;
+      *write_iter++ = '\t';
+      // REF, ALT
+      write_iter = memcpyax(write_iter, token_ptrs[3], token_slens[3], '\t');
+      write_iter = memcpya(write_iter, token_ptrs[4], token_slens[4]);
+      if (variant_cms) {
+	*write_iter++ = '\t';
+	write_iter = dtoa_g(variant_cms[variant_uidx], write_iter);
+      }
+      append_binary_eoln(&write_iter);
+      if (write_iter >= writebuf_flush) {
+	if (fwrite_checked(writebuf, write_iter - writebuf, outfile)) {
+	  goto plink1_dosage_to_pgen_ret_WRITE_FAIL;
+	}
+	write_iter = writebuf;
+      }
+      if (!dosage_is_present) {
+	loadbuf_iter = token_ptrs[5];
+	if (flags & kfPlink1DosageFormatSingle) {
+	  for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	    if (!loadbuf_iter) {
+	      goto plink1_dosage_to_pgen_ret_MISSING_TOKENS;
+	    }
+	    double a1_dosage;
+	    char* str_end = scanadv_double(loadbuf_iter, &a1_dosage);
+	    // str_end is null iff the parse failed (loadbuf_iter was already
+	    // null-checked above, and a1_dosage is uninitialized on failure)
+	    if ((!str_end) || (a1_dosage < (0.5 / 32768.0)) || (a1_dosage >= dosage_ceil)) {
+	      loadbuf_iter = next_token(loadbuf_iter);
+	      continue;
+	    }
+	    a1_dosage *= dosage_multiplier;
+	    const uint32_t dosage_int = (uint32_t)(a1_dosage + 0.5);
+	    const uint32_t halfdist = biallelic_dosage_halfdist(dosage_int);
+	    if (halfdist < dosage_erase_halfdist) {
+	      dosage_is_present = 1;
+	      break;
+	    }
+	    loadbuf_iter = next_token(str_end);
+	  }
+	} else {
+	  // for compatibility with plink 1.x, do not actually parse third
+	  // value of each triplet if format=3
+	  for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	    if (!loadbuf_iter) {
+	      goto plink1_dosage_to_pgen_ret_MISSING_TOKENS;
+	    }
+	    double prob_2a1;
+	    char* str_end = scanadv_double(loadbuf_iter, &prob_2a1);
+	    if (!str_end) {
+	      loadbuf_iter = next_token_mult(loadbuf_iter, 2 + format_triple);
+	      continue;
+	    }
+	    loadbuf_iter = next_token(str_end);
+	    if (!loadbuf_iter) {
+	      goto plink1_dosage_to_pgen_ret_MISSING_TOKENS;
+	    }
+	    double prob_1a1;
+	    str_end = scanadv_double(loadbuf_iter, &prob_1a1);
+	    if (!str_end) {
+	      loadbuf_iter = next_token_mult(loadbuf_iter, 1 + format_triple);
+	      continue;
+	    }
+	    loadbuf_iter = next_token_mult(str_end, 1 + format_triple);
+	    double prob_one_or_two_a1 = prob_2a1 + prob_1a1;
+	    if ((prob_2a1 < 0.0) || (prob_1a1 < 0.0) || (prob_one_or_two_a1 > 1.01 * (1 + kSmallEpsilon))) {
+	      continue;
+	    }
+	    if (prob_one_or_two_a1 > 1.0) {
+	      const double rescale = 1.0 / prob_one_or_two_a1;
+	      prob_2a1 *= rescale;
+	      prob_1a1 *= rescale;
+	      prob_one_or_two_a1 = 1.0;
+	    }
+	    const uint32_t dosage_int = (uint32_t)(prob_2a1 * 32768 + prob_1a1 * 16384 + 0.5);
+	    const uint32_t halfdist = biallelic_dosage_halfdist(dosage_int);
+	    if ((halfdist < dosage_erase_halfdist) && ((prob_2a1 >= import_dosage_certainty) || (prob_1a1 >= import_dosage_certainty) || (prob_one_or_two_a1 <= 1.0 - import_dosage_certainty))) {
+	      dosage_is_present = 1;
+	      break;
+	    }
+	  }
+	}
+      }
+      if (!(variant_ct % 1000)) {
+	printf("\r--import-dosage: %uk variants scanned.", variant_ct / 1000);
+	fflush(stdout);
+      }
+    }
+    putc_unlocked('\r', stdout);
+    if (write_iter != writebuf) {
+      if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	goto plink1_dosage_to_pgen_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto plink1_dosage_to_pgen_ret_WRITE_FAIL;
+    }
+    if (!variant_ct) {
+      if (!variant_skip_ct) {
+	logerrprint("Error: Empty --import-dosage file.\n");
+	goto plink1_dosage_to_pgen_ret_INCONSISTENT_INPUT;
+      }
+      LOGERRPRINTFWW("Error: All %" PRIuPTR " variant%s in --import-dosage file skipped.\n", variant_skip_ct, (variant_skip_ct == 1)? "" : "s");
+      goto plink1_dosage_to_pgen_ret_INCONSISTENT_INPUT;
+    }
+    LOGPRINTF("--import-dosage: %u variant%s scanned%s.\n", variant_ct, (variant_ct == 1)? "" : "s", dosage_is_present? "" : " (all hardcalls)");
+
+    // 5. Dosage file pass 2: write .pgen.
+    bigstack_reset(writebuf);
+    if (gzrewind(gz_infile)) {
+      goto plink1_dosage_to_pgen_ret_READ_FAIL;
+    }
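+    // pass 1 left line_idx one past the last line gzgets() read, so the file
+    // contains line_idx - 1 lines; pass 2 loops on this count rather than
+    // re-testing gzeof().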
+    const uintptr_t line_ct = line_idx - 1;
+    line_idx = 0;
+    if (!(flags & kfPlink1DosageNoheader)) {
+      // skip header line again
+      char* loadbuf_first_token;
+      do {
+	++line_idx;
+	if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	  goto plink1_dosage_to_pgen_ret_READ_FAIL;
+	}
+	loadbuf_first_token = skip_initial_spaces(loadbuf);
+      } while (is_eoln_kns(*loadbuf_first_token));
+    }
+    strcpy(outname_end, ".pgen");
+    uintptr_t spgw_alloc_cacheline_ct;
+    uint32_t max_vrec_len;
+    reterr = spgw_init_phase1(outname, nullptr, nullptr, variant_ct, sample_ct, dosage_is_present? kfPgenGlobalDosagePresent: kfPgenGlobal0, (flags & (kfPlink1DosageRefFirst | kfPlink1DosageRefSecond))? 1 : 2, &spgw, &spgw_alloc_cacheline_ct, &max_vrec_len);
+    if (reterr) {
+      goto plink1_dosage_to_pgen_ret_1;
+    }
+    unsigned char* spgw_alloc;
+    if (bigstack_alloc_uc(spgw_alloc_cacheline_ct * kCacheline, &spgw_alloc)) {
+      goto plink1_dosage_to_pgen_ret_NOMEM;
+    }
+    spgw_init_phase2(max_vrec_len, &spgw, spgw_alloc);
+
+    const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+    const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+    uintptr_t* genovec;
+    uintptr_t* dosage_present;
+    if (bigstack_alloc_ul(sample_ctl2, &genovec) ||
+	bigstack_alloc_ul(sample_ctl, &dosage_present)) {
+      goto plink1_dosage_to_pgen_ret_NOMEM;
+    }
+    dosage_t* dosage_vals = nullptr;
+    if (dosage_is_present) {
+      if (bigstack_alloc_dosage(sample_ct, &dosage_vals)) {
+	goto plink1_dosage_to_pgen_ret_NOMEM;
+      }
+    }
+    if (hard_call_thresh == 0xffffffffU) {
+      hard_call_thresh = kDosageMid / 10;
+    }
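+    // tolerate ~1% of slack above the nominal maximum (2.0, or 1.0 with the
+    // 0..1 single-dosage encoding) before treating a value as invalid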
+    dosage_ceil = 2.02 * (1 + kSmallEpsilon);
+    if (flags & kfPlink1DosageFormatSingle01) {
+      dosage_ceil = 1.01 * (1 + kSmallEpsilon);
+    }
+    const uint32_t hard_call_halfdist = kDosage4th - hard_call_thresh;
+    const uint32_t sample_ctl2_m1 = sample_ctl2 - 1;
+    uint32_t vidx = 0;
+    do {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	goto plink1_dosage_to_pgen_ret_READ_FAIL;
+      }
+      char* loadbuf_iter = skip_initial_spaces(loadbuf);
+      if (is_eoln_kns(*loadbuf_iter)) {
+	continue;
+      }
+      if (variant_skip_ct) {
+	if (map_variant_ct) {
+	  char* variant_id = next_token_multz(loadbuf_iter, id_col_idx);
+	  const uint32_t variant_id_slen = strlen_se(variant_id);
+	  if (variant_id_dupflag_htable_find(variant_id, variant_ids, variant_id_htable, variant_id_slen, variant_id_htable_size, max_variant_id_slen) == 0xffffffffU) {
+	    continue;
+	  }
+	  loadbuf_iter = next_token_mult(variant_id, first_data_col_idx - id_col_idx);
+	} else {
+	  char* chr_code_str = next_token_multz(loadbuf_iter, chr_col_idx);
+	  char* chr_code_end = token_endnn(chr_code_str);
+	  loadbuf_iter = next_token_mult(chr_code_end, first_data_col_idx - chr_col_idx);
+	  *chr_code_end = '\0';
+	  const uint32_t chr_code = get_chr_code(chr_code_str, cip, (uintptr_t)(chr_code_end - chr_code_str));
+	  if (!is_set(cip->chr_mask, chr_code)) {
+	    continue;
+	  }
+	}
+      } else {
+	loadbuf_iter = next_token_mult(loadbuf_iter, first_data_col_idx);
+      }
+      uint32_t inner_loop_last = kBitsPerWordD2 - 1;
+      uint32_t widx = 0;
+      dosage_t* dosage_vals_iter = dosage_vals;
+      while (1) {
+	if (widx >= sample_ctl2_m1) {
+	  if (widx > sample_ctl2_m1) {
+	    break;
+	  }
+	  inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	}
+	uintptr_t genovec_word = 0;
+	uint32_t dosage_present_hw = 0;
+	if (flags & kfPlink1DosageFormatSingle) {
+	  for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+	    if (!loadbuf_iter) {
+	      goto plink1_dosage_to_pgen_ret_MISSING_TOKENS;
+	    }
+	    double a1_dosage;
+	    char* str_end = scanadv_double(loadbuf_iter, &a1_dosage);
+	    // as in the pass-1 scan, test str_end rather than the
+	    // already-validated loadbuf_iter
+	    if ((!str_end) || (a1_dosage < 0.0) || (a1_dosage > dosage_ceil)) {
+	      genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+	      loadbuf_iter = next_token(loadbuf_iter);
+	      continue;
+	    }
+	    loadbuf_iter = next_token(str_end);
+	    uint32_t dosage_int = (uint32_t)(a1_dosage * dosage_multiplier + 0.5);
+	    if (dosage_int > kDosageMax) {
+	      dosage_int = kDosageMax;
+	    }
+	    const uint32_t cur_halfdist = biallelic_dosage_halfdist(dosage_int);
+	    if (cur_halfdist < hard_call_halfdist) {
+	      genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+	    } else {
+	      genovec_word |= ((dosage_int + (kDosage4th * k1LU)) / kDosageMid) << (2 * sample_idx_lowbits);
+	      if (cur_halfdist >= dosage_erase_halfdist) {
+		continue;
+	      }
+	    }
+	    dosage_present_hw |= 1U << sample_idx_lowbits;
+	    *dosage_vals_iter++ = dosage_int;
+	  }
+	} else {
+	  for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+	    if (!loadbuf_iter) {
+	      goto plink1_dosage_to_pgen_ret_MISSING_TOKENS;
+	    }
+	    double prob_2a1;
+	    char* str_end = scanadv_double(loadbuf_iter, &prob_2a1);
+	    if (!str_end) {
+	      genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+	      loadbuf_iter = next_token_mult(loadbuf_iter, 2 + format_triple);
+	      continue;
+	    }
+	    loadbuf_iter = next_token(str_end);
+	    if (!loadbuf_iter) {
+	      goto plink1_dosage_to_pgen_ret_MISSING_TOKENS;
+	    }
+	    double prob_1a1;
+	    str_end = scanadv_double(loadbuf_iter, &prob_1a1);
+	    if (!str_end) {
+	      genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+	      loadbuf_iter = next_token_mult(loadbuf_iter, 1 + format_triple);
+	      continue;
+	    }
+	    loadbuf_iter = next_token_mult(str_end, 1 + format_triple);
+	    double prob_one_or_two_a1 = prob_2a1 + prob_1a1;
+	    if ((prob_2a1 < 0.0) || (prob_1a1 < 0.0) || (prob_one_or_two_a1 > 1.01 * (1 + kSmallEpsilon))) {
+	      genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+	      continue;
+	    }
+	    if (prob_one_or_two_a1 > 1.0) {
+	      const double rescale = 1.0 / prob_one_or_two_a1;
+	      prob_2a1 *= rescale;
+	      prob_1a1 *= rescale;
+	      prob_one_or_two_a1 = 1.0;
+	    }
+	    if ((prob_2a1 < import_dosage_certainty) && (prob_1a1 < import_dosage_certainty) && (prob_one_or_two_a1 > 1.0 - import_dosage_certainty)) {
+	      // all probabilities too uncertain to import: record a missing
+	      // call and skip dosage storage, mirroring the pass-1 scan
+	      // (otherwise dosage_vals may not even be allocated)
+	      genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+	      continue;
+	    }
+	    const uint32_t dosage_int = (uint32_t)(prob_2a1 * 32768 + prob_1a1 * 16384 + 0.5);
+	    const uint32_t cur_halfdist = biallelic_dosage_halfdist(dosage_int);
+	    if (cur_halfdist < hard_call_halfdist) {
+	      genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+	    } else {
+	      genovec_word |= ((dosage_int + (kDosage4th * k1LU)) / kDosageMid) << (2 * sample_idx_lowbits);
+	      if (cur_halfdist >= dosage_erase_halfdist) {
+		continue;
+	      }
+	    }
+	    dosage_present_hw |= 1U << sample_idx_lowbits;
+	    *dosage_vals_iter++ = dosage_int;
+	  }
+	}
+	genovec[widx] = genovec_word;
+	((halfword_t*)dosage_present)[widx] = (halfword_t)dosage_present_hw;
+	++widx;
+      }
+      if (!prov_ref_allele_second) {
+	genovec_invert_unsafe(sample_ct, genovec);
+	zero_trailing_quaters(sample_ct, genovec);
+      }
+      if (dosage_vals_iter != dosage_vals) {
+	const uint32_t dosage_ct = (uintptr_t)(dosage_vals_iter - dosage_vals);
+	if (!prov_ref_allele_second) {
+	  biallelic_dosage16_invert(dosage_ct, dosage_vals);
+	}
+	if (spgw_append_biallelic_genovec_dosage16(genovec, dosage_present, dosage_vals, dosage_ct, &spgw)) {
+	  goto plink1_dosage_to_pgen_ret_WRITE_FAIL;
+	}
+      } else {
+	if (spgw_append_biallelic_genovec(genovec, &spgw)) {
+	  goto plink1_dosage_to_pgen_ret_WRITE_FAIL;
+	}
+      }
+      ++vidx;
+      if (!(vidx % 1000)) {
+	printf("\r--import-dosage: %uk variants converted.", vidx / 1000);
+	fflush(stdout);
+      }
+    } while (line_idx < line_ct);
+    spgw_finish(&spgw);
+    putc_unlocked('\r', stdout);
+    write_iter = strcpya(g_logbuf, "--import-dosage: ");
+    const uint32_t outname_base_slen = (uintptr_t)(outname_end - outname);
+    write_iter = memcpya(write_iter, outname, outname_base_slen + 5);
+    write_iter = memcpyl3a(write_iter, " + ");
+    write_iter = memcpya(write_iter, outname, outname_base_slen);
+    write_iter = strcpya(write_iter, ".pvar + ");
+    write_iter = memcpya(write_iter, outname, outname_base_slen);
+    write_iter = strcpya(write_iter, ".psam written.\n");
+    wordwrapb(0);
+    logprintb();
+  }
+  while (0) {
+  plink1_dosage_to_pgen_ret_LONG_LINE_N:
+    putc_unlocked('\n', stdout);
+  plink1_dosage_to_pgen_ret_LONG_LINE:
+    if (loadbuf_size == kMaxLongLine) {
+      LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, dosagename);
+      reterr = kPglRetMalformedInput;
+      break;
+    }
+  plink1_dosage_to_pgen_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  plink1_dosage_to_pgen_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  plink1_dosage_to_pgen_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  plink1_dosage_to_pgen_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  plink1_dosage_to_pgen_ret_INVALID_CMDLINE:
+    reterr = kPglRetInvalidCmdline;
+    break;
+  plink1_dosage_to_pgen_ret_MISSING_TOKENS:
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, dosagename);
+  plink1_dosage_to_pgen_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+    putc_unlocked('\n', stdout);
+    logerrprintb();
+  plink1_dosage_to_pgen_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  plink1_dosage_to_pgen_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  plink1_dosage_to_pgen_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ plink1_dosage_to_pgen_ret_1:
+  if (spgw_cleanup(&spgw) && (!reterr)) {
+    reterr = kPglRetWriteFail;
+  }
+  forget_extra_chr_names(1, cip);
+  fclose_cond(outfile);
+  gzclose_cond(gz_infile);
+  free_cond(pheno_names);
+  cleanup_pheno_cols(pheno_ct, pheno_cols);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  return reterr;
+}
+
+
+// binary search over cdf is faster than (int)(log(drand)/log(q)) for truncated
+// geometric distribution
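+// Each array below stores the scaled CDF of such a truncated geometric
+// distribution; uint64arr_geq() performs the binary search against a fresh
+// 64-bit draw, and the count it returns acts as the geometric variate (the
+// gap to the next flagged sample).  When g_geno_missing_invert is set, the
+// sampled missingness mask is complemented instead.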
+static uint64_t g_geno_missing_geomdist[kBitsPerWordD2];
+static uint64_t g_dosage_geomdist[kBitsPerWordD2];
+static uint32_t g_geno_missing_invert = 0;
+static uint32_t g_dosage_geomdist_max = 0;
+
+static_assert(sizeof(dosage_t) == 2, "generate_dummy_thread() needs to be updated.");
+THREAD_FUNC_DECL generate_dummy_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uint32_t sample_ct = g_sample_ct;
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  const uint64_t* geno_missing_geomdist = g_geno_missing_geomdist;
+  const uint64_t* dosage_geomdist = g_dosage_geomdist;
+  const uint32_t geno_missing_invert = g_geno_missing_invert;
+  const uint32_t geno_missing_check = geno_missing_invert || (geno_missing_geomdist[kBitsPerWordD2 - 1] != 0);
+  const uint32_t dosage_is_present = (dosage_geomdist[kBitsPerWordD2 - 1] != 0);
+  const uint32_t dosage_geomdist_max = g_dosage_geomdist_max;
+  const uint32_t hard_call_halfdist = g_hard_call_halfdist;
+  const uint32_t dosage_erase_halfdist = g_dosage_erase_halfdist;
+  const uintptr_t sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
+  const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
+  const uintptr_t sample_ctaw = BITCT_TO_ALIGNED_WORDCT(sample_ct);
+  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
+  uint64_t ullrand = sfmt_genrand_uint64(sfmtp);
+  uint32_t rand16_left = 4;
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uintptr_t cur_block_write_ct = g_cur_block_write_ct;
+    uint32_t vidx = (tidx * cur_block_write_ct) / calc_thread_ct;
+    const uint32_t vidx_end = ((tidx + 1) * cur_block_write_ct) / calc_thread_ct;
+    uintptr_t* write_genovec_iter = &(g_write_genovecs[parity][vidx * sample_ctaw2]);
+    uint32_t* write_dosage_ct_iter = &(g_write_dosage_cts[parity][vidx]);
+    uintptr_t* write_dosage_present_iter = &(g_write_dosage_presents[parity][vidx * sample_ctaw]);
+    // dosage values use a per-variant stride of sample_ct, matching the
+    // advance at the bottom of the loop; a bitarray-word stride of
+    // sample_ctaw could not hold a full variant's worth of dosages
+    dosage_t* write_dosage_vals_iter = &(g_write_dosage_val_bufs[parity][vidx * sample_ct]);
+    for (; vidx < vidx_end; ++vidx) {
+      dosage_t* cur_dosage_vals_iter = write_dosage_vals_iter;
+      uint32_t loop_len = kBitsPerWordD2;
+      uint32_t widx = 0;
+      while (1) {
+	if (widx >= sample_ctl2_m1) {
+	  if (widx > sample_ctl2_m1) {
+	    break;
+	  }
+	  loop_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+	}
+	// sfmt_genrand_uint64 calls can't be mixed with sfmt_genrand_uint32
+	// calls, so use it here even in 32-bit build
+	uintptr_t genovec_word = sfmt_genrand_uint64(sfmtp);
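+	// maps each uniform 2-bit field v to v - (v >> 1): {0,1,2,3} ->
+	// {0,1,1,2}, i.e. a Binomial(2, 0.5) hardcall for each sample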
+	genovec_word = genovec_word - ((genovec_word >> 1) & kMask5555);
+	if (geno_missing_check) {
+	  uintptr_t missing_mask = 0;
+	  uint32_t sample_idx_lowbits = 0;
+	  while (1) {
+	    sample_idx_lowbits += uint64arr_geq(geno_missing_geomdist, kBitsPerWordD2, sfmt_genrand_uint64(sfmtp));
+	    if (sample_idx_lowbits >= loop_len) {
+	      break;
+	    }
+	    missing_mask |= (3 * k1LU) << (2 * sample_idx_lowbits);
+	    ++sample_idx_lowbits;
+	  }
+	  if (geno_missing_invert) {
+	    missing_mask = ~missing_mask;
+	  }
+	  genovec_word |= missing_mask;
+	}
+	uint32_t dosage_present_hw = 0;
+	if (dosage_is_present) {
+	  // deliberate overflow
+	  uint32_t sample_idx_lowbits = 0xffffffffU;
+	  while (1) {
+	    ++sample_idx_lowbits;
+	    if (dosage_geomdist_max) {
+	      sample_idx_lowbits += uint64arr_geq(dosage_geomdist, dosage_geomdist_max, sfmt_genrand_uint64(sfmtp));
+	    }
+	    if (sample_idx_lowbits >= loop_len) {
+	      break;
+	    }
+	    if (((genovec_word >> (2 * sample_idx_lowbits)) & 3) == 3) {
+	      continue;
+	    }
+	    if (!rand16_left) {
+	      ullrand = sfmt_genrand_uint64(sfmtp);
+	      rand16_left = 4;
+	    }
+	    const uint32_t dosage_int = ((ullrand & 65535) + 1) / 2;
+	    ullrand >>= 16;
+	    --rand16_left;
+	    const uint32_t halfdist = biallelic_dosage_halfdist(dosage_int);
+	    if (halfdist < dosage_erase_halfdist) {
+	      *cur_dosage_vals_iter++ = dosage_int;
+	      dosage_present_hw |= 1U << sample_idx_lowbits;
+	      if (halfdist < hard_call_halfdist) {
+		genovec_word |= (3 * k1LU) << (2 * sample_idx_lowbits);
+		continue;
+	      }
+	    }
+	    genovec_word &= ~((3 * k1LU) << (2 * sample_idx_lowbits));
+	    genovec_word |= ((dosage_int + (kDosage4th * k1LU)) / kDosageMid) << (2 * sample_idx_lowbits);
+	  }
+	}
+	write_genovec_iter[widx] = genovec_word;
+	((halfword_t*)write_dosage_present_iter)[widx] = (halfword_t)dosage_present_hw;
+	++widx;
+      }
+      zero_trailing_quaters(sample_ct, write_genovec_iter);
+      const uint32_t dosage_ct = (uintptr_t)(cur_dosage_vals_iter - write_dosage_vals_iter);
+      *write_dosage_ct_iter++ = dosage_ct;
+      write_genovec_iter = &(write_genovec_iter[sample_ctaw2]);
+      write_dosage_present_iter = &(write_dosage_present_iter[sample_ctaw]);
+      write_dosage_vals_iter = &(write_dosage_vals_iter[sample_ct]);
+    }
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+static_assert(sizeof(dosage_t) == 2, "generate_dummy() needs to be updated.");
+pglerr_t generate_dummy(const gendummy_info_t* gendummy_info_ptr, misc_flags_t misc_flags, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, uint32_t max_thread_ct, char* outname, char* outname_end, chr_info_t* cip) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  threads_state_t ts;
+  init_threads3z(&ts);
+  st_pgen_writer_t spgw;
+  pglerr_t reterr = kPglRetSuccess;
+  spgw_preinit(&spgw);
+  {
+    finalize_chrset(misc_flags, cip);
+    if (!is_set(cip->chr_mask, 1)) {
+      logerrprint("Error: --dummy cannot be used when chromosome 1 is excluded.\n");
+      goto generate_dummy_ret_INVALID_CMDLINE;
+    }
+    if (is_set(cip->haploid_mask, 1)) {
+      logerrprint("Error: --dummy cannot be used to generate haploid data.\n");
+      goto generate_dummy_ret_INVALID_CMDLINE;
+    }
+    char chr1_name_buf[5];
+    char* chr1_name_end = chr_name_write(cip, 1, chr1_name_buf);
+    *chr1_name_end = '\t';
+    const uint32_t chr1_name_blen = 1 + (uintptr_t)(chr1_name_end - chr1_name_buf);
+    const uint32_t sample_ct = gendummy_info_ptr->sample_ct;
+    const uint32_t variant_ct = gendummy_info_ptr->variant_ct;
+    // missing pheno string is always "NA"
+    const gendummy_flags_t flags = gendummy_info_ptr->flags;
+    uint16_t alleles[13];
+    uint32_t four_alleles = 0;
+    if (flags & kfGenDummyAcgt) {
+      memcpy(alleles, "\tA\tC\tA\tG\tA\tT\tC\tG\tC\tT\tG\tT\tA", 26);
+      four_alleles = 1;
+    } else if (flags & kfGenDummy1234) {
+      memcpy(alleles, "\t1\t2\t1\t3\t1\t4\t2\t3\t2\t4\t3\t4\t1", 26);
+      four_alleles = 1;
+    } else if (flags & kfGenDummy12) {
+      memcpy(alleles, "\t1\t2\t1", 6);
+    } else {
+      memcpy(alleles, "\tA\tB\tA", 6);
+    }
+    char* textbuf = g_textbuf;
+    char* textbuf_flush = &(textbuf[kMaxMediumLine]);
+    strcpy(outname_end, ".pvar");
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto generate_dummy_ret_OPEN_FAIL;
+    }
+    char* write_iter = strcpya(textbuf, "#CHROM\tPOS\tID\tREF\tALT");
+    append_binary_eoln(&write_iter);
+    if (four_alleles) {
+      uint32_t urand = 0;
+      for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx) {
+	if (!(variant_idx % 8)) {
+	  if (write_iter >= textbuf_flush) {
+	    if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	      goto generate_dummy_ret_WRITE_FAIL;
+	    }
+	    write_iter = textbuf;
+	  }
+	  do {
+	    urand = sfmt_genrand_uint32(&g_sfmt);
+	  } while (urand < 425132032U); // 2^32 mod 12^8; rejection keeps urand uniform mod 12^8
+	}
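+	// rejection sampling: [425132032, 2^32) has length exactly 9 * 12^8,
+	// so urand's low eight base-12 digits, peeled off below, are unbiased
+	// indexes into the 12 ordered allele pairs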
+	const uint32_t quotient = urand / 12;
+	const uint32_t remainder = urand - (quotient * 12U);
+	urand = quotient;
+	write_iter = memcpya(write_iter, chr1_name_buf, chr1_name_blen);
+	write_iter = uint32toa(variant_idx, write_iter);
+	write_iter = strcpya(write_iter, "\tsnp");
+	write_iter = uint32toa(variant_idx, write_iter);
+	write_iter = memcpya(write_iter, &(alleles[remainder]), 4);
+	append_binary_eoln(&write_iter);
+      }
+    } else {
+      uint32_t urand = 0;
+      for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx) {
+	if (!(variant_idx % 32)) {
+	  if (write_iter >= textbuf_flush) {
+	    if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	      goto generate_dummy_ret_WRITE_FAIL;
+	    }
+	    write_iter = textbuf;
+	  }
+	  urand = sfmt_genrand_uint32(&g_sfmt);
+	}
+	const uint32_t remainder = urand & 1;
+	urand >>= 1;
+	write_iter = memcpya(write_iter, chr1_name_buf, chr1_name_blen);
+	write_iter = uint32toa(variant_idx, write_iter);
+	write_iter = strcpya(write_iter, "\tsnp");
+	write_iter = uint32toa(variant_idx, write_iter);
+	write_iter = memcpya(write_iter, &(alleles[remainder]), 4);
+	append_binary_eoln(&write_iter);
+      }
+    }
+    if (write_iter != textbuf) {
+      if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	goto generate_dummy_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto generate_dummy_ret_WRITE_FAIL;
+    }
+
+    strcpy(outname_end, ".psam");
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto generate_dummy_ret_OPEN_FAIL;
+    }
+    const uint32_t pheno_ct = gendummy_info_ptr->pheno_ct;
+    char* writebuf;
+    if (bigstack_alloc_c(kMaxMediumLine + 48 + pheno_ct * MAXV(kMaxMissingPhenostrBlen, 16), &writebuf)) {
+      goto generate_dummy_ret_NOMEM;
+    }
+    char* writebuf_flush = &(writebuf[kMaxMediumLine]);
+    uint32_t omp_slen = 2;
+    char output_missing_pheno[kMaxMissingPhenostrBlen];
+    if (misc_flags & kfMiscKeepAutoconv) {
+      // must use --output-missing-phenotype parameter, which we've validated
+      // to be consistent with --input-missing-phenotype
+      omp_slen = strlen(g_output_missing_pheno);
+      memcpy(output_missing_pheno, g_output_missing_pheno, omp_slen);
+    } else {
+      // use "NA" since that's always safe
+      memcpy(output_missing_pheno, "NA", 2);
+    }
+    write_iter = strcpya(writebuf, "#FID\tIID\tSEX");
+    for (uint32_t pheno_idx_p1 = 1; pheno_idx_p1 <= pheno_ct; ++pheno_idx_p1) {
+      write_iter = strcpya(write_iter, "\tPHENO");
+      write_iter = uint32toa(pheno_idx_p1, write_iter);
+    }
+    append_binary_eoln(&write_iter);
+    const uint32_t pheno_m_check = (gendummy_info_ptr->pheno_mfreq >= kRecip2m32 * 0.5);
+    const uint32_t pheno_m32 = (uint32_t)(gendummy_info_ptr->pheno_mfreq * 4294967296.0 - 0.5);
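+    // pheno_m32 is the missingness frequency in 2^32nds: a phenotype cell
+    // goes missing when a fresh 32-bit draw is <= pheno_m32.  pheno_m_check
+    // skips the draw entirely when the requested frequency rounds to zero.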
+    if ((flags & kfGenDummyScalarPheno) && pheno_ct) {
+      uint32_t saved_rnormal = 0;
+      double saved_rnormal_val = 0.0;
+      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	if (write_iter >= writebuf_flush) {
+	  if (fwrite_checked(writebuf, write_iter - writebuf, outfile)) {
+	    goto generate_dummy_ret_WRITE_FAIL;
+	  }
+	  write_iter = writebuf;
+	}
+	write_iter = memcpyl3a(write_iter, "per");
+	write_iter = uint32toa(sample_idx, write_iter);
+	write_iter = strcpya(write_iter, "\tper");
+	write_iter = uint32toa(sample_idx, write_iter);
+	// could add an option to include some males/unknown-sex samples
+	write_iter = strcpya(write_iter, "\t2");
+	for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	  *write_iter++ = '\t';
+	  if (pheno_m_check && (sfmt_genrand_uint32(&g_sfmt) <= pheno_m32)) {
+	    write_iter = memcpya(write_iter, output_missing_pheno, omp_slen);
+	  } else {
+	    double dxx;
+	    if (saved_rnormal) {
+	      dxx = saved_rnormal_val;
+	    } else {
+	      dxx = rand_normal(&g_sfmt, &saved_rnormal_val);
+	    }
+	    saved_rnormal = 1 - saved_rnormal;
+	    write_iter = dtoa_g(dxx, write_iter);
+	  }
+	}
+	append_binary_eoln(&write_iter);
+      }
+    } else {
+      uint32_t urand = sfmt_genrand_uint32(&g_sfmt);
+      uint32_t urand_bits_left = 32;
+      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	if (write_iter >= writebuf_flush) {
+	  if (fwrite_checked(writebuf, write_iter - writebuf, outfile)) {
+	    goto generate_dummy_ret_WRITE_FAIL;
+	  }
+	  write_iter = writebuf;
+	}
+	write_iter = memcpyl3a(write_iter, "per");
+	write_iter = uint32toa(sample_idx, write_iter);
+	write_iter = strcpya(write_iter, "\tper");
+	write_iter = uint32toa(sample_idx, write_iter);
+	write_iter = strcpya(write_iter, "\t2");
+	for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	  *write_iter++ = '\t';
+	  if (pheno_m_check && (sfmt_genrand_uint32(&g_sfmt) <= pheno_m32)) {
+	    write_iter = memcpya(write_iter, output_missing_pheno, omp_slen);
+	  } else {
+	    if (!urand_bits_left) {
+	      urand = sfmt_genrand_uint32(&g_sfmt);
+	      urand_bits_left = 32;
+	    }
+	    *write_iter++ = (char)((urand & 1) + '1');
+	    urand >>= 1;
+	    --urand_bits_left;
+	  }
+	}
+	append_binary_eoln(&write_iter);
+      }
+    }
+    if (write_iter != writebuf) {
+      if (fwrite_checked(writebuf, write_iter - writebuf, outfile)) {
+	goto generate_dummy_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto generate_dummy_ret_WRITE_FAIL;
+    }
+
+    bigstack_reset(writebuf);
+    strcpy(outname_end, ".pgen");
+    const double geno_mfreq = gendummy_info_ptr->geno_mfreq;
+    if (geno_mfreq < kRecip2m53) {
+      // beyond this point, 1-x may just be 1
+      g_geno_missing_geomdist[kBitsPerWordD2 - 1] = 0;
+    } else {
+      double remaining_prob = 1.0;
+      g_geno_missing_invert = (geno_mfreq > 0.5);
+      if (g_geno_missing_invert) {
+	for (uint32_t uii = 0; uii < kBitsPerWordD2; ++uii) {
+	  remaining_prob *= geno_mfreq;
+	  g_geno_missing_geomdist[uii] = -((uint64_t)(remaining_prob * k2m64));
+	}
+      } else {
+        const double geno_nmfreq = 1.0 - geno_mfreq;
+	for (uint32_t uii = 0; uii < kBitsPerWordD2; ++uii) {
+	  remaining_prob *= geno_nmfreq;
+	  g_geno_missing_geomdist[uii] = -((uint64_t)(remaining_prob * k2m64));
+	}
+      }
+    }
+    const double dosage_nfreq = 1.0 - gendummy_info_ptr->dosage_freq;
+    if (dosage_nfreq >= 1.0) {
+      g_dosage_geomdist[kBitsPerWordD2 - 1] = 0;
+    } else {
+      double remaining_prob = 1.0;
+      for (uint32_t uii = 0; uii < kBitsPerWordD2; ++uii) {
+	remaining_prob *= dosage_nfreq;
+	g_dosage_geomdist[uii] = -((uint64_t)(remaining_prob * k2m64));
+      }
+      uint32_t dosage_geomdist_max = kBitsPerWordD2;
+      for (; dosage_geomdist_max; --dosage_geomdist_max) {
+	if (g_dosage_geomdist[dosage_geomdist_max - 1] != 0) {
+	  break;
+	}
+      }
+      g_dosage_geomdist_max = dosage_geomdist_max;
+    }
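+    // g_geno_missing_geomdist[] and g_dosage_geomdist[] cache successive
+    // powers of the relevant per-draw probability, scaled to 2^64ths and
+    // negated mod 2^64, so the worker threads can sample geometric run
+    // lengths with single 64-bit comparisons; g_dosage_geomdist_max trims
+    // trailing entries that rounded to zero.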
+    uintptr_t spgw_alloc_cacheline_ct;
+    uint32_t max_vrec_len;
+    reterr = spgw_init_phase1(outname, nullptr, nullptr, variant_ct, sample_ct, (dosage_nfreq >= 1.0)? kfPgenGlobal0 : kfPgenGlobalDosagePresent, 1, &spgw, &spgw_alloc_cacheline_ct, &max_vrec_len);
+    if (reterr) {
+      goto generate_dummy_ret_1;
+    }
+    unsigned char* spgw_alloc;
+    if (bigstack_alloc_uc(spgw_alloc_cacheline_ct * kCacheline, &spgw_alloc)) {
+      goto generate_dummy_ret_NOMEM;
+    }
+    spgw_init_phase2(max_vrec_len, &spgw, spgw_alloc);
+
+    // thread-count-independent:
+    //   (everything after "2 *" rounded up to cacheline)
+    //   g_write_genovecs: 2 * sample_ctaw2 * sizeof(intptr_t) *
+    //                     main_block_size
+    //   g_write_dosage_cts: 2 * sizeof(int32_t) * main_block_size
+    //   g_write_dosage_presents: 2 * sample_ctaw * sizeof(intptr_t) *
+    //                            main_block_size
+    //   g_write_dosage_val_bufs: 2 * sample_ct * sizeof(dosage_t) *
+    //                            main_block_size
+    uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+    // saturates around 4 compute threads, both with and without dosage
+    // (todo: test this on something other than a MacBook Pro, could just be a
+    // hyperthreading artifact)
+    if (calc_thread_ct > 4) {
+      calc_thread_ct = 4;
+    }
+    if (bigstack_init_sfmtp(calc_thread_ct, 0)) {
+      goto generate_dummy_ret_NOMEM;
+    }
+    if (bigstack_alloc_thread(calc_thread_ct, &ts.threads)) {
+      goto generate_dummy_ret_NOMEM;
+    }
+    const uint32_t sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
+    const uint32_t sample_ctaw = BITCT_TO_ALIGNED_WORDCT(sample_ct);
+    uintptr_t cachelines_avail_m8 = bigstack_left() / kCacheline;
+    if (cachelines_avail_m8 < 8) {
+      goto generate_dummy_ret_NOMEM;
+    }
+    // we're making 8 allocations; be pessimistic re: rounding
+    cachelines_avail_m8 -= 8;
+    const uintptr_t bytes_req_per_in_block_variant = 2 * (sample_ctaw2 * sizeof(intptr_t) + sizeof(int32_t) + sample_ctaw * sizeof(intptr_t) + sample_ct * sizeof(dosage_t));
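+    // the leading 2 is for double-buffering: the compute threads fill one
+    // set of buffers while the writer drains the other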
+    uintptr_t main_block_size = (cachelines_avail_m8 * kCacheline) / bytes_req_per_in_block_variant;
+    if (main_block_size > 65536) {
+      main_block_size = 65536;
+    } else if (main_block_size < 8) {
+      // this threshold is arbitrary
+      goto generate_dummy_ret_NOMEM;
+    }
+    if (calc_thread_ct > main_block_size / 8) {
+      calc_thread_ct = main_block_size / 8;
+    }
+    ts.calc_thread_ct = calc_thread_ct;
+    g_calc_thread_ct = calc_thread_ct;
+    g_sample_ct = sample_ct;
+    if (bigstack_alloc_ul(sample_ctaw2 * main_block_size, &(g_write_genovecs[0])) ||
+	bigstack_alloc_ul(sample_ctaw2 * main_block_size, &(g_write_genovecs[1])) ||
+	bigstack_alloc_ui(main_block_size, &(g_write_dosage_cts[0])) ||
+	bigstack_alloc_ui(main_block_size, &(g_write_dosage_cts[1])) ||
+	bigstack_alloc_ul(sample_ctaw * main_block_size, &(g_write_dosage_presents[0])) ||
+	bigstack_alloc_ul(sample_ctaw * main_block_size, &(g_write_dosage_presents[1])) ||
+	bigstack_alloc_dosage(sample_ct * main_block_size, &(g_write_dosage_val_bufs[0])) ||
+	bigstack_alloc_dosage(sample_ct * main_block_size, &(g_write_dosage_val_bufs[1]))) {
+      // this should be impossible
+      assert(0);
+      goto generate_dummy_ret_NOMEM;
+    }
+    g_hard_call_halfdist = kDosage4th - hard_call_thresh;
+    g_dosage_erase_halfdist = kDosage4th - dosage_erase_thresh;
+
+    // Main workflow:
+    // 1. Set n=0
+    //
+    // 2. Spawn threads generating batch n genotype data
+    // 3. If n>0, write results for block (n-1)
+    // 4. Increment n by 1
+    // 5. Join threads
+    // 6. Goto step 2 unless eof
+    //
+    // 7. Write results for last block
+    uint32_t vidx_start = 0;
+    uint32_t prev_block_write_ct = 0;
+    uint32_t parity = 0;
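+    // parity selects which buffer set the writer drains; it flips once per
+    // iteration, mirroring the flip in generate_dummy_thread()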
+    while (1) {
+      uint32_t cur_block_write_ct = 0;
+      if (!ts.is_last_block) {
+	cur_block_write_ct = MINV(variant_ct - vidx_start, main_block_size);
+      }
+      if (vidx_start) {
+	join_threads3z(&ts);
+      }
+      if (!ts.is_last_block) {
+	g_cur_block_write_ct = cur_block_write_ct;
+	ts.is_last_block = (vidx_start + cur_block_write_ct == variant_ct);
+	ts.thread_func_ptr = generate_dummy_thread;
+	if (spawn_threads3z(vidx_start, &ts)) {
+	  goto generate_dummy_ret_THREAD_CREATE_FAIL;
+	}
+      }
+      parity = 1 - parity;
+      if (vidx_start) {
+	// write *previous* block results
+	uintptr_t* write_genovec_iter = g_write_genovecs[parity];
+	uint32_t* write_dosage_ct_iter = g_write_dosage_cts[parity];
+	uintptr_t* write_dosage_present_iter = g_write_dosage_presents[parity];
+	dosage_t* write_dosage_vals_iter = g_write_dosage_val_bufs[parity];
+	for (uint32_t vidx = vidx_start - prev_block_write_ct; vidx < vidx_start; ++vidx) {
+	  const uint32_t cur_dosage_ct = *write_dosage_ct_iter++;
+	  if (!cur_dosage_ct) {
+	    if (spgw_append_biallelic_genovec(write_genovec_iter, &spgw)) {
+	      goto generate_dummy_ret_WRITE_FAIL;
+	    }
+	  } else {
+	    if (spgw_append_biallelic_genovec_dosage16(write_genovec_iter, write_dosage_present_iter, write_dosage_vals_iter, cur_dosage_ct, &spgw)) {
+	      goto generate_dummy_ret_WRITE_FAIL;
+	    }
+	  }
+	  write_genovec_iter = &(write_genovec_iter[sample_ctaw2]);
+	  write_dosage_present_iter = &(write_dosage_present_iter[sample_ctaw]);
+	  write_dosage_vals_iter = &(write_dosage_vals_iter[sample_ct]);
+	}
+      }
+      if (vidx_start == variant_ct) {
+	break;
+      }
+      if (vidx_start) {
+	printf("\r--dummy: %uk variants written.", vidx_start / 1000);
+	fflush(stdout);
+      }
+      vidx_start += cur_block_write_ct;
+      prev_block_write_ct = cur_block_write_ct;
+    }
+    spgw_finish(&spgw);
+
+    putc_unlocked('\r', stdout);
+    *outname_end = '\0';
+    LOGPRINTFWW("Dummy data (%u sample%s, %u SNP%s) written to %s.pgen + %s.pvar + %s.psam .\n", sample_ct, (sample_ct == 1)? "" : "s", variant_ct, (variant_ct == 1)? "" : "s", outname, outname, outname);
+  }
+  while (0) {
+  generate_dummy_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  generate_dummy_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  generate_dummy_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  generate_dummy_ret_INVALID_CMDLINE:
+    reterr = kPglRetInvalidCmdline;
+    break;
+  generate_dummy_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ generate_dummy_ret_1:
+  if (spgw_cleanup(&spgw) && (!reterr)) {
+    reterr = kPglRetWriteFail;
+  }
+  threads3z_cleanup(&ts, &g_cur_block_write_ct);
+  fclose_cond(outfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+/*
+#ifdef __arm__
+  #error "Unaligned accesses in bitvec_resort()."
+#endif
+void bitvec_resort(const uintptr_t* bitvec, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, unsigned char* writebuf) {
+  const uint32_t sample_ctl_m1 = BITCT_TO_WORDCT(sample_ct) - 1;
+  uint32_t widx = 0;
+  uint32_t cur_word_entry_ct = kBitsPerWord;
+  const uint32_t* new_sample_idx_to_old_base = new_sample_idx_to_old;
+  uintptr_t* writebuf_walias = (uintptr_t*)writebuf;
+  while (1) {
+    if (widx == sample_ctl_m1) {
+      cur_word_entry_ct = 1 + ((sample_ct - 1) % kBitsPerWord);
+    }
+    uintptr_t cur_word = 0;
+    for (uint32_t uii = 0; uii < cur_word_entry_ct; ++uii) {
+      cur_word |= IS_SET(bitvec, new_sample_idx_to_old_base[uii]) << uii;
+    }
+    if (widx == sample_ctl_m1) {
+      memcpy(&(writebuf_walias[widx]), &cur_word, (cur_word_entry_ct + (CHAR_BIT - 1)) / CHAR_BIT);
+      return;
+    }
+    writebuf_walias[widx++] = cur_word;
+    new_sample_idx_to_old_base = &(new_sample_idx_to_old_base[kBitsPerWord]);
+  }
+}
+*/
+
+#ifdef __arm__
+  #error "Unaligned accesses in genovec_resort()."
+#endif
+void genovec_resort(const uintptr_t* genovec, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, unsigned char* writebuf) {
+  // writebuf need not be word-aligned
+  const uint32_t sample_ctl2_m1 = QUATERCT_TO_WORDCT(sample_ct) - 1;
+  uint32_t word_idx = 0;
+  uint32_t cur_word_entry_ct = kBitsPerWordD2;
+  const uint32_t* new_sample_idx_to_old_iter = new_sample_idx_to_old;
+  uintptr_t* writebuf_walias = (uintptr_t*)writebuf;
+  while (1) {
+    if (word_idx == sample_ctl2_m1) {
+      cur_word_entry_ct = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t cur_word = 0;
+    for (uint32_t uii = 0; uii < cur_word_entry_ct; ++uii) {
+      cur_word |= (GET_QUATERARR_ENTRY(genovec, new_sample_idx_to_old_iter[uii])) << (2 * uii);
+    }
+    if (word_idx == sample_ctl2_m1) {
+      memcpy(&(writebuf_walias[word_idx]), &cur_word, QUATERCT_TO_BYTECT(cur_word_entry_ct));
+      return;
+    }
+    writebuf_walias[word_idx++] = cur_word;
+    new_sample_idx_to_old_iter = &(new_sample_idx_to_old_iter[kBitsPerWordD2]);
+  }
+}
+
+void unpack_hphase(const uintptr_t* __restrict all_hets, const uintptr_t* __restrict phaseraw, uint32_t raw_sample_ct, uintptr_t** phasepresent_ptr, uintptr_t* __restrict phaseinfo) {
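+  // phaseraw layout, as consumed here: bit 0 of phaseraw[0] is an "explicit
+  // phasepresent" flag, and the following bits form one stream entry per
+  // het.  When the flag is clear, every het is phased and the stream bits
+  // are the phase orientations themselves; when it's set, the stream bits
+  // mark which hets are phased, and a second bitvector starting at
+  // phaseraw[1 + (raw_sample_ct / kBitsPerWord)] holds the orientations.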
+  const uintptr_t* phaseraw_iter = phaseraw;
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  uintptr_t phaseraw_word = *phaseraw_iter++;
+  uint32_t read_idx_lowbits = 1;
+  if (!(phaseraw_word & 1)) {
+    // phase always present
+    phaseraw_word >>= 1;
+    *phasepresent_ptr = nullptr;
+    for (uint32_t widx = 0; widx < raw_sample_ctl; ++widx) {
+      uintptr_t new_phasepresent_word = all_hets[widx];
+      uintptr_t new_phaseinfo_word = 0;
+      while (new_phasepresent_word) {
+	// this copies over bottom bit of new_phasepresent_word, retaining its
+	// position
+	if (read_idx_lowbits == kBitsPerWord) {
+	  phaseraw_word = *phaseraw_iter++;
+	  read_idx_lowbits = 0;
+	}
+	const uintptr_t new_phasepresent_word_mask = new_phasepresent_word - k1LU;
+	const uintptr_t reduced_phasepresent_word = new_phasepresent_word & new_phasepresent_word_mask;
+	new_phaseinfo_word += (new_phasepresent_word - reduced_phasepresent_word) * (phaseraw_word & 1);
+	++read_idx_lowbits;
+	phaseraw_word >>= 1;
+	new_phasepresent_word = reduced_phasepresent_word;
+      }
+      phaseinfo[widx] = new_phaseinfo_word;
+    }
+  } else {
+    phaseraw_word >>= 1;
+    uintptr_t* phasepresent = *phasepresent_ptr;
+    const uintptr_t* phaseinfo_read_iter = &(phaseraw[1 + (raw_sample_ct / kBitsPerWord)]);
+    uintptr_t phaseinfo_read_word = *phaseinfo_read_iter++;
+    uint32_t phaseinfo_read_idx_lowbits = 0;
+    for (uint32_t widx = 0; widx < raw_sample_ctl; ++widx) {
+      uintptr_t cur_hets = all_hets[widx];
+      uintptr_t new_phasepresent = 0;
+      uintptr_t new_phaseinfo = 0;
+      while (cur_hets) {
+	if (read_idx_lowbits == kBitsPerWord) {
+	  phaseraw_word = *phaseraw_iter++;
+	  read_idx_lowbits = 0;
+	}
+	const uintptr_t cur_hets_mask = cur_hets - k1LU;
+	const uintptr_t reduced_hets = cur_hets & cur_hets_mask;
+	if (phaseraw_word & 1) {
+	  if (phaseinfo_read_idx_lowbits == kBitsPerWord) {
+	    phaseinfo_read_word = *phaseinfo_read_iter++;
+	    phaseinfo_read_idx_lowbits = 0;
+	  }
+	  const uintptr_t cur_bit = cur_hets - reduced_hets;
+	  new_phasepresent += cur_bit;
+	  new_phaseinfo += cur_bit * (phaseinfo_read_word & 1);
+	  ++phaseinfo_read_idx_lowbits;
+	  phaseinfo_read_word >>= 1;
+	}
+	++read_idx_lowbits;
+	phaseraw_word >>= 1;
+	cur_hets = reduced_hets;
+      }
+      phasepresent[widx] = new_phasepresent;
+      phaseinfo[widx] = new_phaseinfo;
+    }
+  }
+}
+
+void unpack_hphase_subset(const uintptr_t* __restrict all_hets, const uintptr_t* __restrict phaseraw, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, uintptr_t** phasepresent_ptr, uintptr_t* __restrict phaseinfo) {
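+  // same input format as unpack_hphase(), but only bits belonging to samples
+  // in sample_include are emitted, repacked contiguously via popcount-based
+  // index arithmetic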
+  const uintptr_t* phaseraw_iter = phaseraw;
+  uintptr_t* phaseinfo_write_iter = phaseinfo;
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  uintptr_t phaseraw_word = *phaseraw_iter++;
+  uintptr_t phaseinfo_write_word = 0;
+  uint32_t read_idx_lowbits = 1;
+  uint32_t write_idx_lowbits = 0;
+  if (!(phaseraw_word & 1)) {
+    // phase always present
+    *phasepresent_ptr = nullptr;
+    for (uint32_t widx = 0; widx < raw_sample_ctl; ++widx) {
+      const uintptr_t cur_sample_include = sample_include[widx];
+      const uintptr_t geno_hets = all_hets[widx];
+      uintptr_t tmp_phaseinfo_write_word = 0;
+      if (geno_hets) {
+	const uint32_t read_idx_lowbits_end = read_idx_lowbits + popcount_long(geno_hets);
+	uintptr_t tmp_phaseinfo_input_word = phaseraw_word >> read_idx_lowbits;
+	if (read_idx_lowbits_end >= kBitsPerWord) {
+	  // this can read one word past the end of the data (when
+	  // read_idx_lowbits_end == kBitsPerWord and we're at the last
+	  // word); that's always safe here
+	  phaseraw_word = *phaseraw_iter++;
+	  if (read_idx_lowbits) {
+	    tmp_phaseinfo_input_word |= phaseraw_word << (kBitsPerWord - read_idx_lowbits);
+	  }
+	}
+	tmp_phaseinfo_input_word &= (~k0LU) >> (kBitsPerWord + read_idx_lowbits - read_idx_lowbits_end);
+	read_idx_lowbits = read_idx_lowbits_end % kBitsPerWord;
+	if (tmp_phaseinfo_input_word) {
+	  uintptr_t cur_masked_hets = cur_sample_include & geno_hets;
+	  while (cur_masked_hets) {
+	    const uintptr_t cur_masked_hets_and_arg = cur_masked_hets - k1LU;
+	    const uintptr_t lowmask = (cur_masked_hets ^ cur_masked_hets_and_arg) >> 1;
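+	    // lowmask has 1s strictly below the lowest set bit of
+	    // cur_masked_hets, so the popcounts below map this het's raw
+	    // position to its read- and write-stream indices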
+	    const uint32_t read_idx_offset = popcount_long(geno_hets & lowmask);
+	    uintptr_t shifted_phaseinfo_input_word = tmp_phaseinfo_input_word >> read_idx_offset;
+	    if (shifted_phaseinfo_input_word & 1) {
+	      tmp_phaseinfo_write_word |= (k1LU << popcount_long(cur_sample_include & lowmask));
+	      if (shifted_phaseinfo_input_word == 1) {
+		break;
+	      }
+	    }
+	    cur_masked_hets &= cur_masked_hets_and_arg;
+	  }
+	}
+        phaseinfo_write_word |= tmp_phaseinfo_write_word << write_idx_lowbits;
+      }
+      const uint32_t write_idx_lowbits_end = write_idx_lowbits + popcount_long(cur_sample_include);
+      if (write_idx_lowbits_end >= kBitsPerWord) {
+	*phaseinfo_write_iter++ = phaseinfo_write_word;
+	if (write_idx_lowbits) {
+	  phaseinfo_write_word = tmp_phaseinfo_write_word >> (kBitsPerWord - write_idx_lowbits);
+	} else {
+	  phaseinfo_write_word = 0;
+	}
+      }
+      write_idx_lowbits = write_idx_lowbits_end % kBitsPerWord;
+    }
+    if (write_idx_lowbits) {
+      *phaseinfo_write_iter = phaseinfo_write_word;
+    }
+    return;
+  }
+  const uintptr_t* phaseinfo_read_iter = &(phaseraw[1 + (raw_sample_ct / kBitsPerWord)]);
+  uintptr_t* phasepresent_write_iter = *phasepresent_ptr;
+  uintptr_t phaseinfo_read_word = *phaseinfo_read_iter++;
+  uintptr_t phasepresent_write_word = 0;
+  uint32_t phaseinfo_read_idx_lowbits = 0;
+  for (uint32_t widx = 0; widx < raw_sample_ctl; ++widx) {
+    const uintptr_t cur_sample_include = sample_include[widx];
+    const uintptr_t geno_hets = all_hets[widx];
+    uintptr_t tmp_phasepresent_write_word = 0;
+    uintptr_t tmp_phaseinfo_write_word = 0;
+    if (geno_hets) {
+      const uint32_t read_idx_lowbits_end = read_idx_lowbits + popcount_long(geno_hets);
+      uintptr_t tmp_phasepresent_input_word = phaseraw_word >> read_idx_lowbits;
+      if (read_idx_lowbits_end >= kBitsPerWord) {
+	// this can read one word past the end of the data (when
+	// read_idx_lowbits_end == kBitsPerWord and we're at the last word);
+	// that's always safe here
+	phaseraw_word = *phaseraw_iter++;
+	if (read_idx_lowbits) {
+	  tmp_phasepresent_input_word |= phaseraw_word << (kBitsPerWord - read_idx_lowbits);
+	}
+      }
+      tmp_phasepresent_input_word &= (~k0LU) >> (kBitsPerWord + read_idx_lowbits - read_idx_lowbits_end);
+      read_idx_lowbits = read_idx_lowbits_end % kBitsPerWord;
+      if (tmp_phasepresent_input_word) {
+	const uint32_t read_phasepresent_ct = popcount_long(tmp_phasepresent_input_word);
+	uintptr_t tmp_phaseinfo_input_word;
+	// avoid reading off end of phaseinfo here
+	if (phaseinfo_read_idx_lowbits != kBitsPerWord) {
+	  const uint32_t phaseinfo_read_idx_lowbits_end = phaseinfo_read_idx_lowbits + read_phasepresent_ct;
+	  tmp_phaseinfo_input_word = phaseinfo_read_word >> phaseinfo_read_idx_lowbits;
+	  if (phaseinfo_read_idx_lowbits_end < kBitsPerWord) {
+	    phaseinfo_read_idx_lowbits = phaseinfo_read_idx_lowbits_end;
+	  } else {
+	    phaseinfo_read_word = *phaseinfo_read_iter++;
+	    tmp_phaseinfo_input_word |= phaseinfo_read_word << (kBitsPerWord - phaseinfo_read_idx_lowbits);
+	    phaseinfo_read_idx_lowbits = phaseinfo_read_idx_lowbits_end - kBitsPerWord;
+	  }
+	} else {
+	  // special case, can't right-shift 64
+	  phaseinfo_read_word = *phaseinfo_read_iter++;
+	  phaseinfo_read_idx_lowbits = read_phasepresent_ct;
+	  tmp_phaseinfo_input_word = phaseinfo_read_word;
+	}
+	tmp_phaseinfo_input_word &= (~k0LU) >> (kBitsPerWord - read_phasepresent_ct);
+
+	uintptr_t cur_masked_hets = cur_sample_include & geno_hets;
+	while (cur_masked_hets) {
+	  const uintptr_t cur_masked_hets_and_arg = cur_masked_hets - k1LU;
+	  const uintptr_t lowmask = (cur_masked_hets ^ cur_masked_hets_and_arg) >> 1;
+	  const uint32_t read_idx_offset = popcount_long(geno_hets & lowmask);
+	  uintptr_t shifted_phasepresent_input_word = tmp_phasepresent_input_word >> read_idx_offset;
+	  if (shifted_phasepresent_input_word & 1) {
+	    const uintptr_t cur_bit = k1LU << popcount_long(cur_sample_include & lowmask);
+	    tmp_phasepresent_write_word |= cur_bit;
+	    tmp_phaseinfo_write_word += cur_bit * ((tmp_phaseinfo_input_word >> (read_phasepresent_ct - popcount_long(shifted_phasepresent_input_word))) & 1);
+	    if (shifted_phasepresent_input_word == 1) {
+	      break;
+	    }
+	  }
+	  cur_masked_hets &= cur_masked_hets_and_arg;
+	}
+      }
+      phasepresent_write_word |= tmp_phasepresent_write_word << write_idx_lowbits;
+      phaseinfo_write_word |= tmp_phaseinfo_write_word << write_idx_lowbits;
+    }
+    const uint32_t write_idx_lowbits_end = write_idx_lowbits + popcount_long(cur_sample_include);
+    if (write_idx_lowbits_end >= kBitsPerWord) {
+      *phasepresent_write_iter++ = phasepresent_write_word;
+      *phaseinfo_write_iter++ = phaseinfo_write_word;
+      if (write_idx_lowbits) {
+	const uint32_t rshift = kBitsPerWord - write_idx_lowbits;
+	phasepresent_write_word = tmp_phasepresent_write_word >> rshift;
+	phaseinfo_write_word = tmp_phaseinfo_write_word >> rshift;
+      } else {
+	phasepresent_write_word = 0;
+	phaseinfo_write_word = 0;
+      }
+    }
+    write_idx_lowbits = write_idx_lowbits_end % kBitsPerWord;
+  }
+  if (write_idx_lowbits) {
+    *phasepresent_write_iter = phasepresent_write_word;
+    *phaseinfo_write_iter = phaseinfo_write_word;
+  }
+}
+
+void unpack_and_resort_hphase(const uintptr_t* __restrict all_hets, const uintptr_t* __restrict phaseraw, const uintptr_t* sample_include, const uint32_t* old_sample_idx_to_new, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t** phasepresent_ptr, uintptr_t* __restrict phaseinfo) {
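+  // as above, but phase bits are scattered straight to their post-sort
+  // sample positions via old_sample_idx_to_new instead of being repacked in
+  // order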
+  const uintptr_t* phaseraw_iter = phaseraw;
+  const uint32_t* old_sample_idx_to_new_iter = old_sample_idx_to_new;
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+  uintptr_t phaseraw_word = *phaseraw_iter++;
+  uint32_t read_idx_lowbits = 1;
+  fill_ulong_zero(sample_ctl, phaseinfo);
+  if (!(phaseraw_word & 1)) {
+    // phase always present
+    *phasepresent_ptr = nullptr;
+    for (uint32_t widx = 0; widx < raw_sample_ctl; ++widx) {
+      uintptr_t new_phasepresent_word = all_hets[widx];
+      const uint32_t read_idx_lowbits_end = read_idx_lowbits + popcount_long(new_phasepresent_word);
+      uintptr_t tmp_phaseinfo_input_word = phaseraw_word >> read_idx_lowbits;
+      if (read_idx_lowbits_end >= kBitsPerWord) {
+	// always safe to read an extra word off the end
+	phaseraw_word = *phaseraw_iter++;
+	if (read_idx_lowbits) {
+	  tmp_phaseinfo_input_word |= phaseraw_word << (kBitsPerWord - read_idx_lowbits);
+	}
+      }
+      // no need to mask off top bits of tmp_phaseinfo_input_word
+      read_idx_lowbits = read_idx_lowbits_end % kBitsPerWord;
+      if (!sample_include) {
+	while (new_phasepresent_word) {
+	  const uint32_t sample_uidx_lowbits = CTZLU(new_phasepresent_word);
+	  if (tmp_phaseinfo_input_word & 1) {
+	    SET_BIT(old_sample_idx_to_new_iter[sample_uidx_lowbits], phaseinfo);
+	  }
+	  tmp_phaseinfo_input_word >>= 1;
+	  new_phasepresent_word &= new_phasepresent_word - k1LU;
+	}
+      } else {
+	uintptr_t masked_phasepresent_word = new_phasepresent_word & sample_include[widx];
+	while (masked_phasepresent_word) {
+	  const uint32_t sample_uidx_lowbits = CTZLU(masked_phasepresent_word);
+	  const uintptr_t lowmask = (k1LU << sample_uidx_lowbits) - k1LU;
+	  if ((tmp_phaseinfo_input_word >> popcount_long(new_phasepresent_word & lowmask)) & 1) {
+	    SET_BIT(old_sample_idx_to_new_iter[sample_uidx_lowbits], phaseinfo);
+	  }
+	  masked_phasepresent_word &= masked_phasepresent_word - k1LU;
+	}
+      }
+      old_sample_idx_to_new_iter = &(old_sample_idx_to_new_iter[kBitsPerWord]);
+    }
+    return;
+  }
+  uintptr_t* phasepresent = *phasepresent_ptr;
+  const uintptr_t* phaseinfo_read_iter = &(phaseraw[1 + (raw_sample_ct / kBitsPerWord)]);
+  uintptr_t phaseinfo_read_word = *phaseinfo_read_iter++;
+  uint32_t phaseinfo_read_idx_lowbits = 0;
+  fill_ulong_zero(sample_ctl, phasepresent);
+  for (uint32_t widx = 0; widx < raw_sample_ctl; ++widx) {
+    uintptr_t geno_hets = all_hets[widx];
+    if (geno_hets) {
+      const uint32_t read_idx_lowbits_end = read_idx_lowbits + popcount_long(geno_hets);
+      uintptr_t tmp_phasepresent_input_word = phaseraw_word >> read_idx_lowbits;
+      if (read_idx_lowbits_end >= kBitsPerWord) {
+	// this can read one word past the end of the data (when
+	// read_idx_lowbits_end == kBitsPerWord and we're at the last word);
+	// that's always safe here
+	phaseraw_word = *phaseraw_iter++;
+	if (read_idx_lowbits) {
+	  tmp_phasepresent_input_word |= phaseraw_word << (kBitsPerWord - read_idx_lowbits);
+	}
+      }
+      tmp_phasepresent_input_word &= (~k0LU) >> (kBitsPerWord + read_idx_lowbits - read_idx_lowbits_end);
+      read_idx_lowbits = read_idx_lowbits_end % kBitsPerWord;
+      if (tmp_phasepresent_input_word) {
+	const uint32_t read_phasepresent_ct = popcount_long(tmp_phasepresent_input_word);
+	uintptr_t tmp_phaseinfo_input_word;
+	// avoid reading off end of phaseinfo here
+	if (phaseinfo_read_idx_lowbits != kBitsPerWord) {
+	  const uint32_t phaseinfo_read_idx_lowbits_end = phaseinfo_read_idx_lowbits + read_phasepresent_ct;
+	  tmp_phaseinfo_input_word = phaseinfo_read_word >> phaseinfo_read_idx_lowbits;
+	  if (phaseinfo_read_idx_lowbits_end < kBitsPerWord) {
+	    phaseinfo_read_idx_lowbits = phaseinfo_read_idx_lowbits_end;
+	  } else {
+	    phaseinfo_read_word = *phaseinfo_read_iter++;
+	    tmp_phaseinfo_input_word |= phaseinfo_read_word << (kBitsPerWord - phaseinfo_read_idx_lowbits);
+	    phaseinfo_read_idx_lowbits = phaseinfo_read_idx_lowbits_end - kBitsPerWord;
+	  }
+	} else {
+	  // special case, can't right-shift 64
+	  phaseinfo_read_word = *phaseinfo_read_iter++;
+	  phaseinfo_read_idx_lowbits = read_phasepresent_ct;
+	  tmp_phaseinfo_input_word = phaseinfo_read_word;
+	}
+	// no need to mask off top bits of tmp_phaseinfo_input_word
+	if (!sample_include) {
+	  while (1) {
+	    if (tmp_phasepresent_input_word & 1) {
+	      const uint32_t new_sample_idx = old_sample_idx_to_new_iter[CTZLU(geno_hets)];
+	      SET_BIT(new_sample_idx, phasepresent);
+	      if (tmp_phaseinfo_input_word & 1) {
+		SET_BIT(new_sample_idx, phaseinfo);
+	      }
+	      if (tmp_phasepresent_input_word == 1) {
+		break;
+	      }
+	      tmp_phaseinfo_input_word >>= 1;
+	    }
+	    tmp_phasepresent_input_word >>= 1;
+	    geno_hets &= geno_hets - k1LU;
+	  }
+	} else {
+	  const uintptr_t sample_include_word = sample_include[widx];
+	  while (1) {
+	    if (tmp_phasepresent_input_word & 1) {
+	      const uint32_t sample_uidx_lowbits = CTZLU(geno_hets);
+	      if ((sample_include_word >> sample_uidx_lowbits) & 1) {
+		const uint32_t new_sample_idx = old_sample_idx_to_new_iter[sample_uidx_lowbits];
+		SET_BIT(new_sample_idx, phasepresent);
+		if (tmp_phaseinfo_input_word & 1) {
+		  SET_BIT(new_sample_idx, phaseinfo);
+		}
+	      }
+	      if (tmp_phasepresent_input_word == 1) {
+		break;
+	      }
+	      tmp_phaseinfo_input_word >>= 1;
+	    }
+	    tmp_phasepresent_input_word >>= 1;
+	    geno_hets &= geno_hets - k1LU;
+	  }
+	}
+      }
+    }
+    old_sample_idx_to_new_iter = &(old_sample_idx_to_new_iter[kBitsPerWord]);
+  }
+}
+
+void copy_dosage(const uintptr_t* __restrict dosageraw, uint32_t raw_sample_ct, uintptr_t* __restrict write_dosagepresent, dosage_t* write_dosagevals, uint32_t* write_dosage_ct_ptr) {
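+  // dosageraw layout: a raw-sample-indexed presence bitvector, padded to a
+  // vector boundary, immediately followed by one packed dosage_t per set bit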
+  const uint32_t raw_sample_ctaw = BITCT_TO_ALIGNED_WORDCT(raw_sample_ct);
+  const uintptr_t* read_dosagepresent = dosageraw;
+  const dosage_t* read_dosagevals = (const dosage_t*)(&(dosageraw[raw_sample_ctaw]));
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  const uint32_t dosage_ct = popcount_longs(read_dosagepresent, raw_sample_ctl);
+  *write_dosage_ct_ptr = dosage_ct;
+  memcpy(write_dosagepresent, read_dosagepresent, raw_sample_ctl * sizeof(intptr_t));
+  memcpy(write_dosagevals, read_dosagevals, dosage_ct * sizeof(dosage_t));
+}
+
+void copy_dosage_subset(const uintptr_t* __restrict dosageraw, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t* __restrict write_dosagepresent, dosage_t* write_dosagevals, uint32_t* __restrict write_dosage_ct_ptr) {
+  const uint32_t raw_sample_ctaw = BITCT_TO_ALIGNED_WORDCT(raw_sample_ct);
+  const uintptr_t* read_dosagepresent = dosageraw;
+  const dosage_t* read_dosagevals = (const dosage_t*)(&(dosageraw[raw_sample_ctaw]));
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  const uint32_t read_dosage_ct = popcount_longs(read_dosagepresent, raw_sample_ctl);
+  copy_bitarr_subset(read_dosagepresent, sample_include, sample_ct, write_dosagepresent);
+  uint32_t sample_uidx = 0;
+  dosage_t* write_dosagevals_iter = write_dosagevals;
+  for (uint32_t read_dosage_idx = 0; read_dosage_idx < read_dosage_ct; ++read_dosage_idx, ++sample_uidx) {
+    next_set_unsafe_ck(read_dosagepresent, &sample_uidx);
+    if (is_set(sample_include, sample_uidx)) {
+      *write_dosagevals_iter++ = read_dosagevals[read_dosage_idx];
+    }
+  }
+  *write_dosage_ct_ptr = (uintptr_t)(write_dosagevals_iter - write_dosagevals);
+}
+
+void copy_and_resort_dosage(const uintptr_t* __restrict dosageraw, const uint32_t* new_sample_idx_to_old, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t* __restrict write_dosagepresent, dosage_t* write_dosagevals, uint32_t* write_dosage_ct_ptr, uint32_t* cumulative_popcount_buf) {
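+  // cumulative_popcount_buf caches per-word prefix popcounts of the presence
+  // bitvector, so each old-index -> packed-value-index lookup in the loop
+  // below is O(1)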
+  const uint32_t raw_sample_ctaw = BITCT_TO_ALIGNED_WORDCT(raw_sample_ct);
+  const uintptr_t* read_dosagepresent = dosageraw;
+  const dosage_t* read_dosagevals = (const dosage_t*)(&(dosageraw[raw_sample_ctaw]));
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  fill_cumulative_popcounts(read_dosagepresent, raw_sample_ctl, cumulative_popcount_buf);
+  const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+  fill_ulong_zero(sample_ctl, write_dosagepresent);
+  dosage_t* write_dosagevals_iter = write_dosagevals;
+  for (uint32_t new_sample_idx = 0; new_sample_idx < sample_ct; ++new_sample_idx) {
+    const uint32_t old_sample_idx = new_sample_idx_to_old[new_sample_idx];
+    if (is_set(read_dosagepresent, old_sample_idx)) {
+      set_bit(new_sample_idx, write_dosagepresent);
+      const uint32_t old_dosagevals_idx = raw_to_subsetted_pos(read_dosagepresent, cumulative_popcount_buf, old_sample_idx);
+      *write_dosagevals_iter++ = read_dosagevals[old_dosagevals_idx];
+    }
+  }
+  *write_dosage_ct_ptr = (uintptr_t)(write_dosagevals_iter - write_dosagevals);
+}
+
+
+// more multithread globals
+static pgen_reader_t** g_pgr_ptrs = nullptr;
+static uintptr_t** g_genovecs = nullptr;
+static uintptr_t** g_dosage_presents = nullptr;
+static dosage_t** g_dosage_val_bufs = nullptr;
+static uint32_t* g_read_variant_uidx_starts = nullptr; // size calc_thread_ct
+
+static uint64_t* g_allele_dosages = nullptr;
+static uint32_t* g_raw_geno_cts = nullptr;
+static uint32_t* g_variant_missing_hc_cts = nullptr;
+static uint32_t* g_variant_missing_dosage_cts = nullptr;
+static uint32_t* g_variant_hethap_cts = nullptr;
+static uint64_t* g_founder_allele_dosages = nullptr;
+static uint32_t* g_founder_raw_geno_cts = nullptr;
+static uint32_t* g_x_male_geno_cts = nullptr;
+static uint32_t* g_founder_x_male_geno_cts = nullptr;
+static uint32_t* g_x_nosex_geno_cts = nullptr;
+static uint32_t* g_founder_x_nosex_geno_cts = nullptr;
+static double* g_mach_r2_vals = nullptr;
+
+static unsigned char* g_writebufs[2] = {nullptr, nullptr};
+
+static const uintptr_t* g_variant_include = nullptr;
+static const chr_info_t* g_cip = nullptr;
+static const uintptr_t* g_sample_include = nullptr;
+static uintptr_t* g_sample_include_interleaved_vec = nullptr;
+static uint32_t* g_sample_include_cumulative_popcounts = nullptr;
+static uintptr_t* g_sex_male = nullptr;
+static uintptr_t* g_sex_male_interleaved_vec = nullptr;
+static uintptr_t* g_sex_male_collapsed_interleaved = nullptr;
+static uintptr_t* g_sex_female_collapsed_interleaved = nullptr;
+static uint32_t* g_sex_male_cumulative_popcounts = nullptr;
+static uintptr_t* g_nosex_interleaved_vec = nullptr;
+static const uintptr_t* g_founder_info = nullptr;
+static uintptr_t* g_founder_info_interleaved_vec = nullptr;
+static uint32_t* g_founder_info_cumulative_popcounts = nullptr;
+static uintptr_t* g_founder_male = nullptr;
+static uintptr_t* g_founder_male_interleaved_vec = nullptr;
+static uint32_t* g_founder_male_cumulative_popcounts = nullptr;
+static uintptr_t* g_founder_nosex_interleaved_vec = nullptr;
+static const uintptr_t* g_variant_allele_idxs = nullptr;
+static const alt_allele_ct_t* g_refalt1_select = nullptr;
+static const uint32_t* g_collapsed_sort_map = nullptr;
+static const uint32_t* g_new_sample_idx_to_old = nullptr;
+static uint32_t* g_old_sample_idx_to_new = nullptr;
+static uint32_t g_raw_sample_ct = 0;
+// g_sample_ct, g_calc_thread_ct, g_cur_block_write_ct, g_hard_call_halfdist,
+// g_dosage_erase_halfdist, g_error_ret declared earlier
+static uint32_t g_founder_ct = 0;
+static uint32_t g_male_ct = 0;
+static uint32_t g_nosex_ct = 0;
+static uint32_t g_founder_male_ct = 0;
+static uint32_t g_founder_nosex_ct = 0;
+static uint32_t g_first_hap_uidx = 0;
+static pgen_global_flags_t g_read_phase_dosage_gflags = kfPgenGlobal0;
+
+// just store the beginning of each vblock for now
+// (may want to store record lengths later)
+static uintptr_t** g_loadbuf_thread_starts[2] = {nullptr, nullptr};
+
+// phase, dosage
+static unsigned char* g_loaded_vrtypes[2] = {nullptr, nullptr};
+
+static vul_t** g_thread_vecaligned_bufs = nullptr;
+static uintptr_t** g_thread_write_genovecs = nullptr;
+static uintptr_t** g_thread_write_phasepresents = nullptr;
+static uintptr_t** g_thread_write_phaseinfos = nullptr;
+static uintptr_t** g_thread_all_hets = nullptr;
+static uintptr_t** g_thread_write_dosagepresents = nullptr;
+static dosage_t** g_thread_write_dosagevals = nullptr;
+static uint32_t** g_thread_cumulative_popcount_bufs = nullptr;
+static pgen_writer_common_t** g_pwcs = nullptr;
+
+static uintptr_t* g_plink1_smaj_loadbuf_iter = nullptr;
+static uint32_t g_stride = 0;
+
+THREAD_FUNC_DECL plink1_smaj_transpose_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uint32_t sample_ct = g_sample_ct;
+  const uint32_t sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
+  const uint32_t write_batch_ct_m1 = (sample_ct - 1) / kPglQuaterTransposeBatch;
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  pgen_writer_common_t* pwcp = g_pwcs[tidx];
+  vul_t* vecaligned_buf = g_thread_vecaligned_bufs[tidx];
+  uintptr_t* write_genovec = g_thread_write_genovecs[tidx];
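+  // each thread owns a kPglVblockSize-variant slice per block; within it,
+  // genotypes are transposed in kPglQuaterTransposeBatch-wide tiles, with
+  // the final ragged sample batch handled by the MOD_NZ adjustment below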
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uintptr_t cur_block_write_ct = g_cur_block_write_ct;
+    const uint32_t loadbuf_ul_stride = g_stride;
+    uint32_t write_idx = tidx * kPglVblockSize;
+    uintptr_t* read_iter = &(g_plink1_smaj_loadbuf_iter[write_idx / kBitsPerWordD2]);
+    const uint32_t write_idx_end = MINV(write_idx + kPglVblockSize, cur_block_write_ct);
+    while (write_idx < write_idx_end) {
+      const uintptr_t* read_iter2 = read_iter;
+      // uintptr_t* write_iter = write_genovec;
+      const uint32_t vblock_size = MINV(kPglQuaterTransposeBatch, write_idx_end - write_idx);
+      uint32_t write_batch_idx = 0;
+      uint32_t read_batch_size = kPglQuaterTransposeBatch;
+      while (1) {
+	if (write_batch_idx >= write_batch_ct_m1) {
+	  if (write_batch_idx > write_batch_ct_m1) {
+	    break;
+	  }
+	  read_batch_size = MOD_NZ(sample_ct, kPglQuaterTransposeBatch);
+	}
+	transpose_quaterblock(read_iter2, loadbuf_ul_stride, sample_ctaw2, read_batch_size, vblock_size, &(write_genovec[write_batch_idx * kPglQuaterTransposeWords]), vecaligned_buf);
+	read_iter2 = &(read_iter2[kPglQuaterTransposeBatch * loadbuf_ul_stride]);
+	++write_batch_idx;
+      }
+      for (uint32_t uii = 0; uii < vblock_size; ++uii) {
+	uintptr_t* cur_write_genovec = &(write_genovec[uii * sample_ctaw2]);
+	pgr_plink1_to_plink2_inplace_unsafe(sample_ct, cur_write_genovec);
+	zero_trailing_quaters(sample_ct, cur_write_genovec);
+	pwc_append_biallelic_genovec(cur_write_genovec, pwcp);
+      }
+      write_idx += vblock_size;
+      read_iter = &(read_iter[kPglQuaterTransposeWords]);
+    }
+    if ((tidx == calc_thread_ct - 1) || is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+  }
+}
+
+pglerr_t plink1_sample_major_to_pgen(const char* pgenname, uintptr_t variant_ct, uintptr_t sample_ct, uint32_t real_ref_alleles, uint32_t max_thread_ct, FILE* infile) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  mt_pgen_writer_t* mpgwp = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    // file size already validated by pgfi_init_phase1()
+    LOGPRINTFWW("Sample-major .bed file detected.  Transposing to %s .\n", pgenname);
+    fputs("0%", stdout);
+    fflush(stdout);
+    if ((!variant_ct) || (!sample_ct)) {
+      // todo: hardcoded 12-byte write
+      logprint("\n");
+      logerrprint("Error: Zero-variant/zero-sample .pgen writing is not currently supported.\n");
+      reterr = kPglRetNotYetSupported;
+      goto plink1_sample_major_to_pgen_ret_1;
+    }
+    const uint32_t variant_ct4 = QUATERCT_TO_BYTECT(variant_ct);
+    unsigned char* raw_loadbuf = nullptr;
+    uint32_t raw_load_batch_size = 1;
+    if (variant_ct4 < 5120) {
+      // rows this short span few 4K disk blocks, so fseek()ing between them
+      // can't skip much reading; batch multiple samples per fread() instead
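+      // (e.g. variant_ct4 == 1000 gives batches of 132 samples, ~129KB per
+      // fread())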
+      raw_load_batch_size += 131071 / variant_ct4;
+      if (bigstack_alloc_uc(raw_load_batch_size * variant_ct4, &raw_loadbuf)) {
+	goto plink1_sample_major_to_pgen_ret_NOMEM;
+      }
+    }
+    const uint32_t raw_load_batch_ct_m1 = (sample_ct - 1) / raw_load_batch_size;
+    if (!raw_load_batch_ct_m1) {
+      raw_load_batch_size = sample_ct;
+    }
+    const uint32_t raw_load_batch_ct = raw_load_batch_ct_m1 + 1;
+    uintptr_t alloc_base_cacheline_ct;
+    uint64_t mpgw_per_thread_cacheline_ct;
+    uint32_t vrec_len_byte_ct;
+    uint64_t vblock_cacheline_ct;
+    mpgw_init_phase1(nullptr, variant_ct, sample_ct, kfPgenGlobal0, &alloc_base_cacheline_ct, &mpgw_per_thread_cacheline_ct, &vrec_len_byte_ct, &vblock_cacheline_ct);
+#ifndef __LP64__
+    if ((mpgw_per_thread_cacheline_ct > (0x7fffffff / kCacheline)) || (vblock_cacheline_ct > (0x7fffffff / kCacheline))) {
+      goto plink1_sample_major_to_pgen_ret_NOMEM;
+    }
+#endif
+
+    uint32_t calc_thread_ct = DIV_UP(variant_ct, kPglVblockSize);
+    if (calc_thread_ct >= max_thread_ct) {
+      calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+    }
+    mpgwp = (mt_pgen_writer_t*)bigstack_alloc((calc_thread_ct + DIV_UP(sizeof(mt_pgen_writer_t), kBytesPerWord)) * sizeof(intptr_t));
+    if (!mpgwp) {
+      goto plink1_sample_major_to_pgen_ret_NOMEM;
+    }
+    mpgwp->pgen_outfile = nullptr;
+    pthread_t* threads;
+    if (bigstack_alloc_thread(calc_thread_ct, &threads) ||
+	bigstack_alloc_vp(calc_thread_ct, &g_thread_vecaligned_bufs) ||
+	bigstack_alloc_ulp(calc_thread_ct, &g_thread_write_genovecs)) {
+      goto plink1_sample_major_to_pgen_ret_NOMEM;
+    }
+    g_pwcs = &(mpgwp->pwcs[0]);
+    uintptr_t cachelines_avail = bigstack_left() / kCacheline;
+    // inner loop transposes kPglQuaterTransposeBatch variants at a time
+    const uintptr_t transpose_thread_cacheline_ct = kPglQuaterTransposeBufbytes / kCacheline + QUATERCT_TO_VECCT(sample_ct) * (kPglQuaterTransposeBatch / kVecsPerCacheline);
+    if (cachelines_avail < calc_thread_ct * ((uint64_t)transpose_thread_cacheline_ct)) {
+      goto plink1_sample_major_to_pgen_ret_NOMEM;
+    }
+    for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+      g_thread_vecaligned_bufs[tidx] = (vul_t*)bigstack_alloc_raw(kPglQuaterTransposeBufbytes);
+      g_thread_write_genovecs[tidx] = (uintptr_t*)bigstack_alloc_raw(QUATERCT_TO_VECCT(sample_ct) * kBytesPerVec * kPglQuaterTransposeBatch);
+    }
+    cachelines_avail = bigstack_left() / kCacheline;
+    // Main workflow:
+    // 1. Load next calc_thread_ct * load_multiplier * kPglVblockSize
+    //    variants.
+    //    calc_thread_ct is reduced as necessary to ensure the compression
+    //    write buffers use <= 1/8 of total workspace.
+    //    with calc_thread_ct determined, load_multiplier is then chosen to use
+    //    as much of the remaining workspace as possible.
+    // 2. Repeat load_multiplier times:
+    //    a. Spawn threads processing calc_thread_ct vblocks
+    //    b. Join threads
+    //    c. Flush results
+    // 3. Goto step 1 unless eof.  (load_multiplier may be smaller on last
+    //    iteration.)
+    // No double-buffering here, since the main bottleneck is how many
+    // variants we can load at once.
+    if ((cachelines_avail / 8) < alloc_base_cacheline_ct + mpgw_per_thread_cacheline_ct * calc_thread_ct) {
+      if ((cachelines_avail / 8) < alloc_base_cacheline_ct + mpgw_per_thread_cacheline_ct) {
+	// possible todo: simple single-threaded fallback
+	goto plink1_sample_major_to_pgen_ret_NOMEM;
+      }
+      calc_thread_ct = ((cachelines_avail / 8) - alloc_base_cacheline_ct) / mpgw_per_thread_cacheline_ct;
+    }
+    // todo: determine appropriate calc_thread_ct limit.  (should not be less
+    // than 7-8.)
+    unsigned char* mpgw_alloc = bigstack_alloc_raw((alloc_base_cacheline_ct + mpgw_per_thread_cacheline_ct * calc_thread_ct) * kCacheline);
+    reterr = mpgw_init_phase2(pgenname, nullptr, nullptr, variant_ct, sample_ct, kfPgenGlobal0, 2 - real_ref_alleles, vrec_len_byte_ct, vblock_cacheline_ct, calc_thread_ct, mpgw_alloc, mpgwp);
+    if (reterr) {
+      goto plink1_sample_major_to_pgen_ret_1;
+    }
+    cachelines_avail = bigstack_left() / kCacheline;
+    const uint64_t full_load_vecs_req = sample_ct * ((uint64_t)QUATERCT_TO_ALIGNED_WORDCT(variant_ct));
+    uintptr_t* plink1_smaj_loadbuf;
+    uint32_t load_multiplier;
+    uint32_t cur_vidx_ct;
+    if (full_load_vecs_req > cachelines_avail * kVecsPerCacheline) {
+      // each iteration requires ((kPglVblockSize / 4) * calc_thread_ct *
+      //   sample_ct) bytes to be loaded
+      load_multiplier = cachelines_avail / ((kPglVblockSize / (4 * kCacheline)) * calc_thread_ct * ((uintptr_t)sample_ct));
+      assert(load_multiplier);
+      cur_vidx_ct = load_multiplier * calc_thread_ct * kPglVblockSize;
+      plink1_smaj_loadbuf = (uintptr_t*)bigstack_alloc_raw_rd((cur_vidx_ct / 4) * ((uintptr_t)sample_ct));
+    } else {
+      load_multiplier = 1 + ((variant_ct - 1) / (calc_thread_ct * kPglVblockSize));
+      cur_vidx_ct = variant_ct;
+      plink1_smaj_loadbuf = (uintptr_t*)bigstack_alloc_raw_rd(full_load_vecs_req * kBytesPerVec);
+    }
+    uint32_t cur_vidx_base = 0;
+    uint32_t cur_vidx_ct4 = QUATERCT_TO_BYTECT(cur_vidx_ct);
+    uint32_t cur_vidx_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(cur_vidx_ct);
+    uint32_t pass_idx = 0;
+    const uint32_t pass_ct = 1 + (variant_ct - 1) / cur_vidx_ct;
+    g_sample_ct = sample_ct;
+    g_stride = QUATERCT_TO_VECCT(cur_vidx_ct) * kWordsPerVec;
+    g_calc_thread_ct = calc_thread_ct;
+    while (1) {
+      uint32_t raw_load_batch_idx = 0;
+      uint32_t cur_raw_load_batch_size = raw_load_batch_size;
+      uintptr_t* smaj_loadbuf_iter = plink1_smaj_loadbuf;
+      ++pass_idx;
+      putc_unlocked('\r', stdout);
+      printf("Pass %u/%u: loading... 0%%", pass_idx, pass_ct);
+      fflush(stdout);
+      uint32_t pct = 0;
+      uint32_t next_print_idx = raw_load_batch_ct / 100;
+      const uint64_t seek_addl_offset = 3 + cur_vidx_base / 4;
+      while (1) {
+	if (raw_load_batch_size == 1) {
+	  if (fseeko(infile, seek_addl_offset + raw_load_batch_idx * ((uint64_t)variant_ct4), SEEK_SET)) {
+	    goto plink1_sample_major_to_pgen_ret_READ_FAIL;
+	  }
+	  if (!fread(smaj_loadbuf_iter, cur_vidx_ct4, 1, infile)) {
+	    goto plink1_sample_major_to_pgen_ret_READ_FAIL;
+	  }
+	  smaj_loadbuf_iter = &(smaj_loadbuf_iter[cur_vidx_ctaw2]);
+	} else {
+	  if (!fread(raw_loadbuf, cur_raw_load_batch_size * variant_ct4, 1, infile)) {
+	    goto plink1_sample_major_to_pgen_ret_READ_FAIL;
+	  }
+	  unsigned char* raw_loadbuf_iter = &(raw_loadbuf[cur_vidx_base / 4]);
+	  for (uint32_t uii = 0; uii < cur_raw_load_batch_size; ++uii) {
+	    memcpy(smaj_loadbuf_iter, raw_loadbuf_iter, cur_vidx_ct4);
+	    raw_loadbuf_iter = &(raw_loadbuf_iter[variant_ct4]);
+	    smaj_loadbuf_iter = &(smaj_loadbuf_iter[cur_vidx_ctaw2]);
+	  }
+	}
+	++raw_load_batch_idx;
+	if (raw_load_batch_idx >= raw_load_batch_ct_m1) {
+	  if (raw_load_batch_idx > raw_load_batch_ct_m1) {
+	    break;
+	  }
+	  cur_raw_load_batch_size = sample_ct - raw_load_batch_idx * raw_load_batch_size;
+	}
+	if (raw_load_batch_idx >= next_print_idx) {
+	  if (pct > 10) {
+	    putc_unlocked('\b', stdout);
+	  }
+	  pct = (raw_load_batch_idx * 100LLU) / raw_load_batch_ct;
+	  printf("\b\b%u%%", pct++);
+	  fflush(stdout);
+	  next_print_idx = (pct * ((uint64_t)raw_load_batch_ct)) / 100;
+	}
+      }
+      const uintptr_t last_tidx = calc_thread_ct - 1;
+      uint32_t load_idx = 0;
+      g_cur_block_write_ct = calc_thread_ct * kPglVblockSize;
+      uint32_t is_last_block;
+      putc_unlocked('\r', stdout);
+      printf("Pass %u/%u: transposing and compressing... 0%%", pass_idx, pass_ct);
+      pct = 0;
+      next_print_idx = load_idx / 100;
+      do {
+	if (load_idx >= next_print_idx) {
+	  if (pct > 10) {
+	    putc_unlocked('\b', stdout);
+	  }
+	  pct = (load_idx * 100LLU) / load_multiplier;
+	  printf("\b\b%u%%", pct++);
+	  fflush(stdout);
+	  next_print_idx = (pct * ((uint64_t)load_multiplier)) / 100;
+	}
+	g_plink1_smaj_loadbuf_iter = &(plink1_smaj_loadbuf[load_idx * calc_thread_ct * (kPglVblockSize / kBitsPerWordD2)]);
+	is_last_block = (++load_idx == load_multiplier);
+	if (is_last_block) {
+	  g_cur_block_write_ct = cur_vidx_ct - (load_idx - 1) * calc_thread_ct * kPglVblockSize;
+	}
+	if (last_tidx) {
+	  if (spawn_threads2z(plink1_smaj_transpose_thread, last_tidx, is_last_block, threads)) {
+	    goto plink1_sample_major_to_pgen_ret_THREAD_CREATE_FAIL;
+	  }
+	}
+	plink1_smaj_transpose_thread((uintptr_t*)last_tidx);
+	if (last_tidx) {
+	  join_threads2z(last_tidx, is_last_block, threads);
+	}
+	reterr = mpgw_flush(mpgwp);
+	if (reterr) {
+	  if (!is_last_block) {
+	    g_cur_block_write_ct = 0;
+	    error_cleanup_threads2z(plink1_smaj_transpose_thread, last_tidx, threads);
+	  }
+	  goto plink1_sample_major_to_pgen_ret_WRITE_FAIL;
+	}
+      } while (!is_last_block);
+      cur_vidx_base += cur_vidx_ct;
+      if (cur_vidx_base == variant_ct) {
+	if (pct > 10) {
+	  putc_unlocked('\b', stdout);
+	}
+	break;
+      }
+      fputs("\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b                     ", stdout);
+      // assumes pgfi_init_phase1() leaves file pointer at byte 3; otherwise,
+      // necessary to put this at top of main loop
+      if (fseeko(infile, 3, SEEK_SET)) {
+	goto plink1_sample_major_to_pgen_ret_READ_FAIL;
+      }
+      if (variant_ct - cur_vidx_base <= cur_vidx_ct) {
+	cur_vidx_ct = variant_ct - cur_vidx_base;
+	cur_vidx_ct4 = QUATERCT_TO_BYTECT(cur_vidx_ct);
+        cur_vidx_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(cur_vidx_ct);
+        g_stride = QUATERCT_TO_VECCT(cur_vidx_ct) * kWordsPerVec;
+        load_multiplier = 1 + (cur_vidx_ct - 1) / (kPglVblockSize * calc_thread_ct);
+      }
+    }
+    mpgwp = nullptr;
+    fputs("\b\bdone.\n", stdout);
+    LOGPRINTF("Transpose complete.\n");
+  }
+  while (0) {
+  plink1_sample_major_to_pgen_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  plink1_sample_major_to_pgen_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  plink1_sample_major_to_pgen_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  plink1_sample_major_to_pgen_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ plink1_sample_major_to_pgen_ret_1:
+  if (mpgw_cleanup(mpgwp) && (!reterr)) {
+    reterr = kPglRetWriteFail;
+  }
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+THREAD_FUNC_DECL load_allele_and_geno_counts_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  pgen_reader_t* pgrp = g_pgr_ptrs[tidx];
+  const uintptr_t* variant_include = g_variant_include;
+  const chr_info_t* cip = g_cip;
+  const uintptr_t* variant_allele_idxs = g_variant_allele_idxs;
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  const uint32_t subset_ct = (g_founder_info != nullptr) + 1;
+  const uint32_t raw_sample_ct = g_raw_sample_ct;
+  const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+  const uint32_t first_hap_uidx = g_first_hap_uidx;
+  const int32_t y_code = cip->xymt_codes[kChrOffsetY];
+  uintptr_t* genovec = g_genovecs[tidx];
+  uintptr_t* dosage_present = nullptr;
+  dosage_t* dosage_vals = nullptr;
+  if (g_dosage_presents) {
+    dosage_present = g_dosage_presents[tidx];
+    dosage_vals = g_dosage_val_bufs[tidx];
+  }
+  uint32_t is_y = 0;
+  uint32_t is_nonxy_haploid = 0;
+  uint32_t x_start = 0;
+  int32_t x_code;
+  if (xymt_exists(cip, kChrOffsetX, &x_code)) {
+    const uint32_t x_chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)x_code];
+    x_start = cip->chr_fo_vidx_start[x_chr_fo_idx];
+  }
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uintptr_t cur_block_write_ct = g_cur_block_write_ct;
+    // no overflow danger since cur_block_write_ct <= 2^16, tidx < (2^16 - 1)
+    const uint32_t cur_idx_end = ((tidx + 1) * cur_block_write_ct) / calc_thread_ct;
+    const uintptr_t* sample_include = g_sample_include;
+    const uintptr_t* sample_include_interleaved_vec = g_sample_include_interleaved_vec;
+    const uint32_t* sample_include_cumulative_popcounts = g_sample_include_cumulative_popcounts;
+    const uintptr_t* sex_male = g_sex_male;
+    const uintptr_t* sex_male_interleaved_vec = g_sex_male_interleaved_vec;
+    const uint32_t* sex_male_cumulative_popcounts = g_sex_male_cumulative_popcounts;
+    const uintptr_t* nosex_interleaved_vec = g_nosex_interleaved_vec;
+    uint32_t sample_ct = g_sample_ct;
+    uint32_t male_ct = g_male_ct;
+    uint32_t nosex_ct = g_nosex_ct;
+    uint64_t* allele_dosages = g_allele_dosages;
+    uint32_t* raw_geno_cts = g_raw_geno_cts;
+    uint32_t* variant_missing_hc_cts = g_variant_missing_hc_cts;
+    uint32_t* variant_missing_dosage_cts = g_variant_missing_dosage_cts;
+    uint32_t* variant_hethap_cts = g_variant_hethap_cts;
+    uint32_t* x_male_geno_cts = g_x_male_geno_cts;
+    uint32_t* x_nosex_geno_cts = g_x_nosex_geno_cts;
+    double* mach_r2_vals = g_mach_r2_vals;
+    uint32_t subset_idx = 0;
+    uint32_t dosage_ct = 0;
+    while (1) {
+      uint32_t cur_idx = (tidx * cur_block_write_ct) / calc_thread_ct;
+      uint32_t variant_uidx = g_read_variant_uidx_starts[tidx];
+      uint32_t chr_end = 0;
+      uint32_t is_x_or_y = 0;
+      pglerr_t reterr = kPglRetSuccess;
+
+      // different approach will be needed for multiallelic case...
+      uint32_t genocounts[4];
+      uint32_t sex_specific_genocounts[4];
+      for (; cur_idx < cur_idx_end; ++cur_idx, ++variant_uidx) {
+	next_set_unsafe_ck(variant_include, &variant_uidx);
+	if (variant_uidx >= chr_end) {
+	  const uint32_t chr_fo_idx = get_variant_chr_fo_idx(cip, variant_uidx);
+	  const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	  chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	  is_y = 0;
+	  is_nonxy_haploid = 0;
+	  if (chr_idx == x_code) {
+	    is_x_or_y = 1;
+	    pgr_clear_ld_cache(pgrp);
+	  } else if (chr_idx == y_code) {
+	    is_x_or_y = 1;
+	    is_y = 1;
+	    pgr_clear_ld_cache(pgrp);
+	  } else {
+	    if (is_x_or_y) {
+	      pgr_clear_ld_cache(pgrp);
+	    }
+	    is_x_or_y = 0;
+	    // no way for this to happen now unless everything is haploid?
+	    is_nonxy_haploid = is_set(cip->haploid_mask, chr_idx);
+	  }
+	}
+	const uintptr_t cur_variant_allele_idx = variant_allele_idxs? variant_allele_idxs[variant_uidx] : (2 * variant_uidx);
+	// insert cur_allele_ct == 2 check here
+	uint64_t cur_dosages[2];
+	uint32_t hethap_ct;
+	if (!is_x_or_y) {
+	  // call pgr_get_refalt1_genotype_counts() instead when dosages not
+	  // needed?
+	  reterr = pgr_get_ref_nonref_genotype_counts_and_dosage16s(sample_include, sample_include_interleaved_vec, sample_include_cumulative_popcounts, sample_ct, variant_uidx, pgrp, mach_r2_vals? (&(mach_r2_vals[variant_uidx])) : nullptr, genocounts, cur_dosages);
+	  if (reterr) {
+	    g_error_ret = reterr;
+	    break;
+	  }
+	  if (!is_nonxy_haploid) {
+	    // in multiallelic case, check ref vs. non-ref...	    
+	    hethap_ct = 0;
+	    if (allele_dosages) {
+	      // ...but save all allele counts here.
+	      // workhorse multiallelic count function should return both
+	      // individual allele counts and (at least if appropriate
+	      // parameter is not nullptr) hom-altx total.
+	      allele_dosages[cur_variant_allele_idx] = cur_dosages[0] * 2;
+	      allele_dosages[cur_variant_allele_idx + 1] = cur_dosages[1] * 2;
+	    }
+	  } else {
+	    hethap_ct = genocounts[1];
+	    if (allele_dosages) {
+	      allele_dosages[cur_variant_allele_idx] = cur_dosages[0];
+	      allele_dosages[cur_variant_allele_idx + 1] = cur_dosages[1];
+	    }
+	  }
+	} else if (is_y) {
+	  reterr = pgr_get_ref_nonref_genotype_counts_and_dosage16s(sex_male, sex_male_interleaved_vec, sex_male_cumulative_popcounts, male_ct, variant_uidx, pgrp, nullptr, genocounts, cur_dosages);
+	  if (reterr) {
+	    g_error_ret = reterr;
+	    break;
+	  }
+	  hethap_ct = genocounts[1];
+	  if (allele_dosages) {
+	    allele_dosages[cur_variant_allele_idx] = cur_dosages[0];
+	    allele_dosages[cur_variant_allele_idx + 1] = cur_dosages[1];
+	  }
+	} else {
+	  // chrX
+	  uint32_t is_explicit_alt1;
+	  reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(nullptr, nullptr, raw_sample_ct, variant_uidx, pgrp, genovec, dosage_present, dosage_vals, &dosage_ct, &is_explicit_alt1);
+	  if (reterr) {
+	    g_error_ret = reterr;
+	    break;
+	  }
+	  // assert(!is_explicit_alt1);
+	  if (sample_ct == raw_sample_ct) {
+	    zero_trailing_quaters(raw_sample_ct, genovec);
+	    genovec_count_freqs_unsafe(genovec, sample_ct, genocounts);
+	  } else {
+	    genovec_count_subset_freqs(genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
+	  }
+	  genovec_count_subset_freqs(genovec, sex_male_interleaved_vec, raw_sample_ct, male_ct, sex_specific_genocounts);
+	  hethap_ct = sex_specific_genocounts[1];
+	  if (allele_dosages) {
+	    uint32_t sample_uidx = 0;
+	    uintptr_t replaced_ct = 0; // nonmales count twice
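+	    // starting value for the weighted alt1 hardcall count below:
+	    // weight-2 everybody (4 * genocounts[2] + 2 * genocounts[1]),
+	    // then subtract one male share (2 * sex_specific_genocounts[2] +
+	    // hethap_ct) since males only count once.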
+	    uintptr_t alt1_ct = 4 * genocounts[2] + 2 * genocounts[1] - 2 * sex_specific_genocounts[2] - hethap_ct; // nonmales count twice
+	    uint64_t alt1_dosage = 0; // in 32768ths, nonmales count twice
+	    uint32_t included_dosage_ct = 0; // nonmales count twice
+	    if (sample_ct == raw_sample_ct) {
+	      for (uint32_t dosage_idx = 0; dosage_idx < dosage_ct; ++dosage_idx, ++sample_uidx) {
+		next_set_unsafe_ck(dosage_present, &sample_uidx);
+		const uintptr_t cur_dosage_val = dosage_vals[dosage_idx];
+		const uintptr_t sex_multiplier = 2 - IS_SET(sex_male, sample_uidx);
+		alt1_dosage += cur_dosage_val * sex_multiplier;
+
+		// could call genoarr_count_subset_intersect_freqs() twice
+		// instead, but since we've already manually extracted the sex
+		// bit it probably doesn't help?
+		const uintptr_t hardcall_code = GET_QUATERARR_ENTRY(genovec, sample_uidx);
+		if (hardcall_code != 3) {
+		  alt1_ct -= hardcall_code * sex_multiplier;
+		  replaced_ct += sex_multiplier;
+		}
+	      }
+	      included_dosage_ct = 2 * dosage_ct;
+	      if (dosage_ct) {
+		included_dosage_ct -= popcount_longs_intersect(dosage_present, sex_male, raw_sample_ctl);
+	      }
+	    } else {
+	      for (uint32_t dosage_idx = 0; dosage_idx < dosage_ct; ++dosage_idx, ++sample_uidx) {
+		next_set_unsafe_ck(dosage_present, &sample_uidx);
+		if (IS_SET(sample_include, sample_uidx)) {
+		  const uintptr_t cur_dosage_val = dosage_vals[dosage_idx];
+		  const uintptr_t sex_multiplier = 2 - IS_SET(sex_male, sample_uidx);
+		  alt1_dosage += cur_dosage_val * sex_multiplier;
+		  included_dosage_ct += sex_multiplier;
+		  const uintptr_t hardcall_code = GET_QUATERARR_ENTRY(genovec, sample_uidx);
+		  if (hardcall_code != 3) {
+		    alt1_ct -= hardcall_code * sex_multiplier;
+		    replaced_ct += sex_multiplier;
+		  }
+		}
+	      }
+	    }
+	    // (2 * replaced_ct): copies of dosage-superseded hardcalls, which
+	    // were already removed from alt1_ct above
+	    const uintptr_t ref_ct = (2 * (sample_ct - genocounts[3]) - male_ct + sex_specific_genocounts[3]) * (2 * k1LU) - (2 * replaced_ct) - alt1_ct;
+	    allele_dosages[cur_variant_allele_idx] = included_dosage_ct * ((uint64_t)kDosageMax) - alt1_dosage + (ref_ct * ((uint64_t)kDosageMid));
+	    allele_dosages[cur_variant_allele_idx + 1] = alt1_dosage + (alt1_ct * ((uint64_t)kDosageMid));
+	  }
+	  if (x_male_geno_cts) {
+	    uint32_t* cur_x_male_geno_cts = &(x_male_geno_cts[(3 * k1LU) * (variant_uidx - x_start)]);
+	    cur_x_male_geno_cts[0] = sex_specific_genocounts[0];
+	    cur_x_male_geno_cts[1] = sex_specific_genocounts[1];
+	    cur_x_male_geno_cts[2] = sex_specific_genocounts[2];
+	    if (x_nosex_geno_cts) {
+	      genovec_count_subset_freqs(genovec, nosex_interleaved_vec, raw_sample_ct, nosex_ct, sex_specific_genocounts);
+	      uint32_t* cur_nosex_geno_cts = &(x_nosex_geno_cts[(3 * k1LU) * (variant_uidx - x_start)]);
+	      cur_nosex_geno_cts[0] = sex_specific_genocounts[0];
+	      cur_nosex_geno_cts[1] = sex_specific_genocounts[1];
+	      cur_nosex_geno_cts[2] = sex_specific_genocounts[2];
+	    }
+	  }
+	}
+	if (raw_geno_cts) {
+	  uint32_t* cur_raw_geno_cts = &(raw_geno_cts[(3 * k1LU) * variant_uidx]);
+	  cur_raw_geno_cts[0] = genocounts[0];
+	  cur_raw_geno_cts[1] = genocounts[1];
+	  cur_raw_geno_cts[2] = genocounts[2];
+	}
+	if (variant_missing_hc_cts) {
+	  variant_missing_hc_cts[variant_uidx] = genocounts[3];
+	  if (variant_hethap_cts && (variant_uidx >= first_hap_uidx)) {
+	    variant_hethap_cts[variant_uidx - first_hap_uidx] = hethap_ct;
+	  }
+	}
+	if (variant_missing_dosage_cts) {
+	  uint32_t missing_dosage_ct;
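+	  // every nonmissing sample contributes exactly kDosageMax to
+	  // cur_dosages[0] + cur_dosages[1], so dividing the total by
+	  // kDosageMax counts the nonmissing samples.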
+	  if (!is_x_or_y) {
+	    missing_dosage_ct = sample_ct - ((cur_dosages[0] + cur_dosages[1]) / kDosageMax);
+	  } else if (is_y) {
+	    missing_dosage_ct = male_ct - ((cur_dosages[0] + cur_dosages[1]) / kDosageMax);
+	  } else {
+	    if (dosage_ct) {
+	      zero_trailing_quaters(raw_sample_ct, genovec);
+	      missing_dosage_ct = genoarr_count_missing_notsubset_unsafe(genovec, dosage_present, raw_sample_ct);
+	    } else {
+	      missing_dosage_ct = genocounts[3];
+	    }
+	  }
+	  variant_missing_dosage_cts[variant_uidx] = missing_dosage_ct;
+	}
+      }
+      if ((++subset_idx == subset_ct) || reterr) {
+	break;
+      }
+      sample_include = g_founder_info;
+      sample_include_interleaved_vec = g_founder_info_interleaved_vec;
+      sample_include_cumulative_popcounts = g_founder_info_cumulative_popcounts;
+      sex_male = g_founder_male;
+      sex_male_interleaved_vec = g_founder_male_interleaved_vec;
+      sex_male_cumulative_popcounts = g_founder_male_cumulative_popcounts;
+
+      nosex_interleaved_vec = g_founder_nosex_interleaved_vec;
+      
+      sample_ct = g_founder_ct;
+      male_ct = g_founder_male_ct;
+      nosex_ct = g_founder_nosex_ct;
+      allele_dosages = g_founder_allele_dosages;
+      variant_missing_hc_cts = nullptr;
+      variant_missing_dosage_cts = nullptr;
+      raw_geno_cts = g_founder_raw_geno_cts;
+      x_male_geno_cts = g_founder_x_male_geno_cts;
+      x_nosex_geno_cts = g_founder_x_nosex_geno_cts;
+      mach_r2_vals = nullptr;
+      pgr_clear_ld_cache(pgrp);
+    }
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+  }
+}
+
+pglerr_t load_allele_and_geno_counts(const uintptr_t* sample_include, const uintptr_t* founder_info, const uintptr_t* sex_nm, const uintptr_t* sex_male, const uintptr_t* variant_include, const chr_info_t* cip, const uintptr_t* variant_allele_idxs, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t founder_ct, uint32_t male_ct, uint32_t nosex_ct, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t first_hap_uidx, uint32_t max_thread_ct, uintptr_t pgr_alloc_cacheline_ct, pgen_file_inf [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if (!variant_ct) {
+      goto load_allele_and_geno_counts_ret_1;
+    }
+    if (variant_allele_idxs) {
+      logerrprint("Error: load_allele_and_geno_counts() doesn't support multiallelic variants yet.\n");
+      reterr = kPglRetNotYetSupported;
+      goto load_allele_and_geno_counts_ret_1;
+    }
+
+    // four cases:
+    // 1. allele_dosages, raw_geno_cts, and/or variant_missing_{hc,dosage}_cts
+    //    required, and that's it
+    // 2. founder_allele_dosages and/or founder_raw_geno_cts required, and
+    //    that's it
+    // 3. both required, and founder_ct != sample_ct.
+    // 4. both required, and founder_ct == sample_ct.  caller is expected to
+    //    make founder_allele_dosages and allele_dosages point to the same
+    //    memory, ditto for founder_raw_geno_cts/raw_geno_cts.
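+    // e.g. in case 4, a (hypothetical) caller might pass the same buffer as
+    // both allele_dosages and founder_allele_dosages, so a single counting
+    // pass fills both.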
+    const uint32_t only_founder_cts_required = (!allele_dosages) && (!raw_geno_cts) && (!variant_missing_hc_cts) && (!variant_missing_dosage_cts);
+    const uint32_t two_subsets_required = (founder_ct != sample_ct) && (!only_founder_cts_required) && (founder_allele_dosages || founder_raw_geno_cts);
+    g_cip = cip;
+    g_sample_include = only_founder_cts_required? founder_info : sample_include;
+    g_raw_sample_ct = raw_sample_ct;
+    g_sample_ct = only_founder_cts_required? founder_ct : sample_ct;
+    g_male_ct = male_ct;
+    g_allele_dosages = only_founder_cts_required? founder_allele_dosages : allele_dosages;
+    g_raw_geno_cts = only_founder_cts_required? founder_raw_geno_cts : raw_geno_cts;
+    g_x_male_geno_cts = only_founder_cts_required? founder_x_male_geno_cts : x_male_geno_cts;
+    g_x_nosex_geno_cts = only_founder_cts_required? founder_x_nosex_geno_cts : x_nosex_geno_cts;
+    g_mach_r2_vals = mach_r2_vals;
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    const uint32_t raw_sample_ctv = BITCT_TO_VECCT(raw_sample_ct);
+    if (bigstack_alloc_ul(raw_sample_ctv * kWordsPerVec, &g_sample_include_interleaved_vec) ||
+	bigstack_alloc_ui(raw_sample_ctl, &g_sample_include_cumulative_popcounts) ||
+	bigstack_alloc_ul(raw_sample_ctv * kWordsPerVec, &g_sex_male_interleaved_vec) ||
+	bigstack_alloc_ui(raw_sample_ctl, &g_sex_male_cumulative_popcounts)) {
+      goto load_allele_and_geno_counts_ret_NOMEM;
+    }
+    fill_interleaved_mask_vec(g_sample_include, raw_sample_ctv, g_sample_include_interleaved_vec);
+    fill_cumulative_popcounts(g_sample_include, raw_sample_ctl, g_sample_include_cumulative_popcounts);
+    if ((founder_ct == sample_ct) || (!only_founder_cts_required)) {
+      // const_cast
+      g_sex_male = (uintptr_t*)((uintptr_t)sex_male);
+    } else {
+      // no nonfounder counts required
+      if (bigstack_alloc_ul(raw_sample_ctl, &g_sex_male)) {
+	goto load_allele_and_geno_counts_ret_NOMEM;
+      }
+      for (uint32_t widx = 0; widx < raw_sample_ctl; ++widx) {
+	g_sex_male[widx] = sex_male[widx] & founder_info[widx];
+      }
+    }
+    fill_interleaved_mask_vec(g_sex_male, raw_sample_ctv, g_sex_male_interleaved_vec);
+    fill_cumulative_popcounts(g_sex_male, raw_sample_ctl, g_sex_male_cumulative_popcounts);
+    if (!(x_nosex_geno_cts || founder_x_nosex_geno_cts)) {
+      nosex_ct = 0;
+    }
+    g_nosex_ct = nosex_ct;
+    g_nosex_interleaved_vec = nullptr;
+    uintptr_t* nosex_buf = nullptr;
+    if (nosex_ct) {
+      if (bigstack_end_alloc_ul(raw_sample_ctl, &nosex_buf) ||
+          bigstack_alloc_ul(raw_sample_ctv * kWordsPerVec, &g_nosex_interleaved_vec)) {
+	goto load_allele_and_geno_counts_ret_NOMEM;
+      }
+      for (uint32_t widx = 0; widx < raw_sample_ctl; ++widx) {
+	nosex_buf[widx] = (~sex_nm[widx]) & g_sample_include[widx];
+      }
+      fill_interleaved_mask_vec(nosex_buf, raw_sample_ctv, g_nosex_interleaved_vec);
+    }
+
+    g_variant_missing_hc_cts = variant_missing_hc_cts;
+    g_variant_missing_dosage_cts = variant_missing_dosage_cts;
+    g_variant_hethap_cts = variant_hethap_cts;
+    g_first_hap_uidx = first_hap_uidx;
+    
+    g_founder_info = nullptr;
+    g_founder_info_interleaved_vec = nullptr;
+    g_founder_info_cumulative_popcounts = nullptr;
+    g_founder_male = nullptr;
+    g_founder_male_interleaved_vec = nullptr;
+    g_founder_male_cumulative_popcounts = nullptr;
+    g_founder_nosex_interleaved_vec = nullptr;
+    g_founder_ct = 0;
+    g_founder_male_ct = 0;
+    g_founder_nosex_ct = 0;
+    g_founder_allele_dosages = nullptr;
+    g_founder_raw_geno_cts = nullptr;
+    g_founder_x_male_geno_cts = nullptr;
+    g_founder_x_nosex_geno_cts = nullptr;
+    if (two_subsets_required) {
+      if (founder_ct) {
+	g_founder_info = founder_info;
+	if (bigstack_alloc_ul(raw_sample_ctv * kWordsPerVec, &g_founder_info_interleaved_vec) ||
+	    bigstack_alloc_ui(raw_sample_ctl, &g_founder_info_cumulative_popcounts) ||
+	    bigstack_alloc_ul(raw_sample_ctl, &g_founder_male) ||
+	    bigstack_alloc_ul(raw_sample_ctv * kWordsPerVec, &g_founder_male_interleaved_vec) ||
+	    bigstack_alloc_ui(raw_sample_ctl, &g_founder_male_cumulative_popcounts)) {
+	  goto load_allele_and_geno_counts_ret_NOMEM;
+	}
+	fill_interleaved_mask_vec(founder_info, raw_sample_ctv, g_founder_info_interleaved_vec);
+	fill_cumulative_popcounts(founder_info, raw_sample_ctl, g_founder_info_cumulative_popcounts);
+	for (uint32_t widx = 0; widx < raw_sample_ctl; ++widx) {
+	  g_founder_male[widx] = sex_male[widx] & founder_info[widx];
+	}
+	fill_interleaved_mask_vec(g_founder_male, raw_sample_ctv, g_founder_male_interleaved_vec);
+	fill_cumulative_popcounts(g_founder_male, raw_sample_ctl, g_founder_male_cumulative_popcounts);
+	g_founder_ct = founder_ct;
+	g_founder_male_ct = g_founder_male_cumulative_popcounts[raw_sample_ctl - 1] + popcount_long(g_founder_male[raw_sample_ctl - 1]);
+	g_founder_allele_dosages = founder_allele_dosages;
+	g_founder_raw_geno_cts = founder_raw_geno_cts;
+	g_founder_x_male_geno_cts = founder_x_male_geno_cts;
+	if (nosex_ct) {
+	  // caller currently responsible for ensuring that when
+	  // founder_nosex_ct is zero, founder_x_nosex_geno_cts ==
+	  // nullptr
+	  if (bigstack_alloc_ul(raw_sample_ctv * kWordsPerVec, &g_founder_nosex_interleaved_vec)) {
+	    goto load_allele_and_geno_counts_ret_NOMEM;
+	  }
+	  for (uint32_t widx = 0; widx < raw_sample_ctl; ++widx) {
+	    nosex_buf[widx] &= founder_info[widx];
+	  }
+	  g_founder_nosex_ct = popcount_longs(nosex_buf, raw_sample_ctl);
+	  assert(g_founder_nosex_ct);
+          fill_interleaved_mask_vec(nosex_buf, raw_sample_ctv, g_founder_nosex_interleaved_vec);
+	  g_founder_x_nosex_geno_cts = founder_x_nosex_geno_cts;
+	}
+      } else {
+	if (founder_allele_dosages) {
+	  fill_ull_zero(variant_allele_idxs? variant_allele_idxs[raw_variant_ct] : (2 * raw_variant_ct), founder_allele_dosages);
+	}
+	if (founder_raw_geno_cts) {
+	  fill_uint_zero((3 * k1LU) * raw_variant_ct, founder_raw_geno_cts);
+	}
+      }
+    } else if (founder_ct == sample_ct) {
+      // bugfix: some founder and some nonfounder counts required
+      if ((!g_allele_dosages) && founder_allele_dosages) {
+	g_allele_dosages = founder_allele_dosages;
+      }
+      if ((!g_raw_geno_cts) && founder_raw_geno_cts) {
+	g_raw_geno_cts = founder_raw_geno_cts;
+      }
+      if ((!g_x_male_geno_cts) && founder_x_male_geno_cts) {
+	g_x_male_geno_cts = founder_x_male_geno_cts;
+      }
+      if ((!g_x_nosex_geno_cts) && founder_x_nosex_geno_cts) {
+	g_x_nosex_geno_cts = founder_x_nosex_geno_cts;
+      }
+    } else if (only_founder_cts_required) {
+      g_male_ct = g_sex_male_cumulative_popcounts[raw_sample_ctl - 1] + popcount_long(g_sex_male[raw_sample_ctl - 1]);
+      if (nosex_ct) {
+        g_nosex_ct = popcount_longs(nosex_buf, raw_sample_ctl);
+      }
+    }
+    if (!g_sample_ct) {
+      if (g_allele_dosages) {
+	fill_ull_zero(variant_allele_idxs? variant_allele_idxs[raw_variant_ct] : (2 * raw_variant_ct), g_allele_dosages);
+      }
+      if (g_raw_geno_cts) {
+	fill_uint_zero((3 * k1LU) * raw_variant_ct, g_raw_geno_cts);
+      }
+      // early exit
+      goto load_allele_and_geno_counts_ret_1;
+    }
+    bigstack_end_reset(bigstack_end_mark); // free nosex_buf
+
+    int32_t ii;
+    const uint32_t x_dosages_needed = (allele_dosages || founder_allele_dosages || variant_missing_dosage_cts) && xymt_exists(cip, kChrOffsetX, &ii) && (pgfip->gflags & kfPgenGlobalDosagePresent);
+    if (!x_dosages_needed) {
+      // defensive
+      g_dosage_presents = nullptr;
+      g_dosage_val_bufs = nullptr;
+    }
+    
+    // todo: check when this saturates
+    uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+    unsigned char* main_loadbufs[2];
+    pthread_t* threads;
+    uint32_t read_block_size;
+    // todo: check if raw_sample_ct should be replaced with sample_ct here
+    if (multithread_load_init(variant_include, raw_sample_ct, variant_ct, pgr_alloc_cacheline_ct, 0, 0, pgfip, &calc_thread_ct, &g_genovecs, x_dosages_needed? (&g_dosage_presents) : nullptr, x_dosages_needed? (&g_dosage_val_bufs) : nullptr, &read_block_size, main_loadbufs, &threads, &g_pgr_ptrs, &g_read_variant_uidx_starts)) {
+      goto load_allele_and_geno_counts_ret_NOMEM;
+    }
+
+    g_variant_include = variant_include;
+    g_variant_allele_idxs = variant_allele_idxs;
+    g_calc_thread_ct = calc_thread_ct;
+    g_error_ret = kPglRetSuccess;
+
+    logprint("Calculating allele frequencies... ");
+    fputs("0%", stdout);
+    fflush(stdout);
+    uint32_t pct = 0;
+
+    const uint32_t read_block_sizel = BITCT_TO_WORDCT(read_block_size);
+    const uint32_t read_block_ct_m1 = (raw_variant_ct - 1) / read_block_size;
+    uint32_t parity = 0;
+    uint32_t read_block_idx = 0;
+    uint32_t variant_idx = 0;
+    uint32_t is_last_block = 0;
+    uint32_t cur_read_block_size = read_block_size;
+    uint32_t next_print_variant_idx = variant_ct / 100;
+    while (1) {
+      uintptr_t cur_block_write_ct = 0;
+      if (!is_last_block) {
+	while (read_block_idx < read_block_ct_m1) {
+	  // this uses multithread_load_init's guarantee that read_block_size
+	  // is either raw_variant_ct or a multiple of kBitsPerVec
+	  cur_block_write_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), read_block_sizel);
+	  if (cur_block_write_ct) {
+	    break;
+	  }
+	  ++read_block_idx;
+	}
+	if (read_block_idx == read_block_ct_m1) {
+	  cur_read_block_size = raw_variant_ct - (read_block_idx * read_block_size);
+	  cur_block_write_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), BITCT_TO_WORDCT(cur_read_block_size));
+	}
+	if (pgfi_multiread(variant_include, read_block_idx * read_block_size, read_block_idx * read_block_size + cur_read_block_size, cur_block_write_ct, pgfip)) {
+	  if (variant_idx) {
+	    join_threads2z(calc_thread_ct, 0, threads);
+	    g_cur_block_write_ct = 0;
+	    error_cleanup_threads2z(load_allele_and_geno_counts_thread, calc_thread_ct, threads);
+	  }
+	  goto load_allele_and_geno_counts_ret_READ_FAIL;
+	}
+      }
+      if (variant_idx) {
+	join_threads2z(calc_thread_ct, is_last_block, threads);
+	reterr = g_error_ret;
+	if (reterr) {
+	  if (!is_last_block) {
+	    g_cur_block_write_ct = 0;
+	    error_cleanup_threads2z(load_allele_and_geno_counts_thread, calc_thread_ct, threads);
+	  }
+	  if (reterr == kPglRetMalformedInput) {
+	    logprint("\n");
+	    logerrprint("Error: Malformed .pgen file.\n");
+	  }
+	  goto load_allele_and_geno_counts_ret_1;
+	}
+      }
+      if (!is_last_block) {
+	g_cur_block_write_ct = cur_block_write_ct;
+	compute_uidx_start_partition(variant_include, cur_block_write_ct, calc_thread_ct, read_block_idx * read_block_size, g_read_variant_uidx_starts);
+	for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	  g_pgr_ptrs[tidx]->fi.block_base = pgfip->block_base;
+	  g_pgr_ptrs[tidx]->fi.block_offset = pgfip->block_offset;
+	}
+	is_last_block = (variant_idx + cur_block_write_ct == variant_ct);
+	if (spawn_threads2z(load_allele_and_geno_counts_thread, calc_thread_ct, is_last_block, threads)) {
+	  goto load_allele_and_geno_counts_ret_THREAD_CREATE_FAIL;
+	}
+      }
+
+      parity = 1 - parity;
+      if (variant_idx == variant_ct) {
+	break;
+      }
+      if (variant_idx >= next_print_variant_idx) {
+	if (pct > 10) {
+	  putc_unlocked('\b', stdout);
+	}
+	pct = (variant_idx * 100LLU) / variant_ct;
+	printf("\b\b%u%%", pct++);
+	fflush(stdout);
+	next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+      }
+
+      ++read_block_idx;
+      variant_idx += cur_block_write_ct;
+      // crucially, this is independent of the pgen_reader_t block_base
+      // pointers
+      pgfip->block_base = main_loadbufs[parity];
+    }
+    if (pct > 10) {
+      putc_unlocked('\b', stdout);
+    }
+    fputs("\b\b", stdout);
+    LOGPRINTF("done.\n");
+  }
+  while (0) {
+  load_allele_and_geno_counts_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  load_allele_and_geno_counts_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  load_allele_and_geno_counts_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ load_allele_and_geno_counts_ret_1:
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  pgfip->block_base = nullptr;
+  return reterr;
+}
+
+FLAGSET_DEF_START()
+  kfPlink2Write0,
+  kfPlink2WriteSetHhMissing = (1 << 0),
+  kfPlink2WriteSetMixedMtMissing = (1 << 1),
+  kfPlink2WriteMeMissing = (1 << 2),
+  kfPlink2WriteZeroCluster = (1 << 3),
+  kfPlink2WriteFillRef = (1 << 4),
+  kfPlink2WriteLateDosageErase = (1 << 5),
+  // no need for sample_sort, determined by g_collapsed_sort_map != nullptr?
+  kfPlink2WritePlink1 = (1 << 6)
+FLAGSET_DEF_END(plink2_write_flags_t);
+// todo: add .pgen-specific stuff
+
+static plink2_write_flags_t g_plink2_write_flags = kfPlink2Write0;
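+// (flag values are disjoint bits, so they compose with |; e.g.
+// make_plink2_no_vsort() below does
+//   g_plink2_write_flags |= kfPlink2WriteSetHhMissing;
+// and similarly for the other modes)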
+
+THREAD_FUNC_DECL make_bedlike_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  pgen_reader_t* pgrp = g_pgr_ptrs[tidx];
+  uintptr_t* genovec = g_genovecs[tidx];
+  const uintptr_t* variant_include = g_variant_include;
+  const chr_info_t* cip = g_cip;
+  const uintptr_t* sample_include = g_sample_include;
+  const uint32_t* sample_include_cumulative_popcounts = g_sample_include_cumulative_popcounts;
+  const uintptr_t* sex_male_collapsed_interleaved = g_sex_male_collapsed_interleaved;
+  const uintptr_t* sex_female_collapsed_interleaved = g_sex_female_collapsed_interleaved;
+  const uint32_t* collapsed_sort_map = g_collapsed_sort_map;
+  const uint32_t set_hh_missing = g_plink2_write_flags & kfPlink2WriteSetHhMissing;
+  const uint32_t set_mixed_mt_missing = g_plink2_write_flags & kfPlink2WriteSetMixedMtMissing;
+  const uint32_t write_plink1 = g_plink2_write_flags & kfPlink2WritePlink1;
+  const uint32_t sample_ct = g_sample_ct;
+  const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+  const uint32_t sample_ctv2 = QUATERCT_TO_VECCT(sample_ct);
+  const uint32_t sample_ct4 = QUATERCT_TO_BYTECT(sample_ct);
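+  // 2 bits per hardcall: sample_ctl2 words / sample_ctv2 vectors hold one
+  // variant's genotypes in memory, while sample_ct4 is the .bed-style byte
+  // count (4 genotypes per byte) written per variant.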
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  const alt_allele_ct_t* refalt1_select = g_refalt1_select;
+  const int32_t x_code = cip->xymt_codes[kChrOffsetX];
+  const int32_t y_code = cip->xymt_codes[kChrOffsetY];
+  const int32_t mt_code = cip->xymt_codes[kChrOffsetMT];
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uintptr_t cur_block_write_ct = g_cur_block_write_ct;
+    uint32_t write_idx = (tidx * cur_block_write_ct) / calc_thread_ct;
+    const uint32_t write_idx_end = ((tidx + 1) * cur_block_write_ct) / calc_thread_ct;
+    unsigned char* writebuf_iter = &(g_writebufs[parity][write_idx * sample_ct4]);
+    uint32_t variant_uidx = g_read_variant_uidx_starts[tidx];
+    uint32_t chr_end = 0;
+    uint32_t is_x_or_y = 0;
+    uint32_t is_y = 0;
+    uint32_t is_haploid = 0;
+    uint32_t is_mt = 0;
+    for (; write_idx < write_idx_end; ++write_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      if (variant_uidx >= chr_end) {
+	const uint32_t chr_fo_idx = get_variant_chr_fo_idx(cip, variant_uidx);
+	const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	is_y = (chr_idx == y_code);
+	is_x_or_y = is_y || (chr_idx == x_code);
+	is_haploid = is_set(cip->haploid_mask, chr_idx);
+	is_mt = (chr_idx == mt_code);
+      }
+      const pglerr_t reterr = pgr_read_refalt1_genovec_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, variant_uidx, pgrp, genovec);
+      if (reterr) {
+	// printf("fail vidx: %u\n", variant_uidx);
+	g_error_ret = reterr;
+	break;
+      }
+      // this doesn't work in multiallelic case
+      // todo: pgenlib_internal function which takes two allele indexes
+      if (refalt1_select && (refalt1_select[2 * variant_uidx] == 1)) {
+	genovec_invert_unsafe(sample_ct, genovec);
+      }
+      if (set_hh_missing && is_haploid) {
+	if (is_x_or_y) {
+	  // male hets to missing
+	  set_male_het_missing(sex_male_collapsed_interleaved, sample_ctv2, genovec);
+	  if (is_y) {
+	    // all female calls to missing; unknown-sex calls now left alone
+	    interleaved_set_missing(sex_female_collapsed_interleaved, sample_ctv2, genovec);
+	  }
+	} else {
+	  // all hets to missing
+	  set_het_missing(sample_ctl2, genovec);
+	}
+      } else if (set_mixed_mt_missing && is_mt) {
+	// all hets to missing
+	set_het_missing(sample_ctl2, genovec);
+      }
+      // todo: --set-me-missing, --zero-cluster, --fill-missing-with-ref
+      // (--set-me-missing should happen after --set-hh-missing)
+      if (write_plink1) {
+	pgr_plink2_to_plink1_inplace_unsafe(sample_ct, genovec);
+      }
+      // trailing bytes don't matter, but trailing bits of last byte may
+      zero_trailing_quaters(sample_ct, genovec);
+      if (!collapsed_sort_map) {
+	writebuf_iter = (unsigned char*)memcpya(writebuf_iter, genovec, sample_ct4);
+      } else {
+	genovec_resort(genovec, collapsed_sort_map, sample_ct, writebuf_iter);
+	writebuf_iter = &(writebuf_iter[sample_ct4]);
+      }
+    }
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+// combine existing chr_mask/xymt_codes/haploid_mask/chr_idx_to_foidx with new
+// collapsed chromosome boundary table
+static uint32_t* g_write_chr_fo_vidx_start = nullptr;
+
+static st_pgen_writer_t* g_spgwp = nullptr;
+
+THREAD_FUNC_DECL make_pgen_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uint32_t* new_sample_idx_to_old = g_new_sample_idx_to_old;
+  const uint32_t* old_sample_idx_to_new = g_old_sample_idx_to_new;
+  const chr_info_t* cip = g_cip;
+  const uint32_t* write_chr_fo_vidx_start = g_write_chr_fo_vidx_start;
+  const alt_allele_ct_t* refalt1_select_iter = g_refalt1_select;
+  const uintptr_t* sample_include = g_sample_include;
+  const uintptr_t* sex_male_collapsed_interleaved = g_sex_male_collapsed_interleaved;
+  const uintptr_t* sex_female_collapsed_interleaved = g_sex_female_collapsed_interleaved;
+  const uint32_t raw_sample_ct = g_raw_sample_ct;
+  const uint32_t sample_ct = g_sample_ct;
+  const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+  const uint32_t sample_ctv2 = QUATERCT_TO_VECCT(sample_ct);
+  const uint32_t raw_sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(raw_sample_ct);
+  const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+  const int32_t x_code = cip->xymt_codes[kChrOffsetX];
+  const int32_t y_code = cip->xymt_codes[kChrOffsetY];
+  const int32_t mt_code = cip->xymt_codes[kChrOffsetMT];
+
+  const uint32_t set_hh_missing = g_plink2_write_flags & kfPlink2WriteSetHhMissing;
+  const uint32_t set_mixed_mt_missing = g_plink2_write_flags & kfPlink2WriteSetMixedMtMissing;
+  const uint32_t late_dosage_erase = g_plink2_write_flags & kfPlink2WriteLateDosageErase;
+  
+  const uint32_t hphase_present = (g_read_phase_dosage_gflags / kfPgenGlobalHardcallPhasePresent) & 1;
+  const uint32_t dosage_present = (g_read_phase_dosage_gflags & (kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent))? 1 : 0;
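+  // (dividing by a one-bit flag constant and masking with 1 is just a
+  // branch-free way of extracting that flag as 0 or 1)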
+  const uint32_t hard_call_halfdist = g_hard_call_halfdist;
+  const uint32_t dosage_erase_halfdist = g_dosage_erase_halfdist;
+  const uintptr_t phaseraw_word_ct = kWordsPerVec + round_down_pow2(raw_sample_ct / kBitsPerWordD2, kWordsPerVec);
+  // todo: double dosage_vals allocation in phased-dosage case
+  const uintptr_t dosageraw_word_ct = kWordsPerVec * (BITCT_TO_VECCT(raw_sample_ct) + DIV_UP(raw_sample_ct, (kBytesPerVec / sizeof(dosage_t))));
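+  // (see the phaseraw/dosageraw layout notes in make_pgen_robust() below for
+  // how these two word counts are derived)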
+  
+  st_pgen_writer_t* spgwp = g_spgwp;
+  pgen_writer_common_t* pwcp;
+  if (spgwp) {
+    pwcp = &(spgwp->pwc);
+  } else {
+    pwcp = g_pwcs[tidx];
+  }
+  uintptr_t* write_genovec = nullptr;
+  // assumes g_sample_include == nullptr if sample_ct == raw_sample_ct
+  if (new_sample_idx_to_old || sample_include) {
+    write_genovec = g_thread_write_genovecs[tidx];
+    write_genovec[sample_ctl2 - 1] = 0;
+  }
+  uintptr_t* write_phasepresent = nullptr;
+  uintptr_t* write_phaseinfo = nullptr;
+  uintptr_t* all_hets = nullptr;
+  if (hphase_present) {
+    all_hets = g_thread_all_hets[tidx];
+    write_phasepresent = g_thread_write_phasepresents[tidx];
+    write_phaseinfo = g_thread_write_phaseinfos[tidx];
+  }
+  uintptr_t* write_dosagepresent = nullptr;
+  dosage_t* write_dosagevals = nullptr;
+  uint32_t* cumulative_popcount_buf = nullptr;
+  if (dosage_present) {
+    write_dosagepresent = g_thread_write_dosagepresents[tidx];
+    write_dosagevals = g_thread_write_dosagevals[tidx];
+    if (new_sample_idx_to_old) {
+      cumulative_popcount_buf = g_thread_cumulative_popcount_bufs[tidx];
+    }
+  }
+  uint32_t variant_idx_offset = 0;
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uintptr_t cur_block_write_ct = g_cur_block_write_ct;
+    uint32_t write_idx = tidx * kPglVblockSize;
+    const uint32_t write_idx_end = MINV(write_idx + kPglVblockSize, cur_block_write_ct);
+    uintptr_t* loadbuf_iter = g_loadbuf_thread_starts[parity][tidx];
+    unsigned char* loaded_vrtypes = g_loaded_vrtypes[parity];
+    uint32_t loaded_vrtype = 0;
+    uint32_t chr_end_bidx = 0;
+    uint32_t is_x_or_y = 0;
+    uint32_t is_y = 0;
+    uint32_t is_haploid = 0;
+    uint32_t is_mt = 0;
+    for (; write_idx < write_idx_end; ++write_idx) {
+      if (loaded_vrtypes) {
+	loaded_vrtype = loaded_vrtypes[write_idx];
+      }
+      if (write_idx >= chr_end_bidx) {
+	const uint32_t chr_fo_idx = uint32arr_greater_than(&(write_chr_fo_vidx_start[1]), cip->chr_ct, write_idx + variant_idx_offset + 1);
+	const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	chr_end_bidx = write_chr_fo_vidx_start[chr_fo_idx + 1] - variant_idx_offset;
+	is_y = (chr_idx == y_code);
+	is_x_or_y = is_y || (chr_idx == x_code);
+	is_haploid = is_set(cip->haploid_mask, chr_idx);
+	is_mt = (chr_idx == mt_code);
+      }
+      uint32_t is_hphase = loaded_vrtype & 0x10;
+      const uint32_t is_dosage = loaded_vrtype & 0x60;
+      uintptr_t* cur_write_phasepresent = write_phasepresent;
+      if (1) {
+	// biallelic, no phased-dosage
+	uintptr_t* cur_genovec_end = &(loadbuf_iter[raw_sample_ctaw2]);
+	uintptr_t* cur_phaseraw = nullptr;
+	uintptr_t* cur_dosageraw = nullptr;
+	if (is_hphase) {
+	  pgr_detect_genovec_hets(loadbuf_iter, raw_sample_ct, all_hets);
+	  cur_phaseraw = cur_genovec_end;
+	  cur_genovec_end = &(cur_genovec_end[phaseraw_word_ct]);
+	}
+	if (is_dosage) {
+	  cur_dosageraw = cur_genovec_end;
+	  cur_genovec_end = &(cur_genovec_end[dosageraw_word_ct]);
+	}
+	uint32_t write_dosage_ct = 0;
+	if (new_sample_idx_to_old) {
+	  genovec_resort(loadbuf_iter, new_sample_idx_to_old, sample_ct, (unsigned char*)write_genovec);
+	  if (is_hphase) {
+	    unpack_and_resort_hphase(all_hets, cur_phaseraw, sample_include, old_sample_idx_to_new, raw_sample_ct, sample_ct, &cur_write_phasepresent, write_phaseinfo);
+	  }
+	  if (is_dosage) {
+	    copy_and_resort_dosage(cur_dosageraw, new_sample_idx_to_old, raw_sample_ct, sample_ct, write_dosagepresent, write_dosagevals, &write_dosage_ct, cumulative_popcount_buf);
+	  }
+	} else if (sample_include) {
+	  copy_quaterarr_nonempty_subset(loadbuf_iter, sample_include, raw_sample_ct, sample_ct, write_genovec);
+	  if (is_hphase) {
+	    unpack_hphase_subset(all_hets, cur_phaseraw, sample_include, raw_sample_ct, &cur_write_phasepresent, write_phaseinfo);
+	  }
+	  if (is_dosage) {
+	    copy_dosage_subset(cur_dosageraw, sample_include, raw_sample_ct, sample_ct, write_dosagepresent, write_dosagevals, &write_dosage_ct);
+	  }
+	} else {
+	  write_genovec = loadbuf_iter;
+	  if (is_hphase) {
+	    unpack_hphase(all_hets, cur_phaseraw, sample_ct, &cur_write_phasepresent, write_phaseinfo);
+	  }
+	  if (is_dosage) {
+	    copy_dosage(cur_dosageraw, sample_ct, write_dosagepresent, write_dosagevals, &write_dosage_ct);
+	  }
+	}
+	if (refalt1_select_iter && (refalt1_select_iter[2 * write_idx] == 1)) {
+	  genovec_invert_unsafe(sample_ct, write_genovec);
+	  if (is_hphase) {
+	    bitarr_invert(sample_ctl, write_phaseinfo);
+	  }
+	  if (write_dosage_ct) {
+	    biallelic_dosage16_invert(write_dosage_ct, write_dosagevals);
+	  }
+	}
+	if (write_dosage_ct) {
+	  if (hard_call_halfdist) {
+	    if (is_hphase && (!cur_write_phasepresent)) {
+	      cur_write_phasepresent = write_phasepresent;
+	      // unsafe to just copy all_hets, because we may have resorted
+	      pgr_detect_genovec_hets(write_genovec, sample_ct, cur_write_phasepresent);
+	    }
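+	    // worked example, assuming the usual halfdist definition
+	    // |(dosage_int % kDosageMid) - kDosage4th| with kDosageMid ==
+	    // 16384, kDosage4th == 8192, and --hard-call-threshold 0.1
+	    // (hard_call_halfdist ~6554):
+	    //   dosage_int == 18000: halfdist |1616 - 8192| == 6576 >= 6554,
+	    //     so new_geno == (18000 + 8192) / 16384 == 1;
+	    //   dosage_int == 19000: halfdist |2616 - 8192| == 5576 < 6554,
+	    //     so new_geno == 3 (missing).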
+	    uint32_t sample_uidx = 0;
+	    for (uint32_t dosage_idx = 0; dosage_idx < write_dosage_ct; ++dosage_idx, ++sample_uidx) {
+	      next_set_unsafe_ck(write_dosagepresent, &sample_uidx);
+	      const uint32_t dosage_int = write_dosagevals[dosage_idx];
+	      const uint32_t halfdist = biallelic_dosage_halfdist(dosage_int);
+	      const uint32_t widx = sample_uidx / kBitsPerWordD2;
+	      uintptr_t prev_geno_word = write_genovec[widx];
+	      const uint32_t shift = (sample_uidx % kBitsPerWordD2) * 2;
+	      uintptr_t new_geno;
+	      if (halfdist < hard_call_halfdist) {
+		new_geno = 3;
+	      } else {
+		new_geno = (dosage_int + kDosage4th) / kDosageMid;
+	      }
+	      const uintptr_t prev_geno = (prev_geno_word >> shift) & 3;
+	      const uintptr_t geno_xor = new_geno ^ prev_geno;
+	      if (geno_xor) {
+	        if (is_hphase) {
+		  // must erase phase here
+		  CLEAR_BIT(sample_uidx, cur_write_phasepresent);
+		}
+		write_genovec[widx] = prev_geno_word ^ (geno_xor << shift);
+	      }
+	    }
+	  }
+	  if (dosage_erase_halfdist < kDosage4th) {
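+	    // two passes: scan until the first erased dosage (nothing needs
+	    // to move before that point), then compact the survivors in
+	    // place behind the read cursor.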
+	    uint32_t dosage_read_idx = 0;
+	    uint32_t sample_uidx = 0;
+	    for (; dosage_read_idx < write_dosage_ct; ++dosage_read_idx, ++sample_uidx) {
+	      next_set_unsafe_ck(write_dosagepresent, &sample_uidx);
+	      const uint32_t dosage_int = write_dosagevals[dosage_read_idx];
+	      const uint32_t halfdist = biallelic_dosage_halfdist(dosage_int);
+	      if (halfdist >= dosage_erase_halfdist) {
+		clear_bit(sample_uidx, write_dosagepresent);
+		++sample_uidx;
+		break;
+	      }
+	    }
+	    uint32_t dosage_write_idx = dosage_read_idx;
+	    while (++dosage_read_idx < write_dosage_ct) {
+	      next_set_unsafe_ck(write_dosagepresent, &sample_uidx);
+	      const uint32_t dosage_int = write_dosagevals[dosage_read_idx];
+	      const uint32_t halfdist = biallelic_dosage_halfdist(dosage_int);
+	      if (halfdist < dosage_erase_halfdist) {
+		write_dosagevals[dosage_write_idx++] = dosage_int;
+	      } else {
+		clear_bit(sample_uidx, write_dosagepresent);
+	      }
+	      ++sample_uidx;
+	    }
+	    write_dosage_ct = dosage_write_idx;
+	  } else if (late_dosage_erase) {
+	    write_dosage_ct = 0;
+	  }
+	}
+	// moved after --hard-call-threshold, since it makes sense to
+	// immediately erase fresh het haploid calls
+	if (set_hh_missing && is_haploid) {
+	  if (is_x_or_y) {
+	    // male hets to missing
+	    set_male_het_missing(sex_male_collapsed_interleaved, sample_ctv2, write_genovec);
+	    if (is_y) {
+	      // all female calls to missing; unknown-sex calls now left alone
+	      interleaved_set_missing(sex_female_collapsed_interleaved, sample_ctv2, write_genovec);
+	    }
+	    if (is_hphase && cur_write_phasepresent) {
+	      mask_genovec_hets_unsafe(write_genovec, sample_ctl2, cur_write_phasepresent);
+	    }
+	  } else {
+	    // all hets to missing
+	    // may want to move is_hphase zeroing in front
+	    set_het_missing(sample_ctl2, write_genovec);
+	    is_hphase = 0;
+	  }
+	} else if (set_mixed_mt_missing && is_mt) {
+	  // all hets to missing
+	  set_het_missing(sample_ctl2, write_genovec);
+	  is_hphase = 0;
+	}
+	zero_trailing_quaters(sample_ct, write_genovec);
+	// todo: --set-me-missing, --zero-cluster, --fill-missing-with-ref
+	if (spgwp) {
+	  if (pwcp->fwrite_bufp >= &(pwcp->fwrite_buf[kPglFwriteBlockSize])) {
+	    const uintptr_t cur_byte_ct = (uintptr_t)(pwcp->fwrite_bufp - pwcp->fwrite_buf);
+	    if (fwrite_checked(pwcp->fwrite_buf, cur_byte_ct, spgwp->pgen_outfile)) {
+	      g_error_ret = kPglRetWriteFail;
+	      break;
+	    }
+	    // printf("vblock_fpos_offset: %llu\n", pwcp->vblock_fpos_offset);
+	    pwcp->vblock_fpos_offset += cur_byte_ct;
+	    // printf("%u %llu\n", write_idx + variant_idx_offset, pwcp->vblock_fpos_offset);
+	    pwcp->fwrite_bufp = pwcp->fwrite_buf;
+	  }
+	}
+	if (!is_hphase) {
+	  pwc_append_biallelic_genovec_dosage16(write_genovec, write_dosagepresent, write_dosagevals, write_dosage_ct, pwcp);
+	} else {
+	  pwc_append_biallelic_genovec_hphase_dosage16(write_genovec, cur_write_phasepresent, write_phaseinfo, write_dosagepresent, write_dosagevals, write_dosage_ct, pwcp);
+	  cur_write_phasepresent = write_phasepresent;
+	}
+	loadbuf_iter = cur_genovec_end;
+      } else {
+        // todo: multiallelic write
+	// (some trim-alts logic here)
+      }
+    }
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+    variant_idx_offset += cur_block_write_ct;
+    if (refalt1_select_iter) {
+      refalt1_select_iter = &(refalt1_select_iter[2 * cur_block_write_ct]);
+    }
+  }
+}
+
+pgen_global_flags_t gflags_vfilter(const uintptr_t* variant_include, const unsigned char* vrtypes, uint32_t raw_variant_ct, pgen_global_flags_t input_gflags) {
+  pgen_global_flags_t read_phase_dosage_gflags = kfPgenGlobal0;
+  const uintptr_t* vrtypes_alias_iter = (const uintptr_t*)vrtypes;
+  const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+  uint32_t mask_multiply = ((input_gflags & kfPgenGlobalHardcallPhasePresent)? 0x10 : 0) + ((input_gflags & kfPgenGlobalDosagePresent)? 0x60 : 0) + ((input_gflags & kfPgenGlobalDosagePhasePresent)? 0x80 : 0);
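+  // vrtype bits: 0x10 == hardcall phase, 0x20/0x40 == dosage encodings
+  // (hence the 0x60 mask), 0x80 == phased dosage.  mask_multiply replicates
+  // the bits we haven't observed yet into every byte selected below.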
+  uintptr_t vrtypes_or = 0;
+  for (uint32_t widx = 0; widx < raw_variant_ctl; ++widx) {
+    uintptr_t cur_variant_include_word = variant_include[widx];
+    if (cur_variant_include_word) {
+#ifdef __LP64__
+      for (uint32_t vi_byte_idx = 0; vi_byte_idx < 8; ++vi_byte_idx) {
+	// this operation maps binary hgfedcba to h0000000g0000000f...
+	//                                        ^       ^       ^
+	//                                        |       |       |
+	//                                       56      48      40
+	// 1. (cur_variant_include_word & 0xfe) gives us hgfedcb0;
+	//    necessary to avoid carryover.
+	// 2. multiply by the number with bits 7, 14, 21, ..., 49 set, to
+	//    get hgfedcbhgfedcbhgf...
+	//        ^       ^       ^
+	//        |       |       |
+	//       56      48      40
+	// 3. mask out all but bits 8, 16, 24, ..., 56
+	// todo: test if this actually beats the per-character loop...
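+	// worked example: if the low byte of cur_variant_include_word is
+	// 0b10100101 (bits 7, 5, 2, 0 set), cur_mask is 0x0100010000010001,
+	// i.e. the low bit of output byte k equals input bit k.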
+	const uintptr_t cur_mask = (((cur_variant_include_word & 0xfe) * 0x2040810204080LLU) & kMask0101) | (cur_variant_include_word & 1);
+	vrtypes_or |= (*vrtypes_alias_iter++) & (cur_mask * mask_multiply);
+	cur_variant_include_word >>= 8;
+      }
+#else
+      for (uint32_t vi_hexa_idx = 0; vi_hexa_idx < 8; ++vi_hexa_idx) {
+	// dcba -> d0000000c0000000b0000000a
+	const uintptr_t cur_mask = ((cur_variant_include_word & 0xf) * 0x204081) & kMask0101;
+	vrtypes_or |= (*vrtypes_alias_iter++) & (cur_mask * mask_multiply);
+	cur_variant_include_word >>= 4;
+      }
+#endif
+      if (vrtypes_or) {
+	if (vrtypes_or & 0x10) {
+	  read_phase_dosage_gflags |= kfPgenGlobalHardcallPhasePresent;
+	  mask_multiply -= 0x10;
+	}
+	if (vrtypes_or & 0x60) {
+	  read_phase_dosage_gflags |= kfPgenGlobalDosagePresent;
+	  mask_multiply -= 0x60;
+	}
+	if (vrtypes_or & 0x80) {
+	  read_phase_dosage_gflags |= kfPgenGlobalDosagePhasePresent;
+	  mask_multiply -= 0x80;
+	}
+	if (!mask_multiply) {
+	  return read_phase_dosage_gflags;
+	}
+      }
+    }
+  }
+  return read_phase_dosage_gflags;
+}
+
+// Single-output-thread implementation.  Allows variants to be unsorted.
+// (Note that make_plink2_no_vsort() requires enough memory for 64k * 2
+// variants per output thread, due to LD compression.  This is faster in the
+// common case, but once you have 150k+ samples with dosage data...)
+pglerr_t make_pgen_robust(const uintptr_t* sample_include, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const uintptr_t* variant_allele_idxs, const alt_allele_ct_t* refalt1_select, const uint32_t* new_variant_idx_to_old, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, make_plink2_t make_plink2_modifier, pgen_reader_t* simple_pgrp, char* outname, char* outname_end) {
+  // new_variant_idx_to_old[] can be nullptr
+
+  // caller responsible for initializing g_cip (may need to be different from
+  // initial cip struct)
+  unsigned char* bigstack_mark = g_bigstack_base;
+  threads_state_t ts;
+  init_threads3z(&ts);
+  st_pgen_writer_t spgw;
+  pglerr_t reterr = kPglRetSuccess;
+  spgw_preinit(&spgw);
+  {
+    // g_plink2_write_flags assumed to include --set-hh-missing and
+    //   --set-mixed-mt-missing
+    // g_sex_{fe}male_collapsed_interleaved assumed to be initialized if
+    //   necessary
+
+    if (bigstack_alloc_thread(1, &ts.threads)) {
+      goto make_pgen_robust_ret_NOMEM;
+    }
+    ts.calc_thread_ct = 1;
+    g_spgwp = &spgw;
+    const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+    g_sample_include = subsetting_required? sample_include : nullptr;
+    g_new_sample_idx_to_old = new_sample_idx_to_old;
+    g_raw_sample_ct = raw_sample_ct;
+    g_sample_ct = sample_ct;
+    g_error_ret = kPglRetSuccess;
+    const uint32_t* new_variant_idx_to_old_iter = new_variant_idx_to_old;
+    if ((make_plink2_modifier & kfMakeBed) || ((make_plink2_modifier & (kfMakePgen | (kfMakePgenFormatBase * 3))) == (kfMakePgen | kfMakePgenFormatBase))) {
+      // g_calc_thread_ct = 1;
+      logerrprint("Error: Fixed-width .bed/.pgen output doesn't support sorting yet.  Generate a\nregular sorted .pgen first, and then reformat it.\n");
+      reterr = kPglRetNotYetSupported;
+      goto make_pgen_robust_ret_1;
+    } else {
+      const uint32_t input_biallelic = (!variant_allele_idxs);
+      const uintptr_t* write_allele_idx_offsets = nullptr;
+      if (!input_biallelic) {
+	if ((variant_ct < raw_variant_ct) || new_variant_idx_to_old_iter) {
+	  uintptr_t* new_allele_idx_offsets;
+	  if (bigstack_alloc_ul(variant_ct + 1, &new_allele_idx_offsets)) {
+	    goto make_pgen_robust_ret_NOMEM;
+	  }
+	  uintptr_t cur_offset = 0;
+	  // todo: separate trim-alts case
+	  if (!new_variant_idx_to_old_iter) {
+	    uint32_t variant_uidx = 0;
+	    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+	      next_set_unsafe_ck(variant_include, &variant_uidx);
+	      new_allele_idx_offsets[variant_idx] = cur_offset;
+	      cur_offset += variant_allele_idxs[variant_uidx + 1] - variant_allele_idxs[variant_uidx];
+	    }
+	  } else {
+	    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx) {
+	      const uint32_t variant_uidx = *new_variant_idx_to_old_iter++;
+	      new_allele_idx_offsets[variant_idx] = cur_offset;
+	      cur_offset += variant_allele_idxs[variant_uidx + 1] - variant_allele_idxs[variant_uidx];
+	    }
+	    new_variant_idx_to_old_iter = new_variant_idx_to_old;
+	  }
+	  if (cur_offset != 2 * variant_ct) {
+	    new_allele_idx_offsets[variant_ct] = cur_offset;
+	    write_allele_idx_offsets = new_allele_idx_offsets;
+	    logerrprint("Error: Multiallelic .pgen write is not yet supported.\n");
+	    reterr = kPglRetNotYetSupported;
+	    goto make_pgen_robust_ret_1;
+	  } else {
+	    bigstack_reset(new_allele_idx_offsets);
+	  }
+	} else {
+	  write_allele_idx_offsets = variant_allele_idxs;
+	}
+      }
+      if ((variant_ct == raw_variant_ct) || new_variant_idx_to_old_iter) {
+	g_write_chr_fo_vidx_start = g_cip->chr_fo_vidx_start;
+      } else {
+	if (alloc_and_fill_subset_chr_fo_vidx_start(variant_include, g_cip, &g_write_chr_fo_vidx_start)) {
+	  goto make_pgen_robust_ret_NOMEM;
+	}
+      }
+      pgen_global_flags_t read_phase_dosage_gflags = simple_pgrp->fi.gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent);
+      if (make_plink2_modifier & kfMakePgenErasePhase) {
+	read_phase_dosage_gflags &= ~(kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent);
+      }
+      if (make_plink2_modifier & kfMakePgenEraseDosage) {
+	if (hard_call_thresh == 0xffffffffU) {
+	  read_phase_dosage_gflags &= ~(kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent);
+	  g_plink2_write_flags |= kfPlink2WriteLateDosageErase;
+	}
+      }
+      if (read_phase_dosage_gflags && (variant_ct < raw_variant_ct)) {
+	read_phase_dosage_gflags = gflags_vfilter(variant_include, simple_pgrp->fi.vrtypes, raw_variant_ct, simple_pgrp->fi.gflags);
+      }
+      g_read_phase_dosage_gflags = read_phase_dosage_gflags;
+      g_hard_call_halfdist = (hard_call_thresh == 0xffffffffU)? 0 : (kDosage4th - hard_call_thresh);
+      g_dosage_erase_halfdist = kDosage4th - dosage_erase_thresh;
+      const uint32_t read_hphase_present = (read_phase_dosage_gflags / kfPgenGlobalHardcallPhasePresent) & 1;
+      const uint32_t read_dosage_present = (read_phase_dosage_gflags & (kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent))? 1 : 0;
+      pgen_global_flags_t write_phase_dosage_gflags = read_phase_dosage_gflags;
+      if (g_plink2_write_flags & kfPlink2WriteLateDosageErase) {
+	write_phase_dosage_gflags &= ~(kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent);
+      }
+
+      uint32_t nonref_flags_storage = 3;
+      if (!simple_pgrp->fi.nonref_flags) {
+	nonref_flags_storage = (simple_pgrp->fi.gflags & kfPgenGlobalAllNonref)? 2 : 1;
+      } else if (variant_ct < raw_variant_ct) {
+	// todo: check if now constant
+      }
+      strcpy(outname_end, ".pgen");
+      uintptr_t spgw_alloc_cacheline_ct;
+      uint32_t max_vrec_len;
+      reterr = spgw_init_phase1(outname, write_allele_idx_offsets, simple_pgrp->fi.nonref_flags, variant_ct, sample_ct, write_phase_dosage_gflags, nonref_flags_storage, g_spgwp, &spgw_alloc_cacheline_ct, &max_vrec_len);
+      if (reterr) {
+	goto make_pgen_robust_ret_1;
+      }
+      unsigned char* spgw_alloc;
+      if (bigstack_alloc_ulp(1, &(g_loadbuf_thread_starts[0])) ||
+	  bigstack_alloc_ulp(1, &(g_loadbuf_thread_starts[1])) ||
+	  bigstack_alloc_uc(spgw_alloc_cacheline_ct * kCacheline, &spgw_alloc)) {
+	goto make_pgen_robust_ret_NOMEM;
+      }
+      spgw_init_phase2(max_vrec_len, g_spgwp, spgw_alloc);
+
+      const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+      if (new_sample_idx_to_old || subsetting_required) {
+	if (bigstack_alloc_ulp(1, &g_thread_write_genovecs)) {
+	  goto make_pgen_robust_ret_NOMEM;
+	}
+	if (read_hphase_present && new_sample_idx_to_old) {
+	  if (bigstack_alloc_ui(raw_sample_ct, &g_old_sample_idx_to_new)) {
+	    goto make_pgen_robust_ret_NOMEM;
+	  }
+	  for (uint32_t new_sample_idx = 0; new_sample_idx < sample_ct; ++new_sample_idx) {
+	    g_old_sample_idx_to_new[new_sample_idx_to_old[new_sample_idx]] = new_sample_idx;
+	  }
+	}
+	if (input_biallelic) {
+	  if (bigstack_alloc_ul(sample_ctl2, &(g_thread_write_genovecs[0]))) {
+	    goto make_pgen_robust_ret_NOMEM;
+	  }
+	} else {
+	  if (bigstack_alloc_ul(DIV_UP(2 * sample_ct * sizeof(alt_allele_ct_t), kBytesPerWord), &(g_thread_write_genovecs[0]))) {
+	    goto make_pgen_robust_ret_NOMEM;
+	  }
+	}
+      } else {
+        g_thread_write_genovecs = nullptr;
+      }
+      const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+      const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+      if (read_hphase_present) {
+	if (bigstack_alloc_ulp(1, &g_thread_write_phasepresents) ||
+	    bigstack_alloc_ulp(1, &g_thread_write_phaseinfos) ||
+	    bigstack_alloc_ulp(1, &g_thread_all_hets) ||
+	    bigstack_alloc_ul(sample_ctl, &(g_thread_write_phasepresents[0])) ||
+	    bigstack_alloc_ul(sample_ctl, &(g_thread_write_phaseinfos[0])) ||
+	    bigstack_alloc_ul(raw_sample_ctl, &(g_thread_all_hets[0]))) {
+	  goto make_pgen_robust_ret_NOMEM;
+	}
+      }
+      if (read_dosage_present) {
+	if (bigstack_alloc_dosagep(1, &g_thread_write_dosagevals) ||
+	    bigstack_alloc_ulp(1, &g_thread_write_dosagepresents) ||
+	    bigstack_alloc_ul(sample_ctl, &(g_thread_write_dosagepresents[0])) ||
+	    bigstack_alloc_dosage(sample_ct, &(g_thread_write_dosagevals[0]))) {
+	  goto make_pgen_robust_ret_NOMEM;
+	}
+	if (new_sample_idx_to_old) {
+	  if (bigstack_alloc_uip(1, &g_thread_cumulative_popcount_bufs) ||
+	      bigstack_alloc_ui(raw_sample_ctl, &(g_thread_cumulative_popcount_bufs[0]))) {
+	    goto make_pgen_robust_ret_NOMEM;
+	  }
+	}
+      }
+      g_refalt1_select = refalt1_select;
+      if (refalt1_select) {
+	if (variant_ct < raw_variant_ct) {
+	  // might want inner loop to map variant uidx -> idx instead
+	  g_refalt1_select = (alt_allele_ct_t*)bigstack_alloc(variant_ct * 2 * sizeof(alt_allele_ct_t));
+	  if (!g_refalt1_select) {
+	    goto make_pgen_robust_ret_NOMEM;
+	  }
+	  uint32_t variant_uidx = 0;
+	  for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+	    next_set_unsafe_ck(variant_include, &variant_uidx);
+	    // const_cast
+	    memcpy((alt_allele_ct_t*)((uintptr_t)(&(g_refalt1_select[2 * variant_idx]))), &(refalt1_select[2 * variant_uidx]), 2 * sizeof(alt_allele_ct_t));
+	  }
+	} else {
+	  assert(!new_variant_idx_to_old_iter);
+	}
+      }
+
+      const uint32_t raw_sample_ctv2 = QUATERCT_TO_VECCT(raw_sample_ct);
+      uintptr_t load_variant_vec_ct = raw_sample_ctv2;
+      uint32_t loaded_vrtypes_needed = 0;
+      if (read_hphase_present || read_dosage_present) {
+        loaded_vrtypes_needed = 1;
+	if (read_hphase_present) {
+	  // phaseraw has two parts:
+	  // 1. vec-aligned bitarray of up to (raw_sample_ct + 1) bits.  the
+	  //    first bit is set iff phasepresent is explicitly stored at all
+	  //    (if not, all hets are assumed to be phased).  when it is set,
+	  //    the remaining bits store packed phasepresent values for all
+	  //    hets; when it isn't, the remaining bits store packed phaseinfo
+	  //    values for all hets.
+	  // 2. word-aligned bitarray of up to raw_sample_ct bits, storing
+	  //    phaseinfo values.  (end of this array is vec-aligned.)
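+	  //
+	  // e.g. on a 64-bit build with 16-byte vectors (kWordsPerVec == 2,
+	  // kBitsPerWordD2 == 32), raw_sample_ct == 1000 yields
+	  // 2 + round_down_pow2(31, 2) == 32 words (256 bytes) per variant.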
+	  const uintptr_t phaseraw_word_ct = kWordsPerVec + round_down_pow2(raw_sample_ct / kBitsPerWordD2, kWordsPerVec);
+	  load_variant_vec_ct += WORDCT_TO_VECCT(phaseraw_word_ct);
+	}
+	if (read_dosage_present) {	
+	  // todo: phased dosage
+
+	  // (unphased, biallelic) dosageraw has two parts:
+	  // 1. vec-aligned bitarray of up to raw_sample_ct bits, storing which
+	  //    samples have dosages.
+	  // 2. word-aligned array of uint16s with 0..32768 fixed-point dosages.
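+	  //
+	  // e.g. with the same 64-bit/16-byte-vector assumptions and a 2-byte
+	  // dosage_t, raw_sample_ct == 1000 yields
+	  // 2 * (BITCT_TO_VECCT(1000) + DIV_UP(1000, 8)) == 2 * (8 + 125) ==
+	  // 266 words per variant.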
+	  assert(!(read_phase_dosage_gflags & kfPgenGlobalDosagePhasePresent));
+	  const uintptr_t dosageraw_word_ct = kWordsPerVec * (BITCT_TO_VECCT(raw_sample_ct) + DIV_UP(raw_sample_ct, (kBytesPerVec / sizeof(dosage_t))));
+	  load_variant_vec_ct += WORDCT_TO_VECCT(dosageraw_word_ct);
+	}
+      }
+      // todo: multiallelic variants
+
+      uintptr_t bytes_left = bigstack_left();
+      if (bytes_left < 7 * kCacheline) {
+	goto make_pgen_robust_ret_NOMEM;
+      }
+      bytes_left -= 7 * kCacheline; // defend against adverse rounding
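+      // each buffered variant costs kBytesPerVec * load_variant_vec_ct bytes
+      // (plus one vrtype byte when tracked), double-buffered; ulii is the
+      // largest batch size that fits.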
+      uintptr_t ulii = bytes_left / (2 * (kBytesPerVec * load_variant_vec_ct + loaded_vrtypes_needed));
+      if (!ulii) {
+	goto make_pgen_robust_ret_NOMEM;
+      }
+      if (ulii > MINV(kPglVblockSize, variant_ct)) {
+	ulii = MINV(kPglVblockSize, variant_ct);
+      }
+      const uint32_t write_block_size = ulii;
+      uintptr_t* main_loadbufs[2];
+      main_loadbufs[0] = (uintptr_t*)bigstack_alloc_raw_rd(load_variant_vec_ct * kBytesPerVec * write_block_size);
+      main_loadbufs[1] = (uintptr_t*)bigstack_alloc_raw_rd(load_variant_vec_ct * kBytesPerVec * write_block_size);
+      if (loaded_vrtypes_needed) {
+	g_loaded_vrtypes[0] = bigstack_alloc_raw_rd(write_block_size);
+	g_loaded_vrtypes[1] = bigstack_alloc_raw_rd(write_block_size);
+      } else {
+	g_loaded_vrtypes[0] = nullptr;
+	g_loaded_vrtypes[1] = nullptr;
+      }
+      
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fputs("0%", stdout);
+      fflush(stdout);
+
+      // Main workflow:
+      // 1. Set n=0, load first write_block_size post-filtering variants
+      //
+      // 2. Spawn single thread processing batch n
+      // 3. Load batch (n+1) unless eof
+      // 4. Join thread
+      // 5. Increment n by 1
+      // 6. Goto step 2 unless eof
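+      // (batches alternate between main_loadbufs[0]/[1] via `parity`, so the
+      // step-3 read can overlap the step-2 compression)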
+      const uint32_t batch_ct_m1 = (variant_ct - 1) / write_block_size;
+      uint32_t pct = 0;
+      uint32_t parity = 0;
+      uint32_t read_batch_idx = 0;
+      uint32_t cur_batch_size = write_block_size;
+      uint32_t next_print_variant_idx = variant_ct / 100;
+      uint32_t read_variant_uidx = 0xffffffffU; // deliberate overflow
+      pgr_clear_ld_cache(simple_pgrp);
+      while (1) {
+	if (!ts.is_last_block) {
+	  if (read_batch_idx == batch_ct_m1) {
+	    cur_batch_size = variant_ct - (read_batch_idx * write_block_size);
+	  }
+	  uintptr_t* cur_loadbuf = main_loadbufs[parity];
+	  uintptr_t* loadbuf_iter = cur_loadbuf;
+	  unsigned char* cur_loaded_vrtypes = g_loaded_vrtypes[parity];
+	  g_loadbuf_thread_starts[parity][0] = loadbuf_iter;
+	  for (uint32_t uii = 0; uii < cur_batch_size; ++uii) {
+	    if (!new_variant_idx_to_old_iter) {
+	      ++read_variant_uidx;
+	      next_set_unsafe_ck(variant_include, &read_variant_uidx);
+	    } else {
+	      read_variant_uidx = *new_variant_idx_to_old_iter++;
+	    }
+	    reterr = pgr_read_raw(read_variant_uidx, read_phase_dosage_gflags, simple_pgrp, &loadbuf_iter, cur_loaded_vrtypes? (&(cur_loaded_vrtypes[uii])) : nullptr);
+	    if (reterr) {
+	      if (reterr == kPglRetMalformedInput) {
+		logprint("\n");
+		logerrprint("Error: Malformed .pgen file.\n");
+	      }
+	      goto make_pgen_robust_ret_1;
+	    }
+	  }
+	}
+	if (read_batch_idx) {
+	  join_threads3z(&ts);
+	  reterr = g_error_ret;
+	  if (reterr) {
+	    goto make_pgen_robust_ret_WRITE_FAIL;
+	  }
+	}
+	if (!ts.is_last_block) {
+	  g_cur_block_write_ct = cur_batch_size;
+	  ts.is_last_block = (read_batch_idx == batch_ct_m1);
+	  ts.thread_func_ptr = make_pgen_thread;
+	  if (spawn_threads3z(read_batch_idx, &ts)) {
+	    goto make_pgen_robust_ret_THREAD_CREATE_FAIL;
+	  }
+	}
+	parity = 1 - parity;
+	if (read_batch_idx) {
+	  if (read_batch_idx > batch_ct_m1) {
+	    break;
+	  }
+	  const uint32_t write_idx_end = read_batch_idx * write_block_size;
+	  if (write_idx_end >= next_print_variant_idx) {
+	    if (pct > 10) {
+	      putc_unlocked('\b', stdout);
+	    }
+	    pct = (write_idx_end * 100LLU) / variant_ct;
+	    printf("\b\b%u%%", pct++);
+	    fflush(stdout);
+	    next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+	  }
+	}
+	++read_batch_idx;
+      }
+      spgw_finish(g_spgwp);
+      if (pct > 10) {
+	putc_unlocked('\b', stdout);
+      }
+      fputs("\b\b", stdout);
+      LOGPRINTF("done.\n");
+    }
+  }
+  while (0) {
+  make_pgen_robust_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  make_pgen_robust_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  make_pgen_robust_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ make_pgen_robust_ret_1:
+  threads3z_cleanup(&ts, &g_cur_block_write_ct);
+  if (spgw_cleanup(&spgw) && (!reterr)) {
+    reterr = kPglRetWriteFail;
+  }
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t make_plink2_no_vsort(const char* xheader, const uintptr_t* sample_include, const char* sample_ids, const char* sids, const char* paternal_ids, const char* maternal_ids, const uintptr_t* sex_nm, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage,  [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  threads_state_t ts;
+  init_threads3z(&ts);
+  mt_pgen_writer_t* mpgwp = nullptr;
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if (make_plink2_modifier & kfMakePlink2MMask) {
+      logerrprint("Error: --make-bed/--make-{b}pgen multiallelics= is currently under development.\n");
+      reterr = kPglRetNotYetSupported;
+      goto make_plink2_no_vsort_ret_1;
+    }    
+    g_plink2_write_flags = kfPlink2Write0;
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    if (make_plink2_modifier & kfMakePlink2SetHhMissing) {
+      const uint32_t sample_ctv = BITCT_TO_VECCT(sample_ct);
+      uintptr_t* sex_collapsed_tmp;
+      uintptr_t* sex_female;
+      if (bigstack_alloc_ul(sample_ctv * kWordsPerVec, &g_sex_male_collapsed_interleaved) ||
+	  bigstack_alloc_ul(sample_ctv * kWordsPerVec, &g_sex_female_collapsed_interleaved) ||
+	  bigstack_alloc_ul(sample_ctv * kWordsPerVec, &sex_collapsed_tmp) ||
+	  bigstack_alloc_ul(raw_sample_ctl, &sex_female)) {
+	goto make_plink2_no_vsort_ret_NOMEM;
+      }
+      copy_bitarr_subset(sex_male, sample_include, sample_ct, sex_collapsed_tmp);
+      fill_interleaved_mask_vec(sex_collapsed_tmp, sample_ctv, g_sex_male_collapsed_interleaved);
+
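+      // females = samples with known sex (sex_nm bit set) that are not male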
+      bitvec_andnot_copy(sex_nm, sex_male, raw_sample_ctl, sex_female);
+      copy_bitarr_subset(sex_female, sample_include, sample_ct, sex_collapsed_tmp);
+      fill_interleaved_mask_vec(sex_collapsed_tmp, sample_ctv, g_sex_female_collapsed_interleaved);
+      
+      bigstack_reset(sex_collapsed_tmp);
+      g_plink2_write_flags |= kfPlink2WriteSetHhMissing;
+    }
+    if (make_plink2_modifier & kfMakePlink2SetMixedMtMissing) {
+      g_plink2_write_flags |= kfPlink2WriteSetMixedMtMissing;
+    }
+    g_cip = cip;
+    unsigned char* bigstack_mark2 = g_bigstack_base;
+    const uint32_t make_pgen = make_plink2_modifier & kfMakePgen;
+    // todo: prohibit .pgen + .bim write when data is multiallelic without
+    //   either multiallelic split or erase-alt2+ specified
+    //   (--make-bed = automatic erase-alt2+)
+    if ((make_plink2_modifier & kfMakeBed) || ((make_plink2_modifier & (kfMakePgen | (kfMakePgenFormatBase * 3))) == (kfMakePgen | kfMakePgenFormatBase))) {
+      // fixed-width
+      if (make_pgen) {
+        strcpy(outname_end, ".pgen");
+      } else {
+        strcpy(outname_end, ".bed");
+      }
+      if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+	goto make_plink2_no_vsort_ret_OPEN_FAIL;
+      }
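+      // Header layout written below (summary of this code, for reference):
+      // 3 magic bytes ("l\x1b\x02" for .pgen, "l\x1b\x01" for .bed); then,
+      // for .pgen only, a 4-byte variant count, a 4-byte sample count, and
+      // one control byte: 64 = nonref_flags absent, 128 = all-nonref
+      // (kfPgenGlobalAllNonref), 192 = explicit nonref_flags bitarray
+      // follows.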
+      if (make_pgen) {
+	fwrite("l\x1b\x02", 3, 1, outfile);
+        fwrite(&variant_ct, 4, 1, outfile);
+	fwrite(&sample_ct, 4, 1, outfile);
+	if (!pgfip->nonref_flags) {
+	  const pgen_global_flags_t gflags = pgfip->gflags;
+	  uint32_t uii = 64;
+	  if (gflags & kfPgenGlobalAllNonref) {
+	    uii = 128;
+	  }
+	  putc_unlocked(uii, outfile);
+	} else {
+	  putc_unlocked(192, outfile);
+	  fwrite(pgfip->nonref_flags, DIV_UP(variant_ct, CHAR_BIT), 1, outfile);
+	}
+	if (ferror(outfile)) {
+	  goto make_plink2_no_vsort_ret_WRITE_FAIL;
+	}
+      } else {
+	if (fwrite_checked("l\x1b\x01", 3, outfile)) {
+	  goto make_plink2_no_vsort_ret_WRITE_FAIL;
+	}
+      }
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fputs("0%", stdout);
+      fflush(stdout);
+      uint32_t pct = 0;
+      if (variant_ct && sample_ct) {
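+	// fixed-width rows: each variant below occupies
+	// ceil(sample_ct / 4) bytes, i.e. 2 bits per genotype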
+	const uintptr_t sample_ct4 = QUATERCT_TO_BYTECT(sample_ct);
+	if (bigstack_alloc_ui(raw_sample_ctl, &g_sample_include_cumulative_popcounts) ||
+	    bigstack_alloc_uc(sample_ct4 * kPglVblockSize, &(g_writebufs[0])) ||
+	    bigstack_alloc_uc(sample_ct4 * kPglVblockSize, &(g_writebufs[1]))) {
+	  // todo: low-memory single-threaded fallback mode
+	  goto make_plink2_no_vsort_ret_NOMEM;
+	}
+	fill_cumulative_popcounts(sample_include, raw_sample_ctl, g_sample_include_cumulative_popcounts);
+	// tried more threads, pointless since this is too I/O-bound
+	// (exception: reordering samples)
+	uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+	g_collapsed_sort_map = new_sample_idx_to_old;
+	if (!new_sample_idx_to_old) {
+	  // subsetting is most expensive with sample_ct near 2/3 of
+	  // raw_sample_ct; up to ~7 compute threads are useful in that case.
+	  // (see copy_quaterarr_nonempty_subset().)
+	  uint64_t numer;
+	  if (sample_ct * (3 * k1LU) <= raw_sample_ct * (2 * k1LU)) {
+	    numer = sample_ct * (9 * k1LU);
+	  } else {
+	    numer = (raw_sample_ct - sample_ct) * (18 * k1LU);
+	  }
+	  const uint32_t calc_thread_max = 1 + (numer / raw_sample_ct);
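+	  // illustrative numbers (not from upstream): raw_sample_ct = 100000,
+	  // sample_ct = 60000 satisfies 3 * 60000 <= 2 * 100000, so
+	  // numer = 9 * 60000 = 540000 and calc_thread_max = 1 + 5 = 6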
+	  if (calc_thread_max < calc_thread_ct) {
+	    calc_thread_ct = calc_thread_max;
+	  }
+	} else if (sample_ct < raw_sample_ct) {
+	  // const_cast
+	  if (bigstack_alloc_ui(sample_ct, (uint32_t**)((uintptr_t)(&g_collapsed_sort_map)))) {
+	    goto make_plink2_no_vsort_ret_NOMEM;
+	  }
+	  uidxs_to_idxs(sample_include, g_sample_include_cumulative_popcounts, sample_ct, (uint32_t*)((uintptr_t)g_collapsed_sort_map));
+	}
+
+	if (make_plink2_modifier & kfMakeBed) {
+	  g_plink2_write_flags |= kfPlink2WritePlink1;
+	}
+	
+	unsigned char* main_loadbufs[2];
+	uint32_t read_block_size;
+	if (multithread_load_init(variant_include, sample_ct, variant_ct, pgr_alloc_cacheline_ct, 0, 0, pgfip, &calc_thread_ct, &g_genovecs, nullptr, nullptr, &read_block_size, main_loadbufs, &ts.threads, &g_pgr_ptrs, &g_read_variant_uidx_starts)) {
+	  goto make_plink2_no_vsort_ret_NOMEM;
+	}
+
+	g_variant_include = variant_include;
+	g_refalt1_select = refalt1_select;
+	g_sample_include = sample_include;
+	g_sample_ct = sample_ct;
+	ts.calc_thread_ct = calc_thread_ct;
+	g_calc_thread_ct = calc_thread_ct;
+	g_error_ret = kPglRetSuccess;
+
+	// Main workflow:
+	// 1. Set n=0, load/skip block 0
+	//
+	// 2. Spawn threads processing block n
+	// 3. If n>0, write results for block (n-1)
+	// 4. Increment n by 1
+	// 5. Load/skip block n unless eof
+	// 6. Join threads
+	// 7. Goto step 2 unless eof
+	//
+	// 8. Write results for last block
+
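+	// Descriptive note: this loop double-buffers g_writebufs[0]/[1]:
+	// while the worker threads fill one buffer with block n, the main
+	// thread flushes the other, which still holds block n-1.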
+	const uint32_t read_block_sizel = BITCT_TO_WORDCT(read_block_size);
+	const uint32_t read_block_ct_m1 = (raw_variant_ct - 1) / read_block_size;
+	uint32_t parity = 0;
+	uint32_t read_block_idx = 0;
+	uint32_t prev_variant_idx = 0;
+	uint32_t variant_idx = 0;
+	uint32_t cur_read_block_size = read_block_size;
+	uint32_t next_print_variant_idx = variant_ct / 100;
+	while (1) {
+	  uintptr_t cur_block_write_ct = 0;
+	  if (!ts.is_last_block) {
+	    while (read_block_idx < read_block_ct_m1) {
+	      cur_block_write_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), read_block_sizel);
+	      if (cur_block_write_ct) {
+		break;
+	      }
+	      ++read_block_idx;
+	    }
+	    if (read_block_idx == read_block_ct_m1) {
+	      cur_read_block_size = raw_variant_ct - (read_block_idx * read_block_size);
+	      cur_block_write_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), BITCT_TO_WORDCT(cur_read_block_size));
+	    }
+	    if (pgfi_multiread(variant_include, read_block_idx * read_block_size, read_block_idx * read_block_size + cur_read_block_size, cur_block_write_ct, pgfip)) {
+	      goto make_plink2_no_vsort_ret_READ_FAIL;
+	    }
+	  }
+	  if (variant_idx) {
+	    join_threads3z(&ts);
+	    reterr = g_error_ret;
+	    if (reterr) {
+	      if (reterr == kPglRetMalformedInput) {
+		logprint("\n");
+		logerrprint("Error: Malformed .pgen file.\n");
+	      }
+	      goto make_plink2_no_vsort_ret_1;
+	    }
+	  }
+	  if (!ts.is_last_block) {
+	    g_cur_block_write_ct = cur_block_write_ct;
+	    compute_uidx_start_partition(variant_include, cur_block_write_ct, calc_thread_ct, read_block_idx * read_block_size, g_read_variant_uidx_starts);
+	    for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	      g_pgr_ptrs[tidx]->fi.block_base = pgfip->block_base;
+	      g_pgr_ptrs[tidx]->fi.block_offset = pgfip->block_offset;
+	    }
+	    ts.is_last_block = (variant_idx + cur_block_write_ct == variant_ct);
+	    ts.thread_func_ptr = make_bedlike_thread;
+	    if (spawn_threads3z(variant_idx, &ts)) {
+	      goto make_plink2_no_vsort_ret_THREAD_CREATE_FAIL;
+	    }
+	  }
+	  parity = 1 - parity;
+	  if (variant_idx) {
+	    // write *previous* block results
+	    if (fwrite_checked(g_writebufs[parity], (variant_idx - prev_variant_idx) * sample_ct4, outfile)) {
+	      goto make_plink2_no_vsort_ret_WRITE_FAIL;
+	    }
+	    if (variant_idx == variant_ct) {
+	      break;
+	    }
+	    if (variant_idx >= next_print_variant_idx) {
+	      if (pct > 10) {
+		putc_unlocked('\b', stdout);
+	      }
+	      pct = (variant_idx * 100LLU) / variant_ct;
+	      printf("\b\b%u%%", pct++);
+	      fflush(stdout);
+	      next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+	    }
+	    prev_variant_idx = variant_idx;
+	  }
+	  ++read_block_idx;
+	  variant_idx += cur_block_write_ct;
+	  // crucially, this is independent of the pgen_reader_t block_base
+	  // pointers
+	  pgfip->block_base = main_loadbufs[parity];
+	}
+      }
+      if (fclose_null(&outfile)) {
+	goto make_plink2_no_vsort_ret_WRITE_FAIL;
+      }
+      if (pct > 10) {
+	putc_unlocked('\b', stdout);
+      }
+      fputs("\b\b", stdout);
+      LOGPRINTF("done.\n");
+      bigstack_reset(bigstack_mark);
+    } else if (make_pgen) {
+      // should be straightforward to make this sort variants...
+      if ((!variant_ct) || (!sample_ct)) {
+	logerrprint("Error: Zero-variant/zero-sample .pgen writing is not currently supported.\n");
+	reterr = kPglRetNotYetSupported;
+	goto make_plink2_no_vsort_ret_1;
+      }
+      const uint32_t input_biallelic = (!variant_allele_idxs);
+      const uintptr_t* write_allele_idx_offsets = nullptr;
+      if (!input_biallelic) {
+        if (variant_ct < raw_variant_ct) {
+	  uintptr_t* new_allele_idx_offsets;
+	  if (bigstack_alloc_ul(variant_ct + 1, &new_allele_idx_offsets)) {
+	    goto make_plink2_no_vsort_fallback;
+	  }
+	  uintptr_t cur_offset = 0;
+	  uint32_t variant_uidx = 0;
+	  // todo: separate trim-alts case
+	  for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+	    next_set_unsafe_ck(variant_include, &variant_uidx);
+	    new_allele_idx_offsets[variant_idx] = cur_offset;
+	    cur_offset += variant_allele_idxs[variant_uidx + 1] - variant_allele_idxs[variant_uidx];
+	  }
+	  if (cur_offset != 2 * variant_ct) {
+	    new_allele_idx_offsets[variant_ct] = cur_offset;
+	    write_allele_idx_offsets = new_allele_idx_offsets;
+	    logprint("Error: Multiallelic .pgen write is not yet supported.\n");
+	    reterr = kPglRetNotYetSupported;
+	    goto make_plink2_no_vsort_ret_1;
+	  } else {
+	    bigstack_reset(new_allele_idx_offsets);
+	  }
+	} else {
+	  write_allele_idx_offsets = variant_allele_idxs;
+	}
+      }
+      if (variant_ct == raw_variant_ct) {
+	g_write_chr_fo_vidx_start = cip->chr_fo_vidx_start;
+      } else {
+	if (alloc_and_fill_subset_chr_fo_vidx_start(variant_include, cip, &g_write_chr_fo_vidx_start)) {
+	  goto make_plink2_no_vsort_fallback;
+	}
+      }
+      pgen_global_flags_t read_phase_dosage_gflags = pgfip->gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent);
+      if (make_plink2_modifier & kfMakePgenErasePhase) {
+	read_phase_dosage_gflags &= ~(kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent);
+      }
+      if (make_plink2_modifier & kfMakePgenEraseDosage) {
+	if (hard_call_thresh == 0xffffffffU) {
+	  read_phase_dosage_gflags &= ~(kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent);
+	} else {
+	  // erase-dosage + --hard-call-threshold currently requires dosages to
+	  // be read, and only thrown away at the last minute
+	  // (alternatively, we could build --hard-call-threshold directly into
+	  // pgr_read_raw?)
+	  g_plink2_write_flags |= kfPlink2WriteLateDosageErase;
+	}
+      }
+      if (read_phase_dosage_gflags && (variant_ct < raw_variant_ct)) {
+	// did we e.g. filter out all the phased variants?
+	read_phase_dosage_gflags = gflags_vfilter(variant_include, pgfip->vrtypes, raw_variant_ct, pgfip->gflags);
+      }
+      // could check if all the phased samples were also filtered out, but
+      // that's already caught by running --make-pgen twice, so not a big deal
+      g_read_phase_dosage_gflags = read_phase_dosage_gflags;
+      g_hard_call_halfdist = (hard_call_thresh == 0xffffffffU)? 0 : (kDosage4th - hard_call_thresh);
+      g_dosage_erase_halfdist = kDosage4th - dosage_erase_thresh;
+      const uint32_t read_hphase_present = (read_phase_dosage_gflags / kfPgenGlobalHardcallPhasePresent) & 1;
+      const uint32_t read_dosage_present = (read_phase_dosage_gflags & (kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent))? 1 : 0;
+      pgen_global_flags_t write_phase_dosage_gflags = read_phase_dosage_gflags;
+      if (g_plink2_write_flags & kfPlink2WriteLateDosageErase) {
+	write_phase_dosage_gflags &= ~(kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent);
+      }
+      uintptr_t alloc_base_cacheline_ct;
+      uint64_t mpgw_per_thread_cacheline_ct;
+      uint32_t vrec_len_byte_ct;
+      uint64_t vblock_cacheline_ct;
+      // may want to have a load_sample_ct which is raw_sample_ct when e.g.
+      // sample_ct > 0.1 * raw_sample_ct, and sample_ct otherwise.
+      mpgw_init_phase1(write_allele_idx_offsets, variant_ct, sample_ct, write_phase_dosage_gflags, &alloc_base_cacheline_ct, &mpgw_per_thread_cacheline_ct, &vrec_len_byte_ct, &vblock_cacheline_ct);
+
+      // bugfix: each variant currently needs to be vector-aligned
+      // bugfix?: need to use raw_sample_ct here, not sample_ct
+      const uint32_t raw_sample_ctv2 = QUATERCT_TO_VECCT(raw_sample_ct);
+      const uint32_t max_vblock_size = MINV(kPglVblockSize, variant_ct);
+      uint64_t load_vblock_cacheline_ct = VECCT_TO_CLCT(((uint64_t)raw_sample_ctv2) * max_vblock_size);
+
+      if (read_hphase_present) {
+	// could make this bound tighter when lots of unphased variants are
+	// mixed in among the phased variants, but this isn't nearly as
+	// important as the analogous multiallelic optimization
+
+	// phaseraw has two parts:
+	// 1. vec-aligned bitarray of up to (raw_sample_ct + 1) bits.  first
+	//    bit is set iff phasepresent is explicitly stored at all (if not,
+	//    all hets are assumed to be phased), if yes the remaining bits
+	//    store packed phasepresent values for all hets, if no the
+	//    remaining bits store packed phaseinfo values for all hets.
+	// 2. word-aligned bitarray of up to raw_sample_ct bits, storing
+	//    phaseinfo values.  (end of this array is vec-aligned.)
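+	// Illustrative sizing (assuming 128-bit vectors, 64-bit words, and
+	// raw_sample_ct == 1000): 1000 / 32 = 31 words, rounded down to 30,
+	// plus kWordsPerVec == 2 gives 32 words per variant.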
+	const uintptr_t phaseraw_word_ct = kWordsPerVec + round_down_pow2(raw_sample_ct / kBitsPerWordD2, kWordsPerVec);
+	load_vblock_cacheline_ct += WORDCT_TO_CLCT(((uint64_t)phaseraw_word_ct) * max_vblock_size);
+      }
+      if (read_dosage_present) {	
+	// todo: phased dosage
+
+	// (unphased, biallelic) dosageraw has two parts:
+	// 1. vec-aligned bitarray of up to raw_sample_ct bits, storing which
+	//    samples have dosages.
+	// 2. word-aligned array of uint16s with 0..32768 fixed-point dosages.
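+	// Illustrative sizing (assuming 16-byte vectors and 2-byte dosage_t,
+	// raw_sample_ct == 1000): bitarray = 8 vectors, dosage array =
+	// DIV_UP(1000, 8) = 125 vectors, i.e. 133 * kWordsPerVec = 266 words.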
+	assert(!(read_phase_dosage_gflags & kfPgenGlobalDosagePhasePresent));
+	const uintptr_t dosageraw_word_ct = kWordsPerVec * (BITCT_TO_VECCT(raw_sample_ct) + DIV_UP(raw_sample_ct, (kBytesPerVec / sizeof(dosage_t))));
+	load_vblock_cacheline_ct += WORDCT_TO_CLCT(dosageraw_word_ct * ((uint64_t)max_vblock_size));
+      }
+      // todo: multiallelic variants
+      
+#ifndef __LP64__
+      if ((mpgw_per_thread_cacheline_ct > (0x7fffffff / kCacheline)) || (load_vblock_cacheline_ct > (0x7fffffff / kCacheline))) {
+	goto make_plink2_no_vsort_fallback;
+      }
+#endif
+      uint32_t calc_thread_ct = DIV_UP(variant_ct, kPglVblockSize);
+      if (calc_thread_ct >= max_thread_ct) {
+	calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+      }
+      const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+      if (!new_sample_idx_to_old) {
+	// hphase doesn't seem to affect read:write ratio much
+	const uint32_t max_calc_thread_ct = 2 + subsetting_required;
+	if (calc_thread_ct > max_calc_thread_ct) {
+	  calc_thread_ct = max_calc_thread_ct;
+	}
+      }
+      // this is frequently I/O-bound even when resorting, but I'll postpone
+      // tuning thread count there
+      g_refalt1_select = refalt1_select;
+      if (refalt1_select && (variant_ct < raw_variant_ct)) {
+	// might want inner loop to map variant uidx -> idx instead
+        g_refalt1_select = (alt_allele_ct_t*)bigstack_alloc(variant_ct * 2 * sizeof(alt_allele_ct_t));
+	if (!g_refalt1_select) {
+	  goto make_plink2_no_vsort_fallback;
+	}
+	uint32_t variant_uidx = 0;
+	for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+	  next_set_unsafe_ck(variant_include, &variant_uidx);
+	  // const_cast
+	  memcpy((alt_allele_ct_t*)((uintptr_t)(&(g_refalt1_select[2 * variant_idx]))), &(refalt1_select[2 * variant_uidx]), 2 * sizeof(alt_allele_ct_t));
+	}
+      }
+      mpgwp = (mt_pgen_writer_t*)bigstack_alloc((calc_thread_ct + DIV_UP(sizeof(mt_pgen_writer_t), kBytesPerWord)) * sizeof(intptr_t));
+      if (!mpgwp) {
+	goto make_plink2_no_vsort_fallback;
+      }
+      mpgwp->pgen_outfile = nullptr;
+      if (bigstack_alloc_thread(calc_thread_ct, &ts.threads) ||
+	  bigstack_alloc_ulp(calc_thread_ct, &(g_loadbuf_thread_starts[0])) ||
+	  bigstack_alloc_ulp(calc_thread_ct, &(g_loadbuf_thread_starts[1]))) {
+	goto make_plink2_no_vsort_fallback;
+      }
+      uint32_t nonref_flags_storage = 3;
+      if (!pgfip->nonref_flags) {
+	nonref_flags_storage = (simple_pgrp->fi.gflags & kfPgenGlobalAllNonref)? 2 : 1;
+      } else if (variant_ct < raw_variant_ct) {
+	// todo: check if now constant
+      }
+      g_pwcs = &(mpgwp->pwcs[0]);
+      g_new_sample_idx_to_old = new_sample_idx_to_old;
+      g_thread_write_genovecs = nullptr;
+      uintptr_t other_per_thread_cacheline_ct = 2 * load_vblock_cacheline_ct;
+      if (new_sample_idx_to_old || subsetting_required) {
+	if (bigstack_alloc_ulp(calc_thread_ct, &g_thread_write_genovecs)) {
+	  goto make_plink2_no_vsort_fallback;
+	}
+	if (read_hphase_present && new_sample_idx_to_old) {
+	  if (bigstack_alloc_ui(raw_sample_ct, &g_old_sample_idx_to_new)) {
+	    goto make_plink2_no_vsort_fallback;
+	  }
+	  for (uint32_t new_sample_idx = 0; new_sample_idx < sample_ct; ++new_sample_idx) {
+	    g_old_sample_idx_to_new[new_sample_idx_to_old[new_sample_idx]] = new_sample_idx;
+	  }
+	}
+	if (read_dosage_present && new_sample_idx_to_old) {
+	  // g_thread_cumulative_popcount_bufs
+	  other_per_thread_cacheline_ct += INT32CT_TO_CLCT(raw_sample_ctl);
+	}
+	// per-thread output buffers required
+	if (input_biallelic) {
+	  other_per_thread_cacheline_ct += QUATERCT_TO_CLCT(sample_ct);
+	} else {
+	  other_per_thread_cacheline_ct += DIV_UP(2 * sample_ct * sizeof(alt_allele_ct_t), kCacheline);
+	}
+      }
+      if (read_hphase_present || read_dosage_present) {
+	if (read_hphase_present) {
+	  if (bigstack_alloc_ulp(calc_thread_ct, &g_thread_write_phasepresents) ||
+	      bigstack_alloc_ulp(calc_thread_ct, &g_thread_write_phaseinfos) ||
+	      bigstack_alloc_ulp(calc_thread_ct, &g_thread_all_hets)) {
+	    goto make_plink2_no_vsort_fallback;
+	  }
+	  // phasepresent, phaseinfo
+	  other_per_thread_cacheline_ct += 2 * BITCT_TO_CLCT(sample_ct);
+
+	  // all_hets
+	  other_per_thread_cacheline_ct += BITCT_TO_CLCT(raw_sample_ct);
+	}
+	if (read_dosage_present) {
+	  if (bigstack_alloc_dosagep(calc_thread_ct, &g_thread_write_dosagevals) ||
+	      bigstack_alloc_ulp(calc_thread_ct, &g_thread_write_dosagepresents)) {
+	    goto make_plink2_no_vsort_fallback;
+	  }
+	  if (new_sample_idx_to_old) {
+	    if (bigstack_alloc_uip(calc_thread_ct, &g_thread_cumulative_popcount_bufs)) {
+	      goto make_plink2_no_vsort_fallback;
+	    }
+	  }
+	  // dosage_present
+	  other_per_thread_cacheline_ct += BITCT_TO_CLCT(sample_ct);
+
+	  // dosage_vals
+	  other_per_thread_cacheline_ct += DIV_UP(sample_ct, (kCacheline / sizeof(dosage_t)));
+	}
+	// g_loaded_vrtypes
+	other_per_thread_cacheline_ct += 2 * (kPglVblockSize / kCacheline);
+      }
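+      // Budget check (descriptive note): the total requirement is
+      // alloc_base plus (mpgw_per_thread + other_per_thread) cachelines per
+      // thread; if that exceeds what's left, calc_thread_ct is reduced, and
+      // if even one thread doesn't fit we fall back to make_pgen_robust.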
+      const uintptr_t cachelines_avail = bigstack_left() / kCacheline;
+      if (cachelines_avail < alloc_base_cacheline_ct + (mpgw_per_thread_cacheline_ct + other_per_thread_cacheline_ct) * calc_thread_ct) {
+	if (cachelines_avail < alloc_base_cacheline_ct + mpgw_per_thread_cacheline_ct + other_per_thread_cacheline_ct) {
+	  goto make_plink2_no_vsort_fallback;
+	}
+	calc_thread_ct = (cachelines_avail - alloc_base_cacheline_ct) / (mpgw_per_thread_cacheline_ct + other_per_thread_cacheline_ct);
+      }
+      uintptr_t* main_loadbufs[2];
+      main_loadbufs[0] = (uintptr_t*)bigstack_alloc_raw(load_vblock_cacheline_ct * calc_thread_ct * kCacheline);
+      main_loadbufs[1] = (uintptr_t*)bigstack_alloc_raw(load_vblock_cacheline_ct * calc_thread_ct * kCacheline);
+      if (read_hphase_present || read_dosage_present) {
+	g_loaded_vrtypes[0] = bigstack_alloc_raw(kPglVblockSize * calc_thread_ct);
+	g_loaded_vrtypes[1] = bigstack_alloc_raw(kPglVblockSize * calc_thread_ct);
+	const uint32_t bitvec_writebuf_byte_ct = BITCT_TO_CLCT(sample_ct) * kCacheline;
+	const uintptr_t dosagevals_writebuf_byte_ct = DIV_UP(sample_ct, (kCacheline / 2)) * kCacheline;
+	for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	  if (read_hphase_present) {
+	    g_thread_write_phasepresents[tidx] = (uintptr_t*)bigstack_alloc_raw(bitvec_writebuf_byte_ct);
+	    g_thread_write_phaseinfos[tidx] = (uintptr_t*)bigstack_alloc_raw(bitvec_writebuf_byte_ct);
+
+	    g_thread_all_hets[tidx] = (uintptr_t*)bigstack_alloc_raw(BITCT_TO_CLCT(raw_sample_ct) * kCacheline);
+	  }
+	  if (read_dosage_present) {
+	    g_thread_write_dosagepresents[tidx] = (uintptr_t*)bigstack_alloc_raw(bitvec_writebuf_byte_ct);
+	    g_thread_write_dosagevals[tidx] = (dosage_t*)bigstack_alloc_raw(dosagevals_writebuf_byte_ct);
+	    if (new_sample_idx_to_old) {
+	      g_thread_cumulative_popcount_bufs[tidx] = (uint32_t*)bigstack_alloc_raw(INT32CT_TO_CLCT(raw_sample_ctl) * kCacheline);
+	    }
+	  }
+	}
+      } else {
+	g_loaded_vrtypes[0] = nullptr;
+	g_loaded_vrtypes[1] = nullptr;
+      }
+      if (new_sample_idx_to_old || subsetting_required) {
+	uintptr_t writebuf_byte_ct = input_biallelic? QUATERCT_TO_BYTECT(sample_ct) : (2 * sample_ct * sizeof(alt_allele_ct_t));
+	writebuf_byte_ct = round_up_pow2(writebuf_byte_ct, kCacheline);
+	for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	  g_thread_write_genovecs[tidx] = (uintptr_t*)bigstack_alloc_raw(writebuf_byte_ct);
+	}
+      }
+      strcpy(outname_end, ".pgen");
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fputs("0%", stdout);
+      fflush(stdout);
+      unsigned char* mpgw_alloc = bigstack_alloc_raw((alloc_base_cacheline_ct + mpgw_per_thread_cacheline_ct * calc_thread_ct) * kCacheline);
+      assert(g_bigstack_base <= g_bigstack_end);
+      reterr = mpgw_init_phase2(outname, write_allele_idx_offsets, pgfip->nonref_flags, variant_ct, sample_ct, write_phase_dosage_gflags, nonref_flags_storage, vrec_len_byte_ct, vblock_cacheline_ct, calc_thread_ct, mpgw_alloc, mpgwp);
+      if (reterr) {
+	goto make_plink2_no_vsort_ret_1;
+      }
+      g_sample_include = subsetting_required? sample_include : nullptr;
+      g_raw_sample_ct = raw_sample_ct;
+      g_sample_ct = sample_ct;
+      ts.calc_thread_ct = calc_thread_ct;
+      // g_calc_thread_ct = calc_thread_ct;
+      g_spgwp = nullptr;
+      // g_error_ret = kPglRetSuccess;
+
+      // Main workflow:
+      // 1. Set n=0, load first calc_thread_ct * kPglVblockSize
+      //    *post-filtering* variants.
+      //    This doesn't play well with blockload when any variants are
+      //    filtered out, so we don't use it.  (todo: look into special-casing
+      //    variant_ct == raw_variant_ct.)
+      //
+      // 2. Spawn threads processing batch n
+      // 3. Load batch (n+1) unless eof
+      // 4. Join threads
+      // 5. Flush results for batch n (must happen here since we aren't using
+      //    two output buffers.  this may be a mistake, revisit this choice...)
+      // 6. Increment n by 1
+      // 7. Goto step 2 unless eof
+      const uint32_t batch_ct_m1 = (variant_ct - 1) / (kPglVblockSize * calc_thread_ct);
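+      // e.g. (illustrative only) variant_ct = 1000000 with kPglVblockSize =
+      // 65536 and calc_thread_ct = 4 yields batches of 262144 variants, so
+      // batch_ct_m1 = 999999 / 262144 = 3, i.e. 4 batches in total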
+      uint32_t pct = 0;
+      uint32_t parity = 0;
+      uint32_t read_batch_idx = 0;
+      uint32_t write_idx_end = 0;
+      uint32_t cur_batch_size = kPglVblockSize * calc_thread_ct;
+      uint32_t next_print_variant_idx = variant_ct / 100;
+      uint32_t read_variant_uidx = next_set_unsafe(variant_include, 0);
+      pgr_clear_ld_cache(simple_pgrp);
+      while (1) {
+	if (read_batch_idx) {
+	  g_cur_block_write_ct = cur_batch_size;
+	  ts.is_last_block = (write_idx_end == variant_ct);
+	  ts.thread_func_ptr = make_pgen_thread;
+	  if (spawn_threads3z(read_batch_idx - 1, &ts)) {
+	    goto make_plink2_no_vsort_ret_THREAD_CREATE_FAIL;
+	  }
+	}
+	if (!ts.is_last_block) {
+	  if (read_batch_idx == batch_ct_m1) {
+	    cur_batch_size = variant_ct - (read_batch_idx * kPglVblockSize * calc_thread_ct);
+	  }
+	  uintptr_t* cur_loadbuf = main_loadbufs[parity];
+	  uintptr_t* loadbuf_iter = cur_loadbuf;
+	  unsigned char* cur_loaded_vrtypes = g_loaded_vrtypes[parity];
+	  for (uint32_t uii = 0; uii < cur_batch_size; ++uii, ++read_variant_uidx) {
+	    if (!(uii % kPglVblockSize)) {
+	      g_loadbuf_thread_starts[parity][uii / kPglVblockSize] = loadbuf_iter;
+	    }
+	    next_set_unsafe_ck(variant_include, &read_variant_uidx);
+	    reterr = pgr_read_raw(read_variant_uidx, read_phase_dosage_gflags, simple_pgrp, &loadbuf_iter, cur_loaded_vrtypes? (&(cur_loaded_vrtypes[uii])) : nullptr);
+	    if (reterr) {
+	      if (reterr == kPglRetMalformedInput) {
+		logprint("\n");
+		logerrprint("Error: Malformed .pgen file.\n");
+	      }
+	      goto make_plink2_no_vsort_ret_1;
+	    }
+	  }
+	}
+	if (read_batch_idx) {
+	  join_threads3z(&ts);
+	  reterr = g_error_ret;
+	  if (reterr) {
+	    goto make_plink2_no_vsort_ret_WRITE_FAIL;
+	  }
+	}
+	parity = 1 - parity;
+	if (write_idx_end) {
+	  reterr = mpgw_flush(mpgwp);
+	  if (reterr) {
+	    goto make_plink2_no_vsort_ret_WRITE_FAIL;
+	  }
+	  if (write_idx_end == variant_ct) {
+	    mpgwp = nullptr;
+	    break;
+	  }
+	  if (write_idx_end >= next_print_variant_idx) {
+	    if (pct > 10) {
+	      putc_unlocked('\b', stdout);
+	    }
+	    pct = (write_idx_end * 100LLU) / variant_ct;
+	    printf("\b\b%u%%", pct++);
+	    fflush(stdout);
+	    next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+	  }
+	}
+	++read_batch_idx;
+	write_idx_end += cur_batch_size;
+      }
+      if (pct > 10) {
+	putc_unlocked('\b', stdout);
+      }
+      fputs("\b\b", stdout);
+      LOGPRINTF("done.\n");
+      bigstack_reset(bigstack_mark);
+    } else if (0) {
+    make_plink2_no_vsort_fallback:
+      mpgwp = nullptr;
+      bigstack_reset(bigstack_mark2);
+      reterr = make_pgen_robust(sample_include, new_sample_idx_to_old, variant_include, variant_allele_idxs, refalt1_select, nullptr, raw_sample_ct, sample_ct, raw_variant_ct, variant_ct, hard_call_thresh, dosage_erase_thresh, make_plink2_modifier, simple_pgrp, outname, outname_end);
+      if (reterr) {
+	goto make_plink2_no_vsort_ret_1;
+      }
+    }
+    const uint32_t trim_alts = (uint32_t)(make_plink2_modifier & kfMakePlink2TrimAlts);
+    // don't bother with trim-alts/set-hh-missing interaction for now
+    if (make_plink2_modifier & kfMakeBim) {
+      char* bimname_end = strcpya0(outname_end, ".bim");
+      const uint32_t bim_zst = (make_plink2_modifier / kfMakeBimZs) & 1;
+      if (bim_zst) {
+	strcpy(bimname_end, ".zst");
+      }
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fflush(stdout);
+      reterr = write_map_or_bim(outname, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, trim_alts? allele_dosages : nullptr, refalt1_select, variant_cms, variant_ct, max_allele_slen, '\t', bim_zst);
+      if (reterr) {
+	goto make_plink2_no_vsort_ret_1;
+      }
+      logprint("done.\n");
+    }
+    if (make_plink2_modifier & kfMakePvar) {
+      char* pvarname_end = strcpya0(outname_end, ".pvar");
+      if (pvar_psam_modifier & kfPvarZs) {
+	strcpy(pvarname_end, ".zst");
+      }
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fflush(stdout);
+      uint32_t nonref_flags_storage = 3;
+      if (!pgfip->nonref_flags) {
+	nonref_flags_storage = (pgfip->gflags & kfPgenGlobalAllNonref)? 2 : 1;
+      }
+      reterr = write_pvar(outname, xheader, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, trim_alts? allele_dosages : nullptr, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, pgfip->nonref_flags, pvar_info_reload, variant_cms, raw_variant_ct, variant_ct, max_allele_slen, xheader_blen, xheader_info_pr, nonref_flags_storage, max_filter_slen, info_reload_slen, pvar_psam_modifier);
+      if (reterr) {
+	goto make_plink2_no_vsort_ret_1;
+      }
+      logprint("done.\n");
+    }
+    if (make_plink2_modifier & kfMakeFam) {
+      strcpy(outname_end, ".fam");
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fflush(stdout);
+      reterr = write_fam(outname, sample_include, sample_ids, paternal_ids, maternal_ids, sex_nm, sex_male, pheno_cols, new_sample_idx_to_old, sample_ct, max_sample_id_blen, max_paternal_id_blen, max_maternal_id_blen, pheno_ct, '\t');
+      if (reterr) {
+	goto make_plink2_no_vsort_ret_1;
+      }
+      logprint("done.\n");
+    }
+    if (make_plink2_modifier & kfMakePsam) {
+      strcpy(outname_end, ".psam");
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fflush(stdout);
+      reterr = write_psam(outname, sample_include, sample_ids, sids, paternal_ids, maternal_ids, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, sample_ct, max_sample_id_blen, max_sid_blen, max_paternal_id_blen, max_maternal_id_blen, pheno_ct, max_pheno_name_blen, pvar_psam_modifier);
+      if (reterr) {
+	goto make_plink2_no_vsort_ret_1;
+      }
+      logprint("done.\n");
+    }
+  }
+  while (0) {
+  make_plink2_no_vsort_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  make_plink2_no_vsort_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  make_plink2_no_vsort_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  make_plink2_no_vsort_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  make_plink2_no_vsort_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ make_plink2_no_vsort_ret_1:
+  if (mpgw_cleanup(mpgwp) && (!reterr)) {
+    reterr = kPglRetWriteFail;
+  }
+  threads3z_cleanup(&ts, &g_cur_block_write_ct);
+  fclose_cond(outfile);
+  pgfip->block_base = nullptr;
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+/*
+pglerr_t write_pvar_resorted(const char* outname, const char* xheader, const uintptr_t* variant_include, const chr_info_t* write_cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const uint64_t* allele_dosages, const alt_allele_ct_t* refalt1_select, const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, char** filter_storage, const uintptr_t* nonref_flags, char** p [...]
+  // allele_dosages must be nullptr unless we're trimming alt alleles
+
+  // The annoying part of this is handling a sequence of INFO strings that
+  // don't fit in memory; use a multipass approach for that.
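+  // (Sketch of one possible multipass scheme, not implemented here: compute
+  // how many INFO strings fit in the remaining bigstack, then repeatedly
+  // reload that many variants' INFO text and emit the corresponding .pvar
+  // lines until all variant_ct lines have been written.)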
+  unsigned char* bigstack_mark = g_bigstack_base;
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  pglerr_t reterr = kPglRetSuccess;
+  cswrite_init_null(&css);
+  {
+    const uint32_t max_chr_blen = get_max_chr_slen(write_cip) + 1;
+    // includes trailing tab
+    char* chr_buf;
+
+    uintptr_t overflow_buf_size = kCompressStreamBlock + kMaxIdSlen + 512 + 2 * max_allele_slen + max_filter_slen + info_reload_slen;
+    if (overflow_buf_size < 2 * kCompressStreamBlock) {
+      overflow_buf_size = 2 * kCompressStreamBlock;
+    }
+    unsigned char* overflow_buf;
+    uintptr_t* allele_include;
+    if (bigstack_alloc_c(max_chr_blen, &chr_buf) ||
+	bigstack_alloc_uc(overflow_buf_size, &overflow_buf) ||
+	bigstack_alloc_ul(BITCT_TO_WORDCT(kPglMaxAltAlleleCt), &allele_include)) {
+      goto write_pvar_resorted_ret_NOMEM;
+    }
+    const uint32_t output_zst = (pvar_psam_modifier / kfPvarZs) & 1;
+    if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+      goto write_pvar_resorted_ret_OPEN_FAIL;
+    }
+    cswritep = (char*)overflow_buf;
+    const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    const uint32_t all_nonref = (nonref_flags_storage == 2);
+    uint32_t write_info_pr = all_nonref;
+    uint32_t write_info = (pvar_psam_modifier & kfPvarColInfo) || pvar_info_reload;
+    if (write_info && nonref_flags) {
+      for (uint32_t widx = 0; widx < raw_variant_ctl; ++widx) {
+	if (variant_include[widx] & nonref_flags[widx]) {
+	  write_info_pr = 1;
+	  break;
+	}
+      }
+    }
+    write_info_pr = write_info_pr && write_info;
+
+    char* loadbuf = nullptr;
+    uintptr_t loadbuf_size = 0;
+    uint32_t info_col_idx = 0; // could save this during first load instead
+    if (pvar_psam_modifier & kfPvarColXheader) {
+      if (csputs_std(xheader, xheader_blen, &css, &cswritep)) {
+	goto write_pvar_resorted_ret_WRITE_FAIL;
+      }
+      if (write_info_pr && (!xheader_info_pr)) {
+	cswritep = strcpya(cswritep, "##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\">" EOLN_STR);
+      }
+    }
+    if (write_cip->chrset_source) {
+      append_chrset_line(write_cip, &cswritep);
+    }
+    cswritep = strcpya(cswritep, "#CHROM\tPOS\tID\tREF\tALT");
+
+    uint32_t write_qual = 0;
+    if (pvar_psam_modifier & kfPvarColQual) {
+      write_qual = 1;
+    } else if ((pvar_psam_modifier & kfPvarColMaybequal) && qual_present) {
+      for (uint32_t widx = 0; widx < raw_variant_ctl; ++widx) {
+	if (variant_include[widx] & qual_present[widx]) {
+	  write_qual = 1;
+	  break;
+	}
+      }
+    }
+    if (write_qual) {
+      cswritep = strcpya(cswritep, "\tQUAL");
+    }
+    
+    uint32_t write_filter = 0;
+    if (pvar_psam_modifier & kfPvarColFilter) {
+      write_filter = 1;
+    } else if ((pvar_psam_modifier & kfPvarColMaybefilter) && filter_present) {
+      for (uint32_t widx = 0; widx < raw_variant_ctl; ++widx) {
+	if (variant_include[widx] & filter_present[widx]) {
+	  write_filter = 1;
+	  break;
+	}
+      }
+    }
+    if (write_filter) {
+      cswritep = strcpya(cswritep, "\tFILTER");
+    }
+
+    if (write_info) {
+      cswritep = strcpya(cswritep, "\tINFO");
+    }
+    
+    uint32_t write_cm = 0;
+    if (pvar_psam_modifier & kfPvarColCm) {
+      write_cm = 1;
+    } else if ((pvar_psam_modifier & kfPvarColMaybecm) && variant_cms) {
+      if (raw_variant_ct == variant_ct) {
+	// nonzero_cm_present check was performed
+	write_cm = 1;
+      } else {
+	uint32_t variant_uidx = 0;
+	for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+	  next_set_unsafe_ck(variant_include, &variant_uidx);
+	  if (variant_cms[variant_uidx] != 0.0) {
+	    write_cm = 1;
+	    break;
+	  }
+	}
+      }
+    }
+    if (write_cm) {
+      cswritep = memcpyl3a(cswritep, "\tCM");
+    }
+    append_binary_eoln(&cswritep);
+
+    const char output_missing_geno_char = *g_output_missing_geno_ptr;
+    const uint32_t* new_variant_idx_to_old_iter = new_variant_idx_to_old;
+    uint32_t chr_fo_idx = 0xffffffffU;
+    uint32_t chr_end = 0;
+    uint32_t chr_buf_blen = 0;
+    uint32_t ref_allele_idx = 0;
+    uint32_t alt1_allele_idx = 1;
+    uint32_t cur_allele_ct = 2;
+    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx) {
+      const uint32_t variant_uidx = *new_variant_idx_to_old_iter++;
+      if (variant_idx >= chr_end) {
+	do {
+	  ++chr_fo_idx;
+	  chr_end = write_cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	} while (variant_idx >= chr_end);
+	char* chr_name_end = chr_name_write(write_cip, write_cip->chr_file_order[chr_fo_idx], chr_buf);
+	*chr_name_end = '\t';
+	chr_buf_blen = 1 + (uintptr_t)(chr_name_end - chr_buf);
+      }
+      cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
+      cswritep = uint32toa_x(variant_bps[variant_uidx], '\t', cswritep);
+      cswritep = strcpyax(cswritep, variant_ids[variant_uidx], '\t');
+      uintptr_t variant_allele_idx_base;
+      if (!variant_allele_idxs) {
+	variant_allele_idx_base = variant_uidx * 2;
+      } else {
+	variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+      }
+      char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+      if (refalt1_select) {
+	ref_allele_idx = refalt1_select[variant_uidx * 2];
+	alt1_allele_idx = refalt1_select[variant_uidx * 2 + 1];
+      }
+      cswritep = strcpyax(cswritep, cur_alleles[ref_allele_idx], '\t');
+      if ((!allele_dosages) || allele_dosages[variant_allele_idx_base + alt1_allele_idx]) {
+        cswritep = strcpya(cswritep, cur_alleles[alt1_allele_idx]);
+      } else {
+	*cswritep++ = output_missing_geno_char;
+      }
+      if (cswrite(&css, &cswritep)) {
+	goto write_pvar_resorted_ret_WRITE_FAIL;
+      }
+      if (cur_allele_ct > 2) {
+	fill_all_bits(cur_allele_ct, allele_include);
+	CLEAR_BIT(ref_allele_idx, allele_include);
+	CLEAR_BIT(alt1_allele_idx, allele_include);
+        uint32_t cur_allele_uidx = 0;
+	uint32_t alt_allele_idx = 2;
+	do {
+	  *cswritep++ = ',';
+	  next_set_unsafe_ck(allele_include, &cur_allele_uidx);
+	  cswritep = strcpya(cswritep, cur_alleles[cur_allele_uidx++]);
+	  if (cswrite(&css, &cswritep)) {
+	    goto write_pvar_resorted_ret_WRITE_FAIL;
+	  }
+	} while (++alt_allele_idx < cur_allele_ct);
+      }
+
+      if (write_qual) {
+	*cswritep++ = '\t';
+	if (!IS_SET(qual_present, variant_uidx)) {
+	  *cswritep++ = '.';
+	} else {
+	  cswritep = ftoa_g(quals[variant_uidx], cswritep);
+	}
+      }
+
+      if (write_filter) {
+	*cswritep++ = '\t';
+	if (!IS_SET(filter_present, variant_uidx)) {
+	  *cswritep++ = '.';
+	} else if (!IS_SET(filter_npass, variant_uidx)) {
+	  cswritep = strcpya(cswritep, "PASS");
+	} else {
+	  cswritep = strcpya(cswritep, filter_storage[variant_uidx]);
+	}
+      }
+
+      if (write_info) {
+	*cswritep++ = '\t';
+	const uint32_t is_pr = all_nonref || (nonref_flags && IS_SET(nonref_flags, variant_uidx));
+	if (pvar_info_strs && pvar_info_strs[variant_uidx]) {
+	  pvar_info_write(pvar_info_strs[variant_uidx], xheader_info_pr, is_pr, cswritep);
+	} else {
+	  if (is_pr) {
+	    cswritep = strcpya(cswritep, "PR");
+	  } else {
+	    *cswritep++ = '.';
+	  }
+	}
+      }
+      
+      if (write_cm) {
+        *cswritep++ = '\t';
+	if (!variant_cms) {
+	  *cswritep++ = '0';
+	} else {
+	  cswritep = dtoa_g_p8(variant_cms[variant_uidx], cswritep);
+	}
+      }
+      append_binary_eoln(&cswritep);
+    }
+    if (cswrite_close_null(&css, cswritep)) {
+      goto write_pvar_resorted_ret_WRITE_FAIL;
+    }
+  }
+  while (0) {
+  write_pvar_resorted_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  write_pvar_resorted_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  write_pvar_resorted_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  }
+ write_pvar_resorted_ret_1:
+  cswrite_close_cond(&css, cswritep);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+*/
+
+/*
+pglerr_t make_plink2_vsort(const char* xheader, const uintptr_t* sample_include, const char* sample_ids, const char* sids, const char* paternal_ids, const char* maternal_ids, const uintptr_t* sex_nm, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, con [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if (make_plink2_modifier & kfMakeBim) {
+      char* bimname_end = strcpya0(outname_end, ".bim");
+      const uint32_t bim_zst = (make_plink2_modifier / kfMakeBimZs) & 1;
+      if (bim_zst) {
+	strcpy(bimname_end, ".zst");
+      }
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fflush(stdout);
+      // reterr = write_map_or_bim(outname, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, trim_alts? allele_dosages : nullptr, refalt1_select, variant_cms, variant_ct, max_allele_slen, '\t', bim_zst);
+      if (reterr) {
+	goto make_plink2_vsort_ret_1;
+      }
+      logprint("done.\n");
+    }
+    if (make_plink2_modifier & kfMakePvar) {
+      char* pvarname_end = strcpya0(outname_end, ".pvar");
+      if (pvar_psam_modifier & kfPvarZs) {
+	strcpy(pvarname_end, ".zst");
+      }
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fflush(stdout);
+      uint32_t nonref_flags_storage = 3;
+      if (!pgfip->nonref_flags) {
+	nonref_flags_storage = (pgfip->gflags & kfPgenGlobalAllNonref)? 2 : 1;
+      }
+      reterr = write_pvar_resorted(outname, xheader, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, trim_alts? allele_dosages : nullptr, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, pgfip->nonref_flags, pvar_info_reload, variant_cms, raw_variant_ct, variant_ct, max_allele_slen, xheader_blen, xheader_info_pr, nonref_flags_storage, max_filter_slen, info_reload_slen, pvar_psam_modifier);
+      if (reterr) {
+	goto make_plink2_vsort_ret_1;
+      }
+      logprint("done.\n");
+    }
+    if (make_plink2_modifier & kfMakeFam) {
+      strcpy(outname_end, ".fam");
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fflush(stdout);
+      reterr = write_fam(outname, sample_include, sample_ids, paternal_ids, maternal_ids, sex_nm, sex_male, pheno_cols, new_sample_idx_to_old, sample_ct, max_sample_id_blen, max_paternal_id_blen, max_maternal_id_blen, pheno_ct, '\t');
+      if (reterr) {
+	goto make_plink2_vsort_ret_1;
+      }
+      logprint("done.\n");
+    }
+    if (make_plink2_modifier & kfMakePsam) {
+      strcpy(outname_end, ".psam");
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fflush(stdout);
+      reterr = write_psam(outname, sample_include, sample_ids, sids, paternal_ids, maternal_ids, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, sample_ct, max_sample_id_blen, max_sid_blen, max_paternal_id_blen, max_maternal_id_blen, pheno_ct, max_pheno_name_blen, pvar_psam_modifier);
+      if (reterr) {
+	goto make_plink2_vsort_ret_1;
+      }
+      logprint("done.\n");
+    }
+  }
+  while (0) {
+  }
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+*/
+
+pglerr_t sample_sort_file_map(const uintptr_t* sample_include, const char* sample_ids, const char* sids, const char* sample_sort_fname, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t sid_col_present, uint32_t** new_sample_idx_to_old_ptr) {
+  // assumes sample_ct >= 2 (enforced by caller)
+  // return strbox is not collapsed
+  unsigned char* bigstack_mark = g_bigstack_base;
+  gzFile gz_infile = nullptr;
+  uintptr_t line_idx = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    char* idbuf;
+    uintptr_t* already_seen;
+    if (bigstack_alloc_ui(raw_sample_ct, new_sample_idx_to_old_ptr) ||
+	bigstack_alloc_c(max_sample_id_blen, &idbuf) ||
+	bigstack_calloc_ul(BITCT_TO_WORDCT(raw_sample_ct), &already_seen)) {
+      goto sample_sort_file_map_ret_NOMEM;
+    }
+    
+    uintptr_t loadbuf_size = bigstack_left();
+    loadbuf_size -= loadbuf_size / 4;
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else if (loadbuf_size <= kMaxMediumLine) {
+      goto sample_sort_file_map_ret_NOMEM;
+    } else {
+      loadbuf_size = round_up_pow2(loadbuf_size, kCacheline);
+    }
+    char* loadbuf = (char*)bigstack_alloc_raw(loadbuf_size);
+    char* loadbuf_first_token;
+    xid_mode_t xid_mode;
+    reterr = open_and_load_xid_header(sample_sort_fname, "indiv-sort", sid_col_present? kSidDetectModeForce : (sids? kSidDetectModeLoaded : kSidDetectModeNotLoaded), loadbuf_size, loadbuf, nullptr, &line_idx, &loadbuf_first_token, &gz_infile, &xid_mode);
+    if (reterr) {
+      if (reterr == kPglRetEmptyFile) {
+	logerrprint("Error: --indiv-sort file is empty.\n");
+	goto sample_sort_file_map_ret_MALFORMED_INPUT;
+      }
+      if (reterr == kPglRetLongLine) {
+	if (loadbuf_size == kMaxLongLine) {
+	  goto sample_sort_file_map_ret_LONG_LINE;
+	}
+	goto sample_sort_file_map_ret_NOMEM;
+      }
+      goto sample_sort_file_map_ret_1;
+    }
+    uint32_t* xid_map;
+    char* sorted_xidbox;
+    uintptr_t max_xid_blen;
+    reterr = sorted_xidbox_init_alloc(sample_include, sample_ids, sids, sample_ct, max_sample_id_blen, max_sid_blen, xid_mode, 0, &sorted_xidbox, &xid_map, &max_xid_blen);
+    if (reterr) {
+      goto sample_sort_file_map_ret_1;
+    }
+    uint32_t* new_sample_idx_to_old_iter = *new_sample_idx_to_old_ptr;
+    while (1) {
+      if (!is_eoln_kns(*loadbuf_first_token)) {
+	char* loadbuf_iter = loadbuf_first_token;
+	uint32_t sample_uidx;
+	if (!sorted_xidbox_read_find(sorted_xidbox, xid_map, max_xid_blen, sample_ct, 0, xid_mode, &loadbuf_iter, &sample_uidx, idbuf)) {
+	  if (IS_SET(already_seen, sample_uidx)) {
+	    char* tptr = (char*)rawmemchr(idbuf, '\t');
+	    *tptr = ' ';
+	    if (xid_mode & kfXidModeFlagSid) {
+	      *((char*)rawmemchr(&(tptr[1]), '\t')) = ' ';
+	    }
+	    sprintf(g_logbuf, "Error: Duplicate sample ID '%s' in --indiv-sort file.\n", idbuf);
+	    goto sample_sort_file_map_ret_MALFORMED_INPUT_WW;
+	  }
+	  SET_BIT(sample_uidx, already_seen);
+	  *new_sample_idx_to_old_iter++ = sample_uidx;
+	} else if (!loadbuf_iter) {
+	  goto sample_sort_file_map_ret_MISSING_TOKENS;
+	}
+      }
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto sample_sort_file_map_ret_READ_FAIL;
+	}
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	if (loadbuf_size == kMaxLongLine) {
+	  goto sample_sort_file_map_ret_LONG_LINE;
+	}
+	goto sample_sort_file_map_ret_NOMEM;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      if (loadbuf_first_token[0] == '#') {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --indiv-sort file starts with a '#'. (This is only permitted before the first nonheader line, and if a #FID/IID header line is present it must denote the end of the header block.)\n", line_idx);
+	goto sample_sort_file_map_ret_MALFORMED_INPUT_WW;
+      }
+    }
+    
+    if (gzclose_null(&gz_infile)) {
+      goto sample_sort_file_map_ret_READ_FAIL;
+    }
+    if ((uintptr_t)(new_sample_idx_to_old_iter - (*new_sample_idx_to_old_ptr)) != sample_ct) {
+      logerrprint("Error: --indiv-sort file does not contain all loaded sample IDs.\n");
+      goto sample_sort_file_map_ret_INCONSISTENT_INPUT;
+    }
+    bigstack_mark = (unsigned char*)idbuf;
+  }
+  while (0) {
+  sample_sort_file_map_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  sample_sort_file_map_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  sample_sort_file_map_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  sample_sort_file_map_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  sample_sort_file_map_ret_LONG_LINE:
+    LOGERRPRINTF("Error: Line %" PRIuPTR " of --indiv-sort file is pathologically long.\n", line_idx);
+    reterr = kPglRetMalformedInput;
+    break;
+  sample_sort_file_map_ret_MISSING_TOKENS:
+    LOGERRPRINTF("Error: Line %" PRIuPTR " of --indiv-sort file has fewer tokens than expected.\n", line_idx);
+  sample_sort_file_map_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ sample_sort_file_map_ret_1:
+  gzclose_cond(gz_infile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+
+// assumes rawval is in [0, 163839]
+static_assert(kDosageMid == 16384, "print_small_dosage() needs to be updated.");
+char* print_small_dosage(uint32_t rawval, char* start) {
+  // Instead of constant 5-digit precision, we print fewer digits whenever that
+  // doesn't interfere with proper round-tripping.  I.e. we search for the
+  // shortest string in
+  //   ((n - 0.5)/16384, (n + 0.5)/16384). 
+  // E.g. 3277/16384 is 0.20001 when printed with 5-digit precision, but we'd
+  // print that as 0.2 since that's still in (3276.5/16384, 3277.5/16384).
+  *start++ = '0' + (rawval / 16384);
+  rawval = rawval % 16384;
+  if (!rawval) {
+    return start;
+  }
+  *start++ = '.';
+  // (rawval * 2) is in 32768ths
+  // 32768 * 625 = 20480k
+
+  const uint32_t range_top_20480k = (rawval * 2 + 1) * 625;
+  // this is technically checking a half-open rather than a fully-open
+  // interval, but that's fine since we never hit the boundary points
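+  // e.g. rawval = 3277: range_top_20480k = 6555 * 625 = 4096875, and
+  // 4096875 % 2048 = 875 < 1250, so we print 4096875 / 2048 = 2000 via
+  // uitoa_trunc4, which drops trailing zeros -> "2", i.e. "0.2" overall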
+  if ((range_top_20480k % 2048) < 1250) {
+    // when this is true, the four-decimal-place approximation is in the range
+    // which round-trips back to our original number.
+    const uint32_t four_decimal_places = range_top_20480k / 2048;
+    return uitoa_trunc4(four_decimal_places, start);
+  }
+  
+  // we wish to print (100000 * remainder + 8192) / 16384, left-0-padded.  and
+  // may as well banker's round too.
+  //
+  // banker's rounding yields a different result than regular rounding for n/64
+  // when n is congruent to 1 mod 4:
+  //   1/64 = .015625 -> print 0.01562
+  //   3/64 = .046875 -> print 0.04688
+  //   5/64 = .078125 -> print 0.07812
+  const uint32_t five_decimal_places = ((3125 * rawval + 256) / 512) - ((rawval % 1024) == 256);
+  const uint32_t first_decimal_place = five_decimal_places / 10000;
+  *start++ = '0' + first_decimal_place;
+  const uint32_t last_four_digits = five_decimal_places - first_decimal_place * 10000;
+  if (last_four_digits) {
+    return uitoa_trunc4(last_four_digits, start);
+  }
+  return start;
+}
+
+#ifdef __arm__
+  #error "Unaligned accesses in export_012_vmaj()."
+#endif
+pglerr_t export_012_vmaj(const char* outname, const uintptr_t* sample_include, const uint32_t* sample_include_cumulative_popcounts, const char* sample_ids, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const alt_allele_ct_t* refalt1_select, const double* variant_cms, uint32_t sample_ct, uintptr_t max_sample_id_blen, uint32_t variant_ct, uint32_t max_allele_slen, pgen_r [...]
+  // todo: --recode-allele?
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+    const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+    const uint32_t max_chr_blen = 1 + get_max_chr_slen(cip);
+    char* chr_buf; // includes trailing tab
+    char* writebuf;
+    uintptr_t* genovec;
+    // dosages are limited to 7 characters (x.yyyyy)
+    if (bigstack_alloc_c(max_chr_blen, &chr_buf) ||
+	bigstack_alloc_c(kMaxMediumLine + max_chr_blen + 2 * kMaxIdSlen + 48 + 2 * max_allele_slen + (8 * k1LU) * sample_ct, &writebuf) ||
+        bigstack_alloc_ul(sample_ctl2, &genovec)) {
+      goto export_012_vmaj_ret_NOMEM;
+    }
+    char* writebuf_flush = &(writebuf[kMaxMediumLine]);
+    const uint32_t dosage_is_present = simple_pgrp->fi.gflags & kfPgenGlobalDosagePresent;
+    uintptr_t* dosage_present = nullptr;
+    dosage_t* dosage_vals = nullptr;
+    if (dosage_is_present) {
+      if (bigstack_alloc_ul(sample_ctl, &dosage_present) ||
+	  bigstack_alloc_dosage(sample_ct, &dosage_vals)) {
+	goto export_012_vmaj_ret_NOMEM;
+      }
+    }
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto export_012_vmaj_ret_OPEN_FAIL;
+    }
+    char* write_iter = strcpya(writebuf, "CHR\tSNP\t(C)M\tPOS\tCOUNTED\tALT");
+    uint32_t sample_uidx = 0;
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_unsafe_ck(sample_include, &sample_uidx);
+      *write_iter++ = '\t';
+      const char* fid_start = &(sample_ids[sample_uidx * max_sample_id_blen]);
+      const char* fid_end = (const char*)rawmemchr(fid_start, '\t');
+      write_iter = memcpyax(write_iter, fid_start, fid_end - fid_start, '_');
+      write_iter = strcpya(write_iter, &(fid_end[1]));
+      if (write_iter >= writebuf_flush) {
+	if (fwrite_checked(writebuf, write_iter - writebuf, outfile)) {
+	  goto export_012_vmaj_ret_WRITE_FAIL;
+	}
+	write_iter = writebuf;
+      }
+    }
+    append_binary_eoln(&write_iter);
+    LOGPRINTFWW5("--export A-transpose to %s ... ", outname);
+    fputs("0%", stdout);
+    fflush(stdout);
+    uint32_t chr_fo_idx = 0xffffffffU;
+    uint32_t chr_end = 0;
+    uint32_t chr_blen = 0;
+    uint32_t ref_allele_idx = 0;
+    uint32_t cur_allele_ct = 2;
+    const uint32_t sample_ctl2_m1 = sample_ctl2 - 1;
+
+    uint32_t pct = 0;
+    uint32_t next_print_variant_idx = variant_ct / 100;
+    uint32_t variant_uidx = 0;
+    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      if (variant_uidx >= chr_end) {
+	do {
+	  ++chr_fo_idx;
+	  chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	} while (variant_uidx >= chr_end);
+	const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	char* chr_name_end = chr_name_write(cip, chr_idx, chr_buf);
+	*chr_name_end++ = '\t';
+	chr_blen = (uintptr_t)(chr_name_end - chr_buf);
+      }
+      write_iter = memcpya(write_iter, chr_buf, chr_blen);
+      write_iter = strcpyax(write_iter, variant_ids[variant_uidx], '\t');
+      if (variant_cms) {
+	write_iter = dtoa_g(variant_cms[variant_uidx], write_iter);
+      } else {
+	*write_iter++ = '0';
+      }
+      *write_iter++ = '\t';
+      write_iter = uint32toa_x(variant_bps[variant_uidx], '\t', write_iter);
+      // todo: multiallelic case
+      uint32_t dosage_ct;
+      uint32_t is_explicit_alt1;
+      reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, variant_uidx, simple_pgrp, genovec, dosage_present, dosage_vals, &dosage_ct, &is_explicit_alt1);
+      if (reterr) {
+	if (reterr != kPglRetReadFail) {
+	  logprint("\n");
+	  logerrprint("Error: Malformed .pgen file.\n");
+	}
+	goto export_012_vmaj_ret_1;
+      }
+      if (refalt1_select) {
+	ref_allele_idx = refalt1_select[2 * variant_uidx];
+      }
+      if (!ref_allele_idx) {
+	// we *usually* invert, since COUNTED = REF.
+	genovec_invert_unsafe(sample_ct, genovec);
+	biallelic_dosage16_invert(dosage_ct, dosage_vals);
+      }
+      uintptr_t variant_allele_idx_base = variant_uidx * 2;
+      if (variant_allele_idxs) {
+	variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+      }
+      char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+      write_iter = strcpyax(write_iter, cur_alleles[ref_allele_idx], '\t');
+      const uint32_t first_alt_idx = (ref_allele_idx == 0);
+      write_iter = strcpya(write_iter, cur_alleles[first_alt_idx]);
+      if (cur_allele_ct > 2) {
+	for (uint32_t allele_idx = first_alt_idx + 1; allele_idx < cur_allele_ct; ++allele_idx) {
+	  if (allele_idx == ref_allele_idx) {
+	    continue;
+	  }
+	  if (write_iter >= writebuf_flush) {
+	    if (fwrite_checked(writebuf, write_iter - writebuf, outfile)) {
+	      goto export_012_vmaj_ret_WRITE_FAIL;
+	    }
+	    write_iter = writebuf;
+	  }
+	  *write_iter++ = ',';
+	  write_iter = strcpya(write_iter, cur_alleles[allele_idx]);
+	}
+      }
+      if (write_iter >= writebuf_flush) {
+	if (fwrite_checked(writebuf, write_iter - writebuf, outfile)) {
+	  goto export_012_vmaj_ret_WRITE_FAIL;
+	}
+	write_iter = writebuf;
+      }
+      uint32_t widx = 0;
+      uint32_t loop_len = kBitsPerWordD2;
+      if (!dosage_ct) {
+	while (1) {
+	  if (widx >= sample_ctl2_m1) {
+	    if (widx > sample_ctl2_m1) {
+	      break;
+	    }
+	    loop_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+	  }
+	  uintptr_t geno_word = genovec[widx];
+	  for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits < loop_len; ++sample_idx_lowbits) {
+	    *write_iter++ = '\t';
+	    uintptr_t cur_geno = geno_word & 3;
+	    if (cur_geno != 3) {
+	      *write_iter++ = '0' + cur_geno;
+	    } else {
+	      write_iter = strcpya(write_iter, "NA");
+	    }
+	    geno_word >>= 2;
+	  }
+	  ++widx;
+	}
+      } else {
+	dosage_t* dosage_vals_iter = dosage_vals;
+	while (1) {
+	  if (widx >= sample_ctl2_m1) {
+	    if (widx > sample_ctl2_m1) {
+	      break;
+	    }
+	    loop_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+	  }
+	  uintptr_t geno_word = genovec[widx];
+	  uint32_t dosage_present_hw = ((halfword_t*)dosage_present)[widx];
+	  for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits < loop_len; ++sample_idx_lowbits) {
+	    *write_iter++ = '\t';
+	    if (dosage_present_hw & 1) {
+	      write_iter = print_small_dosage(*dosage_vals_iter++, write_iter);
+	    } else {
+	      uintptr_t cur_geno = geno_word & 3;
+	      if (cur_geno != 3) {
+		*write_iter++ = '0' + cur_geno;
+	      } else {
+		write_iter = strcpya(write_iter, "NA");
+	      }
+	    }
+	    geno_word >>= 2;
+	    dosage_present_hw >>= 1;
+	  }
+	  ++widx;
+	}
+      }
+      append_binary_eoln(&write_iter);
+      if (variant_idx >= next_print_variant_idx) {
+	if (pct > 10) {
+	  putc_unlocked('\b', stdout);
+	}
+	pct = (variant_idx * 100LLU) / variant_ct;
+	printf("\b\b%u%%", pct++);
+	fflush(stdout);
+	next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+      }
+    }
+    if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+      goto export_012_vmaj_ret_WRITE_FAIL;
+    }
+    if (fclose_null(&outfile)) {
+      goto export_012_vmaj_ret_WRITE_FAIL;
+    }
+    if (pct > 10) {
+      putc_unlocked('\b', stdout);
+    }
+    fputs("\b\b", stdout);
+    LOGPRINTF("done.\n");
+  }
+  while (0) {
+  export_012_vmaj_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  export_012_vmaj_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  export_012_vmaj_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  }
+ export_012_vmaj_ret_1:
+  fclose_cond(outfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
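+
+// Editor's note: the progress display above follows a throttled-percentage
+// idiom used throughout this file.  A minimal standalone sketch of the same
+// logic (hypothetical helper name; plink2's copies are inlined):
+static void progress_update(uint32_t item_idx, uint32_t item_ct, uint32_t* pctp, uint32_t* next_print_idxp) {
+  if (item_idx >= *next_print_idxp) {
+    if (*pctp > 10) {
+      // two-digit percentages need one extra erase character
+      putc_unlocked('\b', stdout);
+    }
+    *pctp = (uint32_t)((item_idx * 100LLU) / item_ct);
+    printf("\b\b%u%%", (*pctp)++);
+    fflush(stdout);
+    // next update fires at the first index that rounds to pct+1 percent
+    *next_print_idxp = (uint32_t)(((*pctp) * ((uint64_t)item_ct)) / 100);
+  }
+}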
+
+static uintptr_t* g_vmaj_readbuf = nullptr;
+
+THREAD_FUNC_DECL transpose_to_smaj_read_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  pgen_reader_t* pgrp = g_pgr_ptrs[tidx];
+  const uintptr_t* variant_include = g_variant_include;
+  // const uintptr_t* variant_allele_idxs = g_variant_allele_idxs;
+  const alt_allele_ct_t* refalt1_select = g_refalt1_select;
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  const uintptr_t* sample_include = g_sample_include;
+  const uint32_t* sample_include_cumulative_popcounts = g_sample_include_cumulative_popcounts;
+  const uint32_t read_sample_ct = g_sample_ct;
+  const uintptr_t read_sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(read_sample_ct);
+  uintptr_t prev_copy_ct = 0;
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uintptr_t cur_block_copy_ct = g_cur_block_write_ct;
+    const uint32_t cur_idx_end = ((tidx + 1) * cur_block_copy_ct) / calc_thread_ct;
+    uint32_t variant_uidx = g_read_variant_uidx_starts[tidx];
+    uint32_t cur_idx = (tidx * cur_block_copy_ct) / calc_thread_ct;
+    uintptr_t* vmaj_readbuf_iter = &(g_vmaj_readbuf[(prev_copy_ct + cur_idx) * read_sample_ctaw2]);
+    for (; cur_idx < cur_idx_end; ++cur_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      // todo: multiallelic case
+      const pglerr_t reterr = pgr_read_refalt1_genovec_subset_unsafe(sample_include, sample_include_cumulative_popcounts, read_sample_ct, variant_uidx, pgrp, vmaj_readbuf_iter);
+      if (reterr) {
+	g_error_ret = reterr;
+	break;
+      }
+      if (refalt1_select && (refalt1_select[2 * variant_uidx] == 1)) {
+	genovec_invert_unsafe(read_sample_ct, vmaj_readbuf_iter);
+	// don't need zero_trailing_quaters()
+      }
+      vmaj_readbuf_iter = &(vmaj_readbuf_iter[read_sample_ctaw2]);
+    }
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    prev_copy_ct += cur_block_copy_ct;
+    THREAD_BLOCK_FINISH(tidx);
+  }
+}
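+
+// Editor's note: the read thread above splits work statically across threads
+// with the balanced integer partition [tidx*N/T, (tidx+1)*N/T).  A minimal
+// sketch (hypothetical helper name) of the bounds being computed:
+static void thread_work_bounds(uintptr_t tidx, uintptr_t thread_ct, uintptr_t item_ct, uintptr_t* startp, uintptr_t* endp) {
+  // consecutive threads' ranges tile [0, item_ct) and differ in size by at
+  // most one item, so no thread is meaningfully out of balance
+  *startp = (tidx * item_ct) / thread_ct;
+  *endp = ((tidx + 1) * item_ct) / thread_ct;
+}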
+
+static uintptr_t* g_smaj_writebufs[2] = {nullptr, nullptr};
+static uint32_t g_variant_ct = 0;
+static uint32_t g_sample_batch_size = 0;
+static uint32_t g_output_calc_thread_ct = 0;
+
+THREAD_FUNC_DECL transpose_to_plink1_smaj_write_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uint32_t variant_ct = g_variant_ct;
+  const uintptr_t variant_batch_ct = DIV_UP(variant_ct, kPglQuaterTransposeBatch);
+  const uintptr_t variant_batch_word_ct = variant_batch_ct * kPglQuaterTransposeWords;
+  const uint32_t calc_thread_ct = g_output_calc_thread_ct;
+  const uint32_t variant_batch_idx_start = (((uint64_t)tidx) * variant_batch_ct) / calc_thread_ct;
+  vul_t* vecaligned_buf = g_thread_vecaligned_bufs[tidx];
+  uintptr_t variant_batch_idx_full_end = ((((uint64_t)tidx) + 1) * variant_batch_ct) / calc_thread_ct;
+  uint32_t variant_idx_end;
+  if (tidx + 1 < calc_thread_ct) {
+    variant_idx_end = variant_batch_idx_full_end * kPglQuaterTransposeBatch;
+  } else {
+    variant_idx_end = variant_ct;
+    if (variant_ct % kPglQuaterTransposeBatch) {
+      --variant_batch_idx_full_end;
+    }
+  }
+  const uint32_t thread_variant_ct = variant_idx_end - variant_batch_idx_start * kPglQuaterTransposeBatch;
+  const uint32_t read_sample_ct = g_sample_ct;
+  const uintptr_t read_sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(read_sample_ct);
+  const uintptr_t* vmaj_readbuf = g_vmaj_readbuf;
+  uint32_t sample_widx = 0;
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    uintptr_t variant_batch_idx = variant_batch_idx_start;
+    uint32_t variant_batch_size = kPglQuaterTransposeBatch;
+    const uintptr_t* vmaj_readbuf_iter = &(vmaj_readbuf[variant_batch_idx * kPglQuaterTransposeBatch * read_sample_ctaw2 + sample_widx]);
+    const uint32_t sample_batch_size = g_sample_batch_size;
+    uintptr_t* smaj_writebuf_start = &(g_smaj_writebufs[parity][variant_batch_idx * kPglQuaterTransposeWords]);
+    uintptr_t* smaj_writebuf_iter = smaj_writebuf_start;
+    while (1) {
+      if (variant_batch_idx >= variant_batch_idx_full_end) {
+	if (variant_batch_idx * kPglQuaterTransposeBatch >= variant_idx_end) {
+	  break;
+	}
+	variant_batch_size = variant_idx_end - variant_batch_idx * kPglQuaterTransposeBatch;
+      }
+      transpose_quaterblock(vmaj_readbuf_iter, read_sample_ctaw2, variant_batch_word_ct, variant_batch_size, sample_batch_size, smaj_writebuf_iter, vecaligned_buf);
+      smaj_writebuf_iter = &(smaj_writebuf_iter[kPglQuaterTransposeWords]);
+      vmaj_readbuf_iter = &(vmaj_readbuf_iter[variant_batch_size * read_sample_ctaw2]);
+      ++variant_batch_idx;
+    }
+    smaj_writebuf_iter = smaj_writebuf_start;
+    for (uint32_t sample_idx = 0; sample_idx < sample_batch_size; ++sample_idx) {
+      // could fold this into transpose_quaterblock(), but I won't bother
+      // since we're already saturating at ~3 threads
+      pgr_plink2_to_plink1_inplace_unsafe(thread_variant_ct, smaj_writebuf_iter);
+      zero_trailing_quaters(thread_variant_ct, smaj_writebuf_iter);
+      smaj_writebuf_iter = &(smaj_writebuf_iter[variant_batch_word_ct]);
+    }
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+    sample_widx += sample_batch_size / kBitsPerWordD2;
+  }
+}
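+
+// Editor's note: the write thread above overlaps with the flush loop in
+// export_ind_major_bed() via two buffers selected by a parity bit, in the
+// usual ping-pong pattern: workers fill g_smaj_writebufs[parity] while the
+// main thread drains g_smaj_writebufs[1 - parity], and both flip parity once
+// per block after synchronizing.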
+
+pglerr_t export_ind_major_bed(const uintptr_t* orig_sample_include, const uintptr_t* variant_include, const uintptr_t* variant_allele_idxs, const alt_allele_ct_t* refalt1_select, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_thread_ct, uintptr_t pgr_alloc_cacheline_ct, pgen_file_info_t* pgfip, char* outname, char* outname_end) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    // Possible special case: if the input file is a variant-major .bed, we do
+    // not have enough memory to just load the whole file at once, and there
+    // are more than ~20k samples, there can be a performance advantage to not
+    // loading an entire variant at a time; we can use smaller fread calls and
+    // reduce the number of (typically 4096 byte) disk blocks which need to be
+    // read on each pass.  But let's get .pgen -> sample-major humming first.
+    strcpy(outname_end, ".bed");
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto export_ind_major_bed_ret_OPEN_FAIL;
+    }
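+    // .bed header: magic bytes 0x6c 0x1b, then mode 0x00 = sample-major
+    // (0x01 would mean variant-major)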
+    if (fwrite_checked("l\x1b\0", 3, outfile)) {
+      goto export_ind_major_bed_ret_WRITE_FAIL;
+    }
+    if (variant_ct && sample_ct) {
+      const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+      uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+      // todo: if only 1 pass is needed, and no subsetting is happening, this
+      // saturates at ~4 threads?
+      unsigned char* bigstack_end_mark = g_bigstack_end;
+      // restrict multithread_load_init() to half of available workspace
+      g_bigstack_end = &(g_bigstack_base[round_up_pow2(bigstack_left() / 2, kCacheline)]);
+      unsigned char* main_loadbufs[2];
+      pthread_t* threads;
+      uint32_t read_block_size;
+      if (multithread_load_init(variant_include, sample_ct, variant_ct, pgr_alloc_cacheline_ct, 0, 0, pgfip, &calc_thread_ct, &g_genovecs, nullptr, nullptr, &read_block_size, main_loadbufs, &threads, &g_pgr_ptrs, &g_read_variant_uidx_starts)) {
+	goto export_ind_major_bed_ret_NOMEM;
+      }
+      g_bigstack_end = bigstack_end_mark;
+      g_variant_include = variant_include;
+      g_variant_allele_idxs = variant_allele_idxs;
+      g_refalt1_select = refalt1_select;
+      g_calc_thread_ct = calc_thread_ct;
+      
+      const uintptr_t variant_cacheline_ct = QUATERCT_TO_CLCT(variant_ct);
+      uint32_t output_calc_thread_ct = MINV(calc_thread_ct, variant_cacheline_ct);
+      if (output_calc_thread_ct > 4) {
+	output_calc_thread_ct = 4;
+      }
+      uintptr_t* sample_include;
+      uint32_t* sample_include_cumulative_popcounts;
+      if (bigstack_alloc_ul(raw_sample_ctl, &sample_include) ||
+	  bigstack_alloc_ui(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
+	  bigstack_alloc_vp(output_calc_thread_ct, &g_thread_vecaligned_bufs)) {
+	goto export_ind_major_bed_ret_NOMEM;
+      }
+      for (uint32_t tidx = 0; tidx < output_calc_thread_ct; ++tidx) {
+	g_thread_vecaligned_bufs[tidx] = (vul_t*)bigstack_alloc_raw(kPglQuaterTransposeBufbytes);
+      }
+      // each of the two write buffers should use <= 1/8 of the remaining
+      // workspace
+      const uintptr_t writebuf_cachelines_avail = bigstack_left() / (kCacheline * 8);
+      uint32_t sample_batch_size = kPglQuaterTransposeBatch;
+      if (variant_cacheline_ct * kPglQuaterTransposeBatch > writebuf_cachelines_avail) {
+	sample_batch_size = round_down_pow2(writebuf_cachelines_avail / variant_cacheline_ct, kBitsPerWordD2);
+	if (!sample_batch_size) {
+	  goto export_ind_major_bed_ret_NOMEM;
+	}
+      }
+      g_smaj_writebufs[0] = (uintptr_t*)bigstack_alloc_raw(variant_cacheline_ct * kCacheline * sample_batch_size);
+      g_smaj_writebufs[1] = (uintptr_t*)bigstack_alloc_raw(variant_cacheline_ct * kCacheline * sample_batch_size);
+      const uintptr_t readbuf_vecs_avail = (bigstack_left() / kCacheline) * kVecsPerCacheline;
+      if (readbuf_vecs_avail < variant_ct) {
+	goto export_ind_major_bed_ret_NOMEM;
+      }
+      uintptr_t read_sample_ctv2 = readbuf_vecs_avail / variant_ct;
+      uint32_t read_sample_ct;
+      if (read_sample_ctv2 >= QUATERCT_TO_VECCT(sample_ct)) {
+	read_sample_ct = sample_ct;
+      } else {
+	read_sample_ct = read_sample_ctv2 * kQuatersPerVec;
+      }
+      uintptr_t read_sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(read_sample_ct);
+      uintptr_t* vmaj_readbuf = (uintptr_t*)bigstack_alloc_raw_rd(variant_ct * read_sample_ctaw2 * kBytesPerWord);
+      g_variant_ct = variant_ct;
+      g_output_calc_thread_ct = output_calc_thread_ct;
+      g_error_ret = kPglRetSuccess;
+      uint32_t sample_uidx_start = next_set_unsafe(orig_sample_include, 0);
+      const uintptr_t variant_ct4 = QUATERCT_TO_BYTECT(variant_ct);
+      const uintptr_t variant_ctaclw2 = variant_cacheline_ct * kWordsPerCacheline;
+      const uint32_t read_block_sizel = BITCT_TO_WORDCT(read_block_size);
+      const uint32_t read_block_ct_m1 = (raw_variant_ct - 1) / read_block_size;
+      const uint32_t pass_ct = 1 + (sample_ct - 1) / read_sample_ct;
+      for (uint32_t pass_idx = 0; pass_idx < pass_ct; ++pass_idx) {
+	memcpy(sample_include, orig_sample_include, raw_sample_ctl * sizeof(intptr_t));
+	if (sample_uidx_start) {
+	  clear_bits_nz(0, sample_uidx_start, sample_include);
+	}
+	uint32_t sample_uidx_end;
+	if (pass_idx + 1 == pass_ct) {
+	  read_sample_ct = sample_ct - pass_idx * read_sample_ct;
+	  read_sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(read_sample_ct);
+	  sample_uidx_end = raw_sample_ct;
+	} else {
+	  sample_uidx_end = jump_forward_set_unsafe(orig_sample_include, sample_uidx_start + 1, read_sample_ct);
+	  clear_bits_nz(sample_uidx_end, raw_sample_ct, sample_include);
+	}
+        fill_cumulative_popcounts(sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
+	g_sample_include = sample_include;
+	g_sample_include_cumulative_popcounts = sample_include_cumulative_popcounts;
+	g_vmaj_readbuf = vmaj_readbuf;
+	g_sample_ct = read_sample_ct;
+	if (pass_idx) {
+	  pgfip->block_base = main_loadbufs[0];
+	  for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	    pgr_clear_ld_cache(g_pgr_ptrs[tidx]);
+	    g_pgr_ptrs[tidx]->fi.block_base = main_loadbufs[0];
+	    g_pgr_ptrs[tidx]->fi.block_offset = 0;
+	  }
+	}
+	uint32_t parity = 0;
+	uint32_t read_block_idx = 0;
+        uint32_t variant_idx = 0;
+	uint32_t is_last_block = 0;
+	uint32_t cur_read_block_size = read_block_size;
+	uint32_t pct = 0;
+	uint32_t next_print_idx = variant_ct / 100;
+	putc_unlocked('\r', stdout);
+	printf("--export ind-major-bed pass %u/%u: loading... 0%%", pass_idx + 1, pass_ct);
+	fflush(stdout);
+	while (1) {
+	  uintptr_t cur_block_write_ct = 0;
+	  if (!is_last_block) {
+	    while (read_block_idx < read_block_ct_m1) {
+	      cur_block_write_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), read_block_sizel);
+	      if (cur_block_write_ct) {
+		break;
+	      }
+	      ++read_block_idx;
+	    }
+	    if (read_block_idx == read_block_ct_m1) {
+	      cur_read_block_size = raw_variant_ct - (read_block_idx * read_block_size);
+	      cur_block_write_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), BITCT_TO_WORDCT(cur_read_block_size));
+	    }
+	    if (pgfi_multiread(variant_include, read_block_idx * read_block_size, read_block_idx * read_block_size + cur_read_block_size, cur_block_write_ct, pgfip)) {
+	      if (variant_idx) {
+		join_threads2z(calc_thread_ct, 0, threads);
+		g_cur_block_write_ct = 0;
+		error_cleanup_threads2z(transpose_to_smaj_read_thread, calc_thread_ct, threads);
+	      }
+	      goto export_ind_major_bed_ret_THREAD_CREATE_FAIL;
+	    }
+	  }
+	  if (variant_idx) {
+	    join_threads2z(calc_thread_ct, is_last_block, threads);
+	    reterr = g_error_ret;
+	    if (reterr) {
+	      if (!is_last_block) {
+		g_cur_block_write_ct = 0;
+		error_cleanup_threads2z(transpose_to_smaj_read_thread, calc_thread_ct, threads);
+	      }
+	      if (reterr == kPglRetMalformedInput) {
+		logprint("\n");
+		logerrprint("Error: Malformed .pgen file.\n");
+	      }
+	      goto export_ind_major_bed_ret_1;
+	    }
+	  }
+	  if (!is_last_block) {
+	    g_cur_block_write_ct = cur_block_write_ct;
+	    compute_uidx_start_partition(variant_include, cur_block_write_ct, calc_thread_ct, read_block_idx * read_block_size, g_read_variant_uidx_starts);
+	    for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	      g_pgr_ptrs[tidx]->fi.block_base = pgfip->block_base;
+	      g_pgr_ptrs[tidx]->fi.block_offset = pgfip->block_offset;
+	    }
+	    is_last_block = (variant_idx + cur_block_write_ct == variant_ct);
+	    if (spawn_threads2z(transpose_to_smaj_read_thread, calc_thread_ct, is_last_block, threads)) {
+	      goto export_ind_major_bed_ret_THREAD_CREATE_FAIL;
+	    }
+	  }
+	  parity = 1 - parity;
+	  if (variant_idx == variant_ct) {
+	    break;
+	  }
+	  if (variant_idx >= next_print_idx) {
+	    if (pct > 10) {
+	      putc_unlocked('\b', stdout);
+	    }
+	    pct = (variant_idx * 100LLU) / variant_ct;
+	    printf("\b\b%u%%", pct++);
+	    fflush(stdout);
+	    next_print_idx = (pct * ((uint64_t)variant_ct)) / 100;
+	  }
+
+	  ++read_block_idx;
+	  variant_idx += cur_block_write_ct;
+	  pgfip->block_base = main_loadbufs[parity];
+	}
+	// 2. Transpose and write.  (Could parallelize some of the transposing
+	//    with the read loop, but since we can't write a single row until
+	//    the read loop is done, and both write speed and write buffer
+	//    space are bottlenecks, that can't be expected to help much.)
+	g_sample_batch_size = sample_batch_size;
+	parity = 0;
+	is_last_block = 0;
+	if (pct > 10) {
+	  fputs("\b \b", stdout);
+	}
+	fputs("\b\b\b\b\b\b\b\b\b\b\b\b\bwriting... 0%", stdout);
+	fflush(stdout);
+	pct = 0;
+	uint32_t flush_sample_idx = 0;
+	uint32_t flush_sample_idx_end = 0;
+	next_print_idx = read_sample_ct / 100;
+	while (1) {
+	  if (!is_last_block) {
+	    is_last_block = (flush_sample_idx_end + sample_batch_size >= read_sample_ct);
+	    if (is_last_block) {
+	      g_sample_batch_size = read_sample_ct - flush_sample_idx_end;
+	    }
+	    if (spawn_threads2z(transpose_to_plink1_smaj_write_thread, output_calc_thread_ct, is_last_block, threads)) {
+	      goto export_ind_major_bed_ret_THREAD_CREATE_FAIL;
+	    }
+	  }
+	  if (flush_sample_idx_end) {
+	    uintptr_t* smaj_writebuf_iter = g_smaj_writebufs[1 - parity];
+	    for (; flush_sample_idx < flush_sample_idx_end; ++flush_sample_idx) {
+	      fwrite(smaj_writebuf_iter, variant_ct4, 1, outfile);
+	      smaj_writebuf_iter = &(smaj_writebuf_iter[variant_ctaclw2]);
+	    }
+	    if (flush_sample_idx_end == read_sample_ct) {
+	      break;
+	    }
+	    if (flush_sample_idx_end >= next_print_idx) {
+	      if (pct > 10) {
+		putc_unlocked('\b', stdout);
+	      }
+	      pct = (flush_sample_idx_end * 100LLU) / read_sample_ct;
+	      printf("\b\b%u%%", pct++);
+	      fflush(stdout);
+	      next_print_idx = (pct * ((uint64_t)read_sample_ct)) / 100;
+	    }
+	  }
+	  join_threads2z(output_calc_thread_ct, is_last_block, threads);
+	  if (ferror(outfile)) {
+	    // may as well put this check here, while there are no threads to
+	    // clean up
+	    goto export_ind_major_bed_ret_WRITE_FAIL;
+	  }
+	  parity = 1 - parity;
+	  flush_sample_idx_end += sample_batch_size;
+	  if (flush_sample_idx_end > read_sample_ct) {
+	    flush_sample_idx_end = read_sample_ct;
+	  }
+	}
+	if (pct > 10) {
+	  fputs("\b \b", stdout);
+	}
+	sample_uidx_start = sample_uidx_end;
+      }
+      fputs("\b\bdone.\n", stdout);
+    }
+    if (fclose_null(&outfile)) {
+      goto export_ind_major_bed_ret_WRITE_FAIL;
+    }
+    LOGPRINTFWW("--export ind-major-bed: %s written.\n", outname);
+  }
+  while (0) {
+  export_ind_major_bed_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  export_ind_major_bed_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  export_ind_major_bed_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  export_ind_major_bed_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ export_ind_major_bed_ret_1:
+  fclose_cond(outfile);
+  pgfip->block_base = nullptr;
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
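+
+// Editor's note: the pass count above is a ceiling division -- each pass
+// loads as many samples as fit in the remaining workspace.  A minimal sketch
+// (hypothetical name):
+static uint32_t pass_count(uint32_t sample_ct, uint32_t samples_per_pass) {
+  // 1 + (n - 1) / k == ceil(n / k) for n >= 1, without risking overflow
+  return 1 + (sample_ct - 1) / samples_per_pass;
+}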
+
+static_assert(kDosageMid == 16384, "print_gen_dosage() needs to be updated.");
+char* print_gen_dosage(uint32_t rawval, char* start) {
+  // Similar to print_small_dosage(), but it's complicated by .gen import's
+  // quantization step (instead of rounding the numbers directly, they're first
+  // converted to bgen-1.1 equivalents).  We check
+  //   ((n - 0.75)/16384, (n + 0.75)/16384) for even n
+  //   ((n - 0.25)/16384, (n + 0.25)/16384) for odd n
+  // due to banker's rounding.
+  *start++ = '0' + (rawval / 16384);
+  rawval = rawval % 16384;
+  if (!rawval) {
+    return start;
+  }
+  *start++ = '.';
+  const uint32_t halfwidth_65536ths = 3 - (2 * (rawval % 2));
+  // (rawval * 4) is in 65536ths
+  // 65536 * 625 = 40960k
+
+  const uint32_t range_top_40960k = (rawval * 4 + halfwidth_65536ths) * 625;
+  // this is technically checking a half-open rather than a fully-open
+  // interval, but that's fine since we never hit the boundary points
+  if ((range_top_40960k % 4096) < 1250 * halfwidth_65536ths) {
+    // when this is true, the four-decimal-place approximation is in the range
+    // which round-trips back to our original number.
+    const uint32_t four_decimal_places = range_top_40960k / 4096;
+    return uitoa_trunc4(four_decimal_places, start);
+  }
+  
+  // We wish to print (100000 * remainder + 8192) / 16384, left-0-padded, and
+  // may as well banker's-round while we're at it.
+  //
+  // banker's rounding yields a different result than regular rounding for n/64
+  // when n is congruent to 1 mod 4:
+  //   1/64 = .015625 -> print 0.01562
+  //   3/64 = .046875 -> print 0.04688
+  //   5/64 = .078125 -> print 0.07812
+  const uint32_t five_decimal_places = ((3125 * rawval + 256) / 512) - ((rawval % 1024) == 256);
+  const uint32_t first_decimal_place = five_decimal_places / 10000;
+  *start++ = '0' + first_decimal_place;
+  const uint32_t last_four_digits = five_decimal_places - first_decimal_place * 10000;
+  if (last_four_digits) {
+    return uitoa_trunc4(last_four_digits, start);
+  }
+  return start;
+}
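+
+// Editor's note: a distilled, standalone form of the interval test above
+// (hypothetical helper, not part of upstream).  Returns 1 iff the fractional
+// part n/16384 admits a four-decimal-place representation that survives .gen
+// import's bgen-1.1 quantization round trip.
+static int gen_dosage_has_4dp_roundtrip(uint32_t remainder_16384ths) {
+  // acceptance half-width is 0.75/16384 for even n, 0.25/16384 for odd n
+  const uint32_t halfwidth_65536ths = 3 - 2 * (remainder_16384ths % 2);
+  const uint32_t range_top_40960k = (remainder_16384ths * 4 + halfwidth_65536ths) * 625;
+  return (range_top_40960k % 4096) < (1250 * halfwidth_65536ths);
+}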
+
+// note that this is also in plink2_filter; may belong in plink2_common
+static inline void incr_missing_row(const uintptr_t* genovec, uint32_t acc2_vec_ct, uintptr_t* missing_acc2) {
+  const vul_t* genovvec = (const vul_t*)genovec;
+  vul_t* missing_acc2v = (vul_t*)missing_acc2;
+  const vul_t m1 = VCONST_UL(kMask5555);
+  for (uint32_t vidx = 0; vidx < acc2_vec_ct; ++vidx) {
+    const vul_t geno_vword = genovvec[vidx];
+    const vul_t geno_vword_shifted_masked = vul_rshift(geno_vword, 1) & m1;
+    missing_acc2v[vidx] = missing_acc2v[vidx] + (geno_vword & geno_vword_shifted_masked);
+  }
+}
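+
+// Editor's note: scalar equivalent of the vector loop above (standalone
+// sketch).  A 2-bit genotype of 3 (0b11) encodes a missing call; ANDing each
+// genotype with its own high bit shifted down leaves 1 in exactly the missing
+// lanes, which is then added into the matching 2-bit accumulator lane.  Lanes
+// overflow after 3 increments, hence the periodic folding into wider
+// accumulators elsewhere in this file.
+static void incr_missing_row_scalar(const uintptr_t* genovec, uint32_t word_ct, uintptr_t* missing_acc2) {
+  const uintptr_t m1 = (~(uintptr_t)0) / 3;  // 0x5555...: low bit of each 2-bit lane
+  for (uint32_t widx = 0; widx < word_ct; ++widx) {
+    const uintptr_t geno_word = genovec[widx];
+    missing_acc2[widx] += geno_word & (geno_word >> 1) & m1;
+  }
+}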
+
+pglerr_t export_ox_gen(const char* outname, const uintptr_t* sample_include, const uint32_t* sample_include_cumulative_popcounts, const uintptr_t* sex_male, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const alt_allele_ct_t* refalt1_select, uint32_t sample_ct, uint32_t variant_ct, uint32_t max_allele_slen, exportf_flags_t exportf_modifier, pgen_reader_t* simple_pgrp,  [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+    const uint32_t sample_ctv = BITCT_TO_VECCT(sample_ct);
+    uintptr_t* genovec;
+    uintptr_t* sex_male_collapsed_interleaved;
+    uintptr_t* sex_male_collapsed_tmp;
+    // if we weren't using bigstack_alloc, this would need to be sample_ctaw2
+    if (bigstack_alloc_ul(sample_ctl2, &genovec) ||
+	bigstack_alloc_ul(sample_ctv * kWordsPerVec, &sex_male_collapsed_interleaved) ||
+	bigstack_alloc_ul(sample_ctv * kWordsPerVec, &sex_male_collapsed_tmp)) {
+      goto export_ox_gen_ret_NOMEM;
+    }
+    copy_bitarr_subset(sex_male, sample_include, sample_ct, sex_male_collapsed_tmp);
+    fill_interleaved_mask_vec(sex_male_collapsed_tmp, sample_ctv, sex_male_collapsed_interleaved);
+    bigstack_reset(sex_male_collapsed_tmp);
+    
+    // See load_sample_missing_cts in plink2_filter.cpp.
+    // Yes, this is overkill, but the obvious alternative of incrementing
+    // sample_missing_geno_cts[] when writing a missing call requires a bit of
+    // custom chrY logic anyway.
+    const uint32_t acc2_vec_ct = QUATERCT_TO_VECCT(sample_ct);
+    uintptr_t* missing_acc2;
+    if (bigstack_calloc_ul(acc2_vec_ct * kWordsPerVec * 23, &missing_acc2)) {
+      goto export_ox_gen_ret_NOMEM;
+    }
+    const uint32_t acc4_vec_ct = acc2_vec_ct * 2;
+    const uint32_t acc8_vec_ct = acc2_vec_ct * 4;
+    uintptr_t* missing_acc4 = &(missing_acc2[acc2_vec_ct * kWordsPerVec]);
+    uintptr_t* missing_acc8 = &(missing_acc4[acc4_vec_ct * kWordsPerVec]);
+    uintptr_t* missing_acc32 = &(missing_acc8[acc8_vec_ct * kWordsPerVec]);
+
+    const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+    uintptr_t* dosage_present = nullptr;
+    dosage_t* dosage_vals = nullptr;
+    if (simple_pgrp->fi.gflags & kfPgenGlobalDosagePresent) {
+      const uint32_t multiallelic_present = (variant_allele_idxs != nullptr);
+      if (bigstack_alloc_dosage(sample_ct * (1 + multiallelic_present), &dosage_vals) ||
+	  bigstack_alloc_ul(sample_ctl, &dosage_present)) {
+	goto export_ox_gen_ret_NOMEM;
+      }
+    }
+    const uint32_t max_chr_blen = get_max_chr_slen(cip) + 1;
+    // if no dosages, all genotypes are 6 bytes (missing = " 0 0 0")
+    // with dosages, we print up to 5 digits past the decimal point, so 7 bytes
+    //   + space for each number, 24 bytes max
+    const uintptr_t max_geno_slen = 6 + (dosage_present != nullptr) * 18;
+    char* chr_buf; // includes trailing space
+    char* writebuf;
+    if (bigstack_alloc_c(max_chr_blen, &chr_buf) ||
+	bigstack_alloc_c(kCompressStreamBlock + max_chr_blen + kMaxIdSlen + 16 + 2 * max_allele_slen + max_geno_slen * sample_ct, &writebuf)) {
+      goto export_ox_gen_ret_NOMEM;
+    }
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto export_ox_gen_ret_OPEN_FAIL;
+    }
+    char* writebuf_flush = &(writebuf[kCompressStreamBlock]);
+    char* write_iter = writebuf;
+    uint32_t variant_uidx = 0;
+    uint32_t chr_blen = 0;
+
+    // although we don't support --set-hh-missing, etc. here, we do still want
+    // to be aware of chrY so we can exclude nonmales from the
+    // sample_missing_geno_cts update there.
+    uint32_t is_y = 0;
+
+    uint32_t chr_fo_idx = 0xffffffffU;
+    const int32_t y_code = cip->xymt_codes[kChrOffsetY];
+    uint32_t chr_end = 0;
+    uint32_t vidx_rem3 = 3;
+    uint32_t vidx_rem15d3 = 5;
+    uint32_t vidx_rem255d15 = 17;
+    const uint32_t sample_ctl2_m1 = sample_ctl2 - 1;
+    const char hardcall_strs[] = " 1 0 0   0 1 0   0 0 1   0 0 0";
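+    // (genotype g selects the 6-byte field at offset 8*g: 0 -> " 1 0 0" hom
+    //  first allele, 1 -> " 0 1 0" het, 2 -> " 0 0 1" hom second allele,
+    //  3 -> " 0 0 0" missing)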
+    const uint32_t ref_allele_second = !(exportf_modifier & kfExportfRefFirst);
+    LOGPRINTFWW5("Writing %s ... ", outname);
+    fputs("0%", stdout);
+    fflush(stdout);
+    uint32_t pct = 0;
+    uint32_t next_print_variant_idx = variant_ct / 100;
+    uint32_t ref_allele_idx = 0;
+    uint32_t alt1_allele_idx = 1;
+    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      if (variant_uidx >= chr_end) {
+	do {
+	  ++chr_fo_idx;
+	  chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	} while (variant_uidx >= chr_end);
+	const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	char* chr_name_end = chr_name_write(cip, chr_idx, chr_buf);
+	// Oxford spec doesn't seem to require spaces for .gen (only .sample),
+	// but in practice spaces always seem to be used, and plink 1.9 doesn't
+	// let you toggle this, so let's not worry about supporting tabs here
+	*chr_name_end++ = ' ';
+	chr_blen = (uintptr_t)(chr_name_end - chr_buf);
+	is_y = (chr_idx == y_code);
+      }
+      write_iter = memcpya(write_iter, chr_buf, chr_blen);
+      write_iter = strcpyax(write_iter, variant_ids[variant_uidx], ' ');
+      write_iter = uint32toa_x(variant_bps[variant_uidx], ' ', write_iter);
+      uintptr_t variant_allele_idx_base = variant_uidx * 2;
+      if (variant_allele_idxs) {
+	variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+      }
+      uint32_t is_explicit_alt1;
+      if (refalt1_select) {
+	ref_allele_idx = refalt1_select[variant_uidx * 2];
+	alt1_allele_idx = refalt1_select[variant_uidx * 2 + 1];
+      }
+      // todo: multiallelic case
+      uint32_t dosage_ct;
+      reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, variant_uidx, simple_pgrp, genovec, dosage_present, dosage_vals, &dosage_ct, &is_explicit_alt1);
+      if (reterr) {
+	if (reterr != kPglRetReadFail) {
+	  logprint("\n");
+	  logerrprint("Error: Malformed .pgen file.\n");
+	}
+	goto export_ox_gen_ret_1;
+      }
+      if (ref_allele_idx + ref_allele_second == 1) {
+	assert((!dosage_ct) || (!is_explicit_alt1));
+	genovec_invert_unsafe(sample_ct, genovec);
+	biallelic_dosage16_invert(dosage_ct, dosage_vals);
+      }
+      
+      char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+      if (ref_allele_second) {
+	write_iter = strcpyax(write_iter, cur_alleles[alt1_allele_idx], ' ');
+	write_iter = strcpya(write_iter, cur_alleles[ref_allele_idx]);
+      } else {
+	write_iter = strcpyax(write_iter, cur_alleles[ref_allele_idx], ' ');
+	write_iter = strcpya(write_iter, cur_alleles[alt1_allele_idx]);
+      }
+      uint32_t widx = 0;
+      uint32_t inner_loop_last = kBitsPerWordD2 - 1;
+      if (!dosage_ct) {
+	while (1) {
+	  if (widx >= sample_ctl2_m1) {
+	    if (widx > sample_ctl2_m1) {
+	      break;
+	    }
+	    inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	  }
+	  uintptr_t geno_word = genovec[widx];
+	  for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+	    write_iter = memcpya(write_iter, &(hardcall_strs[(geno_word & 3) * 8]), 6);
+	    geno_word >>= 2;
+	  }
+	  ++widx;
+	}
+      } else {
+	const halfword_t* dosage_present_alias = (halfword_t*)dosage_present;
+	const dosage_t* dosage_vals_iter = dosage_vals;
+	if (!is_explicit_alt1) {
+	  while (1) {
+	    if (widx >= sample_ctl2_m1) {
+	      if (widx > sample_ctl2_m1) {
+		break;
+	      }
+	      inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	    }
+	    uintptr_t geno_word = genovec[widx];
+	    uint32_t dosage_present_hw = dosage_present_alias[widx];
+	    if (!dosage_present_hw) {
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		write_iter = memcpya(write_iter, &(hardcall_strs[(geno_word & 3) * 8]), 6);
+		geno_word >>= 2;
+	      }
+	    } else {
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		if (dosage_present_hw & 1) {
+		  const uint32_t dosage_int = *dosage_vals_iter++;
+		  if (dosage_int <= kDosageMid) {
+		    *write_iter++ = ' ';
+		    write_iter = print_gen_dosage(kDosageMid - dosage_int, write_iter);
+		    *write_iter++ = ' ';
+		    write_iter = print_gen_dosage(dosage_int, write_iter);
+		    write_iter = strcpya(write_iter, " 0");
+		  } else {
+		    assert(dosage_int <= kDosageMax);
+		    write_iter = memcpyl3a(write_iter, " 0 ");
+		    write_iter = print_gen_dosage(kDosageMax - dosage_int, write_iter);
+		    *write_iter++ = ' ';
+		    write_iter = print_gen_dosage(dosage_int - kDosageMid, write_iter);
+		  }
+		} else {
+		  write_iter = memcpya(write_iter, &(hardcall_strs[(geno_word & 3) * 8]), 6);
+		}
+		geno_word >>= 2;
+		dosage_present_hw >>= 1;
+	      }
+	    }
+	    ++widx;
+	  }
+	} else {
+	  // todo
+	  // In multiallelic case, if ref/alt1 dosages sum to less than 2 (but
+	  // more than 0), we first internally rescale them to sum to 2, to
+	  // make .gen and bgen-1.1 export isomorphic, and bgen-1.2 export as
+	  // similar as possible.
+	  assert(0);
+	}
+      }
+      append_binary_eoln(&write_iter);
+      if (write_iter >= writebuf_flush) {
+	if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	  goto export_ox_gen_ret_WRITE_FAIL;
+	}
+	write_iter = writebuf;
+      }
+      if (is_y) {
+	interleaved_mask_zero(sex_male_collapsed_interleaved, acc2_vec_ct, genovec);
+      }
+      incr_missing_row(genovec, acc2_vec_ct, missing_acc2);
+      if (!(--vidx_rem3)) {
+	unroll_zero_incr_2_4(acc2_vec_ct, missing_acc2, missing_acc4);
+	vidx_rem3 = 3;
+	if (!(--vidx_rem15d3)) {
+	  unroll_zero_incr_4_8(acc4_vec_ct, missing_acc4, missing_acc8);
+	  vidx_rem15d3 = 5;
+	  if (!(--vidx_rem255d15)) {
+	    unroll_zero_incr_8_32(acc8_vec_ct, missing_acc8, missing_acc32);
+	    vidx_rem255d15 = 17;
+	  }
+	}
+      }
+      if (variant_idx >= next_print_variant_idx) {
+	if (pct > 10) {
+	  putc_unlocked('\b', stdout);
+	}
+	pct = (variant_idx * 100LLU) / variant_ct;
+	printf("\b\b%u%%", pct++);
+	fflush(stdout);
+	next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+      }
+    }
+    if (write_iter != writebuf) {
+      if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	goto export_ox_gen_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto export_ox_gen_ret_WRITE_FAIL;
+    }
+    if (pct > 10) {
+      putc_unlocked('\b', stdout);
+    }
+    fputs("\b\b", stdout);
+    LOGPRINTF("done.\n");
+    unroll_incr_2_4(missing_acc2, acc2_vec_ct, missing_acc4);
+    unroll_incr_4_8(missing_acc4, acc4_vec_ct, missing_acc8);
+    unroll_incr_8_32(missing_acc8, acc8_vec_ct, missing_acc32);
+    uint32_t* scrambled_missing_cts = (uint32_t*)missing_acc32;
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+      const uint32_t scrambled_idx = scramble_2_4_8_32(sample_idx);
+      sample_missing_geno_cts[sample_idx] = scrambled_missing_cts[scrambled_idx];
+    }
+  }
+  while (0) {
+  export_ox_gen_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  export_ox_gen_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  export_ox_gen_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  }
+ export_ox_gen_ret_1:
+  fclose_cond(outfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
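+
+// Editor's note: the vidx_rem3/vidx_rem15d3/vidx_rem255d15 countdowns above
+// implement a 2->4->8->32-bit accumulator cascade: 2-bit lanes hold at most
+// 3, 4-bit lanes at most 15, and 8-bit lanes at most 255, so acc2 folds into
+// acc4 every 3 variants, acc4 into acc8 every 15 (= 3*5), and acc8 into
+// acc32 every 255 (= 15*17) -- hence the initial values 3, 5, and 17.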
+
+#ifdef __arm__
+  #error "Unaligned accesses in export_ox_hapslegend()."
+#endif
+pglerr_t export_ox_hapslegend(const uintptr_t* sample_include, const uint32_t* sample_include_cumulative_popcounts, const uintptr_t* sex_male_collapsed, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const alt_allele_ct_t* refalt1_select, uint32_t sample_ct, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, exportf_flags_t exportf_modifier, pgen_re [...]
+  assert(sample_ct);
+  assert(variant_ct);
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+    const uint32_t just_haps = (exportf_modifier / kfExportfHaps) & 1;
+    const uint32_t male_ct = popcount_longs(sex_male_collapsed, sample_ctl);
+    if (xymt_is_nonempty(variant_include, cip, kChrOffsetY) && (male_ct != sample_ct)) {
+      LOGERRPRINTF("Error: '--export haps%s' must exclude chrY unless the dataset is all-male.\n", just_haps? "" : "legend");
+      goto export_ox_hapslegend_ret_INCONSISTENT_INPUT;
+    }
+    const uint32_t ref_allele_second = !(exportf_modifier & kfExportfRefFirst);
+    const int32_t x_code = cip->xymt_codes[kChrOffsetX];
+    const int32_t mt_code = cip->xymt_codes[kChrOffsetMT];
+    char* chr_buf = nullptr;
+    uint32_t is_x = 0;
+    uint32_t is_haploid_or_mt = 0;
+    uint32_t variant_uidx = next_set_unsafe(variant_include, 0);
+    uint32_t chr_fo_idx = 0xffffffffU;
+    uint32_t chr_end = 0;
+    uint32_t ref_allele_idx = 0;
+    uint32_t alt1_allele_idx = 1;
+    uintptr_t writebuf_alloc = 0;
+    if (!just_haps) {
+      // .legend doesn't have a chromosome column, so verify we only need to
+      // export a single chromosome
+      const uint32_t variant_uidx_start = variant_uidx;
+      chr_fo_idx = get_variant_chr_fo_idx(cip, variant_uidx_start);
+      chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+      if ((chr_end != raw_variant_ct) && (popcount_bit_idx(variant_include, variant_uidx_start, chr_end) != variant_ct)) {
+	logerrprint("Error: '--export hapslegend' does not support multiple chromosomes.\n");
+	goto export_ox_hapslegend_ret_INCONSISTENT_INPUT;
+      }
+      const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+      is_x = (chr_idx == x_code);
+      is_haploid_or_mt = is_set(cip->haploid_mask, chr_idx) || (chr_idx == mt_code);
+      strcpy(outname_end, ".legend");
+      if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+	goto export_ox_hapslegend_ret_OPEN_FAIL;
+      }
+      char* writebuf;
+      if (bigstack_alloc_c(kCompressStreamBlock + kMaxIdSlen + 32 + 2 * max_allele_slen, &writebuf)) {
+	goto export_ox_hapslegend_ret_NOMEM;
+      }
+      char* writebuf_flush = &(writebuf[kCompressStreamBlock]);
+      char* write_iter = strcpya(writebuf, "id position a0 a1" EOLN_STR);
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fflush(stdout);
+      for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+	next_set_unsafe_ck(variant_include, &variant_uidx);
+	write_iter = strcpyax(write_iter, variant_ids[variant_uidx], ' ');
+	write_iter = uint32toa_x(variant_bps[variant_uidx], ' ', write_iter);
+	if (refalt1_select) {
+	  ref_allele_idx = refalt1_select[variant_uidx * 2];
+	  alt1_allele_idx = refalt1_select[variant_uidx * 2 + 1];
+	}
+	uintptr_t variant_allele_idx_base = variant_uidx * 2;
+	if (variant_allele_idxs) {
+	  variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	}
+	char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+        if (ref_allele_second) {
+	  write_iter = strcpyax(write_iter, cur_alleles[alt1_allele_idx], ' ');
+	  write_iter = strcpya(write_iter, cur_alleles[ref_allele_idx]);
+	} else {
+	  write_iter = strcpyax(write_iter, cur_alleles[ref_allele_idx], ' ');
+	  write_iter = strcpya(write_iter, cur_alleles[alt1_allele_idx]);
+	}
+	append_binary_eoln(&write_iter);
+	if (write_iter >= writebuf_flush) {
+	  if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	    goto export_ox_hapslegend_ret_WRITE_FAIL;
+	  }
+	  write_iter = writebuf;
+	}
+      }
+      if (write_iter != writebuf) {
+	if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	  goto export_ox_hapslegend_ret_WRITE_FAIL;
+	}
+      }
+      if (fclose_null(&outfile)) {
+	goto export_ox_hapslegend_ret_WRITE_FAIL;
+      }
+      logprint("done.\n");
+      variant_uidx = variant_uidx_start;
+      bigstack_reset(writebuf);
+    } else {
+      const uint32_t max_chr_blen = get_max_chr_slen(cip) + 1;
+      if (bigstack_alloc_c(max_chr_blen, &chr_buf)) {
+	goto export_ox_hapslegend_ret_NOMEM;
+      }
+      writebuf_alloc = max_chr_blen + kMaxIdSlen + 32 + 2 * max_allele_slen;
+    }
+    writebuf_alloc += kCompressStreamBlock + (4 * k1LU) * sample_ct + kCacheline;
+    const uint32_t sample_ctv = BITCT_TO_VECCT(sample_ct);
+    const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+    const uint32_t sample_ctl2_m1 = sample_ctl2 - 1;
+    char* writebuf;
+    uintptr_t* sex_male_collapsed_interleaved;
+    uintptr_t* genovec;
+    uintptr_t* phasepresent;
+    uintptr_t* phaseinfo;
+    if (bigstack_alloc_ul(sample_ctv * kWordsPerVec, &sex_male_collapsed_interleaved) ||
+	bigstack_alloc_c(writebuf_alloc, &writebuf) ||
+	bigstack_alloc_ul(sample_ctl2, &genovec) ||
+        bigstack_alloc_ul(sample_ctl, &phasepresent) ||
+	bigstack_alloc_ul(sample_ctl, &phaseinfo)) {
+      goto export_ox_hapslegend_ret_NOMEM;
+    }
+    fill_interleaved_mask_vec(sex_male_collapsed, sample_ctv, sex_male_collapsed_interleaved);
+    // assumes little-endian
+    // 3 = 1|0, not missing
+    // 4..7 = male chrX
+    // user's responsibility to split off PARs
+    uint32_t genotext[7];
+    genotext[0] = 0x20302030;
+    genotext[2] = 0x20312031;
+    genotext[4] = 0x202d2030;
+    genotext[6] = 0x202d2031;
+    if (ref_allele_second) {
+      genotext[1] = 0x20302031;
+      genotext[3] = 0x20312030;
+    } else {
+      genotext[1] = 0x20312030;
+      genotext[3] = 0x20302031;
+    }
+#ifndef NDEBUG
+    genotext[5] = 0x21475542; // "BUG!"
+#endif
+    uint32_t* cur_genotext = genotext;
+    if (is_haploid_or_mt && (!is_x)) {
+      cur_genotext = &(genotext[4]);
+    }
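+    // (each uint32_t packs four ASCII bytes, little-endian: 0x20302030 is
+    //  "0 0 ", 0x20312031 is "1 1 ", 0x202d2030 is "0 - ", etc.; the lookup
+    //  index below is genotype + 2*phase_bit, plus 4 for male chrX)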
+    char* writebuf_flush = &(writebuf[kCompressStreamBlock]);
+    char* write_iter = writebuf;
+    strcpy(outname_end, ".haps");
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto export_ox_hapslegend_ret_OPEN_FAIL;
+    }
+    LOGPRINTFWW5("Writing %s ... ", outname);
+    fputs("0%", stdout);
+    fflush(stdout);
+    uint32_t chr_blen = 0;
+    uint32_t pct = 0;
+    uint32_t next_print_variant_idx = variant_ct / 100;
+    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      if (variant_uidx >= chr_end) {
+	do {
+	  ++chr_fo_idx;
+	  chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	} while (variant_uidx >= chr_end);
+	const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	char* chr_name_end = chr_name_write(cip, chr_idx, chr_buf);
+	*chr_name_end++ = ' ';
+	chr_blen = (uintptr_t)(chr_name_end - chr_buf);
+	is_x = (chr_idx == x_code);
+	is_haploid_or_mt = is_set(cip->haploid_mask, chr_idx) || (chr_idx == mt_code);
+	if ((!is_haploid_or_mt) || is_x) {
+	  cur_genotext = genotext;
+	} else {
+	  cur_genotext = &(genotext[4]);
+	}
+      }
+      uint32_t phasepresent_ct;
+      reterr = pgr_read_refalt1_genovec_hphase_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, variant_uidx, simple_pgrp, genovec, phasepresent, phaseinfo, &phasepresent_ct);
+      if (reterr) {
+	goto export_ox_hapslegend_ret_PGR_FAIL;
+      }
+      zero_trailing_quaters(sample_ct, genovec);
+      if (!phasepresent_ct) {
+	// phaseinfo is NOT cleared in this case
+	fill_ulong_zero(sample_ctl, phaseinfo);
+      }
+      uint32_t genocounts[4];
+      genovec_count_freqs_unsafe(genovec, sample_ct, genocounts);
+      if (phasepresent_ct != genocounts[1]) {
+	logprint("\n");
+	LOGERRPRINTF("Error: '--export haps%s' must be used with a fully phased dataset.\n", just_haps? "" : "legend");
+	goto export_ox_hapslegend_ret_INCONSISTENT_INPUT;
+      } else if (genocounts[3]) {
+	logprint("\n");
+	LOGERRPRINTF("Error: '--export haps%s' cannot be used with missing genotype calls.\n", just_haps? "" : "legend");
+	goto export_ox_hapslegend_ret_INCONSISTENT_INPUT;
+      }
+      if (is_haploid_or_mt) {
+	// verify that there are no het haploids (treating MT as haploid here)
+	if (is_x) {
+	  genovec_count_subset_freqs(genovec, sex_male_collapsed_interleaved, sample_ct, male_ct, genocounts);
+	}
+	if (genocounts[1]) {
+	  logprint("\n");
+	  LOGERRPRINTFWW("Error: '--export haps%s' cannot be used when heterozygous haploid/MT calls are present.%s\n", just_haps? "" : "legend", (is_x && (variant_bps[variant_uidx] <= 2781479))? " (Did you forget --split-par?)" : "");
+	  goto export_ox_hapslegend_ret_INCONSISTENT_INPUT;
+	}
+      }
+      uintptr_t variant_allele_idx_base = variant_uidx * 2;
+      if (variant_allele_idxs) {
+	variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+      }
+      if (refalt1_select) {
+	ref_allele_idx = refalt1_select[variant_uidx * 2];
+	alt1_allele_idx = refalt1_select[variant_uidx * 2 + 1];
+      }
+      // this logic only works in the biallelic case
+      if (ref_allele_second + ref_allele_idx == 1) {
+	genovec_invert_unsafe(sample_ct, genovec);
+	zero_trailing_quaters(sample_ct, genovec);
+      }
+      if (just_haps) {
+	write_iter = memcpya(write_iter, chr_buf, chr_blen);
+	write_iter = strcpyax(write_iter, variant_ids[variant_uidx], ' ');
+	write_iter = uint32toa_x(variant_bps[variant_uidx], ' ', write_iter);
+	char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+        if (ref_allele_second) {
+	  write_iter = strcpyax(write_iter, cur_alleles[alt1_allele_idx], ' ');
+	  write_iter = strcpya(write_iter, cur_alleles[ref_allele_idx]);
+	} else {
+	  write_iter = strcpyax(write_iter, cur_alleles[ref_allele_idx], ' ');
+	  write_iter = strcpya(write_iter, cur_alleles[alt1_allele_idx]);
+	}
+	*write_iter++ = ' ';
+      }
+      uint32_t* write_iter_ui_alias = (uint32_t*)write_iter;
+      uint32_t inner_loop_last = kBitsPerWordD2 - 1;
+      uint32_t widx = 0;
+      if (!is_x) {
+	while (1) {
+	  if (widx >= sample_ctl2_m1) {
+	    if (widx > sample_ctl2_m1) {
+	      break;
+	    }
+	    inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	  }
+	  uintptr_t genovec_word = genovec[widx];
+	  const uint32_t phaseinfo_halfword = ((halfword_t*)phaseinfo)[widx];
+	  for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+	    const uintptr_t cur_geno = genovec_word & 3;
+	    *write_iter_ui_alias++ = cur_genotext[cur_geno + 2 * ((phaseinfo_halfword >> sample_idx_lowbits) & 1)];
+	    genovec_word >>= 2;
+	  }
+	  ++widx;
+	}
+      } else {
+	while (1) {
+	  if (widx >= sample_ctl2_m1) {
+	    if (widx > sample_ctl2_m1) {
+	      break;
+	    }
+	    inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	  }
+	  uintptr_t genovec_word = genovec[widx];
+	  const uint32_t phaseinfo_halfword = ((halfword_t*)phaseinfo)[widx];
+	  const uint32_t male_halfword = ((const halfword_t*)sex_male_collapsed)[widx];
+	  
+	  for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+	    const uintptr_t cur_geno = genovec_word & 3;
+	    if (cur_geno == 2) {
+	      assert(!((phaseinfo_halfword >> sample_idx_lowbits) & 1));
+	    }
+	    *write_iter_ui_alias++ = cur_genotext[cur_geno + 2 * ((phaseinfo_halfword >> sample_idx_lowbits) & 1) + 4 * ((male_halfword >> sample_idx_lowbits) & 1)];
+	    genovec_word >>= 2;
+	  }
+	  ++widx;
+	}
+      }
+      write_iter = (char*)write_iter_ui_alias;
+      decr_append_binary_eoln(&write_iter);
+      if (write_iter >= writebuf_flush) {
+	if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	  goto export_ox_hapslegend_ret_WRITE_FAIL;
+	}
+	write_iter = writebuf;
+      }
+      if (variant_idx >= next_print_variant_idx) {
+	if (pct > 10) {
+	  putc_unlocked('\b', stdout);
+	}
+	pct = (variant_idx * 100LLU) / variant_ct;
+	printf("\b\b%u%%", pct++);
+	fflush(stdout);
+	next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+      }
+    }
+    if (write_iter != writebuf) {
+      if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	goto export_ox_hapslegend_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto export_ox_hapslegend_ret_WRITE_FAIL;
+    }
+    if (pct > 10) {
+      putc_unlocked('\b', stdout);
+    }
+    fputs("\b\b", stdout);
+    LOGPRINTF("done.\n");
+  }
+  while (0) {
+  export_ox_hapslegend_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  export_ox_hapslegend_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  export_ox_hapslegend_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  export_ox_hapslegend_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  export_ox_hapslegend_ret_PGR_FAIL:
+    if (reterr != kPglRetReadFail) {
+      logprint("\n");
+      logerrprint("Error: Malformed .pgen file.\n");
+    }
+  }
+  fclose_cond(outfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+static uintptr_t** g_missing_acc2 = nullptr;
+static uint32_t* g_variant_bytects[2] = {nullptr, nullptr};
+static uint32_t g_ref_allele_second = 0;
+static uint32_t g_bgen_compressed_buf_max = 0;
+static uint32_t g_y_start = 0;
+static uint32_t g_y_end = 0;
+
+static const uint16_t bgen11_hardcall_usis[] = {32768, 0, 0, 0,
+						0, 32768, 0, 0,
+						0, 0, 32768, 0,
+						0, 0, 0, 0};
+
+THREAD_FUNC_DECL export_bgen11_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  pgen_reader_t* pgrp = g_pgr_ptrs[tidx];
+  uintptr_t* genovec = g_genovecs[tidx];
+  const uint32_t sample_ct = g_sample_ct;
+  const uint32_t acc2_vec_ct = QUATERCT_TO_VECCT(sample_ct);
+  const uint32_t acc4_vec_ct = acc2_vec_ct * 2;
+  const uint32_t acc8_vec_ct = acc2_vec_ct * 4;
+  uintptr_t* missing_acc2 = g_missing_acc2[tidx];
+  uintptr_t* missing_acc4 = &(missing_acc2[acc2_vec_ct * kWordsPerVec]);
+  uintptr_t* missing_acc8 = &(missing_acc4[acc4_vec_ct * kWordsPerVec]);
+  uintptr_t* missing_acc32 = &(missing_acc8[acc8_vec_ct * kWordsPerVec]);
+  uintptr_t* dosage_present = g_dosage_presents? g_dosage_presents[tidx] : nullptr;
+  dosage_t* dosage_vals = dosage_present? g_dosage_val_bufs[tidx] : nullptr;
+  uint16_t* bgen_geno_buf = g_bgen_geno_bufs[tidx];
+  const uintptr_t* variant_include = g_variant_include;
+  const uintptr_t* sample_include = g_sample_include;
+  const uint32_t* sample_include_cumulative_popcounts = g_sample_include_cumulative_popcounts;
+  const uintptr_t* sex_male_collapsed_interleaved = g_sex_male_collapsed_interleaved;
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  const uint32_t sample_ctl2_m1 = QUATERCT_TO_WORDCT(sample_ct) - 1;
+  const uint32_t bgen_geno_buf_blen = 6 * sample_ct;
+  const uint32_t bgen_compressed_buf_max = g_bgen_compressed_buf_max;
+  const alt_allele_ct_t* refalt1_select = g_refalt1_select;
+  uint32_t is_y = 0;
+  uint32_t y_thresh = g_y_start;
+  const uint32_t y_end = g_y_end;
+  const uint32_t ref_allele_second = g_ref_allele_second;
+  uint32_t vidx_rem3 = 3;
+  uint32_t vidx_rem15d3 = 5;
+  uint32_t vidx_rem255d15 = 17;
+  uint32_t ref_allele_idx = 0;
+  uint32_t parity = 0;
+  fill_ulong_zero(acc2_vec_ct * kWordsPerVec * 23, missing_acc2);
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uintptr_t cur_block_write_ct = g_cur_block_write_ct;
+    uint32_t write_idx = (tidx * cur_block_write_ct) / calc_thread_ct;
+    const uint32_t write_idx_end = ((tidx + 1) * cur_block_write_ct) / calc_thread_ct;
+    unsigned char* writebuf_iter = &(g_writebufs[parity][write_idx * ((uintptr_t)bgen_compressed_buf_max)]);
+    uint32_t* variant_bytect_iter = &(g_variant_bytects[parity][write_idx]);
+    uint32_t variant_uidx = g_read_variant_uidx_starts[tidx];
+    for (; write_idx < write_idx_end; ++write_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      if (variant_uidx >= y_thresh) {
+	if (variant_uidx < y_end) {
+	  y_thresh = y_end;
+	  is_y = 1;
+	} else {
+	  y_thresh = 0xffffffffU;
+	  is_y = 0;
+	}
+      }
+      if (refalt1_select) {
+	ref_allele_idx = refalt1_select[variant_uidx * 2];
+      }
+      uint32_t dosage_ct;
+      uint32_t is_explicit_alt1;
+      pglerr_t reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, variant_uidx, pgrp, genovec, dosage_present, dosage_vals, &dosage_ct, &is_explicit_alt1);
+      if (reterr) {
+	g_error_ret = reterr;
+	break;
+      }
+      if (ref_allele_idx + ref_allele_second == 1) {
+	genovec_invert_unsafe(sample_ct, genovec);
+	biallelic_dosage16_invert(dosage_ct, dosage_vals);
+      }
+      uint32_t widx = 0;
+      uint32_t inner_loop_last = kBitsPerWordD2 - 1;
+      uint16_t* bgen_geno_buf_iter = bgen_geno_buf;
+      if (!dosage_ct) {
+	while (1) {
+	  if (widx >= sample_ctl2_m1) {
+	    if (widx > sample_ctl2_m1) {
+	      break;
+	    }
+	    inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	  }
+	  uintptr_t geno_word = genovec[widx];
+	  for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+	    memcpy(bgen_geno_buf_iter, &(bgen11_hardcall_usis[(geno_word & 3) * 4]), 6);
+	    bgen_geno_buf_iter = &(bgen_geno_buf_iter[3]);
+	    geno_word >>= 2;
+	  }
+	  ++widx;
+	}
+      } else {
+	const halfword_t* dosage_present_alias = (halfword_t*)dosage_present;
+	const dosage_t* dosage_vals_iter = dosage_vals;
+	if (!is_explicit_alt1) {
+	  while (1) {
+	    if (widx >= sample_ctl2_m1) {
+	      if (widx > sample_ctl2_m1) {
+		break;
+	      }
+	      inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	    }
+	    uintptr_t geno_word = genovec[widx];
+	    uint32_t dosage_present_hw = dosage_present_alias[widx];
+	    if (!dosage_present_hw) {
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		memcpy(bgen_geno_buf_iter, &(bgen11_hardcall_usis[(geno_word & 3) * 4]), 6);
+		bgen_geno_buf_iter = &(bgen_geno_buf_iter[3]);
+		geno_word >>= 2;
+	      }
+	    } else {
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		if (dosage_present_hw & 1) {
+		  uint32_t dosage_int = *dosage_vals_iter++;
+		  dosage_int *= 2;
+		  if (dosage_int <= kDosageMax) {
+		    *bgen_geno_buf_iter++ = kDosageMax - dosage_int;
+		    *bgen_geno_buf_iter++ = dosage_int;
+		    *bgen_geno_buf_iter++ = 0;
+		  } else {
+		    dosage_int -= kDosageMax;
+		    *bgen_geno_buf_iter++ = 0;
+		    *bgen_geno_buf_iter++ = kDosageMax - dosage_int;
+		    *bgen_geno_buf_iter++ = dosage_int;
+		  }
+		} else {
+		  memcpy(bgen_geno_buf_iter, &(bgen11_hardcall_usis[(geno_word & 3) * 4]), 6);
+		  bgen_geno_buf_iter = &(bgen_geno_buf_iter[3]);
+		}
+		geno_word >>= 2;
+		dosage_present_hw >>= 1;
+	      }
+	    }
+	    ++widx;
+	  }
+	} else {
+	  // todo
+	  assert(0);
+	}
+      }
+      uLongf compressed_blen = bgen_compressed_buf_max;
+      if (compress(writebuf_iter, &compressed_blen, (const unsigned char*)bgen_geno_buf, bgen_geno_buf_blen)) {
+	// is this actually possible?
+	g_error_ret = kPglRetNomem;
+	break;
+      }
+      *variant_bytect_iter++ = compressed_blen;
+      writebuf_iter = &(writebuf_iter[bgen_compressed_buf_max]);
+      if (is_y) {
+	interleaved_mask_zero(sex_male_collapsed_interleaved, acc2_vec_ct, genovec);
+      }
+      incr_missing_row(genovec, acc2_vec_ct, missing_acc2);
+      if (!(--vidx_rem3)) {
+	unroll_zero_incr_2_4(acc2_vec_ct, missing_acc2, missing_acc4);
+	vidx_rem3 = 3;
+	if (!(--vidx_rem15d3)) {
+	  unroll_zero_incr_4_8(acc4_vec_ct, missing_acc4, missing_acc8);
+	  vidx_rem15d3 = 5;
+	  if (!(--vidx_rem255d15)) {
+	    unroll_zero_incr_8_32(acc8_vec_ct, missing_acc8, missing_acc32);
+	    vidx_rem255d15 = 17;
+	  }
+	}
+      }
+    }
+    if (is_last_block) {
+      unroll_incr_2_4(missing_acc2, acc2_vec_ct, missing_acc4);
+      unroll_incr_4_8(missing_acc4, acc4_vec_ct, missing_acc8);
+      unroll_incr_8_32(missing_acc8, acc8_vec_ct, missing_acc32);
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
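+
+// Editor's note: a floating-point rendering of the integer branch above
+// (standalone sketch, hypothetical name).  bgen-1.1 stores three uint16
+// genotype probabilities per sample, scaled so that 32768 = probability 1;
+// an ALT-dosage x in [0,2] decomposes as P(het)=x, P(hom-ref)=1-x for x<=1,
+// and P(hom-alt)=x-1, P(het)=2-x for x>1.
+static void bgen11_dosage_to_probs(double alt_dosage, uint16_t probs[3]) {
+  if (alt_dosage <= 1.0) {
+    probs[0] = (uint16_t)((1.0 - alt_dosage) * 32768 + 0.5);  // hom ref
+    probs[1] = (uint16_t)(alt_dosage * 32768 + 0.5);          // het
+    probs[2] = 0;                                             // hom alt
+  } else {
+    probs[0] = 0;
+    probs[1] = (uint16_t)((2.0 - alt_dosage) * 32768 + 0.5);
+    probs[2] = (uint16_t)((alt_dosage - 1.0) * 32768 + 0.5);
+  }
+}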
+
+pglerr_t export_bgen11(const char* outname, const uintptr_t* sample_include, uint32_t* sample_include_cumulative_popcounts, const uintptr_t* sex_male, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const alt_allele_ct_t* refalt1_select, uint32_t sample_ct, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uint32_t max_thread_ct, exportf_flags_t exp [...]
+  // isomorphic to export_ox_gen().
+  assert(sample_ct);
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  // use gzip instead of zstd here.
+  ZWRAP_useZSTDcompression(0);
+  {
+    const uint32_t sample_ctv = BITCT_TO_VECCT(sample_ct);
+    const uint32_t max_chr_slen = get_max_chr_slen(cip);
+    const uintptr_t bgen_compressed_buf_max = compressBound(6LU * sample_ct);
+#ifdef __LP64__
+    if (bgen_compressed_buf_max > 0xffffffffU) {
+      logerrprint("Error: Too many samples for .bgen format.\n");
+      goto export_bgen11_ret_INCONSISTENT_INPUT;
+    }
+#endif
+    g_bgen_compressed_buf_max = bgen_compressed_buf_max;
+    const uintptr_t writebuf_len = bgen_compressed_buf_max + 2 * max_allele_slen + 2 * kMaxIdSlen + 32;
+    char* chr_buf;
+    unsigned char* writebuf;
+    uintptr_t* sex_male_collapsed_tmp;
+    if (bigstack_alloc_c(max_chr_slen, &chr_buf) ||
+        bigstack_alloc_uc(writebuf_len, &writebuf) ||
+	bigstack_alloc_ul(sample_ctv * kWordsPerVec, &g_sex_male_collapsed_interleaved) ||
+	bigstack_alloc_ul(sample_ctv * kWordsPerVec, &sex_male_collapsed_tmp)) {
+      goto export_bgen11_ret_NOMEM;
+    }
+    copy_bitarr_subset(sex_male, sample_include, sample_ct, sex_male_collapsed_tmp);
+    fill_interleaved_mask_vec(sex_male_collapsed_tmp, sample_ctv, g_sex_male_collapsed_interleaved);
+    bigstack_reset(sex_male_collapsed_tmp);
+
+    const uintptr_t max_write_block_byte_ct = bigstack_left() / 4;
+    uint32_t max_write_block_size = kPglVblockSize;
+    while (1) {
+      // limit each write buffer to 1/4 of remaining workspace
+      if (((uint64_t)(bgen_compressed_buf_max + sizeof(int32_t))) * max_write_block_size <= max_write_block_byte_ct) {
+	break;
+      }
+      if (max_write_block_size <= kBitsPerVec) {
+	goto export_bgen11_ret_NOMEM;
+      }
+      max_write_block_size /= 2;
+    }
+    uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+    // seems to saturate around this point
+    if (calc_thread_ct > 15) {
+      calc_thread_ct = 15;
+    }
+    if (bigstack_alloc_uc(bgen_compressed_buf_max * max_write_block_size, &(g_writebufs[0])) ||
+	bigstack_alloc_uc(bgen_compressed_buf_max * max_write_block_size, &(g_writebufs[1])) ||
+	bigstack_alloc_ui(max_write_block_size, &(g_variant_bytects[0])) ||
+	bigstack_alloc_ui(max_write_block_size, &(g_variant_bytects[1])) ||
+	bigstack_alloc_ulp(calc_thread_ct, &g_missing_acc2) ||
+	bigstack_alloc_usip(calc_thread_ct, &g_bgen_geno_bufs)) {
+      goto export_bgen11_ret_NOMEM;
+    }
+    
+    const uint32_t acc2_vec_ct = QUATERCT_TO_VECCT(sample_ct);
+    const uint32_t dosage_is_present = pgfip->gflags & kfPgenGlobalDosagePresent;
+    const uintptr_t track_missing_cacheline_ct = VECCT_TO_CLCT(acc2_vec_ct * 23);
+    const uintptr_t bgen_geno_cacheline_ct = DIV_UP(6 * sample_ct, (kCacheline * k1LU));
+    const uintptr_t thread_xalloc_cacheline_ct = track_missing_cacheline_ct + bgen_geno_cacheline_ct;
+    unsigned char* main_loadbufs[2];
+    pthread_t* threads;
+    uint32_t read_block_size;
+    if (multithread_load_init(variant_include, sample_ct, variant_ct, pgr_alloc_cacheline_ct, thread_xalloc_cacheline_ct, 0, pgfip, &calc_thread_ct, &g_genovecs, dosage_is_present? (&g_dosage_presents) : nullptr, dosage_is_present? (&g_dosage_val_bufs) : nullptr, &read_block_size, main_loadbufs, &threads, &g_pgr_ptrs, &g_read_variant_uidx_starts)) {
+      goto export_bgen11_ret_NOMEM;
+    }
+    if (read_block_size > max_write_block_size) {
+      read_block_size = max_write_block_size;
+    }
+    
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto export_bgen11_ret_OPEN_FAIL;
+    }
+    // bgen 1.1 header
+    // note that \xxx character constants are interpreted in octal, so \24 is
+    // decimal 20, etc.
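+    // fields: offset of first variant record (20), header block length (20),
+    // variant count, sample count, "bgen" magic number, and flags
+    // (5 = zlib-compressed SNP blocks + layout version 1).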
+    memcpy(writebuf, "\24\0\0\0\24\0\0\0", 8);
+    memcpy(&(writebuf[8]), &variant_ct, 4);
+    memcpy(&(writebuf[12]), &sample_ct, 4);
+    memcpy(&(writebuf[16]), "bgen\5\0\0\0", 8);
+    if (fwrite_checked(writebuf, 24, outfile)) {
+      goto export_bgen11_ret_WRITE_FAIL;
+    }
+    
+    const uint32_t ref_allele_second = !(exportf_modifier & kfExportfRefFirst);
+    for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+      g_missing_acc2[tidx] = (uintptr_t*)bigstack_alloc_raw(track_missing_cacheline_ct * kCacheline);
+      g_bgen_geno_bufs[tidx] = (uint16_t*)bigstack_alloc_raw(bgen_geno_cacheline_ct * kCacheline);
+    }
+    g_sample_ct = sample_ct;
+    g_variant_include = variant_include;
+    g_sample_include = sample_include;
+    g_sample_include_cumulative_popcounts = sample_include_cumulative_popcounts;
+    g_calc_thread_ct = calc_thread_ct;
+    g_refalt1_select = refalt1_select;
+    get_xymt_start_and_end(cip, kChrOffsetY, &g_y_start, &g_y_end);
+    g_ref_allele_second = ref_allele_second;
+    g_cip = cip;
+    
+    // 6 bytes present at start of every bgen-1.1 variant record
+    memcpy(writebuf, &sample_ct, 4);
+    memcpy(&(writebuf[4]), "\0", 2);
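+    // (i.e. the sample count followed by a zero-length SNP ID field; the
+    // variant's ID string is emitted in each record's rsid slot instead.)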
+
+    // Main workflow:
+    // 1. Set n=0, load/skip block 0
+    //
+    // 2. Spawn threads processing block n
+    // 3. If n>0, write results for block (n-1)
+    // 4. Increment n by 1
+    // 5. Load/skip block n unless eof
+    // 6. Join threads
+    // 7. Goto step 2 unless eof
+    //
+    // 8. Write results for last block
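+    // Note the double-buffering: while the worker threads compress block n
+    // into one write-buffer set, the main thread flushes the
+    // already-compressed block (n-1) from the other; 'parity' alternates the
+    // roles of the two sets each iteration.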
+    const uint32_t read_block_sizel = BITCT_TO_WORDCT(read_block_size);
+    const uint32_t read_block_ct_m1 = (raw_variant_ct - 1) / read_block_size;
+    uint32_t parity = 0;
+    uint32_t read_block_idx = 0;
+    uint32_t write_variant_uidx = 0;
+    uint32_t chr_fo_idx = 0xffffffffU;
+    uint32_t chr_end = 0;
+    uint32_t chr_slen = 0;
+    
+    uint32_t prev_block_write_ct = 0;
+    uint32_t variant_idx = 0;
+    uint32_t is_last_block = 0;
+    uint32_t cur_read_block_size = read_block_size;
+    uint32_t pct = 0;
+    uint32_t next_print_variant_idx = variant_ct / 100;
+    LOGPRINTFWW5("Writing %s ... ", outname);
+    fputs("0%", stdout);
+    fflush(stdout);
+    uint32_t ref_allele_idx = 0;
+    uint32_t alt1_allele_idx = 1;
+    while (1) {
+      uintptr_t cur_block_write_ct = 0;
+      if (!is_last_block) {
+	while (read_block_idx < read_block_ct_m1) {
+	  cur_block_write_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), read_block_sizel);
+	  if (cur_block_write_ct) {
+	    break;
+	  }
+	  ++read_block_idx;
+	}
+	if (read_block_idx == read_block_ct_m1) {
+	  cur_read_block_size = raw_variant_ct - (read_block_idx * read_block_size);
+	  cur_block_write_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), BITCT_TO_WORDCT(cur_read_block_size));
+	}
+	if (pgfi_multiread(variant_include, read_block_idx * read_block_size, read_block_idx * read_block_size + cur_read_block_size, cur_block_write_ct, pgfip)) {
+	  if (variant_idx) {
+	    join_threads2z(calc_thread_ct, 0, threads);
+	    g_cur_block_write_ct = 0;
+	    error_cleanup_threads2z(export_bgen11_thread, calc_thread_ct, threads);
+	  }
+	  goto export_bgen11_ret_READ_FAIL;
+	}
+      }
+      if (variant_idx) {
+	join_threads2z(calc_thread_ct, is_last_block, threads);
+	reterr = g_error_ret;
+	if (reterr) {
+	  if (!is_last_block) {
+	    g_cur_block_write_ct = 0;
+	    error_cleanup_threads2z(export_bgen11_thread, calc_thread_ct, threads);
+	  }
+	  if (reterr == kPglRetMalformedInput) {
+	    logprint("\n");
+	    logerrprint("Error: Malformed .pgen file.\n");
+	  }
+	  goto export_bgen11_ret_1;
+	}
+      }
+      if (!is_last_block) {
+	g_cur_block_write_ct = cur_block_write_ct;
+	compute_uidx_start_partition(variant_include, cur_block_write_ct, calc_thread_ct, read_block_idx * read_block_size, g_read_variant_uidx_starts);
+	for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	  g_pgr_ptrs[tidx]->fi.block_base = pgfip->block_base;
+	  g_pgr_ptrs[tidx]->fi.block_offset = pgfip->block_offset;
+	}
+	is_last_block = (variant_idx + cur_block_write_ct == variant_ct);
+	if (spawn_threads2z(export_bgen11_thread, calc_thread_ct, is_last_block, threads)) {
+	  goto export_bgen11_ret_THREAD_CREATE_FAIL;
+	}
+      }
+      parity = 1 - parity;
+      if (variant_idx) {
+	// write *previous* block results
+	const unsigned char* compressed_data_iter = g_writebufs[parity];
+	const uint32_t* variant_bytect_iter = g_variant_bytects[parity];
+	for (uint32_t variant_bidx = 0; variant_bidx < prev_block_write_ct; ++variant_bidx, ++write_variant_uidx) {
+	  next_set_unsafe_ck(variant_include, &write_variant_uidx);
+	  if (write_variant_uidx >= chr_end) {
+	    do {
+	      ++chr_fo_idx;
+	      chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	    } while (write_variant_uidx >= chr_end);
+	    const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	    char* chr_name_end = chr_name_write(cip, chr_idx, chr_buf);
+	    chr_slen = (uintptr_t)(chr_name_end - chr_buf);
+	  }
+	  const char* cur_variant_id = variant_ids[write_variant_uidx];
+	  const uint32_t id_slen = strlen(cur_variant_id);
+	  memcpy(&(writebuf[6]), &id_slen, 4);
+	  // deliberately clobber top two bytes
+	  unsigned char* writebuf_iter = (unsigned char*)memcpya(&(writebuf[8]), cur_variant_id, id_slen);
+	  memcpy(writebuf_iter, &chr_slen, 4);
+	  writebuf_iter = (unsigned char*)memcpya(&(writebuf_iter[2]), chr_buf, chr_slen);
+	  memcpy(writebuf_iter, &(variant_bps[write_variant_uidx]), 4);
+	  writebuf_iter = &(writebuf_iter[4]);
+	  uintptr_t variant_allele_idx_base = write_variant_uidx * 2;
+	  if (variant_allele_idxs) {
+	    variant_allele_idx_base = variant_allele_idxs[write_variant_uidx];
+	  }
+	  char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+	  if (refalt1_select) {
+	    ref_allele_idx = refalt1_select[write_variant_uidx * 2];
+	    alt1_allele_idx = refalt1_select[write_variant_uidx * 2 + 1];
+	  }
+	  const char* first_allele;
+	  const char* second_allele;
+	  if (ref_allele_second) {
+	    first_allele = cur_alleles[alt1_allele_idx];
+	    second_allele = cur_alleles[ref_allele_idx];
+	  } else {
+	    first_allele = cur_alleles[ref_allele_idx];
+	    second_allele = cur_alleles[alt1_allele_idx];
+	  }
+	  uint32_t allele_slen = strlen(first_allele);
+	  memcpy(writebuf_iter, &allele_slen, 4);
+	  writebuf_iter = (unsigned char*)memcpya(&(writebuf_iter[4]), first_allele, allele_slen);
+	  allele_slen = strlen(second_allele);
+	  memcpy(writebuf_iter, &allele_slen, 4);
+	  writebuf_iter = (unsigned char*)memcpya(&(writebuf_iter[4]), second_allele, allele_slen);
+	  const uint32_t cur_variant_bytect = *variant_bytect_iter++;
+	  memcpy(writebuf_iter, &cur_variant_bytect, 4);
+	  writebuf_iter = &(writebuf_iter[4]);
+	  memcpy(writebuf_iter, compressed_data_iter, cur_variant_bytect);
+	  writebuf_iter = &(writebuf_iter[cur_variant_bytect]);
+	  compressed_data_iter = &(compressed_data_iter[bgen_compressed_buf_max]);
+	  if (fwrite_checked(writebuf, writebuf_iter - writebuf, outfile)) {
+	    if (variant_idx < variant_ct) {
+	      join_threads2z(calc_thread_ct, is_last_block, threads);
+	      if (!is_last_block) {
+		g_cur_block_write_ct = 0;
+		error_cleanup_threads2z(export_bgen11_thread, calc_thread_ct, threads);
+	      }
+	    }
+	    goto export_bgen11_ret_WRITE_FAIL;
+	  }
+	}
+      }
+      if (variant_idx == variant_ct) {
+	break;
+      }
+      if (variant_idx >= next_print_variant_idx) {
+	if (pct > 10) {
+	  putc_unlocked('\b', stdout);
+	}
+	pct = (variant_idx * 100LLU) / variant_ct;
+	printf("\b\b%u%%", pct++);
+	fflush(stdout);
+	next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+      }
+      ++read_block_idx;
+      prev_block_write_ct = cur_block_write_ct;
+      variant_idx += cur_block_write_ct;
+      pgfip->block_base = main_loadbufs[parity];
+    }
+    if (fclose_null(&outfile)) {
+      goto export_bgen11_ret_WRITE_FAIL;
+    }
+    if (pct > 10) {
+      putc_unlocked('\b', stdout);
+    }
+    fputs("\b\b", stdout);
+    LOGPRINTF("done.\n");
+    const uint32_t sample_ctav2 = acc2_vec_ct * kQuatersPerVec;
+    const uintptr_t acc32_offset = acc2_vec_ct * (7 * k1LU * kWordsPerVec);
+    uint32_t* scrambled_missing_cts = (uint32_t*)(&(g_missing_acc2[0][acc32_offset]));
+    for (uint32_t tidx = 1; tidx < calc_thread_ct; ++tidx) {
+      const uint32_t* thread_scrambled_missing_cts = (uint32_t*)(&(g_missing_acc2[tidx][acc32_offset]));
+      for (uint32_t uii = 0; uii < sample_ctav2; ++uii) {
+	scrambled_missing_cts[uii] += thread_scrambled_missing_cts[uii];
+      }
+    }
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+      const uint32_t scrambled_idx = scramble_2_4_8_32(sample_idx);
+      sample_missing_geno_cts[sample_idx] = scrambled_missing_cts[scrambled_idx];
+    }
+  }
+  while (0) {
+  export_bgen11_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  export_bgen11_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  export_bgen11_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  export_bgen11_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+#ifdef __LP64__
+  export_bgen11_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+#endif
+  export_bgen11_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ export_bgen11_ret_1:
+  ZWRAP_useZSTDcompression(1);
+  fclose_cond(outfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+/*
+static uintptr_t** g_phasepresents = nullptr;
+static uintptr_t** g_phaseinfos = nullptr;
+static uintptr_t** g_dphase_presents = nullptr;
+
+static uint32_t g_bgen_bit_precision = 0;
+static uint32_t g_bgen_uncompressed_buf_max = 0;
+
+// memcpy(target,
+//   &(g_bgen_hardcall_write[cur_geno * biallelic_diploid_byte_ct]),
+//   biallelic_diploid_byte_ct) should do the right thing
+static unsigned char* g_bgen_hardcall_write = nullptr;
+
+THREAD_FUNC_DECL export_bgen13_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  pgen_reader_t* pgrp = g_pgr_ptrs[tidx];
+  uintptr_t* genovec = g_genovecs[tidx];
+  const uint32_t sample_ct = g_sample_ct;
+  const uint32_t acc2_vec_ct = QUATERCT_TO_VECCT(sample_ct);
+  const uint32_t acc4_vec_ct = acc2_vec_ct * 2;
+  const uint32_t acc8_vec_ct = acc2_vec_ct * 4;
+  uintptr_t* missing_acc2 = g_missing_acc2[tidx];
+  uintptr_t* missing_acc4 = &(missing_acc2[acc2_vec_ct * kWordsPerVec]);
+  uintptr_t* missing_acc8 = &(missing_acc4[acc4_vec_ct * kWordsPerVec]);
+  uintptr_t* missing_acc32 = &(missing_acc8[acc8_vec_ct * kWordsPerVec]);
+  uintptr_t* phasepresent = nullptr;
+  uintptr_t* phaseinfo = nullptr;
+  if (g_phasepresents) {
+    phasepresent = g_phasepresents[tidx];
+    phaseinfo = g_phaseinfos[tidx];
+  }
+  uintptr_t* dosage_present = g_dosage_presents? g_dosage_presents[tidx] : nullptr;
+  uintptr_t* dphase_present = g_dphase_presents? g_dphase_presents[tidx] : nullptr;
+  dosage_t* dosage_vals = dosage_present? g_dosage_val_bufs[tidx] : nullptr;
+  unsigned char* uncompressed_bgen_geno_buf = g_thread_wkspaces[tidx];
+  const uintptr_t* variant_include = g_variant_include;
+  const chr_info_t* cip = g_cip;
+  const uintptr_t* sample_include = g_sample_include;
+  const uint32_t* sample_include_cumulative_popcounts = g_sample_include_cumulative_popcounts;
+  const uintptr_t* sex_male_collapsed_interleaved = g_sex_male_collapsed_interleaved;
+  const unsigned char* bgen_diploid_hardcall_write = g_bgen_hardcall_write;
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  const uint32_t sample_ctl2_m1 = QUATERCT_TO_WORDCT(sample_ct) - 1;
+  const uint32_t bit_precision = g_bgen_bit_precision;
+  const uint32_t bytes_per_prob = DIV_UP(bit_precision, CHAR_BIT);
+
+  // note that this applies to both unphased and phased output, for different
+  // reasons
+  const uint32_t biallelic_diploid_byte_ct = 2 * bytes_per_prob;
+  const unsigned char* bgen_haploid_hardcall_write = &(bgen_diploid_hardcall_write[4 * biallelic_diploid_byte_ct]);
+
+  const uint32_t bgen_uncompressed_buf_max = g_bgen_uncompressed_buf_max;
+  const uint32_t bgen_compressed_buf_max = g_bgen_compressed_buf_max;
+  const alt_allele_ct_t* refalt1_select = g_refalt1_select;
+  const int32_t x_code = cip->xymt_codes[kChrOffsetX];
+  const int32_t y_code = cip->xymt_codes[kChrOffsetY];
+  const int32_t mt_code = cip->xymt_codes[kChrOffsetMT];
+  uint32_t chr_fo_idx = 0xffffffffU; // deliberate overflow
+  uint32_t chr_end = 0;
+  uint32_t is_x = 0;
+  uint32_t is_y = 0;
+
+  uint32_t is_haploid_or_mt = 0; // includes chrX and chrY
+  // for bgen-1.2/1.3 and VCF/BCF export, MT ploidy is 1 unless the call is
+  //   heterozygous (i.e. it's treated the same way as an ordinary haploid
+  //   chromosome); similarly for chrX male ploidy
+  // for bgen-1.2/1.3, chrY female (but not unknown-sex) ploidy is 0 when
+  //   genotype is missing
+
+  const uint32_t ref_allele_second = g_ref_allele_second;
+  uint32_t vidx_rem3 = 3;
+  uint32_t vidx_rem15d3 = 5;
+  uint32_t vidx_rem255d15 = 17;
+  uint32_t ref_allele_idx = 0;
+  uint32_t parity = 0;
+  fill_ulong_zero(acc2_vec_ct * kWordsPerVec * 23, missing_acc2);
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uintptr_t cur_block_write_ct = g_cur_block_write_ct;
+    uint32_t write_idx = (tidx * cur_block_write_ct) / calc_thread_ct;
+    const uint32_t write_idx_end = ((tidx + 1) * cur_block_write_ct) / calc_thread_ct;
+    unsigned char* writebuf_iter = &(g_writebufs[parity][write_idx * ((uintptr_t)bgen_compressed_buf_max)]);
+    uint32_t* variant_bytect_iter = &(g_variant_bytects[parity][write_idx]);
+    uint32_t variant_uidx = g_read_variant_uidx_starts[tidx];
+    for (; write_idx < write_idx_end; ++write_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      if (variant_uidx >= chr_end) {
+	do {
+	  ++chr_fo_idx;
+	  chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	} while (variant_uidx >= chr_end);
+	const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	is_x = (chr_idx == x_code);
+	is_y = (chr_idx == y_code);
+	is_haploid_or_mt = is_set(cip->haploid_mask, chr_idx) || (chr_idx == mt_code);
+      }
+      if (refalt1_select) {
+	ref_allele_idx = refalt1_select[variant_uidx * 2];
+      }
+      // todo: export phase info
+      // todo: multiallelic cases
+      uint32_t dosage_ct;
+      uint32_t is_explicit_alt1;
+      pglerr_t reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, variant_uidx, pgrp, genovec, dosage_present, dosage_vals, &dosage_ct, &is_explicit_alt1);
+      if (reterr) {
+	g_error_ret = reterr;
+	break;
+      }
+      if (ref_allele_idx + ref_allele_second == 1) {
+	genovec_invert_unsafe(sample_ct, genovec);
+	biallelic_dosage16_invert(dosage_ct, dosage_vals);
+      }
+      unsigned char* bgen_geno_buf_iter = uncompressed_bgen_geno_buf;
+      // 4 bytes: # of samples
+      // 2 bytes: # of alleles
+      // 1 byte: minimum ploidy
+      // 1 byte: maximum ploidy
+      // sample_ct bytes: high bit = missing, low bits = ploidy
+      // 1 byte: is_phased
+      // 1 byte: bit_precision
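+      // remaining bytes: the bit-packed probability data (written
+      // byte-aligned here, bytes_per_prob bytes per stored probability)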
+      bgen_geno_buf_iter = (unsigned char*)memcpya(bgen_geno_buf_iter, &sample_ct, 4);
+      uint32_t widx = 0;
+      uint32_t inner_loop_last = kBitsPerWordD2 - 1;
+      if (!dosage_ct) {
+	if (!is_haploid_or_mt) {
+	  // 2 alleles, min ploidy == max ploidy == 2
+	  const uint32_t allele_ct_and_ploidy = 0x2020002;
+	  bgen_geno_buf_iter = (unsigned char*)memcpya(bgen_geno_buf_iter, &allele_ct_and_ploidy, 4);
+	  unsigned char* sample_ploidy_and_missingness_iter = bgen_geno_buf_iter;
+	  bgen_geno_buf_iter = (unsigned char*)memseta(bgen_geno_buf_iter, 2, sample_ct);
+	  *bgen_geno_buf_iter++ = 0; // not phased
+	  *bgen_geno_buf_iter++ = bit_precision;
+	  while (1) {
+	    if (widx >= sample_ctl2_m1) {
+	      if (widx > sample_ctl2_m1) {
+		break;
+	      }
+	      inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	    }
+	    uintptr_t geno_word = genovec[widx];
+	    for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+	      const uintptr_t cur_geno = geno_word & 3;
+	      bgen_geno_buf_iter = (unsigned char*)memcpya(bgen_geno_buf_iter, &(bgen_diploid_hardcall_write[cur_geno * biallelic_diploid_byte_ct]), biallelic_diploid_byte_ct);
+	      if (cur_geno == 3) {
+		// maybe handle this in a different loop?
+		sample_ploidy_and_missingness_iter[sample_idx_lowbits] = 130;
+	      }
+	      geno_word >>= 2;
+	    }
+	    ++widx;
+	    sample_ploidy_and_missingness_iter = &(sample_ploidy_and_missingness_iter[kBitsPerWordD2]);
+	  }
+	} else if (is_x) {
+	} else {
+	  // ...
+	}
+      } else {
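+	// todo: this dosage branch is still the bgen-1.1 logic: it copies
+	// fixed 16-bit probability triples (and advances the pointer as if it
+	// were still a uint16_t*); it must be adapted to the layout-2 record
+	// format described above.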
+	const halfword_t* dosage_present_alias = (halfword_t*)dosage_present;
+	const dosage_t* dosage_vals_iter = dosage_vals;
+	if (!is_explicit_alt1) {
+	  while (1) {
+	    if (widx >= sample_ctl2_m1) {
+	      if (widx > sample_ctl2_m1) {
+		break;
+	      }
+	      inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	    }
+	    uintptr_t geno_word = genovec[widx];
+	    uint32_t dosage_present_hw = dosage_present_alias[widx];
+	    if (!dosage_present_hw) {
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		memcpy(bgen_geno_buf_iter, &(bgen11_hardcall_usis[(geno_word & 3) * 4]), 6);
+		bgen_geno_buf_iter = &(bgen_geno_buf_iter[3]);
+		geno_word >>= 2;
+	      }
+	    } else {
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		if (dosage_present_hw & 1) {
+		  uint32_t dosage_int = *dosage_vals_iter++;
+		  dosage_int *= 2;
+		  if (dosage_int <= kDosageMax) {
+		    *bgen_geno_buf_iter++ = kDosageMax - dosage_int;
+		    *bgen_geno_buf_iter++ = dosage_int;
+		    *bgen_geno_buf_iter++ = 0;
+		  } else {
+		    dosage_int -= kDosageMax;
+		    *bgen_geno_buf_iter++ = 0;
+		    *bgen_geno_buf_iter++ = kDosageMax - dosage_int;
+		    *bgen_geno_buf_iter++ = dosage_int;
+		  }
+		} else {
+		  memcpy(bgen_geno_buf_iter, &(bgen11_hardcall_usis[(geno_word & 3) * 4]), 6);
+		  bgen_geno_buf_iter = &(bgen_geno_buf_iter[3]);
+		}
+		geno_word >>= 2;
+		dosage_present_hw >>= 1;
+	      }
+	    }
+	    ++widx;
+	  }
+	} else {
+	  // todo
+	  assert(0);
+	}
+      }
+      const uintptr_t bgen_geno_buf_blen = (uintptr_t)(bgen_geno_buf_iter - uncompressed_bgen_geno_buf);
+      uLongf compressed_blen = bgen_compressed_buf_max;
+      if (compress(writebuf_iter, &compressed_blen, uncompressed_bgen_geno_buf, bgen_geno_buf_blen)) {
+	// is this actually possible?
+	g_error_ret = kPglRetNomem;
+	break;
+      }
+      *variant_bytect_iter++ = compressed_blen;
+      writebuf_iter = &(writebuf_iter[bgen_compressed_buf_max]);
+      if (is_y) {
+	interleaved_mask_zero(sex_male_collapsed_interleaved, acc2_vec_ct, genovec);
+      }
+      incr_missing_row(genovec, acc2_vec_ct, missing_acc2);
+      if (!(--vidx_rem3)) {
+	unroll_zero_incr_2_4(acc2_vec_ct, missing_acc2, missing_acc4);
+	vidx_rem3 = 3;
+	if (!(--vidx_rem15d3)) {
+	  unroll_zero_incr_4_8(acc4_vec_ct, missing_acc4, missing_acc8);
+	  vidx_rem15d3 = 5;
+	  if (!(--vidx_rem255d15)) {
+	    unroll_zero_incr_8_32(acc8_vec_ct, missing_acc8, missing_acc32);
+	    vidx_rem255d15 = 17;
+	  }
+	}
+      }
+    }
+    if (is_last_block) {
+      unroll_incr_2_4(missing_acc2, acc2_vec_ct, missing_acc4);
+      unroll_incr_4_8(missing_acc4, acc4_vec_ct, missing_acc8);
+      unroll_incr_8_32(missing_acc8, acc8_vec_ct, missing_acc32);
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+pglerr_t export_bgen13(const char* outname, const uintptr_t* sample_include, uint32_t* sample_include_cumulative_popcounts, const uintptr_t* sex_male, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const alt_allele_ct_t* refalt1_select, uint32_t sample_ct, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uint32_t max_thread_ct, exportf_flags_t exp [...]
+  assert(sample_ct);
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  // compress with zlib for bgen 1.2, zstd for bgen 1.3
+  if (exportf_modifier & kfExportfBgen12) {
+    ZWRAP_useZSTDcompression(0);
+  }
+  {
+    if (exportf_bits > 16) {
+      logerrprint("Error: bits= parameter is currently limited to 16.  (This is sufficient to\ncapture all information in a .pgen file.)\n");
+      reterr = kPglRetNotYetSupported;
+      goto export_bgen13_ret_1;
+    }
+    if (pgfip->gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent)) {
+      logerrprint("Error: Export of phase information to .bgen files is currently under\ndevelopment.\n");
+      reterr = kPglRetNotYetSupported;
+      goto export_bgen13_ret_1;
+    }
+    const uint32_t sample_ctv = BITCT_TO_VECCT(sample_ct);
+    const uint32_t max_chr_slen = get_max_chr_slen(cip);
+    const uintptr_t bgen_compressed_buf_max = compressBound(6LU * sample_ct);
+#ifdef __LP64__
+    if (bgen_compressed_buf_max > 0xffffffffU) {
+      logerrprint("Error: Too many samples for .bgen format.\n");
+      goto export_bgen13_ret_INCONSISTENT_INPUT;
+    }
+#endif
+    g_bgen_compressed_buf_max = bgen_compressed_buf_max;
+    const uintptr_t writebuf_len = bgen_compressed_buf_max + 2 * max_allele_slen + 2 * kMaxIdSlen + 32;
+    char* chr_buf;
+    unsigned char* writebuf;
+    uintptr_t* sex_male_collapsed_tmp;
+    if (bigstack_alloc_c(max_chr_slen, &chr_buf) ||
+        bigstack_alloc_uc(writebuf_len, &writebuf) ||
+	bigstack_alloc_ul(sample_ctv * kWordsPerVec, &g_sex_male_collapsed_interleaved) ||
+	bigstack_alloc_ul(sample_ctv * kWordsPerVec, &sex_male_collapsed_tmp)) {
+      goto export_bgen13_ret_NOMEM;
+    }
+    copy_bitarr_subset(sex_male, sample_include, sample_ct, sex_male_collapsed_tmp);
+    fill_interleaved_mask_vec(sex_male_collapsed_tmp, sample_ctv, g_sex_male_collapsed_interleaved);
+    bigstack_reset(sex_male_collapsed_tmp);
+
+    const uintptr_t max_write_block_byte_ct = bigstack_left() / 4;
+    uint32_t max_write_block_size = kPglVblockSize;
+    while (1) {
+      // limit each write buffer to 1/4 of remaining workspace
+      if (((uint64_t)(bgen_compressed_buf_max + sizeof(int32_t))) * max_write_block_size <= max_write_block_byte_ct) {
+	break;
+      }
+      if (max_write_block_size <= kBitsPerVec) {
+	goto export_bgen13_ret_NOMEM;
+      }
+      max_write_block_size /= 2;
+    }
+    uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+    // seems to saturate around this point
+    if (calc_thread_ct > 15) {
+      calc_thread_ct = 15;
+    }
+    if (bigstack_alloc_uc(bgen_compressed_buf_max * max_write_block_size, &(g_writebufs[0])) ||
+	bigstack_alloc_uc(bgen_compressed_buf_max * max_write_block_size, &(g_writebufs[1])) ||
+	bigstack_alloc_ui(max_write_block_size, &(g_variant_bytects[0])) ||
+	bigstack_alloc_ui(max_write_block_size, &(g_variant_bytects[1])) ||
+	bigstack_alloc_ulp(calc_thread_ct, &g_missing_acc2) ||
+	bigstack_alloc_usip(calc_thread_ct, &g_bgen_geno_bufs)) {
+      goto export_bgen13_ret_NOMEM;
+    }
+    
+    const uint32_t acc2_vec_ct = QUATERCT_TO_VECCT(sample_ct);
+    const uint32_t dosage_is_present = pgfip->gflags & kfPgenGlobalDosagePresent;
+    const uintptr_t track_missing_cacheline_ct = VECCT_TO_CLCT(acc2_vec_ct * 23);
+    const uintptr_t bgen_geno_cacheline_ct = DIV_UP(6 * sample_ct, (kCacheline * k1LU));
+    const uintptr_t thread_xalloc_cacheline_ct = track_missing_cacheline_ct + bgen_geno_cacheline_ct;
+    unsigned char* main_loadbufs[2];
+    pthread_t* threads;
+    uint32_t read_block_size;
+    if (multithread_load_init(variant_include, sample_ct, raw_variant_ct, pgr_alloc_cacheline_ct, thread_xalloc_cacheline_ct, 0, pgfip, &calc_thread_ct, &g_genovecs, dosage_is_present? (&g_dosage_presents) : nullptr, dosage_is_present? (&g_dosage_val_bufs) : nullptr, &read_block_size, main_loadbufs, &threads, &g_pgr_ptrs, &g_read_variant_uidx_starts)) {
+      goto export_bgen13_ret_NOMEM;
+    }
+    if (read_block_size > max_write_block_size) {
+      read_block_size = max_write_block_size;
+    }
+    
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto export_bgen13_ret_OPEN_FAIL;
+    }
+    // todo: this is still the bgen 1.1 header; v1.2/1.3 needs the layout-2
+    // flag bits (and the zstd compression bits for 1.3).
+    // note that \xxx character constants are interpreted in octal, so \24 is
+    // decimal 20, etc.
+    memcpy(writebuf, "\24\0\0\0\24\0\0\0", 8);
+    memcpy(&(writebuf[8]), &variant_ct, 4);
+    memcpy(&(writebuf[12]), &sample_ct, 4);
+    memcpy(&(writebuf[16]), "bgen\5\0\0\0", 8);
+    if (fwrite_checked(writebuf, 24, outfile)) {
+      goto export_bgen13_ret_WRITE_FAIL;
+    }
+    
+    const uint32_t ref_allele_second = !(exportf_modifier & kfExportfRefFirst);
+    for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+      g_missing_acc2[tidx] = (uintptr_t*)bigstack_alloc_raw(track_missing_cacheline_ct * kCacheline);
+      g_bgen_geno_bufs[tidx] = (uint16_t*)bigstack_alloc_raw(bgen_geno_cacheline_ct * kCacheline);
+    }
+    g_sample_ct = sample_ct;
+    g_variant_include = variant_include;
+    g_sample_include = sample_include;
+    g_sample_include_cumulative_popcounts = sample_include_cumulative_popcounts;
+    g_calc_thread_ct = calc_thread_ct;
+    g_refalt1_select = refalt1_select;
+    get_xymt_start_and_end(cip, kChrOffsetY, &g_y_start, &g_y_end);
+    g_ref_allele_second = ref_allele_second;
+    g_cip = cip;
+    
+    // 6 bytes present at start of every bgen-1.1 variant record
+    memcpy(writebuf, &sample_ct, 4);
+    memcpy(&(writebuf[4]), "\0", 2);
+
+    // Main workflow:
+    // 1. Set n=0, load/skip block 0
+    //
+    // 2. Spawn threads processing block n
+    // 3. If n>0, write results for block (n-1)
+    // 4. Increment n by 1
+    // 5. Load/skip block n unless eof
+    // 6. Join threads
+    // 7. Goto step 2 unless eof
+    //
+    // 8. Write results for last block
+    const uint32_t read_block_sizel = BITCT_TO_WORDCT(read_block_size);
+    const uint32_t read_block_ct_m1 = (raw_variant_ct - 1) / read_block_size;
+    uint32_t parity = 0;
+    uint32_t read_block_idx = 0;
+    uint32_t write_variant_uidx = 0;
+    uint32_t chr_fo_idx = 0xffffffffU;
+    uint32_t chr_end = 0;
+    uint32_t chr_slen = 0;
+    
+    uint32_t prev_block_write_ct = 0;
+    uint32_t variant_idx = 0;
+    uint32_t is_last_block = 0;
+    uint32_t cur_read_block_size = read_block_size;
+    uint32_t pct = 0;
+    uint32_t next_print_variant_idx = variant_ct / 100;
+    LOGPRINTFWW5("Writing %s ... ", outname);
+    fputs("0%", stdout);
+    fflush(stdout);
+    uint32_t ref_allele_idx = 0;
+    uint32_t alt1_allele_idx = 1;
+    while (1) {
+      uintptr_t cur_block_write_ct = 0;
+      if (!is_last_block) {
+	while (read_block_idx < read_block_ct_m1) {
+	  cur_block_write_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), read_block_sizel);
+	  if (cur_block_write_ct) {
+	    break;
+	  }
+	  ++read_block_idx;
+	}
+	if (read_block_idx == read_block_ct_m1) {
+	  cur_read_block_size = raw_variant_ct - (read_block_idx * read_block_size);
+	  cur_block_write_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), BITCT_TO_WORDCT(cur_read_block_size));
+	}
+	if (pgfi_multiread(variant_include, read_block_idx * read_block_size, read_block_idx * read_block_size + cur_read_block_size, cur_block_write_ct, pgfip)) {
+	  if (variant_idx) {
+	    join_threads2z(calc_thread_ct, 0, threads);
+	    g_cur_block_write_ct = 0;
+	    error_cleanup_threads2z(export_bgen13_thread, calc_thread_ct, threads);
+	  }
+	  goto export_bgen13_ret_READ_FAIL;
+	}
+      }
+      if (variant_idx) {
+	join_threads2z(calc_thread_ct, is_last_block, threads);
+	reterr = g_error_ret;
+	if (reterr) {
+	  if (!is_last_block) {
+	    g_cur_block_write_ct = 0;
+	    error_cleanup_threads2z(export_bgen13_thread, calc_thread_ct, threads);
+	  }
+	  if (reterr == kPglRetMalformedInput) {
+	    logprint("\n");
+	    logerrprint("Error: Malformed .pgen file.\n");
+	  }
+	  goto export_bgen13_ret_1;
+	}
+      }
+      if (!is_last_block) {
+	g_cur_block_write_ct = cur_block_write_ct;
+	compute_uidx_start_partition(variant_include, cur_block_write_ct, calc_thread_ct, read_block_idx * read_block_size, g_read_variant_uidx_starts);
+	for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	  g_pgr_ptrs[tidx]->fi.block_base = pgfip->block_base;
+	  g_pgr_ptrs[tidx]->fi.block_offset = pgfip->block_offset;
+	}
+	is_last_block = (variant_idx + cur_block_write_ct == variant_ct);
+	if (spawn_threads2z(export_bgen13_thread, calc_thread_ct, is_last_block, threads)) {
+	  goto export_bgen13_ret_THREAD_CREATE_FAIL;
+	}
+      }
+      parity = 1 - parity;
+      if (variant_idx) {
+	// write *previous* block results
+	const unsigned char* compressed_data_iter = g_writebufs[parity];
+	const uint32_t* variant_bytect_iter = g_variant_bytects[parity];
+	for (uint32_t variant_bidx = 0; variant_bidx < prev_block_write_ct; ++variant_bidx, ++write_variant_uidx) {
+	  next_set_unsafe_ck(variant_include, &write_variant_uidx);
+	  if (write_variant_uidx >= chr_end) {
+	    do {
+	      ++chr_fo_idx;
+	      chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	    } while (write_variant_uidx >= chr_end);
+	    const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	    char* chr_name_end = chr_name_write(cip, chr_idx, chr_buf);
+	    chr_slen = (uintptr_t)(chr_name_end - chr_buf);
+	  }
+	  const char* cur_variant_id = variant_ids[write_variant_uidx];
+	  const uint32_t id_slen = strlen(cur_variant_id);
+	  memcpy(&(writebuf[6]), &id_slen, 4);
+	  // deliberately clobber top two bytes
+	  unsigned char* writebuf_iter = (unsigned char*)memcpya(&(writebuf[8]), cur_variant_id, id_slen);
+	  memcpy(writebuf_iter, &chr_slen, 4);
+	  writebuf_iter = (unsigned char*)memcpya(&(writebuf_iter[2]), chr_buf, chr_slen);
+	  memcpy(writebuf_iter, &(variant_bps[write_variant_uidx]), 4);
+	  writebuf_iter = &(writebuf_iter[4]);
+	  uintptr_t variant_allele_idx_base = write_variant_uidx * 2;
+	  if (variant_allele_idxs) {
+	    variant_allele_idx_base = variant_allele_idxs[write_variant_uidx];
+	  }
+	  char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+	  if (refalt1_select) {
+	    ref_allele_idx = refalt1_select[write_variant_uidx * 2];
+	    alt1_allele_idx = refalt1_select[write_variant_uidx * 2 + 1];
+	  }
+	  const char* first_allele;
+	  const char* second_allele;
+	  if (ref_allele_second) {
+	    first_allele = cur_alleles[alt1_allele_idx];
+	    second_allele = cur_alleles[ref_allele_idx];
+	  } else {
+	    first_allele = cur_alleles[ref_allele_idx];
+	    second_allele = cur_alleles[alt1_allele_idx];
+	  }
+	  uint32_t allele_slen = strlen(first_allele);
+	  memcpy(writebuf_iter, &allele_slen, 4);
+	  writebuf_iter = (unsigned char*)memcpya(&(writebuf_iter[4]), first_allele, allele_slen);
+	  allele_slen = strlen(second_allele);
+	  memcpy(writebuf_iter, &allele_slen, 4);
+	  writebuf_iter = (unsigned char*)memcpya(&(writebuf_iter[4]), second_allele, allele_slen);
+	  const uint32_t cur_variant_bytect = *variant_bytect_iter++;
+	  memcpy(writebuf_iter, &cur_variant_bytect, 4);
+	  writebuf_iter = &(writebuf_iter[4]);
+	  memcpy(writebuf_iter, compressed_data_iter, cur_variant_bytect);
+	  writebuf_iter = &(writebuf_iter[cur_variant_bytect]);
+	  compressed_data_iter = &(compressed_data_iter[bgen_compressed_buf_max]);
+	  if (fwrite_checked(writebuf, writebuf_iter - writebuf, outfile)) {
+	    if (variant_idx < variant_ct) {
+	      join_threads2z(calc_thread_ct, is_last_block, threads);
+	      if (!is_last_block) {
+		g_cur_block_write_ct = 0;
+		error_cleanup_threads2z(export_bgen13_thread, calc_thread_ct, threads);
+	      }
+	    }
+	    goto export_bgen13_ret_WRITE_FAIL;
+	  }
+	}
+      }
+      if (variant_idx == variant_ct) {
+	break;
+      }
+      if (variant_idx >= next_print_variant_idx) {
+	if (pct > 10) {
+	  putc_unlocked('\b', stdout);
+	}
+	pct = (variant_idx * 100LLU) / variant_ct;
+	printf("\b\b%u%%", pct++);
+	fflush(stdout);
+	next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+      }
+      ++read_block_idx;
+      prev_block_write_ct = cur_block_write_ct;
+      variant_idx += cur_block_write_ct;
+      pgfip->block_base = main_loadbufs[parity];
+    }
+    if (fclose_null(&outfile)) {
+      goto export_bgen13_ret_WRITE_FAIL;
+    }
+    if (pct > 10) {
+      putc_unlocked('\b', stdout);
+    }
+    fputs("\b\b", stdout);
+    LOGPRINTF("done.\n");
+    const uint32_t sample_ctav2 = acc2_vec_ct * kQuatersPerVec;
+    const uintptr_t acc32_offset = acc2_vec_ct * (7 * k1LU * kWordsPerVec);
+    uint32_t* scrambled_missing_cts = (uint32_t*)(&(g_missing_acc2[0][acc32_offset]));
+    for (uint32_t tidx = 1; tidx < calc_thread_ct; ++tidx) {
+      const uint32_t* thread_scrambled_missing_cts = (uint32_t*)(&(g_missing_acc2[tidx][acc32_offset]));
+      for (uint32_t uii = 0; uii < sample_ctav2; ++uii) {
+	scrambled_missing_cts[uii] += thread_scrambled_missing_cts[uii];
+      }
+    }
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+      const uint32_t scrambled_idx = scramble_2_4_8_32(sample_idx);
+      sample_missing_geno_cts[sample_idx] = scrambled_missing_cts[scrambled_idx];
+    }
+  }
+  while (0) {
+  export_bgen13_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  export_bgen13_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  export_bgen13_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  export_bgen13_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+#ifdef __LP64__
+  export_bgen13_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+#endif
+  export_bgen13_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ export_bgen13_ret_1:
+  ZWRAP_useZSTDcompression(1);
+  fclose_cond(outfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+*/
+
+pglerr_t export_ox_sample(const char* outname, const uintptr_t* sample_include, const char* sample_ids, const uint32_t* sample_missing_geno_cts, const uintptr_t* sex_nm, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const char* pheno_names, uint32_t sample_ct, uintptr_t max_sample_id_blen, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t variant_ct, uint32_t y_ct) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t pheno_ctl = BITCT_TO_WORDCT(pheno_ct);
+    char* writebuf;
+    uintptr_t* is_basic_categorical;
+    // if phenotype is categorical, and all (non-null) category names are of
+    // the form C[positive integer], then it's best to emit the positive
+    // integer in the name string instead of the internal index.
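+    // e.g. category names "C1" and "C12" qualify; "C0", "Cfoo", and "12" do
+    // not.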
+    if (bigstack_calloc_ul(pheno_ctl, &is_basic_categorical) ||
+	bigstack_alloc_c(kMaxMediumLine + max_sample_id_blen + 32 + pheno_ct * MAXV(kMaxMissingPhenostrBlen, 16), &writebuf)) {
+      goto export_ox_sample_ret_NOMEM;
+    }
+    
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto export_ox_sample_ret_OPEN_FAIL;
+    }
+    char* writebuf_flush = &(writebuf[kMaxMediumLine]);
+    char* write_iter = strcpya(writebuf, "ID_1 ID_2 missing sex");
+    for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+      *write_iter++ = ' ';
+      write_iter = strcpya(write_iter, &(pheno_names[pheno_idx * max_pheno_name_blen]));
+      const pheno_col_t* cur_pheno_col = &(pheno_cols[pheno_idx]);
+      if (cur_pheno_col->type_code == kPhenoDtypeCat) {
+	const uint32_t nn_cat_ct = cur_pheno_col->nonnull_category_ct;
+	char** cur_cat_names = cur_pheno_col->category_names;
+	uint32_t cat_idx;
+	for (cat_idx = 1; cat_idx <= nn_cat_ct; ++cat_idx) {
+	  const char* cat_name_iter = cur_cat_names[cat_idx];
+	  if (*cat_name_iter == 'C') {
+	    uint32_t char_code = *(++cat_name_iter);
+	    if ((char_code - 49) < 9) {
+	      uint32_t uii;
+	      if (!scan_posint_capped(cat_name_iter, 0x7fffffff, &uii)) {
+		continue;
+	      }
+	    }
+	  }
+	  break;
+	}
+	if (cat_idx == nn_cat_ct + 1) {
+	  set_bit(pheno_idx, is_basic_categorical);
+	}
+      }
+      if (write_iter >= writebuf_flush) {
+	if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	  goto export_ox_sample_ret_WRITE_FAIL;
+	}
+	write_iter = writebuf;
+      }
+    }
+    append_binary_eoln(&write_iter);
+
+    write_iter = strcpya(write_iter, "0 0 0 D");
+    for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+      *write_iter++ = ' ';
+      const pheno_dtype_t cur_type_code = pheno_cols[pheno_idx].type_code;
+      if (cur_type_code == kPhenoDtypeCc) {
+	*write_iter++ = 'B';
+      } else if (cur_type_code == kPhenoDtypeQt) {
+	// .psam file does not distinguish between "continuous covariate" and
+	// "continuous phenotype", that's lost on round-trip
+	*write_iter++ = 'P';
+      } else {
+	*write_iter++ = 'D';
+      }
+      if (write_iter >= writebuf_flush) {
+	if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	  goto export_ox_sample_ret_WRITE_FAIL;
+	}
+	write_iter = writebuf;
+      }
+    }
+    append_binary_eoln(&write_iter);
+
+    const double nonmale_geno_ct_recip = 1.0 / ((double)((int32_t)(variant_ct - y_ct)));
+    const double male_geno_ct_recip = 1.0 / ((double)((int32_t)variant_ct));
+    uintptr_t sample_uidx = 0;
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_ul_unsafe_ck(sample_include, &sample_uidx);
+      const char* cur_sample_id = &(sample_ids[max_sample_id_blen * sample_uidx]);
+      const char* fid_end = (const char*)rawmemchr(cur_sample_id, '\t');
+      write_iter = memcpyax(write_iter, cur_sample_id, (uintptr_t)(fid_end - cur_sample_id), ' ');
+      write_iter = strcpya(write_iter, &(fid_end[1]));
+      *write_iter++ = ' ';
+      const int32_t cur_missing_geno_ct = sample_missing_geno_cts[sample_idx];
+      if (is_set(sex_male, sample_uidx)) {
+        write_iter = dtoa_g(cur_missing_geno_ct * male_geno_ct_recip, write_iter);
+	write_iter = strcpya(write_iter, " 1");
+      } else {
+	write_iter = dtoa_g(cur_missing_geno_ct * nonmale_geno_ct_recip, write_iter);
+	*write_iter++ = ' ';
+	if (is_set(sex_nm, sample_uidx)) {
+	  *write_iter++ = '2';
+	} else {
+	  write_iter = strcpya(write_iter, "NA");
+	}
+      }
+      for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	*write_iter++ = ' ';
+        const pheno_col_t* cur_pheno_col = &(pheno_cols[pheno_idx]);
+	if (!is_set(cur_pheno_col->nonmiss, sample_uidx)) {
+	  write_iter = strcpya(write_iter, "NA");
+	} else {
+	  const pheno_dtype_t cur_type_code = cur_pheno_col->type_code;
+	  if (cur_type_code == kPhenoDtypeCc) {
+	    *write_iter++ = '0' + is_set(cur_pheno_col->data.cc, sample_uidx);
+	  } else if (cur_type_code == kPhenoDtypeQt) {
+	    write_iter = dtoa_g(cur_pheno_col->data.qt[sample_uidx], write_iter);
+	  } else {
+	    const uint32_t cur_cat_idx = cur_pheno_col->data.cat[sample_uidx];
+	    if (is_set(is_basic_categorical, pheno_idx)) {
+	      write_iter = strcpya(write_iter, &(cur_pheno_col->category_names[cur_cat_idx][1]));
+	    } else {
+	      write_iter = uint32toa(cur_cat_idx, write_iter);
+	    }
+	  }
+	}
+      }
+      append_binary_eoln(&write_iter);
+      if (write_iter >= writebuf_flush) {
+	if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	  goto export_ox_sample_ret_WRITE_FAIL;
+	}
+	write_iter = writebuf;
+      }
+    }
+    if (write_iter != writebuf) {
+      if (fwrite_checked(writebuf, write_iter - writebuf, outfile)) {
+	goto export_ox_sample_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto export_ox_sample_ret_WRITE_FAIL;
+    }
+  }
+  while (0) {
+  export_ox_sample_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  export_ox_sample_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  export_ox_sample_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  }
+  fclose_cond(outfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+uint32_t valid_vcf_allele_code(const char* allele_code) {
+  // returns 1 if probably valid (angle-bracket case is not exhaustively
+  // checked), 0 if definitely not
+  uint32_t uii = (unsigned char)(*allele_code);
+  if ((uii == '<') || ((uii == '*') && (!allele_code[1]))) {
+    return 1;
+  }
+  do {
+    uii -= 64;
+    // A = 1, C = 3, G = 7, N = 14, T = 20, so (0x10408a >> uii) & 1 works as a
+    // set membership test
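+    // (the 64-bit constant repeats the mask 32 bits up, so lowercase
+    // a/c/g/n/t at offsets 33/35/39/46/52 pass as well)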
+#ifdef __LP64__
+    if ((uii > 63) || (!((0x10408a0010408aLLU >> uii) & 1))) {
+      // if '[', ']', or '.', assume breakend
+      return ((uii == 27) || (uii == 29) || (uii == 0xffffffeeU))? 1 : 0;
+    }
+#else
+    if ((uii > 63) || (!((0x10408a >> (uii % 32)) & 1))) {
+      return ((uii == 27) || (uii == 29) || (uii == 0xffffffeeU))? 1 : 0;
+    }
+#endif
+    uii = (unsigned char)(*(++allele_code));
+  } while (uii);
+  return 1;
+}
+
+char* diploid_vcf_dosage_print(uint32_t dosage_int, uint32_t write_ds, char* write_iter) {
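+  // dosage_int is on the usual 0..kDosageMax scale, kDosageMid (16384)
+  // corresponding to dosage 1.0; e.g. dosage 0.5 (dosage_int 8192) renders
+  // as GP "0.5,0.5,0", and dosage 1.5 as "0,0.5,0.5".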
+  if (write_ds) {
+    return print_small_dosage(dosage_int, write_iter);
+  }
+  if (dosage_int <= kDosageMid) {
+    write_iter = print_small_dosage(kDosageMid - dosage_int, write_iter);
+    *write_iter++ = ',';
+    write_iter = print_small_dosage(dosage_int, write_iter);
+    return strcpya(write_iter, ",0");
+  }
+  write_iter = strcpya(write_iter, "0,");
+  write_iter = print_small_dosage(kDosageMax - dosage_int, write_iter);
+  *write_iter++ = ',';
+  return print_small_dosage(dosage_int - kDosageMid, write_iter);
+}
+
+// assumes rawval is in [0, 327679]
+static_assert(kDosageMax == 32768, "haploid_dosage_print() needs to be updated.");
+char* haploid_dosage_print(uint32_t rawval, char* start) {
+  // Instead of constant 5-digit precision, we print fewer digits whenever that
+  // doesn't interfere with proper round-tripping.  I.e. we search for the
+  // shortest string in
+  //   ((n - 0.5)/32768, (n + 0.5)/32768).
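+  // e.g. rawval 16384 renders as "0.5", while rawval 1 renders as "0.00003"
+  // (the shortest decimal that rounds back to 1/32768).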
+  *start++ = '0' + (rawval / 32768);
+  rawval = rawval % 32768;
+  if (!rawval) {
+    // integer-valued dosage: no fractional part to print.  (Current callers
+    // aren't expected to pass whole-number dosages here.)
+    return start;
+  }
+  *start++ = '.';
+
+  // (rawval * 2) is in 65536ths
+  // 65536 * 625 = 40960k
+  const uint32_t range_top_40960k = rawval * 1250 + 625;
+  // ok to check half-open interval since we never hit boundary
+  if ((range_top_40960k % 4096) < 1250) {
+    // when this is true, the four-decimal-place approximation is in the range
+    // which round-trips back to our original number.
+    const uint32_t four_decimal_places = range_top_40960k / 4096;
+    return uitoa_trunc4(four_decimal_places, start);
+  }
+  
+  // we wish to print (100000 * remainder + 16384) / 32768, left-0-padded.  and
+  // may as well banker's round too.
+  //
+  // banker's rounding yields a different result than regular rounding for n/64
+  // when n is congruent to 1 mod 4.  32768/64 = 512.
+  const uint32_t five_decimal_places = ((3125 * rawval + 512) / 1024) - ((rawval % 2048) == 512);
+  const uint32_t first_decimal_place = five_decimal_places / 10000;
+  *start++ = '0' + first_decimal_place;
+  const uint32_t last_four_digits = five_decimal_places - first_decimal_place * 10000;
+  if (last_four_digits) {
+    return uitoa_trunc4(last_four_digits, start);
+  }
+  return start;
+}
+
+interr_t flexbwrite_flush(char* buf, size_t len, FILE* outfile, BGZF* bgz_outfile) {
+  if (outfile) {
+    return fwrite_checked(buf, len, outfile);
+  }
+  return (bgzf_write(bgz_outfile, buf, len) < 0);
+}
+
+
+// these assume buf_flush - buf = kMaxMediumLine
+// outfile should be nullptr iff we're doing bgzf compression
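+// flexbwrite_ck() flushes only after the write cursor crosses buf_flush, so
+// a single line may overshoot it; callers allocate enough slack past
+// kMaxMediumLine to absorb their longest possible line.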
+interr_t flexbwrite_flush2(char* buf_flush, FILE* outfile, BGZF* bgz_outfile, char** write_iter_ptr) {
+  char* buf = &(buf_flush[-((int32_t)kMaxMediumLine)]);
+  char* buf_end = *write_iter_ptr;
+  *write_iter_ptr = buf;
+  return flexbwrite_flush(buf, (uintptr_t)(buf_end - buf), outfile, bgz_outfile);
+}
+
+static inline interr_t flexbwrite_ck(char* buf_flush, FILE* outfile, BGZF* bgz_outfile, char** write_iter_ptr) {
+  if ((*write_iter_ptr) < buf_flush) {
+    return 0;
+  }
+  return flexbwrite_flush2(buf_flush, outfile, bgz_outfile, write_iter_ptr);
+}
+
+
+#ifdef __arm__
+  #error "Unaligned accesses in export_vcf()."
+#endif
+pglerr_t export_vcf(char* xheader, const uintptr_t* sample_include, const uint32_t* sample_include_cumulative_popcounts, const char* sample_ids, const char* sids, const uintptr_t* sex_male_collapsed, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const alt_allele_ct_t* refalt1_select, const uintptr_t* pvar_qual_present, const float* pvar_quals, const uintptr_t* pvar_fil [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  gzFile gz_pvar_reload = nullptr;
+  BGZF* bgz_outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if (!(exportf_modifier & kfExportfBgz)) {
+      strcpy(outname_end, ".vcf");
+      if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+	goto export_vcf_ret_OPEN_FAIL;
+      }
+    } else {
+      strcpy(outname_end, ".vcf.gz");
+      bgz_outfile = bgzf_open(outname, "w");
+      if (!bgz_outfile) {
+	goto export_vcf_ret_OPEN_FAIL;
+      }
+#ifndef _WIN32
+      if (max_thread_ct > 1) {
+	// 128 doesn't seem any worse than 256 (and is clearly better than 64)
+	// also tried reducing thread count by 1, that seems worse
+	if (bgzf_mt2(g_bigstack_end, MINV(128, max_thread_ct), 128, &g_bigstack_base, bgz_outfile)) {
+	  goto export_vcf_ret_NOMEM;
+	}
+      }
+#endif
+    }
+    const uint32_t max_chr_blen = get_max_chr_slen(cip) + 1;
+    // CHROM, POS, ID, REF, one ALT, eoln
+    uintptr_t writebuf_blen = kMaxIdSlen + 32 + max_chr_blen + 2 * max_allele_slen;
+    // QUAL, FILTER, INFO, FORMAT, genotypes, eoln
+    // needs to be larger for >9 alt alleles
+    uint32_t write_ds = (exportf_modifier / kfExportfVcfDosageDs) & 1;
+    uint32_t write_gp_or_ds = write_ds || (exportf_modifier & kfExportfVcfDosageGp);
+    if (write_gp_or_ds && (!(pgfip->gflags & kfPgenGlobalDosagePresent))) {
+      write_gp_or_ds = 0;
+      LOGERRPRINTF("Warning: No dosage data present.  %s field will not be exported.\n", write_ds? "DS" : "GP");
+      write_ds = 0;
+    }
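+    // per-sample worst cases: 4 bytes for "\t0/0"; GP appends ':' plus three
+    // comma-separated 7-character probabilities (24 more bytes), while DS
+    // appends just one such number, hence the -16 adjustment.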
+    if (writebuf_blen < ((4 * k1LU) + write_gp_or_ds * 24 - write_ds * 16) * sample_ct + 32 + max_filter_slen + info_reload_slen) {
+      writebuf_blen = ((4 * k1LU) + write_gp_or_ds * 24 - write_ds * 16) * sample_ct + 32 + max_filter_slen + info_reload_slen;
+    }
+    writebuf_blen += kCompressStreamBlock;
+    char* writebuf;
+    if (bigstack_alloc_c(writebuf_blen, &writebuf)) {
+      goto export_vcf_ret_NOMEM;
+    }
+    char* writebuf_flush = &(writebuf[kMaxMediumLine]);
+    char* write_iter = strcpya(writebuf, "##fileformat=VCFv4.3" EOLN_STR "##fileDate=");
+    time_t rawtime;
+    time(&rawtime);
+    struct tm* loctime;
+    loctime = localtime(&rawtime);
+    write_iter += strftime(write_iter, kMaxMediumLine, "%Y%m%d", loctime);
+    write_iter = strcpya(write_iter, EOLN_STR "##source=PLINKv2.00" EOLN_STR);
+    if (cip->chrset_source) {
+      append_chrset_line(cip, &write_iter);
+    }
+    if (flexbwrite_flush(writebuf, write_iter - writebuf, outfile, bgz_outfile)) {
+      goto export_vcf_ret_WRITE_FAIL;
+    }
+    const uint32_t chr_ctl = BITCT_TO_WORDCT(cip->chr_ct);
+    uintptr_t* written_contig_header_lines;
+    if (bigstack_calloc_ul(chr_ctl, &written_contig_header_lines)) {
+      goto export_vcf_ret_NOMEM;
+    }
+    if (xheader) {
+      memcpy(writebuf, "##contig=<ID=", 13);
+      char* xheader_iter = xheader;
+      char* xheader_end = &(xheader[xheader_blen]);
+      char* line_end = xheader;
+      while (line_end != xheader_end) {
+	xheader_iter = line_end;
+	line_end = (char*)rawmemchr(xheader_iter, '\n');
+	++line_end;
+	const uint32_t slen = (uintptr_t)(line_end - xheader_iter);
+	if ((slen > 14) && (!memcmp(xheader_iter, "##contig=<ID=", 13))) {
+	  char* contig_name_start = &(xheader_iter[13]);
+	  char* contig_name_end = (char*)memchr(contig_name_start, ',', slen - 14);
+	  if (!contig_name_end) {
+	    // if this line is technically well-formed (ends in '>'), it's
+	    // useless anyway, throw it out
+	    continue;
+	  }
+	  const int32_t chr_idx = get_chr_code_counted(cip, contig_name_end - contig_name_start, contig_name_start);
+	  if (chr_idx < 0) {
+	    continue;
+	  }
+	  const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)chr_idx];
+	  if (IS_SET(written_contig_header_lines, chr_fo_idx)) {
+	    logerrprint("Error: Duplicate ##contig line in .pvar file.\n");
+	    goto export_vcf_ret_MALFORMED_INPUT;
+	  }
+	  SET_BIT(chr_fo_idx, written_contig_header_lines);
+	  // if --output-chr was used at some point, we need to sync the
+	  // ##contig chromosome code with the code in the VCF body.
+	  write_iter = chr_name_write(cip, chr_idx, &(writebuf[13]));
+	  if (flexbwrite_flush(writebuf, write_iter - writebuf, outfile, bgz_outfile)) {
+	    goto export_vcf_ret_WRITE_FAIL;
+	  }
+	  if (flexbwrite_flush(contig_name_end, (uintptr_t)(line_end - contig_name_end), outfile, bgz_outfile)) {
+	    goto export_vcf_ret_WRITE_FAIL;
+	  }
+	} else {
+	  if (flexbwrite_flush(xheader_iter, slen, outfile, bgz_outfile)) {
+	    goto export_vcf_ret_WRITE_FAIL;
+	  }
+	}
+      }
+    }
+    write_iter = writebuf;
+    // fill in the missing ##contig lines
+    uint32_t contig_zero_written = 0;
+    for (uint32_t chr_fo_idx = 0; chr_fo_idx < cip->chr_ct; ++chr_fo_idx) {
+      if (IS_SET(written_contig_header_lines, chr_fo_idx)) {
+	continue;
+      }
+      const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+      if ((!IS_SET(cip->chr_mask, chr_idx)) || are_all_bits_zero(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], cip->chr_fo_vidx_start[chr_fo_idx + 1])) {
+	continue;
+      }
+      char* chr_name_write_start = strcpya(write_iter, "##contig=<ID=");
+      char* chr_name_write_end = chr_name_write(cip, chr_idx, chr_name_write_start);
+      if ((*chr_name_write_start == '0') && (chr_name_write_end == &(chr_name_write_start[1]))) {
+	// --allow-extra-chr 0 special case
+	if (contig_zero_written) {
+	  continue;
+	}
+	contig_zero_written = 1;
+	write_iter = strcpya(chr_name_write_end, ",length=2147483645");
+      } else {
+	if (memchr(chr_name_write_start, ':', chr_name_write_end - chr_name_write_start)) {
+	  logerrprint("Error: VCF chromosome codes may not include the ':' character.\n");
+	  goto export_vcf_ret_MALFORMED_INPUT;
+	}
+	write_iter = strcpya(chr_name_write_end, ",length=");
+	if (1) {
+	  write_iter = uint32toa(variant_bps[cip->chr_fo_vidx_start[chr_fo_idx + 1] - 1] + 1, write_iter);
+	} else {
+	  // todo: unsorted map case
+	}
+      }
+      *write_iter++ = '>';
+      append_binary_eoln(&write_iter);
+      if (flexbwrite_ck(writebuf_flush, outfile, bgz_outfile, &write_iter)) {
+	goto export_vcf_ret_WRITE_FAIL;
+      }
+    }
+    bigstack_reset(written_contig_header_lines);
+    const uint32_t all_nonref = pgfip->gflags & kfPgenGlobalAllNonref;
+    const uintptr_t* nonref_flags = pgfip->nonref_flags;
+    const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    uint32_t write_pr = all_nonref;
+    if (nonref_flags) {
+      for (uint32_t widx = 0; widx < raw_variant_ctl; ++widx) {
+	if (variant_include[widx] & nonref_flags[widx]) {
+	  write_pr = 1;
+	  break;
+	}
+      }
+    }
+    if (write_pr && (!xheader_info_pr)) {
+      write_iter = strcpya(write_iter, "##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\">" EOLN_STR);
+    }
+    if (write_ds) {
+      write_iter = strcpya(write_iter, "##FORMAT=<ID=DS,Number=1,Type=Float,Description=\"Estimated Alternate Allele Dosage : [P(0/1)+2*P(1/1)]\">" EOLN_STR);
+    } else if (write_gp_or_ds) {
+      write_iter = strcpya(write_iter, "##FORMAT=<ID=GP,Number=G,Type=Float,Description=\"Phred-scaled Genotype Likelihoods\">" EOLN_STR);
+    }
+    // possible todo: optionally export .psam information as
+    // PEDIGREE/META/SAMPLE lines in header, and make --vcf be able to read it
+    write_iter = strcpya(write_iter, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" EOLN_STR "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
+    uint32_t write_sid = 0;
+    // possible for both MAYBESID and SID to be set
+    if (exportf_id_paste & kfIdpasteSid) {
+      write_sid = 1;
+      if (!sids) {
+	max_sid_blen = 2;
+      }
+    } else if ((exportf_id_paste & kfIdpasteMaybesid) && sids) {
+      // no nonzero check in load_psam(), so we have to do it here
+      uint32_t sample_uidx = 0;
+      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+	next_set_unsafe_ck(sample_include, &sample_uidx);
+	if (memcmp(&(sids[sample_uidx * max_sid_blen]), "0", 2)) {
+	  write_sid = 1;
+	  break;
+	}
+      }
+    }
+    uint32_t sample_uidx = 0;
+    uint32_t id_delim_warning = 0;
+    char id_delim = exportf_id_delim? exportf_id_delim : '_';
+    const uintptr_t max_exported_sample_id_blen = max_sample_id_blen + write_sid * max_sid_blen;
+    char* exported_sample_ids;
+    const uint32_t exported_id_htable_size = get_htable_min_size(sample_ct);
+    uint32_t* exported_id_htable;
+    // check for duplicates
+    if (bigstack_alloc_c(sample_ct * max_exported_sample_id_blen, &exported_sample_ids) ||
+	bigstack_alloc_ui(exported_id_htable_size, &exported_id_htable)) {
+      goto export_vcf_ret_NOMEM;
+    }
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_unsafe_ck(sample_include, &sample_uidx);
+      const char* orig_sample_id = &(sample_ids[sample_uidx * max_sample_id_blen]);
+      const char* orig_fid_end = (const char*)rawmemchr(orig_sample_id, '\t');
+      char* exported_sample_ids_iter = &(exported_sample_ids[sample_idx * max_exported_sample_id_blen]);
+      if (exportf_id_paste & kfIdpasteFid) {
+	const uint32_t fid_slen = (uintptr_t)(orig_fid_end - orig_sample_id);
+	if ((!id_delim_warning) && memchr(orig_sample_id, id_delim, fid_slen)) {
+	  id_delim_warning = 1;
+	}
+	exported_sample_ids_iter = memcpyax(exported_sample_ids_iter, orig_sample_id, fid_slen, id_delim);
+      }
+      if (exportf_id_paste & kfIdpasteIid) {
+	const char* orig_iid = &(orig_fid_end[1]);
+        const uint32_t iid_slen = strlen(orig_iid);
+	if ((!id_delim_warning) && memchr(orig_iid, id_delim, iid_slen)) {
+	  id_delim_warning = 1;
+	}
+	exported_sample_ids_iter = memcpyax(exported_sample_ids_iter, orig_iid, iid_slen, id_delim);
+      }
+      if (write_sid) {
+	if (sids) {
+	  const char* orig_sid = &(sids[sample_uidx * max_sid_blen]);
+	  const uint32_t sid_slen = strlen(orig_sid);
+	  if ((!id_delim_warning) && memchr(orig_sid, id_delim, sid_slen)) {
+	    id_delim_warning = 1;
+	  }
+	  exported_sample_ids_iter = memcpya(exported_sample_ids_iter, orig_sid, sid_slen);
+	} else {
+	  *exported_sample_ids_iter++ = '0';
+	}
+	++exported_sample_ids_iter;
+      }
+      exported_sample_ids_iter[-1] = '\0';
+    }
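+    // Example of the pasting above, with the default id-paste settings
+    // (FID+IID+SID when SIDs are present) and the default delimiter '_':
+    // FID "fam1", IID "ind1", SID "tissue2" become the single VCF sample ID
+    // "fam1_ind1_tissue2".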
+    if (id_delim_warning) {
+      if (exportf_id_delim) {
+	LOGERRPRINTF("Warning: '%c' present in original sample IDs; --vcf will not be able to\nreconstruct them.  Consider rerunning with a different --export id-delim=\nvalue.\n", exportf_id_delim);
+      } else {
+	logerrprint("Warning: '_' present in original sample IDs; --vcf will not be able to\nreconstruct them.  Consider rerunning with a suitable --export id-delim= value.\n");
+      }
+    }
+    if (populate_strbox_htable(exported_sample_ids, sample_ct, max_exported_sample_id_blen, exported_id_htable_size, exported_id_htable)) {
+      logerrprint("Warning: Duplicate sample ID(s) present in exported VCF file.\n");
+    }
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+      *write_iter++ = '\t';
+      write_iter = strcpya(write_iter, &(exported_sample_ids[sample_idx * max_exported_sample_id_blen]));
+      if (flexbwrite_ck(writebuf_flush, outfile, bgz_outfile, &write_iter)) {
+	goto export_vcf_ret_WRITE_FAIL;
+      }
+    }
+    append_binary_eoln(&write_iter);
+    bigstack_reset(exported_sample_ids);
+
+    LOGPRINTFWW5("--export vcf%s to %s ... ", bgz_outfile? " bgz" : "", outname);
+    fputs("0%", stdout);
+    fflush(stdout);
+
+    // includes trailing tab
+    char* chr_buf;
+
+    const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+    const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+    uintptr_t* genovec;
+    uintptr_t* allele_include;
+    // if we weren't using bigstack_alloc, genovec would need to be
+    // sample_ctaw2 words instead of sample_ctl2
+    if (bigstack_alloc_c(max_chr_blen, &chr_buf) ||
+	bigstack_alloc_ul(sample_ctl2, &genovec) ||
+	bigstack_alloc_ul(BITCT_TO_WORDCT(kPglMaxAltAlleleCt), &allele_include)) {
+      goto export_vcf_ret_NOMEM;
+    }
+    // For now, if phased data is present, each homozygous call is represented
+    // as phased iff the previous heterozygous call was phased.  (If no
+    // previous heterozygous call exists, it's treated as phased.)  This does
+    // the right thing when the entire genome is phased, and it induces about
+    // as good a phase set approximation as you can get without explicitly
+    // saving that info.  But that approximation is still pretty inaccurate; as
+    // soon as we have any use for them, explicit phase set support should be
+    // added to pgenlib.
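+    // Example: if sample S has a phased 0|1 at variant 1, a following 1/1
+    // hardcall at variant 2 is also written as phased (1|1); if S is then
+    // unphased 0/1 at variant 3, prev_phased is cleared and later homozygous
+    // calls revert to '/' until the next phased het.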
+    // extract the hardcall-phase bit (division by the power-of-2 flag value
+    // acts as a right-shift)
+    const uint32_t some_phased = (pgfip->gflags / kfPgenGlobalHardcallPhasePresent) & 1;
+    uintptr_t* prev_phased = nullptr;
+    uintptr_t* phasepresent = nullptr;
+    uintptr_t* phaseinfo = nullptr;
+    if (some_phased) {
+      if (bigstack_alloc_ul(sample_ctl, &prev_phased) ||
+	  bigstack_alloc_ul(sample_ctl, &phasepresent) ||
+	  bigstack_alloc_ul(sample_ctl, &phaseinfo)) {
+	goto export_vcf_ret_NOMEM;
+      }
+      fill_all_bits(sample_ct, prev_phased);
+    }
+
+    uintptr_t* dosage_present = nullptr;
+    dosage_t* dosage_vals = nullptr;
+    if (write_gp_or_ds) {
+      if (bigstack_alloc_ul(sample_ctl, &dosage_present) ||
+	  bigstack_alloc_dosage(sample_ct, &dosage_vals)) {
+	goto export_vcf_ret_NOMEM;
+      }
+    }
+
+    char* loadbuf = nullptr;
+    uintptr_t loadbuf_size = 0;
+    uint32_t info_col_idx = 0;
+    if (pvar_info_reload) {
+      reterr = pvar_info_reload_header(pvar_info_reload, &gz_pvar_reload, &loadbuf, &loadbuf_size, &info_col_idx);
+      if (reterr) {
+	goto export_vcf_ret_1;
+      }
+    }
+
+    // assumes little-endian
+    uint32_t basic_genotext[4];
+    basic_genotext[0] = 0x302f3009; // \t0/0
+    basic_genotext[1] = 0x312f3009; // \t0/1
+    basic_genotext[2] = 0x312f3109; // \t1/1
+    basic_genotext[3] = 0x2e2f2e09; // \t./.
+    char haploid_genotext[4][4];
+    uint32_t haploid_genotext_blen[8]; // 4..7 = male chrX
+    memcpy(haploid_genotext[0], "\t0/0", 4);
+    memcpy(haploid_genotext[1], "\t0/1", 4);
+    memcpy(haploid_genotext[2], "\t1/1", 4);
+    memcpy(haploid_genotext[3], "\t./.", 4);
+    haploid_genotext_blen[1] = 4;
+    haploid_genotext_blen[4] = 2;
+    haploid_genotext_blen[5] = 4;
+    haploid_genotext_blen[6] = 2;
+    haploid_genotext_blen[7] = 2;
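+    // haploid_genotext_blen is indexed by cur_geno + 4 * is_male; a byte
+    // length of 2 truncates e.g. "\t1/1" to "\t1" (single-char haploid call),
+    // while 4 keeps the full diploid-style text.  Entries 0, 2, and 3 are
+    // filled in per-chromosome below.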
+    // don't bother exporting GP/DS for hardcalls
+    const char* dot_ptr = &(g_one_char_strs[92]); // 92 = 2 * '.', i.e. the 2-byte "." string
+    const char* input_missing_geno_ptr = g_input_missing_geno_ptr;
+    const uint32_t sample_ctl2_m1 = sample_ctl2 - 1;
+    uint32_t chr_fo_idx = 0xffffffffU;
+    uint32_t chr_end = 0;
+    uint32_t chr_buf_blen = 0;
+    uint32_t variant_uidx = 0;
+    uint32_t is_x = 0;
+    uint32_t is_haploid_or_mt = 0; // includes chrX and chrY
+    uint32_t pct = 0;
+    uint32_t next_print_variant_idx = variant_ct / 100;
+    uint32_t gz_variant_uidx = 0;
+    uint32_t ref_allele_idx = 0;
+    uint32_t alt1_allele_idx = 1;
+    uint32_t cur_allele_ct = 2;
+    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+      // a lot of this is redundant with write_pvar(), may want to factor the
+      // commonalities out
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      if (variant_uidx >= chr_end) {
+	do {
+	  ++chr_fo_idx;
+	  chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	} while (variant_uidx >= chr_end);
+	int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	is_x = (chr_idx == cip->xymt_codes[kChrOffsetX]);
+	is_haploid_or_mt = is_set(cip->haploid_mask, chr_idx) || (chr_idx == cip->xymt_codes[kChrOffsetMT]);
+	// forced --merge-par, with diploid male output (is_x NOT set, but
+	// chromosome code is X/chrX)
+	if ((chr_idx == cip->xymt_codes[kChrOffsetPAR1]) || (chr_idx == cip->xymt_codes[kChrOffsetPAR2])) {
+	  chr_idx = cip->xymt_codes[kChrOffsetX];
+	}
+	char* chr_name_end = chr_name_write(cip, chr_idx, chr_buf);
+	*chr_name_end = '\t';
+	chr_buf_blen = 1 + (uintptr_t)(chr_name_end - chr_buf);
+	if (is_haploid_or_mt) {
+	  if (is_x) {
+	    haploid_genotext_blen[0] = 4;
+	    haploid_genotext_blen[2] = 4;
+	    haploid_genotext_blen[3] = 4;
+	  } else {
+	    haploid_genotext_blen[0] = 2;
+	    haploid_genotext_blen[2] = 2;
+	    haploid_genotext_blen[3] = 2;
+	  }
+	}
+      }
+      // #CHROM
+      write_iter = memcpya(write_iter, chr_buf, chr_buf_blen);
+
+      // POS
+      write_iter = uint32toa_x(variant_bps[variant_uidx], '\t', write_iter);
+
+      // ID
+      write_iter = strcpyax(write_iter, variant_ids[variant_uidx], '\t');
+
+      // REF, ALT
+      uintptr_t variant_allele_idx_base = variant_uidx * 2;
+      if (variant_allele_idxs) {
+	variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+      }
+      char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+      if (refalt1_select) {
+	ref_allele_idx = refalt1_select[variant_uidx * 2];
+	alt1_allele_idx = refalt1_select[variant_uidx * 2 + 1];
+	// this logic only works in the biallelic case
+	assert(cur_allele_ct == 2);
+	if (!is_haploid_or_mt) {
+	  if (alt1_allele_idx) {
+	    basic_genotext[0] = 0x302f3009;
+	    basic_genotext[2] = 0x312f3109;
+	  } else {
+	    basic_genotext[0] = 0x312f3109;
+	    basic_genotext[2] = 0x302f3009;
+	  }
+	} else {
+	  if (alt1_allele_idx) {
+	    memcpy(haploid_genotext[0], "\t0/0", 4);
+	    memcpy(haploid_genotext[2], "\t1/1", 4);
+	  } else {
+	    memcpy(haploid_genotext[0], "\t1/1", 4);
+	    memcpy(haploid_genotext[2], "\t0/0", 4);
+	  }
+	}
+      }
+      if ((cur_alleles[ref_allele_idx] != dot_ptr) && (cur_alleles[ref_allele_idx] != input_missing_geno_ptr)) {
+        write_iter = strcpya(write_iter, cur_alleles[ref_allele_idx]);
+      } else {
+	*write_iter++ = 'N';
+      }
+      *write_iter++ = '\t';
+      write_iter = strcpya(write_iter, cur_alleles[alt1_allele_idx]);
+      if (flexbwrite_ck(writebuf_flush, outfile, bgz_outfile, &write_iter)) {
+	goto export_vcf_ret_WRITE_FAIL;
+      }
+      if (cur_allele_ct > 2) {
+	fill_all_bits(cur_allele_ct, allele_include);
+	CLEAR_BIT(ref_allele_idx, allele_include);
+	CLEAR_BIT(alt1_allele_idx, allele_include);
+        uint32_t cur_allele_uidx = 0;
+	uint32_t alt_allele_idx = 2;
+	do {
+	  *write_iter++ = ',';
+	  next_set_unsafe_ck(allele_include, &cur_allele_uidx);
+	  write_iter = strcpya(write_iter, cur_alleles[cur_allele_uidx++]);
+	  if (flexbwrite_ck(writebuf_flush, outfile, bgz_outfile, &write_iter)) {
+	    goto export_vcf_ret_WRITE_FAIL;
+	  }
+	} while (++alt_allele_idx < cur_allele_ct);
+      }
+
+      // QUAL
+      *write_iter++ = '\t';
+      if ((!pvar_qual_present) || (!IS_SET(pvar_qual_present, variant_uidx))) {
+	*write_iter++ = '.';
+      } else {
+	write_iter = ftoa_g(pvar_quals[variant_uidx], write_iter);
+      }
+
+      // FILTER
+      *write_iter++ = '\t';
+      if ((!pvar_filter_present) || (!IS_SET(pvar_filter_present, variant_uidx))) {
+	*write_iter++ = '.';
+      } else if (!IS_SET(pvar_filter_npass, variant_uidx)) {
+	write_iter = strcpya(write_iter, "PASS");
+      } else {
+	write_iter = strcpya(write_iter, pvar_filter_storage[variant_uidx]);
+      }
+
+      // INFO
+      *write_iter++ = '\t';
+      const uint32_t is_pr = all_nonref || (nonref_flags && IS_SET(nonref_flags, variant_uidx));
+      if (gz_pvar_reload) {
+	reterr = pvar_info_reload_and_write(loadbuf_size, xheader_info_pr, info_col_idx, variant_uidx, is_pr, gz_pvar_reload, &write_iter, &gz_variant_uidx, loadbuf);
+	if (reterr) {
+	  goto export_vcf_ret_1;
+	}
+      } else {
+	if (is_pr) {
+	  write_iter = strcpya(write_iter, "PR");
+	} else {
+	  *write_iter++ = '.';
+	}
+      }
+
+      // FORMAT
+      write_iter = memcpyl3a(write_iter, "\tGT");
+      
+      uint32_t dosage_ct = 0;
+      uint32_t is_explicit_alt1 = 0;
+      uint32_t inner_loop_last = kBitsPerWordD2 - 1;
+      uint32_t widx = 0;
+      if (!some_phased) {
+	// biallelic, nothing phased in entire file
+	if (!write_gp_or_ds) {
+	  reterr = pgr_read_refalt1_genovec_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, variant_uidx, simple_pgrp, genovec);
+	} else {
+	  reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, variant_uidx, simple_pgrp, genovec, dosage_present, dosage_vals, &dosage_ct, &is_explicit_alt1);
+	}
+	if (reterr) {
+	  goto export_vcf_ret_PGR_FAIL;
+	}
+	if (!dosage_ct) {
+	  if (!is_haploid_or_mt) {
+	    // always 4 bytes wide, exploit that
+	    uint32_t* write_iter_ui_alias = (uint32_t*)write_iter;
+	    while (1) {
+	      if (widx >= sample_ctl2_m1) {
+		if (widx > sample_ctl2_m1) {
+		  break;
+		}
+		inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	      }
+	      uintptr_t genovec_word = genovec[widx];
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		*write_iter_ui_alias++ = basic_genotext[genovec_word & 3];
+		genovec_word >>= 2;
+	      }
+	      ++widx;
+	    }
+	    write_iter = (char*)write_iter_ui_alias;
+	  } else {
+	    // chrX: male homozygous/missing calls use only one character + tab
+	    // other haploid/MT: this is true for nonmales too
+	    while (1) {
+	      if (widx >= sample_ctl2_m1) {
+		if (widx > sample_ctl2_m1) {
+		  break;
+		}
+		inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	      }
+	      uintptr_t genovec_word = genovec[widx];
+	      uint32_t sex_male_hw = is_x * (((const halfword_t*)sex_male_collapsed)[widx]);
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		const uint32_t cur_geno = genovec_word & 3;
+		const uint32_t cur_is_male = sex_male_hw & 1;
+		write_iter = memcpya(write_iter, haploid_genotext[cur_geno], haploid_genotext_blen[cur_geno + cur_is_male * 4]);
+		genovec_word >>= 2;
+		sex_male_hw >>= 1;
+	      }
+	      ++widx;
+	    }
+	  }
+	} else {
+	  // some dosages present
+	  if (write_ds) {
+	    write_iter = memcpyl3a(write_iter, ":DS");
+	  } else {
+	    write_iter = memcpyl3a(write_iter, ":GP");
+	  }
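+	  // dosage_vals are expressed relative to the original ALT1 allele; if
+	  // refalt1_select made the original REF (allele 0) the exported ALT1,
+	  // the stored dosages must be flipped.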
+	  if (!alt1_allele_idx) {
+	    biallelic_dosage16_invert(dosage_ct, dosage_vals);
+	  }
+	  dosage_t* dosage_vals_iter = dosage_vals;
+          if (!is_haploid_or_mt) {
+	    while (1) {
+	      if (widx >= sample_ctl2_m1) {
+		if (widx > sample_ctl2_m1) {
+		  break;
+		}
+		inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	      }
+	      uintptr_t genovec_word = genovec[widx];
+	      uint32_t dosage_present_hw = ((halfword_t*)dosage_present)[widx];
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		const uint32_t cur_geno = genovec_word & 3;
+		write_iter = memcpya(write_iter, &(basic_genotext[cur_geno]), 4);
+		if (dosage_present_hw & 1) {
+		  *write_iter++ = ':';
+		  const uint32_t dosage_int = *dosage_vals_iter++;
+		  write_iter = diploid_vcf_dosage_print(dosage_int, write_ds, write_iter);
+		}
+		genovec_word >>= 2;
+		dosage_present_hw >>= 1;
+	      }
+	      ++widx;
+	    }
+	  } else {
+	    while (1) {
+	      if (widx >= sample_ctl2_m1) {
+		if (widx > sample_ctl2_m1) {
+		  break;
+		}
+		inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	      }
+	      uintptr_t genovec_word = genovec[widx];
+	      uint32_t sex_male_hw = is_x * (((const halfword_t*)sex_male_collapsed)[widx]);
+	      uint32_t dosage_present_hw = ((halfword_t*)dosage_present)[widx];
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		const uint32_t cur_geno = genovec_word & 3;
+		const uint32_t cur_is_male = sex_male_hw & 1;
+		const uint32_t cur_genotext_blen = haploid_genotext_blen[cur_geno + cur_is_male * 4];
+		write_iter = memcpya(write_iter, haploid_genotext[cur_geno], cur_genotext_blen);
+		if (dosage_present_hw & 1) {
+		  *write_iter++ = ':';
+		  uint32_t dosage_int = *dosage_vals_iter++;
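+		  // cur_genotext_blen == 2 marks a true haploid call below:
+		  // GP is then the pair (P(hap=0), P(hap=1)) via the
+		  // kDosageMax complement, while DS prints the single ALT1
+		  // dosage directly.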
+		  if (cur_genotext_blen == 2) {
+		    if (write_ds) {
+		      write_iter = haploid_dosage_print(dosage_int, write_iter);
+		    } else {
+		      write_iter = haploid_dosage_print(kDosageMax - dosage_int, write_iter);
+		      *write_iter++ = ',';
+		      write_iter = haploid_dosage_print(dosage_int, write_iter);
+		    }
+		  } else {
+		    // het haploid, or female X
+		    write_iter = diploid_vcf_dosage_print(dosage_int, write_ds, write_iter);
+		  }
+		}
+		genovec_word >>= 2;
+		sex_male_hw >>= 1;
+		dosage_present_hw >>= 1;
+	      }
+	      ++widx;
+	    }
+	  }
+	}
+      } else {
+	// biallelic, phased
+	uint32_t at_least_one_phase_present;
+	if (!write_gp_or_ds) {
+	  reterr = pgr_read_refalt1_genovec_hphase_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, variant_uidx, simple_pgrp, genovec, phasepresent, phaseinfo, &at_least_one_phase_present);
+	} else {
+	  reterr = pgr_read_refalt1_genovec_hphase_dosage16_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, variant_uidx, simple_pgrp, genovec, phasepresent, phaseinfo, &at_least_one_phase_present, dosage_present, dosage_vals, &dosage_ct, &is_explicit_alt1);
+	}
+	if (reterr) {
+	  goto export_vcf_ret_PGR_FAIL;
+	}
+	at_least_one_phase_present = (at_least_one_phase_present != 0);
+	if (!dosage_ct) {
+	  if (!is_haploid_or_mt) {
+	    uint32_t* write_iter_ui_alias = (uint32_t*)write_iter;
+	    while (1) {
+	      if (widx >= sample_ctl2_m1) {
+		if (widx > sample_ctl2_m1) {
+		  break;
+		}
+		inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	      }
+	      uintptr_t genovec_word = genovec[widx];
+	      uint32_t prev_phased_halfword = ((halfword_t*)prev_phased)[widx];
+
+	      // zero this out if phasepresent_ct == 0
+	      const uint32_t phasepresent_halfword = at_least_one_phase_present * (((halfword_t*)phasepresent)[widx]);
+
+	      const uint32_t phaseinfo_halfword = ((halfword_t*)phaseinfo)[widx];
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		const uintptr_t cur_geno = genovec_word & 3;
+
+		// usually "\t0/0", etc.
+		uint32_t cur_basic_genotext = basic_genotext[cur_geno];
+		if (cur_geno == 1) {
+		  const uint32_t cur_shift = (1U << sample_idx_lowbits);
+		  if (phasepresent_halfword & cur_shift) {
+		    prev_phased_halfword |= cur_shift;
+		    if (phaseinfo_halfword & cur_shift) {
+		      cur_basic_genotext ^= 0x1000100; // 0|1 -> 1|0
+		    }
+		  } else {
+		    prev_phased_halfword &= ~cur_shift;
+		  }
+		}
+		// '/' = ascii 47 (0x2f), '|' = ascii 124 (0x7c); adding
+		// 0x4d to byte 2 thus swaps the separator
+		*write_iter_ui_alias++ = cur_basic_genotext + 0x4d0000 * ((prev_phased_halfword >> sample_idx_lowbits) & 1);
+		genovec_word >>= 2;
+	      }
+	      ((halfword_t*)prev_phased)[widx] = prev_phased_halfword;
+	      ++widx;
+	    }
+	    write_iter = (char*)write_iter_ui_alias;
+	  } else {
+	    while (1) {
+	      if (widx >= sample_ctl2_m1) {
+		if (widx > sample_ctl2_m1) {
+		  break;
+		}
+		inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	      }
+	      uintptr_t genovec_word = genovec[widx];
+	      uint32_t is_male_hw = is_x * (((const halfword_t*)sex_male_collapsed)[widx]);
+	      uint32_t prev_phased_halfword = ((halfword_t*)prev_phased)[widx];
+
+	      // zero this out if phasepresent_ct == 0
+	      const uint32_t phasepresent_halfword = at_least_one_phase_present * (((halfword_t*)phasepresent)[widx]);
+
+	      const uint32_t phaseinfo_halfword = ((halfword_t*)phaseinfo)[widx];
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		const uint32_t cur_geno = genovec_word & 3;
+		const uint32_t cur_is_male = is_male_hw & 1;
+		const uint32_t cur_blen = haploid_genotext_blen[cur_geno + cur_is_male * 4];
+		write_iter = memcpya(write_iter, haploid_genotext[cur_geno], cur_blen);
+		if (cur_blen == 4) {
+		  if (cur_geno == 1) {
+		    // a bit redundant with how is_male_hw is handled, but
+		    // updating this on every loop iteration doesn't seem better
+		    const uint32_t cur_shift = (1U << sample_idx_lowbits);
+		    if (phasepresent_halfword & cur_shift) {
+		      prev_phased_halfword |= cur_shift;
+		      if (phaseinfo_halfword & cur_shift) {
+			memcpy(&(write_iter[-4]), "\t1|0", 4);
+		      } else {
+			write_iter[-2] = '|';
+		      }
+		    } else {
+		      prev_phased_halfword &= ~cur_shift;
+		    }
+		  } else if ((prev_phased_halfword >> sample_idx_lowbits) & 1) {
+		    write_iter[-2] = '|';
+		  }
+		}
+		genovec_word >>= 2;
+		is_male_hw >>= 1;
+	      }
+	      ((halfword_t*)prev_phased)[widx] = prev_phased_halfword;
+	      ++widx;
+	    }
+	  }
+	} else {
+	  // both dosage and phase present
+	  if (write_ds) {
+	    write_iter = memcpyl3a(write_iter, ":DS");
+	  } else {
+	    write_iter = memcpyl3a(write_iter, ":GP");
+	  }
+	  if (!alt1_allele_idx) {
+	    biallelic_dosage16_invert(dosage_ct, dosage_vals);
+	  }
+	  dosage_t* dosage_vals_iter = dosage_vals;
+	  if (!is_haploid_or_mt) {
+	    while (1) {
+	      if (widx >= sample_ctl2_m1) {
+		if (widx > sample_ctl2_m1) {
+		  break;
+		}
+		inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	      }
+	      uintptr_t genovec_word = genovec[widx];
+	      uint32_t prev_phased_halfword = ((halfword_t*)prev_phased)[widx];
+
+	      // zero this out if phasepresent_ct == 0
+	      const uint32_t phasepresent_halfword = at_least_one_phase_present * (((halfword_t*)phasepresent)[widx]);
+
+	      const uint32_t phaseinfo_halfword = ((halfword_t*)phaseinfo)[widx];
+	      const uint32_t dosage_present_hw = ((halfword_t*)dosage_present)[widx];
+	      uint32_t cur_shift = 1;
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		const uint32_t cur_geno = genovec_word & 3;
+		write_iter = memcpya(write_iter, &(basic_genotext[cur_geno]), 4);
+		if (cur_geno == 1) {
+		  if (phasepresent_halfword & cur_shift) {
+		    prev_phased_halfword |= cur_shift;
+		    if (phaseinfo_halfword & cur_shift) {
+		      memcpy(&(write_iter[-4]), "\t1|0", 4);
+		    }
+		  } else {
+		    prev_phased_halfword &= ~cur_shift;
+		  }
+		}
+		if (prev_phased_halfword & cur_shift) {
+		  write_iter[-2] = '|';
+		}
+		if (dosage_present_hw & cur_shift) {
+		  *write_iter++ = ':';
+		  const uint32_t dosage_int = *dosage_vals_iter++;
+		  write_iter = diploid_vcf_dosage_print(dosage_int, write_ds, write_iter);
+		}
+		genovec_word >>= 2;
+		cur_shift <<= 1;
+	      }
+	      ((halfword_t*)prev_phased)[widx] = prev_phased_halfword;
+	      ++widx;
+	    }
+	  } else {
+	    while (1) {
+	      if (widx >= sample_ctl2_m1) {
+		if (widx > sample_ctl2_m1) {
+		  break;
+		}
+		inner_loop_last = (sample_ct - 1) % kBitsPerWordD2;
+	      }
+	      uintptr_t genovec_word = genovec[widx];
+	      uint32_t is_male_hw = is_x * (((const halfword_t*)sex_male_collapsed)[widx]);
+	      uint32_t prev_phased_halfword = ((halfword_t*)prev_phased)[widx];
+
+	      // zero this out if phasepresent_ct == 0
+	      const uint32_t phasepresent_halfword = at_least_one_phase_present * (((halfword_t*)phasepresent)[widx]);
+
+	      const uint32_t phaseinfo_halfword = ((halfword_t*)phaseinfo)[widx];
+	      const uint32_t dosage_present_hw = ((halfword_t*)dosage_present)[widx];
+	      uint32_t cur_shift = 1;
+	      for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits <= inner_loop_last; ++sample_idx_lowbits) {
+		const uint32_t cur_geno = genovec_word & 3;
+		const uint32_t cur_is_male = is_male_hw & 1;
+		const uint32_t cur_blen = haploid_genotext_blen[cur_geno + cur_is_male * 4];
+		write_iter = memcpya(write_iter, haploid_genotext[cur_geno], cur_blen);
+		if (cur_blen == 4) {
+		  if (cur_geno == 1) {
+		    if (phasepresent_halfword & cur_shift) {
+		      prev_phased_halfword |= cur_shift;
+		      if (phaseinfo_halfword & cur_shift) {
+			memcpy(&(write_iter[-4]), "\t1|0", 4);
+		      }
+		    } else {
+		      prev_phased_halfword &= ~cur_shift;
+		    }
+		  }
+		  if (prev_phased_halfword & cur_shift) {
+		    write_iter[-2] = '|';
+		  }
+		  if (dosage_present_hw & cur_shift) {
+		    *write_iter++ = ':';
+		    const uint32_t dosage_int = *dosage_vals_iter++;
+		    write_iter = diploid_vcf_dosage_print(dosage_int, write_ds, write_iter);
+		  }
+		} else {
+		  if (dosage_present_hw & cur_shift) {
+		    *write_iter++ = ':';
+		    const uint32_t dosage_int = *dosage_vals_iter++;
+		    if (write_ds) {
+		      write_iter = haploid_dosage_print(dosage_int, write_iter);
+		    } else {
+		      write_iter = haploid_dosage_print(kDosageMax - dosage_int, write_iter);
+		      *write_iter++ = ',';
+		      write_iter = haploid_dosage_print(dosage_int, write_iter);
+		    }
+		  }
+		}
+		genovec_word >>= 2;
+		is_male_hw >>= 1;
+		cur_shift <<= 1;
+	      }
+	      ((halfword_t*)prev_phased)[widx] = prev_phased_halfword;
+	      ++widx;
+	    }
+	  }
+	}
+      }
+      // todo: multiallelic cases (separate out cur_allele_ct <= 10)
+      append_binary_eoln(&write_iter);
+      if (flexbwrite_ck(writebuf_flush, outfile, bgz_outfile, &write_iter)) {
+	goto export_vcf_ret_WRITE_FAIL;
+      }
+      if (variant_idx >= next_print_variant_idx) {
+	if (pct > 10) {
+	  putc_unlocked('\b', stdout);
+	}
+	pct = (variant_idx * 100LLU) / variant_ct;
+	printf("\b\b%u%%", pct++);
+	fflush(stdout);
+	next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+      }
+    }
+    if (write_iter != writebuf) {
+      if (flexbwrite_flush(writebuf, write_iter - writebuf, outfile, bgz_outfile)) {
+	goto export_vcf_ret_WRITE_FAIL;
+      }
+    }
+    if (bgz_outfile) {
+      if (bgzf_close(bgz_outfile)) {
+	bgz_outfile = nullptr;
+	goto export_vcf_ret_WRITE_FAIL;
+      }
+      bgz_outfile = nullptr;
+    } else {
+      if (fclose_null(&outfile)) {
+	goto export_vcf_ret_WRITE_FAIL;
+      }
+    }
+    if (pct > 10) {
+      putc_unlocked('\b', stdout);
+    }
+    fputs("\b\b", stdout);
+    LOGPRINTF("done.\n");
+  }
+  while (0) {
+  export_vcf_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  export_vcf_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  export_vcf_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  export_vcf_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  export_vcf_ret_PGR_FAIL:
+    if (reterr != kPglRetReadFail) {
+      logprint("\n");
+      logerrprint("Error: Malformed .pgen file.\n");
+    }
+  }
+ export_vcf_ret_1:
+  fclose_cond(outfile);
+  gzclose_cond(gz_pvar_reload);
+  if (bgz_outfile) {
+    bgzf_close(bgz_outfile);
+  }
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t exportf(char* xheader, const uintptr_t* sample_include, const char* sample_ids, const char* sids, const char* paternal_ids, const char* maternal_ids, const uintptr_t* sex_nm, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const char* pheno_names, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const alt_allele_ct_t* refalt1_select, const uintptr_t* pv [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    const uint32_t sample_ctaw = BITCT_TO_ALIGNED_WORDCT(sample_ct);
+    const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+    uint32_t* sample_include_cumulative_popcounts;
+    uintptr_t* sex_male_collapsed;
+    if (bigstack_alloc_ui(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
+	bigstack_alloc_ul(sample_ctaw, &sex_male_collapsed)) {
+      goto exportf_ret_NOMEM;
+    }
+    fill_cumulative_popcounts(sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
+    copy_bitarr_subset(sex_male, sample_include, sample_ct, sex_male_collapsed);
+    fill_ulong_zero(sample_ctaw - sample_ctl, sex_male_collapsed);
+    uint32_t* sample_missing_geno_cts = nullptr;
+    if (exportf_modifier & (kfExportfOxGen | kfExportfHaps | kfExportfHapsLegend | kfExportfBgen11 | kfExportfBgen12 | kfExportfBgen13)) {
+      if (bigstack_alloc_ui(sample_ct, &sample_missing_geno_cts)) {
+	goto exportf_ret_NOMEM;
+      }
+    }
+    if (exportf_modifier & (kfExportf01 | kfExportf12)) {
+      // todo
+    }
+    if (exportf_modifier & (kfExportfTypemask - kfExportfIndMajorBed - kfExportfVcf - kfExportfOxGen - kfExportfBgen11 - kfExportfHaps - kfExportfHapsLegend - kfExportfATranspose)) {
+      logerrprint("Error: Only VCF, oxford, bgen-1.1, haps, hapslegend, A-transpose, and\nind-major-bed output have been implemented so far.\n");
+      reterr = kPglRetNotYetSupported;
+      goto exportf_ret_1;
+    }
+    const char exportf_delim = (exportf_modifier & kfExportfSpaces)? ' ' : '\t';
+    if (exportf_modifier & kfExportfATranspose) {
+      strcpy(outname_end, ".traw");
+      pgr_clear_ld_cache(simple_pgrp);
+      reterr = export_012_vmaj(outname, sample_include, sample_include_cumulative_popcounts, sample_ids, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, refalt1_select, variant_cms, sample_ct, max_sample_id_blen, variant_ct, max_allele_slen, simple_pgrp);
+      if (reterr) {
+	goto exportf_ret_1;
+      }
+    }
+    if (exportf_modifier & kfExportfIndMajorBed) {
+      reterr = export_ind_major_bed(sample_include, variant_include, variant_allele_idxs, refalt1_select, raw_sample_ct, sample_ct, raw_variant_ct, variant_ct, max_thread_ct, pgr_alloc_cacheline_ct, pgfip, outname, outname_end);
+      if (reterr) {
+	goto exportf_ret_1;
+      }
+    }
+    if (exportf_modifier & kfExportfOxGen) {
+      strcpy(outname_end, ".gen");
+      pgr_clear_ld_cache(simple_pgrp);
+      reterr = export_ox_gen(outname, sample_include, sample_include_cumulative_popcounts, sex_male, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, refalt1_select, sample_ct, variant_ct, max_allele_slen, exportf_modifier, simple_pgrp, sample_missing_geno_cts);
+      if (reterr) {
+	goto exportf_ret_1;
+      }
+    }
+    if (exportf_modifier & (kfExportfHaps | kfExportfHapsLegend)) {
+      pgr_clear_ld_cache(simple_pgrp);
+      reterr = export_ox_hapslegend(sample_include, sample_include_cumulative_popcounts, sex_male_collapsed, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, refalt1_select, sample_ct, raw_variant_ct, variant_ct, max_allele_slen, exportf_modifier, simple_pgrp, outname, outname_end);
+      if (reterr) {
+	goto exportf_ret_1;
+      }
+      fill_uint_zero(sample_ct, sample_missing_geno_cts);
+    }
+    if (exportf_modifier & kfExportfBgen11) {
+      assert(popcount_longs(sample_include, raw_sample_ctl) == sample_ct);
+      strcpy(outname_end, ".bgen");
+      reterr = export_bgen11(outname, sample_include, sample_include_cumulative_popcounts, sex_male, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, refalt1_select, sample_ct, raw_variant_ct, variant_ct, max_allele_slen, max_thread_ct, exportf_modifier, pgr_alloc_cacheline_ct, pgfip, sample_missing_geno_cts);
+      if (reterr) {
+	goto exportf_ret_1;
+      }
+      /*
+    } else if (exportf_modifier & (kfExportfBgen12 | kfExportfBgen13)) {
+      strcpy(outname_end, ".bgen");
+      reterr = export_bgen13(outname, sample_include, sample_include_cumulative_popcounts, sex_male, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, refalt1_select, sample_ct, raw_variant_ct, variant_ct, max_allele_slen, max_thread_ct, exportf_modifier, exportf_bits, pgr_alloc_cacheline_ct, pgfip, sample_missing_geno_cts);
+      if (reterr) {
+	goto exportf_ret_1;
+      }
+      */
+    }
+    if (exportf_modifier & (kfExportfOxGen | kfExportfBgen11 | kfExportfBgen12 | kfExportfBgen13 | kfExportfHaps | kfExportfHapsLegend)) {
+      strcpy(outname_end, ".sample");
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fflush(stdout);
+      uint32_t y_ct = 0;
+      int32_t y_code = cip->xymt_codes[kChrOffsetY];
+      if ((y_code >= 0) && is_set(cip->chr_mask, y_code)) {
+	y_ct = count_chr_variants_unsafe(variant_include, cip, y_code);
+      }
+      assert(popcount_longs(sample_include, raw_sample_ctl) == sample_ct);
+      reterr = export_ox_sample(outname, sample_include, sample_ids, sample_missing_geno_cts, sex_nm, sex_male, pheno_cols, pheno_names, sample_ct, max_sample_id_blen, pheno_ct, max_pheno_name_blen, variant_ct, y_ct);
+      if (reterr) {
+	goto exportf_ret_1;
+      }
+      logprint("done.\n");
+    }
+    if (exportf_modifier & kfExportfVcf) {
+      pgr_clear_ld_cache(simple_pgrp);
+      reterr = export_vcf(xheader, sample_include, sample_include_cumulative_popcounts, sample_ids, sids, sex_male_collapsed, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, pvar_info_reload, xheader_blen, xheader_info_pr, sample_ct, max_sample_id_blen, max_sid_blen, raw_variant_ct, variant_ct, max_allele_slen, max_filter_slen, info_reload_slen, [...]
+      if (reterr) {
+	goto exportf_ret_1;
+      }
+    }
+    // todo: everything else
+    // sample-major output should share a (probably multithreaded) transpose
+    // routine
+
+    if ((!(make_plink2_modifier & kfMakeFam)) && (exportf_modifier & kfExportfIndMajorBed)) {
+      strcpy(outname_end, ".fam");
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fflush(stdout);
+      reterr = write_fam(outname, sample_include, sample_ids, paternal_ids, maternal_ids, sex_nm, sex_male, pheno_cols, nullptr, sample_ct, max_sample_id_blen, max_paternal_id_blen, max_maternal_id_blen, pheno_ct, exportf_delim);
+      if (reterr) {
+	goto exportf_ret_1;
+      }
+      logprint("done.\n");
+    }
+    if ((!(make_plink2_modifier & kfMakeBim)) && (exportf_modifier & kfExportfIndMajorBed)) {
+      strcpy(outname_end, ".bim");
+      LOGPRINTFWW5("Writing %s ... ", outname);
+      fflush(stdout);
+      reterr = write_map_or_bim(outname, variant_include, cip, variant_bps, variant_ids, variant_allele_idxs, allele_storage, nullptr, refalt1_select, variant_cms, variant_ct, max_allele_slen, exportf_delim, 0);
+      if (reterr) {
+	goto exportf_ret_1;
+      }
+      logprint("done.\n");
+    }
+  }
+  while (0) {
+  exportf_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  }
+ exportf_ret_1:
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/plink2_data.h b/plink2_data.h
new file mode 100644
index 0000000..e5d286d
--- /dev/null
+++ b/plink2_data.h
@@ -0,0 +1,176 @@
+#ifndef __PLINK2_DATA_H__
+#define __PLINK2_DATA_H__
+
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_common.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+ENUM_U31_DEF_START()
+  kVcfHalfCallReference,
+  kVcfHalfCallHaploid,
+  kVcfHalfCallMissing,
+  kVcfHalfCallError,
+  // gets converted to kVcfHalfCallError, but with a different error message
+  kVcfHalfCallDefault
+ENUM_U31_DEF_END(vcf_half_call_t);
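+// Example (assuming the PLINK 1.9 --vcf-half-call semantics carry over): a
+// GT of "0/." imports as 0/0 under kVcfHalfCallReference, haploid 0 under
+// kVcfHalfCallHaploid, ./. under kVcfHalfCallMissing, and is an error under
+// kVcfHalfCallError/kVcfHalfCallDefault.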
+
+FLAGSET_DEF_START()
+  kfOxfordImport0,
+  kfOxfordImportRefFirst = (1 << 0),
+  kfOxfordImportRefSecond = (1 << 1),
+  kfOxfordImportBgenSnpIdChr = (1 << 2)
+FLAGSET_DEF_END(oxford_import_t);
+
+FLAGSET_DEF_START()
+  kfPlink1Dosage0,
+  kfPlink1DosageNoheader = (1 << 0),
+  kfPlink1DosageFormatSingle = (1 << 1),
+  kfPlink1DosageFormatSingle01 = (1 << 2),
+  kfPlink1DosageFormatTriple = (1 << 3),
+  kfPlink1DosageRefFirst = (1 << 4),
+  kfPlink1DosageRefSecond = (1 << 5)
+FLAGSET_DEF_END(plink1_dosage_flags_t);
+
+FLAGSET_DEF_START()
+  kfGenDummy0,
+  kfGenDummyAcgt = (1 << 0),
+  kfGenDummy1234 = (1 << 1),
+  kfGenDummy12 = (1 << 2),
+  kfGenDummyScalarPheno = (1 << 3)
+FLAGSET_DEF_END(gendummy_flags_t);
+
+FLAGSET_DEF_START()
+  kfMake0,
+  kfMakeBed = (1 << 0),
+  kfMakeBim = (1 << 1),
+  kfMakeFam = (1 << 2),
+  kfMakePgen = (1 << 3),
+  kfMakePvar = (1 << 4),
+  kfMakePsam = (1 << 5),
+  kfMakeBimZs = (1 << 6),
+  kfMakePlink2MSplitBase = (1 << 7), // three bits for multiallelic mode
+  kfMakePlink2MSplitAll = kfMakePlink2MSplitBase,
+  kfMakePlink2MSplitSnps = 2 * kfMakePlink2MSplitBase,
+  kfMakePlink2MMerge = (1 << 9),
+  kfMakePlink2MMergeBoth = kfMakePlink2MMerge,
+  kfMakePlink2MMergeSnps = kfMakePlink2MMerge + kfMakePlink2MSplitBase,
+  kfMakePlink2MMergeAny = kfMakePlink2MMerge + 2 * kfMakePlink2MSplitBase,
+  // don't support e.g. '+indels' for now due to lack of standardization re:
+  // handling of MNP/'other' classes
+  kfMakePlink2TrimAlts = (1 << 10),
+  kfMakePlink2MMask = kfMakePlink2TrimAlts - kfMakePlink2MSplitBase,
+  kfMakePlink2SetHhMissing = (1 << 11),
+  kfMakePlink2SetMixedMtMissing = (1 << 12),
+  kfMakePgenFormatBase = (1 << 13), // two bits
+  kfMakePgenEraseAlt2Plus = (1 << 15),
+  kfMakePgenErasePhase = (1 << 16),
+  kfMakePgenEraseDosage = (1 << 17)
+FLAGSET_DEF_END(make_plink2_t);
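+// Worked example of the mode-mask arithmetic above: kfMakePlink2MMask =
+// kfMakePlink2TrimAlts - kfMakePlink2MSplitBase = (1 << 10) - (1 << 7) =
+// 0x380, i.e. exactly the three multiallelic-mode bits 7..9.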
+
+FLAGSET_DEF_START()
+  kfPvarPsam0,
+  kfPvarZs = (1 << 0),
+  
+  kfPvarColXheader = (1 << 1),
+  kfPvarColMaybequal = (1 << 2),
+  kfPvarColQual = (1 << 3),
+  kfPvarColMaybefilter = (1 << 4),
+  kfPvarColFilter = (1 << 5),
+  kfPvarColMaybeinfo = (1 << 6),
+  kfPvarColInfo = (1 << 7),
+  kfPvarColXinfo = (kfPvarColInfo * 2) - kfPvarColMaybeinfo, // bits kfPvarColMaybeinfo..kfPvarColInfo
+  kfPvarColMaybecm = (1 << 8),
+  kfPvarColCm = (1 << 9),
+  kfPvarColDefault = (kfPvarColXheader | kfPvarColMaybequal | kfPvarColMaybefilter | kfPvarColMaybeinfo | kfPvarColMaybecm),
+  kfPvarColAll = ((kfPvarColCm * 2) - kfPvarColXheader),
+  kfPsamColMaybesid = (1 << 10),
+  kfPsamColSid = (1 << 11),
+  kfPsamColMaybeparents = (1 << 12),
+  kfPsamColParents = (1 << 13),
+  kfPsamColSex = (1 << 14),
+  kfPsamColPheno1 = (1 << 15),
+  kfPsamColPhenos = (1 << 16),
+  kfPsamColDefault = (kfPsamColMaybesid | kfPsamColMaybeparents | kfPsamColSex | kfPsamColPhenos),
+  kfPsamColAll = ((kfPsamColPhenos * 2) - kfPsamColMaybesid)
+FLAGSET_DEF_END(pvar_psam_t);
+
+FLAGSET_DEF_START()
+  kfIdpaste0,
+  kfIdpasteFid = (1 << 0),
+  kfIdpasteIid = (1 << 1),
+  kfIdpasteMaybesid = (1 << 2),
+  kfIdpasteSid = (1 << 3),
+  kfIdpasteDefault = (kfIdpasteFid | kfIdpasteIid | kfIdpasteMaybesid)
+FLAGSET_DEF_END(idpaste_t);
+
+typedef struct plink1_dosage_info_struct {
+  plink1_dosage_flags_t flags;
+  uint32_t skips[3];
+  uint32_t chr_col_idx; // 0-based
+  uint32_t pos_col_idx;
+} plink1_dosage_info_t;
+
+typedef struct gendummy_info_struct {
+  gendummy_flags_t flags;
+  uint32_t sample_ct;
+  uint32_t variant_ct;
+  uint32_t pheno_ct;
+  double geno_mfreq;
+  double pheno_mfreq;
+  double dosage_freq;
+} gendummy_info_t;
+
+void init_plink1_dosage(plink1_dosage_info_t* plink1_dosage_info_ptr);
+
+void init_gendummy(gendummy_info_t* gendummy_info_ptr);
+
+uint32_t is_parental_info_present(const uintptr_t* sample_include, const char* paternal_ids, const char* maternal_ids, uint32_t sample_ct, uintptr_t max_paternal_id_blen, uintptr_t max_maternal_id_blen);
+
+char* append_pheno_str(const pheno_col_t* pheno_col, const char* output_missing_pheno, uint32_t omp_slen, uint32_t sample_uidx, char* write_iter);
+
+pglerr_t vcf_to_pgen(const char* vcfname, const char* preexisting_psamname, const char* const_fid, const char* dosage_import_field, misc_flags_t misc_flags, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, double import_dosage_certainty, char id_delim, char idspace_to, int32_t vcf_min_gq, int32_t vcf_min_dp, vcf_half_call_t vcf_half_call, fam_col_t fam_cols, char* outname, char* outname_end, chr_info_t* cip);
+
+pglerr_t ox_gen_to_pgen(const char* genname, const char* samplename, const char* ox_single_chr_str, const char* ox_missing_code, misc_flags_t misc_flags, oxford_import_t oxford_import_flags, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, double import_dosage_certainty, char* outname, char* outname_end, chr_info_t* cip);
+
+pglerr_t ox_bgen_to_pgen(const char* bgenname, const char* samplename, const char* const_fid, const char* ox_missing_code, misc_flags_t misc_flags, oxford_import_t oxford_import_flags, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, double import_dosage_certainty, char id_delim, char idspace_to, uint32_t max_thread_ct, char* outname, char* outname_end, chr_info_t* cip);
+
+pglerr_t ox_hapslegend_to_pgen(const char* hapsname, const char* legendname, const char* samplename, const char* ox_single_chr_str, const char* ox_missing_code, misc_flags_t misc_flags, oxford_import_t oxford_import_flags, char* outname, char* outname_end, chr_info_t* cip);
+
+pglerr_t plink1_dosage_to_pgen(const char* dosagename, const char* famname, const char* mapname, const char* import_single_chr_str, const plink1_dosage_info_t* pdip, misc_flags_t misc_flags, fam_col_t fam_cols, int32_t missing_pheno, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, double import_dosage_certainty, uint32_t max_thread_ct, char* outname, char* outname_end, chr_info_t* cip);
+
+pglerr_t generate_dummy(const gendummy_info_t* gendummy_info_ptr, misc_flags_t misc_flags, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, uint32_t max_thread_ct, char* outname, char* outname_end, chr_info_t* cip);
+
+pglerr_t plink1_sample_major_to_pgen(const char* pgenname, uintptr_t variant_ct, uintptr_t sample_ct, uint32_t real_ref_alleles, uint32_t max_thread_ct, FILE* infile);
+
+pglerr_t load_allele_and_geno_counts(const uintptr_t* sample_include, const uintptr_t* founder_info, const uintptr_t* sex_nm, const uintptr_t* sex_male, const uintptr_t* variant_include, const chr_info_t* cip, const uintptr_t* variant_allele_idxs, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t founder_ct, uint32_t male_ct, uint32_t nosex_ct, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t first_hap_uidx, uint32_t max_thread_ct, uintptr_t pgr_alloc_cacheline_ct, pgen_file_inf [...]
+
+pglerr_t make_plink2_no_vsort(const char* xheader, const uintptr_t* sample_include, const char* sample_ids, const char* sids, const char* paternal_ids, const char* maternal_ids, const uintptr_t* sex_nm, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage,  [...]
+
+pglerr_t sample_sort_file_map(const uintptr_t* sample_include, const char* sample_ids, const char* sids, const char* sample_sort_fname, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t sid_col_present, uint32_t** new_sample_idx_to_old_ptr);
+
+pglerr_t exportf(char* xheader, const uintptr_t* sample_include, const char* sample_ids, const char* sids, const char* paternal_ids, const char* maternal_ids, const uintptr_t* sex_nm, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const char* pheno_names, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const alt_allele_ct_t* refalt1_select, const uintptr_t* pv [...]
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+
+#endif // __PLINK2_DATA_H__
diff --git a/plink2_decompress.cpp b/plink2_decompress.cpp
new file mode 100644
index 0000000..eb201c4
--- /dev/null
+++ b/plink2_decompress.cpp
@@ -0,0 +1,194 @@
+// This library is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation, either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_decompress.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+pglerr_t gzopen_read_checked(const char* fname, gzFile* gzf_ptr) {
+  *gzf_ptr = gzopen(fname, FOPEN_RB);
+  if (!(*gzf_ptr)) {
+    logprint("\n");
+    LOGERRPRINTFWW(g_errstr_fopen, fname);
+    return kPglRetOpenFail;
+  }
+  if (gzbuffer(*gzf_ptr, 131072)) {
+    return kPglRetNomem;
+  }
+  return kPglRetSuccess;
+}
+
+
+pglerr_t load_xid_header(const char* flag_name, sid_detect_mode_t sid_detect_mode, uintptr_t loadbuf_size, char* loadbuf, char** loadbuf_iter_ptr, uintptr_t* line_idx_ptr, char** loadbuf_first_token_ptr, gzFile* gz_infile_ptr, xid_mode_t* xid_mode_ptr) {
+  // possible todo: support comma delimiter
+  uintptr_t line_idx = *line_idx_ptr;
+  uint32_t is_header_line;
+  char* loadbuf_first_token;
+  do {
+    ++line_idx;
+    if (!gzgets(*gz_infile_ptr, loadbuf, loadbuf_size)) {
+      if (!gzeof(*gz_infile_ptr)) {
+	return kPglRetReadFail;
+      }
+      return kPglRetEmptyFile;
+    }
+    if (!loadbuf[loadbuf_size - 1]) {
+      return kPglRetLongLine;
+    }
+    loadbuf_first_token = skip_initial_spaces(loadbuf);
+    is_header_line = (loadbuf_first_token[0] == '#');
+  } while (is_header_line && strcmp_se(&(loadbuf_first_token[1]), "FID", 3) && strcmp_se(&(loadbuf_first_token[1]), "IID", 3));
+  xid_mode_t xid_mode = kfXidMode0;
+  char* loadbuf_iter;
+  if (is_header_line) {
+    // the following header leading columns are supported:
+    // #FID IID (sid_detect_mode can't be FORCE)
+    // #FID IID SID (SID ignored on sid_detect_mode NOT_LOADED)
+    // #IID
+    // #IID SID
+    loadbuf_iter = &(loadbuf_first_token[4]);
+    if (loadbuf_first_token[1] == 'I') {
+      xid_mode = kfXidModeFlagNeverFid;
+    } else {
+      loadbuf_iter = skip_initial_spaces(loadbuf_iter);
+      if (strcmp_se(loadbuf_iter, "IID", 3)) {
+	LOGERRPRINTF("Error: No IID column on line %" PRIuPTR " of --%s file.\n", line_idx, flag_name);
+	return kPglRetMalformedInput;
+      }
+      loadbuf_iter = &(loadbuf_iter[3]);
+    }
+    loadbuf_iter = skip_initial_spaces(loadbuf_iter);
+    if (!strcmp_se(loadbuf_iter, "SID", 3)) {
+      if ((uint32_t)sid_detect_mode >= kSidDetectModeLoaded) {
+	xid_mode |= kfXidModeFlagSid;
+      }
+      loadbuf_iter = skip_initial_spaces(&(loadbuf_iter[3]));
+    } else if (sid_detect_mode == kSidDetectModeForce) {
+      LOGERRPRINTFWW("Error: No SID column on line %" PRIuPTR " of --%s file.\n", line_idx, flag_name);
+      return kPglRetMalformedInput;
+    }
+  } else {
+    xid_mode = (sid_detect_mode == kSidDetectModeForce)? kfXidModeFidiidSid : kfXidModeFidiidOrIid;
+    loadbuf_iter = loadbuf_first_token;
+  }
+  if (loadbuf_iter_ptr) {
+    *loadbuf_iter_ptr = loadbuf_iter;
+  }
+  *loadbuf_first_token_ptr = loadbuf_first_token;
+  *line_idx_ptr = line_idx;
+  *xid_mode_ptr = xid_mode;
+  return kPglRetSuccess;
+}
+
+pglerr_t open_and_load_xid_header(const char* fname, const char* flag_name, sid_detect_mode_t sid_detect_mode, uintptr_t loadbuf_size, char* loadbuf, char** loadbuf_iter_ptr, uintptr_t* line_idx_ptr, char** loadbuf_first_token_ptr, gzFile* gz_infile_ptr, xid_mode_t* xid_mode_ptr) {
+  pglerr_t reterr = gzopen_read_checked(fname, gz_infile_ptr);
+  if (reterr) {
+    return reterr;
+  }
+  loadbuf[loadbuf_size - 1] = ' ';
+  return load_xid_header(flag_name, sid_detect_mode, loadbuf_size, loadbuf, loadbuf_iter_ptr, line_idx_ptr, loadbuf_first_token_ptr, gz_infile_ptr, xid_mode_ptr);
+}
+
+
+void gz_token_stream_preinit(gz_token_stream_t* gtsp) {
+  gtsp->gz_infile = nullptr;
+}
+
+pglerr_t gz_token_stream_init(const char* fname, gz_token_stream_t* gtsp, char* buf_start) {
+  pglerr_t reterr = gzopen_read_checked(fname, &(gtsp->gz_infile));
+  if (reterr) {
+    return reterr;
+  }
+  gtsp->buf_start = buf_start;
+  gtsp->read_iter = &(buf_start[kMaxMediumLine]);
+  gtsp->buf_end = gtsp->read_iter;
+  gtsp->buf_end[0] = '0'; // nonspace sentinel forces the first advance() call to load
+  return kPglRetSuccess;
+}
+
+char* gz_token_stream_advance(gz_token_stream_t* gtsp, uint32_t* token_slen_ptr) {
+  char* token_start = gtsp->read_iter;
+  char* buf_end = gtsp->buf_end;
+ gz_token_stream_advance_restart:
+  while ((unsigned char)(*token_start) <= ' ') {
+    ++token_start;
+  }
+  while (1) {
+    if (token_start < buf_end) {
+      char* token_end = &(token_start[1]);
+      while ((unsigned char)(*token_end) > ' ') {
+	++token_end;
+      }
+      const uint32_t token_slen = (uintptr_t)(token_end - token_start);
+      if (token_end < buf_end) {
+	*token_slen_ptr = token_slen;
+	gtsp->read_iter = &(token_end[1]);
+	return token_start;
+      }
+      if (token_slen > kMaxMediumLine) {
+	*token_slen_ptr = 0xffffffffU;
+	return nullptr;
+      }
+      char* new_token_start = &(gtsp->buf_start[kMaxMediumLine - token_slen]);
+      memcpy(new_token_start, token_start, token_slen);
+      token_start = new_token_start;
+    } else {
+      token_start = &(gtsp->buf_start[kMaxMediumLine]);
+    }
+    char* load_start = &(gtsp->buf_start[kMaxMediumLine]);
+    const int32_t bufsize = gzread(gtsp->gz_infile, load_start, kMaxMediumLine);
+    if (bufsize < 0) {
+      *token_slen_ptr = 0xfffffffeU;
+      return nullptr;
+    }
+    buf_end = &(load_start[(uint32_t)bufsize]);
+    buf_end[0] = ' ';
+    buf_end[1] = '0';
+    gtsp->buf_end = buf_end;
+    if (!bufsize) {
+      if (!gzeof(gtsp->gz_infile)) {
+	*token_slen_ptr = 0xfffffffeU;
+	return nullptr;
+      }
+      // bufsize == 0, eof
+      if (token_start == load_start) {
+	*token_slen_ptr = 0;
+	return nullptr;
+      }
+      gtsp->read_iter = load_start;
+      *token_slen_ptr = (uintptr_t)(load_start - token_start);
+      return token_start;
+    }
+    if (token_start == load_start) {
+      goto gz_token_stream_advance_restart;
+    }
+  }
+}
+
+boolerr_t gz_token_stream_close(gz_token_stream_t* gtsp) {
+  if (!gtsp->gz_infile) {
+    return 0;
+  }
+  return gzclose_null(&(gtsp->gz_infile));
+}
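+
+// Editor's illustration, not upstream code: a minimal scanning loop for the
+// gz_token_stream API above.  Assumes the caller reserved the required
+// 2 * kMaxMediumLine + 2 byte buffer (e.g. via bigstack_alloc_c()); the
+// GZ_TOKEN_STREAM_EXAMPLE guard keeps this out of normal builds.
+#ifdef GZ_TOKEN_STREAM_EXAMPLE
+static pglerr_t count_tokens(const char* fname, char* buf_start, uintptr_t* token_ct_ptr) {
+  gz_token_stream_t gts;
+  gz_token_stream_preinit(&gts);
+  pglerr_t reterr = gz_token_stream_init(fname, &gts, buf_start);
+  if (reterr) {
+    return reterr;
+  }
+  uintptr_t token_ct = 0;
+  uint32_t token_slen = 0;
+  while (gz_token_stream_advance(&gts, &token_slen)) {
+    ++token_ct;
+  }
+  if (token_slen == 0xfffffffeU) {
+    reterr = kPglRetReadFail;
+  } else if (token_slen == 0xffffffffU) {
+    // token longer than kMaxMediumLine
+    reterr = kPglRetMalformedInput;
+  }
+  if (gz_token_stream_close(&gts) && (!reterr)) {
+    reterr = kPglRetReadFail;
+  }
+  *token_ct_ptr = token_ct;
+  return reterr;
+}
+#endif // GZ_TOKEN_STREAM_EXAMPLE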
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/plink2_decompress.h b/plink2_decompress.h
new file mode 100644
index 0000000..39eda5d
--- /dev/null
+++ b/plink2_decompress.h
@@ -0,0 +1,89 @@
+#ifndef __PLINK2_DECOMPRESS_H__
+#define __PLINK2_DECOMPRESS_H__
+
+// This library is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation, either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+// This has been separated from plink2_common due to the relatively heavyweight
+// dependence on zstd/zlibWrapper.
+#include "plink2_common.h"
+
+// documentation on ZWRAP_USE_ZSTD is incorrect as of 11 Jan 2017, necessary to
+// edit zstd_zlibwrapper.c or use compile flag.
+#include "zstd/zlibWrapper/zstd_zlibwrapper.h"
+#ifndef STATIC_ZLIB
+  #if !defined(ZLIB_VERNUM) || ZLIB_VERNUM < 0x1240
+    #error "plink2_decompress requires zlib 1.2.4 or later."
+  #endif
+#endif
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+// Also sets 128k read buffer.
+pglerr_t gzopen_read_checked(const char* fname, gzFile* gzf_ptr);
+
+// plink2_compress_stream interface should be used for writing .gz files.
+
+HEADER_INLINE boolerr_t gzclose_null(gzFile* gzf_ptr) {
+  const int32_t ii = gzclose(*gzf_ptr);
+  *gzf_ptr = nullptr;
+  return (ii != Z_OK);
+}
+
+HEADER_INLINE void gzclose_cond(gzFile gz_infile) {
+  if (gz_infile) {
+    gzclose(gz_infile);
+  }
+}
+
+
+// may return kPglRetLongLine or kPglRetEmptyFile
+// loadbuf_iter_ptr can be nullptr
+// line_idx must be zero unless initial lines were skipped
+pglerr_t load_xid_header(const char* flag_name, sid_detect_mode_t sid_detect_mode, uintptr_t loadbuf_size, char* loadbuf, char** loadbuf_iter_ptr, uintptr_t* line_idx_ptr, char** loadbuf_first_token_ptr, gzFile* gz_infile_ptr, xid_mode_t* xid_mode_ptr);
+
+// sets last character of loadbuf to ' '
+pglerr_t open_and_load_xid_header(const char* fname, const char* flag_name, sid_detect_mode_t sid_detect_mode, uintptr_t loadbuf_size, char* loadbuf, char** loadbuf_iter_ptr, uintptr_t* line_idx_ptr, char** loadbuf_first_token_ptr, gzFile* gz_infile_ptr, xid_mode_t* xid_mode_ptr);
+
+
+// currently hardcoded to have maximum token length = kMaxMediumLine, buffer
+// size = 2 * kMaxMediumLine + 2 (second half is the load area, plus two
+// sentinel bytes).
+typedef struct gz_token_stream_struct {
+  gzFile gz_infile;
+  char* buf_start;
+  char* read_iter;
+  char* buf_end;
+} gz_token_stream_t;
+
+void gz_token_stream_preinit(gz_token_stream_t* gtsp);
+
+pglerr_t gz_token_stream_init(const char* fname, gz_token_stream_t* gtsp, char* buf_start);
+
+// sets token_slen to 0xfffffffeU on read fail, 0xffffffffU on too-long token
+// safe to null-terminate token between calls
+char* gz_token_stream_advance(gz_token_stream_t* gtsp, uint32_t* token_slen_ptr);
+
+// ok if already closed
+boolerr_t gz_token_stream_close(gz_token_stream_t* gtsp);
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+ 
+#endif // __PLINK2_DECOMPRESS_H__
diff --git a/plink2_filter.cpp b/plink2_filter.cpp
new file mode 100644
index 0000000..4385ccf
--- /dev/null
+++ b/plink2_filter.cpp
@@ -0,0 +1,2884 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_decompress.h"
+#include "plink2_filter.h"
+#include "plink2_stats.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+void init_cmp_expr(cmp_expr_t* cmp_expr_ptr) {
+  cmp_expr_ptr->pheno_name = nullptr;
+}
+
+void cleanup_cmp_expr(cmp_expr_t* cmp_expr_ptr) {
+  free_cond(cmp_expr_ptr->pheno_name);
+}
+
+pglerr_t from_to_flag(char** variant_ids, const uint32_t* variant_id_htable, const char* varid_from, const char* varid_to, uint32_t raw_variant_ct, uintptr_t max_variant_id_slen, uintptr_t variant_id_htable_size, uintptr_t* variant_include, chr_info_t* cip, uint32_t* variant_ct_ptr) {
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t* htable_dup_base = &(variant_id_htable[round_up_pow2(variant_id_htable_size, kInt32PerCacheline)]);
+    uint32_t chr_fo_idx = 0xffffffffU;
+    uint32_t variant_uidx_start = 0xffffffffU;
+    if (varid_from) {
+      uint32_t cur_llidx;
+      variant_uidx_start = variant_id_dup_htable_find(varid_from, variant_ids, variant_id_htable, htable_dup_base, strlen(varid_from), variant_id_htable_size, max_variant_id_slen, &cur_llidx);
+      if (variant_uidx_start == 0xffffffffU) {
+	sprintf(g_logbuf, "Error: --from variant '%s' not found.\n", varid_from);
+	goto from_to_flag_ret_INCONSISTENT_INPUT_WW;
+      }
+      // do *not* check variant_include here.  variant ID uniqueness should not
+      // be dependent on the order in which filters are applied.
+      if (cur_llidx != 0xffffffffU) {
+	sprintf(g_logbuf, "Error: --from variant ID '%s' appears multiple times.\n", varid_from);
+	goto from_to_flag_ret_INCONSISTENT_INPUT_WW;
+      }
+      chr_fo_idx = get_variant_chr_fo_idx(cip, variant_uidx_start);
+    }
+    uint32_t variant_uidx_end = 0;
+    if (varid_to) {
+      uint32_t cur_llidx;
+      variant_uidx_end = variant_id_dup_htable_find(varid_to, variant_ids, variant_id_htable, htable_dup_base, strlen(varid_to), variant_id_htable_size, max_variant_id_slen, &cur_llidx);
+      if (variant_uidx_end == 0xffffffffU) {
+	sprintf(g_logbuf, "Error: --to variant '%s' not found.\n", varid_to);
+	goto from_to_flag_ret_INCONSISTENT_INPUT_WW;
+      }
+      if (cur_llidx != 0xffffffffU) {
+	sprintf(g_logbuf, "Error: --to variant ID '%s' appears multiple times.\n", varid_to);
+	goto from_to_flag_ret_INCONSISTENT_INPUT_WW;
+      }
+      uint32_t chr_fo_idx2 = get_variant_chr_fo_idx(cip, variant_uidx_end);
+      if (variant_uidx_start == 0xffffffffU) {
+	chr_fo_idx = chr_fo_idx2;
+	variant_uidx_start = cip->chr_fo_vidx_start[chr_fo_idx];
+      } else {
+	if (chr_fo_idx != chr_fo_idx2) {
+	  logerrprint("Error: --from and --to variants are not on the same chromosome.\n");
+	  goto from_to_flag_ret_INCONSISTENT_INPUT;
+	}
+	if (variant_uidx_start > variant_uidx_end) {
+	  // permit order to be reversed
+	  uint32_t uii = variant_uidx_start;
+	  variant_uidx_start = variant_uidx_end;
+	  variant_uidx_end = uii;
+	}
+      }
+      ++variant_uidx_end; // convert to half-open interval
+    } else {
+      variant_uidx_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+    }
+    if (variant_uidx_start) {
+      clear_bits_nz(0, variant_uidx_start, variant_include);
+    }
+    if (variant_uidx_end < raw_variant_ct) {
+      clear_bits_nz(variant_uidx_end, raw_variant_ct, variant_include);
+    }
+    fill_ulong_zero(kChrMaskWords, cip->chr_mask);
+    set_bit(cip->chr_file_order[chr_fo_idx], cip->chr_mask);
+    const uint32_t new_variant_ct = popcount_bit_idx(variant_include, variant_uidx_start, variant_uidx_end);
+    LOGPRINTF("--from/--to: %u variant%s remaining.\n", new_variant_ct, (new_variant_ct == 1)? "" : "s");
+    *variant_ct_ptr = new_variant_ct;
+  }
+  while (0) {
+  from_to_flag_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  from_to_flag_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+  return reterr;
+}
+
+pglerr_t snp_flag(const uint32_t* variant_bps, char** variant_ids, const uint32_t* variant_id_htable, const char* varid_snp, uint32_t raw_variant_ct, uintptr_t max_variant_id_slen, uintptr_t variant_id_htable_size, uint32_t do_exclude, int32_t window_bp, uintptr_t* variant_include, chr_info_t* cip, uint32_t* variant_ct_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t* htable_dup_base = &(variant_id_htable[round_up_pow2(variant_id_htable_size, kInt32PerCacheline)]);
+    const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    uint32_t cur_llidx;
+    uint32_t variant_uidx = variant_id_dup_htable_find(varid_snp, variant_ids, variant_id_htable, htable_dup_base, strlen(varid_snp), variant_id_htable_size, max_variant_id_slen, &cur_llidx);
+    if (variant_uidx == 0xffffffffU) {
+      sprintf(g_logbuf, "Error: --%ssnp variant '%s' not found.\n", do_exclude? "exclude-" : "", varid_snp);
+      goto snp_flag_ret_INCONSISTENT_INPUT_WW;
+    }
+    if (window_bp == -1) {
+      // duplicates ok
+      
+      uintptr_t* seen_uidxs;
+      // not actually necessary in --exclude-snp case, but this is still fast
+      // enough relative to hash table construction that there's no point in
+      // complicating the code further to conditionally optimize this out
+      if (bigstack_calloc_ul(raw_variant_ctl, &seen_uidxs)) {
+	goto snp_flag_ret_NOMEM;
+      }
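+      // walk the duplicate-ID chain: each htable_dup_base node is a
+      // (variant_uidx, next_llidx) uint32 pair, with next_llidx ==
+      // 0xffffffffU terminating the chain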
+      while (1) {
+	set_bit(variant_uidx, seen_uidxs);
+	if (cur_llidx == 0xffffffffU) {
+	  break;
+	}
+	variant_uidx = htable_dup_base[cur_llidx];
+	cur_llidx = htable_dup_base[cur_llidx + 1];
+      }
+      if (do_exclude) {
+	bitvec_andnot(seen_uidxs, raw_variant_ctl, variant_include);
+      } else {
+	bitvec_and(seen_uidxs, raw_variant_ctl, variant_include);
+      }
+    } else {
+      if (cur_llidx != 0xffffffffU) {
+	sprintf(g_logbuf, "Error: --%ssnp + --window central variant ID '%s' appears multiple times.\n", do_exclude? "exclude-" : "", varid_snp);
+	goto snp_flag_ret_INCONSISTENT_INPUT_WW;
+      }
+      const uint32_t chr_fo_idx = get_variant_chr_fo_idx(cip, variant_uidx);
+      const uint32_t chr_vidx_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+      const uint32_t center_bp = variant_bps[variant_uidx];
+      uint32_t vidx_start = cip->chr_fo_vidx_start[chr_fo_idx];
+      if (center_bp > (uint32_t)window_bp) {
+	vidx_start += uint32arr_greater_than(&(variant_bps[vidx_start]), chr_vidx_end - vidx_start, center_bp - (uint32_t)window_bp);
+      }
+      const uint32_t bp_end = 1 + center_bp + (uint32_t)window_bp;
+      const uint32_t vidx_end = vidx_start + uint32arr_greater_than(&(variant_bps[vidx_start]), chr_vidx_end - vidx_start, bp_end);
+      if (do_exclude) {
+	clear_bits_nz(vidx_start, vidx_end, variant_include);
+      } else {
+	if (vidx_start) {
+	  clear_bits_nz(0, vidx_start, variant_include);
+	}
+	if (vidx_end < raw_variant_ct) {
+	  clear_bits_nz(vidx_end, raw_variant_ct, variant_include);
+	}
+	fill_ulong_zero(kChrMaskWords, cip->chr_mask);
+	set_bit(cip->chr_file_order[chr_fo_idx], cip->chr_mask);
+      }
+    }
+    const uint32_t new_variant_ct = popcount_longs(variant_include, raw_variant_ctl);
+    LOGPRINTF("--%ssnp%s: %u variant%s remaining.\n", do_exclude? "exclude-" : "", (window_bp == -1)? "" : " + --window", new_variant_ct, (new_variant_ct == 1)? "" : "s");
+    *variant_ct_ptr = new_variant_ct;
+  }
+  while (0) {
+  snp_flag_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  snp_flag_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t snps_flag(char** variant_ids, const uint32_t* variant_id_htable, const range_list_t* snps_range_list_ptr, uint32_t raw_variant_ct, uintptr_t max_variant_id_slen, uintptr_t variant_id_htable_size, uint32_t do_exclude, uintptr_t* variant_include, uint32_t* variant_ct_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t* htable_dup_base = &(variant_id_htable[round_up_pow2(variant_id_htable_size, kInt32PerCacheline)]);
+    const char* varid_strbox = snps_range_list_ptr->names;
+    const unsigned char* starts_range = snps_range_list_ptr->starts_range;
+    const uint32_t varid_ct = snps_range_list_ptr->name_ct;
+    const uintptr_t varid_max_blen = snps_range_list_ptr->name_max_blen;
+    const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    uintptr_t* seen_uidxs;
+    if (bigstack_calloc_ul(raw_variant_ctl, &seen_uidxs)) {
+      goto snps_flag_ret_NOMEM;
+    }
+    uint32_t range_start_vidx = 0xffffffffU;
+    for (uint32_t varid_idx = 0; varid_idx < varid_ct; ++varid_idx) {
+      const char* cur_varid = &(varid_strbox[varid_idx * varid_max_blen]);
+      uint32_t cur_llidx;
+      uint32_t variant_uidx = variant_id_dup_htable_find(cur_varid, variant_ids, variant_id_htable, htable_dup_base, strlen(cur_varid), variant_id_htable_size, max_variant_id_slen, &cur_llidx);
+      if (variant_uidx == 0xffffffffU) {
+	sprintf(g_logbuf, "Error: --%ssnps variant '%s' not found.\n", do_exclude? "exclude-" : "", cur_varid);
+	goto snps_flag_ret_INCONSISTENT_INPUT_WW;
+      }
+      if (starts_range[varid_idx]) {
+	if (cur_llidx != 0xffffffffU) {
+	  sprintf(g_logbuf, "Error: --%ssnps range-starting variant ID '%s' appears multiple times.\n", do_exclude? "exclude-" : "", cur_varid);
+	  goto snps_flag_ret_INCONSISTENT_INPUT_WW;
+	}
+	range_start_vidx = variant_uidx;
+      } else {
+	if (range_start_vidx != 0xffffffffU) {
+	  if (cur_llidx != 0xffffffffU) {
+	    sprintf(g_logbuf, "Error: --%ssnps range-ending variant ID '%s' appears multiple times.\n", do_exclude? "exclude-" : "", cur_varid);
+	    goto snps_flag_ret_INCONSISTENT_INPUT_WW;
+	  }
+	  if (variant_uidx < range_start_vidx) {
+	    const uint32_t uii = variant_uidx;
+	    variant_uidx = range_start_vidx;
+	    range_start_vidx = uii;
+	  }
+	  fill_bits_nz(range_start_vidx, variant_uidx + 1, seen_uidxs);
+	} else {
+	  while (1) {
+	    set_bit(variant_uidx, seen_uidxs);
+	    if (cur_llidx == 0xffffffffU) {
+	      break;
+	    }
+	    variant_uidx = htable_dup_base[cur_llidx];
+	    cur_llidx = htable_dup_base[cur_llidx + 1];
+	  }
+	}
+	range_start_vidx = 0xffffffffU;
+      }
+    }
+    if (do_exclude) {
+      bitvec_andnot(seen_uidxs, raw_variant_ctl, variant_include);
+    } else {
+      bitvec_and(seen_uidxs, raw_variant_ctl, variant_include);
+    }
+    const uint32_t new_variant_ct = popcount_longs(variant_include, raw_variant_ctl);
+    LOGPRINTF("--%ssnps: %u variant%s remaining.\n", do_exclude? "exclude-" : "", new_variant_ct, (new_variant_ct == 1)? "" : "s");
+    *variant_ct_ptr = new_variant_ct;
+  }
+  while (0) {
+  snps_flag_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  snps_flag_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+void extract_exclude_process_token(char** variant_ids, const uint32_t* variant_id_htable, const uint32_t* htable_dup_base, const char* tok_start, uint32_t variant_id_htable_size, uintptr_t max_variant_id_slen, uint32_t token_slen, uintptr_t* already_seen, uintptr_t* duplicate_ct_ptr) {
+  uint32_t cur_llidx;
+  uint32_t variant_uidx = variant_id_dup_htable_find(tok_start, variant_ids, variant_id_htable, htable_dup_base, token_slen, variant_id_htable_size, max_variant_id_slen, &cur_llidx);
+  if (variant_uidx == 0xffffffffU) {
+    return;
+  }
+  if (IS_SET(already_seen, variant_uidx)) {
+    *duplicate_ct_ptr += 1;
+  } else {
+    while (1) {
+      SET_BIT(variant_uidx, already_seen);
+      if (cur_llidx == 0xffffffffU) {
+	return;
+      }
+      variant_uidx = htable_dup_base[cur_llidx];
+      cur_llidx = htable_dup_base[cur_llidx + 1];
+    }
+  }
+}
+
+pglerr_t extract_exclude_flag_norange(char** variant_ids, const uint32_t* variant_id_htable, const char* fnames, uint32_t raw_variant_ct, uintptr_t max_variant_id_slen, uintptr_t variant_id_htable_size, uint32_t do_exclude, uintptr_t* variant_include, uint32_t* variant_ct_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  gz_token_stream_t gts;
+  gz_token_stream_preinit(&gts);
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    // possible todo: multithreaded read/htable lookup
+    const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    uintptr_t* already_seen;
+    if (bigstack_calloc_ul(raw_variant_ctl, &already_seen)) {
+      goto extract_exclude_flag_norange_ret_NOMEM;
+    }
+    const uint32_t* htable_dup_base = &(variant_id_htable[round_up_pow2(variant_id_htable_size, kInt32PerCacheline)]);
+    const char* fnames_iter = fnames;
+    uintptr_t duplicate_ct = 0;
+    do {
+      reterr = gz_token_stream_init(fnames_iter, &gts, g_textbuf);
+      if (reterr) {
+	goto extract_exclude_flag_norange_ret_1;
+      }
+      uint32_t token_slen;
+      while (1) {
+	char* token_start = gz_token_stream_advance(&gts, &token_slen);
+	if (!token_start) {
+	  break;
+	}
+	extract_exclude_process_token(variant_ids, variant_id_htable, htable_dup_base, token_start, variant_id_htable_size, max_variant_id_slen, token_slen, already_seen, &duplicate_ct);
+      }
+      if (token_slen) {
+	// error code
+	if (token_slen == 0xffffffffU) {
+	  sprintf(g_logbuf, "Error: Excessively long ID in --%s file.\n", do_exclude? "exclude" : "extract");
+	  goto extract_exclude_flag_norange_ret_MALFORMED_INPUT_2;
+	}
+	goto extract_exclude_flag_norange_ret_READ_FAIL;
+      }
+      if (gz_token_stream_close(&gts)) {
+	goto extract_exclude_flag_norange_ret_READ_FAIL;
+      }
+      fnames_iter = (const char*)rawmemchr(fnames_iter, '\0');
+      ++fnames_iter;
+    } while (*fnames_iter);
+    if (do_exclude) {
+      bitvec_andnot(already_seen, raw_variant_ctl, variant_include);
+    } else {
+      bitvec_and(already_seen, raw_variant_ctl, variant_include);
+    }
+    const uint32_t new_variant_ct = popcount_longs(variant_include, raw_variant_ctl);
+    LOGPRINTF("--%s: %u variant%s remaining.\n", do_exclude? "exclude" : "extract", new_variant_ct, (new_variant_ct == 1)? "" : "s");
+    *variant_ct_ptr = new_variant_ct;
+    if (duplicate_ct) {
+      LOGERRPRINTF("Warning: At least %" PRIuPTR " duplicate ID%s in --%s file(s).\n", duplicate_ct, (duplicate_ct == 1)? "" : "s", do_exclude? "exclude" : "extract");
+    }
+  }
+  while (0) {
+  extract_exclude_flag_norange_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  extract_exclude_flag_norange_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  extract_exclude_flag_norange_ret_MALFORMED_INPUT_2:
+    logerrprintb();
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+ extract_exclude_flag_norange_ret_1:
+  gz_token_stream_close(&gts);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+static const char keep_remove_flag_strs[4][11] = {"keep", "remove", "keep-fam", "remove-fam"};
+
+pglerr_t keep_or_remove(const char* fnames, const char* sample_ids, const char* sids, uint32_t raw_sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, keep_flags_t flags, uintptr_t* sample_include, uint32_t* sample_ct_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  const char* flag_name = keep_remove_flag_strs[flags % 4];
+  gzFile gz_infile = nullptr;
+  uintptr_t line_idx = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    uintptr_t* seen_uidxs;
+    if (bigstack_calloc_ul(raw_sample_ctl, &seen_uidxs)) {
+      goto keep_or_remove_ret_NOMEM;
+    }
+    uintptr_t loadbuf_size = bigstack_left();
+    loadbuf_size -= loadbuf_size / 4;
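+    // reserve ~25% of the remaining workspace for the xid-related
+    // allocations below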
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else if (loadbuf_size <= kMaxMediumLine) {
+      goto keep_or_remove_ret_NOMEM;
+    } else {
+      loadbuf_size = round_up_pow2(loadbuf_size, kCacheline);
+    }
+    char* loadbuf = (char*)bigstack_alloc_raw(loadbuf_size);
+    loadbuf[loadbuf_size - 1] = ' ';
+    
+    const uint32_t families_only = flags & kfKeepFam;
+    const uint32_t orig_sample_ct = *sample_ct_ptr;
+    char* idbuf = nullptr;
+    uint32_t* xid_map = nullptr;
+    char* sorted_xidbox = nullptr;
+    uintptr_t max_xid_blen = max_sample_id_blen - 1;
+    if (families_only) {
+      // only need to do this once
+      if (bigstack_alloc_ui(orig_sample_ct, &xid_map) ||
+	  bigstack_alloc_c(orig_sample_ct * max_xid_blen, &sorted_xidbox)) {
+	goto keep_or_remove_ret_NOMEM;
+      }
+      uint32_t sample_uidx = 0;
+      for (uint32_t sample_idx = 0; sample_idx < orig_sample_ct; ++sample_idx, ++sample_uidx) {
+	next_set_unsafe_ck(sample_include, &sample_uidx);
+	const char* fidt_ptr = &(sample_ids[sample_uidx * max_sample_id_blen]);
+	const char* fidt_end = (const char*)rawmemchr(fidt_ptr, '\t');
+        const uint32_t cur_fidt_slen = 1 + (uintptr_t)(fidt_end - fidt_ptr);
+	// include trailing tab, to simplify bsearch_str_lb() usage
+	memcpyx(&(sorted_xidbox[sample_idx * max_xid_blen]), fidt_ptr, cur_fidt_slen, '\0');
+	xid_map[sample_idx] = sample_uidx;
+      }
+      if (sort_strbox_indexed(orig_sample_ct, max_xid_blen, 0, sorted_xidbox, xid_map)) {
+	goto keep_or_remove_ret_NOMEM;
+      }
+    }
+    unsigned char* bigstack_mark2 = g_bigstack_base;
+    const char* fnames_iter = fnames;
+    uintptr_t duplicate_ct = 0;
+    do {
+      reterr = gzopen_read_checked(fnames_iter, &gz_infile);
+      if (reterr) {
+	goto keep_or_remove_ret_1;
+      }
+      char* loadbuf_first_token;
+      xid_mode_t xid_mode;
+      if (!families_only) {
+	reterr = load_xid_header(flag_name, (flags & kfKeepForceSid)? kSidDetectModeForce : (sids? kSidDetectModeLoaded : kSidDetectModeNotLoaded), loadbuf_size, loadbuf, nullptr, &line_idx, &loadbuf_first_token, &gz_infile, &xid_mode);
+	if (reterr) {
+	  if (reterr == kPglRetEmptyFile) {
+	    reterr = kPglRetSuccess;
+	    goto keep_or_remove_empty_file;
+	  }
+	  if (reterr == kPglRetLongLine) {
+	    if (loadbuf_size == kMaxLongLine) {
+	      goto keep_or_remove_ret_LONG_LINE;
+	    }
+	    goto keep_or_remove_ret_NOMEM;
+	  }
+	  goto keep_or_remove_ret_1;
+	}
+	reterr = sorted_xidbox_init_alloc(sample_include, sample_ids, sids, orig_sample_ct, max_sample_id_blen, max_sid_blen, xid_mode, 0, &sorted_xidbox, &xid_map, &max_xid_blen);
+	if (reterr) {
+	  goto keep_or_remove_ret_1;
+	}
+	if (bigstack_alloc_c(max_xid_blen, &idbuf)) {
+	  goto keep_or_remove_ret_NOMEM;
+	}
+      } else {
+	loadbuf_first_token = loadbuf;
+	loadbuf[0] = '\0';
+      }
+      while (1) {
+	if (!is_eoln_kns(*loadbuf_first_token)) {
+	  if (!families_only) {
+	    char* loadbuf_iter = loadbuf_first_token;
+	    uint32_t sample_uidx;
+	    if (!sorted_xidbox_read_find(sorted_xidbox, xid_map, max_xid_blen, orig_sample_ct, 0, xid_mode, &loadbuf_iter, &sample_uidx, idbuf)) {
+	      if (IS_SET(seen_uidxs, sample_uidx)) {
+		++duplicate_ct;
+	      } else {
+		SET_BIT(sample_uidx, seen_uidxs);
+	      }
+	    } else if (!loadbuf_iter) {
+	      goto keep_or_remove_ret_MISSING_TOKENS;
+	    }
+	  } else {
+	    char* token_end = token_endnn(loadbuf_first_token);
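+	    // ' ' sorts after '\t', so searching once with a '\t' terminator
+	    // and once with ' ' brackets every "FID\t..." entry in the sorted
+	    // box: [lb_idx, ub_idx) covers all samples with this FID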
+	    *token_end = '\t';
+	    const uint32_t slen = 1 + (uintptr_t)(token_end - loadbuf_first_token);
+	    uint32_t lb_idx = bsearch_str_lb(loadbuf_first_token, sorted_xidbox, slen, max_xid_blen, orig_sample_ct);
+	    *token_end = ' ';
+	    const uint32_t ub_idx = bsearch_str_lb(loadbuf_first_token, sorted_xidbox, slen, max_xid_blen, orig_sample_ct);
+	    if (ub_idx != lb_idx) {
+	      uint32_t sample_uidx = xid_map[lb_idx];
+	      if (IS_SET(seen_uidxs, sample_uidx)) {
+		++duplicate_ct;
+	      } else {
+		while (1) {
+		  SET_BIT(sample_uidx, seen_uidxs);
+		  if (++lb_idx == ub_idx) {
+		    break;
+		  }
+		  sample_uidx = xid_map[lb_idx];
+		}
+	      }
+	    }
+	  }
+	}
+	++line_idx;
+	if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	  if (!gzeof(gz_infile)) {
+	    goto keep_or_remove_ret_READ_FAIL;
+	  }
+	  goto keep_or_remove_empty_file;
+	}
+	if (!loadbuf[loadbuf_size - 1]) {
+	  if (loadbuf_size == kMaxLongLine) {
+	    goto keep_or_remove_ret_LONG_LINE;
+	  }
+	  goto keep_or_remove_ret_NOMEM;
+	}
+	loadbuf_first_token = skip_initial_spaces(loadbuf);
+      }
+    keep_or_remove_empty_file:
+      if (gzclose_null(&gz_infile)) {
+	goto keep_or_remove_ret_READ_FAIL;
+      }
+      bigstack_reset(bigstack_mark2);
+      fnames_iter = (const char*)rawmemchr(fnames_iter, '\0');
+      ++fnames_iter;
+    } while (*fnames_iter);
+    if (flags & kfKeepRemove) {
+      bitvec_andnot(seen_uidxs, raw_sample_ctl, sample_include);
+    } else {
+      memcpy(sample_include, seen_uidxs, raw_sample_ctl * sizeof(intptr_t));
+    }
+    const uint32_t sample_ct = popcount_longs(sample_include, raw_sample_ctl);
+    *sample_ct_ptr = sample_ct;
+    LOGPRINTF("--%s: %u sample%s remaining.\n", flag_name, sample_ct, (sample_ct == 1)? "" : "s");
+    if (duplicate_ct) {
+      // "At least" since this does not count duplicate IDs absent from the
+      // .fam.
+      LOGERRPRINTF("Warning: At least %" PRIuPTR " duplicate ID%s in --%s file.(s)\n", duplicate_ct, (duplicate_ct == 1)? "" : "s", flag_name);
+    }
+  }
+  while (0) {
+  keep_or_remove_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  keep_or_remove_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  keep_or_remove_ret_LONG_LINE:
+    LOGERRPRINTF("Error: Line %" PRIuPTR " of --%s file is pathologically long.\n", line_idx, flag_name);
+    reterr = kPglRetMalformedInput;
+    break;
+  keep_or_remove_ret_MISSING_TOKENS:
+    LOGERRPRINTF("Error: Line %" PRIuPTR " of --%s file has fewer tokens than expected.\n", line_idx, flag_name);
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+ keep_or_remove_ret_1:
+  bigstack_reset(bigstack_mark);
+  gzclose_cond(gz_infile);
+  return reterr;
+}
+
+pglerr_t require_pheno(const pheno_col_t* pheno_cols, const char* pheno_names, char* require_pheno_flattened, uint32_t raw_sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t is_covar, uintptr_t* sample_include, uint32_t* sample_ct_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    uint32_t required_pheno_ct = 0;
+    uintptr_t max_required_pheno_blen = 2;
+    uintptr_t* matched_phenos = nullptr;
+    char* sorted_required_pheno_names = nullptr;
+    if (require_pheno_flattened) {
+      char** strptr_arr = (char**)bigstack_end_mark;
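+      // the string-pointer array is filled downward from the workspace end,
+      // while the deduplicated name box is built upward from the base; the
+      // bound check below guarantees the two regions don't collide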
+      if (count_and_measure_multistr_reverse_alloc(require_pheno_flattened, bigstack_left() / sizeof(intptr_t), &required_pheno_ct, &max_required_pheno_blen, &strptr_arr)) {
+	goto require_pheno_ret_NOMEM;
+      }
+      if ((uintptr_t)(((unsigned char*)strptr_arr) - g_bigstack_base) < required_pheno_ct * max_required_pheno_blen) {
+	goto require_pheno_ret_NOMEM;
+      }
+      strptr_arr_sort(required_pheno_ct, strptr_arr);
+      sorted_required_pheno_names = (char*)g_bigstack_base;
+      required_pheno_ct = copy_and_dedup_sorted_strptrs_to_strbox(strptr_arr, required_pheno_ct, max_required_pheno_blen, sorted_required_pheno_names);
+      bigstack_end_reset(bigstack_end_mark);
+      bigstack_finalize_c(sorted_required_pheno_names, required_pheno_ct * max_required_pheno_blen);
+      if (bigstack_calloc_ul(1 + (required_pheno_ct / kBitsPerWord), &matched_phenos)) {
+	goto require_pheno_ret_NOMEM;
+      }
+    } else {
+      if (!pheno_ct) {
+	logerrprint(is_covar? "Warning: No covariates loaded; ignoring --require-covar.\n" : "Warning: No phenotypes loaded; ignoring --require-pheno.\n");
+	goto require_pheno_ret_1;
+      }
+      required_pheno_ct = pheno_ct;
+    }
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+      if (sorted_required_pheno_names) {
+	const char* cur_pheno_name = &(pheno_names[pheno_idx * max_pheno_name_blen]);
+	const int32_t ii = bsearch_str(cur_pheno_name, sorted_required_pheno_names, strlen(cur_pheno_name), max_required_pheno_blen, required_pheno_ct);
+	if (ii == -1) {
+	  continue;
+	}
+	set_bit(ii, matched_phenos);
+      }
+      bitvec_and(pheno_cols[pheno_idx].nonmiss, raw_sample_ctl, sample_include);
+    }
+    if (matched_phenos) {
+      const uint32_t first_unmatched_idx = next_unset_unsafe(matched_phenos, 0);
+      if (first_unmatched_idx < required_pheno_ct) {
+	LOGERRPRINTFWW("Error: --require-%s '%s' not loaded.\n", is_covar? "covar covariate" : "pheno phenotype", &(sorted_required_pheno_names[first_unmatched_idx * max_required_pheno_blen]));
+	goto require_pheno_ret_INCONSISTENT_INPUT;
+      }
+    }
+    const uint32_t new_sample_ct = popcount_longs(sample_include, raw_sample_ctl);
+    const uint32_t removed_sample_ct = (*sample_ct_ptr) - new_sample_ct;
+    LOGPRINTF("--require-%s: %u sample%s removed.\n", is_covar? "covar" : "pheno", removed_sample_ct, (removed_sample_ct == 1)? "" : "s");
+    *sample_ct_ptr = new_sample_ct;
+  }
+  while (0) {
+  require_pheno_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  require_pheno_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ require_pheno_ret_1:
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  return reterr;
+}
+
+pglerr_t keep_remove_if(const cmp_expr_t* cmp_expr, const pheno_col_t* pheno_cols, const char* pheno_names, const pheno_col_t* covar_cols, const char* covar_names, uint32_t raw_sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t covar_ct, uintptr_t max_covar_name_blen, uint32_t affection_01, uint32_t is_remove, uintptr_t* sample_include, uint32_t* sample_ct_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const char* cur_name = cmp_expr->pheno_name;
+    const uintptr_t name_blen = 1 + strlen(cur_name);
+    const pheno_col_t* cur_pheno_col = nullptr;
+    if (name_blen <= max_pheno_name_blen) {
+      for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	if (!memcmp(cur_name, &(pheno_names[pheno_idx * max_pheno_name_blen]), name_blen)) {
+	  cur_pheno_col = &(pheno_cols[pheno_idx]);
+	  break;
+	}
+      }
+    }
+    if (!cur_pheno_col) {
+      if (name_blen <= max_covar_name_blen) {
+	for (uint32_t covar_idx = 0; covar_idx < covar_ct; ++covar_idx) {
+	  if (!memcmp(cur_name, &(covar_names[covar_idx * max_covar_name_blen]), name_blen)) {
+	    cur_pheno_col = &(covar_cols[covar_idx]);
+	    break;
+	  }
+	}
+      }
+    }
+    if (!cur_pheno_col) {
+      sprintf(g_logbuf, "Error: --%s-if phenotype/covariate not loaded.\n", is_remove? "remove" : "keep");
+      goto keep_remove_if_ret_INCONSISTENT_INPUT_2;
+    }
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    cmp_binary_op_t binary_op = cmp_expr->binary_op;
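+    // a missing phenotype value fails every comparison except '!=';
+    // pheno_must_exist is set exactly when the retained samples must satisfy
+    // a positive condition, in which case missing-phenotype samples can be
+    // filtered out up front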
+    const uint32_t pheno_must_exist = is_remove ^ (binary_op != kCmpOperatorNoteq);
+    const uintptr_t* pheno_nm = cur_pheno_col->nonmiss;
+    if (pheno_must_exist) {
+      bitvec_and(pheno_nm, raw_sample_ctl, sample_include);
+    }
+    uintptr_t* sample_include_intersect;
+    if (bigstack_alloc_ul(raw_sample_ctl, &sample_include_intersect)) {
+      goto keep_remove_if_ret_NOMEM;
+    }
+    memcpy(sample_include_intersect, sample_include, raw_sample_ctl * sizeof(intptr_t));
+    if (!pheno_must_exist) {
+      bitvec_and(pheno_nm, raw_sample_ctl, sample_include_intersect);
+    }
+    const uint32_t sample_intersect_ct = popcount_longs(sample_include_intersect, raw_sample_ctl);
+    const char* cur_val_str = &(cur_name[name_blen]);
+    const uint32_t val_slen = strlen(cur_val_str);
+    if (cur_pheno_col->type_code == kPhenoDtypeQt) {
+      double val;
+      if (!scanadv_double((char*)cur_val_str, &val)) {
+	sprintf(g_logbuf, "Error: Invalid --%s-if value (number expected).\n", is_remove? "remove" : "keep");
+	goto keep_remove_if_ret_INCONSISTENT_INPUT_2;
+      }
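+      // negate the operator for --remove-if (!= <-> ==, < <-> >=, <= <-> >),
+      // relying on the enum ordering; the switch below then uniformly clears
+      // samples which fail the effective keep-condition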
+      if (is_remove) {
+	binary_op = (cmp_binary_op_t)(kCmpOperatorEq - (uint32_t)binary_op);
+      }
+      const double* pheno_vals = cur_pheno_col->data.qt;
+      uint32_t sample_uidx = 0;
+      switch (binary_op) {
+      case kCmpOperatorNoteq:
+	for (uint32_t sample_idx = 0; sample_idx < sample_intersect_ct; ++sample_idx, ++sample_uidx) {
+	  next_set_unsafe_ck(sample_include_intersect, &sample_uidx);
+	  if (pheno_vals[sample_uidx] == val) {
+	    clear_bit(sample_uidx, sample_include);
+	  }
+	}
+	break;
+      case kCmpOperatorLe:
+	for (uint32_t sample_idx = 0; sample_idx < sample_intersect_ct; ++sample_idx, ++sample_uidx) {
+	  next_set_unsafe_ck(sample_include_intersect, &sample_uidx);
+	  if (pheno_vals[sample_uidx] >= val) {
+	    clear_bit(sample_uidx, sample_include);
+	  }
+	}
+	break;
+      case kCmpOperatorLeq:
+	for (uint32_t sample_idx = 0; sample_idx < sample_intersect_ct; ++sample_idx, ++sample_uidx) {
+	  next_set_unsafe_ck(sample_include_intersect, &sample_uidx);
+	  if (pheno_vals[sample_uidx] > val) {
+	    clear_bit(sample_uidx, sample_include);
+	  }
+	}
+	break;
+      case kCmpOperatorGe:
+	for (uint32_t sample_idx = 0; sample_idx < sample_intersect_ct; ++sample_idx, ++sample_uidx) {
+	  next_set_unsafe_ck(sample_include_intersect, &sample_uidx);
+	  if (pheno_vals[sample_uidx] <= val) {
+	    clear_bit(sample_uidx, sample_include);
+	  }
+	}
+	break;
+      case kCmpOperatorGeq:
+	for (uint32_t sample_idx = 0; sample_idx < sample_intersect_ct; ++sample_idx, ++sample_uidx) {
+	  next_set_unsafe_ck(sample_include_intersect, &sample_uidx);
+	  if (pheno_vals[sample_uidx] < val) {
+	    clear_bit(sample_uidx, sample_include);
+	  }
+	}
+	break;
+      case kCmpOperatorEq:
+	for (uint32_t sample_idx = 0; sample_idx < sample_intersect_ct; ++sample_idx, ++sample_uidx) {
+	  next_set_unsafe_ck(sample_include_intersect, &sample_uidx);
+	  if (pheno_vals[sample_uidx] != val) {
+	    clear_bit(sample_uidx, sample_include);
+	  }
+	}
+	break;
+      }
+    } else {
+      if ((binary_op != kCmpOperatorNoteq) && (binary_op != kCmpOperatorEq)) {
+	sprintf(g_logbuf, "Error: --%s-if operator type mismatch (binary and categorical phenotypes only support == and !=).\n", is_remove? "remove" : "keep");
+	goto keep_remove_if_ret_INCONSISTENT_INPUT_WW;
+      }
+      if (cur_pheno_col->type_code == kPhenoDtypeCc) {
+	uint32_t val_12 = 0; // 1 = control, 2 = case
+	if (val_slen == 1) {
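+	  // '0' == 48, so with the default 1/2 coding (affection_01 == 0)
+	  // this maps '1'->control and '2'->case, while with 0/1 coding
+	  // (affection_01 == 1) it maps '0'->control and '1'->case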
+	  val_12 = affection_01 + (uint32_t)((unsigned char)cur_val_str[0]) - 48;
+	  if ((val_12 != 1) && (val_12 != 2)) {
+	    val_12 = 0;
+	  } 
+	} else if (val_slen == 4) {
+	  if (match_upper_counted(cur_val_str, "CASE", 4)) {
+	    val_12 = 2;
+	  } else if (match_upper_counted(cur_val_str, "CTRL", 4)) {
+	    val_12 = 1;
+	  }
+	} else if (val_slen == 7) {
+	  if (match_upper_counted(cur_val_str, "CONTROL", 7)) {
+	    val_12 = 1;
+	  }
+	}
+	if (!val_12) {
+	  sprintf(g_logbuf, "Error: Invalid --%s-if value ('case'/'%c' or 'control'/'ctrl'/'%c' expected).\n", is_remove? "remove" : "keep", '2' - affection_01, '1' - affection_01);
+	  goto keep_remove_if_ret_INCONSISTENT_INPUT_WW;
+	}
+	if (is_remove ^ (val_12 == 2)) {
+	  bitvec_and(cur_pheno_col->data.cc, raw_sample_ctl, sample_include);
+	} else {
+	  bitvec_andnot(cur_pheno_col->data.cc, raw_sample_ctl, sample_include);
+	}
+      } else {
+        assert(cur_pheno_col->type_code == kPhenoDtypeCat);
+	const uint32_t nonnull_cat_ct = cur_pheno_col->nonnull_category_ct;
+	uint32_t cat_idx = 1;
+	for (; cat_idx <= nonnull_cat_ct; ++cat_idx) {
+	  if (!strcmp(cur_val_str, cur_pheno_col->category_names[cat_idx])) {
+	    break;
+	  }
+	}
+	if (cat_idx == nonnull_cat_ct + 1) {
+	  double dxx;
+	  if (scanadv_double((char*)cur_val_str, &dxx)) {
+	    sprintf(g_logbuf, "Error: Invalid --%s-if value (category name expected).\n", is_remove? "remove" : "keep");
+	    goto keep_remove_if_ret_INCONSISTENT_INPUT_2;
+	  }
+	  // tolerate this, since there are legitimate reasons for empty
+	  // categories to exist
+	  LOGERRPRINTFWW("Warning: Categorical phenotype/covariate '%s' does not have a category named '%s'.\n", cur_name, cur_val_str);
+	  if (pheno_must_exist) {
+	    fill_ulong_zero(raw_sample_ctl, sample_include);
+	  }
+	} else {
+	  const uint32_t* cur_cats = cur_pheno_col->data.cat;
+	  uint32_t sample_uidx = 0;
+	  if (pheno_must_exist) {
+	    for (uint32_t sample_idx = 0; sample_idx < sample_intersect_ct; ++sample_idx, ++sample_uidx) {
+	      next_set_unsafe_ck(sample_include_intersect, &sample_uidx);
+	      if (cur_cats[sample_uidx] != cat_idx) {
+		clear_bit(sample_uidx, sample_include);
+	      }
+	    }
+	  } else {
+	    for (uint32_t sample_idx = 0; sample_idx < sample_intersect_ct; ++sample_idx, ++sample_uidx) {
+	      next_set_unsafe_ck(sample_include_intersect, &sample_uidx);
+	      if (cur_cats[sample_uidx] == cat_idx) {
+		clear_bit(sample_uidx, sample_include);
+	      }
+	    }
+	  }
+	}
+      }
+    }
+
+    const uint32_t new_sample_ct = popcount_longs(sample_include, raw_sample_ctl);
+    const uint32_t removed_sample_ct = (*sample_ct_ptr) - new_sample_ct;
+    LOGPRINTF("--%s-if: %u sample%s removed.\n", is_remove? "remove" : "keep", removed_sample_ct, (removed_sample_ct == 1)? "" : "s");
+    *sample_ct_ptr = new_sample_ct;
+  }
+  while (0) {
+  keep_remove_if_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  keep_remove_if_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+  keep_remove_if_ret_INCONSISTENT_INPUT_2:
+    logerrprintb();
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t keep_remove_cats_internal(const pheno_col_t* cur_pheno_col, const char* cats_fname, const char* cat_names_flattened, uint32_t raw_sample_ct, uint32_t is_remove, uint32_t max_thread_ct, uintptr_t* sample_include, uint32_t* sample_ct_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  gz_token_stream_t gts;
+  gz_token_stream_preinit(&gts);
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    const uint32_t cat_ct = cur_pheno_col->nonnull_category_ct + 1;
+    const uint32_t cat_ctl = BITCT_TO_WORDCT(cat_ct);
+    uintptr_t* affected_samples;
+    uintptr_t* cat_include;
+    if (bigstack_calloc_ul(raw_sample_ctl, &affected_samples) ||
+        bigstack_alloc_ul(cat_ctl, &cat_include)) {
+      goto keep_remove_cats_internal_ret_NOMEM;
+    }
+    fill_all_bits(cat_ct, cat_include);
+    char** category_names = cur_pheno_col->category_names;
+    uint32_t* cat_id_htable;
+    uint32_t id_htable_size;
+    reterr = alloc_and_populate_id_htable_mt(cat_include, category_names, cat_ct, max_thread_ct, &cat_id_htable, nullptr, &id_htable_size);
+    if (reterr) {
+      goto keep_remove_cats_internal_ret_1;
+    }
+    fill_ulong_zero(cat_ctl, cat_include);
+    if (cats_fname) {
+      reterr = gz_token_stream_init(cats_fname, &gts, g_textbuf);
+      if (reterr) {
+	goto keep_remove_cats_internal_ret_1;
+      }
+      uintptr_t skip_ct = 0;
+      uint32_t token_slen;
+      while (1) {
+	char* token_start = gz_token_stream_advance(&gts, &token_slen);
+	if (!token_start) {
+	  break;
+	}
+	token_start[token_slen] = '\0';
+        const uint32_t cur_cat_idx = id_htable_find(token_start, category_names, cat_id_htable, token_slen, id_htable_size);
+	if (cur_cat_idx == 0xffffffffU) {
+	  ++skip_ct;
+	} else {
+	  set_bit(cur_cat_idx, cat_include);
+	}
+      }
+      if (token_slen) {
+	// error code
+	if (token_slen == 0xffffffffU) {
+	  sprintf(g_logbuf, "Error: Excessively long ID in --%s-cats file.\n", is_remove? "remove" : "keep");
+	  goto keep_remove_cats_internal_ret_MALFORMED_INPUT_2;
+	}
+	goto keep_remove_cats_internal_ret_READ_FAIL;
+      }
+      if (gz_token_stream_close(&gts)) {
+	goto keep_remove_cats_internal_ret_READ_FAIL;
+      }
+      if (skip_ct) {
+	LOGERRPRINTF("Warning: %" PRIuPTR " --%s-cats categor%s not present.\n", skip_ct, is_remove? "remove" : "keep", (skip_ct == 1)? "y" : "ies");
+      }
+    }
+    if (cat_names_flattened) {
+      uint32_t skip_ct = 0;
+      const char* cat_names_iter = cat_names_flattened;
+      do {
+	const uint32_t cat_name_slen = strlen(cat_names_iter);
+	const uint32_t cur_cat_idx = id_htable_find(cat_names_iter, category_names, cat_id_htable, cat_name_slen, id_htable_size);
+	if (cur_cat_idx == 0xffffffffU) {
+	  ++skip_ct;
+	} else {
+	  set_bit(cur_cat_idx, cat_include);
+	}
+	cat_names_iter = &(cat_names_iter[cat_name_slen + 1]);
+      } while (*cat_names_iter);
+      if (skip_ct) {
+	LOGERRPRINTF("Warning: %u --%s-cat-names categor%s not present.\n", skip_ct, is_remove? "remove" : "keep", (skip_ct == 1)? "y" : "ies");
+      }
+    }
+    const uint32_t selected_cat_ct = popcount_longs(cat_include, cat_ctl);
+    if (!selected_cat_ct) {
+      LOGERRPRINTF("Warning: No matching --%s-cat-names category names.\n", is_remove? "remove-cats/--remove" : "keep-cats/--keep");
+    } else {
+      const uint32_t* cur_cats = cur_pheno_col->data.cat;
+      const uint32_t orig_sample_ct = *sample_ct_ptr;
+      uint32_t sample_uidx = 0;
+      for (uint32_t sample_idx = 0; sample_idx < orig_sample_ct; ++sample_idx, ++sample_uidx) {
+	next_set_unsafe_ck(sample_include, &sample_uidx);
+	const uint32_t cur_cat_idx = cur_cats[sample_uidx];
+	if (is_set(cat_include, cur_cat_idx)) {
+	  set_bit(sample_uidx, affected_samples);
+	}
+      }
+      if (is_remove) {
+	bitvec_andnot(affected_samples, raw_sample_ctl, sample_include);
+      } else {
+	bitvec_and(affected_samples, raw_sample_ctl, sample_include);
+      }
+      const uint32_t new_sample_ct = popcount_longs(sample_include, raw_sample_ctl);
+      const uint32_t removed_sample_ct = (*sample_ct_ptr) - new_sample_ct;
+      LOGPRINTFWW("--%s-cat-names: %u categor%s selected, %u sample%s removed.\n", is_remove? "remove-cats/--remove" : "keep-cats/--keep", selected_cat_ct, (selected_cat_ct == 1)? "y" : "ies", removed_sample_ct, (removed_sample_ct == 1)? "" : "s");
+      *sample_ct_ptr = new_sample_ct;
+    }
+  }
+  while (0) {
+  keep_remove_cats_internal_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  keep_remove_cats_internal_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  keep_remove_cats_internal_ret_MALFORMED_INPUT_2:
+    logerrprintb();
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+ keep_remove_cats_internal_ret_1:
+  gz_token_stream_close(&gts);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t keep_remove_cats(const char* cats_fname, const char* cat_names_flattened, const char* cat_phenoname, const pheno_col_t* pheno_cols, const char* pheno_names, const pheno_col_t* covar_cols, const char* covar_names, uint32_t raw_sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t covar_ct, uintptr_t max_covar_name_blen, uint32_t is_remove, uint32_t max_thread_ct, uintptr_t* sample_include, uint32_t* sample_ct_ptr) {
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if (!cat_phenoname) {
+      // Default behavior:
+      // 1. If at least one categorical phenotype exists, fail if there are
+      //    two or more; otherwise select the lone categorical phenotype.
+      // 2. Otherwise, fail if there are zero or >= 2 categorical covariates;
+      //    select the categorical covariate if there's exactly one.
+      uint32_t cat_pheno_idx = 0xffffffffU;
+      const pheno_col_t* cur_pheno_col = nullptr;
+      for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	if (pheno_cols[pheno_idx].type_code == kPhenoDtypeCat) {
+	  if (cat_pheno_idx != 0xffffffffU) {
+	    sprintf(g_logbuf, "Error: Multiple categorical phenotypes present. Use --%s-cat-pheno to specify which phenotype/covariate you want to filter on.\n", is_remove? "remove" : "keep");
+	    goto keep_remove_cats_ret_INCONSISTENT_INPUT_WW;
+	  }
+	  cat_pheno_idx = pheno_idx;
+	}
+      }
+      if (cat_pheno_idx != 0xffffffffU) {
+        cur_pheno_col = &(pheno_cols[cat_pheno_idx]);
+      } else {
+	for (uint32_t covar_idx = 0; covar_idx < covar_ct; ++covar_idx) {
+	  if (covar_cols[covar_idx].type_code == kPhenoDtypeCat) {
+	    if (cat_pheno_idx != 0xffffffffU) {
+	      sprintf(g_logbuf, "Error: Multiple categorical covariates and no categorical phenotype present. Use --%s-cat-pheno to specify which phenotype/covariate you want to filter on.\n", is_remove? "remove" : "keep");
+	      goto keep_remove_cats_ret_INCONSISTENT_INPUT_WW;
+	    }
+	    cat_pheno_idx = covar_idx;
+	  }
+	}
+	if (cat_pheno_idx == 0xffffffffU) {
+	  sprintf(g_logbuf, "Error: --%s-cat-names requires a categorical phenotype or covariate.\n", is_remove? "remove-cats/--remove" : "keep-cats/--keep");
+	  goto keep_remove_cats_ret_INCONSISTENT_INPUT_WW;
+	}
+	cur_pheno_col = &(covar_cols[cat_pheno_idx]);
+      }
+      reterr = keep_remove_cats_internal(cur_pheno_col, cats_fname, cat_names_flattened, raw_sample_ct, is_remove, max_thread_ct, sample_include, sample_ct_ptr);
+      if (reterr) {
+	goto keep_remove_cats_ret_1;
+      }
+    } else {
+      const uintptr_t name_blen = 1 + strlen(cat_phenoname);
+      uint32_t success = 0;
+      if (name_blen <= max_pheno_name_blen) {
+	for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	  if (!memcmp(cat_phenoname, &(pheno_names[pheno_idx * max_pheno_name_blen]), name_blen)) {
+	    const pheno_col_t* cur_pheno_col = &(pheno_cols[pheno_idx]);
+	    if (cur_pheno_col->type_code != kPhenoDtypeCat) {
+	      sprintf(g_logbuf, "Error: '%s' is not a categorical phenotype.\n", cat_phenoname);
+	      goto keep_remove_cats_ret_INCONSISTENT_INPUT_WW;
+	    }
+	    reterr = keep_remove_cats_internal(cur_pheno_col, cats_fname, cat_names_flattened, raw_sample_ct, is_remove, max_thread_ct, sample_include, sample_ct_ptr);
+	    if (reterr) {
+	      goto keep_remove_cats_ret_1;
+	    }
+	    success = 1;
+	    break;
+	  }
+	}
+      }
+      if (name_blen <= max_covar_name_blen) {
+	for (uint32_t covar_idx = 0; covar_idx < covar_ct; ++covar_idx) {
+	  if (!memcmp(cat_phenoname, &(covar_names[covar_idx * max_covar_name_blen]), name_blen)) {
+	    const pheno_col_t* cur_pheno_col = &(covar_cols[covar_idx]);
+	    if (cur_pheno_col->type_code != kPhenoDtypeCat) {
+	      sprintf(g_logbuf, "Error: '%s' is not a categorical covariate.\n", cat_phenoname);
+	      goto keep_remove_cats_ret_INCONSISTENT_INPUT_WW;
+	    }
+	    reterr = keep_remove_cats_internal(cur_pheno_col, cats_fname, cat_names_flattened, raw_sample_ct, is_remove, max_thread_ct, sample_include, sample_ct_ptr);
+	    if (reterr) {
+	      goto keep_remove_cats_ret_1;
+	    }
+	    success = 1;
+	    break;
+	  }
+	}
+      }
+      if (!success) {
+	sprintf(g_logbuf, "Error: --%s-cat-pheno phenotype/covariate not loaded.\n", is_remove? "remove" : "keep");
+	goto keep_remove_cats_ret_INCONSISTENT_INPUT_2;
+      }
+    }
+  }
+  while (0) {
+  keep_remove_cats_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+  keep_remove_cats_ret_INCONSISTENT_INPUT_2:
+    logerrprintb();
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ keep_remove_cats_ret_1:
+  return reterr;
+}
+
+void compute_allele_freqs(const uintptr_t* variant_include, const uintptr_t* variant_allele_idxs, const uint64_t* founder_allele_dosages, uint32_t variant_ct, uint32_t maf_succ, double* allele_freqs) {
+  // allele_freqs is assumed to be non-null here: an entry is written for
+  // every included variant
+  // note that founder_allele_dosages is in 32768ths
+  uint32_t cur_allele_ct = 2;
+  uint32_t variant_uidx = 0;
+  for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+    next_set_unsafe_ck(variant_include, &variant_uidx);
+    uintptr_t variant_allele_idx_base;
+    if (!variant_allele_idxs) {
+      variant_allele_idx_base = 2 * variant_uidx;
+    } else {
+      variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+      cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+    }
+    const uint64_t* cur_founder_allele_dosages = &(founder_allele_dosages[variant_allele_idx_base]);
+    uint64_t tot_dosage = 0;
+    for (uint32_t allele_idx = 0; allele_idx < cur_allele_ct; ++allele_idx) {
+      tot_dosage += cur_founder_allele_dosages[allele_idx];
+    }
+    // todo: try changing this expression
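+    // kDosageMax corresponds to one full allele observation (dosages are in
+    // 32768ths), so this adds a single pseudocount per allele when maf_succ
+    // is set; it also kicks in when tot_dosage is zero, preventing division
+    // by zero and yielding uniform frequencies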
+    const uint64_t cur_maf_succ_dosage = (maf_succ | (!tot_dosage)) * kDosageMax;
+
+    tot_dosage += cur_maf_succ_dosage * cur_allele_ct;
+    double* cur_allele_freqs_base = &(allele_freqs[variant_allele_idx_base - variant_uidx]);
+    const double tot_dosage_recip = 1.0 / ((double)((int64_t)tot_dosage));
+    const uint32_t cur_allele_ct_m1 = cur_allele_ct - 1;
+    for (uint32_t allele_idx = 0; allele_idx < cur_allele_ct_m1; ++allele_idx) {
+      const double cur_dosage = (double)((int64_t)(cur_founder_allele_dosages[allele_idx] + cur_maf_succ_dosage));
+      cur_allele_freqs_base[allele_idx] = cur_dosage * tot_dosage_recip;
+    }
+  }
+}
+
+CONSTU31(kMaxReadFreqAlleles, 255);
+
+// relevant column types:
+// 0: variant ID
+// 1: ref allele code
+// 2: all alt allele codes (potentially just alt1)
+//
+// (3-4 are --freq only)
+// 3: ref freq/count
+// 4: either all freqs/counts, or all-but-ref
+//
+// 5: obs ct (only relevant for --freq, but can be in --geno-counts)
+//
+// (6-11 are --geno-counts/--freqx only)
+// 6: hom-ref count
+// 7: het ref-alt counts (worst case, just ref-alt1)
+// 8: altx-alty counts (worst case, just hom-alt1), or all pairs
+// 9: hap-ref count
+// 10: hap-alt counts (worst case, just hap-alt1), or all hap counts
+// 11: --geno-counts numeq (if present, ignore 6..10)
+//
+// overrideable:
+// 12->2: ALT1
+// 13->4: ALT1_FREQ/ALT1_CT
+// 14->7: HET_REF_ALT1_CT
+// 15->8: HOM_ALT1_CT
+// 16->10: HAP_ALT1_CT
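+//
+// An illustrative header line this parser accepts (PLINK 2 --freq output
+// style; unrecognized columns such as CHROM are simply skipped):
+//   #CHROM  ID  REF  ALT  ALT_FREQS  OBS_CT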
+ENUM_U31_DEF_START()
+  kfReadFreqColVarId = 0,
+  kfReadFreqColRefAllele,
+  kfReadFreqColAltAlleles,
+
+  kfReadFreqColRefFreq,
+  kfReadFreqColAltFreqs,
+
+  kfReadFreqColObsCt,
+
+  kfReadFreqColHomRefCt,
+  kfReadFreqColHetRefAltCts,
+  kfReadFreqColNonrefDiploidCts,
+  kfReadFreqColHapRefCt,
+  kfReadFreqColHapAltCts,
+  kfReadFreqColGenoCtNumeq,
+
+  kfReadFreqColAlt1Allele,
+  kfReadFreqColAlt1Freq,
+  kfReadFreqColHetRefAlt1Ct,
+  kfReadFreqColHomAlt1Ct,
+  kfReadFreqColHapAlt1Ct,
+
+  kfReadFreqColNull
+ENUM_U31_DEF_END(read_freq_colidx_t);
+
+FLAGSET_DEF_START()
+  kfReadFreqColset0,
+  kfReadFreqColsetVarId = (1 << kfReadFreqColVarId),
+  kfReadFreqColsetRefAllele = (1 << kfReadFreqColRefAllele),
+  kfReadFreqColsetAltAlleles = (1 << kfReadFreqColAltAlleles),
+  kfReadFreqColsetBase = (kfReadFreqColsetVarId | kfReadFreqColsetRefAllele | kfReadFreqColsetAltAlleles),
+
+  kfReadFreqColsetRefFreq = (1 << kfReadFreqColRefFreq),
+  kfReadFreqColsetAltFreqs = (1 << kfReadFreqColAltFreqs),
+  kfReadFreqColsetAfreqOnly = (kfReadFreqColsetRefFreq | kfReadFreqColsetAltFreqs),
+
+  kfReadFreqColsetObsCt = (1 << kfReadFreqColObsCt),
+
+  kfReadFreqColsetHomRefCt = (1 << kfReadFreqColHomRefCt),
+  kfReadFreqColsetHetRefAltCts = (1 << kfReadFreqColHetRefAltCts),
+  kfReadFreqColsetNonrefDiploidCts = (1 << kfReadFreqColNonrefDiploidCts),
+  kfReadFreqColsetHapRefCt = (1 << kfReadFreqColHapRefCt),
+  kfReadFreqColsetHapAltCts = (1 << kfReadFreqColHapAltCts),
+  kfReadFreqColsetGcountDefault = ((kfReadFreqColsetHapAltCts * 2) - kfReadFreqColsetHomRefCt),
+
+  kfReadFreqColsetGenoCtNumeq = (1 << kfReadFreqColGenoCtNumeq),
+  kfReadFreqColsetGcountOnly = (kfReadFreqColsetGcountDefault | kfReadFreqColsetGenoCtNumeq),
+
+  kfReadFreqColsetAlt1Allele = (1 << kfReadFreqColAlt1Allele),
+  kfReadFreqColsetAlt1Freq = (1 << kfReadFreqColAlt1Freq),
+  kfReadFreqColsetHetRefAlt1Ct = (1 << kfReadFreqColHetRefAlt1Ct),
+  kfReadFreqColsetHomAlt1Ct = (1 << kfReadFreqColHomAlt1Ct),
+  kfReadFreqColsetHapAlt1Ct = (1 << kfReadFreqColHapAlt1Ct)
+FLAGSET_DEF_END(read_freq_colset_t);
+
+pglerr_t read_allele_freqs(const uintptr_t* variant_include, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const char* read_freq_fname, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_alt_allele_ct, uint32_t max_variant_id_slen, uint32_t max_allele_slen, uint32_t maf_succ, uint32_t max_thread_ct, double* allele_freqs) {
+  // support PLINK 1.9 --freq/--freqx, and 2.0 --freq/--geno-counts.
+  // GCTA-format no longer supported since it inhibits the allele consistency
+  // check.
+  unsigned char* bigstack_mark = g_bigstack_base;
+  gzFile gz_infile = nullptr;
+  uintptr_t loadbuf_size = 0;
+  uintptr_t line_idx = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    double* cur_allele_freqs;
+    uintptr_t* matched_loaded_alleles;
+    uintptr_t* matched_internal_alleles;
+    uint32_t* loaded_to_internal_allele_idx;
+    uintptr_t* already_seen;
+    if (bigstack_calloc_d(kMaxReadFreqAlleles, &cur_allele_freqs) ||
+	bigstack_alloc_ul(BITCT_TO_WORDCT(kMaxReadFreqAlleles), &matched_loaded_alleles) ||
+	bigstack_alloc_ul(BITCT_TO_WORDCT(max_alt_allele_ct + 1), &matched_internal_alleles) ||
+	bigstack_alloc_ui(kMaxReadFreqAlleles, &loaded_to_internal_allele_idx) ||
+	bigstack_calloc_ul(BITCT_TO_WORDCT(raw_variant_ct), &already_seen)) {
+      goto read_allele_freqs_ret_NOMEM;
+    }
+    reterr = gzopen_read_checked(read_freq_fname, &gz_infile);
+    if (reterr) {
+      goto read_allele_freqs_ret_1;
+    }
+    loadbuf_size = bigstack_left() / 8;
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else if (loadbuf_size <= kMaxMediumLine) {
+      goto read_allele_freqs_ret_NOMEM;
+    } else {
+      loadbuf_size = round_up_pow2(loadbuf_size, kCacheline);
+    }
+    char* loadbuf = (char*)bigstack_alloc_raw(loadbuf_size);
+    loadbuf[loadbuf_size - 1] = ' ';
+    uint32_t* variant_id_htable = nullptr;
+    uint32_t variant_id_htable_size;
+    reterr = alloc_and_populate_id_htable_mt(variant_include, variant_ids, variant_ct, max_thread_ct, &variant_id_htable, nullptr, &variant_id_htable_size);
+    if (reterr) {
+      goto read_allele_freqs_ret_1;
+    }
+    char* loadbuf_first_token;
+    do {
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto read_allele_freqs_ret_READ_FAIL;
+	}
+	logerrprint("Error: Empty --read-freq file.\n");
+	goto read_allele_freqs_ret_MALFORMED_INPUT;
+      }
+      ++line_idx;
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto read_allele_freqs_ret_LONG_LINE;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      // automatically skip header lines that start with '##' or '# '
+    } while (is_eoln_kns(*loadbuf_first_token) || ((*loadbuf_first_token == '#') && ((unsigned char)loadbuf_first_token[1] <= '#')));
+
+    uint32_t col_skips[kfReadFreqColNull];
+    read_freq_colidx_t col_types[kfReadFreqColNull];
+    uint32_t overrideable_pos[kfReadFreqColNull - kfReadFreqColAlt1Allele];
+    uint32_t geno_counts = 0;
+    uint32_t main_eq = 0;
+    uint32_t is_numeq = 0;
+    uint32_t use_obs_ct = 0;
+    uint32_t infer_one_freq = 0;
+    uint32_t infer_freq_loaded_idx = 0;
+    uint32_t relevant_col_ct = 0;
+
+    // interpretation of ColAltAlleles
+    uint32_t allele_list_just_alt1 = 1;
+
+    uint32_t is_frac = 0; // if true, one frequency can be missing
+    // could add consistency check (can't mix FREQ and CT)
+
+    // interpretation of ColAltFreqs and ColNonrefDiploidCts
+    uint32_t main_allele_idx_start = 1;
+    uint32_t main_list_just_alt1 = 1;
+
+    // interpretation of ColHapAltCts
+    uint32_t hap_allele_idx_start = 1;
+    uint32_t hap_list_just_alt1 = 1;
+
+    uint32_t het_list_just_alt1 = 1; // ColHetRefAltCts
+
+    uint32_t biallelic_only = 0;
+
+    read_freq_colset_t header_cols = kfReadFreqColset0;
+    if (*loadbuf_first_token == '#') {
+      // PLINK 2.0
+      char* loadbuf_iter = &(loadbuf_first_token[1]); // guaranteed nonspace
+      uint32_t col_idx = 0;
+      while (1) {
+	char* token_end = token_endnn(loadbuf_iter);
+	const uint32_t token_slen = (uintptr_t)(token_end - loadbuf_iter);
+	read_freq_colidx_t cur_colidx = kfReadFreqColNull;
+	if (token_slen <= 4) {
+	  if ((token_slen == 2) && (!memcmp(loadbuf_iter, "ID", 2))) {
+	    cur_colidx = kfReadFreqColVarId;
+	  } else if (token_slen == 3) {
+	    if (!memcmp(loadbuf_iter, "REF", 3)) {
+	      cur_colidx = kfReadFreqColRefAllele;
+	    } else if (!memcmp(loadbuf_iter, "ALT", 3)) {
+	      cur_colidx = kfReadFreqColAltAlleles;
+	      if (allele_list_just_alt1) {
+		header_cols &= ~kfReadFreqColsetAlt1Allele;
+	        allele_list_just_alt1 = 0;
+	      }
+	    } else if (!memcmp(loadbuf_iter, "CTS", 3)) {
+	      goto read_allele_freqs_freqmain_found1;
+	    }
+	  } else if (token_slen == 4) {
+	    if ((!memcmp(loadbuf_iter, "ALT1", 4)) && allele_list_just_alt1) {
+	      cur_colidx = kfReadFreqColAlt1Allele;
+	    }
+	  }
+	} else if (((token_slen == 8) && (!memcmp(loadbuf_iter, "REF_FREQ", 8))) || ((token_slen == 6) && (!memcmp(loadbuf_iter, "REF_CT", 6)))) {
+	  cur_colidx = kfReadFreqColRefFreq;
+	  if (loadbuf_iter[4] == 'F') {
+	    is_frac = 1;
+	  }
+	} else if ((((token_slen == 9) && (!memcmp(loadbuf_iter, "ALT1_FREQ", 9))) || ((token_slen == 7) && (!memcmp(loadbuf_iter, "ALT1_CT", 7)))) && main_list_just_alt1) {
+	  cur_colidx = kfReadFreqColAlt1Freq;
+	  if (loadbuf_iter[5] == 'F') {
+	    is_frac = 1;
+	  }
+	} else if (((token_slen == 9) && (!memcmp(loadbuf_iter, "ALT_FREQS", 9))) || ((token_slen == 7) && (!memcmp(loadbuf_iter, "ALT_CTS", 7)))) {
+	  if (loadbuf_iter[4] == 'F') {
+	    is_frac = 1;
+	  }
+	  goto read_allele_freqs_freqmain_found2;
+	} else if ((token_slen == 5) && (!memcmp(loadbuf_iter, "FREQS", 5))) {
+	  is_frac = 1;
+	  goto read_allele_freqs_freqmain_found1;
+	} else if (((token_slen == 13) && (!memcmp(loadbuf_iter, "ALT_NUM_FREQS", 13))) || ((token_slen == 11) && (!memcmp(loadbuf_iter, "ALT_NUM_CTS", 11)))) {
+	  is_numeq = 1;
+	  goto read_allele_freqs_freqmain_found2;
+	} else if (((token_slen == 9) && (!memcmp(loadbuf_iter, "NUM_FREQS", 9))) || ((token_slen == 7) && (!memcmp(loadbuf_iter, "NUM_CTS", 7)))) {
+	  is_numeq = 1;
+	read_allele_freqs_freqmain_found1:
+	  main_allele_idx_start = 0;
+	read_allele_freqs_freqmain_found2:
+	  cur_colidx = kfReadFreqColAltFreqs;
+	  if (main_list_just_alt1) {
+	    header_cols &= ~kfReadFreqColsetAlt1Freq;
+	    main_list_just_alt1 = 0;
+	  }
+	} else if ((token_slen == 6) && (!memcmp(loadbuf_iter, "OBS_CT", 6))) {
+	  cur_colidx = kfReadFreqColObsCt;
+	} else if ((token_slen == 10) && (!memcmp(loadbuf_iter, "HOM_REF_CT", 10))) {
+	  cur_colidx = kfReadFreqColHomRefCt;
+	} else if ((token_slen == 15) && (!memcmp(loadbuf_iter, "HET_REF_ALT1_CT", 15)) && het_list_just_alt1) {
+	  cur_colidx = kfReadFreqColHetRefAlt1Ct;
+	} else if ((token_slen == 15) && (!memcmp(loadbuf_iter, "HET_REF_ALT_CTS", 15))) {
+	  cur_colidx = kfReadFreqColHetRefAltCts;
+	  if (het_list_just_alt1) {
+	    header_cols &= ~kfReadFreqColsetHetRefAlt1Ct;
+	    het_list_just_alt1 = 0;
+	  }
+	} else if ((token_slen == 11) && (!memcmp(loadbuf_iter, "HOM_ALT1_CT", 11)) && main_list_just_alt1) {
+	  cur_colidx = kfReadFreqColHomAlt1Ct;
+	} else if ((token_slen == 23) && (!memcmp(loadbuf_iter, "NONREF_DIPLOID_GENO_CTS", 23))) {
+	  goto read_allele_freqs_countmain_found;
+	} else if ((token_slen == 16) && (!memcmp(loadbuf_iter, "DIPLOID_GENO_CTS", 16))) {
+	  main_allele_idx_start = 0;
+	read_allele_freqs_countmain_found:
+	  cur_colidx = kfReadFreqColNonrefDiploidCts;
+	  if (main_list_just_alt1) {
+	    header_cols &= ~kfReadFreqColsetHomAlt1Ct;
+	    // could make this use a different variable than FREQS does
+	    main_list_just_alt1 = 0;
+	  }
+	} else if ((token_slen == 10) && (!memcmp(loadbuf_iter, "HAP_REF_CT", 10))) {
+	  cur_colidx = kfReadFreqColHapRefCt;
+	} else if ((token_slen == 11) && (!memcmp(loadbuf_iter, "HAP_ALT1_CT", 11)) && hap_list_just_alt1) {
+	  cur_colidx = kfReadFreqColHapAlt1Ct;
+	} else if ((token_slen == 11) && (!memcmp(loadbuf_iter, "HAP_ALT_CTS", 11))) {
+	  goto read_allele_freqs_hapmain_found;
+	} else if ((token_slen == 7) && (!memcmp(loadbuf_iter, "HAP_CTS", 7))) {
+	  hap_allele_idx_start = 0;
+	read_allele_freqs_hapmain_found:
+	  cur_colidx = kfReadFreqColHapAltCts;
+	  if (hap_list_just_alt1) {
+	    header_cols &= ~kfReadFreqColsetHapAlt1Ct;
+	    hap_list_just_alt1 = 0;
+	  }
+	} else if ((token_slen == 12) && (!memcmp(loadbuf_iter, "GENO_NUM_CTS", 12))) {
+	  cur_colidx = kfReadFreqColGenoCtNumeq;
+	  is_numeq = 1;
+	}
+	if (cur_colidx != kfReadFreqColNull) {
+	  const read_freq_colset_t cur_colset = (read_freq_colset_t)(1 << cur_colidx);
+	  if (header_cols & cur_colset) {
+	    logerrprint("Error: Conflicting columns in header line of --read-freq file.\n");
+	    goto read_allele_freqs_ret_MALFORMED_INPUT;
+	  }
+	  if (cur_colidx >= kfReadFreqColAlt1Allele) {
+	    overrideable_pos[cur_colidx - kfReadFreqColAlt1Allele] = relevant_col_ct;
+	  }
+	  header_cols |= cur_colset;
+	  col_skips[relevant_col_ct] = col_idx;
+	  col_types[relevant_col_ct++] = cur_colidx;
+	}
+	loadbuf_iter = skip_initial_spaces(token_end);
+	if (is_eoln_kns(*loadbuf_iter)) {
+	  break;
+	}
+	++col_idx;
+      }
+      read_freq_colset_t semifinal_header_cols = header_cols;
+      if (header_cols & kfReadFreqColsetAlt1Allele) {
+	header_cols ^= kfReadFreqColsetAltAlleles | kfReadFreqColsetAlt1Allele;
+	col_types[overrideable_pos[0]] = kfReadFreqColAltAlleles;
+      }
+      if (header_cols & kfReadFreqColsetAlt1Freq) {
+	header_cols ^= kfReadFreqColsetAltFreqs | kfReadFreqColsetAlt1Freq;
+	col_types[overrideable_pos[kfReadFreqColAlt1Freq - kfReadFreqColAlt1Allele]] = kfReadFreqColAltFreqs;
+      }
+      if (header_cols & kfReadFreqColsetHetRefAlt1Ct) {
+	header_cols ^= kfReadFreqColsetHetRefAltCts | kfReadFreqColsetHetRefAlt1Ct;
+	col_types[overrideable_pos[kfReadFreqColHetRefAlt1Ct - kfReadFreqColAlt1Allele]] = kfReadFreqColHetRefAltCts;
+      }
+      if (header_cols & kfReadFreqColsetHomAlt1Ct) {
+	header_cols ^= kfReadFreqColsetNonrefDiploidCts | kfReadFreqColsetHomAlt1Ct;
+	col_types[overrideable_pos[kfReadFreqColHomAlt1Ct - kfReadFreqColAlt1Allele]] = kfReadFreqColNonrefDiploidCts;
+      }
+      if (header_cols & kfReadFreqColsetHapAlt1Ct) {
+	header_cols ^= kfReadFreqColsetHapAltCts | kfReadFreqColsetHapAlt1Ct;
+	col_types[overrideable_pos[kfReadFreqColHapAlt1Ct - kfReadFreqColAlt1Allele]] = kfReadFreqColHapAltCts;
+      }
+      if ((semifinal_header_cols != header_cols) && (!(header_cols & kfReadFreqColsetGenoCtNumeq))) {
+	// we're treating at least one ALT1 column as if it spoke for all ALT
+	// alleles
+	biallelic_only = 1;
+      }
+
+      main_eq = is_numeq;
+      semifinal_header_cols = header_cols;
+      if (header_cols & kfReadFreqColsetAfreqOnly) {
+	if (header_cols & kfReadFreqColsetGcountOnly) {
+	  logerrprint("Error: Conflicting columns in header line of --read-freq file (--freq and\n--geno-counts values mixed together).\n");
+	  goto read_allele_freqs_ret_MALFORMED_INPUT;
+	}
+	read_freq_colset_t header_cols_exempt = kfReadFreqColset0;
+	if ((header_cols & kfReadFreqColsetAltFreqs) && (!is_numeq)) {
+	  // {ALT_}FREQS can be formatted as either
+	  //   0.5,0,0.2
+	  // or
+	  //   A=0.5,G=0.2
+	  // Look at the first nonheader line to distinguish between these two.
+	  do {
+	    if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	      if (!gzeof(gz_infile)) {
+		goto read_allele_freqs_ret_READ_FAIL;
+	      }
+	      logerrprint("Error: Empty --read-freq file.\n");
+	      goto read_allele_freqs_ret_MALFORMED_INPUT;
+	    }
+	    ++line_idx;
+	    if (!loadbuf[loadbuf_size - 1]) {
+	      goto read_allele_freqs_ret_LONG_LINE;
+	    }
+	    loadbuf_first_token = skip_initial_spaces(loadbuf);
+	  } while (is_eoln_kns(*loadbuf_first_token));
+	  char* loadbuf_iter = loadbuf_first_token;
+	  char* alt_freq_str = nullptr;
+	  for (uint32_t relevant_col_idx = 0; relevant_col_idx < relevant_col_ct; ++relevant_col_idx) {
+	    if (col_types[relevant_col_idx] == kfReadFreqColAltFreqs) {
+	      alt_freq_str = next_token_multz(loadbuf_iter, col_skips[relevant_col_idx]);
+	      break;
+	    }
+	  }
+	  if (!alt_freq_str) {
+	    goto read_allele_freqs_ret_MISSING_TOKENS;
+	  }
+	  const uint32_t alt_freq_slen = token_endnn(alt_freq_str) - alt_freq_str;
+	  // bare '.' can only appear in eq formats
+	  main_eq = ((alt_freq_slen == 1) && (*alt_freq_str == '.')) || (memchr(alt_freq_str, '=', alt_freq_slen) != nullptr);
+	  if (main_eq) {
+	    header_cols_exempt = kfReadFreqColsetAltAlleles;
+	    if (!main_allele_idx_start) {
+	      header_cols_exempt |= kfReadFreqColsetRefAllele;
+	    }
+	    header_cols &= ~header_cols_exempt;
+	  }
+	}
+	if (((header_cols & kfReadFreqColsetBase) | header_cols_exempt) != kfReadFreqColsetBase) {
+	  logerrprint("Error: Missing column(s) in --read-freq file (ID, REF, ALT{1} usually\nrequired).\n");
+	  goto read_allele_freqs_ret_MALFORMED_INPUT;
+	}
+	if (!main_allele_idx_start) {
+	  header_cols &= ~kfReadFreqColsetRefFreq;
+	} else {
+	  if ((header_cols & (kfReadFreqColsetRefFreq | kfReadFreqColsetAltFreqs)) != (kfReadFreqColsetRefFreq | kfReadFreqColsetAltFreqs)) {
+	    if (main_list_just_alt1) {
+	      biallelic_only = 1;
+	    }
+	    infer_one_freq = 1;
+	    infer_freq_loaded_idx = (header_cols / kfReadFreqColsetRefFreq) & 1;
+	    if (!is_frac) {
+	      if (!(header_cols & kfReadFreqColsetObsCt)) {
+		logerrprint("Error: Missing column(s) in --read-freq file (at least two of {REF_CT, ALT1_CT,\nALT_CTS, OBS_CT} must be present).\n");
+		goto read_allele_freqs_ret_MALFORMED_INPUT;
+	      }
+	      use_obs_ct = 1;
+	    }
+	  }
+	}
+	logprint("--read-freq: PLINK 2 --freq file detected.\n");
+      } else if (header_cols & kfReadFreqColsetGcountOnly) {
+	if ((header_cols & kfReadFreqColsetBase) != kfReadFreqColsetBase) {
+	  logerrprint("Error: Missing column(s) in --read-freq file (ID, REF, ALT{1} required).\n");
+	  goto read_allele_freqs_ret_MALFORMED_INPUT;
+	}
+	// possible todo: allow one frequency/count to be missing.  (not really
+	// necessary since PLINK 1.9 --freqx does not leave anything out,
+	// unlike PLINK 1.x --freq)
+	if (header_cols & kfReadFreqColsetGenoCtNumeq) {
+	  header_cols &= ~kfReadFreqColsetGcountDefault; // don't need anything but GENO_NUM_CTS
+	} else {
+	  // require both diploid and haploid columns for now.  (could
+	  // conditionally drop one of these requirements later.)
+	  if (!(header_cols & kfReadFreqColsetNonrefDiploidCts)) {
+	    logerrprint("Error: Missing column(s) in --read-freq file (HOM_ALT1_CT,\nNONREF_DIPLOID_GENO_CTS, or DIPLOID_GENO_CTS required).\n");
+	    goto read_allele_freqs_ret_MALFORMED_INPUT;
+	  }
+	  if (!main_allele_idx_start) {
+	    header_cols &= ~(kfReadFreqColsetHomRefCt | kfReadFreqColsetHetRefAltCts);
+	  } else if ((header_cols & (kfReadFreqColsetHomRefCt | kfReadFreqColsetHetRefAltCts)) != (kfReadFreqColsetHomRefCt | kfReadFreqColsetHetRefAltCts)) {
+	    logerrprint("Error: Missing column(s) in --read-freq file (HOM_REF_CT, HET_REF_ALT1_CT, or\nHET_REF_ALT_CTS required unless {DIPLOID_}GENO_CTS present).\n");
+	    goto read_allele_freqs_ret_MALFORMED_INPUT;
+	  }
+	  if (!(header_cols & kfReadFreqColsetHapAltCts)) {
+	    logerrprint("Error: Missing column(s) in --read-freq file (HAP_ALT1_CT, HAP_ALT_CTS, or\nHAP_CTS required).\n");
+	    goto read_allele_freqs_ret_MALFORMED_INPUT;
+	  }
+	  if (!hap_allele_idx_start) {
+	    header_cols &= ~kfReadFreqColsetHapRefCt;
+	  } else if (!(header_cols & kfReadFreqColsetHapRefCt)) {
+	    logerrprint("Error: Missing column(s) in --read-freq file (HAP_REF_CT required unless\nHAP_CTS or GENO_CTS present).\n");
+	    goto read_allele_freqs_ret_MALFORMED_INPUT;
+	  }
+	}
+	geno_counts = 1;
+	logprint("--read-freq: PLINK 2 --geno-counts file detected.\n");
+	*loadbuf_first_token = '\0';
+      } else {
+	logerrprint("Error: Missing column(s) in --read-freq file (no frequencies/counts).\n");
+	goto read_allele_freqs_ret_MALFORMED_INPUT;
+      }
+      if (!use_obs_ct) {
+	header_cols &= ~kfReadFreqColsetObsCt;
+      }
+      if (semifinal_header_cols != header_cols) {
+	// remove redundant columns
+	uint32_t relevant_col_idx_read = 0;
+	while ((((uint32_t)header_cols) >> col_types[relevant_col_idx_read]) & 1) {
+	  ++relevant_col_idx_read;
+	}
+	uint32_t relevant_col_idx_write = relevant_col_idx_read;
+	for (; relevant_col_idx_read < relevant_col_ct; ++relevant_col_idx_read) {
+	  const read_freq_colidx_t cur_colidx = col_types[relevant_col_idx_read];
+	  if ((((uint32_t)header_cols) >> cur_colidx) & 1) {
+	    col_types[relevant_col_idx_write] = cur_colidx;
+	    col_skips[relevant_col_idx_write] = col_skips[relevant_col_idx_read];
+	    ++relevant_col_idx_write;
+	  }
+	}
+	relevant_col_ct = relevant_col_idx_write;
+      }
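+      // convert col_skips[] from absolute 0-based column indices to
+      // skip-counts relative to the previous relevant column, as expected by
+      // next_token_multz()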
+      for (uint32_t uii = relevant_col_ct - 1; uii; --uii) {
+	col_skips[uii] -= col_skips[uii - 1];
+      }
+    } else {
+      // PLINK 1.x
+      // .frq:       CHR  SNP  A1  A2  MAF        NCHROBS
+      // .frq.count: CHR  SNP  A1  A2  C1         C2       G0
+      // .frqx:      CHR  SNP  A1  A2  C(HOM A1)  C(HET)   C(HOM A2)  C(HAP A1)
+      //   C(HAP A2)  C(MISSING)
+      // (yeah, the spaces in the .frqx header were a mistake, should have used
+      // underscores.  oh well, live and learn.)
+      col_skips[0] = 1;
+      col_skips[1] = 1;
+      col_skips[2] = 1;
+      col_skips[3] = 1;
+
+      col_types[0] = kfReadFreqColVarId;
+      // doesn't matter if we treat A1 or A2 as ref
+      col_types[1] = kfReadFreqColRefAllele;
+      col_types[2] = kfReadFreqColAltAlleles;
+      biallelic_only = 1;
+      if (!memcmp(loadbuf_first_token, "CHR\tSNP\tA1\tA2\tC(HOM A1)\tC(HET)\tC(HOM A2)\tC(HAP A1)\tC(HAP A2)\tC(MISSING)", 71)) {
+	col_skips[4] = 1;
+	col_skips[5] = 1;
+	col_skips[6] = 1;
+	col_skips[7] = 1;
+
+	col_types[3] = kfReadFreqColHomRefCt;
+	col_types[4] = kfReadFreqColHetRefAltCts;
+	col_types[5] = kfReadFreqColNonrefDiploidCts;
+	col_types[6] = kfReadFreqColHapRefCt;
+	col_types[7] = kfReadFreqColHapAltCts;
+	header_cols = kfReadFreqColsetBase | kfReadFreqColsetGcountOnly;
+	geno_counts = 1;
+	relevant_col_ct = 8;
+	logprint("--read-freq: PLINK 1.9 --freqx file detected.\n");
+      } else {
+	if (strcmp_se(loadbuf_first_token, "CHR", 3)) {
+	  goto read_allele_freqs_ret_UNRECOGNIZED_HEADER;
+	}
+	char* loadbuf_iter = skip_initial_spaces(&(loadbuf_first_token[3]));
+	if (strcmp_se(loadbuf_iter, "SNP", 3)) {
+	  goto read_allele_freqs_ret_UNRECOGNIZED_HEADER;
+	}
+	loadbuf_iter = skip_initial_spaces(&(loadbuf_iter[3]));
+	if (strcmp_se(loadbuf_iter, "A1", 2)) {
+	  goto read_allele_freqs_ret_UNRECOGNIZED_HEADER;
+	}
+	loadbuf_iter = skip_initial_spaces(&(loadbuf_iter[2]));
+	if (strcmp_se(loadbuf_iter, "A2", 2)) {
+	  goto read_allele_freqs_ret_UNRECOGNIZED_HEADER;
+	}
+	loadbuf_iter = skip_initial_spaces(&(loadbuf_iter[2]));
+	col_types[3] = kfReadFreqColRefFreq;
+	if (!strcmp_se(loadbuf_iter, "MAF", 3)) {
+	  is_frac = 1;
+	  infer_one_freq = 1;
+	  infer_freq_loaded_idx = 1;
+	  header_cols = kfReadFreqColsetBase | kfReadFreqColsetRefFreq;
+	  relevant_col_ct = 4;
+	  logprint("--read-freq: PLINK 1.x --freq file detected.\n");
+	} else {
+	  if (strcmp_se(loadbuf_iter, "C1", 2)) {
+	    goto read_allele_freqs_ret_UNRECOGNIZED_HEADER;
+	  }
+	  loadbuf_iter = skip_initial_spaces(&(loadbuf_iter[2]));
+	  if (strcmp_se(loadbuf_iter, "C2", 2)) {
+	    goto read_allele_freqs_ret_UNRECOGNIZED_HEADER;
+	  }
+	  col_skips[4] = 1;
+	  col_types[4] = kfReadFreqColAltFreqs;
+	  header_cols = kfReadFreqColsetBase | kfReadFreqColsetAfreqOnly;
+	  relevant_col_ct = 5;
+	  logprint("--read-freq: PLINK 1.x '--freq counts' file detected.\n");
+	}
+      }
+      *loadbuf_first_token = '\0';
+    }
+    assert(relevant_col_ct <= 8);
+
+    double freq_max = 4294967295.0;
+    if (is_frac) {
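+      // pseudocounts (maf_succ) only make sense for count data; disable them
+      // when the file provides frequencies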
+      maf_succ = 0;
+      freq_max = 1.0;
+    }
+    uintptr_t skipped_variant_ct = 0;
+    uint32_t loaded_variant_ct = 0;
+    uint32_t cur_allele_ct = 2;
+    while (1) {
+      if (!is_eoln_kns(*loadbuf_first_token)) {
+	char* loadbuf_iter = loadbuf_first_token;
+	char* token_ptrs[12];
+	uint32_t token_slens[12];
+	for (uint32_t relevant_col_idx = 0; relevant_col_idx < relevant_col_ct; ++relevant_col_idx) {
+	  const read_freq_colidx_t cur_colidx = col_types[relevant_col_idx];
+	  loadbuf_iter = next_token_multz(loadbuf_iter, col_skips[relevant_col_idx]);
+	  if (!loadbuf_iter) {
+	    goto read_allele_freqs_ret_MISSING_TOKENS;
+	  }
+	  token_ptrs[cur_colidx] = loadbuf_iter;
+	  char* token_end = token_endnn(loadbuf_iter);
+	  token_slens[cur_colidx] = (uintptr_t)(token_end - loadbuf_iter);
+	  loadbuf_iter = token_end;
+	}
+	char* variant_id_start = token_ptrs[kfReadFreqColVarId];
+	const uint32_t variant_id_slen = token_slens[kfReadFreqColVarId];
+	uint32_t variant_uidx = variant_id_dupflag_htable_find(variant_id_start, variant_ids, variant_id_htable, variant_id_slen, variant_id_htable_size, max_variant_id_slen);
+	if (variant_uidx >> 31) {
+	  if (variant_uidx == 0xffffffffU) {
+	    goto read_allele_freqs_skip_variant;
+	  }
+	  sprintf(g_logbuf, "Error: --read-freq variant ID '%s' appears multiple times in main dataset.\n", variant_ids[variant_uidx & 0xffffffffU]);
+	  goto read_allele_freqs_ret_MALFORMED_INPUT_WW;
+	}
+	if (is_set(already_seen, variant_uidx)) {
+	  sprintf(g_logbuf, "Error: Variant ID '%s' appears multiple times in --read-freq file.\n", variant_ids[variant_uidx]);
+	  goto read_allele_freqs_ret_MALFORMED_INPUT_WW;
+	}
+	set_bit(variant_uidx, already_seen);
+
+	uintptr_t variant_allele_idx_base;
+	if (!variant_allele_idxs) {
+	  variant_allele_idx_base = variant_uidx * 2;
+	} else {
+	  variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	  cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+	  if (biallelic_only && (cur_allele_ct > 2)) {
+	    goto read_allele_freqs_skip_variant;
+	  }
+	}
+	fill_ulong_zero(BITCT_TO_WORDCT(cur_allele_ct), matched_internal_alleles);
+	char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+	uint32_t loaded_allele_ct = 0;
+	if (header_cols & kfReadFreqColsetRefAllele) {
+	  uint32_t cur_loaded_allele_code_slen = token_slens[kfReadFreqColRefAllele];
+	  uint32_t unmatched_allele_ct = cur_allele_ct;
+	  char* cur_loaded_allele_code = token_ptrs[kfReadFreqColRefAllele];
+	  cur_loaded_allele_code[cur_loaded_allele_code_slen] = '\0';
+	  char* loaded_allele_code_iter;
+	  char* loaded_allele_code_end;
+	  if (header_cols & kfReadFreqColsetAltAlleles) {
+	    loaded_allele_code_iter = token_ptrs[kfReadFreqColAltAlleles];
+	    loaded_allele_code_end = &(loaded_allele_code_iter[token_slens[kfReadFreqColAltAlleles]]);
+	    *loaded_allele_code_end++ = ',';
+	  } else {
+	    // special case: with --freq alteq or alteqz column, we only need
+	    // to scrape REF here
+	    loaded_allele_code_iter = &(cur_loaded_allele_code[cur_loaded_allele_code_slen + 1]);
+	    loaded_allele_code_end = loaded_allele_code_iter;
+	  }
+	  uint32_t widx = 0;
+	  while (1) {
+	    if (!(loaded_allele_ct % kBitsPerWord)) {
+	      widx = loaded_allele_ct / kBitsPerWord;
+	      matched_loaded_alleles[widx] = 0;
+	    }
+	    if (cur_loaded_allele_code_slen <= max_allele_slen) {
+	      uint32_t internal_allele_idx = 0;
+	      uint32_t unmatched_allele_idx = 0;
+	      for (; unmatched_allele_idx < unmatched_allele_ct; ++unmatched_allele_idx, ++internal_allele_idx) {
+		next_unset_unsafe_ck(matched_internal_alleles, &internal_allele_idx);
+		if (!strcmp(cur_loaded_allele_code, cur_alleles[internal_allele_idx])) {
+		  break;
+		}
+	      }
+	      if (unmatched_allele_idx != unmatched_allele_ct) {
+		// success
+		if (is_set(matched_internal_alleles, internal_allele_idx)) {
+		  sprintf(g_logbuf, "Error: Duplicate allele code on line %" PRIuPTR " of --read-freq file.\n", line_idx);
+		  goto read_allele_freqs_ret_MALFORMED_INPUT_2;
+		}
+		set_bit(internal_allele_idx, matched_internal_alleles);
+		set_bit(loaded_allele_ct, matched_loaded_alleles);
+		loaded_to_internal_allele_idx[loaded_allele_ct] = internal_allele_idx;
+	      }
+	    }
+	    ++loaded_allele_ct;
+	    if (loaded_allele_code_iter == loaded_allele_code_end) {
+	      break;
+	    }
+	    if (loaded_allele_ct == kMaxReadFreqAlleles) {
+	      sprintf(g_logbuf, "Error: --read-freq file entry for variant ID '%s' has more than %u ALT alleles.\n", variant_ids[variant_uidx], kMaxReadFreqAlleles - 1);
+	      goto read_allele_freqs_ret_MALFORMED_INPUT_WW;
+	    }
+	    cur_loaded_allele_code = loaded_allele_code_iter;
+	    loaded_allele_code_iter = (char*)rawmemchr(loaded_allele_code_iter, ',');
+	    cur_loaded_allele_code_slen = (uintptr_t)(loaded_allele_code_iter - cur_loaded_allele_code);
+	    *loaded_allele_code_iter++ = '\0';
+	  }
+	}
+
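+	// allele_freqs[] stores (allele_ct - 1) doubles per variant (the last
+	// allele's frequency is implicit), so the write offset is the allele
+	// index base minus the number of preceding variants.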
+	double* allele_freqs_write = &(allele_freqs[variant_allele_idx_base - variant_uidx]);
+	if (geno_counts) {
+	  fill_double_zero(cur_allele_ct, cur_allele_freqs);
+	  if (is_numeq) {
+	    char* geno_num_cts_iter = token_ptrs[kfReadFreqColGenoCtNumeq];
+	    const uint32_t full_slen = token_slens[kfReadFreqColGenoCtNumeq];
+	    char* geno_num_cts_end = &(geno_num_cts_iter[full_slen]);
+	    if (full_slen > 1) {
+	      *geno_num_cts_end = ',';
+#ifndef __LP64__
+	      const uint32_t cap_div_10 = (loaded_allele_ct - 1) / 10;
+	      const uint32_t cap_mod_10 = (loaded_allele_ct - 1) % 10;
+#endif
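+	      // each GENO_NUM_CTS entry looks like e.g. 0/1=12 (diploid) or
+	      // 1=5 (haploid): loaded allele index/indices, '=', then count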
+	      while (1) {
+		uint32_t second_loaded_allele_idx = 0xffffffffU;
+		uint32_t first_loaded_allele_idx;
+#ifdef __LP64__
+		if (scanadv_uint_capped(loaded_allele_ct - 1, &geno_num_cts_iter, &first_loaded_allele_idx)) {
+		  goto read_allele_freqs_ret_INVALID_FREQS;
+		}
+		if (*geno_num_cts_iter == '/') {
+		  ++geno_num_cts_iter;
+		  if (scanadv_uint_capped(loaded_allele_ct - 1, &geno_num_cts_iter, &second_loaded_allele_idx)) {
+		    goto read_allele_freqs_ret_INVALID_FREQS;
+		  }
+		}
+#else
+		if (scanadv_uint_capped32(cap_div_10, cap_mod_10, &geno_num_cts_iter, &first_loaded_allele_idx)) {
+		  goto read_allele_freqs_ret_INVALID_FREQS;
+		}
+		if (*geno_num_cts_iter == '/') {
+		  ++geno_num_cts_iter;
+		  if (scanadv_uint_capped32(cap_div_10, cap_mod_10, &geno_num_cts_iter, &second_loaded_allele_idx)) {
+		    goto read_allele_freqs_ret_INVALID_FREQS;
+		  }
+		}
+#endif
+		if (*geno_num_cts_iter != '=') {
+		  goto read_allele_freqs_ret_INVALID_FREQS;
+		}
+		++geno_num_cts_iter;
+		double dxx;
+		char* cur_ct_end = scanadv_double(geno_num_cts_iter, &dxx);
+		if ((!cur_ct_end) || (*cur_ct_end != ',') || (dxx < 0.0) || (dxx > 4294967295.0)) {
+		  goto read_allele_freqs_ret_INVALID_FREQS;
+		}
+		if (is_set(matched_loaded_alleles, first_loaded_allele_idx)) {
+		  cur_allele_freqs[loaded_to_internal_allele_idx[first_loaded_allele_idx]] += dxx;
+		}
+		if ((second_loaded_allele_idx != 0xffffffffU) && is_set(matched_loaded_alleles, second_loaded_allele_idx)) {
+		  cur_allele_freqs[loaded_to_internal_allele_idx[second_loaded_allele_idx]] += dxx;
+		}
+		geno_num_cts_iter = cur_ct_end;
+		if (geno_num_cts_iter == geno_num_cts_end) {
+		  break;
+		}
+		++geno_num_cts_iter;
+	      }
+	    } else if (*geno_num_cts_iter != '.') {
+	      goto read_allele_freqs_ret_INVALID_FREQS;
+	    }
+	  } else {
+	    const uint32_t internal0 = is_set(matched_loaded_alleles, 0)? loaded_to_internal_allele_idx[0] : 0xffffffffU;
+	    if (header_cols & kfReadFreqColsetHomRefCt) {
+	      if (internal0 != 0xffffffffU) {
+		char* hom_ref_str = token_ptrs[kfReadFreqColHomRefCt];
+		double dxx;
+		char* hom_ref_end = scanadv_double(hom_ref_str, &dxx);
+		if ((!hom_ref_end) || (hom_ref_end != &(hom_ref_str[token_slens[kfReadFreqColHomRefCt]])) || (dxx < 0.0) || (dxx > 4294967295.0)) {
+		  goto read_allele_freqs_ret_INVALID_FREQS;
+		}
+		cur_allele_freqs[internal0] += 2 * dxx;
+	      }
+
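+	      // HET_REF_ALT{1}_CTS holds comma-separated REF/ALT1, REF/ALT2,
+	      // ... het counts; each count contributes one allele observation
+	      // to REF and one to the matching ALT allele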
+	      char* het_refalt_iter = token_ptrs[kfReadFreqColHetRefAltCts];
+	      char* het_refalt_end = &(het_refalt_iter[token_slens[kfReadFreqColHetRefAltCts]]);
+	      *het_refalt_end = ',';
+	      for (uint32_t alt_allele_idx = 1; alt_allele_idx < cur_allele_ct; ++alt_allele_idx) {
+		if (het_refalt_iter >= het_refalt_end) {
+		  goto read_allele_freqs_ret_INVALID_FREQS;
+		}
+		double dxx;
+		char* cur_entry_end = scanadv_double(het_refalt_iter, &dxx);
+		if ((!cur_entry_end) || (*cur_entry_end != ',') || (dxx < 0.0) || (dxx > 4294967295.0)) {
+		  goto read_allele_freqs_ret_INVALID_FREQS;
+		}
+		if (internal0 != 0xffffffffU) {
+		  cur_allele_freqs[internal0] += dxx;
+		}
+		if (is_set(matched_loaded_alleles, alt_allele_idx)) {
+		  cur_allele_freqs[loaded_to_internal_allele_idx[alt_allele_idx]] += dxx;
+		}
+		het_refalt_iter = &(cur_entry_end[1]);
+	      }
+	    }
+	    // ColNonrefDiploidCts required
+	    char* diploid_cts_iter = token_ptrs[kfReadFreqColNonrefDiploidCts];
+	    char* diploid_cts_end = &(diploid_cts_iter[token_slens[kfReadFreqColNonrefDiploidCts]]);
+	    *diploid_cts_end = ',';
+	    for (uint32_t second_allele_idx = main_allele_idx_start; second_allele_idx < cur_allele_ct; ++second_allele_idx) {
+	      uint32_t internalx = 0xffffffffU;
+	      if (is_set(matched_loaded_alleles, second_allele_idx)) {
+		internalx = loaded_to_internal_allele_idx[second_allele_idx];
+	      }
+	      // 1/1, 1/2, 2/2, 1/3, ...
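+	      // (each count is added to both constituent alleles, so a
+	      // homozygote's count is correctly added twice)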
+	      for (uint32_t first_allele_idx = main_allele_idx_start; first_allele_idx <= second_allele_idx; ++first_allele_idx) {
+		if (diploid_cts_iter >= diploid_cts_end) {
+		  goto read_allele_freqs_ret_INVALID_FREQS;
+		}
+		double dxx;
+		char* cur_entry_end = scanadv_double(diploid_cts_iter, &dxx);
+		if ((!cur_entry_end) || (*cur_entry_end != ',') || (dxx < 0.0) || (dxx > 4294967295.0)) {
+		  goto read_allele_freqs_ret_INVALID_FREQS;
+		}
+		if (is_set(matched_loaded_alleles, first_allele_idx)) {
+		  cur_allele_freqs[loaded_to_internal_allele_idx[first_allele_idx]] += dxx;
+		}
+		if (internalx != 0xffffffffU) {
+		  cur_allele_freqs[internalx] += dxx;
+		}
+		diploid_cts_iter = &(cur_entry_end[1]);
+	      }
+	    }
+
+	    if ((header_cols & kfReadFreqColsetHapRefCt) && (internal0 != 0xffffffffU)) {
+	      char* hap_ref_str = token_ptrs[kfReadFreqColHapRefCt];
+	      double dxx;
+	      char* hap_ref_end = scanadv_double(hap_ref_str, &dxx);
+	      if ((!hap_ref_end) || (hap_ref_end != &(hap_ref_str[token_slens[kfReadFreqColHapRefCt]])) || (dxx < 0.0) || (dxx > 4294967295.0)) {
+		goto read_allele_freqs_ret_INVALID_FREQS;
+	      }
+	      cur_allele_freqs[internal0] += dxx;
+	    }
+	    // ColHapAltCts required
+	    char* hap_alt_iter = token_ptrs[kfReadFreqColHapAltCts];
+	    char* hap_alt_end = &(hap_alt_iter[token_slens[kfReadFreqColHapAltCts]]);
+	    *hap_alt_end = ',';
+	    for (uint32_t alt_allele_idx = 1; alt_allele_idx < cur_allele_ct; ++alt_allele_idx) {
+	      if (hap_alt_iter >= hap_alt_end) {
+		goto read_allele_freqs_ret_INVALID_FREQS;
+	      }
+	      double dxx;
+	      char* cur_entry_end = scanadv_double(hap_alt_iter, &dxx);
+	      if ((!cur_entry_end) || (*cur_entry_end != ',') || (dxx < 0.0) || (dxx > 4294967295.0)) {
+		goto read_allele_freqs_ret_INVALID_FREQS;
+	      }
+	      if (is_set(matched_loaded_alleles, alt_allele_idx)) {
+		cur_allele_freqs[loaded_to_internal_allele_idx[alt_allele_idx]] += dxx;
+	      }
+	      hap_alt_iter = &(cur_entry_end[1]);
+	    }
+	  }
+	} else {
+	  if ((header_cols & kfReadFreqColsetRefFreq) && is_set(matched_loaded_alleles, 0)) {
+	    char* ref_freq_str = token_ptrs[kfReadFreqColRefFreq];
+	    double dxx;
+	    if (!scanadv_double(ref_freq_str, &dxx)) {
+	      if (is_nan_str(ref_freq_str, token_slens[kfReadFreqColRefFreq])) {
+	        goto read_allele_freqs_skip_variant;
+	      }
+	      sprintf(g_logbuf, "Error: Invalid REF frequency/count on line %" PRIuPTR " of --read-freq file.\n", line_idx);
+	      goto read_allele_freqs_ret_MALFORMED_INPUT_WW;
+	    }
+	    if ((dxx < 0.0) || (dxx > freq_max)) {
+	      sprintf(g_logbuf, "Error: Invalid REF frequency/count on line %" PRIuPTR " of --read-freq file.\n", line_idx);
+	      goto read_allele_freqs_ret_MALFORMED_INPUT_WW;
+	    }
+	    cur_allele_freqs[loaded_to_internal_allele_idx[0]] = dxx;
+	  }
+	  if (header_cols & kfReadFreqColsetAltFreqs) {
+	    char* alt_freq_iter = token_ptrs[kfReadFreqColAltFreqs];
+	    const uint32_t full_slen = token_slens[kfReadFreqColAltFreqs];
+	    char* alt_freq_end = &(alt_freq_iter[full_slen]);
+	    *alt_freq_end = ',';
+	    if (!main_eq) {
+	      for (uint32_t allele_idx = main_allele_idx_start; allele_idx < loaded_allele_ct; ++allele_idx, ++alt_freq_iter) {
+		if (alt_freq_iter >= alt_freq_end) {
+		  goto read_allele_freqs_ret_INVALID_FREQS;
+		}
+		if (!is_set(matched_loaded_alleles, allele_idx)) {
+		  alt_freq_iter = (char*)rawmemchr(alt_freq_iter, ',');
+		  continue;
+		}
+		double dxx;
+		char* cur_freq_end = scanadv_double(alt_freq_iter, &dxx);
+		if (!cur_freq_end) {
+		  cur_freq_end = (char*)rawmemchr(alt_freq_iter, ',');
+		  if (is_nan_str(alt_freq_iter, (uintptr_t)(cur_freq_end - alt_freq_iter))) {
+		    goto read_allele_freqs_skip_variant;
+		  }
+		  goto read_allele_freqs_ret_INVALID_FREQS;
+		}
+		if ((*cur_freq_end != ',') || (dxx < 0.0) || (dxx > freq_max)) {
+		  goto read_allele_freqs_ret_INVALID_FREQS;
+		}
+		alt_freq_iter = cur_freq_end;
+		cur_allele_freqs[loaded_to_internal_allele_idx[allele_idx]] = dxx;
+	      }
+	    } else {
+	      fill_double_zero(cur_allele_ct, cur_allele_freqs);
+	      if ((full_slen > 1) || (*alt_freq_iter != '.')) {
+		if (is_numeq) {
+#ifndef __LP64__
+		  const uint32_t cap_div_10 = (loaded_allele_ct - 1) / 10;
+		  const uint32_t cap_mod_10 = (loaded_allele_ct - 1) % 10;
+#endif
+		  while (1) {
+		    char* cur_entry_end = (char*)rawmemchr(alt_freq_iter, ',');
+		    uint32_t loaded_allele_idx;
+#ifdef __LP64__
+		    if (scanadv_uint_capped(loaded_allele_ct - 1, &alt_freq_iter, &loaded_allele_idx)) {
+		      goto read_allele_freqs_ret_INVALID_FREQS;
+		    }
+#else
+		    if (scanadv_uint_capped32(cap_div_10, cap_mod_10, &alt_freq_iter, &loaded_allele_idx)) {
+		      goto read_allele_freqs_ret_INVALID_FREQS;
+		    }
+#endif
+		    if (*alt_freq_iter != '=') {
+		      goto read_allele_freqs_ret_INVALID_FREQS;
+		    }
+		    if (is_set(matched_loaded_alleles, loaded_allele_idx)) {
+		      const uint32_t internal_allele_idx = loaded_to_internal_allele_idx[loaded_allele_idx];
+		      if (cur_allele_freqs[internal_allele_idx]) {
+			sprintf(g_logbuf, "Error: Duplicate entry on line %" PRIuPTR " of --read-freq file.\n", line_idx);
+			goto read_allele_freqs_ret_MALFORMED_INPUT_2;
+		      }
+		      ++alt_freq_iter;
+		      double dxx;
+		      char* cur_freq_end = scanadv_double(alt_freq_iter, &dxx);
+		      if (!cur_freq_end) {
+			if (is_nan_str(alt_freq_iter, (uintptr_t)(cur_entry_end - alt_freq_iter))) {
+			  goto read_allele_freqs_skip_variant;
+			}
+			goto read_allele_freqs_ret_INVALID_FREQS;
+		      }
+		      if ((cur_freq_end != cur_entry_end) || (dxx < 0.0) || (dxx > freq_max)) {
+			goto read_allele_freqs_ret_INVALID_FREQS;
+		      }
+		      cur_allele_freqs[internal_allele_idx] = dxx;
+		    }
+		    alt_freq_iter = cur_entry_end;
+		    if (alt_freq_iter == alt_freq_end) {
+		      break;
+		    }
+		    ++alt_freq_iter;
+		  }
+		} else {
+		  while (1) {
+		    char* cur_entry_end = (char*)rawmemchr(alt_freq_iter, ',');
+		    const uint32_t cur_entry_slen = (uintptr_t)(cur_entry_end - alt_freq_iter);
+		    char* eq_ptr = (char*)memchr(alt_freq_iter, '=', cur_entry_slen);
+		    if (!eq_ptr) {
+		      goto read_allele_freqs_ret_INVALID_FREQS;
+		    }
+		    *eq_ptr = '\0';
+		    uint32_t internal_allele_idx = 0;
+		    // O(n^2), may want to replace with O(n log n)
+		    for (; internal_allele_idx < cur_allele_ct; ++internal_allele_idx) {
+		      if (!strcmp(alt_freq_iter, cur_alleles[internal_allele_idx])) {
+			if (cur_allele_freqs[internal_allele_idx]) {
+			  sprintf(g_logbuf, "Error: Duplicate entry on line %" PRIuPTR " of --read-freq file.\n", line_idx);
+			  goto read_allele_freqs_ret_MALFORMED_INPUT_2;
+			}
+			alt_freq_iter = &(eq_ptr[1]);
+			double dxx;
+			char* cur_freq_end = scanadv_double(alt_freq_iter, &dxx);
+			if (!cur_freq_end) {
+			  if (is_nan_str(alt_freq_iter, (uintptr_t)(cur_entry_end - alt_freq_iter))) {
+			    goto read_allele_freqs_skip_variant;
+			  }
+			  goto read_allele_freqs_ret_INVALID_FREQS;
+			}
+			if ((cur_freq_end != cur_entry_end) || (dxx < 0.0) || (dxx > freq_max)) {
+			  goto read_allele_freqs_ret_INVALID_FREQS;
+			}
+			cur_allele_freqs[internal_allele_idx] = dxx;
+			break;
+		      }
+		    }
+		    alt_freq_iter = cur_entry_end;
+		    if (alt_freq_iter == alt_freq_end) {
+		      break;
+		    }
+		    ++alt_freq_iter;
+		  }
+		}
+	      }
+	    }
+	  }
+	}
+	if (infer_one_freq && is_set(matched_loaded_alleles, infer_freq_loaded_idx)) {
+	  double obs_ct_recip = 1.0;
+	  if (header_cols & kfReadFreqColsetObsCt) {
+	    uint32_t obs_ct_raw;
+	    if (scan_uint_capped(token_ptrs[kfReadFreqColObsCt], 0xffffffffU, &obs_ct_raw)) {
+	      sprintf(g_logbuf, "Error: Invalid allele count on line %" PRIuPTR " of --read-freq file.\n", line_idx);
+	      goto read_allele_freqs_ret_MALFORMED_INPUT_2;
+	    }
+	    uint64_t obs_ct = obs_ct_raw + ((uint64_t)maf_succ) * cur_allele_ct;
+	    if (!obs_ct) {
+	      goto read_allele_freqs_skip_variant;
+	    }
+	    obs_ct_recip = 1.0 / (double)((int64_t)obs_ct);
+	  }
+	  const uint32_t infer_freq_internal_idx = loaded_to_internal_allele_idx[infer_freq_loaded_idx];
+	  if (cur_allele_ct == 2) {
+	    // optimize common case
+	    double known_freq_d = cur_allele_freqs[1 - infer_freq_internal_idx];
+	    if (maf_succ) {
+	      known_freq_d += 1;
+	    }
+	    double known_scaled_freq = known_freq_d * obs_ct_recip;
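+	    // rounding error can push the scaled frequency slightly above
+	    // 1.0; tolerate up to ~1% overshoot and clamp, error out beyond
+	    // that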
+	    if (known_scaled_freq <= 1.0) {
+	      if (infer_freq_internal_idx) {
+		allele_freqs_write[0] = known_scaled_freq;
+	      } else {
+		allele_freqs_write[0] = 1.0 - known_scaled_freq;
+	      }
+	    } else if (known_scaled_freq <= (1.0 / 0.99)) {
+	      if (infer_freq_internal_idx) {
+		allele_freqs_write[0] = 1.0;
+	      } else {
+		allele_freqs_write[0] = 0.0;
+	      }
+	    } else {
+	      sprintf(g_logbuf, "Error: Frequency/count too large on line %" PRIuPTR " of --read-freq file.\n", line_idx);
+	      goto read_allele_freqs_ret_MALFORMED_INPUT_2;
+	    }
+	  } else {
+	    if (maf_succ) {
+	      for (uint32_t internal_allele_idx = 0; internal_allele_idx < cur_allele_ct; ++internal_allele_idx) {
+		cur_allele_freqs[internal_allele_idx] += 1;
+	      }
+	    }
+	    cur_allele_freqs[infer_freq_internal_idx] = 0.0;
+	    double known_freq_sum_d = 0.0;
+	    for (uint32_t internal_allele_idx = 0; internal_allele_idx < cur_allele_ct; ++internal_allele_idx) {
+	      known_freq_sum_d += cur_allele_freqs[internal_allele_idx];
+	    }
+	    double known_scaled_freq_sum = known_freq_sum_d * obs_ct_recip;
+	    if (known_scaled_freq_sum <= (1.0 / 0.99)) {
+	      if (known_scaled_freq_sum > 1.0) {
+		// possible rounding error, rescale
+		obs_ct_recip = 1.0 / known_scaled_freq_sum;
+		known_scaled_freq_sum = 1.0;
+	      }
+	      const uint32_t cur_allele_ct_m1 = cur_allele_ct - 1;
+	      for (uint32_t internal_allele_idx = 0; internal_allele_idx < cur_allele_ct_m1; ++internal_allele_idx) {
+		double dxx;
+		if (internal_allele_idx == infer_freq_internal_idx) {
+		  dxx = 1.0 - known_scaled_freq_sum;
+		} else {
+		  dxx = obs_ct_recip * cur_allele_freqs[internal_allele_idx];
+		}
+		allele_freqs_write[internal_allele_idx] = dxx;
+	      }
+	    } else {
+	      sprintf(g_logbuf, "Error: Frequency/count too large on line %" PRIuPTR " of --read-freq file.\n", line_idx);
+	      goto read_allele_freqs_ret_MALFORMED_INPUT_2;
+	    }
+	  }
+	} else {
+	  // complete frequency or count data
+	  if (maf_succ) {
+	    for (uint32_t internal_allele_idx = 0; internal_allele_idx < cur_allele_ct; ++internal_allele_idx) {
+	      cur_allele_freqs[internal_allele_idx] += 1;
+	    }
+	  }
+	  double tot_freq = 0.0;
+	  for (uint32_t internal_allele_idx = 0; internal_allele_idx < cur_allele_ct; ++internal_allele_idx) {
+	    tot_freq += cur_allele_freqs[internal_allele_idx];
+	  }
+	  if (tot_freq == 0.0) {
+	    goto read_allele_freqs_skip_variant;
+	  }
+	  const double tot_freq_recip = 1.0 / tot_freq;
+	  const uint32_t cur_allele_ct_m1 = cur_allele_ct - 1;
+	  for (uint32_t internal_allele_idx = 0; internal_allele_idx < cur_allele_ct_m1; ++internal_allele_idx) {
+	    allele_freqs_write[internal_allele_idx] = tot_freq_recip * cur_allele_freqs[internal_allele_idx];
+	  }
+	}
+
+	++loaded_variant_ct;
+	if (!(loaded_variant_ct % 10000)) {
+	  printf("\r--read-freq: Frequencies for %uk variants loaded.", loaded_variant_ct / 1000);
+	  fflush(stdout);
+	}
+      }
+      while (0) {
+      read_allele_freqs_skip_variant:
+	++skipped_variant_ct;
+      }
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto read_allele_freqs_ret_READ_FAIL;
+	}
+	break;
+      }
+      ++line_idx;
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+    }
+    if (gzclose_null(&gz_infile)) {
+      goto read_allele_freqs_ret_READ_FAIL;
+    }
+    putc_unlocked('\r', stdout);
+    LOGPRINTF("--read-freq: Frequencies for %u variant%s loaded.\n", loaded_variant_ct, (loaded_variant_ct == 1)? "" : "s");
+    if (skipped_variant_ct) {
+      LOGERRPRINTFWW("Warning: %" PRIuPTR " entr%s skipped due to missing variant IDs, mismatching allele codes, and/or zero observations.\n", skipped_variant_ct, (skipped_variant_ct == 1)? "y" : "ies");
+    }
+  }
+  while (0) {
+  read_allele_freqs_ret_LONG_LINE:
+    if (loadbuf_size == kMaxLongLine) {
+      LOGERRPRINTF("Error: Line %" PRIuPTR " of --read-freq file is pathologically long.\n", line_idx);
+      reterr = kPglRetMalformedInput;
+      break;
+    }
+  read_allele_freqs_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  read_allele_freqs_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  read_allele_freqs_ret_UNRECOGNIZED_HEADER:
+    logerrprint("Error: Unrecognized header line in --read-freq file.\n");
+    reterr = kPglRetMalformedInput;
+    break;
+  read_allele_freqs_ret_MISSING_TOKENS:
+    LOGERRPRINTFWW("Error: Line %" PRIuPTR " of --read-freq file has fewer tokens than expected.\n", line_idx);
+    reterr = kPglRetMalformedInput;
+    break;
+  read_allele_freqs_ret_INVALID_FREQS:
+    sprintf(g_logbuf, "Error: Invalid frequencies/counts on line %" PRIuPTR " of --read-freq file.\n", line_idx);
+  read_allele_freqs_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+  read_allele_freqs_ret_MALFORMED_INPUT_2:
+    logprint("\n");
+    logerrprintb();
+  read_allele_freqs_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+ read_allele_freqs_ret_1:
+  bigstack_reset(bigstack_mark);
+  gzclose_cond(gz_infile);
+  return reterr;
+}
+
+void compute_maj_alleles(const uintptr_t* variant_include, const uintptr_t* variant_allele_idxs, const double* allele_freqs, uint32_t variant_ct, alt_allele_ct_t* maj_alleles) {
+  uint32_t cur_allele_ct = 2;
+  uint32_t variant_uidx = 0;
+  for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+    next_set_unsafe_ck(variant_include, &variant_uidx);
+    uintptr_t allele_idx_base;
+    if (!variant_allele_idxs) {
+      allele_idx_base = variant_uidx;
+    } else {
+      allele_idx_base = variant_allele_idxs[variant_uidx];
+      cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - allele_idx_base;
+      allele_idx_base -= variant_uidx;
+    }
+    const double* cur_allele_freqs_base = &(allele_freqs[allele_idx_base]);
+    if (cur_allele_ct == 2) {
+      maj_alleles[variant_uidx] = (cur_allele_freqs_base[0] < 0.5);
+    } else {
+      uint32_t maj_allele_idx = 0;
+      double max_freq = cur_allele_freqs_base[0];
+      double tot_alt_freq = max_freq;
+      const uint32_t cur_allele_ct_m1 = cur_allele_ct - 1;
+      for (uint32_t allele_idx = 1; allele_idx < cur_allele_ct_m1; ++allele_idx) {
+	const double cur_freq = cur_allele_freqs_base[allele_idx];
+	tot_alt_freq += cur_freq;
+	if (cur_freq > max_freq) {
+	  maj_allele_idx = allele_idx;
+	  max_freq = cur_freq;
+	}
+      }
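+      // only the first (allele_ct - 1) frequencies are stored; the last
+      // allele's implicit frequency is 1 - tot_alt_freq, so it is the major
+      // allele whenever max_freq + tot_alt_freq <= 1.0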
+      if (max_freq + tot_alt_freq <= 1.0) {
+	maj_allele_idx = cur_allele_ct_m1;
+      }
+      maj_alleles[variant_uidx] = maj_allele_idx;
+    }
+  }
+}
+
+
+// multithread globals
+static pgen_reader_t** g_pgr_ptrs = nullptr;
+static uintptr_t** g_genovecs = nullptr;
+static uint32_t* g_read_variant_uidx_starts = nullptr;
+static uintptr_t** g_missing_hc_acc1 = nullptr;
+static uintptr_t** g_missing_dosage_acc1 = nullptr;
+static uintptr_t** g_hethap_acc1 = nullptr;
+
+static const uintptr_t* g_variant_include = nullptr;
+static const chr_info_t* g_cip = nullptr;
+static const uintptr_t* g_sex_male = nullptr;
+static uint32_t g_raw_sample_ct = 0;
+static uint32_t g_cur_block_size = 0;
+static uint32_t g_calc_thread_ct = 0;
+static pglerr_t g_error_ret = kPglRetSuccess;
+
+THREAD_FUNC_DECL load_sample_missing_cts_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uintptr_t* variant_include = g_variant_include;
+  const chr_info_t* cip = g_cip;
+  const uintptr_t* sex_male = g_sex_male;
+  const uint32_t raw_sample_ct = g_raw_sample_ct;
+  const uint32_t raw_sample_ctaw = BITCT_TO_ALIGNED_WORDCT(raw_sample_ct);
+  const uint32_t acc1_vec_ct = BITCT_TO_VECCT(raw_sample_ct);
+  const uint32_t acc4_vec_ct = acc1_vec_ct * 4;
+  const uint32_t acc8_vec_ct = acc1_vec_ct * 8;
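+  // per-counter accumulator stack layout: 1-bit (acc1_vec_ct vectors), 4-bit
+  // (4x), 8-bit (8x), and 32-bit (32x) sections back-to-back, hence the
+  // "* 45" in the fill_ulong_zero() calls below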
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  const int32_t x_code = cip->xymt_codes[kChrOffsetX];
+  const int32_t y_code = cip->xymt_codes[kChrOffsetY];
+  uintptr_t* genovec_buf = g_genovecs[tidx];
+  uintptr_t* missing_hc_acc1 = g_missing_hc_acc1[tidx];
+  uintptr_t* missing_hc_acc4 = &(missing_hc_acc1[acc1_vec_ct * kWordsPerVec]);
+  uintptr_t* missing_hc_acc8 = &(missing_hc_acc4[acc4_vec_ct * kWordsPerVec]);
+  uintptr_t* missing_hc_acc32 = &(missing_hc_acc8[acc8_vec_ct * kWordsPerVec]);
+  fill_ulong_zero(acc1_vec_ct * kWordsPerVec * 45, missing_hc_acc1);
+  uintptr_t* missing_dosage_acc1 = nullptr;
+  uintptr_t* missing_dosage_acc4 = nullptr;
+  uintptr_t* missing_dosage_acc8 = nullptr;
+  uintptr_t* missing_dosage_acc32 = nullptr;
+  if (g_missing_dosage_acc1) {
+    missing_dosage_acc1 = g_missing_dosage_acc1[tidx];
+    missing_dosage_acc4 = &(missing_dosage_acc1[acc1_vec_ct * kWordsPerVec]);
+    missing_dosage_acc8 = &(missing_dosage_acc4[acc4_vec_ct * kWordsPerVec]);
+    missing_dosage_acc32 = &(missing_dosage_acc8[acc8_vec_ct * kWordsPerVec]);
+    fill_ulong_zero(acc1_vec_ct * kWordsPerVec * 45, missing_dosage_acc1);
+  }
+  // could make this optional
+  // (could technically make missing_hc optional too...)
+  uintptr_t* hethap_acc1 = g_hethap_acc1[tidx];
+  uintptr_t* hethap_acc4 = &(hethap_acc1[acc1_vec_ct * kWordsPerVec]);
+  uintptr_t* hethap_acc8 = &(hethap_acc4[acc4_vec_ct * kWordsPerVec]);
+  uintptr_t* hethap_acc32 = &(hethap_acc8[acc8_vec_ct * kWordsPerVec]);
+  fill_ulong_zero(acc1_vec_ct * kWordsPerVec * 45, hethap_acc1);
+  uint32_t all_ct_rem15 = 15;
+  uint32_t all_ct_rem255d15 = 17;
+  uint32_t hap_ct_rem15 = 15;
+  uint32_t hap_ct_rem255d15 = 17;
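+  // a 4-bit counter lane overflows after 15 increments, so the 1-bit level
+  // is flushed into the 8-bit level every 15 variants; 8-bit lanes hold up
+  // to 255 = 15 * 17 increments, hence the extra 8->32 flush every 17 of
+  // those flushes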
+  while (1) {
+    pgen_reader_t* pgrp = g_pgr_ptrs[tidx];
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uint32_t cur_block_size = g_cur_block_size;
+    const uint32_t cur_idx_ct = (((tidx + 1) * cur_block_size) / calc_thread_ct) - ((tidx * cur_block_size) / calc_thread_ct);
+    uint32_t variant_uidx = g_read_variant_uidx_starts[tidx];
+    uint32_t chr_end = 0;
+    uintptr_t* cur_hets = nullptr;
+    uint32_t is_diploid_x = 0;
+    uint32_t is_y = 0;
+    for (uint32_t cur_idx = 0; cur_idx < cur_idx_ct; ++cur_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      if (variant_uidx >= chr_end) {
+	const uint32_t chr_fo_idx = get_variant_chr_fo_idx(cip, variant_uidx);
+	const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	cur_hets = hethap_acc1;
+	is_diploid_x = 0;
+	is_y = 0;
+	if (chr_idx == x_code) {
+	  is_diploid_x = !is_set(cip->haploid_mask, 0);
+	} else if (chr_idx == y_code) {
+	  is_y = 1;
+	} else {
+	  if (!is_set(cip->haploid_mask, chr_idx)) {
+	    cur_hets = nullptr;
+	  }
+	}
+      }
+      // could instead have missing_hc and (missing_hc - missing_dosage); that
+      // has the advantage of letting you skip one of the two increment
+      // operations when the variant is all hardcalls.
+      pglerr_t reterr = pgr_read_missingness_multi(nullptr, nullptr, raw_sample_ct, variant_uidx, pgrp, missing_hc_acc1, missing_dosage_acc1, cur_hets, genovec_buf);
+      if (reterr) {
+	g_error_ret = reterr;
+	break;
+      }
+      if (is_y) {
+	bitvec_and(sex_male, raw_sample_ctaw, missing_hc_acc1);
+	if (missing_dosage_acc1) {
+	  bitvec_and(sex_male, raw_sample_ctaw, missing_dosage_acc1);
+	}
+      }
+      unroll_incr_1_4(missing_hc_acc1, acc1_vec_ct, missing_hc_acc4);
+      if (missing_dosage_acc1) {
+	unroll_incr_1_4(missing_dosage_acc1, acc1_vec_ct, missing_dosage_acc4);
+      }
+      if (!(--all_ct_rem15)) {
+	unroll_zero_incr_4_8(acc4_vec_ct, missing_hc_acc4, missing_hc_acc8);
+	if (missing_dosage_acc1) {
+	  unroll_zero_incr_4_8(acc4_vec_ct, missing_dosage_acc4, missing_dosage_acc8);
+	}
+	all_ct_rem15 = 15;
+	if (!(--all_ct_rem255d15)) {
+	  unroll_zero_incr_8_32(acc8_vec_ct, missing_hc_acc8, missing_hc_acc32);
+	  if (missing_dosage_acc1) {
+	    unroll_zero_incr_8_32(acc8_vec_ct, missing_dosage_acc8, missing_dosage_acc32);
+	  }
+	  all_ct_rem255d15 = 17;
+	}
+      }
+      if (cur_hets) {
+	if (is_diploid_x) {
+	  bitvec_and(sex_male, raw_sample_ctaw, cur_hets);
+	}
+	unroll_incr_1_4(cur_hets, acc1_vec_ct, hethap_acc4);
+	if (!(--hap_ct_rem15)) {
+	  unroll_zero_incr_4_8(acc4_vec_ct, hethap_acc4, hethap_acc8);
+	  hap_ct_rem15 = 15;
+	  if (!(--hap_ct_rem255d15)) {
+	    unroll_zero_incr_8_32(acc8_vec_ct, hethap_acc8, hethap_acc32);
+	    hap_ct_rem255d15 = 17;
+	  }
+	}
+      }
+    }
+    if (is_last_block) {
+      unroll_incr_4_8(missing_hc_acc4, acc4_vec_ct, missing_hc_acc8);
+      unroll_incr_8_32(missing_hc_acc8, acc8_vec_ct, missing_hc_acc32);
+      if (missing_dosage_acc1) {
+	unroll_incr_4_8(missing_dosage_acc4, acc4_vec_ct, missing_dosage_acc8);
+	unroll_incr_8_32(missing_dosage_acc8, acc8_vec_ct, missing_dosage_acc32);
+      }
+      unroll_incr_4_8(hethap_acc4, acc4_vec_ct, hethap_acc8);
+      unroll_incr_8_32(hethap_acc8, acc8_vec_ct, hethap_acc32);
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+  }
+}
+
+pglerr_t load_sample_missing_cts(const uintptr_t* sex_male, const uintptr_t* variant_include, const chr_info_t* cip, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t raw_sample_ct, uint32_t max_thread_ct, uintptr_t pgr_alloc_cacheline_ct, pgen_file_info_t* pgfip, uint32_t* sample_missing_hc_cts, uint32_t* sample_missing_dosage_cts, uint32_t* sample_hethap_cts) {
+  assert(sample_missing_hc_cts || sample_missing_dosage_cts);
+  unsigned char* bigstack_mark = g_bigstack_base;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if (!variant_ct) {
+      fill_uint_zero(raw_sample_ct, sample_missing_hc_cts);
+      if (sample_missing_dosage_cts) {
+	fill_uint_zero(raw_sample_ct, sample_missing_dosage_cts);
+      }
+      fill_uint_zero(raw_sample_ct, sample_hethap_cts);
+      goto load_sample_missing_cts_ret_1;
+    }
+    // this doesn't seem to saturate below 35 threads
+    uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+    const uint32_t acc1_vec_ct = BITCT_TO_VECCT(raw_sample_ct);
+    const uintptr_t acc1_alloc_cacheline_ct = DIV_UP(acc1_vec_ct * (45 * k1LU * kBytesPerVec), kCacheline);
+    g_sex_male = sex_male;
+    uintptr_t thread_alloc_cacheline_ct = 2 * acc1_alloc_cacheline_ct;
+    g_missing_dosage_acc1 = nullptr;
+    if (sample_missing_dosage_cts) {
+      if (bigstack_alloc_ulp(calc_thread_ct, &g_missing_dosage_acc1)) {
+	goto load_sample_missing_cts_ret_NOMEM;
+      }
+      thread_alloc_cacheline_ct += acc1_alloc_cacheline_ct;
+    }
+    if (bigstack_alloc_ulp(calc_thread_ct, &g_missing_hc_acc1) ||
+	bigstack_alloc_ulp(calc_thread_ct, &g_hethap_acc1)) {
+      goto load_sample_missing_cts_ret_NOMEM;
+    }
+    unsigned char* main_loadbufs[2];
+    pthread_t* threads;
+    uint32_t read_block_size;
+    if (multithread_load_init(variant_include, raw_sample_ct, raw_variant_ct, pgr_alloc_cacheline_ct, thread_alloc_cacheline_ct, 0, pgfip, &calc_thread_ct, &g_genovecs, nullptr, nullptr, &read_block_size, main_loadbufs, &threads, &g_pgr_ptrs, &g_read_variant_uidx_starts)) {
+      goto load_sample_missing_cts_ret_NOMEM;
+    }
+    const uintptr_t acc1_alloc = acc1_alloc_cacheline_ct * kCacheline;
+    for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+      g_missing_hc_acc1[tidx] = (uintptr_t*)bigstack_alloc_raw(acc1_alloc);
+      if (g_missing_dosage_acc1) {
+	g_missing_dosage_acc1[tidx] = (uintptr_t*)bigstack_alloc_raw(acc1_alloc);
+      }
+      g_hethap_acc1[tidx] = (uintptr_t*)bigstack_alloc_raw(acc1_alloc);
+    }
+    g_variant_include = variant_include;
+    g_cip = cip;
+    g_raw_sample_ct = raw_sample_ct;
+    g_calc_thread_ct = calc_thread_ct;
+
+    // nearly identical to load_allele_and_geno_counts()
+    logprint("Calculating sample missingness rates... ");
+    fputs("0%", stdout);
+    fflush(stdout);
+    uint32_t pct = 0;
+
+    const uint32_t read_block_sizel = BITCT_TO_WORDCT(read_block_size);
+    const uint32_t read_block_ct_m1 = (raw_variant_ct - 1) / read_block_size;
+    uint32_t parity = 0;
+    uint32_t read_block_idx = 0;
+    uint32_t variant_idx = 0;
+    uint32_t is_last_block = 0;
+    uint32_t cur_read_block_size = read_block_size;
+    uint32_t next_print_variant_idx = variant_ct / 100;
+
+    while (1) {
+      uintptr_t cur_loaded_variant_ct = 0;
+      if (!is_last_block) {
+	while (read_block_idx < read_block_ct_m1) {
+	  cur_loaded_variant_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), read_block_sizel);
+	  if (cur_loaded_variant_ct) {
+	    break;
+	  }
+	  ++read_block_idx;
+	}
+	if (read_block_idx == read_block_ct_m1) {
+	  cur_read_block_size = raw_variant_ct - (read_block_idx * read_block_size);
+	  cur_loaded_variant_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), BITCT_TO_WORDCT(cur_read_block_size));
+	}
+	if (pgfi_multiread(variant_include, read_block_idx * read_block_size, read_block_idx * read_block_size + cur_read_block_size, cur_loaded_variant_ct, pgfip)) {
+	  if (variant_idx) {
+	    join_threads2z(calc_thread_ct, 0, threads);
+	    g_cur_block_size = 0;
+	    error_cleanup_threads2z(load_sample_missing_cts_thread, calc_thread_ct, threads);
+	  }
+	  goto load_sample_missing_cts_ret_READ_FAIL;
+	}
+      }
+      if (variant_idx) {
+	join_threads2z(calc_thread_ct, is_last_block, threads);
+	reterr = g_error_ret;
+	if (reterr) {
+	  if (!is_last_block) {
+	    g_cur_block_size = 0;
+	    error_cleanup_threads2z(load_sample_missing_cts_thread, calc_thread_ct, threads);
+	  }
+	  if (reterr == kPglRetMalformedInput) {
+	    logprint("\n");
+	    logerrprint("Error: Malformed .pgen file.\n");
+	  }
+	  goto load_sample_missing_cts_ret_1;
+	}
+      }
+      if (!is_last_block) {
+	g_cur_block_size = cur_loaded_variant_ct;
+	compute_uidx_start_partition(variant_include, cur_loaded_variant_ct, calc_thread_ct, read_block_idx * read_block_size, g_read_variant_uidx_starts);
+	for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	  g_pgr_ptrs[tidx]->fi.block_base = pgfip->block_base;
+	  g_pgr_ptrs[tidx]->fi.block_offset = pgfip->block_offset;
+	}
+	is_last_block = (variant_idx + cur_loaded_variant_ct == variant_ct);
+	if (spawn_threads2z(load_sample_missing_cts_thread, calc_thread_ct, is_last_block, threads)) {
+	  goto load_sample_missing_cts_ret_THREAD_CREATE_FAIL;
+	}
+      }
+
+      parity = 1 - parity;
+      if (variant_idx == variant_ct) {
+	break;
+      }
+      if (variant_idx >= next_print_variant_idx) {
+	if (pct > 10) {
+	  putc_unlocked('\b', stdout);
+	}
+	pct = (variant_idx * 100LLU) / variant_ct;
+	printf("\b\b%u%%", pct++);
+	fflush(stdout);
+	next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+      }
+
+      ++read_block_idx;
+      variant_idx += cur_loaded_variant_ct;
+      // crucially, this is independent of the pgen_reader_t block_base
+      // pointers
+      pgfip->block_base = main_loadbufs[parity];
+    }
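+    // the 32-bit counts live 13 vectors (1 + 4 + 8) past the start of each
+    // accumulator, in scrambled (vector-interleaved) sample order; they are
+    // unscrambled below via scramble_1_4_8_32()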
+    const uint32_t sample_ctv = acc1_vec_ct * kBitsPerVec;
+    const uintptr_t acc32_offset = acc1_vec_ct * (13 * k1LU * kWordsPerVec);
+    uint32_t* scrambled_missing_hc_cts = nullptr;
+    uint32_t* scrambled_missing_dosage_cts = nullptr;
+    uint32_t* scrambled_hethap_cts = nullptr;
+    scrambled_missing_hc_cts = (uint32_t*)(&(g_missing_hc_acc1[0][acc32_offset]));
+    if (g_missing_dosage_acc1) {
+      scrambled_missing_dosage_cts = (uint32_t*)(&(g_missing_dosage_acc1[0][acc32_offset]));
+    }
+    scrambled_hethap_cts = (uint32_t*)(&(g_hethap_acc1[0][acc32_offset]));
+    for (uint32_t tidx = 1; tidx < calc_thread_ct; ++tidx) {
+      uint32_t* thread_scrambled_missing_hc_cts = (uint32_t*)(&(g_missing_hc_acc1[tidx][acc32_offset]));
+      for (uint32_t uii = 0; uii < sample_ctv; ++uii) {
+	scrambled_missing_hc_cts[uii] += thread_scrambled_missing_hc_cts[uii];
+      }
+      if (scrambled_missing_dosage_cts) {
+	uint32_t* thread_scrambled_missing_dosage_cts = (uint32_t*)(&(g_missing_dosage_acc1[tidx][acc32_offset]));
+	for (uint32_t uii = 0; uii < sample_ctv; ++uii) {
+	  scrambled_missing_dosage_cts[uii] += thread_scrambled_missing_dosage_cts[uii];
+	}
+      }
+      uint32_t* thread_scrambled_hethap_cts = (uint32_t*)(&(g_hethap_acc1[tidx][acc32_offset]));
+      for (uint32_t uii = 0; uii < sample_ctv; ++uii) {
+	scrambled_hethap_cts[uii] += thread_scrambled_hethap_cts[uii];
+      }
+    }
+    for (uint32_t sample_uidx = 0; sample_uidx < raw_sample_ct; ++sample_uidx) {
+      const uint32_t scrambled_idx = scramble_1_4_8_32(sample_uidx);
+      sample_missing_hc_cts[sample_uidx] = scrambled_missing_hc_cts[scrambled_idx];
+      if (sample_missing_dosage_cts) {
+	sample_missing_dosage_cts[sample_uidx] = scrambled_missing_dosage_cts[scrambled_idx];
+      }
+      sample_hethap_cts[sample_uidx] = scrambled_hethap_cts[scrambled_idx];
+    }
+    if (pct > 10) {
+      putc_unlocked('\b', stdout);
+    }
+    fputs("\b\b", stdout);
+    LOGPRINTF("done.\n");
+  }
+  while (0) {
+  load_sample_missing_cts_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  load_sample_missing_cts_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  load_sample_missing_cts_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ load_sample_missing_cts_ret_1:
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t mind_filter(const uint32_t* sample_missing_cts, const uint32_t* sample_hethap_cts, const char* sample_ids, const char* sids, uint32_t raw_sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t variant_ct, uint32_t variant_ct_y, double mind_thresh, uintptr_t* sample_include, uintptr_t* sex_male, uint32_t* sample_ct_ptr, char* outname, char* outname_end) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t orig_sample_ct = *sample_ct_ptr;
+    if (!orig_sample_ct) {
+      goto mind_filter_ret_1;
+    }
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+
+    uint32_t max_missing_cts[2];
+    mind_thresh *= 1 + kSmallEpsilon;
+    max_missing_cts[0] = (int32_t)((double)((int32_t)(variant_ct - variant_ct_y)) * mind_thresh);
+    max_missing_cts[1] = (int32_t)((double)((int32_t)variant_ct) * mind_thresh);
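+    // index 0 applies to nonmales (chrY variants excluded from the
+    // denominator), index 1 to males (all variants); selected below via
+    // IS_SET(sex_male, sample_uidx)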
+    uintptr_t* newly_excluded;
+    if (bigstack_calloc_ul(raw_sample_ctl, &newly_excluded)) {
+      goto mind_filter_ret_NOMEM;
+    }
+    uint32_t sample_uidx = 0;
+    for (uint32_t sample_idx = 0; sample_idx < orig_sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_unsafe_ck(sample_include, &sample_uidx);
+      uint32_t cur_missing_geno_ct = sample_missing_cts[sample_uidx];
+      if (sample_hethap_cts) {
+	cur_missing_geno_ct += sample_hethap_cts[sample_uidx];
+      }
+      if (cur_missing_geno_ct > max_missing_cts[IS_SET(sex_male, sample_uidx)]) {
+	SET_BIT(sample_uidx, newly_excluded);
+      }
+    }
+    const uint32_t removed_ct = popcount_longs(newly_excluded, raw_sample_ctl);
+    // don't bother with allow_no_samples check here, better to have that in
+    // just one place
+    LOGPRINTF("%u sample%s removed due to missing genotype data (--mind).\n", removed_ct, (removed_ct == 1)? "" : "s");
+    if (removed_ct) {
+      bitvec_andnot(newly_excluded, raw_sample_ctl, sample_include);
+      bitvec_andnot(newly_excluded, raw_sample_ctl, sex_male);
+      strcpy(outname_end, ".irem");
+      if (fopen_checked(outname, "w", &outfile)) {
+	goto mind_filter_ret_OPEN_FAIL;
+      }
+      sample_uidx = 0;
+      char* textbuf = g_textbuf;
+      char* write_iter = textbuf;
+      char* textbuf_flush = &(textbuf[kMaxMediumLine]);
+      for (uint32_t sample_idx = 0; sample_idx < removed_ct; ++sample_idx, ++sample_uidx) {
+	next_set_unsafe_ck(newly_excluded, &sample_uidx);
+	write_iter = strcpya(write_iter, &(sample_ids[sample_uidx * max_sample_id_blen]));
+	if (sids) {
+	  *write_iter++ = '\t';
+	  write_iter = strcpya(write_iter, &(sids[sample_uidx * max_sid_blen]));
+	}
+	*write_iter++ = '\n';
+	if (write_iter >= textbuf_flush) {
+	  if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	    goto mind_filter_ret_WRITE_FAIL;
+	  }
+	  write_iter = textbuf;
+	}
+      }
+      if (write_iter != textbuf) {
+	if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	  goto mind_filter_ret_WRITE_FAIL;
+	}
+      }
+      if (fclose_null(&outfile)) {
+	goto mind_filter_ret_WRITE_FAIL;
+      }
+      LOGPRINTFWW("ID%s written to %s .\n", (removed_ct == 1)? "" : "s", outname);
+      *sample_ct_ptr -= removed_ct;
+    }
+  }
+  while (0) {
+  mind_filter_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  mind_filter_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  mind_filter_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  }
+ mind_filter_ret_1:
+  fclose_cond(outfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+void enforce_geno_thresh(const chr_info_t* cip, const uint32_t* variant_missing_cts, const uint32_t* variant_hethap_cts, uint32_t sample_ct, uint32_t male_ct, uint32_t first_hap_uidx, double geno_thresh, uintptr_t* variant_include, uint32_t* variant_ct_ptr) {
+  const uint32_t prefilter_variant_ct = *variant_ct_ptr;
+  geno_thresh *= 1 + kSmallEpsilon;
+  const uint32_t missing_max_ct_nony = (int32_t)(geno_thresh * ((int32_t)sample_ct));
+  const uint32_t missing_max_ct_y = (int32_t)(geno_thresh * ((int32_t)male_ct));
+  uint32_t cur_missing_max_ct = missing_max_ct_nony;
+  uint32_t removed_ct = 0;
+  uint32_t variant_uidx = 0;
+  uint32_t y_thresh = 0xffffffffU;
+  uint32_t y_end = 0xffffffffU;
+  int32_t y_code;
+  if (xymt_exists(cip, kChrOffsetY, &y_code)) {
+    const uint32_t y_chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)y_code];
+    y_thresh = cip->chr_fo_vidx_start[y_chr_fo_idx];
+    y_end = cip->chr_fo_vidx_start[y_chr_fo_idx + 1];
+  }
+  for (uint32_t variant_idx = 0; variant_idx < prefilter_variant_ct; ++variant_idx, ++variant_uidx) {
+    next_set_unsafe_ck(variant_include, &variant_uidx);
+    if (variant_uidx >= y_thresh) {
+      if (variant_uidx < y_end) {
+	y_thresh = y_end;
+	cur_missing_max_ct = missing_max_ct_y;
+      } else {
+	y_thresh = 0xffffffffU;
+	cur_missing_max_ct = missing_max_ct_nony;
+      }
+    }
+    uint32_t cur_missing_ct = variant_missing_cts[variant_uidx];
+    if (variant_uidx >= first_hap_uidx) {
+      cur_missing_ct += variant_hethap_cts[variant_uidx - first_hap_uidx];
+    }
+    if (cur_missing_ct > cur_missing_max_ct) {
+      CLEAR_BIT(variant_uidx, variant_include);
+      ++removed_ct;
+    }
+  }
+  LOGPRINTF("--geno: %u variant%s removed due to missing genotype data.\n", removed_ct, (removed_ct == 1)? "" : "s");
+  *variant_ct_ptr -= removed_ct;
+}
+
+void enforce_hwe_thresh(const chr_info_t* cip, const uint32_t* founder_raw_geno_cts, const uint32_t* founder_x_male_geno_cts, const uint32_t* founder_x_nosex_geno_cts, const double* hwe_x_pvals, misc_flags_t misc_flags, double hwe_thresh, uint32_t nonfounders, uintptr_t* variant_include, uint32_t* variant_ct_ptr) {
+  if (cip->haploid_mask[0] & 1) {
+    logerrprint("Warning: --hwe has no effect since entire genome is haploid.\n");
+    return;
+  }
+  uint32_t prefilter_variant_ct = *variant_ct_ptr;
+  uint32_t x_start = 0xffffffffU;
+  uint32_t x_end = 0xffffffffU;
+  int32_t x_code;
+  if (xymt_exists(cip, kChrOffsetX, &x_code)) {
+    const uint32_t x_chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)x_code];
+    x_start = cip->chr_fo_vidx_start[x_chr_fo_idx];
+    x_end = cip->chr_fo_vidx_start[x_chr_fo_idx + 1];
+    // bugfix (4 Jun 2017): if no sex info available, need to skip chrX
+    if (!hwe_x_pvals) {
+      prefilter_variant_ct -= popcount_bit_idx(variant_include, x_start, x_end);
+    }
+  }
+  uint32_t x_thresh = x_start;
+  const uint32_t midp = (misc_flags / kfMiscHweMidp) & 1;
+  const uint32_t keep_fewhet = (misc_flags / kfMiscHweKeepFewhet) & 1;
+  hwe_thresh *= 1 - kSmallEpsilon;
+  uint32_t removed_ct = 0;
+  uint32_t variant_uidx = 0;
+  uint32_t min_obs = 0xffffffffU;
+  uint32_t max_obs = 0;
+  uint32_t is_x = 0;
+  uint32_t male_ref_ct = 0;
+  uint32_t male_alt_ct = 0;
+  const double* hwe_x_pvals_iter = hwe_x_pvals;
+  const double hwe_thresh_recip = (1 + 4 * kSmallEpsilon) / hwe_thresh;
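+  // used by the female-only chrX retest below: joint_pval is multiplied by
+  // this (slightly inflated to undo the kSmallEpsilon shrink above), and the
+  // variant is kept iff the female-only p-value stays below the result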
+  for (uint32_t variant_idx = 0; variant_idx < prefilter_variant_ct; ++variant_idx, ++variant_uidx) {
+    next_set_unsafe_ck(variant_include, &variant_uidx);
+    if (variant_uidx >= x_thresh) {
+      is_x = (variant_uidx < x_end);
+      if (is_x) {
+	if (hwe_x_pvals) {
+	  x_thresh = x_end;
+	} else {
+	  is_x = 0;
+	  x_thresh = 0xffffffffU;
+	  variant_uidx = next_set_unsafe(variant_include, x_end);
+	}
+      } else {
+	x_thresh = 0xffffffffU;
+      }
+    }
+    const uint32_t* cur_geno_cts = &(founder_raw_geno_cts[3 * variant_uidx]);
+    uint32_t homref_ct = cur_geno_cts[0];
+    uint32_t hetref_ct = cur_geno_cts[1];
+    uint32_t nonref_diploid_ct = cur_geno_cts[2];
+    uint32_t test_failed;
+    uint32_t cur_obs_ct;
+    if (!is_x) {
+      cur_obs_ct = homref_ct + hetref_ct + nonref_diploid_ct;
+      if (!cur_obs_ct) {
+	// currently happens for chrY, chrM
+	continue;
+      }
+      if (keep_fewhet) {
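+	// het-deficiency test: at Hardy-Weinberg equilibrium, (het count)^2
+	// equals 4 * homref * homalt, so this condition holds iff hets are at
+	// or below expectation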
+	if (hetref_ct * ((uint64_t)hetref_ct) <= (4LLU * homref_ct) * nonref_diploid_ct) {
+	  // no p-value computed at all, so don't count this toward
+	  // min_obs/max_obs
+	  continue;
+	}
+      }
+      if (midp) {
+	test_failed = SNPHWE_midp_t(hetref_ct, homref_ct, nonref_diploid_ct, hwe_thresh);
+      } else {
+	test_failed = SNPHWE_t(hetref_ct, homref_ct, nonref_diploid_ct, hwe_thresh);
+      }
+    } else {
+      if (founder_x_male_geno_cts) {
+	const uint32_t* cur_male_geno_cts = &(founder_x_male_geno_cts[(3 * k1LU) * (variant_uidx - x_start)]);
+	male_ref_ct = cur_male_geno_cts[0];
+	homref_ct -= male_ref_ct;
+	hetref_ct -= cur_male_geno_cts[1];
+	male_alt_ct = cur_male_geno_cts[2];
+	nonref_diploid_ct -= male_alt_ct;
+      }
+      if (founder_x_nosex_geno_cts) {
+	const uint32_t* cur_nosex_geno_cts = &(founder_x_nosex_geno_cts[(3 * k1LU) * (variant_uidx - x_start)]);
+	homref_ct -= cur_nosex_geno_cts[0];
+	hetref_ct -= cur_nosex_geno_cts[1];
+	nonref_diploid_ct -= cur_nosex_geno_cts[2];
+      }
+      cur_obs_ct = homref_ct + hetref_ct + nonref_diploid_ct + male_ref_ct + male_alt_ct;
+      double joint_pval = *hwe_x_pvals_iter++;
+      test_failed = (joint_pval < hwe_thresh);
+      if (test_failed && keep_fewhet && (hetref_ct * ((uint64_t)hetref_ct) < (4LLU * homref_ct) * nonref_diploid_ct)) {
+	// female-only retest
+	if (joint_pval) {
+	  joint_pval *= hwe_thresh_recip;
+	} else {
+	  // keep the variant iff female-only p-value also underflows
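+	  // (2.2250738585072013e-308 parses to DBL_MIN, the smallest
+	  // positive normal double)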
+	  joint_pval = 2.2250738585072013e-308;
+	}
+	if (midp) {
+	  test_failed = !SNPHWE_midp_t(hetref_ct, homref_ct, nonref_diploid_ct, joint_pval);
+	} else {
+	  test_failed = !SNPHWE_t(hetref_ct, homref_ct, nonref_diploid_ct, joint_pval);
+	}
+      }
+    }
+    if (test_failed) {
+      CLEAR_BIT(variant_uidx, variant_include);
+      ++removed_ct;
+    }
+    if (cur_obs_ct < min_obs) {
+      min_obs = cur_obs_ct;
+    }
+    if (cur_obs_ct > max_obs) {
+      max_obs = cur_obs_ct;
+    }
+  }
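+  // min_obs < 0.9 * max_obs  <=>  9 * max_obs > 10 * min_obs, i.e. the
+  // smallest observation count is more than 10% below the largest.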
+  if (((uint64_t)max_obs) * 9 > ((uint64_t)min_obs) * 10) {
+    logerrprint("Warning: --hwe observation counts vary by more than 10%.  Consider using\n--geno, and/or applying different p-value thresholds to distinct subsets of\nyour data.\n");
+  }
+  LOGPRINTFWW("--hwe%s%s: %u variant%s removed due to Hardy-Weinberg exact test (%s).\n", midp? " midp" : "", keep_fewhet? " keep-fewhet" : "", removed_ct, (removed_ct == 1)? "" : "s", nonfounders? "all samples" : "founders only");
+  *variant_ct_ptr -= removed_ct;
+}
+
+void enforce_minor_freq_constraints(const uintptr_t* variant_allele_idxs, const uint64_t* founder_allele_dosages, const double* allele_freqs, double min_maf, double max_maf, uint64_t min_allele_dosage, uint64_t max_allele_dosage, uintptr_t* variant_include, uint32_t* variant_ct_ptr) {
+  const uint32_t prefilter_variant_ct = *variant_ct_ptr;
+  uint32_t variant_uidx = 0;
+  uint32_t removed_ct = 0;
+  if ((min_maf != 0.0) || (max_maf != 1.0)) {
+    // defend against floating point error
+    min_maf *= 1.0 - kSmallEpsilon;
+    max_maf *= 1.0 + kSmallEpsilon;
+  } else {
+    allele_freqs = nullptr;
+  }
+  const uint32_t dosage_filter = min_allele_dosage || (max_allele_dosage != (~0LLU));
+  
+  uint32_t cur_allele_ct = 2;
+  for (uint32_t variant_idx = 0; variant_idx < prefilter_variant_ct; ++variant_idx, ++variant_uidx) {
+    next_set_unsafe_ck(variant_include, &variant_uidx);
+    uintptr_t variant_allele_idx_base;
+    if (!variant_allele_idxs) {
+      variant_allele_idx_base = 2 * variant_uidx;
+    } else {
+      variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+      cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+    }
+    if (allele_freqs) {
+      const double cur_nonmaj_freq = get_nonmaj_freq(&(allele_freqs[variant_allele_idx_base - variant_uidx]), cur_allele_ct);
+      if ((cur_nonmaj_freq < min_maf) || (cur_nonmaj_freq > max_maf)) {
+	CLEAR_BIT(variant_uidx, variant_include);
+	++removed_ct;
+	continue;
+      }
+    }
+    if (dosage_filter) {
+      const uint64_t* cur_founder_allele_dosages = &(founder_allele_dosages[variant_allele_idx_base]);
+      uint64_t max_dosage = cur_founder_allele_dosages[0];
+      uint64_t nonmaj_dosage;
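+      // for a biallelic variant, the nonmajor dosage is just the smaller of
+      // the two allele dosages; for a multiallelic variant, it's the total
+      // dosage minus the largest single-allele dosage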
+      if (cur_allele_ct == 2) {
+	nonmaj_dosage = MINV(max_dosage, cur_founder_allele_dosages[1]);
+      } else {
+	uint64_t tot_dosage = max_dosage;
+	for (uint32_t allele_idx = 1; allele_idx < cur_allele_ct; ++allele_idx) {
+	  const uint64_t cur_dosage = cur_founder_allele_dosages[allele_idx];
+	  tot_dosage += cur_dosage;
+	  if (cur_dosage > max_dosage) {
+	    max_dosage = cur_dosage;
+	  }
+	}
+	nonmaj_dosage = tot_dosage - max_dosage;
+      }
+      if ((nonmaj_dosage < min_allele_dosage) || (nonmaj_dosage > max_allele_dosage)) {
+	CLEAR_BIT(variant_uidx, variant_include);
+	++removed_ct;
+      }
+    }
+  }
+  LOGPRINTFWW("%u variant%s removed due to minor allele threshold(s) (--maf/--max-maf/--mac/--max-mac).\n", removed_ct, (removed_ct == 1)? "" : "s");
+  *variant_ct_ptr -= removed_ct;
+}
+
+void enforce_mach_r2_thresh(const chr_info_t* cip, const double* mach_r2_vals, double mach_r2_min, double mach_r2_max, uintptr_t* variant_include, uint32_t* variant_ct_ptr) {
+  const uint32_t prefilter_variant_ct = *variant_ct_ptr;
+  mach_r2_min *= 1 - kSmallEpsilon;
+  mach_r2_max *= 1 + kSmallEpsilon;
+  uint32_t removed_ct = 0;
+  uint32_t variant_uidx = 0;
+  const int32_t mt_code = cip->xymt_codes[kChrOffsetMT];
+  const uint32_t chr_ct = cip->chr_ct;
+  uint32_t relevant_variant_ct = prefilter_variant_ct;
+  for (uint32_t chr_fo_idx = 0; chr_fo_idx < chr_ct; ++chr_fo_idx) {
+    const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+    // skip X, Y, MT, other haploid
+    if (is_set(cip->haploid_mask, chr_idx) || (chr_idx == ((uint32_t)mt_code))) {
+      relevant_variant_ct -= popcount_bit_idx(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], cip->chr_fo_vidx_start[chr_fo_idx + 1]);
+    }
+  }
+  uint32_t chr_fo_idx = 0xffffffffU;
+  uint32_t chr_end = 0;
+  for (uint32_t variant_idx = 0; variant_idx < relevant_variant_ct; ++variant_idx, ++variant_uidx) {
+    next_set_unsafe_ck(variant_include, &variant_uidx);
+    while (variant_uidx >= chr_end) {
+      uint32_t chr_idx;
+      do {
+	chr_idx = cip->chr_file_order[++chr_fo_idx];
+      } while (is_set(cip->haploid_mask, chr_idx) || (chr_idx == ((uint32_t)mt_code)));
+      chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+      variant_uidx = next_set(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], chr_end);
+    }
+    const double cur_mach_r2 = mach_r2_vals[variant_uidx];
+    if ((cur_mach_r2 < mach_r2_min) || (cur_mach_r2 > mach_r2_max)) {
+      CLEAR_BIT(variant_uidx, variant_include);
+      ++removed_ct;
+    }
+  }
+  LOGPRINTF("--mach-r2-filter: %u variant%s removed.\n", removed_ct, (removed_ct == 1)? "" : "s");
+  *variant_ct_ptr -= removed_ct;
+}
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/plink2_filter.h b/plink2_filter.h
new file mode 100644
index 0000000..a910fb4
--- /dev/null
+++ b/plink2_filter.h
@@ -0,0 +1,98 @@
+#ifndef __PLINK2_FILTER_H__
+#define __PLINK2_FILTER_H__
+
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_common.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+// order is such that (kCmpOperatorEq - x) is the inverse of x
+ENUM_U31_DEF_START()
+  kCmpOperatorNoteq,
+  kCmpOperatorLe,
+  kCmpOperatorLeq,
+
+  kCmpOperatorGe,
+  kCmpOperatorGeq,
+  kCmpOperatorEq
+ENUM_U31_DEF_END(cmp_binary_op_t);
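+// e.g. the inverse of kCmpOperatorLe ('<', value 1) is
+// kCmpOperatorEq - kCmpOperatorLe == kCmpOperatorGeq ('>=', value 4).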
+
+typedef struct {
+  // Restrict to [pheno/covar name] [operator] [pheno val] for now.  Could
+  // support or/and, parentheses, etc. later.
+
+  // Currently stores null-terminated pheno/covar name, followed by
+  // null-terminated value string.  Storage format needs to be synced with
+  // plink2.cpp validate_and_alloc_cmp_expr().
+  char* pheno_name;
+  cmp_binary_op_t binary_op;
+} cmp_expr_t;
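+// Illustrative example: a condition like "PHENO1 <= 1.5" would be stored with
+// pheno_name pointing at "PHENO1\0" immediately followed by "1.5\0", and
+// binary_op set to kCmpOperatorLeq.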
+
+void init_cmp_expr(cmp_expr_t* cmp_expr_ptr);
+
+void cleanup_cmp_expr(cmp_expr_t* cmp_expr_ptr);
+
+pglerr_t from_to_flag(char** variant_ids, const uint32_t* variant_id_htable, const char* varid_from, const char* varid_to, uint32_t raw_variant_ct, uintptr_t max_variant_id_slen, uintptr_t variant_id_htable_size, uintptr_t* variant_include, chr_info_t* cip, uint32_t* variant_ct_ptr);
+
+pglerr_t snp_flag(const uint32_t* variant_bps, char** variant_ids, const uint32_t* variant_id_htable, const char* varid_snp, uint32_t raw_variant_ct, uintptr_t max_variant_id_slen, uintptr_t variant_id_htable_size, uint32_t do_exclude, int32_t window_bp, uintptr_t* variant_include, chr_info_t* cip, uint32_t* variant_ct_ptr);
+
+pglerr_t snps_flag(char** variant_ids, const uint32_t* variant_id_htable, const range_list_t* snps_range_list_ptr, uint32_t raw_variant_ct, uintptr_t max_variant_id_slen, uintptr_t variant_id_htable_size, uint32_t do_exclude, uintptr_t* variant_include, uint32_t* variant_ct_ptr);
+
+pglerr_t extract_exclude_flag_norange(char** variant_ids, const uint32_t* variant_id_htable, const char* fname, uint32_t raw_variant_ct, uintptr_t max_variant_id_slen, uintptr_t variant_id_htable_size, uint32_t do_exclude, uintptr_t* variant_include, uint32_t* variant_ct_ptr);
+
+FLAGSET_DEF_START()
+  kfKeep0,
+  kfKeepRemove = (1 << 0),
+  kfKeepFam = (1 << 1),
+  kfKeepForceSid = (1 << 2)
+FLAGSET_DEF_END(keep_flags_t);
+
+pglerr_t keep_or_remove(const char* fname, const char* sample_ids, const char* sids, uint32_t raw_sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, keep_flags_t flags, uintptr_t* sample_include, uint32_t* sample_ct_ptr);
+
+pglerr_t require_pheno(const pheno_col_t* pheno_cols, const char* pheno_names, char* require_pheno_flattened, uint32_t raw_sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t is_covar, uintptr_t* sample_include, uint32_t* sample_ct_ptr);
+
+pglerr_t keep_remove_if(const cmp_expr_t* cmp_expr, const pheno_col_t* pheno_cols, const char* pheno_names, const pheno_col_t* covar_cols, const char* covar_names, uint32_t raw_sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t covar_ct, uintptr_t max_covar_name_blen, uint32_t affection_01, uint32_t is_remove, uintptr_t* sample_include, uint32_t* sample_ct_ptr);
+
+pglerr_t keep_remove_cats(const char* cats_fname, const char* cat_names_flattened, const char* cat_phenoname, const pheno_col_t* pheno_cols, const char* pheno_names, const pheno_col_t* covar_cols, const char* covar_names, uint32_t raw_sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t covar_ct, uintptr_t max_covar_name_blen, uint32_t is_remove, uint32_t max_thread_ct, uintptr_t* sample_include, uint32_t* sample_ct_ptr);
+
+void compute_allele_freqs(const uintptr_t* variant_include, const uintptr_t* variant_allele_idxs, const uint64_t* founder_allele_dosages, uint32_t variant_ct, uint32_t maf_succ, double* allele_freqs);
+
+pglerr_t read_allele_freqs(const uintptr_t* variant_include, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const char* read_freq_fname, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_alt_allele_ct, uint32_t max_variant_id_slen, uint32_t max_allele_slen, uint32_t maf_succ, uint32_t max_thread_ct, double* allele_freqs);
+
+void compute_maj_alleles(const uintptr_t* variant_include, const uintptr_t* variant_allele_idxs, const double* allele_freqs, uint32_t variant_ct, alt_allele_ct_t* maj_alleles);
+
+pglerr_t load_sample_missing_cts(const uintptr_t* sex_male, const uintptr_t* variant_include, const chr_info_t* cip, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t raw_sample_ct, uint32_t max_thread_ct, uintptr_t pgr_alloc_cacheline_ct, pgen_file_info_t* pgfip, uint32_t* sample_missing_hc_cts, uint32_t* sample_missing_dosage_cts, uint32_t* sample_hethap_cts);
+
+pglerr_t mind_filter(const uint32_t* sample_missing_cts, const uint32_t* sample_hethap_cts, const char* sample_ids, const char* sids, uint32_t raw_sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t variant_ct, uint32_t variant_ct_y, double mind_thresh, uintptr_t* sample_include, uintptr_t* sex_male, uint32_t* sample_ct_ptr, char* outname, char* outname_end);
+
+void enforce_geno_thresh(const chr_info_t* cip, const uint32_t* variant_missing_cts, const uint32_t* variant_hethap_cts, uint32_t sample_ct, uint32_t male_ct, uint32_t first_hap_uidx, double geno_thresh, uintptr_t* variant_include, uint32_t* variant_ct_ptr);
+
+void enforce_hwe_thresh(const chr_info_t* cip, const uint32_t* founder_raw_geno_cts, const uint32_t* founder_x_male_geno_cts, const uint32_t* founder_x_nosex_geno_cts, const double* hwe_x_pvals, misc_flags_t misc_flags, double hwe_thresh, uint32_t nonfounders, uintptr_t* variant_include, uint32_t* variant_ct_ptr);
+
+void enforce_minor_freq_constraints(const uintptr_t* variant_allele_idxs, const uint64_t* founder_allele_dosages, const double* allele_freqs, double min_maf, double max_maf, uint64_t min_allele_dosage, uint64_t max_allele_dosage, uintptr_t* variant_include, uint32_t* variant_ct_ptr);
+
+void enforce_mach_r2_thresh(const chr_info_t* cip, const double* mach_r2_vals, double mach_r2_min, double mach_r2_max, uintptr_t* variant_include, uint32_t* variant_ct_ptr);
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+
+#endif // __PLINK2_FILTER_H__
diff --git a/plink2_glm.cpp b/plink2_glm.cpp
new file mode 100644
index 0000000..2c13413
--- /dev/null
+++ b/plink2_glm.cpp
@@ -0,0 +1,6494 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#include "plink2_compress_stream.h"
+#include "plink2_glm.h"
+#include "plink2_matrix.h"
+#include "plink2_stats.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+void init_glm(glm_info_t* glm_info_ptr) {
+  glm_info_ptr->flags = kfGlm0;
+  glm_info_ptr->cols = kfGlmCol0;
+  glm_info_ptr->mperm_ct = 0;
+  glm_info_ptr->local_cat_ct = 0;
+  glm_info_ptr->max_corr = 0.999;
+  glm_info_ptr->condition_varname = nullptr;
+  glm_info_ptr->condition_list_fname = nullptr;
+  init_range_list(&(glm_info_ptr->parameters_range_list));
+  init_range_list(&(glm_info_ptr->tests_range_list));
+}
+
+void cleanup_glm(glm_info_t* glm_info_ptr) {
+  free_cond(glm_info_ptr->condition_varname);
+  free_cond(glm_info_ptr->condition_list_fname);
+  cleanup_range_list(&(glm_info_ptr->parameters_range_list));
+  cleanup_range_list(&(glm_info_ptr->tests_range_list));
+}
+
+pglerr_t glm_local_init(const char* local_covar_fname, const char* local_pvar_fname, const char* local_psam_fname, const char* sample_ids, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const glm_info_t* glm_info_ptr, uint32_t raw_sample_ct, uintptr_t max_sample_id_blen, uint32_t raw_variant_ct, const uintptr_t** sample_include_ptr, const uintptr_t** sex_nm_ptr, const uintptr_t** sex_male_ptr, const uintptr_t** variant_include_ptr, uint32_t* sample_ct_ptr, uint32 [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  gzFile gz_infile = nullptr;
+  uintptr_t line_idx = 0;
+  uintptr_t loadbuf_size = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    // 1. read .psam/.fam file, update sample_ct, initialize
+    //    local_sample_uidx_order (use open_and_load_xid_header()?)
+    reterr = gzopen_read_checked(local_psam_fname, &gz_infile);
+    if (reterr) {
+      goto glm_local_init_ret_1;
+    }
+    loadbuf_size = bigstack_left() / 4;
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else if (loadbuf_size <= kMaxMediumLine) {
+      goto glm_local_init_ret_NOMEM;
+    } else {
+      loadbuf_size = round_up_pow2(loadbuf_size, kEndAllocAlign);
+    }
+    char* loadbuf = (char*)bigstack_end_alloc_raw(loadbuf_size);
+    loadbuf[loadbuf_size - 1] = ' ';
+    char* loadbuf_first_token;
+    uint32_t is_header_line;
+    do {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+        if (!gzeof(gz_infile)) {
+	  goto glm_local_init_ret_READ_FAIL;
+	}
+	sprintf(g_logbuf, "Error: %s is empty.\n", local_psam_fname);
+	goto glm_local_init_ret_MALFORMED_INPUT_WW;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto glm_local_init_ret_LONG_LINE_PSAM;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      is_header_line = (loadbuf_first_token[0] == '#');
+    } while (is_header_line && strcmp_se(&(loadbuf_first_token[1]), "FID", 3) && strcmp_se(&(loadbuf_first_token[1]), "IID", 3));
+    xid_mode_t xid_mode = kfXidModeFidiid;
+    if (is_header_line) {
+      if (loadbuf_first_token[1] == 'I') {
+	xid_mode = kfXidModeIid;
+      }
+      *loadbuf_first_token = '\0';
+    }
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    const uint32_t orig_sample_ct = *sample_ct_ptr;
+    char* sorted_sample_idbox;
+    uint32_t* sample_id_map;
+    uintptr_t* new_sample_include;
+    char* idbuf;
+    if (bigstack_end_alloc_c(orig_sample_ct * max_sample_id_blen, &sorted_sample_idbox) ||
+	bigstack_end_alloc_ui(orig_sample_ct, &sample_id_map) ||
+	bigstack_end_calloc_ul(raw_sample_ctl, &new_sample_include) ||
+	bigstack_end_alloc_c(max_sample_id_blen, &idbuf)) {
+      goto glm_local_init_ret_NOMEM;
+    }
+    // (don't permit duplicate FID+IID for now, but maybe we'll want to use
+    // xid interface later?)
+    reterr = copy_sort_strbox_subset_noalloc(*sample_include_ptr, sample_ids, orig_sample_ct, max_sample_id_blen, 0, 0, 0, sorted_sample_idbox, sample_id_map);
+    if (reterr) {
+      goto glm_local_init_ret_1;
+    }
+    uint32_t* local_sample_uidx_order = (uint32_t*)g_bigstack_base;
+    uintptr_t max_local_sample_ct = round_down_pow2(bigstack_left(), kCacheline) / sizeof(int32_t);
+#ifdef __LP64__
+    if (max_local_sample_ct > kMaxLongLine / 2) {
+      max_local_sample_ct = kMaxLongLine / 2;
+    }
+#endif
+    uintptr_t local_sample_ct = 0;
+    while (1) {
+      if (!is_eoln_kns(*loadbuf_first_token)) {
+	if (local_sample_ct == max_local_sample_ct) {
+#ifdef __LP64__
+	  if (local_sample_ct == kMaxLongLine / 2) {
+	    sprintf(g_logbuf, "Error: Too many samples in %s.\n", local_psam_fname);
+	    goto glm_local_init_ret_MALFORMED_INPUT_WW;
+	  }
+#endif
+	  goto glm_local_init_ret_NOMEM;
+	}
+	char* read_ptr = loadbuf_first_token;
+	uint32_t sample_uidx;
+	if (!sorted_xidbox_read_find(sorted_sample_idbox, sample_id_map, max_sample_id_blen, orig_sample_ct, 0, xid_mode, &read_ptr, &sample_uidx, idbuf)) {
+	  if (is_set(new_sample_include, sample_uidx)) {
+	    char* first_tab = (char*)rawmemchr(idbuf, '\t');
+	    *first_tab = ' ';
+	    sprintf(g_logbuf, "Error: Duplicate ID '%s' in %s.\n", idbuf, local_psam_fname);
+	    goto glm_local_init_ret_MALFORMED_INPUT_WW;
+	  }
+	  set_bit(sample_uidx, new_sample_include);
+	  local_sample_uidx_order[local_sample_ct] = sample_uidx;
+	} else {
+	  if (!read_ptr) {
+	    sprintf(g_logbuf, "Error: Fewer tokens than expected on line %" PRIuPTR " of %s.\n", line_idx, local_psam_fname);
+	    goto glm_local_init_ret_MALFORMED_INPUT_WW;
+	  }
+	  local_sample_uidx_order[local_sample_ct] = 0xffffffffU;
+	}
+	++local_sample_ct;
+      }
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto glm_local_init_ret_READ_FAIL;
+	}
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto glm_local_init_ret_LONG_LINE_PSAM;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      if (loadbuf_first_token[0] == '#') {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s starts with a '#'. (This is only permitted before the first nonheader line, and if a #FID/IID header line is present it must denote the end of the header block.)\n", line_idx, local_psam_fname);
+	goto glm_local_init_ret_MALFORMED_INPUT_WW;
+      }
+    }
+    if (gzclose_null(&gz_infile)) {
+      goto glm_local_init_ret_READ_FAIL;
+    }
+    bigstack_finalize_ui(local_sample_uidx_order, local_sample_ct);
+    *local_sample_uidx_order_ptr = local_sample_uidx_order;
+    *local_sample_ct_ptr = local_sample_ct;
+    const uint32_t new_sample_ct = popcount_longs(new_sample_include, raw_sample_ctl);
+    assert(new_sample_ct <= orig_sample_ct);
+    if (new_sample_ct < orig_sample_ct) {
+      uintptr_t* sample_include_copy;
+      uintptr_t* sex_nm_copy;
+      uintptr_t* sex_male_copy;      
+      if (bigstack_alloc_ul(raw_sample_ctl, &sample_include_copy) ||
+	  bigstack_alloc_ul(raw_sample_ctl, &sex_nm_copy) ||
+	  bigstack_alloc_ul(raw_sample_ctl, &sex_male_copy)) {
+	goto glm_local_init_ret_NOMEM;
+      }
+      memcpy(sample_include_copy, new_sample_include, raw_sample_ctl * sizeof(intptr_t));
+      bitvec_and_copy(sample_include_copy, *sex_nm_ptr, raw_sample_ctl, sex_nm_copy);
+      *sex_nm_ptr = sex_nm_copy;
+      bitvec_and_copy(sample_include_copy, *sex_male_ptr, raw_sample_ctl, sex_male_copy);
+      *sex_male_ptr = sex_male_copy;
+      *sample_include_ptr = sample_include_copy;
+      bigstack_end_reset(loadbuf);
+      uint32_t* sample_include_cumulative_popcounts;
+      if (bigstack_end_alloc_ui(raw_sample_ctl, &sample_include_cumulative_popcounts)) {
+	goto glm_local_init_ret_NOMEM;
+      }
+      fill_cumulative_popcounts(sample_include_copy, raw_sample_ctl, sample_include_cumulative_popcounts);
+      *sample_ct_ptr = new_sample_ct;
+    }
+    bigstack_end_reset(loadbuf);
+
+    // 2. read .pvar/.bim file, update variant_ct, initialize
+    //    local_variant_ct/local_variant_include.
+    reterr = gzopen_read_checked(local_pvar_fname, &gz_infile);
+    if (reterr) {
+      goto glm_local_init_ret_1;
+    }
+    line_idx = 0;
+    do {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+        if (!gzeof(gz_infile)) {
+	  goto glm_local_init_ret_READ_FAIL;
+	}
+	sprintf(g_logbuf, "Error: %s is empty.\n", local_pvar_fname);
+	goto glm_local_init_ret_MALFORMED_INPUT_WW;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto glm_local_init_ret_LONG_LINE_PVAR;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      is_header_line = (loadbuf_first_token[0] == '#');
+    } while (is_header_line && strcmp_se(&(loadbuf_first_token[1]), "CHROM", 5));
+    uint32_t col_skips[2];
+    uint32_t col_types[2];
+    // uint32_t relevant_postchr_col_ct = 2;
+    if (is_header_line) {
+      // parse header
+      // [-1] = #CHROM (must be first column)
+      // [0] = POS
+      // [1] = ID
+      // don't care about the rest
+      uint32_t col_idx = 0;
+      char* token_end = &(loadbuf_first_token[6]);
+      uint32_t found_header_bitset = 0;
+      uint32_t relevant_postchr_col_ct = 0;
+      char* loadbuf_iter;
+      while (1) {
+	loadbuf_iter = skip_initial_spaces(token_end);
+	if (is_eoln_kns(*loadbuf_iter)) {
+	  break;
+	}
+	++col_idx;
+	token_end = token_endnn(loadbuf_iter);
+	const uint32_t token_slen = (uintptr_t)(token_end - loadbuf_iter);
+	uint32_t cur_col_type;
+	if ((token_slen == 3) && (!memcmp(loadbuf_iter, "POS", 3))) {
+	  cur_col_type = 0;
+	} else if ((token_slen == 2) && (!memcmp(loadbuf_iter, "ID", 2))) {
+	  cur_col_type = 1;
+	} else {
+	  continue;
+	}
+	const uint32_t cur_col_type_shifted = 1 << cur_col_type;
+	if (found_header_bitset & cur_col_type_shifted) {
+	  *token_end = '\0';
+	  sprintf(g_logbuf, "Error: Duplicate column header '%s' on line %" PRIuPTR " of %s.\n", loadbuf_iter, line_idx, local_pvar_fname);
+	  goto glm_local_init_ret_MALFORMED_INPUT_WW;
+	}
+	found_header_bitset |= cur_col_type_shifted;
+	col_skips[relevant_postchr_col_ct] = col_idx;
+	col_types[relevant_postchr_col_ct++] = cur_col_type;
+      }
+      if (found_header_bitset != 3) {
+	sprintf(g_logbuf, "Error: Missing column header(s) on line %" PRIuPTR " of %s. (POS and ID are required.)\n", line_idx, local_pvar_fname);
+	goto glm_local_init_ret_MALFORMED_INPUT_WW;
+      }
+      for (uint32_t rpc_col_idx = relevant_postchr_col_ct - 1; rpc_col_idx; --rpc_col_idx) {
+	col_skips[rpc_col_idx] -= col_skips[rpc_col_idx - 1];
+      }
+      loadbuf_first_token[0] = '\0';
+    } else {
+      col_types[0] = 1;
+      col_types[1] = 0;
+      col_skips[0] = 1;
+      // CM column may be omitted
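+      // a headerless line has either 5 or 6 columns; if a 6th token exists,
+      // the CM column is present and POS is two tokens after ID instead of
+      // one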
+      char* loadbuf_iter = next_token_mult(loadbuf_first_token, 4);
+      if (!loadbuf_iter) {
+	goto glm_local_init_ret_MISSING_TOKENS_PVAR;
+      }
+      loadbuf_iter = next_token(loadbuf_iter);
+      if (!loadbuf_iter) {
+	// #CHROM ID POS ALT REF
+	col_skips[1] = 1;
+      } else {
+	// #CHROM ID CM POS ALT REF
+	col_skips[1] = 2;
+      }
+    }
+    const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    uintptr_t* new_variant_include;
+    if (bigstack_end_calloc_ul(raw_variant_ctl, &new_variant_include)) {
+      goto glm_local_init_ret_NOMEM;
+    }
+    uint32_t max_local_variant_ct = 0x7ffffffd;
+    if (bigstack_left() < (0x80000000U / CHAR_BIT)) {
+      max_local_variant_ct = round_down_pow2(bigstack_left(), kCacheline) * CHAR_BIT;
+    }
+    const uintptr_t* orig_variant_include = *variant_include_ptr;
+    uintptr_t* local_variant_include = (uintptr_t*)g_bigstack_base;
+    *local_variant_include_ptr = local_variant_include;
+    uint32_t local_variant_ct = 0;
+    uint32_t new_variant_ct = 0;
+    uint32_t prev_variant_uidx = next_set_unsafe(orig_variant_include, 0);
+    uint32_t chr_fo_idx = get_variant_chr_fo_idx(cip, prev_variant_uidx);
+    uint32_t prev_chr_code = cip->chr_file_order[chr_fo_idx];
+    uint32_t prev_bp = variant_bps[prev_variant_uidx];
+    uint32_t chr_end = cip->chr_fo_vidx_start[cip->chr_idx_to_foidx[prev_chr_code] + 1];
+    while (1) {
+      if (!is_eoln_kns(*loadbuf_first_token)) {
+	if (local_variant_ct == max_local_variant_ct) {
+	  if (max_local_variant_ct == 0x7ffffffd) {
+	    sprintf(g_logbuf, "Error: Too many variants in %s.\n", local_pvar_fname);
+	    goto glm_local_init_ret_MALFORMED_INPUT_WW;
+	  }
+	  goto glm_local_init_ret_NOMEM;
+	}
+	if (!(local_variant_ct % kBitsPerWord)) {
+	  local_variant_include[local_variant_ct / kBitsPerWord] = 0;
+	}
+	char* loadbuf_iter = token_endnn(loadbuf_first_token);
+	// #CHROM
+	if (!(*loadbuf_iter)) {
+	  goto glm_local_init_ret_MISSING_TOKENS_PVAR;
+	}
+	{
+	  const int32_t cur_chr_code = get_chr_code_counted(cip, (uintptr_t)(loadbuf_iter - loadbuf_first_token), loadbuf_first_token);
+	  if (cur_chr_code < 0) {
+	    goto glm_local_init_skip_variant;
+	  }
+	  if ((uint32_t)cur_chr_code != prev_chr_code) {
+	    uint32_t first_variant_uidx_in_chr = cip->chr_fo_vidx_start[cip->chr_idx_to_foidx[(uint32_t)cur_chr_code]];
+	    if (first_variant_uidx_in_chr < prev_variant_uidx) {
+	      if (new_variant_ct) {
+		// not worth the trouble of handling this
+		sprintf(g_logbuf, "Error: Chromosome order in %s is different from chromosome order in main dataset.\n", local_pvar_fname);
+		goto glm_local_init_ret_INCONSISTENT_INPUT_WW;
+	      }
+	      goto glm_local_init_skip_variant;
+	    }
+	    prev_variant_uidx = next_set(orig_variant_include, first_variant_uidx_in_chr, raw_variant_ct);
+	    if (prev_variant_uidx == raw_variant_ct) {
+	      break;
+	    }
+	    chr_fo_idx = get_variant_chr_fo_idx(cip, prev_variant_uidx);
+	    prev_chr_code = cip->chr_file_order[chr_fo_idx];
+	    prev_bp = variant_bps[prev_variant_uidx];
+	    chr_end = cip->chr_fo_vidx_start[cip->chr_idx_to_foidx[prev_chr_code] + 1];
+	    if ((uint32_t)cur_chr_code != prev_chr_code) {
+	      goto glm_local_init_skip_variant;
+	    }
+	  }
+	  char* token_ptrs[2];
+	  uint32_t token_slens[2];
+	  for (uint32_t rpc_col_idx = 0; rpc_col_idx < 2; ++rpc_col_idx) {
+	    const uint32_t cur_col_type = col_types[rpc_col_idx];
+	    loadbuf_iter = next_token_mult(loadbuf_iter, col_skips[rpc_col_idx]);
+	    if (!loadbuf_iter) {
+	      goto glm_local_init_ret_MISSING_TOKENS_PVAR;
+	    }
+	    token_ptrs[cur_col_type] = loadbuf_iter;
+	    char* token_end = token_endnn(loadbuf_iter);
+	    token_slens[cur_col_type] = (uintptr_t)(token_end - loadbuf_iter);
+	    loadbuf_iter = token_end;
+	  }
+	  // POS
+	  int32_t cur_bp;
+	  if (scan_int_abs_defcap(token_ptrs[0], &cur_bp)) {
+	    sprintf(g_logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, local_pvar_fname);
+	    goto glm_local_init_ret_MALFORMED_INPUT_WW;
+	  }
+	  if (cur_bp < (int32_t)prev_bp) {
+	    goto glm_local_init_skip_variant;
+	  }
+	  if ((uint32_t)cur_bp > prev_bp) {
+	    do {
+	      prev_variant_uidx = next_set(orig_variant_include, prev_variant_uidx + 1, raw_variant_ct);
+	    } while ((prev_variant_uidx < chr_end) && ((uint32_t)cur_bp > variant_bps[prev_variant_uidx]));
+	    if (prev_variant_uidx >= chr_end) {
+	      goto glm_local_init_skip_variant_and_update_chr;
+	    }
+	    prev_bp = variant_bps[prev_variant_uidx];
+	  }
+	  if ((uint32_t)cur_bp == prev_bp) {
+	    // ID
+	    // note that if there are two same-position variants which appear
+	    // in a different order in the main dataset and the local-pvar
+	    // file, one will be skipped.  (probably want to add a warning in
+	    // this case.)
+	    char* cur_variant_id = token_ptrs[1];
+	    cur_variant_id[token_slens[1]] = '\0';
+	    do {
+	      char* loaded_variant_id = variant_ids[prev_variant_uidx];
+	      if (!strcmp(cur_variant_id, loaded_variant_id)) {
+		if (is_set(new_variant_include, prev_variant_uidx)) {
+		  sprintf(g_logbuf, "Error: Duplicate ID (with duplicate CHROM/POS) '%s' in %s.\n", cur_variant_id, local_pvar_fname);
+		  goto glm_local_init_ret_MALFORMED_INPUT_WW;
+		}
+		set_bit(prev_variant_uidx, new_variant_include);
+		++new_variant_ct;
+		set_bit(local_variant_ct, local_variant_include);
+		break;
+	      }
+	      prev_variant_uidx = next_set(orig_variant_include, prev_variant_uidx + 1, raw_variant_ct);
+	      if (prev_variant_uidx >= chr_end) {
+		goto glm_local_init_skip_variant_and_update_chr;
+	      }
+	      prev_bp = variant_bps[prev_variant_uidx];
+	    } while ((uint32_t)cur_bp == prev_bp);
+	  }
+	}
+	if (0) {
+	glm_local_init_skip_variant_and_update_chr:
+	  if (prev_variant_uidx == raw_variant_ct) {
+	    break;
+	  }
+	  chr_fo_idx = get_variant_chr_fo_idx(cip, prev_variant_uidx);
+	  prev_chr_code = cip->chr_file_order[chr_fo_idx];
+	  prev_bp = variant_bps[prev_variant_uidx];
+	  chr_end = cip->chr_fo_vidx_start[cip->chr_idx_to_foidx[prev_chr_code] + 1];
+	}
+      glm_local_init_skip_variant:
+	++local_variant_ct;
+      }
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto glm_local_init_ret_READ_FAIL;
+	}
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto glm_local_init_ret_LONG_LINE_PVAR;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      if (loadbuf_first_token[0] == '#') {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s starts with a '#'. (This is only permitted before the first nonheader line, and if a #CHROM header line is present it must denote the end of the header block.)\n", line_idx, local_pvar_fname);
+	goto glm_local_init_ret_MALFORMED_INPUT_WW;
+      }
+    }
+    if (gzclose_null(&gz_infile)) {
+      goto glm_local_init_ret_READ_FAIL;
+    }
+    bigstack_finalize_ul(local_variant_include, BITCT_TO_WORDCT(local_variant_ct));
+    *local_variant_ctl_ptr = BITCT_TO_WORDCT(local_variant_ct);
+    assert(new_variant_ct <= *variant_ct_ptr);
+    if (new_variant_ct < *variant_ct_ptr) {
+      uintptr_t* variant_include_copy;
+      if (bigstack_alloc_ul(raw_variant_ctl, &variant_include_copy)) {
+	goto glm_local_init_ret_NOMEM;
+      }
+      memcpy(variant_include_copy, new_variant_include, raw_variant_ctl * sizeof(intptr_t));
+      *variant_include_ptr = variant_include_copy;
+      *variant_ct_ptr = new_variant_ct;
+    }
+
+    // 3. if not local-cats=, scan first line of local-covar= file to determine
+    //    covariate count
+    reterr = gzopen_read_checked(local_covar_fname, gz_local_covar_file_ptr);
+    if (reterr) {
+      goto glm_local_init_ret_1;
+    }
+    uint32_t local_covar_ct;
+    if (!glm_info_ptr->local_cat_ct) {
+      line_idx = 1;
+      if (!gzgets(*gz_local_covar_file_ptr, loadbuf, loadbuf_size)) {
+	if (!gzeof(*gz_local_covar_file_ptr)) {
+	  goto glm_local_init_ret_READ_FAIL;
+	}
+	sprintf(g_logbuf, "Error: %s is empty.\n", local_covar_fname);
+	goto glm_local_init_ret_MALFORMED_INPUT_WW;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	if (loadbuf_size != kMaxLongLine) {
+	  goto glm_local_init_ret_NOMEM;
+	}
+	sprintf(g_logbuf, "Error: Line 1 of %s is pathologically long.\n", local_covar_fname);
+	goto glm_local_init_ret_MALFORMED_INPUT_WW;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      const uint32_t token_ct = count_tokens(loadbuf_first_token);
+      local_covar_ct = token_ct / local_sample_ct;
+      if (local_covar_ct * local_sample_ct != token_ct) {
+	sprintf(g_logbuf, "Error: Unexpected token count on line 1 of %s (%u, %smultiple of %" PRIuPTR " expected).\n", local_covar_fname, token_ct, (token_ct == local_sample_ct)? "larger " : "", local_sample_ct);
+	goto glm_local_init_ret_MALFORMED_INPUT_WW;
+      }
+      if (glm_info_ptr->flags & kfGlmLocalOmitLast) {
+	if (local_covar_ct == 1) {
+	  logerrprint("Error: --glm 'local-omit-last' modifier cannot be used when there is only one\nlocal covariate.\n");
+	  goto glm_local_init_ret_INCONSISTENT_INPUT;
+	}
+	LOGPRINTF("--glm local-covar=: %u local covariates present, %u used.\n", local_covar_ct, local_covar_ct - 1);
+	--local_covar_ct;
+      } else {
+	LOGPRINTF("--glm local-covar=: %u local covariate%s present.\n", local_covar_ct, (local_covar_ct == 1)? "" : "s");
+      }
+    } else {
+      local_covar_ct = glm_info_ptr->local_cat_ct - 1;
+      if (local_covar_ct * ((uint64_t)local_sample_ct) > kMaxLongLine / 2) {
+	sprintf(g_logbuf, "Error: [# samples in %s] * [# categories - 1] too large (limited to %u).\n", local_covar_fname, kMaxLongLine / 2);
+	goto glm_local_init_ret_MALFORMED_INPUT_WW;
+      }
+    }
+    *local_covar_ct_ptr = local_covar_ct;
+    bigstack_mark = g_bigstack_base;
+  }
+  while (0) {
+  glm_local_init_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  glm_local_init_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  glm_local_init_ret_LONG_LINE_PSAM:
+    if (loadbuf_size != kMaxLongLine) {
+      reterr = kPglRetNomem;
+      break;
+    }
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, local_psam_fname);
+  glm_local_init_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+    reterr = kPglRetMalformedInput;
+    break;
+  glm_local_init_ret_LONG_LINE_PVAR:
+    if (loadbuf_size != kMaxLongLine) {
+      reterr = kPglRetNomem;
+      break;
+    }
+    LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, local_pvar_fname);
+    reterr = kPglRetMalformedInput;
+    break;
+  glm_local_init_ret_MISSING_TOKENS_PVAR:
+    LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, local_pvar_fname);
+    reterr = kPglRetMalformedInput;
+    break;
+  glm_local_init_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  glm_local_init_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ glm_local_init_ret_1:
+  gzclose_cond(gz_infile);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  return reterr;
+}
+
+
+// Returns 1 if phenotype fully separates the covariate, and the non-Firth
+// logistic regression should be aborted.
+// Otherwise, if we have quasi-separation, use the Stata approach of throwing
+// out the covariate, and keeping only the samples which have the one
+// covariate value present in both cases and controls.
+// Note that covar_ct is not a parameter; caller is responsible for
+// re-popcounting covar_include.
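+// (Example of complete separation: every case has covariate value > 5 and
+// every control has value < 5; the logistic regression MLE then fails to
+// exist.)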
+boolerr_t check_for_and_handle_separated_covar(const uintptr_t* pheno_cc, const pheno_col_t* covar_cols, uint32_t raw_sample_ctl, uint32_t covar_uidx, uintptr_t* cur_sample_include, uintptr_t* covar_include, uint32_t* sample_ct_ptr, uintptr_t* cat_covar_wkspace) {
+  uint32_t sample_ct = *sample_ct_ptr;
+  if (sample_ct < 2) {
+    return 1;
+  }
+  const pheno_col_t* cur_covar_col = &(covar_cols[covar_uidx]);
+  if (cur_covar_col->type_code == kPhenoDtypeOther) {
+    return 0;
+  }
+  const uint32_t first_sample_uidx = next_set_unsafe(cur_sample_include, 0);
+  if (cur_covar_col->type_code == kPhenoDtypeQt) {
+    const double* covar_vals = cur_covar_col->data.qt;
+    double cur_covar_val = covar_vals[first_sample_uidx];
+    double ctrl_min;
+    double ctrl_max;
+    double case_min;
+    double case_max;
+    if (is_set(pheno_cc, first_sample_uidx)) {
+      ctrl_min = DBL_MAX;
+      ctrl_max = -DBL_MAX;
+      case_min = cur_covar_val;
+      case_max = cur_covar_val;
+    } else {
+      ctrl_min = cur_covar_val;
+      ctrl_max = cur_covar_val;
+      case_min = DBL_MAX;
+      case_max = -DBL_MAX;
+    }
+    uint32_t sample_uidx = first_sample_uidx + 1;
+    for (uint32_t sample_idx = 1; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_unsafe_ck(cur_sample_include, &sample_uidx);
+      cur_covar_val = covar_vals[sample_uidx];
+      if (IS_SET(pheno_cc, sample_uidx)) {
+	if (cur_covar_val < case_min) {
+	  if ((ctrl_min < case_max) && (cur_covar_val < ctrl_max)) {
+	    return 0;
+	  }
+	  case_min = cur_covar_val;
+	} else if (cur_covar_val > case_max) {
+	  if ((ctrl_max > case_min) && (cur_covar_val > ctrl_min)) {
+	    return 0;
+	  }
+	  case_max = cur_covar_val;
+	}
+      } else {
+	if (cur_covar_val < ctrl_min) {
+	  if ((case_min < ctrl_max) && (cur_covar_val < case_max)) {
+	    return 0;
+	  }
+	  ctrl_min = cur_covar_val;
+	} else if (cur_covar_val > ctrl_max) {
+	  if ((case_max > ctrl_min) && (cur_covar_val > case_min)) {
+	    return 0;
+	  }
+	  ctrl_max = cur_covar_val;
+	}
+      }
+    }
+    if ((case_min > ctrl_max) || (ctrl_min > case_max)) {
+      // fully separated
+      return 1;
+    }
+    // quasi-separated
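+    // the case and control value ranges overlap at exactly one endpoint;
+    // keep only the samples with that shared covariate value, and drop the
+    // covariate, since it's constant on the remaining samples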
+    const double covar_val_keep = (case_min == ctrl_max)? case_min : case_max;
+    sample_uidx = first_sample_uidx;
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_unsafe_ck(cur_sample_include, &sample_uidx);
+      if (covar_vals[sample_uidx] != covar_val_keep) {
+	clear_bit(sample_uidx, cur_sample_include);
+      }
+    }
+    *sample_ct_ptr = popcount_longs(cur_sample_include, raw_sample_ctl);
+    clear_bit(covar_uidx, covar_include);
+    return 0;
+  }
+  const uint32_t nonnull_cat_ct = cur_covar_col->nonnull_category_ct;
+  const uint32_t cur_word_ct = 1 + nonnull_cat_ct / kBitsPerWordD2;
+  fill_ulong_zero(cur_word_ct, cat_covar_wkspace);
+  const uint32_t* covar_vals = cur_covar_col->data.cat;
+  // If no remaining categories have both cases and controls, we have complete
+  // separation.
+  // If some do and some do not, we have quasi-complete separation, and must
+  // remove samples in the all-case and all-control categories.
+  uint32_t sample_uidx = first_sample_uidx;
+  for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+    next_set_unsafe_ck(cur_sample_include, &sample_uidx);
+    const uint32_t cur_cat_idx = covar_vals[sample_uidx];
+    // Odd bits represent presence of a case, even bits represent presence of a
+    // control.
+    const uint32_t is_case = is_set(pheno_cc, sample_uidx);
+    set_bit(cur_cat_idx * 2 + is_case, cat_covar_wkspace);
+  }
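+  // cur_word & (cur_word >> 1) & kMask5555 isolates the even bit positions
+  // where both the control bit (even) and the case bit (odd, shifted down)
+  // are set, i.e. the categories containing both cases and controls.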
+  uint32_t case_and_ctrl_cat_ct = 0;
+  uint32_t pheno_by_cat_ct = 0;
+  for (uint32_t widx = 0; widx < cur_word_ct; ++widx) {
+    const uintptr_t cur_word = cat_covar_wkspace[widx];
+    case_and_ctrl_cat_ct += popcount01_long(cur_word & (cur_word >> 1) & kMask5555);
+    pheno_by_cat_ct += popcount_long(cur_word);
+  }
+  if (!case_and_ctrl_cat_ct) {
+    // fully separated
+    return 1;
+  }
+  if ((case_and_ctrl_cat_ct > 1) && (case_and_ctrl_cat_ct * 2 == pheno_by_cat_ct)) {
+    // all observed categories contain both cases and controls; no pruning
+    // needed.
+    return 0;
+  }
+  // at least one category contains both cases and controls, but at least one
+  // does not, so we still have to prune the samples in the single-phenotype
+  // categories.  (if only one mixed category remains, the covariate is now
+  // constant and is removed below.)
+  for (uint32_t widx = 0; widx < cur_word_ct; ++widx) {
+    const uintptr_t cur_word = cat_covar_wkspace[widx];
+    cat_covar_wkspace[widx] = cur_word & (cur_word >> 1) & kMask5555;
+  }
+  sample_uidx = first_sample_uidx;
+  for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+    next_set_unsafe_ck(cur_sample_include, &sample_uidx);
+    if (!is_set(cat_covar_wkspace, covar_vals[sample_uidx] * 2)) {
+      clear_bit(sample_uidx, cur_sample_include);
+    }
+  }
+  *sample_ct_ptr = popcount_longs(cur_sample_include, raw_sample_ctl);
+  if (case_and_ctrl_cat_ct == 1) {
+    clear_bit(covar_uidx, covar_include);
+  }
+  return 0;
+}
+
+boolerr_t glm_determine_covars(const uintptr_t* pheno_cc, const uintptr_t* initial_covar_include, const pheno_col_t* covar_cols, uint32_t raw_sample_ct, uint32_t raw_covar_ctl, uint32_t initial_covar_ct, uint32_t covar_max_nonnull_cat_ct, uint32_t is_sometimes_firth, uintptr_t* cur_sample_include, uintptr_t* covar_include, uint32_t* sample_ct_ptr, uint32_t* covar_ct_ptr, uint32_t* extra_cat_ct_ptr, uint32_t* separation_warning_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  boolerr_t reterr = 0;
+  {
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    uintptr_t* sample_include_backup;
+    if (bigstack_alloc_ul(raw_sample_ctl, &sample_include_backup)) {
+      goto glm_determine_covars_ret_NOMEM;
+    }
+    memcpy(sample_include_backup, cur_sample_include, raw_sample_ctl * sizeof(intptr_t));
+    memcpy(covar_include, initial_covar_include, raw_covar_ctl * sizeof(intptr_t));
+
+    // 1. Determine samples for which all phenotype/covariate values are
+    //    present, then provisionally remove the covariates which are constant
+    //    over that set in linear case, or produce separation in logistic case
+    uint32_t covar_uidx = 0;
+    for (uint32_t covar_idx = 0; covar_idx < initial_covar_ct; ++covar_idx, ++covar_uidx) {
+      next_set_unsafe_ck(initial_covar_include, &covar_uidx);
+      if (covar_cols[covar_uidx].nonmiss) {
+	bitvec_and(covar_cols[covar_uidx].nonmiss, raw_sample_ctl, cur_sample_include);
+      }
+    }
+    uint32_t prev_sample_ct = popcount_longs(cur_sample_include, raw_sample_ctl);
+    covar_uidx = 0;
+    for (uint32_t covar_idx = 0; covar_idx < initial_covar_ct; ++covar_idx, ++covar_uidx) {
+      next_set_unsafe_ck(initial_covar_include, &covar_uidx);
+      if ((covar_cols[covar_uidx].type_code != kPhenoDtypeOther) && is_const_covar(&(covar_cols[covar_uidx]), cur_sample_include, prev_sample_ct)) {
+	clear_bit(covar_uidx, covar_include);
+      }
+    }
+    uint32_t covar_ct = popcount_longs(covar_include, raw_covar_ctl);
+    if (covar_ct != initial_covar_ct) {
+      // 2. If any covariates were removed, redetermine the samples for which
+      //    all phenotype/covariate values are present.  If there are more than
+      //    before, add back any provisionally removed covariates which don't
+      //    have any missing values in the new sample set, and are now
+      //    nonconstant.
+      //    Categorical covariates should behave just like they had been
+      //    pre-split into n-1 0/1 indicator variables.
+      memcpy(cur_sample_include, sample_include_backup, raw_sample_ctl * sizeof(intptr_t));
+      covar_uidx = 0;
+      for (uint32_t covar_idx = 0; covar_idx < covar_ct; ++covar_idx, ++covar_uidx) {
+	next_set_unsafe_ck(covar_include, &covar_uidx);
+	if (covar_cols[covar_uidx].nonmiss) {
+	  bitvec_and(covar_cols[covar_uidx].nonmiss, raw_sample_ctl, cur_sample_include);
+	}
+      }
+      uint32_t new_sample_ct = popcount_longs(cur_sample_include, raw_sample_ctl);
+      if (new_sample_ct > prev_sample_ct) {
+	prev_sample_ct = new_sample_ct;
+	covar_uidx = 0;
+	for (uint32_t covar_idx = 0; covar_idx < initial_covar_ct; ++covar_idx, ++covar_uidx) {
+	  next_set_unsafe_ck(initial_covar_include, &covar_uidx);
+	  if (!is_set(covar_include, covar_uidx)) {
+	    const pheno_col_t* cur_covar_col = &(covar_cols[covar_uidx]);
+	    if (popcount_longs_intersect(cur_sample_include, cur_covar_col->nonmiss, raw_sample_ctl) == prev_sample_ct) {
+	      if (!is_const_covar(cur_covar_col, cur_sample_include, prev_sample_ct)) {
+		set_bit(covar_uidx, covar_include);
+	      }
+	    }
+	  }
+	}
+	covar_ct = popcount_longs(covar_include, raw_covar_ctl);
+      }
+    }
+    if (!covar_ct) {
+      *sample_ct_ptr = prev_sample_ct;
+      goto glm_determine_covars_ret_NOCOVAR;
+    }
+    if (prev_sample_ct < 3) {
+      goto glm_determine_covars_ret_SKIP;
+    }
+    // 3. if quantitative trait, remove samples corresponding to single-element
+    //    categories or constant-except-for-one-sample regular covariates.
+    uint32_t sample_ct = prev_sample_ct;
+    uint32_t extra_cat_ct = 0;
+    bigstack_reset(sample_include_backup);
+    uint32_t first_sample_uidx = next_set_unsafe(cur_sample_include, 0);
+    if (!pheno_cc) {
+      uintptr_t* cat_one_obs = nullptr;
+      uintptr_t* cat_two_or_more_obs = nullptr;
+      uint32_t* cat_first_sample_uidxs = nullptr;
+      if (covar_max_nonnull_cat_ct) {
+	const uint32_t max_cat_ctl = 1 + (covar_max_nonnull_cat_ct / kBitsPerWord);
+	if (bigstack_alloc_ul(max_cat_ctl, &cat_one_obs) ||
+	    bigstack_alloc_ul(max_cat_ctl, &cat_two_or_more_obs) ||
+	    bigstack_alloc_ui(covar_max_nonnull_cat_ct + 1, &cat_first_sample_uidxs)) {
+	  goto glm_determine_covars_ret_NOMEM;
+	}
+      }
+      do {
+	prev_sample_ct = sample_ct;
+	covar_uidx = 0;
+	extra_cat_ct = 0;
+	for (uint32_t covar_idx = 0; covar_idx < covar_ct; ++covar_idx, ++covar_uidx) {
+	  next_set_unsafe_ck(covar_include, &covar_uidx);
+	  const pheno_col_t* cur_covar_col = &(covar_cols[covar_uidx]);
+	  if (cur_covar_col->type_code == kPhenoDtypeOther) {
+	    continue;
+	  }
+	  if (cur_covar_col->type_code == kPhenoDtypeQt) {
+	    const double* pheno_vals = cur_covar_col->data.qt;
+	    next_set_unsafe_ck(cur_sample_include, &first_sample_uidx);
+	    uint32_t sample_uidx = first_sample_uidx;
+	    double common_pheno_val = pheno_vals[sample_uidx];
+	    sample_uidx = next_set_unsafe(cur_sample_include, sample_uidx + 1);
+	    const double second_pheno_val = pheno_vals[sample_uidx];
+	    uint32_t sample_idx = 2;
+	    uint32_t sample_uidx_remove;
+	    if (second_pheno_val != common_pheno_val) {
+	      sample_uidx_remove = sample_uidx;
+	      sample_uidx = next_set_unsafe(cur_sample_include, sample_uidx + 1);
+	      const double third_pheno_val = pheno_vals[sample_uidx];
+	      if (third_pheno_val == second_pheno_val) {
+		common_pheno_val = second_pheno_val;
+		sample_uidx_remove = first_sample_uidx;
+	      } else if (third_pheno_val != common_pheno_val) {
+		continue;
+	      }
+	      sample_idx = 3;
+	    } else {
+	      sample_uidx_remove = 0xffffffffU;
+	    }
+	    for (; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+	      next_set_unsafe_ck(cur_sample_include, &sample_uidx);
+	      if (pheno_vals[sample_uidx] != common_pheno_val) {
+		if (sample_uidx_remove == 0xffffffffU) {
+		  sample_uidx_remove = sample_uidx;
+		} else {
+		  break;
+		}
+	      }
+	    }
+	    if (sample_idx == sample_ct) {
+	      if (sample_uidx_remove != 0xffffffffU) {
+		if (--sample_ct == 2) {
+		  goto glm_determine_covars_ret_SKIP;
+		}
+		clear_bit(sample_uidx_remove, cur_sample_include);
+	      } else {
+		// constant covariate, remove it
+		clear_bit(covar_uidx, covar_include);
+	      }
+	    }
+	  } else {
+	    const uint32_t cur_nonnull_cat_ct = cur_covar_col->nonnull_category_ct;
+	    const uint32_t cur_cat_ctl = 1 + (cur_nonnull_cat_ct / kBitsPerWord);
+	    fill_ulong_zero(cur_cat_ctl, cat_one_obs);
+	    fill_ulong_zero(cur_cat_ctl, cat_two_or_more_obs);
+	    const uint32_t* pheno_vals = cur_covar_col->data.cat;
+	    next_set_unsafe_ck(cur_sample_include, &first_sample_uidx);
+	    uint32_t sample_uidx = first_sample_uidx;
+	    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+	      next_set_unsafe_ck(cur_sample_include, &sample_uidx);
+	      const uint32_t cur_cat_idx = pheno_vals[sample_uidx];
+	      if (!is_set(cat_two_or_more_obs, cur_cat_idx)) {
+		if (is_set(cat_one_obs, cur_cat_idx)) {
+		  set_bit(cur_cat_idx, cat_two_or_more_obs);
+		} else {
+		  set_bit(cur_cat_idx, cat_one_obs);
+		  cat_first_sample_uidxs[cur_cat_idx] = sample_uidx;
+		}
+	      }
+	    }
+	    for (uint32_t widx = 0; widx < cur_cat_ctl; ++widx) {
+	      uintptr_t cur_word = cat_one_obs[widx] & (~cat_two_or_more_obs[widx]);
+	      if (cur_word) {
+		const uint32_t* cat_first_sample_uidxs_iter = &(cat_first_sample_uidxs[widx * kBitsPerWord]);
+		do {
+		  const uint32_t cat_idx_lowbits = CTZLU(cur_word);
+		  --sample_ct;
+		  clear_bit(cat_first_sample_uidxs_iter[cat_idx_lowbits], cur_sample_include);
+		  cur_word &= cur_word - 1;
+		} while (cur_word);
+	      }
+	    }
+	    if (sample_ct < 3) {
+	      goto glm_determine_covars_ret_SKIP;
+	    }
+	    uint32_t remaining_cat_ct = popcount_longs(cat_two_or_more_obs, cur_cat_ctl);
+	    if (remaining_cat_ct <= 1) {
+	      // now a constant covariate, remove it
+	      clear_bit(covar_uidx, covar_include);
+	    } else {
+	      extra_cat_ct += remaining_cat_ct - 2;
+	    }
+	  }
+	}
+	covar_ct = popcount_longs(covar_include, raw_covar_ctl);
+      } while (sample_ct < prev_sample_ct);
+    } else {
+      uintptr_t* cat_covar_wkspace;
+      if (bigstack_alloc_ul(1 + (covar_max_nonnull_cat_ct / kBitsPerWordD2), &cat_covar_wkspace)) {
+	goto glm_determine_covars_ret_NOMEM;
+      }
+      if (!is_sometimes_firth) {
+	// todo: in firth-fallback case, automatically switch to always-Firth
+	// if separated covariate is detected
+	do {
+	  prev_sample_ct = sample_ct;
+	  covar_uidx = 0;
+	  for (uint32_t covar_idx = 0; covar_idx < covar_ct; ++covar_idx, ++covar_uidx) {
+	    next_set_unsafe_ck(covar_include, &covar_uidx);
+	    if (check_for_and_handle_separated_covar(pheno_cc, covar_cols, raw_sample_ctl, covar_uidx, cur_sample_include, covar_include, &sample_ct, cat_covar_wkspace)) {
+	      *separation_warning_ptr = 1;
+	      goto glm_determine_covars_ret_SKIP;
+	    }
+	  }
+	  covar_ct = popcount_longs(covar_include, raw_covar_ctl);
+	} while (sample_ct < prev_sample_ct);
+      }
+
+      // now count extra categories
+      covar_uidx = 0;
+      extra_cat_ct = 0;
+      for (uint32_t covar_idx = 0; covar_idx < covar_ct; ++covar_idx, ++covar_uidx) {
+	next_set_unsafe_ck(covar_include, &covar_uidx);
+	const pheno_col_t* cur_covar_col = &(covar_cols[covar_uidx]);
+	if (cur_covar_col->type_code == kPhenoDtypeCat) {
+	  const uint32_t remaining_cat_ct = identify_remaining_cats(cur_sample_include, cur_covar_col, sample_ct, cat_covar_wkspace);
+	  if (remaining_cat_ct > 2) {
+	    extra_cat_ct += remaining_cat_ct - 2;
+	  }
+	}
+      }
+    }
+    *sample_ct_ptr = sample_ct;
+    *covar_ct_ptr = covar_ct;
+    *extra_cat_ct_ptr = extra_cat_ct;
+  }
+  while (0) {
+  glm_determine_covars_ret_NOMEM:
+    reterr = 1;
+    break;
+  glm_determine_covars_ret_SKIP:
+    *sample_ct_ptr = 0;
+  glm_determine_covars_ret_NOCOVAR:
+    *covar_ct_ptr = 0;
+    break;
+  }
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+void collapse_parameter_subset(const uintptr_t* covar_include, const uintptr_t* raw_parameter_subset, uint32_t domdev_present, uint32_t raw_covar_ct, uint32_t covar_ct, uint32_t add_interactions, uintptr_t* new_parameter_subset, uint32_t* predictor_ct_ptr) {
+  const uint32_t first_covar_pred_idx = 2 + domdev_present;
+  const uint32_t domdev_present_p1 = domdev_present + 1;
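+  // Predictor-column layout: intercept, additive genotype effect, dominance
+  // deviation if present, one column per covariate, and then (if interactions
+  // are requested) domdev_present_p1 genotype-x-covariate columns per
+  // covariate.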
+  uint32_t first_interaction_pred_read_idx = 0;
+  uint32_t first_interaction_pred_write_idx = 0;
+  if (add_interactions) {
+    first_interaction_pred_read_idx = first_covar_pred_idx + raw_covar_ct;
+    first_interaction_pred_write_idx = first_covar_pred_idx + covar_ct;
+  }
+  const uint32_t write_idx_ct = 2 + domdev_present + covar_ct * (1 + add_interactions * domdev_present_p1);
+  const uint32_t write_idx_ctl = BITCT_TO_WORDCT(write_idx_ct);
+  fill_ulong_zero(write_idx_ctl, new_parameter_subset);
+  // intercept, additive, domdev
+  new_parameter_subset[0] = raw_parameter_subset[0] & (3 + 4 * domdev_present);
+  uint32_t covar_uidx = 0;
+  for (uint32_t covar_idx = 0; covar_idx < covar_ct; ++covar_idx, ++covar_uidx) {
+    next_set_unsafe_ck(covar_include, &covar_uidx);
+    if (is_set(raw_parameter_subset, first_covar_pred_idx + covar_uidx)) {
+      set_bit(first_covar_pred_idx + covar_idx, new_parameter_subset);
+    }
+    if (add_interactions) {
+      if (is_set(raw_parameter_subset, first_interaction_pred_read_idx + domdev_present_p1 * covar_uidx)) {
+	set_bit(first_interaction_pred_write_idx + domdev_present_p1 * covar_idx, new_parameter_subset);
+      }
+      if (domdev_present) {
+	if (is_set(raw_parameter_subset, first_interaction_pred_read_idx + 2 * covar_uidx + 1)) {
+	  set_bit(first_interaction_pred_write_idx + 2 * covar_idx + 1, new_parameter_subset);
+	}
+      }
+    }
+  }
+  *predictor_ct_ptr = popcount_longs(new_parameter_subset, write_idx_ctl);
+}
+
+
+ENUM_U31_DEF_START()
+  kVifCorrCheckOk,
+  kVifCorrCheckVifFail,
+  kVifCorrCheckCorrFail
+ENUM_U31_DEF_END(vif_corr_errcode_t);
+
+typedef struct {
+  vif_corr_errcode_t errcode;
+  uint32_t covar_idx1; // for both correlation and VIF failure
+  uint32_t covar_idx2; // for correlation failure only
+} vif_corr_err_t;
+
+boolerr_t glm_fill_and_test_covars(const uintptr_t* sample_include, const uintptr_t* covar_include, const pheno_col_t* covar_cols, const char* covar_names, uintptr_t sample_ct, uintptr_t covar_ct, uint32_t local_covar_ct, uint32_t covar_max_nonnull_cat_ct, uintptr_t extra_cat_ct, uintptr_t max_covar_name_blen, double vif_thresh, double max_corr, double* covars_smaj, double* covar_dotprod, double* covars_cmaj, char** cur_covar_names, vif_corr_err_t* vif_corr_check_result_ptr) {
+  vif_corr_check_result_ptr->errcode = kVifCorrCheckOk;
+  if (covar_ct == local_covar_ct) {
+    return 0;
+  }
+  const uintptr_t new_covar_ct = covar_ct + extra_cat_ct;
+  const uintptr_t new_nonlocal_covar_ct = new_covar_ct - local_covar_ct;
+  uintptr_t* cat_covar_wkspace;
+  matrix_invert_buf1_t* matrix_invert_buf1 = (matrix_invert_buf1_t*)bigstack_alloc(kMatrixInvertBuf1CheckedAlloc * new_nonlocal_covar_ct);
+  double* inverse_corr_buf;
+  double* dbl_2d_buf;
+  if ((!matrix_invert_buf1) ||
+      bigstack_alloc_d(new_nonlocal_covar_ct * new_nonlocal_covar_ct, &inverse_corr_buf) ||
+      bigstack_alloc_ul(1 + (covar_max_nonnull_cat_ct / kBitsPerWord), &cat_covar_wkspace) ||
+      bigstack_alloc_d(new_nonlocal_covar_ct * new_nonlocal_covar_ct, &dbl_2d_buf)) {
+    return 1;
+  }
+  unsigned char* alloc_base = g_bigstack_base;
+  unsigned char* new_covar_name_alloc = g_bigstack_end;
+  const uint32_t first_sample_uidx = next_set_unsafe(sample_include, 0);
+  uint32_t covar_read_uidx = 0;
+  const double sample_ct_recip = 1.0 / ((double)((intptr_t)sample_ct));
+  const double sample_ct_m1_d = (double)((intptr_t)(sample_ct - 1));
+  char** cur_covar_names_iter = cur_covar_names;
+  double* covar_write_iter = covars_cmaj;
+  double* sum_iter = dbl_2d_buf;
+  for (uintptr_t covar_read_idx = 0; covar_read_idx < covar_ct; ++covar_read_idx, ++covar_read_uidx) {
+    next_set_unsafe_ck(covar_include, &covar_read_uidx);
+    const pheno_col_t* cur_covar_col = &(covar_cols[covar_read_uidx]);
+    const char* covar_name_base = &(covar_names[covar_read_uidx * max_covar_name_blen]);
+    if (cur_covar_col->type_code == kPhenoDtypeOther) {
+      // local covariate
+      // const_cast
+      *cur_covar_names_iter++ = (char*)((uintptr_t)covar_name_base);
+    } else if (cur_covar_col->type_code == kPhenoDtypeQt) {
+      // const_cast
+      *cur_covar_names_iter++ = (char*)((uintptr_t)covar_name_base);
+      const double* covar_vals = cur_covar_col->data.qt;
+      uint32_t sample_uidx = first_sample_uidx;
+      double covar_sum = 0.0;
+      for (uintptr_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+	next_set_unsafe_ck(sample_include, &sample_uidx);
+	const double cur_covar_val = covar_vals[sample_uidx];
+	covar_sum += cur_covar_val;
+	*covar_write_iter++ = cur_covar_val;
+      }
+      *sum_iter++ = covar_sum;
+    } else {
+      const uint32_t remaining_cat_ct = identify_remaining_cats(sample_include, cur_covar_col, sample_ct, cat_covar_wkspace);
+      assert(remaining_cat_ct >= 2);
+      const uint32_t* covar_vals = cur_covar_col->data.cat;
+      char** cur_category_names = cur_covar_col->category_names;
+      const uint32_t covar_name_base_slen = strlen(covar_name_base);
+      uint32_t cat_uidx = 1;
+      // this is equivalent to "--split-cat-pheno omit-last covar-01"
+      for (uint32_t cat_idx = 1; cat_idx < remaining_cat_ct; ++cat_idx, ++cat_uidx) {
+	next_set_unsafe_ck(cat_covar_wkspace, &cat_uidx);
+
+	const char* catname = cur_category_names[cat_uidx];
+	const uint32_t catname_slen = strlen(catname);
+	new_covar_name_alloc -= covar_name_base_slen + catname_slen + 2;
+	if (new_covar_name_alloc < alloc_base) {
+	  return 1;
+	}
+	*cur_covar_names_iter++ = (char*)new_covar_name_alloc;
+	char* new_covar_name_write = memcpyax(new_covar_name_alloc, covar_name_base, covar_name_base_slen, '=');
+	memcpyx(new_covar_name_write, catname, catname_slen, '\0');
+	
+	uint32_t sample_uidx = first_sample_uidx;
+	uint32_t cur_cat_obs_ct = 0;
+	for (uintptr_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+	  next_set_unsafe_ck(sample_include, &sample_uidx);
+	  const uint32_t cur_sample_is_in_cat = (covar_vals[sample_uidx] == cat_uidx);
+	  cur_cat_obs_ct += cur_sample_is_in_cat;
+	  *covar_write_iter++ = (double)((int32_t)cur_sample_is_in_cat);
+	}
+	*sum_iter++ = (double)((int32_t)cur_cat_obs_ct);
+      }
+    }
+  }
+  bigstack_end_set(new_covar_name_alloc);
+  assert(covar_write_iter == &(covars_cmaj[new_nonlocal_covar_ct * sample_ct]));
+  transpose_copy(covars_cmaj, new_nonlocal_covar_ct, sample_ct, covars_smaj);
+  row_major_matrix_multiply(covars_cmaj, covars_smaj, new_nonlocal_covar_ct, new_nonlocal_covar_ct, sample_ct, covar_dotprod);
+  // we have the dot products; now compute
+  //   cov(a,b) = (dotprod - sum(a) * mean(b)) / (N-1)
+  // to get the (Bessel-corrected) sample covariance
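+  // (this follows from sum((a_i - mean(a)) * (b_i - mean(b)))
+  //  = dotprod(a,b) - N * mean(a) * mean(b) = dotprod(a,b) - sum(a) * mean(b))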
+  const uintptr_t new_nonlocal_covar_ct_p1 = new_nonlocal_covar_ct + 1;
+  const double sample_ct_m1_recip = 1.0 / sample_ct_m1_d;
+  double* covar_dotprod_iter = covar_dotprod;
+  double* sample_covariance_iter = inverse_corr_buf;
+  for (uintptr_t covar_idx1 = 0; covar_idx1 < new_nonlocal_covar_ct; ++covar_idx1) {
+    const double covar1_mean_adj = dbl_2d_buf[covar_idx1] * sample_ct_recip;
+    for (uintptr_t covar_idx2 = 0; covar_idx2 < new_nonlocal_covar_ct; ++covar_idx2) {
+      *sample_covariance_iter++ = ((*covar_dotprod_iter++) - covar1_mean_adj * dbl_2d_buf[covar_idx2]) * sample_ct_m1_recip;
+    }
+  }
+  // now use dbl_2d_buf to store inverse-sqrts, to get to correlation matrix
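+  // (corr(a,b) = cov(a,b) / (stdev(a) * stdev(b)))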
+  for (uintptr_t covar_idx = 0; covar_idx < new_nonlocal_covar_ct; ++covar_idx) {
+    dbl_2d_buf[covar_idx] = 1.0 / sqrt(inverse_corr_buf[covar_idx * new_nonlocal_covar_ct_p1]);
+  }
+  for (uintptr_t covar_idx1 = 1; covar_idx1 < new_nonlocal_covar_ct; ++covar_idx1) {
+    const double inverse_stdev1 = dbl_2d_buf[covar_idx1];
+    double* corr_row_iter = &(inverse_corr_buf[covar_idx1 * new_nonlocal_covar_ct]);
+    double* corr_col_start = &(inverse_corr_buf[covar_idx1]);
+    const double* inverse_stdev2_iter = dbl_2d_buf;
+    for (uintptr_t covar_idx2 = 0; covar_idx2 < covar_idx1; ++covar_idx2) {
+      double* corr_col_entry_ptr = &(corr_col_start[covar_idx2 * new_nonlocal_covar_ct]);
+      const double cur_corr = (*corr_col_entry_ptr) * inverse_stdev1 * (*inverse_stdev2_iter++);
+      // compare absolute value, so that strong negative correlations also
+      // trip the check
+      if (fabs(cur_corr) > max_corr) {
+	vif_corr_check_result_ptr->errcode = kVifCorrCheckCorrFail;
+	// may as well put smaller index first
+	vif_corr_check_result_ptr->covar_idx1 = covar_idx2;
+	vif_corr_check_result_ptr->covar_idx2 = covar_idx1;
+	// bigstack_reset unnecessary here, we'll reset the stack anyway before
+	// exiting or starting the next covariate
+	return 0;
+      }
+      *corr_col_entry_ptr = cur_corr;
+      *corr_row_iter++ = cur_corr;
+    }
+  }
+  for (uintptr_t covar_idx = 0; covar_idx < new_nonlocal_covar_ct; ++covar_idx) {
+    inverse_corr_buf[covar_idx * new_nonlocal_covar_ct_p1] = 1.0;
+  }
+  if (invert_matrix_checked(new_nonlocal_covar_ct, inverse_corr_buf, matrix_invert_buf1, dbl_2d_buf)) {
+    vif_corr_check_result_ptr->errcode = kVifCorrCheckVifFail;
+    vif_corr_check_result_ptr->covar_idx1 = 0xffffffffU;
+    return 0;
+  }
+  // VIFs = diagonal elements of inverse correlation matrix
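+  // (equivalently, VIF_j = 1 / (1 - R_j^2), where R_j^2 comes from regressing
+  // covariate j on all the other covariates)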
+  for (uintptr_t covar_idx = 0; covar_idx < new_nonlocal_covar_ct; ++covar_idx) {
+    if (inverse_corr_buf[covar_idx * new_nonlocal_covar_ct_p1] > vif_thresh) {
+      vif_corr_check_result_ptr->errcode = kVifCorrCheckVifFail;
+      vif_corr_check_result_ptr->covar_idx1 = covar_idx + local_covar_ct;
+      return 0;
+    }
+  }
+  bigstack_reset(matrix_invert_buf1);
+  return 0;
+}
+
+boolerr_t glm_alloc_fill_and_test_pheno_covars_qt(const uintptr_t* sample_include, const double* pheno_qt, const uintptr_t* covar_include, const pheno_col_t* covar_cols, const char* covar_names, uintptr_t sample_ct, uintptr_t covar_ct, uint32_t local_covar_ct, uint32_t covar_max_nonnull_cat_ct, uintptr_t extra_cat_ct, uintptr_t max_covar_name_blen, double vif_thresh, double max_corr, double** pheno_d_ptr, double** covars_cmaj_d_ptr, char*** cur_covar_names_ptr, vif_corr_err_t* vif_corr_check_result_ptr) {
+  const uintptr_t new_covar_ct = covar_ct + extra_cat_ct;
+  const uintptr_t new_nonlocal_covar_ct = new_covar_ct - local_covar_ct;
+  double* covar_dotprod;
+  double* covars_smaj;
+  if (bigstack_alloc_d(sample_ct, pheno_d_ptr) ||
+      bigstack_alloc_cp(new_covar_ct, cur_covar_names_ptr) ||
+      bigstack_alloc_d(new_nonlocal_covar_ct * sample_ct, covars_cmaj_d_ptr) ||
+      bigstack_alloc_d(new_nonlocal_covar_ct * new_nonlocal_covar_ct, &covar_dotprod) ||
+      bigstack_alloc_d(new_nonlocal_covar_ct * sample_ct, &covars_smaj)) {
+    return 1;
+  }
+  double* pheno_d_iter = *pheno_d_ptr;
+  uint32_t sample_uidx = 0;
+  for (uintptr_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+    next_set_unsafe_ck(sample_include, &sample_uidx);
+    *pheno_d_iter++ = pheno_qt[sample_uidx];
+  }
+  if (glm_fill_and_test_covars(sample_include, covar_include, covar_cols, covar_names, sample_ct, covar_ct, local_covar_ct, covar_max_nonnull_cat_ct, extra_cat_ct, max_covar_name_blen, vif_thresh, max_corr, covars_smaj, covar_dotprod, *covars_cmaj_d_ptr, *cur_covar_names_ptr, vif_corr_check_result_ptr)) {
+    return 1;
+  }
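+  // keep the pheno/covariate/name allocations made above; covar_dotprod and
+  // covars_smaj were only scratch space for glm_fill_and_test_covars()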
+  bigstack_reset(covar_dotprod);
+  return 0;
+}
+
+boolerr_t glm_alloc_fill_and_test_pheno_covars_cc(const uintptr_t* sample_include, const uintptr_t* pheno_cc, const uintptr_t* covar_include, const pheno_col_t* covar_cols, const char* covar_names, uintptr_t sample_ct, uintptr_t covar_ct, uint32_t local_covar_ct, uint32_t covar_max_nonnull_cat_ct, uintptr_t extra_cat_ct, uintptr_t max_covar_name_blen, double vif_thresh, double max_corr, uintptr_t** pheno_cc_collapsed_ptr, float** pheno_f_ptr, float** covars_cmaj_f_ptr, char*** cur_covar_names_ptr, vif_corr_err_t* vif_corr_check_result_ptr) {
+  const uintptr_t sample_cta4 = round_up_pow2(sample_ct, 4);
+  const uintptr_t new_covar_ct = covar_ct + extra_cat_ct;
+  const uintptr_t new_nonlocal_covar_ct = new_covar_ct - local_covar_ct;
+  double* covars_cmaj_d;
+  double* covars_smaj_d;
+  double* covar_dotprod;
+  if (bigstack_alloc_ul(BITCT_TO_WORDCT(sample_ct), pheno_cc_collapsed_ptr) ||
+      bigstack_alloc_f(sample_cta4, pheno_f_ptr) ||
+      bigstack_alloc_f(new_nonlocal_covar_ct * sample_cta4, covars_cmaj_f_ptr) ||
+      bigstack_alloc_cp(new_covar_ct, cur_covar_names_ptr) ||
+      bigstack_alloc_d(new_nonlocal_covar_ct * sample_ct, &covars_cmaj_d) ||
+      bigstack_alloc_d(new_nonlocal_covar_ct * new_nonlocal_covar_ct, &covar_dotprod) ||
+      bigstack_alloc_d(new_nonlocal_covar_ct * sample_ct, &covars_smaj_d)) {
+    return 1;
+  }
+  uintptr_t* pheno_cc_collapsed = *pheno_cc_collapsed_ptr;
+  copy_bitarr_subset(pheno_cc, sample_include, sample_ct, pheno_cc_collapsed);
+  float* pheno_f_iter = *pheno_f_ptr;
+  for (uintptr_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+    *pheno_f_iter++ = (float)((int32_t)is_set(pheno_cc_collapsed, sample_idx));
+  }
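+  // zero-fill the trailing (sample_cta4 - sample_ct) lanes so the 4-float-wide
+  // vector kernels in the regression routines can safely process full vectors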
+  const uint32_t sample_rem4 = sample_cta4 - sample_ct;
+  fill_float_zero(sample_rem4, pheno_f_iter);
+  if (glm_fill_and_test_covars(sample_include, covar_include, covar_cols, covar_names, sample_ct, covar_ct, local_covar_ct, covar_max_nonnull_cat_ct, extra_cat_ct, max_covar_name_blen, vif_thresh, max_corr, covars_smaj_d, covar_dotprod, covars_cmaj_d, *cur_covar_names_ptr, vif_corr_check_result_ptr)) {
+    return 1;
+  }
+  double* covar_read_iter = covars_cmaj_d;
+  float* covar_write_iter = *covars_cmaj_f_ptr;
+  for (uintptr_t covar_idx = 0; covar_idx < new_nonlocal_covar_ct; ++covar_idx) {
+    for (uintptr_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+      *covar_write_iter++ = (float)(*covar_read_iter++);
+    }
+    fill_float_zero(sample_rem4, covar_write_iter);
+    covar_write_iter = &(covar_write_iter[sample_rem4]);
+  }
+  bigstack_reset(covars_cmaj_d);
+  return 0;
+}
+
+static const float kSmallFloats[4] = {0.0f, 1.0f, 2.0f, 3.0f};
+// static const float kSmallFloats[4] = {0.0f, 1.0f, 2.0f, -9.0f};
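+// (2-bit genotype codes: 0/1/2 are the genotype values, 3 is missing.  A
+// table load avoids an int->float conversion in the inner loop, and makes it
+// trivial to remap the missing code; see the commented-out variant above.)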
+
+void genoarr_to_floats(const uintptr_t* genoarr, uint32_t sample_ct, float* floatbuf) {
+  assert(sample_ct);
+  const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
+  uint32_t widx = 0;
+  uint32_t subgroup_len = kBitsPerWordD2;
+  float* floatbuf_iter = floatbuf;
+  while (1) {
+    if (widx >= sample_ctl2m1) {
+      if (widx > sample_ctl2m1) {
+	return;
+      }
+      subgroup_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t geno_word = genoarr[widx];
+    for (uint32_t uii = 0; uii < subgroup_len; ++uii) {
+      const uintptr_t cur_geno = geno_word & 3;
+      // *floatbuf_iter++ = (float)((int32_t)cur_geno);
+      *floatbuf_iter++ = kSmallFloats[cur_geno];
+      geno_word >>= 2;
+    }
+    ++widx;
+  }
+}
+
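+// same 2-bit unpacking as genoarr_to_floats(), but genotypes with the missing
+// code (3) are skipped; returns the number of floats written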
+uint32_t genoarr_to_floats_remove_missing(const uintptr_t* genoarr, uint32_t sample_ct, float* floatbuf) {
+  assert(sample_ct);
+  const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
+  uint32_t widx = 0;
+  uint32_t subgroup_len = kBitsPerWordD2;
+  float* floatbuf_iter = floatbuf;
+  while (1) {
+    if (widx >= sample_ctl2m1) {
+      if (widx > sample_ctl2m1) {
+	return (uintptr_t)(floatbuf_iter - floatbuf);
+      }
+      subgroup_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t geno_word = genoarr[widx];
+    for (uint32_t uii = 0; uii < subgroup_len; ++uii) {
+      const uintptr_t cur_geno = geno_word & 3;
+      if (cur_geno < 3) {
+	// *floatbuf_iter++ = (float)((int32_t)cur_geno);
+	*floatbuf_iter++ = kSmallFloats[cur_geno];
+      }
+      geno_word >>= 2;
+    }
+    ++widx;
+  }
+}
+
+// #####
+// The following code is based on the winning submission of Pascal Pons in the
+// "GWASSpeedup" contest run in April 2013 by Babbage Analytics & Innovation
+// and TopCoder, who have donated the results to be used in PLINK.  The contest
+// was designed by Po-Ru Loh; subsequent analysis and code preparation were
+// performed by Andrew Hill, Ragu Bharadwaj, and Scott Jelinsky.  A manuscript
+// is in preparation by these authors and Iain Kilty, Kevin Boudreau, Karim
+// Lakhani and Eva Guinan.
+// #####
+
+#ifdef __LP64__
+// fmath_exp_ps is a C port of Shigeo Mitsunari's fast math library function
+// posted at https://github.com/herumi/fmath .  License is
+// http://opensource.org/licenses/BSD-3-Clause .
+// (I tried porting fmath_log_ps, but it turns out that Firth regression needs
+// double-precision log accuracy; logf() actually interferes with convergence.)
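+//
+// Sketch of the method: with r = round(x * 2^10 / log(2)),
+//   exp(x) = 2^(r / 2^10) * exp(x - r * log(2) / 2^10).
+// The high bits of r supply the float exponent (u4 below), the low 10 bits
+// index the 1024-entry mantissa table (v4), and the remaining term is small
+// enough that exp(eps) is approximated by 1 + eps (the tt vector).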
+
+// programmatically generated by:
+// typedef union {
+//   float f4;
+//   uint32_t u4;
+// } __uni4;
+//
+// __uni4 u4;
+// int32_t ii;
+// for (ii = 0; ii < 1024; ii++) {
+//   u4.f4 = pow(2.0f, ((float)ii) / 1024.0);
+//   printf("0x%08x", u4.u4 & 0x7fffff);
+//   if (ii % 4 != 3) {
+//     printf(", ");
+//   } else {
+//     printf(",\n");
+//   }
+// }
+const uint32_t float_exp_lookup_int[] __attribute__((aligned(16))) = {
+0x00000000, 0x00001630, 0x00002c64, 0x0000429c,
+0x000058d8, 0x00006f17, 0x0000855b, 0x00009ba2,
+0x0000b1ed, 0x0000c83c, 0x0000de8f, 0x0000f4e6,
+0x00010b41, 0x0001219f, 0x00013802, 0x00014e68,
+0x000164d2, 0x00017b40, 0x000191b2, 0x0001a828,
+0x0001bea1, 0x0001d51f, 0x0001eba1, 0x00020226,
+0x000218af, 0x00022f3c, 0x000245ce, 0x00025c63,
+0x000272fc, 0x00028998, 0x0002a039, 0x0002b6de,
+0x0002cd87, 0x0002e433, 0x0002fae4, 0x00031198,
+0x00032850, 0x00033f0d, 0x000355cd, 0x00036c91,
+0x00038359, 0x00039a25, 0x0003b0f5, 0x0003c7c9,
+0x0003dea1, 0x0003f57d, 0x00040c5d, 0x00042341,
+0x00043a29, 0x00045115, 0x00046804, 0x00047ef8,
+0x000495f0, 0x0004aceb, 0x0004c3eb, 0x0004daef,
+0x0004f1f6, 0x00050902, 0x00052012, 0x00053725,
+0x00054e3d, 0x00056558, 0x00057c78, 0x0005939c,
+0x0005aac3, 0x0005c1ef, 0x0005d91f, 0x0005f052,
+0x0006078a, 0x00061ec6, 0x00063606, 0x00064d4a,
+0x00066491, 0x00067bdd, 0x0006932d, 0x0006aa81,
+0x0006c1d9, 0x0006d935, 0x0006f095, 0x000707f9,
+0x00071f62, 0x000736ce, 0x00074e3e, 0x000765b3,
+0x00077d2b, 0x000794a8, 0x0007ac28, 0x0007c3ad,
+0x0007db35, 0x0007f2c2, 0x00080a53, 0x000821e8,
+0x00083981, 0x0008511e, 0x000868c0, 0x00088065,
+0x0008980f, 0x0008afbc, 0x0008c76e, 0x0008df23,
+0x0008f6dd, 0x00090e9b, 0x0009265d, 0x00093e24,
+0x000955ee, 0x00096dbc, 0x0009858f, 0x00099d66,
+0x0009b541, 0x0009cd20, 0x0009e503, 0x0009fcea,
+0x000a14d5, 0x000a2cc5, 0x000a44b9, 0x000a5cb1,
+0x000a74ad, 0x000a8cad, 0x000aa4b1, 0x000abcba,
+0x000ad4c6, 0x000aecd7, 0x000b04ec, 0x000b1d05,
+0x000b3523, 0x000b4d44, 0x000b656a, 0x000b7d94,
+0x000b95c2, 0x000badf4, 0x000bc62b, 0x000bde65,
+0x000bf6a4, 0x000c0ee7, 0x000c272f, 0x000c3f7a,
+0x000c57ca, 0x000c701e, 0x000c8876, 0x000ca0d2,
+0x000cb933, 0x000cd198, 0x000cea01, 0x000d026e,
+0x000d1adf, 0x000d3355, 0x000d4bcf, 0x000d644d,
+0x000d7cd0, 0x000d9556, 0x000dade1, 0x000dc671,
+0x000ddf04, 0x000df79c, 0x000e1038, 0x000e28d8,
+0x000e417d, 0x000e5a25, 0x000e72d3, 0x000e8b84,
+0x000ea43a, 0x000ebcf3, 0x000ed5b2, 0x000eee74,
+0x000f073b, 0x000f2006, 0x000f38d5, 0x000f51a9,
+0x000f6a81, 0x000f835d, 0x000f9c3e, 0x000fb523,
+0x000fce0c, 0x000fe6fa, 0x000fffec, 0x001018e2,
+0x001031dc, 0x00104adb, 0x001063de, 0x00107ce6,
+0x001095f2, 0x0010af02, 0x0010c816, 0x0010e12f,
+0x0010fa4d, 0x0011136e, 0x00112c94, 0x001145be,
+0x00115eed, 0x00117820, 0x00119158, 0x0011aa93,
+0x0011c3d3, 0x0011dd18, 0x0011f661, 0x00120fae,
+0x00122900, 0x00124256, 0x00125bb0, 0x0012750f,
+0x00128e72, 0x0012a7da, 0x0012c146, 0x0012dab7,
+0x0012f42c, 0x00130da5, 0x00132723, 0x001340a5,
+0x00135a2b, 0x001373b6, 0x00138d46, 0x0013a6d9,
+0x0013c072, 0x0013da0e, 0x0013f3af, 0x00140d55,
+0x001426ff, 0x001440ae, 0x00145a60, 0x00147418,
+0x00148dd4, 0x0014a794, 0x0014c159, 0x0014db22,
+0x0014f4f0, 0x00150ec2, 0x00152898, 0x00154274,
+0x00155c53, 0x00157637, 0x00159020, 0x0015aa0d,
+0x0015c3ff, 0x0015ddf5, 0x0015f7ef, 0x001611ee,
+0x00162bf2, 0x001645fa, 0x00166006, 0x00167a18,
+0x0016942d, 0x0016ae47, 0x0016c866, 0x0016e289,
+0x0016fcb1, 0x001716dd, 0x0017310e, 0x00174b43,
+0x0017657d, 0x00177fbc, 0x001799ff, 0x0017b446,
+0x0017ce92, 0x0017e8e3, 0x00180338, 0x00181d92,
+0x001837f0, 0x00185253, 0x00186cbb, 0x00188727,
+0x0018a197, 0x0018bc0d, 0x0018d686, 0x0018f105,
+0x00190b88, 0x0019260f, 0x0019409c, 0x00195b2c,
+0x001975c2, 0x0019905c, 0x0019aafa, 0x0019c59e,
+0x0019e046, 0x0019faf2, 0x001a15a3, 0x001a3059,
+0x001a4b13, 0x001a65d2, 0x001a8096, 0x001a9b5e,
+0x001ab62b, 0x001ad0fd, 0x001aebd3, 0x001b06ae,
+0x001b218d, 0x001b3c71, 0x001b575a, 0x001b7248,
+0x001b8d3a, 0x001ba831, 0x001bc32c, 0x001bde2c,
+0x001bf931, 0x001c143b, 0x001c2f49, 0x001c4a5c,
+0x001c6573, 0x001c8090, 0x001c9bb1, 0x001cb6d6,
+0x001cd201, 0x001ced30, 0x001d0864, 0x001d239c,
+0x001d3eda, 0x001d5a1c, 0x001d7562, 0x001d90ae,
+0x001dabfe, 0x001dc753, 0x001de2ad, 0x001dfe0b,
+0x001e196e, 0x001e34d6, 0x001e5043, 0x001e6bb4,
+0x001e872a, 0x001ea2a5, 0x001ebe25, 0x001ed9a9,
+0x001ef532, 0x001f10c0, 0x001f2c53, 0x001f47eb,
+0x001f6387, 0x001f7f28, 0x001f9ace, 0x001fb679,
+0x001fd228, 0x001feddc, 0x00200996, 0x00202553,
+0x00204116, 0x00205cde, 0x002078aa, 0x0020947b,
+0x0020b051, 0x0020cc2c, 0x0020e80b, 0x002103f0,
+0x00211fd9, 0x00213bc7, 0x002157ba, 0x002173b2,
+0x00218faf, 0x0021abb0, 0x0021c7b7, 0x0021e3c2,
+0x0021ffd2, 0x00221be7, 0x00223801, 0x0022541f,
+0x00227043, 0x00228c6b, 0x0022a899, 0x0022c4cb,
+0x0022e102, 0x0022fd3e, 0x0023197f, 0x002335c5,
+0x0023520f, 0x00236e5f, 0x00238ab3, 0x0023a70d,
+0x0023c36b, 0x0023dfce, 0x0023fc37, 0x002418a4,
+0x00243516, 0x0024518d, 0x00246e08, 0x00248a89,
+0x0024a70f, 0x0024c39a, 0x0024e029, 0x0024fcbe,
+0x00251958, 0x002535f6, 0x00255299, 0x00256f42,
+0x00258bef, 0x0025a8a2, 0x0025c559, 0x0025e215,
+0x0025fed7, 0x00261b9d, 0x00263868, 0x00265538,
+0x0026720e, 0x00268ee8, 0x0026abc7, 0x0026c8ac,
+0x0026e595, 0x00270283, 0x00271f76, 0x00273c6f,
+0x0027596c, 0x0027766e, 0x00279376, 0x0027b082,
+0x0027cd94, 0x0027eaaa, 0x002807c6, 0x002824e6,
+0x0028420c, 0x00285f37, 0x00287c66, 0x0028999b,
+0x0028b6d5, 0x0028d414, 0x0028f158, 0x00290ea1,
+0x00292bef, 0x00294942, 0x0029669b, 0x002983f8,
+0x0029a15b, 0x0029bec2, 0x0029dc2f, 0x0029f9a1,
+0x002a1718, 0x002a3494, 0x002a5215, 0x002a6f9b,
+0x002a8d26, 0x002aaab7, 0x002ac84c, 0x002ae5e7,
+0x002b0387, 0x002b212c, 0x002b3ed6, 0x002b5c85,
+0x002b7a3a, 0x002b97f3, 0x002bb5b2, 0x002bd376,
+0x002bf13f, 0x002c0f0d, 0x002c2ce0, 0x002c4ab9,
+0x002c6897, 0x002c867a, 0x002ca462, 0x002cc24f,
+0x002ce041, 0x002cfe39, 0x002d1c36, 0x002d3a38,
+0x002d583f, 0x002d764b, 0x002d945d, 0x002db274,
+0x002dd090, 0x002deeb1, 0x002e0cd8, 0x002e2b03,
+0x002e4934, 0x002e676b, 0x002e85a6, 0x002ea3e7,
+0x002ec22d, 0x002ee078, 0x002efec8, 0x002f1d1e,
+0x002f3b79, 0x002f59d9, 0x002f783e, 0x002f96a9,
+0x002fb519, 0x002fd38e, 0x002ff209, 0x00301089,
+0x00302f0e, 0x00304d98, 0x00306c28, 0x00308abd,
+0x0030a957, 0x0030c7f7, 0x0030e69c, 0x00310546,
+0x003123f6, 0x003142aa, 0x00316165, 0x00318024,
+0x00319ee9, 0x0031bdb3, 0x0031dc83, 0x0031fb57,
+0x00321a32, 0x00323911, 0x003257f6, 0x003276e0,
+0x003295d0, 0x0032b4c5, 0x0032d3bf, 0x0032f2bf,
+0x003311c4, 0x003330cf, 0x00334fde, 0x00336ef4,
+0x00338e0e, 0x0033ad2e, 0x0033cc54, 0x0033eb7e,
+0x00340aaf, 0x003429e4, 0x0034491f, 0x00346860,
+0x003487a6, 0x0034a6f1, 0x0034c642, 0x0034e598,
+0x003504f3, 0x00352454, 0x003543bb, 0x00356327,
+0x00358298, 0x0035a20f, 0x0035c18b, 0x0035e10d,
+0x00360094, 0x00362020, 0x00363fb2, 0x00365f4a,
+0x00367ee7, 0x00369e89, 0x0036be31, 0x0036dddf,
+0x0036fd92, 0x00371d4a, 0x00373d08, 0x00375ccc,
+0x00377c95, 0x00379c63, 0x0037bc37, 0x0037dc11,
+0x0037fbf0, 0x00381bd4, 0x00383bbe, 0x00385bae,
+0x00387ba3, 0x00389b9e, 0x0038bb9e, 0x0038dba4,
+0x0038fbaf, 0x00391bc0, 0x00393bd7, 0x00395bf3,
+0x00397c14, 0x00399c3b, 0x0039bc68, 0x0039dc9a,
+0x0039fcd2, 0x003a1d10, 0x003a3d53, 0x003a5d9b,
+0x003a7dea, 0x003a9e3e, 0x003abe97, 0x003adef6,
+0x003aff5b, 0x003b1fc5, 0x003b4035, 0x003b60aa,
+0x003b8126, 0x003ba1a6, 0x003bc22d, 0x003be2b9,
+0x003c034a, 0x003c23e2, 0x003c447f, 0x003c6521,
+0x003c85ca, 0x003ca678, 0x003cc72b, 0x003ce7e5,
+0x003d08a4, 0x003d2968, 0x003d4a33, 0x003d6b03,
+0x003d8bd8, 0x003dacb4, 0x003dcd95, 0x003dee7c,
+0x003e0f68, 0x003e305a, 0x003e5152, 0x003e7250,
+0x003e9353, 0x003eb45c, 0x003ed56b, 0x003ef67f,
+0x003f179a, 0x003f38ba, 0x003f59df, 0x003f7b0b,
+0x003f9c3c, 0x003fbd73, 0x003fdeb0, 0x003ffff2,
+0x0040213b, 0x00404289, 0x004063dc, 0x00408536,
+0x0040a695, 0x0040c7fb, 0x0040e966, 0x00410ad6,
+0x00412c4d, 0x00414dc9, 0x00416f4b, 0x004190d3,
+0x0041b261, 0x0041d3f5, 0x0041f58e, 0x0042172d,
+0x004238d2, 0x00425a7d, 0x00427c2e, 0x00429de4,
+0x0042bfa1, 0x0042e163, 0x0043032b, 0x004324f9,
+0x004346cd, 0x004368a7, 0x00438a86, 0x0043ac6b,
+0x0043ce57, 0x0043f048, 0x0044123f, 0x0044343c,
+0x0044563f, 0x00447848, 0x00449a56, 0x0044bc6b,
+0x0044de85, 0x004500a5, 0x004522cc, 0x004544f8,
+0x0045672a, 0x00458962, 0x0045aba0, 0x0045cde4,
+0x0045f02e, 0x0046127e, 0x004634d3, 0x0046572f,
+0x00467991, 0x00469bf8, 0x0046be66, 0x0046e0d9,
+0x00470353, 0x004725d2, 0x00474858, 0x00476ae3,
+0x00478d75, 0x0047b00c, 0x0047d2aa, 0x0047f54d,
+0x004817f7, 0x00483aa6, 0x00485d5b, 0x00488017,
+0x0048a2d8, 0x0048c5a0, 0x0048e86d, 0x00490b41,
+0x00492e1b, 0x004950fa, 0x004973e0, 0x004996cc,
+0x0049b9be, 0x0049dcb5, 0x0049ffb3, 0x004a22b7,
+0x004a45c1, 0x004a68d1, 0x004a8be8, 0x004aaf04,
+0x004ad226, 0x004af54f, 0x004b187d, 0x004b3bb2,
+0x004b5eed, 0x004b822e, 0x004ba575, 0x004bc8c2,
+0x004bec15, 0x004c0f6e, 0x004c32ce, 0x004c5633,
+0x004c799f, 0x004c9d11, 0x004cc089, 0x004ce407,
+0x004d078c, 0x004d2b16, 0x004d4ea7, 0x004d723d,
+0x004d95da, 0x004db97e, 0x004ddd27, 0x004e00d6,
+0x004e248c, 0x004e4848, 0x004e6c0a, 0x004e8fd2,
+0x004eb3a1, 0x004ed775, 0x004efb50, 0x004f1f31,
+0x004f4319, 0x004f6706, 0x004f8afa, 0x004faef4,
+0x004fd2f4, 0x004ff6fb, 0x00501b08, 0x00503f1b,
+0x00506334, 0x00508753, 0x0050ab79, 0x0050cfa5,
+0x0050f3d7, 0x00511810, 0x00513c4f, 0x00516094,
+0x005184df, 0x0051a931, 0x0051cd89, 0x0051f1e7,
+0x0052164c, 0x00523ab7, 0x00525f28, 0x005283a0,
+0x0052a81e, 0x0052cca2, 0x0052f12c, 0x005315bd,
+0x00533a54, 0x00535ef2, 0x00538396, 0x0053a840,
+0x0053ccf1, 0x0053f1a8, 0x00541665, 0x00543b29,
+0x00545ff3, 0x005484c3, 0x0054a99a, 0x0054ce77,
+0x0054f35b, 0x00551845, 0x00553d35, 0x0055622c,
+0x00558729, 0x0055ac2d, 0x0055d137, 0x0055f647,
+0x00561b5e, 0x0056407b, 0x0056659f, 0x00568ac9,
+0x0056affa, 0x0056d531, 0x0056fa6e, 0x00571fb2,
+0x005744fd, 0x00576a4e, 0x00578fa5, 0x0057b503,
+0x0057da67, 0x0057ffd2, 0x00582543, 0x00584abb,
+0x00587039, 0x005895be, 0x0058bb49, 0x0058e0db,
+0x00590673, 0x00592c12, 0x005951b8, 0x00597763,
+0x00599d16, 0x0059c2cf, 0x0059e88e, 0x005a0e54,
+0x005a3421, 0x005a59f4, 0x005a7fcd, 0x005aa5ae,
+0x005acb94, 0x005af182, 0x005b1776, 0x005b3d70,
+0x005b6371, 0x005b8979, 0x005baf87, 0x005bd59c,
+0x005bfbb8, 0x005c21da, 0x005c4802, 0x005c6e32,
+0x005c9468, 0x005cbaa4, 0x005ce0e7, 0x005d0731,
+0x005d2d82, 0x005d53d9, 0x005d7a36, 0x005da09b,
+0x005dc706, 0x005ded77, 0x005e13f0, 0x005e3a6f,
+0x005e60f5, 0x005e8781, 0x005eae14, 0x005ed4ae,
+0x005efb4e, 0x005f21f5, 0x005f48a3, 0x005f6f58,
+0x005f9613, 0x005fbcd5, 0x005fe39e, 0x00600a6d,
+0x00603143, 0x00605820, 0x00607f03, 0x0060a5ee,
+0x0060ccdf, 0x0060f3d7, 0x00611ad5, 0x006141db,
+0x006168e7, 0x00618ffa, 0x0061b713, 0x0061de34,
+0x0062055b, 0x00622c89, 0x006253be, 0x00627af9,
+0x0062a23c, 0x0062c985, 0x0062f0d5, 0x0063182c,
+0x00633f89, 0x006366ee, 0x00638e59, 0x0063b5cb,
+0x0063dd44, 0x006404c4, 0x00642c4b, 0x006453d8,
+0x00647b6d, 0x0064a308, 0x0064caaa, 0x0064f253,
+0x00651a03, 0x006541b9, 0x00656977, 0x0065913c,
+0x0065b907, 0x0065e0d9, 0x006608b2, 0x00663092,
+0x00665879, 0x00668067, 0x0066a85c, 0x0066d058,
+0x0066f85b, 0x00672064, 0x00674875, 0x0067708c,
+0x006798ab, 0x0067c0d0, 0x0067e8fd, 0x00681130,
+0x0068396a, 0x006861ac, 0x006889f4, 0x0068b243,
+0x0068da99, 0x006902f7, 0x00692b5b, 0x006953c6,
+0x00697c38, 0x0069a4b1, 0x0069cd32, 0x0069f5b9,
+0x006a1e47, 0x006a46dd, 0x006a6f79, 0x006a981c,
+0x006ac0c7, 0x006ae978, 0x006b1231, 0x006b3af1,
+0x006b63b7, 0x006b8c85, 0x006bb55a, 0x006bde36,
+0x006c0719, 0x006c3003, 0x006c58f4, 0x006c81ec,
+0x006caaec, 0x006cd3f2, 0x006cfd00, 0x006d2614,
+0x006d4f30, 0x006d7853, 0x006da17d, 0x006dcaae,
+0x006df3e7, 0x006e1d26, 0x006e466d, 0x006e6fbb,
+0x006e9910, 0x006ec26c, 0x006eebcf, 0x006f1539,
+0x006f3eab, 0x006f6824, 0x006f91a4, 0x006fbb2b,
+0x006fe4ba, 0x00700e4f, 0x007037ec, 0x00706190,
+0x00708b3b, 0x0070b4ee, 0x0070dea8, 0x00710868,
+0x00713231, 0x00715c00, 0x007185d7, 0x0071afb5,
+0x0071d99a, 0x00720386, 0x00722d7a, 0x00725775,
+0x00728177, 0x0072ab81, 0x0072d592, 0x0072ffaa,
+0x007329c9, 0x007353f0, 0x00737e1e, 0x0073a853,
+0x0073d290, 0x0073fcd4, 0x0074271f, 0x00745172,
+0x00747bcc, 0x0074a62d, 0x0074d096, 0x0074fb06,
+0x0075257d, 0x00754ffc, 0x00757a82, 0x0075a50f,
+0x0075cfa4, 0x0075fa40, 0x007624e4, 0x00764f8f,
+0x00767a41, 0x0076a4fb, 0x0076cfbc, 0x0076fa85,
+0x00772555, 0x0077502d, 0x00777b0b, 0x0077a5f2,
+0x0077d0df, 0x0077fbd5, 0x007826d1, 0x007851d5,
+0x00787ce1, 0x0078a7f4, 0x0078d30e, 0x0078fe30,
+0x0079295a, 0x0079548b, 0x00797fc3, 0x0079ab03,
+0x0079d64a, 0x007a0199, 0x007a2cf0, 0x007a584d,
+0x007a83b3, 0x007aaf20, 0x007ada94, 0x007b0610,
+0x007b3194, 0x007b5d1f, 0x007b88b2, 0x007bb44c,
+0x007bdfed, 0x007c0b97, 0x007c3748, 0x007c6300,
+0x007c8ec0, 0x007cba88, 0x007ce657, 0x007d122e,
+0x007d3e0c, 0x007d69f2, 0x007d95e0, 0x007dc1d5,
+0x007dedd2, 0x007e19d6, 0x007e45e2, 0x007e71f6,
+0x007e9e11, 0x007eca34, 0x007ef65f, 0x007f2291,
+0x007f4ecb, 0x007f7b0d, 0x007fa756, 0x007fd3a7
+};
+
+const float* const float_exp_lookup = (const float*)float_exp_lookup_int;
+
+static inline __m128 fmath_exp_ps(__m128 xx) {
+  const __m128i mask7ff = {0x7fffffff7fffffffLLU, 0x7fffffff7fffffffLLU};
+
+  // 88
+  const __m128i max_x = {0x42b0000042b00000LLU, 0x42b0000042b00000LLU};
+  // -88
+  // more sensible 0xc2b00000... not used here due to "narrowing conversion"
+  // warning
+  const __m128i min_x = {-0x3d4fffff3d500000LL, -0x3d4fffff3d500000LL};
+  // 2^10 / log(2)
+  const __m128i const_aa = {0x44b8aa3b44b8aa3bLLU, 0x44b8aa3b44b8aa3bLLU};
+  // log(2) / 2^10
+  const __m128i const_bb = {0x3a3172183a317218LLU, 0x3a3172183a317218LLU};
+
+  const __m128i f1 = {0x3f8000003f800000LLU, 0x3f8000003f800000LLU};
+  const __m128i mask_s = {0x3ff000003ffLLU, 0x3ff000003ffLLU};
+  const __m128i i127s = {0x1fc000001fc00LLU, 0x1fc000001fc00LLU};
+  const __m128i limit = _mm_castps_si128(_mm_and_ps(xx, (__m128)mask7ff));
+  const int32_t over = _mm_movemask_epi8(_mm_cmpgt_epi32(limit, max_x));
+  if (over) {
+    xx = _mm_min_ps(xx, (__m128)max_x);
+    xx = _mm_max_ps(xx, (__m128)min_x);
+  }
+  const __m128i rr = _mm_cvtps_epi32(_mm_mul_ps(xx, (__m128)const_aa));
+  __m128 tt = _mm_sub_ps(xx, _mm_mul_ps(_mm_cvtepi32_ps(rr), (__m128)const_bb));
+  tt = _mm_add_ps(tt, (__m128)f1);
+  const __m128i v4 = _mm_and_si128(rr, mask_s);
+  __m128i u4 = _mm_add_epi32(rr, i127s);
+  u4 = _mm_srli_epi32(u4, 10);
+  u4 = _mm_slli_epi32(u4, 23);
+  const uint32_t v0 = _mm_cvtsi128_si32(v4);
+  // uint32_t v1 = ((int32_t)(uint16_t)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(v4), (int32_t)(2)));
+  // uint32_t v2 = ((int32_t)(uint16_t)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(v4), (int32_t)(4)));
+  // uint32_t v3 = ((int32_t)(uint16_t)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(v4), (int32_t)(6)));
+  // make this work with LLVM
+  const uint32_t v1 = _mm_extract_epi16(((__m128i)(v4)), ((int32_t)(2)));
+  const uint32_t v2 = _mm_extract_epi16(((__m128i)(v4)), ((int32_t)(4)));
+  const uint32_t v3 = _mm_extract_epi16(((__m128i)(v4)), ((int32_t)(6)));
+
+  __m128 t0 = _mm_set_ss(float_exp_lookup[v0]);
+  __m128 t1 = _mm_set_ss(float_exp_lookup[v1]);
+  const __m128 t2 = _mm_set_ss(float_exp_lookup[v2]);
+  const __m128 t3 = _mm_set_ss(float_exp_lookup[v3]);
+  t1 = _mm_movelh_ps(t1, t3);
+  t1 = _mm_castsi128_ps(_mm_slli_epi64(_mm_castps_si128(t1), 32));
+  t0 = _mm_movelh_ps(t0, t2);
+  t0 = _mm_or_ps(t0, t1);
+  t0 = _mm_or_ps(t0, _mm_castsi128_ps(u4));
+  tt = _mm_mul_ps(tt, t0);
+  return tt;
+}
+
+// For equivalent "normal" C/C++ code, see the non-__LP64__ versions of these
+// functions.
+static inline void logistic_sse(uint32_t nn, float* vect) {
+  const __m128 zero = _mm_setzero_ps();
+  const __m128 one = _mm_set1_ps(1.0);
+  for (uint32_t uii = 0; uii < nn; uii += 4) {
+    __m128 aa = _mm_load_ps(&(vect[uii]));
+    aa = _mm_sub_ps(zero, aa);
+    aa = fmath_exp_ps(aa);
+    aa = _mm_add_ps(aa, one);
+    aa = _mm_div_ps(one, aa);
+    _mm_store_ps(&(vect[uii]), aa);
+  }
+}
+
+static inline void compute_v_and_p_minus_y(const float* yy, uint32_t nn, float* pp, float* vv) {
+  const __m128 one = _mm_set1_ps(1.0);
+  for (uint32_t uii = 0; uii < nn; uii += 4) {
+    __m128 ptmp = _mm_load_ps(&(pp[uii]));
+    __m128 one_minus_ptmp = _mm_sub_ps(one, ptmp);
+    _mm_store_ps(&(vv[uii]), _mm_mul_ps(ptmp, one_minus_ptmp));
+    __m128 ytmp = _mm_load_ps(&(yy[uii]));
+    _mm_store_ps(&(pp[uii]), _mm_sub_ps(ptmp, ytmp));
+  }
+}
+
+static inline void compute_v(const float* pp, uint32_t nn, float* vv) {
+  const __m128 one = _mm_set1_ps(1.0);
+  for (uint32_t uii = 0; uii < nn; uii += 4) {
+    __m128 ptmp = _mm_load_ps(&(pp[uii]));
+    __m128 one_minus_ptmp = _mm_sub_ps(one, ptmp);
+    _mm_store_ps(&(vv[uii]), _mm_mul_ps(ptmp, one_minus_ptmp));
+  }
+}
+
+static inline void mult_tmatrix_nxd_vect_d(const float* tm, const float* vect, uint32_t col_ct, uint32_t row_ct, float* dest) {
+  // tm is row-major, cols are packed to 16-byte alignment
+  // "col_cta4" = col_ct, aligned to multiple of 4.  Since 16-byte blocks
+  // contain 4 floats each, this is the actual length (in floats) of each tm
+  // row.  (Yes, I need to standardize a zillion other variable names of this
+  // sort...)
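+  //
+  // this computes dest[j] = sum_i vect[i] * tm[i][j], i.e. dest = tm^T * vect,
+  // consuming four rows of tm per pass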
+  __m128 w1;
+  __m128 w2;
+  __m128 w3;
+  const uintptr_t col_cta4 = round_up_pow2(col_ct, 4);
+  uint32_t row_idx = 0;
+  if (row_ct < 4) {
+    memset(dest, 0, col_ct * sizeof(float));
+  } else {
+    w1 = _mm_load1_ps(vect);
+    w2 = _mm_load1_ps(&(vect[1]));
+    w3 = _mm_load1_ps(&(vect[2]));
+    __m128 w4 = _mm_load1_ps(&(vect[3]));
+    for (uint32_t col_idx = 0; col_idx < col_ct; col_idx += 4) {
+      __m128 r1 = _mm_load_ps(&(tm[col_idx]));
+      __m128 r2 = _mm_load_ps(&(tm[col_idx + col_cta4]));
+      __m128 r3 = _mm_load_ps(&(tm[col_idx + 2 * col_cta4]));
+      __m128 r4 = _mm_load_ps(&(tm[col_idx + 3 * col_cta4]));
+      r1 = _mm_mul_ps(r1, w1);
+      r2 = _mm_mul_ps(r2, w2);
+      r3 = _mm_mul_ps(r3, w3);
+      r4 = _mm_mul_ps(r4, w4);
+      r1 = _mm_add_ps(r1, r2);
+      r3 = _mm_add_ps(r3, r4);
+      r1 = _mm_add_ps(r1, r3);
+      _mm_store_ps(&(dest[col_idx]), r1);
+    }
+    const uint32_t row_ctm3 = row_ct - 3;
+    for (row_idx = 4; row_idx < row_ctm3; row_idx += 4) {
+      w1 = _mm_load1_ps(&(vect[row_idx]));
+      w2 = _mm_load1_ps(&(vect[row_idx + 1]));
+      w3 = _mm_load1_ps(&(vect[row_idx + 2]));
+      w4 = _mm_load1_ps(&(vect[row_idx + 3]));
+      for (uint32_t col_idx = 0; col_idx < col_ct; col_idx += 4) {
+        __m128 r1 = _mm_load_ps(&(tm[col_idx + row_idx * col_cta4]));
+        __m128 r2 = _mm_load_ps(&(tm[col_idx + (row_idx + 1) * col_cta4]));
+        __m128 r3 = _mm_load_ps(&(tm[col_idx + (row_idx + 2) * col_cta4]));
+        __m128 r4 = _mm_load_ps(&(tm[col_idx + (row_idx + 3) * col_cta4]));
+        r1 = _mm_mul_ps(r1, w1);
+        r2 = _mm_mul_ps(r2, w2);
+        r3 = _mm_mul_ps(r3, w3);
+        r4 = _mm_mul_ps(r4, w4);
+        r1 = _mm_add_ps(r1, r2);
+        r3 = _mm_add_ps(r3, r4);
+        r1 = _mm_add_ps(r1, r3);
+	r1 = _mm_add_ps(r1, _mm_load_ps(&(dest[col_idx])));
+	_mm_store_ps(&(dest[col_idx]), r1);
+      }
+    }
+  }
+  switch (row_ct % 4) {
+  case 3:
+    w1 = _mm_load1_ps(&(vect[row_idx]));
+    w2 = _mm_load1_ps(&(vect[row_idx + 1]));
+    w3 = _mm_load1_ps(&(vect[row_idx + 2]));
+    for (uint32_t col_idx = 0; col_idx < col_ct; col_idx += 4) {
+      __m128 r1 = _mm_load_ps(&(tm[col_idx + row_idx * col_cta4]));
+      __m128 r2 = _mm_load_ps(&(tm[col_idx + (row_idx + 1) * col_cta4]));
+      __m128 r3 = _mm_load_ps(&(tm[col_idx + (row_idx + 2) * col_cta4]));
+      r1 = _mm_mul_ps(r1, w1);
+      r2 = _mm_mul_ps(r2, w2);
+      r3 = _mm_mul_ps(r3, w3);
+      r1 = _mm_add_ps(r1, r2);
+      r3 = _mm_add_ps(r3, _mm_load_ps(&(dest[col_idx])));
+      r1 = _mm_add_ps(r1, r3);
+      _mm_store_ps(&(dest[col_idx]), r1);
+    }
+    break;
+  case 2:
+    w1 = _mm_load1_ps(&(vect[row_idx]));
+    w2 = _mm_load1_ps(&(vect[row_idx + 1]));
+    for (uint32_t col_idx = 0; col_idx < col_ct; col_idx += 4) {
+      __m128 r1 = _mm_load_ps(&(tm[col_idx + row_idx * col_cta4]));
+      __m128 r2 = _mm_load_ps(&(tm[col_idx + (row_idx + 1) * col_cta4]));
+      r1 = _mm_mul_ps(r1, w1);
+      r2 = _mm_mul_ps(r2, w2);
+      r1 = _mm_add_ps(r1, r2);
+      r1 = _mm_add_ps(r1, _mm_load_ps(&(dest[col_idx])));
+      _mm_store_ps(&(dest[col_idx]), r1);
+    }
+    break;
+  case 1:
+    w1 = _mm_load1_ps(&(vect[row_idx]));
+    for (uint32_t col_idx = 0; col_idx < col_ct; col_idx += 4) {
+      __m128 r1 = _mm_load_ps(&(tm[col_idx + row_idx * col_cta4]));
+      r1 = _mm_mul_ps(r1, w1);
+      r1 = _mm_add_ps(r1, _mm_load_ps(&(dest[col_idx])));
+      _mm_store_ps(&(dest[col_idx]), r1);
+    }
+  }
+}
+
+// This code was hand-optimized by others for 16-byte float vectors.  Exempt it
+// from the rest of the codebase's attempt at vector-size-agnosticism for now.
+typedef union {
+  __m128 vf;
+  float f4[4];
+} __old_univecf_t;
+
+static inline void mult_matrix_dxn_vect_n(const float* mm, const float* vect, uint32_t col_ct, uint32_t row_ct, float* dest) {
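+  // computes dest[i] = dot(row i of mm, vect), four rows per pass, finishing
+  // each row with a scalar horizontal add (see the haddps note below)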
+  const uintptr_t col_cta4 = round_up_pow2(col_ct, 4);
+  uint32_t row_idx = 0;
+  __m128 s1;
+  __m128 s2;
+  __m128 s3;
+  __old_univecf_t uvec;
+  if (row_ct > 3) {
+    const uint32_t row_ctm3 = row_ct - 3;
+    for (; row_idx < row_ctm3; row_idx += 4) {
+      s1 = _mm_setzero_ps();
+      s2 = _mm_setzero_ps();
+      s3 = _mm_setzero_ps();
+      __m128 s4 = _mm_setzero_ps();
+      for (uint32_t col_idx = 0; col_idx < col_ct; col_idx += 4) {
+	const float* mm_ptr = &(mm[row_idx * col_cta4 + col_idx]);
+        const __m128 vv = _mm_load_ps(&(vect[col_idx]));
+        __m128 a1 = _mm_load_ps(mm_ptr);
+        __m128 a2 = _mm_load_ps(&(mm_ptr[col_cta4]));
+        __m128 a3 = _mm_load_ps(&(mm_ptr[2 * col_cta4]));
+        __m128 a4 = _mm_load_ps(&(mm_ptr[3 * col_cta4]));
+	// want to switch this to fused multiply-add...
+        a1 = _mm_mul_ps(a1, vv);
+        a2 = _mm_mul_ps(a2, vv);
+        a3 = _mm_mul_ps(a3, vv);
+        a4 = _mm_mul_ps(a4, vv);
+        s1 = _mm_add_ps(s1, a1);
+        s2 = _mm_add_ps(s2, a2);
+        s3 = _mm_add_ps(s3, a3);
+        s4 = _mm_add_ps(s4, a4);
+      }
+      // refrain from using SSE3 _mm_hadd_ps() for now
+      uvec.vf = s1;
+      *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+      uvec.vf = s2;
+      *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+      uvec.vf = s3;
+      *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+      uvec.vf = s4;
+      *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+    }
+  }
+  s1 = _mm_setzero_ps();
+  s2 = _mm_setzero_ps();
+  s3 = _mm_setzero_ps();
+  switch (row_ct % 4) {
+  case 3:
+    for (uint32_t col_idx = 0; col_idx < col_ct; col_idx += 4) {
+      const float* mm_ptr = &(mm[row_idx * col_cta4 + col_idx]);
+      const __m128 vv = _mm_load_ps(&(vect[col_idx]));
+      __m128 a1 = _mm_load_ps(mm_ptr);
+      __m128 a2 = _mm_load_ps(&(mm_ptr[col_cta4]));
+      __m128 a3 = _mm_load_ps(&(mm_ptr[2 * col_cta4]));
+      a1 = _mm_mul_ps(a1, vv);
+      a2 = _mm_mul_ps(a2, vv);
+      a3 = _mm_mul_ps(a3, vv);
+      s1 = _mm_add_ps(s1, a1);
+      s2 = _mm_add_ps(s2, a2);
+      s3 = _mm_add_ps(s3, a3);
+    }
+    uvec.vf = s1;
+    *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+    uvec.vf = s2;
+    *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+    uvec.vf = s3;
+    *dest = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+    break;
+  case 2:
+    for (uint32_t col_idx = 0; col_idx < col_ct; col_idx += 4) {
+      const float* mm_ptr = &(mm[row_idx * col_cta4 + col_idx]);
+      const __m128 vv = _mm_load_ps(&(vect[col_idx]));
+      __m128 a1 = _mm_load_ps(mm_ptr);
+      __m128 a2 = _mm_load_ps(&(mm_ptr[col_cta4]));
+      a1 = _mm_mul_ps(a1, vv);
+      a2 = _mm_mul_ps(a2, vv);
+      s1 = _mm_add_ps(s1, a1);
+      s2 = _mm_add_ps(s2, a2);
+    }
+    uvec.vf = s1;
+    *dest++ = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+    uvec.vf = s2;
+    *dest = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+    break;
+  case 1:
+    for (uint32_t col_idx = 0; col_idx < col_ct; col_idx += 4) {
+      const __m128 vv = _mm_load_ps(&(vect[col_idx]));
+      __m128 a1 = _mm_load_ps(&(mm[row_idx * col_cta4 + col_idx]));
+      a1 = _mm_mul_ps(a1, vv);
+      s1 = _mm_add_ps(s1, a1);
+    }
+    uvec.vf = s1;
+    *dest = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+    break;
+  }
+}
+
+static inline float triple_product(const float* v1, const float* v2, const float* v3, uint32_t nn) {
+  __m128 sum = _mm_setzero_ps();
+  for (uint32_t uii = 0; uii < nn; uii += 4) {
+    const __m128 aa = _mm_load_ps(&(v1[uii]));
+    const __m128 bb = _mm_load_ps(&(v2[uii]));
+    const __m128 cc = _mm_load_ps(&(v3[uii]));
+    sum = _mm_add_ps(sum, _mm_mul_ps(_mm_mul_ps(aa, bb), cc));
+  }
+  __old_univecf_t uvec;
+  uvec.vf = sum;
+  return uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+}
+
+static inline void compute_two_diag_triple_product(const float* aa, const float* bb, const float* vv, uint32_t nn, float* raa_ptr, float* rab_ptr, float* rbb_ptr) {
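+  // returns raa = sum(aa[i]^2 * vv[i]), rab = sum(aa[i] * bb[i] * vv[i]),
+  // rbb = sum(bb[i]^2 * vv[i])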
+  __m128 saa = _mm_setzero_ps();
+  __m128 sab = _mm_setzero_ps();
+  __m128 sbb = _mm_setzero_ps();
+  for (uint32_t uii = 0; uii < nn; uii += 4) {
+    const __m128 vtmp = _mm_load_ps(&(vv[uii]));
+    const __m128 atmp = _mm_load_ps(&(aa[uii]));
+    const __m128 btmp = _mm_load_ps(&(bb[uii]));
+    const __m128 av = _mm_mul_ps(atmp, vtmp);
+    const __m128 bv = _mm_mul_ps(btmp, vtmp);
+    saa = _mm_add_ps(saa, _mm_mul_ps(atmp, av));
+    sab = _mm_add_ps(sab, _mm_mul_ps(atmp, bv));
+    sbb = _mm_add_ps(sbb, _mm_mul_ps(btmp, bv));
+  }
+  __old_univecf_t uvec;
+  uvec.vf = saa;
+  *raa_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+  uvec.vf = sab;
+  *rab_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+  uvec.vf = sbb;
+  *rbb_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+}
+
+static inline void compute_three_triple_product(const float* bb, const float* a1, const float* a2, const float* a3, const float* vv, uint32_t nn, float* r1_ptr, float* r2_ptr, float* r3_ptr) {
+  __m128 s1 = _mm_setzero_ps();
+  __m128 s2 = _mm_setzero_ps();
+  __m128 s3 = _mm_setzero_ps();
+  for (uint32_t uii = 0; uii < nn; uii += 4) {
+    const __m128 a1tmp = _mm_load_ps(&(a1[uii]));
+    const __m128 a2tmp = _mm_load_ps(&(a2[uii]));
+    const __m128 a3tmp = _mm_load_ps(&(a3[uii]));
+    const __m128 vtmp = _mm_load_ps(&(vv[uii]));
+    __m128 btmp = _mm_load_ps(&(bb[uii]));
+    btmp = _mm_mul_ps(btmp, vtmp);
+    s1 = _mm_add_ps(s1, _mm_mul_ps(a1tmp, btmp));
+    s2 = _mm_add_ps(s2, _mm_mul_ps(a2tmp, btmp));
+    s3 = _mm_add_ps(s3, _mm_mul_ps(a3tmp, btmp));
+  }
+  __old_univecf_t uvec;
+  uvec.vf = s1;
+  *r1_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+  uvec.vf = s2;
+  *r2_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+  uvec.vf = s3;
+  *r3_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+}
+
+static inline void compute_two_plus_one_triple_product(const float* bb, const float* a1, const float* a2, const float* vv, uint32_t nn, float* r1_ptr, float* r2_ptr, float* r3_ptr) {
+  __m128 s1 = _mm_setzero_ps();
+  __m128 s2 = _mm_setzero_ps();
+  __m128 s3 = _mm_setzero_ps();
+  for (uint32_t uii = 0; uii < nn; uii += 4) {
+    const __m128 a1tmp = _mm_load_ps(&(a1[uii]));
+    const __m128 a2tmp = _mm_load_ps(&(a2[uii]));
+    const __m128 btmp = _mm_load_ps(&(bb[uii]));
+    const __m128 vtmp = _mm_load_ps(&(vv[uii]));
+    const __m128 bv = _mm_mul_ps(btmp, vtmp);
+    s1 = _mm_add_ps(s1, _mm_mul_ps(btmp, bv));
+    s2 = _mm_add_ps(s2, _mm_mul_ps(a1tmp, bv));
+    s3 = _mm_add_ps(s3, _mm_mul_ps(a2tmp, bv));
+  }
+  __old_univecf_t uvec;
+  uvec.vf = s1;
+  *r1_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+  uvec.vf = s2;
+  *r2_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+  uvec.vf = s3;
+  *r3_ptr = uvec.f4[0] + uvec.f4[1] + uvec.f4[2] + uvec.f4[3];
+}
+#else // no __LP64__ (and hence, unsafe to assume presence of SSE2)
+static inline void logistic_sse(uint32_t nn, float* vect) {
+  for (uint32_t uii = 0; uii < nn; ++uii) {
+    vect[uii] = 1.0 / (1 + exp(-vect[uii]));
+  }
+}
+
+static inline void compute_v_and_p_minus_y(const float* yy, uint32_t nn, float* pp, float* vv) {
+  for (uint32_t uii = 0; uii < nn; ++uii) {
+    vv[uii] = pp[uii] * (1.0 - pp[uii]);
+    pp[uii] -= yy[uii];
+  }
+}
+
+static inline void compute_v(const float* pp, uint32_t nn, float* vv) {
+  for (uint32_t uii = 0; uii < nn; ++uii) {
+    vv[uii] = pp[uii] * (1.0 - pp[uii]);
+  }
+}
+
+static inline void mult_tmatrix_nxd_vect_d(const float* tm, const float* vect, uint32_t col_ct, uint32_t row_ct, float* dest) {
+  const uintptr_t col_cta4 = round_up_pow2(col_ct, 4);
+  fill_float_zero(col_ct, dest);
+  for (uint32_t row_idx = 0; row_idx < row_ct; ++row_idx) {
+    const float vect_val = vect[row_idx];
+    const float* tm_ptr = &(tm[row_idx * col_cta4]);
+    for (uint32_t col_idx = 0; col_idx < col_ct; ++col_idx) {
+      dest[col_idx] += (*tm_ptr++) * vect_val;
+    }
+  }
+}
+
+static inline void mult_matrix_dxn_vect_n(const float* mm, const float* vect, uint32_t col_ct, uint32_t row_ct, float* dest) {
+  const uintptr_t col_cta4 = round_up_pow2(col_ct, 4);
+  for (uint32_t row_idx = 0; row_idx < row_ct; ++row_idx) {
+    float fxx = 0.0;
+    const float* vect_ptr = vect;
+    const float* mm_ptr = &(mm[row_idx * col_cta4]);
+    for (uint32_t col_idx = 0; col_idx < col_ct; ++col_idx) {
+      fxx += (*mm_ptr++) * (*vect_ptr++);
+    }
+    *dest++ = fxx;
+  }
+}
+
+static inline float triple_product(const float* v1, const float* v2, const float* v3, uint32_t nn) {
+  float fxx = 0.0;
+  for (uint32_t uii = 0; uii < nn; ++uii) {
+    fxx += (*v1++) * (*v2++) * (*v3++);
+  }
+  return fxx;
+}
+
+static inline void compute_two_diag_triple_product(const float* aa, const float* bb, const float* vv, uint32_t nn, float* raa_ptr, float* rab_ptr, float* rbb_ptr) {
+  float raa = 0.0;
+  float rab = 0.0;
+  float rbb = 0.0;
+  for (uint32_t uii = 0; uii < nn; ++uii) {
+    const float fxx = (*aa++);
+    const float fyy = (*bb++);
+    float fzz = (*vv++);
+    raa += fxx * fxx * fzz;
+    fzz *= fyy;
+    rab += fxx * fzz;
+    rbb += fyy * fzz;
+  }
+  *raa_ptr = raa;
+  *rab_ptr = rab;
+  *rbb_ptr = rbb;
+}
+
+static inline void compute_three_triple_product(const float* bb, const float* a1, const float* a2, const float* a3, const float* vv, uint32_t nn, float* r1_ptr, float* r2_ptr, float* r3_ptr) {
+  float r1 = 0.0;
+  float r2 = 0.0;
+  float r3 = 0.0;
+  for (uint32_t uii = 0; uii < nn; ++uii) {
+    const float fxx = (*bb++) * (*vv++);
+    r1 += (*a1++) * fxx;
+    r2 += (*a2++) * fxx;
+    r3 += (*a3++) * fxx;
+  }
+  *r1_ptr = r1;
+  *r2_ptr = r2;
+  *r3_ptr = r3;
+}
+
+static inline void compute_two_plus_one_triple_product(const float* bb, const float* a1, const float* a2, const float* vv, uint32_t nn, float* r1_ptr, float* r2_ptr, float* r3_ptr) {
+  float r1 = 0.0;
+  float r2 = 0.0;
+  float r3 = 0.0;
+  for (uint32_t uii = 0; uii < nn; ++uii) {
+    const float fxx = (*bb++);
+    const float fyy = fxx * (*vv++);
+    r1 += fxx * fyy;
+    r2 += (*a1++) * fyy;
+    r3 += (*a2++) * fyy;
+  }
+  *r1_ptr = r1;
+  *r2_ptr = r2;
+  *r3_ptr = r3;
+}
+#endif
+
+double compute_loglik(const float* yy, const float* pp, uint32_t sample_ct) {
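+  // Bernoulli log-likelihood: sum_i y_i * log(p_i) + (1 - y_i) * log(1 - p_i)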
+  double loglik = 0.0;
+  for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+    const float new_pi = pp[sample_idx];
+    loglik += (yy[sample_idx])? log(new_pi) : log(1.0 - new_pi);
+  }
+  return loglik;
+}
+
+static inline void compute_hessian(const float* mm, const float* vv, uint32_t col_ct, uint32_t row_ct, float* dest) {
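+  // computes the lower triangle of (mm) diag(vv) (mm)^T, three rows at a
+  // time, via the triple-product helpers above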
+  const uintptr_t col_cta4 = round_up_pow2(col_ct, 4);
+  const uintptr_t row_cta4 = round_up_pow2(row_ct, 4);
+  const uintptr_t row_cta4p1 = row_cta4 + 1;
+  if (row_ct > 3) {
+    const uint32_t row_ctm3 = row_ct - 3;
+    for (uint32_t row_idx = 0; row_idx < row_ctm3; row_idx += 3) {
+      const float* mm_cur = &(mm[row_idx * col_cta4]);
+      compute_two_diag_triple_product(mm_cur, &(mm_cur[col_cta4]), vv, col_ct, &(dest[row_idx * row_cta4p1]), &(dest[(row_idx + 1) * row_cta4p1 - 1]), &(dest[(row_idx + 1) * row_cta4p1]));
+      compute_two_plus_one_triple_product(&(mm_cur[2 * col_cta4]), &(mm_cur[col_cta4]), mm_cur, vv, col_ct, &(dest[(row_idx + 2) * row_cta4p1]), &(dest[(row_idx + 2) * row_cta4p1 - 1]), &(dest[(row_idx + 2) * row_cta4p1 - 2]));
+      for (uint32_t row_idx2 = row_idx + 3; row_idx2 < row_ct; row_idx2++) {
+        compute_three_triple_product(&(mm[row_idx2 * col_cta4]), mm_cur, &(mm_cur[col_cta4]), &(mm_cur[2 * col_cta4]), vv, col_ct, &(dest[row_idx2 * row_cta4 + row_idx]), &(dest[row_idx2 * row_cta4 + row_idx + 1]), &(dest[row_idx2 * row_cta4 + row_idx + 2]));
+      }
+    }
+  }
+  switch (row_ct % 3) {
+  case 0:
+    compute_two_plus_one_triple_product(&(mm[(row_ct - 3) * col_cta4]), &(mm[(row_ct - 2) * col_cta4]), &(mm[(row_ct - 1) * col_cta4]), vv, col_ct, &(dest[(row_ct - 3) * row_cta4p1]), &(dest[(row_ct - 2) * row_cta4p1 - 1]), &(dest[(row_ct - 1) * row_cta4p1 - 2]));
+    // fall through
+  case 2:
+    compute_two_diag_triple_product(&(mm[(row_ct - 2) * col_cta4]), &(mm[(row_ct - 1) * col_cta4]), vv, col_ct, &(dest[(row_ct - 2) * row_cta4p1]), &(dest[(row_ct - 1) * row_cta4p1 - 1]), &(dest[(row_ct - 1) * row_cta4p1]));
+    break;
+  case 1:
+    dest[(row_ct - 1) * row_cta4p1] = triple_product(&(mm[(row_ct - 1) * col_cta4]), &(mm[(row_ct - 1) * col_cta4]), vv, col_ct);
+  }
+}
+
+void cholesky_decomposition(const float* aa, uint32_t predictor_ct, float* ll) {
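+  // standard Cholesky-Banachiewicz factorization A = L L^T; a negative pivot
+  // (possible when the hessian is near-singular under float rounding) is
+  // replaced with 1e-6 instead of aborting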
+  const uintptr_t predictor_cta4 = round_up_pow2(predictor_ct, 4);
+  const uintptr_t predictor_cta4p1 = predictor_cta4 + 1;
+  for (uint32_t row_idx = 0; row_idx < predictor_ct; ++row_idx) {
+    float fxx = aa[row_idx * predictor_cta4p1];
+    float* ll_row_iter = &(ll[row_idx * predictor_cta4]);
+    for (uint32_t col_idx = 0; col_idx < row_idx; ++col_idx) {
+      const float fyy = (*ll_row_iter++);
+      fxx -= fyy * fyy;
+    }
+    float fyy;
+    if (fxx >= 0.0) {
+      fyy = sqrtf(fxx);
+    } else {
+      fyy = 1e-6;
+    }
+    ll[row_idx * predictor_cta4p1] = fyy;
+    fyy = 1.0 / fyy; // now 1.0 / L[j][j]
+    for (uint32_t row_idx2 = row_idx + 1; row_idx2 < predictor_ct; ++row_idx2) {
+      float fxx2 = aa[row_idx2 * predictor_cta4 + row_idx];
+      float* ll_row_iter2 = &(ll[row_idx * predictor_cta4]);
+      float* ll_row_iter3 = &(ll[row_idx2 * predictor_cta4]);
+      for (uint32_t col_idx = 0; col_idx < row_idx; ++col_idx) {
+        fxx2 -= (*ll_row_iter2++) * (*ll_row_iter3++);
+      }
+      ll[row_idx2 * predictor_cta4 + row_idx] = fxx2 * fyy;
+    }
+  }
+}
+
+void solve_linear_system(const float* ll, const float* yy, uint32_t predictor_ct, float* xx) {
+  // Finds x such that y = L(L^T)x, via forward and backward substitution
+  //
+  // might want to use this in NOLAPACK case only, since we can now produce
+  // 32-bit Linux builds with statically linked LAPACK
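+  //
+  // Forward substitution solves L w = y (storing w in xx[]); backward
+  // substitution then solves (L^T) x = w in place.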
+  const uintptr_t predictor_cta4 = round_up_pow2(predictor_ct, 4);
+  for (uint32_t row_idx = 0; row_idx < predictor_ct; ++row_idx) {
+    float fxx = yy[row_idx];
+    const float* ll_row_iter = &(ll[row_idx * predictor_cta4]);
+    float* xx_iter = xx;
+    for (uint32_t col_idx = 0; col_idx < row_idx; ++col_idx) {
+      fxx -= (*ll_row_iter++) * (*xx_iter++);
+    }
+    *xx_iter = fxx / (*ll_row_iter);
+  }
+  for (uint32_t col_idx = predictor_ct; col_idx;) {
+    float fxx = xx[--col_idx];
+    float* xx_iter = &(xx[predictor_ct - 1]);
+    for (uint32_t row_idx = predictor_ct - 1; row_idx > col_idx; --row_idx) {
+      fxx -= ll[row_idx * predictor_cta4 + col_idx] * (*xx_iter--);
+    }
+    *xx_iter = fxx / ll[col_idx * (predictor_cta4 + 1)];
+  }
+}
+
+boolerr_t logistic_regression(const float* yy, const float* xx, uint32_t sample_ct, uint32_t predictor_ct, float* coef, float* ll, float* pp, float* vv, float* hh, float* grad, float* dcoef) {
+  // Similar to first part of logistic.cpp fitLM(), but incorporates changes
+  // from Pascal Pons et al.'s TopCoder code.
+  //
+  // Preallocated buffers (initial contents irrelevant):
+  // vv    = sample variance buffer
+  // hh    = hessian matrix buffer, predictor_ct^2, rows 16-byte aligned
+  // grad  = gradient buffer Y[] (length predictor_ct)
+  // dcoef = current coefficient change buffer (length predictor_ct)
+  // 
+  // Inputs:
+  // xx    = covariate (and usually genotype) matrix, covariate-major, rows are
+  //         16-byte aligned, trailing row elements must be zeroed out
+  // yy    = case/control phenotype
+  //
+  // Input/output:
+  // coef  = starting point, overwritten with logistic regression betas.  Must
+  //         be 16-byte aligned.
+  //
+  // Outputs:
+  // ll    = cholesky decomposition matrix, predictor_ct^2, rows 16-byte aligned
+  // pp    = final likelihoods minus Y[] (not currently used by callers)
+  //
+  // Returns 0 on success, 1 on convergence failure.
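+  //
+  // Each iteration is one Newton step: with V[i] = P[i] * (1 - P[i]), solve
+  //   (X diag(V) X^T) d = X (P - Y)
+  // via Cholesky decomposition, then update coef -= d.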
+  const uintptr_t predictor_cta4 = round_up_pow2(predictor_ct, 4);
+  uint32_t iteration = 0;
+  float min_delta_coef = 1e9;
+
+  fill_float_zero(predictor_ct * predictor_cta4, ll);
+  while (1) {
+    ++iteration;
+
+    // P[i] = \sum_j coef[j] * X[i][j];
+    mult_tmatrix_nxd_vect_d(xx, coef, sample_ct, predictor_ct, pp);
+
+    // P[i] = 1 / (1 + exp(-P[i]));
+    logistic_sse(sample_ct, pp);
+
+    // V[i] = P[i] * (1 - P[i]);
+    // P[i] -= Y[i];
+    compute_v_and_p_minus_y(yy, sample_ct, pp, vv);
+
+    compute_hessian(xx, vv, sample_ct, predictor_ct, hh);
+
+    mult_matrix_dxn_vect_n(xx, pp, sample_ct, predictor_ct, grad);
+
+    cholesky_decomposition(hh, predictor_ct, ll);
+
+    // fill_float_zero(predictor_ct, dcoef);
+    solve_linear_system(ll, grad, predictor_ct, dcoef);
+
+    float delta_coef = 0.0;
+    for (uint32_t pred_idx = 0; pred_idx < predictor_ct; pred_idx++) {
+      const float cur_dcoef = dcoef[pred_idx];
+      delta_coef += fabsf(cur_dcoef);
+      coef[pred_idx] -= cur_dcoef;
+    }
+    if (delta_coef < min_delta_coef) {
+      min_delta_coef = delta_coef;
+    }
+    // (delta_coef != delta_coef catches NaN)
+    if (delta_coef != delta_coef) {
+      return 1;
+    }
+    if (iteration > 4) {
+      if (((delta_coef > 20.0) && (delta_coef > 2 * min_delta_coef)) || ((iteration >= 8) && fabsf(1.0f - delta_coef) < 1e-3)) {
+	return 1;
+      }
+      if (iteration >= 15) {
+	return 0;
+      }
+    }
+    // Pons reported that 1.1e-3 was dangerous, so I agree with the decision to
+    // tighten this threshold from 1e-3 to 1e-4.
+    if (delta_coef < 1e-4) {
+      return 0;
+    }
+  }
+}
+
+boolerr_t firth_regression(const float* yy, const float* xx, uint32_t sample_ct, uint32_t predictor_ct, float* coef, float* hh, matrix_finvert_buf1_t* inv_1d_buf, float* flt_2d_buf, float* pp, float* vv, float* grad, float* dcoef, float* ww, float* tmpnxk_buf) {
+  // This is a port of Georg Heinze's logistf R function, adapted to use many
+  // of plink 1.9's optimizations; see
+  //   http://cemsiis.meduniwien.ac.at/en/kb/science-research/software/statistical-software/fllogistf/
+  //
+  // Preallocated buffers (initial contents irrelevant):
+  // inv_1d_buf, flt_2d_buf = for float32 matrix inversion
+  // pp    = likelihoods minus Y[] (not currently used by callers)
+  // vv    = sample variance buffer
+  // grad  = gradient buffer (length predictor_ct)
+  // dcoef = current coefficient change buffer (length predictor_ct)
+  // ww    = Firth-adjusted scores, sample_ct
+  // 
+  // Inputs:
+  // xx    = covariate (and usually genotype) matrix, covariate-major, rows are
+  //         16-byte aligned, trailing row elements must be zeroed out
+  // yy    = case/control phenotype
+  //
+  // Input/output:
+  // coef  = starting point, overwritten with logistic regression betas.  Must
+  //         be 16-byte aligned.
+  //
+  // Outputs:
+  // hh    = variance-covariance matrix buffer, predictor_ct^2, rows 16-byte
+  //         aligned.  (spends some time as pre-inversion Hessian matrix too)
+  //
+  // Returns 0 on success, 1 on convergence failure.
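+  //
+  // Firth's correction maximizes the penalized log-likelihood
+  //   loglik(coef) + 0.5 * log(det(X diag(V) X^T)),
+  // hence the "loglik += 0.5 * log(dethh)" updates below.  The modified score
+  // in ww[] is (Y[i] - P[i]) + h_i * (0.5 - P[i]), where h_i is the i-th
+  // diagonal element of the hat matrix.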
+  const uintptr_t predictor_cta4 = round_up_pow2(predictor_ct, 4);
+  const uintptr_t sample_cta4 = round_up_pow2(sample_ct, 4);
+  uint32_t is_last_iter = 0;
+  
+  // pull these out of the start of the loop, since they happen again in the
+  // log-likelihood update
+  // P[i] = \sum_j coef[j] * X[i][j];
+  mult_tmatrix_nxd_vect_d(xx, coef, sample_ct, predictor_ct, pp);
+  // P[i] = 1 / (1 + exp(-P[i]));
+  logistic_sse(sample_ct, pp);
+  // V[i] = P[i] * (1 - P[i]);
+  compute_v(pp, sample_ct, vv);
+  // P[i] -= Y[i] NOT done here
+
+  // hessian = X diag(V) X'
+  // note that only lower triangle is filled here
+  compute_hessian(xx, vv, sample_ct, predictor_ct, hh);
+
+  for (uint32_t uii = 0; uii < predictor_ct; ++uii) {
+    for (uint32_t ujj = uii + 1; ujj < predictor_ct; ++ujj) {
+      hh[uii * predictor_cta4 + ujj] = hh[ujj * predictor_cta4 + uii];
+    }
+  }
+  // we shouldn't need to compute the log directly, since underflow <->
+  // regression failure, right?  check this.
+  float dethh;
+  if (invert_fmatrix_first_half(predictor_ct, predictor_cta4, hh, &dethh, inv_1d_buf, flt_2d_buf)) {
+    return 1;
+  }
+  /*
+  if (sample_ct < sample_cta4) {
+    // trailing Y[] values must be zero
+    fill_float_zero(sample_cta4 - sample_ct, &(pp[sample_ct]));
+  }
+  */
+  double loglik = compute_loglik(yy, pp, sample_ct);
+  // printf("loglik: %g\n", loglik);
+  loglik += 0.5 * log(dethh);
+
+  uint32_t iter_idx = 0;
+  // start with 80% of logistf convergence defaults (some reduction is
+  // appropriate to be consistent with single-precision arithmetic); may tune
+  // later.
+  // see also the hs_bail condition: if we ever try all five halfsteps, when
+  // dcoef_max and grad_max aren't that far from the normal convergence
+  // conditions, it's probably pointless to continue with single-precision
+  // arithmetic.  (possible todo: use a fully-double-precision routine to
+  // finish the job when that happens.)
+  const uint32_t max_iter = 20;
+  const float gconv = 0.0001;
+  const float xconv = 0.0001;
+  const double lconv = 0.0001;
+  uint32_t hs_bail = 0;
+  while (1) {
+    invert_fmatrix_second_half(predictor_ct, predictor_cta4, hh, inv_1d_buf, flt_2d_buf);
+    if (is_last_iter) {
+      return 0;
+    }
+    col_major_fmatrix_multiply_strided(xx, hh, sample_ct, sample_cta4, predictor_ct, predictor_cta4, predictor_ct, sample_cta4, tmpnxk_buf);
+    // tmpNxK, interpreted as column-major, is sample_ct x predictor_ct
+    // X, interpreted as column-major, is also sample_ct x predictor_ct
+    // Hdiag[i] = V[i] (\sum_j tmpNxK[i][j] X[i][j])
+    // (todo: vectorize this)
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+      float dotprod = 0.0;
+      const float* xx_row = &(xx[sample_idx]);
+      const float* tmpnxk_row = &(tmpnxk_buf[sample_idx]);
+      for (uint32_t pred_uidx = 0; pred_uidx < predictor_ct; ++pred_uidx) {
+	dotprod += xx_row[pred_uidx * sample_cta4] * tmpnxk_row[pred_uidx * sample_cta4];
+      }
+      const float cur_weight = vv[sample_idx];
+      const float cur_pi = pp[sample_idx];
+      ww[sample_idx] = (yy[sample_idx] - cur_pi) + (0.5 - cur_pi) * cur_weight * dotprod;
+    }
+    
+    // gradient (Ustar in logistf) = X' W
+    mult_matrix_dxn_vect_n(xx, ww, sample_ct, predictor_ct, grad);
+    float grad_max = 0.0;
+    for (uint32_t pred_uidx = 0; pred_uidx < predictor_ct; ++pred_uidx) {
+      const float abs_grad_cur = fabsf(grad[pred_uidx]);
+      if (abs_grad_cur > grad_max) {
+	grad_max = abs_grad_cur;
+      }
+    }
+
+    // dcoef := hh * grad (note that hh is inverted already)
+    mult_matrix_dxn_vect_n(hh, grad, predictor_ct, predictor_ct, dcoef);
+
+    float dcoef_max = 0.0;
+    for (uint32_t pred_uidx = 0; pred_uidx < predictor_ct; ++pred_uidx) {
+      const float abs_dcoef_cur = fabsf(dcoef[pred_uidx]);
+      if (abs_dcoef_cur > dcoef_max) {
+	dcoef_max = abs_dcoef_cur;
+      }
+    }
+    const float maxstep = 5.0;
+    if (dcoef_max > maxstep) {
+      const float scaling_factor = maxstep / dcoef_max;
+      for (uint32_t pred_uidx = 0; pred_uidx < predictor_ct; ++pred_uidx) {
+	dcoef[pred_uidx] *= scaling_factor;
+      }
+      dcoef_max = maxstep;
+    }
+    for (uint32_t pred_uidx = 0; pred_uidx < predictor_ct; ++pred_uidx) {
+      coef[pred_uidx] += dcoef[pred_uidx];
+    }
+    const uint32_t delta_and_grad_converged = (dcoef_max <= xconv) && (grad_max < gconv);
+    const double loglik_old = loglik;
+    double loglik_thresh = loglik_old;
+    if (delta_and_grad_converged) {
+      // on the last iteration, we would frequently try all 5 halfsteps when
+      // the log-likelihood change was effectively random due to floating point
+      // error.  detect this and exit the loop earlier.
+      loglik_thresh -= 0.999999 * lconv;
+    }
+
+    uint32_t maxhs = 5;
+    uint32_t halfstep_idx = 1;
+    while (1) {
+      mult_tmatrix_nxd_vect_d(xx, coef, sample_ct, predictor_ct, pp);
+      logistic_sse(sample_ct, pp);
+      loglik = compute_loglik(yy, pp, sample_ct);
+      compute_v(pp, sample_ct, vv);
+      compute_hessian(xx, vv, sample_ct, predictor_ct, hh);
+      for (uint32_t uii = 0; uii < predictor_ct; ++uii) {
+	for (uint32_t ujj = uii + 1; ujj < predictor_ct; ++ujj) {
+	  hh[uii * predictor_cta4 + ujj] = hh[ujj * predictor_cta4 + uii];
+	}
+      }
+      if (invert_fmatrix_first_half(predictor_ct, predictor_cta4, hh, &dethh, inv_1d_buf, flt_2d_buf)) {
+	return 1;
+      }
+      loglik += 0.5 * log(dethh);
+      if (halfstep_idx > maxhs) {
+	break;
+      }
+      if (loglik >= loglik_thresh) {
+	if (loglik >= loglik_old) {
+	  break;
+	}
+	maxhs = halfstep_idx;
+      } else if (halfstep_idx == maxhs) {
+	if ((dcoef_max < 0.001) && (grad_max < 0.05) && (loglik >= loglik_old - lconv)) {
+	  // we've converged as much as we can with single-precision
+	  // arithmetic, and now we're flailing around.  don't even take the
+	  // 2^{-maxhs} step, undo it all and bail.
+	  // (0.001 and 0.05 constants can obviously be tuned; they were chosen
+	  // based on a test 500k sample/5 covariate regression.)
+	  --halfstep_idx;
+	  --maxhs;
+	  hs_bail = 1;
+	}
+      }
+      const float multiplier = exp2f(-((int32_t)halfstep_idx));
+      for (uint32_t pred_uidx = 0; pred_uidx < predictor_ct; ++pred_uidx) {
+	coef[pred_uidx] -= dcoef[pred_uidx] * multiplier;
+      }
+      ++halfstep_idx;
+    }
+    // printf("%.9g %.9g %g %g\n", loglik, loglik_old, dcoef_max, grad_max);
+    const double loglik_change = loglik - loglik_old;
+    ++iter_idx;
+    is_last_iter = (iter_idx == max_iter) || ((fabs(loglik_change) <= lconv) && (delta_and_grad_converged || hs_bail));
+  }
+}
+
+uintptr_t get_logistic_workspace_size(uint32_t sample_ct, uint32_t predictor_ct, uint32_t constraint_ct, uint32_t genof_buffer_needed, uint32_t is_sometimes_firth) {
+  // sample_cta4 * predictor_ct < 2^31, and sample_ct >= predictor_ct, so no
+  // overflows
+  // could round everything up to multiples of 16 instead of 64
+  const uint32_t sample_cta4 = round_up_pow2(sample_ct, 4);
+  const uint32_t predictor_cta4 = round_up_pow2(predictor_ct, 4);
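+  // "cta4" = count rounded up to a multiple of 4; since floats are 4 bytes,
+  // this keeps each matrix row 16-byte aligned for the SSE kernels
+  // (e.g. round_up_pow2(1003, 4) == 1004).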
+  // sample_nm, pheno_cc_nm, male_nm = sample_ctl words
+  uintptr_t workspace_size = 3 * round_up_pow2(BITCT_TO_WORDCT(sample_ct) * sizeof(intptr_t), kCacheline);
+  
+  // yy = sample_cta4 floats
+  workspace_size += round_up_pow2(sample_cta4 * sizeof(float), kCacheline);
+  
+  // xx = (predictor_ct + genof_buffer_needed) * sample_cta4 floats
+  workspace_size += round_up_pow2((predictor_ct + genof_buffer_needed) * sample_cta4 * sizeof(float), kCacheline);
+    
+  // hh = predictor_ct * predictor_cta4 floats
+  workspace_size += round_up_pow2(predictor_ct * predictor_cta4 * sizeof(float), kCacheline);
+
+  // pp, vv = sample_cta4 floats
+  workspace_size += 2 * round_up_pow2(sample_cta4 * sizeof(float), kCacheline);
+
+  // coef, grad, dcoef = predictor_cta4 floats
+  workspace_size += 3 * round_up_pow2(predictor_cta4 * sizeof(float), kCacheline);
+
+  // ll = predictor_ct * predictor_cta4 floats
+  // (technically not needed in pure-Firth case)
+  workspace_size += round_up_pow2(predictor_ct * predictor_cta4 * sizeof(float), kCacheline);
+
+  if (is_sometimes_firth || constraint_ct) {
+    // inv_1d_buf = predictor_ct * kMatrixFinvertBuf1CheckedAlloc bytes
+    workspace_size += round_up_pow2(predictor_ct * kMatrixFinvertBuf1CheckedAlloc, kCacheline);
+
+    // flt_2d_buf = predictor_ct * predictor_cta4 floats
+    workspace_size += round_up_pow2(predictor_ct * predictor_cta4 * sizeof(float), kCacheline);
+
+    if (is_sometimes_firth) {
+      // ww = sample_cta4 floats
+      workspace_size += round_up_pow2(sample_cta4 * sizeof(float), kCacheline);
+
+      // tmpnxk_buf = predictor_ct * sample_cta4 floats
+      workspace_size += round_up_pow2(predictor_ct * sample_cta4 * sizeof(float), kCacheline);
+    }
+    if (constraint_ct) {
+      // tmphxs_buf, h_transpose_buf = constraint_ct * predictor_cta4 floats
+      workspace_size += 2 * round_up_pow2(constraint_ct * predictor_cta4 * sizeof(float), kCacheline);
+      
+      // inner_buf = constraint_ct * constraint_ct
+      workspace_size += round_up_pow2(constraint_ct * constraint_ct * sizeof(float), kCacheline);
+    }
+  }
+  return workspace_size;  
+}
+
+
+typedef struct {
+  // double beta;
+  //   odds ratio = exp(beta)
+  // double se;
+  //   zval = beta / se
+  //   width of asymptotic CI (beta units) = ci_zt * se
+  //   T-statistic = zval
+  //   pval = chiprob_p(zval * zval, 1);
+  
+  uint32_t sample_obs_ct;
+  
+  uint32_t allele_obs_ct;
+  double alt_dosage;
+
+  uint32_t firth_fallback;
+  uint32_t case_allele_obs_ct;
+  double alt_case_dosage;
+
+  double mach_r2;
+} logistic_aux_result_t;
+
+typedef struct {
+  // double beta;
+  // double se;
+  //   zval = beta / se
+  //   width of asymptotic CI = ci_zt * se
+  //   T-statistic = zval
+  //   pval = calc_tprob(zval, sample_obs_ct - predictor_ct)
+
+  uint32_t sample_obs_ct;
+
+  uint32_t allele_obs_ct;
+  double alt_dosage;
+
+  double mach_r2;
+} linear_aux_result_t;
+
+// multithread globals
+static pgen_reader_t** g_pgr_ptrs = nullptr;
+static uintptr_t** g_genovecs = nullptr;
+static uintptr_t** g_dosage_presents = nullptr;
+static dosage_t** g_dosage_val_bufs = nullptr;
+static unsigned char** g_workspace_bufs = nullptr;
+static uint32_t* g_read_variant_uidx_starts = nullptr;
+
+static uintptr_t* g_sample_include = nullptr;
+static const uintptr_t* g_sample_include_x = nullptr;
+static const uintptr_t* g_sample_include_y = nullptr;
+static uint32_t* g_sample_include_cumulative_popcounts = nullptr;
+static uint32_t* g_sample_include_x_cumulative_popcounts = nullptr;
+static uint32_t* g_sample_include_y_cumulative_popcounts = nullptr;
+static const uintptr_t* g_sex_male_collapsed = nullptr;
+static const uintptr_t* g_pheno_cc = nullptr;
+static uintptr_t* g_pheno_x_cc = nullptr;
+static uintptr_t* g_pheno_y_cc = nullptr;
+static const float* g_pheno_f = nullptr;
+static float* g_pheno_x_f = nullptr;
+static float* g_pheno_y_f = nullptr;
+static uintptr_t* g_parameter_subset = nullptr;
+static uintptr_t* g_parameter_subset_x = nullptr;
+static uintptr_t* g_parameter_subset_y = nullptr;
+static const float* g_covars_cmaj_f = nullptr;
+static float* g_covars_cmaj_x_f = nullptr;
+static float* g_covars_cmaj_y_f = nullptr;
+static float* g_local_covars_vcmaj_f[2] = {nullptr, nullptr};
+static const double* g_pheno_d = nullptr;
+static double* g_pheno_x_d = nullptr;
+static double* g_pheno_y_d = nullptr;
+static const double* g_covars_cmaj_d = nullptr;
+static double* g_covars_cmaj_x_d = nullptr;
+static double* g_covars_cmaj_y_d = nullptr;
+static double* g_local_covars_vcmaj_d[2] = {nullptr, nullptr};
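+// the two-element arrays above are double buffers: one half is filled with
+// the next variant block's local covariates while the worker threads read
+// the other half (see the `parity` flip in glm_logistic_thread()).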
+// static const double* g_covar_dotprod_d = nullptr;
+// static double* g_covar_dotprod_x_d = nullptr;
+// static double* g_covar_dotprod_y_d = nullptr;
+static const uintptr_t* g_variant_include = nullptr;
+static const chr_info_t* g_cip = nullptr;
+static const uintptr_t* g_variant_allele_idxs = nullptr;
+static uint32_t* g_subset_chr_fo_vidx_start = nullptr;
+
+// static uint32_t g_raw_sample_ct = 0;
+static uint32_t g_sample_ct = 0;
+static uint32_t g_sample_ct_x = 0;
+static uint32_t g_sample_ct_y = 0;
+
+// chrX male count.  always equal to sample_ct_y on chrY, irrelevant elsewhere.
+// (commented out since current algorithms always need nonmissing-male count)
+// static uint32_t g_male_ct = 0;
+
+static uint32_t g_covar_ct = 0;
+static uint32_t g_local_covar_ct = 0;
+static uint32_t g_covar_ct_x = 0;
+static uint32_t g_covar_ct_y = 0;
+
+static double* g_constraints_con_major = nullptr;
+static double* g_constraints_con_major_x = nullptr;
+static double* g_constraints_con_major_y = nullptr;
+static float* g_constraints_con_major_f = nullptr;
+static float* g_constraints_con_major_x_f = nullptr;
+static float* g_constraints_con_major_y_f = nullptr;
+static uint32_t g_constraint_ct = 0;
+static uint32_t g_constraint_ct_x = 0;
+static uint32_t g_constraint_ct_y = 0;
+
+static uint32_t g_variant_ct = 0;
+static uint32_t g_calc_thread_ct = 0;
+static uint32_t g_cur_block_variant_ct = 0;
+static glm_flags_t g_glm_flags = kfGlm0;
+static uint32_t g_is_xchr_model_1 = 0;
+static pglerr_t g_error_ret = kPglRetSuccess;
+
+static logistic_aux_result_t* g_logistic_block_aux = nullptr;
+static linear_aux_result_t* g_linear_block_aux = nullptr;
+
+// separate from block_aux, since we need up to g_max_reported_test_ct pairs of
+// values per variant
+static double* g_block_beta_se = nullptr;
+
+static uintptr_t g_max_reported_test_ct = 0;
+
+THREAD_FUNC_DECL glm_logistic_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  pgen_reader_t* pgrp = g_pgr_ptrs[tidx];
+  uintptr_t* genovec = g_genovecs[tidx];
+  uintptr_t* dosage_present = nullptr;
+  dosage_t* dosage_vals = nullptr;
+  if (g_dosage_presents) {
+    dosage_present = g_dosage_presents[tidx];
+    dosage_vals = g_dosage_val_bufs[tidx];
+  }
+  unsigned char* workspace_buf = g_workspace_bufs[tidx];
+  const uintptr_t* variant_include = g_variant_include;
+  const uintptr_t* sex_male_collapsed = g_sex_male_collapsed;
+  const chr_info_t* cip = g_cip;
+  const uint32_t* subset_chr_fo_vidx_start = g_subset_chr_fo_vidx_start;
+  // const uint32_t raw_sample_ct = g_raw_sample_ct;
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  const glm_flags_t glm_flags = g_glm_flags;
+  const uint32_t add_interactions = (glm_flags / kfGlmInteraction) & 1;
+  const uint32_t hide_covar = (glm_flags / kfGlmHideCovar) & 1;
+  const uint32_t include_intercept = (glm_flags / kfGlmIntercept) & 1;
+  const uint32_t is_sometimes_firth = (glm_flags & (kfGlmFirthFallback | kfGlmFirth))? 1 : 0;
+  const uint32_t is_always_firth = (glm_flags / kfGlmFirth) & 1;
+  const uint32_t model_dominant = (glm_flags / kfGlmDominant) & 1;
+  const uint32_t model_recessive = (glm_flags / kfGlmRecessive) & 1;
+  const uint32_t joint_genotypic = (glm_flags / kfGlmGenotypic) & 1;
+  const uint32_t joint_hethom = (glm_flags / kfGlmHethom) & 1;
+  const uint32_t domdev_present = joint_genotypic || joint_hethom;
+  const uint32_t domdev_present_p1 = domdev_present + 1;  
+  const uint32_t reported_pred_uidx_start = 1 - include_intercept;
+  const int32_t x_code = cip->xymt_codes[kChrOffsetX];
+  const int32_t y_code = cip->xymt_codes[kChrOffsetY];
+  const uint32_t is_xchr_model_1 = g_is_xchr_model_1;
+  const uintptr_t max_reported_test_ct = g_max_reported_test_ct;
+  const uintptr_t local_covar_ct = g_local_covar_ct;
+  uintptr_t max_sample_ct = MAXV(g_sample_ct, g_sample_ct_x);
+  if (max_sample_ct < g_sample_ct_y) {
+    max_sample_ct = g_sample_ct_y;
+  }
+  uint32_t variant_idx_offset = 0;
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uintptr_t cur_block_variant_ct = g_cur_block_variant_ct;
+    uint32_t variant_bidx = (tidx * cur_block_variant_ct) / calc_thread_ct;
+    const uint32_t variant_bidx_end = ((tidx + 1) * cur_block_variant_ct) / calc_thread_ct;
+    uint32_t variant_uidx = g_read_variant_uidx_starts[tidx];
+    double* beta_se_iter = &(g_block_beta_se[2 * max_reported_test_ct * variant_bidx]);
+    logistic_aux_result_t* block_aux_iter = &(g_logistic_block_aux[variant_bidx]);
+    const float* local_covars_iter = nullptr;
+    if (local_covar_ct) {
+      // &(nullptr[0]) is okay in C++, but undefined in C
+      local_covars_iter = &(g_local_covars_vcmaj_f[parity][variant_bidx * max_sample_ct * local_covar_ct]);
+    }
+    while (variant_bidx < variant_bidx_end) {
+      const uint32_t variant_idx = variant_bidx + variant_idx_offset;
+      const uint32_t chr_fo_idx = uint32arr_greater_than(&(subset_chr_fo_vidx_start[1]), cip->chr_ct, variant_idx + 1);
+      const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+      uint32_t cur_variant_bidx_end = subset_chr_fo_vidx_start[chr_fo_idx + 1] - variant_idx_offset;
+      if (cur_variant_bidx_end > variant_bidx_end) {
+	cur_variant_bidx_end = variant_bidx_end;
+      }
+      const uint32_t is_x = (chr_idx == x_code);
+      const uint32_t is_y = (chr_idx == y_code);
+      const uint32_t is_nonx_haploid = (!is_x) && is_set(cip->haploid_mask, chr_idx);
+      const uintptr_t* cur_sample_include;
+      const uint32_t* cur_sample_include_cumulative_popcounts;
+      const uintptr_t* cur_pheno_cc;
+      const float* cur_pheno;
+      const float* cur_covars_cmaj;
+      const uintptr_t* cur_parameter_subset;
+      const float* cur_constraints_con_major;
+      uint32_t cur_sample_ct;
+      uint32_t cur_covar_ct;
+      uint32_t cur_constraint_ct;
+      uint32_t primary_pred_idx = include_intercept;
+      if (is_y && g_sample_include_y) {
+	cur_sample_include = g_sample_include_y;
+	cur_sample_include_cumulative_popcounts = g_sample_include_y_cumulative_popcounts;
+	cur_pheno_cc = g_pheno_y_cc;
+	cur_pheno = g_pheno_y_f;
+	cur_covars_cmaj = g_covars_cmaj_y_f;
+	cur_parameter_subset = g_parameter_subset_y;
+	cur_constraints_con_major = g_constraints_con_major_y_f;
+	cur_sample_ct = g_sample_ct_y;
+	cur_covar_ct = g_covar_ct_y;
+	cur_constraint_ct = g_constraint_ct_y;
+      } else if (is_x && g_sample_include_x) {
+	cur_sample_include = g_sample_include_x;
+	cur_sample_include_cumulative_popcounts = g_sample_include_x_cumulative_popcounts;
+	cur_pheno_cc = g_pheno_x_cc;
+	cur_pheno = g_pheno_x_f;
+	cur_covars_cmaj = g_covars_cmaj_x_f;
+	cur_parameter_subset = g_parameter_subset_x;
+	cur_constraints_con_major = g_constraints_con_major_x_f;
+	cur_sample_ct = g_sample_ct_x;
+	cur_covar_ct = g_covar_ct_x;
+	cur_constraint_ct = g_constraint_ct_x;
+      } else {
+	cur_sample_include = g_sample_include;
+	cur_sample_include_cumulative_popcounts = g_sample_include_cumulative_popcounts;
+	cur_pheno_cc = g_pheno_cc;
+	cur_pheno = g_pheno_f;
+	cur_covars_cmaj = g_covars_cmaj_f;
+	cur_parameter_subset = g_parameter_subset;
+	cur_constraints_con_major = g_constraints_con_major_f;
+	cur_sample_ct = g_sample_ct;
+	cur_covar_ct = g_covar_ct;
+	cur_constraint_ct = g_constraint_ct;
+      }
+      const uint32_t sample_ctl = BITCT_TO_WORDCT(cur_sample_ct);
+      const uint32_t sample_cta4 = round_up_pow2(cur_sample_ct, 4);
+      const uint32_t cur_predictor_ct_base = 2 + domdev_present + cur_covar_ct * (1 + add_interactions * domdev_present_p1);
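+      // base count corresponds to the predictor layout: [0] intercept,
+      // [1] main genotype effect, [2] DOMDEV when genotypic/hethom, then
+      // one column per covariate, then genotype x covariate (and
+      // DOMDEV x covariate) columns when 'interaction' was requested.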
+      uint32_t cur_predictor_ct = cur_predictor_ct_base;
+      if (cur_parameter_subset) {
+	cur_predictor_ct = popcount_longs(cur_parameter_subset, BITCT_TO_WORDCT(cur_predictor_ct_base));
+      }
+      const uint32_t predictor_cta4 = round_up_pow2(cur_predictor_ct, 4);
+      const uint32_t predictor_cta4p1 = predictor_cta4 + 1;
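+      // diagonal stride: hh[i * predictor_cta4p1] is element (i, i) of a
+      // matrix with predictor_cta4-float rows.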
+      uint32_t reported_pred_uidx_end;
+      if (hide_covar) {
+	if (!cur_parameter_subset) {
+	  reported_pred_uidx_end = 2 + domdev_present;
+	} else {
+	  reported_pred_uidx_end = 1 + is_set(cur_parameter_subset, 1) + domdev_present;
+	}
+      } else {
+	reported_pred_uidx_end = cur_predictor_ct;
+      }
+      // todo: --tests
+      if (cur_constraint_ct) {
+	primary_pred_idx = reported_pred_uidx_end - reported_pred_uidx_start;
+      }
+      const uint32_t genof_buffer_needed = cur_parameter_subset && (!is_set(cur_parameter_subset, 1));
+      unsigned char* workspace_iter = workspace_buf;
+      uintptr_t* sample_nm = (uintptr_t*)arena_alloc_raw_rd(sample_ctl * sizeof(intptr_t), &workspace_iter);
+      uintptr_t* pheno_cc_nm = (uintptr_t*)arena_alloc_raw_rd(sample_ctl * sizeof(intptr_t), &workspace_iter);
+      uintptr_t* male_nm = (uintptr_t*)arena_alloc_raw_rd(sample_ctl * sizeof(intptr_t), &workspace_iter);
+      float* nm_pheno_buf = (float*)arena_alloc_raw_rd(sample_cta4 * sizeof(float), &workspace_iter);
+      float* nm_predictors_pmaj_buf = (float*)arena_alloc_raw_rd((cur_predictor_ct + genof_buffer_needed) * sample_cta4 * sizeof(float), &workspace_iter);
+      float* coef_return = (float*)arena_alloc_raw_rd(predictor_cta4 * sizeof(float), &workspace_iter);
+      float* hh_return = (float*)arena_alloc_raw_rd(cur_predictor_ct * predictor_cta4 * sizeof(float), &workspace_iter);
+      float* pp_buf = (float*)arena_alloc_raw_rd(sample_cta4 * sizeof(float), &workspace_iter);
+      float* sample_variance_buf = (float*)arena_alloc_raw_rd(sample_cta4 * sizeof(float), &workspace_iter);
+      float* gradient_buf = (float*)arena_alloc_raw_rd(predictor_cta4 * sizeof(float), &workspace_iter);
+      float* dcoef_buf = (float*)arena_alloc_raw_rd(predictor_cta4 * sizeof(float), &workspace_iter);
+      float* cholesky_decomp_return = (float*)arena_alloc_raw_rd(cur_predictor_ct * predictor_cta4 * sizeof(float), &workspace_iter);
+      
+      matrix_finvert_buf1_t* inv_1d_buf = nullptr;
+      float* flt_2d_buf = nullptr;
+
+      // Firth-only
+      float* score_buf = nullptr;
+      float* tmpnxk_buf = nullptr;
+
+      // joint test only
+      float* tmphxs_buf = nullptr;
+      float* h_transpose_buf = nullptr;
+      float* inner_buf = nullptr;
+
+      if (is_sometimes_firth || cur_constraint_ct) {
+	inv_1d_buf = (matrix_finvert_buf1_t*)arena_alloc_raw_rd(cur_predictor_ct * kMatrixFinvertBuf1CheckedAlloc, &workspace_iter);
+	flt_2d_buf = (float*)arena_alloc_raw_rd(cur_predictor_ct * predictor_cta4 * sizeof(float), &workspace_iter);
+	if (is_sometimes_firth) {
+	  score_buf = (float*)arena_alloc_raw_rd(sample_cta4 * sizeof(float), &workspace_iter);
+	  tmpnxk_buf = (float*)arena_alloc_raw_rd(cur_predictor_ct * sample_cta4 * sizeof(float), &workspace_iter);
+	}
+	if (cur_constraint_ct) {
+	  tmphxs_buf = (float*)arena_alloc_raw_rd(cur_constraint_ct * predictor_cta4 * sizeof(float), &workspace_iter);
+	  h_transpose_buf = (float*)arena_alloc_raw_rd(cur_constraint_ct * predictor_cta4 * sizeof(float), &workspace_iter);
+	  inner_buf = (float*)arena_alloc_raw_rd(cur_constraint_ct * cur_constraint_ct * sizeof(float), &workspace_iter);
+	}
+      }
+      // assert((uintptr_t)(workspace_iter - workspace_buf) == get_logistic_workspace_size(cur_sample_ct, cur_predictor_ct, cur_constraint_ct, genof_buffer_needed, is_sometimes_firth));
+      pgr_clear_ld_cache(pgrp);
+      uint32_t genocounts[4];
+      for (; variant_bidx < cur_variant_bidx_end; ++variant_bidx, ++variant_uidx) {
+	next_set_unsafe_ck(variant_include, &variant_uidx);
+	{
+	  uint32_t dosage_ct;
+	  uint32_t is_explicit_alt1;
+	  pglerr_t reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(cur_sample_include, cur_sample_include_cumulative_popcounts, cur_sample_ct, variant_uidx, pgrp, genovec, dosage_present, dosage_vals, &dosage_ct, &is_explicit_alt1);
+	  if (reterr) {
+	    g_error_ret = reterr;
+	    variant_bidx = variant_bidx_end;
+	    break;
+	  }
+	  zero_trailing_quaters(cur_sample_ct, genovec);
+	  genovec_count_freqs_unsafe(genovec, cur_sample_ct, genocounts);
+	  uint32_t missing_ct = genocounts[3];
+	  if (!missing_ct) {
+	    fill_all_bits(cur_sample_ct, sample_nm);
+	  } else {
+	    genoarr_to_nonmissing(genovec, cur_sample_ct, sample_nm);
+	    if (dosage_ct) {
+	      bitvec_or(dosage_present, sample_ctl, sample_nm);
+	      missing_ct = cur_sample_ct - popcount_longs(sample_nm, sample_ctl);
+	    }
+	  }
+	  uint32_t nm_sample_ct = cur_sample_ct - missing_ct;
+	  // todo: alt2/alt3/etc. dosage > 0.5 -> missing
+	  const uint32_t nm_sample_ctl = BITCT_TO_WORDCT(nm_sample_ct);
+	  const uint32_t nm_sample_cta4 = round_up_pow2(nm_sample_ct, 4);
+	  const uint32_t nm_sample_ct_rem = nm_sample_cta4 - nm_sample_ct;
+	  float* nm_predictors_pmaj_iter = nm_predictors_pmaj_buf;
+	  // first predictor column: intercept
+	  for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx) {
+	    *nm_predictors_pmaj_iter++ = 1.0;
+	  }
+	  fill_float_zero(nm_sample_ct_rem, nm_predictors_pmaj_iter);
+	  // second predictor column: genotype
+	  float* genotype_vals = &(nm_predictors_pmaj_buf[nm_sample_cta4]);
+	  if (genof_buffer_needed) {
+	    // special case: --parameters excludes the main genotype column,
+	    // but does care about an interaction
+	    genotype_vals = &(nm_predictors_pmaj_buf[cur_predictor_ct * nm_sample_cta4]);
+	  }
+	  nm_predictors_pmaj_iter = genotype_vals;
+	  if (!missing_ct) {
+	    genoarr_to_floats(genovec, nm_sample_ct, nm_predictors_pmaj_iter);
+	    if (dosage_ct) {
+	      uint32_t sample_idx = 0;
+	      for (uint32_t dosage_idx = 0; dosage_idx < dosage_ct; ++dosage_idx, ++sample_idx) {
+		next_set_unsafe_ck(dosage_present, &sample_idx);
+		// 32768 -> 2, 16384 -> 1, 0 -> 0
+		nm_predictors_pmaj_iter[sample_idx] = kRecipDosageMidf * ((int32_t)((uint32_t)dosage_vals[dosage_idx]));
+	      }
+	    }
+	  } else {
+	    if (!dosage_ct) {
+	      genoarr_to_floats_remove_missing(genovec, cur_sample_ct, nm_predictors_pmaj_iter);
+	    } else {
+	      uint32_t sample_midx = 0;
+	      uint32_t dosage_idx = 0;
+	      for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx, ++sample_midx) {
+		next_set_unsafe_ck(sample_nm, &sample_midx);
+		float cur_val;
+		if (is_set(dosage_present, sample_midx)) {
+		  cur_val = kRecipDosageMidf * ((int32_t)((uint32_t)dosage_vals[dosage_idx++]));
+		} else {
+		  cur_val = (intptr_t)(GET_QUATERARR_ENTRY(genovec, sample_midx));
+		}
+	        nm_predictors_pmaj_iter[sample_idx] = cur_val;
+	      }
+	    }
+	  }
+	  nm_predictors_pmaj_iter = &(nm_predictors_pmaj_iter[nm_sample_ct]);
+	  append_float_zero(nm_sample_ct_rem, &nm_predictors_pmaj_iter);
+	  copy_bitarr_subset(cur_pheno_cc, sample_nm, nm_sample_ct, pheno_cc_nm);
+	  const uint32_t nm_case_ct = popcount_longs(pheno_cc_nm, nm_sample_ctl);
+	  // usually need to save some of {sample_obs_ct, allele_obs_ct,
+	  // alt_dosage, case_allele_obs_ct, alt_case_dosage, mach_r2} even
+	  // for skipped variants
+	  // compute them all for now, could conditionally skip later
+	  block_aux_iter->sample_obs_ct = nm_sample_ct;
+	  double dosage_ceil = 2.0;
+	  if (!is_x) {
+	    if (!is_nonx_haploid) {
+	      block_aux_iter->allele_obs_ct = nm_sample_ct * 2;
+	      block_aux_iter->case_allele_obs_ct = nm_case_ct * 2;
+	    } else {
+	      block_aux_iter->allele_obs_ct = nm_sample_ct;
+	      block_aux_iter->case_allele_obs_ct = nm_case_ct;
+	      // everything is on 0..1 scale, not 0..2
+	      dosage_ceil = 1.0;
+	      for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx) {
+		genotype_vals[sample_idx] *= 0.5;
+	      }
+	    }
+	  } else {
+	    copy_bitarr_subset(sex_male_collapsed, sample_nm, nm_sample_ct, male_nm);
+	    const uint32_t nm_male_ct = popcount_longs(male_nm, nm_sample_ctl);
+	    block_aux_iter->allele_obs_ct = nm_sample_ct * 2;
+	    block_aux_iter->case_allele_obs_ct = nm_case_ct * 2;
+	    if (is_xchr_model_1) {
+	      // special case: multiply male values by 0.5
+	      uint32_t sample_idx = 0;
+	      for (uint32_t male_idx = 0; male_idx < nm_male_ct; ++male_idx, ++sample_idx) {
+		next_set_unsafe_ck(male_nm, &sample_idx);
+		genotype_vals[sample_idx] *= 0.5;
+	      }
+	      block_aux_iter->allele_obs_ct -= nm_male_ct;
+	      block_aux_iter->case_allele_obs_ct -= popcount_longs_intersect(pheno_cc_nm, male_nm, nm_sample_ctl);
+	    }
+	  }
+	  double alt_case_dosage = 0.0;
+	  double dosage_sum = 0.0;
+	  // genotype_vals restricted to [0, 2], so naive variance computation
+	  // is stable
+	  double dosage_ssq = 0.0;
+	  for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx) {
+	    const double cur_genotype_val = genotype_vals[sample_idx];
+	    dosage_sum += cur_genotype_val;
+	    dosage_ssq += cur_genotype_val * cur_genotype_val;
+	    alt_case_dosage += cur_genotype_val * ((double)((int32_t)is_set(pheno_cc_nm, sample_idx)));
+	  }
+	  block_aux_iter->firth_fallback = 0;
+	  block_aux_iter->alt_dosage = dosage_sum;
+	  block_aux_iter->alt_case_dosage = alt_case_dosage;
+
+	  const double dosage_avg = dosage_sum / ((double)((int32_t)nm_sample_ct));
+	  const double dosage_variance = dosage_ssq - dosage_sum * dosage_avg;
+	  // note that this value is nonsense on chrX/chrY/MT/haploid
+	  block_aux_iter->mach_r2 = 2 * dosage_variance / (dosage_sum * (dosage_ceil - dosage_avg));
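+	  // with dbar = dosage_sum / n and phat = dbar / 2, this equals
+	  // Var(dosage) / (2 * phat * (1 - phat)) when dosage_ceil == 2,
+	  // i.e. observed dosage variance over the variance expected under
+	  // Hardy-Weinberg: the usual MaCH/minimac imputation-quality rhat^2.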
+	  // okay, now we're free to skip the actual regression if there are
+	  // too few samples, or remaining samples are all-case/all-control, or
+	  // variant is monomorphic (or all-het)
+	  if ((nm_sample_ct < cur_predictor_ct) || (!nm_case_ct) || (nm_case_ct == nm_sample_ct) || (fabs(dosage_variance) < kBigEpsilon)) {
+	    goto glm_logistic_thread_skip_variant;
+	  }
+	  float* domdev_vals = nullptr;
+	  if (genof_buffer_needed) {
+	    nm_predictors_pmaj_iter = &(nm_predictors_pmaj_buf[nm_sample_cta4]);
+	  } else if (joint_genotypic || joint_hethom) {
+	    // in hethom case, do this before clobbering genotype data
+	    domdev_vals = nm_predictors_pmaj_iter;
+	    for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx) {
+	      float cur_genotype_val = genotype_vals[sample_idx];
+	      if (cur_genotype_val > 1.0) {
+		cur_genotype_val = 2.0 - cur_genotype_val;
+	      }
+	      nm_predictors_pmaj_iter[sample_idx] = cur_genotype_val;
+	    }
+	    nm_predictors_pmaj_iter = &(nm_predictors_pmaj_iter[nm_sample_ct]);
+	    append_float_zero(nm_sample_ct_rem, &nm_predictors_pmaj_iter);
+	  }
+	  if (model_dominant) {
+	    for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx) {
+	      const float cur_genotype_val = genotype_vals[sample_idx];
+	      // 0..1..1
+	      if (cur_genotype_val > 1.0) {
+		genotype_vals[sample_idx] = 1.0;
+	      }
+	    }
+	  } else if (model_recessive || joint_hethom) {
+	    for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx) {
+	      const float cur_genotype_val = genotype_vals[sample_idx];
+	      // 0..0..1
+	      if (cur_genotype_val < 1.0) {
+		genotype_vals[sample_idx] = 0.0;
+	      } else {
+		genotype_vals[sample_idx] = cur_genotype_val - 1.0;
+	      }
+	    }
+	  }
+
+	  // fill phenotype
+	  uint32_t sample_midx = 0;
+	  for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx, ++sample_midx) {
+	    next_set_unsafe_ck(sample_nm, &sample_midx);
+	    nm_pheno_buf[sample_idx] = cur_pheno[sample_midx];
+	  }
+	  
+	  // fill covariates
+	  uint32_t parameter_uidx = 2 + domdev_present;
+	  for (uint32_t covar_idx = 0; covar_idx < cur_covar_ct; ++covar_idx, ++parameter_uidx) {
+	    // strictly speaking, we don't need cur_covars_cmaj to be
+	    // vector-aligned
+	    if (cur_parameter_subset && (!is_set(cur_parameter_subset, parameter_uidx))) {
+	      continue;
+	    }
+	    const float* cur_covar_col;
+	    if (covar_idx < local_covar_ct) {
+	      cur_covar_col = &(local_covars_iter[covar_idx * max_sample_ct]);
+	    } else {
+	      cur_covar_col = &(cur_covars_cmaj[(covar_idx - local_covar_ct) * sample_cta4]);
+	    }
+	    sample_midx = 0;
+	    for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx, ++sample_midx) {
+	      next_set_unsafe_ck(sample_nm, &sample_midx);
+	      *nm_predictors_pmaj_iter++ = cur_covar_col[sample_midx];
+	    }
+	    append_float_zero(nm_sample_ct_rem, &nm_predictors_pmaj_iter);
+	  }
+	  // fill interaction terms
+	  if (add_interactions) {
+	    for (uint32_t covar_idx = 0; covar_idx < cur_covar_ct; ++covar_idx) {
+	      const float* cur_covar_col;
+	      if (covar_idx < local_covar_ct) {
+	        cur_covar_col = &(local_covars_iter[covar_idx * max_sample_ct]);
+	      } else {
+		cur_covar_col = &(cur_covars_cmaj[(covar_idx - local_covar_ct) * sample_cta4]);
+	      }
+	      if ((!cur_parameter_subset) || is_set(cur_parameter_subset, parameter_uidx)) {
+		sample_midx = 0;
+		for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx, ++sample_midx) {
+		  next_set_unsafe_ck(sample_nm, &sample_midx);
+		  *nm_predictors_pmaj_iter++ = genotype_vals[sample_idx] * cur_covar_col[sample_midx];
+		}
+		append_float_zero(nm_sample_ct_rem, &nm_predictors_pmaj_iter);
+	      }
+	      ++parameter_uidx;
+	      if (domdev_present) {
+		if ((!cur_parameter_subset) || is_set(cur_parameter_subset, parameter_uidx)) {
+		  sample_midx = 0;
+		  for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx, ++sample_midx) {
+		    next_set_unsafe_ck(sample_nm, &sample_midx);
+		    *nm_predictors_pmaj_iter++ = domdev_vals[sample_idx] * cur_covar_col[sample_midx];
+		  }
+		  append_float_zero(nm_sample_ct_rem, &nm_predictors_pmaj_iter);
+		}
+		++parameter_uidx;
+	      }
+	    }
+	  }
+	  fill_float_zero(predictor_cta4, coef_return);
+	  if (!is_always_firth) {
+	    const double ref_plus_alt1_sum = dosage_ceil * ((int32_t)nm_sample_ct);
+	    // "dosage_sum" = alt1 sum
+	    const double ref_sum = ref_plus_alt1_sum - dosage_sum;
+	    const double ref_case_dosage = dosage_ceil * ((int32_t)nm_case_ct) - alt_case_dosage;
+	    const double alt1_ctrl_dosage = dosage_sum - alt_case_dosage;
+	    const double ref_ctrl_dosage = ref_sum - ref_case_dosage;
+	    if ((alt_case_dosage == 0.0) || (fabs(ref_case_dosage) < kBigEpsilon) || (fabs(alt1_ctrl_dosage) < kBigEpsilon) || (fabs(ref_ctrl_dosage) < kBigEpsilon)) {
+	      if (is_sometimes_firth) {
+		block_aux_iter->firth_fallback = 1;
+		goto glm_logistic_thread_firth_fallback;
+	      } else {
+		// this fails to converge >99.99% of the time, but better to
+		// explicitly detect it since that can significantly speed
+		// things up
+		goto glm_logistic_thread_skip_variant;
+	      }
+	    }
+	    if (logistic_regression(nm_pheno_buf, nm_predictors_pmaj_buf, nm_sample_ct, cur_predictor_ct, coef_return, cholesky_decomp_return, pp_buf, sample_variance_buf, hh_return, gradient_buf, dcoef_buf)) {
+	      if (is_sometimes_firth) {
+		fill_float_zero(predictor_cta4, coef_return);
+		block_aux_iter->firth_fallback = 1;
+		goto glm_logistic_thread_firth_fallback;
+	      }
+	      goto glm_logistic_thread_skip_variant;
+	    }
+	    // unlike firth_regression(), hh_return isn't inverted yet, do that
+	    // here
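+	    // concretely, each outer iteration solves (L L') x = e_j for one
+	    // unit vector e_j, where cholesky_decomp_return holds L: the
+	    // first inner loop is forward substitution for L z = e_j (started
+	    // at row j since z's leading entries are zero), and the second is
+	    // in-place back substitution for L' x = z.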
+	    for (uint32_t pred_uidx = 0; pred_uidx < cur_predictor_ct; ++pred_uidx) {
+	      float* hh_inv_row = &(hh_return[pred_uidx * predictor_cta4]);
+	      // fill_float_zero(cur_predictor_ct, gradient_buf);
+	      // gradient_buf[pred_uidx] = 1.0;
+	      // (y is gradient_buf, x is dcoef_buf)
+	      // solve_linear_system(cholesky_decomp_return, gradient_buf, cur_predictor_ct, hh_inv_row);
+	      // that works, but doesn't exploit the sparsity of y
+
+	      // hh_return does now have vector-aligned rows
+	      fill_float_zero(pred_uidx, hh_inv_row);
+
+	      float fxx = 1.0;
+	      for (uint32_t row_idx = pred_uidx; row_idx < cur_predictor_ct; ++row_idx) {
+		const float* ll_row = &(cholesky_decomp_return[row_idx * predictor_cta4]);
+		for (uint32_t col_idx = pred_uidx; col_idx < row_idx; ++col_idx) {
+		  fxx -= ll_row[col_idx] * hh_inv_row[col_idx];
+		}
+		hh_inv_row[row_idx] = fxx / ll_row[row_idx];
+		fxx = 0.0;
+	      }
+	      for (uint32_t col_idx = cur_predictor_ct; col_idx; ) {
+		fxx = hh_inv_row[--col_idx];
+		float* hh_inv_row_iter = &(hh_inv_row[cur_predictor_ct - 1]);
+		for (uint32_t row_idx = cur_predictor_ct - 1; row_idx > col_idx; --row_idx) {
+		  fxx -= cholesky_decomp_return[row_idx * predictor_cta4 + col_idx] * (*hh_inv_row_iter--);
+		}
+		*hh_inv_row_iter = fxx / cholesky_decomp_return[col_idx * predictor_cta4p1];
+	      }
+	    }
+	  } else {
+	  glm_logistic_thread_firth_fallback:
+	    if (firth_regression(nm_pheno_buf, nm_predictors_pmaj_buf, nm_sample_ct, cur_predictor_ct, coef_return, hh_return, inv_1d_buf, flt_2d_buf, pp_buf, sample_variance_buf, gradient_buf, dcoef_buf, score_buf, tmpnxk_buf)) {
+	      goto glm_logistic_thread_skip_variant;
+	    }
+	  }
+	  // validParameters() check
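+	  // require each diagonal entry of the inverted Hessian (asymptotic
+	  // variance of an estimate) to be a normal positive number, and each
+	  // off-diagonal entry to stay below ~0.99999 * the product of the
+	  // corresponding standard errors; a violation signals a numerically
+	  // singular fit, so the variant is skipped.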
+	  for (uint32_t pred_uidx = 1; pred_uidx < cur_predictor_ct; ++pred_uidx) {
+	    const float hh_inv_diag_element = hh_return[pred_uidx * predictor_cta4p1];
+	    if ((hh_inv_diag_element < 1e-20) || (!realnum(hh_inv_diag_element))) {
+	      goto glm_logistic_thread_skip_variant;
+	    }
+	    // use sample_variance_buf[] to store diagonal square roots
+	    sample_variance_buf[pred_uidx] = sqrtf(hh_inv_diag_element);
+	  }
+	  sample_variance_buf[0] = sqrtf(hh_return[0]);
+	  for (uint32_t pred_uidx = 1; pred_uidx < cur_predictor_ct; ++pred_uidx) {
+	    const float cur_hh_inv_diag_sqrt = 0.99999 * sample_variance_buf[pred_uidx];
+	    const float* hh_inv_row_iter = &(hh_return[pred_uidx * predictor_cta4]);
+	    const float* hh_inv_diag_sqrts_iter = sample_variance_buf;
+	    for (uint32_t pred_uidx2 = 0; pred_uidx2 < pred_uidx; ++pred_uidx2) {
+	      if ((*hh_inv_row_iter++) > cur_hh_inv_diag_sqrt * (*hh_inv_diag_sqrts_iter++)) {
+		goto glm_logistic_thread_skip_variant;
+	      }
+	    }
+	  }
+	  double* beta_se_iter2 = beta_se_iter;
+	  for (uint32_t pred_uidx = reported_pred_uidx_start; pred_uidx < reported_pred_uidx_end; ++pred_uidx) {
+	    *beta_se_iter2++ = coef_return[pred_uidx];
+	    *beta_se_iter2++ = (double)sample_variance_buf[pred_uidx];
+	  }
+	  if (cur_constraint_ct) {
+	    *beta_se_iter2++ = 0.0;
+	    double chisq;
+	    if (!linear_hypothesis_chisq_f(coef_return, cur_constraints_con_major, hh_return, cur_constraint_ct, cur_predictor_ct, predictor_cta4, &chisq, tmphxs_buf, h_transpose_buf, inner_buf, inv_1d_buf, flt_2d_buf)) {
+	      *beta_se_iter2++ = chisq;
+	    } else {
+	      *beta_se_iter2++ = -9;
+	    }
+	  }
+	}
+	while (0) {
+	glm_logistic_thread_skip_variant:
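+	  // -9 in the primary SE slot marks this variant as skipped/failed
+	  // for the downstream writer.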
+	  beta_se_iter[primary_pred_idx * 2 + 1] = -9;
+	}
+	beta_se_iter = &(beta_se_iter[2 * max_reported_test_ct]);
+	++block_aux_iter;
+	if (local_covars_iter) {
+	  local_covars_iter = &(local_covars_iter[local_covar_ct * max_sample_ct]);
+	}
+	// todo?
+      }
+    }
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+    variant_idx_offset += cur_block_variant_ct;
+  }
+}
+
+uint32_t get_reported_test_ct(const uintptr_t* parameter_subset, glm_flags_t glm_flags, uint32_t covar_ct) {
+  const uint32_t hide_covar = (glm_flags / kfGlmHideCovar) & 1;
+  const uint32_t include_intercept = (glm_flags / kfGlmIntercept) & 1;
+  const uint32_t domdev_present = (glm_flags & (kfGlmGenotypic | kfGlmHethom))? 1 : 0;
+  // TODO: --tests
+  const uint32_t joint_test = domdev_present;
+
+  if (hide_covar) {
+    if (!parameter_subset) {
+      return 1 + include_intercept + domdev_present + joint_test;
+    }
+    return include_intercept + domdev_present + joint_test + is_set(parameter_subset, 1);
+  }
+  
+  const uint32_t domdev_present_p1 = domdev_present + 1;
+  const uint32_t add_interactions = (glm_flags / kfGlmInteraction) & 1;
+  const uint32_t predictor_ct_base = 2 + domdev_present + covar_ct * (1 + add_interactions * domdev_present_p1);
+  uint32_t predictor_ct = predictor_ct_base;
+  if (parameter_subset) {
+    predictor_ct = popcount_longs(parameter_subset, BITCT_TO_WORDCT(predictor_ct_base));
+  }
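+  // e.g. 'genotypic' with 2 covariates, no interactions, no --parameters:
+  // predictor_ct = 2 + 1 + 2 = 5 (intercept, ADD, DOMDEV, covar1, covar2),
+  // so this returns 5 + 1 (GENO_2DF joint test) + include_intercept - 1,
+  // while the hide-covar branch above would return 3 + include_intercept.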
+  return predictor_ct + joint_test + include_intercept - 1;
+}
+
+boolerr_t alloc_and_init_reported_test_names(const uintptr_t* parameter_subset, char** covar_names, glm_flags_t glm_flags, uint32_t covar_ct, char*** cur_test_names_ptr) {
+  const uint32_t model_dominant = (glm_flags / kfGlmDominant) & 1;
+  const uint32_t model_recessive = (glm_flags / kfGlmRecessive) & 1;
+  const uint32_t is_hethom = (glm_flags / kfGlmHethom) & 1;
+  const uint32_t domdev_present = (glm_flags & kfGlmGenotypic) || is_hethom;
+  char main_effect[4];
+  if (model_dominant) {
+    memcpy(main_effect, "DOMx", 4);
+  } else if (model_recessive) {
+    memcpy(main_effect, "RECx", 4);
+  } else if (is_hethom) {
+    memcpy(main_effect, "HOMx", 4);
+  } else {
+    memcpy(main_effect, "ADDx", 4);
+  }
+  const uint32_t include_intercept = (glm_flags / kfGlmIntercept) & 1;
+  const uint32_t include_main_effect = (!parameter_subset) || is_set(parameter_subset, 1);
+  char domdev_str[8];
+  uint32_t domdev_slen = 7;
+  if (!is_hethom) {
+    strcpy(domdev_str, "DOMDEVx");
+  } else {
+    strcpy(domdev_str, "HETx");
+    domdev_slen = 4;
+  }
+  
+  // TODO: --tests
+  const uint32_t joint_test = domdev_present;
+  
+  if (glm_flags & kfGlmHideCovar) {
+    const uint32_t reported_test_ct = include_intercept + include_main_effect + domdev_present + joint_test;
+    char* test_name_buf_iter;
+    if (bigstack_alloc_cp(reported_test_ct, cur_test_names_ptr) ||
+	bigstack_alloc_c(64, &test_name_buf_iter)) {
+      return 1;
+    }
+    char** cur_test_names = *cur_test_names_ptr;
+    uint32_t write_idx = 0;
+    if (include_intercept) {
+      char* iter_next = memcpya(test_name_buf_iter, "INTERCEPT", 10);
+      cur_test_names[write_idx++] = test_name_buf_iter;
+      test_name_buf_iter = iter_next;
+    }
+    if (include_main_effect) {
+      char* iter_next = memcpyax(test_name_buf_iter, main_effect, 3, '\0');
+      cur_test_names[write_idx++] = test_name_buf_iter;
+      test_name_buf_iter = iter_next;
+    }
+    if (domdev_present) {
+      char* iter_next = memcpyax(test_name_buf_iter, domdev_str, domdev_slen - 1, '\0');
+      cur_test_names[write_idx++] = test_name_buf_iter;
+      test_name_buf_iter = iter_next;
+    }
+    if (joint_test) {
+      // TODO: --tests
+      strcpy(test_name_buf_iter, "GENO_2DF");
+      cur_test_names[write_idx++] = test_name_buf_iter;
+    }
+    assert(write_idx == reported_test_ct);
+    return 0;
+  }
+  const uint32_t add_interactions = (glm_flags / kfGlmInteraction) & 1;
+  const uint32_t domdev_present_p1 = domdev_present + 1;
+  uint32_t predictor_ct_base = 2 + domdev_present + covar_ct * (1 + add_interactions * domdev_present_p1);
+  uint32_t predictor_ct = predictor_ct_base;
+  if (parameter_subset) {
+    predictor_ct = popcount_longs(parameter_subset, BITCT_TO_WORDCT(predictor_ct_base));
+  }
+  const uint32_t reported_test_ct = predictor_ct + joint_test + include_intercept - 1;
+  uintptr_t test_name_buf_alloc = 64;
+  if (add_interactions) {
+    // don't bother optimizing this for parameter_subset case for now
+    uintptr_t covar_name_total_blen = covar_ct;
+    for (uint32_t covar_idx = 0; covar_idx < covar_ct; ++covar_idx) {
+      covar_name_total_blen += strlen(covar_names[covar_idx]);
+    }
+    // ADDx[covar name], etc.
+    test_name_buf_alloc += 4 * covar_ct + covar_name_total_blen;
+    if (is_hethom) {
+      // HETx
+      test_name_buf_alloc += 4 * covar_ct + covar_name_total_blen;
+    } else if (domdev_present) {
+      // DOMDEVx
+      test_name_buf_alloc += 7 * covar_ct + covar_name_total_blen;
+    }
+  }
+  char* test_name_buf_iter;
+  if (bigstack_alloc_cp(reported_test_ct, cur_test_names_ptr) ||
+      bigstack_alloc_c(test_name_buf_alloc, &test_name_buf_iter)) {
+    return 1;
+  }
+  char** cur_test_names = *cur_test_names_ptr;
+  uint32_t write_idx = 0;
+  if (include_intercept) {
+    char* iter_next = memcpya(test_name_buf_iter, "INTERCEPT", 10);
+    cur_test_names[write_idx++] = test_name_buf_iter;
+    test_name_buf_iter = iter_next;
+  }
+  if (include_main_effect) {
+    char* iter_next = memcpyax(test_name_buf_iter, main_effect, 3, '\0');
+    cur_test_names[write_idx++] = test_name_buf_iter;
+    test_name_buf_iter = iter_next;
+  }
+  if (domdev_present) {
+    char* iter_next = memcpyax(test_name_buf_iter, domdev_str, domdev_slen - 1, '\0');
+    cur_test_names[write_idx++] = test_name_buf_iter;
+    test_name_buf_iter = iter_next;
+  }
+  uint32_t pred_uidx = 2 + domdev_present;
+  for (uint32_t covar_idx = 0; covar_idx < covar_ct; ++covar_idx, ++pred_uidx) {
+    if (parameter_subset && (!is_set(parameter_subset, pred_uidx))) {
+      continue;
+    }
+    // just point to the existing string; its lifetime is sufficient
+    cur_test_names[write_idx++] = covar_names[covar_idx];
+  }
+  if (add_interactions) {
+    for (uint32_t covar_idx = 0; covar_idx < covar_ct; ++covar_idx) {
+      const char* cur_covar_name = covar_names[covar_idx];
+      if ((!parameter_subset) || is_set(parameter_subset, pred_uidx)) {
+	char* iter_next = memcpya(test_name_buf_iter, main_effect, 4);
+	iter_next = strcpyax(iter_next, cur_covar_name, '\0');
+	cur_test_names[write_idx++] = test_name_buf_iter;
+	test_name_buf_iter = iter_next;
+      }
+      ++pred_uidx;
+      if (domdev_present) {
+	if ((!parameter_subset) || is_set(parameter_subset, pred_uidx)) {
+	  char* iter_next = memcpya(test_name_buf_iter, domdev_str, domdev_slen);
+	  iter_next = strcpyax(iter_next, cur_covar_name, '\0');
+	  cur_test_names[write_idx++] = test_name_buf_iter;
+	  test_name_buf_iter = iter_next;
+	}
+	++pred_uidx;
+      }
+    }
+  }
+  if (joint_test) {
+    // todo: --tests
+    strcpy(test_name_buf_iter, "GENO_2DF");
+    cur_test_names[write_idx++] = test_name_buf_iter;
+  }
+  assert(write_idx == reported_test_ct);
+  return 0;
+}
+
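+// the two initializers below build the constraint matrix behind the joint
+// "GENO_2DF" test: a 2 x predictor_ct constraint-major matrix whose row 0
+// selects predictor 1 (main genotype effect) and row 1 selects predictor 2
+// (DOMDEV), jointly testing both coefficients against zero via
+// linear_hypothesis_chisq*().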
+boolerr_t alloc_and_init_constraints_f(uint32_t predictor_ct, uint32_t* constraint_ct_ptr, float** constraints_con_major_f_ptr) {
+  // todo: --tests
+  const uint32_t constraint_ct = 2;
+  if (bigstack_calloc_f(constraint_ct * predictor_ct, constraints_con_major_f_ptr)) {
+    return 1;
+  }
+  float* constraints_con_major_f = *constraints_con_major_f_ptr;
+  constraints_con_major_f[1] = 1; // [0][1]
+  constraints_con_major_f[predictor_ct + 2] = 1; // [1][2]
+  *constraint_ct_ptr = constraint_ct;
+  return 0;
+}
+
+boolerr_t alloc_and_init_constraints_d(uint32_t predictor_ct, uint32_t* constraint_ct_ptr, double** constraints_con_major_ptr) {
+  const uint32_t constraint_ct = 2;
+  if (bigstack_calloc_d(constraint_ct * predictor_ct, constraints_con_major_ptr)) {
+    return 1;
+  }
+  double* constraints_con_major = *constraints_con_major_ptr;
+  constraints_con_major[1] = 1; // [0][1]
+  constraints_con_major[predictor_ct + 2] = 1; // [1][2]
+  *constraint_ct_ptr = constraint_ct;
+  return 0;
+}
+
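+// per-line format implied by the parser below (one line per local-pvar=
+// variant): with local-cat=, each sample contributes a single 1-based
+// category index, index == local_cat_ct denoting the omitted reference
+// category; otherwise each sample contributes local_covar_ct numeric
+// tokens, plus one trailing token that is skipped when omit_last is set.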
+pglerr_t read_local_covar_block(const uintptr_t* sample_include, const uintptr_t* sample_include_x, const uintptr_t* sample_include_y, const uint32_t* sample_include_cumulative_popcounts, const uint32_t* sample_include_x_cumulative_popcounts, const uint32_t* sample_include_y_cumulative_popcounts, const chr_info_t* cip, const uintptr_t* variant_include, const uint32_t* local_sample_uidx_order, const uintptr_t* local_variant_include, uint32_t sample_ct, uint32_t sample_ct_x, uint32_t sampl [...]
+  const int32_t x_code = cip->xymt_codes[kChrOffsetX];
+  const int32_t y_code = cip->xymt_codes[kChrOffsetY];
+  const uint32_t tokens_per_sample = local_cat_ct? 1 : (local_covar_ct + omit_last);
+  uint32_t max_sample_ct = MAXV(sample_ct, sample_ct_x);
+  if (max_sample_ct < sample_ct_y) {
+    max_sample_ct = sample_ct_y;
+  }
+  uint32_t variant_bidx = 0;
+  if (local_cat_ct) {
+    // assert(local_covar_ct == local_cat_ct - 1);
+    if (local_covars_vcmaj_f_iter) {
+      fill_float_zero(local_covar_ct * max_sample_ct * ((uintptr_t)cur_block_variant_ct), local_covars_vcmaj_f_iter);
+    } else {
+      fill_double_zero(local_covar_ct * max_sample_ct * ((uintptr_t)cur_block_variant_ct), local_covars_vcmaj_d_iter);
+    }
+  }
+  uint32_t local_line_idx = *local_line_idx_ptr;
+  while (variant_bidx < cur_block_variant_ct) {
+    next_set_unsafe_ck(variant_include, &variant_uidx);
+    const uint32_t chr_fo_idx = get_variant_chr_fo_idx(cip, variant_uidx);
+    const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+    const uint32_t chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+    uint32_t cur_variant_bidx_end = cur_block_variant_ct;
+    if (chr_end < variant_uidx_end) {
+      cur_variant_bidx_end = variant_bidx + popcount_bit_idx(variant_include, variant_uidx, chr_end);
+      assert(cur_variant_bidx_end <= cur_block_variant_ct);
+    }
+    const uint32_t is_x = (chr_idx == x_code);
+    const uint32_t is_y = (chr_idx == y_code);
+    const uintptr_t* cur_sample_include;
+    const uint32_t* cur_sample_include_cumulative_popcounts;
+    uint32_t cur_sample_ct;
+    if (is_y && sample_include_y) {
+      cur_sample_include = sample_include_y;
+      cur_sample_include_cumulative_popcounts = sample_include_y_cumulative_popcounts;
+      cur_sample_ct = sample_ct_y;
+    } else if (is_x && sample_include_x) {
+      cur_sample_include = sample_include_x;
+      cur_sample_include_cumulative_popcounts = sample_include_x_cumulative_popcounts;
+      cur_sample_ct = sample_ct_x;
+    } else {
+      cur_sample_include = sample_include;
+      cur_sample_include_cumulative_popcounts = sample_include_cumulative_popcounts;
+      cur_sample_ct = sample_ct;
+    }
+    const uint32_t new_local_xy = is_x + 2 * is_y;
+    if (new_local_xy != *local_xy_ptr) {
+      for (uint32_t uii = 0; uii < local_sample_ct; ++uii) {
+	const uint32_t cur_uidx = local_sample_uidx_order[uii];
+	uint32_t cur_idx = 0xffffffffU;
+	if ((cur_uidx != 0xffffffffU) && is_set(cur_sample_include, cur_uidx)) {
+	  cur_idx = raw_to_subsetted_pos(cur_sample_include, cur_sample_include_cumulative_popcounts, cur_uidx);
+	}
+	local_sample_idx_order[uii] = cur_idx;
+      }
+      *local_xy_ptr = new_local_xy;
+    }
+    for (; variant_bidx < cur_variant_bidx_end; ++variant_bidx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      do {
+        ++local_line_idx;
+	if (!gzgets(gz_local_covar_file, local_loadbuf, local_loadbuf_size)) {
+	  if (!gzeof(gz_local_covar_file)) {
+	    return kPglRetReadFail;
+	  }
+	  logprint("\n");
+	  logerrprint("Error: --glm local-covar= file has fewer lines than local-pvar= file.\n");
+	  return kPglRetMalformedInput;
+	}
+	if (!local_loadbuf[local_loadbuf_size - 1]) {
+	  logprint("\n");
+	  LOGERRPRINTF("Error: Line %u of --glm local-covar= file is longer than expected.\n", local_line_idx);
+	  return kPglRetMalformedInput;
+	}
+      } while (!is_set(local_variant_include, local_line_idx - 1));
+      char* loadbuf_iter = skip_initial_spaces(local_loadbuf);
+      uint32_t sample_idx = 0;
+      for (uint32_t local_sample_idx = 0; sample_idx < cur_sample_ct; ++local_sample_idx) {
+	const uint32_t cur_sample_idx = local_sample_idx_order[local_sample_idx];
+	if (cur_sample_idx == 0xffffffffU) {
+	  loadbuf_iter = next_token_mult(loadbuf_iter, tokens_per_sample);
+	  if (!loadbuf_iter) {
+	    logprint("\n");
+	    LOGERRPRINTFWW("Error: Fewer tokens than expected on line %u of --glm local-covar= file.\n", local_line_idx);
+	    return kPglRetMalformedInput;
+	  }
+	  continue;
+	}
+	if (local_cat_ct) {
+	  uint32_t cat_idx;
+	  if (scanadv_posint_capped(local_cat_ct, &loadbuf_iter, &cat_idx)) {
+	    logprint("\n");
+	    LOGERRPRINTF("Error: Invalid category index on line %u of --glm local-covar= file.\n", local_line_idx);
+	    return kPglRetMalformedInput;
+	  }
+	  if (cat_idx != local_cat_ct) {
+	    --cat_idx;
+	    const uint32_t offset = cat_idx * max_sample_ct + cur_sample_idx;
+	    if (local_covars_vcmaj_f_iter) {
+	      local_covars_vcmaj_f_iter[offset] = 1.0;
+	    } else {
+	      local_covars_vcmaj_d_iter[offset] = 1.0;
+	    }
+	  }
+	  while (!is_space_or_eoln(*loadbuf_iter)) {
+	    ++loadbuf_iter;
+	  }
+	  loadbuf_iter = skip_initial_spaces(loadbuf_iter);
+	} else {
+	  if (local_covars_vcmaj_f_iter) {
+	    float* local_covars_f_iter2 = &(local_covars_vcmaj_f_iter[cur_sample_idx]);
+	    for (uint32_t covar_idx = 0; covar_idx < local_covar_ct; ++covar_idx) {
+	      double dxx;
+	      loadbuf_iter = scanadv_double(loadbuf_iter, &dxx);
+	      if ((!loadbuf_iter) || (fabs(dxx) > 3.4028235677973362e38)) {
+		logprint("\n");
+		LOGERRPRINTF("Error: Invalid or missing token on line %u of --glm local-covar= file.\n", local_line_idx);
+		return kPglRetMalformedInput;
+	      }
+	      *local_covars_f_iter2 = (float)dxx;
+	      local_covars_f_iter2 = &(local_covars_f_iter2[max_sample_ct]);
+	      while (!is_space_or_eoln(*loadbuf_iter)) {
+		++loadbuf_iter;
+	      }
+	      loadbuf_iter = skip_initial_spaces(loadbuf_iter);
+	    }
+	  } else {
+	    double* local_covars_d_iter2 = &(local_covars_vcmaj_d_iter[cur_sample_idx]);
+	    for (uint32_t covar_idx = 0; covar_idx < local_covar_ct; ++covar_idx) {
+	      double dxx;
+	      loadbuf_iter = scanadv_double(loadbuf_iter, &dxx);
+	      if (!loadbuf_iter) {
+		logprint("\n");
+		LOGERRPRINTF("Error: Invalid or missing token on line %u of --glm local-covar= file.\n", local_line_idx);
+		return kPglRetMalformedInput;
+	      }
+	      *local_covars_d_iter2 = dxx;
+	      local_covars_d_iter2 = &(local_covars_d_iter2[max_sample_ct]);
+	      while (!is_space_or_eoln(*loadbuf_iter)) {
+		++loadbuf_iter;
+	      }
+	      loadbuf_iter = skip_initial_spaces(loadbuf_iter);
+	    }
+	  }
+	  if (omit_last) {
+	    while (!is_space_or_eoln(*loadbuf_iter)) {
+	      ++loadbuf_iter;
+	    }
+	    loadbuf_iter = skip_initial_spaces(loadbuf_iter);
+	  }
+	}
+	++sample_idx;
+      }
+      if (local_covars_vcmaj_f_iter) {
+	local_covars_vcmaj_f_iter += max_sample_ct * local_covar_ct;
+      } else {
+	local_covars_vcmaj_d_iter += max_sample_ct * local_covar_ct;
+      }
+    }
+  }
+  *local_line_idx_ptr = local_line_idx;
+  return kPglRetSuccess;
+}
+
+// only pass the parameters which aren't also needed by the compute threads,
+// for now
+pglerr_t glm_logistic(const char* cur_pheno_name, char** test_names, char** test_names_x, char** test_names_y, const uint32_t* variant_bps, char** variant_ids, char** allele_storage, const glm_info_t* glm_info_ptr, const uint32_t* local_sample_uidx_order, const uintptr_t* local_variant_include, const char* outname, uint32_t raw_variant_ct, uint32_t max_chr_blen, double ci_size, double pfilter, double output_min_p, uint32_t max_thread_ct, uintptr_t pgr_alloc_cacheline_ct, uint32_t local_s [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  threads_state_t ts;
+  init_threads3z(&ts);
+  pglerr_t reterr = kPglRetSuccess;
+  cswrite_init_null(&css);
+  {
+    const uintptr_t* variant_include = g_variant_include;
+    const chr_info_t* cip = g_cip;
+    const uintptr_t* variant_allele_idxs = g_variant_allele_idxs;
+
+    const uint32_t sample_ct = g_sample_ct;
+    const uint32_t sample_ct_x = g_sample_ct_x;
+    const uint32_t sample_ct_y = g_sample_ct_y;
+    const uint32_t covar_ct = g_covar_ct;
+    const uintptr_t local_covar_ct = g_local_covar_ct;
+    const uint32_t covar_ct_x = g_covar_ct_x;
+    const uint32_t covar_ct_y = g_covar_ct_y;
+
+    uint32_t max_sample_ct = MAXV(sample_ct, sample_ct_x);
+    if (max_sample_ct < sample_ct_y) {
+      max_sample_ct = sample_ct_y;
+    }
+    uint32_t* local_sample_idx_order = nullptr;
+    uint32_t local_line_idx = 0;
+    uint32_t local_xy = 0; // 1 = chrX, 2 = chrY
+    if (gz_local_covar_file) {
+      if (gzrewind(gz_local_covar_file)) {
+	goto glm_logistic_ret_READ_FAIL;
+      }
+      if (bigstack_alloc_ui(local_sample_ct, &local_sample_idx_order)) {
+	goto glm_logistic_ret_NOMEM;
+      }
+      for (uint32_t uii = 0; uii < local_sample_ct; ++uii) {
+	const uint32_t cur_uidx = local_sample_uidx_order[uii];
+	uint32_t cur_idx = 0xffffffffU;
+	if ((cur_uidx != 0xffffffffU) && is_set(g_sample_include, cur_uidx)) {
+	  cur_idx = raw_to_subsetted_pos(g_sample_include, g_sample_include_cumulative_popcounts, cur_uidx);
+	}
+	local_sample_idx_order[uii] = cur_idx;
+      }
+    }
+    
+    const uint32_t variant_ct = g_variant_ct;
+    
+    const glm_flags_t glm_flags = glm_info_ptr->flags;    
+    const uint32_t output_zst = (glm_flags / kfGlmZs) & 1;
+    // er, do not want to use multithreaded compression here... make sure to
+    // add a forced-singlethreaded mode when multithreaded compression is
+    // implemented.
+    if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+      goto glm_logistic_ret_OPEN_FAIL;
+    }
+    const uint32_t add_interactions = (glm_flags / kfGlmInteraction) & 1;
+    const uint32_t domdev_present = (glm_flags & (kfGlmGenotypic | kfGlmHethom))? 1 : 0;
+    const uint32_t domdev_present_p1 = domdev_present + 1;
+
+    // todo: --tests
+    const uint32_t constraint_ct = g_constraint_ct;
+    const uint32_t constraint_ct_x = g_constraint_ct_x;
+    const uint32_t constraint_ct_y = g_constraint_ct_y;
+    
+    uint32_t predictor_ct = 2 + domdev_present + covar_ct * (1 + add_interactions * domdev_present_p1);
+    uint32_t predictor_ct_x = 2 + domdev_present + covar_ct_x * (1 + add_interactions * domdev_present_p1);
+    uint32_t predictor_ct_y = 2 + domdev_present + covar_ct_y * (1 + add_interactions * domdev_present_p1);
+    const uintptr_t* parameter_subset = g_parameter_subset;
+    const uintptr_t* parameter_subset_x = g_parameter_subset_x;
+    const uintptr_t* parameter_subset_y = g_parameter_subset_y;
+    if (parameter_subset) {
+      predictor_ct = popcount_longs(parameter_subset, BITCT_TO_WORDCT(predictor_ct));
+      if (sample_ct_x) {
+	predictor_ct_x = popcount_longs(parameter_subset_x, BITCT_TO_WORDCT(predictor_ct_x));
+      } else {
+	predictor_ct_x = 0;
+      }
+      if (sample_ct_y) {
+	predictor_ct_y = popcount_longs(parameter_subset_y, BITCT_TO_WORDCT(predictor_ct_y));
+      } else {
+	predictor_ct_y = 0;
+      }
+    }
+    uint32_t reported_test_ct = get_reported_test_ct(parameter_subset, glm_flags, covar_ct);
+    uintptr_t max_reported_test_ct = reported_test_ct;
+    uint32_t reported_test_ct_x = 0;
+    if (sample_ct_x) {
+      reported_test_ct_x = get_reported_test_ct(parameter_subset_x, glm_flags, covar_ct_x);
+      if (reported_test_ct_x > max_reported_test_ct) {
+	max_reported_test_ct = reported_test_ct_x;
+      }
+    }
+    uint32_t reported_test_ct_y = 0;
+    if (sample_ct_y) {
+      reported_test_ct_y = get_reported_test_ct(parameter_subset_y, glm_flags, covar_ct_y);
+      if (reported_test_ct_y > max_reported_test_ct) {
+	max_reported_test_ct = reported_test_ct_y;
+      }
+    }
+    const uint32_t include_intercept = (glm_flags / kfGlmIntercept) & 1;
+    const glm_cols_t glm_cols = glm_info_ptr->cols;
+    const uint32_t test_col = glm_cols & kfGlmColTest;
+    if ((!test_col) && (max_reported_test_ct > 1)) {
+      logerrprint("Error: --glm's 'test' column cannot be omitted when results for multiple\npredictors are reported.  (Did you forget 'hide-covar'?)\n");
+      goto glm_logistic_ret_INCONSISTENT_INPUT;
+    }
+    g_max_reported_test_ct = max_reported_test_ct;
+    
+    const uint32_t is_sometimes_firth = (glm_flags & (kfGlmFirthFallback | kfGlmFirth))? 1 : 0;
+    const uint32_t is_always_firth = (glm_flags / kfGlmFirth) & 1;
+
+    int32_t x_code = -2;
+    uint32_t x_start = 0;
+    uint32_t x_end = 0;
+    if (sample_ct_x) {
+      get_xymt_code_start_and_end_unsafe(cip, kChrOffsetX, &x_code, &x_start, &x_end);
+    }
+    int32_t y_code = -2;
+    uint32_t y_start = 0;
+    uint32_t y_end = 0;
+    if (sample_ct_y) {
+      get_xymt_code_start_and_end_unsafe(cip, kChrOffsetY, &y_code, &y_start, &y_end);
+    }
+    const int32_t mt_code = cip->xymt_codes[kChrOffsetMT];
+    const uint32_t chr_col = glm_cols & kfGlmColChrom;
+
+    // includes trailing tab
+    char* chr_buf = nullptr;
+    if (chr_col) {
+      if (bigstack_alloc_c(max_chr_blen, &chr_buf)) {
+	goto glm_logistic_ret_NOMEM;
+      }
+    }
+
+    uint32_t calc_thread_ct = (max_thread_ct > 8)? (max_thread_ct - 1) : max_thread_ct;
+    if (calc_thread_ct > variant_ct) {
+      calc_thread_ct = variant_ct;
+    }
+
+    const uint32_t genof_buffer_needed = parameter_subset && (!is_set(parameter_subset, 1));
+    // workflow is similar to --make-bed
+    uintptr_t workspace_alloc = get_logistic_workspace_size(sample_ct, predictor_ct, constraint_ct, genof_buffer_needed, is_sometimes_firth);
+    if (sample_ct_x) {
+      const uintptr_t workspace_alloc_x = get_logistic_workspace_size(sample_ct_x, predictor_ct_x, constraint_ct_x, genof_buffer_needed, is_sometimes_firth);
+      if (workspace_alloc_x > workspace_alloc) {
+	workspace_alloc = workspace_alloc_x;
+      }
+    }
+    if (sample_ct_y) {
+      const uintptr_t workspace_alloc_y = get_logistic_workspace_size(sample_ct_y, predictor_ct_y, constraint_ct_y, genof_buffer_needed, is_sometimes_firth);
+      if (workspace_alloc_y > workspace_alloc) {
+	workspace_alloc = workspace_alloc_y;
+      }
+    }
+    const uint32_t dosage_is_present = pgfip->gflags & kfPgenGlobalDosagePresent;
+    // +1 is for top-level g_workspace_bufs
+    uintptr_t thread_xalloc_cacheline_ct = (workspace_alloc / kCacheline) + 1;
+    uintptr_t per_variant_xalloc_byte_ct = sizeof(logistic_aux_result_t) + 2 * max_reported_test_ct * sizeof(double) + max_sample_ct * local_covar_ct * sizeof(float);
+    unsigned char* main_loadbufs[2];
+    uint32_t read_block_size;
+    if (multithread_load_init(variant_include, max_sample_ct, variant_ct, pgr_alloc_cacheline_ct, thread_xalloc_cacheline_ct, per_variant_xalloc_byte_ct, pgfip, &calc_thread_ct, &g_genovecs, dosage_is_present? (&g_dosage_presents) : nullptr, dosage_is_present? (&g_dosage_val_bufs) : nullptr, &read_block_size, main_loadbufs, &ts.threads, &g_pgr_ptrs, &g_read_variant_uidx_starts)) {
+      goto glm_logistic_ret_NOMEM;
+    }
+    ts.calc_thread_ct = calc_thread_ct;
+    g_calc_thread_ct = calc_thread_ct;
+    logistic_aux_result_t* logistic_block_aux_bufs[2];
+    double* block_beta_se_bufs[2];
+    
+    for (uint32_t uii = 0; uii < 2; ++uii) {
+      logistic_block_aux_bufs[uii] = (logistic_aux_result_t*)bigstack_alloc(read_block_size * sizeof(logistic_aux_result_t));
+      if ((!logistic_block_aux_bufs[uii]) ||
+	  bigstack_alloc_d(read_block_size * 2 * max_reported_test_ct, &(block_beta_se_bufs[uii]))) {
+	goto glm_logistic_ret_NOMEM;
+      }
+      if (local_covar_ct) {
+	if (bigstack_alloc_f(read_block_size * max_sample_ct * local_covar_ct * sizeof(float), &(g_local_covars_vcmaj_f[uii]))) {
+	  goto glm_logistic_ret_NOMEM;
+	}
+      } else {
+	g_local_covars_vcmaj_f[uii] = nullptr;
+      }
+    }
+
+    if (max_sample_ct > 2000000) {
+      logerrprint("Warning: --glm logistic regression is unreliable on more than ~2 million\nsamples, since it uses single-precision arithmetic.\n");
+    }
+    g_workspace_bufs = (unsigned char**)bigstack_alloc_raw_rd(calc_thread_ct * sizeof(intptr_t));
+    for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+      g_workspace_bufs[tidx] = bigstack_alloc_raw(workspace_alloc);
+    }
+    
+    const uint32_t ref_col = glm_cols & kfGlmColRef;
+    const uint32_t alt1_col = glm_cols & kfGlmColAlt1;
+    const uint32_t alt_col = glm_cols & kfGlmColAlt;
+    const uint32_t alt_ct_col = glm_cols & kfGlmColAltcount;
+    const uint32_t tot_allele_col = glm_cols & kfGlmColTotallele;
+    const uint32_t alt_ct_cc_col = glm_cols & kfGlmColAltcountcc;
+    const uint32_t tot_allele_cc_col = glm_cols & kfGlmColTotallelecc;
+    const uint32_t alt_freq_col = glm_cols & kfGlmColAltfreq;
+    const uint32_t alt_freq_cc_col = glm_cols & kfGlmColAltfreqcc;
+    const uint32_t mach_r2_col = glm_cols & kfGlmColMachR2;
+    const uint32_t firth_yn_col = (glm_cols & kfGlmColFirthYn) && is_sometimes_firth && (!is_always_firth);
+    const uint32_t nobs_col = glm_cols & kfGlmColNobs;
+    const uint32_t orbeta_col = glm_cols & (kfGlmColBeta | kfGlmColOrbeta);
+    const uint32_t report_beta_instead_of_odds_ratio = glm_cols & kfGlmColBeta;
+    const uint32_t se_col = glm_cols & kfGlmColSe;
+    const uint32_t ci_col = (ci_size != 0.0) && (glm_cols & kfGlmColCi);
+    const uint32_t t_col = glm_cols & kfGlmColT;
+    const uint32_t p_col = glm_cols & kfGlmColP;
+    cswritep = (char*)overflow_buf;
+    *cswritep++ = '#';
+    if (chr_col) {
+      cswritep = strcpya(cswritep, "CHROM\t");
+    }
+    if (variant_bps) {
+      cswritep = strcpya(cswritep, "POS\t");
+    }
+    cswritep = strcpya(cswritep, "ID");
+    if (ref_col) {
+      cswritep = strcpya(cswritep, "\tREF");
+    }
+    if (alt1_col) {
+      cswritep = strcpya(cswritep, "\tALT1");
+    }
+    if (alt_col) {
+      cswritep = strcpya(cswritep, "\tALT");
+    }
+    if (alt_ct_col) {
+      cswritep = strcpya(cswritep, "\tALT_CT");
+    }
+    if (tot_allele_col) {
+      cswritep = strcpya(cswritep, "\tALLELE_CT");
+    }
+    if (alt_ct_cc_col) {
+      cswritep = strcpya(cswritep, "\tALT_CASE_CT\tALT_CTRL_CT");
+    }
+    if (tot_allele_cc_col) {
+      cswritep = strcpya(cswritep, "\tCASE_ALLELE_CT\tCTRL_ALLELE_CT");
+    }
+    if (alt_freq_col) {
+      cswritep = strcpya(cswritep, "\tALT_FREQ");
+    }
+    if (alt_freq_cc_col) {
+      cswritep = strcpya(cswritep, "\tALT_CASE_FREQ\tALT_CTRL_FREQ");
+    }
+    if (mach_r2_col) {
+      cswritep = strcpya(cswritep, "\tMACH_R2");
+    }
+    if (firth_yn_col) {
+      cswritep = strcpya(cswritep, "\tFIRTH?");
+    }
+    if (test_col) {
+      cswritep = strcpya(cswritep, "\tTEST");
+    }
+    if (nobs_col) {
+      cswritep = strcpya(cswritep, "\tOBS_CT");
+    }
+    if (orbeta_col) {
+      if (report_beta_instead_of_odds_ratio) {
+	cswritep = strcpya(cswritep, "\tBETA");
+      } else {
+	cswritep = strcpya(cswritep, "\tOR");
+      }
+    }
+    if (se_col) {
+      cswritep = strcpya(cswritep, "\tSE");
+    }
+    double ci_zt = 0.0;
+    if (ci_col) {
+      cswritep = strcpya(cswritep, "\tL");
+      cswritep = dtoa_g(ci_size * 100, cswritep);
+      cswritep = strcpya(cswritep, "\tU");
+      cswritep = dtoa_g(ci_size * 100, cswritep);
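+      // ltqnorm() is the inverse CDF of the standard normal; a two-sided
+      // interval covering ci_size of the mass needs the (ci_size + 1) / 2
+      // quantile (e.g. ci_size = 0.95 -> z ~= 1.96).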
+      ci_zt = ltqnorm((ci_size + 1.0) * 0.5);
+    }
+    if (t_col) {
+      if (!constraint_ct) {
+        cswritep = strcpya(cswritep, "\tT_STAT");
+      } else {
+	// chisq for joint tests.  may switch to F-statistic (just divide by
+	// df; the hard part there is porting a function to convert that to a
+	// p-value)
+        cswritep = strcpya(cswritep, "\tT_OR_CHISQ_STAT");
+      }
+    }
+    if (p_col) {
+      cswritep = strcpya(cswritep, "\tP");
+    }
+    append_binary_eoln(&cswritep);
+
+    // Main workflow:
+    // 1. Set n=0, load/skip block 0
+    //
+    // 2. Spawn threads processing block n
+    // 3. If n>0, write results for block (n-1)
+    // 4. Increment n by 1
+    // 5. Load/skip block n unless eof
+    // 6. Join threads
+    // 7. Goto step 2 unless eof
+    //
+    // 8. Write results for last block
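+    //
+    // Blocks are double-buffered (see the 'parity' flip below): while the
+    // worker threads regress block n, the main thread writes block n-1's
+    // results, overlapping computation with compressed output.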
+    const uint32_t read_block_sizel = BITCT_TO_WORDCT(read_block_size);
+    const uint32_t read_block_ct_m1 = (raw_variant_ct - 1) / read_block_size;
+    uint32_t parity = 0;
+    uint32_t read_block_idx = 0;
+    uint32_t write_variant_uidx = 0;
+    uint32_t chr_fo_idx = 0xffffffffU;
+    uint32_t chr_end = 0;
+    uint32_t chr_buf_blen = 0;
+    uint32_t suppress_mach_r2 = 0;
+
+    // todo: --tests
+    uint32_t cur_reported_test_ct = 0;
+    uint32_t primary_reported_test_idx = include_intercept;
+    uint32_t cur_constraint_ct = 0;
+
+    char** cur_test_names = nullptr;
+    uint32_t prev_block_variant_ct = 0;
+    uint32_t variant_idx = 0;
+    uint32_t cur_read_block_size = read_block_size;
+    uint32_t pct = 0;
+    uint32_t next_print_variant_idx = variant_ct / 100;
+    uint32_t cur_allele_ct = 2;
+    LOGPRINTFWW5("--glm %s regression on phenotype '%s': ", is_always_firth? "Firth" : (is_sometimes_firth? "logistic-Firth hybrid" : "logistic"), cur_pheno_name);
+    fputs("0%", stdout);
+    fflush(stdout);
+    while (1) {
+      uintptr_t cur_block_variant_ct = 0;
+      if (!ts.is_last_block) {
+	while (read_block_idx < read_block_ct_m1) {
+	  cur_block_variant_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), read_block_sizel);
+	  if (cur_block_variant_ct) {
+	    break;
+	  }
+	  ++read_block_idx;
+	}
+	if (read_block_idx == read_block_ct_m1) {
+	  cur_read_block_size = raw_variant_ct - (read_block_idx * read_block_size);
+	  cur_block_variant_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), BITCT_TO_WORDCT(cur_read_block_size));
+	}
+	if (pgfi_multiread(variant_include, read_block_idx * read_block_size, read_block_idx * read_block_size + cur_read_block_size, cur_block_variant_ct, pgfip)) {
+	  goto glm_logistic_ret_READ_FAIL;
+	}
+	if (gz_local_covar_file) {
+	  reterr = read_local_covar_block(g_sample_include, g_sample_include_x, g_sample_include_y, g_sample_include_cumulative_popcounts, g_sample_include_x_cumulative_popcounts, g_sample_include_y_cumulative_popcounts, cip, variant_include, local_sample_uidx_order, local_variant_include, sample_ct, sample_ct_x, sample_ct_y, read_block_idx * read_block_size, read_block_idx * read_block_size + cur_read_block_size, cur_block_variant_ct, local_sample_ct, local_covar_ct, (glm_info_ptr->flags / kfG [...]
+	  if (reterr) {
+	    goto glm_logistic_ret_1;
+	  }
+	}
+      }
+      if (variant_idx) {
+	join_threads3z(&ts);
+	reterr = g_error_ret;
+	if (reterr) {
+	  if (reterr == kPglRetMalformedInput) {
+	    logprint("\n");
+	    logerrprint("Error: Malformed .pgen file.\n");
+	  }
+	  goto glm_logistic_ret_1;
+	}
+      }
+      if (!ts.is_last_block) {
+	g_cur_block_variant_ct = cur_block_variant_ct;
+	const uint32_t uidx_start = read_block_idx * read_block_size;
+	compute_uidx_start_partition(variant_include, cur_block_variant_ct, calc_thread_ct, uidx_start, g_read_variant_uidx_starts);
+	for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	  g_pgr_ptrs[tidx]->fi.block_base = pgfip->block_base;
+	  g_pgr_ptrs[tidx]->fi.block_offset = pgfip->block_offset;
+	}
+	g_logistic_block_aux = logistic_block_aux_bufs[parity];
+	g_block_beta_se = block_beta_se_bufs[parity];
+	ts.is_last_block = (variant_idx + cur_block_variant_ct == variant_ct);
+	ts.thread_func_ptr = glm_logistic_thread;
+	if (spawn_threads3z(variant_idx, &ts)) {
+	  goto glm_logistic_ret_THREAD_CREATE_FAIL;
+	}
+      }
+      parity = 1 - parity;
+      if (variant_idx) {
+	// write *previous* block results
+	const double* cur_block_beta_se = block_beta_se_bufs[parity];
+	const logistic_aux_result_t* cur_block_aux = logistic_block_aux_bufs[parity];
+	const uint32_t variant_idx_start = variant_idx - prev_block_variant_ct;
+	double* cur_pval_write = orig_pvals? (&(orig_pvals[variant_idx_start])) : nullptr;
+	double* cur_chisq_write = orig_chisq? (&(orig_chisq[variant_idx_start])) : nullptr;
+	for (uint32_t variant_bidx = 0; variant_bidx < prev_block_variant_ct; ++variant_bidx, ++write_variant_uidx) {
+	  next_set_unsafe_ck(variant_include, &write_variant_uidx);
+	  if (write_variant_uidx >= chr_end) {
+	    do {
+	      ++chr_fo_idx;
+	      chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	    } while (write_variant_uidx >= chr_end);
+	    const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	    suppress_mach_r2 = 1;
+	    if ((chr_idx == ((uint32_t)x_code)) && sample_ct_x) {
+	      cur_reported_test_ct = reported_test_ct_x;
+	      cur_constraint_ct = constraint_ct_x;
+	      cur_test_names = test_names_x;
+	    } else if ((chr_idx == ((uint32_t)y_code)) && sample_ct_y) {
+	      cur_reported_test_ct = reported_test_ct_y;
+	      cur_constraint_ct = constraint_ct_y;
+	      cur_test_names = test_names_y;
+	    } else {
+	      cur_reported_test_ct = reported_test_ct;
+	      cur_constraint_ct = constraint_ct;
+	      cur_test_names = test_names;
+	      if ((chr_idx != ((uint32_t)x_code)) && (chr_idx != ((uint32_t)mt_code)) && (!is_set(cip->haploid_mask, chr_idx))) {
+		suppress_mach_r2 = 0;
+	      }
+	    }
+	    if (cur_constraint_ct) {
+	      primary_reported_test_idx = reported_test_ct - 1;
+	    }
+	    if (chr_col) {
+	      char* chr_name_end = chr_name_write(cip, chr_idx, chr_buf);
+	      *chr_name_end = '\t';
+	      chr_buf_blen = 1 + (uintptr_t)(chr_name_end - chr_buf);
+	    }
+	  }
+	  const double* beta_se_iter = &(cur_block_beta_se[2 * max_reported_test_ct * variant_bidx]);
+	  const double primary_beta = beta_se_iter[primary_reported_test_idx * 2];
+	  const double primary_se = beta_se_iter[primary_reported_test_idx * 2 + 1];
+	  const uint32_t is_invalid = (primary_se == -9);
+	  if (is_invalid && valid_variants) {
+	    CLEAR_BIT(write_variant_uidx, valid_variants);
+	  }
+	  if (pfilter != 2.0) {
+	    double primary_pval = 2.0;
+	    if (!is_invalid) {
+	      if (!cur_constraint_ct) {
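+		// Wald test: under the null, (beta / se)^2 is chi-square
+		// distributed with 1 df, matching the two-sided p-value of
+		// the corresponding z-statistic.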
+		double primary_tstat = primary_beta / primary_se;
+		// could precompute a tstat threshold instead
+		primary_pval = chiprob_p(primary_tstat * primary_tstat, 1);
+	      } else {
+		// possible todo: support for F-distribution p-values instead
+		// of asymptotic chi-square p-values
+		primary_pval = chiprob_p(primary_se, cur_constraint_ct);
+	      }
+	    }
+	    if (primary_pval > pfilter) {
+	      if (cur_pval_write) {
+		cur_pval_write[variant_bidx] = -9;
+	      }
+	      if (cur_chisq_write) {
+		cur_chisq_write[variant_bidx] = -9;
+	      }
+	      continue;
+	    }
+	  }
+	  const logistic_aux_result_t* auxp = &(cur_block_aux[variant_bidx]);
+	  uintptr_t variant_allele_idx_base = write_variant_uidx * 2;
+	  if (variant_allele_idxs) {
+	    variant_allele_idx_base = variant_allele_idxs[write_variant_uidx];
+	    cur_allele_ct = variant_allele_idxs[write_variant_uidx + 1] - variant_allele_idxs[write_variant_uidx];
+	  }
+	  char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+	  // possible todo: make number-to-string operations, strlen(), etc.
+	  //   happen only once per variant.
+	  for (uint32_t test_idx = 0; test_idx < cur_reported_test_ct; ++test_idx) {
+	    if (chr_col) {
+	      cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
+	    }
+	    if (variant_bps) {
+	      cswritep = uint32toa_x(variant_bps[write_variant_uidx], '\t', cswritep);
+	    }
+	    cswritep = strcpya(cswritep, variant_ids[write_variant_uidx]);
+	    if (ref_col) {
+	      *cswritep++ = '\t';
+	      cswritep = strcpya(cswritep, cur_alleles[0]);
+	    }
+	    if (alt1_col) {
+	      *cswritep++ = '\t';
+	      cswritep = strcpya(cswritep, cur_alleles[1]);
+	    }
+	    if (alt_col) {
+	      *cswritep++ = '\t';
+	      for (uint32_t allele_idx = 1; allele_idx < cur_allele_ct; ++allele_idx) {
+		if (cswrite(&css, &cswritep)) {
+		  goto glm_logistic_ret_WRITE_FAIL;
+		}
+		cswritep = strcpyax(cswritep, cur_alleles[allele_idx], ',');
+	      }
+	      --cswritep;
+	    }
+	    if (alt_ct_col) {
+	      *cswritep++ = '\t';
+	      cswritep = dtoa_g(auxp->alt_dosage, cswritep);
+	    }
+	    if (tot_allele_col) {
+	      *cswritep++ = '\t';
+	      cswritep = uint32toa(auxp->allele_obs_ct, cswritep);
+	    }
+	    if (alt_ct_cc_col) {
+	      *cswritep++ = '\t';
+	      cswritep = dtoa_g(auxp->alt_case_dosage, cswritep);
+	      *cswritep++ = '\t';
+	      cswritep = dtoa_g(auxp->alt_dosage - auxp->alt_case_dosage, cswritep);
+	    }
+	    if (tot_allele_cc_col) {
+	      *cswritep++ = '\t';
+	      cswritep = uint32toa_x(auxp->case_allele_obs_ct, '\t', cswritep);
+	      cswritep = uint32toa(auxp->allele_obs_ct - auxp->case_allele_obs_ct, cswritep);
+	    }
+	    if (alt_freq_col) {
+	      *cswritep++ = '\t';
+	      cswritep = dtoa_g(auxp->alt_dosage / ((double)auxp->allele_obs_ct), cswritep);
+	    }
+	    if (alt_freq_cc_col) {
+	      *cswritep++ = '\t';
+	      cswritep = dtoa_g(auxp->alt_case_dosage / ((double)auxp->case_allele_obs_ct), cswritep);
+	      *cswritep++ = '\t';
+	      cswritep = dtoa_g((auxp->alt_dosage - auxp->alt_case_dosage) / ((double)(auxp->allele_obs_ct - auxp->case_allele_obs_ct)), cswritep);
+	    }
+	    if (mach_r2_col) {
+	      *cswritep++ = '\t';
+	      if (!suppress_mach_r2) {
+		cswritep = dtoa_g(auxp->mach_r2, cswritep);
+	      } else {
+		cswritep = strcpya(cswritep, "NA");
+	      }
+	    }
+	    if (firth_yn_col) {
+	      *cswritep++ = '\t';
+	      // 'Y' - 'N' = 11
+	      *cswritep++ = 'N' + 11 * auxp->firth_fallback;
+	    }
+	    if (test_col) {
+	      *cswritep++ = '\t';
+	      cswritep = strcpya(cswritep, cur_test_names[test_idx]);
+	    }
+	    if (nobs_col) {
+	      *cswritep++ = '\t';
+	      cswritep = uint32toa(auxp->sample_obs_ct, cswritep);
+	    }
+	    double pval = -9;
+	    double tstat = 0.0;
+	    if ((!cur_constraint_ct) || (test_idx != primary_reported_test_idx)) {
+	      double beta = *beta_se_iter++;
+	      double se = *beta_se_iter++;
+	      if (!is_invalid) {
+		tstat = beta / se;
+		pval = chiprob_p(tstat * tstat, 1);
+	      }
+	      if (orbeta_col) {
+		*cswritep++ = '\t';
+		if (!is_invalid) {
+		  cswritep = dtoa_g(report_beta_instead_of_odds_ratio? beta : exp(beta), cswritep);
+		} else {
+		  cswritep = strcpya(cswritep, "NA");
+		}
+	      }
+	      if (se_col) {
+		*cswritep++ = '\t';
+		if (!is_invalid) {
+		  cswritep = dtoa_g(se, cswritep);
+		} else {
+		  cswritep = strcpya(cswritep, "NA");
+		}
+	      }
+	      if (ci_col) {
+		*cswritep++ = '\t';
+		if (!is_invalid) {
+		  const double ci_halfwidth = ci_zt * se;
+		  if (report_beta_instead_of_odds_ratio) {
+		    cswritep = dtoa_g(beta - ci_halfwidth, cswritep);
+		    *cswritep++ = '\t';
+		    cswritep = dtoa_g(beta + ci_halfwidth, cswritep);
+		  } else {
+		    cswritep = dtoa_g(exp(beta - ci_halfwidth), cswritep);
+		    *cswritep++ = '\t';
+		    cswritep = dtoa_g(exp(beta + ci_halfwidth), cswritep);
+		  }
+		} else {
+		  cswritep = strcpya(cswritep, "NA\tNA");
+		}
+	      }
+	      if (t_col) {
+		*cswritep++ = '\t';
+		if (!is_invalid) {
+		  cswritep = dtoa_g(tstat, cswritep);
+		} else {
+		  cswritep = strcpya(cswritep, "NA");
+		}
+	      }
+	    } else {
+	      // joint test: use (currently approximate) F-test instead of
+	      // t-test
+	      // beta_se_iter = &(beta_se_iter[2]);
+	      if (orbeta_col) {
+		cswritep = memcpyl3a(cswritep, "\tNA");
+	      }
+	      if (se_col) {
+		cswritep = memcpyl3a(cswritep, "\tNA");
+	      }
+	      if (ci_col) {
+		cswritep = strcpya(cswritep, "\tNA\tNA");
+	      }
+	      if (t_col) {
+		*cswritep++ = '\t';
+		if (!is_invalid) {
+		  cswritep = dtoa_g(primary_se, cswritep);
+		} else {
+		  cswritep = strcpya(cswritep, "NA");
+		}
+	      }
+	      // could avoid recomputing
+	      if (!is_invalid) {
+		pval = chiprob_p(primary_se, cur_constraint_ct);
+	      }
+	    }
+	    if (p_col) {
+	      *cswritep++ = '\t';
+	      if (!is_invalid) {
+		cswritep = dtoa_g(MAXV(pval, output_min_p), cswritep);
+	      } else {
+		cswritep = strcpya(cswritep, "NA");
+	      }
+	    }
+	    append_binary_eoln(&cswritep);
+	    if (cswrite(&css, &cswritep)) {
+	      goto glm_logistic_ret_WRITE_FAIL;
+	    }
+	    if (test_idx == primary_reported_test_idx) {
+	      if (cur_pval_write) {
+		cur_pval_write[variant_bidx] = pval;
+	      }
+	      if (cur_chisq_write) {
+		if (!is_invalid) {
+		  if (!cur_constraint_ct) {
+		    cur_chisq_write[variant_bidx] = tstat * tstat;
+		  } else {
+		    cur_chisq_write[variant_bidx] = primary_se;
+		  }
+		} else {
+		  cur_chisq_write[variant_bidx] = -9;
+		}
+	      }
+	    }
+	  }
+	}
+      }
+      if (variant_idx == variant_ct) {
+	break;
+      }
+      if (variant_idx >= next_print_variant_idx) {
+	if (pct > 10) {
+	  putc_unlocked('\b', stdout);
+	}
+	pct = (variant_idx * 100LLU) / variant_ct;
+	printf("\b\b%u%%", pct++);
+	fflush(stdout);
+	next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+      }
+      ++read_block_idx;
+      prev_block_variant_ct = cur_block_variant_ct;
+      variant_idx += cur_block_variant_ct;
+      // crucially, this is independent of the pgen_reader_t block_base
+      // pointers
+      pgfip->block_base = main_loadbufs[parity];
+    }
+    if (cswrite_close_null(&css, cswritep)) {
+      goto glm_logistic_ret_WRITE_FAIL;
+    }
+    if (pct > 10) {
+      putc_unlocked('\b', stdout);
+    }
+    fputs("\b\b", stdout);
+    LOGPRINTF("done.\n");
+    LOGPRINTF("Results written to %s .\n", outname);
+    bigstack_reset(bigstack_mark);
+  }
+  while (0) {
+  glm_logistic_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  glm_logistic_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  glm_logistic_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  glm_logistic_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  glm_logistic_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  glm_logistic_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ glm_logistic_ret_1:
+  threads3z_cleanup(&ts, &g_cur_block_variant_ct);
+  cswrite_close_cond(&css, cswritep);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+static const double kSmallDoubles[4] = {0.0, 1.0, 2.0, 3.0};
+
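+// genoarr packs genotypes as 2-bit codes, kBitsPerWordD2 of them per word
+// (0/1/2 = alt allele count, 3 = missing).  The kSmallDoubles[] lookup
+// avoids an int->double conversion in the inner loop.  Here missing codes
+// simply map to 3.0 (callers are expected to have screened them out); the
+// _remove_missing variant below drops them and returns the written count.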
+void genoarr_to_doubles(const uintptr_t* genoarr, uint32_t sample_ct, double* doublebuf) {
+  assert(sample_ct);
+  const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
+  uint32_t widx = 0;
+  uint32_t subgroup_len = kBitsPerWordD2;
+  double* doublebuf_iter = doublebuf;
+  while (1) {
+    if (widx >= sample_ctl2m1) {
+      if (widx > sample_ctl2m1) {
+	return;
+      }
+      subgroup_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t geno_word = genoarr[widx];
+    for (uint32_t uii = 0; uii < subgroup_len; ++uii) {
+      const uintptr_t cur_geno = geno_word & 3;
+      // *doublebuf_iter++ = (double)((int32_t)cur_geno);
+      *doublebuf_iter++ = kSmallDoubles[cur_geno];
+      geno_word >>= 2;
+    }
+    ++widx;
+  }
+}
+
+uint32_t genoarr_to_doubles_remove_missing(const uintptr_t* genoarr, uint32_t sample_ct, double* doublebuf) {
+  assert(sample_ct);
+  const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
+  uint32_t widx = 0;
+  uint32_t subgroup_len = kBitsPerWordD2;
+  double* doublebuf_iter = doublebuf;
+  while (1) {
+    if (widx >= sample_ctl2m1) {
+      if (widx > sample_ctl2m1) {
+	return (uintptr_t)(doublebuf_iter - doublebuf);
+      }
+      subgroup_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t geno_word = genoarr[widx];
+    for (uint32_t uii = 0; uii < subgroup_len; ++uii) {
+      const uintptr_t cur_geno = geno_word & 3;
+      if (cur_geno < 3) {
+	// *doublebuf_iter++ = (double)((int32_t)cur_geno);
+	*doublebuf_iter++ = kSmallDoubles[cur_geno];
+      }
+      geno_word >>= 2;
+    }
+    ++widx;
+  }
+}
+
+uintptr_t get_linear_workspace_size(uint32_t sample_ct, uint32_t predictor_ct, uint32_t constraint_ct, uint32_t genod_buffer_needed) {
+  // sample_ct * predictor_ct < 2^31, and sample_ct >= predictor_ct, so no
+  // overflows
+  // could round everything up to multiples of 16 instead of 64
+
+  // sample_nm, male_nm = sample_ctl words
+  uintptr_t workspace_size = 2 * round_up_pow2(BITCT_TO_WORDCT(sample_ct) * sizeof(intptr_t), kCacheline);
+  
+  // nm_pheno_buf = sample_ct doubles
+  workspace_size += round_up_pow2(sample_ct * sizeof(double), kCacheline);
+  
+  // predictors_pmaj = (predictor_ct + genod_buffer_needed) * sample_ct doubles
+  workspace_size += round_up_pow2((predictor_ct + genod_buffer_needed) * sample_ct * sizeof(double), kCacheline);
+
+  // xtx_inv, dbl_2d_buf = predictor_ct * predictor_ct doubles
+  workspace_size += 2 * round_up_pow2(predictor_ct * predictor_ct * sizeof(double), kCacheline);
+
+  // fitted_coefs, xt_y = predictor_ct doubles
+  workspace_size += 2 * round_up_pow2(predictor_ct * sizeof(double), kCacheline);
+
+#ifdef NOLAPACK
+  // mi_buf = constraint_ct * kMatrixInvertBuf1CheckedAlloc bytes
+  workspace_size += round_up_pow2(constraint_ct * kMatrixInvertBuf1CheckedAlloc, kCacheline);
+#endif
+  if (constraint_ct) {
+    // tmphxs_buf, h_transpose_buf = constraint_ct * predictor_ct doubles
+    workspace_size += 2 * round_up_pow2(constraint_ct * predictor_ct * sizeof(double), kCacheline);
+
+    // inner_buf = constraint_ct * constraint_ct
+    workspace_size += round_up_pow2(constraint_ct * constraint_ct * sizeof(double), kCacheline);
+
+#ifndef NOLAPACK
+    // mi_buf = constraint_ct * kMatrixInvertBuf1CheckedAlloc bytes
+    workspace_size += round_up_pow2(constraint_ct * kMatrixInvertBuf1CheckedAlloc, kCacheline);
+#endif
+  }
+  return workspace_size;  
+}
+
+THREAD_FUNC_DECL glm_linear_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  pgen_reader_t* pgrp = g_pgr_ptrs[tidx];
+  uintptr_t* genovec = g_genovecs[tidx];
+  uintptr_t* dosage_present = nullptr;
+  dosage_t* dosage_vals = nullptr;
+  if (g_dosage_presents) {
+    dosage_present = g_dosage_presents[tidx];
+    dosage_vals = g_dosage_val_bufs[tidx];
+  }
+  unsigned char* workspace_buf = g_workspace_bufs[tidx];
+  const uintptr_t* variant_include = g_variant_include;
+  const uintptr_t* sex_male_collapsed = g_sex_male_collapsed;
+  const chr_info_t* cip = g_cip;
+  const uint32_t* subset_chr_fo_vidx_start = g_subset_chr_fo_vidx_start;
+  // const uint32_t raw_sample_ct = g_raw_sample_ct;
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  const glm_flags_t glm_flags = g_glm_flags;
+  const uint32_t add_interactions = (glm_flags / kfGlmInteraction) & 1;
+  const uint32_t hide_covar = (glm_flags / kfGlmHideCovar) & 1;
+  const uint32_t include_intercept = (glm_flags / kfGlmIntercept) & 1;
+  const uint32_t model_dominant = (glm_flags / kfGlmDominant) & 1;
+  const uint32_t model_recessive = (glm_flags / kfGlmRecessive) & 1;
+  const uint32_t joint_genotypic = (glm_flags / kfGlmGenotypic) & 1;
+  const uint32_t joint_hethom = (glm_flags / kfGlmHethom) & 1;
+  const uint32_t domdev_present = joint_genotypic || joint_hethom;
+  const uint32_t domdev_present_p1 = domdev_present + 1;  
+  const uint32_t reported_pred_uidx_start = 1 - include_intercept;
+  const int32_t x_code = cip->xymt_codes[kChrOffsetX];
+  const int32_t y_code = cip->xymt_codes[kChrOffsetY];
+  const uint32_t is_xchr_model_1 = g_is_xchr_model_1;
+  const uintptr_t max_reported_test_ct = g_max_reported_test_ct;
+  const uintptr_t local_covar_ct = g_local_covar_ct;
+  uintptr_t max_sample_ct = MAXV(g_sample_ct, g_sample_ct_x);
+  if (max_sample_ct < g_sample_ct_y) {
+    max_sample_ct = g_sample_ct_y;
+  }
+  uint32_t variant_idx_offset = 0;
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uintptr_t cur_block_variant_ct = g_cur_block_variant_ct;
+    uint32_t variant_bidx = (tidx * cur_block_variant_ct) / calc_thread_ct;
+    const uint32_t variant_bidx_end = ((tidx + 1) * cur_block_variant_ct) / calc_thread_ct;
+    uint32_t variant_uidx = g_read_variant_uidx_starts[tidx];
+    double* beta_se_iter = &(g_block_beta_se[2 * max_reported_test_ct * variant_bidx]);
+    linear_aux_result_t* block_aux_iter = &(g_linear_block_aux[variant_bidx]);
+    const double* local_covars_iter = nullptr;
+    if (local_covar_ct) {
+      // &(nullptr[0]) is okay in C++, but undefined in C
+      local_covars_iter = &(g_local_covars_vcmaj_d[parity][variant_bidx * max_sample_ct * local_covar_ct]);
+    }
+    while (variant_bidx < variant_bidx_end) {
+      const uint32_t variant_idx = variant_bidx + variant_idx_offset;
+      const uint32_t chr_fo_idx = uint32arr_greater_than(&(subset_chr_fo_vidx_start[1]), cip->chr_ct, variant_idx + 1);
+      const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+      uint32_t cur_variant_bidx_end = subset_chr_fo_vidx_start[chr_fo_idx + 1] - variant_idx_offset;
+      if (cur_variant_bidx_end > variant_bidx_end) {
+	cur_variant_bidx_end = variant_bidx_end;
+      }
+      const uint32_t is_x = (chr_idx == x_code);
+      const uint32_t is_y = (chr_idx == y_code);
+      const uint32_t is_nonx_haploid = (!is_x) && is_set(cip->haploid_mask, chr_idx);
+      const uintptr_t* cur_sample_include;
+      const uint32_t* cur_sample_include_cumulative_popcounts;
+      const double* cur_pheno;
+      const double* cur_covars_cmaj;
+      const uintptr_t* cur_parameter_subset;
+      const double* cur_constraints_con_major;
+      uint32_t cur_sample_ct;
+      uint32_t cur_covar_ct;
+      uint32_t cur_constraint_ct;
+      uint32_t primary_pred_idx = include_intercept;
+      if (is_y && g_sample_include_y) {
+	cur_sample_include = g_sample_include_y;
+	cur_sample_include_cumulative_popcounts = g_sample_include_y_cumulative_popcounts;
+	cur_pheno = g_pheno_y_d;
+	cur_covars_cmaj = g_covars_cmaj_y_d;
+	cur_parameter_subset = g_parameter_subset_y;
+	cur_constraints_con_major = g_constraints_con_major_y;
+	cur_sample_ct = g_sample_ct_y;
+	cur_covar_ct = g_covar_ct_y;
+	cur_constraint_ct = g_constraint_ct_y;
+      } else if (is_x && g_sample_include_x) {
+	cur_sample_include = g_sample_include_x;
+	cur_sample_include_cumulative_popcounts = g_sample_include_x_cumulative_popcounts;
+	cur_pheno = g_pheno_x_d;
+	cur_covars_cmaj = g_covars_cmaj_x_d;
+	cur_parameter_subset = g_parameter_subset_x;
+	cur_constraints_con_major = g_constraints_con_major_x;
+	cur_sample_ct = g_sample_ct_x;
+	cur_covar_ct = g_covar_ct_x;
+	cur_constraint_ct = g_constraint_ct_x;
+      } else {
+	cur_sample_include = g_sample_include;
+	cur_sample_include_cumulative_popcounts = g_sample_include_cumulative_popcounts;
+	cur_pheno = g_pheno_d;
+	cur_covars_cmaj = g_covars_cmaj_d;
+	cur_parameter_subset = g_parameter_subset;
+	cur_constraints_con_major = g_constraints_con_major;
+	cur_sample_ct = g_sample_ct;
+	cur_covar_ct = g_covar_ct;
+	cur_constraint_ct = g_constraint_ct;
+      }
+      const uint32_t sample_ctl = BITCT_TO_WORDCT(cur_sample_ct);
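+      // base predictor layout: intercept, genotype, optional dominance
+      // deviation, one column per covariate, then (with 'interaction') a
+      // genotype x covariate term per covariate, plus a domdev x covariate
+      // term when the dominance column is present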
+      const uint32_t cur_predictor_ct_base = 2 + domdev_present + cur_covar_ct * (1 + add_interactions * domdev_present_p1);
+      uint32_t cur_predictor_ct = cur_predictor_ct_base;
+      if (cur_parameter_subset) {
+	cur_predictor_ct = popcount_longs(cur_parameter_subset, BITCT_TO_WORDCT(cur_predictor_ct_base));
+      }
+      uint32_t reported_pred_uidx_end;
+      if (hide_covar) {
+	if (!cur_parameter_subset) {
+	  reported_pred_uidx_end = 2 + domdev_present;
+	} else {
+	  reported_pred_uidx_end = 1 + is_set(cur_parameter_subset, 1) + domdev_present;
+	}
+      } else {
+	reported_pred_uidx_end = cur_predictor_ct;
+      }
+      // todo: --tests
+      if (cur_constraint_ct) {
+	primary_pred_idx = reported_pred_uidx_end - reported_pred_uidx_start;
+      }
+      const uint32_t genod_buffer_needed = cur_parameter_subset && (!is_set(cur_parameter_subset, 1));
+      unsigned char* workspace_iter = workspace_buf;
+      uintptr_t* sample_nm = (uintptr_t*)arena_alloc_raw_rd(sample_ctl * sizeof(intptr_t), &workspace_iter);
+      uintptr_t* male_nm = (uintptr_t*)arena_alloc_raw_rd(sample_ctl * sizeof(intptr_t), &workspace_iter);
+      double* nm_pheno_buf = (double*)arena_alloc_raw_rd(cur_sample_ct * sizeof(double), &workspace_iter);
+      double* nm_predictors_pmaj_buf = (double*)arena_alloc_raw_rd((cur_predictor_ct + genod_buffer_needed) * cur_sample_ct * sizeof(double), &workspace_iter);
+      double* xtx_inv = (double*)arena_alloc_raw_rd(cur_predictor_ct * cur_predictor_ct * sizeof(double), &workspace_iter);
+      double* fitted_coefs = (double*)arena_alloc_raw_rd(cur_predictor_ct * sizeof(double), &workspace_iter);
+      double* xt_y = (double*)arena_alloc_raw_rd(cur_predictor_ct * sizeof(double), &workspace_iter);
+      double* dbl_2d_buf = (double*)arena_alloc_raw_rd(cur_predictor_ct * cur_predictor_ct * sizeof(double), &workspace_iter);
+      
+      // joint test only
+      matrix_invert_buf1_t* inv_1d_buf = nullptr;
+      double* tmphxs_buf = nullptr;
+      double* h_transpose_buf = nullptr;
+      double* inner_buf = nullptr;
+#ifdef NOLAPACK
+      // (well, except if LAPACK is missing)
+      inv_1d_buf = (matrix_invert_buf1_t*)arena_alloc_raw_rd(cur_predictor_ct * kMatrixInvertBuf1CheckedAlloc, &workspace_iter);
+#endif
+      if (cur_constraint_ct) {
+#ifndef NOLAPACK
+	inv_1d_buf = (matrix_invert_buf1_t*)arena_alloc_raw_rd(cur_predictor_ct * kMatrixInvertBuf1CheckedAlloc, &workspace_iter);
+#endif
+	tmphxs_buf = (double*)arena_alloc_raw_rd(cur_constraint_ct * cur_predictor_ct * sizeof(double), &workspace_iter);
+	h_transpose_buf = (double*)arena_alloc_raw_rd(cur_constraint_ct * cur_predictor_ct * sizeof(double), &workspace_iter);
+	inner_buf = (double*)arena_alloc_raw_rd(cur_constraint_ct * cur_constraint_ct * sizeof(double), &workspace_iter);
+      }
+      assert((uintptr_t)(workspace_iter - workspace_buf) == get_linear_workspace_size(cur_sample_ct, cur_predictor_ct, cur_constraint_ct, genod_buffer_needed));
+      double pheno_ssq_base = 0.0;
+      for (uint32_t sample_idx = 0; sample_idx < cur_sample_ct; ++sample_idx) {
+	pheno_ssq_base += cur_pheno[sample_idx] * cur_pheno[sample_idx];
+      }
+      pgr_clear_ld_cache(pgrp);
+      uint32_t genocounts[4];
+      for (; variant_bidx < cur_variant_bidx_end; ++variant_bidx, ++variant_uidx) {
+	next_set_unsafe_ck(variant_include, &variant_uidx);
+	{
+	  uint32_t dosage_ct;
+	  uint32_t is_explicit_alt1;
+	  pglerr_t reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(cur_sample_include, cur_sample_include_cumulative_popcounts, cur_sample_ct, variant_uidx, pgrp, genovec, dosage_present, dosage_vals, &dosage_ct, &is_explicit_alt1);
+	  if (reterr) {
+	    g_error_ret = reterr;
+	    variant_bidx = variant_bidx_end;
+	    break;
+	  }
+	  zero_trailing_quaters(cur_sample_ct, genovec);
+	  genovec_count_freqs_unsafe(genovec, cur_sample_ct, genocounts);
+	  uint32_t missing_ct = genocounts[3];
+	  if (!missing_ct) {
+	    fill_all_bits(cur_sample_ct, sample_nm);
+	  } else {
+	    genoarr_to_nonmissing(genovec, cur_sample_ct, sample_nm);
+	    if (dosage_ct) {
+	      bitvec_or(dosage_present, sample_ctl, sample_nm);
+	      missing_ct = cur_sample_ct - popcount_longs(sample_nm, sample_ctl);
+	    }
+	  }
+	  uint32_t nm_sample_ct = cur_sample_ct - missing_ct;
+	  // todo: alt2/alt3/etc. dosage > 0.5 -> missing
+	  const uint32_t nm_sample_ctl = BITCT_TO_WORDCT(nm_sample_ct);
+	  double* nm_predictors_pmaj_iter = nm_predictors_pmaj_buf;
+	  // first predictor column: intercept
+	  for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx) {
+	    *nm_predictors_pmaj_iter++ = 1.0;
+	  }
+	  // second predictor column: genotype
+	  double* genotype_vals = &(nm_predictors_pmaj_buf[nm_sample_ct]);
+	  if (genod_buffer_needed) {
+	    // special case: --parameters excludes the main genotype column,
+	    // but does care about an interaction
+	    genotype_vals = &(nm_predictors_pmaj_buf[cur_predictor_ct * nm_sample_ct]);
+	  }
+	  nm_predictors_pmaj_iter = genotype_vals;
+	  double cur_pheno_ssq = pheno_ssq_base;
+	  if (!missing_ct) {
+	    genoarr_to_doubles(genovec, nm_sample_ct, nm_predictors_pmaj_iter);
+	    if (dosage_ct) {
+	      uint32_t sample_idx = 0;
+	      for (uint32_t dosage_idx = 0; dosage_idx < dosage_ct; ++dosage_idx, ++sample_idx) {
+		next_set_unsafe_ck(dosage_present, &sample_idx);
+		// 32768 -> 2, 16384 -> 1, 0 -> 0
+		nm_predictors_pmaj_iter[sample_idx] = kRecipDosageMid * ((int32_t)((uint32_t)dosage_vals[dosage_idx]));
+	      }
+	    }
+	  } else {
+	    uint32_t sample_midx = 0;
+	    for (uint32_t missing_idx = 0; missing_idx < missing_ct; ++missing_idx, ++sample_midx) {
+	      next_unset_unsafe_ck(sample_nm, &sample_midx);
+	      cur_pheno_ssq -= cur_pheno[sample_midx] * cur_pheno[sample_midx];
+	    }
+	    if (!dosage_ct) {
+	      genoarr_to_doubles_remove_missing(genovec, cur_sample_ct, nm_predictors_pmaj_iter);
+	    } else {
+	      sample_midx = 0;
+	      uint32_t dosage_idx = 0;
+	      for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx, ++sample_midx) {
+		next_set_unsafe_ck(sample_nm, &sample_midx);
+		double cur_val;
+		if (is_set(dosage_present, sample_midx)) {
+		  cur_val = kRecipDosageMid * ((int32_t)((uint32_t)dosage_vals[dosage_idx++]));
+		} else {
+		  cur_val = (intptr_t)(GET_QUATERARR_ENTRY(genovec, sample_midx));
+		}
+	        nm_predictors_pmaj_iter[sample_idx] = cur_val;
+	      }
+	    }
+	  }
+	  nm_predictors_pmaj_iter = &(nm_predictors_pmaj_iter[nm_sample_ct]);
+	  // usually need to save some of {sample_obs_ct, allele_obs_ct,
+	  // alt_dosage, mach_r2} even for skipped variants; compute them all
+	  // for now, could conditionally skip some later
+	  block_aux_iter->sample_obs_ct = nm_sample_ct;
+	  double dosage_ceil = 2.0;
+	  if (!is_x) {
+	    if (!is_nonx_haploid) {
+	      block_aux_iter->allele_obs_ct = nm_sample_ct * 2;
+	    } else {
+	      block_aux_iter->allele_obs_ct = nm_sample_ct;
+	      // everything is on 0..1 scale, not 0..2
+	      dosage_ceil = 1.0;
+	      for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx) {
+		genotype_vals[sample_idx] *= 0.5;
+	      }
+	    }
+	  } else {
+	    copy_bitarr_subset(sex_male_collapsed, sample_nm, nm_sample_ct, male_nm);
+	    const uint32_t nm_male_ct = popcount_longs(male_nm, nm_sample_ctl);
+	    block_aux_iter->allele_obs_ct = nm_sample_ct * 2;
+	    if (is_xchr_model_1) {
+	      // special case: multiply male values by 0.5
+	      uint32_t sample_idx = 0;
+	      for (uint32_t male_idx = 0; male_idx < nm_male_ct; ++male_idx, ++sample_idx) {
+		next_set_unsafe_ck(male_nm, &sample_idx);
+		genotype_vals[sample_idx] *= 0.5;
+	      }
+	      block_aux_iter->allele_obs_ct -= nm_male_ct;
+	    }
+	  }
+	  double dosage_sum = 0.0;
+	  double dosage_ssq = 0.0;
+	  for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx) {
+	    const double cur_genotype_val = genotype_vals[sample_idx];
+	    dosage_sum += cur_genotype_val;
+	    dosage_ssq += cur_genotype_val * cur_genotype_val;
+	  }
+	  block_aux_iter->alt_dosage = dosage_sum;
+
+	  const double dosage_avg = dosage_sum / ((double)((int32_t)nm_sample_ct));
+	  const double dosage_variance = dosage_ssq - dosage_sum * dosage_avg;
+	  block_aux_iter->mach_r2 = 2 * dosage_variance / (dosage_sum * (dosage_ceil - dosage_avg));
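+	  // in the diploid case (dosage_ceil == 2, p = dosage_avg / 2), this
+	  // is the empirical dosage variance divided by its Hardy-Weinberg
+	  // expectation 2p(1-p), i.e. the usual MACH r^2 imputation-quality
+	  // statistic, algebraically rearranged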
+	  // okay, now we're free to skip the actual regression if there are
+	  // too few samples, or variant is monomorphic (or all-het)
+	  if ((nm_sample_ct <= cur_predictor_ct) || (fabs(dosage_variance) < kBigEpsilon)) {
+	    goto glm_linear_thread_skip_variant;
+	  }
+	  double* domdev_vals = nullptr;
+	  if (genod_buffer_needed) {
+	    nm_predictors_pmaj_iter = &(nm_predictors_pmaj_buf[nm_sample_ct]);
+	  } else if (joint_genotypic || joint_hethom) {
+	    // in hethom case, do this before clobbering genotype data
+	    domdev_vals = nm_predictors_pmaj_iter;
+	    for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx) {
+	      double cur_genotype_val = genotype_vals[sample_idx];
+	      if (cur_genotype_val > 1.0) {
+		cur_genotype_val = 2.0 - cur_genotype_val;
+	      }
+	      nm_predictors_pmaj_iter[sample_idx] = cur_genotype_val;
+	    }
+	    nm_predictors_pmaj_iter = &(nm_predictors_pmaj_iter[nm_sample_ct]);
+	  }
+	  if (model_dominant) {
+	    for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx) {
+	      const double cur_genotype_val = genotype_vals[sample_idx];
+	      // 0..1..1
+	      if (cur_genotype_val > 1.0) {
+		genotype_vals[sample_idx] = 1.0;
+	      }
+	    }
+	  } else if (model_recessive || joint_hethom) {
+	    for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx) {
+	      const double cur_genotype_val = genotype_vals[sample_idx];
+	      // 0..0..1
+	      if (cur_genotype_val < 1.0) {
+		genotype_vals[sample_idx] = 0.0;
+	      } else {
+		genotype_vals[sample_idx] = cur_genotype_val - 1.0;
+	      }
+	    }
+	  }
+
+	  // fill phenotype
+	  uint32_t sample_midx = 0;
+	  for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx, ++sample_midx) {
+	    next_set_unsafe_ck(sample_nm, &sample_midx);
+	    nm_pheno_buf[sample_idx] = cur_pheno[sample_midx];
+	  }
+	  
+	  // fill covariates
+	  uint32_t parameter_uidx = 2 + domdev_present;
+	  for (uint32_t covar_idx = 0; covar_idx < cur_covar_ct; ++covar_idx, ++parameter_uidx) {
+	    // strictly speaking, we don't need cur_covars_cmaj to be
+	    // vector-aligned
+	    if (cur_parameter_subset && (!is_set(cur_parameter_subset, parameter_uidx))) {
+	      continue;
+	    }
+	    const double* cur_covar_col;
+	    if (covar_idx < local_covar_ct) {
+	      cur_covar_col = &(local_covars_iter[covar_idx * max_sample_ct]);
+	    } else {
+	      cur_covar_col = &(cur_covars_cmaj[(covar_idx - local_covar_ct) * cur_sample_ct]);
+	    }
+	    sample_midx = 0;
+	    for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx, ++sample_midx) {
+	      next_set_unsafe_ck(sample_nm, &sample_midx);
+	      *nm_predictors_pmaj_iter++ = cur_covar_col[sample_midx];
+	    }
+	  }
+	  // fill interaction terms
+	  if (add_interactions) {
+	    for (uint32_t covar_idx = 0; covar_idx < cur_covar_ct; ++covar_idx) {
+	      const double* cur_covar_col;
+	      if (covar_idx < local_covar_ct) {
+	        cur_covar_col = &(local_covars_iter[covar_idx * max_sample_ct]);
+	      } else {
+		cur_covar_col = &(cur_covars_cmaj[(covar_idx - local_covar_ct) * cur_sample_ct]);
+	      }
+	      if ((!cur_parameter_subset) || is_set(cur_parameter_subset, parameter_uidx)) {
+		sample_midx = 0;
+		for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx, ++sample_midx) {
+		  next_set_unsafe_ck(sample_nm, &sample_midx);
+		  *nm_predictors_pmaj_iter++ = genotype_vals[sample_idx] * cur_covar_col[sample_midx];
+		}
+	      }
+	      ++parameter_uidx;
+	      if (domdev_present) {
+		if ((!cur_parameter_subset) || is_set(cur_parameter_subset, parameter_uidx)) {
+		  sample_midx = 0;
+		  for (uint32_t sample_idx = 0; sample_idx < nm_sample_ct; ++sample_idx, ++sample_midx) {
+		    next_set_unsafe_ck(sample_nm, &sample_midx);
+		    *nm_predictors_pmaj_iter++ = domdev_vals[sample_idx] * cur_covar_col[sample_midx];
+		  }
+		}
+		++parameter_uidx;
+	      }
+	    }
+	  }
+	  if (linear_regression_inv(nm_pheno_buf, nm_predictors_pmaj_buf, cur_predictor_ct, nm_sample_ct, fitted_coefs, xtx_inv, xt_y, inv_1d_buf, dbl_2d_buf)) {
+	    goto glm_linear_thread_skip_variant;
+	  }
+	  // RSS = y^T y - y^T X (X^T X)^{-1} X^T y
+	  //     = cur_pheno_ssq - xt_y * fitted_coefs
+	  // s^2 = RSS / df
+	  // possible todo: improve numerical stability of this computation in
+	  // non-mean-centered phenotype case
+	  double sigma = cur_pheno_ssq;
+	  for (uint32_t pred_uidx = 0; pred_uidx < cur_predictor_ct; ++pred_uidx) {
+	    sigma -= xt_y[pred_uidx] * fitted_coefs[pred_uidx];
+	  }
+	  sigma /= nm_sample_ct - cur_predictor_ct;
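+	  // scale (X^T X)^{-1} by s^2 to get the coefficient covariance
+	  // matrix; its diagonal square roots (taken below) are the reported
+	  // standard errors.  (The LAPACK inverse only fills the lower
+	  // triangle, hence the asymmetric loop bounds.)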
+	  for (uint32_t uii = 0; uii < cur_predictor_ct; ++uii) {
+	    double* s_iter = &(xtx_inv[uii * cur_predictor_ct]);
+#ifdef NOLAPACK
+	    for (uint32_t ujj = 0; ujj < cur_predictor_ct; ++ujj) {
+	      *s_iter *= sigma;
+	      ++s_iter;
+	    }
+#else
+	    for (uint32_t ujj = 0; ujj <= uii; ++ujj) {
+	      *s_iter *= sigma;
+	      ++s_iter;
+	    }
+#endif
+	  }
+	  // validParameters() check
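+	  // skip the variant if any non-intercept variance estimate is
+	  // essentially zero, or if two estimates are so correlated that a
+	  // covariance entry exceeds 0.99999 * (product of the corresponding
+	  // standard errors)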
+	  for (uint32_t pred_uidx = 1; pred_uidx < cur_predictor_ct; ++pred_uidx) {
+	    const double xtx_inv_diag_element = xtx_inv[pred_uidx * (cur_predictor_ct + 1)];
+	    if (xtx_inv_diag_element < 1e-20) {
+	      goto glm_linear_thread_skip_variant;
+	    }
+	    // use dbl_2d_buf[] to store diagonal square roots
+	    dbl_2d_buf[pred_uidx] = sqrt(xtx_inv_diag_element);
+	  }
+	  dbl_2d_buf[0] = sqrt(xtx_inv[0]);
+	  for (uint32_t pred_uidx = 1; pred_uidx < cur_predictor_ct; ++pred_uidx) {
+	    const double cur_xtx_inv_diag_sqrt = 0.99999 * dbl_2d_buf[pred_uidx];
+	    const double* xtx_inv_row = &(xtx_inv[pred_uidx * cur_predictor_ct]);
+	    for (uint32_t pred_uidx2 = 0; pred_uidx2 < pred_uidx; ++pred_uidx2) {
+	      if (xtx_inv_row[pred_uidx2] > cur_xtx_inv_diag_sqrt * dbl_2d_buf[pred_uidx2]) {
+		goto glm_linear_thread_skip_variant;
+	      }
+	    }
+	  }
+	  double* beta_se_iter2 = beta_se_iter;
+	  for (uint32_t pred_uidx = reported_pred_uidx_start; pred_uidx < reported_pred_uidx_end; ++pred_uidx) {
+	    *beta_se_iter2++ = fitted_coefs[pred_uidx];
+	    *beta_se_iter2++ = dbl_2d_buf[pred_uidx];
+	  }
+	  if (cur_constraint_ct) {
+	    *beta_se_iter2++ = 0.0;
+#ifndef NOLAPACK
+	    // xtx_inv upper triangle was not filled
+	    for (uint32_t row_idx = 0; row_idx < cur_predictor_ct; ++row_idx) {
+	      double* cur_row = &(xtx_inv[row_idx * cur_predictor_ct]);
+	      double* cur_col = &(xtx_inv[row_idx]);
+	      for (uint32_t col_idx = row_idx + 1; col_idx < cur_predictor_ct; ++col_idx) {
+		cur_row[col_idx] = cur_col[col_idx * cur_predictor_ct];
+	      }
+	    }
+#endif
+	    double chisq;
+	    if (!linear_hypothesis_chisq(fitted_coefs, cur_constraints_con_major, xtx_inv, cur_constraint_ct, cur_predictor_ct, &chisq, tmphxs_buf, h_transpose_buf, inner_buf, inv_1d_buf, dbl_2d_buf)) {
+	      *beta_se_iter2++ = chisq;
+	    } else {
+	      *beta_se_iter2++ = -9;
+	    }
+	  }
+	}
+	while (0) {
+	glm_linear_thread_skip_variant:
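+	  // a -9 in the primary SE slot is the sentinel the writer treats as
+	  // "regression skipped/failed" for this variant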
+	  beta_se_iter[primary_pred_idx * 2 + 1] = -9;
+	}
+	beta_se_iter = &(beta_se_iter[2 * max_reported_test_ct]);
+	++block_aux_iter;
+	if (local_covars_iter) {
+	  local_covars_iter = &(local_covars_iter[local_covar_ct * max_sample_ct]);
+	}
+	// todo?
+      }
+    }
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+    variant_idx_offset += cur_block_variant_ct;
+  }
+}
+
+pglerr_t glm_linear(const char* cur_pheno_name, char** test_names, char** test_names_x, char** test_names_y, const uint32_t* variant_bps, char** variant_ids, char** allele_storage, const glm_info_t* glm_info_ptr, const uint32_t* local_sample_uidx_order, const uintptr_t* local_variant_include, const char* outname, uint32_t raw_variant_ct, uint32_t max_chr_blen, double ci_size, double pfilter, double output_min_p, uint32_t max_thread_ct, uintptr_t pgr_alloc_cacheline_ct, uint32_t local_sam [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  threads_state_t ts;
+  init_threads3z(&ts);
+  pglerr_t reterr = kPglRetSuccess;
+  cswrite_init_null(&css);
+  {
+    const uintptr_t* variant_include = g_variant_include;
+    const chr_info_t* cip = g_cip;
+    const uintptr_t* variant_allele_idxs = g_variant_allele_idxs;
+
+    const uint32_t sample_ct = g_sample_ct;
+    const uint32_t sample_ct_x = g_sample_ct_x;
+    const uint32_t sample_ct_y = g_sample_ct_y;
+    const uint32_t covar_ct = g_covar_ct;
+    const uintptr_t local_covar_ct = g_local_covar_ct;
+    const uint32_t covar_ct_x = g_covar_ct_x;
+    const uint32_t covar_ct_y = g_covar_ct_y;
+
+    uint32_t max_sample_ct = MAXV(sample_ct, sample_ct_x);
+    if (max_sample_ct < sample_ct_y) {
+      max_sample_ct = sample_ct_y;
+    }
+    uint32_t* local_sample_idx_order = nullptr;
+    uint32_t local_line_idx = 0;
+    uint32_t local_xy = 0; // 1 = chrX, 2 = chrY
+    if (gz_local_covar_file) {
+      if (gzrewind(gz_local_covar_file)) {
+	goto glm_linear_ret_READ_FAIL;
+      }
+      if (bigstack_alloc_ui(local_sample_ct, &local_sample_idx_order)) {
+	goto glm_linear_ret_NOMEM;
+      }
+      for (uint32_t uii = 0; uii < local_sample_ct; ++uii) {
+	const uint32_t cur_uidx = local_sample_uidx_order[uii];
+	uint32_t cur_idx = 0xffffffffU;
+	if ((cur_uidx != 0xffffffffU) && is_set(g_sample_include, cur_uidx)) {
+	  cur_idx = raw_to_subsetted_pos(g_sample_include, g_sample_include_cumulative_popcounts, cur_uidx);
+	}
+	local_sample_idx_order[uii] = cur_idx;
+      }
+    }
+
+    const uint32_t variant_ct = g_variant_ct;
+
+    const glm_flags_t glm_flags = glm_info_ptr->flags;
+    const uint32_t output_zst = (glm_flags / kfGlmZs) & 1;
+    if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+      goto glm_linear_ret_OPEN_FAIL;
+    }
+    const uint32_t add_interactions = (glm_flags / kfGlmInteraction) & 1;
+    const uint32_t domdev_present = (glm_flags & (kfGlmGenotypic | kfGlmHethom))? 1 : 0;
+    const uint32_t domdev_present_p1 = domdev_present + 1;
+
+    // todo: --tests
+    const uint32_t constraint_ct = g_constraint_ct;
+    const uint32_t constraint_ct_x = g_constraint_ct_x;
+    const uint32_t constraint_ct_y = g_constraint_ct_y;
+    
+    uint32_t predictor_ct = 2 + domdev_present + covar_ct * (1 + add_interactions * domdev_present_p1);
+    uint32_t predictor_ct_x = 2 + domdev_present + covar_ct_x * (1 + add_interactions * domdev_present_p1);
+    uint32_t predictor_ct_y = 2 + domdev_present + covar_ct_y * (1 + add_interactions * domdev_present_p1);
+    const uintptr_t* parameter_subset = g_parameter_subset;
+    const uintptr_t* parameter_subset_x = g_parameter_subset_x;
+    const uintptr_t* parameter_subset_y = g_parameter_subset_y;
+    if (parameter_subset) {
+      predictor_ct = popcount_longs(parameter_subset, BITCT_TO_WORDCT(predictor_ct));
+      if (sample_ct_x) {
+	predictor_ct_x = popcount_longs(parameter_subset_x, BITCT_TO_WORDCT(predictor_ct_x));
+      } else {
+	predictor_ct_x = 0;
+      }
+      if (sample_ct_y) {
+	predictor_ct_y = popcount_longs(parameter_subset_y, BITCT_TO_WORDCT(predictor_ct_y));
+      } else {
+	predictor_ct_y = 0;
+      }
+    }
+    uint32_t reported_test_ct = get_reported_test_ct(parameter_subset, glm_flags, covar_ct);
+    uintptr_t max_reported_test_ct = reported_test_ct;
+    uint32_t reported_test_ct_x = 0;
+    if (sample_ct_x) {
+      reported_test_ct_x = get_reported_test_ct(parameter_subset_x, glm_flags, covar_ct_x);
+      if (reported_test_ct_x > max_reported_test_ct) {
+	max_reported_test_ct = reported_test_ct_x;
+      }
+    }
+    uint32_t reported_test_ct_y = 0;
+    if (sample_ct_y) {
+      reported_test_ct_y = get_reported_test_ct(parameter_subset_y, glm_flags, covar_ct_y);
+      if (reported_test_ct_y > max_reported_test_ct) {
+	max_reported_test_ct = reported_test_ct_y;
+      }
+    }
+    const uint32_t include_intercept = (glm_flags / kfGlmIntercept) & 1;
+    const glm_cols_t glm_cols = glm_info_ptr->cols;
+    const uint32_t test_col = glm_cols & kfGlmColTest;
+    if ((!test_col) && (max_reported_test_ct > 1)) {
+      logerrprint("Error: --glm's 'test' column cannot be omitted when results for multiple\npredictors are reported.  (Did you forget 'hide-covar'?)\n");
+      goto glm_linear_ret_INCONSISTENT_INPUT;
+    }
+    g_max_reported_test_ct = max_reported_test_ct;
+
+    int32_t x_code = -2;
+    uint32_t x_start = 0;
+    uint32_t x_end = 0;
+    if (sample_ct_x) {
+      get_xymt_code_start_and_end_unsafe(cip, kChrOffsetX, &x_code, &x_start, &x_end);
+    }
+    int32_t y_code = -2;
+    uint32_t y_start = 0;
+    uint32_t y_end = 0;
+    if (sample_ct_y) {
+      get_xymt_code_start_and_end_unsafe(cip, kChrOffsetY, &y_code, &y_start, &y_end);
+    }
+    const int32_t mt_code = cip->xymt_codes[kChrOffsetMT];
+    const uint32_t chr_col = glm_cols & kfGlmColChrom;
+
+    // includes trailing tab
+    char* chr_buf = nullptr;
+    if (chr_col) {
+      if (bigstack_alloc_c(max_chr_blen, &chr_buf)) {
+	goto glm_linear_ret_NOMEM;
+      }
+    }
+
+    uint32_t calc_thread_ct = (max_thread_ct > 8)? (max_thread_ct - 1) : max_thread_ct;
+    if (calc_thread_ct > variant_ct) {
+      calc_thread_ct = variant_ct;
+    }
+
+    const uint32_t genod_buffer_needed = parameter_subset && (!is_set(parameter_subset, 1));
+    uintptr_t workspace_alloc = get_linear_workspace_size(sample_ct, predictor_ct, constraint_ct, genod_buffer_needed);
+    if (sample_ct_x) {
+      const uintptr_t workspace_alloc_x = get_linear_workspace_size(sample_ct_x, predictor_ct_x, constraint_ct_x, genod_buffer_needed);
+      if (workspace_alloc_x > workspace_alloc) {
+	workspace_alloc = workspace_alloc_x;
+      }
+    }
+    if (sample_ct_y) {
+      const uintptr_t workspace_alloc_y = get_linear_workspace_size(sample_ct_y, predictor_ct_y, constraint_ct_y, genod_buffer_needed);
+      if (workspace_alloc_y > workspace_alloc) {
+	workspace_alloc = workspace_alloc_y;
+      }
+    }
+    // +1 is for top-level g_workspace_bufs
+    const uint32_t dosage_is_present = pgfip->gflags & kfPgenGlobalDosagePresent;
+    uintptr_t thread_xalloc_cacheline_ct = (workspace_alloc / kCacheline) + 1;
+    uintptr_t per_variant_xalloc_byte_ct = sizeof(linear_aux_result_t) + 2 * max_reported_test_ct * sizeof(double) + max_sample_ct * local_covar_ct * sizeof(double);
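+    // note that local covariates are buffered as doubles here, matching the
+    // double-precision linear solver (the logistic driver uses floats)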
+    unsigned char* main_loadbufs[2];
+    uint32_t read_block_size;
+    if (multithread_load_init(variant_include, max_sample_ct, variant_ct, pgr_alloc_cacheline_ct, thread_xalloc_cacheline_ct, per_variant_xalloc_byte_ct, pgfip, &calc_thread_ct, &g_genovecs, dosage_is_present? (&g_dosage_presents) : nullptr, dosage_is_present? (&g_dosage_val_bufs) : nullptr, &read_block_size, main_loadbufs, &ts.threads, &g_pgr_ptrs, &g_read_variant_uidx_starts)) {
+      goto glm_linear_ret_NOMEM;
+    }
+    ts.calc_thread_ct = calc_thread_ct;
+    g_calc_thread_ct = calc_thread_ct;
+    linear_aux_result_t* linear_block_aux_bufs[2];
+    double* block_beta_se_bufs[2];
+    
+    for (uint32_t uii = 0; uii < 2; ++uii) {
+      linear_block_aux_bufs[uii] = (linear_aux_result_t*)bigstack_alloc(read_block_size * sizeof(linear_aux_result_t));
+      if ((!linear_block_aux_bufs[uii]) ||
+	  bigstack_alloc_d(read_block_size * 2 * max_reported_test_ct, &(block_beta_se_bufs[uii]))) {
+	goto glm_linear_ret_NOMEM;
+      }
+      if (local_covar_ct) {
+	if (bigstack_alloc_d(read_block_size * max_sample_ct * local_covar_ct * sizeof(double), &(g_local_covars_vcmaj_d[uii]))) {
+	  goto glm_linear_ret_NOMEM;
+	}
+      } else {
+	g_local_covars_vcmaj_d[uii] = nullptr;
+      }
+    }
+
+    g_workspace_bufs = (unsigned char**)bigstack_alloc_raw_rd(calc_thread_ct * sizeof(intptr_t));
+    for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+      g_workspace_bufs[tidx] = bigstack_alloc_raw(workspace_alloc);
+    }
+    
+    const uint32_t ref_col = glm_cols & kfGlmColRef;
+    const uint32_t alt1_col = glm_cols & kfGlmColAlt1;
+    const uint32_t alt_col = glm_cols & kfGlmColAlt;
+    const uint32_t alt_ct_col = glm_cols & kfGlmColAltcount;
+    const uint32_t tot_allele_col = glm_cols & kfGlmColTotallele;
+    const uint32_t alt_freq_col = glm_cols & kfGlmColAltfreq;
+    const uint32_t mach_r2_col = glm_cols & kfGlmColMachR2;
+    const uint32_t nobs_col = glm_cols & kfGlmColNobs;
+    const uint32_t beta_col = glm_cols & (kfGlmColBeta | kfGlmColOrbeta);
+    const uint32_t se_col = glm_cols & kfGlmColSe;
+    const uint32_t ci_col = (ci_size != 0.0) && (glm_cols & kfGlmColCi);
+    const uint32_t t_col = glm_cols & kfGlmColT;
+    const uint32_t p_col = glm_cols & kfGlmColP;
+    cswritep = (char*)overflow_buf;
+    *cswritep++ = '#';
+    if (chr_col) {
+      cswritep = strcpya(cswritep, "CHROM\t");
+    }
+    if (variant_bps) {
+      cswritep = strcpya(cswritep, "POS\t");
+    }
+    cswritep = strcpya(cswritep, "ID");
+    if (ref_col) {
+      cswritep = strcpya(cswritep, "\tREF");
+    }
+    if (alt1_col) {
+      cswritep = strcpya(cswritep, "\tALT1");
+    }
+    if (alt_col) {
+      cswritep = strcpya(cswritep, "\tALT");
+    }
+    if (alt_ct_col) {
+      cswritep = strcpya(cswritep, "\tALT1_CT");
+    }
+    if (tot_allele_col) {
+      cswritep = strcpya(cswritep, "\tALLELE_CT");
+    }
+    if (alt_freq_col) {
+      cswritep = strcpya(cswritep, "\tALT_FREQ");
+    }
+    if (mach_r2_col) {
+      cswritep = strcpya(cswritep, "\tMACH_R2");
+    }
+    if (test_col) {
+      cswritep = strcpya(cswritep, "\tTEST");
+    }
+    if (nobs_col) {
+      cswritep = strcpya(cswritep, "\tOBS_CT");
+    }
+    if (beta_col) {
+      cswritep = strcpya(cswritep, "\tBETA");
+    }
+    if (se_col) {
+      cswritep = strcpya(cswritep, "\tSE");
+    }
+    double ci_zt = 0.0;
+    if (ci_col) {
+      cswritep = strcpya(cswritep, "\tL");
+      cswritep = dtoa_g(ci_size * 100, cswritep);
+      cswritep = strcpya(cswritep, "\tU");
+      cswritep = dtoa_g(ci_size * 100, cswritep);
+      ci_zt = ltqnorm((ci_size + 1.0) * 0.5);
+    }
+    if (t_col) {
+      if (!constraint_ct) {
+        cswritep = strcpya(cswritep, "\tT_STAT");
+      } else {
+	// chisq for joint tests.  may switch to F-statistic (just divide by
+	// df; the hard part there is porting a function to convert that to a
+	// p-value)
+        cswritep = strcpya(cswritep, "\tT_OR_CHISQ_STAT");
+      }
+    }
+    if (p_col) {
+      cswritep = strcpya(cswritep, "\tP");
+    }
+    append_binary_eoln(&cswritep);
+
+    // Main workflow:
+    // 1. Set n=0, load/skip block 0
+    //
+    // 2. Spawn threads processing block n
+    // 3. If n>0, write results for block (n-1)
+    // 4. Increment n by 1
+    // 5. Load/skip block n unless eof
+    // 6. Join threads
+    // 7. Goto step 2 unless eof
+    //
+    // 8. Write results for last block
+    const uint32_t read_block_sizel = BITCT_TO_WORDCT(read_block_size);
+    const uint32_t read_block_ct_m1 = (raw_variant_ct - 1) / read_block_size;
+    uint32_t parity = 0;
+    uint32_t read_block_idx = 0;
+    uint32_t write_variant_uidx = 0;
+    uint32_t chr_fo_idx = 0xffffffffU;
+    uint32_t chr_end = 0;
+    uint32_t chr_buf_blen = 0;
+    uint32_t suppress_mach_r2 = 0;
+
+    // todo: --tests
+    uint32_t cur_reported_test_ct = 0;
+    uint32_t primary_reported_test_idx = include_intercept;
+    uint32_t cur_predictor_ct = 0;
+    uint32_t cur_constraint_ct = 0;
+
+    char** cur_test_names = nullptr;
+    uint32_t prev_block_variant_ct = 0;
+    uint32_t variant_idx = 0;
+    uint32_t cur_read_block_size = read_block_size;
+    uint32_t pct = 0;
+    uint32_t next_print_variant_idx = variant_ct / 100;
+    uint32_t cur_allele_ct = 2;
+    LOGPRINTFWW5("--glm linear regression on phenotype '%s': ", cur_pheno_name);
+    fputs("0%", stdout);
+    fflush(stdout);
+    while (1) {
+      uintptr_t cur_block_variant_ct = 0;
+      if (!ts.is_last_block) {
+	while (read_block_idx < read_block_ct_m1) {
+	  cur_block_variant_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), read_block_sizel);
+	  if (cur_block_variant_ct) {
+	    break;
+	  }
+	  ++read_block_idx;
+	}
+	if (read_block_idx == read_block_ct_m1) {
+	  cur_read_block_size = raw_variant_ct - (read_block_idx * read_block_size);
+	  cur_block_variant_ct = popcount_longs(&(variant_include[read_block_idx * read_block_sizel]), BITCT_TO_WORDCT(cur_read_block_size));
+	}
+	if (pgfi_multiread(variant_include, read_block_idx * read_block_size, read_block_idx * read_block_size + cur_read_block_size, cur_block_variant_ct, pgfip)) {
+	  goto glm_linear_ret_READ_FAIL;
+	}
+	if (gz_local_covar_file) {
+	  reterr = read_local_covar_block(g_sample_include, g_sample_include_x, g_sample_include_y, g_sample_include_cumulative_popcounts, g_sample_include_x_cumulative_popcounts, g_sample_include_y_cumulative_popcounts, cip, variant_include, local_sample_uidx_order, local_variant_include, sample_ct, sample_ct_x, sample_ct_y, read_block_idx * read_block_size, read_block_idx * read_block_size + cur_read_block_size, cur_block_variant_ct, local_sample_ct, local_covar_ct, (glm_info_ptr->flags / kfG [...]
+	  if (reterr) {
+	    goto glm_linear_ret_1;
+	  }
+	}
+      }
+      if (variant_idx) {
+	join_threads3z(&ts);
+	reterr = g_error_ret;
+	if (reterr) {
+	  if (reterr == kPglRetMalformedInput) {
+	    logprint("\n");
+	    logerrprint("Error: Malformed .pgen file.\n");
+	  }
+	  goto glm_linear_ret_1;
+	}
+      }
+      if (!ts.is_last_block) {
+	g_cur_block_variant_ct = cur_block_variant_ct;
+	const uint32_t uidx_start = read_block_idx * read_block_size;
+	compute_uidx_start_partition(variant_include, cur_block_variant_ct, calc_thread_ct, uidx_start, g_read_variant_uidx_starts);
+	for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	  g_pgr_ptrs[tidx]->fi.block_base = pgfip->block_base;
+	  g_pgr_ptrs[tidx]->fi.block_offset = pgfip->block_offset;
+	}
+	g_linear_block_aux = linear_block_aux_bufs[parity];
+	g_block_beta_se = block_beta_se_bufs[parity];
+	ts.is_last_block = (variant_idx + cur_block_variant_ct == variant_ct);
+	ts.thread_func_ptr = glm_linear_thread;
+	if (spawn_threads3z(variant_idx, &ts)) {
+	  goto glm_linear_ret_THREAD_CREATE_FAIL;
+	}
+      }
+      parity = 1 - parity;
+      if (variant_idx) {
+	// write *previous* block results
+	const double* cur_block_beta_se = block_beta_se_bufs[parity];
+	const linear_aux_result_t* cur_block_aux = linear_block_aux_bufs[parity];
+	const uint32_t variant_idx_start = variant_idx - prev_block_variant_ct;
+	double* cur_pval_write = orig_pvals? (&(orig_pvals[variant_idx_start])) : nullptr;
+	double* cur_chisq_write = orig_chisq? (&(orig_chisq[variant_idx_start])) : nullptr;
+	for (uint32_t variant_bidx = 0; variant_bidx < prev_block_variant_ct; ++variant_bidx, ++write_variant_uidx) {
+	  next_set_unsafe_ck(variant_include, &write_variant_uidx);
+	  if (write_variant_uidx >= chr_end) {
+	    do {
+	      ++chr_fo_idx;
+	      chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	    } while (write_variant_uidx >= chr_end);
+	    const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	    suppress_mach_r2 = 1;
+	    if ((chr_idx == ((uint32_t)x_code)) && sample_ct_x) {
+	      cur_reported_test_ct = reported_test_ct_x;
+	      cur_predictor_ct = predictor_ct_x;
+	      cur_constraint_ct = constraint_ct_x;
+	      cur_test_names = test_names_x;
+	    } else if ((chr_idx == ((uint32_t)y_code)) && sample_ct_y) {
+	      cur_reported_test_ct = reported_test_ct_y;
+	      cur_predictor_ct = predictor_ct_y;
+	      cur_constraint_ct = constraint_ct_y;
+	      cur_test_names = test_names_y;
+	    } else {
+	      cur_reported_test_ct = reported_test_ct;
+	      cur_predictor_ct = predictor_ct;
+	      cur_constraint_ct = constraint_ct;
+	      cur_test_names = test_names;
+	      if ((chr_idx != ((uint32_t)x_code)) && (chr_idx != ((uint32_t)mt_code)) && (!is_set(cip->haploid_mask, chr_idx))) {
+		suppress_mach_r2 = 0;
+	      }
+	    }
+	    if (cur_constraint_ct) {
+	      primary_reported_test_idx = reported_test_ct - 1;
+	    }
+	    if (chr_col) {
+	      char* chr_name_end = chr_name_write(cip, chr_idx, chr_buf);
+	      *chr_name_end = '\t';
+	      chr_buf_blen = 1 + (uintptr_t)(chr_name_end - chr_buf);
+	    }
+	  }
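+	  // block_beta_se layout: 2 * max_reported_test_ct doubles per
+	  // variant, holding a (beta, SE) pair for each reported test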
+	  const double* beta_se_iter = &(cur_block_beta_se[2 * max_reported_test_ct * variant_bidx]);
+	  const double primary_beta = beta_se_iter[primary_reported_test_idx * 2];
+	  const double primary_se = beta_se_iter[primary_reported_test_idx * 2 + 1];
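+	  // worker threads store -9 in the SE slot when the regression could
+	  // not be computed for this variant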
+	  const uint32_t is_invalid = (primary_se == -9);
+	  if (is_invalid && valid_variants) {
+	    CLEAR_BIT(write_variant_uidx, valid_variants);
+	  }
+	  const linear_aux_result_t* auxp = &(cur_block_aux[variant_bidx]);
+	  if (pfilter != 2.0) {
+	    double primary_pval = 2.0;
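+	    // 2.0 is an impossible p-value; if the regression failed, this
+	    // sentinel guarantees the variant is filtered out below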
+	    if (!is_invalid) {
+	      if (!cur_constraint_ct) {
+		double primary_tstat = primary_beta / primary_se;
+		primary_pval = calc_tprob(primary_tstat, auxp->sample_obs_ct - cur_predictor_ct);
+	      } else {
+		// possible todo: support for F-distribution p-values instead
+		// of asymptotic chi-square p-values
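+		// for joint tests, the primary 'SE' slot actually carries
+		// the chi-square statistic; chiprob_p() converts it to an
+		// upper-tail p-value with cur_constraint_ct degrees of
+		// freedom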
+		primary_pval = chiprob_p(primary_se, cur_constraint_ct);
+	      }
+	    }
+	    if (primary_pval > pfilter) {
+	      if (cur_pval_write) {
+		cur_pval_write[variant_bidx] = -9;
+	      }
+	      if (cur_chisq_write) {
+		cur_chisq_write[variant_bidx] = -9;
+	      }
+	      continue;
+	    }
+	  }
+	  uintptr_t variant_allele_idx_base = write_variant_uidx * 2;
+	  if (variant_allele_idxs) {
+	    variant_allele_idx_base = variant_allele_idxs[write_variant_uidx];
+	    cur_allele_ct = variant_allele_idxs[write_variant_uidx + 1] - variant_allele_idxs[write_variant_uidx];
+	  }
+	  char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+	  // possible todo: make number-to-string operations, strlen(), etc.
+	  //   happen only once per variant.
+	  for (uint32_t test_idx = 0; test_idx < cur_reported_test_ct; ++test_idx) {
+	    if (chr_col) {
+	      cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
+	    }
+	    if (variant_bps) {
+	      cswritep = uint32toa_x(variant_bps[write_variant_uidx], '\t', cswritep);
+	    }
+	    cswritep = strcpya(cswritep, variant_ids[write_variant_uidx]);
+	    if (ref_col) {
+	      *cswritep++ = '\t';
+	      cswritep = strcpya(cswritep, cur_alleles[0]);
+	    }
+	    if (alt1_col) {
+	      *cswritep++ = '\t';
+	      cswritep = strcpya(cswritep, cur_alleles[1]);
+	    }
+	    if (alt_col) {
+	      *cswritep++ = '\t';
+	      for (uint32_t allele_idx = 1; allele_idx < cur_allele_ct; ++allele_idx) {
+		if (cswrite(&css, &cswritep)) {
+		  goto glm_linear_ret_WRITE_FAIL;
+		}
+		cswritep = strcpyax(cswritep, cur_alleles[allele_idx], ',');
+	      }
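+	      // each allele was written with a trailing ','; back up one
+	      // byte to drop the final comma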
+	      --cswritep;
+	    }
+	    if (alt_ct_col) {
+	      *cswritep++ = '\t';
+	      cswritep = dtoa_g(auxp->alt_dosage, cswritep);
+	    }
+	    if (tot_allele_col) {
+	      *cswritep++ = '\t';
+	      cswritep = uint32toa(auxp->allele_obs_ct, cswritep);
+	    }
+	    if (alt_freq_col) {
+	      *cswritep++ = '\t';
+	      cswritep = dtoa_g(auxp->alt_dosage / ((double)auxp->allele_obs_ct), cswritep);
+	    }
+	    if (mach_r2_col) {
+	      *cswritep++ = '\t';
+	      if (!suppress_mach_r2) {
+	        cswritep = dtoa_g(auxp->mach_r2, cswritep);
+	      } else {
+		cswritep = strcpya(cswritep, "NA");
+	      }
+	    }
+	    if (test_col) {
+	      *cswritep++ = '\t';
+	      cswritep = strcpya(cswritep, cur_test_names[test_idx]);
+	    }
+	    if (nobs_col) {
+	      *cswritep++ = '\t';
+	      cswritep = uint32toa(auxp->sample_obs_ct, cswritep);
+	    }
+	    double pval = -9;
+	    double tstat = 0.0;
+	    if ((!cur_constraint_ct) || (test_idx != primary_reported_test_idx)) {
+	      double beta = *beta_se_iter++;
+	      double se = *beta_se_iter++;
+	      if (!is_invalid) {
+		tstat = beta / se;
+		pval = calc_tprob(tstat, auxp->sample_obs_ct - cur_predictor_ct);
+	      }
+	      if (beta_col) {
+		*cswritep++ = '\t';
+		if (!is_invalid) {
+		  cswritep = dtoa_g(beta, cswritep);
+		} else {
+		  cswritep = strcpya(cswritep, "NA");
+		}
+	      }
+	      if (se_col) {
+		*cswritep++ = '\t';
+		if (!is_invalid) {
+		  cswritep = dtoa_g(se, cswritep);
+		} else {
+		  cswritep = strcpya(cswritep, "NA");
+		}
+	      }
+	      if (ci_col) {
+		*cswritep++ = '\t';
+		if (!is_invalid) {
+		  const double ci_halfwidth = ci_zt * se;
+		  cswritep = dtoa_g(beta - ci_halfwidth, cswritep);
+		  *cswritep++ = '\t';
+		  cswritep = dtoa_g(beta + ci_halfwidth, cswritep);
+		} else {
+		  cswritep = strcpya(cswritep, "NA\tNA");
+		}
+	      }
+	      if (t_col) {
+		*cswritep++ = '\t';
+		if (!is_invalid) {
+		  cswritep = dtoa_g(tstat, cswritep);
+		} else {
+		  cswritep = strcpya(cswritep, "NA");
+		}
+	      }
+	    } else {
+	      // joint test: report the chi-square statistic (possibly an
+	      // F-statistic in the future) instead of a t-statistic
+	      // beta_se_iter = &(beta_se_iter[2]);
+	      if (beta_col) {
+		cswritep = memcpyl3a(cswritep, "\tNA");
+	      }
+	      if (se_col) {
+		cswritep = memcpyl3a(cswritep, "\tNA");
+	      }
+	      if (ci_col) {
+		cswritep = strcpya(cswritep, "\tNA\tNA");
+	      }
+	      if (t_col) {
+		*cswritep++ = '\t';
+		if (!is_invalid) {
+		  cswritep = dtoa_g(primary_se, cswritep);
+		} else {
+		  cswritep = strcpya(cswritep, "NA");
+		}
+	      }
+	      // could avoid recomputing
+	      if (!is_invalid) {
+		pval = chiprob_p(primary_se, cur_constraint_ct);
+	      }
+	    }
+	    if (p_col) {
+	      *cswritep++ = '\t';
+	      if (!is_invalid) {
+		cswritep = dtoa_g(MAXV(pval, output_min_p), cswritep);
+	      } else {
+		cswritep = strcpya(cswritep, "NA");
+	      }
+	    }
+	    append_binary_eoln(&cswritep);
+	    if (cswrite(&css, &cswritep)) {
+	      goto glm_linear_ret_WRITE_FAIL;
+	    }
+	    if (test_idx == primary_reported_test_idx) {
+	      if (cur_pval_write) {
+		cur_pval_write[variant_bidx] = pval;
+	      }
+	      if (cur_chisq_write) {
+		if (!is_invalid) {
+		  if (!cur_constraint_ct) {
+		    cur_chisq_write[variant_bidx] = tstat * tstat;
+		  } else {
+		    cur_chisq_write[variant_bidx] = primary_se;
+		  }
+		} else {
+		  cur_chisq_write[variant_bidx] = -9;
+		}
+	      }
+	    }
+	  }
+	}
+      }
+      if (variant_idx == variant_ct) {
+	break;
+      }
+      if (variant_idx >= next_print_variant_idx) {
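+	// update the in-place percentage display; two-digit percentages
+	// occupy an extra character, hence the extra backspace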
+	if (pct > 10) {
+	  putc_unlocked('\b', stdout);
+	}
+	pct = (variant_idx * 100LLU) / variant_ct;
+	printf("\b\b%u%%", pct++);
+	fflush(stdout);
+	next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+      }
+      ++read_block_idx;
+      prev_block_variant_ct = cur_block_variant_ct;
+      variant_idx += cur_block_variant_ct;
+      // crucially, this is independent of the pgen_reader_t block_base
+      // pointers
+      pgfip->block_base = main_loadbufs[parity];
+    }
+    if (cswrite_close_null(&css, cswritep)) {
+      goto glm_linear_ret_WRITE_FAIL;
+    }
+    if (pct > 10) {
+      putc_unlocked('\b', stdout);
+    }
+    fputs("\b\b", stdout);
+    LOGPRINTF("done.\n");
+    LOGPRINTF("Results written to %s .\n", outname);
+    bigstack_reset(bigstack_mark);
+  }
+  while (0) {
+  glm_linear_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  glm_linear_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  glm_linear_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  glm_linear_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  glm_linear_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  glm_linear_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ glm_linear_ret_1:
+  threads3z_cleanup(&ts, &g_cur_block_variant_ct);
+  cswrite_close_cond(&css, cswritep);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t glm_main(const uintptr_t* orig_sample_include, const char* sample_ids, const char* sids, const uintptr_t* sex_nm, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const char* pheno_names, const pheno_col_t* covar_cols, const char* covar_names, const uintptr_t* orig_variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const glm_info_t* glm_info_ptr, const adjust_info_t* adjus [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  gzFile gz_local_covar_file = nullptr;
+  gz_token_stream_t gts;
+  gz_token_stream_preinit(&gts);
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if (!pheno_ct) {
+      logerrprint("Error: No phenotypes loaded.\n");
+      goto glm_main_ret_INCONSISTENT_INPUT;
+    }
+    if (orig_sample_ct < 2) {
+      logerrprint("Error: --glm requires at least two samples.\n");
+      goto glm_main_ret_INCONSISTENT_INPUT;
+    }
+    if (!orig_variant_ct) {
+      logerrprint("Error: --glm requires at least one variant.\n");
+      goto glm_main_ret_INCONSISTENT_INPUT;
+    }
+    // common linear/logistic initialization
+    const uintptr_t* early_variant_include = orig_variant_include;
+    uint32_t* local_sample_uidx_order = nullptr;
+    uintptr_t* local_variant_include = nullptr;
+    char* local_loadbuf = nullptr;
+    uint32_t variant_ct = orig_variant_ct;
+    uint32_t local_sample_ct = 0;
+    uint32_t local_variant_ctl = 0;
+    uint32_t local_covar_ct = 0;
+    uint32_t local_loadbuf_size = 0;
+    if (local_covar_fname) {
+      reterr = glm_local_init(local_covar_fname, local_pvar_fname, local_psam_fname, sample_ids, cip, variant_bps, variant_ids, glm_info_ptr, raw_sample_ct, max_sample_id_blen, raw_variant_ct, &orig_sample_include, &sex_nm, &sex_male, &early_variant_include, &orig_sample_ct, &variant_ct, &gz_local_covar_file, &local_sample_uidx_order, &local_variant_include, &local_sample_ct, &local_variant_ctl, &local_covar_ct);
+      if (reterr) {
+	goto glm_main_ret_1;
+      }
+      uint64_t ullii = local_sample_ct;
+      if (glm_info_ptr->local_cat_ct) {
+	ullii *= 1 + int_slen(glm_info_ptr->local_cat_ct);
+      } else {
+	// permit 24 characters per floating point number instead of 16, since
+	// some tools dump 15-17 significant digits
+	ullii *= 24 * (local_covar_ct + ((glm_info_ptr->flags / kfGlmLocalOmitLast) & 1));
+      }
+      // +2 bytes for null terminator, \r\n; 1 more so we can detect gzgets
+      // hitting the limit
+      ullii += 3;
+      if (ullii > kMaxLongLine) {
+	logerrprint("Error: Too many samples/covariates for --glm local-covar=.\n");
+	goto glm_main_ret_MALFORMED_INPUT;
+      }
+      if (ullii < kMaxMediumLine) {
+	ullii = kMaxMediumLine; // may as well unconditionally support this
+      }
+      local_loadbuf_size = ullii;
+      if (bigstack_alloc_c(local_loadbuf_size, &local_loadbuf)) {
+	goto glm_main_ret_NOMEM;
+      }
+      local_loadbuf[local_loadbuf_size - 1] = ' ';
+    }
+    
+    const glm_flags_t glm_flags = glm_info_ptr->flags;
+    g_glm_flags = glm_flags;
+    g_dosage_presents = nullptr;
+    g_dosage_val_bufs = nullptr;
+    const uint32_t output_zst = (glm_flags / kfGlmZs) & 1;
+    const uint32_t perm_adapt = (glm_flags / kfGlmPerm) & 1;
+    const uint32_t perms_total = perm_adapt? aperm_ptr->max : glm_info_ptr->mperm_ct;
+    // [output prefix].[pheno name].glm.[linear/logistic]{.perm, .mperm}{.zst}
+    uint32_t pheno_name_blen_capacity = kPglFnamesize - 14 - (4 * output_zst) - (uintptr_t)(outname_end - outname);
+    if (perms_total) {
+      pheno_name_blen_capacity -= 6 - perm_adapt;
+    }
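+    // 14 = strlen(".glm.logistic") + 1 for the null terminator; ".zst"
+    // accounts for the 4 above, and ".perm"/".mperm" for the 5 or 6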
+    if (max_pheno_name_blen > pheno_name_blen_capacity) {
+      logerrprint("Error: Phenotype name and/or --out parameter too long.\n");
+      goto glm_main_ret_INCONSISTENT_INPUT;
+    }
+    *outname_end = '.';
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    const uint32_t max_chr_blen = get_max_chr_slen(cip) + 1;
+    unsigned char* overflow_buf;
+    uintptr_t* cur_sample_include;
+    // synthetic categorical covariate name could be ~twice max ID length?
+    if (bigstack_alloc_uc(kCompressStreamBlock + 2 * kMaxIdSlen + max_chr_blen + kMaxIdSlen + 512 + 2 * max_allele_slen, &overflow_buf) ||
+	bigstack_alloc_ul(raw_sample_ctl, &cur_sample_include) ||
+	bigstack_alloc_ui(raw_sample_ctl, &g_sample_include_cumulative_popcounts)) {
+      goto glm_main_ret_NOMEM;
+    }
+    g_sample_include = cur_sample_include;
+    g_cip = cip;
+    g_variant_allele_idxs = variant_allele_idxs;
+    
+    const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    uint32_t max_variant_ct = variant_ct;
+
+    uint32_t x_start;
+    uint32_t x_end;
+    get_xymt_start_and_end(cip, kChrOffsetX, &x_start, &x_end);
+    uint32_t y_start;
+    uint32_t y_end;
+    get_xymt_start_and_end(cip, kChrOffsetY, &y_start, &y_end);
+
+    uintptr_t* sex_male_collapsed_buf = nullptr;
+    int32_t x_code;
+    uint32_t variant_ct_x = 0;
+    uint32_t variant_ct_y = 0;
+    const uint32_t domdev_present = (glm_flags & (kfGlmGenotypic | kfGlmHethom))? 1 : 0;
+    const uint32_t sex_nm_ct = popcount_longs(sex_nm, raw_sample_ctl);
+    const uint32_t male_ct = popcount_longs(sex_male, raw_sample_ctl);
+    uint32_t add_sex_covar = !(glm_flags & kfGlmNoXSex);
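+    // if every sex-known sample is the same sex, the sex covariate would
+    // be constant; drop it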
+    if (add_sex_covar && ((!male_ct) || (male_ct == sex_nm_ct))) {
+      add_sex_covar = 0;
+    }
+    uintptr_t* cur_sample_include_y_buf = nullptr;
+    if (domdev_present || (glm_flags & (kfGlmDominant | kfGlmRecessive))) {
+      // dominant/recessive/genotypic/hethom suppress all chromosomes which
+      // aren't fully diploid.  (could throw in a hack to permit chrX if
+      // all samples are female?  i.e. synthesize a chr_info_t where
+      // xymt_codes[0] is -2 and haploid_mask X bit is cleared)
+      uintptr_t* variant_include_nohap = nullptr;
+      const uint32_t chr_ct = cip->chr_ct;
+      uint32_t removed_variant_ct = 0;
+      for (uint32_t chr_fo_idx = 0; chr_fo_idx < chr_ct; ++chr_fo_idx) {
+	const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	if (is_set(cip->haploid_mask, chr_idx)) {
+	  const uint32_t variant_uidx_start = cip->chr_fo_vidx_start[chr_fo_idx];
+	  const uint32_t variant_uidx_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	  const uint32_t cur_chr_variant_ct = popcount_bit_idx(early_variant_include, variant_uidx_start, variant_uidx_end);
+	  if (cur_chr_variant_ct) {
+	    if (!removed_variant_ct) {
+	      // no main-loop logic for excluding all haploid chromosomes, so
+	      // make a full copy of early_variant_include and throw away our
+	      // reference to the original
+	      if (bigstack_alloc_ul(raw_variant_ctl, &variant_include_nohap)) {
+		goto glm_main_ret_NOMEM;
+	      }
+	      memcpy(variant_include_nohap, early_variant_include, raw_variant_ctl * sizeof(intptr_t));
+	    }
+	    clear_bits_nz(variant_uidx_start, variant_uidx_end, variant_include_nohap);
+	    removed_variant_ct += cur_chr_variant_ct;
+	  }
+	}
+      }
+      if (removed_variant_ct) {
+	if (variant_ct == removed_variant_ct) {
+	  logerrprint("Error: No variants remaining for --glm ('dominant', 'recessive', 'genotypic',\nand 'hethom' only operate on diploid data).\n");
+	  goto glm_main_ret_INCONSISTENT_INPUT;
+	}
+	variant_ct -= removed_variant_ct;
+	early_variant_include = variant_include_nohap;
+	max_variant_ct = variant_ct;
+      }
+    } else {
+      if (xymt_exists(cip, kChrOffsetX, &x_code)) {
+	variant_ct_x = count_chr_variants_unsafe(early_variant_include, cip, (uint32_t)x_code);
+	// --xchr-model 0 now only suppresses chrX.
+	if (xchr_model) {
+	  if (variant_ct_x) {
+	    if (bigstack_alloc_ul(BITCT_TO_WORDCT(orig_sample_ct), &sex_male_collapsed_buf)) {
+	      goto glm_main_ret_NOMEM;
+	    }
+	  }
+	} else {
+	  max_variant_ct -= variant_ct_x;
+	  if (!max_variant_ct) {
+	    logerrprint("Error: No variants remaining for --glm, due to --xchr-model 0.\n");
+	    goto glm_main_ret_INCONSISTENT_INPUT;
+	  }
+	}
+      }
+      int32_t y_code;
+      if (xymt_exists(cip, kChrOffsetY, &y_code)) {
+	variant_ct_y = count_chr_variants_unsafe(early_variant_include, cip, (uint32_t)y_code);
+	if (variant_ct_y) {
+	  if (!male_ct) {
+	    logprint("--glm: Skipping chrY since there are no males.\n");
+	    max_variant_ct -= variant_ct_y;
+	    if (!max_variant_ct) {
+	      logerrprint("Error: No variants remaining for --glm.\n");
+	      goto glm_main_ret_INCONSISTENT_INPUT;
+	    }
+	  } else if (male_ct < orig_sample_ct) {
+	    // may as well check for only-chrY special case
+	    if (max_variant_ct != variant_ct_y) {
+	      if (bigstack_alloc_ul(raw_sample_ctl, &cur_sample_include_y_buf)) {
+		// covar_include_y allocation postponed since raw_covar_ct not
+		// yet known
+		goto glm_main_ret_NOMEM;
+	      }
+	    } else {
+	      orig_sample_include = sex_male;
+	      orig_sample_ct = male_ct;
+	    }
+	  }
+	}
+      }
+    }
+    if (add_sex_covar && (!variant_ct_x) && (!(glm_flags & kfGlmSex))) {
+      add_sex_covar = 0;
+    }
+    g_sex_male_collapsed = sex_male_collapsed_buf;
+    uint32_t raw_covar_ct = orig_covar_ct + local_covar_ct;
+    if (glm_info_ptr->condition_varname || glm_info_ptr->condition_list_fname || local_covar_ct || add_sex_covar) {
+      uint32_t condition_ct = 0;
+      pheno_col_t* new_covar_cols;
+      char* new_covar_names;
+      uintptr_t new_max_covar_name_blen = max_covar_name_blen;
+      if (add_sex_covar && (new_max_covar_name_blen < 4)) {
+	new_max_covar_name_blen = 4;
+      }
+      if (local_covar_ct && (new_max_covar_name_blen < 6 + int_slen(local_covar_ct + 1))) {
+	new_max_covar_name_blen = 6 + int_slen(local_covar_ct + 1);
+      }
+      if (glm_info_ptr->condition_varname || glm_info_ptr->condition_list_fname) {
+	assert(g_bigstack_end == bigstack_end_mark);
+	// reserve space for condition-list worst case (roughly sqrt(2^31)),
+	// since that's relatively small
+	const uint32_t condition_ct_max = 46338;
+	uint32_t* condition_uidxs;
+	if (bigstack_end_alloc_ui(condition_ct_max, &condition_uidxs)) {
+	  goto glm_main_ret_NOMEM;
+	}
+	if (glm_info_ptr->condition_varname) {
+	  int32_t ii = get_variant_uidx_without_htable(glm_info_ptr->condition_varname, variant_ids, orig_variant_include, orig_variant_ct);
+	  if (ii >= 0) {
+	    condition_uidxs[0] = (uint32_t)ii;
+	    condition_ct = 1;
+	    const uint32_t condition_blen = strlen(glm_info_ptr->condition_varname) + 1;
+	    // drop "CSNP" column name for sanity's sake
+	    if (new_max_covar_name_blen < condition_blen) {
+	      new_max_covar_name_blen = condition_blen;
+	    }
+	  } else {
+	    if (ii == -2) {
+	      LOGERRPRINTFWW("Error: Duplicate --condition variant ID '%s'.\n", glm_info_ptr->condition_varname);
+	      goto glm_main_ret_INVALID_CMDLINE;
+	    }
+	    LOGERRPRINTFWW("Warning: --condition variant ID '%s' not found.\n", glm_info_ptr->condition_varname);
+	  }
+	} else {
+	  // 1. (re)construct variant ID hash table
+	  uintptr_t* already_seen;
+	  if (bigstack_calloc_ul(raw_variant_ctl, &already_seen)) {
+	    goto glm_main_ret_NOMEM;
+	  }
+	  uint32_t* variant_id_htable = nullptr;
+	  uint32_t variant_id_htable_size;
+	  reterr = alloc_and_populate_id_htable_mt(orig_variant_include, variant_ids, orig_variant_ct, max_thread_ct, &variant_id_htable, nullptr, &variant_id_htable_size);
+	  if (reterr) {
+	    goto glm_main_ret_1;
+	  }
+
+	  // 2. iterate through --condition-list file, make sure no IDs are
+	  //    duplicate in loaded fileset, warn about duplicates in
+	  //    --condition-list file
+	  reterr = gz_token_stream_init(glm_info_ptr->condition_list_fname, &gts, g_textbuf);
+	  if (reterr) {
+	    goto glm_main_ret_1;
+	  }
+	  uintptr_t skip_ct = 0;
+	  uintptr_t duplicate_ct = 0;
+	  uint32_t token_slen;
+	  while (1) {
+	    char* token_start = gz_token_stream_advance(&gts, &token_slen);
+	    if (!token_start) {
+	      break;
+	    }
+	    uint32_t cur_variant_uidx = variant_id_dupflag_htable_find(token_start, variant_ids, variant_id_htable, token_slen, variant_id_htable_size, max_variant_id_slen);
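+	    // high bit set: either 0xffffffffU (ID not found) or a
+	    // duplicate ID, with the variant index in the low 31 bits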
+	    if (cur_variant_uidx >> 31) {
+	      if (cur_variant_uidx != 0xffffffffU) {
+		LOGERRPRINTFWW("Error: --condition-list variant ID '%s' appears multiple times.\n", variant_ids[cur_variant_uidx & 0x7fffffff]);
+		goto glm_main_ret_INCONSISTENT_INPUT;
+	      }
+	      ++skip_ct;
+	    } else if (is_set(already_seen, cur_variant_uidx)) {
+	      ++duplicate_ct;
+	    } else {
+	      if (condition_ct == condition_ct_max) {
+		logerrprint("Error: Too many --condition-list variant IDs.\n");
+		goto glm_main_ret_MALFORMED_INPUT;
+	      }
+	      set_bit(cur_variant_uidx, already_seen);
+	      condition_uidxs[condition_ct++] = cur_variant_uidx;
+	      if (new_max_covar_name_blen <= token_slen) {
+		new_max_covar_name_blen = token_slen + 1;
+	      }
+	    }
+	  }
+	  if (token_slen) {
+	    // error code
+	    if (token_slen == 0xffffffffU) {
+	      logerrprint("Error: Excessively long ID in --condition-list file.\n");
+	      goto glm_main_ret_MALFORMED_INPUT;
+	    }
+	    goto glm_main_ret_READ_FAIL;
+	  }
+	  if (gz_token_stream_close(&gts)) {
+	    goto glm_main_ret_READ_FAIL;
+	  }
+	  if (skip_ct || duplicate_ct) {
+	    if (skip_ct && duplicate_ct) {
+	      LOGERRPRINTFWW("Warning: %" PRIuPTR " --condition-list variant ID%s not found, and %" PRIuPTR " duplicate ID%s present.\n", skip_ct, (skip_ct == 1)? "" : "s", duplicate_ct, (duplicate_ct == 1)? "" : "s");
+	    } else if (skip_ct) {
+	      LOGERRPRINTF("Warning: %" PRIuPTR " --condition-list variant ID%s not found.\n", skip_ct, (skip_ct == 1)? "" : "s");
+	    } else {
+	      LOGERRPRINTF("Warning: %" PRIuPTR " duplicate --condition-list variant ID%s present.\n", duplicate_ct, (duplicate_ct == 1)? "" : "s");
+	    }
+	  }
+	  LOGPRINTF("--condition-list: %u variant ID%s loaded.\n", condition_ct, (condition_ct == 1)? "" : "s");
+
+	  // free hash table and duplicate tracker
+	  bigstack_reset(already_seen);
+	}
+	raw_covar_ct += condition_ct;
+	new_covar_cols = (pheno_col_t*)bigstack_alloc((raw_covar_ct + add_sex_covar) * sizeof(pheno_col_t));
+	if ((!new_covar_cols) ||
+	    bigstack_alloc_c((raw_covar_ct + add_sex_covar) * new_max_covar_name_blen, &new_covar_names)) {
+	  goto glm_main_ret_NOMEM;
+	}
+        if (condition_ct) {
+	  bigstack_end_set(condition_uidxs);
+	  uintptr_t* genovec;
+	  uintptr_t* dosage_present;
+	  dosage_t* dosage_vals;
+	  if (bigstack_end_alloc_ul(QUATERCT_TO_WORDCT(raw_sample_ct), &genovec) ||
+	      bigstack_end_alloc_ul(raw_sample_ctl, &dosage_present) ||
+	      bigstack_end_alloc_dosage(raw_sample_ct, &dosage_vals)) {
+	    goto glm_main_ret_NOMEM;
+	  }
+	  pgr_clear_ld_cache(simple_pgrp);
+	  for (uint32_t condition_idx = 0; condition_idx < condition_ct; ++condition_idx) {
+	    const uint32_t cur_variant_uidx = condition_uidxs[condition_idx];
+	    uint32_t dosage_ct;
+	    uint32_t is_explicit_alt1;
+	    reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(nullptr, nullptr, raw_sample_ct, cur_variant_uidx, simple_pgrp, genovec, dosage_present, dosage_vals, &dosage_ct, &is_explicit_alt1);
+	    if (reterr) {
+	      if (reterr == kPglRetMalformedInput) {
+		logerrprint("Error: Malformed .pgen file.\n");
+	      }
+	      goto glm_main_ret_1;
+	    }
+	    pheno_col_t* cur_covar_col = &(new_covar_cols[local_covar_ct + condition_idx]);
+	    uintptr_t* cur_nonmiss;
+	    double* cur_covar_vals;
+	    if (bigstack_alloc_ul(raw_sample_ctl, &cur_nonmiss) ||
+		bigstack_alloc_d(raw_sample_ct, &cur_covar_vals)) {
+	      goto glm_main_ret_NOMEM;
+	    }
+	    cur_covar_col->category_names = nullptr;
+	    cur_covar_col->nonmiss = cur_nonmiss;
+	    cur_covar_col->data.qt = cur_covar_vals;
+	    cur_covar_col->type_code = kPhenoDtypeQt;
+	    cur_covar_col->nonnull_category_ct = 0;
+	    genoarr_to_nonmissing(genovec, raw_sample_ct, cur_nonmiss);
+	    genoarr_to_doubles(genovec, raw_sample_ct, cur_covar_vals);
+	    if (dosage_ct) {
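+	      // overwrite hardcall-derived values with explicit dosages;
+	      // kRecipDosageMid converts the fixed-point dosage encoding
+	      // back to the usual 0..2 scale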
+	      uint32_t sample_uidx = 0;
+	      for (uint32_t dosage_idx = 0; dosage_idx < dosage_ct; ++dosage_idx, ++sample_uidx) {
+		next_set_unsafe_ck(dosage_present, &sample_uidx);
+	        cur_covar_vals[sample_uidx] = kRecipDosageMid * ((int32_t)((uint32_t)dosage_vals[dosage_idx]));
+	      }
+	      bitvec_or(dosage_present, raw_sample_ctl, cur_nonmiss);
+	    }
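+	    // 'dominant' clamps the condition dosage to [0, 1]; 'recessive'
+	    // maps [0, 1] to 0 and [1, 2] to [0, 1]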
+	    if (glm_flags & kfGlmConditionDominant) {
+	      for (uint32_t sample_uidx = 0; sample_uidx < raw_sample_ct; ++sample_uidx) {
+		if (cur_covar_vals[sample_uidx] > 1.0) {
+		  cur_covar_vals[sample_uidx] = 1.0;
+		}
+	      }
+	    } else if (glm_flags & kfGlmConditionRecessive) {
+	      for (uint32_t sample_uidx = 0; sample_uidx < raw_sample_ct; ++sample_uidx) {
+		double dxx = cur_covar_vals[sample_uidx];
+		if (dxx <= 1.0) {
+		  dxx = 0;
+		} else {
+		  dxx -= 1.0;
+		}
+		cur_covar_vals[sample_uidx] = dxx;
+	      }
+	    }
+	    strcpy(&(new_covar_names[(local_covar_ct + condition_idx) * new_max_covar_name_blen]), variant_ids[cur_variant_uidx]);
+	  }
+	  bigstack_end_reset(bigstack_end_mark);
+	}
+      } else {
+	new_covar_cols = (pheno_col_t*)bigstack_alloc((raw_covar_ct + add_sex_covar) * sizeof(pheno_col_t));
+	if ((!new_covar_cols) ||
+	    bigstack_alloc_c((raw_covar_ct + add_sex_covar) * new_max_covar_name_blen, &new_covar_names)) {
+	  goto glm_main_ret_NOMEM;
+	}
+      }
+      if (local_covar_ct) {
+	memcpy(new_covar_cols, covar_cols, local_covar_ct * sizeof(pheno_col_t));
+      }
+      memcpy(&(new_covar_cols[condition_ct + local_covar_ct]), covar_cols, orig_covar_ct * sizeof(pheno_col_t));
+      const char* covar_names_read_iter = covar_names;
+      // bugfix (11 May 2017): local covar names come before, not after,
+      //   --condition{-list} covar names
+      char* covar_names_write_iter = new_covar_names;
+      for (uint32_t local_covar_idx = 0; local_covar_idx < local_covar_ct; ++local_covar_idx) {
+	memcpy(covar_names_write_iter, "LOCAL", 5);
+	char* name_end = uint32toa(local_covar_idx + 1, &(covar_names_write_iter[5]));
+	*name_end = '\0';
+	new_covar_cols[local_covar_idx].type_code = kPhenoDtypeOther;
+	new_covar_cols[local_covar_idx].nonmiss = nullptr;
+	covar_names_write_iter = &(covar_names_write_iter[new_max_covar_name_blen]);
+      }
+      covar_names_write_iter = &(covar_names_write_iter[condition_ct * new_max_covar_name_blen]);
+      for (uint32_t old_covar_idx = 0; old_covar_idx < orig_covar_ct; ++old_covar_idx) {
+	strcpy(covar_names_write_iter, covar_names_read_iter);
+	covar_names_read_iter = &(covar_names_read_iter[max_covar_name_blen]);
+	covar_names_write_iter = &(covar_names_write_iter[new_max_covar_name_blen]);
+      }
+      if (add_sex_covar) {
+	pheno_col_t* new_sex_col = &(new_covar_cols[raw_covar_ct++]);
+	double* sex_covar_vals;
+	if (bigstack_alloc_d(raw_sample_ct, &sex_covar_vals)) {
+	  goto glm_main_ret_NOMEM;
+	}
+	uint32_t sample_uidx = 0;
+	for (uint32_t sample_idx = 0; sample_idx < orig_sample_ct; ++sample_idx, ++sample_uidx) {
+	  next_set_unsafe_ck(sex_nm, &sample_uidx);
+	  // 1/2 instead of 1/0 coding; user shouldn't have to worry about
+	  // signs changing when they use --sex instead of using the sex column
+	  // from a .bim/.psam file
+	  sex_covar_vals[sample_uidx] = (double)((int32_t)(2 - is_set(sex_male, sample_uidx)));
+	}
+	new_sex_col->category_names = nullptr;
+	// const_cast
+	new_sex_col->nonmiss = (uintptr_t*)((uintptr_t)sex_nm);
+	new_sex_col->data.qt = sex_covar_vals;
+	new_sex_col->type_code = kPhenoDtypeQt;
+	new_sex_col->nonnull_category_ct = 0;
+	strcpy(covar_names_write_iter, "SEX");
+      }
+      covar_cols = new_covar_cols;
+      covar_names = new_covar_names;
+      max_covar_name_blen = new_max_covar_name_blen;
+    }
+    const uint32_t raw_covar_ctl = BITCT_TO_WORDCT(raw_covar_ct);
+    uintptr_t* initial_covar_include = nullptr;
+    uintptr_t* covar_include = nullptr;
+    uintptr_t* cur_sample_include_x_buf = nullptr;
+    uintptr_t* covar_include_x = nullptr;
+    uint32_t covar_max_nonnull_cat_ct = 0;
+    if (raw_covar_ctl) {
+      if (bigstack_alloc_ul(raw_covar_ctl, &initial_covar_include) ||
+	  bigstack_alloc_ul(raw_covar_ctl, &covar_include)) {
+	goto glm_main_ret_NOMEM;
+      }
+      fill_ulong_zero(raw_covar_ctl, initial_covar_include);
+      for (uint32_t covar_uidx = 0; covar_uidx < raw_covar_ct; ++covar_uidx) {
+	const pheno_col_t* cur_covar_col = &(covar_cols[covar_uidx]);
+	if (cur_covar_col->type_code != kPhenoDtypeOther) {
+	  if (!is_const_covar(cur_covar_col, orig_sample_include, orig_sample_ct)) {
+	    set_bit(covar_uidx, initial_covar_include);
+	    if (cur_covar_col->type_code == kPhenoDtypeCat) {
+	      if (cur_covar_col->nonnull_category_ct > covar_max_nonnull_cat_ct) {
+		covar_max_nonnull_cat_ct = cur_covar_col->nonnull_category_ct;
+	      }
+	    }
+	  } else {
+	    LOGERRPRINTF("Warning: Excluding constant covariate '%s' from --glm.\n", &(covar_names[covar_uidx * max_covar_name_blen]));
+	  }
+	} else {
+	  // local covariate, always include
+	  set_bit(covar_uidx, initial_covar_include);
+	}
+      }
+      if (covar_max_nonnull_cat_ct && (glm_info_ptr->parameters_range_list.name_ct || glm_info_ptr->tests_range_list.name_ct)) {
+	// todo: permit this, and automatically expand a single parameter index
+	// referring to a categorical covariate into the appropriate range of
+	// final predictor indices
+	logerrprint("Error: --parameters/--tests cannot be used directly with categorical\ncovariates; expand them into binary covariates with --split-cat-pheno first.\n");
+	goto glm_main_ret_INCONSISTENT_INPUT;
+      }
+    }
+    const uint32_t domdev_present_p1 = domdev_present + 1;
+    const uint32_t add_interactions = (glm_flags / kfGlmInteraction) & 1;
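+    // predictors: intercept + genotype (+ domdev if present), plus each
+    // covariate and, with 'interaction', its genotype (and domdev)
+    // interaction terms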
+    const uint32_t raw_predictor_ct = 2 + domdev_present + raw_covar_ct * (1 + add_interactions * domdev_present_p1);
+    const uint32_t raw_predictor_ctl = BITCT_TO_WORDCT(raw_predictor_ct);
+    const uint32_t first_covar_pred_uidx = 2 + domdev_present;
+    uint32_t first_interaction_pred_uidx = 0;
+    if (add_interactions) {
+      first_interaction_pred_uidx = first_covar_pred_uidx + raw_covar_ct;
+    }
+
+    // TODO: --tests
+    const uint32_t joint_test = domdev_present;
+    g_constraints_con_major = nullptr;
+    g_constraints_con_major_x = nullptr;
+    g_constraints_con_major_y = nullptr;
+    g_constraints_con_major_f = nullptr;
+    g_constraints_con_major_x_f = nullptr;
+    g_constraints_con_major_y_f = nullptr;
+    
+    uintptr_t* raw_parameter_subset = nullptr;
+    g_parameter_subset = nullptr;
+    g_parameter_subset_x = nullptr;
+    g_parameter_subset_y = nullptr;
+    if (glm_info_ptr->parameters_range_list.name_ct) {
+      if (bigstack_calloc_ul(raw_predictor_ctl, &raw_parameter_subset) ||
+	  bigstack_alloc_ul(raw_predictor_ctl, &g_parameter_subset) ||
+	  bigstack_alloc_ul(raw_predictor_ctl, &g_parameter_subset_x) ||
+	  bigstack_alloc_ul(raw_predictor_ctl, &g_parameter_subset_y)) {
+	goto glm_main_ret_NOMEM;
+      }
+      raw_parameter_subset[0] = 1; // intercept (index 0) always included
+      numeric_range_list_to_bitarr(&(glm_info_ptr->parameters_range_list), raw_predictor_ct, 0, 1, raw_parameter_subset);
+      if (domdev_present && ((raw_parameter_subset[0] & 7) != 7)) {
+	// this breaks the joint test
+	logerrprint("Error: --parameters cannot exclude 1 or 2 when the 'genotypic' or 'hethom'\nmodifier is present.\n");
+	goto glm_main_ret_INVALID_CMDLINE;
+      }
+      if (add_sex_covar && first_interaction_pred_uidx) {
+	// special case: when add_sex_covar is true, the added sex covariate is
+	// simply the last covariate, with predictor index
+	// (first_interaction_pred_uidx - 1).  This lines up with --parameters
+	// when interactions are not requested; but when they are, we have a
+	// small reshuffle to do.
+	uintptr_t* parameter_subset_reshuffle_buf;
+	if (bigstack_calloc_ul(raw_predictor_ctl, &parameter_subset_reshuffle_buf)) {
+	  goto glm_main_ret_NOMEM;
+	}
+	copy_bitarr_range(raw_parameter_subset, 0, 0, first_interaction_pred_uidx - 1, parameter_subset_reshuffle_buf);
+	copy_bitarr_range(raw_parameter_subset, first_interaction_pred_uidx - 1, first_interaction_pred_uidx, raw_covar_ct * domdev_present_p1, parameter_subset_reshuffle_buf);
+	const uint32_t first_sex_parameter_idx = first_interaction_pred_uidx - 1 + raw_covar_ct * domdev_present_p1;
+	if (is_set(raw_parameter_subset, first_sex_parameter_idx)) {
+	  set_bit(first_interaction_pred_uidx - 1, parameter_subset_reshuffle_buf);
+	}
+	if (is_set(raw_parameter_subset, first_sex_parameter_idx + 1)) {
+	  set_bit(first_sex_parameter_idx + 1, parameter_subset_reshuffle_buf);
+	}
+	if (domdev_present && is_set(raw_parameter_subset, first_sex_parameter_idx + 2)) {
+	  set_bit(first_sex_parameter_idx + 2, parameter_subset_reshuffle_buf);
+	}
+	memcpy(raw_parameter_subset, parameter_subset_reshuffle_buf, raw_predictor_ctl * sizeof(intptr_t));
+	bigstack_reset(parameter_subset_reshuffle_buf);
+      }
+      // if there were any constant covariates, exclude them from
+      // raw_parameter_subset
+      // note that, if appended sex covariate is present at all, it is always
+      // nonconstant
+      uint32_t nonconst_covar_ct = 0;
+      if (initial_covar_include) {
+	nonconst_covar_ct = popcount_longs(initial_covar_include, raw_covar_ctl);
+      }
+      const uint32_t removed_covar_ct = raw_covar_ct - nonconst_covar_ct;
+      uint32_t covar_uidx = 0;
+      for (uint32_t removed_covar_idx = 0; removed_covar_idx < removed_covar_ct; ++removed_covar_idx, ++covar_uidx) {
+	next_unset_unsafe_ck(initial_covar_include, &covar_uidx);
+	clear_bit(first_covar_pred_uidx + covar_uidx, raw_parameter_subset);
+	if (first_interaction_pred_uidx) {
+	  const uint32_t geno_interaction_uidx = first_interaction_pred_uidx + covar_uidx * domdev_present_p1;
+	  clear_bit(geno_interaction_uidx, raw_parameter_subset);
+	  if (domdev_present) {
+	    clear_bit(geno_interaction_uidx + 1, raw_parameter_subset);
+	  }
+	}
+      }
+      // if any loaded nonconstant covariates aren't referenced in
+      // raw_parameter_subset, remove them from initial_covar_include
+      covar_uidx = 0;
+      for (uint32_t nonconst_covar_idx = 0; nonconst_covar_idx < nonconst_covar_ct; ++nonconst_covar_idx, ++covar_uidx) {
+	next_set_unsafe_ck(initial_covar_include, &covar_uidx);
+	uint32_t cur_covar_is_referenced = is_set(raw_parameter_subset, first_covar_pred_uidx + covar_uidx);
+	if (add_interactions) {
+	  cur_covar_is_referenced = cur_covar_is_referenced || is_set(raw_parameter_subset, first_interaction_pred_uidx + covar_uidx * domdev_present_p1);
+	  if (domdev_present) {
+	    cur_covar_is_referenced = cur_covar_is_referenced || is_set(raw_parameter_subset, first_interaction_pred_uidx + covar_uidx * 2 + 1);
+	  }
+	}
+	if (!cur_covar_is_referenced) {
+	  clear_bit(covar_uidx, initial_covar_include);
+	}
+      }
+      // if your regression doesn't involve genotype data, you should be using
+      // e.g. R, not plink...
+      if ((!(raw_parameter_subset[0] & 2)) &&
+	  ((!domdev_present) || (!(raw_parameter_subset[0] & 4))) &&
+	  ((!add_interactions) || (!popcount_bit_idx(raw_parameter_subset, first_interaction_pred_uidx, raw_predictor_ct)))) {
+	logerrprint("Error: --parameters must retain at least one dosage-dependent variable.\n");
+	goto glm_main_ret_INCONSISTENT_INPUT;
+      }
+    }
+    // computation of these counts moved here, since --parameters can reduce
+    // the number of relevant covariates
+    uint32_t initial_nonx_covar_ct = 0;
+    if (initial_covar_include) {
+      initial_nonx_covar_ct = popcount_longs(initial_covar_include, raw_covar_ctl);
+    }
+    uint32_t initial_y_covar_ct = 0;
+    uintptr_t* covar_include_y = nullptr;
+    if (!initial_nonx_covar_ct) {
+      // bigstack_reset(initial_covar_include); // not ok with parameters
+      initial_covar_include = nullptr;
+      covar_include = nullptr;
+    } else {
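+      // the appended sex covariate is constant (all-male) on chrY, so it
+      // isn't counted there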
+      initial_y_covar_ct = initial_nonx_covar_ct - (cur_sample_include_y_buf && add_sex_covar && is_set(initial_covar_include, raw_covar_ct - 1));
+      if (add_sex_covar && (!(glm_flags & kfGlmSex))) {
+	// may as well verify there's at least one non-x/non-y variant
+	// (if only chrX and chrY present, don't allocate
+	// cur_sample_include_x_buf, just make chrX the baseline instead)
+	if (is_set(initial_covar_include, raw_covar_ct - 1) && (variant_ct != variant_ct_x + variant_ct_y)) {
+	  if (bigstack_alloc_ul(raw_sample_ctl, &cur_sample_include_x_buf) ||
+	      bigstack_alloc_ul(raw_covar_ctl, &covar_include_x)) {
+	    goto glm_main_ret_NOMEM;
+	  }
+	  --initial_nonx_covar_ct;
+	}
+      }
+      if (cur_sample_include_y_buf) {
+	if (bigstack_alloc_ul(raw_covar_ctl, &covar_include_y)) {
+	  goto glm_main_ret_NOMEM;
+	}
+      }
+    }
+    const uint32_t report_adjust = (adjust_info_ptr->flags & kfAdjustColAll);
+    const uint32_t is_sometimes_firth = (glm_flags & (kfGlmFirthFallback | kfGlmFirth))? 1 : 0;
+    const uint32_t is_always_firth = glm_flags & kfGlmFirth;
+    const uint32_t glm_pos_col = glm_info_ptr->cols & kfGlmColPos;
+
+    unsigned char* bigstack_mark2 = g_bigstack_base;
+    for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+      const pheno_col_t* cur_pheno_col = &(pheno_cols[pheno_idx]);
+      const pheno_dtype_t dtype_code = cur_pheno_col->type_code;
+      const char* cur_pheno_name = &(pheno_names[pheno_idx * max_pheno_name_blen]);
+      if (dtype_code == kPhenoDtypeCat) {
+	// todo: check if there are only two categories after linear-style
+	// covariate QC, and automatically use ordinary logistic regression in
+	// that case?  (need to indicate which category is treated as 'case'
+	// and which is 'control'...)
+	// longer-term todo: multinomial logistic regression?
+	LOGPRINTFWW("--glm: Skipping categorical phenotype '%s'.\n", cur_pheno_name);
+	continue;
+      }
+
+      bitvec_and_copy(orig_sample_include, cur_pheno_col->nonmiss, raw_sample_ctl, cur_sample_include);
+      const uint32_t is_logistic = (dtype_code == kPhenoDtypeCc);
+      uint32_t sample_ct = popcount_longs(cur_sample_include, raw_sample_ctl);
+      if (is_logistic) {
+	const uint32_t initial_case_ct = popcount_longs_intersect(cur_sample_include, cur_pheno_col->data.cc, raw_sample_ctl);
+	if ((!initial_case_ct) || (initial_case_ct == sample_ct)) {
+	  LOGPRINTFWW("--glm: Skipping case/control phenotype '%s' since all samples are %s.\n", cur_pheno_name, initial_case_ct? "cases" : "controls");
+	  continue;
+	}
+      } else {
+	if (is_const_covar(cur_pheno_col, cur_sample_include, sample_ct)) {
+	  LOGPRINTFWW("--glm: Skipping constant quantitative phenotype '%s'.\n", cur_pheno_name);
+	  continue;
+	}
+      }
+      uint32_t covar_ct = 0;
+      uint32_t extra_cat_ct = 0;
+      uint32_t separation_warning = 0;
+      bigstack_double_reset(bigstack_mark2, bigstack_end_mark);
+      /*
+      for (uint32_t uii = 0; uii < raw_covar_ct; ++uii) {
+	const pheno_col_t* cur_covar_col = &(covar_cols[uii]);
+	for (uint32_t sample_uidx = 0; sample_uidx < raw_sample_ct; ++sample_uidx) {
+	  printf("%g ", cur_covar_col->data.qt[sample_uidx]);
+	}
+	printf("\n\n");
+      }
+      */
+      if (initial_nonx_covar_ct) {
+	if (glm_determine_covars(is_logistic? cur_pheno_col->data.cc : nullptr, initial_covar_include, covar_cols, raw_sample_ct, raw_covar_ctl, initial_nonx_covar_ct, covar_max_nonnull_cat_ct, is_sometimes_firth, cur_sample_include, covar_include, &sample_ct, &covar_ct, &extra_cat_ct, &separation_warning)) {
+	  goto glm_main_ret_NOMEM;
+	}
+      }
+      uint32_t predictor_ct = 2 + domdev_present + (covar_ct + extra_cat_ct) * (1 + add_interactions * domdev_present_p1);
+      if (raw_parameter_subset) {
+	collapse_parameter_subset(covar_include, raw_parameter_subset, domdev_present, raw_covar_ct, covar_ct, add_interactions, g_parameter_subset, &predictor_ct);
+      }
+      if (sample_ct <= predictor_ct) {
+	LOGERRPRINTFWW("Warning: Skipping --glm regression on phenotype '%s' since # samples <= # predictor columns.\n", cur_pheno_name);
+	if (separation_warning) {
+	  logerrprint("(Quasi-)separated covariate(s) were present.  Try removing inappropriate\ncovariates, and/or using Firth logistic regression.\n");
+	}
+	continue;
+      }
+#ifdef __LP64__
+      if (round_up_pow2(sample_ct, 4) * ((uint64_t)predictor_ct) > 0x7fffffff) {
+	// todo: remove this constraint in LAPACK_ILP64 case?
+	LOGERRPRINTFWW("Warning: Skipping --glm regression on phenotype '%s' since there are too many\nsamples or predictors (internal matrices limited to ~2^31 entries).\n", cur_pheno_name);
+	continue;
+      }
+#endif
+      uint32_t case_ct = 0;
+      if (is_logistic) {
+	case_ct = popcount_longs_intersect(cur_sample_include, cur_pheno_col->data.cc, raw_sample_ctl);
+	if ((!case_ct) || (case_ct == sample_ct)) {
+	  LOGPRINTFWW("--glm: Skipping case/control phenotype '%s' since all remaining samples are %s.\n", cur_pheno_name, case_ct? "cases" : "controls");
+	  // without any e.g. cases in the dataset, every single covariate
+	  // should fail the separation check, so covar_ct should be zero here
+	  assert(!covar_ct);
+	  continue;
+	}
+	if (sample_ct < 10 * predictor_ct) {
+	  LOGERRPRINTFWW("Warning: --glm remaining sample count is less than 10x predictor count for case/control phenotype '%s'.\n", cur_pheno_name);
+	}
+      } else {
+	// verify phenotype is still nonconstant
+	if (is_const_covar(cur_pheno_col, cur_sample_include, sample_ct)) {
+	  LOGPRINTFWW("--glm: Skipping quantitative phenotype '%s' since phenotype is constant for all remaining samples.\n", cur_pheno_name);
+	  continue;
+	}
+      }
+      if (covar_ct < initial_nonx_covar_ct) {
+	uint32_t covar_uidx = 0;
+	for (uint32_t covar_idx = 0; covar_idx < initial_nonx_covar_ct; ++covar_idx, ++covar_uidx) {
+	  next_set_unsafe_ck(initial_covar_include, &covar_uidx);
+	  if (!is_set(covar_include, covar_uidx)) {
+	    LOGERRPRINTFWW("Warning: %sot including covariate '%s' in --glm regression on phenotype '%s'.\n", cur_sample_include_x_buf? (cur_sample_include_y_buf? "Outside of chrX, n" : "Outside of chrX and chrY, n") : (cur_sample_include_y_buf? "Outside of chrY, n" : "N"), &(covar_names[covar_uidx * max_covar_name_blen]), cur_pheno_name);
+	  }
+	}
+      }
+      if (separation_warning) {
+	logerrprint("(Quasi-)separated covariate(s) were present.  Try removing inappropriate\ncovariates, and/or using Firth logistic regression.\n");
+      }
+
+      // cur_sample_include_x == nullptr: chrX uses same samples and covariates
+      //   as the rest of the genome.  sample_ct_x always zero to force most
+      //   chrX-specific initialization to be skipped (exception:
+      //   sex_male_collapsed, needed for allele count/freq reporting)
+      // cur_sample_include_x non-null: if sample_ct_x == 0, we skip the entire
+      //   chromosome.  otherwise, we have different covariates than the rest
+      //   of the genome.
+      uintptr_t* cur_sample_include_x = cur_sample_include_x_buf;
+      uint32_t sample_ct_x = 0;
+      uint32_t covar_ct_x = 0;
+      uint32_t extra_cat_ct_x = 0;
+      uint32_t predictor_ct_x = 0;
+      uint32_t x_samples_are_different = 0;
+      if (cur_sample_include_x) {
+	bitvec_and_copy(orig_sample_include, cur_pheno_col->nonmiss, raw_sample_ctl, cur_sample_include_x);
+        uint32_t separation_warning_x = 0;
+	if (glm_determine_covars(is_logistic? cur_pheno_col->data.cc : nullptr, initial_covar_include, covar_cols, raw_sample_ct, raw_covar_ctl, initial_nonx_covar_ct + 1, covar_max_nonnull_cat_ct, is_sometimes_firth, cur_sample_include_x, covar_include_x, &sample_ct_x, &covar_ct_x, &extra_cat_ct_x, &separation_warning_x)) {
+	  goto glm_main_ret_NOMEM;
+	}
+	x_samples_are_different = (sample_ct_x != sample_ct) || (!are_all_words_identical(cur_sample_include, cur_sample_include_x, raw_sample_ctl));
+	if ((!x_samples_are_different) && (covar_ct == covar_ct_x) && are_all_words_identical(covar_include, covar_include_x, raw_covar_ctl)) {
+	  LOGPRINTFWW("Note: chrX samples and covariate(s) in --glm regression on phenotype '%s' are the same as those for the rest of the genome.\n", cur_pheno_name);
+	  sample_ct_x = 0;
+	  cur_sample_include_x = nullptr;
+	} else {
+	  if (!sample_ct_x) {
+	    LOGERRPRINTFWW("Warning: Skipping chrX in --glm regression on phenotype '%s'.\n", cur_pheno_name);
+	  } else {
+            predictor_ct_x = 2 + domdev_present + (covar_ct_x + extra_cat_ct_x) * (1 + add_interactions * domdev_present_p1);
+	    if (raw_parameter_subset) {
+	      collapse_parameter_subset(covar_include, raw_parameter_subset, domdev_present, raw_covar_ct, covar_ct_x, add_interactions, g_parameter_subset_x, &predictor_ct_x);
+	    }
+	    if (sample_ct_x <= predictor_ct_x) {
+	      LOGERRPRINTFWW("Warning: Skipping chrX in --glm regression on phenotype '%s', since # remaining samples <= # predictor columns.\n", cur_pheno_name);
+	      sample_ct_x = 0;
+#ifdef __LP64__
+	    } else if (round_up_pow2(sample_ct_x, 4) * ((uint64_t)predictor_ct_x) > 0x7fffffff) {
+	      LOGERRPRINTFWW("Warning: Skipping chrX in --glm regression on phenotype '%s', since there are\ntoo many samples or predictors (internal matrices limited to ~2^31 entries).\n", cur_pheno_name);
+	      sample_ct_x = 0;
+#endif
+	    } else if (is_logistic) {
+	      const uint32_t case_ct_x = popcount_longs_intersect(cur_sample_include_x, cur_pheno_col->data.cc, raw_sample_ctl);
+	      if ((!case_ct_x) || (case_ct_x == sample_ct_x)) {
+		LOGERRPRINTFWW("Warning: Skipping chrX in --glm regression on phenotype '%s', since all remaining samples are %s.\n", cur_pheno_name, case_ct_x? "cases" : "controls");
+		sample_ct_x = 0;
+	      }
+	    } else {
+	      if (is_const_covar(cur_pheno_col, cur_sample_include_x, sample_ct_x)) {
+		LOGERRPRINTFWW("Warning: Skipping chrX in --glm regression on phenotype '%s', since phenotype is constant for all remaining samples.\n", cur_pheno_name);
+		sample_ct_x = 0;
+	      }
+	    }
+	    if (sample_ct_x && (covar_ct_x < initial_nonx_covar_ct + 1)) {
+	      uint32_t covar_uidx = 0;
+	      for (uint32_t covar_idx = 0; covar_idx < covar_ct_x; ++covar_idx, ++covar_uidx) {
+		next_set_unsafe_ck(initial_covar_include, &covar_uidx);
+		if (!is_set(covar_include_x, covar_uidx)) {
+		  LOGERRPRINTFWW("Warning: On chrX, not including covariate '%s' in --glm regression on phenotype '%s'.\n", &(covar_names[covar_uidx * max_covar_name_blen]), cur_pheno_name);
+		}
+	      }
+	    }
+	  }
+	  if (separation_warning_x && (!separation_warning)) {
+	    logerrprint("(Quasi-)separated covariate(s) were present on chrX.  Try removing inappropriate\ncovariates, and/or using Firth logistic regression.\n");
+	  }
+	}
+      }
+
+      uintptr_t* cur_sample_include_y = cur_sample_include_y_buf;
+      uint32_t sample_ct_y = 0;
+      uint32_t covar_ct_y = 0;
+      uint32_t extra_cat_ct_y = 0;
+      uint32_t predictor_ct_y = 0;
+      uint32_t y_samples_are_different = 0;
+      if (cur_sample_include_y) {
+	bitvec_and_copy(orig_sample_include, sex_male, raw_sample_ctl, cur_sample_include_y);
+	bitvec_and(cur_pheno_col->nonmiss, raw_sample_ctl, cur_sample_include_y);
+	uint32_t separation_warning_y = 0;
+	if (glm_determine_covars(is_logistic? cur_pheno_col->data.cc : nullptr, initial_covar_include, covar_cols, raw_sample_ct, raw_covar_ctl, initial_y_covar_ct, covar_max_nonnull_cat_ct, is_sometimes_firth, cur_sample_include_y, covar_include_y, &sample_ct_y, &covar_ct_y, &extra_cat_ct_y, &separation_warning_y)) {
+	  goto glm_main_ret_NOMEM;
+	}
+	y_samples_are_different = (sample_ct_y != sample_ct) || (!are_all_words_identical(cur_sample_include, cur_sample_include_y, raw_sample_ctl));
+	if ((!y_samples_are_different) && (covar_ct == covar_ct_y) && are_all_words_identical(covar_include, covar_include_y, raw_covar_ctl)) {
+	  LOGPRINTFWW("Note: chrY samples and covariate(s) in --glm regression on phenotype '%s' are the same as those for the rest of the genome.\n", cur_pheno_name);
+	  sample_ct_y = 0;
+	  cur_sample_include_y = nullptr;
+	} else {
+	  if (!sample_ct_y) {
+	    LOGERRPRINTFWW("Warning: Skipping chrY in --glm regression on phenotype '%s'.\n", cur_pheno_name);
+	  } else {
+            predictor_ct_y = 2 + domdev_present + (covar_ct_y + extra_cat_ct_y) * (1 + add_interactions * domdev_present_p1);
+	    if (raw_parameter_subset) {
+	      collapse_parameter_subset(covar_include, raw_parameter_subset, domdev_present, raw_covar_ct, covar_ct_y, add_interactions, g_parameter_subset_y, &predictor_ct_y);
+	    }
+	    if (sample_ct_y <= predictor_ct_y) {
+	      LOGERRPRINTFWW("Warning: Skipping chrY in --glm regression on phenotype '%s', since # remaining samples <= # predictor columns.\n", cur_pheno_name);
+	      sample_ct_y = 0;
+#ifdef __LP64__
+	    } else if (round_up_pow2(sample_ct_y, 4) * ((uint64_t)predictor_ct_y) > 0x7fffffff) {
+	      LOGERRPRINTFWW("Warning: Skipping chrY in --glm regression on phenotype '%s', since there are\ntoo many samples or predictors (internal matrices limited to ~2^31 entries).\n", cur_pheno_name);
+	      sample_ct_y = 0;
+#endif
+	    } else if (is_logistic) {
+	      const uint32_t case_ct_y = popcount_longs_intersect(cur_sample_include_y, cur_pheno_col->data.cc, raw_sample_ctl);
+	      if ((!case_ct_y) || (case_ct_y == sample_ct_y)) {
+		LOGERRPRINTFWW("Warning: Skipping chrY in --glm regression on phenotype '%s', since all remaining samples are %s.\n", cur_pheno_name, case_ct_y? "cases" : "controls");
+		sample_ct_y = 0;
+	      }
+	    } else {
+	      if (is_const_covar(cur_pheno_col, cur_sample_include_y, sample_ct_y)) {
+		LOGERRPRINTFWW("Warning: Skipping chrY in --glm regression on phenotype '%s', since phenotype is constant for all remaining samples.\n", cur_pheno_name);
+		sample_ct_y = 0;
+	      }
+	    }
+	    if (sample_ct_y && (covar_ct_y < initial_y_covar_ct)) {
+	      uint32_t covar_uidx = 0;
+	      for (uint32_t covar_idx = 0; covar_idx < covar_ct_y; ++covar_idx, ++covar_uidx) {
+		next_set_unsafe_ck(initial_covar_include, &covar_uidx);
+		if (!is_set(covar_include_y, covar_uidx)) {
+		  LOGERRPRINTFWW("Warning: On chrY, not including covariate '%s' in --glm regression on phenotype '%s'.\n", &(covar_names[covar_uidx * max_covar_name_blen]), cur_pheno_name);
+		}
+	      }
+	    }
+	  }
+	  if (separation_warning_y && (!separation_warning)) {
+	    logerrprint("(Quasi-)separated covariate(s) were present on chrY.  Try removing inappropriate\ncovariates, and/or using Firth logistic regression.\n");
+	  }
+	}
+      }
+
+      // Expand categorical covariates and perform VIF and correlation checks
+      // here.
+      double* pheno_d = nullptr;
+      double* covars_cmaj_d = nullptr;
+      // double* covar_dotprod_d = nullptr;
+      uintptr_t* pheno_cc = nullptr;
+      float* pheno_f = nullptr;
+      float* covars_cmaj_f = nullptr;
+      char** cur_covar_names = nullptr;
+      vif_corr_err_t vif_corr_check_result;
+      if (is_logistic) {
+	if (glm_alloc_fill_and_test_pheno_covars_cc(cur_sample_include, cur_pheno_col->data.cc, covar_include, covar_cols, covar_names, sample_ct, covar_ct, local_covar_ct, covar_max_nonnull_cat_ct, extra_cat_ct, max_covar_name_blen, vif_thresh, glm_info_ptr->max_corr, &pheno_cc, &pheno_f, &covars_cmaj_f, &cur_covar_names, &vif_corr_check_result)) {
+	  goto glm_main_ret_NOMEM;
+	}
+      } else {
+	if (glm_alloc_fill_and_test_pheno_covars_qt(cur_sample_include, cur_pheno_col->data.qt, covar_include, covar_cols, covar_names, sample_ct, covar_ct, local_covar_ct, covar_max_nonnull_cat_ct, extra_cat_ct, max_covar_name_blen, vif_thresh, glm_info_ptr->max_corr, &pheno_d, &covars_cmaj_d, &cur_covar_names, &vif_corr_check_result)) {
+	  goto glm_main_ret_NOMEM;
+	}
+      }
+      if (vif_corr_check_result.errcode) {
+	if (vif_corr_check_result.covar_idx1 == 0xffffffffU) {
+	  // must be correlation matrix inversion failure
+	  LOGERRPRINTFWW("Warning: Skipping --glm regression on phenotype '%s' since covariate correlation matrix could not be inverted. You may want to remove redundant covariates and try again.\n", cur_pheno_name);
+	} else {
+	  if (vif_corr_check_result.errcode == kVifCorrCheckVifFail) {
+	    LOGERRPRINTFWW("Warning: Skipping --glm regression on phenotype '%s' since variance inflation factor for covariate '%s' is too high. You may want to remove redundant covariates and try again.\n", cur_pheno_name, cur_covar_names[vif_corr_check_result.covar_idx1]);
+	  } else {
+	    LOGERRPRINTFWW("Warning: Skipping --glm regression on phenotype '%s' since correlation between covariates '%s' and '%s' is too high. You may want to remove redundant covariates and try again.\n", cur_pheno_name, cur_covar_names[vif_corr_check_result.covar_idx1], cur_covar_names[vif_corr_check_result.covar_idx2]);
+	  }
+	}
+	continue;
+      }
+      char** cur_covar_names_x = nullptr;
+      if (sample_ct_x) {
+	if (is_logistic) {
+	  if (glm_alloc_fill_and_test_pheno_covars_cc(cur_sample_include_x, cur_pheno_col->data.cc, covar_include_x, covar_cols, covar_names, sample_ct_x, covar_ct_x, local_covar_ct, covar_max_nonnull_cat_ct, extra_cat_ct_x, max_covar_name_blen, vif_thresh, glm_info_ptr->max_corr, &g_pheno_x_cc, &g_pheno_x_f, &g_covars_cmaj_x_f, &cur_covar_names_x, &vif_corr_check_result)) {
+	    goto glm_main_ret_NOMEM;
+	  }
+	} else {
+	  if (glm_alloc_fill_and_test_pheno_covars_qt(cur_sample_include_x, cur_pheno_col->data.qt, covar_include_x, covar_cols, covar_names, sample_ct_x, covar_ct_x, local_covar_ct, covar_max_nonnull_cat_ct, extra_cat_ct_x, max_covar_name_blen, vif_thresh, glm_info_ptr->max_corr, &g_pheno_x_d, &g_covars_cmaj_x_d, &cur_covar_names_x, &vif_corr_check_result)) {
+	    goto glm_main_ret_NOMEM;
+	  }
+	}
+	if (vif_corr_check_result.errcode) {
+	  // maybe these prints should be in a separate function...
+	  if (vif_corr_check_result.covar_idx1 == 0xffffffffU) {
+	    LOGERRPRINTFWW("Warning: Skipping chrX in --glm regression on phenotype '%s', since covariate correlation matrix could not be inverted. You may want to remove redundant covariates and try again.\n", cur_pheno_name);
+	  } else {
+	    if (vif_corr_check_result.errcode == kVifCorrCheckVifFail) {
+	      LOGERRPRINTFWW("Warning: Skipping chrX in --glm regression on phenotype '%s', since variance inflation factor for covariate '%s' is too high. You may want to remove redundant covariates and try again.\n", cur_pheno_name, cur_covar_names_x[vif_corr_check_result.covar_idx1]);
+	    } else {
+	      LOGERRPRINTFWW("Warning: Skipping chrX in --glm regression on phenotype '%s', since correlation between covariates '%s' and '%s' is too high. You may want to remove redundant covariates and try again.\n", cur_pheno_name, cur_covar_names_x[vif_corr_check_result.covar_idx1], cur_covar_names_x[vif_corr_check_result.covar_idx2]);
+	    }
+	  }
+	  sample_ct_x = 0;
+	}
+      }
+      char** cur_covar_names_y = nullptr;
+      if (sample_ct_y) {
+	if (is_logistic) {
+	  if (glm_alloc_fill_and_test_pheno_covars_cc(cur_sample_include_y, cur_pheno_col->data.cc, covar_include_y, covar_cols, covar_names, sample_ct_y, covar_ct_y, local_covar_ct, covar_max_nonnull_cat_ct, extra_cat_ct_y, max_covar_name_blen, vif_thresh, glm_info_ptr->max_corr, &g_pheno_y_cc, &g_pheno_y_f, &g_covars_cmaj_y_f, &cur_covar_names_y, &vif_corr_check_result)) {
+	    goto glm_main_ret_NOMEM;
+	  }
+	} else {
+	  if (glm_alloc_fill_and_test_pheno_covars_qt(cur_sample_include_y, cur_pheno_col->data.qt, covar_include_y, covar_cols, covar_names, sample_ct_y, covar_ct_y, local_covar_ct, covar_max_nonnull_cat_ct, extra_cat_ct_y, max_covar_name_blen, vif_thresh, glm_info_ptr->max_corr, &g_pheno_y_d, &g_covars_cmaj_y_d, &cur_covar_names_y, &vif_corr_check_result)) {
+	    goto glm_main_ret_NOMEM;
+	  }
+	}
+	if (vif_corr_check_result.errcode) {
+	  if (vif_corr_check_result.covar_idx1 == 0xffffffffU) {
+	    LOGERRPRINTFWW("Warning: Skipping chrY in --glm regression on phenotype '%s', since covariate correlation matrix could not be inverted. You may want to remove redundant covariates and try again.\n", cur_pheno_name);
+	  } else {
+	    if (vif_corr_check_result.errcode == kVifCorrCheckVifFail) {
+	      LOGERRPRINTFWW("Warning: Skipping chrY in --glm regression on phenotype '%s', since variance inflation factor for covariate '%s' is too high. You may want to remove redundant covariates and try again.\n", cur_pheno_name, cur_covar_names_y[vif_corr_check_result.covar_idx1]);
+	    } else {
+	      LOGERRPRINTFWW("Warning: Skipping chrY in --glm regression on phenotype '%s', since correlation between covariates '%s' and '%s' is too high. You may want to remove redundant covariates and try again.\n", cur_pheno_name, cur_covar_names_y[vif_corr_check_result.covar_idx1], cur_covar_names_y[vif_corr_check_result.covar_idx2]);
+	    }
+	  }
+	  sample_ct_y = 0;
+	}
+      }
+      char** cur_test_names = nullptr;
+      char** cur_test_names_x = nullptr;
+      char** cur_test_names_y = nullptr;
+      if (alloc_and_init_reported_test_names(g_parameter_subset, cur_covar_names, glm_flags, covar_ct + extra_cat_ct, &cur_test_names)) {
+	goto glm_main_ret_NOMEM;
+      }
+      if (sample_ct_x) {
+	if (alloc_and_init_reported_test_names(g_parameter_subset_x, cur_covar_names_x, glm_flags, covar_ct_x + extra_cat_ct_x, &cur_test_names_x)) {
+	  goto glm_main_ret_NOMEM;
+	}
+      }
+      if (sample_ct_y) {
+	if (alloc_and_init_reported_test_names(g_parameter_subset_y, cur_covar_names_y, glm_flags, covar_ct_y + extra_cat_ct_y, &cur_test_names_y)) {
+	  goto glm_main_ret_NOMEM;
+	}
+      }
+      if (joint_test) {
+	if (is_logistic) {
+	  // will need more parameters when --tests is implemented
+	  if (alloc_and_init_constraints_f(predictor_ct, &g_constraint_ct, &g_constraints_con_major_f)) {
+	    goto glm_main_ret_NOMEM;
+	  }
+	  if (sample_ct_x) {
+	    if (alloc_and_init_constraints_f(predictor_ct_x, &g_constraint_ct_x, &g_constraints_con_major_x_f)) {
+	      goto glm_main_ret_NOMEM;
+	    }
+	  }
+	  if (sample_ct_y) {
+	    if (alloc_and_init_constraints_f(predictor_ct_y, &g_constraint_ct_y, &g_constraints_con_major_y_f)) {
+	      goto glm_main_ret_NOMEM;
+	    }
+	  }
+	} else {
+	  if (alloc_and_init_constraints_d(predictor_ct, &g_constraint_ct, &g_constraints_con_major)) {
+	    goto glm_main_ret_NOMEM;
+	  }
+	  if (sample_ct_x) {
+	    if (alloc_and_init_constraints_d(predictor_ct_x, &g_constraint_ct_x, &g_constraints_con_major_x)) {
+	      goto glm_main_ret_NOMEM;
+	    }
+	  }
+	  if (sample_ct_y) {
+	    if (alloc_and_init_constraints_d(predictor_ct_y, &g_constraint_ct_y, &g_constraints_con_major_y)) {
+	      goto glm_main_ret_NOMEM;
+	    }
+	  }
+	}
+      }
+      
+      // okay, we know what variants we're running the regression on, and we've
+      // done much of the necessary covariate preprocessing.  now prepare to
+      // launch glm_logistic()/glm_linear().
+
+      const uintptr_t* cur_variant_include = early_variant_include;
+      const uintptr_t* cur_local_variant_include = local_variant_include;
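+      // When chrX variants are present, they are skipped if xchr_model is 0,
+      // or if a chrX-specific sample subset exists but came up empty; chrY is
+      // analogous, skipped when no males remain or the chrY-specific subset
+      // is empty.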
+      const uint32_t skip_x = variant_ct_x && ((!xchr_model) || (cur_sample_include_x && (!sample_ct_x)));
+      const uint32_t skip_y = variant_ct_y && ((!male_ct) || (cur_sample_include_y && (!sample_ct_y)));
+      uint32_t cur_variant_ct = variant_ct;
+      if (skip_x || skip_y) {
+	uintptr_t* tmp_variant_include;
+	if (bigstack_alloc_ul(raw_variant_ctl, &tmp_variant_include)) {
+	  goto glm_main_ret_NOMEM;
+	}
+	memcpy(tmp_variant_include, early_variant_include, raw_variant_ctl * sizeof(intptr_t));
+	uintptr_t* tmp_local_variant_include = nullptr;
+	if (local_variant_include) {
+	  if (bigstack_alloc_ul(local_variant_ctl, &tmp_local_variant_include)) {
+	    goto glm_main_ret_NOMEM;
+	  }
+	  memcpy(tmp_local_variant_include, local_variant_include, local_variant_ctl * sizeof(intptr_t));
+	}
+	if (skip_x) {
+	  if (local_variant_include) {
+	    const uint32_t variant_ct_before_x = popcount_bit_idx(early_variant_include, 0, x_start);
+	    uint32_t local_uidx_first = idx_to_uidx_basic(local_variant_include, variant_ct_before_x);
+	    uint32_t local_uidx_last = jump_forward_set_unsafe(local_variant_include, local_uidx_first, variant_ct_x);
+	    clear_bits_nz(local_uidx_first, local_uidx_last + 1, tmp_local_variant_include);
+	  }
+	  clear_bits_nz(x_start, x_end, tmp_variant_include);
+	  cur_variant_ct -= variant_ct_x;
+	}
+	if (skip_y) {
+	  if (local_variant_include) {
+	    const uint32_t variant_ct_before_y = popcount_bit_idx(early_variant_include, 0, y_start);
+	    uint32_t local_uidx_first = idx_to_uidx_basic(local_variant_include, variant_ct_before_y);
+	    uint32_t local_uidx_last = jump_forward_set_unsafe(local_variant_include, local_uidx_first, variant_ct_y);
+	    clear_bits_nz(local_uidx_first, local_uidx_last + 1, tmp_local_variant_include);
+	  }
+	  clear_bits_nz(y_start, y_end, tmp_variant_include);
+	  cur_variant_ct -= variant_ct_y;
+	}
+	cur_variant_include = tmp_variant_include;
+	cur_local_variant_include = tmp_local_variant_include;
+      }
+      if (sex_male_collapsed_buf && (!skip_x)) {
+	if (!cur_sample_include_x) {
+	  copy_bitarr_subset(sex_male, cur_sample_include, sample_ct, sex_male_collapsed_buf);
+	} else {
+	  copy_bitarr_subset(sex_male, cur_sample_include_x, sample_ct_x, sex_male_collapsed_buf);
+	}
+      }
+      // todo: if permutation test, also keep whatever statistic is most
+      // appropriate for that
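+      // Presumably (not spelled out here), the cumulative popcounts cache
+      // the number of set bits preceding each word of cur_sample_include, so
+      // raw sample indices can be translated to subsetted row indices in
+      // O(1) during the per-variant loops.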
+      fill_cumulative_popcounts(cur_sample_include, raw_sample_ctl, g_sample_include_cumulative_popcounts);
+      g_sample_ct = sample_ct;
+      g_sample_ct_x = sample_ct_x;
+      g_covar_ct = covar_ct + extra_cat_ct;
+      g_local_covar_ct = local_covar_ct;
+      if (sample_ct_x) {
+	if (bigstack_alloc_ui(raw_sample_ctl, &g_sample_include_x_cumulative_popcounts)) {
+	  goto glm_main_ret_NOMEM;
+	}
+	fill_cumulative_popcounts(cur_sample_include_x, raw_sample_ctl, g_sample_include_x_cumulative_popcounts);
+	g_sample_include_x = cur_sample_include_x;
+	g_covar_ct_x = covar_ct_x + extra_cat_ct_x;
+        // g_male_ct = popcount_longs_intersect(cur_sample_include_x, sex_male, raw_sample_ctl);
+      } else {
+	// technically only need this if variant_ct_x && (!skip_x)
+	// g_male_ct = popcount_longs_intersect(cur_sample_include, sex_male, raw_sample_ctl);
+	
+	// defensive
+	g_sample_include_x = nullptr;
+	g_sample_include_x_cumulative_popcounts = nullptr;
+	g_covar_ct_x = 0;
+      }
+      g_sample_ct_y = sample_ct_y;
+      if (sample_ct_y) {
+	if (bigstack_alloc_ui(raw_sample_ctl, &g_sample_include_y_cumulative_popcounts)) {
+	  goto glm_main_ret_NOMEM;
+	}
+	fill_cumulative_popcounts(cur_sample_include_y, raw_sample_ctl, g_sample_include_y_cumulative_popcounts);
+	g_sample_include_y = cur_sample_include_y;
+	g_covar_ct_y = covar_ct_y + extra_cat_ct_y;
+      } else {
+	g_sample_include_y = nullptr;
+	g_sample_include_y_cumulative_popcounts = nullptr;
+	g_covar_ct_y = 0;
+      }
+
+      uintptr_t* valid_variants = nullptr;
+      double* orig_pvals = nullptr;
+      double* orig_chisq = nullptr;
+      if (report_adjust || perms_total) {
+	if (bigstack_alloc_ul(raw_variant_ctl, &valid_variants) ||
+	    bigstack_alloc_d(cur_variant_ct, &orig_pvals)) {
+	  goto glm_main_ret_NOMEM;
+	}
+	memcpy(valid_variants, cur_variant_include, raw_variant_ctl * sizeof(intptr_t));
+	if (report_adjust || (!is_logistic)) {
+	  if (bigstack_alloc_d(cur_variant_ct, &orig_chisq)) {
+	    goto glm_main_ret_NOMEM;
+	  }
+	}
+      }
+
+      if (alloc_and_fill_subset_chr_fo_vidx_start(cur_variant_include, cip, &g_subset_chr_fo_vidx_start)) {
+	goto glm_main_ret_NOMEM;
+      }
+      g_variant_include = cur_variant_include;
+      g_variant_ct = cur_variant_ct;
+      char* outname_end2 = strcpya(&(outname_end[1]), cur_pheno_name);
+      if (is_logistic) {
+	g_pheno_cc = pheno_cc;
+	g_pheno_f = pheno_f;
+	g_covars_cmaj_f = covars_cmaj_f;
+	if (is_always_firth) {
+	  outname_end2 = strcpya(outname_end2, ".glm.firth");
+	} else if (is_sometimes_firth) {
+	  outname_end2 = strcpya(outname_end2, ".glm.logistic.hybrid");
+	} else {
+	  outname_end2 = strcpya(outname_end2, ".glm.logistic");
+	}
+      } else {
+	g_pheno_d = pheno_d;
+	g_covars_cmaj_d = covars_cmaj_d;
+	outname_end2 = strcpya(outname_end2, ".glm.linear");
+      }
+      // write IDs
+      strcpy(outname_end2, ".id");
+      reterr = write_sample_ids(cur_sample_include, sample_ids, sids, outname, sample_ct, max_sample_id_blen, max_sid_blen);
+      if (reterr) {
+	goto glm_main_ret_1;
+      }
+      if (sample_ct_x && x_samples_are_different) {
+	strcpy(&(outname_end2[3]), ".x");
+	reterr = write_sample_ids(cur_sample_include_x, sample_ids, sids, outname, sample_ct_x, max_sample_id_blen, max_sid_blen);
+	if (reterr) {
+	  goto glm_main_ret_1;
+	}
+      }
+      if (sample_ct_y && y_samples_are_different) {
+	strcpy(&(outname_end2[3]), ".y");
+	reterr = write_sample_ids(cur_sample_include_y, sample_ids, sids, outname, sample_ct_y, max_sample_id_blen, max_sid_blen);
+	if (reterr) {
+	  goto glm_main_ret_1;
+	}
+      }
+      
+      if (output_zst) {
+	outname_end2 = strcpya(outname_end2, ".zst");
+      }
+      *outname_end2 = '\0';
+
+      if (is_logistic) {
+	reterr = glm_logistic(cur_pheno_name, cur_test_names, cur_test_names_x, cur_test_names_y, glm_pos_col? variant_bps : nullptr, variant_ids, allele_storage, glm_info_ptr, local_sample_uidx_order, cur_local_variant_include, outname, raw_variant_ct, max_chr_blen, ci_size, pfilter, output_min_p, max_thread_ct, pgr_alloc_cacheline_ct, local_sample_ct, local_loadbuf_size, pgfip, gz_local_covar_file, valid_variants, orig_pvals, orig_chisq, overflow_buf, local_loadbuf);
+      } else {
+	reterr = glm_linear(cur_pheno_name, cur_test_names, cur_test_names_x, cur_test_names_y, glm_pos_col? variant_bps : nullptr, variant_ids, allele_storage, glm_info_ptr, local_sample_uidx_order, cur_local_variant_include, outname, raw_variant_ct, max_chr_blen, ci_size, pfilter, output_min_p, max_thread_ct, pgr_alloc_cacheline_ct, local_sample_ct, local_loadbuf_size, pgfip, gz_local_covar_file, valid_variants, orig_pvals, orig_chisq, overflow_buf, local_loadbuf);
+      }
+      if (reterr) {
+	goto glm_main_ret_1;
+      }
+      if (perms_total) {
+	// todo
+	logerrprint("Error: --glm permutation tests are under development.\n");
+	reterr = kPglRetNotYetSupported;
+	goto glm_main_ret_1;
+      }
+    }
+  }
+  while (0) {
+  glm_main_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  glm_main_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  glm_main_ret_INVALID_CMDLINE:
+    reterr = kPglRetInvalidCmdline;
+    break;
+  glm_main_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  glm_main_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ glm_main_ret_1:
+  gz_token_stream_close(&gts);
+  gzclose_cond(gz_local_covar_file);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  return reterr;
+}
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/plink2_glm.h b/plink2_glm.h
new file mode 100644
index 0000000..badf943
--- /dev/null
+++ b/plink2_glm.h
@@ -0,0 +1,112 @@
+#ifndef __PLINK2_GLM_H__
+#define __PLINK2_GLM_H__
+
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_adjust.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+FLAGSET_DEF_START()
+  kfGlm0,
+  kfGlmZs = (1 << 0),
+
+  // mutually exclusive
+  kfGlmSex = (1 << 1),
+  kfGlmNoXSex = (1 << 2),
+
+  // mutually exclusive
+  kfGlmGenotypic = (1 << 3),
+  kfGlmHethom = (1 << 4),
+  kfGlmDominant = (1 << 5),
+  kfGlmRecessive = (1 << 6),
+
+  kfGlmInteraction = (1 << 7),
+  kfGlmHideCovar = (1 << 8),
+  kfGlmIntercept = (1 << 9),
+  kfGlmFirthFallback = (1 << 10),
+  kfGlmFirth = (1 << 11),
+  kfGlmPerm = (1 << 12),
+  kfGlmPermCount = (1 << 13),
+  kfGlmConditionDominant = (1 << 14),
+  kfGlmConditionRecessive = (1 << 15),
+  kfGlmLocalOmitLast = (1 << 16),
+  kfGlmTestsAll = (1 << 17)
+FLAGSET_DEF_END(glm_flags_t);
+
+FLAGSET_DEF_START()
+  kfGlmCol0,
+  kfGlmColChrom = (1 << 0),
+  kfGlmColPos = (1 << 1),
+  kfGlmColRef = (1 << 2),
+  kfGlmColAlt1 = (1 << 3),
+  kfGlmColAlt = (1 << 4),
+  kfGlmColAltcount = (1 << 5),
+  kfGlmColTotallele = (1 << 6),
+  kfGlmColAltcountcc = (1 << 7),
+  kfGlmColTotallelecc = (1 << 8),
+  kfGlmColAltfreq = (1 << 9),
+  kfGlmColAltfreqcc = (1 << 10),
+  kfGlmColMachR2 = (1 << 11),
+  kfGlmColFirthYn = (1 << 12),
+  kfGlmColTest = (1 << 13),
+  kfGlmColNobs = (1 << 14),
+
+  // if beta specified, ignore orbeta
+  kfGlmColBeta = (1 << 15),
+  kfGlmColOrbeta = (1 << 16),
+  
+  kfGlmColSe = (1 << 17),
+  kfGlmColCi = (1 << 18),
+  kfGlmColT = (1 << 19),
+  kfGlmColP = (1 << 20),
+  kfGlmColDefault = (kfGlmColChrom | kfGlmColPos | kfGlmColRef | kfGlmColAlt | kfGlmColFirthYn | kfGlmColTest | kfGlmColNobs | kfGlmColOrbeta | kfGlmColSe | kfGlmColCi | kfGlmColT | kfGlmColP),
+  // (last column flag) * 2 - (first column flag) sets every bit from
+  // kfGlmColChrom through kfGlmColP; using kfGlmColCi here would silently
+  // drop the T and P columns from 'all'.
+  kfGlmColAll = ((kfGlmColP * 2) - kfGlmColChrom)
+FLAGSET_DEF_END(glm_cols_t);
+
+typedef struct glm_info_struct {
+  glm_flags_t flags;
+  glm_cols_t cols;
+  uint32_t mperm_ct;
+  uint32_t local_cat_ct;
+  double max_corr;
+  char* condition_varname;
+  char* condition_list_fname;
+  range_list_t parameters_range_list;
+  range_list_t tests_range_list;
+} glm_info_t;
+
+void init_glm(glm_info_t* glm_info_ptr);
+
+void cleanup_glm(glm_info_t* glm_info_ptr);
+
+// for testing purposes
+// plink2_matrix.h must be included in this file
+// boolerr_t logistic_regression(const float* yy, const float* xx, uint32_t sample_ct, uint32_t predictor_ct, float* coef, float* ll, float* pp, float* vv, float* hh, float* grad, float* dcoef);
+
+// boolerr_t firth_regression(const float* yy, const float* xx, uint32_t sample_ct, uint32_t predictor_ct, float* coef, float* hh, matrix_finvert_buf1_t* inv_1d_buf, float* flt_2d_buf, float* pp, float* vv, float* grad, float* dcoef, float* ww, float* tmpnxk_buf);
+
+pglerr_t glm_main(const uintptr_t* orig_sample_include, const char* sample_ids, const char* sids, const uintptr_t* sex_nm, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const char* pheno_names, const pheno_col_t* covar_cols, const char* covar_names, const uintptr_t* orig_variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const glm_info_t* glm_info_ptr, const adjust_info_t* adjus [...]
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+ 
+#endif // __PLINK2_GLM_H__
diff --git a/plink2_help.cpp b/plink2_help.cpp
new file mode 100644
index 0000000..99e9dd2
--- /dev/null
+++ b/plink2_help.cpp
@@ -0,0 +1,1717 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+// necessary to include this instead of plink2_common so g_cmdline_format_str[]
+// is known to have external linkage
+#include "plink2_help.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+const char g_cmdline_format_str[] = "\n  plink2 [input flag(s)...] {command flag(s)...} {other flag(s)...}\n  plink2 --help {flag name(s)...}\n\n";
+
+uint32_t edit1_match(const char* s1, const char* s2, uint32_t len1, uint32_t len2) {
+  // permit one difference of the following forms:
+  // - inserted/deleted character
+  // - replaced character
+  // - adjacent pair of swapped characters
+  uint32_t diff_found = 0;
+  uint32_t pos = 0;
+  if (len1 == len2) {
+    while (pos < len1) {
+      if (s1[pos] != s2[pos]) {
+	if (diff_found) {
+	  if ((diff_found == 2) || (s1[pos] != s2[pos - 1]) || (s1[pos - 1] != s2[pos])) {
+	    return 0;
+	  }
+	}
+	++diff_found;
+      }
+      ++pos;
+    }
+  } else if (len1 == len2 - 1) {
+    do {
+      if (s1[pos - diff_found] != s2[pos]) {
+	if (diff_found) {
+	  return 0;
+	}
+	++diff_found;
+      }
+      ++pos;
+    } while (pos < len2);
+  } else if (len1 == len2 + 1) {
+    do {
+      if (s1[pos] != s2[pos - diff_found]) {
+	if (diff_found) {
+	  return 0;
+	}
+	++diff_found;
+      }
+      ++pos;
+    } while (pos < len1);
+  } else {
+    return 0;
+  }
+  return 1;
+}
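+
+// A few worked examples for intuition (hypothetical inputs; note that the
+// length-difference branches read one byte past the shorter string, so
+// NUL-terminated arguments are assumed):
+//   edit1_match("bfile", "bfily", 5, 5) -> 1  (one replaced character)
+//   edit1_match("bfile", "bfiel", 5, 5) -> 1  (adjacent swapped pair)
+//   edit1_match("bfile", "bfil", 5, 4)  -> 1  (one deleted character)
+//   edit1_match("bed", "bim", 3, 3)     -> 0  (two replacements)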
+
+// for better or worse, when this is too small we find out quickly due to
+// segfault...
+CONSTU31(kMaxEqualHelpParams, 13);
+
+typedef struct help_ctrl_struct {
+  // 2 = exact-match pass, 1 = prefix-match pass, 0 = print pass
+  uint32_t iters_left;
+  uint32_t param_ct;
+  char** argv;
+  uintptr_t unmatched_ct;
+  // one bit per --help argument; set once the argument matches anything
+  uintptr_t* all_match_arr;
+  uintptr_t* prefix_match_arr;
+  uintptr_t* perfect_match_arr;
+  uint32_t* param_slens;
+  uint32_t preprint_newline;
+} help_ctrl_t;
+
+void help_print(const char* cur_params, help_ctrl_t* help_ctrl_ptr, uint32_t postprint_newline, const char* payload) {
+  if (help_ctrl_ptr->param_ct) {
+    strcpy(g_textbuf, cur_params);
+    uint32_t cur_param_ct = 1;
+    char* cur_param_start[kMaxEqualHelpParams];
+    cur_param_start[0] = g_textbuf;
+    char* textbuf_iter = strchr(g_textbuf, '\t');
+    while (textbuf_iter) {
+      *textbuf_iter++ = '\0';
+      cur_param_start[cur_param_ct++] = textbuf_iter;
+      textbuf_iter = strchr(textbuf_iter, '\t');
+    }
+    if (help_ctrl_ptr->iters_left) {
+      const uint32_t orig_unmatched_ct = help_ctrl_ptr->unmatched_ct;
+      if (help_ctrl_ptr->unmatched_ct) {
+	uint32_t arg_uidx = 0;
+	if (help_ctrl_ptr->iters_left == 2) {
+	  for (uint32_t arg_idx = 0; arg_idx < orig_unmatched_ct; ++arg_idx, ++arg_uidx) {
+	    arg_uidx = next_unset_unsafe(help_ctrl_ptr->all_match_arr, arg_uidx);
+	    for (uint32_t cur_param_idx = 0; cur_param_idx < cur_param_ct; ++cur_param_idx) {
+	      if (!strcmp(cur_param_start[cur_param_idx], help_ctrl_ptr->argv[arg_uidx])) {
+		SET_BIT(arg_uidx, help_ctrl_ptr->perfect_match_arr);
+		SET_BIT(arg_uidx, help_ctrl_ptr->prefix_match_arr);
+		SET_BIT(arg_uidx, help_ctrl_ptr->all_match_arr);
+		help_ctrl_ptr->unmatched_ct -= 1;
+		break;
+	      }
+	    }
+	  }
+	} else {
+          uint32_t cur_param_slens[kMaxEqualHelpParams];
+	  for (uint32_t cur_param_idx = 0; cur_param_idx < cur_param_ct; ++cur_param_idx) {
+	    cur_param_slens[cur_param_idx] = strlen(cur_param_start[cur_param_idx]);
+	  }
+	  for (uint32_t arg_idx = 0; arg_idx < orig_unmatched_ct; ++arg_idx, ++arg_uidx) {
+	    arg_uidx = next_unset_unsafe(help_ctrl_ptr->all_match_arr, arg_uidx);
+	    const uint32_t slen = help_ctrl_ptr->param_slens[arg_uidx];
+	    for (uint32_t cur_param_idx = 0; cur_param_idx < cur_param_ct; ++cur_param_idx) {
+	      if (cur_param_slens[cur_param_idx] > slen) {
+		if (!memcmp(help_ctrl_ptr->argv[arg_uidx], cur_param_start[cur_param_idx], slen)) {
+		  SET_BIT(arg_uidx, help_ctrl_ptr->prefix_match_arr);
+		  SET_BIT(arg_uidx, help_ctrl_ptr->all_match_arr);
+		  help_ctrl_ptr->unmatched_ct -= 1;
+		  break;
+		}
+	      }
+	    }
+	  }
+	}
+      }
+    } else {
+      uint32_t cur_param_slens[kMaxEqualHelpParams];
+      for (uint32_t cur_param_idx = 0; cur_param_idx < cur_param_ct; ++cur_param_idx) {
+	cur_param_slens[cur_param_idx] = strlen(cur_param_start[cur_param_idx]);
+      }
+      uint32_t print_this = 0;
+      for (uint32_t arg_uidx = 0; arg_uidx < help_ctrl_ptr->param_ct; ++arg_uidx) {
+	if (IS_SET(help_ctrl_ptr->prefix_match_arr, arg_uidx)) {
+	  if (!print_this) {
+	    if (IS_SET(help_ctrl_ptr->perfect_match_arr, arg_uidx)) {
+	      for (uint32_t cur_param_idx = 0; cur_param_idx < cur_param_ct; ++cur_param_idx) {
+		if (!strcmp(cur_param_start[cur_param_idx], help_ctrl_ptr->argv[arg_uidx])) {
+		  print_this = 1;
+		  break;
+		}
+	      }
+	    } else {
+	      const uint32_t slen = help_ctrl_ptr->param_slens[arg_uidx];
+	      for (uint32_t cur_param_idx = 0; cur_param_idx < cur_param_ct; ++cur_param_idx) {
+		if (cur_param_slens[cur_param_idx] > slen) {
+		  if (!memcmp(help_ctrl_ptr->argv[arg_uidx], cur_param_start[cur_param_idx], slen)) {
+		    print_this = 1;
+		    break;
+		  }
+		}
+	      }
+	    }
+	  }
+	} else {
+	  for (uint32_t cur_param_idx = 0; cur_param_idx < cur_param_ct; ++cur_param_idx) {
+	    if (edit1_match(cur_param_start[cur_param_idx], help_ctrl_ptr->argv[arg_uidx], cur_param_slens[cur_param_idx], help_ctrl_ptr->param_slens[arg_uidx])) {
+	      print_this = 1;
+	      if (!IS_SET(help_ctrl_ptr->all_match_arr, arg_uidx)) {
+		SET_BIT(arg_uidx, help_ctrl_ptr->all_match_arr);
+		help_ctrl_ptr->unmatched_ct -= 1;
+	      }
+	      break;
+	    }
+	  }
+	}
+      }
+      if (print_this) {
+	const uint32_t payload_slen = strlen(payload);
+	const char* payload_end;
+	if (payload[payload_slen - 2] == '\n') {
+	  payload_end = &(payload[payload_slen - 1]);
+	} else {
+	  payload_end = &(payload[payload_slen]);
+	}
+	if (help_ctrl_ptr->preprint_newline) {
+	  putc_unlocked('\n', stdout);
+	}
+	help_ctrl_ptr->preprint_newline = postprint_newline;
+	const char* payload_iter = payload;
+	do {
+	  const char* line_end = (const char*)rawmemchr(payload_iter, '\n') + 1;
+	  uint32_t line_slen = (uint32_t)(line_end - payload_iter);
+	  if (line_slen > 2) {
+	    payload_iter = &(payload_iter[2]);
+	    line_slen -= 2;
+	  }
+	  memcpyx(g_textbuf, payload_iter, line_slen, 0);
+	  fputs(g_textbuf, stdout);
+	  payload_iter = line_end;
+	} while (payload_iter < payload_end);
+      }
+    }
+  } else {
+    fputs(payload, stdout);
+  }
+}
+
+pglerr_t disp_help(uint32_t param_ct, char** argv) {
+  // yes, this is overkill.  But it should be a good template for other
+  // command-line programs to use.
+  uint32_t param_ctl = BITCT_TO_WORDCT(param_ct);
+  pglerr_t reterr = kPglRetSuccess;
+  help_ctrl_t help_ctrl;
+  uint32_t arg_uidx;
+  uint32_t arg_idx;
+  uint32_t net_unmatched_ct;
+  int32_t col_num;
+  int32_t leading_dashes;
+  help_ctrl.iters_left = param_ct? 2 : 0;
+  help_ctrl.param_ct = param_ct;
+  help_ctrl.unmatched_ct = param_ct;
+  help_ctrl.param_slens = nullptr;
+  help_ctrl.all_match_arr = nullptr;
+  help_ctrl.argv = nullptr;
+  if (param_ct) {
+    if (pgl_malloc(param_ct * sizeof(int32_t), &help_ctrl.param_slens) ||
+	pgl_malloc(param_ctl * 3 * sizeof(intptr_t), &help_ctrl.all_match_arr)) {
+      goto disp_help_ret_NOMEM;
+    }
+    leading_dashes = 0;
+    for (arg_uidx = 0; arg_uidx < param_ct; arg_uidx++) {
+      if (argv[arg_uidx][0] == '-') {
+	leading_dashes = 1;
+	break;
+      }
+    }
+    if (leading_dashes) {
+      if (pgl_malloc(param_ct * sizeof(intptr_t), &help_ctrl.argv)) {
+	goto disp_help_ret_NOMEM;
+      }
+      for (arg_uidx = 0; arg_uidx < param_ct; arg_uidx++) {
+	if (argv[arg_uidx][0] == '-') {
+	  if (argv[arg_uidx][1] == '-') {
+	    help_ctrl.argv[arg_uidx] = &(argv[arg_uidx][2]);
+	  } else {
+	    help_ctrl.argv[arg_uidx] = &(argv[arg_uidx][1]);
+	  }
+	} else {
+	  help_ctrl.argv[arg_uidx] = argv[arg_uidx];
+	}
+      }
+    } else {
+      help_ctrl.argv = argv;
+    }
+    for (arg_idx = 0; arg_idx < param_ct; arg_idx++) {
+      help_ctrl.param_slens[arg_idx] = strlen(help_ctrl.argv[arg_idx]);
+    }
+    fill_ulong_zero(param_ctl * 3, help_ctrl.all_match_arr);
+    help_ctrl.prefix_match_arr = &(help_ctrl.all_match_arr[param_ctl]);
+    help_ctrl.perfect_match_arr = &(help_ctrl.all_match_arr[param_ctl * 2]);
+    help_ctrl.preprint_newline = 1;
+  } else {
+    help_ctrl.argv = nullptr;
+    fputs(
+"\nIn the command line flag definitions that follow,\n"
+"  * [square brackets] denote a required parameter, where the text between the\n"
+"    brackets describes its nature.\n"
+"  * <angle brackets> denote an optional modifier (or if '|' is present, a set\n"
+"    of mutually exclusive optional modifiers).  Use the EXACT text in the\n"
+"    definition.\n"
+"  * There's one exception to the angle brackets/exact text rule: when an angle\n"
+"    bracket term ends with '=[value]', '[value]' designates a variable\n"
+"    parameter.\n"
+"  * {curly braces} denote an optional parameter, where the text between the\n"
+"    braces describes its nature.\n"
+"  * An ellipsis (...) indicates that you may enter multiple parameters of the\n"
+"    specified type.\n"
+"  * A \"column set descriptor\" is either\n"
+"    1. a comma-separated sequence of column set names; this is interpreted as\n"
+"       the full list of column sets to include.\n"
+"    2. a comma-separated sequence of column set names, all preceded by '+' or\n"
+"       '-'; this is interpreted as a list of changes to the default.\n"
+, stdout);
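+    // Column set descriptor example (hypothetical values): --freq's default
+    // set is chrom,ref,alt,altfreq,nobs, so 'cols=+pos,-nobs' keeps that
+    // default but adds pos and drops nobs, while 'cols=chrom,pos,nobs'
+    // replaces the default list outright.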
+    fputs(g_cmdline_format_str, stdout);
+    fputs(
+"Most " PROG_NAME_STR " runs require exactly one main input fileset.  The following flags\n"
+"are available for defining its form and location:\n\n"
+, stdout);
+  }
+  do {
+    // explicit gzipped .pvar/.bim support was tried, and then rejected since
+    // decompression was too slow
+    // Zstd should have the necessary x86 performance characteristics, though
+    help_print("pfile\tpgen\tbfile\tbed", &help_ctrl, 1,
+"  --pfile [prefix] <vzs> : Specify .pgen + .pvar{.zst} + .psam prefix.\n"
+"  --pgen [filename]      : Specify full name of .pgen/.bed file.\n"
+	       );
+    help_print("pfile\tpgen\tpvar\tpsam\tbfile\tbed\tbim\tfam\timport-dosage\tdosage", &help_ctrl, 1,
+"  --pvar [filename]      : Specify full name of .pvar/.bim file.\n"
+"  --psam [filename]      : Specify full name of .psam/.fam file.\n\n"
+	       );
+    help_print("bfile\tbpfile\tbed\tbim\tfam", &help_ctrl, 1,
+"  --bfile  [prefix] <vzs> : Specify .bed + .bim{.zst} + .fam prefix.\n"
+"  --bpfile [prefix] <vzs> : Specify .pgen + .bim{.zst} + .fam prefix.\n\n"
+	       );
+    help_print("vcf\tbcf\tkeep-autoconv", &help_ctrl, 1,
+"  --keep-autoconv    : When importing non-PLINK-binary data, don't delete\n"
+"                       autogenerated binary fileset at end of run.\n\n"
+	       );
+    help_print("bfile\tfam", &help_ctrl, 1,
+"  --no-fid           : .fam file does not contain column 1 (family ID).\n"
+"  --no-parents       : .fam file does not contain columns 3-4 (parents).\n"
+"  --no-sex           : .fam file does not contain column 5 (sex).\n"
+"  --no-pheno         : .fam file does not contain column 6 (phenotype).\n\n"
+	       );
+    // todo: allele fraction import.  but first need to see how it's
+    // represented in practice, since it isn't in the spec...
+    help_print("vcf\tbcf\tpsam", &help_ctrl, 1,
+"  --vcf [filename] <dosage=[field]>\n"
+"  --bcf [filename] <dosage=[field]>  (not implemented yet)\n"
+"    Specify full name of .vcf{.gz|.zst} or BCF2 file to import.\n"
+"    * These can be used with --psam.\n"
+"    * By default, dosage information is not imported.  To import the GP field\n"
+"      (must be VCFv4.3-style 0..1, one probability per possible genotype), add\n"
+"      'dosage=GP'.  'dosage=DS' (or anything else) causes the named field to be\n"
+"      interpreted as a Minimac3-style dosage.\n\n"
+	       );
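+    // Illustrative import (hypothetical filenames): convert a dosage VCF,
+    // reading Minimac3-style dosages from the DS field:
+    //   plink2 --vcf imputed.vcf.gz dosage=DS --psam study.psam --out imputed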
+    help_print("data\tgen\tbgen\tsample\thaps\tlegend", &help_ctrl, 1,
+"  --data [filename prefix] <ref-first | ref-second> <gzs>\n"
+"  --bgen [filename] <snpid-chr> <ref-first | ref-second>\n"
+"  --gen [filename] <ref-first | ref-second>\n"
+"  --sample [filename]\n"
+"    Specify an Oxford-format dataset to import.  --data specifies a .gen{.zst}\n"
+"    + .sample pair, while --bgen specifies a BGEN v1.1+ file.\n"
+"    * If a BGEN v1.2+ file contains sample IDs, it may be imported without a\n"
+"      companion .sample file.\n"
+"    * With 'snpid-chr', chromosome codes are read from the 'SNP ID' field\n"
+"      instead of the usual chromosome field.\n"
+"    * By default, the second allele for each variant is treated as a\n"
+"      provisional reference allele.  To specify that the first (resp. second)\n"
+"      allele really is always reference, add the 'ref-first' (resp.\n"
+"      'ref-second') modifier.\n\n"
+	       );
+    // todo: make 'per' prefix modifiable
+    help_print("haps\tlegend", &help_ctrl, 1,
+"  --haps [filename] <ref-first | ref-second>\n"
+"  --legend [filename] [chr code]\n"
+"    Specify .haps {+ .legend} file(s) to import.\n"
+"    * When --legend is specified, it's assumed that the --haps file doesn't\n"
+"      contain header columns.\n"
+"    * On chrX, the second male column may contain dummy '-' entries.  (However,\n"
+"      PLINK currently cannot handle omitted male columns.)\n"
+"    * If not used with --sample, new sample IDs are of the form 'per#/per#'.\n\n"
+	       );
+    help_print("map\timport-dosage\tdosage", &help_ctrl, 1,
+"  --map [fname]      : Specify full name of .map file.\n"
+	       );
+    help_print("import-dosage\tdosage", &help_ctrl, 1,
+"  --import-dosage [allele dosage file] <noheader> <skip0=[i]> <skip1=[j]>\n"
+"                  <skip2=[k]> <dose1> <format=[m]> <ref-first | ref-second>\n"
+"                  <single-chr=[code]> <chr-col-num=[#]> <pos-col-num=[#]>\n"
+"    Specify PLINK 1.x-style dosage file to import.\n"
+"    * You must also specify a companion .psam/.fam file.\n"
+"    * By default, PLINK assumes that the file contains a header line, which has\n"
+"      'SNP' in (1-based) column i+1, 'A1' in column i+j+2, 'A2' in column\n"
+"      i+j+3, and sample FID/IIDs starting from column i+j+k+4.  (i/j/k are\n"
+"      normally zero, but can be changed with 'skip0', 'skip1', and 'skip2'\n"
+"      respectively.)  If such a header line is not present, use the 'noheader'\n"
+"      modifier; samples will then be assumed to appear in the same order as\n"
+"      they do in the .psam/.fam file.\n"
+"    * You may specify a companion .map file.  If you do not,\n"
+"      * 'single-chr=' can be used to specify that all variants are on the named\n"
+"        chromosome.  Otherwise, you can use 'chr-col-num=' to read chromosome\n"
+"        codes from the given (1-based) column number.\n"
+"      * 'pos-col-num=' causes bp coordinates to be read from the given column\n"
+"        number.\n"
+"    * The 'format' modifier lets you specify the number of values used to\n"
+"      represent each dosage.  'format=1' normally indicates a single 0..2 A1\n"
+"      expected count; 'dose1' modifies this to a 0..1 frequency.  'format=2'\n"
+"      (the default) indicates a 0..1 homozygous A1 likelihood followed by a\n"
+"      0..1 het likelihood, while 'format=3' indicates 0..1 hom A1, 0..1 het,\n"
+"      0..1 hom A2.\n\n"
+	       );
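+    // Worked header example: with the defaults i=j=k=0, 'SNP' is in column
+    // 1, 'A1' in column 2, 'A2' in column 3, and sample FID/IIDs start in
+    // column 4.  With skip1=1 (j=1), one ignored column sits between 'SNP'
+    // and 'A1', so 'A1'/'A2' shift to columns 3-4 and sample IDs start in
+    // column 5.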
+    // todo: triallelic rate
+    help_print("dummy", &help_ctrl, 1,
+"  --dummy [sample ct] [SNP ct] {missing dosage freq} {missing pheno freq}\n"
+"          <acgt | 1234 | 12> <pheno-ct=[count]> <scalar-pheno>\n"
+"          <dosage-freq=[rate]>\n"
+"    This generates a fake input dataset with the specified number of samples\n"
+"    and SNPs.\n"
+"    * By default, the missing dosage and phenotype frequencies are zero.\n"
+"      These can be changed by providing 3rd and 4th numeric parameters.\n"
+"    * By default, allele codes are As and Bs; this can be changed with the\n"
+"      'acgt', '1234', or '12' modifier.\n"
+"    * By default, one binary phenotype is generated.  'pheno-ct=' can be used\n"
+"      to change the number of phenotypes, and 'scalar-pheno' causes these\n"
+"      phenotypes to be normally distributed scalars.\n"
+"    * By default, all (nonmissing) dosages are in {0,1,2}.  To make some of\n"
+"      them take on decimal values, use 'dosage-freq='.  (These dosages are\n"
+"      affected by --hard-call-threshold and --dosage-erase-threshold.)\n\n"
+	       );
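+    // Example invocation (hypothetical parameters): 100 samples, 1000 SNPs,
+    // 1% missing dosages, ACGT allele codes, and two normally distributed
+    // phenotypes:
+    //   plink2 --dummy 100 1000 0.01 acgt pheno-ct=2 scalar-pheno --out fake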
+    if (!param_ct) {
+      fputs(
+"Output files have names of the form '" PROG_NAME_STR ".{extension}' by default.  You can\n"
+"change the '" PROG_NAME_STR "' prefix with\n\n"
+, stdout);
+    }
+    help_print("out", &help_ctrl, 1,
+"  --out [prefix]     : Specify prefix for output files.\n\n"
+	       );
+    if (!param_ct) {
+      fputs(
+"Most runs also require at least one of the following commands:\n\n"
+, stdout);
+    }
+    help_print("make-pgen\tmake-bpgen\tmake-bed\tmake-just-pvar\tmake-just-psam", &help_ctrl, 1,
+"  --make-pgen <vzs> <format=[code]> <trim-alts>\n"
+"              <erase-phase> <erase-dosage>\n"
+"              <pvar-cols=[col set descriptor]> <psam-cols=[col set descriptor]>\n"
+"  --make-bpgen <vzs> <format=[code]> <trim-alts>\n"
+"               <erase-phase> <erase-dosage>\n"
+"  --make-bed <vzs> <trim-alts>\n"
+	       /*
+"  --make-pgen <vzs> <format=[code]> <multiallelics=[mode]> <trim-alts>\n"
+"              <erase-alt2+> <erase-phase> <erase-dosage>\n"
+"              <pvar-cols=[col set descriptor]> <psam-cols=[col set descriptor]>\n"
+"  --make-bpgen <vzs> <format=[code]> <multiallelics=[mode]> <trim-alts>\n"
+"               <erase-alt2+> <erase-phase> <erase-dosage>\n"
+"  --make-bed <vzs> <multiallelics=[split mode]> <trim-alts>\n"
+	       */
+"    Create a new PLINK binary fileset (--make-pgen = .pgen + .pvar{.zst} +\n"
+"    .psam, --make-bpgen = .pgen + .bim{.zst} + .fam).\n"
+"    * Unlike the automatic text-to-binary converters (which only heed\n"
+"      chromosome filters), this supports all of " PROG_NAME_STR "'s filtering flags.\n"
+"    * The 'vzs' modifier causes the variant file (.pvar/.bim) to be\n"
+"      Zstd-compressed.\n"
+"    * The 'format' modifier requests an uncompressed fixed-variant-width .pgen\n"
+"      file.  (These do not directly support multiallelic variants.)  The\n"
+"      following format code is currently supported:\n"
+"        2: just like .bed, except with an extended (12-byte instead of 3-byte)\n"
+"           header containing variant/sample counts, and rotated genotype codes\n"
+"           (00 = hom ref, 01 = het, 10 = hom alt, 11 = missing).\n"
+	       /*
+"        3: unphased dosage data\n"
+"        4: phased dosage data\n"
+	       */
+    // Commented out since, while this is on the roadmap, it isn't implemented
+    // yet.  (This also applies to other commented-out help text.)
+	       /*
+"    * The 'multiallelics' modifier (alias: 'm') specifies a merge or split\n"
+"      mode.  The following modes are currently supported (well, not yet):\n"
+"      * '-': Split all multiallelic records.\n"
+"      * '-snps': Split SNP-only multiallelic records.\n"
+"      * '+'/'+both': Adjacent variants with identical CHROM/POS/REF are\n"
+"                     classified as SNPs and non-SNPs; SNPs are merged into one\n"
+"                     variant, and non-SNPs are merged into another.\n"
+"      * '+snps': Similar to '+both', except only SNPs are merged.\n"
+"      * '+any': All adjacent biallelic variants with identical CHROM/POS/REF\n"
+"                are merged into a single multiallelic variant.\n"
+"      If a variant ID template was specified with --set-[missing/all]-var-ids,\n"
+"      it is applied to the newly created variants.  Otherwise, the ID is set to\n"
+"      the missing value.\n"
+"      When merging, the new variant gets the lowest QUAL and the union of the\n"
+"      FILTER values.\n"
+"      INFO splitting/merging and left-alignment and normalization of indels are\n"
+"      not currently supported.  'bcftools norm' (possibly on a single-sample\n"
+"      file) can be used for this.\n"
+"    * The 'trim-alts' modifier causes alternate alleles not present in the\n"
+"      dataset after filtering to be removed.\n"
+"    * The 'erase-alt2+' modifier causes alt alleles past the first to be\n"
+"      removed; affected genotypes are set to missing.  (trim-alts happens\n"
+"      first.)\n"
+	       */
+"    * The 'erase-phase' and 'erase-dosage' modifiers prevent phase and dosage\n"
+"      information from being written to the new .pgen.\n"
+	       /*
+"    * When the 'multiallelics=', 'trim-alts', and/or 'erase-...' modifier is\n"
+"      present, --make-bed/--make-{b}pgen cannot be combined with other\n"
+"      commands.  (They can be combined with other filters.)\n"
+	       */
+"    * The first five columns of a .pvar file are always #CHROM/POS/ID/REF/ALT.\n"
+"      Supported optional .pvar column sets are:\n"
+"        xheader: All ## header lines (yeah, this is technically not a column).\n"
+"                 Without this, only the #CHROM header line is kept.\n"
+"        maybequal: QUAL.  Omitted if all loaded values are missing.\n"
+"        qual: Force QUAL column to be written even when empty.\n"
+"        maybefilter: FILTER.  Omitted if all loaded values are missing.\n"
+"        filter: Force FILTER column to be written even when empty.\n"
+"        maybeinfo: INFO.  Omitted if all loaded values are missing, or if\n"
+"                   INFO:PR is the only subfield.\n"
+"        info: Force INFO column to be written.\n"
+"        maybecm: Centimorgan coordinate.  Omitted if all loaded values are 0.\n"
+"        cm: Force CM column to be written even when empty.\n"
+"      The default is xheader,maybequal,maybefilter,maybeinfo,maybecm.\n"
+"    * The first two columns of a .psam file are always #FID/IID.  Supported\n"
+"      optional .psam column sets are:\n"
+"        maybesid: Sample disambiguation ID (useful when multiple samples are\n"
+"                  collected from a single organism), '0' = missing.  Omitted if\n"
+"                  all loaded values are missing.\n"
+"        sid: Force SID column to be written even when empty.\n"
+"        maybeparents: Father and mother IIDs, '0' = missing.  Omitted if all\n"
+"                      loaded values are missing.\n"
+"        parents: Force PAT and MAT columns to be written even when empty.\n"
+"        sex: '1'/'M'/'m' = male, '2'/'F'/'f' = female, 'NA'/'0' = missing.\n"
+"        pheno1: First active phenotype.  If none, all column entries are set to\n"
+"                the --output-missing-phenotype string.\n"
+"        phenos: All active phenotypes, if any.  (Can be combined with pheno1 to\n"
+"                force at least one phenotype column to be written.)\n"
+"      The default is maybesid,maybeparents,sex,phenos.\n\n"
+	       );
+    help_print("make-just-pvar\tmake-just-psam\tmake-just-bim\tmake-just-fam\twrite-cluster", &help_ctrl, 1,
+"  --make-just-pvar <zs> <cols=[column set descriptor]>\n"
+"  --make-just-psam <cols=[column set descriptor]>\n"
+"  --make-just-bim <zs>\n"
+"  --make-just-fam\n"
+"    Variants of --make-pgen/--make-bed which only write a new variant or sample\n"
+"    file.  These don't always require an input genotype file.\n"
+"    USE THESE CAUTIOUSLY.  It is very easy to desynchronize your binary\n"
+"    genotype data and your sample/variant indexes if you use these commands\n"
+"    improperly.  If you have any doubt, stick with --make-{b}pgen/--make-bed.\n\n"
+	       );
+    help_print("export\trecode", &help_ctrl, 1,
+"  --export [output format(s)...] <01 | 12> <bgz> <id-delim=[char]>\n"
+"    <id-paste=[column set descriptor]> <include-alt> <omit-nonmale-y> <spaces>\n"
+"    <vcf-dosage=[field]> <ref-first> <bits=[#]>\n"
+"    Create a new fileset with all filters applied.  The following output\n"
+"    formats are supported:\n"
+"    (actually, only A-transpose, bgen-1.1, ind-major-bed, haps, hapslegend,\n"
+"    oxford, and vcf are implemented for now)\n"
+"    * '23': 23andMe 4-column format.  This can only be used on a single\n"
+"            sample's data (--keep may be handy), and does not support\n"
+"            multicharacter allele codes.\n"
+"    * 'A': Sample-major additive (0/1/2) coding, suitable for loading from R.\n"
+"           If you need uncounted alleles to be named in the header line, add\n"
+"           the 'include-alt' modifier.\n"
+"    * 'AD': Sample-major additive (0/1/2) + dominant (het=1/hom=0) coding.\n"
+"            Also supports 'include-alt'.\n"
+"    * 'A-transpose': Variant-major 0/1/2.\n"
+"    * 'beagle': Unphased per-autosome .dat and .map files, readable by early\n"
+"                BEAGLE versions.\n"
+"    * 'beagle-nomap': Single .beagle.dat file.\n"
+"    * 'bgen-1.x': Oxford-format .bgen + .sample.  For v1.2/v1.3, sample\n"
+"                  identifiers are stored in the .bgen (with id-delim and\n"
+"                  id-paste settings applied), and default precision is 16-bit\n"
+"                  (use the 'bits' modifier to change this).\n"
+"    * 'bimbam': Regular BIMBAM format.\n"
+"    * 'bimbam-1chr': BIMBAM format, with a two-column .pos.txt file.  Does not\n"
+"                     support multiple chromosomes.\n"
+"    * 'fastphase': Per-chromosome fastPHASE files, with\n"
+"                   .chr-[chr #].phase.inp filename extensions.\n"
+"    * 'fastphase-1chr': Single .phase.inp file.  Does not support\n"
+"                        multiple chromosomes.\n"
+"    * 'haps', 'hapslegend': Oxford-format .haps + .sample{ + .legend}.  All\n"
+"                            data must be biallelic and phased.\n"
+"    * 'HV': Per-chromosome Haploview files, with .chr-[chr #][.ped + .info]\n"
+"            filename extensions.\n"
+"    * 'HV-1chr': Single Haploview .ped + .info file pair.  Does not support\n"
+"                 multiple chromosomes.\n"
+"    * 'ind-major-bed': PLINK 1 sample-major .bed (+ .bim + .fam).\n"
+"    * 'lgen': PLINK 1 long-format (.lgen + .fam + .map), loadable with --lfile.\n"
+"    * 'lgen-ref': .lgen + .fam + .map + .ref, loadable with --lfile +\n"
+"                  --reference.\n"
+"    * 'list': Single genotype-based list, up to 4 lines per variant.  To omit\n"
+"              nonmale genotypes on the Y chromosome, add the 'omit-nonmale-y'\n"
+"              modifier.\n"
+"    * 'rlist': .rlist + .fam + .map fileset, where the .rlist file is a\n"
+"                genotype-based list which omits the most common genotype for\n"
+"                each variant.  Also supports 'omit-nonmale-y'.\n"
+"    * 'oxford': Oxford-format .gen + .sample.\n"
+"    * 'ped': PLINK 1 sample-major (.ped + .map), loadable with --file.\n"
+"    * 'compound-genotypes': Same as 'ped', except that the space between each\n"
+"                            pair of same-variant allele codes is removed.\n"
+"    * 'structure': Structure-format.\n"
+"    * 'transpose': PLINK 1 variant-major (.tped + .tfam), loadable with\n"
+"                   --tfile.\n"
+"    * 'vcf': VCFv4.3.  If PAR1 and PAR2 are present, they are automatically\n"
+"             merged with chrX, with proper handling of chromosome codes and\n"
+"             male ploidy.  If the 'bgz' modifier is added, the VCF file is\n"
+"             block-gzipped.\n"
+"             The 'id-paste' modifier controls which .psam columns are used to\n"
+"             construct sample IDs (choices are fid, iid, maybesid, and sid;\n"
+"             default is fid,iid,maybesid), while the 'id-delim' modifier sets\n"
+"             the character between the ID pieces (default '_').\n"
+"             By default, dosages are not exported; use 'vcf-dosage=GP' to\n"
+"             export them as genotype posterior probabilities, or\n"
+"             'vcf-dosage=DS' to export Minimac3-style dosages.\n"
+	       // possible todo: pedigree output?
+"    In addition,\n"
+"    * When the output format only supports biallelic variants, multiallelic\n"
+"      variants are downcoded to ref/alt1, not split.\n"
+	       // todo: implement CPRA <-> CPR
+"    * The '12' modifier causes alt1 alleles to be coded as '1' and ref alleles\n"
+"      to be coded as '2', while '01' maps alt1 -> 0 and ref -> 1.\n"
+"    * The 'spaces' modifier makes the output space-delimited instead of\n"
+"      tab-delimited, whenever both are permitted.\n"
+"    * For biallelic formats where it's unspecified whether the reference/major\n"
+"      allele should appear first or second, --export defaults to second for\n"
+"      compatibility with PLINK 1.9.  Use 'ref-first' to change this.\n\n"
+	       );
+    
+    // don't bother with case/control or cluster-stratification any more, since
+    // user can loop through subgroups and then use Unix cut/paste
+
+    // todo: add optional columns for computed MAF (nothing here quite
+    // corresponds to nonmajor_freqs when e.g. --maf-succ was specified) and
+    // machr2 (this is probably the best home since, unlike --geno-counts and
+    // --hardy, it's dosage-aware).
+    help_print("freq\tmach-r2-filter", &help_ctrl, 1,
+"  --freq <zs> <counts> <cols=[column set descriptor]> <bins-only>\n"
+"         <refbins=[comma-separated bin boundaries] | refbins-file=[filename]>\n"
+"         <alt1bins=[comma-separated bin boundaries] | alt1bins-file=[filename]>\n"
+"    Empirical allele frequency report.  By default, only founders are\n"
+"    considered.  Dosages are taken into account (e.g. heterozygous haploid\n"
+"    calls count as 0.5).  chrM dosages are scaled to sum to 2.\n"
+"    Supported column sets are:\n"
+"      chrom: Chromosome ID.\n"
+"      pos: Base-pair coordinate.\n"
+"      (ID is always present, and positioned here.)\n"
+"      ref: Reference allele.\n"
+"      alt1: Alternate allele 1.\n"
+"      alt: All alternate alleles, comma-separated.\n"
+"      reffreq: Reference allele frequency/dosage.\n"
+"      alt1freq: Alt1 frequency/dosage.\n"
+"      altfreq: Comma-separated frequencies/dosages for all alternate alleles.\n"
+"      freq: Similar to altfreq, except ref is also included at the start.\n"
+"      eq: Comma-separated [allele]=[freq] for all present alleles.  (If no\n"
+"          alleles are present, the column contains a single '.'.)\n"
+"      eqz: Same as eq, except zero-counts are included.\n"
+"      alteq/alteqz: Same as eq/eqz, except reference allele is omitted.\n"
+"      numeq: 0=[freq],1=[freq], etc.  Zero-counts are omitted.\n"
+"      altnumeq: Same as numeq, except reference allele is omitted.\n"
+"      machr2: Empirical divided by theoretical variance quality metric.\n"
+"      nobs: Number of allele observations.\n"
+"    The default is chrom,ref,alt,altfreq,nobs.\n"
+"    Additional .afreq.{ref,alt1}.bins (or .acount.{ref,alt1}.bins with\n"
+"    'counts') file(s) are generated when 'refbins='/'refbins-file=' or\n"
+"    'alt1bins='/'alt1bins-file=' is present; these report the total number of\n"
+"    frequencies or counts in each left-closed, right-open interval.  (If you\n"
+"    only want these histogram(s), and not the main report, add 'bins-only'.)\n\n"
+	       );
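+    // Binning illustration (hypothetical boundaries): 'refbins=0.1,0.5'
+    // requests three bins (ref frequency in [0, 0.1), in [0.1, 0.5), and
+    // 0.5 or above), with counts written to an .afreq.ref.bins file.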
+    // this can't really handle dosages, so we specify "hardcall"
+    help_print("geno-counts\tfreq\tfreqx\tfrqx", &help_ctrl, 1,
+"  --geno-counts <zs> <cols=[column set descriptor]>\n"
+"    Hardcall genotype count report (considering both alleles simultaneously in\n"
+"    the diploid case).  Nonfounders are now included; use --keep-founders if\n"
+"    this is a problem.  Heterozygous haploid calls are treated as missing.\n"
+"    Supported column sets are:\n"
+"      chrom: Chromosome ID.\n"
+"      pos: Base-pair coordinate.\n"
+"      (ID is always present, and positioned here.)\n"
+"      ref: Reference allele.\n"
+"      alt1: Alternate allele 1.\n"
+"      alt: All alternate alleles, comma-separated.\n"
+"      homref: Homozygous-ref count.\n"
+"      refalt1: Heterozygous ref-alt1 count.\n"
+"      refalt: Comma-separated het ref-altx counts.\n"
+"      homalt1: Homozygous-alt1 count.\n"
+"      altxy: Comma-separated altx-alty counts, in (1/1)-(1/2)-(2/2)-(1/3)-...\n"
+"             order.\n"
+"      xy: Similar to altxy, except the reference allele is treated as alt0,\n"
+"          and the sequence starts (0/0)-(0/1)-(1/1)-(0/2)-...\n"
+"      hapref: Haploid-ref count.\n"
+"      hapalt1: Haploid-alt1 count.\n"
+"      hapalt: Comma-separated haploid-altx counts.\n"
+"      hap: Similar to hapalt, except ref is also included at the start.\n"
+"      numeq: 0/0=[hom ref ct],0/1=[het ref-alt1],1/1=[hom alt1],...,0=[hap ref]\n"
+"             etc.  Zero-counts are omitted.  (If all genotypes are missing, the\n"
+"             column contains a single '.'.)\n"
+"      missing: Number of missing genotypes.\n"
+"      nobs: Number of (nonmissing) genotype observations.\n"
+"    The default is chrom,ref,alt,homref,refalt,altxy,hapref,hapalt,missing.\n\n"
+	       );
+    // todo: add cluster-stratification
+    help_print("missing", &help_ctrl, 1,
+"  --missing <zs> <sample-only | variant-only> <scols=[column set descriptor]>\n"
+"            <vcols=[column set descriptor]>\n"
+"    Generate sample- and variant-based missing data reports (or just one report\n"
+"    if 'sample-only'/'variant-only' is specified).\n"
+"    Supported column sets in the sample-based report are:\n"
+"      (FID and IID are always present, and positioned here.)\n"
+"      maybesid: SID, if at least one nonmissing value is present.\n"
+"      sid: Force SID column to be written even when empty.\n"
+"      misspheno1: First active phenotype missing (Y/N)?  Always 'Y' if no\n"
+"                  phenotypes are loaded.\n"
+"      missphenos: A Y/N column for each loaded phenotype.  (Can be combined\n"
+"                  with misspheno1 to force at least one such column.)\n"
+"      nmissdosage: Number of missing dosages.\n"
+"      nmiss: Number of missing hardcalls, not counting het haploids.\n"
+"      nmisshh: Number of missing hardcalls, counting het haploids.\n"
+"      hethap: Number of heterozygous haploid hardcalls.\n"
+"      nobs: Denominator (male count on chrY, otherwise total sample count).\n"
+"      fmissdosage: Missing dosage rate.\n"
+"      fmiss: Missing hardcall rate, not counting het haploids.\n"
+"      fmisshh: Missing hardcall rate, counting het haploids.\n"
+"    The default is maybesid,missphenos,nmiss,nobs,fmiss.\n"
+"    Supported column sets in the variant-based report are:\n"
+"      chrom: Chromosome ID.\n"
+"      pos: Base-pair coordinate.\n"
+"      (ID is always present, and positioned here.)\n"
+"      ref: Reference allele.\n"
+"      alt1: Alternate allele 1.\n"
+"      alt: All alternate alleles, comma-separated.\n"
+"      nmissdosage: Number of missing dosages.\n"
+"      nmiss: Number of missing hardcalls, not counting het haploids.\n"
+"      nmisshh: Number of missing hardcalls, counting het haploids.\n"
+"      hethap: Number of heterozygous haploid calls.\n"
+"      nobs: Number of potentially valid calls.\n"
+"      fmissdosage: Missing dosage rate.\n"
+"      fmiss: Missing hardcall rate, not counting het haploids.\n"
+"      fmisshh: Missing hardcall rate, counting het haploids.\n"
+"      fhethap: Heterozygous haploid rate.\n"
+"    The default is chrom,nmiss,nobs,fmiss.\n\n"
+	       );
+    help_print("hardy", &help_ctrl, 1,
+"  --hardy <zs> <midp> <cols=[column set descriptor]>\n"
+"    Hardy-Weinberg exact test p-value report(s).\n"
+"    * For multiallelic variants, the test is based on the reference allele.\n"
+"    * By default, only founders are considered; change this with --nonfounders.\n"
+"    * chrX is now omitted from the main {output prefix}.hardy report.  Instead,\n"
+"      (if present) it gets its own {output prefix}.hardy.x report based on the\n"
+"      method described in Graffelman J, Weir BS (2016) Hardy-Weinberg\n"
+"      equilibrium and the X chromosome.\n"
+"    * There is currently no special handling of case/control phenotypes.\n"
+"    Supported column sets are:\n"
+"      chrom: Chromosome ID.\n"
+"      pos: Base-pair coordinate.\n"
+"      (ID is always present, and positioned here.)\n"
+"      ref: Reference allele.\n"
+"      alt1: Alternate allele 1.\n"
+"      alt: All alternate alleles, comma-separated.\n"
+"      gcounts: Hom-ref count, total number of ref-altx heterozygous calls, and\n"
+"               total number of nonmissing calls with no reference allele.  On\n"
+"               chrX, these are followed by male ref and male alt counts.\n"
+"      gcount1col: gcounts values in a single comma-separated column.\n"
+"      hetfreq: Observed and expected heterozygote frequencies.\n"
+"      sexaf: Female and male ref allele frequencies (chrX only).\n"
+"      femalep: Female-only p/midp-value (chrX only).\n"
+"      p: Hardy-Weinberg equilibrium exact test p/midp-value.\n"
+"    The default is chrom,ref,alt,gcounts,hetfreq,sexaf,p.\n\n"
+	       );
+    help_print("indep\tindep-pairwise", &help_ctrl, 1,
+"  --indep-pairwise [window size]<kb> {step size (variant ct)} [r^2 threshold]\n"
+"    Generate a list of variants in approximate linkage equilibrium.  With the\n"
+"    'kb' modifier, the window size is in kilobase instead of variant count\n"
+"    units.  (Pre-'kb' space is optional, i.e. '--indep-pairwise 500 kb 0.5'\n"
+"    and '--indep-pairwise 500kb 0.5' have the same effect.)\n"
+"    The step size now defaults to 1 if it's unspecified, and *must* be 1 if the\n"
+"    window is in kilobase units.\n"
+"    Note that you need to rerun " PROG_NAME_STR " using --extract or --exclude on the\n"
+"    .prune.in/.prune.out file to apply the list to another computation.\n\n"
+	       );
+    // todo: replace --indep-pairphase with method which takes fully phased
+    // haplotypes as input (unphased het treated as missing).
+    
+    // for kinship estimation, LD pruning isn't really advisable (if more speed
+    // is needed, the humble --bp-space may lead to a better approximation; and
+    // in practice speed isn't an issue any more with --make-king)
+    help_print("make-king\tmake-king-table", &help_ctrl, 1,
+"  --make-king <square | square0 | triangle> <zs | bin | bin4>\n"
+"    KING-robust kinship estimator, described by Manichaikul A, Mychaleckyj JC,\n"
+"    Rich SS, Daly K, Sale M, Chen WM (2010) Robust relationship inference in\n"
+"    genome-wide association studies.  By default, this writes a\n"
+"    lower-triangular tab-delimited table of kinship coefficients to\n"
+"    {output prefix}.king, and a list of the corresponding sample IDs to\n"
+"    {output prefix}.king.id.  The first row of the .king file contains a single\n"
+"    [genome 1-genome 2] kinship coefficient, the second row has the\n"
+"    [genome 1-genome 3] and [genome 2-genome 3] kinship values in that order,\n"
+"    etc.\n"
+"    * Only autosomes are currently considered.\n"
+"    * Pedigree information is currently ignored; the between-family estimator\n"
+"      is used for all pairs.\n"
+"    * If the 'square' or 'square0' modifier is present, a square matrix is\n"
+"      written instead; 'square0' fills the upper right triangle with zeroes.\n"
+"    * If the 'zs' modifier is present, the .king file is Zstd-compressed.\n"
+"    * If the 'bin' modifier is present, a binary (square) matrix of\n"
+"      double-precision floating point values, suitable for loading from R, is\n"
+"      instead written to {output prefix}.king.bin.  ('bin4' specifies\n"
+"      single-precision numbers instead.)  This can be combined with 'square0'\n"
+"      if you still want the upper right zeroed out, or 'triangle' if you don't\n"
+"      want to pad the upper right at all.\n"
+"    * The computation can be subdivided with --parallel.\n"
+"  --make-king-table <zs> <counts> <cols=[column set descriptor]>\n"
+"    Similar to --make-king, except results are reported in the original .kin0\n"
+"    text table format (with minor changes, e.g. row order is more friendly to\n"
+"    incremental addition of samples), and --king-table-filter can be used to\n"
+"    restrict the report to high kinship values.\n"
+"    Supported column sets are:\n"
+"      (FID and IID are always present, and positioned here.)\n"
+"      maybesid: SID, if at least one nonmissing value is present.\n"
+"      sid: Force SID column to be written even when empty.\n"
+"      misspheno1: First active phenotype missing (Y/N)?  Always 'Y' if no\n"
+"                  phenotypes are loaded.\n"
+"      missphenos: A Y/N column for each loaded phenotype.  (Can be combined\n"
+"                  with misspheno1 to force at least one such column.)\n"
+"      id: FID1/ID1/FID2/ID2.\n"
+"      maybesid: SID1/SID2, if at least one value is nonmissing.  Must be used\n"
+"                with 'id'.\n"
+"      sid: Force SID1/SID2 even when all values are missing.\n"
+"      nsnp: Number of variants considered (autosomal, neither call missing).\n"
+"      hethet: Proportion/count of considered call pairs which are het-het.\n"
+"      ibs0: Proportion/count of considered call pairs which are opposite homs.\n"
+"      ibs1: HET1_HOM2 and HET2_HOM1 proportions/counts.\n"
+"      kinship: KING-robust between-family kinship estimator.\n"
+"    The default is id,maybesid,nsnp,hethet,ibs0,kinship.  hethet/ibs0/ibs1\n"
+"    values are proportions unless the 'counts' modifier is present.  If id is\n"
+"    omitted, a .kin0.id file is also written.\n\n"
+	       );
+    help_print("make-rel\tmake-grm\tmake-grm-bin\tmake-grm-gz", &help_ctrl, 1,
+"  --make-rel <cov> <meanimpute> <square | square0 | triangle> <zs | bin | bin4>\n"
+"    Write a lower-triangular variance-standardized relationship matrix to\n"
+"    {output prefix}.rel, and corresponding IDs to {output prefix}.rel.id.\n"
+"    * It is usually best to perform this calculation on a variant set in\n"
+"      approximate linkage equilibrium, with no very-low-MAF variants.\n"
+"    * The 'cov' modifier removes the variance standardization step, causing a\n"
+"      covariance matrix to be calculated instead.\n"
+"    * The computation can be subdivided with --parallel.\n"
+"  --make-grm-gz <cov> <meanimpute> <no-gz | zs>\n"
+"  --make-grm-bin <cov> <meanimpute>\n"
+"    --make-grm-gz causes the relationships to be written to GCTA's original\n"
+"    gzipped list format, which describes one pair per line, while\n"
+"    --make-grm-bin writes them in GCTA 1.1+'s single-precision triangular\n"
+"    binary format.  Note that these formats explicitly report the number of\n"
+"    valid observations (where neither sample has a missing call) for each pair,\n"
+"    which is useful input for some scripts.\n\n"
+	       );
+#ifndef NOLAPACK
+    // GRM, PCA, etc. based on major vs. nonmajor alleles
+    // possible todo: have an 'approx2' mode which implements the flashpca 2.0
+    //   algorithm, which does not require memory quadratic in the # of PCs
+    help_print("pca", &help_ctrl, 1,
+"  --pca {count} <approx | meanimpute> <sid>\n"
+"  --pca var-wts {count} <approx | meanimpute> <sid> <vzs>\n"
+"                <vcols=[col set descriptor]>\n"
+"    Extracts top principal components from the variance-standardized\n"
+"    relationship matrix.\n"
+"    * It is usually best to perform this calculation on a variant set in\n"
+"      approximate linkage equilibrium, with no very-low-MAF variants.\n"
+"    * By default, 10 PCs are extracted; you can adjust this by passing a\n"
+"      numeric parameter.  (Note that 10 is lower than the PLINK 1.9 default of\n"
+"      20; this is due to the randomized algorithm's memory footprint growing\n"
+"      quadratically w.r.t. the PC count.)\n"
+"    * The 'approx' modifier causes the standard deterministic computation to be\n"
+"      replaced with the randomized algorithm originally implemented for\n"
+"      Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ,\n"
+"      Price AL (2016) Fast Principal-Component Analysis Reveals Convergent\n"
+"      Evolution of ADH1B in Europe and East Asia.  This can be a good idea when\n"
+"      you have >5k samples.\n"
+"    * The randomized algorithm always uses mean imputation for missing genotype\n"
+"      calls.  For comparison purposes, you can use the 'meanimpute' modifier to\n"
+"      request this behavior for the standard computation.\n"
+"    * The 'var-wts' modifier requests an additional .eigenvec.var file with PCs\n"
+"      expressed as variant weights instead of sample weights.  When it's\n"
+"      present, 'vzs' causes the .eigenvec.var file to be Zstd-compressed.\n"
+"      'vcols' can be used to customize the report columns; supported column\n"
+"      sets are:\n"
+"        chrom: Chromosome ID.\n"
+"        pos: Base-pair coordinate.\n"
+"        (ID is always present, and positioned here.)\n"
+"        ref: Reference allele.\n"
+"        alt1: Alternate allele 1.\n"
+"        alt: All alternate alleles, comma-separated.\n"
+"        maj: Major allele.\n"
+"        nonmaj: All nonmajor alleles, comma-separated.\n"
+"        (PCs are always present, and positioned here.  Signs are w.r.t. the\n"
+"        major, not necessarily reference, allele.)\n"
+"      Default is chrom,maj,nonmaj.\n\n"
+	       );
+#endif
+    help_print("king-cutoff\tmake-king\tmake-king-table\trel-cutoff\tgrm-cutoff", &help_ctrl, 1,
+"  --king-cutoff {.king.bin + .king.id fileset prefix} [threshold]\n"
+"    Exclude one member of each pair of samples with KING-robust kinship greater\n"
+"    than the given threshold.  Remaining/excluded sample IDs are written to\n"
+"    {output prefix}.king.cutoff.in + .king.cutoff.out.\n"
+"    If present, the .king.bin file must be triangular (either precision is ok).\n\n"
+	       );
+    help_print("write-covar\twith-phenotype", &help_ctrl, 1,
+"  --write-covar <cols=[column set descriptor]>\n"
+"    If covariates are defined, an updated version (with all filters applied) is\n"
+"    automatically written to {output prefix}.cov whenever --make-pgen,\n"
+"    --make-just-psam, --export, or a similar command is present.  However, if\n"
+"    you do not wish to simultaneously generate a new sample file, you can use\n"
+"    --write-covar to just produce a pruned covariate file.\n"
+"    Supported column sets are:\n"
+"      maybesid: SID, if at least one nonmissing value is present.\n"
+"      sid: Force SID column to be written even when empty.\n"
+"      maybeparents: Father and mother IIDs, '0' = missing.  Omitted if all\n"
+"                    loaded values are missing.\n"
+"      parents: Force PAT and MAT columns to be written even when empty.\n"
+"      sex: '1'/'M'/'m' = male, '2'/'F'/'f' = female, 'NA'/'0' = missing.\n"
+"      pheno1: First active phenotype.  If none, all column entries are set to\n"
+"              the --output-missing-phenotype string.\n"
+"      phenos: All active phenotypes, if any.  (Can be combined with pheno1 to\n"
+"              force at least one phenotype column to be written.)\n"
+"      (Covariates are always present, and positioned here.)\n"
+"    The default is just maybesid.\n\n"
+	       );
+    help_print("write-snplist", &help_ctrl, 1,
+"  --write-snplist <zs>\n"
+"    List all variants which pass your filters/inclusion thresholds.\n\n"
+	       );
+    help_print("glm\tlinear\tlogistic\tassoc", &help_ctrl, 1,
+"  --glm <zs> <sex | no-x-sex> <genotypic | hethom | dominant | recessive>\n"
+"        <interaction> <hide-covar> <intercept> <firth-fallback | firth>\n"
+"        <cols=[col set descriptor]> <local-covar=[f]> <local-pvar=[f]>\n"
+"        <local-psam=[f]> <local-omit-last | local-cats=[category ct]>\n"
+	       // "        <perm | mperm=[value]> <perm-count>\n"
+"    Basic association analysis on quantitative and/or case/control phenotypes.\n"
+"    For each variant, a linear (for quantitative traits) or logistic (for\n"
+"    case/control) regression is run with the phenotype as the dependent\n"
+"    variable, and alt dosage and a constant-1 column as predictors.\n"
+"    * For multiallelic variants, the total alt1 + alt2 + ... dosage is used.\n"
+"    * By default, sex (male = 1, female = 2; note that this is a change from\n"
+"      PLINK 1.x) is automatically added as a predictor for X chromosome\n"
+"      variants, and no others.  The 'sex' modifier causes it to be added\n"
+"      everywhere (except chrY), while 'no-x-sex' excludes it entirely.\n"
+"    * The 'genotypic' modifier adds an additive effect/dominance deviation 2df\n"
+"      joint test (0-2 and 0..1..0 coding), while 'hethom' uses 0..0..1 and\n"
+"      0..1..0 coding instead.\n"
+	       /*
+"  If permutation is also requested, these\n"
+"      modifiers cause permutation to be based on the joint test.\n"
+	       */
+"    * 'dominant' and 'recessive' specify a model assuming full dominance or\n"
+"      recessiveness, respectively, for the ref allele.  I.e. the genotype\n"
+"      column is recoded as 0..1..1 or 0..0..1, respectively.\n"
+"    * 'interaction' adds genotype x covariate interactions to the model.\n"
+	       /*
+"  This\n"
+"      cannot be combined with the usual permutation tests; use --tests to\n"
+"      define the permutation test statistic instead.\n"
+	       */
+"    * Additional predictors can be added with --covar.  By default, association\n"
+"      statistics are reported for all nonconstant predictors; 'hide-covar'\n"
+"      suppresses covariate-only results, while 'intercept' causes intercepts\n"
+"      to be reported.\n"
+"    * For logistic regression, when the phenotype {quasi-}separates the\n"
+"      genotype, an NA result will normally be reported.  To fall back on Firth\n"
+"      logistic regression instead when the basic logistic regression fails to\n"
+"      converge, add the 'firth-fallback' modifier.  To eliminate the special\n"
+"      case and use Firth logistic regression everywhere, add 'firth'.\n"
+"    * To add covariates which are not constant across all variants, add the\n"
+"      'local-covar=', 'local-pvar=', and 'local-psam=' modifiers, and use full\n"
+"      filenames for each.\n"
+"      Normally, the local-covar file should have c * n real-valued columns,\n""      where the first c columns correspond to the first sample in the\n"
+"      local-psam file, columns (c+1) to 2c correspond to the second sample,\n"
+"      etc.; and the mth line correspond to the mth nonheader line of the\n"
+"      local-pvar file.  (Variants outside of the local-pvar file are excluded\n"
+"      from the regression.)  The local covariates are assigned the names\n"
+"      LOCAL1, LOCAL2, etc.; to exclude the last local covariate from the\n"
+"      regression (necessary if they are e.g. local ancestry coefficients which\n"
+"      sum to 1), add 'local-omit-last'.\n"
+"      Alternatively, with 'local-cats=[k]', the local-covar file is expected to\n"
+"      have n columns with integer-valued entries in [1, k].  These category\n"
+"      assignments are expanded into (k-1) local covariates in the usual manner.\n"
+	       /*
+"    * 'perm' normally causes an adaptive permutation test to be performed on\n"
+"      the main effect, while 'mperm=[value]' starts a max(T) permutation test.\n"
+"    * 'perm-count' causes the permutation test report to include counts instead\n"
+"      of frequencies.\n"
+	       */
+// May want to change or leave out set-based test; punt for now.
+"    The main report supports the following column sets:\n"
+"      chrom: Chromosome ID.\n"
+"      pos: Base-pair coordinate.\n"
+"      (ID is always present, and positioned here.)\n"
+"      ref: Reference allele.\n"
+"      alt1: Alternate allele 1.\n"
+"      alt: All alternate alleles, comma-separated.\n"
+"      altcount: Alternate allele count (can be decimal with dosage data).\n"
+"      totallele: Allele observation count (can be higher than --freq value, due\n"
+"                 to inclusion of het haploids and chrX model).\n"
+"      altcountcc: alt count in cases, then controls (case/control only).\n"
+"      totallelecc: Case and control allele observation counts.\n"
+"      altfreq: alt allele frequency.\n"
+"      altfreqcc: alt frequency in cases, then controls (case/control only).\n"
+"      machr2: Empirical divided by theoretical variance quality metric.\n"
+"      firth: Reports whether Firth regression was used (firth-fallback only).\n"
+"      test: Test identifier.  (Required unless only one test is run.)\n"
+"      nobs: Number of samples in the regression.\n"
+"      beta: Regression coefficient (for alternate allele).\n"
+"      orbeta: Odds ratio for case/control, beta for quantitative traits.\n"
+"      se: Standard error of beta/odds ratio.\n"
+"      ci: Bounds of symmetric approximate confidence interval (requires --ci).\n"
+"      t: T-statistic.\n"
+"      p: Asymptotic p-value for t-statistic.\n"
+"    The default is chrom,pos,ref,alt,firth,test,nobs,orbeta,se,ci,t,p.\n\n"
+	       );
+    help_print("score", &help_ctrl, 1,
+"  --score [filename] {i} {j} {k} <header | header-read> <no-mean-imputation>\n"
+"          <center | variance-standardize> <se> <zs>\n"
+"          <list-variants | list-variants-zs> <cols=[col set descriptor]>\n"
+"    Apply linear scoring system(s) to each sample.\n"
+"    The input file should have one line per scored variant.  Variant IDs are\n"
+"    read from column #i and allele codes are read from column #j, where i\n"
+"    defaults to 1 and j defaults to i+1.\n"
+"    * By default, a single column of input coefficients is read from column #k,\n"
+"      where k defaults to j+1.  (--score-number can be used to specify multiple\n"
+"      columns.)\n"
+"    * The 'header' modifier causes the first nonempty line of the input file to\n"
+"      be treated as an ignorable header line, while 'header-read' causes score\n"
+"      column header(s) to be read and included in the report.\n"
+"    * By default, copies of unnamed alleles contribute zero to score, while\n"
+"      missing genotypes contribute an amount proportional to the loaded (via\n"
+"      --read-freq) or imputed allele frequency.  To throw out missing\n"
+"      observations instead (decreasing the denominator in the final average\n"
+"      when this happens), use the 'no-mean-imputation' modifier.\n"
+"    * You can use the 'center' modifier to shift all genotypes to mean zero, or\n"
+"      'variance-standardize' to linearly transform the genotypes to mean-0,\n"
+"      variance-1.  ('variance-standardize' cannot be used with chrX or MT.)\n"
+"    * The 'se' modifier causes the score coefficients to be treated as\n"
+"      independent standard errors; in this case, standard errors for the score\n"
+"      average/sum are reported.  (Note that this will systematically\n"
+"      underestimate standard errors when scored variants are in LD.)\n"
+"    * The 'list-variants{-zs}' modifier causes variant IDs used for scoring to\n"
+"      be written to [output prefix].sscore.vars{.zst}.\n"
+"    The main report supports the following column sets:\n"
+"      (FID and IID are always present, and positioned here.)\n"
+"      maybesid: SID, if at least one nonmissing value is present.\n"
+"      sid: Force SID column to be written even when empty.\n"
+"      pheno1: First active phenotype.\n"
+"      phenos: All active phenotypes, if any.\n"
+"      nmissallele: Number of nonmissing alleles.\n"
+"      denom: Denominator of score average (equal to nmissallele value when\n"
+"             'no-mean-imputation' specified)\n"
+"      dosagesum: Sum of named allele dosages.\n"
+"      scoreavgs: Score averages.\n"
+"      scoresums: Score sums.\n"
+"    The default is maybesid,phenos,nmissallele,dosagesum,scoreavgs.\n\n"
+	       );
+    help_print("genotyping-rate", &help_ctrl, 1,
+"  --genotyping-rate <dosage>\n"
+"    Report genotyping rate in log (this was automatic in PLINK 1.x).\n\n"
+	       );
+    help_print("validate", &help_ctrl, 1,
+"  --validate\n"
+"    Validates all variant records in a .pgen file.\n\n"
+	       );
+    help_print("zst-decompress", &help_ctrl, 1,
+"  --zst-decompress [.zst file] {output filename}\n"
+"    Decompress a Zstd-compressed file.  If no output filename is specified, the\n"
+"    file is decompressed to standard output.\n"
+"    This cannot be used with any other flags, and does not cause a log file to\n"
+"    be generated.\n\n"
+	       );
+    if (!param_ct) {
+      fputs(
+"The following other flags are supported.\n"
+// tbd: document order of operations
+, stdout);
+    }
+    help_print("script\trerun", &help_ctrl, 0,
+"  --script [fname]   : Include command-line options from file.\n"
+"  --rerun {log}      : Rerun commands in log (default '" PROG_NAME_STR ".log').\n"
+	       );
+    help_print("version", &help_ctrl, 0,
+"  --version          : Display only version number before exiting.\n"
+	       );
+    help_print("silent", &help_ctrl, 0,
+"  --silent           : Suppress output to console.\n"
+	       );
+    help_print("input-missing-genotype\tmissing-genotype", &help_ctrl, 0,
+"  --input-missing-genotype [c] : '.' is always interpreted as a missing\n"
+"                                 genotype code in input files.  By default, '0'\n"
+"                                 also is; you can change this second missing\n"
+"                                 code with --input-missing-genotype.\n"
+	       );
+    help_print("vcf\tbcf\tbgen\tdouble-id\tconst-fid\tid-delim", &help_ctrl, 0,
+"  --double-id        : Set both FIDs and IIDs to the VCF/.bgen sample ID.\n"
+"  --const-fid {ID}   : Set all FIDs to the given constant (default '0').\n"
+"  --id-delim {d}     : Parse sample IDs as [FID][d][IID] (or\n"
+"                       [FID][d][IID][d][SID] when delimiter appears twice).\n"
+"                       Default delimiter is '_'.\n"
+	       );
+    help_print("idspace-to\tvcf\tbcf\tbgen\tid-delim\tvcf-idspace-to", &help_ctrl, 0,
+"  --idspace-to [c]   : Convert spaces in VCF/.bgen sample IDs to the given\n"
+"                       character.\n"
+	       );
+    help_print("vcf\tbcf\tvcf-half-call\tvcf-min-gq\tvcf-min-dp\tvcf-require-gt", &help_ctrl, 0,
+"  --vcf-require-gt   : Skip variants with no GT field.\n"
+"  --vcf-min-gq [val] : No-call genotypes when GQ is present and below the\n"
+"                       threshold.\n"
+"  --vcf-min-dp [val] : No-call genotypes when DP is present and below the\n"
+"                       threshold.\n"
+"  --vcf-half-call [] : Specify how '0/.' and similar VCF GT values should be\n"
+"                       handled.  The following four modes are supported:\n"
+"                       * 'error'/'e' (default) errors out and reports line #.\n"
+"                       * 'haploid'/'h' treats them as haploid calls.\n"
+"                       * 'missing'/'m' treats them as missing.\n"
+"                       * 'reference'/'r' treats the missing value as 0.\n"
+	       );
+    help_print("oxford-single-chr\tdata\tgen", &help_ctrl, 0,
+"  --oxford-single-chr [chr name]  : Specify single-chromosome .gen file with\n"
+"                                    ignorable first column.\n"
+	       );
+    // any need to keep --hard-call-threshold random?  postpone it for now...
+    help_print("hard-call-threshold\tgen\tbgen\tdata\timport-dosage", &help_ctrl, 0,
+"  --hard-call-threshold [val]     : When importing dosage data, a hardcall is\n"
+"                                    normally saved when the distance from the\n"
+"                                    nearest hardcall, defined as\n"
+"                                      0.5 * sum_i |x_i - round(x_i)|\n"
+"                                    (where the x_i's are 0..2 allele dosages),\n"
+"                                    is not greater than 0.1.  You can adjust\n"
+"                                    this threshold by providing a numeric\n"
+"                                    parameter to --hard-call-threshold.\n"
+"                                    You can also use this with --make-{b}pgen\n"
+"                                    to alter the saved hardcalls while leaving\n"
+"                                    the dosages untouched.\n"
+	       );
+    help_print("dosage-erase-threshold\timport-dosage-certainty\tgen\tbgen\tdata\tvcf\tbcf\timport-dosage", &help_ctrl, 0,
+"  --dosage-erase-threshold [val]  : --hard-call-threshold normally preserves\n"
+"                                    the original dosages, and several PLINK 2.x\n"
+"                                    commands use them when they're available.\n"
+"                                    Use --dosage-erase-threshold to make PLINK\n"
+"                                    erase dosages and keep only hardcalls when\n"
+"                                    distance-from-hardcall <= the given level.\n"
+"  --import-dosage-certainty [val] : The PLINK 2.0 file format currently\n"
+"                                    supports a single dosage for each allele.\n"
+"                                    Some other dosage file formats include a\n"
+"                                    separate probability for every possible\n"
+"                                    genotype, e.g. {P(0/0)=0.2, P(0/1)=0.52,\n"
+"                                    P(1/1)=0.28}, a highly uncertain call that\n"
+"                                    is nevertheless treated as a hardcall under\n"
+"                                    '--hard-call-threshold 0.1'.  To make PLINK\n"
+"                                    treat a dosage as missing whenever the\n"
+"                                    largest probability is less than a\n"
+"                                    threshold, use --import-dosage-certainty.\n"
+	       );
+    help_print("missing-code\tmissing_code\tdata\tsample", &help_ctrl, 0,
+"  --missing-code {string list}    : Comma-delimited list of missing phenotype\n"
+"    (alias: --missing_code)         values for Oxford-format import (default\n"
+"                                    'NA').\n"
+	       );
+    help_print("allow-extra-chr\taec", &help_ctrl, 0,
+"  --allow-extra-chr  : Permit unrecognized chromosome codes (alias --aec).\n"
+	       );
+    // possible todo: nonhuman PARs?
+    help_print("chr-set\tcow\tdog\thorse\thound\tmouse\trice\tsheep\tautosome-num\thuman\tchr-override", &help_ctrl, 0,
+"  --chr-set [autosome ct] <no-x> <no-y> <no-xy> <no-mt> :\n"
+"    Specify a nonhuman chromosome set.  The first parameter sets the number of\n"
+"    diploid autosome pairs if positive, or haploid chromosomes if negative.\n"
+"    Given diploid autosomes, the remaining modifiers indicate the absence of\n"
+"    the named non-autosomal chromosomes.\n"
+"  --cow/--dog/--horse/--mouse/--rice/--sheep : Shortcuts for those species.\n"
+"  --autosome-num [val]  : Alias for '--chr-set [value] no-y no-xy no-mt'.\n"
+"  --human               : Explicitly specify human chromosome set, and make\n"
+"                          output .pvar/VCF files include a ##chrSet header\n"
+"                          line.  (.pvar/VCF output files automatically include\n"
+"                          ##chrSet when a nonhuman set is specified.)\n"
+"  --chr-override <file> : By default, if --chr-set/--autosome-num/--human/etc.\n"
+"                          conflict with an input file ##chrSet header line,\n"
+"                          PLINK will error out.  --chr-override with no\n"
+"                          parameter causes the command line to take precedence;\n"
+"                          '--chr-override file' defers to the file.\n"
+	       );
+    help_print("biallelic-only\tvar-min-qual\tvar-filter\tvcf-min-qual\tvcf-filter", &help_ctrl, 0,
+"  --biallelic-only <strict> <list> : Skip variants with 2+ alt. alleles.\n"
+"  --var-min-qual [val]             : Skip variants with low/missing QUAL.\n"
+"  --var-filter {exception(s)...}   : Skip variants which have FILTER failures.\n"
+	       );
+    /*
+    help_print("allow-no-samples\tallow-no-vars", &help_ctrl, 0,
+"  --allow-no-samples : Allow the input fileset to contain no samples.\n"
+"  --allow-no-vars    : Allow the input fileset to contain no variants.\n"
+	       );
+    */
+    help_print("pheno\tpheno-name", &help_ctrl, 0,
+"  --pheno [filename] : Specify additional phenotype/covariate file.\n"
+"  --pheno-name [...] : Only load the designated phenotype(s) from the\n"
+"                       --pheno (if one was specified) or .psam (if no --pheno)\n"
+"                       file.  Separate multiple names with spaces or commas,\n"
+"                       and use dashes to designate ranges.\n"
+	       );
+    help_print("input-missing-phenotype\t1\tmissing-catname\tmissing-phenotype", &help_ctrl, 0,
+"  --input-missing-phenotype [v] : Set number to treat as a missing phenotype in\n"
+"                                  input files (default -9).\n"
+"  --1                           : Expect case/control phenotypes in input files\n"
+"                                  to be coded as 0 = control, 1 = case, instead\n"
+"                                  of the usual 0 = missing, 1 = ctrl, 2 = case.\n"
+"  --missing-catname [str]       : Set missing-categorical-phenotype string\n"
+"                                  (case-sensitive, default 'NONE').\n"
+	       );
+    help_print("covar\tcovar-name", &help_ctrl, 0,
+"  --covar [filename] : Specify additional covariate file.\n"
+"  --covar-name [...] : Only load the designated covariate(s) from the\n"
+"                       --covar (if one was specified), --pheno (if no --covar),\n"
+"                       or .psam (if no --covar or --pheno) file.\n"
+	       );
+    help_print("within\tmwithin\tfamily\tfamily-missing-catname", &help_ctrl, 0,
+"  --within [f] {new pheno name} : Import a PLINK 1.x categorical phenotype.\n"
+"                                  (Phenotype name defaults to 'CATPHENO'.)\n"
+"                                  * If any numeric values are present, ALL\n"
+"                                    values must be numeric.  In that case, 'C'\n"
+"                                    is added in front of all category names.\n"
+"                                  * 'NA' is treated as a missing value.\n"
+"  --mwithin [n]                 : Load --within categories from column n+2.\n"
+"  --family {new pheno name}     : Create a categorical phenotype from FID.\n"
+"                                  Restrictions on and handling of numeric\n"
+"                                  values are the same as for --within.\n"
+"  --family-missing-catname [nm] : Make --family treat the specified FID as\n"
+"                                  missing.\n"
+	       );
+    help_print("keep\tremove\tkeep-fam\tremove-fam", &help_ctrl, 0,
+"  --keep <sid> [fn...]  : Exclude all samples not named in a file.\n"
+"  --remove <sid> [f...] : Exclude all samples named in a file.\n"
+"  --keep-fam [fname...] : Exclude all families not named in a file.\n"
+"  --remove-fam [fn...]  : Exclude all families named in a file.\n"
+	       );
+    help_print("extract\texclude\trange", &help_ctrl, 0,
+"  --extract <range> [f...] : Exclude all variants not named in a file.\n"
+"  --exclude <range> [f...] : Exclude all variants named in a file.\n"
+	       );
+    help_print("keep-cats\tkeep-cat-names\tkeep-cat-pheno\tremove-cats\tremove-cat-names\tremove-cat-pheno\tkeep-clusters\tkeep-cluster-names\tremove-clusters\tremove-cluster-names", &help_ctrl, 0,
+"  --keep-cats [filename]   : These can be used individually or in combination\n"
+"  --keep-cat-names [nm...]   to define a list of categories to keep; all\n"
+"                             samples not in one of the named categories are\n"
+"                             excluded.  Use spaces to separate category names\n"
+"                             for --keep-cat-names.  Use the --missing-catname\n"
+"                             value (default 'NONE') to refer to the group of\n"
+"                             uncategorized samples.\n"
+"  --keep-cat-pheno [pheno] : If more than one categorical phenotype is loaded,\n"
+"                             or you wish to filter on a categorical covariate,\n"
+"                             --keep-cat-pheno must be used to specify which\n"
+"                             phenotype/covariate --keep-cats and\n"
+"                             --keep-cat-names apply to.\n"
+"  --remove-cats [filename] : Exclude all categories named in the file.\n"
+"  --remove-cat-names [...] : Exclude named categories.\n"
+"  --remove-cat-pheno [phe] : Specify pheno for --remove-cats/remove-cat-names.\n"
+	       );
+    help_print("split-cat-pheno\tdummy-coding\tloop-assoc", &help_ctrl, 0,
+"  --split-cat-pheno <omit-last> <covar-01> {cat. pheno/covar name(s)...} :\n"
+"    Split n-category phenotype(s) into n (or n-1, with 'omit-last') binary\n"
+"    phenotypes, with names of the form [orig. pheno name]=[category name].  (As\n"
+"    a consequence, affected phenotypes and categories are not permitted to\n"
+"    contain the '=' character.)\n"
+"    * This happens after all sample filters.\n"
+"    * If no phenotype or covariate names are provided, all categorical\n"
+"      phenotypes (but not covariates) are processed.\n"
+"    * By default, generated covariates are coded as 1=false, 2=true.  To code\n"
+"      them as 0=false, 1=true instead, add the 'covar-01' modifier.\n"
+	       );
+    help_print("variance-standardize\tcovar-variance-standardize\tquantile-normalize\tpheno-quantile-normalize\tcovar-quantile-normalize\tstandard-beta\tglm\tlinear\tlogistic", &help_ctrl, 0,
+"  --variance-standardize {pheno/covar name(s)...}\n"
+"  --covar-variance-standardize {covar name(s)...} :\n"
+"    Linearly transform named covariates (and quantitative phenotypes, if\n"
+"    --variance-standardize) to mean-zero, variance 1.  If no parameters are\n"
+"    provided, all possible phenotypes/covariates are affected.\n"
+"  --quantile-normalize {...}       : Force named covariates and quantitative\n"
+"  --pheno-quantile-normalize {...}   phenotypes to a N(0,1) distribution,\n"
+"  --covar-quantile-normalize {...}   preserving only the original rank orders.\n"
+	       );
+    help_print("chr\tnot-chr", &help_ctrl, 0,
+"  --chr [chr(s)...]  : Exclude all variants not on the given chromosome(s).\n"
+"                       Valid choices for humans are 0 (unplaced), 1-22, X, Y,\n"
+"                       XY, MT, PAR1, and PAR2.  Separate multiple chromosomes\n"
+"                       with spaces and/or commas, and use a dash (no adjacent\n"
+"                       spaces permitted) to denote a range, e.g.\n"
+"                       '--chr 1-4, 22, par1, x, par2'.\n"
+"  --not-chr [...]    : Reverse of --chr (exclude variants on listed\n"
+"                       chromosomes).\n"
+	       );
+    help_print("autosome\tautosome-par\tautosome-xy\tchr\tnot-chr", &help_ctrl, 0,
+"  --autosome         : Exclude all non-autosomal variants.\n"
+"  --autosome-par     : Exclude all non-autosomal variants, except those in a\n"
+"                       pseudo-autosomal region.\n"
+	       );
+    help_print("snps-only", &help_ctrl, 0,
+"  --snps-only <just-acgt> : Exclude non-SNP variants.  By default, SNP = all\n"
+"                            allele codes are single-character; 'just-acgt'\n"
+"                            restricts SNP codes to {A,C,G,T,a,c,g,t,[missing]}.\n"
+	       );
+    // best to only support --chr with --from-bp/--to-bp/etc., now that
+    // finalize_chrset() is deferred
+    help_print("from\tto\tsnp\twindow\tfrom-bp\tto-bp\tfrom-kb\tto-kb\tfrom-mb\tto-mb\texclude-snp\textract-snp", &help_ctrl, 0,
+"  --from [var ID]    : Use ID(s) to specify a variant range to load.  When used\n"
+"  --to   [var ID]      together, both variants must be on the same chromosome.\n"
+"                       (--snps can be used to specify intervals which cross\n"
+"                       chromosome boundaries.)\n"
+"  --snp  [var ID]    : Specify a single variant to load.\n"
+"  --exclude-snp [ID] : Specify a single variant to exclude.\n"
+"  --window  [kbs]    : With --snp/--exclude-snp, loads/excludes all variants\n"
+"                       within half the specified kb distance of the named one.\n"
+"  --from-bp [pos]    : Use base-pair coordinates to define a variant range to\n"
+"  --to-bp   [pos]      load.\n"
+"  --from-kb [pos]      * You must use these with --chr, specifying a single\n"
+"  --to-kb   [pos]        chromosome.\n"
+"  --from-mb [pos]      * Decimals and negative numbers are permitted.\n"
+"  --to-mb   [pos]      * The --to-bp(/-kb/-mb) position is no longer permitted\n"
+"                         to be smaller than the --from-bp position.\n"
+	       );
+    help_print("snps\texclude-snps", &help_ctrl, 0,
+"  --snps [var IDs...]  : Use IDs to specify variant range(s) to load or\n"
+"  --exclude-snps [...]   exclude.  E.g. '--snps rs1111-rs2222, rs3333, rs4444'.\n"
+	       );
+    help_print("force-intersect\textract\tfrom\tto\tsnp\tfrom-bp\tto-bp\tfrom-kb\tto-kb\tfrom-mb\tto-mb\textract-snp\tsnps", &help_ctrl, 0,
+"  --force-intersect    : PLINK 2 normally errors out when multiple variant\n"
+"                         inclusion filters (--extract, --from/--to,\n"
+"                         --from-bp/--to-bp, --snp, --snps) are specified.\n"
+"                         --force-intersect allows the run to proceed; the set\n"
+"                         intersection will be taken.\n"
+	       );
+    help_print("geno\tmind\toblig-clusters\toblig-missing", &help_ctrl, 0,
+"  --geno {val} <dosage | hh-missing>\n"
+"  --mind {val} <dosage | hh-missing> : \n"
+"    Exclude variants (--geno) and/or samples (--mind) with missing call\n"
+"    frequencies greater than a threshold (default 0.1).  (Note that the default\n"
+"    threshold is only applied if --geno/--mind is invoked without a parameter;\n"
+"    when --geno/--mind is not invoked, no missing call frequency ceiling is\n""    enforced at all.  Other inclusion/exclusion default thresholds work the\n"
+"    same way.)\n"
+"    By default, when a dosage is present but a hardcall is not, the genotype is\n"
+"    treated as missing; add the 'dosage' modifier to treat this case as\n"
+"    nonmissing.  Alternatively, you can use 'hh-missing' to also treat\n"
+"    heterozygous haploid calls as missing.\n"
+	       );
+    /*
+    help_print("oblig-clusters\toblig-missing", &help_ctrl, 0,
+"  --oblig-missing [f1] [f2] : Specify blocks of missing genotype calls for\n"
+"                              --geno/--mind to ignore.  The first file should\n"
+"                              have variant IDs in the first column and block\n"
+"                              IDs in the second, while the second file should\n"
+"                              have FIDs in the first column, IIDs in the\n"
+"                              second, and block IDs in the third.\n"
+	       );
+    */
+    help_print("require-pheno\trequire-covar\tprune", &help_ctrl, 0,
+"  --require-pheno {name(s)...} : Remove samples missing any of the named\n"
+"  --require-covar {name(s)...}   phenotype(s)/covariate(s).  If no parameters\n"
+"                                 are provided, all phenotype(s)/covariate(s)\n"
+"                                 must be present.\n"
+	       );
+    help_print("maf\tmax-maf\tmac\tmin-ac\tmax-mac\tmax-ac", &help_ctrl, 0,
+"  --maf {freq}       : Exclude variants with nonmajor allele frequency lower\n"
+"                       than a threshold (default 0.01).\n"
+"  --max-maf [freq]   : Exclude variants with MAF greater than the threshold.\n"
+"  --mac [ct]         : Exclude variants with nonmajor allele dosage lower than\n"
+"                       the given threshold.\n"
+"  --max-mac [ct]     : Exclude variants with nonmajor allele dosage greater than\n"
+"                       the given threshold.\n"
+	       );
+    help_print("maf-succ", &help_ctrl, 0,
+"  --maf-succ         : Rule of succession allele frequency estimation (used in\n"
+"                       EIGENSOFT).  Given a j observations of one allele and k\n"
+"                       observations of the other for a biallelic variant, infer\n"
+"                       allele frequencies of (j+1) / (j+k+2) and\n"
+"                       (k+1) / (j+k+2), rather than the default j / (j+k) and\n"
+"                       k / (j+k).\n"
+"                       Note that this does not affect --freq's output.\n"
+	       );
+    help_print("read-freq", &help_ctrl, 0,
+"  --read-freq [file] : Load allele frequency estimates from the given --freq or\n"
+"                       --geno-counts (or PLINK 1.9 --freqx) report, instead of\n"
+"                       imputing them from the immediate dataset.\n"
+	       );
+// todo: something like <check-ctrls>/<check-ctrl=[case/ctrl phenotype name]>
+// and maybe <ctrls-only>/<ctrl-only=[case/ctrl phenotype name]>
+    help_print("hwe\tmach-r2-filter", &help_ctrl, 0,
+"  --hwe [p] <midp> <keep-fewhet> : Exclude variants with Hardy-Weinberg\n"
+"                                   equilibrium exact test p-values below a\n"
+"                                   threshold.\n"
+"                                   * By default, only founders are considered.\n"
+"                                   * chrX p-values are now computed using\n"
+"                                     Graffelman and Weir's method.\n"
+"                                   * With 'keep-fewhet', variants which fail\n"
+"                                     the test in the too-few-hets direction are\n"
+"                                     not excluded.  (On chrX, this uses the\n"
+"                                     ratio between the Graffelman/Weir p-value\n"
+"                                     and the female-only p-value.)\n"
+"                                   * There is currently no special handling of\n"
+"                                     case/control phenotypes.\n"
+"  --mach-r2-filter {min} {max}   : Exclude variants with MaCH\n"
+"                                   empirical-theoretical variance ratio outside\n"
+"                                   of [min, max] (defaults 0.1 and 2.0).\n"
+"                                   * For multiallelic variants, only the\n"
+"                                     ref-nonref dimension is considered.\n"
+"                                   * If a single parameter is provided, it is\t"
+"                                     treated as the minimum.\n"
+	       );
+    help_print("keep-females\tkeep-males\tkeep-nosex\tremove-females\tremove-males\tremove-nosex\tfilter-males\tfilter-females", &help_ctrl, 0,
+"  --keep-females     : Exclude male and unknown-sex samples.\n"
+"  --keep-males       : Exclude female and unknown-sex samples.\n"
+"  --keep-nosex       : Exclude all known-sex samples.\n"
+"  --remove-females   : Exclude female samples.\n"
+"  --remove-males     : Exclude male samples.\n"
+"  --remove-nosex     : Exclude unknown-sex samples.\n"
+	       );
+    help_print("keep-founders\tkeep-nonfounders\tfilter-founders\tfilter-nonfounders\tgeno-counts", &help_ctrl, 0,
+"  --keep-founders    : Exclude nonfounder samples.\n"
+"  --keep-nonfounders : Exclude founder samples.\n"
+	       );
+    // possible todo: allow or/and of multiple predicates
+    // best if syntax allows for '=' character inside phenotype/covariate
+    // names, though...
+    help_print("keep-if\tremove-if\tfilter-cases\tfilter-controls", &help_ctrl, 0,
+"  --keep-if [pheno/covar] [op] [val] : Exclude samples which don't/do satisfy a\n"
+"  --remove-if [pheno/covar] [op] [v]   comparison predicate, e.g.\n"
+"                                         --keep-if PHENO1 == case\n"
+"                                       Unless the operator is !=, the predicate\n"
+"                                       always evaluates to false when the\n"
+"                                       phenotype/covariate is missing.\n"
+	       );
+    help_print("nonfounders\tfreq\thardy\thwe", &help_ctrl, 0,
+"  --nonfounders      : Include nonfounders in allele freq/HWE calculations.\n"
+	       );
+    help_print("output-chr", &help_ctrl, 0,
+"  --output-chr [MT code] : Set chromosome coding scheme in output files by\n"
+"                           providing the desired human mitochondrial code.\n"
+"                           Options are '26', 'M', 'MT', '0M', 'chr26', 'chrM',\n"
+"                           and 'chrMT'; default is now 'MT' (note that this is\n"
+"                           a change from PLINK 1.x, which defaulted to '26').\n"
+	       );
+    help_print("output-missing-genotype\toutput-missing-phenotype\tmissing-genotype\tmissing-phenotype", &help_ctrl, 0,
+"  --output-missing-genotype [ch] : Set the code used to represent missing\n"
+"                                   genotypes in output files (default '.').\n"
+"  --output-missing-phenotype [s] : Set the string used to represent missing\n"
+"                                   phenotypes in output files (default 'NA').\n"
+	       );
+    /*
+    help_print("sort-vars", &help_ctrl, 0,
+"  --sort-vars {mode}      : Sort variants by chromosome, then position, then\n"
+"                            ID.  The following string orders are supported:\n"
+"                            * 'natural'/'n': Natural sort (default).\n"
+"                            * 'ascii'/'a': ASCII.\n"
+"                            This must be used with --make-{b}pgen/--make-bed.\n"
+	       );
+    */
+    help_print("set-hh-missing\tset-mixed-mt-missing", &help_ctrl, 0,
+"  --set-hh-missing        : Make --make-{b}pgen/--make-bed set heterozygous\n"
+"                            haploid and female chrY genotypes to missing.\n"
+"                            (Unlike PLINK 1.x, this does not change unknown-sex\n"
+"                            chrY genotypes.)\n"
+"  --set-mixed-mt-missing  : Make --make-{b}pgen/--make-bed set mixed MT\n"
+"                            genotypes to missing.\n"
+	       );
+    help_print("split-par\tmerge-par\tsplit-x\tmerge-x", &help_ctrl, 0,
+"  --split-par [bp1] [bp2] : Changes chromosome code of all X chromosome\n"
+"  --split-par [build]       variants with bp position <= bp1 to PAR1, and those\n"
+"                            with position >= bp2 to PAR2.  The following build\n"
+"                            codes are supported as shorthand:\n"
+"                            * 'b36'/'hg18' = NCBI 36, 2709521/154584237\n"
+"                            * 'b37'/'hg19' = GRCh37, 2699520/154931044\n"
+"                            * 'b38'/'hg38' = GRCh38, 2781479/155701383\n"
+"  --merge-par             : Merge PAR1/PAR2 back with X.  Requires PAR1 to be\n"
+"                            positioned immediately before X, and PAR2 to be\n"
+"                            immediately after X.  (Should *not* be used with\n"
+"                            \"--export vcf\", since it causes male\n"
+"                            homozygous/missing calls in PAR1/PAR2 to be\n"
+"                            reported as haploid.)\n"
+	       );
+    help_print("set-all-var-ids\tset-missing-var-ids\tnew-id-max-allele-len\tmissing-var-code", &help_ctrl, 0,
+"  --set-missing-var-ids [t]  : Given a template string with a '@' where the\n"
+"  --set-all-var-ids [t]        chromosome code should go and '#' where the bp\n"
+"                               coordinate belongs, --set-missing-var-ids\n"
+"                               assigns chromosome-and-bp-based IDs to unnamed\n"
+"                               variants, while --set-all-var-ids resets all\n"
+"                               IDs.\n"
+"                               You may also use '$r'/'$a' to refer to the\n"
+"                               ref and alt1 alleles, or '$1'/'$2' to refer to\n"
+"                               them in alphabetical order.\n"
+"  --new-id-max-allele-len [len] <error | missing | truncate> :\n"
+"    Specify maximum number of leading characters from allele codes to include\n"
+"    in new variant IDs, and behavior on longer codes (defaults 23, error).\n"
+"  --missing-var-code [str]   : Change unnamed variant code for\n"
+"                               --set-[missing/all]-var-ids (default '.').\n"
+	       );
+    help_print("update-sex", &help_ctrl, 0,
+"  --update-sex [f] {n} : Update sexes.  Sex (1/M/m = male, 2/F/f = female, 0 =\n"
+"                         missing) is loaded from column n+2 (default n is 1).\n"
+	       );
+    // don't make --real-ref-alleles apply to e.g. Oxford import, since
+    // explicit 'ref-first'/'ref-second' modifiers are clearer
+    help_print("real-ref-alleles\tmaj-ref\tkeep-allele-order", &help_ctrl, 0,
+"  --real-ref-alleles : Treat A2 alleles in a PLINK 1.x fileset as actual ref\n"
+"                       alleles; otherwise they're flagged as provisional.\n"
+"  --maj-ref <force>  : Set major alleles to reference, like PLINK 1.x\n"
+"                       automatically did.  (Note that this is now opt-in rather\n"
+"                       than opt-out; --keep-allele-order is no longer necessary\n"
+"                       to prevent allele-swapping.)  By default, this only\n"
+"                       affects variants with \"provisional reference\" flags;\n"
+"                       add 'force' to override validated reference alleles as\n"
+"                       well.\n"
+"                       All new reference alleles are marked as provisional.\n"
+	       );
+    help_print("indiv-sort", &help_ctrl, 0,
+"  --indiv-sort [m] <sid> {f} : Specify FID/IID(/SID) sort order for merge and\n"
+"                               --make-{b}pgen/--make-bed.  The following four\n"
+"                               modes are supported:\n"
+"                               * 'none'/'0' keeps samples in the order they\n"
+"                                 were loaded.  Default for non-merge.\n"
+"                               * 'natural'/'n' invokes \"natural sort\", e.g.\n"
+"                                 'id2' < 'ID3' < 'id10'.  Default when merging.\n"
+"                               * 'ascii'/'a' sorts in ASCII order, e.g.\n"
+"                                 'ID3' < 'id10' < 'id2'.\n"
+"                               * 'file'/'f' uses the order in the given file\n"
+"                                 (named in the last parameter).  The 'sid'\n"
+"                                 modifier has the usual effect when this mode\n"
+"                                 is requested.\n"
+	       );
+    help_print("make-king\tmake-king-table\tking-table-filter", &help_ctrl, 0,
+"  --king-table-filter [min]  : Specify minimum kinship coefficient for\n"
+"                               inclusion in --make-king-table report.\n"
+	       );
+    help_print("glm\tlinear\tlogistic\tcondition\tcondition-list\tparameters\ttests", &help_ctrl, 0,
+"  --condition [var ID] <dominant | recessive> : Add one variant's alt1 dosages\n"
+"                                                as a --glm covariate.\n"
+"  --condition-list [f] <dominant | recessive> : Add all variants in the file as\n"
+"                                                --glm covariates.\n"
+"  --parameters [...] : Include only the given covariates/interactions in the\n"
+"                       --glm model, identified by a list of 1-based indices\n"
+"                       and/or ranges of them.\n"
+	       /*
+"  --tests [...]      : Perform a (joint) test on the specified term(s) in the\n"
+"  --tests all          --glm model, identified by 1-based indices and/or ranges\n"
+"                       of them.  If permutation was requested, it is based on\n"
+"                       this test.\n"
+"                       * Note that, when --parameters is also present, the\n"
+"                         indices refer to the terms remaining AFTER pruning by\n"
+"                         --parameters.\n"
+"                       * You can use '--tests all' to include all terms.\n"
+	       */
+	       );
+    help_print("glm\tlinear\tlogistic\tvif\tmax-corr", &help_ctrl, 0,
+"  --vif [max VIF]    : Set VIF threshold for --glm multicollinearity check\n"
+"                       (default 50).  For case/control phenotypes, only the\n"
+"                       covariates are checked, and the entire phenotype is\n"
+"                       skipped if a VIF is too high.\n"
+"  --max-corr [val]   : Skip --glm regression when the absolute value of the\n"
+"                       correlation between two predictors exceeds this value\n"
+"                       (default 0.999).  For case/control phenotypes, only\n"
+"                       covariates are checked.\n"
+	       );
+    help_print("glm\tlinear\tlogistic\tscore\txchr-model", &help_ctrl, 0,
+"  --xchr-model [m]   : Set the chrX --glm/--score model.\n"
+"                       * '0' = skip chrX.\n"
+"                       * '1' = add sex as a covar on chrX, code males 0..1.\n"
+"                       * '2' (default) = chrX sex covar, code males 0..2.\n"
+"                       (Use the --glm 'interaction' modifier to test for\n"
+"                       interaction between genotype and sex.)\n"
+	       );
+    /*
+    help_print("adjust", &help_ctrl, 0,
+"  --adjust <gc> <log10> <cols=[column set descriptor]> :\n"
+"    For each association test, report some multiple-testing corrections, sorted\n"
+"    in increasing-p-value order.\n"
+"    * 'gc' causes genomic-controlled p-values to be used in the formulas.\n"
+"    * 'log10' causes negative base 10 logs of p-values to be reported, instead\n"
+"      of raw p-values.\n"
+"    The following column sets are supported:\n"
+"      chrom: Chromosome ID.\n"
+"      pos: Base-pair coordinate.\n"
+"      (ID is always present, and positioned here.)\n"
+"      ref: Reference allele.\n"
+"      alt1: Alternate allele 1.\n"
+"      alt: All alternate alleles, comma-separated.\n"
+"      unadj: Unadjusted p-value.\n"
+"      gc: Devlin & Roeder (1999) genomic control corrected p-value (additive\n"
+"          models only).\n"
+"      qq: P-value quantile.\n"
+"      bonf: Bonferroni correction.\n"
+"      holm: Holm-Bonferroni (1979) adjusted p-value.\n"
+"      sidakss: Sidak single-step adjusted p-value.\n"
+"      sidaksd: Sidak step-down adjusted p-value.\n"
+"      fdrbh: Benjamini & Hochberg (1995) step-up false discovery control.\n"
+"      fdrby: Benjamini & Yekutieli (2001) step-up false discovery control.\n"
+"    Default set is chrom,unadj,gc,bonf,holm,sidakss,sidaksd,fdrbh,fdrby.\n"
+	       );
+    help_print("adjust\tlambda", &help_ctrl, 0,
+"  --lambda           : Set genomic control lambda for --adjust.\n"
+	       );
+    */
+    help_print("ci\tlinear\tlogistic", &help_ctrl, 0,
+"  --ci [size]        : Report confidence ratios for odds ratios/betas.\n"
+	       );
+    help_print("pfilter", &help_ctrl, 0,
+"  --pfilter [val]    : Filter out assoc. test results with higher p-values.\n"
+	       );
+    /*
+    help_print("aperm", &help_ctrl, 0,
+"  --aperm [min perms - 1] {max perms} {alpha} {beta} {init interval} {slope} :\n"
+"    Set up to six parameters controlling adaptive permutation tests.\n"
+"    * The first two control the minimum and maximum number of permutations that\n"
+"      may be run for each variant; default values are 5 and 1000000.\n"
+"    * The next two control the early termination condition.  A\n"
+"      100% * (1 - beta/2T) confidence interval is calculated for each empirical\n"
+"      p-value, where T is the total number of variants; whenever this\n"
+"      confidence interval doesn't contain alpha, the variant is exempted from\n"
+"      further permutation testing.  Default values are 0 and 1e-4.\n"
+"    * The last two control when the early termination condition is checked.  If\n"
+"      a check occurs at permutation #p, the next check occurs after\n"
+"      [slope]p + [init interval] more permutations (rounded down).  Default\n"
+"      initial interval is 1, and default slope is 0.001.\n"
+	       );
+    help_print("mperm-save\tmperm-save-all", &help_ctrl, 0,
+"  --mperm-save       : Save best max(T) permutation test statistics.\n"
+"  --mperm-save-all   : Save all max(T) permutation test statistics.\n"
+	       );
+    */
+    help_print("score-col-nums\tscore", &help_ctrl, 0,
+"  --score-col-nums [...] : Process all the specified coefficient columns in the\n"
+"                           --score file, identified by 1-based indexes and/or\n"
+"                           ranges of them.\n"
+	       );
+    help_print("parallel", &help_ctrl, 0,
+"  --parallel [k] [n] : Divide the output matrix into n pieces, and only compute\n"
+"                       the kth piece.  The primary output file will have the\n"
+"                       piece number included in its name, e.g. plink2.king.13\n"
+"                       or plink2.king.13.zst if k is 13.  Concatenating these\n"
+"                       files in order will yield the full matrix of interest.\n"
+"                       (Yes, this can be done before decompression.)\n"
+"                       N.B. This generally cannot be used to directly write a\n"
+"                       symmetric square matrix.  Choose square0 or triangle\n"
+"                       shape instead, and postprocess as necessary.\n"
+	       );
+    help_print("memory\tseed", &help_ctrl, 0,
+"  --memory [val] <require> : Set size, in MB, of initial workspace malloc\n"
+"                             attempt.  To error out instead of reducing the\n"
+"                             request size when the initial attempt fails, add\n"
+"                             the 'require' modifier.\n"
+	       );
+    help_print("threads\tnum_threads\tthread-num\tseed", &help_ctrl, 0,
+"  --threads [val]    : Set maximum number of compute threads.\n"
+	       );
+    help_print("seed", &help_ctrl, 0,
+"  --seed [val...]    : Set random number seed(s).  Each value must be an\n"
+"                       integer between 0 and 4294967295 inclusive.\n"
+"                       Note that --threads and \"--memory require\" may also be\n"
+"                       needed to reproduce some randomized runs.\n"
+	       );
+    help_print("output-min-p", &help_ctrl, 0,
+"  --output-min-p [p] : Specify minimum p-value to write to reports.\n"
+	       );
+    help_print("debug", &help_ctrl, 0,
+"  --debug            : Use slower, more crash-resistant logging method.\n"
+	       );
+    help_print("warning-errcode", &help_ctrl, 0,
+"  --warning-errcode  : Return a nonzero error code to the OS when a run\n"
+"                       completes with warning(s).\n"
+	       );
+    if (!param_ct) {
+      fputs(
+"\nPrimary methods paper:\n"
+"Chang CC, Chow CC, Tellier LCAM, Vattikuti S, Purcell SM, Lee JJ (2015)\n"
+"Second-generation PLINK: rising to the challenge of larger and richer datasets.\n"
+"GigaScience, 4.\n"
+, stdout);
+    }
+  } while (help_ctrl.iters_left--);
+  if (help_ctrl.unmatched_ct) {
+    net_unmatched_ct = help_ctrl.unmatched_ct;
+    printf("\nNo help entr%s for", (help_ctrl.unmatched_ct == 1)? "y" : "ies");
+    col_num = (help_ctrl.unmatched_ct == 1)? 17 : 19;
+    arg_uidx = 0;
+    // er, should replace the \n logic with a wordwrap() call
+    while (help_ctrl.unmatched_ct) {
+      arg_uidx = next_unset_unsafe(help_ctrl.all_match_arr, arg_uidx);
+      help_ctrl.unmatched_ct--;
+      if (help_ctrl.unmatched_ct) {
+	if (net_unmatched_ct == 2) {
+	  if (help_ctrl.param_slens[arg_uidx] + col_num > 76) {
+	    putc_unlocked('\n', stdout);
+	    col_num = 2 + help_ctrl.param_slens[arg_uidx];
+	  } else {
+	    putc_unlocked(' ', stdout);
+	    col_num += 3 + help_ctrl.param_slens[arg_uidx];
+	  }
+	  putc_unlocked('\'', stdout);
+	  fputs(argv[arg_uidx], stdout);
+	  putc_unlocked('\'', stdout);
+	} else {
+	  if (help_ctrl.param_slens[arg_uidx] + col_num > 75) {
+	    putc_unlocked('\n', stdout);
+	    col_num = 3 + help_ctrl.param_slens[arg_uidx];
+	  } else {
+	    putc_unlocked(' ', stdout);
+	    col_num += 4 + help_ctrl.param_slens[arg_uidx];
+	  }
+	  putc_unlocked('\'', stdout);
+	  fputs(argv[arg_uidx], stdout);
+          fputs("',", stdout);
+	}
+	if (help_ctrl.unmatched_ct == 1) {
+	  if (col_num > 76) {
+	    fputs("\nor", stdout);
+	    col_num = 2;
+	  } else {
+	    fputs(" or", stdout);
+	    col_num += 3;
+	  }
+	}
+      } else {
+	putc_unlocked((help_ctrl.param_slens[arg_uidx] + col_num > 75)? '\n' : ' ', stdout);
+	putc_unlocked('\'', stdout);
+        fputs(argv[arg_uidx], stdout);
+        fputs("\'.\n", stdout);
+      }
+      arg_uidx++;
+    }
+  }
+  if (param_ct) {
+    while (0) {
+    disp_help_ret_NOMEM:
+      reterr = kPglRetNomem;
+    }
+    free_cond(help_ctrl.param_slens);
+    free_cond(help_ctrl.all_match_arr);
+    if (help_ctrl.argv && (help_ctrl.argv != argv)) {
+      free(help_ctrl.argv);
+    }
+  }
+  return reterr;
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/plink2_help.h b/plink2_help.h
new file mode 100644
index 0000000..77191a0
--- /dev/null
+++ b/plink2_help.h
@@ -0,0 +1,35 @@
+#ifndef __PLINK2_HELP_H__
+#define __PLINK2_HELP_H__
+
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_common.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+extern const char g_cmdline_format_str[];
+
+pglerr_t disp_help(uint32_t param_ct, char** argv);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __PLINK2_HELP_H__
diff --git a/plink2_ld.cpp b/plink2_ld.cpp
new file mode 100644
index 0000000..7ac719e
--- /dev/null
+++ b/plink2_ld.cpp
@@ -0,0 +1,1488 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_ld.h"
+
+#ifdef __cplusplus
+#include <functional> // std::greater
+#endif
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+void init_ld(ld_info_t* ldip) {
+  ldip->prune_modifier = kfLdPrune0;
+  ldip->prune_window_size = 0;
+  ldip->prune_window_incr = 0;
+  ldip->prune_last_param = 0.0;
+}
+
+void cleanup_ld(__attribute__((unused)) ld_info_t* ldip) {
+}
+
+
+static inline void popcount_vecs_2intersect(const vul_t* __restrict vvec1_iter, const vul_t* __restrict vvec2a_iter, const vul_t* __restrict vvec2b_iter, uintptr_t vec_ct, uint32_t* popcount_1_2a_ptr, uint32_t* popcount_1_2b_ptr) {
+  // popcounts (vvec1 AND vvec2a) as well as (vvec1 AND vvec2b) over the
+  // first vec_ct vectors.  vec_ct must be a multiple of 3.
+  assert(!(vec_ct % 3));
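+  // note on the 30-vector cap below: each inner-loop pass folds the
+  // popcounts of 3 vectors into the 8-bit lanes of acc_a/acc_b, so a lane
+  // gains at most 24 per pass; 10 passes (30 vectors) keep each lane at
+  // <= 240 < 256, preventing overflow before the 16-bit horizontal sum.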
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t m2 = VCONST_UL(kMask3333);
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+
+  // todo: check whether moving this right before its usage is better; it
+  // looks like we barely have enough registers...
+  const vul_t m8 = VCONST_UL(kMask00FF);
+  uint32_t popcount_1_2a = 0;
+  uint32_t popcount_1_2b = 0;
+
+  while (1) {
+    univec_t acc_a;
+    univec_t acc_b;
+    acc_a.vi = vul_setzero();
+    acc_b.vi = vul_setzero();
+
+    const vul_t* vvec1_stop;
+    if (vec_ct < 30) {
+      if (!vec_ct) {
+	*popcount_1_2a_ptr = popcount_1_2a;
+	*popcount_1_2b_ptr = popcount_1_2b;
+	return;
+      }
+      vvec1_stop = &(vvec1_iter[vec_ct]);
+      vec_ct = 0;
+    } else {
+      vvec1_stop = &(vvec1_iter[30]);
+      vec_ct -= 30;
+    }
+    do {
+      vul_t loader = *vvec1_iter++;
+      vul_t count1a = loader & (*vvec2a_iter++);
+      vul_t count1b = loader & (*vvec2b_iter++);
+      loader = *vvec1_iter++;
+      vul_t count2a = loader & (*vvec2a_iter++);
+      vul_t count2b = loader & (*vvec2b_iter++);
+      loader = *vvec1_iter++;
+      vul_t half1a = loader & (*vvec2a_iter++);
+      vul_t half1b = loader & (*vvec2b_iter++);
+      const vul_t half2a = vul_rshift(half1a, 1) & m1;
+      const vul_t half2b = vul_rshift(half1b, 1) & m1;
+      half1a = half1a & m1;
+      half1b = half1b & m1;
+      count1a = count1a - (vul_rshift(count1a, 1) & m1);
+      count1b = count1b - (vul_rshift(count1b, 1) & m1);
+      count2a = count2a - (vul_rshift(count2a, 1) & m1);
+      count2b = count2b - (vul_rshift(count2b, 1) & m1);
+      count1a = count1a + half1a;
+      count1b = count1b + half1b;
+      count2a = count2a + half2a;
+      count2b = count2b + half2b;
+      count1a = (count1a & m2) + (vul_rshift(count1a, 2) & m2);
+      count1b = (count1b & m2) + (vul_rshift(count1b, 2) & m2);
+      count1a = count1a + (count2a & m2) + (vul_rshift(count2a, 2) & m2);
+      count1b = count1b + (count2b & m2) + (vul_rshift(count2b, 2) & m2);
+      acc_a.vi = acc_a.vi + (count1a & m4) + (vul_rshift(count1a, 4) & m4);
+      acc_b.vi = acc_b.vi + (count1b & m4) + (vul_rshift(count1b, 4) & m4);
+    } while (vvec1_iter < vvec1_stop);
+    acc_a.vi = (acc_a.vi & m8) + (vul_rshift(acc_a.vi, 8) & m8);
+    acc_b.vi = (acc_b.vi & m8) + (vul_rshift(acc_b.vi, 8) & m8);
+    popcount_1_2a += univec_hsum_16bit(acc_a);
+    popcount_1_2b += univec_hsum_16bit(acc_b);
+  }
+}
+
+// don't bother with popcount_vecs_3intersect for now, but test later
+
+void popcount_longs_2intersect(const uintptr_t* __restrict bitvec1_iter, const uintptr_t* __restrict bitvec2a_iter, const uintptr_t* __restrict bitvec2b_iter, uintptr_t word_ct, uint32_t* popcount_1_2a_ptr, uint32_t* popcount_1_2b_ptr) {
+  const uintptr_t* bitvec1_end = &(bitvec1_iter[word_ct]);
+  uintptr_t trivec_ct = word_ct / (3 * kWordsPerVec);
+  uint32_t popcount_1_2a;
+  uint32_t popcount_1_2b;
+  popcount_vecs_2intersect((const vul_t*)bitvec1_iter, (const vul_t*)bitvec2a_iter, (const vul_t*)bitvec2b_iter, trivec_ct * 3, &popcount_1_2a, &popcount_1_2b);
+  bitvec1_iter = &(bitvec1_iter[trivec_ct * (3 * kWordsPerVec)]);
+  bitvec2a_iter = &(bitvec2a_iter[trivec_ct * (3 * kWordsPerVec)]);
+  bitvec2b_iter = &(bitvec2b_iter[trivec_ct * (3 * kWordsPerVec)]);
+  while (bitvec1_iter < bitvec1_end) {
+    const uintptr_t loader1 = *bitvec1_iter++;
+    popcount_1_2a += popcount_long(loader1 & (*bitvec2a_iter++));
+    popcount_1_2b += popcount_long(loader1 & (*bitvec2b_iter++));
+  }
+  *popcount_1_2a_ptr = popcount_1_2a;
+  *popcount_1_2b_ptr = popcount_1_2b;
+}
+
+
+static inline int32_t dotprod_vecs(const vul_t* __restrict vvec1a_iter, const vul_t* __restrict vvec1b_iter, const vul_t* __restrict vvec2a_iter, const vul_t* __restrict vvec2b_iter, uintptr_t vec_ct) {
+  // assumes vvec1a/vvec2a represent +1s, vvec1b/vvec2b represent -1s, and
+  // everything else is 0.  computes
+  //   popcount(vvec1a & vvec2a) + popcount(vvec1b & vvec2b)
+  //   - popcount(vvec1a & vvec2b) - popcount(vvec1b & vvec2a).
+  // vec_ct must be a multiple of 3.
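+  // (in the LD-pruning code below, a variant's genotypes are recoded via
+  // genoarr_split_02nm to x_i in {-1, 0, +1}, with the +1s in the 'a'
+  // bitvectors and the -1s in the 'b' bitvectors, so this evaluates
+  // sum_i x_i * y_i for a pair of variants.)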
+  assert(!(vec_ct % 3));
+  const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t m2 = VCONST_UL(kMask3333);
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  int32_t tot = 0;
+  while (1) {
+    univec_t acc_plus;
+    univec_t acc_minus;
+    acc_plus.vi = vul_setzero();
+    acc_minus.vi = vul_setzero();
+
+    const vul_t* vvec1a_stop;
+    if (vec_ct < 30) {
+      if (!vec_ct) {
+	return tot;
+      }
+      vvec1a_stop = &(vvec1a_iter[vec_ct]);
+      vec_ct = 0;
+    } else {
+      vvec1a_stop = &(vvec1a_iter[30]);
+      vec_ct -= 30;
+    }
+    do {
+      vul_t loader1a = *vvec1a_iter++;
+      vul_t loader1b = *vvec1b_iter++;
+      vul_t loader2a = *vvec2a_iter++;
+      vul_t loader2b = *vvec2b_iter++;
+      // loader1a and loader1b are disjoint, etc.; take advantage of that
+      vul_t count1_plus = (loader1a & loader2a) | (loader1b & loader2b);
+      vul_t count1_minus = (loader1a & loader2b) | (loader1b & loader2a);
+
+      loader1a = *vvec1a_iter++;
+      loader1b = *vvec1b_iter++;
+      loader2a = *vvec2a_iter++;
+      loader2b = *vvec2b_iter++;
+      vul_t count2_plus = (loader1a & loader2a) | (loader1b & loader2b);
+      vul_t count2_minus = (loader1a & loader2b) | (loader1b & loader2a);
+
+      loader1a = *vvec1a_iter++;
+      loader1b = *vvec1b_iter++;
+      loader2a = *vvec2a_iter++;
+      loader2b = *vvec2b_iter++;
+      vul_t half1_plus = (loader1a & loader2a) | (loader1b & loader2b);
+      vul_t half1_minus = (loader1a & loader2b) | (loader1b & loader2a);
+      const vul_t half2_plus = vul_rshift(half1_plus, 1) & m1;
+      const vul_t half2_minus = vul_rshift(half1_minus, 1) & m1;
+      half1_plus = half1_plus & m1;
+      half1_minus = half1_minus & m1;
+      count1_plus = count1_plus - (vul_rshift(count1_plus, 1) & m1);
+      count1_minus = count1_minus - (vul_rshift(count1_minus, 1) & m1);
+      count2_plus = count2_plus - (vul_rshift(count2_plus, 1) & m1);
+      count2_minus = count2_minus - (vul_rshift(count2_minus, 1) & m1);
+      count1_plus = count1_plus + half1_plus;
+      count1_minus = count1_minus + half1_minus;
+      count2_plus = count2_plus + half2_plus;
+      count2_minus = count2_minus + half2_minus;
+      count1_plus = (count1_plus & m2) + (vul_rshift(count1_plus, 2) & m2);
+      count1_minus = (count1_minus & m2) + (vul_rshift(count1_minus, 2) & m2);
+      count1_plus = count1_plus + (count2_plus & m2) + (vul_rshift(count2_plus, 2) & m2);
+      count1_minus = count1_minus + (count2_minus & m2) + (vul_rshift(count2_minus, 2) & m2);
+      acc_plus.vi = acc_plus.vi + (count1_plus & m4) + (vul_rshift(count1_plus, 4) & m4);
+      acc_minus.vi = acc_minus.vi + (count1_minus & m4) + (vul_rshift(count1_minus, 4) & m4);
+    } while (vvec1a_iter < vvec1a_stop);
+    const vul_t m8 = VCONST_UL(kMask00FF);
+    acc_plus.vi = (acc_plus.vi & m8) + (vul_rshift(acc_plus.vi, 8) & m8);
+    acc_minus.vi = (acc_minus.vi & m8) + (vul_rshift(acc_minus.vi, 8) & m8);
+    tot += (uint32_t)univec_hsum_16bit(acc_plus);
+    tot -= (uint32_t)univec_hsum_16bit(acc_minus);
+  }
+}
+
+int32_t dotprod_longs(const uintptr_t* __restrict bitvec1a_iter, const uintptr_t* __restrict bitvec1b_iter, const uintptr_t* __restrict bitvec2a_iter, const uintptr_t* __restrict bitvec2b_iter, uintptr_t word_ct) {
+  const uintptr_t* bitvec1a_end = &(bitvec1a_iter[word_ct]);
+  uintptr_t trivec_ct = word_ct / (kWordsPerVec * 3);
+  int32_t tot = dotprod_vecs((const vul_t*)bitvec1a_iter, (const vul_t*)bitvec1b_iter, (const vul_t*)bitvec2a_iter, (const vul_t*)bitvec2b_iter, trivec_ct * 3);
+  bitvec1a_iter = &(bitvec1a_iter[trivec_ct * (3 * kWordsPerVec)]);
+  bitvec1b_iter = &(bitvec1b_iter[trivec_ct * (3 * kWordsPerVec)]);
+  bitvec2a_iter = &(bitvec2a_iter[trivec_ct * (3 * kWordsPerVec)]);
+  bitvec2b_iter = &(bitvec2b_iter[trivec_ct * (3 * kWordsPerVec)]);
+  while (bitvec1a_iter < bitvec1a_end) {
+    uintptr_t loader1a = *bitvec1a_iter++;
+    uintptr_t loader1b = *bitvec1b_iter++;
+    uintptr_t loader2a = *bitvec2a_iter++;
+    uintptr_t loader2b = *bitvec2b_iter++;
+    tot += popcount_long((loader1a & loader2a) | (loader1b & loader2b));
+    tot -= popcount_long((loader1a & loader2b) | (loader1b & loader2a));
+  }
+  return tot;
+}
+
+void ldprune_next_subcontig(const uintptr_t* variant_include, const uint32_t* variant_bps, const uint32_t* subcontig_info, const uint32_t* subcontig_thread_assignments, uint32_t x_start, uint32_t x_len, uint32_t y_start, uint32_t y_len, uint32_t founder_ct, uint32_t founder_male_ct, uint32_t prune_window_size, uint32_t thread_idx, uint32_t* subcontig_idx_ptr, uint32_t* subcontig_end_tvidx_ptr, uint32_t* next_window_end_tvidx_ptr, uint32_t* is_x_ptr, uint32_t* is_y_ptr, uint32_t* cur_founder_ct_ptr, uint32_t* cur_founder_ctaw_ptr, uint32_t* cur_founder_ctl_ptr, uintptr_t* entire_variant_buf_word_ct_ptr, uint32_t* variant_uidx_winstart_ptr, uint32_t* variant_uidx_winend_ptr) {
+  uint32_t subcontig_idx = *subcontig_idx_ptr;
+  do {
+    ++subcontig_idx;
+  } while (subcontig_thread_assignments[subcontig_idx] != thread_idx);
+  *subcontig_idx_ptr = subcontig_idx;
+  const uint32_t subcontig_first_tvidx = *subcontig_end_tvidx_ptr;
+  const uint32_t subcontig_len = subcontig_info[3 * subcontig_idx];
+  const uint32_t variant_uidx_winstart = subcontig_info[3 * subcontig_idx + 2];
+  const uint32_t subcontig_end_tvidx = subcontig_first_tvidx + subcontig_len;
+  *subcontig_end_tvidx_ptr = subcontig_end_tvidx;
+  if (variant_bps) {
+    const uint32_t variant_bp_thresh = variant_bps[variant_uidx_winstart] + prune_window_size;
+    uint32_t variant_uidx_winend = variant_uidx_winstart;
+    uint32_t first_window_len = 1;
+    do {
+      ++variant_uidx_winend;
+      next_set_unsafe_ck(variant_include, &variant_uidx_winend);
+    } while ((variant_bps[variant_uidx_winend] <= variant_bp_thresh) && (++first_window_len < subcontig_len));
+    *next_window_end_tvidx_ptr = subcontig_first_tvidx + first_window_len;
+    *variant_uidx_winend_ptr = variant_uidx_winend;
+  } else {
+    *next_window_end_tvidx_ptr = subcontig_first_tvidx + MINV(subcontig_len, prune_window_size);
+  }
+
+  *variant_uidx_winstart_ptr = variant_uidx_winstart;
+  // _len is better than _end here since we can exploit unsignedness
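+  // (if variant_uidx_winstart < x_start, the unsigned subtraction wraps to a
+  // huge value and the comparison is false, as desired)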
+  const uint32_t is_x = ((variant_uidx_winstart - x_start) < x_len);
+  const uint32_t is_y = ((variant_uidx_winstart - y_start) < y_len);
+  if ((is_x != (*is_x_ptr)) || (is_y != (*is_y_ptr))) {
+    *is_x_ptr = is_x;
+    *is_y_ptr = is_y;
+    const uint32_t cur_founder_ct = (is_x || is_y)? founder_male_ct : founder_ct;
+    const uint32_t cur_founder_ctaw = BITCT_TO_ALIGNED_WORDCT(cur_founder_ct);
+    *cur_founder_ct_ptr = cur_founder_ct;
+    *cur_founder_ctaw_ptr = cur_founder_ctaw;
+    *cur_founder_ctl_ptr = BITCT_TO_WORDCT(cur_founder_ct);
+    *entire_variant_buf_word_ct_ptr = 3 * cur_founder_ctaw;
+    if (is_x) {
+      *entire_variant_buf_word_ct_ptr += 3 * BITCT_TO_ALIGNED_WORDCT(founder_ct - founder_male_ct);
+    }
+  }
+}
+
+void genoarr_split_02nm(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict zero_bitarr, uintptr_t* __restrict two_bitarr, uintptr_t* __restrict nm_bitarr) {
+  // ok if trailing bits of genoarr are not zeroed out
+  // trailing bits of {zero,two,nm}_bitarr are zeroed out
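+  // worked example: 2-bit codes {2, 3, 0, 1} for samples 0..3 (sample 0 in
+  // the low bits) yield zero_bitarr = 0b0100, two_bitarr = 0b0001, and
+  // nm_bitarr = 0b1101.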
+  const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+  halfword_t* zero_bitarr_alias = (halfword_t*)zero_bitarr;
+  halfword_t* two_bitarr_alias = (halfword_t*)two_bitarr;
+  halfword_t* nm_bitarr_alias = (halfword_t*)nm_bitarr;
+  for (uint32_t widx = 0; widx < sample_ctl2; ++widx) {
+    const uintptr_t cur_geno_word = genoarr[widx];
+    const uint32_t low_halfword = pack_word_to_halfword(cur_geno_word & kMask5555);
+    const uint32_t high_halfword = pack_word_to_halfword((cur_geno_word >> 1) & kMask5555);
+    zero_bitarr_alias[widx] = ~(low_halfword | high_halfword);
+    two_bitarr_alias[widx] = high_halfword & (~low_halfword);
+    nm_bitarr_alias[widx] = ~(low_halfword & high_halfword);
+  }
+  const uint32_t sample_ct_rem = sample_ct % kBitsPerWord;
+  if (sample_ct_rem) {
+    const uintptr_t trailing_mask = (~k0LU) >> (kBitsPerWord - sample_ct_rem);
+    // note that we don't use the halfword aliases here
+    const uint32_t last_write_word_idx = sample_ct / kBitsPerWord;
+    zero_bitarr[last_write_word_idx] &= trailing_mask;
+    two_bitarr[last_write_word_idx] &= trailing_mask;
+    nm_bitarr[last_write_word_idx] &= trailing_mask;
+  }
+}
+
+void ldprune_next_window(const uintptr_t* __restrict variant_include, const uint32_t* __restrict variant_bps, const uint32_t* __restrict tvidxs, const uintptr_t* __restrict cur_window_removed, uint32_t prune_window_size, uint32_t window_incr, uint32_t window_maxl, uint32_t subcontig_end_tvidx, uint32_t* cur_window_size_ptr, uint32_t* __restrict window_start_tvidx_ptr, uint32_t* __restrict variant_uidx_winstart_ptr, uint32_t* __restrict next_window_end_tvidx_ptr, uint32_t* __restrict variant_uidx_winend_ptr, uintptr_t* __restrict occupied_window_slots, uint32_t* __restrict winpos_to_slot_idx) {
+  uint32_t next_window_end_tvidx = *next_window_end_tvidx_ptr;
+  if (next_window_end_tvidx == subcontig_end_tvidx) {
+    // just completed last window in subcontig
+    *cur_window_size_ptr = 0;
+    *window_start_tvidx_ptr = subcontig_end_tvidx;
+    fill_ulong_zero(window_maxl, occupied_window_slots);
+    return;
+  }
+  uint32_t next_window_start_tvidx = *window_start_tvidx_ptr;
+  if (variant_bps) {
+    // variant_bps[variant_uidx_winend] - prune_window_size below is
+    // guaranteed to be nonnegative
+    uint32_t variant_uidx_winstart = *variant_uidx_winstart_ptr;
+    uint32_t variant_uidx_winend = *variant_uidx_winend_ptr;
+    const uint32_t window_start_min_bp = variant_bps[variant_uidx_winend] - prune_window_size;
+    uint32_t window_start_bp;
+    do {
+      // advance window start by as much as necessary to make end advance by at
+      // least 1
+      ++next_window_start_tvidx;
+      ++variant_uidx_winstart;
+      next_set_unsafe_ck(variant_include, &variant_uidx_winstart);
+      window_start_bp = variant_bps[variant_uidx_winstart];
+    } while (window_start_bp < window_start_min_bp);
+    // now advance window end as appropriate
+    const uint32_t window_end_thresh = window_start_bp + prune_window_size;
+    do {
+      if (++next_window_end_tvidx == subcontig_end_tvidx) {
+	break;
+      }
+      ++variant_uidx_winend;
+      next_set_unsafe_ck(variant_include, &variant_uidx_winend);
+    } while (variant_bps[variant_uidx_winend] <= window_end_thresh);
+    *variant_uidx_winstart_ptr = variant_uidx_winstart;
+    *variant_uidx_winend_ptr = variant_uidx_winend;
+  } else {
+    next_window_start_tvidx += window_incr;
+    next_window_end_tvidx = MINV(next_window_start_tvidx + prune_window_size, subcontig_end_tvidx);
+  }
+  const uint32_t cur_window_size = *cur_window_size_ptr;
+  uint32_t winpos_write = 0;
+  for (uint32_t winpos_read = 0; winpos_read < cur_window_size; ++winpos_read) {
+    const uint32_t slot_idx = winpos_to_slot_idx[winpos_read];
+    if (IS_SET(cur_window_removed, winpos_read) || (tvidxs[slot_idx] < next_window_start_tvidx)) {
+      CLEAR_BIT(slot_idx, occupied_window_slots);
+    } else {
+      winpos_to_slot_idx[winpos_write++] = slot_idx;
+    }
+  }
+  *cur_window_size_ptr = winpos_write;
+  *window_start_tvidx_ptr = next_window_start_tvidx;
+  *next_window_end_tvidx_ptr = next_window_end_tvidx;
+}
+
+void compute_indep_pairwise_r2_components(const uintptr_t* __restrict first_genobufs, const uintptr_t* __restrict second_genobufs, const int32_t* __restrict second_vstats, uint32_t founder_ct, uint32_t* cur_nm_ct_ptr, int32_t* cur_first_sum_ptr, uint32_t* cur_first_ssq_ptr, int32_t* second_sum_ptr, uint32_t* second_ssq_ptr, int32_t* cur_dotprod_ptr) {
+  const uint32_t founder_ctaw = BITCT_TO_ALIGNED_WORDCT(founder_ct);
+  const uint32_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
+  *cur_dotprod_ptr = dotprod_longs(first_genobufs, &(first_genobufs[founder_ctaw]), second_genobufs, &(second_genobufs[founder_ctaw]), founder_ctl);
+  if (*cur_nm_ct_ptr != founder_ct) {
+    uint32_t plusone_ct;
+    uint32_t minusone_ct;
+    popcount_longs_2intersect(&(first_genobufs[2 * founder_ctaw]), second_genobufs, &(second_genobufs[founder_ctaw]), founder_ctl, &plusone_ct, &minusone_ct);
+    *second_sum_ptr = ((int32_t)plusone_ct) - ((int32_t)minusone_ct);
+    *second_ssq_ptr = plusone_ct + minusone_ct;
+  } else {
+    *second_sum_ptr = second_vstats[1];
+    *second_ssq_ptr = second_vstats[2];
+  }
+  const uint32_t second_nm_ct = second_vstats[0];
+  if (second_nm_ct == founder_ct) {
+    // *cur_nm_ct_ptr is assumed to be initialized to first_vstats[0],
+    // *cur_first_sum_ptr to first_vstats[1], and *cur_first_ssq_ptr to
+    // first_vstats[2]
+    return;
+  }
+  uint32_t plusone_ct;
+  uint32_t minusone_ct;
+  popcount_longs_2intersect(&(second_genobufs[2 * founder_ctaw]), first_genobufs, &(first_genobufs[founder_ctaw]), founder_ctl, &plusone_ct, &minusone_ct);
+  *cur_first_sum_ptr = ((int32_t)plusone_ct) - ((int32_t)minusone_ct);
+  *cur_first_ssq_ptr = plusone_ct + minusone_ct;
+  if (*cur_nm_ct_ptr == founder_ct) {
+    *cur_nm_ct_ptr = second_nm_ct;
+    return;
+  }
+  *cur_nm_ct_ptr = popcount_longs_intersect(&(first_genobufs[2 * founder_ctaw]), &(second_genobufs[2 * founder_ctaw]), founder_ctl);
+}
+
+// multithread globals
+static const uint32_t* g_subcontig_info = nullptr;
+static const uint32_t* g_subcontig_thread_assignments = nullptr;
+static const uintptr_t* g_variant_include = nullptr;
+static const uintptr_t* g_variant_allele_idxs = nullptr;
+static const alt_allele_ct_t* g_maj_alleles = nullptr;
+static const double* g_all_allele_freqs = nullptr;
+static const uint32_t* g_variant_bps = nullptr;
+static uint32_t* g_tvidx_end = nullptr;
+static uint32_t g_x_start = 0;
+static uint32_t g_x_len = 0;
+static uint32_t g_y_start = 0;
+static uint32_t g_y_len = 0;
+static uint32_t g_founder_ct = 0;
+static uint32_t g_founder_male_ct = 0;
+static uint32_t g_prune_window_size = 0;
+static uint32_t g_window_maxl = 0;
+static double g_prune_ld_thresh = 0.0;
+static uint32_t g_window_incr = 0;
+static uint32_t g_cur_batch_size = 0;
+static uintptr_t** g_genobufs = nullptr;
+static uintptr_t** g_occupied_window_slots = nullptr;
+static uintptr_t** g_cur_window_removed = nullptr;
+static double** g_cur_maj_freqs = nullptr;
+static uintptr_t** g_removed_variants_write = nullptr;
+static int32_t** g_vstats = nullptr;
+static int32_t** g_nonmale_vstats = nullptr;
+static uint32_t** g_winpos_to_slot_idx = nullptr;
+static uint32_t** g_tvidxs = nullptr;
+static uint32_t** g_first_unchecked_tvidx = nullptr;
+static uintptr_t** g_raw_tgenovecs[2] = {nullptr, nullptr};
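+// g_raw_tgenovecs is double-buffered: compute threads consume the buffers of
+// one parity while the main thread loads the next variant batch into the
+// other; see the parity flips in indep_pairwise_thread and indep_pairwise.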
+
+THREAD_FUNC_DECL indep_pairwise_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uint32_t* subcontig_info = g_subcontig_info;
+  const uint32_t* subcontig_thread_assignments = g_subcontig_thread_assignments;
+  const uintptr_t* variant_include = g_variant_include;
+  const uint32_t x_start = g_x_start;
+  const uint32_t x_len = g_x_len;
+  const uint32_t y_start = g_y_start;
+  const uint32_t y_len = g_y_len;
+  const uintptr_t* variant_allele_idxs = g_variant_allele_idxs;
+  const alt_allele_ct_t* maj_alleles = g_maj_alleles;
+  const double* all_allele_freqs = g_all_allele_freqs;
+  const uint32_t* variant_bps = g_variant_bps;
+  const uint32_t founder_ct = g_founder_ct;
+  const uint32_t founder_male_ct = g_founder_male_ct;
+  const uint32_t founder_male_ctl2 = QUATERCT_TO_WORDCT(founder_male_ct);
+  const uint32_t nonmale_ct = founder_ct - founder_male_ct;
+  const uint32_t nonmale_ctaw = BITCT_TO_ALIGNED_WORDCT(nonmale_ct);
+  const uint32_t nonmale_ctl = BITCT_TO_WORDCT(nonmale_ct);
+  const uintptr_t raw_tgenovec_single_variant_word_ct = round_up_pow2(QUATERCT_TO_WORDCT(nonmale_ct) + founder_male_ctl2, kWordsPerVec);
+  const uint32_t prune_window_size = g_prune_window_size;
+  const uint32_t window_maxl = g_window_maxl;
+  const double prune_ld_thresh = g_prune_ld_thresh;
+  const uint32_t window_incr = g_window_incr;
+  const uint32_t tvidx_end = g_tvidx_end[tidx];
+  uintptr_t* genobufs = g_genobufs[tidx];
+  uintptr_t* occupied_window_slots = g_occupied_window_slots[tidx];
+  uintptr_t* cur_window_removed = g_cur_window_removed[tidx];
+  uintptr_t* removed_variants_write = g_removed_variants_write[tidx];
+  double* cur_maj_freqs = g_cur_maj_freqs[tidx];
+  int32_t* vstats = g_vstats[tidx];
+  int32_t* nonmale_vstats = g_nonmale_vstats[tidx];
+  uint32_t* winpos_to_slot_idx = g_winpos_to_slot_idx[tidx];
+  uint32_t* tvidxs = g_tvidxs[tidx];
+  uint32_t* first_unchecked_tvidx = g_first_unchecked_tvidx[tidx];
+  
+  uint32_t subcontig_end_tvidx = 0;
+  uint32_t subcontig_idx = 0xffffffffU; // deliberate overflow
+  uint32_t window_start_tvidx = 0;
+  uint32_t next_window_end_tvidx = 0;
+  uint32_t write_slot_idx = 0;
+  uint32_t is_x = 0;
+  uint32_t is_y = 0;
+  uint32_t cur_window_size = 0;
+  uint32_t tvidx_start = 0;
+  uint32_t cur_founder_ct = founder_ct;
+  uint32_t cur_founder_ctaw = BITCT_TO_ALIGNED_WORDCT(founder_ct);
+  uint32_t cur_founder_ctl = BITCT_TO_WORDCT(founder_ct);
+  uint32_t variant_uidx = 0;
+  uint32_t variant_uidx_winstart = 0;
+  uint32_t variant_uidx_winend = 0;
+  uintptr_t entire_variant_buf_word_ct = 3 * cur_founder_ctaw;
+  uint32_t cur_allele_ct = 2;
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    const uint32_t cur_batch_size = g_cur_batch_size;
+    const uint32_t tvidx_stop = MINV(tvidx_start + cur_batch_size, tvidx_end);
+    // main loop has to be variant-, not window-, based due to how datasets too
+    // large to fit in memory are handled: we may have to halt in the middle of
+    // unpacking data for a window, waiting until the current I/O pass is
+    // complete before proceeding
+    const uintptr_t* raw_tgenovecs = g_raw_tgenovecs[parity][tidx];
+    for (uint32_t cur_tvidx = tvidx_start; cur_tvidx < tvidx_stop; ++variant_uidx) {
+      if (cur_tvidx == subcontig_end_tvidx) {
+	ldprune_next_subcontig(variant_include, variant_bps, subcontig_info, subcontig_thread_assignments, x_start, x_len, y_start, y_len, founder_ct, founder_male_ct, prune_window_size, tidx, &subcontig_idx, &subcontig_end_tvidx, &next_window_end_tvidx, &is_x, &is_y, &cur_founder_ct, &cur_founder_ctaw, &cur_founder_ctl, &entire_variant_buf_word_ct, &variant_uidx_winstart, &variant_uidx_winend);
+	variant_uidx = variant_uidx_winstart;
+      }
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      write_slot_idx = next_unset_unsafe(occupied_window_slots, write_slot_idx);
+      uintptr_t tvidx_offset = cur_tvidx - tvidx_start;
+      const uintptr_t* cur_raw_tgenovecs = &(raw_tgenovecs[tvidx_offset * raw_tgenovec_single_variant_word_ct]);
+      uintptr_t* cur_genobuf = &(genobufs[write_slot_idx * entire_variant_buf_word_ct]);
+      uintptr_t* cur_genobuf_minus = &(cur_genobuf[cur_founder_ctaw]);
+      uintptr_t* cur_genobuf_nm = &(cur_genobuf_minus[cur_founder_ctaw]);
+      genoarr_split_02nm(cur_raw_tgenovecs, cur_founder_ct, cur_genobuf, cur_genobuf_minus, cur_genobuf_nm);
+      uint32_t nm_ct = popcount_longs(cur_genobuf_nm, cur_founder_ctl);
+      uint32_t plusone_ct = popcount_longs(cur_genobuf, cur_founder_ctl);
+      uint32_t minusone_ct = popcount_longs(cur_genobuf_minus, cur_founder_ctl);
+      vstats[3 * write_slot_idx] = nm_ct;
+      vstats[3 * write_slot_idx + 1] = ((int32_t)plusone_ct) - ((int32_t)minusone_ct);
+      vstats[3 * write_slot_idx + 2] = plusone_ct + minusone_ct;
+      if (is_x) {
+	cur_genobuf = &(cur_genobuf[3 * cur_founder_ctaw]);
+	cur_genobuf_minus = &(cur_genobuf[nonmale_ctaw]);
+	cur_genobuf_nm = &(cur_genobuf_minus[nonmale_ctaw]);
+	genoarr_split_02nm(&(cur_raw_tgenovecs[founder_male_ctl2]), nonmale_ct, cur_genobuf, cur_genobuf_minus, cur_genobuf_nm);
+	const uint32_t x_nonmale_nm_ct = popcount_longs(cur_genobuf_nm, nonmale_ctl);
+	const uint32_t x_nonmale_plusone_ct = popcount_longs(cur_genobuf, nonmale_ctl);
+	const uint32_t x_nonmale_minusone_ct = popcount_longs(cur_genobuf_minus, nonmale_ctl);
+	nonmale_vstats[3 * write_slot_idx] = x_nonmale_nm_ct;
+	nonmale_vstats[3 * write_slot_idx + 1] = ((int32_t)x_nonmale_plusone_ct) - ((int32_t)x_nonmale_minusone_ct);
+	nonmale_vstats[3 * write_slot_idx + 2] = x_nonmale_plusone_ct + x_nonmale_minusone_ct;
+	nm_ct += 2 * x_nonmale_nm_ct;
+	plusone_ct += 2 * x_nonmale_plusone_ct;
+	minusone_ct += 2 * x_nonmale_minusone_ct;
+      }
+      if (((!plusone_ct) && (!minusone_ct)) || (plusone_ct == nm_ct) || (minusone_ct == nm_ct)) {
+	SET_BIT(cur_window_size, cur_window_removed);
+	SET_BIT(cur_tvidx, removed_variants_write);
+      } else {
+	tvidxs[write_slot_idx] = cur_tvidx;
+	uintptr_t allele_idx_base;
+	if (!variant_allele_idxs) {
+	  allele_idx_base = variant_uidx;
+	} else {
+	  allele_idx_base = variant_allele_idxs[variant_uidx];
+	  cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - allele_idx_base;
+	  allele_idx_base -= variant_uidx;
+	}
+	cur_maj_freqs[write_slot_idx] = get_allele_freq(&(all_allele_freqs[allele_idx_base]), maj_alleles[variant_uidx], cur_allele_ct);
+	first_unchecked_tvidx[write_slot_idx] = cur_tvidx + 1;
+      }
+      SET_BIT(write_slot_idx, occupied_window_slots);
+      winpos_to_slot_idx[cur_window_size++] = write_slot_idx;
+      // are we at the end of a window?
+      if (++cur_tvidx == next_window_end_tvidx) {
+	// possible for cur_window_size == 1, if all variants at the end of the
+	// previous window were pruned
+	uint32_t cur_removed_ct = popcount_longs(cur_window_removed, BITCT_TO_WORDCT(cur_window_size));
+	uint32_t prev_removed_ct;
+	do {
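+	  // greedy passes over the current window repeat until a pass removes
+	  // no additional variants, i.e. cur_removed_ct stops increasing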
+	  prev_removed_ct = cur_removed_ct;
+	  uint32_t first_winpos = 0;
+	  // const uint32_t debug_print = (!IS_SET(cur_window_removed, 0)) && (tvidxs[winpos_to_slot_idx[0]] == 0);
+	  while (1) {
+	    next_unset_unsafe_ck(cur_window_removed, &first_winpos);
+	    // can assume empty trailing bit for cur_window_removed
+	    if (first_winpos == cur_window_size) {
+	      break;
+	    }
+	    uint32_t first_slot_idx = winpos_to_slot_idx[first_winpos];
+	    const uint32_t cur_first_unchecked_tvidx = first_unchecked_tvidx[first_slot_idx];
+	    uint32_t second_winpos = first_winpos;
+	    while (1) {
+	      ++second_winpos;
+	      next_unset_unsafe_ck(cur_window_removed, &second_winpos);
+	      if (second_winpos == cur_window_size) {
+		break;
+	      }
+	      uint32_t second_slot_idx = winpos_to_slot_idx[second_winpos];
+	      if (tvidxs[second_slot_idx] >= cur_first_unchecked_tvidx) {
+		uintptr_t* first_genobufs = &(genobufs[first_slot_idx * entire_variant_buf_word_ct]);
+		const uint32_t first_nm_ct = vstats[3 * first_slot_idx];
+		const int32_t first_sum = vstats[3 * first_slot_idx + 1];
+		const uint32_t first_ssq = vstats[3 * first_slot_idx + 2];
+		while (1) {
+		  uintptr_t* second_genobufs = &(genobufs[second_slot_idx * entire_variant_buf_word_ct]);
+		  uint32_t cur_nm_ct = first_nm_ct;
+		  int32_t cur_first_sum = first_sum;
+		  uint32_t cur_first_ssq = first_ssq;
+		  int32_t second_sum;
+		  uint32_t second_ssq;
+		  int32_t cur_dotprod;
+		  compute_indep_pairwise_r2_components(first_genobufs, second_genobufs, &(vstats[3 * second_slot_idx]), cur_founder_ct, &cur_nm_ct, &cur_first_sum, &cur_first_ssq, &second_sum, &second_ssq, &cur_dotprod);
+		  if (is_x) {
+		    uint32_t nonmale_nm_ct = nonmale_vstats[3 * first_slot_idx];
+		    int32_t nonmale_first_sum = nonmale_vstats[3 * first_slot_idx + 1];
+		    uint32_t nonmale_first_ssq = nonmale_vstats[3 * first_slot_idx + 2];
+		    int32_t nonmale_dotprod;
+		    int32_t nonmale_second_sum;
+		    uint32_t nonmale_second_ssq;
+		    compute_indep_pairwise_r2_components(&(first_genobufs[3 * cur_founder_ctaw]), &(second_genobufs[3 * cur_founder_ctaw]), &(nonmale_vstats[3 * second_slot_idx]), nonmale_ct, &nonmale_nm_ct, &nonmale_first_sum, &nonmale_first_ssq, &nonmale_second_sum, &nonmale_second_ssq, &nonmale_dotprod);
+		    // only --ld-xchr 3 for now
+		    // assumes founder_ct < 2^30
+		    cur_nm_ct += 2 * nonmale_nm_ct;
+		    cur_first_sum += 2 * nonmale_first_sum;
+		    cur_first_ssq += 2 * nonmale_first_ssq;
+		    second_sum += 2 * nonmale_second_sum;
+		    second_ssq += 2 * nonmale_second_ssq;
+		    cur_dotprod += 2 * nonmale_dotprod;
+		  }
+		  // these three values are actually cur_nm_ct times their
+		  // true values, but that cancels out
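+		  // explicitly, with n = cur_nm_ct: cov12 = n*sum(xy) -
+		  // sum(x)*sum(y) and variance_k = n*sum(x_k^2) - sum(x_k)^2;
+		  // these are n^2 times the sample covariance and variances,
+		  // and the n^2 factors cancel in the r^2 comparison below.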
+		  const double cov12 = (double)(cur_dotprod * ((int64_t)cur_nm_ct) - ((int64_t)cur_first_sum) * second_sum);
+		  const double variance1 = (double)(cur_first_ssq * ((int64_t)cur_nm_ct) - ((int64_t)cur_first_sum) * cur_first_sum);
+		  const double variance2 = (double)(second_ssq * ((int64_t)cur_nm_ct) - ((int64_t)second_sum) * second_sum);
+		  // > instead of >=, so we don't prune from a pair of
+		  // variants with zero common observations
+		  if (cov12 * cov12 > prune_ld_thresh * variance1 * variance2) {
+		    // strictly speaking, the (1 + kSmallEpsilon) tolerance
+		    // does not appear to be needed yet, but it will be once
+		    // --read-freq is implemented.
+		    // this has a surprisingly large ~3% speed penalty on my
+		    // main test scenario, but that's an acceptable price to
+		    // pay for reproducibility.
+		    if (cur_maj_freqs[first_slot_idx] > cur_maj_freqs[second_slot_idx] * (1 + kSmallEpsilon)) {
+		      /*
+		      if (debug_print) {
+			printf("removing %u, keeping %u, freqs %g/%g, r2 = %g\n", tvidxs[first_slot_idx], tvidxs[second_slot_idx], cur_maj_freqs[first_slot_idx], cur_maj_freqs[second_slot_idx], cov12 * cov12 / (variance1 * variance2));
+		      }
+		      */
+		      SET_BIT(first_winpos, cur_window_removed);
+		      SET_BIT(tvidxs[first_slot_idx], removed_variants_write);
+		    } else {
+		      /*
+		      if (debug_print) {
+		        printf("removing %u (second), keeping %u, freqs %g/%g, r2 = %g\n", tvidxs[second_slot_idx], tvidxs[first_slot_idx], cur_maj_freqs[second_slot_idx], cur_maj_freqs[first_slot_idx], cov12 * cov12 / (variance1 * variance2));
+		      }
+		      */
+		      SET_BIT(second_winpos, cur_window_removed);
+		      SET_BIT(tvidxs[second_slot_idx], removed_variants_write);
+		      const uint32_t next_start_winpos = next_unset_unsafe(cur_window_removed, second_winpos);
+		      if (next_start_winpos < cur_window_size) {
+			first_unchecked_tvidx[first_slot_idx] = tvidxs[winpos_to_slot_idx[next_start_winpos]];
+		      } else {
+			first_unchecked_tvidx[first_slot_idx] = cur_tvidx;
+		      }
+		    }
+		    break;
+		  }
+		  ++second_winpos;
+		  next_unset_unsafe_ck(cur_window_removed, &second_winpos);
+		  if (second_winpos == cur_window_size) {
+		    first_unchecked_tvidx[first_slot_idx] = cur_tvidx;
+		    break;
+		  }
+		  second_slot_idx = winpos_to_slot_idx[second_winpos];
+		} // while (1)
+		break;
+	      }
+	    }
+	    ++first_winpos;
+	  }
+	  cur_removed_ct = popcount_longs(cur_window_removed, BITCT_TO_WORDCT(cur_window_size));
+	} while (cur_removed_ct > prev_removed_ct);
+	const uint32_t prev_window_size = cur_window_size;
+	ldprune_next_window(variant_include, variant_bps, tvidxs, cur_window_removed, prune_window_size, window_incr, window_maxl, subcontig_end_tvidx, &cur_window_size, &window_start_tvidx, &variant_uidx_winstart, &next_window_end_tvidx, &variant_uidx_winend, occupied_window_slots, winpos_to_slot_idx);
+	// clear bits here, since we set cur_window_removed bits during the
+	// loading process in the monomorphic case
+	fill_ulong_zero(BITCT_TO_WORDCT(prev_window_size), cur_window_removed);
+	write_slot_idx = 0;
+      }
+    }
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+    tvidx_start = tvidx_stop;
+  }
+}
+
+pglerr_t indep_pairwise(const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, const uintptr_t* variant_allele_idxs, const alt_allele_ct_t* maj_alleles, const double* allele_freqs, const uintptr_t* founder_info, const uint32_t* founder_info_cumulative_popcounts, const uintptr_t* founder_nonmale, const uintptr_t* founder_male, const ld_info_t* ldip, const uint32_t* subcontig_info, const uint32_t* subcontig_thread_assignments, uint32_t raw_sample_ct, uint32_t founder_ct, uint32_t founder_male_ct, uint32_t subcontig_ct, uint32_t window_max, uint32_t calc_thread_ct, uint32_t max_load, pgen_reader_t* simple_pgrp, uintptr_t* removed_variants_collapsed) {
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t founder_nonmale_ct = founder_ct - founder_male_ct;
+    if (founder_nonmale_ct * 2 + founder_male_ct > 0x7fffffffU) {
+      // may as well document this
+      logerrprint("Error: --indep-pairwise does not support >= 2^30 founders.\n");
+      goto indep_pairwise_ret_NOT_YET_SUPPORTED;
+    }
+    const uint32_t founder_nonmale_ctaw = BITCT_TO_ALIGNED_WORDCT(founder_nonmale_ct);
+    const uint32_t founder_male_ctaw = BITCT_TO_ALIGNED_WORDCT(founder_male_ct);
+    // Per-thread allocations:
+    // - tvidx_batch_size * raw_tgenovec_single_variant_word_ct *
+    //     sizeof(intptr_t) for raw genotype data (g_raw_tgenovecs)
+    // - tvidx_batch_size * sizeof(double) for g_maj_freqs
+    // - if pos-based window, tvidx_batch_size * sizeof(int32_t)
+    // - All of the above again, to allow loader thread to operate
+    //     independently
+    // - window_max * 3 * (founder_nonmale_ctaw + founder_male_ctaw) *
+    //     kBytesPerVec for split genotype data
+    // - max_loadl * sizeof(intptr_t) for removed-variant bitarray
+    // - window_max * 3 * sizeof(int32_t) for main missing_ct, sum(x_i),
+    //     sum(x_i^2) array
+    // - window_max * 3 * sizeof(int32_t) for chrX founder_male missing_ct,
+    //     sum(x_i), sum(x_i^2) array
+    // - window_max * sizeof(int32_t) for indexes into genotype data bitarrays
+    //     (for now, anyway)
+    // - window_max * sizeof(int32_t) for live_indices (variant_idxs?)
+    // - window_max * sizeof(int32_t) for start_arr (first uncompared
+    //     variant_idx)
+    uintptr_t* tmp_genovec;
+    uint32_t* thread_last_subcontig;
+    uint32_t* thread_subcontig_start_tvidx;
+    uint32_t* thread_last_tvidx;
+    uint32_t* thread_last_uidx;
+    pthread_t* threads = nullptr;
+    if (bigstack_alloc_ul(QUATERCT_TO_WORDCT(raw_sample_ct), &tmp_genovec) ||
+	bigstack_calloc_ui(calc_thread_ct, &g_tvidx_end) ||
+	bigstack_calloc_ui(calc_thread_ct, &thread_last_subcontig) ||
+	bigstack_calloc_ui(calc_thread_ct, &thread_subcontig_start_tvidx) ||
+	bigstack_calloc_ui(calc_thread_ct, &thread_last_tvidx) ||
+	bigstack_calloc_ui(calc_thread_ct, &thread_last_uidx) ||
+	bigstack_alloc_ulp(calc_thread_ct, &g_genobufs) ||
+	bigstack_alloc_ulp(calc_thread_ct, &g_occupied_window_slots) ||
+        bigstack_alloc_ulp(calc_thread_ct, &g_cur_window_removed) ||
+	bigstack_alloc_dp(calc_thread_ct, &g_cur_maj_freqs) ||
+	bigstack_alloc_ulp(calc_thread_ct, &g_removed_variants_write) ||
+	bigstack_alloc_ip(calc_thread_ct, &g_vstats) ||
+	bigstack_alloc_ip(calc_thread_ct, &g_nonmale_vstats) ||
+	bigstack_alloc_uip(calc_thread_ct, &g_winpos_to_slot_idx) ||
+	bigstack_alloc_uip(calc_thread_ct, &g_tvidxs) ||
+	bigstack_alloc_uip(calc_thread_ct, &g_first_unchecked_tvidx) ||
+	bigstack_alloc_ulp(calc_thread_ct, &(g_raw_tgenovecs[0])) ||
+        bigstack_alloc_ulp(calc_thread_ct, &(g_raw_tgenovecs[1])) ||
+	bigstack_alloc_thread(calc_thread_ct, &threads)) {
+      goto indep_pairwise_ret_NOMEM;
+    }
+    for (uint32_t subcontig_idx = 0; subcontig_idx < subcontig_ct; ++subcontig_idx) {
+      const uint32_t cur_thread_idx = subcontig_thread_assignments[subcontig_idx];
+      g_tvidx_end[cur_thread_idx] += subcontig_info[3 * subcontig_idx];
+    }
+    const uintptr_t entire_variant_buf_word_ct = 3 * (founder_nonmale_ctaw + founder_male_ctaw);
+    const uint32_t window_maxl = BITCT_TO_WORDCT(window_max);
+    const uint32_t max_loadl = BITCT_TO_WORDCT(max_load);
+    const uintptr_t genobuf_alloc = round_up_pow2(window_max * entire_variant_buf_word_ct * sizeof(intptr_t), kCacheline);
+    const uintptr_t occupied_window_slots_alloc = round_up_pow2(window_maxl * sizeof(intptr_t), kCacheline);
+    const uintptr_t cur_window_removed_alloc = round_up_pow2((1 + window_max / kBitsPerWord) * sizeof(intptr_t), kCacheline);
+    const uintptr_t cur_maj_freqs_alloc = round_up_pow2(window_max * sizeof(double), kCacheline);
+    const uintptr_t removed_variants_write_alloc = round_up_pow2(max_loadl * sizeof(intptr_t), kCacheline);
+    const uintptr_t vstats_alloc = round_up_pow2(3 * window_max * sizeof(int32_t), kCacheline); // two of these
+    const uintptr_t window_int32_alloc = round_up_pow2(window_max * sizeof(int32_t), kCacheline); // three of these
+    const uintptr_t thread_alloc_base = genobuf_alloc + occupied_window_slots_alloc + cur_window_removed_alloc + cur_maj_freqs_alloc + removed_variants_write_alloc + 2 * vstats_alloc + 3 * window_int32_alloc;
+
+    const uint32_t founder_ctl2 = QUATERCT_TO_WORDCT(founder_ct);
+    const uint32_t founder_male_ctl2 = QUATERCT_TO_WORDCT(founder_male_ct);
+    const uint32_t founder_nonmale_ctl2 = QUATERCT_TO_WORDCT(founder_nonmale_ct);
+    const uintptr_t raw_tgenovec_single_variant_word_ct = round_up_pow2(founder_nonmale_ctl2 + founder_male_ctl2, kWordsPerVec);
+    // round down
+    uintptr_t bigstack_avail_per_thread = round_down_pow2(bigstack_left() / calc_thread_ct, kCacheline);
+    // may as well require capacity for >= 256 variants per thread per pass
+    if (bigstack_avail_per_thread <= thread_alloc_base + 2 * 256 * raw_tgenovec_single_variant_word_ct * sizeof(intptr_t)) {
+      goto indep_pairwise_ret_NOMEM;
+    }
+    bigstack_avail_per_thread -= thread_alloc_base;
+    uint32_t tvidx_batch_size = DIV_UP(max_load, 2);
+    // tried a bunch of powers of two; this seems to be a good value
+    if (tvidx_batch_size > 65536) {
+      tvidx_batch_size = 65536;
+    }
+    // tvidx_batch_size = max_load; // temporary debugging
+    if (2 * tvidx_batch_size * raw_tgenovec_single_variant_word_ct * sizeof(intptr_t) > bigstack_avail_per_thread) {
+      tvidx_batch_size = bigstack_avail_per_thread / round_up_pow2(raw_tgenovec_single_variant_word_ct * 2 * sizeof(intptr_t), kCacheline);
+    }
+    for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+      g_genobufs[tidx] = (uintptr_t*)bigstack_alloc_raw(genobuf_alloc);
+      g_occupied_window_slots[tidx] = (uintptr_t*)bigstack_alloc_raw(occupied_window_slots_alloc);
+      fill_ulong_zero(window_maxl, g_occupied_window_slots[tidx]);
+      g_cur_window_removed[tidx] = (uintptr_t*)bigstack_alloc_raw(cur_window_removed_alloc);
+      fill_ulong_zero(1 + window_max / kBitsPerWord, g_cur_window_removed[tidx]);
+      g_cur_maj_freqs[tidx] = (double*)bigstack_alloc_raw(cur_maj_freqs_alloc);
+      g_removed_variants_write[tidx] = (uintptr_t*)bigstack_alloc_raw(removed_variants_write_alloc);
+      fill_ulong_zero(max_loadl, g_removed_variants_write[tidx]);
+      g_vstats[tidx] = (int32_t*)bigstack_alloc_raw(vstats_alloc);
+      g_nonmale_vstats[tidx] = (int32_t*)bigstack_alloc_raw(vstats_alloc);
+      g_winpos_to_slot_idx[tidx] = (uint32_t*)bigstack_alloc_raw(window_int32_alloc);
+      g_tvidxs[tidx] = (uint32_t*)bigstack_alloc_raw(window_int32_alloc);
+      g_first_unchecked_tvidx[tidx] = (uint32_t*)bigstack_alloc_raw(window_int32_alloc);
+      g_raw_tgenovecs[0][tidx] = (uintptr_t*)bigstack_alloc_raw_rd(tvidx_batch_size * raw_tgenovec_single_variant_word_ct * sizeof(intptr_t));
+      g_raw_tgenovecs[1][tidx] = (uintptr_t*)bigstack_alloc_raw_rd(tvidx_batch_size * raw_tgenovec_single_variant_word_ct * sizeof(intptr_t));
+    }
+    g_subcontig_info = subcontig_info;
+    g_subcontig_thread_assignments = subcontig_thread_assignments;
+    g_variant_include = variant_include;
+    g_variant_allele_idxs = variant_allele_idxs;
+    g_maj_alleles = maj_alleles;
+    g_all_allele_freqs = allele_freqs;
+    g_variant_bps = variant_bps;
+    g_founder_ct = founder_ct;
+    g_founder_male_ct = founder_male_ct;
+    g_prune_ld_thresh = ldip->prune_last_param * (1 + kSmallEpsilon);
+    g_prune_window_size = ldip->prune_window_size;
+    g_window_maxl = window_maxl;
+    g_window_incr = ldip->prune_window_incr;
+    g_cur_batch_size = tvidx_batch_size;
+
+    const uint32_t all_haploid = IS_SET(cip->haploid_mask, 0);
+    uint32_t x_start = 0;
+    uint32_t x_end = 0;
+    uint32_t y_start = 0;
+    uint32_t y_end = 0;
+    get_xymt_start_and_end(cip, kChrOffsetX, &x_start, &x_end);
+    get_xymt_start_and_end(cip, kChrOffsetY, &y_start, &y_end);
+    const uint32_t x_len = x_end - x_start;
+    const uint32_t y_len = y_end - y_start;
+    g_x_start = x_start;
+    g_x_len = x_len;
+    g_y_start = y_start;
+    g_y_len = y_len;
+    // Main workflow:
+    // 1. Set n=0, load batch 0
+    // 2. Spawn threads processing batch n
+    // 3. Increment n by 1
+    // 4. Load batch n unless eof
+    // 5. Join threads
+    // 6. Goto step 2 unless eof
+    //
+    // 7. Assemble final results with copy_bitarr_range()
+    uint32_t cur_tvidx_start = 0;
+    uint32_t is_last_batch = 0;
+    uint32_t parity = 0;
+    uint32_t pct = 0;
+    uint32_t next_print_tvidx_start = max_load / 100;
+    LOGPRINTF("--indep-pairwise (%u compute thread%s): ", calc_thread_ct, (calc_thread_ct == 1)? "" : "s");
+    fputs("0%", stdout);
+    fflush(stdout);
+    while (1) {
+      if (!is_last_batch) {
+	pgr_clear_ld_cache(simple_pgrp);
+	uintptr_t** cur_raw_tgenovecs = g_raw_tgenovecs[parity];
+	const uint32_t cur_tvidx_end = cur_tvidx_start + tvidx_batch_size;
+	uint32_t is_x_or_y = 0;
+	for (uint32_t subcontig_idx = 0; subcontig_idx < subcontig_ct; ++subcontig_idx) {
+	  const uint32_t cur_thread_idx = subcontig_thread_assignments[subcontig_idx];
+	  if (thread_last_subcontig[cur_thread_idx] > subcontig_idx) {
+	    continue;
+	  }
+	  uint32_t cur_tvidx = thread_last_tvidx[cur_thread_idx];
+	  if (cur_tvidx == cur_tvidx_end) {
+	    continue;
+	  }
+	  uint32_t subcontig_start_tvidx = thread_subcontig_start_tvidx[cur_thread_idx];
+	  uint32_t tvidx_end = subcontig_start_tvidx + subcontig_info[3 * subcontig_idx];
+	  if (tvidx_end > cur_tvidx_end) {
+	    tvidx_end = cur_tvidx_end;
+	    thread_last_subcontig[cur_thread_idx] = subcontig_idx;
+	  } else {
+	    thread_subcontig_start_tvidx[cur_thread_idx] = tvidx_end;
+	    thread_last_subcontig[cur_thread_idx] = subcontig_idx + 1;
+	  }
+	  uintptr_t tvidx_offset_end = tvidx_end - cur_tvidx_start;
+	  uint32_t variant_uidx;
+	  if (subcontig_start_tvidx == cur_tvidx) {
+	    variant_uidx = subcontig_info[3 * subcontig_idx + 2];
+	  } else {
+	    variant_uidx = thread_last_uidx[cur_thread_idx];
+	  }
+	  const uint32_t is_haploid = IS_SET(cip->haploid_mask, get_variant_chr(cip, variant_uidx));
+	  uint32_t is_x = ((variant_uidx - x_start) < x_len);
+	  const uint32_t new_is_x_or_y = is_x || ((variant_uidx - y_start) < y_len);
+
+	  // due to nonempty subset requirement (removed?)
+	  is_x = is_x && founder_nonmale_ct;
+	  if (is_x_or_y != new_is_x_or_y) {
+	    is_x_or_y = new_is_x_or_y;
+	    pgr_clear_ld_cache(simple_pgrp);
+	  }
+	  uintptr_t* cur_thread_raw_tgenovec = cur_raw_tgenovecs[cur_thread_idx];
+	  for (uintptr_t tvidx_offset = cur_tvidx - cur_tvidx_start; tvidx_offset < tvidx_offset_end; ++tvidx_offset, ++variant_uidx) {
+	    next_set_unsafe_ck(variant_include, &variant_uidx);
+	    uintptr_t* cur_raw_tgenovec = &(cur_thread_raw_tgenovec[tvidx_offset * raw_tgenovec_single_variant_word_ct]);
+	    if (!is_x_or_y) {
+	      reterr = pgr_read_allele_countvec_subset_unsafe(founder_info, founder_info_cumulative_popcounts, founder_ct, variant_uidx, maj_alleles[variant_uidx], simple_pgrp, cur_raw_tgenovec);
+	      if (is_haploid) {
+		set_het_missing(founder_ctl2, cur_raw_tgenovec);
+	      }
+	    } else {
+	      reterr = pgr_read_allele_countvec_subset_unsafe(nullptr, nullptr, raw_sample_ct, variant_uidx, maj_alleles[variant_uidx], simple_pgrp, tmp_genovec);
+	      if (founder_male_ct) {
+		copy_quaterarr_nonempty_subset(tmp_genovec, founder_male, raw_sample_ct, founder_male_ct, cur_raw_tgenovec);
+		set_het_missing(founder_male_ctl2, cur_raw_tgenovec);
+	      }
+	      if (is_x) {
+	        copy_quaterarr_nonempty_subset(tmp_genovec, founder_nonmale, raw_sample_ct, founder_nonmale_ct, &(cur_raw_tgenovec[founder_male_ctl2]));
+		if (all_haploid) {
+		  // don't just treat chrX identically to autosomes, since for
+		  // doubled haploids we still want to give females 2x the
+		  // weight of males.  I think.
+		  set_het_missing(founder_nonmale_ctl2, &(cur_raw_tgenovec[founder_male_ctl2]));
+		}
+	      }
+	    }
+	    if (reterr) {
+	      if (cur_tvidx_start) {
+		join_threads2z(calc_thread_ct, 0, threads);
+		g_cur_batch_size = 0;
+		error_cleanup_threads2z(indep_pairwise_thread, calc_thread_ct, threads);
+	      }
+	      if (reterr != kPglRetReadFail) {
+		logprint("\n");
+		logerrprint("Error: Malformed .pgen file.\n");
+	      }
+	      goto indep_pairwise_ret_1;
+	    }
+	  }
+	  thread_last_tvidx[cur_thread_idx] = tvidx_end;
+	  thread_last_uidx[cur_thread_idx] = variant_uidx;
+	}
+      }
+      if (cur_tvidx_start) {
+	join_threads2z(calc_thread_ct, is_last_batch, threads);
+	if (is_last_batch) {
+	  break;
+	}
+	if (cur_tvidx_start >= next_print_tvidx_start) {
+	  if (pct > 10) {
+	    putc_unlocked('\b', stdout);
+	  }
+	  pct = (cur_tvidx_start * 100LLU) / max_load;
+	  printf("\b\b%u%%", pct++);
+	  fflush(stdout);
+	  next_print_tvidx_start = (pct * ((uint64_t)max_load)) / 100;
+	}
+      }
+      is_last_batch = (cur_tvidx_start + tvidx_batch_size >= max_load);
+      if (spawn_threads2z(indep_pairwise_thread, calc_thread_ct, is_last_batch, threads)) {
+	goto indep_pairwise_ret_THREAD_CREATE_FAIL;
+      }
+      parity = 1 - parity;
+      cur_tvidx_start += tvidx_batch_size;
+    }
+    fill_uint_zero(calc_thread_ct, thread_subcontig_start_tvidx);
+    for (uint32_t subcontig_idx = 0; subcontig_idx < subcontig_ct; ++subcontig_idx) {
+      const uint32_t cur_thread_idx = subcontig_thread_assignments[subcontig_idx];
+      const uintptr_t* cur_removed_variants = g_removed_variants_write[cur_thread_idx];
+      const uint32_t subcontig_len = subcontig_info[3 * subcontig_idx];
+      const uint32_t subcontig_idx_start = subcontig_info[3 * subcontig_idx + 1];
+      copy_bitarr_range(cur_removed_variants, thread_subcontig_start_tvidx[cur_thread_idx], subcontig_idx_start, subcontig_len, removed_variants_collapsed);
+      thread_subcontig_start_tvidx[cur_thread_idx] += subcontig_len;
+    }
+    if (pct > 10) {
+      putc_unlocked('\b', stdout);
+    }
+    fputs("\b\b", stdout);
+  }
+  while (0) {
+  indep_pairwise_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  indep_pairwise_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  indep_pairwise_ret_NOT_YET_SUPPORTED:
+    reterr = kPglRetNotYetSupported;
+    break;
+  }
+ indep_pairwise_ret_1:
+  // caller will free memory
+  return reterr;
+}
+
+pglerr_t indep_pairphase() {
+  logerrprint("Error: --indep-pairphase is currently under development.\n");
+  return kPglRetNotYetSupported;
+}
+
+pglerr_t ld_prune_subcontig_split_all(const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, uint32_t prune_window_size, uint32_t* window_max_ptr, uint32_t** subcontig_info_ptr, uint32_t* subcontig_ct_ptr) {
+  // variant_bps must be nullptr if window size is not bp-based
+  // chr0 assumed to already be removed from variant_include.
+  // this will skip over chromosomes/contigs with only 1 variant.
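+// subcontig_info is filled with (length, variant_idx, variant_uidx) triples:
+// the number of variants in the subcontig, the included-variant index of its
+// first variant, and that variant's raw index, respectively.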
+  const uint32_t chr_ct = cip->chr_ct;
+  uint32_t* subcontig_info = (uint32_t*)g_bigstack_base;
+  uint32_t* subcontig_info_iter = subcontig_info;
+  uint32_t* subcontig_info_limit = &(((uint32_t*)g_bigstack_end)[-3]);
+  uint32_t window_max = 0;
+  uint32_t variant_idx = 0;
+  if (variant_bps) {
+    window_max = 1;
+    for (uint32_t chr_fo_idx = 0; chr_fo_idx < chr_ct; ++chr_fo_idx) {
+      const uint32_t chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+      uint32_t variant_uidx = next_set(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], chr_end);
+      const uint32_t chr_variant_ct = popcount_bit_idx(variant_include, variant_uidx, chr_end);
+      const uint32_t variant_idx_end = variant_idx + chr_variant_ct;
+      if (chr_variant_ct > 1) {
+	uint32_t subcontig_uidx_first = variant_uidx;
+	uint32_t subcontig_idx_first = variant_idx;
+	uint32_t window_idx_first = variant_idx;
+	uint32_t window_uidx_first = variant_uidx;
+	uint32_t window_pos_first = variant_bps[variant_uidx];
+	uint32_t prev_pos = window_pos_first;
+	++variant_idx;
+	do {
+	  ++variant_uidx;
+	  next_set_unsafe_ck(variant_include, &variant_uidx);
+	  uint32_t variant_bp_thresh = variant_bps[variant_uidx];
+	  if (variant_bp_thresh < prune_window_size) {
+	    prev_pos = variant_bp_thresh;
+	    variant_bp_thresh = 0;
+	  } else {
+	    if (variant_bp_thresh - prune_window_size > prev_pos) {
+	      if (variant_idx > subcontig_idx_first + 1) {
+		if (subcontig_info_iter > subcontig_info_limit) {
+		  return kPglRetNomem;
+		}
+		*subcontig_info_iter++ = variant_idx - subcontig_idx_first;
+		*subcontig_info_iter++ = subcontig_idx_first;
+		*subcontig_info_iter++ = subcontig_uidx_first;
+	      }
+	      subcontig_uidx_first = variant_uidx;
+	      subcontig_idx_first = variant_idx;
+	    }
+	    prev_pos = variant_bp_thresh;
+	    variant_bp_thresh -= prune_window_size;
+	  }
+	  if (variant_bp_thresh > window_pos_first) {
+	    do {
+	      ++window_uidx_first;
+	      next_set_unsafe_ck(variant_include, &window_uidx_first);
+	      window_pos_first = variant_bps[window_uidx_first];
+	      ++window_idx_first;
+	    } while (variant_bp_thresh > window_pos_first);
+	  } else if (variant_idx - window_idx_first == window_max) {
+	    ++window_max;
+	  }
+	} while (++variant_idx < variant_idx_end);
+	if (variant_idx > subcontig_idx_first + 1) {
+	  if (subcontig_info_iter > subcontig_info_limit) {
+	    return kPglRetNomem;
+	  }
+	  *subcontig_info_iter++ = variant_idx - subcontig_idx_first;
+	  *subcontig_info_iter++ = subcontig_idx_first;
+	  *subcontig_info_iter++ = subcontig_uidx_first;
+	}
+      }
+      variant_idx = variant_idx_end;
+    }
+  } else {
+    for (uint32_t chr_fo_idx = 0; chr_fo_idx < chr_ct; ++chr_fo_idx) {
+      const uint32_t chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+      const uint32_t first_variant_uidx = next_set(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], chr_end);
+      const uint32_t chr_variant_ct = popcount_bit_idx(variant_include, first_variant_uidx, chr_end);
+      if (chr_variant_ct > 1) {
+	if (subcontig_info_iter > subcontig_info_limit) {
+	  return kPglRetNomem;
+	}
+	*subcontig_info_iter++ = chr_variant_ct;
+	*subcontig_info_iter++ = variant_idx;
+	*subcontig_info_iter++ = first_variant_uidx;
+	if (window_max < prune_window_size) {
+	  if (chr_variant_ct > window_max) {
+	    window_max = chr_variant_ct;
+	  }
+	}
+      }
+      variant_idx += chr_variant_ct;
+    }
+    if (window_max > prune_window_size) {
+      window_max = prune_window_size;
+    }
+  }
+  *subcontig_ct_ptr = ((uintptr_t)(subcontig_info_iter - subcontig_info)) / 3;
+  *subcontig_info_ptr = subcontig_info;
+  bigstack_finalize_ui(subcontig_info, (*subcontig_ct_ptr) * 3);
+  *window_max_ptr = window_max;
+  return kPglRetSuccess;
+}
+
+// next several functions (including load_balance()) will probably move to
+// plink2_common
+void minheap64_replace_root(uint32_t heap_size, uint64_t new_root, uint64_t* minheap64_preroot) {
+  uint32_t cur_pos = 1;
+  while (1) {
+    uint32_t child_pos = cur_pos * 2;
+    if (child_pos >= heap_size) {
+      if (child_pos == heap_size) {
+	// special case: one child at end of heap
+	const uint64_t child_val = minheap64_preroot[child_pos];
+	if (new_root > child_val) {
+	  minheap64_preroot[cur_pos] = child_val;
+	  cur_pos = child_pos;
+	}
+      }
+      break;
+    }
+    uint64_t min_child_val = minheap64_preroot[child_pos];
+    const uint64_t child_val2 = minheap64_preroot[child_pos + 1];
+    if (child_val2 < min_child_val) {
+      min_child_val = child_val2;
+      ++child_pos;
+    }
+    if (new_root <= min_child_val) {
+      break;
+    }
+    minheap64_preroot[cur_pos] = min_child_val;
+    cur_pos = child_pos;
+  }
+  minheap64_preroot[cur_pos] = new_root;
+}
+
+/*
+void minheap64_delete_root(uint64_t* minheap64_preroot, uint32_t* heap_size_ptr) {
+  uint32_t heap_size = *heap_size_ptr;
+  const uint64_t new_root = minheap64_preroot[heap_size];
+  minheap64_replace_root(--heap_size, new_root, minheap64_preroot);
+  *heap_size_ptr = heap_size;
+}
+*/
+
+void minheap64_insert(uint64_t new_entry, uint64_t* minheap64_preroot, uint32_t* heap_size_ptr) {
+  // assumes minheap64_preroot[0] == 0
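+  // (the zero sentinel lets the upward sift stop at the root without an
+  // explicit parent_pos > 0 check: every entry compares >= 0)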
+  const uint32_t heap_size = 1 + (*heap_size_ptr);
+  *heap_size_ptr = heap_size;
+  uint32_t cur_pos = heap_size;
+  while (1) {
+    uint32_t parent_pos = cur_pos / 2;
+    const uint64_t parent_val = minheap64_preroot[parent_pos];
+    if (new_entry >= parent_val) {
+      minheap64_preroot[cur_pos] = new_entry;
+      return;
+    }
+    minheap64_preroot[cur_pos] = parent_val;
+    cur_pos = parent_pos;
+  }
+}
+
+// This is intended to split a relatively small number of contig-like regions
+// between threads, but it shouldn't totally fall apart if there are millions
+// of regions and hundreds of threads.
+// Based on the Longest Processing Time algorithm, but with a few adjustments:
+// * max(largest_weight, round_up(total_weight / thread_ct)) is noted, and the
+//   first 8 * thread_ct thread assignments are based on best-fit to that
+//   capacity.  The constant 8 is chosen to be enough to beat basic LPT's
+//   4/3 - 1/{3m} approximation factor by a relevant margin, while keeping
+//   runtime under control.  (In the event that there is no fit, the capacity
+//   is increased.)
+// * If any task assignments remain after that, we fall back on plain LPT, but
+//   try to keep the thread count down: another thread is brought in only when
+//   the alternative is increasing max_load.
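+// Illustrative example with hypothetical inputs: task_weights = {9, 7, 4, 4}
+// and thread_ct = 2 give total_weight = 24, so the initial capacity is
+// max(9, ceil(24 / 2)) = 12.  Best-fit packs {9} and {7, 4}; the final 4 fits
+// neither thread, so the capacity is raised and the resulting loads are
+// {13, 11}, which happens to be optimal here.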
+pglerr_t load_balance(const uint32_t* task_weights, uint32_t task_ct, uint32_t* thread_ct_ptr, uint32_t* thread_assignments, uint32_t* max_load_ptr) {
+  // max_load assumed to be initialized to zero
+  assert(task_ct);
+  const uint32_t orig_thread_ct = *thread_ct_ptr;
+  if (orig_thread_ct == 1) {
+    fill_uint_zero(task_ct, thread_assignments);
+    // replace this with an acc_uint32 call?
+    uint32_t max_load = task_weights[0];
+    for (uint32_t task_idx = 1; task_idx < task_ct; ++task_idx) {
+      max_load += task_weights[task_idx];
+    }
+    *max_load_ptr = max_load;
+    return kPglRetSuccess;
+  }
+  assert(task_ct >= orig_thread_ct);
+  uint64_t* sorted_tagged_weights;
+  uint64_t* minheap64_preroot;
+  if (bigstack_alloc_ull(task_ct, &sorted_tagged_weights) ||
+      bigstack_alloc_ull(orig_thread_ct + 2, &minheap64_preroot)) {
+    return kPglRetNomem;
+  }
+  minheap64_preroot[0] = 0;
+  uint64_t* minheap64 = &(minheap64_preroot[1]);
+  uint32_t total_weight = 0;
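+  // Pack each weight into the high 32 bits and its task index into the low 32
+  // bits; a single 64-bit sort then orders by weight while preserving task
+  // identity.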
+  for (uintptr_t task_idx = 0; task_idx < task_ct; ++task_idx) {
+    const uintptr_t cur_weight = task_weights[task_idx];
+    total_weight += cur_weight;
+    sorted_tagged_weights[task_idx] = (((uint64_t)cur_weight) << 32) + (uint64_t)task_idx;
+  }
+  uint64_t* sorted_tagged_weights_end = &(sorted_tagged_weights[task_ct]);
+#ifdef __cplusplus
+  // could try std::nth_element if this is ever a bottleneck
+  std::sort(sorted_tagged_weights, sorted_tagged_weights_end, std::greater<uint64_t>());
+#else
+  qsort(sorted_tagged_weights, task_ct, sizeof(int64_t), uint64cmp_decr);
+#endif
+  const uint64_t largest_tagged_weight = sorted_tagged_weights[0];
+  uint32_t initial_max_load = largest_tagged_weight >> 32;
+  uint32_t thread_ct = 1 + (total_weight - 1) / initial_max_load;
+  if (thread_ct > orig_thread_ct) {
+    thread_ct = orig_thread_ct;
+    initial_max_load = 1 + (total_weight - 1) / orig_thread_ct;
+  }
+  
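+  // Each heap entry packs (current load << 32) | thread index; the low 32 bits
+  // double as the thread assignment recorded below.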
+  for (uintptr_t thread_idx = 1; thread_idx < thread_ct; ++thread_idx) {
+    minheap64[thread_idx - 1] = thread_ct - thread_idx;
+  }
+  minheap64[thread_ct - 1] = largest_tagged_weight & 0xffffffff00000000LLU;
+  for (uint32_t thread_idx = thread_ct; thread_idx <= orig_thread_ct; ++thread_idx) {
+    minheap64[thread_idx] = 0xffffffffffffffffLLU;
+  }
+  thread_assignments[(uint32_t)largest_tagged_weight] = 0;
+  uint64_t max_load_shifted = (((uint64_t)initial_max_load) << 32) | 0xffffffffLLU;
+  uint64_t* best_fit_end = sorted_tagged_weights_end;
+  if (task_ct > 8 * orig_thread_ct) {
+    // stop best-fit here
+    best_fit_end = &(sorted_tagged_weights[8 * orig_thread_ct]);
+  }
+  uint64_t* sorted_tagged_weights_iter = &(sorted_tagged_weights[1]);
+  while (sorted_tagged_weights_iter != best_fit_end) {
+    // maintain minheap64 as fully sorted list
+    uint64_t cur_tagged_weight = *sorted_tagged_weights_iter++;
+    const uint32_t task_idx = (uint32_t)cur_tagged_weight;
+    cur_tagged_weight &= 0xffffffff00000000LLU;
+    const uintptr_t idxp1 = uint64arr_greater_than(minheap64, thread_ct, max_load_shifted - cur_tagged_weight);
+    if (idxp1) {
+      uintptr_t idx = idxp1 - 1;
+      const uint64_t new_entry = minheap64[idx] + cur_tagged_weight;
+      while (1) {
+	const uint64_t next_entry = minheap64[idx + 1];
+	if (new_entry < next_entry) {
+	  break;
+	}
+	minheap64[idx++] = next_entry;
+      }
+      thread_assignments[task_idx] = (uint32_t)new_entry;
+      minheap64[idx] = new_entry;
+    } else if (thread_ct < orig_thread_ct) {
+      const uint64_t new_entry = cur_tagged_weight + thread_ct;
+      const uintptr_t insert_pt = uint64arr_greater_than(minheap64, thread_ct, new_entry);
+      for (uintptr_t thread_idx = thread_ct; thread_idx > insert_pt; --thread_idx) {
+	minheap64[thread_idx] = minheap64[thread_idx - 1];
+      }
+      minheap64[insert_pt] = new_entry;
+      thread_assignments[task_idx] = thread_ct++;
+    } else {
+      // move lowest entry to end of list, shift everything else down
+      const uint64_t new_entry = minheap64[0] + cur_tagged_weight;
+      for (uint32_t thread_idx = 1; thread_idx < thread_ct; ++thread_idx) {
+	minheap64[thread_idx - 1] = minheap64[thread_idx];
+      }
+      minheap64[thread_ct - 1] = new_entry;
+      max_load_shifted = new_entry | 0xffffffffLLU;
+      thread_assignments[task_idx] = (uint32_t)new_entry;
+    }
+  }
+  if (best_fit_end != sorted_tagged_weights_end) {
+    do {
+      const uint64_t cur_heaproot = minheap64[0];
+      uint64_t cur_tagged_weight = *sorted_tagged_weights_iter++;
+      const uint32_t task_idx = (uint32_t)cur_tagged_weight;
+      uint32_t cur_thread = (uint32_t)cur_heaproot;
+      cur_tagged_weight &= 0xffffffff00000000LLU;
+      uint64_t new_entry = cur_heaproot + cur_tagged_weight;
+      if (new_entry > max_load_shifted) {
+	if (thread_ct < orig_thread_ct) {
+	  thread_assignments[task_idx] = thread_ct;
+	  minheap64_insert(cur_tagged_weight + thread_ct, minheap64_preroot, &thread_ct);
+	  continue;
+	} else {
+	  max_load_shifted = new_entry | 0xffffffffLLU;
+	}
+      }
+      thread_assignments[task_idx] = cur_thread;
+      minheap64_replace_root(thread_ct, new_entry, minheap64_preroot);
+    } while (sorted_tagged_weights_iter != sorted_tagged_weights_end);
+  }  
+  bigstack_reset(sorted_tagged_weights);
+  *thread_ct_ptr = thread_ct;
+  *max_load_ptr = max_load_shifted >> 32;
+  return kPglRetSuccess;
+}
+
+pglerr_t ld_prune_write(const uintptr_t* variant_include, const uintptr_t* removed_variants_collapsed, char** variant_ids, uint32_t variant_ct, char* outname, char* outname_end) {
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    fputs("Writing...", stdout);
+    fflush(stdout);
+    strcpy(outname_end, ".prune.in");
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto ld_prune_write_ret_OPEN_FAIL;
+    }
+    char* textbuf = g_textbuf;
+    char* write_iter = textbuf;
+    char* textbuf_flush = &(textbuf[kMaxMediumLine]);
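+    // Standard buffered-write pattern: flush whenever the cursor crosses the
+    // kMaxMediumLine mark; g_textbuf is sized with enough slack past that
+    // point to hold the line that was just appended.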
+    uint32_t variant_uidx = 0;
+    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      if (is_set(removed_variants_collapsed, variant_idx)) {
+	continue;
+      }
+      write_iter = strcpya(write_iter, variant_ids[variant_uidx]);
+      append_binary_eoln(&write_iter);
+      if (write_iter >= textbuf_flush) {
+        if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	  goto ld_prune_write_ret_WRITE_FAIL;
+	}
+	write_iter = textbuf;
+      }
+    }
+    if (write_iter > textbuf) {
+      if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	goto ld_prune_write_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto ld_prune_write_ret_WRITE_FAIL;
+    }
+
+    strcpy(&(outname_end[7]), "out");
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto ld_prune_write_ret_OPEN_FAIL;
+    }
+    write_iter = textbuf;
+    variant_uidx = 0;
+    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      if (!is_set(removed_variants_collapsed, variant_idx)) {
+	continue;
+      }
+      write_iter = strcpya(write_iter, variant_ids[variant_uidx]);
+      append_binary_eoln(&write_iter);
+      if (write_iter >= textbuf_flush) {
+        if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	  goto ld_prune_write_ret_WRITE_FAIL;
+	}
+	write_iter = textbuf;
+      }
+    }
+    if (write_iter > textbuf) {
+      if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	goto ld_prune_write_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto ld_prune_write_ret_WRITE_FAIL;
+    }
+    *outname_end = '\0';
+    putc_unlocked('\r', stdout);
+    LOGPRINTFWW("Variant lists written to %s.prune.in and %s.prune.out .\n", outname, outname);
+  }
+  while (0) {
+  ld_prune_write_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  ld_prune_write_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  }
+  fclose_cond(outfile);
+  return reterr;
+}
+
+pglerr_t ld_prune(const uintptr_t* orig_variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, const alt_allele_ct_t* maj_alleles, const double* allele_freqs, const uintptr_t* founder_info, const uintptr_t* sex_male, const ld_info_t* ldip, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t raw_sample_ct, uint32_t founder_ct, uint32_t max_thread_ct, pgen_reader_t* simple_pgrp, char* outname, char* outname_end) {
+  // common initialization between --indep-pairwise and --indep-pairphase
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t is_pairphase = (ldip->prune_modifier / kfLdPrunePairphase) & 1;
+    if (founder_ct < 2) {
+      LOGERRPRINTF("Warning: Skipping --indep-pair%s since there are less than two founders.\n(--make-founders may come in handy here.)\n", is_pairphase? "phase" : "wise");
+      goto ld_prune_ret_1;
+    }
+    uint32_t skipped_variant_ct = 0;
+    if (is_set(cip->chr_mask, 0)) {
+      skipped_variant_ct = count_chr_variants_unsafe(orig_variant_include, cip, 0);
+    }
+    const uint32_t chr_code_end = cip->max_code + 1 + cip->name_ct;
+    if (cip->zero_extra_chrs) {
+      for (uint32_t chr_idx = cip->max_code + 1; chr_idx < chr_code_end; ++chr_idx) {
+	if (is_set(cip->chr_mask, chr_idx)) {
+	  skipped_variant_ct += count_chr_variants_unsafe(orig_variant_include, cip, cip->chr_idx_to_foidx[chr_idx]);
+	}
+      }
+    }
+    const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    const uintptr_t* variant_include;
+    if (skipped_variant_ct) {
+      uintptr_t* new_variant_include;
+      if (bigstack_alloc_ul(raw_variant_ctl, &new_variant_include)) {
+	goto ld_prune_ret_NOMEM;
+      }
+      memcpy(new_variant_include, orig_variant_include, raw_variant_ctl * sizeof(intptr_t));
+      if (is_set(cip->chr_mask, 0)) {
+	const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[0];
+	const uint32_t start_uidx = cip->chr_fo_vidx_start[chr_fo_idx];
+	clear_bits_nz(start_uidx, cip->chr_fo_vidx_start[chr_fo_idx + 1], new_variant_include);
+      }
+      if (cip->zero_extra_chrs) {
+        for (uint32_t chr_idx = cip->max_code + 1; chr_idx < chr_code_end; ++chr_idx) {
+	  const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[chr_idx];
+	  const uint32_t start_uidx = cip->chr_fo_vidx_start[chr_fo_idx];
+	  clear_bits_nz(start_uidx, cip->chr_fo_vidx_start[chr_fo_idx + 1], new_variant_include);
+	}
+      }
+      variant_include = new_variant_include;
+      variant_ct -= skipped_variant_ct;
+      LOGPRINTF("--indep-pair%s: Ignoring %u chromosome 0 variant%s.\n", is_pairphase? "phase" : "wise", skipped_variant_ct, (skipped_variant_ct == 1)? "" : "s");
+    } else {
+      variant_include = orig_variant_include;
+    }
+
+    if (!(ldip->prune_modifier & kfLdPruneWindowBp)) {
+      variant_bps = nullptr;
+    }
+    const uint32_t prune_window_size = ldip->prune_window_size;
+    uint32_t* subcontig_info;
+    uint32_t window_max;
+    uint32_t subcontig_ct;
+    if (ld_prune_subcontig_split_all(variant_include, cip, variant_bps, prune_window_size, &window_max, &subcontig_info, &subcontig_ct)) {
+      // route through the cleanup label so bigstack is reset on failure
+      goto ld_prune_ret_NOMEM;
+    }
+    if (!subcontig_ct) {
+      LOGERRPRINTF("Warning: Skipping --indep-pair%s since there are no pairs of variants to\nprocess.\n", is_pairphase? "phase" : "wise");
+      goto ld_prune_ret_1;
+    }
+    if (max_thread_ct > 2) {
+      --max_thread_ct;
+    }
+    if (max_thread_ct > subcontig_ct) {
+      max_thread_ct = subcontig_ct;
+    }
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    const uint32_t variant_ctl = BITCT_TO_WORDCT(variant_ct);
+    const uint32_t founder_male_ct = popcount_longs_intersect(founder_info, sex_male, raw_sample_ctl);
+    const uint32_t founder_ctl = BITCT_TO_WORDCT(founder_ct);
+    uint32_t* founder_info_cumulative_popcounts;
+    uintptr_t* founder_nonmale_collapsed;
+    uintptr_t* founder_male_collapsed;
+    uintptr_t* removed_variants_collapsed;
+    uint32_t* subcontig_thread_assignments;
+    if (bigstack_alloc_ui(raw_sample_ctl, &founder_info_cumulative_popcounts) ||
+	bigstack_alloc_ul(founder_ctl, &founder_nonmale_collapsed) ||
+	bigstack_alloc_ul(founder_ctl, &founder_male_collapsed) ||
+	bigstack_calloc_ul(variant_ctl, &removed_variants_collapsed) ||
+	bigstack_alloc_ui(subcontig_ct, &subcontig_thread_assignments)) {
+      goto ld_prune_ret_NOMEM;
+    }
+    fill_cumulative_popcounts(founder_info, raw_sample_ctl, founder_info_cumulative_popcounts);
+    copy_bitarr_subset(sex_male, founder_info, founder_ct, founder_male_collapsed);
+    bitarr_invert_copy(founder_male_collapsed, founder_ct, founder_nonmale_collapsed);
+    uint32_t* subcontig_weights;
+    if (bigstack_end_alloc_ui(subcontig_ct, &subcontig_weights)) {
+      goto ld_prune_ret_NOMEM;
+    }
+
+    // initial window_max-based memory requirement estimate
+    if (is_pairphase) {
+      // todo
+    } else {
+      const uintptr_t entire_variant_buf_word_ct = 3 * (BITCT_TO_ALIGNED_WORDCT(founder_ct - founder_male_ct) + BITCT_TO_ALIGNED_WORDCT(founder_male_ct));
+      // reserve ~1/2 of space for main variant data buffer,
+      //   removed_variant_write
+      // everything else:
+      //   genobufs: thread_ct * window_max * entire_variant_buf_word_ct * word
+      //   occupied_window_slots: thread_ct * window_maxl * word
+      //   cur_window_removed: thread_ct * (1 + window_max / kBitsPerWord) *
+      //     word
+      //   (ignore removed_variant_write)
+      //   maj_freqs: thread_ct * window_max * 8
+      //   vstats, nonmale_vstats: thread_ct * window_max * 3 * int32
+      //   winpos_to_slot_idx, tvidxs, first_unchecked_vidx: window_max * 3 *
+      //     int32
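+      // per_thread_alloc below simply totals the buffers itemized above,
+      // rounding each component up to a cacheline boundary.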
+      uintptr_t per_thread_alloc =
+        round_up_pow2(window_max * entire_variant_buf_word_ct * sizeof(intptr_t), kCacheline) +
+        2 * round_up_pow2((1 + window_max / kBitsPerWord) * sizeof(intptr_t), kCacheline) +
+        round_up_pow2(window_max * sizeof(double), kCacheline) +
+        2 * round_up_pow2(window_max * (3 * sizeof(int32_t)), kCacheline) +
+        3 * round_up_pow2(window_max * sizeof(int32_t), kCacheline);
+      uintptr_t bigstack_left2 = bigstack_left();
+      if (per_thread_alloc * max_thread_ct > bigstack_left2) {
+	if (per_thread_alloc > bigstack_left2) {
+	  goto ld_prune_ret_NOMEM;
+	}
+	max_thread_ct = bigstack_left2 / per_thread_alloc;
+      }
+    }
+
+    
+    for (uint32_t subcontig_idx = 0; subcontig_idx < subcontig_ct; ++subcontig_idx) {
+      // todo: adjust chrX weights upward, and chrY downward
+      subcontig_weights[subcontig_idx] = subcontig_info[3 * subcontig_idx];
+      // printf("%u %u %u\n", subcontig_info[3 * subcontig_idx], subcontig_info[3 * subcontig_idx + 1], subcontig_info[3 * subcontig_idx + 2]);
+    }
+    uint32_t max_load = 0;
+    if (load_balance(subcontig_weights, subcontig_ct, &max_thread_ct, subcontig_thread_assignments, &max_load)) {
+      goto ld_prune_ret_NOMEM;
+    }
+    bigstack_end_reset(bigstack_end_mark);
+    
+    if (is_pairphase) {
+      reterr = indep_pairphase();
+    } else {
+      reterr = indep_pairwise(variant_include, cip, variant_bps, variant_allele_idxs, maj_alleles, allele_freqs, founder_info, founder_info_cumulative_popcounts, founder_nonmale_collapsed, founder_male_collapsed, ldip, subcontig_info, subcontig_thread_assignments, raw_sample_ct, founder_ct, founder_male_ct, subcontig_ct, window_max, max_thread_ct, max_load, simple_pgrp, removed_variants_collapsed);
+    }
+    if (reterr) {
+      goto ld_prune_ret_1;
+    }
+    const uint32_t removed_ct = popcount_longs(removed_variants_collapsed, variant_ctl);
+    LOGPRINTF("%u/%u variants removed.\n", removed_ct, variant_ct);
+    reterr = ld_prune_write(variant_include, removed_variants_collapsed, variant_ids, variant_ct, outname, outname_end);
+    if (reterr) {
+      goto ld_prune_ret_1;
+    }
+  }
+  while (0) {
+  ld_prune_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  }
+ ld_prune_ret_1:
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  return reterr;
+}
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/plink2_ld.h b/plink2_ld.h
new file mode 100644
index 0000000..91a7c1d
--- /dev/null
+++ b/plink2_ld.h
@@ -0,0 +1,52 @@
+#ifndef __PLINK2_LD_H__
+#define __PLINK2_LD_H__
+
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_common.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+FLAGSET_DEF_START()
+  kfLdPrune0,
+  kfLdPruneWindowBp = (1 << 0),
+  kfLdPrunePairwise = (1 << 1),
+  kfLdPrunePairphase = (1 << 2)
+FLAGSET_DEF_END(ld_prune_t);
+// todo: old multicollinearity test; new multiallelic option
+
+typedef struct ld_info_struct {
+  double prune_last_param; // VIF or r^2 threshold
+  ld_prune_t prune_modifier;
+  uint32_t prune_window_size;
+  uint32_t prune_window_incr;
+} ld_info_t;
+
+void init_ld(ld_info_t* ldip);
+
+void cleanup_ld(ld_info_t* ldip);
+
+pglerr_t ld_prune(const uintptr_t* orig_variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, const alt_allele_ct_t* maj_alleles, const double* allele_freqs, const uintptr_t* founder_info, const uintptr_t* sex_male, const ld_info_t* ldip, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t raw_sample_ct, uint32_t founder_ct, uint32_t max_thread_ct, pgen_reader_t* simple_pgrp, char* outname, char* outname_end);
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+
+#endif // __PLINK2_LD_H__
diff --git a/plink2_matrix.cpp b/plink2_matrix.cpp
new file mode 100644
index 0000000..9523496
--- /dev/null
+++ b/plink2_matrix.cpp
@@ -0,0 +1,1256 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_matrix.h"
+
+#ifndef NOLAPACK
+  #ifdef __APPLE__
+    #define USE_CBLAS_XGEMM
+  #endif
+  #ifndef __APPLE__
+
+    #ifdef __cplusplus
+extern "C" {
+    #endif
+  
+    #ifdef _WIN32
+      // openblas is easy enough to set up on Windows nowadays.
+      // not worth the trouble of ripping out vector extensions, etc. just so
+      // we can compile with Visual Studio and gain access to MKL
+      #ifndef USE_OPENBLAS
+        #error "Windows build currently requires OpenBLAS's LAPACK."
+      #endif
+      #define HAVE_LAPACK_CONFIG_H
+      #define LAPACK_COMPLEX_STRUCTURE
+      #include "lapacke.h"
+
+  void dgemm_(char* transa, char* transb, int* m, int* n, int* k,
+              double* alpha, double* a, int* lda, double* b, int* ldb,
+              double* beta, double* c, int* ldc);
+
+  void dgetrf_(__CLPK_integer* m, __CLPK_integer* n,
+               __CLPK_doublereal* a, __CLPK_integer* lda,
+               __CLPK_integer* ipiv, __CLPK_integer* info);
+
+  void dsyrk_(char* uplo, char* trans, __CLPK_integer* n, __CLPK_integer* k,
+	      __CLPK_doublereal* alpha, __CLPK_doublereal* a,
+	      __CLPK_integer* lda, __CLPK_doublereal* beta,
+	      __CLPK_doublereal* c, __CLPK_integer* ldc);
+
+  void sgemm_(char* transa, char* transb, int* m, int* n, int* k,
+              float* alpha, float* a, int* lda, float* b, int* ldb,
+              float* beta, float* c, int* ldc);
+
+    #else // Linux
+      #ifdef USE_MKL
+        #define USE_CBLAS_XGEMM
+        #ifdef DYNAMIC_MKL
+          #include <mkl_cblas.h>
+          #include <mkl_lapack.h>
+        #else
+          #include "/opt/intel/mkl/include/mkl_cblas.h"
+          #include "/opt/intel/mkl/include/mkl_lapack.h"
+        #endif
+        static_assert(sizeof(MKL_INT) == 8, "Unexpected MKL_INT size.");
+      #else
+        // If you want 64-bit index support, but not MKL (e.g. you're targeting
+        // an AMD processor), modify the Makefile to link to a LAPACK library
+        // recompiled with -fdefault-integer-8.
+
+        #ifdef USE_CBLAS_XGEMM
+	  #include <cblas.h>
+	#else
+	  // ARGH
+	  // cmake on Ubuntu 14 seems to require use of cblas_f77.h instead of
+	  // cblas.h.  Conversely, cblas_f77.h does not seem to be available on
+	  // the Scientific Linux ATLAS/LAPACK install, and right now that's my
+	  // only option for producing 32-bit static builds...
+          // So.  Default include is cblas.h.  To play well with cmake + Ubuntu
+          // 14 and 16 simultaneously, there is a CBLAS_F77_ON_OLD_GCC mode
+          // which picks cblas_f77.h on Ubuntu 14 and cblas.h on 16.
+          #ifdef FORCE_CBLAS_F77
+            #include <cblas_f77.h>
+          #elif !defined(CBLAS_F77_ON_OLD_GCC)
+            #include <cblas.h>
+          #else
+	    #if (__GNUC__ <= 4)
+              #include <cblas_f77.h>
+            #else
+	      #if __has_include(<cblas.h>)
+	        #include <cblas.h>
+	      #else
+	        #include <cblas_f77.h>
+	      #endif
+            #endif
+	  #endif
+	#endif
+  int dgetrf_(__CLPK_integer* m, __CLPK_integer* n,
+              __CLPK_doublereal* a, __CLPK_integer* lda,
+              __CLPK_integer* ipiv, __CLPK_integer* info);
+
+  int dgetri_(__CLPK_integer* n, __CLPK_doublereal* a,
+              __CLPK_integer* lda, __CLPK_integer* ipiv,
+              __CLPK_doublereal* work, __CLPK_integer* lwork,
+              __CLPK_integer* info);
+
+  double dlange_(char* norm, __CLPK_integer* m, __CLPK_integer* n,
+                 __CLPK_doublereal* a, __CLPK_integer* lda,
+                 __CLPK_doublereal* work);
+
+  int dgecon_(char* norm, __CLPK_integer* n, __CLPK_doublereal* a,
+              __CLPK_integer* lda, __CLPK_doublereal* anorm,
+              __CLPK_doublereal* rcond, __CLPK_doublereal* work,
+              __CLPK_integer* iwork, __CLPK_integer* info);
+
+  float slange_(char* norm, __CLPK_integer* m, __CLPK_integer* n, float* a,
+		__CLPK_integer* lda, float* work);
+
+  int sgetrf_(__CLPK_integer* m, __CLPK_integer* n, float* a,
+	      __CLPK_integer* lda, __CLPK_integer* ipiv, __CLPK_integer* info);
+
+  int sgecon_(char* norm, __CLPK_integer* n, float* a, __CLPK_integer* lda,
+	      float* anorm, float* rcond, float* work, __CLPK_integer* iwork,
+	      __CLPK_integer* info);
+
+  int sgetri_(__CLPK_integer* n, float* a, __CLPK_integer* lda,
+	      __CLPK_integer* ipiv, float* work, __CLPK_integer* lwork,
+	      __CLPK_integer* info);
+
+  /*
+  void dgels_(char* trans, __CLPK_integer* m, __CLPK_integer* n,
+              __CLPK_integer* nrhs, __CLPK_doublereal* a, __CLPK_integer* lda,
+              __CLPK_doublereal* b, __CLPK_integer* ldb,
+              __CLPK_doublereal* work, __CLPK_integer* lwork,
+              __CLPK_integer* info);
+
  void dgesdd_(char* jobz, __CLPK_integer* m, __CLPK_integer* n,
+               __CLPK_doublereal* a, __CLPK_integer* lda, __CLPK_doublereal* s,
+               __CLPK_doublereal* u, __CLPK_integer* ldu,
+               __CLPK_doublereal* vt, __CLPK_integer* ldvt,
+               __CLPK_doublereal* work, __CLPK_integer* lwork,
+               __CLPK_integer* iwork, __CLPK_integer* info);
+  */
+
+  void dgesvd_(char* jobu, char* jobvt, __CLPK_integer* m, __CLPK_integer* n,
+	       __CLPK_doublereal* a, __CLPK_integer* lda, __CLPK_doublereal* s,
+	       __CLPK_doublereal* u, __CLPK_integer* ldu,
+	       __CLPK_doublereal* vt, __CLPK_integer* ldvt,
+	       __CLPK_doublereal* work, __CLPK_integer* lwork,
+	       __CLPK_integer* info);
+
+  int dsyevr_(char* jobz, char* range, char* uplo, __CLPK_integer* n,
+              __CLPK_doublereal* a, __CLPK_integer* lda, __CLPK_doublereal* vl,
+              __CLPK_doublereal* vu, __CLPK_integer* il, __CLPK_integer* iu,
+              __CLPK_doublereal* abstol, __CLPK_integer* m,
+              __CLPK_doublereal* w, __CLPK_doublereal* z, __CLPK_integer* ldz,
+              __CLPK_integer* isuppz, __CLPK_doublereal* work,
+              __CLPK_integer* lwork, __CLPK_integer* iwork,
+              __CLPK_integer* liwork, __CLPK_integer* info);
+
+  int dpotrf_(char* uplo, __CLPK_integer* n, __CLPK_doublereal* a,
+              __CLPK_integer* lda, __CLPK_integer* info);
+
+
+  void dpotrs_(char* uplo, __CLPK_integer* n, __CLPK_integer* nrhs,
+               __CLPK_doublereal* a, __CLPK_integer* lda, __CLPK_doublereal* b,
+	       __CLPK_integer* ldb, __CLPK_integer* info);
+
+  int dpotri_(char* uplo, __CLPK_integer* n, __CLPK_doublereal* a,
+              __CLPK_integer* lda, __CLPK_integer* info);
+
+        #ifndef USE_CBLAS_XGEMM
+  void dgemm_(char* transa, char* transb, __CLPK_integer* m, __CLPK_integer* n,
+              __CLPK_integer* k, __CLPK_doublereal* alpha,
+              __CLPK_doublereal* a, __CLPK_integer* lda, __CLPK_doublereal* b,
+              __CLPK_integer* ldb, __CLPK_doublereal* beta,
+              __CLPK_doublereal* c, __CLPK_integer* ldc);
+
+  void sgemm_(char* transa, char* transb, __CLPK_integer* m, __CLPK_integer* n,
+              __CLPK_integer* k, float* alpha, float* a, __CLPK_integer* lda,
+              float* b, __CLPK_integer* ldb, float* beta, float* c,
+              __CLPK_integer* ldc);
+
+  void dsyrk_(char* uplo, char* trans, __CLPK_integer* n, __CLPK_integer* k,
+	      __CLPK_doublereal* alpha, __CLPK_doublereal* a,
+	      __CLPK_integer* lda, __CLPK_doublereal* beta,
+	      __CLPK_doublereal* c, __CLPK_integer* ldc);
+        #endif
+      #endif // !USE_MKL
+    #endif // Linux
+
+  void xerbla_(void);
+    #ifdef __cplusplus
+}
+    #endif // __cplusplus
+    void xerbla_(void) {} // fix static linking error
+  #endif // not __APPLE__
+
+#endif // !NOLAPACK
+
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+inline double SQR(const double a) {
+  return a * a;
+}
+
+#ifdef __cplusplus
+inline double SIGN(const double &a, const double &b) {
+  // PLINK helper.h SIGN() template specialized to doubles.
+  return (b >= 0)? (a >= 0 ? a : -a) : (a >= 0 ? -a : a);
+}
+#else
+inline double SIGN(const double a, const double b) {
+  // PLINK helper.h SIGN() template specialized to doubles.
+  return (b >= 0)? (a >= 0 ? a : -a) : (a >= 0 ? -a : a);
+}
+#endif
+
+double pythag(const double a, const double b) {
+  // PLINK stats.cpp pythag().
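+  // Computes sqrt(a^2 + b^2) without destructive overflow or underflow.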
+  double absa,absb;
+ 
+  absa=fabs(a);
+  absb=fabs(b);
+  if (absa > absb) return absa*sqrt(1.0+SQR(absb/absa));
+  else return (absb == 0.0 ? 0.0 : absb*sqrt(1.0+SQR(absa/absb)));
+}
+
+#ifdef NOLAPACK
+uint32_t svdcmp_c(int32_t m, double* a, double* w, double* v) {
+  // C port of PLINK stats.cpp svdcmp().
+  // Now thread-safe.
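+  // (Golub-Reinsch SVD: Householder bidiagonalization followed by
+  // implicit-shift QR sweeps, as in the classic Numerical Recipes routine.)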
+  double* rv1 = &(w[(uint32_t)m]);
+  int32_t n = m;
+  int32_t flag;
+  int32_t l = 0; // suppress compile warning
+  int32_t i,its,j,jj,k,nm;
+  double anorm,c,f,g,h,s,scale,x,y,z;
+  double temp;
+
+  g=scale=anorm=0.0;
+  for (i=0;i<n;i++) {
+    l=i+2;
+    rv1[i]=scale*g;
+    g=s=scale=0.0;
+    if (i < m) {
+      for (k=i;k<m;k++) scale += fabs(a[k * m + i]);
+      if (scale != 0.0) {
+	for (k=i;k<m;k++) {
+	  a[k * m + i] /= scale;
+	  s += a[k * m + i]*a[k * m + i];
+	}
+	f=a[i * m + i];
+	g = -SIGN(sqrt(s),f);
+	h=f*g-s;
+	a[i * m + i]=f-g;
+	for (j=l-1;j<n;j++) {
+	  for (s=0.0,k=i;k<m;k++) s += a[k * m + i]*a[k * m + j];
+	  f=s/h;
+	  for (k=i;k<m;k++) a[k * m + j] += f*a[k * m + i];
+	}
+	for (k=i;k<m;k++) a[k * m + i] *= scale;
+      }
+    }
+    w[i]=scale *g;
+    g=s=scale=0.0;
+    if (i+1 <= m && i+1 != n) {
+      for (k=l-1;k<n;k++) scale += fabs(a[i * m + k]);
+      if (scale != 0.0) {
+	for (k=l-1;k<n;k++) {
+	  a[i * m + k] /= scale;
+	  s += a[i * m + k]*a[i * m + k];
+	}
+	f=a[i * m + l-1];
+	g = -SIGN(sqrt(s),f);
+	h=f*g-s;
+	a[i * m + l-1]=f-g;
+	for (k=l-1;k<n;k++) rv1[k]=a[i * m + k]/h;
+	for (j=l-1;j<m;j++) {
+	  for (s=0.0,k=l-1;k<n;k++) s += a[j * m + k]*a[i * m + k];
+	  for (k=l-1;k<n;k++) a[j * m + k] += s*rv1[k];
+	}
+	for (k=l-1;k<n;k++) a[i * m + k] *= scale;
+      }
+    }
+    anorm=MAXV(anorm,(fabs(w[i])+fabs(rv1[i])));
+  }
+  for (i=n-1;i>=0;i--) {
+    if (i < n-1) {
+      if (g != 0.0) {
+	for (j=l;j<n;j++)
+	  v[j * m + i]=(a[i * m + j]/a[i * m + l])/g;
+	for (j=l;j<n;j++) {
+	  for (s=0.0,k=l;k<n;k++) s += a[i * m + k]*v[k * m + j];
+	  for (k=l;k<n;k++) v[k * m + j] += s*v[k * m + i];
+	}
+      }
+      for (j=l;j<n;j++) v[i * m + j]=v[j * m + i]=0.0;
+    }
+    v[i * m + i]=1.0;
+    g=rv1[i];
+    l=i;
+  }
+  for (i=MINV(m,n)-1;i>=0;i--) {
+    l=i+1;
+    g=w[i];
+    for (j=l;j<n;j++) a[i * m + j]=0.0;
+    if (g != 0.0) {
+      g=1.0/g;
+      for (j=l;j<n;j++) {
+	for (s=0.0,k=l;k<m;k++) s += a[k * m + i]*a[k * m + j];
+	f=(s/a[i * m + i])*g;
+	for (k=i;k<m;k++) a[k * m + j] += f*a[k * m + i];
+      }
+      for (j=i;j<m;j++) a[j * m + i] *= g;
+    } else for (j=i;j<m;j++) a[j * m + i]=0.0;
+    ++a[i * m + i];
+  }
+  for (k=n-1;k>=0;k--) {
+    for (its=0;its<30;its++) {
+      flag=1;
+      for (l=k;l>=0;l--) {
+	nm=l-1;
+	temp=fabs(rv1[l])+anorm;
+	if (temp == anorm) {
+	  flag=0;
+	  break;
+	}
+	temp=fabs(w[nm])+anorm;
+	if (temp == anorm) break;
+      }
+      if (flag) {
+	c=0.0;
+	s=1.0;
+	for (i=l;i<k+1;i++) {
+	  f=s*rv1[i];
+	  rv1[i]=c*rv1[i];
+	  temp = fabs(f)+anorm;
+	  if (temp == anorm) break;
+	  g=w[i];
+	  h=pythag(f,g);
+	  w[i]=h;
+	  h=1.0/h;
+	  c=g*h;
+	  s = -f*h;
+	  for (j=0;j<m;j++) {
+	    y=a[j * m + nm];
+	    z=a[j * m + i];
+	    a[j * m + nm]=y*c+z*s;
+	    a[j * m + i]=z*c-y*s;
+	  }
+	}
+      }
+      z=w[k];
+      if (l == k) {
+	if (z < 0.0) {
+	  w[k] = -z;
+	  for (j=0;j<n;j++) v[j * m + k] = -v[j * m + k];
+	}
+	break;
+      }
+      if (its == 29) 
+	return 0; // cannot converge: multi-collinearity?
+      x=w[l];
+      nm=k-1;
+      y=w[nm];
+      g=rv1[nm];
+      h=rv1[k];
+      f=((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y);
+      g=pythag(f,1.0);
+      f=((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+      c=s=1.0;
+      for (j=l;j<=nm;j++) {
+	i=j+1;
+	g=rv1[i];
+	y=w[i];
+	h=s*g;
+	g=c*g;
+	z=pythag(f,h);
+	rv1[j]=z;
+	c=f/z;
+	s=h/z;
+	f=x*c+g*s;
+	g=g*c-x*s;
+	h=y*s;
+	y *= c;
+	for (jj=0;jj<n;jj++) {
+	  x=v[jj * m + j];
+	  z=v[jj * m + i];
+	  v[jj * m + j]=x*c+z*s;
+	  v[jj * m + i]=z*c-x*s;
+	}
+	z=pythag(f,h);
+	w[j]=z;
+	if (z) {
+	  z=1.0/z;
+	  c=f*z;
+	  s=h*z;
+	}
+	f=c*g+s*y;
+	x=c*y-s*g;
+	for (jj=0;jj<m;jj++) {
+	  y=a[jj * m + j];
+	  z=a[jj * m + i];
+	  a[jj * m + j]=y*c+z*s;
+	  a[jj * m + i]=z*c-y*s;
+	}
+      }
+      rv1[l]=0.0;
+      rv1[k]=f;
+      w[k]=x;
+    }
+  }
+  return 1;
+}
+
+boolerr_t invert_matrix(int32_t dim, double* matrix, matrix_invert_buf1_t* dbl_1d_buf, double* dbl_2d_buf) {
+  // C port of PLINK stats.cpp's svd_inverse() function.
+
+  // w -> dbl_1d_buf
+  // v -> dbl_2d_buf
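+  // With matrix = U diag(w) V^T from svdcmp_c below, the (pseudo)inverse is
+  // V diag(1/w) U^T; singular values below wmax * eps are zeroed out first.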
+  const double eps = 1e-24;
+  int32_t i;
+  int32_t j;
+  int32_t k;
+  if (!svdcmp_c(dim, matrix, dbl_1d_buf, dbl_2d_buf)) {
+    return 1;
+  }
+
+  // Look for singular values
+  double wmax = 0;
+  for (i=0; i<dim; i++) {
+    wmax = dbl_1d_buf[i] > wmax ? dbl_1d_buf[i] : wmax;
+  }
+  double wmin = wmax * eps;
+  for (i=0; i<dim; i++) {
+    dbl_1d_buf[i] = dbl_1d_buf[i] < wmin ? 0 : (1 / dbl_1d_buf[i]);
+  }
+  
+  for (i=0; i<dim; i++) {
+    for (j=0; j<dim; j++) {
+      matrix[i * dim + j] = matrix[i * dim + j] * dbl_1d_buf[j];
+    }
+  }
+
+  // [nxn].[t(v)]
+  for (i=0; i<dim; i++) {
+    fill_double_zero(dim, dbl_1d_buf);
+    for (j=0; j<dim; j++) {
+      for (k=0; k<dim; k++) {
+	dbl_1d_buf[j] += matrix[i * dim + k] * dbl_2d_buf[j * dim + k];
+      }
+    }
+    for (j = 0; j < dim; j++) {
+      matrix[i * dim + j] = dbl_1d_buf[j];
+    }
+  }
+  for (i=1; i<dim; ++i) {
+    for(j=0; j<i; ++j) {
+      const double tmp = matrix[i * dim + j];
+      matrix[i * dim + j] = matrix[j * dim + i];
+      matrix[j * dim + i] = tmp;
+    }
+  }
+  return 0;
+}
+
+uint32_t svdcmp_float_c(int32_t m, int32_t stride, float* a, float* w, float* v) {
+  float* rv1 = &(w[(uint32_t)m]);
+  int32_t n = m;
+  int32_t flag;
+  int32_t l = 0; // suppress compile warning
+  int32_t i,its,j,jj,k,nm;
+  float anorm,c,f,g,h,s,scale,x,y,z;
+  float temp;
+
+  g=scale=anorm=0.0;
+  for (i=0;i<n;i++) {
+    l=i+2;
+    rv1[i]=scale*g;
+    g=s=scale=0.0;
+    if (i < m) {
+      for (k=i;k<m;k++) scale += fabs(a[k * stride + i]);
+      if (scale != 0.0) {
+	for (k=i;k<m;k++) {
+	  a[k * stride + i] /= scale;
+	  s += a[k * stride + i]*a[k * stride + i];
+	}
+	f=a[i * stride + i];
+	g = -SIGN(sqrt(s),f);
+	h=f*g-s;
+	a[i * stride + i]=f-g;
+	for (j=l-1;j<n;j++) {
+	  for (s=0.0,k=i;k<m;k++) s += a[k * stride + i]*a[k * stride + j];
+	  f=s/h;
+	  for (k=i;k<m;k++) a[k * stride + j] += f*a[k * stride + i];
+	}
+	for (k=i;k<m;k++) a[k * stride + i] *= scale;
+      }
+    }
+    w[i]=scale *g;
+    g=s=scale=0.0;
+    if (i+1 <= m && i+1 != n) {
+      for (k=l-1;k<n;k++) scale += fabs(a[i * stride + k]);
+      if (scale != 0.0) {
+	for (k=l-1;k<n;k++) {
+	  a[i * stride + k] /= scale;
+	  s += a[i * stride + k]*a[i * stride + k];
+	}
+	f=a[i * stride + l-1];
+	g = -SIGN(sqrt(s),f);
+	h=f*g-s;
+	a[i * stride + l-1]=f-g;
+	for (k=l-1;k<n;k++) rv1[k]=a[i * stride + k]/h;
+	for (j=l-1;j<m;j++) {
+	  for (s=0.0,k=l-1;k<n;k++) s += a[j * stride + k]*a[i * stride + k];
+	  for (k=l-1;k<n;k++) a[j * stride + k] += s*rv1[k];
+	}
+	for (k=l-1;k<n;k++) a[i * stride + k] *= scale;
+      }
+    }
+    anorm=MAXV(anorm,(fabs(w[i])+fabs(rv1[i])));
+  }
+  for (i=n-1;i>=0;i--) {
+    if (i < n-1) {
+      if (g != 0.0) {
+	for (j=l;j<n;j++)
+	  v[j * m + i]=(a[i * stride + j]/a[i * stride + l])/g;
+	for (j=l;j<n;j++) {
+	  for (s=0.0,k=l;k<n;k++) s += a[i * stride + k]*v[k * m + j];
+	  for (k=l;k<n;k++) v[k * m + j] += s*v[k * m + i];
+	}
+      }
+      for (j=l;j<n;j++) v[i * m + j]=v[j * m + i]=0.0;
+    }
+    v[i * m + i]=1.0;
+    g=rv1[i];
+    l=i;
+  }
+  for (i=MINV(m,n)-1;i>=0;i--) {
+    l=i+1;
+    g=w[i];
+    for (j=l;j<n;j++) a[i * stride + j]=0.0;
+    if (g != 0.0) {
+      g=1.0/g;
+      for (j=l;j<n;j++) {
+	for (s=0.0,k=l;k<m;k++) s += a[k * stride + i]*a[k * stride + j];
+	f=(s/a[i * stride + i])*g;
+	for (k=i;k<m;k++) a[k * stride + j] += f*a[k * stride + i];
+      }
+      for (j=i;j<m;j++) a[j * stride + i] *= g;
+    } else for (j=i;j<m;j++) a[j * stride + i]=0.0;
+    ++a[i * stride + i];
+  }
+  for (k=n-1;k>=0;k--) {
+    for (its=0;its<30;its++) {
+      flag=1;
+      for (l=k;l>=0;l--) {
+	nm=l-1;
+	temp=fabs(rv1[l])+anorm;
+	if (temp == anorm) {
+	  flag=0;
+	  break;
+	}
+	temp=fabs(w[nm])+anorm;
+	if (temp == anorm) break;
+      }
+      if (flag) {
+	c=0.0;
+	s=1.0;
+	for (i=l;i<k+1;i++) {
+	  f=s*rv1[i];
+	  rv1[i]=c*rv1[i];
+	  temp = fabs(f)+anorm;
+	  if (temp == anorm) break;
+	  g=w[i];
+	  h=pythag(f,g);
+	  w[i]=h;
+	  h=1.0/h;
+	  c=g*h;
+	  s = -f*h;
+	  for (j=0;j<m;j++) {
+	    y=a[j * stride + nm];
+	    z=a[j * stride + i];
+	    a[j * stride + nm]=y*c+z*s;
+	    a[j * stride + i]=z*c-y*s;
+	  }
+	}
+      }
+      z=w[k];
+      if (l == k) {
+	if (z < 0.0) {
+	  w[k] = -z;
+	  for (j=0;j<n;j++) v[j * m + k] = -v[j * m + k];
+	}
+	break;
+      }
+      if (its == 29) 
+	return 0; // cannot converge: multi-collinearity?
+      x=w[l];
+      nm=k-1;
+      y=w[nm];
+      g=rv1[nm];
+      h=rv1[k];
+      f=((y-z)*(y+z)+(g-h)*(g+h))/(2.0*h*y);
+      g=pythag(f,1.0);
+      f=((x-z)*(x+z)+h*((y/(f+SIGN(g,f)))-h))/x;
+      c=s=1.0;
+      for (j=l;j<=nm;j++) {
+	i=j+1;
+	g=rv1[i];
+	y=w[i];
+	h=s*g;
+	g=c*g;
+	z=pythag(f,h);
+	rv1[j]=z;
+	c=f/z;
+	s=h/z;
+	f=x*c+g*s;
+	g=g*c-x*s;
+	h=y*s;
+	y *= c;
+	for (jj=0;jj<n;jj++) {
+	  x=v[jj * m + j];
+	  z=v[jj * m + i];
+	  v[jj * m + j]=x*c+z*s;
+	  v[jj * m + i]=z*c-x*s;
+	}
+	z=pythag(f,h);
+	w[j]=z;
+	if (z) {
+	  z=1.0/z;
+	  c=f*z;
+	  s=h*z;
+	}
+	f=c*g+s*y;
+	x=c*y-s*g;
+	for (jj=0;jj<m;jj++) {
+	  y=a[jj * stride + j];
+	  z=a[jj * stride + i];
+	  a[jj * stride + j]=y*c+z*s;
+	  a[jj * stride + i]=z*c-y*s;
+	}
+      }
+      rv1[l]=0.0;
+      rv1[k]=f;
+      w[k]=x;
+    }
+  }
+  return 1;
+}
+
+/*
+boolerr_t invert_fmatrix_checked(int32_t dim, int32_t stride, float* matrix, float* absdet_ptr, matrix_finvert_buf1_t* flt_1d_buf, float* flt_2d_buf) {
+  // w -> flt_1d_buf
+  // v -> flt_2d_buf
+  const float eps = 1e-24;
+  int32_t i;
+  int32_t j;
+  int32_t k;
+  if (!svdcmp_float_c(dim, stride, matrix, flt_1d_buf, flt_2d_buf)) {
+    return 1;
+  }
+
+  if (absdet_ptr) {
+    float sv_prod = flt_1d_buf[0];
+    for (i=1; i<dim; ++i) {
+      sv_prod *= flt_1d_buf[i];
+    }
+    *absdet_ptr = fabsf(sv_prod);
+  }
+  // Look for singular values
+  float wmax = 0;
+  for (i=0; i<dim; i++) {
+    wmax = flt_1d_buf[i] > wmax ? flt_1d_buf[i] : wmax;
+  }
+  float wmin = wmax * eps;
+  for (i=0; i<dim; i++) {
+    flt_1d_buf[i] = flt_1d_buf[i] < wmin ? 0 : (1 / flt_1d_buf[i]);
+  }
+  
+  for (i=0; i<dim; i++) {
+    for (j=0; j<dim; j++) {
+      matrix[i * stride + j] = matrix[i * stride + j] * flt_1d_buf[j];
+    }
+  }
+
+  // [nxn].[t(v)] 
+  for (i=0; i<dim; i++) {
+    fill_float_zero(dim, flt_1d_buf);
+    for (j=0; j<dim; j++) {
+      for (k=0; k<dim; k++) {
+	flt_1d_buf[j] += matrix[i * stride + k] * flt_2d_buf[j * dim + k];
+      }
+    }
+    for (j = 0; j < dim; j++) {
+      matrix[i * stride + j] = flt_1d_buf[j];
+    }
+  }
+  for (i=1; i<dim; ++i) {
+    for(j=0; j<i; ++j) {
+      const float tmp = matrix[i * stride + j];
+      matrix[i * stride + j] = matrix[j * stride + i];
+      matrix[j * stride + i] = tmp;
+    }
+  }
+  return 0;
+}
+*/
+
+boolerr_t invert_fmatrix_first_half(int32_t dim, int32_t stride, float* matrix, float* absdet_ptr, matrix_finvert_buf1_t* flt_1d_buf, float* flt_2d_buf) {
+  // w -> flt_1d_buf
+  // v -> flt_2d_buf
+  if (!svdcmp_float_c(dim, stride, matrix, flt_1d_buf, flt_2d_buf)) {
+    return 1;
+  }
+  float sv_prod = flt_1d_buf[0];
+  for (int32_t i=1; i<dim; ++i) {
+    sv_prod *= flt_1d_buf[i];
+  }
+  *absdet_ptr = fabsf(sv_prod);
+  return 0;
+}
+
+void invert_fmatrix_second_half(int32_t dim, int32_t stride, float* matrix, matrix_finvert_buf1_t* flt_1d_buf, float* flt_2d_buf) {
+  // w -> flt_1d_buf
+  // v -> flt_2d_buf
+  const float eps = 1e-24;
+  int32_t i;
+  int32_t j;
+  int32_t k;
+  // Look for singular values
+  float wmax = 0;
+  for (i=0; i<dim; i++) {
+    wmax = flt_1d_buf[i] > wmax ? flt_1d_buf[i] : wmax;
+  }
+  float wmin = wmax * eps;
+  for (i=0; i<dim; i++) {
+    flt_1d_buf[i] = flt_1d_buf[i] < wmin ? 0 : (1 / flt_1d_buf[i]);
+  }
+  
+  for (i=0; i<dim; i++) {
+    for (j=0; j<dim; j++) {
+      matrix[i * stride + j] = matrix[i * stride + j] * flt_1d_buf[j];
+    }
+  }
+
+  // [nxn].[t(v)] 
+  for (i=0; i<dim; i++) {
+    fill_float_zero(dim, flt_1d_buf);
+    for (j=0; j<dim; j++) {
+      for (k=0; k<dim; k++) {
+	flt_1d_buf[j] += matrix[i * stride + k] * flt_2d_buf[j * dim + k];
+      }
+    }
+    for (j = 0; j < dim; j++) {
+      matrix[i * stride + j] = flt_1d_buf[j];
+    }
+  }
+  for (i=1; i<dim; ++i) {
+    for(j=0; j<i; ++j) {
+      const float tmp = matrix[i * stride + j];
+      matrix[i * stride + j] = matrix[j * stride + i];
+      matrix[j * stride + i] = tmp;
+    }
+  }
+}
+#else // !NOLAPACK
+boolerr_t invert_matrix(__CLPK_integer dim, double* matrix, matrix_invert_buf1_t* int_1d_buf, double* dbl_2d_buf) {
+  // todo: dgetrf_/dgetri_ was more efficient than dpotrf_/dpotri_ on OS X the
+  // last time I checked, but is this still true?  re-benchmark this, and
+  // create a new symmetric-positive-definite-only function if appropriate.
+  __CLPK_integer info;
+  dgetrf_(&dim, &dim, matrix, &dim, int_1d_buf, &info);
+  if (info) {
+    return 1;
+  }
+  __CLPK_integer lwork = dim * dim;
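+  // lwork = dim * dim comfortably exceeds dgetri_'s documented minimum of dim.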
+  dgetri_(&dim, matrix, &dim, int_1d_buf, dbl_2d_buf, &lwork, &info);
+  assert(info == 0);
+  return 0;
+}
+
+boolerr_t invert_matrix_checked(__CLPK_integer dim, double* matrix, matrix_invert_buf1_t* int_1d_buf, double* dbl_2d_buf) {
+  // This used to fall back on PLINK 1.07's SVD-based implementation when the
+  // rcond estimate was too small, but in practice that just slowed things down
+  // without meaningfully improving inversion of nonsingular matrices.  So now
+  // this just exits a bit earlier, while leaving the old "binary search for
+  // the first row/column causing multicollinearity" logic to the caller.
+  char cc = '1';
+  double norm = dlange_(&cc, &dim, &dim, matrix, &dim, dbl_2d_buf);
+  __CLPK_integer info;
+  dgetrf_(&dim, &dim, matrix, &dim, int_1d_buf, &info);
+  if (info > 0) {
+    return 1;
+  }
+  double rcond;
+  dgecon_(&cc, &dim, matrix, &dim, &norm, &rcond, dbl_2d_buf, &(int_1d_buf[(uint32_t)dim]), &info);
+  if (rcond < kMatrixSingularRcond) {
+    return 1;
+  }
+  __CLPK_integer lwork = dim * dim;
+  dgetri_(&dim, matrix, &dim, int_1d_buf, dbl_2d_buf, &lwork, &info);
+  return 0;
+}
+
+/*
+boolerr_t invert_fmatrix_checked(__CLPK_integer dim, __CLPK_integer stride, float* matrix, float* absdet_ptr, matrix_finvert_buf1_t* int_1d_buf, float* flt_2d_buf) {
+  __CLPK_integer lwork = dim * stride;
+  char cc = '1';
+  float norm = slange_(&cc, &dim, &dim, matrix, &stride, flt_2d_buf);
+  __CLPK_integer info;
+  float rcond;
+  sgetrf_(&dim, &dim, matrix, &stride, int_1d_buf, &info);
+  if (info > 0) {
+    return 1;
+  }
+  sgecon_(&cc, &dim, matrix, &stride, &norm, &rcond, flt_2d_buf, &(int_1d_buf[dim]), &info);
+  if (rcond < kMatrixSingularRcond) {
+    return 1;
+  }
+  if (absdet_ptr) {
+    const uintptr_t stridep1 = stride + 1;
+    float det_u = matrix[0];
+    for (uintptr_t ulii = 1; ulii < ((uintptr_t)dim); ++ulii) {
+      det_u *= matrix[ulii * stridep1];
+    }
+    *absdet_ptr = fabsf(det_u);
+  }
+  sgetri_(&dim, matrix, &stride, int_1d_buf, flt_2d_buf, &lwork, &info);
+  return 0;
+}
+*/
+
+boolerr_t invert_fmatrix_first_half(__CLPK_integer dim, __CLPK_integer stride, float* matrix, float* absdet_ptr, matrix_finvert_buf1_t* int_1d_buf, float* flt_2d_buf) {
+  char cc = '1';
+  float norm = slange_(&cc, &dim, &dim, matrix, &stride, flt_2d_buf);
+  __CLPK_integer info;
+  sgetrf_(&dim, &dim, matrix, &stride, int_1d_buf, &info);
+  if (info > 0) {
+    return 1;
+  }
+  float rcond;
+  sgecon_(&cc, &dim, matrix, &stride, &norm, &rcond, flt_2d_buf, &(int_1d_buf[(uint32_t)dim]), &info);
+  if (rcond < kMatrixSingularRcond) {
+    return 1;
+  }
+  const uintptr_t stridep1 = stride + 1;
+  float det_u = matrix[0];
+  for (uintptr_t ulii = 1; ulii < ((uintptr_t)dim); ++ulii) {
+    det_u *= matrix[ulii * stridep1];
+  }
+  *absdet_ptr = fabsf(det_u);
+  return 0;
+}
+
+void invert_fmatrix_second_half(__CLPK_integer dim, __CLPK_integer stride, float* matrix, matrix_finvert_buf1_t* int_1d_buf, float* flt_2d_buf) {
+  __CLPK_integer lwork = dim * stride;
+  __CLPK_integer info;
+  sgetri_(&dim, matrix, &stride, int_1d_buf, flt_2d_buf, &lwork, &info);
+}
+#endif // !NOLAPACK
+
+void col_major_matrix_multiply(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer col2_ct, __CLPK_integer common_ct, double* outmatrix) {
+#ifdef NOLAPACK
+  uintptr_t row1_ct_l = row1_ct;
+  uintptr_t col2_ct_l = col2_ct;
+  uintptr_t common_ct_l = common_ct;
+  uintptr_t row_idx;
+  uintptr_t col_idx;
+  uintptr_t com_idx;
+  const double* dptr;
+  double dxx;
+  // not optimized
+  for (col_idx = 0; col_idx < col2_ct_l; col_idx++) {
+    for (row_idx = 0; row_idx < row1_ct_l; row_idx++) {
+      dxx = 0;
+      dptr = &(inmatrix2[col_idx * common_ct]);
+      for (com_idx = 0; com_idx < common_ct_l; com_idx++) {
+        dxx += (*dptr++) * inmatrix1[com_idx * row1_ct_l + row_idx];
+      }
+      *outmatrix++ = dxx;
+    }
+  }
+#else
+  #ifndef USE_CBLAS_XGEMM
+  char blas_char = 'N';
+  double dyy = 1;
+  double dzz = 0;
+  // const_cast
+  dgemm_(&blas_char, &blas_char, &row1_ct, &col2_ct, &common_ct, &dyy, (double*)((uintptr_t)inmatrix1), &row1_ct, (double*)((uintptr_t)inmatrix2), &common_ct, &dzz, outmatrix, &row1_ct);
+  #else
+  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, row1_ct, col2_ct, common_ct, 1.0, inmatrix1, row1_ct, inmatrix2, common_ct, 0.0, outmatrix, row1_ct);
+  #endif // USE_CBLAS_XGEMM
+#endif // !NOLAPACK
+}
+
+void col_major_matrix_multiply_strided_addassign(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer stride1, __CLPK_integer col2_ct, __CLPK_integer stride2, __CLPK_integer common_ct, __CLPK_integer stride3, double beta, double* outmatrix) {
+  // stride1 should be close to row1_ct
+  // stride2 should be close to common_ct
+  // output matrix uses stride3, which should be close to row1_ct
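+  // Computes outmatrix := beta * outmatrix + inmatrix1 * inmatrix2, all
+  // column-major.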
+#ifdef NOLAPACK
+  const uintptr_t row1_ct_l = row1_ct;
+  const uintptr_t col2_ct_l = col2_ct;
+  const uintptr_t common_ct_l = common_ct;
+  // not optimized, no beta == 0 special case
+  for (uintptr_t col_idx = 0; col_idx < col2_ct_l; ++col_idx) {
+    double* outmatrix_row_iter = &(outmatrix[col_idx * stride3]);
+    for (uintptr_t row_idx = 0; row_idx < row1_ct_l; ++row_idx) {
+      double cur_entry = 0.0;
+      const double* col2_iter = &(inmatrix2[col_idx * stride2]);
+      for (uintptr_t com_idx = 0; com_idx < common_ct_l; com_idx++) {
+        cur_entry += (*col2_iter++) * inmatrix1[com_idx * stride1 + row_idx];
+      }
+      *outmatrix_row_iter = (*outmatrix_row_iter) * beta + cur_entry;
+      ++outmatrix_row_iter;
+    }
+  }
+#else
+  #ifndef USE_CBLAS_XGEMM
+  char blas_char = 'N';
+  double alpha = 1;
+  // const_cast
+  dgemm_(&blas_char, &blas_char, &row1_ct, &col2_ct, &common_ct, &alpha, (double*)((uintptr_t)inmatrix1), &stride1, (double*)((uintptr_t)inmatrix2), &stride2, &beta, outmatrix, &stride3);
+  #else
+  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, row1_ct, col2_ct, common_ct, 1.0, inmatrix1, stride1, inmatrix2, stride2, beta, outmatrix, stride3);
+  #endif // USE_CBLAS_XGEMM
+#endif // !NOLAPACK
+}
+
+// er, should make this _addassign for consistency...
+void col_major_fmatrix_multiply_strided(const float* inmatrix1, const float* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer stride1, __CLPK_integer col2_ct, __CLPK_integer stride2, __CLPK_integer common_ct, __CLPK_integer stride3, float* outmatrix) {
+#ifdef NOLAPACK
+  const uintptr_t row1_ct_l = row1_ct;
+  const uintptr_t col2_ct_l = col2_ct;
+  const uintptr_t common_ct_l = common_ct;
+  // not optimized
+  for (uintptr_t col_idx = 0; col_idx < col2_ct_l; ++col_idx) {
+    float* outmatrix_row_iter = &(outmatrix[col_idx * stride3]);
+    for (uintptr_t row_idx = 0; row_idx < row1_ct_l; ++row_idx) {
+      float cur_entry = 0.0;
+      const float* col2_iter = &(inmatrix2[col_idx * stride2]);
+      for (uintptr_t com_idx = 0; com_idx < common_ct_l; com_idx++) {
+        cur_entry += (*col2_iter++) * inmatrix1[com_idx * stride1 + row_idx];
+      }
+      *outmatrix_row_iter++ = cur_entry;
+    }
+  }
+#else
+  #ifndef USE_CBLAS_XGEMM
+  char blas_char = 'N';
+  float fyy = 1;
+  float fzz = 0;
+  // const_cast
+  sgemm_(&blas_char, &blas_char, &row1_ct, &col2_ct, &common_ct, &fyy, (float*)((uintptr_t)inmatrix1), &stride1, (float*)((uintptr_t)inmatrix2), &stride2, &fzz, outmatrix, &stride3);
+  #else
+  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, row1_ct, col2_ct, common_ct, 1.0, inmatrix1, stride1, inmatrix2, stride2, 0.0, outmatrix, stride3);
+  #endif // USE_CBLAS_XGEMM
+#endif // !NOLAPACK
+}
+
+// Briefly experimented with trying to speed this up, didn't make any progress.
+void transpose_copy(const double* old_matrix, uint32_t old_maj, uint32_t new_maj, double* new_matrix_iter) {
+  for (uint32_t new_maj_idx = 0; new_maj_idx < new_maj; ++new_maj_idx) {
+    const double* old_matrix_col_iter = &(old_matrix[new_maj_idx]);
+    for (uint32_t old_maj_idx = 0; old_maj_idx < old_maj; ++old_maj_idx) {
+      *new_matrix_iter++ = *old_matrix_col_iter;
+      old_matrix_col_iter = &(old_matrix_col_iter[new_maj]);
+    }
+  }
+}
+
+void transpose_copy_float(const float* old_matrix, uint32_t old_maj, uint32_t new_maj, uint32_t new_maj_max, float* new_matrix_iter) {
+  // new_maj = in-memory stride of old_matrix rows
+  // new_maj_max = actual number of rows in new_matrix
+  // (distinction is necessary for SSE alignment)
+  for (uint32_t new_maj_idx = 0; new_maj_idx < new_maj_max; ++new_maj_idx) {
+    const float* old_matrix_col_iter = &(old_matrix[new_maj_idx]);
+    for (uint32_t old_maj_idx = 0; old_maj_idx < old_maj; ++old_maj_idx) {
+      *new_matrix_iter++ = *old_matrix_col_iter;
+      old_matrix_col_iter = &(old_matrix_col_iter[new_maj]);
+    }
+  }
+}
+
+/*
+__CLPK_integer qr_square_factor_float_get_lwork(uint32_t dim) {
+  __CLPK_integer dim_i = (__CLPK_integer)dim;
+  __CLPK_integer sgeqrf_lwork = -1;
+  float work[1];
+  __CLPK_integer info;
+  float dummy;
+  float dummy2;
+  sgeqrf_(&dim_i, &dim_i, &dummy, &dim_i, &dummy2, work, &sgeqrf_lwork, &info);
+  assert(info == 0);
+  sgeqrf_lwork = (__CLPK_integer)work[0];
+  __CLPK_integer sorgqr_lwork = -1;
+  sorgqr_(&dim_i, &dim_i, &dim_i, &dummy, &dim_i, &dummy2, work, &sorgqr_lwork, &info);
+  assert(info == 0);
+  sorgqr_lwork = (__CLPK_integer)work[0];
+  return MAXV(sgeqrf_lwork, sorgqr_lwork);
+}
+
+boolerr_t qr_square_factor_float(const float* input_matrix, uint32_t dim, uintptr_t stride, __CLPK_integer lwork, float* qq, float* r_determinant_ptr, float* tau_buf, float* work_buf) {
+  // only returns Q and, optionally, the product of R's diagonal entries (which
+  // should be the determinant of the original matrix).
+  // tau_buf should have space for dim entries
+  if (dim == stride) {
+    memcpy(qq, input_matrix, dim * ((uintptr_t)dim) * sizeof(float));
+  } else {
+    for (uintptr_t col_idx = 0; col_idx < dim; ++col_idx) {
+      memcpy(&(qq[col_idx * dim]), &(input_matrix[col_idx * stride]), dim * sizeof(float));
+    }
+  }
+  __CLPK_integer dim_i = (__CLPK_integer)dim;
+  __CLPK_integer info;
+  sgeqrf_(&dim_i, &dim_i, qq, &dim_i, tau_buf, work_buf, &lwork, &info);
+  if (info != 0) {
+    return 1;
+  }
+  if (r_determinant_ptr) {
+    const uintptr_t dimp1 = dim + 1;
+    float prod = qq[0];
+    for (uintptr_t col_idx = 1; col_idx < dim; ++col_idx) {
+      prod *= qq[col_idx * dimp1];
+    }
+    *r_determinant_ptr = prod;
+  }
+  sorgqr_(&dim_i, &dim_i, &dim_i, qq, &dim_i, tau_buf, work_buf, &lwork, &info);
+  if (info != 0) {
+    return 1;
+  }
+  return 0;
+}
+*/
+
+// A(A^T), where A is row-major; result is dim x dim
+// ONLY UPDATES LOWER TRIANGLE OF result[].
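+// (In the LAPACK branch, dsyrk's column-major upper triangle occupies the same
+// memory as the row-major lower triangle.)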
+void multiply_self_transpose(double* input_matrix, uint32_t dim, uint32_t col_ct, double* result) {
+#ifdef NOLAPACK
+  for (uintptr_t row1_idx = 0; row1_idx < dim; ++row1_idx) {
+    const double* pred_row1 = &(input_matrix[row1_idx * col_ct]);
+    double* result_row = &(result[row1_idx * dim]);
+    for (uintptr_t row2_idx = 0; row2_idx <= row1_idx; ++row2_idx) {
+      const double* pred_row2 = &(input_matrix[row2_idx * col_ct]);
+      double cur_dotprod = 0.0;
+      for (uint32_t col_idx = 0; col_idx < col_ct; ++col_idx) {
+	cur_dotprod += pred_row1[col_idx] * pred_row2[col_idx];
+      }
+      result_row[row2_idx] = cur_dotprod;
+    }
+  }
+#else
+  #ifndef USE_CBLAS_XGEMM
+  char uplo = 'U';
+  char trans = 'T';
+  __CLPK_integer tmp_n = dim;
+  __CLPK_integer tmp_k = col_ct;
+  double alpha = 1.0;
+  double beta = 0.0;
+  dsyrk_(&uplo, &trans, &tmp_n, &tmp_k, &alpha, input_matrix, &tmp_k, &beta, result, &tmp_n);
+  #else
+  cblas_dsyrk(CblasColMajor, CblasUpper, CblasTrans, dim, col_ct, 1.0, input_matrix, col_ct, 0.0, result, dim);
+  #endif
+#endif
+}
+
+void transpose_multiply_self_incr(double* input_part, uint32_t dim, uint32_t partial_row_ct, double* result) {
+#ifdef NOLAPACK
+  // friends do not let friends use this implementation
+  const uintptr_t dim_l = dim;
+  const uintptr_t row_ct_l = partial_row_ct;
+  for (uintptr_t idx1 = 0; idx1 < dim_l; ++idx1) {
+    const double* col1 = &(input_part[idx1]);
+    double* write_iter = &(result[idx1 * dim_l]);
+    for (uintptr_t idx2 = 0; idx2 <= idx1; ++idx2) {
+      double cur_dotprod = *write_iter;
+      const double* col2 = &(input_part[idx2]);
+      for (uintptr_t row_idx = 0; row_idx < row_ct_l; ++row_idx) {
+	cur_dotprod += col1[row_idx * dim_l] * col2[row_idx * dim_l];
+      }
+      *write_iter = cur_dotprod;
+      ++write_iter;
+    }
+  }
+#else
+  #ifndef USE_CBLAS_XGEMM
+  char uplo = 'U';
+  char trans = 'N';
+  __CLPK_integer tmp_n = dim;
+  __CLPK_integer tmp_k = partial_row_ct;
+  double alpha = 1.0;
+  double beta = 1.0;
+  dsyrk_(&uplo, &trans, &tmp_n, &tmp_k, &alpha, input_part, &tmp_n, &beta, result, &tmp_n);
+  #else
+  cblas_dsyrk(CblasColMajor, CblasUpper, CblasNoTrans, dim, partial_row_ct, 1.0, input_part, dim, 1.0, result, dim);
+  #endif
+#endif // !NOLAPACK
+}
+
+#ifndef NOLAPACK
+boolerr_t get_svd_rect_lwork(uint32_t major_ct, uint32_t minor_ct, __CLPK_integer* lwork_ptr) {
+  char jobu = 'S';
+  char jobvt = 'O';
+  __CLPK_integer tmp_m = minor_ct;
+  __CLPK_integer tmp_n = major_ct;
+  __CLPK_integer wkspace_size = -1;
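+  // wkspace_size == -1 requests a LAPACK workspace-size query: dgesvd_ stores
+  // the optimal lwork in wkspace_size_d instead of performing an SVD.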
+  double wkspace_size_d;
+  __CLPK_integer info;
+  dgesvd_(&jobu, &jobvt, &tmp_m, &tmp_n, nullptr, &tmp_m, nullptr, nullptr, &tmp_m, nullptr, &tmp_m, &wkspace_size_d, &wkspace_size, &info);
+  #ifdef LAPACK_ILP64
+  if (info) {
+    return 1;
+  }
+  #else
+  if (info || (wkspace_size_d > 2147483640.0)) {
+    return 1;
+  }
+  #endif
+  *lwork_ptr = round_up_pow2((__CLPK_integer)wkspace_size_d, kCacheline / sizeof(double));
+  return 0;
+}
+
+boolerr_t svd_rect(uint32_t major_ct, uint32_t minor_ct, __CLPK_integer lwork, double* matrix, double* ss, unsigned char* svd_rect_wkspace) {
+  double* work = (double*)svd_rect_wkspace;
+  double* vv_buf = &(work[lwork]);
+  char jobu = 'S';
+  char jobvt = 'O';
+  __CLPK_integer tmp_m = minor_ct;
+  __CLPK_integer tmp_n = major_ct;
+  __CLPK_integer info;
+  dgesvd_(&jobu, &jobvt, &tmp_m, &tmp_n, matrix, &tmp_m, ss, vv_buf, &tmp_m, nullptr, &tmp_m, work, &lwork, &info);
+  return (info != 0);
+}
+
+// dsyevr_ takes ~30% less time than dsyevd_ on OS X dev machine.  todo: retest
+// for Linux 64-bit MKL.
+boolerr_t get_extract_eigvecs_lworks(uint32_t dim, uint32_t pc_ct, __CLPK_integer* lwork_ptr, __CLPK_integer* liwork_ptr, uintptr_t* wkspace_byte_ct_ptr) {
+  char jobz = 'V';
+  char range = 'I';
+  char uplo = 'U';
+  __CLPK_integer tmp_n = dim;
+  __CLPK_integer il = dim + 1 - pc_ct;
+  __CLPK_integer iu = dim;
+  double abstol = -1.0;
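+  // range == 'I' with [il, iu] = [dim + 1 - pc_ct, dim] selects the pc_ct
+  // largest eigenvalues; abstol <= 0 requests the default tolerance.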
+  __CLPK_integer lwork_dummy = -1;
+  double lwork_d;
+  __CLPK_integer liwork;
+  __CLPK_integer info;
+  dsyevr_(&jobz, &range, &uplo, &tmp_n, nullptr, &tmp_n, nullptr, nullptr, &il, &iu, &abstol, nullptr, nullptr, nullptr, &tmp_n, nullptr, &lwork_d, &lwork_dummy, &liwork, &lwork_dummy, &info);
+#ifdef LAPACK_ILP64
+  if (info) {
+    return 1;
+  }
+#else
+  if (info || (lwork_d > 2147483640.0)) {
+    return 1;
+  }
+#endif
+  const __CLPK_integer lwork = round_up_pow2((__CLPK_integer)lwork_d, kCacheline / sizeof(double));
+  liwork = round_up_pow2(liwork, kCacheline / sizeof(__CLPK_integer));
+  *lwork_ptr = lwork;
+  *liwork_ptr = liwork;
+  *wkspace_byte_ct_ptr = lwork * sizeof(double) + liwork * sizeof(__CLPK_integer) + round_up_pow2(2 * dim * sizeof(__CLPK_integer), kCacheline);
+  return 0;
+}
+
+boolerr_t extract_eigvecs(uint32_t dim, uint32_t pc_ct, __CLPK_integer lwork, __CLPK_integer liwork, double* matrix, double* eigvals, double* reverse_eigvecs, unsigned char* extract_eigvecs_wkspace) {
+  char jobz = 'V';
+  char range = 'I';
+  char uplo = 'U';
+  __CLPK_integer tmp_n = dim;
+  __CLPK_integer il = dim + 1 - pc_ct;
+  __CLPK_integer iu = dim;
+  double abstol = -1.0;
+  __CLPK_integer out_m;
+  double* work = (double*)extract_eigvecs_wkspace;
+  __CLPK_integer* iwork = (__CLPK_integer*)(&(work[lwork]));
+  __CLPK_integer* isuppz = &(iwork[liwork]);
+  __CLPK_integer info;
+  // vl and vu may actually be referenced in some implementations
+  double dummy_d = 0.0;
+  dsyevr_(&jobz, &range, &uplo, &tmp_n, matrix, &tmp_n, &dummy_d, &dummy_d, &il, &iu, &abstol, &out_m, eigvals, reverse_eigvecs, &tmp_n, isuppz, work, &lwork, iwork, &liwork, &info);
+  return (info != 0);
+}
+#endif // !NOLAPACK
+
+// can't use this, since we need (X^T X)^{-1} for the validParameters() check
+/*
+void linear_regression_first_half(uint32_t sample_ct, uint32_t predictor_ct, double* pheno_d, double* predictors_pmaj, double* xt_y, double* xtx) {
+  // Note that only the lower triangle of X^T X is filled.  (well, upper
+  // triangle in column-major Fortran notation.)
+  multiply_self_transpose(predictors_pmaj, predictor_ct, sample_ct, xtx);
+  row_major_matrix_multiply(predictors_pmaj, pheno_d, predictor_ct, 1, sample_ct, xt_y);
+}
+
+#ifndef NOLAPACK
+boolerr_t linear_regression_second_half(const double* xt_y, uint32_t predictor_ct, double* xtx_destroy, double* fitted_coefs) {
+  // See e.g. wls.c in Alex Blocker's go-lm code
+  // (https://github.com/awblocker/go-lm ).
+  char uplo = 'U';
+  __CLPK_integer tmp_n = predictor_ct;
+  memcpy(fitted_coefs, xt_y, predictor_ct * sizeof(double));
+  __CLPK_integer nrhs = 1;
+  __CLPK_integer info;
+  dposv_(&uplo, &tmp_n, &nrhs, xtx_destroy, &tmp_n, fitted_coefs, &tmp_n, &info);
+  return (info != 0);
+}
+#endif // !NOLAPACK
+*/
+
+// todo: support nrhs > 1 when permutation test implemented
+boolerr_t linear_regression_inv(const double* pheno_d, double* predictors_pmaj, uint32_t predictor_ct, uint32_t sample_ct, double* fitted_coefs, double* xtx_inv, double* xt_y, __maybe_unused matrix_invert_buf1_t* mi_buf, __maybe_unused double* dbl_2d_buf) {
+  multiply_self_transpose(predictors_pmaj, predictor_ct, sample_ct, xtx_inv);
+  row_major_matrix_multiply(predictors_pmaj, pheno_d, predictor_ct, 1, sample_ct, xt_y);
+#ifdef NOLAPACK
+  // Need to fill the upper triangle of xtx_inv, since
+  // multiply_self_transpose() above only filled the lower triangle.
+  for (uintptr_t row_idx = 0; row_idx < predictor_ct; ++row_idx) {
+    double* cur_row = &(xtx_inv[row_idx * predictor_ct]);
+    double* cur_col = &(xtx_inv[row_idx]);
+    for (uintptr_t col_idx = row_idx + 1; col_idx < predictor_ct; ++col_idx) {
+      cur_row[col_idx] = cur_col[col_idx * predictor_ct];
+    }
+  }
+  if (invert_matrix(predictor_ct, xtx_inv, mi_buf, dbl_2d_buf)) {
+    return 1;
+  }
+  row_major_matrix_multiply(xtx_inv, xt_y, predictor_ct, 1, predictor_ct, fitted_coefs);
+  return 0;
+#else
+  char uplo = 'U';
+  __CLPK_integer tmp_n = predictor_ct;
+  __CLPK_integer info;
+  dpotrf_(&uplo, &tmp_n, xtx_inv, &tmp_n, &info);
+  if (info) {
+    return 1;
+  }
+  memcpy(fitted_coefs, xt_y, predictor_ct * sizeof(double));
+  __CLPK_integer nrhs = 1;
+  dpotrs_(&uplo, &tmp_n, &nrhs, xtx_inv, &tmp_n, fitted_coefs, &tmp_n, &info);
+  assert(!info);
+  dpotri_(&uplo, &tmp_n, xtx_inv, &tmp_n, &info);
+  return (info != 0);
+#endif // !NOLAPACK
+}
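+
+// Post-processing sketch (illustrative; sigma2/se_j are not names used in
+// this codebase): the retained (X^T X)^{-1} yields the usual coefficient
+// standard errors,
+//   sigma2 = RSS / (sample_ct - predictor_ct);
+//   se_j = sqrt(sigma2 * xtx_inv[j * predictor_ct + j]);
+// bearing in mind that after dpotri_(), only one triangle of xtx_inv (plus
+// the diagonal) is guaranteed to be filled.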
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/plink2_matrix.h b/plink2_matrix.h
new file mode 100644
index 0000000..0999734
--- /dev/null
+++ b/plink2_matrix.h
@@ -0,0 +1,178 @@
+#ifndef __PLINK2_MATRIX_H__
+#define __PLINK2_MATRIX_H__
+
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+// Wrappers for frequent LAPACK calls (sometimes with no-LAPACK fallbacks).
+// Now supports MKL backend.
+
+// todo: allow this to take advantage of 64-bit integer LAPACK.  As of this
+// writing, it's available on Amazon EC2 64-bit Linux instances, but I can't
+// find it for Windows.  (And even if OS X vecLib adds it, we can't use it
+// there anytime soon because static linking is not an option.)
+
+#include "plink2_common.h"
+
+#ifdef NOLAPACK
+
+  typedef double matrix_invert_buf1_t;
+  CONSTU31(kMatrixInvertBuf1ElemAlloc, 2 * sizeof(double));
+  CONSTU31(kMatrixInvertBuf1CheckedAlloc, 2 * sizeof(double));
+  #define __CLPK_integer int
+  typedef float matrix_finvert_buf1_t;
+  CONSTU31(kMatrixFinvertBuf1CheckedAlloc, 2 * sizeof(float));
+
+#else // not NOLAPACK
+  #ifdef __APPLE__
+    #include <Accelerate/Accelerate.h>
+  #endif
+
+  #ifndef __APPLE__
+
+    #ifdef __cplusplus
+extern "C" {
+    #endif
+  typedef double __CLPK_doublereal;
+    #ifdef __LP64__
+      #ifdef LAPACK_ILP64
+  typedef long long __CLPK_integer;
+      #else
+  typedef int32_t __CLPK_integer;
+      #endif
+    #else
+      #ifdef LAPACK_ILP64
+	#error "Invalid compile flags."
+      #else
+	#ifdef _WIN32
+  // probably don't need this?
+  typedef int32_t __CLPK_integer;
+	#else
+  typedef long int __CLPK_integer;
+        #endif
+      #endif
+    #endif // !__LP64__
+
+  void xerbla_(void);
+    #ifdef __cplusplus
+}
+    #endif
+
+  #endif // !__APPLE__
+
+  typedef __CLPK_integer matrix_invert_buf1_t;
+  // need to be careful about >= 2^32?
+  CONSTU31(kMatrixInvertBuf1ElemAlloc, sizeof(__CLPK_integer));
+  // invert_matrix_checked() usually requires a larger buffer
+  CONSTU31(kMatrixInvertBuf1CheckedAlloc, 2 * sizeof(__CLPK_integer));
+  typedef __CLPK_integer matrix_finvert_buf1_t;
+  CONSTU31(kMatrixFinvertBuf1CheckedAlloc, 2 * sizeof(__CLPK_integer));
+
+#endif // NOLAPACK
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
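+// reciprocal-condition-number threshold: matrices which appear to be more
+// ill-conditioned than this are treated as singular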
+static const double kMatrixSingularRcond = 1e-14;
+
+#ifdef NOLAPACK
+boolerr_t invert_matrix(int32_t dim, double* matrix, matrix_invert_buf1_t* dbl_1d_buf, double* dbl_2d_buf);
+
+HEADER_INLINE boolerr_t invert_matrix_checked(int32_t dim, double* matrix, matrix_invert_buf1_t* dbl_1d_buf, double* dbl_2d_buf) {
+  return invert_matrix(dim, matrix, dbl_1d_buf, dbl_2d_buf);
+}
+
+// if we're using float32s instead of float64s, we care enough about low-level
+// details that this split interface makes sense.
+// first half computes either LU or singular value decomposition, and
+//   determinant (could make latter optional)
+// second half actually inverts matrix, assuming 1d_buf and 2d_buf have results
+//   from first half
+boolerr_t invert_fmatrix_first_half(int32_t dim, int32_t stride, float* matrix, float* absdet_ptr, matrix_finvert_buf1_t* flt_1d_buf, float* flt_2d_buf);
+
+void invert_fmatrix_second_half(int32_t dim, int32_t stride, float* matrix, matrix_finvert_buf1_t* flt_1d_buf, float* flt_2d_buf);
+#else
+boolerr_t invert_matrix(__CLPK_integer dim, double* matrix, matrix_invert_buf1_t* int_1d_buf, double* dbl_2d_buf);
+
+boolerr_t invert_matrix_checked(__CLPK_integer dim, double* matrix, matrix_invert_buf1_t* int_1d_buf, double* dbl_2d_buf);
+
+boolerr_t invert_fmatrix_first_half(__CLPK_integer dim, __CLPK_integer stride, float* matrix, float* absdet_ptr, matrix_finvert_buf1_t* int_1d_buf, float* flt_2d_buf);
+
+void invert_fmatrix_second_half(__CLPK_integer dim, __CLPK_integer stride, float* matrix, matrix_finvert_buf1_t* int_1d_buf, float* flt_2d_buf);
+#endif
+
+void col_major_matrix_multiply(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer col2_ct, __CLPK_integer common_ct, double* outmatrix);
+
+HEADER_INLINE void row_major_matrix_multiply(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer col2_ct, __CLPK_integer common_ct, double* outmatrix) {
+  return col_major_matrix_multiply(inmatrix2, inmatrix1, col2_ct, row1_ct, common_ct, outmatrix);
+}
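+
+// (This works because a row-major matrix reinterpreted as column-major is
+// its transpose, and (B^T)(A^T) = (AB)^T: swapping the operands and
+// dimensions makes the column-major routine emit the row-major product.)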
+
+// this is essentially a full-blown dgemm wrapper, only missing the alpha
+// parameter now
+void col_major_matrix_multiply_strided_addassign(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer stride1, __CLPK_integer col2_ct, __CLPK_integer stride2, __CLPK_integer common_ct, __CLPK_integer stride3, double beta, double* outmatrix);
+
+HEADER_INLINE void row_major_matrix_multiply_incr(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer col2_ct, __CLPK_integer common_ct, double* outmatrix) {
+  return col_major_matrix_multiply_strided_addassign(inmatrix2, inmatrix1, col2_ct, col2_ct, row1_ct, common_ct, common_ct, col2_ct, 1.0, outmatrix);
+}
+
+HEADER_INLINE void row_major_matrix_multiply_strided(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer stride1, __CLPK_integer col2_ct, __CLPK_integer stride2, __CLPK_integer common_ct, __CLPK_integer stride3, double* outmatrix) {
+  // stride1 should be close to common_ct
+  // stride2 should be close to col2_ct
+  // output matrix uses stride3, which should be close to col2_ct
+  return col_major_matrix_multiply_strided_addassign(inmatrix2, inmatrix1, col2_ct, stride2, row1_ct, stride1, common_ct, stride3, 0.0, outmatrix);
+}
+
+HEADER_INLINE void row_major_matrix_multiply_strided_incr(const double* inmatrix1, const double* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer stride1, __CLPK_integer col2_ct, __CLPK_integer stride2, __CLPK_integer common_ct, __CLPK_integer stride3, double* outmatrix) {
+  return col_major_matrix_multiply_strided_addassign(inmatrix2, inmatrix1, col2_ct, stride2, row1_ct, stride1, common_ct, stride3, 1.0, outmatrix);
+}
+
+void col_major_fmatrix_multiply_strided(const float* inmatrix1, const float* inmatrix2, __CLPK_integer row1_ct, __CLPK_integer stride1, __CLPK_integer col2_ct, __CLPK_integer stride2, __CLPK_integer common_ct, __CLPK_integer stride3, float* outmatrix);
+
+void transpose_copy(const double* old_matrix, uint32_t old_maj, uint32_t new_maj, double* new_matrix_iter);
+
+void transpose_copy_float(const float* old_matrix, uint32_t old_maj, uint32_t new_maj, uint32_t new_maj_max, float* new_matrix_iter);
+
+
+// A(A^T), where A is row-major; result is dim x dim
+// ONLY UPDATES LOWER TRIANGLE OF result[].
+void multiply_self_transpose(double* input_matrix, uint32_t dim, uint32_t col_ct, double* result);
+
+// (A^T)A
+void transpose_multiply_self_incr(double* input_part, uint32_t dim, uint32_t partial_row_ct, double* result);
+
+#ifndef NOLAPACK
+boolerr_t get_svd_rect_lwork(uint32_t major_ct, uint32_t minor_ct, __CLPK_integer* lwork_ptr);
+
+// currently a wrapper for dgesvd_().
+boolerr_t svd_rect(uint32_t major_ct, uint32_t minor_ct, __CLPK_integer lwork, double* matrix, double* ss, unsigned char* svd_rect_wkspace);
+
+boolerr_t get_extract_eigvecs_lworks(uint32_t dim, uint32_t pc_ct, __CLPK_integer* lwork_ptr, __CLPK_integer* liwork_ptr, uintptr_t* wkspace_byte_ct_ptr);
+
+// currently a wrapper for dsyevr_().
+// reverse_eigvecs is eigenvector-major, but the vectors are in order of
+// *increasing* eigenvalue.
+boolerr_t extract_eigvecs(uint32_t dim, uint32_t pc_ct, __CLPK_integer lwork, __CLPK_integer liwork, double* matrix, double* eigvals, double* reverse_eigvecs, unsigned char* extract_eigvecs_wkspace);
+#endif
+
+boolerr_t linear_regression_inv(const double* pheno_d, double* predictors_pmaj, uint32_t predictor_ct, uint32_t sample_ct, double* fitted_coefs, double* xtx_inv, double* xt_y, matrix_invert_buf1_t* mi_buf, double* dbl_2d_buf);
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+
+#endif // __PLINK2_MATRIX_H__
diff --git a/plink2_matrix_calc.cpp b/plink2_matrix_calc.cpp
new file mode 100644
index 0000000..a173a54
--- /dev/null
+++ b/plink2_matrix_calc.cpp
@@ -0,0 +1,4221 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_compress_stream.h"
+#include "plink2_matrix.h"
+#include "plink2_matrix_calc.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+void init_score(score_info_t* score_info_ptr) {
+  score_info_ptr->flags = kfScore0;
+  score_info_ptr->varid_col_p1 = 1;
+  score_info_ptr->allele_col_p1 = 0;
+  score_info_ptr->input_fname = nullptr;
+  init_range_list(&(score_info_ptr->input_col_idx_range_list));
+}
+
+void cleanup_score(score_info_t* score_info_ptr) {
+  free_cond(score_info_ptr->input_fname);
+  cleanup_range_list(&(score_info_ptr->input_col_idx_range_list));
+}
+
+
+uint32_t triangle_divide(int64_t cur_prod, int32_t modif) {
+  // return smallest integer vv for which (vv * (vv + modif)) is no smaller
+  // than cur_prod, and neither term in the product is negative.  (Note the
+  // lack of a divide by two; cur_prod should also be double its "true" value
+  // as a result.)
+  int64_t vv;
+  if (cur_prod == 0) {
+    if (modif < 0) {
+      return -modif;
+    }
+    return 0;
+  }
+  vv = (int64_t)sqrt((double)cur_prod);
+  while ((vv - 1) * (vv + modif - 1) >= cur_prod) {
+    vv--;
+  }
+  while (vv * (vv + modif) < cur_prod) {
+    vv++;
+  }
+  return vv;
+}
+
+void parallel_bounds(uint32_t ct, int32_t start, uint32_t parallel_idx, uint32_t parallel_tot, int32_t* __restrict bound_start_ptr, int32_t* __restrict bound_end_ptr) {
+  int32_t modif = 1 - start * 2;
+  int64_t ct_tot = ((int64_t)ct) * (ct + modif);
+  *bound_start_ptr = triangle_divide((ct_tot * parallel_idx) / parallel_tot, modif);
+  *bound_end_ptr = triangle_divide((ct_tot * (parallel_idx + 1)) / parallel_tot, modif);
+}
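+
+// Worked example: ct = 10, start = 0 (lower triangle including the diagonal:
+// 55 cells, with row i holding i+1 of them), parallel_tot = 2.  Then ct_tot
+// = 110 and the boundary is triangle_divide(55, 1) = 7, since 7 * 8 = 56 >=
+// 55 > 42 = 6 * 7; piece 0 gets rows [0, 7) (28 cells) and piece 1 gets rows
+// [7, 10) (27 cells).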
+
+// set align to 1 for no alignment
+void triangle_fill(uint32_t ct, uint32_t piece_ct, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t start, uint32_t align, uint32_t* target_arr) {
+  int32_t modif = 1 - start * 2;
+  int64_t cur_prod;
+  int32_t lbound;
+  int32_t ubound;
+  uint32_t uii;
+  uint32_t align_m1;
+  parallel_bounds(ct, start, parallel_idx, parallel_tot, &lbound, &ubound);
+  // for power-of-2 y, x(x+1)/2 is divisible by y iff (x % (2y)) is 0 or
+  // (2y - 1).
+  align *= 2;
+  align_m1 = align - 1;
+  target_arr[0] = lbound;
+  target_arr[piece_ct] = ubound;
+  cur_prod = ((int64_t)lbound) * (lbound + modif);
+  const int64_t ct_tr = (((int64_t)ubound) * (ubound + modif) - cur_prod) / piece_ct;
+  for (uint32_t piece_idx = 1; piece_idx < piece_ct; ++piece_idx) {
+    cur_prod += ct_tr;
+    lbound = triangle_divide(cur_prod, modif);
+    uii = (lbound - ((int32_t)start)) & align_m1;
+    if ((uii) && (uii != align_m1)) {
+      lbound = start + ((lbound - ((int32_t)start)) | align_m1);
+    }
+    // lack of this check caused a nasty bug earlier
+    if (((uint32_t)lbound) > ct) {
+      lbound = ct;
+    }
+    target_arr[piece_idx] = lbound;
+  }
+}
+
+pglerr_t kinship_prune_destructive(uintptr_t* kinship_table, uintptr_t* sample_include, uint32_t* sample_ct_ptr) {
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uintptr_t orig_sample_ct = *sample_ct_ptr;
+    const uintptr_t orig_sample_ctl = BITCT_TO_WORDCT(orig_sample_ct);
+    uintptr_t* sample_include_collapsed_nz;
+    uintptr_t* sample_remove_collapsed;
+    uint32_t* vertex_degree;
+    if (bigstack_calloc_ul(orig_sample_ctl, &sample_include_collapsed_nz) ||
+	bigstack_calloc_ul(orig_sample_ctl, &sample_remove_collapsed) ||
+	bigstack_alloc_ui(orig_sample_ct, &vertex_degree)) {
+      goto kinship_prune_destructive_ret_NOMEM;
+    }
+    // 1. count the number of constraints for each remaining sample
+    uint32_t degree_1_vertex_ct = 0;
+    for (uint32_t sample_idx = 0; sample_idx < orig_sample_ct; ++sample_idx) {
+      const uintptr_t woffset = sample_idx * orig_sample_ctl;
+      const uintptr_t* read_iter1 = &(kinship_table[woffset]);
+      // don't currently guarantee vector-alignment of kinship_table rows, so
+      // can't use popcount_longs().  (change this?)
+      uint32_t cur_degree = 0;
+      for (uint32_t widx = 0; widx < orig_sample_ctl; ++widx) {
+	const uintptr_t cur_word = *read_iter1++;
+	cur_degree += popcount_long(cur_word);
+      }
+      if (cur_degree) {
+	vertex_degree[sample_idx] = cur_degree;
+        degree_1_vertex_ct += (cur_degree == 1);
+	SET_BIT(sample_idx, sample_include_collapsed_nz);
+      }
+    }
+    uint32_t cur_sample_nz_ct = popcount_longs(sample_include_collapsed_nz, orig_sample_ctl);
+    // 2. as long as edges remain,
+    //    a. remove partner of first degree-one vertex, if such a vertex exists
+    //    b. otherwise, remove first maximal-degree vertex
+    //    (similar to plink 1.9 rel_cutoff_batch(), but the data structure is
+    //    not triangular, since more speed is needed here)
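+    //    (step 2a is always safe: one endpoint of that edge must be removed
+    //    anyway, and removing the partner resolves at least as many
+    //    remaining constraints as removing the degree-one vertex itself.)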
+    while (cur_sample_nz_ct) {
+      uint32_t prune_uidx;
+      uint32_t cur_degree;
+      if (degree_1_vertex_ct) {
+	uint32_t degree_1_vertex_uidx = 0;
+	while (1) {
+	  // sparse
+	  degree_1_vertex_uidx = next_set_unsafe(sample_include_collapsed_nz, degree_1_vertex_uidx);
+	  if (vertex_degree[degree_1_vertex_uidx] == 1) {
+	    break;
+	  }
+	  ++degree_1_vertex_uidx;
+	}
+	// find partner
+	prune_uidx = next_set_unsafe(&(kinship_table[degree_1_vertex_uidx * orig_sample_ctl]), 0);
+	cur_degree = vertex_degree[prune_uidx];
+      } else {
+	uint32_t sample_uidx = next_set_unsafe(sample_include_collapsed_nz, 0);
+	cur_degree = vertex_degree[sample_uidx];
+	prune_uidx = sample_uidx;
+	for (uint32_t sample_idx = 1; sample_idx < cur_sample_nz_ct; ++sample_idx) {
+	  // sparse
+	  sample_uidx = next_set_unsafe(sample_include_collapsed_nz, sample_uidx + 1);
+	  const uint32_t new_degree = vertex_degree[sample_uidx];
+	  if (new_degree > cur_degree) {
+	    cur_degree = new_degree;
+	    prune_uidx = sample_uidx;
+	  }
+	}
+      }
+      // remove row/column
+      uintptr_t* cur_kinship_col = &(kinship_table[prune_uidx / kBitsPerWord]);
+      const uintptr_t kinship_col_mask = ~(k1LU << (prune_uidx % kBitsPerWord));
+      uintptr_t* cur_kinship_row = &(kinship_table[prune_uidx * orig_sample_ctl]);
+      uint32_t sample_uidx = 0;
+      for (uint32_t partner_idx = 0; partner_idx < cur_degree; ++partner_idx, ++sample_uidx) {
+	// sparse
+	sample_uidx = next_set_unsafe(cur_kinship_row, sample_uidx);
+	const uint32_t new_degree = vertex_degree[sample_uidx] - 1;
+	if (!new_degree) {
+	  CLEAR_BIT(sample_uidx, sample_include_collapsed_nz);
+	  --degree_1_vertex_ct;
+	  --cur_sample_nz_ct;
+	  // unnecessary to write to kinship_table[] or vertex_degree[]
+	} else {
+	  cur_kinship_col[sample_uidx * orig_sample_ctl] &= kinship_col_mask;
+	  degree_1_vertex_ct += (new_degree == 1);
+	  vertex_degree[sample_uidx] = new_degree;
+	}
+      }
+      if (vertex_degree[prune_uidx] == 1) {
+	--degree_1_vertex_ct;
+      }
+      sample_remove_collapsed[prune_uidx / kBitsPerWord] |= ~kinship_col_mask;
+      sample_include_collapsed_nz[prune_uidx / kBitsPerWord] &= kinship_col_mask;
+      // unnecessary to update current kinship_table[] row
+      --cur_sample_nz_ct;
+    }
+    uint32_t sample_ct = orig_sample_ct;
+    uint32_t sample_uidx = 0;
+    for (uint32_t sample_idx = 0; sample_idx < orig_sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_unsafe_ck(sample_include, &sample_uidx);
+      if (IS_SET(sample_remove_collapsed, sample_idx)) {
+	CLEAR_BIT(sample_uidx, sample_include);
+	--sample_ct;
+      }
+    }
+    *sample_ct_ptr = sample_ct;
+  }
+  while (0) {
+  kinship_prune_destructive_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  }
+  return reterr;
+}
+
+pglerr_t king_cutoff_batch(const char* sample_ids, const char* sids, uint32_t raw_sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, double king_cutoff, uintptr_t* sample_include, char* king_cutoff_fprefix, uint32_t* sample_ct_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  gzFile gz_infile = nullptr;
+  FILE* binfile = nullptr;
+  uintptr_t line_idx = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    uint32_t sample_ct = *sample_ct_ptr;
+    const uint32_t orig_sample_ctl = BITCT_TO_WORDCT(sample_ct);
+    uintptr_t* kinship_table;
+    if (bigstack_calloc_ul(sample_ct * orig_sample_ctl, &kinship_table)) {
+      goto king_cutoff_batch_ret_NOMEM;
+    }
+    
+    char* fprefix_end = &(king_cutoff_fprefix[strlen(king_cutoff_fprefix)]);
+    strcpy(fprefix_end, ".king.id");
+    reterr = gzopen_read_checked(king_cutoff_fprefix, &gz_infile);
+    if (reterr) {
+      goto king_cutoff_batch_ret_1;
+    }
+
+    uint32_t* sample_uidx_to_king_uidx;
+    char* loadbuf;
+    if (bigstack_alloc_ui(raw_sample_ct, &sample_uidx_to_king_uidx) ||
+	bigstack_alloc_c(kMaxMediumLine, &loadbuf)) {
+      goto king_cutoff_batch_ret_NOMEM;
+    }
+    ++line_idx;
+    loadbuf[kMaxMediumLine - 1] = ' ';
+    if (!gzgets(gz_infile, loadbuf, kMaxMediumLine)) {
+      if (!gzeof(gz_infile)) {
+	goto king_cutoff_batch_ret_READ_FAIL;
+      }
+      logerrprint("Error: Empty --king-cutoff ID file.\n");
+      goto king_cutoff_batch_ret_MALFORMED_INPUT;
+    }
+    if (!loadbuf[kMaxMediumLine - 1]) {
+      goto king_cutoff_batch_ret_LONG_LINE;
+    }
+    char* loadbuf_first_token = skip_initial_spaces(loadbuf);
+    if (is_eoln_kns(*loadbuf_first_token)) {
+      goto king_cutoff_batch_ret_MISSING_TOKENS;
+    }
+    const xid_mode_t xid_mode = (sids && next_token_mult(loadbuf_first_token, 2))? kfXidModeFidiidSid : kfXidModeFidiid;
+    
+    uint32_t* xid_map; // IDs not collapsed
+    char* sorted_xidbox;
+    uintptr_t max_xid_blen;
+    reterr = sorted_xidbox_init_alloc(sample_include, sample_ids, sids, sample_ct, max_sample_id_blen, max_sid_blen, xid_mode, 0, &sorted_xidbox, &xid_map, &max_xid_blen);
+    if (reterr) {
+      goto king_cutoff_batch_ret_1;
+    }
+    char* idbuf;
+    if (bigstack_alloc_c(max_xid_blen, &idbuf)) {
+      goto king_cutoff_batch_ret_NOMEM;
+    }
+    fill_uint_one(raw_sample_ct, sample_uidx_to_king_uidx);
+    while (1) {
+      char* loadbuf_iter = loadbuf_first_token;
+      uint32_t sample_uidx;
+      if (!sorted_xidbox_read_find(sorted_xidbox, xid_map, max_xid_blen, sample_ct, 0, xid_mode, &loadbuf_iter, &sample_uidx, idbuf)) {
+	if (sample_uidx_to_king_uidx[sample_uidx] != 0xffffffffU) {
+	  char* first_tab = (char*)rawmemchr(idbuf, '\t');
+	  char* second_tab = strchr(&(first_tab[1]), '\t');
+	  *first_tab = ' ';
+	  if (second_tab) {
+	    *second_tab = ' ';
+	  }
+	  sprintf(g_logbuf, "Error: Duplicate ID '%s' in %s .\n", idbuf, king_cutoff_fprefix);
+	  goto king_cutoff_batch_ret_MALFORMED_INPUT_WW;
+	}
+	sample_uidx_to_king_uidx[sample_uidx] = line_idx - 1;
+      } else {
+	if (!loadbuf_iter) {
+	  goto king_cutoff_batch_ret_MISSING_TOKENS;
+	}
+      }
+
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, kMaxMediumLine)) {
+	if (!gzeof(gz_infile)) {
+	  goto king_cutoff_batch_ret_READ_FAIL;
+	}
+	break;
+      }
+      if (!loadbuf[kMaxMediumLine - 1]) {
+	goto king_cutoff_batch_ret_LONG_LINE;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      if (is_eoln_kns(*loadbuf_first_token)) {
+	goto king_cutoff_batch_ret_MISSING_TOKENS;
+      }
+    }
+    if (gzclose_null(&gz_infile)) {
+      goto king_cutoff_batch_ret_READ_FAIL;
+    }
+    const uintptr_t king_id_ct = line_idx - 1;
+    
+    bigstack_reset(loadbuf);
+    uintptr_t* king_include;
+    uint32_t* king_uidx_to_sample_idx;
+    if (bigstack_calloc_ul(BITCT_TO_WORDCT(king_id_ct), &king_include) ||
+	bigstack_alloc_ui(king_id_ct, &king_uidx_to_sample_idx)) {
+      goto king_cutoff_batch_ret_NOMEM;
+    }
+    uint32_t sample_uidx = 0;
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_unsafe_ck(sample_include, &sample_uidx);
+      const uint32_t king_uidx = sample_uidx_to_king_uidx[sample_uidx];
+      if (king_uidx != 0xffffffffU) {
+	SET_BIT(king_uidx, king_include);
+        king_uidx_to_sample_idx[king_uidx] = sample_idx;
+      }
+    }
+    strcpy(fprefix_end, ".king.bin");
+    if (fopen_checked(king_cutoff_fprefix, FOPEN_RB, &binfile)) {
+      goto king_cutoff_batch_ret_OPEN_FAIL;
+    }
+    if (fseeko(binfile, 0, SEEK_END)) {
+      goto king_cutoff_batch_ret_READ_FAIL;
+    }
+    const uint64_t fsize = ftello(binfile);
+    const uint64_t fsize_double_expected = (king_id_ct * (((uint64_t)king_id_ct) - 1) * (sizeof(double) / 2));
+    const uint32_t is_double = (fsize == fsize_double_expected);
+    rewind(binfile);
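+    // The .bin file stores the strictly-lower-triangular matrix row by row:
+    // row i contains i entries and starts at entry offset i*(i-1)/2, hence
+    // the i*(i-1)*(entry size / 2) byte offsets in the fseeko() calls below.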
+    const uint32_t first_king_uidx = next_set(king_include, 0, king_id_ct);
+    uintptr_t king_uidx = next_set(king_include, first_king_uidx + 1, king_id_ct);
+    if (king_uidx > 1) {
+      if (fseeko(binfile, king_uidx * (((uint64_t)king_uidx) - 1) * (2 + (2 * is_double)), SEEK_SET)) {
+	goto king_cutoff_batch_ret_READ_FAIL;
+      }
+    }
+    uintptr_t constraint_ct = 0;
+    if (is_double) {
+      // fread limit
+      assert(king_id_ct <= ((kMaxBytesPerIO / sizeof(double)) + 1));
+      double* king_drow;
+      if (bigstack_alloc_d(king_id_ct - 1, &king_drow)) {
+	goto king_cutoff_batch_ret_NOMEM;
+      }
+      for (uint32_t king_idx = 1; king_uidx < king_id_ct; ++king_idx, ++king_uidx) {
+	if (!IS_SET(king_include, king_uidx)) {
+	  king_uidx = next_set(king_include, king_uidx + 1, king_id_ct);
+	  if (king_uidx == king_id_ct) {
+	    break;
+	  }
+	  if (fseeko(binfile, ((uint64_t)king_uidx) * (king_uidx - 1) * (sizeof(double) / 2), SEEK_SET)) {
+	    goto king_cutoff_batch_ret_READ_FAIL;
+	  }
+	}
+	if (!fread(king_drow, king_uidx * sizeof(double), 1, binfile)) {
+	  goto king_cutoff_batch_ret_READ_FAIL;
+	}
+	const uintptr_t sample_idx = king_uidx_to_sample_idx[king_uidx];
+	uintptr_t* kinship_table_row = &(kinship_table[sample_idx * orig_sample_ctl]);
+	uintptr_t* kinship_table_col = &(kinship_table[sample_idx / kBitsPerWord]);
+	const uintptr_t kinship_new_bit = k1LU << (sample_idx % kBitsPerWord);
+	uint32_t king_uidx2 = first_king_uidx;
+	for (uint32_t king_idx2 = 0; king_idx2 < king_idx; ++king_idx2, ++king_uidx2) {
+	  next_set_unsafe_ck(king_include, &king_uidx2);
+	  if (king_drow[king_uidx2] > king_cutoff) {
+	    const uintptr_t sample_idx2 = king_uidx_to_sample_idx[king_uidx2];
+	    SET_BIT(sample_idx2, kinship_table_row);
+	    kinship_table_col[sample_idx2 * orig_sample_ctl] |= kinship_new_bit;
+	    ++constraint_ct;
+	  }
+	}
+      }
+    } else {
+      if (fsize != (fsize_double_expected / 2)) {
+	LOGERRPRINTFWW("Error: Invalid --king-cutoff .bin file size (expected %" PRIu64 " or %" PRIu64 " bytes).\n", fsize_double_expected / 2, fsize_double_expected);
+	goto king_cutoff_batch_ret_MALFORMED_INPUT;
+      }
+      assert(king_id_ct <= ((0x7ffff000 / sizeof(float)) + 1));
+      const float king_cutoff_f = (float)king_cutoff;
+      float* king_frow;
+      if (bigstack_alloc_f(king_id_ct - 1, &king_frow)) {
+	goto king_cutoff_batch_ret_NOMEM;
+      }
+      for (uint32_t king_idx = 1; king_uidx < king_id_ct; ++king_idx, ++king_uidx) {
+	if (!IS_SET(king_include, king_uidx)) {
+	  king_uidx = next_set(king_include, king_uidx + 1, king_id_ct);
+	  if (king_uidx == king_id_ct) {
+	    break;
+	  }
+	  if (fseeko(binfile, ((uint64_t)king_uidx) * (king_uidx - 1) * (sizeof(float) / 2), SEEK_SET)) {
+	    goto king_cutoff_batch_ret_READ_FAIL;
+	  }
+	}
+	if (!fread(king_frow, king_uidx * sizeof(float), 1, binfile)) {
+	  goto king_cutoff_batch_ret_READ_FAIL;
+	}
+	const uintptr_t sample_idx = king_uidx_to_sample_idx[king_uidx];
+	uintptr_t* kinship_table_row = &(kinship_table[sample_idx * orig_sample_ctl]);
+	uintptr_t* kinship_table_col = &(kinship_table[sample_idx / kBitsPerWord]);
+	const uintptr_t kinship_new_bit = k1LU << (sample_idx % kBitsPerWord);
+	uint32_t king_uidx2 = first_king_uidx;
+	for (uint32_t king_idx2 = 0; king_idx2 < king_idx; ++king_idx2, ++king_uidx2) {
+	  next_set_unsafe_ck(king_include, &king_uidx2);
+	  if (king_frow[king_uidx2] > king_cutoff_f) {
+	    const uintptr_t sample_idx2 = king_uidx_to_sample_idx[king_uidx2];
+	    SET_BIT(sample_idx2, kinship_table_row);
+	    kinship_table_col[sample_idx2 * orig_sample_ctl] |= kinship_new_bit;
+	    ++constraint_ct;
+	  }
+	}
+      }
+    }
+    LOGPRINTF("--king-cutoff: %" PRIuPTR " constraint%s loaded.\n", constraint_ct, (constraint_ct == 1)? "" : "s");
+    bigstack_reset(sample_uidx_to_king_uidx);
+    if (kinship_prune_destructive(kinship_table, sample_include, sample_ct_ptr)) {
+      goto king_cutoff_batch_ret_NOMEM;
+    }
+  }
+  while (0) {
+  king_cutoff_batch_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  king_cutoff_batch_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  king_cutoff_batch_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  king_cutoff_batch_ret_MISSING_TOKENS:
+    LOGERRPRINTFWW("Error: Fewer tokens than expected on line %" PRIuPTR " of %s .\n", line_idx, king_cutoff_fprefix);
+    break;
+  king_cutoff_batch_ret_LONG_LINE:
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, king_cutoff_fprefix);
+  king_cutoff_batch_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  king_cutoff_batch_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+ king_cutoff_batch_ret_1:
+  fclose_cond(binfile);
+  gzclose_cond(gz_infile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+// multithread globals
+static uintptr_t* g_smaj_genobuf[2] = {nullptr, nullptr};
+static uintptr_t* g_smaj_maskbuf[2] = {nullptr, nullptr};
+static uint32_t* g_thread_start = nullptr;
+static uint32_t* g_king_counts = nullptr;
+
+#ifdef __LP64__
+  // must be multiple of 192 for SSE2, 384 for AVX2; max 1920
+  // (1920 yields slightly higher performance than smaller values on my Mac)
+  // more precisely, must be multiple of both 3 and (kBitsPerVec / 2)
+  CONSTU31(kKingMultiplex, 1920);
+#else
+  CONSTU31(kKingMultiplex, 1008);
+#endif
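+
+// e.g. with SSE2 (kBitsPerVec = 128), each vector holds kBitsPerVec / 2 = 64
+// packed 2-bit genotypes, and incr_king()'s accumulation loop below is
+// unrolled 3x, so the block size must be a multiple of lcm(3, 64) = 192;
+// AVX2's 256-bit vectors double that to 384, and 1920 is a multiple of both.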
+
+void incr_king(const uintptr_t* smaj_genobuf, const uintptr_t* smaj_maskbuf, uint32_t start_idx, uint32_t end_idx, uint32_t* king_counts_iter) {
+  // const vul_t m1 = VCONST_UL(kMask5555);
+  const vul_t m2 = VCONST_UL(kMask3333);
+  const vul_t m4 = VCONST_UL(kMask0F0F);
+  for (uint32_t second_idx = start_idx; second_idx < end_idx; ++second_idx) {
+    // technically overflows for huge sample_ct
+    const uint32_t second_offset = second_idx * (kKingMultiplex / kBitsPerWordD2);
+    const vul_t* second_geno_vvec = (const vul_t*)(&(smaj_genobuf[second_offset]));
+    const vul_t* second_geno_vvec_end = (const vul_t*)(&(smaj_genobuf[second_offset + (kKingMultiplex / kBitsPerWordD2)]));
+    const vul_t* second_mask_vvec = (const vul_t*)(&(smaj_maskbuf[second_offset]));
+    const vul_t* first_geno_vvec_iter = (const vul_t*)smaj_genobuf;
+    const vul_t* first_mask_vvec_iter = (const vul_t*)smaj_maskbuf;
+    // tried special-casing the situation where second_mask_vvec has no missing
+    // calls; turns out it doesn't help
+    while (first_geno_vvec_iter < second_geno_vvec) {
+      const vul_t* second_geno_vvec_iter = second_geno_vvec;
+      const vul_t* second_mask_vvec_iter = second_mask_vvec;
+      univec_t acc_nonmiss;
+      univec_t acc_hom_match;
+      univec_t acc_ibs0;
+      univec_t acc_het2hom1_or_ibs0;
+      univec_t acc_het1hom2_or_ibs0;
+      acc_nonmiss.vi = vul_setzero();
+      acc_hom_match.vi = vul_setzero();
+      acc_ibs0.vi = vul_setzero();
+      acc_het2hom1_or_ibs0.vi = vul_setzero();
+      acc_het1hom2_or_ibs0.vi = vul_setzero();
+      do {
+	// could use alternating mask, but we'd have to process 6 16-byte
+	// blocks at a time
+	vul_t geno1_vmask = *first_mask_vvec_iter++;
+	vul_t geno2_vmask = *second_mask_vvec_iter++;
+	vul_t geno1_vword = *first_geno_vvec_iter++;
+	vul_t geno2_vword = *second_geno_vvec_iter++;
+
+	vul_t agg_nonmiss = geno1_vmask & geno2_vmask;
+	vul_t geno_and = geno1_vword & geno2_vword;
+	vul_t geno_xor = geno1_vword ^ geno2_vword;
+	// could use m1 in place of agg_nonmiss
+	vul_t agg_hom_match = (geno_and | vul_rshift(geno_and, 1)) & agg_nonmiss;
+	vul_t agg_ibs0 = geno_xor & vul_rshift(geno_xor, 1) & agg_nonmiss;
+	vul_t geno2_and_not1 = (~geno1_vword) & geno2_vword;
+	vul_t geno1_and_not2 = geno1_vword & (~geno2_vword);
+	vul_t agg_het1hom2_or_ibs0 = (geno2_and_not1 | vul_rshift(geno2_and_not1, 1)) & agg_nonmiss;
+	vul_t agg_het2hom1_or_ibs0 = (geno1_and_not2 | vul_rshift(geno1_and_not2, 1)) & agg_nonmiss;
+
+	geno1_vmask = *first_mask_vvec_iter++;
+	geno2_vmask = *second_mask_vvec_iter++;
+	geno1_vword = *first_geno_vvec_iter++;
+	geno2_vword = *second_geno_vvec_iter++;
+
+	vul_t geno_nonmiss = geno1_vmask & geno2_vmask;
+	geno_and = geno1_vword & geno2_vword;
+	geno_xor = geno1_vword ^ geno2_vword;
+	agg_nonmiss += geno_nonmiss;
+	agg_hom_match += (geno_and | vul_rshift(geno_and, 1)) & geno_nonmiss;
+	agg_ibs0 += geno_xor & vul_rshift(geno_xor, 1) & geno_nonmiss;
+	geno2_and_not1 = (~geno1_vword) & geno2_vword;
+	geno1_and_not2 = geno1_vword & (~geno2_vword);
+	agg_het1hom2_or_ibs0 += (geno2_and_not1 | vul_rshift(geno2_and_not1, 1)) & geno_nonmiss;
+	agg_het2hom1_or_ibs0 += (geno1_and_not2 | vul_rshift(geno1_and_not2, 1)) & geno_nonmiss;
+
+	geno1_vmask = *first_mask_vvec_iter++;
+	geno2_vmask = *second_mask_vvec_iter++;
+	geno1_vword = *first_geno_vvec_iter++;
+	geno2_vword = *second_geno_vvec_iter++;
+
+	geno_nonmiss = geno1_vmask & geno2_vmask;
+	geno_and = geno1_vword & geno2_vword;
+	geno_xor = geno1_vword ^ geno2_vword;
+	agg_nonmiss += geno_nonmiss;
+	agg_hom_match += (geno_and | vul_rshift(geno_and, 1)) & geno_nonmiss;
+	agg_ibs0 += geno_xor & vul_rshift(geno_xor, 1) & geno_nonmiss;
+	geno2_and_not1 = (~geno1_vword) & geno2_vword;
+	geno1_and_not2 = geno1_vword & (~geno2_vword);
+	agg_het1hom2_or_ibs0 += (geno2_and_not1 | vul_rshift(geno2_and_not1, 1)) & geno_nonmiss;
+	agg_het2hom1_or_ibs0 += (geno1_and_not2 | vul_rshift(geno1_and_not2, 1)) & geno_nonmiss;
+
+	agg_nonmiss = (agg_nonmiss & m2) + (vul_rshift(agg_nonmiss, 2) & m2);
+	agg_hom_match = (agg_hom_match & m2) + (vul_rshift(agg_hom_match, 2) & m2);
+	agg_ibs0 = (agg_ibs0 & m2) + (vul_rshift(agg_ibs0, 2) & m2);
+	agg_het1hom2_or_ibs0 = (agg_het1hom2_or_ibs0 & m2) + (vul_rshift(agg_het1hom2_or_ibs0, 2) & m2);
+	agg_het2hom1_or_ibs0 = (agg_het2hom1_or_ibs0 & m2) + (vul_rshift(agg_het2hom1_or_ibs0, 2) & m2);
+
+	acc_nonmiss.vi += (agg_nonmiss + vul_rshift(agg_nonmiss, 4)) & m4;
+	acc_hom_match.vi += (agg_hom_match + vul_rshift(agg_hom_match, 4)) & m4;
+	acc_ibs0.vi += (agg_ibs0 + vul_rshift(agg_ibs0, 4)) & m4;
+	acc_het1hom2_or_ibs0.vi += (agg_het1hom2_or_ibs0 + vul_rshift(agg_het1hom2_or_ibs0, 4)) & m4;
+	acc_het2hom1_or_ibs0.vi += (agg_het2hom1_or_ibs0 + vul_rshift(agg_het2hom1_or_ibs0, 4)) & m4;
+      } while (second_geno_vvec_iter < second_geno_vvec_end);
+      const vul_t m8 = VCONST_UL(kMask00FF);
+      // can add before masking instead if kKingMultiplex <= 960
+      acc_nonmiss.vi = (acc_nonmiss.vi & m8) + (vul_rshift(acc_nonmiss.vi, 8) & m8);
+      acc_hom_match.vi = (acc_hom_match.vi & m8) + (vul_rshift(acc_hom_match.vi, 8) & m8);
+      acc_ibs0.vi = (acc_ibs0.vi & m8) + (vul_rshift(acc_ibs0.vi, 8) & m8);
+      acc_het1hom2_or_ibs0.vi = (acc_het1hom2_or_ibs0.vi & m8) + (vul_rshift(acc_het1hom2_or_ibs0.vi, 8) & m8);
+      acc_het2hom1_or_ibs0.vi = (acc_het2hom1_or_ibs0.vi & m8) + (vul_rshift(acc_het2hom1_or_ibs0.vi, 8) & m8);
+      *king_counts_iter++ += univec_hsum_16bit(acc_nonmiss);
+      *king_counts_iter++ += univec_hsum_16bit(acc_hom_match);
+      *king_counts_iter++ += univec_hsum_16bit(acc_ibs0);
+      *king_counts_iter++ += univec_hsum_16bit(acc_het1hom2_or_ibs0);
+      *king_counts_iter++ += univec_hsum_16bit(acc_het2hom1_or_ibs0);
+    }
+  }
+}
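+
+// A minimal scalar equivalent of the five per-pair counts accumulated above,
+// for a single variant, in terms of raw genotype codes (0 = hom-ref, 1 =
+// het, 2 = hom-alt, 3 = missing); all names here are illustrative only:
+//   if ((g1 != 3) && (g2 != 3)) {
+//     const uint32_t hom1 = (g1 != 1);
+//     const uint32_t hom2 = (g2 != 1);
+//     const uint32_t ibs0 = hom1 && hom2 && (g1 != g2);
+//     ++nonmiss_ct;
+//     hom_match_ct += hom1 && (g1 == g2);
+//     ibs0_ct += ibs0;
+//     het1hom2_or_ibs0_ct += ((g1 == 1) && hom2) || ibs0;
+//     het2hom1_or_ibs0_ct += ((g2 == 1) && hom1) || ibs0;
+//   }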
+
+THREAD_FUNC_DECL calc_king_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uintptr_t mem_start_idx = g_thread_start[0];
+  const uintptr_t start_idx = g_thread_start[tidx];
+  const uint32_t end_idx = g_thread_start[tidx + 1];
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_block = g_is_last_thread_block;
+    incr_king(g_smaj_genobuf[parity], g_smaj_maskbuf[parity], start_idx, end_idx, &(g_king_counts[((start_idx * (start_idx - 1) - mem_start_idx * (mem_start_idx - 1)) / 2) * 5]));
+    if (is_last_block) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+double compute_kinship(const uint32_t* king_counts_entry) {
+  const uint32_t nonmiss_ct = king_counts_entry[0];
+  const uint32_t hom_match_ct = king_counts_entry[1];
+  const uint32_t ibs0_ct = king_counts_entry[2];
+  const uint32_t het2hom1_ct = king_counts_entry[3] - ibs0_ct;
+  const uint32_t het1hom2_ct = king_counts_entry[4] - ibs0_ct;
+
+  const uint32_t hethet_ct = nonmiss_ct - ibs0_ct - het1hom2_ct - het2hom1_ct - hom_match_ct;
+  const intptr_t smaller_het_ct = (intptr_t)(hethet_ct + MINV(het1hom2_ct, het2hom1_ct));
+  return 0.5 - ((double)(4 * ((intptr_t)ibs0_ct) + het1hom2_ct + het2hom1_ct)) / ((double)(4 * smaller_het_ct));
+}
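+
+// This is algebraically the KING-robust between-family kinship estimator
+// (cf. Manichaikul et al. 2010):
+//   phi_hat = 1/2 - (4*N_ibs0 + N_het1hom2 + N_het2hom1)
+//                   / (4 * min(N_het1, N_het2))
+// where N_het1 = N_hethet + N_het1hom2 is sample 1's het count over the
+// pairwise-nonmissing variants (likewise N_het2), so min(N_het1, N_het2) =
+// N_hethet + min(N_het1hom2, N_het2hom1) = smaller_het_ct above.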
+
+pglerr_t calc_king(const char* sample_ids, const char* sids, uintptr_t* variant_include, const chr_info_t* cip, uint32_t raw_sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t raw_variant_ct, uint32_t variant_ct, double king_cutoff, double king_table_filter, king_flags_t king_modifier, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t max_thread_ct, pgen_reader_t* simple_pgrp, uintptr_t* sample_include, uint32_t* sample_ct_ptr, char* outname, char* outname_end) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  pglerr_t reterr = kPglRetSuccess;
+  cswrite_init_null(&css);
+  {
+    const king_flags_t matrix_shape = king_modifier & kfKingMatrixShapemask;
+    const char* flagname = matrix_shape? "--make-king" : ((king_modifier & kfKingColAll)? "--make-king-table" : "--king-cutoff");
+    if (is_set(cip->haploid_mask, 0)) {
+      LOGERRPRINTF("Error: %s cannot be used on haploid genomes.\n", flagname);
+      goto calc_king_ret_INCONSISTENT_INPUT;
+    }
+    reterr = conditional_allocate_non_autosomal_variants(cip, "KING-robust calculation", raw_variant_ct, &variant_include, &variant_ct);
+    if (reterr) {
+      goto calc_king_ret_1;
+    }
+    uint32_t sample_ct = *sample_ct_ptr;
+    if (sample_ct < 2) {
+      LOGERRPRINTF("Error: %s requires at least 2 samples.\n", flagname);
+      goto calc_king_ret_INCONSISTENT_INPUT;
+    }
+#ifdef __LP64__
+    if (sample_ct > 71582788) {
+      // may as well document (kKingMultiplex / kBitsPerWordD2) limit
+      LOGERRPRINTF("Error: %s does not support > 71582788 samples.\n", flagname);
+      reterr = kPglRetNotYetSupported;
+      goto calc_king_ret_1;
+    }
+#endif
+    uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+    if (calc_thread_ct > sample_ct / 32) {
+      calc_thread_ct = sample_ct / 32;
+    }
+    if (!calc_thread_ct) {
+      calc_thread_ct = 1;
+    }
+    if (bigstack_alloc_ui(calc_thread_ct + 1, &g_thread_start)) {
+      goto calc_king_ret_NOMEM;
+    }
+    triangle_fill(sample_ct, calc_thread_ct, parallel_idx, parallel_tot, 1, 1, g_thread_start);
+    const uint32_t row_start_idx = g_thread_start[0];
+    const uint32_t row_end_idx = g_thread_start[calc_thread_ct];
+    const uintptr_t tot_cells = (((uint64_t)row_end_idx) * (row_end_idx - 1) - ((uint64_t)row_start_idx) * (row_start_idx - 1)) / 2;
+    const uint32_t sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
+    pthread_t* threads = (pthread_t*)bigstack_alloc(calc_thread_ct * sizeof(pthread_t));
+    if (!threads) {
+      goto calc_king_ret_NOMEM;
+    }
+    const uintptr_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+    uintptr_t* kinship_table = nullptr;
+    if (king_cutoff != -1) {
+      if (bigstack_calloc_ul(sample_ct * sample_ctl, &kinship_table)) {
+	goto calc_king_ret_NOMEM;
+      }
+    }
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    uint32_t* sample_include_cumulative_popcounts;
+    uintptr_t* loadbuf;
+    if (bigstack_alloc_ui(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
+	bigstack_alloc_ul(kQuatersPerCacheline * sample_ctaw2, &loadbuf) ||
+	bigstack_alloc_ul((kKingMultiplex / kBitsPerWordD2) * row_end_idx, &(g_smaj_genobuf[0])) ||
+	bigstack_alloc_ul((kKingMultiplex / kBitsPerWordD2) * row_end_idx, &(g_smaj_maskbuf[0])) ||
+	bigstack_alloc_ul((kKingMultiplex / kBitsPerWordD2) * row_end_idx, &(g_smaj_genobuf[1])) ||
+	bigstack_alloc_ul((kKingMultiplex / kBitsPerWordD2) * row_end_idx, &(g_smaj_maskbuf[1])) ||
+        bigstack_calloc_ui(tot_cells * 5, &g_king_counts)) {
+      goto calc_king_ret_NOMEM;
+    }
+    // force this to be cacheline-aligned
+    vul_t* vecaligned_buf = (vul_t*)bigstack_alloc(2 * kQuatersPerCacheline * kCacheline);
+    if (!vecaligned_buf) {
+      goto calc_king_ret_NOMEM;
+    }
+    fill_cumulative_popcounts(sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
+    uint32_t variant_uidx = 0;
+    uint32_t variants_completed = 0;
+    uint32_t parity = 0;
+    const uint32_t sample_batch_ct_m1 = (row_end_idx - 1) / kQuatersPerCacheline;
+    uint32_t is_last_block;
+    // Similar to plink 1.9 --genome.  For each pair of samples S1-S2, we need
+    // to determine counts of the following:
+    //   * S1 hom-S2 het
+    //   * S2 hom-S1 het
+    //   * S1 hom-S2 opposite hom
+    //   * S1 hom-S2 same hom
+    //   * nonmissing
+    //   * (het-het determined via subtraction)
+    // We handle this as follows:
+    //   1. set n=0, reader thread loads first kKingMultiplex variants and
+    //      converts+transposes the data to a sample-major format suitable for
+    //      multithreaded computation.
+    //   2. spawn threads
+    //
+    //   3. increment n by 1
+    //   4. load block n unless eof
+    //   5. permit threads to continue to next block (join_threads2()/respawn
+    //      seems suboptimal?), unless eof
+    //   6. goto step 3 unless eof
+    //
+    //   7. write results
+    // Results are always reported in lower-triangular order, rather than
+    // KING's upper-triangular order, since the former plays more nicely with
+    // incremental addition of samples.
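+    // g_smaj_genobuf[]/g_smaj_maskbuf[] are double-buffered: while the
+    // worker threads process block n from one buffer (selected by parity),
+    // the main thread loads and transposes block n+1 into the other.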
+    pgr_clear_ld_cache(simple_pgrp);
+    do {
+      uint32_t cur_block_size = variant_ct - variants_completed;
+      uintptr_t* cur_smaj_genobuf = g_smaj_genobuf[parity];
+      uintptr_t* cur_smaj_maskbuf = g_smaj_maskbuf[parity];
+      is_last_block = (cur_block_size <= kKingMultiplex);
+      if (!is_last_block) {
+	cur_block_size = kKingMultiplex;
+      }
+      uint32_t write_batch_idx = 0;
+      // "block" = distance computation granularity, usually 1920 variants
+      // "batch" = variant-major-to-sample-major transpose granularity,
+      //           currently 256 variants
+      uint32_t variant_batch_size = kPglQuaterTransposeBatch;
+      uint32_t variant_batch_size_rounded_up = kPglQuaterTransposeBatch;
+      const uint32_t write_batch_ct_m1 = (cur_block_size - 1) / kPglQuaterTransposeBatch;
+      while (1) {
+	if (write_batch_idx >= write_batch_ct_m1) {
+	  if (write_batch_idx > write_batch_ct_m1) {
+	    break;
+	  }
+	  variant_batch_size = MOD_NZ(cur_block_size, kPglQuaterTransposeBatch);
+	  variant_batch_size_rounded_up = variant_batch_size;
+	  const uint32_t variant_batch_size_rem = variant_batch_size % kBitsPerWordD2;
+	  if (variant_batch_size_rem) {
+	    const uint32_t trailing_variant_ct = kBitsPerWordD2 - variant_batch_size_rem;
+	    variant_batch_size_rounded_up += trailing_variant_ct;
+	    fill_ulong_one(trailing_variant_ct * sample_ctaw2, &(loadbuf[variant_batch_size * sample_ctaw2]));
+	  }
+	}
+	uintptr_t* loadbuf_iter = loadbuf;
+	for (uint32_t uii = 0; uii < variant_batch_size; ++uii, ++variant_uidx) {
+	  next_set_unsafe_ck(variant_include, &variant_uidx);
+	  reterr = pgr_read_refalt1_genovec_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, variant_uidx, simple_pgrp, loadbuf_iter);
+	  if (reterr) {
+	    goto calc_king_ret_PGR_FAIL;
+	  }
+	  loadbuf_iter = &(loadbuf_iter[sample_ctaw2]);
+	}
+	// uintptr_t* read_iter = loadbuf;
+	uintptr_t* write_iter = &(cur_smaj_genobuf[write_batch_idx * kPglQuaterTransposeWords]);
+	uint32_t sample_batch_idx = 0;
+	uint32_t write_batch_size = kPglQuaterTransposeBatch;
+	while (1) {
+	  if (sample_batch_idx >= sample_batch_ct_m1) {
+	    if (sample_batch_idx > sample_batch_ct_m1) {
+	      break;
+	    }
+	    write_batch_size = MOD_NZ(row_end_idx, kPglQuaterTransposeBatch);
+	  }
+	  // bugfix: read_batch_size must be rounded up to word boundary, since
+	  // we want to one-out instead of zero-out the trailing bits
+	  //
+	  // bugfix: if we always use kPglQuaterTransposeBatch instead of
+	  // variant_batch_size_rounded_up, we read/write past the
+	  // kKingMultiplex limit and clobber the first variants of the next
+	  // sample with garbage.
+	  transpose_quaterblock(&(loadbuf[sample_batch_idx * kPglQuaterTransposeWords]), sample_ctaw2, kKingMultiplex / kBitsPerWordD2, variant_batch_size_rounded_up, write_batch_size, write_iter, vecaligned_buf);
+	  ++sample_batch_idx;
+	  write_iter = &(write_iter[kKingMultiplex * kPglQuaterTransposeWords]);
+	}
+	++write_batch_idx;
+      }
+      const uint32_t cur_block_sizew = QUATERCT_TO_WORDCT(cur_block_size);
+      if (cur_block_sizew < (kKingMultiplex / kBitsPerWordD2)) {
+	uintptr_t* write_iter = &(cur_smaj_genobuf[cur_block_sizew]);
+	const uint32_t write_word_ct = (kKingMultiplex / kBitsPerWordD2) - cur_block_sizew;
+	for (uint32_t sample_idx = 0; sample_idx < row_end_idx; ++sample_idx) {
+	  fill_ulong_one(write_word_ct, write_iter);
+	  write_iter = &(write_iter[kKingMultiplex / kBitsPerWordD2]);
+	}
+      }
+      
+      vul_t* geno_viter = (vul_t*)cur_smaj_genobuf;
+      vul_t* mask_viter = (vul_t*)cur_smaj_maskbuf;
+      const uint32_t vec_ct = row_end_idx * (kKingMultiplex / kQuatersPerVec);
+      const vul_t m1 = VCONST_UL(kMask5555);
+      for (uint32_t vec_idx = 0; vec_idx < vec_ct; ++vec_idx) {
+	const vul_t raw_genov = *geno_viter;
+	const vul_t rshifted_genov = vul_rshift(raw_genov, 1);
+	const vul_t cur_maskv = (~(raw_genov & rshifted_genov)) & m1;
+	*mask_viter++ = cur_maskv;
+	// goal: 00 -> 01, 01 -> 00, 10 -> 10, 11 -> 00
+	// mask: 01 01 01 00
+	// + (rshifted_genov & m1): 01 01 10 01
+	// - (raw_genov & m1): 01 00 10 00	
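+	// per-lane check (mask + high bit - low bit):
+	//   00: 1 + 0 - 0 = 01  (hom ref -> 1)
+	//   01: 1 + 0 - 1 = 00  (het -> 0)
+	//   10: 1 + 1 - 0 = 10  (hom alt -> 2)
+	//   11: 0 + 1 - 1 = 00  (missing -> 0; mask bit is also 0)
+	// no lane goes negative, so the subtraction never borrows across 2-bit
+	// boundaries.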
+	*geno_viter++ = cur_maskv + (rshifted_genov & m1) - (raw_genov & m1);
+      }
+      if (variants_completed) {
+	join_threads2z(calc_thread_ct, 0, threads);
+      }
+      if (spawn_threads2z(calc_king_thread, calc_thread_ct, is_last_block, threads)) {
+	goto calc_king_ret_THREAD_CREATE_FAIL;
+      }
+      printf("\r%u variants complete.", variants_completed);
+      fflush(stdout);
+      variants_completed += cur_block_size;
+      parity = 1 - parity;
+    } while (!is_last_block);
+    join_threads2z(calc_thread_ct, 1, threads);
+    putc_unlocked('\r', stdout);
+    LOGPRINTF("%s: %u variant%s processed.\n", flagname, variant_ct, (variant_ct == 1)? "" : "s");
+    if (matrix_shape || (king_modifier & kfKingColAll)) {
+      // allow simultaneous --make-king + --make-king-table
+      if (matrix_shape) {
+        fputs("--make-king: Writing...", stdout);
+        fflush(stdout);
+	if (!(king_modifier & (kfKingMatrixBin | kfKingMatrixBin4))) {
+	  // text matrix
+	  char* outname_end2 = strcpya(outname_end, ".king");
+	  // won't be >2gb since sample_ct < 79m
+	  const uint32_t overflow_buf_size = kCompressStreamBlock + 16 * sample_ct;
+	  unsigned char* overflow_buf;
+	  if (bigstack_alloc_uc(overflow_buf_size, &overflow_buf)) {
+	    goto calc_king_ret_NOMEM;
+	  }
+	  const uint32_t output_zst = king_modifier & kfKingMatrixZs;
+	  if (parallel_tot != 1) {
+	    *outname_end2++ = '.';
+	    outname_end2 = uint32toa(parallel_idx + 1, outname_end2);
+	  }
+	  if (output_zst) {
+	    outname_end2 = strcpya(outname_end2, ".zst");
+	  }
+	  *outname_end2 = '\0';
+	  if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+	    goto calc_king_ret_OPEN_FAIL;
+	  }
+	  cswritep = (char*)overflow_buf;
+	  const uint32_t is_squarex = king_modifier & (kfKingMatrixSq | kfKingMatrixSq0);
+	  const uint32_t is_square0 = king_modifier & kfKingMatrixSq0;
+	  uint32_t* results_iter = g_king_counts;
+	  uint32_t sample_idx1 = row_start_idx;
+	  if (is_squarex && (!parallel_idx)) {
+	    // dump "empty" first row
+	    sample_idx1 = 0;
+	  }
+	  for (; sample_idx1 < row_end_idx; ++sample_idx1) {
+	    for (uint32_t sample_idx2 = 0; sample_idx2 < sample_idx1; ++sample_idx2) {
+	      const double kinship_coeff = compute_kinship(results_iter);
+	      if (kinship_table && (kinship_coeff > king_cutoff)) {
+		SET_BIT(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
+		SET_BIT(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
+	      }
+	      cswritep = dtoa_g(kinship_coeff, cswritep);
+	      *cswritep++ = '\t';
+	      results_iter = &(results_iter[5]);
+	    }
+	    if (is_squarex) {
+	      cswritep = memcpyl3a(cswritep, "0.5");
+	      if (is_square0) {
+		// (roughly same performance as creating a tab-zero constant
+		// buffer in advance)
+		const uint32_t zcount = sample_ct - sample_idx1 - 1;
+		const uint32_t wct = DIV_UP(zcount, kBytesPerWord / 2);
+		// assumes little-endian
+		const uintptr_t tabzero_word = 0x3009 * kMask0001;
+#ifdef __arm__
+  #error "Unaligned accesses in calc_king()."
+#endif
+		uintptr_t* writep_alias = (uintptr_t*)cswritep;
+		for (uintptr_t widx = 0; widx < wct; ++widx) {
+		  *writep_alias++ = tabzero_word;
+		}
+		cswritep = &(cswritep[zcount * 2]);
+	      } else {
+		const uint32_t* results_iter2 = &(results_iter[sample_idx1 * 5]);
+		// 0
+		// 1  2
+		// 3  4  5
+		// 6  7  8  9
+		// 10 11 12 13 14
+
+		// sample_idx1 = 0: [0] 0 1 3 6 10...
+		// sample_idx1 = 1: [1] 2 4 7 11...
+		// sample_idx1 = 2: [3] 5 8 12...
+		// sample_idx1 = 3: [6] 9 13...
+		for (uint32_t sample_idx2 = sample_idx1 + 1; sample_idx2 < sample_ct; ++sample_idx2) {
+		  *cswritep++ = '\t';
+		  cswritep = dtoa_g(compute_kinship(results_iter2), cswritep);
+		  results_iter2 = &(results_iter2[sample_idx2 * 5]);
+		}
+	      }
+	      ++cswritep;
+	    }
+	    decr_append_binary_eoln(&cswritep);
+	    if (cswrite(&css, &cswritep)) {
+	      goto calc_king_ret_WRITE_FAIL;
+	    }
+	  }
+	  if (cswrite_close_null(&css, cswritep)) {
+	    goto calc_king_ret_WRITE_FAIL;
+	  }
+	} else {
+	  // binary matrix output
+	  char* outname_end2 = strcpya(outname_end, ".king.bin");
+	  if (parallel_tot != 1) {
+	    *outname_end2++ = '.';
+	    outname_end2 = uint32toa(parallel_idx + 1, outname_end2);
+	  }
+	  *outname_end2 = '\0';
+	  if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+	    goto calc_king_ret_OPEN_FAIL;
+	  }
+	  // er, probably want to revise this so there's less duplicated code
+	  // from text matrix output...
+	  const uint32_t is_squarex = king_modifier & (kfKingMatrixSq | kfKingMatrixSq0);
+	  const uint32_t is_square0 = king_modifier & kfKingMatrixSq0;
+	  uint32_t* results_iter = g_king_counts;
+	  uint32_t sample_idx1 = row_start_idx;
+	  if (is_squarex && (!parallel_idx)) {
+	    sample_idx1 = 0;
+	  }
+	  if (king_modifier & kfKingMatrixBin4) {
+	    float* write_row;
+	    if (bigstack_alloc_f(sample_ct, &write_row)) {
+	      goto calc_king_ret_NOMEM;
+	    }
+	    uintptr_t row_byte_ct = sample_ct * sizeof(float);
+	    for (; sample_idx1 < row_end_idx; ++sample_idx1) {
+	      for (uint32_t sample_idx2 = 0; sample_idx2 < sample_idx1; ++sample_idx2) {
+		const double kinship_coeff = compute_kinship(results_iter);
+		if (kinship_table && (kinship_coeff > king_cutoff)) {
+		  SET_BIT(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
+		  SET_BIT(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
+		}
+		write_row[sample_idx2] = (float)kinship_coeff;
+		results_iter = &(results_iter[5]);
+	      }
+	      if (is_squarex) {
+		write_row[sample_idx1] = 0.5f;
+		if (is_square0) {
+		  const uint32_t right_fill_idx = sample_idx1 + 1;
+		  fill_float_zero(sample_ct - right_fill_idx, &(write_row[right_fill_idx]));
+		} else {
+		  const uint32_t* results_iter2 = &(results_iter[sample_idx1 * 5]);
+		  for (uint32_t sample_idx2 = sample_idx1 + 1; sample_idx2 < sample_ct; ++sample_idx2) {
+		    write_row[sample_idx2] = (float)compute_kinship(results_iter2);
+		    results_iter2 = &(results_iter2[sample_idx2 * 5]);
+		  }
+		}
+	      } else {
+		row_byte_ct = sample_idx1 * sizeof(float);
+	      }
+	      if (fwrite_checked(write_row, row_byte_ct, outfile)) {
+		goto calc_king_ret_WRITE_FAIL;
+	      }
+	    }
+	  } else {
+	    double* write_row;
+	    if (bigstack_alloc_d(sample_ct, &write_row)) {
+	      goto calc_king_ret_NOMEM;
+	    }
+	    uintptr_t row_byte_ct = sample_ct * sizeof(double);
+	    for (; sample_idx1 < row_end_idx; ++sample_idx1) {
+	      for (uint32_t sample_idx2 = 0; sample_idx2 < sample_idx1; ++sample_idx2) {
+		const double kinship_coeff = compute_kinship(results_iter);
+		if (kinship_table && (kinship_coeff > king_cutoff)) {
+		  SET_BIT(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
+		  SET_BIT(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
+		}
+		write_row[sample_idx2] = kinship_coeff;
+		results_iter = &(results_iter[5]);
+	      }
+	      if (is_squarex) {
+		write_row[sample_idx1] = 0.5;
+		if (is_square0) {
+		  const uint32_t right_fill_idx = sample_idx1 + 1;
+		  fill_double_zero(sample_ct - right_fill_idx, &(write_row[right_fill_idx]));
+		} else {
+		  const uint32_t* results_iter2 = &(results_iter[sample_idx1 * 5]);
+		  for (uint32_t sample_idx2 = sample_idx1 + 1; sample_idx2 < sample_ct; ++sample_idx2) {
+		    write_row[sample_idx2] = compute_kinship(results_iter2);
+		    results_iter2 = &(results_iter2[sample_idx2 * 5]);
+		  }
+		}
+	      } else {
+		row_byte_ct = sample_idx1 * sizeof(double);
+	      }
+	      if (fwrite_checked(write_row, row_byte_ct, outfile)) {
+		goto calc_king_ret_WRITE_FAIL;
+	      }
+	    }
+	  }
+	  if (fclose_null(&outfile)) {
+	    goto calc_king_ret_WRITE_FAIL;
+	  }
+	}
+	putc_unlocked('\r', stdout);
+        char* write_iter = strcpya(g_logbuf, "--make-king: Results written to ");
+	write_iter = strcpya(write_iter, outname);
+	write_iter = strcpya(write_iter, " and ");
+	strcpy(&(outname_end[5]), ".id");
+	write_iter = strcpya(write_iter, outname);
+	strcpy(write_iter, " .\n");
+	wordwrapb(0);
+	logprintb();
+	reterr = write_sample_ids(sample_include, sample_ids, sids, outname, sample_ct, max_sample_id_blen, max_sid_blen);
+	if (reterr) {
+	  goto calc_king_ret_1;
+	}
+      }
+      if (king_modifier & kfKingColAll) {
+        fputs("--make-king-table: Writing...", stdout);
+        fflush(stdout);
+	uintptr_t* kinship_table_backup = nullptr;
+	if (matrix_shape) {
+	  kinship_table_backup = kinship_table;
+	  kinship_table = nullptr;
+	}
+	const uint32_t overflow_buf_size = kCompressStreamBlock + kMaxMediumLine;
+	char* outname_end2 = strcpya(outname_end, ".kin0");
+	unsigned char* overflow_buf;
+	if (bigstack_alloc_uc(overflow_buf_size, &overflow_buf)) {
+	  goto calc_king_ret_NOMEM;
+	}
+	const uint32_t output_zst = king_modifier & kfKingTableZs;
+	if (parallel_tot != 1) {
+	  *outname_end2++ = '.';
+	  outname_end2 = uint32toa(parallel_idx + 1, outname_end2);
+	}
+	if (output_zst) {
+	  outname_end2 = strcpya(outname_end2, ".zst");
+	}
+	*outname_end2 = '\0';
+	if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+	  goto calc_king_ret_OPEN_FAIL;
+	}
+	cswritep = (char*)overflow_buf;
+	const uint32_t king_col_id = king_modifier & kfKingColId;
+        const uint32_t king_col_sid = sid_col_required(sample_include, sids, sample_ct, max_sid_blen, king_modifier / kfKingColMaybesid);
+	const uint32_t king_col_nsnp = king_modifier & kfKingColNsnp;
+	const uint32_t king_col_hethet = king_modifier & kfKingColHethet;
+	const uint32_t king_col_ibs0 = king_modifier & kfKingColIbs0;
+	const uint32_t king_col_ibs1 = king_modifier & kfKingColIbs1;
+	const uint32_t king_col_kinship = king_modifier & kfKingColKinship;
+	const uint32_t report_counts = king_modifier & kfKingCounts;
+	if (!parallel_idx) {
+	  *cswritep++ = '#';
+	  if (king_col_id) {
+	    cswritep = strcpya(cswritep, "FID1\tID1\t");
+	    if (king_col_sid) {
+	      cswritep = strcpya(cswritep, "SID1\tFID2\tID2\tSID2\t");
+	    } else {
+	      cswritep = strcpya(cswritep, "FID2\tID2\t");
+	    }
+	  }
+	  if (king_col_nsnp) {
+	    cswritep = strcpya(cswritep, "NSNP\t");
+	  }
+	  if (king_col_hethet) {
+	    cswritep = strcpya(cswritep, "HETHET\t");
+	  }
+	  if (king_col_ibs0) {
+	    cswritep = strcpya(cswritep, "IBS0\t");
+	  }
+	  if (king_col_ibs1) {
+	    cswritep = strcpya(cswritep, "HET1_HOM2\tHET2_HOM1\t");
+	  }
+	  if (king_col_kinship) {
+	    cswritep = strcpya(cswritep, "KINSHIP\t");
+	  }
+	  decr_append_binary_eoln(&cswritep);
+	}
+	uintptr_t max_sample_augid_blen = max_sample_id_blen;
+	char* collapsed_sample_augids;
+	if (king_col_sid) {
+	  if (augid_init_alloc(sample_include, sample_ids, sids, row_end_idx, max_sample_id_blen, max_sid_blen, nullptr, &collapsed_sample_augids, &max_sample_augid_blen)) {
+	    goto calc_king_ret_NOMEM;
+	  }
+	} else {
+	  if (bigstack_alloc_c(row_end_idx * max_sample_augid_blen, &collapsed_sample_augids)) {
+	    goto calc_king_ret_NOMEM;
+	  }
+	  uint32_t sample_uidx = 0;
+	  for (uint32_t sample_idx = 0; sample_idx < row_end_idx; ++sample_idx, ++sample_uidx) {
+	    next_set_unsafe_ck(sample_include, &sample_uidx);
+	    strcpy(&(collapsed_sample_augids[sample_idx * max_sample_augid_blen]), &(sample_ids[sample_uidx * max_sample_id_blen]));
+	  }
+	}
+	uintptr_t king_table_filter_ct = 0;
+	uint32_t* results_iter = g_king_counts;
+	for (uint32_t sample_idx1 = row_start_idx; sample_idx1 < row_end_idx; ++sample_idx1) {
+	  const char* sample_augid1 = &(collapsed_sample_augids[max_sample_augid_blen * sample_idx1]);
+	  uint32_t sample_augid1_len = strlen(sample_augid1);
+	  for (uint32_t sample_idx2 = 0; sample_idx2 < sample_idx1; ++sample_idx2) {
+	    const uint32_t nonmiss_ct = *results_iter++;
+	    const uint32_t hom_match_ct = *results_iter++;
+	    const uint32_t ibs0_ct = *results_iter++;
+	    // order is reversed
+	    // order is reversed: the first stored count feeds het2hom1_ct
+	    const uint32_t het1hom2_ct = (*results_iter++) - ibs0_ct;
+	    
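+	    // KING-robust between-family kinship estimate, in terms of the
+	    // counts above:
+	    //   kinship = 0.5 - (4*ibs0 + het1hom2 + het2hom1) /
+	    //                   (4 * min(het1, het2))
+	    // where min(het1, het2) = hethet + min(het1hom2, het2hom1).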
+	    const uint32_t hethet_ct = nonmiss_ct - ibs0_ct - het1hom2_ct - het2hom1_ct - hom_match_ct;
+	    const intptr_t smaller_het_ct = (intptr_t)(hethet_ct + MINV(het1hom2_ct, het2hom1_ct));
+	    const double kinship_coeff = 0.5 - ((double)(4 * ((intptr_t)ibs0_ct) + het1hom2_ct + het2hom1_ct)) / ((double)(4 * smaller_het_ct));
+	    if (kinship_table && (kinship_coeff > king_cutoff)) {
+	      SET_BIT(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
+	      SET_BIT(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
+	    }
+	    if (kinship_coeff < king_table_filter) {
+	      ++king_table_filter_ct;
+	      continue;
+	    }
+	    if (king_col_id) {
+	      cswritep = memcpyax(cswritep, sample_augid1, sample_augid1_len, '\t');
+	      cswritep = strcpyax(cswritep, &(collapsed_sample_augids[max_sample_augid_blen * sample_idx2]), '\t');
+	    }
+	    if (king_col_nsnp) {
+	      cswritep = uint32toa_x(nonmiss_ct, '\t', cswritep);
+	    }
+	    double nonmiss_recip = 0.0;
+	    if (!report_counts) {
+	      nonmiss_recip = 1.0 / ((double)((int32_t)nonmiss_ct));
+	    }
+	    if (king_col_hethet) {
+	      if (report_counts) {
+		cswritep = uint32toa(hethet_ct, cswritep);
+	      } else {
+		cswritep = dtoa_g(nonmiss_recip * ((double)((int32_t)hethet_ct)), cswritep);
+	      }
+	      *cswritep++ = '\t';
+	    }
+	    if (king_col_ibs0) {
+	      if (report_counts) {
+		cswritep = uint32toa(ibs0_ct, cswritep);
+	      } else {
+		cswritep = dtoa_g(nonmiss_recip * ((double)((int32_t)ibs0_ct)), cswritep);
+	      }
+	      *cswritep++ = '\t';
+	    }
+	    if (king_col_ibs1) {
+	      if (report_counts) {
+		cswritep = uint32toa_x(het1hom2_ct, '\t', cswritep);
+		cswritep = uint32toa(het2hom1_ct, cswritep);
+	      } else {
+		cswritep = dtoa_g(nonmiss_recip * ((double)((int32_t)het1hom2_ct)), cswritep);
+		*cswritep++ = '\t';
+		cswritep = dtoa_g(nonmiss_recip * ((double)((int32_t)het2hom1_ct)), cswritep);
+	      }
+	      *cswritep++ = '\t';
+	    }
+	    if (king_col_kinship) {
+	      cswritep = dtoa_g(kinship_coeff, cswritep);
+	      *cswritep++ = '\t';
+	    }
+	    decr_append_binary_eoln(&cswritep);
+	    if (cswrite(&css, &cswritep)) {
+	      goto calc_king_ret_WRITE_FAIL;
+	    }
+	  }
+	}
+	if (cswrite_close_null(&css, cswritep)) {
+	  goto calc_king_ret_WRITE_FAIL;
+	}
+	putc_unlocked('\r', stdout);
+        char* write_iter = strcpya(g_logbuf, "--make-king-table: Results written to ");
+	write_iter = strcpya(write_iter, outname);
+	if ((!parallel_idx) && (!(king_modifier & kfKingColId))) {
+	  write_iter = strcpya(write_iter, " and ");
+	  strcpy(&(outname_end[5]), ".id");
+	  write_iter = strcpya(write_iter, outname);
+	  strcpy(write_iter, " .\n");
+	  wordwrapb(0);
+	  logprintb();
+	  reterr = write_sample_ids(sample_include, sample_ids, sids, outname, sample_ct, max_sample_id_blen, max_sid_blen);
+	  if (reterr) {
+	    goto calc_king_ret_1;
+	  }
+	} else {
+	  strcpy(write_iter, " .\n");
+	  wordwrapb(0);
+	  logprintb();
+	}
+	if (king_table_filter != -DBL_MAX) {
+	  const uintptr_t reported_ct = tot_cells - king_table_filter_ct;
+	  LOGPRINTF("--king-table-filter: %" PRIuPTR " relationship%s reported (%" PRIuPTR " filtered out).\n", reported_ct, (reported_ct == 1)? "" : "s", king_table_filter_ct);
+	}
+
+	if (matrix_shape) {
+	  kinship_table = kinship_table_backup;
+	}
+      }
+    } else {
+      uint32_t* results_iter = g_king_counts;
+      for (uint32_t sample_idx1 = row_start_idx; sample_idx1 < row_end_idx; ++sample_idx1) {
+	for (uint32_t sample_idx2 = 0; sample_idx2 < sample_idx1; ++sample_idx2) {
+	  const double kinship_coeff = compute_kinship(results_iter);
+	  if (kinship_coeff > king_cutoff) {
+	    SET_BIT(sample_idx2, &(kinship_table[sample_idx1 * sample_ctl]));
+	    SET_BIT(sample_idx1, &(kinship_table[sample_idx2 * sample_ctl]));
+	  }
+	  results_iter = &(results_iter[5]);
+	}
+      }
+    }
+    if (kinship_table) {
+      bigstack_reset(sample_include_cumulative_popcounts);
+      *sample_ct_ptr = sample_ct;
+      if (kinship_prune_destructive(kinship_table, sample_include, sample_ct_ptr)) {
+	goto calc_king_ret_NOMEM;
+      }
+    }
+  }
+  while (0) {
+  calc_king_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  calc_king_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  calc_king_ret_PGR_FAIL:
+    if (reterr != kPglRetReadFail) {
+      logerrprint("Error: Malformed .pgen file.\n");
+    }
+    break;
+  calc_king_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  calc_king_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  calc_king_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ calc_king_ret_1:
+  cswrite_close_cond(&css, cswritep);
+  fclose_cond(outfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+// this probably belongs in plink2_common
+void expand_variant_dosages(const uintptr_t* genovec, const uintptr_t* dosage_present, const dosage_t* dosage_vals, double slope, double intercept, double missing_val, uint32_t sample_ct, uint32_t dosage_ct, double* expanded_dosages) {
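+  // genovec packs one hardcall per 2 bits (0/1/2 = allele count, 3 =
+  // missing), so all four cases expand through this lookup table; explicit
+  // dosages are then patched in below, with 16384 dosage_val units
+  // (kRecipDosageMid = 1/16384) corresponding to one allele copy.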
+  double lookup_vals[4];
+  lookup_vals[0] = intercept;
+  lookup_vals[1] = intercept + slope;
+  lookup_vals[2] = intercept + 2 * slope;
+  lookup_vals[3] = missing_val;
+  const uintptr_t* genovec_iter = genovec;
+  const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
+  uint32_t widx = 0;
+  uint32_t loop_len = kBitsPerWordD2;
+  double* expanded_dosages_iter = expanded_dosages;
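+  // process kBitsPerWordD2 2-bit genotypes per word; the final word may be
+  // partial, so its genotype count is computed with MOD_NZ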
+  while (1) {
+    if (widx >= sample_ctl2_m1) {
+      if (widx > sample_ctl2_m1) {
+	break;
+      }
+      loop_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t geno_word = *genovec_iter++;
+    for (uint32_t uii = 0; uii < loop_len; ++uii) {
+      *expanded_dosages_iter++ = lookup_vals[geno_word & 3];
+      geno_word >>= 2;
+    }
+    ++widx;
+  }
+  if (dosage_ct) {
+    slope *= kRecipDosageMid;
+    uint32_t sample_uidx = 0;
+    for (uint32_t dosage_idx = 0; dosage_idx < dosage_ct; ++dosage_idx, ++sample_uidx) {
+      next_set_unsafe_ck(dosage_present, &sample_uidx);
+      expanded_dosages[sample_uidx] = dosage_vals[dosage_idx] * slope + intercept;
+    }
+  }
+}
+
+// assumes trailing bits of genovec are zeroed out
+pglerr_t expand_centered_varmaj(const uintptr_t* genovec, const uintptr_t* dosage_present, const dosage_t* dosage_vals, uint32_t variance_standardize, uint32_t sample_ct, uint32_t dosage_ct, double maj_freq, double* normed_dosages) {
+  const double nonmaj_freq = 1.0 - maj_freq;
+  double inv_stdev;
+  if (variance_standardize) {
+    const double variance = 2 * maj_freq * nonmaj_freq;
+    if (variance < kSmallEpsilon) {
+      uint32_t genocounts[4];
+      genovec_count_freqs_unsafe(genovec, sample_ct, genocounts);
+      if (dosage_ct || genocounts[1] || genocounts[2]) {
+	return kPglRetInconsistentInput;
+      }
+      fill_double_zero(sample_ct, normed_dosages);
+      return kPglRetSuccess;
+    }
+    inv_stdev = 1.0 / sqrt(variance);
+  } else {
+    inv_stdev = 1.0;
+  }
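+  // net effect: normed[i] = (g_i - 2*nonmaj_freq) * inv_stdev, where g_i is
+  // sample i's nonmajor-allele dosage; each variant is mean-centered, and
+  // scaled by 1/sqrt(2 * maj_freq * nonmaj_freq) when variance_standardize
+  // is set.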
+  expand_variant_dosages(genovec, dosage_present, dosage_vals, inv_stdev, -2 * nonmaj_freq * inv_stdev, 0.0, sample_ct, dosage_ct, normed_dosages);
+  return kPglRetSuccess;
+}
+
+pglerr_t load_centered_varmaj(const uintptr_t* sample_include, const uint32_t* sample_include_cumulative_popcounts, uint32_t variance_standardize, uint32_t sample_ct, uint32_t variant_uidx, alt_allele_ct_t maj_allele_idx, double maj_freq, pgen_reader_t* simple_pgrp, uint32_t* missing_presentp, double* normed_dosages, uintptr_t* genovec_buf, uintptr_t* dosage_present_buf, dosage_t* dosage_vals_buf) {
+  // todo: multiallelic case
+  uint32_t dosage_ct;
+  uint32_t is_explicit_alt1;
+  pglerr_t reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, variant_uidx, simple_pgrp, genovec_buf, dosage_present_buf, dosage_vals_buf, &dosage_ct, &is_explicit_alt1);
+  if (reterr) {
+    // don't print malformed-.pgen error message here for now, since we may
+    // want to put this in a multithreaded loop?
+    return reterr;
+  }
+  if (maj_allele_idx) {
+    genovec_invert_unsafe(sample_ct, genovec_buf);
+    if (dosage_ct) {
+      biallelic_dosage16_invert(dosage_ct, dosage_vals_buf);
+    }
+  }
+  zero_trailing_quaters(sample_ct, genovec_buf);
+  if (missing_presentp) {
+    // missing_present assumed to be initialized to 0
+    const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
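+    // (word & (word >> 1) & kMask5555) isolates genotype-3 (missing
+    // hardcall) entries; when dosages are present, a missing hardcall only
+    // counts as missing if no dosage overrides it.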
+    if (!dosage_ct) {
+      for (uint32_t widx = 0; widx < sample_ctl2; ++widx) {
+	const uintptr_t genovec_word = genovec_buf[widx];
+	if (genovec_word & (genovec_word >> 1) & kMask5555) {
+	  *missing_presentp = 1;
+	  break;
+	}
+      }
+    } else {
+      halfword_t* dosage_present_alias = (halfword_t*)dosage_present_buf;
+      for (uint32_t widx = 0; widx < sample_ctl2; ++widx) {
+	const uintptr_t genovec_word = genovec_buf[widx];
+	const uintptr_t ulii = genovec_word & (genovec_word >> 1) & kMask5555;
+	if (ulii) {
+	  if (pack_word_to_halfword(ulii) & (~dosage_present_alias[widx])) {
+	    *missing_presentp = 1;
+	    break;
+	  }
+	}
+      }
+    }
+  }
+  return expand_centered_varmaj(genovec_buf, dosage_present_buf, dosage_vals_buf, variance_standardize, sample_ct, dosage_ct, maj_freq, normed_dosages);
+}
+
+// multithread globals
+double* g_normed_dosage_vmaj_bufs[2] = {nullptr, nullptr};
+double* g_normed_dosage_smaj_bufs[2] = {nullptr, nullptr};
+
+double* g_grm = nullptr;
+
+static uint32_t g_pca_sample_ct = 0;
+static uint32_t g_cur_batch_size = 0;
+
+CONSTU31(kGrmVariantBlockSize, 144);
+
+// turns out dsyrk_ does exactly what we want here
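+// (dsyrk performs the symmetric rank-k update C := A^T*A + C, which is
+// precisely the per-batch accumulation grm += Y^T * Y needed below.)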
+THREAD_FUNC_DECL calc_grm_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  assert(!tidx);
+  const uint32_t sample_ct = g_pca_sample_ct;
+  double* grm = g_grm;
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_batch = g_is_last_thread_block;
+    const uint32_t cur_batch_size = g_cur_batch_size;
+    if (cur_batch_size) {
+      transpose_multiply_self_incr(g_normed_dosage_vmaj_bufs[parity], sample_ct, cur_batch_size, grm);
+    }
+    if (is_last_batch) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+// can't use dsyrk_, so we manually partition the GRM piece we need to compute
+// into an appropriate number of sub-pieces
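+// each thread owns GRM rows [g_thread_start[tidx], g_thread_start[tidx + 1])
+// and accumulates grm_piece += Y_smaj[row band] * Y_vmaj for that band.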
+THREAD_FUNC_DECL calc_grm_part_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uintptr_t sample_ct = g_pca_sample_ct;
+  const uintptr_t first_thread_row_start_idx = g_thread_start[0];
+  const uintptr_t row_start_idx = g_thread_start[tidx];
+  const uintptr_t row_ct = g_thread_start[tidx + 1] - row_start_idx;
+  double* grm_piece = &(g_grm[(row_start_idx - first_thread_row_start_idx) * sample_ct]);
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_batch = g_is_last_thread_block;
+    const uintptr_t cur_batch_size = g_cur_batch_size;
+    if (cur_batch_size) {
+      double* normed_vmaj = g_normed_dosage_vmaj_bufs[parity];
+      double* normed_smaj = g_normed_dosage_smaj_bufs[parity];
+      row_major_matrix_multiply_incr(&(normed_smaj[row_start_idx * cur_batch_size]), normed_vmaj, row_ct, sample_ct, cur_batch_size, grm_piece);
+    }
+    if (is_last_batch) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+// missing_nz bit is set iff that sample has at least one missing entry in
+// current block
+uintptr_t* g_missing_nz[2] = {nullptr, nullptr};
+uintptr_t* g_missing_smaj[2] = {nullptr, nullptr};
+uint32_t* g_missing_dbl_exclude_cts = nullptr;
+
+#ifdef __LP64__
+CONSTU31(kDblMissingBlockWordCt, 2);
+#else
+// the 32-bit code paths below read four words per sample per block, so the
+// same 128-variant block must span four 32-bit words
+CONSTU31(kDblMissingBlockWordCt, 4);
+#endif
+CONSTU31(kDblMissingBlockSize, kDblMissingBlockWordCt * kBitsPerWord);
+
+THREAD_FUNC_DECL calc_dbl_missing_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uint64_t first_thread_row_start_idx = g_thread_start[0];
+  const uint64_t dbl_exclude_offset = (first_thread_row_start_idx * (first_thread_row_start_idx - 1)) / 2;
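+  // lower-triangle linearization: pair (i,j) with j < i lives at offset
+  // i*(i-1)/2 + j; dbl_exclude_offset rebases this to the first row of the
+  // current --parallel piece.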
+  const uint32_t row_start_idx = g_thread_start[tidx];
+  const uintptr_t row_end_idx = g_thread_start[tidx + 1];
+  uint32_t* missing_dbl_exclude_cts = g_missing_dbl_exclude_cts;
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_batch = g_is_last_thread_block;
+
+    // currently only care about zero vs. nonzero (I/O error)
+    const uint32_t cur_batch_size = g_cur_batch_size;
+    if (cur_batch_size) {
+      const uintptr_t* missing_nz = g_missing_nz[parity];
+      const uintptr_t* missing_smaj = g_missing_smaj[parity];
+      const uint32_t first_idx = next_set(missing_nz, 0, row_end_idx);
+      uint32_t sample_idx = first_idx;
+      uint32_t prev_missing_nz_ct = 0;
+      if (sample_idx < row_start_idx) {
+	sample_idx = next_set(missing_nz, row_start_idx, row_end_idx);
+	if (sample_idx != row_end_idx) {
+	  prev_missing_nz_ct = popcount_bit_idx(missing_nz, 0, row_start_idx);
+	}
+      }
+      while (sample_idx < row_end_idx) {
+	uint32_t sample_idx2 = first_idx;
+	// todo: compare this explicit unroll with ordinary iteration over a
+	// cur_words[] array
+	// todo: try 1 word at a time, and 30 words at a time
+	const uintptr_t cur_word0 = missing_smaj[sample_idx * kDblMissingBlockWordCt];
+	const uintptr_t cur_word1 = missing_smaj[sample_idx * kDblMissingBlockWordCt + 1];
+#ifndef __LP64__
+	const uintptr_t cur_word2 = missing_smaj[sample_idx * kDblMissingBlockWordCt + 2];
+	const uintptr_t cur_word3 = missing_smaj[sample_idx * kDblMissingBlockWordCt + 3];
+#endif
+	// (sample_idx - 1) underflow ok
+	uint32_t* write_base = &(missing_dbl_exclude_cts[((((uint64_t)sample_idx) * (sample_idx - 1)) / 2) - dbl_exclude_offset]);
+	for (uint32_t uii = 0; uii < prev_missing_nz_ct; ++uii, ++sample_idx2) {
+	  next_set_unsafe_ck(missing_nz, &sample_idx2);
+	  const uintptr_t* cur_missing_smaj_base = &(missing_smaj[sample_idx2 * kDblMissingBlockWordCt]);
+	  const uintptr_t cur_and0 = cur_word0 & cur_missing_smaj_base[0];
+	  const uintptr_t cur_and1 = cur_word1 & cur_missing_smaj_base[1];
+#ifdef __LP64__
+	  if (cur_and0 || cur_and1) {
+	    write_base[sample_idx2] += popcount_2_longs(cur_and0, cur_and1);
+	  }
+#else
+	  const uintptr_t cur_and2 = cur_word2 & cur_missing_smaj_base[2];
+	  const uintptr_t cur_and3 = cur_word3 & cur_missing_smaj_base[3];
+	  if (cur_and0 || cur_and1 || cur_and2 || cur_and3) {
+	    write_base[sample_idx2] += popcount_4_longs(cur_and0, cur_and1, cur_and2, cur_and3);
+	  }
+#endif
+	}
+	++prev_missing_nz_ct;
+	sample_idx = next_set(missing_nz, sample_idx + 1, row_end_idx);
+      }
+    }
+    if (is_last_batch) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+pglerr_t calc_missing_matrix(const uintptr_t* sample_include, const uint32_t* sample_include_cumulative_popcounts, const uintptr_t* variant_include, uint32_t sample_ct, uint32_t variant_ct, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t row_start_idx, uintptr_t row_end_idx, uint32_t max_thread_ct, pgen_reader_t* simple_pgrp, uint32_t** missing_cts_ptr, uint32_t** missing_dbl_exclude_cts_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  threads_state_t ts;
+  init_threads3z(&ts);
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uintptr_t row_end_idxl = BITCT_TO_WORDCT(row_end_idx);
+    uintptr_t* missing_vmaj = nullptr;
+    uintptr_t* genovec_buf = nullptr;
+    if (bigstack_calloc_ui(row_end_idx, missing_cts_ptr) ||
+	bigstack_calloc_ui((((uint64_t)row_end_idx) * (row_end_idx - 1) - ((uint64_t)row_start_idx) * (row_start_idx - 1)) / 2, missing_dbl_exclude_cts_ptr) ||
+	bigstack_calloc_ul(row_end_idxl, &g_missing_nz[0]) ||
+	bigstack_calloc_ul(row_end_idxl, &g_missing_nz[1]) ||
+	bigstack_alloc_ul(QUATERCT_TO_WORDCT(row_end_idx), &genovec_buf) ||
+	bigstack_alloc_ul(row_end_idxl * (k1LU * kDblMissingBlockSize), &missing_vmaj) ||
+	bigstack_alloc_ul(round_up_pow2(row_end_idx, 2) * kDblMissingBlockWordCt, &g_missing_smaj[0]) ||
+	bigstack_alloc_ul(round_up_pow2(row_end_idx, 2) * kDblMissingBlockWordCt, &g_missing_smaj[1])) {
+      goto calc_missing_matrix_ret_NOMEM;
+    }
+    uint32_t* missing_cts = *missing_cts_ptr;
+    uint32_t* missing_dbl_exclude_cts = *missing_dbl_exclude_cts_ptr;
+    g_missing_dbl_exclude_cts = missing_dbl_exclude_cts;
+    vul_t* transpose_bitblock_wkspace = (vul_t*)bigstack_alloc_raw(kPglBitTransposeBufbytes);
+    uint32_t calc_thread_ct = (max_thread_ct > 8)? (max_thread_ct - 1) : max_thread_ct;
+    ts.calc_thread_ct = calc_thread_ct;
+    if (bigstack_alloc_ui(calc_thread_ct + 1, &g_thread_start) ||
+	bigstack_alloc_thread(calc_thread_ct, &ts.threads)) {
+      goto calc_missing_matrix_ret_NOMEM;
+    }
+    // note that this g_thread_start[] may have different values than the one
+    // computed by calc_grm(), since calc_thread_ct changes in the MTBLAS and
+    // OS X cases.
+    triangle_fill(sample_ct, calc_thread_ct, parallel_idx, parallel_tot, 0, 1, g_thread_start);
+    assert(g_thread_start[0] == row_start_idx);
+    assert(g_thread_start[calc_thread_ct] == row_end_idx);
+    const uint32_t sample_transpose_batch_ct_m1 = (row_end_idx - 1) / kPglBitTransposeBatch;
+    
+    uint32_t parity = 0;
+    uint32_t cur_variant_idx_start = 0;
+    uint32_t variant_uidx = 0;
+    uint32_t pct = 0;
+    uint32_t next_print_variant_idx = variant_ct / 100;
+    // caller's responsibility to print this
+    // logprint("Correcting for missingness: ");
+    fputs("0%", stdout);
+    fflush(stdout);
+    pgr_clear_ld_cache(simple_pgrp);
+    while (1) {
+      uint32_t cur_batch_size = 0;
+      if (!ts.is_last_block) {
+	cur_batch_size = kDblMissingBlockSize;
+	uint32_t cur_variant_idx_end = cur_variant_idx_start + cur_batch_size;
+	if (cur_variant_idx_end > variant_ct) {
+	  cur_batch_size = variant_ct - cur_variant_idx_start;
+	  cur_variant_idx_end = variant_ct;
+	  fill_ulong_zero((kDblMissingBlockSize - cur_batch_size) * row_end_idxl, &(missing_vmaj[cur_batch_size * row_end_idxl]));
+	}
+	uintptr_t* missing_vmaj_iter = missing_vmaj;
+	for (uint32_t variant_idx = cur_variant_idx_start; variant_idx < cur_variant_idx_end; ++variant_uidx, ++variant_idx) {
+	  next_set_unsafe_ck(variant_include, &variant_uidx);
+	  reterr = pgr_read_missingness_multi(sample_include, sample_include_cumulative_popcounts, row_end_idx, variant_uidx, simple_pgrp, nullptr, missing_vmaj_iter, nullptr, genovec_buf);
+	  if (reterr) {
+	    if (reterr == kPglRetMalformedInput) {
+	      logprint("\n");
+	      logerrprint("Error: Malformed .pgen file.\n");
+	    }
+	    goto calc_missing_matrix_ret_1;
+	  }	  
+	  missing_vmaj_iter = &(missing_vmaj_iter[row_end_idxl]);
+	}
+	uintptr_t* cur_missing_smaj_iter = g_missing_smaj[parity];
+	uint32_t sample_transpose_batch_idx = 0;
+	uint32_t sample_batch_size = kPglBitTransposeBatch;
+	while (1) {
+	  if (sample_transpose_batch_idx >= sample_transpose_batch_ct_m1) {
+	    if (sample_transpose_batch_idx > sample_transpose_batch_ct_m1) {
+	      break;
+	    }
+	    sample_batch_size = MOD_NZ(row_end_idx, kPglBitTransposeBatch);
+	  }
+	  // missing_smaj offset needs to be 64-bit if kDblMissingBlockWordCt
+	  // increases
+	  transpose_bitblock(&(missing_vmaj[sample_transpose_batch_idx * (kPglBitTransposeBatch / kBitsPerWord)]), row_end_idxl, kDblMissingBlockWordCt, kDblMissingBlockSize, sample_batch_size, &(cur_missing_smaj_iter[sample_transpose_batch_idx * kPglBitTransposeBatch * kDblMissingBlockWordCt]), transpose_bitblock_wkspace);
+	  ++sample_transpose_batch_idx;
+	}
+	uintptr_t* cur_missing_nz = g_missing_nz[parity];
+	fill_ulong_zero(row_end_idxl, cur_missing_nz);
+	for (uint32_t sample_idx = 0; sample_idx < row_end_idx; ++sample_idx) {
+	  const uintptr_t cur_word0 = *cur_missing_smaj_iter++;
+	  const uintptr_t cur_word1 = *cur_missing_smaj_iter++;
+#ifdef __LP64__
+	  if (cur_word0 || cur_word1) {
+	    set_bit(sample_idx, cur_missing_nz);
+	    missing_cts[sample_idx] += popcount_2_longs(cur_word0, cur_word1);
+	  }
+#else
+	  const uintptr_t cur_word2 = *cur_missing_smaj_iter++;
+	  const uintptr_t cur_word3 = *cur_missing_smaj_iter++;
+	  if (cur_word0 || cur_word1 || cur_word2 || cur_word3) {
+	    set_bit(sample_idx, cur_missing_nz);
+	    missing_cts[sample_idx] += popcount_4_longs(cur_word0, cur_word1, cur_word2, cur_word3);
+	  }
+#endif
+	}
+      }
+      if (cur_variant_idx_start) {
+	join_threads3z(&ts);
+	if (ts.is_last_block) {
+	  break;
+	}
+	if (cur_variant_idx_start >= next_print_variant_idx) {
+	  if (pct > 10) {
+	    putc_unlocked('\b', stdout);
+	  }
+	  pct = (cur_variant_idx_start * 100LLU) / variant_ct;
+	  printf("\b\b%u%%", pct++);
+	  fflush(stdout);
+	  next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+	}
+      }
+      ts.is_last_block = (cur_variant_idx_start + cur_batch_size == variant_ct);
+      g_cur_batch_size = cur_batch_size;
+      ts.thread_func_ptr = calc_dbl_missing_thread;
+      if (spawn_threads3z(cur_variant_idx_start, &ts)) {
+	goto calc_missing_matrix_ret_THREAD_CREATE_FAIL;
+      }
+      cur_variant_idx_start += cur_batch_size;
+      parity = 1 - parity;
+    }
+    if (pct > 10) {
+      putc_unlocked('\b', stdout);
+    }
+    fputs("\b\b", stdout);
+    logprint("done.\n");
+    bigstack_mark = (unsigned char*)g_missing_nz[0];
+  }
+  while (0) {
+  calc_missing_matrix_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  calc_missing_matrix_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ calc_missing_matrix_ret_1:
+  threads3z_cleanup(&ts, &g_cur_batch_size);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t calc_grm(const uintptr_t* orig_sample_include, const char* sample_ids, const char* sids, uintptr_t* variant_include, const chr_info_t* cip, const uintptr_t* variant_allele_idxs, const alt_allele_ct_t* maj_alleles, const double* allele_freqs, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t raw_variant_ct, uint32_t variant_ct, grm_flags_t grm_flags, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t max_thread_ct, pgen_reader_t* simple_pgrp, char* outname, char* outname_end, double** grm_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  FILE* outfile = nullptr;
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  threads_state_t ts;
+  pglerr_t reterr = kPglRetSuccess;
+  cswrite_init_null(&css);
+  init_threads3z(&ts);
+  {
+    if (sample_ct < 2) {
+      logerrprint("Error: GRM construction requires at least two samples.\n");
+      goto calc_grm_ret_INCONSISTENT_INPUT;
+    }
+    assert(variant_ct);
+#if defined(__APPLE__) || defined(USE_MTBLAS)
+    uint32_t calc_thread_ct = 1;
+#else
+    uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+    if (calc_thread_ct * parallel_tot > sample_ct / 32) {
+      calc_thread_ct = sample_ct / (32 * parallel_tot);
+      if (!calc_thread_ct) {
+	calc_thread_ct = 1;
+      }
+    }
+#endif
+    ts.calc_thread_ct = calc_thread_ct;
+    const uintptr_t* sample_include = orig_sample_include;
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    uint32_t row_start_idx = 0;
+    uintptr_t row_end_idx = sample_ct;
+    uint32_t* thread_start = nullptr;
+    if ((calc_thread_ct != 1) || (parallel_tot != 1)) {
+      // note that grm should be allocated on bottom if no --parallel, since it
+      // may continue to be used after function exit.  So we allocate this on
+      // top.
+      if (bigstack_end_alloc_ui(calc_thread_ct + 1, &thread_start)) {
+	goto calc_grm_ret_NOMEM;
+      }
+      // slightly different from plink 1.9 since we don't bother to treat the
+      // diagonal as a special case any more.
+      triangle_fill(sample_ct, calc_thread_ct, parallel_idx, parallel_tot, 0, 1, thread_start);
+      row_start_idx = thread_start[0];
+      row_end_idx = thread_start[calc_thread_ct];
+      if (row_end_idx < sample_ct) {
+	// 0
+	// 0 0
+	// 0 0 0
+	// 0 0 0 0
+	// 1 1 1 1 1
+	// 1 1 1 1 1 1
+	// 2 2 2 2 2 2 2
+	// 2 2 2 2 2 2 2 2
+	// If we're computing part 0, we never need to load the last 4 samples;
+	// if part 1, we don't need the last two; etc.
+	uintptr_t* new_sample_include;
+	if (bigstack_alloc_ul(raw_sample_ctl, &new_sample_include)) {
+	  goto calc_grm_ret_NOMEM;
+	}
+	const uint32_t sample_uidx_end = 1 + idx_to_uidx_basic(orig_sample_include, row_end_idx - 1);
+	memcpy(new_sample_include, orig_sample_include, round_up_pow2(sample_uidx_end, kBitsPerWord) / CHAR_BIT);
+	clear_bits_nz(sample_uidx_end, raw_sample_ctl * kBitsPerWord, new_sample_include);
+	sample_include = new_sample_include;
+      }
+    }
+    g_thread_start = thread_start;
+    double* grm;
+    if (bigstack_calloc_d((row_end_idx - row_start_idx) * row_end_idx, &grm)) {
+      goto calc_grm_ret_NOMEM;
+    }
+    g_pca_sample_ct = row_end_idx;
+    g_grm = grm;
+    const uint32_t row_end_idxl2 = QUATERCT_TO_WORDCT(row_end_idx);
+    const uint32_t row_end_idxl = BITCT_TO_WORDCT(row_end_idx);
+    uint32_t* sample_include_cumulative_popcounts;
+    uintptr_t* genovec_buf;
+    uintptr_t* dosage_present_buf;
+    dosage_t* dosage_vals_buf;
+    if (bigstack_alloc_ui(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
+	bigstack_alloc_thread(calc_thread_ct, &ts.threads) ||
+        bigstack_alloc_ul(row_end_idxl2, &genovec_buf) ||
+	bigstack_alloc_ul(row_end_idxl, &dosage_present_buf) ||
+	bigstack_alloc_dosage(row_end_idx, &dosage_vals_buf)) {
+      goto calc_grm_ret_NOMEM;
+    }
+    fill_cumulative_popcounts(sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
+    reterr = conditional_allocate_non_autosomal_variants(cip, "GRM construction", raw_variant_ct, &variant_include, &variant_ct);
+    if (reterr) {
+      goto calc_grm_ret_1;
+    }
+    if (bigstack_alloc_d(row_end_idx * kGrmVariantBlockSize, &g_normed_dosage_vmaj_bufs[0]) ||
+	bigstack_alloc_d(row_end_idx * kGrmVariantBlockSize, &g_normed_dosage_vmaj_bufs[1])) {
+      goto calc_grm_ret_NOMEM;
+    }
+    const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    uintptr_t* variant_include_has_missing = nullptr;
+    if (!(grm_flags & kfGrmMeanimpute)) {
+      if (bigstack_calloc_ul(raw_variant_ctl, &variant_include_has_missing)) {
+	goto calc_grm_ret_NOMEM;
+      }
+    }
+    if (thread_start) {
+      if (bigstack_alloc_d(row_end_idx * kGrmVariantBlockSize, &g_normed_dosage_smaj_bufs[0]) ||
+	  bigstack_alloc_d(row_end_idx * kGrmVariantBlockSize, &g_normed_dosage_smaj_bufs[1])) {
+	goto calc_grm_ret_NOMEM;
+      }
+    }
+#ifdef USE_MTBLAS
+    const uint32_t blas_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
+    BLAS_SET_NUM_THREADS(blas_thread_ct);
+#endif
+    // Main workflow:
+    // 1. Set n=0, load batch 0
+    //
+    // 2. Spawn threads processing batch n
+    // 3. Increment n by 1
+    // 4. Load batch n unless eof
+    // 5. Join threads
+    // 6. Goto step 2 unless eof
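+    // Buffers are double-buffered on 'parity', so step 4's disk I/O overlaps
+    // with step 2's computation.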
+    const uint32_t variance_standardize = !(grm_flags & kfGrmCov);
+    uint32_t parity = 0;
+    uint32_t cur_variant_idx_start = 0;
+    uint32_t variant_uidx = 0;
+    uint32_t cur_allele_ct = 2;
+    uint32_t pct = 0;
+    uint32_t next_print_variant_idx = variant_ct / 100;
+    logprint("Constructing GRM: ");
+    fputs("0%", stdout);
+    fflush(stdout);
+    pgr_clear_ld_cache(simple_pgrp);
+    while (1) {
+      uint32_t cur_batch_size = 0;
+      if (!ts.is_last_block) {
+	cur_batch_size = kGrmVariantBlockSize;
+	uint32_t cur_variant_idx_end = cur_variant_idx_start + cur_batch_size;
+	if (cur_variant_idx_end > variant_ct) {
+	  cur_batch_size = variant_ct - cur_variant_idx_start;
+	  cur_variant_idx_end = variant_ct;
+	}
+	double* normed_vmaj_iter = g_normed_dosage_vmaj_bufs[parity];
+	for (uint32_t variant_idx = cur_variant_idx_start; variant_idx < cur_variant_idx_end; ++variant_uidx, ++variant_idx) {
+	  next_set_unsafe_ck(variant_include, &variant_uidx);
+	  const uint32_t maj_allele_idx = maj_alleles[variant_uidx];
+	  uint32_t missing_present = 0;
+	  uintptr_t allele_idx_base;
+	  if (!variant_allele_idxs) {
+	    allele_idx_base = variant_uidx;
+	  } else {
+	    allele_idx_base = variant_allele_idxs[variant_uidx];
+	    cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - allele_idx_base;
+	    allele_idx_base -= variant_uidx;
+	  }
+	  reterr = load_centered_varmaj(sample_include, sample_include_cumulative_popcounts, variance_standardize, row_end_idx, variant_uidx, maj_allele_idx, get_allele_freq(&(allele_freqs[allele_idx_base]), maj_allele_idx, cur_allele_ct), simple_pgrp, variant_include_has_missing? (&missing_present) : nullptr, normed_vmaj_iter, genovec_buf, dosage_present_buf, dosage_vals_buf);
+	  if (reterr) {
+	    if (reterr == kPglRetInconsistentInput) {
+	      logprint("\n");
+	      logerrprint("Error: Zero-MAF variant is not actually monomorphic.  (This is possible when\ne.g. MAF is estimated from founders, but the minor allele was only observed in\nnonfounders.  In any case, you should be using e.g. --maf to filter out all\nvery-low-MAF variants, since the relationship matrix distance formula does not\nhandle them well.)\n");
+	    } else if (reterr == kPglRetMalformedInput) {
+	      logprint("\n");
+	      logerrprint("Error: Malformed .pgen file.\n");
+	    }
+	    goto calc_grm_ret_1;
+	  }
+	  if (missing_present) {
+	    set_bit(variant_uidx, variant_include_has_missing);
+	  }
+	  normed_vmaj_iter = &(normed_vmaj_iter[row_end_idx]);
+	}
+	if (thread_start) {
+	  transpose_copy(g_normed_dosage_vmaj_bufs[parity], cur_batch_size, row_end_idx, g_normed_dosage_smaj_bufs[parity]);
+	}
+      }
+      if (cur_variant_idx_start) {
+	join_threads3z(&ts);
+	if (ts.is_last_block) {
+	  break;
+	}
+	if (cur_variant_idx_start >= next_print_variant_idx) {
+	  if (pct > 10) {
+	    putc_unlocked('\b', stdout);
+	  }
+	  pct = (cur_variant_idx_start * 100LLU) / variant_ct;
+	  printf("\b\b%u%%", pct++);
+	  fflush(stdout);
+	  next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+	}
+      }
+      ts.is_last_block = (cur_variant_idx_start + cur_batch_size == variant_ct);
+      g_cur_batch_size = cur_batch_size;
+      if (!ts.thread_func_ptr) {
+	if (thread_start) {
+	  ts.thread_func_ptr = calc_grm_part_thread;
+	} else {
+	  ts.thread_func_ptr = calc_grm_thread;
+	}
+      }
+      if (spawn_threads3z(cur_variant_idx_start, &ts)) {
+	goto calc_grm_ret_THREAD_CREATE_FAIL;
+      }
+      cur_variant_idx_start += cur_batch_size;
+      parity = 1 - parity;
+    }
+    BLAS_SET_NUM_THREADS(1);
+    if (pct > 10) {
+      putc_unlocked('\b', stdout);
+    }
+    fputs("\b\b", stdout);
+    logprint("done.\n");
+    uint32_t* missing_cts = nullptr; // stays null iff meanimpute
+    uint32_t* missing_dbl_exclude_cts = nullptr;
+    if (variant_include_has_missing) {
+      const uint32_t variant_ct_with_missing = popcount_longs(variant_include_has_missing, raw_variant_ctl);
+      // if no missing calls at all, act as if meanimpute was on
+      if (variant_ct_with_missing) {
+	logprint("Correcting for missingness... ");
+	reterr = calc_missing_matrix(sample_include, sample_include_cumulative_popcounts, variant_include_has_missing, sample_ct, variant_ct_with_missing, parallel_idx, parallel_tot, row_start_idx, row_end_idx, max_thread_ct, simple_pgrp, &missing_cts, &missing_dbl_exclude_cts);
+	if (reterr) {
+	  goto calc_grm_ret_1;
+	}
+      }
+    }
+    if (missing_cts) {
+      // could parallelize this loop if it ever matters
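+      // inclusion-exclusion: pair (i,j) is jointly observed at
+      //   variant_ct - missing_cts[i] - missing_cts[j] + dbl_exclude(i,j)
+      // variants, and each GRM entry is divided by its own observation count.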
+      const uint32_t* missing_dbl_exclude_iter = missing_dbl_exclude_cts;
+      for (uintptr_t row_idx = row_start_idx; row_idx < row_end_idx; ++row_idx) {
+	const uint32_t variant_ct_base = variant_ct - missing_cts[row_idx];
+	double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
+	for (uint32_t col_idx = 0; col_idx < row_idx; ++col_idx) {
+	  *grm_iter++ /= (double)((int32_t)(variant_ct_base - missing_cts[col_idx] + (*missing_dbl_exclude_iter++)));
+	}
+	*grm_iter++ /= (double)((int32_t)variant_ct_base);
+      }
+    } else {
+      const double variant_ct_recip = 1.0 / ((double)((int32_t)variant_ct));
+      for (uintptr_t row_idx = row_start_idx; row_idx < row_end_idx; ++row_idx) {
+	double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
+	for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
+	  *grm_iter++ *= variant_ct_recip;
+	}
+      }
+    }
+    // N.B. Only the lower-left triangle of grm[] is valid when parallel_tot == 1.
+
+    // possible todo: allow simultaneous --make-rel and
+    // --make-grm-gz/--make-grm-bin
+    // (note that this routine may also be called by --pca, which may not write
+    // a matrix to disk at all.)
+    if (grm_flags & (kfGrmMatrixShapemask | kfGrmTablemask | kfGrmBin)) {
+      const grm_flags_t matrix_shape = grm_flags & kfGrmMatrixShapemask;
+      char* log_write_iter;
+      if (matrix_shape) {
+	// --make-rel
+	fputs("--make-rel: Writing...", stdout);
+	fflush(stdout);
+	if (grm_flags & kfGrmMatrixBin) {
+	  char* outname_end2 = strcpya(outname_end, ".rel.bin");
+	  if (parallel_tot != 1) {
+	    *outname_end2++ = '.';
+	    outname_end2 = uint32toa(parallel_idx + 1, outname_end2);
+	  }
+	  *outname_end2 = '\0';
+	  if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+	    goto calc_grm_ret_OPEN_FAIL;
+	  }
+	  double* write_double_buf = nullptr;
+	  if (matrix_shape == kfGrmMatrixSq0) {
+	    write_double_buf = (double*)g_textbuf;
+	    fill_double_zero(kTextbufMainSize / sizeof(double), write_double_buf);
+	  } else if (matrix_shape == kfGrmMatrixSq) {
+	    if (bigstack_alloc_d(row_end_idx - row_start_idx - 1, &write_double_buf)) {
+	      goto calc_grm_ret_NOMEM;
+	    }
+	  }
+	  uintptr_t row_idx = row_start_idx;
+	  while (1) {
+	    const double* grm_row = &(grm[(row_idx - row_start_idx) * row_end_idx]);
+	    ++row_idx;
+	    if (fwrite_checked(grm_row, row_idx * sizeof(double), outfile)) {
+	      goto calc_grm_ret_WRITE_FAIL;
+	    }
+	    if (row_idx == row_end_idx) {
+	      break;
+	    }
+	    if (matrix_shape == kfGrmMatrixSq0) {
+	      uintptr_t zbytes_to_dump = (sample_ct - row_idx) * sizeof(double);
+	      while (zbytes_to_dump >= kTextbufMainSize) {
+		if (fwrite_checked(write_double_buf, kTextbufMainSize, outfile)) {
+		  goto calc_grm_ret_WRITE_FAIL;
+		}
+		zbytes_to_dump -= kTextbufMainSize;
+	      }
+	      if (zbytes_to_dump) {
+		if (fwrite_checked(write_double_buf, zbytes_to_dump, outfile)) {
+		  goto calc_grm_ret_WRITE_FAIL;
+		}
+	      }
+	    } else if (matrix_shape == kfGrmMatrixSq) {
+	      double* write_double_iter = write_double_buf;
+	      const double* grm_col = &(grm[row_idx - 1]);
+	      for (uintptr_t row_idx2 = row_idx; row_idx2 < sample_ct; ++row_idx2) {
+		*write_double_iter++ = grm_col[(row_idx2 - row_start_idx) * sample_ct];
+	      }
+	      if (fwrite_checked(write_double_buf, (sample_ct - row_idx) * sizeof(double), outfile)) {
+		goto calc_grm_ret_WRITE_FAIL;
+	      }
+	    }
+	  }
+	  if (fclose_null(&outfile)) {
+	    goto calc_grm_ret_WRITE_FAIL;
+	  }
+	} else if (grm_flags & kfGrmMatrixBin4) {
+	  // downcode all entries to floats
+	  char* outname_end2 = strcpya(outname_end, ".rel.bin");
+	  if (parallel_tot != 1) {
+	    *outname_end2++ = '.';
+	    outname_end2 = uint32toa(parallel_idx + 1, outname_end2);
+	  }
+	  *outname_end2 = '\0';
+	  if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+	    goto calc_grm_ret_OPEN_FAIL;
+	  }
+	  float* write_float_buf;
+	  if (bigstack_alloc_f(row_end_idx, &write_float_buf)) {
+	    goto calc_grm_ret_NOMEM;
+	  }
+	  uintptr_t row_idx = row_start_idx;
+	  do {
+	    const double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
+	    float* write_float_iter = write_float_buf;
+	    for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
+	      *write_float_iter++ = (float)(*grm_iter++);
+	    }
+	    ++row_idx;
+	    if (matrix_shape == kfGrmMatrixSq0) {
+	      fill_float_zero(sample_ct - row_idx, write_float_iter);
+	      write_float_iter = &(write_float_buf[sample_ct]);
+	    } else if (matrix_shape == kfGrmMatrixSq) {
+	      const double* grm_col = &(grm[row_idx - 1]);
+	      for (uintptr_t row_idx2 = row_idx; row_idx2 < sample_ct; ++row_idx2) {
+		*write_float_iter++ = (float)(grm_col[(row_idx2 - row_start_idx) * sample_ct]);
+	      }
+	    }
+	    if (fwrite_checked(write_float_buf, sizeof(float) * ((uintptr_t)(write_float_iter - write_float_buf)), outfile)) {
+	      goto calc_grm_ret_WRITE_FAIL;
+	    }
+	  } while (row_idx < row_end_idx);
+	  if (fclose_null(&outfile)) {
+	    goto calc_grm_ret_WRITE_FAIL;
+	  }
+	} else {
+	  unsigned char* overflow_buf;
+	  if (bigstack_alloc_uc(kCompressStreamBlock + 16 * row_end_idx, &overflow_buf)) {
+	    goto calc_grm_ret_NOMEM;
+	  }
+	  char* outname_end2 = strcpya(outname_end, ".rel");
+	  if (parallel_tot != 1) {
+	    *outname_end2++ = '.';
+	    outname_end2 = uint32toa(parallel_idx + 1, outname_end2);
+	  }
+	  const uint32_t output_zst = (grm_flags / kfGrmMatrixZs) & 1;
+	  if (output_zst) {
+	    outname_end2 = strcpya(outname_end2, ".zst");
+	  }
+	  *outname_end2 = '\0';
+	  if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+	    goto calc_grm_ret_OPEN_FAIL;
+	  }
+	  cswritep = (char*)overflow_buf;
+	  uintptr_t row_idx = row_start_idx;
+	  do {
+	    const double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
+	    ++row_idx;
+	    for (uint32_t col_idx = 0; col_idx < row_idx; ++col_idx) {
+	      cswritep = dtoa_g(*grm_iter++, cswritep);
+	      *cswritep++ = '\t';
+	    }
+	    if (matrix_shape == kfGrmMatrixSq0) {
+	      // (roughly same performance as creating a zero-tab constant
+	      // buffer in advance)
+	      const uint32_t zcount = sample_ct - row_idx;
+	      const uint32_t wct = DIV_UP(zcount, kBytesPerWord / 2);
+	      // assumes little-endian
+	      const uintptr_t zerotab_word = 0x930 * kMask0001;
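+	      // 0x30 = '0', 0x09 = tab: each word spells "0\t0\t..." in memory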
+#ifdef __arm__
+  #error "Unaligned accesses in calc_grm()."
+#endif
+	      uintptr_t* writep_alias = (uintptr_t*)cswritep;
+	      for (uintptr_t widx = 0; widx < wct; ++widx) {
+		*writep_alias++ = zerotab_word;
+	      }
+	      cswritep = &(cswritep[zcount * 2]);
+	    } else if (matrix_shape == kfGrmMatrixSq) {
+	      const double* grm_col = &(grm[row_idx - 1]);
+	      for (uintptr_t row_idx2 = row_idx; row_idx2 < sample_ct; ++row_idx2) {
+		cswritep = dtoa_g(grm_col[(row_idx2 - row_start_idx) * sample_ct], cswritep);
+		*cswritep++ = '\t';
+	      }
+	    }
+	    decr_append_binary_eoln(&cswritep);
+	    if (cswrite(&css, &cswritep)) {
+	      goto calc_grm_ret_WRITE_FAIL;
+	    }
+	  } while (row_idx < row_end_idx);
+	  if (cswrite_close_null(&css, cswritep)) {
+	    goto calc_grm_ret_WRITE_FAIL;
+	  }
+	}
+	putc_unlocked('\r', stdout);
+	log_write_iter = strcpya(g_logbuf, "--make-rel: GRM ");
+	if (parallel_tot != 1) {
+	  log_write_iter = strcpya(log_write_iter, "component ");
+	}
+	log_write_iter = strcpya(log_write_iter, "written to ");
+	log_write_iter = strcpya(log_write_iter, outname);
+      } else {
+	const uint32_t* missing_dbl_exclude_iter = missing_dbl_exclude_cts;
+	if (grm_flags & kfGrmBin) {
+	  // --make-grm-bin
+	  float* write_float_buf;
+	  if (bigstack_alloc_f(row_end_idx, &write_float_buf)) {
+	    goto calc_grm_ret_NOMEM;
+	  }
+	  char* outname_end2 = strcpya(outname_end, ".grm.bin");
+	  if (parallel_tot != 1) {
+	    *outname_end2++ = '.';
+	    outname_end2 = uint32toa(parallel_idx + 1, outname_end2);
+	  }
+	  *outname_end2 = '\0';
+	  if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+	    goto calc_grm_ret_OPEN_FAIL;
+	  }
+	  fputs("--make-grm-bin: Writing...", stdout);
+	  fflush(stdout);
+	  for (uintptr_t row_idx = row_start_idx; row_idx < row_end_idx; ++row_idx) {
+	    const double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
+	    for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
+	      write_float_buf[col_idx] = (float)(*grm_iter++);
+	    }
+	    if (fwrite_checked(write_float_buf, (row_idx + 1) * sizeof(float), outfile)) {
+	      goto calc_grm_ret_WRITE_FAIL;
+	    }
+	  }
+	  if (fclose_null(&outfile)) {
+	    goto calc_grm_ret_WRITE_FAIL;
+	  }
+
+	  outname_end2 = strcpya(outname_end, ".grm.N.bin");
+	  if (parallel_tot != 1) {
+	    *outname_end2++ = '.';
+	    outname_end2 = uint32toa(parallel_idx + 1, outname_end2);
+	  }
+	  *outname_end2 = '\0';
+	  if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+	    goto calc_grm_ret_OPEN_FAIL;
+	  }
+	  if (!missing_cts) {
+	    // trivial case: write the same number repeatedly
+	    const uintptr_t tot_cells = (((uint64_t)row_end_idx) * (row_end_idx - 1) - ((uint64_t)row_start_idx) * (row_start_idx - 1)) / 2;
+	    const float variant_ctf = (float)((int32_t)variant_ct);
+	    write_float_buf = (float*)g_textbuf;
+	    for (uint32_t uii = 0; uii < (kTextbufMainSize / sizeof(float)); ++uii) {
+	      write_float_buf[uii] = variant_ctf;
+	    }
+	    const uintptr_t full_write_ct = tot_cells / (kTextbufMainSize / sizeof(float));
+	    for (uintptr_t ulii = 0; ulii < full_write_ct; ++ulii) {
+	      if (fwrite_checked(write_float_buf, kTextbufMainSize, outfile)) {
+		goto calc_grm_ret_WRITE_FAIL;
+	      }
+	    }
+	    const uintptr_t remainder = tot_cells % (kTextbufMainSize / sizeof(float));
+	    if (remainder) {
+	      if (fwrite_checked(write_float_buf, remainder * sizeof(float), outfile)) {
+		goto calc_grm_ret_WRITE_FAIL;
+	      }
+	    }
+	  } else {
+	    for (uintptr_t row_idx = row_start_idx; row_idx < row_end_idx; ++row_idx) {
+	      const uint32_t variant_ct_base = variant_ct - missing_cts[row_idx];
+	      for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
+		uint32_t cur_obs_ct = variant_ct_base;
+		if (col_idx != row_idx) {
+		  cur_obs_ct = cur_obs_ct - missing_cts[col_idx] + (*missing_dbl_exclude_iter++);
+		}
+		write_float_buf[col_idx] = (float)((int32_t)cur_obs_ct);
+	      }
+	      if (fwrite_checked(write_float_buf, (row_idx + 1) * sizeof(float), outfile)) {
+		goto calc_grm_ret_WRITE_FAIL;
+	      }
+	    }
+	  }
+	  if (fclose_null(&outfile)) {
+	    goto calc_grm_ret_WRITE_FAIL;
+	  }
+	  putc_unlocked('\r', stdout);
+	  const uint32_t outname_copy_byte_ct = 5 + (uintptr_t)(outname_end - outname);
+	  log_write_iter = strcpya(g_logbuf, "--make-grm-bin: GRM ");
+	  if (parallel_tot != 1) {
+	    log_write_iter = strcpya(log_write_iter, "component ");
+	  }
+	  log_write_iter = strcpya(log_write_iter, "written to ");
+	  log_write_iter = memcpya(log_write_iter, outname, outname_copy_byte_ct);
+	  log_write_iter = memcpyl3a(log_write_iter, "bin");
+	  if (parallel_tot != 1) {
+	    *log_write_iter++ = '.';
+	    log_write_iter = uint32toa(parallel_idx + 1, log_write_iter);
+	  }
+	  log_write_iter = memcpyl3a(log_write_iter, " , ");
+	  if (parallel_idx) {
+	    log_write_iter = strcpya(log_write_iter, "and ");
+	  }
+	  log_write_iter = strcpya(log_write_iter, "observation counts to ");
+	  log_write_iter = memcpya(log_write_iter, outname, (uintptr_t)(outname_end2 - outname));
+	} else {
+	  // --make-grm-gz
+	  unsigned char* overflow_buf;
+	  if (bigstack_alloc_uc(kCompressStreamBlock + kMaxMediumLine, &overflow_buf)) {
+	    goto calc_grm_ret_NOMEM;
+	  }
+	  char* outname_end2 = strcpya(outname_end, ".grm");
+	  if (parallel_tot != 1) {
+	    *outname_end2++ = '.';
+	    outname_end2 = uint32toa(parallel_idx + 1, outname_end2);
+	  }
+	  if (grm_flags & kfGrmTableGz) {
+	    // since the flag is named --make-grm-gz, we maintain support for
+	    // gzipped output, but it's deprecated (the gzip compression is no
+	    // longer parallelized).
+	    ZWRAP_useZSTDcompression(0);
+	    outname_end2 = strcpya(outname_end2, ".gz");
+	  } else if (grm_flags & kfGrmTableZs) {
+	    outname_end2 = strcpya(outname_end2, ".zst");
+	  }
+	  *outname_end2 = '\0';
+	  if (cswrite_init(outname, 0, !(grm_flags & kfGrmTableNoGz), overflow_buf, &css)) {
+	    goto calc_grm_ret_OPEN_FAIL;
+	  }
+	  cswritep = (char*)overflow_buf;
+	  fputs("--make-grm-gz: Writing...", stdout);
+	  fflush(stdout);
+	  for (uintptr_t row_idx = row_start_idx; row_idx < row_end_idx; ++row_idx) {
+	    uint32_t variant_ct_base = variant_ct;
+	    if (missing_cts) {
+	      variant_ct_base -= missing_cts[row_idx];
+	    }
+	    const double* grm_iter = &(grm[(row_idx - row_start_idx) * row_end_idx]);
+	    for (uint32_t col_idx = 0; col_idx <= row_idx; ++col_idx) {
+	      cswritep = uint32toa_x(row_idx + 1, '\t', cswritep);
+	      cswritep = uint32toa_x(col_idx + 1, '\t', cswritep);
+	      if (missing_cts) {
+		uint32_t cur_obs_ct = variant_ct_base;
+		if (col_idx != row_idx) {
+		  cur_obs_ct = cur_obs_ct - missing_cts[col_idx] + (*missing_dbl_exclude_iter++);
+		}
+		cswritep = uint32toa(cur_obs_ct, cswritep);
+	      } else {
+		cswritep = uint32toa(variant_ct_base, cswritep);
+	      }
+	      *cswritep++ = '\t';
+	      cswritep = dtoa_g(*grm_iter++, cswritep);
+	      append_binary_eoln(&cswritep);
+	      if (cswrite(&css, &cswritep)) {
+		goto calc_grm_ret_WRITE_FAIL;
+	      }
+	    }
+	  }
+	  if (cswrite_close_null(&css, cswritep)) {
+	    goto calc_grm_ret_WRITE_FAIL;
+	  }
+	  putc_unlocked('\r', stdout);
+	  log_write_iter = strcpya(g_logbuf, "--make-grm-gz: GRM ");
+	  if (parallel_tot != 1) {
+	    log_write_iter = strcpya(log_write_iter, "component ");
+	  }
+	  log_write_iter = strcpya(log_write_iter, "written to ");
+	  log_write_iter = strcpya(log_write_iter, outname);
+	}
+      }
+      if (!parallel_idx) {
+	strcpy(&(outname_end[4]), ".id");
+	reterr = write_sample_ids(orig_sample_include, sample_ids, sids, outname, sample_ct, max_sample_id_blen, max_sid_blen);
+	if (reterr) {
+	  goto calc_grm_ret_1;
+	}
+	log_write_iter = strcpya(log_write_iter, " , and IDs to ");
+	log_write_iter = strcpya(log_write_iter, outname);
+      }
+      strcpy(log_write_iter, " .\n");
+      wordwrapb(0);
+      logprintb();
+    }
+    
+    if (grm_ptr) {
+      *grm_ptr = grm;
+      // allocation right on top of grm[]
+      bigstack_mark = (unsigned char*)sample_include_cumulative_popcounts;
+    }
+  }
+  while (0) {
+  calc_grm_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  calc_grm_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  calc_grm_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  calc_grm_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  calc_grm_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ calc_grm_ret_1:
+  cswrite_close_cond(&css, cswritep);
+  ZWRAP_useZSTDcompression(1);
+  fclose_cond(outfile);
+  threads3z_cleanup(&ts, &g_cur_batch_size);
+  BLAS_SET_NUM_THREADS(1);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  return reterr;
+}
+
+// should be able to remove NOLAPACK later since we already have a non-LAPACK
+// SVD implementation
+#ifndef NOLAPACK
+// this seems to be better than 256 (due to avoidance of cache critical
+// stride?)
+// (still want this to be a multiple of 8, for cleaner multithreading)
+CONSTU31(kPcaVariantBlockSize, 240);
+
+// multithread globals
+static uintptr_t* g_genovecs[2] = {nullptr, nullptr};
+static uint32_t* g_dosage_cts[2] = {nullptr, nullptr};
+static uintptr_t* g_dosage_presents[2] = {nullptr, nullptr};
+static dosage_t* g_dosage_val_bufs[2] = {nullptr, nullptr};
+static double* g_cur_maj_freqs[2] = {nullptr, nullptr};
+static double** g_yy_bufs = nullptr;
+static double** g_y_transpose_bufs = nullptr;
+static double** g_g2_bb_part_bufs = nullptr;
+static double* g_g1 = nullptr;
+static double* g_qq = nullptr;
+
+static uint32_t g_pc_ct = 0;
+static pglerr_t g_error_ret = kPglRetSuccess;
+
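+// Per variant batch Y (cur_thread_batch_size x pca_sample_ct, standardized
+// genotypes): writes cur_qq = Y * G1 into the qq matrix and accumulates
+// g2 += Y^T * (Y * G1); summed over all batches this yields (X^T X) * G1,
+// in effect one multiplication of the blocked power iteration behind the
+// approximate PCA.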
+THREAD_FUNC_DECL calc_pca_xtxa_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uint32_t pca_sample_ct = g_pca_sample_ct;
+  const uintptr_t pca_sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(pca_sample_ct);
+  const uintptr_t pca_sample_ctaw = BITCT_TO_ALIGNED_WORDCT(pca_sample_ct);
+  const uint32_t pc_ct_x2 = g_pc_ct * 2;
+  const uintptr_t qq_col_ct = (g_pc_ct + 1) * pc_ct_x2;
+  const uint32_t vidx_offset = tidx * kPcaVariantBlockSize;
+  const double* g1 = g_g1;
+  double* qq_iter = g_qq;
+  double* yy_buf = g_yy_bufs[tidx];
+  double* y_transpose_buf = g_y_transpose_bufs[tidx];
+  double* g2_part_buf = g_g2_bb_part_bufs[tidx];
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_batch = g_is_last_thread_block;
+    const uint32_t cur_batch_size = g_cur_batch_size;
+    if (vidx_offset < cur_batch_size) {
+      uint32_t cur_thread_batch_size = cur_batch_size - vidx_offset;
+      if (cur_thread_batch_size > kPcaVariantBlockSize) {
+	cur_thread_batch_size = kPcaVariantBlockSize;
+      }
+      const uintptr_t* genovec_iter = &(g_genovecs[parity][vidx_offset * pca_sample_ctaw2]);
+      const uint32_t* cur_dosage_cts = &(g_dosage_cts[parity][vidx_offset]);
+      const uintptr_t* dosage_present_iter = &(g_dosage_presents[parity][vidx_offset * pca_sample_ctaw]);
+      const dosage_t* dosage_vals_iter = &(g_dosage_val_bufs[parity][vidx_offset * pca_sample_ct]);
+      const double* cur_maj_freqs_iter = &(g_cur_maj_freqs[parity][vidx_offset]);
+      double* yy_iter = yy_buf;
+      for (uint32_t uii = 0; uii < cur_thread_batch_size; ++uii) {
+	pglerr_t reterr = expand_centered_varmaj(genovec_iter, dosage_present_iter, dosage_vals_iter, 1, pca_sample_ct, cur_dosage_cts[uii], cur_maj_freqs_iter[uii], yy_iter);
+	if (reterr) {
+	  g_error_ret = reterr;
+	  break;
+	}
+	yy_iter = &(yy_iter[pca_sample_ct]);
+	genovec_iter = &(genovec_iter[pca_sample_ctaw2]);
+	dosage_present_iter = &(dosage_present_iter[pca_sample_ctaw]);
+	dosage_vals_iter = &(dosage_vals_iter[pca_sample_ct]);
+      }
+      double* cur_qq = &(qq_iter[vidx_offset * qq_col_ct]);
+      row_major_matrix_multiply_strided(yy_buf, g1, cur_thread_batch_size, pca_sample_ct, pc_ct_x2, pc_ct_x2, pca_sample_ct, qq_col_ct, cur_qq);
+      transpose_copy(yy_buf, cur_thread_batch_size, pca_sample_ct, y_transpose_buf);
+      row_major_matrix_multiply_strided_incr(y_transpose_buf, cur_qq, pca_sample_ct, cur_thread_batch_size, pc_ct_x2, qq_col_ct, cur_thread_batch_size, pc_ct_x2, g2_part_buf);
+      qq_iter = &(qq_iter[cur_batch_size * qq_col_ct]);
+    }
+    if (is_last_batch) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+THREAD_FUNC_DECL calc_pca_xa_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uint32_t pca_sample_ct = g_pca_sample_ct;
+  const uintptr_t pca_sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(pca_sample_ct);
+  const uintptr_t pca_sample_ctaw = BITCT_TO_ALIGNED_WORDCT(pca_sample_ct);
+  const uint32_t pc_ct_x2 = g_pc_ct * 2;
+  const uintptr_t qq_col_ct = (g_pc_ct + 1) * pc_ct_x2;
+  const uint32_t vidx_offset = tidx * kPcaVariantBlockSize;
+  const double* g1 = g_g1;
+  double* qq_iter = g_qq;
+  double* yy_buf = g_yy_bufs[tidx];
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_batch = g_is_last_thread_block;
+    const uint32_t cur_batch_size = g_cur_batch_size;
+    if (vidx_offset < cur_batch_size) {
+      uint32_t cur_thread_batch_size = cur_batch_size - vidx_offset;
+      if (cur_thread_batch_size > kPcaVariantBlockSize) {
+	cur_thread_batch_size = kPcaVariantBlockSize;
+      }
+      const uintptr_t* genovec_iter = &(g_genovecs[parity][vidx_offset * pca_sample_ctaw2]);
+      const uint32_t* cur_dosage_cts = &(g_dosage_cts[parity][vidx_offset]);
+      const uintptr_t* dosage_present_iter = &(g_dosage_presents[parity][vidx_offset * pca_sample_ctaw]);
+      const dosage_t* dosage_vals_iter = &(g_dosage_val_bufs[parity][vidx_offset * pca_sample_ct]);
+      const double* cur_maj_freqs_iter = &(g_cur_maj_freqs[parity][vidx_offset]);
+      double* yy_iter = yy_buf;
+      for (uint32_t uii = 0; uii < cur_thread_batch_size; ++uii) {
+	pglerr_t reterr = expand_centered_varmaj(genovec_iter, dosage_present_iter, dosage_vals_iter, 1, pca_sample_ct, cur_dosage_cts[uii], cur_maj_freqs_iter[uii], yy_iter);
+	if (reterr) {
+	  g_error_ret = reterr;
+	  break;
+	}
+	yy_iter = &(yy_iter[pca_sample_ct]);
+	genovec_iter = &(genovec_iter[pca_sample_ctaw2]);
+	dosage_present_iter = &(dosage_present_iter[pca_sample_ctaw]);
+	dosage_vals_iter = &(dosage_vals_iter[pca_sample_ct]);
+      }
+      double* cur_qq = &(qq_iter[vidx_offset * qq_col_ct]);
+      row_major_matrix_multiply_strided(yy_buf, g1, cur_thread_batch_size, pca_sample_ct, pc_ct_x2, pc_ct_x2, pca_sample_ct, qq_col_ct, cur_qq);
+      qq_iter = &(qq_iter[cur_batch_size * qq_col_ct]);
+    }
+    if (is_last_batch) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+THREAD_FUNC_DECL calc_pca_xtb_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uint32_t pca_sample_ct = g_pca_sample_ct;
+  const uintptr_t pca_sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(pca_sample_ct);
+  const uintptr_t pca_sample_ctaw = BITCT_TO_ALIGNED_WORDCT(pca_sample_ct);
+  const uint32_t pc_ct_x2 = g_pc_ct * 2;
+  const uintptr_t qq_col_ct = (g_pc_ct + 1) * pc_ct_x2;
+  const uint32_t vidx_offset = tidx * kPcaVariantBlockSize;
+  const double* qq_iter = &(g_qq[vidx_offset * qq_col_ct]);
+  double* yy_buf = g_yy_bufs[tidx];
+  double* y_transpose_buf = g_y_transpose_bufs[tidx];
+  double* bb_part_buf = g_g2_bb_part_bufs[tidx];
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_batch = g_is_last_thread_block;
+    const uint32_t cur_batch_size = g_cur_batch_size;
+    if (vidx_offset < cur_batch_size) {
+      uint32_t cur_thread_batch_size = cur_batch_size - vidx_offset;
+      if (cur_thread_batch_size > kPcaVariantBlockSize) {
+	cur_thread_batch_size = kPcaVariantBlockSize;
+      }
+      const uintptr_t* genovec_iter = &(g_genovecs[parity][vidx_offset * pca_sample_ctaw2]);
+      const uint32_t* cur_dosage_cts = &(g_dosage_cts[parity][vidx_offset]);
+      const uintptr_t* dosage_present_iter = &(g_dosage_presents[parity][vidx_offset * pca_sample_ctaw]);
+      const dosage_t* dosage_vals_iter = &(g_dosage_val_bufs[parity][vidx_offset * pca_sample_ct]);
+      const double* cur_maj_freqs_iter = &(g_cur_maj_freqs[parity][vidx_offset]);
+      double* yy_iter = yy_buf;
+      for (uint32_t uii = 0; uii < cur_thread_batch_size; ++uii) {
+	pglerr_t reterr = expand_centered_varmaj(genovec_iter, dosage_present_iter, dosage_vals_iter, 1, pca_sample_ct, cur_dosage_cts[uii], cur_maj_freqs_iter[uii], yy_iter);
+	if (reterr) {
+	  g_error_ret = reterr;
+	  break;
+	}
+	yy_iter = &(yy_iter[pca_sample_ct]);
+	genovec_iter = &(genovec_iter[pca_sample_ctaw2]);
+	dosage_present_iter = &(dosage_present_iter[pca_sample_ctaw]);
+	dosage_vals_iter = &(dosage_vals_iter[pca_sample_ct]);
+      }
+      transpose_copy(yy_buf, cur_thread_batch_size, pca_sample_ct, y_transpose_buf);
+      row_major_matrix_multiply_incr(y_transpose_buf, qq_iter, pca_sample_ct, qq_col_ct, cur_thread_batch_size, bb_part_buf);
+      qq_iter = &(qq_iter[cur_batch_size * qq_col_ct]);
+    }
+    if (is_last_batch) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
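+// Computes one block of variant weights per pass: cur_var_wts_part := Y * S,
+// where S = g_g1 holds the sample-major eigenvector matrix; the D^{-1/2}
+// eigenvalue rescaling is deferred to the writer.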
+THREAD_FUNC_DECL calc_pca_var_wts_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uint32_t pca_sample_ct = g_pca_sample_ct;
+  const uintptr_t pca_sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(pca_sample_ct);
+  const uintptr_t pca_sample_ctaw = BITCT_TO_ALIGNED_WORDCT(pca_sample_ct);
+  const uint32_t pc_ct = g_pc_ct;
+  const uint32_t vidx_offset = tidx * kPcaVariantBlockSize;
+
+  // either the first batch size is calc_thread_ct * kPcaVariantBlockSize, or
+  // there is only one batch
+  const uintptr_t var_wts_part_size = ((uintptr_t)pc_ct) * g_cur_batch_size;
+  
+  const double* sample_wts = g_g1; // sample-major, pc_ct columns
+  double* yy_buf = g_yy_bufs[tidx];
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_batch = g_is_last_thread_block;
+    const uint32_t cur_batch_size = g_cur_batch_size;
+    if (vidx_offset < cur_batch_size) {
+      uint32_t cur_thread_batch_size = cur_batch_size - vidx_offset;
+      if (cur_thread_batch_size > kPcaVariantBlockSize) {
+	cur_thread_batch_size = kPcaVariantBlockSize;
+      }
+      const uintptr_t* genovec_iter = &(g_genovecs[parity][vidx_offset * pca_sample_ctaw2]);
+      const uint32_t* cur_dosage_cts = &(g_dosage_cts[parity][vidx_offset]);
+      const uintptr_t* dosage_present_iter = &(g_dosage_presents[parity][vidx_offset * pca_sample_ctaw]);
+      const dosage_t* dosage_vals_iter = &(g_dosage_val_bufs[parity][vidx_offset * pca_sample_ct]);
+      const double* cur_maj_freqs_iter = &(g_cur_maj_freqs[parity][vidx_offset]);
+      double* yy_iter = yy_buf;
+      for (uint32_t uii = 0; uii < cur_thread_batch_size; ++uii) {
+	pglerr_t reterr = expand_centered_varmaj(genovec_iter, dosage_present_iter, dosage_vals_iter, 1, pca_sample_ct, cur_dosage_cts[uii], cur_maj_freqs_iter[uii], yy_iter);
+	if (reterr) {
+	  g_error_ret = reterr;
+	  break;
+	}
+	yy_iter = &(yy_iter[pca_sample_ct]);
+	genovec_iter = &(genovec_iter[pca_sample_ctaw2]);
+	dosage_present_iter = &(dosage_present_iter[pca_sample_ctaw]);
+	dosage_vals_iter = &(dosage_vals_iter[pca_sample_ct]);
+      }
+      // Variant weight matrix = X^T * S * D^{-1/2}, where X^T is the
+      // variance-standardized genotype matrix, S is the sample weight matrix,
+      // and D is a diagonal eigenvalue matrix.
+      // We postpone the D^{-1/2} part for now, but it's straightforward to
+      // switch to using precomputed (S * D^{-1/2}).
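+      // Dimension check, for reference: X^T is variant_ct x pca_sample_ct,
+      // S is pca_sample_ct x pc_ct, D is pc_ct x pc_ct diagonal, so each
+      // cur_var_wts_part slice below is cur_thread_batch_size x pc_ct.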
+      double* cur_var_wts_part = &(g_qq[parity * var_wts_part_size + vidx_offset * ((uintptr_t)pc_ct)]);
+      row_major_matrix_multiply(yy_buf, sample_wts, cur_thread_batch_size, pc_ct, pca_sample_ct, cur_var_wts_part);
+    }
+    if (is_last_batch) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+pglerr_t calc_pca(const uintptr_t* sample_include, const char* sample_ids, const char* sids, uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const alt_allele_ct_t* maj_alleles, const double* allele_freqs, uint32_t raw_sample_ct, uintptr_t pca_sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  threads_state_t ts;
+  init_threads3z(&ts);
+  pglerr_t reterr = kPglRetSuccess;
+  cswrite_init_null(&css);
+  {
+    if ((pca_flags & kfPcaSid) && (!sids)) {
+      // put this in plink2_common?
+      const uint32_t dummy_sids_word_ct = DIV_UP(raw_sample_ct, (kBytesPerWord / 2));
+      uintptr_t* dummy_sids;
+      if (bigstack_alloc_ul(dummy_sids_word_ct, &dummy_sids)) {
+	goto calc_pca_ret_NOMEM;
+      }
+      // repeated "0\0", little-endian
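+      // e.g. on a 64-bit build, kMask0001 == 0x0001000100010001 and
+      // '0' == 0x30, so text_zero_word == 0x0030003000300030, i.e. four
+      // consecutive "0\0" ID slots per word with max_sid_blen == 2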
+      const uintptr_t text_zero_word = kMask0001 * 48;
+      for (uint32_t uii = 0; uii < dummy_sids_word_ct; ++uii) {
+	dummy_sids[uii] = text_zero_word;
+      }
+      sids = (char*)dummy_sids;
+      max_sid_blen = 2;
+    }
+    const uint32_t is_approx = (pca_flags / kfPcaApprox) & 1;
+    reterr = conditional_allocate_non_autosomal_variants(cip, is_approx? "PCA approximation" : "PCA", raw_variant_ct, &variant_include, &variant_ct);
+    if (reterr) {
+      goto calc_pca_ret_1;
+    }
+#ifdef __APPLE__
+    // min OS X version is 10.7, so we can take Grand Central Dispatch dgemm
+    // for granted
+    // (tried this with Linux MKL + OpenMP as well, but results were inferior)
+    uint32_t calc_thread_ct = 1;
+#else
+    // I/O thread generally has <1/8 of workload
+    uint32_t calc_thread_ct = (max_thread_ct > 8)? (max_thread_ct - 1) : max_thread_ct;
+    if ((calc_thread_ct - 1) * kPcaVariantBlockSize >= variant_ct) {
+      calc_thread_ct = 1 + (variant_ct - 1) / kPcaVariantBlockSize;
+    }
+#endif
+    ts.calc_thread_ct = calc_thread_ct;
+    if (pc_ct > pca_sample_ct) {
+      if (pca_sample_ct <= variant_ct) {
+	pc_ct = pca_sample_ct;
+	sprintf(g_logbuf, "Warning: calculating %u PCs, since there are only %u samples.\n", pc_ct, pc_ct);
+      } else {
+	pc_ct = variant_ct;
+	sprintf(g_logbuf, "Warning: calculating %u PCs, since there are only %u autosomal variants.\n", pc_ct, pc_ct);
+      }
+      if (pc_ct < 2) {
+	logerrprint("Error: Too few samples or autosomal variants for PCA.\n");
+	goto calc_pca_ret_INCONSISTENT_INPUT;
+      }
+      logerrprintb();
+    }
+    const uint32_t var_wts = (pca_flags / kfPcaVarWts) & 1;
+    const uint32_t chr_col = pca_flags & kfPcaVcolChrom;
+    const uint32_t ref_col = pca_flags & kfPcaVcolRef;
+    const uint32_t alt1_col = pca_flags & kfPcaVcolAlt1;
+    const uint32_t alt_col = pca_flags & kfPcaVcolAlt;
+    const uint32_t maj_col = pca_flags & kfPcaVcolMaj;
+    const uint32_t nonmaj_col = pca_flags & kfPcaVcolNonmaj;
+    double* cur_var_wts = nullptr;
+    double* eigval_inv_sqrts = nullptr;
+    char* chr_buf = nullptr;
+    uintptr_t writebuf_alloc = 3 * kMaxMediumLine;
+    if (var_wts) {
+      if (bigstack_alloc_d(pc_ct, &cur_var_wts) ||
+	  bigstack_alloc_d(pc_ct, &eigval_inv_sqrts)) {
+	goto calc_pca_ret_NOMEM;
+      }
+      uint32_t max_chr_blen = 0;
+      if (chr_col) {
+	max_chr_blen = get_max_chr_slen(cip) + 1;
+	if (bigstack_alloc_c(max_chr_blen, &chr_buf)) {
+	  goto calc_pca_ret_NOMEM;
+	}
+      }
+      const uintptr_t writebuf_alloc2 = kCompressStreamBlock + max_chr_blen + kMaxIdSlen + 2 * max_allele_slen + 32 + 16 * pc_ct;
+      if (writebuf_alloc2 > writebuf_alloc) {
+	writebuf_alloc = writebuf_alloc2;
+      }
+    }
+    // temporary
+    // todo: additional --pca-clusters allocations
+    const uintptr_t* pca_sample_include = sample_include;
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    const uint32_t pca_sample_ctaw2 = QUATERCT_TO_ALIGNED_WORDCT(pca_sample_ct);
+    const uint32_t pca_sample_ctaw = BITCT_TO_ALIGNED_WORDCT(pca_sample_ct);
+    uint32_t* pca_sample_include_cumulative_popcounts;
+    double* eigvals;
+    if (bigstack_alloc_ui(raw_sample_ctl, &pca_sample_include_cumulative_popcounts) ||
+	bigstack_alloc_d(pc_ct, &eigvals) ||
+	bigstack_alloc_thread(calc_thread_ct, &ts.threads) ||
+	bigstack_alloc_dp(calc_thread_ct, &g_yy_bufs)) {
+      goto calc_pca_ret_NOMEM;
+    }
+    fill_cumulative_popcounts(pca_sample_include, raw_sample_ctl, pca_sample_include_cumulative_popcounts);
+    g_pca_sample_ct = pca_sample_ct;
+    g_pc_ct = pc_ct;
+    g_error_ret = kPglRetSuccess;
+    uint32_t cur_allele_ct = 2;
+    double* qq = nullptr;
+    double* eigvecs_smaj;
+    char* writebuf;
+    if (is_approx) {
+      if (pca_sample_ct <= 5000) {
+	logerrprint("Warning: --pca approx is only recommended for analysis of >5000 samples.\n");
+      }
+      if (variant_ct > 5000000) {
+	logerrprint("Warning: Use of --pca approx on >5m variants is not advisable.  Apply a MAF\nfilter if you haven't done so yet, and consider LD-pruning your variant set as\nwell.\n");
+      }
+      // This is ported from EIGENSOFT 6 src/ksrc/kjg_fpca.c , which is in turn
+      // primarily based on Halko N, Martinsson P, Shkolnisky Y, Tygert M
+      // (2011) An Algorithm for the Principal Component Analysis of Large Data
+      // Sets.
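+      // Sketch of how the code below maps onto that scheme (details hedged;
+      // see the paper for the analysis): with X the variant_ct x
+      // pca_sample_ct standardized genotype matrix and G_0 Gaussian, each
+      // iteration stores H_i := X * G_i into the next pc_ct_x2 columns of qq
+      // and forms G_{i+1} := (X^T * H_i) / variant_ct.  After pc_ct + 1
+      // iterations, the SVD of qq yields a range basis U, and B := X^T * U
+      // is small enough for a direct SVD that recovers the top PCs.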
+      const uintptr_t pc_ct_x2 = pc_ct * 2;
+      const uintptr_t qq_col_ct = (pc_ct + 1) * pc_ct_x2;
+#ifndef LAPACK_ILP64
+      if ((variant_ct * ((uint64_t)qq_col_ct)) > 0x7effffff) {
+	logerrprint("Error: --pca approx problem instance too large for this " PROG_NAME_STR " build.  If this\nis really the computation you want, use a " PROG_NAME_STR " build with large-matrix\nsupport.\n");
+	goto calc_pca_ret_INCONSISTENT_INPUT;
+      }
+#endif
+      const double variant_ct_recip = 1.0 / (double)((int32_t)variant_ct);
+
+      const uintptr_t g_size = pca_sample_ct * pc_ct_x2;
+      __CLPK_integer svd_rect_lwork;
+#ifdef LAPACK_ILP64
+      get_svd_rect_lwork(MAXV(pca_sample_ct, variant_ct), qq_col_ct, &svd_rect_lwork);
+#else
+      if (get_svd_rect_lwork(MAXV(pca_sample_ct, variant_ct), qq_col_ct, &svd_rect_lwork)) {
+	logerrprint("Error: --pca approx problem instance too large for this " PROG_NAME_STR " build.  If this\nis really the computation you want, use a " PROG_NAME_STR " build with large-matrix\nsupport.\n");
+	goto calc_pca_ret_INCONSISTENT_INPUT;
+      }
+#endif
+      uintptr_t svd_rect_wkspace_size = (svd_rect_lwork + qq_col_ct * qq_col_ct) * sizeof(double);
+      if (svd_rect_wkspace_size < writebuf_alloc) {
+	// used as writebuf later
+	svd_rect_wkspace_size = writebuf_alloc;
+      }
+
+      unsigned char* svd_rect_wkspace;
+      double* ss;
+      double* g1;
+      if (bigstack_alloc_d(qq_col_ct, &ss) ||
+	  bigstack_alloc_d(variant_ct * qq_col_ct, &qq) ||
+	  bigstack_alloc_dp(calc_thread_ct, &g_y_transpose_bufs) ||
+	  bigstack_alloc_dp(calc_thread_ct, &g_g2_bb_part_bufs) ||
+	  bigstack_alloc_uc(svd_rect_wkspace_size, &svd_rect_wkspace) ||
+	  bigstack_alloc_d(g_size, &g1)) {
+	goto calc_pca_ret_NOMEM;
+      }
+      const uintptr_t genovecs_alloc = round_up_pow2(pca_sample_ctaw2 * kPcaVariantBlockSize * sizeof(intptr_t), kCacheline);
+      const uintptr_t dosage_cts_alloc = round_up_pow2(kPcaVariantBlockSize * sizeof(int32_t), kCacheline);
+      const uintptr_t dosage_presents_alloc = round_up_pow2(pca_sample_ctaw * kPcaVariantBlockSize * sizeof(intptr_t), kCacheline);
+      const uintptr_t dosage_vals_alloc = round_up_pow2(pca_sample_ct * kPcaVariantBlockSize * sizeof(dosage_t), kCacheline);
+      const uintptr_t cur_maj_freqs_alloc = round_up_pow2(kPcaVariantBlockSize * sizeof(double), kCacheline);
+      const uintptr_t yy_alloc = round_up_pow2(kPcaVariantBlockSize * pca_sample_ct * sizeof(double), kCacheline);
+      const uintptr_t b_size = pca_sample_ct * qq_col_ct;
+      const uintptr_t g2_bb_part_alloc = round_up_pow2(b_size * sizeof(double), kCacheline);
+      const uintptr_t per_thread_alloc = 2 * (genovecs_alloc + dosage_cts_alloc + dosage_presents_alloc + dosage_vals_alloc + cur_maj_freqs_alloc + yy_alloc) + g2_bb_part_alloc;
+      
+      const uintptr_t bigstack_avail = bigstack_left();
+      if (per_thread_alloc * calc_thread_ct > bigstack_avail) {
+	if (bigstack_avail < per_thread_alloc) {
+	  goto calc_pca_ret_NOMEM;
+	}
+	calc_thread_ct = bigstack_avail / per_thread_alloc;
+      }
+      for (uint32_t parity = 0; parity < 2; ++parity) {
+	g_genovecs[parity] = (uintptr_t*)bigstack_alloc_raw(genovecs_alloc * calc_thread_ct);
+	g_dosage_cts[parity] = (uint32_t*)bigstack_alloc_raw(dosage_cts_alloc * calc_thread_ct);
+	g_dosage_presents[parity] = (uintptr_t*)bigstack_alloc_raw(dosage_presents_alloc * calc_thread_ct);
+	g_dosage_val_bufs[parity] = (dosage_t*)bigstack_alloc_raw(dosage_vals_alloc * calc_thread_ct);
+	g_cur_maj_freqs[parity] = (double*)bigstack_alloc_raw(cur_maj_freqs_alloc * calc_thread_ct);
+      }
+      for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	g_yy_bufs[tidx] = (double*)bigstack_alloc_raw(yy_alloc);
+	g_y_transpose_bufs[tidx] = (double*)bigstack_alloc_raw(yy_alloc);
+	g_g2_bb_part_bufs[tidx] = (double*)bigstack_alloc_raw(g2_bb_part_alloc);
+      }
+      fill_gaussian_darray(g_size / 2, max_thread_ct, g1);
+      g_g1 = g1;
+#ifdef __APPLE__
+      fputs("Projecting random vectors... ", stdout);
+#else
+      printf("Projecting random vectors (%u compute thread%s)... ", calc_thread_ct, (calc_thread_ct == 1)? "" : "s");
+#endif
+      fflush(stdout);
+      pgr_clear_ld_cache(simple_pgrp);
+      for (uint32_t iter_idx = 0; iter_idx <= pc_ct; ++iter_idx) {
+	// kjg_fpca_XTXA(), kjg_fpca_XA()
+	for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	  fill_double_zero(g_size, g_g2_bb_part_bufs[tidx]);
+	}
	double* qq_iter = &(qq[iter_idx * pc_ct_x2]); // this iteration's column block, starting in row 0
+	g_qq = qq_iter;
+
+	// Main workflow:
+	// 1. Set n=0, load batch 0
+	//
+	// 2. Spawn threads processing batch n
+	// 3. Increment n by 1
+	// 4. Load batch n unless eof
+	// 5. Join threads
+	// 6. Goto step 2 unless eof
+	//
+	// 7. Assemble next g1 by summing g2_parts
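+	// (The parity flag double-buffers g_genovecs[] and friends, so the
+	// load in step 4 overlaps the worker compute spawned in step 2.)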
+	uint32_t parity = 0;
+	uint32_t cur_variant_idx_start = 0;
+	uint32_t variant_uidx = 0;
+	while (1) {
+	  uint32_t cur_batch_size = 0;
+	  if (!ts.is_last_block) {
+	    cur_batch_size = calc_thread_ct * kPcaVariantBlockSize;
+	    uint32_t cur_variant_idx_end = cur_variant_idx_start + cur_batch_size;
+	    if (cur_variant_idx_end > variant_ct) {
+	      cur_batch_size = variant_ct - cur_variant_idx_start;
+	      cur_variant_idx_end = variant_ct;
+	    }
+	    uintptr_t* genovec_iter = g_genovecs[parity];
+	    uint32_t* dosage_ct_iter = g_dosage_cts[parity];
+	    uintptr_t* dosage_present_iter = g_dosage_presents[parity];
+	    dosage_t* dosage_vals_iter = g_dosage_val_bufs[parity];
+	    double* maj_freqs_write_iter = g_cur_maj_freqs[parity];
+	    for (uint32_t variant_idx = cur_variant_idx_start; variant_idx < cur_variant_idx_end; ++variant_uidx, ++variant_idx) {
+	      next_set_unsafe_ck(variant_include, &variant_uidx);
+	      uint32_t dosage_ct;
+	      uint32_t is_explicit_alt1;
+	      reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(pca_sample_include, pca_sample_include_cumulative_popcounts, pca_sample_ct, variant_uidx, simple_pgrp, genovec_iter, dosage_present_iter, dosage_vals_iter, &dosage_ct, &is_explicit_alt1);
+	      if (reterr) {
+		if (reterr == kPglRetMalformedInput) {
+		  logprint("\n");
+		  logerrprint("Error: Malformed .pgen file.\n");
+		}
+		goto calc_pca_ret_1;
+	      }
+	      const uint32_t maj_allele_idx = maj_alleles[variant_uidx];
+	      if (maj_allele_idx) {
+		genovec_invert_unsafe(pca_sample_ct, genovec_iter);
+		if (dosage_ct) {
+		  biallelic_dosage16_invert(dosage_ct, dosage_vals_iter);
+		}
+	      }
+	      zero_trailing_quaters(pca_sample_ct, genovec_iter);
+	      genovec_iter = &(genovec_iter[pca_sample_ctaw2]);
+	      *dosage_ct_iter++ = dosage_ct;
+	      dosage_present_iter = &(dosage_present_iter[pca_sample_ctaw]);
+	      dosage_vals_iter = &(dosage_vals_iter[pca_sample_ct]);
+	      uintptr_t allele_idx_base;
+	      if (!variant_allele_idxs) {
+		allele_idx_base = variant_uidx;
+	      } else {
+		allele_idx_base = variant_allele_idxs[variant_uidx];
+		cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - allele_idx_base;
+		allele_idx_base -= variant_uidx;
+	      }
	      // allele_idx_base has already been shifted by variant_uidx in the
	      // else branch above; don't subtract it a second time (the
	      // parallel loops below index allele_freqs[allele_idx_base])
	      *maj_freqs_write_iter++ = get_allele_freq(&(allele_freqs[allele_idx_base]), maj_allele_idx, cur_allele_ct);
+	    }
+	  }
+	  if (cur_variant_idx_start) {
+	    join_threads3z(&ts);
+	    reterr = g_error_ret;
+	    if (reterr) {
+	      logprint("\n");
+	      logerrprint("Error: Zero-MAF variant is not actually monomorphic.  (This is possible when\ne.g. MAF is estimated from founders, but the minor allele was only observed in\nnonfounders.  In any case, you should be using e.g. --maf to filter out all\nvery-low-MAF variants, since the relationship matrix distance formula does not\nhandle them well.)\n");
+	      goto calc_pca_ret_1;
+	    }
+	    if (ts.is_last_block) {
+	      break;
+	    }
+	  }
+	  if (!cur_variant_idx_start) {
+	    if (iter_idx < pc_ct) {
+	      ts.thread_func_ptr = calc_pca_xtxa_thread;
+	    } else {
+	      ts.thread_func_ptr = calc_pca_xa_thread;
+	    }
+	  }
+	  ts.is_last_block = (cur_variant_idx_start + cur_batch_size == variant_ct);
+	  g_cur_batch_size = cur_batch_size;
+	  if (spawn_threads3z(cur_variant_idx_start, &ts)) {
+	    goto calc_pca_ret_THREAD_CREATE_FAIL;
+	  }
+	  cur_variant_idx_start += cur_batch_size;
+	  parity = 1 - parity;
+	}
+	if (iter_idx < pc_ct) {
+	  memcpy(g1, g_g2_bb_part_bufs[0], g_size * sizeof(double));
+	  for (uint32_t tidx = 1; tidx < calc_thread_ct; ++tidx) {
+	    const double* cur_g2_part = g_g2_bb_part_bufs[tidx];
+	    for (uintptr_t ulii = 0; ulii < g_size; ++ulii) {
+	      g1[ulii] += cur_g2_part[ulii];
+	    }
+	  }
+	  for (uintptr_t ulii = 0; ulii < g_size; ++ulii) {
+	    g1[ulii] *= variant_ct_recip;
+	  }
+	}
+#ifdef __APPLE__
+        printf("\rProjecting random vectors... %u/%u", iter_idx + 1, pc_ct + 1);
+#else
+        printf("\rProjecting random vectors (%u compute thread%s)... %u/%u", calc_thread_ct, (calc_thread_ct == 1)? "" : "s", iter_idx + 1, pc_ct + 1);
+#endif
+	fflush(stdout);
+      }
+      fputs(".\n", stdout);
+      logprint("Computing SVD of Krylov matrix... ");
+      fflush(stdout);
+      BLAS_SET_NUM_THREADS(max_thread_ct);
+      if (svd_rect(variant_ct, qq_col_ct, svd_rect_lwork, qq, ss, svd_rect_wkspace)) {
+	logprint("\n");
+	logerrprint("Error: Failed to compute SVD of Krylov matrix.\n");
+	goto calc_pca_ret_INCONSISTENT_INPUT;
+      }
+      BLAS_SET_NUM_THREADS(1);
+      logprint("done.\nRecovering top PCs from range approximation... ");
+      fflush(stdout);
+
+      // kjg_fpca_XTB()
+      for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	fill_double_zero(b_size, g_g2_bb_part_bufs[tidx]);
+      }
+      uint32_t parity = 0;
+      uint32_t cur_variant_idx_start = 0;
+      uint32_t variant_uidx = 0;
+      reinit_threads3z(&ts);
+      g_qq = qq;
+      while (1) {
+	uint32_t cur_batch_size = 0;
+	if (!ts.is_last_block) {
	  // probable todo: move this boilerplate into its own function
+	  cur_batch_size = calc_thread_ct * kPcaVariantBlockSize;
+	  uint32_t cur_variant_idx_end = cur_variant_idx_start + cur_batch_size;
+	  if (cur_variant_idx_end > variant_ct) {
+	    cur_batch_size = variant_ct - cur_variant_idx_start;
+	    cur_variant_idx_end = variant_ct;
+	  }
+	  uintptr_t* genovec_iter = g_genovecs[parity];
+	  uint32_t* dosage_ct_iter = g_dosage_cts[parity];
+	  uintptr_t* dosage_present_iter = g_dosage_presents[parity];
+	  dosage_t* dosage_vals_iter = g_dosage_val_bufs[parity];
+	  double* maj_freqs_write_iter = g_cur_maj_freqs[parity];
+	  for (uint32_t variant_idx = cur_variant_idx_start; variant_idx < cur_variant_idx_end; ++variant_uidx, ++variant_idx) {
+	    next_set_unsafe_ck(variant_include, &variant_uidx);
+	    uint32_t dosage_ct;
+	    uint32_t is_explicit_alt1;
+	    reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(pca_sample_include, pca_sample_include_cumulative_popcounts, pca_sample_ct, variant_uidx, simple_pgrp, genovec_iter, dosage_present_iter, dosage_vals_iter, &dosage_ct, &is_explicit_alt1);
+	    if (reterr) {
+	      goto calc_pca_ret_READ_FAIL;
+	    }
+	    const uint32_t maj_allele_idx = maj_alleles[variant_uidx];
+	    if (maj_allele_idx) {
+	      genovec_invert_unsafe(pca_sample_ct, genovec_iter);
+	      if (dosage_ct) {
+		biallelic_dosage16_invert(dosage_ct, dosage_vals_iter);
+	      }
+	    }
+	    zero_trailing_quaters(pca_sample_ct, genovec_iter);
+	    genovec_iter = &(genovec_iter[pca_sample_ctaw2]);
+	    *dosage_ct_iter++ = dosage_ct;
+	    dosage_present_iter = &(dosage_present_iter[pca_sample_ctaw]);
+	    dosage_vals_iter = &(dosage_vals_iter[pca_sample_ct]);
+	    uintptr_t allele_idx_base;
+	    if (!variant_allele_idxs) {
+	      allele_idx_base = variant_uidx;
+	    } else {
+	      allele_idx_base = variant_allele_idxs[variant_uidx];
+	      cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - allele_idx_base;
+	      allele_idx_base -= variant_uidx;
+	    }
+	    *maj_freqs_write_iter++ = get_allele_freq(&(allele_freqs[allele_idx_base]), maj_allele_idx, cur_allele_ct);
+	  }
+	}
+	if (cur_variant_idx_start) {
+	  join_threads3z(&ts);
+	  if (g_error_ret) {
+	    // this error *didn't* happen on an earlier pass, so assign blame
+	    // to I/O instead
+	    goto calc_pca_ret_READ_FAIL;
+	  }
+	  if (ts.is_last_block) {
+	    break;
+	  }
+	}
+	ts.is_last_block = (cur_variant_idx_start + cur_batch_size == variant_ct);
+	g_cur_batch_size = cur_batch_size;
+	ts.thread_func_ptr = calc_pca_xtb_thread;
+	if (spawn_threads3z(cur_variant_idx_start, &ts)) {
+	  goto calc_pca_ret_THREAD_CREATE_FAIL;
+	}
+	cur_variant_idx_start += cur_batch_size;
+	parity = 1 - parity;
+      }
+      double* bb = g_g2_bb_part_bufs[0];
+      for (uint32_t tidx = 1; tidx < calc_thread_ct; ++tidx) {
+	const double* cur_bb_part = g_g2_bb_part_bufs[tidx];
+	for (uintptr_t ulii = 0; ulii < b_size; ++ulii) {
+	  bb[ulii] += cur_bb_part[ulii];
+	}
+      }
+      BLAS_SET_NUM_THREADS(max_thread_ct);
+      if (svd_rect(pca_sample_ct, qq_col_ct, svd_rect_lwork, bb, ss, svd_rect_wkspace)) {
+	logerrprint("Error: Failed to compute SVD of final matrix.\n");
+	goto calc_pca_ret_INCONSISTENT_INPUT;
+      }
+      BLAS_SET_NUM_THREADS(1);
+      logprint("done.\n");
+      eigvecs_smaj = g1;
+      for (uint32_t sample_idx = 0; sample_idx < pca_sample_ct; ++sample_idx) {
+	memcpy(&(eigvecs_smaj[sample_idx * ((uintptr_t)pc_ct)]), &(bb[sample_idx * qq_col_ct]), pc_ct * sizeof(double));
+      }
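+      // ss[] holds the singular values of B = X^T * U; under the
+      // 1/variant_ct scaling used in the power iteration above,
+      // s_i^2 / variant_ct estimates the i-th eigenvalue.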
+      for (uint32_t pc_idx = 0; pc_idx < pc_ct; ++pc_idx) {
+	eigvals[pc_idx] = ss[pc_idx] * ss[pc_idx] * variant_ct_recip;
+      }
+      writebuf = (char*)svd_rect_wkspace;
+    } else {
+      __CLPK_integer lwork;
+      __CLPK_integer liwork;
+      uintptr_t wkspace_byte_ct;
+      if (get_extract_eigvecs_lworks(pca_sample_ct, pc_ct, &lwork, &liwork, &wkspace_byte_ct)) {
+	goto calc_pca_ret_NOMEM;
+      }
+      const uintptr_t eigvecs_smaj_alloc = pc_ct * pca_sample_ct * sizeof(double);
+      if (wkspace_byte_ct < eigvecs_smaj_alloc) {
+	wkspace_byte_ct = eigvecs_smaj_alloc;
+      }
+      double* reverse_eigvecs_pcmaj;
+      unsigned char* extract_eigvecs_wkspace;
+      if (bigstack_alloc_d(pc_ct * pca_sample_ct, &reverse_eigvecs_pcmaj) ||
+	  bigstack_alloc_uc(wkspace_byte_ct, &extract_eigvecs_wkspace)) {
+	goto calc_pca_ret_NOMEM;
+      }
+      LOGPRINTF("Extracting eigenvalue%s and eigenvector%s... ", (pc_ct == 1)? "" : "s", (pc_ct == 1)? "" : "s");
+      fflush(stdout);
+      BLAS_SET_NUM_THREADS(max_thread_ct);
+      if (extract_eigvecs(pca_sample_ct, pc_ct, lwork, liwork, grm, eigvals, reverse_eigvecs_pcmaj, extract_eigvecs_wkspace)) {
+	logerrprint("Error: Failed to extract eigenvector(s) from GRM.\n");
+	goto calc_pca_ret_INCONSISTENT_INPUT;
+      }
+      BLAS_SET_NUM_THREADS(1);
+      logprint("done.\n");
+      eigvecs_smaj = (double*)extract_eigvecs_wkspace;
+      bigstack_shrink_top(eigvecs_smaj, eigvecs_smaj_alloc);
+      if (bigstack_alloc_c(writebuf_alloc, &writebuf)) {
+	goto calc_pca_ret_NOMEM;
+      }
+
+      // extract_eigvecs() results are in reverse order, and we also need to
+      // transpose eigenvectors to sample-major
+      const uint32_t pc_ct_m1 = pc_ct - 1;
+      const uint32_t pc_ct_div2 = pc_ct / 2;
+      for (uint32_t pc_idx = 0; pc_idx < pc_ct_div2; ++pc_idx) {
+	double tmp_eigval = eigvals[pc_idx];
+	eigvals[pc_idx] = eigvals[pc_ct_m1 - pc_idx];
+	eigvals[pc_ct_m1 - pc_idx] = tmp_eigval;
+      }
+      double* eigvecs_smaj_iter = eigvecs_smaj;
+      for (uint32_t sample_idx = 0; sample_idx < pca_sample_ct; ++sample_idx) {
+	uintptr_t pc_inv_idx = pc_ct;
+	const double* reverse_eigvecs_col = &(reverse_eigvecs_pcmaj[sample_idx]);
+	do {
+	  --pc_inv_idx;
+	  *eigvecs_smaj_iter++ = reverse_eigvecs_col[pc_inv_idx * pca_sample_ct];
+	} while (pc_inv_idx);
+      }
+    }
+    // (later: --pca-cluster-names, --pca-clusters)
+    char* writebuf_flush = &(writebuf[kMaxMediumLine]);
+
+    if (var_wts) {
+      g_g1 = eigvecs_smaj;
+      for (uint32_t pc_idx = 0; pc_idx < pc_ct; ++pc_idx) {
+	eigval_inv_sqrts[pc_idx] = 1.0 / sqrt(eigvals[pc_idx]);
+      }
+
+      char* outname_end2 = strcpya0(outname_end, ".eigenvec.var");
+      const uint32_t output_zst = (pca_flags / kfPcaVarZs) & 1;
+      if (output_zst) {
+	strcpy(outname_end2, ".zst");
+      }
+      if (cswrite_init(outname, 0, output_zst, (unsigned char*)writebuf, &css)) {
+	goto calc_pca_ret_OPEN_FAIL;
+      }
+      cswritep = writebuf;
+      *cswritep++ = '#';
+      if (chr_col) {
+	cswritep = strcpya(cswritep, "CHROM\t");
+      }
+      if (pca_flags & kfPcaVcolPos) {
+	cswritep = strcpya(cswritep, "POS\t");
+      } else {
+	variant_bps = nullptr;
+      }
+      cswritep = strcpya(cswritep, "ID");
+      if (ref_col) {
+	cswritep = strcpya(cswritep, "\tREF");
+      }
+      if (alt1_col) {
+	cswritep = strcpya(cswritep, "\tALT1");
+      }
+      if (alt_col) {
+	cswritep = strcpya(cswritep, "\tALT");
+      }
+      if (maj_col) {
+	cswritep = strcpya(cswritep, "\tMAJ");
+      }
+      if (nonmaj_col) {
+	cswritep = strcpya(cswritep, "\tNONMAJ");
+      }
+      for (uint32_t pc_idx = 0; pc_idx < pc_ct;) {
+	++pc_idx;
+	cswritep = memcpyl3a(cswritep, "\tPC");
+	cswritep = uint32toa(pc_idx, cswritep);
+      }
+      append_binary_eoln(&cswritep);
+
+      // Main workflow:
+      // 1. Set n=0, load batch 0
+      //
+      // 2. Spawn threads processing batch n
+      // 3. If n>0, write results and update projection for block (n-1)
+      // 4. Increment n by 1
+      // 5. Load batch n unless eof
+      // 6. Join threads
+      // 7. Goto step 2 unless eof
+      //
+      // 8. Write results and update projection for last block
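+      // (parity is flipped before the write, so the weights flushed from
+      // qq[parity * var_wts_part_size] are always the previous block's,
+      // keeping the writer one batch behind the compute threads)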
+      uint32_t cur_variant_idx_start = 0;
+#ifndef __APPLE__
+      if (output_zst) {
+	// compression is relatively expensive?
+	calc_thread_ct = 1;
+	ts.calc_thread_ct = 1;
+      }
+#endif
+      uintptr_t var_wts_part_size;
+      if (qq) {
+        var_wts_part_size = (MINV(variant_ct, calc_thread_ct * kPcaVariantBlockSize)) * ((uintptr_t)pc_ct);
+      } else {
	// non-approximate PCA: a bunch of buffers have not been allocated yet
+	
+	// if grm[] (which we no longer need) has at least as much remaining
+	// space as bigstack, allocate from grm
+	unsigned char* arena_bottom = (unsigned char*)grm;
+	unsigned char* arena_top = bigstack_mark;
+	uintptr_t arena_avail = (uintptr_t)(arena_top - arena_bottom);
+	if (arena_avail < bigstack_left()) {
+	  arena_bottom = g_bigstack_base;
+	  arena_top = g_bigstack_end;
+	  arena_avail = bigstack_left();
+	}
+	const uintptr_t var_wts_part_alloc = round_up_pow2(2 * kPcaVariantBlockSize * sizeof(double) * pc_ct, kCacheline);
+	const uintptr_t genovecs_alloc = round_up_pow2(pca_sample_ctaw2 * kPcaVariantBlockSize * sizeof(intptr_t), kCacheline);
+	const uintptr_t dosage_cts_alloc = round_up_pow2(kPcaVariantBlockSize * sizeof(int32_t), kCacheline);
+	const uintptr_t dosage_presents_alloc = round_up_pow2(pca_sample_ctaw * kPcaVariantBlockSize * sizeof(intptr_t), kCacheline);
+	const uintptr_t dosage_vals_alloc = round_up_pow2(pca_sample_ct * kPcaVariantBlockSize * sizeof(dosage_t), kCacheline);
+	const uintptr_t cur_maj_freqs_alloc = round_up_pow2(kPcaVariantBlockSize * sizeof(double), kCacheline);
+	const uintptr_t yy_alloc = round_up_pow2(kPcaVariantBlockSize * pca_sample_ct * sizeof(double), kCacheline);
+        const uintptr_t per_thread_alloc = 2 * (genovecs_alloc + dosage_cts_alloc + dosage_presents_alloc + dosage_vals_alloc + cur_maj_freqs_alloc) + yy_alloc + var_wts_part_alloc;
+	if (per_thread_alloc * calc_thread_ct > arena_avail) {
+	  if (arena_avail < per_thread_alloc) {
+	    goto calc_pca_ret_NOMEM;
+	  }
+	  calc_thread_ct = arena_avail / per_thread_alloc;
+	}
+	ts.calc_thread_ct = calc_thread_ct;
+	for (uint32_t parity = 0; parity < 2; ++parity) {
+	  g_genovecs[parity] = (uintptr_t*)arena_alloc_raw(genovecs_alloc * calc_thread_ct, &arena_bottom);
+	  g_dosage_cts[parity] = (uint32_t*)arena_alloc_raw(dosage_cts_alloc * calc_thread_ct, &arena_bottom);
+	  g_dosage_presents[parity] = (uintptr_t*)arena_alloc_raw(dosage_presents_alloc * calc_thread_ct, &arena_bottom);
+	  g_dosage_val_bufs[parity] = (dosage_t*)arena_alloc_raw(dosage_vals_alloc * calc_thread_ct, &arena_bottom);
+	  g_cur_maj_freqs[parity] = (double*)arena_alloc_raw(cur_maj_freqs_alloc * calc_thread_ct, &arena_bottom);
+	}
+	for (uint32_t tidx = 0; tidx < calc_thread_ct; ++tidx) {
+	  g_yy_bufs[tidx] = (double*)arena_alloc_raw(yy_alloc, &arena_bottom);
+	}
+        var_wts_part_size = (MINV(variant_ct, calc_thread_ct * kPcaVariantBlockSize)) * ((uintptr_t)pc_ct);
+	qq = (double*)arena_alloc_raw_rd(2 * var_wts_part_size * sizeof(double), &arena_bottom);
+	g_qq = qq;
+#ifndef NDEBUG
+	if (arena_top == g_bigstack_end) {
+	  // we shouldn't make any more allocations, but just in case...
+	  g_bigstack_base = arena_bottom;
+	}
+#endif
+      }
+      uint32_t prev_batch_size = 0;
+      uint32_t variant_uidx = next_set_unsafe(variant_include, 0);
+      uint32_t variant_uidx_load = variant_uidx;
+      uint32_t parity = 0;
+      reinit_threads3z(&ts);
+      uint32_t chr_fo_idx = 0xffffffffU;
+      uint32_t chr_end = 0;
+      uint32_t chr_buf_blen = 0;
+      while (1) {
+	uint32_t cur_batch_size = 0;
+	if (!ts.is_last_block) {
+	  cur_batch_size = calc_thread_ct * kPcaVariantBlockSize;
+	  uint32_t cur_variant_idx_end = cur_variant_idx_start + cur_batch_size;
+	  if (cur_variant_idx_end > variant_ct) {
+	    cur_batch_size = variant_ct - cur_variant_idx_start;
+	    cur_variant_idx_end = variant_ct;
+	  }
+	  uintptr_t* genovec_iter = g_genovecs[parity];
+	  uint32_t* dosage_ct_iter = g_dosage_cts[parity];
+	  uintptr_t* dosage_present_iter = g_dosage_presents[parity];
+	  dosage_t* dosage_vals_iter = g_dosage_val_bufs[parity];
+	  double* maj_freqs_write_iter = g_cur_maj_freqs[parity];
+	  for (uint32_t variant_idx = cur_variant_idx_start; variant_idx < cur_variant_idx_end; ++variant_uidx_load, ++variant_idx) {
+	    next_set_unsafe_ck(variant_include, &variant_uidx_load);
+	    uint32_t dosage_ct;
+	    uint32_t is_explicit_alt1;
+	    reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(pca_sample_include, pca_sample_include_cumulative_popcounts, pca_sample_ct, variant_uidx_load, simple_pgrp, genovec_iter, dosage_present_iter, dosage_vals_iter, &dosage_ct, &is_explicit_alt1);
+	    if (reterr) {
+	      goto calc_pca_ret_READ_FAIL;
+	    }
+	    const uint32_t maj_allele_idx = maj_alleles[variant_uidx_load];
+	    if (maj_allele_idx) {
+	      genovec_invert_unsafe(pca_sample_ct, genovec_iter);
+	      if (dosage_ct) {
+		biallelic_dosage16_invert(dosage_ct, dosage_vals_iter);
+	      }
+	    }
+	    zero_trailing_quaters(pca_sample_ct, genovec_iter);
+	    genovec_iter = &(genovec_iter[pca_sample_ctaw2]);
+	    *dosage_ct_iter++ = dosage_ct;
+	    dosage_present_iter = &(dosage_present_iter[pca_sample_ctaw]);
+	    dosage_vals_iter = &(dosage_vals_iter[pca_sample_ct]);
+	    uintptr_t allele_idx_base;
+	    if (!variant_allele_idxs) {
+	      allele_idx_base = variant_uidx_load;
+	    } else {
+	      allele_idx_base = variant_allele_idxs[variant_uidx_load];
+	      cur_allele_ct = variant_allele_idxs[variant_uidx_load + 1] - allele_idx_base;
+	      allele_idx_base -= variant_uidx_load;
+	    }
+	    *maj_freqs_write_iter++ = get_allele_freq(&(allele_freqs[allele_idx_base]), maj_allele_idx, cur_allele_ct);
+	  }
+	}
+	if (cur_variant_idx_start) {
+	  join_threads3z(&ts);
+	  if (g_error_ret) {
+	    goto calc_pca_ret_READ_FAIL;
+	  }
+	}
+	if (!ts.is_last_block) {
+	  g_cur_batch_size = cur_batch_size;
+	  ts.is_last_block = (cur_variant_idx_start + cur_batch_size == variant_ct);
+	  ts.thread_func_ptr = calc_pca_var_wts_thread;
+	  if (spawn_threads3z(cur_variant_idx_start, &ts)) {
+	    goto calc_pca_ret_THREAD_CREATE_FAIL;
+	  }
+	}
+	parity = 1 - parity;
+	if (cur_variant_idx_start) {
+	  // write *previous* block results
+	  const double* var_wts_iter = &(qq[parity * var_wts_part_size]);
+	  // (todo: update projection here)
+	  for (uint32_t vidx = cur_variant_idx_start - prev_batch_size; vidx < cur_variant_idx_start; ++vidx, ++variant_uidx) {
+	    next_set_unsafe_ck(variant_include, &variant_uidx);
+	    if (chr_col) {
+	      if (variant_uidx >= chr_end) {
+		int32_t chr_idx;
+		do {
+		  ++chr_fo_idx;
+		  chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+		} while (variant_uidx >= chr_end);
+		chr_idx = cip->chr_file_order[chr_fo_idx];
+		char* chr_name_end = chr_name_write(cip, chr_idx, chr_buf);
+		*chr_name_end = '\t';
+		chr_buf_blen = 1 + (uintptr_t)(chr_name_end - chr_buf);
+	      }
+	      cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
+	    }
+	    if (variant_bps) {
+	      cswritep = uint32toa_x(variant_bps[variant_uidx], '\t', cswritep);
+	    }
+	    cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
+	    uintptr_t variant_allele_idx_base = variant_uidx * 2;
+	    if (variant_allele_idxs) {
+	      variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	      cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+	    }
+	    char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+	    if (ref_col) {
+	      *cswritep++ = '\t';
+	      cswritep = strcpya(cswritep, cur_alleles[0]);
+	    }
+	    if (alt1_col) {
+	      *cswritep++ = '\t';
+	      cswritep = strcpya(cswritep, cur_alleles[1]);
+	    }
+	    if (alt_col) {
+	      *cswritep++ = '\t';
+	      for (uint32_t allele_idx = 1; allele_idx < cur_allele_ct; ++allele_idx) {
+		if (cswrite(&css, &cswritep)) {
+		  goto calc_pca_ret_WRITE_FAIL;
+		}
+		cswritep = strcpyax(cswritep, cur_alleles[allele_idx], ',');
+	      }
+	      --cswritep;
+	    }
+	    const uint32_t maj_allele_idx = maj_alleles[variant_uidx];
+	    if (maj_col) {
+	      if (cswrite(&css, &cswritep)) {
+		goto calc_pca_ret_WRITE_FAIL;
+	      }
+	      *cswritep++ = '\t';
+	      cswritep = strcpya(cswritep, cur_alleles[maj_allele_idx]);
+	    }
+	    if (nonmaj_col) {
+	      *cswritep++ = '\t';
+	      for (uint32_t allele_idx = 0; allele_idx < cur_allele_ct; ++allele_idx) {
+		if (allele_idx == maj_allele_idx) {
+		  continue;
+		}
+		if (cswrite(&css, &cswritep)) {
+		  goto calc_pca_ret_WRITE_FAIL;
+		}
+		cswritep = strcpyax(cswritep, cur_alleles[allele_idx], ',');
+	      }
+	      --cswritep;
+	    }
+	    for (uint32_t pc_idx = 0; pc_idx < pc_ct; ++pc_idx) {
+	      *cswritep++ = '\t';
+	      // could avoid these multiplications by premultiplying the
+	      // sample weight matrix
+	      cswritep = dtoa_g((*var_wts_iter++) * eigval_inv_sqrts[pc_idx], cswritep);
+	    }
+	    append_binary_eoln(&cswritep);
+	  }
+	}
+	if (cur_variant_idx_start == variant_ct) {
+	  break;
+	}
+	cur_variant_idx_start += cur_batch_size;
+	prev_batch_size = cur_batch_size;
+      }
+      if (cswrite_close_null(&css, cswritep)) {
+	goto calc_pca_ret_WRITE_FAIL;
+      }
+      LOGPRINTFWW("--pca%s: Variant weights written to %s .\n", is_approx? " approx" : "", outname);
+    }
+      
+    strcpy(outname_end, ".eigenvec");
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto calc_pca_ret_OPEN_FAIL;
+    }
+    char* write_iter = strcpya(writebuf, "#FID\tIID");
+    if (sids) {
+      write_iter = strcpya(write_iter, "\tSID");
+    }
+    for (uint32_t pc_idx = 0; pc_idx < pc_ct;) {
+      ++pc_idx;
+      write_iter = memcpyl3a(write_iter, "\tPC");
+      write_iter = uint32toa(pc_idx, write_iter);
+    }
+    append_binary_eoln(&write_iter);
+    const uint32_t sample_ct = pca_sample_ct;
+    uint32_t sample_uidx = 0;
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_unsafe_ck(sample_include, &sample_uidx);
+      write_iter = strcpya(write_iter, &(sample_ids[sample_uidx * max_sample_id_blen]));
+      if (sids) {
+	*write_iter++ = '\t';
+	write_iter = strcpya(write_iter, &(sids[sample_uidx * max_sid_blen]));
+      }
+      double* sample_wts_iter = &(eigvecs_smaj[sample_idx * pc_ct]);
+      // todo: read from proj_sample_wts instead when pca_sample_include bit
+      // not set
+      for (uint32_t pc_idx = 0; pc_idx < pc_ct; ++pc_idx) {
+	*write_iter++ = '\t';
+	write_iter = dtoa_g(*sample_wts_iter++, write_iter);
+      }
+      append_binary_eoln(&write_iter);
+      if (write_iter >= writebuf_flush) {
+	if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	  goto calc_pca_ret_WRITE_FAIL;
+	}
+	write_iter = writebuf;
+      }
+    }
+    if (write_iter != writebuf) {
+      if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+	goto calc_pca_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto calc_pca_ret_WRITE_FAIL;
+    }
+
+    strcpy(outname_end, ".eigenval");
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto calc_pca_ret_OPEN_FAIL;
+    }
+    write_iter = writebuf;
+    for (uint32_t pc_idx = 0; pc_idx < pc_ct; ++pc_idx) {
+      write_iter = dtoa_g(eigvals[pc_idx], write_iter);
+      append_binary_eoln(&write_iter);
+    }
+    if (fwrite_checked(writebuf, (uintptr_t)(write_iter - writebuf), outfile)) {
+      goto calc_pca_ret_WRITE_FAIL;
+    }
+    if (fclose_null(&outfile)) {
+      goto calc_pca_ret_WRITE_FAIL;
+    }
+    *outname_end = '\0';
+    LOGPRINTFWW("--pca%s: Eigenvector%s written to %s.eigenvec , and eigenvalue%s written to %s.eigenval .\n", is_approx? " approx" : "", (pc_ct == 1)? "" : "s", outname, (pc_ct == 1)? "" : "s", outname);
+  }
+  while (0) {
+  calc_pca_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  calc_pca_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  calc_pca_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  calc_pca_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  calc_pca_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  calc_pca_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ calc_pca_ret_1:
+  threads3z_cleanup(&ts, &g_cur_batch_size);
+  BLAS_SET_NUM_THREADS(1);
+  cswrite_close_cond(&css, cswritep);
+  fclose_cond(outfile);
+  if (grm) {
+    // nothing after --pca in the plink2 order of operations uses grm[]
+    bigstack_reset(grm);
+  } else {
+    bigstack_reset(bigstack_mark);
+  }
+  return reterr;
+}
+#endif
+
+// to test: do we actually want cur_dosage_ints to be uint64_t* instead of
+// uint32_t*?
+void fill_cur_dosage_ints(const uintptr_t* genovec_buf, const uintptr_t* dosage_present, const dosage_t* dosage_vals_buf, uint32_t sample_ct, uint32_t dosage_ct, uint32_t is_diploid_p1, uint64_t* cur_dosage_ints) {
+  const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
+  uint32_t loop_len = kBitsPerWordD2;
+  uint32_t widx = 0;
  // 2-bit genotype code -> dosage increment, scaled by ploidy
  // (is_diploid_p1 is 2 for diploid, 1 for haploid)
  uint64_t lookup_table[4];
  lookup_table[0] = 0; // code 0 = hom-ref
  lookup_table[1] = is_diploid_p1 * kDosageMid; // code 1 = het
  lookup_table[2] = is_diploid_p1 * kDosageMax; // code 2 = hom-alt
  lookup_table[3] = 0; // code 3 = missing
+  uint64_t* cur_dosage_ints_iter = cur_dosage_ints;
+  while (1) {
+    if (widx >= sample_ctl2_m1) {
+      if (widx > sample_ctl2_m1) {
+	break;
+      }
      // assuming the usual 1-based MOD_NZ() convention, an extra "1 +" here
      // would over-read the final genotype word
      loop_len = MOD_NZ(sample_ct, kBitsPerWordD2);
+    }
+    uintptr_t cur_geno_word = genovec_buf[widx];
+    for (uint32_t uii = 0; uii < loop_len; ++uii) {
+      const uintptr_t cur_geno = cur_geno_word & 3;
+      *cur_dosage_ints_iter++ = lookup_table[cur_geno];
+      cur_geno_word >>= 2;
+    }
+    ++widx;
+  }
+  uint32_t sample_idx = 0;
+  for (uint32_t dosage_idx = 0; dosage_idx < dosage_ct; ++dosage_idx, ++sample_idx) {
+    next_set_unsafe_ck(dosage_present, &sample_idx);
+    cur_dosage_ints[sample_idx] = dosage_vals_buf[dosage_idx] * is_diploid_p1;
+  }
+}
+
+CONSTU31(kScoreVariantBlockSize, 240);
+static double* g_dosages_vmaj[2] = {nullptr, nullptr};
+static double* g_score_coefs_cmaj[2] = {nullptr, nullptr};
+// don't bother to explicitly multithread for now
+static double* g_final_scores_cmaj = nullptr;
+static uint32_t g_score_col_ct = 0;
+static uint32_t g_sample_ct = 0;
+
+THREAD_FUNC_DECL calc_score_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  assert(!tidx);
+  double* final_scores_cmaj = g_final_scores_cmaj;
+  const uint32_t score_col_ct = g_score_col_ct;
+  const uint32_t sample_ct = g_sample_ct;
+  uint32_t parity = 0;
+  while (1) {
+    const uint32_t is_last_batch = g_is_last_thread_block;
+    const uint32_t cur_batch_size = g_cur_batch_size;
+    if (cur_batch_size) {
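+      // In effect: final_scores_cmaj (score_col_ct x sample_ct) +=
+      // coefs (score_col_ct x cur_batch_size, row stride
+      // kScoreVariantBlockSize) * dosages (cur_batch_size x sample_ct).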
+      row_major_matrix_multiply_strided_incr(g_score_coefs_cmaj[parity], g_dosages_vmaj[parity], score_col_ct, kScoreVariantBlockSize, sample_ct, sample_ct, cur_batch_size, sample_ct, final_scores_cmaj);
+    }
+    if (is_last_batch) {
+      THREAD_RETURN;
+    }
+    THREAD_BLOCK_FINISH(tidx);
+    parity = 1 - parity;
+  }
+}
+
+pglerr_t score_report(const uintptr_t* sample_include, const char* sample_ids, const char* sids, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const char* pheno_names, const uintptr_t* variant_include, const chr_info_t* cip, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const double* allele_freqs, const score_info_t* score_info_ptr, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t pheno_ct, uintptr_t max_phe [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  gzFile gz_infile = nullptr;
+  uintptr_t loadbuf_size = 0;
+  uintptr_t line_idx = 0;
+  threads_state_t ts;
+  init_threads3z(&ts);
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  cswrite_init_null(&css);
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    if (!xchr_model) {
+      int32_t x_code;
+      if (xymt_exists(cip, kChrOffsetX, &x_code)) {
+	uint32_t x_chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)x_code];
+	uint32_t x_start = cip->chr_fo_vidx_start[x_chr_fo_idx];
+	uint32_t x_end = cip->chr_fo_vidx_start[x_chr_fo_idx + 1];
+	if (!are_all_bits_zero(variant_include, x_start, x_end)) {
+	  uintptr_t* variant_include_no_x;
+	  if (bigstack_alloc_ul(raw_variant_ctl, &variant_include_no_x)) {
+	    goto score_report_ret_NOMEM;
+	  }
+	  memcpy(variant_include_no_x, variant_include, raw_variant_ctl * sizeof(intptr_t));
+	  clear_bits_nz(x_start, x_end, variant_include_no_x);
+	  variant_include = variant_include_no_x;
+	}
+      }
+    } else if (xchr_model == 2) {
+      xchr_model = 0;
+    }
+    // now xchr_model is set iff it's 1
+
+    const score_flags_t score_flags = score_info_ptr->flags;
+    reterr = gzopen_read_checked(score_info_ptr->input_fname, &gz_infile);
+    if (reterr) {
+      goto score_report_ret_1;
+    }
+
+    loadbuf_size = bigstack_left() / 8;
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else {
+      loadbuf_size &= ~(kCacheline - 1);
+      if (loadbuf_size <= kMaxMediumLine) {
+	goto score_report_ret_NOMEM;
+      }
+    }
+    char* loadbuf = (char*)bigstack_alloc_raw(loadbuf_size);
+    loadbuf[loadbuf_size - 1] = ' ';
+    char* loadbuf_first_token;
+    uint32_t lines_to_skip_p1 = 1 + ((score_flags / kfScoreHeaderIgnore) & 1);
+    for (uint32_t uii = 0; uii < lines_to_skip_p1; ++uii) {
+      do {
+	if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	  if (!gzeof(gz_infile)) {
+	    goto score_report_ret_READ_FAIL;
+	  }
+	  logerrprint("Error: Empty --score file.\n");
+	  goto score_report_ret_MALFORMED_INPUT;
+	}
+	++line_idx;
+	if (!loadbuf[loadbuf_size - 1]) {
+	  goto score_report_ret_LONG_LINE;
+	}
+	loadbuf_first_token = skip_initial_spaces(loadbuf);
+      } while (is_eoln_kns(*loadbuf_first_token));
+    }
+    uint32_t last_col_idx = count_tokens(loadbuf_first_token);
+    const uint32_t varid_col_idx = score_info_ptr->varid_col_p1 - 1;
+    const uint32_t allele_col_idx = score_info_ptr->allele_col_p1 - 1;
+    if (MAXV(varid_col_idx, allele_col_idx) >= last_col_idx) {
+      goto score_report_ret_MISSING_TOKENS;
+    }
+    uint32_t* score_col_idx_deltas = nullptr;
+    uintptr_t score_col_ct = 1;
+    if (!score_info_ptr->input_col_idx_range_list.name_ct) {
+      if (allele_col_idx == last_col_idx) {
+	goto score_report_ret_MISSING_TOKENS;
+      }
+      if (bigstack_alloc_ui(1, &score_col_idx_deltas)) {
+	goto score_report_ret_NOMEM;
+      }
      // catch corner case: the default coefficient column
      // (allele_col_idx + 1) can collide with the variant ID column
+      if (allele_col_idx + 1 == varid_col_idx) {
+	logerrprint("Error: --score variant ID column index matches a coefficient column index.\n");
+	goto score_report_ret_INVALID_CMDLINE;
+      }
+      score_col_idx_deltas[0] = allele_col_idx + 1;
+    } else {
+      const uint32_t last_col_idxl = BITCT_TO_WORDCT(last_col_idx);
+      uintptr_t* score_col_bitarr;
+      if (bigstack_end_calloc_ul(last_col_idxl, &score_col_bitarr)) {
+	goto score_report_ret_NOMEM;
+      }
+      if (numeric_range_list_to_bitarr(&(score_info_ptr->input_col_idx_range_list), last_col_idx, 1, 0, score_col_bitarr)) {
+	goto score_report_ret_MISSING_TOKENS;
+      }
+      if (is_set(score_col_bitarr, varid_col_idx)) {
+	logerrprint("Error: --score variant ID column index matches a coefficient column index.\n");
+	goto score_report_ret_INVALID_CMDLINE;
+      }
+      if (is_set(score_col_bitarr, allele_col_idx)) {
+	logerrprint("Error: --score allele column index matches a coefficient column index.\n");
+	goto score_report_ret_INVALID_CMDLINE;
+      }
+      score_col_ct = popcount_longs(score_col_bitarr, last_col_idxl);
+      if (bigstack_alloc_ui(score_col_ct, &score_col_idx_deltas)) {
+	goto score_report_ret_NOMEM;
+      }
+      uint32_t col_uidx = 0;
+      for (uintptr_t score_col_idx = 0; score_col_idx < score_col_ct; ++score_col_idx, ++col_uidx) {
+	next_set_unsafe_ck(score_col_bitarr, &col_uidx);
+	score_col_idx_deltas[score_col_idx] = col_uidx;
+      }
+      // now convert to deltas
+      for (uintptr_t score_col_idx = score_col_ct - 1; score_col_idx; --score_col_idx) {
+	score_col_idx_deltas[score_col_idx] -= score_col_idx_deltas[score_col_idx - 1];
+      }
+      bigstack_end_reset(bigstack_end_mark);
+    }
+    char** score_col_names;
+    if (bigstack_alloc_cp(score_col_ct, &score_col_names)) {
+      goto score_report_ret_NOMEM;
+    }
+    char* write_iter = (char*)g_bigstack_base;
+    // don't have to worry about overflow, since loadbuf was limited to 1/8
+    // of available workspace.
+    if (score_flags & kfScoreHeaderRead) {
+      char* read_iter = loadbuf_first_token;
+      for (uintptr_t score_col_idx = 0; score_col_idx < score_col_ct; ++score_col_idx) {
+	read_iter = next_token_multz(read_iter, score_col_idx_deltas[score_col_idx]);
+	score_col_names[score_col_idx] = write_iter;
+	char* token_end = token_endnn(read_iter);
+	const uint32_t slen = (uintptr_t)(token_end - read_iter);
+	write_iter = memcpyax(write_iter, read_iter, slen, '\0');
+      }
+
+      // don't reparse this line
+      *loadbuf_first_token = '\0';
+    } else {
+      for (uintptr_t score_col_idx = 0; score_col_idx < score_col_ct; ++score_col_idx) {
+	score_col_names[score_col_idx] = write_iter;
+	write_iter = strcpya(write_iter, "SCORE");
+	write_iter = uint32toa_x(score_col_idx + 1, '\0', write_iter);
+      }
+    }
+    g_bigstack_base = (unsigned char*)round_up_pow2((uintptr_t)write_iter, kCacheline);
+
+    g_score_col_ct = score_col_ct;
+    g_sample_ct = sample_ct;
+    g_cur_batch_size = kScoreVariantBlockSize;
+    ts.calc_thread_ct = 1;
+    const uint32_t sample_ctl2 = QUATERCT_TO_WORDCT(sample_ct);
+    const uint32_t sample_ctl = BITCT_TO_WORDCT(sample_ct);
+    const uint32_t acc1_vec_ct = BITCT_TO_VECCT(sample_ct);
+    const uint32_t acc4_vec_ct = acc1_vec_ct * 4;
+    const uint32_t acc8_vec_ct = acc1_vec_ct * 8;
+    const uint32_t write_score_avgs = (score_flags / kfScoreColScoreAvgs) & 1;
+    const uint32_t write_score_sums = (score_flags / kfScoreColScoreSums) & 1;
+    uint32_t* sample_include_cumulative_popcounts;
+    uintptr_t* sex_nonmale_collapsed;
+    uintptr_t* genovec_buf;
+    uintptr_t* dosage_present_buf;
+    dosage_t* dosage_vals_buf;
+    uintptr_t* missing_acc1;
+    uintptr_t* missing_male_acc1;
+    uint64_t* dosage_sums;
+    uint64_t* dosage_incrs;
+    uintptr_t* already_seen;
+    unsigned char* overflow_buf = nullptr;
+    if (bigstack_alloc_thread(1, &ts.threads) ||
+	bigstack_alloc_d((kScoreVariantBlockSize * k1LU) * sample_ct, &(g_dosages_vmaj[0])) ||
+	bigstack_alloc_d((kScoreVariantBlockSize * k1LU) * sample_ct, &(g_dosages_vmaj[1])) ||
+	bigstack_alloc_d(kScoreVariantBlockSize * score_col_ct, &(g_score_coefs_cmaj[0])) ||
+	bigstack_alloc_d(kScoreVariantBlockSize * score_col_ct, &(g_score_coefs_cmaj[1])) ||
+	bigstack_alloc_d(score_col_ct * sample_ct, &g_final_scores_cmaj) ||
+	bigstack_alloc_ui(sample_ctl, &sample_include_cumulative_popcounts) ||
+	bigstack_alloc_ul(sample_ctl, &sex_nonmale_collapsed) ||
+	bigstack_alloc_ul(sample_ctl2, &genovec_buf) ||
+	bigstack_alloc_ul(sample_ctl, &dosage_present_buf) ||
+        bigstack_alloc_dosage(sample_ct, &dosage_vals_buf) ||
+	bigstack_alloc_ul(45 * acc1_vec_ct * kWordsPerVec, &missing_acc1) ||
+	bigstack_alloc_ul(45 * acc1_vec_ct * kWordsPerVec, &missing_male_acc1) ||
+	bigstack_calloc_ull(sample_ct, &dosage_sums) ||
+	bigstack_alloc_ull(sample_ct, &dosage_incrs) ||
+	bigstack_calloc_ul(raw_variant_ctl, &already_seen) ||
+	bigstack_alloc_uc((score_col_ct * (write_score_avgs + write_score_sums) + pheno_ct) * 16 + 3 * kMaxIdSlen + kCompressStreamBlock + 64, &overflow_buf)) {
+      goto score_report_ret_NOMEM;
+    }
+    uintptr_t* missing_diploid_acc4 = &(missing_acc1[acc1_vec_ct * kWordsPerVec]);
+    uintptr_t* missing_diploid_acc8 = &(missing_diploid_acc4[acc4_vec_ct * kWordsPerVec]);
+    uintptr_t* missing_diploid_acc32 = &(missing_diploid_acc8[acc8_vec_ct * kWordsPerVec]);
+    uintptr_t* missing_haploid_acc4 = &(missing_male_acc1[acc1_vec_ct * kWordsPerVec]);
+    uintptr_t* missing_haploid_acc8 = &(missing_haploid_acc4[acc4_vec_ct * kWordsPerVec]);
+    uintptr_t* missing_haploid_acc32 = &(missing_haploid_acc8[acc8_vec_ct * kWordsPerVec]);
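+    // Tiered missing-genotype accumulators: 1-bit masks fold into the 4-bit
+    // tier every variant, 4-bit into 8-bit every 15 variants, and 8-bit into
+    // 32-bit every 15 * 17 = 255 variants, so no tier can overflow.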
+    fill_ulong_zero(acc4_vec_ct * kWordsPerVec, missing_diploid_acc4);
+    fill_ulong_zero(acc8_vec_ct * kWordsPerVec, missing_diploid_acc8);
+    fill_ulong_zero(acc8_vec_ct * (4 * kWordsPerVec), missing_diploid_acc32);
+    fill_ulong_zero(acc4_vec_ct * kWordsPerVec, missing_haploid_acc4);
+    fill_ulong_zero(acc8_vec_ct * kWordsPerVec, missing_haploid_acc8);
+    fill_ulong_zero(acc8_vec_ct * (4 * kWordsPerVec), missing_haploid_acc32);
+    fill_cumulative_popcounts(sample_include, sample_ctl, sample_include_cumulative_popcounts);
+    copy_bitarr_subset(sex_male, sample_include, sample_ct, sex_nonmale_collapsed);
+    bitarr_invert(sample_ct, sex_nonmale_collapsed);
+    const uint32_t nonmale_ct = popcount_longs(sex_nonmale_collapsed, sample_ctl);
+    const uint32_t male_ct = sample_ct - nonmale_ct;
+    uint32_t* variant_id_htable = nullptr;
+    uint32_t variant_id_htable_size;
+    reterr = alloc_and_populate_id_htable_mt(variant_include, variant_ids, variant_ct, max_thread_ct, &variant_id_htable, nullptr, &variant_id_htable_size);
+    if (reterr) {
+      goto score_report_ret_1;
+    }
+
+    const uint32_t list_variants = (score_flags / kfScoreListVariants) & 1;
+    if (list_variants) {
+      char* outname_end2 = strcpya0(outname_end, ".sscore.vars");
+      const uint32_t output_zst = (score_flags / kfScoreListVariantsZs) & 1;
+      if (output_zst) {
+	strcpy(outname_end2, ".zst");
+      }
+      if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+	goto score_report_ret_OPEN_FAIL;
+      }
+      cswritep = (char*)overflow_buf;
+    }
+
+    const int32_t x_code = cip->xymt_codes[kChrOffsetX];
+    const int32_t y_code = cip->xymt_codes[kChrOffsetY];
+    const int32_t mt_code = cip->xymt_codes[kChrOffsetMT];
+    const uint32_t variance_standardize = (score_flags / kfScoreVarianceStandardize) & 1;
+    const uint32_t center = variance_standardize || (score_flags & kfScoreCenter);
+    const uint32_t no_meanimpute = (score_flags / kfScoreNoMeanimpute) & 1;
+    const uint32_t se_mode = (score_flags / kfScoreSe) & 1;
+    uint32_t block_vidx = 0;
+    uint32_t parity = 0;
+    uint32_t cur_allele_ct = 2;
+    double* cur_dosages_vmaj_iter = g_dosages_vmaj[0];
+    double* cur_score_coefs_cmaj = g_score_coefs_cmaj[0];
+    double geno_slope = kRecipDosageMax;
+    double geno_intercept = 0.0;
+    uint32_t variant_ct_rem15 = 15;
+    uint32_t variant_ct_rem255d15 = 17;
+    uint32_t variant_hap_ct_rem15 = 15;
+    uint32_t variant_hap_ct_rem255d15 = 17;
+    uint32_t allele_ct_base = 0;
+    int32_t male_allele_ct_delta = 0;
+    uint32_t valid_variant_ct = 0;
+    uintptr_t missing_var_id_ct = 0;
+    uintptr_t missing_allele_code_ct = 0;
+#ifdef USE_MTBLAS
+    const uint32_t matrix_multiply_thread_ct = (max_thread_ct > 1)? (max_thread_ct - 1) : 1;
+    BLAS_SET_NUM_THREADS(matrix_multiply_thread_ct);
+#endif
+    pgr_clear_ld_cache(simple_pgrp);
+    while (1) {
+      if (!is_eoln_kns(*loadbuf_first_token)) {
+	// varid_col_idx and allele_col_idx will almost always be very small
+	char* variant_id_start = next_token_multz(loadbuf_first_token, varid_col_idx);
+	if (!variant_id_start) {
+	  goto score_report_ret_MISSING_TOKENS;
+	}
+	char* variant_id_token_end = token_endnn(variant_id_start);
+	const uint32_t variant_id_slen = (uintptr_t)(variant_id_token_end - variant_id_start);
+	uint32_t variant_uidx = variant_id_dupflag_htable_find(variant_id_start, variant_ids, variant_id_htable, variant_id_slen, variant_id_htable_size, max_variant_id_slen);
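+	// top bit set: ID not found (0xffffffffU) or flagged as a duplicate
+	// by the dupflag htable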
+	if (!(variant_uidx >> 31)) {
+	  if (is_set(already_seen, variant_uidx)) {
+	    sprintf(g_logbuf, "Error: Variant ID '%s' appears multiple times in --score file.\n", variant_ids[variant_uidx]);
+	    goto score_report_ret_MALFORMED_INPUT_WW;
+	  }
+	  set_bit(variant_uidx, already_seen);
+	  char* allele_start = next_token_multz(loadbuf_first_token, allele_col_idx);
+	  if (!allele_start) {
+	    goto score_report_ret_MISSING_TOKENS;
+	  }
+	  uintptr_t variant_allele_idx_base;
+	  if (!variant_allele_idxs) {
+	    variant_allele_idx_base = variant_uidx * 2;
+	  } else {
+	    variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	    cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+	  }
+	  char* allele_end = token_endnn(allele_start);
+	  char allele_end_char = *allele_end;
+	  *allele_end = '\0';
+	  char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+	  uint32_t cur_allele_idx = 0;
+	  for (; cur_allele_idx < cur_allele_ct; ++cur_allele_idx) {
+	    // for very long alleles, strcmp_se might read past the end of the
+	    // workspace, so just use plain strcmp.
+	    if (!strcmp(allele_start, cur_alleles[cur_allele_idx])) {
+	      break;
+	    }
+	  }
+	  if (cur_allele_idx != cur_allele_ct) {
+	    // okay, the variant and allele are in our dataset.  Load it.
+	    // (todo: make this work in multiallelic case)
+	    uint32_t dosage_ct;
+	    uint32_t is_explicit_alt1;
+	    reterr = pgr_read_refalt1_genovec_dosage16_subset_unsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, variant_uidx, simple_pgrp, genovec_buf, dosage_present_buf, dosage_vals_buf, &dosage_ct, &is_explicit_alt1);
+	    if (reterr) {
+	      if (reterr == kPglRetMalformedInput) {
+	        logprint("\n");
+		logerrprint("Error: Malformed .pgen file.\n");
+	      }
+	      goto score_report_ret_1;
+	    }
+	    const uint32_t chr_idx = get_variant_chr(cip, variant_uidx);
+	    uint32_t is_relevant_x = (((int32_t)chr_idx) == x_code);
+	    if (variance_standardize && (is_relevant_x || (((int32_t)chr_idx) == mt_code))) {
+	      logerrprint("Error: --score 'variance-standardize' modifier cannot be used with chrX or MT.\n");
+	      goto score_report_ret_INCONSISTENT_INPUT;
+	    }
+	    const uint32_t is_nonx_haploid = (!is_relevant_x) && is_set(cip->haploid_mask, chr_idx);
+
+            // only if --xchr-model 1 (which is no longer the default)
+	    is_relevant_x = is_relevant_x && xchr_model;
+
+	    const uint32_t is_y = (((int32_t)chr_idx) == y_code);
+	    // pre-multiallelic kludge: current counts are for alt1, invert if
+	    // score is based on ref allele
+	    if (!cur_allele_idx) {
+	      genovec_invert_unsafe(sample_ct, genovec_buf);
+	      if (dosage_ct) {
+		biallelic_dosage16_invert(dosage_ct, dosage_vals_buf);
+	      }
+	    }
+	    zero_trailing_quaters(sample_ct, genovec_buf);
+	    genovec_to_missingness_unsafe(genovec_buf, sample_ct, missing_acc1);
+	    if (dosage_ct) {
+	      bitvec_andnot(dosage_present_buf, sample_ctl, missing_acc1);
+	    }
+	    fill_cur_dosage_ints(genovec_buf, dosage_present_buf, dosage_vals_buf, sample_ct, dosage_ct, 2 - is_nonx_haploid, dosage_incrs);
+	    double ploidy_d;
+	    if (is_nonx_haploid) {
+	      if (is_y) {
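+		// chrY: nonmale samples contribute nothing.  Zero their
+		// dosage increments and keep them out of the haploid
+		// missingness accumulators; they are re-marked as missing
+		// below so the mean-imputation/zero-fill logic skips them.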
+		uint32_t sample_idx = 0;
+		for (uint32_t nonmale_idx = 0; nonmale_idx < nonmale_ct; ++nonmale_idx, ++sample_idx) {
+		  next_set_unsafe_ck(sex_nonmale_collapsed, &sample_idx);
+		  dosage_incrs[sample_idx] = 0;
+		}
+		++male_allele_ct_delta;
+		bitvec_andnot(sex_nonmale_collapsed, sample_ctl, missing_acc1);
+	      } else {
+		++allele_ct_base;
+	      }
+	      unroll_incr_1_4(missing_acc1, acc1_vec_ct, missing_haploid_acc4);
+	      if (!(--variant_hap_ct_rem15)) {
+		unroll_zero_incr_4_8(acc4_vec_ct, missing_haploid_acc4, missing_haploid_acc8);
+		variant_hap_ct_rem15 = 15;
+		if (!(--variant_hap_ct_rem255d15)) {
+		  unroll_zero_incr_8_32(acc8_vec_ct, missing_haploid_acc8, missing_haploid_acc32);
+		  variant_hap_ct_rem255d15 = 17;
+		}
+	      }
+	      if (is_y) {
+		memcpy(missing_male_acc1, missing_acc1, sample_ctl * sizeof(intptr_t));
+		bitvec_or(sex_nonmale_collapsed, sample_ctl, missing_acc1);
+	      }
+	      ploidy_d = 1.0;
+	    } else {
+	      if (is_relevant_x) {
+		uint32_t sample_idx = 0;
+		for (uint32_t male_idx = 0; male_idx < male_ct; ++male_idx, ++sample_idx) {
+		  next_unset_unsafe_ck(sex_nonmale_collapsed, &sample_idx);
+		  dosage_incrs[sample_idx] /= 2;
+		}
+		bitvec_andnot_copy(missing_acc1, sex_nonmale_collapsed, sample_ctl, missing_male_acc1);
+		bitvec_and(sex_nonmale_collapsed, sample_ctl, missing_acc1);
+	      }
+	      unroll_incr_1_4(missing_acc1, acc1_vec_ct, missing_diploid_acc4);
+	      if (!(--variant_ct_rem15)) {
+		unroll_zero_incr_4_8(acc4_vec_ct, missing_diploid_acc4, missing_diploid_acc8);
+		variant_ct_rem15 = 15;
+		if (!(--variant_ct_rem255d15)) {
+		  unroll_zero_incr_8_32(acc8_vec_ct, missing_diploid_acc8, missing_diploid_acc32);
+		  variant_ct_rem255d15 = 17;
+		}
+	      }
+	      allele_ct_base += 2;
+	      if (is_relevant_x) {
+		--male_allele_ct_delta;
+		unroll_incr_1_4(missing_male_acc1, acc1_vec_ct, missing_haploid_acc4);
+		if (!(--variant_hap_ct_rem15)) {
+		  unroll_zero_incr_4_8(acc4_vec_ct, missing_haploid_acc4, missing_haploid_acc8);
+		  variant_hap_ct_rem15 = 15;
+		  if (!(--variant_hap_ct_rem255d15)) {
+		    unroll_zero_incr_8_32(acc8_vec_ct, missing_haploid_acc8, missing_haploid_acc32);
+		    variant_hap_ct_rem255d15 = 17;
+		  }
+		}
+		bitvec_or(missing_male_acc1, sample_ctl, missing_acc1);
+	      }
+	      ploidy_d = 2.0;
+	    }
+	    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+	      dosage_sums[sample_idx] += dosage_incrs[sample_idx];
+	    }
+	    const double cur_allele_freq = get_allele_freq(&(allele_freqs[variant_allele_idx_base - variant_uidx]), cur_allele_idx, cur_allele_ct);
+	    if (center) {
+	      if (variance_standardize) {
+		const double variance = ploidy_d * cur_allele_freq * (1.0 - cur_allele_freq);
+		if (variance < kSmallEpsilon) {
+		  zero_trailing_quaters(sample_ct, genovec_buf);
+		  uint32_t genocounts[4];
+		  genovec_count_freqs_unsafe(genovec_buf, sample_ct, genocounts);
+		  if (dosage_ct || genocounts[1] || genocounts[2]) {
+		    sprintf(g_logbuf, "Error: --score variance-standardize failure for ID '%s': estimated allele frequency is zero, but not all dosages are zero. (This is possible when e.g. allele frequencies are estimated from founders, but the allele is only observed in nonfounders.)\n", variant_ids[variant_uidx]);
+		    goto score_report_ret_INCONSISTENT_INPUT_WW;
+		  }
+		  geno_slope = 0.0;
+		} else {
+		  geno_slope = kRecipDosageMax / sqrt(variance);
+		}
+	      }
+	      // (ploidy * cur_allele_freq * kDosageMax) * geno_slope +
+	      //   geno_intercept == 0
+	      // bugfix: must use "-1.0 *" instead of - to avoid unsigned int
+	      //   wraparound
+	      geno_intercept = (-1.0 * kDosageMax) * ploidy_d * cur_allele_freq * geno_slope;
+	    }
+	    const uint32_t missing_ct = popcount_longs(missing_acc1, sample_ctl);
+	    const uint32_t nm_sample_ct = sample_ct - missing_ct;
+	    if (missing_ct) {
+	      double missing_effect = 0.0;
+	      if (!no_meanimpute) {
+		missing_effect = kDosageMax * cur_allele_freq * geno_slope;
+	      }
+	      uint32_t sample_idx = 0;
+	      if (is_y || is_relevant_x) {
+		fill_double_zero(sample_ct, cur_dosages_vmaj_iter);
+		if (!no_meanimpute) {
+		  const uint32_t male_missing_ct = popcount_longs(missing_male_acc1, sample_ctl);
+		  for (uint32_t male_missing_idx = 0; male_missing_idx < male_missing_ct; ++male_missing_idx, ++sample_idx) {
+		    next_set_unsafe_ck(missing_male_acc1, &sample_idx);
+		    cur_dosages_vmaj_iter[sample_idx] = missing_effect;
+		  }
+		  if (is_relevant_x) {
+		    // missing_male_acc1 not used after this point, so okay to
+		    // use buffer for nonmales
+		    bitvec_and_copy(missing_acc1, sex_nonmale_collapsed, sample_ctl, missing_male_acc1);
+		    missing_effect *= 2;
+		    const uint32_t nonmale_missing_ct = popcount_longs(missing_male_acc1, sample_ctl);
+		    for (uint32_t nonmale_missing_idx = 0; nonmale_missing_idx < nonmale_missing_ct; ++nonmale_missing_idx, ++sample_idx) {
+		      next_set_unsafe_ck(missing_male_acc1, &sample_idx);
+		      cur_dosages_vmaj_iter[sample_idx] = missing_effect;
+		    }
+		  }
+		}
+	      } else {
+		missing_effect *= ploidy_d;
+		for (uint32_t missing_idx = 0; missing_idx < missing_ct; ++missing_idx, ++sample_idx) {
+		  next_set_unsafe_ck(missing_acc1, &sample_idx);
+		  cur_dosages_vmaj_iter[sample_idx] = missing_effect;
+		}
+	      }
+	    }
+	    uint32_t sample_idx = 0;
+	    for (uint32_t nm_sample_idx = 0; nm_sample_idx < nm_sample_ct; ++nm_sample_idx, ++sample_idx) {
+	      next_unset_unsafe_ck(missing_acc1, &sample_idx);
+	      cur_dosages_vmaj_iter[sample_idx] = ((int64_t)dosage_incrs[sample_idx]) * geno_slope + geno_intercept;
+	    }
+	    if (se_mode) {
+	      // Suppose our score coefficients are drawn from independent
+	      // Gaussians.  Then the variance of the final score average is
+	      // the sum of the variances of the individual terms, divided by
+	      // (T^2) where T is the number of terms.  These individual
+	      // variances are of the form ([genotype value] * [stdev])^2.
+	      //
+	      // Thus, we can use the same inner loop to compute standard
+	      // errors, as long as
+	      //   1. we square the genotypes and the standard errors before
+	      //      matrix multiplication, and
+	      //   2. we take the square root of the sums at the end.
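+	      //
+	      // Concretely: if score = (1/T) * sum_i(g_i * b_i) with
+	      // independent b_i ~ N(mu_i, s_i^2), then
+	      //   Var(score) = (1/T^2) * sum_i((g_i * s_i)^2),
+	      // which is what the squared-input matrix product below computes.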
+	      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx) {
+		cur_dosages_vmaj_iter[sample_idx] *= cur_dosages_vmaj_iter[sample_idx];
+	      }
+	    }
+	    cur_dosages_vmaj_iter = &(cur_dosages_vmaj_iter[sample_ct]);
+
+	    *allele_end = allele_end_char;
+	    double* cur_score_coefs_iter = &(cur_score_coefs_cmaj[block_vidx]);
+	    char* read_iter = loadbuf_first_token;
+	    for (uint32_t score_col_idx = 0; score_col_idx < score_col_ct; ++score_col_idx) {
+	      read_iter = next_token_multz(read_iter, score_col_idx_deltas[score_col_idx]);
+	      if (!read_iter) {
+		goto score_report_ret_MISSING_TOKENS;
+	      }
+	      double raw_coef;
+	      char* token_end = scanadv_double(read_iter, &raw_coef);
+	      if (!token_end) {
+		sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --score file has an invalid coefficient.\n", line_idx);
+		goto score_report_ret_MALFORMED_INPUT_2;
+	      }
+	      *cur_score_coefs_iter = raw_coef;
+	      cur_score_coefs_iter = &(cur_score_coefs_iter[kScoreVariantBlockSize]);
+	      read_iter = token_end;
+	    }
+	    if (list_variants) {
+	      cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
+	      append_binary_eoln(&cswritep);
+	      if (cswrite(&css, &cswritep)) {
+		goto score_report_ret_WRITE_FAIL;
+	      }
+	    }
+	    ++valid_variant_ct;
+	    if (!(valid_variant_ct % 10000)) {
+	      printf("\r--score: %uk variants loaded.", valid_variant_ct / 1000);
+	      fflush(stdout);
+	    }
+	    ++block_vidx;
+	    if (block_vidx == kScoreVariantBlockSize) {
+	      if (se_mode) {
+		for (uintptr_t ulii = 0; ulii < kScoreVariantBlockSize * score_col_ct; ++ulii) {
+		  cur_score_coefs_cmaj[ulii] *= cur_score_coefs_cmaj[ulii];
+		}
+	      }
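+	      // Flip the ping-pong parity and hand the just-filled block to
+	      // the worker thread; parsing continues into the other buffer
+	      // while the multiply runs.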
+	      parity = 1 - parity;
+	      const uint32_t is_not_first_block = (ts.thread_func_ptr != nullptr);
+	      if (is_not_first_block) {
+		join_threads3z(&ts);
+	      } else {
+		ts.thread_func_ptr = calc_score_thread;
+	      }
+	      if (spawn_threads3z(is_not_first_block, &ts)) {
+		goto score_report_ret_THREAD_CREATE_FAIL;
+	      }
+	      cur_dosages_vmaj_iter = g_dosages_vmaj[parity];
+	      cur_score_coefs_cmaj = g_score_coefs_cmaj[parity];
+	      block_vidx = 0;
+	    }
+	  } else {
+	    ++missing_allele_code_ct;
+	  }
+	} else {
+	  if (variant_uidx != 0xffffffffU) {
+	    sprintf(g_logbuf, "Error: --score variant ID '%s' appears multiple times in main dataset.\n", variant_ids[variant_uidx & 0x7fffffff]);
+	    goto score_report_ret_INCONSISTENT_INPUT_WW;
+	  }
+	  ++missing_var_id_ct;
+	}
+      }
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto score_report_ret_READ_FAIL;
+	}
+	break;
+      }
+      ++line_idx;
+      if (!loadbuf[loadbuf_size - 1]) {
+	goto score_report_ret_LONG_LINE;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+    }
+    unroll_incr_4_8(missing_diploid_acc4, acc4_vec_ct, missing_diploid_acc8);
+    unroll_incr_8_32(missing_diploid_acc8, acc8_vec_ct, missing_diploid_acc32);
+    unroll_incr_4_8(missing_haploid_acc4, acc4_vec_ct, missing_haploid_acc8);
+    unroll_incr_8_32(missing_haploid_acc8, acc8_vec_ct, missing_haploid_acc32);
+    const uint32_t is_not_first_block = (ts.thread_func_ptr != nullptr);
+    putc_unlocked('\r', stdout);
+    if (missing_var_id_ct || missing_allele_code_ct) {
+      if (!missing_var_id_ct) {
+	sprintf(g_logbuf, "Warning: %" PRIuPTR " --score file entr%s.\n", missing_allele_code_ct, (missing_allele_code_ct == 1)? "y was skipped due to a mismatching allele code" : "ies were skipped due to mismatching allele codes");
+      } else if (!missing_allele_code_ct) {
+	sprintf(g_logbuf, "Warning: %" PRIuPTR " --score file entr%s.\n", missing_var_id_ct, (missing_var_id_ct == 1)? "y was skipped due to a missing variant ID" : "ies were skipped due to missing variant IDs");
+      } else {
+	sprintf(g_logbuf, "Warning: %" PRIuPTR " --score file entr%s, and %" PRIuPTR " %s.\n", missing_var_id_ct, (missing_var_id_ct == 1)? "y was skipped due to a missing variant ID" : "ies were skipped due to missing variant IDs", missing_allele_code_ct, (missing_allele_code_ct == 1)? "was skipped due to a mismatching allele code" : "were skipped due to mismatching allele codes");
+      }
+      wordwrapb(0);
+      logerrprintb();
+      if (!list_variants) {
+	logerrprint("(Add the 'list-variants' modifier to see which variants were actually used for\nscoring.)\n");
+      }
+    }
+    if (block_vidx) {
+      if (is_not_first_block) {
+	join_threads3z(&ts);
+      } else {
+	ts.thread_func_ptr = calc_score_thread;
+      }
+    } else if (!valid_variant_ct) {
+      logerrprint("Error: No valid variants in --score file.\n");
+      goto score_report_ret_MALFORMED_INPUT;
+    } else {
+      join_threads3z(&ts);
+    }
+    ts.is_last_block = 1;
+    g_cur_batch_size = block_vidx;
+    if (se_mode) {
+      for (uintptr_t score_col_idx = 0; score_col_idx < score_col_ct; ++score_col_idx) {
+	double* cur_score_coefs_row = &(cur_score_coefs_cmaj[score_col_idx * kScoreVariantBlockSize]);
+	for (uint32_t uii = 0; uii < block_vidx; ++uii) {
+	  cur_score_coefs_row[uii] *= cur_score_coefs_row[uii];
+	}
+      }
+    }
+    if (spawn_threads3z(is_not_first_block, &ts)) {
+      goto score_report_ret_THREAD_CREATE_FAIL;
+    }
+    join_threads3z(&ts);
+    if (gzclose_null(&gz_infile)) {
+      goto score_report_ret_READ_FAIL;
+    }
+    if (se_mode) {
+      // sample_ct * score_col_ct
+      for (uintptr_t ulii = 0; ulii < sample_ct * score_col_ct; ++ulii) {
+	g_final_scores_cmaj[ulii] = sqrt(g_final_scores_cmaj[ulii]);
+      }
+    }
+    LOGPRINTF("--score: %u variant%s processed.\n", valid_variant_ct, (valid_variant_ct == 1)? "" : "s");
+    if (list_variants) {
+      if (cswrite_close_null(&css, cswritep)) {
+	goto score_report_ret_WRITE_FAIL;
+      }
+      cswritep = nullptr;
+      LOGPRINTF("Variant list written to %s .\n", outname);
+    }
+
+    char* outname_end2 = strcpya0(outname_end, ".sscore");
+    const uint32_t output_zst = (score_flags / kfScoreZs) & 1;
+    if (output_zst) {
+      strcpy(outname_end2, ".zst");
+    }
+    if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+      goto score_report_ret_OPEN_FAIL;
+    }
+    cswritep = (char*)overflow_buf;
+    // see e.g. write_psam() in plink2_data.cpp
+    const uint32_t write_sid = sid_col_required(sample_include, sids, sample_ct, max_sid_blen, score_flags / kfScoreColMaybesid);
+    const uint32_t write_empty_pheno = (score_flags & kfScoreColPheno1) && (!pheno_ct);
+    const uint32_t write_phenos = (score_flags & (kfScoreColPheno1 | kfScoreColPhenos)) && pheno_ct;
+    if (write_phenos && (!(score_flags & kfScoreColPhenos))) {
+      pheno_ct = 1;
+    }
+    cswritep = strcpya(cswritep, "#FID\tIID");
+    if (write_sid) {
+      cswritep = strcpya(cswritep, "\tSID");
+    }
+    if (write_phenos) {
+      for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	*cswritep++ = '\t';
+	cswritep = strcpya(cswritep, &(pheno_names[pheno_idx * max_pheno_name_blen]));
+	if (cswrite(&css, &cswritep)) {
+	  goto score_report_ret_WRITE_FAIL;
+	}
+      }
+    } else if (write_empty_pheno) {
+      cswritep = strcpya(cswritep, "\tPHENO1");
+    }
+    const uint32_t write_nmiss_allele = (score_flags / kfScoreColNmissAllele) & 1;
+    if (write_nmiss_allele) {
+      cswritep = strcpya(cswritep, "\tNMISS_ALLELE_CT");
+    }
+    const uint32_t write_denom = (score_flags / kfScoreColDenom) & 1;
+    if (write_denom) {
+      cswritep = strcpya(cswritep, "\tDENOM");
+    }
+    const uint32_t write_dosage_sum = (score_flags / kfScoreColDosageSum) & 1;
+    if (write_dosage_sum) {
+      cswritep = strcpya(cswritep, "\tNAMED_ALLELE_DOSAGE_SUM");
+    }
+    if (write_score_avgs) {
+      for (uint32_t score_col_idx = 0; score_col_idx < score_col_ct; ++score_col_idx) {
+	*cswritep++ = '\t';
+	cswritep = strcpya(cswritep, score_col_names[score_col_idx]);
+	cswritep = strcpya(cswritep, "_AVG");
+	if (cswrite(&css, &cswritep)) {
+	  goto score_report_ret_WRITE_FAIL;
+	}
+      }
+    }
+    if (write_score_sums) {
+      for (uint32_t score_col_idx = 0; score_col_idx < score_col_ct; ++score_col_idx) {
+	*cswritep++ = '\t';
+	cswritep = strcpya(cswritep, score_col_names[score_col_idx]);
+	cswritep = strcpya(cswritep, "_SUM");
+	if (cswrite(&css, &cswritep)) {
+	  goto score_report_ret_WRITE_FAIL;
+	}
+      }
+    }
+    append_binary_eoln(&cswritep);
+    const uint32_t* scrambled_missing_diploid_cts = (uint32_t*)missing_diploid_acc32;
+    const uint32_t* scrambled_missing_haploid_cts = (uint32_t*)missing_haploid_acc32;
+    const char* output_missing_pheno = g_output_missing_pheno;
+    const uint32_t omp_slen = strlen(output_missing_pheno);
+
+    uint32_t sample_uidx = 0;
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      next_set_unsafe_ck(sample_include, &sample_uidx);
+      cswritep = strcpya(cswritep, &(sample_ids[sample_uidx * max_sample_id_blen]));
+      if (write_sid) {
+	*cswritep++ = '\t';
+	if (sids) {
+	  cswritep = strcpya(cswritep, &(sids[max_sid_blen * sample_uidx]));
+	} else {
+	  *cswritep++ = '0';
+	}
+      }
+      if (write_phenos) {
+	// er, this probably belongs in its own function
+	for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	  const pheno_col_t* cur_pheno_col = &(pheno_cols[pheno_idx]);
+	  const pheno_dtype_t type_code = cur_pheno_col->type_code;
+	  *cswritep++ = '\t';
+	  if (type_code <= kPhenoDtypeQt) {
+	    if (!IS_SET(cur_pheno_col->nonmiss, sample_uidx)) {
+	      cswritep = memcpya(cswritep, output_missing_pheno, omp_slen);
+	    } else if (type_code == kPhenoDtypeCc) {
+	      *cswritep++ = '1' + IS_SET(cur_pheno_col->data.cc, sample_uidx);
+	    } else {
+	      cswritep = dtoa_g(cur_pheno_col->data.qt[sample_uidx], cswritep);
+	    }
+	  } else {
+	    // category index guaranteed to be zero for missing values
+	    cswritep = strcpya(cswritep, cur_pheno_col->category_names[cur_pheno_col->data.cat[sample_uidx]]);
+	    if (cswrite(&css, &cswritep)) {
+	      goto score_report_ret_WRITE_FAIL;
+	    }
+	  }
+	}
+      } else if (write_empty_pheno) {
+	*cswritep++ = '\t';
+	cswritep = memcpya(cswritep, output_missing_pheno, omp_slen);
+      }
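+      // The missingness counters were accumulated in the vector-interleaved
+      // layout produced by the 1->4->8->32 unroll chain above, so per-sample
+      // lookups need the matching index permutation.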
+      const uint32_t scrambled_idx = scramble_1_4_8_32(sample_idx);
+      uint32_t denom = allele_ct_base + is_set(sex_male, sample_uidx) * male_allele_ct_delta;
+      const uint32_t nmiss_allele_ct = denom - 2 * scrambled_missing_diploid_cts[scrambled_idx] - scrambled_missing_haploid_cts[scrambled_idx];
+      if (write_nmiss_allele) {
+	*cswritep++ = '\t';
+	cswritep = uint32toa(nmiss_allele_ct, cswritep);
+      }
+      if (no_meanimpute) {
+        denom = nmiss_allele_ct;
+      }
+      if (write_denom) {
+	*cswritep++ = '\t';
+	cswritep = uint32toa(denom, cswritep);
+      }
+      if (write_dosage_sum) {
+	*cswritep++ = '\t';
+	cswritep = print_dosage(dosage_sums[sample_idx], cswritep);
+      }
+      const double* final_score_col = &(g_final_scores_cmaj[sample_idx]);
+      if (write_score_avgs) {
+	const double denom_recip = 1.0 / ((double)denom);
+	for (uint32_t score_col_idx = 0; score_col_idx < score_col_ct; ++score_col_idx) {
+	  *cswritep++ = '\t';
+	  cswritep = dtoa_g(final_score_col[score_col_idx * sample_ct] * denom_recip, cswritep);
+	}
+      }
+      if (write_score_sums) {
+	for (uint32_t score_col_idx = 0; score_col_idx < score_col_ct; ++score_col_idx) {
+	  *cswritep++ = '\t';
+	  cswritep = dtoa_g(final_score_col[score_col_idx * sample_ct], cswritep);
+	}
+      }
+      append_binary_eoln(&cswritep);
+      if (cswrite(&css, &cswritep)) {
+	goto score_report_ret_WRITE_FAIL;
+      }
+    }
+    if (cswrite_close_null(&css, cswritep)) {
+      goto score_report_ret_WRITE_FAIL;
+    }
+    LOGPRINTFWW("--score: Results written to %s .\n", outname);
+  }
+  while (0) {
+  score_report_ret_LONG_LINE:
+    if (loadbuf_size == kMaxLongLine) {
+      LOGERRPRINTF("Error: Line %" PRIuPTR " of --score file is pathologically long.\n", line_idx);
+      reterr = kPglRetMalformedInput;
+      break;
+    }
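+    // fall through: the line did not fit, but only because the workspace
+    // allocation was small, so out-of-memory is the right report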
+  score_report_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  score_report_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  score_report_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  score_report_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  score_report_ret_INVALID_CMDLINE:
+    reterr = kPglRetInvalidCmdline;
+    break;
+  score_report_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+  score_report_ret_MALFORMED_INPUT_2:
+    logprint("\n");
+    logerrprintb();
+  score_report_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  score_report_ret_MISSING_TOKENS:
+    logprint("\n");
+    LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, score_info_ptr->input_fname);
+    reterr = kPglRetInconsistentInput;
+    break;
+  score_report_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logprint("\n");
+    logerrprintb();
+  score_report_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  score_report_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+ score_report_ret_1:
+  cswrite_close_cond(&css, cswritep);
+  threads3z_cleanup(&ts, &g_cur_batch_size);
+  BLAS_SET_NUM_THREADS(1);
+  gzclose_cond(gz_infile);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  return reterr;
+}
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/plink2_matrix_calc.h b/plink2_matrix_calc.h
new file mode 100644
index 0000000..acd736c
--- /dev/null
+++ b/plink2_matrix_calc.h
@@ -0,0 +1,145 @@
+#ifndef __PLINK2_MATRIX_CALC_H__
+#define __PLINK2_MATRIX_CALC_H__
+
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_random.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+FLAGSET_DEF_START()
+  kfKing0,
+  kfKingMatrixZs = (1 << 0),
+  kfKingMatrixBin = (1 << 1),
+  kfKingMatrixBin4 = (1 << 2),
+  kfKingMatrixEncodemask = (kfKingMatrixZs | kfKingMatrixBin | kfKingMatrixBin4),
+  kfKingMatrixSq = (1 << 3),
+  kfKingMatrixSq0 = (1 << 4),
+  kfKingMatrixTri = (1 << 5),
+  kfKingMatrixShapemask = (kfKingMatrixSq0 | kfKingMatrixSq | kfKingMatrixTri),
+
+  kfKingTableZs = (1 << 6),
+  kfKingCounts = (1 << 7),
+
+  kfKingColId = (1 << 8),
+  kfKingColMaybesid = (1 << 9),
+  kfKingColSid = (1 << 10),
+  kfKingColNsnp = (1 << 11),
+  kfKingColHethet = (1 << 12),
+  kfKingColIbs0 = (1 << 13),
+  kfKingColIbs1 = (1 << 14),
+  kfKingColKinship = (1 << 15),
+  kfKingColDefault = (kfKingColId | kfKingColMaybesid | kfKingColNsnp | kfKingColHethet | kfKingColIbs0 | kfKingColKinship),
+  kfKingColAll = ((kfKingColKinship * 2) - kfKingColId)
+FLAGSET_DEF_END(king_flags_t);
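+// Since the column flags above are consecutive single bits, the
+// ((highest * 2) - lowest) pattern used for kfKingColAll (and the analogous
+// *All constants below) yields a mask covering every bit from the lowest
+// through the highest, e.g. (1 << 16) - (1 << 8) = 0xff00.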
+
+FLAGSET_DEF_START()
+  kfGrm0,
+  kfGrmMatrixZs = (1 << 0),
+  kfGrmMatrixBin = (1 << 1),
+  kfGrmMatrixBin4 = (1 << 2),
+  kfGrmMatrixEncodemask = (kfGrmMatrixZs | kfGrmMatrixBin | kfGrmMatrixBin4),
+  kfGrmMatrixSq = (1 << 3),
+  kfGrmMatrixSq0 = (1 << 4),
+  kfGrmMatrixTri = (1 << 5),
+  kfGrmMatrixShapemask = (kfGrmMatrixSq0 | kfGrmMatrixSq | kfGrmMatrixTri),
+  kfGrmTableGz = (1 << 6),
+  kfGrmTableNoGz = (1 << 7),
+  kfGrmTableZs = (1 << 8),
+  kfGrmTablemask = (kfGrmTableGz | kfGrmTableNoGz | kfGrmTableZs),
+  kfGrmBin = (1 << 9),
+
+  kfGrmMeanimpute = (1 << 10),
+  kfGrmCov = (1 << 11)
+FLAGSET_DEF_END(grm_flags_t);
+
+FLAGSET_DEF_START()
+  kfPca0,
+  kfPcaApprox = (1 << 0),
+  kfPcaMeanimpute = (1 << 1),
+  kfPcaSid = (1 << 2),
+  kfPcaVarWts = (1 << 3),
+  kfPcaVarZs = (1 << 4),
+
+  kfPcaVcolChrom = (1 << 5),
+  kfPcaVcolPos = (1 << 6),
+  kfPcaVcolRef = (1 << 7),
+  kfPcaVcolAlt1 = (1 << 8),
+  kfPcaVcolAlt = (1 << 9),
+  kfPcaVcolMaj = (1 << 10),
+  kfPcaVcolNonmaj = (1 << 11),
+  kfPcaVcolDefault = (kfPcaVcolChrom | kfPcaVcolMaj | kfPcaVcolNonmaj),
+  kfPcaVcolAll = ((kfPcaVcolNonmaj * 2) - kfPcaVcolChrom)
+FLAGSET_DEF_END(pca_flags_t);
+
+FLAGSET_DEF_START()
+  kfScore0,
+  kfScoreHeaderIgnore = (1 << 0),
+  kfScoreHeaderRead = (1 << 1),
+  kfScoreNoMeanimpute = (1 << 2),
+  kfScoreCenter = (1 << 3),
+  kfScoreVarianceStandardize = (1 << 4),
+  kfScoreSe = (1 << 5),
+  kfScoreZs = (1 << 6),
+  kfScoreListVariants = (1 << 7),
+  kfScoreListVariantsZs = (1 << 8),
+
+  kfScoreColMaybesid = (1 << 9),
+  kfScoreColSid = (1 << 10),
+  kfScoreColPheno1 = (1 << 11),
+  kfScoreColPhenos = (1 << 12),
+  kfScoreColNmissAllele = (1 << 13),
+  kfScoreColDenom = (1 << 14),
+  kfScoreColDosageSum = (1 << 15),
+  kfScoreColScoreAvgs = (1 << 16),
+  kfScoreColScoreSums = (1 << 17),
+  kfScoreColDefault = (kfScoreColMaybesid | kfScoreColPhenos | kfScoreColNmissAllele | kfScoreColDosageSum | kfScoreColScoreAvgs),
+  kfScoreColAll = ((kfScoreColScoreSums * 2) - kfScoreColMaybesid)
+FLAGSET_DEF_END(score_flags_t);
+
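+// (The *_col_p1 fields below appear to follow the usual "+1" convention:
+// they store the 1-based column number, with 0 meaning "not specified".)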
+typedef struct score_info_struct {
+  score_flags_t flags;
+  uint32_t varid_col_p1;
+  uint32_t allele_col_p1;
+  char* input_fname;
+  range_list_t input_col_idx_range_list;
+} score_info_t;
+
+void init_score(score_info_t* score_info_ptr);
+
+void cleanup_score(score_info_t* score_info_ptr);
+
+pglerr_t king_cutoff_batch(const char* sample_ids, const char* sids, uint32_t raw_sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, double king_cutoff, uintptr_t* sample_include, char* king_cutoff_fprefix, uint32_t* sample_ct_ptr);
+
+pglerr_t calc_king(const char* sample_ids, const char* sids, uintptr_t* variant_include, const chr_info_t* cip, uint32_t raw_sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t raw_variant_ct, uint32_t variant_ct, double king_cutoff, double king_table_filter, king_flags_t king_modifier, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t max_thread_ct, pgen_reader_t* simple_pgrp, uintptr_t* sample_include, uint32_t* sample_ct_ptr, char* outname, char* outname_end);
+
+pglerr_t calc_grm(const uintptr_t* orig_sample_include, const char* sample_ids, const char* sids, uintptr_t* variant_include, const chr_info_t* cip, const uintptr_t* variant_allele_idxs, const alt_allele_ct_t* maj_alleles, const double* allele_freqs, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t raw_variant_ct, uint32_t variant_ct, grm_flags_t grm_flags, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t max_thread_ct, pgen [...]
+
+#ifndef NOLAPACK
+pglerr_t calc_pca(const uintptr_t* sample_include, const char* sample_ids, const char* sids, uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const alt_allele_ct_t* maj_alleles, const double* allele_freqs, uint32_t raw_sample_ct, uintptr_t pca_sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, [...]
+#endif
+
+pglerr_t score_report(const uintptr_t* sample_include, const char* sample_ids, const char* sids, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const char* pheno_names, const uintptr_t* variant_include, const chr_info_t* cip, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const double* allele_freqs, const score_info_t* score_info_ptr, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uint32_t pheno_ct, uintptr_t max_phe [...]
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+
+#endif // __PLINK2_MATRIX_CALC_H__
diff --git a/plink2_misc.cpp b/plink2_misc.cpp
new file mode 100644
index 0000000..2748fdb
--- /dev/null
+++ b/plink2_misc.cpp
@@ -0,0 +1,3317 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_compress_stream.h"
+#include "plink2_data.h"
+#include "plink2_misc.h"
+#include "plink2_stats.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+pglerr_t plink1_cluster_import(const char* within_fname, const char* catpheno_name, const char* family_missing_catname, const uintptr_t* sample_include, const char* sample_ids, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t max_sample_id_blen, uint32_t mwithin_val, pheno_col_t** pheno_cols_ptr, char** pheno_names_ptr, uint32_t* pheno_ct_ptr, uintptr_t* max_pheno_name_blen_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+
+  gzFile gz_infile = nullptr;
+  uintptr_t line_idx = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const char catpheno_name_default[] = "CATPHENO";
+    uint32_t catpheno_name_blen;
+    if (!catpheno_name) {
+      catpheno_name = catpheno_name_default;
+      catpheno_name_blen = 9;
+    } else {
+      catpheno_name_blen = 1 + strlen(catpheno_name);
+    }
+    const uintptr_t old_max_pheno_name_blen = *max_pheno_name_blen_ptr;
+    const uint32_t old_pheno_ct = *pheno_ct_ptr;
+    char* old_pheno_names = *pheno_names_ptr;
+    uintptr_t new_max_pheno_name_blen;
+    if (old_pheno_names && (catpheno_name_blen <= old_max_pheno_name_blen)) {
+      new_max_pheno_name_blen = old_max_pheno_name_blen;
+      for (uint32_t pheno_idx = 0; pheno_idx < old_pheno_ct; ++pheno_idx) {
+	if (!memcmp(catpheno_name, &(old_pheno_names[pheno_idx * old_max_pheno_name_blen]), catpheno_name_blen)) {
+	  sprintf(g_logbuf, "Error: Cannot create a new categorical phenotype named '%s', since another phenotype of the same name already exists.\n", catpheno_name);
+	  goto plink1_cluster_import_ret_INCONSISTENT_INPUT_WW;
+	}
+      }
+    } else {
+      new_max_pheno_name_blen = catpheno_name_blen;
+    }
+    const uint32_t new_pheno_ct = old_pheno_ct + 1;
+    uintptr_t new_pheno_names_byte_ct = new_pheno_ct * new_max_pheno_name_blen;
+    char* pheno_names = (char*)malloc(new_pheno_names_byte_ct);
+    if (!pheno_names) {
+      goto plink1_cluster_import_ret_NOMEM;
+    }
+    if (old_pheno_names && (old_max_pheno_name_blen == new_max_pheno_name_blen)) {
+      memcpy(pheno_names, old_pheno_names, old_pheno_ct * new_max_pheno_name_blen);
+    } else {
+      for (uint32_t pheno_idx = 0; pheno_idx < old_pheno_ct; ++pheno_idx) {
+	strcpy(&(pheno_names[pheno_idx * new_max_pheno_name_blen]), &(old_pheno_names[pheno_idx * old_max_pheno_name_blen]));
+      }
+    }
+    memcpy(&(pheno_names[old_pheno_ct * new_max_pheno_name_blen]), catpheno_name, catpheno_name_blen);
+    free_cond(old_pheno_names);
+    *pheno_names_ptr = pheno_names;
+
+    pheno_col_t* new_pheno_cols = (pheno_col_t*)realloc(*pheno_cols_ptr, new_pheno_ct * sizeof(pheno_col_t));
+    if (!new_pheno_cols) {
+      goto plink1_cluster_import_ret_NOMEM;
+    }
+    *pheno_cols_ptr = new_pheno_cols;
+    *pheno_ct_ptr = new_pheno_ct;
+    *max_pheno_name_blen_ptr = new_max_pheno_name_blen;
+    new_pheno_cols[old_pheno_ct].nonmiss = nullptr;
+    new_pheno_cols[old_pheno_ct].type_code = (pheno_dtype_t)kPhenoDtypeCat;
+
+    const uint32_t raw_sample_ctaw = BITCT_TO_ALIGNED_WORDCT(raw_sample_ct);
+    uintptr_t* cat_nm = nullptr;
+    uint32_t* cat_idxs = nullptr;
+    if (!within_fname) {
+      if (bigstack_alloc_ul(raw_sample_ctaw, &cat_nm) ||
+	  bigstack_calloc_ui(raw_sample_ct, &cat_idxs)) {
+        goto plink1_cluster_import_ret_NOMEM;
+      }
+      memcpy(cat_nm, sample_include, raw_sample_ctaw * sizeof(intptr_t));
+    }
+    uint32_t* cat_htable;
+    uint32_t cat_htable_size;
+    if (htable_good_size_alloc(sample_ct + 2, bigstack_left() / 4, &cat_htable, &cat_htable_size)) {
+      goto plink1_cluster_import_ret_NOMEM;
+    }
+    fill_uint_one(cat_htable_size, cat_htable);
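+    // cat_htable is an open-addressing (linear probing) table mapping
+    // category-name hashes to category indexes; 0xffffffffU marks an empty
+    // slot.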
+    char* missing_catname = g_missing_catname;
+    const uintptr_t data_vec_ct = INT32CT_TO_VECCT(raw_sample_ct);
+    const uint32_t missing_catname_slen = strlen(missing_catname);
+    const uint32_t missing_catname_hval = hashceil(missing_catname, missing_catname_slen, cat_htable_size);
+    if (within_fname) {
+      reterr = gzopen_read_checked(within_fname, &gz_infile);
+      if (reterr) {
+	goto plink1_cluster_import_ret_1;
+      }
+      uintptr_t* already_seen;
+      uint32_t* sorted_cat_idxs;
+      char* idbuf;
+      char** cur_cat_names;
+      if (bigstack_calloc_ul(BITCT_TO_WORDCT(sample_ct), &already_seen) ||
+	  bigstack_calloc_ui(sample_ct, &sorted_cat_idxs) ||
+	  bigstack_alloc_c(max_sample_id_blen, &idbuf) ||
+	  bigstack_alloc_cp(sample_ct + 2, &cur_cat_names)) {
+	goto plink1_cluster_import_ret_NOMEM;
+      }
+      cat_htable[missing_catname_hval] = 0;
+      cur_cat_names[0] = missing_catname;
+      char na_str[] = "NA";
+      uint32_t na_hashval = hashceil(na_str, 2, cat_htable_size);
+      if (na_hashval == missing_catname_hval) {
+	if (++na_hashval == cat_htable_size) {
+	  na_hashval = 0;
+	}
+      }
+      cat_htable[na_hashval] = sample_ct + 1;
+      cur_cat_names[sample_ct + 1] = (char*)na_str;
+
+      uint32_t* id_map;
+      char* sorted_idbox;
+      if (copy_sort_strbox_subset(sample_include, sample_ids, sample_ct, max_sample_id_blen, 1, 0, 0, &sorted_idbox, &id_map)) {
+	goto plink1_cluster_import_ret_NOMEM;
+      }
+      uintptr_t loadbuf_size = bigstack_left();
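+      // Reserve a quarter of the remaining workspace for the category-name
+      // pool written at cat_name_write_start below.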
+      loadbuf_size -= loadbuf_size / 4;
+      if (loadbuf_size > kMaxLongLine) {
+	loadbuf_size = kMaxLongLine;
+      } else {
+	loadbuf_size &= ~(kCacheline - 1);
+	if (loadbuf_size <= kMaxMediumLine) {
+	  goto plink1_cluster_import_ret_NOMEM;
+	}
+      }
+      char* loadbuf = (char*)bigstack_alloc_raw(loadbuf_size);
+      loadbuf[loadbuf_size - 1] = ' ';
+      char* cat_name_write_start = (char*)g_bigstack_base;
+      char* cat_name_iter = cat_name_write_start;
+      char* cat_name_write_max = (char*)g_bigstack_end;
+
+      uint32_t nonnull_cat_ct = 0;
+      uintptr_t miss_ct = 0;
+      uintptr_t duplicate_ct = 0;
+      while (gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	++line_idx;
+	if (!loadbuf[loadbuf_size - 1]) {
+	  if (loadbuf_size == kMaxLongLine) {
+	    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --within file is pathologically long.\n", line_idx);
+	    goto plink1_cluster_import_ret_MALFORMED_INPUT_2;
+	  }
+	  goto plink1_cluster_import_ret_NOMEM;
+	}
+	char* fid_start = skip_initial_spaces(loadbuf);
+	if (is_eoln_kns(*fid_start)) {
+	  continue;
+	}
+	char* fid_end = token_endnn(fid_start);
+	char* iid_start = skip_initial_spaces(fid_end);
+	if (is_eoln_kns(*iid_start)) {
+	  goto plink1_cluster_import_ret_MISSING_TOKENS;
+	}
+	char* iid_end = token_endnn(iid_start);
+	const uint32_t fid_slen = (uintptr_t)(fid_end - fid_start);
+	const uint32_t iid_slen = (uintptr_t)(iid_end - iid_start);
+	const uint32_t id_blen = fid_slen + iid_slen + 2;
+	if (id_blen > max_sample_id_blen) {
+	  ++miss_ct;
+	  continue;
+	}
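+	// Double binary search: with the '\0' terminator, bsearch_str_lb
+	// finds the first sorted entry exactly matching FID+IID; bumping the
+	// terminator to ' ' (which sorts just after '\0') makes the same
+	// search return one past the last match, so [lb_idx, ub_idx) spans
+	// every sample sharing this ID.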
+        char* idbuf_iter = memcpyax(idbuf, fid_start, fid_slen, '\t');
+	idbuf_iter = memcpya(idbuf_iter, iid_start, iid_slen);
+	*idbuf_iter = '\0';
+	uint32_t lb_idx = bsearch_str_lb(idbuf, sorted_idbox, id_blen, max_sample_id_blen, sample_ct);
+	*idbuf_iter = ' ';
+	const uint32_t ub_idx = bsearch_str_lb(idbuf, sorted_idbox, id_blen, max_sample_id_blen, sample_ct);
+	if (ub_idx == lb_idx) {
+	  ++miss_ct;
+	  continue;
+	}
+	char* main_token_start = next_token_mult(iid_end, mwithin_val);
+	if (!main_token_start) {
+	  goto plink1_cluster_import_ret_MISSING_TOKENS;
+	}
+	char* main_token_end = token_endnn(main_token_start);
+	*main_token_end = '\0';
+	const uint32_t main_token_slen = (uintptr_t)(main_token_end - main_token_start);
+	if (main_token_slen > kMaxIdSlen) {
+	  logerrprint("Error: Category names are limited to " MAX_ID_SLEN_STR " characters.\n");
+	  goto plink1_cluster_import_ret_INCONSISTENT_INPUT;
+	}
+	uint32_t hashval = hashceil(main_token_start, main_token_slen, cat_htable_size);
+	const uint32_t main_token_blen = main_token_slen + 1;
+	uint32_t cur_htable_entry;
+	while (1) {
+	  cur_htable_entry = cat_htable[hashval];
+	  if (cur_htable_entry == 0xffffffffU) {
+	    if (main_token_blen > (uintptr_t)(cat_name_write_max - cat_name_iter)) {
+	      goto plink1_cluster_import_ret_NOMEM;
+	    }
+	    cur_cat_names[++nonnull_cat_ct] = cat_name_iter;
+	    cat_name_iter = memcpya(cat_name_iter, main_token_start, main_token_blen);
+	    cur_htable_entry = nonnull_cat_ct;
+	    cat_htable[hashval] = cur_htable_entry;
+	    break;
+	  }
+	  if (!memcmp(main_token_start, cur_cat_names[cur_htable_entry], main_token_blen)) {
+	    break;
+	  }
+	  if (++hashval == cat_htable_size) {
+	    hashval = 0;
+	  }
+	}
+	// permit duplicates if category is identical
+	if (is_set(already_seen, lb_idx)) {
+	  const uint32_t existing_cat_idx = sorted_cat_idxs[lb_idx];
+	  if (existing_cat_idx != cur_htable_entry) {
+	    idbuf[fid_slen] = ' ';
+	    LOGPREPRINTFWW("Error: Duplicate sample ID '%s' with conflicting category assignments in --within file.\n", idbuf);
+	    goto plink1_cluster_import_ret_MALFORMED_INPUT_2;
+	  }
+	  ++duplicate_ct;
+	} else {
+	  set_bit(lb_idx, already_seen);
+	  for (; lb_idx < ub_idx; ++lb_idx) {
+	    sorted_cat_idxs[lb_idx] = cur_htable_entry;
+	  }
+	}
+      }
+      if ((!gzeof(gz_infile)) || gzclose_null(&gz_infile)) {
+	goto plink1_cluster_import_ret_READ_FAIL;
+      }
+      if (!nonnull_cat_ct) {
+	logerrprint("Error: All --within categories are null.\n");
+	goto plink1_cluster_import_ret_INCONSISTENT_INPUT_WW;
+      }
+      double dxx;
+      const uint32_t prepend_c = (scanadv_double(cur_cat_names[1], &dxx) != nullptr);
+      if (prepend_c) {
+	for (uint32_t catname_idx = 2; catname_idx <= nonnull_cat_ct; ++catname_idx) {
+	  if (!scanadv_double(cur_cat_names[catname_idx], &dxx)) {
+	    logerrprint("Error: Either all non-null --within categories must be numeric, or none can be.\n");
+	    goto plink1_cluster_import_ret_INCONSISTENT_INPUT;
+	  }
+	}
+	logprint("Note: Prepending 'C' to all --within category names.\n");
+      } else {
+	for (uint32_t catname_idx = 2; catname_idx <= nonnull_cat_ct; ++catname_idx) {
+	  if (scanadv_double(cur_cat_names[catname_idx], &dxx)) {
+	    logerrprint("Error: Either all non-null --within categories must be numeric, or none can be.\n");
+	    goto plink1_cluster_import_ret_INCONSISTENT_INPUT;
+	  }
+	}
+      }
+      // see end of e.g. load_psam()
+      const uintptr_t catname_vec_ct = WORDCT_TO_VECCT(nonnull_cat_ct + 1);
+      const uintptr_t total_catname_blen = (prepend_c * nonnull_cat_ct) + (uintptr_t)(cat_name_iter - cat_name_write_start);
+      const uintptr_t catname_storage_vec_ct = DIV_UP(total_catname_blen, kBytesPerVec);
+      if (vecaligned_malloc((raw_sample_ctaw * kWordsPerVec + data_vec_ct + catname_vec_ct + catname_storage_vec_ct) * kBytesPerVec, &(new_pheno_cols[old_pheno_ct].nonmiss))) {
+	goto plink1_cluster_import_ret_NOMEM;
+      }
+      new_pheno_cols[old_pheno_ct].nonnull_category_ct = nonnull_cat_ct;
+      uintptr_t* catdata_iter = new_pheno_cols[old_pheno_ct].nonmiss;
+      cat_nm = catdata_iter;
+      fill_ulong_zero(raw_sample_ctaw, cat_nm);
+      catdata_iter = &(catdata_iter[raw_sample_ctaw]);
+
+      cat_idxs = (uint32_t*)catdata_iter;
+      fill_uint_zero(raw_sample_ct, cat_idxs);
+      new_pheno_cols[old_pheno_ct].data.cat = cat_idxs;
+      catdata_iter = &(catdata_iter[data_vec_ct * kWordsPerVec]);
+
+      for (uint32_t sorted_sample_idx = 0; sorted_sample_idx < sample_ct; ++sorted_sample_idx) {
+	const uint32_t cur_sample_uidx = id_map[sorted_sample_idx];
+	uint32_t cur_cat_idx = sorted_cat_idxs[sorted_sample_idx];
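+	// sample_ct + 1 is the 'NA' sentinel installed above; fold it into
+	// the missing category.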
+	if (cur_cat_idx > sample_ct) {
+	  cur_cat_idx = 0;
+	}
+	if (cur_cat_idx) {
+	  set_bit(cur_sample_uidx, cat_nm);
+	}
+	cat_idxs[cur_sample_uidx] = cur_cat_idx;
+      }
+
+      char** cur_name_ptrs = (char**)catdata_iter;
+      new_pheno_cols[old_pheno_ct].category_names = cur_name_ptrs;
+      *cur_name_ptrs++ = missing_catname;
+      char* name_storage_iter = (char*)(&(catdata_iter[catname_vec_ct * kWordsPerVec]));
+      cat_name_iter = cat_name_write_start;
+      for (uint32_t uii = 0; uii < nonnull_cat_ct; ++uii) {
+	*cur_name_ptrs++ = name_storage_iter;
+	if (prepend_c) {
+	  *name_storage_iter++ = 'C';
+	}
+	const uint32_t cur_catname_blen = 1 + strlen(cat_name_iter);
+	name_storage_iter = memcpya(name_storage_iter, cat_name_iter, cur_catname_blen);
+	cat_name_iter = &(cat_name_iter[cur_catname_blen]);
+      }
+
+      if (duplicate_ct) {
+	LOGPRINTFWW("Note: %" PRIuPTR " duplicate sample ID%s) in --within file.\n", duplicate_ct, (duplicate_ct == 1)? " (with a consistent category assignment" : "s (with consistent category assignments");
+      }
+      if (miss_ct) {
+	sprintf(g_logbuf, "--within: %u non-null categories present, %" PRIuPTR " sample ID%s skipped.\n", nonnull_cat_ct, miss_ct, (miss_ct == 1)? "" : "s");
+	wordwrapb(0);
+      } else {
+	sprintf(g_logbuf, "--within: %u non-null categories present.\n", nonnull_cat_ct);
+      }
+      logprintb();
+    } else {
+      // --family
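+      // Here cat_htable doubles as an FID -> first-sample map, with
+      // sentinels: 0xffffffffU = empty slot, 0xfffffffeU = the
+      // --family-missing-catname string, 0xfffffffdU = the global missing
+      // category name; any smaller value is the sample_uidx of the first
+      // sample seen with that FID.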
+      cat_htable[missing_catname_hval] = 0xfffffffdU;
+      uintptr_t total_catname_blen = 0; // does not need to include 'NONE'
+      uint32_t family_missing_catname_slen = 0;
+      if (family_missing_catname) {
+	family_missing_catname_slen = strlen(family_missing_catname);
+	uint32_t family_missing_catname_hval = hashceil(family_missing_catname, family_missing_catname_slen, cat_htable_size);
+	if (cat_htable[family_missing_catname_hval] == 0xffffffffU) {
+	  cat_htable[family_missing_catname_hval] = 0xfffffffeU;
+	} else if ((missing_catname_slen != family_missing_catname_slen) || memcmp(family_missing_catname, missing_catname, missing_catname_slen)) {
+	  if (++family_missing_catname_hval == cat_htable_size) {
+	    family_missing_catname_hval = 0;
+	  }
+	  cat_htable[family_missing_catname_hval] = 0xfffffffeU;
+	}
+      }
+      // guaranteed to have enough space
+      uint32_t* cat_idx_m1_to_first_sample_uidx = (uint32_t*)g_bigstack_base;
+      uint32_t sample_uidx = 0;
+      uint32_t nonnull_cat_ct = 0;
+      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+	next_set_unsafe_ck(sample_include, &sample_uidx);
+	const char* cur_fid = &(sample_ids[sample_uidx * max_sample_id_blen]);
+	const char* cur_fid_end = (const char*)rawmemchr(cur_fid, '\t');
+        const uint32_t slen = (uintptr_t)(cur_fid_end - cur_fid);
+        uint32_t hashval = hashceil(cur_fid, slen, cat_htable_size);
+	const uint32_t blen = slen + 1;
+	while (1) {
+	  const uint32_t cur_htable_entry = cat_htable[hashval];
+	  if (cur_htable_entry >= 0xfffffffdU) {
+	    if (cur_htable_entry == 0xffffffffU) {
+	      cat_htable[hashval] = sample_uidx;
+	      total_catname_blen += blen;
+	      cat_idx_m1_to_first_sample_uidx[nonnull_cat_ct] = sample_uidx;
+	      cat_idxs[sample_uidx] = ++nonnull_cat_ct;
+	      break;
+	    } else if (cur_htable_entry == 0xfffffffeU) {
+	      if ((slen == family_missing_catname_slen) && (!memcmp(cur_fid, family_missing_catname, family_missing_catname_slen))) {
+		clear_bit(sample_uidx, cat_nm);
+		cat_idxs[sample_uidx] = 0;
+		break;
+	      }
+	    } else {
+	      if ((slen == missing_catname_slen) && (!memcmp(cur_fid, missing_catname, missing_catname_slen))) {
+		clear_bit(sample_uidx, cat_nm);
+		cat_idxs[sample_uidx] = 0;
+		break;
+	      }
+	    }
+	  } else {
+	    if (!memcmp(cur_fid, &(sample_ids[cur_htable_entry * max_sample_id_blen]), blen)) {
+	      cat_idxs[sample_uidx] = cat_idxs[cur_htable_entry];
+	      break;
+	    }
+	  }
+	  if (++hashval == cat_htable_size) {
+	    hashval = 0;
+	  }
+	}
+      }
+      if (!nonnull_cat_ct) {
+	logerrprint("Error: All --family FIDs are null.\n");
+	goto plink1_cluster_import_ret_INCONSISTENT_INPUT_WW;
+      }
+      // add 'C' prefixes?
+      double dxx;
+      const uint32_t prepend_c = (scanadv_double((char*)(&(sample_ids[cat_idx_m1_to_first_sample_uidx[0] * max_sample_id_blen])), &dxx) != nullptr);
+      if (prepend_c) {
+	for (uint32_t uii = 1; uii < nonnull_cat_ct; ++uii) {
+	  if (!scanadv_double((char*)(&(sample_ids[cat_idx_m1_to_first_sample_uidx[uii] * max_sample_id_blen])), &dxx)) {
+	    logerrprint("Error: Either all non-null --family FIDs must be numeric, or none can be.\n");
+	    goto plink1_cluster_import_ret_INCONSISTENT_INPUT;
+	  }
+	}
+	logprint("Note: Prepending 'C' to all --family category names.\n");
+	total_catname_blen += nonnull_cat_ct;
+      } else {
+	for (uint32_t uii = 1; uii < nonnull_cat_ct; ++uii) {
+	  if (scanadv_double((char*)(&(sample_ids[cat_idx_m1_to_first_sample_uidx[uii] * max_sample_id_blen])), &dxx)) {
+	    logerrprint("Error: Either all non-null --family FIDs must be numeric, or none can be.\n");
+	    goto plink1_cluster_import_ret_INCONSISTENT_INPUT;
+	  }
+	}
+      }
+      // see end of e.g. load_psam()
+      const uintptr_t catname_vec_ct = WORDCT_TO_VECCT(nonnull_cat_ct + 1);
+      const uintptr_t catname_storage_vec_ct = DIV_UP(total_catname_blen, kBytesPerVec);
+      if (vecaligned_malloc((raw_sample_ctaw * kWordsPerVec + data_vec_ct + catname_vec_ct + catname_storage_vec_ct) * kBytesPerVec, &(new_pheno_cols[old_pheno_ct].nonmiss))) {
+	goto plink1_cluster_import_ret_NOMEM;
+      }
+      new_pheno_cols[old_pheno_ct].nonnull_category_ct = nonnull_cat_ct;
+      uintptr_t* catdata_iter = new_pheno_cols[old_pheno_ct].nonmiss;
+      memcpy(catdata_iter, cat_nm, raw_sample_ctaw * sizeof(intptr_t));
+      catdata_iter = &(catdata_iter[raw_sample_ctaw]);
+
+      new_pheno_cols[old_pheno_ct].data.cat = (uint32_t*)catdata_iter;
+      memcpy(catdata_iter, cat_idxs, data_vec_ct * kBytesPerVec);
+      catdata_iter = &(catdata_iter[data_vec_ct * kWordsPerVec]);
+
+      char** cur_name_ptrs = (char**)catdata_iter;
+      new_pheno_cols[old_pheno_ct].category_names = cur_name_ptrs;
+      *cur_name_ptrs++ = missing_catname;
+      char* name_storage_iter = (char*)(&(catdata_iter[catname_vec_ct * kWordsPerVec]));
+      for (uint32_t uii = 0; uii < nonnull_cat_ct; ++uii) {
+	*cur_name_ptrs++ = name_storage_iter;
+	if (prepend_c) {
+	  *name_storage_iter++ = 'C';
+	}
+	const char* cur_fid = &(sample_ids[cat_idx_m1_to_first_sample_uidx[uii] * max_sample_id_blen]);
+	const char* cur_fid_end = (const char*)rawmemchr(cur_fid, '\t');
+	name_storage_iter = memcpyax(name_storage_iter, cur_fid, (uintptr_t)(cur_fid_end - cur_fid), '\0');
+      }
+      LOGPRINTF("--family: %u non-null categor%s present.\n", nonnull_cat_ct, (nonnull_cat_ct == 1)? "y" : "ies");
+    }
+  }
+  while (0) {
+  plink1_cluster_import_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  plink1_cluster_import_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  plink1_cluster_import_ret_MISSING_TOKENS:
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --within file has fewer tokens than expected.\n", line_idx);
+  plink1_cluster_import_ret_MALFORMED_INPUT_2:
+    logerrprintb();
+    reterr = kPglRetMalformedInput;
+    break;
+  plink1_cluster_import_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  plink1_cluster_import_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ plink1_cluster_import_ret_1:
+  gzclose_cond(gz_infile);
+  bigstack_reset(bigstack_mark);
+  if (reterr) {
+    if (*pheno_names_ptr) {
+      free(*pheno_names_ptr);
+      *pheno_names_ptr = nullptr;
+    }
+    cleanup_pheno_cols(*pheno_ct_ptr, *pheno_cols_ptr);
+    *pheno_ct_ptr = 0;
+    *pheno_cols_ptr = nullptr;
+  }
+  return reterr;
+}
+
+pglerr_t update_sample_sexes(const char* update_sex_fname, const uintptr_t* sample_include, char* sample_ids, uint32_t raw_sample_ct, uintptr_t sample_ct, uintptr_t max_sample_id_blen, uint32_t update_sex_colm2, uintptr_t* sex_nm, uintptr_t* sex_male) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  gzFile gz_infile = nullptr;
+  uintptr_t line_idx = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    reterr = gzopen_read_checked(update_sex_fname, &gz_infile);
+    if (reterr) {
+      goto update_sample_sexes_ret_1;
+    }
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    uintptr_t* already_seen;
+    char* idbuf;
+    if (bigstack_calloc_ul(raw_sample_ctl, &already_seen) ||
+	bigstack_alloc_c(max_sample_id_blen, &idbuf)) {
+      goto update_sample_sexes_ret_NOMEM;
+    }
+    uint32_t* id_map;
+    char* sorted_idbox;
+    if (copy_sort_strbox_subset(sample_include, sample_ids, sample_ct, max_sample_id_blen, 1, 0, 0, &sorted_idbox, &id_map)) {
+      goto update_sample_sexes_ret_NOMEM;
+    }
+    // permit very long lines since this can be pointed at .ped files
+    uintptr_t loadbuf_size = bigstack_left();
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else {
+      loadbuf_size &= ~(kCacheline - 1);
+      if (loadbuf_size <= kMaxMediumLine) {
+        goto update_sample_sexes_ret_NOMEM;
+      }
+    }
+    char* loadbuf = (char*)bigstack_alloc_raw(loadbuf_size);
+    loadbuf[loadbuf_size - 1] = ' ';
+    uint32_t hit_ct = 0;
+    uintptr_t miss_ct = 0;
+    uintptr_t duplicate_ct = 0;
+    while (gzgets(gz_infile, loadbuf, loadbuf_size)) {
+      ++line_idx;
+      if (!loadbuf[loadbuf_size - 1]) {
+	if (loadbuf_size == kMaxLongLine) {
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --update-sex file is pathologically long.\n", line_idx);
+	  goto update_sample_sexes_ret_MALFORMED_INPUT_2;
+	}
+	goto update_sample_sexes_ret_NOMEM;
+      }
+      char* fid_start = skip_initial_spaces(loadbuf);
+      if (is_eoln_kns(*fid_start)) {
+	continue;
+      }
+      char* fid_end = token_endnn(fid_start);
+      char* iid_start = skip_initial_spaces(fid_end);
+      if (is_eoln_kns(*iid_start)) {
+	goto update_sample_sexes_ret_MISSING_TOKENS;
+      }
+      char* iid_end = token_endnn(iid_start);
+      const uint32_t fid_slen = (uintptr_t)(fid_end - fid_start);
+      const uint32_t iid_slen = (uintptr_t)(iid_end - iid_start);
+      const uint32_t id_blen = fid_slen + iid_slen + 2;
+      if (id_blen > max_sample_id_blen) {
+	++miss_ct;
+	continue;
+      }
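+      // Same lb/ub double-bsearch idiom as in plink1_cluster_import():
+      // [lb_idx, ub_idx) covers every sample with this FID+IID.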
+      char* idbuf_iter = memcpyax(idbuf, fid_start, fid_slen, '\t');
+      idbuf_iter = memcpya(idbuf_iter, iid_start, iid_slen);
+      *idbuf_iter = '\0';
+      uint32_t lb_idx = bsearch_str_lb(idbuf, sorted_idbox, id_blen, max_sample_id_blen, sample_ct);
+      *idbuf_iter = ' ';
+      const uint32_t ub_idx = bsearch_str_lb(idbuf, sorted_idbox, id_blen, max_sample_id_blen, sample_ct);
+      if (ub_idx == lb_idx) {
+	++miss_ct;
+	continue;
+      }
+      char* sex_start = next_token_mult(iid_end, update_sex_colm2);
+      if (!sex_start) {
+	goto update_sample_sexes_ret_MISSING_TOKENS;
+      }
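+      // Sex code parsing: masking with 0xdf clears ASCII bit 5, folding
+      // 'm'/'f' to 'M' (77) / 'F' (70), while subtracting '0' (48) maps
+      // '0'/'1'/'2' to 0/1/2; any other character wraps to a large unsigned
+      // value and falls into the > 2 branch.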
+      uint32_t sexval = (unsigned char)(*sex_start);
+      const uint32_t ujj = sexval & 0xdfU;
+      sexval -= 48;
+      if (sexval > 2) {
+	if (ujj == 77) {
+	  // 'M'/'m'
+	  sexval = 1;
+	} else if (ujj == 70) {
+	  // 'F'/'f'
+	  sexval = 2;
+	} else {
+	  sprintf(g_logbuf, "Error: Invalid sex value on line %" PRIuPTR " of --update-sex file. (Acceptable values: 1/M/m = male, 2/F/f = female, 0 = missing.)\n", line_idx);
+	  wordwrapb(0);
+	  goto update_sample_sexes_ret_MALFORMED_INPUT_2;
+	}
+      }
+      uint32_t sample_uidx = id_map[lb_idx];
+      if (IS_SET(already_seen, sample_uidx)) {
+	// permit duplicates iff sex value is identical
+	const uint32_t old_sexval = IS_SET(sex_nm, sample_uidx) * (2 - IS_SET(sex_male, sample_uidx));
+	if (sexval != old_sexval) {
+	  idbuf[fid_slen] = ' ';
+	  LOGPREPRINTFWW("Error: Duplicate sample ID '%s' with conflicting sex assignments in --update-sex file.\n", idbuf);
+	  goto update_sample_sexes_ret_MALFORMED_INPUT_2;
+	}
+	++duplicate_ct;
+	continue;
+      }
+      SET_BIT(sample_uidx, already_seen);
+      while (1) {
+	if (sexval) {
+	  SET_BIT(sample_uidx, sex_nm);
+	  if (sexval == 1) {
+	    SET_BIT(sample_uidx, sex_male);
+	  } else {
+	    CLEAR_BIT(sample_uidx, sex_male);
+	  }
+	} else {
+	  CLEAR_BIT(sample_uidx, sex_nm);
+	  CLEAR_BIT(sample_uidx, sex_male);
+	}
+	++hit_ct;
+	if (++lb_idx == ub_idx) {
+	  break;
+	}
+	sample_uidx = id_map[lb_idx];
+      }
+    }
+    if ((!gzeof(gz_infile)) || gzclose_null(&gz_infile)) {
+      goto update_sample_sexes_ret_READ_FAIL;
+    }
+    if (duplicate_ct) {
+      LOGPRINTFWW("Note: %" PRIuPTR " duplicate sample ID%s) in --update-sex file.\n", duplicate_ct, (duplicate_ct == 1)? " (with a consistent sex assignment" : "s (with consistent sex assignments");
+    }
+    if (miss_ct) {
+      sprintf(g_logbuf, "--update-sex: %u sample%s updated, %" PRIuPTR " ID%s not present.\n", hit_ct, (hit_ct == 1)? "" : "s", miss_ct, (miss_ct == 1)? "" : "s");
+    } else {
+      sprintf(g_logbuf, "--update-sex: %u sample%s updated.\n", hit_ct, (hit_ct == 1)? "" : "s");
+    }
+    logprintb();
+  }
+  while (0) {
+  update_sample_sexes_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  update_sample_sexes_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  update_sample_sexes_ret_MISSING_TOKENS:
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of --update-sex file has fewer tokens than expected.\n", line_idx);
+  update_sample_sexes_ret_MALFORMED_INPUT_2:
+    logerrprintb();
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+ update_sample_sexes_ret_1:
+  gzclose_cond(gz_infile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t split_cat_pheno(const char* split_cat_phenonames_flattened, const uintptr_t* sample_include, uint32_t raw_sample_ct, pheno_transform_flags_t pheno_transform_flags, pheno_col_t** pheno_cols_ptr, char** pheno_names_ptr, uint32_t* pheno_ct_ptr, uintptr_t* max_pheno_name_blen_ptr, pheno_col_t** covar_cols_ptr, char** covar_names_ptr, uint32_t* covar_ct_ptr, uintptr_t* max_covar_name_blen_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  char* doomed_pheno_names = nullptr;
+  pheno_col_t* doomed_pheno_cols = nullptr;
+  uint32_t doomed_pheno_ct = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uint32_t omit_last = (pheno_transform_flags / kfPhenoTransformSplitCatOmitLast) & 1;
+    uint32_t qt_12 = 0;
+    uint32_t at_least_one_cat_pheno_processed = 0;
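+    // Pass 0 handles phenotypes, pass 1 covariates, sharing the same
+    // splitting logic through the xpheno_* indirection below.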
+    for (uint32_t is_covar = 0; is_covar < 2; ++is_covar) {
+      pheno_col_t** xpheno_cols_ptr;
+      char** xpheno_names_ptr;
+      uint32_t* xpheno_ct_ptr;
+      uintptr_t* max_xpheno_name_blen_ptr;
+      if (!is_covar) {
+	xpheno_cols_ptr = pheno_cols_ptr;
+	xpheno_names_ptr = pheno_names_ptr;
+	xpheno_ct_ptr = pheno_ct_ptr;
+	max_xpheno_name_blen_ptr = max_pheno_name_blen_ptr;
+      } else {
+	if (!split_cat_phenonames_flattened) {
+	  break;
+	}
+        bigstack_reset(bigstack_mark);
+	xpheno_cols_ptr = covar_cols_ptr;
+	xpheno_names_ptr = covar_names_ptr;
+	xpheno_ct_ptr = covar_ct_ptr;
+	max_xpheno_name_blen_ptr = max_covar_name_blen_ptr;
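+	// Unless the 'covar-01' modifier was given, generated covariate
+	// columns are coded 1/2 rather than 0/1 (hence "qt_12").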
+	qt_12 = !(pheno_transform_flags & kfPhenoTransformSplitCatCovar01);
+      }
+      const uint32_t old_pheno_ct = *xpheno_ct_ptr;
+      if (!old_pheno_ct) {
+	continue;
+      }
+      const uint32_t old_pheno_ctl = BITCT_TO_WORDCT(old_pheno_ct);
+      const uintptr_t old_max_pheno_name_blen = *max_xpheno_name_blen_ptr;
+      pheno_col_t* old_pheno_cols = *xpheno_cols_ptr;
+      char* old_pheno_names = *xpheno_names_ptr;
+      uintptr_t* phenos_to_split;
+      if (bigstack_calloc_ul(old_pheno_ctl, &phenos_to_split)) {
+	goto split_cat_pheno_ret_NOMEM;
+      }
+      if (!split_cat_phenonames_flattened) {
+	for (uint32_t pheno_idx = 0; pheno_idx < old_pheno_ct; ++pheno_idx) {
+	  const pheno_col_t* cur_pheno_col = &(old_pheno_cols[pheno_idx]);
+	  if (cur_pheno_col->type_code == kPhenoDtypeCat) {
+	    if (strchr(&(old_pheno_names[pheno_idx * old_max_pheno_name_blen]), '=')) {
+	      logerrprint("Error: --split-cat-pheno cannot be used on phenotypes containing the '='\ncharacter.\n");
+	      goto split_cat_pheno_ret_INCONSISTENT_INPUT;
+	    }
+	    set_bit(pheno_idx, phenos_to_split);
+	  }
+	}
+      } else {
+	uint32_t* id_htable;
+	uint32_t id_htable_size;
+	if (htable_good_size_alloc(old_pheno_ct, bigstack_left(), &id_htable, &id_htable_size)) {
+	  goto split_cat_pheno_ret_NOMEM;
+	}
+	// this shouldn't be able to find a duplicate, since existing phenotype
+	// names are presumably already unique
+	populate_strbox_htable(old_pheno_names, old_pheno_ct, old_max_pheno_name_blen, id_htable_size, id_htable);
+	const char* split_cat_phenonames_iter = split_cat_phenonames_flattened;
+	do {
+	  const uint32_t cur_phenoname_slen = strlen(split_cat_phenonames_iter);
+	  if (cur_phenoname_slen < old_max_pheno_name_blen) {
+	    uint32_t pheno_idx = strbox_htable_find(split_cat_phenonames_iter, old_pheno_names, id_htable, old_max_pheno_name_blen, cur_phenoname_slen, id_htable_size);
+	    if (pheno_idx != 0xffffffffU) {
+	      if (old_pheno_cols[pheno_idx].type_code != kPhenoDtypeCat) {
+		sprintf(g_logbuf, "Error: '%s' is not a categorical %s.\n", split_cat_phenonames_iter, is_covar? "covariate" : "phenotype");
+		goto split_cat_pheno_ret_INCONSISTENT_INPUT_WW;
+	      }
+	      set_bit(pheno_idx, phenos_to_split);
+	    }
+	  }
+	  split_cat_phenonames_iter = &(split_cat_phenonames_iter[cur_phenoname_slen + 1]);
+	} while (*split_cat_phenonames_iter);
+	bigstack_reset(id_htable);
+      }
+      const uint32_t split_pheno_ct = popcount_longs(phenos_to_split, old_pheno_ctl);
+      if (!split_pheno_ct) {
+	continue;
+      }
+      at_least_one_cat_pheno_processed = 1;
+      // first pass: determine new memory allocation sizes
+      const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+      uintptr_t new_max_pheno_name_blen = old_max_pheno_name_blen;
+      uint32_t* observed_cat_cts; // excludes null, excludes last if omit-last
+      uintptr_t** observed_cats;
+      uintptr_t* sample_include_intersect;
+      if (bigstack_alloc_ui(split_pheno_ct, &observed_cat_cts) ||
+	  bigstack_alloc_ulp(split_pheno_ct, &observed_cats) ||
+	  bigstack_alloc_ul(raw_sample_ctl, &sample_include_intersect)) {
+	goto split_cat_pheno_ret_NOMEM;
+      }
+      uintptr_t create_pheno_ct = 0;
+      uint32_t split_pheno_uidx = 0;
+      uint32_t max_cat_uidx_p1 = 0;
+      for (uint32_t split_pheno_idx = 0; split_pheno_idx < split_pheno_ct; ++split_pheno_idx, ++split_pheno_uidx) {
+	next_set_unsafe_ck(phenos_to_split, &split_pheno_uidx);
+	const pheno_col_t* cur_pheno_col = &(old_pheno_cols[split_pheno_uidx]);
+	bitvec_and_copy(sample_include, cur_pheno_col->nonmiss, raw_sample_ctl, sample_include_intersect);
+	const uint32_t cur_cat_ct = cur_pheno_col->nonnull_category_ct + 1;
+	const uint32_t cur_cat_ctl = BITCT_TO_WORDCT(cur_cat_ct);
+	uintptr_t* cur_observed_cats;
+	if (bigstack_alloc_ul(cur_cat_ctl, &cur_observed_cats)) {
+	  goto split_cat_pheno_ret_NOMEM;
+	}
+	observed_cats[split_pheno_idx] = cur_observed_cats;
+	const uint32_t cur_nmiss_ct = popcount_longs(sample_include_intersect, raw_sample_ctl);
+	uint32_t cur_observed_cat_ct = identify_remaining_cats(sample_include_intersect, cur_pheno_col, cur_nmiss_ct, cur_observed_cats);
+	if (cur_observed_cat_ct > omit_last) {
+	  cur_observed_cat_ct -= omit_last;
+	  // old phenotype name, '=' character, null terminator
+	  const uintptr_t blen_base = strlen(&(old_pheno_names[split_pheno_uidx * old_max_pheno_name_blen])) + 2;
+	  char** cat_names = cur_pheno_col->category_names;
+	  uint32_t cat_uidx = 0;
+	  for (uint32_t cat_idx = 0; cat_idx < cur_observed_cat_ct; ++cat_idx, ++cat_uidx) {
+	    next_set_unsafe_ck(cur_observed_cats, &cat_uidx);
+	    const char* cur_cat_name = cat_names[cat_uidx];
+	    const uint32_t cur_slen = strlen(cur_cat_name);
+	    if (memchr(cur_cat_name, '=', cur_slen)) {
+	      logerrprint("Error: --split-cat-pheno category names may not contain the '=' character.\n");
+	      goto split_cat_pheno_ret_INCONSISTENT_INPUT;
+	    }
+	    const uintptr_t total_blen = cur_slen + blen_base;
+	    if (total_blen > new_max_pheno_name_blen) {
+	      new_max_pheno_name_blen = total_blen;
+	    }
+	  }
+	  if (cat_uidx > max_cat_uidx_p1) {
+	    max_cat_uidx_p1 = cat_uidx;
+	  }
+	} else {
+	  cur_observed_cat_ct = 0;
+	}
+	observed_cat_cts[split_pheno_idx] = cur_observed_cat_ct;
+	create_pheno_ct += cur_observed_cat_ct;
+      }
+      if (new_max_pheno_name_blen > kMaxIdBlen) {
+	logerrprint("Error: New --split-cat-pheno phenotype name too long.  Shorten your phenotype\nor your category names.\n");
+	goto split_cat_pheno_ret_INCONSISTENT_INPUT;
+      }
+      const uint32_t copy_pheno_ct = old_pheno_ct - split_pheno_ct;
+      // this check must precede the new_pheno_ct variable definition, since
+      // create_pheno_ct + copy_pheno_ct may not fit in a uint32_t
+      if (create_pheno_ct + copy_pheno_ct > kMaxPhenoCt) {
+	logerrprint("Error: --split-cat-pheno would create too many phenotypes (" PROG_NAME_STR " is limited to\n" MAX_PHENO_CT_STR ").\n");
+	goto split_cat_pheno_ret_INCONSISTENT_INPUT;
+      }
+      const uint32_t new_pheno_ct = create_pheno_ct + copy_pheno_ct;
+      uintptr_t** write_data_ptrs;
+      if (bigstack_alloc_ulp(max_cat_uidx_p1, &write_data_ptrs)) {
+	goto split_cat_pheno_ret_NOMEM;
+      }
+      const uint32_t raw_sample_ctaw = BITCT_TO_ALIGNED_WORDCT(raw_sample_ct);
+      uint32_t new_data_word_ct = raw_sample_ctaw;
+      if (is_covar) {
+	new_data_word_ct = DBLCT_TO_VECCT(raw_sample_ct) * kWordsPerVec;
+      }
+      uintptr_t* omit_last_dummy = nullptr;
+      if (omit_last) {
+	if (bigstack_alloc_ul(new_data_word_ct, &omit_last_dummy)) {
+	  goto split_cat_pheno_ret_NOMEM;
+	}
+      }
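+      // write_data_ptrs[cat_uidx] will point at the data track of the new
+      // column created for original category cat_uidx; under 'omit-last', the
+      // omitted category's writes are redirected into omit_last_dummy and
+      // thrown away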
+
+      // second pass: allocate memory and actually create the new phenotypes
+      char* new_pheno_names = (char*)malloc(new_pheno_ct * new_max_pheno_name_blen);
+      if (!new_pheno_names) {
+	goto split_cat_pheno_ret_NOMEM;
+      }
+      doomed_pheno_names = old_pheno_names;
+      *xpheno_names_ptr = new_pheno_names;
+      pheno_col_t* new_pheno_cols = (pheno_col_t*)malloc(new_pheno_ct * sizeof(pheno_col_t));
+      if (!new_pheno_cols) {
+	goto split_cat_pheno_ret_NOMEM;
+      }
+      doomed_pheno_cols = old_pheno_cols;
+      doomed_pheno_ct = old_pheno_ct;
+      *xpheno_cols_ptr = new_pheno_cols;
+      *xpheno_ct_ptr = new_pheno_ct;
+      *max_xpheno_name_blen_ptr = new_max_pheno_name_blen;
+      uint32_t pheno_read_idx = 0;
+      for (uint32_t pheno_write_idx = 0; pheno_write_idx < copy_pheno_ct; ++pheno_write_idx, ++pheno_read_idx) {
+	next_unset_unsafe_ck(phenos_to_split, &pheno_read_idx);
+	new_pheno_cols[pheno_write_idx] = doomed_pheno_cols[pheno_read_idx];
+
+	// prevent double-free
+	doomed_pheno_cols[pheno_read_idx].nonmiss = nullptr;
+
+	strcpy(&(new_pheno_names[pheno_write_idx * new_max_pheno_name_blen]), &(old_pheno_names[pheno_read_idx * old_max_pheno_name_blen]));
+      }
+      for (uint32_t pheno_write_idx = copy_pheno_ct; pheno_write_idx < new_pheno_ct; ++pheno_write_idx) {
+	new_pheno_cols[pheno_write_idx].nonmiss = nullptr;
+      }
+
+      const uintptr_t new_pheno_bytes_req = (raw_sample_ctaw + new_data_word_ct) * sizeof(intptr_t);
+      uint32_t pheno_write_idx = copy_pheno_ct;
+      pheno_read_idx = 0;
+      for (uint32_t split_pheno_idx = 0; split_pheno_idx < split_pheno_ct; ++split_pheno_idx, ++pheno_read_idx) {
+	next_set_unsafe_ck(phenos_to_split, &pheno_read_idx);
+	const uint32_t cur_pheno_write_ct = observed_cat_cts[split_pheno_idx];
+	if (!cur_pheno_write_ct) {
+	  continue;
+	}
+	const uintptr_t* cur_observed_cats = observed_cats[split_pheno_idx];
+	const pheno_col_t* old_pheno_col = &(old_pheno_cols[pheno_read_idx]);
+	bitvec_and_copy(sample_include, old_pheno_col->nonmiss, raw_sample_ctaw, sample_include_intersect);
+	const char* old_pheno_name = &(old_pheno_names[pheno_read_idx * old_max_pheno_name_blen]);
+	const uint32_t old_pheno_name_slen = strlen(old_pheno_name);
+	char** old_cat_names = old_pheno_col->category_names;
+	uint32_t orig_cat_idx = 1;
+	for (uint32_t uii = 0; uii < cur_pheno_write_ct; ++uii, ++orig_cat_idx, ++pheno_write_idx) {
+	  next_set_unsafe_ck(cur_observed_cats, &orig_cat_idx);
+	  uintptr_t* new_pheno_data_iter;
+	  if (vecaligned_malloc(new_pheno_bytes_req, &new_pheno_data_iter)) {
+	    goto split_cat_pheno_ret_NOMEM;
+	  }
+	  char* new_phenoname_write_iter = memcpyax(&(new_pheno_names[pheno_write_idx * new_max_pheno_name_blen]), old_pheno_name, old_pheno_name_slen, '=');
+	  strcpy(new_phenoname_write_iter, old_cat_names[orig_cat_idx]);
+	  pheno_col_t* pheno_write_col = &(new_pheno_cols[pheno_write_idx]);
+	  pheno_write_col->nonmiss = new_pheno_data_iter;
+	  pheno_write_col->category_names = nullptr;
+	  pheno_write_col->type_code = (pheno_dtype_t)is_covar;
+	  pheno_write_col->nonnull_category_ct = 0;
+	  memcpy(new_pheno_data_iter, sample_include_intersect, raw_sample_ctaw * sizeof(intptr_t));
+	  new_pheno_data_iter = &(new_pheno_data_iter[raw_sample_ctaw]);
+	  write_data_ptrs[orig_cat_idx] = new_pheno_data_iter;
+	  // assigning to one element of a union and reading from another with
+	  // a different type is undefined behavior in C++11
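+	  // (e.g. writing data.cc here and later reading data.qt back would be
+	  // such a violation, so only the member matching type_code is used)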
+	  if (!is_covar) {
+	    pheno_write_col->data.cc = new_pheno_data_iter;
+	    fill_ulong_zero(raw_sample_ctaw, new_pheno_data_iter);
+	  } else {
+	    double* pheno_qt = (double*)new_pheno_data_iter;
+	    pheno_write_col->data.qt = pheno_qt;
+	    if (qt_12) {
+	      for (uint32_t ujj = 0; ujj < raw_sample_ct; ++ujj) {
+		pheno_qt[ujj] = 1.0;
+	      }
+	    } else {
+	      fill_double_zero(raw_sample_ct, pheno_qt);
+	    }
+	  }
+	}
+	if (omit_last) {
+	  next_set_unsafe_ck(cur_observed_cats, &orig_cat_idx);
+	  write_data_ptrs[orig_cat_idx] = omit_last_dummy;
+	}
+
+	const uint32_t cur_nmiss_ct = popcount_longs(sample_include_intersect, raw_sample_ctl);
+	const uint32_t* cur_cats = old_pheno_col->data.cat;
+	uint32_t sample_uidx = 0;
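+	// each nonmissing sample contributes to exactly one new column, selected
+	// by its original category index: a set bit for case/control phenotypes,
+	// 2.0 (or 1.0 under 0/1 coding) for covariates, on top of the background
+	// values filled in above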
+	if (!is_covar) {
+	  for (uint32_t sample_idx = 0; sample_idx < cur_nmiss_ct; ++sample_idx, ++sample_uidx) {
+	    next_set_unsafe_ck(sample_include_intersect, &sample_uidx);
+	    set_bit(sample_uidx, write_data_ptrs[cur_cats[sample_uidx]]);
+	  }
+	} else {
+	  double** write_qt_ptrs = (double**)write_data_ptrs;
+	  const double write_val = (int32_t)(1 + qt_12);
+	  for (uint32_t sample_idx = 0; sample_idx < cur_nmiss_ct; ++sample_idx, ++sample_uidx) {
+	    next_set_unsafe_ck(sample_include_intersect, &sample_uidx);
+	    write_qt_ptrs[cur_cats[sample_uidx]][sample_uidx] = write_val;
+	  }
+	}
+      }
+
+      // if any preexisting phenotype names contain a single copy of the '='
+      // character, verify that we didn't create any duplicate IDs
+      for (uint32_t pheno_idx = 0; pheno_idx < copy_pheno_ct; ++pheno_idx) {
+	char* first_eq = strchr(&(new_pheno_names[pheno_idx * new_max_pheno_name_blen]), '=');
+	if (first_eq && (!strchr(&(first_eq[1]), '='))) {
+	  uint32_t* id_htable;
+	  uint32_t id_htable_size;
+	  if (htable_good_size_alloc(new_pheno_ct, bigstack_left(), &id_htable, &id_htable_size)) {
+	    goto split_cat_pheno_ret_NOMEM;
+	  }
+	  uint32_t duplicate_idx = populate_strbox_htable(new_pheno_names, new_pheno_ct, new_max_pheno_name_blen, id_htable_size, id_htable);
+	  if (duplicate_idx) {
+	    sprintf(g_logbuf, "Error: Duplicate %s '%s' created by --split-cat-pheno.\n", is_covar? "covariate" : "phenotype", &(new_pheno_names[duplicate_idx * new_max_pheno_name_blen]));
+	    goto split_cat_pheno_ret_INCONSISTENT_INPUT_WW;
+	  }
+	  break;
+	}
+      }
+
+      free(doomed_pheno_names);
+      doomed_pheno_names = nullptr;
+      cleanup_pheno_cols(doomed_pheno_ct, doomed_pheno_cols);
+      doomed_pheno_cols = nullptr;
+
+      LOGPRINTFWW("--split-cat-pheno: %u categorical %s%s converted to %" PRIuPTR " %s%s.\n", split_pheno_ct, is_covar? "covariate" : "phenotype", (split_pheno_ct == 1)? "" : "s", create_pheno_ct, is_covar? "covariate" : "phenotype", (create_pheno_ct == 1)? "" : "s");
+    }
+    if (!at_least_one_cat_pheno_processed) {
+      LOGERRPRINTF("Warning: No categorical phenotypes%s processed by --split-cat-pheno.\n", (split_cat_phenonames_flattened && (!(*covar_ct_ptr)))? "/covariates" : "");
+    }
+  }
+  while (0) {
+  split_cat_pheno_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  split_cat_pheno_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  split_cat_pheno_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+  bigstack_reset(bigstack_mark);
+  free_cond(doomed_pheno_names);
+  cleanup_pheno_cols(doomed_pheno_ct, doomed_pheno_cols);
+  if (reterr) {
+    if (*pheno_names_ptr) {
+      free(*pheno_names_ptr);
+      *pheno_names_ptr = nullptr;
+    }
+    cleanup_pheno_cols(*pheno_ct_ptr, *pheno_cols_ptr);
+    *pheno_ct_ptr = 0;
+    *pheno_cols_ptr = nullptr;
+    if (*covar_names_ptr) {
+      free(*covar_names_ptr);
+      *covar_names_ptr = nullptr;
+    }
+    cleanup_pheno_cols(*covar_ct_ptr, *covar_cols_ptr);
+    *covar_ct_ptr = 0;
+    *covar_cols_ptr = nullptr;
+  }
+  return reterr;
+}
+
+pglerr_t pheno_variance_standardize(const char* vstd_flattened, const uintptr_t* sample_include, const char* pheno_names, uint32_t raw_sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t is_covar, uint32_t is_covar_flag, pheno_col_t* pheno_cols) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if (!pheno_ct) {
+      goto pheno_variance_standardize_ret_SKIP;
+    }
+    const uint32_t pheno_ctl = BITCT_TO_WORDCT(pheno_ct);
+    uintptr_t* phenos_to_transform;
+    if (bigstack_calloc_ul(pheno_ctl, &phenos_to_transform)) {
+      goto pheno_variance_standardize_ret_NOMEM;
+    }
+    if (!vstd_flattened) {
+      for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	const pheno_col_t* cur_pheno_col = &(pheno_cols[pheno_idx]);
+	if (cur_pheno_col->type_code == kPhenoDtypeQt) {
+	  set_bit(pheno_idx, phenos_to_transform);
+	}
+      }
+    } else {
+      uint32_t* id_htable;
+      uint32_t id_htable_size;
+      if (htable_good_size_alloc(pheno_ct, bigstack_left(), &id_htable, &id_htable_size)) {
+	goto pheno_variance_standardize_ret_NOMEM;
+      }
+      populate_strbox_htable(pheno_names, pheno_ct, max_pheno_name_blen, id_htable_size, id_htable);
+      const char* vstd_phenonames_iter = vstd_flattened;
+      do {
+	const uint32_t cur_phenoname_slen = strlen(vstd_phenonames_iter);
+	if (cur_phenoname_slen < max_pheno_name_blen) {
+	  uint32_t pheno_idx = strbox_htable_find(vstd_phenonames_iter, pheno_names, id_htable, max_pheno_name_blen, cur_phenoname_slen, id_htable_size);
+	  if (pheno_idx != 0xffffffffU) {
+	    if (pheno_cols[pheno_idx].type_code != kPhenoDtypeQt) {
+	      sprintf(g_logbuf, "Error: '%s' is not a quantitative %s.\n", vstd_phenonames_iter, is_covar? "covariate" : "phenotype");
+	      goto pheno_variance_standardize_ret_INCONSISTENT_INPUT_WW;
+	    }
+	    set_bit(pheno_idx, phenos_to_transform);
+	  }
+	}
+	vstd_phenonames_iter = &(vstd_phenonames_iter[cur_phenoname_slen + 1]);
+      } while (*vstd_phenonames_iter);
+      bigstack_reset(id_htable);
+    }
+    const uint32_t pheno_transform_ct = popcount_longs(phenos_to_transform, pheno_ctl);
+    if (!pheno_transform_ct) {
+      goto pheno_variance_standardize_ret_SKIP;
+    }
+    double* shifted_pheno_qt;
+    if (bigstack_alloc_d(raw_sample_ct, &shifted_pheno_qt)) {
+      goto pheno_variance_standardize_ret_NOMEM;
+    }
+    const uint32_t raw_sample_ctaw = BITCT_TO_ALIGNED_WORDCT(raw_sample_ct);
+    uint32_t pheno_uidx = 0;
+    for (uint32_t pheno_transform_idx = 0; pheno_transform_idx < pheno_transform_ct; ++pheno_transform_idx, ++pheno_uidx) {
+      next_set_unsafe_ck(phenos_to_transform, &pheno_uidx);
+      pheno_col_t* cur_pheno_col = &(pheno_cols[pheno_uidx]);
+      uintptr_t* pheno_nm = cur_pheno_col->nonmiss;
+      bitvec_and(sample_include, raw_sample_ctaw, pheno_nm);
+      const uint32_t cur_sample_ct = popcount_longs(pheno_nm, raw_sample_ctaw);
+      if (cur_sample_ct < 2) {
+	if (cur_sample_ct) {
+	  LOGERRPRINTFWW("Warning: Exactly one value present for %s '%s'; standardizing to missing.\n", is_covar? "covariate" : "quantitative phenotype", &(pheno_names[pheno_uidx * max_pheno_name_blen]));
+	  fill_ulong_zero(raw_sample_ctaw, pheno_nm);
+	}
+	continue;
+      }
+      double* pheno_qt = cur_pheno_col->data.qt;
+      double shifted_pheno_sum = 0.0;
+      double shifted_pheno_ssq = 0.0;
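+      // one-pass variance with the first value as a shift: subtracting it
+      // before accumulating the sum and sum-of-squares reduces cancellation
+      // error in the (ssq - sum * mean) numerator below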
+      const uint32_t first_sample_uidx = next_set_unsafe(pheno_nm, 0);
+      uint32_t sample_uidx = first_sample_uidx;
+      shifted_pheno_qt[sample_uidx] = 0.0;
+      const double pheno_shift = pheno_qt[sample_uidx++];
+      for (uint32_t sample_idx = 1; sample_idx < cur_sample_ct; ++sample_idx, ++sample_uidx) {
+	// iterate over pheno_nm rather than sample_include, so samples with a
+	// missing value for this phenotype are skipped
+	next_set_unsafe_ck(pheno_nm, &sample_uidx);
+	const double cur_shifted_pheno_val = pheno_qt[sample_uidx] - pheno_shift;
+	shifted_pheno_sum += cur_shifted_pheno_val;
+	shifted_pheno_ssq += cur_shifted_pheno_val * cur_shifted_pheno_val;
+        shifted_pheno_qt[sample_uidx] = cur_shifted_pheno_val;
+      }
+      const double cur_shifted_mean = shifted_pheno_sum / ((double)((int32_t)cur_sample_ct));
+      const double variance_numer = shifted_pheno_ssq - shifted_pheno_sum * cur_shifted_mean;
+      if (!(variance_numer > 0.0)) {
+	LOGERRPRINTFWW("Warning: %s '%s' is constant; standardizing to all-missing.\n", is_covar? "Covariate" : "Quantitative phenotype", &(pheno_names[pheno_uidx * max_pheno_name_blen]));
+	fill_ulong_zero(raw_sample_ctaw, pheno_nm);
+	continue;
+      }
+      const double cur_stdev_recip = sqrt((double)((int32_t)(cur_sample_ct - 1)) / variance_numer);
+      sample_uidx = first_sample_uidx;
+      for (uint32_t sample_idx = 0; sample_idx < cur_sample_ct; ++sample_idx, ++sample_uidx) {
+	next_set_unsafe_ck(pheno_nm, &sample_uidx);
+	pheno_qt[sample_uidx] = (shifted_pheno_qt[sample_uidx] - cur_shifted_mean) * cur_stdev_recip;
+      }
+    }
+    // could reduce the reported number when all values were originally missing
+    LOGPRINTF("--%svariance-standardize: %u %s%s transformed.\n", is_covar_flag? "covar-" : "", pheno_transform_ct, is_covar? "covariate" : "phenotype", (pheno_transform_ct == 1)? "" : "s");
+  }
+  while (0) {
+  pheno_variance_standardize_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  pheno_variance_standardize_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+    reterr = kPglRetInconsistentInput;
+    break;
+  pheno_variance_standardize_ret_SKIP:
+    LOGPRINTF("--%svariance-standardize: No %s affected.\n", is_covar_flag? "covar-" : "", is_covar? "covariates" : "quantitative phenotypes");
+    break;
+  }
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+typedef struct dbl_index_struct {
+  double dxx;
+  uint32_t uii;
+#ifdef __cplusplus
+  bool operator<(const struct dbl_index_struct& rhs) const {
+    return dxx < rhs.dxx;
+  }
+#endif
+} dbl_index_t;
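+// operator< lets std::sort() order dbl_index_t records directly in C++ builds;
+// the C build path falls back to qsort() with double_cmp (see its use in
+// pheno_quantile_normalize below), which presumably compares the leading
+// double member.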
+
+pglerr_t pheno_quantile_normalize(const char* quantnorm_flattened, const uintptr_t* sample_include, const char* pheno_names, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t is_covar, uint32_t is_subset_flag, pheno_col_t* pheno_cols) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  const char* flag_prefix = is_subset_flag? (is_covar? "covar-" : "pheno-") : "";
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    if (!pheno_ct) {
+      goto pheno_quantile_normalize_ret_SKIP;
+    }
+    // this boilerplate probably belongs in its own function
+    const uint32_t pheno_ctl = BITCT_TO_WORDCT(pheno_ct);
+    uintptr_t* phenos_to_transform;
+    if (bigstack_calloc_ul(pheno_ctl, &phenos_to_transform)) {
+      goto pheno_quantile_normalize_ret_NOMEM;
+    }
+    if (!quantnorm_flattened) {
+      for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	const pheno_col_t* cur_pheno_col = &(pheno_cols[pheno_idx]);
+	if (cur_pheno_col->type_code == kPhenoDtypeQt) {
+	  set_bit(pheno_idx, phenos_to_transform);
+	}
+      }
+    } else {
+      uint32_t* id_htable;
+      uint32_t id_htable_size;
+      if (htable_good_size_alloc(pheno_ct, bigstack_left(), &id_htable, &id_htable_size)) {
+	goto pheno_quantile_normalize_ret_NOMEM;
+      }
+      populate_strbox_htable(pheno_names, pheno_ct, max_pheno_name_blen, id_htable_size, id_htable);
+      const char* quantnorm_phenonames_iter = quantnorm_flattened;
+      do {
+	const uint32_t cur_phenoname_slen = strlen(quantnorm_phenonames_iter);
+	if (cur_phenoname_slen < max_pheno_name_blen) {
+	  uint32_t pheno_idx = strbox_htable_find(quantnorm_phenonames_iter, pheno_names, id_htable, max_pheno_name_blen, cur_phenoname_slen, id_htable_size);
+	  if (pheno_idx != 0xffffffffU) {
+	    if (pheno_cols[pheno_idx].type_code != kPhenoDtypeQt) {
+	      sprintf(g_logbuf, "Error: '%s' is not a quantitative %s.\n", quantnorm_phenonames_iter, is_covar? "covariate" : "phenotype");
+	      goto pheno_quantile_normalize_ret_INCONSISTENT_INPUT_WW;
+	    }
+	    set_bit(pheno_idx, phenos_to_transform);
+	  }
+	}
+	quantnorm_phenonames_iter = &(quantnorm_phenonames_iter[cur_phenoname_slen + 1]);
+      } while (*quantnorm_phenonames_iter);
+      bigstack_reset(id_htable);
+    }
+    const uint32_t pheno_transform_ct = popcount_longs(phenos_to_transform, pheno_ctl);
+    if (!pheno_transform_ct) {
+      goto pheno_quantile_normalize_ret_SKIP;
+    }
+    dbl_index_t* tagged_raw_pheno_vals = (dbl_index_t*)bigstack_alloc(sample_ct * sizeof(dbl_index_t));
+    if (!tagged_raw_pheno_vals) {
+      goto pheno_quantile_normalize_ret_NOMEM;
+    }
+    const uint32_t raw_sample_ctaw = BITCT_TO_ALIGNED_WORDCT(raw_sample_ct);
+    uint32_t pheno_uidx = 0;
+    for (uint32_t pheno_transform_idx = 0; pheno_transform_idx < pheno_transform_ct; ++pheno_transform_idx, ++pheno_uidx) {
+      next_set_unsafe_ck(phenos_to_transform, &pheno_uidx);
+      pheno_col_t* cur_pheno_col = &(pheno_cols[pheno_uidx]);
+      uintptr_t* pheno_nm = cur_pheno_col->nonmiss;
+      bitvec_and(sample_include, raw_sample_ctaw, pheno_nm);
+      const uint32_t cur_sample_ct = popcount_longs(pheno_nm, raw_sample_ctaw);
+      if (!cur_sample_ct) {
+	continue;
+      }
+      double* pheno_qt = cur_pheno_col->data.qt;
+      uint32_t sample_uidx = 0;
+      for (uint32_t sample_idx = 0; sample_idx < cur_sample_ct; ++sample_idx, ++sample_uidx) {
+	// as in pheno_variance_standardize(), iterate over pheno_nm so samples
+	// missing this phenotype are skipped
+	next_set_unsafe_ck(pheno_nm, &sample_uidx);
+	tagged_raw_pheno_vals[sample_idx].dxx = pheno_qt[sample_uidx];
+	tagged_raw_pheno_vals[sample_idx].uii = sample_uidx;
+      }
+#ifdef __cplusplus
+      std::sort(tagged_raw_pheno_vals, &(tagged_raw_pheno_vals[cur_sample_ct]));
+#else
+      qsort(tagged_raw_pheno_vals, cur_sample_ct, sizeof(dbl_index_t), double_cmp);
+#endif
+      const double sample_ct_x2_recip = 1.0 / ((double)(2 * cur_sample_ct));
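+      // a run of ties spanning ranks [start, end) receives the quantile
+      // (start + end) / (2N), the midpoint of the block's rank interval; e.g.
+      // with N=4 samples, all distinct, ranks 0..3 map to 1/8, 3/8, 5/8, 7/8
+      // before the inverse-normal-CDF (ltqnorm) transform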
+      for (uint32_t sample_idx_start = 0; sample_idx_start < cur_sample_ct;) {
+	const double cur_raw_pheno = tagged_raw_pheno_vals[sample_idx_start].dxx;
+	uint32_t sample_idx_end = sample_idx_start + 1;
+	for (; sample_idx_end < cur_sample_ct; ++sample_idx_end) {
+	  if (tagged_raw_pheno_vals[sample_idx_end].dxx != cur_raw_pheno) {
+	    break;
+	  }
+	}
+	const double cur_zscore = ltqnorm((double)(sample_idx_start + sample_idx_end) * sample_ct_x2_recip);
+	for (; sample_idx_start < sample_idx_end; ++sample_idx_start) {
+	  pheno_qt[tagged_raw_pheno_vals[sample_idx_start].uii] = cur_zscore;
+	}
+      }
+    }
+    LOGPRINTF("--%squantile-normalize: %u %s%s transformed.\n", flag_prefix, pheno_transform_ct, is_covar? "covariate" : "phenotype", (pheno_transform_ct == 1)? "" : "s");
+  }
+  while (0) {
+  pheno_quantile_normalize_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  pheno_quantile_normalize_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+    reterr = kPglRetInconsistentInput;
+    break;
+  pheno_quantile_normalize_ret_SKIP:
+    LOGPRINTF("--%squantile-normalize: No %s affected.\n", flag_prefix, is_covar? "covariates" : "quantitative phenotypes");
+    break;
+  }
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+
+pglerr_t process_boundary_token(char* tok_start, char* tok_end, const char* token_source_str, uint32_t max_boundary_ct, pglerr_t err_type, double* prev_boundary_ptr, uint32_t* boundary_ct_ptr, double** freq_bounds_ptr, uint64_t** dosage_bounds_ptr) {
+  double cur_boundary;
+  char* scan_end = scanadv_double(tok_start, &cur_boundary);
+  if ((!scan_end) || (scan_end != tok_end)) {
+    LOGERRPRINTF("Error: Invalid token in %s.\n", token_source_str);
+    return err_type;
+  }
+  if (cur_boundary <= (*prev_boundary_ptr)) {
+    logerrprint("Error: Invalid bin boundary sequence (must be strictly increasing, and start\nwith a positive number).\n");
+    return err_type;
+  }
+  uint32_t boundary_ct = *boundary_ct_ptr;
+  if (boundary_ct == max_boundary_ct) {
+#ifdef __LP64__
+    if (max_boundary_ct == 0x40000000) {
+      logerrprint("Error: Too many bin boundaries.\n");
+      return err_type;
+    }
+#endif
+    return kPglRetNomem;
+  }
+  if (freq_bounds_ptr) {
+    if (cur_boundary > 1.0) {
+      logerrprint("Error: --freq bin boundary too large (must be <= 1).\n");
+      return err_type;
+    }
+    // bin membership is decided with strictly-greater-than comparisons, so
+    // shrink each boundary by a small epsilon: a frequency exactly equal to a
+    // boundary then lands in the bin that boundary starts
+    (*freq_bounds_ptr)[boundary_ct] = cur_boundary * (1 - kSmallEpsilon);
+  } else {
+    // max 2^31 - 3 variants
+    if (cur_boundary > 4294967290.0) {
+      logerrprint("Error: --freq counts bin boundary too large.\n");
+      return err_type;
+    }
+    // due to the use of strictly-greater-than for comparison, we round
+    // exact multiples of 1/32768 down
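+    // e.g. with kDosageMax == 32768, a boundary of 2.0 is stored as
+    // 2 * 32768 - 1 = 65535; an allele count of exactly 2 (65536 in dosage
+    // units) then compares strictly greater, landing in the bin that starts
+    // at 2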
+    const int64_t int_part = (int64_t)cur_boundary;
+    const double cur_boundary_frac_part = cur_boundary - int_part;
+    const int64_t int_part_scaled = int_part * kDosageMax;
+    if (cur_boundary_frac_part == 0.0) {
+      (*dosage_bounds_ptr)[boundary_ct] = int_part_scaled - 1;
+    } else {
+      (*dosage_bounds_ptr)[boundary_ct] = int_part_scaled + (int64_t)(cur_boundary_frac_part * (kDosageMax * (1 - kSmallEpsilon)));
+    }
+  }
+  *prev_boundary_ptr = cur_boundary;
+  *boundary_ct_ptr = boundary_ct + 1;
+  return kPglRetSuccess;
+}
+
+pglerr_t init_histogram_from_file_or_commalist(const char* binstr, uint32_t is_fname, double** freq_bounds_ptr, uint64_t** dosage_bounds_ptr, uint32_t* boundary_ct_ptr, uint32_t** histogram_ptr) {
+  gz_token_stream_t gts;
+  gz_token_stream_preinit(&gts);
+  uint32_t max_boundary_ct = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    uintptr_t ulii = round_down_pow2(bigstack_left(), kCacheline);
+    if (ulii < 2 * kCacheline) {
+      goto init_histogram_from_file_or_commalist_ret_NOMEM;
+    }
+    // 12 = 8 bytes for boundary value + 4 bytes for histogram entry
+    ulii = (ulii - 2 * kCacheline) / 12;
+#ifdef __LP64__
+    max_boundary_ct = MINV(ulii, 0x40000000);
+#else
+    max_boundary_ct = ulii;
+#endif
+    if (freq_bounds_ptr) {
+      *freq_bounds_ptr = (double*)g_bigstack_base;
+    } else {
+      *dosage_bounds_ptr = (uint64_t*)g_bigstack_base;
+    }
+    uint32_t boundary_ct = 0;
+    double prev_boundary = 0.0;
+    if (is_fname) {
+      // we want to accept >100000 numbers on a single line.  this will reject
+      // "000...{a million more zeroes}...1"; pretty sure that's okay.
+      reterr = gz_token_stream_init(binstr, &gts, g_textbuf);
+      if (reterr) {
+	goto init_histogram_from_file_or_commalist_ret_1;
+      }
+      uint32_t token_slen;
+      while (1) {
+	char* token_start = gz_token_stream_advance(&gts, &token_slen);
+	if (!token_start) {
+	  break;
+	}
+	reterr = process_boundary_token(token_start, &(token_start[token_slen]), binstr, max_boundary_ct, kPglRetMalformedInput, &prev_boundary, &boundary_ct, freq_bounds_ptr, dosage_bounds_ptr);
+	if (reterr) {
+	  goto init_histogram_from_file_or_commalist_ret_1;
+	}
+      }
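+      // gz_token_stream_advance() returns nullptr at both EOF and error;
+      // token_slen disambiguates: 0 is a clean EOF, 0xffffffffU an overlong
+      // token, anything else a read failure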
+      if (token_slen) {
+	if (token_slen == 0xffffffffU) {
+	  sprintf(g_logbuf, "Error: Excessively long token in %s.\n", binstr);
+	  goto init_histogram_from_file_or_commalist_ret_MALFORMED_INPUT_2;
+	} else {
+	  goto init_histogram_from_file_or_commalist_ret_READ_FAIL;
+	}
+      }
+    } else {
+      // const_cast
+      char* binstr_iter = (char*)((uintptr_t)binstr);
+      while (1) {
+	char* tok_end = strchr(binstr_iter, ',');
+	const uint32_t is_last_token = (tok_end == nullptr);
+	if (is_last_token) {
+	  tok_end = (char*)rawmemchr(binstr_iter, '\0');
+	}
+	reterr = process_boundary_token(binstr_iter, tok_end, "--freq {ref,alt1}bins= list", max_boundary_ct, kPglRetInvalidCmdline, &prev_boundary, &boundary_ct, freq_bounds_ptr, dosage_bounds_ptr);
+	if (reterr) {
+	  goto init_histogram_from_file_or_commalist_ret_1;
+	}
+	if (is_last_token) {
+	  break;
+	}
+	binstr_iter = &(tok_end[1]);
+      }
+    }
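+    // the boundary array was written in place at g_bigstack_base; claim that
+    // memory now (8 bytes per boundary, rounded up to a cacheline), then
+    // allocate the (boundary_ct + 1)-bin histogram right after it,
+    // zero-initialized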
+    *boundary_ct_ptr = boundary_ct;
+    g_bigstack_base += round_up_pow2(boundary_ct * (8 * k1LU), kCacheline);
+    *histogram_ptr = (uint32_t*)bigstack_alloc_raw_rd((boundary_ct + 1) * sizeof(int32_t));
+    fill_uint_zero(boundary_ct + 1, *histogram_ptr);
+  }
+  while (0) {
+  init_histogram_from_file_or_commalist_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  init_histogram_from_file_or_commalist_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  init_histogram_from_file_or_commalist_ret_MALFORMED_INPUT_2:
+    logerrprintb();
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+ init_histogram_from_file_or_commalist_ret_1:
+  gz_token_stream_close(&gts);
+  return reterr;
+}
+
+pglerr_t write_allele_freqs(const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const uint64_t* founder_allele_dosages, const double* mach_r2_vals, const char* ref_binstr, const char* alt1_binstr, uint32_t variant_ct, uint32_t max_alt_allele_ct, uint32_t max_allele_slen, allele_freq_t allele_freq_modifier, uint32_t nonfounders, char* outname, char* outname_end) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  pglerr_t reterr = kPglRetSuccess;
+  cswrite_init_null(&css);
+  {
+    const uint32_t counts = (allele_freq_modifier / kfAlleleFreqCounts) & 1;
+    if (counts) {
+      strcpy(outname_end, ".acount");
+    } else {
+      strcpy(outname_end, ".afreq");
+    }
+    if (!(allele_freq_modifier & kfAlleleFreqBinsOnly)) {
+      const uint32_t max_chr_blen = get_max_chr_slen(cip) + 1;
+      unsigned char* overflow_buf;
+      if (bigstack_alloc_uc(kCompressStreamBlock + max_chr_blen + kMaxIdSlen + 512 + max_alt_allele_ct * (24 * k1LU) + 2 * max_allele_slen, &overflow_buf)) {
+	goto write_allele_freqs_ret_NOMEM;
+      }
+      const uint32_t output_zst = allele_freq_modifier & kfAlleleFreqZs;
+      if (output_zst) {
+	strcpy(&(outname_end[6 + counts]), ".zst");
+      }
+      if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+	goto write_allele_freqs_ret_OPEN_FAIL;
+      }
+      cswritep = (char*)overflow_buf;
+      *cswritep++ = '#';
+      const uint32_t chr_col = allele_freq_modifier & kfAlleleFreqColChrom;
+
+      // includes trailing tab
+      char* chr_buf;
+      if (bigstack_alloc_c(max_chr_blen, &chr_buf)) {
+	goto write_allele_freqs_ret_NOMEM;
+      }
+      if (chr_col) {
+	cswritep = strcpya(cswritep, "CHROM\t");
+      }
+      if (allele_freq_modifier & kfAlleleFreqColPos) {
+	cswritep = strcpya(cswritep, "POS\t");
+      } else {
+	variant_bps = nullptr;
+      }
+      cswritep = strcpya(cswritep, "ID");
+      const uint32_t ref_col = allele_freq_modifier & kfAlleleFreqColRef;
+      if (ref_col) {
+	cswritep = strcpya(cswritep, "\tREF");
+      }
+      const uint32_t alt1_col = allele_freq_modifier & kfAlleleFreqColAlt1;
+      if (alt1_col) {
+	cswritep = strcpya(cswritep, "\tALT1");
+      }
+      const uint32_t alt_col = allele_freq_modifier & kfAlleleFreqColAlt;
+      if (alt_col) {
+	cswritep = strcpya(cswritep, "\tALT");
+      }
+      const uint32_t reffreq_col = allele_freq_modifier & kfAlleleFreqColReffreq;
+      if (reffreq_col) {
+	cswritep = strcpya(cswritep, "\tREF_");
+	if (counts) {
+	  cswritep = strcpya(cswritep, "CT");
+	} else {
+	  cswritep = strcpya(cswritep, "FREQ");
+	}
+      }
+      const uint32_t alt1freq_col = allele_freq_modifier & kfAlleleFreqColAlt1freq;
+      if (alt1freq_col) {
+	cswritep = strcpya(cswritep, "\tALT1_");
+	if (counts) {
+	  cswritep = strcpya(cswritep, "CT");
+	} else {
+	  cswritep = strcpya(cswritep, "FREQ");
+	}
+      }
+      const uint32_t freq_col = allele_freq_modifier & (kfAlleleFreqColFreq | kfAlleleFreqColAltfreq);
+      const uint32_t commalist_exclude_ref = (allele_freq_modifier & (kfAlleleFreqColAltfreq | kfAlleleFreqColAlteq | kfAlleleFreqColAlteqz | kfAlleleFreqColAltnumeq))? 1 : 0;
+      const uint32_t eq_col = allele_freq_modifier & (kfAlleleFreqColEq | kfAlleleFreqColEqz | kfAlleleFreqColAlteq | kfAlleleFreqColAlteqz | kfAlleleFreqColNumeq | kfAlleleFreqColAltnumeq);
+      const uint32_t eq_includez = allele_freq_modifier & (kfAlleleFreqColEqz | kfAlleleFreqColAlteqz);
+      const uint32_t eq_num = allele_freq_modifier & (kfAlleleFreqColNumeq | kfAlleleFreqColAltnumeq);
+      if (freq_col || eq_col) {
+	*cswritep++ = '\t';
+	if (commalist_exclude_ref) {
+	  cswritep = strcpya(cswritep, "ALT_");
+	}
+	if (eq_num) {
+	  cswritep = strcpya(cswritep, "NUM_");
+	}
+	if (counts) {
+	  cswritep = memcpyl3a(cswritep, "CTS");
+	} else {
+	  cswritep = strcpya(cswritep, "FREQS");
+	}
+      }
+      const uint32_t mach_r2_col = allele_freq_modifier & kfAlleleFreqColMachR2;
+      if (mach_r2_col) {
+	cswritep = strcpya(cswritep, "\tMACH_R2");
+      }
+      const uint32_t nobs_col = allele_freq_modifier & kfAlleleFreqColNobs;
+      if (nobs_col) {
+	cswritep = strcpya(cswritep, "\tOBS_CT");
+      }
+      append_binary_eoln(&cswritep);
+
+      const int32_t mt_code = cip->xymt_codes[kChrOffsetMT];
+      uint32_t variant_uidx = 0;
+      uint32_t chr_fo_idx = 0xffffffffU;
+      uint32_t chr_end = 0;
+      uint32_t chr_buf_blen = 0;
+      uint32_t suppress_mach_r2 = 0;
+      uint32_t pct = 0;
+      uint32_t next_print_variant_idx = variant_ct / 100;
+      uint32_t cur_allele_ct = 2;
+      printf("--freq%s%s: 0%%", output_zst? " zs" : "", counts? " counts" : "");
+      fflush(stdout);
+      for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+	next_set_unsafe_ck(variant_include, &variant_uidx);
+	if (variant_uidx >= chr_end) {
+	  do {
+	    ++chr_fo_idx;
+	    chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	  } while (variant_uidx >= chr_end);
+	  const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	  char* chr_name_end = chr_name_write(cip, chr_idx, chr_buf);
+	  suppress_mach_r2 = is_set(cip->haploid_mask, chr_idx) || (chr_idx == ((uint32_t)mt_code));
+	  *chr_name_end = '\t';
+	  chr_buf_blen = 1 + (uintptr_t)(chr_name_end - chr_buf);
+	}
+	if (chr_col) {
+	  cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
+	}
+	if (variant_bps) {
+	  cswritep = uint32toa_x(variant_bps[variant_uidx], '\t', cswritep);
+	}
+	cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
+	uintptr_t variant_allele_idx_base = variant_uidx * 2;
+	if (variant_allele_idxs) {
+	  variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	  cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+	}
+	char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+	if (ref_col) {
+	  *cswritep++ = '\t';
+	  cswritep = strcpya(cswritep, cur_alleles[0]);
+	}
+	if (alt1_col) {
+	  *cswritep++ = '\t';
+	  cswritep = strcpya(cswritep, cur_alleles[1]);
+	}
+	if (alt_col) {
+	  *cswritep++ = '\t';
+	  for (uint32_t allele_idx = 1; allele_idx < cur_allele_ct; ++allele_idx) {
+	    if (cswrite(&css, &cswritep)) {
+	      goto write_allele_freqs_ret_WRITE_FAIL;
+	    }
+	    cswritep = strcpyax(cswritep, cur_alleles[allele_idx], ',');
+	  }
+	  --cswritep;
+	}
+	const uint64_t* cur_allele_dosages = &(founder_allele_dosages[variant_allele_idx_base]);
+	uint64_t tot_allele_dosage = cur_allele_dosages[0];
+	for (uint32_t allele_idx = 1; allele_idx < cur_allele_ct; ++allele_idx) {
+	  tot_allele_dosage += cur_allele_dosages[allele_idx];
+	}
+	double tot_allele_dosage_recip = 0.0;
+	if (!counts) {
+	  tot_allele_dosage_recip = 1.0 / ((double)((int64_t)tot_allele_dosage));
+	}
+	if (reffreq_col) {
+	  *cswritep++ = '\t';
+	  const uint64_t ref_dosage = cur_allele_dosages[0];
+	  if (counts) {
+	    cswritep = print_dosage(ref_dosage, cswritep);
+	  } else {
+	    cswritep = dtoa_g(((double)((int64_t)ref_dosage)) * tot_allele_dosage_recip, cswritep);
+	  }
+	}
+	if (alt1freq_col) {
+	  *cswritep++ = '\t';
+	  const uint64_t alt1_dosage = cur_allele_dosages[1];
+	  if (counts) {
+	    cswritep = print_dosage(alt1_dosage, cswritep);
+	  } else {
+	    cswritep = dtoa_g(((double)((int64_t)alt1_dosage)) * tot_allele_dosage_recip, cswritep);
+	  }
+	}
+	if (freq_col) {
+	  *cswritep++ = '\t';
+	  for (uint32_t allele_idx = commalist_exclude_ref; allele_idx < cur_allele_ct; ++allele_idx) {
+	    const uint64_t cur_allele_dosage = cur_allele_dosages[allele_idx];
+	    if (counts) {
+	      cswritep = print_dosage(cur_allele_dosage, cswritep);
+	    } else {
+	      cswritep = dtoa_g(((double)((int64_t)cur_allele_dosage)) * tot_allele_dosage_recip, cswritep);
+	    }
+	    *cswritep++ = ',';
+	  }
+	  --cswritep;
+	} else if (eq_col) {
+	  *cswritep++ = '\t';
+	  uint32_t at_least_one_entry = 0;
+	  for (uint32_t allele_idx = commalist_exclude_ref; allele_idx < cur_allele_ct; ++allele_idx) {
+	    const uint64_t cur_allele_dosage = cur_allele_dosages[allele_idx];
+	    if (eq_includez || cur_allele_dosage) {
+	      if (eq_num) {
+		cswritep = uint32toa(allele_idx, cswritep);
+	      } else {
+		if (cswrite(&css, &cswritep)) {
+		  goto write_allele_freqs_ret_WRITE_FAIL;
+		}
+		const char* cur_allele = cur_alleles[allele_idx];
+		const uint32_t allele_slen = strlen(cur_allele);
+		if (memchr(cur_allele, '=', allele_slen) != nullptr) {
+		  logerrprint("Error: --freq's 'eq', 'eqz', 'alteq', and 'alteqz' columns cannot be requested\nwhen an allele code contains a '='.\n");
+		  goto write_allele_freqs_ret_INCONSISTENT_INPUT;
+		}
+		cswritep = memcpya(cswritep, cur_allele, allele_slen);
+	      }
+	      *cswritep++ = '=';
+	      if (counts) {
+		cswritep = print_dosage(cur_allele_dosage, cswritep);
+	      } else {
+		cswritep = dtoa_g(((double)((int64_t)cur_allele_dosage)) * tot_allele_dosage_recip, cswritep);
+	      }
+	      *cswritep++ = ',';
+	      at_least_one_entry = 1;
+	    }
+	  }
+	  if (at_least_one_entry) {
+	    --cswritep;
+	  } else {
+	    *cswritep++ = '.';
+	  }
+	}
+	if (mach_r2_col) {
+	  *cswritep++ = '\t';
+	  if (!suppress_mach_r2) {
+	    cswritep = dtoa_g(mach_r2_vals[variant_uidx], cswritep);
+	  } else {
+	    cswritep = strcpya(cswritep, "NA");
+	  }
+	}
+	if (nobs_col) {
+	  *cswritep++ = '\t';
+	  cswritep = print_dosage(tot_allele_dosage, cswritep);
+	}
+	append_binary_eoln(&cswritep);
+	if (cswrite(&css, &cswritep)) {
+	  goto write_allele_freqs_ret_WRITE_FAIL;
+	}
+	if (variant_idx >= next_print_variant_idx) {
+	  if (pct > 10) {
+	    putc_unlocked('\b', stdout);
+	  }
+	  pct = (variant_idx * 100LLU) / variant_ct;
+	  printf("\b\b%u%%", pct++);
+	  fflush(stdout);
+	  next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+	}
+      }
+      if (cswrite_close_null(&css, cswritep)) {
+	goto write_allele_freqs_ret_WRITE_FAIL;
+      }
+      putc_unlocked('\r', stdout);
+      LOGPRINTFWW("--freq%s%s: Allele %s (%s) written to %s .\n", output_zst? " zs" : "", counts? " counts" : "", counts? "counts" : "frequencies", nonfounders? "all samples" : "founders only", outname);
+    }
+
+    if (ref_binstr || alt1_binstr) {
+      bigstack_reset(bigstack_mark);
+      double* ref_freq_bounds = nullptr;
+      uint64_t* ref_dosage_bounds = nullptr;
+      uint32_t* ref_histogram = nullptr;
+      uint32_t ref_boundary_ct = 0;
+      if (ref_binstr) {
+	reterr = init_histogram_from_file_or_commalist(ref_binstr, (allele_freq_modifier / kfAlleleFreqBinsRefFname) & 1, counts? nullptr : (&ref_freq_bounds), counts? (&ref_dosage_bounds) : nullptr, &ref_boundary_ct, &ref_histogram);
+	if (reterr) {
+	  goto write_allele_freqs_ret_1;
+	}
+      }
+      double* alt1_freq_bounds = nullptr;
+      uint64_t* alt1_dosage_bounds = nullptr;
+      uint32_t* alt1_histogram = nullptr;
+      uint32_t alt1_boundary_ct = 0;
+      if (alt1_binstr) {
+	reterr = init_histogram_from_file_or_commalist(alt1_binstr, (allele_freq_modifier / kfAlleleFreqBinsAlt1Fname) & 1, counts? nullptr : (&alt1_freq_bounds), counts? (&alt1_dosage_bounds) : nullptr, &alt1_boundary_ct, &alt1_histogram);
+	if (reterr) {
+	  goto write_allele_freqs_ret_1;
+	}
+      }
+      uint32_t variant_uidx = 0;
+      if (!counts) {
+        uint32_t cur_allele_ct = 2;
+	for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+	  next_set_unsafe_ck(variant_include, &variant_uidx);
+	  uintptr_t variant_allele_idx_base = variant_uidx * 2;
+	  if (variant_allele_idxs) {
+	    variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	    cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+	  }
+	  const uint64_t* cur_allele_dosages = &(founder_allele_dosages[variant_allele_idx_base]);
+	  const uint64_t ref_allele_dosage = cur_allele_dosages[0];
+	  const uint64_t alt1_allele_dosage = cur_allele_dosages[1];
+	  uint64_t tot_allele_dosage = ref_allele_dosage + alt1_allele_dosage;
+	  for (uint32_t allele_idx = 2; allele_idx < cur_allele_ct; ++allele_idx) {
+	    tot_allele_dosage += cur_allele_dosages[allele_idx];
+	  }
+	  const double tot_allele_dosage_recip = 1.0 / ((double)((int64_t)tot_allele_dosage));
+	  if (ref_histogram) {
+	    ref_histogram[doublearr_greater_than(ref_freq_bounds, ref_boundary_ct, ref_allele_dosage * tot_allele_dosage_recip)] += 1;
+	  }
+	  if (alt1_histogram) {
+	    alt1_histogram[doublearr_greater_than(alt1_freq_bounds, alt1_boundary_ct, alt1_allele_dosage * tot_allele_dosage_recip)] += 1;
+	  }
+	}
+      } else {
+	for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+	  next_set_unsafe_ck(variant_include, &variant_uidx);
+	  uintptr_t variant_allele_idx_base = variant_uidx * 2;
+	  if (variant_allele_idxs) {
+	    variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	  }
+	  const uint64_t* cur_allele_dosages = &(founder_allele_dosages[variant_allele_idx_base]);
+	  if (ref_histogram) {
+	    ref_histogram[uint64arr_greater_than(ref_dosage_bounds, ref_boundary_ct, cur_allele_dosages[0])] += 1;
+	  }
+	  if (alt1_histogram) {
+	    alt1_histogram[uint64arr_greater_than(alt1_dosage_bounds, alt1_boundary_ct, cur_allele_dosages[1])] += 1;
+	  }
+	}
+      }
+      for (uint32_t is_alt1 = 0; is_alt1 < 2; ++is_alt1) {
+	const uint32_t* cur_histogram = is_alt1? alt1_histogram : ref_histogram;
+	if (!cur_histogram) {
+	  continue;
+	}
+	char* outname_end2 = &(outname_end[6 + counts]);
+	if (!is_alt1) {
+	  outname_end2 = strcpya(outname_end2, ".ref");
+	} else {
+	  outname_end2 = strcpya(outname_end2, ".alt1");
+	}
+	strcpy(outname_end2, ".bins");
+	if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+	  goto write_allele_freqs_ret_OPEN_FAIL;
+	}
+	char* textbuf = g_textbuf;
+	char* textbuf_flush = &(textbuf[kMaxMediumLine]);
+	char* write_iter = strcpya(textbuf, "#BIN_START\tOBS_CT" EOLN_STR);
+	const uint32_t cur_boundary_ct = is_alt1? alt1_boundary_ct : ref_boundary_ct;
+	if (!counts) {
+	  const double* cur_freq_bounds = is_alt1? alt1_freq_bounds : ref_freq_bounds;
+	  for (uint32_t bin_idx = 0; bin_idx <= cur_boundary_ct; ++bin_idx) {
+	    if (!bin_idx) {
+	      *write_iter++ = '0';
+	    } else {
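+	      // undo the (1 - kSmallEpsilon) scaling applied at parse time, so
+	      // the reported bin start matches the user's input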
+	      write_iter = dtoa_g(cur_freq_bounds[bin_idx - 1] * (1.0 / (1 - kSmallEpsilon)), write_iter);
+	    }
+	    *write_iter++ = '\t';
+	    write_iter = uint32toa(cur_histogram[bin_idx], write_iter);
+	    append_binary_eoln(&write_iter);
+	    if (write_iter >= textbuf_flush) {
+	      if (fwrite_checked(textbuf, (uintptr_t)(write_iter - textbuf), outfile)) {
+		goto write_allele_freqs_ret_WRITE_FAIL;
+	      }
+	      write_iter = textbuf;
+	    }
+	  }
+	} else {
+	  const uint64_t* cur_dosage_bounds = is_alt1? alt1_dosage_bounds : ref_dosage_bounds;
+	  for (uint32_t bin_idx = 0; bin_idx <= cur_boundary_ct; ++bin_idx) {
+	    if (!bin_idx) {
+	      *write_iter++ = '0';
+	    } else {
+	      write_iter = print_dosage(1 + cur_dosage_bounds[bin_idx - 1], write_iter);
+	    }
+	    *write_iter++ = '\t';
+	    write_iter = uint32toa(cur_histogram[bin_idx], write_iter);
+	    append_binary_eoln(&write_iter);
+	    if (write_iter >= textbuf_flush) {
+	      if (fwrite_checked(textbuf, (uintptr_t)(write_iter - textbuf), outfile)) {
+		goto write_allele_freqs_ret_WRITE_FAIL;
+	      }
+	      write_iter = textbuf;
+	    }
+	  }
+	}
+	if (write_iter != textbuf) {
+	  if (fwrite_checked(textbuf, (uintptr_t)(write_iter - textbuf), outfile)) {
+	    goto write_allele_freqs_ret_WRITE_FAIL;
+	  }
+	}
+	if (fclose_null(&outfile)) {
+	  goto write_allele_freqs_ret_WRITE_FAIL;
+	}
+	const uint32_t cur_is_file = allele_freq_modifier & (is_alt1? kfAlleleFreqBinsAlt1Fname : kfAlleleFreqBinsRefFname);
+        LOGPRINTFWW("--freq%s %sbins%s=: Histogram written to %s .\n", counts? " counts" : "", is_alt1? "alt1" : "ref", cur_is_file? "-file" : "", outname);
+      }
+    }
+  }
+  while (0) {
+  write_allele_freqs_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  write_allele_freqs_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  write_allele_freqs_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  write_allele_freqs_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ write_allele_freqs_ret_1:
+  cswrite_close_cond(&css, cswritep);
+  fclose_cond(outfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t write_geno_counts(__attribute__((unused)) const uintptr_t* sample_include, __attribute__((unused)) const uintptr_t* sex_male, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const uint32_t* raw_geno_cts, const uint32_t* x_male_geno_cts, __attribute__((unused)) uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t male_ct, uint32_t variant_ct, uint32_t x_start, ui [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  pglerr_t reterr = kPglRetSuccess;
+  cswrite_init_null(&css);
+  {
+    const uint32_t max_chr_blen = get_max_chr_slen(cip) + 1;
+    unsigned char* overflow_buf;
+    char* chr_buf;
+    if (bigstack_alloc_uc(kCompressStreamBlock + max_chr_blen + kMaxIdSlen + 512 + simple_pgrp->fi.max_alt_allele_ct * (24 * k1LU) + 2 * max_allele_slen, &overflow_buf) ||
+	bigstack_alloc_c(max_chr_blen, &chr_buf)) {
+      goto write_geno_counts_ret_NOMEM;
+    }
+    /*
+    // need the following once multiallelic variants are supported
+    const uint32_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    const uint32_t raw_sample_ctv = BITCT_TO_VECCT(raw_sample_ct);
+    uintptr_t* sample_include_interleaved_vec = nullptr;
+    uint32_t* sample_include_cumulative_popcounts = nullptr;
+    uintptr_t* genovec = nullptr;
+    uintptr_t* sex_male_interleaved_vec = nullptr;
+    uint32_t* sex_male_cumulative_popcounts = nullptr;
+    if (simple_pgrp->fi.max_alt_allele_ct > 1) {
+      if (bigstack_alloc_ul(raw_sample_ctv * kWordsPerVec, &sample_include_interleaved_vec) ||
+	  bigstack_alloc_ui(raw_sample_ctl, &sample_include_cumulative_popcounts) ||
+	  bigstack_alloc_ul(QUATERCT_TO_WORDCT(raw_sample_ct), &genovec) ||
+	  bigstack_alloc_ul(raw_sample_ctv * kWordsPerVec, &sex_male_interleaved_vec) ||
+	  bigstack_alloc_ui(raw_sample_ctl, &sex_male_cumulative_popcounts)) {
+	goto write_geno_counts_ret_NOMEM;
+      }
+      fill_interleaved_mask_vec(sample_include, raw_sample_ctv, sample_include_interleaved_vec);
+      fill_cumulative_popcounts(sample_include, raw_sample_ctl, sample_include_cumulative_popcounts);
+      fill_interleaved_mask_vec(sex_male, raw_sample_ctv, sex_male_interleaved_vec);
+      fill_cumulative_popcounts(sex_male, raw_sample_ctl, sex_male_cumulative_popcounts);
+      pgr_clear_ld_cache(simple_pgrp);
+    }
+    */
+    const uint32_t output_zst = geno_counts_modifier & kfGenoCountsZs;
+    char* outname_end2 = strcpya0(outname_end, ".gcount");
+    if (output_zst) {
+      strcpy(outname_end2, ".zst");
+    }
+    if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+      goto write_geno_counts_ret_OPEN_FAIL;
+    }
+    cswritep = (char*)overflow_buf;
+    *cswritep++ = '#';
+    const uint32_t chr_col = geno_counts_modifier & kfGenoCountsColChrom;
+
+    // includes trailing tab
+    if (chr_col) {
+      cswritep = strcpya(cswritep, "CHROM\t");
+    }
+    if (geno_counts_modifier & kfGenoCountsColPos) {
+      cswritep = strcpya(cswritep, "POS\t");
+    } else {
+      variant_bps = nullptr;
+    }
+    cswritep = strcpya(cswritep, "ID");
+    const uint32_t ref_col = geno_counts_modifier & kfGenoCountsColRef;
+    if (ref_col) {
+      cswritep = strcpya(cswritep, "\tREF");
+    }
+    const uint32_t alt1_col = geno_counts_modifier & kfGenoCountsColAlt1;
+    if (alt1_col) {
+      cswritep = strcpya(cswritep, "\tALT1");
+    }
+    const uint32_t alt_col = geno_counts_modifier & kfGenoCountsColAlt;
+    if (alt_col) {
+      cswritep = strcpya(cswritep, "\tALT");
+    }
+    const uint32_t homref_col = geno_counts_modifier & kfGenoCountsColHomref;
+    if (homref_col) {
+      cswritep = strcpya(cswritep, "\tHOM_REF_CT");
+    }
+    const uint32_t refalt1_col = geno_counts_modifier & kfGenoCountsColRefalt1;
+    if (refalt1_col) {
+      cswritep = strcpya(cswritep, "\tHET_REF_ALT1_CT");
+    }
+    const uint32_t refalt_col = geno_counts_modifier & kfGenoCountsColRefalt;
+    if (refalt_col) {
+      cswritep = strcpya(cswritep, "\tHET_REF_ALT_CTS");
+    }
+    const uint32_t homalt1_col = geno_counts_modifier & kfGenoCountsColHomalt1;
+    if (homalt1_col) {
+      cswritep = strcpya(cswritep, "\tHOM_ALT1_CT");
+    }
+    const uint32_t xy_col = geno_counts_modifier & (kfGenoCountsColAltxy | kfGenoCountsColXy);
+    const uint32_t xy_col_altonly = (geno_counts_modifier / kfGenoCountsColAltxy) & 1;
+    if (xy_col) {
+      *cswritep++ = '\t';
+      if (xy_col_altonly) {
+	cswritep = strcpya(cswritep, "NONREF_");
+      }
+      cswritep = strcpya(cswritep, "DIPLOID_GENO_CTS");
+    }
+    const uint32_t hapref_col = geno_counts_modifier & kfGenoCountsColHapref;
+    if (hapref_col) {
+      cswritep = strcpya(cswritep, "\tHAP_REF_CT");
+    }
+    const uint32_t hapalt1_col = geno_counts_modifier & kfGenoCountsColHapalt1;
+    if (hapalt1_col) {
+      cswritep = strcpya(cswritep, "\tHAP_ALT1_CT");
+    }
+    const uint32_t hap_col = geno_counts_modifier & (kfGenoCountsColHapalt | kfGenoCountsColHap);
+    const uint32_t hap_col_altonly = (geno_counts_modifier / kfGenoCountsColHapalt) & 1;
+    if (hap_col) {
+      if (hap_col_altonly) {
+	cswritep = strcpya(cswritep, "\tHAP_ALT_CTS");
+      } else {
+	cswritep = strcpya(cswritep, "\tHAP_CTS");
+      }
+    }
+    const uint32_t numeq_col = geno_counts_modifier & kfGenoCountsColNumeq;
+    if (numeq_col) {
+      cswritep = strcpya(cswritep, "\tGENO_NUM_CTS");
+    }
+    const uint32_t missing_col = geno_counts_modifier & kfGenoCountsColMissing;
+    if (missing_col) {
+      cswritep = strcpya(cswritep, "\tMISSING_CT");
+    }
+    const uint32_t nobs_col = geno_counts_modifier & kfGenoCountsColNobs;
+    if (nobs_col) {
+      cswritep = strcpya(cswritep, "\tOBS_CT");
+    }
+    append_binary_eoln(&cswritep);
+
+    const int32_t x_code = cip->xymt_codes[kChrOffsetX];
+    const int32_t y_code = cip->xymt_codes[kChrOffsetY];
+    uint32_t is_autosomal_diploid = 0; // also includes MT for now
+    uint32_t is_x = 0;
+    uint32_t nobs_base = 0;
+    uint32_t chr_fo_idx = 0xffffffffU;
+    uint32_t chr_end = 0;
+    uint32_t chr_buf_blen = 0;
+    uint32_t variant_uidx = 0;
+    uint32_t homref_ct = 0;
+    uint32_t het_ct = 0;
+    uint32_t homalt1_ct = 0;
+    uint32_t hapref_ct = 0;
+    uint32_t hapalt1_ct = 0;
+    uint32_t pct = 0;
+    uint32_t next_print_variant_idx = variant_ct / 100;
+    printf("--geno-counts%s: 0%%", output_zst? " zs" : "");
+    fflush(stdout);
+    uint32_t cur_allele_ct = 2;
+    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      if (variant_uidx >= chr_end) {
+	do {
+	  ++chr_fo_idx;
+	  chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	} while (variant_uidx >= chr_end);
+	const int32_t chr_idx = cip->chr_file_order[chr_fo_idx];
+	char* chr_name_end = chr_name_write(cip, chr_idx, chr_buf);
+	*chr_name_end = '\t';
+	chr_buf_blen = 1 + (uintptr_t)(chr_name_end - chr_buf);
+	is_autosomal_diploid = !is_set(cip->haploid_mask, chr_idx);
+	nobs_base = sample_ct;
+	is_x = (chr_idx == x_code);
+	/*
+        if (!is_autosomal_diploid) {
+          pgr_clear_ld_cache(simple_pgrp);
+          // move chr_idx == y_code check here
+          // update cur_sample_include, etc.
+        }
+	 */
+	if (chr_idx == y_code) {
+	  nobs_base = male_ct;
+	}
+	homref_ct = 0;
+	het_ct = 0;
+	homalt1_ct = 0;
+	hapref_ct = 0;
+	hapalt1_ct = 0;
+      }
+      if (chr_col) {
+	cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
+      }
+      if (variant_bps) {
+	cswritep = uint32toa_x(variant_bps[variant_uidx], '\t', cswritep);
+      }
+      cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
+      uintptr_t variant_allele_idx_base = variant_uidx * 2;
+      if (variant_allele_idxs) {
+	variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+      }
+      char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+      if (ref_col) {
+	*cswritep++ = '\t';
+	cswritep = strcpya(cswritep, cur_alleles[0]);
+      }
+      if (alt1_col) {
+	*cswritep++ = '\t';
+	cswritep = strcpya(cswritep, cur_alleles[1]);
+      }
+      if (alt_col) {
+	*cswritep++ = '\t';
+	for (uint32_t allele_idx = 1; allele_idx < cur_allele_ct; ++allele_idx) {
+	  if (cswrite(&css, &cswritep)) {
+	    goto write_geno_counts_ret_WRITE_FAIL;
+	  }
+	  cswritep = strcpyax(cswritep, cur_alleles[allele_idx], ',');
+	}
+	--cswritep;
+      }
+      uint32_t missing_ct;
+      if (cur_allele_ct == 2) {
+	const uint32_t* cur_raw_geno_cts = &(raw_geno_cts[(3 * k1LU) * variant_uidx]);
+	if (is_autosomal_diploid) {
+	  homref_ct = cur_raw_geno_cts[0];
+	  het_ct = cur_raw_geno_cts[1];
+	  homalt1_ct = cur_raw_geno_cts[2];
+	  missing_ct = nobs_base - homref_ct - het_ct - homalt1_ct;
+	} else {
+	  homref_ct = cur_raw_geno_cts[0];
+	  het_ct = cur_raw_geno_cts[1];
+	  homalt1_ct = cur_raw_geno_cts[2];
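+	  // on chrX, male genotypes are haploid: subtract the male-specific
+	  // counts from the diploid tallies, report male ref/alt hemizygotes as
+	  // HAP_REF/HAP_ALT1, and let male hets fall through to MISSING_CT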
+	  if (is_x) {
+	    if (x_male_geno_cts) {
+	      const uint32_t* cur_male_geno_cts = &(x_male_geno_cts[(3 * k1LU) * (variant_uidx - x_start)]);
+	      hapref_ct = cur_male_geno_cts[0];
+	      homref_ct -= hapref_ct;
+	      het_ct -= cur_male_geno_cts[1];
+	      hapalt1_ct = cur_male_geno_cts[2];
+	      homalt1_ct -= hapalt1_ct;
+	    }
+	    missing_ct = nobs_base - homref_ct - het_ct - homalt1_ct - hapref_ct - hapalt1_ct;
+	  } else {
+	    // chrY or haploid
+	    hapref_ct = cur_raw_geno_cts[0];
+	    hapalt1_ct = cur_raw_geno_cts[2];
+	    missing_ct = nobs_base - hapref_ct - hapalt1_ct;
+	  }
+	}
+	if (homref_col) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa(homref_ct, cswritep);
+	}
+	if (refalt1_col) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa(het_ct, cswritep);
+	}
+	if (refalt_col) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa(het_ct, cswritep);
+	}
+	if (homalt1_col) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa(homalt1_ct, cswritep);
+	}
+	if (xy_col_altonly) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa(homalt1_ct, cswritep);
+	} else if (xy_col) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa_x(homref_ct, ',', cswritep);
+	  cswritep = uint32toa_x(het_ct, ',', cswritep);
+	  cswritep = uint32toa(homalt1_ct, cswritep);
+	}
+	if (hapref_col) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa(hapref_ct, cswritep);
+	}
+	if (hapalt1_col) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa(hapalt1_ct, cswritep);
+	}
+	if (hap_col) {
+	  *cswritep++ = '\t';
+	  if (!hap_col_altonly) {
+	    cswritep = uint32toa_x(hapref_ct, ',', cswritep);
+	  }
+	  cswritep = uint32toa(hapalt1_ct, cswritep);
+	}
+	if (numeq_col) {
+	  *cswritep++ = '\t';
+	  if (homref_ct) {
+	    cswritep = strcpya(cswritep, "0/0=");
+	    cswritep = uint32toa_x(homref_ct, ',', cswritep);
+	  }
+	  if (het_ct) {
+	    cswritep = strcpya(cswritep, "0/1=");
+	    cswritep = uint32toa_x(het_ct, ',', cswritep);
+	  }
+	  if (homalt1_ct) {
+	    cswritep = strcpya(cswritep, "1/1=");
+	    cswritep = uint32toa_x(homalt1_ct, ',', cswritep);
+	  }
+	  if (hapref_ct) {
+	    cswritep = strcpya(cswritep, "0=");
+	    cswritep = uint32toa_x(hapref_ct, ',', cswritep);
+	  }
+	  if (hapalt1_ct) {
+	    cswritep = strcpya(cswritep, "1=");
+	    cswritep = uint32toa_x(hapalt1_ct, ',', cswritep);
+	  }
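+	  // each nonzero count above appended a trailing ','; strip the last
+	  // one, or write '.' if every genotype was missing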
+	  if (missing_ct != nobs_base) {
+	    --cswritep;
+	  } else {
+	    *cswritep++ = '.';
+	  }
+	}
+      } else {
+	// todo
+	missing_ct = 0;
+	assert(0);
+      }
+      if (missing_col) {
+	*cswritep++ = '\t';
+	cswritep = uint32toa(missing_ct, cswritep);
+      }
+      if (nobs_col) {
+	*cswritep++ = '\t';
+	cswritep = uint32toa(nobs_base - missing_ct, cswritep);
+      }
+      append_binary_eoln(&cswritep);
+      if (cswrite(&css, &cswritep)) {
+	goto write_geno_counts_ret_WRITE_FAIL;
+      }
+      if (variant_idx >= next_print_variant_idx) {
+	if (pct > 10) {
+	  putc_unlocked('\b', stdout);
+	}
+	pct = (variant_idx * 100LLU) / variant_ct;
+	printf("\b\b%u%%", pct++);
+	fflush(stdout);
+	next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+      }
+    }
+    if (cswrite_close_null(&css, cswritep)) {
+      goto write_geno_counts_ret_WRITE_FAIL;
+    }
+    putc_unlocked('\r', stdout);
+    LOGPRINTFWW("--geno-counts%s: Genotype counts written to %s .\n", output_zst? " zs" : "", outname);
+  }
+  while (0) {
+  write_geno_counts_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  write_geno_counts_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  write_geno_counts_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  }
+  // write_geno_counts_ret_1:
+  cswrite_close_cond(&css, cswritep);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t write_missingness_reports(const uintptr_t* sample_include, const uintptr_t* sex_male, const char* sample_ids, const char* sids, const pheno_col_t* pheno_cols, const char* pheno_names, const uint32_t* sample_missing_hc_cts, const uint32_t* sample_missing_dosage_cts, const uint32_t* sample_hethap_cts, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const uint32_t* [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  pglerr_t reterr = kPglRetSuccess;
+  cswrite_init_null(&css);
+  {
+    const uint32_t output_zst = missing_rpt_modifier & kfMissingRptZs;
+    if (!(missing_rpt_modifier & kfMissingRptVariantOnly)) {
+      unsigned char* overflow_buf;
+      if (bigstack_alloc_uc(kCompressStreamBlock + kMaxMediumLine + pheno_ct * 2, &overflow_buf)) {
+	goto write_missingness_reports_ret_NOMEM;
+      }
+      char* outname_end2 = strcpya0(outname_end, ".smiss");
+      if (output_zst) {
+	strcpy(outname_end2, ".zst");
+      }
+      if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+	goto write_missingness_reports_ret_OPEN_FAIL;
+      }
+      cswritep = strcpya((char*)overflow_buf, "#FID\tIID");
+      const uint32_t scol_sid = sid_col_required(sample_include, sids, sample_ct, max_sid_blen, missing_rpt_modifier / kfMissingRptScolMaybesid);
+      if (scol_sid) {
+	cswritep = strcpya(cswritep, "\tSID");
+      }
+      const uint32_t scol_empty_pheno = (missing_rpt_modifier & kfMissingRptScolMisspheno1) && (!pheno_ct);
+      if (scol_empty_pheno) {
+	cswritep = strcpya(cswritep, "\tMISS_PHENO1");
+      }
+      const uint32_t scol_phenos = (missing_rpt_modifier & (kfMissingRptScolMisspheno1 | kfMissingRptScolMissphenos)) && pheno_ct;
+      if (scol_phenos) {
+	if (!(missing_rpt_modifier & kfMissingRptScolMissphenos)) {
+	  pheno_ct = 1;
+	}
+	for (uintptr_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	  *cswritep++ = '\t';
+	  cswritep = strcpya(cswritep, &(pheno_names[pheno_idx * max_pheno_name_blen]));
+	  if (cswrite(&css, &cswritep)) {
+	    goto write_missingness_reports_ret_WRITE_FAIL;
+	  }
+	}
+      }
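+      // (modifier / kfFlag) & 1 tests a single flag bit; the kf* constants
+      // are powers of two, so the division compiles to a right shift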
+      const uint32_t scol_nmiss_dosage = (missing_rpt_modifier / kfMissingRptScolNmissDosage) & 1;
+      if (scol_nmiss_dosage) {
+	cswritep = strcpya(cswritep, "\tMISSING_DOSAGE_CT");
+      }
+      const uint32_t scol_nmiss = (missing_rpt_modifier / kfMissingRptScolNmiss) & 1;
+      if (scol_nmiss) {
+	cswritep = strcpya(cswritep, "\tMISSING_CT");
+      }
+      const uint32_t scol_nmiss_hh = (missing_rpt_modifier / kfMissingRptScolNmissHh) & 1;
+      if (scol_nmiss_hh) {
+	cswritep = strcpya(cswritep, "\tMISSING_AND_HETHAP_CT");
+      }
+      const uint32_t scol_hethap = (missing_rpt_modifier / kfMissingRptScolHethap) & 1;
+      if (scol_hethap) {
+	cswritep = strcpya(cswritep, "\tHETHAP_CT");
+      }
+      const uint32_t scol_nobs = (missing_rpt_modifier / kfMissingRptScolNobs) & 1;
+      if (scol_nobs) {
+	cswritep = strcpya(cswritep, "\tOBS_CT");
+      }
+      const uint32_t scol_fmiss_dosage = (missing_rpt_modifier / kfMissingRptScolFmissDosage) & 1;
+      if (scol_fmiss_dosage) {
+	cswritep = strcpya(cswritep, "\tF_MISS_DOSAGE");
+      }
+      const uint32_t scol_fmiss = (missing_rpt_modifier / kfMissingRptScolFmiss) & 1;
+      if (scol_fmiss) {
+	cswritep = strcpya(cswritep, "\tF_MISS");
+      }
+      const uint32_t scol_fmiss_hh = (missing_rpt_modifier / kfMissingRptScolFmissHh) & 1;
+      if (scol_fmiss_hh) {
+	cswritep = strcpya(cswritep, "\tF_MISS_AND_HETHAP");
+      }
+      append_binary_eoln(&cswritep);
+      uint32_t variant_ct_y = 0;
+      int32_t y_code;
+      if (xymt_exists(cip, kChrOffsetY, &y_code)) {
+	variant_ct_y = count_chr_variants_unsafe(variant_include, cip, y_code);
+      }
+      const uint32_t variant_ct_nony = variant_ct - variant_ct_y;
+      char nobs_strs[2][16];
+      uint32_t nobs_slens[2];
+      double variant_ct_recips[2];
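+      // index 0: nonmale denominator (chrY variants excluded); index 1: male
+      // denominator (all variants).  both strings carry a leading tab.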
+      char* write_iter = nobs_strs[0];
+      *write_iter++ = '\t';
+      write_iter = uint32toa(variant_ct_nony, write_iter);
+      nobs_slens[0] = (uintptr_t)(write_iter - nobs_strs[0]);
+      variant_ct_recips[0] = 1.0 / ((double)((int32_t)variant_ct_nony));
+      write_iter = nobs_strs[1];
+      *write_iter++ = '\t';
+      write_iter = uint32toa(variant_ct, write_iter);
+      nobs_slens[1] = (uintptr_t)(write_iter - nobs_strs[1]);
+      variant_ct_recips[1] = 1.0 / ((double)((int32_t)variant_ct));
+      uintptr_t sample_uidx = 0;
+      for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+	next_set_ul_unsafe_ck(sample_include, &sample_uidx);
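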
+	cswritep = strcpya(cswritep, &(sample_ids[sample_uidx * max_sample_id_blen]));
+	if (scol_sid) {
+	  *cswritep++ = '\t';
+	  if (sids) {
+	    cswritep = strcpya(cswritep, &(sids[sample_uidx * max_sid_blen]));
+	  } else {
+	    *cswritep++ = '0';
+	  }
+	}
+	if (scol_phenos) {
+	  for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	    *cswritep++ = '\t';
+	    // 'Y' - 'N' == 11: emits 'N' when the phenotype value is present,
+	    // 'Y' when it is missing
+	    *cswritep++ = 'Y' - 11 * IS_SET(pheno_cols[pheno_idx].nonmiss, sample_uidx);
+	    if (cswrite(&css, &cswritep)) {
+	      goto write_missingness_reports_ret_WRITE_FAIL;
+	    }
+	  }
+	} else {
+	  if (scol_empty_pheno) {
+	    cswritep = strcpya(cswritep, "\tY");
+	  }
+	  if (cswrite(&css, &cswritep)) {
+	    goto write_missingness_reports_ret_WRITE_FAIL;
+	  }
+	}
+	if (scol_nmiss_dosage) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa(sample_missing_dosage_cts[sample_uidx], cswritep);
+	}
+	const uint32_t cur_missing_hc_base = sample_missing_hc_cts[sample_uidx];
+	if (scol_nmiss) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa(cur_missing_hc_base, cswritep);
+	}
+	if (scol_nmiss_hh) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa(cur_missing_hc_base + sample_hethap_cts[sample_uidx], cswritep);
+	}
+	if (scol_hethap) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa(sample_hethap_cts[sample_uidx], cswritep);
+	}
+	const uint32_t is_male = IS_SET(sex_male, sample_uidx);
+	if (scol_nobs) {
+	  cswritep = memcpya(cswritep, nobs_strs[is_male], nobs_slens[is_male]);
+	}
+	const double cur_variant_ct_recip = variant_ct_recips[is_male];
+	if (scol_fmiss_dosage) {
+	  *cswritep++ = '\t';
+	  cswritep = dtoa_g(((double)((int32_t)sample_missing_dosage_cts[sample_uidx])) * cur_variant_ct_recip, cswritep);
+	}
+	if (scol_fmiss) {
+	  *cswritep++ = '\t';
+	  cswritep = dtoa_g(((double)((int32_t)cur_missing_hc_base)) * cur_variant_ct_recip, cswritep);
+	}
+	if (scol_fmiss_hh) {
+	  *cswritep++ = '\t';
+	  cswritep = dtoa_g(((double)((int32_t)(cur_missing_hc_base + sample_hethap_cts[sample_uidx]))) * cur_variant_ct_recip, cswritep);
+	}
+	append_binary_eoln(&cswritep);
+      }
+      if (cswrite_close_null(&css, cswritep)) {
+	goto write_missingness_reports_ret_WRITE_FAIL;
+      }
+      bigstack_reset(bigstack_mark);
+      LOGPRINTFWW("--missing: Sample missing data report written to %s .\n", outname);
+    }
+    if (!(missing_rpt_modifier & kfMissingRptSampleOnly)) {
+      const uint32_t max_chr_blen = get_max_chr_slen(cip) + 1;
+      unsigned char* overflow_buf;
+      char* chr_buf; // includes trailing tab
+      if (bigstack_alloc_uc(kCompressStreamBlock + max_chr_blen + kMaxIdSlen + 512 + 2 * max_allele_slen, &overflow_buf) ||
+	  bigstack_alloc_c(max_chr_blen, &chr_buf)) {
+	goto write_missingness_reports_ret_NOMEM;
+      }
+      char* outname_end2 = strcpya0(outname_end, ".vmiss");
+      if (output_zst) {
+	strcpy(outname_end2, ".zst");
+      }
+      if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+	goto write_missingness_reports_ret_OPEN_FAIL;
+      }
+      cswritep = (char*)overflow_buf;
+      *cswritep++ = '#';
+      const uint32_t chr_col = missing_rpt_modifier & kfMissingRptVcolChrom;
+      if (chr_col) {
+	cswritep = strcpya(cswritep, "CHROM\t");
+      }
+      if (missing_rpt_modifier & kfMissingRptVcolPos) {
+	cswritep = strcpya(cswritep, "POS\t");
+      } else {
+	variant_bps = nullptr;
+      }
+      cswritep = strcpya(cswritep, "ID");
+      const uint32_t ref_col = missing_rpt_modifier & kfMissingRptVcolRef;
+      if (ref_col) {
+	cswritep = strcpya(cswritep, "\tREF");
+      }
+      const uint32_t alt1_col = missing_rpt_modifier & kfMissingRptVcolAlt1;
+      if (alt1_col) {
+	cswritep = strcpya(cswritep, "\tALT1");
+      }
+      const uint32_t alt_col = missing_rpt_modifier & kfMissingRptVcolAlt;
+      if (alt_col) {
+	cswritep = strcpya(cswritep, "\tALT");
+      }
+      const uint32_t nmiss_dosage_col = missing_rpt_modifier & kfMissingRptVcolNmissDosage;
+      if (nmiss_dosage_col) {
+	cswritep = strcpya(cswritep, "\tMISSING_DOSAGE_CT");
+      }
+      const uint32_t nmiss_col = (missing_rpt_modifier / kfMissingRptVcolNmiss) & 1;
+      if (nmiss_col) {
+	cswritep = strcpya(cswritep, "\tMISSING_CT");
+      }
+      const uint32_t nmiss_hh_col = (missing_rpt_modifier / kfMissingRptVcolNmissHh) & 1;
+      if (nmiss_hh_col) {
+	cswritep = strcpya(cswritep, "\tMISSING_AND_HETHAP_CT");
+      }
+      const uint32_t hethap_col = (missing_rpt_modifier / kfMissingRptVcolHethap) & 1;
+      if (hethap_col) {
+	cswritep = strcpya(cswritep, "\tHETHAP_CT");
+      }
+      const uint32_t nobs_col = (missing_rpt_modifier / kfMissingRptVcolNobs) & 1;
+      if (nobs_col) {
+	cswritep = strcpya(cswritep, "\tOBS_CT");
+      }
+      const uint32_t fmiss_dosage_col = missing_rpt_modifier & kfMissingRptVcolFmissDosage;
+      if (fmiss_dosage_col) {
+	cswritep = strcpya(cswritep, "\tF_MISS_DOSAGE");
+      }
+      const uint32_t fmiss_col = (missing_rpt_modifier / kfMissingRptVcolFmiss) & 1;
+      if (fmiss_col) {
+	cswritep = strcpya(cswritep, "\tF_MISS");
+      }
+      const uint32_t fmiss_hh_col = (missing_rpt_modifier / kfMissingRptVcolFmissHh) & 1;
+      if (fmiss_hh_col) {
+	cswritep = strcpya(cswritep, "\tF_MISS_AND_HETHAP");
+      }
+      const uint32_t fhethap_col = (missing_rpt_modifier / kfMissingRptVcolFhethap) & 1;
+      if (fhethap_col) {
+	cswritep = strcpya(cswritep, "\tF_HETHAP");
+      }
+      append_binary_eoln(&cswritep);
+      char nobs_str[16];
+      nobs_str[0] = '\t';
+      const int32_t y_code = cip->xymt_codes[kChrOffsetY];
+      uint32_t nobs_slen = 0;
+      uint32_t variant_uidx = 0;
+      uint32_t chr_fo_idx = 0xffffffffU;
+      uint32_t chr_end = 0;
+      uint32_t chr_buf_blen = 0;
+      uint32_t pct = 0;
+      uint32_t next_print_variant_idx = variant_ct / 100;
+      uint32_t is_y = 2;
+      double nobs_recip = 0.0;
+      fputs("--missing variant report: 0%", stdout);
+      fflush(stdout);
+      uint32_t cur_allele_ct = 2;
+      uint32_t cur_missing_hc_ct = 0;
+      uint32_t cur_hethap_ct = 0;
+      for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+	next_set_unsafe_ck(variant_include, &variant_uidx);
+	if (variant_uidx >= chr_end) {
+	  int32_t chr_idx;
+	  do {
+	    ++chr_fo_idx;
+	    chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	    chr_idx = cip->chr_file_order[chr_fo_idx];
+	  } while (variant_uidx >= chr_end);
+	  char* chr_name_end = chr_name_write(cip, chr_idx, chr_buf);
+	  *chr_name_end = '\t';
+	  chr_buf_blen = 1 + (uintptr_t)(chr_name_end - chr_buf);
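+	  // the OBS_CT string and its reciprocal only change when crossing a
+	  // chrY boundary; is_y starts at 2 so the first chromosome always
+	  // triggers this recomputation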
+	  const uint32_t new_is_y = (chr_idx == y_code);
+	  if (new_is_y != is_y) {
+	    is_y = new_is_y;
+	    const uint32_t cur_nobs = is_y? male_ct : sample_ct;
+	    nobs_recip = 1.0 / ((double)((int32_t)cur_nobs));
+	    char* nobs_str_end = uint32toa(cur_nobs, &(nobs_str[1]));
+	    nobs_slen = (uintptr_t)(nobs_str_end - nobs_str);
+	  }
+	}
+	if (chr_col) {
+	  cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
+	}
+	if (variant_bps) {
+	  cswritep = uint32toa_x(variant_bps[variant_uidx], '\t', cswritep);
+	}
+	cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
+	uintptr_t variant_allele_idx_base = variant_uidx * 2;
+	if (variant_allele_idxs) {
+	  variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	  cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+	}
+	char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+	if (ref_col) {
+	  *cswritep++ = '\t';
+	  cswritep = strcpya(cswritep, cur_alleles[0]);
+	}
+	if (alt1_col) {
+	  *cswritep++ = '\t';
+	  cswritep = strcpya(cswritep, cur_alleles[1]);
+	}
+	if (alt_col) {
+	  *cswritep++ = '\t';
+	  for (uint32_t allele_idx = 1; allele_idx < cur_allele_ct; ++allele_idx) {
+	    if (cswrite(&css, &cswritep)) {
+	      goto write_missingness_reports_ret_WRITE_FAIL;
+	    }
+	    cswritep = strcpyax(cswritep, cur_alleles[allele_idx], ',');
+	  }
+	  --cswritep;
+	}
+	if (nmiss_dosage_col) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa(variant_missing_dosage_cts[variant_uidx], cswritep);
+	}
+	if (variant_missing_hc_cts) {
+	  cur_missing_hc_ct = variant_missing_hc_cts[variant_uidx];
+	  cur_hethap_ct = 0;
+	  if (variant_uidx >= first_hap_uidx) {
+	    cur_hethap_ct = variant_hethap_cts[variant_uidx - first_hap_uidx];
+	  }
+	  if (nmiss_col) {
+	    *cswritep++ = '\t';
+	    cswritep = uint32toa(cur_missing_hc_ct, cswritep);
+	  }
+	  if (nmiss_hh_col) {
+	    *cswritep++ = '\t';
+	    cswritep = uint32toa(cur_missing_hc_ct + cur_hethap_ct, cswritep);
+	  }
+	  if (hethap_col) {
+	    *cswritep++ = '\t';
+	    cswritep = uint32toa(cur_hethap_ct, cswritep);
+	  }
+	}
+	if (nobs_col) {
+	  cswritep = memcpya(cswritep, nobs_str, nobs_slen);
+	}
+	if (fmiss_dosage_col) {
+	  *cswritep++ = '\t';
+	  cswritep = dtoa_g(((double)((int32_t)variant_missing_dosage_cts[variant_uidx])) * nobs_recip, cswritep);
+	}
+	if (fmiss_col) {
+	  *cswritep++ = '\t';
+	  cswritep = dtoa_g(((double)((int32_t)cur_missing_hc_ct)) * nobs_recip, cswritep);
+	}
+	if (fmiss_hh_col) {
+	  *cswritep++ = '\t';
+	  cswritep = dtoa_g(((double)((int32_t)(cur_missing_hc_ct + cur_hethap_ct))) * nobs_recip, cswritep);
+	}
+	if (fhethap_col) {
+	  *cswritep++ = '\t';
+	  cswritep = dtoa_g(((double)((int32_t)cur_hethap_ct)) * nobs_recip, cswritep);
+	}
+	append_binary_eoln(&cswritep);
+	if (cswrite(&css, &cswritep)) {
+	  goto write_missingness_reports_ret_WRITE_FAIL;
+	}
+	if (variant_idx >= next_print_variant_idx) {
+	  if (pct > 10) {
+	    putc_unlocked('\b', stdout);
+	  }
+	  pct = (variant_idx * 100LLU) / variant_ct;
+	  printf("\b\b%u%%", pct++);
+	  fflush(stdout);
+	  next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+	}
+      }
+      if (cswrite_close_null(&css, cswritep)) {
+	goto write_missingness_reports_ret_WRITE_FAIL;
+      }
+      putc_unlocked('\r', stdout);
+      LOGPRINTFWW("--missing: Variant missing data report written to %s .\n", outname);
+    }
+  }
+  while (0) {
+  write_missingness_reports_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  write_missingness_reports_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  write_missingness_reports_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  }
+  bigstack_reset(bigstack_mark);
+  cswrite_close_cond(&css, cswritep);
+  return reterr;
+}
+
+// multithread globals
+static const uintptr_t* g_variant_include = nullptr;
+static const uint32_t* g_founder_raw_geno_cts = nullptr;
+static const uint32_t* g_founder_x_male_geno_cts = nullptr;
+static const uint32_t* g_founder_x_nosex_geno_cts = nullptr;
+static uint32_t* g_variant_uidx_starts = nullptr;
+static double* g_hwe_x_pvals = nullptr;
+static uint32_t g_x_start = 0;
+static uint32_t g_hwe_x_ct = 0;
+static uint32_t g_calc_thread_ct = 0;
+static uint32_t g_hwe_midp = 0;
+
+THREAD_FUNC_DECL compute_hwe_x_pvals_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uintptr_t* variant_include = g_variant_include;
+  const uint32_t* founder_raw_geno_cts = g_founder_raw_geno_cts;
+  const uint32_t* founder_x_male_geno_cts = g_founder_x_male_geno_cts;
+  const uint32_t* founder_x_nosex_geno_cts = g_founder_x_nosex_geno_cts;
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  const uint32_t x_start = g_x_start;
+  const uint32_t hwe_x_ct = g_hwe_x_ct;
+  const uint32_t hwe_midp = g_hwe_midp;
+
+  // this needs to be aligned with compute_uidx_start_partition()
+  const uint32_t variant_idx_end = (hwe_x_ct * (((uint64_t)tidx) + 1)) / calc_thread_ct;
+  uint32_t variant_idx = (hwe_x_ct * ((uint64_t)tidx)) / calc_thread_ct;
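+  // e.g. with hwe_x_ct == 10 and calc_thread_ct == 4, thread tidx covers the
+  // half-open index range [10*tidx/4, 10*(tidx+1)/4): sizes 2, 3, 2, 3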
+  
+  double* hwe_x_pvals_iter = &(g_hwe_x_pvals[variant_idx]);
+  uint32_t variant_uidx = g_variant_uidx_starts[tidx];
+  uint32_t pct = 0;
+  uint32_t next_print_variant_idx = variant_idx_end;
+  if (!tidx) {
+    next_print_variant_idx = variant_idx_end / 100;
+  }
+  uint32_t male_ref_ct = 0;
+  uint32_t male_alt_ct = 0;
+  for (; variant_idx < variant_idx_end; ++variant_idx, ++variant_uidx) {
+    next_set_unsafe_ck(variant_include, &variant_uidx);
+    const uint32_t* cur_raw_geno_cts = &(founder_raw_geno_cts[(3 * k1LU) * variant_uidx]);
+    uint32_t female_homref_ct = cur_raw_geno_cts[0];
+    uint32_t female_refalt_ct = cur_raw_geno_cts[1];
+    uint32_t female_altalt_ct = cur_raw_geno_cts[2];
+    if (founder_x_male_geno_cts) {
+      const uint32_t* cur_male_geno_cts = &(founder_x_male_geno_cts[(3 * k1LU) * (variant_uidx - x_start)]);
+      male_ref_ct = cur_male_geno_cts[0];
+      female_homref_ct -= male_ref_ct;
+      female_refalt_ct -= cur_male_geno_cts[1];
+      male_alt_ct = cur_male_geno_cts[2];
+      female_altalt_ct -= male_alt_ct;
+    }
+    if (founder_x_nosex_geno_cts) {
+      const uint32_t* cur_nosex_geno_cts = &(founder_x_nosex_geno_cts[(3 * k1LU) * (variant_uidx - x_start)]);
+      female_homref_ct -= cur_nosex_geno_cts[0];
+      female_refalt_ct -= cur_nosex_geno_cts[1];
+      female_altalt_ct -= cur_nosex_geno_cts[2];
+    }
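+    // chrX HWE test: females contribute diploid genotype counts, males
+    // contribute haploid allele counts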
+    *hwe_x_pvals_iter++ = SNPHWEX(female_refalt_ct, female_homref_ct, female_altalt_ct, male_ref_ct, male_alt_ct, hwe_midp);
+    if (variant_idx >= next_print_variant_idx) {
+      if (pct > 10) {
+	putc_unlocked('\b', stdout);
+      }
+      pct = (variant_idx * 100LLU) / variant_idx_end;
+      printf("\b\b%u%%", pct++);
+      fflush(stdout);
+      next_print_variant_idx = (pct * ((uint64_t)variant_idx_end)) / 100;
+    }
+  }
+  if (pct > 10) {
+    putc_unlocked('\b', stdout);
+  }
+  THREAD_RETURN;
+}
+
+pglerr_t compute_hwe_x_pvals(const uintptr_t* variant_include, const uint32_t* founder_raw_geno_cts, const uint32_t* founder_x_male_geno_cts, const uint32_t* founder_x_nosex_geno_cts, uint32_t x_start, uint32_t hwe_x_ct, uint32_t hwe_midp, uint32_t calc_thread_ct, double** hwe_x_pvals_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    assert(hwe_x_ct);
+    if (bigstack_alloc_d(hwe_x_ct, hwe_x_pvals_ptr)) {
+      goto compute_hwe_x_pvals_ret_NOMEM;
+    }
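+    // re-mark the arena after this allocation, so the bigstack_reset() on
+    // exit frees the scratch buffers below but keeps hwe_x_pvals for the
+    // caller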
+    bigstack_mark = g_bigstack_base;
+    g_hwe_x_pvals = *hwe_x_pvals_ptr;
+    
+    if (calc_thread_ct > hwe_x_ct) {
+      calc_thread_ct = hwe_x_ct;
+    }
+    pthread_t* threads = (pthread_t*)bigstack_alloc(calc_thread_ct * sizeof(intptr_t));
+    if (!threads) {
+      goto compute_hwe_x_pvals_ret_NOMEM;
+    }
+    if (bigstack_alloc_ui(calc_thread_ct, &g_variant_uidx_starts)) {
+      goto compute_hwe_x_pvals_ret_NOMEM;
+    }
+    compute_uidx_start_partition(variant_include, hwe_x_ct, calc_thread_ct, x_start, g_variant_uidx_starts);
+    g_variant_include = variant_include;
+    g_founder_raw_geno_cts = founder_raw_geno_cts;
+    g_founder_x_male_geno_cts = founder_x_male_geno_cts;
+    g_founder_x_nosex_geno_cts = founder_x_nosex_geno_cts;
+    g_calc_thread_ct = calc_thread_ct;
+    g_x_start = x_start;
+    g_hwe_x_ct = hwe_x_ct;
+    g_hwe_midp = hwe_midp;
+    LOGPRINTF("Computing chrX Hardy-Weinberg %sp-values... ", hwe_midp? "mid" : "");
+    fputs("0%", stdout);
+    fflush(stdout);
+    if (spawn_threads(compute_hwe_x_pvals_thread, calc_thread_ct, threads)) {
+      goto compute_hwe_x_pvals_ret_THREAD_CREATE_FAIL;
+    }
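+    // the calling thread runs the tidx-0 share itself rather than spawning
+    // an extra worker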
+    compute_hwe_x_pvals_thread((void*)0);
+    join_threads(calc_thread_ct, threads);
+    fputs("\b\b", stdout);
+    logprint("done.\n");
+  }
+  while (0) {
+  compute_hwe_x_pvals_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  compute_hwe_x_pvals_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t hardy_report(const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const uint32_t* founder_raw_geno_cts, const uint32_t* founder_x_male_geno_cts, const uint32_t* founder_x_nosex_geno_cts, const double* hwe_x_pvals, uint32_t variant_ct, uint32_t hwe_x_ct, uint32_t max_allele_slen, double output_min_p, hardy_flags_t hardy_modifier, uint32_t nonfounders, char* outname, c [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  pglerr_t reterr = kPglRetSuccess;
+  cswrite_init_null(&css);
+  {
+    if (cip->haploid_mask[0] & 1) {
+      logerrprint("Error: --hardy is pointless on an all-haploid genome.\n");
+      goto hardy_report_ret_INCONSISTENT_INPUT;
+    }
+    const uint32_t max_chr_blen = get_max_chr_slen(cip) + 1;
+    const uint32_t chr_code_end = cip->max_code + 1 + cip->name_ct;
+    const uint32_t chr_code_endl = BITCT_TO_WORDCT(chr_code_end);
+    unsigned char* overflow_buf;
+    uintptr_t* chr_skips;
+    if (bigstack_alloc_uc(kCompressStreamBlock + max_chr_blen + kMaxIdSlen + 512 + 2 * max_allele_slen, &overflow_buf) ||
+	bigstack_alloc_ul(chr_code_endl, &chr_skips)) {
+      goto hardy_report_ret_NOMEM;
+    }
+    // skip chrX, chrY, chrM here
+    const int32_t mt_code = cip->xymt_codes[kChrOffsetMT];
+    memcpy(chr_skips, cip->haploid_mask, chr_code_endl * sizeof(intptr_t));
+    if (mt_code >= 0) {
+      set_bit(mt_code, chr_skips);
+    }
+    const uint32_t chr_skip_ct = popcount_longs(chr_skips, chr_code_endl);
+    uint32_t variant_skip_ct = 0;
+    uint32_t chr_uidx = 0;
+    for (uint32_t chr_skip_idx = 0; chr_skip_idx < chr_skip_ct; ++chr_skip_idx, ++chr_uidx) {
+      next_set_unsafe_ck(chr_skips, &chr_uidx);
+      if (is_set(cip->chr_mask, chr_uidx)) {
+	const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[chr_uidx];
+	variant_skip_ct += popcount_bit_idx(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], cip->chr_fo_vidx_start[chr_fo_idx + 1]);
+      }
+    }
+    if (variant_skip_ct - hwe_x_ct) {
+      LOGPRINTF("--hardy: Skipping %u haploid/chrM variant%s.\n", variant_skip_ct - hwe_x_ct, (variant_skip_ct - hwe_x_ct == 1)? "" : "s");
+    }
+    variant_ct -= variant_skip_ct;
+    const uint32_t output_zst = hardy_modifier & kfHardyZs;
+    const uint32_t midp = (hardy_modifier / kfHardyMidp) & 1;
+    const uint32_t chr_col = hardy_modifier & kfHardyColChrom;
+    const uint32_t ref_col = hardy_modifier & kfHardyColRef;
+    const uint32_t alt1_col = hardy_modifier & kfHardyColAlt1;
+    const uint32_t alt_col = hardy_modifier & kfHardyColAlt;
+    const uint32_t gcounts = hardy_modifier & (kfHardyColGcounts | kfHardyColGcount1col);
+    const uint32_t gcount_1col = hardy_modifier & kfHardyColGcount1col;
+    const char gcount_delim = gcount_1col? ',' : '\t';
+    const uint32_t hetfreq_cols = hardy_modifier & kfHardyColHetfreq;
+    const uint32_t p_col = hardy_modifier & kfHardyColP;
+    if (variant_ct) {
+      char* outname_end2 = strcpya0(outname_end, ".hardy");
+      if (output_zst) {
+	strcpy(outname_end2, ".zst");
+      }
+      if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+	goto hardy_report_ret_OPEN_FAIL;
+      }
+      cswritep = (char*)overflow_buf;
+      *cswritep++ = '#';
+
+      // includes trailing tab
+      char* chr_buf = nullptr;
+      if (chr_col) {
+	if (bigstack_alloc_c(max_chr_blen, &chr_buf)) {
+	  goto hardy_report_ret_NOMEM;
+	}
+	cswritep = strcpya(cswritep, "CHROM\t");
+      }
+      if (hardy_modifier & kfHardyColPos) {
+	cswritep = strcpya(cswritep, "POS\t");
+      } else {
+	variant_bps = nullptr;
+      }
+      cswritep = strcpya(cswritep, "ID");
+      if (ref_col) {
+	cswritep = strcpya(cswritep, "\tREF");
+      }
+      if (alt1_col) {
+	cswritep = strcpya(cswritep, "\tALT1");
+      }
+      if (alt_col) {
+	cswritep = strcpya(cswritep, "\tALT");
+      }
+      if (gcounts) {
+	if (gcount_1col) {
+	  cswritep = strcpya(cswritep, "\tGCOUNTS");
+	} else {
+	  cswritep = strcpya(cswritep, "\tHOM_REF_CT\tHET_REF_CT\tNONREF_CT");
+	}
+      }
+      if (hetfreq_cols) {
+	cswritep = strcpya(cswritep, "\tO(HET_REF)\tE(HET_REF)");
+      }
+      if (p_col) {
+	*cswritep++ = '\t';
+	if (midp) {
+	  cswritep = strcpya(cswritep, "MIDP");
+	} else {
+	  *cswritep++ = 'P';
+	}
+      }
+      append_binary_eoln(&cswritep);
+      uint32_t variant_uidx = 0;
+      uint32_t chr_fo_idx = 0xffffffffU;
+      uint32_t chr_end = 0;
+      uint32_t chr_buf_blen = 0;
+      uint32_t pct = 0;
+      uint32_t next_print_variant_idx = variant_ct / 100;
+      printf("--hardy%s%s: 0%%", output_zst? " zs" : "", midp? " midp" : "");
+      fflush(stdout);
+      uint32_t cur_allele_ct = 2;
+      for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+	next_set_unsafe_ck(variant_include, &variant_uidx);
+	if (chr_col) {
+	  if (variant_uidx >= chr_end) {
+	    int32_t chr_idx;
+	    do {
+	      ++chr_fo_idx;
+	      chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	      chr_idx = cip->chr_file_order[chr_fo_idx];
+	    } while ((variant_uidx >= chr_end) || is_set(chr_skips, chr_idx));
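+	    // haploid/chrM variants were subtracted from variant_ct but are
+	    // still set in variant_include, so variant_uidx must be resynced
+	    // to the first remaining variant on the new chromosome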
+	    variant_uidx = next_set_unsafe(variant_include, cip->chr_fo_vidx_start[chr_fo_idx]);
+	    char* chr_name_end = chr_name_write(cip, chr_idx, chr_buf);
+	    *chr_name_end = '\t';
+	    chr_buf_blen = 1 + (uintptr_t)(chr_name_end - chr_buf);
+	  }
+	  cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
+	}
+	if (variant_bps) {
+	  cswritep = uint32toa_x(variant_bps[variant_uidx], '\t', cswritep);
+	}
+	cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
+	uintptr_t variant_allele_idx_base = variant_uidx * 2;
+	if (variant_allele_idxs) {
+	  variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	  cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+	}
+	char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+	if (ref_col) {
+	  *cswritep++ = '\t';
+	  cswritep = strcpya(cswritep, cur_alleles[0]);
+	}
+	if (alt1_col) {
+	  *cswritep++ = '\t';
+	  cswritep = strcpya(cswritep, cur_alleles[1]);
+	}
+	if (alt_col) {
+	  *cswritep++ = '\t';
+	  for (uint32_t allele_idx = 1; allele_idx < cur_allele_ct; ++allele_idx) {
+	    if (cswrite(&css, &cswritep)) {
+	      goto hardy_report_ret_WRITE_FAIL;
+	    }
+	    cswritep = strcpyax(cswritep, cur_alleles[allele_idx], ',');
+	  }
+	  --cswritep;
+	}
+	const uint32_t* cur_geno_cts = &(founder_raw_geno_cts[(3 * k1LU) * variant_uidx]);
+	if (gcounts) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa_x(cur_geno_cts[0], gcount_delim, cswritep);
+	  cswritep = uint32toa_x(cur_geno_cts[1], gcount_delim, cswritep);
+	  cswritep = uint32toa(cur_geno_cts[2], cswritep);
+	}
+	if (hetfreq_cols) {
+	  *cswritep++ = '\t';
+	  const uint32_t tot_obs = cur_geno_cts[0] + cur_geno_cts[1] + cur_geno_cts[2];
+	  const double tot_obs_recip = 1.0 / (double)((int32_t)tot_obs);
+	  cswritep = dtoa_g(((int32_t)cur_geno_cts[1]) * tot_obs_recip, cswritep);
+	  *cswritep++ = '\t';
+	  const double dbl_ref_freq = (cur_geno_cts[0] * 2 + cur_geno_cts[1]) * tot_obs_recip;
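+	  // dbl_ref_freq == 2p for ref allele frequency p; under HWE the
+	  // expected het frequency is 2p(1-p) = dbl_ref_freq * (1 - dbl_ref_freq/2)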
+	  const double expected_het_freq = dbl_ref_freq * (1.0 - dbl_ref_freq * 0.5);
+	  cswritep = dtoa_g(expected_het_freq, cswritep);
+	}
+	if (p_col) {
+	  // possible todo: multithread this
+	  *cswritep++ = '\t';
+	  const double hwe_p = SNPHWE2(cur_geno_cts[1], cur_geno_cts[0], cur_geno_cts[2], midp);
+	  cswritep = dtoa_g(MAXV(hwe_p, output_min_p), cswritep);
+	}
+	append_binary_eoln(&cswritep);
+	if (cswrite(&css, &cswritep)) {
+	  goto hardy_report_ret_WRITE_FAIL;
+	}
+	if (variant_idx >= next_print_variant_idx) {
+	  if (pct > 10) {
+	    putc_unlocked('\b', stdout);
+	  }
+	  pct = (variant_idx * 100LLU) / variant_ct;
+	  printf("\b\b%u%%", pct++);
+	  fflush(stdout);
+	  next_print_variant_idx = (pct * ((uint64_t)variant_ct)) / 100;
+	}
+      }
+      if (cswrite_close_null(&css, cswritep)) {
+	goto hardy_report_ret_WRITE_FAIL;
+      }
+      putc_unlocked('\r', stdout);
+      LOGPRINTFWW("--hardy%s%s: Autosomal Hardy-Weinberg report (%s) written to %s .\n", output_zst? " zs" : "", midp? " midp" : "", nonfounders? "all samples" : "founders only", outname);
+    }
+    if (hwe_x_ct) {
+      bigstack_reset(chr_skips);
+      char* outname_end2 = strcpya0(outname_end, ".hardy.x");
+      if (output_zst) {
+	strcpy(outname_end2, ".zst");
+      }
+      if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+	goto hardy_report_ret_OPEN_FAIL;
+      }
+      cswritep = (char*)overflow_buf;
+      *cswritep++ = '#';
+
+      // includes trailing tab
+      char x_name_buf[8];
+      uint32_t x_name_blen = 0;
+      const int32_t x_code = cip->xymt_codes[kChrOffsetX];
+      if (chr_col) {
+	cswritep = strcpya(cswritep, "CHROM\t");
+	char* write_iter = chr_name_write(cip, x_code, x_name_buf);
+	*write_iter++ = '\t';
+	x_name_blen = (uintptr_t)(write_iter - x_name_buf);
+      }
+      if (hardy_modifier & kfHardyColPos) {
+	cswritep = strcpya(cswritep, "POS\t");
+      } else {
+	variant_bps = nullptr;
+      }
+      cswritep = strcpya(cswritep, "ID");
+      if (ref_col) {
+	cswritep = strcpya(cswritep, "\tREF");
+      }
+      if (alt1_col) {
+	cswritep = strcpya(cswritep, "\tALT1");
+      }
+      if (alt_col) {
+	cswritep = strcpya(cswritep, "\tALT");
+      }
+      if (gcounts) {
+	if (gcount_1col) {
+	  cswritep = strcpya(cswritep, "\tGCOUNTS");
+	} else {
+	  cswritep = strcpya(cswritep, "\tFEMALE_HOM_REF_CT\tFEMALE_HET_REF_CT\tFEMALE_NONREF_CT\tMALE_REF_CT\tMALE_ALT_CT");
+	}
+      }
+      if (hetfreq_cols) {
+	cswritep = strcpya(cswritep, "\tO(FEMALE_HET_REF)\tE(FEMALE_HET_REF)");
+      }
+      const uint32_t sexaf_cols = hardy_modifier & kfHardyColSexaf;
+      if (sexaf_cols) {
+	cswritep = strcpya(cswritep, "\tFEMALE_REF_FREQ\tMALE_REF_FREQ");
+      }
+      const uint32_t femalep_col = hardy_modifier & kfHardyColFemalep;
+      if (femalep_col) {
+	cswritep = strcpya(cswritep, "\tFEMALE_ONLY_");
+	if (midp) {
+	  cswritep = strcpya(cswritep, "MIDP");
+	} else {
+	  *cswritep++ = 'P';
+	}
+      }
+      if (p_col) {
+	*cswritep++ = '\t';
+	if (midp) {
+	  cswritep = strcpya(cswritep, "MIDP");
+	} else {
+	  *cswritep++ = 'P';
+	}
+      }
+      append_binary_eoln(&cswritep);
+      fputs("--hardy: Writing chrX results...", stdout);
+      fflush(stdout);
+      const uint32_t x_chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)x_code];
+      const uint32_t x_start = cip->chr_fo_vidx_start[x_chr_fo_idx];
+      uint32_t variant_uidx = x_start;
+      uint32_t cur_allele_ct = 2;
+      uint32_t male_ref_ct = 0;
+      uint32_t male_alt_ct = 0;
+      for (uint32_t variant_idx = 0; variant_idx < hwe_x_ct; ++variant_idx, ++variant_uidx) {
+	next_set_unsafe_ck(variant_include, &variant_uidx);
+	cswritep = memcpya(cswritep, x_name_buf, x_name_blen);
+	if (variant_bps) {
+	  cswritep = uint32toa_x(variant_bps[variant_uidx], '\t', cswritep);
+	}
+	cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
+	uintptr_t variant_allele_idx_base = variant_uidx * 2;
+	if (variant_allele_idxs) {
+	  variant_allele_idx_base = variant_allele_idxs[variant_uidx];
+	  cur_allele_ct = variant_allele_idxs[variant_uidx + 1] - variant_allele_idx_base;
+	}
+	char** cur_alleles = &(allele_storage[variant_allele_idx_base]);
+	if (ref_col) {
+	  *cswritep++ = '\t';
+	  cswritep = strcpya(cswritep, cur_alleles[0]);
+	}
+	if (alt1_col) {
+	  *cswritep++ = '\t';
+	  cswritep = strcpya(cswritep, cur_alleles[1]);
+	}
+	if (alt_col) {
+	  *cswritep++ = '\t';
+	  for (uint32_t allele_idx = 1; allele_idx < cur_allele_ct; ++allele_idx) {
+	    if (cswrite(&css, &cswritep)) {
+	      goto hardy_report_ret_WRITE_FAIL;
+	    }
+	    cswritep = strcpyax(cswritep, cur_alleles[allele_idx], ',');
+	  }
+	  --cswritep;
+	}
+	const uint32_t* cur_geno_cts = &(founder_raw_geno_cts[(3 * k1LU) * variant_uidx]);
+	uint32_t female_homref_ct = cur_geno_cts[0];
+	uint32_t female_refalt_ct = cur_geno_cts[1];
+	uint32_t female_altalt_ct = cur_geno_cts[2];
+	if (founder_x_male_geno_cts) {
+	  const uint32_t* cur_male_geno_cts = &(founder_x_male_geno_cts[(3 * k1LU) * (variant_uidx - x_start)]);
+	  male_ref_ct = cur_male_geno_cts[0];
+	  female_homref_ct -= male_ref_ct;
+	  female_refalt_ct -= cur_male_geno_cts[1];
+	  male_alt_ct = cur_male_geno_cts[2];
+	  female_altalt_ct -= male_alt_ct;
+	}
+	if (founder_x_nosex_geno_cts) {
+	  const uint32_t* cur_nosex_geno_cts = &(founder_x_nosex_geno_cts[(3 * k1LU) * (variant_uidx - x_start)]);
+	  female_homref_ct -= cur_nosex_geno_cts[0];
+	  female_refalt_ct -= cur_nosex_geno_cts[1];
+	  female_altalt_ct -= cur_nosex_geno_cts[2];
+	}
+	if (gcounts) {
+	  *cswritep++ = '\t';
+	  cswritep = uint32toa_x(female_homref_ct, gcount_delim, cswritep);
+	  cswritep = uint32toa_x(female_refalt_ct, gcount_delim, cswritep);
+	  cswritep = uint32toa_x(female_altalt_ct, gcount_delim, cswritep);
+	  cswritep = uint32toa_x(male_ref_ct, gcount_delim, cswritep);
+	  cswritep = uint32toa(male_alt_ct, cswritep);
+	}
+	if (hetfreq_cols || sexaf_cols) {
+	  const uint32_t tot_female_obs = female_homref_ct + female_refalt_ct + female_altalt_ct;
+	  const double tot_female_obs_recip = 1.0 / (double)((int32_t)tot_female_obs);
+	  const double dbl_ref_freq = (female_homref_ct * 2 + female_refalt_ct) * tot_female_obs_recip;
+	  const double ref_freq = dbl_ref_freq * 0.5;
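+	  // among females, dbl_ref_freq == 2p, so the expected het frequency
+	  // under HWE is 2p(1-p) = dbl_ref_freq * (1 - ref_freq)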
+	  if (hetfreq_cols) {
+	    *cswritep++ = '\t';
+	    cswritep = dtoa_g(((int32_t)female_refalt_ct) * tot_female_obs_recip, cswritep);
+	    *cswritep++ = '\t';
+	    const double expected_het_freq = dbl_ref_freq * (1.0 - ref_freq);
+	    cswritep = dtoa_g(expected_het_freq, cswritep);
+	  }
+	  if (sexaf_cols) {
+	    *cswritep++ = '\t';
+	    cswritep = dtoa_g(ref_freq, cswritep);
+	    *cswritep++ = '\t';
+	    const double male_ref_freq = ((double)((int32_t)male_ref_ct)) / ((double)((int32_t)(male_ref_ct + male_alt_ct)));
+	    cswritep = dtoa_g(male_ref_freq, cswritep);
+	  }
+	}
+	if (femalep_col) {
+	  *cswritep++ = '\t';
+	  const double female_hwe_p = SNPHWE2(female_refalt_ct, female_homref_ct, female_altalt_ct, midp);
+	  cswritep = dtoa_g(MAXV(female_hwe_p, output_min_p), cswritep);
+	}
+	if (p_col) {
+	  *cswritep++ = '\t';
+	  cswritep = dtoa_g(MAXV(hwe_x_pvals[variant_idx], output_min_p), cswritep);
+	}
+	append_binary_eoln(&cswritep);
+	if (cswrite(&css, &cswritep)) {
+	  goto hardy_report_ret_WRITE_FAIL;
+	}
+      }
+      if (cswrite_close_null(&css, cswritep)) {
+	goto hardy_report_ret_WRITE_FAIL;
+      }
+      putc_unlocked('\r', stdout);
+      LOGPRINTFWW("--hardy%s%s: chrX Hardy-Weinberg report (%s) written to %s .\n", output_zst? " zs" : "", midp? " midp" : "", nonfounders? "all samples" : "founders only", outname);
+    }
+  }
+  while (0) {
+  hardy_report_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  hardy_report_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  hardy_report_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  hardy_report_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+  cswrite_close_cond(&css, cswritep);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+pglerr_t write_snplist(const uintptr_t* variant_include, char** variant_ids, uint32_t variant_ct, uint32_t output_zst, char* outname, char* outname_end) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  char* cswritep = nullptr;
+  compress_stream_state_t css;
+  pglerr_t reterr = kPglRetSuccess;
+  cswrite_init_null(&css);
+  {
+    unsigned char* overflow_buf;
+    if (bigstack_alloc_uc(kCompressStreamBlock + kMaxIdSlen + 2, &overflow_buf)) {
+      goto write_snplist_ret_NOMEM;
+    }
+    char* outname_end2 = strcpya0(outname_end, ".snplist");
+    if (output_zst) {
+      strcpy(outname_end2, ".zst");
+    }
+    if (cswrite_init(outname, 0, output_zst, overflow_buf, &css)) {
+      goto write_snplist_ret_OPEN_FAIL;
+    }
+    cswritep = (char*)overflow_buf;
+    uint32_t variant_uidx = 0;
+    for (uint32_t variant_idx = 0; variant_idx < variant_ct; ++variant_idx, ++variant_uidx) {
+      next_set_unsafe_ck(variant_include, &variant_uidx);
+      cswritep = strcpya(cswritep, variant_ids[variant_uidx]);
+      append_binary_eoln(&cswritep);
+      if (cswrite(&css, &cswritep)) {
+	goto write_snplist_ret_WRITE_FAIL;
+      }
+    }
+    if (cswrite_close_null(&css, cswritep)) {
+      goto write_snplist_ret_WRITE_FAIL;
+    }
+    LOGPRINTFWW("--write-snplist%s: Variant IDs written to %s .\n", output_zst? " zs" : "", outname);
+  }
+  while (0) {
+  write_snplist_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  write_snplist_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  write_snplist_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  }
+  cswrite_close_cond(&css, cswritep);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+// similar to write_psam().
+pglerr_t write_covar(const uintptr_t* sample_include, const char* sample_ids, const char* sids, const char* paternal_ids, const char* maternal_ids, const uintptr_t* sex_nm, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const char* pheno_names, const pheno_col_t* covar_cols, const char* covar_names, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uintptr_t max_paternal_id_blen, uintptr_t max_maternal_id_blen, uin [...]
+  unsigned char* bigstack_mark = g_bigstack_base;
+  FILE* outfile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    strcpy(outname_end, ".cov");
+    if (fopen_checked(outname, FOPEN_WB, &outfile)) {
+      goto write_covar_ret_OPEN_FAIL;
+    }
+    const char* output_missing_pheno = g_output_missing_pheno;
+    const uint32_t omp_slen = strlen(output_missing_pheno);
+    
+    char* textbuf = g_textbuf;
+    char* textbuf_flush = &(textbuf[kMaxMediumLine]);
+
+    const uint32_t write_sid = sid_col_required(sample_include, sids, sample_ct, max_sid_blen, write_covar_flags / kfWriteCovarColMaybesid);
+    uint32_t write_parents = 0;
+    if (write_covar_flags & kfWriteCovarColParents) {
+      write_parents = 1;
+    } else if (write_covar_flags & kfWriteCovarColMaybeparents) {
+      write_parents = is_parental_info_present(sample_include, paternal_ids, maternal_ids, sample_ct, max_paternal_id_blen, max_maternal_id_blen);
+    }
+    const uint32_t write_sex = (write_covar_flags / kfWriteCovarColSex) & 1;
+    const uint32_t write_empty_pheno = (write_covar_flags & kfWriteCovarColPheno1) && (!pheno_ct);
+    const uint32_t write_phenos = (write_covar_flags & (kfWriteCovarColPheno1 | kfWriteCovarColPhenos)) && pheno_ct;
+    if (write_phenos && (!(write_covar_flags & kfWriteCovarColPhenos))) {
+      pheno_ct = 1;
+    }
+    char* write_iter = strcpya(textbuf, "#FID\tIID");
+    if (write_sid) {
+      write_iter = strcpya(write_iter, "\tSID");
+    }
+    if (write_parents) {
+      write_iter = strcpya(write_iter, "\tPAT\tMAT");
+    }
+    if (write_sex) {
+      write_iter = strcpya(write_iter, "\tSEX");
+    }
+    if (write_phenos || write_empty_pheno || write_sex) {
+      // verify that no names are duplicated
+      uint32_t* covar_name_htable;
+      uint32_t covar_name_htable_size;
+      if (htable_good_size_alloc(covar_ct + write_sex, bigstack_left(), &covar_name_htable, &covar_name_htable_size)) {
+	goto write_covar_ret_NOMEM;
+      }
+      // shouldn't be possible for this to fail
+      populate_strbox_htable(covar_names, covar_ct, max_covar_name_blen, covar_name_htable_size, covar_name_htable);
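+      // in the probe loops below, 0xffffffffU marks an empty slot, and a
+      // stored value of covar_ct marks the synthetic SEX entry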
+      uint32_t max_xcovar_name_blen = max_covar_name_blen;
+      if (write_sex) {
+	// add "SEX"
+	uint32_t hashval = hashceil("SEX", 3, covar_name_htable_size);
+	while (1) {
+	  const uint32_t cur_htable_entry = covar_name_htable[hashval];
+	  if (cur_htable_entry == 0xffffffffU) {
+	    covar_name_htable[hashval] = covar_ct;
+	    break;
+	  }
+	  if (!memcmp("SEX", &(covar_names[cur_htable_entry * max_covar_name_blen]), 4)) {
+	    logerrprint("Error: .cov file cannot have both a regular SEX column and a covariate named\n'SEX'.  Exclude or rename one of these columns.\n");
+	    goto write_covar_ret_INCONSISTENT_INPUT;
+	  }
+	  if (++hashval == covar_name_htable_size) {
+	    hashval = 0;
+	  }
+	}
+	if (max_xcovar_name_blen < 4) {
+	  max_xcovar_name_blen = 4;
+	}
+      }
+      if (write_phenos) {
+	const char* pheno_name_iter = pheno_names;
+	for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	  *write_iter++ = '\t';
+	  const uint32_t cur_pheno_name_slen = strlen(pheno_name_iter);
+	  if (cur_pheno_name_slen < max_xcovar_name_blen) {
+	    const uint32_t cur_pheno_name_blen = cur_pheno_name_slen + 1;
+	    uint32_t hashval = hashceil(pheno_name_iter, cur_pheno_name_slen, covar_name_htable_size);
+	    while (1) {
+	      uint32_t cur_htable_idval = covar_name_htable[hashval];
+	      if (cur_htable_idval >= covar_ct) {
+		if (cur_htable_idval == 0xffffffffU) {
+		  break;
+		}
+		if (!memcmp(pheno_name_iter, "SEX", 4)) {
+		  logerrprint(write_sex? "Error: .cov file cannot have both a regular SEX column and a phenotype named\n'SEX'.  Exclude or rename one of these columns.\n" : "Error: .cov file cannot have a phenotype and a covariate with the same name.\n");
+		  goto write_covar_ret_INCONSISTENT_INPUT;
+		}
+	      } else {
+		if (!memcmp(pheno_name_iter, &(covar_names[cur_htable_idval * max_covar_name_blen]), cur_pheno_name_blen)) {
+		  logerrprint("Error: .cov file cannot have a phenotype and a covariate with the same name.\n");
+		  goto write_covar_ret_INCONSISTENT_INPUT;
+		}
+	      }
+	      if (++hashval == covar_name_htable_size) {
+		hashval = 0;
+	      }
+	    }
+	  }
+	  write_iter = memcpya(write_iter, pheno_name_iter, cur_pheno_name_slen);
+	  pheno_name_iter = &(pheno_name_iter[max_pheno_name_blen]);
+	  if (write_iter >= textbuf_flush) {
+	    if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	      goto write_covar_ret_WRITE_FAIL;
+	    }
+	    write_iter = textbuf;
+	  }
+	}
+      } else if (write_empty_pheno) {
+	if (max_covar_name_blen > 6) {
+	  uint32_t hashval = hashceil("PHENO1", 6, covar_name_htable_size);
+	  while (1) {
+	    uint32_t cur_htable_idval = covar_name_htable[hashval];
+	    if (cur_htable_idval >= covar_ct) {
+	      if (cur_htable_idval == 0xffffffffU) {
+		break;
+	      }
+	    } else {
+	      if (!memcmp("PHENO1", &(covar_names[cur_htable_idval * max_covar_name_blen]), 7)) {
+		logerrprint("Error: .cov file cannot have a phenotype and a covariate with the same name.\n");
+		goto write_covar_ret_INCONSISTENT_INPUT;
+	      }
+	    }
+	    if (++hashval == covar_name_htable_size) {
+	      hashval = 0;
+	    }
+	  }
+	}
+	write_iter = strcpya(write_iter, "\tPHENO1");
+      }
+    }
+    for (uint32_t covar_idx = 0; covar_idx < covar_ct; ++covar_idx) {
+      *write_iter++ = '\t';
+      const char* cur_covar_name = &(covar_names[covar_idx * max_covar_name_blen]);
+      const uint32_t cur_covar_name_slen = strlen(cur_covar_name);
+      write_iter = memcpya(write_iter, cur_covar_name, cur_covar_name_slen);
+      if (write_iter >= textbuf_flush) {
+	if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	  goto write_covar_ret_WRITE_FAIL;
+	}
+	write_iter = textbuf;
+      }
+    }
+    append_binary_eoln(&write_iter);
+
+    uintptr_t sample_uidx = 0;
+    uint32_t sample_uidx2 = 0;
+    // not really necessary to make sample_uidx increment dependent on
+    // new_sample_idx_to_old == nullptr
+    for (uint32_t sample_idx = 0; sample_idx < sample_ct; ++sample_idx, ++sample_uidx) {
+      if (!new_sample_idx_to_old) {
+	next_set_ul_unsafe_ck(sample_include, &sample_uidx);
+      } else {
+	do {
+	  sample_uidx = new_sample_idx_to_old[sample_uidx2++];
+	} while (!IS_SET(sample_include, sample_uidx));
+      }
+      write_iter = strcpya(write_iter, &(sample_ids[max_sample_id_blen * sample_uidx]));
+      if (write_sid) {
+	*write_iter++ = '\t';
+	if (sids) {
+	  write_iter = strcpya(write_iter, &(sids[max_sid_blen * sample_uidx]));
+	} else {
+	  *write_iter++ = '0';
+	}
+      }
+      if (write_parents) {
+	*write_iter++ = '\t';
+	write_iter = strcpyax(write_iter, &(paternal_ids[max_paternal_id_blen * sample_uidx]), '\t');
+	write_iter = strcpya(write_iter, &(maternal_ids[max_maternal_id_blen * sample_uidx]));
+      }
+      if (write_sex) {
+	*write_iter++ = '\t';
+	if (IS_SET(sex_nm, sample_uidx)) {
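+	  // '2' - is_male: males encode as '1', females as '2'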
+	  *write_iter++ = '2' - IS_SET(sex_male, sample_uidx);
+	} else {
+	  // this is better than '0' since it allows the raw column to be used
+	  // as --covar input
+	  // (can't do this for .fam export, though: not worth the
+	  // compatibility issues)
+	  write_iter = strcpya(write_iter, "NA");
+	}
+      }
+      if (write_phenos) {
+	for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	  *write_iter++ = '\t';
+	  write_iter = append_pheno_str(&(pheno_cols[pheno_idx]), output_missing_pheno, omp_slen, sample_uidx, write_iter);
+	  if (write_iter >= textbuf_flush) {
+	    if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	      goto write_covar_ret_WRITE_FAIL;
+	    }
+	    write_iter = textbuf;
+	  }
+	}
+      } else {
+	if (write_empty_pheno) {
+	  *write_iter++ = '\t';
+	  write_iter = memcpya(write_iter, output_missing_pheno, omp_slen);
+	}
+	if (write_iter >= textbuf_flush) {
+	  if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	    goto write_covar_ret_WRITE_FAIL;
+	  }
+	  write_iter = textbuf;
+	}	
+      }
+      for (uint32_t covar_idx = 0; covar_idx < covar_ct; ++covar_idx) {
+	*write_iter++ = '\t';
+	write_iter = append_pheno_str(&(covar_cols[covar_idx]), output_missing_pheno, omp_slen, sample_uidx, write_iter);
+	if (write_iter >= textbuf_flush) {
+	  if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	    goto write_covar_ret_WRITE_FAIL;
+	  }
+	  write_iter = textbuf;
+	}
+      }
+      append_binary_eoln(&write_iter);
+    }
+    if (write_iter != textbuf) {
+      if (fwrite_checked(textbuf, write_iter - textbuf, outfile)) {
+	goto write_covar_ret_WRITE_FAIL;
+      }
+    }
+    if (fclose_null(&outfile)) {
+      goto write_covar_ret_WRITE_FAIL;
+    }
+    LOGPRINTFWW("Covariates written to %s.\n", outname);
+  }
+  while (0) {
+  write_covar_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  write_covar_ret_OPEN_FAIL:
+    reterr = kPglRetOpenFail;
+    break;
+  write_covar_ret_WRITE_FAIL:
+    reterr = kPglRetWriteFail;
+    break;
+  write_covar_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+  fclose_cond(outfile);
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/plink2_misc.h b/plink2_misc.h
new file mode 100644
index 0000000..9b45a8f
--- /dev/null
+++ b/plink2_misc.h
@@ -0,0 +1,202 @@
+#ifndef __PLINK2_MISC_H__
+#define __PLINK2_MISC_H__
+
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_common.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+FLAGSET_DEF_START()
+  kfPhenoTransform0,
+  kfPhenoTransformSplitCat = (1 << 0),
+  kfPhenoTransformSplitCatOmitLast = (1 << 1),
+  kfPhenoTransformSplitCatCovar01 = (1 << 2),
+  kfPhenoTransformVstdCovar = (1 << 3),
+  kfPhenoTransformVstdAll = (1 << 4),
+  kfPhenoTransformQuantnormPheno = (1 << 5),
+  kfPhenoTransformQuantnormCovar = (1 << 6),
+  kfPhenoTransformQuantnormAll = (1 << 7),
+FLAGSET_DEF_END(pheno_transform_flags_t);
+
+FLAGSET_DEF_START()
+  kfWriteCovar0,
+  kfWriteCovarColMaybesid = (1 << 0),
+  kfWriteCovarColSid = (1 << 1),
+  kfWriteCovarColMaybeparents = (1 << 2),
+  kfWriteCovarColParents = (1 << 3),
+  kfWriteCovarColSex = (1 << 4),
+  kfWriteCovarColPheno1 = (1 << 5),
+  kfWriteCovarColPhenos = (1 << 6),
+  kfWriteCovarColDefault = kfWriteCovarColMaybesid,
+  kfWriteCovarColAll = ((kfWriteCovarColPhenos * 2) - kfWriteCovarColMaybesid)
+FLAGSET_DEF_END(write_covar_flags_t);
+
+FLAGSET_DEF_START()
+  kfAlleleFreq0,
+  kfAlleleFreqZs = (1 << 0),
+  kfAlleleFreqCounts = (1 << 1),
+  kfAlleleFreqBinsRefFname = (1 << 2),
+  kfAlleleFreqBinsAlt1Fname = (1 << 3),
+  kfAlleleFreqBinsOnly = (1 << 4),
+
+  kfAlleleFreqColChrom = (1 << 5),
+  kfAlleleFreqColPos = (1 << 6),
+  kfAlleleFreqColRef = (1 << 7),
+  kfAlleleFreqColAlt1 = (1 << 8),
+  kfAlleleFreqColAlt = (1 << 9),
+  kfAlleleFreqColReffreq = (1 << 10),
+  kfAlleleFreqColAlt1freq = (1 << 11),
+  kfAlleleFreqColAltfreq = (1 << 12),
+  kfAlleleFreqColFreq = (1 << 13),
+  kfAlleleFreqColEq = (1 << 14),
+  kfAlleleFreqColEqz = (1 << 15),
+  kfAlleleFreqColAlteq = (1 << 16),
+  kfAlleleFreqColAlteqz = (1 << 17),
+  kfAlleleFreqColNumeq = (1 << 18),
+  kfAlleleFreqColAltnumeq = (1 << 19),
+  kfAlleleFreqColMachR2 = (1 << 20),
+  kfAlleleFreqColNobs = (1 << 21),
+  kfAlleleFreqColDefault = (kfAlleleFreqColChrom | kfAlleleFreqColRef | kfAlleleFreqColAlt | kfAlleleFreqColAltfreq | kfAlleleFreqColNobs),
+  kfAlleleFreqColAll = ((kfAlleleFreqColNobs * 2) - kfAlleleFreqColChrom),
+  // only mutual exclusion is altfreq/freq/eq/eqz/alteq/alteqz/numeq/altnumeq
+  // don't force alt1freq/altfreq mutual exclusion since the former plays a bit
+  // better with shell scripts
+  // alt+alteqz is a bit silly, but I won't bother prohibiting it
+  kfAlleleFreqColMutex = ((kfAlleleFreqColAltnumeq * 2) - kfAlleleFreqColAltfreq)
+FLAGSET_DEF_END(allele_freq_t);
+
+FLAGSET_DEF_START()
+  kfMissingRpt0,
+  kfMissingRptZs = (1 << 0),
+  kfMissingRptSampleOnly = (1 << 1),
+  kfMissingRptVariantOnly = (1 << 2),
+  
+  kfMissingRptScolMaybesid = (1 << 3),
+  kfMissingRptScolSid = (1 << 4),
+  kfMissingRptScolMisspheno1 = (1 << 5),
+  kfMissingRptScolMissphenos = (1 << 6),
+  kfMissingRptScolNmissDosage = (1 << 7),
+  kfMissingRptScolNmiss = (1 << 8),
+  kfMissingRptScolNmissHh = (1 << 9),
+  kfMissingRptScolHethap = (1 << 10),
+  kfMissingRptScolNobs = (1 << 11),
+  kfMissingRptScolFmissDosage = (1 << 12),
+  kfMissingRptScolFmiss = (1 << 13),
+  kfMissingRptScolFmissHh = (1 << 14),
+  kfMissingRptScolDefault = (kfMissingRptScolMaybesid | kfMissingRptScolMissphenos | kfMissingRptScolNmiss | kfMissingRptScolNobs | kfMissingRptScolFmiss),
+  kfMissingRptScolAll = ((kfMissingRptScolFmissHh * 2) - kfMissingRptScolMaybesid),
+
+  kfMissingRptVcolChrom = (1 << 15),
+  kfMissingRptVcolPos = (1 << 16),
+  kfMissingRptVcolRef = (1 << 17),
+  kfMissingRptVcolAlt1 = (1 << 18),
+  kfMissingRptVcolAlt = (1 << 19),
+  kfMissingRptVcolNmissDosage = (1 << 20),
+  kfMissingRptVcolNmiss = (1 << 21),
+  kfMissingRptVcolNmissHh = (1 << 22),
+  kfMissingRptVcolHethap = (1 << 23),
+  kfMissingRptVcolNobs = (1 << 24),
+  kfMissingRptVcolFmissDosage = (1 << 25),
+  kfMissingRptVcolFmiss = (1 << 26),
+  kfMissingRptVcolFmissHh = (1 << 27),
+  kfMissingRptVcolFhethap = (1 << 28),
+  kfMissingRptVcolDefault = (kfMissingRptVcolChrom | kfMissingRptVcolNmiss | kfMissingRptVcolNobs | kfMissingRptVcolFmiss),
+  kfMissingRptVcolAll = ((kfMissingRptVcolFhethap * 2) - kfMissingRptVcolChrom)
+FLAGSET_DEF_END(missing_rpt_t);
+
+FLAGSET_DEF_START()
+  kfGenoCounts0,
+  kfGenoCountsZs = (1 << 0),
+  
+  kfGenoCountsColChrom = (1 << 1),
+  kfGenoCountsColPos = (1 << 2),
+  kfGenoCountsColRef = (1 << 3),
+  kfGenoCountsColAlt1 = (1 << 4),
+  kfGenoCountsColAlt = (1 << 5),
+  kfGenoCountsColHomref = (1 << 6),
+  kfGenoCountsColRefalt1 = (1 << 7),
+  kfGenoCountsColRefalt = (1 << 8),
+  kfGenoCountsColHomalt1 = (1 << 9),
+  kfGenoCountsColAltxy = (1 << 10),
+  kfGenoCountsColXy = (1 << 11),
+  kfGenoCountsColHapref = (1 << 12),
+  kfGenoCountsColHapalt1 = (1 << 13),
+  kfGenoCountsColHapalt = (1 << 14),
+  kfGenoCountsColHap = (1 << 15),
+  kfGenoCountsColNumeq = (1 << 16),
+  kfGenoCountsColMissing = (1 << 17),
+  kfGenoCountsColNobs = (1 << 18),
+  kfGenoCountsColDefault = (kfGenoCountsColChrom | kfGenoCountsColRef | kfGenoCountsColAlt | kfGenoCountsColHomref | kfGenoCountsColRefalt | kfGenoCountsColAltxy | kfGenoCountsColHapref | kfGenoCountsColHapalt | kfGenoCountsColMissing),
+  kfGenoCountsColAll = ((kfGenoCountsColNobs * 2) - kfGenoCountsColChrom),
+  
+  kfGenoCountsColPairex = (kfGenoCountsColHapalt | kfGenoCountsColHap),
+  kfGenoCountsColMutex = (kfGenoCountsColAltxy | kfGenoCountsColXy | kfGenoCountsColNumeq)
+FLAGSET_DEF_END(geno_counts_t);
+
+FLAGSET_DEF_START()
+  kfHardy0,
+  kfHardyZs = (1 << 0),
+  kfHardyMidp = (1 << 1),
+  
+  kfHardyColChrom = (1 << 2),
+  kfHardyColPos = (1 << 3),
+  kfHardyColRef = (1 << 4),
+  kfHardyColAlt1 = (1 << 5),
+  kfHardyColAlt = (1 << 6),
+  kfHardyColGcounts = (1 << 7),
+  kfHardyColGcount1col = (1 << 8),
+  kfHardyColHetfreq = (1 << 9),
+  kfHardyColSexaf = (1 << 10),
+  kfHardyColFemalep = (1 << 11),
+  kfHardyColP = (1 << 12),
+  kfHardyColDefault = (kfHardyColChrom | kfHardyColRef | kfHardyColAlt | kfHardyColGcounts | kfHardyColHetfreq | kfHardyColSexaf | kfHardyColP),
+  kfHardyColAll = ((kfHardyColP * 2) - kfHardyColChrom)
+FLAGSET_DEF_END(hardy_flags_t);
+
+pglerr_t plink1_cluster_import(const char* within_fname, const char* catpheno_name, const char* family_missing_catname, const uintptr_t* sample_include, const char* sample_ids, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t max_sample_id_blen, uint32_t mwithin_val, pheno_col_t** pheno_cols_ptr, char** pheno_names_ptr, uint32_t* pheno_ct_ptr, uintptr_t* max_pheno_name_blen_ptr);
+
+pglerr_t update_sample_sexes(const char* update_sex_fname, const uintptr_t* sample_include, char* sample_ids, uint32_t raw_sample_ct, uintptr_t sample_ct, uintptr_t max_sample_id_blen, uint32_t update_sex_colm2, uintptr_t* sex_nm, uintptr_t* sex_male);
+
+pglerr_t split_cat_pheno(const char* split_cat_phenonames_flattened, const uintptr_t* sample_include, uint32_t raw_sample_ct, pheno_transform_flags_t pheno_transform_flags, pheno_col_t** pheno_cols_ptr, char** pheno_names_ptr, uint32_t* pheno_ct_ptr, uintptr_t* max_pheno_name_blen_ptr, pheno_col_t** covar_cols_ptr, char** covar_names_ptr, uint32_t* covar_ct_ptr, uintptr_t* max_covar_name_blen_ptr);
+
+pglerr_t pheno_variance_standardize(const char* vstd_flattened, const uintptr_t* sample_include, const char* pheno_names, uint32_t raw_sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t is_covar, uint32_t is_covar_flag, pheno_col_t* pheno_cols);
+
+pglerr_t pheno_quantile_normalize(const char* quantnorm_flattened, const uintptr_t* sample_include, const char* pheno_names, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t is_covar, uint32_t is_subset_flag, pheno_col_t* pheno_cols);
+
+pglerr_t write_allele_freqs(const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const uint64_t* founder_allele_dosages, const double* mach_r2_vals, const char* ref_binstr, const char* alt1_binstr, uint32_t variant_ct, uint32_t max_alt_allele_ct, uint32_t max_allele_slen, allele_freq_t allele_freq_modifier, uint32_t nonfounders, char* outname, char* outname_end);
+
+pglerr_t write_geno_counts(const uintptr_t* sample_include, const uintptr_t* sex_male, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const uint32_t* raw_geno_cts, const uint32_t* x_male_geno_cts, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t male_ct, uint32_t variant_ct, uint32_t x_start, uint32_t max_allele_slen, geno_counts_t geno_counts_modifier, pgen_reader_ [...]
+
+pglerr_t write_missingness_reports(const uintptr_t* sample_include, const uintptr_t* sex_male, const char* sample_ids, const char* sids, const pheno_col_t* pheno_cols, const char* pheno_names, const uint32_t* sample_missing_hc_cts, const uint32_t* sample_missing_dosage_cts, const uint32_t* sample_hethap_cts, const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const uint32_t* [...]
+
+pglerr_t compute_hwe_x_pvals(const uintptr_t* variant_include, const uint32_t* founder_raw_geno_cts, const uint32_t* founder_x_male_geno_cts, const uint32_t* founder_x_nosex_geno_cts, uint32_t x_start, uint32_t hwe_x_ct, uint32_t hwe_midp, uint32_t calc_thread_ct, double** hwe_x_pvals_ptr);
+
+pglerr_t hardy_report(const uintptr_t* variant_include, const chr_info_t* cip, const uint32_t* variant_bps, char** variant_ids, const uintptr_t* variant_allele_idxs, char** allele_storage, const uint32_t* founder_raw_geno_cts, const uint32_t* founder_x_male_geno_cts, const uint32_t* founder_x_nosex_geno_cts, const double* hwe_x_pvals, uint32_t variant_ct, uint32_t hwe_x_ct, uint32_t max_allele_slen, double output_min_p, hardy_flags_t hardy_modifier, uint32_t nonfounders, char* outname, c [...]
+
+pglerr_t write_snplist(const uintptr_t* variant_include, char** variant_ids, uint32_t variant_ct, uint32_t output_zst, char* outname, char* outname_end);
+
+pglerr_t write_covar(const uintptr_t* sample_include, const char* sample_ids, const char* sids, const char* paternal_ids, const char* maternal_ids, const uintptr_t* sex_nm, const uintptr_t* sex_male, const pheno_col_t* pheno_cols, const char* pheno_names, const pheno_col_t* covar_cols, const char* covar_names, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, uintptr_t max_sample_id_blen, uintptr_t max_sid_blen, uintptr_t max_paternal_id_blen, uintptr_t max_maternal_id_blen, uin [...]
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+
+#endif // __PLINK2_MISC_H__
diff --git a/plink2_psam.cpp b/plink2_psam.cpp
new file mode 100644
index 0000000..7433e68
--- /dev/null
+++ b/plink2_psam.cpp
@@ -0,0 +1,1416 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_decompress.h"
+#include "plink2_psam.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+typedef struct psam_info_ll_struct {
+  // vardata[] starts with 8-byte phenotype entries (we don't want to parse
+  // the same numeric string twice), followed by the NON-null-terminated
+  // sample_id, then the unterminated SID (if present) and paternal and
+  // maternal IDs.
+  struct psam_info_ll_struct* next;
+  uint32_t sample_id_slen;
+  uint32_t sid_slen;
+  uint32_t paternal_id_slen;
+  uint32_t maternal_id_slen;
+  uint32_t sex_code; // 0 = unknown, 1 = male, 2 = female
+  unsigned char vardata[];
+} psam_info_ll_t;
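+
+// Illustrative accessor (editor's sketch, not part of the upstream source):
+// with P phenotypes, vardata[0..8*P-1] holds the phenotype entries and the
+// ID text starts at vardata[8*P], so the sample ID could be located via a
+// hypothetical helper like:
+//
+//   static inline char* psam_info_sample_id(psam_info_ll_t* info,
+//                                           uint32_t pheno_ct) {
+//     // phenotype entries are 8 bytes each and come first
+//     return (char*)(&(info->vardata[pheno_ct * 8]));
+//   }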
+
+pglerr_t load_psam(const char* psamname, const range_list_t* pheno_range_list_ptr, fam_col_t fam_cols, uint32_t pheno_ct_max, int32_t missing_pheno, uint32_t affection_01, uintptr_t* max_sample_id_blen_ptr, uintptr_t* max_sid_blen_ptr, uintptr_t* max_paternal_id_blen_ptr, uintptr_t* max_maternal_id_blen_ptr, uintptr_t** sample_include_ptr, char** sample_ids_ptr, char** sids_ptr, char** paternal_ids_ptr, char** maternal_ids_ptr, uintptr_t** founder_info_ptr, uintptr_t** sex_nm_ptr, uintpt [...]
+  // outparameter pointers assumed to be initialized to nullptr
+  //
+  // pheno_ct_max should default to something like 0x7fffffff, not 0xffffffffU
+  //
+  // max_{sample,sid,paternal,maternal}_id_blen are in/out, to support data
+  // management operations which change these values
+  //
+  // permanent allocations are at stack end, not base, to work better with
+  // variant_id_htable_find()
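+  //
+  // (the arena is double-ended: g_bigstack_base grows upward for scratch
+  // space, while g_bigstack_end grows downward for allocations that must
+  // survive the function)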
+
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  
+  gzFile gz_infile = nullptr;
+  pheno_col_t* pheno_cols = nullptr;
+  uintptr_t line_idx = 0;
+  uint32_t pheno_ct = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    reterr = gzopen_read_checked(psamname, &gz_infile);
+    if (reterr) {
+      goto load_psam_ret_1;
+    }
+    const uintptr_t initial_bigstack_size = bigstack_left();
+    uintptr_t loadbuf_size = initial_bigstack_size / 4;
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else if (loadbuf_size <= kMaxMediumLine) {
+      goto load_psam_ret_NOMEM;
+    } else {
+      loadbuf_size = round_up_pow2(loadbuf_size, kCacheline);
+    }
+    // allocated at bottom now, so short string comparisons against end
+    // cannot fail
+    char* loadbuf = (char*)bigstack_alloc_raw(loadbuf_size);
+    loadbuf[loadbuf_size - 1] = ' ';
+    char* loadbuf_first_token;
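+    // skip blank lines and leading '#' comment lines until a #FID/#IID
+    // header line or the first data line is reached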
+    do {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto load_psam_ret_READ_FAIL;
+	}
+	loadbuf_first_token = loadbuf;
+	loadbuf_first_token[0] = '\0';
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	if (loadbuf_size == kMaxLongLine) {
+	  goto load_psam_ret_LONG_LINE;
+	}
+	goto load_psam_ret_NOMEM;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+    } while (is_eoln_kns(*loadbuf_first_token) || ((loadbuf_first_token[0] == '#') && strcmp_se(&(loadbuf_first_token[1]), "FID", 3) && strcmp_se(&(loadbuf_first_token[1]), "IID", 3)));
+    const uint32_t pheno_name_subset = pheno_range_list_ptr && pheno_range_list_ptr->names;
+    uint32_t* col_skips = nullptr;
+    uint32_t* col_types = nullptr;
+    uint32_t psam_cols_mask = 0;
+    ll_str_t* pheno_names_reverse_ll = nullptr;
+    uintptr_t max_pheno_name_blen = 0;
+    uint32_t relevant_postfid_col_ct = 0;
+    g_bigstack_end -= kMaxIdSlen;
+    unsigned char* tmp_bigstack_end = g_bigstack_end;
+    unsigned char* bigstack_mark2;
+    if (loadbuf_first_token[0] == '#') {
+      // parse header
+      // [-1] = #FID (if present, must be first column)
+      // [0] = IID (could also be first column)
+      // [1] = SID
+      // [2] = PAT
+      // [3] = MAT
+      // [4] = SEX
+      // [5+] = phenotypes
+      relevant_postfid_col_ct = count_tokens(loadbuf_first_token);
+      if (relevant_postfid_col_ct > pheno_ct_max + 5) {
+	relevant_postfid_col_ct = pheno_ct_max + 5;
+      }
+      if (bigstack_alloc_ui(relevant_postfid_col_ct, &col_skips) ||
+	  bigstack_alloc_ui(relevant_postfid_col_ct, &col_types)) {
+	goto load_psam_ret_NOMEM;
+      }
+      bigstack_mark2 = g_bigstack_base;
+      uint32_t rpf_col_idx = 0;
+      if (loadbuf_first_token[1] == 'I') {
+	col_skips[0] = 0;
+	col_types[0] = 0;
+	++rpf_col_idx;
+	psam_cols_mask = 1;
+      }
+      uint32_t col_idx = 0;
+      uint32_t in_interval = 0;
+      char* cmdline_pheno_sorted_ids = nullptr;
+      uint32_t* cmdline_pheno_id_map = nullptr;
+      uintptr_t max_cmdline_pheno_id_blen = 0;
+      uintptr_t cmdline_pheno_name_ct = 0;
+      if (pheno_name_subset) {
+	max_cmdline_pheno_id_blen = pheno_range_list_ptr->name_max_blen;
+	cmdline_pheno_name_ct = pheno_range_list_ptr->name_ct;
+	uintptr_t* dummy_bitarr;
+	// don't bother freeing these before load_psam() is done
+	if (bigstack_alloc_c(cmdline_pheno_name_ct * max_cmdline_pheno_id_blen, &cmdline_pheno_sorted_ids) ||
+	    bigstack_alloc_ui(cmdline_pheno_name_ct, &cmdline_pheno_id_map) ||
+	    bigstack_alloc_ul(BITCT_TO_WORDCT(cmdline_pheno_name_ct), &dummy_bitarr)) {
+	  goto load_psam_ret_NOMEM;
+	}
+	fill_all_bits(cmdline_pheno_name_ct, dummy_bitarr);
+	reterr = copy_sort_strbox_subset_noalloc(dummy_bitarr, pheno_range_list_ptr->names, cmdline_pheno_name_ct, max_cmdline_pheno_id_blen, 0, 0, 0, cmdline_pheno_sorted_ids, cmdline_pheno_id_map);
+	if (reterr) {
+	  goto load_psam_ret_1;
+	}
+	bigstack_reset(dummy_bitarr);
+      }
+      char* token_end = &(loadbuf_first_token[4]);
+      unsigned char* ll_alloc_base = g_bigstack_base;
+      while (1) {
+        char* loadbuf_iter = skip_initial_spaces(token_end);
+	if (is_eoln_kns(*loadbuf_iter)) {
+	  break;
+	}
+	++col_idx;
+	token_end = token_endnn(loadbuf_iter);
+	const uint32_t token_slen = (uintptr_t)(token_end - loadbuf_iter);
+	if (token_slen == 3) {
+	  uint32_t cur_col_type = 0xffffffffU;
+	  if (!memcmp(loadbuf_iter, "IID", 3)) {
+	    cur_col_type = 0;
+	  } else if (!memcmp(loadbuf_iter, "SID", 3)) {
+	    cur_col_type = 1;
+	  } else if (!memcmp(loadbuf_iter, "PAT", 3)) {
+	    cur_col_type = 2;
+	  } else if (!memcmp(loadbuf_iter, "MAT", 3)) {
+	    cur_col_type = 3;
+	  } else if (!memcmp(loadbuf_iter, "SEX", 3)) {
+	    cur_col_type = 4;
+	  } else if (!memcmp(loadbuf_iter, "FID", 3)) {
+	    sprintf(g_logbuf, "Error: 'FID' column header on line %" PRIuPTR " of %s is not at the beginning.\n", line_idx, psamname);
+	    goto load_psam_ret_MALFORMED_INPUT_WW;
+	  }
+	  if (cur_col_type != 0xffffffffU) {
+	    const uint32_t cur_col_type_shifted = 1 << cur_col_type;
+	    if (psam_cols_mask & cur_col_type_shifted) {
+	      *token_end = '\0';
+	      sprintf(g_logbuf, "Error: Duplicate column header '%s' on line %" PRIuPTR " of %s.\n", loadbuf_iter, line_idx, psamname);
+	      goto load_psam_ret_MALFORMED_INPUT_WW;
+	    }
+	    psam_cols_mask |= cur_col_type_shifted;
+	    col_skips[rpf_col_idx] = col_idx;
+	    col_types[rpf_col_idx++] = cur_col_type;
+	    continue;
+	  }
+	}
+	if (pheno_ct < pheno_ct_max) {
+	  if (pheno_name_subset) {
+	    uint32_t cmdline_pos;
+	    if (!sorted_idbox_find(loadbuf_iter, cmdline_pheno_sorted_ids, cmdline_pheno_id_map, token_slen, max_cmdline_pheno_id_blen, cmdline_pheno_name_ct, &cmdline_pos)) {
+	      // similar to string_range_list_to_bitarr()
+	      if (pheno_range_list_ptr->starts_range[cmdline_pos]) {
+		if (in_interval) {
+		  logerrprint("Error: Overlapping --pheno-name ranges.\n");
+		  goto load_psam_ret_INCONSISTENT_INPUT;
+		}
+		in_interval = 1;
+	      } else if (cmdline_pos && pheno_range_list_ptr->starts_range[cmdline_pos - 1]) {
+		if (!in_interval) {
+		  sprintf(g_logbuf, "Error: --pheno-name range is inconsistent with %s.\n", psamname);
+		  goto load_psam_ret_INCONSISTENT_INPUT_WW;
+		}
+		in_interval = 0;
+	      }
+	    } else if (!in_interval) {
+	      continue;
+	    }
+	  }
+	  const uint32_t tok_blen = token_slen + 1;
+	  ll_str_t* ll_str_new = (ll_str_t*)ll_alloc_base;
+	  // just word-aligned, not cacheline-aligned
+	  ll_alloc_base += round_up_pow2(tok_blen + sizeof(ll_str_t), kBytesPerWord);
+	  if (ll_alloc_base > tmp_bigstack_end) {
+	    goto load_psam_ret_NOMEM;
+	  }
+	  ll_str_new->next = pheno_names_reverse_ll;
+	  memcpyx(ll_str_new->ss, loadbuf_iter, token_slen, '\0');
+	  if (tok_blen > max_pheno_name_blen) {
+	    max_pheno_name_blen = tok_blen;
+	  }
+	  pheno_names_reverse_ll = ll_str_new;
+	  col_skips[rpf_col_idx] = col_idx;
+	  col_types[rpf_col_idx++] = pheno_ct + 5;
+	  ++pheno_ct;
+	}
+      }
+      if (max_pheno_name_blen > kMaxIdBlen) {
+	logerrprint("Error: Phenotype/covariate names are limited to " MAX_ID_SLEN_STR " characters.\n");
+	goto load_psam_ret_MALFORMED_INPUT;
+      }
+      g_bigstack_base = (unsigned char*)round_up_pow2((uintptr_t)ll_alloc_base, kCacheline);
+      if (!(psam_cols_mask & 1)) {
+	sprintf(g_logbuf, "Error: No IID column on line %" PRIuPTR " of %s.\n", line_idx, psamname);
+	goto load_psam_ret_MALFORMED_INPUT_WW;
+      }
+      if (in_interval) {
+	sprintf(g_logbuf, "Error: --pheno-name range is inconsistent with %s.\n", psamname);
+	goto load_psam_ret_INCONSISTENT_INPUT_WW;
+      }
+      relevant_postfid_col_ct = rpf_col_idx;
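+      // convert col_skips[] from absolute column indexes to relative skip
+      // counts for next_token_multz()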
+      for (rpf_col_idx = relevant_postfid_col_ct - 1; rpf_col_idx; --rpf_col_idx) {
+	col_skips[rpf_col_idx] -= col_skips[rpf_col_idx - 1];
+      }
+      loadbuf_first_token[0] = '\0'; // forces line to be skipped by main loop
+    } else if (loadbuf_first_token[0]) {
+      if (pheno_name_subset) {
+	logerrprint("Error: --pheno-name requires a --pheno or .psam file with a header.\n");
+	goto load_psam_ret_INCONSISTENT_INPUT;
+      }
+      
+      pheno_ct = (fam_cols & kfFamCol6) && pheno_ct_max;
+      relevant_postfid_col_ct = ((fam_cols / kfFamCol1) & 1) + ((fam_cols / (kfFamCol34 / 2)) & 2) + ((fam_cols / kfFamCol5) & 1) + pheno_ct;
+      // these small allocations can't fail, since kMaxMediumLine <
+      // loadbuf_size <= 1/3 of remaining space
+      col_skips = (uint32_t*)bigstack_alloc_raw_rd(relevant_postfid_col_ct * sizeof(int32_t));
+      col_types = (uint32_t*)bigstack_alloc_raw_rd(relevant_postfid_col_ct * sizeof(int32_t));
+      bigstack_mark2 = g_bigstack_base;
+      col_skips[0] = fam_cols & 1; // assumes kfFamCol1 == 1
+      col_types[0] = 0;
+      // psam_cols_mask = 1; // may need this later
+      uint32_t rpf_col_idx = 1;
+      if (fam_cols & kfFamCol34) {
+	col_skips[rpf_col_idx] = 1;
+	col_types[rpf_col_idx++] = 2;
+	col_skips[rpf_col_idx] = 1;
+	col_types[rpf_col_idx++] = 3;
+	psam_cols_mask |= 12;
+      }
+      if (fam_cols & kfFamCol5) {
+	col_skips[rpf_col_idx] = 1;
+	col_types[rpf_col_idx++] = 4;
+	psam_cols_mask |= 0x10;
+      }
+      if (pheno_ct) {
+	col_skips[rpf_col_idx] = 1;
+	// col_types[rpf_col_idx++] = 6;
+	col_types[rpf_col_idx] = 5;
+	ll_str_t* ll_str_new = (ll_str_t*)bigstack_alloc_raw_rd(7 + sizeof(ll_str_t));
+	ll_str_new->next = pheno_names_reverse_ll;
+	strcpy(ll_str_new->ss, "PHENO1");
+	max_pheno_name_blen = 7;
+	pheno_names_reverse_ll = ll_str_new;
+      }
+    }
+    if (pheno_ct) {
+      char* pheno_names = (char*)malloc(pheno_ct * max_pheno_name_blen);
+      if (!pheno_names) {
+	goto load_psam_ret_NOMEM;
+      }
+      *pheno_names_ptr = pheno_names;
+      for (uint32_t pheno_idx = pheno_ct; pheno_idx;) {
+	--pheno_idx;
+	strcpy(&(pheno_names[pheno_idx * max_pheno_name_blen]), pheno_names_reverse_ll->ss);
+	pheno_names_reverse_ll = pheno_names_reverse_ll->next;
+      }
+      if (pheno_ct > 1) {
+	if (pheno_ct > kMaxPhenoCt) {
+	  // yeah, yeah, this will never come up
+	  logerrprint("Error: " PROG_NAME_STR " does not support more than " MAX_PHENO_CT_STR " phenotypes.\n");
+	  goto load_psam_ret_MALFORMED_INPUT;
+	}
+	// verify there are no duplicates
+	uint32_t tmp_htable_size;
+	uint32_t* htable_tmp;
+	if (htable_good_size_alloc(pheno_ct, bigstack_left(), &htable_tmp, &tmp_htable_size)) {
+	  goto load_psam_ret_NOMEM;
+	}
+	const uint32_t duplicate_idx = populate_strbox_htable(pheno_names, pheno_ct, max_pheno_name_blen, tmp_htable_size, htable_tmp);
+	if (duplicate_idx) {
+	  const char* duplicate_pheno_name = &(pheno_names[duplicate_idx * max_pheno_name_blen]);
+	  sprintf(g_logbuf, "Error: Duplicate phenotype/covariate name '%s' on line %" PRIuPTR " of %s.\n", duplicate_pheno_name, line_idx, psamname);
+	  goto load_psam_ret_MALFORMED_INPUT_WW;
+	}
+      }
+      // free pheno_names_reverse_ll
+      bigstack_reset(bigstack_mark2);
+    }
+    
+    // make sure to handle sample_ct == 0 case properly
+    psam_info_ll_t* psam_info_reverse_ll = nullptr;
+    const uint32_t sids_present = (psam_cols_mask / 2) & 1;
+    const uint32_t paternal_ids_present = psam_cols_mask & 4;
+    const uint32_t maternal_ids_present = psam_cols_mask & 8;
+    const uint32_t sex_present = psam_cols_mask & 0x10;
+    const uint32_t col_type_end = 5 + pheno_ct;
+    const uint32_t pheno_ctl = BITCT_TO_WORDCT(pheno_ct);
+    const double missing_phenod = (double)missing_pheno;
+    const double pheno_ctrld = (double)((int32_t)(1 - affection_01));
+    const double pheno_cased = pheno_ctrld + 1.0;
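+    // default case/control coding is 1 = control, 2 = case; affection_01
+    // selects 0/1 coding instead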
+    uintptr_t max_sample_id_blen = *max_sample_id_blen_ptr;
+    uintptr_t max_sid_blen = *max_sid_blen_ptr;
+    uintptr_t max_paternal_id_blen = *max_paternal_id_blen_ptr;
+    uintptr_t max_maternal_id_blen = *max_maternal_id_blen_ptr;
+    uint32_t raw_sample_ct = 0;
+    uint32_t categorical_pheno_ct = 0;
+    
+    char** token_ptrs;
+    uint32_t* token_slens;
+    uintptr_t* categorical_phenos;
+    uintptr_t* quantitative_phenos;
+    if (bigstack_alloc_cp(col_type_end, &token_ptrs) ||
+        bigstack_alloc_ui(col_type_end, &token_slens) ||
+	bigstack_calloc_ul(pheno_ctl, &categorical_phenos) ||
+	bigstack_calloc_ul(pheno_ctl, &quantitative_phenos)) {
+      goto load_psam_ret_NOMEM;
+    }
+    char* missing_catname = g_missing_catname;
+    const uint32_t missing_catname_blen = strlen(missing_catname) + 1;
+    const uint32_t missing_catname_hval = hashceil(missing_catname, missing_catname_blen - 1, kCatHtableSize);
+    unsigned char* tmp_bigstack_base = g_bigstack_base;
+    catname_ll2_t** catname_htable = nullptr;
+    catname_ll2_t** pheno_catname_last = nullptr;
+    uintptr_t* total_catname_blens = nullptr;
+    while (1) {
+      if (!is_eoln_kns(*loadbuf_first_token)) {
+	if (raw_sample_ct == 0x7ffffffe) {
+	  logerrprint("Error: " PROG_NAME_STR " does not support more than 2^31 - 2 samples.\n");
+	  goto load_psam_ret_MALFORMED_INPUT;
+	}
+	char* loadbuf_iter = loadbuf_first_token;
+	for (uint32_t rpf_col_idx = 0; rpf_col_idx < relevant_postfid_col_ct; ++rpf_col_idx) {
+	  const uint32_t cur_col_type = col_types[rpf_col_idx];
+	  loadbuf_iter = next_token_multz(loadbuf_iter, col_skips[rpf_col_idx]);
+	  if (!loadbuf_iter) {
+	    goto load_psam_ret_MISSING_TOKENS;
+	  }
+	  token_ptrs[cur_col_type] = loadbuf_iter;
+	  char* token_end = token_endnn(loadbuf_iter);
+	  token_slens[cur_col_type] = (uintptr_t)(token_end - loadbuf_iter);
+	  loadbuf_iter = token_end;
+	}
+	const uint32_t fid_slen = (uintptr_t)(token_endnn(loadbuf_first_token) - loadbuf_first_token);
+	const uint32_t iid_slen = token_slens[0];
+	const uint32_t sid_slen = sids_present? token_slens[1] : 0;
+	const uint32_t paternal_id_slen = paternal_ids_present? token_slens[2] : 1;
+	const uint32_t maternal_id_slen = maternal_ids_present? token_slens[3] : 1;
+	// phenotypes
+	if (!raw_sample_ct) {
+	  for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	    if (is_categorical_phenostr_nocsv(token_ptrs[pheno_idx + 5])) {
+	      SET_BIT(pheno_idx, categorical_phenos);
+	    }
+	  }
+	  categorical_pheno_ct = popcount_longs(categorical_phenos, pheno_ctl);
+	  if (categorical_pheno_ct) {
+	    // initialize hash table
+	    /*
+	    if (categorical_pheno_ct > kCatHtableSize) {
+	      // use a larger hash table if/when we ever care for this case
+	      logerrprint("Error: " PROG_NAME_STR " does not support more than 2^19 - 1 categorical phenotypes.\n");
+	      goto load_psam_ret_MALFORMED_INPUT;
+	    }
+	    */
+	    const uint32_t cat_ul_byte_ct = categorical_pheno_ct * sizeof(intptr_t);
+	    const uint32_t htable_byte_ct = kCatHtableSize * sizeof(uintptr_t);
+	    const uintptr_t entry_byte_ct = round_up_pow2(offsetof(catname_ll2_t, ss) + missing_catname_blen, sizeof(intptr_t));	    
+	    if ((uintptr_t)(tmp_bigstack_end - tmp_bigstack_base) < htable_byte_ct + categorical_pheno_ct * entry_byte_ct + 2 * cat_ul_byte_ct) {
+	      goto load_psam_ret_NOMEM;
+	    }
+	    pheno_catname_last = (catname_ll2_t**)tmp_bigstack_base;
+	    tmp_bigstack_base += cat_ul_byte_ct;	    
+	    total_catname_blens = (uintptr_t*)tmp_bigstack_base;
+	    tmp_bigstack_base += cat_ul_byte_ct;
+	    fill_ulong_zero(categorical_pheno_ct, total_catname_blens);
+	    catname_htable = (catname_ll2_t**)tmp_bigstack_base;
+	    tmp_bigstack_base += htable_byte_ct;
+	    for (uint32_t uii = 0; uii < kCatHtableSize; ++uii) {
+	      catname_htable[uii] = nullptr;
+	    }
+	    uint32_t cur_hval = missing_catname_hval;
+	    for (uint32_t cat_pheno_idx = 0; cat_pheno_idx < categorical_pheno_ct; ++cat_pheno_idx) {
+	      catname_ll2_t* new_entry = (catname_ll2_t*)tmp_bigstack_base;
+	      tmp_bigstack_base += entry_byte_ct;
+	      pheno_catname_last[cat_pheno_idx] = new_entry;
+	      new_entry->cat_idx = 0;
+	      new_entry->htable_next = nullptr;
+	      new_entry->pheno_next = nullptr;
+	      memcpy(new_entry->ss, missing_catname, missing_catname_blen);
+	      catname_htable[cur_hval++] = new_entry;
+	      if (cur_hval == kCatHtableSize) {
+		cur_hval = 0;
+	      }
+	    }
+	  }
+	}
+	// 1 extra byte for tab between FID and IID; this gets absorbed into
+	// the "+ sizeof(intptr_t)" at the end, since that would normally be
+	// "+ (sizeof(intptr_t) - 1)"
+	// bugfix: pheno_ct * sizeof(intptr_t) -> pheno_ct * 8
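+	// worked example (editor's note): on 64-bit, a 3-char FID, 4-char
+	// IID, no SID, '0' parents, and 2 phenotypes need 16 + 3+1+4 + 1+1 =
+	// 26 vardata bytes; the formula reserves round_down_pow2(25, 8) + 8
+	// = 32, i.e. 26 rounded up to the next word boundary.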
+	const uint32_t alloc_byte_ct = sizeof(psam_info_ll_t) + sizeof(intptr_t) + round_down_pow2(fid_slen + iid_slen + sid_slen + paternal_id_slen + maternal_id_slen + pheno_ct * 8, sizeof(intptr_t));
+	psam_info_ll_t* new_psam_info = (psam_info_ll_t*)tmp_bigstack_base;
+	tmp_bigstack_base += alloc_byte_ct;
+	if (tmp_bigstack_base > tmp_bigstack_end) {
+	  goto load_psam_ret_NOMEM;
+	}
+	new_psam_info->next = psam_info_reverse_ll;
+	char* sample_id_storage = (char*)(&(new_psam_info->vardata[pheno_ct * 8]));
+	char* ss_iter = memcpyax(sample_id_storage, loadbuf_first_token, fid_slen, '\t');
+	psam_info_reverse_ll = new_psam_info;
+	if ((iid_slen == 1) && (token_ptrs[0][0] == '0')) {
+	  sprintf(g_logbuf, "Error: Invalid IID '0' on line %" PRIuPTR " of %s.\n", line_idx, psamname);
+	  goto load_psam_ret_MALFORMED_INPUT_WW;
+	}
+	ss_iter = memcpya(ss_iter, token_ptrs[0], iid_slen);
+	const uint32_t sample_id_slen = (uintptr_t)(ss_iter - sample_id_storage);
+	if (sample_id_slen >= max_sample_id_blen) {
+	  max_sample_id_blen = sample_id_slen + 1;
+	}
+	new_psam_info->sample_id_slen = sample_id_slen;
+	if (sids_present) {
+	  ss_iter = memcpya(ss_iter, token_ptrs[1], sid_slen);
+	  if (sid_slen >= max_sid_blen) {
+	    max_sid_blen = sid_slen + 1;
+	  }
+	}
+	if (paternal_ids_present) {
+	  if (paternal_id_slen >= max_paternal_id_blen) {
+	    max_paternal_id_blen = paternal_id_slen + 1;
+	  }
+	  ss_iter = memcpya(ss_iter, token_ptrs[2], paternal_id_slen);
+	} else {
+	  *ss_iter++ = '0';
+	}
+	new_psam_info->paternal_id_slen = paternal_id_slen;
+	if (maternal_ids_present) {
+	  if (maternal_id_slen >= max_maternal_id_blen) {
+	    max_maternal_id_blen = maternal_id_slen + 1;
+	  }
+	  ss_iter = memcpya(ss_iter, token_ptrs[3], maternal_id_slen);
+	} else {
+	  *ss_iter++ = '0';
+	}
+	new_psam_info->maternal_id_slen = maternal_id_slen;
+	uint32_t cur_sex_code = 0;
+	// accept 'M'/'F'/'m'/'f' since that's more readable without being any
+	// less efficient
+	// don't accept "male"/"female", that's overkill
+	if (sex_present && (token_slens[4] == 1)) {
+	  const unsigned char sex_ucc = token_ptrs[4][0];
+	  const unsigned char sex_ucc_upcase = sex_ucc & 0xdfU;
+	  if ((sex_ucc == '1') || (sex_ucc_upcase == 'M')) {
+	    cur_sex_code = 1;
+	  } else if ((sex_ucc == '2') || (sex_ucc_upcase == 'F')) {
+	    cur_sex_code = 2;
+	  }
+	}
+	new_psam_info->sex_code = cur_sex_code;
+	// phenotypes
+	unsigned char* pheno_data = new_psam_info->vardata;
+	uint32_t cat_pheno_idx = 0;
+	for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	  const uint32_t col_type_idx = pheno_idx + 5;
+	  char* cur_phenostr = token_ptrs[col_type_idx];
+	  double dxx;
+	  if (!scanadv_double(cur_phenostr, &dxx)) {
+	    const uint32_t slen = token_slens[col_type_idx];
+	    if (is_nan_str(cur_phenostr, slen)) {
+	      dxx = missing_phenod;
+	    } else {
+	      if (!IS_SET(categorical_phenos, pheno_idx)) {
+		goto load_psam_ret_INCOMPATIBLE_PHENOSTRS;
+	      }
+	      if (slen > kMaxIdSlen) {
+                logerrprint("Error: Categorical phenotypes are limited to " MAX_ID_SLEN_STR " characters.\n");
+		goto load_psam_ret_MALFORMED_INPUT;
+	      }
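+	      // all categorical phenotypes share one hash table; offsetting
+	      // the hash value by cat_pheno_idx (mod table size) gives each
+	      // phenotype its own chains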
+	      uint32_t hashval = hashceil(cur_phenostr, slen, kCatHtableSize) + cat_pheno_idx;
+	      if (hashval >= kCatHtableSize) {
+		hashval -= kCatHtableSize;
+	      }
+	      uintptr_t htable_idx = 0;
+	      catname_ll2_t** cur_entry_ptr = &(catname_htable[hashval]);
+	      while (1) {
+		catname_ll2_t* cur_entry = *cur_entry_ptr;
+		if (!cur_entry) {
+		  const uint32_t entry_byte_ct = round_up_pow2(offsetof(catname_ll2_t, ss) + slen + 1, sizeof(intptr_t));
+		  htable_idx = pheno_catname_last[cat_pheno_idx]->cat_idx + 1;
+		  catname_ll2_t* new_entry = (catname_ll2_t*)tmp_bigstack_base;
+		  tmp_bigstack_base += entry_byte_ct;
+		  if (tmp_bigstack_base > tmp_bigstack_end) {
+		    goto load_psam_ret_NOMEM;
+		  }
+		  new_entry->htable_next = nullptr;
+		  new_entry->pheno_next = nullptr;
+		  pheno_catname_last[cat_pheno_idx]->pheno_next = new_entry;
+		  pheno_catname_last[cat_pheno_idx] = new_entry;
+		  *cur_entry_ptr = new_entry;
+		  new_entry->cat_idx = htable_idx;
+		  memcpyx(new_entry->ss, cur_phenostr, slen, '\0');
+		  total_catname_blens[cat_pheno_idx] += slen + 1;
+		  break;
+		}
+		// safe since we guarantee kMaxIdSlen spare bytes at the end
+		// of bigstack
+		if ((!memcmp(cur_entry->ss, cur_phenostr, slen)) && (!cur_entry->ss[slen])) {
+		  htable_idx = cur_entry->cat_idx;
+		  break;
+		}
+		cur_entry_ptr = &(cur_entry->htable_next);
+	      }
+	      // don't bother writing top 4 bytes in 32-bit build
+	      memcpy(&(pheno_data[pheno_idx * 8]), &htable_idx, sizeof(intptr_t));
+	      ++cat_pheno_idx;
+	      continue;
+	    }
+	  }
+	  if (IS_SET(categorical_phenos, pheno_idx)) {
+	    goto load_psam_ret_INCOMPATIBLE_PHENOSTRS;
+	  }
+	  if (!IS_SET(quantitative_phenos, pheno_idx)) {
+	    if ((dxx != missing_phenod) && (dxx != pheno_ctrld) && (dxx != pheno_cased) && (dxx != 0.0)) {
+	      SET_BIT(pheno_idx, quantitative_phenos);
+	    }
+	  }
+	  memcpy(&(pheno_data[pheno_idx * 8]), &dxx, sizeof(double));
+	}
+	++raw_sample_ct;
+      }
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto load_psam_ret_READ_FAIL;
+	}
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	if (loadbuf_size == kMaxLongLine) {
+	  goto load_psam_ret_LONG_LINE;
+	}
+	goto load_psam_ret_NOMEM;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      if (loadbuf_first_token[0] == '#') {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s starts with a '#'. (This is only permitted before the first nonheader line, and if a #FID/IID header line is present it must denote the end of the header block.)\n", line_idx, psamname);
+	goto load_psam_ret_MALFORMED_INPUT_WW;
+      }
+    }
+    if ((max_sample_id_blen > 2 * kMaxIdBlen) || (max_paternal_id_blen > kMaxIdBlen) || (max_maternal_id_blen > kMaxIdBlen)) {
+      logerrprint("Error: FIDs and IIDs are limited to " MAX_ID_SLEN_STR " characters.\n");
+      goto load_psam_ret_MALFORMED_INPUT;
+    }
+    if (gzclose_null(&gz_infile)) {
+      goto load_psam_ret_READ_FAIL;
+    }
+    const uintptr_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    g_bigstack_base = (unsigned char*)round_up_pow2((uintptr_t)tmp_bigstack_base, kCacheline);
+    if (pheno_ct) {
+      pheno_cols = (pheno_col_t*)malloc(pheno_ct * sizeof(pheno_col_t));
+      if (!pheno_cols) {
+	goto load_psam_ret_NOMEM;
+      }
+      for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	// ensure cleanup works if initialization fails in the middle
+	pheno_cols[pheno_idx].nonmiss = nullptr;
+      }
+      uint32_t cat_pheno_idx = 0;
+      pheno_col_t* pheno_cols_iter = pheno_cols;
+      for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	const uintptr_t nonmiss_vec_ct = BITCT_TO_VECCT(raw_sample_ct);
+	const uint32_t is_categorical = IS_SET(categorical_phenos, pheno_idx);
+	const uint32_t is_qt = IS_SET(quantitative_phenos, pheno_idx);
+	uintptr_t data_vec_ct = 0;
+	uintptr_t catname_vec_ct = 0;
+	uintptr_t catname_storage_vec_ct = 0;
+	uint32_t nonnull_catname_ct = 0;
+	if (!is_categorical) {
+	  pheno_cols_iter->category_names = nullptr;
+	  pheno_cols_iter->type_code = (pheno_dtype_t)is_qt;
+	  pheno_cols_iter->nonnull_category_ct = 0;
+	  if (is_qt) {
+	    data_vec_ct = DBLCT_TO_VECCT(raw_sample_ct);
+	  } else {
+	    data_vec_ct = nonmiss_vec_ct;
+	  }
+	} else {
+	  nonnull_catname_ct = pheno_catname_last[cat_pheno_idx]->cat_idx;
+	  data_vec_ct = INT32CT_TO_VECCT(raw_sample_ct);
+	  catname_vec_ct = WORDCT_TO_VECCT(nonnull_catname_ct + 1);
+	  catname_storage_vec_ct = DIV_UP(total_catname_blens[cat_pheno_idx], kBytesPerVec);
+	  pheno_cols_iter->type_code = kPhenoDtypeCat;
+	  pheno_cols_iter->nonnull_category_ct = nonnull_catname_ct;
+	}
+	// pheno_cols_iter->nonmiss = nullptr;
+	uintptr_t* new_pheno_data_iter;
+	if (vecaligned_malloc((nonmiss_vec_ct + data_vec_ct + catname_vec_ct + catname_storage_vec_ct) * kBytesPerVec, &new_pheno_data_iter)) {
+	  goto load_psam_ret_NOMEM;
+	}
+	pheno_cols_iter->nonmiss = new_pheno_data_iter;
+	fill_ulong_zero(nonmiss_vec_ct * kWordsPerVec, new_pheno_data_iter);
+	new_pheno_data_iter = &(new_pheno_data_iter[nonmiss_vec_ct * kWordsPerVec]);
+	if (is_categorical) {
+	  pheno_cols_iter->data.cat = (uint32_t*)new_pheno_data_iter;
+	  new_pheno_data_iter = &(new_pheno_data_iter[data_vec_ct * kWordsPerVec]);
+	  char** cur_name_ptrs = (char**)new_pheno_data_iter;
+	  pheno_cols_iter->category_names = cur_name_ptrs;
+	  *cur_name_ptrs++ = missing_catname;
+	  char* name_storage_iter = (char*)(&(new_pheno_data_iter[catname_vec_ct * kWordsPerVec]));
+	  uint32_t cur_hval = missing_catname_hval + cat_pheno_idx;
+	  if (cur_hval >= kCatHtableSize) {
+	    cur_hval -= kCatHtableSize;
+	  }
+	  // make this point to the "NONE" entry for the current phenotype,
+	  // which starts the linked list
+	  catname_ll2_t* catname_entry_ptr = catname_htable[cur_hval];
+	  
+	  for (uint32_t catname_idx = 0; catname_idx < nonnull_catname_ct; ++catname_idx) {
+	    catname_entry_ptr = catname_entry_ptr->pheno_next;
+	    *cur_name_ptrs++ = name_storage_iter;
+	    name_storage_iter = strcpyax(name_storage_iter, catname_entry_ptr->ss, '\0');
+	  }
+	  ++cat_pheno_idx;
+	} else if (!is_qt) {
+	  pheno_cols_iter->data.cc = new_pheno_data_iter;
+	  fill_ulong_zero(nonmiss_vec_ct * kWordsPerVec, new_pheno_data_iter);
+	} else {
+	  pheno_cols_iter->data.qt = (double*)new_pheno_data_iter;
+	}
+	++pheno_cols_iter;
+      }
+    }
+    // real allocations start here
+    // could make these cacheline-aligned?
+    g_bigstack_end = bigstack_end_mark;
+    const uint32_t aligned_wct = BITCT_TO_ALIGNED_WORDCT(raw_sample_ct);
+    if (bigstack_end_alloc_c(raw_sample_ct * max_sample_id_blen, sample_ids_ptr) ||
+	bigstack_end_alloc_c(raw_sample_ct * max_paternal_id_blen, paternal_ids_ptr) ||
+	bigstack_end_alloc_c(raw_sample_ct * max_maternal_id_blen, maternal_ids_ptr) ||
+	bigstack_end_alloc_ul(raw_sample_ctl, sample_include_ptr) ||
+	bigstack_end_calloc_ul(aligned_wct, founder_info_ptr) ||
+	bigstack_end_calloc_ul(aligned_wct, sex_nm_ptr) ||
+	bigstack_end_calloc_ul(aligned_wct, sex_male_ptr)) {
+      goto load_psam_ret_NOMEM;
+    }
+    if (sids_present) {
+      if (bigstack_end_alloc_c(raw_sample_ct * max_sid_blen, sids_ptr)) {
+	goto load_psam_ret_NOMEM;
+      }
+    }
+    bigstack_end_mark = g_bigstack_end;
+    fill_all_bits(raw_sample_ct, *sample_include_ptr);
+    // make fill_interleaved_mask_vec() work by default
+    fill_ulong_zero(aligned_wct - raw_sample_ctl, &((*sample_include_ptr)[raw_sample_ctl]));
+    *raw_sample_ct_ptr = raw_sample_ct;
+    *max_sample_id_blen_ptr = max_sample_id_blen;
+    *max_sid_blen_ptr = max_sid_blen;
+    *max_paternal_id_blen_ptr = max_paternal_id_blen;
+    *max_maternal_id_blen_ptr = max_maternal_id_blen;
+    *max_pheno_name_blen_ptr = max_pheno_name_blen;
+    char* sample_ids = *sample_ids_ptr;
+    char* sids = sids_present? (*sids_ptr) : nullptr;
+    char* paternal_ids = *paternal_ids_ptr;
+    char* maternal_ids = *maternal_ids_ptr;
+    uintptr_t* founder_info = *founder_info_ptr;
+    uintptr_t* sex_nm = *sex_nm_ptr;
+    uintptr_t* sex_male = *sex_male_ptr;
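+    // psam_info_reverse_ll lists samples in reverse read order, so walk
+    // sample_uidx downward while traversing the list forward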
+    uint32_t sample_uidx = raw_sample_ct;
+    while (sample_uidx) {
+      --sample_uidx;
+      unsigned char* cur_vardata = psam_info_reverse_ll->vardata;
+      for (uint32_t pheno_idx = 0; pheno_idx < pheno_ct; ++pheno_idx) {
+	if (IS_SET(categorical_phenos, pheno_idx)) {
+	  uint32_t cur_cat;
+	  memcpy(&cur_cat, &(cur_vardata[pheno_idx * 8]), sizeof(int32_t));
+	  pheno_cols[pheno_idx].data.cat[sample_uidx] = cur_cat;
+	  if (cur_cat) {
+	    SET_BIT(sample_uidx, pheno_cols[pheno_idx].nonmiss);
+	  }
+	} else {
+	  double dxx;
+	  memcpy(&dxx, &(cur_vardata[pheno_idx * 8]), sizeof(double));
+	  if (IS_SET(quantitative_phenos, pheno_idx)) {
+	    if (dxx != missing_phenod) {
+	      SET_BIT(sample_uidx, pheno_cols[pheno_idx].nonmiss);
+	      pheno_cols[pheno_idx].data.qt[sample_uidx] = dxx;
+	    }
+	  } else {
+	    if (dxx == pheno_cased) {
+	      SET_BIT(sample_uidx, pheno_cols[pheno_idx].data.cc);
+	      SET_BIT(sample_uidx, pheno_cols[pheno_idx].nonmiss);
+	    } else if (dxx == pheno_ctrld) {
+	      SET_BIT(sample_uidx, pheno_cols[pheno_idx].nonmiss);
+	    }
+	  }
+	}
+      }
+      const uint32_t sample_id_slen = psam_info_reverse_ll->sample_id_slen;
+      const uint32_t paternal_id_slen = psam_info_reverse_ll->paternal_id_slen;
+      const uint32_t maternal_id_slen = psam_info_reverse_ll->maternal_id_slen;
+      const uint32_t sex_code = psam_info_reverse_ll->sex_code;
+      char* cur_sample_id = (char*)(&(cur_vardata[pheno_ct * 8]));
+      memcpyx(&(sample_ids[sample_uidx * max_sample_id_blen]), cur_sample_id, sample_id_slen, '\0');
+      char* cur_paternal_id = &(cur_sample_id[sample_id_slen]);
+      if (sids) {
+        const uint32_t sid_slen = psam_info_reverse_ll->sid_slen;
+	char* cur_sid = cur_paternal_id;
+	memcpyx(&(sids[sample_uidx * max_sid_blen]), cur_sid, sid_slen, '\0');
+	cur_paternal_id = &(cur_sid[sid_slen]);
+      }
+      memcpyx(&(paternal_ids[sample_uidx * max_paternal_id_blen]), cur_paternal_id, paternal_id_slen, '\0');
+      char* cur_maternal_id = &(cur_paternal_id[paternal_id_slen]);
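+      // a sample is a founder iff both parental IDs are the missing code '0'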
+      if ((paternal_id_slen == 1) && (maternal_id_slen == 1) && (cur_paternal_id[0] == '0') && (cur_maternal_id[0] == '0')) {
+	SET_BIT(sample_uidx, founder_info);
+      }
+      memcpyx(&(maternal_ids[sample_uidx * max_maternal_id_blen]), cur_maternal_id, maternal_id_slen, '\0');
+      if (sex_code) {
+	SET_BIT(sample_uidx, sex_nm);
+	if (sex_code == 1) {
+	  SET_BIT(sample_uidx, sex_male);
+	}
+      }
+      psam_info_reverse_ll = psam_info_reverse_ll->next;
+    }
+    // special case: if there's exactly one phenotype and it's all-missing,
+    // discard it
+    if ((pheno_ct == 1) && (!popcount_longs(pheno_cols[0].nonmiss, raw_sample_ctl))) {
+      free(*pheno_names_ptr);
+      *pheno_names_ptr = nullptr;
+      cleanup_pheno_cols(1, pheno_cols);
+      *pheno_cols_ptr = nullptr;
+      *pheno_ct_ptr = 0;
+    } else {
+      *pheno_cols_ptr = pheno_cols;
+      *pheno_ct_ptr = pheno_ct;
+    }
+  }
+  while (0) {
+  load_psam_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  load_psam_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  load_psam_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  load_psam_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  load_psam_ret_LONG_LINE:
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, psamname);
+  load_psam_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  load_psam_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  load_psam_ret_MISSING_TOKENS:
+    LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, psamname);
+    reterr = kPglRetMalformedInput;
+    break;
+  load_psam_ret_INCOMPATIBLE_PHENOSTRS:
+    LOGERRPRINTFWW("Error: Incompatible phenotype values in %s. (Case/control and quantitative phenotypes must be entirely numeric/\"NA\", and categorical phenotypes must be entirely non-numeric.)\n", psamname);
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+ load_psam_ret_1:
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  gzclose_cond(gz_infile);
+  if (reterr) {
+    if (*pheno_names_ptr) {
+      free(*pheno_names_ptr);
+      *pheno_names_ptr = nullptr;
+    }
+    cleanup_pheno_cols(pheno_ct, pheno_cols);
+    *pheno_ct_ptr = 0;
+    *pheno_cols_ptr = nullptr;
+  }
+  return reterr;
+}
+
+
+typedef struct pheno_info_ll_struct {
+  // for categorical phenotypes, phenodata entry should be reinterpreted as
+  // uint32_t
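+  // (each phenodata[] slot is 8 bytes: a double for numeric phenotypes, or
+  // a category index for categorical ones)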
+  struct pheno_info_ll_struct* next;
+  uint32_t sample_uidx;
+  double phenodata[];
+} pheno_info_ll_t;
+
+// also for loading covariates.  set affection_01 to 2 to prohibit case/control
+// and make unnamed variables start with "COVAR" instead of "PHENO"
+pglerr_t load_phenos(const char* pheno_fname, const range_list_t* pheno_range_list_ptr, const uintptr_t* sample_include, const char* sample_ids, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t max_sample_id_blen, int32_t missing_pheno, uint32_t affection_01, pheno_col_t** pheno_cols_ptr, char** pheno_names_ptr, uint32_t* pheno_ct_ptr, uintptr_t* max_pheno_name_blen_ptr) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  
+  gzFile gz_infile = nullptr;
+  char* pheno_names = nullptr;
+  uintptr_t line_idx = 0;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    reterr = gzopen_read_checked(pheno_fname, &gz_infile);
+    if (reterr) {
+      goto load_phenos_ret_1;
+    }
+    const uintptr_t initial_bigstack_size = bigstack_left();
+    uintptr_t loadbuf_size = initial_bigstack_size / 4;
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else if (loadbuf_size <= kMaxMediumLine) {
+      goto load_phenos_ret_NOMEM;
+    } else {
+      loadbuf_size = round_up_pow2(loadbuf_size, kCacheline);
+    }
+    char* loadbuf = (char*)bigstack_alloc_raw(loadbuf_size);
+    loadbuf[loadbuf_size - 1] = ' ';
+    char* loadbuf_first_token;
+    do {
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto load_phenos_ret_READ_FAIL;
+	}
+	loadbuf_first_token = loadbuf;
+	loadbuf_first_token[0] = '\0';
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	if (loadbuf_size == kMaxLongLine) {
+	  goto load_phenos_ret_LONG_LINE;
+	}
+	goto load_phenos_ret_NOMEM;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+    } while (is_eoln_kns(*loadbuf_first_token) || ((loadbuf_first_token[0] == '#') && (((loadbuf_first_token[1] != 'F') && (loadbuf_first_token[1] != 'I')) || memcmp(&(loadbuf_first_token[2]), "ID", 2) || (((unsigned char)loadbuf_first_token[4] > ' ') && (loadbuf_first_token[4] != ',')))));
+    if (loadbuf_first_token[0] == '#') {
+      ++loadbuf_first_token;
+    }    
+    const uint32_t old_pheno_ct = *pheno_ct_ptr;
+    const uintptr_t old_max_pheno_name_blen = *max_pheno_name_blen_ptr;
+    uintptr_t max_pheno_name_blen = old_max_pheno_name_blen;
+    uint32_t comma_delim = 0;
+    xid_mode_t xid_mode = (loadbuf_first_token[0] == 'I')? kfXidModeIid : kfXidModeFidiid;
+    uint32_t* col_skips = nullptr;
+    uint32_t new_pheno_ct;
+    uint32_t final_pheno_ct;
+    uintptr_t final_pheno_names_byte_ct;
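+    // a first token of "FID" or "IID" followed by whitespace or a comma
+    // marks a header line; a comma also selects CSV parsing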
+    if (((loadbuf_first_token[0] == 'F') || xid_mode) && (!memcmp(&(loadbuf_first_token[1]), "ID", 2)) && (((unsigned char)loadbuf_first_token[3] <= 32) || (loadbuf_first_token[3] == ','))) {
+      // treat this as a header line
+      // autodetect CSV vs. space/tab-delimited
+      // (note that we don't permit CSVs without header lines)
+      comma_delim = (loadbuf_first_token[3] == ',');
+      char* loadbuf_iter = skip_initial_spaces(&(loadbuf_first_token[3 + comma_delim]));
+      if (is_eoln_kns(*loadbuf_iter)) {
+	goto load_phenos_ret_MISSING_TOKENS;
+      }
+      char* iid_end;
+      if (!xid_mode) {
+        iid_end = comma_or_space_token_end(loadbuf_iter, comma_delim);
+	const uintptr_t token_slen = (uintptr_t)(iid_end - loadbuf_iter);
+	if ((token_slen != 3) || memcmp(loadbuf_iter, "IID", 3)) {
+	  sprintf(g_logbuf, "Error: Second column header in %s must be 'IID'.\n", pheno_fname);
+	  goto load_phenos_ret_MALFORMED_INPUT_WW;
+	}
+	loadbuf_iter = comma_or_space_next_token(iid_end, comma_delim);
+      } else {
+	iid_end = &(loadbuf_first_token[3]);
+      }
+      uint32_t pheno_col_ct = 0;
+      char* pheno_start = loadbuf_iter;
+      while (loadbuf_iter) {
+	char* token_end = comma_or_space_token_end(loadbuf_iter, comma_delim);
+	const uintptr_t token_slen = (uintptr_t)(token_end - loadbuf_iter);
+	if (max_pheno_name_blen <= token_slen) {
+	  max_pheno_name_blen = token_slen + 1;
+	}
+	++pheno_col_ct;
+	loadbuf_iter = comma_or_space_next_token(token_end, comma_delim);
+      }
+      if (max_pheno_name_blen > kMaxIdBlen) {
+	logerrprint("Error: Phenotype/covariate names are limited to " MAX_ID_SLEN_STR " characters.\n");
+	goto load_phenos_ret_MALFORMED_INPUT;
+      }
+      if (pheno_range_list_ptr->names && pheno_col_ct) {
+	uintptr_t* bitarr;
+	reterr = string_range_list_to_bitarr_alloc(pheno_start, pheno_range_list_ptr, "pheno-name", "--pheno file", pheno_col_ct, 0, comma_delim, &bitarr);
+	if (reterr) {
+	  goto load_phenos_ret_1;
+	}
+	new_pheno_ct = popcount_longs(bitarr, BITCT_TO_WORDCT(pheno_col_ct));
+	if (bigstack_alloc_ui(new_pheno_ct, &col_skips)) {
+	  goto load_phenos_ret_NOMEM;
+	}
+	uint32_t col_uidx = 0;
+	int32_t prev_col_uidx = -1;
+	for (uint32_t col_idx = 0; col_idx < new_pheno_ct; ++col_idx, ++col_uidx) {
+	  next_set_unsafe_ck(bitarr, &col_uidx);
+	  col_skips[col_idx] = ((int32_t)col_uidx) - prev_col_uidx;
+	  prev_col_uidx = (int32_t)col_uidx;
+	}
+      } else {
+	// usual case, load all phenotypes
+	new_pheno_ct = pheno_col_ct;
+	if (bigstack_alloc_ui(new_pheno_ct, &col_skips)) {
+	  goto load_phenos_ret_NOMEM;
+	}
+	for (uint32_t col_idx = 0; col_idx < pheno_col_ct; ++col_idx) {
+	  col_skips[col_idx] = 1;
+	}
+      }
+      final_pheno_ct = new_pheno_ct + old_pheno_ct;
+      final_pheno_names_byte_ct = final_pheno_ct * max_pheno_name_blen;
+      pheno_names = (char*)malloc(final_pheno_names_byte_ct);
+      if (!pheno_names) {
+	goto load_phenos_ret_NOMEM;
+      }
+      loadbuf_iter = iid_end;
+      char* pheno_names_iter = &(pheno_names[old_pheno_ct * max_pheno_name_blen]);
+      for (uint32_t new_pheno_idx = 0; new_pheno_idx < new_pheno_ct; ++new_pheno_idx) {
+	loadbuf_iter = comma_or_space_next_token_mult(loadbuf_iter, col_skips[new_pheno_idx], comma_delim);
+	char* token_end = comma_or_space_token_end(loadbuf_iter, comma_delim);
+	const uint32_t name_slen = (uintptr_t)(token_end - loadbuf_iter);
+	if (is_reserved_pheno_name(loadbuf_iter, name_slen)) {
+	  *token_end = '\0';
+	  sprintf(g_logbuf, "Error: '%s' cannot be used as a phenotype/covariate name.\n", loadbuf_iter);
+	  goto load_phenos_ret_MALFORMED_INPUT_2;
+	}
+	memcpyx(pheno_names_iter, loadbuf_iter, name_slen, '\0');
+	pheno_names_iter = &(pheno_names_iter[max_pheno_name_blen]);
+	loadbuf_iter = token_end;
+      }
+      
+      // forces line to be skipped by main loop
+      loadbuf_first_token[0] = '\0';
+    } else {
+      // no header line
+      xid_mode = kfXidModeFidiid;
+      if (pheno_range_list_ptr->names) {
+	// possible todo: support e.g. "PHENO2-PHENO10"
+	sprintf(g_logbuf, "Error: Header line expected in %s (due to --pheno-name/--covar-name).\n", pheno_fname);
+	goto load_phenos_ret_INCONSISTENT_INPUT_WW;
+      }
+      const uint32_t col_ct = count_tokens(loadbuf_first_token);
+      if (col_ct < 3) {
+	// todo: tolerate col_ct == 2 with --allow-no-phenos
+	goto load_phenos_ret_MISSING_TOKENS;
+      }
+      new_pheno_ct = col_ct - 2;
+      final_pheno_ct = new_pheno_ct + old_pheno_ct;
+      // largest auto-generated name is PHENO<final_pheno_ct>, so size the
+      // buffer for its digit count (int_slen(final_pheno_ct - 1) undercounts
+      // when final_pheno_ct is a power of 10)
+      const uintptr_t max_new_name_blen = 6 + int_slen(final_pheno_ct);
+      if (max_new_name_blen > max_pheno_name_blen) {
+	max_pheno_name_blen = max_new_name_blen;
+      }
+      final_pheno_names_byte_ct = final_pheno_ct * max_pheno_name_blen;
+      if (bigstack_alloc_ui(new_pheno_ct, &col_skips)) {
+	goto load_phenos_ret_NOMEM;
+      }
+      pheno_names = (char*)malloc(final_pheno_names_byte_ct);
+      if (!pheno_names) {
+	goto load_phenos_ret_NOMEM;
+      }
+      for (uint32_t col_idx = 0; col_idx < new_pheno_ct; ++col_idx) {
+	col_skips[col_idx] = 1;
+      }
+      const char* default_prefix = (affection_01 == 2)? "COVAR" : "PHENO";
+      for (uint32_t pheno_idx = old_pheno_ct; pheno_idx < final_pheno_ct;) {
+	char* write_iter = memcpya(&(pheno_names[pheno_idx * max_pheno_name_blen]), default_prefix, 5);
+	++pheno_idx; // 1-based default names, not 0-based
+	write_iter = uint32toa(pheno_idx, write_iter);
+	*write_iter = '\0';
+      }
+    }
+    for (uint32_t old_pheno_idx = 0; old_pheno_idx < old_pheno_ct; ++old_pheno_idx) {
+      strcpy(&(pheno_names[old_pheno_idx * max_pheno_name_blen]), &((*pheno_names_ptr)[old_pheno_idx * old_max_pheno_name_blen]));
+    }
+
+    uint32_t tmp_htable_size;
+    uint32_t* htable_tmp;
+    if (htable_good_size_alloc(final_pheno_ct, bigstack_left(), &htable_tmp, &tmp_htable_size)) {
+      goto load_phenos_ret_NOMEM;
+    }
+    const uint32_t duplicate_idx = populate_strbox_htable(pheno_names, final_pheno_ct, max_pheno_name_blen, tmp_htable_size, htable_tmp);
+    if (duplicate_idx) {
+      const char* duplicate_pheno_name = &(pheno_names[duplicate_idx * max_pheno_name_blen]);
+      sprintf(g_logbuf, "Error: Duplicate phenotype/covariate ID '%s'.\n", duplicate_pheno_name);
+      goto load_phenos_ret_MALFORMED_INPUT_WW;
+    }
+    bigstack_reset(htable_tmp);
+
+    pheno_col_t* new_pheno_cols = (pheno_col_t*)realloc(*pheno_cols_ptr, final_pheno_ct * sizeof(pheno_col_t));
+    if (!new_pheno_cols) {
+      goto load_phenos_ret_NOMEM;
+    }
+    // ensure cleanup works if initialization fails in the middle
+    for (uint32_t pheno_idx = old_pheno_ct; pheno_idx < final_pheno_ct; ++pheno_idx) {
+      new_pheno_cols[pheno_idx].nonmiss = nullptr;
+    }
+    *pheno_ct_ptr = final_pheno_ct;
+    *pheno_cols_ptr = new_pheno_cols;
+
+    // switch to hash table?
+    char* sorted_sample_ids;
+    uint32_t* sample_id_map;
+    // todo: permit duplicates if SIDs are defined
+    reterr = copy_sort_strbox_subset(sample_include, sample_ids, sample_ct, max_sample_id_blen, 0, 0, 0, &sorted_sample_ids, &sample_id_map);
+    if (reterr) {
+      goto load_phenos_ret_1;
+    }
+    const uintptr_t raw_sample_ctl = BITCT_TO_WORDCT(raw_sample_ct);
+    char* id_buf;
+    uintptr_t* already_seen;
+    if (bigstack_alloc_c(max_sample_id_blen, &id_buf) ||
+	bigstack_calloc_ul(raw_sample_ctl, &already_seen)) {
+      goto load_phenos_ret_NOMEM;
+    }
+
+    pheno_info_ll_t* pheno_info_reverse_ll = nullptr;
+    const uint32_t pheno_info_alloc_byte_ct = sizeof(pheno_info_ll_t) + new_pheno_ct * sizeof(double);
+    const uint32_t new_pheno_ctl = BITCT_TO_WORDCT(new_pheno_ct);
+    const double missing_phenod = (double)missing_pheno;
+    const double pheno_ctrld = (double)((int32_t)(1 - affection_01));
+    const double pheno_cased = pheno_ctrld + 1.0;
+    uint32_t categorical_pheno_ct = 0;
+    char** token_ptrs;
+    uint32_t* token_slens;
+    uintptr_t* categorical_phenos;
+    uintptr_t* quantitative_phenos;
+    if (bigstack_alloc_cp(new_pheno_ct, &token_ptrs) ||
+	bigstack_alloc_ui(new_pheno_ct, &token_slens) ||
+	bigstack_calloc_ul(new_pheno_ctl, &categorical_phenos) ||
+	bigstack_calloc_ul(new_pheno_ctl, &quantitative_phenos)) {
+      goto load_phenos_ret_NOMEM;
+    }
+    const char* missing_catname = g_missing_catname;
+    const uint32_t missing_catname_blen = strlen(missing_catname) + 1;
+    const uint32_t missing_catname_hval = hashceil(missing_catname, missing_catname_blen - 1, kCatHtableSize);
+    unsigned char* bigstack_base_copy = g_bigstack_base;
+    unsigned char* tmp_bigstack_end = g_bigstack_end;
+    catname_ll2_t** catname_htable = nullptr;
+    catname_ll2_t** pheno_catname_last = nullptr;
+    uintptr_t* total_catname_blens = nullptr;
+    while (1) {
+      if (!is_eoln_kns(*loadbuf_first_token)) {
+	char* loadbuf_iter = loadbuf_first_token;
+	uint32_t sample_uidx;
+	if (sorted_xidbox_read_find(sorted_sample_ids, sample_id_map, max_sample_id_blen, sample_ct, comma_delim, xid_mode, &loadbuf_iter, &sample_uidx, id_buf)) {
+	  if (!loadbuf_iter) {
+	    goto load_phenos_ret_MISSING_TOKENS;
+	  }
+	} else {
+	  if (is_set(already_seen, sample_uidx)) {
+	    logerrprint("Error: Duplicate sample ID in --pheno/--covar file.\n");
+	    goto load_phenos_ret_MALFORMED_INPUT;
+	  }
+	  set_bit(sample_uidx, already_seen);
+	  /*
+	  if (comma_delim) {
+	  } else {
+	  }
+	  */
+	  for (uint32_t new_pheno_idx = 0; new_pheno_idx < new_pheno_ct; ++new_pheno_idx) {
+	    loadbuf_iter = comma_or_space_next_token_mult(loadbuf_iter, col_skips[new_pheno_idx], comma_delim);
+	    if (!loadbuf_iter) {
+	      goto load_phenos_ret_MISSING_TOKENS;
+	    }
+	    token_ptrs[new_pheno_idx] = loadbuf_iter;
+	    char* token_end = comma_or_space_token_end(loadbuf_iter, comma_delim);
+	    token_slens[new_pheno_idx] = (uintptr_t)(token_end - loadbuf_iter);
+	    loadbuf_iter = token_end;
+	  }
+	  if (!pheno_info_reverse_ll) {
+	    // first relevant line, detect categorical phenotypes
+	    for (uint32_t new_pheno_idx = 0; new_pheno_idx < new_pheno_ct; ++new_pheno_idx) {
+	      if (is_categorical_phenostr(token_ptrs[new_pheno_idx])) {
+		SET_BIT(new_pheno_idx, categorical_phenos);
+	      } else if (affection_01 == 2) {
+		SET_BIT(new_pheno_idx, quantitative_phenos);
+	      }
+	    }
+	    categorical_pheno_ct = popcount_longs(categorical_phenos, new_pheno_ctl);
+	    if (categorical_pheno_ct) {
+	      // initialize hash table
+	      if (categorical_pheno_ct > kCatHtableSize) {
+		// use a larger hash table if/when we ever care for this case
+		logerrprint("Error: " PROG_NAME_STR " does not support more than 2^19 - 1 categorical phenotypes.\n");
+		goto load_phenos_ret_MALFORMED_INPUT;
+	      }
+	      const uint32_t cat_ul_byte_ct = categorical_pheno_ct * sizeof(intptr_t);
+	      const uint32_t htable_byte_ct = kCatHtableSize * sizeof(uintptr_t);
+	      const uintptr_t entry_byte_ct = round_up_pow2(offsetof(catname_ll2_t, ss) + missing_catname_blen, sizeof(intptr_t));
+
+	      if ((uintptr_t)(tmp_bigstack_end - bigstack_base_copy) < htable_byte_ct + categorical_pheno_ct * entry_byte_ct + 2 * cat_ul_byte_ct) {
+		goto load_phenos_ret_NOMEM;
+	      }
+	      tmp_bigstack_end -= cat_ul_byte_ct;
+	      total_catname_blens = (uintptr_t*)tmp_bigstack_end;
+	      tmp_bigstack_end -= cat_ul_byte_ct;
+	      pheno_catname_last = (catname_ll2_t**)tmp_bigstack_end;
+	      fill_ulong_zero(categorical_pheno_ct, total_catname_blens);
+	      tmp_bigstack_end -= htable_byte_ct;
+	      catname_htable = (catname_ll2_t**)tmp_bigstack_end;
+	      for (uint32_t uii = 0; uii < kCatHtableSize; ++uii) {
+		catname_htable[uii] = nullptr;
+	      }
+	      uint32_t cur_hval = missing_catname_hval;
+	      for (uint32_t cat_pheno_idx = 0; cat_pheno_idx < categorical_pheno_ct; ++cat_pheno_idx) {
+		tmp_bigstack_end -= entry_byte_ct;
+		catname_ll2_t* new_entry = (catname_ll2_t*)tmp_bigstack_end;
+		pheno_catname_last[cat_pheno_idx] = new_entry;
+		new_entry->cat_idx = 0;
+		new_entry->htable_next = nullptr;
+		new_entry->pheno_next = nullptr;
+		memcpy(new_entry->ss, missing_catname, missing_catname_blen);
+		catname_htable[cur_hval++] = new_entry;
+		if (cur_hval == kCatHtableSize) {
+		  cur_hval = 0;
+		}
+	      }
+	    }
+	  }
+	  if ((uintptr_t)(tmp_bigstack_end - bigstack_base_copy) < pheno_info_alloc_byte_ct) {
+	    goto load_phenos_ret_NOMEM;
+	  }
+	  tmp_bigstack_end -= pheno_info_alloc_byte_ct;
+
+	  pheno_info_ll_t* new_pheno_info = (pheno_info_ll_t*)tmp_bigstack_end;
+	  new_pheno_info->next = pheno_info_reverse_ll;
+	  new_pheno_info->sample_uidx = sample_uidx;
+	  double* pheno_data = new_pheno_info->phenodata;
+	  uint32_t cat_pheno_idx = 0;
+	  for (uint32_t new_pheno_idx = 0; new_pheno_idx < new_pheno_ct; ++new_pheno_idx) {
+	    char* cur_phenostr = token_ptrs[new_pheno_idx];
+	    double dxx;
+	    if (!scanadv_double(cur_phenostr, &dxx)) {
+	      const uint32_t slen = token_slens[new_pheno_idx];
+	      if (is_nan_str(cur_phenostr, slen)) {
+		// note that, in CSVs, empty string is interpreted as a
+		// missing non-categorical phenotype; explicit "NONE" is needed
+		// to denote a missing category
+		dxx = missing_phenod;
+	      } else {
+		if (!IS_SET(categorical_phenos, new_pheno_idx)) {
+		  goto load_phenos_ret_INCOMPATIBLE_PHENOSTRS;
+		}
+		uint32_t hashval;
+		hashval = hashceil(cur_phenostr, slen, kCatHtableSize) + cat_pheno_idx;
+		if (hashval >= kCatHtableSize) {
+		  hashval -= kCatHtableSize;
+		}
+		uintptr_t htable_idx = 0;
+		catname_ll2_t** cur_entry_ptr = &(catname_htable[hashval]);
+		while (1) {
+		  catname_ll2_t* cur_entry = *cur_entry_ptr;
+		  if (!cur_entry) {
+		    const uint32_t entry_byte_ct = round_up_pow2(offsetof(catname_ll2_t, ss) + slen + 1, sizeof(intptr_t));
+		    if ((uintptr_t)(tmp_bigstack_end - bigstack_base_copy) < entry_byte_ct) {
+		      goto load_phenos_ret_NOMEM;
+		    }
+		    tmp_bigstack_end -= entry_byte_ct;
+		    htable_idx = pheno_catname_last[cat_pheno_idx]->cat_idx + 1;
+		    catname_ll2_t* new_entry = (catname_ll2_t*)tmp_bigstack_end;
+		    new_entry->htable_next = nullptr;
+		    new_entry->pheno_next = nullptr;
+		    pheno_catname_last[cat_pheno_idx]->pheno_next = new_entry;
+		    pheno_catname_last[cat_pheno_idx] = new_entry;
+		    *cur_entry_ptr = new_entry;
+		    new_entry->cat_idx = htable_idx;
+		    memcpyx(new_entry->ss, cur_phenostr, slen, '\0');
+		    total_catname_blens[cat_pheno_idx] += slen + 1;
+		    break;
+		  }
+		  // safe since hash table entries are in the middle of
+		  // bigstack
+		  if ((!memcmp(cur_entry->ss, cur_phenostr, slen)) && (!cur_entry->ss[slen])) {
+		    htable_idx = cur_entry->cat_idx;
+		    break;
+		  }
+		  cur_entry_ptr = &(cur_entry->htable_next);
+		}
+		// don't bother writing top 4 bytes in 32-bit build
+		memcpy(&(pheno_data[new_pheno_idx]), &htable_idx, sizeof(intptr_t));
+		++cat_pheno_idx;
+		continue;
+	      }
+	    }
+	    if (IS_SET(categorical_phenos, new_pheno_idx)) {
+	      goto load_phenos_ret_INCOMPATIBLE_PHENOSTRS;
+	    }
+	    if (!IS_SET(quantitative_phenos, new_pheno_idx)) {
+	      if ((dxx != missing_phenod) && (dxx != pheno_ctrld) && (dxx != pheno_cased) && (dxx != 0.0)) {
+		SET_BIT(new_pheno_idx, quantitative_phenos);
+	      }
+	    }
+	    pheno_data[new_pheno_idx] = dxx;
+	  }
+	  pheno_info_reverse_ll = new_pheno_info;
+	}
+      }
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto load_phenos_ret_READ_FAIL;
+	}
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	if (loadbuf_size == kMaxLongLine) {
+	  goto load_phenos_ret_LONG_LINE;
+	}
+	goto load_phenos_ret_NOMEM;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      if (loadbuf_first_token[0] == '#') {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s starts with a '#'. (This is only permitted before the first nonheader line, and if a #FID/IID header line is present it must denote the end of the header block.)\n", line_idx, pheno_fname);
+	goto load_phenos_ret_MALFORMED_INPUT_WW;
+      }
+    }    
+    if (gzclose_null(&gz_infile)) {
+      goto load_phenos_ret_READ_FAIL;
+    }
+    if (!pheno_info_reverse_ll) {
+      if (line_idx == 1) {
+	sprintf(g_logbuf, "Error: %s is empty.\n", pheno_fname);
+	goto load_phenos_ret_MALFORMED_INPUT_WW;
+      }
+      // could make this a warning, and automatically delete phenotypes?
+      LOGERRPRINTF("Error: No entries in %s correspond to loaded sample IDs.\n", pheno_fname);
+      goto load_phenos_ret_INCONSISTENT_INPUT;
+    }
+    if (new_pheno_ct) {
+      const uintptr_t nonmiss_vec_ct = BITCT_TO_VECCT(raw_sample_ct);
+      uint32_t cat_pheno_idx = 0;
+      pheno_col_t* pheno_cols_iter = &(new_pheno_cols[old_pheno_ct]);
+      for (uint32_t new_pheno_idx = 0; new_pheno_idx < new_pheno_ct; ++new_pheno_idx) {
+	const uint32_t is_categorical = IS_SET(categorical_phenos, new_pheno_idx);
+	const uint32_t is_qt = IS_SET(quantitative_phenos, new_pheno_idx);
+	uintptr_t data_vec_ct = 0;
+	uintptr_t catname_vec_ct = 0;
+	uintptr_t catname_storage_vec_ct = 0;
+	uint32_t nonnull_catname_ct = 0;
+	if (!is_categorical) {
+	  pheno_cols_iter->category_names = nullptr;
+	  pheno_cols_iter->type_code = (pheno_dtype_t)is_qt;
+	  pheno_cols_iter->nonnull_category_ct = 0;
+	  if (is_qt) {
+	    data_vec_ct = DBLCT_TO_VECCT(raw_sample_ct);
+	  } else {
+	    data_vec_ct = nonmiss_vec_ct;
+	  }
+	} else {
+	  nonnull_catname_ct = pheno_catname_last[cat_pheno_idx]->cat_idx;
+	  data_vec_ct = INT32CT_TO_VECCT(raw_sample_ct);
+	  catname_vec_ct = WORDCT_TO_VECCT(nonnull_catname_ct + 1);
+	  catname_storage_vec_ct = DIV_UP(total_catname_blens[cat_pheno_idx], kBytesPerVec);
+	  pheno_cols_iter->type_code = kPhenoDtypeCat;
+	  pheno_cols_iter->nonnull_category_ct = nonnull_catname_ct;
+	}
+	// pheno_cols_iter->nonmiss = nullptr;
+	uintptr_t* new_pheno_data_iter;
+	if (vecaligned_malloc((nonmiss_vec_ct + data_vec_ct + catname_vec_ct + catname_storage_vec_ct) * kBytesPerVec, &new_pheno_data_iter)) {
+	  goto load_phenos_ret_NOMEM;
+	}
+	pheno_cols_iter->nonmiss = new_pheno_data_iter;
+	fill_ulong_zero(nonmiss_vec_ct * kWordsPerVec, new_pheno_data_iter);
+	new_pheno_data_iter = &(new_pheno_data_iter[nonmiss_vec_ct * kWordsPerVec]);
+	if (is_categorical) {
+	  // allow nonmiss[] to be ignored in categorical case
+	  fill_ulong_zero(data_vec_ct, new_pheno_data_iter);
+	  pheno_cols_iter->data.cat = (uint32_t*)new_pheno_data_iter;
+	  new_pheno_data_iter = &(new_pheno_data_iter[data_vec_ct * kWordsPerVec]);
+	  char** cur_name_ptrs = (char**)new_pheno_data_iter;
+	  pheno_cols_iter->category_names = cur_name_ptrs;
+	  *cur_name_ptrs++ = g_missing_catname;
+	  char* name_storage_iter = (char*)(&(new_pheno_data_iter[catname_vec_ct * kWordsPerVec]));
+	  uint32_t cur_hval = missing_catname_hval + cat_pheno_idx;
+	  if (cur_hval >= kCatHtableSize) {
+	    cur_hval -= kCatHtableSize;
+	  }
+	  // make this point to the "NONE" entry for the current phenotype,
+	  // which starts the linked list
+	  catname_ll2_t* catname_entry_ptr = catname_htable[cur_hval];
+	  
+	  for (uint32_t catname_idx = 0; catname_idx < nonnull_catname_ct; ++catname_idx) {
+	    catname_entry_ptr = catname_entry_ptr->pheno_next;
+	    *cur_name_ptrs++ = name_storage_iter;
+	    name_storage_iter = strcpyax(name_storage_iter, catname_entry_ptr->ss, '\0');
+	  }
+	  ++cat_pheno_idx;
+	} else if (!is_qt) {
+	  pheno_cols_iter->data.cc = new_pheno_data_iter;
+	  fill_ulong_zero(nonmiss_vec_ct * kWordsPerVec, new_pheno_data_iter);
+	} else {
+	  pheno_cols_iter->data.qt = (double*)new_pheno_data_iter;
+	}
+	++pheno_cols_iter;
+      }
+      while (pheno_info_reverse_ll) {
+	const uint32_t sample_uidx = pheno_info_reverse_ll->sample_uidx;
+	double* pheno_data = pheno_info_reverse_ll->phenodata;
+	pheno_cols_iter = &(new_pheno_cols[old_pheno_ct]);
+	for (uint32_t new_pheno_idx = 0; new_pheno_idx < new_pheno_ct; ++new_pheno_idx) {
+	  if (IS_SET(categorical_phenos, new_pheno_idx)) {
+	    uint32_t cur_cat;
+	    memcpy(&cur_cat, &(pheno_data[new_pheno_idx]), sizeof(int32_t));
+	    pheno_cols_iter->data.cat[sample_uidx] = cur_cat;
+	    if (cur_cat) {
+	      SET_BIT(sample_uidx, pheno_cols_iter->nonmiss);
+	    }
+	  } else {
+	    double dxx = pheno_data[new_pheno_idx];
+	    // bugfix (6 May 2017): forgot to accept 0 as missing value for
+	    // case/control
+	    if (IS_SET(quantitative_phenos, new_pheno_idx)) {
+	      if (dxx != missing_phenod) {
+		SET_BIT(sample_uidx, pheno_cols_iter->nonmiss);
+		pheno_cols_iter->data.qt[sample_uidx] = dxx;
+	      }
+	    } else {
+	      if (dxx == pheno_cased) {
+	        SET_BIT(sample_uidx, pheno_cols_iter->data.cc);
+		SET_BIT(sample_uidx, pheno_cols_iter->nonmiss);
+	      } else if (dxx == pheno_ctrld) {
+		SET_BIT(sample_uidx, pheno_cols_iter->nonmiss);
+	      }
+	    }
+	  }
+	  ++pheno_cols_iter;
+	}
+	pheno_info_reverse_ll = pheno_info_reverse_ll->next;
+      }
+    }
+    *pheno_names_ptr = pheno_names;
+    *max_pheno_name_blen_ptr = max_pheno_name_blen;
+  }
+  while (0) {
+  load_phenos_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  load_phenos_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  load_phenos_ret_LONG_LINE:
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, pheno_fname);
+  load_phenos_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+  load_phenos_ret_MALFORMED_INPUT_2:
+    logerrprintb();
+  load_phenos_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  load_phenos_ret_MISSING_TOKENS:
+    LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, pheno_fname);
+    reterr = kPglRetMalformedInput;
+    break;
+  load_phenos_ret_INCOMPATIBLE_PHENOSTRS:
+    LOGERRPRINTFWW("Error: Incompatible phenotype values in %s. (Case/control and quantitative phenotypes must be entirely numeric/\"NA\", and categorical phenotypes must be entirely non-numeric.)\n", pheno_fname);
+    reterr = kPglRetMalformedInput;
+    break;
+  load_phenos_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  load_phenos_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ load_phenos_ret_1:
+  bigstack_reset(bigstack_mark);
+  gzclose_cond(gz_infile);
+  if (reterr) {
+    free_cond(pheno_names);
+    if (*pheno_names_ptr) {
+      free(*pheno_names_ptr);
+      *pheno_names_ptr = nullptr;
+    }
+    cleanup_pheno_cols(*pheno_ct_ptr, *pheno_cols_ptr);
+    *pheno_ct_ptr = 0;
+    *pheno_cols_ptr = nullptr;
+  }
+  return reterr;
+}
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/plink2_psam.h b/plink2_psam.h
new file mode 100644
index 0000000..8f0a8cc
--- /dev/null
+++ b/plink2_psam.h
@@ -0,0 +1,91 @@
+#ifndef __PLINK2_PSAM_H__
+#define __PLINK2_PSAM_H__
+
+// This library is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation, either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_common.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+// Key .psam properties: (i) .fam files are valid .psam files; (ii) .fam-like
+// files with an additional "SID" column (disambiguating multiple samples from
+// e.g. the same cancer patient) are valid; (iii) zero, or many, phenotypes are
+// now ok; (iv) Oxford .sample files can be converted to .psam without loss of
+// information; and (v) loader should work on .ped files as well.
+
+// File starts with an arbitrary (possibly zero) number of header lines marked
+// by a leading '#'.  All lines which don't start with '#FID' or '#IID' are
+// currently ignored.  The #FID/IID line specifies the columns in the .psam
+// file; the following column headers are recognized:
+//   IID (required)
+//   SID
+//   PAT
+//   MAT
+//   SEX
+// FID must either be the first column, or absent.  If it's absent, the IID
+// value is copied into the FID field.
+// Any other value is treated as a phenotype/covariate name.
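+// For example (illustrative), a minimal header plus two sample lines:
+//   #FID  IID  SID  PAT  MAT  SEX  PHENO1
+//   fam1  s1   0    0    0    1    2.3
+//   fam1  s2   0    0    0    2    NA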
+//
+// The loader will error out if multiple #FID/IID lines are in the header for
+// some bizarre reason.  If no #FID/IID line is present, fam_cols controls the
+// default (e.g. fam_cols == FAM_COL_13456 means "#FID IID PAT MAT SEX PHENO").
+
+
+// Memory for all the return arrays is allocated off the bottom of g_bigstack.
+
+// todo: the new indiv_represent[] bitvector tracks which sample should be
+// considered the primary one for an individual, when there are multiple
+// samples with identical FID/IID but differing SID.  (Default is the first
+// sample for each individual, but if a later sample has SID == '0' that takes
+// precedence.)
+// plink2 order of operations should be arranged so that FID/IID-insensitive
+// sample filters happen first, then indiv_represent is computed, then
+// FID/IID-sensitive sample filters are applied.
+
+// chosen to be likely to fit in L3 cache
+CONSTU31(kCatHtableSize, 524287);
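+// (524287 == 2^19 - 1, a Mersenne prime; this is where the "2^19 - 1
+// categorical phenotypes" limit enforced by load_phenos() comes from.)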
+static_assert(kCatHtableSize >= kMaxPhenoCt, "kCatHtableSize cannot be smaller than kMaxPhenoCt.");
+
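+// Each category name is threaded onto two singly linked lists at once:
+// htable_next chains hash-bucket collisions, while pheno_next preserves
+// per-phenotype insertion order, so categories with cat_idx 1..n can be
+// replayed in order when the final category_names[] array is filled in.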
+typedef struct catname_ll2_struct {
+  struct catname_ll2_struct* htable_next;
+  struct catname_ll2_struct* pheno_next;
+  uint32_t cat_idx; // 0 == "NONE", etc.
+  char ss[];
+} catname_ll2_t;
+
+pglerr_t load_psam(const char* psamname, const range_list_t* pheno_range_list_ptr, fam_col_t fam_cols, uint32_t pheno_ct_max, int32_t missing_pheno, uint32_t affection_01, uintptr_t* max_sample_id_blen_ptr, uintptr_t* max_sid_blen_ptr, uintptr_t* max_paternal_id_blen_ptr, uintptr_t* max_maternal_id_blen_ptr, uintptr_t** sample_include_ptr, char** sample_ids_ptr, char** sids_ptr, char** paternal_ids_ptr, char** maternal_ids_ptr, uintptr_t** founder_info_ptr, uintptr_t** sex_nm_ptr, uintpt [...]
+
+HEADER_INLINE boolerr_t is_reserved_pheno_name(const char* pheno_name, uint32_t pheno_name_slen) {
+  if (pheno_name_slen != 3) {
+    return 0;
+  }
+  // tolerate "SEX" column in phenotype/covariate files; just impose some
+  // restrictions on it when writing .psam files.
+  return (!memcmp(pheno_name, "FID", 3)) || (!memcmp(pheno_name, "IID", 3)) || (!memcmp(pheno_name, "SID", 3)) || (!memcmp(pheno_name, "PAT", 3)) || (!memcmp(pheno_name, "MAT", 3));
+}
+
+// also for loading covariates.  set affection_01 to 2 to prohibit case/control
+pglerr_t load_phenos(const char* pheno_fname, const range_list_t* pheno_range_list_ptr, const uintptr_t* sample_include, const char* sample_ids, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t max_sample_id_blen, int32_t missing_pheno, uint32_t affection_01, pheno_col_t** pheno_cols_ptr, char** pheno_names_ptr, uint32_t* pheno_ct_ptr, uintptr_t* max_pheno_name_blen_ptr);
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+
+#endif // __PLINK2_PSAM_H__
diff --git a/plink2_pvar.cpp b/plink2_pvar.cpp
new file mode 100644
index 0000000..f86d6f0
--- /dev/null
+++ b/plink2_pvar.cpp
@@ -0,0 +1,1584 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_data.h"
+#include "plink2_decompress.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+// this used to employ a backward-growing linked list, with variable-size
+// elements, but demultiplexing was relatively expensive.  now we allocate
+// size-64k pos[], allele_idxs[], ids[], cms[], etc. blocks, and just memcpy
+// those chunks at the end.  (cms[] is lazy-initialized.)
+CONSTU31(kLoadPvarBlockSize, 65536);
+static_assert(!(kLoadPvarBlockSize & (kLoadPvarBlockSize - 1)), "kLoadPvarBlockSize must be a power of 2.");
+static_assert(kLoadPvarBlockSize >= (kMaxMediumLine / 8), "kLoadPvarBlockSize cannot be smaller than kMaxMediumLine / 8.");
+
+
+static_assert(!kChrOffsetX, "read_chrset_header_line() assumes kChrOffsetX == 0.");
+static_assert(kChrOffsetY == 1, "read_chrset_header_line() assumes kChrOffsetY == 1.");
+static_assert(kChrOffsetPAR1 == 4, "read_chrset_header_line() assumes kChrOffsetPAR1 == 4.");
+pglerr_t read_chrset_header_line(char* chrset_iter, const char* file_descrip, misc_flags_t misc_flags, uintptr_t line_idx, chr_info_t* cip) {
+  // chrset_iter is expected to point to first character after
+  // "##chrSet=<".
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    uint32_t cmdline_autosome_ct = 0;
+    uint32_t cmdline_haploid = 0;
+    int32_t cmdline_xymt_codes[kChrOffsetCt];
+    if (cip->chrset_source == kChrsetSourceCmdline) {
+      if (misc_flags & kfMiscChrOverrideCmdline) {
+	goto read_chrset_header_line_ret_1;
+      }
+      if (!(misc_flags & kfMiscChrOverrideFile)) {
+	// save off info we need for consistency check
+	cmdline_autosome_ct = cip->autosome_ct;
+	cmdline_haploid = cip->haploid_mask[0] & 1;
+	memcpy(cmdline_xymt_codes, cip->xymt_codes, kChrOffsetCt * sizeof(int32_t));
+      }
+      fill_ulong_zero(kChrMaskWords, cip->haploid_mask);
+    }
+    for (uint32_t uii = 0; uii < kChrOffsetCt; ++uii) {
+      cip->xymt_codes[uii] = -2;
+    }
+    if (!memcmp(chrset_iter, "haploidAutosomeCt=", 18)) {
+      uint32_t explicit_haploid_ct;
+      if (scan_posint_capped(&(chrset_iter[18]), kMaxChrTextnum, &explicit_haploid_ct)) {
+	sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of %s has an invalid ##chrSet haploid count (max %u).\n", line_idx, file_descrip, kMaxChrTextnum);
+	goto read_chrset_header_line_ret_MALFORMED_INPUT_WW;
+      }
+      // could verify that X, Y, etc. are not present?
+      if (cmdline_autosome_ct) {
+	if (!cmdline_haploid) {
+	  sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of %s specifies a haploid genome, while a diploid genome was specified on the command line.\n", line_idx, file_descrip);
+	  goto read_chrset_header_line_ret_INCONSISTENT_INPUT_WW;
+	}
+	if (explicit_haploid_ct != cmdline_autosome_ct) {
+	  sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of %s specifies %u autosome%s, while the command line specified %u.\n", line_idx, file_descrip, explicit_haploid_ct, (explicit_haploid_ct == 1)? "" : "s", cmdline_autosome_ct);
+	  goto read_chrset_header_line_ret_INCONSISTENT_INPUT_WW;
+	}
+      }
+      cip->autosome_ct = explicit_haploid_ct;
+      fill_all_bits(explicit_haploid_ct + 1, cip->haploid_mask);
+    } else {
+      if (memcmp(chrset_iter, "autosomePairCt=", 15)) {
+	sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of %s does not have expected ##chrSet format.\n", line_idx, file_descrip);
+	goto read_chrset_header_line_ret_MALFORMED_INPUT_WW;
+      }
+      chrset_iter = &(chrset_iter[15]);
+      uint32_t explicit_autosome_ct;
+      if (scanadv_posint_capped(kMaxChrTextnum, &chrset_iter, &explicit_autosome_ct)) {
+	sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of %s has an invalid ##chrSet autosome count (max %u).\n", line_idx, file_descrip, kMaxChrTextnum);
+	goto read_chrset_header_line_ret_MALFORMED_INPUT_WW;
+      }
+      cip->autosome_ct = explicit_autosome_ct;
+      if (*chrset_iter != '>') {
+	if (*chrset_iter != ',') {
+	  sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of %s does not have expected ##chrSet format.\n", line_idx, file_descrip);
+	  goto read_chrset_header_line_ret_MALFORMED_INPUT_WW;
+	}
+	// this can theoretically be confused by e.g. a Description="..." field
+	// containing commas not followed by spaces
+	while (1) {
+	  ++chrset_iter;
+	  // uppercase
+	  uint32_t first_char_ui = ((unsigned char)(*chrset_iter)) & 0xdf;
+
+	  uint32_t second_char_ui = (unsigned char)chrset_iter[1];
+	  // 44 is ',', 62 is '>'
+	  if ((second_char_ui == 44) || (second_char_ui == 62)) {
+	    if (first_char_ui == 77) {
+	      // M
+	      cip->xymt_codes[kChrOffsetMT] = explicit_autosome_ct + 1 + kChrOffsetMT;
+	    } else {
+	      first_char_ui -= 88; // X = 0, Y = 1, everything else larger
+	      if (first_char_ui < 2) {
+		cip->xymt_codes[first_char_ui] = explicit_autosome_ct + 1 + first_char_ui;
+	      }
+	    }
+	  } else {
+	    second_char_ui &= 0xdf;
+	    const uint32_t third_char_ui = (unsigned char)chrset_iter[2];
+	    if ((third_char_ui == 44) || (third_char_ui == 62)) {
+	      if ((first_char_ui == 88) && (second_char_ui == 89)) {
+		// XY
+		cip->xymt_codes[kChrOffsetXY] = explicit_autosome_ct + 1 + kChrOffsetXY;
+	      } else if ((first_char_ui == 77) && (second_char_ui == 84)) {
+		// MT
+		cip->xymt_codes[kChrOffsetMT] = explicit_autosome_ct + 1 + kChrOffsetMT;
+	      }
+	    } else if ((first_char_ui == 80) && (second_char_ui == 65) && ((third_char_ui & 0xdf) == 82)) {	    
+	      // PAR1, PAR2
+	      const uint32_t par_idx_m1 = ((unsigned char)chrset_iter[3]) - '1';
+	      if ((par_idx_m1 < 2) && ((chrset_iter[4] == ',') || (chrset_iter[4] == '>'))) {
+		cip->xymt_codes[kChrOffsetPAR1] = explicit_autosome_ct + 1 + kChrOffsetPAR1 + par_idx_m1;
+	      }
+	    }
+	  }
+	  chrset_iter = strchr(chrset_iter, ',');
+	  if (!chrset_iter) {
+	    break;
+	  }
+	}
+      }
+      if (cmdline_autosome_ct) {
+	if (cmdline_haploid) {
+	  sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of %s specifies a diploid genome, while a haploid genome was specified on the command line.\n", line_idx, file_descrip);
+	  goto read_chrset_header_line_ret_INCONSISTENT_INPUT_WW;
+	}
+	if (explicit_autosome_ct != cmdline_autosome_ct) {
+	  sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of %s specifies %u autosome%s, while the command line specified %u.\n", line_idx, file_descrip, explicit_autosome_ct, (explicit_autosome_ct == 1)? "" : "s", cmdline_autosome_ct);
+	  goto read_chrset_header_line_ret_INCONSISTENT_INPUT_WW;
+	}
+	for (uint32_t xymt_idx = 0; xymt_idx < kChrOffsetPAR1; ++xymt_idx) {
+	  // it's okay if the command line doesn't explicitly exclude e.g. chrX
+	  // while for whatever reason it is excluded from ##chrSet; but the
+	  // reverse can create problems
+	  if ((cmdline_xymt_codes[xymt_idx] < 0) && (cip->xymt_codes[xymt_idx] >= 0)) {
+	    sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of %s specifies a chromosome set including %s, while the command line excludes it.\n", line_idx, file_descrip, g_xymt_log_names[xymt_idx]);
+	    goto read_chrset_header_line_ret_INCONSISTENT_INPUT_WW;
+	  }
+	}
+      }
+    }
+    cip->chrset_source = kChrsetSourceFile;
+  }
+  while (0) {
+  read_chrset_header_line_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+    reterr = kPglRetMalformedInput;
+    break;
+  read_chrset_header_line_ret_INCONSISTENT_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+    reterr = kPglRetInconsistentInput;
+    break;
+  }
+ read_chrset_header_line_ret_1:
+  return reterr;
+}
+
+void varid_template_init(const char* varid_template, uint32_t* template_insert_ct_ptr, uint32_t* template_base_len_ptr, uint32_t* alleles_needed_ptr, const char* varid_template_segs[5], uint32_t* varid_template_seg_lens, uint32_t* varid_template_types) {
+  // template string was previously validated
+  // varid_template is only input, everything else is output values
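+  // Example (illustrative): template "@:#:$r:$a" produces connector segments
+  // {"", ":", ":", ":", ""} and insert types {0 (chrom), 1 (bp), 2 (ref),
+  // 3 (alt)}, so a variant at chr1:12345 with REF=A, ALT=C would be renamed
+  // to "1:12345:A:C".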
+  const char* varid_template_iter = varid_template;
+  uint32_t template_insert_ct = 0;
+  uint32_t template_base_len = 0;
+  unsigned char ucc = (unsigned char)(*varid_template_iter);
+  uint32_t alleles_needed = 0; // bit 0 = ref, bit 1 = alt, bit 2 = ascii sort
+  varid_template_segs[0] = varid_template_iter;
+  do {
+    if (ucc <= '@') {
+      uint32_t seg_len;
+      uint32_t insert_type;
+      if (ucc == '@') {
+	seg_len = (uintptr_t)(varid_template_iter - varid_template_segs[template_insert_ct]);
+	insert_type = 0;
+	goto varid_template_init_match;
+      }
+      if (ucc == '#') {
+	seg_len = (uintptr_t)(varid_template_iter - varid_template_segs[template_insert_ct]);
+	insert_type = 1;
+	goto varid_template_init_match;
+      }
+      if (ucc == '$') {
+	seg_len = (uintptr_t)(varid_template_iter - varid_template_segs[template_insert_ct]);
+	{
+	  const uint32_t uii = (unsigned char)(*(++varid_template_iter));
+	  if (uii <= '2') {
+	    alleles_needed += 2; // this happens twice
+	    insert_type = uii - 48; // '1' -> type 2, '2' -> type 3
+	  } else {
+	    // 'r' -> type 2, 'a' -> type 3
+	    insert_type = 1 + ((uii & 0xdf) == 'A');
+	  }
+	  alleles_needed += insert_type;
+	  ++insert_type;
+	}
+      varid_template_init_match:
+	varid_template_seg_lens[template_insert_ct] = seg_len;
+	template_base_len += seg_len;
+	varid_template_types[template_insert_ct++] = insert_type;
+	varid_template_segs[template_insert_ct] = &(varid_template_iter[1]);
+      }
+    }
+    ucc = (unsigned char)(*(++varid_template_iter));
+  } while (ucc);
+  const uint32_t seg_len = (uintptr_t)(varid_template_iter - varid_template_segs[template_insert_ct]);
+  varid_template_seg_lens[template_insert_ct] = seg_len;
+  *template_insert_ct_ptr = template_insert_ct;
+  *template_base_len_ptr = template_base_len + seg_len;
+  *alleles_needed_ptr = alleles_needed;
+}
+
+void backfill_chr_idxs(const chr_info_t* cip, uint32_t chrs_encountered_m1, uint32_t offset, uint32_t end_vidx, chr_idx_t* chr_idxs) {
+  uint32_t chr_fo_idx = chrs_encountered_m1;
+  while (1) {
+    uint32_t start_vidx = cip->chr_fo_vidx_start[chr_fo_idx];
+    if (start_vidx < offset) {
+      start_vidx = offset;
+    }
+    chr_idx_t* chr_idxs_write_base = &(chr_idxs[start_vidx - offset]);
+    const uint32_t vidx_ct = end_vidx - start_vidx;
+    const chr_idx_t cur_chr_idx = (uint32_t)(cip->chr_file_order[chr_fo_idx]);
+    for (uint32_t uii = 0; uii < vidx_ct; ++uii) {
+      chr_idxs_write_base[uii] = cur_chr_idx;
+    }
+    if (start_vidx == offset) {
+      return;
+    }
+    end_vidx = start_vidx;
+    --chr_fo_idx;
+  }
+}
+
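+// Returns a pointer to the start of the "PR" flag within a ';'-delimited INFO
+// token if the flag is present, and nullptr otherwise; e.g. (illustrative)
+// the token "DP=4;PR;AF=0.5" yields a pointer to the "PR;AF=0.5" suffix.
+// Note that it may null-terminate info_token in place.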
+char* pr_in_info_token(uint32_t info_slen, char* info_token) {
+  if ((!memcmp(info_token, "PR", 2)) && ((info_slen == 2) || (info_token[2] == ';'))) {
+    return info_token;
+  }
+  if (!memcmp(&(info_token[((int32_t)info_slen) - 3]), ";PR", 3)) {
+    return &(info_token[info_slen - 2]);
+  }
+  info_token[info_slen] = '\0';
+  char* first_info_end = strchr(info_token, ';');
+  if (!first_info_end) {
+    return nullptr;
+  }
+  char* pr_prestart = strstr(first_info_end, ";PR;");
+  // bugfix: this ternary was inverted, returning nullptr on a match and
+  // dereferencing a null pointer otherwise
+  return pr_prestart? (&(pr_prestart[1])) : nullptr;
+}
+
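+// For example (illustrative), "--split-par 2699520 154931044" (the b37/hg19
+// pseudoautosomal boundaries) moves chrX variants with bp <= 2699520 into
+// PAR1 and variants with bp >= 154931044 into PAR2; everything in between
+// stays on chrX.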
+pglerr_t splitpar(const uint32_t* variant_bps, unsorted_var_t vpos_sortstatus, uint32_t splitpar_bound1, uint32_t splitpar_bound2, uintptr_t* variant_include, uintptr_t* loaded_chr_mask, chr_info_t* cip, uint32_t* chrs_encountered_m1_ptr, uint32_t* exclude_ct_ptr) {
+  const int32_t x_code = cip->xymt_codes[kChrOffsetX];
+  if ((x_code < 0) || (!is_set(loaded_chr_mask, x_code))) {
+    logerrprint("Warning: --split-par had no effect (no X chromosome in dataset).\n");
+    return kPglRetSuccess;
+  }
+  const int32_t par1_code = cip->xymt_codes[kChrOffsetPAR1];
+  const int32_t par2_code = cip->xymt_codes[kChrOffsetPAR2];
+  if (par2_code < 0) {
+    // may want to remove this restriction later
+    logerrprint("Error: --split-par cannot currently be used with a custom chromosome set.\n");
+    return kPglRetInvalidCmdline;
+  }
+  if (is_set(loaded_chr_mask, par1_code) || is_set(loaded_chr_mask, par2_code)) {
+    logerrprint("Error: --split-par cannot be used on a dataset which already contains a PAR1 or\nPAR2 region.\n");
+    return kPglRetInvalidCmdline;
+  }
+  if (vpos_sortstatus & kfUnsortedVarBp) {
+    logerrprint("Error: --split-par cannot be used with an unsorted .bim/.pvar file.\n");
+    return kPglRetInvalidCmdline;
+  }
+  const uint32_t orig_xchr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)x_code];
+  const uint32_t orig_x_start = cip->chr_fo_vidx_start[orig_xchr_fo_idx];
+  const uint32_t orig_x_end = cip->chr_fo_vidx_start[orig_xchr_fo_idx + 1];
+  const uint32_t par1_end = orig_x_start + uint32arr_greater_than(&(variant_bps[orig_x_start]), orig_x_end - orig_x_start, splitpar_bound1 + 1);
+  const uint32_t par2_start = par1_end + uint32arr_greater_than(&(variant_bps[par1_end]), orig_x_end - par1_end, splitpar_bound2);
+  uint32_t tot_codes_changed = (par1_end - orig_x_start) + (orig_x_end - par2_start);
+  if (!tot_codes_changed) {
+    logerrprint("Warning: --split-par had no effect (no X variants were in the PARs).\n");
+    return kPglRetSuccess;
+  }
+  // one of the PARs, and/or the main chrX body, may be empty; that's not a big
+  // deal
+  *chrs_encountered_m1_ptr += 2;
+  const uint32_t chrs_encountered_m1 = *chrs_encountered_m1_ptr;
+  cip->chr_fo_vidx_start[chrs_encountered_m1 + 1] = cip->chr_fo_vidx_start[chrs_encountered_m1 - 1];
+  for (uint32_t chr_fo_idx = chrs_encountered_m1 - 2; chr_fo_idx > orig_xchr_fo_idx; --chr_fo_idx) {
+    cip->chr_fo_vidx_start[chr_fo_idx + 2] = cip->chr_fo_vidx_start[chr_fo_idx];
+    const int32_t cur_chr_idx = cip->chr_file_order[chr_fo_idx];
+    cip->chr_file_order[chr_fo_idx + 2] = cur_chr_idx;
+    cip->chr_idx_to_foidx[cur_chr_idx] = chr_fo_idx + 2;
+  }
+  cip->chr_fo_vidx_start[orig_xchr_fo_idx + 1] = par1_end;
+  cip->chr_fo_vidx_start[orig_xchr_fo_idx + 2] = par2_start;
+  cip->chr_file_order[orig_xchr_fo_idx] = par1_code;
+  cip->chr_file_order[orig_xchr_fo_idx + 1] = x_code;
+  cip->chr_file_order[orig_xchr_fo_idx + 2] = par2_code;
+  cip->chr_idx_to_foidx[(uint32_t)par1_code] = orig_xchr_fo_idx;
+  cip->chr_idx_to_foidx[(uint32_t)x_code] = orig_xchr_fo_idx + 1;
+  cip->chr_idx_to_foidx[(uint32_t)par2_code] = orig_xchr_fo_idx + 2;
+  uintptr_t* chr_mask = cip->chr_mask;
+  if (par1_end > orig_x_start) {
+    if (!is_set(chr_mask, par1_code)) {
+      *exclude_ct_ptr += popcount_bit_idx(variant_include, orig_x_start, par1_end);
+      clear_bits_nz(orig_x_start, par1_end, variant_include);
+    } else {
+      set_bit(par1_code, loaded_chr_mask);
+    }
+  }
+  if (par1_end == par2_start) {
+    clear_bit(x_code, chr_mask);
+  } else if (!is_set(chr_mask, x_code)) {
+    clear_bit(x_code, chr_mask);
+    *exclude_ct_ptr += popcount_bit_idx(variant_include, par1_end, par2_start);
+    clear_bits_nz(par1_end, par2_start, variant_include);
+  }
+  if (par2_start < orig_x_end) {
+    if (!is_set(chr_mask, par2_code)) {
+      *exclude_ct_ptr += popcount_bit_idx(variant_include, par2_start, orig_x_end);
+      clear_bits_nz(par2_start, orig_x_end, variant_include);
+    } else {
+      set_bit(par2_code, loaded_chr_mask);
+    }
+  }
+  LOGPRINTF("--split-par: %u chromosome code%s changed.\n", tot_codes_changed, (tot_codes_changed == 1)? "" : "s");
+  return kPglRetSuccess;
+}
+
+// --input-missing-genotype code set to 1 by load_pvar()
+static uint8_t acgtm_bool_table[256] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+static inline uint32_t is_acgtm(unsigned char ucc) {
+  return (uint32_t)(acgtm_bool_table[ucc]);
+}
+
+static_assert((!(kMaxIdSlen % kCacheline)), "load_pvar() must be updated.");
+pglerr_t load_pvar(const char* pvarname, char* var_filter_exceptions_flattened, const char* varid_template, const char* missing_varid_match, misc_flags_t misc_flags, pvar_psam_t pvar_psam_modifier, exportf_flags_t exportf_modifier, float var_min_qual, uint32_t splitpar_bound1, uint32_t splitpar_bound2, uint32_t new_variant_id_max_allele_slen, uint32_t snps_only, uint32_t split_chr_ok, chr_info_t* cip, uint32_t* max_variant_id_slen_ptr, uint32_t* info_reload_slen_ptr, unsorted_var_t* vpos [...]
+  // chr_info, max_variant_id_slen, and info_reload_slen are in/out; just
+  // outparameters after them.  (Due to its large size in some VCFs, INFO is
+  // not kept in memory for now.  This has a speed penalty, of course; maybe
+  // it's worthwhile to conditionally load it later.)
+
+  // variant_allele_idxs currently assumed to be initialized to nullptr
+
+  // should handle raw_variant_ct == 0 properly
+
+  // todo: upgrade this to handle split chromosomes, unsorted chromosomes/bp
+  //   coordinates, maybe skipping of allele code loading
+  // probable todo: load INFO:END.  (does this allow the CNV module to be
+  //   unified with the rest of the program?)  but this will probably wait
+  //   until I need to analyze some sort of CNV data, and that day keeps
+  //   getting postponed...
+  // possible todo: require FILTER to only contain values declared in header,
+  //   and modify its storage accordingly?  (pointless for now, but worthwhile
+  //   to keep an eye on what typical VCF files look like.)
+  
+  // Workspace is used as follows:
+  // |--header, allele_storage->----|--other return arrays---|--loadbuf--|-
+  //                                                        1/4
+  //
+  // -temp-->----|----<- filter failures, variant IDs, long alleles--|
+  //                                                                end
+  // I.e. on successful return, both bigstack_base and bigstack_end will move.
+  // This is designed to be called near the start of a program, at a time when
+  // no large temporary buffer is needed.
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+
+  gzFile gz_infile = nullptr;
+  uintptr_t line_idx = 0;
+  uint32_t max_allele_slen = 1;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    reterr = gzopen_read_checked(pvarname, &gz_infile);
+    if (reterr) {
+      goto load_pvar_ret_1;
+    }
+    const uintptr_t initial_bigstack_size = bigstack_left();
+    uintptr_t loadbuf_size = round_down_pow2(initial_bigstack_size / 4, kCacheline);
+    char* loadbuf = (char*)(&(bigstack_mark[loadbuf_size]));
+    if (loadbuf_size > kMaxLongLine) {
+      loadbuf_size = kMaxLongLine;
+    } else if (loadbuf_size <= kLoadPvarBlockSize * 2 * sizeof(intptr_t)) {
+      goto load_pvar_ret_NOMEM;
+    }
+    loadbuf[loadbuf_size - 1] = ' ';
+
+    char* xheader_end = ((pvar_psam_modifier & kfPvarColXheader) || (exportf_modifier & kfExportfVcf))? ((char*)bigstack_mark) : nullptr;
+    uint32_t chrset_present = 0;
+    uint32_t info_pr_present = 0;
+    uint32_t info_nonpr_present = 0;
+    char* loadbuf_first_token;
+    while (1) {
+      ++line_idx;
+      // strangely, gzgets tends to be more than twice as fast as fgets on my
+      // dev machine.  may as well support gzipped input files everywhere...
+      // (update: now using zstd's zlibWrapper gzgets.  todo: verify that this
+      // wrapper has negligible performance cost.)
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto load_pvar_ret_READ_FAIL;
+	}
+	loadbuf_first_token = loadbuf;
+	loadbuf_first_token[0] = '\0';
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	if (loadbuf_size == kMaxLongLine) {
+	  goto load_pvar_ret_LONG_LINE;
+	}
+	goto load_pvar_ret_NOMEM;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      if ((loadbuf_first_token[0] != '#') || (!strcmp_se(loadbuf_first_token, "#CHROM", 6))) {
+	break;
+      }
+      if (!memcmp(loadbuf_first_token, "##INFO=<ID=PR,Number=", 21)) {
+	if (info_pr_present) {
+	  sprintf(g_logbuf, "Error: Duplicate INFO:PR header line in %s.\n", pvarname);
+	  goto load_pvar_ret_MALFORMED_INPUT_WW;
+	}
+	if (memcmp(&(loadbuf_first_token[21]), "0,Type=Flag,Description=", 24)) {
+	  sprintf(g_logbuf, "Error: Header line %" PRIuPTR " of %s does not have expected INFO:PR format.\n", line_idx, pvarname);
+	  goto load_pvar_ret_MALFORMED_INPUT_WW;
+	}
+	info_pr_present = 1;
+      } else if ((!info_nonpr_present) && (!memcmp(loadbuf_first_token, "##INFO=<ID=", 11))) {
+	info_nonpr_present = 1;
+      }
+      if (!memcmp(loadbuf_first_token, "##chrSet=<", 10)) {
+	if (chrset_present) {
+	  sprintf(g_logbuf, "Error: Multiple ##chrSet header lines in %s.\n", pvarname);
+	  goto load_pvar_ret_MALFORMED_INPUT_WW;
+	}
+	chrset_present = 1;
+	const uint32_t cmdline_chrset = (cip->chrset_source == kChrsetSourceCmdline) && (!(misc_flags & kfMiscChrOverrideFile));
+	reterr = read_chrset_header_line(&(loadbuf_first_token[10]), pvarname, misc_flags, line_idx, cip);
+	if (reterr) {
+	  goto load_pvar_ret_1;
+	}
+	if (!cmdline_chrset) {
+	  const uint32_t autosome_ct = cip->autosome_ct;
+	  if (cip->haploid_mask[0] & 1) {
+	    LOGPRINTF("chrSet header line: %u autosome%s (haploid).\n", autosome_ct, (autosome_ct == 1)? "" : "s");
+	  } else {
+	    LOGPRINTF("chrSet header line: %u autosome pair%s.\n", autosome_ct, (autosome_ct == 1)? "" : "s");
+	  }
+	}
+      } else if (xheader_end) {
+	// if the "pvar file" was actually a VCF, suppress the same lines we'd
+	// suppress when importing with --vcf.
+	if (memcmp(loadbuf_first_token, "##fileformat=", 13) && memcmp(loadbuf_first_token, "##fileDate=", 11) && memcmp(loadbuf_first_token, "##source=", 9) && memcmp(loadbuf_first_token, "##FORMAT=", 9)) {
+	  uint32_t line_slen = strlen(loadbuf_first_token);
+	  if (loadbuf_first_token[line_slen - 1] == '\n') {
+	    --line_slen;
+	    if (loadbuf_first_token[line_slen - 1] == '\r') {
+	      --line_slen;
+	    }
+	  }
+	  if ((uintptr_t)(loadbuf - xheader_end) < line_slen + 2) {
+	    goto load_pvar_ret_NOMEM;
+	  }
+	  xheader_end = memcpya(xheader_end, loadbuf_first_token, line_slen);
+	  append_binary_eoln(&xheader_end);
+	}
+      }
+    }
+    *xheader_info_pr_ptr = info_pr_present;
+    if (xheader_end) {
+      *xheader_ptr = (char*)bigstack_mark;
+      *xheader_blen_ptr = (uintptr_t)(xheader_end - (*xheader_ptr));
+      g_bigstack_base = (unsigned char*)round_up_pow2((uintptr_t)xheader_end, kCacheline);
+    }
+    finalize_chrset(misc_flags, cip);
+    char** allele_storage = (char**)g_bigstack_base;
+    char** allele_storage_iter = allele_storage;
+
+    uint32_t col_skips[8];
+    uint32_t col_types[8];
+    uint32_t relevant_postchr_col_ct = 5;
+    uint32_t alt_col_idx = 4;
+    uint32_t load_qual_col = 0;
+    uint32_t load_filter_col = 0;
+    uint32_t info_col_present = 0;
+    uint32_t cm_col_present = 0;
+    if (loadbuf_first_token[0] == '#') {
+      // parse header
+      // [-1] = #CHROM (must be first column)
+      // [0] = POS
+      // [1] = ID
+      // [2] = REF
+      // [3] = ALT
+      // [4] = QUAL
+      // [5] = FILTER
+      // [6] = INFO
+      // [7] = CM (usually absent)
+      
+      // code is similar to plink 1.9 annotate() and gene_report(), but they
+      // don't have a forced first column
+      // might want to write plink2_common library functions for this...
+      uint32_t col_idx = 0;
+      char* token_end = &(loadbuf_first_token[6]);
+      uint32_t found_header_bitset = 0;
+      relevant_postchr_col_ct = 0;
+      char* loadbuf_iter;
+      while (1) {
+        loadbuf_iter = skip_initial_spaces(token_end);
+	if (is_eoln_kns(*loadbuf_iter)) {
+	  break;
+	}
+	++col_idx;
+	token_end = token_endnn(loadbuf_iter);
+        const uint32_t token_slen = (uintptr_t)(token_end - loadbuf_iter);
+	uint32_t cur_col_type;
+	if (token_slen <= 3) {
+	  if (token_slen == 3) {
+	    if (!memcmp(loadbuf_iter, "POS", 3)) {
+	      cur_col_type = 0;
+	    } else if (!memcmp(loadbuf_iter, "REF", 3)) {
+	      cur_col_type = 2;
+	    } else if (!memcmp(loadbuf_iter, "ALT", 3)) {
+	      cur_col_type = 3;
+	      alt_col_idx = col_idx;
+	    } else {
+	      continue;
+	    }
+	  } else if (token_slen == 2) {
+	    if (!memcmp(loadbuf_iter, "ID", 2)) {
+	      cur_col_type = 1;
+	    } else if (!memcmp(loadbuf_iter, "CM", 2)) {
+	      cur_col_type = 7;
+	      cm_col_present = 1;
+	    } else {
+	      continue;
+	    }
+	  } else {
+	    continue;
+	  }
+	} else if ((token_slen == 4) && (!memcmp(loadbuf_iter, "QUAL", 4))) {
+	  load_qual_col = 2 * ((pvar_psam_modifier & (kfPvarColMaybequal | kfPvarColQual)) || (exportf_modifier & kfExportfVcf)) + (var_min_qual != -1);
+	  if (!load_qual_col) {
+	    continue;
+	  }
+	  cur_col_type = 4;
+	} else if ((token_slen == 4) && (!memcmp(loadbuf_iter, "INFO", 4))) {
+	  cur_col_type = 6;
+	  info_col_present = 1;
+	} else if (token_slen == 6) {
+	  if (!memcmp(loadbuf_iter, "FILTER", 6)) {
+	    load_filter_col = 2 * ((pvar_psam_modifier & (kfPvarColMaybefilter | kfPvarColFilter)) || (exportf_modifier & kfExportfVcf)) + ((misc_flags / kfMiscExcludePvarFilterFail) & 1);
+	    if (!load_filter_col) {
+	      continue;
+	    }
+	    cur_col_type = 5;
+	  } else if (!memcmp(loadbuf_iter, "FORMAT", 6)) {
+	    break;
+	  } else {
+	    continue;
+	  }
+	} else {
+	  continue;
+	}
+	const uint32_t cur_col_type_shifted = 1 << cur_col_type;
+	if (found_header_bitset & cur_col_type_shifted) {
+	  *token_end = '\0';
+	  sprintf(g_logbuf, "Error: Duplicate column header '%s' on line %" PRIuPTR " of %s.\n", loadbuf_iter, line_idx, pvarname);
+	  goto load_pvar_ret_MALFORMED_INPUT_WW;
+	}
+	found_header_bitset |= cur_col_type_shifted;
+	col_skips[relevant_postchr_col_ct] = col_idx;
+	col_types[relevant_postchr_col_ct++] = cur_col_type;
+      }
+      if ((found_header_bitset & 0x0f) != 0x0f) {
+	sprintf(g_logbuf, "Error: Missing column header(s) on line %" PRIuPTR " of %s. (POS, ID, REF, and ALT are required.)\n", line_idx, pvarname);
+	goto load_pvar_ret_MALFORMED_INPUT_WW;
+      }
+      for (uint32_t rpc_col_idx = relevant_postchr_col_ct - 1; rpc_col_idx; --rpc_col_idx) {
+	col_skips[rpc_col_idx] -= col_skips[rpc_col_idx - 1];
+      }
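+      // col_skips[] now holds relative skip counts for next_token_mult();
+      // e.g. (illustrative) absolute column indices {1,2,4,6} become
+      // {1,1,2,2}.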
+      loadbuf_first_token[0] = '\0'; // forces line to be skipped by main loop
+    } else if (loadbuf_first_token[0]) {
+      col_skips[0] = 1;
+      col_skips[1] = 1;
+      col_skips[2] = 1;
+      col_skips[3] = 1;
+      col_types[0] = 1;
+      // CM column is formally optional in headerless .pvar files (and it was
+      // "secretly" optional for the standard plink 1.9 standard .bim loader).
+      // If the line has exactly 5 columns, assume CM is omitted.
+      char* loadbuf_iter = next_token_mult(loadbuf_first_token, 4);
+      if (!loadbuf_iter) {
+	goto load_pvar_ret_MISSING_TOKENS;
+      }
+      loadbuf_iter = next_token(loadbuf_iter);
+      if (!loadbuf_iter) {
+        // #CHROM ID POS ALT REF
+	relevant_postchr_col_ct = 4;
+	col_types[1] = 0;
+	col_types[2] = 3;
+	col_types[3] = 2;
+	alt_col_idx = 3;
+      } else {
+        // #CHROM ID CM POS ALT REF
+        col_skips[4] = 1;
+	col_types[1] = 7;
+	col_types[2] = 0;
+	col_types[3] = 3;
+	col_types[4] = 2;
+	// alt_col_idx = 4;
+	cm_col_present = 1;
+      }
+    }
+    uint32_t info_reload_slen = *info_reload_slen_ptr;
+    if (!info_col_present) {
+      info_pr_present = 0;
+      info_reload_slen = 0;
+    } else if ((!info_pr_present) && (!info_reload_slen)) {
+      info_col_present = 0;
+    }
+    // done with header, loadbuf_first_token now points to beginning of first
+    // real line.
+    uint32_t max_variant_id_slen = *max_variant_id_slen_ptr;
+    uint32_t chrs_encountered_m1 = 0xffffffffU; // intentional overflow
+    uint32_t prev_chr_code = 0xffffffffU; // force initial mismatch
+    uint32_t raw_variant_ct = 0;
+    uintptr_t* chr_mask = cip->chr_mask;
+    const char* missing_allele_str = &(g_one_char_strs[92]);
+    double last_cm = -DBL_MAX;
+    int32_t last_bp = 0;
+
+    // this way, we only need to check allele_storage_iter against this (i)
+    // when processing a multiallelic variant or (ii) at the end of a block
+    char** allele_storage_limit = (char**)(&(loadbuf[kLoadPvarBlockSize * (-2) * sizeof(intptr_t)]));
+    
+    unsigned char* tmp_alloc_base = (unsigned char*)(&(loadbuf[loadbuf_size]));
+    uintptr_t* loaded_chr_mask = (uintptr_t*)tmp_alloc_base;
+    // bugfix (2 Jun 2017): forgot to zero-initialize loaded_chr_mask
+    fill_ulong_zero(kChrMaskWords, loaded_chr_mask);
+    tmp_alloc_base = &(tmp_alloc_base[round_up_pow2(kChrMaskWords * sizeof(intptr_t), kCacheline)]);
+    unsigned char* tmp_alloc_end = bigstack_end_mark;
+    uint32_t fexcept_ct = 0;
+    uintptr_t max_fexcept_blen = 2;
+    char* sorted_fexcepts = nullptr;
+    if (var_filter_exceptions_flattened) {
+      char** strptr_arr = (char**)tmp_alloc_end;
+      if (count_and_measure_multistr_reverse_alloc(var_filter_exceptions_flattened, ((uintptr_t)(tmp_alloc_end - tmp_alloc_base)) / sizeof(intptr_t), &fexcept_ct, &max_fexcept_blen, &strptr_arr)) {
+	goto load_pvar_ret_NOMEM;
+      }
+      if ((uintptr_t)(((unsigned char*)strptr_arr) - tmp_alloc_base) < fexcept_ct * max_fexcept_blen) {
+	goto load_pvar_ret_NOMEM;
+      }
+      strptr_arr_sort(fexcept_ct, strptr_arr);
+      sorted_fexcepts = (char*)tmp_alloc_base;
+      fexcept_ct = copy_and_dedup_sorted_strptrs_to_strbox(strptr_arr, fexcept_ct, max_fexcept_blen, sorted_fexcepts);
+      tmp_alloc_base = &(tmp_alloc_base[round_up_pow2(fexcept_ct * max_fexcept_blen, kCacheline)]);
+    }
+    char* chr_output_name_buf = nullptr;
+    const char* varid_template_segs[5];
+    uint32_t insert_slens[4];
+    uint32_t varid_template_seg_lens[5];
+    uint32_t varid_template_insert_types[4];
+    uint32_t varid_template_insert_ct = 0;
+    uint32_t varid_template_base_len = 0;
+    uint32_t varid_alleles_needed = 0;
+    uint32_t missing_varid_blen = 0;
+    uint32_t missing_varid_match_slen = 0;
+    fill_uint_zero(4, insert_slens);
+    if (varid_template) {
+      if ((uintptr_t)(tmp_alloc_end - tmp_alloc_base) < kMaxIdSlen) {
+	goto load_pvar_ret_NOMEM;
+      }
+      chr_output_name_buf = (char*)tmp_alloc_base;
+      tmp_alloc_base = &(tmp_alloc_base[kMaxIdSlen]);
+      if (!missing_varid_match) {
+	missing_varid_match = &(g_one_char_strs[92]); // '.'
+      }
+      missing_varid_blen = strlen(missing_varid_match);
+      if (misc_flags & kfMiscSetMissingVarIds) {
+	missing_varid_match_slen = missing_varid_blen;
+      }
+      ++missing_varid_blen;
+      varid_template_init(varid_template, &varid_template_insert_ct, &varid_template_base_len, &varid_alleles_needed, varid_template_segs, varid_template_seg_lens, varid_template_insert_types);
+    }
+
+    // prevent later return-array allocations from overlapping with temporary
+    // storage
+    g_bigstack_end = tmp_alloc_base;
+
+    // prevent variant_id_htable_find from breaking
+    if (((const char*)tmp_alloc_end) > (&(g_one_char_strs[512 - kMaxIdSlen]))) {
+      // const_cast
+      tmp_alloc_end = (unsigned char*)((uintptr_t)(&(g_one_char_strs[512 - kMaxIdSlen])));
+    }
+    const uint32_t allow_extra_chrs = (misc_flags / kfMiscAllowExtraChrs) & 1;
+    const uint32_t merge_par = (misc_flags / kfMiscMergePar) & 1;
+    const int32_t x_code = cip->xymt_codes[kChrOffsetX];
+    const int32_t par2_code = cip->xymt_codes[kChrOffsetPAR2];
+    int32_t parx_code = cip->xymt_codes[kChrOffsetPAR1];
+    uint32_t merge_par_ct = 0;
+
+    // Corner case: with --split-par + --not-chr x, we should keep the
+    // pseudoautosomal regions.  To facilitate this, we temporarily don't mask
+    // out chrX; splitpar() handles this properly later.
+    const uint32_t splitpar_and_exclude_x = splitpar_bound2 && (x_code >= 0) && (!is_set(cip->chr_mask, x_code));
+    if (splitpar_and_exclude_x) {
+      set_bit(x_code, cip->chr_mask);
+    }
+
+    if (snps_only > 1) {
+      acgtm_bool_table[(unsigned char)(*g_input_missing_geno_ptr)] = 1;
+    }
+    
+    uint32_t* cur_bps = nullptr;
+    uintptr_t* cur_allele_idxs = nullptr;
+    char** cur_ids = nullptr;
+    uintptr_t* cur_include = nullptr;
+    uintptr_t* cur_qual_present = nullptr;
+    float* cur_quals = nullptr;
+    uintptr_t* cur_filter_present = nullptr;
+    uintptr_t* cur_filter_npass = nullptr;
+    char** cur_filter_storage = nullptr;
+    uintptr_t* cur_nonref_flags = nullptr;
+    uint32_t max_filter_slen = 0;
+    uint32_t exclude_ct = 0;
+
+    // only allocated when necessary
+    // if we want to scale this approach to more fields, we'll need to add a
+    // few pointers to the start of each block.  right now, we force cur_cms[]
+    // to be allocated before cur_chr_idxs[] when both are present, but this
+    // is error-prone.
+    uint32_t at_least_one_npass_filter = 0;
+    uint32_t at_least_one_nzero_cm = 0;
+    const uint32_t new_variant_id_overflow_missing = (misc_flags / kfMiscNewVarIdOverflowMissing) & 1;
+    uintptr_t new_variant_id_allele_len_overflow = 0;
+    double* cur_cms = nullptr;
+    uint32_t cms_start_block = 0xffffffffU;
+
+    chr_idx_t* cur_chr_idxs = nullptr;
+    uint32_t chr_idxs_start_block = 0xffffffffU;
+    uint32_t is_split_chr = 0;
+    unsorted_var_t vpos_sortstatus = kfUnsortedVar0;
+    
+    while (1) {
+      if (!is_eoln_kns(*loadbuf_first_token)) {
+#ifdef __LP64__
+	// maximum prime < 2^32 is 4294967291; quadratic hashing guarantee
+	// breaks down past that divided by 2.
+	if (raw_variant_ct == 0x7ffffffd) {
+	  logerrprint("Error: " PROG_NAME_STR " does not support more than 2^31 - 3 variants.  We recommend other\nsoftware, such as PLINK/SEQ, for very deep studies of small numbers of genomes.\n");
+	  goto load_pvar_ret_MALFORMED_INPUT;
+	}
+#endif
+	const uint32_t variant_idx_lowbits = raw_variant_ct % kLoadPvarBlockSize;
+	if (!variant_idx_lowbits) {
+	  if (((uintptr_t)(tmp_alloc_end - tmp_alloc_base) <= kLoadPvarBlockSize * (sizeof(int32_t) + 2 * sizeof(intptr_t) + at_least_one_nzero_cm * sizeof(double)) + is_split_chr * sizeof(chr_idx_t) + (1 + info_pr_present) * (kLoadPvarBlockSize / CHAR_BIT) + (load_qual_col? ((kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(float)) : 0) + (load_filter_col? (2 * (kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(intptr_t)) : 0)) || (allele_storage_iter >= allele_storage_l [...]
+	    goto load_pvar_ret_NOMEM;
+	  }
+	  cur_bps = (uint32_t*)tmp_alloc_base;
+	  cur_allele_idxs = (uintptr_t*)(&(tmp_alloc_base[kLoadPvarBlockSize * sizeof(int32_t)]));
+	  cur_ids = (char**)(&(tmp_alloc_base[kLoadPvarBlockSize * (sizeof(int32_t) + sizeof(intptr_t))]));
+	  cur_include = (uintptr_t*)(&(tmp_alloc_base[kLoadPvarBlockSize * (sizeof(int32_t) + 2 * sizeof(intptr_t))]));
+	  fill_ulong_one(kLoadPvarBlockSize / kBitsPerWord, cur_include);
+	  tmp_alloc_base = &(tmp_alloc_base[kLoadPvarBlockSize * (sizeof(int32_t) + 2 * sizeof(intptr_t)) + (kLoadPvarBlockSize / CHAR_BIT)]);
+	  if (load_qual_col > 1) {
+	    cur_qual_present = (uintptr_t*)tmp_alloc_base;
+	    fill_ulong_zero(kLoadPvarBlockSize / kBitsPerWord, cur_qual_present);
+	    cur_quals = (float*)(&(tmp_alloc_base[kLoadPvarBlockSize / CHAR_BIT]));
+	    tmp_alloc_base = &(tmp_alloc_base[kLoadPvarBlockSize * sizeof(float) + (kLoadPvarBlockSize / CHAR_BIT)]);
+	  }
+	  if (load_filter_col > 1) {
+	    cur_filter_present = (uintptr_t*)tmp_alloc_base;
+	    cur_filter_npass = (uintptr_t*)(&(tmp_alloc_base[kLoadPvarBlockSize / CHAR_BIT]));
+	    cur_filter_storage = (char**)(&(tmp_alloc_base[2 * (kLoadPvarBlockSize / CHAR_BIT)]));
+	    fill_ulong_zero(kLoadPvarBlockSize / kBitsPerWord, cur_filter_present);
+	    fill_ulong_zero(kLoadPvarBlockSize / kBitsPerWord, cur_filter_npass);
+	    tmp_alloc_base = &(tmp_alloc_base[2 * (kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(intptr_t)]);
+	  }
+	  if (info_pr_present) {
+	    cur_nonref_flags = (uintptr_t*)tmp_alloc_base;
+	    fill_ulong_zero(kLoadPvarBlockSize / kBitsPerWord, cur_nonref_flags);
+	    tmp_alloc_base = &(tmp_alloc_base[kLoadPvarBlockSize / CHAR_BIT]);
+	  }
+	  if (at_least_one_nzero_cm) {
+	    cur_cms = (double*)tmp_alloc_base;
+	    fill_double_zero(kLoadPvarBlockSize, cur_cms);
+	    tmp_alloc_base = (unsigned char*)(&(cur_cms[kLoadPvarBlockSize]));
+	  }
+	  if (is_split_chr) {
+	    cur_chr_idxs = (chr_idx_t*)tmp_alloc_base;
+	    tmp_alloc_base = (unsigned char*)(&(cur_chr_idxs[kLoadPvarBlockSize]));
+	  }
+	}
+	char* loadbuf_iter = token_endnn(loadbuf_first_token);	
+	// #CHROM
+	if (!(*loadbuf_iter)) {
+	  goto load_pvar_ret_MISSING_TOKENS;
+	}
+	int32_t cur_chr_code;
+	reterr = get_or_add_chr_code_destructive(".pvar file", line_idx, allow_extra_chrs, loadbuf_first_token, loadbuf_iter, cip, &cur_chr_code);
+	if (reterr) {
+	  goto load_pvar_ret_1;
+	}
+	if (merge_par) {
+	  if (cur_chr_code == par2_code) {
+	    // don't permit PAR1 variants after PAR2
+	    parx_code = par2_code;
+	  }
+	  if (cur_chr_code == parx_code) {
+	    ++merge_par_ct;
+	    cur_chr_code = x_code;
+	  }
+	}
+	if (((uint32_t)cur_chr_code) != prev_chr_code) {
+	  prev_chr_code = cur_chr_code;
+	  if (!is_split_chr) {
+	    if (is_set(loaded_chr_mask, cur_chr_code)) {
+	      if (!split_chr_ok) {
+		sprintf(g_logbuf, "Error: %s has a split chromosome. Use --make-pgen by itself to remedy this.\n", pvarname);
+		goto load_pvar_ret_MALFORMED_INPUT_WW;
+	      }
+	      if ((uintptr_t)(tmp_alloc_end - tmp_alloc_base) < kLoadPvarBlockSize * sizeof(chr_idx_t)) {
+		goto load_pvar_ret_NOMEM;
+	      }
+	      cur_chr_idxs = (chr_idx_t*)tmp_alloc_base;
+	      tmp_alloc_base = (unsigned char*)(&(cur_chr_idxs[kLoadPvarBlockSize]));
+	      // may want to track the first problem variant index
+	      // cip->chr_fo_vidx_start[chrs_encountered_m1] = raw_variant_ct;
+	      backfill_chr_idxs(cip, chrs_encountered_m1, round_down_pow2(raw_variant_ct, kLoadPvarBlockSize), raw_variant_ct, cur_chr_idxs);
+	      chr_idxs_start_block = raw_variant_ct / kLoadPvarBlockSize;
+	      is_split_chr = 1;
+	      vpos_sortstatus |= kfUnsortedVarBp | kfUnsortedVarCm | kfUnsortedVarSplitChr;
+	    } else {
+	      // how much of this do we need in split-chrom case?
+	      cip->chr_file_order[++chrs_encountered_m1] = cur_chr_code;
+	      cip->chr_fo_vidx_start[chrs_encountered_m1] = raw_variant_ct;
+	      cip->chr_idx_to_foidx[(uint32_t)cur_chr_code] = chrs_encountered_m1;
+	      last_bp = 0;
+	      last_cm = -DBL_MAX;
+	    }
+	  }
+	  set_bit(cur_chr_code, loaded_chr_mask);
+	  if (chr_output_name_buf) {
+	    varid_template_base_len -= insert_slens[0];
+	    char* chr_name_end = chr_name_write(cip, (uint32_t)cur_chr_code, chr_output_name_buf);
+	    insert_slens[0] = (uintptr_t)(chr_name_end - chr_output_name_buf);
+	    varid_template_base_len += insert_slens[0];
+	  }
+	}
+	*loadbuf_iter = '\t';
+
+	// could make this store (and cur_allele_idxs[] allocation) conditional
+	// on a multiallelic variant being sighted, but unlike the CM column
+	// this should become common
+	cur_allele_idxs[variant_idx_lowbits] = (uintptr_t)(allele_storage_iter - allele_storage);
+
+        char* token_ptrs[8];
+	uint32_t token_slens[8];	
+	if (is_set(chr_mask, cur_chr_code) || info_pr_present) {
+	  for (uint32_t rpc_col_idx = 0; rpc_col_idx < relevant_postchr_col_ct; ++rpc_col_idx) {
+	    const uint32_t cur_col_type = col_types[rpc_col_idx];
+	    loadbuf_iter = next_token_mult(loadbuf_iter, col_skips[rpc_col_idx]);
+	    if (!loadbuf_iter) {
+	      goto load_pvar_ret_MISSING_TOKENS;
+	    }
+	    token_ptrs[cur_col_type] = loadbuf_iter;
+	    char* token_end = token_endnn(loadbuf_iter);
+	    token_slens[cur_col_type] = (uintptr_t)(token_end - loadbuf_iter);
+	    loadbuf_iter = token_end;
+	  }
+	  if (info_col_present) {
+	    const uint32_t info_slen = token_slens[6];
+	    if (info_slen > info_reload_slen) {
+	      info_reload_slen = info_slen;
+	    }
+	    if (info_pr_present) {
+	      // always load all nonref_flags entries so they can be compared
+	      // against .pgen for now.
+
+	      // (todo: general INFO filtering code)
+	      char* info_token = token_ptrs[6];
+	      if (((!memcmp(info_token, "PR", 2)) && ((info_slen == 2) || (info_token[2] == ';'))) || (!memcmp(&(info_token[((int32_t)info_slen) - 3]), ";PR", 3))) {
+		SET_BIT(variant_idx_lowbits, cur_nonref_flags);
+	      } else {
+		info_token[info_slen] = '\0';
+		char* first_info_end = strchr(info_token, ';');
+		if (first_info_end && strstr(first_info_end, ";PR;")) {
+		  SET_BIT(variant_idx_lowbits, cur_nonref_flags);
+		}
+	      }
+	      if (!is_set(chr_mask, cur_chr_code)) {
+		goto load_pvar_skip_variant;
+	      }
+	    }
+	  }
+	  
+	  // POS
+	  int32_t cur_bp;
+	  if (scan_int_abs_defcap(token_ptrs[0], &cur_bp)) {
+	    sprintf(g_logbuf, "Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, pvarname);
+	    goto load_pvar_ret_MALFORMED_INPUT_WW;
+	  }
+
+	  if (cur_bp < 0) {
+	    goto load_pvar_skip_variant;
+	  }
+	  
+	  // QUAL
+	  if (load_qual_col) {
+	    char* qual_token = token_ptrs[4];
+	    if ((qual_token[0] != '.') || (qual_token[1] > ' ')) {
+	      float cur_qual;
+	      if (scan_float(qual_token, &cur_qual)) {
+		sprintf(g_logbuf, "Error: Invalid QUAL value on line %" PRIuPTR " of %s.\n", line_idx, pvarname);
+		goto load_pvar_ret_MALFORMED_INPUT_WW;
+	      }
+	      if ((load_qual_col & 1) && (cur_qual < var_min_qual)) {
+		goto load_pvar_skip_variant;
+	      }
+	      if (load_qual_col > 1) {
+	        SET_BIT(variant_idx_lowbits, cur_qual_present);
+		// possible todo: optimize all-quals-same case
+		// possible todo: conditionally allocate, like cur_cms
+		cur_quals[variant_idx_lowbits] = cur_qual;
+	      }
+	    } else if (load_qual_col & 1) {
+	      goto load_pvar_skip_variant;
+	    }
+	  }
+
+	  // avoid repeating the ALT string split in --set-...-var-ids case
+	  loadbuf_iter = token_ptrs[3];
+	  uint32_t remaining_alt_char_ct = token_slens[3];
+	  // handle --snps-only here instead of later, since it reduces the
+	  // amount of data we need to load
+	  if (snps_only) {
+	    if ((token_slens[2] != 1) || (!(remaining_alt_char_ct % 2))) {
+	      goto load_pvar_skip_variant;
+	    }
+	    const uint32_t extra_alt_ct = remaining_alt_char_ct / 2;
+	    for (uint32_t uii = 0; uii < extra_alt_ct; ++uii) {
+	      // no need to check for empty allele code here, that'll be
+	      // caught later
+	      if (loadbuf_iter[2 * uii + 1] != ',') {
+		goto load_pvar_skip_variant;
+	      }
+	    }
+	    if (snps_only > 1) {
+	      // just-acgt
+	      if (!is_acgtm(token_ptrs[2][0])) {
+		goto load_pvar_skip_variant;
+	      }
+	      for (uint32_t uii = 0; uii <= extra_alt_ct; ++uii) {
+		if (!is_acgtm(loadbuf_iter[2 * uii])) {
+		  goto load_pvar_skip_variant;
+		}
+	      }
+	    }
+	  }
+
+	  // FILTER
+	  if (load_filter_col) {
+	    char* filter_token = token_ptrs[5];
+	    const uint32_t filter_slen = token_slens[5];
+	    if ((filter_slen > 1) || (filter_token[0] != '.')) {
+	      if ((filter_slen != 4) || memcmp(filter_token, "PASS", 4)) {
+		if (load_filter_col & 1) {
+		  if (!fexcept_ct) {
+		    goto load_pvar_skip_variant;
+		  }
+		  char* filter_token_iter = filter_token;
+		  uint32_t remaining_byte_ct = filter_slen;
+		  while (1) {
+		    char* cur_filter_name_end = (char*)memchr(filter_token_iter, ';', remaining_byte_ct);
+		    uint32_t cur_slen = remaining_byte_ct;
+		    if (cur_filter_name_end) {
+		      cur_slen = (uintptr_t)(cur_filter_name_end - filter_token_iter);
+		    }
+		    // possible todo: error out on "PASS", since that
+		    // shouldn't coexist with other filters
+		    // possible todo: maintain a dictionary of FILTER
+		    // strings, analogous to what BCF2 does on disk
+		    if (bsearch_str(filter_token_iter, sorted_fexcepts, cur_slen, max_fexcept_blen, fexcept_ct) == -1) {
+		      goto load_pvar_skip_variant;
+		    }
+		    const uint32_t cur_blen = cur_slen + 1;
+		    if (cur_blen >= remaining_byte_ct) {
+		      break;
+		    }
+		    filter_token_iter = &(filter_token_iter[cur_blen]);
+		    remaining_byte_ct -= cur_blen;
+		  }
+		}
+		if (load_filter_col > 1) {
+		  SET_BIT(variant_idx_lowbits, cur_filter_npass);
+		  at_least_one_npass_filter = 1;
+		  // possible todo: detect repeated filter values, store more
+		  // compactly
+		  if (filter_slen > max_filter_slen) {
+		    max_filter_slen = filter_slen;
+		  }
+		  tmp_alloc_end -= filter_slen + 1;
+		  if (tmp_alloc_end < tmp_alloc_base) {
+		    goto load_pvar_ret_NOMEM;
+		  }
+		  cur_filter_storage[variant_idx_lowbits] = (char*)tmp_alloc_end;
+		  memcpyx(tmp_alloc_end, filter_token, filter_slen, '\0');
+		}
+	      }
+	      if (load_filter_col > 1) {
+		SET_BIT(variant_idx_lowbits, cur_filter_present);
+	      }
+	    }
+	  }
+
+	  if (cur_chr_idxs) {
+	    cur_chr_idxs[variant_idx_lowbits] = (uint32_t)cur_chr_code;
+	  }
+	  if (cur_bp < last_bp) {
+	    vpos_sortstatus |= kfUnsortedVarBp;
+	  }
+	  cur_bps[variant_idx_lowbits] = cur_bp;
+	  last_bp = cur_bp;
+	  char* alt_allele_iter = (char*)memchr(loadbuf_iter, ',', remaining_alt_char_ct);
+	  uint32_t id_slen;
+	  if ((!varid_template) || (missing_varid_match_slen && ((token_slens[1] != missing_varid_match_slen) || memcmp(token_ptrs[1], missing_varid_match, missing_varid_match_slen)))) {
+	    id_slen = token_slens[1];
+	    tmp_alloc_end -= id_slen + 1;
+	    if (tmp_alloc_end < tmp_alloc_base) {
+	      goto load_pvar_ret_NOMEM;
+	    }
+	    memcpyx(tmp_alloc_end, token_ptrs[1], id_slen, '\0');
+	  } else {
+	    insert_slens[1] = int_slen(cur_bp);
+	    uint32_t ref_slen = 0;
+	    uint32_t cur_overflow = 0;
+	    char* tmp_allele_ptrs[2];
+	    if (varid_alleles_needed & 1) {
+	      ref_slen = token_slens[2];
+	      if (ref_slen > new_variant_id_max_allele_slen) {
+		ref_slen = new_variant_id_max_allele_slen;
+		cur_overflow = 1;
+	      }
+	      insert_slens[2] = ref_slen;
+	      tmp_allele_ptrs[0] = token_ptrs[2];
+	    }
+	    if (varid_alleles_needed > 1) {
+	      uint32_t alt1_slen;
+	      if (!alt_allele_iter) {
+		alt1_slen = remaining_alt_char_ct;
+	      } else {
+		alt1_slen = (uintptr_t)(alt_allele_iter - loadbuf_iter);
+	      }
+	      if (alt1_slen > new_variant_id_max_allele_slen) {
+		alt1_slen = new_variant_id_max_allele_slen;
+		++cur_overflow;
+	      }
+	      if (varid_alleles_needed <= 3) {
+	      load_pvar_keep_allele_ascii_order:
+		insert_slens[3] = alt1_slen;
+		tmp_allele_ptrs[1] = loadbuf_iter;
+	      } else {
+		uint32_t smaller_slen = alt1_slen;
+		const int32_t ref_slen_geq = (ref_slen >= alt1_slen);
+		if (!ref_slen_geq) {
+		  smaller_slen = ref_slen;
+		}
+		int32_t memcmp_result = memcmp(token_ptrs[2], loadbuf_iter, smaller_slen);
+		if (!memcmp_result) {
+		  memcmp_result = ref_slen_geq;
+		}
+		if (memcmp_result <= 0) {
+		  goto load_pvar_keep_allele_ascii_order;
+		}
+		insert_slens[3] = ref_slen;
+		tmp_allele_ptrs[1] = tmp_allele_ptrs[0];
+		insert_slens[2] = alt1_slen;
+		tmp_allele_ptrs[0] = loadbuf_iter;
+	      }
+	    }
+	    id_slen = varid_template_base_len + insert_slens[1] + insert_slens[2] + insert_slens[3];
+	    if (new_variant_id_overflow_missing && cur_overflow) {
+	      tmp_alloc_end -= missing_varid_blen;
+	      if (tmp_alloc_end < tmp_alloc_base) {
+		goto load_pvar_ret_NOMEM;
+	      }
+	      memcpy(tmp_alloc_end, missing_varid_match, missing_varid_blen);
+	      id_slen = 0;
+	      cur_overflow = 1;
+	    } else {
+	      tmp_alloc_end -= id_slen + 1;
+	      if (tmp_alloc_end < tmp_alloc_base) {
+		goto load_pvar_ret_NOMEM;
+	      }
+	      char* id_iter = (char*)tmp_alloc_end;
+	      char* insert_ptrs[4];
+	      for (uint32_t insert_idx = 0; insert_idx < varid_template_insert_ct; ++insert_idx) {
+		id_iter = memcpya(id_iter, varid_template_segs[insert_idx], varid_template_seg_lens[insert_idx]);
+		const uint32_t cur_insert_type = varid_template_insert_types[insert_idx];
+		insert_ptrs[cur_insert_type] = id_iter;
+		id_iter = &(id_iter[insert_slens[cur_insert_type]]);
+	      }
+	      memcpyx(id_iter, varid_template_segs[varid_template_insert_ct], varid_template_seg_lens[varid_template_insert_ct], '\0');
+
+	      memcpy(insert_ptrs[0], chr_output_name_buf, insert_slens[0]);
+	      uint32toa(cur_bp, insert_ptrs[1]);
+	      for (uint32_t insert_type_idx = 2; insert_type_idx < varid_template_insert_ct; ++insert_type_idx) {
+		memcpy(insert_ptrs[insert_type_idx], tmp_allele_ptrs[insert_type_idx - 2], insert_slens[insert_type_idx]);
+	      }
+	    }
+            new_variant_id_allele_len_overflow += cur_overflow;
+	  }
+	  if (id_slen > max_variant_id_slen) {
+	    max_variant_id_slen = id_slen;
+	  }
+	  cur_ids[variant_idx_lowbits] = (char*)tmp_alloc_end;
+
+	  // REF
+	  char* ref_allele = token_ptrs[2];
+	  const uint32_t ref_slen = token_slens[2];
+	  if (ref_slen == 1) {
+	    // const_cast
+	    *allele_storage_iter = (char*)((uintptr_t)(&(g_one_char_strs[2 * ref_allele[0]])));
+	  } else {
+	    tmp_alloc_end -= ref_slen + 1;
+	    if (tmp_alloc_end < tmp_alloc_base) {
+	      goto load_pvar_ret_NOMEM;
+	    }
+	    memcpyx(tmp_alloc_end, ref_allele, ref_slen, '\0');
+	    *allele_storage_iter = (char*)tmp_alloc_end;
+	    if (ref_slen > max_allele_slen) {
+	      max_allele_slen = ref_slen;
+	    }
+	  }
+	  ++allele_storage_iter;
+
+	  // ALT
+	  if (alt_allele_iter) {
+	    do {
+	      if (allele_storage_iter >= allele_storage_limit) {
+		goto load_pvar_ret_NOMEM;
+	      }
+	      const uint32_t cur_allele_slen = (uintptr_t)(alt_allele_iter - loadbuf_iter);
+	      // possible todo: convert '0' to '.' here?
+	      if (cur_allele_slen == 1) {
+		// const_cast
+		*allele_storage_iter = (char*)((uintptr_t)(&(g_one_char_strs[2 * loadbuf_iter[0]])));
+	      } else {
+		if (!cur_allele_slen) {
+		  goto load_pvar_ret_EMPTY_ALLELE_CODE;
+		}
+		tmp_alloc_end -= cur_allele_slen + 1;
+		if (tmp_alloc_end < tmp_alloc_base) {
+		  goto load_pvar_ret_NOMEM;
+		}
+		memcpyx(tmp_alloc_end, loadbuf_iter, cur_allele_slen, '\0');
+		*allele_storage_iter = (char*)tmp_alloc_end;
+		if (cur_allele_slen > max_allele_slen) {
+		  max_allele_slen = cur_allele_slen;
+		}
+	      }
+	      ++allele_storage_iter;
+	      remaining_alt_char_ct -= cur_allele_slen + 1;
+	      loadbuf_iter = alt_allele_iter;
+	      alt_allele_iter = (char*)memchr(loadbuf_iter, ',', remaining_alt_char_ct);
+	    } while (alt_allele_iter);
+	    if (!remaining_alt_char_ct) {
+	      goto load_pvar_ret_EMPTY_ALLELE_CODE;
+	    }
+	  }
+	  if (remaining_alt_char_ct == 1) {
+	    // const_cast
+	    *allele_storage_iter = (char*)((uintptr_t)(&(g_one_char_strs[2 * loadbuf_iter[0]])));
+	  } else {
+	    tmp_alloc_end -= remaining_alt_char_ct + 1;
+	    if (tmp_alloc_end < tmp_alloc_base) {
+	      goto load_pvar_ret_NOMEM;
+	    }
+	    memcpyx(tmp_alloc_end, loadbuf_iter, remaining_alt_char_ct, '\0');
+	    *allele_storage_iter = (char*)tmp_alloc_end;
+	    if (remaining_alt_char_ct > max_allele_slen) {
+	      max_allele_slen = remaining_alt_char_ct;
+	    }
+	  }
+	  ++allele_storage_iter;
+
+	  // CM
+	  if (cm_col_present) {
+	    char* cm_token = token_ptrs[7];
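+	    // fast path: a bare '0' (the usual case) skips the double parse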
+	    if ((cm_token[0] != '0') || (cm_token[1] > ' ')) {
+	      double cur_cm;
+	      if (!scanadv_double(cm_token, &cur_cm)) {
+		sprintf(g_logbuf, "Error: Invalid centimorgan position on line %" PRIuPTR " of %s.\n", line_idx, pvarname);
+		goto load_pvar_ret_MALFORMED_INPUT_WW;
+	      }
+	      if (cur_cm < last_cm) {
+		vpos_sortstatus |= kfUnsortedVarCm;
+	      } else {
+		last_cm = cur_cm;
+	      }
+	      if (cur_cm != 0.0) {
+		if (!at_least_one_nzero_cm) {
+		  if ((uintptr_t)(tmp_alloc_end - tmp_alloc_base) < kLoadPvarBlockSize * sizeof(double)) {
+		    goto load_pvar_ret_NOMEM;
+		  }
+		  if (cur_chr_idxs) {
+		    // reposition cur_chr_idxs[] after cur_cms[]
+		    cur_cms = (double*)cur_chr_idxs;
+		    cur_chr_idxs = (chr_idx_t*)(&(cur_cms[kLoadPvarBlockSize]));
+		    memcpy(cur_chr_idxs, cur_cms, kLoadPvarBlockSize * sizeof(chr_idx_t));
+		    tmp_alloc_base = (unsigned char*)(&(cur_chr_idxs[kLoadPvarBlockSize]));
+		  } else {
+		    cur_cms = (double*)tmp_alloc_base;
+		    tmp_alloc_base = (unsigned char*)(&(cur_cms[kLoadPvarBlockSize]));
+		  }
+		  fill_double_zero(kLoadPvarBlockSize, cur_cms);
+		  cms_start_block = raw_variant_ct / kLoadPvarBlockSize;
+		  at_least_one_nzero_cm = 1;
+		}
+	        cur_cms[variant_idx_lowbits] = cur_cm;
+	      }
+	    }
+	  }
+	} else {
+	  token_ptrs[3] = next_token_mult(loadbuf_iter, alt_col_idx);
+	  if (!token_ptrs[3]) {
+	    goto load_pvar_ret_MISSING_TOKENS;
+	  }
+	  token_slens[3] = strlen_se(token_ptrs[3]);
+	load_pvar_skip_variant:
+	  ++exclude_ct;
+	  clear_bit(variant_idx_lowbits, cur_include);
+	  cur_bps[variant_idx_lowbits] = last_bp;
+	  // alt allele count must still be tracked for excluded variants
+	  // const_cast
+	  *allele_storage_iter++ = (char*)((uintptr_t)missing_allele_str);
+	  *allele_storage_iter++ = (char*)((uintptr_t)missing_allele_str);
+	  loadbuf_iter = token_ptrs[3];
+	  char* token_end = &(loadbuf_iter[token_slens[3]]);
+	  while (1) {
+	    loadbuf_iter = (char*)memchr(loadbuf_iter, ',', (uintptr_t)(token_end - loadbuf_iter));
+	    if (!loadbuf_iter) {
+	      break;
+	    }
+	    if (allele_storage_iter >= allele_storage_limit) {
+	      goto load_pvar_ret_NOMEM;
+	    }
+	    ++loadbuf_iter;
+	    // const_cast
+	    *allele_storage_iter++ = (char*)((uintptr_t)missing_allele_str);
+	  }
+	}
+	++raw_variant_ct;
+      }
+      ++line_idx;
+      if (!gzgets(gz_infile, loadbuf, loadbuf_size)) {
+	if (!gzeof(gz_infile)) {
+	  goto load_pvar_ret_READ_FAIL;
+	}
+	break;
+      }
+      if (!loadbuf[loadbuf_size - 1]) {
+	if (loadbuf_size == kMaxLongLine) {
+	  goto load_pvar_ret_LONG_LINE;
+	}
+	goto load_pvar_ret_NOMEM;
+      }
+      loadbuf_first_token = skip_initial_spaces(loadbuf);
+      if (loadbuf_first_token[0] == '#') {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s starts with a '#'. (This is only permitted before the first nonheader line, and if a #CHROM header line is present it must denote the end of the header block.)\n", line_idx, pvarname);
+	goto load_pvar_ret_MALFORMED_INPUT_WW;
+      }
+    }
+    if (max_variant_id_slen > kMaxIdSlen) {
+      logerrprint("Error: Variant names are limited to " MAX_ID_SLEN_STR " characters.\n");
+      goto load_pvar_ret_MALFORMED_INPUT;
+    }
+    if (new_variant_id_allele_len_overflow) {
+      if (new_variant_id_overflow_missing) {
+	LOGERRPRINTFWW("Warning: %" PRIuPTR " variant ID%s %s due to allele code length.\n", new_variant_id_allele_len_overflow, (new_variant_id_allele_len_overflow == 1)? "" : "s", missing_varid_match_slen? "unchanged by --set-missing-var-ids" : "erased by --set-all-var-ids");
+	if (max_variant_id_slen < missing_varid_blen - 1) {
+	  max_variant_id_slen = missing_varid_blen - 1;
+	}
+      } else if (misc_flags & kfMiscNewVarIdOverflowTruncate) {
+	LOGERRPRINTF("Warning: %" PRIuPTR " allele code%s truncated by --set-%s-var-ids.\n", new_variant_id_allele_len_overflow, (new_variant_id_allele_len_overflow == 1)? "" : "s", missing_varid_match_slen? "missing" : "all");
+      } else {
+	LOGERRPRINTFWW("Error: %" PRIuPTR " allele code%s too long for --set-%s-var-ids. You should either switch to a different allele/variant naming scheme for long indels, or use --new-id-max-allele-len to raise the length limit.\n", new_variant_id_allele_len_overflow, (new_variant_id_allele_len_overflow == 1)? "" : "s", missing_varid_match_slen? "missing" : "all");
+	goto load_pvar_ret_INCONSISTENT_INPUT;
+      }
+    }
+    if (gzclose_null(&gz_infile)) {
+      goto load_pvar_ret_READ_FAIL;
+    }
+    *max_variant_id_slen_ptr = max_variant_id_slen;
+    *max_allele_slen_ptr = max_allele_slen;
+    *max_filter_slen_ptr = max_filter_slen;
+    *raw_variant_ct_ptr = raw_variant_ct;
+    uintptr_t allele_idx_end = (uintptr_t)(allele_storage_iter - allele_storage);
+    bigstack_finalize_ul((uintptr_t*)allele_storage, allele_idx_end);
+    uintptr_t* variant_allele_idxs = nullptr;
+    const uint32_t full_block_ct = raw_variant_ct / kLoadPvarBlockSize;
+    const uintptr_t raw_variant_ct_lowbits = raw_variant_ct % kLoadPvarBlockSize;
+    // todo: determine whether we want variant_include to be guaranteed to be
+    // terminated by a zero bit
+    const uint32_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    if (bigstack_alloc_ul(raw_variant_ctl, variant_include_ptr) ||
+	bigstack_alloc_ui(raw_variant_ct, variant_bps_ptr) ||
+	bigstack_alloc_cp(raw_variant_ct, variant_ids_ptr)) {
+      goto load_pvar_ret_NOMEM;
+    }
+    uintptr_t* qual_present = nullptr;
+    float* quals = nullptr;
+    if (load_qual_col > 1) {
+      if (bigstack_alloc_ul(raw_variant_ctl, qual_present_ptr) ||
+	  bigstack_alloc_f(raw_variant_ct, quals_ptr)) {
+	goto load_pvar_ret_NOMEM;
+      }
+      qual_present = *qual_present_ptr;
+      quals = *quals_ptr;
+    }
+    uintptr_t* filter_present = nullptr;
+    uintptr_t* filter_npass = nullptr;
+    char** filter_storage = nullptr;
+    if (load_filter_col > 1) {
+      if (bigstack_alloc_ul(raw_variant_ctl, filter_present_ptr) ||
+	  bigstack_alloc_ul(raw_variant_ctl, filter_npass_ptr)) {
+	goto load_pvar_ret_NOMEM;
+      }
+      filter_present = *filter_present_ptr;
+      filter_npass = *filter_npass_ptr;
+      if (at_least_one_npass_filter) {
+	// possible todo: store this in a sparse manner
+	if (bigstack_alloc_cp(raw_variant_ct, filter_storage_ptr)) {
+	  goto load_pvar_ret_NOMEM;
+	}
+	filter_storage = *filter_storage_ptr;
+      }
+    }
+    uintptr_t* nonref_flags = nullptr;
+    if (info_pr_present) {
+      if (bigstack_alloc_ul(raw_variant_ctl, nonref_flags_ptr)) {
+	goto load_pvar_ret_NOMEM;
+      }
+      nonref_flags = *nonref_flags_ptr;
+    }
+    // load_qual_col > 1:
+    //   kLoadPvarBlockSize / CHAR_BIT for qual_present
+    //   kLoadPvarBlockSize * sizeof(float) for quals
+    // load_filter_col > 1:
+    //   2 * (kLoadPvarBlockSize / CHAR_BIT) for filter_present, filter_npass
+    //   kLoadPvarBlockSize * sizeof(intptr_t) for filter_storage
+    // at_least_one_nzero_cm:
+    //   kLoadPvarBlockSize * sizeof(double)
+    // is_split_chr:
+    //   kLoadPvarBlockSize * sizeof(chr_idx_t)
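+    //
+    // Illustrative arithmetic (see read_iter_stride_base below): with
+    // load_qual_col > 1 and no other optional columns, each full block
+    // occupies kLoadPvarBlockSize * (sizeof(int32_t) + 2 * sizeof(intptr_t)
+    // + sizeof(float)) + 2 * (kLoadPvarBlockSize / CHAR_BIT) bytes of
+    // end-of-bigstack workspace.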
+    unsigned char* read_iter = g_bigstack_end;
+    uint32_t* variant_bps = *variant_bps_ptr;
+    char** variant_ids = *variant_ids_ptr;
+    uintptr_t* variant_include = *variant_include_ptr;
+    for (uint32_t block_idx = 0; block_idx < full_block_ct; ++block_idx) {
+      memcpy(&(variant_bps[block_idx * kLoadPvarBlockSize]), read_iter, kLoadPvarBlockSize * sizeof(int32_t));
+      // skip over variant_allele_idxs
+      read_iter = &(read_iter[kLoadPvarBlockSize * (sizeof(int32_t) + sizeof(intptr_t))]);
+      memcpy(&(variant_ids[block_idx * kLoadPvarBlockSize]), read_iter, kLoadPvarBlockSize * sizeof(intptr_t));
+      read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(intptr_t)]);
+      memcpy(&(variant_include[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
+      read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
+      if (qual_present) {
+	memcpy(&(qual_present[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
+	read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
+	memcpy(&(quals[block_idx * kLoadPvarBlockSize]), read_iter, kLoadPvarBlockSize * sizeof(float));
+	read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(float)]);
+      }
+      if (filter_present) {
+	memcpy(&(filter_present[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
+	read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
+	memcpy(&(filter_npass[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
+	read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
+	if (filter_storage) {
+	  memcpy(&(filter_storage[block_idx * kLoadPvarBlockSize]), read_iter, kLoadPvarBlockSize * sizeof(intptr_t));
+	}
+	read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(intptr_t)]);
+      }
+      if (info_pr_present) {
+	memcpy(&(nonref_flags[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
+	read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
+      }
+      // skip over cms
+      if (block_idx >= cms_start_block) {
+        read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(double)]);
+      }
+      // skip over chr_idxs
+      if (block_idx >= chr_idxs_start_block) {
+	read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(chr_idx_t)]);
+      }
+    }
+    memcpy(&(variant_bps[full_block_ct * kLoadPvarBlockSize]), read_iter, raw_variant_ct_lowbits * sizeof(int32_t));
+    read_iter = &(read_iter[kLoadPvarBlockSize * (sizeof(int32_t) + sizeof(intptr_t))]);
+    memcpy(&(variant_ids[full_block_ct * kLoadPvarBlockSize]), read_iter, raw_variant_ct_lowbits * sizeof(intptr_t));
+    read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(intptr_t)]);
+    const uint32_t last_bitblock_size = DIV_UP(raw_variant_ct_lowbits, CHAR_BIT);
+    memcpy(&(variant_include[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
+    zero_trailing_bits(raw_variant_ct, variant_include);
+    read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
+    if (qual_present) {
+      memcpy(&(qual_present[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
+      zero_trailing_bits(raw_variant_ct, qual_present);
+      read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
+      memcpy(&(quals[full_block_ct * kLoadPvarBlockSize]), read_iter, raw_variant_ct_lowbits * sizeof(float));
+      read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(float)]);
+    }
+    if (filter_present) {
+      memcpy(&(filter_present[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
+      zero_trailing_bits(raw_variant_ct, filter_present);
+      read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
+      memcpy(&(filter_npass[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
+      zero_trailing_bits(raw_variant_ct, filter_npass);
+      read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
+      if (filter_storage) {
+	memcpy(&(filter_storage[full_block_ct * kLoadPvarBlockSize]), read_iter, raw_variant_ct_lowbits * sizeof(intptr_t));
+	read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(intptr_t)]);
+      }
+    }
+    if (info_pr_present) {
+      memcpy(&(nonref_flags[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
+      zero_trailing_bits(raw_variant_ct, nonref_flags);
+      // read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
+    }
+    const uintptr_t read_iter_stride_base =
+      kLoadPvarBlockSize * (sizeof(int32_t) + 2 * sizeof(intptr_t)) +
+      (kLoadPvarBlockSize / CHAR_BIT) +
+      (load_qual_col > 1) * ((kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(float)) +
+      (load_filter_col > 1) * (2 * (kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(intptr_t)) +
+      info_pr_present * (kLoadPvarBlockSize / CHAR_BIT);
+    if (allele_idx_end > 2 * ((uintptr_t)raw_variant_ct)) {
+      if (bigstack_alloc_ul(raw_variant_ct + 1, variant_allele_idxs_ptr)) {
+	goto load_pvar_ret_NOMEM;
+      }
+      variant_allele_idxs = *variant_allele_idxs_ptr;
+      uintptr_t* allele_idx_read_iter = (uintptr_t*)(&(g_bigstack_end[kLoadPvarBlockSize * sizeof(int32_t)]));
+      for (uint32_t block_idx = 0; block_idx < full_block_ct; ++block_idx) {
+	memcpy(&(variant_allele_idxs[block_idx * kLoadPvarBlockSize]), allele_idx_read_iter, kLoadPvarBlockSize * sizeof(intptr_t));
+	allele_idx_read_iter = (uintptr_t*)(((uintptr_t)allele_idx_read_iter) + read_iter_stride_base + (block_idx >= cms_start_block) * kLoadPvarBlockSize * sizeof(double) + (block_idx >= chr_idxs_start_block) * kLoadPvarBlockSize * sizeof(chr_idx_t));
+      }
+      memcpy(&(variant_allele_idxs[full_block_ct * kLoadPvarBlockSize]), allele_idx_read_iter, raw_variant_ct_lowbits * sizeof(intptr_t));
+      variant_allele_idxs[raw_variant_ct] = allele_idx_end;
+    }
+    if (at_least_one_nzero_cm) {
+      if (bigstack_alloc_d(raw_variant_ct, variant_cms_ptr)) {
+	goto load_pvar_ret_NOMEM;
+      }
+      double* variant_cms = *variant_cms_ptr;
+      fill_double_zero(cms_start_block * kLoadPvarBlockSize, variant_cms);
+      double* cms_read_iter = (double*)(&(g_bigstack_end[read_iter_stride_base * (cms_start_block + 1)]));
+      if (cms_start_block > chr_idxs_start_block) {
+	cms_read_iter = (double*)(((uintptr_t)cms_read_iter) + kLoadPvarBlockSize * sizeof(chr_idx_t) * (cms_start_block - chr_idxs_start_block));
+      }
+      for (uint32_t block_idx = cms_start_block; block_idx < full_block_ct; ++block_idx) {
+	memcpy(&(variant_cms[block_idx * kLoadPvarBlockSize]), cms_read_iter, kLoadPvarBlockSize * sizeof(double));
+	cms_read_iter = (double*)(((uintptr_t)cms_read_iter) + read_iter_stride_base + kLoadPvarBlockSize * sizeof(double) + (block_idx >= chr_idxs_start_block) * kLoadPvarBlockSize * sizeof(chr_idx_t));
+      }
+      memcpy(&(variant_cms[full_block_ct * kLoadPvarBlockSize]), cms_read_iter, raw_variant_ct_lowbits * sizeof(double));
+    } else {
+      *variant_cms_ptr = nullptr;
+    }
+    if (!is_split_chr) {
+      cip->chr_fo_vidx_start[chrs_encountered_m1 + 1] = raw_variant_ct;
+      if (splitpar_bound2) {
+	if (splitpar_and_exclude_x) {
+	  clear_bit(x_code, chr_mask);
+	}
+	reterr = splitpar(variant_bps, *vpos_sortstatus_ptr, splitpar_bound1, splitpar_bound2, variant_include, loaded_chr_mask, cip, &chrs_encountered_m1, &exclude_ct);
+	if (reterr) {
+	  goto load_pvar_ret_1;
+	}
+      } else if (merge_par) {
+	if (merge_par_ct) {
+	  LOGPRINTF("--merge-par: %u chromosome code%s changed.\n", merge_par_ct, (merge_par_ct == 1)? "" : "s");
+	} else {
+	  logerrprint("Warning: --merge-par had no effect (no PAR1/PAR2 chromosome codes present).\n");
+	}
+      }
+      cip->chr_ct = chrs_encountered_m1 + 1;
+    } else {
+      chr_idx_t* chr_idxs = (chr_idx_t*)bigstack_alloc(raw_variant_ct * sizeof(chr_idx_t));
+      if (!chr_idxs) {
+	goto load_pvar_ret_NOMEM;
+      }
+      *chr_idxs_ptr = chr_idxs;
+      if (chr_idxs_start_block) {
+	const uint32_t end_vidx = chr_idxs_start_block * kLoadPvarBlockSize;
+	uint32_t chr_fo_idx = chrs_encountered_m1;
+	while (cip->chr_fo_vidx_start[chr_fo_idx] >= end_vidx) {
+	  --chr_fo_idx;
+	}
+	backfill_chr_idxs(cip, chr_fo_idx, 0, end_vidx, chr_idxs);
+      }
+      chr_idx_t* chr_idxs_read_iter = (chr_idx_t*)(&(g_bigstack_end[read_iter_stride_base * (chr_idxs_start_block + 1)]));
+      if (chr_idxs_start_block >= cms_start_block) {
+	chr_idxs_read_iter = (chr_idx_t*)(((uintptr_t)chr_idxs_read_iter) + kLoadPvarBlockSize * sizeof(double) * (chr_idxs_start_block + 1 - cms_start_block));
+      }
+      for (uint32_t block_idx = chr_idxs_start_block; block_idx < full_block_ct;) {
+	memcpy(&(chr_idxs[block_idx * kLoadPvarBlockSize]), chr_idxs_read_iter, kLoadPvarBlockSize * sizeof(chr_idx_t));
+	++block_idx;
+	chr_idxs_read_iter = (chr_idx_t*)(((uintptr_t)chr_idxs_read_iter) + read_iter_stride_base + kLoadPvarBlockSize * sizeof(chr_idx_t) + (block_idx >= cms_start_block) * kLoadPvarBlockSize * sizeof(double));
+      }
+      memcpy(&(chr_idxs[full_block_ct * kLoadPvarBlockSize]), chr_idxs_read_iter, raw_variant_ct_lowbits * sizeof(chr_idx_t));
+    }
+    const uint32_t last_chr_code = cip->max_code + cip->name_ct;
+    const uint32_t chr_word_ct = BITCT_TO_WORDCT(last_chr_code + 1);
+    bitvec_and(loaded_chr_mask, chr_word_ct, chr_mask);
+    bigstack_end_set(tmp_alloc_end);
+    *variant_ct_ptr = raw_variant_ct - exclude_ct;
+    *vpos_sortstatus_ptr = vpos_sortstatus;
+    *allele_storage_ptr = allele_storage;
+    // if only INFO:PR present, no need to reload
+    *info_reload_slen_ptr = info_nonpr_present? info_reload_slen : 0;
+  }
+
+  while (0) {
+  load_pvar_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  load_pvar_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  load_pvar_ret_EMPTY_ALLELE_CODE:
+    LOGERRPRINTFWW("Error: Empty allele code on line %" PRIuPTR " of %s.\n", line_idx, pvarname);
+    reterr = kPglRetMalformedInput;
+    break;
+  load_pvar_ret_LONG_LINE:
+    sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, pvarname);
+  load_pvar_ret_MALFORMED_INPUT_WW:
+    wordwrapb(0);
+    logerrprintb();
+  load_pvar_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  load_pvar_ret_INCONSISTENT_INPUT:
+    reterr = kPglRetInconsistentInput;
+    break;
+  load_pvar_ret_MISSING_TOKENS:
+    LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, pvarname);
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+ load_pvar_ret_1:
+  if (reterr) {
+    bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  }
+  gzclose_cond(gz_infile);
+  return reterr;
+}
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/plink2_pvar.h b/plink2_pvar.h
new file mode 100644
index 0000000..c636f89
--- /dev/null
+++ b/plink2_pvar.h
@@ -0,0 +1,76 @@
+#ifndef __PLINK2_PVAR_H__
+#define __PLINK2_PVAR_H__
+
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_common.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+// Key .pvar properties: (i) .bim files are valid .pvar files; (ii) .vcf files,
+// with or without post-INFO columns removed, are valid .pvar files; (iii)
+// these files remain valid when {b}gzipped.
+//
+// File starts with an arbitrary (possibly zero) number of header lines marked
+// by a leading '#'.  All lines which don't start with '#CHROM' are currently
+// ignored.  The #CHROM line specifies the columns in the .pvar file; the
+// following column headers are recognized:
+//   POS (bp coordinate)
+//   ID (variant ID)
+//   REF (reference allele)
+//   ALT (alternate alleles, comma-separated)
+//   QUAL (phred-scaled quality score for whether the locus is variable at all)
+//   FILTER (PASS, ., or semicolon-separated list of failing filter codes)
+//   INFO (semicolon-separated list of flags and key-value pairs, with types
+//     declared in header)
+//   FORMAT (terminates header line parsing, so any VCF sample data is ignored)
+//   CM (centimorgan position)
+// If, for some bizarre reason, multiple #CHROM lines are present in the
+// header, the loader errors out.  If no #CHROM line is present, the column
+// order is assumed to be "#CHROM ID CM POS ALT REF" for .bim compatibility,
+// or "#CHROM ID POS ALT REF" when the first nonheader line has exactly 5
+// columns.
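+//
+// Illustrative (made-up) minimal .pvar body:
+//   #CHROM  POS     ID    REF  ALT
+//   1       10583   var1  G    A
+//   1       891021  var2  G    A,C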
+
+
+// variant_allele_idxs[] is a length-(variant_ct + 1) array of reference allele
+// indices in allele_storage[].  The alt1 allele index is the reference allele
+// index plus 1, etc.  The number of alt alleles for (0-based) variant n is
+// (variant_allele_idxs[n+1] - variant_allele_idxs[n] - 1).
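+// For example, if variant 0 is biallelic and variant 1 has three alt alleles,
+// variant_allele_idxs[] starts {0, 2, 6, ...}: variant 1's alleles occupy
+// allele_storage[2..5], and its alt allele count is 6 - 2 - 1 = 3.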
+
+// Memory for all the return arrays is allocated off the bottom of g_bigstack.
+// (I.e. it's assumed that you'll want to keep these arrays until your program
+// is about to terminate.)
+
+
+pglerr_t read_chrset_header_line(char* chrset_iter, const char* file_descrip, misc_flags_t misc_flags, uintptr_t line_idx, chr_info_t* cip);
+
+// assumes info_token[-1] is safe to read
+// may set info_token[info_slen] to \0
+char* pr_in_info_token(uint32_t info_slen, char* info_token);
+
+// cip, max_variant_id_slen, and info_reload are in/out parameters.
+// Chromosome filtering is performed if cip requests it.
+pglerr_t load_pvar(const char* pvarname, char* var_filter_exceptions_flattened, const char* varid_template, const char* missing_varid_match, misc_flags_t misc_flags, pvar_psam_t pvar_psam_modifier, exportf_flags_t exportf_modifier, float var_min_qual, uint32_t splitpar_bound1, uint32_t splitpar_bound2, uint32_t new_variant_id_max_allele_slen, uint32_t snps_only, uint32_t split_chr_ok, chr_info_t* cip, uint32_t* max_variant_id_slen_ptr, uint32_t* info_reload_slen_ptr, unsorted_var_t* vpos [...]
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+
+#endif // __PLINK2_PVAR_H__
diff --git a/plink2_random.cpp b/plink2_random.cpp
new file mode 100644
index 0000000..79182b2
--- /dev/null
+++ b/plink2_random.cpp
@@ -0,0 +1,118 @@
+// This library is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation, either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_random.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+sfmt_t g_sfmt;
+
+double rand_normal(sfmt_t* sfmtp, double* secondval_ptr) {
+  // Box-Muller transform.  Try switching to e.g. the ziggurat algorithm if
+  // this is ever a serious bottleneck.
+  const double dxx = sqrt(-2 * log(rand_unif(sfmtp)));
+  const double dyy = (2 * kPi) * rand_unif(sfmtp);
+  *secondval_ptr = dxx * cos(dyy);
+  return dxx * sin(dyy);
+}
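+
+// Minimal usage sketch (assumes the generator was already seeded, e.g. via
+// sfmt_init_gen_rand()):
+//   double second;
+//   double first = rand_normal(&g_sfmt, &second);
+//   // first and second are independent N(0,1) draws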
+
+sfmt_t** g_sfmtp_arr;
+
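+// Allocates one generator per thread off the bottom of bigstack; thread 0
+// reuses g_sfmt itself when use_main_sfmt_as_element_zero is set, and each
+// remaining generator is seeded with four fresh draws from g_sfmt.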
+boolerr_t bigstack_init_sfmtp(uint32_t thread_ct, uint32_t use_main_sfmt_as_element_zero) {
+  g_sfmtp_arr = (sfmt_t**)bigstack_alloc(thread_ct * sizeof(intptr_t));
+  if (!g_sfmtp_arr) {
+    return 1;
+  }
+  if (use_main_sfmt_as_element_zero) {
+    g_sfmtp_arr[0] = &g_sfmt;
+  }
+  if (thread_ct > use_main_sfmt_as_element_zero) {
+    uint32_t uibuf[4];
+    for (uint32_t tidx = use_main_sfmt_as_element_zero; tidx < thread_ct; ++tidx) {
+      g_sfmtp_arr[tidx] = (sfmt_t*)bigstack_alloc(sizeof(sfmt_t));
+      if (!g_sfmtp_arr[tidx]) {
+	return 1;
+      }
+      for (uint32_t uii = 0; uii < 4; ++uii) {
+	uibuf[uii] = sfmt_genrand_uint32(&g_sfmt);
+      }
+      sfmt_init_by_array(g_sfmtp_arr[tidx], uibuf, 4);
+    }
+  }
+  return 0;
+}
+
+// multithread globals
+static double* g_darray = nullptr;
+static uint32_t g_calc_thread_ct = 0;
+static uintptr_t g_entry_pair_ct = 0;
+
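+// Thread tidx fills the contiguous slice of entry pairs
+// [tidx * n / T, (tidx + 1) * n / T); since both bounds use the same floor
+// division, the slices exactly partition the full range across T threads.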
+THREAD_FUNC_DECL fill_gaussian_darray_thread(void* arg) {
+  const uintptr_t tidx = (uintptr_t)arg;
+  const uintptr_t entry_pair_ct = g_entry_pair_ct;
+  const uint32_t calc_thread_ct = g_calc_thread_ct;
+  sfmt_t* sfmtp = g_sfmtp_arr[tidx];
+  uintptr_t idx_start = (tidx * entry_pair_ct) / calc_thread_ct;
+  uintptr_t idx_ct = (((tidx + 1) * entry_pair_ct) / calc_thread_ct) - idx_start;
+  double* darray_iter = &(g_darray[idx_start * 2]);
+  for (uintptr_t ulii = 0; ulii < idx_ct; ++ulii) {
+    double dxx;
+    *darray_iter++ = rand_normal(sfmtp, &dxx);
+    *darray_iter++ = dxx;
+  }
+  THREAD_RETURN;
+}
+
+pglerr_t fill_gaussian_darray(uintptr_t entry_pair_ct, uint32_t thread_ct, double* darray) {
+  unsigned char* bigstack_mark = g_bigstack_base;
+  pglerr_t reterr = kPglRetSuccess;
+  {
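+    // cap the thread count so that each thread generates at least 2^18 =
+    // 262144 entry pairs (presumably not worth extra threads below that)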
+    const uintptr_t max_useful_thread_ct = DIV_UP(entry_pair_ct, 262144);
+    if (thread_ct > max_useful_thread_ct) {
+      thread_ct = max_useful_thread_ct;
+    }
+    pthread_t* threads;
+    if (bigstack_init_sfmtp(thread_ct, 1) ||
+	bigstack_alloc_thread(thread_ct, &threads)) {
+      goto fill_gaussian_darray_ret_NOMEM;
+    }
+    g_darray = darray;
+    g_entry_pair_ct = entry_pair_ct;
+    g_calc_thread_ct = thread_ct;
+    if (spawn_threads(fill_gaussian_darray_thread, thread_ct, threads)) {
+      goto fill_gaussian_darray_ret_THREAD_CREATE_FAIL;
+    }
+    fill_gaussian_darray_thread((void*)0);
+    join_threads(thread_ct, threads);
+  }
+  while (0) {
+  fill_gaussian_darray_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  fill_gaussian_darray_ret_THREAD_CREATE_FAIL:
+    reterr = kPglRetThreadCreateFail;
+    break;
+  }
+  bigstack_reset(bigstack_mark);
+  return reterr;
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/plink2_random.h b/plink2_random.h
new file mode 100644
index 0000000..3aa0074
--- /dev/null
+++ b/plink2_random.h
@@ -0,0 +1,46 @@
+#ifndef __PLINK2_RANDOM_H__
+#define __PLINK2_RANDOM_H__
+
+// This library is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation, either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_common.h"
+#include "SFMT.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+extern sfmt_t g_sfmt;
+
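+// Maps a uniform uint32 draw to the open interval (0, 1): the 0.5 offset
+// makes the minimum 2^-33 and the maximum 1 - 2^-33, so e.g.
+// log(rand_unif(...)) in rand_normal() is always finite.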
+HEADER_INLINE double rand_unif(sfmt_t* sfmtp) {
+  return (sfmt_genrand_uint32(sfmtp) + 0.5) * kRecip2m32;
+}
+
+double rand_normal(sfmt_t* sfmtp, double* secondval_ptr);
+
+extern sfmt_t** g_sfmtp_arr;
+
+boolerr_t bigstack_init_sfmtp(uint32_t thread_ct, uint32_t use_main_sfmt_as_element_zero);
+
+pglerr_t fill_gaussian_darray(uintptr_t entry_pair_ct, uint32_t thread_ct, double* darray);
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+
+#endif // __PLINK2_RANDOM_H__
diff --git a/plink2_set.cpp b/plink2_set.cpp
new file mode 100644
index 0000000..ebd2336
--- /dev/null
+++ b/plink2_set.cpp
@@ -0,0 +1,408 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_decompress.h"
+#include "plink2_set.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+pglerr_t load_range_list(const chr_info_t* cip, const uint32_t* variant_bps, const char* sorted_subset_ids, const char* file_descrip, uint32_t track_set_names, uint32_t border_extend, uint32_t collapse_group, uint32_t fail_on_no_sets, uint32_t c_prefix, uint32_t allow_no_variants, uintptr_t subset_ct, uintptr_t max_subset_id_blen, gzFile gz_infile, uintptr_t* set_ct_ptr, char** set_names_ptr, uintptr_t* max_set_id_blen_ptr, uint64_t** range_sort_buf_ptr, make_set_range_t*** make_set_range_arr_ptr) {
+  // In plink 1.9, called directly by extract_exclude_range(), define_sets(),
+  // and indirectly by annotate(), gene_report(), and clump_reports().
+  // Assumes caller will reset g_bigstack_end later.
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    g_textbuf[kMaxMediumLine - 1] = ' ';
+    ll_str_t* make_set_ll = nullptr;
+    char* set_names = nullptr;
+    uintptr_t set_ct = 0;
+    uintptr_t max_set_id_blen = 0;
+    // if we need to track set names, put together a sorted list
+    if (track_set_names) {
+      uintptr_t line_idx = 0;
+      while (gzgets(gz_infile, g_textbuf, kMaxMediumLine)) {
+	++line_idx;
+	if (!g_textbuf[kMaxMediumLine - 1]) {
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s file is pathologically long.\n", line_idx, file_descrip);
+	  goto load_range_list_ret_MALFORMED_INPUT_2;
+	}
+	char* textbuf_first_token = skip_initial_spaces(g_textbuf);
+	if (is_eoln_kns(*textbuf_first_token)) {
+	  continue;
+	}
+	char* first_token_end = token_endnn(textbuf_first_token);
+	char* cur_set_id = next_token_mult(first_token_end, 3);
+	char* last_token;
+	if (!collapse_group) {
+	  last_token = cur_set_id;
+	} else {
+	  last_token = next_token(cur_set_id);
+	}
+	if (no_more_tokens_kns(last_token)) {
+	  sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s file has fewer tokens than expected.\n", line_idx, file_descrip);
+	  goto load_range_list_ret_MALFORMED_INPUT_2;
+	}
+	const uint32_t chr_name_slen = (uintptr_t)(first_token_end - textbuf_first_token);
+	*first_token_end = '\0';
+	const int32_t cur_chr_code = get_chr_code(textbuf_first_token, cip, chr_name_slen);
+	if (cur_chr_code < 0) {
+	  sprintf(g_logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
+	  goto load_range_list_ret_MALFORMED_INPUT_2;
+	}
+	// chr_mask check removed, we want to track empty sets
+	uint32_t set_id_slen = strlen_se(cur_set_id);
+	cur_set_id[set_id_slen] = '\0';
+	if (subset_ct) {
+	  if (bsearch_str(cur_set_id, sorted_subset_ids, set_id_slen, max_subset_id_blen, subset_ct) == -1) {
+	    continue;
+	  }
+	}
+	if (collapse_group) {
+	  set_id_slen = strlen_se(last_token);
+	  last_token[set_id_slen] = '\0';
+	}
+	// when there are repeats, they are likely to be next to each other
+	if (make_set_ll && (!strcmp(make_set_ll->ss, last_token))) {
+	  continue;
+	}
+	uint32_t set_id_blen = set_id_slen + 1;
+	// argh, --clump counts positional overlaps which don't include any
+	// variants in the dataset.  So we prefix set IDs with a chromosome
+	// index in that case (with leading zeroes) and treat cross-chromosome
+	// sets as distinct.
+	if (!variant_bps) {
+	  set_id_blen += 4;
+	}
+	if (set_id_blen > max_set_id_blen) {
+	  max_set_id_blen = set_id_blen;
+	}
+	ll_str_t* ll_tmp;
+	if (bigstack_end_alloc_llstr(set_id_blen, &ll_tmp)) {
+	  goto load_range_list_ret_NOMEM;
+	}
+	ll_tmp->next = make_set_ll;
+	if (variant_bps) {
+	  memcpy(ll_tmp->ss, last_token, set_id_blen);
+	} else {
+	  uitoa_z4((uint32_t)cur_chr_code, ll_tmp->ss);
+	  // if first character of gene name is a digit, natural sort has
+	  // strange effects unless we force [3] to be nonnumeric...
+	  ll_tmp->ss[3] -= 15;
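+	  // e.g. chromosome code 22 -> "0022" -> "002#" ('2' - 15 == '#')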
+	  memcpy(&(ll_tmp->ss[4]), last_token, set_id_blen - 4);
+	}
+	make_set_ll = ll_tmp;
+	++set_ct;
+      }
+      if (!gzeof(gz_infile)) {
+	goto load_range_list_ret_READ_FAIL;
+      }
+      if (!set_ct) {
+	if (fail_on_no_sets) {
+	  if (variant_bps) {
+	    if (!allow_no_variants) {
+	      // okay, this is a kludge
+	      logerrprint("Error: All variants excluded by --gene{-all}, since no sets were defined from\n--make-set file.\n");
+	      reterr = kPglRetMalformedInput;
+	      goto load_range_list_ret_1;
+	    }
+	  } else {
+	    if (subset_ct) {
+	      logerrprint("Error: No --gene-subset genes present in --gene-report file.\n");
+	      reterr = kPglRetInconsistentInput;
+	    } else {
+	      logerrprint("Error: Empty --gene-report file.\n");
+	      reterr = kPglRetMalformedInput;
+	    }
+	    goto load_range_list_ret_1;
+	  }
+	}
+	LOGERRPRINTF("Warning: No valid ranges in %s file.\n", file_descrip);
+	goto load_range_list_ret_1;
+      }
+      // c_prefix is 0 or 2
+      max_set_id_blen += c_prefix;
+      if (max_set_id_blen > kMaxIdBlen) {
+	logerrprint("Error: Set IDs are limited to " MAX_ID_SLEN_STR " characters.\n");
+	goto load_range_list_ret_MALFORMED_INPUT;
+      }
+      char** strptr_arr;
+      if (bigstack_alloc_c(set_ct * max_set_id_blen, set_names_ptr) ||
+	  bigstack_alloc_cp(set_ct, &strptr_arr)) {
+	goto load_range_list_ret_NOMEM;
+      }
+      set_names = *set_names_ptr;
+      for (uintptr_t set_idx = 0; set_idx < set_ct; ++set_idx) {
+	strptr_arr[set_idx] = make_set_ll->ss;
+	make_set_ll = make_set_ll->next;
+      }
+      strptr_arr_nsort(set_ct, strptr_arr);
+      set_ct = copy_and_dedup_sorted_strptrs_to_strbox(strptr_arr, set_ct, max_set_id_blen, &(set_names[c_prefix]));
+      if (c_prefix) {
+	for (uintptr_t set_idx = 0; set_idx < set_ct; ++set_idx) {
+	  memcpy(&(set_names[set_idx * max_set_id_blen]), "C_", 2);
+	}
+      }
+      bigstack_shrink_top(set_names, set_ct * max_set_id_blen);
+      if (gzrewind(gz_infile)) {
+	goto load_range_list_ret_READ_FAIL;
+      }
+    } else {
+      set_ct = 1;
+    }
+    make_set_range_t** make_set_range_arr = (make_set_range_t**)bigstack_end_alloc(set_ct * sizeof(intptr_t));
+    if (!make_set_range_arr) {
+      goto load_range_list_ret_NOMEM;
+    }
+    for (uintptr_t set_idx = 0; set_idx < set_ct; ++set_idx) {
+      make_set_range_arr[set_idx] = nullptr;
+    }
+    uintptr_t line_idx = 0;
+    uint32_t chr_start = 0;
+    uint32_t chr_end = 0;
+    while (gzgets(gz_infile, g_textbuf, kMaxMediumLine)) {
+      ++line_idx;
+      if (!g_textbuf[kMaxMediumLine - 1]) {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s file is pathologically long.\n", line_idx, file_descrip);
+	goto load_range_list_ret_MALFORMED_INPUT_2;
+      }
+      char* textbuf_first_token = skip_initial_spaces(g_textbuf);
+      if (is_eoln_kns(*textbuf_first_token)) {
+	continue;
+      }
+      char* first_token_end = token_endnn(textbuf_first_token);
+      char* cur_set_id = next_token_mult(first_token_end, 3);
+      char* last_token;
+      if (!collapse_group) {
+	last_token = cur_set_id;
+      } else {
+	last_token = next_token(cur_set_id);
+      }
+      if (no_more_tokens_kns(last_token)) {
+	sprintf(g_logbuf, "Error: Line %" PRIuPTR " of %s file has fewer tokens than expected.\n", line_idx, file_descrip);
+	goto load_range_list_ret_MALFORMED_INPUT_2;
+      }
+      const uint32_t chr_name_slen = (uintptr_t)(first_token_end - textbuf_first_token);
+      *first_token_end = '\0';
+      const int32_t cur_chr_code = get_chr_code(textbuf_first_token, cip, chr_name_slen);
+      if (cur_chr_code < 0) {
+	sprintf(g_logbuf, "Error: Invalid chromosome code on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
+	goto load_range_list_ret_MALFORMED_INPUT_2;
+      }
+      if (!is_set(cip->chr_mask, cur_chr_code)) {
+	continue;
+      }
+      if (variant_bps) {
+	const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[(uint32_t)cur_chr_code];
+	chr_start = cip->chr_fo_vidx_start[chr_fo_idx];
+	chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
+	if (chr_end == chr_start) {
+	  continue;
+	}
+	// might need to move this outside the if-statement later
+	if (subset_ct && (bsearch_str(cur_set_id, sorted_subset_ids, strlen_se(cur_set_id), max_subset_id_blen, subset_ct) == -1)) {
+	  continue;
+	}
+      }
+      char* textbuf_iter = skip_initial_spaces(&(first_token_end[1]));
+      uint32_t range_first;
+      if (scanadv_uint_defcap(&textbuf_iter, &range_first)) {
+	sprintf(g_logbuf, "Error: Invalid range start position on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
+	goto load_range_list_ret_MALFORMED_INPUT_2;
+      }
+      textbuf_iter = next_token(textbuf_iter);
+      uint32_t range_last;
+      if (scanadv_uint_defcap(&textbuf_iter, &range_last)) {
+	sprintf(g_logbuf, "Error: Invalid range end position on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
+	goto load_range_list_ret_MALFORMED_INPUT_2;
+      }
+      if (range_last < range_first) {
+	sprintf(g_logbuf, "Error: Range end position smaller than range start on line %" PRIuPTR " of %s file.\n", line_idx, file_descrip);
+	wordwrapb(0);
+	goto load_range_list_ret_MALFORMED_INPUT_2;
+      }
+      if (border_extend > range_first) {
+	range_first = 0;
+      } else {
+	range_first -= border_extend;
+      }
+      range_last += border_extend;
+      uint32_t cur_set_idx = 0;
+      if (set_ct > 1) {
+	// bugfix: bsearch_str_natural requires null-terminated string
+	const uint32_t last_token_slen = strlen_se(last_token);
+	last_token[last_token_slen] = '\0';
+	if (c_prefix) {
+	  last_token = &(last_token[-2]);
+	  memcpy(last_token, "C_", 2);
+	} else if (!variant_bps) {
+	  last_token = &(last_token[-4]);
+	  uitoa_z4((uint32_t)cur_chr_code, last_token);
+	  last_token[3] -= 15;
+	}
+	// this should never fail
+	cur_set_idx = (uint32_t)bsearch_str_natural(last_token, set_names, max_set_id_blen, set_ct);
+      }
+      if (variant_bps) {
+	// translate to within-chromosome uidx
+	range_first = uint32arr_greater_than(&(variant_bps[chr_start]), chr_end - chr_start, range_first);
+	range_last = uint32arr_greater_than(&(variant_bps[chr_start]), chr_end - chr_start, range_last + 1);
+	if (range_last > range_first) {
+	  make_set_range_t* msr_tmp = (make_set_range_t*)bigstack_end_alloc(sizeof(make_set_range_t));
+	  if (!msr_tmp) {
+	    goto load_range_list_ret_NOMEM;
+	  }
+	  msr_tmp->next = make_set_range_arr[cur_set_idx];
+	  // normally, I'd keep chr_idx here since that enables by-chromosome
+	  // sorting, but that's probably not worth bloating make_set_range_t
+	  // from 16 to 32 bytes
+	  msr_tmp->uidx_start = chr_start + range_first;
+	  msr_tmp->uidx_end = chr_start + range_last;
+	  make_set_range_arr[cur_set_idx] = msr_tmp;
+	}
+      } else {
+	make_set_range_t* msr_tmp = (make_set_range_t*)bigstack_end_alloc(sizeof(make_set_range_t));
+	if (!msr_tmp) {
+	  goto load_range_list_ret_NOMEM;
+	}
+	msr_tmp->next = make_set_range_arr[cur_set_idx];
+	msr_tmp->uidx_start = range_first;
+	msr_tmp->uidx_end = range_last + 1;
+	make_set_range_arr[cur_set_idx] = msr_tmp;
+      }
+    }
+    if (!gzeof(gz_infile)) {
+      goto load_range_list_ret_READ_FAIL;
+    }
+    // allocate buffer for sorting ranges later
+    uint32_t max_set_range_ct = 0;
+    for (uint32_t set_idx = 0; set_idx < set_ct; ++set_idx) {
+      uint32_t cur_set_range_ct = 0;
+      make_set_range_t* msr_tmp = make_set_range_arr[set_idx];
+      while (msr_tmp) {
+	++cur_set_range_ct;
+	msr_tmp = msr_tmp->next;
+      }
+      if (cur_set_range_ct > max_set_range_ct) {
+	max_set_range_ct = cur_set_range_ct;
+      }
+    }
+    if (range_sort_buf_ptr) {
+      if (bigstack_end_alloc_ull(max_set_range_ct, range_sort_buf_ptr)) {
+	goto load_range_list_ret_NOMEM;
+      }
+    }
+    if (set_ct_ptr) {
+      *set_ct_ptr = set_ct;
+    }
+    if (max_set_id_blen_ptr) {
+      *max_set_id_blen_ptr = max_set_id_blen;
+    }
+    *make_set_range_arr_ptr = make_set_range_arr;
+  }
+  while (0) {
+  load_range_list_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  load_range_list_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  load_range_list_ret_MALFORMED_INPUT_2:
+    logerrprintb();
+  load_range_list_ret_MALFORMED_INPUT:
+    reterr = kPglRetMalformedInput;
+    break;
+  }
+ load_range_list_ret_1:
+  return reterr;
+}
+
+pglerr_t extract_exclude_range(const char* fnames, const chr_info_t* cip, const uint32_t* variant_bps, uint32_t raw_variant_ct, uint32_t do_exclude, uintptr_t* variant_include, uint32_t* variant_ct_ptr) {
+  const uint32_t orig_variant_ct = *variant_ct_ptr;
+  if (!orig_variant_ct) {
+    return kPglRetSuccess;
+  }
+  unsigned char* bigstack_mark = g_bigstack_base;
+  unsigned char* bigstack_end_mark = g_bigstack_end;
+  gzFile gz_infile = nullptr;
+  pglerr_t reterr = kPglRetSuccess;
+  {
+    const uintptr_t raw_variant_ctl = BITCT_TO_WORDCT(raw_variant_ct);
+    uintptr_t* variant_include_mask = nullptr;
+    if (!do_exclude) {
+      if (bigstack_calloc_ul(raw_variant_ctl, &variant_include_mask)) {
+	goto extract_exclude_range_ret_NOMEM;
+      }
+    }
+    const char* fnames_iter = fnames;
+    do {
+      reterr = gzopen_read_checked(fnames_iter, &gz_infile);
+      if (reterr) {
+	goto extract_exclude_range_ret_1;
+      }
+      make_set_range_t** range_arr = nullptr;
+      reterr = load_range_list(cip, variant_bps, nullptr, do_exclude? "--exclude range" : "--extract range", 0, 0, 0, 0, 0, 1, 0, 0, gz_infile, nullptr, nullptr, nullptr, nullptr, &range_arr);
+      if (reterr) {
+	goto extract_exclude_range_ret_1;
+      }
+      if (gzclose_null(&gz_infile)) {
+	goto extract_exclude_range_ret_READ_FAIL;
+      }
+      make_set_range_t* msr_tmp = range_arr[0];
+      if (do_exclude) {
+	while (msr_tmp) {
+	  clear_bits_nz(msr_tmp->uidx_start, msr_tmp->uidx_end, variant_include);
+	  msr_tmp = msr_tmp->next;
+	}
+      } else {
+	while (msr_tmp) {
+	  fill_bits_nz(msr_tmp->uidx_start, msr_tmp->uidx_end, variant_include_mask);
+	  msr_tmp = msr_tmp->next;
+	}
+      }
+      fnames_iter = (const char*)rawmemchr(fnames_iter, '\0');
+      ++fnames_iter;
+    } while (*fnames_iter);
+    if (!do_exclude) {
+      bitvec_and(variant_include_mask, raw_variant_ctl, variant_include);
+    }
+    *variant_ct_ptr = popcount_longs(variant_include, raw_variant_ctl);
+    if (*variant_ct_ptr == orig_variant_ct) {
+      LOGERRPRINTF("Warning: No variants excluded by '--%s range'.\n", do_exclude? "exclude" : "extract");
+    } else {
+      const uint32_t excluded_ct = orig_variant_ct - (*variant_ct_ptr);
+      LOGPRINTF("--%s range: %u variant%s excluded.\n", do_exclude? "exclude" : "extract", excluded_ct, (excluded_ct == 1)? "" : "s");
+    }
+  }
+  while (0) {
+  extract_exclude_range_ret_NOMEM:
+    reterr = kPglRetNomem;
+    break;
+  extract_exclude_range_ret_READ_FAIL:
+    reterr = kPglRetReadFail;
+    break;
+  }
+ extract_exclude_range_ret_1:
+  gzclose_cond(gz_infile);
+  bigstack_double_reset(bigstack_mark, bigstack_end_mark);
+  return reterr;
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/plink2_set.h b/plink2_set.h
new file mode 100644
index 0000000..2d677ae
--- /dev/null
+++ b/plink2_set.h
@@ -0,0 +1,39 @@
+#ifndef __PLINK2_SET_H__
+#define __PLINK2_SET_H__
+
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_common.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
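+// Singly-linked list node covering the half-open variant-index interval
+// [uidx_start, uidx_end).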
+typedef struct make_set_range_struct {
+  struct make_set_range_struct* next;
+  uint32_t uidx_start;
+  uint32_t uidx_end;
+} make_set_range_t;
+
+pglerr_t extract_exclude_range(const char* fnames, const chr_info_t* cip, const uint32_t* variant_bps, uint32_t raw_variant_ct, uint32_t do_exclude, uintptr_t* variant_include, uint32_t* variant_ct_ptr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __PLINK2_SET_H__
diff --git a/plink2_stats.cpp b/plink2_stats.cpp
new file mode 100644
index 0000000..7a256d9
--- /dev/null
+++ b/plink2_stats.cpp
@@ -0,0 +1,2102 @@
+// This file is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This program is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+// more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_stats.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+// Thread-unsafe portions of plink_stats.c have been replaced, mostly by code
+// derived from boost/math/special_functions/gamma.hpp and
+// boost/math/special_functions/detail/igamma_inverse.hpp in Boost 1.60
+// (Maddock et al.).  The derived portions are subject to the following
+// license:
+//
+// *****
+// Boost Software License - Version 1.0 - August 17th, 2003
+//
+// Permission is hereby granted, free of charge, to any person or organization
+// obtaining a copy of the software and accompanying documentation covered by
+// this license (the "Software") to use, reproduce, display, distribute,
+// execute, and transmit the Software, and to prepare derivative works of the
+// Software, and to permit third-parties to whom the Software is furnished to
+// do so, all subject to the following:
+//
+// The copyright notices in the Software and this entire statement, including
+// the above license grant, this restriction and the following disclaimer,
+// must be included in all copies of the Software, in whole or in part, and
+// all derivative works of the Software, unless such copies or derivative
+// works are solely in the form of machine-executable object code generated by
+// a source language processor.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+// *****
+
+// ***** thread-safe chiprob_p *****
+// port of Boost 1.60 implementation, float precision
+
+static const double log_min_value = -708.0;
+static const double log_max_value = 709.0;
+
+static const double kLentzFpmin = 1.0e-30;
+
+static const double kFactorials[30] = {
+  1.0,
+  1.0,
+  2.0,
+  6.0,
+  24.0,
+  120.0,
+  720.0,
+  5040.0,
+  40320.0,
+  362880.0,
+  3628800.0,
+  39916800.0,
+  479001600.0,
+  6227020800.0,
+  87178291200.0,
+  1307674368000.0,
+  20922789888000.0,
+  355687428096000.0,
+  6402373705728000.0,
+  121645100408832000.0,
+  0.243290200817664e19,
+  0.5109094217170944e20,
+  0.112400072777760768e22,
+  0.2585201673888497664e23,
+  0.62044840173323943936e24,
+  0.15511210043330985984e26,
+  0.403291461126605635584e27,
+  0.10888869450418352160768e29,
+  0.304888344611713860501504e30,
+  0.8841761993739701954543616e31
+};
+
+double finite_gamma_q(uint32_t aa, double xx, double* p_derivative) {
+  // a is a positive integer < 30; max(0.6, a-1) < x < log_max_value
+  // (e^{-x})(1 + x + x^2/2 + x^3/3! + x^4/4! + ... + x^{a-1}/(a-1)!)
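+  // Equivalently, this is the upper regularized gamma function Q(a, x) for
+  // integer a, i.e. the probability that a Poisson(x) variate is less than a;
+  // the loop below just accumulates the Poisson pmf terms.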
+  const double ee = exp(-xx);
+  if (ee == 0.0) {
+    return 0;
+  }
+  double sum = ee;
+  double term = sum;
+  for (uint32_t nn = 1; nn < aa; ++nn) {
+    term /= (double)((int32_t)nn);
+    term *= xx;
+    sum += term;
+  }
+  if (p_derivative) {
+    *p_derivative = ee * pow(xx, (int32_t)aa) / kFactorials[aa - 1];
+  }
+  return sum;
+}
+
+static const double kSqrtPi = 1.7724538509055159;
+static const double kSqrt2 = 1.4142135623730951;
+
+double lower_gamma_series(double aa, double zz, double init_value) {
+  // z must not be much larger than a
+  double result = 1;
+  double total = init_value;
+  double rr;
+  do {
+    rr = result;
+    aa += 1.0;
+    result *= zz / aa;
+    total += rr;
+  } while (fabs(rr) > (kBigEpsilon * kBigEpsilon));
+  return total;
+}
+
+double upper_gamma_fraction(double a1, double z1) {
+  // evaluate a_1 / (b_1 + (a_2 / (b_2 + (a_3 / (b_3 + ...)))))
+  // see Boost continued_fraction_a(), upper_incomplete_gamma_fract
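+  // Modified Lentz algorithm: cc tracks the ratio of successive continued-
+  // fraction numerators, dd the (inverted) ratio of successive denominators,
+  // and hh the running product of the cc * dd deltas; kLentzFpmin stands in
+  // for near-zero intermediates so the iteration never divides by zero.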
+  double cur_b = z1 - a1 + 3;
+
+  double hh = cur_b;
+  const double a0 = a1 - 1.0;
+  if (fabs(hh) < kLentzFpmin) {
+    hh = kLentzFpmin;
+  }
+  double cc = hh;
+  double dd = 0.0;
+  for (double kk = 2.0; kk <= 100.0; kk += 1.0) {
+    const double cur_a = kk * (a1 - kk);
+    cur_b += 2.0;
+    dd = cur_b + cur_a * dd;
+    if (fabs(dd) < kLentzFpmin) {
+      dd = kLentzFpmin;
+    }
+    cc = cur_b + cur_a / cc;
+    if (fabs(cc) < kLentzFpmin) {
+      cc = kLentzFpmin;
+    }
+    dd = 1.0 / dd;
+    const double delta = cc * dd;
+    hh *= delta;
+    if (fabs(delta - 1.0) < 3.0e-7) {
+      break;
+    }
+  }
+  const double cont_frac = a0 / hh;
+  return 1 / (z1 - a1 + 1 + cont_frac);
+}
+
+double small_gamma2_series(double aa, double xx, double init_value) {
+  double apn = aa + 1;
+  const double negx = -xx;
+  double nn = 1;
+  double result = negx;
+  double total = init_value;
+  double rr;
+  do {
+    rr = result / apn;
+    result *= negx;
+    nn += 1.0;
+    result /= nn;
+    apn += 1;
+    total += rr;
+  } while (fabs(rr) > (kBigEpsilon * kBigEpsilon));
+  return total;
+}
+
+double tgamma_small_upper_part_df1(double xx, uint32_t invert, double* p_derivative, double* pgam) {
+  // x < 1.1
+  // df == 1, a == 0.5
+  double result = 0.5 * kSqrtPi - 1.0;
+  *pgam = (result + 1) * 2;
+  double pp = sqrt(xx) - 1.0; // no point in using powm1() with ^0.5
+  result -= pp;
+  result *= 2;
+  pp += 1;
+  if (p_derivative) {
+    *p_derivative = pp / ((*pgam) * exp(xx));
+  }
+  const double init_value = invert? (*pgam) : 0;
+  result = -pp * small_gamma2_series(0.5, xx, (init_value - result) / pp);
+  if (invert) {
+    result = -result;
+  }
+  return result;
+}
+
+// from Numerical Recipes in Fortran 77: The Art of Scientific Computing, via
+// Wikipedia
+// maximal error of 1.2e-7
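+// (The approximation below assumes zz >= 0; every call site in this file
+// passes a square root, so this holds.)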
+double erfc_fast(double zz) {
+  const double tt = 1.0 / (1.0 + 0.5 * zz);
+  const double tau = tt * exp(((((((((0.17087277 * tt - 0.82215223) * tt + 1.48851587) * tt - 1.13520398) * tt + 0.27886807) * tt - 0.18628806) * tt + 0.09678418) * tt + 0.37409196) * tt + 1.00002368) * tt - 1.26551223 - zz * zz);
+  return tau;
+}
+
+double finite_half_gamma_q(double aa, double xx, double* p_derivative) {
+  // a is in {0.5, 1.5, ..., 29.5}; max(0.2, a-1) < x < log_max_value
+  const double sqrt_x = sqrt(xx);
+  double ee = erfc_fast(sqrt_x);
+  if ((ee != 0) && (aa > 1)) {
+    double term = exp(-xx) / (kSqrtPi * sqrt_x);
+    term *= xx * 2;
+    double sum = term;
+    for (double nn = 1.5; nn < aa; nn += 1.0) {
+      term /= nn;
+      term *= xx;
+      sum += term;
+    }
+    ee += sum;
+    if (p_derivative) {
+      *p_derivative = 0;
+    }
+  } else if (p_derivative) {
+    *p_derivative = sqrt_x * exp(-xx) * (1.0 / kSqrtPi);
+  }
+  return ee;
+}
+
+static const double kLanczosSumExpgNumer[6] = {32.812445410297834, 32.123889414443320, 12.580347294552161, 2.4634444783532414, 0.2412010548258800, 0.0094469677045392};
+static const double kLanczosSumExpgDenom[6] = {0, 24, 50, 35, 10, 1};
+
+// this depends on the polynomial coefficients above
+static const double kLanczosG = 5.581;
+
+double lanczos_sum_expg_scaled_recip(double zz) {
+  double s1;
+  double s2;
+  if (zz <= 1) {
+    s1 = kLanczosSumExpgNumer[5];
+    s2 = kLanczosSumExpgDenom[5];
+    for (int32_t ii = 4; ii >= 0; --ii) {
+      s1 *= zz;
+      s2 *= zz;
+      s1 += kLanczosSumExpgNumer[(uint32_t)ii];
+      s2 += kLanczosSumExpgDenom[(uint32_t)ii];
+    }
+  } else {
+    zz = 1 / zz;
+    s1 = kLanczosSumExpgNumer[0];
+    s2 = kLanczosSumExpgDenom[0];
+    for (uint32_t uii = 1; uii < 6; ++uii) {
+      s1 *= zz;
+      s2 *= zz;
+      s1 += kLanczosSumExpgNumer[uii];
+      s2 += kLanczosSumExpgDenom[uii];
+    }
+  }
+  // return the reciprocal (Boost's lanczos_sum_expg_scaled() returns s1/s2);
+  // the caller multiplies by this value instead of dividing
+  return s2 / s1;
+}
+
+double log1pmx(double xx) {
+  // log(1+x) - x
+  // assumes abs(xx) < 0.95
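+  // Maclaurin series: log(1+x) - x = -x^2/2 + x^3/3 - x^4/4 + ...; the loop
+  // below sums these terms directly, and for very small |x| the quadratic
+  // term alone already meets the precision target.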
+  const double aa = fabs(xx);
+  if (aa < (kBigEpsilon / kSqrt2)) { // 2^{-21.5}
+    return -xx * xx * 0.5;
+  }
+  double kk = 1.0; // skip first term of usual log(1+x) series
+  const double m_mult = -xx;
+  double m_prod = xx;
+  double total = 0.0;
+  double rr;
+  do {
+    m_prod *= m_mult;
+    kk += 1.0;
+    rr = m_prod / kk;
+    total += rr;
+    // todo: tune these epsilons, but let's wait until we know all of the
+    // callers of these functions
+  } while (fabs(rr) > (kBigEpsilon * kBigEpsilon));
+  return total;
+}
+
+// compute (z^a)(e^{-z})/tgamma(a)
+double regularized_gamma_prefix(double aa, double zz) {
+  // assumes a == 0.5 if a < 1.  assumes z > 0.
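+  // Strategy: rewrite the prefix via the Lanczos approximation to tgamma(a),
+  // with agh = a + g - 0.5; the mathematically equivalent branches below
+  // differ only in how they keep the intermediate pow()/exp() calls from
+  // overflowing or underflowing.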
+  // we are fine with float-level precision, so lanczos_n=6, kLanczosG=5.581
+  if (aa < 1) {
+    return sqrt(zz) * exp(-zz) * (1.0 / kSqrtPi);
+  }
+  const double agh = aa + kLanczosG - 0.5;
+  const double agh_recip = 1.0 / agh;
+  const double dd = ((zz - aa) - (kLanczosG - 0.5)) * agh_recip;
+  double prefix;
+  if ((fabs(dd * dd * aa) <= 100) && (aa > 150)) {
+    // abs(dd) < sqrt(2/3) < 0.95
+    prefix = aa * log1pmx(dd) + zz * (0.5 - kLanczosG) * agh_recip;
+    prefix = exp(prefix);
+  } else {
+    const double alz = aa * log(zz * agh_recip);
+    const double amz = aa - zz;
+    const double cur_minv = MINV(alz, amz);
+    if ((cur_minv <= log_min_value) || (MAXV(alz, amz) >= log_max_value)) {
+      const double amza = amz / aa;
+      double sq;
+      if ((cur_minv > 2 * log_min_value) && (MAXV(alz, amz) < 2 * log_max_value)) {
+	sq = pow(zz * agh_recip, aa * 0.5) * exp(amz * 0.5);
+	prefix = sq * sq;
+      } else if ((cur_minv > 4 * log_min_value) && (MAXV(alz, amz) < 4 * log_max_value) && (zz > aa)) {
+	sq = pow(zz * agh_recip, aa * 0.25) * exp(amz * 0.25);
+	prefix = sq * sq;
+	prefix *= prefix;
+      } else if ((amza > log_min_value) && (amza < log_max_value)) {
+	prefix = pow((zz * exp(amza)) * agh_recip, aa);
+      } else {
+	prefix = exp(alz + amz);
+      }
+    } else {
+      prefix = pow(zz * agh_recip, aa) * exp(amz);
+    }
+  }
+  prefix *= sqrt(agh * kRecipE) * lanczos_sum_expg_scaled_recip(aa);
+  return prefix;
+}
+
+static const double kTemmeC0[7] = {-0.333333333, 0.0833333333, -0.0148148148, 0.00115740741, 0.000352733686, -0.000178755144, 0.391926318e-4};
+static const double kTemmeC1[5] = {-0.00185185185, -0.00347222222, 0.00264550265, -0.000990226337, 0.000205761317};
+static const double kTemmeC2[3] = {0.00413359788, -0.00268132716, 0.000771604938};
+
+double igamma_temme_large(double aa, double xx) {
+  // 24-bit precision is fine
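+  // Temme's uniform asymptotic expansion: Q(a, x) ~= 0.5 * erfc(sqrt(a*phi))
+  // plus a correction term built from polynomials in z, weighted by powers of
+  // 1/a, where phi = -log1pmx((x-a)/a); the kTemmeC* arrays above hold the
+  // polynomial coefficients.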
+  const double sigma = (xx - aa) / aa;
+  // abs(sigma) < 0.4
+  const double phi = -log1pmx(sigma);
+  const double sqrt_a = sqrt(aa);
+  const double sqrt_phi = sqrt(phi);
+  const double yy = aa * phi;
+  double zz = kSqrt2 * sqrt_phi;
+  if (xx < aa) {
+    zz = -zz;
+  }
+  double workspace[3];
+  workspace[0] = (((((kTemmeC0[6] * zz + kTemmeC0[5]) * zz + kTemmeC0[4]) * zz + kTemmeC0[3]) * zz + kTemmeC0[2]) * zz + kTemmeC0[1]) * zz + kTemmeC0[0];
+  workspace[1] = (((kTemmeC1[4] * zz + kTemmeC1[3]) * zz + kTemmeC1[2]) * zz + kTemmeC1[1]) * zz + kTemmeC1[0];
+  workspace[2] = (kTemmeC2[2] * zz + kTemmeC2[1]) * zz + kTemmeC2[0];
+  const double a_recip = 1 / aa;
+  double result = (workspace[2] * a_recip + workspace[1]) * a_recip + workspace[0];
+  result *= exp(-yy) / ((kSqrt2 * kSqrtPi) * sqrt_a);
+  if (xx < aa) {
+    result = -result;
+  }
+  result += erfc_fast(sqrt_a * sqrt_phi) * 0.5;
+  return result;
+}
+
+double gamma_incomplete_imp2(uint32_t df, double xx, uint32_t invert, double* p_derivative) {
+  assert(df);
+  assert(xx >= 0.0);
+  const double aa = ((double)((int32_t)df)) * 0.5;
+  const uint32_t is_small_a = (df < 60) && (aa <= xx + 1) && (xx < log_max_value);
+  uint32_t is_int = 0;
+  uint32_t is_half_int = 0;
+  if (is_small_a) {
+    is_half_int = df % 2;
+    is_int = !is_half_int;
+  }
+  uint32_t eval_method;
+  if (is_int && (xx > 0.6)) {
+    invert = !invert;
+    eval_method = 0;
+  } else if (is_half_int && (xx > 0.2)) {
+    invert = !invert;
+    eval_method = 1;
+  } else if (xx < kSmallEpsilon) {
+    // avoid computing log(0)
+    // don't need more precision here, 6 digits is enough
+    assert(!p_derivative);
+    return 1.0;
+  } else if (xx < 0.5) {
+    // log(x) is negative
+    // -0.4 / log(x) >= 0.5 (this is impossible for larger a)
+    // -> -0.4 <= 0.5 * log(x)
+    // -> -0.8 <= log(x)
+    // -> e^{-0.8} <= x
+    eval_method = 2 + ((df == 1) && (xx >= 0.44932896411722156));
+  } else if (xx < 1.1) {
+    // x * 0.75 >= 0.5
+    // x >= 2/3
+    eval_method = 2 + ((df == 1) && (xx >= (2.0 / 3.0)));
+  } else {
+    const double x_minus_a = xx - aa;
+    uint32_t use_temme = 0;
+    if (aa > 20) {
+      // sigma = abs((x - a) / a);
+      // igamma_temme_large() assumes abs(sigma) < 0.95
+      if (aa > 200) {
+	// abs(sigma) < sqrt(20 / a) < 0.316...
+	use_temme = (20 * aa > x_minus_a * x_minus_a);
+      } else {
+	// abs(sigma) < 0.4
+        const double sigma_times_a = fabs(x_minus_a);
+	use_temme = (sigma_times_a < 0.4 * aa);
+      }
+    }
+    if (use_temme) {
+      eval_method = 5;
+    } else {
+      // x - (1 / (3 * x)) < a
+      // x * x - (1/3) < a * x
+      // x * x - a * x < 1/3
+      // x * (x - a) < 1/3
+      if (xx * x_minus_a < (1.0 / 3.0)) {
+	eval_method = 2;
+      } else {
+	eval_method = 4;
+	invert = !invert;
+      }
+    }
+  }
+  double result;
+  switch(eval_method) {
+  case 0:
+    result = finite_gamma_q(df / 2, xx, p_derivative);
+    break;
+  case 1:
+    // previously used erfc, but that was up to ~3x as slow as dcdflib (e.g.
+    // chiprob_p(2.706, 1) case).
+    result = finite_half_gamma_q(aa, xx, p_derivative);
+    if (p_derivative && (*p_derivative == 0)) {
+      *p_derivative = regularized_gamma_prefix(aa, xx);
+    }
+    break;
+  case 2:
+    result = regularized_gamma_prefix(aa, xx);
+    if (p_derivative) {
+      *p_derivative = result;
+    }
+    if (result != 0) {
+      // uint32_t optimized_invert = 0;
+      double init_value = 0;
+      if (invert) {
+	init_value = -aa / result;
+	// optimized_invert = 1;
+      }
+      result *= lower_gamma_series(aa, xx, init_value) / aa;
+      // if (optimized_invert) {
+      if (invert) {
+	invert = 0;
+	result = -result;
+      }
+    }
+    break;
+  case 3:
+    {
+      invert = !invert;
+      double gg;
+      result = tgamma_small_upper_part_df1(xx, invert, p_derivative, &gg);
+      invert = 0;
+      result /= gg;
+    }
+    break;
+  case 4:
+    result = regularized_gamma_prefix(aa, xx);
+    if (p_derivative) {
+      *p_derivative = result;
+    }
+    if (result != 0) {
+      result *= upper_gamma_fraction(aa, xx);
+    }
+    break;
+  case 5:
+    result = igamma_temme_large(aa, xx);
+    if (xx >= aa) {
+      invert = !invert;
+    }
+    if (p_derivative) {
+      *p_derivative = regularized_gamma_prefix(aa, xx);
+    }
+  }
+  if (result > 1) {
+    result = 1;
+  }
+  if (invert) {
+    result = 1 - result;
+  }
+  if (p_derivative) {
+    if ((xx < 1) && (DBL_MAX * xx < (*p_derivative))) {
+      *p_derivative = DBL_MAX / 2; // overflow; do we really need this?
+    } else {
+      *p_derivative /= xx;
+    }
+  }
+  return result;
+}
+
+double chiprob_p(double chisq, uint32_t df) {
+  // todo: figure out when we were depending on this to return -9, and decide
+  // how to handle those situations now
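+  // sanity-check values: chiprob_p(3.841, 1) and chiprob_p(5.991, 2) should
+  // both be approximately 0.05 (the familiar 5% chi-square critical values)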
+  return gamma_incomplete_imp2(df, chisq * 0.5, 1, nullptr);
+}
+
+// ***** end thread-safe chiprob_p *****
+
+
+// ***** thread-safe inverse_chiprob *****
+// port of Boost 1.60 implementation
+
+double find_inverse_gamma2(uint32_t df, double pp, double qq, uint32_t* has_10_digits_ptr) {
+  // currently assumes *has_10_digits_ptr initialized to zero
+  if (df == 2) {
+    return -log(qq);
+  }
+  if (df == 1) {
+    // g == tgamma(0.5) == sqrt(pi)
+    const double bb = qq * kSqrtPi;
+    if (bb >= 0.45) {
+      // b * q > 1e-8, q > 1e-5 guaranteed
+      // u = pow(p * g * a, 1/a)
+      //   = pow(p * g * 0.5, 2)
+      //   = p * p * g * g * 0.25
+      //   = p * p * pi * 0.25
+      const double uu = pp * pp * (0.25 * kPi);
+      return (uu / (1 - (uu * (1.0 / 1.5))));
+    } else {
+      const double yy = -log(bb);
+      if (bb > 0.1) {
+	const double uu = yy - 0.5 * log(yy);
+	if (bb > 0.15) {
+	  return (yy - 0.5 * log(uu) - log(1 + 0.5 / uu));
+	}
+	return (yy - 0.5 * log(uu) - log(((uu + 5) * uu + 3.75) / ((uu + 4.5) * uu + 2)));
+      } else {
+	const double c1 = -0.5 * log(yy);
+	const double c1_2 = c1 * c1;
+	const double c1_3 = c1_2 * c1;
+	const double c1_4 = c1_2 * c1_2;
+	// a_2 = 0.25
+	// a_3 = 0.125
+
+	const double c2 = -0.5 * (1 + c1);
+	const double c3 = 0.25 * c1_2 + 0.75 * c1 + 0.875;
+	const double c4 = c1_3 * (-1.0 / 6.0) - 0.875 * c1_2 - 1.875 * c1 - (26.75 / 12.0);
+	const double c5 = 0.125 * c1_4 + (5.75 / 6.0) * c1_3 + 3.625 * c1_2 + 7.75 * c1 + (83.0625 / 12.0);
+
+	const double y_recip = 1.0 / yy;
+	const double y_recip_2 = y_recip * y_recip;
+	const double y_recip_3 = y_recip_2 * y_recip;
+	const double y_recip_4 = y_recip_2 * y_recip_2;
+	if (bb < 1e-28) {
+	  *has_10_digits_ptr = 1;
+	}
+	// er, I'd think this should just use Horner's instead?
+	return (yy + c1 + c2 * y_recip + c3 * y_recip_2 + c4 * y_recip_3 + c5 * y_recip_4);
+      }
+    }
+  }
+  // not implemented yet
+  assert(0);
+  exit(1);
+  return 0;
+}
+
+double gamma_p_inv_imp2(uint32_t df, double qq) {
+  assert(df);
+  assert(qq > 0.0);
+  if (qq >= 1.0 - kSmallEpsilon) {
+    return 0;
+  }
+  double pp = 1.0 - qq;
+  uint32_t has_10_digits = 0;
+  double guess = find_inverse_gamma2(df, pp, qq, &has_10_digits);
+  if (has_10_digits) {
+    return guess;
+  }
+  double min_guess = kSmallEpsilon;
+  double max_guess = DBL_MAX;
+  if (guess < kSmallEpsilon) {
+    guess = kSmallEpsilon;
+  }
+  // halley iteration, digits == 24, lower == kSmallEpsilon
+  // see second_order_root_finder in boost/math/tools/roots.hpp
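+  // Halley update for the root of f0(x) = P(df, x) - p:
+  //   x_{n+1} = x_n - 2*f0*f1 / (2*f1^2 - f0*f2)
+  // The loop computes this as delta = denom / numer with denom = 2*f0 and
+  // numer = 2*f1 - f0*(f2/f1), falling back to the plain Newton step f0/f1
+  // when the second-order term is unusable.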
+  const uint32_t invert = (pp > 0.9);
+  if (invert) {
+    pp = qq;
+  }
+  const double a_minus_1 = 0.5 * (double)(((int32_t)df) - 2);
+  const double factor = 1.1920928955078125e-07; // 2^{-23}
+  double result = guess;
+  double delta = 10000000;
+  double delta1 = delta;
+  uint32_t out_of_bounds_sentry = 0;
+  do {
+    double delta2 = delta1;
+    delta1 = delta;
+
+    // see gamma_p_inverse_func in
+    // boost/math/special_functions/detail/igamma_inverse.hpp
+    double f1;
+    const double ff = gamma_incomplete_imp2(df, result, invert, &f1);
+    const double div = (a_minus_1 - result) / result;
+    double f2 = f1;
+    if ((fabs(div) > 1) && (DBL_MAX / fabs(div) < f2)) {
+      // overflow
+      f2 = -DBL_MAX / 2;
+    } else {
+      f2 *= div;
+    }
+    if (invert) {
+      f1 = -f1;
+      f2 = -f2;
+    }
+    const double f0 = ff - pp;
+    if (f0 == 0) {
+      break;
+    }
+    assert(f1 != 0); // shouldn't be possible, function is monotonic
+    delta = f0 / f1;
+    if (f2 != 0) {
+      // delta = Stepper::step(result, f0, f1, f2);
+      const double denom = 2 * f0;
+      const double numer = 2 * f1 - f0 * (f2 / f1);
+      if ((fabs(numer) >= 1) || (fabs(denom) < fabs(numer) * DBL_MAX)) {
+	const double halley_step = denom / numer;
+	if (halley_step / delta < 0) {
+	  if (fabs(delta) > 2 * fabs(guess)) {
+	    delta = ((delta < 0)? -1 : 1) * 2 * fabs(guess);
+	  }
+	} else {
+	  delta = halley_step;
+	}
+      }
+    }
+    double convergence = fabs(delta / delta2);
+    if ((convergence > 0.8) && (convergence < 2)) {
+      delta = (delta > 0)? (0.5 * (result - min_guess)) : (0.5 * (result - max_guess));
+      if (fabs(delta) > result) {
+	delta = ((delta > 0)? 1 : -1) * result;
+      }
+      // delta2 = delta * 3;
+    }
+    guess = result;
+    result -= delta;
+    // do we actually need this?
+    if (result < min_guess) {
+      double diff = ((fabs(min_guess) < 1) && (fabs(result) > 1) && ((DBL_MAX / fabs(result)) < fabs(min_guess)))? 1000 : (result / min_guess);
+      if (fabs(diff) < 1) {
+	diff = 1 / diff;
+      }
+      if ((!out_of_bounds_sentry) && (diff > 0) && (diff < 3)) {
+	delta = 0.99 * (guess - min_guess);
+	result = guess - delta;
+	out_of_bounds_sentry = 1;
+      } else {
+	delta = (guess - min_guess) * 0.5;
+	result = guess - delta;
+	if ((result == min_guess) || (result == max_guess)) {
+	  break;
+	}
+      }
+    } else if (result > max_guess) {
+      double diff = ((fabs(max_guess) < 1) && (fabs(result) > 1) && ((DBL_MAX / fabs(result)) < fabs(max_guess)))? 1000 : (result / max_guess);
+      if (fabs(diff) < 1) {
+	diff = 1 / diff;
+      }
+      if ((!out_of_bounds_sentry) && (diff > 0) && (diff < 3)) {
+	delta = 0.99 * (guess - max_guess);
+	result = guess - delta;
+	out_of_bounds_sentry = 1;
+      } else {
+	delta = (guess - max_guess) * 0.5;
+	result = guess - delta;
+	if ((result == min_guess) || (result == max_guess)) {
+	  break;
+	}
+      }
+    }
+    if (delta > 0) {
+      max_guess = guess;
+    } else {
+      min_guess = guess;
+    }
+  } while (fabs(result * factor) < fabs(delta));
+  return result;
+}
+
+double inverse_chiprob(double pval, uint32_t df) {
+  // only need this to handle df=1, 2, 4 for now
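+  // sanity-check value: inverse_chiprob(0.05, 2) follows the df == 2 closed
+  // form in find_inverse_gamma2(), returning -log(0.05) * 2 ~= 5.991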
+  return gamma_p_inv_imp2(df, pval) * 2;
+}
+
+// ***** end thread-safe inverse_chiprob *****
+
+
+// ***** thread-safe cdft *****
+
+// see Numerical Recipes, section 6.4
+double betacf_slow(double aa, double bb, double xx) {
+  double qab = aa + bb;
+  double qap = aa + 1.0;
+  double qam = aa - 1.0;
+  double cc = 1.0;
+  double dd = 1.0 - qab * xx / qap;
+  if (fabs(dd) < kLentzFpmin) {
+    dd = kLentzFpmin;
+  }
+  dd = 1.0 / dd;
+  double hh = dd;
+  // evaluate 1 / (1 + d_1 / (1 + d_2 / (1 + d_3 / (...))))
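+  // This is the incomplete beta continued fraction (Numerical Recipes,
+  // section 6.4), with even and odd coefficients
+  //   d_{2m}   = m*(b-m)*x / ((a+2m-1)*(a+2m))
+  //   d_{2m+1} = -(a+m)*(a+b+m)*x / ((a+2m)*(a+2m+1))
+  // evaluated with the same modified Lentz scheme used earlier in this file.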
+  for (double mm = 1.0; mm <= 100.0; mm += 1.0) {
+    double m2 = 2 * mm;
+
+    // d_{2m}
+    double tmp_aa = mm * (bb - mm) * xx / ((qam + m2) * (aa + m2));
+
+    dd = 1.0 + tmp_aa * dd;
+    if (fabs(dd) < kLentzFpmin) {
+      dd = kLentzFpmin;
+    }
+    cc = 1.0 + tmp_aa / cc;
+    if (fabs(cc) < kLentzFpmin) {
+      cc = kLentzFpmin;
+    }
+    dd = 1.0 / dd;
+    hh *= dd * cc;
+
+    // d_{2m+1}
+    tmp_aa = -(aa + mm) * (qab + mm) * xx / ((aa + m2) * (qap + m2));
+
+    dd = 1.0 + tmp_aa * dd;
+    if (fabs(dd) < kLentzFpmin) {
+      dd = kLentzFpmin;
+    }
+    cc = 1.0 + tmp_aa / cc;
+    if (fabs(cc) < kLentzFpmin) {
+      cc = kLentzFpmin;
+    }
+    dd = 1.0 / dd;
+    double del = dd * cc;
+    hh *= del;
+    if (fabs(del - 1.0) < 3.0e-7) {
+      return hh;
+    }
+  }
+  // don't detect failure for now
+  return hh;
+}
+
+double betai_slow(double aa, double bb, double xx) {
+  if ((xx < 0.0) || (xx > 1.0)) {
+    return -9;
+  }
+  uint32_t do_invert = (xx * (aa + bb + 2.0)) >= (aa + 1.0);
+  if ((xx == 0.0) || (xx == 1.0)) {
+    return (double)((int32_t)do_invert);
+  }
+  // this is very expensive
+  double bt = exp(lgamma(aa + bb) - lgamma(aa) - lgamma(bb) + aa * log(xx) + bb * log(1.0 - xx));
+
+  if (!do_invert) {
+    return bt * betacf_slow(aa, bb, xx) / aa;
+  }
+  return 1.0 - bt * betacf_slow(bb, aa, 1.0 - xx) / bb;
+}
+
+// todo: try to adapt Boost beta_small_b_large_a_series()
+
+double calc_tprob(double tt, double df) {
+  // must be thread-safe, so dcdflib won't cut it.
+  // (this now lives in plink2_stats; todo: probably just eliminate dcdflib
+  //   entirely)
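+  // Uses the standard identity for the two-sided t-test p-value:
+  //   P(|T_df| > t) = I_{df/(df+t^2)}(df/2, 1/2)
+  // where I is the regularized incomplete beta function.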
+  if (!realnum(tt)) {
+    return -9;
+  }
+  return betai_slow(df * 0.5, 0.5, df / (df + tt * tt));
+}
+
+double calc_tprob2(double tt, double df, double cached_gamma_mult) {
+  // assumes cached_gamma_mult == exp(lgamma(df * 0.5 + 0.5) - lgamma(df * 0.5) -
+  //   lgamma(0.5))
+  //         invert_thresh = (df + 2) / (df + 5)
+  double tt_sq = tt * tt;
+  double denom_recip = 1.0 / (df + tt_sq);
+  double xx = df * denom_recip;
+  double yy = tt_sq * denom_recip;
+  if ((xx < 0.0) || (yy < 0.0)) {
+    return -9;
+  }
+  uint32_t do_invert = (xx * (df + 5.0)) >= (df + 2.0);
+  if ((xx == 0.0) || (yy == 0.0)) {
+    return (double)((int32_t)do_invert);
+  }
+  double aa = df * 0.5;
+  double bt = cached_gamma_mult * pow(xx, aa) * sqrt(yy);
+  if (!do_invert) {
+    return bt * betacf_slow(aa, 0.5, xx) / aa;
+  }
+  return 1.0 - bt * 2 * betacf_slow(0.5, aa, yy);
+}
+// ***** end thread-safe cdft *****
+
+
+// Inverse normal distribution
+// (todo: check if boost implementation is better)
+
+// Lower tail quantile for standard normal distribution function.
+//
+// This function returns an approximation of the inverse cumulative
+// standard normal distribution function.  I.e., given P, it returns
+// an approximation to the X satisfying P = Pr{Z <= X} where Z is a
+// random variable from the standard normal distribution.
+//
+// The algorithm uses a minimax approximation by rational functions
+// and the result has a relative error whose absolute value is less
+// than 1.15e-9.
+//
+// Author:      Peter J. Acklam
+// Time-stamp:  2002-06-09 18:45:44 +0200
+// E-mail:      jacklam@math.uio.no
+// WWW URL:     http://www.math.uio.no/~jacklam
+//
+// C implementation adapted from Peter's Perl version
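+//
+// Sanity-check value: ltqnorm(0.975) ~= 1.95996, the familiar two-sided 5%
+// normal critical value.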
+
+// Coefficients in rational approximations.
+
+static const double kIvnA[] =
+  {
+    -3.969683028665376e+01,
+    2.209460984245205e+02,
+    -2.759285104469687e+02,
+    1.383577518672690e+02,
+    -3.066479806614716e+01,
+     2.506628277459239e+00
+  };
+
+static const double kIvnB[] =
+  {
+    -5.447609879822406e+01,
+    1.615858368580409e+02,
+    -1.556989798598866e+02,
+    6.680131188771972e+01,
+    -1.328068155288572e+01
+  };
+
+static const double kIvnC[] =
+  {
+    -7.784894002430293e-03,
+    -3.223964580411365e-01,
+    -2.400758277161838e+00,
+    -2.549732539343734e+00,
+    4.374664141464968e+00,
+     2.938163982698783e+00
+  };
+
+static const double kIvnD[] =
+  {
+    7.784695709041462e-03,
+    3.224671290700398e-01,
+    2.445134137142996e+00,
+    3.754408661907416e+00
+  };
+
+static const double kIvnLow = 0.02425;
+static const double kIvnHigh = 0.97575;
+
+double ltqnorm(double p) {
+  // assumes 0 < p < 1
+  double q, r;
+
+  if (p < kIvnLow) {
+    // Rational approximation for lower region
+    q = sqrt(-2*log(p));
+    return (((((kIvnC[0]*q+kIvnC[1])*q+kIvnC[2])*q+kIvnC[3])*q+kIvnC[4])*q+kIvnC[5]) /
+      ((((kIvnD[0]*q+kIvnD[1])*q+kIvnD[2])*q+kIvnD[3])*q+1);
+  }
+  if (p > kIvnHigh) {
+    // Rational approximation for upper region
+    q  = sqrt(-2*log(1-p));
+    return -(((((kIvnC[0]*q+kIvnC[1])*q+kIvnC[2])*q+kIvnC[3])*q+kIvnC[4])*q+kIvnC[5]) /
+      ((((kIvnD[0]*q+kIvnD[1])*q+kIvnD[2])*q+kIvnD[3])*q+1);
+  }
+  // Rational approximation for central region
+  q = p - 0.5;
+  r = q*q;
+  return (((((kIvnA[0]*r+kIvnA[1])*r+kIvnA[2])*r+kIvnA[3])*r+kIvnA[4])*r+kIvnA[5])*q /
+    (((((kIvnB[0]*r+kIvnB[1])*r+kIvnB[2])*r+kIvnB[3])*r+kIvnB[4])*r+1);
+}
+
+
+// SNPHWE2() and SNPHWEX() are now licensed as GPL 2+.
+double SNPHWE2(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, uint32_t midp) {
+  // This function implements an exact SNP test of Hardy-Weinberg
+  // Equilibrium as described in Wigginton, JE, Cutler, DJ, and
+  // Abecasis, GR (2005) A Note on Exact Tests of Hardy-Weinberg
+  // Equilibrium. American Journal of Human Genetics. 76: 887 - 893.
+  //
+  // The original version was written by Jan Wigginton.
+  //
+  // This version was written by Christopher Chang.  It contains the following
+  // improvements over the original SNPHWE():
+  // - Proper handling of >64k genotypes.  Previously, there was a potential
+  //   integer overflow.
+  // - Detection and efficient handling of floating point overflow and
+  //   underflow.  E.g. instead of summing a tail all the way down, the loop
+  //   stops once the latest increment underflows the partial sum's 53-bit
+  //   precision; this results in a large speedup when max heterozygote count
+  //   >1k.
+  // - No malloc() call.  It's only necessary to keep track of a few partial
+  //   sums.
+  // - Support for the mid-p variant of this test.  See Graffelman J, Moreno V
+  //   (2013) The mid p-value in exact tests for Hardy-Weinberg equilibrium.
+  //
+  // Note that the SNPHWE_t() function below is a lot more efficient for
+  // testing against a p-value inclusion threshold.  SNPHWE2() should only be
+  // used if you need the actual p-value.
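+  //
+  // Implementation note: with the allele counts fixed, successive het counts
+  // obey
+  //   P(nhets = n - 2) = P(nhets = n) * n * (n - 1) / (4 * (homr+1) * (homc+1))
+  // so the loops below walk outward from the observed table in both
+  // directions, adding each table's relative likelihood to tailp (tables no
+  // more likely than the observed one) or centerp (the rest).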
+  intptr_t obs_homc;
+  intptr_t obs_homr;
+  if (obs_hom1 < obs_hom2) {
+    obs_homc = obs_hom2;
+    obs_homr = obs_hom1;
+  } else {
+    obs_homc = obs_hom1;
+    obs_homr = obs_hom2;
+  }
+  const int64_t rare_copies = 2LL * obs_homr + obs_hets;
+  const int64_t genotypes2 = (obs_hets + obs_homc + obs_homr) * 2LL;
+  if (!genotypes2) {
+    if (midp) {
+      return 0.5;
+    }
+    return 1;
+  }
+  int32_t tie_ct = 1;
+  double curr_hets_t2 = obs_hets;
+  double curr_homr_t2 = obs_homr;
+  double curr_homc_t2 = obs_homc;
+  double tailp = (1 - kSmallEpsilon) * kExactTestBias;
+  double centerp = 0;
+  double lastp2 = tailp;
+  double lastp1 = tailp;
+
+  if (obs_hets * genotypes2 > rare_copies * (genotypes2 - rare_copies)) {
+    // tail 1 = upper
+    while (curr_hets_t2 > 1.5) {
+      // het_probs[curr_hets] = 1
+      // het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0)
+      curr_homr_t2 += 1;
+      curr_homc_t2 += 1;
+      lastp2 *= (curr_hets_t2 * (curr_hets_t2 - 1)) / (4 * curr_homr_t2 * curr_homc_t2);
+      curr_hets_t2 -= 2;
+      if (lastp2 < kExactTestBias) {
+	tie_ct += (lastp2 > (1 - 2 * kSmallEpsilon) * kExactTestBias);
+	tailp += lastp2;
+	break;
+      }
+      centerp += lastp2;
+      // doesn't seem to make a difference, but seems best to minimize use of
+      // INFINITY
+      if (centerp > DBL_MAX) {
+	return 0;
+      }
+    }
+    if ((centerp == 0) && (!midp)) {
+      return 1;
+    }
+    while (curr_hets_t2 > 1.5) {
+      curr_homr_t2 += 1;
+      curr_homc_t2 += 1;
+      lastp2 *= (curr_hets_t2 * (curr_hets_t2 - 1)) / (4 * curr_homr_t2 * curr_homc_t2);
+      curr_hets_t2 -= 2;
+      const double preaddp = tailp;
+      tailp += lastp2;
+      if (tailp <= preaddp) {
+	break;
+      }
+    }
+    double curr_hets_t1 = obs_hets + 2;
+    double curr_homr_t1 = obs_homr;
+    double curr_homc_t1 = obs_homc;
+    while (curr_homr_t1 > 0.5) {
+      // het_probs[curr_hets + 2] = het_probs[curr_hets] * 4 * curr_homr * curr_homc / ((curr_hets + 2) * (curr_hets + 1))
+      lastp1 *= (4 * curr_homr_t1 * curr_homc_t1) / (curr_hets_t1 * (curr_hets_t1 - 1));
+      const double preaddp = tailp;
+      tailp += lastp1;
+      if (tailp <= preaddp) {
+	break;
+      }
+      curr_hets_t1 += 2;
+      curr_homr_t1 -= 1;
+      curr_homc_t1 -= 1;
+    }
+  } else {
+    // tail 1 = lower
+    while (curr_homr_t2 > 0.5) {
+      curr_hets_t2 += 2;
+      lastp2 *= (4 * curr_homr_t2 * curr_homc_t2) / (curr_hets_t2 * (curr_hets_t2 - 1));
+      curr_homr_t2 -= 1;
+      curr_homc_t2 -= 1;
+      if (lastp2 < kExactTestBias) {
+	tie_ct += (lastp2 > (1 - 2 * kSmallEpsilon) * kExactTestBias);
+	tailp += lastp2;
+	break;
+      }
+      centerp += lastp2;
+      if (centerp > DBL_MAX) {
+	return 0;
+      }
+    }
+    if ((centerp == 0) && (!midp)) {
+      return 1;
+    }
+    while (curr_homr_t2 > 0.5) {
+      curr_hets_t2 += 2;
+      lastp2 *= (4 * curr_homr_t2 * curr_homc_t2) / (curr_hets_t2 * (curr_hets_t2 - 1));
+      curr_homr_t2 -= 1;
+      curr_homc_t2 -= 1;
+      const double preaddp = tailp;
+      tailp += lastp2;
+      if (tailp <= preaddp) {
+	break;
+      }
+    }
+    double curr_hets_t1 = obs_hets;
+    double curr_homr_t1 = obs_homr;
+    double curr_homc_t1 = obs_homc;
+    while (curr_hets_t1 > 1.5) {
+      curr_homr_t1 += 1;
+      curr_homc_t1 += 1;
+      lastp1 *= (curr_hets_t1 * (curr_hets_t1 - 1)) / (4 * curr_homr_t1 * curr_homc_t1);
+      const double preaddp = tailp;
+      tailp += lastp1;
+      if (tailp <= preaddp) {
+	break;
+      }
+      curr_hets_t1 -= 2;
+    }
+  }
+  if (!midp) {
+    return tailp / (tailp + centerp);
+  }
+  return (tailp - ((1 - kSmallEpsilon) * kExactTestBias * 0.5) * tie_ct) / (tailp + centerp);
+}
+
+uint32_t SNPHWE_t(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, double thresh) {
+  // Threshold-test-only version of SNPHWE2() which is usually able to exit
+  // from the calculation earlier.  Returns 0 if these counts are close enough
+  // to Hardy-Weinberg equilibrium, 1 otherwise.
+  //
+  // Suppose, for definiteness, that the number of observed hets is no less
+  // than expectation.  (Same ideas apply for the other case.)  We proceed as
+  // follows:
+  // - Sum the *relative* likelihoods of more likely smaller het counts.
+  // - Determine the minimum tail mass to pass the threshold.
+  // - The majority of the time, the tail boundary elements are enough to pass
+  //   the threshold; we never need to sum the remainder of the tails.
+  // - And in the case of disequilibrium, we will often be able to immediately
+  //   determine that the tail sum cannot possibly pass the threshold, just by
+  //   looking at the tail boundary elements and using a geometric series to
+  //   upper-bound the tail sums.
+  // - Only when neither of these conditions holds do we start traveling down
+  //   the tails.
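+  //
+  // Relative likelihoods are anchored at kExactTestBias (a small power-of-two
+  // constant) rather than 1, leaving headroom for the rounding-error-tolerant
+  // tie detection; the geometric bound mentioned above shows up below as
+  // tail1_ceil/tail2_ceil = c / (1 - r) for boundary likelihood c and decay
+  // ratio r.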
+  intptr_t obs_homc;
+  intptr_t obs_homr;
+  if (obs_hom1 < obs_hom2) {
+    obs_homc = obs_hom2;
+    obs_homr = obs_hom1;
+  } else {
+    obs_homc = obs_hom1;
+    obs_homr = obs_hom2;
+  }
+  int64_t rare_copies = 2LL * obs_homr + obs_hets;
+  int64_t genotypes2 = (obs_hets + obs_homc + obs_homr) * 2LL;
+  double curr_hets_t2 = obs_hets; // tail 2
+  double curr_homr_t2 = obs_homr;
+  double curr_homc_t2 = obs_homc;
+
+  // Subtract epsilon from initial probability mass, so that we can compare to
+  // 1 when determining tail vs. center membership without floating point error
+  // biting us in the ass
+  double tailp1 = (1 - kSmallEpsilon) * kExactTestBias;
+  double centerp = 0;
+  double lastp2 = tailp1;
+  double tailp2 = 0;
+  double tail1_ceil;
+  double tail2_ceil;
+  double lastp1;
+  double curr_hets_t1;
+  double curr_homr_t1;
+  double curr_homc_t1;
+
+  // Initially, if center sum reaches this, the test can immediately fail.
+  // Once center is summed, this is recalculated, and when tail sum has reached
+  // this, we've passed.
+  double exit_thresh;
+  double exit_threshx;
+  double ratio;
+  double preaddp;
+  if (!genotypes2) {
+    return 0;
+  }
+
+  // Convert thresh into reverse odds ratio.
+  thresh = (1 - thresh) / thresh;
+
+  // Expected het count:
+  //   2 * rarefreq * (1 - rarefreq) * genotypes
+  // = 2 * (rare_copies / (2 * genotypes)) * (1 - rarefreq) * genotypes
+  // = rare_copies * (1 - (rare_copies / (2 * genotypes)))
+  // = (rare_copies * (2 * genotypes - rare_copies)) / (2 * genotypes)
+  // 
+  // The computational identity is
+  //   P(nhets == n) := P(nhets == n+2) * (n+2) * (n+1) /
+  //                    (4 * homr(n) * homc(n))
+  // where homr() and homc() are the number of homozygous rares/commons needed
+  // to maintain the same allele frequencies.
+  // This probability is always decreasing when proceeding away from the
+  // expected het count.
+
+  if (obs_hets * genotypes2 > rare_copies * (genotypes2 - rare_copies)) {
+    // tail 1 = upper
+    if (obs_hets < 2) {
+      return 0;
+    }
+
+    // An initial upper bound on the tail sum is useful, since it lets us
+    // report test failure before summing the entire center.  We use the
+    // trivial bound of 1 + floor(rare_copies / 2): that's the total number
+    // of possible het counts, and the relative probability for each count must
+    // be <= 1 if it's in the tail.
+    exit_thresh = (1 + (rare_copies / 2)) * (thresh * kExactTestBias);
+
+    // het_probs[curr_hets] = 1
+    // het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1) / (4 * (curr_homr + 1) * (curr_homc + 1))
+    do {
+      curr_homr_t2 += 1;
+      curr_homc_t2 += 1;
+      lastp2 *= (curr_hets_t2 * (curr_hets_t2 - 1)) / (4 * curr_homr_t2 * curr_homc_t2);
+      curr_hets_t2 -= 2;
+      if (lastp2 < kExactTestBias) {
+	tailp2 = lastp2;
+	break;
+      }
+      centerp += lastp2;
+      if (centerp > exit_thresh) {
+	return 1;
+      }
+    } while (curr_hets_t2 > 1.5);
+    exit_thresh = centerp / thresh;
+    if (tailp1 + tailp2 >= exit_thresh) {
+      return 0;
+    }
+    // c + cr + cr^2 + ... = c/(1-r), which is an upper bound for the tail sum
+    ratio = (curr_hets_t2 * (curr_hets_t2 - 1)) / (4 * (curr_homr_t2 + 1) * (curr_homc_t2 + 1));
+    tail2_ceil = tailp2 / (1 - ratio);
+    curr_hets_t1 = obs_hets + 2;
+    curr_homr_t1 = obs_homr;
+    curr_homc_t1 = obs_homc;
+    // ratio for the other tail
+    lastp1 = (4 * curr_homr_t1 * curr_homc_t1) / (curr_hets_t1 * (curr_hets_t1 - 1));
+    tail1_ceil = tailp1 / (1 - lastp1);
+    if (tail1_ceil + tail2_ceil < exit_thresh) {
+      return 1;
+    }
+    lastp1 *= tailp1;
+    tailp1 += lastp1;
+
+    if (obs_homr > 1) {
+      // het_probs[curr_hets + 2] = het_probs[curr_hets] * 4 * curr_homr * curr_homc / ((curr_hets + 2) * (curr_hets + 1))
+      exit_threshx = exit_thresh - tailp2;
+      do {
+	curr_hets_t1 += 2;
+	curr_homr_t1 -= 1;
+	curr_homc_t1 -= 1;
+	lastp1 *= (4 * curr_homr_t1 * curr_homc_t1) / (curr_hets_t1 * (curr_hets_t1 - 1));
+	preaddp = tailp1;
+	tailp1 += lastp1;
+	if (tailp1 > exit_threshx) {
+	  return 0;
+	}
+	if (tailp1 <= preaddp) {
+	  break;
+	}
+      } while (curr_homr_t1 > 1.5);
+    }
+    if (tailp1 + tail2_ceil < exit_thresh) {
+      return 1;
+    }
+    exit_threshx = exit_thresh - tailp1;
+    while (curr_hets_t2 > 1) {
+      curr_homr_t2 += 1;
+      curr_homc_t2 += 1;
+      lastp2 *= (curr_hets_t2 * (curr_hets_t2 - 1)) / (4 * curr_homr_t2 * curr_homc_t2);
+      preaddp = tailp2;
+      tailp2 += lastp2;
+      if (tailp2 >= exit_threshx) {
+	return 0;
+      }
+      if (tailp2 <= preaddp) {
+	return 1;
+      }
+      curr_hets_t2 -= 2;
+    }
+    return 1;
+  }
+  // tail 1 = lower
+  if (!obs_homr) {
+    return 0;
+  }
+  exit_thresh = (1 + (rare_copies / 2)) * (thresh * kExactTestBias);
+  do {
+    curr_hets_t2 += 2;
+    lastp2 *= (4 * curr_homr_t2 * curr_homc_t2) / (curr_hets_t2 * (curr_hets_t2 - 1));
+    curr_homr_t2 -= 1;
+    curr_homc_t2 -= 1;
+    if (lastp2 < kExactTestBias) {
+      tailp2 = lastp2;
+      break;
+    }
+    centerp += lastp2;
+    if (centerp > exit_thresh) {
+      return 1;
+    }
+  } while (curr_homr_t2 > 0.5);
+  exit_thresh = centerp / thresh;
+  if (tailp1 + tailp2 >= exit_thresh) {
+    return 0;
+  }
+  ratio = (4 * curr_homr_t2 * curr_homc_t2) / ((curr_hets_t2 + 2) * (curr_hets_t2 + 1));
+  tail2_ceil = tailp2 / (1 - ratio);
+  curr_hets_t1 = obs_hets;
+  curr_homr_t1 = obs_homr + 1;
+  curr_homc_t1 = obs_homc + 1;
+  lastp1 = (curr_hets_t1 * (curr_hets_t1 - 1)) / (4 * curr_homr_t1 * curr_homc_t1);
+  tail1_ceil = tailp1 / (1 - lastp1);
+  lastp1 *= tailp1;
+  tailp1 += lastp1;
+
+  if (tail1_ceil + tail2_ceil < exit_thresh) {
+    return 1;
+  }
+  if (obs_hets >= 4) {
+    exit_threshx = exit_thresh - tailp2;
+    do {
+      curr_hets_t1 -= 2;
+      curr_homr_t1 += 1;
+      curr_homc_t1 += 1;
+      lastp1 *= (curr_hets_t1 * (curr_hets_t1 - 1)) / (4 * curr_homr_t1 * curr_homc_t1);
+      preaddp = tailp1;
+      tailp1 += lastp1;
+      if (tailp1 > exit_threshx) {
+	return 0;
+      }
+      if (tailp1 <= preaddp) {
+	break;
+      }
+    } while (curr_hets_t1 > 3.5);
+  }
+  if (tailp1 + tail2_ceil < exit_thresh) {
+    return 1;
+  }
+  exit_threshx = exit_thresh - tailp1;
+  while (curr_homr_t2 > 0.5) {
+    curr_hets_t2 += 2;
+    lastp2 *= (4 * curr_homr_t2 * curr_homc_t2) / (curr_hets_t2 * (curr_hets_t2 - 1));
+    curr_homr_t2 -= 1;
+    curr_homc_t2 -= 1;
+    preaddp = tailp2;
+    tailp2 += lastp2;
+    if (tailp2 >= exit_threshx) {
+      return 0;
+    }
+    if (tailp2 <= preaddp) {
+      return 1;
+    }
+  }
+  return 1;
+}
+
+uint32_t SNPHWE_midp_t(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, double thresh) {
+  // Mid-p version of SNPHWE_t().  (There are enough fiddly differences that I
+  // think it's better for this to be a separate function.)  Assumes threshold
+  // is smaller than 0.5.
+  intptr_t obs_homc;
+  intptr_t obs_homr;
+  if (obs_hom1 < obs_hom2) {
+    obs_homc = obs_hom2;
+    obs_homr = obs_hom1;
+  } else {
+    obs_homc = obs_hom1;
+    obs_homr = obs_hom2;
+  }
+  int64_t rare_copies = 2LL * obs_homr + obs_hets;
+  int64_t genotypes2 = (obs_hets + obs_homc + obs_homr) * 2LL;
+  double curr_hets_t2 = obs_hets; // tail 2
+  double curr_homr_t2 = obs_homr;
+  double curr_homc_t2 = obs_homc;
+  double tailp1 = (1 - kSmallEpsilon) * kExactTestBias * 0.5;
+  double centerp = tailp1;
+  double lastp2 = (1 - kSmallEpsilon) * kExactTestBias;
+  double tailp2 = 0;
+  double tail1_ceil;
+  double tail2_ceil;
+  double lastp1;
+  double curr_hets_t1;
+  double curr_homr_t1;
+  double curr_homc_t1;
+  double exit_thresh;
+  double exit_threshx;
+  double ratio;
+  double preaddp;
+  if (!genotypes2) {
+    return 0;
+  }
+  thresh = (1 - thresh) / thresh;
+  if (obs_hets * genotypes2 > rare_copies * (genotypes2 - rare_copies)) {
+    if (obs_hets < 2) {
+      return 0;
+    }
+    exit_thresh = (1 + (rare_copies / 2)) * (thresh * kExactTestBias);
+    do {
+      curr_homr_t2 += 1;
+      curr_homc_t2 += 1;
+      lastp2 *= (curr_hets_t2 * (curr_hets_t2 - 1)) / (4 * curr_homr_t2 * curr_homc_t2);
+      curr_hets_t2 -= 2;
+      if (lastp2 < kExactTestBias) {
+	if (lastp2 > (1 - 2 * kSmallEpsilon) * kExactTestBias) {
+	  // tie with original contingency table, apply mid-p correction here
+	  // too
+          tailp2 = tailp1;
+          centerp += tailp1;
+	} else {
+	  tailp2 = lastp2;
+	}
+	break;
+      }
+      centerp += lastp2;
+      if (centerp > exit_thresh) {
+	return 1;
+      }
+    } while (curr_hets_t2 > 1.5);
+    exit_thresh = centerp / thresh;
+    if (tailp1 + tailp2 >= exit_thresh) {
+      return 0;
+    }
+    ratio = (curr_hets_t2 * (curr_hets_t2 - 1)) / (4 * (curr_homr_t2 + 1) * (curr_homc_t2 + 1));
+    // this needs to work in both the tie and no-tie cases
+    tail2_ceil = tailp2 + lastp2 * ratio / (1 - ratio);
+    curr_hets_t1 = obs_hets + 2;
+    curr_homr_t1 = obs_homr;
+    curr_homc_t1 = obs_homc;
+    lastp1 = (4 * curr_homr_t1 * curr_homc_t1) / (curr_hets_t1 * (curr_hets_t1 - 1));
+    // always a tie here
+    tail1_ceil = tailp1 * 2 / (1 - lastp1) - tailp1;
+    if (tail1_ceil + tail2_ceil < exit_thresh) {
+      return 1;
+    }
+    lastp1 *= tailp1 * 2;
+    tailp1 += lastp1;
+
+    if (obs_homr > 1) {
+      exit_threshx = exit_thresh - tailp2;
+      do {
+	curr_hets_t1 += 2;
+	curr_homr_t1 -= 1;
+	curr_homc_t1 -= 1;
+	lastp1 *= (4 * curr_homr_t1 * curr_homc_t1) / (curr_hets_t1 * (curr_hets_t1 - 1));
+	preaddp = tailp1;
+	tailp1 += lastp1;
+	if (tailp1 > exit_threshx) {
+	  return 0;
+	}
+	if (tailp1 <= preaddp) {
+	  break;
+	}
+      } while (curr_homr_t1 > 1.5);
+    }
+    if (tailp1 + tail2_ceil < exit_thresh) {
+      return 1;
+    }
+    exit_threshx = exit_thresh - tailp1;
+    while (curr_hets_t2 > 1) {
+      curr_homr_t2 += 1;
+      curr_homc_t2 += 1;
+      lastp2 *= (curr_hets_t2 * (curr_hets_t2 - 1)) / (4 * curr_homr_t2 * curr_homc_t2);
+      preaddp = tailp2;
+      tailp2 += lastp2;
+      if (tailp2 >= exit_threshx) {
+	return 0;
+      }
+      if (tailp2 <= preaddp) {
+	return 1;
+      }
+      curr_hets_t2 -= 2;
+    }
+    return 1;
+  }
+  if (!obs_homr) {
+    return 0;
+  }
+  exit_thresh = (1 + (rare_copies / 2)) * (thresh * kExactTestBias);
+  do {
+    curr_hets_t2 += 2;
+    lastp2 *= (4 * curr_homr_t2 * curr_homc_t2) / (curr_hets_t2 * (curr_hets_t2 - 1));
+    curr_homr_t2 -= 1;
+    curr_homc_t2 -= 1;
+    if (lastp2 < kExactTestBias) {
+      if (lastp2 > (1 - 2 * kSmallEpsilon) * kExactTestBias) {
+	tailp2 = tailp1;
+	centerp += tailp1;
+      } else {
+	tailp2 = lastp2;
+      }
+      break;
+    }
+    centerp += lastp2;
+    if (centerp > exit_thresh) {
+      return 1;
+    }
+  } while (curr_homr_t2 > 0.5);
+  exit_thresh = centerp / thresh;
+  if (tailp1 + tailp2 >= exit_thresh) {
+    return 0;
+  }
+  ratio = (4 * curr_homr_t2 * curr_homc_t2) / ((curr_hets_t2 + 2) * (curr_hets_t2 + 1));
+  tail2_ceil = tailp2 + lastp2 * ratio / (1 - ratio);
+  curr_hets_t1 = obs_hets;
+  curr_homr_t1 = obs_homr + 1;
+  curr_homc_t1 = obs_homc + 1;
+  lastp1 = (curr_hets_t1 * (curr_hets_t1 - 1)) / (4 * curr_homr_t1 * curr_homc_t1);
+  tail1_ceil = 2 * tailp1 / (1 - lastp1) - tailp1;
+  lastp1 *= 2 * tailp1;
+  tailp1 += lastp1;
+
+  if (tail1_ceil + tail2_ceil < exit_thresh) {
+    return 1;
+  }
+  if (obs_hets >= 4) {
+    exit_threshx = exit_thresh - tailp2;
+    do {
+      curr_hets_t1 -= 2;
+      curr_homr_t1 += 1;
+      curr_homc_t1 += 1;
+      lastp1 *= (curr_hets_t1 * (curr_hets_t1 - 1)) / (4 * curr_homr_t1 * curr_homc_t1);
+      preaddp = tailp1;
+      tailp1 += lastp1;
+      if (tailp1 > exit_threshx) {
+	return 0;
+      }
+      if (tailp1 <= preaddp) {
+	break;
+      }
+    } while (curr_hets_t1 > 3.5);
+  }
+  if (tailp1 + tail2_ceil < exit_thresh) {
+    return 1;
+  }
+  exit_threshx = exit_thresh - tailp1;
+  while (curr_homr_t2 > 0.5) {
+    curr_hets_t2 += 2;
+    lastp2 *= (4 * curr_homr_t2 * curr_homc_t2) / (curr_hets_t2 * (curr_hets_t2 - 1));
+    curr_homr_t2 -= 1;
+    curr_homc_t2 -= 1;
+    preaddp = tailp2;
+    tailp2 += lastp2;
+    if (tailp2 >= exit_threshx) {
+      return 0;
+    }
+    if (tailp2 <= preaddp) {
+      return 1;
+    }
+  }
+  return 1;
+}
+
+// 2^{-40} for now, since 2^{-44} was too small on real data
+static const double kExactTestEpsilon2 = 0.0000000000009094947017729282379150390625;
+
+double fisher22(uint32_t m11, uint32_t m12, uint32_t m21, uint32_t m22, uint32_t midp) {
+  // Basic 2x2 Fisher exact test p-value calculation.
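+  // With all margins fixed, moving one unit along the table diagonal scales
+  // the hypergeometric likelihood by (m12 * m21) / ((m11 + 1) * (m22 + 1));
+  // the loops below use this recurrence to sum tail and center relative
+  // likelihoods, mirroring the SNPHWE2() structure.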
+  double tprob = (1 - kExactTestEpsilon2) * kExactTestBias;
+  double cur_prob = tprob;
+  double cprob = 0;
+  int32_t tie_ct = 1;
+  uint32_t uii;
+  double cur11;
+  double cur12;
+  double cur21;
+  double cur22;
+  double preaddp;
+  // Ensure we are left of the distribution center, m11 <= m22, and m12 <= m21.
+  if (m12 > m21) {
+    uii = m12;
+    m12 = m21;
+    m21 = uii;
+  }
+  if (m11 > m22) {
+    uii = m11;
+    m11 = m22;
+    m22 = uii;
+  }
+  if ((((uint64_t)m11) * m22) > (((uint64_t)m12) * m21)) {
+    uii = m11;
+    m11 = m12;
+    m12 = uii;
+    uii = m21;
+    m21 = m22;
+    m22 = uii;
+  }
+  cur11 = m11;
+  cur12 = m12;
+  cur21 = m21;
+  cur22 = m22;
+  while (cur12 > 0.5) {
+    cur11 += 1;
+    cur22 += 1;
+    cur_prob *= (cur12 * cur21) / (cur11 * cur22);
+    cur12 -= 1;
+    cur21 -= 1;
+    if (cur_prob > DBL_MAX) {
+      return 0;
+    }
+    if (cur_prob < kExactTestBias) {
+      if (cur_prob > (1 - 2 * kExactTestEpsilon2) * kExactTestBias) {
+        tie_ct++;
+      }
+      tprob += cur_prob;
+      break;
+    }
+    cprob += cur_prob;
+  }
+  if ((cprob == 0) && (!midp)) {
+    return 1;
+  }
+  while (cur12 > 0.5) {
+    cur11 += 1;
+    cur22 += 1;
+    cur_prob *= (cur12 * cur21) / (cur11 * cur22);
+    cur12 -= 1;
+    cur21 -= 1;
+    preaddp = tprob;
+    tprob += cur_prob;
+    if (tprob <= preaddp) {
+      break;
+    }
+  }
+  if (m11) {
+    cur11 = m11;
+    cur12 = m12;
+    cur21 = m21;
+    cur22 = m22;
+    cur_prob = (1 - kExactTestEpsilon2) * kExactTestBias;
+    do {
+      cur12 += 1;
+      cur21 += 1;
+      cur_prob *= (cur11 * cur22) / (cur12 * cur21);
+      cur11 -= 1;
+      cur22 -= 1;
+      preaddp = tprob;
+      tprob += cur_prob;
+      if (tprob <= preaddp) {
+        if (!midp) {
+	  return preaddp / (cprob + preaddp);
+	}
+        return (preaddp - ((1 - kExactTestEpsilon2) * kExactTestBias * 0.5) * tie_ct) / (cprob + preaddp);
+      }
+    } while (cur11 > 0.5);
+  }
+  if (!midp) {
+    return tprob / (cprob + tprob);
+  }
+  return (tprob - ((1 - kExactTestEpsilon2) * kExactTestBias * 0.5) * tie_ct) / (cprob + tprob);
+}
+
+int32_t SNPHWEX_tailsum(uint32_t high_het_side, double* base_probp, double* saved_hetsp, double* saved_hom1p, double* saved_hom2p, uint32_t* tie_ctp, double* totalp) {
+  // similar to fisher23_tailsum()
+  double total = 0;
+  double cur_prob = *base_probp;
+  double tmp_hets = *saved_hetsp;
+  double tmp_hom1 = *saved_hom1p;
+  double tmp_hom2 = *saved_hom2p;
+  double tmps_hets;
+  double tmps_hom1;
+  double tmps_hom2;
+  // identify beginning of tail
+  if (high_het_side) {
+    if (cur_prob > kExactTestBias) {
+      double prev_prob = tmp_hom1 * tmp_hom2;
+      while (prev_prob > 0.5) {
+	tmp_hets += 2;
+	cur_prob *= (4 * prev_prob) / (tmp_hets * (tmp_hets - 1));
+	tmp_hom1 -= 1;
+	tmp_hom2 -= 1;
+	if (cur_prob <= kExactTestBias) {
+	  break;
+	}
+	prev_prob = tmp_hom1 * tmp_hom2;
+      }
+      *base_probp = cur_prob;
+      tmps_hets = tmp_hets;
+      tmps_hom1 = tmp_hom1;
+      tmps_hom2 = tmp_hom2;
+    } else {
+      tmps_hets = tmp_hets;
+      tmps_hom1 = tmp_hom1;
+      tmps_hom2 = tmp_hom2;
+      while (1) {
+	const double prev_prob = cur_prob;
+	tmp_hom1 += 1;
+	tmp_hom2 += 1;
+	cur_prob *= (tmp_hets * (tmp_hets - 1)) / (4 * tmp_hom1 * tmp_hom2);
+	if (cur_prob < prev_prob) {
+	  // this should never happen, but better to play it safe re: rounding
+	  // error
+	  return 1;
+	}
+	tmp_hets -= 2;
+	if (cur_prob > (1 - 2 * kExactTestEpsilon2) * kExactTestBias) {
+	  // throw in extra (1 - kSmallEpsilon) multiplier to prevent rounding
+	  // errors from causing this to keep going when the left-side test
+	  // stopped
+	  if (cur_prob > (1 - kSmallEpsilon) * kExactTestBias) {
+	    break;
+	  }
+          *tie_ctp += 1;
+	}
+	total += cur_prob;
+      }
+      const double prev_prob = cur_prob;
+      cur_prob = *base_probp;
+      *base_probp = prev_prob;
+    }
+  } else {
+    if (cur_prob > kExactTestBias) {
+      while (tmp_hets > 1.5) {
+	tmp_hom1 += 1;
+	tmp_hom2 += 1;
+	cur_prob *= (tmp_hets * (tmp_hets - 1)) / (4 * tmp_hom1 * tmp_hom2);
+	tmp_hets -= 2;
+	if (cur_prob <= kExactTestBias) {
+	  break;
+	}
+      }
+      *base_probp = cur_prob;
+      tmps_hets = tmp_hets;
+      tmps_hom1 = tmp_hom1;
+      tmps_hom2 = tmp_hom2;
+    } else {
+      tmps_hets = tmp_hets;
+      tmps_hom1 = tmp_hom1;
+      tmps_hom2 = tmp_hom2;
+      while (1) {
+	const double prev_prob = cur_prob;
+	tmp_hets += 2;
+	cur_prob *= (4 * tmp_hom1 * tmp_hom2) / (tmp_hets * (tmp_hets - 1));
+	if (cur_prob < prev_prob) {
+	  return 1;
+	}
+	tmp_hom1 -= 1;
+	tmp_hom2 -= 1;
+	if (cur_prob > (1 - 2 * kExactTestEpsilon2) * kExactTestBias) {
+	  if (cur_prob > kExactTestBias) {
+	    break;
+	  }
+          *tie_ctp += 1;
+	}
+	total += cur_prob;
+      }
+      const double prev_prob = cur_prob;
+      cur_prob = *base_probp;
+      *base_probp = prev_prob;
+    }
+  }
+  *saved_hetsp = tmp_hets;
+  *saved_hom1p = tmp_hom1;
+  *saved_hom2p = tmp_hom2;
+  if (cur_prob > (1 - 2 * kExactTestEpsilon2) * kExactTestBias) {
+    if (cur_prob > kExactTestBias) {
+      // even most extreme table on this side is too probable
+      *totalp = 0;
+      return 0;
+    }
+    *tie_ctp += 1;
+  }
+  // sum tail to floating point precision limit
+  if (high_het_side) {
+    while (1) {
+      const double prev_tot = total;
+      total += cur_prob;
+      if (total <= prev_tot) {
+	break;
+      }
+      tmps_hets += 2;
+      cur_prob *= (4 * tmps_hom1 * tmps_hom2) / (tmps_hets * (tmps_hets - 1));
+      tmps_hom1 -= 1;
+      tmps_hom2 -= 1;
+    }
+  } else {
+    while (1) {
+      const double prev_tot = total;
+      total += cur_prob;
+      if (total <= prev_tot) {
+	break;
+      }
+      tmps_hom1 += 1;
+      tmps_hom2 += 1;
+      cur_prob *= (tmps_hets * (tmps_hets - 1)) / (4 * tmps_hom1 * tmps_hom2);
+      tmps_hets -= 2;
+    }
+  }
+  *totalp = total;
+  return 0;
+}
+
+double SNPHWEX(int32_t female_hets, int32_t female_hom1, int32_t female_hom2, int32_t male1, int32_t male2, uint32_t midp) {
+  // See Graffelman J, Weir BS (2016) Testing for Hardy-Weinberg equilibrium at
+  // biallelic genetic markers on the X chromosome.
+  // Evaluation strategy is similar to fisher23().
+  if ((!male1) && (!male2)) {
+    return SNPHWE2(female_hets, female_hom1, female_hom2, midp);
+  }
+  double cur_prob = (1 - kExactTestEpsilon2) * kExactTestBias;
+  double tailp = cur_prob;
+  double centerp = 0;
+  uint32_t tie_ct = 1;
+  // 1. Determine relative tail vs. center masses for the male1/male2-unchanged
+  //    slice.
+  double cur_female_hetd = (double)female_hets;
+  double cur_female_hom1d = (double)female_hom1;
+  double cur_female_hom2d = (double)female_hom2;
+  double n1 = cur_female_hetd + 2 * cur_female_hom1d;
+  double n2 = cur_female_hetd + 2 * cur_female_hom2d;
+  double tmp_hets = cur_female_hetd;
+  // "left" = low hets side, "right" = high hets side
+  double orig_base_probl;
+  double orig_base_probr;
+  double orig_saved_lhets;
+  double orig_saved_lhom1;
+  double orig_saved_lhom2;
+  double orig_saved_rhets;
+  double orig_saved_rhom1;
+  double orig_saved_rhom2;
+  if (cur_female_hetd * (n1 + n2) > n1 * n2) {
+    // current het count is greater than expected 2f(1-f), so we're on the
+    // "right" side
+    orig_base_probr = cur_prob;
+    orig_saved_rhets = cur_female_hetd;
+    orig_saved_rhom1 = cur_female_hom1d;
+    orig_saved_rhom2 = cur_female_hom2d;
+
+    // scan leftwards
+    double tmp_hom1 = cur_female_hom1d;
+    double tmp_hom2 = cur_female_hom2d;
+    while (tmp_hets > 1.5) {
+      tmp_hom1 += 1;
+      tmp_hom2 += 1;
+      cur_prob *= (tmp_hets * (tmp_hets - 1)) / (4 * tmp_hom1 * tmp_hom2);
+      tmp_hets -= 2;
+      if (cur_prob < kExactTestBias) {
+	tie_ct += (cur_prob > (1 - 2 * kExactTestEpsilon2) * kExactTestBias);
+	tailp += cur_prob;
+	break;
+      }
+      centerp += cur_prob;
+      if (centerp > DBL_MAX) {
+	return 0;
+      }
+    }
+    orig_saved_lhets = tmp_hets;
+    orig_saved_lhom1 = tmp_hom1;
+    orig_saved_lhom2 = tmp_hom2;
+    orig_base_probl = cur_prob;
+    while (tmp_hets > 1.5) {
+      tmp_hom1 += 1;
+      tmp_hom2 += 1;
+      cur_prob *= (tmp_hets * (tmp_hets - 1)) / (4 * tmp_hom1 * tmp_hom2);
+      tmp_hets -= 2;
+      const double preaddp = tailp;
+      tailp += cur_prob;
+      if (tailp <= preaddp) {
+	break;
+      }
+    }
+    tmp_hets = cur_female_hetd;
+    tmp_hom1 = cur_female_hom1d;
+    tmp_hom2 = cur_female_hom2d;
+    cur_prob = orig_base_probr;
+    while (1) {
+      tmp_hets += 2;
+      cur_prob *= (4 * tmp_hom1 * tmp_hom2) / (tmp_hets * (tmp_hets - 1));
+      const double preaddp = tailp;
+      tailp += cur_prob;
+      if (tailp <= preaddp) {
+	break;
+      }
+      tmp_hom1 -= 1;
+      tmp_hom2 -= 1;
+    }
+  } else {
+    // on the "left" side
+    orig_base_probl = cur_prob;
+    orig_saved_lhets = cur_female_hetd;
+    orig_saved_lhom1 = cur_female_hom1d;
+    orig_saved_lhom2 = cur_female_hom2d;
+
+    // scan rightwards
+    double tmp_hom1 = cur_female_hom1d;
+    double tmp_hom2 = cur_female_hom2d;
+    double quarter_numer;
+    while (1) {
+      quarter_numer = tmp_hom1 * tmp_hom2;
+      if (quarter_numer <= 0.5) {
+	break;
+      }
+      tmp_hets += 2;
+      cur_prob *= (4 * quarter_numer) / (tmp_hets * (tmp_hets - 1));
+      tmp_hom1 -= 1;
+      tmp_hom2 -= 1;
+      if (cur_prob < kExactTestBias) {
+	tie_ct += (cur_prob > (1 - 2 * kExactTestEpsilon2) * kExactTestBias);
+	tailp += cur_prob;
+	quarter_numer = tmp_hom1 * tmp_hom2;
+	break;
+      }
+      centerp += cur_prob;
+      if (centerp > DBL_MAX) {
+	return 0;
+      }
+    }
+    orig_saved_rhets = tmp_hets;
+    orig_saved_rhom1 = tmp_hom1;
+    orig_saved_rhom2 = tmp_hom2;
+    orig_base_probr = cur_prob;
+    while (quarter_numer > 0.5) {
+      tmp_hets += 2;
+      cur_prob *= (4 * quarter_numer) / (tmp_hets * (tmp_hets - 1));
+      tmp_hom1 -= 1;
+      tmp_hom2 -= 1;
+      const double preaddp = tailp;
+      tailp += cur_prob;
+      if (tailp <= preaddp) {
+	break;
+      }
+      quarter_numer = tmp_hom1 * tmp_hom2;
+    }
+    tmp_hets = cur_female_hetd;
+    tmp_hom1 = cur_female_hom1d;
+    tmp_hom2 = cur_female_hom2d;
+    cur_prob = orig_base_probl;
+    while (tmp_hets > 1.5) {
+      tmp_hom1 += 1;
+      tmp_hom2 += 1;
+      cur_prob *= (tmp_hets * (tmp_hets - 1)) / (4 * tmp_hom1 * tmp_hom2);
+      const double preaddp = tailp;
+      tailp += cur_prob;
+      if (tailp <= preaddp) {
+	break;
+      }
+      tmp_hets -= 2;
+    }
+  }
+  // a "row" holds male1/male2 constant.
+  const double orig_row_prob = tailp + centerp;
+  n1 += male1;
+  n2 += male2;
+  for (uint32_t male1_decreasing = 0; male1_decreasing < 2; ++male1_decreasing) {
+    double cur_male1 = male1;
+    double cur_male2 = male2;
+    double row_prob = orig_row_prob;
+    double cur_lhets = orig_saved_lhets;
+    double cur_lhom1 = orig_saved_lhom1;
+    double cur_lhom2 = orig_saved_lhom2;
+    double cur_rhets = orig_saved_rhets;
+    double cur_rhom1 = orig_saved_rhom1;
+    double cur_rhom2 = orig_saved_rhom2;
+    double base_probl = orig_base_probl;
+    double base_probr = orig_base_probr;
+    uint32_t iter_ct;
+    if (male1_decreasing) {
+      iter_ct = 2 * female_hom2 + female_hets;
+      if (iter_ct > ((uint32_t)male1)) {
+	iter_ct = male1;
+      }
+    } else {
+      iter_ct = 2 * female_hom1 + female_hets;
+      if (iter_ct > ((uint32_t)male2)) {
+	iter_ct = male2;
+      }
+    }
+    for (uint32_t iter_idx = 0; iter_idx < iter_ct; ++iter_idx) {
+      if (male1_decreasing) {
+	const double old_male1 = cur_male1;
+	const double old_female2 = n2 - cur_male2;
+	cur_male2 += 1;
+	cur_male1 -= 1;
+	// row likelihood is ((n1 choose male1) * (n2 choose male2)) /
+	//   ((n1 + n2) choose (male1 + male2))
+	row_prob *= (old_male1 * old_female2) / (cur_male2 * (n1 - cur_male1));
+	// bugfix (19 Apr 2017): We cannot move to the right of the mode here.
+	// Otherwise, if the mode itself is more probable than our initial
+	// table, but the table to the immediate right of the mode is not,
+	// we'll fail to count the mode.
+	// ("right" = high het count, "left" = low het count.)
+	if (cur_lhets) {
+	  cur_lhom1 += 1;
+	  base_probl *= (old_male1 * cur_lhets) / (2 * cur_male2 * cur_lhom1);
+	  cur_lhets -= 1;
+	} else {
+	  cur_lhets += 1;
+	  base_probl *= (2 * old_male1 * cur_lhom2) / (cur_male2 * cur_lhets);
+	  cur_lhom2 -= 1;
+	}
+      } else {
+	const double old_male2 = cur_male2;
+	const double old_female1 = n1 - cur_male1;
+	cur_male1 += 1;
+	cur_male2 -= 1;
+	row_prob *= (old_male2 * old_female1) / (cur_male1 * (n2 - cur_male2));
+	if (cur_lhets) {
+	  cur_lhom2 += 1;
+	  base_probl *= (old_male2 * cur_lhets) / (2 * cur_male1 * cur_lhom2);
+	  cur_lhets -= 1;
+	} else {
+	  cur_lhets += 1;
+	  base_probl *= (2 * old_male2 * cur_lhom1) / (cur_male1 * cur_lhets);
+	  cur_lhom1 -= 1;
+	}
+      }
+      double tail_incr1;
+      if (SNPHWEX_tailsum(0, &base_probl, &cur_lhets, &cur_lhom1, &cur_lhom2, &tie_ct, &tail_incr1)) {
+	// all tables in this row, and all subsequent rows, are less probable
+	// than the initial table.
+	double cur_female1 = n1 - cur_male1;
+	double cur_female2 = n2 - cur_male2;
+	if (male1_decreasing) {
+	  while (1) {
+	    const double preaddp = tailp;
+	    tailp += row_prob;
+	    if (tailp == preaddp) {
+	      break;
+	    }
+	    cur_male2 += 1;
+	    cur_female1 += 1;
+	    row_prob *= (cur_male1 * cur_female2) / (cur_male2 * cur_female1);
+	    cur_male1 -= 1;
+	    cur_female2 -= 1;
+	  }
+	} else {
+	  while (1) {
+	    const double preaddp = tailp;
+	    tailp += row_prob;
+	    if (tailp == preaddp) {
+	      break;
+	    }
+	    cur_male1 += 1;
+	    cur_female2 += 1;
+	    row_prob *= (cur_male2 * cur_female1) / (cur_male1 * cur_female2);
+	    cur_male2 -= 1;
+	    cur_female1 -= 1;
+	  }
+	}
+	break;
+      }
+      tailp += tail_incr1;
+      if (male1_decreasing) {
+	const double old_male1 = cur_male1 + 1;
+	if (cur_rhom2) {
+	  cur_rhets += 1;
+	  base_probr *= (2 * old_male1 * cur_rhom2) / (cur_male2 * cur_rhets);
+	  cur_rhom2 -= 1;
+	} else {
+	  cur_rhom1 += 1;
+	  base_probr *= (old_male1 * cur_rhets) / (2 * cur_male2 * cur_rhom1);
+	  cur_rhets -= 1;
+	}
+      } else {
+	const double old_male2 = cur_male2 + 1;
+	if (cur_rhom1) {
+	  cur_rhets += 1;
+	  base_probr *= (2 * old_male2 * cur_rhom1) / (cur_male1 * cur_rhets);
+	  cur_rhom1 -= 1;
+	} else {
+	  cur_rhom2 += 1;
+	  base_probr *= (old_male2 * cur_rhets) / (2 * cur_male1 * cur_rhom2);
+	  cur_rhets -= 1;
+	}
+      }
+      double tail_incr2 = 0.0; // zero-init just to silence a maybe-uninitialized compiler warning
+      SNPHWEX_tailsum(1, &base_probr, &cur_rhets, &cur_rhom1, &cur_rhom2, &tie_ct, &tail_incr2);
+      tailp += tail_incr2;
+      centerp += row_prob - tail_incr1 - tail_incr2;
+      if (centerp > DBL_MAX) {
+	return 0;
+      }
+    }
+  }
+  if (!midp) {
+    return tailp / (tailp + centerp);
+  }
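+  // mid-p variant: additionally subtract half of the (kExactTestBias-scaled)
+  // probability mass of the tie_ct tables exactly tied with the observed one.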
+  return (tailp - ((1 - kExactTestEpsilon2) * kExactTestBias * 0.5) * ((int32_t)tie_ct)) / (tailp + centerp);
+}
+
+boolerr_t linear_hypothesis_chisq_f(const float* coef, const float* constraints_con_major, const float* cov_matrix, uint32_t constraint_ct, uint32_t predictor_ct, uint32_t cov_stride, double* chisq_ptr, float* tmphxs_buf, float* h_transpose_buf, float* inner_buf, matrix_finvert_buf1_t* mi_buf, float* outer_buf) {
+  const float* constraints_con_major_iter = constraints_con_major;
+  for (uint32_t constraint_idx = 0; constraint_idx < constraint_ct; constraint_idx++) {
+    float cur_outer_term = 0.0;
+    const float* coef_iter = coef;
+    for (uint32_t pred_idx = 0; pred_idx < predictor_ct; ++pred_idx) {
+      cur_outer_term += (*constraints_con_major_iter++) * (*coef_iter++);
+    }
+    outer_buf[constraint_idx] = cur_outer_term;
+  }
+  // h-transpose does not have a special stride
+  transpose_copy_float(constraints_con_major, constraint_ct, predictor_ct, predictor_ct, h_transpose_buf);
+  col_major_fmatrix_multiply_strided(h_transpose_buf, cov_matrix, constraint_ct, constraint_ct, predictor_ct, cov_stride, predictor_ct, constraint_ct, tmphxs_buf);
+  // tmp[][] is now predictor-major
+  col_major_fmatrix_multiply_strided(tmphxs_buf, constraints_con_major, constraint_ct, constraint_ct, constraint_ct, predictor_ct, predictor_ct, constraint_ct, inner_buf);
+
+  // don't need H-transpose any more, so we can use h_transpose_buf for matrix
+  // inversion
+  float absdet;
+  if (invert_fmatrix_first_half(constraint_ct, constraint_ct, inner_buf, &absdet, mi_buf, h_transpose_buf)) {
+    return 1;
+  }
+  invert_fmatrix_second_half(constraint_ct, constraint_ct, inner_buf, mi_buf, h_transpose_buf);
+  double result = 0.0;
+  const float* inner_iter = inner_buf;
+  for (uint32_t constraint_idx = 0; constraint_idx < constraint_ct; ++constraint_idx) {
+    float cur_dotprod = 0.0; // tmp2[c]
+    const float* outer_iter = outer_buf;
+    for (uint32_t constraint_idx2 = 0; constraint_idx2 < constraint_ct; ++constraint_idx2) {
+      cur_dotprod += (*inner_iter++) * (*outer_iter++);
+    }
+    result += cur_dotprod * outer_buf[constraint_idx];
+  }
+  if (result < 0.0) {
+    // guard against floating point error
+    result = 0.0;
+  }
+  *chisq_ptr = result;
+  return 0;
+}
+
+boolerr_t linear_hypothesis_chisq(const double* coef, const double* constraints_con_major, const double* cov_matrix, uintptr_t constraint_ct, uintptr_t predictor_ct, double* chisq_ptr, double* tmphxs_buf, double* h_transpose_buf, double* inner_buf, matrix_invert_buf1_t* mi_buf, double* outer_buf) {
+  // See PLINK model.cpp Model::linearHypothesis().
+  //
+  // outer_buf = constraint_ct
+  // inner_buf = constraint_ct x constraint_ct
+  // tmphxs_buf and h_transpose_buf are constraint_ct x predictor_ct
+  // mi_buf only needs to be of length 2 * constraint_ct
+  //
+  // Since no PLINK function ever calls this with nonzero h[] values, this just
+  // takes a df (constraint_ct) parameter for now; it's trivial to switch to
+  // the more general interface later.
+  const double* constraints_con_major_iter = constraints_con_major;
+  for (uintptr_t constraint_idx = 0; constraint_idx < constraint_ct; constraint_idx++) {
+    double cur_outer_term = 0.0;
+    const double* coef_iter = coef;
+    for (uintptr_t pred_idx = 0; pred_idx < predictor_ct; ++pred_idx) {
+      cur_outer_term += (*constraints_con_major_iter++) * (*coef_iter++);
+    }
+    outer_buf[constraint_idx] = cur_outer_term;
+  }
+  transpose_copy(constraints_con_major, constraint_ct, predictor_ct, h_transpose_buf);
+  col_major_matrix_multiply(h_transpose_buf, cov_matrix, constraint_ct, predictor_ct, predictor_ct, tmphxs_buf);
+  // tmp[][] is now predictor-major
+  col_major_matrix_multiply(tmphxs_buf, constraints_con_major, constraint_ct, constraint_ct, predictor_ct, inner_buf);
+
+  // don't need H-transpose any more, so we can use h_transpose_buf for matrix
+  // inversion
+  if (invert_matrix((uint32_t)constraint_ct, inner_buf, mi_buf, h_transpose_buf)) {
+    return 1;
+  }
+  double result = 0.0;
+  const double* inner_iter = inner_buf;
+  for (uintptr_t constraint_idx = 0; constraint_idx < constraint_ct; ++constraint_idx) {
+    double cur_dotprod = 0.0; // tmp2[c]
+    const double* outer_iter = outer_buf;
+    for (uintptr_t constraint_idx2 = 0; constraint_idx2 < constraint_ct; ++constraint_idx2) {
+      cur_dotprod += (*inner_iter++) * (*outer_iter++);
+    }
+    result += cur_dotprod * outer_buf[constraint_idx];
+  }
+  if (result < 0.0) {
+    // guard against floating point error
+    result = 0.0;
+  }
+  *chisq_ptr = result;
+  return 0;
+}
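+
+// Illustrative sketch (editor's addition, not upstream plink2): one way a
+// caller might run a joint 2-df Wald test with linear_hypothesis_chisq(),
+// with buffer sizes taken from the comment above.  The coefficient and
+// covariance arrays are hypothetical inputs from an already-fitted model.
+#ifdef PLINK2_STATS_USAGE_SKETCH
+double example_joint_wald_pval(const double* coef, const double* cov_matrix) {
+  const uintptr_t constraint_ct = 2;  // df
+  const uintptr_t predictor_ct = 4;
+  // constraint-major H: each row tests one coefficient against zero
+  const double constraints_con_major[8] = {0.0, 1.0, 0.0, 0.0,
+                                           0.0, 0.0, 1.0, 0.0};
+  double outer_buf[2];             // constraint_ct
+  double inner_buf[4];             // constraint_ct x constraint_ct
+  double tmphxs_buf[8];            // constraint_ct x predictor_ct
+  double h_transpose_buf[8];       // constraint_ct x predictor_ct
+  matrix_invert_buf1_t mi_buf[4];  // 2 * constraint_ct
+  double chisq;
+  if (linear_hypothesis_chisq(coef, constraints_con_major, cov_matrix, constraint_ct, predictor_ct, &chisq, tmphxs_buf, h_transpose_buf, inner_buf, mi_buf, outer_buf)) {
+    return -1.0;  // H * V * H^T was singular
+  }
+  return chiprob_p(chisq, (uint32_t)constraint_ct);
+}
+#endif  // PLINK2_STATS_USAGE_SKETCH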
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/plink2_stats.h b/plink2_stats.h
new file mode 100644
index 0000000..75e6d65
--- /dev/null
+++ b/plink2_stats.h
@@ -0,0 +1,61 @@
+#ifndef __PLINK2_STATS_H__
+#define __PLINK2_STATS_H__
+
+// This library is part of PLINK 2.00, copyright (C) 2005-2017 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation, either version 3 of the License, or (at your
+// option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+#include "plink2_matrix.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+// now thread-safe!
+double chiprob_p(double chisq, uint32_t df);
+
+double inverse_chiprob(double pval, uint32_t df);
+
+double calc_tprob(double tt, double df);
+
+double calc_tprob2(double tt, double df, double cached_gamma_mult);
+
+double ltqnorm(double p);
+
+double SNPHWE2(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, uint32_t midp);
+
+// returns 0 if close enough to Hardy-Weinberg equilibrium
+uint32_t SNPHWE_t(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, double thresh);
+
+uint32_t SNPHWE_midp_t(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, double thresh);
+
+double fisher22(uint32_t m11, uint32_t m12, uint32_t m21, uint32_t m22, uint32_t midp);
+
+double SNPHWEX(int32_t female_hets, int32_t female_hom1, int32_t female_hom2, int32_t male1, int32_t male2, uint32_t midp);
+
+// outer_buf = constraint_ct
+// inner_buf = constraint_ct x constraint_ct
+// tmphxs_buf and h_transpose_buf are constraint_ct x predictor_ct
+// mi_buf only needs to be of length 2 * constraint_ct
+boolerr_t linear_hypothesis_chisq_f(const float* coef, const float* constraints_con_major, const float* cov_matrix, uint32_t constraint_ct, uint32_t predictor_ct, uint32_t cov_stride, double* chisq_ptr, float* tmphxs_buf, float* h_transpose_buf, float* inner_buf, matrix_finvert_buf1_t* mi_buf, float* outer_buf);
+
+boolerr_t linear_hypothesis_chisq(const double* coef, const double* constraints_con_major, const double* cov_matrix, uintptr_t constraint_ct, uintptr_t predictor_ct, double* chisq_ptr, double* tmphxs_buf, double* h_transpose_buf, double* inner_buf, matrix_invert_buf1_t* mi_buf, double* outer_buf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __PLINK2_STATS_H__
diff --git a/zstd/lib/common/bitstream.h b/zstd/lib/common/bitstream.h
new file mode 100644
index 0000000..d387300
--- /dev/null
+++ b/zstd/lib/common/bitstream.h
@@ -0,0 +1,417 @@
+/* ******************************************************************
+   bitstream
+   Part of FSE library
+   header file (to include)
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*
+*  This API consists of small unitary functions, which must be inlined for best performance.
+*  Since link-time-optimization is not available for all compilers,
+*  these functions are defined in a .h to be included.
+*/
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include "mem.h"            /* unaligned access routines */
+#include "error_private.h"  /* error codes and messages */
+
+
+/*=========================================
+*  Target specific
+=========================================*/
+#if defined(__BMI__) && defined(__GNUC__)
+#  include <immintrin.h>   /* support for bextr (experimental) */
+#endif
+
+#define STREAM_ACCUMULATOR_MIN_32  25
+#define STREAM_ACCUMULATOR_MIN_64  57
+#define STREAM_ACCUMULATOR_MIN    ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
+
+/*-******************************************
+*  bitStream encoding API (write forward)
+********************************************/
+/* bitStream can mix input from multiple sources.
+*  A critical property of these streams is that they encode and decode in **reverse** direction.
+*  So the first bit sequence you add will be the last to be read, like a LIFO stack.
+*/
+typedef struct
+{
+    size_t bitContainer;
+    int    bitPos;
+    char*  startPtr;
+    char*  ptr;
+    char*  endPtr;
+} BIT_CStream_t;
+
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity);
+MEM_STATIC void   BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+MEM_STATIC void   BIT_flushBits(BIT_CStream_t* bitC);
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
+
+/* Start with initCStream, providing the size of buffer to write into.
+*  bitStream will never write outside of this buffer.
+*  `dstCapacity` must be > sizeof(void*), otherwise @return will be an error code.
+*
+*  bits are first added to a local register.
+*  Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
+*  Writing data into memory is an explicit operation, performed by the flushBits function.
+*  Hence, keep track of how many bits are potentially stored in the local register to avoid overflowing it.
+*  After a flushBits, a maximum of 7 bits might still be stored into local register.
+*
+*  Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
+*
+*  Last operation is to close the bitStream.
+*  The function returns the final size of CStream in bytes.
+*  If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable)
+*/
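+
+/* Illustrative sketch (editor's addition, not upstream zstd) of the write
+*  protocol above.  The 13-bit field width is an arbitrary example; it stays
+*  under the 24-bit limit recommended for 32-bits readers, so flushing after
+*  every add keeps the local register from overflowing. */
+#if 0   /* usage sketch */
+MEM_STATIC size_t BIT_exampleWrite(void* dst, size_t dstCapacity, const unsigned* values, size_t n)
+{
+    BIT_CStream_t bitC;
+    size_t pos;
+    if (ERR_isError(BIT_initCStream(&bitC, dst, dstCapacity))) return 0;
+    for (pos = 0; pos < n; pos++) {
+        BIT_addBits(&bitC, values[pos], 13);   /* queue bits in the local register */
+        BIT_flushBits(&bitC);                  /* commit whole bytes to memory */
+    }
+    return BIT_closeCStream(&bitC);            /* 0 => dst was too small */
+}
+#endif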
+
+
+/*-********************************************
+*  bitStream decoding API (read backward)
+**********************************************/
+typedef struct
+{
+    size_t   bitContainer;
+    unsigned bitsConsumed;
+    const char* ptr;
+    const char* start;
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,
+               BIT_DStream_endOfBuffer = 1,
+               BIT_DStream_completed = 2,
+               BIT_DStream_overflow = 3 } BIT_DStream_status;  /* result of BIT_reloadDStream() */
+               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+/* Start by invoking BIT_initDStream().
+*  A chunk of the bitStream is then stored into a local register.
+*  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+*  You can then retrieve bitFields stored into the local register, **in reverse order**.
+*  Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
+*  A reload guarantees a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
+*  Otherwise, it can be less than that, so proceed accordingly.
+*  Checking if DStream has reached its end can be performed with BIT_endOfDStream().
+*/
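+
+/* Illustrative sketch (editor's addition, not upstream zstd) of the matching
+*  read side.  Because the stream is LIFO, fields come back in the reverse of
+*  the order in which they were added. */
+#if 0   /* usage sketch */
+MEM_STATIC unsigned BIT_exampleRead(unsigned* values, size_t n, const void* src, size_t srcSize)
+{
+    BIT_DStream_t bitD;
+    size_t pos;
+    if (ERR_isError(BIT_initDStream(&bitD, src, srcSize))) return 0;
+    for (pos = 0; pos < n; pos++) {
+        values[pos] = (unsigned)BIT_readBits(&bitD, 13);  /* last-added field first */
+        BIT_reloadDStream(&bitD);                         /* refill local register from memory */
+    }
+    return BIT_endOfDStream(&bitD);   /* 1 if every bit was consumed */
+}
+#endif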
+
+
+/*-****************************************
+*  unsafe API
+******************************************/
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */
+
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
+/* unsafe version; does not check buffer overflow */
+
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/*-**************************************************************
+*  Internal functions
+****************************************************************/
+MEM_STATIC unsigned BIT_highbit32 (register U32 val)
+{
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    _BitScanReverse ( &r, val );
+    return (unsigned) r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* Use GCC Intrinsic */
+    return 31 - __builtin_clz (val);
+#   else   /* Software version */
+    static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+#   endif
+}
+
+/*=====    Local Constants   =====*/
+static const unsigned BIT_mask[] = { 0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,  0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF };   /* up to 26 bits */
+
+
+/*-**************************************************************
+*  bitStream encoding
+****************************************************************/
+/*! BIT_initCStream() :
+ *  `dstCapacity` must be > sizeof(void*)
+ *  @return : 0 if success,
+              otherwise an error code (can be tested using ERR_isError() ) */
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* startPtr, size_t dstCapacity)
+{
+    bitC->bitContainer = 0;
+    bitC->bitPos = 0;
+    bitC->startPtr = (char*)startPtr;
+    bitC->ptr = bitC->startPtr;
+    bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->ptr);
+    if (dstCapacity <= sizeof(bitC->ptr)) return ERROR(dstSize_tooSmall);
+    return 0;
+}
+
+/*! BIT_addBits() :
+    can add up to 26 bits into `bitC`.
+    Does not check for register overflow ! */
+MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits)
+{
+    bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
+    bitC->bitPos += nbBits;
+}
+
+/*! BIT_addBitsFast() :
+ *  works only if `value` is _clean_, meaning all high bits above nbBits are 0 */
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits)
+{
+    bitC->bitContainer |= value << bitC->bitPos;
+    bitC->bitPos += nbBits;
+}
+
+/*! BIT_flushBitsFast() :
+ *  unsafe version; does not check buffer overflow */
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC)
+{
+    size_t const nbBytes = bitC->bitPos >> 3;
+    MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+    bitC->ptr += nbBytes;
+    bitC->bitPos &= 7;
+    bitC->bitContainer >>= nbBytes*8;   /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */
+}
+
+/*! BIT_flushBits() :
+ *  safe version; checks for buffer overflow and prevents it.
+ *  note : does not signal buffer overflow. That is only revealed later, by BIT_closeCStream() */
+MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC)
+{
+    size_t const nbBytes = bitC->bitPos >> 3;
+    MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+    bitC->ptr += nbBytes;
+    if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
+    bitC->bitPos &= 7;
+    bitC->bitContainer >>= nbBytes*8;   /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */
+}
+
+/*! BIT_closeCStream() :
+ *  @return : size of CStream, in bytes,
+              or 0 if it could not fit into dstBuffer */
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC)
+{
+    BIT_addBitsFast(bitC, 1, 1);   /* endMark */
+    BIT_flushBits(bitC);
+
+    if (bitC->ptr >= bitC->endPtr) return 0; /* doesn't fit within authorized budget : cancel */
+
+    return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
+}
+
+
+/*-********************************************************
+* bitStream decoding
+**********************************************************/
+/*! BIT_initDStream() :
+*   Initialize a BIT_DStream_t.
+*   `bitD` : a pointer to an already allocated BIT_DStream_t structure.
+*   `srcSize` must be the *exact* size of the bitStream, in bytes.
+*   @return : size of stream (== srcSize) or an errorCode if a problem is detected
+*/
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+    if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+    if (srcSize >=  sizeof(bitD->bitContainer)) {  /* normal case */
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;  /* ensures bitsConsumed is always set */
+          if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+    } else {
+        bitD->start = (const char*)srcBuffer;
+        bitD->ptr   = bitD->start;
+        bitD->bitContainer = *(const BYTE*)(bitD->start);
+        switch(srcSize)
+        {
+            case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
+            case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
+            case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
+            case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24;
+            case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16;
+            case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) <<  8;
+            default:;
+        }
+        { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+          bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+          if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+        bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
+    }
+
+    return srcSize;
+}
+
+MEM_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start)
+{
+    return bitContainer >> start;
+}
+
+MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)
+{
+#if defined(__BMI__) && defined(__GNUC__) && __GNUC__*1000+__GNUC_MINOR__ >= 4008  /* experimental */
+#  if defined(__x86_64__)
+    if (sizeof(bitContainer)==8)
+        return _bextr_u64(bitContainer, start, nbBits);
+    else
+#  endif
+        return _bextr_u32(bitContainer, start, nbBits);
+#else
+    return (bitContainer >> start) & BIT_mask[nbBits];
+#endif
+}
+
+MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
+{
+    return bitContainer & BIT_mask[nbBits];
+}
+
+/*! BIT_lookBits() :
+ *  Provides next n bits from local register.
+ *  local register is not modified.
+ *  On 32-bits, maxNbBits==24.
+ *  On 64-bits, maxNbBits==56.
+ *  @return : value extracted
+ */
+MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits)
+{
+#if defined(__BMI__) && defined(__GNUC__)   /* experimental; fails if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8 */
+    return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits);
+#else
+    U32 const bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask-nbBits) & bitMask);
+#endif
+}
+
+/*! BIT_lookBitsFast() :
+*   unsafe version; works only if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
+{
+    U32 const bitMask = sizeof(bitD->bitContainer)*8 - 1;
+    return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask+1)-nbBits) & bitMask);
+}
+
+MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    bitD->bitsConsumed += nbBits;
+}
+
+/*! BIT_readBits() :
+ *  Read (consume) next n bits from local register and update.
+ *  Take care not to read more bits than the local register currently contains.
+ *  @return : extracted value.
+ */
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t const value = BIT_lookBits(bitD, nbBits);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*! BIT_readBitsFast() :
+*   unsafe version; works only if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, U32 nbBits)
+{
+    size_t const value = BIT_lookBitsFast(bitD, nbBits);
+    BIT_skipBits(bitD, nbBits);
+    return value;
+}
+
+/*! BIT_reloadDStream() :
+*   Refill `bitD` from buffer previously set in BIT_initDStream() .
+*   This function is safe : it guarantees it will not read beyond the src buffer.
+*   @return : status of `BIT_DStream_t` internal register.
+              if status == BIT_DStream_unfinished, internal register is filled with >= (sizeof(bitD->bitContainer)*8 - 7) bits */
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* should not happen => corruption detected */
+        return BIT_DStream_overflow;
+
+    if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) {
+        bitD->ptr -= bitD->bitsConsumed >> 3;
+        bitD->bitsConsumed &= 7;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);
+        return BIT_DStream_unfinished;
+    }
+    if (bitD->ptr == bitD->start) {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+        return BIT_DStream_completed;
+    }
+    {   U32 nbBytes = bitD->bitsConsumed >> 3;
+        BIT_DStream_status result = BIT_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start) {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BIT_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD) */
+        return result;
+    }
+}
+
+/*! BIT_endOfDStream() :
+*   @return Tells if DStream has exactly reached its end (all bits consumed).
+*/
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
diff --git a/zstd/lib/common/entropy_common.c b/zstd/lib/common/entropy_common.c
new file mode 100644
index 0000000..b37a082
--- /dev/null
+++ b/zstd/lib/common/entropy_common.c
@@ -0,0 +1,221 @@
+/*
+   Common functions of New Generation Entropy library
+   Copyright (C) 2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+*************************************************************************** */
+
+/* *************************************
+*  Dependencies
+***************************************/
+#include "mem.h"
+#include "error_private.h"       /* ERR_*, ERROR */
+#define FSE_STATIC_LINKING_ONLY  /* FSE_MIN_TABLELOG */
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY  /* HUF_TABLELOG_ABSOLUTEMAX */
+#include "huf.h"
+
+
+/*===   Version   ===*/
+unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
+
+
+/*===   Error Management   ===*/
+unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) return ERROR(srcSize_wrong);
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) & (charnum<=*maxSVPtr)) {
+        if (previous0) {
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF) {
+                n0 += 24;
+                if (ip < iend-5) {
+                    ip += 2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                } else {
+                    bitStream >>= 16;
+                    bitCount   += 16;
+            }   }
+            while ((bitStream & 3) == 3) {
+                n0 += 3;
+                bitStream >>= 2;
+                bitCount += 2;
+            }
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            } else {
+                bitStream >>= 2;
+        }   }
+        {   int const max = (2*threshold-1) - remaining;
+            int count;
+
+            if ((bitStream & (threshold-1)) < (U32)max) {
+                count = bitStream & (threshold-1);
+                bitCount += nbBits-1;
+            } else {
+                count = bitStream & (2*threshold-1);
+                if (count >= threshold) count -= max;
+                bitCount += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= count < 0 ? -count : count;   /* -1 means +1 */
+            normalizedCounter[charnum++] = (short)count;
+            previous0 = !count;
+            while (remaining < threshold) {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+    }   }   /* while ((remaining>1) & (charnum<=*maxSVPtr)) */
+    if (remaining != 1) return ERROR(corruption_detected);
+    if (bitCount > 32) return ERROR(corruption_detected);
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;
+    return ip-istart;
+}
+
+
+/*! HUF_readStats() :
+    Read compact Huffman tree, saved by HUF_writeCTable().
+    `huffWeight` is destination buffer.
+    `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32.
+    @return : size read from `src` , or an error Code .
+    Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
+*/
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    /* memset(huffWeight, 0, hwSize);   *//* is not necessary, even though some analyzers complain ... */
+
+    if (iSize >= 128) {  /* special header */
+        oSize = iSize - 127;
+        iSize = ((oSize+1)/2);
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        if (oSize >= hwSize) return ERROR(corruption_detected);
+        ip += 1;
+        {   U32 n;
+            for (n=0; n<oSize; n+=2) {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }
+    else  {   /* header compressed with FSE (normal case) */
+        FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)];  /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
+    weightTotal = 0;
+    {   U32 n; for (n=0; n<oSize; n++) {
+            if (huffWeight[n] >= HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+            rankStats[huffWeight[n]]++;
+            weightTotal += (1 << huffWeight[n]) >> 1;
+    }   }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    {   U32 const tableLog = BIT_highbit32(weightTotal) + 1;
+        if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+        *tableLogPtr = tableLog;
+        /* determine last weight */
+        {   U32 const total = 1 << tableLog;
+            U32 const rest = total - weightTotal;
+            U32 const verif = 1 << BIT_highbit32(rest);
+            U32 const lastWeight = BIT_highbit32(rest) + 1;
+            if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+            huffWeight[oSize] = (BYTE)lastWeight;
+            rankStats[lastWeight]++;
+    }   }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);   /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    return iSize+1;
+}
diff --git a/zstd/lib/common/entropy_common.o b/zstd/lib/common/entropy_common.o
new file mode 100644
index 0000000..1a0c5b9
Binary files /dev/null and b/zstd/lib/common/entropy_common.o differ
diff --git a/zstd/lib/common/error_private.c b/zstd/lib/common/error_private.c
new file mode 100644
index 0000000..a0fa172
--- /dev/null
+++ b/zstd/lib/common/error_private.c
@@ -0,0 +1,43 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+/* The purpose of this file is to have a single list of error strings embedded in binary */
+
+#include "error_private.h"
+
+const char* ERR_getErrorString(ERR_enum code)
+{
+    static const char* const notErrorCode = "Unspecified error code";
+    switch( code )
+    {
+    case PREFIX(no_error): return "No error detected";
+    case PREFIX(GENERIC):  return "Error (generic)";
+    case PREFIX(prefix_unknown): return "Unknown frame descriptor";
+    case PREFIX(version_unsupported): return "Version not supported";
+    case PREFIX(parameter_unknown): return "Unknown parameter type";
+    case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
+    case PREFIX(frameParameter_unsupportedBy32bits): return "Frame parameter unsupported in 32-bits mode";
+    case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
+    case PREFIX(compressionParameter_unsupported): return "Compression parameter is out of bound";
+    case PREFIX(init_missing): return "Context should be init first";
+    case PREFIX(memory_allocation): return "Allocation error : not enough memory";
+    case PREFIX(stage_wrong): return "Operation not authorized at current processing stage";
+    case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
+    case PREFIX(srcSize_wrong): return "Src size incorrect";
+    case PREFIX(corruption_detected): return "Corrupted block detected";
+    case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
+    case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
+    case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
+    case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
+    case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
+    case PREFIX(dictionary_wrong): return "Dictionary mismatch";
+    case PREFIX(maxCode):
+    default: return notErrorCode;
+    }
+}
diff --git a/zstd/lib/common/error_private.h b/zstd/lib/common/error_private.h
new file mode 100644
index 0000000..1bc2e49
--- /dev/null
+++ b/zstd/lib/common/error_private.h
@@ -0,0 +1,76 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+/* Note : this module is expected to remain private, do not expose it */
+
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>        /* size_t */
+#include "zstd_errors.h"  /* enum list */
+
+
+/* ****************************************
+*  Compiler-specific
+******************************************/
+#if defined(__GNUC__)
+#  define ERR_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define ERR_STATIC static inline
+#elif defined(_MSC_VER)
+#  define ERR_STATIC static __inline
+#else
+#  define ERR_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-****************************************
+*  Customization (error_public.h)
+******************************************/
+typedef ZSTD_ErrorCode ERR_enum;
+#define PREFIX(name) ZSTD_error_##name
+
+
+/*-****************************************
+*  Error codes handling
+******************************************/
+#ifdef ERROR
+#  undef ERROR   /* reported already defined on VS 2015 (Rich Geldreich) */
+#endif
+#define ERROR(name) ((size_t)-PREFIX(name))
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+
+ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); }
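+
+/* Worked example (editor's note, not upstream) : error codes are negated enum
+*  values cast to size_t, so they sit at the very top of the size_t range and
+*  anything > ERROR(maxCode) is an error.  For instance,
+*      ERROR(srcSize_wrong) == (size_t)-ZSTD_error_srcSize_wrong,
+*  so ERR_isError(ERROR(srcSize_wrong)) is nonzero, and
+*  ERR_getErrorCode(ERROR(srcSize_wrong)) recovers ZSTD_error_srcSize_wrong. */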
+
+
+/*-****************************************
+*  Error Strings
+******************************************/
+
+const char* ERR_getErrorString(ERR_enum code);   /* error_private.c */
+
+ERR_STATIC const char* ERR_getErrorName(size_t code)
+{
+    return ERR_getErrorString(ERR_getErrorCode(code));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_H_MODULE */
diff --git a/zstd/lib/common/error_private.o b/zstd/lib/common/error_private.o
new file mode 100644
index 0000000..a576f76
Binary files /dev/null and b/zstd/lib/common/error_private.o differ
diff --git a/zstd/lib/common/fse.h b/zstd/lib/common/fse.h
new file mode 100644
index 0000000..baac390
--- /dev/null
+++ b/zstd/lib/common/fse.h
@@ -0,0 +1,694 @@
+/* ******************************************************************
+   FSE : Finite State Entropy codec
+   Public Prototypes declaration
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef FSE_H
+#define FSE_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*-*****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>    /* size_t, ptrdiff_t */
+
+
+/*-*****************************************
+*  FSE_PUBLIC_API : control library symbols visibility
+******************************************/
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+#  define FSE_PUBLIC_API __attribute__ ((visibility ("default")))
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
+#  define FSE_PUBLIC_API __declspec(dllexport)
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+#  define FSE_PUBLIC_API __declspec(dllimport) /* It isn't required, but allows generating better code, saving a function pointer load from the IAT and an indirect jump. */
+#else
+#  define FSE_PUBLIC_API
+#endif
+
+/*------   Version   ------*/
+#define FSE_VERSION_MAJOR    0
+#define FSE_VERSION_MINOR    9
+#define FSE_VERSION_RELEASE  0
+
+#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE
+#define FSE_QUOTE(str) #str
+#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str)
+#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION)
+
+#define FSE_VERSION_NUMBER  (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE)
+FSE_PUBLIC_API unsigned FSE_versionNumber(void);   /**< library version number; to be used when checking dll version */
+
+/*-****************************************
+*  FSE simple functions
+******************************************/
+/*! FSE_compress() :
+    Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
+    'dst' buffer must be already allocated. Compression runs faster if dstCapacity >= FSE_compressBound(srcSize).
+    @return : size of compressed data (<= dstCapacity).
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+                     if return == 1, srcData is a single byte value repeated srcSize times. Use RLE compression instead.
+                     if FSE_isError(return), compression failed (more details using FSE_getErrorName())
+*/
+FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize);
+
+/*! FSE_decompress():
+    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated destination buffer 'dst', of size 'dstCapacity'.
+    @return : size of regenerated data (<= maxDstSize),
+              or an error code, which can be tested using FSE_isError() .
+
+    ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!!
+    Why ? : making this distinction requires a header.
+    Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+FSE_PUBLIC_API size_t FSE_decompress(void* dst,  size_t dstCapacity,
+                               const void* cSrc, size_t cSrcSize);
+
+
+/*-*****************************************
+*  Tool functions
+******************************************/
+FSE_PUBLIC_API size_t FSE_compressBound(size_t size);       /* maximum compressed size */
+
+/* Error Management */
+FSE_PUBLIC_API unsigned    FSE_isError(size_t code);        /* tells if a return value is an error code */
+FSE_PUBLIC_API const char* FSE_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
+
+
+/*-*****************************************
+*  FSE advanced functions
+******************************************/
+/*! FSE_compress2() :
+    Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
+    Both parameters can be defined as '0' to mean : use default value
+    @return : size of compressed data
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!!
+                     if return == 1, srcData is a single byte value repeated srcSize times. Use RLE compression.
+                     if FSE_isError(return), it's an error code.
+*/
+FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+
+
+/*-*****************************************
+*  FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrences from source[] into table count[]
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+/* *** COMPRESSION *** */
+
+/*! FSE_count():
+    Provides the precise count of each byte within a table 'count'.
+    'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1).
+    *maxSymbolValuePtr will be updated if detected smaller than initial value.
+    @return : the count of the most frequent symbol (which is not identified).
+              if return == srcSize, there is only one symbol.
+              Can also return an error code, which can be tested with FSE_isError(). */
+FSE_PUBLIC_API size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+
+/*! FSE_optimalTableLog():
+    dynamically downsize 'tableLog' when conditions are met.
+    It saves CPU time, by using smaller tables, while preserving or even improving compression ratio.
+    @return : recommended tableLog (necessarily <= 'maxTableLog') */
+FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_normalizeCount():
+    normalize counts so that sum(count[]) == Power_of_2 (2^tableLog)
+    'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1).
+    @return : tableLog,
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, const unsigned* count, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_NCountWriteBound():
+    Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'.
+    Typically useful for allocation purpose. */
+FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_writeNCount():
+    Compactly save 'normalizedCounter' into 'buffer'.
+    @return : size of the compressed table,
+              or an errorCode, which can be tested using FSE_isError(). */
+FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+
+/*! Constructor and Destructor of FSE_CTable.
+    Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable;   /* don't allocate that. It's only meant to be more restrictive than void* */
+FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned tableLog, unsigned maxSymbolValue);
+FSE_PUBLIC_API void        FSE_freeCTable (FSE_CTable* ct);
+
+/*! FSE_buildCTable():
+    Builds `ct`, which must be already allocated, using FSE_createCTable().
+    @return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_compress_usingCTable():
+    Compress `src` using `ct` into `dst` which must be already allocated.
+    @return : size of compressed data (<= `dstCapacity`),
+              or 0 if compressed data could not fit into `dst`,
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct);
+
+/*!
+Tutorial :
+----------
+The first step is to count all symbols. FSE_count() does this job very fast.
+Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells.
+'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0]
+maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value)
+FSE_count() will return the number of occurrences of the most frequent symbol.
+This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+The next step is to normalize the frequencies.
+FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'.
+It also guarantees a minimum of 1 to any Symbol with frequency >= 1.
+You can use 'tableLog'==0 to mean "use default tableLog value".
+If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(),
+which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default").
+
+The result of FSE_normalizeCount() will be saved into a table,
+called 'normalizedCounter', which is a table of signed short.
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells.
+The return value is tableLog if everything proceeded as expected.
+It is 0 if there is a single symbol within distribution.
+If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount().
+'buffer' must be already allocated.
+For guaranteed success, buffer size must be at least FSE_headerBound().
+The result of the function is the number of bytes written into 'buffer'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small).
+
+'normalizedCounter' can then be used to create the compression table 'CTable'.
+The space required by 'CTable' must be already allocated, using FSE_createCTable().
+You can then use FSE_buildCTable() to fill 'CTable'.
+If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()).
+
+'CTable' can then be used to compress 'src', with FSE_compress_usingCTable().
+Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize'
+The function returns the size of compressed data (without header), necessarily <= `dstCapacity`.
+If it returns '0', compressed data could not fit into 'dst'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+*/
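+
+/* Illustrative sketch (editor's addition, not upstream FSE) : a condensed
+version of the tutorial above, for byte data, with default tableLog and
+abbreviated error handling.  A real caller would also save the normalized
+table with FSE_writeNCount() so the decoder can rebuild it. */
+#if 0   /* usage sketch */
+static size_t FSE_exampleCompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    unsigned count[256];
+    short norm[256];
+    unsigned maxSymbolValue = 255;
+    unsigned tableLog;
+    FSE_CTable* ct;
+    size_t cSize;
+    size_t const maxCount = FSE_count(count, &maxSymbolValue, src, srcSize);
+    if (FSE_isError(maxCount) || (maxCount == srcSize)) return 0;   /* error, or single-symbol input (use RLE) */
+    tableLog = FSE_optimalTableLog(0, srcSize, maxSymbolValue);
+    if (FSE_isError(FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue))) return 0;
+    ct = FSE_createCTable(tableLog, maxSymbolValue);
+    if (ct == NULL) return 0;
+    if (FSE_isError(FSE_buildCTable(ct, norm, maxSymbolValue, tableLog))) { FSE_freeCTable(ct); return 0; }
+    cSize = FSE_compress_usingCTable(dst, dstCapacity, src, srcSize, ct);
+    FSE_freeCTable(ct);
+    return cSize;   /* 0 means the compressed form did not fit */
+}
+#endif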
+
+
+/* *** DECOMPRESSION *** */
+
+/*! FSE_readNCount():
+    Read compactly saved 'normalizedCounter' from 'rBuffer'.
+    @return : size read from 'rBuffer',
+              or an errorCode, which can be tested using FSE_isError().
+              maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, const void* rBuffer, size_t rBuffSize);
+
+/*! Constructor and Destructor of FSE_DTable.
+    Note that its size depends on 'tableLog' */
+typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
+FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog);
+FSE_PUBLIC_API void        FSE_freeDTable(FSE_DTable* dt);
+
+/*! FSE_buildDTable():
+    Builds 'dt', which must be already allocated, using FSE_createDTable().
+    return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_decompress_usingDTable():
+    Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
+    into `dst` which must be already allocated.
+    @return : size of regenerated data (necessarily <= `dstCapacity`),
+              or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSE_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBuffSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'.
+This is performed by the function FSE_buildDTable().
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable().
+`cSrcSize` must be strictly correct, otherwise decompression will fail.
+FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`).
+If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
+*/
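+
+/* A minimal sketch of the three steps above (hypothetical names `cSrc`,
+   `cSrcSize`, `dst`, `dstCapacity`; error checks with FSE_isError() omitted) :
+
+    short    norm[256];
+    unsigned maxSymbolValue = 255;
+    unsigned tableLog;
+    size_t const hSize = FSE_readNCount(norm, &maxSymbolValue, &tableLog, cSrc, cSrcSize);
+    FSE_DTable* const dt = FSE_createDTable(tableLog);
+    FSE_buildDTable(dt, norm, maxSymbolValue, tableLog);
+    size_t const dSize = FSE_decompress_usingDTable(dst, dstCapacity,
+                                 (const char*)cSrc + hSize, cSrcSize - hSize, dt);
+    FSE_freeDTable(dt);
+*/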
+
+
+#ifdef FSE_STATIC_LINKING_ONLY
+
+/* *** Dependency *** */
+#include "bitstream.h"
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size>>7))
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate an FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using the macros below */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
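+
+/* Usage sketch : for instance, a DTable able to hold any tableLog <= 12, and a
+   worst-case output buffer for a 4 KB input, can both be allocated statically :
+
+    FSE_DTable dTableBuffer[FSE_DTABLE_SIZE_U32(12)];
+    unsigned char cBuffer[FSE_COMPRESSBOUND(4096)];
+*/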
+
+
+/* *****************************************
+*  FSE advanced API
+*******************************************/
+/* FSE_count_wksp() :
+ * Same as FSE_count(), but using an externally provided scratch buffer.
+ * `workSpace` must be a table of at least `1024` unsigned
+ */
+size_t FSE_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* source, size_t sourceSize, unsigned* workSpace);
+
+/** FSE_countFast() :
+ *  same as FSE_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr
+ */
+size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+
+/* FSE_countFast_wksp() :
+ * Same as FSE_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` must be a table of minimum `1024` unsigned
+ */
+size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* workSpace);
+
+/*! FSE_count_simple
+ * Same as FSE_countFast(), but does not use any additional memory (not even on stack).
+ * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` (presuming it's also the size of `count`).
+*/
+size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize);
+
+
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
+/**< same as FSE_optimalTableLog(), which uses `minus==2` */
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
+ */
+#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue)   ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + (1<<((maxTableLog>2)?(maxTableLog-2):0)) )
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
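+
+/* Usage sketch (hypothetical buffer names; this sketch assumes `wkspSize` is
+   expressed in bytes, hence sizeof()) :
+
+    U32 wksp[ FSE_WKSP_SIZE_U32(12, 255) ];
+    size_t const cSize = FSE_compress_wksp(dst, dstCapacity, src, srcSize,
+                                           255, 12,   // maxSymbolValue, tableLog
+                                           wksp, sizeof(wksp));
+*/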
+
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
+/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
+
+size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
+/**< build a fake FSE_CTable, designed to compress always the same symbolValue */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be >= `(1<<tableLog)`.
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
+
+size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/**< build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog);
+/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */
+
+
+/* *****************************************
+*  FSE symbol compression API
+*******************************************/
+/*!
+   This API consists of small unitary functions, which benefit greatly from being inlined.
+   Hence their bodies are included in the next section.
+*/
+typedef struct {
+    ptrdiff_t   value;
+    const void* stateTable;
+    const void* symbolTT;
+    unsigned    stateLog;
+} FSE_CState_t;
+
+static void FSE_initCState(FSE_CState_t* CStatePtr, const FSE_CTable* ct);
+
+static void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned symbol);
+
+static void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* CStatePtr);
+
+/**<
+These functions are inner components of FSE_compress_usingCTable().
+They allow the creation of custom streams, mixing multiple tables and bit sources.
+
+A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
+So the first symbol you will encode is the last you will decode, like a LIFO stack.
+
+You will need a few variables to track your CStream. They are :
+
+FSE_CTable    ct;         // Provided by FSE_buildCTable()
+BIT_CStream_t bitStream;  // bitStream tracking structure
+FSE_CState_t  state;      // State tracking structure (can have several)
+
+
+The first thing to do is to init bitStream and state.
+    size_t errorCode = BIT_initCStream(&bitStream, dstBuffer, maxDstSize);
+    FSE_initCState(&state, ct);
+
+Note that BIT_initCStream() can produce an error code, so its result should be tested using FSE_isError().
+You can then encode your input data, byte after byte.
+FSE_encodeSymbol() outputs a maximum of 'tableLog' bits at a time.
+Remember decoding will be done in reverse direction.
+    FSE_encodeSymbol(&bitStream, &state, symbol);
+
+At any time, you can also add any bit sequence.
+Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
+    BIT_addBits(&bitStream, bitField, nbBits);
+
+The above methods don't commit data to memory, they just store it into local register, for speed.
+Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+Writing data to memory is a manual operation, performed by the flushBits function.
+    BIT_flushBits(&bitStream);
+
+Your last FSE encoding operation shall be to flush your last state value(s).
+    FSE_flushCState(&bitStream, &state);
+
+Finally, you must close the bitStream.
+The function returns the size of CStream in bytes.
+If data couldn't fit into dstBuffer, it returns 0 (== not compressible).
+If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
+    size_t size = BIT_closeCStream(&bitStream);
+*/
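+
+/* Assembled sketch of the stream above : the input is encoded back-to-front, so
+   decoding regenerates it front-to-back. Names (`src`, `srcSize`, `dstBuffer`,
+   `maxDstSize`, `ct`) are hypothetical; a single state is used; error checks omitted.
+
+    BIT_CStream_t bitStream;
+    FSE_CState_t  state;
+    BIT_initCStream(&bitStream, dstBuffer, maxDstSize);
+    FSE_initCState(&state, ct);
+    {   size_t n = srcSize;
+        while (n > 0) {
+            FSE_encodeSymbol(&bitStream, &state, ((const unsigned char*)src)[--n]);
+            BIT_flushBits(&bitStream);      // commit the local register to memory
+    }   }
+    FSE_flushCState(&bitStream, &state);
+    size_t const cSize = BIT_closeCStream(&bitStream);   // 0 == not compressible
+*/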
+
+
+/* *****************************************
+*  FSE symbol decompression API
+*******************************************/
+typedef struct {
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+static void     FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+/**<
+Let's now decompose FSE_decompress_usingDTable() into its unitary components.
+You will decode FSE-encoded symbols from the bitStream,
+and also any other bitFields you put in, **in reverse order**.
+
+You will need a few variables to track your bitStream. They are :
+
+BIT_DStream_t DStream;    // Stream context
+FSE_DState_t  DState;     // State context. Multiple ones are possible
+FSE_DTable*   DTablePtr;  // Decoding table, provided by FSE_buildDTable()
+
+The first thing to do is to init the bitStream.
+    errorCode = BIT_initDStream(&DStream, srcBuffer, srcSize);
+
+You should then retrieve your initial state(s)
+(in reverse flushing order if you have several) :
+    FSE_initDState(&DState, &DStream, DTablePtr);
+
+You can then decode your data, symbol after symbol.
+For information, the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
+Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last in, first out).
+    unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
+
+You can retrieve any bitField you may have stored into the bitStream (in reverse order).
+Note : maximum allowed nbBits is 25, for 32-bits compatibility
+    size_t bitField = BIT_readBits(&DStream, nbBits);
+
+All above operations only read from the local register (whose size depends on size_t).
+Refueling the register from memory is manually performed by the reload method.
+    endSignal = BIT_reloadDStream(&DStream);
+
+BIT_reloadDStream() result tells if there is still some more data to read from DStream.
+BIT_DStream_unfinished : there is still some data left in the DStream.
+BIT_DStream_endOfBuffer : DStream reached the end of the buffer. Its container may no longer be completely filled.
+BIT_DStream_completed : DStream reached its exact end, corresponding in general to decompression completed.
+BIT_DStream_overflow : DStream went too far. Decompression result is corrupted.
+
+When reaching end of buffer (BIT_DStream_endOfBuffer), progress slowly, notably if you decode multiple symbols per loop,
+to properly detect the exact end of stream.
+After each decoded symbol, check if DStream is fully consumed using this simple test :
+    BIT_reloadDStream(&DStream) >= BIT_DStream_completed
+
+When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
+Checking if DStream has reached its end is performed by :
+    BIT_endOfDStream(&DStream);
+Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible.
+    FSE_endOfDState(&DState);
+*/
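+
+/* Assembled sketch of the loop above, for a known regenerated size. Names
+   (`srcBuffer`, `srcSize`, `dst`, `originalSize`, `DTablePtr`) are hypothetical;
+   a single state is used; error checks omitted.
+
+    BIT_DStream_t DStream;
+    FSE_DState_t  DState;
+    BIT_initDStream(&DStream, srcBuffer, srcSize);
+    FSE_initDState(&DState, &DStream, DTablePtr);
+    {   unsigned char* op = (unsigned char*)dst;
+        unsigned char* const oend = op + originalSize;
+        while (op < oend) {
+            *op++ = FSE_decodeSymbol(&DState, &DStream);
+            BIT_reloadDStream(&DStream);    // refuel the local register
+    }   }
+    if (!BIT_endOfDStream(&DStream) || !FSE_endOfDState(&DState)) { }   // corruption
+*/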
+
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+typedef struct {
+    int deltaFindState;
+    U32 deltaNbBits;
+} FSE_symbolCompressionTransform; /* total 8 bytes */
+
+MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
+{
+    const void* ptr = ct;
+    const U16* u16ptr = (const U16*) ptr;
+    const U32 tableLog = MEM_read16(ptr);
+    statePtr->value = (ptrdiff_t)1<<tableLog;
+    statePtr->stateTable = u16ptr+2;
+    statePtr->symbolTT = ((const U32*)ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1));
+    statePtr->stateLog = tableLog;
+}
+
+
+/*! FSE_initCState2() :
+*   Same as FSE_initCState(), but the first symbol to include (which will be the last to be read)
+*   uses the smallest state value possible, saving the cost of this symbol */
+MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)
+{
+    FSE_initCState(statePtr, ct);
+    {   const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+        const U16* stateTable = (const U16*)(statePtr->stateTable);
+        U32 nbBitsOut  = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);
+        statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
+        statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+    }
+}
+
+MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, U32 symbol)
+{
+    const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+    const U16* const stateTable = (const U16*)(statePtr->stateTable);
+    U32 nbBitsOut  = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+    BIT_addBits(bitC, statePtr->value, nbBitsOut);
+    statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+}
+
+MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
+{
+    BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+    BIT_flushBits(bitC);
+}
+
+
+/* ======    Decompression    ====== */
+
+typedef struct {
+    U16 tableLog;
+    U16 fastMode;
+} FSE_DTableHeader;   /* sizeof U32 */
+
+typedef struct
+{
+    unsigned short newState;
+    unsigned char  symbol;
+    unsigned char  nbBits;
+} FSE_decode_t;   /* size == U32 */
+
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)ptr;
+    DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
+    BIT_reloadDStream(bitD);
+    DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSE_peekSymbol(const FSE_DState_t* DStatePtr)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    return DInfo.symbol;
+}
+
+MEM_STATIC void FSE_updateState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    size_t const lowBits = BIT_readBits(bitD, nbBits);
+    DStatePtr->state = DInfo.newState + lowBits;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BIT_readBits(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+/*! FSE_decodeSymbolFast() :
+    unsafe, only works if no symbol has a probability > 50% */
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BIT_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return DStatePtr->state == 0;
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#ifndef FSE_MAX_MEMORY_USAGE
+#  define FSE_MAX_MEMORY_USAGE 14
+#endif
+#ifndef FSE_DEFAULT_MEMORY_USAGE
+#  define FSE_DEFAULT_MEMORY_USAGE 13
+#endif
+
+/*!FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#ifndef FSE_MAX_SYMBOL_VALUE
+#  define FSE_MAX_SYMBOL_VALUE 255
+#endif
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
+
+
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#  error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSE_TABLESTEP(tableSize) ((tableSize>>1) + (tableSize>>3) + 3)
+
+
+#endif /* FSE_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* FSE_H */
diff --git a/zstd/lib/common/fse_decompress.c b/zstd/lib/common/fse_decompress.c
new file mode 100644
index 0000000..8474a4c
--- /dev/null
+++ b/zstd/lib/common/fse_decompress.c
@@ -0,0 +1,328 @@
+/* ******************************************************************
+   FSE : Finite State Entropy decoder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include "bitstream.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+/* check and forward error code */
+#define CHECK_F(f) { size_t const e = f; if (FSE_isError(e)) return e; }
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+FSE_DTable* FSE_createDTable (unsigned tableLog)
+{
+    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+    return (FSE_DTable*)malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
+}
+
+void FSE_freeDTable (FSE_DTable* dt)
+{
+    free(dt);
+}
+
+size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
+    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
+
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+    U32 highThreshold = tableSize-1;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    {   FSE_DTableHeader DTableH;
+        DTableH.tableLog = (U16)tableLog;
+        DTableH.fastMode = 1;
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {
+                    tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                    symbolNext[s] = normalizedCounter[s];
+        }   }   }
+        memcpy(dt, &DTableH, sizeof(DTableH));
+    }
+
+    /* Spread symbols */
+    {   U32 const tableMask = tableSize-1;
+        U32 const step = FSE_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {
+                tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+        if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+            U16 nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32 ((U32)nextState) );
+            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+    }   }
+
+    return 0;
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;
+
+    return 0;
+}
+
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSV1 = tableMask+1;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<maxSV1; s++) {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+FORCE_INLINE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+
+    /* Init */
+    CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) & (op<olimit) ; op+=4) {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= BIT_DStream_endOfBuffer; ends at exactly BIT_DStream_completed */
+    while (1) {
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state1);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state2);
+            break;
+        }
+
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state2);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state1);
+            break;
+    }   }
+
+    return op-ostart;
+}
+
+
+size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
+    const U32 fastMode = DTableH->fastMode;
+
+    /* select fast mode (static) */
+    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSE_MAX_SYMBOL_VALUE+1];
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+
+    /* normal FSE decoding mode */
+    size_t const NCountLength = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+    if (FSE_isError(NCountLength)) return NCountLength;
+    //if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong);   /* too small input size; supposed to be already checked in NCountLength, only remaining case : NCountLength==cSrcSize */
+    if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
+    ip += NCountLength;
+    cSrcSize -= NCountLength;
+
+    CHECK_F( FSE_buildDTable (workSpace, counting, maxSymbolValue, tableLog) );
+
+    return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, workSpace);   /* always return, even if it is an error code */
+}
+
+
+typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
+
+size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize)
+{
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG);
+}
+
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/zstd/lib/common/fse_decompress.o b/zstd/lib/common/fse_decompress.o
new file mode 100644
index 0000000..ed4fa2e
Binary files /dev/null and b/zstd/lib/common/fse_decompress.o differ
diff --git a/zstd/lib/common/huf.h b/zstd/lib/common/huf.h
new file mode 100644
index 0000000..e557276
--- /dev/null
+++ b/zstd/lib/common/huf.h
@@ -0,0 +1,260 @@
+/* ******************************************************************
+   Huffman coder, part of New Generation Entropy library
+   header file
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+****************************************************************** */
+#ifndef HUF_H_298734234
+#define HUF_H_298734234
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* *** Dependencies *** */
+#include <stddef.h>    /* size_t */
+
+
+/* *** simple functions *** */
+/**
+HUF_compress() :
+    Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'.
+    'dst' buffer must be already allocated.
+    Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize).
+    `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB.
+    @return : size of compressed data (<= `dstCapacity`).
+    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+                     if return == 1, srcData is a single repeated byte symbol (RLE compression).
+                     if HUF_isError(return), compression failed (more details using HUF_getErrorName())
+*/
+size_t HUF_compress(void* dst, size_t dstCapacity,
+              const void* src, size_t srcSize);
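+
+/* Usage sketch interpreting the special return values (hypothetical buffers) :
+
+    size_t const cSize = HUF_compress(dst, dstCapacity, src, srcSize);
+    if (HUF_isError(cSize)) { }     // failed : see HUF_getErrorName(cSize)
+    else if (cSize == 0) { }        // not compressible : nothing written, store src raw
+    else if (cSize == 1) { }        // RLE : src is a single repeated byte
+    else { }                        // success : cSize bytes written into dst
+*/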
+
+/**
+HUF_decompress() :
+    Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
+    into already allocated buffer 'dst', of minimum size 'originalSize'.
+    `originalSize` : **must** be the ***exact*** size of original (uncompressed) data.
+    Note : in contrast with FSE, HUF_decompress can regenerate
+           RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+           because it knows size to regenerate.
+    @return : size of regenerated data (== originalSize),
+              or an error code, which can be tested using HUF_isError()
+*/
+size_t HUF_decompress(void* dst,  size_t originalSize,
+                const void* cSrc, size_t cSrcSize);
+
+
+/* ***   Tool functions *** */
+#define HUF_BLOCKSIZE_MAX (128 * 1024)       /**< maximum input size for a single block compressed with HUF_compress */
+size_t HUF_compressBound(size_t size);       /**< maximum compressed size (worst case) */
+
+/* Error Management */
+unsigned    HUF_isError(size_t code);        /**< tells if a return value is an error code */
+const char* HUF_getErrorName(size_t code);   /**< provides error code string (useful for debugging) */
+
+
+/* ***   Advanced function   *** */
+
+/** HUF_compress2() :
+ *   Same as HUF_compress(), but offers direct control over `maxSymbolValue` and `tableLog` .
+ *   `tableLog` must be `<= HUF_TABLELOG_MAX` . */
+size_t HUF_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+
+/** HUF_compress4X_wksp() :
+*   Same as HUF_compress2(), but uses externally allocated `workSpace`, which must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+size_t HUF_compress4X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);  /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+
+
+
+#ifdef HUF_STATIC_LINKING_ONLY
+
+/* *** Dependencies *** */
+#include "mem.h"   /* U32 */
+
+
+/* *** Constants *** */
+#define HUF_TABLELOG_MAX      12       /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
+#define HUF_TABLELOG_DEFAULT  11       /* tableLog by default, when not specified */
+#define HUF_SYMBOLVALUE_MAX  255
+
+#define HUF_TABLELOG_ABSOLUTEMAX  15   /* absolute limit of HUF_TABLELOG_MAX. Beyond that value, code does not work */
+#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
+#  error "HUF_TABLELOG_MAX is too large !"
+#endif
+
+
+/* ****************************************
+*  Static allocation
+******************************************/
+/* HUF buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8)   /* only true if incompressible pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* static allocation of HUF's Compression Table */
+#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
+    U32 name##hb[maxSymbolValue+1]; \
+    void* name##hv = &(name##hb); \
+    HUF_CElt* name = (HUF_CElt*)(name##hv)   /* no final ; */
+
+/* static allocation of HUF's DTable */
+typedef U32 HUF_DTable;
+#define HUF_DTABLE_SIZE(maxTableLog)   (1 + (1<<(maxTableLog)))
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+        HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) }
+#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) \
+        HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) }
+
+/* The workspace must have alignment at least 4 and be at least this large */
+#define HUF_WORKSPACE_SIZE (6 << 10)
+#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
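+
+/* Usage sketch : same result as HUF_compress2(), without heap allocation
+   (hypothetical buffer names; this sketch assumes `wkspSize` is expressed in bytes) :
+
+    U32 wksp[HUF_WORKSPACE_SIZE_U32];
+    size_t const cSize = HUF_compress4X_wksp(dst, dstCapacity, src, srcSize,
+                                             HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT,
+                                             wksp, sizeof(wksp));
+*/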
+
+
+/* ****************************************
+*  Advanced decompression functions
+******************************************/
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+
+size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< decodes RLE and uncompressed */
+size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress4X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+
+
+/* ****************************************
+*  HUF detailed API
+******************************************/
+/*!
+HUF_compress() does the following:
+1. count symbol occurrence from source[] into table count[] using FSE_count()
+2. (optional) refine tableLog using HUF_optimalTableLog()
+3. build Huffman table from count using HUF_buildCTable()
+4. save Huffman table to memory buffer using HUF_writeCTable()
+5. encode the data stream using HUF_compress4X_usingCTable()
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and regenerate 'CTable' using external methods.
+*/
+/* FSE_count() : find it within "fse.h" */
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+typedef struct HUF_CElt_s HUF_CElt;   /* incomplete type */
+size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);
+size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
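+
+/* Assembled sketch of steps 1-5 above (hypothetical buffer names; error checks
+   omitted; this sketch assumes HUF_buildCTable() returns the largest bit count
+   actually used, which is then passed to HUF_writeCTable()) :
+
+    unsigned count[HUF_SYMBOLVALUE_MAX+1];
+    unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
+    HUF_CREATE_STATIC_CTABLE(hufTable, HUF_SYMBOLVALUE_MAX);
+    FSE_count(count, &maxSymbolValue, src, srcSize);                                        // 1
+    unsigned huffLog = HUF_optimalTableLog(HUF_TABLELOG_DEFAULT, srcSize, maxSymbolValue);  // 2
+    huffLog = (unsigned)HUF_buildCTable(hufTable, count, maxSymbolValue, huffLog);          // 3
+    size_t const hSize = HUF_writeCTable(dst, dstCapacity, hufTable, maxSymbolValue, huffLog);   // 4
+    size_t const cSize = HUF_compress4X_usingCTable((char*)dst + hSize, dstCapacity - hSize,
+                                                    src, srcSize, hufTable);                // 5
+*/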
+
+typedef enum {
+   HUF_repeat_none,  /**< Cannot use the previous table */
+   HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
+   HUF_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
+ } HUF_repeat;
+/** HUF_compress4X_repeat() :
+*   Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+*   If it uses hufTable it does not modify hufTable or repeat.
+*   If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+*   If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress4X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat);  /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+
+/** HUF_buildCTable_wksp() :
+ *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of 1024 unsigned.
+ */
+size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize);
+
+/*! HUF_readStats() :
+    Read compact Huffman tree, saved by HUF_writeCTable().
+    `huffWeight` is destination buffer.
+    @return : size read from `src` , or an error Code .
+    Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize);
+
+/** HUF_readCTable() :
+*   Loading a CTable saved with HUF_writeCTable() */
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned maxSymbolValue, const void* src, size_t srcSize);
+
+
+/*
+HUF_decompress() does the following:
+1. select the decompression algorithm (X2, X4) based on pre-computed heuristics
+2. build Huffman table from its saved representation, using HUF_readDTableXn()
+3. decode 1 or 4 segments in parallel using HUF_decompress{1,4}Xn_usingDTable()
+*/
+
+/** HUF_selectDecoder() :
+*   Tells which decoder is likely to decode faster,
+*   based on a set of pre-determined metrics.
+*   @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+*   Assumption : 0 < cSrcSize < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
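+
+/* Dispatch sketch for the heuristic above (hypothetical names) :
+
+    size_t dSize;
+    if (HUF_selectDecoder(dstSize, cSrcSize))
+        dSize = HUF_decompress4X4(dst, dstSize, cSrc, cSrcSize);   // double-symbols decoder
+    else
+        dSize = HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);   // single-symbol decoder
+    if (HUF_isError(dSize)) { }                                    // handle error
+*/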
+
+size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX4 (HUF_DTable* DTable, const void* src, size_t srcSize);
+
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+
+
+/* single stream variants */
+
+size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);  /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+/** HUF_compress1X_repeat() :
+*   Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+*   If it uses hufTable it does not modify hufTable or repeat.
+*   If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+*   If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress1X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat);  /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
+
+size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress1X4_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /**< automatic selection of single or double symbol decoder, based on DTable */
+size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+
+#endif /* HUF_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* HUF_H_298734234 */
diff --git a/zstd/lib/common/mem.h b/zstd/lib/common/mem.h
new file mode 100644
index 0000000..7a3f721
--- /dev/null
+++ b/zstd/lib/common/mem.h
@@ -0,0 +1,372 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-****************************************
+*  Dependencies
+******************************************/
+#include <stddef.h>     /* size_t, ptrdiff_t */
+#include <string.h>     /* memcpy */
+
+
+/*-****************************************
+*  Compiler specifics
+******************************************/
+#if defined(_MSC_VER)   /* Visual Studio */
+#   include <stdlib.h>  /* _byteswap_ulong */
+#   include <intrin.h>  /* _byteswap_* */
+#endif
+#if defined(__GNUC__)
+#  define MEM_STATIC static __inline __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+#  define MEM_STATIC static __inline
+#else
+#  define MEM_STATIC static  /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+/* code only tested on 32 and 64 bits systems */
+#define MEM_STATIC_ASSERT(c)   { enum { MEM_static_assert = 1/(int)(!!(c)) }; }
+MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); }
+
+
+/*-**************************************************************
+*  Basic Types
+*****************************************************************/
+#if  !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef  uint8_t BYTE;
+  typedef uint16_t U16;
+  typedef  int16_t S16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+  typedef  int64_t S64;
+  typedef intptr_t iPtrDiff;
+#else
+  typedef unsigned char      BYTE;
+  typedef unsigned short      U16;
+  typedef   signed short      S16;
+  typedef unsigned int        U32;
+  typedef   signed int        S32;
+  typedef unsigned long long  U64;
+  typedef   signed long long  S64;
+  typedef ptrdiff_t      iPtrDiff;
+#endif
+
+
+/*-**************************************************************
+*  Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (i.e., not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ *            It can generate buggy code on targets depending on alignment.
+ *            In some circumstances, it's the only known way to get the most performance (e.g. GCC + ARMv6)
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define MEM_FORCE_MEMORY_ACCESS 2
+#  elif defined(__INTEL_COMPILER) /*|| defined(_MSC_VER)*/ || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define MEM_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* violates the C standard, by lying about structure alignment.
+Only use if there is no other choice to achieve best performance on the target platform */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+MEM_STATIC U64 MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
+	__pragma( pack(push, 1) )
+    typedef union { U16 u16; U32 u32; U64 u64; size_t st; } unalign;
+    __pragma( pack(pop) )
+#else
+    typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign;
+#endif
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+MEM_STATIC U64 MEM_readST(const void* ptr) { return ((const unalign*)ptr)->st; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign*)memPtr)->u64 = value; }
+
+#else
+
+/* default method, safe and standard.
+   can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+    U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+    U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+    U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC size_t MEM_readST(const void* memPtr)
+{
+    size_t val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write32(void* memPtr, U32 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write64(void* memPtr, U64 value)
+{
+    memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+MEM_STATIC U32 MEM_swap32(U32 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_ulong(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+    return __builtin_bswap32(in);
+#else
+    return  ((in << 24) & 0xff000000 ) |
+            ((in <<  8) & 0x00ff0000 ) |
+            ((in >>  8) & 0x0000ff00 ) |
+            ((in >> 24) & 0x000000ff );
+#endif
+}
+
+MEM_STATIC U64 MEM_swap64(U64 in)
+{
+#if defined(_MSC_VER)     /* Visual Studio */
+    return _byteswap_uint64(in);
+#elif defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
+    return __builtin_bswap64(in);
+#else
+    return  ((in << 56) & 0xff00000000000000ULL) |
+            ((in << 40) & 0x00ff000000000000ULL) |
+            ((in << 24) & 0x0000ff0000000000ULL) |
+            ((in << 8)  & 0x000000ff00000000ULL) |
+            ((in >> 8)  & 0x00000000ff000000ULL) |
+            ((in >> 24) & 0x0000000000ff0000ULL) |
+            ((in >> 40) & 0x000000000000ff00ULL) |
+            ((in >> 56) & 0x00000000000000ffULL);
+#endif
+}
+
+MEM_STATIC size_t MEM_swapST(size_t in)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_swap32((U32)in);
+    else
+        return (size_t)MEM_swap64((U64)in);
+}
+
+/*=== Little endian r/w ===*/
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read16(memPtr);
+    else {
+        const BYTE* p = (const BYTE*)memPtr;
+        return (U16)(p[0] + (p[1]<<8));
+    }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+    if (MEM_isLittleEndian()) {
+        MEM_write16(memPtr, val);
+    } else {
+        BYTE* p = (BYTE*)memPtr;
+        p[0] = (BYTE)val;
+        p[1] = (BYTE)(val>>8);
+    }
+}
+
+MEM_STATIC U32 MEM_readLE24(const void* memPtr)
+{
+    return MEM_readLE16(memPtr) + (((const BYTE*)memPtr)[2] << 16);
+}
+
+MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val)
+{
+    MEM_writeLE16(memPtr, (U16)val);
+    ((BYTE*)memPtr)[2] = (BYTE)(val>>16);
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read32(memPtr);
+    else
+        return MEM_swap32(MEM_read32(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32)
+{
+    if (MEM_isLittleEndian())
+        MEM_write32(memPtr, val32);
+    else
+        MEM_write32(memPtr, MEM_swap32(val32));
+}
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_read64(memPtr);
+    else
+        return MEM_swap64(MEM_read64(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64)
+{
+    if (MEM_isLittleEndian())
+        MEM_write64(memPtr, val64);
+    else
+        MEM_write64(memPtr, MEM_swap64(val64));
+}
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readLE32(memPtr);
+    else
+        return (size_t)MEM_readLE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val)
+{
+    if (MEM_32bits())
+        MEM_writeLE32(memPtr, (U32)val);
+    else
+        MEM_writeLE64(memPtr, (U64)val);
+}
+
+/*=== Big endian r/w ===*/
+
+MEM_STATIC U32 MEM_readBE32(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_swap32(MEM_read32(memPtr));
+    else
+        return MEM_read32(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32)
+{
+    if (MEM_isLittleEndian())
+        MEM_write32(memPtr, MEM_swap32(val32));
+    else
+        MEM_write32(memPtr, val32);
+}
+
+MEM_STATIC U64 MEM_readBE64(const void* memPtr)
+{
+    if (MEM_isLittleEndian())
+        return MEM_swap64(MEM_read64(memPtr));
+    else
+        return MEM_read64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64)
+{
+    if (MEM_isLittleEndian())
+        MEM_write64(memPtr, MEM_swap64(val64));
+    else
+        MEM_write64(memPtr, val64);
+}
+
+MEM_STATIC size_t MEM_readBEST(const void* memPtr)
+{
+    if (MEM_32bits())
+        return (size_t)MEM_readBE32(memPtr);
+    else
+        return (size_t)MEM_readBE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val)
+{
+    if (MEM_32bits())
+        MEM_writeBE32(memPtr, (U32)val);
+    else
+        MEM_writeBE64(memPtr, (U64)val);
+}
+
+
+/* function safe only for comparisons */
+MEM_STATIC U32 MEM_readMINMATCH(const void* memPtr, U32 length)
+{
+    switch (length)
+    {
+    default :
+    case 4 : return MEM_read32(memPtr);
+    case 3 : if (MEM_isLittleEndian())
+                return MEM_read32(memPtr)<<8;
+             else
+                return MEM_read32(memPtr)>>8;
+    }
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
diff --git a/zstd/lib/common/pool.c b/zstd/lib/common/pool.c
new file mode 100644
index 0000000..e439fe1
--- /dev/null
+++ b/zstd/lib/common/pool.c
@@ -0,0 +1,194 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+
+/* ======   Dependencies   ======= */
+#include <stddef.h>  /* size_t */
+#include <stdlib.h>  /* malloc, calloc, free */
+#include "pool.h"
+
+/* ======   Compiler specifics   ====== */
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#endif
+
+
+#ifdef ZSTD_MULTITHREAD
+
+#include "threading.h"   /* pthread adaptation */
+
+/* A job is a function and an opaque argument */
+typedef struct POOL_job_s {
+  POOL_function function;
+  void *opaque;
+} POOL_job;
+
+struct POOL_ctx_s {
+    /* Keep track of the threads */
+    pthread_t *threads;
+    size_t numThreads;
+
+    /* The queue is a circular buffer */
+    POOL_job *queue;
+    size_t queueHead;
+    size_t queueTail;
+    size_t queueSize;
+    /* The mutex protects the queue */
+    pthread_mutex_t queueMutex;
+    /* Condition variable for pushers to wait on when the queue is full */
+    pthread_cond_t queuePushCond;
+    /* Condition variables for poppers to wait on when the queue is empty */
+    pthread_cond_t queuePopCond;
+    /* Indicates if the queue is shutting down */
+    int shutdown;
+};
+
+/* POOL_thread() :
+   Worker thread for the thread pool.
+   Waits for jobs and executes them.
+   @return : NULL on failure, otherwise a non-NULL pointer on clean shutdown.
+*/
+static void* POOL_thread(void* opaque) {
+    POOL_ctx* const ctx = (POOL_ctx*)opaque;
+    if (!ctx) { return NULL; }
+    for (;;) {
+        /* Lock the mutex and wait for a non-empty queue or until shutdown */
+        pthread_mutex_lock(&ctx->queueMutex);
+        while (ctx->queueHead == ctx->queueTail && !ctx->shutdown) {
+            pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex);
+        }
+        /* empty => shutting down: so stop */
+        if (ctx->queueHead == ctx->queueTail) {
+            pthread_mutex_unlock(&ctx->queueMutex);
+            return opaque;
+        }
+        /* Pop a job off the queue */
+        {   POOL_job const job = ctx->queue[ctx->queueHead];
+            ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize;
+            /* Unlock the mutex, signal a pusher, and run the job */
+            pthread_mutex_unlock(&ctx->queueMutex);
+            pthread_cond_signal(&ctx->queuePushCond);
+            job.function(job.opaque);
+        }
+    }
+    /* Unreachable */
+}
+
+POOL_ctx *POOL_create(size_t numThreads, size_t queueSize) {
+    POOL_ctx *ctx;
+    /* Check the parameters */
+    if (!numThreads || !queueSize) { return NULL; }
+    /* Allocate the context and zero initialize */
+    ctx = (POOL_ctx *)calloc(1, sizeof(POOL_ctx));
+    if (!ctx) { return NULL; }
+    /* Initialize the job queue.
+     * It needs one extra space since one space is wasted to differentiate empty
+     * and full queues.
+     */
+    ctx->queueSize = queueSize + 1;
+    ctx->queue = (POOL_job *)malloc(ctx->queueSize * sizeof(POOL_job));
+    ctx->queueHead = 0;
+    ctx->queueTail = 0;
+    pthread_mutex_init(&ctx->queueMutex, NULL);
+    pthread_cond_init(&ctx->queuePushCond, NULL);
+    pthread_cond_init(&ctx->queuePopCond, NULL);
+    ctx->shutdown = 0;
+    /* Allocate space for the thread handles */
+    ctx->threads = (pthread_t *)malloc(numThreads * sizeof(pthread_t));
+    ctx->numThreads = 0;
+    /* Check for errors */
+    if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; }
+    /* Initialize the threads */
+    {   size_t i;
+        for (i = 0; i < numThreads; ++i) {
+            if (pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) {
+                ctx->numThreads = i;
+                POOL_free(ctx);
+                return NULL;
+        }   }
+        ctx->numThreads = numThreads;
+    }
+    return ctx;
+}
+
+/*! POOL_join() :
+    Shutdown the queue, wake any sleeping threads, and join all of the threads.
+*/
+static void POOL_join(POOL_ctx *ctx) {
+    /* Shut down the queue */
+    pthread_mutex_lock(&ctx->queueMutex);
+    ctx->shutdown = 1;
+    pthread_mutex_unlock(&ctx->queueMutex);
+    /* Wake up sleeping threads */
+    pthread_cond_broadcast(&ctx->queuePushCond);
+    pthread_cond_broadcast(&ctx->queuePopCond);
+    /* Join all of the threads */
+    {   size_t i;
+        for (i = 0; i < ctx->numThreads; ++i) {
+            pthread_join(ctx->threads[i], NULL);
+    }   }
+}
+
+void POOL_free(POOL_ctx *ctx) {
+    if (!ctx) { return; }
+    POOL_join(ctx);
+    pthread_mutex_destroy(&ctx->queueMutex);
+    pthread_cond_destroy(&ctx->queuePushCond);
+    pthread_cond_destroy(&ctx->queuePopCond);
+    if (ctx->queue) free(ctx->queue);
+    if (ctx->threads) free(ctx->threads);
+    free(ctx);
+}
+
+void POOL_add(void *ctxVoid, POOL_function function, void *opaque) {
+    POOL_ctx *ctx = (POOL_ctx *)ctxVoid;
+    if (!ctx) { return; }
+
+    pthread_mutex_lock(&ctx->queueMutex);
+    {   POOL_job const job = {function, opaque};
+        /* Wait until there is space in the queue for the new job */
+        size_t newTail = (ctx->queueTail + 1) % ctx->queueSize;
+        while (ctx->queueHead == newTail && !ctx->shutdown) {
+          pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);
+          newTail = (ctx->queueTail + 1) % ctx->queueSize;
+        }
+        /* The queue is still going => there is space */
+        if (!ctx->shutdown) {
+            ctx->queue[ctx->queueTail] = job;
+            ctx->queueTail = newTail;
+        }
+    }
+    pthread_mutex_unlock(&ctx->queueMutex);
+    pthread_cond_signal(&ctx->queuePopCond);
+}
+
+#else  /* ZSTD_MULTITHREAD  not defined */
+/* No multi-threading support */
+
+/* We don't need any data, but an empty struct is not valid C and malloc(0) may return NULL. */
+struct POOL_ctx_s {
+  int data;
+};
+
+POOL_ctx *POOL_create(size_t numThreads, size_t queueSize) {
+  (void)numThreads;
+  (void)queueSize;
+  return (POOL_ctx *)malloc(sizeof(POOL_ctx));
+}
+
+void POOL_free(POOL_ctx *ctx) {
+  if (ctx) free(ctx);
+}
+
+void POOL_add(void *ctx, POOL_function function, void *opaque) {
+  (void)ctx;
+  function(opaque);
+}
+
+#endif  /* ZSTD_MULTITHREAD */
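
An aside on the queue arithmetic above: allocating queueSize + 1 slots implements the classic one-slot-wasted circular buffer, where head == tail means empty and (tail + 1) % size == head means full. A standalone sketch of the same invariant (illustrative, with hypothetical names):

    #include <stdio.h>

    #define QSIZE 4               /* holds at most QSIZE-1 elements */

    int main(void)
    {
        int q[QSIZE];
        size_t head = 0, tail = 0;
        while ((tail + 1) % QSIZE != head) {   /* push until full */
            q[tail] = (int)tail;
            tail = (tail + 1) % QSIZE;
        }
        while (head != tail) {                 /* pop until empty */
            printf("%d\n", q[head]);
            head = (head + 1) % QSIZE;
        }
        return 0;
    }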
diff --git a/zstd/lib/common/pool.h b/zstd/lib/common/pool.h
new file mode 100644
index 0000000..50cb25b
--- /dev/null
+++ b/zstd/lib/common/pool.h
@@ -0,0 +1,56 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+#ifndef POOL_H
+#define POOL_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+#include <stddef.h>   /* size_t */
+
+typedef struct POOL_ctx_s POOL_ctx;
+
+/*! POOL_create() :
+    Create a thread pool with at most `numThreads` threads.
+    `numThreads` must be at least 1.
+    The maximum number of queued jobs before blocking is `queueSize`.
+    `queueSize` must be at least 1.
+    @return : The POOL_ctx pointer on success else NULL.
+*/
+POOL_ctx *POOL_create(size_t numThreads, size_t queueSize);
+
+/*! POOL_free() :
+    Free a thread pool returned by POOL_create().
+*/
+void POOL_free(POOL_ctx *ctx);
+
+/*! POOL_function :
+    The function type that can be added to a thread pool.
+*/
+typedef void (*POOL_function)(void *);
+/*! POOL_add_function :
+    The function type for a generic thread pool add function.
+*/
+typedef void (*POOL_add_function)(void *, POOL_function, void *);
+
+/*! POOL_add() :
+    Add the job `function(opaque)` to the thread pool.
+    Possibly blocks until there is room in the queue.
+    Note : The function may be executed asynchronously, so `opaque` must live until the function has been completed.
+*/
+void POOL_add(void *ctx, POOL_function function, void *opaque);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif
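
A minimal sketch of driving the POOL API above (the job function and payload are hypothetical; error handling is elided). POOL_add() may block until queue space frees up, and POOL_free() lets the workers finish queued jobs before joining them:

    #include <stdio.h>
    #include "pool.h"

    static void printJob(void *opaque)       /* matches POOL_function */
    {
        printf("job %d\n", *(int *)opaque);
    }

    int main(void)
    {
        int args[8];
        POOL_ctx *pool = POOL_create(4, 8);  /* 4 threads, queue of 8 */
        if (!pool) return 1;
        for (int i = 0; i < 8; ++i) {
            args[i] = i;                     /* opaque must outlive the job */
            POOL_add(pool, printJob, &args[i]);
        }
        POOL_free(pool);                     /* workers drain the queue, then are joined */
        return 0;
    }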
diff --git a/zstd/lib/common/threading.c b/zstd/lib/common/threading.c
new file mode 100644
index 0000000..32d5879
--- /dev/null
+++ b/zstd/lib/common/threading.c
@@ -0,0 +1,80 @@
+
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ */
+
+/**
+ * This file holds wrappers for systems which do not support pthreads
+ */
+
+/* When ZSTD_MULTITHREAD is not defined, this file becomes an empty translation unit.
+* Include some ISO C header code to prevent this and portably avoid related warnings.
+* (Visual C++: C4206 / GCC: -Wpedantic / Clang: -Wempty-translation-unit)
+*/
+#include <stddef.h>
+
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper, based on :
+ * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+ */
+
+
+/* ===  Dependencies  === */
+#include <process.h>
+#include <errno.h>
+#include "threading.h"
+
+
+/* ===  Implementation  === */
+
+static unsigned __stdcall worker(void *arg)
+{
+    pthread_t* const thread = (pthread_t*) arg;
+    thread->arg = thread->start_routine(thread->arg);
+    return 0;
+}
+
+int pthread_create(pthread_t* thread, const void* unused,
+            void* (*start_routine) (void*), void* arg)
+{
+    (void)unused;
+    thread->arg = arg;
+    thread->start_routine = start_routine;
+    thread->handle = (HANDLE) _beginthreadex(NULL, 0, worker, thread, 0, NULL);
+
+    if (!thread->handle)
+        return errno;
+    else
+        return 0;
+}
+
+int _pthread_join(pthread_t * thread, void **value_ptr)
+{
+    DWORD result;
+
+    if (!thread->handle) return 0;
+
+    result = WaitForSingleObject(thread->handle, INFINITE);
+    switch (result) {
+    case WAIT_OBJECT_0:
+        if (value_ptr) *value_ptr = thread->arg;
+        return 0;
+    case WAIT_ABANDONED:
+        return EINVAL;
+    default:
+        return GetLastError();
+    }
+}
+
+#endif   /* ZSTD_MULTITHREAD && _WIN32 */
diff --git a/zstd/lib/common/threading.h b/zstd/lib/common/threading.h
new file mode 100644
index 0000000..c008613
--- /dev/null
+++ b/zstd/lib/common/threading.h
@@ -0,0 +1,104 @@
+
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ */
+
+#ifndef THREADING_H_938743
+#define THREADING_H_938743
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper, based on :
+ * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+ */
+#ifdef WINVER
+#  undef WINVER
+#endif
+#define WINVER       0x0600
+
+#ifdef _WIN32_WINNT
+#  undef _WIN32_WINNT
+#endif
+#define _WIN32_WINNT 0x0600
+
+#ifndef WIN32_LEAN_AND_MEAN
+#  define WIN32_LEAN_AND_MEAN
+#endif
+
+#include <windows.h>
+
+/* mutex */
+#define pthread_mutex_t           CRITICAL_SECTION
+#define pthread_mutex_init(a,b)   InitializeCriticalSection((a))
+#define pthread_mutex_destroy(a)  DeleteCriticalSection((a))
+#define pthread_mutex_lock(a)     EnterCriticalSection((a))
+#define pthread_mutex_unlock(a)   LeaveCriticalSection((a))
+
+/* condition variable */
+#define pthread_cond_t             CONDITION_VARIABLE
+#define pthread_cond_init(a, b)    InitializeConditionVariable((a))
+#define pthread_cond_destroy(a)    /* No delete */
+#define pthread_cond_wait(a, b)    SleepConditionVariableCS((a), (b), INFINITE)
+#define pthread_cond_signal(a)     WakeConditionVariable((a))
+#define pthread_cond_broadcast(a)  WakeAllConditionVariable((a))
+
+/* pthread_create() and pthread_join() */
+typedef struct {
+    HANDLE handle;
+    void* (*start_routine)(void*);
+    void* arg;
+} pthread_t;
+
+int pthread_create(pthread_t* thread, const void* unused,
+                   void* (*start_routine) (void*), void* arg);
+
+#define pthread_join(a, b) _pthread_join(&(a), (b))
+int _pthread_join(pthread_t* thread, void** value_ptr);
+
+/**
+ * add here more wrappers as required
+ */
+
+
+#elif defined(ZSTD_MULTITHREAD)   /* posix assumed ; need a better detection method */
+/* ===   POSIX Systems   === */
+#  include <pthread.h>
+
+#else  /* ZSTD_MULTITHREAD not defined */
+/* No multithreading support */
+
+#define pthread_mutex_t int   /* #define rather than typedef, as sometimes pthread support is implicit, resulting in duplicated symbols */
+#define pthread_mutex_init(a,b)
+#define pthread_mutex_destroy(a)
+#define pthread_mutex_lock(a)
+#define pthread_mutex_unlock(a)
+
+#define pthread_cond_t int
+#define pthread_cond_init(a,b)
+#define pthread_cond_destroy(a)
+#define pthread_cond_wait(a,b)
+#define pthread_cond_signal(a)
+#define pthread_cond_broadcast(a)
+
+/* do not use pthread_t */
+
+#endif /* ZSTD_MULTITHREAD */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* THREADING_H_938743 */
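
Because the wrapper keeps the pthread names, code written against it compiles unchanged on Windows (via the shim above) and on POSIX (via <pthread.h>). A hedged sketch, assuming ZSTD_MULTITHREAD is defined; the thread function and argument are illustrative:

    #include <stdio.h>
    #include "threading.h"

    static void *sayHello(void *arg)
    {
        printf("hello from %s\n", (const char *)arg);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        if (pthread_create(&t, NULL, sayHello, (void *)"worker"))
            return 1;
        pthread_join(t, NULL);   /* expands to _pthread_join(&t, NULL) on Windows */
        return 0;
    }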
diff --git a/zstd/lib/common/xxhash.c b/zstd/lib/common/xxhash.c
new file mode 100644
index 0000000..eb44222
--- /dev/null
+++ b/zstd/lib/common/xxhash.c
@@ -0,0 +1,869 @@
+/*
+*  xxHash - Fast Hash algorithm
+*  Copyright (C) 2012-2016, Yann Collet
+*
+*  BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+*
+*  Redistribution and use in source and binary forms, with or without
+*  modification, are permitted provided that the following conditions are
+*  met:
+*
+*  * Redistributions of source code must retain the above copyright
+*  notice, this list of conditions and the following disclaimer.
+*  * Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following disclaimer
+*  in the documentation and/or other materials provided with the
+*  distribution.
+*
+*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+*  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+*  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+*  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+*  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+*  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+*  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+*  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+*  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+*  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+*  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*  You can contact the author at :
+*  - xxHash homepage: http://www.xxhash.com
+*  - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+/*!XXH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It relies on a compiler extension (i.e., not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on the compiler, but violates the C standard.
+ *            It can generate buggy code on targets which do not support unaligned memory accesses.
+ *            But in some circumstances, it's the only known way to get the best performance (e.g., GCC + ARMv6).
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2).
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#    define XXH_FORCE_MEMORY_ACCESS 2
+#  elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+/*!XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If the input pointer is NULL, xxHash's default behavior is to trigger a memory access error, since it is a bad pointer.
+ * When this option is enabled, xxHash's output for a NULL input pointer is the same as for a zero-length input.
+ * By default, this option is disabled. To enable it, uncomment the define below :
+ */
+/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
+
+/*!XXH_FORCE_NATIVE_FORMAT :
+ * By default, the xxHash library provides endian-independent hash values, based on the little-endian convention.
+ * Results are therefore identical for little-endian and big-endian CPUs.
+ * This comes at a performance cost for big-endian CPUs, since some byte-swapping is required to emulate the little-endian format.
+ * Should endian-independence be of no importance to your application, you may set the #define below to 1,
+ * to improve speed on big-endian CPUs.
+ * This option has no impact on little-endian CPUs.
+ */
+#ifndef XXH_FORCE_NATIVE_FORMAT   /* can be defined externally */
+#  define XXH_FORCE_NATIVE_FORMAT 0
+#endif
+
+/*!XXH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash; set to 0 when the input data
+ * is guaranteed to be aligned.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+/* Modify the local functions below should you wish to use some other memory routines */
+/* for malloc(), free() */
+#include <stdlib.h>
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free  (void* p)  { free(p); }
+/* for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+#ifndef XXH_STATIC_LINKING_ONLY
+#  define XXH_STATIC_LINKING_ONLY
+#endif
+#include "xxhash.h"
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
+#  define FORCE_INLINE static __forceinline
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/* *************************************
+*  Basic Types
+***************************************/
+#ifndef MEM_MODULE
+# define MEM_MODULE
+# if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint8_t  BYTE;
+    typedef uint16_t U16;
+    typedef uint32_t U32;
+    typedef  int32_t S32;
+    typedef uint64_t U64;
+#  else
+    typedef unsigned char      BYTE;
+    typedef unsigned short     U16;
+    typedef unsigned int       U32;
+    typedef   signed int       S32;
+    typedef unsigned long long U64;   /* if your compiler doesn't support unsigned long long, replace by another 64-bit type here. Note that xxhash.h will also need to be updated. */
+#  endif
+#endif
+
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPUs which support unaligned memory access in hardware */
+static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
+static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __packed instructions are safer, but compiler-specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+
+static U32 XXH_read32(const void* memPtr)
+{
+    U32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+static U64 XXH_read64(const void* memPtr)
+{
+    U64 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#if defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#  define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
+#endif
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap32 _byteswap_ulong
+#  define XXH_swap64 _byteswap_uint64
+#elif GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#  define XXH_swap64 __builtin_bswap64
+#else
+static U32 XXH_swap32 (U32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+}
+static U64 XXH_swap64 (U64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* *************************************
+*  Architecture Macros
+***************************************/
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+    static const int g_one = 1;
+#   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&g_one))
+#endif
+
+
+/* ***************************
+*  Memory reads
+*****************************/
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+}
+
+FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+}
+
+static U32 XXH_readBE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+
+FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
+}
+
+FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE64_align(ptr, endian, XXH_unaligned);
+}
+
+static U64 XXH_readBE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+
+
+/* *************************************
+*  Macros
+***************************************/
+#define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(int)(!!(c)) }; }    /* use only *after* variable declarations */
+
+
+/* *************************************
+*  Constants
+***************************************/
+static const U32 PRIME32_1 = 2654435761U;
+static const U32 PRIME32_2 = 2246822519U;
+static const U32 PRIME32_3 = 3266489917U;
+static const U32 PRIME32_4 =  668265263U;
+static const U32 PRIME32_5 =  374761393U;
+
+static const U64 PRIME64_1 = 11400714785074694791ULL;
+static const U64 PRIME64_2 = 14029467366897019727ULL;
+static const U64 PRIME64_3 =  1609587929392839161ULL;
+static const U64 PRIME64_4 =  9650029242287828579ULL;
+static const U64 PRIME64_5 =  2870177450012600261ULL;
+
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* **************************
+*  Utils
+****************************/
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dstState, const XXH32_state_t* restrict srcState)
+{
+    memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dstState, const XXH64_state_t* restrict srcState)
+{
+    memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+
+/* ***************************
+*  Simple Hash Functions
+*****************************/
+
+static U32 XXH32_round(U32 seed, U32 input)
+{
+    seed += input * PRIME32_2;
+    seed  = XXH_rotl32(seed, 13);
+    seed *= PRIME32_1;
+    return seed;
+}
+
+FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U32 h32;
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)16;
+    }
+#endif
+
+    if (len>=16) {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+
+        do {
+            v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4;
+            v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4;
+            v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4;
+            v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4;
+        } while (p<=limit);
+
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    } else {
+        h32  = seed + PRIME32_5;
+    }
+
+    h32 += (U32) len;
+
+    while (p+4<=bEnd) {
+        h32 += XXH_get32bits(p) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h32 += (*p) * PRIME32_5;
+        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_CREATESTATE_STATIC(state);
+    XXH32_reset(state, seed);
+    XXH32_update(state, input, len);
+    return XXH32_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+                return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+            else
+                return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }   }
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+static U64 XXH64_round(U64 acc, U64 input)
+{
+    acc += input * PRIME64_2;
+    acc  = XXH_rotl64(acc, 31);
+    acc *= PRIME64_1;
+    return acc;
+}
+
+static U64 XXH64_mergeRound(U64 acc, U64 val)
+{
+    val  = XXH64_round(0, val);
+    acc ^= val;
+    acc  = acc * PRIME64_1 + PRIME64_4;
+    return acc;
+}
+
+FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+    U64 h64;
+#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)32;
+    }
+#endif
+
+    if (len>=32) {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = seed + PRIME64_1 + PRIME64_2;
+        U64 v2 = seed + PRIME64_2;
+        U64 v3 = seed + 0;
+        U64 v4 = seed - PRIME64_1;
+
+        do {
+            v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8;
+            v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8;
+            v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8;
+            v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8;
+        } while (p<=limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+
+    } else {
+        h64  = seed + PRIME64_5;
+    }
+
+    h64 += (U64) len;
+
+    while (p+8<=bEnd) {
+        U64 const k1 = XXH64_round(0, XXH_get64bits(p));
+        h64 ^= k1;
+        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd) {
+        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_CREATESTATE_STATIC(state);
+    XXH64_reset(state, seed);
+    XXH64_update(state, input, len);
+    return XXH64_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
+            if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+                return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+            else
+                return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }   }
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+/* **************************************************
+*  Advanced Hash Functions
+****************************************************/
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+
+/*** Hash feed ***/
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed)
+{
+    XXH32_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state)-4);   /* do not write into reserved, for future removal */
+    state.v1 = seed + PRIME32_1 + PRIME32_2;
+    state.v2 = seed + PRIME32_2;
+    state.v3 = seed + 0;
+    state.v4 = seed - PRIME32_1;
+    memcpy(statePtr, &state, sizeof(state));
+    return XXH_OK;
+}
+
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed)
+{
+    XXH64_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state)-8);   /* do not write into reserved, for future removal */
+    state.v1 = seed + PRIME64_1 + PRIME64_2;
+    state.v2 = seed + PRIME64_2;
+    state.v3 = seed + 0;
+    state.v4 = seed - PRIME64_1;
+    memcpy(statePtr, &state, sizeof(state));
+    return XXH_OK;
+}
+
+
+FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len_32 += (unsigned)len;
+    state->large_len |= (len>=16) | (state->total_len_32>=16);
+
+    if (state->memsize + len < 16)  {   /* fill in tmp buffer */
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
+        state->memsize += (unsigned)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize) {   /* some data left from previous update */
+        XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
+        {   const U32* p32 = state->mem32;
+            state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++;
+            state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++;
+            state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++;
+            state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); p32++;
+        }
+        p += 16-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p <= bEnd-16) {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = state->v1;
+        U32 v2 = state->v2;
+        U32 v3 = state->v3;
+        U32 v4 = state->v4;
+
+        do {
+            v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4;
+            v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4;
+            v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4;
+            v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4;
+        } while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd) {
+        XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+        state->memsize = (unsigned)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
+{
+    const BYTE * p = (const BYTE*)state->mem32;
+    const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize;
+    U32 h32;
+
+    if (state->large_len) {
+        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+    } else {
+        h32 = state->v3 /* == seed */ + PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+
+    while (p+4<=bEnd) {
+        h32 += XXH_readLE32(p, endian) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h32 += (*p) * PRIME32_5;
+        h32  = XXH_rotl32(h32, 11) * PRIME32_1;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH32_digest_endian(state_in, XXH_bigEndian);
+}
+
+
+
+/* **** XXH64 **** */
+
+FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 32) {  /* fill in tmp buffer */
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
+        state->memsize += (U32)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize) {   /* tmp buffer is full */
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
+        state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian));
+        state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian));
+        state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian));
+        state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian));
+        p += 32-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p+32 <= bEnd) {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = state->v1;
+        U64 v2 = state->v2;
+        U64 v3 = state->v3;
+        U64 v4 = state->v4;
+
+        do {
+            v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8;
+            v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8;
+            v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8;
+            v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8;
+        } while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd) {
+        XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+        state->memsize = (unsigned)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
+{
+    const BYTE * p = (const BYTE*)state->mem64;
+    const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize;
+    U64 h64;
+
+    if (state->total_len >= 32) {
+        U64 const v1 = state->v1;
+        U64 const v2 = state->v2;
+        U64 const v3 = state->v3;
+        U64 const v4 = state->v4;
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+    } else {
+        h64  = state->v3 + PRIME64_5;
+    }
+
+    h64 += (U64) state->total_len;
+
+    while (p+8<=bEnd) {
+        U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian));
+        h64 ^= k1;
+        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd) {
+        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
+        h64  = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h64 ^= (*p) * PRIME64_5;
+        h64  = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH64_digest_endian(state_in, XXH_bigEndian);
+}
+
+
+/* **************************
+*  Canonical representation
+****************************/
+
+/*! Default XXH result types are basic unsigned 32- and 64-bit integers.
+*   The canonical representation follows the human-readable write convention, aka big-endian (most significant digits first).
+*   These functions allow transforming a hash value into and from its canonical format.
+*   This way, hash values can be written into a file or buffer, and remain comparable across different systems and programs.
+*/
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{
+    return XXH_readBE64(src);
+}
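
A short sketch of the canonical round-trip defined above — a hash written in the fixed big-endian layout can be stored or transmitted, and any machine recovers the same value (the helper name and the seed of 0 are illustrative):

    #include <assert.h>
    #include "xxhash.h"

    static void canonical_roundtrip(const void *data, size_t len)
    {
        XXH64_hash_t const h = XXH64(data, len, 0);
        XXH64_canonical_t canon;            /* big-endian byte layout */
        XXH64_canonicalFromHash(&canon, h);
        /* canon.digest[] may now be written to a file or wire format */
        assert(XXH64_hashFromCanonical(&canon) == h);
    }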
diff --git a/zstd/lib/common/xxhash.h b/zstd/lib/common/xxhash.h
new file mode 100644
index 0000000..9bad1f5
--- /dev/null
+++ b/zstd/lib/common/xxhash.h
@@ -0,0 +1,305 @@
+/*
+   xxHash - Extremely Fast Hash algorithm
+   Header File
+   Copyright (C) 2012-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64       13.8 GB/s            1.9 GB/s
+XXH32        6.8 GB/s            6.0 GB/s
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+
+/* ****************************
+*  Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/* ****************************
+*  API modifier
+******************************/
+/** XXH_PRIVATE_API
+*   This is useful if you want to include xxhash functions in `static` mode
+*   in order to inline them, and remove their symbols from the public list.
+*   Methodology :
+*     #define XXH_PRIVATE_API
+*     #include "xxhash.h"
+*   `xxhash.c` is automatically included.
+*   It is then no longer useful to compile and link it as a separate module.
+*/
+#ifdef XXH_PRIVATE_API
+#  ifndef XXH_STATIC_LINKING_ONLY
+#    define XXH_STATIC_LINKING_ONLY
+#  endif
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+#    define XXH_PUBLIC_API static   /* this version may generate warnings for unused static functions; disable the relevant warning */
+#  endif
+#else
+#  define XXH_PUBLIC_API   /* do nothing */
+#endif /* XXH_PRIVATE_API */
+
+/*!XXH_NAMESPACE, aka Namespace Emulation :
+
+If you want to include _and expose_ xxHash functions from within your own library,
+but also want to avoid symbol collisions with another library which also includes xxHash,
+
+you can use XXH_NAMESPACE to automatically prefix every public symbol from the xxhash library
+with the value of XXH_NAMESPACE (so do not leave it empty, and avoid purely numeric values).
+
+Note that no change is required within the calling program as long as it includes `xxhash.h` :
+regular symbol names are automatically translated by this header.
+*/
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    6
+#define XXH_VERSION_RELEASE  2
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Simple Hash Functions
+******************************/
+typedef unsigned int       XXH32_hash_t;
+typedef unsigned long long XXH64_hash_t;
+
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed);
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed);
+
+/*!
+XXH32() :
+    Calculate the 32-bit hash of the sequence of "length" bytes stored at memory address "input".
+    The memory between input & input+length must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+XXH64() :
+    Calculate the 64-bit hash of the sequence of "length" bytes stored at memory address "input".
+    "seed" can be used to alter the result predictably.
+    This function runs 2x faster on 64-bit systems, but slower on 32-bit systems (see benchmark).
+*/
+
+
+/* ****************************
+*  Streaming Hash Functions
+******************************/
+typedef struct XXH32_state_s XXH32_state_t;   /* incomplete type */
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+
+/*! State allocation, compatible with dynamic libraries */
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+
+
+/* hash streaming */
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned int seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
+
+/*
+These functions generate the xxHash of an input provided in multiple segments.
+Note that, for small inputs, they are slower than the single-call functions, due to state management.
+For small inputs, prefer `XXH32()` and `XXH64()` .
+
+An XXH state must first be allocated, using XXH*_createState() .
+
+Start a new hash by initializing the state with a seed, using XXH*_reset().
+
+Then, feed the hash state by calling XXH*_update() as many times as necessary.
+Obviously, the input must be allocated and read-accessible.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+
+Finally, a hash value can be produced at any time, by using XXH*_digest().
+This function returns the nn-bit hash as an int or long long.
+
+It's still possible to continue inserting input into the hash state after a digest,
+and to generate new hashes later on, by calling XXH*_digest() again.
+
+When done, free the XXH state space if it was allocated dynamically. (A usage sketch appears after this header.)
+*/
+
+
+/* **************************
+*  Utils
+****************************/
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* ! C99 */
+#  define restrict   /* disable restrict */
+#endif
+
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dst_state, const XXH32_state_t* restrict src_state);
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dst_state, const XXH64_state_t* restrict src_state);
+
+
+/* **************************
+*  Canonical representation
+****************************/
+/* The default result types for XXH functions are primitive unsigned 32- and 64-bit integers.
+*  The canonical representation uses a human-readable write convention, aka big-endian (most significant digits first).
+*  These functions allow transforming a hash value into and from its canonical format.
+*  This way, hash values can be written into a file / memory, and remain comparable on different systems and programs.
+*/
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+/* ================================================================================================
+   This section contains definitions which are not guaranteed to remain stable.
+   They may change in future versions, becoming incompatible with a different version of the library.
+   They shall only be used with static linking.
+   Never use these definitions in association with dynamic linking!
+=================================================================================================== */
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXH_STATIC_H_3543687687345)
+#define XXH_STATIC_H_3543687687345
+
+/* These definitions are only meant to allow allocation of XXH state
+   statically, on the stack, or in a struct for example.
+   Do not use members directly. */
+
+   struct XXH32_state_s {
+       unsigned total_len_32;
+       unsigned large_len;
+       unsigned v1;
+       unsigned v2;
+       unsigned v3;
+       unsigned v4;
+       unsigned mem32[4];   /* buffer defined as U32 for alignment */
+       unsigned memsize;
+       unsigned reserved;   /* never read nor written; will be removed in a future version */
+   };   /* typedef'd to XXH32_state_t */
+
+   struct XXH64_state_s {
+       unsigned long long total_len;
+       unsigned long long v1;
+       unsigned long long v2;
+       unsigned long long v3;
+       unsigned long long v4;
+       unsigned long long mem64[4];   /* buffer defined as U64 for alignment */
+       unsigned memsize;
+       unsigned reserved[2];          /* never read nor written; will be removed in a future version */
+   };   /* typedef'd to XXH64_state_t */
+
+
+#  ifdef XXH_PRIVATE_API
+#    include "xxhash.c"   /* include xxhash functions as `static`, for inlining */
+#  endif
+
+#endif /* XXH_STATIC_LINKING_ONLY && XXH_STATIC_H_3543687687345 */
+
+
+#if defined (__cplusplus)
+}
+#endif
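
The usage sketch promised in the streaming notes above — hashing an input in segments (two chunks stand in for arbitrarily many; error checks are elided for brevity):

    #include <stdio.h>
    #include "xxhash.h"

    int main(void)
    {
        const char part1[] = "hello, ";
        const char part2[] = "world";
        XXH64_state_t *state = XXH64_createState();
        if (!state) return 1;
        XXH64_reset(state, 0);                        /* seed = 0 */
        XXH64_update(state, part1, sizeof(part1)-1);  /* segment 1 */
        XXH64_update(state, part2, sizeof(part2)-1);  /* segment 2 */
        /* identical to XXH64("hello, world", 12, 0) */
        printf("%016llx\n", XXH64_digest(state));
        XXH64_freeState(state);
        return 0;
    }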
diff --git a/zstd/lib/common/xxhash.o b/zstd/lib/common/xxhash.o
new file mode 100644
index 0000000..138ce76
Binary files /dev/null and b/zstd/lib/common/xxhash.o differ
diff --git a/zstd/lib/common/zstd_common.c b/zstd/lib/common/zstd_common.c
new file mode 100644
index 0000000..8408a58
--- /dev/null
+++ b/zstd/lib/common/zstd_common.c
@@ -0,0 +1,73 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdlib.h>         /* malloc */
+#include "error_private.h"
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"           /* declaration of ZSTD_isError, ZSTD_getErrorName, ZSTD_getErrorCode, ZSTD_getErrorString, ZSTD_versionNumber */
+
+
+/*-****************************************
+*  Version
+******************************************/
+unsigned ZSTD_versionNumber (void) { return ZSTD_VERSION_NUMBER; }
+
+
+/*-****************************************
+*  ZSTD Error Management
+******************************************/
+/*! ZSTD_isError() :
+*   tells if a return value is an error code */
+unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
+
+/*! ZSTD_getErrorName() :
+*   provides error code string from function result (useful for debugging) */
+const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+/*! ZSTD_getErrorCode() :
+*   converts a `size_t` function result into a proper ZSTD_ErrorCode enum */
+ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
+
+/*! ZSTD_getErrorString() :
+*   provides error code string from enum */
+const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
+
+
+/*=**************************************************************
+*  Custom allocator
+****************************************************************/
+/* default uses stdlib */
+void* ZSTD_defaultAllocFunction(void* opaque, size_t size)
+{
+    void* address = malloc(size);
+    (void)opaque;
+    return address;
+}
+
+void ZSTD_defaultFreeFunction(void* opaque, void* address)
+{
+    (void)opaque;
+    free(address);
+}
+
+void* ZSTD_malloc(size_t size, ZSTD_customMem customMem)
+{
+    return customMem.customAlloc(customMem.opaque, size);
+}
+
+void ZSTD_free(void* ptr, ZSTD_customMem customMem)
+{
+    if (ptr!=NULL)
+        customMem.customFree(customMem.opaque, ptr);
+}
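+
+#if 0   /* [editor's sketch, not upstream code] Routing allocations through
+           ZSTD_customMem, whose fields { customAlloc, customFree, opaque }
+           are declared in zstd.h.  The byte-counting tracker is hypothetical. */
+typedef struct { size_t allocated; } AllocTracker;
+
+static void* trackedAlloc(void* opaque, size_t size)
+{
+    ((AllocTracker*)opaque)->allocated += size;
+    return malloc(size);
+}
+
+static void trackedFree(void* opaque, void* address)
+{
+    (void)opaque;   /* per-pointer sizes are not tracked in this sketch */
+    free(address);
+}
+
+/* usage :
+     AllocTracker tracker = { 0 };
+     ZSTD_customMem const mem = { trackedAlloc, trackedFree, &tracker };
+     void* p = ZSTD_malloc(100, mem);
+     ZSTD_free(p, mem);  */
+#endif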
diff --git a/zstd/lib/common/zstd_common.o b/zstd/lib/common/zstd_common.o
new file mode 100644
index 0000000..c846459
Binary files /dev/null and b/zstd/lib/common/zstd_common.o differ
diff --git a/zstd/lib/common/zstd_errors.h b/zstd/lib/common/zstd_errors.h
new file mode 100644
index 0000000..949dbd0
--- /dev/null
+++ b/zstd/lib/common/zstd_errors.h
@@ -0,0 +1,74 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+#ifndef ZSTD_ERRORS_H_398273423
+#define ZSTD_ERRORS_H_398273423
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*===== dependency =====*/
+#include <stddef.h>   /* size_t */
+
+
+/* =====   ZSTDERRORLIB_API : control library symbols visibility   ===== */
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#  define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#else
+#  define ZSTDERRORLIB_VISIBILITY
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* not required, but lets the compiler generate better code, saving a function-pointer load from the IAT and an indirect jump */
+#else
+#  define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+#endif
+
+/*-****************************************
+*  error codes list
+******************************************/
+typedef enum {
+  ZSTD_error_no_error,
+  ZSTD_error_GENERIC,
+  ZSTD_error_prefix_unknown,
+  ZSTD_error_version_unsupported,
+  ZSTD_error_parameter_unknown,
+  ZSTD_error_frameParameter_unsupported,
+  ZSTD_error_frameParameter_unsupportedBy32bits,
+  ZSTD_error_frameParameter_windowTooLarge,
+  ZSTD_error_compressionParameter_unsupported,
+  ZSTD_error_init_missing,
+  ZSTD_error_memory_allocation,
+  ZSTD_error_stage_wrong,
+  ZSTD_error_dstSize_tooSmall,
+  ZSTD_error_srcSize_wrong,
+  ZSTD_error_corruption_detected,
+  ZSTD_error_checksum_wrong,
+  ZSTD_error_tableLog_tooLarge,
+  ZSTD_error_maxSymbolValue_tooLarge,
+  ZSTD_error_maxSymbolValue_tooSmall,
+  ZSTD_error_dictionary_corrupted,
+  ZSTD_error_dictionary_wrong,
+  ZSTD_error_maxCode
+} ZSTD_ErrorCode;
+
+/*! ZSTD_getErrorCode() :
+    converts a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+    which can be compared directly against the enum list published above in this header */
+ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code);
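+
+#if 0   /* [editor's sketch, not upstream code] Typical use of this API :
+           convert a size_t result into the enum and branch on specific
+           failures.  ZSTD_isError() is declared in zstd.h. */
+static int is_retryable(size_t result)
+{
+    if (!ZSTD_isError(result)) return 0;   /* success : nothing to retry */
+    switch (ZSTD_getErrorCode(result)) {
+    case ZSTD_error_dstSize_tooSmall:      /* retry with a larger buffer */
+        return 1;
+    default:
+        return 0;
+    }
+}
+#endif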
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_ERRORS_H_398273423 */
diff --git a/zstd/lib/common/zstd_internal.h b/zstd/lib/common/zstd_internal.h
new file mode 100644
index 0000000..5c5b287
--- /dev/null
+++ b/zstd/lib/common/zstd_internal.h
@@ -0,0 +1,283 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+/*-*******************************************************
+*  Compiler specifics
+*********************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#  pragma warning(disable : 4100)        /* disable: C4100: unreferenced formal parameter */
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+#ifdef _MSC_VER
+#  define FORCE_NOINLINE static __declspec(noinline)
+#else
+#  ifdef __GNUC__
+#    define FORCE_NOINLINE static __attribute__((__noinline__))
+#  else
+#    define FORCE_NOINLINE static
+#  endif
+#endif
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include "mem.h"
+#include "error_private.h"
+#define ZSTD_STATIC_LINKING_ONLY
+#include "zstd.h"
+#ifndef XXH_STATIC_LINKING_ONLY
+#  define XXH_STATIC_LINKING_ONLY   /* XXH64_state_t */
+#endif
+#include "xxhash.h"               /* XXH_reset, update, digest */
+
+
+/*-*************************************
+*  shared macros
+***************************************/
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+#define CHECK_F(f) { size_t const errcod = f; if (ERR_isError(errcod)) return errcod; }  /* check and forward error code */
+#define CHECK_E(f, e) { size_t const errcod = f; if (ERR_isError(errcod)) return ERROR(e); }  /* check and return the mapped error code */
+
+
+/*-*************************************
+*  Common constants
+***************************************/
+#define ZSTD_OPT_NUM    (1<<12)
+#define ZSTD_DICT_MAGIC  0xEC30A437   /* v0.7+ */
+
+#define ZSTD_REP_NUM      3                 /* number of repcodes */
+#define ZSTD_REP_CHECK    (ZSTD_REP_NUM)    /* number of repcodes to check by the optimal parser */
+#define ZSTD_REP_MOVE     (ZSTD_REP_NUM-1)
+#define ZSTD_REP_MOVE_OPT (ZSTD_REP_NUM)
+static const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 };
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
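+/* [editor's note] These are postfix macros : a literal such as `512 KB`
+   expands to `512 *(1 <<10)`, i.e. 524288.  They must follow a number. */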
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10
+static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 };
+static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 };
+
+#define ZSTD_BLOCKHEADERSIZE 3   /* C standard doesn't allow a `static const` variable to be initialized from another `static const` variable */
+static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
+typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+
+#define HufLog 12
+typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+#define EQUAL_READ32 4
+
+#define Litbits  8
+#define MaxLit ((1<<Litbits) - 1)
+#define MaxML  52
+#define MaxLL  35
+#define MaxOff 28
+#define MaxSeq MAX(MaxLL, MaxML)   /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog    9
+#define LLFSELog    9
+#define OffFSELog   8
+
+static const U32 LL_bits[MaxLL+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9,10,11,12,
+                                     13,14,15,16 };
+static const S16 LL_defaultNorm[MaxLL+1] = { 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
+                                             2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
+                                            -1,-1,-1,-1 };
+#define LL_DEFAULTNORMLOG 6  /* for static allocation */
+static const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG;
+
+static const U32 ML_bits[MaxML+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9,10,11,
+                                     12,13,14,15,16 };
+static const S16 ML_defaultNorm[MaxML+1] = { 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,
+                                            -1,-1,-1,-1,-1 };
+#define ML_DEFAULTNORMLOG 6  /* for static allocation */
+static const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG;
+
+static const S16 OF_defaultNorm[MaxOff+1] = { 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+                                              1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1 };
+#define OF_DEFAULTNORMLOG 5  /* for static allocation */
+static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
+
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+
+/*! ZSTD_wildcopy() :
+*   custom version of memcpy(); may copy up to 7 bytes more than `length` (8 bytes if length==0) */
+#define WILDCOPY_OVERLENGTH 8
+MEM_STATIC void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length)
+{
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+    do
+        COPY8(op, ip)
+    while (op < oend);
+}
+
+MEM_STATIC void ZSTD_wildcopy_e(void* dst, const void* src, void* dstEnd)   /* should be faster for decoding, but, strangely, this is not verified on all platforms */
+{
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = (BYTE*)dstEnd;
+    do
+        COPY8(op, ip)
+    while (op < oend);
+}
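+
+#if 0   /* [editor's sketch, not upstream code] Safe use of ZSTD_wildcopy() :
+           the destination must keep WILDCOPY_OVERLENGTH spare bytes past
+           `length`, because the copy always proceeds in 8-byte chunks. */
+static void copy_into(BYTE* dst, size_t dstCapacity, const BYTE* src, size_t n)
+{
+    if (dstCapacity >= n + WILDCOPY_OVERLENGTH)
+        ZSTD_wildcopy(dst, src, (ptrdiff_t)n);   /* fast, may overwrite up to 7 extra bytes */
+    else
+        memcpy(dst, src, n);                     /* exact-size fallback */
+}
+#endif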
+
+
+/*-*******************************************
+*  Private interfaces
+*********************************************/
+typedef struct ZSTD_stats_s ZSTD_stats_t;
+
+typedef struct {
+    U32 off;
+    U32 len;
+} ZSTD_match_t;
+
+typedef struct {
+    U32 price;
+    U32 off;
+    U32 mlen;
+    U32 litlen;
+    U32 rep[ZSTD_REP_NUM];
+} ZSTD_optimal_t;
+
+
+typedef struct seqDef_s {
+    U32 offset;
+    U16 litLength;
+    U16 matchLength;
+} seqDef;
+
+
+typedef struct {
+    seqDef* sequencesStart;
+    seqDef* sequences;
+    BYTE* litStart;
+    BYTE* lit;
+    BYTE* llCode;
+    BYTE* mlCode;
+    BYTE* ofCode;
+    U32   longLengthID;   /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
+    U32   longLengthPos;
+    /* opt */
+    ZSTD_optimal_t* priceTable;
+    ZSTD_match_t* matchTable;
+    U32* matchLengthFreq;
+    U32* litLengthFreq;
+    U32* litFreq;
+    U32* offCodeFreq;
+    U32  matchLengthSum;
+    U32  matchSum;
+    U32  litLengthSum;
+    U32  litSum;
+    U32  offCodeSum;
+    U32  log2matchLengthSum;
+    U32  log2matchSum;
+    U32  log2litLengthSum;
+    U32  log2litSum;
+    U32  log2offCodeSum;
+    U32  factor;
+    U32  staticPrices;
+    U32  cachedPrice;
+    U32  cachedLitLength;
+    const BYTE* cachedLiterals;
+} seqStore_t;
+
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx);
+void ZSTD_seqToCodes(const seqStore_t* seqStorePtr);
+int ZSTD_isSkipFrame(ZSTD_DCtx* dctx);
+
+/* custom memory allocation functions */
+void* ZSTD_defaultAllocFunction(void* opaque, size_t size);
+void ZSTD_defaultFreeFunction(void* opaque, void* address);
+#ifndef ZSTD_DLL_IMPORT
+static const ZSTD_customMem defaultCustomMem = { ZSTD_defaultAllocFunction, ZSTD_defaultFreeFunction, NULL };
+#endif
+void* ZSTD_malloc(size_t size, ZSTD_customMem customMem);
+void ZSTD_free(void* ptr, ZSTD_customMem customMem);
+
+
+/*======  common function  ======*/
+
+MEM_STATIC U32 ZSTD_highbit32(U32 val)
+{
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    _BitScanReverse(&r, val);
+    return (unsigned)r;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)   /* GCC Intrinsic */
+    return 31 - __builtin_clz(val);
+#   else   /* Software version */
+    static const int DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    int r;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    r = DeBruijnClz[(U32)(v * 0x07C4ACDDU) >> 27];
+    return r;
+#   endif
+}
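+/* [editor's note] ZSTD_highbit32(v) returns the index of the highest set bit,
+   i.e. floor(log2(v)) for v != 0 : highbit32(1)==0, highbit32(32)==5,
+   highbit32(0x80000000)==31.  The result is not meaningful for v==0. */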
+
+
+/* hidden functions */
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ *        do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx);
+
+
+#endif   /* ZSTD_CCOMMON_H_MODULE */
diff --git a/zstd/lib/compress/fse_compress.c b/zstd/lib/compress/fse_compress.c
new file mode 100644
index 0000000..6708fb9
--- /dev/null
+++ b/zstd/lib/compress/fse_compress.c
@@ -0,0 +1,857 @@
+/* ******************************************************************
+   FSE : Finite State Entropy encoder
+   Copyright (C) 2013-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  define FORCE_INLINE static __forceinline
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+#include "bitstream.h"
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_STATIC_ASSERT(c) { enum { FSE_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  These templates are designed to be included
+  for type-specific functions (template emulation in C).
+  The objective is to write each function only once, for easier maintenance.
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` should cover the worst case, i.e. `(1<<max_tableLog) * sizeof(FSE_FUNCTION_TYPE)` bytes (note the parentheses : `<<` binds looser than `*`)
+ * `workSpace` must also satisfy the alignment requirements of FSE_FUNCTION_TYPE
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    U32 const tableSize = 1 << tableLog;
+    U32 const tableMask = tableSize - 1;
+    void* const ptr = ct;
+    U16* const tableU16 = ( (U16*) ptr) + 2;
+    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+    U32 const step = FSE_TABLESTEP(tableSize);
+    U32 cumul[FSE_MAX_SYMBOL_VALUE+2];
+
+    FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)workSpace;
+    U32 highThreshold = tableSize-1;
+
+    /* CTable header */
+    if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge);
+    tableU16[-2] = (U16) tableLog;
+    tableU16[-1] = (U16) maxSymbolValue;
+
+    /* For explanations on how to distribute symbol values over the table :
+    *  http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+    /* symbol start positions */
+    {   U32 u;
+        cumul[0] = 0;
+        for (u=1; u<=maxSymbolValue+1; u++) {
+            if (normalizedCounter[u-1]==-1) {  /* Low proba symbol */
+                cumul[u] = cumul[u-1] + 1;
+                tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
+            } else {
+                cumul[u] = cumul[u-1] + normalizedCounter[u-1];
+        }   }
+        cumul[maxSymbolValue+1] = tableSize+1;
+    }
+
+    /* Spread symbols */
+    {   U32 position = 0;
+        U32 symbol;
+        for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+            int nbOccurences;
+            for (nbOccurences=0; nbOccurences<normalizedCounter[symbol]; nbOccurences++) {
+                tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* Low proba area */
+        }   }
+
+        if (position!=0) return ERROR(GENERIC);   /* Must have gone through all positions */
+    }
+
+    /* Build table */
+    {   U32 u; for (u=0; u<tableSize; u++) {
+        FSE_FUNCTION_TYPE s = tableSymbol[u];   /* note : a static analyzer may not see that tableSymbol is properly initialized */
+        tableU16[cumul[s]++] = (U16) (tableSize+u);   /* TableU16 : sorted by symbol order; gives next state value */
+    }   }
+
+    /* Build Symbol Transformation Table */
+    {   unsigned total = 0;
+        unsigned s;
+        for (s=0; s<=maxSymbolValue; s++) {
+            switch (normalizedCounter[s])
+            {
+            case  0: break;
+
+            case -1:
+            case  1:
+                symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
+                symbolTT[s].deltaFindState = total - 1;
+                total ++;
+                break;
+            default :
+                {
+                    U32 const maxBitsOut = tableLog - BIT_highbit32 (normalizedCounter[s]-1);
+                    U32 const minStatePlus = normalizedCounter[s] << maxBitsOut;
+                    symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+                    symbolTT[s].deltaFindState = total - normalizedCounter[s];
+                    total +=  normalizedCounter[s];
+    }   }   }   }
+
+    return 0;
+}
+
+
+size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE];   /* memset() is not necessary, even if a static analyzer complains about it */
+    return FSE_buildCTable_wksp(ct, normalizedCounter, maxSymbolValue, tableLog, tableSymbol, sizeof(tableSymbol));
+}
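+/* [editor's note] FSE_buildCTable() places the whole FSE_MAX_TABLESIZE-entry
+   symbol array on the stack (about 4 KB with the default FSE_MAX_TABLELOG of
+   12); prefer FSE_buildCTable_wksp() in stack-constrained environments. */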
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+{
+    size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 3;
+    return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ? use default */
+}
+
+static size_t FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+                                       const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                                       unsigned writeIsSafe)
+{
+    BYTE* const ostart = (BYTE*) header;
+    BYTE* out = ostart;
+    BYTE* const oend = ostart + headerBufferSize;
+    int nbBits;
+    const int tableSize = 1 << tableLog;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    bitStream = 0;
+    bitCount  = 0;
+    /* Table Size */
+    bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
+    bitCount  += 4;
+
+    /* Init */
+    remaining = tableSize+1;   /* +1 for extra accuracy */
+    threshold = tableSize;
+    nbBits = tableLog+1;
+
+    while (remaining>1) {  /* stops at 1 */
+        if (previous0) {
+            unsigned start = charnum;
+            while (!normalizedCounter[charnum]) charnum++;
+            while (charnum >= start+24) {
+                start+=24;
+                bitStream += 0xFFFFU << bitCount;
+                if ((!writeIsSafe) && (out > oend-2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                out[0] = (BYTE) bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out+=2;
+                bitStream>>=16;
+            }
+            while (charnum >= start+3) {
+                start+=3;
+                bitStream += 3 << bitCount;
+                bitCount += 2;
+            }
+            bitStream += (charnum-start) << bitCount;
+            bitCount += 2;
+            if (bitCount>16) {
+                if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+                out[0] = (BYTE)bitStream;
+                out[1] = (BYTE)(bitStream>>8);
+                out += 2;
+                bitStream >>= 16;
+                bitCount -= 16;
+        }   }
+        {   int count = normalizedCounter[charnum++];
+            int const max = (2*threshold-1)-remaining;
+            remaining -= count < 0 ? -count : count;
+            count++;   /* +1 for extra accuracy */
+            if (count>=threshold) count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+            bitStream += count << bitCount;
+            bitCount  += nbBits;
+            bitCount  -= (count<max);
+            previous0  = (count==1);
+            if (remaining<1) return ERROR(GENERIC);
+            while (remaining<threshold) nbBits--, threshold>>=1;
+        }
+        if (bitCount>16) {
+            if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+            out[0] = (BYTE)bitStream;
+            out[1] = (BYTE)(bitStream>>8);
+            out += 2;
+            bitStream >>= 16;
+            bitCount -= 16;
+    }   }
+
+    /* flush remaining bitStream */
+    if ((!writeIsSafe) && (out > oend - 2)) return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+    out[0] = (BYTE)bitStream;
+    out[1] = (BYTE)(bitStream>>8);
+    out+= (bitCount+7) /8;
+
+    if (charnum > maxSymbolValue + 1) return ERROR(GENERIC);
+
+    return (out-ostart);
+}
+
+
+size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
+
+    if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
+        return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
+
+    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1);
+}
+
+
+
+/*-**************************************************************
+*  Counting histogram
+****************************************************************/
+/*! FSE_count_simple
+    This function counts byte values within `src` and stores the histogram into table `count`.
+    It doesn't use any additional memory.
+    But it is unsafe : it doesn't check that all values within `src` fit into `count`.
+    For this reason, prefer using a `count` table with 256 elements.
+    @return : count of the most numerous element
+*/
+size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                        const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* const end = ip + srcSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned max=0;
+
+    memset(count, 0, (maxSymbolValue+1)*sizeof(*count));
+    if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }
+
+    while (ip<end) count[*ip++]++;
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;
+    *maxSymbolValuePtr = maxSymbolValue;
+
+    { U32 s; for (s=0; s<=maxSymbolValue; s++) if (count[s] > max) max = count[s]; }
+
+    return (size_t)max;
+}
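+
+#if 0   /* [editor's sketch, not upstream code] The safe way to call the
+           unchecked function above : give `count` 256 entries so any byte
+           value fits. */
+static size_t most_frequent_byte_count(const void* src, size_t srcSize)
+{
+    unsigned count[256];
+    unsigned maxSymbolValue = 255;   /* in/out : updated to the last used symbol */
+    return FSE_count_simple(count, &maxSymbolValue, src, srcSize);
+}
+#endif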
+
+
+/* FSE_count_parallel_wksp() :
+ * Same as FSE_count_parallel(), but using an externally provided scratch buffer.
+ * `workSpace` must be at least `1024 * sizeof(unsigned)` bytes */
+static size_t FSE_count_parallel_wksp(
+                                unsigned* count, unsigned* maxSymbolValuePtr,
+                                const void* source, size_t sourceSize,
+                                unsigned checkMax, unsigned* const workSpace)
+{
+    const BYTE* ip = (const BYTE*)source;
+    const BYTE* const iend = ip+sourceSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned max=0;
+    U32* const Counting1 = workSpace;
+    U32* const Counting2 = Counting1 + 256;
+    U32* const Counting3 = Counting2 + 256;
+    U32* const Counting4 = Counting3 + 256;
+
+    memset(Counting1, 0, 4*256*sizeof(unsigned));
+
+    /* safety checks */
+    if (!sourceSize) {
+        memset(count, 0, (maxSymbolValue + 1) * sizeof(*count));   /* bug fix : zero the whole table, not just maxSymbolValue+1 bytes */
+        *maxSymbolValuePtr = 0;
+        return 0;
+    }
+    if (!maxSymbolValue) maxSymbolValue = 255;            /* 0 == default */
+
+    /* by stripes of 16 bytes */
+    {   U32 cached = MEM_read32(ip); ip += 4;
+        while (ip < iend-15) {
+            U32 c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+        }
+        ip-=4;
+    }
+
+    /* finish last symbols */
+    while (ip<iend) Counting1[*ip++]++;
+
+    if (checkMax) {   /* verify stats will fit into destination table */
+        U32 s; for (s=255; s>maxSymbolValue; s--) {
+            Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
+            if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall);
+    }   }
+
+    {   U32 s; for (s=0; s<=maxSymbolValue; s++) {
+            count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
+            if (count[s] > max) max = count[s];
+    }   }
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;
+    *maxSymbolValuePtr = maxSymbolValue;
+    return (size_t)max;
+}
+
+/* FSE_countFast_wksp() :
+ * Same as FSE_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` must be a table of at least `1024` unsigned values */
+size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                     const void* source, size_t sourceSize, unsigned* workSpace)
+{
+    if (sourceSize < 1500) return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize);
+    return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace);
+}
+
+/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */
+size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+                     const void* source, size_t sourceSize)
+{
+    unsigned tmpCounters[1024];
+    return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters);
+}
+
+/* FSE_count_wksp() :
+ * Same as FSE_count(), but using an externally provided scratch buffer.
+ * `workSpace` must be a table of at least `1024` unsigned values */
+size_t FSE_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* source, size_t sourceSize, unsigned* workSpace)
+{
+    if (*maxSymbolValuePtr < 255)
+        return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 1, workSpace);
+    *maxSymbolValuePtr = 255;
+    return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace);
+}
+
+size_t FSE_count(unsigned* count, unsigned* maxSymbolValuePtr,
+                 const void* src, size_t srcSize)
+{
+    unsigned tmpCounters[1024];
+    return FSE_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters);
+}
+
+
+
+/*-**************************************************************
+*  FSE Compression Code
+****************************************************************/
+/*! FSE_sizeof_CTable() :
+    FSE_CTable is a variable size structure which contains :
+    `U16 tableLog;`
+    `U16 maxSymbolValue;`
+    `U16 nextStateNumber[1 << tableLog];`                         // This size is variable
+    `FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];`  // This size is variable
+Allocation is manual (C standard does not support variable-size structures).
+*/
+size_t FSE_sizeof_CTable (unsigned maxSymbolValue, unsigned tableLog)
+{
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    return FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
+}
+
+FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
+{
+    size_t size;
+    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+    size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
+    return (FSE_CTable*)malloc(size);
+}
+
+void FSE_freeCTable (FSE_CTable* ct) { free(ct); }
+
+/* provides the minimum tableLog needed to safely represent a distribution */
+static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+{
+    U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1;
+    U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
+    U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
+    return minBits;
+}
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
+{
+    U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
+    U32 tableLog = maxTableLog;
+    U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (maxBitsSrc < tableLog) tableLog = maxBitsSrc;   /* Accuracy can be reduced */
+    if (minBits > tableLog) tableLog = minBits;   /* Need a minimum to safely represent all symbol values */
+    if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG;
+    if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG;
+    return tableLog;
+}
+
+unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
+}
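+/* [editor's note] Worked example : for srcSize==4096 and maxSymbolValue==255,
+   minBitsSrc = 11+1 = 12 and minBitsSymbols = 7+2 = 9, so FSE_minTableLog()
+   returns 9; with maxTableLog==12 and minus==2, maxBitsSrc = 11-2 = 9, and
+   FSE_optimalTableLog() settles on tableLog 9. */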
+
+
+/* Secondary normalization method.
+   To be used when primary method fails. */
+
+static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue)
+{
+    short const NOT_YET_ASSIGNED = -2;
+    U32 s;
+    U32 distributed = 0;
+    U32 ToDistribute;
+
+    /* Init */
+    U32 const lowThreshold = (U32)(total >> tableLog);
+    U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
+
+    for (s=0; s<=maxSymbolValue; s++) {
+        if (count[s] == 0) {
+            norm[s]=0;
+            continue;
+        }
+        if (count[s] <= lowThreshold) {
+            norm[s] = -1;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+        if (count[s] <= lowOne) {
+            norm[s] = 1;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+
+        norm[s]=NOT_YET_ASSIGNED;
+    }
+    ToDistribute = (1 << tableLog) - distributed;
+
+    if ((total / ToDistribute) > lowOne) {
+        /* risk of rounding to zero */
+        lowOne = (U32)((total * 3) / (ToDistribute * 2));
+        for (s=0; s<=maxSymbolValue; s++) {
+            if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
+                norm[s] = 1;
+                distributed++;
+                total -= count[s];
+                continue;
+        }   }
+        ToDistribute = (1 << tableLog) - distributed;
+    }
+
+    if (distributed == maxSymbolValue+1) {
+        /* all symbol counts are very low;
+           the data is probably incompressible (this should have been detected earlier);
+           find the most frequent symbol and give it all remaining points */
+        U32 maxV = 0, maxC = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            if (count[s] > maxC) maxV=s, maxC=count[s];
+        norm[maxV] += (short)ToDistribute;
+        return 0;
+    }
+
+    if (total == 0) {
+        /* every symbol was rare enough to hit the lowOne or lowThreshold cases */
+        for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1))
+            if (norm[s] > 0) ToDistribute--, norm[s]++;
+        return 0;
+    }
+
+    {   U64 const vStepLog = 62 - tableLog;
+        U64 const mid = (1ULL << (vStepLog-1)) - 1;
+        U64 const rStep = ((((U64)1<<vStepLog) * ToDistribute) + mid) / total;   /* scale on remaining */
+        U64 tmpTotal = mid;
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (norm[s]==NOT_YET_ASSIGNED) {
+                U64 const end = tmpTotal + (count[s] * rStep);
+                U32 const sStart = (U32)(tmpTotal >> vStepLog);
+                U32 const sEnd = (U32)(end >> vStepLog);
+                U32 const weight = sEnd - sStart;
+                if (weight < 1)
+                    return ERROR(GENERIC);
+                norm[s] = (short)weight;
+                tmpTotal = end;
+    }   }   }
+
+    return 0;
+}
+
+
+size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
+                           const unsigned* count, size_t total,
+                           unsigned maxSymbolValue)
+{
+    /* Sanity checks */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported size */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported size */
+    if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC);   /* Too small tableLog, compression potentially impossible */
+
+    {   U32 const rtbTable[] = {     0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
+        U64 const scale = 62 - tableLog;
+        U64 const step = ((U64)1<<62) / total;   /* <== here, one division ! */
+        U64 const vStep = 1ULL<<(scale-20);
+        int stillToDistribute = 1<<tableLog;
+        unsigned s;
+        unsigned largest=0;
+        short largestP=0;
+        U32 lowThreshold = (U32)(total >> tableLog);
+
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (count[s] == total) return 0;   /* rle special case */
+            if (count[s] == 0) { normalizedCounter[s]=0; continue; }
+            if (count[s] <= lowThreshold) {
+                normalizedCounter[s] = -1;
+                stillToDistribute--;
+            } else {
+                short proba = (short)((count[s]*step) >> scale);
+                if (proba<8) {
+                    U64 restToBeat = vStep * rtbTable[proba];
+                    proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
+                }
+                if (proba > largestP) largestP=proba, largest=s;
+                normalizedCounter[s] = proba;
+                stillToDistribute -= proba;
+        }   }
+        if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
+            /* corner case, need another normalization method */
+            size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
+            if (FSE_isError(errorCode)) return errorCode;
+        }
+        else normalizedCounter[largest] += (short)stillToDistribute;
+    }
+
+#if 0
+    {   /* Print Table (debug) */
+        U32 s;
+        U32 nTotal = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            printf("%3i: %4i \n", s, normalizedCounter[s]);
+        for (s=0; s<=maxSymbolValue; s++)
+            nTotal += abs(normalizedCounter[s]);
+        if (nTotal != (1U<<tableLog))
+            printf("Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
+        getchar();
+    }
+#endif
+
+    return tableLog;
+}
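+/* [editor's note] On success the normalized counters sum to 1<<tableLog, with
+   -1 marking symbols so rare they are kept at minimum weight.  Illustrative
+   example (exact output depends on the rounding rules above) : counts
+   {60,30,8,2} at tableLog 6 normalize to something like {38,19,5,2} = 64. */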
+
+
+/* fake FSE_CTable, for raw (uncompressed) input */
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
+{
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSymbolValue = tableMask;
+    void* const ptr = ct;
+    U16* const tableU16 = ( (U16*) ptr) + 2;
+    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1);   /* assumption : tableLog >= 1 */
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);             /* min size */
+
+    /* header */
+    tableU16[-2] = (U16) nbBits;
+    tableU16[-1] = (U16) maxSymbolValue;
+
+    /* Build table */
+    for (s=0; s<tableSize; s++)
+        tableU16[s] = (U16)(tableSize + s);
+
+    /* Build Symbol Transformation Table */
+    {   const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
+        for (s=0; s<=maxSymbolValue; s++) {
+            symbolTT[s].deltaNbBits = deltaNbBits;
+            symbolTT[s].deltaFindState = s-1;
+    }   }
+
+    return 0;
+}
+
+/* fake FSE_CTable, for rle input (always same symbol) */
+size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
+{
+    void* ptr = ct;
+    U16* tableU16 = ( (U16*) ptr) + 2;
+    void* FSCTptr = (U32*)ptr + 2;
+    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) FSCTptr;
+
+    /* header */
+    tableU16[-2] = (U16) 0;
+    tableU16[-1] = (U16) symbolValue;
+
+    /* Build table */
+    tableU16[0] = 0;
+    tableU16[1] = 0;   /* just in case */
+
+    /* Build Symbol Transformation Table */
+    symbolTT[symbolValue].deltaNbBits = 0;
+    symbolTT[symbolValue].deltaFindState = 0;
+
+    return 0;
+}
+
+
+static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct, const unsigned fast)
+{
+    const BYTE* const istart = (const BYTE*) src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip=iend;
+
+    BIT_CStream_t bitC;
+    FSE_CState_t CState1, CState2;
+
+    /* init */
+    if (srcSize <= 2) return 0;
+    { size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
+      if (FSE_isError(initError)) return 0; /* not enough space available to write a bitstream */ }
+
+#define FSE_FLUSHBITS(s)  (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
+
+    if (srcSize & 1) {
+        FSE_initCState2(&CState1, ct, *--ip);
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    } else {
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_initCState2(&CState1, ct, *--ip);
+    }
+
+    /* join to mod 4 */
+    srcSize -= 2;
+    if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) {  /* test bit 2 */
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    /* 2 or 4 encoding per loop */
+    while ( ip>istart ) {
+
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 )   /* this test must be static */
+            FSE_FLUSHBITS(&bitC);
+
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) {  /* this test must be static */
+            FSE_encodeSymbol(&bitC, &CState2, *--ip);
+            FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        }
+
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    FSE_flushCState(&bitC, &CState2);
+    FSE_flushCState(&bitC, &CState1);
+    return BIT_closeCStream(&bitC);
+}
+
+size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct)
+{
+    unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize));
+
+    if (fast)
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
+    else
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
+}
+
+
+size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
+
+#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e   /* return e, not f, so the expression is evaluated only once */
+#define CHECK_F(f)   { CHECK_V_F(_var_err__, f); }
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be large enough; it is checked below against `FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)`.
+ */
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+
+    U32   count[FSE_MAX_SYMBOL_VALUE+1];
+    S16   norm[FSE_MAX_SYMBOL_VALUE+1];
+    FSE_CTable* CTable = (FSE_CTable*)workSpace;
+    size_t const CTableSize = FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue);
+    void* scratchBuffer = (void*)(CTable + CTableSize);
+    size_t const scratchBufferSize = wkspSize - (CTableSize * sizeof(FSE_CTable));
+
+    /* init conditions */
+    if (wkspSize < FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge);
+    if (srcSize <= 1) return 0;  /* Not compressible */
+    if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(maxCount, FSE_count(count, &maxSymbolValue, src, srcSize) );
+        if (maxCount == srcSize) return 1;   /* only a single symbol in src : rle */
+        if (maxCount == 1) return 0;         /* each symbol is present at most once => not compressible */
+        if (maxCount < (srcSize >> 7)) return 0;   /* Heuristic : not compressible enough */
+    }
+
+    tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
+    CHECK_F( FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue) );
+
+    /* Write table description header */
+    {   CHECK_V_F(nc_err, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) );
+        op += nc_err;
+    }
+
+    /* Compress */
+    CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, scratchBufferSize) );
+    {   CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, src, srcSize, CTable) );
+        if (cSize == 0) return 0;   /* not enough space for compressed data */
+        op += cSize;
+    }
+
+    /* check compressibility */
+    if ( (size_t)(op-ostart) >= srcSize-1 ) return 0;
+
+    return op-ostart;
+}
+
+typedef struct {
+    FSE_CTable CTable_max[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
+    BYTE scratchBuffer[1 << FSE_MAX_TABLELOG];
+} fseWkspMax_t;
+
+size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
+{
+    fseWkspMax_t scratchBuffer;
+    FSE_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE));   /* compilation failures here means scratchBuffer is not large enough */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer));
+}
+
+size_t FSE_compress (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return FSE_compress2(dst, dstCapacity, src, srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG);
+}
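+
+#if 0   /* [editor's sketch, not upstream code] End-to-end use of the one-shot
+           API.  FSE_compressBound(), FSE_isError() and FSE_decompress() are
+           declared in fse.h.  Returns 1 when the round trip succeeds. */
+static int fse_roundtrip(const void* src, size_t srcSize)
+{
+    size_t const cBound = FSE_compressBound(srcSize);
+    void* const cBuf = malloc(cBound);
+    void* const rBuf = malloc(srcSize);
+    int ok = 0;
+    if (cBuf && rBuf) {
+        size_t const cSize = FSE_compress(cBuf, cBound, src, srcSize);
+        if (!FSE_isError(cSize) && cSize > 1) {   /* 0 => raw, 1 => rle : no FSE frame produced */
+            size_t const dSize = FSE_decompress(rBuf, srcSize, cBuf, cSize);
+            ok = !FSE_isError(dSize) && (dSize == srcSize)
+                 && !memcmp(src, rBuf, srcSize);
+        }
+    }
+    free(cBuf); free(rBuf);
+    return ok;
+}
+#endif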
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/zstd/lib/compress/fse_compress.o b/zstd/lib/compress/fse_compress.o
new file mode 100644
index 0000000..81ec47d
Binary files /dev/null and b/zstd/lib/compress/fse_compress.o differ
diff --git a/zstd/lib/compress/huf_compress.c b/zstd/lib/compress/huf_compress.c
new file mode 100644
index 0000000..fe11aaf
--- /dev/null
+++ b/zstd/lib/compress/huf_compress.c
@@ -0,0 +1,684 @@
+/* ******************************************************************
+   Huffman encoder, part of New Generation Entropy library
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+#include "bitstream.h"
+#define FSE_STATIC_LINKING_ONLY   /* FSE_optimalTableLog_internal */
+#include "fse.h"        /* header compression */
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e   /* return e, not f, so the expression is evaluated only once */
+#define CHECK_F(f)   { CHECK_V_F(_var_err__, f); }
+
+
+/* **************************************************************
+*  Utils
+****************************************************************/
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+}
+
+
+/* *******************************************************
+*  HUF : Huffman block compression
+*********************************************************/
+/* HUF_compressWeights() :
+ * Same as FSE_compress(), but dedicated to huff0's weights compression.
+ * The use case needs much less stack memory.
+ * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
+ */
+#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
+size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+
+    U32 maxSymbolValue = HUF_TABLELOG_MAX;
+    U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
+
+    FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];
+    BYTE scratchBuffer[1<<MAX_FSE_TABLELOG_FOR_HUFF_HEADER];
+
+    U32 count[HUF_TABLELOG_MAX+1];
+    S16 norm[HUF_TABLELOG_MAX+1];
+
+    /* init conditions */
+    if (wtSize <= 1) return 0;  /* Not compressible */
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(maxCount, FSE_count_simple(count, &maxSymbolValue, weightTable, wtSize) );
+        if (maxCount == wtSize) return 1;   /* only a single symbol in src : rle */
+        if (maxCount == 1) return 0;         /* each symbol is present at most once => not compressible */
+    }
+
+    tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
+    CHECK_F( FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue) );
+
+    /* Write table description header */
+    {   CHECK_V_F(hSize, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) );
+        op += hSize;
+    }
+
+    /* Compress */
+    CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, sizeof(scratchBuffer)) );
+    {   CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, weightTable, wtSize, CTable) );
+        if (cSize == 0) return 0;   /* not enough space for compressed data */
+        op += cSize;
+    }
+
+    return op-ostart;
+}
+
+
+struct HUF_CElt_s {
+  U16  val;
+  BYTE nbBits;
+};   /* typedef'd to HUF_CElt within "huf.h" */
+
+/*! HUF_writeCTable() :
+    `CTable` : Huffman tree to save, using huf representation.
+    @return : size of saved CTable */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize,
+                        const HUF_CElt* CTable, U32 maxSymbolValue, U32 huffLog)
+{
+    BYTE bitsToWeight[HUF_TABLELOG_MAX + 1];   /* precomputed conversion table */
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX];
+    BYTE* op = (BYTE*)dst;
+    U32 n;
+
+     /* check conditions */
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+
+    /* convert to weight */
+    bitsToWeight[0] = 0;
+    for (n=1; n<huffLog+1; n++)
+        bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
+    for (n=0; n<maxSymbolValue; n++)
+        huffWeight[n] = bitsToWeight[CTable[n].nbBits];
+
+    /* attempt weights compression by FSE */
+    {   CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, huffWeight, maxSymbolValue) );
+        if ((hSize>1) & (hSize < maxSymbolValue/2)) {   /* FSE compressed */
+            op[0] = (BYTE)hSize;
+            return hSize+1;
+    }   }
+
+    /* write raw values as 4-bits (max : 15) */
+    if (maxSymbolValue > (256-128)) return ERROR(GENERIC);   /* should not happen : likely means source cannot be compressed */
+    if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall);   /* not enough space within dst buffer */
+    op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1));
+    huffWeight[maxSymbolValue] = 0;   /* to be sure it doesn't cause msan issue in final combination */
+    for (n=0; n<maxSymbolValue; n+=2)
+        op[(n/2)+1] = (BYTE)((huffWeight[n] << 4) + huffWeight[n+1]);
+    return ((maxSymbolValue+1)/2) + 1;
+}
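+
+/* [editor's note] Header format recap, as read back by HUF_readStats() : a
+   first byte below 128 is the size of an FSE-compressed weight stream; a
+   first byte >= 128 means (byte - 127) raw weights follow, packed as 4-bit
+   nibbles, high nibble first, the last symbol's weight being implicit. */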
+
+
+size_t HUF_readCTable (HUF_CElt* CTable, U32 maxSymbolValue, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];   /* init not required, even though some static analyzer may complain */
+    U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+
+    /* get symbol weights */
+    CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
+
+    /* check result */
+    if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    if (nbSymbols > maxSymbolValue+1) return ERROR(maxSymbolValue_tooSmall);
+
+    /* Prepare base value per rank */
+    {   U32 n, nextRankStart = 0;
+        for (n=1; n<=tableLog; n++) {
+            U32 current = nextRankStart;
+            nextRankStart += (rankVal[n] << (n-1));
+            rankVal[n] = current;
+    }   }
+
+    /* fill nbBits */
+    {   U32 n; for (n=0; n<nbSymbols; n++) {
+            const U32 w = huffWeight[n];
+            CTable[n].nbBits = (BYTE)(tableLog + 1 - w);
+    }   }
+
+    /* fill val */
+    {   U16 nbPerRank[HUF_TABLELOG_MAX+2]  = {0};  /* support w=0=>n=tableLog+1 */
+        U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
+        { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; }
+        /* determine starting value per rank */
+        valPerRank[tableLog+1] = 0;   /* for w==0 */
+        {   U16 min = 0;
+            U32 n; for (n=tableLog; n>0; n--) {  /* start at n=tablelog <-> w=1 */
+                valPerRank[n] = min;     /* get starting value within each rank */
+                min += nbPerRank[n];
+                min >>= 1;
+        }   }
+        /* assign value within rank, symbol order */
+        { U32 n; for (n=0; n<=maxSymbolValue; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; }
+    }
+
+    return readSize;
+}
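+
+/* Worked example (illustration, not upstream text) : reading back a table
+   with tableLog==3, a weight-2 symbol gets nbBits = tableLog+1-2 = 2.
+   The rankVal[] pass turns the per-weight histogram into starting offsets
+   (nextRankStart advances by rankVal[n] << (n-1)), after which canonical code
+   values are handed out in plain symbol order within each bit-length rank. */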
+
+
+typedef struct nodeElt_s {
+    U32 count;
+    U16 parent;
+    BYTE byte;
+    BYTE nbBits;
+} nodeElt;
+
+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+{
+    const U32 largestBits = huffNode[lastNonNull].nbBits;
+    if (largestBits <= maxNbBits) return largestBits;   /* early exit : no elt > maxNbBits */
+
+    /* several elements are too large : at least two of them use more than maxNbBits */
+    {   int totalCost = 0;
+        const U32 baseCost = 1 << (largestBits - maxNbBits);
+        U32 n = lastNonNull;
+
+        while (huffNode[n].nbBits > maxNbBits) {
+            totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
+            huffNode[n].nbBits = (BYTE)maxNbBits;
+            n --;
+        }  /* n stops at huffNode[n].nbBits <= maxNbBits */
+        while (huffNode[n].nbBits == maxNbBits) n--;   /* n ends at the index of the smallest symbol using < maxNbBits */
+
+        /* renorm totalCost */
+        totalCost >>= (largestBits - maxNbBits);  /* note : totalCost is necessarily a multiple of baseCost */
+
+        /* repay normalized cost */
+        {   U32 const noSymbol = 0xF0F0F0F0;
+            U32 rankLast[HUF_TABLELOG_MAX+2];
+            int pos;
+
+            /* Get pos of last (smallest) symbol per rank */
+            memset(rankLast, 0xF0, sizeof(rankLast));
+            {   U32 currentNbBits = maxNbBits;
+                for (pos=n ; pos >= 0; pos--) {
+                    if (huffNode[pos].nbBits >= currentNbBits) continue;
+                    currentNbBits = huffNode[pos].nbBits;   /* < maxNbBits */
+                    rankLast[maxNbBits-currentNbBits] = pos;
+            }   }
+
+            while (totalCost > 0) {
+                U32 nBitsToDecrease = BIT_highbit32(totalCost) + 1;
+                for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
+                    U32 highPos = rankLast[nBitsToDecrease];
+                    U32 lowPos = rankLast[nBitsToDecrease-1];
+                    if (highPos == noSymbol) continue;
+                    if (lowPos == noSymbol) break;
+                    {   U32 const highTotal = huffNode[highPos].count;
+                        U32 const lowTotal = 2 * huffNode[lowPos].count;
+                        if (highTotal <= lowTotal) break;
+                }   }
+                /* only triggered when no rank-1 symbol is left => find the closest one (there is necessarily at least one !) */
+                while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol))  /* HUF_TABLELOG_MAX test just to please gcc 5+; it should not be necessary */
+                    nBitsToDecrease ++;
+                totalCost -= 1 << (nBitsToDecrease-1);
+                if (rankLast[nBitsToDecrease-1] == noSymbol)
+                    rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease];   /* this rank is no longer empty */
+                huffNode[rankLast[nBitsToDecrease]].nbBits ++;
+                if (rankLast[nBitsToDecrease] == 0)    /* special case, reached largest symbol */
+                    rankLast[nBitsToDecrease] = noSymbol;
+                else {
+                    rankLast[nBitsToDecrease]--;
+                    if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
+                        rankLast[nBitsToDecrease] = noSymbol;   /* this rank is now empty */
+            }   }   /* while (totalCost > 0) */
+
+            while (totalCost < 0) {  /* sometimes, the cost correction overshoots */
+                if (rankLast[1] == noSymbol) {  /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
+                    while (huffNode[n].nbBits == maxNbBits) n--;
+                    huffNode[n+1].nbBits--;
+                    rankLast[1] = n+1;
+                    totalCost++;
+                    continue;
+                }
+                huffNode[ rankLast[1] + 1 ].nbBits--;
+                rankLast[1]++;
+                totalCost ++;
+    }   }   }   /* end : several elements were too large (at least two) */
+
+    return maxNbBits;
+}
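+
+/* Worked example (illustration, not upstream text) : with largestBits==13 and
+   maxNbBits==11, baseCost = 1<<(13-11) = 4. Truncating a 12-bit symbol to
+   11 bits contributes 4 - (1<<(13-12)) = 2 to totalCost; after the
+   renormalization (totalCost >>= 2), each remaining unit of debt is repaid by
+   lengthening one shorter symbol by one bit, which pays back
+   1 << (nBitsToDecrease-1) units per symbol. */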
+
+
+typedef struct {
+    U32 base;
+    U32 current;
+} rankPos;
+
+static void HUF_sort(nodeElt* huffNode, const U32* count, U32 maxSymbolValue)
+{
+    rankPos rank[32];
+    U32 n;
+
+    memset(rank, 0, sizeof(rank));
+    for (n=0; n<=maxSymbolValue; n++) {
+        U32 r = BIT_highbit32(count[n] + 1);
+        rank[r].base ++;
+    }
+    for (n=30; n>0; n--) rank[n-1].base += rank[n].base;
+    for (n=0; n<32; n++) rank[n].current = rank[n].base;
+    for (n=0; n<=maxSymbolValue; n++) {
+        U32 const c = count[n];
+        U32 const r = BIT_highbit32(c+1) + 1;
+        U32 pos = rank[r].current++;
+        while ((pos > rank[r].base) && (c > huffNode[pos-1].count)) huffNode[pos]=huffNode[pos-1], pos--;
+        huffNode[pos].count = c;
+        huffNode[pos].byte  = (BYTE)n;
+    }
+}
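+
+/* Note (illustration, not upstream text) : HUF_sort() is a bucket sort keyed
+   by r = BIT_highbit32(count+1), i.e. counts grouped by power-of-two
+   magnitude. The backward cumulative pass makes rank[r].base the first slot
+   of each bucket (larger counts land at lower indexes), and the small
+   insertion loop orders symbols within a bucket, so huffNode[] ends up
+   sorted by decreasing count. */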
+
+
+/** HUF_buildCTable_wksp() :
+ *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ *  `workSpace` must be aligned on a 4-byte boundary, and be at least as large as a table of 1024 unsigned.
+ */
+#define STARTNODE (HUF_SYMBOLVALUE_MAX+1)
+typedef nodeElt huffNodeTable[2*HUF_SYMBOLVALUE_MAX+1 +1];   /* 2*N+1 tree nodes, plus one fake barrier entry (huffNode0[0]) */
+size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+{
+    nodeElt* const huffNode0 = (nodeElt*)workSpace;
+    nodeElt* const huffNode = huffNode0+1;
+    U32 n, nonNullRank;
+    int lowS, lowN;
+    U16 nodeNb = STARTNODE;
+    U32 nodeRoot;
+
+    /* safety checks */
+    if (wkspSize < sizeof(huffNodeTable)) return ERROR(GENERIC);   /* workSpace is not large enough */
+    if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(GENERIC);
+    memset(huffNode0, 0, sizeof(huffNodeTable));
+
+    /* sort, decreasing order */
+    HUF_sort(huffNode, count, maxSymbolValue);
+
+    /* init for parents */
+    nonNullRank = maxSymbolValue;
+    while(huffNode[nonNullRank].count == 0) nonNullRank--;
+    lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb;
+    huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count;
+    huffNode[lowS].parent = huffNode[lowS-1].parent = nodeNb;
+    nodeNb++; lowS-=2;
+    for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = (U32)(1U<<30);
+    huffNode0[0].count = (U32)(1U<<31);  /* fake entry, strong barrier */
+
+    /* create parents */
+    while (nodeNb <= nodeRoot) {
+        U32 n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+        U32 n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+        huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count;
+        huffNode[n1].parent = huffNode[n2].parent = nodeNb;
+        nodeNb++;
+    }
+
+    /* distribute weights (unlimited tree height) */
+    huffNode[nodeRoot].nbBits = 0;
+    for (n=nodeRoot-1; n>=STARTNODE; n--)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+    for (n=0; n<=nonNullRank; n++)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+
+    /* enforce maxTableLog */
+    maxNbBits = HUF_setMaxHeight(huffNode, nonNullRank, maxNbBits);
+
+    /* fill result into tree (val, nbBits) */
+    {   U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
+        U16 valPerRank[HUF_TABLELOG_MAX+1] = {0};
+        if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC);   /* check fit into table */
+        for (n=0; n<=nonNullRank; n++)
+            nbPerRank[huffNode[n].nbBits]++;
+        /* determine starting value per rank */
+        {   U16 min = 0;
+            for (n=maxNbBits; n>0; n--) {
+                valPerRank[n] = min;      /* get starting value within each rank */
+                min += nbPerRank[n];
+                min >>= 1;
+        }   }
+        for (n=0; n<=maxSymbolValue; n++)
+            tree[huffNode[n].byte].nbBits = huffNode[n].nbBits;   /* push nbBits per symbol, symbol order */
+        for (n=0; n<=maxSymbolValue; n++)
+            tree[n].val = valPerRank[tree[n].nbBits]++;   /* assign value within rank, symbol order */
+    }
+
+    return maxNbBits;
+}
+
+/** HUF_buildCTable() :
+ *  Note : count is used before tree is written, so they can safely overlap
+ */
+size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits)
+{
+    huffNodeTable nodeTable;
+    return HUF_buildCTable_wksp(tree, count, maxSymbolValue, maxNbBits, nodeTable, sizeof(nodeTable));
+}
+
+static size_t HUF_estimateCompressedSize(HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
+{
+    size_t nbBits = 0;
+    int s;
+    for (s = 0; s <= (int)maxSymbolValue; ++s) {
+        nbBits += CTable[s].nbBits * count[s];
+    }
+    return nbBits >> 3;
+}
+
+static int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+  int bad = 0;
+  int s;
+  for (s = 0; s <= (int)maxSymbolValue; ++s) {
+    bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
+  }
+  return !bad;
+}
+
+static void HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
+{
+    BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
+}
+
+size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
+
+#define HUF_FLUSHBITS(s)  (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
+
+#define HUF_FLUSHBITS_1(stream) \
+    if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream)
+
+#define HUF_FLUSHBITS_2(stream) \
+    if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream)
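+
+/* Note (illustration; assumes HUF_TABLELOG_MAX == 12) : with a 64-bit
+   bitContainer, 64 >= 12*4+7, so both HUF_FLUSHBITS_1 and HUF_FLUSHBITS_2
+   compile to nothing and four symbols are accumulated between flushes; with a
+   32-bit container only HUF_FLUSHBITS_2 remains active (32 < 12*4+7), giving
+   a flush every two symbols. */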
+
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+    const BYTE* ip = (const BYTE*) src;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+    size_t n;
+    const unsigned fast = (dstSize >= HUF_BLOCKBOUND(srcSize));
+    BIT_CStream_t bitC;
+
+    /* init */
+    if (dstSize < 8) return 0;   /* not enough space to compress */
+    { size_t const initErr = BIT_initCStream(&bitC, op, oend-op);
+      if (HUF_isError(initErr)) return 0; }
+
+    n = srcSize & ~3;  /* round n down to a multiple of 4 */
+    switch (srcSize & 3)
+    {
+        case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable);
+                 HUF_FLUSHBITS_2(&bitC);
+                 /* fall through */
+        case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable);
+                 HUF_FLUSHBITS_1(&bitC);
+                 /* fall through */
+        case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable);
+                 HUF_FLUSHBITS(&bitC);
+                 /* fall through */
+        case 0 :
+        default: ;
+    }
+
+    for (; n>0; n-=4) {  /* note : n&3==0 at this stage */
+        HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
+        HUF_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
+        HUF_FLUSHBITS_2(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 3], CTable);
+        HUF_FLUSHBITS_1(&bitC);
+        HUF_encodeSymbol(&bitC, ip[n- 4], CTable);
+        HUF_FLUSHBITS(&bitC);
+    }
+
+    return BIT_closeCStream(&bitC);
+}
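+
+/* Note (illustration, not upstream text) : symbols are pushed starting from
+   the end of the input (ip[n-1] down to ip[0], after the mod-4 remainder),
+   matching the bitstream convention of this codebase in which the decoder
+   reads the stream backwards and regenerates data in forward order. */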
+
+
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+    size_t const segmentSize = (srcSize+3)/4;   /* first 3 segments */
+    const BYTE* ip = (const BYTE*) src;
+    const BYTE* const iend = ip + srcSize;
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+
+    if (dstSize < 6 + 1 + 1 + 1 + 8) return 0;   /* minimum space to compress successfully */
+    if (srcSize < 12) return 0;   /* no saving possible : too small input */
+    op += 6;   /* jumpTable */
+
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable) );
+        if (cSize==0) return 0;
+        MEM_writeLE16(ostart, (U16)cSize);
+        op += cSize;
+    }
+
+    ip += segmentSize;
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable) );
+        if (cSize==0) return 0;
+        MEM_writeLE16(ostart+2, (U16)cSize);
+        op += cSize;
+    }
+
+    ip += segmentSize;
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable) );
+        if (cSize==0) return 0;
+        MEM_writeLE16(ostart+4, (U16)cSize);
+        op += cSize;
+    }
+
+    ip += segmentSize;
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, iend-ip, CTable) );
+        if (cSize==0) return 0;
+        op += cSize;
+    }
+
+    return op-ostart;
+}
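+
+/* Worked example (illustration, not upstream text) : for srcSize==1000,
+   segmentSize = (1000+3)/4 = 250, so the four streams cover 250+250+250+250
+   bytes. The 6-byte jumpTable stores the compressed sizes of the first three
+   streams as little-endian U16; the size of the fourth stream is implied by
+   the total block size. */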
+
+
+static size_t HUF_compressCTable_internal(
+                BYTE* const ostart, BYTE* op, BYTE* const oend,
+                const void* src, size_t srcSize,
+                unsigned singleStream, const HUF_CElt* CTable)
+{
+    size_t const cSize = singleStream ?
+                         HUF_compress1X_usingCTable(op, oend - op, src, srcSize, CTable) :
+                         HUF_compress4X_usingCTable(op, oend - op, src, srcSize, CTable);
+    if (HUF_isError(cSize)) { return cSize; }
+    if (cSize==0) { return 0; }   /* not compressible */
+    op += cSize;
+    /* check compressibility */
+    if ((size_t)(op-ostart) >= srcSize-1) { return 0; }
+    return op-ostart;
+}
+
+
+/* `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+static size_t HUF_compress_internal (
+                void* dst, size_t dstSize,
+                const void* src, size_t srcSize,
+                unsigned maxSymbolValue, unsigned huffLog,
+                unsigned singleStream,
+                void* workSpace, size_t wkspSize,
+                HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart;
+
+    U32* count;
+    size_t const countSize = sizeof(U32) * (HUF_SYMBOLVALUE_MAX + 1);
+    HUF_CElt* CTable;
+    size_t const CTableSize = sizeof(HUF_CElt) * (HUF_SYMBOLVALUE_MAX + 1);
+
+    /* checks & inits */
+    if (wkspSize < sizeof(huffNodeTable) + countSize + CTableSize) return ERROR(GENERIC);
+    if (!srcSize) return 0;  /* Uncompressed (note : 1 means rle, so first byte must be correct) */
+    if (!dstSize) return 0;  /* cannot fit within dst budget */
+    if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong);   /* current block size limit */
+    if (huffLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX;
+    if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;
+
+    count = (U32*)workSpace;
+    workSpace = (BYTE*)workSpace + countSize;
+    wkspSize -= countSize;
+    CTable = (HUF_CElt*)workSpace;
+    workSpace = (BYTE*)workSpace + CTableSize;
+    wkspSize -= CTableSize;
+
+    /* Heuristic : if we don't need to check the validity of the old table, use it for small inputs */
+    if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
+        return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable);
+    }
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(largest, FSE_count_wksp (count, &maxSymbolValue, (const BYTE*)src, srcSize, (U32*)workSpace) );
+        if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; }   /* single symbol, rle */
+        if (largest <= (srcSize >> 7)+1) return 0;   /* Fast heuristic : not compressible enough */
+    }
+
+    /* Check validity of previous table */
+    if (repeat && *repeat == HUF_repeat_check && !HUF_validateCTable(oldHufTable, count, maxSymbolValue)) {
+        *repeat = HUF_repeat_none;
+    }
+    /* Heuristic : use existing table for small inputs */
+    if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
+        return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable);
+    }
+
+    /* Build Huffman Tree */
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+    {   CHECK_V_F(maxBits, HUF_buildCTable_wksp (CTable, count, maxSymbolValue, huffLog, workSpace, wkspSize) );
+        huffLog = (U32)maxBits;
+        /* Zero the unused symbols so the table can later be checked for validity */
+        memset(CTable + maxSymbolValue + 1, 0, CTableSize - (maxSymbolValue + 1) * sizeof(HUF_CElt));
+    }
+
+    /* Write table description header */
+    {   CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, CTable, maxSymbolValue, huffLog) );
+        /* Check if using the previous table will be beneficial */
+        if (repeat && *repeat != HUF_repeat_none) {
+            size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, count, maxSymbolValue);
+            size_t const newSize = HUF_estimateCompressedSize(CTable, count, maxSymbolValue);
+            if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
+                return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable);
+            }
+        }
+        /* Use the new table */
+        if (hSize + 12ul >= srcSize) { return 0; }
+        op += hSize;
+        if (repeat) { *repeat = HUF_repeat_none; }
+        if (oldHufTable) { memcpy(oldHufTable, CTable, CTableSize); } /* Save the new table */
+    }
+    return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, CTable);
+}
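+
+/* Note (illustration; sizes assume HUF_SYMBOLVALUE_MAX == 255) : the
+   workspace above is carved into count[] (256 U32 = 1024 bytes), a shadow
+   CTable (1024 bytes), and the remainder for huffNodeTable (4096 bytes),
+   i.e. 6144 bytes in total (the codebase's HUF_WORKSPACE_SIZE) -- hence the
+   buffer sizes used by the convenience wrappers below. */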
+
+
+size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, NULL, NULL, 0);
+}
+
+size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize,
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, hufTable, repeat, preferRepeat);
+}
+
+size_t HUF_compress1X (void* dst, size_t dstSize,
+                 const void* src, size_t srcSize,
+                 unsigned maxSymbolValue, unsigned huffLog)
+{
+    unsigned workSpace[HUF_WORKSPACE_SIZE_U32];   /* large enough for the counts, CTable and huffNodeTable required by HUF_compress_internal() */
+    return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
+}
+
+size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, NULL, NULL, 0);
+}
+
+size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
+                      const void* src, size_t srcSize,
+                      unsigned maxSymbolValue, unsigned huffLog,
+                      void* workSpace, size_t wkspSize,
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat)
+{
+    return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, hufTable, repeat, preferRepeat);
+}
+
+size_t HUF_compress2 (void* dst, size_t dstSize,
+                const void* src, size_t srcSize,
+                unsigned maxSymbolValue, unsigned huffLog)
+{
+    unsigned workSpace[HUF_WORKSPACE_SIZE_U32];   /* large enough for the counts, CTable and huffNodeTable required by HUF_compress_internal() */
+    return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
+}
+
+size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+    return HUF_compress2(dst, maxDstSize, src, srcSize, 255, HUF_TABLELOG_DEFAULT);   /* pass srcSize untruncated */
+}
diff --git a/zstd/lib/compress/huf_compress.o b/zstd/lib/compress/huf_compress.o
new file mode 100644
index 0000000..273dd28
Binary files /dev/null and b/zstd/lib/compress/huf_compress.o differ
diff --git a/zstd/lib/compress/zstd_compress.c b/zstd/lib/compress/zstd_compress.c
new file mode 100644
index 0000000..15a9245
--- /dev/null
+++ b/zstd/lib/compress/zstd_compress.c
@@ -0,0 +1,3387 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <string.h>         /* memset */
+#include "mem.h"
+#define FSE_STATIC_LINKING_ONLY   /* FSE_encodeSymbol */
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#include "zstd_internal.h"  /* includes zstd.h */
+
+
+/*-*************************************
+*  Constants
+***************************************/
+static const U32 g_searchStrength = 8;   /* control skip over incompressible data */
+#define HASH_READ_SIZE 8
+typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e;
+
+
+/*-*************************************
+*  Helper functions
+***************************************/
+#define ZSTD_STATIC_ASSERT(c) { enum { ZSTD_static_assert = 1/(int)(!!(c)) }; }
+size_t ZSTD_compressBound(size_t srcSize) { return FSE_compressBound(srcSize) + 12; }
+
+
+/*-*************************************
+*  Sequence storage
+***************************************/
+static void ZSTD_resetSeqStore(seqStore_t* ssPtr)
+{
+    ssPtr->lit = ssPtr->litStart;
+    ssPtr->sequences = ssPtr->sequencesStart;
+    ssPtr->longLengthID = 0;
+}
+
+
+/*-*************************************
+*  Context memory management
+***************************************/
+struct ZSTD_CCtx_s {
+    const BYTE* nextSrc;    /* next block here to continue on current prefix */
+    const BYTE* base;       /* All regular indexes relative to this position */
+    const BYTE* dictBase;   /* extDict indexes relative to this position */
+    U32   dictLimit;        /* below that point, need extDict */
+    U32   lowLimit;         /* below that point, no more data */
+    U32   nextToUpdate;     /* index from which to continue dictionary update */
+    U32   nextToUpdate3;    /* index from which to continue dictionary update */
+    U32   hashLog3;         /* dispatch table : larger == faster, more memory */
+    U32   loadedDictEnd;    /* index of end of dictionary */
+    U32   forceWindow;      /* force back-references to respect limit of 1<<wLog, even for dictionary */
+    U32   forceRawDict;     /* Force loading dictionary in "content-only" mode (no header analysis) */
+    ZSTD_compressionStage_e stage;
+    U32   rep[ZSTD_REP_NUM];
+    U32   repToConfirm[ZSTD_REP_NUM];
+    U32   dictID;
+    ZSTD_parameters params;
+    void* workSpace;
+    size_t workSpaceSize;
+    size_t blockSize;
+    U64 frameContentSize;
+    XXH64_state_t xxhState;
+    ZSTD_customMem customMem;
+
+    seqStore_t seqStore;    /* sequences storage ptrs */
+    U32* hashTable;
+    U32* hashTable3;
+    U32* chainTable;
+    HUF_CElt* hufTable;
+    U32 flagStaticTables;
+    HUF_repeat flagStaticHufTable;
+    FSE_CTable offcodeCTable  [FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
+    FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
+    FSE_CTable litlengthCTable  [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
+    unsigned tmpCounters[HUF_WORKSPACE_SIZE_U32];
+};
+
+ZSTD_CCtx* ZSTD_createCCtx(void)
+{
+    return ZSTD_createCCtx_advanced(defaultCustomMem);
+}
+
+ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem)
+{
+    ZSTD_CCtx* cctx;
+
+    if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem;
+    if (!customMem.customAlloc || !customMem.customFree) return NULL;
+
+    cctx = (ZSTD_CCtx*) ZSTD_malloc(sizeof(ZSTD_CCtx), customMem);
+    if (!cctx) return NULL;
+    memset(cctx, 0, sizeof(ZSTD_CCtx));
+    cctx->customMem = customMem;
+    return cctx;
+}
+
+size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
+{
+    if (cctx==NULL) return 0;   /* support free on NULL */
+    ZSTD_free(cctx->workSpace, cctx->customMem);
+    ZSTD_free(cctx, cctx->customMem);
+    return 0;   /* reserved as a potential error code in the future */
+}
+
+size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx)
+{
+    if (cctx==NULL) return 0;   /* support sizeof on NULL */
+    return sizeof(*cctx) + cctx->workSpaceSize;
+}
+
+size_t ZSTD_setCCtxParameter(ZSTD_CCtx* cctx, ZSTD_CCtxParameter param, unsigned value)
+{
+    switch(param)
+    {
+    case ZSTD_p_forceWindow : cctx->forceWindow = value>0; cctx->loadedDictEnd = 0; return 0;
+    case ZSTD_p_forceRawDict : cctx->forceRawDict = value>0; return 0;
+    default: return ERROR(parameter_unknown);
+    }
+}
+
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx)   /* hidden interface */
+{
+    return &(ctx->seqStore);
+}
+
+static ZSTD_parameters ZSTD_getParamsFromCCtx(const ZSTD_CCtx* cctx)
+{
+    return cctx->params;
+}
+
+
+/** ZSTD_checkCParams() :
+    ensure parameter values remain within the authorized range.
+    @return : 0, or an error code if one value is beyond the authorized range */
+size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+{
+#   define CLAMPCHECK(val,min,max) { if ((val<min) | (val>max)) return ERROR(compressionParameter_unsupported); }
+    CLAMPCHECK(cParams.windowLog, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX);
+    CLAMPCHECK(cParams.chainLog, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX);
+    CLAMPCHECK(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
+    CLAMPCHECK(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
+    { U32 const searchLengthMin = ((cParams.strategy == ZSTD_fast) | (cParams.strategy == ZSTD_greedy)) ? ZSTD_SEARCHLENGTH_MIN+1 : ZSTD_SEARCHLENGTH_MIN;
+      U32 const searchLengthMax = (cParams.strategy == ZSTD_fast) ? ZSTD_SEARCHLENGTH_MAX : ZSTD_SEARCHLENGTH_MAX-1;
+      CLAMPCHECK(cParams.searchLength, searchLengthMin, searchLengthMax); }
+    CLAMPCHECK(cParams.targetLength, ZSTD_TARGETLENGTH_MIN, ZSTD_TARGETLENGTH_MAX);
+    if ((U32)(cParams.strategy) > (U32)ZSTD_btopt2) return ERROR(compressionParameter_unsupported);
+    return 0;
+}
+
+
+/** ZSTD_cycleLog() :
+ *  condition for correct operation : hashLog > 1 */
+static U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
+{
+    U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2);
+    return hashLog - btScale;
+}
+
+/** ZSTD_adjustCParams() :
+    optimize `cPar` for a given input (`srcSize` and `dictSize`).
+    mostly downsizing, to reduce memory consumption and initialization latency.
+    Both `srcSize` and `dictSize` are optional (use 0 if unknown),
+    but if both are 0, no optimization can be done.
+    Note : cPar is considered validated at this stage. Use ZSTD_checkCParams() to ensure that. */
+ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize)
+{
+    if (srcSize+dictSize == 0) return cPar;   /* no size information available : no adjustment */
+
+    /* resize params, to use less memory when necessary */
+    {   U32 const minSrcSize = (srcSize==0) ? 500 : 0;
+        U64 const rSize = srcSize + dictSize + minSrcSize;
+        if (rSize < ((U64)1<<ZSTD_WINDOWLOG_MAX)) {
+            U32 const srcLog = MAX(ZSTD_HASHLOG_MIN, ZSTD_highbit32((U32)(rSize)-1) + 1);
+            if (cPar.windowLog > srcLog) cPar.windowLog = srcLog;
+    }   }
+    if (cPar.hashLog > cPar.windowLog) cPar.hashLog = cPar.windowLog;
+    {   U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy);
+        if (cycleLog > cPar.windowLog) cPar.chainLog -= (cycleLog - cPar.windowLog);
+    }
+
+    if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN;  /* required for frame header */
+
+    return cPar;
+}
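+
+/* Worked example (illustration, not upstream text) : for srcSize==10000 and
+   dictSize==0, rSize==10000 < 1<<ZSTD_WINDOWLOG_MAX, and
+   srcLog = ZSTD_highbit32(9999)+1 = 14, so a requested windowLog of, say, 22
+   is cut down to 14 : there is no point keeping a window larger than the
+   whole input. */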
+
+
+size_t ZSTD_estimateCCtxSize(ZSTD_compressionParameters cParams)
+{
+    size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, (size_t)1 << cParams.windowLog);
+    U32    const divider = (cParams.searchLength==3) ? 3 : 4;
+    size_t const maxNbSeq = blockSize / divider;
+    size_t const tokenSpace = blockSize + 11*maxNbSeq;
+
+    size_t const chainSize = (cParams.strategy == ZSTD_fast) ? 0 : (1 << cParams.chainLog);
+    size_t const hSize = ((size_t)1) << cParams.hashLog;
+    U32    const hashLog3 = (cParams.searchLength>3) ? 0 : MIN(ZSTD_HASHLOG3_MAX, cParams.windowLog);
+    size_t const h3Size = ((size_t)1) << hashLog3;
+    size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
+
+    size_t const optSpace = ((MaxML+1) + (MaxLL+1) + (MaxOff+1) + (1<<Litbits))*sizeof(U32)
+                          + (ZSTD_OPT_NUM+1)*(sizeof(ZSTD_match_t) + sizeof(ZSTD_optimal_t));
+    size_t const neededSpace = tableSpace + (256*sizeof(U32)) /* huffTable */ + tokenSpace
+                             + (((cParams.strategy == ZSTD_btopt) || (cParams.strategy == ZSTD_btopt2)) ? optSpace : 0);
+
+    return sizeof(ZSTD_CCtx) + neededSpace;
+}
+
+
+static U32 ZSTD_equivalentParams(ZSTD_parameters param1, ZSTD_parameters param2)
+{
+    return (param1.cParams.hashLog  == param2.cParams.hashLog)
+         & (param1.cParams.chainLog == param2.cParams.chainLog)
+         & (param1.cParams.strategy == param2.cParams.strategy)
+         & ((param1.cParams.searchLength==3) == (param2.cParams.searchLength==3));
+}
+
+/*! ZSTD_continueCCtx() :
+    reuse CCtx without reset (note : requires no dictionary) */
+static size_t ZSTD_continueCCtx(ZSTD_CCtx* cctx, ZSTD_parameters params, U64 frameContentSize)
+{
+    U32 const end = (U32)(cctx->nextSrc - cctx->base);
+    cctx->params = params;
+    cctx->frameContentSize = frameContentSize;
+    cctx->lowLimit = end;
+    cctx->dictLimit = end;
+    cctx->nextToUpdate = end+1;
+    cctx->stage = ZSTDcs_init;
+    cctx->dictID = 0;
+    cctx->loadedDictEnd = 0;
+    { int i; for (i=0; i<ZSTD_REP_NUM; i++) cctx->rep[i] = repStartValue[i]; }
+    cctx->seqStore.litLengthSum = 0;  /* force reset of btopt stats */
+    XXH64_reset(&cctx->xxhState, 0);
+    return 0;
+}
+
+typedef enum { ZSTDcrp_continue, ZSTDcrp_noMemset, ZSTDcrp_fullReset } ZSTD_compResetPolicy_e;
+
+/*! ZSTD_resetCCtx_advanced() :
+    note : `params` must be validated */
+static size_t ZSTD_resetCCtx_advanced (ZSTD_CCtx* zc,
+                                       ZSTD_parameters params, U64 frameContentSize,
+                                       ZSTD_compResetPolicy_e const crp)
+{
+    if (crp == ZSTDcrp_continue)
+        if (ZSTD_equivalentParams(params, zc->params)) {
+            zc->flagStaticTables = 0;
+            zc->flagStaticHufTable = HUF_repeat_none;
+            return ZSTD_continueCCtx(zc, params, frameContentSize);
+        }
+
+    {   size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, (size_t)1 << params.cParams.windowLog);
+        U32    const divider = (params.cParams.searchLength==3) ? 3 : 4;
+        size_t const maxNbSeq = blockSize / divider;
+        size_t const tokenSpace = blockSize + 11*maxNbSeq;
+        size_t const chainSize = (params.cParams.strategy == ZSTD_fast) ? 0 : (1 << params.cParams.chainLog);
+        size_t const hSize = ((size_t)1) << params.cParams.hashLog;
+        U32    const hashLog3 = (params.cParams.searchLength>3) ? 0 : MIN(ZSTD_HASHLOG3_MAX, params.cParams.windowLog);
+        size_t const h3Size = ((size_t)1) << hashLog3;
+        size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
+        void* ptr;
+
+        /* Check if workSpace is large enough, alloc a new one if needed */
+        {   size_t const optSpace = ((MaxML+1) + (MaxLL+1) + (MaxOff+1) + (1<<Litbits))*sizeof(U32)
+                                  + (ZSTD_OPT_NUM+1)*(sizeof(ZSTD_match_t) + sizeof(ZSTD_optimal_t));
+            size_t const neededSpace = tableSpace + (256*sizeof(U32)) /* huffTable */ + tokenSpace
+                                  + (((params.cParams.strategy == ZSTD_btopt) || (params.cParams.strategy == ZSTD_btopt2)) ? optSpace : 0);
+            if (zc->workSpaceSize < neededSpace) {
+                ZSTD_free(zc->workSpace, zc->customMem);
+                zc->workSpace = ZSTD_malloc(neededSpace, zc->customMem);
+                if (zc->workSpace == NULL) return ERROR(memory_allocation);
+                zc->workSpaceSize = neededSpace;
+        }   }
+
+        if (crp!=ZSTDcrp_noMemset) memset(zc->workSpace, 0, tableSpace);   /* reset tables only */
+        XXH64_reset(&zc->xxhState, 0);
+        zc->hashLog3 = hashLog3;
+        zc->hashTable = (U32*)(zc->workSpace);
+        zc->chainTable = zc->hashTable + hSize;
+        zc->hashTable3 = zc->chainTable + chainSize;
+        ptr = zc->hashTable3 + h3Size;
+        zc->hufTable = (HUF_CElt*)ptr;
+        zc->flagStaticTables = 0;
+        zc->flagStaticHufTable = HUF_repeat_none;
+        ptr = ((U32*)ptr) + 256;  /* note : HUF_CElt* is incomplete type, size is simulated using U32 */
+
+        zc->nextToUpdate = 1;
+        zc->nextSrc = NULL;
+        zc->base = NULL;
+        zc->dictBase = NULL;
+        zc->dictLimit = 0;
+        zc->lowLimit = 0;
+        zc->params = params;
+        zc->blockSize = blockSize;
+        zc->frameContentSize = frameContentSize;
+        { int i; for (i=0; i<ZSTD_REP_NUM; i++) zc->rep[i] = repStartValue[i]; }
+
+        if ((params.cParams.strategy == ZSTD_btopt) || (params.cParams.strategy == ZSTD_btopt2)) {
+            zc->seqStore.litFreq = (U32*)ptr;
+            zc->seqStore.litLengthFreq = zc->seqStore.litFreq + (1<<Litbits);
+            zc->seqStore.matchLengthFreq = zc->seqStore.litLengthFreq + (MaxLL+1);
+            zc->seqStore.offCodeFreq = zc->seqStore.matchLengthFreq + (MaxML+1);
+            ptr = zc->seqStore.offCodeFreq + (MaxOff+1);
+            zc->seqStore.matchTable = (ZSTD_match_t*)ptr;
+            ptr = zc->seqStore.matchTable + ZSTD_OPT_NUM+1;
+            zc->seqStore.priceTable = (ZSTD_optimal_t*)ptr;
+            ptr = zc->seqStore.priceTable + ZSTD_OPT_NUM+1;
+            zc->seqStore.litLengthSum = 0;
+        }
+        zc->seqStore.sequencesStart = (seqDef*)ptr;
+        ptr = zc->seqStore.sequencesStart + maxNbSeq;
+        zc->seqStore.llCode = (BYTE*) ptr;
+        zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq;
+        zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq;
+        zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq;
+
+        zc->stage = ZSTDcs_init;
+        zc->dictID = 0;
+        zc->loadedDictEnd = 0;
+
+        return 0;
+    }
+}
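+
+/* Note (illustration, not upstream text) : after the reset above, the
+   workspace is laid out as
+     [ hashTable | chainTable | hashTable3 | hufTable (256 U32) |
+       (btopt stat tables, if applicable) | sequences | llCode | mlCode |
+       ofCode | literals ]
+   with every pointer derived from the single zc->workSpace allocation. */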
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ *        do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) {
+    int i;
+    for (i=0; i<ZSTD_REP_NUM; i++) cctx->rep[i] = 0;
+}
+
+/*! ZSTD_copyCCtx() :
+*   Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
+*   Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
+*   @return : 0, or an error code */
+size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize)
+{
+    if (srcCCtx->stage!=ZSTDcs_init) return ERROR(stage_wrong);
+
+    memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem));
+    {   ZSTD_parameters params = srcCCtx->params;
+        params.fParams.contentSizeFlag = (pledgedSrcSize > 0);
+        ZSTD_resetCCtx_advanced(dstCCtx, params, pledgedSrcSize, ZSTDcrp_noMemset);
+    }
+
+    /* copy tables */
+    {   size_t const chainSize = (srcCCtx->params.cParams.strategy == ZSTD_fast) ? 0 : (1 << srcCCtx->params.cParams.chainLog);
+        size_t const hSize = ((size_t)1) << srcCCtx->params.cParams.hashLog;
+        size_t const h3Size = (size_t)1 << srcCCtx->hashLog3;
+        size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
+        memcpy(dstCCtx->workSpace, srcCCtx->workSpace, tableSpace);
+    }
+
+    /* copy dictionary offsets */
+    dstCCtx->nextToUpdate = srcCCtx->nextToUpdate;
+    dstCCtx->nextToUpdate3= srcCCtx->nextToUpdate3;
+    dstCCtx->nextSrc      = srcCCtx->nextSrc;
+    dstCCtx->base         = srcCCtx->base;
+    dstCCtx->dictBase     = srcCCtx->dictBase;
+    dstCCtx->dictLimit    = srcCCtx->dictLimit;
+    dstCCtx->lowLimit     = srcCCtx->lowLimit;
+    dstCCtx->loadedDictEnd= srcCCtx->loadedDictEnd;
+    dstCCtx->dictID       = srcCCtx->dictID;
+
+    /* copy entropy tables */
+    dstCCtx->flagStaticTables = srcCCtx->flagStaticTables;
+    dstCCtx->flagStaticHufTable = srcCCtx->flagStaticHufTable;
+    if (srcCCtx->flagStaticTables) {
+        memcpy(dstCCtx->litlengthCTable, srcCCtx->litlengthCTable, sizeof(dstCCtx->litlengthCTable));
+        memcpy(dstCCtx->matchlengthCTable, srcCCtx->matchlengthCTable, sizeof(dstCCtx->matchlengthCTable));
+        memcpy(dstCCtx->offcodeCTable, srcCCtx->offcodeCTable, sizeof(dstCCtx->offcodeCTable));
+    }
+    if (srcCCtx->flagStaticHufTable) {
+        memcpy(dstCCtx->hufTable, srcCCtx->hufTable, 256*4);  /* 256 cells of 4 bytes each (HUF_CElt is an incomplete type here) */
+    }
+
+    return 0;
+}
+
+
+/*! ZSTD_reduceTable() :
+*   reduce table indexes by `reducerValue` */
+static void ZSTD_reduceTable (U32* const table, U32 const size, U32 const reducerValue)
+{
+    U32 u;
+    for (u=0 ; u < size ; u++) {
+        if (table[u] < reducerValue) table[u] = 0;
+        else table[u] -= reducerValue;
+    }
+}
+
+/*! ZSTD_reduceIndex() :
+*   rescale all indexes to avoid future overflow (indexes are U32) */
+static void ZSTD_reduceIndex (ZSTD_CCtx* zc, const U32 reducerValue)
+{
+    { U32 const hSize = 1 << zc->params.cParams.hashLog;
+      ZSTD_reduceTable(zc->hashTable, hSize, reducerValue); }
+
+    { U32 const chainSize = (zc->params.cParams.strategy == ZSTD_fast) ? 0 : (1 << zc->params.cParams.chainLog);
+      ZSTD_reduceTable(zc->chainTable, chainSize, reducerValue); }
+
+    { U32 const h3Size = (zc->hashLog3) ? 1 << zc->hashLog3 : 0;
+      ZSTD_reduceTable(zc->hashTable3, h3Size, reducerValue); }
+}
+
+
+/*-*******************************************************
+*  Block entropic compression
+*********************************************************/
+
+/* See doc/zstd_compression_format.md for detailed format description */
+
+size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    if (srcSize + ZSTD_blockHeaderSize > dstCapacity) return ERROR(dstSize_tooSmall);
+    memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize);
+    MEM_writeLE24(dst, (U32)(srcSize << 2) + (U32)bt_raw);
+    return ZSTD_blockHeaderSize+srcSize;
+}
+
+
+static size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    BYTE* const ostart = (BYTE* const)dst;
+    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+    if (srcSize + flSize > dstCapacity) return ERROR(dstSize_tooSmall);
+
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 */
+            ostart[0] = (BYTE)((U32)set_basic + (srcSize<<3));
+            break;
+        case 2: /* 2 - 2 - 12 */
+            MEM_writeLE16(ostart, (U16)((U32)set_basic + (1<<2) + (srcSize<<4)));
+            break;
+        default:   /* note : should not be necessary : flSize is necessarily within {1,2,3} */
+        case 3: /* 2 - 2 - 20 */
+            MEM_writeLE32(ostart, (U32)((U32)set_basic + (3<<2) + (srcSize<<4)));
+            break;
+    }
+
+    memcpy(ostart + flSize, src, srcSize);
+    return srcSize + flSize;
+}
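+
+/* Worked example (illustration, not upstream text) : raw literals of
+   srcSize==100 need flSize==2 (100 > 31), so the header is the little-endian
+   16-bit value set_basic + (1<<2) + (100<<4) : 2 bits of block type,
+   2 bits of size format, 12 bits of regenerated size. */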
+
+static size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    BYTE* const ostart = (BYTE* const)dst;
+    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+    (void)dstCapacity;  /* dstCapacity already guaranteed to be >=4, hence large enough */
+
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 */
+            ostart[0] = (BYTE)((U32)set_rle + (srcSize<<3));
+            break;
+        case 2: /* 2 - 2 - 12 */
+            MEM_writeLE16(ostart, (U16)((U32)set_rle + (1<<2) + (srcSize<<4)));
+            break;
+        default:   /* note : should not be necessary : flSize is necessarily within {1,2,3} */
+        case 3: /* 2 - 2 - 20 */
+            MEM_writeLE32(ostart, (U32)((U32)set_rle + (3<<2) + (srcSize<<4)));
+            break;
+    }
+
+    ostart[flSize] = *(const BYTE*)src;
+    return flSize+1;
+}
+
+
+static size_t ZSTD_minGain(size_t srcSize) { return (srcSize >> 6) + 2; }
+
+static size_t ZSTD_compressLiterals (ZSTD_CCtx* zc,
+                                     void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize)
+{
+    size_t const minGain = ZSTD_minGain(srcSize);
+    size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
+    BYTE*  const ostart = (BYTE*)dst;
+    U32 singleStream = srcSize < 256;
+    symbolEncodingType_e hType = set_compressed;
+    size_t cLitSize;
+
+
+    /* small ? don't even attempt compression (speed opt) */
+#   define LITERAL_NOENTROPY 63
+    {   size_t const minLitSize = zc->flagStaticHufTable == HUF_repeat_valid ? 6 : LITERAL_NOENTROPY;
+        if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+    }
+
+    if (dstCapacity < lhSize+1) return ERROR(dstSize_tooSmall);   /* not enough space for compression */
+    {   HUF_repeat repeat = zc->flagStaticHufTable;
+        int const preferRepeat = zc->params.cParams.strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
+        if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
+        cLitSize = singleStream ? HUF_compress1X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, zc->tmpCounters, sizeof(zc->tmpCounters), zc->hufTable, &repeat, preferRepeat)
+                                : HUF_compress4X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, zc->tmpCounters, sizeof(zc->tmpCounters), zc->hufTable, &repeat, preferRepeat);
+        if (repeat != HUF_repeat_none) { hType = set_repeat; }    /* reused the existing table */
+        else { zc->flagStaticHufTable = HUF_repeat_check; }       /* now have a table to reuse */
+    }
+
+    if ((cLitSize==0) | (cLitSize >= srcSize - minGain)) {
+        zc->flagStaticHufTable = HUF_repeat_none;
+        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+    }
+    if (cLitSize==1) {
+        zc->flagStaticHufTable = HUF_repeat_none;
+        return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
+    }
+
+    /* Build header */
+    switch(lhSize)
+    {
+    case 3: /* 2 - 2 - 10 - 10 */
+        {   U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
+            MEM_writeLE24(ostart, lhc);
+            break;
+        }
+    case 4: /* 2 - 2 - 14 - 14 */
+        {   U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18);
+            MEM_writeLE32(ostart, lhc);
+            break;
+        }
+    default:   /* should not be necessary, lhSize is only {3,4,5} */
+    case 5: /* 2 - 2 - 18 - 18 */
+        {   U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22);
+            MEM_writeLE32(ostart, lhc);
+            ostart[4] = (BYTE)(cLitSize >> 10);
+            break;
+        }
+    }
+    return lhSize+cLitSize;
+}
+
+static const BYTE LL_Code[64] = {  0,  1,  2,  3,  4,  5,  6,  7,
+                                   8,  9, 10, 11, 12, 13, 14, 15,
+                                  16, 16, 17, 17, 18, 18, 19, 19,
+                                  20, 20, 20, 20, 21, 21, 21, 21,
+                                  22, 22, 22, 22, 22, 22, 22, 22,
+                                  23, 23, 23, 23, 23, 23, 23, 23,
+                                  24, 24, 24, 24, 24, 24, 24, 24,
+                                  24, 24, 24, 24, 24, 24, 24, 24 };
+
+static const BYTE ML_Code[128] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+                                  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                                  32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37,
+                                  38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39,
+                                  40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+                                  41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
+                                  42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+                                  42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 };
+
+
+void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+{
+    BYTE const LL_deltaCode = 19;
+    BYTE const ML_deltaCode = 36;
+    const seqDef* const sequences = seqStorePtr->sequencesStart;
+    BYTE* const llCodeTable = seqStorePtr->llCode;
+    BYTE* const ofCodeTable = seqStorePtr->ofCode;
+    BYTE* const mlCodeTable = seqStorePtr->mlCode;
+    U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+    U32 u;
+    for (u=0; u<nbSeq; u++) {
+        U32 const llv = sequences[u].litLength;
+        U32 const mlv = sequences[u].matchLength;
+        llCodeTable[u] = (llv> 63) ? (BYTE)ZSTD_highbit32(llv) + LL_deltaCode : LL_Code[llv];
+        ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset);
+        mlCodeTable[u] = (mlv>127) ? (BYTE)ZSTD_highbit32(mlv) + ML_deltaCode : ML_Code[mlv];
+    }
+    if (seqStorePtr->longLengthID==1)
+        llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
+    if (seqStorePtr->longLengthID==2)
+        mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
+}
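+
+/* Worked example (illustration, not upstream text) : a literal length of 70
+   exceeds 63, so its code is ZSTD_highbit32(70) + LL_deltaCode = 6 + 19 = 25;
+   a stored match-length value of 100 stays within the table and maps to
+   ML_Code[100] = 42; offsets always use their log2, ZSTD_highbit32(offset). */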
+
+MEM_STATIC size_t ZSTD_compressSequences (ZSTD_CCtx* zc,
+                              void* dst, size_t dstCapacity,
+                              size_t srcSize)
+{
+    const int longOffsets = zc->params.cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+    const seqStore_t* seqStorePtr = &(zc->seqStore);
+    U32 count[MaxSeq+1];
+    S16 norm[MaxSeq+1];
+    FSE_CTable* CTable_LitLength = zc->litlengthCTable;
+    FSE_CTable* CTable_OffsetBits = zc->offcodeCTable;
+    FSE_CTable* CTable_MatchLength = zc->matchlengthCTable;
+    U32 LLtype, Offtype, MLtype;   /* compressed, raw or rle */
+    const seqDef* const sequences = seqStorePtr->sequencesStart;
+    const BYTE* const ofCodeTable = seqStorePtr->ofCode;
+    const BYTE* const llCodeTable = seqStorePtr->llCode;
+    const BYTE* const mlCodeTable = seqStorePtr->mlCode;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+    BYTE* seqHead;
+    BYTE scratchBuffer[1<<MAX(MLFSELog,LLFSELog)];
+
+    /* Compress literals */
+    {   const BYTE* const literals = seqStorePtr->litStart;
+        size_t const litSize = seqStorePtr->lit - literals;
+        size_t const cSize = ZSTD_compressLiterals(zc, op, dstCapacity, literals, litSize);
+        if (ZSTD_isError(cSize)) return cSize;
+        op += cSize;
+    }
+
+    /* Sequences Header */
+    if ((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead */) return ERROR(dstSize_tooSmall);
+    if (nbSeq < 0x7F) *op++ = (BYTE)nbSeq;
+    else if (nbSeq < LONGNBSEQ) op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
+    else op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
+    if (nbSeq==0) goto _check_compressibility;
+
+    /* seqHead : flags for FSE encoding type */
+    seqHead = op++;
+
+#define MIN_SEQ_FOR_DYNAMIC_FSE   64
+#define MAX_SEQ_FOR_STATIC_FSE  1000
+
+    /* convert length/distances into codes */
+    ZSTD_seqToCodes(seqStorePtr);
+
+    /* CTable for Literal Lengths */
+    {   U32 max = MaxLL;
+        size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, zc->tmpCounters);
+        if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
+            *op++ = llCodeTable[0];
+            FSE_buildCTable_rle(CTable_LitLength, (BYTE)max);
+            LLtype = set_rle;
+        } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
+            LLtype = set_repeat;
+        } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (LL_defaultNormLog-1)))) {
+            FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
+            LLtype = set_basic;
+        } else {
+            size_t nbSeq_1 = nbSeq;
+            const U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max);
+            if (count[llCodeTable[nbSeq-1]]>1) { count[llCodeTable[nbSeq-1]]--; nbSeq_1--; }
+            FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
+            { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
+              if (FSE_isError(NCountSize)) return ERROR(GENERIC);
+              op += NCountSize; }
+            FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
+            LLtype = set_compressed;
+    }   }
+
+    /* CTable for Offsets */
+    {   U32 max = MaxOff;
+        size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, zc->tmpCounters);
+        if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
+            *op++ = ofCodeTable[0];
+            FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max);
+            Offtype = set_rle;
+        } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
+            Offtype = set_repeat;
+        } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (OF_defaultNormLog-1)))) {
+            FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, MaxOff, OF_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
+            Offtype = set_basic;
+        } else {
+            size_t nbSeq_1 = nbSeq;
+            const U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max);
+            if (count[ofCodeTable[nbSeq-1]]>1) { count[ofCodeTable[nbSeq-1]]--; nbSeq_1--; }
+            FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
+            { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
+              if (FSE_isError(NCountSize)) return ERROR(GENERIC);
+              op += NCountSize; }
+            FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
+            Offtype = set_compressed;
+    }   }
+
+    /* CTable for MatchLengths */
+    {   U32 max = MaxML;
+        size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, zc->tmpCounters);
+        if ((mostFrequent == nbSeq) && (nbSeq > 2)) {
+            *op++ = *mlCodeTable;
+            FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max);
+            MLtype = set_rle;
+        } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) {
+            MLtype = set_repeat;
+        } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (ML_defaultNormLog-1)))) {
+            FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, scratchBuffer, sizeof(scratchBuffer));
+            MLtype = set_basic;
+        } else {
+            size_t nbSeq_1 = nbSeq;
+            const U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max);
+            if (count[mlCodeTable[nbSeq-1]]>1) { count[mlCodeTable[nbSeq-1]]--; nbSeq_1--; }
+            FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max);
+            { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
+              if (FSE_isError(NCountSize)) return ERROR(GENERIC);
+              op += NCountSize; }
+            FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer));
+            MLtype = set_compressed;
+    }   }
+
+    *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
+    zc->flagStaticTables = 0;
+
+    /* Encoding Sequences */
+    {   BIT_CStream_t blockStream;
+        FSE_CState_t  stateMatchLength;
+        FSE_CState_t  stateOffsetBits;
+        FSE_CState_t  stateLitLength;
+
+        CHECK_E(BIT_initCStream(&blockStream, op, oend-op), dstSize_tooSmall); /* not enough space remaining */
+
+        /* first symbols */
+        FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]);
+        FSE_initCState2(&stateOffsetBits,  CTable_OffsetBits,  ofCodeTable[nbSeq-1]);
+        FSE_initCState2(&stateLitLength,   CTable_LitLength,   llCodeTable[nbSeq-1]);
+        BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
+        if (MEM_32bits()) BIT_flushBits(&blockStream);
+        BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]);
+        if (MEM_32bits()) BIT_flushBits(&blockStream);
+        if (longOffsets) {
+            U32 const ofBits = ofCodeTable[nbSeq-1];
+            int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+            if (extraBits) {
+                BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits);
+                BIT_flushBits(&blockStream);
+            }
+            BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits,
+                        ofBits - extraBits);
+        } else {
+            BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]);
+        }
+        BIT_flushBits(&blockStream);
+
+        {   size_t n;
+            for (n=nbSeq-2 ; n<nbSeq ; n--) {      /* intentional underflow */
+                BYTE const llCode = llCodeTable[n];
+                BYTE const ofCode = ofCodeTable[n];
+                BYTE const mlCode = mlCodeTable[n];
+                U32  const llBits = LL_bits[llCode];
+                U32  const ofBits = ofCode;                                     /* 32b*/  /* 64b*/
+                U32  const mlBits = ML_bits[mlCode];
+                                                                                /* (7)*/  /* (7)*/
+                FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode);       /* 15 */  /* 15 */
+                FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode);      /* 24 */  /* 24 */
+                if (MEM_32bits()) BIT_flushBits(&blockStream);                  /* (7)*/
+                FSE_encodeSymbol(&blockStream, &stateLitLength, llCode);        /* 16 */  /* 33 */
+                if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog)))
+                    BIT_flushBits(&blockStream);                                /* (7)*/
+                BIT_addBits(&blockStream, sequences[n].litLength, llBits);
+                if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
+                BIT_addBits(&blockStream, sequences[n].matchLength, mlBits);
+                if (MEM_32bits()) BIT_flushBits(&blockStream);                  /* (7)*/
+                if (longOffsets) {
+                    int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+                    if (extraBits) {
+                        BIT_addBits(&blockStream, sequences[n].offset, extraBits);
+                        BIT_flushBits(&blockStream);                            /* (7)*/
+                    }
+                    BIT_addBits(&blockStream, sequences[n].offset >> extraBits,
+                                ofBits - extraBits);                            /* 31 */
+                } else {
+                    BIT_addBits(&blockStream, sequences[n].offset, ofBits);     /* 31 */
+                }
+                BIT_flushBits(&blockStream);                                    /* (7)*/
+        }   }
+
+        FSE_flushCState(&blockStream, &stateMatchLength);
+        FSE_flushCState(&blockStream, &stateOffsetBits);
+        FSE_flushCState(&blockStream, &stateLitLength);
+
+        {   size_t const streamSize = BIT_closeCStream(&blockStream);
+            if (streamSize==0) return ERROR(dstSize_tooSmall);   /* not enough space */
+            op += streamSize;
+    }   }
+
+    /* check compressibility */
+_check_compressibility:
+    {   size_t const minGain = ZSTD_minGain(srcSize);
+        size_t const maxCSize = srcSize - minGain;
+        if ((size_t)(op-ostart) >= maxCSize) {
+            zc->flagStaticHufTable = HUF_repeat_none;
+            return 0;
+    }   }
+
+    /* confirm repcodes */
+    { int i; for (i=0; i<ZSTD_REP_NUM; i++) zc->rep[i] = zc->repToConfirm[i]; }
+
+    return op - ostart;
+}
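+
+#if 0 /* illustration only, never compiled : the add-then-flush bitstream
+         discipline used by BIT_addBits()/BIT_flushBits() above, in
+         miniature. This is a hand-rolled sketch, not the real BIT_* API :
+         bits accumulate LSB-first in a 64-bit container, and whole bytes
+         are flushed to the output once enough have accumulated. */
+#include <stdio.h>
+#include <stdint.h>
+int main(void)
+{
+    uint64_t bitContainer = 0;          /* the accumulator */
+    unsigned bitPos = 0;                /* nb of valid bits in it */
+    unsigned char out[16]; unsigned outPos = 0;
+    /* add 5 bits (value 21 = 10101b), then 11 bits (value 1000) */
+    bitContainer |= (uint64_t)21   << bitPos; bitPos += 5;
+    bitContainer |= (uint64_t)1000 << bitPos; bitPos += 11;
+    while (bitPos >= 8) {               /* flush whole bytes */
+        out[outPos++] = (unsigned char)bitContainer;
+        bitContainer >>= 8; bitPos -= 8;
+    }
+    printf("flushed %u bytes, %u bits still pending\n", outPos, bitPos);
+    return 0;
+}
+#endif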
+
+#if 0 /* for debug */
+#  define STORESEQ_DEBUG
+#include <stdio.h>   /* fprintf */
+U32 g_startDebug = 0;
+const BYTE* g_start = NULL;
+#endif
+
+/*! ZSTD_storeSeq() :
+    Store a sequence (literal length, literals, offset code and match length code) into seqStore_t.
+    `offsetCode` : distance to match, or 0 == repCode.
+    `matchCode` : matchLength - MINMATCH
+*/
+MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const void* literals, U32 offsetCode, size_t matchCode)
+{
+#ifdef STORESEQ_DEBUG
+    if (g_startDebug) {
+        if (g_start==NULL) g_start = (const BYTE*)literals;   /* must be set before pos is computed (avoid subtracting from NULL on first call) */
+        {   const U32 pos = (U32)((const BYTE*)literals - g_start);
+            if ((pos > 1895000) && (pos < 1895300))
+                fprintf(stderr, "Cpos %6u :%5u literals & match %3u bytes at distance %6u \n",
+                       pos, (U32)litLength, (U32)matchCode+MINMATCH, (U32)offsetCode);
+    }   }
+#endif
+    /* copy Literals */
+    ZSTD_wildcopy(seqStorePtr->lit, literals, litLength);
+    seqStorePtr->lit += litLength;
+
+    /* literal Length */
+    if (litLength>0xFFFF) { seqStorePtr->longLengthID = 1; seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); }
+    seqStorePtr->sequences[0].litLength = (U16)litLength;
+
+    /* match offset */
+    seqStorePtr->sequences[0].offset = offsetCode + 1;
+
+    /* match Length */
+    if (matchCode>0xFFFF) { seqStorePtr->longLengthID = 2; seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); }
+    seqStorePtr->sequences[0].matchLength = (U16)matchCode;
+
+    seqStorePtr->sequences++;
+}
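+
+#if 0 /* illustration only, never compiled : what one stored sequence looks
+         like. mock_seq is a hypothetical stand-in for the real seqStore
+         entry ; MINMATCH is assumed to be 3 and ZSTD_REP_MOVE to be 2. */
+#include <stdio.h>
+typedef struct { unsigned litLength, offset, matchLength; } mock_seq;
+int main(void)
+{
+    /* input "hello hello" : 6 literals, then a 5-byte match at distance 6 */
+    mock_seq s;
+    s.litLength   = 6;           /* "hello " is copied verbatim */
+    s.offset      = 6 + 2 + 1;   /* a normal match is stored as
+                                    distance + ZSTD_REP_MOVE + 1 ;
+                                    a repcode as offsetCode(0..2) + 1 */
+    s.matchLength = 5 - 3;       /* stored as matchLength - MINMATCH */
+    printf("lit=%u offCode=%u mlCode=%u\n", s.litLength, s.offset, s.matchLength);
+    return 0;
+}
+#endif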
+
+
+/*-*************************************
+*  Match length counter
+***************************************/
+static unsigned ZSTD_NbCommonBytes (register size_t val)
+{
+    if (MEM_isLittleEndian()) {
+        if (MEM_64bits()) {
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanForward64( &r, (U64)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_ctzll((U64)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
+            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r=0;
+            _BitScanForward( &r, (U32)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_ctz((U32)val) >> 3);
+#       else
+            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
+            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+#       endif
+        }
+    } else {  /* Big Endian CPU */
+        if (MEM_64bits()) {
+#       if defined(_MSC_VER) && defined(_WIN64)
+            unsigned long r = 0;
+            _BitScanReverse64( &r, val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_clzll(val) >> 3);
+#       else
+            unsigned r;
+            const unsigned n32 = sizeof(size_t)*4;   /* computed this way to avoid a compiler warning in 32-bit mode */
+            if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
+            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+        } else { /* 32 bits */
+#       if defined(_MSC_VER)
+            unsigned long r = 0;
+            _BitScanReverse( &r, (unsigned long)val );
+            return (unsigned)(r>>3);
+#       elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_clz((U32)val) >> 3);
+#       else
+            unsigned r;
+            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+            r += (!val);
+            return r;
+#       endif
+    }   }
+}
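+
+#if 0 /* illustration only, never compiled : why ctz(diff)>>3 above yields
+         the index of the first mismatching byte. Standalone sketch,
+         assuming GCC/Clang (__builtin_ctzll) and a little-endian target. */
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+int main(void)
+{
+    const char a[8] = {'a','b','c','d','e','f','g','h'};
+    const char b[8] = {'a','b','c','d','X','f','g','h'};   /* differs at byte 4 */
+    uint64_t ua, ub;
+    memcpy(&ua, a, 8); memcpy(&ub, b, 8);   /* little-endian : byte 0 = LSB */
+    /* the lowest set bit of the xor lies inside the first differing byte */
+    printf("common bytes = %d\n", __builtin_ctzll(ua ^ ub) >> 3);   /* -> 4 */
+    return 0;
+}
+#endif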
+
+
+static size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)
+{
+    const BYTE* const pStart = pIn;
+    const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t)-1);
+
+    while (pIn < pInLoopLimit) {
+        size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+        if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
+        pIn += ZSTD_NbCommonBytes(diff);
+        return (size_t)(pIn - pStart);
+    }
+    if (MEM_64bits()) if ((pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
+    if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }
+    if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
+    return (size_t)(pIn - pStart);
+}
+
+/** ZSTD_count_2segments() :
+*   can count match length with `ip` & `match` in 2 different segments.
+*   convention : on reaching mEnd, the match count continues, starting again from iStart.
+*/
+static size_t ZSTD_count_2segments(const BYTE* ip, const BYTE* match, const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart)
+{
+    const BYTE* const vEnd = MIN( ip + (mEnd - match), iEnd);
+    size_t const matchLength = ZSTD_count(ip, match, vEnd);
+    if (match + matchLength != mEnd) return matchLength;
+    return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd);
+}
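+
+#if 0 /* illustration only, never compiled : the 2-segment convention above,
+         redone naively. dict[] plays the old (extDict) segment and cur[]
+         the current one ; all names here are hypothetical. */
+#include <stdio.h>
+static size_t naive_count2(const unsigned char* ip, const unsigned char* match,
+                           const unsigned char* iEnd, const unsigned char* mEnd,
+                           const unsigned char* iStart)
+{
+    size_t n = 0;
+    while (ip < iEnd && *ip == *match) {
+        n++; ip++; match++;
+        if (match == mEnd) match = iStart;   /* the wrap described above */
+    }
+    return n;
+}
+int main(void)
+{
+    /* one logical stream "abcabcabc", split across the two segments */
+    const unsigned char dict[5] = { 'a','b','c','a','b' };   /* indices 0..4 */
+    const unsigned char cur[4]  = { 'c','a','b','c' };       /* indices 5..8 */
+    /* ip = cur, match = "cab" at dict+2 : 3 bytes in dict, then wraps */
+    printf("match length = %u\n",
+           (unsigned)naive_count2(cur, dict+2, cur+4, dict+5, cur));   /* -> 4 */
+    return 0;
+}
+#endif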
+
+
+/*-*************************************
+*  Hashes
+***************************************/
+static const U32 prime3bytes = 506832829U;
+static U32    ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes)  >> (32-h) ; }
+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); }   /* only in zstd_opt.h */
+
+static const U32 prime4bytes = 2654435761U;
+static U32    ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }
+
+static const U64 prime5bytes = 889523592379ULL;
+static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u  << (64-40)) * prime5bytes) >> (64-h)) ; }
+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
+
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+
+static const U64 prime7bytes = 58295818150454627ULL;
+static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u  << (64-56)) * prime7bytes) >> (64-h)) ; }
+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+
+static size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
+{
+    switch(mls)
+    {
+    default:
+    case 4: return ZSTD_hash4Ptr(p, hBits);
+    case 5: return ZSTD_hash5Ptr(p, hBits);
+    case 6: return ZSTD_hash6Ptr(p, hBits);
+    case 7: return ZSTD_hash7Ptr(p, hBits);
+    case 8: return ZSTD_hash8Ptr(p, hBits);
+    }
+}
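+
+#if 0 /* illustration only, never compiled : the multiplicative hashing used
+         above. Multiplying by a large odd constant spreads the input bits
+         upward ; keeping only the top hBits bits then indexes a table of
+         size (1<<hBits). */
+#include <stdio.h>
+#include <stdint.h>
+int main(void)
+{
+    uint32_t const hBits = 17;                  /* e.g. hashLog==17 -> 128K slots */
+    uint32_t const u = 0x64636261;              /* "abcd" read little-endian */
+    uint32_t const h = (u * 2654435761U) >> (32 - hBits);   /* = ZSTD_hash4(u, 17) */
+    printf("slot %u of %u\n", h, 1u << hBits);  /* always < (1<<hBits) */
+    return 0;
+}
+#endif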
+
+
+/*-*************************************
+*  Fast Scan
+***************************************/
+static void ZSTD_fillHashTable (ZSTD_CCtx* zc, const void* end, const U32 mls)
+{
+    U32* const hashTable = zc->hashTable;
+    U32  const hBits = zc->params.cParams.hashLog;
+    const BYTE* const base = zc->base;
+    const BYTE* ip = base + zc->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const size_t fastHashFillStep = 3;
+
+    while(ip <= iend) {
+        hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip - base);
+        ip += fastHashFillStep;
+    }
+}
+
+
+FORCE_INLINE
+void ZSTD_compressBlock_fast_generic(ZSTD_CCtx* cctx,
+                               const void* src, size_t srcSize,
+                               const U32 mls)
+{
+    U32* const hashTable = cctx->hashTable;
+    U32  const hBits = cctx->params.cParams.hashLog;
+    seqStore_t* seqStorePtr = &(cctx->seqStore);
+    const BYTE* const base = cctx->base;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const U32   lowestIndex = cctx->dictLimit;
+    const BYTE* const lowest = base + lowestIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+    U32 offset_1=cctx->rep[0], offset_2=cctx->rep[1];
+    U32 offsetSaved = 0;
+
+    /* init */
+    ip += (ip==lowest);
+    {   U32 const maxRep = (U32)(ip-lowest);
+        if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+    }
+
+    /* Main Search Loop */
+    while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
+        size_t mLength;
+        size_t const h = ZSTD_hashPtr(ip, hBits, mls);
+        U32 const current = (U32)(ip-base);
+        U32 const matchIndex = hashTable[h];
+        const BYTE* match = base + matchIndex;
+        hashTable[h] = current;   /* update hash table */
+
+        if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
+            mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH);
+        } else {
+            U32 offset;
+            if ( (matchIndex <= lowestIndex) || (MEM_read32(match) != MEM_read32(ip)) ) {
+                ip += ((ip-anchor) >> g_searchStrength) + 1;
+                continue;
+            }
+            mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+            offset = (U32)(ip-match);
+            while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+            offset_2 = offset_1;
+            offset_1 = offset;
+
+            ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+        }
+
+        /* match found */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Fill Table */
+            hashTable[ZSTD_hashPtr(base+current+2, hBits, mls)] = current+2;  /* here because current+2 could be > iend-8 */
+            hashTable[ZSTD_hashPtr(ip-2, hBits, mls)] = (U32)(ip-2-base);
+            /* check immediate repcode */
+            while ( (ip <= ilimit)
+                 && ( (offset_2>0)
+                 & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+                /* store sequence */
+                size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; }  /* swap offset_2 <=> offset_1 */
+                hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip-base);
+                ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength-MINMATCH);
+                ip += rLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+    }   }   }
+
+    /* save reps for next block */
+    cctx->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved;
+    cctx->repToConfirm[1] = offset_2 ? offset_2 : offsetSaved;
+
+    /* Last Literals */
+    {   size_t const lastLLSize = iend - anchor;
+        memcpy(seqStorePtr->lit, anchor, lastLLSize);
+        seqStorePtr->lit += lastLLSize;
+    }
+}
+
+
+static void ZSTD_compressBlock_fast(ZSTD_CCtx* ctx,
+                       const void* src, size_t srcSize)
+{
+    const U32 mls = ctx->params.cParams.searchLength;
+    switch(mls)
+    {
+    default:
+    case 4 :
+        ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 4); return;
+    case 5 :
+        ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 5); return;
+    case 6 :
+        ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 6); return;
+    case 7 :
+        ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 7); return;
+    }
+}
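+
+#if 0 /* illustration only, never compiled : the fast-scan idea above in
+         miniature (single hash table, no repcodes, no catch-up, no
+         accelerating skip, no entropy coding). Hypothetical, self-contained
+         sketch. */
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#define HLOG 12
+int main(void)
+{
+    const unsigned char* src = (const unsigned char*)"abcdabcdabcdabcd";
+    size_t const srcSize = 16;
+    static uint32_t hashTable[1<<HLOG];        /* zero == "no entry yet" */
+    size_t ip = 1, anchor = 0;                 /* keep index 0 invalid */
+    while (ip + 4 <= srcSize) {
+        uint32_t v, h;
+        size_t matchIndex;
+        memcpy(&v, src+ip, 4);
+        h = (v * 2654435761U) >> (32 - HLOG);
+        matchIndex = hashTable[h];
+        hashTable[h] = (uint32_t)ip;           /* update hash table */
+        if (matchIndex && !memcmp(src+matchIndex, src+ip, 4)) {
+            size_t ml = 4;                     /* extend the 4-byte seed */
+            while (ip+ml < srcSize && src[matchIndex+ml] == src[ip+ml]) ml++;
+            printf("%u literals, then %u-byte match at distance %u\n",
+                   (unsigned)(ip-anchor), (unsigned)ml, (unsigned)(ip-matchIndex));
+            ip += ml; anchor = ip;
+        } else ip++;                           /* no match : step forward */
+    }
+    printf("%u trailing literals\n", (unsigned)(srcSize - anchor));
+    return 0;
+}
+#endif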
+
+
+static void ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx* ctx,
+                                 const void* src, size_t srcSize,
+                                 const U32 mls)
+{
+    U32* hashTable = ctx->hashTable;
+    const U32 hBits = ctx->params.cParams.hashLog;
+    seqStore_t* seqStorePtr = &(ctx->seqStore);
+    const BYTE* const base = ctx->base;
+    const BYTE* const dictBase = ctx->dictBase;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const U32   lowestIndex = ctx->lowLimit;
+    const BYTE* const dictStart = dictBase + lowestIndex;
+    const U32   dictLimit = ctx->dictLimit;
+    const BYTE* const lowPrefixPtr = base + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    U32 offset_1=ctx->rep[0], offset_2=ctx->rep[1];
+
+    /* Search Loop */
+    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
+        const size_t h = ZSTD_hashPtr(ip, hBits, mls);
+        const U32 matchIndex = hashTable[h];
+        const BYTE* matchBase = matchIndex < dictLimit ? dictBase : base;
+        const BYTE* match = matchBase + matchIndex;
+        const U32 current = (U32)(ip-base);
+        const U32 repIndex = current + 1 - offset_1;   /* offset_1 expected <= current +1 */
+        const BYTE* repBase = repIndex < dictLimit ? dictBase : base;
+        const BYTE* repMatch = repBase + repIndex;
+        size_t mLength;
+        hashTable[h] = current;   /* update hash table */
+
+        if ( (((U32)((dictLimit-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex))
+           && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repMatchEnd, lowPrefixPtr) + EQUAL_READ32;
+            ip++;
+            ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH);
+        } else {
+            if ( (matchIndex < lowestIndex) ||
+                 (MEM_read32(match) != MEM_read32(ip)) ) {
+                ip += ((ip-anchor) >> g_searchStrength) + 1;
+                continue;
+            }
+            {   const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend;
+                const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr;
+                U32 offset;
+                mLength = ZSTD_count_2segments(ip+EQUAL_READ32, match+EQUAL_READ32, iend, matchEnd, lowPrefixPtr) + EQUAL_READ32;
+                while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
+                offset = current - matchIndex;
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+        }   }
+
+        /* found a match : store it */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Fill Table */
+            hashTable[ZSTD_hashPtr(base+current+2, hBits, mls)] = current+2;
+            hashTable[ZSTD_hashPtr(ip-2, hBits, mls)] = (U32)(ip-2-base);
+            /* check immediate repcode */
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex2 = current2 - offset_2;
+                const BYTE* repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2;
+                if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex))  /* intentional underflow */
+                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
+                    size_t repLength2 = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch2+EQUAL_READ32, iend, repEnd2, lowPrefixPtr) + EQUAL_READ32;
+                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2-MINMATCH);
+                    hashTable[ZSTD_hashPtr(ip, hBits, mls)] = current2;
+                    ip += repLength2;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+    }   }   }
+
+    /* save reps for next block */
+    ctx->repToConfirm[0] = offset_1; ctx->repToConfirm[1] = offset_2;
+
+    /* Last Literals */
+    {   size_t const lastLLSize = iend - anchor;
+        memcpy(seqStorePtr->lit, anchor, lastLLSize);
+        seqStorePtr->lit += lastLLSize;
+    }
+}
+
+
+static void ZSTD_compressBlock_fast_extDict(ZSTD_CCtx* ctx,
+                         const void* src, size_t srcSize)
+{
+    U32 const mls = ctx->params.cParams.searchLength;
+    switch(mls)
+    {
+    default:
+    case 4 :
+        ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 4); return;
+    case 5 :
+        ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 5); return;
+    case 6 :
+        ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 6); return;
+    case 7 :
+        ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 7); return;
+    }
+}
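+
+#if 0 /* illustration only, never compiled : the single index space shared
+         by both segments in extDict mode. Indices below dictLimit live in
+         the old buffer, the rest in the current one ; the real code reaches
+         both through pre-biased base pointers (dictBase / base) instead of
+         subtracting dictLimit explicitly as done here. */
+#include <stdio.h>
+int main(void)
+{
+    const char dictSeg[9] = {'O','L','D','O','L','D','O','L','D'}; /* idx 0..8  */
+    const char curSeg[6]  = {'N','E','W','N','E','W'};             /* idx 9..14 */
+    unsigned const dictLimit = 9;
+    unsigned idx;
+    for (idx = 7; idx < 12; idx++) {
+        char const c = (idx < dictLimit) ? dictSeg[idx] : curSeg[idx - dictLimit];
+        printf("index %2u -> '%c' (%s segment)\n", idx, c,
+               idx < dictLimit ? "old" : "current");
+    }
+    return 0;
+}
+#endif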
+
+
+/*-*************************************
+*  Double Fast
+***************************************/
+static void ZSTD_fillDoubleHashTable (ZSTD_CCtx* cctx, const void* end, const U32 mls)
+{
+    U32* const hashLarge = cctx->hashTable;
+    U32  const hBitsL = cctx->params.cParams.hashLog;
+    U32* const hashSmall = cctx->chainTable;
+    U32  const hBitsS = cctx->params.cParams.chainLog;
+    const BYTE* const base = cctx->base;
+    const BYTE* ip = base + cctx->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const size_t fastHashFillStep = 3;
+
+    while(ip <= iend) {
+        hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip - base);
+        hashLarge[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip - base);
+        ip += fastHashFillStep;
+    }
+}
+
+
+FORCE_INLINE
+void ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx* cctx,
+                                 const void* src, size_t srcSize,
+                                 const U32 mls)
+{
+    U32* const hashLong = cctx->hashTable;
+    const U32 hBitsL = cctx->params.cParams.hashLog;
+    U32* const hashSmall = cctx->chainTable;
+    const U32 hBitsS = cctx->params.cParams.chainLog;
+    seqStore_t* seqStorePtr = &(cctx->seqStore);
+    const BYTE* const base = cctx->base;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const U32 lowestIndex = cctx->dictLimit;
+    const BYTE* const lowest = base + lowestIndex;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - HASH_READ_SIZE;
+    U32 offset_1=cctx->rep[0], offset_2=cctx->rep[1];
+    U32 offsetSaved = 0;
+
+    /* init */
+    ip += (ip==lowest);
+    {   U32 const maxRep = (U32)(ip-lowest);
+        if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+    }
+
+    /* Main Search Loop */
+    while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
+        size_t mLength;
+        size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
+        size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
+        U32 const current = (U32)(ip-base);
+        U32 const matchIndexL = hashLong[h2];
+        U32 const matchIndexS = hashSmall[h];
+        const BYTE* matchLong = base + matchIndexL;
+        const BYTE* match = base + matchIndexS;
+        hashLong[h2] = hashSmall[h] = current;   /* update hash tables */
+
+        if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { /* note : by construction, offset_1 <= current */
+            mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH);
+        } else {
+            U32 offset;
+            if ( (matchIndexL > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip)) ) {
+                mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8;
+                offset = (U32)(ip-matchLong);
+                while (((ip>anchor) & (matchLong>lowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
+            } else if ( (matchIndexS > lowestIndex) && (MEM_read32(match) == MEM_read32(ip)) ) {
+                size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+                U32 const matchIndex3 = hashLong[h3];
+                const BYTE* match3 = base + matchIndex3;
+                hashLong[h3] = current + 1;
+                if ( (matchIndex3 > lowestIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) {
+                    mLength = ZSTD_count(ip+9, match3+8, iend) + 8;
+                    ip++;
+                    offset = (U32)(ip-match3);
+                    while (((ip>anchor) & (match3>lowest)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */
+                } else {
+                    mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+                    offset = (U32)(ip-match);
+                    while (((ip>anchor) & (match>lowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+                }
+            } else {
+                ip += ((ip-anchor) >> g_searchStrength) + 1;
+                continue;
+            }
+
+            offset_2 = offset_1;
+            offset_1 = offset;
+
+            ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+        }
+
+        /* match found */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Fill Table */
+            hashLong[ZSTD_hashPtr(base+current+2, hBitsL, 8)] =
+                hashSmall[ZSTD_hashPtr(base+current+2, hBitsS, mls)] = current+2;  /* here because current+2 could be > iend-8 */
+            hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] =
+                hashSmall[ZSTD_hashPtr(ip-2, hBitsS, mls)] = (U32)(ip-2-base);
+
+            /* check immediate repcode */
+            while ( (ip <= ilimit)
+                 && ( (offset_2>0)
+                 & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+                /* store sequence */
+                size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+                { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */
+                hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
+                hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
+                ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength-MINMATCH);
+                ip += rLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+    }   }   }
+
+    /* save reps for next block */
+    cctx->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved;
+    cctx->repToConfirm[1] = offset_2 ? offset_2 : offsetSaved;
+
+    /* Last Literals */
+    {   size_t const lastLLSize = iend - anchor;
+        memcpy(seqStorePtr->lit, anchor, lastLLSize);
+        seqStorePtr->lit += lastLLSize;
+    }
+}
+
+
+static void ZSTD_compressBlock_doubleFast(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{
+    const U32 mls = ctx->params.cParams.searchLength;
+    switch(mls)
+    {
+    default:
+    case 4 :
+        ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 4); return;
+    case 5 :
+        ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 5); return;
+    case 6 :
+        ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 6); return;
+    case 7 :
+        ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 7); return;
+    }
+}
+
+
+static void ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx* ctx,
+                                 const void* src, size_t srcSize,
+                                 const U32 mls)
+{
+    U32* const hashLong = ctx->hashTable;
+    U32  const hBitsL = ctx->params.cParams.hashLog;
+    U32* const hashSmall = ctx->chainTable;
+    U32  const hBitsS = ctx->params.cParams.chainLog;
+    seqStore_t* seqStorePtr = &(ctx->seqStore);
+    const BYTE* const base = ctx->base;
+    const BYTE* const dictBase = ctx->dictBase;
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const U32   lowestIndex = ctx->lowLimit;
+    const BYTE* const dictStart = dictBase + lowestIndex;
+    const U32   dictLimit = ctx->dictLimit;
+    const BYTE* const lowPrefixPtr = base + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    U32 offset_1=ctx->rep[0], offset_2=ctx->rep[1];
+
+    /* Search Loop */
+    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
+        const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls);
+        const U32 matchIndex = hashSmall[hSmall];
+        const BYTE* matchBase = matchIndex < dictLimit ? dictBase : base;
+        const BYTE* match = matchBase + matchIndex;
+
+        const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8);
+        const U32 matchLongIndex = hashLong[hLong];
+        const BYTE* matchLongBase = matchLongIndex < dictLimit ? dictBase : base;
+        const BYTE* matchLong = matchLongBase + matchLongIndex;
+
+        const U32 current = (U32)(ip-base);
+        const U32 repIndex = current + 1 - offset_1;   /* offset_1 expected <= current +1 */
+        const BYTE* repBase = repIndex < dictLimit ? dictBase : base;
+        const BYTE* repMatch = repBase + repIndex;
+        size_t mLength;
+        hashSmall[hSmall] = hashLong[hLong] = current;   /* update hash table */
+
+        if ( (((U32)((dictLimit-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex))
+           && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+            const BYTE* repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
+            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, lowPrefixPtr) + 4;
+            ip++;
+            ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, 0, mLength-MINMATCH);
+        } else {
+            if ((matchLongIndex > lowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
+                const BYTE* matchEnd = matchLongIndex < dictLimit ? dictEnd : iend;
+                const BYTE* lowMatchPtr = matchLongIndex < dictLimit ? dictStart : lowPrefixPtr;
+                U32 offset;
+                mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, lowPrefixPtr) + 8;
+                offset = current - matchLongIndex;
+                while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; }   /* catch up */
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+            } else if ((matchIndex > lowestIndex) && (MEM_read32(match) == MEM_read32(ip))) {
+                size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+                U32 const matchIndex3 = hashLong[h3];
+                const BYTE* const match3Base = matchIndex3 < dictLimit ? dictBase : base;
+                const BYTE* match3 = match3Base + matchIndex3;
+                U32 offset;
+                hashLong[h3] = current + 1;
+                if ( (matchIndex3 > lowestIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) {
+                    const BYTE* matchEnd = matchIndex3 < dictLimit ? dictEnd : iend;
+                    const BYTE* lowMatchPtr = matchIndex3 < dictLimit ? dictStart : lowPrefixPtr;
+                    mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, lowPrefixPtr) + 8;
+                    ip++;
+                    offset = current+1 - matchIndex3;
+                    while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */
+                } else {
+                    const BYTE* matchEnd = matchIndex < dictLimit ? dictEnd : iend;
+                    const BYTE* lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr;
+                    mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, lowPrefixPtr) + 4;
+                    offset = current - matchIndex;
+                    while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
+                }
+                offset_2 = offset_1;
+                offset_1 = offset;
+                ZSTD_storeSeq(seqStorePtr, ip-anchor, anchor, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+            } else {
+                ip += ((ip-anchor) >> g_searchStrength) + 1;
+                continue;
+        }   }
+
+        /* found a match : store it */
+        ip += mLength;
+        anchor = ip;
+
+        if (ip <= ilimit) {
+            /* Fill Table */
+            hashSmall[ZSTD_hashPtr(base+current+2, hBitsS, mls)] = current+2;
+            hashLong[ZSTD_hashPtr(base+current+2, hBitsL, 8)] = current+2;
+            hashSmall[ZSTD_hashPtr(ip-2, hBitsS, mls)] = (U32)(ip-2-base);
+            hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+            /* check immediate repcode */
+            while (ip <= ilimit) {
+                U32 const current2 = (U32)(ip-base);
+                U32 const repIndex2 = current2 - offset_2;
+                const BYTE* repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2;
+                if ( (((U32)((dictLimit-1) - repIndex2) >= 3) & (repIndex2 > lowestIndex))  /* intentional underflow */
+                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                    const BYTE* const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
+                    size_t const repLength2 = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch2+EQUAL_READ32, iend, repEnd2, lowPrefixPtr) + EQUAL_READ32;
+                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2-MINMATCH);
+                    hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+                    hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+                    ip += repLength2;
+                    anchor = ip;
+                    continue;
+                }
+                break;
+    }   }   }
+
+    /* save reps for next block */
+    ctx->repToConfirm[0] = offset_1; ctx->repToConfirm[1] = offset_2;
+
+    /* Last Literals */
+    {   size_t const lastLLSize = iend - anchor;
+        memcpy(seqStorePtr->lit, anchor, lastLLSize);
+        seqStorePtr->lit += lastLLSize;
+    }
+}
+
+
+static void ZSTD_compressBlock_doubleFast_extDict(ZSTD_CCtx* ctx,
+                         const void* src, size_t srcSize)
+{
+    U32 const mls = ctx->params.cParams.searchLength;
+    switch(mls)
+    {
+    default:
+    case 4 :
+        ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 4); return;
+    case 5 :
+        ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 5); return;
+    case 6 :
+        ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 6); return;
+    case 7 :
+        ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 7); return;
+    }
+}
+
+
+/*-*************************************
+*  Binary Tree search
+***************************************/
+/** ZSTD_insertBt1() : add one or multiple positions to tree.
+*   ip : assumed <= iend-8.
+*   @return : nb of positions added */
+static U32 ZSTD_insertBt1(ZSTD_CCtx* zc, const BYTE* const ip, const U32 mls, const BYTE* const iend, U32 nbCompares,
+                          U32 extDict)
+{
+    U32*   const hashTable = zc->hashTable;
+    U32    const hashLog = zc->params.cParams.hashLog;
+    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32*   const bt = zc->chainTable;
+    U32    const btLog  = zc->params.cParams.chainLog - 1;
+    U32    const btMask = (1 << btLog) - 1;
+    U32 matchIndex = hashTable[h];
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const base = zc->base;
+    const BYTE* const dictBase = zc->dictBase;
+    const U32 dictLimit = zc->dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* match;
+    const U32 current = (U32)(ip-base);
+    const U32 btLow = btMask >= current ? 0 : current - btMask;
+    U32* smallerPtr = bt + 2*(current&btMask);
+    U32* largerPtr  = smallerPtr + 1;
+    U32 dummy32;   /* to be nullified at the end */
+    U32 const windowLow = zc->lowLimit;
+    U32 matchEndIdx = current+8;
+    size_t bestLength = 8;
+#ifdef ZSTD_C_PREDICT
+    U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0);
+    U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1);
+    predictedSmall += (predictedSmall>0);
+    predictedLarge += (predictedLarge>0);
+#endif /* ZSTD_C_PREDICT */
+
+    hashTable[h] = current;   /* Update Hash Table */
+
+    while (nbCompares-- && (matchIndex > windowLow)) {
+        U32* const nextPtr = bt + 2*(matchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+
+#ifdef ZSTD_C_PREDICT   /* note : can create issues when hlog small <= 11 */
+        const U32* predictPtr = bt + 2*((matchIndex-1) & btMask);   /* written this way, as bt is a roll buffer */
+        if (matchIndex == predictedSmall) {
+            /* no need to check length, result known */
+            *smallerPtr = matchIndex;
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
+            matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+            predictedSmall = predictPtr[1] + (predictPtr[1]>0);
+            continue;
+        }
+        if (matchIndex == predictedLarge) {
+            *largerPtr = matchIndex;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+            predictedLarge = predictPtr[0] + (predictPtr[0]>0);
+            continue;
+        }
+#endif
+        if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
+            match = base + matchIndex;
+            if (match[matchLength] == ip[matchLength])
+                matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
+        } else {
+            match = dictBase + matchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+        }
+
+        if (matchLength > bestLength) {
+            bestLength = matchLength;
+            if (matchLength > matchEndIdx - matchIndex)
+                matchEndIdx = matchIndex + (U32)matchLength;
+        }
+
+        if (ip+matchLength == iend)   /* equal : no way to know if inf or sup */
+            break;   /* drop, to guarantee consistency; misses a bit of compression, but other solutions can corrupt the tree */
+
+        if (match[matchLength] < ip[matchLength]) {  /* necessarily within correct buffer */
+            /* match is smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
+            matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+        } else {
+            /* match is larger than current */
+            *largerPtr = matchIndex;
+            commonLengthLarger = matchLength;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;
+    if (bestLength > 384) return MIN(192, (U32)(bestLength - 384));   /* speed optimization */
+    if (matchEndIdx > current + 8) return matchEndIdx - current - 8;
+    return 1;
+}
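+
+#if 0 /* illustration only, never compiled : the sorted-suffix idea behind
+         ZSTD_insertBt1(), redone naively. The real tree stores each node's
+         (smaller, larger) children as the pair bt[2*(pos&btMask)+0/+1] in a
+         rolling buffer ; this sketch uses plain arrays instead. */
+#include <stdio.h>
+#include <string.h>
+static const char* g_src;
+static int g_smaller[16], g_larger[16];
+static void bst_insert(int* rootp, int pos)
+{
+    while (*rootp >= 0) {   /* descend, ordering suffixes lexicographically */
+        if (strcmp(g_src+pos, g_src+*rootp) < 0) rootp = &g_smaller[*rootp];
+        else                                     rootp = &g_larger[*rootp];
+    }
+    *rootp = pos; g_smaller[pos] = g_larger[pos] = -1;
+}
+int main(void)
+{
+    int root = -1, i;
+    g_src = "banana";
+    for (i = 0; i < 6; i++) bst_insert(&root, i);
+    /* a new suffix's longest match is found among its lexicographic
+       neighbours, which is what the real search walks towards */
+    printf("root holds position %d (\"%s\")\n", root, g_src+root);
+    return 0;
+}
+#endif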
+
+
+static size_t ZSTD_insertBtAndFindBestMatch (
+                        ZSTD_CCtx* zc,
+                        const BYTE* const ip, const BYTE* const iend,
+                        size_t* offsetPtr,
+                        U32 nbCompares, const U32 mls,
+                        U32 extDict)
+{
+    U32*   const hashTable = zc->hashTable;
+    U32    const hashLog = zc->params.cParams.hashLog;
+    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32*   const bt = zc->chainTable;
+    U32    const btLog  = zc->params.cParams.chainLog - 1;
+    U32    const btMask = (1 << btLog) - 1;
+    U32 matchIndex  = hashTable[h];
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const base = zc->base;
+    const BYTE* const dictBase = zc->dictBase;
+    const U32 dictLimit = zc->dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const U32 current = (U32)(ip-base);
+    const U32 btLow = btMask >= current ? 0 : current - btMask;
+    const U32 windowLow = zc->lowLimit;
+    U32* smallerPtr = bt + 2*(current&btMask);
+    U32* largerPtr  = bt + 2*(current&btMask) + 1;
+    U32 matchEndIdx = current+8;
+    U32 dummy32;   /* to be nullified at the end */
+    size_t bestLength = 0;
+
+    hashTable[h] = current;   /* Update Hash Table */
+
+    while (nbCompares-- && (matchIndex > windowLow)) {
+        U32* const nextPtr = bt + 2*(matchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        const BYTE* match;
+
+        if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
+            match = base + matchIndex;
+            if (match[matchLength] == ip[matchLength])
+                matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
+        } else {
+            match = dictBase + matchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+        }
+
+        if (matchLength > bestLength) {
+            if (matchLength > matchEndIdx - matchIndex)
+                matchEndIdx = matchIndex + (U32)matchLength;
+            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
+                bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+            if (ip+matchLength == iend)   /* equal : no way to know if inf or sup */
+                break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+        }
+
+        if (match[matchLength] < ip[matchLength]) {
+            /* match is smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
+            matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+        } else {
+            /* match is larger than current */
+            *largerPtr = matchIndex;
+            commonLengthLarger = matchLength;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;
+
+    zc->nextToUpdate = (matchEndIdx > current + 8) ? matchEndIdx - 8 : current+1;
+    return bestLength;
+}
+
+
+static void ZSTD_updateTree(ZSTD_CCtx* zc, const BYTE* const ip, const BYTE* const iend, const U32 nbCompares, const U32 mls)
+{
+    const BYTE* const base = zc->base;
+    const U32 target = (U32)(ip - base);
+    U32 idx = zc->nextToUpdate;
+
+    while(idx < target)
+        idx += ZSTD_insertBt1(zc, base+idx, mls, iend, nbCompares, 0);
+}
+
+/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
+static size_t ZSTD_BtFindBestMatch (
+                        ZSTD_CCtx* zc,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 maxNbAttempts, const U32 mls)
+{
+    if (ip < zc->base + zc->nextToUpdate) return 0;   /* skipped area */
+    ZSTD_updateTree(zc, ip, iLimit, maxNbAttempts, mls);
+    return ZSTD_insertBtAndFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, mls, 0);
+}
+
+
+static size_t ZSTD_BtFindBestMatch_selectMLS (
+                        ZSTD_CCtx* zc,   /* Index table will be updated */
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 maxNbAttempts, const U32 matchLengthSearch)
+{
+    switch(matchLengthSearch)
+    {
+    default :
+    case 4 : return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4);
+    case 5 : return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5);
+    case 6 : return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6);
+    }
+}
+
+
+static void ZSTD_updateTree_extDict(ZSTD_CCtx* zc, const BYTE* const ip, const BYTE* const iend, const U32 nbCompares, const U32 mls)
+{
+    const BYTE* const base = zc->base;
+    const U32 target = (U32)(ip - base);
+    U32 idx = zc->nextToUpdate;
+
+    while (idx < target) idx += ZSTD_insertBt1(zc, base+idx, mls, iend, nbCompares, 1);
+}
+
+
+/** Tree updater, providing best match */
+static size_t ZSTD_BtFindBestMatch_extDict (
+                        ZSTD_CCtx* zc,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 maxNbAttempts, const U32 mls)
+{
+    if (ip < zc->base + zc->nextToUpdate) return 0;   /* skipped area */
+    ZSTD_updateTree_extDict(zc, ip, iLimit, maxNbAttempts, mls);
+    return ZSTD_insertBtAndFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, mls, 1);
+}
+
+
+static size_t ZSTD_BtFindBestMatch_selectMLS_extDict (
+                        ZSTD_CCtx* zc,   /* Index table will be updated */
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 maxNbAttempts, const U32 matchLengthSearch)
+{
+    switch(matchLengthSearch)
+    {
+    default :
+    case 4 : return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4);
+    case 5 : return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5);
+    case 6 : return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6);
+    }
+}
+
+
+
+/* *********************************
+*  Hash Chain
+***********************************/
+#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & mask]
+
+/* Update chains up to ip (excluded)
+   Assumption : always within prefix (i.e. not within extDict) */
+FORCE_INLINE
+U32 ZSTD_insertAndFindFirstIndex (ZSTD_CCtx* zc, const BYTE* ip, U32 mls)
+{
+    U32* const hashTable  = zc->hashTable;
+    const U32 hashLog = zc->params.cParams.hashLog;
+    U32* const chainTable = zc->chainTable;
+    const U32 chainMask = (1 << zc->params.cParams.chainLog) - 1;
+    const BYTE* const base = zc->base;
+    const U32 target = (U32)(ip - base);
+    U32 idx = zc->nextToUpdate;
+
+    while(idx < target) { /* catch up */
+        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
+        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
+        hashTable[h] = idx;
+        idx++;
+    }
+
+    zc->nextToUpdate = target;
+    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
+}
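+
+#if 0 /* illustration only, never compiled : the head-table + chain-table
+         scheme above, in miniature. head[] maps a hash to the most recent
+         position ; chain[] links each position to the previous one hashing
+         to the same slot. 0 terminates a chain, so index 0 stays unused. */
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#define HLOG 8
+int main(void)
+{
+    const unsigned char* src = (const unsigned char*)"abcdXabcdYabcd";
+    static uint32_t head[1<<HLOG];
+    uint32_t chain[16] = {0};
+    uint32_t idx, m, v;
+    for (idx = 1; idx < 10; idx++) {           /* insert positions 1..9 */
+        uint32_t h;
+        memcpy(&v, src+idx, 4);
+        h = (v * 2654435761U) >> (32 - HLOG);
+        chain[idx] = head[h];                  /* link previous occurrence */
+        head[h] = idx;
+    }
+    memcpy(&v, src+10, 4);                     /* query : "abcd" at pos 10 */
+    for (m = head[(v * 2654435761U) >> (32 - HLOG)]; m != 0; m = chain[m])
+        if (!memcmp(src+m, src+10, 4))         /* verify : hashes may collide */
+            printf("candidate at %u (distance %u)\n", m, 10-m);
+    return 0;
+}
+#endif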
+
+
+
+FORCE_INLINE /* inlining is important to hardwire a hot branch (template emulation) */
+size_t ZSTD_HcFindBestMatch_generic (
+                        ZSTD_CCtx* zc,   /* Index table will be updated */
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 maxNbAttempts, const U32 mls, const U32 extDict)
+{
+    U32* const chainTable = zc->chainTable;
+    const U32 chainSize = (1 << zc->params.cParams.chainLog);
+    const U32 chainMask = chainSize-1;
+    const BYTE* const base = zc->base;
+    const BYTE* const dictBase = zc->dictBase;
+    const U32 dictLimit = zc->dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const U32 lowLimit = zc->lowLimit;
+    const U32 current = (U32)(ip-base);
+    const U32 minChain = current > chainSize ? current - chainSize : 0;
+    int nbAttempts=maxNbAttempts;
+    size_t ml=EQUAL_READ32-1;
+
+    /* HC4 match finder */
+    U32 matchIndex = ZSTD_insertAndFindFirstIndex (zc, ip, mls);
+
+    for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
+        const BYTE* match;
+        size_t currentMl=0;
+        if ((!extDict) || matchIndex >= dictLimit) {
+            match = base + matchIndex;
+            if (match[ml] == ip[ml])   /* potentially better */
+                currentMl = ZSTD_count(ip, match, iLimit);
+        } else {
+            match = dictBase + matchIndex;
+            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                currentMl = ZSTD_count_2segments(ip+EQUAL_READ32, match+EQUAL_READ32, iLimit, dictEnd, prefixStart) + EQUAL_READ32;
+        }
+
+        /* save best solution */
+        if (currentMl > ml) { ml = currentMl; *offsetPtr = current - matchIndex + ZSTD_REP_MOVE; if (ip+currentMl == iLimit) break; /* best possible, and avoid read overflow*/ }
+
+        if (matchIndex <= minChain) break;
+        matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
+    }
+
+    return ml;
+}
+
+
+FORCE_INLINE size_t ZSTD_HcFindBestMatch_selectMLS (
+                        ZSTD_CCtx* zc,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 maxNbAttempts, const U32 matchLengthSearch)
+{
+    switch(matchLengthSearch)
+    {
+    default :
+    case 4 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 0);
+    case 5 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5, 0);
+    case 6 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 0);
+    }
+}
+
+
+FORCE_INLINE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
+                        ZSTD_CCtx* zc,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 maxNbAttempts, const U32 matchLengthSearch)
+{
+    switch(matchLengthSearch)
+    {
+    default :
+    case 4 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 1);
+    case 5 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5, 1);
+    case 6 : return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 1);
+    }
+}
+
+
+/* *******************************
+*  Common parser - lazy strategy
+*********************************/
+FORCE_INLINE
+void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
+                                     const void* src, size_t srcSize,
+                                     const U32 searchMethod, const U32 depth)
+{
+    seqStore_t* seqStorePtr = &(ctx->seqStore);
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ctx->base + ctx->dictLimit;
+
+    U32 const maxSearches = 1 << ctx->params.cParams.searchLog;
+    U32 const mls = ctx->params.cParams.searchLength;
+
+    typedef size_t (*searchMax_f)(ZSTD_CCtx* zc, const BYTE* ip, const BYTE* iLimit,
+                        size_t* offsetPtr,
+                        U32 maxNbAttempts, U32 matchLengthSearch);
+    searchMax_f const searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS;
+    U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1], savedOffset=0;
+
+    /* init */
+    ip += (ip==base);
+    ctx->nextToUpdate3 = ctx->nextToUpdate;
+    {   U32 const maxRep = (U32)(ip-base);
+        if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
+    }
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        size_t matchLength=0;
+        size_t offset=0;
+        const BYTE* start=ip+1;
+
+        /* check repCode */
+        if ((offset_1>0) & (MEM_read32(ip+1) == MEM_read32(ip+1 - offset_1))) {
+            /* repcode : we take it */
+            matchLength = ZSTD_count(ip+1+EQUAL_READ32, ip+1+EQUAL_READ32-offset_1, iend) + EQUAL_READ32;
+            if (depth==0) goto _storeSequence;
+        }
+
+        /* first search (depth 0) */
+        {   size_t offsetFound = 99999999;
+            size_t const ml2 = searchMax(ctx, ip, iend, &offsetFound, maxSearches, mls);
+            if (ml2 > matchLength)
+                matchLength = ml2, start = ip, offset=offsetFound;
+        }
+
+        if (matchLength < EQUAL_READ32) {
+            ip += ((ip-anchor) >> g_searchStrength) + 1;   /* jump faster over incompressible sections */
+            continue;
+        }
+
+        /* let's try to find a better solution */
+        if (depth>=1)
+        while (ip<ilimit) {
+            ip ++;
+            if ((offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                size_t const mlRep = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-offset_1, iend) + EQUAL_READ32;
+                int const gain2 = (int)(mlRep * 3);
+                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+                if ((mlRep >= EQUAL_READ32) && (gain2 > gain1))
+                    matchLength = mlRep, offset = 0, start = ip;
+            }
+            {   size_t offset2=99999999;
+                size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+                if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+                    matchLength = ml2, offset = offset2, start = ip;
+                    continue;   /* search a better one */
+            }   }
+
+            /* let's find an even better one */
+            if ((depth==2) && (ip<ilimit)) {
+                ip ++;
+                if ((offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                    size_t const ml2 = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-offset_1, iend) + EQUAL_READ32;
+                    int const gain2 = (int)(ml2 * 4);
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+                    if ((ml2 >= EQUAL_READ32) && (gain2 > gain1))
+                        matchLength = ml2, offset = 0, start = ip;
+                }
+                {   size_t offset2=99999999;
+                    size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+                    if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+                        matchLength = ml2, offset = offset2, start = ip;
+                        continue;
+            }   }   }
+            break;  /* nothing found : store previous solution */
+        }
+
+        /* catch up */
+        if (offset) {
+            while ((start>anchor) && (start>base+offset-ZSTD_REP_MOVE) && (start[-1] == start[-1-offset+ZSTD_REP_MOVE]))   /* only search for offset within prefix */
+                { start--; matchLength++; }
+            offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+        }
+
+        /* store sequence */
+_storeSequence:
+        {   size_t const litLength = start - anchor;
+            ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength-MINMATCH);
+            anchor = ip = start + matchLength;
+        }
+
+        /* check immediate repcode */
+        while ( (ip <= ilimit)
+             && ((offset_2>0)
+             & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+            /* store sequence */
+            matchLength = ZSTD_count(ip+EQUAL_READ32, ip+EQUAL_READ32-offset_2, iend) + EQUAL_READ32;
+            offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
+            ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength-MINMATCH);
+            ip += matchLength;
+            anchor = ip;
+            continue;   /* faster when present ... (?) */
+    }   }
+
+    /* Save reps for next block */
+    ctx->repToConfirm[0] = offset_1 ? offset_1 : savedOffset;
+    ctx->repToConfirm[1] = offset_2 ? offset_2 : savedOffset;
+
+    /* Last Literals */
+    {   size_t const lastLLSize = iend - anchor;
+        memcpy(seqStorePtr->lit, anchor, lastLLSize);
+        seqStorePtr->lit += lastLLSize;
+    }
+}
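+
+/* Worked example (editorial illustration, not part of the imported source) :
+   the lazy loop above only switches to a candidate match when its estimated
+   gain beats the incumbent.  With ZSTD_highbit32(v) == floor(log2(v)) :
+     incumbent : matchLength=6, offset=1000
+                 gain1 = 6*4 - ZSTD_highbit32(1001) + 4 = 24 - 9 + 4 = 19
+     candidate : ml2=7, offset2=64
+                 gain2 = 7*4 - ZSTD_highbit32(65)       = 28 - 6     = 22
+   22 > 19, so the candidate wins : one extra matched byte plus a much cheaper
+   offset outweighs the +4 handicap granted to the earlier match. */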
+
+
+static void ZSTD_compressBlock_btlazy2(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{
+    ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 1, 2);
+}
+
+static void ZSTD_compressBlock_lazy2(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{
+    ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 2);
+}
+
+static void ZSTD_compressBlock_lazy(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{
+    ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 1);
+}
+
+static void ZSTD_compressBlock_greedy(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{
+    ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 0);
+}
+
+
+FORCE_INLINE
+void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx* ctx,
+                                     const void* src, size_t srcSize,
+                                     const U32 searchMethod, const U32 depth)
+{
+    seqStore_t* seqStorePtr = &(ctx->seqStore);
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ctx->base;
+    const U32 dictLimit = ctx->dictLimit;
+    const U32 lowestIndex = ctx->lowLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictBase = ctx->dictBase;
+    const BYTE* const dictEnd  = dictBase + dictLimit;
+    const BYTE* const dictStart  = dictBase + ctx->lowLimit;
+
+    const U32 maxSearches = 1 << ctx->params.cParams.searchLog;
+    const U32 mls = ctx->params.cParams.searchLength;
+
+    typedef size_t (*searchMax_f)(ZSTD_CCtx* zc, const BYTE* ip, const BYTE* iLimit,
+                        size_t* offsetPtr,
+                        U32 maxNbAttempts, U32 matchLengthSearch);
+    searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS_extDict : ZSTD_HcFindBestMatch_extDict_selectMLS;
+
+    U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1];
+
+    /* init */
+    ctx->nextToUpdate3 = ctx->nextToUpdate;
+    ip += (ip == prefixStart);
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        size_t matchLength=0;
+        size_t offset=0;
+        const BYTE* start=ip+1;
+        U32 current = (U32)(ip-base);
+
+        /* check repCode */
+        {   const U32 repIndex = (U32)(current+1 - offset_1);
+            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+            const BYTE* const repMatch = repBase + repIndex;
+            if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))   /* intentional overflow */
+            if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
+                /* repcode detected : we should take it */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+1+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32;
+                if (depth==0) goto _storeSequence;
+        }   }
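+        /* Editorial note on the guard above : (U32)((dictLimit-1) - repIndex) >= 3
+           is a single-compare trick.  If repIndex <= dictLimit-4, the 4-byte read
+           at repMatch lies entirely inside the extDict segment and the difference
+           is >= 3.  If repIndex falls in [dictLimit-3, dictLimit-1], the read would
+           straddle the dict/prefix boundary, and the difference (0..2) rejects it.
+           If repIndex >= dictLimit (match inside the current prefix), the
+           subtraction wraps to a huge U32, which also passes the test. */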
+
+        /* first search (depth 0) */
+        {   size_t offsetFound = 99999999;
+            size_t const ml2 = searchMax(ctx, ip, iend, &offsetFound, maxSearches, mls);
+            if (ml2 > matchLength)
+                matchLength = ml2, start = ip, offset=offsetFound;
+        }
+
+        if (matchLength < EQUAL_READ32) {
+            ip += ((ip-anchor) >> g_searchStrength) + 1;   /* jump faster over incompressible sections */
+            continue;
+        }
+
+        /* let's try to find a better solution */
+        if (depth>=1)
+        while (ip<ilimit) {
+            ip ++;
+            current++;
+            /* check repCode */
+            if (offset) {
+                const U32 repIndex = (U32)(current - offset_1);
+                const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                const BYTE* const repMatch = repBase + repIndex;
+                if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))  /* intentional overflow */
+                if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                    /* repcode detected */
+                    const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                    size_t const repLength = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32;
+                    int const gain2 = (int)(repLength * 3);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+                    if ((repLength >= EQUAL_READ32) && (gain2 > gain1))
+                        matchLength = repLength, offset = 0, start = ip;
+            }   }
+
+            /* search match, depth 1 */
+            {   size_t offset2=99999999;
+                size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+                if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+                    matchLength = ml2, offset = offset2, start = ip;
+                    continue;   /* search a better one */
+            }   }
+
+            /* let's find an even better one */
+            if ((depth==2) && (ip<ilimit)) {
+                ip ++;
+                current++;
+                /* check repCode */
+                if (offset) {
+                    const U32 repIndex = (U32)(current - offset_1);
+                    const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                    const BYTE* const repMatch = repBase + repIndex;
+                    if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))  /* intentional overflow */
+                    if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                        /* repcode detected */
+                        const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                        size_t repLength = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32;
+                        int gain2 = (int)(repLength * 4);
+                        int gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+                        if ((repLength >= EQUAL_READ32) && (gain2 > gain1))
+                            matchLength = repLength, offset = 0, start = ip;
+                }   }
+
+                /* search match, depth 2 */
+                {   size_t offset2=99999999;
+                    size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+                    if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+                        matchLength = ml2, offset = offset2, start = ip;
+                        continue;
+            }   }   }
+            break;  /* nothing found : store previous solution */
+        }
+
+        /* catch up */
+        if (offset) {
+            U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+            const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
+            const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
+            while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
+            offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+        }
+
+        /* store sequence */
+_storeSequence:
+        {   size_t const litLength = start - anchor;
+            ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength-MINMATCH);
+            anchor = ip = start + matchLength;
+        }
+
+        /* check immediate repcode */
+        while (ip <= ilimit) {
+            const U32 repIndex = (U32)((ip-base) - offset_2);
+            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+            const BYTE* const repMatch = repBase + repIndex;
+            if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))  /* intentional overflow */
+            if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                /* repcode detected : we should take it */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+EQUAL_READ32, repMatch+EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32;
+                offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap offset history */
+                ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength-MINMATCH);
+                ip += matchLength;
+                anchor = ip;
+                continue;   /* faster when present ... (?) */
+            }
+            break;
+    }   }
+
+    /* Save reps for next block */
+    ctx->repToConfirm[0] = offset_1; ctx->repToConfirm[1] = offset_2;
+
+    /* Last Literals */
+    {   size_t const lastLLSize = iend - anchor;
+        memcpy(seqStorePtr->lit, anchor, lastLLSize);
+        seqStorePtr->lit += lastLLSize;
+    }
+}
+
+
+void ZSTD_compressBlock_greedy_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{
+    ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 0);
+}
+
+static void ZSTD_compressBlock_lazy_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{
+    ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 1);
+}
+
+static void ZSTD_compressBlock_lazy2_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{
+    ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 2);
+}
+
+static void ZSTD_compressBlock_btlazy2_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{
+    ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 1, 2);
+}
+
+
+/* The optimal parser */
+#include "zstd_opt.h"
+
+static void ZSTD_compressBlock_btopt(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{
+#ifdef ZSTD_OPT_H_91842398743
+    ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0);
+#else
+    (void)ctx; (void)src; (void)srcSize;
+    return;
+#endif
+}
+
+static void ZSTD_compressBlock_btopt2(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{
+#ifdef ZSTD_OPT_H_91842398743
+    ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1);
+#else
+    (void)ctx; (void)src; (void)srcSize;
+    return;
+#endif
+}
+
+static void ZSTD_compressBlock_btopt_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{
+#ifdef ZSTD_OPT_H_91842398743
+    ZSTD_compressBlock_opt_extDict_generic(ctx, src, srcSize, 0);
+#else
+    (void)ctx; (void)src; (void)srcSize;
+    return;
+#endif
+}
+
+static void ZSTD_compressBlock_btopt2_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
+{
+#ifdef ZSTD_OPT_H_91842398743
+    ZSTD_compressBlock_opt_extDict_generic(ctx, src, srcSize, 1);
+#else
+    (void)ctx; (void)src; (void)srcSize;
+    return;
+#endif
+}
+
+
+typedef void (*ZSTD_blockCompressor) (ZSTD_CCtx* ctx, const void* src, size_t srcSize);
+
+static ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict)
+{
+    static const ZSTD_blockCompressor blockCompressor[2][8] = {
+        { ZSTD_compressBlock_fast, ZSTD_compressBlock_doubleFast, ZSTD_compressBlock_greedy, ZSTD_compressBlock_lazy, ZSTD_compressBlock_lazy2, ZSTD_compressBlock_btlazy2, ZSTD_compressBlock_btopt, ZSTD_compressBlock_btopt2 },
+        { ZSTD_compressBlock_fast_extDict, ZSTD_compressBlock_doubleFast_extDict, ZSTD_compressBlock_greedy_extDict, ZSTD_compressBlock_lazy_extDict,ZSTD_compressBlock_lazy2_extDict, ZSTD_compressBlock_btlazy2_extDict, ZSTD_compressBlock_btopt_extDict, ZSTD_compressBlock_btopt2_extDict }
+    };
+
+    return blockCompressor[extDict][(U32)strat];
+}
+
+
+static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->params.cParams.strategy, zc->lowLimit < zc->dictLimit);
+    const BYTE* const base = zc->base;
+    const BYTE* const istart = (const BYTE*)src;
+    const U32 current = (U32)(istart-base);
+    if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) return 0;   /* don't even attempt compression below a certain srcSize */
+    ZSTD_resetSeqStore(&(zc->seqStore));
+    if (current > zc->nextToUpdate + 384)
+        zc->nextToUpdate = current - MIN(192, (U32)(current - zc->nextToUpdate - 384));   /* the tree may lag far behind after very long rep matches : bound the catch-up update */
+    blockCompressor(zc, src, srcSize);
+    return ZSTD_compressSequences(zc, dst, dstCapacity, srcSize);
+}
+
+
+/*! ZSTD_compress_generic() :
+*   Compress a chunk of data into one or multiple blocks.
+*   All blocks will be terminated, all input will be consumed.
+*   The function issues an error if `dstCapacity` is too small to hold the compressed content.
+*   The frame is assumed to be already started (header already produced).
+*   @return : compressed size, or an error code
+*/
+static size_t ZSTD_compress_generic (ZSTD_CCtx* cctx,
+                                     void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                                     U32 lastFrameChunk)
+{
+    size_t blockSize = cctx->blockSize;
+    size_t remaining = srcSize;
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* op = ostart;
+    U32 const maxDist = 1 << cctx->params.cParams.windowLog;
+
+    if (cctx->params.fParams.checksumFlag && srcSize)
+        XXH64_update(&cctx->xxhState, src, srcSize);
+
+    while (remaining) {
+        U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
+        size_t cSize;
+
+        if (dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE) return ERROR(dstSize_tooSmall);   /* not enough space to store compressed block */
+        if (remaining < blockSize) blockSize = remaining;
+
+        /* preemptive overflow correction */
+        if (cctx->lowLimit > (3U<<29)) {
+            U32 const cycleMask = (1 << ZSTD_cycleLog(cctx->params.cParams.hashLog, cctx->params.cParams.strategy)) - 1;
+            U32 const current = (U32)(ip - cctx->base);
+            U32 const newCurrent = (current & cycleMask) + (1 << cctx->params.cParams.windowLog);
+            U32 const correction = current - newCurrent;
+            ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_64 <= 30);
+            ZSTD_reduceIndex(cctx, correction);
+            cctx->base += correction;
+            cctx->dictBase += correction;
+            cctx->lowLimit -= correction;
+            cctx->dictLimit -= correction;
+            if (cctx->nextToUpdate < correction) cctx->nextToUpdate = 0;
+            else cctx->nextToUpdate -= correction;
+        }
+
+        if ((U32)(ip+blockSize - cctx->base) > cctx->loadedDictEnd + maxDist) {
+            /* enforce maxDist */
+            U32 const newLowLimit = (U32)(ip+blockSize - cctx->base) - maxDist;
+            if (cctx->lowLimit < newLowLimit) cctx->lowLimit = newLowLimit;
+            if (cctx->dictLimit < cctx->lowLimit) cctx->dictLimit = cctx->lowLimit;
+        }
+
+        cSize = ZSTD_compressBlock_internal(cctx, op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, ip, blockSize);
+        if (ZSTD_isError(cSize)) return cSize;
+
+        if (cSize == 0) {  /* block is not compressible */
+            U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(blockSize << 3);
+            if (blockSize + ZSTD_blockHeaderSize > dstCapacity) return ERROR(dstSize_tooSmall);
+            MEM_writeLE32(op, cBlockHeader24);   /* not a problem : the 4th byte will be overwritten by the copied content */
+            memcpy(op + ZSTD_blockHeaderSize, ip, blockSize);
+            cSize = ZSTD_blockHeaderSize+blockSize;
+        } else {
+            U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+            MEM_writeLE24(op, cBlockHeader24);
+            cSize += ZSTD_blockHeaderSize;
+        }
+
+        remaining -= blockSize;
+        dstCapacity -= cSize;
+        ip += blockSize;
+        op += cSize;
+    }
+
+    if (lastFrameChunk && (op>ostart)) cctx->stage = ZSTDcs_ending;
+    return op-ostart;
+}
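+
+/* Worked example (editorial) : the 24-bit block header assembled above packs
+   lastBlock (bit 0), the block type (bits 1-2) and the size (bits 3-23),
+   assuming the blockType_e values bt_raw==0 and bt_compressed==2 :
+     raw block,        blockSize=1000, last block : 1 + (0<<1) + (1000<<3) = 8001 (0x001F41)
+     compressed block, cSize=600, not last        : 0 + (2<<1) + ( 600<<3) = 4804 (0x0012C4)
+   Only the low 3 bytes are stored (MEM_writeLE24, or LE32 with the 4th byte
+   overwritten by the following content). */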
+
+
+static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity,
+                                    ZSTD_parameters params, U64 pledgedSrcSize, U32 dictID)
+{   BYTE* const op = (BYTE*)dst;
+    U32   const dictIDSizeCode = (dictID>0) + (dictID>=256) + (dictID>=65536);   /* 0-3 */
+    U32   const checksumFlag = params.fParams.checksumFlag>0;
+    U32   const windowSize = 1U << params.cParams.windowLog;
+    U32   const singleSegment = params.fParams.contentSizeFlag && (windowSize >= pledgedSrcSize);
+    BYTE  const windowLogByte = (BYTE)((params.cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3);
+    U32   const fcsCode = params.fParams.contentSizeFlag ?
+                     (pledgedSrcSize>=256) + (pledgedSrcSize>=65536+256) + (pledgedSrcSize>=0xFFFFFFFFU) :   /* 0-3 */
+                      0;
+    BYTE  const frameHeaderDescriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) );
+    size_t pos;
+
+    if (dstCapacity < ZSTD_frameHeaderSize_max) return ERROR(dstSize_tooSmall);
+
+    MEM_writeLE32(dst, ZSTD_MAGICNUMBER);
+    op[4] = frameHeaderDescriptionByte; pos=5;
+    if (!singleSegment) op[pos++] = windowLogByte;
+    switch(dictIDSizeCode)
+    {
+        default:   /* impossible */
+        case 0 : break;
+        case 1 : op[pos] = (BYTE)(dictID); pos++; break;
+        case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break;
+        case 3 : MEM_writeLE32(op+pos, dictID); pos+=4; break;
+    }
+    switch(fcsCode)
+    {
+        default:   /* impossible */
+        case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break;
+        case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break;
+        case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break;
+        case 3 : MEM_writeLE64(op+pos, (U64)(pledgedSrcSize)); pos+=8; break;
+    }
+    return pos;
+}
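+
+/* Worked example (editorial) : the frame header descriptor byte built above is
+   dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6).
+   For no dictID, checksum enabled, contentSizeFlag set, windowSize >= pledged
+   and pledgedSrcSize = 1000 (hence fcsCode = 1) :
+     descriptor = 0 + (1<<2) + (1<<5) + (1<<6) = 0x64
+   The header is then : 4-byte magic, the descriptor, no windowLog byte (single
+   segment), no dictID, and a 2-byte LE field holding 1000-256 = 744 => 7 bytes. */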
+
+
+static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize,
+                               U32 frame, U32 lastFrameChunk)
+{
+    const BYTE* const ip = (const BYTE*) src;
+    size_t fhSize = 0;
+
+    if (cctx->stage==ZSTDcs_created) return ERROR(stage_wrong);   /* missing init (ZSTD_compressBegin) */
+
+    if (frame && (cctx->stage==ZSTDcs_init)) {
+        fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->params, cctx->frameContentSize, cctx->dictID);
+        if (ZSTD_isError(fhSize)) return fhSize;
+        dstCapacity -= fhSize;
+        dst = (char*)dst + fhSize;
+        cctx->stage = ZSTDcs_ongoing;
+    }
+
+    /* Check if blocks follow each other */
+    if (src != cctx->nextSrc) {
+        /* not contiguous */
+        ptrdiff_t const delta = cctx->nextSrc - ip;
+        cctx->lowLimit = cctx->dictLimit;
+        cctx->dictLimit = (U32)(cctx->nextSrc - cctx->base);
+        cctx->dictBase = cctx->base;
+        cctx->base -= delta;
+        cctx->nextToUpdate = cctx->dictLimit;
+        if (cctx->dictLimit - cctx->lowLimit < HASH_READ_SIZE) cctx->lowLimit = cctx->dictLimit;   /* too small extDict */
+    }
+
+    /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */
+    if ((ip+srcSize > cctx->dictBase + cctx->lowLimit) & (ip < cctx->dictBase + cctx->dictLimit)) {
+        ptrdiff_t const highInputIdx = (ip + srcSize) - cctx->dictBase;
+        U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)cctx->dictLimit) ? cctx->dictLimit : (U32)highInputIdx;
+        cctx->lowLimit = lowLimitMax;
+    }
+
+    cctx->nextSrc = ip + srcSize;
+
+    if (srcSize) {
+        size_t const cSize = frame ?
+                             ZSTD_compress_generic (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) :
+                             ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize);
+        if (ZSTD_isError(cSize)) return cSize;
+        return cSize + fhSize;
+    } else
+        return fhSize;
+}
+
+
+size_t ZSTD_compressContinue (ZSTD_CCtx* cctx,
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize)
+{
+    return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1, 0);
+}
+
+
+size_t ZSTD_getBlockSizeMax(ZSTD_CCtx* cctx)
+{
+    return MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << cctx->params.cParams.windowLog);
+}
+
+size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    size_t const blockSizeMax = ZSTD_getBlockSizeMax(cctx);
+    if (srcSize > blockSizeMax) return ERROR(srcSize_wrong);
+    return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0, 0);
+}
+
+
+static size_t ZSTD_loadDictionaryContent(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+{
+    const BYTE* const ip = (const BYTE*) src;
+    const BYTE* const iend = ip + srcSize;
+
+    /* input becomes current prefix */
+    zc->lowLimit = zc->dictLimit;
+    zc->dictLimit = (U32)(zc->nextSrc - zc->base);
+    zc->dictBase = zc->base;
+    zc->base += ip - zc->nextSrc;
+    zc->nextToUpdate = zc->dictLimit;
+    zc->loadedDictEnd = zc->forceWindow ? 0 : (U32)(iend - zc->base);
+
+    zc->nextSrc = iend;
+    if (srcSize <= HASH_READ_SIZE) return 0;
+
+    switch(zc->params.cParams.strategy)
+    {
+    case ZSTD_fast:
+        ZSTD_fillHashTable (zc, iend, zc->params.cParams.searchLength);
+        break;
+
+    case ZSTD_dfast:
+        ZSTD_fillDoubleHashTable (zc, iend, zc->params.cParams.searchLength);
+        break;
+
+    case ZSTD_greedy:
+    case ZSTD_lazy:
+    case ZSTD_lazy2:
+        ZSTD_insertAndFindFirstIndex (zc, iend-HASH_READ_SIZE, zc->params.cParams.searchLength);
+        break;
+
+    case ZSTD_btlazy2:
+    case ZSTD_btopt:
+    case ZSTD_btopt2:
+        ZSTD_updateTree(zc, iend-HASH_READ_SIZE, iend, 1 << zc->params.cParams.searchLog, zc->params.cParams.searchLength);
+        break;
+
+    default:
+        return ERROR(GENERIC);   /* strategy doesn't exist; impossible */
+    }
+
+    zc->nextToUpdate = (U32)(iend - zc->base);
+    return 0;
+}
+
+
+/* A dictionary that assigns zero probability to a symbol that does show up
+   causes problems during FSE encoding.  Refuse dictionaries that assign zero
+   probability to any symbol we may encounter during compression.
+   NOTE: This behavior is not standard and could be improved in the future. */
+static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) {
+    U32 s;
+    if (dictMaxSymbolValue < maxSymbolValue) return ERROR(dictionary_corrupted);
+    for (s = 0; s <= maxSymbolValue; ++s) {
+        if (normalizedCounter[s] == 0) return ERROR(dictionary_corrupted);
+    }
+    return 0;
+}
+
+
+/* Dictionary format :
+    Magic == ZSTD_DICT_MAGIC (4 bytes)
+    HUF_writeCTable(256)
+    FSE_writeNCount(off)
+    FSE_writeNCount(ml)
+    FSE_writeNCount(ll)
+    RepOffsets
+    Dictionary content
+*/
+/*! ZSTD_loadDictEntropyStats() :
+    @return : size read from dictionary
+    note : the magic number is assumed to be already checked */
+static size_t ZSTD_loadDictEntropyStats(ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+{
+    const BYTE* dictPtr = (const BYTE*)dict;
+    const BYTE* const dictEnd = dictPtr + dictSize;
+    short offcodeNCount[MaxOff+1];
+    unsigned offcodeMaxValue = MaxOff;
+    BYTE scratchBuffer[1<<MAX(MLFSELog,LLFSELog)];
+
+    {   size_t const hufHeaderSize = HUF_readCTable(cctx->hufTable, 255, dict, dictSize);
+        if (HUF_isError(hufHeaderSize)) return ERROR(dictionary_corrupted);
+        dictPtr += hufHeaderSize;
+    }
+
+    {   unsigned offcodeLog;
+        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
+        if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
+        /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
+        CHECK_E (FSE_buildCTable_wksp(cctx->offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, scratchBuffer, sizeof(scratchBuffer)), dictionary_corrupted);
+        dictPtr += offcodeHeaderSize;
+    }
+
+    {   short matchlengthNCount[MaxML+1];
+        unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+        size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
+        /* Every match length code must have non-zero probability */
+        CHECK_F (ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML));
+        CHECK_E (FSE_buildCTable_wksp(cctx->matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, scratchBuffer, sizeof(scratchBuffer)), dictionary_corrupted);
+        dictPtr += matchlengthHeaderSize;
+    }
+
+    {   short litlengthNCount[MaxLL+1];
+        unsigned litlengthMaxValue = MaxLL, litlengthLog;
+        size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
+        /* Every literal length code must have non-zero probability */
+        CHECK_F (ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL));
+        CHECK_E(FSE_buildCTable_wksp(cctx->litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, scratchBuffer, sizeof(scratchBuffer)), dictionary_corrupted);
+        dictPtr += litlengthHeaderSize;
+    }
+
+    if (dictPtr+12 > dictEnd) return ERROR(dictionary_corrupted);
+    cctx->rep[0] = MEM_readLE32(dictPtr+0); if (cctx->rep[0] == 0 || cctx->rep[0] >= dictSize) return ERROR(dictionary_corrupted);
+    cctx->rep[1] = MEM_readLE32(dictPtr+4); if (cctx->rep[1] == 0 || cctx->rep[1] >= dictSize) return ERROR(dictionary_corrupted);
+    cctx->rep[2] = MEM_readLE32(dictPtr+8); if (cctx->rep[2] == 0 || cctx->rep[2] >= dictSize) return ERROR(dictionary_corrupted);
+    dictPtr += 12;
+
+    {   U32 offcodeMax = MaxOff;
+        if ((size_t)(dictEnd - dictPtr) <= ((U32)-1) - 128 KB) {
+            U32 const maxOffset = (U32)(dictEnd - dictPtr) + 128 KB; /* The maximum offset that must be supported */
+            /* Calculate minimum offset code required to represent maxOffset */
+            offcodeMax = ZSTD_highbit32(maxOffset);
+        }
+        /* Every possible supported offset <= dictContentSize + 128 KB must be representable */
+        CHECK_F (ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)));
+    }
+
+    cctx->flagStaticTables = 1;
+    cctx->flagStaticHufTable = HUF_repeat_valid;
+    return dictPtr - (const BYTE*)dict;
+}
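+
+/* Layout sketch (editorial summary of the parsing above) : a formatted
+   dictionary begins with
+     bytes 0-3 : magic  (ZSTD_DICT_MAGIC)
+     bytes 4-7 : dictID (LE32, read by ZSTD_compress_insertDictionary)
+   followed by the entropy section parsed here : one HUF_readCTable block,
+   three FSE_readNCount blocks (offsets, match lengths, literal lengths),
+   12 bytes of repcode history (3 x LE32), then the raw dictionary content. */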
+
+/** ZSTD_compress_insertDictionary() :
+*   @return : 0, or an error code */
+static size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* zc, const void* dict, size_t dictSize)
+{
+    if ((dict==NULL) || (dictSize<=8)) return 0;
+
+    /* dict as pure content */
+    if ((MEM_readLE32(dict) != ZSTD_DICT_MAGIC) || (zc->forceRawDict))
+        return ZSTD_loadDictionaryContent(zc, dict, dictSize);
+    zc->dictID = zc->params.fParams.noDictIDFlag ? 0 :  MEM_readLE32((const char*)dict+4);
+
+    /* known magic number : dict is parsed for entropy stats and content */
+    {   size_t const loadError = ZSTD_loadDictEntropyStats(zc, (const char*)dict+8 /* skip dictHeader */, dictSize-8);
+        size_t const eSize = loadError + 8;
+        if (ZSTD_isError(loadError)) return loadError;
+        return ZSTD_loadDictionaryContent(zc, (const char*)dict+eSize, dictSize-eSize);
+    }
+}
+
+/*! ZSTD_compressBegin_internal() :
+*   @return : 0, or an error code */
+static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
+                             const void* dict, size_t dictSize,
+                                   ZSTD_parameters params, U64 pledgedSrcSize)
+{
+    ZSTD_compResetPolicy_e const crp = dictSize ? ZSTDcrp_fullReset : ZSTDcrp_continue;
+    CHECK_F(ZSTD_resetCCtx_advanced(cctx, params, pledgedSrcSize, crp));
+    return ZSTD_compress_insertDictionary(cctx, dict, dictSize);
+}
+
+
+/*! ZSTD_compressBegin_advanced() :
+*   @return : 0, or an error code */
+size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
+                             const void* dict, size_t dictSize,
+                                   ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+    /* compression parameters verification and optimization */
+    CHECK_F(ZSTD_checkCParams(params.cParams));
+    return ZSTD_compressBegin_internal(cctx, dict, dictSize, params, pledgedSrcSize);
+}
+
+
+size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize);
+    return ZSTD_compressBegin_internal(cctx, dict, dictSize, params, 0);
+}
+
+
+size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel)
+{
+    return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel);
+}
+
+
+/*! ZSTD_writeEpilogue() :
+*   Ends a frame.
+*   @return : nb of bytes written into dst (or an error code) */
+static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity)
+{
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* op = ostart;
+    size_t fhSize = 0;
+
+    if (cctx->stage == ZSTDcs_created) return ERROR(stage_wrong);  /* init missing */
+
+    /* special case : empty frame */
+    if (cctx->stage == ZSTDcs_init) {
+        fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->params, 0, 0);
+        if (ZSTD_isError(fhSize)) return fhSize;
+        dstCapacity -= fhSize;
+        op += fhSize;
+        cctx->stage = ZSTDcs_ongoing;
+    }
+
+    if (cctx->stage != ZSTDcs_ending) {
+        /* write one last empty block, make it the "last" block */
+        U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0;
+        if (dstCapacity<4) return ERROR(dstSize_tooSmall);
+        MEM_writeLE32(op, cBlockHeader24);
+        op += ZSTD_blockHeaderSize;
+        dstCapacity -= ZSTD_blockHeaderSize;
+    }
+
+    if (cctx->params.fParams.checksumFlag) {
+        U32 const checksum = (U32) XXH64_digest(&cctx->xxhState);
+        if (dstCapacity<4) return ERROR(dstSize_tooSmall);
+        MEM_writeLE32(op, checksum);
+        op += 4;
+    }
+
+    cctx->stage = ZSTDcs_created;  /* return to "created but no init" status */
+    return op-ostart;
+}
+
+
+size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
+                         void* dst, size_t dstCapacity,
+                   const void* src, size_t srcSize)
+{
+    size_t endResult;
+    size_t const cSize = ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1, 1);
+    if (ZSTD_isError(cSize)) return cSize;
+    endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize);
+    if (ZSTD_isError(endResult)) return endResult;
+    return cSize + endResult;
+}
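+
+/* Usage sketch (editorial illustration, not upstream code) : the buffer-less
+   entry points combine as begin / continue* / end.  Assumes nbChunks >= 1 and
+   that every chunk stays valid and unmodified while compression proceeds,
+   since the match window may still reference it.  example_compressChunks and
+   its parameters are hypothetical names. */
+static size_t example_compressChunks(ZSTD_CCtx* cctx, int compressionLevel,
+                                     void* dst, size_t dstCapacity,
+                                     const void* const* chunks, const size_t* chunkSizes, size_t nbChunks)
+{
+    size_t pos = 0;
+    size_t n;
+    CHECK_F(ZSTD_compressBegin(cctx, compressionLevel));
+    for (n = 0; n < nbChunks; n++) {
+        size_t const cSize = (n + 1 < nbChunks) ?
+            ZSTD_compressContinue(cctx, (char*)dst + pos, dstCapacity - pos, chunks[n], chunkSizes[n]) :
+            ZSTD_compressEnd(cctx, (char*)dst + pos, dstCapacity - pos, chunks[n], chunkSizes[n]);
+        if (ZSTD_isError(cSize)) return cSize;
+        pos += cSize;
+    }
+    return pos;   /* size of the complete, terminated frame */
+}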
+
+
+static size_t ZSTD_compress_internal (ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                         const void* dict,size_t dictSize,
+                               ZSTD_parameters params)
+{
+    CHECK_F(ZSTD_compressBegin_internal(cctx, dict, dictSize, params, srcSize));
+    return ZSTD_compressEnd(cctx, dst,  dstCapacity, src, srcSize);
+}
+
+size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize,
+                         const void* dict,size_t dictSize,
+                               ZSTD_parameters params)
+{
+    CHECK_F(ZSTD_checkCParams(params.cParams));
+    return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, dict, dictSize, params);
+}
+
+size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_parameters params = ZSTD_getParams(compressionLevel, srcSize, dict ? dictSize : 0);
+    params.fParams.contentSizeFlag = 1;
+    return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, dict, dictSize, params);
+}
+
+size_t ZSTD_compressCCtx (ZSTD_CCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel)
+{
+    return ZSTD_compress_usingDict(ctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel);
+}
+
+size_t ZSTD_compress(void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel)
+{
+    size_t result;
+    ZSTD_CCtx ctxBody;
+    memset(&ctxBody, 0, sizeof(ctxBody));
+    memcpy(&ctxBody.customMem, &defaultCustomMem, sizeof(ZSTD_customMem));
+    result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel);
+    ZSTD_free(ctxBody.workSpace, defaultCustomMem);  /* can't free ctxBody itself, as it's on stack; free only heap content */
+    return result;
+}
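+
+/* Usage sketch (editorial) : typical one-shot call pattern for the entry point
+   above.  A destination buffer of ZSTD_compressBound(srcSize) bytes is always
+   large enough, so compression cannot fail for lack of dst space.  Assumes
+   <stdlib.h> for malloc()/free(); example_compressOnce is a hypothetical name. */
+static size_t example_compressOnce(const void* src, size_t srcSize,
+                                   void** cBufPtr, size_t* cSizePtr)
+{
+    size_t const dstCapacity = ZSTD_compressBound(srcSize);
+    void* const dst = malloc(dstCapacity);
+    size_t cSize;
+    if (dst == NULL) return ERROR(memory_allocation);
+    cSize = ZSTD_compress(dst, dstCapacity, src, srcSize, 3 /* compression level */);
+    if (ZSTD_isError(cSize)) { free(dst); return cSize; }
+    *cBufPtr = dst; *cSizePtr = cSize;
+    return 0;
+}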
+
+
+/* =====  Dictionary API  ===== */
+
+struct ZSTD_CDict_s {
+    void* dictBuffer;
+    const void* dictContent;
+    size_t dictContentSize;
+    ZSTD_CCtx* refContext;
+};  /* typedef'd to ZSTD_CDict within "zstd.h" */
+
+size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict)
+{
+    if (cdict==NULL) return 0;   /* support sizeof on NULL */
+    return ZSTD_sizeof_CCtx(cdict->refContext) + (cdict->dictBuffer ? cdict->dictContentSize : 0) + sizeof(*cdict);
+}
+
+ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, unsigned byReference,
+                                      ZSTD_parameters params, ZSTD_customMem customMem)
+{
+    if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem;
+    if (!customMem.customAlloc || !customMem.customFree) return NULL;
+
+    {   ZSTD_CDict* const cdict = (ZSTD_CDict*) ZSTD_malloc(sizeof(ZSTD_CDict), customMem);
+        ZSTD_CCtx* const cctx = ZSTD_createCCtx_advanced(customMem);
+
+        if (!cdict || !cctx) {
+            ZSTD_free(cdict, customMem);
+            ZSTD_freeCCtx(cctx);
+            return NULL;
+        }
+
+        if ((byReference) || (!dictBuffer) || (!dictSize)) {
+            cdict->dictBuffer = NULL;
+            cdict->dictContent = dictBuffer;
+        } else {
+            void* const internalBuffer = ZSTD_malloc(dictSize, customMem);
+            if (!internalBuffer) { ZSTD_free(cctx, customMem); ZSTD_free(cdict, customMem); return NULL; }
+            memcpy(internalBuffer, dictBuffer, dictSize);
+            cdict->dictBuffer = internalBuffer;
+            cdict->dictContent = internalBuffer;
+        }
+
+        {   size_t const errorCode = ZSTD_compressBegin_advanced(cctx, cdict->dictContent, dictSize, params, 0);
+            if (ZSTD_isError(errorCode)) {
+                ZSTD_free(cdict->dictBuffer, customMem);
+                ZSTD_free(cdict, customMem);
+                ZSTD_freeCCtx(cctx);
+                return NULL;
+        }   }
+
+        cdict->refContext = cctx;
+        cdict->dictContentSize = dictSize;
+        return cdict;
+    }
+}
+
+ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_customMem const allocator = { NULL, NULL, NULL };
+    ZSTD_parameters params = ZSTD_getParams(compressionLevel, 0, dictSize);
+    params.fParams.contentSizeFlag = 1;
+    return ZSTD_createCDict_advanced(dict, dictSize, 0, params, allocator);
+}
+
+ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_customMem const allocator = { NULL, NULL, NULL };
+    ZSTD_parameters params = ZSTD_getParams(compressionLevel, 0, dictSize);
+    params.fParams.contentSizeFlag = 1;
+    return ZSTD_createCDict_advanced(dict, dictSize, 1, params, allocator);
+}
+
+size_t ZSTD_freeCDict(ZSTD_CDict* cdict)
+{
+    if (cdict==NULL) return 0;   /* support free on NULL */
+    {   ZSTD_customMem const cMem = cdict->refContext->customMem;
+        ZSTD_freeCCtx(cdict->refContext);
+        ZSTD_free(cdict->dictBuffer, cMem);
+        ZSTD_free(cdict, cMem);
+        return 0;
+    }
+}
+
+static ZSTD_parameters ZSTD_getParamsFromCDict(const ZSTD_CDict* cdict) {
+    return ZSTD_getParamsFromCCtx(cdict->refContext);
+}
+
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, unsigned long long pledgedSrcSize)
+{
+    if (cdict->dictContentSize) CHECK_F(ZSTD_copyCCtx(cctx, cdict->refContext, pledgedSrcSize))
+    else {
+        ZSTD_parameters params = cdict->refContext->params;
+        params.fParams.contentSizeFlag = (pledgedSrcSize > 0);
+        CHECK_F(ZSTD_compressBegin_advanced(cctx, NULL, 0, params, pledgedSrcSize));
+    }
+    return 0;
+}
+
+/*! ZSTD_compress_usingCDict() :
+*   Compression using a digested Dictionary.
+*   Faster startup than ZSTD_compress_usingDict(), recommended when the same dictionary is used multiple times.
+*   Note that the compression level is decided during dictionary creation. */
+size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+                                void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize,
+                                const ZSTD_CDict* cdict)
+{
+    CHECK_F(ZSTD_compressBegin_usingCDict(cctx, cdict, srcSize));
+
+    if (cdict->refContext->params.fParams.contentSizeFlag==1) {
+        cctx->params.fParams.contentSizeFlag = 1;
+        cctx->frameContentSize = srcSize;
+    }
+
+    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+}
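+
+/* Usage sketch (editorial) : digest the dictionary once, then reuse it across
+   several frames; as noted above, the cdict must outlive every session.
+   example_compressWithSharedDict is a hypothetical name. */
+static size_t example_compressWithSharedDict(ZSTD_CCtx* cctx,
+                                             void* dst, size_t dstCapacity,
+                                             const void* src1, size_t srcSize1,
+                                             const void* src2, size_t srcSize2,
+                                             const void* dictBuf, size_t dictSize)
+{
+    ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictSize, 3 /* level */);
+    size_t cSize1, cSize2;
+    if (cdict == NULL) return ERROR(memory_allocation);
+    cSize1 = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, src1, srcSize1, cdict);
+    if (ZSTD_isError(cSize1)) { ZSTD_freeCDict(cdict); return cSize1; }
+    cSize2 = ZSTD_compress_usingCDict(cctx, (char*)dst + cSize1, dstCapacity - cSize1,
+                                      src2, srcSize2, cdict);
+    ZSTD_freeCDict(cdict);
+    if (ZSTD_isError(cSize2)) return cSize2;
+    return cSize1 + cSize2;   /* two back-to-back frames sharing one digested dictionary */
+}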
+
+
+
+/* ******************************************************************
+*  Streaming
+********************************************************************/
+
+typedef enum { zcss_init, zcss_load, zcss_flush, zcss_final } ZSTD_cStreamStage;
+
+struct ZSTD_CStream_s {
+    ZSTD_CCtx* cctx;
+    ZSTD_CDict* cdictLocal;
+    const ZSTD_CDict* cdict;
+    char*  inBuff;
+    size_t inBuffSize;
+    size_t inToCompress;
+    size_t inBuffPos;
+    size_t inBuffTarget;
+    size_t blockSize;
+    char*  outBuff;
+    size_t outBuffSize;
+    size_t outBuffContentSize;
+    size_t outBuffFlushedSize;
+    ZSTD_cStreamStage stage;
+    U32    checksum;
+    U32    frameEnded;
+    U64    pledgedSrcSize;
+    U64    inputProcessed;
+    ZSTD_parameters params;
+    ZSTD_customMem customMem;
+};   /* typedef'd to ZSTD_CStream within "zstd.h" */
+
+ZSTD_CStream* ZSTD_createCStream(void)
+{
+    return ZSTD_createCStream_advanced(defaultCustomMem);
+}
+
+ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem)
+{
+    ZSTD_CStream* zcs;
+
+    if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem;
+    if (!customMem.customAlloc || !customMem.customFree) return NULL;
+
+    zcs = (ZSTD_CStream*)ZSTD_malloc(sizeof(ZSTD_CStream), customMem);
+    if (zcs==NULL) return NULL;
+    memset(zcs, 0, sizeof(ZSTD_CStream));
+    memcpy(&zcs->customMem, &customMem, sizeof(ZSTD_customMem));
+    zcs->cctx = ZSTD_createCCtx_advanced(customMem);
+    if (zcs->cctx == NULL) { ZSTD_freeCStream(zcs); return NULL; }
+    return zcs;
+}
+
+size_t ZSTD_freeCStream(ZSTD_CStream* zcs)
+{
+    if (zcs==NULL) return 0;   /* support free on NULL */
+    {   ZSTD_customMem const cMem = zcs->customMem;
+        ZSTD_freeCCtx(zcs->cctx);
+        ZSTD_freeCDict(zcs->cdictLocal);
+        ZSTD_free(zcs->inBuff, cMem);
+        ZSTD_free(zcs->outBuff, cMem);
+        ZSTD_free(zcs, cMem);
+        return 0;
+    }
+}
+
+
+/*======   Initialization   ======*/
+
+size_t ZSTD_CStreamInSize(void)  { return ZSTD_BLOCKSIZE_ABSOLUTEMAX; }
+size_t ZSTD_CStreamOutSize(void) { return ZSTD_compressBound(ZSTD_BLOCKSIZE_ABSOLUTEMAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; }
+
+static size_t ZSTD_resetCStream_internal(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize)
+{
+    if (zcs->inBuffSize==0) return ERROR(stage_wrong);   /* zcs has not been initialized at least once => can't reset */
+
+    if (zcs->cdict) CHECK_F(ZSTD_compressBegin_usingCDict(zcs->cctx, zcs->cdict, pledgedSrcSize))
+    else CHECK_F(ZSTD_compressBegin_advanced(zcs->cctx, NULL, 0, zcs->params, pledgedSrcSize));
+
+    zcs->inToCompress = 0;
+    zcs->inBuffPos = 0;
+    zcs->inBuffTarget = zcs->blockSize;
+    zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0;
+    zcs->stage = zcss_load;
+    zcs->frameEnded = 0;
+    zcs->pledgedSrcSize = pledgedSrcSize;
+    zcs->inputProcessed = 0;
+    return 0;   /* ready to go */
+}
+
+size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize)
+{
+    zcs->params.fParams.contentSizeFlag = (pledgedSrcSize > 0);
+    return ZSTD_resetCStream_internal(zcs, pledgedSrcSize);
+}
+
+size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+                                 const void* dict, size_t dictSize,
+                                 ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+    /* allocate buffers */
+    {   size_t const neededInBuffSize = (size_t)1 << params.cParams.windowLog;
+        if (zcs->inBuffSize < neededInBuffSize) {
+            zcs->inBuffSize = neededInBuffSize;
+            ZSTD_free(zcs->inBuff, zcs->customMem);
+            zcs->inBuff = (char*) ZSTD_malloc(neededInBuffSize, zcs->customMem);
+            if (zcs->inBuff == NULL) return ERROR(memory_allocation);
+        }
+        zcs->blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, neededInBuffSize);
+    }
+    if (zcs->outBuffSize < ZSTD_compressBound(zcs->blockSize)+1) {
+        zcs->outBuffSize = ZSTD_compressBound(zcs->blockSize)+1;
+        ZSTD_free(zcs->outBuff, zcs->customMem);
+        zcs->outBuff = (char*) ZSTD_malloc(zcs->outBuffSize, zcs->customMem);
+        if (zcs->outBuff == NULL) return ERROR(memory_allocation);
+    }
+
+    if (dict && dictSize >= 8) {
+        ZSTD_freeCDict(zcs->cdictLocal);
+        zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, 0, params, zcs->customMem);
+        if (zcs->cdictLocal == NULL) return ERROR(memory_allocation);
+        zcs->cdict = zcs->cdictLocal;
+    } else zcs->cdict = NULL;
+
+    zcs->checksum = params.fParams.checksumFlag > 0;
+    zcs->params = params;
+
+    return ZSTD_resetCStream_internal(zcs, pledgedSrcSize);
+}
+
+/* note : cdict must outlive compression session */
+size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict)
+{
+    ZSTD_parameters const params = ZSTD_getParamsFromCDict(cdict);
+    size_t const initError =  ZSTD_initCStream_advanced(zcs, NULL, 0, params, 0);
+    zcs->cdict = cdict;
+    zcs->cctx->dictID = params.fParams.noDictIDFlag ? 0 : cdict->refContext->dictID;
+    return initError;
+}
+
+size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel)
+{
+    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize);
+    return ZSTD_initCStream_advanced(zcs, dict, dictSize, params, 0);
+}
+
+size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize)
+{
+    ZSTD_parameters params = ZSTD_getParams(compressionLevel, pledgedSrcSize, 0);
+    if (pledgedSrcSize) params.fParams.contentSizeFlag = 1;
+    return ZSTD_initCStream_advanced(zcs, NULL, 0, params, pledgedSrcSize);
+}
+
+size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
+{
+    return ZSTD_initCStream_usingDict(zcs, NULL, 0, compressionLevel);
+}
+
+size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs)
+{
+    if (zcs==NULL) return 0;   /* support sizeof on NULL */
+    return sizeof(*zcs) + ZSTD_sizeof_CCtx(zcs->cctx) + ZSTD_sizeof_CDict(zcs->cdictLocal) + zcs->outBuffSize + zcs->inBuffSize;   /* sizeof(*zcs) : count the structure itself, not the pointer */
+}
+
+/*======   Compression   ======*/
+
+typedef enum { zsf_gather, zsf_flush, zsf_end } ZSTD_flush_e;
+
+MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    size_t const length = MIN(dstCapacity, srcSize);
+    memcpy(dst, src, length);
+    return length;
+}
+
+static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+                              void* dst, size_t* dstCapacityPtr,
+                        const void* src, size_t* srcSizePtr,
+                              ZSTD_flush_e const flush)
+{
+    U32 someMoreWork = 1;
+    const char* const istart = (const char*)src;
+    const char* const iend = istart + *srcSizePtr;
+    const char* ip = istart;
+    char* const ostart = (char*)dst;
+    char* const oend = ostart + *dstCapacityPtr;
+    char* op = ostart;
+
+    while (someMoreWork) {
+        switch(zcs->stage)
+        {
+        case zcss_init: return ERROR(init_missing);   /* call ZSTD_initCStream() first ! */
+
+        case zcss_load:
+            /* complete inBuffer */
+            {   size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos;
+                size_t const loaded = ZSTD_limitCopy(zcs->inBuff + zcs->inBuffPos, toLoad, ip, iend-ip);
+                zcs->inBuffPos += loaded;
+                ip += loaded;
+                if ( (zcs->inBuffPos==zcs->inToCompress) || (!flush && (toLoad != loaded)) ) {
+                    someMoreWork = 0; break;  /* not enough input to get a full block : stop there, wait for more */
+            }   }
+            /* compress current block (note : this stage cannot be stopped in the middle) */
+            {   void* cDst;
+                size_t cSize;
+                size_t const iSize = zcs->inBuffPos - zcs->inToCompress;
+                size_t oSize = oend-op;
+                if (oSize >= ZSTD_compressBound(iSize))
+                    cDst = op;   /* compress directly into output buffer (avoid flush stage) */
+                else
+                    cDst = zcs->outBuff, oSize = zcs->outBuffSize;
+                cSize = (flush == zsf_end) ?
+                        ZSTD_compressEnd(zcs->cctx, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize) :
+                        ZSTD_compressContinue(zcs->cctx, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize);
+                if (ZSTD_isError(cSize)) return cSize;
+                if (flush == zsf_end) zcs->frameEnded = 1;
+                /* prepare next block */
+                zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize;
+                if (zcs->inBuffTarget > zcs->inBuffSize)
+                    zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize;   /* note : inBuffSize >= blockSize */
+                zcs->inToCompress = zcs->inBuffPos;
+                if (cDst == op) { op += cSize; break; }   /* no need to flush */
+                zcs->outBuffContentSize = cSize;
+                zcs->outBuffFlushedSize = 0;
+                zcs->stage = zcss_flush;   /* pass-through to flush stage */
+            }
+
+        case zcss_flush:
+            {   size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize;
+                size_t const flushed = ZSTD_limitCopy(op, oend-op, zcs->outBuff + zcs->outBuffFlushedSize, toFlush);
+                op += flushed;
+                zcs->outBuffFlushedSize += flushed;
+                if (toFlush!=flushed) { someMoreWork = 0; break; }  /* dst too small to store flushed data : stop there */
+                zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0;
+                zcs->stage = zcss_load;
+                break;
+            }
+
+        case zcss_final:
+            someMoreWork = 0;   /* do nothing */
+            break;
+
+        default:
+            return ERROR(GENERIC);   /* impossible */
+        }
+    }
+
+    *srcSizePtr = ip - istart;
+    *dstCapacityPtr = op - ostart;
+    zcs->inputProcessed += *srcSizePtr;
+    if (zcs->frameEnded) return 0;
+    {   size_t hintInSize = zcs->inBuffTarget - zcs->inBuffPos;
+        if (hintInSize==0) hintInSize = zcs->blockSize;
+        return hintInSize;
+    }
+}
+
+size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+    size_t sizeRead = input->size - input->pos;
+    size_t sizeWritten = output->size - output->pos;
+    size_t const result = ZSTD_compressStream_generic(zcs,
+                                                      (char*)(output->dst) + output->pos, &sizeWritten,
+                                                      (const char*)(input->src) + input->pos, &sizeRead, zsf_gather);
+    input->pos += sizeRead;
+    output->pos += sizeWritten;
+    return result;
+}
+
+
+/*======   Finalize   ======*/
+
+/*! ZSTD_flushStream() :
+*   @return : amount of data remaining to flush */
+size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{
+    size_t srcSize = 0;
+    size_t sizeWritten = output->size - output->pos;
+    size_t const result = ZSTD_compressStream_generic(zcs,
+                                                     (char*)(output->dst) + output->pos, &sizeWritten,
+                                                     &srcSize, &srcSize, /* use a valid src address instead of NULL */
+                                                      zsf_flush);
+    output->pos += sizeWritten;
+    if (ZSTD_isError(result)) return result;
+    return zcs->outBuffContentSize - zcs->outBuffFlushedSize;   /* remaining to flush */
+}
+
+
+size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{
+    BYTE* const ostart = (BYTE*)(output->dst) + output->pos;
+    BYTE* const oend = (BYTE*)(output->dst) + output->size;
+    BYTE* op = ostart;
+
+    if ((zcs->pledgedSrcSize) && (zcs->inputProcessed != zcs->pledgedSrcSize))
+        return ERROR(srcSize_wrong);   /* pledgedSrcSize not respected */
+
+    if (zcs->stage != zcss_final) {
+        /* flush whatever remains */
+        size_t srcSize = 0;
+        size_t sizeWritten = output->size - output->pos;
+        size_t const notEnded = ZSTD_compressStream_generic(zcs, ostart, &sizeWritten, &srcSize, &srcSize, zsf_end);  /* use a valid src address instead of NULL */
+        size_t const remainingToFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize;
+        op += sizeWritten;
+        if (remainingToFlush) {
+            output->pos += sizeWritten;
+            return remainingToFlush + ZSTD_BLOCKHEADERSIZE /* final empty block */ + (zcs->checksum * 4);
+        }
+        /* create epilogue */
+        zcs->stage = zcss_final;
+        zcs->outBuffContentSize = !notEnded ? 0 :
+            ZSTD_compressEnd(zcs->cctx, zcs->outBuff, zcs->outBuffSize, NULL, 0);  /* write epilogue, including final empty block, into outBuff */
+    }
+
+    /* flush epilogue */
+    {   size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize;
+        size_t const flushed = ZSTD_limitCopy(op, oend-op, zcs->outBuff + zcs->outBuffFlushedSize, toFlush);
+        op += flushed;
+        zcs->outBuffFlushedSize += flushed;
+        output->pos += op-ostart;
+        if (toFlush==flushed) zcs->stage = zcss_init;  /* end reached */
+        return toFlush - flushed;
+    }
+}
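+
+/* Usage sketch (editorial) : the canonical streaming loop over
+   ZSTD_compressStream() / ZSTD_endStream() with the recommended buffer sizes.
+   Assumes <stdio.h> for the FILE-based source and sink; example_streamFile,
+   fin and fout are hypothetical names.  inBuff/outBuff must hold at least
+   ZSTD_CStreamInSize() / ZSTD_CStreamOutSize() bytes respectively. */
+static size_t example_streamFile(ZSTD_CStream* zcs, FILE* fin, FILE* fout,
+                                 void* inBuff, void* outBuff)
+{
+    size_t const inSize  = ZSTD_CStreamInSize();
+    size_t const outSize = ZSTD_CStreamOutSize();
+    size_t toRead = inSize;
+    CHECK_F(ZSTD_initCStream(zcs, 3 /* compression level */));
+    while (1) {
+        ZSTD_inBuffer input;
+        size_t const readSize = fread(inBuff, 1, toRead, fin);
+        if (readSize == 0) break;   /* input exhausted */
+        input.src = inBuff; input.size = readSize; input.pos = 0;
+        while (input.pos < input.size) {
+            ZSTD_outBuffer output;
+            output.dst = outBuff; output.size = outSize; output.pos = 0;
+            toRead = ZSTD_compressStream(zcs, &output, &input);   /* returns a size hint */
+            if (ZSTD_isError(toRead)) return toRead;
+            if (toRead > inSize) toRead = inSize;
+            fwrite(outBuff, 1, output.pos, fout);
+        }
+    }
+    {   size_t remaining;
+        do {   /* drain the epilogue until ZSTD_endStream() reports 0 remaining */
+            ZSTD_outBuffer output;
+            output.dst = outBuff; output.size = outSize; output.pos = 0;
+            remaining = ZSTD_endStream(zcs, &output);
+            if (ZSTD_isError(remaining)) return remaining;
+            fwrite(outBuff, 1, output.pos, fout);
+        } while (remaining > 0);
+    }
+    return 0;
+}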
+
+
+
+/*-=====  Pre-defined compression levels  =====-*/
+
+#define ZSTD_DEFAULT_CLEVEL 1
+#define ZSTD_MAX_CLEVEL     22
+int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
+
+static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {
+{   /* "default" */
+    /* W,  C,  H,  S,  L, TL, strat */
+    { 18, 12, 12,  1,  7, 16, ZSTD_fast    },  /* level  0 - never used */
+    { 19, 13, 14,  1,  7, 16, ZSTD_fast    },  /* level  1 */
+    { 19, 15, 16,  1,  6, 16, ZSTD_fast    },  /* level  2 */
+    { 20, 16, 17,  1,  5, 16, ZSTD_dfast   },  /* level  3.*/
+    { 20, 18, 18,  1,  5, 16, ZSTD_dfast   },  /* level  4.*/
+    { 20, 15, 18,  3,  5, 16, ZSTD_greedy  },  /* level  5 */
+    { 21, 16, 19,  2,  5, 16, ZSTD_lazy    },  /* level  6 */
+    { 21, 17, 20,  3,  5, 16, ZSTD_lazy    },  /* level  7 */
+    { 21, 18, 20,  3,  5, 16, ZSTD_lazy2   },  /* level  8 */
+    { 21, 20, 20,  3,  5, 16, ZSTD_lazy2   },  /* level  9 */
+    { 21, 19, 21,  4,  5, 16, ZSTD_lazy2   },  /* level 10 */
+    { 22, 20, 22,  4,  5, 16, ZSTD_lazy2   },  /* level 11 */
+    { 22, 20, 22,  5,  5, 16, ZSTD_lazy2   },  /* level 12 */
+    { 22, 21, 22,  5,  5, 16, ZSTD_lazy2   },  /* level 13 */
+    { 22, 21, 22,  6,  5, 16, ZSTD_lazy2   },  /* level 14 */
+    { 22, 21, 21,  5,  5, 16, ZSTD_btlazy2 },  /* level 15 */
+    { 23, 22, 22,  5,  5, 16, ZSTD_btlazy2 },  /* level 16 */
+    { 23, 21, 22,  4,  5, 24, ZSTD_btopt   },  /* level 17 */
+    { 23, 23, 22,  6,  5, 32, ZSTD_btopt   },  /* level 18 */
+    { 23, 23, 22,  6,  3, 48, ZSTD_btopt   },  /* level 19 */
+    { 25, 25, 23,  7,  3, 64, ZSTD_btopt2  },  /* level 20 */
+    { 26, 26, 23,  7,  3,256, ZSTD_btopt2  },  /* level 21 */
+    { 27, 27, 25,  9,  3,512, ZSTD_btopt2  },  /* level 22 */
+},
+{   /* for srcSize <= 256 KB */
+    /* W,  C,  H,  S,  L,  T, strat */
+    {  0,  0,  0,  0,  0,  0, ZSTD_fast    },  /* level  0 - not used */
+    { 18, 13, 14,  1,  6,  8, ZSTD_fast    },  /* level  1 */
+    { 18, 14, 13,  1,  5,  8, ZSTD_dfast   },  /* level  2 */
+    { 18, 16, 15,  1,  5,  8, ZSTD_dfast   },  /* level  3 */
+    { 18, 15, 17,  1,  5,  8, ZSTD_greedy  },  /* level  4.*/
+    { 18, 16, 17,  4,  5,  8, ZSTD_greedy  },  /* level  5.*/
+    { 18, 16, 17,  3,  5,  8, ZSTD_lazy    },  /* level  6.*/
+    { 18, 17, 17,  4,  4,  8, ZSTD_lazy    },  /* level  7 */
+    { 18, 17, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
+    { 18, 17, 17,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
+    { 18, 17, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
+    { 18, 18, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 11.*/
+    { 18, 18, 17,  7,  4,  8, ZSTD_lazy2   },  /* level 12.*/
+    { 18, 19, 17,  6,  4,  8, ZSTD_btlazy2 },  /* level 13 */
+    { 18, 18, 18,  4,  4, 16, ZSTD_btopt   },  /* level 14.*/
+    { 18, 18, 18,  4,  3, 16, ZSTD_btopt   },  /* level 15.*/
+    { 18, 19, 18,  6,  3, 32, ZSTD_btopt   },  /* level 16.*/
+    { 18, 19, 18,  8,  3, 64, ZSTD_btopt   },  /* level 17.*/
+    { 18, 19, 18,  9,  3,128, ZSTD_btopt   },  /* level 18.*/
+    { 18, 19, 18, 10,  3,256, ZSTD_btopt   },  /* level 19.*/
+    { 18, 19, 18, 11,  3,512, ZSTD_btopt2  },  /* level 20.*/
+    { 18, 19, 18, 12,  3,512, ZSTD_btopt2  },  /* level 21.*/
+    { 18, 19, 18, 13,  3,512, ZSTD_btopt2  },  /* level 22.*/
+},
+{   /* for srcSize <= 128 KB */
+    /* W,  C,  H,  S,  L,  T, strat */
+    { 17, 12, 12,  1,  7,  8, ZSTD_fast    },  /* level  0 - not used */
+    { 17, 12, 13,  1,  6,  8, ZSTD_fast    },  /* level  1 */
+    { 17, 13, 16,  1,  5,  8, ZSTD_fast    },  /* level  2 */
+    { 17, 16, 16,  2,  5,  8, ZSTD_dfast   },  /* level  3 */
+    { 17, 13, 15,  3,  4,  8, ZSTD_greedy  },  /* level  4 */
+    { 17, 15, 17,  4,  4,  8, ZSTD_greedy  },  /* level  5 */
+    { 17, 16, 17,  3,  4,  8, ZSTD_lazy    },  /* level  6 */
+    { 17, 15, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  7 */
+    { 17, 17, 17,  4,  4,  8, ZSTD_lazy2   },  /* level  8 */
+    { 17, 17, 17,  5,  4,  8, ZSTD_lazy2   },  /* level  9 */
+    { 17, 17, 17,  6,  4,  8, ZSTD_lazy2   },  /* level 10 */
+    { 17, 17, 17,  7,  4,  8, ZSTD_lazy2   },  /* level 11 */
+    { 17, 17, 17,  8,  4,  8, ZSTD_lazy2   },  /* level 12 */
+    { 17, 18, 17,  6,  4,  8, ZSTD_btlazy2 },  /* level 13.*/
+    { 17, 17, 17,  7,  3,  8, ZSTD_btopt   },  /* level 14.*/
+    { 17, 17, 17,  7,  3, 16, ZSTD_btopt   },  /* level 15.*/
+    { 17, 18, 17,  7,  3, 32, ZSTD_btopt   },  /* level 16.*/
+    { 17, 18, 17,  7,  3, 64, ZSTD_btopt   },  /* level 17.*/
+    { 17, 18, 17,  7,  3,256, ZSTD_btopt   },  /* level 18.*/
+    { 17, 18, 17,  8,  3,256, ZSTD_btopt   },  /* level 19.*/
+    { 17, 18, 17,  9,  3,256, ZSTD_btopt2  },  /* level 20.*/
+    { 17, 18, 17, 10,  3,256, ZSTD_btopt2  },  /* level 21.*/
+    { 17, 18, 17, 11,  3,512, ZSTD_btopt2  },  /* level 22.*/
+},
+{   /* for srcSize <= 16 KB */
+    /* W,  C,  H,  S,  L,  T, strat */
+    { 14, 12, 12,  1,  7,  6, ZSTD_fast    },  /* level  0 - not used */
+    { 14, 14, 14,  1,  6,  6, ZSTD_fast    },  /* level  1 */
+    { 14, 14, 14,  1,  4,  6, ZSTD_fast    },  /* level  2 */
+    { 14, 14, 14,  1,  4,  6, ZSTD_dfast   },  /* level  3.*/
+    { 14, 14, 14,  4,  4,  6, ZSTD_greedy  },  /* level  4.*/
+    { 14, 14, 14,  3,  4,  6, ZSTD_lazy    },  /* level  5.*/
+    { 14, 14, 14,  4,  4,  6, ZSTD_lazy2   },  /* level  6 */
+    { 14, 14, 14,  5,  4,  6, ZSTD_lazy2   },  /* level  7 */
+    { 14, 14, 14,  6,  4,  6, ZSTD_lazy2   },  /* level  8.*/
+    { 14, 15, 14,  6,  4,  6, ZSTD_btlazy2 },  /* level  9.*/
+    { 14, 15, 14,  3,  3,  6, ZSTD_btopt   },  /* level 10.*/
+    { 14, 15, 14,  6,  3,  8, ZSTD_btopt   },  /* level 11.*/
+    { 14, 15, 14,  6,  3, 16, ZSTD_btopt   },  /* level 12.*/
+    { 14, 15, 14,  6,  3, 24, ZSTD_btopt   },  /* level 13.*/
+    { 14, 15, 15,  6,  3, 48, ZSTD_btopt   },  /* level 14.*/
+    { 14, 15, 15,  6,  3, 64, ZSTD_btopt   },  /* level 15.*/
+    { 14, 15, 15,  6,  3, 96, ZSTD_btopt   },  /* level 16.*/
+    { 14, 15, 15,  6,  3,128, ZSTD_btopt   },  /* level 17.*/
+    { 14, 15, 15,  6,  3,256, ZSTD_btopt   },  /* level 18.*/
+    { 14, 15, 15,  7,  3,256, ZSTD_btopt   },  /* level 19.*/
+    { 14, 15, 15,  8,  3,256, ZSTD_btopt2  },  /* level 20.*/
+    { 14, 15, 15,  9,  3,256, ZSTD_btopt2  },  /* level 21.*/
+    { 14, 15, 15, 10,  3,256, ZSTD_btopt2  },  /* level 22.*/
+},
+};
+
+/*! ZSTD_getCParams() :
+*   @return ZSTD_compressionParameters structure for a selected compression level, `srcSize` and `dictSize`.
+*   Size values are optional; provide 0 if not known or unused */
+ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSize, size_t dictSize)
+{
+    ZSTD_compressionParameters cp;
+    size_t const addedSize = srcSize ? 0 : 500;
+    U64 const rSize = srcSize+dictSize ? srcSize+dictSize+addedSize : (U64)-1;
+    U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB);   /* intentional underflow for srcSizeHint == 0 */
+    if (compressionLevel <= 0) compressionLevel = ZSTD_DEFAULT_CLEVEL;   /* 0 == default; no negative compressionLevel yet */
+    if (compressionLevel > ZSTD_MAX_CLEVEL) compressionLevel = ZSTD_MAX_CLEVEL;
+    cp = ZSTD_defaultCParameters[tableID][compressionLevel];
+    if (MEM_32bits()) {   /* auto-correction, for 32-bits mode */
+        if (cp.windowLog > ZSTD_WINDOWLOG_MAX) cp.windowLog = ZSTD_WINDOWLOG_MAX;
+        if (cp.chainLog > ZSTD_CHAINLOG_MAX) cp.chainLog = ZSTD_CHAINLOG_MAX;
+        if (cp.hashLog > ZSTD_HASHLOG_MAX) cp.hashLog = ZSTD_HASHLOG_MAX;
+    }
+    cp = ZSTD_adjustCParams(cp, srcSize, dictSize);
+    return cp;
+}
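+
+/* Illustration : a minimal usage sketch for ZSTD_getCParams(), kept under
+*  #if 0; printf (and therefore <stdio.h>) is assumed for display only. */
+#if 0
+static void ZSTD_exampleGetCParams(void)
+{
+    /* 100 KB source, no dictionary => tableID==2, i.e. the "srcSize <= 128 KB" table */
+    ZSTD_compressionParameters const cp = ZSTD_getCParams(3, 100 KB, 0);
+    printf("windowLog=%u chainLog=%u hashLog=%u searchLog=%u searchLength=%u targetLength=%u\n",
+           cp.windowLog, cp.chainLog, cp.hashLog,
+           cp.searchLog, cp.searchLength, cp.targetLength);
+}
+#endif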
+
+/*! ZSTD_getParams() :
+*   same as ZSTD_getCParams(), but @return a `ZSTD_parameters` object (instead of `ZSTD_compressionParameters`).
+*   All fields of `ZSTD_frameParameters` are set to default (0) */
+ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSize, size_t dictSize) {
+    ZSTD_parameters params;
+    ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, srcSize, dictSize);
+    memset(&params, 0, sizeof(params));
+    params.cParams = cParams;
+    return params;
+}
diff --git a/zstd/lib/compress/zstd_compress.o b/zstd/lib/compress/zstd_compress.o
new file mode 100644
index 0000000..c2c9939
Binary files /dev/null and b/zstd/lib/compress/zstd_compress.o differ
diff --git a/zstd/lib/compress/zstd_opt.h b/zstd/lib/compress/zstd_opt.h
new file mode 100644
index 0000000..ac418b6
--- /dev/null
+++ b/zstd/lib/compress/zstd_opt.h
@@ -0,0 +1,919 @@
+/**
+ * Copyright (c) 2016-present, Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+
+/* Note : this file is intended to be included within zstd_compress.c */
+
+
+#ifndef ZSTD_OPT_H_91842398743
+#define ZSTD_OPT_H_91842398743
+
+
+#define ZSTD_LITFREQ_ADD    2
+#define ZSTD_FREQ_DIV       4
+#define ZSTD_MAX_PRICE      (1<<30)
+
+/*-*************************************
+*  Price functions for optimal parser
+***************************************/
+FORCE_INLINE void ZSTD_setLog2Prices(seqStore_t* ssPtr)
+{
+    ssPtr->log2matchLengthSum = ZSTD_highbit32(ssPtr->matchLengthSum+1);
+    ssPtr->log2litLengthSum = ZSTD_highbit32(ssPtr->litLengthSum+1);
+    ssPtr->log2litSum = ZSTD_highbit32(ssPtr->litSum+1);
+    ssPtr->log2offCodeSum = ZSTD_highbit32(ssPtr->offCodeSum+1);
+    ssPtr->factor = 1 + ((ssPtr->litSum>>5) / ssPtr->litLengthSum) + ((ssPtr->litSum<<1) / (ssPtr->litSum + ssPtr->matchSum));
+}
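+
+/* The model above approximates a Shannon cost : coding a symbol s is priced at
+*  about log2(sum) - log2(freq[s]) bits, with ZSTD_highbit32() serving as an
+*  integer log2. `factor` is a small per-sequence surcharge, derived from the
+*  current literal/match statistics, added to every match price in
+*  ZSTD_getPrice() below. */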
+
+
+MEM_STATIC void ZSTD_rescaleFreqs(seqStore_t* ssPtr, const BYTE* src, size_t srcSize)
+{
+    unsigned u;
+
+    ssPtr->cachedLiterals = NULL;
+    ssPtr->cachedPrice = ssPtr->cachedLitLength = 0;
+    ssPtr->staticPrices = 0;
+
+    if (ssPtr->litLengthSum == 0) {
+        if (srcSize <= 1024) ssPtr->staticPrices = 1;
+
+        for (u=0; u<=MaxLit; u++)
+            ssPtr->litFreq[u] = 0;
+        for (u=0; u<srcSize; u++)
+            ssPtr->litFreq[src[u]]++;
+
+        ssPtr->litSum = 0;
+        ssPtr->litLengthSum = MaxLL+1;
+        ssPtr->matchLengthSum = MaxML+1;
+        ssPtr->offCodeSum = (MaxOff+1);
+        ssPtr->matchSum = (ZSTD_LITFREQ_ADD<<Litbits);
+
+        for (u=0; u<=MaxLit; u++) {
+            ssPtr->litFreq[u] = 1 + (ssPtr->litFreq[u]>>ZSTD_FREQ_DIV);
+            ssPtr->litSum += ssPtr->litFreq[u];
+        }
+        for (u=0; u<=MaxLL; u++)
+            ssPtr->litLengthFreq[u] = 1;
+        for (u=0; u<=MaxML; u++)
+            ssPtr->matchLengthFreq[u] = 1;
+        for (u=0; u<=MaxOff; u++)
+            ssPtr->offCodeFreq[u] = 1;
+    } else {
+        ssPtr->matchLengthSum = 0;
+        ssPtr->litLengthSum = 0;
+        ssPtr->offCodeSum = 0;
+        ssPtr->matchSum = 0;
+        ssPtr->litSum = 0;
+
+        for (u=0; u<=MaxLit; u++) {
+            ssPtr->litFreq[u] = 1 + (ssPtr->litFreq[u]>>(ZSTD_FREQ_DIV+1));
+            ssPtr->litSum += ssPtr->litFreq[u];
+        }
+        for (u=0; u<=MaxLL; u++) {
+            ssPtr->litLengthFreq[u] = 1 + (ssPtr->litLengthFreq[u]>>(ZSTD_FREQ_DIV+1));
+            ssPtr->litLengthSum += ssPtr->litLengthFreq[u];
+        }
+        for (u=0; u<=MaxML; u++) {
+            ssPtr->matchLengthFreq[u] = 1 + (ssPtr->matchLengthFreq[u]>>ZSTD_FREQ_DIV);
+            ssPtr->matchLengthSum += ssPtr->matchLengthFreq[u];
+            ssPtr->matchSum += ssPtr->matchLengthFreq[u] * (u + 3);
+        }
+        ssPtr->matchSum *= ZSTD_LITFREQ_ADD;
+        for (u=0; u<=MaxOff; u++) {
+            ssPtr->offCodeFreq[u] = 1 + (ssPtr->offCodeFreq[u]>>ZSTD_FREQ_DIV);
+            ssPtr->offCodeSum += ssPtr->offCodeFreq[u];
+        }
+    }
+
+    ZSTD_setLog2Prices(ssPtr);
+}
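+
+/* note : on re-entry (litLengthSum != 0), every frequency is divided by
+*  2^ZSTD_FREQ_DIV (2^(ZSTD_FREQ_DIV+1) for literals and literal lengths) and
+*  floored at 1, so statistics from previous blocks decay geometrically while
+*  every symbol remains encodable. */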
+
+
+FORCE_INLINE U32 ZSTD_getLiteralPrice(seqStore_t* ssPtr, U32 litLength, const BYTE* literals)
+{
+    U32 price, u;
+
+    if (ssPtr->staticPrices)
+        return ZSTD_highbit32((U32)litLength+1) + (litLength*6);
+
+    if (litLength == 0)
+        return ssPtr->log2litLengthSum - ZSTD_highbit32(ssPtr->litLengthFreq[0]+1);
+
+    /* literals */
+    if (ssPtr->cachedLiterals == literals) {
+        U32 const additional = litLength - ssPtr->cachedLitLength;
+        const BYTE* literals2 = ssPtr->cachedLiterals + ssPtr->cachedLitLength;
+        price = ssPtr->cachedPrice + additional * ssPtr->log2litSum;
+        for (u=0; u < additional; u++)
+            price -= ZSTD_highbit32(ssPtr->litFreq[literals2[u]]+1);
+        ssPtr->cachedPrice = price;
+        ssPtr->cachedLitLength = litLength;
+    } else {
+        price = litLength * ssPtr->log2litSum;
+        for (u=0; u < litLength; u++)
+            price -= ZSTD_highbit32(ssPtr->litFreq[literals[u]]+1);
+
+        if (litLength >= 12) {
+            ssPtr->cachedLiterals = literals;
+            ssPtr->cachedPrice = price;
+            ssPtr->cachedLitLength = litLength;
+        }
+    }
+
+    /* literal Length */
+    {   const BYTE LL_deltaCode = 19;
+        const BYTE llCode = (litLength>63) ? (BYTE)ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength];
+        price += LL_bits[llCode] + ssPtr->log2litLengthSum - ZSTD_highbit32(ssPtr->litLengthFreq[llCode]+1);
+    }
+
+    return price;
+}
+
+
+FORCE_INLINE U32 ZSTD_getPrice(seqStore_t* seqStorePtr, U32 litLength, const BYTE* literals, U32 offset, U32 matchLength, const int ultra)
+{
+    /* offset */
+    U32 price;
+    BYTE const offCode = (BYTE)ZSTD_highbit32(offset+1);
+
+    if (seqStorePtr->staticPrices)
+        return ZSTD_getLiteralPrice(seqStorePtr, litLength, literals) + ZSTD_highbit32((U32)matchLength+1) + 16 + offCode;
+
+    price = offCode + seqStorePtr->log2offCodeSum - ZSTD_highbit32(seqStorePtr->offCodeFreq[offCode]+1);
+    if (!ultra && offCode >= 20) price += (offCode-19)*2;
+
+    /* match Length */
+    {   const BYTE ML_deltaCode = 36;
+        const BYTE mlCode = (matchLength>127) ? (BYTE)ZSTD_highbit32(matchLength) + ML_deltaCode : ML_Code[matchLength];
+        price += ML_bits[mlCode] + seqStorePtr->log2matchLengthSum - ZSTD_highbit32(seqStorePtr->matchLengthFreq[mlCode]+1);
+    }
+
+    return price + ZSTD_getLiteralPrice(seqStorePtr, litLength, literals) + seqStorePtr->factor;
+}
+
+
+MEM_STATIC void ZSTD_updatePrice(seqStore_t* seqStorePtr, U32 litLength, const BYTE* literals, U32 offset, U32 matchLength)
+{
+    U32 u;
+
+    /* literals */
+    seqStorePtr->litSum += litLength*ZSTD_LITFREQ_ADD;
+    for (u=0; u < litLength; u++)
+        seqStorePtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD;
+
+    /* literal Length */
+    {   const BYTE LL_deltaCode = 19;
+        const BYTE llCode = (litLength>63) ? (BYTE)ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength];
+        seqStorePtr->litLengthFreq[llCode]++;
+        seqStorePtr->litLengthSum++;
+    }
+
+    /* match offset */
+    {   BYTE const offCode = (BYTE)ZSTD_highbit32(offset+1);
+        seqStorePtr->offCodeSum++;
+        seqStorePtr->offCodeFreq[offCode]++;
+    }
+
+    /* match Length */
+    {   const BYTE ML_deltaCode = 36;
+        const BYTE mlCode = (matchLength>127) ? (BYTE)ZSTD_highbit32(matchLength) + ML_deltaCode : ML_Code[matchLength];
+        seqStorePtr->matchLengthFreq[mlCode]++;
+        seqStorePtr->matchLengthSum++;
+    }
+
+    ZSTD_setLog2Prices(seqStorePtr);
+}
+
+
+#define SET_PRICE(pos, mlen_, offset_, litlen_, price_)   \
+    {                                                 \
+        while (last_pos < pos)  { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } \
+        opt[pos].mlen = mlen_;                         \
+        opt[pos].off = offset_;                        \
+        opt[pos].litlen = litlen_;                     \
+        opt[pos].price = price_;                       \
+    }
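+
+/* SET_PRICE() records a candidate (match length, offset, literal run, price) at
+*  slot `pos` of the opt[] table; slots between the previous last_pos and pos
+*  are first padded with ZSTD_MAX_PRICE so they compare as worse than any real
+*  candidate. Note : it reads and updates a local `last_pos` at each call site. */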
+
+
+
+/* Update hashTable3 up to ip (excluded)
+   Assumption : always within prefix (i.e. not within extDict) */
+FORCE_INLINE
+U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_CCtx* zc, const BYTE* ip)
+{
+    U32* const hashTable3  = zc->hashTable3;
+    U32 const hashLog3  = zc->hashLog3;
+    const BYTE* const base = zc->base;
+    U32 idx = zc->nextToUpdate3;
+    const U32 target = zc->nextToUpdate3 = (U32)(ip - base);
+    const size_t hash3 = ZSTD_hash3Ptr(ip, hashLog3);
+
+    while(idx < target) {
+        hashTable3[ZSTD_hash3Ptr(base+idx, hashLog3)] = idx;
+        idx++;
+    }
+
+    return hashTable3[hash3];
+}
+
+
+/*-*************************************
+*  Binary Tree search
+***************************************/
+static U32 ZSTD_insertBtAndGetAllMatches (
+                        ZSTD_CCtx* zc,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        U32 nbCompares, const U32 mls,
+                        U32 extDict, ZSTD_match_t* matches, const U32 minMatchLen)
+{
+    const BYTE* const base = zc->base;
+    const U32 current = (U32)(ip-base);
+    const U32 hashLog = zc->params.cParams.hashLog;
+    const size_t h  = ZSTD_hashPtr(ip, hashLog, mls);
+    U32* const hashTable = zc->hashTable;
+    U32 matchIndex  = hashTable[h];
+    U32* const bt   = zc->chainTable;
+    const U32 btLog = zc->params.cParams.chainLog - 1;
+    const U32 btMask= (1U << btLog) - 1;
+    size_t commonLengthSmaller=0, commonLengthLarger=0;
+    const BYTE* const dictBase = zc->dictBase;
+    const U32 dictLimit = zc->dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const U32 btLow = btMask >= current ? 0 : current - btMask;
+    const U32 windowLow = zc->lowLimit;
+    U32* smallerPtr = bt + 2*(current&btMask);
+    U32* largerPtr  = bt + 2*(current&btMask) + 1;
+    U32 matchEndIdx = current+8;
+    U32 dummy32;   /* to be nullified at the end */
+    U32 mnum = 0;
+
+    const U32 minMatch = (mls == 3) ? 3 : 4;
+    size_t bestLength = minMatchLen-1;
+
+    if (minMatch == 3) { /* HC3 match finder */
+        U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3 (zc, ip);
+        if (matchIndex3>windowLow && (current - matchIndex3 < (1<<18))) {
+            const BYTE* match;
+            size_t currentMl=0;
+            if ((!extDict) || matchIndex3 >= dictLimit) {
+                match = base + matchIndex3;
+                if (match[bestLength] == ip[bestLength]) currentMl = ZSTD_count(ip, match, iLimit);
+            } else {
+                match = dictBase + matchIndex3;
+                if (MEM_readMINMATCH(match, MINMATCH) == MEM_readMINMATCH(ip, MINMATCH))    /* assumption : matchIndex3 <= dictLimit-4 (by table construction) */
+                    currentMl = ZSTD_count_2segments(ip+MINMATCH, match+MINMATCH, iLimit, dictEnd, prefixStart) + MINMATCH;
+            }
+
+            /* save best solution */
+            if (currentMl > bestLength) {
+                bestLength = currentMl;
+                matches[mnum].off = ZSTD_REP_MOVE_OPT + current - matchIndex3;
+                matches[mnum].len = (U32)currentMl;
+                mnum++;
+                if (currentMl > ZSTD_OPT_NUM) goto update;
+                if (ip+currentMl == iLimit) goto update; /* best possible, and avoid read overflow */
+            }
+        }
+    }
+
+    hashTable[h] = current;   /* Update Hash Table */
+
+    while (nbCompares-- && (matchIndex > windowLow)) {
+        U32* nextPtr = bt + 2*(matchIndex & btMask);
+        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
+        const BYTE* match;
+
+        if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
+            match = base + matchIndex;
+            if (match[matchLength] == ip[matchLength]) {
+                matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iLimit) +1;
+            }
+        } else {
+            match = dictBase + matchIndex;
+            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart);
+            if (matchIndex+matchLength >= dictLimit)
+                match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
+        }
+
+        if (matchLength > bestLength) {
+            if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength;
+            bestLength = matchLength;
+            matches[mnum].off = ZSTD_REP_MOVE_OPT + current - matchIndex;
+            matches[mnum].len = (U32)matchLength;
+            mnum++;
+            if (matchLength > ZSTD_OPT_NUM) break;
+            if (ip+matchLength == iLimit)   /* equal : no way to know if inf or sup */
+                break;   /* drop, to guarantee consistency (miss a little bit of compression) */
+        }
+
+        if (match[matchLength] < ip[matchLength]) {
+            /* match is smaller than current */
+            *smallerPtr = matchIndex;             /* update smaller idx */
+            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
+            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
+            matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
+        } else {
+            /* match is larger than current */
+            *largerPtr = matchIndex;
+            commonLengthLarger = matchLength;
+            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
+            largerPtr = nextPtr;
+            matchIndex = nextPtr[0];
+    }   }
+
+    *smallerPtr = *largerPtr = 0;
+
+update:
+    zc->nextToUpdate = (matchEndIdx > current + 8) ? matchEndIdx - 8 : current+1;
+    return mnum;
+}
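+
+/* note : matches[] is filled in strictly increasing length order (a candidate
+*  is only appended when longer than all previous ones), so matches[mnum-1] is
+*  always the longest match found; the callers below rely on this property. */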
+
+
+/** Tree updater, providing best match */
+static U32 ZSTD_BtGetAllMatches (
+                        ZSTD_CCtx* zc,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        const U32 maxNbAttempts, const U32 mls, ZSTD_match_t* matches, const U32 minMatchLen)
+{
+    if (ip < zc->base + zc->nextToUpdate) return 0;   /* skipped area */
+    ZSTD_updateTree(zc, ip, iLimit, maxNbAttempts, mls);
+    return ZSTD_insertBtAndGetAllMatches(zc, ip, iLimit, maxNbAttempts, mls, 0, matches, minMatchLen);
+}
+
+
+static U32 ZSTD_BtGetAllMatches_selectMLS (
+                        ZSTD_CCtx* zc,   /* Index table will be updated */
+                        const BYTE* ip, const BYTE* const iHighLimit,
+                        const U32 maxNbAttempts, const U32 matchLengthSearch, ZSTD_match_t* matches, const U32 minMatchLen)
+{
+    switch(matchLengthSearch)
+    {
+    case 3 : return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 3, matches, minMatchLen);
+    default :
+    case 4 : return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minMatchLen);
+    case 5 : return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minMatchLen);
+    case 6 : return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minMatchLen);
+    }
+}
+
+/** Tree updater, providing best match */
+static U32 ZSTD_BtGetAllMatches_extDict (
+                        ZSTD_CCtx* zc,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        const U32 maxNbAttempts, const U32 mls, ZSTD_match_t* matches, const U32 minMatchLen)
+{
+    if (ip < zc->base + zc->nextToUpdate) return 0;   /* skipped area */
+    ZSTD_updateTree_extDict(zc, ip, iLimit, maxNbAttempts, mls);
+    return ZSTD_insertBtAndGetAllMatches(zc, ip, iLimit, maxNbAttempts, mls, 1, matches, minMatchLen);
+}
+
+
+static U32 ZSTD_BtGetAllMatches_selectMLS_extDict (
+                        ZSTD_CCtx* zc,   /* Index table will be updated */
+                        const BYTE* ip, const BYTE* const iHighLimit,
+                        const U32 maxNbAttempts, const U32 matchLengthSearch, ZSTD_match_t* matches, const U32 minMatchLen)
+{
+    switch(matchLengthSearch)
+    {
+    case 3 : return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 3, matches, minMatchLen);
+    default :
+    case 4 : return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minMatchLen);
+    case 5 : return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minMatchLen);
+    case 6 : return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minMatchLen);
+    }
+}
+
+
+/*-*******************************
+*  Optimal parser
+*********************************/
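+/* The two parsers below perform a shortest-path search over each block :
+*  opt[pos] holds the cheapest known way (price, in estimated bits) to encode
+*  the first `pos` bytes, reached either by extending a literal run or by one
+*  of the candidate matches / repcodes found at an earlier position; the chosen
+*  sequence chain is then read back from opt[] and emitted. */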
+FORCE_INLINE
+void ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
+                                    const void* src, size_t srcSize, const int ultra)
+{
+    seqStore_t* seqStorePtr = &(ctx->seqStore);
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ctx->base;
+    const BYTE* const prefixStart = base + ctx->dictLimit;
+
+    const U32 maxSearches = 1U << ctx->params.cParams.searchLog;
+    const U32 sufficient_len = ctx->params.cParams.targetLength;
+    const U32 mls = ctx->params.cParams.searchLength;
+    const U32 minMatch = (ctx->params.cParams.searchLength == 3) ? 3 : 4;
+
+    ZSTD_optimal_t* opt = seqStorePtr->priceTable;
+    ZSTD_match_t* matches = seqStorePtr->matchTable;
+    const BYTE* inr;
+    U32 offset, rep[ZSTD_REP_NUM];
+
+    /* init */
+    ctx->nextToUpdate3 = ctx->nextToUpdate;
+    ZSTD_rescaleFreqs(seqStorePtr, (const BYTE*)src, srcSize);
+    ip += (ip==prefixStart);
+    { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) rep[i]=ctx->rep[i]; }
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        U32 cur, match_num, last_pos, litlen, price;
+        U32 u, mlen, best_mlen, best_off, litLength;
+        memset(opt, 0, sizeof(ZSTD_optimal_t));
+        last_pos = 0;
+        litlen = (U32)(ip - anchor);
+
+        /* check repCode */
+        {   U32 i, last_i = ZSTD_REP_CHECK + (ip==anchor);
+            for (i=(ip == anchor); i<last_i; i++) {
+                const S32 repCur = (i==ZSTD_REP_MOVE_OPT) ? (rep[0] - 1) : rep[i];
+                if ( (repCur > 0) && (repCur < (S32)(ip-prefixStart))
+                    && (MEM_readMINMATCH(ip, minMatch) == MEM_readMINMATCH(ip - repCur, minMatch))) {
+                    mlen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repCur, iend) + minMatch;
+                    if (mlen > sufficient_len || mlen >= ZSTD_OPT_NUM) {
+                        best_mlen = mlen; best_off = i; cur = 0; last_pos = 1;
+                        goto _storeSequence;
+                    }
+                    best_off = i - (ip == anchor);
+                    do {
+                        price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra);
+                        if (mlen > last_pos || price < opt[mlen].price)
+                            SET_PRICE(mlen, mlen, i, litlen, price);   /* note : macro modifies last_pos */
+                        mlen--;
+                    } while (mlen >= minMatch);
+        }   }   }
+
+        match_num = ZSTD_BtGetAllMatches_selectMLS(ctx, ip, iend, maxSearches, mls, matches, minMatch);
+
+        if (!last_pos && !match_num) { ip++; continue; }
+
+        if (match_num && (matches[match_num-1].len > sufficient_len || matches[match_num-1].len >= ZSTD_OPT_NUM)) {
+            best_mlen = matches[match_num-1].len;
+            best_off = matches[match_num-1].off;
+            cur = 0;
+            last_pos = 1;
+            goto _storeSequence;
+        }
+
+        /* set prices using matches at position = 0 */
+        best_mlen = (last_pos) ? last_pos : minMatch;
+        for (u = 0; u < match_num; u++) {
+            mlen = (u>0) ? matches[u-1].len+1 : best_mlen;
+            best_mlen = matches[u].len;
+            while (mlen <= best_mlen) {
+                price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off-1, mlen - MINMATCH, ultra);
+                if (mlen > last_pos || price < opt[mlen].price)
+                    SET_PRICE(mlen, mlen, matches[u].off, litlen, price);   /* note : macro modifies last_pos */
+                mlen++;
+        }   }
+
+        if (last_pos < minMatch) { ip++; continue; }
+
+        /* initialize opt[0] */
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; }
+        opt[0].mlen = 1;
+        opt[0].litlen = litlen;
+
+        /* check further positions */
+        for (cur = 1; cur <= last_pos; cur++) {
+           inr = ip + cur;
+
+           if (opt[cur-1].mlen == 1) {
+                litlen = opt[cur-1].litlen + 1;
+                if (cur > litlen) {
+                    price = opt[cur - litlen].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-litlen);
+                } else
+                    price = ZSTD_getLiteralPrice(seqStorePtr, litlen, anchor);
+           } else {
+                litlen = 1;
+                price = opt[cur - 1].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-1);
+           }
+
+           if (cur > last_pos || price <= opt[cur].price)
+                SET_PRICE(cur, 1, 0, litlen, price);
+
+           if (cur == last_pos) break;
+
+           if (inr > ilimit)  /* last match must start at a minimum distance of 8 from oend */
+               continue;
+
+           mlen = opt[cur].mlen;
+           if (opt[cur].off > ZSTD_REP_MOVE_OPT) {
+                opt[cur].rep[2] = opt[cur-mlen].rep[1];
+                opt[cur].rep[1] = opt[cur-mlen].rep[0];
+                opt[cur].rep[0] = opt[cur].off - ZSTD_REP_MOVE_OPT;
+           } else {
+                opt[cur].rep[2] = (opt[cur].off > 1) ? opt[cur-mlen].rep[1] : opt[cur-mlen].rep[2];
+                opt[cur].rep[1] = (opt[cur].off > 0) ? opt[cur-mlen].rep[0] : opt[cur-mlen].rep[1];
+                opt[cur].rep[0] = ((opt[cur].off==ZSTD_REP_MOVE_OPT) && (mlen != 1)) ? (opt[cur-mlen].rep[0] - 1) : (opt[cur-mlen].rep[opt[cur].off]);
+           }
+
+            best_mlen = minMatch;
+            {   U32 i, last_i = ZSTD_REP_CHECK + (mlen != 1);
+                for (i=(opt[cur].mlen != 1); i<last_i; i++) {  /* check rep */
+                    const S32 repCur = (i==ZSTD_REP_MOVE_OPT) ? (opt[cur].rep[0] - 1) : opt[cur].rep[i];
+                    if ( (repCur > 0) && (repCur < (S32)(inr-prefixStart))
+                       && (MEM_readMINMATCH(inr, minMatch) == MEM_readMINMATCH(inr - repCur, minMatch))) {
+                       mlen = (U32)ZSTD_count(inr+minMatch, inr+minMatch - repCur, iend) + minMatch;
+
+                       if (mlen > sufficient_len || cur + mlen >= ZSTD_OPT_NUM) {
+                            best_mlen = mlen; best_off = i; last_pos = cur + 1;
+                            goto _storeSequence;
+                       }
+
+                       best_off = i - (opt[cur].mlen != 1);
+                       if (mlen > best_mlen) best_mlen = mlen;
+
+                       do {
+                           if (opt[cur].mlen == 1) {
+                                litlen = opt[cur].litlen;
+                                if (cur > litlen) {
+                                    price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, inr-litlen, best_off, mlen - MINMATCH, ultra);
+                                } else
+                                    price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra);
+                            } else {
+                                litlen = 0;
+                                price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, best_off, mlen - MINMATCH, ultra);
+                            }
+
+                            if (cur + mlen > last_pos || price <= opt[cur + mlen].price)
+                                SET_PRICE(cur + mlen, mlen, i, litlen, price);
+                            mlen--;
+                        } while (mlen >= minMatch);
+            }   }   }
+
+            match_num = ZSTD_BtGetAllMatches_selectMLS(ctx, inr, iend, maxSearches, mls, matches, best_mlen);
+
+            if (match_num > 0 && (matches[match_num-1].len > sufficient_len || cur + matches[match_num-1].len >= ZSTD_OPT_NUM)) {
+                best_mlen = matches[match_num-1].len;
+                best_off = matches[match_num-1].off;
+                last_pos = cur + 1;
+                goto _storeSequence;
+            }
+
+            /* set prices using matches at position = cur */
+            for (u = 0; u < match_num; u++) {
+                mlen = (u>0) ? matches[u-1].len+1 : best_mlen;
+                best_mlen = matches[u].len;
+
+                while (mlen <= best_mlen) {
+                    if (opt[cur].mlen == 1) {
+                        litlen = opt[cur].litlen;
+                        if (cur > litlen)
+                            price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, ip+cur-litlen, matches[u].off-1, mlen - MINMATCH, ultra);
+                        else
+                            price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off-1, mlen - MINMATCH, ultra);
+                    } else {
+                        litlen = 0;
+                        price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, matches[u].off-1, mlen - MINMATCH, ultra);
+                    }
+
+                    if (cur + mlen > last_pos || (price < opt[cur + mlen].price))
+                        SET_PRICE(cur + mlen, mlen, matches[u].off, litlen, price);
+
+                    mlen++;
+        }   }   }
+
+        best_mlen = opt[last_pos].mlen;
+        best_off = opt[last_pos].off;
+        cur = last_pos - best_mlen;
+
+        /* store sequence */
+_storeSequence:   /* cur, last_pos, best_mlen, best_off have to be set */
+        opt[0].mlen = 1;
+
+        while (1) {
+            mlen = opt[cur].mlen;
+            offset = opt[cur].off;
+            opt[cur].mlen = best_mlen;
+            opt[cur].off = best_off;
+            best_mlen = mlen;
+            best_off = offset;
+            if (mlen > cur) break;
+            cur -= mlen;
+        }
+
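+        /* note : the traversal below has no side effect; it looks like a
+        *  leftover from debug instrumentation of the chosen path */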
+        for (u = 0; u <= last_pos;) {
+            u += opt[u].mlen;
+        }
+
+        for (cur=0; cur < last_pos; ) {
+            mlen = opt[cur].mlen;
+            if (mlen == 1) { ip++; cur++; continue; }
+            offset = opt[cur].off;
+            cur += mlen;
+            litLength = (U32)(ip - anchor);
+
+            if (offset > ZSTD_REP_MOVE_OPT) {
+                rep[2] = rep[1];
+                rep[1] = rep[0];
+                rep[0] = offset - ZSTD_REP_MOVE_OPT;
+                offset--;
+            } else {
+                if (offset != 0) {
+                    best_off = (offset==ZSTD_REP_MOVE_OPT) ? (rep[0] - 1) : (rep[offset]);
+                    if (offset != 1) rep[2] = rep[1];
+                    rep[1] = rep[0];
+                    rep[0] = best_off;
+                }
+                if (litLength==0) offset--;
+            }
+
+            ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen-MINMATCH);
+            ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen-MINMATCH);
+            anchor = ip = ip + mlen;
+    }    }   /* for (cur=0; cur < last_pos; ) */
+
+    /* Save reps for next block */
+    { int i; for (i=0; i<ZSTD_REP_NUM; i++) ctx->repToConfirm[i] = rep[i]; }
+
+    /* Last Literals */
+    {   size_t const lastLLSize = iend - anchor;
+        memcpy(seqStorePtr->lit, anchor, lastLLSize);
+        seqStorePtr->lit += lastLLSize;
+    }
+}
+
+
+FORCE_INLINE
+void ZSTD_compressBlock_opt_extDict_generic(ZSTD_CCtx* ctx,
+                                     const void* src, size_t srcSize, const int ultra)
+{
+    seqStore_t* seqStorePtr = &(ctx->seqStore);
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ctx->base;
+    const U32 lowestIndex = ctx->lowLimit;
+    const U32 dictLimit = ctx->dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictBase = ctx->dictBase;
+    const BYTE* const dictEnd  = dictBase + dictLimit;
+
+    const U32 maxSearches = 1U << ctx->params.cParams.searchLog;
+    const U32 sufficient_len = ctx->params.cParams.targetLength;
+    const U32 mls = ctx->params.cParams.searchLength;
+    const U32 minMatch = (ctx->params.cParams.searchLength == 3) ? 3 : 4;
+
+    ZSTD_optimal_t* opt = seqStorePtr->priceTable;
+    ZSTD_match_t* matches = seqStorePtr->matchTable;
+    const BYTE* inr;
+
+    /* init */
+    U32 offset, rep[ZSTD_REP_NUM];
+    { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) rep[i]=ctx->rep[i]; }
+
+    ctx->nextToUpdate3 = ctx->nextToUpdate;
+    ZSTD_rescaleFreqs(seqStorePtr, (const BYTE*)src, srcSize);
+    ip += (ip==prefixStart);
+
+    /* Match Loop */
+    while (ip < ilimit) {
+        U32 cur, match_num, last_pos, litlen, price;
+        U32 u, mlen, best_mlen, best_off, litLength;
+        U32 current = (U32)(ip-base);
+        memset(opt, 0, sizeof(ZSTD_optimal_t));
+        last_pos = 0;
+        opt[0].litlen = (U32)(ip - anchor);
+
+        /* check repCode */
+        {   U32 i, last_i = ZSTD_REP_CHECK + (ip==anchor);
+            for (i = (ip==anchor); i<last_i; i++) {
+                const S32 repCur = (i==ZSTD_REP_MOVE_OPT) ? (rep[0] - 1) : rep[i];
+                const U32 repIndex = (U32)(current - repCur);
+                const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                const BYTE* const repMatch = repBase + repIndex;
+                if ( (repCur > 0 && repCur <= (S32)current)
+                   && (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex>lowestIndex))  /* intentional overflow */
+                   && (MEM_readMINMATCH(ip, minMatch) == MEM_readMINMATCH(repMatch, minMatch)) ) {
+                    /* repcode detected we should take it */
+                    const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                    mlen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iend, repEnd, prefixStart) + minMatch;
+
+                    if (mlen > sufficient_len || mlen >= ZSTD_OPT_NUM) {
+                        best_mlen = mlen; best_off = i; cur = 0; last_pos = 1;
+                        goto _storeSequence;
+                    }
+
+                    best_off = i - (ip==anchor);
+                    litlen = opt[0].litlen;
+                    do {
+                        price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra);
+                        if (mlen > last_pos || price < opt[mlen].price)
+                            SET_PRICE(mlen, mlen, i, litlen, price);   /* note : macro modifies last_pos */
+                        mlen--;
+                    } while (mlen >= minMatch);
+        }   }   }
+
+        match_num = ZSTD_BtGetAllMatches_selectMLS_extDict(ctx, ip, iend, maxSearches, mls, matches, minMatch);  /* first search (depth 0) */
+
+        if (!last_pos && !match_num) { ip++; continue; }
+
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; }
+        opt[0].mlen = 1;
+
+        if (match_num && (matches[match_num-1].len > sufficient_len || matches[match_num-1].len >= ZSTD_OPT_NUM)) {
+            best_mlen = matches[match_num-1].len;
+            best_off = matches[match_num-1].off;
+            cur = 0;
+            last_pos = 1;
+            goto _storeSequence;
+        }
+
+        best_mlen = (last_pos) ? last_pos : minMatch;
+
+        /* set prices using matches at position = 0 */
+        for (u = 0; u < match_num; u++) {
+            mlen = (u>0) ? matches[u-1].len+1 : best_mlen;
+            best_mlen = matches[u].len;
+            litlen = opt[0].litlen;
+            while (mlen <= best_mlen) {
+                price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off-1, mlen - MINMATCH, ultra);
+                if (mlen > last_pos || price < opt[mlen].price)
+                    SET_PRICE(mlen, mlen, matches[u].off, litlen, price);
+                mlen++;
+        }   }
+
+        if (last_pos < minMatch) {
+            ip++; continue;
+        }
+
+        /* check further positions */
+        for (cur = 1; cur <= last_pos; cur++) {
+            inr = ip + cur;
+
+            if (opt[cur-1].mlen == 1) {
+                litlen = opt[cur-1].litlen + 1;
+                if (cur > litlen) {
+                    price = opt[cur - litlen].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-litlen);
+                } else
+                    price = ZSTD_getLiteralPrice(seqStorePtr, litlen, anchor);
+            } else {
+                litlen = 1;
+                price = opt[cur - 1].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr-1);
+            }
+
+            if (cur > last_pos || price <= opt[cur].price)
+                SET_PRICE(cur, 1, 0, litlen, price);
+
+            if (cur == last_pos) break;
+
+            if (inr > ilimit)  /* last match must start at a minimum distance of 8 from oend */
+                continue;
+
+            mlen = opt[cur].mlen;
+            if (opt[cur].off > ZSTD_REP_MOVE_OPT) {
+                opt[cur].rep[2] = opt[cur-mlen].rep[1];
+                opt[cur].rep[1] = opt[cur-mlen].rep[0];
+                opt[cur].rep[0] = opt[cur].off - ZSTD_REP_MOVE_OPT;
+            } else {
+                opt[cur].rep[2] = (opt[cur].off > 1) ? opt[cur-mlen].rep[1] : opt[cur-mlen].rep[2];
+                opt[cur].rep[1] = (opt[cur].off > 0) ? opt[cur-mlen].rep[0] : opt[cur-mlen].rep[1];
+                opt[cur].rep[0] = ((opt[cur].off==ZSTD_REP_MOVE_OPT) && (mlen != 1)) ? (opt[cur-mlen].rep[0] - 1) : (opt[cur-mlen].rep[opt[cur].off]);
+            }
+
+            best_mlen = minMatch;
+            {   U32 i, last_i = ZSTD_REP_CHECK + (mlen != 1);
+                for (i = (mlen != 1); i<last_i; i++) {
+                    const S32 repCur = (i==ZSTD_REP_MOVE_OPT) ? (opt[cur].rep[0] - 1) : opt[cur].rep[i];
+                    const U32 repIndex = (U32)(current+cur - repCur);
+                    const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                    const BYTE* const repMatch = repBase + repIndex;
+                    if ( (repCur > 0 && repCur <= (S32)(current+cur))
+                      && (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex>lowestIndex))  /* intentional overflow */
+                      && (MEM_readMINMATCH(inr, minMatch) == MEM_readMINMATCH(repMatch, minMatch)) ) {
+                        /* repcode detected */
+                        const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                        mlen = (U32)ZSTD_count_2segments(inr+minMatch, repMatch+minMatch, iend, repEnd, prefixStart) + minMatch;
+
+                        if (mlen > sufficient_len || cur + mlen >= ZSTD_OPT_NUM) {
+                            best_mlen = mlen; best_off = i; last_pos = cur + 1;
+                            goto _storeSequence;
+                        }
+
+                        best_off = i - (opt[cur].mlen != 1);
+                        if (mlen > best_mlen) best_mlen = mlen;
+
+                        do {
+                            if (opt[cur].mlen == 1) {
+                                litlen = opt[cur].litlen;
+                                if (cur > litlen) {
+                                    price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, inr-litlen, best_off, mlen - MINMATCH, ultra);
+                                } else
+                                    price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra);
+                            } else {
+                                litlen = 0;
+                                price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, best_off, mlen - MINMATCH, ultra);
+                            }
+
+                            if (cur + mlen > last_pos || price <= opt[cur + mlen].price)
+                                SET_PRICE(cur + mlen, mlen, i, litlen, price);
+                            mlen--;
+                        } while (mlen >= minMatch);
+            }   }   }
+
+            match_num = ZSTD_BtGetAllMatches_selectMLS_extDict(ctx, inr, iend, maxSearches, mls, matches, minMatch);
+
+            if (match_num > 0 && (matches[match_num-1].len > sufficient_len || cur + matches[match_num-1].len >= ZSTD_OPT_NUM)) {
+                best_mlen = matches[match_num-1].len;
+                best_off = matches[match_num-1].off;
+                last_pos = cur + 1;
+                goto _storeSequence;
+            }
+
+            /* set prices using matches at position = cur */
+            for (u = 0; u < match_num; u++) {
+                mlen = (u>0) ? matches[u-1].len+1 : best_mlen;
+                best_mlen = matches[u].len;
+
+                while (mlen <= best_mlen) {
+                    if (opt[cur].mlen == 1) {
+                        litlen = opt[cur].litlen;
+                        if (cur > litlen)
+                            price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, ip+cur-litlen, matches[u].off-1, mlen - MINMATCH, ultra);
+                        else
+                            price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off-1, mlen - MINMATCH, ultra);
+                    } else {
+                        litlen = 0;
+                        price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, matches[u].off-1, mlen - MINMATCH, ultra);
+                    }
+
+                    if (cur + mlen > last_pos || (price < opt[cur + mlen].price))
+                        SET_PRICE(cur + mlen, mlen, matches[u].off, litlen, price);
+
+                    mlen++;
+        }   }   }   /* for (cur = 1; cur <= last_pos; cur++) */
+
+        best_mlen = opt[last_pos].mlen;
+        best_off = opt[last_pos].off;
+        cur = last_pos - best_mlen;
+
+        /* store sequence */
+_storeSequence:   /* cur, last_pos, best_mlen, best_off have to be set */
+        opt[0].mlen = 1;
+
+        while (1) {
+            mlen = opt[cur].mlen;
+            offset = opt[cur].off;
+            opt[cur].mlen = best_mlen;
+            opt[cur].off = best_off;
+            best_mlen = mlen;
+            best_off = offset;
+            if (mlen > cur) break;
+            cur -= mlen;
+        }
+
+        for (u = 0; u <= last_pos; ) {
+            u += opt[u].mlen;
+        }
+
+        for (cur=0; cur < last_pos; ) {
+            mlen = opt[cur].mlen;
+            if (mlen == 1) { ip++; cur++; continue; }
+            offset = opt[cur].off;
+            cur += mlen;
+            litLength = (U32)(ip - anchor);
+
+            if (offset > ZSTD_REP_MOVE_OPT) {
+                rep[2] = rep[1];
+                rep[1] = rep[0];
+                rep[0] = offset - ZSTD_REP_MOVE_OPT;
+                offset--;
+            } else {
+                if (offset != 0) {
+                    best_off = (offset==ZSTD_REP_MOVE_OPT) ? (rep[0] - 1) : (rep[offset]);
+                    if (offset != 1) rep[2] = rep[1];
+                    rep[1] = rep[0];
+                    rep[0] = best_off;
+                }
+
+                if (litLength==0) offset--;
+            }
+
+            ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen-MINMATCH);
+            ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen-MINMATCH);
+            anchor = ip = ip + mlen;
+    }    }   /* for (cur=0; cur < last_pos; ) */
+
+    /* Save reps for next block */
+    { int i; for (i=0; i<ZSTD_REP_NUM; i++) ctx->repToConfirm[i] = rep[i]; }
+
+    /* Last Literals */
+    {   size_t const lastLLSize = iend - anchor;
+        memcpy(seqStorePtr->lit, anchor, lastLLSize);
+        seqStorePtr->lit += lastLLSize;
+    }
+}
+
+#endif  /* ZSTD_OPT_H_91842398743 */
diff --git a/zstd/lib/compress/zstdmt_compress.c b/zstd/lib/compress/zstdmt_compress.c
new file mode 100644
index 0000000..45514a8
--- /dev/null
+++ b/zstd/lib/compress/zstdmt_compress.c
@@ -0,0 +1,739 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+
+/* ======   Tuning parameters   ====== */
+#define ZSTDMT_NBTHREADS_MAX 128
+
+
+/* ======   Compiler specifics   ====== */
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#endif
+
+
+/* ======   Dependencies   ====== */
+#include <stdlib.h>   /* malloc */
+#include <string.h>   /* memcpy */
+#include "pool.h"     /* threadpool */
+#include "threading.h"  /* mutex */
+#include "zstd_internal.h"   /* MIN, ERROR, ZSTD_*, ZSTD_highbit32 */
+#include "zstdmt_compress.h"
+
+
+/* ======   Debug   ====== */
+#if 0
+
+#  include <stdio.h>
+#  include <unistd.h>
+#  include <sys/times.h>
+   static unsigned g_debugLevel = 3;
+#  define DEBUGLOGRAW(l, ...) if (l<=g_debugLevel) { fprintf(stderr, __VA_ARGS__); }
+#  define DEBUGLOG(l, ...) if (l<=g_debugLevel) { fprintf(stderr, __FILE__ ": "); fprintf(stderr, __VA_ARGS__); fprintf(stderr, " \n"); }
+
+#  define DEBUG_PRINTHEX(l,p,n) { \
+    unsigned debug_u;                   \
+    for (debug_u=0; debug_u<(n); debug_u++)           \
+        DEBUGLOGRAW(l, "%02X ", ((const unsigned char*)(p))[debug_u]); \
+    DEBUGLOGRAW(l, " \n");       \
+}
+
+static unsigned long long GetCurrentClockTimeMicroseconds()
+{
+   static clock_t _ticksPerSecond = 0;
+   if (_ticksPerSecond <= 0) _ticksPerSecond = sysconf(_SC_CLK_TCK);
+
+   struct tms junk; clock_t newTicks = (clock_t) times(&junk);
+   return ((((unsigned long long)newTicks)*(1000000))/_ticksPerSecond);
+}
+
+#define MUTEX_WAIT_TIME_DLEVEL 5
+#define PTHREAD_MUTEX_LOCK(mutex) \
+if (g_debugLevel>=MUTEX_WAIT_TIME_DLEVEL) { \
+   unsigned long long beforeTime = GetCurrentClockTimeMicroseconds(); \
+   pthread_mutex_lock(mutex); \
+   unsigned long long afterTime = GetCurrentClockTimeMicroseconds(); \
+   unsigned long long elapsedTime = (afterTime-beforeTime); \
+   if (elapsedTime > 1000) {  /* or whatever threshold you like; I'm using 1 millisecond here */ \
+      DEBUGLOG(MUTEX_WAIT_TIME_DLEVEL, "Thread took %llu microseconds to acquire mutex %s \n", \
+               elapsedTime, #mutex); \
+  } \
+} else pthread_mutex_lock(mutex);
+
+#else
+
+#  define DEBUGLOG(l, ...)      {}    /* disabled */
+#  define PTHREAD_MUTEX_LOCK(m) pthread_mutex_lock(m)
+#  define DEBUG_PRINTHEX(l,p,n) {}
+
+#endif
+
+
+/* =====   Buffer Pool   ===== */
+
+typedef struct buffer_s {
+    void* start;
+    size_t size;
+} buffer_t;
+
+static const buffer_t g_nullBuffer = { NULL, 0 };
+
+typedef struct ZSTDMT_bufferPool_s {
+    unsigned totalBuffers;
+    unsigned nbBuffers;
+    buffer_t bTable[1];   /* variable size */
+} ZSTDMT_bufferPool;
+
+static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned nbThreads)
+{
+    unsigned const maxNbBuffers = 2*nbThreads + 2;
+    ZSTDMT_bufferPool* const bufPool = (ZSTDMT_bufferPool*)calloc(1, sizeof(ZSTDMT_bufferPool) + (maxNbBuffers-1) * sizeof(buffer_t));
+    if (bufPool==NULL) return NULL;
+    bufPool->totalBuffers = maxNbBuffers;
+    bufPool->nbBuffers = 0;
+    return bufPool;
+}
+
+static void ZSTDMT_freeBufferPool(ZSTDMT_bufferPool* bufPool)
+{
+    unsigned u;
+    if (!bufPool) return;   /* compatibility with free on NULL */
+    for (u=0; u<bufPool->totalBuffers; u++)
+        free(bufPool->bTable[u].start);
+    free(bufPool);
+}
+
+/* assumption : invocation from main thread only ! */
+static buffer_t ZSTDMT_getBuffer(ZSTDMT_bufferPool* pool, size_t bSize)
+{
+    if (pool->nbBuffers) {   /* try to use an existing buffer */
+        buffer_t const buf = pool->bTable[--(pool->nbBuffers)];
+        size_t const availBufferSize = buf.size;
+        if ((availBufferSize >= bSize) & (availBufferSize <= 10*bSize))   /* large enough, but not too much */
+            return buf;
+        free(buf.start);   /* size conditions not respected : scratch this buffer and create a new one */
+    }
+    /* create new buffer */
+    {   buffer_t buffer;
+        void* const start = malloc(bSize);
+        if (start==NULL) bSize = 0;
+        buffer.start = start;   /* note : start can be NULL if malloc fails ! */
+        buffer.size = bSize;
+        return buffer;
+    }
+}
+
+/* store buffer for later re-use, up to pool capacity */
+static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* pool, buffer_t buf)
+{
+    if (buf.start == NULL) return;   /* release on NULL */
+    if (pool->nbBuffers < pool->totalBuffers) {
+        pool->bTable[pool->nbBuffers++] = buf;   /* store for later re-use */
+        return;
+    }
+    /* Reached bufferPool capacity (should not happen) */
+    free(buf.start);
+}
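+
+/* Illustration : a buffer pool round-trip, kept under #if 0; it relies only on
+*  the getBuffer/releaseBuffer pair above and the MB macro from zstd_internal.h. */
+#if 0
+static void ZSTDMT_examplePoolRoundTrip(ZSTDMT_bufferPool* pool)
+{
+    buffer_t b = ZSTDMT_getBuffer(pool, 1 MB);   /* reused if a suitable buffer is cached, malloc'ed otherwise */
+    if (b.start != NULL) {
+        /* ... fill and consume b.start, up to b.size bytes ... */
+    }
+    ZSTDMT_releaseBuffer(pool, b);   /* cached for re-use, freed if the pool is already full */
+}
+#endif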
+
+
+/* =====   CCtx Pool   ===== */
+
+typedef struct {
+    unsigned totalCCtx;
+    unsigned availCCtx;
+    ZSTD_CCtx* cctx[1];   /* variable size */
+} ZSTDMT_CCtxPool;
+
+/* assumption : CCtxPool invocation only from main thread */
+
+/* note : all CCtx borrowed from the pool should be released back to the pool _before_ freeing the pool */
+static void ZSTDMT_freeCCtxPool(ZSTDMT_CCtxPool* pool)
+{
+    unsigned u;
+    for (u=0; u<pool->totalCCtx; u++)
+        ZSTD_freeCCtx(pool->cctx[u]);  /* note : compatible with free on NULL */
+    free(pool);
+}
+
+/* ZSTDMT_createCCtxPool() :
+ * implies nbThreads >= 1, checked by caller ZSTDMT_createCCtx() */
+static ZSTDMT_CCtxPool* ZSTDMT_createCCtxPool(unsigned nbThreads)
+{
+    ZSTDMT_CCtxPool* const cctxPool = (ZSTDMT_CCtxPool*) calloc(1, sizeof(ZSTDMT_CCtxPool) + (nbThreads-1)*sizeof(ZSTD_CCtx*));
+    if (!cctxPool) return NULL;
+    cctxPool->totalCCtx = nbThreads;
+    cctxPool->availCCtx = 1;   /* at least one cctx for single-thread mode */
+    cctxPool->cctx[0] = ZSTD_createCCtx();
+    if (!cctxPool->cctx[0]) { ZSTDMT_freeCCtxPool(cctxPool); return NULL; }
+    DEBUGLOG(1, "cctxPool created, with %u threads", nbThreads);
+    return cctxPool;
+}
+
+static ZSTD_CCtx* ZSTDMT_getCCtx(ZSTDMT_CCtxPool* pool)
+{
+    if (pool->availCCtx) {
+        pool->availCCtx--;
+        return pool->cctx[pool->availCCtx];
+    }
+    return ZSTD_createCCtx();   /* note : can be NULL, when creation fails ! */
+}
+
+static void ZSTDMT_releaseCCtx(ZSTDMT_CCtxPool* pool, ZSTD_CCtx* cctx)
+{
+    if (cctx==NULL) return;   /* compatibility with release on NULL */
+    if (pool->availCCtx < pool->totalCCtx)
+        pool->cctx[pool->availCCtx++] = cctx;
+    else
+        /* pool overflow : should not happen, since totalCCtx==nbThreads */
+        ZSTD_freeCCtx(cctx);
+}
+
+
+/* =====   Thread worker   ===== */
+
+typedef struct {
+    buffer_t buffer;
+    size_t filled;
+} inBuff_t;
+
+typedef struct {
+    ZSTD_CCtx* cctx;
+    buffer_t src;
+    const void* srcStart;
+    size_t   srcSize;
+    size_t   dictSize;
+    buffer_t dstBuff;
+    size_t   cSize;
+    size_t   dstFlushed;
+    unsigned firstChunk;
+    unsigned lastChunk;
+    unsigned jobCompleted;
+    unsigned jobScanned;
+    pthread_mutex_t* jobCompleted_mutex;
+    pthread_cond_t* jobCompleted_cond;
+    ZSTD_parameters params;
+    ZSTD_CDict* cdict;
+    unsigned long long fullFrameSize;
+} ZSTDMT_jobDescription;
+
+/* ZSTDMT_compressChunk() : POOL_function type */
+void ZSTDMT_compressChunk(void* jobDescription)
+{
+    ZSTDMT_jobDescription* const job = (ZSTDMT_jobDescription*)jobDescription;
+    const void* const src = (const char*)job->srcStart + job->dictSize;
+    buffer_t const dstBuff = job->dstBuff;
+    DEBUGLOG(3, "job (first:%u) (last:%u) : dictSize %u, srcSize %u", job->firstChunk, job->lastChunk, (U32)job->dictSize, (U32)job->srcSize);
+    if (job->cdict) {  /* should only happen for first segment */
+        size_t const initError = ZSTD_compressBegin_usingCDict(job->cctx, job->cdict, job->fullFrameSize);
+        DEBUGLOG(3, "using CDict ");   /* cdict is necessarily non-NULL in this branch */
+        if (ZSTD_isError(initError)) { job->cSize = initError; goto _endJob; }
+    } else {  /* srcStart points at reloaded section */
+        size_t const dictModeError = ZSTD_setCCtxParameter(job->cctx, ZSTD_p_forceRawDict, 1);  /* Force loading dictionary in "content-only" mode (no header analysis) */
+        size_t const initError = ZSTD_compressBegin_advanced(job->cctx, job->srcStart, job->dictSize, job->params, 0);
+        if (ZSTD_isError(initError) || ZSTD_isError(dictModeError)) { job->cSize = initError; goto _endJob; }
+        ZSTD_setCCtxParameter(job->cctx, ZSTD_p_forceWindow, 1);
+    }
+    if (!job->firstChunk) {  /* flush and overwrite frame header when it's not first segment */
+        size_t const hSize = ZSTD_compressContinue(job->cctx, dstBuff.start, dstBuff.size, src, 0);
+        if (ZSTD_isError(hSize)) { job->cSize = hSize; goto _endJob; }
+        ZSTD_invalidateRepCodes(job->cctx);
+    }
+
+    DEBUGLOG(4, "Compressing : ");
+    DEBUG_PRINTHEX(4, job->srcStart, 12);
+    job->cSize = (job->lastChunk) ?
+                 ZSTD_compressEnd     (job->cctx, dstBuff.start, dstBuff.size, src, job->srcSize) :
+                 ZSTD_compressContinue(job->cctx, dstBuff.start, dstBuff.size, src, job->srcSize);
+    DEBUGLOG(3, "compressed %u bytes into %u bytes   (first:%u) (last:%u)", (unsigned)job->srcSize, (unsigned)job->cSize, job->firstChunk, job->lastChunk);
+
+_endJob:
+    PTHREAD_MUTEX_LOCK(job->jobCompleted_mutex);
+    job->jobCompleted = 1;
+    job->jobScanned = 0;
+    pthread_cond_signal(job->jobCompleted_cond);
+    pthread_mutex_unlock(job->jobCompleted_mutex);
+}
+
+
+/* ------------------------------------------ */
+/* =====   Multi-threaded compression   ===== */
+/* ------------------------------------------ */
+
+struct ZSTDMT_CCtx_s {
+    POOL_ctx* factory;
+    ZSTDMT_bufferPool* buffPool;
+    ZSTDMT_CCtxPool* cctxPool;
+    pthread_mutex_t jobCompleted_mutex;
+    pthread_cond_t jobCompleted_cond;
+    size_t targetSectionSize;
+    size_t marginSize;
+    size_t inBuffSize;
+    size_t dictSize;
+    size_t targetDictSize;
+    inBuff_t inBuff;
+    ZSTD_parameters params;
+    XXH64_state_t xxhState;
+    unsigned nbThreads;
+    unsigned jobIDMask;
+    unsigned doneJobID;
+    unsigned nextJobID;
+    unsigned frameEnded;
+    unsigned allJobsCompleted;
+    unsigned overlapRLog;
+    unsigned long long frameContentSize;
+    size_t sectionSize;
+    ZSTD_CDict* cdict;
+    ZSTD_CStream* cstream;
+    ZSTDMT_jobDescription jobs[1];   /* variable size (must lie at the end) */
+};
+
+ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbThreads)
+{
+    ZSTDMT_CCtx* cctx;
+    U32 const minNbJobs = nbThreads + 2;
+    U32 const nbJobsLog2 = ZSTD_highbit32(minNbJobs) + 1;
+    U32 const nbJobs = 1 << nbJobsLog2;
+    DEBUGLOG(5, "nbThreads : %u  ; minNbJobs : %u ;  nbJobsLog2 : %u ;  nbJobs : %u  \n",
+            nbThreads, minNbJobs, nbJobsLog2, nbJobs);
+    if ((nbThreads < 1) | (nbThreads > ZSTDMT_NBTHREADS_MAX)) return NULL;
+    cctx = (ZSTDMT_CCtx*) calloc(1, sizeof(ZSTDMT_CCtx) + nbJobs*sizeof(ZSTDMT_jobDescription));
+    if (!cctx) return NULL;
+    cctx->nbThreads = nbThreads;
+    cctx->jobIDMask = nbJobs - 1;
+    cctx->allJobsCompleted = 1;
+    cctx->sectionSize = 0;
+    cctx->overlapRLog = 3;
+    cctx->factory = POOL_create(nbThreads, 1);
+    cctx->buffPool = ZSTDMT_createBufferPool(nbThreads);
+    cctx->cctxPool = ZSTDMT_createCCtxPool(nbThreads);
+    if (!cctx->factory | !cctx->buffPool | !cctx->cctxPool) {  /* one object was not created */
+        ZSTDMT_freeCCtx(cctx);
+        return NULL;
+    }
+    if (nbThreads==1) {
+        cctx->cstream = ZSTD_createCStream();
+        if (!cctx->cstream) {
+            ZSTDMT_freeCCtx(cctx); return NULL;
+    }   }
+    pthread_mutex_init(&cctx->jobCompleted_mutex, NULL);   /* Todo : check init function return */
+    pthread_cond_init(&cctx->jobCompleted_cond, NULL);
+    DEBUGLOG(4, "mt_cctx created, for %u threads \n", nbThreads);
+    return cctx;
+}
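+
+/* Illustration : lifecycle of a multi-threaded context, kept under #if 0;
+*  dst/src buffers are assumed to be provided by the caller. */
+#if 0
+static size_t ZSTDMT_exampleRoundTrip(void* dst, size_t dstCapacity,
+                                      const void* src, size_t srcSize)
+{
+    ZSTDMT_CCtx* const mtctx = ZSTDMT_createCCtx(4);   /* 4 worker threads */
+    size_t cSize;
+    if (mtctx == NULL) return ERROR(memory_allocation);
+    cSize = ZSTDMT_compressCCtx(mtctx, dst, dstCapacity, src, srcSize, 3 /* level */);
+    ZSTDMT_freeCCtx(mtctx);
+    return cSize;
+}
+#endif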
+
+/* ZSTDMT_releaseAllJobResources() :
+ * Ensure all workers are killed first. */
+static void ZSTDMT_releaseAllJobResources(ZSTDMT_CCtx* mtctx)
+{
+    unsigned jobID;
+    for (jobID=0; jobID <= mtctx->jobIDMask; jobID++) {
+        ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->jobs[jobID].dstBuff);
+        mtctx->jobs[jobID].dstBuff = g_nullBuffer;
+        ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->jobs[jobID].src);
+        mtctx->jobs[jobID].src = g_nullBuffer;
+        ZSTDMT_releaseCCtx(mtctx->cctxPool, mtctx->jobs[jobID].cctx);
+        mtctx->jobs[jobID].cctx = NULL;
+    }
+    memset(mtctx->jobs, 0, (mtctx->jobIDMask+1)*sizeof(ZSTDMT_jobDescription));
+    ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->inBuff.buffer);
+    mtctx->inBuff.buffer = g_nullBuffer;
+    mtctx->allJobsCompleted = 1;
+}
+
+size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx)
+{
+    if (mtctx==NULL) return 0;   /* compatible with free on NULL */
+    POOL_free(mtctx->factory);
+    if (!mtctx->allJobsCompleted) ZSTDMT_releaseAllJobResources(mtctx); /* stop workers first */
+    ZSTDMT_freeBufferPool(mtctx->buffPool);  /* release job resources into pools first */
+    ZSTDMT_freeCCtxPool(mtctx->cctxPool);
+    ZSTD_freeCDict(mtctx->cdict);
+    ZSTD_freeCStream(mtctx->cstream);
+    pthread_mutex_destroy(&mtctx->jobCompleted_mutex);
+    pthread_cond_destroy(&mtctx->jobCompleted_cond);
+    free(mtctx);
+    return 0;
+}
+
+size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned value)
+{
+    switch(parameter)
+    {
+    case ZSTDMT_p_sectionSize :
+        mtctx->sectionSize = value;
+        return 0;
+    case ZSTDMT_p_overlapSectionLog :
+        DEBUGLOG(4, "ZSTDMT_p_overlapSectionLog : %u", value);
+        mtctx->overlapRLog = (value >= 9) ? 0 : 9 - value;
+        return 0;
+    default :
+        return ERROR(compressionParameter_unsupported);
+    }
+}
+
+
+/* ------------------------------------------ */
+/* =====   Multi-threaded compression   ===== */
+/* ------------------------------------------ */
+
+size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
+                           void* dst, size_t dstCapacity,
+                     const void* src, size_t srcSize,
+                           int compressionLevel)
+{
+    ZSTD_parameters params = ZSTD_getParams(compressionLevel, srcSize, 0);
+    size_t const chunkTargetSize = (size_t)1 << (params.cParams.windowLog + 2);
+    unsigned const nbChunksMax = (unsigned)(srcSize / chunkTargetSize) + (srcSize < chunkTargetSize) /* min 1 */;
+    unsigned nbChunks = MIN(nbChunksMax, mtctx->nbThreads);
+    size_t const proposedChunkSize = (srcSize + (nbChunks-1)) / nbChunks;
+    size_t const avgChunkSize = ((proposedChunkSize & 0x1FFFF) < 0xFFFF) ? proposedChunkSize + 0xFFFF : proposedChunkSize;   /* avoid too small last block */
+    size_t remainingSrcSize = srcSize;
+    const char* const srcStart = (const char*)src;
+    size_t frameStartPos = 0;
+
+    DEBUGLOG(3, "windowLog : %2u => chunkTargetSize : %u bytes  ", params.cParams.windowLog, (U32)chunkTargetSize);
+    DEBUGLOG(2, "nbChunks  : %2u   (chunkSize : %u bytes)   ", nbChunks, (U32)avgChunkSize);
+    params.fParams.contentSizeFlag = 1;
+
+    if (nbChunks==1) {   /* fallback to single-thread mode */
+        ZSTD_CCtx* const cctx = mtctx->cctxPool->cctx[0];
+        return ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, compressionLevel);
+    }
+
+    {   unsigned u;
+        for (u=0; u<nbChunks; u++) {
+            size_t const chunkSize = MIN(remainingSrcSize, avgChunkSize);
+            size_t const dstBufferCapacity = u ? ZSTD_compressBound(chunkSize) : dstCapacity;
+            buffer_t const dstAsBuffer = { dst, dstCapacity };
+            buffer_t const dstBuffer = u ? ZSTDMT_getBuffer(mtctx->buffPool, dstBufferCapacity) : dstAsBuffer;
+            ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(mtctx->cctxPool);
+
+            if ((cctx==NULL) || (dstBuffer.start==NULL)) {
+                mtctx->jobs[u].cSize = ERROR(memory_allocation);   /* job result */
+                mtctx->jobs[u].jobCompleted = 1;
+                nbChunks = u+1;
+                break;   /* let's wait for previous jobs to complete, but don't start new ones */
+            }
+
+            mtctx->jobs[u].srcStart = srcStart + frameStartPos;
+            mtctx->jobs[u].srcSize = chunkSize;
+            mtctx->jobs[u].fullFrameSize = srcSize;
+            mtctx->jobs[u].params = params;
+            mtctx->jobs[u].dstBuff = dstBuffer;
+            mtctx->jobs[u].cctx = cctx;
+            mtctx->jobs[u].firstChunk = (u==0);
+            mtctx->jobs[u].lastChunk = (u==nbChunks-1);
+            mtctx->jobs[u].jobCompleted = 0;
+            mtctx->jobs[u].jobCompleted_mutex = &mtctx->jobCompleted_mutex;
+            mtctx->jobs[u].jobCompleted_cond = &mtctx->jobCompleted_cond;
+
+            DEBUGLOG(3, "posting job %u   (%u bytes)", u, (U32)chunkSize);
+            DEBUG_PRINTHEX(3, mtctx->jobs[u].srcStart, 12);
+            POOL_add(mtctx->factory, ZSTDMT_compressChunk, &mtctx->jobs[u]);
+
+            frameStartPos += chunkSize;
+            remainingSrcSize -= chunkSize;
+    }   }
+    /* note : since nbChunks <= nbThreads, all jobs should be running immediately in parallel */
+
+    {   unsigned chunkID;
+        size_t error = 0, dstPos = 0;
+        for (chunkID=0; chunkID<nbChunks; chunkID++) {
+            DEBUGLOG(3, "waiting for chunk %u ", chunkID);
+            PTHREAD_MUTEX_LOCK(&mtctx->jobCompleted_mutex);
+            while (mtctx->jobs[chunkID].jobCompleted==0) {
+                DEBUGLOG(4, "waiting for jobCompleted signal from chunk %u", chunkID);
+                pthread_cond_wait(&mtctx->jobCompleted_cond, &mtctx->jobCompleted_mutex);
+            }
+            pthread_mutex_unlock(&mtctx->jobCompleted_mutex);
+            DEBUGLOG(3, "ready to write chunk %u ", chunkID);
+
+            ZSTDMT_releaseCCtx(mtctx->cctxPool, mtctx->jobs[chunkID].cctx);
+            mtctx->jobs[chunkID].cctx = NULL;
+            mtctx->jobs[chunkID].srcStart = NULL;
+            {   size_t const cSize = mtctx->jobs[chunkID].cSize;
+                if (ZSTD_isError(cSize)) error = cSize;
+                if ((!error) && (dstPos + cSize > dstCapacity)) error = ERROR(dstSize_tooSmall);
+                if (chunkID) {   /* note : chunk 0 is already written directly into dst */
+                    if (!error) memcpy((char*)dst + dstPos, mtctx->jobs[chunkID].dstBuff.start, cSize);
+                    ZSTDMT_releaseBuffer(mtctx->buffPool, mtctx->jobs[chunkID].dstBuff);
+                    mtctx->jobs[chunkID].dstBuff = g_nullBuffer;
+                }
+                dstPos += cSize;
+            }
+        }
+        if (!error) DEBUGLOG(3, "compressed size : %u  ", (U32)dstPos);
+        return error ? error : dstPos;
+    }
+}
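+
+/* Illustrative sketch (not part of the library) : a minimal caller of the
+ * one-pass API above. The wrapper name, thread count and compression level
+ * are arbitrary assumptions; error handling is reduced to returning the
+ * error code. Kept under `#if 0` so it does not affect compilation. */
+#if 0
+static size_t example_compress_once(void* dst, size_t dstCapacity,
+                              const void* src, size_t srcSize)
+{
+    ZSTDMT_CCtx* const mtctx = ZSTDMT_createCCtx(4);   /* 4 worker threads */
+    size_t cSize;
+    if (mtctx == NULL) return ERROR(memory_allocation);
+    cSize = ZSTDMT_compressCCtx(mtctx, dst, dstCapacity, src, srcSize, 3);
+    ZSTDMT_freeCCtx(mtctx);
+    return cSize;   /* compressed size, or an error code (check with ZSTD_isError()) */
+}
+#endif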
+
+
+/* ====================================== */
+/* =======      Streaming API     ======= */
+/* ====================================== */
+
+static void ZSTDMT_waitForAllJobsCompleted(ZSTDMT_CCtx* zcs)
+{
+    while (zcs->doneJobID < zcs->nextJobID) {
+        unsigned const jobID = zcs->doneJobID & zcs->jobIDMask;
+        PTHREAD_MUTEX_LOCK(&zcs->jobCompleted_mutex);
+        while (zcs->jobs[jobID].jobCompleted==0) {
+            DEBUGLOG(4, "waiting for jobCompleted signal from chunk %u", zcs->doneJobID);   /* we want to block when waiting for data to flush */
+            pthread_cond_wait(&zcs->jobCompleted_cond, &zcs->jobCompleted_mutex);
+        }
+        pthread_mutex_unlock(&zcs->jobCompleted_mutex);
+        zcs->doneJobID++;
+    }
+}
+
+
+static size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs,
+                                    const void* dict, size_t dictSize, unsigned updateDict,
+                                    ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+    ZSTD_customMem const cmem = { NULL, NULL, NULL };
+    DEBUGLOG(3, "Started new compression, with windowLog : %u", params.cParams.windowLog);
+    if (zcs->nbThreads==1) return ZSTD_initCStream_advanced(zcs->cstream, dict, dictSize, params, pledgedSrcSize);
+    if (zcs->allJobsCompleted == 0) {   /* previous job not correctly finished */
+        ZSTDMT_waitForAllJobsCompleted(zcs);
+        ZSTDMT_releaseAllJobResources(zcs);
+        zcs->allJobsCompleted = 1;
+    }
+    zcs->params = params;
+    if (updateDict) {
+        ZSTD_freeCDict(zcs->cdict); zcs->cdict = NULL;
+        if (dict && dictSize) {
+            zcs->cdict = ZSTD_createCDict_advanced(dict, dictSize, 0, params, cmem);
+            if (zcs->cdict == NULL) return ERROR(memory_allocation);
+    }   }
+    zcs->frameContentSize = pledgedSrcSize;
+    zcs->targetDictSize = (zcs->overlapRLog>=9) ? 0 : (size_t)1 << (zcs->params.cParams.windowLog - zcs->overlapRLog);
+    DEBUGLOG(4, "overlapRLog : %u ", zcs->overlapRLog);
+    DEBUGLOG(3, "overlap Size : %u KB", (U32)(zcs->targetDictSize>>10));
+    zcs->targetSectionSize = zcs->sectionSize ? zcs->sectionSize : (size_t)1 << (zcs->params.cParams.windowLog + 2);
+    zcs->targetSectionSize = MAX(ZSTDMT_SECTION_SIZE_MIN, zcs->targetSectionSize);
+    zcs->targetSectionSize = MAX(zcs->targetDictSize, zcs->targetSectionSize);
+    DEBUGLOG(3, "Section Size : %u KB", (U32)(zcs->targetSectionSize>>10));
+    zcs->marginSize = zcs->targetSectionSize >> 2;
+    zcs->inBuffSize = zcs->targetDictSize + zcs->targetSectionSize + zcs->marginSize;
+    zcs->inBuff.buffer = ZSTDMT_getBuffer(zcs->buffPool, zcs->inBuffSize);
+    if (zcs->inBuff.buffer.start == NULL) return ERROR(memory_allocation);
+    zcs->inBuff.filled = 0;
+    zcs->dictSize = 0;
+    zcs->doneJobID = 0;
+    zcs->nextJobID = 0;
+    zcs->frameEnded = 0;
+    zcs->allJobsCompleted = 0;
+    if (params.fParams.checksumFlag) XXH64_reset(&zcs->xxhState, 0);
+    return 0;
+}
+
+size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* zcs,
+                                const void* dict, size_t dictSize,
+                                ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+    return ZSTDMT_initCStream_internal(zcs, dict, dictSize, 1, params, pledgedSrcSize);
+}
+
+/* ZSTDMT_resetCStream() :
+ * pledgedSrcSize is optional and can be zero == unknown */
+size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* zcs, unsigned long long pledgedSrcSize)
+{
+    if (zcs->nbThreads==1) return ZSTD_resetCStream(zcs->cstream, pledgedSrcSize);
+    return ZSTDMT_initCStream_internal(zcs, NULL, 0, 0, zcs->params, pledgedSrcSize);
+}
+
+size_t ZSTDMT_initCStream(ZSTDMT_CCtx* zcs, int compressionLevel) {
+    ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, 0);
+    return ZSTDMT_initCStream_internal(zcs, NULL, 0, 1, params, 0);
+}
+
+
+static size_t ZSTDMT_createCompressionJob(ZSTDMT_CCtx* zcs, size_t srcSize, unsigned endFrame)
+{
+    size_t const dstBufferCapacity = ZSTD_compressBound(srcSize);
+    buffer_t const dstBuffer = ZSTDMT_getBuffer(zcs->buffPool, dstBufferCapacity);
+    ZSTD_CCtx* const cctx = ZSTDMT_getCCtx(zcs->cctxPool);
+    unsigned const jobID = zcs->nextJobID & zcs->jobIDMask;
+
+    if ((cctx==NULL) || (dstBuffer.start==NULL)) {
+        zcs->jobs[jobID].jobCompleted = 1;
+        zcs->nextJobID++;
+        ZSTDMT_waitForAllJobsCompleted(zcs);
+        ZSTDMT_releaseAllJobResources(zcs);
+        return ERROR(memory_allocation);
+    }
+
+    DEBUGLOG(4, "preparing job %u to compress %u bytes with %u preload ", zcs->nextJobID, (U32)srcSize, (U32)zcs->dictSize);
+    zcs->jobs[jobID].src = zcs->inBuff.buffer;
+    zcs->jobs[jobID].srcStart = zcs->inBuff.buffer.start;
+    zcs->jobs[jobID].srcSize = srcSize;
+    zcs->jobs[jobID].dictSize = zcs->dictSize;   /* note : zcs->inBuff.filled is presumed >= srcSize + dictSize */
+    zcs->jobs[jobID].params = zcs->params;
+    if (zcs->nextJobID) zcs->jobs[jobID].params.fParams.checksumFlag = 0;  /* do not calculate checksum within sections, just keep it in header for first section */
+    zcs->jobs[jobID].cdict = zcs->nextJobID==0 ? zcs->cdict : NULL;
+    zcs->jobs[jobID].fullFrameSize = zcs->frameContentSize;
+    zcs->jobs[jobID].dstBuff = dstBuffer;
+    zcs->jobs[jobID].cctx = cctx;
+    zcs->jobs[jobID].firstChunk = (zcs->nextJobID==0);
+    zcs->jobs[jobID].lastChunk = endFrame;
+    zcs->jobs[jobID].jobCompleted = 0;
+    zcs->jobs[jobID].dstFlushed = 0;
+    zcs->jobs[jobID].jobCompleted_mutex = &zcs->jobCompleted_mutex;
+    zcs->jobs[jobID].jobCompleted_cond = &zcs->jobCompleted_cond;
+
+    /* get a new buffer for next input */
+    if (!endFrame) {
+        size_t const newDictSize = MIN(srcSize + zcs->dictSize, zcs->targetDictSize);
+        zcs->inBuff.buffer = ZSTDMT_getBuffer(zcs->buffPool, zcs->inBuffSize);
+        if (zcs->inBuff.buffer.start == NULL) {   /* not enough memory to allocate next input buffer */
+            zcs->jobs[jobID].jobCompleted = 1;
+            zcs->nextJobID++;
+            ZSTDMT_waitForAllJobsCompleted(zcs);
+            ZSTDMT_releaseAllJobResources(zcs);
+            return ERROR(memory_allocation);
+        }
+        DEBUGLOG(5, "inBuff filled to %u", (U32)zcs->inBuff.filled);
+        zcs->inBuff.filled -= srcSize + zcs->dictSize - newDictSize;
+        DEBUGLOG(5, "new job : filled to %u, with %u dict and %u src", (U32)zcs->inBuff.filled, (U32)newDictSize, (U32)(zcs->inBuff.filled - newDictSize));
+        memmove(zcs->inBuff.buffer.start, (const char*)zcs->jobs[jobID].srcStart + zcs->dictSize + srcSize - newDictSize, zcs->inBuff.filled);
+        DEBUGLOG(5, "new inBuff pre-filled");
+        zcs->dictSize = newDictSize;
+    } else {
+        zcs->inBuff.buffer = g_nullBuffer;
+        zcs->inBuff.filled = 0;
+        zcs->dictSize = 0;
+        zcs->frameEnded = 1;
+        if (zcs->nextJobID == 0)
+            zcs->params.fParams.checksumFlag = 0;   /* single chunk : checksum is calculated directly within worker thread */
+    }
+
+    DEBUGLOG(3, "posting job %u : %u bytes  (end:%u) (note : doneJob = %u=>%u)", zcs->nextJobID, (U32)zcs->jobs[jobID].srcSize, zcs->jobs[jobID].lastChunk, zcs->doneJobID, zcs->doneJobID & zcs->jobIDMask);
+    POOL_add(zcs->factory, ZSTDMT_compressChunk, &zcs->jobs[jobID]);   /* this call is blocking when thread worker pool is exhausted */
+    zcs->nextJobID++;
+    return 0;
+}
+
+
+/* ZSTDMT_flushNextJob() :
+ * `output` : will be updated with the amount of data flushed.
+ * `blockToFlush` : if >0, the function will block and wait until some data is available to flush.
+ * @return : amount of data remaining within the internal buffer, 1 if unknown but > 0, 0 if none remains, or an error code */
+static size_t ZSTDMT_flushNextJob(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output, unsigned blockToFlush)
+{
+    unsigned const wJobID = zcs->doneJobID & zcs->jobIDMask;
+    if (zcs->doneJobID == zcs->nextJobID) return 0;   /* all flushed ! */
+    PTHREAD_MUTEX_LOCK(&zcs->jobCompleted_mutex);
+    while (zcs->jobs[wJobID].jobCompleted==0) {
+        DEBUGLOG(5, "waiting for jobCompleted signal from job %u", zcs->doneJobID);
+        if (!blockToFlush) { pthread_mutex_unlock(&zcs->jobCompleted_mutex); return 0; }  /* nothing ready to be flushed => skip */
+        pthread_cond_wait(&zcs->jobCompleted_cond, &zcs->jobCompleted_mutex);  /* block when nothing available to flush */
+    }
+    pthread_mutex_unlock(&zcs->jobCompleted_mutex);
+    /* compression job completed : output can be flushed */
+    {   ZSTDMT_jobDescription job = zcs->jobs[wJobID];
+        if (!job.jobScanned) {
+            if (ZSTD_isError(job.cSize)) {
+                DEBUGLOG(5, "compression error detected ");
+                ZSTDMT_waitForAllJobsCompleted(zcs);
+                ZSTDMT_releaseAllJobResources(zcs);
+                return job.cSize;
+            }
+            ZSTDMT_releaseCCtx(zcs->cctxPool, job.cctx);
+            zcs->jobs[wJobID].cctx = NULL;
+            DEBUGLOG(5, "zcs->params.fParams.checksumFlag : %u ", zcs->params.fParams.checksumFlag);
+            if (zcs->params.fParams.checksumFlag) {
+                XXH64_update(&zcs->xxhState, (const char*)job.srcStart + job.dictSize, job.srcSize);
+                if (zcs->frameEnded && (zcs->doneJobID+1 == zcs->nextJobID)) {  /* write checksum at end of last section */
+                    U32 const checksum = (U32)XXH64_digest(&zcs->xxhState);
+                    DEBUGLOG(4, "writing checksum : %08X \n", checksum);
+                    MEM_writeLE32((char*)job.dstBuff.start + job.cSize, checksum);
+                    job.cSize += 4;
+                    zcs->jobs[wJobID].cSize += 4;
+            }   }
+            ZSTDMT_releaseBuffer(zcs->buffPool, job.src);
+            zcs->jobs[wJobID].srcStart = NULL;
+            zcs->jobs[wJobID].src = g_nullBuffer;
+            zcs->jobs[wJobID].jobScanned = 1;
+        }
+        {   size_t const toWrite = MIN(job.cSize - job.dstFlushed, output->size - output->pos);
+            DEBUGLOG(4, "Flushing %u bytes from job %u ", (U32)toWrite, zcs->doneJobID);
+            memcpy((char*)output->dst + output->pos, (const char*)job.dstBuff.start + job.dstFlushed, toWrite);
+            output->pos += toWrite;
+            job.dstFlushed += toWrite;
+        }
+        if (job.dstFlushed == job.cSize) {   /* output buffer fully flushed => move to next one */
+            ZSTDMT_releaseBuffer(zcs->buffPool, job.dstBuff);
+            zcs->jobs[wJobID].dstBuff = g_nullBuffer;
+            zcs->jobs[wJobID].jobCompleted = 0;
+            zcs->doneJobID++;
+        } else {
+            zcs->jobs[wJobID].dstFlushed = job.dstFlushed;
+        }
+        /* return value : how many bytes left in buffer ; fake it to 1 if unknown but >0 */
+        if (job.cSize > job.dstFlushed) return (job.cSize - job.dstFlushed);
+        if (zcs->doneJobID < zcs->nextJobID) return 1;   /* still some buffer to flush */
+        zcs->allJobsCompleted = zcs->frameEnded;   /* frame completed and entirely flushed */
+        return 0;   /* everything flushed */
+}   }
+
+
+size_t ZSTDMT_compressStream(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+    size_t const newJobThreshold = zcs->dictSize + zcs->targetSectionSize + zcs->marginSize;
+    if (zcs->frameEnded) return ERROR(stage_wrong);   /* current frame being ended. Only flush is allowed. Restart with init */
+    if (zcs->nbThreads==1) return ZSTD_compressStream(zcs->cstream, output, input);
+
+    /* fill input buffer */
+    {   size_t const toLoad = MIN(input->size - input->pos, zcs->inBuffSize - zcs->inBuff.filled);
+        memcpy((char*)zcs->inBuff.buffer.start + zcs->inBuff.filled, input->src, toLoad);
+        input->pos += toLoad;
+        zcs->inBuff.filled += toLoad;
+    }
+
+    if ( (zcs->inBuff.filled >= newJobThreshold)  /* filled enough : let's compress */
+        && (zcs->nextJobID <= zcs->doneJobID + zcs->jobIDMask) ) {   /* avoid overwriting job round buffer */
+        CHECK_F( ZSTDMT_createCompressionJob(zcs, zcs->targetSectionSize, 0) );
+    }
+
+    /* check for data to flush */
+    CHECK_F( ZSTDMT_flushNextJob(zcs, output, (zcs->inBuff.filled == zcs->inBuffSize)) ); /* block if it wasn't possible to create new job due to saturation */
+
+    /* recommended next input size : fill current input buffer */
+    return zcs->inBuffSize - zcs->inBuff.filled;   /* note : could be zero when input buffer is fully filled and no more availability to create new job */
+}
+
+
+static size_t ZSTDMT_flushStream_internal(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output, unsigned endFrame)
+{
+    size_t const srcSize = zcs->inBuff.filled - zcs->dictSize;
+
+    if (srcSize) DEBUGLOG(4, "flushing : %u bytes left to compress", (U32)srcSize);
+    if ( ((srcSize > 0) || (endFrame && !zcs->frameEnded))
+       && (zcs->nextJobID <= zcs->doneJobID + zcs->jobIDMask) ) {
+        CHECK_F( ZSTDMT_createCompressionJob(zcs, srcSize, endFrame) );
+    }
+
+    /* check if there is any data available to flush */
+    DEBUGLOG(5, "zcs->doneJobID : %u  ; zcs->nextJobID : %u ", zcs->doneJobID, zcs->nextJobID);
+    return ZSTDMT_flushNextJob(zcs, output, 1);
+}
+
+
+size_t ZSTDMT_flushStream(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output)
+{
+    if (zcs->nbThreads==1) return ZSTD_flushStream(zcs->cstream, output);
+    return ZSTDMT_flushStream_internal(zcs, output, 0);
+}
+
+size_t ZSTDMT_endStream(ZSTDMT_CCtx* zcs, ZSTD_outBuffer* output)
+{
+    if (zcs->nbThreads==1) return ZSTD_endStream(zcs->cstream, output);
+    return ZSTDMT_flushStream_internal(zcs, output, 1);
+}
diff --git a/zstd/lib/compress/zstdmt_compress.h b/zstd/lib/compress/zstdmt_compress.h
new file mode 100644
index 0000000..27f78ee
--- /dev/null
+++ b/zstd/lib/compress/zstdmt_compress.h
@@ -0,0 +1,78 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+#ifndef ZSTDMT_COMPRESS_H
+#define ZSTDMT_COMPRESS_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* Note : All prototypes defined in this file shall be considered experimental.
+ *        There is no guarantee of API continuity (yet) on any of these prototypes */
+
+/* ===   Dependencies   === */
+#include <stddef.h>   /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_parameters */
+#include "zstd.h"     /* ZSTD_inBuffer, ZSTD_outBuffer, ZSTDLIB_API */
+
+
+/* ===   Simple one-pass functions   === */
+
+typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx;
+ZSTDLIB_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbThreads);
+ZSTDLIB_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* cctx);
+
+ZSTDLIB_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* cctx,
+                           void* dst, size_t dstCapacity,
+                     const void* src, size_t srcSize,
+                           int compressionLevel);
+
+
+/* ===   Streaming functions   === */
+
+ZSTDLIB_API size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel);
+ZSTDLIB_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize);    /**< pledgedSrcSize is optional and can be zero == unknown */
+
+ZSTDLIB_API size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTDMT_flushStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output);   /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTDMT_endStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output);     /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */
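+
+/* Illustrative sketch (not part of the API) : a typical streaming loop built
+ * from the prototypes above, compressing one FILE* into another. Buffer sizes,
+ * the compression level and the helper name are arbitrary assumptions; error
+ * handling is abbreviated. Kept under `#if 0` so it does not affect builds. */
+#if 0
+#include <stdio.h>
+static int example_stream_file(ZSTDMT_CCtx* mtctx, FILE* fin, FILE* fout)
+{
+    char inBuff[1 << 17], outBuff[1 << 17];
+    size_t readSize;
+    if (ZSTD_isError(ZSTDMT_initCStream(mtctx, 3))) return 1;
+    while ((readSize = fread(inBuff, 1, sizeof(inBuff), fin)) > 0) {
+        ZSTD_inBuffer input = { inBuff, readSize, 0 };
+        while (input.pos < input.size) {   /* consume the whole read */
+            ZSTD_outBuffer output = { outBuff, sizeof(outBuff), 0 };
+            if (ZSTD_isError(ZSTDMT_compressStream(mtctx, &output, &input))) return 1;
+            fwrite(outBuff, 1, output.pos, fout);
+    }   }
+    {   size_t remaining;
+        do {   /* flush until ZSTDMT_endStream() reports 0 == all flushed */
+            ZSTD_outBuffer output = { outBuff, sizeof(outBuff), 0 };
+            remaining = ZSTDMT_endStream(mtctx, &output);
+            if (ZSTD_isError(remaining)) return 1;
+            fwrite(outBuff, 1, output.pos, fout);
+        } while (remaining > 0);
+    }
+    return 0;
+}
+#endif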
+
+
+/* ===   Advanced functions and parameters  === */
+
+#ifndef ZSTDMT_SECTION_SIZE_MIN
+#  define ZSTDMT_SECTION_SIZE_MIN (1U << 20)   /* 1 MB - Minimum size of each compression job */
+#endif
+
+ZSTDLIB_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx, const void* dict, size_t dictSize,  /**< dict can be released after init, a local copy is preserved within mtctx */
+                                          ZSTD_parameters params, unsigned long long pledgedSrcSize);  /**< pledgedSrcSize is optional and can be zero == unknown */
+
+/* ZSTDMT_parameter :
+ * List of parameters that can be set using ZSTDMT_setMTCtxParameter() */
+typedef enum {
+    ZSTDMT_p_sectionSize,        /* size of an input "section". Each section is compressed in parallel. 0 means default, which is dynamically determined within compression functions */
+    ZSTDMT_p_overlapSectionLog   /* log of the overlapped section; 0 == no overlap, 6 (default) == use 1/8th of window, >=9 == use full window */
+} ZSTDMT_parameter;
+
+/* ZSTDMT_setMTCtxParameter() :
+ * allows setting individual parameters, one at a time, selected from the ZSTDMT_parameter enum.
+ * The function should typically be called after ZSTDMT_createCCtx().
+ * Parameters not explicitly reset by ZSTDMT_init*() remain the same across consecutive compression sessions.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, unsigned value);
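+
+/* Illustrative usage (sketch) : select a fixed 2 MB section size and the
+ * default 1/8th-window overlap before starting a session. The values are
+ * arbitrary examples, not recommendations. */
+#if 0
+    ZSTDMT_setMTCtxParameter(mtctx, ZSTDMT_p_sectionSize, 2U << 20);
+    ZSTDMT_setMTCtxParameter(mtctx, ZSTDMT_p_overlapSectionLog, 6);
+#endif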
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* ZSTDMT_COMPRESS_H */
diff --git a/zstd/lib/decompress/huf_decompress.c b/zstd/lib/decompress/huf_decompress.c
new file mode 100644
index 0000000..6e76cc4
--- /dev/null
+++ b/zstd/lib/decompress/huf_decompress.c
@@ -0,0 +1,885 @@
+/* ******************************************************************
+   Huffman decoder, part of New Generation Entropy library
+   Copyright (C) 2013-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact the author at :
+    - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+    - Public forum : https://groups.google.com/forum/#!forum/lz4c
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+/* inline is defined */
+#elif defined(_MSC_VER) || defined(__GNUC__)
+#  define inline __inline
+#else
+#  define inline /* disable inline */
+#endif
+
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+*  Dependencies
+****************************************************************/
+#include <string.h>     /* memcpy, memset */
+#include "bitstream.h"  /* BIT_* */
+#include "fse.h"        /* header compression */
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_STATIC_ASSERT(c) { enum { HUF_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
+
+
+/*-***************************/
+/*  generic DTableDesc       */
+/*-***************************/
+
+typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc;
+
+static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
+{
+    DTableDesc dtd;
+    memcpy(&dtd, table, sizeof(dtd));
+    return dtd;
+}
+
+
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
+
+typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX2;   /* single-symbol decoding */
+
+size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
+    U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+    size_t iSize;
+    void* const dtPtr = DTable + 1;
+    HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
+
+    HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
+    /* memset(huffWeight, 0, sizeof(huffWeight)); */   /* is not necessary, even though some analyzers complain ... */
+
+    iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* Table header */
+    {   DTableDesc dtd = HUF_getDTableDesc(DTable);
+        if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge);   /* DTable too small, Huffman tree cannot fit in */
+        dtd.tableType = 0;
+        dtd.tableLog = (BYTE)tableLog;
+        memcpy(DTable, &dtd, sizeof(dtd));
+    }
+
+    /* Calculate starting value for each rank */
+    {   U32 n, nextRankStart = 0;
+        for (n=1; n<tableLog+1; n++) {
+            U32 const current = nextRankStart;
+            nextRankStart += (rankVal[n] << (n-1));
+            rankVal[n] = current;
+    }   }
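+    /* illustrative example : with tableLog = 3 and weight counts
+     * rankVal[1] = 2, rankVal[2] = 1, rankVal[3] = 1 (2*1 + 1*2 + 1*4 = 8 = 1<<3),
+     * the starting values become rankVal[1] = 0, rankVal[2] = 2, rankVal[3] = 4;
+     * each weight-w symbol then fills (1<<w)>>1 consecutive dt[] entries below */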
+
+    /* fill DTable */
+    {   U32 n;
+        for (n=0; n<nbSymbols; n++) {
+            U32 const w = huffWeight[n];
+            U32 const length = (1 << w) >> 1;
+            U32 u;
+            HUF_DEltX2 D;
+            D.byte = (BYTE)n; D.nbBits = (BYTE)(tableLog + 1 - w);
+            for (u = rankVal[w]; u < rankVal[w] + length; u++)
+                dt[u] = D;
+            rankVal[w] += length;
+    }   }
+
+    return iSize;
+}
+
+
+static BYTE HUF_decodeSymbolX2(BIT_DStream_t* Dstream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+    BYTE const c = dt[val].byte;
+    BIT_skipBits(Dstream, dt[val].nbBits);
+    return c;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+    *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr)
+
+static inline size_t HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX2* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 4 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd-4)) {
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+    }
+
+    /* closer to the end */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd))
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    /* no more data to retrieve from bitstream, hence no need to reload */
+    while (p < pEnd)
+        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+    return pEnd-pStart;
+}
+
+static size_t HUF_decompress1X2_usingDTable_internal(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + dstSize;
+    const void* dtPtr = DTable + 1;
+    const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+    BIT_DStream_t bitD;
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    U32 const dtLog = dtd.tableLog;
+
+    { size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);
+      if (HUF_isError(errorCode)) return errorCode; }
+
+    HUF_decodeStreamX2(op, &bitD, oend, dt, dtLog);
+
+    /* check */
+    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    return dstSize;
+}
+
+size_t HUF_decompress1X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 0) return ERROR(GENERIC);
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUF_decompress1X2_DCtx (HUF_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX2 (DCtx, cSrc, cSrcSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress1X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx);
+}
+
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress1X2_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+static size_t HUF_decompress4X2_usingDTable_internal(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    /* Check */
+    if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable + 1;
+        const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        const size_t segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        { size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1);
+          if (HUF_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2);
+          if (HUF_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3);
+          if (HUF_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4);
+          if (HUF_isError(errorCode)) return errorCode; }
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) && (op4<(oend-7)) ; ) {
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 supposed already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX2(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+        if (!endSignal) return ERROR(corruption_detected);
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+size_t HUF_decompress4X2_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 0) return ERROR(GENERIC);
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+
+size_t HUF_decompress4X2_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX2 (dctx, cSrc, cSrcSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress4X2_usingDTable_internal (dst, dstSize, ip, cSrcSize, dctx);
+}
+
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX4;  /* double-symbols decoding */
+
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+
+/* HUF_fillDTableX4Level2() :
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
+static void HUF_fillDTableX4Level2(HUF_DEltX4* DTable, U32 sizeLog, const U32 consumed,
+                           const U32* rankValOrigin, const int minWeight,
+                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+                           U32 nbBitsBaseline, U16 baseSeq)
+{
+    HUF_DEltX4 DElt;
+    U32 rankVal[HUF_TABLELOG_MAX + 1];
+
+    /* get pre-calculated rankVal */
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill skipped values */
+    if (minWeight>1) {
+        U32 i, skipSize = rankVal[minWeight];
+        MEM_writeLE16(&(DElt.sequence), baseSeq);
+        DElt.nbBits   = (BYTE)(consumed);
+        DElt.length   = 1;
+        for (i = 0; i < skipSize; i++)
+            DTable[i] = DElt;
+    }
+
+    /* fill DTable */
+    {   U32 s; for (s=0; s<sortedListSize; s++) {   /* note : sortedSymbols already skipped */
+            const U32 symbol = sortedSymbols[s].symbol;
+            const U32 weight = sortedSymbols[s].weight;
+            const U32 nbBits = nbBitsBaseline - weight;
+            const U32 length = 1 << (sizeLog-nbBits);
+            const U32 start = rankVal[weight];
+            U32 i = start;
+            const U32 end = start + length;
+
+            MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+            DElt.nbBits = (BYTE)(nbBits + consumed);
+            DElt.length = 2;
+            do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */
+
+            rankVal[weight] += length;
+    }   }
+}
+
+typedef U32 rankVal_t[HUF_TABLELOG_MAX][HUF_TABLELOG_MAX + 1];
+
+static void HUF_fillDTableX4(HUF_DEltX4* DTable, const U32 targetLog,
+                           const sortedSymbol_t* sortedList, const U32 sortedListSize,
+                           const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+                           const U32 nbBitsBaseline)
+{
+    U32 rankVal[HUF_TABLELOG_MAX + 1];
+    const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+    const U32 minBits  = nbBitsBaseline - maxWeight;
+    U32 s;
+
+    memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+    /* fill DTable */
+    for (s=0; s<sortedListSize; s++) {
+        const U16 symbol = sortedList[s].symbol;
+        const U32 weight = sortedList[s].weight;
+        const U32 nbBits = nbBitsBaseline - weight;
+        const U32 start = rankVal[weight];
+        const U32 length = 1 << (targetLog-nbBits);
+
+        if (targetLog-nbBits >= minBits) {   /* enough room for a second symbol */
+            U32 sortedRank;
+            int minWeight = nbBits + scaleLog;
+            if (minWeight < 1) minWeight = 1;
+            sortedRank = rankStart[minWeight];
+            HUF_fillDTableX4Level2(DTable+start, targetLog-nbBits, nbBits,
+                           rankValOrigin[nbBits], minWeight,
+                           sortedList+sortedRank, sortedListSize-sortedRank,
+                           nbBitsBaseline, symbol);
+        } else {
+            HUF_DEltX4 DElt;
+            MEM_writeLE16(&(DElt.sequence), symbol);
+            DElt.nbBits = (BYTE)(nbBits);
+            DElt.length = 1;
+            {   U32 const end = start + length;
+                U32 u;
+                for (u = start; u < end; u++) DTable[u] = DElt;
+        }   }
+        rankVal[weight] += length;
+    }
+}
+
+size_t HUF_readDTableX4 (HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+    BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];
+    sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];
+    U32 rankStats[HUF_TABLELOG_MAX + 1] = { 0 };
+    U32 rankStart0[HUF_TABLELOG_MAX + 2] = { 0 };
+    U32* const rankStart = rankStart0+1;
+    rankVal_t rankVal;
+    U32 tableLog, maxW, sizeOfSort, nbSymbols;
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    U32 const maxTableLog = dtd.maxTableLog;
+    size_t iSize;
+    void* dtPtr = DTable+1;   /* force compiler to avoid strict-aliasing */
+    HUF_DEltX4* const dt = (HUF_DEltX4*)dtPtr;
+
+    HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable));   /* if compiler fails here, assertion is wrong */
+    if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    /* memset(weightList, 0, sizeof(weightList)); */  /* is not necessary, even though some analyzers complain ... */
+
+    iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+    if (HUF_isError(iSize)) return iSize;
+
+    /* check result */
+    if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */
+
+    /* find maxWeight */
+    for (maxW = tableLog; rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */
+
+    /* Get start index of each weight */
+    {   U32 w, nextRankStart = 0;
+        for (w=1; w<maxW+1; w++) {
+            U32 current = nextRankStart;
+            nextRankStart += rankStats[w];
+            rankStart[w] = current;
+        }
+        rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of the sorted list */
+        sizeOfSort = nextRankStart;
+    }
+
+    /* sort symbols by weight */
+    {   U32 s;
+        for (s=0; s<nbSymbols; s++) {
+            U32 const w = weightList[s];
+            U32 const r = rankStart[w]++;
+            sortedSymbol[r].symbol = (BYTE)s;
+            sortedSymbol[r].weight = (BYTE)w;
+        }
+        rankStart[0] = 0;   /* forget 0w symbols; this is the beginning of weight(1) */
+    }
+
+    /* Build rankVal */
+    {   U32* const rankVal0 = rankVal[0];
+        {   int const rescale = (maxTableLog-tableLog) - 1;   /* tableLog <= maxTableLog */
+            U32 nextRankVal = 0;
+            U32 w;
+            for (w=1; w<maxW+1; w++) {
+                U32 current = nextRankVal;
+                nextRankVal += rankStats[w] << (w+rescale);
+                rankVal0[w] = current;
+        }   }
+        {   U32 const minBits = tableLog+1 - maxW;
+            U32 consumed;
+            for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
+                U32* const rankValPtr = rankVal[consumed];
+                U32 w;
+                for (w = 1; w < maxW+1; w++) {
+                    rankValPtr[w] = rankVal0[w] >> consumed;
+    }   }   }   }
+
+    HUF_fillDTableX4(dt, maxTableLog,
+                   sortedSymbol, sizeOfSort,
+                   rankStart0, rankVal, maxW,
+                   tableLog+1);
+
+    dtd.tableLog = (BYTE)maxTableLog;
+    dtd.tableType = 1;
+    memcpy(DTable, &dtd, sizeof(dtd));
+    return iSize;
+}
+
+
+static U32 HUF_decodeSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 2);
+    BIT_skipBits(DStream, dt[val].nbBits);
+    return dt[val].length;
+}
+
+static U32 HUF_decodeLastSymbolX4(void* op, BIT_DStream_t* DStream, const HUF_DEltX4* dt, const U32 dtLog)
+{
+    size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */
+    memcpy(op, dt+val, 1);
+    if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
+    else {
+        if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
+            BIT_skipBits(DStream, dt[val].nbBits);
+            if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+                DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);   /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+    }   }
+    return 1;
+}
+
+
+#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) \
+    ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \
+    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \
+    if (MEM_64bits()) \
+        ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog)
+
+static inline size_t HUF_decodeStreamX4(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, const HUF_DEltX4* const dt, const U32 dtLog)
+{
+    BYTE* const pStart = p;
+
+    /* up to 8 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_1(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_2(p, bitDPtr);
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+    }
+
+    /* closer to end : up to 2 symbols at a time */
+    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);
+
+    while (p <= pEnd-2)
+        HUF_DECODE_SYMBOLX4_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */
+
+    if (p < pEnd)
+        p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog);
+
+    return p-pStart;
+}
+
+
+static size_t HUF_decompress1X4_usingDTable_internal(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    BIT_DStream_t bitD;
+
+    /* Init */
+    {   size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize);
+        if (HUF_isError(errorCode)) return errorCode;
+    }
+
+    /* decode */
+    {   BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
+        const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog);
+    }
+
+    /* check */
+    if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+    /* decoded size */
+    return dstSize;
+}
+
+size_t HUF_decompress1X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 1) return ERROR(GENERIC);
+    return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUF_decompress1X4_DCtx (HUF_DTable* DCtx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t const hSize = HUF_readDTableX4 (DCtx, cSrc, cSrcSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress1X4_usingDTable_internal (dst, dstSize, ip, cSrcSize, DCtx);
+}
+
+size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress1X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+static size_t HUF_decompress4X4_usingDTable_internal(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+
+    {   const BYTE* const istart = (const BYTE*) cSrc;
+        BYTE* const ostart = (BYTE*) dst;
+        BYTE* const oend = ostart + dstSize;
+        const void* const dtPtr = DTable+1;
+        const HUF_DEltX4* const dt = (const HUF_DEltX4*)dtPtr;
+
+        /* Init */
+        BIT_DStream_t bitD1;
+        BIT_DStream_t bitD2;
+        BIT_DStream_t bitD3;
+        BIT_DStream_t bitD4;
+        size_t const length1 = MEM_readLE16(istart);
+        size_t const length2 = MEM_readLE16(istart+2);
+        size_t const length3 = MEM_readLE16(istart+4);
+        size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+        const BYTE* const istart1 = istart + 6;  /* jumpTable */
+        const BYTE* const istart2 = istart1 + length1;
+        const BYTE* const istart3 = istart2 + length2;
+        const BYTE* const istart4 = istart3 + length3;
+        size_t const segmentSize = (dstSize+3) / 4;
+        BYTE* const opStart2 = ostart + segmentSize;
+        BYTE* const opStart3 = opStart2 + segmentSize;
+        BYTE* const opStart4 = opStart3 + segmentSize;
+        BYTE* op1 = ostart;
+        BYTE* op2 = opStart2;
+        BYTE* op3 = opStart3;
+        BYTE* op4 = opStart4;
+        U32 endSignal;
+        DTableDesc const dtd = HUF_getDTableDesc(DTable);
+        U32 const dtLog = dtd.tableLog;
+
+        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
+        { size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1);
+          if (HUF_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2);
+          if (HUF_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3);
+          if (HUF_isError(errorCode)) return errorCode; }
+        { size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4);
+          if (HUF_isError(errorCode)) return errorCode; }
+
+        /* 16-32 symbols per loop (4-8 symbols per stream) */
+        endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_1(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_1(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_1(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_1(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_2(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_2(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_2(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_2(op4, &bitD4);
+            HUF_DECODE_SYMBOLX4_0(op1, &bitD1);
+            HUF_DECODE_SYMBOLX4_0(op2, &bitD2);
+            HUF_DECODE_SYMBOLX4_0(op3, &bitD3);
+            HUF_DECODE_SYMBOLX4_0(op4, &bitD4);
+
+            endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
+        }
+
+        /* check corruption */
+        if (op1 > opStart2) return ERROR(corruption_detected);
+        if (op2 > opStart3) return ERROR(corruption_detected);
+        if (op3 > opStart4) return ERROR(corruption_detected);
+        /* note : op4 already verified within main loop */
+
+        /* finish bitStreams one by one */
+        HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog);
+        HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog);
+        HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog);
+        HUF_decodeStreamX4(op4, &bitD4, oend,     dt, dtLog);
+
+        /* check */
+        { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+          if (!endCheck) return ERROR(corruption_detected); }
+
+        /* decoded size */
+        return dstSize;
+    }
+}
+
+
+size_t HUF_decompress4X4_usingDTable(
+          void* dst,  size_t dstSize,
+    const void* cSrc, size_t cSrcSize,
+    const HUF_DTable* DTable)
+{
+    DTableDesc dtd = HUF_getDTableDesc(DTable);
+    if (dtd.tableType != 1) return ERROR(GENERIC);
+    return HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable);
+}
+
+
+size_t HUF_decompress4X4_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    const BYTE* ip = (const BYTE*) cSrc;
+
+    size_t hSize = HUF_readDTableX4 (dctx, cSrc, cSrcSize);
+    if (HUF_isError(hSize)) return hSize;
+    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+    ip += hSize; cSrcSize -= hSize;
+
+    return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx);
+}
+
+size_t HUF_decompress4X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX4(DTable, HUF_TABLELOG_MAX);
+    return HUF_decompress4X4_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+
+/* ********************************/
+/* Generic decompression selector */
+/* ********************************/
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
+                                    const void* cSrc, size_t cSrcSize,
+                                    const HUF_DTable* DTable)
+{
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) :
+                           HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+}
+
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
+                                    const void* cSrc, size_t cSrcSize,
+                                    const HUF_DTable* DTable)
+{
+    DTableDesc const dtd = HUF_getDTableDesc(DTable);
+    return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) :
+                           HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable);
+}
+
+
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+    /* single, double, quad */
+    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */
+    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */
+    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */
+    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */
+    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */
+    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */
+    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */
+    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */
+    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */
+    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */
+    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */
+    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */
+    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */
+    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */
+    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */
+    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */
+};
+
+/** HUF_selectDecoder() :
+*   Tells which decoder is likely to decode faster,
+*   based on a set of pre-determined metrics.
+*   @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
+*   Assumption : 0 < cSrcSize < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
+{
+    /* decoder timing evaluation */
+    U32 const Q = (U32)(cSrcSize * 16 / dstSize);   /* Q < 16 since dstSize > cSrcSize */
+    U32 const D256 = (U32)(dstSize >> 8);
+    U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
+    U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
+    DTime1 += DTime1 >> 3;  /* +12.5% penalty for the double-symbol decoder : favors the algorithm using less memory, to limit cache eviction */
+
+    return DTime1 < DTime0;
+}
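+
+/* Worked example (illustrative only, derived from the algoTime table above) :
+ * dstSize = 64 KB, cSrcSize = 32 KB
+ *   => Q = 32768*16/65536 = 8 ; D256 = 65536>>8 = 256
+ *   => DTime0 = 926 + 128*256 = 33694
+ *   => DTime1 = 1613 + 75*256 = 20813, plus 20813>>3 => 23414
+ * 23414 < 33694, so the double-symbol decoder (HUF_decompress4X4) is selected. */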
+
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    static const decompressionAlgo decompress[2] = { HUF_decompress4X2, HUF_decompress4X4 };
+
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
+    }
+}
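+
+/* Typical call sequence, for illustration (dst must be sized to the exact
+ * regenerated size, since cSrcSize == dstSize is interpreted as "not compressed") :
+ *
+ *     size_t const dSize = HUF_decompress(dst, originalSize, cSrc, cSrcSize);
+ *     if (HUF_isError(dSize)) { .. handle error .. }
+ */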
+
+size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUF_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
+                        HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
+    }
+}
+
+size_t HUF_decompress4X_hufOnly (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if ((cSrcSize >= dstSize) || (cSrcSize <= 1)) return ERROR(corruption_detected);   /* invalid */
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUF_decompress4X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
+                        HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
+    }
+}
+
+size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+    /* validation checks */
+    if (dstSize == 0) return ERROR(dstSize_tooSmall);
+    if (cSrcSize > dstSize) return ERROR(corruption_detected);   /* invalid */
+    if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; }   /* not compressed */
+    if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; }   /* RLE */
+
+    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+        return algoNb ? HUF_decompress1X4_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
+                        HUF_decompress1X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
+    }
+}
diff --git a/zstd/lib/decompress/huf_decompress.o b/zstd/lib/decompress/huf_decompress.o
new file mode 100644
index 0000000..4e71acd
Binary files /dev/null and b/zstd/lib/decompress/huf_decompress.o differ
diff --git a/zstd/lib/decompress/zstd_decompress.c b/zstd/lib/decompress/zstd_decompress.c
new file mode 100644
index 0000000..482c334
--- /dev/null
+++ b/zstd/lib/decompress/zstd_decompress.c
@@ -0,0 +1,2480 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Selects how the default decompression function ZSTD_decompress() allocates memory :
+ * on the stack (0), or on the heap (1, requires malloc()).
+ */
+#ifndef ZSTD_HEAPMODE
+#  define ZSTD_HEAPMODE 1
+#endif
+
+/*!
+*  LEGACY_SUPPORT :
+*  if set to 1, ZSTD_decompress() can decode older formats (v0.1+)
+*/
+#ifndef ZSTD_LEGACY_SUPPORT
+#  define ZSTD_LEGACY_SUPPORT 0
+#endif
+
+/*!
+*  MAXWINDOWSIZE_DEFAULT :
+*  maximum window size accepted by DStream, by default.
+*  Frames requiring more memory will be rejected.
+*/
+#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT
+#  define ZSTD_MAXWINDOWSIZE_DEFAULT ((1 << ZSTD_WINDOWLOG_MAX) + 1)   /* defined within zstd.h */
+#endif
+
+
+/*-*******************************************************
+*  Dependencies
+*********************************************************/
+#include <string.h>      /* memcpy, memmove, memset */
+#include "mem.h"         /* low level memory routines */
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
+#define HUF_STATIC_LINKING_ONLY
+#include "huf.h"
+#include "zstd_internal.h"
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+#  include "zstd_legacy.h"
+#endif
+
+
+#if defined(_MSC_VER)
+#  include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#  define ZSTD_PREFETCH(ptr)   _mm_prefetch((const char*)ptr, _MM_HINT_T0)
+#elif defined(__GNUC__)
+#  define ZSTD_PREFETCH(ptr)   __builtin_prefetch(ptr, 0, 0)
+#else
+#  define ZSTD_PREFETCH(ptr)   /* disabled */
+#endif
+
+/*-*************************************
+*  Macros
+***************************************/
+#define ZSTD_isError ERR_isError   /* for inlining */
+#define FSE_isError  ERR_isError
+#define HUF_isError  ERR_isError
+
+
+/*_*******************************************************
+*  Memory operations
+**********************************************************/
+static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+
+
+/*-*************************************************************
+*   Context management
+***************************************************************/
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+               ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock,
+               ZSTDds_decompressLastBlock, ZSTDds_checkChecksum,
+               ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage;
+
+typedef struct {
+    FSE_DTable LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
+    FSE_DTable OFTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
+    FSE_DTable MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
+    HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)];  /* can accommodate HUF_decompress4X */
+    U32 rep[ZSTD_REP_NUM];
+} ZSTD_entropyTables_t;
+
+struct ZSTD_DCtx_s
+{
+    const FSE_DTable* LLTptr;
+    const FSE_DTable* MLTptr;
+    const FSE_DTable* OFTptr;
+    const HUF_DTable* HUFptr;
+    ZSTD_entropyTables_t entropy;
+    const void* previousDstEnd;   /* detect continuity */
+    const void* base;             /* start of current segment */
+    const void* vBase;            /* virtual start of previous segment if it was just before current one */
+    const void* dictEnd;          /* end of previous segment */
+    size_t expected;
+    ZSTD_frameParams fParams;
+    blockType_e bType;   /* used in ZSTD_decompressContinue(), to transfer blockType between header decoding and block decoding stages */
+    ZSTD_dStage stage;
+    U32 litEntropy;
+    U32 fseEntropy;
+    XXH64_state_t xxhState;
+    size_t headerSize;
+    U32 dictID;
+    const BYTE* litPtr;
+    ZSTD_customMem customMem;
+    size_t litSize;
+    size_t rleSize;
+    BYTE litBuffer[ZSTD_BLOCKSIZE_ABSOLUTEMAX + WILDCOPY_OVERLENGTH];
+    BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
+};  /* typedef'd to ZSTD_DCtx within "zstd.h" */
+
+size_t ZSTD_sizeof_DCtx (const ZSTD_DCtx* dctx) { return (dctx==NULL) ? 0 : sizeof(ZSTD_DCtx); }
+
+size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); }
+
+size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
+{
+    dctx->expected = ZSTD_frameHeaderSize_prefix;
+    dctx->stage = ZSTDds_getFrameHeaderSize;
+    dctx->previousDstEnd = NULL;
+    dctx->base = NULL;
+    dctx->vBase = NULL;
+    dctx->dictEnd = NULL;
+    dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+    dctx->litEntropy = dctx->fseEntropy = 0;
+    dctx->dictID = 0;
+    MEM_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue));
+    memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue));  /* initial repcodes */
+    dctx->LLTptr = dctx->entropy.LLTable;
+    dctx->MLTptr = dctx->entropy.MLTable;
+    dctx->OFTptr = dctx->entropy.OFTable;
+    dctx->HUFptr = dctx->entropy.hufTable;
+    return 0;
+}
+
+ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem)
+{
+    ZSTD_DCtx* dctx;
+
+    if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem;
+    if (!customMem.customAlloc || !customMem.customFree) return NULL;
+
+    dctx = (ZSTD_DCtx*)ZSTD_malloc(sizeof(ZSTD_DCtx), customMem);
+    if (!dctx) return NULL;
+    memcpy(&dctx->customMem, &customMem, sizeof(customMem));
+    ZSTD_decompressBegin(dctx);
+    return dctx;
+}
+
+ZSTD_DCtx* ZSTD_createDCtx(void)
+{
+    return ZSTD_createDCtx_advanced(defaultCustomMem);
+}
+
+size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+{
+    if (dctx==NULL) return 0;   /* support free on NULL */
+    ZSTD_free(dctx, dctx->customMem);
+    return 0;   /* reserved as a potential error code in the future */
+}
+
+void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
+{
+    size_t const workSpaceSize = (ZSTD_BLOCKSIZE_ABSOLUTEMAX+WILDCOPY_OVERLENGTH) + ZSTD_frameHeaderSize_max;
+    memcpy(dstDCtx, srcDCtx, sizeof(ZSTD_DCtx) - workSpaceSize);  /* no need to copy workspace */
+}
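+
+/* note : litBuffer and headerBuffer are the last members of ZSTD_DCtx_s,
+ * so subtracting workSpaceSize from sizeof(ZSTD_DCtx) makes the memcpy skip
+ * exactly those two trailing scratch buffers. */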
+
+#if 0
+/* deprecated */
+static void ZSTD_refDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
+{
+    ZSTD_decompressBegin(dstDCtx);  /* init */
+    if (srcDCtx) {   /* support refDCtx on NULL */
+        dstDCtx->dictEnd = srcDCtx->dictEnd;
+        dstDCtx->vBase = srcDCtx->vBase;
+        dstDCtx->base = srcDCtx->base;
+        dstDCtx->previousDstEnd = srcDCtx->previousDstEnd;
+        dstDCtx->dictID = srcDCtx->dictID;
+        dstDCtx->litEntropy = srcDCtx->litEntropy;
+        dstDCtx->fseEntropy = srcDCtx->fseEntropy;
+        dstDCtx->LLTptr = srcDCtx->entropy.LLTable;
+        dstDCtx->MLTptr = srcDCtx->entropy.MLTable;
+        dstDCtx->OFTptr = srcDCtx->entropy.OFTable;
+        dstDCtx->HUFptr = srcDCtx->entropy.hufTable;
+        dstDCtx->entropy.rep[0] = srcDCtx->entropy.rep[0];
+        dstDCtx->entropy.rep[1] = srcDCtx->entropy.rep[1];
+        dstDCtx->entropy.rep[2] = srcDCtx->entropy.rep[2];
+    }
+}
+#endif
+
+static void ZSTD_refDDict(ZSTD_DCtx* dstDCtx, const ZSTD_DDict* ddict);
+
+
+/*-*************************************************************
+*   Decompression section
+***************************************************************/
+
+/*! ZSTD_isFrame() :
+ *  Tells if the content of `buffer` starts with a valid Frame Identifier.
+ *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ *  Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ *  Note 3 : Skippable Frame Identifiers are considered valid. */
+unsigned ZSTD_isFrame(const void* buffer, size_t size)
+{
+    if (size < 4) return 0;
+    {   U32 const magic = MEM_readLE32(buffer);
+        if (magic == ZSTD_MAGICNUMBER) return 1;
+        if ((magic & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) return 1;
+    }
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+    if (ZSTD_isLegacy(buffer, size)) return 1;
+#endif
+    return 0;
+}
+
+
+/** ZSTD_frameHeaderSize() :
+*   srcSize must be >= ZSTD_frameHeaderSize_prefix.
+*   @return : size of the Frame Header */
+static size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
+{
+    if (srcSize < ZSTD_frameHeaderSize_prefix) return ERROR(srcSize_wrong);
+    {   BYTE const fhd = ((const BYTE*)src)[4];
+        U32 const dictID= fhd & 3;
+        U32 const singleSegment = (fhd >> 5) & 1;
+        U32 const fcsId = fhd >> 6;
+        return ZSTD_frameHeaderSize_prefix + !singleSegment + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId]
+                + (singleSegment && !fcsId);
+    }
+}
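+
+/* Worked example, for illustration : a frame header descriptor byte of 0
+ * (no dictID, no checksum, not single-segment, fcsId==0) yields
+ * 5 (prefix) + 1 (window descriptor) + 0 + 0 + 0 = 6 bytes ;
+ * a descriptor of 0x20 (single-segment, fcsId==0) also yields 6 bytes,
+ * the extra byte being a 1-byte frame content size instead. */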
+
+
+/** ZSTD_getFrameParams() :
+*   decode Frame Header, or require larger `srcSize`.
+*   @return : 0, `fparamsPtr` is correctly filled,
+*            >0, `srcSize` is too small, result is expected `srcSize`,
+*             or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_getFrameParams(ZSTD_frameParams* fparamsPtr, const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+
+    if (srcSize < ZSTD_frameHeaderSize_prefix) return ZSTD_frameHeaderSize_prefix;
+    if (MEM_readLE32(src) != ZSTD_MAGICNUMBER) {
+        if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+            if (srcSize < ZSTD_skippableHeaderSize) return ZSTD_skippableHeaderSize; /* magic number + skippable frame length */
+            memset(fparamsPtr, 0, sizeof(*fparamsPtr));
+            fparamsPtr->frameContentSize = MEM_readLE32((const char *)src + 4);
+            fparamsPtr->windowSize = 0; /* windowSize==0 means a frame is skippable */
+            return 0;
+        }
+        return ERROR(prefix_unknown);
+    }
+
+    /* ensure there is enough `srcSize` to fully read/decode frame header */
+    { size_t const fhsize = ZSTD_frameHeaderSize(src, srcSize);
+      if (srcSize < fhsize) return fhsize; }
+
+    {   BYTE const fhdByte = ip[4];
+        size_t pos = 5;
+        U32 const dictIDSizeCode = fhdByte&3;
+        U32 const checksumFlag = (fhdByte>>2)&1;
+        U32 const singleSegment = (fhdByte>>5)&1;
+        U32 const fcsID = fhdByte>>6;
+        U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX;
+        U32 windowSize = 0;
+        U32 dictID = 0;
+        U64 frameContentSize = 0;
+        if ((fhdByte & 0x08) != 0) return ERROR(frameParameter_unsupported);   /* reserved bits, which must be zero */
+        if (!singleSegment) {
+            BYTE const wlByte = ip[pos++];
+            U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN;
+            if (windowLog > ZSTD_WINDOWLOG_MAX) return ERROR(frameParameter_windowTooLarge);  /* avoids issue with 1 << windowLog */
+            windowSize = (1U << windowLog);
+            windowSize += (windowSize >> 3) * (wlByte&7);
+        }
+
+        switch(dictIDSizeCode)
+        {
+            default:   /* impossible */
+            case 0 : break;
+            case 1 : dictID = ip[pos]; pos++; break;
+            case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break;
+            case 3 : dictID = MEM_readLE32(ip+pos); pos+=4; break;
+        }
+        switch(fcsID)
+        {
+            default:   /* impossible */
+            case 0 : if (singleSegment) frameContentSize = ip[pos]; break;
+            case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break;
+            case 2 : frameContentSize = MEM_readLE32(ip+pos); break;
+            case 3 : frameContentSize = MEM_readLE64(ip+pos); break;
+        }
+        if (!windowSize) windowSize = (U32)frameContentSize;
+        if (windowSize > windowSizeMax) return ERROR(frameParameter_windowTooLarge);
+        fparamsPtr->frameContentSize = frameContentSize;
+        fparamsPtr->windowSize = windowSize;
+        fparamsPtr->dictID = dictID;
+        fparamsPtr->checksumFlag = checksumFlag;
+    }
+    return 0;
+}
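+
+/* Window descriptor example (illustrative ; assumes ZSTD_WINDOWLOG_ABSOLUTEMIN == 10) :
+ * wlByte == 0x01 => windowLog = (0x01>>3) + 10 = 10, base windowSize = 1024 ;
+ * each of the 3 low bits (here 1) adds windowSize/8, giving 1024 + 128 = 1152. */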
+
+/** ZSTD_getFrameContentSize() :
+*   compatible with legacy mode
+*   @return : decompressed size of the single frame pointed to by `src` if known, otherwise
+*             - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+*             - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */
+unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize)
+{
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT==1)
+    if (ZSTD_isLegacy(src, srcSize)) {
+        unsigned long long const ret = ZSTD_getDecompressedSize_legacy(src, srcSize);
+        return ret == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : ret;
+    }
+#endif
+    {
+        ZSTD_frameParams fParams;
+        if (ZSTD_getFrameParams(&fParams, src, srcSize) != 0) return ZSTD_CONTENTSIZE_ERROR;
+        if (fParams.windowSize == 0) {
+            /* Either skippable or empty frame, size == 0 either way */
+            return 0;
+        } else if (fParams.frameContentSize != 0) {
+            return fParams.frameContentSize;
+        } else {
+            return ZSTD_CONTENTSIZE_UNKNOWN;
+        }
+    }
+}
+
+/** ZSTD_findDecompressedSize() :
+ *  compatible with legacy mode
+ *  `srcSize` must be the exact length of some number of ZSTD compressed and/or
+ *      skippable frames
+ *  @return : decompressed size of the frames contained */
+unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
+{
+    {
+        unsigned long long totalDstSize = 0;
+        while (srcSize >= ZSTD_frameHeaderSize_prefix) {
+            const U32 magicNumber = MEM_readLE32(src);
+
+            if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+                size_t skippableSize;
+                if (srcSize < ZSTD_skippableHeaderSize)
+                    return ERROR(srcSize_wrong);
+                skippableSize = MEM_readLE32((const BYTE *)src + 4) +
+                                ZSTD_skippableHeaderSize;
+                if (srcSize < skippableSize) {
+                    return ZSTD_CONTENTSIZE_ERROR;
+                }
+
+                src = (const BYTE *)src + skippableSize;
+                srcSize -= skippableSize;
+                continue;
+            }
+
+            {
+                unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
+                if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret;
+
+                /* check for overflow */
+                if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR;
+                totalDstSize += ret;
+            }
+            {
+                size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize);
+                if (ZSTD_isError(frameSrcSize)) {
+                    return ZSTD_CONTENTSIZE_ERROR;
+                }
+
+                src = (const BYTE *)src + frameSrcSize;
+                srcSize -= frameSrcSize;
+            }
+        }
+
+        if (srcSize) {
+            return ZSTD_CONTENTSIZE_ERROR;
+        }
+
+        return totalDstSize;
+    }
+}
+
+/** ZSTD_getDecompressedSize() :
+*   compatible with legacy mode
+*   @return : decompressed size if known, 0 otherwise
+              note : 0 can mean any of the following :
+                   - decompressed size is not present within frame header
+                   - frame header unknown / not supported
+                   - frame header not complete (`srcSize` too small) */
+unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize)
+{
+    unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
+    return ret >= ZSTD_CONTENTSIZE_ERROR ? 0 : ret;
+}
+
+
+/** ZSTD_decodeFrameHeader() :
+*   `headerSize` must be the size provided by ZSTD_frameHeaderSize().
+*   @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */
+static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize)
+{
+    size_t const result = ZSTD_getFrameParams(&(dctx->fParams), src, headerSize);
+    if (ZSTD_isError(result)) return result;  /* invalid header */
+    if (result>0) return ERROR(srcSize_wrong);   /* headerSize too small */
+    if (dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID)) return ERROR(dictionary_wrong);
+    if (dctx->fParams.checksumFlag) XXH64_reset(&dctx->xxhState, 0);
+    return 0;
+}
+
+
+typedef struct
+{
+    blockType_e blockType;
+    U32 lastBlock;
+    U32 origSize;
+} blockProperties_t;
+
+/*! ZSTD_getcBlockSize() :
+*   Provides the size of compressed block from block header `src` */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr)
+{
+    if (srcSize < ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+    {   U32 const cBlockHeader = MEM_readLE24(src);
+        U32 const cSize = cBlockHeader >> 3;
+        bpPtr->lastBlock = cBlockHeader & 1;
+        bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
+        bpPtr->origSize = cSize;   /* only useful for RLE */
+        if (bpPtr->blockType == bt_rle) return 1;
+        if (bpPtr->blockType == bt_reserved) return ERROR(corruption_detected);
+        return cSize;
+    }
+}
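+
+/* Worked example, for illustration : a little-endian block header of 0x000021
+ * decodes as lastBlock = 1, blockType = (0x21>>1)&3 = 0 (bt_raw),
+ * and cSize = 0x21>>3 = 4, i.e. a final raw block of 4 bytes. */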
+
+
+static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    if (srcSize > dstCapacity) return ERROR(dstSize_tooSmall);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+
+static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, size_t regenSize)
+{
+    if (srcSize != 1) return ERROR(srcSize_wrong);
+    if (regenSize > dstCapacity) return ERROR(dstSize_tooSmall);
+    memset(dst, *(const BYTE*)src, regenSize);
+    return regenSize;
+}
+
+/*! ZSTD_decodeLiteralsBlock() :
+    @return : nb of bytes read from src (< srcSize ) */
+size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */
+{
+    if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected);
+
+    {   const BYTE* const istart = (const BYTE*) src;
+        symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
+
+        switch(litEncType)
+        {
+        case set_repeat:
+            if (dctx->litEntropy==0) return ERROR(dictionary_corrupted);
+            /* fall-through */
+        case set_compressed:
+            if (srcSize < 5) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */
+            {   size_t lhSize, litSize, litCSize;
+                U32 singleStream=0;
+                U32 const lhlCode = (istart[0] >> 2) & 3;
+                U32 const lhc = MEM_readLE32(istart);
+                switch(lhlCode)
+                {
+                case 0: case 1: default:   /* note : default is impossible, since lhlCode is in [0..3] */
+                    /* 2 - 2 - 10 - 10 */
+                    singleStream = !lhlCode;
+                    lhSize = 3;
+                    litSize  = (lhc >> 4) & 0x3FF;
+                    litCSize = (lhc >> 14) & 0x3FF;
+                    break;
+                case 2:
+                    /* 2 - 2 - 14 - 14 */
+                    lhSize = 4;
+                    litSize  = (lhc >> 4) & 0x3FFF;
+                    litCSize = lhc >> 18;
+                    break;
+                case 3:
+                    /* 2 - 2 - 18 - 18 */
+                    lhSize = 5;
+                    litSize  = (lhc >> 4) & 0x3FFFF;
+                    litCSize = (lhc >> 22) + (istart[4] << 10);
+                    break;
+                }
+                if (litSize > ZSTD_BLOCKSIZE_ABSOLUTEMAX) return ERROR(corruption_detected);
+                if (litCSize + lhSize > srcSize) return ERROR(corruption_detected);
+
+                if (HUF_isError((litEncType==set_repeat) ?
+                                    ( singleStream ?
+                                        HUF_decompress1X_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr) :
+                                        HUF_decompress4X_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr) ) :
+                                    ( singleStream ?
+                                        HUF_decompress1X2_DCtx(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize) :
+                                        HUF_decompress4X_hufOnly (dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize)) ))
+                    return ERROR(corruption_detected);
+
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                dctx->litEntropy = 1;
+                if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
+                memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+                return litCSize + lhSize;
+            }
+
+        case set_basic:
+            {   size_t litSize, lhSize;
+                U32 const lhlCode = ((istart[0]) >> 2) & 3;
+                switch(lhlCode)
+                {
+                case 0: case 2: default:   /* note : default is impossible, since lhlCode is in [0..3] */
+                    lhSize = 1;
+                    litSize = istart[0] >> 3;
+                    break;
+                case 1:
+                    lhSize = 2;
+                    litSize = MEM_readLE16(istart) >> 4;
+                    break;
+                case 3:
+                    lhSize = 3;
+                    litSize = MEM_readLE24(istart) >> 4;
+                    break;
+                }
+
+                if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
+                    if (litSize+lhSize > srcSize) return ERROR(corruption_detected);
+                    memcpy(dctx->litBuffer, istart+lhSize, litSize);
+                    dctx->litPtr = dctx->litBuffer;
+                    dctx->litSize = litSize;
+                    memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+                    return lhSize+litSize;
+                }
+                /* direct reference into compressed stream */
+                dctx->litPtr = istart+lhSize;
+                dctx->litSize = litSize;
+                return lhSize+litSize;
+            }
+
+        case set_rle:
+            {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
+                size_t litSize, lhSize;
+                switch(lhlCode)
+                {
+                case 0: case 2: default:   /* note : default is impossible, since lhlCode is in [0..3] */
+                    lhSize = 1;
+                    litSize = istart[0] >> 3;
+                    break;
+                case 1:
+                    lhSize = 2;
+                    litSize = MEM_readLE16(istart) >> 4;
+                    break;
+                case 3:
+                    lhSize = 3;
+                    litSize = MEM_readLE24(istart) >> 4;
+                    if (srcSize<4) return ERROR(corruption_detected);   /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */
+                    break;
+                }
+                if (litSize > ZSTD_BLOCKSIZE_ABSOLUTEMAX) return ERROR(corruption_detected);
+                memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+                dctx->litPtr = dctx->litBuffer;
+                dctx->litSize = litSize;
+                return lhSize+1;
+            }
+        default:
+            return ERROR(corruption_detected);   /* impossible */
+        }
+    }
+}
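+
+/* Literals header example, for illustration : a first byte of 0x28 decodes as
+ * litEncType = 0x28 & 3 = 0 (set_basic) and lhlCode = (0x28>>2) & 3 = 2,
+ * i.e. a 1-byte header declaring litSize = 0x28>>3 = 5 raw literal bytes. */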
+
+
+typedef union {
+    FSE_decode_t realData;
+    U32 alignedBy4;
+} FSE_decode_t4;
+
+static const FSE_decode_t4 LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {
+    { { LL_DEFAULTNORMLOG, 1, 1 } }, /* header : tableLog, fastMode, fastMode */
+    { {  0,  0,  4 } },              /* 0 : base, symbol, bits */
+    { { 16,  0,  4 } },
+    { { 32,  1,  5 } },
+    { {  0,  3,  5 } },
+    { {  0,  4,  5 } },
+    { {  0,  6,  5 } },
+    { {  0,  7,  5 } },
+    { {  0,  9,  5 } },
+    { {  0, 10,  5 } },
+    { {  0, 12,  5 } },
+    { {  0, 14,  6 } },
+    { {  0, 16,  5 } },
+    { {  0, 18,  5 } },
+    { {  0, 19,  5 } },
+    { {  0, 21,  5 } },
+    { {  0, 22,  5 } },
+    { {  0, 24,  5 } },
+    { { 32, 25,  5 } },
+    { {  0, 26,  5 } },
+    { {  0, 27,  6 } },
+    { {  0, 29,  6 } },
+    { {  0, 31,  6 } },
+    { { 32,  0,  4 } },
+    { {  0,  1,  4 } },
+    { {  0,  2,  5 } },
+    { { 32,  4,  5 } },
+    { {  0,  5,  5 } },
+    { { 32,  7,  5 } },
+    { {  0,  8,  5 } },
+    { { 32, 10,  5 } },
+    { {  0, 11,  5 } },
+    { {  0, 13,  6 } },
+    { { 32, 16,  5 } },
+    { {  0, 17,  5 } },
+    { { 32, 19,  5 } },
+    { {  0, 20,  5 } },
+    { { 32, 22,  5 } },
+    { {  0, 23,  5 } },
+    { {  0, 25,  4 } },
+    { { 16, 25,  4 } },
+    { { 32, 26,  5 } },
+    { {  0, 28,  6 } },
+    { {  0, 30,  6 } },
+    { { 48,  0,  4 } },
+    { { 16,  1,  4 } },
+    { { 32,  2,  5 } },
+    { { 32,  3,  5 } },
+    { { 32,  5,  5 } },
+    { { 32,  6,  5 } },
+    { { 32,  8,  5 } },
+    { { 32,  9,  5 } },
+    { { 32, 11,  5 } },
+    { { 32, 12,  5 } },
+    { {  0, 15,  6 } },
+    { { 32, 17,  5 } },
+    { { 32, 18,  5 } },
+    { { 32, 20,  5 } },
+    { { 32, 21,  5 } },
+    { { 32, 23,  5 } },
+    { { 32, 24,  5 } },
+    { {  0, 35,  6 } },
+    { {  0, 34,  6 } },
+    { {  0, 33,  6 } },
+    { {  0, 32,  6 } },
+};   /* LL_defaultDTable */
+
+static const FSE_decode_t4 ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
+    { { ML_DEFAULTNORMLOG, 1, 1 } }, /* header : tableLog, fastMode, fastMode */
+    { {  0,  0,  6 } },              /* 0 : base, symbol, bits */
+    { {  0,  1,  4 } },
+    { { 32,  2,  5 } },
+    { {  0,  3,  5 } },
+    { {  0,  5,  5 } },
+    { {  0,  6,  5 } },
+    { {  0,  8,  5 } },
+    { {  0, 10,  6 } },
+    { {  0, 13,  6 } },
+    { {  0, 16,  6 } },
+    { {  0, 19,  6 } },
+    { {  0, 22,  6 } },
+    { {  0, 25,  6 } },
+    { {  0, 28,  6 } },
+    { {  0, 31,  6 } },
+    { {  0, 33,  6 } },
+    { {  0, 35,  6 } },
+    { {  0, 37,  6 } },
+    { {  0, 39,  6 } },
+    { {  0, 41,  6 } },
+    { {  0, 43,  6 } },
+    { {  0, 45,  6 } },
+    { { 16,  1,  4 } },
+    { {  0,  2,  4 } },
+    { { 32,  3,  5 } },
+    { {  0,  4,  5 } },
+    { { 32,  6,  5 } },
+    { {  0,  7,  5 } },
+    { {  0,  9,  6 } },
+    { {  0, 12,  6 } },
+    { {  0, 15,  6 } },
+    { {  0, 18,  6 } },
+    { {  0, 21,  6 } },
+    { {  0, 24,  6 } },
+    { {  0, 27,  6 } },
+    { {  0, 30,  6 } },
+    { {  0, 32,  6 } },
+    { {  0, 34,  6 } },
+    { {  0, 36,  6 } },
+    { {  0, 38,  6 } },
+    { {  0, 40,  6 } },
+    { {  0, 42,  6 } },
+    { {  0, 44,  6 } },
+    { { 32,  1,  4 } },
+    { { 48,  1,  4 } },
+    { { 16,  2,  4 } },
+    { { 32,  4,  5 } },
+    { { 32,  5,  5 } },
+    { { 32,  7,  5 } },
+    { { 32,  8,  5 } },
+    { {  0, 11,  6 } },
+    { {  0, 14,  6 } },
+    { {  0, 17,  6 } },
+    { {  0, 20,  6 } },
+    { {  0, 23,  6 } },
+    { {  0, 26,  6 } },
+    { {  0, 29,  6 } },
+    { {  0, 52,  6 } },
+    { {  0, 51,  6 } },
+    { {  0, 50,  6 } },
+    { {  0, 49,  6 } },
+    { {  0, 48,  6 } },
+    { {  0, 47,  6 } },
+    { {  0, 46,  6 } },
+};   /* ML_defaultDTable */
+
+static const FSE_decode_t4 OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {
+    { { OF_DEFAULTNORMLOG, 1, 1 } }, /* header : tableLog, fastMode, fastMode */
+    { {  0,  0,  5 } },              /* 0 : base, symbol, bits */
+    { {  0,  6,  4 } },
+    { {  0,  9,  5 } },
+    { {  0, 15,  5 } },
+    { {  0, 21,  5 } },
+    { {  0,  3,  5 } },
+    { {  0,  7,  4 } },
+    { {  0, 12,  5 } },
+    { {  0, 18,  5 } },
+    { {  0, 23,  5 } },
+    { {  0,  5,  5 } },
+    { {  0,  8,  4 } },
+    { {  0, 14,  5 } },
+    { {  0, 20,  5 } },
+    { {  0,  2,  5 } },
+    { { 16,  7,  4 } },
+    { {  0, 11,  5 } },
+    { {  0, 17,  5 } },
+    { {  0, 22,  5 } },
+    { {  0,  4,  5 } },
+    { { 16,  8,  4 } },
+    { {  0, 13,  5 } },
+    { {  0, 19,  5 } },
+    { {  0,  1,  5 } },
+    { { 16,  6,  4 } },
+    { {  0, 10,  5 } },
+    { {  0, 16,  5 } },
+    { {  0, 28,  5 } },
+    { {  0, 27,  5 } },
+    { {  0, 26,  5 } },
+    { {  0, 25,  5 } },
+    { {  0, 24,  5 } },
+};   /* OF_defaultDTable */
+
+/*! ZSTD_buildSeqTable() :
+    @return : nb bytes read from src,
+              or an error code if it fails, testable with ZSTD_isError()
+*/
+static size_t ZSTD_buildSeqTable(FSE_DTable* DTableSpace, const FSE_DTable** DTablePtr,
+                                 symbolEncodingType_e type, U32 max, U32 maxLog,
+                                 const void* src, size_t srcSize,
+                                 const FSE_decode_t4* defaultTable, U32 flagRepeatTable)
+{
+    const void* const tmpPtr = defaultTable;   /* bypass strict aliasing */
+    switch(type)
+    {
+    case set_rle :
+        if (!srcSize) return ERROR(srcSize_wrong);
+        if ( (*(const BYTE*)src) > max) return ERROR(corruption_detected);
+        FSE_buildDTable_rle(DTableSpace, *(const BYTE*)src);
+        *DTablePtr = DTableSpace;
+        return 1;
+    case set_basic :
+        *DTablePtr = (const FSE_DTable*)tmpPtr;
+        return 0;
+    case set_repeat:
+        if (!flagRepeatTable) return ERROR(corruption_detected);
+        return 0;
+    default :   /* impossible */
+    case set_compressed :
+        {   U32 tableLog;
+            S16 norm[MaxSeq+1];
+            size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
+            if (FSE_isError(headerSize)) return ERROR(corruption_detected);
+            if (tableLog > maxLog) return ERROR(corruption_detected);
+            FSE_buildDTable(DTableSpace, norm, max, tableLog);
+            *DTablePtr = DTableSpace;
+            return headerSize;
+    }   }
+}
+
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+                             const void* src, size_t srcSize)
+{
+    const BYTE* const istart = (const BYTE* const)src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip = istart;
+
+    /* check */
+    if (srcSize < MIN_SEQUENCES_SIZE) return ERROR(srcSize_wrong);
+
+    /* SeqHead */
+    {   int nbSeq = *ip++;
+        if (!nbSeq) { *nbSeqPtr=0; return 1; }
+        if (nbSeq > 0x7F) {
+            if (nbSeq == 0xFF) {
+                if (ip+2 > iend) return ERROR(srcSize_wrong);
+                nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+            } else {
+                if (ip >= iend) return ERROR(srcSize_wrong);
+                nbSeq = ((nbSeq-0x80)<<8) + *ip++;
+            }
+        }
+        *nbSeqPtr = nbSeq;
+    }
+
+    /* FSE table descriptors */
+    if (ip+4 > iend) return ERROR(srcSize_wrong); /* minimum possible size */
+    {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
+        symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
+        symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
+        ip++;
+
+        /* Build DTables */
+        {   size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
+                                                      LLtype, MaxLL, LLFSELog,
+                                                      ip, iend-ip, LL_defaultDTable, dctx->fseEntropy);
+            if (ZSTD_isError(llhSize)) return ERROR(corruption_detected);
+            ip += llhSize;
+        }
+        {   size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
+                                                      OFtype, MaxOff, OffFSELog,
+                                                      ip, iend-ip, OF_defaultDTable, dctx->fseEntropy);
+            if (ZSTD_isError(ofhSize)) return ERROR(corruption_detected);
+            ip += ofhSize;
+        }
+        {   size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
+                                                      MLtype, MaxML, MLFSELog,
+                                                      ip, iend-ip, ML_defaultDTable, dctx->fseEntropy);
+            if (ZSTD_isError(mlhSize)) return ERROR(corruption_detected);
+            ip += mlhSize;
+        }
+    }
+
+    return ip-istart;
+}
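+
+/* SeqHead examples, for illustration :
+ *   first byte 0x02              => nbSeq = 2
+ *   first byte 0x81, next 0x05   => nbSeq = ((0x81-0x80)<<8) + 0x05 = 261
+ *   first byte 0xFF, next LE16 n => nbSeq = n + LONGNBSEQ */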
+
+
+typedef struct {
+    size_t litLength;
+    size_t matchLength;
+    size_t offset;
+    const BYTE* match;
+} seq_t;
+
+typedef struct {
+    BIT_DStream_t DStream;
+    FSE_DState_t stateLL;
+    FSE_DState_t stateOffb;
+    FSE_DState_t stateML;
+    size_t prevOffset[ZSTD_REP_NUM];
+    const BYTE* base;
+    size_t pos;
+    iPtrDiff gotoDict;
+} seqState_t;
+
+
+FORCE_NOINLINE
+size_t ZSTD_execSequenceLast7(BYTE* op,
+                              BYTE* const oend, seq_t sequence,
+                              const BYTE** litPtr, const BYTE* const litLimit,
+                              const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check */
+    if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+    if (oLitEnd <= oend_w) return ERROR(GENERIC);   /* Precondition : only reached when oLitEnd > oend_w, i.e. the sequence ends close to oend */
+
+    /* copy literals */
+    if (op < oend_w) {
+        ZSTD_wildcopy(op, *litPtr, oend_w - op);
+        *litPtr += oend_w - op;
+        op = oend_w;
+    }
+    while (op < oLitEnd) *op++ = *(*litPtr)++;
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected);
+        match = dictEnd - (base-match);
+        if (match + sequence.matchLength <= dictEnd) {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
+    }   }
+    while (op < oMatchEnd) *op++ = *match++;
+    return sequenceLength;
+}
+
+
+
+
+static seq_t ZSTD_decodeSequence(seqState_t* seqState)
+{
+    seq_t seq;
+
+    U32 const llCode = FSE_peekSymbol(&seqState->stateLL);
+    U32 const mlCode = FSE_peekSymbol(&seqState->stateML);
+    U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb);   /* <= maxOff, by table construction */
+
+    U32 const llBits = LL_bits[llCode];
+    U32 const mlBits = ML_bits[mlCode];
+    U32 const ofBits = ofCode;
+    U32 const totalBits = llBits+mlBits+ofBits;
+
+    static const U32 LL_base[MaxLL+1] = {
+                             0,  1,  2,  3,  4,  5,  6,  7,  8,  9,   10,    11,    12,    13,    14,     15,
+                            16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                            0x2000, 0x4000, 0x8000, 0x10000 };
+
+    static const U32 ML_base[MaxML+1] = {
+                             3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,   14,    15,    16,    17,    18,
+                            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,   30,    31,    32,    33,    34,
+                            35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+                            0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
+
+    static const U32 OF_base[MaxOff+1] = {
+                             0,        1,       1,       5,     0xD,     0x1D,     0x3D,     0x7D,
+                             0xFD,   0x1FD,   0x3FD,   0x7FD,   0xFFD,   0x1FFD,   0x3FFD,   0x7FFD,
+                             0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+                             0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD };
+
+    /* sequence */
+    {   size_t offset;
+        if (!ofCode)
+            offset = 0;
+        else {
+            offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
+            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+        }
+
+        if (ofCode <= 1) {
+            offset += (llCode==0);
+            if (offset) {
+                size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+                temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
+                if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+                seqState->prevOffset[1] = seqState->prevOffset[0];
+                seqState->prevOffset[0] = offset = temp;
+            } else {
+                offset = seqState->prevOffset[0];
+            }
+        } else {
+            seqState->prevOffset[2] = seqState->prevOffset[1];
+            seqState->prevOffset[1] = seqState->prevOffset[0];
+            seqState->prevOffset[0] = offset;
+        }
+        seq.offset = offset;
+    }
+
+    seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0);  /* <=  16 bits */
+    if (MEM_32bits() && (mlBits+llBits>24)) BIT_reloadDStream(&seqState->DStream);
+
+    seq.litLength = LL_base[llCode] + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0);    /* <=  16 bits */
+    if (MEM_32bits() ||
+       (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BIT_reloadDStream(&seqState->DStream);
+
+    /* ANS state update */
+    FSE_updateState(&seqState->stateLL, &seqState->DStream);    /* <=  9 bits */
+    FSE_updateState(&seqState->stateML, &seqState->DStream);    /* <=  9 bits */
+    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
+    FSE_updateState(&seqState->stateOffb, &seqState->DStream);  /* <=  8 bits */
+
+    return seq;
+}
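+
+/* note on the repcode scheme above : when ofCode <= 1, the decoded value
+ * (after the llCode==0 adjustment) selects a recent offset instead of a
+ * literal one : 0 reuses prevOffset[0] unchanged, 1 and 2 promote
+ * prevOffset[1] / prevOffset[2] to the front with rotation, and 3 means
+ * prevOffset[0]-1, also pushed to the front. */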
+
+
+FORCE_INLINE
+size_t ZSTD_execSequence(BYTE* op,
+                         BYTE* const oend, seq_t sequence,
+                         const BYTE** litPtr, const BYTE* const litLimit,
+                         const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = oLitEnd - sequence.offset;
+
+    /* check */
+    if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+    if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd);
+
+    /* copy Literals */
+    ZSTD_copy8(op, *litPtr);
+    if (sequence.litLength > 8)
+        ZSTD_wildcopy(op+8, (*litPtr)+8, sequence.litLength - 8);   /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* update for next sequence */
+
+    /* copy Match */
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected);
+        match += (dictEnd-base);
+        if (match + sequence.matchLength <= dictEnd) {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
+            if (op > oend_w || sequence.matchLength < MINMATCH) {
+              U32 i;
+              for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i];
+              return sequenceLength;
+            }
+    }   }
+    /* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
+        int const sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTD_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTD_copy8(op, match);
+    }
+    op += 8; match += 8;
+
+    if (oMatchEnd > oend-(16-MINMATCH)) {
+        if (op < oend_w) {
+            ZSTD_wildcopy(op, match, oend_w - op);
+            match += oend_w - op;
+            op = oend_w;
+        }
+        while (op < oMatchEnd) *op++ = *match++;
+    } else {
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
+}
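+
+/* note on the offset<8 path above : such a match overlaps the bytes being
+ * written, so a plain 8-byte copy would read not-yet-written data ; the copy
+ * is bootstrapped 4 bytes at a time, with dec32table/dec64table repositioning
+ * `match` so that subsequent 8-byte copies remain safe. */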
+
+
+static size_t ZSTD_decompressSequences(
+                               ZSTD_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + maxDstSize;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    const BYTE* const base = (const BYTE*) (dctx->base);
+    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+    int nbSeq;
+
+    /* Build Decoding Tables */
+    {   size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize);
+        if (ZSTD_isError(seqHSize)) return seqHSize;
+        ip += seqHSize;
+    }
+
+    /* Regen sequences */
+    if (nbSeq) {
+        seqState_t seqState;
+        dctx->fseEntropy = 1;
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+        CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected);
+        FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+        FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+        FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) {
+            nbSeq--;
+            {   seq_t const sequence = ZSTD_decodeSequence(&seqState);
+                size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd);
+                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+                op += oneSeqSize;
+        }   }
+
+        /* check if reached exact end */
+        if (nbSeq) return ERROR(corruption_detected);
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+    }
+
+    /* last literal segment */
+    {   size_t const lastLLSize = litEnd - litPtr;
+        if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall);
+        memcpy(op, litPtr, lastLLSize);
+        op += lastLLSize;
+    }
+
+    return op-ostart;
+}
+
+
+FORCE_INLINE seq_t ZSTD_decodeSequenceLong_generic(seqState_t* seqState, int const longOffsets)
+{
+    seq_t seq;
+
+    U32 const llCode = FSE_peekSymbol(&seqState->stateLL);
+    U32 const mlCode = FSE_peekSymbol(&seqState->stateML);
+    U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb);   /* <= maxOff, by table construction */
+
+    U32 const llBits = LL_bits[llCode];
+    U32 const mlBits = ML_bits[mlCode];
+    U32 const ofBits = ofCode;
+    U32 const totalBits = llBits+mlBits+ofBits;
+
+    static const U32 LL_base[MaxLL+1] = {
+                             0,  1,  2,  3,  4,  5,  6,  7,  8,  9,   10,    11,    12,    13,    14,     15,
+                            16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+                            0x2000, 0x4000, 0x8000, 0x10000 };
+
+    static const U32 ML_base[MaxML+1] = {
+                             3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,   14,    15,    16,    17,    18,
+                            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,   30,    31,    32,    33,    34,
+                            35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+                            0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
+
+    static const U32 OF_base[MaxOff+1] = {
+                             0,        1,       1,       5,     0xD,     0x1D,     0x3D,     0x7D,
+                             0xFD,   0x1FD,   0x3FD,   0x7FD,   0xFFD,   0x1FFD,   0x3FFD,   0x7FFD,
+                             0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+                             0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD };
+
+    /* sequence */
+    {   size_t offset;
+        if (!ofCode)
+            offset = 0;
+        else {
+            if (longOffsets) {
+                int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN);
+                offset = OF_base[ofCode] + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+                if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream);
+                if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+            } else {
+                offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
+                if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+            }
+        }
+
+        if (ofCode <= 1) {
+            offset += (llCode==0);
+            if (offset) {
+                size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+                temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
+                if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+                seqState->prevOffset[1] = seqState->prevOffset[0];
+                seqState->prevOffset[0] = offset = temp;
+            } else {
+                offset = seqState->prevOffset[0];
+            }
+        } else {
+            seqState->prevOffset[2] = seqState->prevOffset[1];
+            seqState->prevOffset[1] = seqState->prevOffset[0];
+            seqState->prevOffset[0] = offset;
+        }
+        seq.offset = offset;
+    }
+
+    seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0);  /* <=  16 bits */
+    if (MEM_32bits() && (mlBits+llBits>24)) BIT_reloadDStream(&seqState->DStream);
+
+    seq.litLength = LL_base[llCode] + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0);    /* <=  16 bits */
+    if (MEM_32bits() ||
+       (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BIT_reloadDStream(&seqState->DStream);
+
+    {   size_t const pos = seqState->pos + seq.litLength;
+        seq.match = seqState->base + pos - seq.offset;    /* single memory segment */
+        if (seq.offset > pos) seq.match += seqState->gotoDict;   /* separate memory segment */
+        seqState->pos = pos + seq.matchLength;
+    }
+
+    /* ANS state update */
+    FSE_updateState(&seqState->stateLL, &seqState->DStream);    /* <=  9 bits */
+    FSE_updateState(&seqState->stateML, &seqState->DStream);    /* <=  9 bits */
+    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
+    FSE_updateState(&seqState->stateOffb, &seqState->DStream);  /* <=  8 bits */
+
+    return seq;
+}
+
+static seq_t ZSTD_decodeSequenceLong(seqState_t* seqState, unsigned const windowSize) {
+    if (ZSTD_highbit32(windowSize) > STREAM_ACCUMULATOR_MIN) {
+        return ZSTD_decodeSequenceLong_generic(seqState, 1);
+    } else {
+        return ZSTD_decodeSequenceLong_generic(seqState, 0);
+    }
+}
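+
+/* note : the longOffsets path splits the ofBits read in two, with a reload in
+ * between, for windows so large that the bit container (guaranteed to hold at
+ * least STREAM_ACCUMULATOR_MIN bits) may not hold all offset bits at once --
+ * mainly a concern on 32-bit builds. */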
+
+FORCE_INLINE
+size_t ZSTD_execSequenceLong(BYTE* op,
+                                BYTE* const oend, seq_t sequence,
+                                const BYTE** litPtr, const BYTE* const litLimit,
+                                const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd)
+{
+    BYTE* const oLitEnd = op + sequence.litLength;
+    size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */
+    BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+    const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+    const BYTE* match = sequence.match;
+
+    /* check */
+#if 1
+    if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */
+    if (iLitEnd > litLimit) return ERROR(corruption_detected);   /* over-read beyond lit buffer */
+    if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd);
+#endif
+
+    /* copy Literals */
+    ZSTD_copy8(op, *litPtr);
+    if (sequence.litLength > 8)
+        ZSTD_wildcopy(op+8, (*litPtr)+8, sequence.litLength - 8);   /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+    op = oLitEnd;
+    *litPtr = iLitEnd;   /* update for next sequence */
+
+    /* copy Match */
+#if 1
+    if (sequence.offset > (size_t)(oLitEnd - base)) {
+        /* offset beyond prefix */
+        if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected);
+        if (match + sequence.matchLength <= dictEnd) {
+            memmove(oLitEnd, match, sequence.matchLength);
+            return sequenceLength;
+        }
+        /* span extDict & currentPrefixSegment */
+        {   size_t const length1 = dictEnd - match;
+            memmove(oLitEnd, match, length1);
+            op = oLitEnd + length1;
+            sequence.matchLength -= length1;
+            match = base;
+            if (op > oend_w || sequence.matchLength < MINMATCH) {
+              U32 i;
+              for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i];
+              return sequenceLength;
+            }
+    }   }
+    /* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */
+#endif
+
+    /* match within prefix */
+    if (sequence.offset < 8) {
+        /* close range match, overlap */
+        static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };   /* added */
+        static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 };   /* subtracted */
+        int const sub2 = dec64table[sequence.offset];
+        op[0] = match[0];
+        op[1] = match[1];
+        op[2] = match[2];
+        op[3] = match[3];
+        match += dec32table[sequence.offset];
+        ZSTD_copy4(op+4, match);
+        match -= sub2;
+    } else {
+        ZSTD_copy8(op, match);
+    }
+    op += 8; match += 8;
+
+    if (oMatchEnd > oend-(16-MINMATCH)) {
+        if (op < oend_w) {
+            ZSTD_wildcopy(op, match, oend_w - op);
+            match += oend_w - op;
+            op = oend_w;
+        }
+        while (op < oMatchEnd) *op++ = *match++;
+    } else {
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8);   /* works even if matchLength < 8 */
+    }
+    return sequenceLength;
+}
+
+static size_t ZSTD_decompressSequencesLong(
+                               ZSTD_DCtx* dctx,
+                               void* dst, size_t maxDstSize,
+                         const void* seqStart, size_t seqSize)
+{
+    const BYTE* ip = (const BYTE*)seqStart;
+    const BYTE* const iend = ip + seqSize;
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + maxDstSize;
+    BYTE* op = ostart;
+    const BYTE* litPtr = dctx->litPtr;
+    const BYTE* const litEnd = litPtr + dctx->litSize;
+    const BYTE* const base = (const BYTE*) (dctx->base);
+    const BYTE* const vBase = (const BYTE*) (dctx->vBase);
+    const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+    unsigned const windowSize = dctx->fParams.windowSize;
+    int nbSeq;
+
+    /* Build Decoding Tables */
+    {   size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize);
+        if (ZSTD_isError(seqHSize)) return seqHSize;
+        ip += seqHSize;
+    }
+
+    /* Regen sequences */
+    if (nbSeq) {
+#define STORED_SEQS 4
+#define STOSEQ_MASK (STORED_SEQS-1)
+#define ADVANCED_SEQS 4
+        seq_t sequences[STORED_SEQS];
+        int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
+        seqState_t seqState;
+        int seqNb;
+        dctx->fseEntropy = 1;
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+        seqState.base = base;
+        seqState.pos = (size_t)(op-base);
+        seqState.gotoDict = (iPtrDiff)(dictEnd - base);
+        CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected);
+        FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+        FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+        FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+        /* prepare in advance */
+        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNb<seqAdvance; seqNb++) {
+            sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, windowSize);
+        }
+        if (seqNb<seqAdvance) return ERROR(corruption_detected);
+
+        /* decode and decompress */
+        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && seqNb<nbSeq ; seqNb++) {
+            seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, windowSize);
+            size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STOSEQ_MASK], &litPtr, litEnd, base, vBase, dictEnd);
+            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+            ZSTD_PREFETCH(sequence.match);
+            sequences[seqNb&STOSEQ_MASK] = sequence;
+            op += oneSeqSize;
+        }
+        if (seqNb<nbSeq) return ERROR(corruption_detected);
+
+        /* finish queue */
+        seqNb -= seqAdvance;
+        for ( ; seqNb<nbSeq ; seqNb++) {
+            size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[seqNb&STOSEQ_MASK], &litPtr, litEnd, base, vBase, dictEnd);
+            if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+            op += oneSeqSize;
+        }
+
+        /* save reps for next block */
+        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+    }
+
+    /* last literal segment */
+    {   size_t const lastLLSize = litEnd - litPtr;
+        if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall);
+        memcpy(op, litPtr, lastLLSize);
+        op += lastLLSize;
+    }
+
+    return op-ostart;
+}
+
+
+static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{   /* blockType == blockCompressed */
+    const BYTE* ip = (const BYTE*)src;
+
+    if (srcSize >= ZSTD_BLOCKSIZE_ABSOLUTEMAX) return ERROR(srcSize_wrong);
+
+    /* Decode literals section */
+    {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+        if (ZSTD_isError(litCSize)) return litCSize;
+        ip += litCSize;
+        srcSize -= litCSize;
+    }
+    if (sizeof(size_t) > 4)  /* do not enable prefetching on 32-bits x86, as it's performance detrimental */
+                             /* likely because of register pressure */
+                             /* if that's the correct cause, then 32-bits ARM should be affected differently */
+                             /* it would be good to test this on ARM real hardware, to see if prefetch version improves speed */
+        if (dctx->fParams.windowSize > (1<<23))
+            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize);
+    return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize);
+}
+
+
+static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
+{
+    if (dst != dctx->previousDstEnd) {   /* not contiguous */
+        dctx->dictEnd = dctx->previousDstEnd;
+        dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
+        dctx->base = dst;
+        dctx->previousDstEnd = dst;
+    }
+}
+
+size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    size_t dSize;
+    ZSTD_checkContinuity(dctx, dst);
+    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+    dctx->previousDstEnd = (char*)dst + dSize;
+    return dSize;
+}
+
+
+/** ZSTD_insertBlock() :
+    insert `blockStart` block into `dctx` history. Useful to track uncompressed blocks. */
+ZSTDLIB_API size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize)
+{
+    ZSTD_checkContinuity(dctx, blockStart);
+    dctx->previousDstEnd = (const char*)blockStart + blockSize;
+    return blockSize;
+}
+
+
+size_t ZSTD_generateNxBytes(void* dst, size_t dstCapacity, BYTE byte, size_t length)
+{
+    if (length > dstCapacity) return ERROR(dstSize_tooSmall);
+    memset(dst, byte, length);
+    return length;
+}
+
+/** ZSTD_findFrameCompressedSize() :
+ *  compatible with legacy mode
+ *  `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame
+ *  `srcSize` must be at least as large as the frame it contains
+ *  @return : the compressed size of the frame starting at `src` */
+size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
+{
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT==1)
+    if (ZSTD_isLegacy(src, srcSize)) return ZSTD_findFrameCompressedSizeLegacy(src, srcSize);
+#endif
+    if (srcSize >= ZSTD_skippableHeaderSize &&
+            (MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+        return ZSTD_skippableHeaderSize + MEM_readLE32((const BYTE*)src + 4);
+    } else {
+        const BYTE* ip = (const BYTE*)src;
+        const BYTE* const ipstart = ip;
+        size_t remainingSize = srcSize;
+        ZSTD_frameParams fParams;
+
+        size_t const headerSize = ZSTD_frameHeaderSize(ip, remainingSize);
+        if (ZSTD_isError(headerSize)) return headerSize;
+
+        /* Frame Header */
+        {   size_t const ret = ZSTD_getFrameParams(&fParams, ip, remainingSize);
+            if (ZSTD_isError(ret)) return ret;
+            if (ret > 0) return ERROR(srcSize_wrong);
+        }
+
+        ip += headerSize;
+        remainingSize -= headerSize;
+
+        /* Loop on each block */
+        while (1) {
+            blockProperties_t blockProperties;
+            size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+            if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+            if (ZSTD_blockHeaderSize + cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+            ip += ZSTD_blockHeaderSize + cBlockSize;
+            remainingSize -= ZSTD_blockHeaderSize + cBlockSize;
+
+            if (blockProperties.lastBlock) break;
+        }
+
+        if (fParams.checksumFlag) {   /* Frame content checksum */
+            if (remainingSize < 4) return ERROR(srcSize_wrong);
+            ip += 4;
+            remainingSize -= 4;
+        }
+
+        return ip - ipstart;
+    }
+}
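+
+/* Usage sketch (illustrative only; `countFrames` is a hypothetical helper,
+ * assuming a standalone example file) : ZSTD_findFrameCompressedSize() makes
+ * it easy to walk a buffer holding several concatenated frames. Note : in
+ * this version the function is exposed under ZSTD_STATIC_LINKING_ONLY. */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <zstd.h>
+
+static size_t countFrames(const void* buf, size_t bufSize)
+{
+    const char* ip = (const char*)buf;
+    size_t remaining = bufSize;
+    size_t nbFrames = 0;
+    while (remaining) {
+        size_t const frameSize = ZSTD_findFrameCompressedSize(ip, remaining);
+        if (ZSTD_isError(frameSize)) return (size_t)-1;   /* malformed input */
+        if (frameSize > remaining) return (size_t)-1;     /* truncated skippable frame */
+        ip += frameSize;
+        remaining -= frameSize;
+        nbFrames++;
+    }
+    return nbFrames;
+}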
+
+/*! ZSTD_decompressFrame() :
+*   @dctx must be properly initialized */
+static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
+                                 void* dst, size_t dstCapacity,
+                                 const void** srcPtr, size_t *srcSizePtr)
+{
+    const BYTE* ip = (const BYTE*)(*srcPtr);
+    BYTE* const ostart = (BYTE* const)dst;
+    BYTE* const oend = ostart + dstCapacity;
+    BYTE* op = ostart;
+    size_t remainingSize = *srcSizePtr;
+
+    /* check */
+    if (remainingSize < ZSTD_frameHeaderSize_min+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+
+    /* Frame Header */
+    {   size_t const frameHeaderSize = ZSTD_frameHeaderSize(ip, ZSTD_frameHeaderSize_prefix);
+        if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
+        if (remainingSize < frameHeaderSize+ZSTD_blockHeaderSize) return ERROR(srcSize_wrong);
+        CHECK_F(ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize));
+        ip += frameHeaderSize; remainingSize -= frameHeaderSize;
+    }
+
+    /* Loop on each block */
+    while (1) {
+        size_t decodedSize;
+        blockProperties_t blockProperties;
+        size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+        if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+        ip += ZSTD_blockHeaderSize;
+        remainingSize -= ZSTD_blockHeaderSize;
+        if (cBlockSize > remainingSize) return ERROR(srcSize_wrong);
+
+        switch(blockProperties.blockType)
+        {
+        case bt_compressed:
+            decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize);
+            break;
+        case bt_raw :
+            decodedSize = ZSTD_copyRawBlock(op, oend-op, ip, cBlockSize);
+            break;
+        case bt_rle :
+            decodedSize = ZSTD_generateNxBytes(op, oend-op, *ip, blockProperties.origSize);
+            break;
+        case bt_reserved :
+        default:
+            return ERROR(corruption_detected);
+        }
+
+        if (ZSTD_isError(decodedSize)) return decodedSize;
+        if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, op, decodedSize);
+        op += decodedSize;
+        ip += cBlockSize;
+        remainingSize -= cBlockSize;
+        if (blockProperties.lastBlock) break;
+    }
+
+    if (dctx->fParams.checksumFlag) {   /* Frame content checksum verification */
+        U32 const checkCalc = (U32)XXH64_digest(&dctx->xxhState);
+        U32 checkRead;
+        if (remainingSize<4) return ERROR(checksum_wrong);
+        checkRead = MEM_readLE32(ip);
+        if (checkRead != checkCalc) return ERROR(checksum_wrong);
+        ip += 4;
+        remainingSize -= 4;
+    }
+
+    /* Allow caller to get size read */
+    *srcPtr = ip;
+    *srcSizePtr = remainingSize;
+    return op-ostart;
+}
+
+static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict);
+static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict);
+
+static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
+                                        void* dst, size_t dstCapacity,
+                                  const void* src, size_t srcSize,
+                                  const void *dict, size_t dictSize,
+                                  const ZSTD_DDict* ddict)
+{
+    void* const dststart = dst;
+
+    if (ddict) {
+        if (dict) {
+            /* programmer error, these two cases should be mutually exclusive */
+            return ERROR(GENERIC);
+        }
+
+        dict = ZSTD_DDictDictContent(ddict);
+        dictSize = ZSTD_DDictDictSize(ddict);
+    }
+
+    while (srcSize >= ZSTD_frameHeaderSize_prefix) {
+        U32 magicNumber;
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+        if (ZSTD_isLegacy(src, srcSize)) {
+            size_t decodedSize;
+            size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize);
+            if (ZSTD_isError(frameSize)) return frameSize;
+
+            decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize);
+
+            dst = (BYTE*)dst + decodedSize;
+            dstCapacity -= decodedSize;
+
+            src = (const BYTE*)src + frameSize;
+            srcSize -= frameSize;
+
+            continue;
+        }
+#endif
+
+        magicNumber = MEM_readLE32(src);
+        if (magicNumber != ZSTD_MAGICNUMBER) {
+            if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {
+                size_t skippableSize;
+                if (srcSize < ZSTD_skippableHeaderSize)
+                    return ERROR(srcSize_wrong);
+                skippableSize = MEM_readLE32((const BYTE *)src + 4) +
+                                ZSTD_skippableHeaderSize;
+                if (srcSize < skippableSize) {
+                    return ERROR(srcSize_wrong);
+                }
+
+                src = (const BYTE *)src + skippableSize;
+                srcSize -= skippableSize;
+                continue;
+            } else {
+                return ERROR(prefix_unknown);
+            }
+        }
+
+        if (ddict) {
+            /* we were called from ZSTD_decompress_usingDDict */
+            ZSTD_refDDict(dctx, ddict);
+        } else {
+            /* this will initialize correctly with no dict if dict == NULL, so
+             * use this in all cases but ddict */
+            CHECK_F(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize));
+        }
+        ZSTD_checkContinuity(dctx, dst);
+
+        {   const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity,
+                                                    &src, &srcSize);
+            if (ZSTD_isError(res)) return res;
+            /* don't need to bounds check this, ZSTD_decompressFrame will have
+             * already */
+            dst = (BYTE*)dst + res;
+            dstCapacity -= res;
+        }
+    }
+
+    if (srcSize) return ERROR(srcSize_wrong); /* input not entirely consumed */
+
+    return (BYTE*)dst - (BYTE*)dststart;
+}
+
+size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+                                 void* dst, size_t dstCapacity,
+                           const void* src, size_t srcSize,
+                           const void* dict, size_t dictSize)
+{
+    return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, NULL);
+}
+
+
+size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return ZSTD_decompress_usingDict(dctx, dst, dstCapacity, src, srcSize, NULL, 0);
+}
+
+
+size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE==1)
+    size_t regenSize;
+    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+    if (dctx==NULL) return ERROR(memory_allocation);
+    regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize);
+    ZSTD_freeDCtx(dctx);
+    return regenSize;
+#else   /* stack mode */
+    ZSTD_DCtx dctx;
+    return ZSTD_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize);
+#endif
+}
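+
+/* Usage sketch (illustrative only; `decompressOnce` and its parameters are
+ * hypothetical, assuming a standalone example file) : one-shot decompression
+ * when the regenerated size `rSize` is known in advance. */
+#include <stdlib.h>
+#include <zstd.h>
+
+static void* decompressOnce(const void* cBuf, size_t cSize, size_t rSize)
+{
+    void* const rBuf = malloc(rSize);
+    if (rBuf == NULL) return NULL;
+    {   size_t const dSize = ZSTD_decompress(rBuf, rSize, cBuf, cSize);
+        if (ZSTD_isError(dSize) || (dSize != rSize)) {   /* error, or rSize was wrong */
+            free(rBuf);
+            return NULL;
+    }   }
+    return rBuf;
+}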
+
+
+/*-**************************************
+*   Advanced Streaming Decompression API
+*   Bufferless and synchronous
+****************************************/
+size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; }
+
+ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) {
+    switch(dctx->stage)
+    {
+    default:   /* should not happen */
+    case ZSTDds_getFrameHeaderSize:
+    case ZSTDds_decodeFrameHeader:
+        return ZSTDnit_frameHeader;
+    case ZSTDds_decodeBlockHeader:
+        return ZSTDnit_blockHeader;
+    case ZSTDds_decompressBlock:
+        return ZSTDnit_block;
+    case ZSTDds_decompressLastBlock:
+        return ZSTDnit_lastBlock;
+    case ZSTDds_checkChecksum:
+        return ZSTDnit_checksum;
+    case ZSTDds_decodeSkippableHeader:
+    case ZSTDds_skipFrame:
+        return ZSTDnit_skippableFrame;
+    }
+}
+
+int ZSTD_isSkipFrame(ZSTD_DCtx* dctx) { return dctx->stage == ZSTDds_skipFrame; }   /* for zbuff */
+
+/** ZSTD_decompressContinue() :
+*   @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity`)
+*             or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    /* Sanity check */
+    if (srcSize != dctx->expected) return ERROR(srcSize_wrong);
+    if (dstCapacity) ZSTD_checkContinuity(dctx, dst);
+
+    switch (dctx->stage)
+    {
+    case ZSTDds_getFrameHeaderSize :
+        if (srcSize != ZSTD_frameHeaderSize_prefix) return ERROR(srcSize_wrong);      /* impossible */
+        if ((MEM_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) {        /* skippable frame */
+            memcpy(dctx->headerBuffer, src, ZSTD_frameHeaderSize_prefix);
+            dctx->expected = ZSTD_skippableHeaderSize - ZSTD_frameHeaderSize_prefix;  /* magic number + skippable frame length */
+            dctx->stage = ZSTDds_decodeSkippableHeader;
+            return 0;
+        }
+        dctx->headerSize = ZSTD_frameHeaderSize(src, ZSTD_frameHeaderSize_prefix);
+        if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize;
+        memcpy(dctx->headerBuffer, src, ZSTD_frameHeaderSize_prefix);
+        if (dctx->headerSize > ZSTD_frameHeaderSize_prefix) {
+            dctx->expected = dctx->headerSize - ZSTD_frameHeaderSize_prefix;
+            dctx->stage = ZSTDds_decodeFrameHeader;
+            return 0;
+        }
+        dctx->expected = 0;   /* not necessary to copy more */
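+        /* fall-through */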
+
+    case ZSTDds_decodeFrameHeader:
+        memcpy(dctx->headerBuffer + ZSTD_frameHeaderSize_prefix, src, dctx->expected);
+        CHECK_F(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize));
+        dctx->expected = ZSTD_blockHeaderSize;
+        dctx->stage = ZSTDds_decodeBlockHeader;
+        return 0;
+
+    case ZSTDds_decodeBlockHeader:
+        {   blockProperties_t bp;
+            size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+            if (ZSTD_isError(cBlockSize)) return cBlockSize;
+            dctx->expected = cBlockSize;
+            dctx->bType = bp.blockType;
+            dctx->rleSize = bp.origSize;
+            if (cBlockSize) {
+                dctx->stage = bp.lastBlock ? ZSTDds_decompressLastBlock : ZSTDds_decompressBlock;
+                return 0;
+            }
+            /* empty block */
+            if (bp.lastBlock) {
+                if (dctx->fParams.checksumFlag) {
+                    dctx->expected = 4;
+                    dctx->stage = ZSTDds_checkChecksum;
+                } else {
+                    dctx->expected = 0; /* end of frame */
+                    dctx->stage = ZSTDds_getFrameHeaderSize;
+                }
+            } else {
+                dctx->expected = 3;  /* go directly to next header */
+                dctx->stage = ZSTDds_decodeBlockHeader;
+            }
+            return 0;
+        }
+    case ZSTDds_decompressLastBlock:
+    case ZSTDds_decompressBlock:
+        {   size_t rSize;
+            switch(dctx->bType)
+            {
+            case bt_compressed:
+                rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize);
+                break;
+            case bt_raw :
+                rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize);
+                break;
+            case bt_rle :
+                rSize = ZSTD_setRleBlock(dst, dstCapacity, src, srcSize, dctx->rleSize);
+                break;
+            case bt_reserved :   /* should never happen */
+            default:
+                return ERROR(corruption_detected);
+            }
+            if (ZSTD_isError(rSize)) return rSize;
+            if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, dst, rSize);
+
+            if (dctx->stage == ZSTDds_decompressLastBlock) {   /* end of frame */
+                if (dctx->fParams.checksumFlag) {  /* another round for frame checksum */
+                    dctx->expected = 4;
+                    dctx->stage = ZSTDds_checkChecksum;
+                } else {
+                    dctx->expected = 0;   /* ends here */
+                    dctx->stage = ZSTDds_getFrameHeaderSize;
+                }
+            } else {
+                dctx->stage = ZSTDds_decodeBlockHeader;
+                dctx->expected = ZSTD_blockHeaderSize;
+                dctx->previousDstEnd = (char*)dst + rSize;
+            }
+            return rSize;
+        }
+    case ZSTDds_checkChecksum:
+        {   U32 const h32 = (U32)XXH64_digest(&dctx->xxhState);
+            U32 const check32 = MEM_readLE32(src);   /* srcSize == 4, guaranteed by dctx->expected */
+            if (check32 != h32) return ERROR(checksum_wrong);
+            dctx->expected = 0;
+            dctx->stage = ZSTDds_getFrameHeaderSize;
+            return 0;
+        }
+    case ZSTDds_decodeSkippableHeader:
+        {   memcpy(dctx->headerBuffer + ZSTD_frameHeaderSize_prefix, src, dctx->expected);
+            dctx->expected = MEM_readLE32(dctx->headerBuffer + 4);
+            dctx->stage = ZSTDds_skipFrame;
+            return 0;
+        }
+    case ZSTDds_skipFrame:
+        {   dctx->expected = 0;
+            dctx->stage = ZSTDds_getFrameHeaderSize;
+            return 0;
+        }
+    default:
+        return ERROR(GENERIC);   /* impossible */
+    }
+}
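+
+/* Usage sketch (illustrative only; `decompressBufferless` is a hypothetical
+ * helper, assuming a standalone example file) : the protocol above is to ask
+ * ZSTD_nextSrcSizeToDecompress() how many bytes to provide, feed exactly that
+ * many to ZSTD_decompressContinue(), and stop when the hint reaches 0.
+ * `src` must hold one complete frame, and `dst` must be large enough for the
+ * whole regenerated content. The bufferless API sits under
+ * ZSTD_STATIC_LINKING_ONLY in this version. */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <zstd.h>
+
+static size_t decompressBufferless(ZSTD_DCtx* dctx,
+                                   void* dst, size_t dstCapacity,
+                                   const void* src, size_t srcSize)
+{
+    const char* ip = (const char*)src;
+    const char* const iend = ip + srcSize;
+    char* op = (char*)dst;
+    char* const oend = op + dstCapacity;
+    {   size_t const initError = ZSTD_decompressBegin(dctx);
+        if (ZSTD_isError(initError)) return initError;
+    }
+    while (1) {
+        size_t const toRead = ZSTD_nextSrcSizeToDecompress(dctx);
+        if (toRead == 0) break;                              /* frame fully decoded */
+        if (toRead > (size_t)(iend-ip)) return (size_t)-1;   /* truncated input; registers as an error */
+        {   size_t const written = ZSTD_decompressContinue(dctx, op, (size_t)(oend-op), ip, toRead);
+            if (ZSTD_isError(written)) return written;
+            ip += toRead;
+            op += written;
+    }   }
+    return (size_t)(op - (char*)dst);
+}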
+
+
+static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    dctx->dictEnd = dctx->previousDstEnd;
+    dctx->vBase = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base));
+    dctx->base = dict;
+    dctx->previousDstEnd = (const char*)dict + dictSize;
+    return 0;
+}
+
+/* ZSTD_loadEntropy() :
+ * dict : must point at beginning of a valid zstd dictionary
+ * @return : size of entropy tables read */
+static size_t ZSTD_loadEntropy(ZSTD_entropyTables_t* entropy, const void* const dict, size_t const dictSize)
+{
+    const BYTE* dictPtr = (const BYTE*)dict;
+    const BYTE* const dictEnd = dictPtr + dictSize;
+
+    if (dictSize <= 8) return ERROR(dictionary_corrupted);
+    dictPtr += 8;   /* skip header = magic + dictID */
+
+    {   size_t const hSize = HUF_readDTableX4(entropy->hufTable, dictPtr, dictEnd-dictPtr);
+        if (HUF_isError(hSize)) return ERROR(dictionary_corrupted);
+        dictPtr += hSize;
+    }
+
+    {   short offcodeNCount[MaxOff+1];
+        U32 offcodeMaxValue = MaxOff, offcodeLog;
+        size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(offcodeHeaderSize)) return ERROR(dictionary_corrupted);
+        if (offcodeLog > OffFSELog) return ERROR(dictionary_corrupted);
+        CHECK_E(FSE_buildDTable(entropy->OFTable, offcodeNCount, offcodeMaxValue, offcodeLog), dictionary_corrupted);
+        dictPtr += offcodeHeaderSize;
+    }
+
+    {   short matchlengthNCount[MaxML+1];
+        unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+        size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(matchlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (matchlengthLog > MLFSELog) return ERROR(dictionary_corrupted);
+        CHECK_E(FSE_buildDTable(entropy->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog), dictionary_corrupted);
+        dictPtr += matchlengthHeaderSize;
+    }
+
+    {   short litlengthNCount[MaxLL+1];
+        unsigned litlengthMaxValue = MaxLL, litlengthLog;
+        size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
+        if (FSE_isError(litlengthHeaderSize)) return ERROR(dictionary_corrupted);
+        if (litlengthLog > LLFSELog) return ERROR(dictionary_corrupted);
+        CHECK_E(FSE_buildDTable(entropy->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog), dictionary_corrupted);
+        dictPtr += litlengthHeaderSize;
+    }
+
+    if (dictPtr+12 > dictEnd) return ERROR(dictionary_corrupted);
+    {   int i;
+        size_t const dictContentSize = (size_t)(dictEnd - (dictPtr+12));
+        for (i=0; i<3; i++) {
+            U32 const rep = MEM_readLE32(dictPtr); dictPtr += 4;
+            if (rep==0 || rep >= dictContentSize) return ERROR(dictionary_corrupted);
+            entropy->rep[i] = rep;
+    }   }
+
+    return dictPtr - (const BYTE*)dict;
+}
+
+static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    if (dictSize < 8) return ZSTD_refDictContent(dctx, dict, dictSize);
+    {   U32 const magic = MEM_readLE32(dict);
+        if (magic != ZSTD_DICT_MAGIC) {
+            return ZSTD_refDictContent(dctx, dict, dictSize);   /* pure content mode */
+    }   }
+    dctx->dictID = MEM_readLE32((const char*)dict + 4);
+
+    /* load entropy tables */
+    {   size_t const eSize = ZSTD_loadEntropy(&dctx->entropy, dict, dictSize);
+        if (ZSTD_isError(eSize)) return ERROR(dictionary_corrupted);
+        dict = (const char*)dict + eSize;
+        dictSize -= eSize;
+    }
+    dctx->litEntropy = dctx->fseEntropy = 1;
+
+    /* reference dictionary content */
+    return ZSTD_refDictContent(dctx, dict, dictSize);
+}
+
+size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+    CHECK_F(ZSTD_decompressBegin(dctx));
+    if (dict && dictSize) CHECK_E(ZSTD_decompress_insertDictionary(dctx, dict, dictSize), dictionary_corrupted);
+    return 0;
+}
+
+
+/* ======   ZSTD_DDict   ====== */
+
+struct ZSTD_DDict_s {
+    void* dictBuffer;
+    const void* dictContent;
+    size_t dictSize;
+    ZSTD_entropyTables_t entropy;
+    U32 dictID;
+    U32 entropyPresent;
+    ZSTD_customMem cMem;
+};  /* typedef'd to ZSTD_DDict within "zstd.h" */
+
+static const void* ZSTD_DDictDictContent(const ZSTD_DDict* ddict)
+{
+    return ddict->dictContent;
+}
+
+static size_t ZSTD_DDictDictSize(const ZSTD_DDict* ddict)
+{
+    return ddict->dictSize;
+}
+
+static void ZSTD_refDDict(ZSTD_DCtx* dstDCtx, const ZSTD_DDict* ddict)
+{
+    ZSTD_decompressBegin(dstDCtx);  /* init */
+    if (ddict) {   /* support refDDict on NULL */
+        dstDCtx->dictID = ddict->dictID;
+        dstDCtx->base = ddict->dictContent;
+        dstDCtx->vBase = ddict->dictContent;
+        dstDCtx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
+        dstDCtx->previousDstEnd = dstDCtx->dictEnd;
+        if (ddict->entropyPresent) {
+            dstDCtx->litEntropy = 1;
+            dstDCtx->fseEntropy = 1;
+            dstDCtx->LLTptr = ddict->entropy.LLTable;
+            dstDCtx->MLTptr = ddict->entropy.MLTable;
+            dstDCtx->OFTptr = ddict->entropy.OFTable;
+            dstDCtx->HUFptr = ddict->entropy.hufTable;
+            dstDCtx->entropy.rep[0] = ddict->entropy.rep[0];
+            dstDCtx->entropy.rep[1] = ddict->entropy.rep[1];
+            dstDCtx->entropy.rep[2] = ddict->entropy.rep[2];
+        } else {
+            dstDCtx->litEntropy = 0;
+            dstDCtx->fseEntropy = 0;
+        }
+    }
+}
+
+static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict* ddict)
+{
+    ddict->dictID = 0;
+    ddict->entropyPresent = 0;
+    if (ddict->dictSize < 8) return 0;
+    {   U32 const magic = MEM_readLE32(ddict->dictContent);
+        if (magic != ZSTD_DICT_MAGIC) return 0;   /* pure content mode */
+    }
+    ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + 4);
+
+    /* load entropy tables */
+    CHECK_E( ZSTD_loadEntropy(&ddict->entropy, ddict->dictContent, ddict->dictSize), dictionary_corrupted );
+    ddict->entropyPresent = 1;
+    return 0;
+}
+
+
+ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, unsigned byReference, ZSTD_customMem customMem)
+{
+    if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem;
+    if (!customMem.customAlloc || !customMem.customFree) return NULL;
+
+    {   ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem);
+        if (!ddict) return NULL;
+        ddict->cMem = customMem;
+
+        if ((byReference) || (!dict) || (!dictSize)) {
+            ddict->dictBuffer = NULL;
+            ddict->dictContent = dict;
+        } else {
+            void* const internalBuffer = ZSTD_malloc(dictSize, customMem);
+            if (!internalBuffer) { ZSTD_freeDDict(ddict); return NULL; }
+            memcpy(internalBuffer, dict, dictSize);
+            ddict->dictBuffer = internalBuffer;
+            ddict->dictContent = internalBuffer;
+        }
+        ddict->dictSize = dictSize;
+        ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+        /* parse dictionary content */
+        {   size_t const errorCode = ZSTD_loadEntropy_inDDict(ddict);
+            if (ZSTD_isError(errorCode)) {
+                ZSTD_freeDDict(ddict);
+                return NULL;
+        }   }
+
+        return ddict;
+    }
+}
+
+/*! ZSTD_createDDict() :
+*   Create a digested dictionary, to start decompression without startup delay.
+*   `dict` content is copied inside DDict.
+*   Consequently, `dict` can be released after `ZSTD_DDict` creation */
+ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize)
+{
+    ZSTD_customMem const allocator = { NULL, NULL, NULL };
+    return ZSTD_createDDict_advanced(dict, dictSize, 0, allocator);
+}
+
+
+/*! ZSTD_createDDict_byReference() :
+ *  Create a digested dictionary, to start decompression without startup delay.
+ *  Dictionary content is simply referenced, it will be accessed during decompression.
+ *  Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer) */
+ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize)
+{
+    ZSTD_customMem const allocator = { NULL, NULL, NULL };
+    return ZSTD_createDDict_advanced(dictBuffer, dictSize, 1, allocator);
+}
+
+
+size_t ZSTD_freeDDict(ZSTD_DDict* ddict)
+{
+    if (ddict==NULL) return 0;   /* support free on NULL */
+    {   ZSTD_customMem const cMem = ddict->cMem;
+        ZSTD_free(ddict->dictBuffer, cMem);
+        ZSTD_free(ddict, cMem);
+        return 0;
+    }
+}
+
+size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
+{
+    if (ddict==NULL) return 0;   /* support sizeof on NULL */
+    return sizeof(*ddict) + (ddict->dictBuffer ? ddict->dictSize : 0) ;
+}
+
+/*! ZSTD_getDictID_fromDict() :
+ *  Provides the dictID stored within dictionary.
+ *  If @return == 0, the dictionary is not conformant with the Zstandard specification.
+ *  It can still be loaded, but as a content-only dictionary. */
+unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
+{
+    if (dictSize < 8) return 0;
+    if (MEM_readLE32(dict) != ZSTD_DICT_MAGIC) return 0;
+    return MEM_readLE32((const char*)dict + 4);
+}
+
+/*! ZSTD_getDictID_fromDDict() :
+ *  Provides the dictID of the dictionary loaded into `ddict`.
+ *  If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
+{
+    if (ddict==NULL) return 0;
+    return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
+}
+
+/*! ZSTD_getDictID_fromFrame() :
+ *  Provides the dictID required to decompress the frame stored within `src`.
+ *  If @return == 0, the dictID could not be decoded.
+ *  This could be for one of the following reasons :
+ *  - The frame does not require a dictionary to be decoded (most common case).
+ *  - The frame was built with dictID intentionally removed. Whichever dictionary is necessary is hidden information.
+ *    Note : this use case also happens when using a non-conformant dictionary.
+ *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ *  - This is not a Zstandard frame.
+ *  When identifying the exact failure cause, it's possible to use ZSTD_getFrameParams(), which will provide a more precise error code. */
+unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
+{
+    ZSTD_frameParams zfp = { 0 , 0 , 0 , 0 };
+    size_t const hError = ZSTD_getFrameParams(&zfp, src, srcSize);
+    if (ZSTD_isError(hError)) return 0;
+    return zfp.dictID;
+}
+
+
+/*! ZSTD_decompress_usingDDict() :
+*   Decompression using a pre-digested Dictionary
+*   Use dictionary without significant overhead. */
+size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+                                  void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                            const ZSTD_DDict* ddict)
+{
+    /* pass content and size in case legacy frames are encountered */
+    return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize,
+                                     NULL, 0,
+                                     ddict);
+}
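+
+/* Usage sketch (illustrative only; `decompressWithDict` is a hypothetical
+ * helper, assuming a standalone example file) : digest the dictionary once
+ * into a DDict, then reuse it across frames without reload overhead. */
+#include <zstd.h>
+
+static size_t decompressWithDict(void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                 const void* dict, size_t dictSize)
+{
+    size_t dSize = (size_t)-1;      /* registers as an error for ZSTD_isError() */
+    ZSTD_DCtx*  const dctx  = ZSTD_createDCtx();
+    ZSTD_DDict* const ddict = ZSTD_createDDict(dict, dictSize);
+    if ((dctx != NULL) && (ddict != NULL)) {
+        dSize = ZSTD_decompress_usingDDict(dctx, dst, dstCapacity, src, srcSize, ddict);
+    }
+    ZSTD_freeDDict(ddict);          /* free on NULL is supported */
+    ZSTD_freeDCtx(dctx);
+    return dSize;
+}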
+
+
+/*=====================================
+*   Streaming decompression
+*====================================*/
+
+typedef enum { zdss_init, zdss_loadHeader,
+               zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
+
+/* *** Resource management *** */
+struct ZSTD_DStream_s {
+    ZSTD_DCtx* dctx;
+    ZSTD_DDict* ddictLocal;
+    const ZSTD_DDict* ddict;
+    ZSTD_frameParams fParams;
+    ZSTD_dStreamStage stage;
+    char*  inBuff;
+    size_t inBuffSize;
+    size_t inPos;
+    size_t maxWindowSize;
+    char*  outBuff;
+    size_t outBuffSize;
+    size_t outStart;
+    size_t outEnd;
+    size_t blockSize;
+    BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];   /* tmp buffer to store frame header */
+    size_t lhSize;
+    ZSTD_customMem customMem;
+    void* legacyContext;
+    U32 previousLegacyVersion;
+    U32 legacyVersion;
+    U32 hostageByte;
+};   /* typedef'd to ZSTD_DStream within "zstd.h" */
+
+
+ZSTD_DStream* ZSTD_createDStream(void)
+{
+    return ZSTD_createDStream_advanced(defaultCustomMem);
+}
+
+ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem)
+{
+    ZSTD_DStream* zds;
+
+    if (!customMem.customAlloc && !customMem.customFree) customMem = defaultCustomMem;
+    if (!customMem.customAlloc || !customMem.customFree) return NULL;
+
+    zds = (ZSTD_DStream*) ZSTD_malloc(sizeof(ZSTD_DStream), customMem);
+    if (zds==NULL) return NULL;
+    memset(zds, 0, sizeof(ZSTD_DStream));
+    memcpy(&zds->customMem, &customMem, sizeof(ZSTD_customMem));
+    zds->dctx = ZSTD_createDCtx_advanced(customMem);
+    if (zds->dctx == NULL) { ZSTD_freeDStream(zds); return NULL; }
+    zds->stage = zdss_init;
+    zds->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
+    return zds;
+}
+
+size_t ZSTD_freeDStream(ZSTD_DStream* zds)
+{
+    if (zds==NULL) return 0;   /* support free on null */
+    {   ZSTD_customMem const cMem = zds->customMem;
+        ZSTD_freeDCtx(zds->dctx);
+        ZSTD_freeDDict(zds->ddictLocal);
+        ZSTD_free(zds->inBuff, cMem);
+        ZSTD_free(zds->outBuff, cMem);
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+        if (zds->legacyContext)
+            ZSTD_freeLegacyStreamContext(zds->legacyContext, zds->previousLegacyVersion);
+#endif
+        ZSTD_free(zds, cMem);
+        return 0;
+    }
+}
+
+
+/* *** Initialization *** */
+
+size_t ZSTD_DStreamInSize(void)  { return ZSTD_BLOCKSIZE_ABSOLUTEMAX + ZSTD_blockHeaderSize; }
+size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_ABSOLUTEMAX; }
+
+size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize)
+{
+    zds->stage = zdss_loadHeader;
+    zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0;
+    ZSTD_freeDDict(zds->ddictLocal);
+    if (dict && dictSize >= 8) {
+        zds->ddictLocal = ZSTD_createDDict(dict, dictSize);
+        if (zds->ddictLocal == NULL) return ERROR(memory_allocation);
+    } else zds->ddictLocal = NULL;
+    zds->ddict = zds->ddictLocal;
+    zds->legacyVersion = 0;
+    zds->hostageByte = 0;
+    return ZSTD_frameHeaderSize_prefix;
+}
+
+size_t ZSTD_initDStream(ZSTD_DStream* zds)
+{
+    return ZSTD_initDStream_usingDict(zds, NULL, 0);
+}
+
+size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict)  /**< note : ddict will just be referenced, and must outlive decompression session */
+{
+    size_t const initResult = ZSTD_initDStream(zds);
+    zds->ddict = ddict;
+    return initResult;
+}
+
+size_t ZSTD_resetDStream(ZSTD_DStream* zds)
+{
+    zds->stage = zdss_loadHeader;
+    zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0;
+    zds->legacyVersion = 0;
+    zds->hostageByte = 0;
+    return ZSTD_frameHeaderSize_prefix;
+}
+
+size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds,
+                                ZSTD_DStreamParameter_e paramType, unsigned paramValue)
+{
+    switch(paramType)
+    {
+        default : return ERROR(parameter_unknown);
+        case DStream_p_maxWindowSize : zds->maxWindowSize = paramValue ? paramValue : (U32)(-1); break;
+    }
+    return 0;
+}
+
+
+size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds)
+{
+    if (zds==NULL) return 0;   /* support sizeof on NULL */
+    return sizeof(*zds) + ZSTD_sizeof_DCtx(zds->dctx) + ZSTD_sizeof_DDict(zds->ddictLocal) + zds->inBuffSize + zds->outBuffSize;
+}
+
+
+/* *****   Decompression   ***** */
+
+MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    size_t const length = MIN(dstCapacity, srcSize);
+    memcpy(dst, src, length);
+    return length;
+}
+
+
+size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+    const char* const istart = (const char*)(input->src) + input->pos;
+    const char* const iend = (const char*)(input->src) + input->size;
+    const char* ip = istart;
+    char* const ostart = (char*)(output->dst) + output->pos;
+    char* const oend = (char*)(output->dst) + output->size;
+    char* op = ostart;
+    U32 someMoreWork = 1;
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+    if (zds->legacyVersion)
+        return ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input);
+#endif
+
+    while (someMoreWork) {
+        switch(zds->stage)
+        {
+        case zdss_init :
+            ZSTD_resetDStream(zds);   /* transparent reset on starting decoding a new frame */
+            /* fall-through */
+
+        case zdss_loadHeader :
+            {   size_t const hSize = ZSTD_getFrameParams(&zds->fParams, zds->headerBuffer, zds->lhSize);
+                if (ZSTD_isError(hSize))
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+                {   U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart);
+                    if (legacyVersion) {
+                        const void* const dict = zds->ddict ? zds->ddict->dictContent : NULL;
+                        size_t const dictSize = zds->ddict ? zds->ddict->dictSize : 0;
+                        CHECK_F(ZSTD_initLegacyStream(&zds->legacyContext, zds->previousLegacyVersion, legacyVersion,
+                                                       dict, dictSize));
+                        zds->legacyVersion = zds->previousLegacyVersion = legacyVersion;
+                        return ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input);
+                    } else {
+                        return hSize; /* error */
+                }   }
+#else
+                return hSize;
+#endif
+                if (hSize != 0) {   /* need more input */
+                    size_t const toLoad = hSize - zds->lhSize;   /* if hSize!=0, hSize > zds->lhSize */
+                    if (toLoad > (size_t)(iend-ip)) {   /* not enough input to load full header */
+                        memcpy(zds->headerBuffer + zds->lhSize, ip, iend-ip);
+                        zds->lhSize += iend-ip;
+                        input->pos = input->size;
+                        return (MAX(ZSTD_frameHeaderSize_min, hSize) - zds->lhSize) + ZSTD_blockHeaderSize;   /* remaining header bytes + next block header */
+                    }
+                    memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); zds->lhSize = hSize; ip += toLoad;
+                    break;
+            }   }
+
+            /* check for single-pass mode opportunity */
+            if (zds->fParams.frameContentSize && zds->fParams.windowSize /* skippable frame if == 0 */
+                && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) {
+                size_t const cSize = ZSTD_findFrameCompressedSize(istart, iend-istart);
+                if (cSize <= (size_t)(iend-istart)) {
+                    size_t const decompressedSize = ZSTD_decompress_usingDDict(zds->dctx, op, oend-op, istart, cSize, zds->ddict);
+                    if (ZSTD_isError(decompressedSize)) return decompressedSize;
+                    ip = istart + cSize;
+                    op += decompressedSize;
+                    zds->dctx->expected = 0;
+                    zds->stage = zdss_init;
+                    someMoreWork = 0;
+                    break;
+            }   }
+
+            /* Consume header */
+            ZSTD_refDDict(zds->dctx, zds->ddict);
+            {   size_t const h1Size = ZSTD_nextSrcSizeToDecompress(zds->dctx);  /* == ZSTD_frameHeaderSize_prefix */
+                CHECK_F(ZSTD_decompressContinue(zds->dctx, NULL, 0, zds->headerBuffer, h1Size));
+                {   size_t const h2Size = ZSTD_nextSrcSizeToDecompress(zds->dctx);
+                    CHECK_F(ZSTD_decompressContinue(zds->dctx, NULL, 0, zds->headerBuffer+h1Size, h2Size));
+            }   }
+
+            zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN);
+            if (zds->fParams.windowSize > zds->maxWindowSize) return ERROR(frameParameter_windowTooLarge);
+
+            /* Adapt buffer sizes to frame header instructions */
+            {   size_t const blockSize = MIN(zds->fParams.windowSize, ZSTD_BLOCKSIZE_ABSOLUTEMAX);
+                size_t const neededOutSize = zds->fParams.windowSize + blockSize + WILDCOPY_OVERLENGTH * 2;
+                zds->blockSize = blockSize;
+                if (zds->inBuffSize < blockSize) {
+                    ZSTD_free(zds->inBuff, zds->customMem);
+                    zds->inBuffSize = blockSize;
+                    zds->inBuff = (char*)ZSTD_malloc(blockSize, zds->customMem);
+                    if (zds->inBuff == NULL) return ERROR(memory_allocation);
+                }
+                if (zds->outBuffSize < neededOutSize) {
+                    ZSTD_free(zds->outBuff, zds->customMem);
+                    zds->outBuffSize = neededOutSize;
+                    zds->outBuff = (char*)ZSTD_malloc(neededOutSize, zds->customMem);
+                    if (zds->outBuff == NULL) return ERROR(memory_allocation);
+            }   }
+            zds->stage = zdss_read;
+            /* pass-through */
+
+        case zdss_read:
+            {   size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds->dctx);
+                if (neededInSize==0) {  /* end of frame */
+                    zds->stage = zdss_init;
+                    someMoreWork = 0;
+                    break;
+                }
+                if ((size_t)(iend-ip) >= neededInSize) {  /* decode directly from src */
+                    const int isSkipFrame = ZSTD_isSkipFrame(zds->dctx);
+                    size_t const decodedSize = ZSTD_decompressContinue(zds->dctx,
+                        zds->outBuff + zds->outStart, (isSkipFrame ? 0 : zds->outBuffSize - zds->outStart),
+                        ip, neededInSize);
+                    if (ZSTD_isError(decodedSize)) return decodedSize;
+                    ip += neededInSize;
+                    if (!decodedSize && !isSkipFrame) break;   /* this was just a header */
+                    zds->outEnd = zds->outStart + decodedSize;
+                    zds->stage = zdss_flush;
+                    break;
+                }
+                if (ip==iend) { someMoreWork = 0; break; }   /* no more input */
+                zds->stage = zdss_load;
+                /* pass-through */
+            }
+
+        case zdss_load:
+            {   size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds->dctx);
+                size_t const toLoad = neededInSize - zds->inPos;   /* should always be <= remaining space within inBuff */
+                size_t loadedSize;
+                if (toLoad > zds->inBuffSize - zds->inPos) return ERROR(corruption_detected);   /* should never happen */
+                loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, iend-ip);
+                ip += loadedSize;
+                zds->inPos += loadedSize;
+                if (loadedSize < toLoad) { someMoreWork = 0; break; }   /* not enough input, wait for more */
+
+                /* decode loaded input */
+                {  const int isSkipFrame = ZSTD_isSkipFrame(zds->dctx);
+                   size_t const decodedSize = ZSTD_decompressContinue(zds->dctx,
+                        zds->outBuff + zds->outStart, zds->outBuffSize - zds->outStart,
+                        zds->inBuff, neededInSize);
+                    if (ZSTD_isError(decodedSize)) return decodedSize;
+                    zds->inPos = 0;   /* input is consumed */
+                    if (!decodedSize && !isSkipFrame) { zds->stage = zdss_read; break; }   /* this was just a header */
+                    zds->outEnd = zds->outStart +  decodedSize;
+                    zds->stage = zdss_flush;
+                    /* pass-through */
+            }   }
+
+        case zdss_flush:
+            {   size_t const toFlushSize = zds->outEnd - zds->outStart;
+                size_t const flushedSize = ZSTD_limitCopy(op, oend-op, zds->outBuff + zds->outStart, toFlushSize);
+                op += flushedSize;
+                zds->outStart += flushedSize;
+                if (flushedSize == toFlushSize) {  /* flush completed */
+                    zds->stage = zdss_read;
+                    if (zds->outStart + zds->blockSize > zds->outBuffSize)
+                        zds->outStart = zds->outEnd = 0;
+                    break;
+                }
+                /* cannot complete flush */
+                someMoreWork = 0;
+                break;
+            }
+        default: return ERROR(GENERIC);   /* impossible */
+    }   }
+
+    /* result */
+    input->pos += (size_t)(ip-istart);
+    output->pos += (size_t)(op-ostart);
+    {   size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds->dctx);
+        if (!nextSrcSizeHint) {   /* frame fully decoded */
+            if (zds->outEnd == zds->outStart) {  /* output fully flushed */
+                if (zds->hostageByte) {
+                    if (input->pos >= input->size) { zds->stage = zdss_read; return 1; }  /* can't release hostage (not present) */
+                    input->pos++;  /* release hostage */
+                }
+                return 0;
+            }
+            if (!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */
+                input->pos--;   /* note : pos > 0, otherwise, impossible to finish reading last block */
+                zds->hostageByte=1;
+            }
+            return 1;
+        }
+        nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds->dctx) == ZSTDnit_block);   /* preload header of next block */
+        if (zds->inPos > nextSrcSizeHint) return ERROR(GENERIC);   /* should never happen */
+        nextSrcSizeHint -= zds->inPos;   /* already loaded*/
+        return nextSrcSizeHint;
+    }
+}
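+
+/* Usage sketch (illustrative only; `decompressFile` is a hypothetical helper,
+ * assuming a standalone example file) : the canonical streaming loop. Each
+ * outer iteration refills the input buffer; the inner loop drains it,
+ * flushing `output.pos` bytes after every call. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <zstd.h>
+
+static int decompressFile(FILE* fin, FILE* fout)
+{
+    size_t const inSize  = ZSTD_DStreamInSize();    /* recommended buffer sizes */
+    size_t const outSize = ZSTD_DStreamOutSize();
+    void*  const inBuff  = malloc(inSize);
+    void*  const outBuff = malloc(outSize);
+    ZSTD_DStream* const zds = ZSTD_createDStream();
+    int result = -1;
+    if (inBuff && outBuff && zds && !ZSTD_isError(ZSTD_initDStream(zds))) {
+        size_t readBytes;
+        result = 0;
+        while (!result && (readBytes = fread(inBuff, 1, inSize, fin)) > 0) {
+            ZSTD_inBuffer input = { inBuff, readBytes, 0 };
+            while (input.pos < input.size) {
+                ZSTD_outBuffer output = { outBuff, outSize, 0 };
+                size_t const hint = ZSTD_decompressStream(zds, &output, &input);
+                if (ZSTD_isError(hint)) { result = -1; break; }
+                if (fwrite(outBuff, 1, output.pos, fout) != output.pos) { result = -1; break; }
+            }
+        }
+    }
+    ZSTD_freeDStream(zds);          /* free on NULL is supported */
+    free(outBuff);
+    free(inBuff);
+    return result;
+}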
diff --git a/zstd/lib/decompress/zstd_decompress.o b/zstd/lib/decompress/zstd_decompress.o
new file mode 100644
index 0000000..015c57f
Binary files /dev/null and b/zstd/lib/decompress/zstd_decompress.o differ
diff --git a/zstd/lib/zstd.h b/zstd/lib/zstd.h
new file mode 100644
index 0000000..f462d09
--- /dev/null
+++ b/zstd/lib/zstd.h
@@ -0,0 +1,765 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef ZSTD_H_235446
+#define ZSTD_H_235446
+
+/* ======   Dependency   ======*/
+#include <stddef.h>   /* size_t */
+
+
+/* =====   ZSTDLIB_API : control library symbols visibility   ===== */
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#  define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#else
+#  define ZSTDLIB_VISIBILITY
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required but allows generating better code, saving a function pointer load from the IAT and an indirect jump. */
+#else
+#  define ZSTDLIB_API ZSTDLIB_VISIBILITY
+#endif
+
+
+/*******************************************************************************************************
+  Introduction
+
+  zstd, short for Zstandard, is a fast lossless compression algorithm, targeting real-time compression scenarios
+  at zlib-level and better compression ratios. The zstd compression library provides in-memory compression and
+  decompression functions. The library supports compression levels from 1 up to ZSTD_maxCLevel() which is 22.
+  Levels >= 20, labeled `--ultra`, should be used with caution, as they require more memory.
+  Compression can be done in:
+    - a single step (described as Simple API)
+    - a single step, reusing a context (described as Explicit memory management)
+    - unbounded multiple steps (described as Streaming compression)
+  The compression ratio achievable on small data can be highly improved using compression with a dictionary in:
+    - a single step (described as Simple dictionary API)
+    - a single step, reusing a dictionary (described as Fast dictionary API)
+
+  Advanced experimental functions can be accessed using #define ZSTD_STATIC_LINKING_ONLY before including zstd.h.
+  These APIs shall never be used with a dynamic library.
+  They are not "stable", their definition may change in the future. Only static linking is allowed.
+*********************************************************************************************************/
+
+/*------   Version   ------*/
+#define ZSTD_VERSION_MAJOR    1
+#define ZSTD_VERSION_MINOR    1
+#define ZSTD_VERSION_RELEASE  4
+
+#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE
+#define ZSTD_QUOTE(str) #str
+#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str)
+#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION)
+
+#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+ZSTDLIB_API unsigned ZSTD_versionNumber(void);   /**< library version number; to be used when checking dll version */
+
+
+/***************************************
+*  Simple API
+***************************************/
+/*! ZSTD_compress() :
+    Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
+    Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+    @return : compressed size written into `dst` (<= `dstCapacity`),
+              or an error code if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                                  int compressionLevel);
+
+/*! ZSTD_decompress() :
+    `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
+    `dstCapacity` is an upper bound of originalSize.
+    If the caller cannot determine an upper bound, it's better to use streaming mode to decompress data.
+    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+              or an errorCode if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
+                              const void* src, size_t compressedSize);
+
+/*! ZSTD_getDecompressedSize() :
+*   NOTE: This function is planned to become obsolete, in favour of ZSTD_getFrameContentSize.
+*   ZSTD_getFrameContentSize functions the same way, returning the decompressed size of a single
+*   frame, but distinguishes empty frames from frames with an unknown size, or errors.
+*
+*   Additionally, ZSTD_findDecompressedSize can be used instead.  It can handle multiple
+*   concatenated frames in one buffer, and so is more general.
+*   As a result however, it requires more computation and entire frames to be passed to it,
+*   as opposed to ZSTD_getFrameContentSize which requires only a single frame's header.
+*
+*   'src' is the start of a zstd compressed frame.
+*   @return : content size to be decompressed, as a 64-bit value _if known_, 0 otherwise.
+*    note 1 : decompressed size is an optional field that may not be present, especially in streaming mode.
+*             When `return==0`, data to decompress could be any size.
+*             In which case, it's necessary to use streaming mode to decompress data.
+*             Optionally, the application can still use ZSTD_decompress() while relying on implied limits.
+*             (For example, data may necessarily be cut into blocks <= 16 KB).
+*    note 2 : decompressed size is always present when compression is done with ZSTD_compress().
+*    note 3 : decompressed size can be very large (a 64-bit value),
+*             potentially larger than what the local system can handle as a single memory segment.
+*             In which case, it's necessary to use streaming mode to decompress data.
+*    note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+*             Always ensure result fits within application's authorized limits.
+*             Each application can set its own limits.
+*    note 5 : when `return==0`, if precise failure cause is needed, use ZSTD_getFrameParams() to know more. */
+ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+
+
+/*======  Helper functions  ======*/
+ZSTDLIB_API int         ZSTD_maxCLevel(void);               /*!< maximum compression level available */
+ZSTDLIB_API size_t      ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case scenario */
+ZSTDLIB_API unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
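+
+/* Usage sketch (illustrative only; `CHECK_ZSTD` is a hypothetical macro,
+ * assuming a standalone example file) : every size_t result above must be
+ * tested with ZSTD_isError() before it is used as a size. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <zstd.h>
+
+#define CHECK_ZSTD(f) do {                                            \
+        size_t const err_ = (f);                                      \
+        if (ZSTD_isError(err_)) {                                     \
+            fprintf(stderr, "zstd : %s\n", ZSTD_getErrorName(err_));  \
+            exit(1);                                                  \
+        }                                                             \
+    } while (0)
+
+/* e.g. CHECK_ZSTD( ZSTD_decompress(dst, dstCapacity, src, srcSize) ); */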
+
+
+/***************************************
+*  Explicit memory management
+***************************************/
+/*= Compression context
+*   When compressing many times,
+*   it is recommended to allocate a context just once, and re-use it for each successive compression operation.
+*   This will make workload friendlier for system's memory.
+*   Use one context per thread for parallel execution in multi-threaded environments. */
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void);
+ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);
+
+/*! ZSTD_compressCCtx() :
+    Same as ZSTD_compress(), requires an allocated ZSTD_CCtx (see ZSTD_createCCtx()). */
+ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, int compressionLevel);
+
+/*= Decompression context */
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
+ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
+
+/*! ZSTD_decompressDCtx() :
+*   Same as ZSTD_decompress(), requires an allocated ZSTD_DCtx (see ZSTD_createDCtx()). */
+ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/**************************
+*  Simple dictionary API
+***************************/
+/*! ZSTD_compress_usingDict() :
+*   Compression using a predefined Dictionary (see dictBuilder/zdict.h).
+*   Note : This function loads the dictionary, resulting in significant startup delay.
+*   Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     const void* dict,size_t dictSize,
+                                           int compressionLevel);
+
+/*! ZSTD_decompress_usingDict() :
+*   Decompression using a predefined Dictionary (see dictBuilder/zdict.h).
+*   Dictionary must be identical to the one used during compression.
+*   Note : This function loads the dictionary, resulting in significant startup delay.
+*   Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+                                             void* dst, size_t dstCapacity,
+                                       const void* src, size_t srcSize,
+                                       const void* dict,size_t dictSize);
+
+
+/****************************
+*  Fast dictionary API
+****************************/
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/*! ZSTD_createCDict() :
+*   When compressing multiple messages / blocks with the same dictionary, it's recommended to load it just once.
+*   ZSTD_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
+*   ZSTD_CDict can be created once and used by multiple threads concurrently, as its usage is read-only.
+*   `dictBuffer` can be released after ZSTD_CDict creation, as its content is copied within CDict */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize, int compressionLevel);
+
+/*! ZSTD_freeCDict() :
+*   Function frees memory allocated by ZSTD_createCDict(). */
+ZSTDLIB_API size_t      ZSTD_freeCDict(ZSTD_CDict* CDict);
+
+/*! ZSTD_compress_usingCDict() :
+*   Compression using a digested Dictionary.
+*   Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times.
+*   Note that compression level is decided during dictionary creation. */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+                                            void* dst, size_t dstCapacity,
+                                      const void* src, size_t srcSize,
+                                      const ZSTD_CDict* cdict);
+
+
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/*! ZSTD_createDDict() :
+*   Create a digested dictionary, ready to start decompression operation without startup delay.
+*   dictBuffer can be released after DDict creation, as its content is copied inside DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_freeDDict() :
+*   Function frees memory allocated with ZSTD_createDDict() */
+ZSTDLIB_API size_t      ZSTD_freeDDict(ZSTD_DDict* ddict);
+
+/*! ZSTD_decompress_usingDDict() :
+*   Decompression using a digested Dictionary.
+*   Faster startup than ZSTD_decompress_usingDict(), recommended when same dictionary is used multiple times. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+                                              void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize,
+                                        const ZSTD_DDict* ddict);
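+
+/*  Example sketch : compressing and decompressing several messages with one
+*   digested dictionary. Assumes `dictBuffer`/`dictSize` hold a dictionary
+*   (e.g. built with dictBuilder/zdict.h) and contexts/buffers as above.
+*
+*       ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuffer, dictSize, 3);
+*       ZSTD_DDict* const ddict = ZSTD_createDDict(dictBuffer, dictSize);
+*       size_t const cSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity,
+*                                                     src, srcSize, cdict);
+*       size_t const rSize = ZSTD_decompress_usingDDict(dctx, out, outCapacity,
+*                                                       dst, cSize, ddict);
+*       ZSTD_freeCDict(cdict); ZSTD_freeDDict(ddict);
+*/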
+
+
+/****************************
+*  Streaming
+****************************/
+
+typedef struct ZSTD_inBuffer_s {
+  const void* src;    /**< start of input buffer */
+  size_t size;        /**< size of input buffer */
+  size_t pos;         /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_inBuffer;
+
+typedef struct ZSTD_outBuffer_s {
+  void*  dst;         /**< start of output buffer */
+  size_t size;        /**< size of output buffer */
+  size_t pos;         /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_outBuffer;
+
+
+
+/*-***********************************************************************
+*  Streaming compression - HowTo
+*
+*  A ZSTD_CStream object is required to track streaming operation.
+*  Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
+*  ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
+*  It is recommended to re-use a ZSTD_CStream in situations where many streaming operations will be performed consecutively,
+*  since it plays nicer with the system's memory, by re-using already allocated memory.
+*  Use one separate ZSTD_CStream per thread for parallel execution.
+*
+*  Start a new compression by initializing ZSTD_CStream.
+*  Use ZSTD_initCStream() to start a new compression operation.
+*  Use ZSTD_initCStream_usingDict() or ZSTD_initCStream_usingCDict() for a compression which requires a dictionary (experimental section)
+*
+*  Use ZSTD_compressStream() repetitively to consume input stream.
+*  The function will automatically update both `pos` fields.
+*  Note that it may not consume the entire input, in which case `pos < size`,
+*  and it's up to the caller to present the remaining data again.
+*  @return : a size hint, preferred nb of bytes to use as input for next function call
+*            or an error code, which can be tested using ZSTD_isError().
+*            Note 1 : it's only a hint, meant to help latency a little; any other value will work fine.
+*            Note 2 : size hint is guaranteed to be <= ZSTD_CStreamInSize()
+*
+*  At any moment, it's possible to flush whatever data remains within internal buffer, using ZSTD_flushStream().
+*  `output->pos` will be updated.
+*  Note that some content might still be left within internal buffer if `output->size` is too small.
+*  @return : nb of bytes still present within internal buffer (0 if it's empty)
+*            or an error code, which can be tested using ZSTD_isError().
+*
+*  ZSTD_endStream() instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  The epilogue is required for decoders to consider a frame completed.
+*  Similar to ZSTD_flushStream(), it may not be able to flush the full content if `output->size` is too small;
+*  in that case, call ZSTD_endStream() again to complete the flush.
+*  @return : nb of bytes still present within internal buffer (0 if it's empty, hence compression completed)
+*            or an error code, which can be tested using ZSTD_isError().
+*
+* *******************************************************************/
+
+typedef struct ZSTD_CStream_s ZSTD_CStream;
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
+
+ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void);    /**< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block in all circumstances. */
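+
+/*  Example sketch : compressing one memory buffer with the streaming API.
+*   Assumes `out`/`outCapacity` can hold the whole result
+*   (e.g. ZSTD_compressBound(inSize)); a real caller would typically loop
+*   over chunks of ZSTD_CStreamInSize() / ZSTD_CStreamOutSize().
+*
+*       ZSTD_CStream* const zcs = ZSTD_createCStream();
+*       size_t ret = ZSTD_initCStream(zcs, 3);          // test with ZSTD_isError()
+*       ZSTD_inBuffer  input  = { in, inSize, 0 };
+*       ZSTD_outBuffer output = { out, outCapacity, 0 };
+*       while (input.pos < input.size) {
+*           ret = ZSTD_compressStream(zcs, &output, &input);
+*           if (ZSTD_isError(ret)) break;
+*       }
+*       while ((ret = ZSTD_endStream(zcs, &output)) > 0 && !ZSTD_isError(ret)) { }
+*       ZSTD_freeCStream(zcs);
+*/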
+
+
+
+/*-***************************************************************************
+*  Streaming decompression - HowTo
+*
+*  A ZSTD_DStream object is required to track streaming operations.
+*  Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+*  ZSTD_DStream objects can be re-used multiple times.
+*
+*  Use ZSTD_initDStream() to start a new decompression operation,
+*   or ZSTD_initDStream_usingDict() if decompression requires a dictionary.
+*   @return : recommended first input size
+*
+*  Use ZSTD_decompressStream() repetitively to consume your input.
+*  The function will update both `pos` fields.
+*  If `input.pos < input.size`, some input has not been consumed.
+*  It's up to the caller to present the remaining data again.
+*  If `output.pos < output.size`, the decoder has flushed everything it could.
+*  @return : 0 when a frame is completely decoded and fully flushed,
+*            an error code, which can be tested using ZSTD_isError(),
+*            or any other value > 0, which means there is still some decoding to do to complete the current frame.
+*            The return value is a suggested next input size (a hint to improve latency) that will never load more than the current frame.
+* *******************************************************************************/
+
+typedef struct ZSTD_DStream_s ZSTD_DStream;
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);
+
+ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void);    /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void);   /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */
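+
+/*  Example sketch : decompressing a single frame held entirely in memory.
+*   Assumes `out`/`outCapacity` can hold the whole regenerated content.
+*
+*       ZSTD_DStream* const zds = ZSTD_createDStream();
+*       size_t ret = ZSTD_initDStream(zds);             // also a first-input-size hint
+*       ZSTD_inBuffer  input  = { in, inSize, 0 };
+*       ZSTD_outBuffer output = { out, outCapacity, 0 };
+*       while (ret != 0 && !ZSTD_isError(ret))
+*           ret = ZSTD_decompressStream(zds, &output, &input);
+*       // ret == 0 : frame completely decoded and fully flushed
+*       ZSTD_freeDStream(zds);
+*/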
+
+#endif  /* ZSTD_H_235446 */
+
+
+#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
+#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+
+/****************************************************************************************
+ * START OF ADVANCED AND EXPERIMENTAL FUNCTIONS
+ * The definitions in this section are considered experimental.
+ * They should never be used with a dynamic library, as they may change in the future.
+ * They are provided for advanced usages.
+ * Use them only in association with static linking.
+ * ***************************************************************************************/
+
+/* --- Constants ---*/
+#define ZSTD_MAGICNUMBER            0xFD2FB528   /* >= v0.8.0 */
+#define ZSTD_MAGIC_SKIPPABLE_START  0x184D2A50U
+
+#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
+#define ZSTD_CONTENTSIZE_ERROR   (0ULL - 2)
+
+#define ZSTD_WINDOWLOG_MAX_32  27
+#define ZSTD_WINDOWLOG_MAX_64  27
+#define ZSTD_WINDOWLOG_MAX    ((U32)(MEM_32bits() ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN     10
+#define ZSTD_HASHLOG_MAX       ZSTD_WINDOWLOG_MAX
+#define ZSTD_HASHLOG_MIN        6
+#define ZSTD_CHAINLOG_MAX     (ZSTD_WINDOWLOG_MAX+1)
+#define ZSTD_CHAINLOG_MIN      ZSTD_HASHLOG_MIN
+#define ZSTD_HASHLOG3_MAX      17
+#define ZSTD_SEARCHLOG_MAX    (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN      1
+#define ZSTD_SEARCHLENGTH_MAX   7   /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_SEARCHLENGTH_MIN   3   /* only for ZSTD_btopt, other strategies are limited to 4 */
+#define ZSTD_TARGETLENGTH_MIN   4
+#define ZSTD_TARGETLENGTH_MAX 999
+
+#define ZSTD_FRAMEHEADERSIZE_MAX 18    /* for static allocation */
+#define ZSTD_FRAMEHEADERSIZE_MIN  6
+static const size_t ZSTD_frameHeaderSize_prefix = 5;
+static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN;
+static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX;
+static const size_t ZSTD_skippableHeaderSize = 8;  /* magic number + skippable frame length */
+
+
+/*--- Advanced types ---*/
+typedef enum { ZSTD_fast, ZSTD_dfast, ZSTD_greedy, ZSTD_lazy, ZSTD_lazy2, ZSTD_btlazy2, ZSTD_btopt, ZSTD_btopt2 } ZSTD_strategy;   /* from faster to stronger */
+
+typedef struct {
+    unsigned windowLog;      /**< largest match distance : larger == more compression, more memory needed during decompression */
+    unsigned chainLog;       /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+    unsigned hashLog;        /**< dispatch table : larger == faster, more memory */
+    unsigned searchLog;      /**< nb of searches : larger == more compression, slower */
+    unsigned searchLength;   /**< match length searched : larger == faster decompression, sometimes less compression */
+    unsigned targetLength;   /**< acceptable match size for optimal parser (only) : larger == more compression, slower */
+    ZSTD_strategy strategy;
+} ZSTD_compressionParameters;
+
+typedef struct {
+    unsigned contentSizeFlag; /**< 1: content size will be in frame header (when known) */
+    unsigned checksumFlag;    /**< 1: generate a 32-bits checksum at end of frame, for error detection */
+    unsigned noDictIDFlag;    /**< 1: no dictID will be saved into frame header (if dictionary compression) */
+} ZSTD_frameParameters;
+
+typedef struct {
+    ZSTD_compressionParameters cParams;
+    ZSTD_frameParameters fParams;
+} ZSTD_parameters;
+
+/*= Custom memory allocation functions */
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void  (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+
+/***************************************
+*  Compressed size functions
+***************************************/
+
+/*! ZSTD_findFrameCompressedSize() :
+ *  `src` should point to the start of a ZSTD encoded frame or skippable frame
+ *  `srcSize` must be at least as large as the frame
+ *  @return : the compressed size of the frame pointed to by `src`, suitable to pass to
+ *      `ZSTD_decompress` or similar, or an error code if given invalid input. */
+ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize);
+
+/***************************************
+*  Decompressed size functions
+***************************************/
+/*! ZSTD_getFrameContentSize() :
+*   `src` should point to the start of a ZSTD encoded frame
+*   `srcSize` must be at least as large as the frame header.  A value greater than or equal
+*       to `ZSTD_frameHeaderSize_max` is guaranteed to be large enough in all cases.
+*   @return : decompressed size of the frame pointed to by `src` if known, otherwise
+*             - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+*             - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */
+ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
+
+/*! ZSTD_findDecompressedSize() :
+*   `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+*   `srcSize` must be the _exact_ size of this series
+*       (i.e. there should be a frame boundary exactly `srcSize` bytes after `src`)
+*   @return : the decompressed size of all data in the contained frames, as a 64-bit value _if known_
+*             - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN
+*             - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+*
+*    note 1 : decompressed size is an optional field that may not be present, especially in streaming mode.
+*             When `return==ZSTD_CONTENTSIZE_UNKNOWN`, the data to decompress could be of any size;
+*             in that case, it's necessary to use streaming mode to decompress it.
+*             Optionally, the application can still use ZSTD_decompress() while relying on implied limits.
+*             (For example, data may be necessarily cut into blocks <= 16 KB.)
+*    note 2 : decompressed size is always present when compression is done with ZSTD_compress().
+*    note 3 : decompressed size can be very large (a 64-bit value),
+*             potentially larger than what the local system can handle as a single memory segment;
+*             in that case, it's necessary to use streaming mode to decompress the data.
+*    note 4 : if the source is untrusted, the decompressed size could be wrong or intentionally modified.
+*             Always ensure the result fits within the application's authorized limits.
+*             Each application can set its own limits.
+*    note 5 : ZSTD_findDecompressedSize() handles multiple frames, and so it must traverse the input to
+*             read each contained frame header.  This is efficient, as most of the data is skipped;
+*             however, it does mean that all frame data must be present and valid. */
+ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize);
+
+
+/***************************************
+*  Advanced compression functions
+***************************************/
+/*! ZSTD_estimateCCtxSize() :
+ *  Gives the amount of memory allocated for a ZSTD_CCtx given a set of compression parameters. */
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize(ZSTD_compressionParameters cParams);
+
+/*! ZSTD_createCCtx_advanced() :
+ *  Create a ZSTD compression context using external alloc and free functions */
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+
+/*! ZSTD_sizeof_CCtx() :
+ *  Gives the amount of memory used by a given ZSTD_CCtx */
+ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
+
+typedef enum {
+    ZSTD_p_forceWindow,   /* Force back-references to remain < windowSize, even when referencing Dictionary content (default:0) */
+    ZSTD_p_forceRawDict   /* Force loading dictionary in "content-only" mode (no header analysis) */
+} ZSTD_CCtxParameter;
+/*! ZSTD_setCCtxParameter() :
+ *  Set advanced parameters, selected through enum ZSTD_CCtxParameter
+ *  @result : 0, or an error code (which can be tested with ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTD_setCCtxParameter(ZSTD_CCtx* cctx, ZSTD_CCtxParameter param, unsigned value);
+
+/*! ZSTD_createCDict_byReference() :
+ *  Create a digested dictionary for compression
+ *  Dictionary content is simply referenced, and therefore stays in dictBuffer.
+ *  It is important that dictBuffer outlives CDict; it must remain read accessible throughout the lifetime of CDict */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);
+
+/*! ZSTD_createCDict_advanced() :
+ *  Create a ZSTD_CDict using external alloc and free, and customized compression parameters */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, unsigned byReference,
+                                                  ZSTD_parameters params, ZSTD_customMem customMem);
+
+/*! ZSTD_sizeof_CDict() :
+ *  Gives the amount of memory used by a given ZSTD_CDict */
+ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict);
+
+/*! ZSTD_getCParams() :
+*   @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize.
+*   `estimatedSrcSize` value is optional, select 0 if not known */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_getParams() :
+*   same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`.
+*   All fields of `ZSTD_frameParameters` are set to default (0) */
+ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_checkCParams() :
+*   Ensure param values remain within authorized range */
+ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
+
+/*! ZSTD_adjustCParams() :
+*   optimize params for a given `srcSize` and `dictSize`.
+*   both values are optional, select `0` if unknown. */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
+
+/*! ZSTD_compress_advanced() :
+*   Same as ZSTD_compress_usingDict(), with fine-tune control of each compression parameter */
+ZSTDLIB_API size_t ZSTD_compress_advanced (ZSTD_CCtx* ctx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     const void* dict,size_t dictSize,
+                                           ZSTD_parameters params);
+
+
+/*--- Advanced decompression functions ---*/
+
+/*! ZSTD_isFrame() :
+ *  Tells if the content of `buffer` starts with a valid Frame Identifier.
+ *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ *  Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ *  Note 3 : Skippable Frame Identifiers are considered valid. */
+ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size);
+
+/*! ZSTD_estimateDCtxSize() :
+ *  Gives the potential amount of memory allocated to create a ZSTD_DCtx */
+ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void);
+
+/*! ZSTD_createDCtx_advanced() :
+ *  Create a ZSTD decompression context using external alloc and free functions */
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+
+/*! ZSTD_sizeof_DCtx() :
+ *  Gives the amount of memory used by a given ZSTD_DCtx */
+ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
+
+/*! ZSTD_createDDict_byReference() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  Dictionary content is simply referenced, and therefore stays in dictBuffer.
+ *  It is important that dictBuffer outlives DDict; it must remain read accessible throughout the lifetime of DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);
+
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+                                                  unsigned byReference, ZSTD_customMem customMem);
+
+/*! ZSTD_sizeof_DDict() :
+ *  Gives the amount of memory used by a given ZSTD_DDict */
+ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromDict() :
+ *  Provides the dictID stored within dictionary.
+ *  if @return == 0, the dictionary is not conformant with Zstandard specification.
+ *  It can still be loaded, but as a content-only dictionary. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize);
+
+/*! ZSTD_getDictID_fromDDict() :
+ *  Provides the dictID of the dictionary loaded into `ddict`.
+ *  If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() :
+ *  Provides the dictID required to decompress the frame stored within `src`.
+ *  If @return == 0, the dictID could not be decoded.
+ *  This could be for one of the following reasons :
+ *  - The frame does not require a dictionary to be decoded (most common case).
+ *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is hidden information.
+ *    Note : this use case also happens when using a non-conformant dictionary.
+ *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ *  - This is not a Zstandard frame.
+ *  To identify the exact failure cause, use ZSTD_getFrameParams(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+
+
+/********************************************************************
+*  Advanced streaming functions
+********************************************************************/
+
+/*=====   Advanced Streaming compression functions  =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize);   /**< pledgedSrcSize must be correct, a size of 0 means unknown.  For a frame size of 0, use initCStream_advanced */
+ZSTDLIB_API size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); /**< note: a dict will not be used if dict == NULL or dictSize < 8 */
+ZSTDLIB_API size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize,
+                                             ZSTD_parameters params, unsigned long long pledgedSrcSize);  /**< pledgedSrcSize is optional and can be 0 (meaning unknown). note: if the contentSizeFlag is set, pledgedSrcSize == 0 means the source size is actually 0 */
+ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);  /**< note : cdict will just be referenced, and must outlive compression session */
+ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);  /**< re-use compression parameters from previous init; skip dictionary loading stage; zcs must have been initialized at least once before. note: pledgedSrcSize must be correct, a size of 0 means unknown.  For a frame size of 0, use initCStream_advanced */
+ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
+
+
+/*=====   Advanced Streaming decompression functions  =====*/
+typedef enum { DStream_p_maxWindowSize } ZSTD_DStreamParameter_e;
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); /**< note: a dict will not be used if dict == NULL or dictSize < 8 */
+ZSTDLIB_API size_t ZSTD_setDStreamParameter(ZSTD_DStream* zds, ZSTD_DStreamParameter_e paramType, unsigned paramValue);
+ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);  /**< note : ddict will just be referenced, and must outlive decompression session */
+ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);  /**< re-use decompression parameters from previous init; saves dictionary loading */
+ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
+
+
+/*********************************************************************
+*  Buffer-less and synchronous inner streaming functions
+*
+*  This is an advanced API, giving full control over buffer management, for users who need direct control over memory.
+*  But it's also a complex one, with many restrictions (documented below).
+*  Prefer the normal streaming API for an easier experience.
+********************************************************************* */
+
+/**
+  Buffer-less streaming compression (synchronous mode)
+
+  A ZSTD_CCtx object is required to track streaming operations.
+  Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
+  ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+
+  Start by initializing a context.
+  Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression,
+  or ZSTD_compressBegin_advanced(), for finer parameter control.
+  It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()
+
+  Then, consume your input using ZSTD_compressContinue().
+  There are some important considerations to keep in mind when using this advanced function :
+  - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffer only.
+  - Interface is synchronous : input is consumed entirely and produces one or more compressed blocks.
+  - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario.
+    Worst case evaluation is provided by ZSTD_compressBound().
+    ZSTD_compressContinue() doesn't guarantee recovery after a failed compression.
+  - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog).
+    It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consist of multiple contiguous blocks)
+  - ZSTD_compressContinue() detects that prior input has been overwritten when `src` buffer overlaps.
+    In which case, it will "discard" the relevant memory section from its history.
+
+  Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum.
+  It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
+  Without last block mark, frames will be considered unfinished (corrupted) by decoders.
+
+  `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress some new frame.
+*/
+
+/*=====   Buffer-less streaming compression functions  =====*/
+ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize is optional and can be 0 (meaning unknown). note: if the contentSizeFlag is set, pledgedSrcSize == 0 means the source size is actually 0 */
+ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**<  note: pledgedSrcSize can be 0, indicating unknown size.  If it is non-zero, it must be accurate.  For 0-size frames, use compressBegin_advanced */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict, unsigned long long pledgedSrcSize); /**< note: pledgedSrcSize can be 0, indicating unknown size.  If it is non-zero, it must be accurate.  For 0-size frames, use compressBegin_advanced */
+ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
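+
+/*  Example sketch : the simplest buffer-less frame, compressing one
+*   contiguous segment. Assumes `dstCapacity >= ZSTD_compressBound(srcSize)`.
+*
+*       size_t const initRet = ZSTD_compressBegin(cctx, 3);   // test with ZSTD_isError()
+*       size_t const frameSize = ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+*       // ZSTD_compressEnd() consumes the last input, writes the last block(s)
+*       // and the epilogue; here it returns the size of the whole frame.
+*/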
+
+
+
+/*-
+  Buffer-less streaming decompression (synchronous mode)
+
+  A ZSTD_DCtx object is required to track streaming operations.
+  Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+  A ZSTD_DCtx object can be re-used multiple times.
+
+  First typical operation is to retrieve frame parameters, using ZSTD_getFrameParams().
+  It fills a ZSTD_frameParams structure which provides important information to correctly decode the frame,
+  such as the minimum rolling buffer size to allocate to decompress data (`windowSize`),
+  and the dictionary ID used.
+  (Note : content size is optional, it may not be present. 0 means : content size unknown).
+  Note that these values could be wrong, either because of data malformation, or because an attacker is deliberately spoofing false information.
+  As a consequence, check that values remain within valid application range, especially `windowSize`, before allocation.
+  Each application can set its own limit, depending on local restrictions. For extended interoperability, it is recommended to support at least 8 MB.
+  Frame parameters are extracted from the beginning of the compressed frame.
+  Data fragment must be large enough to ensure successful decoding, typically `ZSTD_frameHeaderSize_max` bytes.
+  @result : 0 : successful decoding, the `ZSTD_frameParams` structure is correctly filled.
+           >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+           or an error code, which can be tested using ZSTD_isError().
+
+  Start decompression, with ZSTD_decompressBegin() or ZSTD_decompressBegin_usingDict().
+  Alternatively, you can copy a prepared context, using ZSTD_copyDCtx().
+
+  Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately.
+  ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
+
+  @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+  It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some metadata item.
+  It can also be an error code, which can be tested with ZSTD_isError().
+
+  ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize`.
+  They should preferably be located contiguously, prior to current block.
+  Alternatively, a round buffer of sufficient size is also possible. Sufficient size is determined by frame parameters.
+  ZSTD_decompressContinue() is very sensitive to contiguity;
+  if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place,
+  or that the previous contiguous segment is large enough to properly handle the maximum back-reference.
+
+  A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+
+  Note : it's possible to know whether the next input to present is a header or a block, using ZSTD_nextInputType().
+  This information is not required to properly decode a frame.
+
+  == Special case : skippable frames ==
+
+  Skippable frames allow integration of user-defined data into a flow of concatenated frames.
+  Skippable frames will be ignored (skipped) by a decompressor. The format of skippable frames is as follows :
+  a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F
+  b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
+  c) Frame Content - any content (User Data) of length equal to Frame Size
+  For skippable frames ZSTD_decompressContinue() always returns 0.
+  For skippable frames ZSTD_getFrameParams() returns fparamsPtr->windowLog==0, which means that the frame is skippable.
+    Note : If fparamsPtr->frameContentSize==0, it is ambiguous: the frame might actually be a Zstd encoded frame with no content.
+           For purposes of decompression, it is valid in both cases to skip the frame using
+           ZSTD_findFrameCompressedSize to find its size in bytes.
+  It also returns the Frame Size as fparamsPtr->frameContentSize.
+*/
+
+typedef struct {
+    unsigned long long frameContentSize;
+    unsigned windowSize;
+    unsigned dictID;
+    unsigned checksumFlag;
+} ZSTD_frameParams;
+
+/*=====   Buffer-less streaming decompression functions  =====*/
+ZSTDLIB_API size_t ZSTD_getFrameParams(ZSTD_frameParams* fparamsPtr, const void* src, size_t srcSize);   /**< doesn't consume input, see details below */
+ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API void   ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
+ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
+ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
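+
+/*  Example sketch : the decoding loop described above, assuming the whole
+*   frame is contiguous in memory and `op`/`oend` (char pointers) track the
+*   output buffer.
+*
+*       ZSTD_frameParams fParams;
+*       size_t ret = ZSTD_getFrameParams(&fParams, src, srcSize);   // 0 on success
+*       ret = ZSTD_decompressBegin(dctx);
+*       const char* ip = (const char*)src;
+*       size_t next;
+*       while ((next = ZSTD_nextSrcSizeToDecompress(dctx)) != 0) {
+*           size_t const dSize = ZSTD_decompressContinue(dctx, op, (size_t)(oend-op), ip, next);
+*           if (ZSTD_isError(dSize)) break;
+*           ip += next; op += dSize;      // dSize may be 0 (metadata item)
+*       }
+*/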
+
+/**
+    Block functions
+
+    Block functions produce and decode raw zstd blocks, without frame metadata.
+    Frame metadata cost is typically ~18 bytes, which can be non-negligible for very small blocks (< 100 bytes).
+    The user will have to keep track of the information required to regenerate the data, such as compressed and content sizes.
+
+    A few rules to respect :
+    - Compressing and decompressing require a context structure
+      + Use ZSTD_createCCtx() and ZSTD_createDCtx()
+    - It is necessary to init context before starting
+      + compression : ZSTD_compressBegin()
+      + decompression : ZSTD_decompressBegin()
+      + variants _usingDict() are also allowed
+      + copyCCtx() and copyDCtx() work too
+    - Block size is limited, it must be <= ZSTD_getBlockSizeMax()
+      + If you need to compress more, cut data into multiple blocks
+      + Consider using the regular ZSTD_compress() instead, as frame metadata costs become negligible when source size is large.
+    - When a block is considered not compressible enough, ZSTD_compressBlock() result will be zero.
+      In which case, nothing is produced into `dst`.
+      + User must test for such outcome and deal directly with uncompressed data
+      + ZSTD_decompressBlock() doesn't accept uncompressed data as input !!!
+      + In case of multiple successive blocks, decoder must be informed of uncompressed block existence to follow proper history.
+        Use ZSTD_insertBlock() in such a case.
+*/
+
+#define ZSTD_BLOCKSIZE_ABSOLUTEMAX (128 * 1024)   /* define, for static allocation */
+/*=====   Raw zstd block functions  =====*/
+ZSTDLIB_API size_t ZSTD_getBlockSizeMax(ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  /**< insert block into `dctx` history. Useful for uncompressed blocks */
+
+
+#endif   /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/zstd/zlibWrapper/BUCK b/zstd/zlibWrapper/BUCK
new file mode 100644
index 0000000..a3b74ac
--- /dev/null
+++ b/zstd/zlibWrapper/BUCK
@@ -0,0 +1,22 @@
+cxx_library(
+    name='zlib_wrapper',
+    visibility=['PUBLIC'],
+    exported_linker_flags=['-lz'],
+    header_namespace='',
+    exported_headers=['zstd_zlibwrapper.h'],
+    headers=[
+        'gzcompatibility.h',
+        'gzguts.h',
+    ],
+    srcs=glob(['*.c']),
+    deps=[
+        '//lib:zstd',
+        '//lib:zstd_common',
+    ]
+)
+
+cxx_binary(
+    name='minigzip',
+    srcs=['examples/minigzip.c'],
+    deps=[':zlib_wrapper'],
+)
diff --git a/zstd/zlibWrapper/Makefile b/zstd/zlibWrapper/Makefile
new file mode 100644
index 0000000..5a63787
--- /dev/null
+++ b/zstd/zlibWrapper/Makefile
@@ -0,0 +1,107 @@
+# Makefile for example of using zstd wrapper for zlib
+#
+# make - compiles examples
+# make MOREFLAGS=-DZWRAP_USE_ZSTD=1 - compiles examples with zstd compression turned on
+# make test - runs examples
+
+
+# Paths to static and dynamic zlib and zstd libraries
+# Use "make ZLIB_PATH=path/to/zlib ZLIB_LIBRARY=path/to/libz.a" to select a path to library
+ZLIB_LIBRARY ?= -lz
+ZLIB_PATH ?= .
+
+ZSTDLIBDIR = ../lib
+ZSTDLIBRARY = $(ZSTDLIBDIR)/libzstd.a
+ZLIBWRAPPER_PATH = .
+GZFILES = gzclose.o gzlib.o gzread.o gzwrite.o
+EXAMPLE_PATH = examples
+PROGRAMS_PATH = ../programs
+TEST_FILE = ../doc/zstd_compression_format.md
+
+CPPFLAGS = -DXXH_NAMESPACE=ZSTD_ -I$(ZLIB_PATH) -I$(PROGRAMS_PATH) -I$(ZSTDLIBDIR) -I$(ZSTDLIBDIR)/common -I$(ZLIBWRAPPER_PATH)
+CFLAGS  ?= $(MOREFLAGS) -O3 -std=gnu99
+CFLAGS  += -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow -Wswitch-enum -Wdeclaration-after-statement -Wstrict-prototypes -Wundef -Wstrict-aliasing=1
+
+
+# Define *.exe as extension for Windows systems
+ifneq (,$(filter Windows%,$(OS)))
+EXT =.exe
+else
+EXT =
+endif
+
+
+all: clean fitblk example zwrapbench minigzip
+
+test: example fitblk example_zstd fitblk_zstd zwrapbench minigzip minigzip_zstd
+	./example
+	./example_zstd
+	./fitblk 10240 <$(TEST_FILE)
+	./fitblk 40960 <$(TEST_FILE)
+	./fitblk_zstd 10240 <$(TEST_FILE)
+	./fitblk_zstd 40960 <$(TEST_FILE)
+	@echo ---- minigzip start ----
+	./minigzip_zstd example$(EXT)
+	#cp example$(EXT).gz example$(EXT)_zstd.gz
+	./minigzip_zstd -d example$(EXT).gz
+	./minigzip example$(EXT)
+	#cp example$(EXT).gz example$(EXT)_gz.gz
+	./minigzip_zstd -d example$(EXT).gz
+	@echo ---- minigzip end ----
+	./zwrapbench -qb3B1K $(TEST_FILE)
+	./zwrapbench -rqb1e5 ../lib ../programs ../tests
+
+#valgrindTest: ZSTDLIBRARY = $(ZSTDLIBDIR)/libzstd.so
+valgrindTest: VALGRIND = LD_LIBRARY_PATH=$(ZSTDLIBDIR) valgrind --track-origins=yes --leak-check=full --error-exitcode=1
+valgrindTest: clean example fitblk example_zstd fitblk_zstd zwrapbench
+	@echo "\n ---- valgrind tests ----"
+	$(VALGRIND) ./example
+	$(VALGRIND) ./example_zstd
+	$(VALGRIND) ./fitblk 10240 <$(TEST_FILE)
+	$(VALGRIND) ./fitblk 40960 <$(TEST_FILE)
+	$(VALGRIND) ./fitblk_zstd 10240 <$(TEST_FILE)
+	$(VALGRIND) ./fitblk_zstd 40960 <$(TEST_FILE)
+	$(VALGRIND) ./zwrapbench -qb3B1K $(TEST_FILE)
+	$(VALGRIND) ./zwrapbench -rqb1e5 ../lib ../programs ../tests
+
+#.c.o:
+#	$(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $<
+
+minigzip: $(EXAMPLE_PATH)/minigzip.o $(ZLIBWRAPPER_PATH)/zstd_zlibwrapper.o $(GZFILES) $(ZSTDLIBRARY)
+	$(CC) $(LDFLAGS) $^ $(ZSTDLIBRARY) $(ZLIB_LIBRARY) -o $@
+
+minigzip_zstd: $(EXAMPLE_PATH)/minigzip.o $(ZLIBWRAPPER_PATH)/zstdTurnedOn_zlibwrapper.o $(GZFILES) $(ZSTDLIBRARY)
+	$(CC) $(LDFLAGS) $^ $(ZSTDLIBRARY) $(ZLIB_LIBRARY) -o $@
+
+example: $(EXAMPLE_PATH)/example.o $(ZLIBWRAPPER_PATH)/zstd_zlibwrapper.o $(GZFILES) $(ZSTDLIBRARY)
+	$(CC) $(LDFLAGS) $^ $(ZLIB_LIBRARY) -o $@
+
+example_zstd: $(EXAMPLE_PATH)/example.o $(ZLIBWRAPPER_PATH)/zstdTurnedOn_zlibwrapper.o $(GZFILES) $(ZSTDLIBRARY)
+	$(CC) $(LDFLAGS) $^ $(ZLIB_LIBRARY) -o $@
+
+fitblk: $(EXAMPLE_PATH)/fitblk.o $(ZLIBWRAPPER_PATH)/zstd_zlibwrapper.o $(ZSTDLIBRARY)
+	$(CC) $(LDFLAGS) $^ $(ZLIB_LIBRARY) -o $@
+
+fitblk_zstd: $(EXAMPLE_PATH)/fitblk.o $(ZLIBWRAPPER_PATH)/zstdTurnedOn_zlibwrapper.o $(ZSTDLIBRARY)
+	$(CC) $(LDFLAGS) $^ $(ZLIB_LIBRARY) -o $@
+
+zwrapbench: $(EXAMPLE_PATH)/zwrapbench.o $(ZLIBWRAPPER_PATH)/zstd_zlibwrapper.o $(PROGRAMS_PATH)/datagen.o $(ZSTDLIBRARY)
+	$(CC) $(LDFLAGS) $^ $(ZLIB_LIBRARY) -o $@
+
+
+$(ZLIBWRAPPER_PATH)/zstd_zlibwrapper.o: $(ZLIBWRAPPER_PATH)/zstd_zlibwrapper.c $(ZLIBWRAPPER_PATH)/zstd_zlibwrapper.h
+	$(CC) $(CFLAGS) $(CPPFLAGS) -I. -c -o $@ $(ZLIBWRAPPER_PATH)/zstd_zlibwrapper.c
+
+$(ZLIBWRAPPER_PATH)/zstdTurnedOn_zlibwrapper.o: $(ZLIBWRAPPER_PATH)/zstd_zlibwrapper.c $(ZLIBWRAPPER_PATH)/zstd_zlibwrapper.h
+	$(CC) $(CFLAGS) $(CPPFLAGS) -DZWRAP_USE_ZSTD=1 -I. -c -o $@ $(ZLIBWRAPPER_PATH)/zstd_zlibwrapper.c
+
+$(ZSTDLIBDIR)/libzstd.a:
+	$(MAKE) -C $(ZSTDLIBDIR) libzstd.a
+
+$(ZSTDLIBDIR)/libzstd.so:
+	$(MAKE) -C $(ZSTDLIBDIR) libzstd
+
+
+clean:
+	-$(RM) $(ZLIBWRAPPER_PATH)/*.o $(EXAMPLE_PATH)/*.o *.o foo.gz example$(EXT) example_zstd$(EXT) fitblk$(EXT) fitblk_zstd$(EXT) zwrapbench$(EXT) minigzip$(EXT) minigzip_zstd$(EXT)
+	@echo Cleaning completed
diff --git a/zstd/zlibWrapper/README.md b/zstd/zlibWrapper/README.md
new file mode 100644
index 0000000..164b69a
--- /dev/null
+++ b/zstd/zlibWrapper/README.md
@@ -0,0 +1,163 @@
+Zstandard wrapper for zlib
+================================
+
+The main objective of creating a zstd wrapper for [zlib](http://zlib.net/) is to allow a quick and smooth transition to zstd for projects already using zlib.
+
+#### Required files
+
+To build the zstd wrapper for zlib the following files are required:
+- zlib.h
+- a static or dynamic zlib library
+- zlibWrapper/zstd_zlibwrapper.h
+- zlibWrapper/zstd_zlibwrapper.c
+- zlibWrapper/gz*.c files (gzclose.c, gzlib.c, gzread.c, gzwrite.c)
+- zlibWrapper/gz*.h files (gzcompatibility.h, gzguts.h)
+- a static or dynamic zstd library
+
+The first two files are required by all projects using zlib and they are not included with the zstd distribution.
+The remaining files are supplied with the zstd distribution.
+
+
+#### Embedding the zstd wrapper within your project
+
+Let's assume that your project using zlib is compiled with:
+```gcc project.o -lz```
+
+To compile the zstd wrapper with your project you have to do the following:
+- change all references from `#include "zlib.h"` to `#include "zstd_zlibwrapper.h"`
+- compile your project with `zstd_zlibwrapper.c`, `gz*.c` and a static or dynamic zstd library
+
+The linking should be changed to:
+```gcc project.o zstd_zlibwrapper.o gz*.c -lz -lzstd```
+
+
+#### Enabling zstd compression within your project
+
+After embedding the zstd wrapper within your project, zstd compression is turned off by default.
+Your project should work as before with zlib. There are two options to enable zstd compression:
+- compilation with `-DZWRAP_USE_ZSTD=1` (or using `#define ZWRAP_USE_ZSTD 1` before `#include "zstd_zlibwrapper.h"`)
+- using the `void ZWRAP_useZSTDcompression(int turn_on)` function (declared in `#include "zstd_zlibwrapper.h"`)
+
+During decompression, zlib and zstd streams are automatically detected and decompressed using the appropriate library.
+This behavior can be changed using `ZWRAP_setDecompressionType(ZWRAP_FORCE_ZLIB)`, which will make zlib decompression slightly faster.
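+
+A minimal sketch of the run-time option (the surrounding zlib calls are assumptions for illustration):
+```
+#include "zstd_zlibwrapper.h"
+
+void compress_with_zstd_backend(z_streamp strm)
+{
+    ZWRAP_useZSTDcompression(1);   /* 1 = compress with zstd, 0 = plain zlib */
+    deflateInit(strm, 6);          /* existing zlib calls stay unchanged */
+    /* ... deflate() calls as before ... */
+    deflateEnd(strm);
+}
+```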
+
+
+#### Example
+We have taken the file `test/example.c` from [the zlib library distribution](http://zlib.net/) and copied it to [zlibWrapper/examples/example.c](examples/example.c).
+After compilation and execution it shows the following results:
+```
+zlib version 1.2.8 = 0x1280, compile flags = 0x65
+uncompress(): hello, hello!
+gzread(): hello, hello!
+gzgets() after gzseek:  hello!
+inflate(): hello, hello!
+large_inflate(): OK
+after inflateSync(): hello, hello!
+inflate with dictionary: hello, hello!
+```
+Then we changed `#include "zlib.h"` to `#include "zstd_zlibwrapper.h"`, compiled the [example.c](examples/example.c) file
+with `-DZWRAP_USE_ZSTD=1`, and additionally linked with `zstd_zlibwrapper.o gz*.c -lzstd`.
+We had to turn off the following functions, which use currently unsupported features: `test_flush` and `test_sync`.
+After running it shows the following results:
+```
+zlib version 1.2.8 = 0x1280, compile flags = 0x65
+uncompress(): hello, hello!
+gzread(): hello, hello!
+gzgets() after gzseek:  hello!
+inflate(): hello, hello!
+large_inflate(): OK
+inflate with dictionary: hello, hello!
+```
+The script used for compilation can be found at [zlibWrapper/Makefile](Makefile).
+
+
+#### Measuring the performance of the Zstandard wrapper for zlib
+
+The zstd distribution contains a tool called `zwrapbench` which can measure speed and ratio of zlib, zstd, and the wrapper.
+The benchmark is conducted on the given filenames, or on synthetic data if no filenames are provided.
+The files are read into memory and processed independently.
+This makes the benchmark more precise, as it eliminates I/O overhead.
+Many filenames can be supplied as multiple parameters, as parameters with wildcards, or, with the `-r` option, as names of directories.
+One can select compression levels starting from `-b` and ending with `-e`. The `-i` parameter selects the minimal time used for each tested level.
+With the `-B` option, bigger files can be divided into smaller, independently compressed blocks.
+The benchmark tool can be compiled with `make zwrapbench` using [zlibWrapper/Makefile](Makefile).
+
+
+#### Improving speed of streaming compression
+
+During streaming compression, the compressor never knows how big the data to compress is.
+Zstandard compression can be improved by providing the size of the source data to the compressor. By default, the streaming compressor assumes that data is bigger than 256 KB, which can hurt compression speed on smaller data.
+The zstd wrapper provides the `ZWRAP_setPledgedSrcSize()` function, which changes the pledged source size for a given compression stream.
+The function will change the zstd compression parameters, which may improve compression speed and/or ratio.
+It should be called just after `deflateInit()` or `deflateReset()` and before `deflate()` or `deflateSetDictionary()`. The function is only helpful when data is compressed in blocks. There will be no change in the case of `deflateInit()` or `deflateReset()` immediately followed by `deflate(strm, Z_FINISH)`,
+as this case is automatically detected.
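+
+A sketch of the intended call order (the signature is assumed from the description above; the 4 KB block size is an arbitrary example):
+```
+z_stream strm;
+memset(&strm, 0, sizeof(strm));
+deflateInit(&strm, 6);
+ZWRAP_setPledgedSrcSize(&strm, 4096);   /* before deflate() of each block */
+/* ... deflate() the 4 KB block ... */
+deflateReset(&strm);
+ZWRAP_setPledgedSrcSize(&strm, 4096);   /* set again after each reset */
+```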
+
+
+#### Reusing contexts
+
+The ordinary zlib compression of two files/streams allocates two contexts:
+- for the 1st file it calls `deflateInit`, `deflate`, `...`, `deflate`, `deflateEnd`
+- for the 2nd file it calls `deflateInit`, `deflate`, `...`, `deflate`, `deflateEnd`
+
+The speed of compression can be improved by reusing a single context, with the following steps (sketched in code below):
+- initialize the context with `deflateInit`
+- for the 1st file call `deflate`, `...`, `deflate`
+- for the 2nd file call `deflateReset`, `deflate`, `...`, `deflate`
+- free the context with `deflateEnd`
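+
+Sketched in zlib terms (`compress_one_file` is a hypothetical helper wrapping the `deflate()` loop):
+```
+z_stream strm;
+memset(&strm, 0, sizeof(strm));
+deflateInit(&strm, 6);
+compress_one_file(&strm, file1);   /* deflate(), ..., deflate(&strm, Z_FINISH) */
+deflateReset(&strm);               /* keeps the allocated context */
+compress_one_file(&strm, file2);
+deflateEnd(&strm);
+```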
+
+To check the difference, we ran experiments using `zwrapbench -ri6b6` with zstd and zlib compression (both at level 6).
+The input data was a decompressed git repository downloaded from https://github.com/git/git/archive/master.zip which contains 2979 files.
+The table below shows that reusing contexts has a minor influence on zlib, but gives an improvement for zstd.
+In our example (the last 2 lines) it gives 4% better compression speed and 5% better decompression speed.
+
+| Compression type                                  | Compression | Decompress.| Compr. size | Ratio |
+| ------------------------------------------------- | ------------| -----------| ----------- | ----- |
+| zlib 1.2.8                                        |  30.51 MB/s | 219.3 MB/s |     6819783 | 3.459 |
+| zlib 1.2.8 not reusing a context                  |  30.22 MB/s | 218.1 MB/s |     6819783 | 3.459 |
+| zlib 1.2.8 with zlibWrapper and reusing a context |  30.40 MB/s | 218.9 MB/s |     6819783 | 3.459 |
+| zlib 1.2.8 with zlibWrapper not reusing a context |  30.28 MB/s | 218.1 MB/s |     6819783 | 3.459 |
+| zstd 1.1.0 using ZSTD_CCtx                        |  68.35 MB/s | 430.9 MB/s |     6868521 | 3.435 |
+| zstd 1.1.0 using ZSTD_CStream                     |  66.63 MB/s | 422.3 MB/s |     6868521 | 3.435 |
+| zstd 1.1.0 with zlibWrapper and reusing a context |  54.01 MB/s | 403.2 MB/s |     6763482 | 3.488 |
+| zstd 1.1.0 with zlibWrapper not reusing a context |  51.59 MB/s | 383.7 MB/s |     6763482 | 3.488 |
+
+
+#### Compatibility issues
+After enabling zstd compression, not all native zlib functions are supported. Unsupported methods put an error message into `strm->msg` and return Z_STREAM_ERROR.
+
+Supported methods:
+- deflateInit
+- deflate (with exception of Z_FULL_FLUSH, Z_BLOCK, and Z_TREES)
+- deflateSetDictionary
+- deflateEnd
+- deflateReset
+- deflateBound
+- inflateInit
+- inflate
+- inflateSetDictionary
+- inflateReset
+- inflateReset2
+- compress
+- compress2
+- compressBound
+- uncompress
+- gzip file access functions
+
+Ignored methods (they do nothing):
+- deflateParams
+
+Unsupported methods:
+- deflateCopy
+- deflateTune
+- deflatePending
+- deflatePrime
+- deflateSetHeader
+- inflateGetDictionary
+- inflateCopy
+- inflateSync
+- inflatePrime
+- inflateMark
+- inflateGetHeader
+- inflateBackInit
+- inflateBack
+- inflateBackEnd
diff --git a/zstd/zlibWrapper/example b/zstd/zlibWrapper/example
new file mode 100755
index 0000000..84c4406
Binary files /dev/null and b/zstd/zlibWrapper/example differ
diff --git a/zstd/zlibWrapper/fitblk b/zstd/zlibWrapper/fitblk
new file mode 100755
index 0000000..e4c8eea
Binary files /dev/null and b/zstd/zlibWrapper/fitblk differ
diff --git a/zstd/zlibWrapper/gzclose.c b/zstd/zlibWrapper/gzclose.c
new file mode 100644
index 0000000..d4493d0
--- /dev/null
+++ b/zstd/zlibWrapper/gzclose.c
@@ -0,0 +1,28 @@
+/* gzclose.c contains minimal changes required to be compiled with zlibWrapper:
+ * - gz_statep was converted to union to work with -Wstrict-aliasing=1      */
+
+/* gzclose.c -- zlib gzclose() function
+ * Copyright (C) 2004, 2010 Mark Adler
+ * For conditions of distribution and use, see http://www.zlib.net/zlib_license.html
+ */
+
+#include "gzguts.h"
+
+/* gzclose() is in a separate file so that it is linked in only if it is used.
+   That way the other gzclose functions can be used instead to avoid linking in
+   unneeded compression or decompression routines. */
+int ZEXPORT gzclose(file)
+    gzFile file;
+{
+#ifndef NO_GZCOMPRESS
+    gz_statep state;
+
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_statep)file;
+
+    return state.state->mode == GZ_READ ? gzclose_r(file) : gzclose_w(file);
+#else
+    return gzclose_r(file);
+#endif
+}
diff --git a/zstd/zlibWrapper/gzclose.o b/zstd/zlibWrapper/gzclose.o
new file mode 100644
index 0000000..32b2b55
Binary files /dev/null and b/zstd/zlibWrapper/gzclose.o differ
diff --git a/zstd/zlibWrapper/gzcompatibility.h b/zstd/zlibWrapper/gzcompatibility.h
new file mode 100644
index 0000000..e2ec1ad
--- /dev/null
+++ b/zstd/zlibWrapper/gzcompatibility.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright (c) 2016-present, Przemyslaw Skibinski, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+
+
+#if ZLIB_VERNUM <= 0x1240
+ZEXTERN int ZEXPORT gzclose_r OF((gzFile file));
+ZEXTERN int ZEXPORT gzclose_w OF((gzFile file));
+ZEXTERN int ZEXPORT gzbuffer OF((gzFile file, unsigned size)); 
+ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile file));
+ 
+#if !defined(_WIN32) && defined(Z_LARGE64)
+#  define z_off64_t off64_t
+#else
+#  if defined(_WIN32) && !defined(__GNUC__) && !defined(Z_SOLO)
+#    define z_off64_t __int64
+#  else
+#    define z_off64_t z_off_t
+#  endif
+#endif
+#endif
+
+
+#if ZLIB_VERNUM <= 0x1250
+struct gzFile_s {
+    unsigned have;
+    unsigned char *next;
+    z_off64_t pos;
+};
+#endif
+
+
+#if ZLIB_VERNUM <= 0x1270
+#if defined(_WIN32) && !defined(Z_SOLO)
+#    include <stddef.h>         /* for wchar_t */ 
+ZEXTERN gzFile         ZEXPORT gzopen_w OF((const wchar_t *path,
+                                            const char *mode));
+#endif
+#endif
+
+
+#if ZLIB_VERNUM < 0x12B0
+#ifdef Z_SOLO
+   typedef unsigned long z_size_t;
+#else
+#  define z_longlong long long
+#  if defined(NO_SIZE_T)
+     typedef unsigned NO_SIZE_T z_size_t;
+#  elif defined(STDC)
+#    include <stddef.h>
+     typedef size_t z_size_t;
+#  else
+     typedef unsigned long z_size_t;
+#  endif
+#  undef z_longlong
+#endif
+ZEXTERN z_size_t ZEXPORT gzfread OF((voidp buf, z_size_t size, z_size_t nitems,
+                                     gzFile file));
+ZEXTERN z_size_t ZEXPORT gzfwrite OF((voidpc buf, z_size_t size,
+                                      z_size_t nitems, gzFile file));
+#endif
diff --git a/zstd/zlibWrapper/gzguts.h b/zstd/zlibWrapper/gzguts.h
new file mode 100644
index 0000000..84651b8
--- /dev/null
+++ b/zstd/zlibWrapper/gzguts.h
@@ -0,0 +1,227 @@
+/* gzguts.h contains minimal changes required to be compiled with zlibWrapper:
+ * - #include "zlib.h" was changed to #include "zstd_zlibwrapper.h"        
+ * - gz_statep was converted to union to work with -Wstrict-aliasing=1      */
+
+/* gzguts.h -- zlib internal header definitions for gz* operations
+ * Copyright (C) 2004, 2005, 2010, 2011, 2012, 2013, 2016 Mark Adler
+ * For conditions of distribution and use, see http://www.zlib.net/zlib_license.html
+ */
+
+#ifdef _LARGEFILE64_SOURCE
+#  ifndef _LARGEFILE_SOURCE
+#    define _LARGEFILE_SOURCE 1
+#  endif
+#  ifdef _FILE_OFFSET_BITS
+#    undef _FILE_OFFSET_BITS
+#  endif
+#endif
+
+#ifdef HAVE_HIDDEN
+#  define ZLIB_INTERNAL __attribute__((visibility ("hidden")))
+#else
+#  define ZLIB_INTERNAL
+#endif
+
+#include <stdio.h>
+#include "zstd_zlibwrapper.h"
+#include "gzcompatibility.h"
+#ifdef STDC
+#  include <string.h>
+#  include <stdlib.h>
+#  include <limits.h>
+#endif
+
+#ifndef _POSIX_SOURCE
+#  define _POSIX_SOURCE
+#endif
+#include <fcntl.h>
+
+#ifdef _WIN32
+#  include <stddef.h>
+#endif
+
+#if defined(__TURBOC__) || defined(_MSC_VER) || defined(_WIN32)
+#  include <io.h>
+#endif
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+#  define WIDECHAR
+#endif
+
+#ifdef WINAPI_FAMILY
+#  define open _open
+#  define read _read
+#  define write _write
+#  define close _close
+#endif
+
+#ifdef NO_DEFLATE       /* for compatibility with old definition */
+#  define NO_GZCOMPRESS
+#endif
+
+#if defined(STDC99) || (defined(__TURBOC__) && __TURBOC__ >= 0x550)
+#  ifndef HAVE_VSNPRINTF
+#    define HAVE_VSNPRINTF
+#  endif
+#endif
+
+#if defined(__CYGWIN__)
+#  ifndef HAVE_VSNPRINTF
+#    define HAVE_VSNPRINTF
+#  endif
+#endif
+
+#if defined(MSDOS) && defined(__BORLANDC__) && (BORLANDC > 0x410)
+#  ifndef HAVE_VSNPRINTF
+#    define HAVE_VSNPRINTF
+#  endif
+#endif
+
+#ifndef HAVE_VSNPRINTF
+#  ifdef MSDOS
+/* vsnprintf may exist on some MS-DOS compilers (DJGPP?),
+   but for now we just assume it doesn't. */
+#    define NO_vsnprintf
+#  endif
+#  ifdef __TURBOC__
+#    define NO_vsnprintf
+#  endif
+#  ifdef WIN32
+/* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */
+#    if !defined(vsnprintf) && !defined(NO_vsnprintf)
+#      if !defined(_MSC_VER) || ( defined(_MSC_VER) && _MSC_VER < 1500 )
+#         define vsnprintf _vsnprintf
+#      endif
+#    endif
+#  endif
+#  ifdef __SASC
+#    define NO_vsnprintf
+#  endif
+#  ifdef VMS
+#    define NO_vsnprintf
+#  endif
+#  ifdef __OS400__
+#    define NO_vsnprintf
+#  endif
+#  ifdef __MVS__
+#    define NO_vsnprintf
+#  endif
+#endif
+
+/* unlike snprintf (which is required in C99), _snprintf does not guarantee
+   null termination of the result -- however this is only used in gzlib.c where
+   the result is assured to fit in the space provided */
+#if defined(_MSC_VER) && _MSC_VER < 1900
+#  define snprintf _snprintf
+#endif
+
+#ifndef local
+#  define local static
+#endif
+/* since "static" is used to mean two completely different things in C, we
+   define "local" for the non-static meaning of "static", for readability
+   (compile with -Dlocal if your debugger can't find static symbols) */
+
+/* gz* functions always use library allocation functions */
+#ifndef STDC
+  extern voidp  malloc OF((uInt size));
+  extern void   free   OF((voidpf ptr));
+#endif
+
+/* get errno and strerror definition */
+#if defined UNDER_CE
+#  include <windows.h>
+#  define zstrerror() gz_strwinerror((DWORD)GetLastError())
+#else
+#  ifndef NO_STRERROR
+#    include <errno.h>
+#    define zstrerror() strerror(errno)
+#  else
+#    define zstrerror() "stdio error (consult errno)"
+#  endif
+#endif
+
+/* provide prototypes for these when building zlib without LFS */
+#if !defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0
+    ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
+    ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int));
+    ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile));
+    ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile));
+#endif
+
+/* default memLevel */
+#if MAX_MEM_LEVEL >= 8
+#  define DEF_MEM_LEVEL 8
+#else
+#  define DEF_MEM_LEVEL  MAX_MEM_LEVEL
+#endif
+
+/* default i/o buffer size -- double this for output when reading (this and
+   twice this must be able to fit in an unsigned type) */
+#define GZBUFSIZE 8192
+
+/* gzip modes, also provide a little integrity check on the passed structure */
+#define GZ_NONE 0
+#define GZ_READ 7247
+#define GZ_WRITE 31153
+#define GZ_APPEND 1     /* mode set to GZ_WRITE after the file is opened */
+
+/* values for gz_state how */
+#define LOOK 0      /* look for a gzip header */
+#define COPY 1      /* copy input directly */
+#define GZIP 2      /* decompress a gzip stream */
+
+/* internal gzip file state data structure */
+typedef struct {
+        /* exposed contents for gzgetc() macro */
+    struct gzFile_s x;      /* "x" for exposed */
+                            /* x.have: number of bytes available at x.next */
+                            /* x.next: next output data to deliver or write */
+                            /* x.pos: current position in uncompressed data */
+        /* used for both reading and writing */
+    int mode;               /* see gzip modes above */
+    int fd;                 /* file descriptor */
+    char *path;             /* path or fd for error messages */
+    unsigned size;          /* buffer size, zero if not allocated yet */
+    unsigned want;          /* requested buffer size, default is GZBUFSIZE */
+    unsigned char *in;      /* input buffer (double-sized when writing) */
+    unsigned char *out;     /* output buffer (double-sized when reading) */
+    int direct;             /* 0 if processing gzip, 1 if transparent */
+        /* just for reading */
+    int how;                /* 0: get header, 1: copy, 2: decompress */
+    z_off64_t start;        /* where the gzip data started, for rewinding */
+    int eof;                /* true if end of input file reached */
+    int past;               /* true if read requested past end */
+        /* just for writing */
+    int level;              /* compression level */
+    int strategy;           /* compression strategy */
+        /* seek request */
+    z_off64_t skip;         /* amount to skip (already rewound if backwards) */
+    int seek;               /* true if seek request pending */
+        /* error information */
+    int err;                /* error code */
+    char *msg;              /* error message */
+        /* zlib inflate or deflate stream */
+    z_stream strm;          /* stream structure in-place (not a pointer) */
+} gz_state;
+
+typedef union {
+    gz_state FAR *state;
+    gzFile file;
+} gz_statep;
+
+/* shared functions */
+void ZLIB_INTERNAL gz_error OF((gz_statep, int, const char *));
+#if defined UNDER_CE
+char ZLIB_INTERNAL *gz_strwinerror OF((DWORD error));
+#endif
+
+/* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t
+   value -- needed when comparing unsigned to z_off64_t, which is signed
+   (possible z_off64_t types off_t, off64_t, and long are all signed) */
+#ifdef INT_MAX
+#  define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > INT_MAX)
+#else
+unsigned ZLIB_INTERNAL gz_intmax OF((void));
+#  define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > gz_intmax())
+#endif
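
The gz_statep union above is this header's one substantive change: upstream zlib casts gzFile to an internal pointer type, which trips -Wstrict-aliasing, while the union expresses the same type pun explicitly. A stand-alone sketch of the pattern (hypothetical names, not wrapper code):

    #include <stdlib.h>

    struct handle { int exposed; };                  /* public view  */
    struct impl   { struct handle x; int priv; };    /* private view */
    typedef union { struct impl *state; struct handle *file; } handlep;

    int main(void) {
        handlep h;
        h.state = (struct impl *)malloc(sizeof(struct impl));
        if (h.state == NULL) return 1;
        h.state->priv = 42;     /* private access */
        h.file->exposed = 7;    /* same object, public view: x comes first */
        free(h.state);
        return 0;
    }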
diff --git a/zstd/zlibWrapper/gzlib.c b/zstd/zlibWrapper/gzlib.c
new file mode 100644
index 0000000..aa94206
--- /dev/null
+++ b/zstd/zlibWrapper/gzlib.c
@@ -0,0 +1,640 @@
+/* gzlib.c contains minimal changes required to be compiled with zlibWrapper:
+ * - gz_statep was converted to union to work with -Wstrict-aliasing=1      */ 
+
+/* gzlib.c -- zlib functions common to reading and writing gzip files
+ * Copyright (C) 2004-2017 Mark Adler
+ * For conditions of distribution and use, see http://www.zlib.net/zlib_license.html
+ */
+
+#include "gzguts.h"
+
+#if defined(_WIN32) && !defined(__BORLANDC__) && !defined(__MINGW32__)
+#  define LSEEK _lseeki64
+#else
+#if defined(_LARGEFILE64_SOURCE) && _LFS64_LARGEFILE-0
+#  define LSEEK lseek64
+#else
+#  define LSEEK lseek
+#endif
+#endif
+
+/* Local functions */
+local void gz_reset OF((gz_statep));
+local gzFile gz_open OF((const void *, int, const char *));
+
+#if defined UNDER_CE
+
+/* Map the Windows error number in ERROR to a locale-dependent error message
+   string and return a pointer to it.  Typically, the values for ERROR come
+   from GetLastError.
+
+   The string pointed to shall not be modified by the application, but may be
+   overwritten by a subsequent call to gz_strwinerror.
+
+   The gz_strwinerror function does not change the current setting of
+   GetLastError. */
+char ZLIB_INTERNAL *gz_strwinerror (error)
+     DWORD error;
+{
+    static char buf[1024];
+
+    wchar_t *msgbuf;
+    DWORD lasterr = GetLastError();
+    DWORD chars = FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM
+        | FORMAT_MESSAGE_ALLOCATE_BUFFER,
+        NULL,
+        error,
+        0, /* Default language */
+        (LPVOID)&msgbuf,
+        0,
+        NULL);
+    if (chars != 0) {
+        /* If there is an \r\n appended, zap it.  */
+        if (chars >= 2
+            && msgbuf[chars - 2] == '\r' && msgbuf[chars - 1] == '\n') {
+            chars -= 2;
+            msgbuf[chars] = 0;
+        }
+
+        if (chars > sizeof (buf) - 1) {
+            chars = sizeof (buf) - 1;
+            msgbuf[chars] = 0;
+        }
+
+        wcstombs(buf, msgbuf, chars + 1);
+        LocalFree(msgbuf);
+    }
+    else {
+        sprintf(buf, "unknown win32 error (%ld)", error);
+    }
+
+    SetLastError(lasterr);
+    return buf;
+}
+
+#endif /* UNDER_CE */
+
+/* Reset gzip file state */
+local void gz_reset(state)
+    gz_statep state;
+{
+    state.state->x.have = 0;              /* no output data available */
+    if (state.state->mode == GZ_READ) {   /* for reading ... */
+        state.state->eof = 0;             /* not at end of file */
+        state.state->past = 0;            /* have not read past end yet */
+        state.state->how = LOOK;          /* look for gzip header */
+    }
+    state.state->seek = 0;                /* no seek request pending */
+    gz_error(state, Z_OK, NULL);    /* clear error */
+    state.state->x.pos = 0;               /* no uncompressed data yet */
+    state.state->strm.avail_in = 0;       /* no input data yet */
+}
+
+/* Open a gzip file either by name or file descriptor. */
+local gzFile gz_open(path, fd, mode)
+    const void *path;
+    int fd;
+    const char *mode;
+{
+    gz_statep state;
+    z_size_t len;
+    int oflag;
+#ifdef O_CLOEXEC
+    int cloexec = 0;
+#endif
+#ifdef O_EXCL
+    int exclusive = 0;
+#endif
+
+    /* check input */
+    if (path == NULL)
+        return NULL;
+
+    /* allocate gzFile structure to return */
+    state = (gz_statep)(gz_state*)malloc(sizeof(gz_state));
+    if (state.state == NULL)
+        return NULL;
+    state.state->size = 0;            /* no buffers allocated yet */
+    state.state->want = GZBUFSIZE;    /* requested buffer size */
+    state.state->msg = NULL;          /* no error message yet */
+
+    /* interpret mode */
+    state.state->mode = GZ_NONE;
+    state.state->level = Z_DEFAULT_COMPRESSION;
+    state.state->strategy = Z_DEFAULT_STRATEGY;
+    state.state->direct = 0;
+    while (*mode) {
+        if (*mode >= '0' && *mode <= '9')
+            state.state->level = *mode - '0';
+        else
+            switch (*mode) {
+            case 'r':
+                state.state->mode = GZ_READ;
+                break;
+#ifndef NO_GZCOMPRESS
+            case 'w':
+                state.state->mode = GZ_WRITE;
+                break;
+            case 'a':
+                state.state->mode = GZ_APPEND;
+                break;
+#endif
+            case '+':       /* can't read and write at the same time */
+                free(state.state);
+                return NULL;
+            case 'b':       /* ignore -- will request binary anyway */
+                break;
+#ifdef O_CLOEXEC
+            case 'e':
+                cloexec = 1;
+                break;
+#endif
+#ifdef O_EXCL
+            case 'x':
+                exclusive = 1;
+                break;
+#endif
+            case 'f':
+                state.state->strategy = Z_FILTERED;
+                break;
+            case 'h':
+                state.state->strategy = Z_HUFFMAN_ONLY;
+                break;
+            case 'R':
+                state.state->strategy = Z_RLE;
+                break;
+            case 'F':
+                state.state->strategy = Z_FIXED;
+                break;
+            case 'T':
+                state.state->direct = 1;
+                break;
+            default:        /* could consider as an error, but just ignore */
+                ;
+            }
+        mode++;
+    }
+
+    /* must provide an "r", "w", or "a" */
+    if (state.state->mode == GZ_NONE) {
+        free(state.state);
+        return NULL;
+    }
+
+    /* can't force transparent read */
+    if (state.state->mode == GZ_READ) {
+        if (state.state->direct) {
+            free(state.state);
+            return NULL;
+        }
+        state.state->direct = 1;      /* for empty file */
+    }
+
+    /* save the path name for error messages */
+#ifdef WIDECHAR
+    if (fd == -2) {
+        len = wcstombs(NULL, path, 0);
+        if (len == (z_size_t)-1)
+            len = 0;
+    }
+    else
+#endif
+        len = strlen((const char *)path);
+    state.state->path = (char *)malloc(len + 1);
+    if (state.state->path == NULL) {
+        free(state.state);
+        return NULL;
+    }
+#ifdef WIDECHAR
+    if (fd == -2)
+        if (len)
+            wcstombs(state.state->path, path, len + 1);
+        else
+            *(state.state->path) = 0;
+    else
+#endif
+#if !defined(NO_snprintf) && !defined(NO_vsnprintf)
+        (void)snprintf(state.state->path, len + 1, "%s", (const char *)path);
+#else
+        strcpy(state.state->path, path);
+#endif
+
+    /* compute the flags for open() */
+    oflag =
+#ifdef O_LARGEFILE
+        O_LARGEFILE |
+#endif
+#ifdef O_BINARY
+        O_BINARY |
+#endif
+#ifdef O_CLOEXEC
+        (cloexec ? O_CLOEXEC : 0) |
+#endif
+        (state.state->mode == GZ_READ ?
+         O_RDONLY :
+         (O_WRONLY | O_CREAT |
+#ifdef O_EXCL
+          (exclusive ? O_EXCL : 0) |
+#endif
+          (state.state->mode == GZ_WRITE ?
+           O_TRUNC :
+           O_APPEND)));
+
+    /* open the file with the appropriate flags (or just use fd) */
+    state.state->fd = fd > -1 ? fd : (
+#ifdef WIDECHAR
+        fd == -2 ? _wopen(path, oflag, 0666) :
+#endif
+        open((const char *)path, oflag, 0666));
+    if (state.state->fd == -1) {
+        free(state.state->path);
+        free(state.state);
+        return NULL;
+    }
+    if (state.state->mode == GZ_APPEND) {
+        LSEEK(state.state->fd, 0, SEEK_END);  /* so gzoffset() is correct */
+        state.state->mode = GZ_WRITE;         /* simplify later checks */
+    }
+
+    /* save the current position for rewinding (only if reading) */
+    if (state.state->mode == GZ_READ) {
+        state.state->start = LSEEK(state.state->fd, 0, SEEK_CUR);
+        if (state.state->start == -1) state.state->start = 0;
+    }
+
+    /* initialize stream */
+    gz_reset(state);
+
+    /* return stream */
+    return (gzFile)state.file;
+}
+
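+/* Illustrative mode strings for gz_open() (a hedged summary of the switch
+   above, not zlib documentation): "rb" reads, "wb9" writes at level 9,
+   "a" appends (mode becomes GZ_WRITE once the file is open), 'T' requests
+   transparent (uncompressed) writing, and 'f'/'h'/'R'/'F' select the
+   Z_FILTERED, Z_HUFFMAN_ONLY, Z_RLE, and Z_FIXED strategies, e.g.:
+
+       gzFile f = gzopen("out.gz", "wb9F");
+*/
+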
+/* -- see zlib.h -- */
+gzFile ZEXPORT gzopen(path, mode)
+    const char *path;
+    const char *mode;
+{
+    return gz_open(path, -1, mode);
+}
+
+/* -- see zlib.h -- */
+gzFile ZEXPORT gzopen64(path, mode)
+    const char *path;
+    const char *mode;
+{
+    return gz_open(path, -1, mode);
+}
+
+/* -- see zlib.h -- */
+gzFile ZEXPORT gzdopen(fd, mode)
+    int fd;
+    const char *mode;
+{
+    char *path;         /* identifier for error messages */
+    gzFile gz;
+
+    if (fd == -1 || (path = (char *)malloc(7 + 3 * sizeof(int))) == NULL)
+        return NULL;
+#if !defined(NO_snprintf) && !defined(NO_vsnprintf)
+    (void)snprintf(path, 7 + 3 * sizeof(int), "<fd:%d>", fd);
+#else
+    sprintf(path, "<fd:%d>", fd);   /* for debugging */
+#endif
+    gz = gz_open(path, fd, mode);
+    free(path);
+    return gz;
+}
+
+/* -- see zlib.h -- */
+#ifdef WIDECHAR
+gzFile ZEXPORT gzopen_w(path, mode)
+    const wchar_t *path;
+    const char *mode;
+{
+    return gz_open(path, -2, mode);
+}
+#endif
+
+/* -- see zlib.h -- */
+int ZEXPORT gzbuffer(file, size)
+    gzFile file;
+    unsigned size;
+{
+    gz_statep state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+    if (state.state->mode != GZ_READ && state.state->mode != GZ_WRITE)
+        return -1;
+
+    /* make sure we haven't already allocated memory */
+    if (state.state->size != 0)
+        return -1;
+
+    /* check and set requested size */
+    if ((size << 1) < size)
+        return -1;              /* need to be able to double it */
+    if (size < 2)
+        size = 2;               /* need two bytes to check magic header */
+    state.state->want = size;
+    return 0;
+}
+
+/* -- see zlib.h -- */
+int ZEXPORT gzrewind(file)
+    gzFile file;
+{
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+
+    /* check that we're reading and that there's no error */
+    if (state.state->mode != GZ_READ ||
+            (state.state->err != Z_OK && state.state->err != Z_BUF_ERROR))
+        return -1;
+
+    /* back up and start over */
+    if (LSEEK(state.state->fd, state.state->start, SEEK_SET) == -1)
+        return -1;
+    gz_reset(state);
+    return 0;
+}
+
+/* -- see zlib.h -- */
+z_off64_t ZEXPORT gzseek64(file, offset, whence)
+    gzFile file;
+    z_off64_t offset;
+    int whence;
+{
+    unsigned n;
+    z_off64_t ret;
+    gz_statep state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+    if (state.state->mode != GZ_READ && state.state->mode != GZ_WRITE)
+        return -1;
+
+    /* check that there's no error */
+    if (state.state->err != Z_OK && state.state->err != Z_BUF_ERROR)
+        return -1;
+
+    /* can only seek from start or relative to current position */
+    if (whence != SEEK_SET && whence != SEEK_CUR)
+        return -1;
+
+    /* normalize offset to a SEEK_CUR specification */
+    if (whence == SEEK_SET)
+        offset -= state.state->x.pos;
+    else if (state.state->seek)
+        offset += state.state->skip;
+    state.state->seek = 0;
+
+    /* if within raw area while reading, just go there */
+    if (state.state->mode == GZ_READ && state.state->how == COPY &&
+            state.state->x.pos + offset >= 0) {
+        ret = LSEEK(state.state->fd, offset - state.state->x.have, SEEK_CUR);
+        if (ret == -1)
+            return -1;
+        state.state->x.have = 0;
+        state.state->eof = 0;
+        state.state->past = 0;
+        state.state->seek = 0;
+        gz_error(state, Z_OK, NULL);
+        state.state->strm.avail_in = 0;
+        state.state->x.pos += offset;
+        return state.state->x.pos;
+    }
+
+    /* calculate skip amount, rewinding if needed for back seek when reading */
+    if (offset < 0) {
+        if (state.state->mode != GZ_READ)         /* writing -- can't go backwards */
+            return -1;
+        offset += state.state->x.pos;
+        if (offset < 0)                     /* before start of file! */
+            return -1;
+        if (gzrewind(file) == -1)           /* rewind, then skip to offset */
+            return -1;
+    }
+
+    /* if reading, skip what's in output buffer (one less gzgetc() check) */
+    if (state.state->mode == GZ_READ) {
+        n = GT_OFF(state.state->x.have) || (z_off64_t)state.state->x.have > offset ?
+            (unsigned)offset : state.state->x.have;
+        state.state->x.have -= n;
+        state.state->x.next += n;
+        state.state->x.pos += n;
+        offset -= n;
+    }
+
+    /* request skip (if not zero) */
+    if (offset) {
+        state.state->seek = 1;
+        state.state->skip = offset;
+    }
+    return state.state->x.pos + offset;
+}
+
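+/* Worked example of the normalization above (illustrative numbers): with
+   x.pos == 100, gzseek64(file, 40, SEEK_SET) yields offset = 40 - 100 = -60;
+   compressed input cannot be seeked backwards directly, so the code rewinds
+   to the start of the gzip data and records a forward skip of 40 bytes,
+   which the next read satisfies lazily via gz_skip(). */
+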
+/* -- see zlib.h -- */
+z_off_t ZEXPORT gzseek(file, offset, whence)
+    gzFile file;
+    z_off_t offset;
+    int whence;
+{
+    z_off64_t ret;
+
+    ret = gzseek64(file, (z_off64_t)offset, whence);
+    return ret == (z_off_t)ret ? (z_off_t)ret : -1;
+}
+
+/* -- see zlib.h -- */
+z_off64_t ZEXPORT gztell64(file)
+    gzFile file;
+{
+    gz_statep state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+    if (state.state->mode != GZ_READ && state.state->mode != GZ_WRITE)
+        return -1;
+
+    /* return position */
+    return state.state->x.pos + (state.state->seek ? state.state->skip : 0);
+}
+
+/* -- see zlib.h -- */
+z_off_t ZEXPORT gztell(file)
+    gzFile file;
+{
+    z_off64_t ret;
+
+    ret = gztell64(file);
+    return ret == (z_off_t)ret ? (z_off_t)ret : -1;
+}
+
+/* -- see zlib.h -- */
+z_off64_t ZEXPORT gzoffset64(file)
+    gzFile file;
+{
+    z_off64_t offset;
+    gz_statep state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+    if (state.state->mode != GZ_READ && state.state->mode != GZ_WRITE)
+        return -1;
+
+    /* compute and return effective offset in file */
+    offset = LSEEK(state.state->fd, 0, SEEK_CUR);
+    if (offset == -1)
+        return -1;
+    if (state.state->mode == GZ_READ)             /* reading */
+        offset -= state.state->strm.avail_in;     /* don't count buffered input */
+    return offset;
+}
+
+/* -- see zlib.h -- */
+z_off_t ZEXPORT gzoffset(file)
+    gzFile file;
+{
+    z_off64_t ret;
+
+    ret = gzoffset64(file);
+    return ret == (z_off_t)ret ? (z_off_t)ret : -1;
+}
+
+/* -- see zlib.h -- */
+int ZEXPORT gzeof(file)
+    gzFile file;
+{
+    gz_statep state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return 0;
+    state = (gz_statep)file;
+    if (state.state->mode != GZ_READ && state.state->mode != GZ_WRITE)
+        return 0;
+
+    /* return end-of-file state */
+    return state.state->mode == GZ_READ ? state.state->past : 0;
+}
+
+/* -- see zlib.h -- */
+const char * ZEXPORT gzerror(file, errnum)
+    gzFile file;
+    int *errnum;
+{
+    gz_statep state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return NULL;
+    state = (gz_statep)file;
+    if (state.state->mode != GZ_READ && state.state->mode != GZ_WRITE)
+        return NULL;
+
+    /* return error information */
+    if (errnum != NULL)
+        *errnum = state.state->err;
+    return state.state->err == Z_MEM_ERROR ? "out of memory" :
+                                       (state.state->msg == NULL ? "" : state.state->msg);
+}
+
+/* -- see zlib.h -- */
+void ZEXPORT gzclearerr(file)
+    gzFile file;
+{
+    gz_statep state;
+
+    /* get internal structure and check integrity */
+    if (file == NULL)
+        return;
+    state = (gz_statep)file;
+    if (state.state->mode != GZ_READ && state.state->mode != GZ_WRITE)
+        return;
+
+    /* clear error and end-of-file */
+    if (state.state->mode == GZ_READ) {
+        state.state->eof = 0;
+        state.state->past = 0;
+    }
+    gz_error(state, Z_OK, NULL);
+}
+
+/* Create an error message in allocated memory and set state.state->err and
+   state.state->msg accordingly.  Free any previous error message already there.  Do
+   not try to free or allocate space if the error is Z_MEM_ERROR (out of
+   memory).  Simply save the error message as a static string.  If there is an
+   allocation failure constructing the error message, then convert the error to
+   out of memory. */
+void ZLIB_INTERNAL gz_error(state, err, msg)
+    gz_statep state;
+    int err;
+    const char *msg;
+{
+    /* free previously allocated message and clear */
+    if (state.state->msg != NULL) {
+        if (state.state->err != Z_MEM_ERROR)
+            free(state.state->msg);
+        state.state->msg = NULL;
+    }
+
+    /* if fatal, set state.state->x.have to 0 so that the gzgetc() macro fails */
+    if (err != Z_OK && err != Z_BUF_ERROR)
+        state.state->x.have = 0;
+
+    /* set error code, and if no message, then done */
+    state.state->err = err;
+    if (msg == NULL)
+        return;
+
+    /* for an out of memory error, return literal string when requested */
+    if (err == Z_MEM_ERROR)
+        return;
+
+    /* construct error message with path */
+    if ((state.state->msg = (char *)malloc(strlen(state.state->path) + strlen(msg) + 3)) ==
+            NULL) {
+        state.state->err = Z_MEM_ERROR;
+        return;
+    }
+#if !defined(NO_snprintf) && !defined(NO_vsnprintf)
+    (void)snprintf(state.state->msg, strlen(state.state->path) + strlen(msg) + 3,
+                   "%s%s%s", state.state->path, ": ", msg);
+#else
+    strcpy(state.state->msg, state.state->path);
+    strcat(state.state->msg, ": ");
+    strcat(state.state->msg, msg);
+#endif
+}
+
+#ifndef INT_MAX
+/* portably return maximum value for an int (when limits.h presumed not
+   available) -- we need to do this to cover cases where 2's complement not
+   used, since C standard permits 1's complement and sign-bit representations,
+   otherwise we could just use ((unsigned)-1) >> 1 */
+unsigned ZLIB_INTERNAL gz_intmax()
+{
+    unsigned p, q;
+
+    p = 1;
+    do {
+        q = p;
+        p <<= 1;
+        p++;
+    } while (p > q);
+    return q >> 1;
+}
+#endif
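
gz_intmax() above derives INT_MAX without limits.h by repeatedly doubling p (plus one) until unsigned wraparound stops it from growing. A worked check (standard C, printing UINT_MAX >> 1, which equals INT_MAX on two's-complement targets):

    #include <stdio.h>

    int main(void) {
        unsigned p = 1, q;
        do {            /* p runs 1, 3, 7, ... all-ones, then wraps */
            q = p;
            p <<= 1;
            p++;
        } while (p > q);
        printf("%u\n", q >> 1);   /* e.g. 2147483647 for 32-bit unsigned */
        return 0;
    }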
diff --git a/zstd/zlibWrapper/gzlib.o b/zstd/zlibWrapper/gzlib.o
new file mode 100644
index 0000000..9e45356
Binary files /dev/null and b/zstd/zlibWrapper/gzlib.o differ
diff --git a/zstd/zlibWrapper/gzread.c b/zstd/zlibWrapper/gzread.c
new file mode 100644
index 0000000..d37aaa1
--- /dev/null
+++ b/zstd/zlibWrapper/gzread.c
@@ -0,0 +1,670 @@
+/* gzread.c contains minimal changes required to be compiled with zlibWrapper:
+ * - gz_statep was converted to union to work with -Wstrict-aliasing=1      */
+
+/* gzread.c -- zlib functions for reading gzip files
+ * Copyright (C) 2004, 2005, 2010, 2011, 2012, 2013, 2016 Mark Adler
+ * For conditions of distribution and use, see http://www.zlib.net/zlib_license.html
+ */
+
+#include "gzguts.h"
+
+/* Local functions */
+local int gz_load OF((gz_statep, unsigned char *, unsigned, unsigned *));
+local int gz_avail OF((gz_statep));
+local int gz_look OF((gz_statep));
+local int gz_decomp OF((gz_statep));
+local int gz_fetch OF((gz_statep));
+local int gz_skip OF((gz_statep, z_off64_t));
+local z_size_t gz_read OF((gz_statep, voidp, z_size_t));
+
+/* Use read() to load a buffer -- return -1 on error, otherwise 0.  Read from
+   state.state->fd, and update state.state->eof, state.state->err, and state.state->msg as appropriate.
+   This function needs to loop on read(), since read() is not guaranteed to
+   read the number of bytes requested, depending on the type of descriptor. */
+local int gz_load(state, buf, len, have)
+    gz_statep state;
+    unsigned char *buf;
+    unsigned len;
+    unsigned *have;
+{
+    ssize_t ret;
+    unsigned get, max = ((unsigned)-1 >> 2) + 1;
+
+    *have = 0;
+    do {
+        get = len - *have;
+        if (get > max)
+            get = max;
+        ret = read(state.state->fd, buf + *have, get);
+        if (ret <= 0)
+            break;
+        *have += (unsigned)ret;
+    } while (*have < len);
+    if (ret < 0) {
+        gz_error(state, Z_ERRNO, zstrerror());
+        return -1;
+    }
+    if (ret == 0)
+        state.state->eof = 1;
+    return 0;
+}
+
+/* Load up input buffer and set eof flag if last data loaded -- return -1 on
+   error, 0 otherwise.  Note that the eof flag is set when the end of the input
+   file is reached, even though there may be unused data in the buffer.  Once
+   that data has been used, no more attempts will be made to read the file.
+   If strm->avail_in != 0, then the current data is moved to the beginning of
+   the input buffer, and then the remainder of the buffer is loaded with the
+   available data from the input file. */
+local int gz_avail(state)
+    gz_statep state;
+{
+    unsigned got;
+    z_streamp strm = &(state.state->strm);
+
+    if (state.state->err != Z_OK && state.state->err != Z_BUF_ERROR)
+        return -1;
+    if (state.state->eof == 0) {
+        if (strm->avail_in) {       /* copy what's there to the start */
+            unsigned char *p = state.state->in;
+            unsigned const char *q = strm->next_in;
+            unsigned n = strm->avail_in;
+            do {
+                *p++ = *q++;
+            } while (--n);
+        }
+        if (gz_load(state, state.state->in + strm->avail_in,
+                    state.state->size - strm->avail_in, &got) == -1)
+            return -1;
+        strm->avail_in += got;
+        strm->next_in = state.state->in;
+    }
+    return 0;
+}
+
+/* Look for gzip header, set up for inflate or copy.  state.state->x.have must be 0.
+   If this is the first time in, allocate required memory.  state.state->how will be
+   left unchanged if there is no more input data available, will be set to COPY
+   if there is no gzip header and direct copying will be performed, or it will
+   be set to GZIP for decompression.  If direct copying, then leftover input
+   data from the input buffer will be copied to the output buffer.  In that
+   case, all further file reads will be directly to either the output buffer or
+   a user buffer.  If decompressing, the inflate state will be initialized.
+   gz_look() will return 0 on success or -1 on failure. */
+local int gz_look(state)
+    gz_statep state;
+{
+    z_streamp strm = &(state.state->strm);
+
+    /* allocate read buffers and inflate memory */
+    if (state.state->size == 0) {
+        /* allocate buffers */
+        state.state->in = (unsigned char *)malloc(state.state->want);
+        state.state->out = (unsigned char *)malloc(state.state->want << 1);
+        if (state.state->in == NULL || state.state->out == NULL) {
+            free(state.state->out);
+            free(state.state->in);
+            gz_error(state, Z_MEM_ERROR, "out of memory");
+            return -1;
+        }
+        state.state->size = state.state->want;
+
+        /* allocate inflate memory */
+        state.state->strm.zalloc = Z_NULL;
+        state.state->strm.zfree = Z_NULL;
+        state.state->strm.opaque = Z_NULL;
+        state.state->strm.avail_in = 0;
+        state.state->strm.next_in = Z_NULL;
+        if (inflateInit2(&(state.state->strm), 15 + 16) != Z_OK) {    /* gunzip */
+            free(state.state->out);
+            free(state.state->in);
+            state.state->size = 0;
+            gz_error(state, Z_MEM_ERROR, "out of memory");
+            return -1;
+        }
+    }
+
+    /* get at least the magic bytes in the input buffer */
+    if (strm->avail_in < 2) {
+        if (gz_avail(state) == -1)
+            return -1;
+        if (strm->avail_in == 0)
+            return 0;
+    }
+
+    /* look for gzip or zstd magic bytes -- if there, do wrapped decoding
+       (note: there is a logical dilemma here when considering the case of a
+       partially written gzip file, to wit, if only a single byte with value
+       31 is written, then we cannot tell whether this is a single-byte file,
+       or just a partially written gzip file -- for here we assume that if a
+       gzip file is being written, then the header will be written in a
+       single operation, so that reading a single byte is sufficient
+       indication that it is not a gzip file) */
+    if (strm->avail_in > 1 &&
+            ((strm->next_in[0] == 31 && strm->next_in[1] == 139) /* gz header */
+            || (strm->next_in[0] == 40 && strm->next_in[1] == 181))) { /* zstd header */
+        inflateReset(strm);
+        state.state->how = GZIP;
+        state.state->direct = 0;
+        return 0;
+    }
+
+    /* no gzip header -- if we were decoding gzip before, then this is trailing
+       garbage.  Ignore the trailing garbage and finish. */
+    if (state.state->direct == 0) {
+        strm->avail_in = 0;
+        state.state->eof = 1;
+        state.state->x.have = 0;
+        return 0;
+    }
+
+    /* doing raw i/o, copy any leftover input to output -- this assumes that
+       the output buffer is larger than the input buffer, which also assures
+       space for gzungetc() */
+    state.state->x.next = state.state->out;
+    if (strm->avail_in) {
+        memcpy(state.state->x.next, strm->next_in, strm->avail_in);
+        state.state->x.have = strm->avail_in;
+        strm->avail_in = 0;
+    }
+    state.state->how = COPY;
+    state.state->direct = 1;
+    return 0;
+}
+
+/* Decompress from input to the provided next_out and avail_out in the state.
+   On return, state.state->x.have and state.state->x.next point to the just decompressed
+   data.  If the gzip stream completes, state.state->how is reset to LOOK to look for
+   the next gzip stream or raw data, once state.state->x.have is depleted.  Returns 0
+   on success, -1 on failure. */
+local int gz_decomp(state)
+    gz_statep state;
+{
+    int ret = Z_OK;
+    unsigned had;
+    z_streamp strm = &(state.state->strm);
+
+    /* fill output buffer up to end of deflate stream */
+    had = strm->avail_out;
+    do {
+        /* get more input for inflate() */
+        if (strm->avail_in == 0 && gz_avail(state) == -1)
+            return -1;
+        if (strm->avail_in == 0) {
+            gz_error(state, Z_BUF_ERROR, "unexpected end of file");
+            break;
+        }
+
+        /* decompress and handle errors */
+        ret = inflate(strm, Z_NO_FLUSH);
+        if (ret == Z_STREAM_ERROR || ret == Z_NEED_DICT) {
+            gz_error(state, Z_STREAM_ERROR,
+                     "internal error: inflate stream corrupt");
+            return -1;
+        }
+        if (ret == Z_MEM_ERROR) {
+            gz_error(state, Z_MEM_ERROR, "out of memory");
+            return -1;
+        }
+        if (ret == Z_DATA_ERROR) {              /* deflate stream invalid */
+            gz_error(state, Z_DATA_ERROR,
+                     strm->msg == NULL ? "compressed data error" : strm->msg);
+            return -1;
+        }
+    } while (strm->avail_out && ret != Z_STREAM_END);
+
+    /* update available output */
+    state.state->x.have = had - strm->avail_out;
+    state.state->x.next = strm->next_out - state.state->x.have;
+
+    /* if the gzip stream completed successfully, look for another */
+    if (ret == Z_STREAM_END)
+        state.state->how = LOOK;
+
+    /* good decompression */
+    return 0;
+}
+
+/* Fetch data and put it in the output buffer.  Assumes state.state->x.have is 0.
+   Data is either copied from the input file or decompressed from the input
+   file depending on state.state->how.  If state.state->how is LOOK, then a gzip header is
+   looked for to determine whether to copy or decompress.  Returns -1 on error,
+   otherwise 0.  gz_fetch() will leave state.state->how as COPY or GZIP unless the
+   end of the input file has been reached and all data has been processed.  */
+local int gz_fetch(state)
+    gz_statep state;
+{
+    z_streamp strm = &(state.state->strm);
+
+    do {
+        switch(state.state->how) {
+        case LOOK:      /* -> LOOK, COPY (only if never GZIP), or GZIP */
+            if (gz_look(state) == -1)
+                return -1;
+            if (state.state->how == LOOK)
+                return 0;
+            break;
+        case COPY:      /* -> COPY */
+            if (gz_load(state, state.state->out, state.state->size << 1, &(state.state->x.have))
+                    == -1)
+                return -1;
+            state.state->x.next = state.state->out;
+            return 0;
+        case GZIP:      /* -> GZIP or LOOK (if end of gzip stream) */
+            strm->avail_out = state.state->size << 1;
+            strm->next_out = state.state->out;
+            if (gz_decomp(state) == -1)
+                return -1;
+        }
+    } while (state.state->x.have == 0 && (!state.state->eof || strm->avail_in));
+    return 0;
+}
+
+/* Skip len uncompressed bytes of output.  Return -1 on error, 0 on success. */
+local int gz_skip(state, len)
+    gz_statep state;
+    z_off64_t len;
+{
+    unsigned n;
+
+    /* skip over len bytes or reach end-of-file, whichever comes first */
+    while (len)
+        /* skip over whatever is in output buffer */
+        if (state.state->x.have) {
+            n = GT_OFF(state.state->x.have) || (z_off64_t)state.state->x.have > len ?
+                (unsigned)len : state.state->x.have;
+            state.state->x.have -= n;
+            state.state->x.next += n;
+            state.state->x.pos += n;
+            len -= n;
+        }
+
+        /* output buffer empty -- return if we're at the end of the input */
+        else if (state.state->eof && state.state->strm.avail_in == 0)
+            break;
+
+        /* need more data to skip -- load up output buffer */
+        else {
+            /* get more output, looking for header if required */
+            if (gz_fetch(state) == -1)
+                return -1;
+        }
+    return 0;
+}
+
+/* Read len bytes into buf from file, or less than len up to the end of the
+   input.  Return the number of bytes read.  If zero is returned, either the
+   end of file was reached, or there was an error.  state.state->err must be
+   consulted in that case to determine which. */
+local z_size_t gz_read(state, buf, len)
+    gz_statep state;
+    voidp buf;
+    z_size_t len;
+{
+    z_size_t got;
+    unsigned n;
+
+    /* if len is zero, avoid unnecessary operations */
+    if (len == 0)
+        return 0;
+
+    /* process a skip request */
+    if (state.state->seek) {
+        state.state->seek = 0;
+        if (gz_skip(state, state.state->skip) == -1)
+            return 0;
+    }
+
+    /* get len bytes to buf, or less than len if at the end */
+    got = 0;
+    do {
+        /* set n to the maximum amount of len that fits in an unsigned int */
+        n = -1;
+        if (n > len)
+            n = (unsigned)len;
+
+        /* first just try copying data from the output buffer */
+        if (state.state->x.have) {
+            if (state.state->x.have < n)
+                n = state.state->x.have;
+            memcpy(buf, state.state->x.next, n);
+            state.state->x.next += n;
+            state.state->x.have -= n;
+        }
+
+        /* output buffer empty -- return if we're at the end of the input */
+        else if (state.state->eof && state.state->strm.avail_in == 0) {
+            state.state->past = 1;        /* tried to read past end */
+            break;
+        }
+
+        /* need output data -- for small len or new stream load up our output
+           buffer */
+        else if (state.state->how == LOOK || n < (state.state->size << 1)) {
+            /* get more output, looking for header if required */
+            if (gz_fetch(state) == -1)
+                return 0;
+            continue;       /* no progress yet -- go back to copy above */
+            /* the copy above assures that we will leave with space in the
+               output buffer, allowing at least one gzungetc() to succeed */
+        }
+
+        /* large len -- read directly into user buffer */
+        else if (state.state->how == COPY) {      /* read directly */
+            if (gz_load(state, (unsigned char *)buf, n, &n) == -1)
+                return 0;
+        }
+
+        /* large len -- decompress directly into user buffer */
+        else {  /* state.state->how == GZIP */
+            state.state->strm.avail_out = n;
+            state.state->strm.next_out = (unsigned char *)buf;
+            if (gz_decomp(state) == -1)
+                return 0;
+            n = state.state->x.have;
+            state.state->x.have = 0;
+        }
+
+        /* update progress */
+        len -= n;
+        buf = (char *)buf + n;
+        got += n;
+        state.state->x.pos += n;
+    } while (len);
+
+    /* return number of bytes read into user buffer */
+    return got;
+}
+
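+/* Note on the clamp inside gz_read() above: "n = -1" relies on unsigned
+   wraparound to set n to UINT_MAX, so the following "if (n > len)" computes
+   n = min(len, UINT_MAX) without needing limits.h. The same idiom in
+   isolation:
+
+       z_size_t len;               // may exceed UINT_MAX on 64-bit builds
+       unsigned n = (unsigned)-1;  // UINT_MAX
+       if (n > len) n = (unsigned)len;
+*/
+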
+/* -- see zlib.h -- */
+int ZEXPORT gzread(file, buf, len)
+    gzFile file;
+    voidp buf;
+    unsigned len;
+{
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+
+    /* check that we're reading and that there's no (serious) error */
+    if (state.state->mode != GZ_READ ||
+            (state.state->err != Z_OK && state.state->err != Z_BUF_ERROR))
+        return -1;
+
+    /* since an int is returned, make sure len fits in one, otherwise return
+       with an error (this avoids a flaw in the interface) */
+    if ((int)len < 0) {
+        gz_error(state, Z_STREAM_ERROR, "request does not fit in an int");
+        return -1;
+    }
+
+    /* read len or fewer bytes to buf */
+    len = (unsigned)gz_read(state, buf, len);
+
+    /* check for an error */
+    if (len == 0 && state.state->err != Z_OK && state.state->err != Z_BUF_ERROR)
+        return -1;
+
+    /* return the number of bytes read (this is assured to fit in an int) */
+    return (int)len;
+}
+
+/* -- see zlib.h -- */
+z_size_t ZEXPORT gzfread(buf, size, nitems, file)
+    voidp buf;
+    z_size_t size;
+    z_size_t nitems;
+    gzFile file;
+{
+    z_size_t len;
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return 0;
+    state = (gz_statep)file;
+
+    /* check that we're reading and that there's no (serious) error */
+    if (state.state->mode != GZ_READ ||
+            (state.state->err != Z_OK && state.state->err != Z_BUF_ERROR))
+        return 0;
+
+    /* compute bytes to read -- error on overflow */
+    len = nitems * size;
+    if (size && len / size != nitems) {
+        gz_error(state, Z_STREAM_ERROR, "request does not fit in a size_t");
+        return 0;
+    }
+
+    /* read len or fewer bytes to buf, return the number of full items read */
+    return len ? gz_read(state, buf, len) / size : 0;
+}
+
+/* -- see zlib.h -- */
+#if ZLIB_VERNUM >= 0x1261
+#ifdef Z_PREFIX_SET
+#  undef z_gzgetc
+#else
+#  undef gzgetc
+#endif
+#endif
+
+#if ZLIB_VERNUM == 0x1260
+#  undef gzgetc
+#endif
+
+#if ZLIB_VERNUM <= 0x1250
+ZEXTERN int ZEXPORT gzgetc OF((gzFile file));
+ZEXTERN int ZEXPORT gzgetc_ OF((gzFile file));
+#endif
+
+int ZEXPORT gzgetc(file)
+    gzFile file;
+{
+    int ret;
+    unsigned char buf[1];
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+
+    /* check that we're reading and that there's no (serious) error */
+    if (state.state->mode != GZ_READ ||
+        (state.state->err != Z_OK && state.state->err != Z_BUF_ERROR))
+        return -1;
+
+    /* try output buffer (no need to check for skip request) */
+    if (state.state->x.have) {
+        state.state->x.have--;
+        state.state->x.pos++;
+        return *(state.state->x.next)++;
+    }
+
+    /* nothing there -- try gz_read() */
+    ret = (unsigned)gz_read(state, buf, 1);
+    return ret < 1 ? -1 : buf[0];
+}
+
+int ZEXPORT gzgetc_(file)
+gzFile file;
+{
+    return gzgetc(file);
+}
+
+/* -- see zlib.h -- */
+int ZEXPORT gzungetc(c, file)
+    int c;
+    gzFile file;
+{
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+
+    /* check that we're reading and that there's no (serious) error */
+    if (state.state->mode != GZ_READ ||
+        (state.state->err != Z_OK && state.state->err != Z_BUF_ERROR))
+        return -1;
+
+    /* process a skip request */
+    if (state.state->seek) {
+        state.state->seek = 0;
+        if (gz_skip(state, state.state->skip) == -1)
+            return -1;
+    }
+
+    /* can't push EOF */
+    if (c < 0)
+        return -1;
+
+    /* if output buffer empty, put byte at end (allows more pushing) */
+    if (state.state->x.have == 0) {
+        state.state->x.have = 1;
+        state.state->x.next = state.state->out + (state.state->size << 1) - 1;
+        state.state->x.next[0] = (unsigned char)c;
+        state.state->x.pos--;
+        state.state->past = 0;
+        return c;
+    }
+
+    /* if no room, give up (must have already done a gzungetc()) */
+    if (state.state->x.have == (state.state->size << 1)) {
+        gz_error(state, Z_DATA_ERROR, "out of room to push characters");
+        return -1;
+    }
+
+    /* slide output data if needed and insert byte before existing data */
+    if (state.state->x.next == state.state->out) {
+        unsigned char *src = state.state->out + state.state->x.have;
+        unsigned char *dest = state.state->out + (state.state->size << 1);
+        while (src > state.state->out)
+            *--dest = *--src;
+        state.state->x.next = dest;
+    }
+    state.state->x.have++;
+    state.state->x.next--;
+    state.state->x.next[0] = (unsigned char)c;
+    state.state->x.pos--;
+    state.state->past = 0;
+    return c;
+}
+
+/* -- see zlib.h -- */
+char * ZEXPORT gzgets(file, buf, len)
+    gzFile file;
+    char *buf;
+    int len;
+{
+    unsigned left, n;
+    char *str;
+    unsigned char *eol;
+    gz_statep state;
+
+    /* check parameters and get internal structure */
+    if (file == NULL || buf == NULL || len < 1)
+        return NULL;
+    state = (gz_statep)file;
+
+    /* check that we're reading and that there's no (serious) error */
+    if (state.state->mode != GZ_READ ||
+        (state.state->err != Z_OK && state.state->err != Z_BUF_ERROR))
+        return NULL;
+
+    /* process a skip request */
+    if (state.state->seek) {
+        state.state->seek = 0;
+        if (gz_skip(state, state.state->skip) == -1)
+            return NULL;
+    }
+
+    /* copy output bytes up to new line or len - 1, whichever comes first --
+       append a terminating zero to the string (we don't check for a zero in
+       the contents, let the user worry about that) */
+    str = buf;
+    left = (unsigned)len - 1;
+    if (left) do {
+        /* assure that something is in the output buffer */
+        if (state.state->x.have == 0 && gz_fetch(state) == -1)
+            return NULL;                /* error */
+        if (state.state->x.have == 0) {       /* end of file */
+            state.state->past = 1;            /* read past end */
+            break;                      /* return what we have */
+        }
+
+        /* look for end-of-line in current output buffer */
+        n = state.state->x.have > left ? left : state.state->x.have;
+        eol = (unsigned char *)memchr(state.state->x.next, '\n', n);
+        if (eol != NULL)
+            n = (unsigned)(eol - state.state->x.next) + 1;
+
+        /* copy through end-of-line, or remainder if not found */
+        memcpy(buf, state.state->x.next, n);
+        state.state->x.have -= n;
+        state.state->x.next += n;
+        state.state->x.pos += n;
+        left -= n;
+        buf += n;
+    } while (left && eol == NULL);
+
+    /* return terminated string, or if nothing, end of file */
+    if (buf == str)
+        return NULL;
+    buf[0] = 0;
+    return str;
+}
+
+/* -- see zlib.h -- */
+int ZEXPORT gzdirect(file)
+    gzFile file;
+{
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return 0;
+    state = (gz_statep)file;
+
+    /* if the state is not known, but we can find out, then do so (this is
+       mainly for right after a gzopen() or gzdopen()) */
+    if (state.state->mode == GZ_READ && state.state->how == LOOK && state.state->x.have == 0)
+        (void)gz_look(state);
+
+    /* return 1 if transparent, 0 if processing a gzip stream */
+    return state.state->direct;
+}
+
+/* -- see zlib.h -- */
+int ZEXPORT gzclose_r(file)
+    gzFile file;
+{
+    int ret, err;
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_statep)file;
+
+    /* check that we're reading */
+    if (state.state->mode != GZ_READ)
+        return Z_STREAM_ERROR;
+
+    /* free memory and close file */
+    if (state.state->size) {
+        inflateEnd(&(state.state->strm));
+        free(state.state->out);
+        free(state.state->in);
+    }
+    err = state.state->err == Z_BUF_ERROR ? Z_BUF_ERROR : Z_OK;
+    gz_error(state, Z_OK, NULL);
+    free(state.state->path);
+    ret = close(state.state->fd);
+    free(state.state);
+    return ret ? Z_ERRNO : err;
+}
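
The gz_look() change above is the functional heart of this wrapper: besides the gzip magic (bytes 31, 139, i.e. 0x1F 0x8B), it also recognizes a zstd frame (bytes 40, 181, the low-order bytes of the little-endian magic 0xFD2FB528) and routes both through the wrapped inflate path. A stand-alone sketch of the same test (the helper name is hypothetical):

    #include <stdio.h>

    static const char *sniff(const unsigned char *p, unsigned n) {
        if (n > 1 && p[0] == 31 && p[1] == 139) return "gzip";
        if (n > 1 && p[0] == 40 && p[1] == 181) return "zstd";
        return "raw";
    }

    int main(void) {
        const unsigned char hdr[] = { 40, 181, 47, 253 };
        puts(sniff(hdr, sizeof hdr));   /* prints "zstd" */
        return 0;
    }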
diff --git a/zstd/zlibWrapper/gzread.o b/zstd/zlibWrapper/gzread.o
new file mode 100644
index 0000000..9c5f9a7
Binary files /dev/null and b/zstd/zlibWrapper/gzread.o differ
diff --git a/zstd/zlibWrapper/gzwrite.c b/zstd/zlibWrapper/gzwrite.c
new file mode 100644
index 0000000..bcda477
--- /dev/null
+++ b/zstd/zlibWrapper/gzwrite.c
@@ -0,0 +1,668 @@
+/* gzwrite.c contains minimal changes required to be compiled with zlibWrapper:
+ * - gz_statep was converted to union to work with -Wstrict-aliasing=1      */
+
+/* gzwrite.c -- zlib functions for writing gzip files
+ * Copyright (C) 2004-2017 Mark Adler
+ * For conditions of distribution and use, see http://www.zlib.net/zlib_license.html
+ */
+
+#include "gzguts.h"
+
+/* Local functions */
+local int gz_init OF((gz_statep));
+local int gz_comp OF((gz_statep, int));
+local int gz_zero OF((gz_statep, z_off64_t));
+local z_size_t gz_write OF((gz_statep, voidpc, z_size_t));
+
+/* Initialize state for writing a gzip file.  Mark initialization by setting
+   state.state->size to non-zero.  Return -1 on a memory allocation failure, or 0 on
+   success. */
+local int gz_init(state)
+    gz_statep state;
+{
+    int ret;
+    z_streamp strm = &(state.state->strm);
+
+    /* allocate input buffer (double size for gzprintf) */
+    state.state->in = (unsigned char *)malloc(state.state->want << 1);
+    if (state.state->in == NULL) {
+        gz_error(state, Z_MEM_ERROR, "out of memory");
+        return -1;
+    }
+
+    /* only need output buffer and deflate state if compressing */
+    if (!state.state->direct) {
+        /* allocate output buffer */
+        state.state->out = (unsigned char *)malloc(state.state->want);
+        if (state.state->out == NULL) {
+            free(state.state->in);
+            gz_error(state, Z_MEM_ERROR, "out of memory");
+            return -1;
+        }
+
+        /* allocate deflate memory, set up for gzip compression */
+        strm->zalloc = Z_NULL;
+        strm->zfree = Z_NULL;
+        strm->opaque = Z_NULL;
+        ret = deflateInit2(strm, state.state->level, Z_DEFLATED,
+                           MAX_WBITS + 16, DEF_MEM_LEVEL, state.state->strategy);
+        if (ret != Z_OK) {
+            free(state.state->out);
+            free(state.state->in);
+            gz_error(state, Z_MEM_ERROR, "out of memory");
+            return -1;
+        }
+        strm->next_in = NULL;
+    }
+
+    /* mark state as initialized */
+    state.state->size = state.state->want;
+
+    /* initialize write buffer if compressing */
+    if (!state.state->direct) {
+        strm->avail_out = state.state->size;
+        strm->next_out = state.state->out;
+        state.state->x.next = strm->next_out;
+    }
+    return 0;
+}
+
+/* Compress whatever is at avail_in and next_in and write to the output file.
+   Return -1 if there is an error writing to the output file or if gz_init()
+   fails to allocate memory, otherwise 0.  flush is assumed to be a valid
+   deflate() flush value.  If flush is Z_FINISH, then the deflate() state is
+   reset to start a new gzip stream.  If gz->direct is true, then simply write
+   to the output file without compressing, and ignore flush. */
+local int gz_comp(state, flush)
+    gz_statep state;
+    int flush;
+{
+    int ret, writ;
+    unsigned have, put, max = ((unsigned)-1 >> 2) + 1;
+    z_streamp strm = &(state.state->strm);
+
+    /* allocate memory if this is the first time through */
+    if (state.state->size == 0 && gz_init(state) == -1)
+        return -1;
+
+    /* write directly if requested */
+    if (state.state->direct) {
+        while (strm->avail_in) {
+            put = strm->avail_in > max ? max : strm->avail_in;
+            writ = (int)write(state.state->fd, strm->next_in, put);
+            if (writ < 0) {
+                gz_error(state, Z_ERRNO, zstrerror());
+                return -1;
+            }
+            strm->avail_in -= (unsigned)writ;
+            strm->next_in += writ;
+        }
+        return 0;
+    }
+
+    /* run deflate() on provided input until it produces no more output */
+    ret = Z_OK;
+    do {
+        /* write out current buffer contents if full, or if flushing, but if
+           doing Z_FINISH then don't write until we get to Z_STREAM_END */
+        if (strm->avail_out == 0 || (flush != Z_NO_FLUSH &&
+            (flush != Z_FINISH || ret == Z_STREAM_END))) {
+            while (strm->next_out > state.state->x.next) {
+                put = strm->next_out - state.state->x.next > (int)max ? max :
+                      (unsigned)(strm->next_out - state.state->x.next);
+                writ = (int)write(state.state->fd, state.state->x.next, put);
+                if (writ < 0) {
+                    gz_error(state, Z_ERRNO, zstrerror());
+                    return -1;
+                }
+                state.state->x.next += writ;
+            }
+            if (strm->avail_out == 0) {
+                strm->avail_out = state.state->size;
+                strm->next_out = state.state->out;
+                state.state->x.next = state.state->out;
+            }
+        }
+
+        /* compress */
+        have = strm->avail_out;
+        ret = deflate(strm, flush);
+        if (ret == Z_STREAM_ERROR) {
+            gz_error(state, Z_STREAM_ERROR,
+                      "internal error: deflate stream corrupt");
+            return -1;
+        }
+        have -= strm->avail_out;
+    } while (have);
+
+    /* if that completed a deflate stream, allow another to start */
+    if (flush == Z_FINISH)
+        deflateReset(strm);
+
+    /* all done, no errors */
+    return 0;
+}
+
+/* Compress len zeros to output.  Return -1 on a write error or memory
+   allocation failure by gz_comp(), or 0 on success. */
+local int gz_zero(state, len)
+    gz_statep state;
+    z_off64_t len;
+{
+    int first;
+    unsigned n;
+    z_streamp strm = &(state.state->strm);
+
+    /* consume whatever's left in the input buffer */
+    if (strm->avail_in && gz_comp(state, Z_NO_FLUSH) == -1)
+        return -1;
+
+    /* compress len zeros (len guaranteed > 0) */
+    first = 1;
+    while (len) {
+        n = GT_OFF(state.state->size) || (z_off64_t)state.state->size > len ?
+            (unsigned)len : state.state->size;
+        if (first) {
+            memset(state.state->in, 0, n);
+            first = 0;
+        }
+        strm->avail_in = n;
+        strm->next_in = state.state->in;
+        state.state->x.pos += n;
+        if (gz_comp(state, Z_NO_FLUSH) == -1)
+            return -1;
+        len -= n;
+    }
+    return 0;
+}
+
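+/* Context for gz_zero() above: forward seeks on a write stream are lazy.
+   gzseek(file, n, SEEK_CUR) only records state.state->skip = n; the next
+   write triggers gz_zero(), which compresses n zero bytes to bridge the gap
+   (a hedged sketch, not a complete program):
+
+       gzseek(f, 4096, SEEK_CUR);  // no output yet, records the skip
+       gzwrite(f, buf, len);       // first compresses 4096 zeros, then buf
+*/
+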
+/* Write len bytes from buf to file.  Return the number of bytes written.  If
+   the returned value is less than len, then there was an error. */
+local z_size_t gz_write(state, buf, len)
+    gz_statep state;
+    voidpc buf;
+    z_size_t len;
+{
+    z_size_t put = len;
+
+    /* if len is zero, avoid unnecessary operations */
+    if (len == 0)
+        return 0;
+
+    /* allocate memory if this is the first time through */
+    if (state.state->size == 0 && gz_init(state) == -1)
+        return 0;
+
+    /* check for seek request */
+    if (state.state->seek) {
+        state.state->seek = 0;
+        if (gz_zero(state, state.state->skip) == -1)
+            return 0;
+    }
+
+    /* for small len, copy to input buffer, otherwise compress directly */
+    if (len < state.state->size) {
+        /* copy to input buffer, compress when full */
+        do {
+            z_size_t have, copy;
+
+            if (state.state->strm.avail_in == 0)
+                state.state->strm.next_in = state.state->in;
+            have = (unsigned)((state.state->strm.next_in + state.state->strm.avail_in) -
+                              state.state->in);
+            copy = state.state->size - have;
+            if (copy > len)
+                copy = len;
+            memcpy(state.state->in + have, buf, copy);
+            state.state->strm.avail_in += copy;
+            state.state->x.pos += copy;
+            buf = (const char *)buf + copy;
+            len -= copy;
+            if (len && gz_comp(state, Z_NO_FLUSH) == -1)
+                return 0;
+        } while (len);
+    }
+    else {
+        /* consume whatever's left in the input buffer */
+        if (state.state->strm.avail_in && gz_comp(state, Z_NO_FLUSH) == -1)
+            return 0;
+
+        /* directly compress user buffer to file */
+        state.state->strm.next_in = (z_const Bytef *)buf;
+        do {
+            z_size_t n = (unsigned)-1;
+            if (n > len)
+                n = len;
+            state.state->strm.avail_in = (z_uInt)n;
+            state.state->x.pos += n;
+            if (gz_comp(state, Z_NO_FLUSH) == -1)
+                return 0;
+            len -= n;
+        } while (len);
+    }
+
+    /* input was all buffered or compressed */
+    return put;
+}
+
+/* -- see zlib.h -- */
+int ZEXPORT gzwrite(file, buf, len)
+    gzFile file;
+    voidpc buf;
+    unsigned len;
+{
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return 0;
+    state = (gz_statep)file;
+
+    /* check that we're writing and that there's no error */
+    if (state.state->mode != GZ_WRITE || state.state->err != Z_OK)
+        return 0;
+
+    /* since an int is returned, make sure len fits in one, otherwise return
+       with an error (this avoids a flaw in the interface) */
+    if ((int)len < 0) {
+        gz_error(state, Z_DATA_ERROR, "requested length does not fit in int");
+        return 0;
+    }
+
+    /* write len bytes from buf (the return value will fit in an int) */
+    return (int)gz_write(state, buf, len);
+}
+
+/* -- see zlib.h -- */
+z_size_t ZEXPORT gzfwrite(buf, size, nitems, file)
+    voidpc buf;
+    z_size_t size;
+    z_size_t nitems;
+    gzFile file;
+{
+    z_size_t len;
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return 0;
+    state = (gz_statep)file;
+
+    /* check that we're writing and that there's no error */
+    if (state.state->mode != GZ_WRITE || state.state->err != Z_OK)
+        return 0;
+
+    /* compute bytes to write -- error on overflow */
+    len = nitems * size;
+    if (size && len / size != nitems) {
+        gz_error(state, Z_STREAM_ERROR, "request does not fit in a size_t");
+        return 0;
+    }
+
+    /* write len bytes from buf, return the number of full items written */
+    return len ? gz_write(state, buf, len) / size : 0;
+}
+
+/* -- see zlib.h -- */
+int ZEXPORT gzputc(file, c)
+    gzFile file;
+    int c;
+{
+    unsigned have;
+    unsigned char buf[1];
+    gz_statep state;
+    z_streamp strm;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+    strm = &(state.state->strm);
+
+    /* check that we're writing and that there's no error */
+    if (state.state->mode != GZ_WRITE || state.state->err != Z_OK)
+        return -1;
+
+    /* check for seek request */
+    if (state.state->seek) {
+        state.state->seek = 0;
+        if (gz_zero(state, state.state->skip) == -1)
+            return -1;
+    }
+
+    /* try writing to input buffer for speed (state.state->size == 0 if buffer not
+       initialized) */
+    if (state.state->size) {
+        if (strm->avail_in == 0)
+            strm->next_in = state.state->in;
+        have = (unsigned)((strm->next_in + strm->avail_in) - state.state->in);
+        if (have < state.state->size) {
+            state.state->in[have] = (unsigned char)c;
+            strm->avail_in++;
+            state.state->x.pos++;
+            return c & 0xff;
+        }
+    }
+
+    /* no room in buffer or not initialized, use gz_write() */
+    buf[0] = (unsigned char)c;
+    if (gz_write(state, buf, 1) != 1)
+        return -1;
+    return c & 0xff;
+}
+
+/* -- see zlib.h -- */
+int ZEXPORT gzputs(file, str)
+    gzFile file;
+    const char *str;
+{
+    int ret;
+    z_size_t len;
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return -1;
+    state = (gz_statep)file;
+
+    /* check that we're writing and that there's no error */
+    if (state.state->mode != GZ_WRITE || state.state->err != Z_OK)
+        return -1;
+
+    /* write string */
+    len = strlen(str);
+    ret = (int)gz_write(state, str, len);
+    return ret == 0 && len != 0 ? -1 : ret;
+}
+
+#if defined(STDC) || defined(Z_HAVE_STDARG_H)
+#include <stdarg.h>
+
+/* -- see zlib.h -- */
+int ZEXPORTVA gzvprintf(gzFile file, const char *format, va_list va)
+{
+    int len;
+    unsigned left;
+    char *next;
+    gz_statep state;
+    z_streamp strm;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_statep)file;
+    strm = &(state.state->strm);
+
+    /* check that we're writing and that there's no error */
+    if (state.state->mode != GZ_WRITE || state.state->err != Z_OK)
+        return Z_STREAM_ERROR;
+
+    /* make sure we have some buffer space */
+    if (state.state->size == 0 && gz_init(state) == -1)
+        return state.state->err;
+
+    /* check for seek request */
+    if (state.state->seek) {
+        state.state->seek = 0;
+        if (gz_zero(state, state.state->skip) == -1)
+            return state.state->err;
+    }
+
+    /* do the printf() into the input buffer, put length in len -- the input
+       buffer is double-sized just for this function, so there is guaranteed to
+       be state.state->size bytes available after the current contents */
+    if (strm->avail_in == 0)
+        strm->next_in = state.state->in;
+    next = (char *)(state.state->in + (strm->next_in - state.state->in) + strm->avail_in);
+    next[state.state->size - 1] = 0;
+#ifdef NO_vsnprintf
+#  ifdef HAS_vsprintf_void
+    (void)vsprintf(next, format, va);
+    for (len = 0; len < state.state->size; len++)
+        if (next[len] == 0) break;
+#  else
+    len = vsprintf(next, format, va);
+#  endif
+#else
+#  ifdef HAS_vsnprintf_void
+    (void)vsnprintf(next, state.state->size, format, va);
+    len = strlen(next);
+#  else
+    len = vsnprintf(next, state.state->size, format, va);
+#  endif
+#endif
+
+    /* check that printf() results fit in buffer */
+    if (len == 0 || (unsigned)len >= state.state->size || next[state.state->size - 1] != 0)
+        return 0;
+
+    /* update buffer and position, compress first half if past that */
+    strm->avail_in += (unsigned)len;
+    state.state->x.pos += len;
+    if (strm->avail_in >= state.state->size) {
+        left = strm->avail_in - state.state->size;
+        strm->avail_in = state.state->size;
+        if (gz_comp(state, Z_NO_FLUSH) == -1)
+            return state.state->err;
+        memcpy(state.state->in, state.state->in + state.state->size, left);
+        strm->next_in = state.state->in;
+        strm->avail_in = left;
+    }
+    return len;
+}
+
+int ZEXPORTVA gzprintf(gzFile file, const char *format, ...)
+{
+    va_list va;
+    int ret;
+
+    va_start(va, format);
+    ret = gzvprintf(file, format, va);
+    va_end(va);
+    return ret;
+}
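+
+/* Illustrative usage sketch (not part of upstream zlib; guard and helper are
+   hypothetical): gzprintf() behaves like fprintf() on a compressed stream,
+   with the caveat that the formatted result must fit in the stream's input
+   buffer (see the size check in gzvprintf() above). */
+#ifdef GZ_EXAMPLE_HELPERS
+static int gz_log_line(gzFile log, int id, const char *msg)
+{
+    /* returns the number of uncompressed bytes written, or <= 0 on error */
+    return gzprintf(log, "%d: %s\n", id, msg);
+}
+#endif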
+
+#else /* !STDC && !Z_HAVE_STDARG_H */
+
+/* -- see zlib.h -- */
+int ZEXPORTVA gzprintf (file, format, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10,
+                       a11, a12, a13, a14, a15, a16, a17, a18, a19, a20)
+    gzFile file;
+    const char *format;
+    int a1, a2, a3, a4, a5, a6, a7, a8, a9, a10,
+        a11, a12, a13, a14, a15, a16, a17, a18, a19, a20;
+{
+    unsigned len, left;
+    char *next;
+    gz_statep state;
+    z_streamp strm;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_statep)file;
+    strm = &(state.state->strm);
+
+    /* check that we can really pass a pointer in an int */
+    if (sizeof(int) != sizeof(void *))
+        return Z_STREAM_ERROR;
+
+    /* check that we're writing and that there's no error */
+    if (state.state->mode != GZ_WRITE || state.state->err != Z_OK)
+        return Z_STREAM_ERROR;
+
+    /* make sure we have some buffer space */
+    if (state.state->size == 0 && gz_init(state) == -1)
+        return state.state->err;
+
+    /* check for seek request */
+    if (state.state->seek) {
+        state.state->seek = 0;
+        if (gz_zero(state, state.state->skip) == -1)
+            return state.state->err;
+    }
+
+    /* do the printf() into the input buffer, put length in len -- the input
+       buffer is double-sized just for this function, so there is guaranteed to
+       be state.state->size bytes available after the current contents */
+    if (strm->avail_in == 0)
+        strm->next_in = state.state->in;
+    next = (char *)(strm->next_in + strm->avail_in);
+    next[state.state->size - 1] = 0;
+#ifdef NO_snprintf
+#  ifdef HAS_sprintf_void
+    sprintf(next, format, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
+            a13, a14, a15, a16, a17, a18, a19, a20);
+    for (len = 0; len < state.state->size; len++)
+        if (next[len] == 0)
+            break;
+#  else
+    len = sprintf(next, format, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11,
+                  a12, a13, a14, a15, a16, a17, a18, a19, a20);
+#  endif
+#else
+#  ifdef HAS_snprintf_void
+    snprintf(next, state.state->size, format, a1, a2, a3, a4, a5, a6, a7, a8, a9,
+             a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20);
+    len = strlen(next);
+#  else
+    len = snprintf(next, state.state->size, format, a1, a2, a3, a4, a5, a6, a7, a8,
+                   a9, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20);
+#  endif
+#endif
+
+    /* check that printf() results fit in buffer */
+    if (len == 0 || len >= state.state->size || next[state.state->size - 1] != 0)
+        return 0;
+
+    /* update buffer and position, compress first half if past that */
+    strm->avail_in += len;
+    state.state->x.pos += len;
+    if (strm->avail_in >= state.state->size) {
+        left = strm->avail_in - state.state->size;
+        strm->avail_in = state.state->size;
+        if (gz_comp(state, Z_NO_FLUSH) == -1)
+            return state.state->err;
+        memcpy(state.state->in, state.state->in + state.state->size, left);
+        strm->next_in = state.state->in;
+        strm->avail_in = left;
+    }
+    return (int)len;
+}
+
+#endif
+
+/* -- see zlib.h -- */
+int ZEXPORT gzflush(file, flush)
+    gzFile file;
+    int flush;
+{
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_statep)file;
+
+    /* check that we're writing and that there's no error */
+    if (state.state->mode != GZ_WRITE || state.state->err != Z_OK)
+        return Z_STREAM_ERROR;
+
+    /* check flush parameter */
+    if (flush < 0 || flush > Z_FINISH)
+        return Z_STREAM_ERROR;
+
+    /* check for seek request */
+    if (state.state->seek) {
+        state.state->seek = 0;
+        if (gz_zero(state, state.state->skip) == -1)
+            return state.state->err;
+    }
+
+    /* compress remaining data with requested flush */
+    (void)gz_comp(state, flush);
+    return state.state->err;
+}
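+
+/* Illustrative usage sketch (not part of upstream zlib; guard and helper are
+   hypothetical): Z_SYNC_FLUSH makes everything written so far available to a
+   reader, at some cost in compression ratio when used frequently. */
+#ifdef GZ_EXAMPLE_HELPERS
+static int gz_write_and_sync(gzFile f, const void *buf, unsigned len)
+{
+    if (gzwrite(f, buf, len) != (int)len)
+        return -1;
+    return gzflush(f, Z_SYNC_FLUSH) == Z_OK ? 0 : -1;
+}
+#endif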
+
+/* -- see zlib.h -- */
+int ZEXPORT gzsetparams(file, level, strategy)
+    gzFile file;
+    int level;
+    int strategy;
+{
+    gz_statep state;
+    z_streamp strm;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_statep)file;
+    strm = &(state.state->strm);
+
+    /* check that we're writing and that there's no error */
+    if (state.state->mode != GZ_WRITE || state.state->err != Z_OK)
+        return Z_STREAM_ERROR;
+
+    /* if no change is requested, then do nothing */
+    if (level == state.state->level && strategy == state.state->strategy)
+        return Z_OK;
+
+    /* check for seek request */
+    if (state.state->seek) {
+        state.state->seek = 0;
+        if (gz_zero(state, state.state->skip) == -1)
+            return state.state->err;
+    }
+
+    /* change compression parameters for subsequent input */
+    if (state.state->size) {
+        /* flush previous input with previous parameters before changing */
+        if (strm->avail_in && gz_comp(state, Z_BLOCK) == -1)
+            return state.state->err;
+        deflateParams(strm, level, strategy);
+    }
+    state.state->level = level;
+    state.state->strategy = strategy;
+    return Z_OK;
+}
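+
+/* Illustrative usage sketch (not part of upstream zlib; guard and helper are
+   hypothetical): compression parameters may be changed between writes, e.g.
+   to favor speed for a bulky, already-compact section of the output;
+   buffered input is flushed with Z_BLOCK first, as shown above. */
+#ifdef GZ_EXAMPLE_HELPERS
+static int gz_prefer_speed(gzFile f)
+{
+    return gzsetparams(f, Z_BEST_SPEED, Z_DEFAULT_STRATEGY);
+}
+#endif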
+
+/* -- see zlib.h -- */
+int ZEXPORT gzclose_w(file)
+    gzFile file;
+{
+    int ret = Z_OK;
+    gz_statep state;
+
+    /* get internal structure */
+    if (file == NULL)
+        return Z_STREAM_ERROR;
+    state = (gz_statep)file;
+
+    /* check that we're writing */
+    if (state.state->mode != GZ_WRITE)
+        return Z_STREAM_ERROR;
+
+    /* check for seek request */
+    if (state.state->seek) {
+        state.state->seek = 0;
+        if (gz_zero(state, state.state->skip) == -1)
+            ret = state.state->err;
+    }
+
+    /* flush, free memory, and close file */
+    if (gz_comp(state, Z_FINISH) == -1)
+        ret = state.state->err;
+    if (state.state->size) {
+        if (!state.state->direct) {
+            (void)deflateEnd(&(state.state->strm));
+            free(state.state->out);
+        }
+        free(state.state->in);
+    }
+    gz_error(state, Z_OK, NULL);
+    free(state.state->path);
+    if (close(state.state->fd) == -1)
+        ret = Z_ERRNO;
+    free(state.state);
+    return ret;
+}
diff --git a/zstd/zlibWrapper/gzwrite.o b/zstd/zlibWrapper/gzwrite.o
new file mode 100644
index 0000000..5dc663c
Binary files /dev/null and b/zstd/zlibWrapper/gzwrite.o differ
diff --git a/zstd/zlibWrapper/minigzip b/zstd/zlibWrapper/minigzip
new file mode 100755
index 0000000..f43c3b3
Binary files /dev/null and b/zstd/zlibWrapper/minigzip differ
diff --git a/zstd/zlibWrapper/zstd_zlibwrapper.c b/zstd/zlibWrapper/zstd_zlibwrapper.c
new file mode 100644
index 0000000..a2f6752
--- /dev/null
+++ b/zstd/zlibWrapper/zstd_zlibwrapper.c
@@ -0,0 +1,1082 @@
+/**
+ * Copyright (c) 2016-present, Przemyslaw Skibinski, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+
+#include <stdio.h>                 /* vsprintf */
+#include <stdarg.h>                /* va_list, for z_gzprintf */
+#define NO_DUMMY_DECL
+#define ZLIB_CONST
+#ifdef STATIC_ZLIB
+  #include "../../../zlib-1.2.11/zlib.h"
+#else
+  #include <zlib.h>
+#endif
+#include "zstd_zlibwrapper.h"
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_MAGICNUMBER */
+#include "zstd.h"
+#include "zstd_internal.h"         /* defaultCustomMem */
+
+
+#define Z_INFLATE_SYNC              8
+#define ZLIB_HEADERSIZE             4
+#define ZSTD_HEADERSIZE             ZSTD_frameHeaderSize_min
+#define ZWRAP_DEFAULT_CLEVEL        3   /* Z_DEFAULT_COMPRESSION is translated to ZWRAP_DEFAULT_CLEVEL for zstd */
+
+#define LOG_WRAPPERC(...)  /* printf(__VA_ARGS__) */
+#define LOG_WRAPPERD(...)  /* printf(__VA_ARGS__) */
+
+#define FINISH_WITH_GZ_ERR(msg) { (void)msg; return Z_STREAM_ERROR; }
+#define FINISH_WITH_NULL_ERR(msg) { (void)msg; return NULL; }
+
+
+
+#define ZWRAP_USE_ZSTD 1
+#ifndef ZWRAP_USE_ZSTD
+    #define ZWRAP_USE_ZSTD 0
+#endif
+
+static int g_ZWRAP_useZSTDcompression = ZWRAP_USE_ZSTD;   /* 0 = don't use ZSTD */
+
+void ZWRAP_useZSTDcompression(int turn_on) { g_ZWRAP_useZSTDcompression = turn_on; }
+
+int ZWRAP_isUsingZSTDcompression(void) { return g_ZWRAP_useZSTDcompression; }
+
+
+
+static ZWRAP_decompress_type g_ZWRAPdecompressionType = ZWRAP_AUTO;
+
+void ZWRAP_setDecompressionType(ZWRAP_decompress_type type) { g_ZWRAPdecompressionType = type; }
+
+ZWRAP_decompress_type ZWRAP_getDecompressionType(void) { return g_ZWRAPdecompressionType; }
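+
+/* Illustrative usage sketch (a hypothetical helper, never compiled): the two
+   switches above are plain globals, so a program selects codecs once, before
+   creating any streams; as documented in the header, this is not
+   thread-safe. */
+#ifdef ZWRAP_EXAMPLE_HELPERS
+static void example_select_codecs(void)
+{
+    ZWRAP_useZSTDcompression(1);                  /* deflate*() emits zstd frames */
+    ZWRAP_setDecompressionType(ZWRAP_FORCE_ZLIB); /* inflate*() skips auto-detection */
+}
+#endif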
+
+
+
+const char * zstdVersion(void) { return ZSTD_VERSION_STRING; }
+
+ZEXTERN const char * ZEXPORT z_zlibVersion OF((void)) { return zlibVersion();  }
+
+
+
+static void* ZWRAP_allocFunction(void* opaque, size_t size)
+{
+    z_streamp strm = (z_streamp) opaque;
+    void* address = strm->zalloc(strm->opaque, 1, (uInt)size);
+  /*  printf("ZWRAP alloc %p, %d \n", address, (int)size); */
+    return address;
+}
+
+static void ZWRAP_freeFunction(void* opaque, void* address)
+{
+    z_streamp strm = (z_streamp) opaque;
+    strm->zfree(strm->opaque, address);
+   /* if (address) printf("ZWRAP free %p \n", address); */
+}
+
+
+
+/* *** Compression *** */
+typedef enum { ZWRAP_useInit, ZWRAP_useReset, ZWRAP_streamEnd } ZWRAP_state_t;
+
+typedef struct {
+    ZSTD_CStream* zbc;
+    int compressionLevel;
+    int streamEnd; /* a flag to signal the end of a stream */
+    unsigned long long totalInBytes; /* kept separately because strm->total_in can be reset by the user */
+    ZSTD_customMem customMem;
+    z_stream allocFunc; /* copy of zalloc, zfree, opaque */
+    ZSTD_inBuffer inBuffer;
+    ZSTD_outBuffer outBuffer;
+    ZWRAP_state_t comprState;
+    unsigned long long pledgedSrcSize;
+} ZWRAP_CCtx;
+
+typedef ZWRAP_CCtx internal_state;
+
+
+
+size_t ZWRAP_freeCCtx(ZWRAP_CCtx* zwc)
+{
+    if (zwc==NULL) return 0;   /* support free on NULL */
+    if (zwc->zbc) ZSTD_freeCStream(zwc->zbc);
+    zwc->customMem.customFree(zwc->customMem.opaque, zwc);
+    return 0;
+}
+
+
+ZWRAP_CCtx* ZWRAP_createCCtx(z_streamp strm)
+{
+    ZWRAP_CCtx* zwc;
+
+    if (strm->zalloc && strm->zfree) {
+        zwc = (ZWRAP_CCtx*)strm->zalloc(strm->opaque, 1, sizeof(ZWRAP_CCtx));
+        if (zwc==NULL) return NULL;
+        memset(zwc, 0, sizeof(ZWRAP_CCtx));
+        memcpy(&zwc->allocFunc, strm, sizeof(z_stream));
+        { ZSTD_customMem ZWRAP_customMem = { ZWRAP_allocFunction, ZWRAP_freeFunction, &zwc->allocFunc };
+          memcpy(&zwc->customMem, &ZWRAP_customMem, sizeof(ZSTD_customMem));
+        }
+    } else {
+        zwc = (ZWRAP_CCtx*)defaultCustomMem.customAlloc(defaultCustomMem.opaque, sizeof(ZWRAP_CCtx));
+        if (zwc==NULL) return NULL;
+        memset(zwc, 0, sizeof(ZWRAP_CCtx));
+        memcpy(&zwc->customMem, &defaultCustomMem, sizeof(ZSTD_customMem));
+    }
+
+    return zwc;
+}
+
+
+int ZWRAP_initializeCStream(ZWRAP_CCtx* zwc, const void* dict, size_t dictSize, unsigned long long pledgedSrcSize)
+{
+    LOG_WRAPPERC("- ZWRAP_initializeCStream=%p\n", zwc);
+    if (zwc == NULL || zwc->zbc == NULL) return Z_STREAM_ERROR;
+
+    if (!pledgedSrcSize) pledgedSrcSize = zwc->pledgedSrcSize;
+    { ZSTD_parameters const params = ZSTD_getParams(zwc->compressionLevel, pledgedSrcSize, dictSize);
+      size_t errorCode;
+      LOG_WRAPPERC("pledgedSrcSize=%d windowLog=%d chainLog=%d hashLog=%d searchLog=%d searchLength=%d strategy=%d\n", (int)pledgedSrcSize, params.cParams.windowLog, params.cParams.chainLog, params.cParams.hashLog, params.cParams.searchLog, params.cParams.searchLength, params.cParams.strategy);
+      errorCode = ZSTD_initCStream_advanced(zwc->zbc, dict, dictSize, params, pledgedSrcSize);
+      if (ZSTD_isError(errorCode)) return Z_STREAM_ERROR; }
+
+    return Z_OK;
+}
+
+
+int ZWRAPC_finishWithError(ZWRAP_CCtx* zwc, z_streamp strm, int error)
+{
+    LOG_WRAPPERC("- ZWRAPC_finishWithError=%d\n", error);
+    if (zwc) ZWRAP_freeCCtx(zwc);
+    if (strm) strm->state = NULL;
+    return (error) ? error : Z_STREAM_ERROR;
+}
+
+
+int ZWRAPC_finishWithErrorMsg(z_streamp strm, char* message)
+{
+    ZWRAP_CCtx* zwc = (ZWRAP_CCtx*) strm->state;
+    strm->msg = message;
+    if (zwc == NULL) return Z_STREAM_ERROR;
+
+    return ZWRAPC_finishWithError(zwc, strm, 0);
+}
+
+
+int ZWRAP_setPledgedSrcSize(z_streamp strm, unsigned long long pledgedSrcSize)
+{
+    ZWRAP_CCtx* zwc = (ZWRAP_CCtx*) strm->state;
+    if (zwc == NULL) return Z_STREAM_ERROR;
+
+    zwc->pledgedSrcSize = pledgedSrcSize;
+    zwc->comprState = ZWRAP_useInit;
+    return Z_OK;
+}
+
+
+ZEXTERN int ZEXPORT z_deflateInit_ OF((z_streamp strm, int level,
+                                     const char *version, int stream_size))
+{
+    ZWRAP_CCtx* zwc;
+
+    LOG_WRAPPERC("- deflateInit level=%d\n", level);
+    if (!g_ZWRAP_useZSTDcompression) {
+        return deflateInit_((strm), (level), version, stream_size);
+    }
+
+    zwc = ZWRAP_createCCtx(strm);
+    if (zwc == NULL) return Z_MEM_ERROR;
+
+    if (level == Z_DEFAULT_COMPRESSION)
+        level = ZWRAP_DEFAULT_CLEVEL;
+
+    zwc->streamEnd = 0;
+    zwc->totalInBytes = 0;
+    zwc->compressionLevel = level;
+    strm->state = (struct internal_state*) zwc; /* use the state field, which is not used by the user */
+    strm->total_in = 0;
+    strm->total_out = 0;
+    strm->adler = 0;
+    return Z_OK;
+}
+
+
+ZEXTERN int ZEXPORT z_deflateInit2_ OF((z_streamp strm, int level, int method,
+                                      int windowBits, int memLevel,
+                                      int strategy, const char *version,
+                                      int stream_size))
+{
+    if (!g_ZWRAP_useZSTDcompression)
+        return deflateInit2_(strm, level, method, windowBits, memLevel, strategy, version, stream_size);
+
+    return z_deflateInit_ (strm, level, version, stream_size);
+}
+
+
+int ZWRAP_deflateReset_keepDict(z_streamp strm)
+{
+    LOG_WRAPPERC("- ZWRAP_deflateReset_keepDict\n");
+    if (!g_ZWRAP_useZSTDcompression)
+        return deflateReset(strm);
+
+    { ZWRAP_CCtx* zwc = (ZWRAP_CCtx*) strm->state;
+      if (zwc) { 
+          zwc->streamEnd = 0;
+          zwc->totalInBytes = 0;
+      }
+    }
+
+    strm->total_in = 0;
+    strm->total_out = 0;
+    strm->adler = 0;
+    return Z_OK;
+}
+
+
+ZEXTERN int ZEXPORT z_deflateReset OF((z_streamp strm))
+{
+    LOG_WRAPPERC("- deflateReset\n");
+    if (!g_ZWRAP_useZSTDcompression)
+        return deflateReset(strm);
+
+    ZWRAP_deflateReset_keepDict(strm);
+
+    { ZWRAP_CCtx* zwc = (ZWRAP_CCtx*) strm->state;
+      if (zwc) zwc->comprState = ZWRAP_useInit;
+    }
+    return Z_OK;
+}
+
+
+ZEXTERN int ZEXPORT z_deflateSetDictionary OF((z_streamp strm,
+                                             const Bytef *dictionary,
+                                             uInt  dictLength))
+{
+    if (!g_ZWRAP_useZSTDcompression) {
+        LOG_WRAPPERC("- deflateSetDictionary\n");
+        return deflateSetDictionary(strm, dictionary, dictLength);
+    }
+
+    {   ZWRAP_CCtx* zwc = (ZWRAP_CCtx*) strm->state;
+        if (!zwc) return Z_STREAM_ERROR;
+        LOG_WRAPPERC("- deflateSetDictionary level=%d\n", (int)zwc->compressionLevel);
+        if (zwc->zbc == NULL) {
+            zwc->zbc = ZSTD_createCStream_advanced(zwc->customMem);
+            if (zwc->zbc == NULL) return ZWRAPC_finishWithError(zwc, strm, 0);
+        }
+        { int res = ZWRAP_initializeCStream(zwc, dictionary, dictLength, 0);
+          if (res != Z_OK) return ZWRAPC_finishWithError(zwc, strm, res); }
+        zwc->comprState = ZWRAP_useReset;
+    }
+
+    return Z_OK;
+}
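+
+/* Illustrative usage sketch (hypothetical helper, never compiled): with zstd
+   compression enabled, the usual zlib dictionary sequence works unchanged;
+   the wrapper routes the dictionary into ZSTD_initCStream_advanced().  The
+   caller is assumed to have zero-initialized *strm. */
+#ifdef ZWRAP_EXAMPLE_HELPERS
+static int example_init_with_dictionary(z_streamp strm,
+                                        const Bytef *dict, uInt dictLen)
+{
+    int ret = z_deflateInit_(strm, Z_DEFAULT_COMPRESSION,
+                             ZLIB_VERSION, (int)sizeof(z_stream));
+    if (ret != Z_OK) return ret;
+    return z_deflateSetDictionary(strm, dict, dictLen);
+}
+#endif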
+
+
+ZEXTERN int ZEXPORT z_deflate OF((z_streamp strm, int flush))
+{
+    ZWRAP_CCtx* zwc;
+
+    if (!g_ZWRAP_useZSTDcompression) {
+        int res;
+        LOG_WRAPPERC("- deflate1 flush=%d avail_in=%d avail_out=%d total_in=%d total_out=%d\n", (int)flush, (int)strm->avail_in, (int)strm->avail_out, (int)strm->total_in, (int)strm->total_out);
+        res = deflate(strm, flush);
+        return res;
+    }
+
+    zwc = (ZWRAP_CCtx*) strm->state;
+    if (zwc == NULL) { LOG_WRAPPERC("zwc == NULL\n"); return Z_STREAM_ERROR; }
+
+    if (zwc->zbc == NULL) {
+        int res;
+        zwc->zbc = ZSTD_createCStream_advanced(zwc->customMem);
+        if (zwc->zbc == NULL) return ZWRAPC_finishWithError(zwc, strm, 0);
+        res = ZWRAP_initializeCStream(zwc, NULL, 0, (flush == Z_FINISH) ? strm->avail_in : 0);
+        if (res != Z_OK) return ZWRAPC_finishWithError(zwc, strm, res);
+        if (flush != Z_FINISH) zwc->comprState = ZWRAP_useReset;
+    } else {
+        if (zwc->totalInBytes == 0) {
+            if (zwc->comprState == ZWRAP_useReset) {
+                size_t const errorCode = ZSTD_resetCStream(zwc->zbc, (flush == Z_FINISH) ? strm->avail_in : zwc->pledgedSrcSize);
+                if (ZSTD_isError(errorCode)) { LOG_WRAPPERC("ERROR: ZSTD_resetCStream errorCode=%s\n", ZSTD_getErrorName(errorCode)); return ZWRAPC_finishWithError(zwc, strm, 0); }
+            } else {
+                int res = ZWRAP_initializeCStream(zwc, NULL, 0, (flush == Z_FINISH) ? strm->avail_in : 0);
+                if (res != Z_OK) return ZWRAPC_finishWithError(zwc, strm, res);
+                if (flush != Z_FINISH) zwc->comprState = ZWRAP_useReset;
+            }
+        }
+    }
+
+    LOG_WRAPPERC("- deflate2 flush=%d avail_in=%d avail_out=%d total_in=%d total_out=%d\n", (int)flush, (int)strm->avail_in, (int)strm->avail_out, (int)strm->total_in, (int)strm->total_out);
+    if (strm->avail_in > 0) {
+        zwc->inBuffer.src = strm->next_in;
+        zwc->inBuffer.size = strm->avail_in;
+        zwc->inBuffer.pos = 0;
+        zwc->outBuffer.dst = strm->next_out;
+        zwc->outBuffer.size = strm->avail_out;
+        zwc->outBuffer.pos = 0;
+        { size_t const errorCode = ZSTD_compressStream(zwc->zbc, &zwc->outBuffer, &zwc->inBuffer);
+          LOG_WRAPPERC("deflate ZSTD_compressStream srcSize=%d dstCapacity=%d\n", (int)zwc->inBuffer.size, (int)zwc->outBuffer.size);
+          if (ZSTD_isError(errorCode)) return ZWRAPC_finishWithError(zwc, strm, 0);
+        }
+        strm->next_out += zwc->outBuffer.pos;
+        strm->total_out += zwc->outBuffer.pos;
+        strm->avail_out -= zwc->outBuffer.pos;
+        strm->total_in += zwc->inBuffer.pos;
+        zwc->totalInBytes += zwc->inBuffer.pos;
+        strm->next_in += zwc->inBuffer.pos;
+        strm->avail_in -= zwc->inBuffer.pos;
+    }
+
+    if (flush == Z_FULL_FLUSH
+#if ZLIB_VERNUM >= 0x1240
+        || flush == Z_TREES
+#endif
+        || flush == Z_BLOCK)
+        return ZWRAPC_finishWithErrorMsg(strm, "Z_FULL_FLUSH, Z_BLOCK and Z_TREES are not supported!");
+
+    if (flush == Z_FINISH) {
+        size_t bytesLeft;
+        if (zwc->streamEnd) return Z_STREAM_END;
+        zwc->outBuffer.dst = strm->next_out;
+        zwc->outBuffer.size = strm->avail_out;
+        zwc->outBuffer.pos = 0;
+        bytesLeft = ZSTD_endStream(zwc->zbc, &zwc->outBuffer);
+        LOG_WRAPPERC("deflate ZSTD_endStream dstCapacity=%d bytesLeft=%d\n", (int)strm->avail_out, (int)bytesLeft);
+        if (ZSTD_isError(bytesLeft)) return ZWRAPC_finishWithError(zwc, strm, 0);
+        strm->next_out += zwc->outBuffer.pos;
+        strm->total_out += zwc->outBuffer.pos;
+        strm->avail_out -= zwc->outBuffer.pos;
+        if (bytesLeft == 0) { zwc->streamEnd = 1; LOG_WRAPPERC("Z_STREAM_END2 strm->total_in=%d strm->avail_out=%d strm->total_out=%d\n", (int)strm->total_in, (int)strm->avail_out, (int)strm->total_out); return Z_STREAM_END; }
+    }
+    else
+    if (flush == Z_SYNC_FLUSH || flush == Z_PARTIAL_FLUSH) {
+        size_t bytesLeft;
+        zwc->outBuffer.dst = strm->next_out;
+        zwc->outBuffer.size = strm->avail_out;
+        zwc->outBuffer.pos = 0;
+        bytesLeft = ZSTD_flushStream(zwc->zbc, &zwc->outBuffer);
+        LOG_WRAPPERC("deflate ZSTD_flushStream dstCapacity=%d bytesLeft=%d\n", (int)strm->avail_out, (int)bytesLeft);
+        if (ZSTD_isError(bytesLeft)) return ZWRAPC_finishWithError(zwc, strm, 0);
+        strm->next_out += zwc->outBuffer.pos;
+        strm->total_out += zwc->outBuffer.pos;
+        strm->avail_out -= zwc->outBuffer.pos;
+    }
+    LOG_WRAPPERC("- deflate3 flush=%d avail_in=%d avail_out=%d total_in=%d total_out=%d\n", (int)flush, (int)strm->avail_in, (int)strm->avail_out, (int)strm->total_in, (int)strm->total_out);
+    return Z_OK;
+}
+
+
+ZEXTERN int ZEXPORT z_deflateEnd OF((z_streamp strm))
+{
+    if (!g_ZWRAP_useZSTDcompression) {
+        LOG_WRAPPERC("- deflateEnd\n");
+        return deflateEnd(strm);
+    }
+    LOG_WRAPPERC("- deflateEnd total_in=%d total_out=%d\n", (int)(strm->total_in), (int)(strm->total_out));
+    {   size_t errorCode;
+        ZWRAP_CCtx* zwc = (ZWRAP_CCtx*) strm->state;
+        if (zwc == NULL) return Z_OK;  /* structures are already freed */
+        strm->state = NULL;
+        errorCode = ZWRAP_freeCCtx(zwc);
+        if (ZSTD_isError(errorCode)) return Z_STREAM_ERROR;
+    }
+    return Z_OK;
+}
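+
+/* Illustrative usage sketch (hypothetical helper, never compiled): a
+   single-shot compression through the wrapper, exercising the Z_FINISH path
+   in z_deflate() above (ZSTD_endStream() once the input is consumed).  dst
+   is assumed large enough; see z_deflateBound(). */
+#ifdef ZWRAP_EXAMPLE_HELPERS
+static int example_compress_once(const Bytef *src, uInt srcLen,
+                                 Bytef *dst, uInt dstLen, uLong *written)
+{
+    z_stream strm;
+    int ret;
+    memset(&strm, 0, sizeof(strm));
+    ret = z_deflateInit_(&strm, Z_DEFAULT_COMPRESSION,
+                         ZLIB_VERSION, (int)sizeof(z_stream));
+    if (ret != Z_OK) return ret;
+    strm.next_in = src;   strm.avail_in = srcLen;
+    strm.next_out = dst;  strm.avail_out = dstLen;
+    ret = z_deflate(&strm, Z_FINISH);
+    *written = strm.total_out;
+    (void)z_deflateEnd(&strm);
+    return (ret == Z_STREAM_END) ? Z_OK : Z_STREAM_ERROR;
+}
+#endif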
+
+
+ZEXTERN uLong ZEXPORT z_deflateBound OF((z_streamp strm,
+                                       uLong sourceLen))
+{
+    if (!g_ZWRAP_useZSTDcompression)
+        return deflateBound(strm, sourceLen);
+
+    return ZSTD_compressBound(sourceLen);
+}
+
+
+ZEXTERN int ZEXPORT z_deflateParams OF((z_streamp strm,
+                                      int level,
+                                      int strategy))
+{
+    if (!g_ZWRAP_useZSTDcompression) {
+        LOG_WRAPPERC("- deflateParams level=%d strategy=%d\n", level, strategy);
+        return deflateParams(strm, level, strategy);
+    }
+
+    return Z_OK;
+}
+
+
+
+
+
+/* *** Decompression *** */
+typedef enum { ZWRAP_ZLIB_STREAM, ZWRAP_ZSTD_STREAM, ZWRAP_UNKNOWN_STREAM } ZWRAP_stream_type;
+
+typedef struct {
+    ZSTD_DStream* zbd;
+    char headerBuf[16]; /* must be at least ZSTD_frameHeaderSize_min bytes */
+    int errorCount;
+    unsigned long long totalInBytes; /* kept separately because strm->total_in can be reset by the user */
+    ZWRAP_state_t decompState;
+    ZSTD_inBuffer inBuffer;
+    ZSTD_outBuffer outBuffer;
+
+    /* zlib params */
+    int stream_size;
+    char *version;
+    int windowBits;
+    ZSTD_customMem customMem;
+    z_stream allocFunc; /* copy of zalloc, zfree, opaque */
+} ZWRAP_DCtx;
+
+
+int ZWRAP_isUsingZSTDdecompression(z_streamp strm)
+{
+    if (strm == NULL) return 0;
+    return (strm->reserved == ZWRAP_ZSTD_STREAM);
+}
+
+
+void ZWRAP_initDCtx(ZWRAP_DCtx* zwd)
+{
+    zwd->errorCount = 0;
+    zwd->outBuffer.pos = 0;
+    zwd->outBuffer.size = 0;
+}
+
+
+ZWRAP_DCtx* ZWRAP_createDCtx(z_streamp strm)
+{
+    ZWRAP_DCtx* zwd;
+
+    if (strm->zalloc && strm->zfree) {
+        zwd = (ZWRAP_DCtx*)strm->zalloc(strm->opaque, 1, sizeof(ZWRAP_DCtx));
+        if (zwd==NULL) return NULL;
+        memset(zwd, 0, sizeof(ZWRAP_DCtx));
+        memcpy(&zwd->allocFunc, strm, sizeof(z_stream));
+        { ZSTD_customMem ZWRAP_customMem = { ZWRAP_allocFunction, ZWRAP_freeFunction, &zwd->allocFunc };
+          memcpy(&zwd->customMem, &ZWRAP_customMem, sizeof(ZSTD_customMem));
+        }
+    } else {
+        zwd = (ZWRAP_DCtx*)defaultCustomMem.customAlloc(defaultCustomMem.opaque, sizeof(ZWRAP_DCtx));
+        if (zwd==NULL) return NULL;
+        memset(zwd, 0, sizeof(ZWRAP_DCtx));
+        memcpy(&zwd->customMem, &defaultCustomMem, sizeof(ZSTD_customMem));
+    }
+
+    MEM_STATIC_ASSERT(sizeof(zwd->headerBuf) >= ZSTD_FRAMEHEADERSIZE_MIN);   /* if compilation fails here, assertion is false */
+    ZWRAP_initDCtx(zwd);
+    return zwd;
+}
+
+
+size_t ZWRAP_freeDCtx(ZWRAP_DCtx* zwd)
+{
+    if (zwd==NULL) return 0;   /* support free on null */
+    if (zwd->zbd) ZSTD_freeDStream(zwd->zbd);
+    if (zwd->version) zwd->customMem.customFree(zwd->customMem.opaque, zwd->version);
+    zwd->customMem.customFree(zwd->customMem.opaque, zwd);
+    return 0;
+}
+
+
+int ZWRAPD_finishWithError(ZWRAP_DCtx* zwd, z_streamp strm, int error)
+{
+    LOG_WRAPPERD("- ZWRAPD_finishWithError=%d\n", error);
+    if (zwd) ZWRAP_freeDCtx(zwd);
+    if (strm) strm->state = NULL;
+    return (error) ? error : Z_STREAM_ERROR;
+}
+
+
+int ZWRAPD_finishWithErrorMsg(z_streamp strm, char* message)
+{
+    ZWRAP_DCtx* zwd = (ZWRAP_DCtx*) strm->state;
+    strm->msg = message;
+    if (zwd == NULL) return Z_STREAM_ERROR;
+
+    return ZWRAPD_finishWithError(zwd, strm, 0);
+}
+
+
+ZEXTERN int ZEXPORT z_inflateInit_ OF((z_streamp strm,
+                                     const char *version, int stream_size))
+{
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB) {
+        strm->reserved = ZWRAP_ZLIB_STREAM; /* mark as zlib stream */
+        return inflateInit(strm);
+    }
+
+    {
+    ZWRAP_DCtx* zwd = ZWRAP_createDCtx(strm);
+    LOG_WRAPPERD("- inflateInit\n");
+    if (zwd == NULL) return ZWRAPD_finishWithError(zwd, strm, 0);
+
+    zwd->version = zwd->customMem.customAlloc(zwd->customMem.opaque, strlen(version) + 1);
+    if (zwd->version == NULL) return ZWRAPD_finishWithError(zwd, strm, 0);
+    strcpy(zwd->version, version);
+
+    zwd->stream_size = stream_size;
+    zwd->totalInBytes = 0;
+    strm->state = (struct internal_state*) zwd; /* use the state field, which is not used by the user */
+    strm->total_in = 0;
+    strm->total_out = 0;
+    strm->reserved = ZWRAP_UNKNOWN_STREAM; /* mark as unknown stream */
+    strm->adler = 0;
+    }
+
+    return Z_OK;
+}
+
+
+ZEXTERN int ZEXPORT z_inflateInit2_ OF((z_streamp strm, int  windowBits,
+                                      const char *version, int stream_size))
+{
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB) {
+        return inflateInit2_(strm, windowBits, version, stream_size);
+    }
+
+    {
+    int ret = z_inflateInit_ (strm, version, stream_size);
+    LOG_WRAPPERD("- inflateInit2 windowBits=%d\n", windowBits);
+    if (ret == Z_OK) {
+        ZWRAP_DCtx* zwd = (ZWRAP_DCtx*)strm->state;
+        if (zwd == NULL) return Z_STREAM_ERROR;
+        zwd->windowBits = windowBits;
+    }
+    return ret;
+    }
+}
+
+int ZWRAP_inflateReset_keepDict(z_streamp strm)
+{
+    LOG_WRAPPERD("- ZWRAP_inflateReset_keepDict\n");
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved)
+        return inflateReset(strm);
+
+    {   ZWRAP_DCtx* zwd = (ZWRAP_DCtx*) strm->state;
+        if (zwd == NULL) return Z_STREAM_ERROR;
+        ZWRAP_initDCtx(zwd);
+        zwd->decompState = ZWRAP_useReset;
+        zwd->totalInBytes = 0;
+    }
+
+    strm->total_in = 0;
+    strm->total_out = 0;
+    return Z_OK;
+}
+
+
+ZEXTERN int ZEXPORT z_inflateReset OF((z_streamp strm))
+{
+    LOG_WRAPPERD("- inflateReset\n");
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved)
+        return inflateReset(strm);
+
+    { int ret = ZWRAP_inflateReset_keepDict(strm);
+      if (ret != Z_OK) return ret; }
+
+    { ZWRAP_DCtx* zwd = (ZWRAP_DCtx*) strm->state;
+      if (zwd == NULL) return Z_STREAM_ERROR;
+      zwd->decompState = ZWRAP_useInit; }
+
+    return Z_OK;
+}
+
+
+#if ZLIB_VERNUM >= 0x1240
+ZEXTERN int ZEXPORT z_inflateReset2 OF((z_streamp strm,
+                                      int windowBits))
+{
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved)
+        return inflateReset2(strm, windowBits);
+
+    {   int ret = z_inflateReset (strm);
+        if (ret == Z_OK) {
+            ZWRAP_DCtx* zwd = (ZWRAP_DCtx*)strm->state;
+            if (zwd == NULL) return Z_STREAM_ERROR;
+            zwd->windowBits = windowBits;
+        }
+        return ret;
+    }
+}
+#endif
+
+
+ZEXTERN int ZEXPORT z_inflateSetDictionary OF((z_streamp strm,
+                                             const Bytef *dictionary,
+                                             uInt  dictLength))
+{
+    LOG_WRAPPERD("- inflateSetDictionary\n");
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved)
+        return inflateSetDictionary(strm, dictionary, dictLength);
+
+    {   size_t errorCode;
+        ZWRAP_DCtx* zwd = (ZWRAP_DCtx*) strm->state;
+        if (zwd == NULL || zwd->zbd == NULL) return Z_STREAM_ERROR;
+        errorCode = ZSTD_initDStream_usingDict(zwd->zbd, dictionary, dictLength);
+        if (ZSTD_isError(errorCode)) return ZWRAPD_finishWithError(zwd, strm, 0);
+        zwd->decompState = ZWRAP_useReset;
+
+        if (zwd->totalInBytes == ZSTD_HEADERSIZE) {
+            zwd->inBuffer.src = zwd->headerBuf;
+            zwd->inBuffer.size = zwd->totalInBytes;
+            zwd->inBuffer.pos = 0;
+            zwd->outBuffer.dst = strm->next_out;
+            zwd->outBuffer.size = 0;
+            zwd->outBuffer.pos = 0;
+            errorCode = ZSTD_decompressStream(zwd->zbd, &zwd->outBuffer, &zwd->inBuffer);
+            LOG_WRAPPERD("inflateSetDictionary ZSTD_decompressStream errorCode=%d srcSize=%d dstCapacity=%d\n", (int)errorCode, (int)zwd->inBuffer.size, (int)zwd->outBuffer.size);
+            if (zwd->inBuffer.pos < zwd->inBuffer.size || ZSTD_isError(errorCode)) {
+                LOG_WRAPPERD("ERROR: ZSTD_decompressStream %s\n", ZSTD_getErrorName(errorCode));
+                return ZWRAPD_finishWithError(zwd, strm, 0);
+            }
+        }
+    }
+
+    return Z_OK;
+}
+
+
+ZEXTERN int ZEXPORT z_inflate OF((z_streamp strm, int flush))
+{
+    ZWRAP_DCtx* zwd;
+    int res;
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved) {
+        LOG_WRAPPERD("- inflate1 flush=%d avail_in=%d avail_out=%d total_in=%d total_out=%d\n", (int)flush, (int)strm->avail_in, (int)strm->avail_out, (int)strm->total_in, (int)strm->total_out);
+        res = inflate(strm, flush);
+        LOG_WRAPPERD("- inflate2 flush=%d avail_in=%d avail_out=%d total_in=%d total_out=%d res=%d\n", (int)flush, (int)strm->avail_in, (int)strm->avail_out, (int)strm->total_in, (int)strm->total_out, res);
+        return res;
+    }
+
+    if (strm->avail_in <= 0) return Z_OK;
+
+    {   size_t errorCode, srcSize;
+        zwd = (ZWRAP_DCtx*) strm->state;
+        LOG_WRAPPERD("- inflate1 flush=%d avail_in=%d avail_out=%d total_in=%d total_out=%d\n", (int)flush, (int)strm->avail_in, (int)strm->avail_out, (int)strm->total_in, (int)strm->total_out);
+
+        if (zwd == NULL) return Z_STREAM_ERROR;
+        if (zwd->decompState == ZWRAP_streamEnd) return Z_STREAM_END;
+
+        if (zwd->totalInBytes < ZLIB_HEADERSIZE) {
+            if (zwd->totalInBytes == 0 && strm->avail_in >= ZLIB_HEADERSIZE) {
+                if (MEM_readLE32(strm->next_in) != ZSTD_MAGICNUMBER) {
+                    if (zwd->windowBits)
+                        errorCode = inflateInit2_(strm, zwd->windowBits, zwd->version, zwd->stream_size);
+                    else
+                        errorCode = inflateInit_(strm, zwd->version, zwd->stream_size);
+                    if (errorCode != Z_OK) return ZWRAPD_finishWithError(zwd, strm, (int)errorCode);
+
+                    strm->reserved = ZWRAP_ZLIB_STREAM; /* mark as zlib stream */
+                    errorCode = ZWRAP_freeDCtx(zwd);
+                    if (ZSTD_isError(errorCode)) goto error;
+
+                    if (flush == Z_INFLATE_SYNC) res = inflateSync(strm);
+                    else res = inflate(strm, flush);
+                    LOG_WRAPPERD("- inflate3 flush=%d avail_in=%d avail_out=%d total_in=%d total_out=%d res=%d\n", (int)flush, (int)strm->avail_in, (int)strm->avail_out, (int)strm->total_in, (int)strm->total_out, res);
+                    return res;
+                }
+            } else {
+                srcSize = MIN(strm->avail_in, ZLIB_HEADERSIZE - zwd->totalInBytes);
+                memcpy(zwd->headerBuf+zwd->totalInBytes, strm->next_in, srcSize);
+                strm->total_in += srcSize;
+                zwd->totalInBytes += srcSize;
+                strm->next_in += srcSize;
+                strm->avail_in -= srcSize;
+                if (zwd->totalInBytes < ZLIB_HEADERSIZE) return Z_OK;
+
+                if (MEM_readLE32(zwd->headerBuf) != ZSTD_MAGICNUMBER) {
+                    z_stream strm2;
+                    strm2.next_in = strm->next_in;
+                    strm2.avail_in = strm->avail_in;
+                    strm2.next_out = strm->next_out;
+                    strm2.avail_out = strm->avail_out;
+
+                    if (zwd->windowBits)
+                        errorCode = inflateInit2_(strm, zwd->windowBits, zwd->version, zwd->stream_size);
+                    else
+                        errorCode = inflateInit_(strm, zwd->version, zwd->stream_size);
+                    LOG_WRAPPERD("ZLIB inflateInit errorCode=%d\n", (int)errorCode);
+                    if (errorCode != Z_OK) return ZWRAPD_finishWithError(zwd, strm, (int)errorCode);
+
+                    /* inflate header */
+                    strm->next_in = (unsigned char*)zwd->headerBuf;
+                    strm->avail_in = ZLIB_HEADERSIZE;
+                    strm->avail_out = 0;
+                    errorCode = inflate(strm, Z_NO_FLUSH);
+                    LOG_WRAPPERD("ZLIB inflate errorCode=%d strm->avail_in=%d\n", (int)errorCode, (int)strm->avail_in);
+                    if (errorCode != Z_OK) return ZWRAPD_finishWithError(zwd, strm, (int)errorCode);
+                    if (strm->avail_in > 0) goto error;
+
+                    strm->next_in = strm2.next_in;
+                    strm->avail_in = strm2.avail_in;
+                    strm->next_out = strm2.next_out;
+                    strm->avail_out = strm2.avail_out;
+
+                    strm->reserved = ZWRAP_ZLIB_STREAM; /* mark as zlib stream */
+                    errorCode = ZWRAP_freeDCtx(zwd);
+                    if (ZSTD_isError(errorCode)) goto error;
+
+                    if (flush == Z_INFLATE_SYNC) res = inflateSync(strm);
+                    else res = inflate(strm, flush);
+                    LOG_WRAPPERD("- inflate2 flush=%d avail_in=%d avail_out=%d total_in=%d total_out=%d res=%d\n", (int)flush, (int)strm->avail_in, (int)strm->avail_out, (int)strm->total_in, (int)strm->total_out, res);
+                    return res;
+                }
+            }
+        }
+
+        strm->reserved = ZWRAP_ZSTD_STREAM; /* mark as zstd stream */
+
+        if (flush == Z_INFLATE_SYNC) { strm->msg = "inflateSync is not supported!"; goto error; }
+
+        if (!zwd->zbd) {
+            zwd->zbd = ZSTD_createDStream_advanced(zwd->customMem);
+            if (zwd->zbd == NULL) { LOG_WRAPPERD("ERROR: ZSTD_createDStream_advanced\n"); goto error; }
+            zwd->decompState = ZWRAP_useInit;
+        }
+
+        if (zwd->totalInBytes < ZSTD_HEADERSIZE)
+        {
+            if (zwd->totalInBytes == 0 && strm->avail_in >= ZSTD_HEADERSIZE) {
+                if (zwd->decompState == ZWRAP_useInit) {
+                    errorCode = ZSTD_initDStream(zwd->zbd);
+                    if (ZSTD_isError(errorCode)) { LOG_WRAPPERD("ERROR: ZSTD_initDStream errorCode=%s\n", ZSTD_getErrorName(errorCode)); goto error; }
+                } else {
+                    errorCode = ZSTD_resetDStream(zwd->zbd);
+                    if (ZSTD_isError(errorCode)) goto error;
+                }
+            } else {
+                srcSize = MIN(strm->avail_in, ZSTD_HEADERSIZE - zwd->totalInBytes);
+                memcpy(zwd->headerBuf+zwd->totalInBytes, strm->next_in, srcSize);
+                strm->total_in += srcSize;
+                zwd->totalInBytes += srcSize;
+                strm->next_in += srcSize;
+                strm->avail_in -= srcSize;
+                if (zwd->totalInBytes < ZSTD_HEADERSIZE) return Z_OK;
+
+                if (zwd->decompState == ZWRAP_useInit) {
+                    errorCode = ZSTD_initDStream(zwd->zbd);
+                    if (ZSTD_isError(errorCode)) { LOG_WRAPPERD("ERROR: ZSTD_initDStream errorCode=%s\n", ZSTD_getErrorName(errorCode)); goto error; }
+                } else {
+                    errorCode = ZSTD_resetDStream(zwd->zbd);
+                    if (ZSTD_isError(errorCode)) goto error;
+                }
+
+                zwd->inBuffer.src = zwd->headerBuf;
+                zwd->inBuffer.size = ZSTD_HEADERSIZE;
+                zwd->inBuffer.pos = 0;
+                zwd->outBuffer.dst = strm->next_out;
+                zwd->outBuffer.size = 0;
+                zwd->outBuffer.pos = 0;
+                errorCode = ZSTD_decompressStream(zwd->zbd, &zwd->outBuffer, &zwd->inBuffer);
+                LOG_WRAPPERD("inflate ZSTD_decompressStream1 errorCode=%d srcSize=%d dstCapacity=%d\n", (int)errorCode, (int)zwd->inBuffer.size, (int)zwd->outBuffer.size);
+                if (ZSTD_isError(errorCode)) {
+                    LOG_WRAPPERD("ERROR: ZSTD_decompressStream1 %s\n", ZSTD_getErrorName(errorCode));
+                    goto error;
+                }
+                if (zwd->inBuffer.pos != zwd->inBuffer.size) goto error; /* not consumed */
+            }
+        }
+
+        zwd->inBuffer.src = strm->next_in;
+        zwd->inBuffer.size = strm->avail_in;
+        zwd->inBuffer.pos = 0;
+        zwd->outBuffer.dst = strm->next_out;
+        zwd->outBuffer.size = strm->avail_out;
+        zwd->outBuffer.pos = 0;
+        errorCode = ZSTD_decompressStream(zwd->zbd, &zwd->outBuffer, &zwd->inBuffer);
+        LOG_WRAPPERD("inflate ZSTD_decompressStream2 errorCode=%d srcSize=%d dstCapacity=%d\n", (int)errorCode, (int)strm->avail_in, (int)strm->avail_out);
+        if (ZSTD_isError(errorCode)) {
+            zwd->errorCount++;
+            LOG_WRAPPERD("ERROR: ZSTD_decompressStream2 %s zwd->errorCount=%d\n", ZSTD_getErrorName(errorCode), zwd->errorCount);
+            if (zwd->errorCount<=1) return Z_NEED_DICT; else goto error;
+        }
+        LOG_WRAPPERD("inflate inBuffer.pos=%d inBuffer.size=%d outBuffer.pos=%d outBuffer.size=%d o\n", (int)zwd->inBuffer.pos, (int)zwd->inBuffer.size, (int)zwd->outBuffer.pos, (int)zwd->outBuffer.size);
+        strm->next_out += zwd->outBuffer.pos;
+        strm->total_out += zwd->outBuffer.pos;
+        strm->avail_out -= zwd->outBuffer.pos;
+        strm->total_in += zwd->inBuffer.pos;
+        zwd->totalInBytes += zwd->inBuffer.pos;
+        strm->next_in += zwd->inBuffer.pos;
+        strm->avail_in -= zwd->inBuffer.pos;
+        if (errorCode == 0) {
+            LOG_WRAPPERD("inflate Z_STREAM_END1 avail_in=%d avail_out=%d total_in=%d total_out=%d\n", (int)strm->avail_in, (int)strm->avail_out, (int)strm->total_in, (int)strm->total_out);
+            zwd->decompState = ZWRAP_streamEnd;
+            return Z_STREAM_END;
+        }
+    }
+    LOG_WRAPPERD("- inflate2 flush=%d avail_in=%d avail_out=%d total_in=%d total_out=%d res=%d\n", (int)flush, (int)strm->avail_in, (int)strm->avail_out, (int)strm->total_in, (int)strm->total_out, Z_OK);
+    return Z_OK;
+
+error:
+    return ZWRAPD_finishWithError(zwd, strm, 0);
+}
+
+
+ZEXTERN int ZEXPORT z_inflateEnd OF((z_streamp strm))
+{
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved)
+        return inflateEnd(strm);
+
+    LOG_WRAPPERD("- inflateEnd total_in=%d total_out=%d\n", (int)(strm->total_in), (int)(strm->total_out));
+    {   size_t errorCode;
+        ZWRAP_DCtx* zwd = (ZWRAP_DCtx*) strm->state;
+        if (zwd == NULL) return Z_OK;  /* structures are already freed */
+        strm->state = NULL;
+        errorCode = ZWRAP_freeDCtx(zwd);
+        if (ZSTD_isError(errorCode)) return Z_STREAM_ERROR;
+    }
+    return Z_OK;
+}
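+
+/* Illustrative usage sketch (hypothetical helper, never compiled): because
+   z_inflate() auto-detects the zstd magic number in the first bytes of the
+   stream, one loop decodes both zlib- and zstd-encoded input.  dst is
+   assumed large enough for the whole output. */
+#ifdef ZWRAP_EXAMPLE_HELPERS
+static int example_decompress_once(const Bytef *src, uInt srcLen,
+                                   Bytef *dst, uInt dstLen, uLong *written)
+{
+    z_stream strm;
+    int ret;
+    memset(&strm, 0, sizeof(strm));
+    ret = z_inflateInit_(&strm, ZLIB_VERSION, (int)sizeof(z_stream));
+    if (ret != Z_OK) return ret;
+    strm.next_in = src;   strm.avail_in = srcLen;
+    strm.next_out = dst;  strm.avail_out = dstLen;
+    ret = z_inflate(&strm, Z_NO_FLUSH);  /* wrapper picks zlib or zstd here */
+    *written = strm.total_out;
+    (void)z_inflateEnd(&strm);
+    return (ret == Z_STREAM_END) ? Z_OK : Z_STREAM_ERROR;
+}
+#endif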
+
+
+ZEXTERN int ZEXPORT z_inflateSync OF((z_streamp strm))
+{
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved) {
+        return inflateSync(strm);
+    }
+
+    return z_inflate(strm, Z_INFLATE_SYNC);
+}
+
+
+
+
+
+/* Advanced compression functions */
+ZEXTERN int ZEXPORT z_deflateCopy OF((z_streamp dest,
+                                    z_streamp source))
+{
+    if (!g_ZWRAP_useZSTDcompression)
+        return deflateCopy(dest, source);
+    return ZWRAPC_finishWithErrorMsg(source, "deflateCopy is not supported!");
+}
+
+
+ZEXTERN int ZEXPORT z_deflateTune OF((z_streamp strm,
+                                    int good_length,
+                                    int max_lazy,
+                                    int nice_length,
+                                    int max_chain))
+{
+    if (!g_ZWRAP_useZSTDcompression)
+        return deflateTune(strm, good_length, max_lazy, nice_length, max_chain);
+    return ZWRAPC_finishWithErrorMsg(strm, "deflateTune is not supported!");
+}
+
+
+#if ZLIB_VERNUM >= 0x1260
+ZEXTERN int ZEXPORT z_deflatePending OF((z_streamp strm,
+                                       unsigned *pending,
+                                       int *bits))
+{
+    if (!g_ZWRAP_useZSTDcompression)
+        return deflatePending(strm, pending, bits);
+    return ZWRAPC_finishWithErrorMsg(strm, "deflatePending is not supported!");
+}
+#endif
+
+
+ZEXTERN int ZEXPORT z_deflatePrime OF((z_streamp strm,
+                                     int bits,
+                                     int value))
+{
+    if (!g_ZWRAP_useZSTDcompression)
+        return deflatePrime(strm, bits, value);
+    return ZWRAPC_finishWithErrorMsg(strm, "deflatePrime is not supported!");
+}
+
+
+ZEXTERN int ZEXPORT z_deflateSetHeader OF((z_streamp strm,
+                                         gz_headerp head))
+{
+    if (!g_ZWRAP_useZSTDcompression)
+        return deflateSetHeader(strm, head);
+    return ZWRAPC_finishWithErrorMsg(strm, "deflateSetHeader is not supported!");
+}
+
+
+
+
+/* Advanced decompression functions */
+#if ZLIB_VERNUM >= 0x1280
+ZEXTERN int ZEXPORT z_inflateGetDictionary OF((z_streamp strm,
+                                             Bytef *dictionary,
+                                             uInt  *dictLength))
+{
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved)
+        return inflateGetDictionary(strm, dictionary, dictLength);
+    return ZWRAPD_finishWithErrorMsg(strm, "inflateGetDictionary is not supported!");
+}
+#endif
+
+
+ZEXTERN int ZEXPORT z_inflateCopy OF((z_streamp dest,
+                                    z_streamp source))
+{
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !source->reserved)
+        return inflateCopy(dest, source);
+    return ZWRAPD_finishWithErrorMsg(source, "inflateCopy is not supported!");
+}
+
+
+#if ZLIB_VERNUM >= 0x1240
+ZEXTERN long ZEXPORT z_inflateMark OF((z_streamp strm))
+{
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved)
+        return inflateMark(strm);
+    return ZWRAPD_finishWithErrorMsg(strm, "inflateMark is not supported!");
+}
+#endif
+
+
+ZEXTERN int ZEXPORT z_inflatePrime OF((z_streamp strm,
+                                     int bits,
+                                     int value))
+{
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved)
+        return inflatePrime(strm, bits, value);
+    return ZWRAPD_finishWithErrorMsg(strm, "inflatePrime is not supported!");
+}
+
+
+ZEXTERN int ZEXPORT z_inflateGetHeader OF((z_streamp strm,
+                                         gz_headerp head))
+{
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved)
+        return inflateGetHeader(strm, head);
+    return ZWRAPD_finishWithErrorMsg(strm, "inflateGetHeader is not supported!");
+}
+
+
+ZEXTERN int ZEXPORT z_inflateBackInit_ OF((z_streamp strm, int windowBits,
+                                         unsigned char FAR *window,
+                                         const char *version,
+                                         int stream_size))
+{
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved)
+        return inflateBackInit_(strm, windowBits, window, version, stream_size);
+    return ZWRAPD_finishWithErrorMsg(strm, "inflateBackInit is not supported!");
+}
+
+
+ZEXTERN int ZEXPORT z_inflateBack OF((z_streamp strm,
+                                    in_func in, void FAR *in_desc,
+                                    out_func out, void FAR *out_desc))
+{
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved)
+        return inflateBack(strm, in, in_desc, out, out_desc);
+    return ZWRAPD_finishWithErrorMsg(strm, "inflateBack is not supported!");
+}
+
+
+ZEXTERN int ZEXPORT z_inflateBackEnd OF((z_streamp strm))
+{
+    if (g_ZWRAPdecompressionType == ZWRAP_FORCE_ZLIB || !strm->reserved)
+        return inflateBackEnd(strm);
+    return ZWRAPD_finishWithErrorMsg(strm, "inflateBackEnd is not supported!");
+}
+
+
+ZEXTERN uLong ZEXPORT z_zlibCompileFlags OF((void)) { return zlibCompileFlags(); }
+
+
+
+                        /* utility functions */
+#ifndef Z_SOLO
+
+ZEXTERN int ZEXPORT z_compress OF((Bytef *dest,   uLongf *destLen,
+                                 const Bytef *source, uLong sourceLen))
+{
+    if (!g_ZWRAP_useZSTDcompression)
+        return compress(dest, destLen, source, sourceLen);
+
+    { size_t dstCapacity = *destLen;
+      size_t const errorCode = ZSTD_compress(dest, dstCapacity, source, sourceLen, ZWRAP_DEFAULT_CLEVEL);
+      LOG_WRAPPERD("z_compress sourceLen=%d dstCapacity=%d\n", (int)sourceLen, (int)dstCapacity);
+      if (ZSTD_isError(errorCode)) return Z_STREAM_ERROR;
+      *destLen = errorCode;
+    }
+    return Z_OK;
+}
+
+
+ZEXTERN int ZEXPORT z_compress2 OF((Bytef *dest,   uLongf *destLen,
+                                  const Bytef *source, uLong sourceLen,
+                                  int level))
+{
+    if (!g_ZWRAP_useZSTDcompression)
+        return compress2(dest, destLen, source, sourceLen, level);
+
+    { size_t dstCapacity = *destLen;
+      size_t const errorCode = ZSTD_compress(dest, dstCapacity, source, sourceLen, level);
+      if (ZSTD_isError(errorCode)) return Z_STREAM_ERROR;
+      *destLen = errorCode;
+    }
+    return Z_OK;
+}
+
+
+ZEXTERN uLong ZEXPORT z_compressBound OF((uLong sourceLen))
+{
+    if (!g_ZWRAP_useZSTDcompression)
+        return compressBound(sourceLen);
+
+    return ZSTD_compressBound(sourceLen);
+}
+
+
+ZEXTERN int ZEXPORT z_uncompress OF((Bytef *dest,   uLongf *destLen,
+                                   const Bytef *source, uLong sourceLen))
+{
+    if (sourceLen < 4 || MEM_readLE32(source) != ZSTD_MAGICNUMBER)
+        return uncompress(dest, destLen, source, sourceLen);
+
+    { size_t dstCapacity = *destLen;
+      size_t const errorCode = ZSTD_decompress(dest, dstCapacity, source, sourceLen);
+      if (ZSTD_isError(errorCode)) return Z_STREAM_ERROR;
+      *destLen = errorCode;
+     }
+    return Z_OK;
+}
+
+#endif /* !Z_SOLO */
+
+
+                        /* checksum functions */
+
+ZEXTERN uLong ZEXPORT z_adler32 OF((uLong adler, const Bytef *buf, uInt len))
+{
+    return adler32(adler, buf, len);
+}
+
+ZEXTERN uLong ZEXPORT z_crc32   OF((uLong crc, const Bytef *buf, uInt len))
+{
+    return crc32(crc, buf, len);
+}
+
+
+#if ZLIB_VERNUM >= 0x12B0
+ZEXTERN uLong ZEXPORT z_adler32_z OF((uLong adler, const Bytef *buf, z_size_t len))
+{
+    return adler32_z(adler, buf, len);
+}
+
+ZEXTERN uLong ZEXPORT z_crc32_z OF((uLong crc, const Bytef *buf, z_size_t len))
+{
+    return crc32_z(crc, buf, len);
+}
+#endif
+
+
+#if ZLIB_VERNUM >= 0x1270
+ZEXTERN const z_crc_t FAR * ZEXPORT z_get_crc_table    OF((void))
+{
+    return get_crc_table();
+}
+#endif
diff --git a/zstd/zlibWrapper/zstd_zlibwrapper.h b/zstd/zlibWrapper/zstd_zlibwrapper.h
new file mode 100644
index 0000000..647a7b2
--- /dev/null
+++ b/zstd/zlibWrapper/zstd_zlibwrapper.h
@@ -0,0 +1,91 @@
+/**
+ * Copyright (c) 2016-present, Przemyslaw Skibinski, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+
+#ifndef ZSTD_ZLIBWRAPPER_H
+#define ZSTD_ZLIBWRAPPER_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+#define ZLIB_CONST
+#define Z_PREFIX
+#define ZLIB_INTERNAL   /* disables gz*64 functions but fixes zlib 1.2.4 with Z_PREFIX */
+#ifdef STATIC_ZLIB
+  #include "../../../zlib-1.2.11/zlib.h"
+#else
+  #include <zlib.h>
+#endif
+
+#if !defined(z_const)
+    #define z_const
+#endif
+
+
+/* returns a string with the version of the zstd library */
+const char * zstdVersion(void);
+
+
+/*** COMPRESSION ***/
+/* ZWRAP_useZSTDcompression() enables/disables zstd compression at runtime.
+   By default zstd compression is disabled. To enable it, use one of these methods:
+   - compile with the additional option -DZWRAP_USE_ZSTD=1
+   - '#define ZWRAP_USE_ZSTD 1' in source code before '#include "zstd_zlibwrapper.h"'
+   - call ZWRAP_useZSTDcompression(1)
+   Each of these methods enables zstd compression for all threads.
+   Be aware that ZWRAP_useZSTDcompression() is not thread-safe and may lead to a race condition. */
+void ZWRAP_useZSTDcompression(int turn_on);
+
+/* checks if zstd compression is turned on */
+int ZWRAP_isUsingZSTDcompression(void);
+
+/* Changes the pledged source size for a given compression stream.
+   It adjusts the ZSTD compression parameters, which may improve compression speed and/or ratio.
+   The function should be called just after deflateInit() or deflateReset() and before deflate() or deflateSetDictionary().
+   It's only helpful when data is compressed in blocks.
+   There is no change when deflateInit() or deflateReset() is immediately followed by deflate(strm, Z_FINISH),
+   as this case is detected automatically. */
+int ZWRAP_setPledgedSrcSize(z_streamp strm, unsigned long long pledgedSrcSize);
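+
+/* Illustrative call sequence (a sketch, not additional API; the 64 KB block
+   size is just an example):
+     deflateInit(&strm, Z_DEFAULT_COMPRESSION);
+     ZWRAP_setPledgedSrcSize(&strm, 65536);    <- next block is 64 KB
+     ... deflate(&strm, Z_NO_FLUSH) ... deflate(&strm, Z_FINISH) ...
+*/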
+
+/* Similar to deflateReset but preserves the dictionary set with deflateSetDictionary.
+   It should improve compression speed because there will be fewer calls to deflateSetDictionary().
+   When using zlib compression this method redirects to deflateReset. */
+int ZWRAP_deflateReset_keepDict(z_streamp strm);
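+
+/* Illustrative block-compression loop (a sketch under the assumptions above;
+   dict/dictLen and the block loop are hypothetical):
+     deflateSetDictionary(&strm, dict, dictLen);
+     for each block:
+         deflate(&strm, Z_FINISH);            <- one frame per block
+         ZWRAP_deflateReset_keepDict(&strm);  <- dictionary is preserved
+*/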
+
+
+
+/*** DECOMPRESSION ***/
+typedef enum { ZWRAP_FORCE_ZLIB, ZWRAP_AUTO } ZWRAP_decompress_type;
+
+/* ZWRAP_setDecompressionType() enables/disables automatic recognition of zstd/zlib compressed data at runtime.
+   By default auto-detection of zstd and zlib streams is enabled (ZWRAP_AUTO).
+   Forcing zlib decompression with ZWRAP_setDecompressionType(ZWRAP_FORCE_ZLIB) slightly improves
+   decompression speed of zlib-encoded streams.
+   Be aware that ZWRAP_setDecompressionType() is not thread-safe and may lead to a race condition. */
+void ZWRAP_setDecompressionType(ZWRAP_decompress_type type);
+
+/* checks zstd decompression type */
+ZWRAP_decompress_type ZWRAP_getDecompressionType(void);
+
+/* Checks if zstd decompression is used for a given stream.
+   It returns 1 only after inflate() has been called and a zstd header was detected. */
+int ZWRAP_isUsingZSTDdecompression(z_streamp strm);
+
+/* Similar to inflateReset but preserves the dictionary set with inflateSetDictionary.
+   inflate() will return Z_NEED_DICT only the first time, which improves decompression speed.
+   For zlib streams this method redirects to inflateReset. */
+int ZWRAP_inflateReset_keepDict(z_streamp strm);
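+
+/* Illustrative decompression counterpart (a sketch; dict/dictLen are
+   hypothetical):
+     inflateInit(&strm);
+     for each frame:
+         if (inflate(&strm, Z_NO_FLUSH) == Z_NEED_DICT)
+             inflateSetDictionary(&strm, dict, dictLen);
+         ZWRAP_inflateReset_keepDict(&strm);  <- dictionary is preserved
+*/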
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_ZLIBWRAPPER_H */
diff --git a/zstd/zlibWrapper/zstd_zlibwrapper.o b/zstd/zlibWrapper/zstd_zlibwrapper.o
new file mode 100644
index 0000000..4d7dbac
Binary files /dev/null and b/zstd/zlibWrapper/zstd_zlibwrapper.o differ
diff --git a/zstd/zlibWrapper/zwrapbench b/zstd/zlibWrapper/zwrapbench
new file mode 100755
index 0000000..6529817
Binary files /dev/null and b/zstd/zlibWrapper/zwrapbench differ

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/plink2.git


