[med-svn] [asmlib] 01/02: Imported Upstream version 0.1
Jorge Soares
jssoares-guest at moszumanska.debian.org
Tue Nov 4 10:54:09 UTC 2014
This is an automated email from the git hooks/post-receive script.
jssoares-guest pushed a commit to branch master
in repository asmlib.
commit b012ac3d8f1b55857bf74364b00d79e57261ea84
Author: Jorge Soares <j.s.soares at gmail.com>
Date: Tue Nov 4 11:21:52 2014 +0100
Imported Upstream version 0.1
---
LICENSE | 675 +++++++++++++++++
README.md | 5 +
asmlibSrc/MakeAsmlib.bat | 22 +
asmlibSrc/asmlib.make | 252 +++++++
asmlibSrc/cachesize32.asm | 335 +++++++++
asmlibSrc/cachesize64.asm | 333 +++++++++
asmlibSrc/cpuid32.asm | 38 +
asmlibSrc/cpuid64.asm | 53 ++
asmlibSrc/cputype32.asm | 139 ++++
asmlibSrc/cputype64.asm | 125 ++++
asmlibSrc/debugbreak32.asm | 31 +
asmlibSrc/debugbreak64.asm | 31 +
asmlibSrc/dispatchpatch32.asm | 311 ++++++++
asmlibSrc/dispatchpatch64.asm | 328 +++++++++
asmlibSrc/divfixedi32.asm | 152 ++++
asmlibSrc/divfixedi64.asm | 171 +++++
asmlibSrc/divfixedv32.asm | 490 +++++++++++++
asmlibSrc/divfixedv64.asm | 496 +++++++++++++
asmlibSrc/instrset32.asm | 244 +++++++
asmlibSrc/instrset64.asm | 173 +++++
asmlibSrc/libad32.asm | 14 +
asmlibSrc/libad32.def | 44 ++
asmlibSrc/libad64.asm | 13 +
asmlibSrc/libad64.def | 42 ++
asmlibSrc/memcmp32.asm | 366 ++++++++++
asmlibSrc/memcmp64.asm | 293 ++++++++
asmlibSrc/memcpy32.asm | 1460 +++++++++++++++++++++++++++++++++++++
asmlibSrc/memcpy64.asm | 1313 +++++++++++++++++++++++++++++++++
asmlibSrc/memmove32.asm | 1238 +++++++++++++++++++++++++++++++
asmlibSrc/memmove64.asm | 1073 +++++++++++++++++++++++++++
asmlibSrc/memset32.asm | 487 +++++++++++++
asmlibSrc/memset64.asm | 368 ++++++++++
asmlibSrc/mersenne32.asm | 821 +++++++++++++++++++++
asmlibSrc/mersenne64.asm | 614 ++++++++++++++++
asmlibSrc/mother32.asm | 370 ++++++++++
asmlibSrc/mother64.asm | 250 +++++++
asmlibSrc/physseed32.asm | 334 +++++++++
asmlibSrc/physseed64.asm | 394 ++++++++++
asmlibSrc/popcount32.asm | 137 ++++
asmlibSrc/popcount64.asm | 110 +++
asmlibSrc/procname32.asm | 186 +++++
asmlibSrc/procname64.asm | 143 ++++
asmlibSrc/randomah.asi | 290 ++++++++
asmlibSrc/rdtsc32.asm | 51 ++
asmlibSrc/rdtsc64.asm | 51 ++
asmlibSrc/round32.asm | 41 ++
asmlibSrc/round64.asm | 38 +
asmlibSrc/sfmt32.asm | 1265 ++++++++++++++++++++++++++++++++
asmlibSrc/sfmt64.asm | 908 +++++++++++++++++++++++
asmlibSrc/strcat32.asm | 60 ++
asmlibSrc/strcat64.asm | 68 ++
asmlibSrc/strcmp32.asm | 177 +++++
asmlibSrc/strcmp64.asm | 162 ++++
asmlibSrc/strcountset32.asm | 194 +++++
asmlibSrc/strcountset64.asm | 175 +++++
asmlibSrc/strcountutf832.asm | 162 ++++
asmlibSrc/strcountutf864.asm | 127 ++++
asmlibSrc/strcpy32.asm | 53 ++
asmlibSrc/strcpy64.asm | 64 ++
asmlibSrc/stricmp32.asm | 70 ++
asmlibSrc/stricmp64.asm | 84 +++
asmlibSrc/strlen32.asm | 182 +++++
asmlibSrc/strlen64.asm | 84 +++
asmlibSrc/strspn32.asm | 338 +++++++++
asmlibSrc/strspn64.asm | 304 ++++++++
asmlibSrc/strstr32.asm | 251 +++++++
asmlibSrc/strstr64.asm | 218 ++++++
asmlibSrc/strtouplow32.asm | 285 ++++++++
asmlibSrc/strtouplow64.asm | 213 ++++++
asmlibSrc/substring32.asm | 61 ++
asmlibSrc/substring64.asm | 73 ++
asmlibSrc/testalib.cpp | 151 ++++
asmlibSrc/testmem.cpp | 396 ++++++++++
asmlibSrc/testrandom.cpp | 130 ++++
asmlibSrc/unalignedisfaster32.asm | 178 +++++
asmlibSrc/unalignedisfaster64.asm | 186 +++++
76 files changed, 21564 insertions(+)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6b156fe
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,675 @@
+GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ {one line to give the program's name and a brief idea of what it does.}
+ Copyright (C) {year} {name of author}
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ {project} Copyright (C) {year} {fullname}
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a90beb8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+asmlib
+======
+
+This is a library of optimized subroutines coded in assembly language. The functions in this library can be called from C, C++ and other compiled high-level languages. It supports many different compilers under Windows, Linux, BSD and Mac OS X operating systems, in both 32-bit and 64-bit mode. This library contains faster versions of common C/C++ memory and string functions, fast functions for string search and string parsing, fast integer division and integer vector division, as well as several useful fun [...]
+
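As a quick illustration of the calling convention described above, the sketch below shows how the library might be used from C++. It is only a sketch: it assumes the A_-prefixed prototypes (A_memcpy, A_strlen) and DataCacheSize that the asmlib documentation describes in asmlib.h; that header ships with the binary package and is not part of this source import.

    // Usage sketch only -- assumes asmlib.h declares A_memcpy, A_strlen and
    // DataCacheSize as documented; link against one of the lib/liba* libraries.
    #include <cstdio>
    #include "asmlib.h"

    int main() {
        char src[] = "hello, asmlib";
        char dst[sizeof(src)];
        A_memcpy(dst, src, sizeof(src));   // optimized drop-in for memcpy
        std::printf("copied %zu bytes: %s\n", A_strlen(dst), dst);
        std::printf("L1 data cache: %zu bytes\n", DataCacheSize(1));
        return 0;
    }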
diff --git a/asmlibSrc/MakeAsmlib.bat b/asmlibSrc/MakeAsmlib.bat
new file mode 100755
index 0000000..95ca00f
--- /dev/null
+++ b/asmlibSrc/MakeAsmlib.bat
@@ -0,0 +1,22 @@
+rem MakeAsmlib.bat 2011-07-01 Agner Fog
+
+rem Make function library from assembly source with multiple
+rem versions for different operating systems using objconv.
+
+
+rem Set path to assembler and objconv:
+rem You need to modify this path to fit your installation
+rem set path=C:\Program Files\Microsoft Visual Studio 9.0\VC\bin;C:\Program Files\Microsoft Visual Studio 9.0\Common7\IDE;C:\Program Files\Microsoft Visual Studio 9.0\VC\bin\x86_amd64;E:\Program Files\Microsoft SDKs\Windows\v6.1\Bin\x64;%path%
+
+rem Path to nmake:
+set mspath=C:\Program Files (x86)\Microsoft Visual Studio 11.0
+
+set path=%mspath%\VC\bin;%mspath%\Common7\IDE;%mspath%\VC\bin\amd64;%path%
+
+
+rem Make everything according to makefile asmlib.make
+nmake /Fasmlib.make
+
+wzzip asmlibbak.zip asmlib.zip asmlib-instructions.doc *.cpp
+
+pause
\ No newline at end of file
diff --git a/asmlibSrc/asmlib.make b/asmlibSrc/asmlib.make
new file mode 100755
index 0000000..a083f8c
--- /dev/null
+++ b/asmlibSrc/asmlib.make
@@ -0,0 +1,252 @@
+# ASMLIB.MAKE 2013-09-11 Agner Fog
+
+# Makefile for ASMLIB function library, YASM version
+# See asmlib-instructions.doc for a description
+
+# The following tools are required for building this library package:
+# Microsoft nmake or other make utility
+# Microsoft link
+# YASM assembler yasm.exe
+# (Works with NASM assembler as well, except for position-independent versions)
+# Object file converter objconv.exe (www.agner.org/optimize)
+# Winzip command line version (www.winzip.com) or other zip utility
+
+libpath64="C:\Program Files\Microsoft Visual Studio 9.0\VC\lib\amd64"
+
+# Main target is zip file
+# Using wzzip, which is the command line version of Winzip (www.winzip.com)
+asmlib.zip: lib/libacof32.lib lib/libacof32o.lib lib/libacof64.lib lib/libacof64o.lib \
+lib/libaomf32.lib lib/libaomf32o.lib \
+lib/libaelf32.a lib/libaelf32o.a lib/libaelf32p.a lib/libaelf32op.a \
+lib/libaelf64.a lib/libaelf64o.a \
+lib/libamac32.a lib/libamac32o.a lib/libamac32p.a lib/libamac32op.a \
+lib/libamac64.a lib/libamac64o.a \
+lib/libad32.dll lib/libad32.lib lib/libad64.dll lib/libad64.lib \
+asmlib.h asmlibran.h asmlib-instructions.pdf license.txt \
+asmlibSrc.zip inteldispatchpatch.zip
+ wzzip $@ $?
+
+# Make zip archive of source code
+asmlibSrc.zip: makeasmlib.bat asmlib.make \
+asm/instrset32.asm asm/instrset64.asm asm/procname32.asm asm/procname64.asm \
+asm/rdtsc32.asm asm/rdtsc64.asm asm/round32.asm asm/round64.asm \
+asm/libad32.asm asm/libad32.def asm/libad64.asm asm/libad64.def \
+asm/memcpy32.asm asm/memmove32.asm asm/memcpy64.asm asm/memmove64.asm \
+asm/memset32.asm asm/memset64.asm asm/memcmp32.asm asm/memcmp64.asm \
+asm/strlen32.asm asm/strlen64.asm \
+asm/strcpy32.asm asm/strcpy64.asm asm/strcat32.asm asm/strcat64.asm \
+asm/strcmp32.asm asm/strcmp64.asm asm/stricmp32.asm asm/stricmp64.asm \
+asm/strtouplow32.asm asm/strtouplow64.asm asm/strstr32.asm asm/strstr64.asm \
+asm/substring32.asm asm/substring64.asm asm/strspn32.asm asm/strspn64.asm \
+asm/strcountutf832.asm asm/strcountutf864.asm \
+asm/strcountset32.asm asm/strcountset64.asm \
+asm/divfixedi32.asm asm/divfixedi64.asm \
+asm/divfixedv32.asm asm/divfixedv64.asm \
+asm/popcount32.asm asm/popcount64.asm \
+asm/cpuid32.asm asm/cpuid64.asm asm/cputype32.asm asm/cputype64.asm \
+asm/physseed32.asm asm/physseed64.asm \
+asm/mother32.asm asm/mother64.asm asm/mersenne32.asm asm/mersenne64.asm \
+asm/randomah.asi asm/sfmt32.asm asm/sfmt64.asm \
+asm/debugbreak32.asm asm/debugbreak64.asm \
+asm/unalignedisfaster32.asm asm/unalignedisfaster64.asm \
+asm/cachesize32.asm asm/cachesize64.asm \
+asm/dispatchpatch32.asm asm/dispatchpatch64.asm \
+testalib.cpp testrandom.cpp testmem.cpp
+ wzzip $@ $?
+
+# Make zip archive of inteldispatchpatch
+inteldispatchpatch.zip: patch/dispatchpatch.txt \
+patch/dispatchpatch32.obj patch/dispatchpatch32.o patch/dispatchpatch32.mac.o \
+patch/dispatchpatch64.obj patch/dispatchpatch64.o patch/dispatchpatch64.mac.o \
+patch/intel_cpu_feature_patch.c patch/intel_mkl_feature_patch.c
+ wzzip $@ $?
+
+
+# Build each library version:
+
+# 32 bit Windows/COFF library
+lib/libacof32.lib: obj/instrset32.obj32 obj/procname32.obj32 \
+obj/cpuid32.obj32 obj/rdtsc32.obj32 obj/round32.obj32 \
+obj/memcpy32.obj32 obj/memmove32.obj32 obj/memset32.obj32 obj/memcmp32.obj32 \
+obj/strlen32.obj32 obj/strcpy32.obj32 obj/strcat32.obj32 \
+obj/strstr32.obj32 obj/strcmp32.obj32 obj/stricmp32.obj32 \
+obj/strtouplow32.obj32 obj/substring32.obj32 obj/strspn32.obj32 \
+obj/strcountutf832.obj32 obj/strcountset32.obj32 \
+obj/divfixedi32.obj32 obj/divfixedv32.obj32 obj/popcount32.obj32 \
+obj/physseed32.obj32 obj/mother32.obj32 obj/mersenne32.obj32 \
+obj/sfmt32.obj32 \
+obj/cputype32.obj32 obj/debugbreak32.obj32 obj/unalignedisfaster32.obj32 \
+obj/cachesize32.obj32
+ objconv -fcof32 -wex -lib $@ $?
+
+# 32 bit ELF library, position dependent
+lib/libaelf32.a: obj/instrset32.o32 obj/procname32.o32 \
+obj/cpuid32.o32 obj/rdtsc32.o32 obj/round32.o32 \
+obj/memcpy32.o32 obj/memmove32.o32 obj/memset32.o32 obj/memcmp32.o32 \
+obj/strlen32.o32 obj/strcpy32.o32 obj/strcat32.o32 \
+obj/strstr32.o32 obj/strcmp32.o32 obj/stricmp32.o32 \
+obj/strtouplow32.o32 obj/substring32.o32 obj/strspn32.o32 \
+obj/strcountutf832.o32 obj/strcountset32.o32 \
+obj/divfixedi32.o32 obj/divfixedv32.o32 obj/popcount32.o32 \
+obj/physseed32.o32 obj/mother32.o32 obj/mersenne32.o32 \
+obj/sfmt32.o32 \
+obj/cputype32.o32 obj/debugbreak32.o32 obj/unalignedisfaster32.o32 \
+obj/cachesize32.o32
+ objconv -felf32 -nu -wex -lib $@ $?
+
+# 32 bit ELF library, position independent
+lib/libaelf32p.a: obj/instrset32.o32pic obj/procname32.o32pic \
+obj/cpuid32.o32pic obj/rdtsc32.o32pic obj/round32.o32pic \
+obj/memcpy32.o32pic obj/memmove32.o32pic obj/memset32.o32pic obj/memcmp32.o32pic \
+obj/strlen32.o32pic obj/strcpy32.o32pic obj/strcat32.o32pic \
+obj/strstr32.o32pic obj/strcmp32.o32pic obj/stricmp32.o32pic \
+obj/strtouplow32.o32pic obj/substring32.o32pic obj/strspn32.o32pic \
+obj/strcountutf832.o32pic obj/strcountset32.o32pic \
+obj/divfixedi32.o32pic obj/divfixedv32.o32pic obj/popcount32.o32pic \
+obj/physseed32.o32pic obj/mother32.o32pic obj/mersenne32.o32pic \
+obj/sfmt32.o32pic \
+obj/cputype32.o32pic obj/debugbreak32.o32 obj/unalignedisfaster32.o32 \
+obj/cachesize32.o32pic
+ objconv -felf32 -nu -wex -lib $@ $?
+
+# 64 bit COFF library Windows
+lib/libacof64.lib: obj/instrset64.obj64 obj/procname64.obj64 \
+obj/cpuid64.obj64 obj/rdtsc64.obj64 obj/round64.obj64 \
+obj/memcpy64.obj64 obj/memmove64.obj64 obj/memset64.obj64 obj/memcmp64.obj64 \
+obj/strlen64.obj64 obj/strcpy64.obj64 obj/strcat64.obj64 \
+obj/strstr64.obj64 obj/strcmp64.obj64 obj/stricmp64.obj64 \
+obj/strtouplow64.obj64 obj/substring64.obj64 obj/strspn64.obj64 \
+obj/strcountutf864.obj64 obj/strcountset64.obj64 \
+obj/divfixedi64.obj64 obj/divfixedv64.obj64 obj/popcount64.obj64 \
+obj/physseed64.obj64 obj/mother64.obj64 obj/mersenne64.obj64 \
+obj/sfmt64.obj64 \
+obj/cputype64.obj64 obj/debugbreak64.obj64 obj/unalignedisfaster64.obj64 \
+obj/cachesize64.obj64
+ objconv -fcof64 -wex -lib $@ $?
+
+# 64 bit ELF library Unix
+lib/libaelf64.a: obj/instrset64.o64 obj/procname64.o64 \
+obj/cpuid64.o64 obj/rdtsc64.o64 obj/round64.o64 \
+obj/memcpy64.o64 obj/memmove64.o64 obj/memset64.o64 obj/memcmp64.o64 \
+obj/strlen64.o64 obj/strcpy64.o64 obj/strcat64.o64 \
+obj/strstr64.o64 obj/strcmp64.o64 obj/stricmp64.o64 \
+obj/strtouplow64.o64 obj/substring64.o64 obj/strspn64.o64 \
+obj/strcountutf864.o64 obj/strcountset64.o64 \
+obj/divfixedi64.o64 obj/divfixedv64.o64 obj/popcount64.o64 \
+obj/physseed64.o64 obj/mother64.o64 obj/mersenne64.o64 \
+obj/sfmt64.o64 \
+obj/cputype64.o64 obj/debugbreak64.o64 obj/unalignedisfaster64.o64 \
+obj/cachesize64.o64
+ objconv -felf64 -nu -wex -wd1029 -lib $@ $?
+
+# Convert these libraries to other versions:
+
+# 32 bit COFF library, override version
+lib/libacof32o.lib: lib/libacof32.lib
+ objconv -fcof32 -np:?OVR_:_ -wex $** $@
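+# (objconv's -np option replaces a name prefix, so -np:?OVR_:_ renames entries
+# such as ?OVR_memcpy to _memcpy; this is what lets the "override" library
+# versions stand in for the standard C library functions at link time.)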
+
+# 32 bit OMF library
+lib/libaomf32.lib: lib/libacof32.lib
+ objconv -fomf32 -nu -wex $** $@
+
+# 32 bit OMF library, override version
+lib/libaomf32o.lib: lib/libacof32o.lib
+ objconv -fomf32 -nu -wex $** $@
+
+# 32 bit ELF library, override, position dependent
+lib/libaelf32o.a: lib/libaelf32.a
+ objconv -felf32 -np:?OVR_: -wex $** $@
+
+# 32 bit ELF library, override, position independent
+lib/libaelf32op.a: lib/libaelf32p.a
+ objconv -felf32 -np:?OVR_: -wex $** $@
+
+# 32 bit Mach-O library, position dependent
+lib/libamac32.a: lib/libaelf32.a
+ objconv -fmac32 -nu -wex -wd1050 $** $@
+
+# 32 bit Mach-O library, position independent
+lib/libamac32p.a: lib/libaelf32p.a
+ objconv -fmac32 -nu -wex -wd1050 $** $@
+
+# 32 bit Mach-O library, override
+lib/libamac32o.a: lib/libaelf32o.a
+ objconv -fmac32 -nu -wex -wd1050 $** $@
+
+# 32 bit Mach-O library, override, position independent
+lib/libamac32op.a: lib/libaelf32op.a
+ objconv -fmac32 -nu -wex -wd1050 $** $@
+
+# Make 64 bit COFF library, override
+lib/libacof64o.lib: lib/libacof64.lib
+ objconv -fcof64 -np:?OVR_: -wex $** $@
+
+# 64 bit ELF library, override
+lib/libaelf64o.a: lib/libaelf64.a
+ objconv -felf64 -np:?OVR_: -wex -wd1029 $** $@
+
+# 64 bit Mach-O library
+lib/libamac64.a: lib/libaelf64.a
+ objconv -fmac64 -nu -wex $** $@
+
+# 64 bit Mach-O library, override
+lib/libamac64o.a: lib/libaelf64o.a
+ objconv -fmac64 -nu -wex $** $@
+
+# Convert 32 bit COFF library to DLL
+lib/libad32.dll: lib/libacof32.lib obj/libad32.obj32 asm/libad32.def
+ link /DLL /DEF:asm\libad32.def /SUBSYSTEM:WINDOWS /NODEFAULTLIB /ENTRY:DllEntry obj\libad32.obj32 lib/libacof32.lib
+ move libad32.* lib\
+
+# Convert 64 bit COFF library to DLL
+lib/libad64.dll: lib/libacof64.lib obj/libad64.obj64 asm/libad64.def
+ link /DLL /DEF:asm\libad64.def /SUBSYSTEM:WINDOWS /LIBPATH:$(libpath64) /ENTRY:DllEntry obj\libad64.obj64 lib/libacof64.lib
+ move libad64.* lib\
+
+
+# Object files for inteldispatchpatch.zip:
+
+patch/dispatchpatch32.obj: obj/dispatchpatch32.obj32
+ copy obj\dispatchpatch32.obj32 patch\dispatchpatch32.obj
+# Note: copy must have '\', not '/'
+
+patch/dispatchpatch32.o: obj/dispatchpatch32.o32pic
+ copy obj\dispatchpatch32.o32pic patch\dispatchpatch32.o
+
+patch/dispatchpatch32.mac.o: obj/dispatchpatch32.o32pic
+ objconv -fmac32 -nu -wex -wd1050 $** $@
+
+patch/dispatchpatch64.obj: obj/dispatchpatch64.obj64
+ copy obj\dispatchpatch64.obj64 patch\dispatchpatch64.obj
+
+patch/dispatchpatch64.o: obj/dispatchpatch64.o64
+ copy obj\dispatchpatch64.o64 patch\dispatchpatch64.o
+
+patch/dispatchpatch64.mac.o: obj/dispatchpatch64.o64
+ objconv -fmac64 -nu -wex $** $@
+
+
+# Generic rules for assembling
+
+# Generic rule for assembling 32-bit code for Windows (position dependent)
+{asm\}.asm{obj\}.obj32:
+ yasm -fwin32 -DWINDOWS -Worphan-labels -Werror -o $*.obj32 $<
+# ML /c /Cx /W3 /coff /Fl /Fo$*.obj32
+
+# Generic rule for assembling 32-bit for Unix, position-dependent
+{asm\}.asm{obj\}.o32:
+ yasm -felf32 -DUNIX -Worphan-labels -Werror -o $*.o32 $<
+ objconv -felf32 -nu- -wd2005 $*.o32 $*.o32
+
+# Generic rule for assembling 32-bit for Unix, position-independent
+{asm\}.asm{obj\}.o32pic:
+ yasm -felf32 -DUNIX -DPOSITIONINDEPENDENT -Worphan-labels -Werror -o $*.o32pic $<
+ objconv -felf32 -nu- -wd2005 $*.o32pic $*.o32pic
+
+# Generic rule for assembling 64-bit code for Windows
+{asm\}.asm{obj\}.obj64:
+ yasm -fwin64 -DWINDOWS -Worphan-labels -Werror -o $*.obj64 $<
+
+# Generic rule for assembling 64-bit code for Linux, BSD, Mac
+{asm\}.asm{obj\}.o64:
+ yasm -felf64 -DUNIX -Worphan-labels -Werror -o $*.o64 $<
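To make the inference rules concrete: for a 64-bit Unix object such as obj\memcpy64.o64, nmake expands the last rule roughly as shown below ($< is the dependent source file and $* the target name without extension); this is only a sketch of the generated command, not part of the makefile itself.

    yasm -felf64 -DUNIX -Worphan-labels -Werror -o obj\memcpy64.o64 asm\memcpy64.asm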
diff --git a/asmlibSrc/cachesize32.asm b/asmlibSrc/cachesize32.asm
new file mode 100755
index 0000000..c50b957
--- /dev/null
+++ b/asmlibSrc/cachesize32.asm
@@ -0,0 +1,335 @@
+;************************* cachesize32.asm *************************************
+; Author: Agner Fog
+; Date created: 2011-07-11
+; Last modified: 2013-08-14
+; Description:
+; Determines the size of the data caches
+;
+; extern "C" site_t DataCacheSize(int level);
+; Input:
+; level: n = 1 - 4: level n data cache
+; 0 = largest level data cache
+; Return value: size in bytes of data cache
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2011-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
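+;
+; Illustrative call from C or C++, assuming the prototype above as declared
+; in asmlib.h:
+;    size_t l1      = DataCacheSize(1);  // level-1 data cache size in bytes
+;    size_t largest = DataCacheSize(0);  // size of the largest data cache
+;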
+
+global _DataCacheSize: function
+
+; Imported from cputype32.asm
+extern _CpuType ; near. Determine CPU vendor
+
+; data are referenced as [esi+structuremember] rather than [esi+label-dataref] because
+; of a bug in yasm v. 1.1.0.2352:
+
+struc data_layout
+ok: resd 1
+level1: resd 1
+level2: resd 1
+level3: resd 1
+level4: resd 1
+descriptortable: resd 60
+endstruc
+
+struc descriptor_record ; record for table of cache descriptors
+d_key: resb 1 ; key from cpuid instruction
+d_level: resb 1 ; cache level
+d_sizem: resb 1 ; size multiplier
+d_2pow: resb 1 ; power of 2. size = d_sizem << d_2pow
+endstruc
+
+SECTION .data
+
+dataref: ; reference point
+ok_: DD 0 ; 1 when values are determined
+level1_: DD 0 ; level 1 data cache size
+level2_: DD 0 ; level 2 data cache size
+level3_: DD 0 ; level 3 data cache size
+level4_: DD 0 ; level 4 data cache size
+numlevels equ 4 ; max level
+
+; From "Intel Processor Identification and the CPUID Instruction, Application note 485
+descriptortable_: ; table of Intel cache descriptors
+db 0Ah, 1, 1, 13 ; 8 kb L1 data cache
+db 0Ch, 1, 1, 14 ; 16 kb L1 data cache
+db 0Dh, 1, 1, 14 ; 16 kb L1 data cache
+db 21h, 2, 1, 18 ; 256 kb L2 data cache
+db 22h, 3, 1, 19 ; 512 kb L3 data cache
+db 23h, 3, 1, 20 ; 1 Mb L3 data cache
+db 25h, 3, 1, 21 ; 2 Mb L3 data cache
+db 29h, 3, 1, 22 ; 4 Mb L3 data cache
+db 2Ch, 1, 1, 15 ; 32 kb L1 data cache
+db 39h, 2, 1, 17 ; 128 kb L2 data cache
+db 3Ah, 2, 3, 16 ; 192 kb L2 data cache
+db 3Bh, 2, 1, 17 ; 128 kb L2 data cache
+db 3Ch, 2, 1, 18 ; 256 kb L2 data cache
+db 3Dh, 2, 3, 17 ; 384 kb L2 data cache
+db 3Eh, 2, 1, 19 ; 512 kb L2 data cache
+db 41h, 2, 1, 17 ; 128 kb L2 data cache
+db 42h, 2, 1, 18 ; 256 kb L2 data cache
+db 43h, 2, 1, 19 ; 512 kb L2 data cache
+db 44h, 2, 1, 20 ; 1 Mb L2 data cache
+db 45h, 2, 1, 21 ; 2 Mb L2 data cache
+db 46h, 3, 1, 22 ; 4 Mb L3 data cache
+db 47h, 3, 1, 23 ; 8 Mb L3 data cache
+db 48h, 2, 3, 20 ; 3 Mb L2 data cache
+db 49h, 2, 1, 22 ; 4 Mb L2 or 3 data cache
+db 4Ah, 3, 3, 21 ; 6 Mb L3 data cache
+db 4Bh, 3, 1, 23 ; 8 Mb L3 data cache
+db 4Ch, 3, 3, 22 ; 12 Mb L3 data cache
+db 4Dh, 3, 1, 24 ; 16 Mb L3 data cache
+db 4Eh, 2, 3, 21 ; 6 Mb L2 data cache
+db 60h, 1, 1, 14 ; 16 kb L1 data cache
+db 66h, 1, 1, 13 ; 8 kb L1 data cache
+db 67h, 1, 1, 14 ; 16 kb L1 data cache
+db 68h, 1, 1, 15 ; 32 kb L1 data cache
+db 78h, 2, 1, 20 ; 1 Mb L2 data cache
+db 79h, 2, 1, 17 ; 128 kb L2 data cache
+db 7Ah, 2, 1, 18 ; 256 kb L2 data cache
+db 7Bh, 2, 1, 19 ; 512 kb L2 data cache
+db 7Ch, 2, 1, 20 ; 1 Mb L2 data cache
+db 7Dh, 2, 1, 21 ; 2 Mb L2 data cache
+db 7Fh, 2, 1, 19 ; 512 kb L2 data cache
+db 82h, 2, 1, 18 ; 256 kb L2 data cache
+db 83h, 2, 1, 19 ; 512 kb L2 data cache
+db 84h, 2, 1, 20 ; 1 Mb L2 data cache
+db 85h, 2, 1, 21 ; 2 Mb L2 data cache
+db 86h, 2, 1, 19 ; 512 kb L2 data cache
+db 87h, 2, 1, 20 ; 1 Mb L2 data cache
+db 0D0h, 3, 1, 19 ; 512 kb L3 data cache
+db 0D1h, 3, 1, 20 ; 1 Mb L3 data cache
+db 0D2h, 3, 1, 21 ; 2 Mb L3 data cache
+db 0D6h, 3, 1, 20 ; 1 Mb L3 data cache
+db 0D7h, 3, 1, 21 ; 2 Mb L3 data cache
+db 0D8h, 3, 1, 22 ; 4 Mb L3 data cache
+db 0DCh, 3, 3, 19 ; 1.5 Mb L3 data cache
+db 0DDh, 3, 3, 20 ; 3 Mb L3 data cache
+db 0DEh, 3, 3, 21 ; 6 Mb L3 data cache
+db 0E2h, 3, 1, 21 ; 2 Mb L3 data cache
+db 0E3h, 3, 1, 22 ; 4 Mb L3 data cache
+db 0E4h, 3, 1, 23 ; 8 Mb L3 data cache
+db 0EAh, 3, 3, 22 ; 12 Mb L3 data cache
+db 0EBh, 3, 9, 21 ; 18 Mb L3 data cache
+db 0ECh, 3, 3, 23 ; 24 Mb L3 data cache
+descriptortablelength equ ($ - descriptortable_) / descriptor_record_size
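+; Worked example of the decoding above (size = d_sizem << d_2pow):
+; descriptor 2Ch has level 1, multiplier 1, exponent 15, giving
+; 1 << 15 = 32768 bytes = 32 kb of L1 data cache; descriptor 0DCh gives
+; 3 << 19 = 1572864 bytes = 1.5 Mb of L3 data cache.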
+
+
+SECTION .text
+
+; extern "C" site_t _DataCacheSize(int level);
+
+; Function entry:
+_DataCacheSize:
+ push ebx
+ push esi
+ push edi
+ push ebp
+ mov edi, [esp+20] ; level
+%IFDEF POSITIONINDEPENDENT
+ call get_thunk_esi
+ add esi, dataref - $ ; point to dataref
+%ELSE
+ mov esi, dataref ; point to dataref
+%ENDIF
+ ; check if called before
+ cmp dword [esi + ok], 1
+ je D800
+
+ ; find cpu vendor
+ push 0
+ mov eax, esp
+ push 0
+ push 0
+ push eax
+ call _CpuType
+ add esp, 12
+ pop eax ; eax = vendor
+ dec eax
+ jz Intel
+ dec eax
+ jz AMD
+ dec eax
+ jz VIA
+ ; unknown vendor, try all methods
+ call IntelNewMethod
+ jnc D800 ; not carry = success
+ call AMDMethod
+ jnc D800 ; not carry = success
+ call IntelOldMethod
+ jmp D800 ; return whether success or not
+
+Intel: call IntelNewMethod
+ jnc D800 ; not carry = success
+ call IntelOldMethod
+ jmp D800 ; return whether success or not
+
+AMD: ; AMD and VIA use same method
+VIA: call AMDMethod
+
+D800: ; cache data known, get desired return value
+ xor eax, eax
+ cmp edi, numlevels
+ ja D900
+ cmp edi, 0
+ je D820
+ ; level = 1 .. numlevels
+ mov eax, [esi + edi*4] ; size of selected cache
+ jmp D850
+D820: ; level = 0. Get size of largest level cache
+ mov eax, [esi + level3]
+ test eax, eax
+ jnz D850
+ mov eax, [esi + level2]
+ test eax, eax
+ jnz D850
+ mov eax, [esi + level1]
+D850: mov dword [esi + ok], 1 ; remember called, whether success or not
+D900: pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+
+
+%IFDEF POSITIONINDEPENDENT
+get_thunk_esi:
+ mov esi, [esp]
+ ret
+%ENDIF
+
+
+; Determine cache sizes by CPUID function 4
+; input: esi = pointer to dataref
+; output: values returned in dataref + level1, level2, level3
+; carry flag = 0 on success
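+;
+; Added note (not in the original source): CPUID leaf 4 reports, in EBX/ECX,
+; ways-1 (EBX[31:22]), partitions-1 (EBX[21:12]), line size-1 (EBX[11:0]) and
+; sets-1 (ECX), so the code below computes
+;   size = ways * partitions * line_size * sets
+; e.g. an 8-way cache with 1 partition, 64-byte lines and 64 sets gives
+;   8 * 1 * 64 * 64 = 32768 bytes (32 kb).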
+IntelNewMethod:
+ xor eax, eax
+ cpuid ; get number of CPUID functions
+ cmp eax, 4
+ jb I900 ; fail
+ xor ebp, ebp ; loop counter
+I100: mov eax, 4
+ mov ecx, ebp
+ cpuid ; get cache parameters
+ mov edx, eax
+ and edx, 11111b ; cache type
+ jz I500 ; no more caches
+ cmp edx, 2
+ je I200 ; code cache, ignore
+ inc ecx ; sets
+ mov edx, ebx
+ shr edx, 22
+ inc edx ; ways
+ imul ecx, edx
+ mov edx, ebx
+ shr edx, 12
+ and edx, 1111111111b
+ inc edx ; partitions
+ imul ecx, edx
+ and ebx, 111111111111b
+ inc ebx ; line size
+ imul ecx, ebx ; calculated cache size
+ shr eax, 5
+ and eax, 111b ; cache level
+ cmp eax, numlevels
+ jna I180
+ mov eax, numlevels ; limit higher levels
+I180: mov [esi+eax*4], ecx ; store size of data cache level eax
+I200: inc ebp
+ cmp ebp, 100h ; avoid infinite loop
+ jb I100 ; next cache
+I500: ; loop finished
+ ; check if OK
+ mov eax, [esi+level1]
+ cmp eax, 1024
+I900: ret ; carry flag set if fail
+
+; Determine cache sizes by CPUID function 2
+; input: esi = pointer to dataref
+; output: values returned in dataref + level1, level2, level3
+; carry flag = 0 on success
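+;
+; Added note (not in the original source): each descriptor byte is looked up in
+; descriptortable_ above and the cache size is computed as d_sizem << d_2pow,
+; e.g. descriptor 2Ch (entry "2Ch, 1, 1, 15") gives 1 << 15 = 32768 bytes of
+; level 1 data cache.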
+IntelOldMethod:
+ xor eax, eax
+ cpuid ; get number of CPUID functions
+ cmp eax, 2
+ jb J900 ; fail
+ mov eax, 2
+ xor ecx, ecx
+ cpuid ; get 16 descriptor bytes in eax, ebx, ecx, edx
+ mov al, 0 ; al does not contain a descriptor
+ push eax ; save all descriptors
+ push ebx
+ push ecx
+ push edx ; now esp points to descriptors
+ mov edx, 15 ; loop counter
+ ; loop to read 16 descriptor bytes
+J100: mov al, byte [esp+edx]
+ ; find in table
+ mov ebx, descriptortablelength-1 ; loop counter
+ ; loop to search in descriptortable
+J200: cmp al, [esi + descriptortable + ebx*descriptor_record_size + d_key]
+ jne J300
+ ; descriptor found
+; YASM v. 1.1.0 fails if there are too many (label-dataref) expressions:
+; movzx eax, byte [esi + ebx*4 + (descriptortable_-dataref) + d_sizem]
+ movzx eax, byte [esi + ebx*4 + descriptortable + d_sizem]
+ mov cl, [esi + ebx*4 + descriptortable + d_2pow]
+ shl eax, cl ; compute size
+ movzx ecx, byte [esi + descriptortable + ebx*4 + d_level]
+ ; check that level = 1-3
+ cmp ecx, 3
+ ja J300
+ mov [esi+ecx*4], eax ; store size eax of data cache level ecx
+J300: dec ebx
+ jns J200 ; inner loop
+ dec edx
+ jns J100 ; outer loop
+ add esp, 16 ; remove from stack
+ ; check if OK
+ mov eax, [esi+level1]
+ cmp eax, 1024
+J900: ret ; carry flag set if fail
+
+
+; Determine cache sizes by CPUID function 80000005H - 80000006H
+; input: esi = pointer to dataref
+; output: values returned in dataref
+; carry flag = 0 on success
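+;
+; Added note (not in the original source): with these CPUID functions the
+; L1 and L2 data cache sizes are reported in kbyte units and the L3 size in
+; 512 kbyte units; e.g. an EDX[31:18] field of 16 from function 80000006H
+; means 16 * 512 kb = 8 Mb, which matches the shr 18 / shl 19 below.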
+AMDMethod:
+ mov eax, 80000000H
+ cpuid ; get number of CPUID functions
+ cmp eax, 6
+ jb K900 ; fail
+ mov eax, 80000005H
+ cpuid ; get L1 cache size
+ shr ecx, 24 ; L1 data cache size in kbytes
+ shl ecx, 10 ; L1 data cache size in bytes
+ mov [esi+level1], ecx ; store L1 data cache size
+ mov eax, 80000006H
+ cpuid ; get L2 and L3 cache sizes
+ shr ecx, 16 ; L2 data cache size in kbytes
+ shl ecx, 10 ; L2 data cache size in bytes
+ mov [esi+level2], ecx ; store L2 data cache size
+ mov ecx, edx
+ shr ecx, 18 ; L3 data cache size / 512 kbytes
+ shl ecx, 19 ; L3 data cache size in bytes
+%if 0 ; AMD manual is unclear:
+ ; do we have to increase the value if the number of ways is not a power of 2?
+ shr edx, 12
+ and edx, 1111b ; L3 associativity
+ cmp edx, 3
+ jb K100
+ test edx, 1
+ jz K100
+ ; number of ways is not a power of 2, multiply by 1.5 ?
+ mov eax, ecx
+ shr eax, 1
+ add ecx, eax
+%endif
+K100: mov [esi+level3], ecx ; store L3 data cache size
+ ; check if OK
+ mov eax, [esi+level1]
+ cmp eax, 1024
+K900: ret ; carry flag set if fail
diff --git a/asmlibSrc/cachesize64.asm b/asmlibSrc/cachesize64.asm
new file mode 100755
index 0000000..b8c07b8
--- /dev/null
+++ b/asmlibSrc/cachesize64.asm
@@ -0,0 +1,333 @@
+;************************* cachesize64.asm *************************************
+; Author: Agner Fog
+; Date created: 2011-07-11
+; Last modified: 2013-08-14
+; Description:
+; Determines the size of the data caches
+;
+; extern "C" size_t DataCacheSize(int level);
+; Input:
+; level: n = 1 - 4: level n data cache
+; 0 = largest level data cache
+; Return value: size in bytes of data cache
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2011-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global DataCacheSize: function
+
+; Imported from cputype64.asm
+extern CpuType ; near. Determine CPU vendor
+
+struc data_layout
+ok: resd 2
+level1: resq 1
+level2: resq 1
+level3: resq 1
+level4: resq 1
+descriptortable: resd 60
+endstruc
+
+struc descriptor_record ; record for table of cache descriptors
+d_key: resb 1 ; key from cpuid instruction
+d_level: resb 1 ; cache level
+d_sizem: resb 1 ; size multiplier
+d_2pow: resb 1 ; power of 2. size = d_sizem << d_2pow
+endstruc
+
+SECTION .data
+
+dataref: ; reference point
+ok_: DD 0, 0 ; 1 when values are determined
+level1_: DQ 0 ; level 1 data cache size
+level2_: DQ 0 ; level 2 data cache size
+level3_: DQ 0 ; level 3 data cache size
+level4_: DQ 0 ; level 4 data cache size
+numlevels equ 4 ; max level
+
+; From "Intel Processor Identification and the CPUID Instruction", Application Note 485
+descriptortable_: ; table of Intel cache descriptors
+db 0Ah, 1, 1, 13 ; 8 kb L1 data cache
+db 0Ch, 1, 1, 14 ; 16 kb L1 data cache
+db 0Dh, 1, 1, 14 ; 16 kb L1 data cache
+db 21h, 2, 1, 18 ; 256 kb L2 data cache
+db 22h, 3, 1, 19 ; 512 kb L3 data cache
+db 23h, 3, 1, 20 ; 1 Mb L3 data cache
+db 25h, 3, 1, 21 ; 2 Mb L3 data cache
+db 29h, 3, 1, 22 ; 4 Mb L3 data cache
+db 2Ch, 1, 1, 15 ; 32 kb L1 data cache
+db 39h, 2, 1, 17 ; 128 kb L2 data cache
+db 3Ah, 2, 3, 16 ; 192 kb L2 data cache
+db 3Bh, 2, 1, 17 ; 128 kb L2 data cache
+db 3Ch, 2, 1, 18 ; 256 kb L2 data cache
+db 3Dh, 2, 3, 17 ; 384 kb L2 data cache
+db 3Eh, 2, 1, 19 ; 512 kb L2 data cache
+db 41h, 2, 1, 17 ; 128 kb L2 data cache
+db 42h, 2, 1, 18 ; 256 kb L2 data cache
+db 43h, 2, 1, 19 ; 512 kb L2 data cache
+db 44h, 2, 1, 20 ; 1 Mb L2 data cache
+db 45h, 2, 1, 21 ; 2 Mb L2 data cache
+db 46h, 3, 1, 22 ; 4 Mb L3 data cache
+db 47h, 3, 1, 23 ; 8 Mb L3 data cache
+db 48h, 2, 3, 20 ; 3 Mb L2 data cache
+db 49h, 2, 1, 22 ; 4 Mb L2 or L3 data cache
+db 4Ah, 3, 3, 21 ; 6 Mb L3 data cache
+db 4Bh, 3, 1, 23 ; 8 Mb L3 data cache
+db 4Ch, 3, 3, 22 ; 12 Mb L3 data cache
+db 4Dh, 3, 1, 24 ; 16 Mb L3 data cache
+db 4Eh, 2, 3, 21 ; 6 Mb L2 data cache
+db 60h, 1, 1, 14 ; 16 kb L1 data cache
+db 66h, 1, 1, 13 ; 8 kb L1 data cache
+db 67h, 1, 1, 14 ; 16 kb L1 data cache
+db 68h, 1, 1, 15 ; 32 kb L1 data cache
+db 78h, 2, 1, 20 ; 1 Mb L2 data cache
+db 79h, 2, 1, 17 ; 128 kb L2 data cache
+db 7Ah, 2, 1, 18 ; 256 kb L2 data cache
+db 7Bh, 2, 1, 19 ; 512 kb L2 data cache
+db 7Ch, 2, 1, 20 ; 1 Mb L2 data cache
+db 7Dh, 2, 1, 21 ; 2 Mb L2 data cache
+db 7Fh, 2, 1, 19 ; 512 kb L2 data cache
+db 82h, 2, 1, 18 ; 256 kb L2 data cache
+db 83h, 2, 1, 19 ; 512 kb L2 data cache
+db 84h, 2, 1, 20 ; 1 Mb L2 data cache
+db 85h, 2, 1, 21 ; 2 Mb L2 data cache
+db 86h, 2, 1, 19 ; 512 kb L2 data cache
+db 87h, 2, 1, 20 ; 1 Mb L2 data cache
+db 0D0h, 3, 1, 19 ; 512 kb L3 data cache
+db 0D1h, 3, 1, 20 ; 1 Mb L3 data cache
+db 0D2h, 3, 1, 21 ; 2 Mb L3 data cache
+db 0D6h, 3, 1, 20 ; 1 Mb L3 data cache
+db 0D7h, 3, 1, 21 ; 2 Mb L3 data cache
+db 0D8h, 3, 1, 22 ; 4 Mb L3 data cache
+db 0DCh, 3, 3, 19 ; 1.5 Mb L3 data cache
+db 0DDh, 3, 3, 20 ; 3 Mb L3 data cache
+db 0DEh, 3, 3, 21 ; 6 Mb L3 data cache
+db 0E2h, 3, 1, 21 ; 2 Mb L3 data cache
+db 0E3h, 3, 1, 22 ; 4 Mb L3 data cache
+db 0E4h, 3, 1, 23 ; 8 Mb L3 data cache
+db 0EAh, 3, 3, 22 ; 12 Mb L3 data cache
+db 0EBh, 3, 9, 21 ; 18 Mb L3 data cache
+db 0ECh, 3, 3, 23 ; 24 Mb L3 data cache
+descriptortablelength equ ($ - descriptortable_) / descriptor_record_size
+
+
+SECTION .text
+
+; extern "C" size_t DataCacheSize(int level);
+
+; Function entry:
+DataCacheSize:
+ push rbx
+ push r14
+%ifdef WINDOWS
+ push rsi
+ push rdi
+ mov r14d, ecx ; level
+%else ; UNIX
+ mov r14d, edi ; level
+%endif
+ ; check if called before
+ lea r9, [dataref]
+ cmp dword [r9+ok], 1 ; ok
+ je D800
+
+ ; find cpu vendor
+ push 0
+%ifdef WINDOWS
+ mov rcx, rsp
+ xor edx, edx
+ xor r8d, r8d
+%else ; UNIX
+ mov rdi, rsp
+ xor esi, esi
+ xor edx, edx
+%endif
+ call CpuType
+ lea r9, [dataref]
+ pop rax ; eax = vendor
+ dec eax
+ jz Intel
+ dec eax
+ jz AMD
+ dec eax
+ jz VIA
+ ; unknown vendor, try all methods
+ call IntelNewMethod
+ jnc D800 ; not carry = success
+ call AMDMethod
+ jnc D800 ; not carry = success
+ call IntelOldMethod
+ jmp D800 ; return whether success or not
+
+Intel: call IntelNewMethod
+ jnc D800 ; not carry = success
+ call IntelOldMethod
+ jmp D800 ; return whether success or not
+
+AMD: ; AMD and VIA use same method
+VIA: call AMDMethod
+
+D800: ; cache data known, get desired return value
+ xor eax, eax
+ cmp r14d, numlevels
+ ja D900
+ cmp r14d, 0
+ je D820
+ ; level = 1 .. numlevels
+ mov rax, [r9 + r14*8] ; size of selected cache
+ jmp D850
+D820: ; level = 0. Get size of largest level cache
+ mov rax, [r9 + level3] ; level3
+ test rax, rax
+ jnz D850
+ mov rax, [r9 + level2] ; level2
+ test rax, rax
+ jnz D850
+ mov eax, [r9 + level1] ; level1
+D850: mov dword [r9 + ok], 1 ; remember called, whether success or not
+D900:
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop r14
+ pop rbx
+ ret
+
+
+; Determine cache sizes by CPUID function 4
+; input: r9 = pointer to dataref
+; output: values returned in dataref + level1, level2, level3
+; carry flag = 0 on success
+IntelNewMethod:
+ xor eax, eax
+ cpuid ; get number of CPUID functions
+ cmp eax, 4
+ jb I900 ; fail
+ xor esi, esi ; loop counter
+I100: mov eax, 4
+ mov ecx, esi
+ cpuid ; get cache parameters
+ mov edx, eax
+ and edx, 11111b ; cache type
+ jz I500 ; no more caches
+ cmp edx, 2
+ je I200 ; code cache, ignore
+ inc ecx ; sets
+ mov edx, ebx
+ shr edx, 22
+ inc edx ; ways
+ imul ecx, edx
+ mov edx, ebx
+ shr edx, 12
+ and edx, 1111111111b
+ inc edx ; partitions
+ imul ecx, edx
+ and ebx, 111111111111b
+ inc ebx ; line size
+ imul rcx, rbx ; calculated cache size (64 bit)
+ shr eax, 5
+ and eax, 111b ; cache level
+ cmp eax, numlevels
+ jna I180
+ mov eax, numlevels ; limit higher levels
+I180: mov [r9+rax*8], rcx ; store size of data cache level eax
+I200: inc esi
+ cmp esi, 100h ; avoid infinite loop
+ jb I100 ; next cache
+I500: ; loop finished
+ ; check if OK
+ mov eax, [r9+level1] ; level1
+ cmp eax, 1024
+I900: ret ; carry flag set if fail
+
+; Determine cache sizes by CPUID function 2
+; input: r9 = pointer to dataref
+; output: values returned in dataref + level1, level2, level3
+; carry flag = 0 on success
+IntelOldMethod:
+ xor eax, eax
+ cpuid ; get number of CPUID functions
+ cmp eax, 2
+ jb J900 ; fail
+ mov eax, 2
+ xor ecx, ecx
+ cpuid ; get 16 descriptor bytes in eax, ebx, ecx, edx
+ mov al, 0 ; al does not contain a descriptor
+ sub rsp, 16
+ mov [rsp], eax ; save all descriptors
+ mov [rsp+4], ebx
+ mov [rsp+8], ecx
+ mov [rsp+12], edx
+ mov edx, 15 ; loop counter
+ ; loop to read 16 descriptor bytes
+J100: mov al, byte [rsp+rdx]
+ ; find in table
+ mov ebx, descriptortablelength-1 ; loop counter
+ ; loop to search in descriptortable
+J200: cmp al, [r9 + descriptortable + rbx*4 + d_key]
+ jne J300
+ ; descriptor found
+ movzx eax, byte [r9 + descriptortable + rbx*4 + d_sizem]
+ mov cl, [r9 + descriptortable + rbx*4 + d_2pow]
+ shl eax, cl ; compute size
+ movzx ecx, byte [r9 + descriptortable + rbx*4 + d_level]
+ ; check that level = 1-3
+ cmp ecx, 3
+ ja J300
+ mov [r9+rcx*8], rax ; store size eax of data cache level ecx
+J300: dec ebx
+ jns J200 ; inner loop
+ dec edx
+ jns J100 ; outer loop
+ add rsp, 16 ; remove from stack
+ ; check if OK
+ mov eax, [r9 + level1]
+ cmp eax, 1024
+J900: ret ; carry flag set if fail
+
+
+; Determine cache sizes by CPUID function 80000005H - 80000006H
+; input: r9 = pointer to dataref
+; output: values returned in dataref
+; carry flag = 0 on success
+AMDMethod:
+ mov eax, 80000000H
+ cpuid ; get number of CPUID functions
+ cmp eax, 6
+ jb K900 ; fail
+ mov eax, 80000005H
+ cpuid ; get L1 cache size
+ shr ecx, 24 ; L1 data cache size in kbytes
+ shl ecx, 10 ; L1 data cache size in bytes
+ mov [r9 + level1], ecx ; store L1 data cache size
+ mov eax, 80000006H
+ cpuid ; get L2 and L3 cache sizes
+ shr ecx, 16 ; L2 data cache size in kbytes
+ shl ecx, 10 ; L2 data cache size in bytes
+ mov [r9 + level2], ecx ; store L2 data cache size
+ mov ecx, edx
+ shr ecx, 18 ; L3 data cache size / 512 kbytes
+ shl rcx, 19 ; L3 data cache size in bytes
+%if 0 ; AMD manual is unclear:
+ ; do we have to increase the value if the number of ways is not a power of 2?
+ shr edx, 12
+ and edx, 1111b ; L3 associativity
+ cmp edx, 3
+ jb K100
+ test edx, 1
+ jz K100
+ ; number of ways is not a power of 2, multiply by 1.5 ?
+ mov rax, rcx
+ shr rax, 1
+ add rcx, rax
+%endif
+K100: mov [r9 + level3], rcx ; store L3 data cache size
+ ; check if OK
+ mov eax, [r9 + level1]
+ cmp eax, 1024
+K900: ret ; carry flag set if fail
diff --git a/asmlibSrc/cpuid32.asm b/asmlibSrc/cpuid32.asm
new file mode 100755
index 0000000..ec601a3
--- /dev/null
+++ b/asmlibSrc/cpuid32.asm
@@ -0,0 +1,38 @@
+;************************* cpuid32.asm *********************************
+; Author: Agner Fog
+; Date created: 2008-12-14
+; Last modified: 2011-07-01
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Description:
+; This function calls the CPUID instruction.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global _cpuid_ex: function
+
+SECTION .text align=16
+
+; ********** cpuid_ex function **********
+; C++ prototype:
+; extern "C" void cpuid_ex (int abcd[4], int eax, int ecx);
+; Input: a = eax, c = ecx
+; Output: abcd[0] = eax, abcd[1] = ebx, abcd[2] = ecx, abcd[3] = edx
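+;
+; Usage sketch (illustrative, not part of the original source):
+; int abcd[4];
+; cpuid_ex(abcd, 0, 0); // leaf 0: abcd[0] = max leaf, abcd[1], abcd[3], abcd[2] = vendor string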
+
+
+_cpuid_ex:
+ push ebx
+ push edi
+ mov edi, [esp+12] ; abcd out
+ mov eax, [esp+16] ; eax in
+ mov ecx, [esp+20] ; ecx in
+ cpuid ; input eax, ecx. output eax, ebx, ecx, edx
+ mov [edi], eax
+ mov [edi+4], ebx
+ mov [edi+8], ecx
+ mov [edi+12], edx
+ pop edi
+ pop ebx
+ ret
+;_cpuid_ex END
diff --git a/asmlibSrc/cpuid64.asm b/asmlibSrc/cpuid64.asm
new file mode 100755
index 0000000..80cd249
--- /dev/null
+++ b/asmlibSrc/cpuid64.asm
@@ -0,0 +1,53 @@
+;************************* cpuid64.asm *********************************
+; Author: Agner Fog
+; Date created: 2008-12-14
+; Last modified: 2011-07-01
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Description:
+; This function calls the CPUID instruction.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global cpuid_ex: function
+
+SECTION .text align=16
+
+; ********** cpuid_ex function **********
+; C++ prototype:
+; extern "C" void cpuid_ex (int abcd[4], int a, int c);
+; Input: a = eax, c = ecx
+; Output: abcd[0] = eax, abcd[1] = ebx, abcd[2] = ecx, abcd[3] = edx
+
+
+cpuid_ex:
+
+%IFDEF WINDOWS
+; parameters: rcx = abcd, edx = a, r8d = c
+ push rbx
+ xchg rcx, r8
+ mov eax, edx
+ cpuid ; input eax, ecx. output eax, ebx, ecx, edx
+ mov [r8], eax
+ mov [r8+4], ebx
+ mov [r8+8], ecx
+ mov [r8+12], edx
+ pop rbx
+%ENDIF
+%IFDEF UNIX
+; parameters: rdi = abcd, esi = a, edx = c
+ push rbx
+ mov eax, esi
+ mov ecx, edx
+ cpuid ; input eax, ecx. output eax, ebx, ecx, edx
+ mov [rdi], eax
+ mov [rdi+4], ebx
+ mov [rdi+8], ecx
+ mov [rdi+12], edx
+ pop rbx
+%ENDIF
+ ret
+;cpuid_ex END
diff --git a/asmlibSrc/cputype32.asm b/asmlibSrc/cputype32.asm
new file mode 100755
index 0000000..0ab02e2
--- /dev/null
+++ b/asmlibSrc/cputype32.asm
@@ -0,0 +1,139 @@
+;************************* cputype32.asm **********************************
+; Author: Agner Fog
+; Date created: 2011-07-09
+; Last modified: 2011-07-09
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 32 bit
+;
+; C++ prototype:
+; extern "C" void CpuType(int * vendor, int * family, int * model);
+;
+; Description:
+; This function finds the vendor, family and model number of the CPU
+; and returns the values through the pointers. If a pointer is zero
+; then the value is not returned.
+;
+; Vendor:
+; 0 = unknown
+; 1 = Intel
+; 2 = AMD
+; 3 = VIA/Centaur
+; 4 = Cyrix
+; 5 = NexGen
+;
+; Family: This is the sum of the family and extended family fields of the cpuid
+; Model: This is the model + (extended model << 4)
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" void CpuType(int * vendor, int * family, int * model);
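+;
+; Usage sketch (illustrative, not part of the original source):
+; int vendor, family, model;
+; CpuType(&vendor, &family, &model);
+; CpuType(&vendor, 0, 0); // a zero pointer skips that output value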
+
+global _CpuType: function
+
+
+SECTION .text
+
+_CpuType:
+ push ebx
+ push esi
+ push edi
+
+; parameters
+%define vendor esp+16
+%define family esp+20
+%define model esp+24
+
+ xor esi, esi ; vendor
+ xor edi, edi ; family
+
+ ; detect if CPUID instruction supported by microprocessor:
+ pushfd
+ pop eax
+ btc eax, 21 ; check if CPUID bit can toggle
+ push eax
+ popfd
+ pushfd
+ pop ebx
+ xor ebx, eax
+ bt ebx, 21
+ jc C900 ; CPUID not supported
+
+ xor eax, eax
+ cpuid ; get number of CPUID functions
+
+ ; get vendor
+ ; ecx = last 4 characters of vendor string
+ ; ebx = first 4 characters of vendor string
+ cmp ecx, 'ntel' ; 'GenuineIntel'
+ je C110
+ cmp ecx, 'cAMD' ; 'AuthenticAMD'
+ je C120
+ cmp ebx, 'Cent' ; 'CentaurHauls'
+ je C130
+ cmp ebx, 'VIA ' ; 'VIA VIA VIA '
+ je C130
+ cmp ebx, 'Cyri' ; 'CyrixInstead'
+ je C140
+ cmp ebx, 'NexG' ; 'NexGenDriven'
+ je C150
+ jmp C200 ; other
+C110: or esi, 1
+ jmp C200
+C120: or esi, 2
+ jmp C200
+C130: or esi, 3
+ jmp C200
+C140: or esi, 4
+ jmp C200
+C150: or esi, 5
+ ;jmp C200
+C200:
+ test eax, eax
+ jz C900 ; function 1 not supported
+
+ ; Get family and model
+ mov eax, 1
+ cpuid
+ mov ebx, eax
+ mov edi, eax
+ shr ebx, 8
+ and ebx, 0FH ; Family
+ shr edi, 20
+ and edi, 0FFH ; Extended family
+ add edi, ebx ; Family + extended family
+
+ mov ebx, eax
+ shr ebx, 4
+ and ebx, 0FH ; Model
+ mov ecx, eax
+ shr ecx, 12
+ and ecx, 0F0H ; Extended model
+ or ebx, ecx ; extended model | Model
+
+C300: ; return esi = vendor, edi = family, ebx = model
+ mov eax, [vendor]
+ test eax, eax
+ jz C310
+ mov [eax], esi
+C310: mov eax, [family]
+ test eax, eax
+ jz C320
+ mov [eax], edi
+C320: mov eax, [model]
+ test eax, eax
+ jz C330
+ mov [eax], ebx
+C330: xor eax, eax
+ ; return
+ pop edi
+ pop esi
+ pop ebx
+ ret
+
+C900: ; no cpuid
+ xor ebx, ebx
+ jmp C300
+;_CpuType ENDP
diff --git a/asmlibSrc/cputype64.asm b/asmlibSrc/cputype64.asm
new file mode 100755
index 0000000..c74c9d2
--- /dev/null
+++ b/asmlibSrc/cputype64.asm
@@ -0,0 +1,125 @@
+;************************* cputype64.asm **********************************
+; Author: Agner Fog
+; Date created: 2011-07-09
+; Last modified: 2011-07-09
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" void CpuType(int * vendor, int * family, int * model);
+;
+; Description:
+; This function finds the vendor, family and model number of the CPU
+; and returns the values through the pointers. If a pointer is zero
+; then the value is not returned.
+;
+; Vendor:
+; 0 = unknown
+; 1 = Intel
+; 2 = AMD
+; 3 = VIA/Centaur
+; 4 = Cyrix
+; 5 = NexGen
+;
+; Family: This is the sum of the family and extended family fields of the cpuid
+; Model: This is the model + (extended model << 4)
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" void CpuType(int * vendor, int * family, int * model);
+
+global CpuType: function
+
+
+SECTION .text
+
+CpuType:
+ push rbx
+%ifdef UNIX
+ mov r8, rdx
+%endif
+%ifdef WINDOWS
+ push rsi
+ push rdi
+ mov rdi, rcx
+ mov rsi, rdx
+%endif
+
+; parameters
+; vendor rdi
+; family rsi
+; model r8
+
+ xor r9d, r9d ; vendor
+ xor r10d, r10d ; family
+ xor r11d, r11d ; model
+
+ xor eax, eax
+ cpuid ; get vendor
+ ; ecx = last 4 characters of vendor string
+ ; ebx = first 4 characters of vendor string
+ cmp ecx, 'ntel' ; 'GenuineIntel'
+ je C110
+ cmp ecx, 'cAMD' ; 'AuthenticAMD'
+ je C120
+ cmp ebx, 'Cent' ; 'CentaurHauls'
+ je C130
+ cmp ebx, 'VIA ' ; 'VIA VIA VIA '
+ je C130
+ cmp ebx, 'Cyri' ; 'CyrixInstead'
+ je C140
+ cmp ebx, 'NexG' ; 'NexGenDriven'
+ je C150
+ jmp C200 ; other
+C110: or r9d, 1
+ jmp C200
+C120: or r9d, 2
+ jmp C200
+C130: or r9d, 3
+ jmp C200
+C140: or r9d, 4
+ jmp C200
+C150: or r9d, 5
+ ;jmp C200
+C200:
+
+ ; Get family and model
+ mov eax, 1
+ cpuid
+ mov ebx, eax
+ mov r10d, eax
+ shr ebx, 8
+ and ebx, 0FH ; Family
+ shr r10d, 20
+ and r10d, 0FFH ; Extended family
+ add r10d, ebx ; Family + extended family
+
+ mov r11d, eax
+ shr r11d, 4
+ and r11d, 0FH ; Model
+ shr eax, 12
+ and eax, 0F0H ; Extended model
+ or r11d, eax ; extended model | Model
+
+C300: ; return r9d = vendor, r10d = family, r11d = model
+ test rdi, rdi
+ jz C310
+ mov [rdi], r9d
+C310: test rsi, rsi
+ jz C320
+ mov [rsi], r10d
+C320: test r8, r8
+ jz C330
+ mov [r8], r11d
+C330: xor eax, eax
+ ; return
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop rbx
+ ret
+;CpuType ENDP
diff --git a/asmlibSrc/debugbreak32.asm b/asmlibSrc/debugbreak32.asm
new file mode 100755
index 0000000..17a3ec9
--- /dev/null
+++ b/asmlibSrc/debugbreak32.asm
@@ -0,0 +1,31 @@
+;************************* debugbreak32.asm **********************************
+; Author: Agner Fog
+; Date created: 2011-07-09
+; Last modified: 2011-07-09
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 32 bit
+;
+; C++ prototype:
+; extern "C" void A_DebugBreak(void);
+;
+; Description:
+; Makes a debug breakpoint. Works only when running under a debugger
+;
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" void A_DebugBreak(void);
+
+global _A_DebugBreak: function
+
+
+SECTION .text
+
+_A_DebugBreak:
+ int3
+ nop
+ ret
+;_A_DebugBreak ENDP
diff --git a/asmlibSrc/debugbreak64.asm b/asmlibSrc/debugbreak64.asm
new file mode 100755
index 0000000..bbb32ef
--- /dev/null
+++ b/asmlibSrc/debugbreak64.asm
@@ -0,0 +1,31 @@
+;************************* debugbreak64.asm **********************************
+; Author: Agner Fog
+; Date created: 2011-07-09
+; Last modified: 2011-07-09
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" void A_DebugBreak(void);
+;
+; Description:
+; Makes a debug breakpoint. Works only when running under a debugger
+;
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" void A_DebugBreak(void);
+
+global A_DebugBreak: function
+
+
+SECTION .text
+
+A_DebugBreak:
+ int3
+ nop
+ ret
+;A_DebugBreak ENDP
diff --git a/asmlibSrc/dispatchpatch32.asm b/asmlibSrc/dispatchpatch32.asm
new file mode 100755
index 0000000..ef03b69
--- /dev/null
+++ b/asmlibSrc/dispatchpatch32.asm
@@ -0,0 +1,311 @@
+;*********************** dispatchpatch32.asm ********************************
+; Author: Agner Fog
+; Date created: 2007-07-20
+; Last modified: 2014-07-30
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 32 bit
+;
+; C++ prototype:
+; extern "C" int __intel_cpu_indicator = 0;
+; extern "C" void __intel_cpu_indicator_init()
+;
+; Description:
+; Example of how to replace Intel CPU dispatcher in order to improve
+; compatibility of Intel function libraries with non-Intel processors.
+; In Windows, use static link libraries (*.lib), not dynamic libraries
+; (*.dll). Linking in this as an object file will override the functions
+; with the same name in the library.
+;
+; Copyright (c) 2007-2014 GNU LGPL License v. 3.0 www.gnu.org/licenses/lgpl.html
+;******************************************************************************
+
+; extern _InstructionSet: function
+%include "instrset32.asm" ; include code for _InstructionSet function
+
+; InstructionSet function return value:
+; 0 = 80386 instruction set only
+; 1 or above = MMX instructions supported
+; 2 or above = conditional move and FCOMI supported
+; 3 or above = SSE (XMM) supported by processor and operating system
+; 4 or above = SSE2 supported
+; 5 or above = SSE3 supported
+; 6 or above = Supplementary SSE3
+; 8 or above = SSE4.1 supported
+; 9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+; 13 or above = AVX2 supported
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+; 15 or above = AVX512F
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for Intel standard libraries and SVML library,
+; old versions
+;
+; __intel_cpu_indicator is for older versions of Intel compiler
+; version 14.0 uses __intel_cpu_features_init_x() instead
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global ___intel_cpu_indicator
+global ___intel_cpu_indicator_init
+
+
+SECTION .data
+intel_cpu_indicator@: ; local name
+___intel_cpu_indicator: dd 0
+; table of indicator values
+itable DD 1 ; 0: generic version, 80386 instruction set
+ DD 8, 8 ; 1, 2: MMX
+ DD 0x80 ; 3: SSE
+ DD 0x200 ; 4: SSE2
+ DD 0x800 ; 5: SSE3
+ DD 0x1000, 0x1000 ; 6, 7: SSSE3
+ DD 0x2000, 0x2000 ; 8, 9: SSE4.1
+ DD 0x8000, 0x8000 ; 10, 11: SSE4.2 and popcnt
+ DD 0x20000, 0x20000 ; 12, 13: AVX, pclmul, aes
+ DD 0x400000 ; 14: AVX2, F16C, BMI1, BMI2, LZCNT, FMA3
+ DD 0x400000 ;
+
+itablelen equ ($ - itable) / 4 ; length of table
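+
+; Added example (not in the original source): on a CPU where _InstructionSet
+; returns 10 (SSE4.2), the init function below stores itable[10] = 0x8000 in
+; ___intel_cpu_indicator.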
+
+SECTION .text
+
+; This is already in instrset.asm file
+;%IFDEF POSITIONINDEPENDENT
+; Local function for reading instruction pointer into edi
+;GetThunkEDX:
+; mov edx, [esp]
+; ret
+;%ENDIF ; POSITIONINDEPENDENT
+
+
+___intel_cpu_indicator_init:
+ pushad ; Must save registers
+ call _InstructionSet
+ cmp eax, itablelen
+ jb L100
+ mov eax, itablelen - 1 ; limit to table length
+L100:
+%IFDEF POSITIONINDEPENDENT
+ ; Position-independent code for ELF and Mach-O shared objects:
+ call GetThunkEDX
+ add edx, intel_cpu_indicator@ - $
+%ELSE
+ lea edx, [intel_cpu_indicator@]
+%ENDIF
+ mov eax, [edx + (itable - intel_cpu_indicator@) + 4*eax]
+ mov [edx], eax ; store in ___intel_cpu_indicator
+ popad
+ ret
+;___intel_cpu_indicator_init ENDP
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for Math Kernel Library (MKL),
+; version 10.2 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global _mkl_serv_cpu_detect
+
+SECTION .data
+; table of indicator values
+; Note: the table is different in 32 bit and 64 bit mode
+
+mkltab DD 0, 0, 0, 0 ; 0-3: generic version, 80386 instruction set
+ DD 2 ; 4: SSE2
+ DD 3 ; 5: SSE3
+ DD 4 ; 6: SSSE3
+ DD 4 ; 7: unused
+ DD 4 ; 8: SSE4.1
+ DD 4 ; 9: POPCNT
+ DD 5 ; 10: SSE4.2
+ DD 6 ; 11: AVX
+ DD 6 ; 12: PCLMUL, AES
+ DD 6 ; 13: AVX2
+ DD 7 ; 14: FMA3, BMI1/2, LZCNT
+; DD 7 ; 15: AVX512F
+
+mkltablen equ ($ - mkltab) / 4 ; length of table
+
+SECTION .text
+
+_mkl_serv_cpu_detect:
+ push ecx ; Perhaps not needed
+ push edx
+ call _InstructionSet
+ cmp eax, mkltablen
+ jb M100
+ mov eax, mkltablen - 1 ; limit to table length
+M100:
+%IFDEF POSITIONINDEPENDENT
+ ; Position-independent code for ELF and Mach-O shared objects:
+ call GetThunkEDX
+ add edx, mkltab - $
+%ELSE
+ lea edx, [mkltab]
+%ENDIF
+ mov eax, [edx + 4*eax]
+ pop edx
+ pop ecx
+ ret
+; end _mkl_serv_cpu_detect
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for Vector Math Library (VML)
+; version 14.0 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global _mkl_vml_serv_cpu_detect
+
+SECTION .data
+; table of indicator values
+; Note: the table is different in 32 bit and 64 bit mode
+
+vmltab DD 0, 0, 0 ; 0-2: generic version, 80386 instruction set
+ DD 2 ; 3: SSE
+ DD 3 ; 4: SSE2
+ DD 4 ; 5: SSE3
+ DD 5 ; 6: SSSE3
+ DD 5 ; 7: unused
+ DD 6 ; 8: SSE4.1
+ DD 6 ; 9: POPCNT
+ DD 7 ; 10: SSE4.2
+ DD 8 ; 11: AVX
+ DD 8 ; 12: PCLMUL, AES
+ DD 8 ; 13: AVX2
+ DD 9 ; 14: FMA3, BMI1/2, LZCNT
+; DD 9 ; 15: AVX512F
+
+vmltablen equ ($ - vmltab) / 4 ; length of table
+
+SECTION .text
+
+_mkl_vml_serv_cpu_detect:
+ push ecx ; Perhaps not needed
+ push edx
+ call _InstructionSet
+ cmp eax, vmltablen
+ jb V100
+ mov eax, vmltablen - 1 ; limit to table length
+V100:
+%IFDEF POSITIONINDEPENDENT
+ ; Position-independent code for ELF and Mach-O shared objects:
+ call GetThunkEDX
+ add edx, vmltab - $
+%ELSE
+ lea edx, [vmltab]
+%ENDIF
+ mov eax, [edx + 4*eax]
+ pop edx
+ pop ecx
+ ret
+; end _mkl_vml_serv_cpu_detect
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for __intel_cpu_feature_indicator
+; version 13 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if 0 ; Don't include this!
+
+; __intel_cpu_features_init and __intel_cpu_features_init_x are
+; identical, except that the former checks the CPU brand, the
+; latter does not. Don't override this function. Instead, set
+; the indicator variables to 0 to force a re-evaluation,
+; and call __intel_cpu_features_init_x.
+; If you do want to override these functions then you must
+; save all registers.
+
+
+global __intel_cpu_features_init
+global __intel_cpu_feature_indicator
+global __intel_cpu_fms_indicator
+global __intel_cpu_features_init_x
+global __intel_cpu_feature_indicator_x
+global __intel_cpu_fms_indicator_x
+
+SECTION .data
+; table of indicator values
+
+intel_cpu_feature_indicator@:
+__intel_cpu_feature_indicator:
+__intel_cpu_feature_indicator_x DD 0, 0
+intel_cpu_fms_indicator@:
+__intel_cpu_fms_indicator:
+__intel_cpu_fms_indicator_x: DD 0, 0
+
+
+feattab DD 1 ; 0 default
+ DD 0BH ; 1 MMX
+ DD 0FH ; 2 conditional move and FCOMI supported
+ DD 3FH ; 3 SSE
+ DD 7FH ; 4 SSE2
+ DD 0FFH ; 5 SSE3
+ DD 1FFH, 1FFH ; 6 Supplementary SSE3
+ DD 3FFH ; 8 SSE4.1
+ DD 0BFFH ; 9 POPCNT
+ DD 0FFFH ; 10 SSE4.2
+ DD 10FFFH ; 11 AVX
+ DD 16FFFH ; 12 PCLMUL and AES
+ DD 816FFFH ; 13 AVX2
+ DD 9DEFFFH ; 14 FMA3, F16C, BMI1, BMI2, LZCNT
+; DD 0FDEFFFH ; 15 HLE, RTM
+
+feattablen equ ($ - feattab) / 4 ; length of table
+
+SECTION .text
+
+__intel_cpu_features_init:
+__intel_cpu_features_init_x:
+ push ecx
+ push edx
+ call _InstructionSet
+ cmp eax, feattablen
+ jb F100
+ mov eax, feattablen - 1 ; limit to table length
+F100:
+ lea edx, [feattab]
+ mov ebx, [edx + 4*eax] ; look up in table
+ push ebx
+ mov eax, 1
+ cpuid
+ pop ebx
+ bt ecx, 22 ; MOVBE
+ jnc F200
+ or ebx, 1000H
+F200: mov [intel_cpu_feature_indicator@], ebx
+
+ ; get family and model
+ mov edx, eax
+ and eax, 0FH ; stepping bit 0-3
+ mov ecx, edx
+ shr ecx, 4
+ and ecx, 0FH ; model
+ mov ebx, edx
+ shr ebx, 12
+ and ebx, 0F0H ; x model
+ or ecx, ebx ; full model
+ mov ah, cl ; model bit 8 - 15
+ mov ecx, edx
+ shr ecx, 8
+ and ecx, 0FH ; family
+ mov ebx, edx
+ shr ebx, 20
+ and ebx, 0FFH ; x family
+ add ecx, ebx ; full family
+ shl ecx, 16
+ or eax, ecx ; full family bit 16 - 23
+ mov [intel_cpu_fms_indicator@], eax
+
+ pop edx
+ pop ecx
+ ret
+; end __intel_cpu_features_init
+
+%endif
diff --git a/asmlibSrc/dispatchpatch64.asm b/asmlibSrc/dispatchpatch64.asm
new file mode 100755
index 0000000..8f9457a
--- /dev/null
+++ b/asmlibSrc/dispatchpatch64.asm
@@ -0,0 +1,328 @@
+;*********************** dispatchpatch64.asm ********************************
+; Author: Agner Fog
+; Date created: 2007-07-20
+; Last modified: 2014-07-30
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" int __intel_cpu_indicator = 0;
+; extern "C" void __intel_cpu_indicator_init()
+;
+; Description:
+; Example of how to replace Intel CPU dispatcher in order to improve
+; compatibility of Intel function libraries with non-Intel processors.
+; Only works with static link libraries (*.lib, *.a), not dynamic libraries
+; (*.dll, *.so). Linking in this as an object file will override the functions
+; with the same name in the library.
+;
+; Copyright (c) 2007-2014 GNU LGPL License v. 3.0 www.gnu.org/licenses/lgpl.html
+;******************************************************************************
+
+; extern InstructionSet: function
+%include "instrset64.asm" ; include code for InstructionSet function
+
+; InstructionSet function return value:
+; 4 or above = SSE2 supported
+; 5 or above = SSE3 supported
+; 6 or above = Supplementary SSE3
+; 8 or above = SSE4.1 supported
+; 9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+; 13 or above = AVX2 supported
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+; 15 or above = AVX512F
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for Intel standard libraries and SVML library,
+; old versions
+;
+; __intel_cpu_indicator is for older versions of Intel compiler
+; version 14.0 uses __intel_cpu_features_init_x() instead
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global __intel_cpu_indicator
+global __intel_cpu_indicator_init
+
+
+SECTION .data
+intel_cpu_indicator@: ; local name
+__intel_cpu_indicator: dd 0
+
+; table of indicator values
+itable DD 1 ; 0: generic version, 80386 instruction set
+ DD 8, 8 ; 1, 2: MMX
+ DD 0x80 ; 3: SSE
+ DD 0x200 ; 4: SSE2
+ DD 0x800 ; 5: SSE3
+ DD 0x1000, 0x1000 ; 6, 7: SSSE3
+ DD 0x2000, 0x2000 ; 8, 9: SSE4.1
+ DD 0x8000, 0x8000 ; 10, 11: SSE4.2 and popcnt
+ DD 0x20000, 0x20000 ; 12, 13: AVX, pclmul, aes
+ DD 0x400000 ; 14: AVX2, F16C, BMI1, BMI2, LZCNT, FMA3
+; DD 0x800000 ; 15: HLE, RTM
+itablelen equ ($ - itable) / 4 ; length of table
+
+SECTION .text
+
+__intel_cpu_indicator_init:
+ push rax ; registers must be pushed
+ push rcx
+ push rdx
+ push r8
+ push r9
+ push r10
+ push r11
+ push rsi
+ push rdi
+ call InstructionSet
+ cmp eax, itablelen
+ jb L100
+ mov eax, itablelen - 1 ; limit to table length
+L100: lea rdx, [rel itable]
+ mov eax, [rdx + 4*rax]
+ mov [rel intel_cpu_indicator@], eax ; store in __intel_cpu_indicator
+ pop rdi
+ pop rsi
+ pop r11
+ pop r10
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+ pop rax
+ ret
+
+;__intel_cpu_indicator_init ENDP
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for Math Kernel Library (MKL),
+; version 10.2 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global mkl_serv_cpu_detect
+
+SECTION .data
+; table of indicator values
+; Note: the table is different in 32 bit and 64 bit mode
+
+mkltab DD 0, 0, 0, 0 ; 0-3: generic version, 80386 instruction set
+ DD 0 ; 4: SSE2
+ DD 1 ; 5: SSE3
+ DD 2 ; 6: SSSE3
+ DD 2 ; 7: unused
+ DD 2 ; 8: SSE4.1
+ DD 2 ; 9: POPCNT
+ DD 3 ; 10: SSE4.2
+ DD 4 ; 11: AVX
+ DD 4 ; 12: PCLMUL, AES
+ DD 4 ; 13: AVX2
+ DD 5 ; 14: FMA3, BMI1/2, LZCNT
+; DD 5 ; 15: AVX512F
+mkltablen equ ($ - mkltab) / 4 ; length of table
+
+SECTION .text
+
+mkl_serv_cpu_detect:
+ push rcx ; Perhaps not needed
+ push rdx
+ push r8
+ push r9
+%ifdef WINDOWS
+ push rsi
+ push rdi
+%endif
+ call InstructionSet
+ cmp eax, mkltablen
+ jb M100
+ mov eax, mkltablen - 1 ; limit to table length
+M100:
+ lea rdx, [rel mkltab]
+ mov eax, [rdx + 4*rax]
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+ ret
+; end mkl_serv_cpu_detect
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for Vector Math Library (VML)
+; version 14.0 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global mkl_vml_serv_cpu_detect
+
+SECTION .data
+; table of indicator values
+; Note: the table is different in 32 bit and 64 bit mode
+
+vmltab DD 0, 0, 0, 0 ; 0-3: generic version, 80386 instruction set
+ DD 1 ; 4: SSE2
+ DD 2 ; 5: SSE3
+ DD 3 ; 6: SSSE3
+ DD 3 ; 7: unused
+ DD 4 ; 8: SSE4.1
+ DD 4 ; 9: POPCNT
+ DD 5 ; 10: SSE4.2
+ DD 6 ; 11: AVX
+ DD 6 ; 12: PCLMUL, AES
+ DD 6 ; 13: AVX2
+ DD 7 ; 14: FMA3, BMI1/2, LZCNT
+; DD 7 ; 15: AVX512F
+vmltablen equ ($ - vmltab) / 4 ; length of table
+
+SECTION .text
+
+mkl_vml_serv_cpu_detect:
+ push rcx ; Perhaps not needed
+ push rdx
+ push r8
+ push r9
+%ifdef WINDOWS
+ push rsi
+ push rdi
+%endif
+ call InstructionSet
+ cmp eax, vmltablen
+ jb V100
+ mov eax, vmltablen - 1 ; limit to table length
+V100:
+ lea rdx, [rel vmltab]
+ mov eax, [rdx + 4*rax]
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+ ret
+; end mkl_vml_serv_cpu_detect
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Dispatcher for __intel_cpu_feature_indicator
+; version 13 and higher
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if 0 ; Don't include this!
+
+; __intel_cpu_features_init and __intel_cpu_features_init_x are
+; identical, except that the former checks the CPU brand, the
+; latter does not. Don't override this function. Instead, set
+; the indicator variables to 0 to force a re-evaluation,
+; and call __intel_cpu_features_init_x.
+; If you do want to override these functions then you must
+; save all registers.
+
+
+global __intel_cpu_features_init
+global __intel_cpu_features_init_x
+global __intel_cpu_feature_indicator
+global __intel_cpu_feature_indicator_x
+global __intel_cpu_fms_indicator
+global __intel_cpu_fms_indicator_x
+
+SECTION .data
+; table of indicator values
+
+intel_cpu_feature_indicator@:
+__intel_cpu_feature_indicator:
+__intel_cpu_feature_indicator_x DD 0, 0
+intel_cpu_fms_indicator@:
+__intel_cpu_fms_indicator:
+__intel_cpu_fms_indicator_x: DD 0, 0
+
+
+feattab DD 1 ; 0 default
+ DD 0BH ; 1 MMX
+ DD 0FH ; 2 conditional move and FCOMI supported
+ DD 3FH ; 3 SSE
+ DD 7FH ; 4 SSE2
+ DD 0FFH ; 5 SSE3
+ DD 1FFH, 1FFH ; 6 Supplementary SSE3
+ DD 3FFH ; 8 SSE4.1
+ DD 0BFFH ; 9 POPCNT
+ DD 0FFFH ; 10 SSE4.2
+ DD 10FFFH ; 11 AVX
+ DD 16FFFH ; 12 PCLMUL and AES
+ DD 816FFFH ; 13 AVX2
+ DD 9DEFFFH ; 14 FMA3, F16C, BMI1, BMI2, LZCNT
+ DD 0FDEFFFH ; 15 HLE, RTM
+
+feattablen equ ($ - feattab) / 4 ; length of table
+
+SECTION .text
+
+__intel_cpu_features_init:
+__intel_cpu_features_init_x:
+ push rcx
+ push rdx
+ push r8
+ push r9
+%ifdef WINDOWS
+ push rsi
+ push rdi
+%endif
+ call InstructionSet
+ cmp eax, feattablen
+ jb F100
+ mov eax, feattablen - 1 ; limit to table length
+F100:
+ lea rdx, [rel feattab]
+ mov ebx, [rdx + 4*rax] ; look up in table
+ push rbx
+ mov eax, 1
+ cpuid
+ pop rbx
+ bt ecx, 22 ; MOVBE
+ jnc F200
+ or ebx, 1000H
+F200: mov [intel_cpu_feature_indicator@], rbx
+
+ ; get family and model
+ mov edx, eax
+ and eax, 0FH ; stepping bit 0-3
+ mov ecx, edx
+ shr ecx, 4
+ and ecx, 0FH ; model
+ mov ebx, edx
+ shr ebx, 12
+ and ebx, 0F0H ; x model
+ or ecx, ebx ; full model
+ mov ah, cl ; model bit 8 - 15
+ mov ecx, edx
+ shr ecx, 8
+ and ecx, 0FH ; family
+ mov ebx, edx
+ shr ebx, 20
+ and ebx, 0FFH ; x family
+ add ecx, ebx ; full family
+ shl ecx, 16
+ or eax, ecx ; full family bit 16 - 23
+ mov [intel_cpu_fms_indicator@], eax
+
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+ ret
+; end __intel_cpu_features_init
+
+%endif
diff --git a/asmlibSrc/divfixedi32.asm b/asmlibSrc/divfixedi32.asm
new file mode 100755
index 0000000..ebb85a7
--- /dev/null
+++ b/asmlibSrc/divfixedi32.asm
@@ -0,0 +1,152 @@
+;************************* divfixedi32.asm *********************************
+; Author: Agner Fog
+; Date created: 2011-07-22
+; Last modified: 2011-07-22
+;
+; Function prototypes:
+; void setdivisori32(int buffer[2], int d);
+; int dividefixedi32(const int buffer[2], int x);
+; void setdivisoru32(uint32_t buffer[2], uint32_t d);
+; uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
+;
+; Description:
+; Functions for fast repeated integer division by the same divisor, signed
+; and unsigned 32-bit integer versions. The divisor must be positive.
+;
+; The setdivisor functions calculate the reciprocal divisor and shift counts,
+; the dividefixed functions do the division by multiplication and shift.
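+;
+; Usage sketch (illustrative, not part of the original source):
+; int buf[2];
+; setdivisori32(buf, 7); // precompute multiplier and shift count once
+; int q = dividefixedi32(buf, 100); // q = 14
+; int r = dividefixedi32(buf, 65); // r = 9, same buffer reused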
+;
+; The methods used are described by:
+; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
+;
+; Mathematical formula, unsigned division:
+; x = dividend
+; d = divisor
+; n = integer size, bits
+; L = ceil(log2(d))
+; m = 1 + 2^n * (2^L-d) / d [2^L should overflow to 0 if L = n]
+; sh1 = min(L,1)
+; sh2 = max(L-1,0)
+; t = m*x >> n [high part of unsigned multiplication]
+; x/d = (((x-t) >> sh1) + t) >> sh2
+;
+; Mathematical formula, signed division:
+; x = dividend
+; d = abs(divisor)
+; n = integer size, bits
+; L = ceil(log2(d))
+; L = max(L,1)
+; m = 1 + 2^(n+L-1)/d - 2^n [division should overflow to 0 if d = 1]
+; sh1 = L-1
+; q = x + (m*x >> n) [high part of signed multiplication]
+; q = (q >> sh1) - (x<0 ? -1 : 0)
+; if (divisor < 0) q = -q [negative divisor not supported in present implementation]
+; x/d = q
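+;
+; Worked example (added for clarity, not in the original source), n = 32, d = 7:
+; unsigned: L = 3, m = 1 + 2^32*(2^3-7)/7 = 613566757, sh1 = 1, sh2 = 2
+; x = 100: t = (m*x) >> 32 = 14, x/d = (((100-14) >> 1) + 14) >> 2 = 57 >> 2 = 14
+; signed: L = 3, m = 1 + 2^34/7 - 2^32 = -1840700269, sh1 = 2
+; x = 100: q = 100 + ((m*100) >> 32) = 100 - 43 = 57, x/d = 57 >> 2 = 14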
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+section .text
+
+; extern "C" void setdivisori32(int buffer[2], int d);
+; 32 bit signed
+
+global _setdivisori32: function
+_setdivisori32:
+ push ebx
+ mov ebx, [esp+12] ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0 (assuming bsr leaves dest unchanged if src = 0, this works on Intel, AMD and VIA processors)
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ js H120 ; d < 0. Generate error
+ inc ecx ; L = ceil(log2(d))
+ sub ecx, 1 ; shift count = L - 1
+ adc ecx, 0 ; avoid negative shift count
+ xor eax, eax
+ mov edx, 1
+ cmp ebx, edx
+ je H110 ; avoid division overflow when d = 1
+ shl edx, cl
+ div ebx
+H110: inc eax
+ mov ebx, [esp+8] ; buffer
+ mov [ebx], eax ; multiplier
+ mov [ebx+4], ecx ; shift count
+ pop ebx
+ ret
+
+H120: ; d <= 0 not supported. Generate error
+ mov edx, 1
+ div edx
+ ud2
+
+
+; extern "C" int dividefixedi32(int buffer[2], int x);
+global _dividefixedi32: function
+_dividefixedi32:
+ push ebx
+ mov eax, [esp+12] ; x
+ mov ecx, [esp+8] ; buffer
+ mov ebx, eax
+ imul dword [ecx] ; m
+ lea eax, [edx+ebx]
+ mov ecx, [ecx+4] ; shift count
+ sar eax, cl
+ sar ebx, 31 ; sign(x)
+ sub eax, ebx
+ pop ebx
+ ret
+
+
+;extern "C" void setdivisoru32(int buffer[2], int d);
+; 32 bit unsigned
+
+global _setdivisoru32: function
+_setdivisoru32:
+ push ebx
+ mov ebx, [esp+12] ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ inc ecx ; L = ceil(log2(d))
+ mov edx, 1
+ shl edx, cl ; 2^L
+ cmp cl, 20h
+ adc edx, -1 ; fix cl overflow, must give edx = 0
+ sub edx, ebx
+ xor eax, eax
+ div ebx
+ inc eax
+ mov ebx, [esp+8] ; buffer
+ mov [ebx], eax ; multiplier
+ sub ecx, 1
+ setae dl
+ movzx edx, dl ; shift1
+ seta al
+ neg al
+ and al,cl
+ movzx eax, al ; shift 2
+ shl eax, 8
+ or eax, edx
+ mov [ebx+4], eax ; shift 1 and shift 2
+ pop ebx
+ ret
+
+;extern "C" int dividefixedu32(int buffer[2], int x);
+global _dividefixedu32: function ; unsigned
+_dividefixedu32:
+ mov eax, [esp+8] ; x
+ mov ecx, [esp+4] ; buffer
+ mul dword [ecx] ; m
+ mov eax, [esp+8] ; x
+ sub eax, edx ; x-t
+ mov ecx, [ecx+4] ; shift 1 and shift 2
+ shr eax, cl
+ add eax, edx
+ shr ecx, 8
+ shr eax, cl
+ ret
diff --git a/asmlibSrc/divfixedi64.asm b/asmlibSrc/divfixedi64.asm
new file mode 100755
index 0000000..4e52d31
--- /dev/null
+++ b/asmlibSrc/divfixedi64.asm
@@ -0,0 +1,171 @@
+;************************* divfixedi64.asm *********************************
+; Author: Agner Fog
+; Date created: 2011-07-22
+; Last modified: 2011-07-22
+;
+; Function prototypes:
+; void setdivisori32(int buffer[2], int d);
+; int dividefixedi32(const int buffer[2], int x);
+; void setdivisoru32(uint32_t buffer[2], uint32_t d);
+; uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x);
+;
+; Description:
+; Functions for fast repeated integer division by the same divisor, signed
+; and unsigned 32-bit integer versions. The divisor must be positive.
+;
+; The setdivisor functions calculate the reciprocal divisor and shift counts,
+; the dividefixed functions do the division by multiplication and shift.
+;
+; The methods used are described by:
+; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
+;
+; Mathematical formula, unsigned division:
+; x = dividend
+; d = divisor
+; n = integer size, bits
+; L = ceil(log2(d))
+; m = 1 + 2^n * (2^L-d) / d [2^L should overflow to 0 if L = n]
+; sh1 = min(L,1)
+; sh2 = max(L-1,0)
+; t = m*x >> n [high part of unsigned multiplication]
+; x/d = (((x-t) >> sh1) + t) >> sh2
+;
+; Mathematical formula, signed division:
+; x = dividend
+; d = abs(divisor)
+; n = integer size, bits
+; L = ceil(log2(d))
+; L = max(L,1)
+; m = 1 + 2^(n+L-1)/d - 2^n [division should overflow to 0 if d = 1]
+; sh1 = L-1
+; q = x + (m*x >> n) [high part of signed multiplication]
+; q = (q >> sh1) - (x<0 ? -1 : 0)
+; if (divisor < 0) q = -q [negative divisor not supported in present implementation]
+; x/d = q
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+%IFDEF WINDOWS
+%define par1 rcx ; function parameter 1
+%define par2 edx ; function parameter 2
+%define buf r9 ; copy of function parameter 1: buffer
+%define rx r8
+%define rxd r8d ; d or x
+%ELSE ; UNIX
+%define par1 rdi ; function parameter 1
+%define par2 esi ; function parameter 2
+%define buf rdi ; function parameter 1: buffer
+%define rx rsi
+%define rxd esi ; d or x
+%ENDIF
+
+
+section .text
+
+; extern "C" void setdivisori32(int buffer[2], int d);
+; 32 bit signed
+
+global setdivisori32: function
+setdivisori32:
+%IFDEF WINDOWS
+ mov rxd, edx ; x
+ mov buf, rcx ; buffer
+%ENDIF
+ dec rxd ; rxd = r8d or esi
+ mov ecx, -1 ; value for bsr if rxd = 0 (assuming bsr leaves dest unchanged if src = 0, this works on Intel, AMD and VIA processors)
+ bsr ecx, rxd ; floor(log2(d-1))
+ inc rxd
+ js H120 ; d < 0. Generate error
+ inc ecx ; L = ceil(log2(d))
+ sub ecx, 1 ; shift count = L - 1
+ adc ecx, 0 ; avoid negative shift count
+ xor eax, eax
+ mov edx, 1
+ cmp rxd, edx
+ je H110 ; avoid overflow when d = 1
+ shl edx, cl
+ div rxd
+H110: inc eax
+ mov [buf], eax ; multiplier
+ mov [buf+4], ecx ; shift count
+ ret
+
+H120: ; d <= 0 not supported. Generate error
+ mov edx, 1
+ div edx ; will overflow
+ ud2
+
+
+; extern "C" int dividefixedi32(int buffer[2], int x);
+global dividefixedi32: function
+dividefixedi32:
+%IFDEF WINDOWS
+ mov eax, edx
+ mov rxd, edx ; x
+ mov buf, rcx ; buffer
+%ELSE
+ mov eax, esi
+%ENDIF
+ imul dword [buf] ; m
+ lea eax, [rdx+rx] ; rx = r8 or rsi
+ mov ecx, [buf+4] ; shift count
+ sar eax, cl
+ sar rxd, 31 ; sign(x)
+ sub eax, rxd
+ ret
+
+
+;extern "C" void setdivisoru32(int buffer[2], int d);
+; 32 bit unsigned
+
+global setdivisoru32: function
+setdivisoru32:
+%IFDEF WINDOWS
+ mov rxd, edx ; x
+ mov buf, rcx ; buffer
+%ENDIF
+ dec rxd ; rxd = r8d or esi
+ mov ecx, -1 ; value for bsr if r8d = 0
+ bsr ecx, rxd ; floor(log2(d-1))
+ inc rxd
+ inc ecx ; L = ceil(log2(d))
+ mov edx, 1
+ shl rdx, cl ; 2^L (64 bit shift because cl may be 32)
+ sub edx, rxd
+ xor eax, eax
+ div rxd
+ inc eax
+ mov [buf], eax ; multiplier
+ sub ecx, 1
+ setae dl
+ movzx edx, dl ; shift1
+ seta al
+ neg al
+ and al,cl
+ movzx eax, al ; shift 2
+ shl eax, 8
+ or eax, edx
+ mov [buf+4], eax ; shift 1 and shift 2
+ ret
+
+;extern "C" int dividefixedu32(int buffer[2], int x);
+global dividefixedu32: function ; unsigned
+dividefixedu32:
+%IFDEF WINDOWS
+ mov eax, edx
+ mov rxd, edx ; x
+ mov buf, rcx ; buffer
+%ELSE
+ mov eax, esi
+%ENDIF
+ mul dword [buf] ; m
+ sub rxd, edx ; x-t
+ mov ecx, [buf+4] ; shift 1 and shift 2
+ shr rxd, cl
+ lea eax, [rx+rdx]
+ shr ecx, 8
+ shr eax, cl
+ ret
diff --git a/asmlibSrc/divfixedv32.asm b/asmlibSrc/divfixedv32.asm
new file mode 100755
index 0000000..c3c6294
--- /dev/null
+++ b/asmlibSrc/divfixedv32.asm
@@ -0,0 +1,490 @@
+;************************* divfixedv32.asm *********************************
+; Author: Agner Fog
+; Date created: 2011-07-25
+; Last modified: 2012-03-10
+;
+; Function prototypes:
+; void setdivisorV8i16(__m128i buf[2], int16_t d);
+; void setdivisorV8u16(__m128i buf[2], uint16_t d);
+; void setdivisorV4i32(__m128i buf[2], int32_t d);
+; void setdivisorV4u32(__m128i buf[2], uint32_t d);
+;
+; __m128i dividefixedV8i16(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV4i32(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
+;
+; Alternative versions for VectorClass.h:
+; (These versions pack all parameters into a single register)
+; __m128i setdivisor8s(int16_t d);
+; __m128i setdivisor8us(uint16_t d);
+; __m128i setdivisor4i(int32_t d);
+; __m128i setdivisor4ui(uint32_t d);
+;
+; Description:
+; Functions for integer vector division by the same divisor, signed
+; and unsigned 16-bit and 32-bit integer versions.
+;
+; The setdivisor functions calculate the reciprocal divisor and shift counts,
+; the dividefixed functions do the division by multiplication and shift of the
+; vector elements of packed 16-bit or 32-bit signed or unsigned integers.
+;
+; The divisor must be positive. A zero divisor generates a divide by zero error.
+; A negative divisor generates a division overflow error. To divide by a negative
+; divisor, change the sign of the divisor and the result.
+;
+; The methods used are described in this article:
+; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
+;
+; Mathematical formula, unsigned division:
+; x = dividend
+; d = divisor
+; n = integer size, bits
+; L = ceil(log2(d))
+; m = 1 + 2^n * (2^L-d) / d [2^L should overflow to 0 if L = n]
+; sh1 = min(L,1)
+; sh2 = max(L-1,0)
+; t = m*x >> n [high part of unsigned multiplication]
+; x/d = (((x-t) >> sh1) + t) >> sh2
+;
+; Mathematical formula, signed division:
+; x = dividend
+; d = abs(divisor)
+; n = integer size, bits
+; L = ceil(log2(d))
+; L = max(L,1)
+; m = 1 + 2^(n+L-1)/d - 2^n [division should overflow to 0 if d = 1]
+; sh1 = L-1
+; q = x + (m*x >> n) [high part of signed multiplication]
+; q = (q >> sh1) - (x<0 ? -1 : 0)
+; if (divisor < 0) q = -q [negative divisor not supported in present implementation]
+; x/d = q
+;
+; Copyright (c) 2011 - 2012 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+; Imported from instrset32.asm:
+extern _InstructionSet ; Instruction set for CPU dispatcher
+
+section .text align = 16
+
+;******************************************************************************
+; 16 bit signed integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor8s(int16_t d);
+; vector of 8 x 16 bit signed integers
+
+global _setdivisor8s: function
+_setdivisor8s:
+ push ebx
+ movsx ebx, word [esp+8] ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ js H120 ; Generate error if d < 0. (error for d=0 will come in the div instruction)
+ inc ecx ; L = ceil(log2(d))
+ sub ecx, 1 ; shift count = L - 1
+ adc ecx, 0 ; avoid negative shift count
+ xor eax, eax
+ mov edx, 1
+ cmp ebx, edx
+ je H110 ; avoid division overflow when d = 1
+ shl edx, cl
+ div bx ; 2^(16+L-1)/d
+H110: inc eax
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 0 ; broadcast into lower 4 words
+ movd xmm1, ecx ; shift count
+ punpcklqdq xmm0, xmm1 ; insert shift count into upper half
+ pop ebx
+ ret
+
+H120: ; d < 0 not supported. Generate error
+ mov edx, 1
+ div edx
+ ud2
+; _setdivisor8s end
+
+; extern "C" void setdivisorV8i16(__m128i buf[2], int16_t d);
+; vector of 8 x 16 bit signed integers
+
+global _setdivisorV8i16: function
+_setdivisorV8i16:
+ mov eax, dword [esp+8] ; d
+ push eax
+ call _setdivisor8s
+ pop ecx
+ mov eax, dword [esp+4] ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [eax], xmm0 ; multiplier
+ movdqa [eax+16], xmm1 ; shift count is still in xmm1
+ ret
+; _setdivisorV8i16 end
+
+
+; extern "C" int dividefixedV8i16(const __m128i buf[2], __m128i x);
+global _dividefixedV8i16: function
+
+align 16
+_dividefixedV8i16:
+ mov ecx, [esp+4] ; buffer
+ movdqa xmm1, xmm0 ; x
+ pmulhw xmm0, [ecx] ; multiply high signed words
+ paddw xmm0, xmm1
+ movd xmm2, [ecx+16] ; shift count
+ psraw xmm0, xmm2 ; shift right arithmetic
+ psraw xmm1, 15 ; sign of x
+ psubw xmm0, xmm1
+ ret
+;_dividefixedV8i16 end
+
+
+
+;******************************************************************************
+; 16 bit unsigned integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor8us(uint16_t d);
+; vector of 8 x 16 bit unsigned integers
+
+global _setdivisor8us: function
+_setdivisor8us:
+ push ebx
+ movzx ebx, word [esp+8] ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ inc ecx ; L = ceil(log2(d))
+ mov edx, 1
+ shl edx, cl ; 2^L [32-bit shift to allow overflow]
+ sub edx, ebx
+ xor eax, eax
+ div bx
+ inc eax
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 0 ; broadcast into lower 4 words
+ sub ecx, 1
+ setae dl
+ movzx edx, dl ; shift1
+ seta al
+ neg al
+ and al,cl
+ movzx eax, al ; shift 2
+ movd xmm1, edx ; shift 1
+ movd xmm2, eax ; shift 2
+ punpckldq xmm1, xmm2 ; combine into two dwords
+ punpcklqdq xmm0, xmm1 ; multipliers, shift1, shift2
+ pop ebx
+ ret
+; _setdivisor8us
+
+;extern "C" void setdivisorV8u16(__m128i buf[2], uint16_t d);
+; 8 x 16 bit unsigned
+
+global _setdivisorV8u16: function
+_setdivisorV8u16:
+ mov eax, dword [esp+8] ; d
+ push eax
+ call _setdivisor8us
+ pop ecx
+ mov eax, dword [esp+4] ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [eax], xmm0 ; multiplier
+ movdqa [eax+16], xmm1 ; shift counts are still in xmm1
+ ret
+; _setdivisorV8u16 end
+
+
+;extern "C" __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
+global _dividefixedV8u16: function
+
+align 16
+_dividefixedV8u16:
+ mov ecx, [esp+4] ; buffer
+ movdqa xmm1, xmm0 ; x
+ pmulhuw xmm0, [ecx] ; multiply high unsigned words
+ psubw xmm1, xmm0
+ movd xmm2, [ecx+16] ; shift1
+ psrlw xmm1, xmm2
+ paddw xmm0, xmm1
+ movd xmm2, [ecx+20] ; shift2
+ psrlw xmm0, xmm2
+ ret
+; _dividefixedV8u16 end
+
+
+
+;******************************************************************************
+; 32 bit signed integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor4i(int32_t d);
+; vector of 4 x 32 bit signed integers
+
+align 16
+global _setdivisor4i: function
+_setdivisor4i:
+ push ebx
+ mov ebx, [esp+8] ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ js K120 ; Generate error if d < 0. (error for d=0 will come in the div instruction)
+ inc ecx ; L = ceil(log2(d))
+ sub ecx, 1 ; shift count = L - 1
+ adc ecx, 0 ; avoid negative shift count
+ xor eax, eax
+ mov edx, 1
+ cmp ebx, edx
+ je K110 ; avoid division overflow when d = 1
+ shl edx, cl
+ div ebx ; 2^(32+L-1)/d
+K110: inc eax
+ movd xmm0, eax ; multiplier
+ pshufd xmm0, xmm0, 0 ; broadcast into 4 dwords
+ movd xmm1, ecx ; shift count
+ punpcklqdq xmm0, xmm1 ; insert shift count into upper half
+ pop ebx
+ ret
+
+K120: ; d < 0 not supported. Generate error
+ mov edx, 1
+ div edx
+ ud2
+; _setdivisor4i end
+
+
+; extern "C" void setdivisorV4i32(__m128i buf[2], int32_t d);
+; vector of 4 x 32 bit signed integers
+
+global _setdivisorV4i32: function
+_setdivisorV4i32:
+ mov eax, dword [esp+8] ; d
+ push eax
+ call _setdivisor4i
+ pop ecx
+ mov eax, dword [esp+4] ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [eax], xmm0 ; multiplier
+ movdqa [eax+16], xmm1 ; shift counts are still in xmm1
+ ret
+; _setdivisorV4i32 end
+
+
+; extern "C" int dividefixedV4i32(const __m128i buf[2], __m128i x);
+global _dividefixedV4i32: function
+
+; Direct entries to CPU-specific versions
+global _dividefixedV4i32SSE2: function
+global _dividefixedV4i32SSE41: function
+
+align 8
+_dividefixedV4i32: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+ jmp near [dividefixedV4i32Dispatch] ; Go to appropriate version, depending on instruction set
+%ELSE ; Position-independent code
+ call get_thunk_edx ; get reference point for position-independent code
+RP1: ; reference point edx = offset RP1
+; Make the following instruction with address relative to RP1:
+ jmp near [edx+dividefixedV4i32Dispatch-RP1]
+%ENDIF
+
+align 16
+_dividefixedV4i32SSE41:
+ mov ecx, [esp+4] ; buffer
+ movdqa xmm1, xmm0 ; x
+ movdqa xmm2, xmm0 ; x
+ movdqa xmm3, [ecx] ; multiplier
+ pmuldq xmm0, xmm3 ; 32 x 32 -> 64 bit signed multiplication of x[0] and x[2]
+ psrlq xmm0, 32 ; high dword of result 0 and 2
+ psrlq xmm1, 32 ; get x[1] and x[3] into position for multiplication
+ pmuldq xmm1, xmm3 ; 32 x 32 -> 64 bit signed multiplication of x[1] and x[3]
+ pcmpeqd xmm3, xmm3
+ psllq xmm3, 32 ; generate mask of dword 1 and 3
+ pand xmm1, xmm3 ; high dword of result 1 and 3
+ por xmm0, xmm1 ; combine all four results into one vector
+ paddd xmm0, xmm2
+ movd xmm3, [ecx+16] ; shift count
+ psrad xmm0, xmm3 ; shift right arithmetic
+ psrad xmm2, 31 ; sign of x
+ psubd xmm0, xmm2
+ ret
+;_dividefixedV4i32SSE41 end
+
+
+_dividefixedV4i32SSE2:
+; I have tried to change the sign and use pmuludq, but got a rounding error (gives 9/10 = 1).
+; This solution, with 4 separate multiplications, is probably faster anyway despite the store forwarding stall
+ push ebp
+ mov ebp, esp
+ sub esp, 16
+ and esp, -16 ; make aligned stack space
+ movdqa [esp], xmm0 ; store x
+ movdqa xmm2, xmm0 ; x
+ mov ecx, [ebp+8] ; buffer
+ mov ecx, [ecx] ; multiplier
+ ; do four signed high multiplications
+ mov eax, [esp]
+ imul ecx
+ mov [esp], edx
+ mov eax, [esp+4]
+ imul ecx
+ mov [esp+4], edx
+ mov eax, [esp+8]
+ imul ecx
+ mov [esp+8], edx
+ mov eax, [esp+12]
+ imul ecx
+ mov [esp+12], edx
+ movdqa xmm0, [esp] ; x*m vector
+ mov ecx, [ebp+8] ; buffer
+ paddd xmm0, xmm2
+ movd xmm3, [ecx+16] ; shift count
+ psrad xmm0, xmm3 ; shift right arithmetic
+ psrad xmm2, 31 ; sign of x
+ psubd xmm0, xmm2
+ mov esp, ebp
+ pop ebp
+ ret
+;_dividefixedV4i32SSE2 end
+
+
+; ********************************************************************************
+; CPU dispatching for _dividefixedV4i32. This is executed only once
+; ********************************************************************************
+
+dividefixedV4i32CPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ ; get supported instruction set
+ call _InstructionSet
+ ; Point to generic version
+ mov ecx, _dividefixedV4i32SSE2
+ cmp eax, 8 ; check if PMULDQ supported
+ jb Q100
+ ; SSE4.1 supported
+ ; Point to SSE4.1 version of dividefixedV4i32
+ mov ecx, _dividefixedV4i32SSE41
+Q100: mov [dividefixedV4i32Dispatch], ecx
+ ; Continue in appropriate version
+ jmp ecx
+
+%ELSE ; Position-independent version
+ ; get supported instruction set
+ call _InstructionSet
+ call get_thunk_edx
+RP10: ; reference point edx
+ ; Point to generic version
+ lea ecx, [edx+_dividefixedV4i32SSE2-RP10]
+ cmp eax, 8 ; check if PMULDQ supported
+ jb Q100
+ ; SSE4.1 supported
+ ; Point to SSE4.1 version of dividefixedV4i32
+ lea ecx, [edx+_dividefixedV4i32SSE41-RP10]
+Q100: mov [edx+dividefixedV4i32Dispatch-RP10], ecx
+ ; Continue in appropriate version
+ jmp ecx
+
+get_thunk_edx: ; load caller address into edx for position-independent code
+ mov edx, [esp]
+ ret
+%ENDIF
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+dividefixedV4i32Dispatch DD dividefixedV4i32CPUDispatch
+
+section .text
+
+
+
+;******************************************************************************
+; 32 bit unsigned integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor4ui(uint32_t d);
+; vector of 4 x 32 bit unsigned integers
+
+align 16
+global _setdivisor4ui: function
+_setdivisor4ui:
+ push ebx
+ mov ebx, [esp+8] ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ inc ecx ; L = ceil(log2(d))
+ mov edx, 1
+ shl edx, cl ; 2^L
+ cmp cl, 20h
+ adc edx, -1 ; fix cl overflow, must give edx = 0
+ sub edx, ebx
+ xor eax, eax
+ div ebx
+ inc eax
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 ; broadcast into 4 dwords
+ sub ecx, 1
+ setae dl
+ movzx edx, dl ; shift1
+ seta al
+ neg al
+ and al,cl
+ movzx eax, al ; shift 2
+ movd xmm1, edx ; shift 1
+ movd xmm2, eax ; shift 2
+ punpckldq xmm1, xmm2 ; combine into two dwords
+ punpcklqdq xmm0, xmm1 ; multipliers, shift1, shift2
+ pop ebx
+ ret
+; _setdivisor4ui end
+
+;extern "C" void setdivisorV4u32(__m128i buf[2], uint32_t d);
+; 4 x 32 bit unsigned
+
+global _setdivisorV4u32: function
+_setdivisorV4u32:
+ mov eax, dword [esp+8] ; d
+ push eax
+ call _setdivisor4ui
+ pop ecx
+ mov eax, dword [esp+4] ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [eax], xmm0 ; multiplier
+ movdqa [eax+16], xmm1 ; shift counts are still in xmm1
+ ret
+; _setdivisorV4u32 end
+
+;extern "C" __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
+global _dividefixedV4u32: function
+
+align 16
+_dividefixedV4u32:
+ mov ecx, [esp+4] ; buffer
+ movdqa xmm1, xmm0 ; x
+ movdqa xmm2, xmm0 ; x
+ movdqa xmm3, [ecx] ; multiplier
+ pmuludq xmm0, xmm3 ; 32 x 32 -> 64 bit unsigned multiplication of x[0] and x[2]
+ psrlq xmm0, 32 ; high dword of result 0 and 2
+ psrlq xmm1, 32 ; get x[1] and x[3] into position for multiplication
+ pmuludq xmm1, xmm3 ; 32 x 32 -> 64 bit unsigned multiplication of x[1] and x[3]
+ pcmpeqd xmm3, xmm3
+ psllq xmm3, 32 ; generate mask of dword 1 and 3
+ pand xmm1, xmm3 ; high dword of result 1 and 3
+ por xmm0, xmm1 ; combine all four results into one vector
+ psubd xmm2, xmm0
+ movd xmm3, [ecx+16] ; shift1
+ psrld xmm2, xmm3
+ paddd xmm0, xmm2
+ movd xmm3, [ecx+20] ; shift2
+ psrld xmm0, xmm3
+ ret
+;_dividefixedV4u32 end
diff --git a/asmlibSrc/divfixedv64.asm b/asmlibSrc/divfixedv64.asm
new file mode 100755
index 0000000..145b125
--- /dev/null
+++ b/asmlibSrc/divfixedv64.asm
@@ -0,0 +1,496 @@
+;************************* divfixedv64.asm *********************************
+; Author: Agner Fog
+; Date created: 2011-07-25
+; Last modified: 2012-03-10
+;
+; Function prototypes:
+; void setdivisorV8i16(__m128i buf[2], int16_t d);
+; void setdivisorV8u16(__m128i buf[2], uint16_t d);
+; void setdivisorV4i32(__m128i buf[2], int32_t d);
+; void setdivisorV4u32(__m128i buf[2], uint32_t d);
+;
+; __m128i dividefixedV8i16(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV4i32(const __m128i buf[2], __m128i x);
+; __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
+;
+; Alternative versions for VectorClass.h:
+; (These versions pack all parameters into a single register)
+; __m128i setdivisor8s(int16_t d);
+; __m128i setdivisor8us(uint16_t d);
+; __m128i setdivisor4i(int32_t d);
+; __m128i setdivisor4ui(uint32_t d);
+;
+; Description:
+; Functions for integer vector division by the same divisor, signed
+; and unsigned 16-bit and 32-bit integer versions.
+;
+; The setdivisor functions calculate the reciprocal divisor and shift counts,
+; the dividefixed functions do the division by multiplication and shift of the
+; vector elements of packed 16-bit or 32-bit signed or unsigned integers.
+;
+; The divisor must be positive. A zero divisor generates a divide by zero error.
+; A negative divisor generates a division overflow error. To divide by a negative
+; divisor, change the sign of the divisor and the result.
+;
+; The methods used are described in this article:
+; T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+; Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+; http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556
+;
+; Mathematical formula, unsigned division:
+; x = dividend
+; d = divisor
+; n = integer size, bits
+; L = ceil(log2(d))
+; m = 1 + 2^n * (2^L-d) / d [2^L should overflow to 0 if L = n]
+; sh1 = min(L,1)
+; sh2 = max(L-1,0)
+; t = m*x >> n [high part of unsigned multiplication]
+; x/d = (((x-t) >> sh1) + t) >> sh2
+;
+; Mathematical formula, signed division:
+; x = dividend
+; d = abs(divisor)
+; n = integer size, bits
+; L = ceil(log2(d))
+; L = max(L,1)
+; m = 1 + 2^(n+L-1)/d - 2^n [division should overflow to 0 if d = 1]
+; sh1 = L-1
+; q = x + (m*x >> n) [high part of signed multiplication]
+; q = (q >> sh1) - (x<0 ? -1 : 0)
+; if (divisor < 0) q = -q [negative divisor not supported in present implementation]
+; x/d = q
+;
+; Copyright (c) 2011 - 2012 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
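[Editorial note] A scalar sketch of the unsigned formula above, written out in C++ for n = 32. This is an illustration of the math only, not part of the upstream source; the type UDivConst and the function names are made up for the example.

    #include <cassert>
    #include <cstdint>

    struct UDivConst { uint32_t m; int sh1, sh2; };

    // setdivisor step: compute multiplier m and shift counts for divisor d > 0
    UDivConst set_udivisor(uint32_t d)
    {
        int L = 0;
        while ((uint64_t(1) << L) < d) L++;                         // L = ceil(log2(d))
        uint64_t m = 1 + ((((uint64_t(1) << L) - d) << 32) / d);    // 64-bit math stands in for the mod-2^32 wrap
        return { uint32_t(m), L < 1 ? L : 1, L > 1 ? L - 1 : 0 };   // sh1 = min(L,1), sh2 = max(L-1,0)
    }

    // dividefixed step: t = high part of m*x, then two shifts
    uint32_t udiv_by_const(uint32_t x, UDivConst c)
    {
        uint32_t t = uint32_t((uint64_t(c.m) * x) >> 32);
        return (((x - t) >> c.sh1) + t) >> c.sh2;
    }

    int main()
    {
        for (uint32_t d : {1u, 7u, 10u, 100u, 641u})
            for (uint32_t x : {0u, 9u, 99u, 1000u, 0xFFFFFFFFu})
                assert(udiv_by_const(x, set_udivisor(d)) == x / d);
    }

The signed variant follows the second formula above: q = ((x + high(m*x)) >> sh1) - (x < 0 ? -1 : 0), which truncates toward zero.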
+default rel
+
+%IFDEF WINDOWS
+%define par1 rcx ; function parameter 1
+%define par1d ecx
+%define par1w cx
+%define par2 rdx ; function parameter 2
+%define par2d edx
+%define par2w dx
+%define buf r8 ; pointer to buffer
+%ENDIF
+%IFDEF UNIX
+%define par1 rdi ; function parameter 1
+%define par1d edi
+%define par1w di
+%define par2 rsi ; function parameter 2
+%define par2d esi
+%define par2w si
+%define buf rdi ; pointer to buffer
+%ENDIF
+
+
+; Imported from instrset64.asm:
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+section .text align = 16
+
+;******************************************************************************
+; 16 bit signed integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor8s(int16_t d);
+; vector of 8 x 16 bit signed integers
+
+global setdivisor8s: function
+setdivisor8s:
+ push rbx
+ movsx ebx, par1w ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ js H120 ; Generate error if d < 0. (error for d=0 will come in the div instruction)
+ inc ecx ; L = ceil(log2(d))
+ sub ecx, 1 ; shift count = L - 1
+ adc ecx, 0 ; avoid negative shift count
+ xor eax, eax
+ mov edx, 1
+ cmp ebx, edx
+ je H110 ; avoid division overflow when d = 1
+ shl edx, cl
+ div bx ; 2^(16+L-1)/d
+H110: inc eax
+ movd xmm0, eax ; multiplier
+ pshuflw xmm0, xmm0, 0 ; broadcast into lower 4 words
+ movd xmm1, ecx ; shift count
+ punpcklqdq xmm0, xmm1 ; insert shift count into upper half
+ pop rbx
+ ret
+H120: ; d < 0 not supported. Generate error
+ mov edx, 1
+ div edx
+ ud2
+; setdivisor8s end
+
+
+; extern "C" void setdivisorV8i16(__m128i buf[2], int16_t d);
+; vector of 8 x 16 bit signed integers
+
+global setdivisorV8i16: function
+setdivisorV8i16:
+ push par1 ; buf
+ mov par1d, par2d ; d
+ call setdivisor8s
+ pop rax ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [rax], xmm0 ; multiplier
+ movdqa [rax+16], xmm1 ; shift count is still in xmm1
+ ret
+; setdivisorV8i16 end
+
+
+; extern "C" int dividefixedV8i16(const __m128i buf[2], __m128i x);
+global dividefixedV8i16: function
+
+dividefixedV8i16:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF WINDOWS
+ movdqa xmm0, [par2] ; x
+%ENDIF
+ movdqa xmm1, xmm0 ; x
+ pmulhw xmm0, [par1] ; multiply high signed words
+ paddw xmm0, xmm1
+ movd xmm2, [par1+16] ; shift count
+ psraw xmm0, xmm2 ; shift right arithmetic
+ psraw xmm1, 15 ; sign of x
+ psubw xmm0, xmm1
+ ret
+;dividefixedV8i16 end
+
+
+
+;******************************************************************************
+; 16 bit unsigned integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor8us(uint16_t d);
+; vector of 8 x 16 bit unsigned integers
+
+align 16
+global setdivisor8us: function
+setdivisor8us:
+ push rbx
+ movzx ebx, par1w ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ inc ecx ; L = ceil(log2(d))
+ mov edx, 1
+ shl edx, cl ; 2^L [32-bit shift to allow overflow]
+ sub edx, ebx
+ xor eax, eax
+ div bx
+ inc eax
+ movd xmm0, eax
+ pshuflw xmm0, xmm0, 0 ; broadcast into lower 4 words
+ sub ecx, 1
+ setae dl
+ movzx edx, dl ; shift 1
+ seta al
+ neg al
+ and al,cl
+ movzx eax, al ; shift 2
+ movd xmm1, edx ; shift 1
+ movd xmm2, eax ; shift 2
+ punpckldq xmm1, xmm2 ; combine into two dwords
+ punpcklqdq xmm0, xmm1 ; multipliers, shift1, shift2
+ pop rbx
+ ret
+; setdivisor8us end
+
+
+;extern "C" void setdivisorV8u16(__m128i buf[2], uint16_t d);
+; 8 x 16 bit unsigned
+
+global setdivisorV8u16: function
+setdivisorV8u16:
+ push par1 ; buf
+ mov par1d, par2d ; d
+ call setdivisor8us
+ pop rax ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [rax], xmm0 ; multiplier
+ movdqa [rax+16], xmm1 ; shift counts are still in xmm1
+ ret
+; setdivisorV8u16 end
+
+
+;extern "C" __m128i dividefixedV8u16(const __m128i buf[2], __m128i x);
+global dividefixedV8u16: function
+
+align 16
+dividefixedV8u16:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF WINDOWS
+ movdqa xmm0, [par2] ; x
+%ENDIF
+ movdqa xmm1, xmm0 ; x
+ pmulhuw xmm0, [par1] ; multiply high unsigned words
+ psubw xmm1, xmm0
+ movd xmm2, [par1+16] ; shift1
+ psrlw xmm1, xmm2
+ paddw xmm0, xmm1
+ movd xmm2, [par1+20] ; shift2
+ psrlw xmm0, xmm2
+ ret
+;dividefixedV8u16 end
+
+
+
+;******************************************************************************
+; 32 bit signed integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor4i(int32_t d);
+; vector of 4 x 32 bit signed integers
+
+align 16
+global setdivisor4i: function
+setdivisor4i:
+ push rbx
+ mov ebx, par1d ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ js K120 ; Generate error if d < 0. (error for d=0 will come in the div instruction)
+ inc ecx ; L = ceil(log2(d))
+ sub ecx, 1 ; shift count = L - 1
+ adc ecx, 0 ; avoid negative shift count
+ xor eax, eax
+ mov edx, 1
+ cmp ebx, edx
+ je K110 ; avoid division overflow when d = 1
+ shl edx, cl
+ div ebx ; 2^(32+L-1)/d
+K110: inc eax
+ movd xmm0, eax ; multiplier
+ pshufd xmm0, xmm0, 0 ; broadcast into 4 dwords
+ movd xmm1, ecx ; shift count
+ punpcklqdq xmm0, xmm1 ; insert shift count into upper half
+ pop rbx
+ ret
+
+K120: ; d < 0 not supported. Generate error
+ mov edx, 1
+ div edx
+ ud2
+; setdivisor4i end
+
+
+; extern "C" void setdivisorV4i32(__m128i buf[2], int32_t d);
+; vector of 4 x 32 bit signed integers
+
+global setdivisorV4i32: function
+setdivisorV4i32:
+ push par1 ; buf
+ mov par1d, par2d ; d
+ call setdivisor4i
+ pop rax ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [rax], xmm0 ; multiplier
+ movdqa [rax+16], xmm1 ; shift count is still in xmm1
+ ret
+; setdivisorV4i32 end
+
+
+; extern "C" int dividefixedV4i32(const __m128i buf[2], __m128i x);
+global dividefixedV4i32: function
+
+; Direct entries to CPU-specific versions
+global dividefixedV4i32SSE2: function
+global dividefixedV4i32SSE41: function
+
+align 8
+dividefixedV4i32: ; function dispatching
+ jmp near [dividefixedV4i32Dispatch] ; Go to appropriate version, depending on instruction set
+
+align 16
+dividefixedV4i32SSE41:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF WINDOWS
+ movdqa xmm0,[par2] ; x
+%ENDIF
+ movdqa xmm1, xmm0 ; x
+ movdqa xmm2, xmm0 ; x
+ movdqa xmm3, [par1] ; multiplier
+ pmuldq xmm0, xmm3 ; 32 x 32 -> 64 bit signed multiplication of x[0] and x[2]
+ psrlq xmm0, 32 ; high dword of result 0 and 2
+ psrlq xmm1, 32 ; get x[1] and x[3] into position for multiplication
+ pmuldq xmm1, xmm3 ; 32 x 32 -> 64 bit signed multiplication of x[1] and x[3]
+ pcmpeqd xmm3, xmm3
+ psllq xmm3, 32 ; generate mask of dword 1 and 3
+ pand xmm1, xmm3 ; high dword of result 1 and 3
+ por xmm0, xmm1 ; combine all four results into one vector
+ paddd xmm0, xmm2
+ movd xmm3, [par1+16] ; shift count
+ psrad xmm0, xmm3 ; shift right arithmetic
+ psrad xmm2, 31 ; sign of x
+ psubd xmm0, xmm2
+ ret
+;dividefixedV4i32SSE41 end
+
+dividefixedV4i32SSE2:
+; I have tried to change the sign and use pmuludq, but got a rounding error (gives 9/10 = 1).
+; This solution, with 4 separate multiplications, is probably faster anyway despite the store forwarding stall
+ push rbp
+ mov rbp, rsp
+%IFDEF WINDOWS
+ movdqa xmm0,[par2] ; x
+ mov buf, par1
+%ENDIF
+ sub rsp, 16 ; allocate stack space
+ and rsp, -16 ; stack should be aligned already. align anyway to be safe
+ movdqa [rsp], xmm0 ; store x
+ movdqa xmm2, xmm0 ; x
+ mov ecx, [buf] ; multiplier
+ ; do four signed high multiplications
+ mov eax, [rsp]
+ imul ecx
+ mov [rsp], edx
+ mov eax, [rsp+4]
+ imul ecx
+ mov [rsp+4], edx
+ mov eax, [rsp+8]
+ imul ecx
+ mov [rsp+8], edx
+ mov eax, [rsp+12]
+ imul ecx
+ mov [rsp+12], edx
+ movdqa xmm0, [rsp] ; x*m vector
+ paddd xmm0, xmm2
+ movd xmm3, [buf+16] ; shift count
+ psrad xmm0, xmm3 ; shift right arithmetic
+ psrad xmm2, 31 ; sign of x
+ psubd xmm0, xmm2
+ mov rsp, rbp
+ pop rbp
+ ret
+;dividefixedV4i32SSE2 end
+
+
+; ********************************************************************************
+; CPU dispatching for dividefixedV4i32. This is executed only once
+; ********************************************************************************
+
+dividefixedV4i32CPUDispatch:
+ ; get supported instruction set
+ push par1
+ push par2
+ call InstructionSet
+ pop par2
+ pop par1
+ ; Point to generic version
+ lea r8, [dividefixedV4i32SSE2]
+ cmp eax, 8 ; check if PMULDQ supported
+ jb Q100
+ ; SSE4.1 supported
+ ; Point to SSE4.1 version of dividefixedV4i32
+ lea r8, [dividefixedV4i32SSE41]
+Q100: mov [dividefixedV4i32Dispatch], r8
+ ; Continue in appropriate version
+ jmp r8
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+dividefixedV4i32Dispatch DQ dividefixedV4i32CPUDispatch
+
+section .text
+
+
+;******************************************************************************
+; 32 bit unsigned integers
+;******************************************************************************
+
+; extern "C" __m128i setdivisor4ui(uint32_t d);
+; vector of 4 x 32 bit unsigned integers
+
+align 16
+global setdivisor4ui: function
+setdivisor4ui:
+ push rbx
+ mov ebx, par1d ; d
+ dec ebx
+ mov ecx, -1 ; value for bsr if ebx = 0
+ bsr ecx, ebx ; floor(log2(d-1))
+ inc ebx
+ inc ecx ; L = ceil(log2(d))
+ mov edx, 1
+ shl rdx, cl ; 2^L [64 bit shift to allow overflow]
+ sub edx, ebx
+ xor eax, eax
+ div ebx
+ inc eax
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 ; broadcast into 4 dwords
+ sub ecx, 1
+ setae dl
+ movzx edx, dl ; shift1
+ seta al
+ neg al
+ and al,cl
+ movzx eax, al ; shift 2
+ movd xmm1, edx ; shift 1
+ movd xmm2, eax ; shift 2
+ punpckldq xmm1, xmm2 ; combine into two dwords
+ punpcklqdq xmm0, xmm1 ; multipliers, shift1, shift2
+ pop rbx
+ ret
+; setdivisor4ui end
+
+;extern "C" void setdivisorV4u32(__m128i buf[2], uint32_t d);
+; 4 x 32 bit unsigned
+
+global setdivisorV4u32: function
+setdivisorV4u32:
+ push par1 ; buf
+ mov par1d, par2d ; d
+ call setdivisor4ui
+ pop rax ; buf
+ punpcklqdq xmm0, xmm0 ; copy multiplier into upper 4 words
+ movdqa [rax], xmm0 ; multiplier
+ movdqa [rax+16], xmm1 ; shift counts are still in xmm1
+ ret
+; setdivisorV4u32 end
+
+;extern "C" __m128i dividefixedV4u32(const __m128i buf[2], __m128i x);
+global dividefixedV4u32: function
+
+align 16
+dividefixedV4u32:
+; buf = par1
+; x = xmm0 (UNIX) or [par2] (Windows)
+%IFDEF WINDOWS
+ movdqa xmm0,[par2] ; x
+%ENDIF
+ movdqa xmm1, xmm0 ; x
+ movdqa xmm2, xmm0 ; x
+ movdqa xmm3, [par1] ; multiplier
+ pmuludq xmm0, xmm3 ; 32 x 32 -> 64 bit unsigned multiplication of x[0] and x[2]
+ psrlq xmm0, 32 ; high dword of result 0 and 2
+ psrlq xmm1, 32 ; get x[1] and x[3] into position for multiplication
+ pmuludq xmm1, xmm3 ; 32 x 32 -> 64 bit unsigned multiplication of x[1] and x[3]
+ pcmpeqd xmm3, xmm3
+ psllq xmm3, 32 ; generate mask of dword 1 and 3
+ pand xmm1, xmm3 ; high dword of result 1 and 3
+ por xmm0, xmm1 ; combine all four results into one vector
+ psubd xmm2, xmm0
+ movd xmm3, [par1+16] ; shift1
+ psrld xmm2, xmm3
+ paddd xmm0, xmm2
+ movd xmm3, [par1+20] ; shift2
+ psrld xmm0, xmm3
+ ret
+;dividefixedV4u32 end
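[Editorial note] For reference, a minimal usage sketch of the buf[2] interface declared in the header above, assuming a 64-bit build of asmlib is linked in; buf[0] receives the broadcast multiplier and buf[1] the shift count(s). Not part of the upstream source.

    #include <emmintrin.h>      // __m128i, SSE2 intrinsics
    #include <cstdint>
    #include <cstdio>

    extern "C" void    setdivisorV4i32(__m128i buf[2], int32_t d);
    extern "C" __m128i dividefixedV4i32(const __m128i buf[2], __m128i x);

    int main()
    {
        __m128i buf[2];                           // naturally 16-byte aligned; the asm stores with movdqa
        setdivisorV4i32(buf, 10);                 // prepare multiplier and shift for divisor 10
        __m128i x = _mm_setr_epi32(0, 9, 123, -123);
        __m128i q = dividefixedV4i32(buf, x);     // element-wise x / 10
        int32_t r[4];
        _mm_storeu_si128((__m128i*)r, q);
        std::printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);   // expected: 0 0 12 -12
    }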
diff --git a/asmlibSrc/instrset32.asm b/asmlibSrc/instrset32.asm
new file mode 100755
index 0000000..994f725
--- /dev/null
+++ b/asmlibSrc/instrset32.asm
@@ -0,0 +1,244 @@
+;************************* instrset32.asm **********************************
+; Author: Agner Fog
+; Date created: 2003-12-12
+; Last modified: 2014-07-30
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 32 bit
+;
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+;
+; Description:
+; This function returns an integer indicating which instruction set is
+; supported by the microprocessor and operating system. A program can
+; call this function to determine if a particular set of instructions can
+; be used.
+;
+; The method used here for detecting whether XMM instructions are enabled by
+; the operating system is different from the method recommended by Intel.
+; The method used here has the advantage that it is independent of the
+; ability of the operating system to catch invalid opcode exceptions. The
+; method used here has been thoroughly tested on many different versions of
+; Intel and AMD microprocessors, and is believed to work reliably. For further
+; discussion of this method, see my manual "Optimizing subroutines in assembly
+; language" (www.agner.org/optimize/).
+;
+; Copyright (c) 2003-2014 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; ********** InstructionSet function **********
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+;
+; return value:
+; 0 = 80386 instruction set only
+; 1 or above = MMX instructions supported
+; 2 or above = conditional move and FCOMI supported
+; 3 or above = SSE (XMM) supported by processor and operating system
+; 4 or above = SSE2 supported
+; 5 or above = SSE3 supported
+; 6 or above = Supplementary SSE3
+; 8 or above = SSE4.1 supported
+; 9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+; 13 or above = AVX2 supported
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+; 15 or above = AVX512f supported
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
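[Editorial note] A minimal sketch of how a caller might use the return value, with the thresholds taken from the table above; it assumes asmlib is linked in and is not part of the upstream source.

    #include <cstdio>

    extern "C" int InstructionSet(void);   // _InstructionSet in 32-bit objects

    int main()
    {
        int level = InstructionSet();      // cached in _IInstrSet after the first call
        if (level >= 11)
            std::puts("AVX code path");
        else if (level >= 4)
            std::puts("SSE2 code path");
        else
            std::puts("generic 386 code path");
    }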
+global _InstructionSet: function
+global _IInstrSet
+
+
+SECTION .data
+align 16
+_IInstrSet:
+_IInstrSet@: dd -1 ; local name
+
+SECTION .text align=16
+
+%IFDEF POSITIONINDEPENDENT
+
+; Local function for reading instruction pointer into edi
+GetThunkEDX:
+ mov edx, [esp]
+ ret
+
+%ENDIF ; POSITIONINDEPENDENT
+
+
+_InstructionSet:
+
+%IFDEF POSITIONINDEPENDENT
+ ; Position-independent code for ELF and Mach-O shared objects:
+ call GetThunkEDX
+ add edx, _IInstrSet@ - $
+ mov eax, [edx]
+%ELSE
+ mov eax, [_IInstrSet@]
+%ENDIF
+ ; Check if this function has been called before
+ test eax, eax
+ js FirstTime ; Negative means first time
+ ret ; Early return. Has been called before
+
+FirstTime: ; Function has not been called before
+ push ebx
+
+%IFNDEF POSITIONINDEPENDENT
+ mov edx, _IInstrSet@ ; make edx point to _IInstrSet
+%ENDIF
+ push edx ; save address of _IInstrSet
+
+ ; detect if CPUID instruction supported by microprocessor:
+ pushfd
+ pop eax
+ btc eax, 21 ; check if CPUID bit can toggle
+ push eax
+ popfd
+ pushfd
+ pop ebx
+ xor ebx, eax
+ xor eax, eax ; 0
+ bt ebx, 21
+ jc ISEND ; CPUID not supported
+
+ cpuid ; get number of CPUID functions
+ test eax, eax
+ jz ISEND ; function 1 not supported
+ mov eax, 1
+ cpuid ; get features
+ xor eax, eax ; 0
+
+ test edx, 1 ; floating point support
+ jz ISEND
+ bt edx, 23 ; MMX support
+ jnc ISEND
+ inc eax ; 1
+
+ bt edx, 15 ; conditional move support
+ jnc ISEND
+ inc eax ; 2
+
+ ; check OS support for XMM registers (SSE)
+ bt edx, 24 ; FXSAVE support by microprocessor
+ jnc ISEND
+ push ecx
+ push edx
+ mov ebx, esp ; save stack pointer
+ sub esp, 200H ; allocate space for FXSAVE
+ and esp, -10H ; align by 16
+TESTDATA EQU 0D95A34BEH ; random test value
+TESTPS EQU 10CH ; position to write TESTDATA = upper part of XMM6 image
+ fxsave [esp] ; save FP/MMX and XMM registers
+ mov ecx,[esp+TESTPS] ; read part of XMM6 register
+ xor DWORD [esp+TESTPS],TESTDATA ; change value
+ fxrstor [esp] ; load changed value into XMM6
+ mov [esp+TESTPS],ecx ; restore old value in buffer
+ fxsave [esp] ; save again
+ mov edx,[esp+TESTPS] ; read changed XMM6 register
+ mov [esp+TESTPS],ecx ; restore old value
+ fxrstor [esp] ; load old value into XMM6
+ xor ecx, edx ; get difference between old and new value
+ mov esp, ebx ; restore stack pointer
+ cmp ecx, TESTDATA ; test if XMM6 was changed correctly
+ pop edx
+ pop ecx
+ jne ISEND
+
+ bt edx, 25 ; SSE support by microprocessor
+ jnc ISEND
+ inc eax ; 3
+
+ bt edx, 26 ; SSE2 support by microprocessor
+ jnc ISEND
+ inc eax ; 4
+
+ test ecx, 1 ; SSE3 support by microprocessor
+ jz ISEND
+ inc eax ; 5
+
+ bt ecx, 9 ; Suppl-SSE3 support by microprocessor
+ jnc ISEND
+ inc eax ; 6
+
+ bt ecx, 19 ; SSE4.1 support by microprocessor
+ jnc ISEND
+ mov al, 8 ; 8
+
+ bt ecx, 23 ; POPCNT support by microprocessor
+ jnc ISEND
+ inc eax ; 9
+
+ bt ecx, 20 ; SSE4.2 support by microprocessor
+ jnc ISEND
+ inc eax ; 10
+
+ ; check OS support for YMM registers (AVX)
+ bt ecx, 27 ; OSXSAVE: XGETBV supported
+ jnc ISEND
+ pushad
+ xor ecx, ecx
+ db 0FH, 01H, 0D0H ; XGETBV
+ and eax, 6
+ cmp eax, 6 ; AVX support by OS
+ popad
+ jne ISEND
+
+ bt ecx, 28 ; AVX support by microprocessor
+ jnc ISEND
+ inc eax ; 11
+
+ bt ecx, 1 ; PCLMUL support
+ jnc ISEND
+ bt ecx, 25 ; AES support
+ jnc ISEND
+ inc eax ; 12
+
+ push eax
+ push ecx
+ mov eax, 7
+ xor ecx, ecx
+ cpuid ; check for AVX2
+ bt ebx, 5
+ pop ecx
+ pop eax
+ jnc ISEND
+ inc eax ; 13
+
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+ bt ecx, 12 ; FMA3
+ jnc ISEND
+ bt ecx, 29 ; F16C
+ jnc ISEND
+ bt ebx, 3 ; BMI1
+ jnc ISEND
+ bt ebx, 8 ; BMI2
+ jnc ISEND
+
+ push eax
+ push ebx
+ push ecx
+ mov eax, 80000001H
+ cpuid
+ bt ecx, 5 ; LZCNT
+ pop ecx
+ pop ebx
+ pop eax
+ jnc ISEND
+ inc eax ; 14
+
+ bt ebx, 16 ; AVX512f
+ jnc ISEND
+ inc eax ; 15
+
+ISEND: pop edx ; address of _IInstrSet
+ mov [edx], eax ; save value in public variable _IInstrSet
+ pop ebx
+ ret ; return value is in eax
+
+;_InstructionSet ENDP
diff --git a/asmlibSrc/instrset64.asm b/asmlibSrc/instrset64.asm
new file mode 100755
index 0000000..d40938e
--- /dev/null
+++ b/asmlibSrc/instrset64.asm
@@ -0,0 +1,173 @@
+;************************* instrset64.asm **********************************
+; Author: Agner Fog
+; Date created: 2003-12-12
+; Last modified: 2014-07-30
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+;
+; Description:
+; This function returns an integer indicating which instruction set is
+; supported by the microprocessor and operating system. A program can
+; call this function to determine if a particular set of instructions can
+; be used.
+;
+; The method used here for detecting whether XMM instructions are enabled by
+; the operating system is different from the method recommended by Intel.
+; The method used here has the advantage that it is independent of the
+; ability of the operating system to catch invalid opcode exceptions. For
+; further discussion of this method, see my manual "Optimizing subroutines
+; in assembly language" (www.agner.org/optimize/).
+;
+; Copyright (c) 2003-2014 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; ********** InstructionSet function **********
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+;
+; return value:
+; 0 = 80386 instruction set only
+; 1 or above = MMX instructions supported
+; 2 or above = conditional move and FCOMI supported
+; 3 or above = SSE (XMM) supported by processor and operating system
+; 4 or above = SSE2 supported
+; 5 or above = SSE3 supported
+; 6 or above = Supplementary SSE3
+; 8 or above = SSE4.1 supported
+; 9 or above = POPCNT supported
+; 10 or above = SSE4.2 supported
+; 11 or above = AVX supported by processor and operating system
+; 12 or above = PCLMUL and AES supported
+; 13 or above = AVX2 supported
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+; 15 or above = AVX512f supported
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global InstructionSet: function
+global IInstrSet
+
+
+SECTION .data
+align 16
+
+IInstrSet@: ; local name to avoid problems in shared objects
+IInstrSet: dd -1 ; this global variable is valid after first call
+
+
+SECTION .text align=16
+
+; ********** InstructionSet function **********
+; C++ prototype:
+; extern "C" int InstructionSet (void);
+
+
+InstructionSet:
+ ; Check if this function has been called before
+ mov eax, [IInstrSet@]
+ test eax, eax
+ js FirstTime ; Negative means first time
+ ; Early return. Has been called before
+ ret ; Return value is in eax
+
+FirstTime:
+ push rbx
+
+ mov eax, 1
+ cpuid ; get features into edx and ecx
+
+ mov eax, 4 ; at least SSE2 supported in 64 bit mode
+ test ecx, 1 ; SSE3 support by microprocessor
+ jz ISEND
+ inc eax ; 5
+
+ bt ecx, 9 ; Suppl-SSE3 support by microprocessor
+ jnc ISEND
+ inc eax ; 6
+
+ bt ecx, 19 ; SSE4.1 support by microprocessor
+ jnc ISEND
+ mov al, 8 ; 8
+
+ bt ecx, 23 ; POPCNT support by microprocessor
+ jnc ISEND
+ inc eax ; 9
+
+ bt ecx, 20 ; SSE4.2 support by microprocessor
+ jnc ISEND
+ inc eax ; 10
+
+ ; check OS support for YMM registers (AVX)
+ bt ecx, 27 ; OSXSAVE: XGETBV supported
+ jnc ISEND
+ push rax
+ push rcx
+ push rdx
+ xor ecx, ecx
+ db 0FH, 01H, 0D0H ; XGETBV
+ and eax, 6
+ cmp eax, 6 ; AVX support by OS
+ pop rdx
+ pop rcx
+ pop rax
+ jne ISEND
+
+ bt ecx, 28 ; AVX support by microprocessor
+ jnc ISEND
+ inc eax ; 11
+
+ bt ecx, 1 ; PCLMUL support
+ jnc ISEND
+ bt ecx, 25 ; AES support
+ jnc ISEND
+ inc eax ; 12
+
+ push rax
+ push rcx
+ mov eax, 7
+ xor ecx, ecx
+ cpuid ; check for AVX2
+ bt ebx, 5
+ pop rcx
+ pop rax
+ jnc ISEND
+ inc eax ; 13
+
+; 14 or above = FMA3, F16C, BMI1, BMI2, LZCNT
+ bt ecx, 12 ; FMA3
+ jnc ISEND
+ bt ecx, 29 ; F16C
+ jnc ISEND
+ bt ebx, 3 ; BMI1
+ jnc ISEND
+ bt ebx, 8 ; BMI2
+ jnc ISEND
+
+ push rax
+ push rbx
+ push rcx
+ mov eax, 80000001H
+ cpuid
+ bt ecx, 5 ; LZCNT
+ pop rcx
+ pop rbx
+ pop rax
+ jnc ISEND
+ inc eax ; 14
+
+ bt ebx, 16 ; AVX512f
+ jnc ISEND
+ inc eax ; 15
+
+ISEND: mov [IInstrSet@], eax ; save value in global variable
+
+ pop rbx
+ ret ; return value is in eax
+
+;InstructionSet ENDP
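[Editorial note] The AVX test above (the raw 0F 01 D0 bytes followed by "and eax, 6 / cmp eax, 6") checks that the operating system saves both the SSE and the AVX register state. A rough C++ equivalent, assuming a compiler that provides the _xgetbv intrinsic (may require -mxsave with GCC/Clang) and that CPUID has already reported OSXSAVE:

    #include <cstdint>
    #include <immintrin.h>     // _xgetbv

    // Only meaningful after CPUID.1:ECX bit 27 (OSXSAVE) has been found set.
    bool os_supports_avx()
    {
        uint64_t xcr0 = _xgetbv(0);    // read XCR0, as the raw XGETBV bytes do above
        return (xcr0 & 6) == 6;        // bit 1 = SSE state, bit 2 = AVX state
    }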
diff --git a/asmlibSrc/libad32.asm b/asmlibSrc/libad32.asm
new file mode 100755
index 0000000..96bf994
--- /dev/null
+++ b/asmlibSrc/libad32.asm
@@ -0,0 +1,14 @@
+; ----------------------------- LIBAD.ASM ---------------------------
+; DLL entry function for LIBAD32.DLL
+
+
+SECTION .text align=16
+
+GLOBAL _DllEntry@12: function
+
+_DllEntry@12: ; proc hInstance:DWORD, reason:DWORD, reserved1:DWORD
+ mov eax, 1
+ ret 12
+;_DllEntry@12 endp
+
+; END _DllEntry@12
diff --git a/asmlibSrc/libad32.def b/asmlibSrc/libad32.def
new file mode 100755
index 0000000..bfba973
--- /dev/null
+++ b/asmlibSrc/libad32.def
@@ -0,0 +1,44 @@
+LIBRARY libad32
+
+VERSION 2013.0913
+
+EXPORTS
+ InstructionSet
+ ProcessorName
+ ReadTSC
+ RoundF
+ RoundD
+ A_strcmp
+ A_stricmp
+ A_strstr
+ A_strtolower
+ A_strtoupper
+ A_strspn
+ A_strcspn
+ strCountInSet
+ strcount_UTF8
+ CpuType
+ A_DebugBreak
+ cpuid_ex
+ setdivisori32
+ setdivisoru32
+ dividefixedi32
+ dividefixedu32
+ PhysicalSeedD
+ MersenneRandomInitD
+ MersenneRandomInitByArrayD
+ MersenneRandomD
+ MersenneIRandomD
+ MersenneIRandomXD
+ MersenneBRandomD
+ MotherRandomInitD
+ MotherIRandomD
+ MotherRandomD
+ MotherBRandomD
+ SFMTgenRandomInitD
+ SFMTgenRandomInitByArrayD
+ SFMTgenIRandomD
+ SFMTgenIRandomXD
+ SFMTgenRandomD
+ SFMTgenBRandomD
+
diff --git a/asmlibSrc/libad64.asm b/asmlibSrc/libad64.asm
new file mode 100755
index 0000000..25c5208
--- /dev/null
+++ b/asmlibSrc/libad64.asm
@@ -0,0 +1,13 @@
+; ----------------------------- LIBAD64.ASM ---------------------------
+; DLL entry function for LIBAD64.DLL
+
+default rel
+
+global DllEntry: function
+
+SECTION .text align=16
+
+DllEntry:
+ mov eax, 1
+ ret
+;DllEntry endp
diff --git a/asmlibSrc/libad64.def b/asmlibSrc/libad64.def
new file mode 100755
index 0000000..5948615
--- /dev/null
+++ b/asmlibSrc/libad64.def
@@ -0,0 +1,42 @@
+LIBRARY libad64
+
+VERSION 2013.0913
+
+EXPORTS InstructionSet
+ ProcessorName
+ ReadTSC
+ RoundF
+ RoundD
+ A_strcmp
+ A_stricmp
+ A_strstr
+ A_strtolower
+ A_strtoupper
+ A_strspn
+ A_strcspn
+ strCountInSet
+ strcount_UTF8
+ CpuType
+ A_DebugBreak
+ cpuid_ex
+ setdivisori32
+ setdivisoru32
+ dividefixedi32
+ dividefixedu32
+ PhysicalSeedD
+ MersenneRandomInitD
+ MersenneRandomInitByArrayD
+ MersenneRandomD
+ MersenneIRandomD
+ MersenneIRandomXD
+ MersenneBRandomD
+ MotherRandomInitD
+ MotherIRandomD
+ MotherRandomD
+ MotherBRandomD
+ SFMTgenRandomInitD
+ SFMTgenRandomInitByArrayD
+ SFMTgenIRandomD
+ SFMTgenIRandomXD
+ SFMTgenRandomD
+ SFMTgenBRandomD
diff --git a/asmlibSrc/memcmp32.asm b/asmlibSrc/memcmp32.asm
new file mode 100755
index 0000000..8e4cc00
--- /dev/null
+++ b/asmlibSrc/memcmp32.asm
@@ -0,0 +1,366 @@
+;************************* memcmp32.asm *************************************
+; Author: Agner Fog
+; Date created: 2013-10-03
+; Last modified: 2013-10-03
+; Description:
+; Faster version of the standard memcmp function:
+;
+; int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
+;
+; Compares two memory blocks of size num.
+; The return value is zero if the two memory blocks ptr1 and ptr2 are equal
+; The return value is positive if the first differing byte of ptr1 is bigger
+; than ptr2 when compared as unsigned bytes.
+; The return value is negative if the first differing byte of ptr1 is smaller
+; than ptr2 when compared as unsigned bytes.
+;
+; Overriding standard function memcmp:
+; The alias ?OVR_memcmp is changed to _memcmp in the object file if
+; it is desired to override the standard library function memcmp.
+;
+; Optimization:
+; Uses XMM registers if SSE2 is available, uses YMM registers if AVX2.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
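[Editorial note] A small illustration of the unsigned-byte comparison described above (sketch only; assumes the library is linked in):

    #include <cassert>
    #include <cstddef>

    extern "C" int A_memcmp(const void * ptr1, const void * ptr2, size_t count);

    int main()
    {
        // The first differing byte is compared as unsigned, so 0xFF > 0x01
        // even where plain char is signed.
        const unsigned char a[4] = { 'a', 'b', 'c', 0x01 };
        const unsigned char b[4] = { 'a', 'b', 'c', 0xFF };
        assert(A_memcmp(a, a, 4) == 0);
        assert(A_memcmp(a, b, 4) <  0);
        assert(A_memcmp(b, a, 4) >  0);
    }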
+
+global _A_memcmp: function ; Function memcmp
+global ?OVR_memcmp: function ; ?OVR removed if standard function memcmp overridden
+; Direct entries to CPU-specific versions
+global _memcmp386: function ; version for old CPUs without SSE
+global _memcmpSSE2: function ; SSE2 version
+global _memcmpAVX2: function ; AVX2 version
+
+; Imported from instrset32.asm
+extern _InstructionSet ; Instruction set for CPU dispatcher
+
+
+SECTION .text align=16
+
+; extern "C" int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
+; Function entry:
+_A_memcmp:
+?OVR_memcmp:
+%IFNDEF POSITIONINDEPENDENT
+ jmp dword [memcmpDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE ; Position-independent code
+ call get_thunk_edx ; get reference point for position-independent code
+RP: ; reference point edx = offset RP
+; Make the following instruction with address relative to RP:
+ jmp dword [edx+memcmpDispatch-RP]
+%ENDIF
+
+
+align 16
+_memcmpAVX2: ; AVX2 version. Use ymm register
+memcmpAVX2@: ; internal reference
+ push esi
+ push edi
+ mov esi, [esp+12] ; ptr1
+ mov edi, [esp+16] ; ptr2
+ mov ecx, [esp+20] ; size
+ add esi, ecx ; use negative index from end of memory block
+ add edi, ecx
+ neg ecx
+ jz A900
+ mov edx, 0FFFFH
+ cmp ecx, -32
+ ja A100
+
+A000: ; loop comparing 32 bytes
+ vmovdqu ymm1, [esi+ecx]
+ vpcmpeqb ymm0, ymm1, [edi+ecx] ; compare 32 bytes
+ vpmovmskb eax, ymm0 ; get byte mask
+ xor eax, -1 ; not eax would not set flags
+ jnz A700 ; difference found
+ add ecx, 32
+ jz A900 ; finished, equal
+ cmp ecx, -32
+ jna A000 ; next 32 bytes
+ vzeroupper ; end ymm state
+
+A100: ; less than 32 bytes left
+ cmp ecx, -16
+ ja A200
+ movdqu xmm1, [esi+ecx]
+ movdqu xmm2, [edi+ecx]
+ pcmpeqb xmm1, xmm2 ; compare 16 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, edx ; not ax
+ jnz A701 ; difference found
+ add ecx, 16
+ jz A901 ; finished, equal
+
+A200: ; less than 16 bytes left
+ cmp ecx, -8
+ ja A300
+ ; compare 8 bytes
+ movq xmm1, [esi+ecx]
+ movq xmm2, [edi+ecx]
+ pcmpeqb xmm1, xmm2 ; compare 8 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, edx ; not ax
+ jnz A701 ; difference found
+ add ecx, 8
+ jz A901
+
+A300: ; less than 8 bytes left
+ cmp ecx, -4
+ ja A400
+ ; compare 4 bytes
+ movd xmm1, [esi+ecx]
+ movd xmm2, [edi+ecx]
+ pcmpeqb xmm1, xmm2 ; compare 4 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, edx ; not ax
+ jnz A701 ; difference found
+ add ecx, 4
+ jz A901
+
+A400: ; less than 4 bytes left
+ cmp ecx, -2
+ ja A500
+ movzx eax, word [esi+ecx]
+ movzx edx, word [edi+ecx]
+ sub eax, edx
+ jnz A800 ; difference in byte 0 or 1
+ add ecx, 2
+ jz A901
+
+A500: ; less than 2 bytes left
+ test ecx, ecx
+ jz A901 ; no bytes left
+
+A600: ; one byte left
+ movzx eax, byte [esi+ecx]
+ movzx edx, byte [edi+ecx]
+ sub eax, edx ; return result
+ pop edi
+ pop esi
+ ret
+
+A700: ; difference found. find position
+ vzeroupper
+A701:
+ bsf eax, eax
+ add ecx, eax
+ movzx eax, byte [esi+ecx]
+ movzx edx, byte [edi+ecx]
+ sub eax, edx ; return result
+ pop edi
+ pop esi
+ ret
+
+A800: ; difference in byte 0 or 1
+ neg al
+ sbb ecx, -1 ; add 1 to ecx if al == 0
+ movzx eax, byte [esi+ecx]
+ movzx edx, byte [edi+ecx]
+ sub eax, edx ; return result
+ pop edi
+ pop esi
+ ret
+
+A900: ; equal
+ vzeroupper
+A901: xor eax, eax
+ pop edi
+ pop esi
+ ret
+
+
+_memcmpSSE2: ; SSE2 version. Use xmm register
+memcmpSSE2@: ; internal reference
+
+ push esi
+ push edi
+ mov esi, [esp+12] ; ptr1
+ mov edi, [esp+16] ; ptr2
+ mov ecx, [esp+20] ; size
+ add esi, ecx ; use negative index from end of memory block
+ add edi, ecx
+ neg ecx
+ jz S900
+ mov edx, 0FFFFH
+ cmp ecx, -16
+ ja S200
+
+S100: ; loop comparing 16 bytes
+ movdqu xmm1, [esi+ecx]
+ movdqu xmm2, [edi+ecx]
+ pcmpeqb xmm1, xmm2 ; compare 16 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, edx ; not ax
+ jnz S700 ; difference found
+ add ecx, 16
+ jz S900 ; finished, equal
+ cmp ecx, -16
+ jna S100 ; next 16 bytes
+
+S200: ; less than 16 bytes left
+ cmp ecx, -8
+ ja S300
+ ; compare 8 bytes
+ movq xmm1, [esi+ecx]
+ movq xmm2, [edi+ecx]
+ pcmpeqb xmm1, xmm2 ; compare 8 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, edx ; not ax
+ jnz S700 ; difference found
+ add ecx, 8
+ jz S900
+
+S300: ; less than 8 bytes left
+ cmp ecx, -4
+ ja S400
+ ; compare 4 bytes
+ movd xmm1, [esi+ecx]
+ movd xmm2, [edi+ecx]
+ pcmpeqb xmm1, xmm2 ; compare 4 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, edx ; not ax
+ jnz S700 ; difference found
+ add ecx, 4
+ jz S900
+
+S400: ; less than 4 bytes left
+ cmp ecx, -2
+ ja S500
+ movzx eax, word [esi+ecx]
+ movzx edx, word [edi+ecx]
+ sub eax, edx
+ jnz S800 ; difference in byte 0 or 1
+ add ecx, 2
+ jz S900
+
+S500: ; less than 2 bytes left
+ test ecx, ecx
+ jz S900 ; no bytes left
+
+ ; one byte left
+ movzx eax, byte [esi+ecx]
+ movzx edx, byte [edi+ecx]
+ sub eax, edx ; return result
+ pop edi
+ pop esi
+ ret
+
+S700: ; difference found. find position
+ bsf eax, eax
+ add ecx, eax
+ movzx eax, byte [esi+ecx]
+ movzx edx, byte [edi+ecx]
+ sub eax, edx ; return result
+ pop edi
+ pop esi
+ ret
+
+S800: ; difference in byte 0 or 1
+ neg al
+ sbb ecx, -1 ; add 1 to ecx if al == 0
+S820: movzx eax, byte [esi+ecx]
+ movzx edx, byte [edi+ecx]
+ sub eax, edx ; return result
+ pop edi
+ pop esi
+ ret
+
+S900: ; equal
+ xor eax, eax
+ pop edi
+ pop esi
+ ret
+
+
+_memcmp386: ; 80386 version
+memcmp386@: ; internal reference
+ ; This is not perfectly optimized because it is unlikely to ever be used
+ push esi
+ push edi
+ mov esi, [esp+12] ; ptr1
+ mov edi, [esp+16] ; ptr2
+ mov ecx, [esp+20] ; size
+ mov edx, ecx
+ shr ecx, 2 ; size/4 = number of dwords
+ repe cmpsd ; compare dwords
+ jnz M700
+ mov ecx, edx
+ and ecx, 3 ; remainder
+M600: repe cmpsb ; compare bytes
+ je M800 ; equal
+ movzx eax, byte [esi-1] ; esi, edi point past the differing byte. find difference
+ movzx edx, byte [edi-1]
+ sub eax, edx ; calculate return value
+ pop edi
+ pop esi
+ ret
+
+M700: ; dwords differ. search in last 4 bytes
+ mov ecx, 4
+ sub esi, ecx
+ sub edi, ecx
+ jmp M600
+
+M800: ; equal. return zero
+ xor eax, eax
+ pop edi
+ pop esi
+ ret
+
+
+; CPU dispatching for memcmp. This is executed only once
+memcmpCPUDispatch:
+
+%IFNDEF POSITIONINDEPENDENT
+ call _InstructionSet ; get supported instruction set
+ ; Point to generic version of memcmp
+ mov dword [memcmpDispatch], memcmp386@
+ cmp eax, 4 ; check SSE2
+ jb Q100
+ ; SSE2 supported
+ mov dword [memcmpDispatch], memcmpSSE2@
+ cmp eax, 13 ; check AVX2
+ jb Q100
+ ; AVX2 supported
+ mov dword [memcmpDispatch], memcmpAVX2@
+Q100: ; Continue in appropriate version of memcmp
+ jmp dword [memcmpDispatch]
+
+%ELSE ; Position-independent version
+ push edx
+ call _InstructionSet
+ pop edx
+
+ ; Point to generic version of memcmp
+ lea ecx, [edx+memcmp386@-RP]
+ cmp eax, 4 ; check SSE2
+ jb Q100
+ ; Point to SSE2 version of memcmp
+ lea ecx, [edx+memcmpSSE2@-RP]
+ cmp eax, 13 ; check AVX2
+ jb Q100
+ ; Point to AVX2 version of memcmp
+ lea ecx, [edx+memcmpAVX2@-RP]
+Q100: mov [edx+memcmpDispatch-RP], ecx
+ ; Continue in appropriate version of memcmp
+ jmp ecx
+
+get_thunk_edx: ; load caller address into edx for position-independent code
+ mov edx, [esp]
+ ret
+%ENDIF
+
+
+SECTION .data
+align 16
+
+
+; Pointer to appropriate version.
+; This initially points to memcmpCPUDispatch. memcmpCPUDispatch will
+; change this to the appropriate version of memcmp, so that
+; memcmpCPUDispatch is only executed once:
+memcmpDispatch DD memcmpCPUDispatch
+
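[Editorial note] The data word above is the whole dispatch mechanism: a pointer that starts out aimed at the dispatcher and is overwritten with the best implementation on the first call. A C++ sketch of the same pattern, assuming InstructionSet comes from asmlib; the three stand-in implementations below just forward to std::memcmp and are not the library's routines.

    #include <cstddef>
    #include <cstring>

    extern "C" int InstructionSet(void);

    // Stand-ins for memcmp386, memcmpSSE2 and memcmpAVX2.
    static int memcmp_386 (const void* a, const void* b, size_t n) { return std::memcmp(a, b, n); }
    static int memcmp_sse2(const void* a, const void* b, size_t n) { return std::memcmp(a, b, n); }
    static int memcmp_avx2(const void* a, const void* b, size_t n) { return std::memcmp(a, b, n); }

    static int memcmp_first_call(const void* a, const void* b, size_t n);

    // Function pointer initially aimed at the dispatcher, like memcmpDispatch above.
    static int (*memcmp_ptr)(const void*, const void*, size_t) = memcmp_first_call;

    static int memcmp_first_call(const void* a, const void* b, size_t n)
    {
        int level = InstructionSet();                 // query once
        memcmp_ptr = memcmp_386;
        if (level >= 4)  memcmp_ptr = memcmp_sse2;    // SSE2
        if (level >= 13) memcmp_ptr = memcmp_avx2;    // AVX2
        return memcmp_ptr(a, b, n);                   // continue in the chosen version
    }

    int A_memcmp_sketch(const void* a, const void* b, size_t n)
    {
        return memcmp_ptr(a, b, n);                   // after the first call: one indirect jump
    }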
diff --git a/asmlibSrc/memcmp64.asm b/asmlibSrc/memcmp64.asm
new file mode 100755
index 0000000..c7f14c9
--- /dev/null
+++ b/asmlibSrc/memcmp64.asm
@@ -0,0 +1,293 @@
+;************************* memcmp64.asm *************************************
+; Author: Agner Fog
+; Date created: 2013-10-03
+; Last modified: 2013-10-03
+; Description:
+; Faster version of the standard memcmp function:
+;
+; int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
+;
+; Compares two memory blocks of size num.
+; The return value is zero if the two memory blocks ptr1 and ptr2 are equal
+; The return value is positive if the first differing byte of ptr1 is bigger
+; than ptr2 when compared as unsigned bytes.
+; The return value is negative if the first differing byte of ptr1 is smaller
+; than ptr2 when compared as unsigned bytes.
+;
+; Overriding standard function memcmp:
+; The alias ?OVR_memcmp is changed to _memcmp in the object file if
+; it is desired to override the standard library function memcmp.
+;
+; Optimization:
+; Uses XMM registers if SSE2 is available, uses YMM registers if AVX2.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
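[Editorial note] The comparison loops below index both blocks from the end with a negative count that runs up to zero ("use negative index from end of memory block"), so each iteration needs only one add and one branch. A scalar sketch of the idiom, not part of the upstream source:

    #include <cstddef>
    #include <cstdint>

    int memcmp_negindex_sketch(const uint8_t * p1, const uint8_t * p2, size_t count)
    {
        p1 += count;                          // point past the end of both blocks
        p2 += count;
        for (ptrdiff_t i = -(ptrdiff_t)count; i != 0; i++) {
            if (p1[i] != p2[i]) return p1[i] - p2[i];   // bytes compared as unsigned
        }
        return 0;
    }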
+
+global A_memcmp: function ; Function memcmp
+global ?OVR_memcmp: function ; ?OVR_ removed if standard function memcmp overridden
+; Direct entries to CPU-specific versions
+global memcmpSSE2: function ; SSE2 version
+global memcmpAVX2: function ; AVX2 version
+
+; Imported from instrset64.asm
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+default rel
+
+; define registers used for parameters
+%IFDEF WINDOWS
+%define par1 rcx ; function parameter 1
+%define par2 rdx ; function parameter 2
+%define par3 r8 ; function parameter 3
+%define par4 r9 ; scratch register
+%define par4d r9d ; scratch register
+%ENDIF
+%IFDEF UNIX
+%define par1 rdi ; function parameter 1
+%define par2 rsi ; function parameter 2
+%define par3 rdx ; function parameter 3
+%define par4 rcx ; scratch register
+%define par4d ecx ; scratch register
+%ENDIF
+
+
+
+SECTION .text align=16
+
+; extern "C" int A_memcmp (const void * ptr1, const void * ptr2, size_t count);
+; Function entry:
+A_memcmp:
+?OVR_memcmp:
+ jmp qword [memcmpDispatch] ; Go to appropriate version, depending on instruction set
+
+
+align 16
+memcmpAVX2: ; AVX2 version. Use ymm register
+memcmpAVX2@: ; internal reference
+
+ add par1, par3 ; use negative index from end of memory block
+ add par2, par3
+ neg par3
+ jz A900
+ mov par4d, 0FFFFH
+ cmp par3, -32
+ ja A100
+
+A000: ; loop comparing 32 bytes
+ vmovdqu ymm1, [par1+par3]
+ vpcmpeqb ymm0, ymm1, [par2+par3] ; compare 32 bytes
+ vpmovmskb eax, ymm0 ; get byte mask
+ xor eax, -1 ; not eax would not set flags
+ jnz A700 ; difference found
+ add par3, 32
+ jz A900 ; finished, equal
+ cmp par3, -32
+ jna A000 ; next 32 bytes
+ vzeroupper ; end ymm state
+
+A100: ; less than 32 bytes left
+ cmp par3, -16
+ ja A200
+ movdqu xmm1, [par1+par3]
+ movdqu xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 16 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; invert lower 16 bits
+ jnz A701 ; difference found
+ add par3, 16
+ jz A901 ; finished, equal
+
+A200: ; less than 16 bytes left
+ cmp par3, -8
+ ja A300
+ ; compare 8 bytes
+ movq xmm1, [par1+par3]
+ movq xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 8 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d
+ jnz A701 ; difference found
+ add par3, 8
+ jz A901
+
+A300: ; less than 8 bytes left
+ cmp par3, -4
+ ja A400
+ ; compare 4 bytes
+ movd xmm1, [par1+par3]
+ movd xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 4 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; not ax
+ jnz A701 ; difference found
+ add par3, 4
+ jz A901
+
+A400: ; less than 4 bytes left
+ cmp par3, -2
+ ja A500
+ movzx eax, word [par1+par3]
+ movzx par4d, word [par2+par3]
+ sub eax, par4d
+ jnz A800 ; difference in byte 0 or 1
+ add par3, 2
+ jz A901
+
+A500: ; less than 2 bytes left
+ test par3, par3
+ jz A901 ; no bytes left
+
+A600: ; one byte left
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+A700: ; difference found. find position
+ vzeroupper
+A701:
+ bsf eax, eax
+ add par3, rax
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+A800: ; difference in byte 0 or 1
+ neg al
+ sbb par3, -1 ; add 1 to par3 if al == 0
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+A900: ; equal
+ vzeroupper
+A901: xor eax, eax
+ ret
+
+
+memcmpSSE2: ; SSE2 version. Use xmm register
+memcmpSSE2@: ; internal reference
+
+ add par1, par3 ; use negative index from end of memory block
+ add par2, par3
+ neg par3
+ jz S900
+ mov par4d, 0FFFFH
+ cmp par3, -16
+ ja S200
+
+S100: ; loop comparing 16 bytes
+ movdqu xmm1, [par1+par3]
+ movdqu xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 16 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; not ax
+ jnz S700 ; difference found
+ add par3, 16
+ jz S900 ; finished, equal
+ cmp par3, -16
+ jna S100 ; next 16 bytes
+
+S200: ; less than 16 bytes left
+ cmp par3, -8
+ ja S300
+ ; compare 8 bytes
+ movq xmm1, [par1+par3]
+ movq xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 8 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; not ax
+ jnz S700 ; difference found
+ add par3, 8
+ jz S900
+
+S300: ; less than 8 bytes left
+ cmp par3, -4
+ ja S400
+ ; compare 4 bytes
+ movd xmm1, [par1+par3]
+ movd xmm2, [par2+par3]
+ pcmpeqb xmm1, xmm2 ; compare 4 bytes
+ pmovmskb eax, xmm1 ; get byte mask
+ xor eax, par4d ; not ax
+ jnz S700 ; difference found
+ add par3, 4
+ jz S900
+
+S400: ; less than 4 bytes left
+ cmp par3, -2
+ ja S500
+ movzx eax, word [par1+par3]
+ movzx par4d, word [par2+par3]
+ sub eax, par4d
+ jnz S800 ; difference in byte 0 or 1
+ add par3, 2
+ jz S900
+
+S500: ; less than 2 bytes left
+ test par3, par3
+ jz S900 ; no bytes left
+
+ ; one byte left
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+S700: ; difference found. find position
+ bsf eax, eax
+ add par3, rax
+ movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+S800: ; difference in byte 0 or 1
+ neg al
+ sbb par3, -1 ; add 1 to par3 if al == 0
+S820: movzx eax, byte [par1+par3]
+ movzx par4d, byte [par2+par3]
+ sub eax, par4d ; return result
+ ret
+
+S900: ; equal
+ xor eax, eax
+ ret
+
+
+; CPU dispatching for memcmp. This is executed only once
+memcmpCPUDispatch:
+ push par1
+ push par2
+ push par3
+ call InstructionSet ; get supported instruction set
+ ; SSE2 always supported
+ lea par4, [memcmpSSE2@]
+ cmp eax, 13 ; check AVX2
+ jb Q100
+ ; AVX2 supported
+ lea par4, [memcmpAVX2@]
+Q100: ; save pointer
+ mov qword [memcmpDispatch], par4
+; Continue in appropriate version of memcmp
+ pop par3
+ pop par2
+ pop par1
+ jmp par4
+
+
+SECTION .data
+align 16
+
+
+; Pointer to appropriate version.
+; This initially points to memcmpCPUDispatch. memcmpCPUDispatch will
+; change this to the appropriate version of memcmp, so that
+; memcmpCPUDispatch is only executed once:
+memcmpDispatch DQ memcmpCPUDispatch
+
diff --git a/asmlibSrc/memcpy32.asm b/asmlibSrc/memcpy32.asm
new file mode 100755
index 0000000..257fe2c
--- /dev/null
+++ b/asmlibSrc/memcpy32.asm
@@ -0,0 +1,1460 @@
+;************************* memcpy32.asm ************************************
+; Author: Agner Fog
+; Date created: 2008-07-18
+; Last modified: 2013-09-11
+
+; Description:
+; Faster version of the standard memcpy function:
+; void * A_memcpy(void *dest, const void *src, size_t count);
+; Copies 'count' bytes from 'src' to 'dest'
+;
+; Overriding standard function memcpy:
+; The alias ?OVR_memcpy is changed to _memcpy in the object file if
+; it is desired to override the standard library function memcpy.
+;
+; The function uses non-temporal writes to bypass the cache when the size is
+; bigger than half the size of the largest_level cache. This limit can be
+; read with _GetMemcpyCacheLimit and changed with _SetMemcpyCacheLimit (in
+; memmove32.asm). C++ prototypes:
+; extern "C" size_t GetMemcpyCacheLimit(); // in memcpy32.asm
+; extern "C" void SetMemcpyCacheLimit(); // in memmove32.asm
+; extern "C" void SetMemcpyCacheLimit1(); // used internally
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386, SSE2, Suppl-SSE3 and AVX instruction sets.
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
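[Editorial note] A minimal usage sketch of the entry points described above, using only the prototypes quoted in this header and assuming asmlib is linked in:

    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
    extern "C" size_t GetMemcpyCacheLimit();   // size above which non-temporal stores are used

    int main()
    {
        std::printf("cache bypass limit: %zu bytes\n", GetMemcpyCacheLimit());
        char src[100], dest[100];
        std::memset(src, 'x', sizeof src);
        A_memcpy(dest, src, sizeof src);       // drop-in replacement for memcpy
        std::printf("%c\n", dest[99]);         // 'x'
    }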
+
+global _A_memcpy: function ; Function A_memcpy
+global ?OVR_memcpy: function ; ?OVR removed if standard function memcpy overridden
+
+; Direct entries to CPU-specific versions
+global _memcpy386: function ; Generic version for processors without SSE2
+global _memcpySSE2: function ; Version for processors with SSE2
+global _memcpySSSE3: function ; Version for processors with SSSE3
+global _memcpyU: function ; Alternative version for processors with fast unaligned read
+global _memcpyU256: function ; Version for processors with fast 256-bit read/write
+
+global _GetMemcpyCacheLimit: function ; Get the size limit for bypassing cache when copying with memcpy and memmove
+global _SetMemcpyCacheLimit1: function ; Set the size limit for bypassing cache when copying with memcpy
+
+; Imported from instrset32.asm:
+extern _InstructionSet ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster32.asm:
+extern _UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
+extern _Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
+
+
+; Imported from cachesize32.asm:
+extern _DataCacheSize ; Gets size of data cache
+
+
+; Define prolog for this function
+%MACRO PROLOGM 0
+ push esi
+ push edi
+ mov edi, [esp+12] ; dest
+ mov esi, [esp+16] ; src
+ mov ecx, [esp+20] ; count
+%IFDEF POSITIONINDEPENDENT
+ push ebx
+ mov ebx, edx ; pointer to reference point RP
+%ENDIF
+%ENDM
+
+
+; Define return from this function
+%MACRO RETURNM 0
+%IFDEF POSITIONINDEPENDENT
+ pop ebx
+%ENDIF
+ pop edi
+ pop esi
+ mov eax, [esp+4] ; Return value = dest
+ ret
+%ENDMACRO
+
+
+SECTION .text align=16
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Common entry for dispatch
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
+; Function entry:
+_A_memcpy:
+?OVR_memcpy:
+
+%IFNDEF POSITIONINDEPENDENT
+ jmp dword [memcpyDispatch] ; Go to appropriate version, depending on instruction set
+RP equ 0 ; RP = 0 if not position-independent
+
+%ELSE ; Position-independent code
+
+ call get_thunk_edx ; get reference point for position-independent code
+RP: ; reference point edx = offset RP
+
+; Make the following instruction with address relative to RP:
+ jmp dword [edx+memcpyDispatch-RP]
+
+%ENDIF
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; AVX Version for processors with fast unaligned read and fast 32 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_memcpyU256: ; global label
+%IFDEF POSITIONINDEPENDENT
+ call get_thunk_edx
+ add edx, RP-$
+%ENDIF
+memcpyU256@:
+ PROLOGM
+ cmp ecx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 1FH
+ jz B3100 ; Skip if dest aligned by 32
+
+ ; edx = size of first partial block, 1 - 31 bytes
+ test dl, 3
+ jz B3030
+ test dl, 1
+ jz B3020
+ ; move 1 byte
+ movzx eax, byte [esi]
+ mov [edi], al
+ inc esi
+ inc edi
+B3020: test dl, 2
+ jz B3030
+ ; move 2 bytes
+ movzx eax, word [esi]
+ mov [edi], ax
+ add esi, 2
+ add edi, 2
+B3030: test dl, 4
+ jz B3040
+ ; move 4 bytes
+ mov eax, [esi]
+ mov [edi], eax
+ add esi, 4
+ add edi, 4
+B3040: test dl, 8
+ jz B3050
+ ; move 8 bytes
+ movq xmm0, qword [esi]
+ movq qword [edi], xmm0
+ add esi, 8
+ add edi, 8
+B3050: test dl, 16
+ jz B3060
+ ; move 16 bytes
+ movups xmm0, [esi]
+ movaps [edi], xmm0
+ add esi, 16
+ add edi, 16
+B3060: sub ecx, edx
+
+B3100: ; Now dest is aligned by 32. Any partial block has been moved
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count
+ and ecx, -20H ; Round down to nearest multiple of 32
+ add esi, ecx ; Point to the end
+ add edi, ecx ; Point to the end
+ sub edx, ecx ; Remaining data after loop
+
+ ; Check if count very big
+%IFNDEF POSITIONINDEPENDENT
+ ; Check if count very big
+ cmp ecx, [_CacheBypassLimit]
+%ELSE
+ cmp ecx, [ebx+_CacheBypassLimit-RP]
+%ENDIF
+ ja I3100 ; Use non-temporal store if count > CacheBypassLimit
+ neg ecx ; Negative index from the end
+
+H3100: ; copy -ecx bytes in blocks of 32 bytes.
+
+ ; Check for false memory dependence: The CPU may falsely assume
+ ; a partial overlap between the written destination and the following
+ ; read source if source is unaligned and
+ ; (src-dest) modulo 4096 is close to 4096
+ test esi, 1FH
+ jz H3110 ; aligned
+ mov eax, esi
+ sub eax, edi
+ and eax, 0FFFH ; modulo 4096
+ cmp eax, 1000H - 200H
+ ja J3100
+
+H3110: ; main copy loop, 32 bytes at a time
+ ; ecx has negative index from the end, counting up to zero
+ vmovups ymm0, [esi+ecx]
+ vmovaps [edi+ecx], ymm0
+ add ecx, 20H
+ jnz H3110
+ vzeroupper ; end of AVX mode
+
+ ; Move the remaining edx bytes (0 - 31):
+H3120: add esi, edx
+ add edi, edx
+ neg edx
+ jz H3500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg H3200
+ ; move 16 bytes
+ movups xmm0, [esi+edx]
+ movaps [edi+edx], xmm0
+ add edx, 10H
+H3200: cmp edx, -8
+ jg H3210
+ ; move 8 bytes
+ movq xmm0, qword [esi+edx]
+ movq qword [edi+edx], xmm0
+ add edx, 8
+ jz H3500 ; Early skip if count divisible by 8
+H3210: cmp edx, -4
+ jg H3220
+ ; move 4 bytes
+ mov eax, [esi+edx]
+ mov [edi+edx], eax
+ add edx, 4
+H3220: cmp edx, -2
+ jg H3230
+ ; move 2 bytes
+ movzx eax, word [esi+edx]
+ mov [edi+edx], ax
+ add edx, 2
+H3230: cmp edx, -1
+ jg H3500
+ ; move 1 byte
+ movzx eax, byte [esi+edx]
+ mov [edi+edx], al
+H3500: ; finished
+ RETURNM
+
+I3100: ; non-temporal move
+ neg ecx ; Negative index from the end
+align 16
+I3110: ; main copy loop, 32 bytes at a time
+ ; ecx has negative index from the end, counting up to zero
+ vmovups ymm0, [esi+ecx]
+ vmovntps [edi+ecx], ymm0
+ add ecx, 20H
+ jnz I3110
+ vzeroupper ; end of AVX mode
+ jmp H3120 ; Move the remaining edx bytes (0 - 31):
+
+align 16
+J3100: ; There is a false memory dependence.
+ ; check if src and dest overlap, if not then it is safe
+ ; to copy backwards to avoid false memory dependence
+%if 1
+ ; Use this version if you want consistent behavior in the case
+ ; where dest > src and overlap. However, this case is undefined
+ ; anyway because part of src is overwritten before copying
+ push edx
+ mov eax, esi
+ sub eax, edi
+ cdq
+ xor eax, edx
+ sub eax, edx ; abs(src-dest)
+ neg ecx ; size
+ pop edx ; restore edx
+ cmp eax, ecx
+ jnb J3110
+ neg ecx ; restore ecx
+ jmp H3110 ; overlap between src and dest. Can't copy backwards
+%else
+ ; save time by not checking the case that is undefined anyway
+ mov eax, esi
+ sub eax, edi
+ neg ecx ; size
+ cmp eax, ecx
+ jnb J3110 ; OK to copy backwards
+ ; must copy forwards
+ neg ecx ; restore ecx
+ jmp H3110 ; copy forwards
+%endif
+
+J3110: ; copy backwards, ecx = size. esi, edi = end of src, dest
+ push esi
+ push edi
+ sub esi, ecx
+ sub edi, ecx
+J3120: ; loop backwards
+ vmovups ymm1, [esi+ecx-20H]
+ vmovaps [edi+ecx-20H], ymm1
+ sub ecx, 20H
+ jnz J3120
+ vzeroupper
+ pop edi
+ pop esi
+ jmp H3120
+
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ ; multiple CPU versions (SSSE3 and later)
+A1000: add esi, ecx ; end of src
+ add edi, ecx ; end of dest
+ neg ecx ; negative index from the end
+ cmp ecx, -20H
+ jg A1100
+ ; move 32 bytes
+ ; movdqu is faster than movq on all processors with SSSE3
+ movups xmm0, oword [esi+ecx]
+ movups xmm1, oword [esi+ecx+10H]
+ movups oword [edi+ecx], xmm0
+ movups oword [edi+ecx+10H], xmm1
+ add ecx, 20H
+A1100: cmp ecx, -10H
+ jg A1200
+ ; move 16 bytes
+ movups xmm0, oword [esi+ecx]
+ movups oword [edi+ecx], xmm0
+ add ecx, 10H
+A1200: cmp ecx, -8
+ jg A1300
+ ; move 8 bytes
+ movq xmm0, qword [esi+ecx]
+ movq qword [edi+ecx], xmm0
+ add ecx, 8
+A1300: cmp ecx, -4
+ jg A1400
+ ; move 4 bytes
+ mov eax, [esi+ecx]
+ mov [edi+ecx], eax
+ add ecx, 4
+ jz A1900 ; early out if count divisible by 4
+A1400: cmp ecx, -2
+ jg A1500
+ ; move 2 bytes
+ movzx eax, word [esi+ecx]
+ mov [edi+ecx], ax
+ add ecx, 2
+A1500: cmp ecx, -1
+ jg A1900
+ ; move 1 byte
+ movzx eax, byte [esi+ecx]
+ mov [edi+ecx], al
+A1900: ; finished
+ RETURNM
+
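The branch to J3100 above can be restated compactly in C. The sketch below is not asmlib source; the function name is illustrative and it follows the 32-byte (ymm) path, where the alignment test uses 1FH: run the block loop backwards only when the source is unaligned, (src - dest) mod 4096 falls in the last 512 bytes, and the buffers do not overlap.

    #include <stddef.h>
    #include <stdint.h>

    int should_copy_backwards(const void *dest, const void *src, size_t count)
    {
        uintptr_t d = (uintptr_t)dest, s = (uintptr_t)src;

        if ((s & 0x1F) == 0)                      /* src aligned by 32: no 4K-aliasing risk */
            return 0;
        if (((s - d) & 0xFFF) <= 0x1000 - 0x200)  /* (src-dest) mod 4096 not close to 4096  */
            return 0;
        /* J3100: backwards copy is only safe when src and dest do not overlap */
        uintptr_t dist = s > d ? s - d : d - s;   /* abs(src - dest)                        */
        return dist >= count;                     /* otherwise the loop must stay forward   */
    }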
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with fast unaligned read and fast 16 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_memcpyU: ; global label
+%IFDEF POSITIONINDEPENDENT
+ call get_thunk_edx
+ add edx, RP-$
+%ENDIF
+memcpyU@: ; local label
+ PROLOGM
+ cmp ecx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B2100 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ test dl, 3
+ jz B2030
+ test dl, 1
+ jz B2020
+ ; move 1 byte
+ movzx eax, byte [esi]
+ mov [edi], al
+ inc esi
+ inc edi
+B2020: test dl, 2
+ jz B2030
+ ; move 2 bytes
+ movzx eax, word [esi]
+ mov [edi], ax
+ add esi, 2
+ add edi, 2
+B2030: test dl, 4
+ jz B2040
+ ; move 4 bytes
+ mov eax, [esi]
+ mov [edi], eax
+ add esi, 4
+ add edi, 4
+B2040: test dl, 8
+ jz B2050
+ ; move 8 bytes
+ movq xmm0, qword [esi]
+ movq qword [edi], xmm0
+ add esi, 8
+ add edi, 8
+B2050: sub ecx, edx
+B2100: ; Now dest is aligned by 16. Any partial block has been moved
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count
+ and ecx, -20H ; Round down to nearest multiple of 32
+ add esi, ecx ; Point to the end
+ add edi, ecx ; Point to the end
+ sub edx, ecx ; Remaining data after loop
+
+ ; Check if count very big
+%IFNDEF POSITIONINDEPENDENT
+ ; Check if count very big
+ cmp ecx, [_CacheBypassLimit]
+%ELSE
+ cmp ecx, [ebx+_CacheBypassLimit-RP]
+%ENDIF
+ ja I100 ; Use non-temporal store if count > CacheBypassLimit
+ neg ecx ; Negative index from the end
+
+H100: ; copy -ecx bytes in blocks of 32 bytes.
+
+ ; Check for false memory dependence: The CPU may falsely assume
+ ; a partial overlap between the written destination and the following
+ ; read source if source is unaligned and
+ ; (src-dest) modulo 4096 is close to 4096
+ test esi, 0FH
+ jz H110 ; aligned
+ mov eax, esi
+ sub eax, edi
+ and eax, 0FFFH ; modulo 4096
+ cmp eax, 1000H - 200H
+ ja J100
+
+H110: ; main copy loop, 32 bytes at a time
+ ; ecx has negative index from the end, counting up to zero
+ movups xmm0, [esi+ecx]
+ movups xmm1, [esi+ecx+10H]
+ movaps [edi+ecx], xmm0
+ movaps [edi+ecx+10H], xmm1
+ add ecx, 20H
+ jnz H110
+
+ ; Move the remaining edx bytes (0 - 31):
+H120: add esi, edx
+ add edi, edx
+ neg edx
+ jz H500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg H200
+ ; move 16 bytes
+ movups xmm0, [esi+edx]
+ movaps [edi+edx], xmm0
+ add edx, 10H
+H200: cmp edx, -8
+ jg H210
+ ; move 8 bytes
+ movq xmm0, qword [esi+edx]
+ movq qword [edi+edx], xmm0
+ add edx, 8
+ jz H500 ; Early skip if count divisible by 8
+H210: cmp edx, -4
+ jg H220
+ ; move 4 bytes
+ mov eax, [esi+edx]
+ mov [edi+edx], eax
+ add edx, 4
+H220: cmp edx, -2
+ jg H230
+ ; move 2 bytes
+ movzx eax, word [esi+edx]
+ mov [edi+edx], ax
+ add edx, 2
+H230: cmp edx, -1
+ jg H500
+ ; move 1 byte
+ movzx eax, byte [esi+edx]
+ mov [edi+edx], al
+H500: ; finished
+ RETURNM
+
+I100: ; non-temporal move
+ neg ecx ; Negative index from the end
+align 16
+I110: ; main copy loop, 32 bytes at a time
+ ; ecx has negative index from the end, counting up to zero
+ movups xmm0, [esi+ecx]
+ movups xmm1, [esi+ecx+10H]
+ movntps [edi+ecx], xmm0
+ movntps [edi+ecx+10H], xmm1
+ add ecx, 20H
+ jnz I110
+ jmp H120 ; Move the remaining edx bytes (0 - 31):
+
+align 16
+J100: ; There is a false memory dependence.
+ ; check if src and dest overlap, if not then it is safe
+ ; to copy backwards to avoid false memory dependence
+%if 1
+ ; Use this version if you want consistent behavior in the case
+ ; where dest > src and overlap. However, this case is undefined
+ ; anyway because part of src is overwritten before copying
+ push edx
+ mov eax, esi
+ sub eax, edi
+ cdq
+ xor eax, edx
+ sub eax, edx ; abs(src-dest)
+ neg ecx ; size
+ pop edx ; restore edx
+ cmp eax, ecx
+ jnb J110
+ neg ecx ; restore ecx
+ jmp H110 ; overlap between src and dest. Can't copy backwards
+%else
+ ; save time by not checking the case that is undefined anyway
+ mov eax, esi
+ sub eax, edi
+ neg ecx ; size
+ cmp eax, ecx
+ jnb J110 ; OK to copy backwards
+ ; must copy forwards
+ neg ecx ; restore ecx
+ jmp H110 ; copy forwards
+%endif
+
+J110: ; copy backwards, ecx = size. esi, edi = end of src, dest
+ push esi
+ push edi
+ sub esi, ecx
+ sub edi, ecx
+J120: ; loop backwards
+ movups xmm1, [esi+ecx-20H]
+ movups xmm0, [esi+ecx-10H]
+ movaps [edi+ecx-20H], xmm1
+ movaps [edi+ecx-10H], xmm0
+ sub ecx, 20H
+ jnz J120
+ pop edi
+ pop esi
+ jmp H120
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSSE3. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_memcpySSSE3: ; global label
+%IFDEF POSITIONINDEPENDENT
+ call get_thunk_edx
+ add edx, RP-$
+%ENDIF
+memcpySSSE3@: ; local label
+ PROLOGM
+ cmp ecx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; This part will not always work if count < 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B1200 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ test dl, 3
+ jz B1120
+ test edx, 1
+ jz B1110
+ ; move 1 byte
+ movzx eax, byte [esi]
+ mov [edi], al
+ inc esi
+ inc edi
+B1110: test dl, 2
+ jz B1120
+ ; move 2 bytes
+ movzx eax, word [esi]
+ mov [edi], ax
+ add esi, 2
+ add edi, 2
+B1120: test dl, 4
+ jz B1130
+ ; move 4 bytes
+ mov eax, [esi]
+ mov [edi], eax
+ add esi, 4
+ add edi, 4
+B1130: test dl, 8
+ jz B1140
+ ; move 8 bytes
+ movq xmm0, qword [esi]
+ movq qword [edi], xmm0
+ add esi, 8
+ add edi, 8
+B1140: sub ecx, edx
+
+B1200: ; Now dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of src modulo 16 at this point:
+ mov eax, esi
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count
+ and ecx, -20H ; Round down to nearest multiple of 32
+ add esi, ecx ; Point to the end
+ add edi, ecx ; Point to the end
+ sub edx, ecx ; Remaining data after loop
+ sub esi, eax ; Nearest preceding aligned block of src
+
+%IFNDEF POSITIONINDEPENDENT
+ ; Check if count very big
+ cmp ecx, [_CacheBypassLimit]
+ ja B1400 ; Use non-temporal store if count > _CacheBypassLimit
+ neg ecx ; Negative index from the end
+
+ ; Dispatch to different codes depending on src alignment
+ jmp dword [AlignmentDispatchSSSE3+eax*4]
+
+B1400: neg ecx
+ ; Dispatch to different codes depending on src alignment
+ jmp dword [AlignmentDispatchNT+eax*4]
+
+%ELSE ; Position-independent code
+
+ ; Check if count very big
+ ; Make the following instruction with address relative to RP:
+ cmp ecx, [ebx-RP+_CacheBypassLimit]
+ ja B1400 ; Use non-temporal store if count > _CacheBypassLimit
+ neg ecx ; Negative index from the end
+
+ ; Dispatch to different codes depending on src alignment
+
+ ; AlignmentDispatch table contains addresses relative to RP
+ ; Add table entry to ebx=RP to get jump address.
+
+ ; Make the following instruction with address relative to RP:
+ add ebx, [ebx-RP+AlignmentDispatchSSSE3+eax*4]
+ jmp ebx
+
+B1400: neg ecx
+
+ ; Same with AlignmentDispatchNT:
+ add ebx, [ebx-RP+AlignmentDispatchNT+eax*4]
+ jmp ebx
+%ENDIF
+
+align 16
+C100: ; Code for aligned src. SSE2 and later instruction set
+ ; The nice case, src and dest have same alignment.
+
+ ; Loop. ecx has negative index from the end, counting up to zero
+ movaps xmm0, [esi+ecx]
+ movaps xmm1, [esi+ecx+10H]
+ movaps [edi+ecx], xmm0
+ movaps [edi+ecx+10H], xmm1
+ add ecx, 20H
+ jnz C100
+
+ ; Move the remaining edx bytes (0 - 31):
+ add esi, edx
+ add edi, edx
+ neg edx
+ jz C500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg C200
+ ; move 16 bytes
+ movaps xmm0, [esi+edx]
+ movaps [edi+edx], xmm0
+ add edx, 10H
+C200: cmp edx, -8
+ jg C210
+ ; move 8 bytes
+ movq xmm0, qword [esi+edx]
+ movq qword [edi+edx], xmm0
+ add edx, 8
+ jz C500 ; Early skip if count divisible by 8
+C210: cmp edx, -4
+ jg C220
+ ; move 4 bytes
+ mov eax, [esi+edx]
+ mov [edi+edx], eax
+ add edx, 4
+C220: cmp edx, -2
+ jg C230
+ ; move 2 bytes
+ movzx eax, word [esi+edx]
+ mov [edi+edx], ax
+ add edx, 2
+C230: cmp edx, -1
+ jg C500
+ ; move 1 byte
+ movzx eax, byte [esi+edx]
+ mov [edi+edx], al
+C500: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSE2. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_memcpySSE2: ; global label
+%IFDEF POSITIONINDEPENDENT
+ call get_thunk_edx
+ add edx, RP-$
+%ENDIF
+memcpySSE2@: ; local label
+ PROLOGM
+ cmp ecx, 40H
+ jae B100 ; Jump if count >= 64; the simpler code below handles count < 64
+
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ add esi, ecx ; end of src
+ add edi, ecx ; end of dest
+ neg ecx ; negative index from the end
+ cmp ecx, -20H
+ jg A100
+ ; move 32 bytes
+ ; movq is faster than movdqu on Intel Pentium M and Core 1
+ ; movdqu is fast on Nehalem and later
+ movq xmm0, qword [esi+ecx]
+ movq xmm1, qword [esi+ecx+8]
+ movq xmm2, qword [esi+ecx+10H]
+ movq xmm3, qword [esi+ecx+18H]
+ movq qword [edi+ecx], xmm0
+ movq qword [edi+ecx+8], xmm1
+ movq qword [edi+ecx+10H], xmm2
+ movq qword [edi+ecx+18H], xmm3
+ add ecx, 20H
+A100: cmp ecx, -10H
+ jg A200
+ ; move 16 bytes
+ movq xmm0, qword [esi+ecx]
+ movq xmm1, qword [esi+ecx+8]
+ movq qword [edi+ecx], xmm0
+ movq qword [edi+ecx+8], xmm1
+ add ecx, 10H
+A200: cmp ecx, -8
+ jg A300
+ ; move 8 bytes
+ movq xmm0, qword [esi+ecx]
+ movq qword [edi+ecx], xmm0
+ add ecx, 8
+A300: cmp ecx, -4
+ jg A400
+ ; move 4 bytes
+ mov eax, [esi+ecx]
+ mov [edi+ecx], eax
+ add ecx, 4
+ jz A900 ; early out if count divisible by 4
+A400: cmp ecx, -2
+ jg A500
+ ; move 2 bytes
+ movzx eax, word [esi+ecx]
+ mov [edi+ecx], ax
+ add ecx, 2
+A500: cmp ecx, -1
+ jg A900
+ ; move 1 byte
+ movzx eax, byte [esi+ecx]
+ mov [edi+ecx], al
+A900: ; finished
+ RETURNM
+
+B100: ; count >= 64
+ ; This part will not always work if count < 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B200 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ test dl, 3
+ jz B120
+ test dl, 1
+ jz B110
+ ; move 1 byte
+ movzx eax, byte [esi]
+ mov [edi], al
+ inc esi
+ inc edi
+B110: test dl, 2
+ jz B120
+ ; move 2 bytes
+ movzx eax, word [esi]
+ mov [edi], ax
+ add esi, 2
+ add edi, 2
+B120: test dl, 4
+ jz B130
+ ; move 4 bytes
+ mov eax, [esi]
+ mov [edi], eax
+ add esi, 4
+ add edi, 4
+B130: test dl, 8
+ jz B140
+ ; move 8 bytes
+ movq xmm0, qword [esi]
+ movq qword [edi], xmm0
+ add esi, 8
+ add edi, 8
+B140: sub ecx, edx
+
+B200: ; Now dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of src modulo 16 at this point:
+ mov eax, esi
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count
+ and ecx, -20H ; Round down to nearest multiple of 32
+ add esi, ecx ; Point to the end
+ add edi, ecx ; Point to the end
+ sub edx, ecx ; Remaining data after loop
+ sub esi, eax ; Nearest preceding aligned block of src
+
+%IFNDEF POSITIONINDEPENDENT
+ ; Check if count very big
+ cmp ecx, [_CacheBypassLimit]
+ ja B400 ; Use non-temporal store if count > _CacheBypassLimit
+ neg ecx ; Negative index from the end
+
+ ; Dispatch to different codes depending on src alignment
+ jmp dword [AlignmentDispatchSSE2+eax*4]
+
+B400: neg ecx
+ ; Dispatch to different codes depending on src alignment
+ jmp dword [AlignmentDispatchNT+eax*4]
+
+%ELSE ; Position-independent code
+
+ ; Check if count very big
+ ; Make the following instruction with address relative to RP:
+ cmp ecx, [ebx-RP+_CacheBypassLimit]
+ ja B400 ; Use non-temporal store if count > _CacheBypassLimit
+ neg ecx ; Negative index from the end
+
+ ; Dispatch to different codes depending on src alignment
+
+ ; AlignmentDispatch tables contain addresses relative to RP
+ ; Add table entry to ebx=RP to get jump address.
+
+ ; Make the following instruction with address relative to RP:
+ add ebx, [ebx-RP+AlignmentDispatchSSE2+eax*4]
+ jmp ebx
+
+B400: neg ecx
+
+ ; Same with AlignmentDispatchNT:
+ add ebx, [ebx-RP+AlignmentDispatchNT+eax*4]
+ jmp ebx
+%ENDIF
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Macros and alignment jump tables
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for each src alignment, SSE2 instruction set:
+; Make separate code for each alignment u because the shift instructions
+; have the shift count as a constant:
+
+%MACRO MOVE_UNALIGNED_SSE2 2
+; Move ecx + edx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; %2 = 1 if non-temporal store desired
+; eax = %1
+; esi = src - %1 = nearest preceding 16-bytes boundary
+; edi = dest (aligned)
+; ecx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
+ movdqa xmm0, [esi+ecx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. ecx has negative index from the end, counting up to zero
+ movdqa xmm1, [esi+ecx+10H] ; Read next two blocks aligned
+ movdqa xmm2, [esi+ecx+20H]
+ movdqa xmm3, xmm1 ; Copy because used twice
+ psrldq xmm0, %1 ; shift right
+ pslldq xmm1, 16-%1 ; shift left
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [edi+ecx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [edi+ecx], xmm0 ; non-temporal save
+ %ENDIF
+ movdqa xmm0, xmm2 ; Save for next iteration
+ psrldq xmm3, %1 ; shift right
+ pslldq xmm2, 16-%1 ; shift left
+ por xmm3, xmm2 ; combine blocks
+ %IF %2 == 0
+ movdqa [edi+ecx+10H], xmm3 ; Save aligned
+ %ELSE
+ movntdq [edi+ecx+10H], xmm3 ; non-temporal save
+ %ENDIF
+ add ecx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ add esi, edx
+ add edi, edx
+ neg edx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movdqa xmm1, [esi+edx+10H]
+ psrldq xmm0, %1 ; shift right
+ pslldq xmm1, 16-%1 ; shift left
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [edi+edx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [edi+edx], xmm0 ; non-temporal save
+ %ENDIF
+ add edx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add esi, eax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
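
A minimal C/intrinsics sketch (not part of asmlib) of the shift-and-combine step this macro performs for one 16-byte output block; the function name and the example value of U are illustrative, and U must be a compile-time constant because psrldq/pslldq take an immediate shift count:

    #include <emmintrin.h>

    #define U 5   /* example misalignment, 1-15 */

    /* p must be 16-byte aligned; returns the 16 bytes starting at (char*)p + U */
    __m128i combine_unaligned_sse2(const __m128i *p)
    {
        __m128i lo = _mm_load_si128(p);                  /* aligned block holding the start */
        __m128i hi = _mm_load_si128(p + 1);              /* next aligned block              */
        return _mm_or_si128(_mm_srli_si128(lo, U),       /* psrldq: shift right by U bytes  */
                            _mm_slli_si128(hi, 16 - U)); /* pslldq: shift left by 16-U      */
    }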
+
+%MACRO MOVE_UNALIGNED_SSE2_4 1
+; Special case for u = 4
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [esi+ecx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. ecx has negative index from the end, counting up to zero
+ movaps xmm1, [esi+ecx+10H] ; Read next two blocks aligned
+ movss xmm0, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
+ ;pshufd xmm0, xmm0, 00111001B
+ shufps xmm0, xmm0, 00111001B
+ %IF %1 == 0
+ movaps [edi+ecx], xmm0 ; Save aligned
+ %ELSE
+ movntps [edi+ecx], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [esi+ecx+20H]
+ movss xmm1, xmm0
+ shufps xmm1, xmm1, 00111001B
+ %IF %1 == 0
+ movaps [edi+ecx+10H], xmm1 ; Save aligned
+ %ELSE
+ movntps [edi+ecx+10H], xmm1 ; Non-temporal save
+ %ENDIF
+ add ecx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add esi, edx
+ add edi, edx
+ neg edx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movaps xmm1, [esi+edx+10H] ; Read next two blocks aligned
+ movss xmm0, xmm1
+ shufps xmm0, xmm0, 00111001B
+ %IF %1 == 0
+ movaps [edi+edx], xmm0 ; Save aligned
+ %ELSE
+ movntps [edi+edx], xmm0 ; Non-temporal save
+ %ENDIF
+ add edx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add esi, eax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
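
The u = 4 special case replaces the two byte shifts and the OR with one movss merge and one shufps rotate. A C/intrinsics sketch of one output block (not asmlib source; the name is illustrative):

    #include <xmmintrin.h>

    /* p must be 16-byte aligned; returns the 16 bytes starting at (char*)p + 4 */
    __m128 combine_unaligned_u4(const float *p)
    {
        __m128 lo = _mm_load_ps(p);         /* dwords a0 a1 a2 a3                */
        __m128 hi = _mm_load_ps(p + 4);     /* next aligned block, dwords b0 ... */
        __m128 t  = _mm_move_ss(lo, hi);    /* movss: b0 a1 a2 a3                */
        return _mm_shuffle_ps(t, t, 0x39);  /* 00111001B rotate: a1 a2 a3 b0     */
    }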
+
+%MACRO MOVE_UNALIGNED_SSE2_8 1
+; Special case for u = 8
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [esi+ecx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. ecx has negative index from the end, counting up to zero
+ movaps xmm1, [esi+ecx+10H] ; Read next two blocks aligned
+ movsd xmm0, xmm1 ; Moves 8 bytes, leaves remaining bytes unchanged
+ shufps xmm0, xmm0, 01001110B ; Rotate
+ %IF %1 == 0
+ movaps [edi+ecx], xmm0 ; Save aligned
+ %ELSE
+ movntps [edi+ecx], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [esi+ecx+20H]
+ movsd xmm1, xmm0
+ shufps xmm1, xmm1, 01001110B
+ %IF %1 == 0
+ movaps [edi+ecx+10H], xmm1 ; Save aligned
+ %ELSE
+ movntps [edi+ecx+10H], xmm1 ; Non-temporal save
+ %ENDIF
+ add ecx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add esi, edx
+ add edi, edx
+ neg edx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movaps xmm1, [esi+edx+10H] ; Read next two blocks aligned
+ movsd xmm0, xmm1
+ shufps xmm0, xmm0, 01001110B
+ %IF %1 == 0
+ movaps [edi+edx], xmm0 ; Save aligned
+ %ELSE
+ movntps [edi+edx], xmm0 ; Non-temporal save
+ %ENDIF
+ add edx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add esi, eax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+%MACRO MOVE_UNALIGNED_SSE2_12 1
+; %1 = 1 if non-temporal store desired
+; Special case for u = 12
+ movaps xmm0, [esi+ecx] ; Read from nearest preceding 16B boundary
+ shufps xmm0, xmm0, 10010011B
+%%L1: ; Loop. ecx has negative index from the end, counting up to zero
+ movaps xmm1, [esi+ecx+10H] ; Read next two blocks aligned
+ movaps xmm2, [esi+ecx+20H]
+ shufps xmm1, xmm1, 10010011B
+ shufps xmm2, xmm2, 10010011B
+ movaps xmm3, xmm2
+ movss xmm2, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
+ movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
+ %IF %1 == 0
+ movaps [edi+ecx], xmm1 ; Save aligned
+ movaps [edi+ecx+10H], xmm2 ; Save aligned
+ %ELSE
+ movntps [edi+ecx], xmm1 ; Non-temporal save
+ movntps [edi+ecx+10H], xmm2 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, xmm3 ; Save for next iteration
+ add ecx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add esi, edx
+ add edi, edx
+ neg edx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movaps xmm1, [esi+edx+10H] ; Read next two blocks aligned
+ shufps xmm1, xmm1, 10010011B
+ movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
+ %IF %1 == 0
+ movaps [edi+edx], xmm1 ; Save aligned
+ %ELSE
+ movntps [edi+edx], xmm1 ; Non-temporal save
+ %ENDIF
+ add edx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add esi, eax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+; Macros for each src alignment, Suppl.SSE3 instruction set:
+; Make separate code for each alignment u because the palignr instruction
+; has the shift count as a constant:
+
+%MACRO MOVE_UNALIGNED_SSSE3 1
+; Move ecx + edx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; eax = %1
+; esi = src - %1 = nearest preceding 16-bytes boundary
+; edi = dest (aligned)
+; ecx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
+ movdqa xmm0, [esi+ecx] ; Read from nearest preceding 16B boundary
+
+%%L1: ; Loop. ecx has negative index from the end, counting up to zero
+ movdqa xmm2, [esi+ecx+10H] ; Read next two blocks
+ movdqa xmm3, [esi+ecx+20H]
+ movdqa xmm1, xmm0 ; Save xmm0
+ movdqa xmm0, xmm3 ; Save for next iteration
+ palignr xmm3, xmm2, %1 ; Combine parts into aligned block
+ palignr xmm2, xmm1, %1 ; Combine parts into aligned block
+ movdqa [edi+ecx], xmm2 ; Save aligned
+ movdqa [edi+ecx+10H], xmm3 ; Save aligned
+ add ecx, 20H
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ add esi, edx
+ add edi, edx
+ neg edx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movdqa xmm2, [esi+edx+10H]
+ palignr xmm2, xmm0, %1
+ movdqa [edi+edx], xmm2
+ add edx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add esi, eax
+ ; Move remaining 0 - 15 bytes
+ jmp C200
+%ENDMACRO
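
For comparison with the SSE2 macros above, a C/intrinsics sketch (not asmlib source; the name and the example U are illustrative) of the same combine done with a single palignr:

    #include <tmmintrin.h>

    #define U 5   /* example misalignment, 1-15; palignr needs an immediate */

    /* p must be 16-byte aligned; returns the 16 bytes starting at (char*)p + U */
    __m128i combine_unaligned_ssse3(const __m128i *p)
    {
        __m128i lo = _mm_load_si128(p);     /* aligned block holding the start */
        __m128i hi = _mm_load_si128(p + 1); /* next aligned block              */
        return _mm_alignr_epi8(hi, lo, U);  /* palignr: (hi:lo) >> U bytes     */
    }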
+
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSE2 below
+
+; (aligns are inserted manually to minimize the number of 16-bytes
+; boundaries inside loops in the most common cases)
+
+align 16
+D104: MOVE_UNALIGNED_SSE2_4 0
+D108: MOVE_UNALIGNED_SSE2_8 0
+align 8
+D10C: MOVE_UNALIGNED_SSE2_12 0
+D101: MOVE_UNALIGNED_SSE2 1, 0
+D102: MOVE_UNALIGNED_SSE2 2, 0
+D103: MOVE_UNALIGNED_SSE2 3, 0
+D105: MOVE_UNALIGNED_SSE2 5, 0
+D106: MOVE_UNALIGNED_SSE2 6, 0
+D107: MOVE_UNALIGNED_SSE2 7, 0
+D109: MOVE_UNALIGNED_SSE2 9, 0
+D10A: MOVE_UNALIGNED_SSE2 0AH, 0
+D10B: MOVE_UNALIGNED_SSE2 0BH, 0
+D10D: MOVE_UNALIGNED_SSE2 0DH, 0
+D10E: MOVE_UNALIGNED_SSE2 0EH, 0
+D10F: MOVE_UNALIGNED_SSE2 0FH, 0
+
+; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSSE3 below
+
+align 16
+times 11 nop
+E104: MOVE_UNALIGNED_SSSE3 4
+times 5 nop
+E108: MOVE_UNALIGNED_SSSE3 8
+times 5 nop
+E10C: MOVE_UNALIGNED_SSSE3 0CH
+times 5 nop
+E101: MOVE_UNALIGNED_SSSE3 1
+times 5 nop
+E102: MOVE_UNALIGNED_SSSE3 2
+times 5 nop
+E103: MOVE_UNALIGNED_SSSE3 3
+times 5 nop
+E105: MOVE_UNALIGNED_SSSE3 5
+times 5 nop
+E106: MOVE_UNALIGNED_SSSE3 6
+times 5 nop
+E107: MOVE_UNALIGNED_SSSE3 7
+times 5 nop
+E109: MOVE_UNALIGNED_SSSE3 9
+times 5 nop
+E10A: MOVE_UNALIGNED_SSSE3 0AH
+times 5 nop
+E10B: MOVE_UNALIGNED_SSSE3 0BH
+times 5 nop
+E10D: MOVE_UNALIGNED_SSSE3 0DH
+times 5 nop
+E10E: MOVE_UNALIGNED_SSSE3 0EH
+times 5 nop
+E10F: MOVE_UNALIGNED_SSSE3 0FH
+
+; Codes for non-temporal move. Aligned case first
+
+align 8
+F100: ; Non-temporal move, src and dest have same alignment.
+ ; Loop. ecx has negative index from the end, counting up to zero
+ movaps xmm0, [esi+ecx] ; Read
+ movaps xmm1, [esi+ecx+10H]
+ movntps [edi+ecx], xmm0 ; Write non-temporal (bypass cache)
+ movntps [edi+ecx+10H], xmm1
+ add ecx, 20H
+ jnz F100 ; Loop through negative ecx up to zero
+
+ ; Move the remaining edx bytes (0 - 31):
+ add esi, edx
+ add edi, edx
+ neg edx
+ jz C500 ; Skip if no more data
+ ; Check if we can move one more 16-bytes block
+ cmp edx, -10H
+ jg C200
+ ; move 16 bytes, aligned
+ movaps xmm0, [esi+edx]
+ movntps [edi+edx], xmm0
+ add edx, 10H
+ ; move the remaining 0 - 15 bytes
+ jmp C200
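
The F100 path above is the cache-bypass case. A short C/intrinsics sketch of the same idea (not asmlib source; the function name is illustrative): non-temporal stores write around the cache so a very large copy does not evict the working set.

    #include <emmintrin.h>
    #include <stddef.h>

    /* dest must be 16-byte aligned; count is a byte count, a multiple of 16 here */
    void copy_nontemporal(float *dest, const float *src, size_t count)
    {
        for (size_t i = 0; i < count / 16; i++) {
            __m128 x = _mm_loadu_ps(src + 4 * i);  /* read normally               */
            _mm_stream_ps(dest + 4 * i, x);        /* movntps: write around cache */
        }
        _mm_sfence();                              /* order the streamed stores   */
    }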
+
+; Make 15 instances of MOVE_UNALIGNED_SSE2 macro for each value of
+; the alignment u.
+; These are pointed to by the jump table AlignmentDispatchNT below
+
+;align 16
+F104: MOVE_UNALIGNED_SSE2_4 1
+F108: MOVE_UNALIGNED_SSE2_8 1
+F10C: MOVE_UNALIGNED_SSE2_12 1
+F101: MOVE_UNALIGNED_SSE2 1, 1
+F102: MOVE_UNALIGNED_SSE2 2, 1
+F103: MOVE_UNALIGNED_SSE2 3, 1
+F105: MOVE_UNALIGNED_SSE2 5, 1
+F106: MOVE_UNALIGNED_SSE2 6, 1
+F107: MOVE_UNALIGNED_SSE2 7, 1
+F109: MOVE_UNALIGNED_SSE2 9, 1
+F10A: MOVE_UNALIGNED_SSE2 0AH, 1
+F10B: MOVE_UNALIGNED_SSE2 0BH, 1
+F10D: MOVE_UNALIGNED_SSE2 0DH, 1
+F10E: MOVE_UNALIGNED_SSE2 0EH, 1
+F10F: MOVE_UNALIGNED_SSE2 0FH, 1
+
+%IFDEF POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+ mov edx, [esp]
+ ret
+%ENDIF
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for old processors without SSE2
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 8
+; 80386 version used when SSE2 not supported:
+_memcpy386: ; global label
+memcpy386@: ; local label
+ PROLOGM
+; edi = dest
+; esi = src
+; ecx = count
+ cld
+ cmp ecx, 8
+ jb G500
+G100: test edi, 1
+ jz G200
+ movsb
+ dec ecx
+G200: test edi, 2
+ jz G300
+ movsw
+ sub ecx, 2
+G300: ; edi is aligned now
+ mov edx, ecx
+ shr ecx, 2
+ rep movsd ; move 4 bytes at a time
+ mov ecx, edx
+ and ecx, 3
+ rep movsb ; move remaining 0-3 bytes
+ RETURNM
+
+G500: ; count < 8. Move one byte at a time
+ rep movsb ; move count bytes
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; CPU dispatcher
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CPU dispatching for memcpy. This is executed only once
+memcpyCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ pushad
+ ; set _CacheBypassLimit to half the size of the largest level cache
+ call GetMemcpyCacheLimit@
+ ; get supported instruction set
+ call _InstructionSet
+ ; Point to generic version of memcpy
+ mov esi, memcpy386@
+ cmp eax, 4 ; check SSE2
+ jb Q100
+ ; SSE2 supported
+ ; Point to SSE2 version of memcpy
+ mov esi, memcpySSE2@
+ cmp eax, 6 ; check Suppl-SSE3
+ jb Q100
+ ; Suppl-SSE3 supported
+ ; Point to SSSE3 version of memcpy
+ mov esi, memcpySSSE3@
+ call _UnalignedIsFaster ; Test if unaligned read is faster than aligned read and shift
+ test eax, eax
+ jz Q100
+ ; Point to unaligned version of memcpy
+ mov esi, memcpyU@
+ call _Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
+ test eax, eax
+ jz Q100
+ mov esi, memcpyU256@
+Q100:
+ mov [memcpyDispatch], esi
+ popad
+ ; Continue in appropriate version of memcpy
+ jmp [memcpyDispatch]
+
+%ELSE ; Position-independent version
+ pushad
+ mov ebx, edx ; reference point
+ ; set _CacheBypassLimit to half the size of the largest level cache
+ call GetMemcpyCacheLimit@
+ ; get supported instruction set
+ call _InstructionSet
+ ; Point to generic version of memcpy
+ lea esi, [ebx+memcpy386@-RP]
+ cmp eax, 4 ; check SSE2
+ jb Q100
+ ; SSE2 supported
+ ; Point to SSE2 version of memcpy
+ lea esi, [ebx+memcpySSE2@-RP]
+ cmp eax, 6 ; check Suppl-SSE3
+ jb Q100
+ ; Suppl-SSE3 supported
+ ; Point to SSSE3 version of memcpy
+ lea esi, [ebx+memcpySSSE3@-RP]
+ call _UnalignedIsFaster ; Test if unaligned read is faster than aligned read and shift
+ test eax, eax
+ jz Q100
+ ; Point to unaligned version of memcpy
+ lea esi, [ebx+memcpyU@-RP]
+ call _Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
+ test eax, eax
+ jz Q100
+ lea esi, [ebx+memcpyU256@-RP]
+Q100: ; insert appropriate pointer
+ mov dword [ebx+memcpyDispatch-RP], esi
+ popad
+ ; Continue in appropriate version of memcpy
+ jmp [edx+memcpyDispatch-RP]
+%ENDIF
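
The dispatch mechanism is easy to restate in C: the public entry jumps through a pointer that initially targets a one-shot dispatcher, and the dispatcher overwrites the pointer with the chosen implementation. The sketch below is not asmlib source; the stand-in bodies and names are illustrative, the thresholds (4 = SSE2, 6 = Suppl-SSE3) follow the comparisons above, and the int-returning prototype for InstructionSet is assumed from the eax comparisons.

    #include <stddef.h>

    extern int InstructionSet(void);                 /* asmlib CPU feature level, linked from asmlib */

    typedef void *(*memcpy_fn)(void *, const void *, size_t);

    /* Stand-in bodies so the sketch links; in asmlib these are memcpy386, memcpySSE2, ... */
    static void *memcpy_generic(void *d, const void *s, size_t n)
    {
        char *dp = d; const char *sp = s;
        while (n--) *dp++ = *sp++;
        return d;
    }
    static void *memcpy_sse2(void *d, const void *s, size_t n)  { return memcpy_generic(d, s, n); }
    static void *memcpy_ssse3(void *d, const void *s, size_t n) { return memcpy_generic(d, s, n); }

    static void *memcpy_dispatch(void *d, const void *s, size_t n);
    static memcpy_fn memcpy_ptr = memcpy_dispatch;   /* like memcpyDispatch in the data section */

    static void *memcpy_dispatch(void *d, const void *s, size_t n)
    {
        int iset = InstructionSet();                 /* executed only once         */
        memcpy_fn f = memcpy_generic;
        if (iset >= 4) f = memcpy_sse2;              /* SSE2 supported             */
        if (iset >= 6) f = memcpy_ssse3;             /* Suppl-SSE3 supported       */
        memcpy_ptr = f;                              /* replace the dispatcher     */
        return f(d, s, n);                           /* continue in chosen version */
    }

    void *sketch_memcpy(void *d, const void *s, size_t n)
    {
        return memcpy_ptr(d, s, n);                  /* jmp [memcpyDispatch]       */
    }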
+
+; extern "C" size_t GetMemcpyCacheLimit();
+_GetMemcpyCacheLimit:
+GetMemcpyCacheLimit@: ; local label
+ push ebx
+%ifdef POSITIONINDEPENDENT
+ call get_thunk_edx
+ lea ebx, [edx + _CacheBypassLimit - $]
+%else
+ mov ebx, _CacheBypassLimit
+%endif
+ mov eax, [ebx]
+ test eax, eax
+ jnz U200
+ ; Get half the size of the largest level cache
+ push 0 ; 0 means largest level cache
+ call _DataCacheSize ; get cache size
+ pop ecx
+ shr eax, 1 ; half the size
+ jnz U100
+ mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
+U100: mov [ebx], eax
+U200: pop ebx
+ ret
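
The default computed above (half the largest data cache, or 4 Mbytes when the size cannot be determined) can be written out as a C sketch; this is not asmlib source, and the DataCacheSize prototype is assumed from the way the asm code calls it (one integer argument, 0 meaning the largest level cache).

    #include <stddef.h>

    extern size_t DataCacheSize(int level);      /* asmlib cache probe; 0 = largest level */

    size_t default_cache_bypass_limit(void)
    {
        size_t limit = DataCacheSize(0) / 2;     /* half the largest cache */
        if (limit == 0)
            limit = 0x400000;                    /* unknown: use 4 Mbytes  */
        return limit;
    }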
+
+; Called internally from _SetMemcpyCacheLimit defined in memmove32.asm
+; Must return the value set
+_SetMemcpyCacheLimit1:
+ push ebx
+%ifdef POSITIONINDEPENDENT
+ call get_thunk_edx
+ lea ebx, [edx + _CacheBypassLimit - $]
+%else
+ mov ebx, _CacheBypassLimit
+%endif
+ mov eax, [esp+8]
+ test eax, eax
+ jnz U400
+ ; zero, means default
+ mov [ebx], eax
+ call GetMemcpyCacheLimit@
+U400:
+ mov [ebx], eax
+ pop ebx
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; getDispatch, for testing only
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+getDispatch:
+mov eax,[memcpyDispatch]
+ret
+
+global getDispatch
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; data section. jump tables, dispatch function pointer, cache size
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Data segment must be included in function namespace
+SECTION .data
+align 16
+
+; Jump tables for alignments 0 - 15:
+; The SSSE3 version of memcpy uses AlignmentDispatchSSSE3 instead of
+; AlignmentDispatchSSE2 when Suppl-SSE3 is supported
+; RP = reference point if position-independent code, otherwise RP = 0
+
+; Code pointer for each alignment for SSE2 instruction set
+AlignmentDispatchSSE2:
+DD C100-RP, D101-RP, D102-RP, D103-RP, D104-RP, D105-RP, D106-RP, D107-RP
+DD D108-RP, D109-RP, D10A-RP, D10B-RP, D10C-RP, D10D-RP, D10E-RP, D10F-RP
+
+; Code pointer for each alignment for Suppl.SSE3 instruction set
+AlignmentDispatchSSSE3:
+DD C100-RP, E101-RP, E102-RP, E103-RP, E104-RP, E105-RP, E106-RP, E107-RP
+DD E108-RP, E109-RP, E10A-RP, E10B-RP, E10C-RP, E10D-RP, E10E-RP, E10F-RP
+
+; Code pointer for each alignment for non-temporal store
+AlignmentDispatchNT:
+DD F100-RP, F101-RP, F102-RP, F103-RP, F104-RP, F105-RP, F106-RP, F107-RP
+DD F108-RP, F109-RP, F10A-RP, F10B-RP, F10C-RP, F10D-RP, F10E-RP, F10F-RP
+
+
+; Pointer to appropriate version.
+; This initially points to memcpyCPUDispatch. memcpyCPUDispatch will
+; change this to the appropriate version of memcpy, so that
+; memcpyCPUDispatch is only executed once:
+memcpyDispatch: DD memcpyCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > _CacheBypassLimit
+; The optimal value of _CacheBypassLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache:
+_CacheBypassLimit: DD 0
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+ DD 0, 0
+%ENDIF
diff --git a/asmlibSrc/memcpy64.asm b/asmlibSrc/memcpy64.asm
new file mode 100755
index 0000000..e112153
--- /dev/null
+++ b/asmlibSrc/memcpy64.asm
@@ -0,0 +1,1313 @@
+;************************* memcpy64.asm ************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2013-09-11
+;
+; Description:
+; Faster version of the standard memcpy function:
+; void * A_memcpy(void *dest, const void *src, size_t count);
+; Copies 'count' bytes from 'src' to 'dest'
+;
+; Overriding standard function memcpy:
+; The alias ?OVR_memcpy is changed to _memcpy in the object file if
+; it is desired to override the standard library function memcpy.
+;
+; The function uses non-temporal writes to bypass the cache when the size is
+; bigger than half the size of the largest level cache. This limit can be
+; read with GetMemcpyCacheLimit and changed with SetMemcpyCacheLimit
+; C++ prototypes:
+; extern "C" size_t GetMemcpyCacheLimit(); // in memcpy64.asm
+; extern "C" void SetMemcpyCacheLimit(); // in memmove64.asm
+; extern "C" void SetMemcpyCacheLimit1(); // used internally
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included SSE2, Suppl-SSE3 and AVX instruction sets.
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
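
A short C usage sketch of the interface described in this header; it is not part of the file, and in a real build the declarations come from asmlib's own header and the functions from the asmlib library:

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    extern void  *A_memcpy(void *dest, const void *src, size_t count);
    extern size_t GetMemcpyCacheLimit(void);

    int main(void)
    {
        char src[256], dest[256];
        memset(src, 'x', sizeof src);

        A_memcpy(dest, src, sizeof src);         /* drop-in replacement for memcpy */
        printf("cache bypass limit: %zu bytes\n", GetMemcpyCacheLimit());
        return 0;
    }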
+
+default rel
+
+global A_memcpy: function ; Function A_memcpy
+global ?OVR_memcpy: function ; ?OVR removed if standard function memcpy overridden
+global memcpySSE2: function ; Version for processors with only SSE2
+global memcpySSSE3: function ; Version for processors with SSSE3
+global memcpyU: function ; Version for processors with fast unaligned read
+global memcpyU256: function ; Version for processors with fast 256-bit read/write
+
+global GetMemcpyCacheLimit: function ; Get the size limit for bypassing cache when copying with memcpy and memmove
+global SetMemcpyCacheLimit1: function ; Set the size limit for bypassing cache when copying with memcpy
+
+
+; Imported from instrset64.asm
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster64.asm:
+extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
+extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Imported from cachesize64.asm:
+extern DataCacheSize ; Gets size of data cache
+
+
+; Define prolog for this function
+%MACRO PROLOGM 0
+%IFDEF WINDOWS
+ push rsi
+ push rdi
+ mov rdi, rcx ; dest
+ mov r9, rcx ; dest
+ mov rsi, rdx ; src
+ mov rcx, r8 ; count
+%ELSE ; Unix
+ mov rcx, rdx ; count
+ mov r9, rdi ; dest
+%ENDIF
+%ENDM
+
+; Define return from this function
+%MACRO RETURNM 0
+%IFDEF WINDOWS
+ pop rdi
+ pop rsi
+%ENDIF
+ mov rax, r9 ; Return value = dest
+ ret
+%ENDM
+
+
+SECTION .text align=16
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Common entry for dispatch
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; extern "C" void * A_memcpy(void * dest, const void * src, size_t count);
+; Function entry:
+A_memcpy:
+?OVR_memcpy:
+ jmp qword [memcpyDispatch] ; Go to appropriate version, depending on instruction set
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; AVX Version for processors with fast unaligned read and fast 32 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memcpyU256: ; global label
+memcpyU256@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 1FH
+ jz B3100 ; Skip if dest aligned by 32
+
+ ; edx = size of first partial block, 1 - 31 bytes
+ test dl, 3
+ jz B3030
+ test dl, 1
+ jz B3020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B3020: test dl, 2
+ jz B3030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B3030: test dl, 4
+ jz B3040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B3040: test dl, 8
+ jz B3050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B3050: test dl, 16
+ jz B3060
+ ; move 16 bytes
+ movups xmm0, [rsi]
+ movaps [rdi], xmm0
+ add rsi, 16
+ add rdi, 16
+B3060: sub rcx, rdx
+
+B3100: ; Now dest is aligned by 32. Any partial block has been moved
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov rdx, rcx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub rdx, rcx ; Remaining data after loop
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja I3100 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+H3100: ; copy -rcx bytes in blocks of 32 bytes.
+
+ ; Check for false memory dependence: The CPU may falsely assume
+ ; a partial overlap between the written destination and the following
+ ; read source if source is unaligned and
+ ; (src-dest) modulo 4096 is close to 4096
+ test sil, 1FH
+ jz H3110 ; aligned
+ mov eax, esi
+ sub eax, edi
+ and eax, 0FFFH ; modulo 4096
+ cmp eax, 1000H - 200H
+ ja J3100
+
+align 16
+H3110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ vmovups ymm0, [rsi+rcx]
+ vmovaps [rdi+rcx], ymm0
+ add rcx, 20H
+ jnz H3110
+ vzeroupper ; end of AVX mode
+
+H3120: ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz H3500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg H3200
+ ; move 16 bytes
+ movups xmm0, [rsi+rdx]
+ movaps [rdi+rdx], xmm0
+ add rdx, 10H
+H3200: cmp edx, -8
+ jg H3210
+ ; move 8 bytes
+ movq xmm0, qword [rsi+rdx]
+ movq qword [rdi+rdx], xmm0
+ add rdx, 8
+ jz H500 ; Early skip if count divisible by 8
+H3210: cmp edx, -4
+ jg H3220
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+H3220: cmp edx, -2
+ jg H3230
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+H3230: cmp edx, -1
+ jg H3500
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+H3500: ; finished
+ RETURNM
+
+I3100: ; non-temporal move
+ neg rcx ; Negative index from the end
+
+align 16
+I3110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ vmovups ymm0, [rsi+rcx]
+ vmovntps [rdi+rcx], ymm0
+ add rcx, 20H
+ jnz I3110
+ vzeroupper ; end of AVX mode
+ jmp H3120 ; Move the remaining edx bytes (0 - 31)
+
+
+align 16
+J3100: ; There is a false memory dependence.
+ ; check if src and dest overlap, if not then it is safe
+ ; to copy backwards to avoid false memory dependence
+%if 1
+ ; Use this version if you want consistent behavior in the case
+ ; where dest > src and overlap. However, this case is undefined
+ ; anyway because part of src is overwritten before copying
+ push rdx
+ mov rax, rsi
+ sub rax, rdi
+ cqo
+ xor rax, rdx
+ sub rax, rdx ; abs(src-dest)
+ neg rcx ; size
+ pop rdx ; restore rdx
+ cmp rax, rcx
+ jnb J3110
+ neg rcx ; restore rcx
+ jmp H3110 ; overlap between src and dest. Can't copy backwards
+%else
+ ; save time by not checking the case that is undefined anyway
+ mov rax, rsi
+ sub rax, rdi
+ neg rcx ; size
+ cmp rax, rcx
+ jnb J3110 ; OK to copy backwards
+ ; must copy forwards
+ neg rcx ; restore rcx
+ jmp H3110 ; copy forwards
+
+%endif
+
+J3110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
+ push rsi
+ push rdi
+ sub rsi, rcx
+ sub rdi, rcx
+J3120: ; loop backwards
+ vmovups ymm0, [rsi+rcx-20H]
+ vmovaps [rdi+rcx-20H], ymm0
+ sub rcx, 20H
+ jnz J3120
+ vzeroupper
+ pop rdi
+ pop rsi
+ jmp H3120
+
+align 16
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ ; multiple CPU versions (SSSE3 and above)
+A1000: add rsi, rcx ; end of src
+ add rdi, rcx ; end of dest
+ neg rcx ; negative index from the end
+ cmp ecx, -20H
+ jg A1100
+ ; move 32 bytes
+ ; movdqu is faster than 64-bit moves on processors with SSSE3
+ movups xmm0, [rsi+rcx]
+ movups xmm1, [rsi+rcx+10H]
+ movups [rdi+rcx], xmm0
+ movups [rdi+rcx+10H], xmm1
+ add rcx, 20H
+A1100: cmp ecx, -10H
+ jg A1200
+ ; move 16 bytes
+ movups xmm0, [rsi+rcx]
+ movups [rdi+rcx], xmm0
+ add rcx, 10H
+A1200: cmp ecx, -8
+ jg A1300
+ ; move 8 bytes
+ mov rax, qword [rsi+rcx]
+ mov qword [rdi+rcx], rax
+ add rcx, 8
+A1300: cmp ecx, -4
+ jg A1400
+ ; move 4 bytes
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ add rcx, 4
+ jz A1900 ; early out if count divisible by 4
+A1400: cmp ecx, -2
+ jg A1500
+ ; move 2 bytes
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+ add rcx, 2
+A1500: cmp ecx, -1
+ jg A1900
+ ; move 1 byte
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+A1900: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with fast unaligned read and fast 16 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memcpyU: ; global label
+memcpyU@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B2100 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ test dl, 3
+ jz B2030
+ test dl, 1
+ jz B2020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B2020: test dl, 2
+ jz B2030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B2030: test dl, 4
+ jz B2040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B2040: test dl, 8
+ jz B2050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B2050: sub rcx, rdx
+B2100: ; Now dest is aligned by 16. Any partial block has been moved
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov rdx, rcx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub rdx, rcx ; Remaining data after loop
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja I100 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+H100: ; copy -rcx bytes in blocks of 32 bytes.
+
+ ; Check for false memory dependence: The CPU may falsely assume
+ ; a partial overlap between the written destination and the following
+ ; read source if source is unaligned and
+ ; (src-dest) modulo 4096 is close to 4096
+ test sil, 0FH
+ jz H110 ; aligned
+ mov eax, esi
+ sub eax, edi
+ and eax, 0FFFH ; modulo 4096
+ cmp eax, 1000H - 200H
+ ja J100
+
+H110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ movups xmm0, [rsi+rcx]
+ movups xmm1, [rsi+rcx+10H]
+ movaps [rdi+rcx], xmm0
+ movaps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz H110
+
+H120: ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz H500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg H200
+ ; move 16 bytes
+ movups xmm0, [rsi+rdx]
+ movaps [rdi+rdx], xmm0
+ add rdx, 10H
+H200: cmp edx, -8
+ jg H210
+ ; move 8 bytes
+ movq xmm0, qword [rsi+rdx]
+ movq qword [rdi+rdx], xmm0
+ add rdx, 8
+ jz H500 ; Early skip if count divisible by 8
+H210: cmp edx, -4
+ jg H220
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+H220: cmp edx, -2
+ jg H230
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+H230: cmp edx, -1
+ jg H500
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+H500: ; finished
+ RETURNM
+
+I100: ; non-temporal move
+ neg rcx ; Negative index from the end
+
+align 16
+I110: ; main copy loop, 32 bytes at a time
+ ; rcx has negative index from the end, counting up to zero
+ movups xmm0, [rsi+rcx]
+ movups xmm1, [rsi+rcx+10H]
+ movntps [rdi+rcx], xmm0
+ movntps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz I110
+ jmp H120 ; Move the remaining edx bytes (0 - 31):
+
+
+align 16
+J100: ; There is a false memory dependence.
+ ; check if src and dest overlap, if not then it is safe
+ ; to copy backwards to avoid false memory dependence
+%if 1
+ ; Use this version if you want consistent behavior in the case
+ ; where dest > src and overlap. However, this case is undefined
+ ; anyway because part of src is overwritten before copying
+ push rdx
+ mov rax, rsi
+ sub rax, rdi
+ cqo
+ xor rax, rdx
+ sub rax, rdx ; abs(src-dest)
+ neg rcx ; size
+ pop rdx ; restore rdx
+ cmp rax, rcx
+ jnb J110
+ neg rcx ; restore rcx
+ jmp H110 ; overlap between src and dest. Can't copy backwards
+%else
+ ; save time by not checking the case that is undefined anyway
+ mov rax, rsi
+ sub rax, rdi
+ neg rcx ; size
+ cmp rax, rcx
+ jnb J110 ; OK to copy backwards
+ ; must copy forwards
+ neg rcx ; restore rcx
+ jmp H110 ; copy forwards
+
+%endif
+
+J110: ; copy backwards, rcx = size. rsi, rdi = end of src, dest
+ push rsi
+ push rdi
+ sub rsi, rcx
+ sub rdi, rcx
+J120: ; loop backwards
+ movups xmm1, [rsi+rcx-20H]
+ movups xmm0, [rsi+rcx-10H]
+ movaps [rdi+rcx-20H], xmm1
+ movaps [rdi+rcx-10H], xmm0
+ sub rcx, 20H
+ jnz J120
+ pop rdi
+ pop rsi
+ jmp H120
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSSE3. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memcpySSSE3: ; global label
+memcpySSSE3@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B1200 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ test dl, 3
+ jz B1030
+ test dl, 1
+ jz B1020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B1020: test dl, 2
+ jz B1030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B1030: test dl, 4
+ jz B1040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B1040: test dl, 8
+ jz B1050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B1050: sub rcx, rdx
+B1200: ; Now dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of src modulo 16 at this point:
+ mov eax, esi
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count (lower 32 bits)
+ and rcx, -20H ; Round down count to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub edx, ecx ; Remaining data after loop (0-31)
+ sub rsi, rax ; Nearest preceding aligned block of src
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B1400 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchSSSE3]
+ jmp near [r8+rax*8]
+
+B1400: neg rcx
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchNT]
+ jmp near [r8+rax*8]
+
+
+align 16
+C100: ; Code for aligned src. SSE2 and SSSE3 versions
+ ; The nice case, src and dest have same alignment.
+
+ ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm0, [rsi+rcx]
+ movaps xmm1, [rsi+rcx+10H]
+ movaps [rdi+rcx], xmm0
+ movaps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz C100
+
+ ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz C500 ; Skip if no more data
+ ; move 16-8-4-2-1 bytes, aligned
+ cmp edx, -10H
+ jg C200
+ ; move 16 bytes
+ movaps xmm0, [rsi+rdx]
+ movaps [rdi+rdx], xmm0
+ add rdx, 10H
+C200: cmp edx, -8
+ jg C210
+ ; move 8 bytes
+ mov rax, [rsi+rdx]
+ mov [rdi+rdx], rax
+ add rdx, 8
+ jz C500 ; Early skip if count divisible by 8
+C210: cmp edx, -4
+ jg C220
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+C220: cmp edx, -2
+ jg C230
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+C230: cmp edx, -1
+ jg C500
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+C500: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSE2. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+memcpySSE2: ; global label
+memcpySSE2@: ; local label
+ PROLOGM
+ cmp rcx, 40H
+ jae B0100 ; Jump if count >= 64; the simpler code below handles count < 64
+
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ add rsi, rcx ; end of src
+ add rdi, rcx ; end of dest
+ neg rcx ; negative index from the end
+ cmp ecx, -20H
+ jg A100
+ ; move 32 bytes
+ ; mov r64 is faster than movdqu on Intel Pentium M and Core 1
+ ; movdqu is fast on Nehalem and later
+ mov rax, [rsi+rcx]
+ mov rdx, [rsi+rcx+8]
+ mov [rdi+rcx], rax
+ mov [rdi+rcx+8], rdx
+ mov rax, qword [rsi+rcx+10H]
+ mov rdx, qword [rsi+rcx+18H]
+ mov qword [rdi+rcx+10H], rax
+ mov qword [rdi+rcx+18H], rdx
+ add rcx, 20H
+A100: cmp ecx, -10H
+ jg A200
+ ; move 16 bytes
+ mov rax, [rsi+rcx]
+ mov rdx, [rsi+rcx+8]
+ mov [rdi+rcx], rax
+ mov [rdi+rcx+8], rdx
+ add rcx, 10H
+A200: cmp ecx, -8
+ jg A300
+ ; move 8 bytes
+ mov rax, qword [rsi+rcx]
+ mov qword [rdi+rcx], rax
+ add rcx, 8
+A300: cmp ecx, -4
+ jg A400
+ ; move 4 bytes
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ add rcx, 4
+ jz A900 ; early out if count divisible by 4
+A400: cmp ecx, -2
+ jg A500
+ ; move 2 bytes
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+ add rcx, 2
+A500: cmp ecx, -1
+ jg A900
+ ; move 1 byte
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+A900: ; finished
+ RETURNM
+
+B0100: ; count >= 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B0200 ; Skip if dest aligned by 16
+
+ ; edx = size of first partial block, 1 - 15 bytes
+ test dl, 3
+ jz B0030
+ test dl, 1
+ jz B0020
+ ; move 1 byte
+ movzx eax, byte [rsi]
+ mov [rdi], al
+ inc rsi
+ inc rdi
+B0020: test dl, 2
+ jz B0030
+ ; move 2 bytes
+ movzx eax, word [rsi]
+ mov [rdi], ax
+ add rsi, 2
+ add rdi, 2
+B0030: test dl, 4
+ jz B0040
+ ; move 4 bytes
+ mov eax, [rsi]
+ mov [rdi], eax
+ add rsi, 4
+ add rdi, 4
+B0040: test dl, 8
+ jz B0050
+ ; move 8 bytes
+ mov rax, [rsi]
+ mov [rdi], rax
+ add rsi, 8
+ add rdi, 8
+B0050: sub rcx, rdx
+B0200: ; Now dest is aligned by 16. Any partial block has been moved
+
+ ; This part will not always work if count < 64
+ ; Calculate size of first block up to first regular boundary of dest
+ mov edx, edi
+ neg edx
+ and edx, 0FH
+ jz B300 ; Skip if dest aligned by 16
+
+ ; rdx = size of first partial block, 1 - 15 bytes
+ add rsi, rdx
+ add rdi, rdx
+ sub rcx, rdx
+ neg rdx
+ cmp edx, -8
+ jg B200
+ ; move 8 bytes
+ mov rax, [rsi+rdx]
+ mov [rdi+rdx], rax
+ add rdx, 8
+B200: cmp edx, -4
+ jg B210
+ ; move 4 bytes
+ mov eax, [rsi+rdx]
+ mov [rdi+rdx], eax
+ add rdx, 4
+ jz B300 ; early out if aligned by 4
+B210: cmp edx, -2
+ jg B220
+ ; move 2 bytes
+ movzx eax, word [rsi+rdx]
+ mov [rdi+rdx], ax
+ add rdx, 2
+B220: cmp edx, -1
+ jg B300
+ ; move 1 byte
+ movzx eax, byte [rsi+rdx]
+ mov [rdi+rdx], al
+
+B300: ; Now dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of src modulo 16 at this point:
+ mov eax, esi
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count (lower 32 bits)
+ and rcx, -20H ; Round down count to nearest multiple of 32
+ add rsi, rcx ; Point to the end
+ add rdi, rcx ; Point to the end
+ sub edx, ecx ; Remaining data after loop (0-31)
+ sub rsi, rax ; Nearest preceding aligned block of src
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B400 ; Use non-temporal store if count > CacheBypassLimit
+ neg rcx ; Negative index from the end
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchSSE2]
+ jmp near [r8+rax*8]
+
+B400: neg rcx
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [AlignmentDispatchNT]
+ jmp near [r8+rax*8]
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Macros and alignment jump tables
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for each src alignment, SSE2 instruction set:
+; Make separate code for each alignment u because the shift instructions
+; have the shift count as a constant:
+
+%MACRO MOVE_UNALIGNED_SSE2 2 ; u, nt
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; %2 = 1 if non-temporal store desired
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
+ movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movdqa xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movdqa xmm2, [rsi+rcx+20H]
+ movdqa xmm3, xmm1 ; Copy because used twice
+ psrldq xmm0, %1 ; shift right
+ pslldq xmm1, 16-%1 ; shift left
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx], xmm0 ; non-temporal save
+ %ENDIF
+ movdqa xmm0, xmm2 ; Save for next iteration
+ psrldq xmm3, %1 ; shift right
+ pslldq xmm2, 16-%1 ; shift left
+ por xmm3, xmm2 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx+10H], xmm3 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx+10H], xmm3 ; non-temporal save
+ %ENDIF
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movdqa xmm1, [rsi+rdx+10H]
+ psrldq xmm0, %1 ; shift right
+ pslldq xmm1, 16-%1 ; shift left
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rdx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rdx], xmm0 ; non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_UNALIGNED_SSE2_4 1 ; nt
+; Special case for u = 4
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movss xmm0, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
+ shufps xmm0, xmm0, 00111001B ; Rotate
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx+20H]
+ movss xmm1, xmm0
+ shufps xmm1, xmm1, 00111001B
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
+ %ENDIF
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
+ movss xmm0, xmm1
+ shufps xmm0, xmm0, 00111001B
+ %IF %1 == 0
+ movaps [rdi+rdx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rdx], xmm0 ; Non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_UNALIGNED_SSE2_8 1 ; nt
+; Special case for u = 8
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movsd xmm0, xmm1 ; Moves 8 bytes, leaves remaining bytes unchanged
+ shufps xmm0, xmm0, 01001110B ; Rotate
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx+20H]
+ movsd xmm1, xmm0
+ shufps xmm1, xmm1, 01001110B
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
+ %ENDIF
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
+ movsd xmm0, xmm1
+ shufps xmm0, xmm0, 01001110B
+ %IF %1 == 0
+ movaps [rdi+rdx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rdx], xmm0 ; Non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_UNALIGNED_SSE2_12 1 ; nt
+; Special case for u = 12
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+ shufps xmm0, xmm0, 10010011B
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movaps xmm2, [rsi+rcx+20H]
+ shufps xmm1, xmm1, 10010011B
+ shufps xmm2, xmm2, 10010011B
+ movaps xmm3, xmm2
+ movss xmm2, xmm1 ; Moves 4 bytes, leaves remaining bytes unchanged
+ movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm1 ; Save aligned
+ movaps [rdi+rcx+10H], xmm2 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm1 ; Non-temporal save
+ movntps [rdi+rcx+10H], xmm2 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, xmm3 ; Save for next iteration
+ add rcx, 20H ; Loop through negative values up to zero
+ jnz %%L1
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movaps xmm1, [rsi+rdx+10H] ; Read next two blocks aligned
+ shufps xmm1, xmm1, 10010011B
+ movss xmm1, xmm0 ; Moves 4 bytes, leaves remaining bytes unchanged
+ %IF %1 == 0
+ movaps [rdi+rdx], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rdx], xmm1 ; Non-temporal save
+ %ENDIF
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+; Macros for each src alignment, Suppl.SSE3 instruction set:
+; Make separate code for each alignment u because the palignr instruction
+; has the shift count as a constant:
+
+%MACRO MOVE_UNALIGNED_SSSE3 1 ; u
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = - (count rounded down to nearest divisible by 32)
+; edx = remaining bytes to move after loop
+ movdqa xmm0, [rsi+rcx] ; Read from nearest preceding 16B boundary
+
+%%L1: ; Loop. rcx has negative index from the end, counting up to zero
+ movdqa xmm2, [rsi+rcx+10H] ; Read next two blocks
+ movdqa xmm3, [rsi+rcx+20H]
+ movdqa xmm1, xmm0 ; Save xmm0
+ movdqa xmm0, xmm3 ; Save for next iteration
+ palignr xmm3, xmm2, %1 ; Combine parts into aligned block
+ palignr xmm2, xmm1, %1 ; Combine parts into aligned block
+ movdqa [rdi+rcx], xmm2 ; Save aligned
+ movdqa [rdi+rcx+10H], xmm3 ; Save aligned
+ add rcx, 20H
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ cmp edx, -10H
+ jg %%L2
+ ; One more 16-bytes block to move
+ movdqa xmm2, [rsi+rdx+10H]
+ palignr xmm2, xmm0, %1
+ movdqa [rdi+rdx], xmm2
+ add rdx, 10H
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes
+ jmp C200
+%ENDMACRO
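+; With Suppl-SSE3 the whole shift/or sequence collapses into one palignr, which
+; concatenates two aligned blocks and extracts the 16 bytes starting at byte
+; offset u; in C-like terms (u = %1, lo/hi are illustrative names only):
+;     dest_block = low_128_bits( (hi:lo) >> (8*u) )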
+
+
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSE2 below
+; (alignments and fillers are inserted manually to minimize the number
+; of 16-bytes boundaries inside loops)
+
+align 16
+D104: MOVE_UNALIGNED_SSE2_4 0
+times 4 nop
+D108: MOVE_UNALIGNED_SSE2_8 0
+times 4 nop
+D10C: MOVE_UNALIGNED_SSE2_12 0
+times 1 nop
+D101: MOVE_UNALIGNED_SSE2 1, 0
+D102: MOVE_UNALIGNED_SSE2 2, 0
+D103: MOVE_UNALIGNED_SSE2 3, 0
+D105: MOVE_UNALIGNED_SSE2 5, 0
+D106: MOVE_UNALIGNED_SSE2 6, 0
+D107: MOVE_UNALIGNED_SSE2 7, 0
+D109: MOVE_UNALIGNED_SSE2 9, 0
+times 1 nop
+D10A: MOVE_UNALIGNED_SSE2 0AH, 0
+D10B: MOVE_UNALIGNED_SSE2 0BH, 0
+D10D: MOVE_UNALIGNED_SSE2 0DH, 0
+D10E: MOVE_UNALIGNED_SSE2 0EH, 0
+D10F: MOVE_UNALIGNED_SSE2 0FH, 0
+
+; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
+; These are pointed to by the jump table AlignmentDispatchSSSE3 below
+
+align 16
+E104: MOVE_UNALIGNED_SSSE3 4
+E108: MOVE_UNALIGNED_SSSE3 8
+E10C: MOVE_UNALIGNED_SSSE3 0CH
+E101: MOVE_UNALIGNED_SSSE3 1
+E102: MOVE_UNALIGNED_SSSE3 2
+E103: MOVE_UNALIGNED_SSSE3 3
+E105: MOVE_UNALIGNED_SSSE3 5
+E106: MOVE_UNALIGNED_SSSE3 6
+E107: MOVE_UNALIGNED_SSSE3 7
+E109: MOVE_UNALIGNED_SSSE3 9
+times 1 nop
+E10A: MOVE_UNALIGNED_SSSE3 0AH
+E10B: MOVE_UNALIGNED_SSSE3 0BH
+E10D: MOVE_UNALIGNED_SSSE3 0DH
+E10E: MOVE_UNALIGNED_SSSE3 0EH
+E10F: MOVE_UNALIGNED_SSSE3 0FH
+
+; Codes for non-temporal move. Aligned case first
+
+align 16
+F100: ; Non-temporal move, src and dest have same alignment.
+ ; Loop. rcx has negative index from the end, counting up to zero
+ movaps xmm0, [rsi+rcx] ; Read
+ movaps xmm1, [rsi+rcx+10H]
+ movntps [rdi+rcx], xmm0 ; Write non-temporal (bypass cache)
+ movntps [rdi+rcx+10H], xmm1
+ add rcx, 20H
+ jnz F100 ; Loop through negative rcx up to zero
+
+ ; Move the remaining edx bytes (0 - 31):
+ add rsi, rdx
+ add rdi, rdx
+ neg rdx
+ jz C500 ; Skip if no more data
+ ; Check if we can move one more 16-bytes block
+ cmp edx, -10H
+ jg C200
+ ; move 16 bytes, aligned
+ movaps xmm0, [rsi+rdx]
+ movntps [rdi+rdx], xmm0
+ add rdx, 10H
+ ; move the remaining 0 - 15 bytes
+ jmp C200
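+
+; movntps/movntdq store around the cache, so very large copies do not evict the
+; working set. The destination is 16-bytes aligned in all non-temporal paths, as
+; these instructions require; a C-level equivalent of one such store would be
+; the _mm_stream_ps intrinsic (mentioned for illustration only).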
+
+; Make 15 instances of MOVE_UNALIGNED_SSE2 macro for each value of
+; the alignment u.
+; These are pointed to by the jump table AlignmentDispatchNT below
+
+;align 16
+F104: MOVE_UNALIGNED_SSE2_4 1
+F108: MOVE_UNALIGNED_SSE2_8 1
+F10C: MOVE_UNALIGNED_SSE2_12 1
+F101: MOVE_UNALIGNED_SSE2 1, 1
+F102: MOVE_UNALIGNED_SSE2 2, 1
+F103: MOVE_UNALIGNED_SSE2 3, 1
+F105: MOVE_UNALIGNED_SSE2 5, 1
+F106: MOVE_UNALIGNED_SSE2 6, 1
+F107: MOVE_UNALIGNED_SSE2 7, 1
+F109: MOVE_UNALIGNED_SSE2 9, 1
+F10A: MOVE_UNALIGNED_SSE2 0AH, 1
+F10B: MOVE_UNALIGNED_SSE2 0BH, 1
+F10D: MOVE_UNALIGNED_SSE2 0DH, 1
+F10E: MOVE_UNALIGNED_SSE2 0EH, 1
+F10F: MOVE_UNALIGNED_SSE2 0FH, 1
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; CPU dispatcher
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+memcpyCPUDispatch: ; CPU dispatcher, check for instruction sets and which method is fastest
+ ; This part is executed only once
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ push rdi
+ push r8
+ ; set CacheBypassLimit to half the size of the largest level cache
+ call GetMemcpyCacheLimit@
+ mov eax, 1
+ cpuid ; Get feature flags
+ lea rbx, [memcpySSE2@]
+ bt ecx, 9 ; Test bit for SupplSSE3
+ jnc Q100
+ lea rbx, [memcpySSSE3@]
+ call UnalignedIsFaster ; Test if unaligned read is faster than aligned read and shift
+ test eax, eax
+ jz Q100
+ lea rbx, [memcpyU@]
+ call Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
+ test eax, eax
+ jz Q100
+ lea rbx, [memcpyU256@]
+Q100:
+ ; Insert appropriate pointer
+ mov [memcpyDispatch], rbx
+ mov rax, rbx
+ pop r8
+ pop rdi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ ; Jump according to the replaced function pointer
+ jmp rax
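+
+; The pattern above is a lazily initialized function pointer; roughly, in C
+; (all names illustrative only):
+;
+;     static void * dispatcher(void *d, const void *s, size_t n);
+;     static void * (*memcpy_ptr)(void *, const void *, size_t) = dispatcher;
+;     static void * dispatcher(void *d, const void *s, size_t n) {
+;         memcpy_ptr = choose_best_version();   // CPUID tests, done once
+;         return memcpy_ptr(d, s, n);           // continue in the chosen version
+;     }
+;
+; After the first call, memcpyDispatch points directly at the selected version,
+; so the CPUID tests are never repeated.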
+
+; extern "C" size_t GetMemcpyCacheLimit();
+GetMemcpyCacheLimit:
+GetMemcpyCacheLimit@: ; local limit
+ mov rax, [CacheBypassLimit]
+ test rax, rax
+ jnz U200
+ ; Get half the size of the largest level cache
+%ifdef WINDOWS
+ xor ecx, ecx ; 0 means largest level cache
+%else
+ xor edi, edi ; 0 means largest level cache
+%endif
+ call DataCacheSize ; get cache size
+ shr rax, 1 ; half the size
+ jnz U100
+ mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
+U100: mov [CacheBypassLimit], rax
+U200: ret
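+
+; In effect CacheBypassLimit is computed lazily on first use as half the size of
+; the largest cache level reported by DataCacheSize (or 4 Mbytes if the size
+; cannot be determined), and the result is kept in the data segment so that
+; DataCacheSize is called at most once.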
+
+; Note: SetMemcpyCacheLimit is defined in memmove64.asm, calling SetMemcpyCacheLimit1
+SetMemcpyCacheLimit1:
+%ifdef WINDOWS
+ mov rax, rcx
+%else
+ mov rax, rdi
+%endif
+ test rax, rax
+ jnz U400
+ ; zero, means default
+ mov [CacheBypassLimit], rax
+ call GetMemcpyCacheLimit@
+U400: mov [CacheBypassLimit], rax
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; getDispatch, for testing only
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+getDispatch:
+mov rax,[memcpyDispatch]
+ret
+
+global getDispatch
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; data section. jump tables, dispatch function pointer, cache size
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Data segment must be included in function namespace
+SECTION .data
+align 16
+
+; Jump tables for alignments 0 - 15:
+; The CPU dispatcher replaces AlignmentDispatch with
+; AlignmentDispatchSSE2 or AlignmentDispatchSSSE3 if Suppl-SSE3
+; is supported.
+
+; Code pointer for each alignment for SSE2 instruction set
+AlignmentDispatchSSE2:
+DQ C100, D101, D102, D103, D104, D105, D106, D107
+DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
+
+; Code pointer for each alignment for Suppl-SSE3 instruction set
+AlignmentDispatchSSSE3:
+DQ C100, E101, E102, E103, E104, E105, E106, E107
+DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
+
+; Code pointer for each alignment for non-temporal store
+AlignmentDispatchNT:
+DQ F100, F101, F102, F103, F104, F105, F106, F107
+DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
+
+; Pointer to appropriate version.
+; This initially points to memcpyCPUDispatch. memcpyCPUDispatch will
+; change this to the appropriate version of memcpy, so that
+; memcpyCPUDispatch is only executed once:
+memcpyDispatch DQ memcpyCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > CacheBypassLimit
+; The optimal value of CacheBypassLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache:
+CacheBypassLimit: DQ 0
diff --git a/asmlibSrc/memmove32.asm b/asmlibSrc/memmove32.asm
new file mode 100755
index 0000000..9113e3d
--- /dev/null
+++ b/asmlibSrc/memmove32.asm
@@ -0,0 +1,1238 @@
+;************************* memmove32.asm ***********************************
+; Author: Agner Fog
+; Date created: 2008-07-18
+; Last modified: 2013-09-11
+; Description:
+; Faster version of the standard memmove function:
+; void * A_memmove(void *dest, const void *src, size_t count);
+; Moves 'count' bytes from 'src' to 'dest'. src and dest may overlap.
+;
+; Overriding standard function memmove:
+; The alias ?OVR_memmove is changed to _memmove in the object file if
+; it is desired to override the standard library function memmove.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for different CPUs
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global _A_memmove: function ; Function A_memmove
+global ?OVR_memmove: function ; ?OVR removed if standard function memmove overridden
+
+; Direct entries to CPU-specific versions
+global _memmove386: function ; Version for processors without SSE2
+global _memmoveSSE2: function ; Version for processors with SSE2
+global _memmoveSSSE3: function ; Version for processors with SSSE3
+global _memmoveU: function ; Version for processors with fast unaligned read
+global _memmoveU256: function ; Version for processors with fast 256-bit read/write
+global _SetMemcpyCacheLimit ; Change limit for bypassing cache
+
+; Imported from memcpy32.asm:
+extern _A_memcpy ; function entry
+extern _memcpy386 ; CPU specific function entry
+extern _memcpySSE2 ; CPU specific function entry
+extern _memcpySSSE3 ; CPU specific function entry
+extern _memcpyU ; CPU specific function entry
+extern _memcpyU256 ; CPU specific function entry
+
+; Imported from instrset32.asm
+extern _InstructionSet ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster32.asm:
+extern _UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
+extern _Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Imported from memcpy32.asm
+extern _GetMemcpyCacheLimit ; Get the size limit for bypassing cache when copying with memcpy and memmove
+extern _SetMemcpyCacheLimit1 ; Set the size limit for bypassing cache when copying with memcpy
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Prolog macro. Determine if we should move forwards or backwards
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Define prolog for this function
+; Parameter 1 is forward function label
+%MACRO PROLOGM 1
+ ; Check if dest overlaps src
+ mov eax, [esp+4] ; dest
+ sub eax, [esp+8] ; src
+ cmp eax, [esp+12] ; count
+ ; We can avoid testing for dest < src by using unsigned compare:
+ ; (Assume that the memory block cannot span across address 0)
+ ; Must move backwards if unsigned(dest-src) < count
+ jae %1 ; Jump to memcpy if we can move forwards
+
+ push esi
+ push edi
+ mov edi, [esp+12] ; dest
+ mov esi, [esp+16] ; src
+ mov ecx, [esp+20] ; count
+%IFDEF POSITIONINDEPENDENT
+ push ebx
+ mov ebx, edx ; pointer to reference point RP
+%ENDIF
+
+%ENDM
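+; The single unsigned compare above covers both safe cases; in C-like terms
+; (pointers treated as unsigned integers, illustrative only):
+;     if ((size_t)(dest - src) >= count)  jump to the forward memcpy version;
+;     else                                copy backwards below;
+; dest - src wraps to a huge unsigned value when dest < src, so only the case
+; where dest lies inside [src, src+count) takes the backward path.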
+
+
+; Define return from this function
+%MACRO RETURNM 0
+%IFDEF POSITIONINDEPENDENT
+ pop ebx
+%ENDIF
+ pop edi
+ pop esi
+ mov eax, [esp+4] ; Return value = dest
+ ret
+%ENDMACRO
+
+
+SECTION .text align=16
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Common entry for dispatch
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; extern "C" void * A_memmove(void * dest, const void * src, size_t count);
+; Function entry:
+_A_memmove:
+?OVR_memmove:
+
+%IFNDEF POSITIONINDEPENDENT
+ jmp dword [memmoveDispatch] ; Go to appropriate version, depending on instruction set
+RP equ 0 ; RP = 0 if not position-independent
+
+%ELSE ; Position-independent code
+
+ call get_thunk_edx ; get reference point for position-independent code
+RP: ; reference point edx = offset RP
+
+; Make the following instruction with address relative to RP:
+ jmp dword [edx+memmoveDispatch-RP]
+
+%ENDIF
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; AVX Version for processors with fast unaligned read and fast 32 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_memmoveU256: ; Version for processors with fast 256-bit read/write
+%IFDEF POSITIONINDEPENDENT
+ call get_thunk_edx
+ add edx, RP-$
+%ENDIF
+memmoveU256@:
+ PROLOGM _memcpyU256
+
+ cmp ecx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [edi+ecx] ; end of dest
+ and edx, 1FH
+ jz B4300 ; Skip if end of dest aligned by 32
+
+ ; edx = size of last partial block, 1 - 31 bytes
+ test dl, 3
+ jz B4210
+ test dl, 1
+ jz B4201 ; B4200 if we haven't tested edx,3
+ ; move 1 byte
+ dec ecx
+ movzx eax, byte [esi+ecx]
+ mov [edi+ecx], al
+B4200: test dl, 2
+ jz B4210
+B4201: ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [esi+ecx]
+ mov [edi+ecx], ax
+B4210: test dl, 4
+ jz B4220
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [esi+ecx]
+ mov [edi+ecx], eax
+B4220: test dl, 8
+ jz B4230
+ ; move 8 bytes
+ sub ecx, 8
+ movq xmm0, qword [esi+ecx]
+ movq qword [edi+ecx], xmm0
+B4230: test dl, 16
+ jz B4300
+ ; move 16 bytes
+ sub ecx, 16
+ movups xmm0, [esi+ecx]
+ movaps [edi+ecx], xmm0
+
+B4300: ; Now end of dest is aligned by 32. Any partial block has been moved
+ mov edx, ecx
+ and ecx, 1FH ; remaining size after 32 bytes blocks moved
+ and edx, -20H ; number of 32 bytes blocks
+ jz H4100
+ add esi, ecx
+ add edi, ecx
+
+ ; Check if count very big
+%IFNDEF POSITIONINDEPENDENT
+ cmp edx, [_CacheBypassLimit]
+%ELSE
+ cmp edx, [ebx-RP+_CacheBypassLimit]
+%ENDIF
+ ja H4800 ; Use non-temporal store if count > _CacheBypassLimit
+
+align 16
+H4000: ; 32 bytes move loop
+ vmovups ymm0, [esi+edx-20H]
+ vmovaps [edi+edx-20H], ymm0
+ sub edx, 20H
+ jnz H4000
+ vzeroupper
+
+H4090: sub esi, ecx
+ sub edi, ecx
+
+H4100: ; remaining 0-31 bytes
+ test ecx, ecx
+ jz H4600
+ test cl, 10H
+ jz H4200
+ ; move 16 bytes
+ sub ecx, 10H
+ movups xmm0, [esi+ecx]
+ movaps [edi+ecx], xmm0
+ jz H4600 ; early out if count divisible by 16
+H4200: test cl, 8
+ jz H4300
+ ; move 8 bytes
+ sub ecx, 8
+ movq xmm0, qword [esi+ecx]
+ movq qword [edi+ecx], xmm0
+H4300: test cl, 4
+ jz H4400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [esi+ecx]
+ mov [edi+ecx], eax
+ jz H4600 ; early out if count divisible by 4
+H4400: test cl, 2
+ jz H4500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [esi+ecx]
+ mov [edi+ecx], ax
+H4500: test cl, 1
+ jz H4600
+ ; move 1 byte
+ movzx eax, byte [esi] ; ecx-1 = 0
+ mov [edi], al
+H4600: ; finished
+ RETURNM
+
+align 16
+H4800: ; 32 bytes move loop, bypass cache
+ vmovups ymm0, [esi+edx-20H]
+ vmovntps [edi+edx-20H], ymm0
+ sub edx, 20H
+ jnz H4800
+ vzeroupper
+ jmp H4090
+
+
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ ; multiple CPU versions, SSSE3 and later
+A1000: test cl, 20H
+ jz A1100
+ ; move 32 bytes
+ ; movups is faster than 64-bit moves on processors with SSSE3
+ sub ecx, 20H
+ movups xmm0, [esi+ecx+10H]
+ movups xmm1, [esi+ecx]
+ movups [edi+ecx+10H], xmm0
+ movups [edi+ecx], xmm1
+A1100: test cl, 10H
+ jz A1200
+ ; move 16 bytes
+ sub ecx, 10H
+ movups xmm0, [esi+ecx]
+ movups [edi+ecx], xmm0
+A1200: test cl, 8
+ jz A1300
+ ; move 8 bytes
+ sub ecx, 8
+ movq xmm0, qword [esi+ecx]
+ movq qword [edi+ecx], xmm0
+A1300: test cl, 4
+ jz A1400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [esi+ecx]
+ mov [edi+ecx], eax
+ jz A1900 ; early out if count divisible by 4
+A1400: test cl, 2
+ jz A1500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [esi+ecx]
+ mov [edi+ecx], ax
+A1500: test cl, 1
+ jz A1900
+ ; move 1 byte
+ movzx eax, byte [esi] ; ecx-1 = 0
+ mov [edi], al
+A1900: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with fast unaligned read and fast 16 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_memmoveU: ; Version for processors with fast unaligned read
+%IFDEF POSITIONINDEPENDENT
+ call get_thunk_edx
+ add edx, RP-$
+%ENDIF
+memmoveU@:
+ PROLOGM _memcpyU
+
+ cmp ecx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [edi+ecx] ; end of dest
+ and edx, 0FH
+ jz B3300 ; Skip if end of dest aligned by 16
+
+ ; edx = size of last partial block, 1 - 15 bytes
+ test dl, 3
+ jz B3210
+ test dl, 1
+ jz B3201 ; B3200 if we haven't tested edx,3
+ ; move 1 byte
+ dec ecx
+ movzx eax, byte [esi+ecx]
+ mov [edi+ecx], al
+B3200: test dl, 2
+ jz B3210
+B3201: ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [esi+ecx]
+ mov [edi+ecx], ax
+B3210: test dl, 4
+ jz B3220
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [esi+ecx]
+ mov [edi+ecx], eax
+B3220: test dl, 8
+ jz B3300
+ ; move 8 bytes
+ sub ecx, 8
+ movq xmm0, qword [esi+ecx]
+ movq qword [edi+ecx], xmm0
+
+B3300: ; Now end of dest is aligned by 16. Any partial block has been moved
+ mov edx, ecx
+ and ecx, 1FH ; remaining size after 32 bytes blocks moved
+ and edx, -20H ; number of 32 bytes blocks
+ jz H1100
+ add esi, ecx
+ add edi, ecx
+
+ ; Check if count very big
+%IFNDEF POSITIONINDEPENDENT
+ cmp edx, [_CacheBypassLimit]
+%ELSE
+ cmp edx, [ebx+_CacheBypassLimit-RP]
+%ENDIF
+ ja H1800 ; Use non-temporal store if count > _CacheBypassLimit
+
+align 16
+H1000: ; 32 bytes move loop
+ movups xmm1, [esi+edx-20H]
+ movups xmm0, [esi+edx-10H]
+ movaps [edi+edx-20H], xmm1
+ movaps [edi+edx-10H], xmm0
+ sub edx, 20H
+ jnz H1000
+
+H1090: sub esi, ecx
+ sub edi, ecx
+
+H1100: ; remaining 0-31 bytes
+ test ecx, ecx
+ jz H1600
+ test cl, 10H
+ jz H1200
+ ; move 16 bytes
+ sub ecx, 10H
+ movups xmm0, [esi+ecx]
+ movaps [edi+ecx], xmm0
+ jz H1600 ; early out if count divisible by 16
+H1200: test cl, 8
+ jz H1300
+ ; move 8 bytes
+ sub ecx, 8
+ movq xmm0, qword [esi+ecx]
+ movq qword [edi+ecx], xmm0
+H1300: test cl, 4
+ jz H1400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [esi+ecx]
+ mov [edi+ecx], eax
+ jz H1600 ; early out if count divisible by 4
+H1400: test cl, 2
+ jz H1500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [esi+ecx]
+ mov [edi+ecx], ax
+H1500: test cl, 1
+ jz H1600
+ ; move 1 byte
+ movzx eax, byte [esi] ; ecx-1 = 0
+ mov [edi], al
+H1600: ; finished
+ RETURNM
+
+align 16
+H1800: ; 32 bytes move loop, bypass cache
+ movups xmm1, [esi+edx-20H]
+ movups xmm0, [esi+edx-10H]
+ movntps [edi+edx-20H], xmm1
+ movntps [edi+edx-10H], xmm0
+ sub edx, 20H
+ jnz H1800
+ jmp H1090
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSSE3. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_memmoveSSSE3: ; SSSE3 version begins here
+%IFDEF POSITIONINDEPENDENT
+ call get_thunk_edx
+ add edx, RP-$
+%ENDIF
+memmoveSSSE3@:
+ PROLOGM _memcpySSSE3
+
+ cmp ecx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [edi+ecx] ; end of dest
+ and edx, 0FH
+ jz B1300 ; Skip if end of dest aligned by 16
+
+ ; edx = size of last partial block, 1 - 15 bytes
+ test dl, 3
+ jz B1210
+ test dl, 1
+ jz B1201 ; B1200 if we haven't tested edx,3
+ ; move 1 byte
+ dec ecx
+ movzx eax, byte [esi+ecx]
+ mov [edi+ecx], al
+B1200: test dl, 2
+ jz B1210
+B1201: ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [esi+ecx]
+ mov [edi+ecx], ax
+B1210: test dl, 4
+ jz B1220
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [esi+ecx]
+ mov [edi+ecx], eax
+B1220: test dl, 8
+ jz B1300
+ ; move 8 bytes
+ sub ecx, 8
+ movq xmm0, qword [esi+ecx]
+ movq qword [edi+ecx], xmm0
+
+B1300: ; Now end of dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of end of src modulo 16 at this point:
+ lea eax, [esi+ecx]
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count
+ and ecx, -20H ; Round down to nearest multiple of 32
+ sub edx, ecx ; Remaining data after loop
+ sub esi, eax ; Nearest preceding aligned block of src
+ ; Add the same to esi and edi as we have subtracted from ecx
+ add esi, edx
+ add edi, edx
+
+%IFNDEF POSITIONINDEPENDENT
+ ; Check if count very big
+ cmp ecx, [_CacheBypassLimit]
+ ja B1400 ; Use non-temporal store if count > _CacheBypassLimit
+
+ ; Dispatch to different codes depending on src alignment
+ jmp [MAlignmentDispatchSSSE3+eax*4]
+
+B1400: ; Dispatch to different codes depending on src alignment
+ jmp [MAlignmentDispatchNT+eax*4]
+
+%ELSE ; Position-independent code
+
+ ; Check if count very big
+ ; Make the following instruction with address relative to RP:
+ cmp ecx, [ebx-RP+_CacheBypassLimit]
+ ja B1400 ; Use non-temporal store if count > _CacheBypassLimit
+
+ ; Dispatch to different codes depending on src alignment
+ ; MAlignmentDispatch table contains addresses relative to RP
+ ; Add table entry to ebx=RP to get jump address.
+
+ ; Make the following instruction with address relative to RP:
+ add ebx,[ebx-RP+MAlignmentDispatchSSSE3+eax*4]
+ jmp ebx
+
+B1400: ; Same with MAlignmentDispatchNT:
+ add ebx,[ebx-RP+MAlignmentDispatchNT+eax*4]
+ jmp ebx
+%ENDIF
+
+
+align 16
+C100: ; Code for aligned src. SSE2 or later instruction set
+ ; The nice case, src and dest have same alignment.
+
+ ; Loop. ecx has positive index from the beginning, counting down to zero
+ movaps xmm0, [esi+ecx-10H]
+ movaps xmm1, [esi+ecx-20H]
+ movaps [edi+ecx-10H], xmm0
+ movaps [edi+ecx-20H], xmm1
+ sub ecx, 20H
+ jnz C100
+
+ ; Move the remaining edx bytes (0 - 31):
+ ; move 16-8-4-2-1 bytes, aligned
+ test edx, edx
+ jz C500 ; Early out if no more data
+ test dl, 10H
+ jz C200
+ ; move 16 bytes
+ sub ecx, 10H
+ movaps xmm0, [esi+ecx]
+ movaps [edi+ecx], xmm0
+
+C200: ; Other branches come in here, ecx may contain arbitrary offset
+ test edx, edx
+ jz C500 ; Early out if no more data
+ test dl, 8
+ jz C210
+ ; move 8 bytes
+ sub ecx, 8
+ movq xmm0, qword [esi+ecx]
+ movq qword [edi+ecx], xmm0
+C210: test dl, 4
+ jz C220
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [esi+ecx]
+ mov [edi+ecx], eax
+ jz C500 ; Early out if count divisible by 4
+C220: test dl, 2
+ jz C230
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [esi+ecx]
+ mov [edi+ecx], ax
+C230: test dl, 1
+ jz C500
+ ; move 1 byte
+ movzx eax, byte [esi+ecx-1] ; ecx-1 not always 0
+ mov [edi+ecx-1], al
+C500: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSE2. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+_memmoveSSE2: ; SSE2 version begins here
+%IFDEF POSITIONINDEPENDENT
+ call get_thunk_edx
+ add edx, RP-$
+%ENDIF
+memmoveSSE2@:
+ PROLOGM _memcpySSE2
+
+ cmp ecx, 40H
+ jae B0100 ; Jump if count >= 64, else use simpler code below
+
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ test cl, 20H
+ jz A100
+ ; move 32 bytes
+ ; movq is faster than movdqu on Intel Pentium M and Core 1
+ ; movdqu is faster on later processors
+ sub ecx, 20H
+ movq xmm0, qword [esi+ecx+18H]
+ movq xmm1, qword [esi+ecx+10H]
+ movq xmm2, qword [esi+ecx+8]
+ movq xmm3, qword [esi+ecx]
+ movq qword [edi+ecx+18H], xmm0
+ movq qword [edi+ecx+10H], xmm1
+ movq qword [edi+ecx+8], xmm2
+ movq qword [edi+ecx], xmm3
+A100: test cl, 10H
+ jz A200
+ ; move 16 bytes
+ sub ecx, 10H
+ movq xmm0, qword [esi+ecx+8]
+ movq xmm1, qword [esi+ecx]
+ movq qword [edi+ecx+8], xmm0
+ movq qword [edi+ecx], xmm1
+A200: test cl, 8
+ jz A300
+ ; move 8 bytes
+ sub ecx, 8
+ movq xmm0, qword [esi+ecx]
+ movq qword [edi+ecx], xmm0
+A300: test cl, 4
+ jz A400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [esi+ecx]
+ mov [edi+ecx], eax
+ jz A900 ; early out if count divisible by 4
+A400: test cl, 2
+ jz A500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [esi+ecx]
+ mov [edi+ecx], ax
+A500: test cl, 1
+ jz A900
+ ; move 1 byte
+ movzx eax, byte [esi] ; ecx-1 = 0
+ mov [edi], al
+A900: ; finished
+ RETURNM
+
+B0100: ; count >= 64
+ ; This part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [edi+ecx] ; end of dest
+ and edx, 0FH
+ jz B0300 ; Skip if end of dest aligned by 16
+
+ ; edx = size of last partial block, 1 - 15 bytes
+ test dl, 3
+ jz B0210
+ test dl, 1
+ jz B0201 ; B0200 if we haven't tested edx,3
+ ; move 1 byte
+ dec ecx
+ movzx eax, byte [esi+ecx]
+ mov [edi+ecx], al
+B0200: test dl, 2
+ jz B0210
+B0201: ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [esi+ecx]
+ mov [edi+ecx], ax
+B0210: test dl, 4
+ jz B0220
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [esi+ecx]
+ mov [edi+ecx], eax
+B0220: test dl, 8
+ jz B0300
+ ; move 8 bytes
+ sub ecx, 8
+ movq xmm0, qword [esi+ecx]
+ movq qword [edi+ecx], xmm0
+
+B0300: ; Now end of dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of end of src modulo 16 at this point:
+ lea eax, [esi+ecx]
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count
+ and ecx, -20H ; Round down to nearest multiple of 32
+ sub edx, ecx ; Remaining data after loop
+ sub esi, eax ; Nearest preceding aligned block of src
+ ; Add the same to esi and edi as we have subtracted from ecx
+ add esi, edx
+ add edi, edx
+
+%IFNDEF POSITIONINDEPENDENT
+ ; Check if count very big
+ cmp ecx, [_CacheBypassLimit]
+ ja B0400 ; Use non-temporal store if count > _CacheBypassLimit
+
+ ; Dispatch to different codes depending on src alignment
+ jmp [MAlignmentDispatchSSE2+eax*4]
+
+B0400: ; Dispatch to different codes depending on src alignment
+ jmp [MAlignmentDispatchNT+eax*4]
+
+%ELSE ; Position-independent code
+
+ ; Check if count very big
+ ; Make the following instruction with address relative to RP:
+ cmp ecx, [ebx-RP+_CacheBypassLimit]
+ ja B0400 ; Use non-temporal store if count > _CacheBypassLimit
+
+ ; Dispatch to different codes depending on src alignment
+ ; MAlignmentDispatch table contains addresses relative to RP
+ ; Add table entry to ebx=RP to get jump address.
+
+ ; Make the following instruction with address relative to RP:
+ add ebx,[ebx-RP+MAlignmentDispatchSSE2+eax*4]
+ jmp ebx
+
+B0400: ; Same with MAlignmentDispatchNT:
+ add ebx,[ebx-RP+MAlignmentDispatchNT+eax*4]
+ jmp ebx
+%ENDIF
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Macros and alignment jump tables
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for each src alignment, SSE2 instruction set:
+; Make separate code for each alignment u because the shift instructions
+; have the shift count as a constant:
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2 2
+; Move ecx + edx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; %2 = 1 if non-temporal store desired
+; eax = %1
+; esi = src - %1 = nearest preceding 16-bytes boundary
+; edi = dest (aligned)
+; ecx = count rounded down to nearest divisible by 32
+; edx = remaining bytes to move after loop
+ movdqa xmm0, [esi+ecx] ; Read from nearest following 16B boundary
+%%L1: ; Loop. ecx has positive index from the beginning, counting down to zero
+ sub ecx, 20H
+ movdqa xmm1, [esi+ecx+10H] ; Read next two blocks aligned
+ movdqa xmm2, [esi+ecx]
+ movdqa xmm3, xmm1 ; Copy because used twice
+ pslldq xmm0, 16-%1 ; shift left
+ psrldq xmm1, %1 ; shift right
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [edi+ecx+10H], xmm0 ; Save aligned
+ %ELSE
+ movntdq [edi+ecx+10H], xmm0 ; Save aligned
+ %ENDIF
+ movdqa xmm0, xmm2 ; Save for next iteration
+ pslldq xmm3, 16-%1 ; shift left
+ psrldq xmm2, %1 ; shift right
+ por xmm3, xmm2 ; combine blocks
+ %IF %2 == 0
+ movdqa [edi+ecx], xmm3 ; Save aligned
+ %ELSE
+ movntdq [edi+ecx], xmm3 ; Save aligned
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub ecx, 10H
+ movdqa xmm1, [esi+ecx]
+ pslldq xmm0, 16-%1 ; shift left
+ psrldq xmm1, %1 ; shift right
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [edi+ecx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [edi+ecx], xmm0 ; Save aligned
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add esi, eax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
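+; This mirrors the forward MOVE_UNALIGNED_SSE2 macro used by A_memcpy: here the
+; copy runs from high addresses towards low ones (ecx counts down to zero), so
+; the roles of the left and right byte shifts are swapped when the two aligned
+; source blocks are combined.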
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2_4 1
+; Special case: u = 4
+ movaps xmm0, [esi+ecx] ; Read from nearest following 16B boundary
+%%L1: ; Loop. ecx has positive index from the beginning, counting down to zero
+ sub ecx, 20H
+ movaps xmm1, [esi+ecx+10H] ; Read next two blocks aligned
+ movaps xmm2, [esi+ecx]
+ movaps xmm3, xmm0
+ movaps xmm0, xmm2
+ movss xmm2, xmm1
+ shufps xmm2, xmm2, 00111001B ; Rotate right
+ movss xmm1, xmm3
+ shufps xmm1, xmm1, 00111001B ; Rotate right
+ %IF %1 == 0
+ movaps [edi+ecx+10H], xmm1 ; Save aligned
+ movaps [edi+ecx], xmm2 ; Save aligned
+ %ELSE
+ movntps [edi+ecx+10H], xmm1 ; Non-temporal save
+ movntps [edi+ecx], xmm2 ; Non-temporal save
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub ecx, 10H
+ movaps xmm1, [esi+ecx]
+ movss xmm1, xmm0
+ shufps xmm1, xmm1, 00111001B ; Rotate right
+ %IF %1 == 0
+ movaps [edi+ecx], xmm1 ; Save aligned
+ %ELSE
+ movntps [edi+ecx], xmm1 ; Non-temporal save
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add esi, eax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2_8 1
+; Special case: u = 8
+ movaps xmm0, [esi+ecx] ; Read from nearest following 16B boundary
+ shufps xmm0, xmm0, 01001110B ; Rotate
+%%L1: ; Loop. ecx has positive index from the beginning, counting down to zero
+ sub ecx, 20H
+ movaps xmm1, [esi+ecx+10H] ; Read next two blocks aligned
+ shufps xmm1, xmm1, 01001110B ; Rotate
+ movsd xmm0, xmm1
+ %IF %1 == 0
+ movaps [edi+ecx+10H], xmm0 ; Save aligned
+ %ELSE
+ movntps [edi+ecx+10H], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [esi+ecx]
+ shufps xmm0, xmm0, 01001110B ; Rotate
+ movsd xmm1, xmm0
+ %IF %1 == 0
+ movaps [edi+ecx], xmm1 ; Save aligned
+ %ELSE
+ movntps [edi+ecx], xmm1 ; Non-temporal save
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub ecx, 10H
+ movaps xmm1, [esi+ecx]
+ shufps xmm1, xmm1, 01001110B ; Rotate
+ movsd xmm0, xmm1
+ %IF %1 == 0
+ movaps [edi+ecx], xmm0 ; Save aligned
+ %ELSE
+ movntps [edi+ecx], xmm0 ; Non-temporal save
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add esi, eax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2_12 1
+; Special case: u = 12
+ movaps xmm0, [esi+ecx] ; Read from nearest following 16B boundary
+ shufps xmm0, xmm0, 10010011B ; Rotate left
+%%L1: ; Loop. ecx has positive index from the beginning, counting down to zero
+ sub ecx, 20H
+ movaps xmm1, [esi+ecx+10H] ; Read next two blocks aligned
+ shufps xmm1, xmm1, 10010011B ; Rotate left
+ movss xmm0, xmm1
+ %IF %1 == 0
+ movaps [edi+ecx+10H], xmm0 ; Save aligned
+ %ELSE
+ movntps [edi+ecx+10H], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [esi+ecx]
+ shufps xmm0, xmm0, 10010011B ; Rotate left
+ movss xmm1, xmm0
+ %IF %1 == 0
+ movaps [edi+ecx], xmm1 ; Save aligned
+ %ELSE
+ movntps [edi+ecx], xmm1 ; Non-temporal save
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub ecx, 10H
+ movaps xmm1, [esi+ecx]
+ shufps xmm1, xmm1, 10010011B ; Rotate left
+ movss xmm0, xmm1
+ %IF %1 == 0
+ movaps [edi+ecx], xmm0 ; Save aligned
+ %ELSE
+ movntps [edi+ecx], xmm0 ; Non-temporal save
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add esi, eax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+; Macros for each src alignment, Suppl.SSE3 instruction set:
+; Code for unaligned src, Suppl.SSE3 instruction set.
+; Make separate code for each alignment u because the palignr instruction
+; has the shift count as a constant:
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSSE3 1
+; Move ecx + edx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; eax = %1
+; esi = src - %1 = nearest preceding 16-bytes boundary
+; edi = dest (aligned)
+; ecx = count rounded down to nearest divisible by 32
+; edx = remaining bytes to move after loop
+ movdqa xmm0, [esi+ecx] ; Read from nearest following 16B boundary
+
+%%L1: ; Loop. ecx has positive index from the beginning, counting down to zero
+ movdqa xmm1, [esi+ecx-10H] ; Read next two blocks
+ palignr xmm0, xmm1, %1 ; Combine parts into aligned block
+ movdqa [edi+ecx-10H], xmm0 ; Save aligned
+ movdqa xmm0, [esi+ecx-20H]
+ palignr xmm1, xmm0, %1 ; Combine parts into aligned block
+ movdqa [edi+ecx-20H], xmm1 ; Save aligned
+ sub ecx, 20H
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub ecx, 10H
+ movdqa xmm1, [esi+ecx] ; Read next two blocks
+ palignr xmm0, xmm1, %1 ; Combine parts into aligned block
+ movdqa [edi+ecx], xmm0 ; Save aligned
+
+%%L2: ; Get src pointer back to misaligned state
+ add esi, eax
+ ; Move remaining 0 - 15 bytes
+ jmp C200
+%ENDMACRO
+
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchSSE2 below
+; (aligns and fillers are inserted manually to minimize the
+; number of 16-bytes boundaries inside loops)
+
+align 16
+D104: MOVE_REVERSE_UNALIGNED_SSE2_4 0
+D108: MOVE_REVERSE_UNALIGNED_SSE2_8 0
+D10C: MOVE_REVERSE_UNALIGNED_SSE2_12 0
+D101: MOVE_REVERSE_UNALIGNED_SSE2 1, 0
+D102: MOVE_REVERSE_UNALIGNED_SSE2 2, 0
+D103: MOVE_REVERSE_UNALIGNED_SSE2 3, 0
+D105: MOVE_REVERSE_UNALIGNED_SSE2 5, 0
+D106: MOVE_REVERSE_UNALIGNED_SSE2 6, 0
+D107: MOVE_REVERSE_UNALIGNED_SSE2 7, 0
+D109: MOVE_REVERSE_UNALIGNED_SSE2 9, 0
+D10A: MOVE_REVERSE_UNALIGNED_SSE2 0AH, 0
+D10B: MOVE_REVERSE_UNALIGNED_SSE2 0BH, 0
+D10D: MOVE_REVERSE_UNALIGNED_SSE2 0DH, 0
+D10E: MOVE_REVERSE_UNALIGNED_SSE2 0EH, 0
+D10F: MOVE_REVERSE_UNALIGNED_SSE2 0FH, 0
+
+; Make 15 instances of Suppl.SSE3 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchSSSE3 below
+
+align 16
+E104: MOVE_REVERSE_UNALIGNED_SSSE3 4
+E108: MOVE_REVERSE_UNALIGNED_SSSE3 8
+E10C: MOVE_REVERSE_UNALIGNED_SSSE3 0CH
+E101: MOVE_REVERSE_UNALIGNED_SSSE3 1
+E102: MOVE_REVERSE_UNALIGNED_SSSE3 2
+E103: MOVE_REVERSE_UNALIGNED_SSSE3 3
+E105: MOVE_REVERSE_UNALIGNED_SSSE3 5
+E106: MOVE_REVERSE_UNALIGNED_SSSE3 6
+E107: MOVE_REVERSE_UNALIGNED_SSSE3 7
+E109: MOVE_REVERSE_UNALIGNED_SSSE3 9
+E10A: MOVE_REVERSE_UNALIGNED_SSSE3 0AH
+E10B: MOVE_REVERSE_UNALIGNED_SSSE3 0BH
+E10D: MOVE_REVERSE_UNALIGNED_SSSE3 0DH
+E10E: MOVE_REVERSE_UNALIGNED_SSSE3 0EH
+E10F: MOVE_REVERSE_UNALIGNED_SSSE3 0FH
+
+align 16
+F100: ; Non-temporal move, src and dest have same alignment.
+ ; Loop. ecx has positive index from the beginning, counting down to zero
+ sub ecx, 20H
+ movaps xmm0, [esi+ecx+10H]
+ movaps xmm1, [esi+ecx]
+ movntps [edi+ecx+10H], xmm0
+ movntps [edi+ecx], xmm1
+ jnz F100
+
+ ; Move the remaining edx bytes (0 - 31):
+ ; move 16-8-4-2-1 bytes, aligned
+ test dl, 10H
+ jz C200
+ ; move 16 bytes
+ sub ecx, 10H
+ movaps xmm0, [esi+ecx]
+ movntps [edi+ecx], xmm0
+ ; move the remaining 0 - 15 bytes
+ jmp C200
+
+; Non-temporal move, src and dest have different alignment.
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchNT below
+
+align 16
+F104: MOVE_REVERSE_UNALIGNED_SSE2_4 1
+F108: MOVE_REVERSE_UNALIGNED_SSE2_8 1
+F10C: MOVE_REVERSE_UNALIGNED_SSE2_12 1
+F101: MOVE_REVERSE_UNALIGNED_SSE2 1, 1
+F102: MOVE_REVERSE_UNALIGNED_SSE2 2, 1
+F103: MOVE_REVERSE_UNALIGNED_SSE2 3, 1
+F105: MOVE_REVERSE_UNALIGNED_SSE2 5, 1
+F106: MOVE_REVERSE_UNALIGNED_SSE2 6, 1
+F107: MOVE_REVERSE_UNALIGNED_SSE2 7, 1
+F109: MOVE_REVERSE_UNALIGNED_SSE2 9, 1
+F10A: MOVE_REVERSE_UNALIGNED_SSE2 0AH, 1
+F10B: MOVE_REVERSE_UNALIGNED_SSE2 0BH, 1
+F10D: MOVE_REVERSE_UNALIGNED_SSE2 0DH, 1
+F10E: MOVE_REVERSE_UNALIGNED_SSE2 0EH, 1
+F10F: MOVE_REVERSE_UNALIGNED_SSE2 0FH, 1
+
+%IFDEF POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+ mov edx, [esp]
+ ret
+%ENDIF
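+; get_thunk_edx is the usual 32-bit position-independent trick: the call pushes
+; a return address, which the thunk copies into edx, giving the caller its own
+; absolute address. Data is then addressed relative to the reference point RP,
+; e.g.
+;     cmp ecx, [ebx-RP+_CacheBypassLimit]
+; where ebx holds the run-time address of RP.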
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for old processors without SSE2
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 8
+; 80386 version used when SSE2 not supported:
+_memmove386:
+memmove386@:
+ PROLOGM _memcpy386
+; edi = dest
+; esi = src
+; ecx = count
+ std ; Move backwards
+ lea edi, [edi+ecx-1] ; Point to last byte of dest
+ lea esi, [esi+ecx-1] ; Point to last byte of src
+ cmp ecx, 8
+ jb G500
+G100: test edi, 3 ; Test if unaligned
+ jz G200
+ movsb
+ dec ecx
+ jmp G100 ; Repeat while edi unaligned
+
+G200: ; edi is aligned now. Move 4 bytes at a time
+ sub edi, 3 ; Point to last dword of dest
+ sub esi, 3 ; Point to last dword of src
+ mov edx, ecx
+ shr ecx, 2
+ rep movsd ; move 4 bytes at a time
+ mov ecx, edx
+ and ecx, 3
+ add edi, 3 ; Point to last byte of dest
+ add esi, 3 ; Point to last byte of src
+ rep movsb ; move remaining 0-3 bytes
+ cld
+ RETURNM
+
+G500: ; count < 8. Move one byte at a time
+ rep movsb ; move count bytes
+ cld
+ RETURNM
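+; The 80386 fallback copies backwards with the direction flag set (std): esi and
+; edi point at the last byte (or last dword) to move, rep movsd handles the bulk
+; and rep movsb the stray bytes, and cld restores the flag before returning, as
+; the usual calling conventions require.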
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; CPU dispatcher
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CPU dispatching for memmove. This is executed only once
+memmoveCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ pushad
+ ; set _CacheBypassLimit to half the size of the largest level cache
+ push 0
+ call SetMemcpyCacheLimit@
+ pop ecx
+ call _InstructionSet
+ ; Point to generic version of memmove
+ mov esi, memmove386@
+ cmp eax, 4 ; check SSE2
+ jb Q100
+ ; SSE2 supported
+ ; Point to SSE2 version of memmove
+ mov esi, memmoveSSE2@
+ cmp eax, 6 ; check Suppl-SSE3
+ jb Q100
+ ; Suppl-SSE3 supported
+ ; Point to SSSE3 version of memmove
+ mov esi, memmoveSSSE3@
+ call _UnalignedIsFaster
+ test eax, eax
+ jz Q100
+ ; Point to unaligned version of memmove
+ mov esi, memmoveU@
+ call _Store256BitIsFaster
+ test eax, eax
+ jz Q100
+ ; Point to 256 bit move version of memmove
+ mov esi, memmoveU256@
+
+Q100: mov [memmoveDispatch], esi
+ popad
+ ; Continue in appropriate version of memmove
+ jmp [memmoveDispatch]
+
+%ELSE ; Position-independent version
+ pushad
+ mov ebx, edx ; reference point
+ ; set _CacheBypassLimit to half the size of the largest level cache
+ push 0
+ call SetMemcpyCacheLimit@
+ pop ecx
+ call _InstructionSet
+ ; Point to generic version of memmove
+ lea esi, [ebx+memmove386@-RP]
+ cmp eax, 4 ; check SSE2
+ jb Q100
+ ; SSE2 supported
+ ; Point to SSE2 version of memmove
+ lea esi, [ebx+memmoveSSE2@-RP]
+ cmp eax, 6 ; check Suppl-SSE3
+ jb Q100
+ ; Suppl-SSE3 supported
+ ; Point to SSSE3 version of memmove
+ lea esi, [ebx+memmoveSSSE3@-RP]
+ call _UnalignedIsFaster
+ test eax, eax
+ jz Q100
+ ; Point to unaligned version of memmove
+ lea esi, [ebx+memmoveU@-RP]
+ call _Store256BitIsFaster
+ test eax, eax
+ jz Q100
+ ; Point to 256 bit move version of memmove
+ lea esi, [ebx+memmoveU256@-RP]
+
+Q100: ; insert appropriate pointer
+ mov dword [ebx+memmoveDispatch-RP], esi
+ popad
+ ; Continue in appropriate version of memmove
+ jmp [edx+memmoveDispatch-RP]
+%ENDIF
+
+
+; Note: Must call _SetMemcpyCacheLimit1 defined in memcpy32.asm
+_SetMemcpyCacheLimit:
+SetMemcpyCacheLimit@: ; local label
+ mov eax, [esp+4]
+ push eax
+ call _SetMemcpyCacheLimit1
+ pop ecx
+%ifdef POSITIONINDEPENDENT
+ call get_thunk_edx
+ mov [edx + _CacheBypassLimit - $], eax
+%else
+ mov [_CacheBypassLimit], eax
+%endif
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; data section. jump tables, dispatch function pointer, cache size
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Data segment must be included in function namespace
+SECTION .data
+align 16
+
+; Jump tables for alignments 0 - 15:
+; The memmoveSSE2 and memmoveSSSE3 versions dispatch through their own tables,
+; MAlignmentDispatchSSE2 and MAlignmentDispatchSSSE3 (the latter requires Suppl-SSE3)
+; RP = reference point if position-independent code, otherwise RP = 0
+
+MAlignmentDispatchSSE2:
+DD C100-RP, D101-RP, D102-RP, D103-RP, D104-RP, D105-RP, D106-RP, D107-RP
+DD D108-RP, D109-RP, D10A-RP, D10B-RP, D10C-RP, D10D-RP, D10E-RP, D10F-RP
+
+MAlignmentDispatchSSSE3:
+DD C100-RP, E101-RP, E102-RP, E103-RP, E104-RP, E105-RP, E106-RP, E107-RP
+DD E108-RP, E109-RP, E10A-RP, E10B-RP, E10C-RP, E10D-RP, E10E-RP, E10F-RP
+
+MAlignmentDispatchNT:
+DD F100-RP, F101-RP, F102-RP, F103-RP, F104-RP, F105-RP, F106-RP, F107-RP
+DD F108-RP, F109-RP, F10A-RP, F10B-RP, F10C-RP, F10D-RP, F10E-RP, F10F-RP
+
+; Pointer to appropriate version.
+; This initially points to memmoveCPUDispatch. memmoveCPUDispatch will
+; change this to the appropriate version of memmove, so that
+; memmoveCPUDispatch is only executed once:
+memmoveDispatch: DD memmoveCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > _CacheBypassLimit
+; The optimal value of _CacheBypassLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache:
+_CacheBypassLimit: DD 0
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+ DD 0, 0
+%ENDIF
diff --git a/asmlibSrc/memmove64.asm b/asmlibSrc/memmove64.asm
new file mode 100755
index 0000000..a09c95a
--- /dev/null
+++ b/asmlibSrc/memmove64.asm
@@ -0,0 +1,1073 @@
+;************************* memmove64.asm ***********************************
+; Author: Agner Fog
+; Date created: 2008-07-18
+; Last modified: 2013-09-11
+; Description:
+; Faster version of the standard memmove function:
+; void * A_memmove(void *dest, const void *src, size_t count);
+; Moves 'count' bytes from 'src' to 'dest'. src and dest may overlap.
+;
+; Overriding standard function memmove:
+; The alias ?OVR_memmove is changed to _memmove in the object file if
+; it is desired to override the standard library function memmove.
+;
+; CPU dispatching included for different CPUs
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_memmove: function ; Function A_memmove
+global ?OVR_memmove: function ; ?OVR removed if standard function memmove overridden
+global memmoveSSE2: function ; Version for processors with only SSE2
+global memmoveSSSE3: function ; Version for processors with SSSE3
+global memmoveU: function ; Version for processors with fast unaligned read
+global memmoveU256: function ; Version for processors with fast 256-bit read/write
+global SetMemcpyCacheLimit ; Change limit for bypassing cache
+
+; Imported from memcpy64.asm:
+extern A_memcpy ; function entry
+extern memcpySSE2 ; CPU specific function entry
+extern memcpySSSE3 ; CPU specific function entry
+extern memcpyU ; CPU specific function entry
+extern memcpyU256 ; CPU specific function entry
+
+; Imported from instrset64.asm
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster64.asm:
+extern UnalignedIsFaster ; Tells if unaligned read is faster than PALIGNR
+extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Imported from memcpy64.asm
+extern GetMemcpyCacheLimit ; Get the size limit for bypassing cache when copying with memcpy and memmove
+extern SetMemcpyCacheLimit1 ; Set the size limit for bypassing cache when copying with memcpy
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Prolog macro. Determine if we should move forwards or backwards
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Define prolog for this function
+; Parameter 1 is forward function label
+%MACRO PROLOGM 1
+%IFDEF WINDOWS
+ ; Check if dest overlaps src
+ mov rax, rcx
+ sub rax, rdx
+ cmp rax, r8
+ ; We can avoid testing for dest < src by using unsigned compare:
+ ; (Assume that the memory block cannot span across address 0)
+ ; Must move backwards if unsigned(dest-src) < count
+ jae %1 ; Jump to memcpy if we can move forwards
+ push rsi
+ push rdi
+ mov rdi, rcx ; dest
+ mov r9, rcx ; dest
+ mov rsi, rdx ; src
+ mov rcx, r8 ; count
+%ELSE ; Unix
+ ; Check if dest overlaps src
+ mov rax, rdi
+ sub rax, rsi
+ cmp rax, rdx
+ ; Must move backwards if unsigned(dest-src) < count
+ jae %1 ; Jump to memcpy if we can move forwards
+ mov rcx, rdx ; count
+ mov r9, rdi ; dest
+%ENDIF
+%ENDM
+
+
+; Define return from this function
+%MACRO RETURNM 0
+%IFDEF WINDOWS
+ pop rdi
+ pop rsi
+%ENDIF
+ mov rax, r9 ; Return value = dest
+ ret
+%ENDMACRO
+
+
+SECTION .text align=16
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Common entry for dispatch
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; extern "C" void * A_memmove(void * dest, const void * src, size_t count);
+; Function entry:
+A_memmove:
+?OVR_memmove:
+ jmp qword [memmoveDispatch] ; Go to appropriate version, depending on instruction set
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; AVX Version for processors with fast unaligned read and fast 32 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memmoveU256: ; Version for processors with fast 256-bit read/write
+memmoveU256@: ; local label
+ PROLOGM memcpyU256
+
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [rdi+rcx] ; end of dest
+ and edx, 1FH
+ jz B4300 ; Skip if end of dest aligned by 32
+
+ ; edx = size of last partial block, 1 - 31 bytes
+ test dl, 3
+ jz B4210
+ test dl, 1
+ jz B4201 ; B4200 if we haven't tested edx,3
+ ; move 1 byte
+ dec rcx
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+B4200: test dl, 2
+ jz B4210
+B4201: ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+B4210: test dl, 4
+ jz B4220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+B4220: test dl, 8
+ jz B4230
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+B4230: test dl, 16
+ jz B4300
+ ; move 16 bytes
+ sub rcx, 16
+ movups xmm0, [rsi+rcx]
+ movaps [rdi+rcx], xmm0
+
+B4300: ; Now end of dest is aligned by 32. Any partial block has been moved
+ mov rdx, rcx
+ and ecx, 1FH ; remaining size after 32 bytes blocks moved
+ and rdx, -20H ; number of 32 bytes blocks
+ jz H4100
+ add rsi, rcx
+ add rdi, rcx
+
+ ; Check if count very big
+ cmp rdx, [CacheBypassLimit]
+ ja H4800 ; Use non-temporal store if count > _CacheBypassLimit
+
+align 16
+H4000: ; 32 bytes move loop
+ vmovups ymm0, [rsi+rdx-20H]
+ vmovaps [rdi+rdx-20H], ymm0
+ sub rdx, 20H
+ jnz H4000
+ vzeroupper
+
+H4090: sub rsi, rcx
+ sub rdi, rcx
+
+H4100: ; remaining 0-31 bytes
+ test ecx, ecx
+ jz H4600
+ test cl, 10H
+ jz H4200
+ ; move 16 bytes
+ sub ecx, 10H
+ movups xmm0, [rsi+rcx]
+ movaps [rdi+rcx], xmm0
+ jz H4600 ; early out if count divisible by 16
+H4200: test cl, 8
+ jz H4300
+ ; move 8 bytes
+ sub ecx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+H4300: test cl, 4
+ jz H4400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ jz H4600 ; early out if count divisible by 4
+H4400: test cl, 2
+ jz H4500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+H4500: test cl, 1
+ jz H4600
+ ; move 1 byte
+ movzx eax, byte [rsi] ; rcx-1 = 0
+ mov [rdi], al
+H4600: ; finished
+ RETURNM
+
+align 16
+H4800: ; 32 bytes move loop, bypass cache
+ vmovups ymm0, [rsi+rdx-20H]
+ vmovntps [rdi+rdx-20H], ymm0
+ sub rdx, 20H
+ jnz H4800
+ vzeroupper
+ jmp H4090
+
+A1000: ; count < 64. Move 32-16-8-4-2-1 bytes
+ test cl, 20H
+ jz A1100
+ ; move 32 bytes
+ ; movups is faster on processors with SSSE3
+ sub ecx, 20H
+ movups xmm0, [rsi+rcx+10H]
+ movups xmm1, [rsi+rcx]
+ movups [rdi+rcx+10H], xmm0
+ movups [rdi+rcx], xmm1
+A1100: test cl, 10H
+ jz A1200
+ ; move 16 bytes
+ sub ecx, 10H
+ movups xmm0, [rsi+rcx]
+ movups [rdi+rcx], xmm0
+A1200: test cl, 8
+ jz A1300
+ ; move 8 bytes
+ sub ecx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+A1300: test cl, 4
+ jz A1400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ jz A1900 ; early out if count divisible by 4
+A1400: test cl, 2
+ jz A1500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+A1500: test cl, 1
+ jz A1900
+ ; move 1 byte
+ movzx eax, byte [rsi] ; rcx-1 = 0
+ mov [rdi], al
+A1900: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with fast unaligned read and fast 16 bytes write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memmoveU: ; Version for processors with fast unaligned read
+memmoveU@: ; local label
+ PROLOGM memcpyU
+
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+
+ ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [rdi+rcx] ; end of dest
+ and edx, 0FH
+ jz B3300 ; Skip if end of dest aligned by 16
+
+ ; edx = size of last partial block, 1 - 15 bytes
+ test dl, 3
+ jz B3210
+ test dl, 1
+ jz B3201 ; B3200 if we haven't tested edx,3
+ ; move 1 byte
+ dec rcx
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+B3200: test dl, 2
+ jz B3210
+B3201: ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+B3210: test dl, 4
+ jz B3220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+B3220: test dl, 8
+ jz B3300
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+
+B3300: ; Now end of dest is aligned by 16. Any partial block has been moved
+ mov rdx, rcx
+ and ecx, 1FH ; remaining size after 32 bytes blocks moved
+ and rdx, -20H ; number of 32 bytes blocks
+ jz H1100
+ add rsi, rcx
+ add rdi, rcx
+
+ ; Check if count very big
+ cmp rdx, [CacheBypassLimit]
+ ja H1800 ; Use non-temporal store if count > _CacheBypassLimit
+
+align 16 ; minimize 16-bytes boundaries in H1000 loop
+H1000: ; 32 bytes move loop
+ movups xmm1, [rsi+rdx-20H]
+ movups xmm0, [rsi+rdx-10H]
+ movaps [rdi+rdx-20H], xmm1
+ movaps [rdi+rdx-10H], xmm0
+ sub rdx, 20H
+ jnz H1000
+
+H1090: sub rsi, rcx
+ sub rdi, rcx
+
+H1100: ; remaining 0-31 bytes
+ test ecx, ecx
+ jz H1600
+ test cl, 10H
+ jz H1200
+ ; move 16 bytes
+ sub ecx, 10H
+ movups xmm0, [rsi+rcx]
+ movaps [rdi+rcx], xmm0
+ jz H1600 ; early out if count divisible by 16
+H1200: test cl, 8
+ jz H1300
+ ; move 8 bytes
+ sub ecx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+H1300: test cl, 4
+ jz H1400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ jz H1600 ; early out if count divisible by 4
+H1400: test cl, 2
+ jz H1500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+H1500: test cl, 1
+ jz H1600
+ ; move 1 byte
+ movzx eax, byte [rsi] ; rcx-1 = 0
+ mov [rdi], al
+H1600: ; finished
+ RETURNM
+
+align 16
+H1800: ; 32 bytes move loop, bypass cache
+ movups xmm1, [rsi+rdx-20H]
+ movups xmm0, [rsi+rdx-10H]
+ movntps [rdi+rdx-20H], xmm1
+ movntps [rdi+rdx-10H], xmm0
+ sub rdx, 20H
+ jnz H1800
+ jmp H1090
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSSE3. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+align 16
+memmoveSSSE3: ; SSSE3 version begins here
+memmoveSSSE3@: ; local label
+ PROLOGM memcpySSSE3
+
+ ; Cannot use memcpy. Must move backwards because of overlap between src and dest
+ cmp rcx, 40H
+ jb A1000 ; Use simpler code if count < 64
+ ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+ lea edx, [rdi+rcx] ; end of dest
+ and edx, 0FH
+ jz B1300 ; Skip if end of dest aligned by 16
+
+ ; edx = size of last partial block, 1 - 15 bytes
+ test dl, 3
+ jz B1210
+ test dl, 1
+ jz B1201 ; B1200 if we haven't tested edx,3
+ ; move 1 byte
+ dec rcx
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+B1200: test dl, 2
+ jz B1210
+B1201: ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+B1210: test dl, 4
+ jz B1220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+B1220: test dl, 8
+ jz B1300
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+
+B1300: ; Now end of dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of end of src modulo 16 at this point:
+ lea eax, [rsi+rcx]
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ sub edx, ecx ; Remaining data after loop
+ sub rsi, rax ; Nearest preceding aligned block of src
+ ; Add the same to rsi and rdi as we have subtracted from rcx
+ add rsi, rdx
+ add rdi, rdx
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B1400 ; Use non-temporal store if count > CacheBypassLimit
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [MAlignmentDispatchSSSE3]
+ jmp near [r8+rax*8]
+
+B1400: ; Dispatch to different codes depending on src alignment
+ lea r8, [MAlignmentDispatchNT]
+ jmp near [r8+rax*8]
+
+
+align 16
+C100: ; Code for aligned src. SSE2 and later CPUs
+ ; The nice case, src and dest have same alignment.
+
+ ; Loop. rcx has positive index from the beginning, counting down to zero
+ movaps xmm0, [rsi+rcx-10H]
+ movaps xmm1, [rsi+rcx-20H]
+ movaps [rdi+rcx-10H], xmm0
+ movaps [rdi+rcx-20H], xmm1
+ sub rcx, 20H
+ jnz C100
+
+ ; Move the remaining edx bytes (0 - 31):
+ ; move 16-8-4-2-1 bytes, aligned
+ test edx, edx
+ jz C500 ; Early out if no more data
+ test dl, 10H
+ jz C200
+ ; move 16 bytes
+ sub rcx, 10H
+ movaps xmm0, [rsi+rcx]
+ movaps [rdi+rcx], xmm0
+
+C200: ; Other branches come in here, rcx may contain arbitrary offset
+ test edx, edx
+ jz C500 ; Early out if no more data
+ test dl, 8
+ jz C210
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+C210: test dl, 4
+ jz C220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ jz C500 ; Early out if count divisible by 4
+C220: test dl, 2
+ jz C230
+ ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+C230: test dl, 1
+ jz C500
+ ; move 1 byte
+ movzx eax, byte [rsi+rcx-1] ; rcx-1 is not always 0 here
+ mov [rdi+rcx-1], al
+C500: ; finished
+ RETURNM
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Version for processors with SSE2. Aligned read + shift + aligned write
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+memmoveSSE2: ; SSE2 version begins here
+memmoveSSE2@: ; local label
+ PROLOGM memcpySSE2
+
+ ; Cannot use memcpy. Must move backwards because of overlap between src and dest
+ cmp rcx, 40H
+        jae B0100                       ; Jump if count >= 64; the simpler code below handles count < 64
+
+ ; count < 64. Move 32-16-8-4-2-1 bytes
+ test cl, 20H
+ jz A100
+ ; move 32 bytes
+ ; mov is faster than movdqu on SSE2 processors,
+ ; movdqu is faster on later processors
+ sub ecx, 20H
+ mov rax, [rsi+rcx+18H]
+ mov rdx, [rsi+rcx+10H]
+ mov [rdi+rcx+18H], rax
+ mov [rdi+rcx+10H], rdx
+ mov rax, [rsi+rcx+8]
+ mov rdx, [rsi+rcx]
+ mov [rdi+rcx+8], rax
+ mov [rdi+rcx], rdx
+A100: test cl, 10H
+ jz A200
+ ; move 16 bytes
+ sub ecx, 10H
+ mov rax, [rsi+rcx+8]
+ mov rdx, [rsi+rcx]
+ mov [rdi+rcx+8], rax
+ mov [rdi+rcx], rdx
+A200: test cl, 8
+ jz A300
+ ; move 8 bytes
+ sub ecx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+A300: test cl, 4
+ jz A400
+ ; move 4 bytes
+ sub ecx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+ jz A900 ; early out if count divisible by 4
+A400: test cl, 2
+ jz A500
+ ; move 2 bytes
+ sub ecx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+A500: test cl, 1
+ jz A900
+ ; move 1 byte
+ movzx eax, byte [rsi] ; rcx-1 = 0
+ mov [rdi], al
+A900: ; finished
+ RETURNM
+
+B0100: ; count >= 64
+ ; Note: this part will not always work if count < 64
+ ; Calculate size of last block after last regular boundary of dest
+        lea edx, [rdi+rcx]              ; end of dest
+ and edx, 0FH
+ jz B0300 ; Skip if end of dest aligned by 16
+
+ ; edx = size of last partial block, 1 - 15 bytes
+ test dl, 3
+ jz B0210
+ test dl, 1
+ jz B0201 ; B0200 if we haven't tested edx,3
+ ; move 1 byte
+ dec rcx
+ movzx eax, byte [rsi+rcx]
+ mov [rdi+rcx], al
+B0200: test dl, 2
+ jz B0210
+B0201: ; move 2 bytes
+ sub rcx, 2
+ movzx eax, word [rsi+rcx]
+ mov [rdi+rcx], ax
+B0210: test dl, 4
+ jz B0220
+ ; move 4 bytes
+ sub rcx, 4
+ mov eax, [rsi+rcx]
+ mov [rdi+rcx], eax
+B0220: test dl, 8
+ jz B0300
+ ; move 8 bytes
+ sub rcx, 8
+ mov rax, [rsi+rcx]
+ mov [rdi+rcx], rax
+
+B0300: ; Now end of dest is aligned by 16. Any partial block has been moved
+ ; Find alignment of end of src modulo 16 at this point:
+ lea eax, [rsi+rcx]
+ and eax, 0FH
+
+ ; Set up for loop moving 32 bytes per iteration:
+ mov edx, ecx ; Save count
+ and rcx, -20H ; Round down to nearest multiple of 32
+ sub edx, ecx ; Remaining data after loop
+ sub rsi, rax ; Nearest preceding aligned block of src
+ ; Add the same to rsi and rdi as we have subtracted from rcx
+ add rsi, rdx
+ add rdi, rdx
+
+ ; Check if count very big
+ cmp rcx, [CacheBypassLimit]
+ ja B0400 ; Use non-temporal store if count > CacheBypassLimit
+
+ ; Dispatch to different codes depending on src alignment
+ lea r8, [MAlignmentDispatchSSE2]
+ jmp near [r8+rax*8]
+
+B0400: ; Dispatch to different codes depending on src alignment
+ lea r8, [MAlignmentDispatchNT]
+ jmp near [r8+rax*8]
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Macros and alignment jump tables
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for each src alignment, SSE2 instruction set:
+; Make separate code for each alignment u because the shift instructions
+; have the shift count as a constant:
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2 2 ; u, nt
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; %2 = 1 if non-temporal store desired
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = count rounded down to nearest divisible by 32
+; edx = remaining bytes to move after loop
+ movdqa xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H
+ movdqa xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movdqa xmm2, [rsi+rcx]
+ movdqa xmm3, xmm1 ; Copy because used twice
+ pslldq xmm0, 16-%1 ; shift left
+ psrldq xmm1, %1 ; shift right
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx+10H], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx+10H], xmm0 ; Save aligned
+ %ENDIF
+ movdqa xmm0, xmm2 ; Save for next iteration
+ pslldq xmm3, 16-%1 ; shift left
+ psrldq xmm2, %1 ; shift right
+ por xmm3, xmm2 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx], xmm3 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx], xmm3 ; Save aligned
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movdqa xmm1, [rsi+rcx]
+ pslldq xmm0, 16-%1 ; shift left
+ psrldq xmm1, %1 ; shift right
+ por xmm0, xmm1 ; combine blocks
+ %IF %2 == 0
+ movdqa [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntdq [rdi+rcx], xmm0 ; Save aligned
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
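+
+; For illustration only: a minimal C++ sketch (SSE2 intrinsics, hypothetical helper,
+; not part of this library) of the idea used by MOVE_REVERSE_UNALIGNED_SSE2 above:
+; combine two adjacent aligned 16-byte blocks by byte shifts and OR. U must be a
+; compile-time constant, which is also why the macro is instantiated once per
+; alignment value: the byte-shift instructions take their count as an immediate.
+;
+;   #include <emmintrin.h>
+;   template <int U>                                     // U = misalignment, 1..15
+;   static inline __m128i combine(__m128i lo, __m128i hi) {
+;       return _mm_or_si128(_mm_srli_si128(lo, U),       // bytes from the low block
+;                           _mm_slli_si128(hi, 16 - U)); // bytes from the high block
+;   }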
+
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2_4 1 ; nt
+; Special case: u = 4
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ movaps xmm2, [rsi+rcx]
+ movaps xmm3, xmm0
+ movaps xmm0, xmm2
+ movss xmm2, xmm1
+ shufps xmm2, xmm2, 00111001B ; Rotate right
+ movss xmm1, xmm3
+ shufps xmm1, xmm1, 00111001B ; Rotate right
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm1 ; Save aligned
+ movaps [rdi+rcx], xmm2 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm1 ; Non-temporal save
+ movntps [rdi+rcx], xmm2 ; Non-temporal save
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movaps xmm1, [rsi+rcx]
+ movss xmm1, xmm0
+ shufps xmm1, xmm1, 00111001B ; Rotate right
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm1 ; Non-temporal save
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2_8 1 ; nt
+; Special case: u = 8
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+ shufps xmm0, xmm0, 01001110B ; Rotate
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ shufps xmm1, xmm1, 01001110B ; Rotate
+ movsd xmm0, xmm1
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx]
+ shufps xmm0, xmm0, 01001110B ; Rotate
+ movsd xmm1, xmm0
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm1 ; Non-temporal save
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movaps xmm1, [rsi+rcx]
+ shufps xmm1, xmm1, 01001110B ; Rotate
+ movsd xmm0, xmm1
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSE2_12 1 ; nt
+; Special case: u = 12
+; %1 = 1 if non-temporal store desired
+ movaps xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+        shufps xmm0, xmm0, 10010011B    ; Rotate left
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H
+ movaps xmm1, [rsi+rcx+10H] ; Read next two blocks aligned
+ shufps xmm1, xmm1, 10010011B ; Rotate left
+ movss xmm0, xmm1
+ %IF %1 == 0
+ movaps [rdi+rcx+10H], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx+10H], xmm0 ; Non-temporal save
+ %ENDIF
+ movaps xmm0, [rsi+rcx]
+ shufps xmm0, xmm0, 10010011B ; Rotate left
+ movss xmm1, xmm0
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm1 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm1 ; Non-temporal save
+ %ENDIF
+ jnz %%L1
+
+ ; Move edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movaps xmm1, [rsi+rcx]
+ shufps xmm1, xmm1, 10010011B ; Rotate left
+ movss xmm0, xmm1
+ %IF %1 == 0
+ movaps [rdi+rcx], xmm0 ; Save aligned
+ %ELSE
+ movntps [rdi+rcx], xmm0 ; Non-temporal save
+ %ENDIF
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes, unaligned
+ jmp C200
+%ENDMACRO
+
+
+; Macros for each src alignment, Suppl.SSE3 instruction set:
+; Code for unaligned src, Suppl.SSE3 instruction set.
+; Make separate code for each alignment u because the palignr instruction
+; has the shift count as a constant:
+
+%MACRO MOVE_REVERSE_UNALIGNED_SSSE3 1; u
+; Move rcx + rdx bytes of data
+; Source is misaligned. (src-dest) modulo 16 = %1
+; eax = %1
+; rsi = src - %1 = nearest preceding 16-bytes boundary
+; rdi = dest (aligned)
+; rcx = count rounded down to nearest divisible by 32
+; edx = remaining bytes to move after loop
+ movdqa xmm0, [rsi+rcx] ; Read from nearest following 16B boundary
+
+%%L1: ; Loop. rcx has positive index from the beginning, counting down to zero
+ movdqa xmm1, [rsi+rcx-10H] ; Read next two blocks
+ palignr xmm0, xmm1, %1 ; Combine parts into aligned block
+ movdqa [rdi+rcx-10H], xmm0 ; Save aligned
+ movdqa xmm0, [rsi+rcx-20H]
+ palignr xmm1, xmm0, %1 ; Combine parts into aligned block
+ movdqa [rdi+rcx-20H], xmm1 ; Save aligned
+ sub rcx, 20H
+ jnz %%L1
+
+ ; Set up for edx remaining bytes
+ test dl, 10H
+ jz %%L2
+ ; One more 16-bytes block to move
+ sub rcx, 10H
+ movdqa xmm1, [rsi+rcx] ; Read next two blocks
+ palignr xmm0, xmm1, %1 ; Combine parts into aligned block
+ movdqa [rdi+rcx], xmm0 ; Save aligned
+
+%%L2: ; Get src pointer back to misaligned state
+ add rsi, rax
+ ; Move remaining 0 - 15 bytes
+ jmp C200
+%ENDMACRO
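+
+; For illustration only: with Suppl-SSE3 the shift-and-combine above collapses into a
+; single palignr. A hypothetical C++ equivalent of one combine step (tmmintrin.h):
+;
+;   #include <tmmintrin.h>
+;   template <int U>                                     // U = misalignment, 1..15
+;   static inline __m128i combine_ssse3(__m128i hi, __m128i lo) {
+;       return _mm_alignr_epi8(hi, lo, U);               // low 16 bytes of (hi:lo) >> U bytes
+;   }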
+
+
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchSSE2 below
+; (aligns and fillers are inserted manually to minimize the
+; number of 16-bytes boundaries inside loops)
+
+align 16
+D104: MOVE_REVERSE_UNALIGNED_SSE2_4 0
+D108: MOVE_REVERSE_UNALIGNED_SSE2_8 0
+D10C: MOVE_REVERSE_UNALIGNED_SSE2_12 0
+D101: MOVE_REVERSE_UNALIGNED_SSE2 1, 0
+D102: MOVE_REVERSE_UNALIGNED_SSE2 2, 0
+D103: MOVE_REVERSE_UNALIGNED_SSE2 3, 0
+D105: MOVE_REVERSE_UNALIGNED_SSE2 5, 0
+D106: MOVE_REVERSE_UNALIGNED_SSE2 6, 0
+D107: MOVE_REVERSE_UNALIGNED_SSE2 7, 0
+D109: MOVE_REVERSE_UNALIGNED_SSE2 9, 0
+D10A: MOVE_REVERSE_UNALIGNED_SSE2 0AH, 0
+D10B: MOVE_REVERSE_UNALIGNED_SSE2 0BH, 0
+D10D: MOVE_REVERSE_UNALIGNED_SSE2 0DH, 0
+D10E: MOVE_REVERSE_UNALIGNED_SSE2 0EH, 0
+D10F: MOVE_REVERSE_UNALIGNED_SSE2 0FH, 0
+
+; Make 15 instances of Suppl-SSE3 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchSSSE3 below
+
+align 16
+E104: MOVE_REVERSE_UNALIGNED_SSSE3 4
+E108: MOVE_REVERSE_UNALIGNED_SSSE3 8
+E10C: MOVE_REVERSE_UNALIGNED_SSSE3 0CH
+E101: MOVE_REVERSE_UNALIGNED_SSSE3 1
+E102: MOVE_REVERSE_UNALIGNED_SSSE3 2
+E103: MOVE_REVERSE_UNALIGNED_SSSE3 3
+E105: MOVE_REVERSE_UNALIGNED_SSSE3 5
+E106: MOVE_REVERSE_UNALIGNED_SSSE3 6
+E107: MOVE_REVERSE_UNALIGNED_SSSE3 7
+E109: MOVE_REVERSE_UNALIGNED_SSSE3 9
+E10A: MOVE_REVERSE_UNALIGNED_SSSE3 0AH
+E10B: MOVE_REVERSE_UNALIGNED_SSSE3 0BH
+E10D: MOVE_REVERSE_UNALIGNED_SSSE3 0DH
+E10E: MOVE_REVERSE_UNALIGNED_SSSE3 0EH
+E10F: MOVE_REVERSE_UNALIGNED_SSSE3 0FH
+
+align 16
+F100: ; Non-temporal move, src and dest have same alignment.
+ ; Loop. rcx has positive index from the beginning, counting down to zero
+ sub rcx, 20H
+ movaps xmm0, [rsi+rcx+10H]
+ movaps xmm1, [rsi+rcx]
+ movntps [rdi+rcx+10H], xmm0
+ movntps [rdi+rcx], xmm1
+ jnz F100
+
+ ; Move the remaining edx bytes (0 - 31):
+ ; move 16-8-4-2-1 bytes, aligned
+ test dl, 10H
+ jz C200
+ ; move 16 bytes
+ sub rcx, 10H
+ movaps xmm0, [rsi+rcx]
+ movntps [rdi+rcx], xmm0
+ ; move the remaining 0 - 15 bytes
+ jmp C200
+
+; Non-temporal move, src and dest have different alignment.
+; Make 15 instances of SSE2 macro for each value of the alignment u.
+; These are pointed to by the jump table MAlignmentDispatchNT below
+
+align 16
+F101: MOVE_REVERSE_UNALIGNED_SSE2 1, 1
+F102: MOVE_REVERSE_UNALIGNED_SSE2 2, 1
+F103: MOVE_REVERSE_UNALIGNED_SSE2 3, 1
+F104: MOVE_REVERSE_UNALIGNED_SSE2_4 1
+F105: MOVE_REVERSE_UNALIGNED_SSE2 5, 1
+F106: MOVE_REVERSE_UNALIGNED_SSE2 6, 1
+F107: MOVE_REVERSE_UNALIGNED_SSE2 7, 1
+F108: MOVE_REVERSE_UNALIGNED_SSE2_8 1
+F109: MOVE_REVERSE_UNALIGNED_SSE2 9, 1
+F10A: MOVE_REVERSE_UNALIGNED_SSE2 0AH, 1
+F10B: MOVE_REVERSE_UNALIGNED_SSE2 0BH, 1
+F10C: MOVE_REVERSE_UNALIGNED_SSE2_12 1
+F10D: MOVE_REVERSE_UNALIGNED_SSE2 0DH, 1
+F10E: MOVE_REVERSE_UNALIGNED_SSE2 0EH, 1
+F10F: MOVE_REVERSE_UNALIGNED_SSE2 0FH, 1
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; CPU dispatcher
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+memmoveCPUDispatch: ; CPU dispatcher, check for Suppl-SSE3 instruction set
+ ; This part is executed only once
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ push rdi
+ push r8
+
+ ; set CacheBypassLimit to half the size of the largest level cache
+%ifdef WINDOWS
+ xor ecx, ecx ; 0 means default
+%else
+ xor edi, edi
+%endif
+ call SetMemcpyCacheLimit@
+ mov eax, 1
+ cpuid ; Get feature flags
+ lea rbx, [memmoveSSE2@]
+ bt ecx, 9 ; Test bit for SupplSSE3
+ jnc Q100
+ lea rbx, [memmoveSSSE3@]
+ call UnalignedIsFaster
+ test eax, eax
+ jz Q100
+ lea rbx, [memmoveU@]
+ call Store256BitIsFaster
+ test eax, eax
+ jz Q100
+ lea rbx, [memmoveU256@]
+
+Q100: ; Insert appropriate pointer
+ mov [memmoveDispatch], rbx
+ mov rax, rbx
+ pop r8
+ pop rdi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ ; Jump according to the replaced function pointer
+ jmp rax
+
+; Note: Must call SetMemcpyCacheLimit1 defined in memcpy64.asm
+SetMemcpyCacheLimit:
+SetMemcpyCacheLimit@:
+ call SetMemcpyCacheLimit1
+ mov [CacheBypassLimit], rax
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; data section. jump tables, dispatch function pointer, cache size
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Data segment must be included in function namespace
+SECTION .data
+align 16
+
+; Jump tables for alignments 0 - 15:
+; Each version of the code indexes into its own table:
+; MAlignmentDispatchSSE2 for the SSE2 version, MAlignmentDispatchSSSE3 for the
+; Suppl-SSE3 version, and MAlignmentDispatchNT for non-temporal stores.
+
+; Code pointer for each alignment for SSE2 instruction set
+MAlignmentDispatchSSE2:
+DQ C100, D101, D102, D103, D104, D105, D106, D107
+DQ D108, D109, D10A, D10B, D10C, D10D, D10E, D10F
+
+; Code pointer for each alignment for Suppl-SSE3 instruction set
+MAlignmentDispatchSSSE3:
+DQ C100, E101, E102, E103, E104, E105, E106, E107
+DQ E108, E109, E10A, E10B, E10C, E10D, E10E, E10F
+
+; Code pointer for each alignment for non-temporal store
+MAlignmentDispatchNT:
+DQ F100, F101, F102, F103, F104, F105, F106, F107
+DQ F108, F109, F10A, F10B, F10C, F10D, F10E, F10F
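+
+; For illustration only: these tables are indexed by the source misalignment (0 - 15).
+; A rough C++ analogue (hypothetical names; the labels only indicate the mapping):
+;
+;   typedef void (*MoveFn)();                            // one handler per alignment
+;   static const MoveFn dispatchSSE2[16] = { C100, D101, D102, /* ... */ D10F };
+;   dispatchSSE2[alignment]();                           // same effect as jmp [r8+rax*8]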
+
+memmoveDispatch: DQ memmoveCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > CacheBypassLimit
+; The optimal value of CacheBypassLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache:
+CacheBypassLimit: DD 0
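+
+; For illustration only: the cache-bypass decision above in rough C++ terms
+; (hypothetical helper, SSE2 intrinsics):
+;
+;   #include <emmintrin.h>
+;   #include <cstddef>
+;   static void store_block(__m128i * dst, __m128i v, size_t count, size_t limit) {
+;       if (count > limit) _mm_stream_si128(dst, v);     // non-temporal: bypass the cache
+;       else               _mm_store_si128(dst, v);      // normal aligned store
+;   }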
diff --git a/asmlibSrc/memset32.asm b/asmlibSrc/memset32.asm
new file mode 100755
index 0000000..f4d025f
--- /dev/null
+++ b/asmlibSrc/memset32.asm
@@ -0,0 +1,487 @@
+;************************* memset32.asm *************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2013-09-11
+; Description:
+; Faster version of the standard memset function:
+; void * A_memset(void * dest, int c, size_t count);
+; Sets 'count' bytes from 'dest' to the 8-bit value 'c'
+;
+; Overriding standard function memset:
+; The alias ?OVR_memset is changed to _memset in the object file if
+; it is desired to override the standard library function memset.
+;
+; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
+; extern "C" void SetMemsetCacheLimit(); // Change limit in GetMemsetCacheLimit
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; Optimization:
+; Uses XMM registers to set 16 bytes at a time, aligned.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
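+
+; For illustration only: a minimal C++ usage sketch of the functions declared above.
+; The SetMemsetCacheLimit parameter is inferred from the code below (it reads one stack
+; argument, and 0 restores the default); treat the exact prototype as an assumption.
+;
+;   extern "C" void * A_memset(void * dest, int c, size_t count);
+;   extern "C" size_t GetMemsetCacheLimit();
+;   extern "C" void SetMemsetCacheLimit(size_t limit);   // 0 = recompute the default
+;
+;   char buf[4096];
+;   A_memset(buf, 0, sizeof(buf));                       // same semantics as memset()
+;   size_t limit = GetMemsetCacheLimit();                // non-temporal store threshold
+;   SetMemsetCacheLimit(2 * limit);                      // example: raise the threshold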
+
+global _A_memset: function ; Function memset
+global ?OVR_memset: function ; ?OVR removed if standard function memset overridden
+global _GetMemsetCacheLimit: function ; Data blocks bigger than this will be stored uncached by memset
+global _SetMemsetCacheLimit: function ; Change limit in GetMemsetCacheLimit
+; Direct entries to CPU-specific versions
+global _memset386: function ; version for old CPUs without SSE
+global _memsetSSE2: function ; SSE2 version
+global _memsetAVX: function ; version for CPUs with fast 256-bit store
+
+
+; Imported from cachesize32.asm:
+extern _DataCacheSize ; Get size of data cache
+
+; Imported from instrset32.asm
+extern _InstructionSet ; Instruction set for CPU dispatcher
+
+; Imported from unalignedisfaster32.asm:
+extern _Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Define return from this function
+%MACRO RETURNM 0
+%IFDEF POSITIONINDEPENDENT
+ pop ebx
+%ENDIF
+ mov eax, [esp+4] ; return dest
+ ret
+%ENDMACRO
+
+
+SECTION .text align=16
+
+; extern "C" void * memset(void * dest, int c, size_t count);
+; Function entry:
+_A_memset:
+?OVR_memset:
+%IFNDEF POSITIONINDEPENDENT
+ jmp dword [memsetDispatch] ; Go to appropriate version, depending on instruction set
+RP equ 0 ; RP = 0 if not position-independent
+
+%ELSE ; Position-independent code
+ push ebx
+ call get_thunk_ebx ; get reference point for position-independent code
+RP: ; reference point ebx = offset RP
+
+; Make the following instruction with address relative to RP:
+ jmp dword [ebx+memsetDispatch-RP]
+
+%ENDIF
+
+_memsetAVX: ; AVX version. Use ymm register
+%IFDEF POSITIONINDEPENDENT
+ push ebx
+ call get_thunk_ebx ; get reference point for position-independent code
+ add ebx, RP - $
+memsetAVX@: ; local label
+ mov edx, [esp+4+4] ; dest
+ movzx eax, byte [esp+4+8] ; c
+ mov ecx, [esp+4+12] ; count
+%ELSE
+memsetAVX@: ; local label
+ mov edx, [esp+4] ; dest
+ movzx eax, byte [esp+8] ; c
+ mov ecx, [esp+12] ; count
+%ENDIF
+ imul eax, 01010101H ; Broadcast c into all bytes of eax
+ cmp ecx, 16
+ ja B100
+
+B050: ; count <= 16, both SSE2 and AVX version
+%IFNDEF POSITIONINDEPENDENT
+ jmp dword [MemsetJTab+ecx*4]
+%ELSE
+ jmp dword [MemsetJTab-RP+ebx+ecx*4]
+%ENDIF
+
+; Separate code for each count from 0 to 16:
+M16: mov [edx+12], eax
+M12: mov [edx+8], eax
+M08: mov [edx+4], eax
+M04: mov [edx], eax
+M00: RETURNM
+
+M15: mov [edx+11], eax
+M11: mov [edx+7], eax
+M07: mov [edx+3], eax
+M03: mov [edx+1], ax
+M01: mov [edx], al
+ RETURNM
+
+M14: mov [edx+10], eax
+M10: mov [edx+6], eax
+M06: mov [edx+2], eax
+M02: mov [edx], ax
+ RETURNM
+
+M13: mov [edx+9], eax
+M09: mov [edx+5], eax
+M05: mov [edx+1], eax
+ mov [edx], al
+ RETURNM
+
+align 16
+B100: ; count > 16.
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 ; Broadcast c into all bytes of xmm0
+ lea eax, [edx+ecx] ; point to end
+
+ cmp ecx, 20H
+ jbe K600 ; faster to use xmm registers if small
+
+ ; Store the first possibly unaligned 16 bytes
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the subsequent regular part, than to make possibly mispredicted
+ ; branches depending on the size of the first part.
+ movups oword [edx], xmm0
+
+ ; store another 16 bytes, aligned
+ add edx, 10H
+ and edx, -10H
+ movaps oword [edx], xmm0
+
+ ; go to next 32 bytes boundary
+ add edx, 10H
+ and edx, -20H
+
+ ; Check if count very big
+%IFNDEF POSITIONINDEPENDENT
+ cmp ecx, [_MemsetCacheLimit]
+%ELSE ; position-independent code
+ cmp ecx, [ebx+_MemsetCacheLimit-RP]
+%ENDIF
+ ja K300 ; Use non-temporal store if count > MemsetCacheLimit
+
+ ; find last 32 bytes boundary
+ mov ecx, eax
+ and ecx, -20H
+
+ ; - size of 32-bytes blocks
+ sub edx, ecx
+ jnb K200 ; Jump if not negative
+
+ ; extend value to 256 bits
+ vinsertf128 ymm0,ymm0,xmm0,1
+
+K100: ; Loop through 32-bytes blocks
+ ; ecx = end of 32-bytes blocks part
+ ; edx = negative index from the end, counting up to zero
+ vmovaps [ecx+edx], ymm0
+ add edx, 20H
+ jnz K100
+ vzeroupper
+
+K200: ; the last part from ecx to eax is < 32 bytes. write last 32 bytes with overlap
+ movups [eax-20H], xmm0
+ movups [eax-10H], xmm0
+ RETURNM
+
+K300: ; Use non-temporal moves, same code as above:
+
+ ; find last 32 bytes boundary
+ mov ecx, eax
+ and ecx, -20H
+
+ ; - size of 32-bytes blocks
+ sub edx, ecx
+ jnb K500 ; Jump if not negative
+
+ ; extend value to 256 bits
+ vinsertf128 ymm0,ymm0,xmm0,1
+
+align 16
+K400: ; Loop through 32-bytes blocks
+ ; ecx = end of 32-bytes blocks part
+ ; edx = negative index from the end, counting up to zero
+ vmovntps [ecx+edx], ymm0
+ add edx, 20H
+ jnz K400
+ vzeroupper
+
+K500: ; the last part from ecx to eax is < 32 bytes. write last 32 bytes with overlap
+ movups [eax-20H], xmm0
+ movups [eax-10H], xmm0
+ RETURNM
+
+K600: ; 16 < count <= 32
+ movups [edx], xmm0
+ movups [eax-10H], xmm0
+ RETURNM
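+
+; For illustration only: the "always write 16 bytes, possibly overlapping" trick used
+; above, in rough C++ (SSE2 intrinsics, hypothetical helper, assumes count >= 32):
+;
+;   #include <emmintrin.h>
+;   #include <cstdint>
+;   static void fill(char * p, __m128i v, size_t n) {
+;       _mm_storeu_si128((__m128i*)p, v);                // unaligned head, may overlap body
+;       char * a   = (char*)(((uintptr_t)p + 16) & ~(uintptr_t)15);
+;       char * end = p + n - 16;                         // start of the 16-byte tail
+;       for (; a < end; a += 16)
+;           _mm_store_si128((__m128i*)a, v);             // aligned body, may overlap tail
+;       _mm_storeu_si128((__m128i*)end, v);              // unaligned tail
+;   }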
+
+
+align 16
+_memsetSSE2: ; SSE2 version. Use xmm register
+%IFDEF POSITIONINDEPENDENT
+ push ebx
+ call get_thunk_ebx ; get reference point for position-independent code
+ add ebx, RP - $
+memsetSSE2@: ; local label
+ mov edx, [esp+4+4] ; dest
+ movzx eax, byte [esp+4+8] ; c
+ mov ecx, [esp+4+12] ; count
+%ELSE
+memsetSSE2@: ; local label
+ mov edx, [esp+4] ; dest
+ movzx eax, byte [esp+8] ; c
+ mov ecx, [esp+12] ; count
+%ENDIF
+ imul eax, 01010101H ; Broadcast c into all bytes of eax
+ cmp ecx, 16
+ jna B050 ; small counts: same as AVX version
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 ; Broadcast c into all bytes of xmm0
+
+ ; Store the first unaligned part.
+ ; The size of this part is 1 - 16 bytes.
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the subsequent regular part, than to make possibly mispredicted
+ ; branches depending on the size of the first part.
+ movq qword [edx], xmm0
+ movq qword [edx+8], xmm0
+
+ ; Check if count very big
+%IFNDEF POSITIONINDEPENDENT
+ cmp ecx, [_MemsetCacheLimit]
+%ELSE ; position-independent code
+ cmp ecx, [ebx+_MemsetCacheLimit-RP]
+%ENDIF
+ ja M500 ; Use non-temporal store if count > MemsetCacheLimit
+
+ ; Point to end of regular part:
+ ; Round down dest+count to nearest preceding 16-bytes boundary
+ lea ecx, [edx+ecx-1]
+ and ecx, -10H
+
+ ; Point to start of regular part:
+ ; Round up dest to next 16-bytes boundary
+ add edx, 10H
+ and edx, -10H
+
+ ; -(size of regular part)
+ sub edx, ecx
+ jnb M300 ; Jump if not negative
+
+align 16
+M200: ; Loop through regular part
+ ; ecx = end of regular part
+ ; edx = negative index from the end, counting up to zero
+ movdqa [ecx+edx], xmm0
+ add edx, 10H
+ jnz M200
+
+M300: ; Do the last irregular part
+ ; The size of this part is 1 - 16 bytes.
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the preceding regular part, than to make possibly mispredicted
+ ; branches depending on the size of the last part.
+%IFDEF POSITIONINDEPENDENT ; (ebx is pushed)
+ mov eax, [esp+4+4] ; dest
+ mov ecx, [esp+4+12] ; count
+%ELSE
+ mov eax, [esp+4] ; dest
+ mov ecx, [esp+12] ; count
+%ENDIF
+ movq qword [eax+ecx-10H], xmm0
+ movq qword [eax+ecx-8], xmm0
+ RETURNM
+
+M500: ; Use non-temporal moves, same code as above:
+ ; End of regular part:
+ ; Round down dest+count to nearest preceding 16-bytes boundary
+ lea ecx, [edx+ecx-1]
+ and ecx, -10H
+
+ ; Start of regular part:
+ ; Round up dest to next 16-bytes boundary
+ add edx, 10H
+ and edx, -10H
+
+ ; -(size of regular part)
+ sub edx, ecx
+ jnb M700 ; Jump if not negative
+
+align 16
+M600: ; Loop through regular part
+ ; ecx = end of regular part
+ ; edx = negative index from the end, counting up to zero
+ movntdq [ecx+edx], xmm0
+ add edx, 10H
+ jnz M600
+
+M700: ; Do the last irregular part (same as M300)
+%IFDEF POSITIONINDEPENDENT ; (ebx is pushed)
+ mov eax, [esp+4+4] ; dest
+ mov ecx, [esp+4+12] ; count
+%ELSE
+ mov eax, [esp+4] ; dest
+ mov ecx, [esp+12] ; count
+%ENDIF
+ movq qword [eax+ecx-10H], xmm0
+ movq qword [eax+ecx-8], xmm0
+ RETURNM
+
+
+
+_memset386: ; 80386 version
+%IFDEF POSITIONINDEPENDENT
+ push ebx
+ call get_thunk_ebx ; get reference point for position-independent code
+ add ebx, RP - $
+memset386@: ; local label
+ mov edx, [esp+4+4] ; dest
+ xor eax, eax
+ mov al, byte [esp+4+8] ; c
+ mov ecx, [esp+4+12] ; count
+%ELSE
+memset386@: ; local label
+ mov edx, [esp+4] ; dest
+ xor eax, eax
+ mov al, byte [esp+8] ; c
+ mov ecx, [esp+12] ; count
+%ENDIF
+ imul eax, 01010101H ; Broadcast c into all bytes of eax
+ push edi
+ mov edi, edx
+ cmp ecx, 4
+ jb N400
+N200: test edi, 3
+ jz N300
+ ; unaligned
+N210: mov [edi], al ; store 1 byte until edi aligned
+ inc edi
+ dec ecx
+ test edi, 3
+ jnz N210
+N300: ; aligned
+ mov edx, ecx
+ shr ecx, 2
+ cld
+ rep stosd ; store 4 bytes at a time
+ mov ecx, edx
+ and ecx, 3
+N400: rep stosb ; store any remaining bytes
+ pop edi
+ RETURNM
+
+
+; CPU dispatching for memset. This is executed only once
+memsetCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ pushad
+ call GetMemsetCacheLimit@ ; calculate cache limit
+ call _InstructionSet ; get supported instruction set
+ ; Point to generic version of memset
+ mov dword [memsetDispatch], memset386@
+ cmp eax, 4 ; check SSE2
+ jb Q100
+ ; SSE2 supported
+ ; Point to SSE2 version of memset
+ mov dword [memsetDispatch], memsetSSE2@
+ call _Store256BitIsFaster ; check if 256-bit stores are available and faster
+ test eax, eax
+ jz Q100
+ mov dword [memsetDispatch], memsetAVX@
+
+Q100: popad
+ ; Continue in appropriate version of memset
+ jmp dword [memsetDispatch]
+
+%ELSE ; Position-independent version
+ pushad
+ call GetMemsetCacheLimit@
+ call _InstructionSet
+
+ ; Point to generic version of memset
+        lea esi, [ebx+memset386@-RP]
+ cmp eax, 4 ; check SSE2
+ jb Q100
+ ; SSE2 supported
+ ; Point to SSE2 version of memset
+        lea esi, [ebx+memsetSSE2@-RP]
+ call _Store256BitIsFaster ; check if 256-bit stores are available and faster
+ test eax, eax
+ jz Q100
+        lea esi, [ebx+memsetAVX@-RP]
+Q100: mov [ebx+memsetDispatch-RP], esi
+ popad
+ ; Continue in appropriate version of memset
+ jmp [ebx+memsetDispatch-RP]
+
+get_thunk_ebx: ; load caller address into ebx for position-independent code
+ mov ebx, [esp]
+ ret
+%ENDIF
+
+
+; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
+_GetMemsetCacheLimit:
+GetMemsetCacheLimit@: ; local label
+ push ebx
+%ifdef POSITIONINDEPENDENT
+ call get_thunk_ebx
+ add ebx, _MemsetCacheLimit - $
+%else
+ mov ebx, _MemsetCacheLimit
+%endif
+ mov eax, [ebx]
+ test eax, eax
+ jnz U200
+ ; Get half the size of the largest level cache
+ push 0 ; 0 means largest level cache
+ call _DataCacheSize ; get cache size
+ pop ecx
+ shr eax, 1 ; half the size
+ jnz U100
+ mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
+U100: mov [ebx], eax
+U200: pop ebx
+ ret
+
+; extern "C" void SetMemsetCacheLimit(); // Change limit in GetMemsetCacheLimit
+_SetMemsetCacheLimit:
+ push ebx
+%ifdef POSITIONINDEPENDENT
+ call get_thunk_ebx
+ add ebx, _MemsetCacheLimit - $
+%else
+ mov ebx, _MemsetCacheLimit
+%endif
+ mov eax, [esp+8]
+ test eax, eax
+ jnz U400
+ ; zero, means default
+ mov [ebx], eax
+ call GetMemsetCacheLimit@
+U400:
+ mov [ebx], eax
+ pop ebx
+ ret
+
+
+SECTION .data
+align 16
+
+; Jump table for count from 0 to 16:
+MemsetJTab DD M00, M01, M02, M03, M04, M05, M06, M07
+ DD M08, M09, M10, M11, M12, M13, M14, M15, M16
+
+; Pointer to appropriate version.
+; This initially points to memsetCPUDispatch. memsetCPUDispatch will
+; change this to the appropriate version of memset, so that
+; memsetCPUDispatch is only executed once:
+memsetDispatch DD memsetCPUDispatch
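+
+; For illustration only: the self-patching dispatch pointer above in rough C++ terms
+; (hypothetical names). CPU detection runs once; afterwards every call goes straight
+; to the selected version:
+;
+;   typedef void * (*MemsetFn)(void *, int, size_t);
+;   static void * memsetFirstCall(void * d, int c, size_t n);    // forward declaration
+;   static MemsetFn memsetPtr = memsetFirstCall;                 // initial value
+;
+;   static void * memsetFirstCall(void * d, int c, size_t n) {
+;       memsetPtr = cpuHasSSE2() ? memsetSSE2 : memset386;       // pick once
+;       return memsetPtr(d, c, n);                               // then call it
+;   }
+;   // callers always use: memsetPtr(dest, c, count);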
+
+; Bypass cache by using non-temporal moves if count > MemsetCacheLimit
+; The optimal value of MemsetCacheLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache
+_MemsetCacheLimit: DD 0
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+ DD 0, 0
+%ENDIF
diff --git a/asmlibSrc/memset64.asm b/asmlibSrc/memset64.asm
new file mode 100755
index 0000000..6fb0490
--- /dev/null
+++ b/asmlibSrc/memset64.asm
@@ -0,0 +1,368 @@
+;************************* memset64.asm *************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2013-08-04
+; Description:
+; Faster version of the standard memset function:
+; void * A_memset(void * dest, int c, size_t count);
+; Sets 'count' bytes from 'dest' to the 8-bit value 'c'
+;
+; Overriding standard function memset:
+; The alias ?OVR_memset is changed to _memset in the object file if
+; it is desired to override the standard library function memset.
+;
+; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
+; extern "C" void SetMemsetCacheLimit(); // Change limit in GetMemsetCacheLimit
+;
+; Optimization:
+; Uses XMM registers to set 16 bytes at a time, aligned.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_memset: function ; Function memset
+global ?OVR_memset: function ; ?OVR removed if standard function memset overridden
+global memsetSSE2: function ; SSE2 version
+global memsetAVX: function ; version for CPUs with fast 256-bit store
+global GetMemsetCacheLimit: function ; Data blocks bigger than this will be stored uncached by memset
+global SetMemsetCacheLimit: function ; Change limit in GetMemsetCacheLimit
+
+; Imported from cachesize64.asm:
+extern DataCacheSize ; Get size of data cache
+
+; Imported from unalignedisfaster64.asm:
+extern Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
+
+; Define prolog for this function
+%MACRO PROLOGM 0
+%IFDEF WINDOWS
+%define Rdest rcx ; dest
+ movzx eax, dl ; c
+ mov rdx, r8 ; count
+%define Rcount rdx ; count
+%define Rdest2 r9 ; copy of dest
+%define Rcount2 r8 ; copy of count
+
+%ELSE ; Unix
+%define Rdest rdi ; dest
+ movzx eax, sil ; c
+%define Rcount rdx ; count
+%define Rdest2 rcx ; copy of dest
+%define Rcount2 rsi ; copy of count
+ mov Rcount2, Rcount ; copy count
+%ENDIF
+%ENDMACRO
+
+
+SECTION .text align=16
+
+; extern "C" void * memset(void * dest, int c, size_t count);
+; Function entry:
+A_memset:
+?OVR_memset:
+ jmp [memsetDispatch] ; CPU dispatch table
+
+memsetAVX: ; AVX version. Use ymm register
+memsetAVX@: ; local label
+ PROLOGM
+ imul eax, 01010101H ; Broadcast c into all bytes of eax
+ mov Rdest2, Rdest ; save dest
+ cmp Rcount, 16
+ ja B100
+B050: lea r10, [MemsetJTab] ; SSE2 version comes in here
+ jmp qword [r10+Rcount*8] ; jump table for small counts
+
+; Separate code for each count from 0 to 16:
+M16: mov [Rdest+12], eax
+M12: mov [Rdest+8], eax
+M08: mov [Rdest+4], eax
+M04: mov [Rdest], eax
+M00: mov rax, Rdest2 ; return dest
+ ret
+
+M15: mov [Rdest+11], eax
+M11: mov [Rdest+7], eax
+M07: mov [Rdest+3], eax
+M03: mov [Rdest+1], ax
+M01: mov [Rdest], al
+ mov rax, Rdest2 ; return dest
+ ret
+
+M14: mov [Rdest+10], eax
+M10: mov [Rdest+6], eax
+M06: mov [Rdest+2], eax
+M02: mov [Rdest], ax
+ mov rax, Rdest2 ; return dest
+ ret
+
+M13: mov [Rdest+9], eax
+M09: mov [Rdest+5], eax
+M05: mov [Rdest+1], eax
+ mov [Rdest], al
+ mov rax, Rdest2 ; return dest
+ ret
+
+B100: ; AVX version, Rcount > 16
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 ; Broadcast c into all bytes of xmm0
+
+ lea rax, [Rdest+Rcount] ; point to end
+
+ cmp Rcount, 20H
+ jbe K600 ; faster to use xmm registers if small
+
+ ; Store the first possibly unaligned 16 bytes
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the subsequent regular part, than to make possibly mispredicted
+ ; branches depending on the size of the first part.
+ movups oword [Rdest], xmm0
+
+ ; store another 16 bytes, aligned
+ add Rdest, 10H
+ and Rdest, -10H
+ movaps oword [Rdest], xmm0
+
+ ; go to next 32 bytes boundary
+ add Rdest, 10H
+ and Rdest, -20H
+
+ ; Check if count very big
+ cmp Rcount, [MemsetCacheLimit]
+ ja K300 ; Use non-temporal store if count > MemsetCacheLimit
+
+ ; find last 32 bytes boundary
+ mov Rcount, rax
+ and Rcount, -20H
+
+ ; - size of 32-bytes blocks
+ sub Rdest, Rcount
+ jnb K200 ; Jump if not negative
+
+ ; extend value to 256 bits
+ vinsertf128 ymm0,ymm0,xmm0,1
+
+align 16
+K100: ; Loop through 32-bytes blocks. Register use is swapped
+ ; Rcount = end of 32-bytes blocks part
+ ; Rdest = negative index from the end, counting up to zero
+ vmovaps [Rcount+Rdest], ymm0
+ add Rdest, 20H
+ jnz K100
+ vzeroupper
+
+K200: ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
+ movups [rax-20H], xmm0
+ movups [rax-10H], xmm0
+ mov rax, Rdest2 ; return dest
+ ret
+
+K300: ; Use non-temporal moves, same code as above:
+
+ ; find last 32 bytes boundary
+ mov Rcount, rax
+ and Rcount, -20H
+
+ ; - size of 32-bytes blocks
+ sub Rdest, Rcount
+ jnb K500 ; Jump if not negative
+
+ ; extend value to 256 bits
+ vinsertf128 ymm0,ymm0,xmm0,1
+
+align 16
+K400: ; Loop through 32-bytes blocks. Register use is swapped
+ ; Rcount = end of 32-bytes blocks part
+ ; Rdest = negative index from the end, counting up to zero
+ vmovntps [Rcount+Rdest], ymm0
+ add Rdest, 20H
+ jnz K400
+ vzeroupper
+
+K500: ; the last part from Rcount to rax is < 32 bytes. write last 32 bytes with overlap
+ movups [rax-20H], xmm0
+ movups [rax-10H], xmm0
+ mov rax, Rdest2 ; return dest
+ ret
+
+K600: ; 16 < count <= 32
+ movups [Rdest], xmm0
+ movups [rax-10H], xmm0
+ mov rax, Rdest2 ; return dest
+ ret
+
+
+memsetSSE2: ; count > 16. Use SSE2 instruction set
+memsetSSE2@: ; local label
+ PROLOGM
+ imul eax, 01010101H ; Broadcast c into all bytes of eax
+ mov Rdest2, Rdest ; save dest
+ cmp Rcount, 16
+ jna B050
+
+ movd xmm0, eax
+ pshufd xmm0, xmm0, 0 ; Broadcast c into all bytes of xmm0
+
+ ; Store the first unaligned part.
+ ; The size of this part is 1 - 16 bytes.
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the subsequent regular part, than to make possibly mispredicted
+ ; branches depending on the size of the first part.
+ movq qword [Rdest], xmm0
+ movq qword [Rdest+8], xmm0
+
+ ; Check if count very big
+M150: mov rax, [MemsetCacheLimit]
+ cmp Rcount, rax
+ ja M500 ; Use non-temporal store if count > MemsetCacheLimit
+
+ ; Point to end of regular part:
+ ; Round down dest+count to nearest preceding 16-bytes boundary
+ lea Rcount, [Rdest+Rcount-1]
+ and Rcount, -10H
+
+ ; Point to start of regular part:
+ ; Round up dest to next 16-bytes boundary
+ add Rdest, 10H
+ and Rdest, -10H
+
+ ; -(size of regular part)
+ sub Rdest, Rcount
+ jnb M300 ; Jump if not negative
+
+align 16
+M200: ; Loop through regular part
+ ; Rcount = end of regular part
+ ; Rdest = negative index from the end, counting up to zero
+ movdqa [Rcount+Rdest], xmm0
+ add Rdest, 10H
+ jnz M200
+
+M300: ; Do the last irregular part
+ ; The size of this part is 1 - 16 bytes.
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the preceding regular part, than to make possibly mispredicted
+ ; branches depending on the size of the last part.
+ mov rax, Rdest2 ; dest
+ movq qword [rax+Rcount2-10H], xmm0
+ movq qword [rax+Rcount2-8], xmm0
+ ret
+
+
+M500: ; Use non-temporal moves, same code as above:
+ ; End of regular part:
+ ; Round down dest+count to nearest preceding 16-bytes boundary
+ lea Rcount, [Rdest+Rcount-1]
+ and Rcount, -10H
+
+ ; Start of regular part:
+ ; Round up dest to next 16-bytes boundary
+ add Rdest, 10H
+ and Rdest, -10H
+
+ ; -(size of regular part)
+ sub Rdest, Rcount
+ jnb M700 ; Jump if not negative
+
+align 16
+M600: ; Loop through regular part
+ ; Rcount = end of regular part
+ ; Rdest = negative index from the end, counting up to zero
+ movntdq [Rcount+Rdest], xmm0
+ add Rdest, 10H
+ jnz M600
+
+M700: ; Do the last irregular part
+ ; The size of this part is 1 - 16 bytes.
+ ; It is faster to always write 16 bytes, possibly overlapping
+ ; with the preceding regular part, than to make possibly mispredicted
+ ; branches depending on the size of the last part.
+ mov rax, Rdest2 ; dest
+ movq qword [rax+Rcount2-10H], xmm0
+ movq qword [rax+Rcount2-8], xmm0
+ ret
+
+
+memsetCPUDispatch: ; CPU dispatcher, check for instruction sets and which method is fastest
+ ; This part is executed only once
+ push rbx
+ push rcx
+ push rdx
+ push rsi
+ push rdi
+ push r8
+ ; set CacheBypassLimit to half the size of the largest level cache
+ call GetMemsetCacheLimit@
+ lea rbx, [memsetSSE2@]
+ call Store256BitIsFaster ; Test if 256-bit read/write is available and faster than 128-bit read/write
+ test eax, eax
+ jz Q100
+ lea rbx, [memsetAVX@]
+Q100:
+ ; Insert appropriate pointer
+ mov [memsetDispatch], rbx
+ mov rax, rbx
+ pop r8
+ pop rdi
+ pop rsi
+ pop rdx
+ pop rcx
+ pop rbx
+ ; Jump according to the replaced function pointer
+ jmp rax
+
+
+; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
+GetMemsetCacheLimit:
+GetMemsetCacheLimit@:
+ mov rax, [MemsetCacheLimit]
+ test rax, rax
+ jnz U200
+ ; Get half the size of the largest level cache
+%ifdef WINDOWS
+ xor ecx, ecx ; 0 means largest level cache
+%else
+ xor edi, edi ; 0 means largest level cache
+%endif
+ call DataCacheSize ; get cache size
+ shr eax, 1 ; half the size
+ jnz U100
+ mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
+U100: mov [MemsetCacheLimit], eax
+U200: ret
+
+; extern "C" void SetMemsetCacheLimit(); // Change limit in GetMemsetCacheLimit
+SetMemsetCacheLimit:
+%ifdef WINDOWS
+ mov rax, rcx
+%else
+ mov rax, rdi
+%endif
+ test rax, rax
+ jnz U400
+ ; zero, means default
+ mov [MemsetCacheLimit], rax
+ call GetMemsetCacheLimit@
+U400: mov [MemsetCacheLimit], rax
+ ret
+
+
+SECTION .data
+align 16
+; Jump table for count from 0 to 16:
+MemsetJTab:DQ M00, M01, M02, M03, M04, M05, M06, M07
+ DQ M08, M09, M10, M11, M12, M13, M14, M15, M16
+
+; Pointer to appropriate version.
+; This initially points to memsetCPUDispatch. memsetCPUDispatch will
+; change this to the appropriate version of memset, so that
+; memsetCPUDispatch is only executed once:
+memsetDispatch: DQ memsetCPUDispatch
+
+; Bypass cache by using non-temporal moves if count > MemsetCacheLimit
+; The optimal value of MemsetCacheLimit is difficult to estimate, but
+; a reasonable value is half the size of the largest cache
+MemsetCacheLimit: DQ 0
diff --git a/asmlibSrc/mersenne32.asm b/asmlibSrc/mersenne32.asm
new file mode 100755
index 0000000..ed1a100
--- /dev/null
+++ b/asmlibSrc/mersenne32.asm
@@ -0,0 +1,821 @@
+; ----------------------------- MERSENNE32.ASM ---------------------------
+; Author: Agner Fog
+; Date created: 1998
+; Last modified: 2013-09-13
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 32 bit
+; Description:
+; Random Number generator 'Mersenne Twister' type MT11213A (or MT19937)
+;
+; This random number generator is described in the article by
+; M. Matsumoto & T. Nishimura, in:
+; ACM Transactions on Modeling and Computer Simulation,
+; vol. 8, no. 1, 1998, pp. 3-30. See also:
+; http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+;
+; Initialization:
+; MersRandomInit must be called before the first call to any of the other
+; random number functions. The seed is any 32-bit integer.
+; You may use MersRandomInitByArray instead if you want more
+; than 32 bits for seed. length is the number of integers in seeds[].
+; length must be > 0, there is no upper limit for length.
+;
+; Generating random numbers:
+; MersRandom returns a floating point number in the interval 0 <= x < 1 with
+; a resolution of 32 bits.
+; MersIRandom returns an integer in the interval defined by min and max with
+; a resolution of 32 bits.
+; MersIRandomX returns an integer in the interval defined by min and max with
+; exactly equal probabilities of all values in the interval.
+; MersBRandom returns 32 random bits.
+;
+; Error conditions:
+; If MersRandomInit or MersRandomInitByArray has not been called then MersRandom
+; and MersBRandom keep returning 0, and MersIRandom and MersIRandomX return min.
+; MersIRandom and MersIRandomX return a large negative number if max < min.
+;
+; C++ prototypes in randoma.h, 32-bit Windows:
+;
+; Thread-safe static link versions for Mersenne Twister
+; extern "C" void MersRandomInit(void * Pthis, int seed); // Re-seed
+; extern "C" void MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length); // Seed by more than 32 bits
+; extern "C" int MersIRandom (void * Pthis, int min, int max); // Output random integer
+; extern "C" int MersIRandomX(void * Pthis, int min, int max); // Output random integer, exact
+; extern "C" double MersRandom(void * Pthis); // Output random float
+; extern "C" unsigned int MersBRandom(void * Pthis); // Output random bits
+;
+; Single-threaded static link versions for Mersenne Twister, Windows only
+; extern "C" void MersenneRandomInit(int seed); // Re-seed
+; extern "C" void MersenneRandomInitByArray(unsigned int seeds[], int length); // Seed by more than 32 bits
+; extern "C" int MersenneIRandom (int min, int max); // Output random integer
+; extern "C" int MersenneIRandomX(int min, int max); // Output random integer, exact
+; extern "C" double MersenneRandom(); // Output random float
+; extern "C" unsigned int MersenneBRandom(); // Output random bits
+;
+; Single threaded dynamic link versions for Mersenne Twister, Windows only
+; extern "C" void __stdcall MersenneRandomInitD(int seed); // Re-seed
+; extern "C" void __stdcall MersenneRandomInitByArrayD(unsigned int seeds[], int length); // Seed by more than 32 bits
+; extern "C" int __stdcall MersenneIRandomD (int min, int max); // Output random integer
+; extern "C" int __stdcall MersenneIRandomXD(int min, int max); // Output random integer, exact
+; extern "C" double __stdcall MersenneRandomD(); // Output random float
+; extern "C" unsigned int __stdcall MersenneBRandomD(); // Output random bits
+;
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
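+
+; For illustration only: a minimal C++ usage sketch of the single-threaded static link
+; functions declared above (prototypes as given; randoma.h assumed to declare them):
+;
+;   #include "randoma.h"
+;   MersenneRandomInit(1234);                            // seed with a 32-bit integer
+;   int die           = MersenneIRandomX(1, 6);          // integer in [1,6], exactly uniform
+;   double u          = MersenneRandom();                // floating point in [0,1)
+;   unsigned int bits = MersenneBRandom();               // 32 random bits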
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+global _MersRandomInit, _MersRandomInitByArray
+global _MersBRandom, _MersRandom, _MersIRandom, _MersIRandomX
+global _MersenneRandomInitByArray,_MersenneRandomInit
+global _MersenneRandom, _MersenneIRandom, _MersenneIRandomX, _MersenneBRandom
+%IFDEF WINDOWS
+global _MersenneRandomInitByArrayD@8, _MersenneRandomInitD@4
+global _MersenneRandomD@0, _MersenneIRandomD@8, _MersenneIRandomXD@8, _MersenneBRandomD@0
+%ENDIF
+
+
+SECTION .data
+align 16
+; Data for single instance of random number generator
+MersenneInstance:
+ISTRUC CRandomMersenneA
+IEND
+; Size of structure
+MersenneSize equ $ - MersenneInstance
+
+
+SECTION .CODE ALIGN=16
+
+extern _InstructionSet
+
+
+; ---------------------------------------------------------------
+; Thread-safe static link versions for Mersenne Twister
+; ---------------------------------------------------------------
+
+; extern "C" void MersRandomInit(void * Pthis, int seed); // Re-seed
+
+_MersRandomInit: ; PROC NEAR
+ mov ecx, [esp+4] ; Pthis
+ mov eax, [esp+8] ; seed
+ and ecx, -16 ; align buffer
+
+MersRandomInit_reg: ; Entry for register parameters, used internally
+ call Mers_init0 ; initialize mt buffer with seeds
+
+ ; Number of premade numbers that are lost in the initialization when the
+ ; SSE2 implementation makes up to 4 premade numbers at a time:
+%IF MERS_N & 3
+ PREMADELOST equ (MERS_N & 3)
+%ELSE
+ PREMADELOST equ 4
+%ENDIF
+ ; We want the C++ and the assembly implementation to give exactly the same
+ ; sequence. The C++ version discards 37 random numbers after initialization.
+ ; The assembly version generates a sequence that is PREMADELOST + 1 numbers
+ ; behind. Therefore we discard the first 37 + PREMADELOST + 1 numbers if
+ ; SSE2 is supported, otherwise 37 + 1.
+
+ push edi
+ mov edi, 37+PREMADELOST+1
+ cmp dword [ecx+CRandomMersenneA.Instset], 4 ; can we use XMM registers and SSE2 ?
+ jae M110
+ sub edi, PREMADELOST ; SSE2 not supported
+ mov dword [ecx+CRandomMersenneA.PreInx], 0 ; reset index to premade list
+M110: ; loop
+M120: call MersBRandom_reg
+ dec edi
+ jnz M120
+ pop edi
+ ret
+;_MersRandomInit ENDP
+
+
+Mers_init0: ; make random seeds from eax and put them into MT buffer
+; Input parameters:
+; eax: seed
+; ecx points to CRandomMersenneA
+
+ push ebx
+ push edi
+ mov ebx, eax ; seed
+
+ ; clear my buffer
+ push ecx
+ mov edi, ecx ; Pthis
+ add edi, 16 ; skip alignment filler
+ mov ecx, (MersenneSize - 16) / 4
+ xor eax, eax
+ cld
+ rep stosd
+ pop ecx ; Pthis
+
+ ; initialize CRandomMersenneA structure
+ mov dword [ecx+CRandomMersenneA.PreInx], 4*4
+ push ecx
+ call _InstructionSet ; detect instruction set
+ pop ecx
+ mov [ecx+CRandomMersenneA.Instset], eax
+ mov eax, MERS_B
+ mov [ecx+CRandomMersenneA.TMB], eax
+ mov [ecx+CRandomMersenneA.TMB+4], eax
+ mov [ecx+CRandomMersenneA.TMB+8], eax
+ mov [ecx+CRandomMersenneA.TMB+12], eax
+ mov eax, MERS_C
+ mov [ecx+CRandomMersenneA.TMC], eax
+ mov [ecx+CRandomMersenneA.TMC+4], eax
+ mov [ecx+CRandomMersenneA.TMC+8], eax
+ mov [ecx+CRandomMersenneA.TMC+12], eax
+ mov eax, 3FF00000H ; upper dword of 1.0, double precision
+ mov dword [ecx+CRandomMersenneA.one+4], eax
+ mov dword [ecx+CRandomMersenneA.one+12], eax
+ mov dword [ecx+CRandomMersenneA.LMASK], LOWER_MASK
+ mov dword [ecx+CRandomMersenneA.UMASK], UPPER_MASK
+ mov dword [ecx+CRandomMersenneA.MATA], MERS_A
+
+ ; put random numbers into MT buffer
+ xor edi, edi
+M210: mov [ecx+edi*4+CRandomMersenneA.MT], ebx
+ mov edx, ebx
+ shr ebx, 30
+ xor ebx, edx
+ imul ebx, 1812433253
+ inc edi
+ add ebx, edi
+ cmp edi, MERS_N
+ jb M210
+
+ ; Set index MTI to end of list, (scaled by 4)
+ ; Round up to multiple of 4 to avoid alignment error
+ mov dword [ecx+CRandomMersenneA.MTI], ((MERS_N+3) & -4) * 4
+
+ pop edi
+ pop ebx
+ ret
+;Mers_init0 ENDP
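+
+; For illustration only: the M210 loop above implements the standard Mersenne Twister
+; seeding recurrence, which in C++ terms is:
+;
+;   mt[0] = seed;
+;   for (uint32_t i = 1; i < MERS_N; i++)
+;       mt[i] = 1812433253u * (mt[i-1] ^ (mt[i-1] >> 30)) + i;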
+
+
+; extern "C" void MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length); // Seed by more than 32 bits
+_MersRandomInitByArray: ; PROC NEAR
+
+ push ebx
+ push esi
+ push edi
+ push ebp
+ mov ecx, [esp+20] ; Pthis
+ mov ebx, [esp+24] ; seeds
+ mov ebp, [esp+28] ; length
+ and ecx, -16 ; align buffer
+
+MersRandomInitByArray_reg: ; Entry for register parameters, used internally
+
+ push ebp ; save length
+ mov eax, 19650218
+ call Mers_init0 ; init0(19650218);
+
+ test ebp, ebp
+ jle M380 ; error: length <= 0
+ xor edi, edi ; j = 0
+ lea esi, [edi+1] ; i = 1
+ cmp ebp, MERS_N
+ ja M310
+ mov ebp, MERS_N ; k = max (MERS_N,length)
+M310:
+
+ ; for (; k; k--) {
+M320: mov eax, [ecx+esi*4-4+CRandomMersenneA.MT] ; mt[i-1]
+ mov edx, eax
+ shr eax, 30
+ xor eax, edx ; mt[i-1] ^ (mt[i-1] >> 30)
+ imul eax, 1664525 ; * 1664525
+ xor eax, [ecx+esi*4+CRandomMersenneA.MT] ; ^ mt[i]
+ add eax, [ebx+edi*4] ; + seeds[j]
+ add eax, edi ; + j
+ mov [ecx+esi*4+CRandomMersenneA.MT], eax ; save in mt[i]
+ inc esi ; i++
+ inc edi ; j++
+ cmp esi, MERS_N
+ jb M330 ; if (i>=MERS_N)
+ mov eax, [ecx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
+ mov [ecx+CRandomMersenneA.MT], eax
+ mov esi, 1 ; i=1;
+M330:
+ cmp edi, [esp] ; length
+ jb M340 ; if (j>=length)
+ xor edi, edi ; j = 0;
+M340:
+ dec ebp ; k--
+ jnz M320 ; first k loop
+M350:
+ mov ebp, MERS_N-1 ; k
+M360: mov eax, [ecx+esi*4-4+CRandomMersenneA.MT] ; mt[i-1]
+ mov edx, eax
+ shr eax, 30
+ xor eax, edx ; mt[i-1] ^ (mt[i-1] >> 30)
+ imul eax, 1566083941 ; * 1566083941
+ xor eax, [ecx+esi*4+CRandomMersenneA.MT] ; ^ mt[i]
+ sub eax, esi ; - i
+ mov [ecx+esi*4+CRandomMersenneA.MT], eax ; save in mt[i]
+ inc esi ; i++
+ cmp esi, MERS_N
+ jb M370 ; if (i>=MERS_N)
+ mov eax, [ecx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
+ mov [ecx+CRandomMersenneA.MT], eax
+ mov esi, 1 ; i=1;
+M370:
+ dec ebp ; k--
+ jnz M360 ; second k loop
+ mov dword [ecx+CRandomMersenneA.MT], 80000000H ; mt[0] = 0x80000000
+M380:
+ mov dword [ecx+CRandomMersenneA.MTI], 0
+ mov dword [ecx+CRandomMersenneA.PreInx], 0
+
+; discard first MERS_N random numbers + PREMADELOST+1 to compensate for lag
+ mov edi, MERS_N + PREMADELOST+1
+ CMP dword [ecx+CRandomMersenneA.Instset], 4 ; can we use XMM registers and SSE2 ?
+ jae M390
+ sub edi, PREMADELOST ; SSE2 not supported
+ mov dword [ecx+CRandomMersenneA.PreInx], 0 ; reset index to premade list
+M390: ; loop
+M391: call MersBRandom_reg
+ dec edi
+ jnz M391
+
+ pop ecx ; remove local copy of length
+ pop ebp ; restore registers
+ pop edi
+ pop esi
+ pop ebx
+ ret
+;_MersRandomInitByArray ENDP
+
+; extern "C" unsigned int MersBRandom(void * Pthis); // Output random bits
+
+_MersBRandom: ; PROC NEAR ; generate random bits
+ mov ecx, [esp+4] ; Pthis
+ and ecx, -16 ; align buffer
+
+MersBRandom_reg: ; Entry for register parameters, used internally
+
+ cmp dword [ecx+CRandomMersenneA.Instset], 4 ; can we use XMM registers and SSE2 ?
+ jb M500
+
+ ; this version uses XMM registers and SSE2 instructions:
+ mov edx, [ecx+CRandomMersenneA.PreInx] ; index into premade numbers
+ mov eax, [ecx+edx*1+CRandomMersenneA.PreInt] ; fetch premade random number
+ add edx, 4
+ mov [ecx+CRandomMersenneA.PreInx], edx
+ cmp edx, 4*4
+ jnb M410
+ ret ; return premade number
+
+M410:
+; PREMADE list is empty. Make 4 more numbers ready for next call:
+ mov edx, [ecx+CRandomMersenneA.MTI] ; fetch 4 numbers from MT buffer
+ movdqa xmm0, oword [ecx+edx*1+CRandomMersenneA.MT]
+
+%IF TEMPERING ; optional tempering algorithm
+ movdqa xmm1, xmm0
+ psrld xmm0, MERS_U
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ pslld xmm0, MERS_S
+ pand xmm0, oword [ecx+CRandomMersenneA.TMB]
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ pslld xmm0, MERS_T
+ pand xmm0, oword [ecx+CRandomMersenneA.TMC]
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ psrld xmm0, MERS_L
+ pxor xmm0, xmm1
+%ENDIF ; tempering
+
+ ; save four premade integers
+ movdqa oword [ecx+CRandomMersenneA.PreInt], xmm0
+ ; premake four floating point numbers
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ punpckldq xmm1, xmm0 ; get first two numbers into bits 32-63 and 96-127
+ punpckhdq xmm2, xmm0 ; get next two numbers into bits 32-63 and 96-127
+ psrlq xmm1, 12 ; get bits into mantissa position
+ psrlq xmm2, 12 ; get bits into mantissa position
+ por xmm1, oword [ecx+CRandomMersenneA.one] ; set exponent for interval [1,2)
+ por xmm2, oword [ecx+CRandomMersenneA.one] ; set exponent for interval [1,2)
+ movdqa oword [ecx+CRandomMersenneA.PreFlt], xmm1 ; store two premade numbers
+ movdqa oword [ecx+CRandomMersenneA.PreFlt+16],xmm2; store two more premade numbers
+ mov dword [ecx+CRandomMersenneA.PreInx], 0 ; index to premade numbers
+ add edx, 4*4 ; increment MTI index into MT buffer by 4
+ mov [ecx+CRandomMersenneA.MTI], edx
+ cmp edx, MERS_N*4
+ jae M420
+ ret ; return random number in eax
+
+; MT buffer exhausted. Make MERS_N new numbers ready for next time
+M420: ; eax is the random number to return
+%IF MERS_N & 3 ; if MERS_N is not divisible by 4
+ NVALID = MERS_N & 3 ; only NVALID of the 4 premade numbers are valid
+ ; Move premade numbers (4-NVALID) positions forward
+ movdqa xmm0, [ecx+CRandomMersenneA.PreInt]
+ movdqa xmm1, [ecx+CRandomMersenneA.PreFlt]
+ movdqa xmm2, [ecx+CRandomMersenneA.PreFlt+16]
+ movdqu [ecx+CRandomMersenneA.PreInt + (4-NVALID)*4], xmm0
+ movdqu [ecx+CRandomMersenneA.PreFlt + (4-NVALID)*8], xmm1
+%IF NVALID == 3
+ movq [ecx+CRandomMersenneA.PreFlt+16 + 8], xmm2
+%ENDIF
+ ; save index to first valid premade number
+ mov [ecx+CRandomMersenneA.PreInx], (4-NVALID)*4
+%ENDIF
+
+ ; MT buffer is empty. Fill it up
+ push ebx
+ movd xmm3, [ecx+CRandomMersenneA.UMASK] ; load constants
+ movd xmm4, [ecx+CRandomMersenneA.LMASK]
+ movd xmm5, [ecx+CRandomMersenneA.MATA]
+ pshufd xmm3, xmm3, 0 ; broadcast constants
+ pshufd xmm4, xmm4, 0
+ pshufd xmm5, xmm5, 0
+ xor ebx, ebx ; kk = 0
+ mov edx, MERS_M*4 ; km
+
+; change ecx from pointing to CRandomMersenneA to pointing to CRandomMersenneA.MT
+ add ecx, CRandomMersenneA.MT
+
+M430: ; kk loop
+ movdqa xmm2, oword [ecx+ebx] ; mt[kk]
+ movd xmm6, [ecx+ebx+16]
+ movdqa xmm1, oword [ecx+ebx] ; mt[kk]
+ movss xmm2, xmm6 ; faster than movdqu xmm2, [ebx+4] ?
+ pshufd xmm2, xmm2, 00111001B ; mt[kk+1]
+ movdqu xmm0, oword [ecx+edx] ; mt[km]
+ ;movq xmm0, qword [ecx+edx] ; mt[km]
+ ;movhps xmm0, qword [ecx+edx+8] ; this is faster than movdqu on older processors
+ pand xmm1, xmm3 ; mt[kk] & UPPER_MASK
+ pand xmm2, xmm4 ; mt[kk+1] & LOWER_MASK
+ por xmm1, xmm2 ; y
+ movdqa xmm2, xmm1 ; y
+ pslld xmm1, 31 ; copy bit 0 into all bits
+ psrad xmm1, 31 ; -(y & 1)
+ pand xmm1, xmm5 ; & MERS_A
+ psrld xmm2, 1 ; y >> 1
+ pxor xmm0, xmm1
+ pxor xmm0, xmm2
+ movdqa oword [ecx+ebx], xmm0 ; result into mt[kk]
+ cmp ebx, (MERS_N-4)*4
+ jae M440 ; exit loop when kk past end of buffer
+ add ebx, 16 ; kk += 4
+ add edx, 16 ; km += 4
+ cmp edx, (MERS_N-4)*4
+ jbe M430 ; skip unless km wraparound
+ sub edx, MERS_N*4 ; km wraparound
+ movdqu xmm0, oword [ecx+(MERS_N-4)*4] ; copy end to before begin for km wraparound
+ movdqa oword [ecx-4*4], xmm0
+ movdqa xmm0, oword [ecx] ; copy begin to after end for kk wraparound
+ movdqu oword [ecx+MERS_N*4], xmm0
+ jmp M430
+
+M440: ; loop finished. discard excess part of last result
+
+; change ecx back to pointing to CRandomMersenneA
+ sub ecx, CRandomMersenneA.MT
+ mov dword [ecx+CRandomMersenneA.MTI], 0
+ pop ebx
+ ret ; random number is still in eax
+
+; Generic version
+; this version is for old processors without XMM support:
+M500:
+ mov edx, [ecx+CRandomMersenneA.MTI]
+ cmp edx, MERS_N*4
+ jnb short M520 ; buffer is empty, fill it
+M510: mov eax, [ecx+edx*1+CRandomMersenneA.MT]
+ add edx, 4
+ mov [ecx+CRandomMersenneA.MTI], edx
+
+%IF TEMPERING
+ mov edx, eax
+ shr eax, MERS_U
+ xor eax, edx
+ mov edx, eax
+ shl eax, MERS_S
+ and eax, MERS_B
+ xor eax, edx
+ mov edx, eax
+ shl eax, MERS_T
+ and eax, MERS_C
+ xor eax, edx
+ mov edx, eax
+ shr eax, MERS_L
+ xor eax, edx
+%ENDIF ; tempering
+
+ mov edx, [ecx+CRandomMersenneA.PreInt] ; previously premade number
+ mov [ecx+CRandomMersenneA.PreInt], eax ; store number for next call
+ shl eax, 20 ; convert to float
+ mov dword [ecx+CRandomMersenneA.PreFlt], eax
+ mov eax, [ecx+CRandomMersenneA.PreInt]
+ shr eax, 12
+ or eax, 3FF00000H
+ mov dword [ecx+CRandomMersenneA.PreFlt+4], eax
+ mov eax, edx ; return value is premade integer
+ ret
+
+ ; fill buffer with random numbers
+M520: push ebx
+ push esi
+ xor esi, esi ; kk
+ mov ebx, MERS_M*4 ; km
+; change ecx from pointing to CRandomMersenneA to pointing to CRandomMersenneA.MT
+ add ecx, CRandomMersenneA.MT
+
+ ; kk loop
+M530: mov eax, [ecx+esi]
+ mov edx, [ecx+esi+4]
+ and eax, UPPER_MASK
+ and edx, LOWER_MASK
+ or eax, edx
+ shr eax, 1
+ sbb edx, edx
+ and edx, MERS_A
+ xor eax, edx
+ xor eax, [ecx+ebx]
+ mov [ecx+esi], eax
+ add ebx, 4
+ cmp ebx, MERS_N*4
+ jb short M540
+ ; copy begin of table to after end to simplify kk+1 wraparound
+ mov eax, [ecx]
+ mov [ecx+ebx], eax
+ xor ebx, ebx
+M540: add esi, 4
+ cmp esi, MERS_N*4
+ jb M530 ; loop end
+
+; change ecx back to pointing to CRandomMersenneA
+ sub ecx, CRandomMersenneA.MT
+ xor edx, edx
+ mov [ecx+CRandomMersenneA.MTI], edx
+ pop esi
+ pop ebx
+ jmp M510
+
+;_MersBRandom ENDP
+
+; extern "C" double MersRandom(void * Pthis); // Output random float
+
+_MersRandom:; PROC NEAR ; generate random float with 32 bits resolution
+ mov ecx, [esp+4] ; Pthis
+ and ecx, -16 ; align buffer
+ mov edx, [ecx+CRandomMersenneA.PreInx] ; index into premade numbers
+ fld qword [ecx+edx*2+CRandomMersenneA.PreFlt] ; fetch premade floating point random number
+ fsub qword [ecx+CRandomMersenneA.one] ; subtract 1.0
+ jmp MersBRandom_reg ; random bits
+;_MersRandom ENDP
+
+
+; extern "C" int MersIRandom (void * Pthis, int min, int max); // Output random integer
+
+_MersIRandom: ; PROC NEAR
+ mov ecx, [esp+4] ; Pthis
+ and ecx, -16 ; align buffer
+ call MersBRandom_reg ; random bits
+ mov edx, [esp+12] ; max
+ mov ecx, [esp+8] ; min
+ sub edx, ecx
+ js short M720 ; max < min
+ add edx, 1 ; max - min + 1
+ mul edx ; multiply random number by interval and truncate
+ lea eax, [edx+ecx] ; add min
+ ret
+M720: mov eax, 80000000H ; error exit
+ ret
+;_MersIRandom ENDP
+
+
+; extern "C" int MersIRandomX (void * Pthis, int min, int max); // Output random integer
+
+_MersIRandomX: ; PROC NEAR
+ push edi
+ mov ecx, [esp+8] ; Pthis
+ mov edx, [esp+12] ; min
+ mov edi, [esp+16] ; max
+ and ecx, -16 ; align buffer
+ sub edi, edx ; max - min
+ jle short M830 ; max <= min (signed)
+ inc edi ; interval = max - min + 1
+
+ ; if (interval != LastInterval) {
+ cmp edi, [ecx+CRandomMersenneA.LastInterval]
+ je M810
+ ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
+ xor eax, eax ; 0
+ lea edx, [eax+1] ; 1
+ div edi ; (would give overflow if interval = 1)
+ mul edi
+ dec eax
+ mov [ecx+CRandomMersenneA.RLimit], eax
+ mov [ecx+CRandomMersenneA.LastInterval], edi
+M810:
+M820: ; do { // Rejection loop
+ call MersBRandom_reg ; random bits (ecx is preserved)
+ ; longran = (uint64)BRandom() * interval;
+ mul edi
+ ; } while (remainder > RLimit);
+ cmp eax, [ecx+CRandomMersenneA.RLimit]
+ ja M820
+
+ ; return (int32)iran + min
+ mov eax, [esp+12] ; min
+ add eax, edx
+ pop edi
+ ret
+
+M830: jl M840
+ ; max = min. Return min
+ mov eax, edx
+ pop edi
+ ret ; max = min exit
+
+M840: ; max < min: error
+ mov eax, 80000000H ; error exit
+ pop edi
+ ret
+;_MersIRandomX ENDP
+
+
+; -------------------------------------------------------------------------
+; Single-threaded static link versions of Mersenne Twister
+; -------------------------------------------------------------------------
+
+%IFDEF POSITIONINDEPENDENT
+; Get ecx = eip for self-relative addressing
+GetThunkECX:
+ mov ecx, [esp]
+ ret
+
+; Get address of MersenneInstance into ecx, position independent
+; This works only in YASM, not in NASM:
+%macro GetMersenneInstanceAddress 0
+ call GetThunkECX
+ add ecx, MersenneInstance - $
+%endmacro
+
+%ELSE
+
+; Get address of MersenneInstance into ecx, position dependent
+; This works only in YASM, not in NASM:
+%macro GetMersenneInstanceAddress 0
+ mov ecx, MersenneInstance
+%endmacro
+
+%ENDIF
+
+; extern "C" void MersenneRandomInitByArray(unsigned int seeds[], int length); // Seed by more than 32 bits
+_MersenneRandomInitByArray: ; PROC NEAR
+ push ebx
+ push esi
+ push edi
+ push ebp
+ mov ebx, [esp+20] ; seeds
+ mov ebp, [esp+24] ; length
+ GetMersenneInstanceAddress ; Macro different for position-dependent and -independent version
+ jmp MersRandomInitByArray_reg ; jump to function in mersenne32.asm
+;_MersenneRandomInitByArray ENDP
+
+
+; extern "C" void MersenneRandomInit(int seed); // Re-seed
+_MersenneRandomInit: ; PROC NEAR
+ mov eax, [esp+4] ; seed
+ GetMersenneInstanceAddress
+ jmp MersRandomInit_reg ; jump to function in mersenne32.asm
+;_MersenneRandomInit ENDP
+
+
+; extern "C" double MersenneRandom(); // Output random float
+_MersenneRandom: ; PROC NEAR ; generate random float with 32 bits resolution
+ GetMersenneInstanceAddress
+ mov edx, [ecx+CRandomMersenneA.PreInx] ; index into premade numbers
+ fld qword [ecx+edx*2+CRandomMersenneA.PreFlt] ; fetch premade floating point random number
+ fsub qword [ecx+CRandomMersenneA.one] ; subtract 1.0
+ jmp MersBRandom_reg ; random bits
+;_MersenneRandom ENDP
+
+
+; extern "C" int MersenneIRandom (int min, int max); // Output random integer
+_MersenneIRandom: ; PROC NEAR
+ GetMersenneInstanceAddress
+ call MersBRandom_reg ; random bits
+ mov edx, [esp+8] ; max
+ mov ecx, [esp+4] ; min
+ sub edx, ecx
+ js short S410 ; max < min
+ add edx, 1 ; max - min + 1
+ mul edx ; multiply random number by interval and truncate
+ lea eax, [edx+ecx] ; add min
+ ret
+S410: mov eax, 80000000H ; error exit
+ ret
+;_MersenneIRandom ENDP
+
+
+; extern "C" int MersenneIRandomX(int min, int max); // Output random integer, exact
+
+_MersenneIRandomX: ; PROC NEAR
+ push edi
+ GetMersenneInstanceAddress
+ mov edx, [esp+8] ; min
+ mov edi, [esp+12] ; max
+ sub edi, edx ; max - min
+ jle short S530 ; max <= min (signed)
+ inc edi ; interval = max - min + 1
+ cmp edi, [ecx+CRandomMersenneA.LastInterval]
+ je S510
+ xor eax, eax ; 0
+ lea edx, [eax+1] ; 1
+ div edi ; (would give overflow if interval = 1)
+ mul edi
+ dec eax
+ mov [ecx+CRandomMersenneA.RLimit], eax
+ mov [ecx+CRandomMersenneA.LastInterval], edi
+S510:
+S520: call MersBRandom_reg ; random bits (ecx is preserved)
+ mul edi
+ cmp eax, [ecx+CRandomMersenneA.RLimit]
+ ja S520
+ mov eax, [esp+8] ; min
+ add eax, edx
+ pop edi
+ ret
+
+S530: jl S540
+ ; max = min. Return min
+ mov eax, edx
+ pop edi
+ ret ; max = min exit
+
+S540: ; max < min: error
+ mov eax, 80000000H ; error exit
+ pop edi
+ ret
+;_MersenneIRandomX ENDP
+
+
+; extern "C" unsigned int MersenneBRandom(); // Output random bits
+_MersenneBRandom: ; PROC NEAR ; generate 32 random bits
+ GetMersenneInstanceAddress
+ jmp MersBRandom_reg ; random bits
+;_MersenneBRandom ENDP
+
+
+; -----------------------------------------------------------------
+; Single-threaded DLL versions for Mersenne Twister, Windows only
+; -----------------------------------------------------------------
+%IFDEF WINDOWS
+
+; extern "C" void __stdcall MersenneRandomInitByArrayD(unsigned int seeds[], int length); // Seed by more than 32 bits
+_MersenneRandomInitByArrayD@8: ; PROC NEAR
+ ; translate __cdecl to __stdcall calling
+ mov eax, [esp+4] ; seeds
+ mov edx, [esp+8] ; length
+ push edx
+ push eax
+ call _MersenneRandomInitByArray
+ pop ecx
+ pop ecx
+ ret 8
+;_MersenneRandomInitByArrayD@8 ENDP
+
+
+; extern "C" void __stdcall MersenneRandomInitD(int seed); // Re-seed
+_MersenneRandomInitD@4: ; PROC NEAR
+ ; remove parameter from stack
+ pop edx ; return address
+ pop eax ; seed
+ push edx ; put return address back in
+ mov ecx, MersenneInstance
+ ; eax = seed, ecx = Pthis
+ jmp MersRandomInit_reg ; jump to function in mersenne32.asm
+;_MersenneRandomInitD@4 ENDP
+
+
+; extern "C" double __stdcall MersenneRandomD(); // Output random float
+_MersenneRandomD@0: ; PROC NEAR ; generate random float with 32 bits resolution
+ mov ecx, MersenneInstance
+ mov edx, [ecx+CRandomMersenneA.PreInx] ; index into premade numbers
+ fld qword [ecx+edx*2+CRandomMersenneA.PreFlt] ; fetch premade floating point random number
+ fsub qword [ecx+CRandomMersenneA.one] ; subtract 1.0
+ jmp MersBRandom_reg ; random bits
+;_MersenneRandomD@0 ENDP
+
+
+; extern "C" int __stdcall MersenneIRandomD (int min, int max); // Output random integer
+_MersenneIRandomD@8: ; PROC NEAR
+ mov ecx, MersenneInstance
+ call MersBRandom_reg ; random bits
+ mov edx, [esp+8] ; max
+ mov ecx, [esp+4] ; min
+ sub edx, ecx
+ js short S710 ; max < min
+ add edx, 1 ; max - min + 1
+ mul edx ; multiply random number by interval and truncate
+ lea eax, [edx+ecx] ; add min
+ ret 8
+S710: mov eax, 80000000H ; error exit
+ ret 8
+;_MersenneIRandomD@8 ENDP
+
+
+; extern "C" int __stdcall MersenneIRandomXD(int min, int max); // Output random integer, exact
+
+_MersenneIRandomXD@8: ; PROC NEAR
+ push edi
+ mov ecx, MersenneInstance
+ mov edx, [esp+8] ; min
+ mov edi, [esp+12] ; max
+ sub edi, edx ; max - min
+ jle short S830 ; max <= min (signed)
+ inc edi ; interval = max - min + 1
+ cmp edi, [ecx+CRandomMersenneA.LastInterval]
+ je S810
+ xor eax, eax ; 0
+ lea edx, [eax+1] ; 1
+ div edi ; (would give overflow if interval = 1)
+ mul edi
+ dec eax
+ mov [ecx+CRandomMersenneA.RLimit], eax
+ mov [ecx+CRandomMersenneA.LastInterval], edi
+S810:
+S820: call MersBRandom_reg ; random bits (ecx is preserved)
+ mul edi
+ cmp eax, [ecx+CRandomMersenneA.RLimit]
+ ja S820
+ mov eax, [esp+8] ; min
+ add eax, edx
+ pop edi
+ ret 8
+
+S830: jl S840
+ ; max = min. Return min
+ mov eax, edx
+ pop edi
+ ret 8 ; max = min exit
+
+S840: ; max < min: error
+ mov eax, 80000000H ; error exit
+ pop edi
+ ret 8
+;_MersenneIRandomXD@8 ENDP
+
+
+; extern "C" unsigned int __stdcall MersenneBRandomD(); // Output random bits
+_MersenneBRandomD@0: ; PROC NEAR ; generate 32 random bits
+ mov ecx, MersenneInstance
+ jmp MersBRandom_reg ; random bits
+;_MersenneBRandomD@0 ENDP
+
+%ENDIF ; WINDOWS
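
The single-threaded entry points above correspond to the C/C++ prototypes declared in randoma.h. A minimal C++ usage sketch (the seed values and printf formatting below are arbitrary examples, not part of the library):

    #include <cstdio>

    extern "C" {
        void MersenneRandomInit(int seed);
        void MersenneRandomInitByArray(unsigned int seeds[], int length);
        int MersenneIRandom(int min, int max);      // returns 0x80000000 if max < min
        int MersenneIRandomX(int min, int max);     // exact distribution
        double MersenneRandom();                    // [0,1) with 32 bits resolution
        unsigned int MersenneBRandom();             // 32 random bits
    }

    int main() {
        unsigned int seeds[2] = {0x12345678u, 42u}; // arbitrary example seeds
        MersenneRandomInitByArray(seeds, 2);        // seed by more than 32 bits
        double x = MersenneRandom();
        int d = MersenneIRandomX(1, 6);
        unsigned int b = MersenneBRandom();
        std::printf("%f %d %08X\n", x, d, b);
        return 0;
    }
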
diff --git a/asmlibSrc/mersenne64.asm b/asmlibSrc/mersenne64.asm
new file mode 100755
index 0000000..f510e7c
--- /dev/null
+++ b/asmlibSrc/mersenne64.asm
@@ -0,0 +1,614 @@
+; ----------------------------- MERSENNE64.ASM ---------------------------
+; Author: Agner Fog
+; Date created: 1998
+; Last modified: 2013-09-13
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+; Description:
+; Random Number generator 'Mersenne Twister' type MT11213A (or MT19937)
+;
+;
+; This random number generator is described in the article by
+; M. Matsumoto & T. Nishimura, in:
+; ACM Transactions on Modeling and Computer Simulation,
+; vol. 8, no. 1, 1998, pp. 3-30. See also:
+; http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+;
+; Initialization:
+; MersRandomInit must be called before the first call to any of the other
+; random number functions. The seed is any 32-bit integer.
+; You may use MersRandomInitByArray instead if you want more
+; than 32 bits for seed. length is the number of integers in seeds[].
+; length must be > 0, there is no upper limit for length.
+;
+; Generating random numbers:
+; MersRandom returns a floating point number in the interval 0 <= x < 1 with
+; a resolution of 32 bits.
+; MersIRandom returns an integer in the interval defined by min and max with
+; a resolution of 32 bits.
+; MersIRandomX returns an integer in the interval defined by min and max with
+; exactly equal probabilities of all values in the interval.
+; MersBRandom returns 32 random bits.
+;
+; Error conditions:
+; If MersRandomInit or MersRandomInitByArray has not been called then MersRandom
+; and MersBRandom keep returning 0, and MersIRandom and MersIRandomX return min.
+; MersIRandom and MersIRandomX return a large negative number if max < min.
+;
+; C++ prototypes in randoma.h:
+; Thread-safe versions:
+; extern "C" void MersRandomInit(void * Pthis, int seed); // Re-seed
+; extern "C" void MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length); // Seed by more than 32 bits
+; extern "C" int MersIRandom (void * Pthis, int min, int max); // Output random integer
+; extern "C" int MersIRandomX(void * Pthis, int min, int max); // Output random integer, exact
+; extern "C" double MersRandom(void * Pthis); // Output random float
+; extern "C" unsigned int MersBRandom(void * Pthis); // Output random bits
+;
+; Single-threaded versions:
+; extern "C" void MersenneRandomInit(int seed); // Re-seed
+; extern "C" void MersenneRandomInitByArray(unsigned int seeds[], int length); // Seed by more than 32 bits
+; extern "C" int MersenneIRandom (int min, int max); // Output random integer
+; extern "C" int MersenneIRandomX(int min, int max); // Output random integer, exact
+; extern "C" double MersenneRandom(); // Output random float
+; extern "C" unsigned int MersenneBRandom(); // Output random bits
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+global MersenneRandomInit, MersenneRandomInitD, MersRandomInit
+global MersenneRandomInitByArray, MersenneRandomInitByArrayD, MersRandomInitByArray
+global MersenneBRandom, MersenneBRandomD, MersBRandom
+global MersenneRandom, MersenneRandomD, MersRandom
+global MersenneIRandom, MersenneIRandomD, MersIRandom
+global MersenneIRandomX, MersenneIRandomXD, MersIRandomX
+
+
+section .data
+align 16
+
+; Data for single instance of random number generator
+MersenneInstance: ISTRUC CRandomMersenneA
+IEND
+; Size of structure
+MersenneSize equ $ - MersenneInstance
+
+
+SECTION .CODE ALIGN=16
+
+MersenneRandomInit: ; PROC
+%IFDEF UNIX
+ mov edx, edi ; seed
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersRandomInit
+%ENDIF
+%IFDEF WINDOWS
+MersenneRandomInitD: ; alias
+ mov edx, ecx ; seed
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ ;jmp ?Windows_MersRandomInit
+%ENDIF
+;MersenneRandomInit ENDP
+
+
+; Thread-safe version:
+; extern "C" void MersRandomInit(void * Pthis, int seed); // Re-seed
+MersRandomInit: ; PROC
+%IFDEF UNIX
+ ; translate calling convention
+ mov edx, esi ; seed
+ mov rcx, rdi ; Pthis
+%ENDIF
+ ; parameters: rcx = Pthis, edx = seed
+ and rcx, -16 ; align buffer
+ ?Windows_MersRandomInit:
+ call Mers_init0 ; initialize mt buffer with seeds
+
+ ; Number of premade numbers that are lost in the initialization when the
+ ; SSE2 implementation makes up to 4 premade numbers at a time:
+%IF MERS_N & 3
+ PREMADELOST equ (MERS_N & 3)
+%ELSE
+ PREMADELOST equ 4
+%ENDIF
+ ; We want the C++ and the assembly implementation to give exactly the same
+ ; sequence. The C++ version discards 37 random numbers after initialization.
+ ; The assembly version generates a sequence that is PREMADELOST + 1 numbers
+ ; behind. Therefore we discard the first 37 + PREMADELOST + 1 numbers if
+ ; SSE2 is supported, otherwise 37 + 1.
+
+ push rbx
+ mov ebx, 37+PREMADELOST+1
+ ; CMP dword [rcx+CRandomMersenneA.Instset], 4 ; can we use XMM registers and SSE2 ?
+ ; jae M110
+ ; sub ebx, PREMADELOST ; SSE2 not supported
+ ; mov dword [rcx+CRandomMersenneA.PreInx], 0 ; reset index to premade list
+M110: ; loop
+M120: call ?Windows_MersBRandom
+ dec ebx
+ jnz M120
+ pop rbx
+ ret
+;MersRandomInit ENDP
+
+
+Mers_init0: ; make random seeds from eax and put them into MT buffer
+; Input parameters:
+; rcx points to CRandomMersenneA
+; edx: seed
+; rcx unchanged by procedure
+
+ push rdi
+ ; clear my buffer
+ push rcx
+ mov rdi, rcx ; Pthis
+ add rdi, 16
+ mov ecx, (MersenneSize - 16) / 4
+ xor eax, eax
+ cld
+ rep stosd
+ pop rcx ; Pthis
+ mov edi, edx ; seed
+
+ ; initialize CRandomMersenneA structure
+ mov dword [rcx+CRandomMersenneA.PreInx], 4*4
+ mov dword [rcx+CRandomMersenneA.Instset], 4
+ mov eax, MERS_B
+ mov [rcx+CRandomMersenneA.TMB], eax
+ mov [rcx+CRandomMersenneA.TMB+4], eax
+ mov [rcx+CRandomMersenneA.TMB+8], eax
+ mov [rcx+CRandomMersenneA.TMB+12], eax
+ mov eax, MERS_C
+ mov [rcx+CRandomMersenneA.TMC], eax
+ mov [rcx+CRandomMersenneA.TMC+4], eax
+ mov [rcx+CRandomMersenneA.TMC+8], eax
+ mov [rcx+CRandomMersenneA.TMC+12], eax
+ mov eax, 3FF00000H ; upper dword of 1.0, double precision
+ mov [rcx+CRandomMersenneA.one+4], eax
+ mov [rcx+CRandomMersenneA.one+12], eax
+ mov dword [rcx+CRandomMersenneA.LMASK], LOWER_MASK
+ mov dword [rcx+CRandomMersenneA.UMASK], UPPER_MASK
+ mov dword [rcx+CRandomMersenneA.MATA], MERS_A
+
+ ; put random numbers into MT buffer
+ xor eax, eax
+M210: mov [rcx+rax*4+CRandomMersenneA.MT], edi
+ mov edx, edi
+ shr edi, 30
+ xor edi, edx
+ imul edi, 1812433253
+ inc eax
+ add edi, eax
+ cmp eax, MERS_N
+ jb M210
+
+ ; Set index MTI to end of list, (scaled by 4)
+ ; Round up to multiple of 4 to avoid alignment error
+ mov dword [rcx+CRandomMersenneA.MTI], ((MERS_N+3) & (-4)) * 4
+
+ pop rdi
+ ret
+
+
+; Single threaded version:
+; extern "C" void MersenneRandomInitByArray(unsigned int seeds[], int length);
+
+MersenneRandomInitByArray: ; PROC ; entry for Linux call
+%IFDEF UNIX
+ mov r8d, esi ; length
+ mov rdx, rdi ; seeds
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersRandomInitByArray
+%ENDIF
+%IFDEF WINDOWS
+MersenneRandomInitByArrayD: ; LABEL NEAR ; alias
+ mov r8d, edx ; length
+ mov rdx, rcx ; seeds
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersRandomInitByArray
+%ENDIF
+;MersenneRandomInitByArray ENDP
+
+; Thread-safe version:
+; extern "C" int MersRandomInitByArray(void * Pthis, unsigned int seeds[], int length);
+MersRandomInitByArray: ; PROC
+%IFDEF UNIX
+ ; translate calling convention
+ mov r8d, edx ; length
+ mov rdx, rsi ; seeds
+ mov rcx, rdi ; Pthis
+%ENDIF
+
+?Windows_MersRandomInitByArray:
+; parameters: rcx = Pthis, rdx = seeds, r8d = length
+
+ and rcx, -16 ; align buffer
+ push rbx
+ push rsi
+ push rdi
+ push rbp
+ mov rbx, rdx ; seeds
+ mov ebp, r8d ; length
+
+ mov edx, 19650218
+ call Mers_init0 ; init0(19650218); (rcx unchanged)
+
+ mov r8d, ebp ; r8d = length, ebp = k
+ test ebp, ebp
+ jle M380 ; error: length <= 0
+ xor edi, edi ; j = 0
+ lea esi, [rdi+1] ; i = 1
+ cmp ebp, MERS_N
+ ja M310
+ mov ebp, MERS_N ; k = max (MERS_N,length)
+M310:
+
+ ; for (; k; k--) {
+M320: mov eax, [rcx+rsi*4-4+CRandomMersenneA.MT] ; mt[i-1]
+ mov edx, eax
+ shr eax, 30
+ xor eax, edx ; mt[i-1] ^ (mt[i-1] >> 30)
+ imul eax, 1664525 ; * 1664525
+ xor eax, [rcx+rsi*4+CRandomMersenneA.MT] ; ^ mt[i]
+ add eax, [rbx+rdi*4] ; + seeds[j]
+ add eax, edi ; + j
+ mov [rcx+rsi*4+CRandomMersenneA.MT], eax ; save in mt[i]
+ inc esi ; i++
+ inc edi ; j++
+ cmp esi, MERS_N
+ jb M330 ; if (i>=MERS_N)
+ mov eax, [rcx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
+ mov [rcx+CRandomMersenneA.MT], eax
+ mov esi, 1 ; i=1;
+M330:
+ cmp edi, r8d ; length
+ jb M340 ; if (j>=length)
+ xor edi, edi ; j = 0;
+M340:
+ dec ebp ; k--
+ jnz M320 ; first k loop
+M350:
+ mov ebp, MERS_N-1 ; k
+M360: mov eax, [rcx+rsi*4-4+CRandomMersenneA.MT] ; mt[i-1]
+ mov edx, eax
+ shr eax, 30
+ xor eax, edx ; mt[i-1] ^ (mt[i-1] >> 30)
+ imul eax, 1566083941 ; * 1566083941
+ xor eax, [rcx+rsi*4+CRandomMersenneA.MT] ; ^ mt[i]
+ sub eax, esi ; - i
+ mov [rcx+rsi*4+CRandomMersenneA.MT], eax ; save in mt[i]
+ inc esi ; i++
+ cmp esi, MERS_N
+ jb M370 ; if (i>=MERS_N)
+ mov eax, [rcx+(MERS_N-1)*4+CRandomMersenneA.MT]; mt[0] = mt[MERS_N-1];
+ mov [rcx+CRandomMersenneA.MT], eax
+ mov esi, 1 ; i=1;
+M370:
+ dec ebp ; k--
+ jnz M360 ; second k loop
+ mov dword [rcx+CRandomMersenneA.MT], 80000000H ; mt[0] = 0x80000000
+M380:
+ mov dword [rcx+CRandomMersenneA.MTI], 0
+ mov dword [rcx+CRandomMersenneA.PreInx], 0
+
+; discard first MERS_N random numbers + PREMADELOST+1 to compensate for lag
+ mov edi, MERS_N + PREMADELOST+1
+M391: call ?Windows_MersBRandom
+ dec edi
+ jnz M391
+
+ pop rbp ; restore registers
+ pop rdi
+ pop rsi
+ pop rbx
+ ret
+;MersRandomInitByArray ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MersenneBRandom(); // Output random bits
+
+MersenneBRandom: ; PROC ; entry for both Windows and Linux call
+%IFDEF WINDOWS
+MersenneBRandomD: ; LABEL NEAR ; alias
+%ENDIF
+ lea rcx, [MersenneInstance] ; Point to instance
+ jmp ?Windows_MersBRandom
+;MersenneBRandom ENDP
+
+; Thread-safe version:
+; extern "C" unsigned int MersBRandom(void * Pthis); // Output random bits
+
+MersBRandom: ; PROC
+%IFDEF UNIX
+ mov rcx, rdi ; translate calling convention
+%ENDIF
+
+?Windows_MersBRandom: ; LABEL NEAR ; Label used internally
+ and rcx, -16 ; align buffer
+ mov edx, [rcx+CRandomMersenneA.PreInx] ; index into premade numbers
+ mov eax, [rcx+rdx*1+CRandomMersenneA.PreInt] ; fetch premade random number
+ add edx, 4
+ mov [rcx+CRandomMersenneA.PreInx], edx
+ cmp edx, 4*4
+ jnb M410
+ ret ; return premade number
+
+M410:
+; PREMADE list is empty. Make 4 more numbers ready for next call:
+ mov edx, [rcx+CRandomMersenneA.MTI] ; fetch 4 numbers from MT buffer
+ movdqa xmm0, oword [rcx+rdx*1+CRandomMersenneA.MT]
+
+%IF TEMPERING ; optional tempering algorithm
+ movdqa xmm1, xmm0
+ psrld xmm0, MERS_U
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ pslld xmm0, MERS_S
+ pand xmm0, oword [rcx+CRandomMersenneA.TMB]
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ pslld xmm0, MERS_T
+ pand xmm0, oword [rcx+CRandomMersenneA.TMC]
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm0
+ psrld xmm0, MERS_L
+ pxor xmm0, xmm1
+%ENDIF ; tempering
+
+ ; save four premade integers
+ movdqa oword [rcx+CRandomMersenneA.PreInt], xmm0
+ ; premake four floating point numbers
+ pxor xmm1, xmm1
+ pxor xmm2, xmm2
+ punpckldq xmm1, xmm0 ; get first two numbers into bits 32-63 and 96-127
+ punpckhdq xmm2, xmm0 ; get next two numbers into bits 32-63 and 96-127
+ psrlq xmm1, 12 ; get bits into mantissa position
+ psrlq xmm2, 12 ; get bits into mantissa position
+ por xmm1,oword[rcx+CRandomMersenneA.one] ; set exponent for interval [1,2)
+ por xmm2,oword[rcx+CRandomMersenneA.one] ; set exponent for interval [1,2)
+ movdqa oword [rcx+CRandomMersenneA.PreFlt], xmm1 ; store two premade numbers
+ movdqa oword [rcx+CRandomMersenneA.PreFlt+16],xmm2; store two more premade numbers
+ mov dword [rcx+CRandomMersenneA.PreInx], 0 ; index to premade numbers
+ add edx, 4*4 ; increment MTI index into MT buffer by 4
+ mov [rcx+CRandomMersenneA.MTI], edx
+ cmp edx, MERS_N*4
+ jae M420
+ ret ; return random number in eax
+
+; MT buffer exhausted. Make MERS_N new numbers ready for next time
+M420: ; eax is the random number to return
+%IF MERS_N & 3 ; if MERS_N is not divisible by 4
+ NVALID equ MERS_N & 3 ; only NVALID of the 4 premade numbers are valid
+ ; Move premade numbers (4-NVALID) positions forward
+ movdqa xmm0, [rcx+CRandomMersenneA.PreInt]
+ movdqa xmm1, [rcx+CRandomMersenneA.PreFlt]
+ movdqa xmm2, [rcx+CRandomMersenneA.PreFlt+16]
+ movdqu [rcx+CRandomMersenneA.PreInt + (4-NVALID)*4], xmm0
+ movdqu [rcx+CRandomMersenneA.PreFlt + (4-NVALID)*8], xmm1
+%IF NVALID == 3
+ movq [rcx+CRandomMersenneA.PreFlt+16 + 8], xmm2
+%ENDIF
+ ; save index to first valid premade number
+ mov dword [rcx+CRandomMersenneA.PreInx], (4-NVALID)*4
+%ENDIF
+
+; MT buffer is empty. Fill it up
+ push rbx
+ movd xmm3, [rcx+CRandomMersenneA.UMASK] ; load constants
+ movd xmm4, [rcx+CRandomMersenneA.LMASK]
+ movd xmm5, [rcx+CRandomMersenneA.MATA]
+ pshufd xmm3, xmm3, 0 ; broadcast constants
+ pshufd xmm4, xmm4, 0
+ pshufd xmm5, xmm5, 0
+ xor rbx, rbx ; kk = 0
+ mov edx, MERS_M*4 ; km
+
+; change rcx from pointing to CRandomMersenneA to pointing to CRandomMersenneA.MT
+ add rcx, CRandomMersenneA.MT
+
+M430: ; kk loop
+ movdqa xmm2, [rcx+rbx] ; mt[kk]
+ movd xmm0, dword [rcx+rbx+16]
+ movdqa xmm1, [rcx+rbx] ; mt[kk]
+ movss xmm2, xmm0 ; faster than movdqu xmm2,[]
+ pshufd xmm2, xmm2, 00111001B ; mt[kk+1]
+ movdqu xmm0, oword [rcx+rdx] ; mt[km]
+ ;movq xmm0, qword [rcx+rdx] ; mt[km]
+ ;movhps xmm0, qword [rcx+rdx+8] ; faster than movdqu on older processors
+ pand xmm1, xmm3 ; mt[kk] & UPPER_MASK
+ pand xmm2, xmm4 ; mt[kk+1] & LOWER_MASK
+ por xmm1, xmm2 ; y
+ movdqa xmm2, xmm1 ; y
+ pslld xmm1, 31 ; copy bit 0 into all bits
+ psrad xmm1, 31 ; -(y & 1)
+ pand xmm1, xmm5 ; & MERS_A
+ psrld xmm2, 1 ; y >> 1
+ pxor xmm0, xmm1
+ pxor xmm0, xmm2
+ movdqa [rcx+rbx], xmm0 ; result into mt[kk]
+ cmp ebx, (MERS_N-4)*4
+ jae M440 ; exit loop when kk past end of buffer
+ add ebx, 16 ; kk += 4
+ add rdx, 16 ; km += 4 (signed)
+ cmp edx, (MERS_N-4)*4
+ jbe M430 ; skip unless km wraparound
+ sub rdx, MERS_N*4 ; km wraparound (signed)
+ movdqu xmm0, [rcx+(MERS_N-4)*4] ; copy end to before begin for km wraparound
+ movdqa [rcx-4*4], xmm0
+ movdqa xmm0, [rcx] ; copy begin to after end for kk wraparound
+ movdqu [rcx+MERS_N*4], xmm0
+ jmp M430
+
+M440: ; loop finished. discard excess part of last result
+
+; change ecx back to pointing to CRandomMersenneA
+ sub rcx, CRandomMersenneA.MT
+
+ mov dword [rcx+CRandomMersenneA.MTI], 0
+ pop rbx
+ ret ; random number is still in eax
+
+;MersBRandom ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MersenneRandom(); // Get floating point random number
+
+MersenneRandom: ; PROC ; entry for both Windows and Linux call
+%IFDEF WINDOWS
+MersenneRandomD: ; alias
+ lea rcx, [MersenneInstance] ; Point to instance
+ ; continue in next function
+%ENDIF
+%IFDEF UNIX
+ lea rdi, [MersenneInstance] ; Point to instance
+ ; continue in next function
+%ENDIF
+
+; Thread-safe version:
+; extern "C" double MersRandom(void * Pthis); // Get floating point random number
+MersRandom:
+%IFDEF UNIX
+ mov rcx, rdi ; translate calling convention
+%ENDIF
+ mov edx, [rcx+CRandomMersenneA.PreInx] ; index into premade numbers
+ movsd xmm0, [rcx+rdx*2+CRandomMersenneA.PreFlt] ; fetch premade floating point random number
+ subsd xmm0, [rcx+CRandomMersenneA.one] ; subtract 1.0
+ movsd [rcx+CRandomMersenneA.TmpFlt], xmm0 ; store random number
+ call ?Windows_MersBRandom ; prepare next random number
+ movsd xmm0, [rcx+CRandomMersenneA.TmpFlt] ; recall random number
+ ret
+;MersenneRandom ENDP
+
+
+
+; Single threaded version:
+; extern "C" unsigned int MersenneIRandom(int min, int max); // Get integer random number in desired interval
+
+MersenneIRandom: ; PROC
+%IFDEF UNIX
+ push rsi ; max
+ push rdi ; min
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp MersIRandom_max_min_on_stack
+%ENDIF
+%IFDEF WINDOWS
+MersenneIRandomD: ; Alias
+ push rdx ; max
+ push rcx ; min
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp MersIRandom_max_min_on_stack
+%ENDIF
+;MersenneIRandom ENDP
+
+; Thread-safe version:
+; extern "C" int MersIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+MersIRandom: ; PROC
+%IFDEF UNIX
+ ; translate calling convention
+ mov r8d, edx ; max
+ mov edx, esi ; min
+ mov rcx, rdi ; Pthis
+%ENDIF
+ push r8 ; max
+ push rdx ; min
+MersIRandom_max_min_on_stack:
+
+ call ?Windows_MersBRandom ; random bits
+ pop rcx ; min
+ pop rdx ; max
+ sub edx, ecx
+ js short M720 ; max < min
+ add edx, 1 ; interval = max - min + 1
+ mul edx ; multiply random number by interval and truncate
+ lea eax, [rdx+rcx] ; add min
+ ret
+M720: mov eax, 80000000H ; error exit
+ ret
+;MersIRandom ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MersenneIRandomX(int min, int max); // Get integer random number in desired interval
+
+MersenneIRandomX: ; PROC
+%IFDEF UNIX
+ mov r8d, esi ; max
+ mov edx, edi ; min
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersIRandomX
+%ENDIF
+%IFDEF WINDOWS
+MersenneIRandomXD: ; alias
+ mov r8d, edx ; max
+ mov edx, ecx ; min
+ lea rcx, [MersenneInstance] ; Pthis = point to instance
+ jmp ?Windows_MersIRandomX
+%ENDIF
+;MersenneIRandomX ENDP
+
+; Thread-safe version:
+; extern "C" int MersIRandomX(void * Pthis, int min, int max); // Get integer random number in desired interval
+MersIRandomX: ; PROC
+%IFDEF UNIX
+ ; translate calling convention
+ mov r8d, edx ; max
+ mov edx, esi ; min
+ mov rcx, rdi ; Pthis
+%ENDIF
+
+?Windows_MersIRandomX:
+; parameters: rcx = Pthis, edx = min, r8d = max
+
+ and rcx, -16 ; align buffer
+ push rdi
+ mov edi, r8d ; max
+
+ sub edi, edx ; max - min
+ jle short M830 ; max <= min (signed)
+ inc edi ; interval = max - min + 1
+ push rdx ; save min
+
+ ; if (interval != LastInterval) {
+ cmp edi, [rcx+CRandomMersenneA.LastInterval]
+ je M810
+ ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
+ xor eax, eax ; 0
+ lea edx, [rax+1] ; 1
+ div edi ; (would give overflow if interval = 1)
+ mul edi
+ dec eax
+ mov [rcx+CRandomMersenneA.RLimit], eax
+ mov [rcx+CRandomMersenneA.LastInterval], edi
+M810:
+M820: ; do { // Rejection loop
+ call ?Windows_MersBRandom ; random bits (rcx is preserved)
+ ; longran = (uint64)BRandom() * interval;
+ mul edi
+ ; } while (remainder > RLimit);
+ cmp eax, [rcx+CRandomMersenneA.RLimit]
+ ja M820
+
+ ; return (int32)iran + min
+ pop rax ; min
+ add eax, edx
+ pop rdi
+ ret
+
+M830: jl M840
+ ; max = min. Return min
+ mov eax, edx
+ pop rdi
+ ret ; max = min exit
+
+M840: ; max < min: error
+ mov eax, 80000000H ; error exit
+ pop rdi
+ ret
+;MersIRandomX ENDP
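
The MersIRandomX rejection loop above is spelled out in the interleaved C-style comments. For reference, the same logic in portable C++ looks roughly like this sketch (IRandomXSketch and the BRandom parameter are illustrative names; LastInterval and RLimit, which the assembly keeps in the CRandomMersenneA state, are shown as statics for brevity):

    #include <cstdint>

    static uint32_t LastInterval = 0, RLimit = 0;   // per-instance fields in the asm version

    int IRandomXSketch(int min, int max, uint32_t (*BRandom)()) {
        if (max < min) return (int)0x80000000;      // error value used above
        if (max == min) return min;
        uint32_t interval = (uint32_t)(max - min) + 1;
        if (interval != LastInterval) {
            // RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1
            RLimit = (uint32_t)((uint64_t(1) << 32) / interval) * interval - 1;
            LastInterval = interval;
        }
        uint32_t iran, remainder;
        do {                                        // rejection loop
            uint64_t longran = (uint64_t)BRandom() * interval;
            iran = (uint32_t)(longran >> 32);       // candidate in [0, interval)
            remainder = (uint32_t)longran;
        } while (remainder > RLimit);               // reject to remove bias
        return (int)iran + min;
    }
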
diff --git a/asmlibSrc/mother32.asm b/asmlibSrc/mother32.asm
new file mode 100755
index 0000000..af5cf6f
--- /dev/null
+++ b/asmlibSrc/mother32.asm
@@ -0,0 +1,370 @@
+; ----------------------------- MOTHER32.ASM -----------------------------
+; Author: Agner Fog
+; Date created: 1998
+; Last modified: 2013-09-11
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 32 bit
+; Description:
+;
+; Mother-of-All random number generator by Agner Fog 1998 - 2008
+; 32-bit mode version for 80x86 and compatible microprocessors
+;
+; This is a multiply-with-carry type of random number generator
+; invented by George Marsaglia. The algorithm is:
+; S = 2111111111*X[n-4] + 1492*X[n-3] + 1776*X[n-2] + 5115*X[n-1] + C
+; X[n] = S modulo 2^32
+; C = floor(S / 2^32)
+;
+; C++ prototypes:
+;
+; Thread-safe versions:
+; extern "C" void MotRandomInit(void * Pthis, int seed); // Initialization
+; extern "C" int MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+; extern "C" double MotRandom(void * Pthis); // Get floating point random number
+; extern "C" unsigned int MotBRandom(void * Pthis); // Output random bits
+;
+; Single-threaded static link versions
+; extern "C" void MotherRandomInit(int seed); // Initialization
+; extern "C" int MotherIRandom(int min, int max); // Get integer random number in desired interval
+; extern "C" double MotherRandom(); // Get floating point random number
+; extern "C" unsigned int MotherBRandom(); // Output random bits
+;
+; Single-threaded dynamic link versions
+; extern "C" void __stdcall MotherRandomInitD(int seed); // Initialization
+; extern "C" int __stdcall MotherIRandomD(int min, int max); // Get integer random number in desired interval
+; extern "C" double __stdcall MotherRandomD(); // Get floating point random number
+; extern "C" unsigned int __stdcall MotherBRandomD(); // Output random bits
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global _MotBRandom, _MotRandom, _MotIRandom, _MotRandomInit
+global _MotherRandomInit, _MotherRandom, _MotherIRandom, _MotherBRandom
+%IFDEF WINDOWS
+global _MotherRandomInitD@4, _MotherRandomD@0, _MotherIRandomD@8, _MotherBRandomD@0
+%ENDIF
+
+extern _InstructionSet
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+; dummy offset operator
+%define offset
+
+section .data
+align 16
+; Data for single instance of random number generator
+MotherInstance: ISTRUC CRandomMotherA
+IEND
+; Size of structure
+MotherSize equ $-MotherInstance
+
+
+SECTION .CODE align=16 ; code segment
+
+; extern "C" unsigned int MotherBRandom(void * Pthis); // Output random bits
+
+_MotBRandom: ; PROC NEAR
+ mov ecx, [esp+4] ; Pthis
+ and ecx, -16 ; align
+MotBRandom_reg: ; Alternative entry for Pthis in ecx
+
+ ; CPU dispatch:
+ cmp dword [ecx+CRandomMotherA.Instset], 4
+ jb MotBRandomGeneric
+
+; SSE2 version
+ ; ecx = Pthis
+ movdqa xmm1, oword [ecx+CRandomMotherA.M3] ; load M3,M2,M1,M0
+ mov eax, [ecx+CRandomMotherA.M0] ; Retrieve previous random number
+ movdqa xmm2, xmm1 ; copy
+ movdqa xmm3, oword [ecx+CRandomMotherA.MF3] ; factors
+ psrlq xmm2, 32 ; move M2,M0 down
+ movq qword [ecx+CRandomMotherA.M4], xmm1 ; M4=M3, M3=M2
+ movhps qword [ecx+CRandomMotherA.M2], xmm1 ; M2=M1, M1=M0
+ pmuludq xmm1, xmm3 ; M3*MF3, M1*MF1
+ psrlq xmm3, 32 ; move MF2,MF0 down
+ pmuludq xmm2, xmm3 ; M2*MF2, M0*MF0
+ paddq xmm1, xmm2 ; P2+P3, P0+P1
+ movhlps xmm2, xmm1 ; Get high qword
+ paddq xmm1, xmm2 ; P0+P1+P2+P3
+ paddq xmm1, [ecx+CRandomMotherA.MC] ; +carry
+ movq qword [ecx+CRandomMotherA.M0], xmm1 ; Store new M0 and carry
+ ; convert to double precision float
+ psllq xmm1, 32 ; Discard carry bits
+ psrlq xmm1, 12 ; Get bits into mantissa position
+ por xmm1, oword [ecx+CRandomMotherA.one] ; Add exponent bits to get number in interval [1,2)
+ movq [ecx+CRandomMotherA.RanP1], xmm1 ; Store floating point number
+ ret
+
+
+; Generic version for old processors
+MotBRandomGeneric: ; Generic version for old processors
+ ; ecx = Pthis
+ push esi
+ push edi
+ ; recall previous random number
+ push dword [ecx+CRandomMotherA.M0]
+ ; prepare new random number
+ mov eax, [ecx+CRandomMotherA.MF3]
+ mul dword [ecx+CRandomMotherA.M3] ; x[n-4]
+ mov esi,eax
+ mov eax, [ecx+CRandomMotherA.M2] ; x[n-3]
+ mov edi,edx
+ mov [ecx+CRandomMotherA.M3],eax
+ mul dword [ecx+CRandomMotherA.MF2]
+ add esi,eax
+ mov eax, [ecx+CRandomMotherA.M1] ; x[n-2]
+ adc edi,edx
+ mov [ecx+CRandomMotherA.M2],eax
+ mul dword [ecx+CRandomMotherA.MF1]
+ add esi,eax
+ mov eax,[ecx+CRandomMotherA.M0] ; x[n-1]
+ adc edi,edx
+ mov [ecx+CRandomMotherA.M1],eax
+ mul dword [ecx+CRandomMotherA.MF0]
+ add eax,esi
+ adc edx,edi
+ add eax,[ecx+CRandomMotherA.MC]
+ adc edx,0
+ ; store next random number and carry
+ mov [ecx+CRandomMotherA.M0],eax
+ mov [ecx+CRandomMotherA.MC],edx
+ ; convert to float in case next call needs a float
+ mov edx, eax
+ shr eax, 12
+ or eax, 3ff00000h
+ shl edx, 20
+ mov dword [ecx+CRandomMotherA.RanP1+4], eax
+ mov dword [ecx+CRandomMotherA.RanP1], edx
+ ; retrieve previous random number
+ pop eax
+ pop edi
+ pop esi
+ ret
+;_MotBRandom ENDP
+
+
+; extern "C" double MotRandom(void * Pthis); // Get floating point random number
+_MotRandom: ; PROC NEAR
+
+ mov ecx, [esp+4] ; Pthis
+ and ecx, -16 ; align
+ ; get previously prepared random number
+ fld qword [ecx+CRandomMotherA.RanP1]
+ fsub qword [ecx+CRandomMotherA.one]
+
+ ; make new random number ready for next time
+ call MotBRandom_reg ; random bits
+ ret
+;_MotRandom ENDP
+
+
+; extern "C" int MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+_MotIRandom: ; PROC NEAR ; make random integer in desired interval
+
+ mov ecx, [esp+4] ; Pthis
+ and ecx, -16 ; align
+ call MotBRandom_reg ; make random number
+ mov edx, [esp+12] ; max
+ mov ecx, [esp+8] ; min
+ sub edx, ecx
+ js short rerror ; max < min
+ inc edx ; max - min + 1
+ mul edx ; multiply random number by interval and truncate
+ lea eax, [edx+ecx] ; add min
+ ret ; ret 8 if not _cdecl calling
+
+rerror: mov eax, 80000000h ; error exit
+ ret ; ret 8 if not _cdecl calling
+;_MotIRandom ENDP
+
+
+; extern "C" void MotRandomInit(void * Pthis, int seed); // Initialization
+_MotRandomInit: ; PROC NEAR
+MotRandomInit@: ; local alias
+
+ ; clear my buffer
+ push edi
+ mov edi, [esp+8] ; Pthis
+ and edi, -16 ; align
+ add edi, 16
+ mov ecx, (MotherSize - 16) / 4
+ xor eax, eax
+ cld
+ rep stosd
+
+ ; insert constants
+ mov ecx, [esp+8] ; Pthis
+ and ecx, -16 ; align
+ mov dword [ecx+CRandomMotherA.one+4],3FF00000H ; high dword of 1.0
+ mov dword [ecx+CRandomMotherA.MF0], 5115 ; factors
+ mov dword [ecx+CRandomMotherA.MF1], 1776
+ mov dword [ecx+CRandomMotherA.MF2], 1492
+ mov dword [ecx+CRandomMotherA.MF3], 2111111111
+
+ ; get instruction set
+ push ecx
+ call _InstructionSet
+ pop ecx
+ mov [ecx+CRandomMotherA.Instset], eax
+
+ ; initialize from seed
+ mov eax, [esp+12] ; seed
+ ; make random numbers and put them into buffer
+ mov edx, 29943829
+ imul eax, edx
+ dec eax
+ mov [ecx+CRandomMotherA.M0], eax
+ imul eax, edx
+ dec eax
+ mov [ecx+CRandomMotherA.M1], eax
+ imul eax, edx
+ dec eax
+ mov [ecx+CRandomMotherA.M2], eax
+ imul eax, edx
+ dec eax
+ mov [ecx+CRandomMotherA.M3], eax
+ imul eax, edx
+ dec eax
+ mov [ecx+CRandomMotherA.MC], eax
+
+ ; randomize some more
+ mov edi, 20 ; loop counter
+r90: call MotBRandom_reg
+ dec edi
+ jnz r90
+ pop edi
+ ret 0 ; ret 4 if not _cdecl calling
+;_MotRandomInit ENDP
+
+
+; ------------------------------------------------------------------
+; Single-threaded static link versions of Mother-of-all generator
+; ------------------------------------------------------------------
+
+%IFDEF POSITIONINDEPENDENT
+; Get ecx = eip for self-relative addressing
+GetThunkECX:
+ mov ecx, [esp]
+ ret
+
+; Get address of MotherInstance into ecx, position independent
+; This works only in YASM, not in NASM:
+%macro GetMotherInstanceAddress 0
+ call GetThunkECX
+ add ecx, MotherInstance - $
+%endmacro
+
+%ELSE
+
+; Get address of MotherInstance into ecx, position dependent
+; This works only in YASM, not in NASM:
+%macro GetMotherInstanceAddress 0
+ mov ecx, MotherInstance
+%endmacro
+
+%ENDIF
+
+
+; extern "C" void MotherRandomInit(int seed); // Initialization
+_MotherRandomInit: ; PROC NEAR
+ push dword [esp+4] ; seed
+ GetMotherInstanceAddress
+ push ecx
+ call MotRandomInit@
+ pop ecx
+ pop ecx
+ ret
+;_MotherRandomInit ENDP
+
+
+; extern "C" double MotherRandom(); // Get floating point random number
+_MotherRandom: ; PROC NEAR
+ GetMotherInstanceAddress
+ fld qword [ecx+CRandomMotherA.RanP1]
+ fsub qword [ecx+CRandomMotherA.one]
+ call MotBRandom_reg ; random bits
+ ret
+;_MotherRandom ENDP
+
+
+; extern "C" int MotherIRandom(int min, int max); // Get integer random number in desired interval
+_MotherIRandom: ; PROC NEAR ; make random integer in desired interval
+ GetMotherInstanceAddress
+ call MotBRandom_reg ; make random number
+ mov edx, [esp+8] ; max
+ mov ecx, [esp+4] ; min
+ sub edx, ecx
+ jl RR100 ; max < min
+ inc edx ; max - min + 1
+ mul edx ; multiply random number by interval and truncate
+ lea eax, [edx+ecx] ; add min
+ ret ; ret 8 if not _cdecl calling
+
+RR100: mov eax, 80000000H ; error exit
+ ret ; ret 8 if not _cdecl calling
+;_MotherIRandom ENDP
+
+
+; extern "C" unsigned int MotherBRandom(); // Output random bits
+_MotherBRandom: ; PROC NEAR
+ GetMotherInstanceAddress
+ jmp MotBRandom_reg
+;_MotherBRandom ENDP
+
+
+; ------------------------------------------------------------------
+; Single-threaded dynamic link versions
+; ------------------------------------------------------------------
+
+%IFDEF WINDOWS
+
+; extern "C" void __stdcall MotherRandomInitD(int seed); // Initialization
+_MotherRandomInitD@4: ; PROC NEAR
+ push dword [esp+4] ; seed
+ push offset MotherInstance
+ call MotRandomInit@
+ pop ecx
+ pop ecx
+ ret 4
+;_MotherRandomInitD@4 ENDP
+
+
+; extern "C" double __stdcall MotherRandomD(); // Get floating point random number
+_MotherRandomD@0: ; PROC NEAR
+ mov ecx, offset MotherInstance
+ fld qword [ecx+CRandomMotherA.RanP1]
+ fsub qword [ecx+CRandomMotherA.one]
+ call MotBRandom_reg ; random bits
+ ret
+;_MotherRandomD@0 ENDP
+
+
+; extern "C" int __stdcall MotherIRandomD(int min, int max); // Get integer random number in desired interval
+_MotherIRandomD@8: ; PROC NEAR ; make random integer in desired interval
+ mov ecx, offset MotherInstance
+ call MotBRandom_reg ; make random number
+ mov edx, [esp+8] ; max
+ mov ecx, [esp+4] ; min
+ sub edx, ecx
+ js RR200 ; max < min
+ inc edx ; max - min + 1
+ mul edx ; multiply random number by interval and truncate
+ lea eax, [edx+ecx] ; add min
+ ret 8
+
+RR200: mov eax, 80000000h ; error exit
+ ret 8
+;_MotherIRandomD@8 ENDP
+
+
+; extern "C" unsigned int __stdcall MotherBRandomD(); // Output random bits
+_MotherBRandomD@0: ; PROC NEAR
+ mov ecx, offset MotherInstance
+ jmp MotBRandom_reg
+;_MotherBRandomD@0 ENDP
+
+%ENDIF ; WINDOWS
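
The recurrence stated in the header of mother32.asm is easiest to follow in scalar form. The following C++ sketch of one MotBRandom step uses illustrative names (MotherState, MotherBRandomSketch); the SSE2 code above computes the same four products with packed 32x32->64 multiplies and, like the assembly, returns the number prepared on the previous call:

    #include <cstdint>

    struct MotherState {
        uint32_t x[4];    // x[0] = X[n-1] ... x[3] = X[n-4]
        uint32_t carry;   // C
    };

    uint32_t MotherBRandomSketch(MotherState &s) {
        uint32_t previous = s.x[0];               // value prepared on the previous call
        uint64_t S = 2111111111ull * s.x[3]       // 2111111111 * X[n-4]
                   + 1492ull * s.x[2]             // 1492 * X[n-3]
                   + 1776ull * s.x[1]             // 1776 * X[n-2]
                   + 5115ull * s.x[0]             // 5115 * X[n-1]
                   + s.carry;                     // + C
        s.x[3] = s.x[2]; s.x[2] = s.x[1]; s.x[1] = s.x[0];
        s.x[0] = (uint32_t)S;                     // X[n] = S modulo 2^32
        s.carry = (uint32_t)(S >> 32);            // C = floor(S / 2^32)
        return previous;
    }
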
diff --git a/asmlibSrc/mother64.asm b/asmlibSrc/mother64.asm
new file mode 100755
index 0000000..83b6c50
--- /dev/null
+++ b/asmlibSrc/mother64.asm
@@ -0,0 +1,250 @@
+; ----------------------------- MOTHER64.ASM -----------------------------
+; Author: Agner Fog
+; Date created: 1998
+; Last modified: 2013-12-15
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+; Description:
+; Mother-of-All random number generator by Agner Fog
+; 64-bit mode version for x86-64 compatible microprocessors.
+;
+; This is a multiply-with-carry type of random number generator
+; invented by George Marsaglia. The algorithm is:
+; S = 2111111111*X[n-4] + 1492*X[n-3] + 1776*X[n-2] + 5115*X[n-1] + C
+; X[n] = S modulo 2^32
+; C = floor(S / 2^32)
+;
+; C++ prototypes:
+; extern "C" void MotRandomInit(void * Pthis, int seed); // Initialization
+; extern "C" int MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+; extern "C" double MotRandom(void * Pthis); // Get floating point random number
+; extern "C" unsigned int MotBRandom(void * Pthis); // Output random bits
+;
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+; publics:
+global MotherBRandom, MotBRandom, ?Windows_MotBRandom
+global MotherRandom, MotRandom, MotherIRandom, MotIRandom
+global MotherRandomInit, MotRandomInit
+%IFDEF WINDOWS
+global MotherRandomInitD, MotherRandomD, MotherIRandomD, MotherBRandomD
+%ENDIF
+
+
+section .data
+align 16
+
+; Data for single instance of random number generator
+MotherInstance: ISTRUC CRandomMotherA
+IEND
+; Size of structure
+MotherSize equ $-MotherInstance
+
+
+SECTION .CODE ALIGN=16 ; code segment
+
+; Single threaded version:
+; extern "C" unsigned int MotherBRandom(); // Output random bits
+
+MotherBRandom: ; PROC ; entry for both Windows and Linux call
+%IFDEF WINDOWS
+MotherBRandomD:
+%ENDIF
+ lea rcx, [MotherInstance] ; Point to instance
+ jmp ?Windows_MotBRandom
+;MotherBRandom ENDP
+
+; Thread-safe version:
+; extern "C" unsigned int MotBRandom(void * Pthis); // Output random bits
+
+MotBRandom: ; PROC
+%IFDEF UNIX
+ mov rcx, rdi ; translate calling convention
+%ENDIF
+?Windows_MotBRandom:
+ and rcx, -16 ; align
+ movdqa xmm1, oword [rcx+CRandomMotherA.M3] ; load M3,M2,M1,M0
+ mov eax, [rcx+CRandomMotherA.M0] ; Retrieve previous random number
+ movdqa xmm2, xmm1 ; copy
+ movdqa xmm3, oword [rcx+CRandomMotherA.MF3] ; factors
+ psrlq xmm2, 32 ; move M2,M0 down
+ movq qword [rcx+CRandomMotherA.M4], xmm1 ; M4=M3, M3=M2
+ movhps qword [rcx+CRandomMotherA.M2], xmm1 ; M2=M1, M1=M0
+ pmuludq xmm1, xmm3 ; M3*MF3, M1*MF1
+ psrlq xmm3, 32 ; move MF2,MF0 down
+ pmuludq xmm2, xmm3 ; M2*MF2, M0*MF0
+ paddq xmm1, xmm2 ; P2+P3, P0+P1
+ movhlps xmm2, xmm1 ; Get high qword
+ paddq xmm1, xmm2 ; P0+P1+P2+P3
+ paddq xmm1, oword [rcx+CRandomMotherA.MC] ; +carry
+ movq qword [rcx+CRandomMotherA.M0], xmm1 ; Store new M0 and carry
+ ; convert to double precision float
+ psllq xmm1, 32 ; Discard carry bits
+ psrlq xmm1, 12 ; Get bits into mantissa position
+ por xmm1, oword [rcx+CRandomMotherA.one] ; Add exponent bits to get number in interval [1,2)
+ movq [rcx+CRandomMotherA.RanP1], xmm1 ; Store floating point number
+ ret
+
+;MotBRandom ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MotherRandom(); // Get floating point random number
+
+MotherRandom:
+%IFDEF UNIX
+ lea rdi, [MotherInstance] ; Point to instance
+%ENDIF
+%IFDEF WINDOWS
+MotherRandomD:
+ lea rcx, [MotherInstance] ; Point to instance
+%ENDIF
+
+; Thread-safe version:
+; extern "C" double MotRandom(void * Pthis); // Get floating point random number
+MotRandom:
+%IFDEF UNIX
+ mov rcx, rdi ; translate calling convention
+%ENDIF
+ and rcx, -16 ; align
+ ; get previously prepared random number
+ movsd xmm0, [rcx+CRandomMotherA.RanP1]
+ subsd xmm0, [rcx+CRandomMotherA.one]
+
+ ; make new random number ready for next time
+ call ?Windows_MotBRandom
+ ret
+;MotherRandom ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MotherIRandom(int min, int max); // Get integer random number in desired interval
+
+MotherIRandom: ; PROC
+%IFDEF UNIX
+ mov r8d, esi ; max
+ mov edx, edi ; min
+ lea rcx, [MotherInstance] ; Pthis = point to instance
+ jmp ?Windows_MotIRandom
+%ENDIF
+%IFDEF WINDOWS
+MotherIRandomD:
+ mov r8d, edx ; max
+ mov edx, ecx ; min
+ lea rcx, [MotherInstance] ; Pthis = point to instance
+ jmp ?Windows_MotIRandom
+%ENDIF
+; MotherIRandom ENDP
+
+; Thread-safe version:
+; extern "C" int MotIRandom(void * Pthis, int min, int max); // Get integer random number in desired interval
+MotIRandom:
+%IFDEF UNIX
+ ; translate calling convention
+ mov r8d, edx ; max
+ mov edx, esi ; min
+ mov rcx, rdi ; Pthis
+%ENDIF
+
+?Windows_MotIRandom: ; LABEL NEAR ; entry for Windows call
+ and rcx, -16 ; align
+ push r8
+ push rdx
+ call ?Windows_MotBRandom ; make random number
+ pop rcx ; min
+ pop r8 ; max
+ sub r8d, ecx
+ js short rerror ; max < min
+ inc r8d ; interval = max - min + 1
+ mul r8d ; multiply random number eax by interval and truncate
+ lea eax, [rdx+rcx] ; add min to interval*BRandom >> 32
+ ret ; ret 8 if not _cdecl calling
+
+rerror: mov eax, 80000000h ; error exit
+ ret ; ret 8 if not _cdecl calling
+;MotIRandom ENDP
+
+
+; Single threaded version:
+; extern "C" unsigned int MotherRandomInit(int seed); // Initialization
+
+MotherRandomInit: ; PROC
+%IFDEF UNIX
+ mov edx, edi ; seed
+ lea rcx, [MotherInstance] ; Pthis = point to instance
+ jmp ?Windows_MotRandomInit
+%ENDIF
+%IFDEF WINDOWS
+MotherRandomInitD:
+ mov edx, ecx ; seed
+ lea rcx, [MotherInstance] ; Pthis = point to instance
+ jmp ?Windows_MotRandomInit
+%ENDIF
+;MotherRandomInit ENDP
+
+; Thread-safe version:
+; extern "C" void MotRandomInit(void * Pthis, int seed); // Initialization
+MotRandomInit: ; PROC
+%IFDEF UNIX
+ ; translate calling convention
+ mov edx, esi ; seed
+ mov rcx, rdi ; Pthis
+%ENDIF
+
+?Windows_MotRandomInit: ; LABEL NEAR ; entry for Windows call
+ and rcx, -16 ; align
+ ; clear my buffer
+ push rdi
+ push rcx
+ mov rdi, rcx ; Pthis
+ add rdi, 16
+ mov ecx, (MotherSize - 16) / 4
+ xor eax, eax
+ cld
+ rep stosd
+ pop rcx
+
+ ; insert constants
+ mov dword [rcx+CRandomMotherA.one+4], 3FF00000H ; high dword of 1.0
+ mov dword [rcx+CRandomMotherA.MF0], 5115 ; factors
+ mov dword [rcx+CRandomMotherA.MF1], 1776
+ mov dword [rcx+CRandomMotherA.MF2], 1492
+ mov dword [rcx+CRandomMotherA.MF3], 2111111111
+
+ ; initialize from seed
+ mov eax, edx ; seed
+ ; make random numbers and put them into buffer
+ mov edx, 29943829
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.M0], eax
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.M1], eax
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.M2], eax
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.M3], eax
+ imul eax, edx
+ dec eax
+ mov [rcx+CRandomMotherA.MC], eax
+
+ ; randomize some more
+ mov edi, 20 ; loop counter
+r90: call ?Windows_MotBRandom ; (rcx and rdi unchanged)
+ dec edi
+ jnz r90
+ pop rdi
+ ret
+;MotRandomInit ENDP
+
+ ; END
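
Both the Mersenne Twister and the Mother-of-All code above turn 32 random bits into a double the same way: the bits go into the upper mantissa, the exponent field is forced to that of 1.0 (high dword 3FF00000H, the 'one' constant) to get a value in [1,2), and 1.0 is subtracted when the number is delivered. A C++ sketch of the trick (BitsToDouble is an illustrative name; IEEE 754 doubles assumed):

    #include <cstdint>
    #include <cstring>

    double BitsToDouble(uint32_t r) {
        uint64_t bits = ((uint64_t)r << 20)      // 32 random bits into mantissa bits 20..51
                      | (0x3FF00000ull << 32);   // sign/exponent pattern of 1.0
        double d;
        std::memcpy(&d, &bits, sizeof d);        // reinterpret as a double in [1,2)
        return d - 1.0;                          // uniform in [0,1) with 32 bits resolution
    }
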
diff --git a/asmlibSrc/physseed32.asm b/asmlibSrc/physseed32.asm
new file mode 100755
index 0000000..304b137
--- /dev/null
+++ b/asmlibSrc/physseed32.asm
@@ -0,0 +1,334 @@
+;************************* physseed32.asm **********************************
+; Author: Agner Fog
+; Date created: 2010-08-03
+; Last modified: 2013-09-13
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; C++ prototype:
+; extern "C" int PhysicalSeed(int seeds[], int NumSeeds);
+;
+; Description:
+; Generates a non-deterministic random seed from a physical random number generator
+; which is available on some processors.
+; Uses the time stamp counter (which is less random) if no physical random number
+; generator is available.
+; The code is not optimized for speed because it is typically called only once.
+;
+; Parameters:
+; int seeds[] An array which will be filled with random numbers
+; int NumSeeds Indicates the desired number of 32-bit random numbers
+;
+; Return value: 0 Failure. No suitable instruction available (processor older than Pentium)
+; 1 No physical random number generator. Used time stamp counter instead
+; 2 Success. VIA physical random number generator used
+; 3 Success. Intel physical random number generator used
+; 4 Success. Intel physical seed generator used
+;
+; The return value will indicate the availability of a physical random number generator
+; even if NumSeeds = 0.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define NUM_TRIES 20 ; max number of tries for rdseed and rdrand instructions
+
+%define TESTING 0 ; 1 for test only
+
+global _PhysicalSeed
+
+; Direct entries to CPU-specific versions
+global _PhysicalSeedNone: function
+global _PhysicalSeedRDTSC: function
+global _PhysicalSeedVIA: function
+global _PhysicalSeedRDRand: function
+global _PhysicalSeedRDSeed: function
+
+
+SECTION .text align=16
+
+_PhysicalSeed:
+
+%IFNDEF POSITIONINDEPENDENT
+
+ jmp near [PhysicalSeedDispatch] ; Go to appropriate version, depending on instructions available
+
+%ELSE ; Position-independent code
+
+ call get_thunk_edx ; get reference point for position-independent code
+RP1: ; Make the following instruction with address relative to RP1:
+ jmp near [edx+PhysicalSeedDispatch-RP1]
+
+%ENDIF
+
+_PhysicalSeedRDSeed:
+ push ebx
+ mov edx, [esp+8] ; seeds
+ mov ecx, [esp+12] ; NumSeeds
+ jecxz S300
+ ; do 32 bits at a time
+S100: mov ebx, NUM_TRIES
+S110: ; rdseed eax
+%if TESTING
+ mov eax, ecx
+ stc
+%ELSE
+ db 0Fh, 0C7h, 0F8h ; rdseed rax
+%ENDIF
+ jc S120
+ ; failed. try again
+ dec ebx
+ jz S900
+ jmp S110
+S120: mov [edx], eax
+ add edx, 4
+ dec ecx
+ jnz S100 ; loop 32 bits
+S300: mov eax, 4 ; return value
+ pop ebx
+ ret
+S900: ; failure
+ xor eax, eax ; return 0
+ pop ebx
+ ret
+
+_PhysicalSeedRDRand:
+ push ebx
+ mov edx, [esp+8] ; seeds
+ mov ecx, [esp+12] ; NumSeeds
+ jecxz R300
+ ; do 32 bits at a time
+R100: mov ebx, NUM_TRIES
+R110: ; rdrand eax
+%if TESTING
+ mov eax, ecx
+ stc
+%ELSE
+ db 0Fh, 0C7h, 0F0h ; rdrand eax
+%ENDIF
+ jc R120
+ ; failed. try again
+ dec ebx
+ jz R900
+ jmp R110
+R120: mov [edx], eax
+ add edx, 4
+ dec ecx
+ jnz R100 ; loop 32 bits
+R300: mov eax, 3 ; return value
+ pop ebx
+ ret
+R900: ; failure
+ xor eax, eax ; return 0
+ pop ebx
+ ret
+
+
+_PhysicalSeedVIA:
+; VIA XSTORE supported
+ push ebx
+ push esi
+ push edi
+ mov edi, [esp+16] ; seeds
+ mov ecx, [esp+20] ; NumSeeds
+ mov ebx, ecx
+ and ecx, -2 ; round down to nearest even
+ jz T200 ; NumSeeds <= 1
+ ; make an even number of random dwords
+ shl ecx, 2 ; number of bytes (divisible by 8)
+ mov edx, 3 ; quality factor
+%if TESTING
+ mov eax, 1
+ rep stosb
+%ELSE
+ db 0F3H, 00FH, 0A7H, 0C0H ; rep xstore instruction
+%ENDIF
+T200:
+ test ebx, 1
+ jz T300
+ ; NumSeeds is odd. Make 8 bytes in temporary buffer and store 4 of the bytes
+ mov esi, edi ; current output pointer
+ push ebp
+ mov ebp, esp
+ sub esp, 8 ; make temporary space on stack
+ and esp, -8 ; align by 8
+ mov edi, esp
+ mov ecx, 4 ; Will generate 4 or 8 bytes, depending on CPU
+ mov edx, 3 ; quality factor
+%if TESTING
+ mov eax, 1
+ rep stosb
+%ELSE
+ db 0F3H, 00FH, 0A7H, 0C0H ; rep xstore instruction
+%ENDIF
+ mov eax, [esp]
+ mov [esi], eax ; store the last 4 bytes
+ mov esp, ebp
+ pop ebp
+T300:
+ mov eax, 2 ; return value
+ pop edi
+ pop esi
+ pop ebx
+ ret
+
+
+_PhysicalSeedRDTSC:
+ push ebx
+ xor eax, eax
+ cpuid ; serialize
+ rdtsc ; get time stamp counter
+ mov ebx, [esp+8] ; seeds
+ mov ecx, [esp+12] ; NumSeeds
+ test ecx, ecx
+ jz U300 ; zero seeds
+ js U900 ; failure
+ mov [ebx], eax ; store time stamp counter as seeds[0]
+ add ebx, 4
+ dec ecx
+ jz U300
+ mov [ebx], edx ; store upper part of time stamp counter as seeds[1]
+ add ebx, 4
+ dec ecx
+ jz U300
+ xor eax, eax
+U100: mov [ebx], eax ; store 0 for the rest
+ add ebx, 4
+ dec ecx
+ jnz U100
+U300: mov eax, 1 ; return value
+ pop ebx
+ ret
+U900: ; failure
+ xor eax, eax ; return 0
+ pop ebx
+ ret
+
+
+_PhysicalSeedNone: ; no possible generation
+ mov edx, [esp+4] ; seeds
+ mov ecx, [esp+8] ; NumSeeds
+ xor eax, eax
+ jecxz N200
+N100: mov [edx], eax
+ add edx, 4
+ dec ecx
+ jnz N100
+N200: ret ; return 0
+
+
+PhysicalSeedDispatcher:
+ push ebx
+ pushfd
+ pop eax
+ btc eax, 21 ; check if CPUID bit can toggle
+ push eax
+ popfd
+ pushfd
+ pop ebx
+ xor ebx, eax
+ bt ebx, 21
+ jc FAILURE ; CPUID not supported
+
+ xor eax, eax ; 0
+ cpuid ; get number of CPUID functions
+ test eax, eax
+ jz FAILURE ; function 1 not supported
+
+ ; test if RDSEED supported
+ xor eax, eax
+ cpuid
+ cmp eax, 7
+ jb P200 ; RDSEED not supported
+ mov eax, 7
+ xor ecx, ecx
+ cpuid
+ bt ebx, 18
+ ; jc USE_RDSEED ; not tested yet!!
+
+P200: ; test if RDRAND supported
+ mov eax, 1
+ cpuid
+ bt ecx, 30
+ jc USE_RDRAND
+
+ ; test if VIA xstore instruction supported
+ mov eax, 0C0000000H
+ push eax
+ cpuid
+ pop ebx
+ cmp eax, ebx
+ jna P300 ; not a VIA processor
+ lea eax, [ebx+1]
+ cpuid
+ bt edx, 3
+ jc VIA_METHOD
+
+P300: ; test if RDTSC supported
+ mov eax, 1
+ cpuid
+ bt edx, 4
+ jc USE_RDTSC ; XSTORE instruction not supported or not enabled
+
+FAILURE: ; No useful instruction supported
+ mov edx, _PhysicalSeedNone
+ jmp P800
+
+USE_RDRAND: ; Use RDRAND instruction
+ mov edx, _PhysicalSeedRDRand
+ jmp P800
+
+USE_RDSEED: ; Use RDSEED instruction (not tested yet)
+ mov edx, _PhysicalSeedRDSeed
+ jmp P800
+
+VIA_METHOD: ; Use VIA xstore instructions
+ mov edx, _PhysicalSeedVIA
+ jmp P800
+
+USE_RDTSC:
+ mov edx, _PhysicalSeedRDTSC
+ ;jmp P800
+
+P800: mov [PhysicalSeedDispatch], edx
+ pop ebx
+ jmp edx ; continue in dispatched version
+
+
+%IFDEF POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+ mov edx, [esp]
+ ret
+%ENDIF
+
+
+; -----------------------------------------------------------------
+; DLL version, Windows only
+; -----------------------------------------------------------------
+%IFDEF WINDOWS
+
+_PhysicalSeedD@8:
+global _PhysicalSeedD@8
+ ; translate __cdecl to __stdcall calling
+ mov eax, [esp+4]
+ mov edx, [esp+8]
+ push edx
+ push eax
+ call _PhysicalSeed
+ pop ecx
+ pop ecx
+ ret 8
+
+%ENDIF ; WINDOWS
+
+
+; -----------------------------------------------------------------
+; Data section for dispatcher
+; -----------------------------------------------------------------
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+PhysicalSeedDispatch DD PhysicalSeedDispatcher
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+ DD 0, 0
+%ENDIF
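
PhysicalSeed, as documented in the header above, fills the array and reports which source was used (0 = no suitable instruction, 1 = time stamp counter fallback, 2 = VIA, 3 = Intel physical random number generator, 4 = Intel physical seed generator). A minimal C++ sketch of seeding the Mersenne Twister from it, using only the prototypes given in these files:

    #include <cstdio>

    extern "C" int PhysicalSeed(int seeds[], int NumSeeds);
    extern "C" void MersenneRandomInitByArray(unsigned int seeds[], int length);

    int main() {
        int seeds[2] = {0, 0};
        int source = PhysicalSeed(seeds, 2);            // return code 0..4 as listed above
        std::printf("seed source = %d\n", source);
        MersenneRandomInitByArray((unsigned int *)seeds, 2);
        return 0;
    }
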
diff --git a/asmlibSrc/physseed64.asm b/asmlibSrc/physseed64.asm
new file mode 100755
index 0000000..7dcecf4
--- /dev/null
+++ b/asmlibSrc/physseed64.asm
@@ -0,0 +1,394 @@
+;************************* physseed64.asm **********************************
+; Author: Agner Fog
+; Date created: 2010-08-03
+; Last modified: 2013-09-13
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; C++ prototype:
+; extern "C" int PhysicalSeed(int seeds[], int NumSeeds);
+;
+; Description:
+; Generates a non-deterministic random seed from a physical random number generator
+; which is available on some processors.
+; Uses the time stamp counter (which is less random) if no physical random number
+; generator is available.
+; The code is not optimized for speed because it is typically called only once.
+;
+; Parameters:
+; int seeds[] An array which will be filled with random numbers
+; int NumSeeds Indicates the desired number of 32-bit random numbers
+;
+; Return value: 0 Failure. No suitable instruction available (processor older than Pentium)
+; 1 No physical random number generator. Used time stamp counter instead
+; 2 Success. VIA physical random number generator used
+; 3 Success. Intel physical random number generator used
+; 4 Success. Intel physical seed generator used
+;
+; The return value will indicate the availability of a physical random number generator
+; even if NumSeeds = 0.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+%define NUM_TRIES 20 ; max number of tries for rdseed and rdrand instructions
+
+%define TESTING 0 ; 1 for test only
+
+global PhysicalSeed
+
+; Direct entries to CPU-specific versions
+global PhysicalSeedNone: function
+global PhysicalSeedRDTSC: function
+global PhysicalSeedVIA: function
+global PhysicalSeedRDRand: function
+global PhysicalSeedRDSeed: function
+
+; ***************************************************************************
+; Define registers used for function parameters, used in 64-bit mode only
+; ***************************************************************************
+
+%IFDEF WINDOWS
+ %define par1 rcx
+ %define par2 rdx
+ %define par3 r8
+ %define par1d ecx
+ %define par2d edx
+ %define par3d r8d
+%ENDIF
+
+%IFDEF UNIX
+ %define par1 rdi
+ %define par2 rsi
+ %define par3 rdx
+ %define par1d edi
+ %define par2d esi
+ %define par3d edx
+%ENDIF
+
+
+SECTION .text align=16
+
+%IFDEF WINDOWS
+global PhysicalSeedD@8 ; DLL version
+PhysicalSeedD@8:
+%ENDIF
+
+PhysicalSeed:
+ jmp [PhysicalSeedDispatch] ; Go to appropriate version, depending on instructions available
+
+
+PhysicalSeedRDSeed:
+ push rbx
+ test par2d, par2d ; NumSeeds
+ jz S300
+ js S900
+ mov par3d, par2d ; NumSeeds
+ shr par3d, 1
+ jz S150
+ ; do 64 bits at a time
+S100: mov ebx, NUM_TRIES
+S110: ; rdseed rax
+%if TESTING
+ mov eax, par3d
+ stc
+%ELSE
+ db 48h, 0Fh, 0C7h, 0F8h ; rdseed rax
+%ENDIF
+ jc S120
+ ; failed. try again
+ dec ebx
+ jz S900
+ jmp S110
+S120: mov [par1], rax
+ add par1, 8
+ dec par3d
+ jnz S100 ; loop 64 bits
+S150:
+ and par2d, 1
+ jz S300
+ ; an odd 32 bit remains
+S200: mov ebx, NUM_TRIES
+S210: ; rdseed eax
+%if TESTING
+ mov eax, par3d
+ stc
+%ELSE
+ db 0Fh, 0C7h, 0F8h ; rdseed eax
+%ENDIF
+ jc S220
+ ; failed. try again
+ dec ebx
+ jz S900
+ jmp S210
+S220: mov [par1], eax
+S300: mov eax, 4 ; return value
+ pop rbx
+ ret
+S900: ; failure
+ xor eax, eax ; return 0
+ pop rbx
+ ret
+
+
+PhysicalSeedRDRand:
+ push rbx
+ test par2d, par2d ; NumSeeds
+ jz R300
+ js R900
+ mov par3d, par2d ; NumSeeds
+ shr par3d, 1 ; NumSeeds/2
+ jz R150
+ ; do 64 bits at a time
+R100: mov ebx, NUM_TRIES
+R110: ; rdrand rax
+%if TESTING
+ mov eax, par3d
+ stc
+%ELSE
+ db 48h, 0Fh, 0C7h, 0F0h ; rdrand rax
+%ENDIF
+ jc R120
+ ; failed. try again
+ dec ebx
+ jz R900
+ jmp R110
+R120: mov [par1], rax
+ add par1, 8
+ dec par3d
+ jnz R100 ; loop 64 bits
+R150:
+ and par2d, 1
+ jz R300
+ ; an odd 32 bit remains
+R200: mov ebx, NUM_TRIES
+R210: ; rdrand eax
+%if TESTING
+ mov eax, par3d
+ stc
+%ELSE
+ db 0Fh, 0C7h, 0F0h ; rdrand eax
+%ENDIF
+ jc R220
+ ; failed. try again
+ dec ebx
+ jz R900
+ jmp R210
+R220: mov [par1], eax
+R300: mov eax, 4 ; return value
+ pop rbx
+ ret
+R900: ; failure
+ xor eax, eax ; return 0
+ pop rbx
+ ret
+
+
+PhysicalSeedVIA:
+; VIA XSTORE supported
+ push rbx
+%IFDEF WINDOWS
+ push rsi
+ push rdi
+ mov rdi, rcx ; seeds
+ mov esi, edx ; NumSeeds
+%ENDIF
+ mov ecx, esi ; NumSeeds
+ and ecx, -2 ; round down to nearest even
+ jz T200 ; NumSeeds <= 1
+ ; make an even number of random dwords
+ shl ecx, 2 ; number of bytes (divisible by 8)
+ mov edx, 3 ; quality factor
+%if TESTING
+ mov eax, 1
+ rep stosb
+%ELSE
+ db 0F3H, 00FH, 0A7H, 0C0H ; rep xstore instruction
+%ENDIF
+T200:
+ test esi, 1
+ jz T300
+ ; NumSeeds is odd. Make 8 bytes in temporary buffer and store 4 of the bytes
+ mov rbx, rdi ; current output pointer
+ mov ecx, 4 ; Will generate 4 or 8 bytes, depending on CPU
+ mov edx, 3 ; quality factor
+ push rcx ; make temporary space on stack
+ mov rdi, rsp ; point to buffer on stack
+%if TESTING
+ mov eax, 1
+ rep stosb
+%ELSE
+ db 0F3H, 00FH, 0A7H, 0C0H ; rep xstore instruction
+%ENDIF
+ pop rax
+ mov [rbx], eax ; store the last 4 bytes
+T300:
+ mov eax, 2 ; return value
+%IFDEF WINDOWS
+ pop rdi
+ pop rsi
+%ENDIF
+ pop rbx
+ ret
+
+
+PhysicalSeedRDTSC:
+%IFDEF WINDOWS
+ push rbx
+ push rcx
+ push rdx
+ xor eax, eax
+ cpuid ; serialize
+ rdtsc ; get time stamp counter
+ pop rbx ; numseeds
+ pop rcx ; seeds
+ test ebx, ebx
+ jz U300 ; zero seeds
+ js U900 ; failure
+ mov [rcx], eax ; store time stamp counter as seeds[0]
+ add rcx, 4
+ dec ebx
+ jz U300
+ mov [rcx], edx ; store upper part of time stamp counter as seeds[1]
+ add rcx, 4
+ dec ebx
+ jz U300
+ xor eax, eax
+U100: mov [rcx], eax ; store 0 for the rest
+ add rcx, 4
+ dec ebx
+ jnz U100
+U300: mov eax, 1 ; return value
+ pop rbx
+ ret
+U900: ; failure
+ xor eax, eax ; return 0
+ pop rbx
+ ret
+
+%ELSE ; UNIX
+
+ push rbx
+ xor eax, eax
+ cpuid ; serialize
+ rdtsc ; get time stamp counter
+ test esi, esi ; numseeds
+ jz U300 ; zero seeds
+ js U900 ; failure
+ mov [rdi], eax ; store time stamp counter as seeds[0]
+ add rdi, 4
+ dec esi
+ jz U300
+ mov [rdi], edx ; store upper part of time stamp counter as seeds[1]
+ add rdi, 4
+ dec esi
+ jz U300
+ xor eax, eax
+U100: mov [rdi], eax ; store 0 for the rest
+ add rdi, 4
+ dec esi
+ jnz U100
+U300: mov eax, 1 ; return value
+ pop rbx
+ ret
+U900: ; failure
+ xor eax, eax ; return 0
+ pop rbx
+ ret
+
+%ENDIF
+
+
+PhysicalSeedNone: ; no possible generation
+ xor eax, eax
+ test par2d, par2d ; numseeds
+ jz N200
+N100: mov [par1], eax
+ add par1, 4
+ dec par2d
+ jnz N100
+N200: ret ; return 0
+
+
+PhysicalSeedDispatcher:
+ push rbx
+%IFDEF WINDOWS
+ push rcx
+ push rdx
+%ENDIF
+ ; test if RDSEED supported
+ xor eax, eax
+ cpuid
+ cmp eax, 7
+ jb P200 ; RDSEED not supported
+ mov eax, 7
+ xor ecx, ecx
+ cpuid
+ bt ebx, 18
+ ; jc USE_RDSEED ; not tested yet!!
+
+P200: ; test if RDRAND supported
+ mov eax, 1
+ cpuid
+ bt ecx, 30
+ jc USE_RDRAND
+
+ ; test if VIA xstore instruction supported
+ mov eax, 0C0000000H
+ push rax
+ cpuid
+ pop rbx
+ cmp eax, ebx
+ jna P300 ; not a VIA processor
+ lea eax, [rbx+1]
+ cpuid
+ bt edx, 3
+ jc VIA_METHOD
+
+P300: ; test if RDTSC supported
+ mov eax, 1
+ cpuid
+ bt edx, 4
+ jc USE_RDTSC ; RDTSC supported (XSTORE not supported or not enabled)
+
+FAILURE: ; No useful instruction supported
+ lea rax, [PhysicalSeedNone]
+ jmp P800
+
+USE_RDRAND: ; Use RDRAND instruction
+ lea rax, [PhysicalSeedRDRand]
+ jmp P800
+
+USE_RDSEED: ; Use RDSEED instruction (not tested yet)
+ lea rax, [PhysicalSeedRDSeed]
+ jmp P800
+
+VIA_METHOD: ; Use VIA xstore instructions
+ lea rax, [PhysicalSeedVIA]
+ jmp P800
+
+USE_RDTSC:
+ lea rax, [PhysicalSeedRDTSC]
+ ;jmp P800
+
+P800: mov [PhysicalSeedDispatch], rax
+%IFDEF WINDOWS
+ pop rdx
+ pop rcx
+%ENDIF
+ pop rbx
+ jmp rax ; continue in dispatched version
+
+
+; -----------------------------------------------------------------
+; Data section for dispatcher
+; -----------------------------------------------------------------
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+PhysicalSeedDispatch DQ PhysicalSeedDispatcher
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+ DD 0, 0
+%ENDIF
diff --git a/asmlibSrc/popcount32.asm b/asmlibSrc/popcount32.asm
new file mode 100755
index 0000000..29b137d
--- /dev/null
+++ b/asmlibSrc/popcount32.asm
@@ -0,0 +1,137 @@
+;************************* popcount32.asm ************************************
+; Author: Agner Fog
+; Date created: 2011-07-20
+; Last modified: 2011-08-21
+
+; Description:
+; Population count function. Counts the number of 1-bits in a 32-bit integer
+; unsigned int A_popcount (unsigned int x);
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
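+;
+; The generic version below uses the classic shift-and-add (SWAR) reduction.
+; Illustrative C sketch of the same algorithm:
+;
+;    unsigned int popcount_generic(unsigned int x) {
+;        x = (x & 0x55555555) + ((x >> 1) & 0x55555555);  // 2-bit sums
+;        x = (x & 0x33333333) + ((x >> 2) & 0x33333333);  // 4-bit sums
+;        x = (x + (x >> 4)) & 0x0F0F0F0F;                 // 8-bit sums
+;        x += x >> 8;                                     // 16-bit sums
+;        x += x >> 16;                                    // 32-bit total
+;        return x & 0x3F;
+;    }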
+
+global _A_popcount: function
+
+; Direct entries to CPU-specific versions
+global _popcountGeneric: function
+global _popcountSSE42: function
+
+; Imported from instrset32.asm:
+extern _InstructionSet ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+; popcount function
+;******************************************************************************
+
+
+_A_popcount: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+ jmp near [popcountDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE ; Position-independent code
+
+ call get_thunk_edx ; get reference point for position-independent code
+RP1: ; reference point edx = offset RP1
+
+; Make the following instruction with address relative to RP1:
+ jmp near [edx+popcountDispatch-RP1]
+
+%ENDIF
+
+align 16
+_popcountSSE42: ; SSE4.2 version
+ popcnt eax, dword [esp+4]
+ ret
+
+
+;******************************************************************************
+; popcount function generic
+;******************************************************************************
+
+_popcountGeneric: ; Generic version
+ mov eax, [esp+4] ; x
+ mov edx, eax
+ shr eax, 1
+ and eax, 55555555h ; odd bits in eax, even bits in edx
+ and edx, 55555555h
+ add eax, edx
+ mov edx, eax
+ shr eax, 2
+ and eax, 33333333h
+ and edx, 33333333h
+ add eax, edx
+ mov edx, eax
+ shr eax, 4
+ add eax, edx
+ and eax, 0F0F0F0Fh
+ mov edx, eax
+ shr eax, 8
+ add eax, edx
+ mov edx, eax
+ shr eax, 16
+ add eax, edx
+ and eax, 03FH
+ ret
+;_popcountGeneric end
+
+; ********************************************************************************
+
+%IFDEF POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+ mov edx, [esp]
+ ret
+%ENDIF
+
+; ********************************************************************************
+; CPU dispatching for popcount. This is executed only once
+; ********************************************************************************
+
+popcountCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ ; get supported instruction set
+ call _InstructionSet
+ ; Point to generic version
+ mov ecx, _popcountGeneric
+ cmp eax, 9 ; check popcnt supported
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of popcount
+ mov ecx, _popcountSSE42
+Q100: mov [popcountDispatch], ecx
+ ; Continue in appropriate version
+ jmp ecx
+
+%ELSE ; Position-independent version
+ ; get supported instruction set
+ call _InstructionSet
+ call get_thunk_edx
+RP10: ; reference point edx
+ ; Point to generic version
+ lea ecx, [edx+_popcountGeneric-RP10]
+ cmp eax, 9 ; check popcnt supported
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of popcount
+ lea ecx, [edx+_popcountSSE42-RP10]
+Q100: mov [edx+popcountDispatch-RP10], ecx
+ ; Continue in appropriate version
+ jmp ecx
+%ENDIF
+
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+popcountDispatch DD popcountCPUDispatch
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+ DD 0, 0
+%ENDIF
diff --git a/asmlibSrc/popcount64.asm b/asmlibSrc/popcount64.asm
new file mode 100755
index 0000000..f05e0c9
--- /dev/null
+++ b/asmlibSrc/popcount64.asm
@@ -0,0 +1,110 @@
+;************************* popcount64.asm ************************************
+; Author: Agner Fog
+; Date created: 2011-07-20
+; Last modified: 2011-07-20
+
+; Description:
+; Population count function. Counts the number of 1-bits in a 32-bit integer
+; unsigned int A_popcount (unsigned int x);
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+global A_popcount: function
+
+; Direct entries to CPU-specific versions
+global popcountGeneric: function
+global popcountSSE42: function
+
+; Imported from instrset64.asm:
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+; popcount function
+;******************************************************************************
+
+
+A_popcount: ; function dispatching
+ jmp near [popcountDispatch] ; Go to appropriate version, depending on instruction set
+
+align 16
+popcountSSE42: ; SSE4.2 version
+%ifdef WINDOWS
+ popcnt eax, ecx
+%else
+ popcnt eax, edi
+%endif
+ ret
+
+
+;******************************************************************************
+; popcount function generic
+;******************************************************************************
+
+popcountGeneric: ; Generic version
+%ifdef WINDOWS
+ mov eax, ecx
+%else
+ mov eax, edi
+%endif
+ mov edx, eax
+ shr eax, 1
+ and eax, 55555555h ; odd bits in eax, even bits in edx
+ and edx, 55555555h
+ add eax, edx
+ mov edx, eax
+ shr eax, 2
+ and eax, 33333333h
+ and edx, 33333333h
+ add eax, edx
+ mov edx, eax
+ shr eax, 4
+ add eax, edx
+ and eax, 0F0F0F0Fh
+ mov edx, eax
+ shr eax, 8
+ add eax, edx
+ mov edx, eax
+ shr eax, 16
+ add eax, edx
+ and eax, 03FH
+ ret
+;popcountGeneric end
+
+; ********************************************************************************
+; CPU dispatching for popcount. This is executed only once
+; ********************************************************************************
+
+%ifdef WINDOWS
+%define par1 rcx ; parameter 1, the function argument x
+%else
+%define par1 rdi ; parameter 1, the function argument x
+%endif
+
+popcountCPUDispatch:
+ ; get supported instruction set
+ push par1
+ call InstructionSet
+ pop par1
+ ; Point to generic version of popcount
+ lea rdx, [popcountGeneric]
+ cmp eax, 9 ; check popcnt supported
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of popcount
+ lea rdx, [popcountSSE42]
+Q100: mov [popcountDispatch], rdx
+ ; Continue in appropriate version
+ jmp rdx
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+popcountDispatch DQ popcountCPUDispatch
diff --git a/asmlibSrc/procname32.asm b/asmlibSrc/procname32.asm
new file mode 100755
index 0000000..23f16bf
--- /dev/null
+++ b/asmlibSrc/procname32.asm
@@ -0,0 +1,186 @@
+; procname32.asm
+;
+; Author: Agner Fog
+; Date created: 2007
+; Last modified: 2013-09-11
+; Description:
+; ProcessorName
+; =============
+; This function produces a zero-terminated ASCII string containing a name
+; for the microprocessor in human-readable format.
+;
+; Copyright (c) 2007-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global _ProcessorName: function
+
+
+SECTION .data
+align 16
+
+NameBuffer times 50H db 0 ; Static buffer to contain name
+
+
+SECTION .text align=16
+
+%IFDEF POSITIONINDEPENDENT
+; Local function for reading instruction pointer into edi
+GetThunkEDI:
+ mov edi, [esp]
+ ret
+%ENDIF ; End IF POSITIONINDEPENDENT
+
+
+; ********** ProcessorName function **********
+; C++ prototype:
+; extern "C" char * ProcessorName ();
+
+; This function finds the name of the microprocessor. The name is written to an
+; internal static buffer (NameBuffer) and a pointer to this zero-terminated
+; string is returned.
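+;
+; Illustrative usage sketch (the printed text is an example, not a guaranteed format):
+;
+;    #include <cstdio>
+;    extern "C" char * ProcessorName();
+;    int main() { std::printf("%s\n", ProcessorName()); }  // brand string + " Family xH Model yH"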
+
+_ProcessorName:
+ push ebx
+ push edi
+
+; Make edi point to NameBuffer:
+
+%IFDEF POSITIONINDEPENDENT
+ ; Position-independent code. Get edi = eip for reference point
+ call GetThunkEDI
+ add edi, NameBuffer - $
+%ELSE
+ ; Normal code requiring base relocation:
+ mov edi, NameBuffer
+%ENDIF
+
+; detect if CPUID instruction supported by microprocessor:
+ pushfd
+ pop eax
+ xor eax, 1 << 21 ; Check if CPUID bit can toggle
+ push eax
+ popfd
+ pushfd
+ pop ebx
+ xor eax, ebx
+ and eax, 1 << 21
+ jnz NOID ; CPUID not supported
+ xor eax, eax
+ cpuid ; Get number of CPUID functions
+ test eax, eax
+ jnz IDENTIFYABLE ; Function 1 supported
+
+NOID:
+ ; processor has no CPUID
+ mov DWORD [edi], '8038' ; Write text '80386 or 80486'
+ mov DWORD [edi+4], '6 or'
+ mov DWORD [edi+8], ' 804'
+ mov DWORD [edi+12], '86' ; End with 0
+ jmp PNEND
+
+IDENTIFYABLE:
+ mov eax, 80000000H
+ cpuid
+ cmp eax, 80000004H ; Test if extended vendor string available
+ jb no_ext_vendor_string
+
+ ; Has extended vendor string
+ mov eax, 80000002H
+ cpuid
+ mov [edi], eax ; Store 16 bytes of extended vendor string
+ mov [edi+4], ebx
+ mov [edi+8], ecx
+ mov [edi+0CH], edx
+ mov eax, 80000003H
+ cpuid
+ mov [edi+10H], eax ; Next 16 bytes
+ mov [edi+14H], ebx
+ mov [edi+18H], ecx
+ mov [edi+1CH], edx
+ mov eax, 80000004H
+ cpuid
+ mov [edi+20H], eax ; Next 16 bytes
+ mov [edi+24H], ebx
+ mov [edi+28H], ecx
+ mov [edi+2CH], edx
+ jmp get_family_and_model
+
+no_ext_vendor_string:
+ ; No extended vendor string. Get short vendor string
+ xor eax, eax
+ cpuid
+ mov [edi],ebx ; Store short vendor string
+ mov [edi+4],edx
+ mov [edi+8],ecx
+ mov byte [edi+12],0 ; Terminate string
+
+get_family_and_model:
+ push edi ; Save string address
+ xor eax, eax
+ mov ecx, 30H
+ cld
+ repne scasb ; Find end of text
+ dec edi
+ mov dword [edi], ' Fam' ; Append text " Family "
+ mov dword [edi+4], 'ily '
+ add edi, 8
+
+ mov eax, 1
+ cpuid ; Get family and model
+ mov ebx, eax
+ mov ecx, eax
+ shr eax, 8
+ and eax, 0FH ; Family
+ shr ecx, 20
+ and ecx, 0FFH ; Extended family
+ add eax, ecx ; Family + extended family
+ call WriteHex ; Write as hexadecimal
+
+ mov dword [edi], 'H Mo' ; Write text "H Model "
+ mov dword [edi+4], 'del '
+ add edi, 8
+
+ mov eax, ebx
+ shr eax, 4
+ and eax, 0FH ; Model
+ mov ecx, ebx
+ shr ecx, 12
+ and ecx, 0F0H ; Extended model
+ or eax, ecx ; Model | extended model
+ call WriteHex ; Write as hexadecimal
+
+ mov dword [edi], 'H' ; Write text "H"
+ pop edi ; Restore string address
+
+PNEND: ; finished
+ mov eax, edi ; Pointer to result
+ pop edi
+ pop ebx
+ ret
+;_ProcessorName ENDP
+
+WriteHex: ; Local function: Write 2 hexadecimal digits
+ ; Parameters: AL = number to write, EDI = text destination
+ mov ecx, eax
+ shr ecx, 4
+ and ecx, 0FH ; most significant digit first
+ cmp ecx, 10
+ jnb W1
+ ; 0 - 9
+ add ecx, '0'
+ jmp W2
+W1: ; A - F
+ add ecx, 'A' - 10
+W2: mov [edi], cl ; write digit
+
+ mov ecx, eax
+ and ecx, 0FH ; next digit
+ cmp ecx, 10
+ jnb W3
+ ; 0 - 9
+ add ecx, '0'
+ jmp W4
+W3: ; A - F
+ add ecx, 'A' - 10
+W4: mov [edi+1], cl ; write digit
+ add edi, 2 ; advance string pointer
+ ret
diff --git a/asmlibSrc/procname64.asm b/asmlibSrc/procname64.asm
new file mode 100755
index 0000000..e65c384
--- /dev/null
+++ b/asmlibSrc/procname64.asm
@@ -0,0 +1,143 @@
+; procname64.asm
+;
+; Author: Agner Fog
+; Date created: 2007
+; Last modified: 2011-07-02
+; Description:
+; ProcessorName
+; =============
+; This function produces a zero-terminated ASCII string containing a name
+; for the microprocessor in human-readable format.
+;
+; Copyright (c) 2007-2011 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global ProcessorName: function
+
+SECTION .data
+align 16
+
+NameBuffer times 50H db 0 ; Static buffer to contain name
+
+
+SECTION .text align=16
+
+; ********** ProcessorName function **********
+; C++ prototype:
+; extern "C" char * ProcessorName ();
+
+; This function finds the name of the microprocessor. The name is written to an
+; internal static buffer (NameBuffer) and a pointer to this zero-terminated
+; string is returned.
+
+ProcessorName:
+ push rbx
+ push rdi
+ lea rdi, [NameBuffer] ; text pointer
+
+ mov eax, 80000000H
+ cpuid
+ cmp eax, 80000004H ; test if extended vendor string available
+ jb no_ext_vendor_string
+
+ ; Has extended vendor string
+ mov eax, 80000002H
+ cpuid
+ mov [rdi], eax ; store 16 bytes of extended vendor string
+ mov [rdi+4], ebx
+ mov [rdi+8], ecx
+ mov [rdi+0CH], edx
+ mov eax, 80000003H
+ cpuid
+ mov [rdi+10H], eax ; next 16 bytes
+ mov [rdi+14H], ebx
+ mov [rdi+18H], ecx
+ mov [rdi+1CH], edx
+ mov eax, 80000004H
+ cpuid
+ mov [rdi+20H], eax ; next 16 bytes
+ mov [rdi+24H], ebx
+ mov [rdi+28H], ecx
+ mov [rdi+2CH], edx
+ jmp get_family_and_model
+
+no_ext_vendor_string:
+ ; No extended vendor string. Get short vendor string
+ xor eax, eax
+ cpuid
+ mov [rdi],ebx ; store short vendor string
+ mov [rdi+4],edx
+ mov [rdi+8],ecx
+ mov byte [rdi+12],0 ; terminate string
+
+get_family_and_model:
+ xor eax, eax
+ mov ecx, 30H
+ cld
+ repne scasb ; find end of text
+ dec rdi
+
+ mov dword [rdi], ' Fam' ; Append text " Family "
+ mov dword [rdi+4], 'ily '
+ add rdi, 8
+
+ mov eax, 1
+ cpuid ; Get family and model
+ mov ebx, eax
+ mov ecx, eax
+ shr eax, 8
+ and eax, 0FH ; Family
+ shr ecx, 20
+ and ecx, 0FFH ; Extended family
+ add eax, ecx ; Family + extended family
+ call WriteHex ; Write as hexadecimal
+
+ mov dword [rdi], 'H Mo' ; Write text "H Model "
+ mov dword [rdi+4], 'del '
+ add rdi, 8
+
+ mov eax, ebx
+ shr eax, 4
+ and eax, 0FH ; Model
+ mov ecx, ebx
+ shr ecx, 12
+ and ecx, 0F0H ; Extended model
+ or eax, ecx ; Model | extended model
+ call WriteHex ; Write as hexadecimal
+
+ mov dword [rdi], 'H' ; Write text "H"
+
+PNEND: ; finished
+ lea rax, [NameBuffer] ; Pointer to result
+ pop rdi
+ pop rbx
+ ret
+;ProcessorName ENDP
+
+WriteHex: ; Local function: Write 2 hexadecimal digits
+ ; Parameters: AL = number to write, RDI = text destination
+ mov ecx, eax
+ shr ecx, 4
+ and ecx, 0FH ; most significant digit first
+ cmp ecx, 10
+ jnb W1
+ ; 0 - 9
+ add ecx, '0'
+ jmp W2
+W1: ; A - F
+ add ecx, 'A' - 10
+W2: mov [rdi], cl ; write digit
+
+ mov ecx, eax
+ and ecx, 0FH ; next digit
+ cmp ecx, 10
+ jnb W3
+ ; 0 - 9
+ add ecx, '0'
+ jmp W4
+W3: ; A - F
+ add ecx, 'A' - 10
+W4: mov [rdi+1], cl ; write digit
+ add rdi, 2 ; advance string pointer
+ ret
diff --git a/asmlibSrc/randomah.asi b/asmlibSrc/randomah.asi
new file mode 100755
index 0000000..ed7a018
--- /dev/null
+++ b/asmlibSrc/randomah.asi
@@ -0,0 +1,290 @@
+; ----------------------------- RANDOMAH.ASI ---------------------------
+;
+; Author: Agner Fog
+; Date created: 1998
+; Last modified: 2013-09-09
+; Description:
+; Assembly include file containing
+; structure/class definitions for random number generators
+;
+; Copyright (c) 1998-2013 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Definitions for Mersenne Twister:
+
+TEMPERING EQU 1 ; set to 0 if no tempering (improves speed by 25%)
+
+%if 0
+; define constants for MT11213A:
+MERS_N EQU 351
+MERS_M EQU 175
+MERS_R EQU 19
+MERS_A EQU 0E4BD75F5H
+MERS_U EQU 11
+MERS_S EQU 7
+MERS_T EQU 15
+MERS_L EQU 17
+MERS_B EQU 655E5280H
+MERS_C EQU 0FFD58000H
+
+%ELSE
+; or constants for MT19937:
+MERS_N EQU 624
+MERS_M EQU 397
+MERS_R EQU 31
+MERS_A EQU 09908B0DFH
+MERS_U EQU 11
+MERS_S EQU 7
+MERS_T EQU 15
+MERS_L EQU 18
+MERS_B EQU 9D2C5680H
+MERS_C EQU 0EFC60000H
+
+%ENDIF
+
+LOWER_MASK EQU (1 << MERS_R) - 1 ; lower MERS_R bits
+UPPER_MASK EQU -1 << MERS_R ; upper 32-MERS_R bits
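+
+; Worked example for MT19937 (MERS_R = 31): LOWER_MASK = 7FFFFFFFH and
+; UPPER_MASK = 80000000H, i.e. the low 31 bits and the top bit of a 32-bit word.
+; The constants MERS_U/S/T/L/B/C parameterize the standard MT tempering
+; transform (applied when TEMPERING = 1). Illustrative C sketch:
+;
+;    unsigned int temper(unsigned int y) {
+;        y ^= y >> MERS_U;               // MERS_U = 11
+;        y ^= (y << MERS_S) & MERS_B;    // MERS_S = 7,  MERS_B = 9D2C5680H
+;        y ^= (y << MERS_T) & MERS_C;    // MERS_T = 15, MERS_C = EFC60000H
+;        y ^= y >> MERS_L;               // MERS_L = 18
+;        return y;
+;    }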
+
+; Define class CRandomMersenneA member data
+; Must be aligned by 16.
+
+STRUC CRandomMersenneA
+.Fill1 RESD 4 ; Alignment filler
+.PreInt: RESD 4 ; premade tempered integer numbers, ready to use
+.PreFlt: RESQ 4 ; premade floating point numbers, ready to use (subtract 1.0)
+ RESQ 1 ; last PreFlt unaligned overrun if MERS_N mod 4 = 1
+.TmpFlt: RESQ 1 ; temporary storage of floating point random number
+.PreInx: RESD 1 ; index to next PreInt and PreFlt number
+.Instset: RESD 1 ; Instruction set
+.LastInterval: RESD 1 ; Last interval length for IRandomX
+.RLimit: RESD 1 ; Rejection limit used by IRandomX
+.TMB: RESD 4 ; 4 copies of MERS_B constant
+.TMC: RESD 4 ; 4 copies of MERS_C constant
+.one: RESQ 2 ; 2 copies of 1.0 constant
+.MTI: RESD 1 ; index into MT buffer
+.UMASK: RESD 1 ; UPPER_MASK
+.LMASK: RESD 1 ; LOWER_MASK ; constants
+.MATA: RESD 1 ; MERS_A
+.wrap1: RESD 4 ; MT buffer km wraparound
+.MT: RESD MERS_N ; MT history buffer (aligned by 16)
+.wrap2: RESD 4 ; MT buffer kk wraparound
+%if MERS_N & 3
+ ; MERS_N not divisible by 4. align by 4
+ RESD (4 - (MERS_N & 3))
+%ENDIF
+endstruc ; CRandomMersenneA
+
+
+; Definitions for Mother-of-all generator:
+
+; Define class CRandomMotherA member data
+; Must be aligned by 16. Preferably aligned by 64 to fit a cache line
+STRUC CRandomMotherA
+.Fill2 RESD 4 ; Alignment filler
+.one RESQ 1 ; 1.0
+.Instset RESD 1 ; Instruction set
+.M4 RESD 1 ; x[n-4]
+.M3 RESD 1 ; x[n-3] (aligned)
+.M2 RESD 1 ; x[n-2]
+.M1 RESD 1 ; x[n-1]
+.M0 RESD 1 ; x[n]
+.MC RESD 1 ; Carry (aligned)
+.zero RESD 1 ; Zero-extension of carry
+.RanP1 RESQ 1 ; Double random number in interval [1,2)
+.MF3 RESD 1 ; 2111111111 (aligned)
+.MF2 RESD 1 ; 1492
+.MF1 RESD 1 ; 1776
+.MF0 RESD 1 ; 5115
+endstruc ; CRandomMotherA
+
+MOTHERF0 EQU 5115 ; factor 0
+MOTHERF1 EQU 1776 ; factor 1
+MOTHERF2 EQU 1492 ; factor 2
+MOTHERF3 EQU 2111111111 ; factor 3
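+
+; These factors implement Marsaglia's Mother-Of-All recurrence (see Mother_Next
+; in sfmt32.asm). Illustrative C sketch using 64-bit intermediates:
+;
+;    uint64_t s = 2111111111ULL * x[n-4] + 1492ULL * x[n-3]
+;               + 1776ULL * x[n-2]       + 5115ULL * x[n-1] + carry;
+;    x[n]  = (uint32_t)s;           // low 32 bits: new state value / output
+;    carry = (uint32_t)(s >> 32);   // high 32 bits: new carry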
+
+
+; ***************************************************************************
+; Definitions for SFMT generator
+; ***************************************************************************
+
+; Choose Mersenne exponent.
+; Higher values give longer cycle length and use more memory:
+; MEXP equ 607
+; MEXP equ 1279
+; MEXP equ 2281
+; MEXP equ 4253
+ MEXP equ 11213
+; MEXP equ 19937
+; MEXP equ 44497
+
+%if MEXP == 44497
+SFMT_N equ 348 ; Size of state vector
+SFMT_M equ 330 ; Position of intermediate feedback
+SFMT_SL1 equ 5 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 9 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 3 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0effffffbH ;first DWORD of AND mask
+; AND mask:
+%define SFMT_MASK 0effffffbH,0dfbebfffH,0bfbf7befH,09ffd7bffH
+; Period certification vector
+%define SFMT_PARITY 1,0,0a3ac4000H,0ecc1327aH
+
+%elif MEXP == 19937
+SFMT_N equ 156 ; Size of state vector
+SFMT_M equ 122 ; Position of intermediate feedback
+SFMT_SL1 equ 18 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 1 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 11 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0dfffffefH ;first DWORD of AND mask
+%define SFMT_MASK 0dfffffefH,0ddfecb7fH,0bffaffffH,0bffffff6H
+%define SFMT_PARITY 1,0,0,013c9e684H
+
+%elif MEXP == 11213
+SFMT_N equ 88 ; Size of state vector
+SFMT_M equ 68 ; Position of intermediate feedback
+SFMT_SL1 equ 14 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 7 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 3 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0effff7fbH ;first DWORD of AND mask
+%define SFMT_MASK 0effff7fbH,0ffffffefH,0dfdfbfffH,07fffdbfdH
+%define SFMT_PARITY 1,0,0e8148000H,0d0c7afa3H
+
+%elif MEXP == 4253
+SFMT_N equ 34 ; Size of state vector
+SFMT_M equ 17 ; Position of intermediate feedback
+SFMT_SL1 equ 20 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 1 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 7 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 09f7bffffH ;first DWORD of AND mask
+%define SFMT_MASK 09f7bffffH,09fffff5fH,03efffffbH,0fffff7bbH
+%define SFMT_PARITY 0a8000001H,0af5390a3H,0b740b3f8H,06c11486dH
+
+%elif MEXP == 2281
+SFMT_N equ 18 ; Size of state vector
+SFMT_M equ 12 ; Position of intermediate feedback
+SFMT_SL1 equ 19 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 1 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 5 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0bff7ffbfH ;first DWORD of AND mask
+%define SFMT_MASK 0bff7ffbfH,0fdfffffeH,0f7ffef7fH,0f2f7cbbfH
+%define SFMT_PARITY 1,0,0,041dfa600H
+
+%elif MEXP == 1279
+SFMT_N equ 10 ; Size of state vector
+SFMT_M equ 7 ; Position of intermediate feedback
+SFMT_SL1 equ 14 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 5 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 1 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0f7fefffdH ;first DWORD of AND mask
+%define SFMT_MASK 0f7fefffdH,07fefcfffH,0aff3ef3fH,0b5ffff7fH
+%define SFMT_PARITY 1,0,0,020000000H
+
+%elif MEXP == 607
+SFMT_N equ 5 ; Size of state vector
+SFMT_M equ 2 ; Position of intermediate feedback
+SFMT_SL1 equ 15 ; Left shift of W[N-1], 32-bit words
+SFMT_SL2 equ 3 ; Left shift of W[0], *8, 128-bit words
+SFMT_SR1 equ 13 ; Right shift of W[M], 32-bit words
+SFMT_SR2 equ 3 ; Right shift of W[N-2], *8, 128-bit words
+SFMT_MASK1 equ 0fdff37ffH ;first DWORD of AND mask
+%define SFMT_MASK 0fdff37ffH,0ef7f3f7dH,0ff777b7dH,07ff7fb2fH
+%define SFMT_PARITY 1,0,0,05986f054H
+
+%ELSE
+%error MEXP must have one of the predefined values
+%ENDIF
+
+STRUC CRandomSFMTA
+.Fill3 RESD 4 ; Alignment filler
+
+; Parameters for Mother-Of-All generator:
+.M3: RESD 1 ; x[n-3] (aligned)
+ RESD 1 ; unused filler to fit the pmuludq instruction
+.M2: RESD 1 ; x[n-2]
+ RESD 1 ; unused filler to fit the pmuludq instruction
+.M1: RESD 1 ; x[n-1]
+ RESD 1 ; unused filler to fit the pmuludq instruction
+.M0: RESD 1 ; x[n]
+.MC: RESD 1 ; Carry (zero-extends into one)
+.one: RESQ 1 ; 1.0 (low dword = zero-extension of carry) (aligned)
+.TempRan: RESQ 1 ; Temporary random number
+.MF3: RESD 1 ; 2111111111 (aligned)
+.Instset: RESD 1 ; Instruction set
+.MF2: RESD 1 ; 1492 (MF3,MF2,MF1,MF0 interleaved with other variables to fit the pmuludq instruction)
+ RESD 1 ; Filler (may be used for read-only parameter, but not for read/write parameter)
+.MF1: RESD 1 ; 1776
+ RESD 1 ; Filler (may be used for read-only parameter, but not for read/write parameter)
+.MF0: RESD 1 ; 5115
+ RESD 1 ; Filler (may be used for read-only parameter, but not for read/write parameter)
+
+; Parameters for IRandomX:
+.LASTINTERVAL: RESD 1 ; Last interval length for IRandomX
+.RLIMIT: RESD 1 ; Rejection limit used by IRandomX
+
+; Parameters for SFMT generator:
+.USEMOTHER: RESD 1 ; 1 if combine with Mother-Of-All generator
+.IX: RESD 1 ; Index into state buffer for SFMT
+
+.AMASK: RESD 4 ; AND mask (aligned)
+.STATE: RESD SFMT_N*4 ; State vector (aligned)
+endstruc ; CRandomSFMTA
+
+
+; Load offset of TARGET into ecx. Use position-independent method if necessary
+%macro LOADOFFSET2ECX 1
+%IFNDEF POSITIONINDEPENDENT
+ mov ecx, %1
+%ELSE
+ ; get position-independent address of TARGET
+ call get_thunk_ecx
+ add ecx, %1 - $
+%ENDIF
+%endmacro
+
+; Load offset of TARGET into edi. Use position-independent method if necessary
+%macro LOADOFFSET2EDI 1
+%IFNDEF POSITIONINDEPENDENT
+ mov edi, %1
+%ELSE
+ ; get position-independent address of TARGET
+ call get_thunk_edi
+ add edi, %1 - $
+%ENDIF
+%endmacro
+
+
+; ***************************************************************************
+; Define registers used for function parameters, used in 64-bit mode only
+; ***************************************************************************
+
+%IFDEF WINDOWS
+ %define par1 rcx
+ %define par2 rdx
+ %define par3 r8
+ %define par4 r9
+ %define par5 qword [rsp+32+8] ; stack offset including shadow space
+ %define par1d ecx
+ %define par2d edx
+ %define par3d r8d
+ %define par4d r9d
+ %define par5d dword [rsp+32+8]
+%ENDIF
+
+%IFDEF UNIX
+ %define par1 rdi
+ %define par2 rsi
+ %define par3 rdx
+ %define par4 rcx
+ %define par5 r8
+ %define par1d edi
+ %define par2d esi
+ %define par3d edx
+ %define par4d ecx
+ %define par5d r8d
+%ENDIF
diff --git a/asmlibSrc/rdtsc32.asm b/asmlibSrc/rdtsc32.asm
new file mode 100755
index 0000000..1ada795
--- /dev/null
+++ b/asmlibSrc/rdtsc32.asm
@@ -0,0 +1,51 @@
+; RDTSC32.ASM
+;
+; Author: Agner Fog
+; Date created: 2003
+; Last modified: 2008-10-16
+; Description:
+;
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global _ReadTSC: function
+
+SECTION .text align=16
+
+; ********** ReadTSC function **********
+; C++ prototype:
+; extern "C" int ReadTSC (void);
+; or:
+; extern "C" __int64 ReadTSC (void);
+
+; This function returns the value of the time stamp counter, which counts
+; clock cycles. To count how many clock cycles a piece of code takes, call
+; ReadTSC before and after the code to be measured and calculate the difference.
+
+; The number of clock cycles taken by the ReadTSC function itself is approximately:
+; Core 2: 730
+; Pentium 4: 700
+; Pentium II and Pentium III: 225
+; AMD Athlon 64, Opteron: 126
+; Does not work on 80386 and 80486.
+
+; Note that clock counts may not be fully reproducible on Intel Core and
+; Core 2 processors because the clock frequency can change. More reliable
+; instruction timings are obtained with the performance monitor counter
+; for "core clock cycles". This requires a kernel mode driver such as the one
+; included with www.agner.org/optimize/testp.zip.
+
+_ReadTSC:
+ push ebx ; ebx is modified by cpuid
+ sub eax, eax ; 0
+ cpuid ; serialize
+ rdtsc ; read time stamp counter
+ push eax
+ push edx
+ sub eax, eax
+ cpuid ; serialize
+ pop edx
+ pop eax
+ pop ebx
+ ret
+;_ReadTSC ENDP
diff --git a/asmlibSrc/rdtsc64.asm b/asmlibSrc/rdtsc64.asm
new file mode 100755
index 0000000..cfa30a4
--- /dev/null
+++ b/asmlibSrc/rdtsc64.asm
@@ -0,0 +1,51 @@
+; RDTSC64.ASM
+;
+; Author: Agner Fog
+; Date created: 2003
+; Last modified: 2008-10-16
+; Description:
+;
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global ReadTSC: function
+
+SECTION .text align=16
+
+; ********** ReadTSC function **********
+; C++ prototype:
+; extern "C" __int64 ReadTSC (void);
+
+; This function returns the value of the time stamp counter, which counts
+; clock cycles. To count how many clock cycles a piece of code takes, call
+; ReadTSC before and after the code to be measured and calculate the difference.
+
+; The number of clock cycles taken by the ReadTSC function itself is approximately:
+; Core 2: 730
+; Pentium 4: 700
+; Pentium II and Pentium III: 225
+; AMD Athlon 64, Opteron: 126
+; Does not work on 80386 and 80486.
+
+; Note that clock counts may not be fully reproducible on Intel Core and
+; Core 2 processors because the clock frequency can change. More reliable
+; instruction timings are obtained with the performance monitor counter
+; for "core clock cycles". This requires a kernel mode driver such as the one
+; included with www.agner.org/optimize/testp.zip.
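+;
+; Illustrative timing sketch (code_to_measure is a hypothetical function;
+; the measured count includes the ReadTSC overhead listed above):
+;
+;    extern "C" long long ReadTSC(void);     // __int64 with MS compilers
+;    long long t0 = ReadTSC();
+;    code_to_measure();
+;    long long clocks = ReadTSC() - t0;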
+
+ReadTSC:
+ push rbx ; ebx is modified by cpuid
+ sub eax, eax ; 0
+ cpuid ; serialize
+ rdtsc ; read time stamp counter into edx:eax
+ shl rdx, 32
+ or rax, rdx ; combine into 64 bit register
+ push rax
+ sub eax, eax
+ cpuid ; serialize
+ pop rax ; return value
+ pop rbx
+ ret
+;ReadTSC ENDP
diff --git a/asmlibSrc/round32.asm b/asmlibSrc/round32.asm
new file mode 100755
index 0000000..eaa3cd4
--- /dev/null
+++ b/asmlibSrc/round32.asm
@@ -0,0 +1,41 @@
+; ROUND32.ASM
+
+; Author: Agner Fog
+; Date created: 2003
+; Last modified: 2008-10-16
+; Description:
+; Round function
+
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global _RoundD: function
+global _RoundF: function
+
+SECTION .text align=16
+
+; ********** round function **********
+; C++ prototype:
+; extern "C" int RoundD (double x);
+; extern "C" int RoundF (float x);
+
+; This function converts a single or double precision floating point number
+; to an integer, rounding to nearest or even. Does not check for overflow.
+; This function is much faster than the default conversion method in C++
+; which uses truncation.
+
+_RoundD:
+ fld qword [esp+4] ; Load x
+ push eax ; Make temporary space on stack
+ fistp dword [esp] ; Round. Store in temporary stack space
+ pop eax ; Read from temporary stack space
+ ret
+;_RoundD ENDP
+
+_RoundF:
+ fld dword [esp+4]
+ push eax
+ fistp dword [esp]
+ pop eax
+ ret
+;_RoundF ENDP
diff --git a/asmlibSrc/round64.asm b/asmlibSrc/round64.asm
new file mode 100755
index 0000000..826d4d9
--- /dev/null
+++ b/asmlibSrc/round64.asm
@@ -0,0 +1,38 @@
+; ROUND64.ASM
+
+; Author: Agner Fog
+; Date created: 2007-06-15
+; Last modified: 2008-10-16
+; Description:
+; Round function
+
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+
+global RoundD: function
+global RoundF: function
+
+
+SECTION .text align=16
+
+; ********** round function **********
+; C++ prototype:
+; extern "C" int RoundD (double x);
+; extern "C" int RoundF (float x);
+
+; This function converts a single or double precision floating point number
+; to an integer, rounding to nearest or even. Does not check for overflow.
+; This function is much faster than the default conversion method in C++
+; which uses truncation.
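+;
+; Illustrative usage sketch (assumes the default rounding mode, round-to-nearest-even):
+;
+;    extern "C" int RoundD(double x);
+;    int a = RoundD(2.5);    // 2  (nearest even)
+;    int b = RoundD(3.5);    // 4
+;    int c = RoundD(3.7);    // 4
+;    int d = (int)3.7;       // 3  (default C++ conversion: truncation)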
+
+RoundD:
+ cvtsd2si eax, xmm0 ; Round xmm0 to eax
+ ret
+;RoundD ENDP
+
+RoundF:
+ cvtss2si eax, xmm0 ; Round xmm0 to eax
+ ret
+;RoundF ENDP
diff --git a/asmlibSrc/sfmt32.asm b/asmlibSrc/sfmt32.asm
new file mode 100755
index 0000000..f20140a
--- /dev/null
+++ b/asmlibSrc/sfmt32.asm
@@ -0,0 +1,1265 @@
+; ----------------------------- SFMT32.ASM ---------------------------
+; Author: Agner Fog
+; Date created: 2008-11-01
+; Last modified: 2013-09-13
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 32 bit
+; Description:
+; Random number generator of type SIMD-oriented Fast Mersenne Twister (SFMT)
+; (Mutsuo Saito and Makoto Matsumoto: "SIMD-oriented Fast Mersenne Twister:
+; a 128-bit Pseudorandom Number Generator", Monte Carlo and Quasi-Monte
+; Carlo Methods 2006, Springer, 2008, pp. 607-622).
+;
+; 32-bit mode version for x86 compatible microprocessors.
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+; ----------------------------------------------------------------------
+
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+global _SFMTRandomInit, _SFMTRandomInitByArray, _SFMTBRandom, _SFMTRandom
+global _SFMTRandomL, _SFMTIRandom, _SFMTIRandomX, _SFMTgenRandomInit
+global _SFMTgenRandomInitByArray, _SFMTgenRandom, _SFMTgenRandomL
+global _SFMTgenIRandom, _SFMTgenIRandomX, _SFMTgenBRandom
+
+%ifdef WINDOWS
+global _SFMTgenRandomInitD@8, _SFMTgenRandomInitByArrayD@12, _SFMTgenRandomD@0
+global _SFMTgenRandomLD@0, _SFMTgenIRandomD@8, _SFMTgenIRandomXD@8, _SFMTgenIRandomDX@8
+global _SFMTgenBRandomD@0
+%endif
+
+
+extern _InstructionSet
+
+section .data
+align 16
+
+; Data for single instance of random number generator
+SFMTInstance: ISTRUC CRandomSFMTA
+; Size of structure
+IEND
+SFMTSize equ $-SFMTInstance
+
+
+align 16
+; Initialization constants for Mother-Of-All:
+InitMother DD 2111111111, 0, 1492, 0, 1776, 0, 5115, 0
+
+; Initialization Mask for SFMT:
+InitMask DD SFMT_MASK
+
+; Period certification vector for SFMT:
+InitParity DD SFMT_PARITY
+
+
+SECTION .CODE align=16 ; code segment
+
+; ---------------------------------------------------------------
+; Thread-safe static link versions for SFMT
+; ---------------------------------------------------------------
+
+; extern "C" void SFMTRandomInit(void * Pthis, int ThisSize, int seed, int IncludeMother = 0);
+; Parameters:
+; [esp+4] = Pthis
+; [esp+8] = ThisSize
+; [esp+12] = seed
+; [esp+16] = IncludeMother
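+;
+; Illustrative usage sketch for the thread-safe interface. The buffer size and
+; alignment below are assumptions; the actual requirement (checked below) is
+; ThisSize >= SFMTSize, and the proper size constant should be taken from the
+; asmlib C/C++ header.
+;
+;    alignas(16) static char rgen[4096];           // assumed >= SFMTSize for MEXP = 11213
+;    SFMTRandomInit(rgen, sizeof(rgen), 1234, 0);  // seed = 1234, no Mother-Of-All generator
+;    unsigned int bits = SFMTBRandom(rgen);        // 32 random bits
+;    double x = SFMTRandom(rgen);                  // uniform in [0,1)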
+
+_SFMTRandomInit:
+ mov ecx, [esp+4] ; Pthis
+ cmp dword [esp+8], SFMTSize
+ jb Error ; Error exit if buffer too small
+ push edi
+
+ ; Align by 16. Will overlap part of Fill if Pthis unaligned
+ and ecx, -16
+ xor eax, eax
+ cmp dword [esp+16+4], eax ; IncludeMother
+ setnz al ; convert any nonzero value to 1
+ ; Store USEMOTHER
+ mov [ecx+CRandomSFMTA.USEMOTHER], eax
+
+ mov eax, [esp+12+4] ; seed
+ xor edi, edi ; loop counter i
+ jmp L002 ; go into seeding loop
+
+L001: ; seeding loop for SFMT
+ ; y = factor * (y ^ (y >> 30)) + (++i);
+ call InitSubf0 ; randomization subfunction
+L002: mov [ecx+edi*4+CRandomSFMTA.STATE],eax ; initialize state
+ cmp edi, SFMT_N*4 - 1
+ jb L001
+
+ ; Put 5 more values into Mother-Of-All generator
+ call InitSubf0
+ mov [ecx+CRandomSFMTA.M0], eax
+ call InitSubf0
+ mov [ecx+CRandomSFMTA.M1], eax
+ call InitSubf0
+ mov [ecx+CRandomSFMTA.M2], eax
+ call InitSubf0
+ mov [ecx+CRandomSFMTA.M3], eax
+ call InitSubf0
+ mov [ecx+CRandomSFMTA.MC], eax
+
+ ; more initialization and period certification
+ call InitAndPeriod
+
+ pop edi
+ ret
+;_SFMTRandomInit ENDP
+
+Error: ; Error exit
+ xor eax, eax
+ div eax ; Divide by 0
+ ret
+
+; Subfunction used by _SFMTRandomInit
+InitSubf0: ; private
+; y = 1812433253 * (y ^ (y >> 30)) + (++i);
+; input parameters:
+; eax = y
+; edi = i
+; output:
+; eax = new y
+; edi = i+1
+; edx modified
+ mov edx, eax
+ shr eax, 30
+ xor eax, edx
+ imul eax, 1812433253
+ inc edi
+ add eax, edi
+ ret
+;InitSubf0 endp
+
+; Subfunction used by _SFMTRandomInitByArray
+InitSubf1: ; private
+; r = 1664525U * (r ^ (r >> 27));
+; input parameters:
+; eax = r
+; output:
+; eax = new r
+; edx modified
+ mov edx, eax
+ shr eax, 27
+ xor eax, edx
+ imul eax, 1664525
+ ret
+;InitSubf1 endp
+
+; Subfunction used by _SFMTRandomInitByArray
+InitSubf2: ; private
+; r = 1566083941U * (r ^ (r >> 27));
+; input parameters:
+; eax = r
+; output:
+; eax = new r
+; edx modified
+ mov edx, eax
+ shr eax, 27
+ xor eax, edx
+ imul eax, 1566083941
+ ret
+;InitSubf2 endp
+
+; Subfunction for initialization and period certification, except seeding
+; ecx = aligned pointer to CRandomSFMTA
+InitAndPeriod: ; private
+ push ebx
+ push edi
+ ; initialize constants for Mother-Of-All and SFMT
+ LOADOFFSET2EDI InitMother ; edi points to InitMother
+
+ xor edx, edx
+L101: ; Loop fills MF3 - MF0
+ mov eax, [edi+edx] ; load from InitMother
+ mov [ecx+edx+CRandomSFMTA.MF3], eax
+ add edx, 4
+ cmp edx, 32
+ jb L101
+ xor edx, edx
+L102: ; Loop fills AMASK
+ mov eax, [edi+edx+32] ; load from InitMask
+ mov [ecx+edx+CRandomSFMTA.AMASK], eax
+ add edx, 4
+ cmp edx, 4*4
+ jb L102
+
+ ; get instruction set
+ push ecx
+ call _InstructionSet
+ pop ecx
+ mov [ecx+CRandomSFMTA.Instset], eax
+ xor eax, eax
+ mov dword [ecx+CRandomSFMTA.one], eax
+ mov dword [ecx+4+CRandomSFMTA.one], 3FF00000H
+
+ ; Period certification
+ ; Compute parity of STATE[0-4] & InitParity
+ xor edx, edx ; parity
+ xor ebx, ebx ; loop counter
+L104: mov eax, [ecx+ebx*4+CRandomSFMTA.STATE]
+ and eax, [edi+(InitParity-InitMother)+ebx*4] ; and InitParity[i]
+ xor edx, eax
+ inc ebx
+ cmp ebx, 4
+ jb L104
+
+ ; get parity of edx
+ mov eax, edx
+ shr edx, 16
+ xor eax, edx
+ xor al, ah
+ jpo L108 ; parity odd: period OK
+
+ ; parity even: period not OK
+ ; Find a nonzero dword in period certification vector
+ xor ebx, ebx ; loop counter
+L105: mov eax, [edi+(InitParity-InitMother)+ebx*4] ; InitParity[i]
+ test eax, eax
+ jnz L106
+ inc ebx
+ ; assume that there is a nonzero dword in InitParity
+ jmp L105 ; loop until nonzero found
+
+L106: ; find first nonzero bit in eax
+ bsf edx, eax
+ ; flip the corresponding bit in STATE
+ btc [ecx+ebx*4+CRandomSFMTA.STATE], edx
+
+L108: cmp dword [ecx+CRandomSFMTA.USEMOTHER], 0
+ je L109
+ call Mother_Next ; Make first random number ready
+
+L109: ; Generate first random numbers and set IX = 0
+ call SFMT_Generate
+ pop edi
+ pop ebx
+ ret
+;InitAndPeriod endp
+
+
+; extern "C" void SFMTRandomInitByArray
+; (void * Pthis, int ThisSize, int const seeds[], int NumSeeds, int IncludeMother = 0);
+; // Seed by more than 32 bits
+_SFMTRandomInitByArray:
+; Parameters
+; [esp+ 4] = Pthis
+; [esp+ 8] = ThisSize
+; [esp+12] = seeds
+; [esp+16] = NumSeeds
+; [esp+20] = IncludeMother
+
+; define constants:
+SFMT_SIZE equ SFMT_N*4 ; number of 32-bit integers in state
+
+%IF SFMT_SIZE >= 623
+ SFMT_LAG equ 11
+%ELIF SFMT_SIZE >= 68
+ SFMT_LAG equ 7
+%ELIF SFMT_SIZE >= 39
+ SFMT_LAG equ 5
+%ELSE
+ SFMT_LAG equ 3
+%ENDIF
+
+SFMT_MID equ ((SFMT_SIZE - SFMT_LAG) / 2)
+
+ push ebx
+ push esi
+ push edi
+ push ebp
+ cmp dword [esp+8+16], SFMTSize
+ jb Error ; Error exit if buffer too small
+ mov ecx, [esp+4+16] ; Pthis
+ mov ebx, [esp+12+16] ; seeds
+ mov ebp, [esp+16+16] ; NumSeeds
+
+ ; Align by 16. Will overlap part of Fill if Pthis unaligned
+ and ecx, -16
+ xor eax, eax
+ cmp dword [esp+20+16], eax ; IncludeMother
+ setnz al ; convert any nonzero value to 1
+ ; Store USEMOTHER
+ mov [ecx+CRandomSFMTA.USEMOTHER], eax
+
+; 1. loop: Fill state vector with random numbers from NumSeeds
+; r = NumSeeds;
+; for (i = 0; i < SFMT_N*4; i++) {
+; r = factor * (r ^ (r >> 30)) + i;
+; sta[i] = r;}
+
+ mov eax, ebp ; r = NumSeeds
+ xor esi, esi ; i
+L200: mov edx, eax
+ shr eax, 30
+ xor eax, edx
+ imul eax, 1812433253
+ add eax, esi
+ mov [ecx+esi*4+CRandomSFMTA.STATE], eax
+ inc esi
+ cmp esi, SFMT_SIZE
+ jb L200
+
+ ; count = max(NumSeeds,size-1)
+ mov eax, SFMT_SIZE - 1
+ cmp ebp, eax
+ cmovb ebp, eax
+ push ebp ; save count as local variable
+
+; 2. loop: Fill state vector with random numbers from seeds[]
+; for (i = 1, j = 0; j < count; j++) {
+; r = func1(sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size]);
+; sta[(i + mid) % size] += r;
+; if (j < NumSeeds) r += seeds[j]
+; r += i;
+; sta[(i + mid + lag) % size] += r;
+; sta[i] = r;
+; i = (i + 1) % size;
+; }
+ xor edi, edi
+ lea esi, [edi+1]
+
+ ; ecx = Pthis
+ ; ebx = seeds
+ ; esi = i
+ ; edi = j
+ ; eax = r
+ ; [esp] = count
+ ; [esp+20+16] = NumSeeds
+
+L201: ; r = sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size];
+ mov eax, [ecx+esi*4+CRandomSFMTA.STATE] ; sta[i]
+ lea ebp, [esi+SFMT_MID]
+ cmp ebp, SFMT_SIZE
+ jb L202
+ sub ebp, SFMT_SIZE
+L202: xor eax, [ecx+ebp*4+CRandomSFMTA.STATE] ; sta[(i + mid) % size]
+ lea edx, [esi+SFMT_SIZE-1]
+ cmp edx, SFMT_SIZE
+ jb L203
+ sub edx, SFMT_SIZE
+L203: xor eax, [ecx+edx*4+CRandomSFMTA.STATE] ; sta[(i + size - 1) % size]
+
+ ; r = func1(r) = (r ^ (r >> 27)) * 1664525U;
+ call InitSubf1
+
+ ; sta[(i + mid) % size] += r;
+ add [ecx+ebp*4+CRandomSFMTA.STATE], eax
+
+ ; if (j < NumSeeds) r += seeds[j]
+ cmp edi, [esp+20+16]
+ jnb L204
+ add eax, [ebx+edi*4]
+L204:
+ ; r += i;
+ add eax, esi
+
+ ; sta[(i + mid + lag) % size] += r;
+ lea edx, [esi+SFMT_MID+SFMT_LAG]
+ cmp edx, SFMT_SIZE
+ jb L205
+ sub edx, SFMT_SIZE
+L205: add [ecx+edx*4+CRandomSFMTA.STATE], eax
+
+ ;sta[i] = r;
+ mov [ecx+esi*4+CRandomSFMTA.STATE], eax
+
+ ; i = (i + 1) % size;
+ inc esi
+ cmp esi, SFMT_SIZE
+ jb L206
+ sub esi, SFMT_SIZE
+L206:
+ ; j++, loop while j < count
+ inc edi
+ cmp edi, [esp]
+ jb L201
+
+; 3. loop: Randomize some more
+; for (j = 0; j < size; j++) {
+; r = func2(sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]);
+; sta[(i + mid) % size] ^= r;
+; r -= i;
+; sta[(i + mid + lag) % size] ^= r;
+; sta[i] = r;
+; i = (i + 1) % size;
+; }
+ ; j = 0
+ xor edi, edi
+
+L210: ; r = sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]
+ mov eax, [ecx+esi*4+CRandomSFMTA.STATE] ; sta[i]
+ lea ebp, [esi+SFMT_MID]
+ cmp ebp, SFMT_SIZE
+ jb L211
+ sub ebp, SFMT_SIZE
+L211: add eax, [ecx+ebp*4+CRandomSFMTA.STATE] ; sta[(i + mid) % size]
+ lea edx, [esi+SFMT_SIZE-1]
+ cmp edx, SFMT_SIZE
+ jb L212
+ sub edx, SFMT_SIZE
+L212: add eax, [ecx+edx*4+CRandomSFMTA.STATE] ; sta[(i + size - 1) % size]
+
+ ; r = func2(r) = (x ^ (x >> 27)) * 1566083941U;
+ call InitSubf2
+
+ ; sta[(i + mid) % size] ^= r;
+ xor [ecx+ebp*4+CRandomSFMTA.STATE], eax
+
+ ; r -= i;
+ sub eax, esi
+
+ ; sta[(i + mid + lag) % size] ^= r;
+ lea edx, [esi+SFMT_MID+SFMT_LAG]
+ cmp edx, SFMT_SIZE
+ jb L213
+ sub edx, SFMT_SIZE
+L213: xor [ecx+edx*4+CRandomSFMTA.STATE], eax
+
+ ; sta[i] = r;
+ mov [ecx+esi*4+CRandomSFMTA.STATE], eax
+
+ ; i = (i + 1) % size;
+ inc esi
+ cmp esi, SFMT_SIZE
+ jb L214
+ sub esi, SFMT_SIZE
+L214:
+ ; j++, loop while j < size
+ inc edi
+ cmp edi, SFMT_SIZE
+ jb L210
+
+ pop ebp ; remove local variable count
+
+ ; if (UseMother) {
+ cmp dword [ecx+CRandomSFMTA.USEMOTHER], 0
+ jz L220
+
+; 4. loop: Initialize MotherState
+; for (j = 0; j < 5; j++) {
+; r = func2(r) + j;
+; MotherState[j] = r + sta[2*j];
+; }
+ call InitSubf2
+ mov edx, [ecx+CRandomSFMTA.STATE]
+ add edx, eax
+ mov [ecx+CRandomSFMTA.M0], edx
+ call InitSubf2
+ inc eax
+ mov edx, [ecx+8+CRandomSFMTA.STATE]
+ add edx, eax
+ mov [ecx+CRandomSFMTA.M1], edx
+ call InitSubf2
+ add eax, 2
+ mov edx, [ecx+16+CRandomSFMTA.STATE]
+ add edx, eax
+ mov [ecx+CRandomSFMTA.M2], edx
+ call InitSubf2
+ add eax, 3
+ mov edx, [ecx+24+CRandomSFMTA.STATE]
+ add edx, eax
+ mov [ecx+CRandomSFMTA.M3], edx
+ call InitSubf2
+ add eax, 4
+ mov edx, [ecx+32+CRandomSFMTA.STATE]
+ add edx, eax
+ mov [ecx+CRandomSFMTA.MC], edx
+
+L220: ; More initialization and period certification
+ call InitAndPeriod
+
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+;_SFMTRandomInitByArray ENDP
+
+
+align 16
+Mother_Next: ; private
+; Internal procedure: advance Mother-Of-All generator
+; The random value is in M0
+; ecx = pointer to structure CRandomSFMTA
+; eax, ecx, xmm0 unchanged
+ cmp dword [ecx+CRandomSFMTA.Instset], 4
+ jb Mother_Next_386
+ movdqa xmm1, oword [ecx+CRandomSFMTA.M3] ; load M3,M2
+ movdqa xmm2, oword [ecx+CRandomSFMTA.M1] ; load M1,M0
+ movhps qword [ecx+CRandomSFMTA.M3], xmm1 ; M3=M2
+ movq qword [ecx+CRandomSFMTA.M2], xmm2 ; M2=M1
+ movhps qword [ecx+CRandomSFMTA.M1], xmm2 ; M1=M0
+ pmuludq xmm1, oword [ecx+CRandomSFMTA.MF3] ; M3*MF3, M2*MF2
+ pmuludq xmm2, oword [ecx+CRandomSFMTA.MF1] ; M1*MF1, M0*MF0
+ paddq xmm1, xmm2 ; P3+P1, P2+P0
+ movhlps xmm2, xmm1 ; Get high qword
+ movq xmm3, qword [ecx+CRandomSFMTA.MC] ; +carry
+ paddq xmm1, xmm3
+ paddq xmm1, xmm2 ; P0+P1+P2+P3
+ movq qword [ecx+CRandomSFMTA.M0], xmm1 ; Store new M0 and carry
+ ret
+
+Mother_Next_386: ; same, no SSE2
+ push eax
+ push esi
+ push edi
+ ; prepare new random number
+ mov eax, [ecx+CRandomSFMTA.MF3]
+ mul dword [ecx+CRandomSFMTA.M3] ; x[n-4]
+ mov esi, eax
+ mov eax, [ecx+CRandomSFMTA.M2] ; x[n-3]
+ mov edi, edx
+ mov [ecx+CRandomSFMTA.M3], eax
+ mul dword [ecx+CRandomSFMTA.MF2]
+ add esi, eax
+ mov eax, [ecx+CRandomSFMTA.M1] ; x[n-2]
+ adc edi, edx
+ mov [ecx+CRandomSFMTA.M2], eax
+ mul dword [ecx+CRandomSFMTA.MF1]
+ add esi, eax
+ mov eax,[ecx+CRandomSFMTA.M0] ; x[n-1]
+ adc edi, edx
+ mov [ecx+CRandomSFMTA.M1], eax
+ mul dword [ecx+CRandomSFMTA.MF0]
+ add eax, esi
+ adc edx, edi
+ add eax, [ecx+CRandomSFMTA.MC]
+ adc edx, 0
+ ; store next random number and carry
+ mov [ecx+CRandomSFMTA.M0], eax
+ mov [ecx+CRandomSFMTA.MC], edx
+ pop edi
+ pop esi
+ pop eax
+ ret
+
+;Mother_Next endp
+
+
+align 16
+SFMT_Generate: ; private
+; void CRandomSFMT::Generate() {
+; Fill state array with new random numbers
+
+; check if SSE2 instruction set supported
+ cmp dword [ecx+CRandomSFMTA.Instset], 4
+ jb SFMT_Generate_386
+ push ebx
+
+ ; register use
+ ; ecx = Pthis
+ ; edx = i*16 + offset state
+ ; eax, ebx = loop end
+ ; xmm1 = r1
+ ; xmm2 = r2 = r
+ ; xmm0, xmm3 = scratch
+
+ ; r1 = state[SFMT_N*16 - 2];
+ ; r2 = state[SFMT_N*16 - 1];
+ movdqa xmm1, oword [ecx+(SFMT_N-2)*16+CRandomSFMTA.STATE]
+ movdqa xmm2, oword [ecx+(SFMT_N-1)*16+CRandomSFMTA.STATE]
+ mov edx, CRandomSFMTA.STATE
+
+;static inline __m128i sfmt_recursion(__m128i const &a, __m128i const &b,
+;__m128i const &c, __m128i const &d, __m128i const &mask) {
+; __m128i a1, b1, c1, d1, z1, z2;
+; b1 = _mm_srli_epi32(b, SFMT_SR1);
+; a1 = _mm_slli_si128(a, SFMT_SL2);
+; c1 = _mm_srli_si128(c, SFMT_SR2);
+; d1 = _mm_slli_epi32(d, SFMT_SL1);
+; b1 = _mm_and_si128(b1, mask);
+; z1 = _mm_xor_si128(a, a1);
+; z2 = _mm_xor_si128(b1, d1);
+; z1 = _mm_xor_si128(z1, c1);
+; z2 = _mm_xor_si128(z1, z2);
+; return z2;}
+
+; for (i = 0; i < SFMT_N - SFMT_M; i++) {
+; r = sfmt_recursion(state[i], state[i + SFMT_M], r1, r2, mask);
+; state[i] = r;
+; r1 = r2;
+; r2 = r;
+; }
+
+ mov eax, (SFMT_N-SFMT_M)*16 + CRandomSFMTA.STATE ; first loop end
+ mov ebx, SFMT_N*16 + CRandomSFMTA.STATE ; second loop end
+
+; first i loop from 0 to SFMT_N - SFMT_M
+align 8
+L301: movdqa xmm0, oword [ecx+edx+SFMT_M*16] ; b
+ psrld xmm0, SFMT_SR1 ; b1
+ pand xmm0, oword [ecx+CRandomSFMTA.AMASK] ; b1
+ movdqa xmm3, oword [ecx+edx] ; a
+ pxor xmm0, xmm3
+ pslldq xmm3, SFMT_SL2 ; a1
+ psrldq xmm1, SFMT_SR2 ; c1, c = r1
+ pxor xmm0, xmm3
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm2 ; r1 = r2
+ pslld xmm2, SFMT_SL1 ; d1, d = r2
+ pxor xmm2, xmm0 ; r2 = r
+ ; state[i] = r;
+ movdqa oword [ecx+edx], xmm2
+
+ ; i++ while i < SFMT_N - SFMT_M
+ add edx, 16
+ cmp edx, eax
+ jb L301
+
+;align 16
+L302: ; second i loop from SFMT_N - SFMT_M + 1 to SFMT_N
+ movdqa xmm0, oword [ecx+edx+(SFMT_M-SFMT_N)*16] ; b
+ psrld xmm0, SFMT_SR1 ; b1
+ pand xmm0, oword [ecx+CRandomSFMTA.AMASK ] ; b1
+ movdqa xmm3, oword [ecx+edx] ; a
+ pxor xmm0, xmm3
+ pslldq xmm3, SFMT_SL2 ; a1
+ psrldq xmm1, SFMT_SR2 ; c1, c = r1
+ pxor xmm0, xmm3
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm2 ; r1 = r2
+ pslld xmm2, SFMT_SL1 ; d1, d = r2
+ pxor xmm2, xmm0 ; r2 = r
+ ; state[i] = r;
+ movdqa oword [ecx+edx], xmm2
+
+ ; i++ while i < SFMT_N
+ add edx, 16
+ cmp edx, ebx
+ jb L302
+
+ ; Check if initialized
+L308: cmp dword [ecx+CRandomSFMTA.AMASK], SFMT_MASK1
+ jne Error ; Make error if not initialized
+
+ ; ix = 0;
+ mov dword [ecx+CRandomSFMTA.IX], 0 ; point to start of STATE buffer
+ pop ebx
+ ret
+
+; Same, SSE2 instruction set not supported:
+SFMT_Generate_386:
+ push ebx
+ push esi
+ push edi
+ push ebp
+ sub esp, 32
+
+ ; register use
+ ; ecx = Pthis
+ ; edx = i*16
+ ; ebx = ((i+SFMT_M) mod SFMT_N) * 16
+ ; ebp = accumulator
+ ; eax = temporary
+ ; esi, edi = previous state[i]
+
+ %define RR1 esp ; r1
+ %define RR2 esp+16 ; r2 = r
+
+ ; r1 = state[SFMT_N - 2];
+ ; r2 = state[SFMT_N - 1];
+ lea esi, [ecx+(SFMT_N-2)*16+CRandomSFMTA.STATE]
+ mov edi, esp
+ push ecx
+ mov ecx, 8
+ rep movsd
+ pop ecx
+
+; The two loops from i = 0 to SFMT_N - SFMT_M - 1 and
+; from SFMT_N - SFMT_M to SFMT_N - 1 are joined together here:
+; for (i = 0; i < SFMT_N; i++) {
+; r = sfmt_recursion(state[i], state[(i+SFMT_M)%SFMT_N], r1, r2, mask);
+; state[i] = r;
+; r1 = r2;
+; r2 = r;
+
+ xor edx, edx ; i = 0
+ mov ebx, SFMT_M * 16 ; j = ((i+SFMT_M)%SFMT_N)*16
+
+M1: ; loop start
+ ; 1. dword:
+ mov ebp, [ecx+ebx+CRandomSFMTA.STATE+0]
+ shr ebp, SFMT_SR1 ; 32-bit shifts right
+ and ebp, [ecx+CRandomSFMTA.AMASK+0]
+ mov eax, [ecx+edx+CRandomSFMTA.STATE+0]
+ xor ebp, eax
+ mov esi, eax ; save for 2. dword
+ shl eax, SFMT_SL2*8 ; 128-bit shift left
+ xor ebp, eax
+ mov eax, [RR1+0]
+ mov edi, [RR1+4]
+ shrd eax, edi, SFMT_SR2*8 ; 128-bit shift right
+ xor ebp, eax
+ mov eax, [RR2+0]
+ mov [RR1+0], eax ; r1 = r2
+ shl eax, SFMT_SL1 ; 32-bit shifts left
+ xor ebp, eax
+ mov [RR2+0], ebp ; r2 = r
+ mov [ecx+edx+CRandomSFMTA.STATE+0], ebp ; state[i] = r
+
+ ; 2. dword:
+ mov ebp, [ecx+ebx+CRandomSFMTA.STATE+4]
+ shr ebp, SFMT_SR1 ; 32-bit shifts right
+ and ebp, [ecx+CRandomSFMTA.AMASK+4]
+ mov eax, [ecx+edx+CRandomSFMTA.STATE+4]
+ xor ebp, eax
+ mov edi, eax ; save for 3. dword
+ ; esi = [ecx+edx].STATE[0] before change
+ shld eax, esi, SFMT_SL2*8 ; 128-bit shift left
+ xor ebp, eax
+ mov eax, [RR1+4]
+ mov esi, [RR1+8]
+ shrd eax, esi, SFMT_SR2*8 ; 128-bit shift right
+ xor ebp, eax
+ mov eax, [RR2+4]
+ mov [RR1+4], eax ; r1 = r2
+ shl eax, SFMT_SL1 ; 32-bit shifts left
+ xor ebp, eax
+ mov [RR2+4], ebp ; r2 = r
+ mov [ecx+edx+CRandomSFMTA.STATE+4], ebp ; state[i] = r
+
+ ; 3. dword:
+ mov ebp, [ecx+ebx+CRandomSFMTA.STATE+8]
+ shr ebp, SFMT_SR1 ; 32-bit shifts right
+ and ebp, [ecx+CRandomSFMTA.AMASK+8]
+ mov eax, [ecx+edx+CRandomSFMTA.STATE+8]
+ mov esi, eax ; save for 4. dword
+ xor ebp, eax
+ ; edi = [ecx+edx+CRandomSFMTA.STATE+4] before change
+ shld eax, edi, SFMT_SL2*8 ; 128-bit shift left
+ xor ebp, eax
+ mov eax, [RR1+8]
+ mov edi, [RR1+12]
+ shrd eax, edi, SFMT_SR2*8 ; 128-bit shift right
+ xor ebp, eax
+ mov eax, [RR2+8]
+ mov [RR1+8], eax ; r1 = r2
+ shl eax, SFMT_SL1 ; 32-bit shifts left
+ xor ebp, eax
+ mov [RR2+8], ebp ; r2 = r
+ mov [ecx+edx+CRandomSFMTA.STATE+8], ebp ; state[i] = r
+
+ ; 4. dword:
+ mov ebp, [ecx+ebx+CRandomSFMTA.STATE+12]
+ shr ebp, SFMT_SR1 ; 32-bit shifts right
+ and ebp, [ecx+CRandomSFMTA.AMASK+12]
+ mov eax, [ecx+edx+CRandomSFMTA.STATE+12]
+ xor ebp, eax
+ ; esi = [ecx+edx+CRandomSFMTA.STATE+8] before change
+ shld eax, esi, SFMT_SL2*8 ; 128-bit shift left
+ xor ebp, eax
+ mov eax, [RR1+12]
+ shr eax, SFMT_SR2*8 ; 128-bit shift right
+ xor ebp, eax
+ mov eax, [RR2+12]
+ mov [RR1+12], eax ; r1 = r2
+ shl eax, SFMT_SL1 ; 32-bit shifts left
+ xor ebp, eax
+ mov [RR2+12], ebp ; r2 = r
+ mov [ecx+edx+CRandomSFMTA.STATE+12], ebp ; state[i] = r
+
+ ; increment i, j
+ add ebx, 16
+ cmp ebx, SFMT_N*16
+ jb M4
+ sub ebx, SFMT_N*16 ; modulo SFMT_N
+M4: add edx, 16
+ cmp edx, SFMT_N*16
+ jb M1
+
+ ; free r1, r2 from stack
+ add esp, 32
+ pop ebp
+ pop edi
+ pop esi
+ ; pop ebx
+ jmp L308
+
+;SFMT_Generate endp
+
+
+; extern "C" unsigned int SFMTBRandom(void * Pthis); // Output random bits
+
+_SFMTBRandom: ; generate random bits
+ mov ecx, [esp+4] ; Pthis
+ ; Align by 16. Will overlap part of Fill1 if Pthis unaligned
+ and ecx, -16
+
+SFMTBRandom_reg: ; Entry for register parameters, used internally
+
+; if (ix >= SFMT_N*4) Generate();
+ mov edx, [ecx+CRandomSFMTA.IX]
+ cmp edx, SFMT_N*16
+ jnb NeedGenerate
+
+; y = ((uint32_t*)state)[ix++];
+ mov eax, dword [ecx+edx+CRandomSFMTA.STATE]
+ add edx, 4
+ mov [ecx+CRandomSFMTA.IX], edx
+
+AfterGenerate:
+; if (UseMother) y += MotherBits();
+ cmp dword [ecx+CRandomSFMTA.USEMOTHER], 0
+ jz NoMother
+
+ ; add mother bits
+ add eax, [ecx+CRandomSFMTA.M0] ; Add Mother random number
+ call Mother_Next ; Make next Mother random number ready
+
+NoMother: ; return y;
+ ret
+
+NeedGenerate:
+ call SFMT_Generate ; generate SFMT_N*4 random dwords
+ mov eax, [ecx+CRandomSFMTA.STATE]
+ mov dword [ecx+CRandomSFMTA.IX], 4
+ jmp AfterGenerate
+
+;_SFMTBRandom ENDP
+
+
+; extern "C" double SFMTRandom (void * Pthis); // Output random float
+
+_SFMTRandom: ; generate random float with 52 bits resolution
+ mov ecx, [esp+4] ; Pthis
+ ; Align by 16. Will overlap part of Fill1 if Pthis unaligned
+ and ecx, -16
+
+SFMTRandom_reg: ; internal entry point
+
+; check if there are at least 64 random bits in state buffer
+; if (ix >= SFMT_N*4-1) Generate();
+ mov edx, [ecx+CRandomSFMTA.IX]
+ cmp edx, SFMT_N*16-4
+ jnb L403
+
+ ; check instruction set
+L401: cmp dword [ecx+CRandomSFMTA.Instset], 4
+ jb L404
+
+ ; read 64 random bits
+ movq xmm0, qword [ecx+edx+CRandomSFMTA.STATE]
+ add edx, 8
+ mov [ecx+CRandomSFMTA.IX], edx
+
+ ; combine with Mother-Of-All generator?
+ cmp dword [ecx+CRandomSFMTA.USEMOTHER], 0
+ jz L402
+
+ ; add mother bits
+ movq xmm1, qword [ecx+CRandomSFMTA.M0] ; Mother random number MC and M0
+ pshuflw xmm1, xmm1, 01001011B ; Put M0 before MC, and swap the words in MC
+ paddq xmm0, xmm1 ; Add SFMT and Mother outputs
+ call Mother_Next ; Make next Mother random number ready
+
+L402: ; ConvertToFloat
+ psrlq xmm0, 12 ; align with mantissa field of double precision float
+ movsd xmm1, [ecx+CRandomSFMTA.one] ; 1.0 double precision
+ por xmm0, xmm1 ; insert exponent to get 1.0 <= x < 2.0
+ subsd xmm0, xmm1 ; subtract 1.0 to get 0.0 <= x < 1.0
+ movsd [ecx+CRandomSFMTA.TempRan], xmm0
+ fld qword [ecx+CRandomSFMTA.TempRan] ; transfer to st(0) register
+ ret ; return value
+
+L403: ;NeedGenerateR
+ call SFMT_Generate ; generate SFMT_N*4 random dwords
+ xor edx, edx
+ jmp L401
+
+L404: ;NoSSE2 ; Use old 386 instruction set:
+ push ebx
+ ; read 64 random bits
+ mov eax, [ecx+edx+CRandomSFMTA.STATE]
+ mov ebx, [ecx+edx+4+CRandomSFMTA.STATE]
+ add edx, 8
+ mov [ecx+CRandomSFMTA.IX], edx
+
+ ; combine with Mother-Of-All generator?
+ cmp dword [ecx+CRandomSFMTA.USEMOTHER], 0
+ jz L405
+
+ ; add mother bits
+ mov edx, [ecx+CRandomSFMTA.MC] ; Mother random number MC
+ ror edx, 16 ; rotate
+ add eax, edx ; 64 bit add
+ adc ebx, [ecx+CRandomSFMTA.M0] ; Mother random number M0
+ call Mother_Next ; next Mother. eax, ebx unchanged
+
+L405: ;ToFloatNoSSE2
+ shrd eax, ebx, 12 ; align with mantissa field of double precision float
+ shr ebx, 12
+ or ebx, 3FF00000H ; insert exponent to get 1.0 <= x < 2.0
+ mov dword [ecx+CRandomSFMTA.TempRan], eax
+ mov dword [ecx+4+CRandomSFMTA.TempRan], ebx
+ fld qword [ecx+CRandomSFMTA.TempRan] ; transfer to st(0) register
+ fsub qword [ecx+CRandomSFMTA.one] ; subtract 1.0 to get 0.0 <= x < 1.0
+ pop ebx
+ ret ; return value
+
+;_SFMTRandom ENDP
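
The ConvertToFloat step above (and the ToFloatNoSSE2 fallback) uses a standard bit trick: shift the random bits into the 52-bit mantissa field, OR in the exponent of 1.0 to get a value in [1,2), then subtract 1.0. A minimal C sketch of the same trick, for illustration only (not part of the library interface):

    #include <stdint.h>

    static double bits_to_double(uint64_t r)
    {
        union { uint64_t i; double d; } u;
        u.i = (r >> 12) | 0x3FF0000000000000ULL;  /* 1.0 <= u.d < 2.0 */
        return u.d - 1.0;                         /* 0.0 <= result < 1.0 */
    }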
+
+
+; extern "C" long double SFMTRandomL (void * Pthis);
+
+_SFMTRandomL: ; generate random float with 63 bits resolution
+ mov ecx, [esp+4] ; Pthis
+ ; Align by 16. Will overlap part of Fill1 if Pthis unaligned
+ and ecx, -16
+
+SFMTRandomL_reg: ; internal entry point
+
+; check if there are at least 64 random bits in state buffer
+; if (ix >= SFMT_N*4-1) Generate();
+ mov edx, [ecx+CRandomSFMTA.IX]
+ cmp edx, SFMT_N*16-4
+ jnb L505
+
+ ; check instruction set
+L501: cmp dword [ecx+CRandomSFMTA.Instset], 4
+ jb L506
+
+ ; read 64 random bits
+ movq xmm0, qword [ecx+edx+CRandomSFMTA.STATE]
+ add edx, 8
+ mov [ecx+CRandomSFMTA.IX], edx
+
+ ; combine with Mother-Of-All generator?
+ cmp dword [ecx+CRandomSFMTA.USEMOTHER], 0
+ jz L502
+
+ ; add mother bits
+ movq xmm1, qword [ecx+CRandomSFMTA.M0] ; Mother random number MC and M0
+ pshuflw xmm1, xmm1, 01001011B ; Put M0 before MC, and swap the words in MC
+ paddq xmm0, xmm1 ; Add SFMT and Mother outputs
+ call Mother_Next ; Make next Mother random number ready
+
+L502: ;ConvertToFloat
+ sub esp, 16 ; make space for long double
+ psrlq xmm0, 1 ; align with mantissa field of long double
+ pcmpeqw xmm1, xmm1 ; all 1's
+ psllq xmm1, 63 ; create a 1 in bit 63
+ por xmm0, xmm1 ; bit 63 is always 1 in long double
+ movq qword [esp], xmm0 ; store mantissa
+L504: mov dword [esp+8], 3FFFH ; exponent
+ fld tword [esp] ; load long double
+ fsub qword [ecx+CRandomSFMTA.one] ; subtract 1.0 to get 0.0 <= x < 1.0
+ add esp, 16
+ ret ; return value
+
+L505: ; NeedGenerateR
+ call SFMT_Generate ; generate SFMT_N*4 random dwords
+ xor edx, edx
+ jmp L501
+
+L506: ;NoSSE2 ; Use old 386 instruction set:
+ push ebx
+ ; read 64 random bits
+ mov eax, [ecx+edx+CRandomSFMTA.STATE]
+ mov ebx, [ecx+edx+4+CRandomSFMTA.STATE]
+ add edx, 8
+ mov [ecx+CRandomSFMTA.IX], edx
+
+ ; combine with Mother-Of-All generator?
+ cmp dword [ecx+CRandomSFMTA.USEMOTHER], 0
+ jz L507
+
+ ; add mother bits
+ mov edx, [ecx+CRandomSFMTA.MC] ; Mother random number MC
+ ror edx, 16 ; rotate
+ add eax, edx ; 64 bit add
+ adc ebx, [ecx+CRandomSFMTA.M0] ; Mother random number M0
+ call Mother_Next ; next Mother. eax, ebx unchanged
+
+L507: ;ToFloatNoSSE2
+ mov edx, ebx ; now random bits are in edx:eax
+ pop ebx ; clean stack
+ sub esp, 16 ; make room for long double
+ shrd eax, edx, 1 ; align with mantissa field of long double
+ stc
+ rcr edx, 1 ; bit 63 is always 1
+ mov [esp], eax
+ mov [esp+4], edx
+ jmp L504 ; the rest is the same as above
+
+;_SFMTRandomL ENDP
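
The 63-bit version above does the analogous trick in the x87 80-bit format: bit 63 (the explicit integer bit) is forced to 1, the biased exponent 3FFFH encodes 2^0, and 1.0 is subtracted afterwards. A C sketch, assuming the x87 80-bit long double layout (this does not hold on compilers where long double is just an alias for double):

    #include <stdint.h>

    static long double bits_to_long_double(uint64_t r)
    {
        union {
            long double d;
            struct { uint64_t mantissa; uint16_t exponent; } bits;  /* x87 layout */
        } u;
        u.bits.mantissa = (r >> 1) | (1ULL << 63);  /* integer bit always 1 */
        u.bits.exponent = 0x3FFF;                   /* biased exponent of 1.0 */
        return u.d - 1.0L;                          /* 0.0 <= result < 1.0 */
    }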
+
+
+; extern "C" int SFMTIRandom (void * Pthis, int min, int max); // Output random integer
+
+_SFMTIRandom:
+ mov ecx, [esp+4] ; Pthis
+ ; Align by 16. Will overlap part of Fill if Pthis unaligned
+ and ecx, -16
+ call SFMTBRandom_reg ; random bits
+ mov edx, [esp+12] ; max
+ mov ecx, [esp+8] ; min
+ sub edx, ecx
+ jl short WrongInterval ; max < min
+ add edx, 1 ; max - min + 1
+ mul edx ; multiply random number by interval and truncate
+ lea eax, [edx+ecx] ; add min
+ ret
+WrongInterval:
+ mov eax, 80000000H ; error exit
+ ret
+;_SFMTIRandom ENDP
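
The interval scaling above multiplies the 32 random bits by the interval length, keeps the high 32 bits of the 64-bit product (the mul/edx step), and adds min. A C sketch of the same mapping; the error return 80000000H for max < min is omitted here:

    #include <stdint.h>

    static int irandom_scale_sketch(uint32_t randombits, int min, int max)
    {
        uint32_t interval = (uint32_t)(max - min) + 1;                   /* max - min + 1 */
        uint32_t scaled = (uint32_t)(((uint64_t)randombits * interval) >> 32);
        return min + (int)scaled;                                        /* in [min, max] */
    }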
+
+
+; extern "C" int SFMTIRandomX (void * Pthis, int min, int max); // Output random integer
+
+_SFMTIRandomX:
+ push edi
+ mov ecx, [esp+8] ; Pthis
+ mov edx, [esp+12] ; min
+ mov edi, [esp+16] ; max
+ ; Align by 16. Will overlap part of Fill1 if Pthis unaligned
+ and ecx, -16
+ sub edi, edx ; max - min
+ jle short M30 ; max <= min (signed)
+ inc edi ; interval = max - min + 1
+
+ ; if (interval != LastInterval) {
+ cmp edi, [ecx+CRandomSFMTA.LASTINTERVAL]
+ je M10
+ ; need to calculate new rejection limit
+ ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
+ xor eax, eax ; 0
+ lea edx, [eax+1] ; 1
+ div edi ; (would give overflow if interval = 1)
+ mul edi
+ dec eax
+ mov [ecx+CRandomSFMTA.RLIMIT], eax
+ mov [ecx+CRandomSFMTA.LASTINTERVAL], edi
+M10:
+M20: ; do { // Rejection loop
+ call SFMTBRandom_reg ; random bits (ecx is preserved)
+ ; longran = (uint64)BRandom() * interval;
+ mul edi
+ ; } while (remainder > RLimit);
+ cmp eax, [ecx+CRandomSFMTA.RLIMIT]
+ ja M20
+
+ ; return (int32)iran + min
+ mov eax, [esp+12] ; min
+ add eax, edx
+ pop edi
+ ret
+
+M30: jl M40
+ ; max = min. Return min
+ mov eax, edx
+ pop edi
+ ret ; max = min exit
+
+M40: ; max < min: error
+ mov eax, 80000000H ; error exit
+ pop edi
+ ret
+;_SFMTIRandomX ENDP
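
SFMTIRandomX removes the slight bias of the simple scaling by rejection: products whose low 32 bits exceed RLimit are discarded. A C sketch of the loop above, using the SFMTBRandom prototype quoted earlier; the caller is assumed to have handled max <= min already, so interval >= 2 and the division below cannot overflow:

    #include <stdint.h>

    extern unsigned int SFMTBRandom(void * Pthis);   /* prototype as given above */

    static int irandomx_sketch(void * Pthis, int min, int max)
    {
        uint32_t interval = (uint32_t)(max - min) + 1;
        uint32_t rlimit = (uint32_t)(0x100000000ULL / interval) * interval - 1;
        uint64_t product;
        do {
            product = (uint64_t)SFMTBRandom(Pthis) * interval;
        } while ((uint32_t)product > rlimit);        /* reject biased products */
        return min + (int)(product >> 32);           /* high dword + min */
    }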
+
+
+
+; -------------------------------------------------------------------------
+; Single-threaded static link versions for SFMT generator
+; -------------------------------------------------------------------------
+
+; extern "C" void SFMTgenRandomInit(int seed, int IncludeMother = 0);
+_SFMTgenRandomInit:
+ mov eax, [esp+4] ; seed
+ mov edx, [esp+8] ; IncludeMother
+ LOADOFFSET2ECX SFMTInstance ; Get address of SFMTInstance into ecx
+
+ ; call _SFMTRandomInit with Pthis pointing to SFMTInstance
+ push edx ; IncludeMother
+ push eax ; seed
+ push SFMTSize ; ThisSize
+ push ecx ; Pthis
+ call _SFMTRandomInit
+ add esp, 16
+ ret
+;_SFMTgenRandomInit ENDP
+
+
+; extern "C" void SFMTgenRandomInitByArray(int const seeds[], int NumSeeds, int IncludeMother = 0);
+_SFMTgenRandomInitByArray:
+ mov eax, [esp+4] ; seeds
+ mov ecx, [esp+8] ; NumSeeds
+ mov edx, [esp+12] ; IncludeMother
+ push edx
+ push ecx
+ push eax
+ push SFMTSize ; ThisSize
+ LOADOFFSET2ECX SFMTInstance ; Get address of SFMTInstance into ecx
+ push ecx
+ call _SFMTRandomInitByArray
+ add esp, 20
+ ret
+;_SFMTgenRandomInitByArray ENDP
+
+
+; extern "C" double SFMTgenRandom();
+_SFMTgenRandom: ; generate random float with 52 bits resolution
+ LOADOFFSET2ECX SFMTInstance ; Get address of SFMTInstance into ecx
+ jmp SFMTRandom_reg ; random bits
+;_SFMTgenRandom ENDP
+
+
+; extern "C" long double SFMTgenRandomL();
+_SFMTgenRandomL: ; generate random float with 63 bits resolution
+ LOADOFFSET2ECX SFMTInstance ; Get address of SFMTInstance into ecx
+ jmp SFMTRandomL_reg ; random bits
+;_SFMTgenRandomL ENDP
+
+
+; extern "C" int SFMTgenIRandom (int min, int max);
+_SFMTgenIRandom:
+ mov eax, [esp+4] ; min
+ mov edx, [esp+8] ; max
+ LOADOFFSET2ECX SFMTInstance ; Get address of SFMTInstance into ecx
+ push edx
+ push eax
+ push ecx ; Pthis
+ call _SFMTIRandom ; continue in _SFMTIRandom
+ add esp, 12
+ ret
+;_SFMTgenIRandom ENDP
+
+
+; extern "C" int SFMTgenIRandomX (int min, int max);
+_SFMTgenIRandomX:
+ mov eax, [esp+4] ; min
+ mov edx, [esp+8] ; max
+ LOADOFFSET2ECX SFMTInstance ; Get address of SFMTInstance into ecx
+ push edx
+ push eax
+ push ecx ; Pthis
+ call _SFMTIRandomX ; continue in _SFMTIRandomX
+ add esp, 12
+ ret
+;_SFMTgenIRandomX ENDP
+
+
+; extern "C" uint32_t SFMTgenBRandom();
+_SFMTgenBRandom: ; generate 32 random bits
+ LOADOFFSET2ECX SFMTInstance ; Get address of SFMTInstance into ecx
+ jmp SFMTBRandom_reg ; random bits
+;_SFMTgenBRandom ENDP
+
+
+
+%IFDEF WINDOWS
+; -----------------------------------------------------------------
+; Single-threaded DLL versions for SFMT generator, Windows only
+; -----------------------------------------------------------------
+
+; extern "C" void __stdcall SFMTgenRandomInitD(int seed, int IncludeMother = 0);
+_SFMTgenRandomInitD@8:
+ mov eax, [esp+4] ; seed
+ mov edx, [esp+8] ; IncludeMother
+ push edx
+ push eax
+ push SFMTSize ; ThisSize
+ push SFMTInstance ; Pthis
+ call _SFMTRandomInit
+ add esp, 16
+ ret 8
+;_SFMTgenRandomInitD@8 ENDP
+
+
+
+; extern "C" void __stdcall SFMTgenRandomInitByArrayD
+; (int const seeds[], int NumSeeds, int IncludeMother = 0);
+_SFMTgenRandomInitByArrayD@12:
+ mov eax, [esp+4] ; seeds
+ mov ecx, [esp+8] ; NumSeeds
+ mov edx, [esp+12] ; IncludeMother
+ push edx
+ push ecx
+ push eax
+ push SFMTSize ; ThisSize
+ push SFMTInstance
+ call _SFMTRandomInitByArray
+ add esp, 20
+ ret 12
+;_SFMTgenRandomInitByArrayD@12 ENDP
+
+
+
+; extern "C" double __stdcall SFMTgenRandomD(); // Output random float
+_SFMTgenRandomD@0: ; generate random float with 52 bits resolution
+ mov ecx, SFMTInstance
+ jmp SFMTRandom_reg ; random bits
+;_SFMTgenRandomD@0 ENDP
+
+
+; extern "C" long double __stdcall SFMTgenRandomLD();
+_SFMTgenRandomLD@0: ; generate random float with 63 bits resolution
+ mov ecx, SFMTInstance
+ jmp SFMTRandomL_reg ; random bits
+;_SFMTgenRandomLD@0 ENDP
+
+
+; extern "C" int __stdcall SFMTgenIRandomD (int min, int max);
+_SFMTgenIRandomD@8:
+ mov eax, [esp+4] ; min
+ mov edx, [esp+8] ; max
+ push edx
+ push eax
+ push SFMTInstance
+ call _SFMTIRandom ; continue in _SFMTIRandom
+ add esp, 12
+ ret 8
+;_SFMTgenIRandomD@8 ENDP
+
+
+; extern "C" int __stdcall SFMTgenIRandomXD (int min, int max);
+_SFMTgenIRandomXD@8:
+ mov eax, [esp+4] ; min
+ mov edx, [esp+8] ; max
+ push edx
+ push eax
+ push SFMTInstance
+ call _SFMTIRandomX ; continue in _SFMTIRandomX
+ add esp, 12
+ ret 8
+;_SFMTgenIRandomXD@8 ENDP
+
+
+
+; extern "C" int __stdcall SFMTgenIRandomDX (int min, int max);
+_SFMTgenIRandomDX@8:
+ mov eax, [esp+4] ; min
+ mov edx, [esp+8] ; max
+ push edx
+ push eax
+ push SFMTInstance
+ call _SFMTIRandomX ; continue in _SFMTIRandomX
+ add esp, 12
+ ret 8
+;_SFMTgenIRandomDX@8 ENDP
+
+
+; extern "C" unsigned int __stdcall SFMTgenBRandomD();
+_SFMTgenBRandomD@0: ; generate 32 random bits
+ mov ecx, SFMTInstance
+ jmp SFMTBRandom_reg ; random bits
+;_SFMTgenBRandomD@0 ENDP
+
+%ENDIF ; WINDOWS
+
+%IFDEF POSITIONINDEPENDENT
+get_thunk_ecx: ; load caller address into ecx for position-independent code
+ mov ecx, [esp]
+ ret
+
+get_thunk_edi: ; load caller address into edi for position-independent code
+ mov edi, [esp]
+ ret
+%ENDIF ; POSITIONINDEPENDENT
+
+;END
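
A minimal usage sketch of the single-threaded static-link interface, based on the extern "C" prototypes quoted in the comments above; the default argument IncludeMother = 0 is written out because C has no default arguments:

    #include <stdio.h>
    #include <stdint.h>

    extern void     SFMTgenRandomInit(int seed, int IncludeMother);
    extern int      SFMTgenIRandom(int min, int max);
    extern double   SFMTgenRandom(void);
    extern uint32_t SFMTgenBRandom(void);

    int main(void)
    {
        SFMTgenRandomInit(1234, 0);                          /* SFMT only, no Mother-Of-All */
        printf("bits : %08X\n", (unsigned)SFMTgenBRandom()); /* 32 random bits */
        printf("float: %f\n", SFMTgenRandom());              /* uniform in [0,1) */
        printf("die  : %d\n", SFMTgenIRandom(1, 6));         /* uniform integer in [1,6] */
        return 0;
    }
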
diff --git a/asmlibSrc/sfmt64.asm b/asmlibSrc/sfmt64.asm
new file mode 100755
index 0000000..24bde65
--- /dev/null
+++ b/asmlibSrc/sfmt64.asm
@@ -0,0 +1,908 @@
+; ----------------------------- SFMT64.ASM ---------------------------
+; Author: Agner Fog
+; Date created: 2008-11-01
+; Last modified: 2013-12-15
+; Project: randoma library of random number generators
+; Source URL: www.agner.org/random
+; Description:
+; Random number generator of type SIMD-oriented Fast Mersenne Twister (SFMT)
+; (Mutsuo Saito and Makoto Matsumoto: "SIMD-oriented Fast Mersenne Twister:
+; a 128-bit Pseudorandom Number Generator", Monte Carlo and Quasi-Monte
+; Carlo Methods 2006, Springer, 2008, pp. 607-622).
+;
+; 64-bit mode version for x86-64 compatible microprocessors.
+; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
+; ----------------------------------------------------------------------
+
+default rel
+
+global SFMTRandomInit, SFMTRandomInitByArray, SFMTBRandom, SFMTRandom
+global SFMTRandomL, SFMTIRandom, SFMTIRandomX, SFMTgenRandomInit
+global SFMTgenRandomInitByArray, SFMTgenRandom, SFMTgenRandomL, SFMTgenIRandom
+global SFMTgenIRandomX, SFMTgenBRandom
+%IFDEF WINDOWS
+global SFMTgenRandomInitD, SFMTgenRandomInitByArrayD, SFMTgenIRandomD
+global SFMTgenIRandomXD, SFMTgenRandomD, SFMTgenBRandomD
+%ENDIF
+
+
+extern InstructionSet
+
+; structure definition and constants:
+%INCLUDE "randomah.asi"
+
+
+section .data
+align 16
+; Data for single instance of random number generator
+SFMTInstance: ISTRUC CRandomSFMTA
+; Size of structure
+IEND
+SFMTSize equ $-SFMTInstance
+
+
+align 16
+; Initialization constants for Mother-Of-All:
+InitMother DD 2111111111, 0, 1492, 0, 1776, 0, 5115, 0
+; Initialization Mask for SFMT:
+InitMask DD SFMT_MASK
+; Period certification vector for SFMT:
+InitParity DD SFMT_PARITY
+
+
+SECTION .CODE align=16 ; code segment
+
+
+; ---------------------------------------------------------------
+; Thread-safe static link versions for SFMT
+; ---------------------------------------------------------------
+
+; extern "C" void SFMTRandomInit(void * Pthis, int ThisSize, int seed, int IncludeMother = 0);
+; Parameters:
+; par1 = Pthis
+; par2d = ThisSize
+; par3d = seed
+; par4d = IncludeMother
+
+SFMTRandomInit:
+ cmp par2d, SFMTSize
+ jb Error ; Error exit if buffer too small
+ push rbx
+
+ ; Align by 16. Will overlap part of Fill if Pthis unaligned
+ and par1, -16
+
+ xor eax, eax
+ test par4d, par4d ; IncludeMother
+ setnz al ; convert any nonzero value to 1
+ ; Store USEMOTHER
+ mov [par1+CRandomSFMTA.USEMOTHER], eax
+
+ mov eax, par3d ; seed
+ xor ebx, ebx ; loop counter i
+ jmp L002 ; go into seeding loop
+
+L001: ; seeding loop for SFMT
+ ; y = factor * (y ^ (y >> 30)) + (++i);
+ call InitSubf0 ; randomization subfunction
+L002: mov [par1+rbx*4+CRandomSFMTA.STATE],eax ; initialize state
+ cmp ebx, SFMT_N*4 - 1
+ jb L001
+
+ ; Put 5 more values into Mother-Of-All generator
+ call InitSubf0
+ mov [par1+CRandomSFMTA.M0], eax
+ call InitSubf0
+ mov [par1+CRandomSFMTA.M1], eax
+ call InitSubf0
+ mov [par1+CRandomSFMTA.M2], eax
+ call InitSubf0
+ mov [par1+CRandomSFMTA.M3], eax
+ call InitSubf0
+ mov [par1+CRandomSFMTA.MC], eax
+
+ ; more initialization and period certification
+ call InitAndPeriod
+
+ pop rbx
+ ret
+;SFMTRandomInit ENDP
+
+Error: ; Error exit
+ xor eax, eax
+ div eax ; Divide by 0
+ ret
+
+; Subfunction used by SFMTRandomInit
+InitSubf0: ; private
+; y = 1812433253 * (y ^ (y >> 30)) + (++i);
+; input parameters:
+; eax = y
+; ebx = i
+; output:
+; eax = new y
+; ebx = i+1
+; edx modified
+ mov edx, eax
+ shr eax, 30
+ xor eax, edx
+ imul eax, 1812433253
+ inc ebx
+ add eax, ebx
+ ret
+;InitSubf0 endp
+
+; Subfunction used by SFMTRandomInitByArray
+InitSubf1: ; private
+; r = 1664525U * (r ^ (r >> 27));
+; input parameters:
+; eax = r
+; output:
+; eax = new r
+; r10 modified
+ mov r10d, eax
+ shr eax, 27
+ xor eax, r10d
+ imul eax, 1664525
+ ret
+;InitSubf1 endp
+
+; Subfunction used by SFMTRandomInitByArray
+InitSubf2: ; private
+; r = 1566083941U * (r ^ (r >> 27));
+; input parameters:
+; eax = r
+; output:
+; eax = new r
+; r10 modified
+ mov r10d, eax
+ shr eax, 27
+ xor eax, r10d
+ imul eax, 1566083941
+ ret
+;InitSubf2 endp
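
Restated in C, the three seeding subfunctions above are small integer recurrences (sketch; InitSubf0 also increments the loop counter, which is left to the caller here):

    #include <stdint.h>

    static uint32_t init_subf0(uint32_t y, uint32_t i) { return 1812433253u * (y ^ (y >> 30)) + i; }
    static uint32_t init_subf1(uint32_t r) { return 1664525u * (r ^ (r >> 27)); }
    static uint32_t init_subf2(uint32_t r) { return 1566083941u * (r ^ (r >> 27)); }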
+
+
+; Subfunction for initialization and period certification, except seeding
+; par1 = aligned pointer to CRandomSFMTA
+InitAndPeriod: ; private
+ push rbx
+
+ ; initialize constants for Mother-Of-All
+ movaps xmm0, oword [InitMother]
+ movaps oword [par1+CRandomSFMTA.MF3], xmm0
+ movaps xmm0, oword [InitMother+16]
+ movaps oword [par1+CRandomSFMTA.MF1], xmm0
+
+ ; initialize constants for SFMT
+ movaps xmm0, oword [InitMask]
+ movaps oword [par1+CRandomSFMTA.AMASK], xmm0
+
+ ; initialize various variables
+ xor eax, eax
+ mov dword [par1+CRandomSFMTA.one], eax
+ mov dword [par1+4+CRandomSFMTA.one], 3FF00000H
+ mov dword [par1+CRandomSFMTA.LASTINTERVAL], eax
+
+ ; get instruction set
+ push par1
+ call InstructionSet
+ pop par1
+ mov [par1+CRandomSFMTA.Instset], eax
+
+ ; Period certification
+ ; Compute parity of STATE[0-4] & InitParity
+ movaps xmm1, oword [par1+CRandomSFMTA.STATE]
+ andps xmm1, oword [InitParity]
+ movhlps xmm2, xmm1 ; high qword
+ xorps xmm1, xmm2 ; xor two qwords
+ pshufd xmm2, xmm1, 1 ; high dword
+ xorps xmm1, xmm2 ; xor two dwords
+ movd eax, xmm1 ; do rest of xor in eax
+ mov edx, eax
+ shr eax, 16
+ xor eax, edx ; xor two words
+ xor al, ah ; xor two bytes
+ jpo L008 ; parity odd: period OK
+
+ ; parity even: period not OK
+ ; Find a nonzero dword in period certification vector
+ xor ebx, ebx ; loop counter
+ lea rdx, [InitParity]
+L005: mov eax, [rdx+rbx*4] ; InitParity[i]
+ test eax, eax
+ jnz L006
+ inc ebx
+ ; assume that there is a nonzero dword in InitParity
+ jmp L005 ; loop until nonzero found
+
+L006: ; find first nonzero bit in eax
+ bsf edx, eax
+ ; flip the corresponding bit in STATE
+ btc [par1+rbx*4+CRandomSFMTA.STATE], edx
+
+L008: cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ je L009
+ call Mother_Next ; Make first random number ready
+
+L009: ; Generate first random numbers and set IX = 0
+ call SFMT_Generate
+ pop rbx
+ ret
+;InitAndPeriod endp
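
A C sketch of the period certification above: the parity of (state[0..3] AND the parity vector) must be odd; if it is even, the lowest set bit of the first nonzero parity word is flipped in the state. The parity vector itself (SFMT_PARITY, loaded from InitParity) is defined in randomah.asi and not repeated here:

    #include <stdint.h>

    static void certify_period_sketch(uint32_t state[4], const uint32_t parity[4])
    {
        uint32_t x = 0;
        int i;
        for (i = 0; i < 4; i++)
            x ^= state[i] & parity[i];
        x ^= x >> 16;  x ^= x >> 8;  x ^= x >> 4;  x ^= x >> 2;  x ^= x >> 1;
        if ((x & 1) == 0) {                                    /* parity even: period not OK */
            for (i = 0; i < 4; i++) {
                if (parity[i]) {                               /* first nonzero parity word */
                    state[i] ^= parity[i] & (0u - parity[i]);  /* flip its lowest set bit */
                    break;
                }
            }
        }
    }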
+
+
+; extern "C" void SFMTRandomInitByArray
+; (void * Pthis, int ThisSize, int const seeds[], int NumSeeds, int IncludeMother = 0);
+; // Seed by more than 32 bits
+SFMTRandomInitByArray:
+; Parameters
+; par1 = Pthis
+; par2d = ThisSize
+; par3 = seeds
+; par4d = NumSeeds
+; par5d = IncludeMother
+
+; define constants:
+SFMT_SIZE equ SFMT_N*4 ; number of 32-bit integers in state
+
+%IF SFMT_SIZE >= 623
+ SFMT_LAG equ 11
+%ELIF SFMT_SIZE >= 68
+ SFMT_LAG equ 7
+%ELIF SFMT_SIZE >= 39
+ SFMT_LAG equ 5
+%ELSE
+ SFMT_LAG equ 3
+%ENDIF
+
+SFMT_MID equ ((SFMT_SIZE - SFMT_LAG) / 2)
+
+ xor eax, eax
+ cmp par5d, eax ; IncludeMother (parameter is on stack if windows)
+ setnz al ; convert any nonzero value to 1
+
+ push rbx
+ push rbp
+
+ cmp par2d, SFMTSize ; ThisSize
+ jb Error ; Error exit if buffer too small
+
+ ; Align by 16. Will overlap part of Fill if Pthis unaligned
+ and par1, -16
+
+ ; Store USEMOTHER
+ mov [par1+CRandomSFMTA.USEMOTHER], eax
+
+; 1. loop: Fill state vector with random numbers from NumSeeds
+; r = NumSeeds;
+; for (i = 0; i < SFMT_N*4; i++) {
+; r = factor * (r ^ (r >> 30)) + i;
+; sta[i] = r;}
+
+ mov eax, par4d ; r = NumSeeds
+ xor ebx, ebx ; i
+L100: mov par2d, eax
+ shr eax, 30
+ xor eax, par2d
+ imul eax, 1812433253
+ add eax, ebx
+ mov [par1+rbx*4+CRandomSFMTA.STATE], eax
+ inc ebx
+ cmp ebx, SFMT_SIZE
+ jb L100
+
+ ; count = max(NumSeeds,size-1)
+ mov eax, SFMT_SIZE - 1
+ mov r11d, par4d ; NumSeeds
+ cmp r11d, eax
+ cmovb r11d, eax
+
+; 2. loop: Fill state vector with random numbers from seeds[]
+; for (i = 1, j = 0; j < count; j++) {
+; r = func1(sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size]);
+; sta[(i + mid) % size] += r;
+; if (j < NumSeeds) r += seeds[j]
+; r += i;
+; sta[(i + mid + lag) % size] += r;
+; sta[i] = r;
+; i = (i + 1) % size;
+; }
+ ; register use:
+ ; par1 = Pthis
+ ; par2 = j
+ ; par3 = seeds
+ ; par4 = NumSeeds
+ ; eax = r
+ ; ebx = i
+ ; ebp = (i + mid) % size, (i + mid + lag) % size
+ ; r10 = (i + size - 1) % size
+ ; r11 = count
+
+ xor par2d, par2d ; j = 0
+ lea ebx, [par2+1] ; i = 1
+
+L101: ; r = sta[i] ^ sta[(i + mid) % size] ^ sta[(i + size - 1) % size];
+ mov eax, [par1+rbx*4+CRandomSFMTA.STATE] ; sta[i]
+ lea ebp, [rbx+SFMT_MID]
+ cmp ebp, SFMT_SIZE
+ jb L102
+ sub ebp, SFMT_SIZE
+L102: xor eax, [par1+rbp*4+CRandomSFMTA.STATE] ; sta[(i + mid) % size]
+ lea r10d, [rbx+SFMT_SIZE-1]
+ cmp r10d, SFMT_SIZE
+ jb L103
+ sub r10d, SFMT_SIZE
+L103: xor eax, [par1+r10*4+CRandomSFMTA.STATE] ; sta[(i + size - 1) % size]
+
+ ; r = func1(r) = (r ^ (r >> 27)) * 1664525U;
+ call InitSubf1
+
+ ; sta[(i + mid) % size] += r;
+ add [par1+rbp*4+CRandomSFMTA.STATE], eax
+
+ ; if (j < NumSeeds) r += seeds[j]
+ cmp par2d, par4d
+ jnb L104
+ add eax, [par3+par2*4]
+L104:
+ ; r += i;
+ add eax, ebx
+
+ ; sta[(i + mid + lag) % size] += r;
+ lea ebp, [rbx+SFMT_MID+SFMT_LAG]
+ cmp ebp, SFMT_SIZE
+ jb L105
+ sub ebp, SFMT_SIZE
+L105: add [par1+rbp*4+CRandomSFMTA.STATE], eax
+
+ ;sta[i] = r;
+ mov [par1+rbx*4+CRandomSFMTA.STATE], eax
+
+ ; i = (i + 1) % size;
+ inc ebx
+ cmp ebx, SFMT_SIZE
+ jb L106
+ sub ebx, SFMT_SIZE
+L106:
+ ; j++, loop while j < count
+ inc par2d
+ cmp par2d, r11d
+ jb L101
+
+; 3. loop: Randomize some more
+; for (j = 0; j < size; j++) {
+; r = func2(sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]);
+; sta[(i + mid) % size] ^= r;
+; r -= i;
+; sta[(i + mid + lag) % size] ^= r;
+; sta[i] = r;
+; i = (i + 1) % size;
+; }
+ ; j = 0
+ xor par2d, par2d
+
+L110: ; r = sta[i] + sta[(i + mid) % size] + sta[(i + size - 1) % size]
+ mov eax, [par1+rbx*4+CRandomSFMTA.STATE] ; sta[i]
+ lea ebp, [rbx+SFMT_MID]
+ cmp ebp, SFMT_SIZE
+ jb L111
+ sub ebp, SFMT_SIZE
+L111: add eax, [par1+rbp*4+CRandomSFMTA.STATE] ; sta[(i + mid) % size]
+ lea r10d, [rbx+SFMT_SIZE-1]
+ cmp r10d, SFMT_SIZE
+ jb L112
+ sub r10d, SFMT_SIZE
+L112: add eax, [par1+r10*4+CRandomSFMTA.STATE] ; sta[(i + size - 1) % size]
+
+ ; r = func2(r) = (x ^ (x >> 27)) * 1566083941U;
+ call InitSubf2
+
+ ; sta[(i + mid) % size] ^= r;
+ xor [par1+rbp*4+CRandomSFMTA.STATE], eax
+
+ ; r -= i;
+ sub eax, ebx
+
+ ; sta[(i + mid + lag) % size] ^= r;
+ lea ebp, [rbx+SFMT_MID+SFMT_LAG]
+ cmp ebp, SFMT_SIZE
+ jb L113
+ sub ebp, SFMT_SIZE
+L113: xor [par1+rbp*4+CRandomSFMTA.STATE], eax
+
+ ; sta[i] = r;
+ mov [par1+rbx*4+CRandomSFMTA.STATE], eax
+
+ ; i = (i + 1) % size;
+ inc ebx
+ cmp ebx, SFMT_SIZE
+ jb L114
+ sub ebx, SFMT_SIZE
+L114:
+ ; j++, loop while j < size
+ inc par2d
+ cmp par2d, SFMT_SIZE
+ jb L110
+
+ ; if (UseMother) {
+ cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ jz L120
+
+; 4. loop: Initialize MotherState
+; for (j = 0; j < 5; j++) {
+; r = func2(r) + j;
+; MotherState[j] = r + sta[2*j];
+; }
+ call InitSubf2
+ mov par2d, [par1+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.M0], par2d
+ call InitSubf2
+ inc eax
+ mov par2d, [par1+8+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.M1], par2d
+ call InitSubf2
+ add eax, 2
+ mov par2d, [par1+16+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.M2], par2d
+ call InitSubf2
+ add eax, 3
+ mov par2d, [par1+24+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.M3], par2d
+ call InitSubf2
+ add eax, 4
+ mov par2d, [par1+32+CRandomSFMTA.STATE]
+ add par2d, eax
+ mov [par1+CRandomSFMTA.MC], par2d
+
+L120: ; More initialization and period certification
+ call InitAndPeriod
+
+ pop rbp
+ pop rbx
+ ret
+;SFMTRandomInitByArray ENDP
+
+
+Mother_Next: ; private
+; Internal procedure: advance Mother-Of-All generator
+; The random value is in M0
+; par1 = aligned pointer to structure CRandomSFMTA
+; eax, par1, xmm0 unchanged
+
+ movdqa xmm1, oword [par1+CRandomSFMTA.M3] ; load M3,M2
+ movdqa xmm2, oword [par1+CRandomSFMTA.M1] ; load M1,M0
+ movhps qword [par1+CRandomSFMTA.M3], xmm1 ; M3=M2
+ movq qword [par1+CRandomSFMTA.M2], xmm2 ; M2=M1
+ movhps qword [par1+CRandomSFMTA.M1], xmm2 ; M1=M0
+ pmuludq xmm1, oword [par1+CRandomSFMTA.MF3] ; M3*MF3, M2*MF2
+ pmuludq xmm2, oword [par1+CRandomSFMTA.MF1] ; M1*MF1, M0*MF0
+ paddq xmm1, xmm2 ; P3+P1, P2+P0
+ movhlps xmm2, xmm1 ; Get high qword
+ movq xmm3, qword [par1+CRandomSFMTA.MC] ; +carry
+ paddq xmm1, xmm3
+ paddq xmm1, xmm2 ; P0+P1+P2+P3
+ movq qword [par1+CRandomSFMTA.M0], xmm1 ; Store new M0 and carry
+ ret
+;Mother_Next endp
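
The SIMD code above advances the Mother-Of-All generator. Written out in scalar C with the factors loaded from InitMother (2111111111, 1492, 1776, 5115), the step is (sketch; m[0] is the newest value, m[3] the oldest):

    #include <stdint.h>

    static uint32_t mother_next_sketch(uint32_t m[4], uint32_t *carry)
    {
        uint64_t sum = 2111111111ULL * m[3] + 1492ULL * m[2]
                     + 1776ULL * m[1] + 5115ULL * m[0] + *carry;
        m[3] = m[2];  m[2] = m[1];  m[1] = m[0];   /* shift the history */
        m[0] = (uint32_t)sum;                      /* new random value (M0) */
        *carry = (uint32_t)(sum >> 32);            /* new carry (MC) */
        return m[0];
    }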
+
+
+align 16
+SFMT_Generate: ; private
+; void CRandomSFMT::Generate() {
+; Fill state array with new random numbers
+
+ push rbx
+
+ ; register use
+ ; par1 = Pthis (rcx or rdi)
+ ; edx = i*16 + offset state
+ ; eax, ebx = loop end
+ ; xmm1 = r1
+ ; xmm2 = r2 = r
+ ; xmm0, xmm3 = scratch
+
+ ; r1 = state[SFMT_N*16 - 2];
+ ; r2 = state[SFMT_N*16 - 1];
+ movdqa xmm1, oword [par1+(SFMT_N-2)*16+CRandomSFMTA.STATE]
+ movdqa xmm2, oword [par1+(SFMT_N-1)*16+CRandomSFMTA.STATE]
+ mov edx, CRandomSFMTA.STATE
+
+;static inline __m128i sfmt_recursion(__m128i const &a, __m128i const &b,
+;__m128i const &c, __m128i const &d, __m128i const &mask) {
+; __m128i a1, b1, c1, d1, z1, z2;
+; b1 = _mm_srli_epi32(b, SFMT_SR1);
+; a1 = _mm_slli_si128(a, SFMT_SL2);
+; c1 = _mm_srli_si128(c, SFMT_SR2);
+; d1 = _mm_slli_epi32(d, SFMT_SL1);
+; b1 = _mm_and_si128(b1, mask);
+; z1 = _mm_xor_si128(a, a1);
+; z2 = _mm_xor_si128(b1, d1);
+; z1 = _mm_xor_si128(z1, c1);
+; z2 = _mm_xor_si128(z1, z2);
+; return z2;}
+
+; for (i = 0; i < SFMT_N - SFMT_M; i++) {
+; r = sfmt_recursion(state[i], state[i + SFMT_M], r1, r2, mask);
+; state[i] = r;
+; r1 = r2;
+; r2 = r;
+; }
+
+ mov eax, (SFMT_N-SFMT_M)*16 + CRandomSFMTA.STATE ; first loop end
+ mov ebx, SFMT_N*16 + CRandomSFMTA.STATE ; second loop end
+
+; first i loop from 0 to SFMT_N - SFMT_M
+align 8
+L201: movdqa xmm0, oword [par1+rdx+SFMT_M*16] ; b
+ psrld xmm0, SFMT_SR1 ; b1
+ pand xmm0, oword [par1+CRandomSFMTA.AMASK] ; b1
+ movdqa xmm3, oword [par1+rdx] ; a
+ pxor xmm0, xmm3
+ pslldq xmm3, SFMT_SL2 ; a1
+ psrldq xmm1, SFMT_SR2 ; c1, c = r1
+ pxor xmm0, xmm3
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm2 ; r1 = r2
+ pslld xmm2, SFMT_SL1 ; d1, d = r2
+ pxor xmm2, xmm0 ; r2 = r
+ ; state[i] = r;
+ movdqa oword [par1+rdx], xmm2
+
+ ; i++ while i < SFMT_N - SFMT_M
+ add edx, 16
+ cmp edx, eax
+ jb L201
+
+;align 16
+L202: ; second i loop from SFMT_N - SFMT_M + 1 to SFMT_N
+ movdqa xmm0, oword [par1+rdx+(SFMT_M-SFMT_N)*16]; b
+ psrld xmm0, SFMT_SR1 ; b1
+ pand xmm0, oword [par1+CRandomSFMTA.AMASK] ; b1
+ movdqa xmm3, oword [par1+rdx] ; a
+ pxor xmm0, xmm3
+ pslldq xmm3, SFMT_SL2 ; a1
+ psrldq xmm1, SFMT_SR2 ; c1, c = r1
+ pxor xmm0, xmm3
+ pxor xmm0, xmm1
+ movdqa xmm1, xmm2 ; r1 = r2
+ pslld xmm2, SFMT_SL1 ; d1, d = r2
+ pxor xmm2, xmm0 ; r2 = r
+ ; state[i] = r;
+ movdqa oword [par1+rdx], xmm2
+
+ ; i++ while i < SFMT_N
+ add edx, 16
+ cmp edx, ebx
+ jb L202
+
+ ; Check if initialized
+L208: cmp dword [par1+CRandomSFMTA.AMASK], SFMT_MASK1
+ jne Error ; Make error if not initialized
+
+ ; ix = 0;
+ mov dword [par1+CRandomSFMTA.IX], 0 ; point to start of STATE buffer
+ pop rbx
+ ret
+;SFMT_Generate endp
+
+
+; extern "C" unsigned int SFMTBRandom(void * Pthis); // Output random bits
+
+SFMTBRandom: ; generate random bits
+ ; Align Pthis by 16. Will overlap part of Fill1 if Pthis unaligned
+ and par1, -16
+
+SFMTBRandom_reg: ; Entry for register parameters, used internally
+
+; if (ix >= SFMT_N*4) Generate();
+ mov edx, [par1+CRandomSFMTA.IX]
+ cmp edx, SFMT_N*16
+ jnb NeedGenerate
+
+; y = ((uint32_t*)state)[ix++];
+ mov eax, dword [par1+rdx+CRandomSFMTA.STATE]
+ add edx, 4
+ mov [par1+CRandomSFMTA.IX], edx
+
+AfterGenerate:
+; if (UseMother) y += MotherBits();
+ cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ jz NoMother
+
+ ; add mother bits
+ add eax, [par1+CRandomSFMTA.M0] ; Add Mother random number
+ call Mother_Next ; Make next Mother random number ready
+
+NoMother: ; return y;
+ ret
+
+NeedGenerate:
+ call SFMT_Generate ; generate SFMT_N*4 random dwords
+ mov eax, [par1+CRandomSFMTA.STATE]
+ mov dword [par1+CRandomSFMTA.IX], 4
+ jmp AfterGenerate
+
+;SFMTBRandom ENDP
+
+
+; extern "C" double SFMTRandom (void * Pthis); // Output random float
+SFMTRandom: ; generate random float with 52 bits resolution
+ ; Align Pthis by 16. Will overlap part of Fill1 if Pthis unaligned
+ and par1, -16
+
+SFMTRandom_reg: ; internal entry point
+
+; check if there are at least 64 random bits in state buffer
+; if (ix >= SFMT_N*4-1) Generate();
+ mov edx, [par1+CRandomSFMTA.IX]
+ cmp edx, SFMT_N*16-4
+ jnb L303
+
+L301: ; read 64 random bits
+ movq xmm0, qword [par1+rdx+CRandomSFMTA.STATE]
+ add edx, 8
+ mov [par1+CRandomSFMTA.IX], edx
+
+ ; combine with Mother-Of-All generator?
+ cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ jz L302 ; ConvertToFloat
+
+ ; add mother bits
+ movq xmm1, qword [par1+CRandomSFMTA.M0] ; Mother random number MC and M0
+ pshuflw xmm1, xmm1, 01001011B ; Put M0 before MC, and swap the words in MC
+ paddq xmm0, xmm1 ; Add SFMT and Mother outputs
+ call Mother_Next ; Make next Mother random number ready
+
+L302: ; ConvertToFloat
+ psrlq xmm0, 12 ; align with mantissa field of double precision float
+ movsd xmm1, [par1+CRandomSFMTA.one] ; 1.0 double precision
+ por xmm0, xmm1 ; insert exponent to get 1.0 <= x < 2.0
+ subsd xmm0, xmm1 ; subtract 1.0 to get 0.0 <= x < 1.0
+ ret ; return value
+
+L303: ; NeedGenerateR
+ call SFMT_Generate ; generate SFMT_N*4 random dwords
+ xor edx, edx
+ jmp L301
+
+;SFMTRandom ENDP
+
+
+; extern "C" long double SFMTRandomL (void * Pthis);
+SFMTRandomL: ; generate random float with 63 bits resolution
+ ; Align Pthis by 16.
+ and par1, -16
+
+SFMTRandomL_reg: ; internal entry point
+
+; check if there are at least 64 random bits in state buffer
+; if (ix >= SFMT_N*4-1) Generate();
+ mov edx, [par1+CRandomSFMTA.IX]
+ cmp edx, SFMT_N*16-4
+ jnb L403
+
+L401: ; read 64 random bits
+ movq xmm0, qword [par1+rdx+CRandomSFMTA.STATE]
+ add edx, 8
+ mov [par1+CRandomSFMTA.IX], edx
+
+ ; combine with Mother-Of-All generator?
+ cmp dword [par1+CRandomSFMTA.USEMOTHER], 0
+ jz L402
+
+ ; add mother bits
+ movq xmm1, qword [par1+CRandomSFMTA.M0] ; Mother random number MC and M0
+ pshuflw xmm1, xmm1, 01001011B ; Put M0 before MC, and swap the words in MC
+ paddq xmm0, xmm1 ; Add SFMT and Mother outputs
+ call Mother_Next ; Make next Mother random number ready
+
+L402: ;ConvertToFloat
+ sub rsp, 16 ; make space for long double
+ psrlq xmm0, 1 ; align with mantissa field of long double
+ pcmpeqw xmm1, xmm1 ; all 1's
+ psllq xmm1, 63 ; create a 1 in bit 63
+ por xmm0, xmm1 ; bit 63 is always 1 in long double
+ movq qword [rsp], xmm0 ; store mantissa
+ mov dword [rsp+8], 3FFFH ; exponent
+ fld tword [rsp] ; load long double
+ fsub qword [par1+CRandomSFMTA.one] ; subtract 1.0 to get 0.0 <= x < 1.0
+ pcmpeqw xmm0, xmm0 ; make a NAN for compilers that don't support long double
+ add rsp, 16
+ ret ; return value in st(0)
+
+L403: ;NeedGenerateR
+ call SFMT_Generate ; generate SFMT_N*4 random dwords
+ xor edx, edx
+ jmp L401
+;SFMTRandomL ENDP
+
+
+; extern "C" int SFMTIRandom (void * Pthis, int min, int max); // Output random integer
+
+SFMTIRandom:
+; par1 = Pthis
+; par2d = min
+; par3d = max
+
+ ; Align Pthis by 16.
+ and par1, -16
+ push par2 ; save min, max
+ push par3
+ call SFMTBRandom_reg ; random bits
+ pop rdx ; max
+ pop rcx ; min
+ sub edx, ecx
+ jl short WrongInterval ; max < min
+ inc edx ; max - min + 1
+ mul edx ; multiply random number by interval and truncate
+ lea eax, [rdx+rcx] ; add min to high dword of product
+ ret
+WrongInterval:
+ mov eax, 80000000H ; error exit
+ ret
+;SFMTIRandom ENDP
+
+
+; extern "C" int SFMTIRandomX (void * Pthis, int min, int max); // Output random integer
+
+SFMTIRandomX:
+; par1 = Pthis
+; par2d = min
+; par3d = max
+
+ push rbx
+ ; Align Pthis by 16.
+ and par1, -16
+
+ mov ebx, par3d
+ sub ebx, par2d ; max - min
+ jle short M30 ; max <= min (signed)
+ inc ebx ; interval = max - min + 1
+
+ ; if (interval != LastInterval) {
+ cmp ebx, [par1+CRandomSFMTA.LASTINTERVAL]
+ je M10
+ ; need to calculate new rejection limit
+ ; RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;}
+ xor eax, eax ; 0
+ lea edx, [eax+1] ; 1
+ div ebx ; (would give overflow if interval = 1)
+ mul ebx
+ dec eax
+ mov [par1+CRandomSFMTA.RLIMIT], eax
+ mov [par1+CRandomSFMTA.LASTINTERVAL], ebx
+M10: mov ebx, par2d ; save min
+
+M20: ; do { // Rejection loop
+ call SFMTBRandom_reg ; random bits (par1 is preserved)
+ ; longran = (uint64)BRandom() * interval;
+ mul dword [par1+CRandomSFMTA.LASTINTERVAL]
+ ; } while (remainder > RLimit);
+ cmp eax, [par1+CRandomSFMTA.RLIMIT]
+ ja M20
+
+ ; return (int32)iran + min
+ lea eax, [rbx+rdx]
+ pop rbx
+ ret
+
+M30: jl M40
+ ; max = min. Return min
+ mov eax, par2d
+ pop rbx
+ ret ; max = min exit
+
+M40: ; max < min: error
+ mov eax, 80000000H ; error exit
+ pop rbx
+ ret
+;SFMTIRandomX ENDP
+
+
+
+; -------------------------------------------------------------------------
+; Single-threaded static link versions for SFMT generator
+; -------------------------------------------------------------------------
+
+; extern "C" void SFMTgenRandomInit(int seed, int IncludeMother = 0);
+SFMTgenRandomInit:
+%IFDEF WINDOWS
+SFMTgenRandomInitD:
+%ENDIF
+; par1d = seed
+; par2d = IncludeMother
+
+ ; set up parameters for call SFMTRandomInit
+ mov par4d, par2d ; IncludeMother
+ mov par3d, par1d ; seed
+ mov par2d, SFMTSize ; ThisSize
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTRandomInit
+;SFMTgenRandomInit ENDP
+
+
+; extern "C" void SFMTgenRandomInitByArray(int const seeds[], int NumSeeds, int IncludeMother = 0);
+SFMTgenRandomInitByArray:
+; par1 = seeds
+; par2d = NumSeeds
+; par3d = IncludeMother
+
+ ; set up parameters for call SFMTRandomInitByArray
+%IFDEF WINDOWS
+SFMTgenRandomInitByArrayD:
+ push par3 ; IncludeMother on stack
+ sub rsp, 32 ; empty shadow space
+ mov par4d, par2d ; NumSeeds
+ mov par3, par1 ; seeds
+ mov par2d, SFMTSize ; ThisSize
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ call SFMTRandomInitByArray
+ add rsp, 40
+ ret
+%ELSE ; UNIX
+ mov par5d, par3d ; IncludeMother in register
+ mov par4d, par2d ; NumSeeds
+ mov par3, par1 ; seeds
+ mov par2d, SFMTSize ; ThisSize
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTRandomInitByArray
+%ENDIF
+;SFMTgenRandomInitByArray ENDP
+
+
+; extern "C" double SFMTgenRandom();
+SFMTgenRandom: ; generate random float with 52 bits resolution
+%IFDEF WINDOWS
+SFMTgenRandomD:
+%ENDIF
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTRandom_reg ; random bits
+;SFMTgenRandom ENDP
+
+
+; extern "C" long double SFMTgenRandomL();
+SFMTgenRandomL: ; generate random float with 63 bits resolution
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTRandomL_reg ; random bits
+;SFMTgenRandomL ENDP
+
+
+; extern "C" int SFMTgenIRandom (int min, int max);
+SFMTgenIRandom:
+%IFDEF WINDOWS
+SFMTgenIRandomD:
+%ENDIF
+ mov par3d, par2d
+ mov par2d, par1d
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTIRandom ; continue in SFMTIRandom
+;SFMTgenIRandom ENDP
+
+
+; extern "C" int SFMTgenIRandomX (int min, int max);
+SFMTgenIRandomX:
+%IFDEF WINDOWS
+SFMTgenIRandomXD:
+%ENDIF
+ mov par3d, par2d
+ mov par2d, par1d
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTIRandomX ; continue in SFMTIRandomX
+;SFMTgenIRandomX ENDP
+
+
+; extern "C" uint32_t SFMTgenBRandom();
+SFMTgenBRandom: ; generate 32 random bits
+%IFDEF WINDOWS
+SFMTgenBRandomD:
+%ENDIF
+ lea par1, [SFMTInstance] ; Get address of SFMTInstance into par1
+ jmp SFMTBRandom_reg ; random bits
+;SFMTgenBRandom ENDP
+
+;END
diff --git a/asmlibSrc/strcat32.asm b/asmlibSrc/strcat32.asm
new file mode 100755
index 0000000..523808d
--- /dev/null
+++ b/asmlibSrc/strcat32.asm
@@ -0,0 +1,60 @@
+;************************* strcat32.asm ************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2008-10-16
+; Description:
+; Faster version of the standard strcat function:
+; char * strcat(char * dest, const char * src);
+; Copies zero-terminated string from src to end of dest.
+;
+; Overriding standard function strcat:
+; The alias ?OVR_strcat is changed to _strcat in the object file if
+; it is desired to override the standard library function strcat.
+;
+; Optimization:
+; Uses optimized functions A_strlen and A_memcpy.
+;
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global _A_strcat: function ; Function _A_strcat
+global ?OVR_strcat: function ; ?OVR removed if standard function strcat overridden
+
+; Imported from strlen32.asm
+extern _A_strlen
+
+; Imported from memcpy32.asm
+extern _A_memcpy
+
+
+SECTION .text align=16
+
+; extern "C" char * A_strcat(char * dest, const char * src) {
+; memcpy(dest+strlen(dest), src, strlen(src)+1);
+; return dest
+; }
+
+; Function entry:
+_A_strcat:
+?OVR_strcat:
+
+ mov eax, [esp+8] ; src
+ push eax
+ call _A_strlen ; length of src
+ inc eax ; include terminating zero in length
+ push eax ; strlen(src)+1
+ mov edx, [esp+4+8] ; dest
+ push edx
+ call _A_strlen ; length of dest
+ pop edx ; dest. Assume unchanged by _A_strlen
+ add edx, eax ; dest+strlen(dest)
+ mov ecx, [esp+8+8] ; src
+ ; strlen(src)+1 is on stack
+ push ecx ; src
+ push edx ; dest+strlen(dest)
+ call _A_memcpy ; copy
+ add esp, 16 ; clean up stack
+ mov eax, [esp+4] ; return dest
+ ret
+
+;_A_strcat ENDP
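
A_strcat follows the standard strcat contract; the header above gives the C equivalent memcpy(dest+strlen(dest), src, strlen(src)+1). A minimal usage sketch based on that prototype:

    #include <stdio.h>

    extern char * A_strcat(char * dest, const char * src);  /* prototype from the header above */

    int main(void)
    {
        char buf[32] = "Hello, ";   /* dest must have room for the combined string */
        A_strcat(buf, "world");
        printf("%s\n", buf);        /* prints: Hello, world */
        return 0;
    }
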
diff --git a/asmlibSrc/strcat64.asm b/asmlibSrc/strcat64.asm
new file mode 100755
index 0000000..bf9de71
--- /dev/null
+++ b/asmlibSrc/strcat64.asm
@@ -0,0 +1,68 @@
+;************************* strcat64.asm ************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2008-10-16
+; Description:
+; Faster version of the standard strcat function:
+; char * strcat(char *dest, const char * src);
+; Copies zero-terminated string from src to end of dest.
+;
+; Overriding standard function strcat:
+; The alias ?OVR_strcat is changed to _strcat in the object file if
+; it is desired to override the standard library function strcat.
+;
+; Optimization:
+; Uses optimized functions A_strlen and A_memcpy.
+;
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_strcat: function ; Function A_strcat
+global ?OVR_strcat: function ; ?OVR removed if standard function strcat overridden
+
+; Imported from strlen64.asm
+extern A_strlen
+
+; Imported from memcpy64.asm
+extern A_memcpy
+
+
+SECTION .text align=16
+
+; extern "C" char * A_strcat(char * dest, const char * src) {
+; memcpy(dest+strlen(dest), src, strlen(src)+1);
+; return dest
+; }
+
+; Function entry:
+A_strcat:
+?OVR_strcat:
+
+%IFDEF WINDOWS
+%define Rpar1 rcx ; function parameter 1
+%define Rpar2 rdx ; function parameter 2
+%define Rpar3 r8 ; function parameter 3
+%ENDIF
+%IFDEF UNIX
+%define Rpar1 rdi ; function parameter 1
+%define Rpar2 rsi ; function parameter 2
+%define Rpar3 rdx ; function parameter 3
+%ENDIF
+
+ push Rpar1 ; dest
+ push Rpar2 ; src
+ call A_strlen ; length of dest
+ push rax ; strlen(dest)
+ mov Rpar1, [rsp+8] ; src
+ call A_strlen ; length of src
+ pop Rpar1 ; strlen(dest)
+ pop Rpar2 ; src
+ add Rpar1, [rsp] ; dest + strlen(dest)
+ lea Rpar3, [rax+1] ; strlen(src)+1
+ call A_memcpy ; copy
+ pop rax ; return dest
+ ret
+
+;A_strcat ENDP
diff --git a/asmlibSrc/strcmp32.asm b/asmlibSrc/strcmp32.asm
new file mode 100755
index 0000000..008d1fb
--- /dev/null
+++ b/asmlibSrc/strcmp32.asm
@@ -0,0 +1,177 @@
+;************************* strcmp32.asm ************************************
+; Author: Agner Fog
+; Date created: 2011-07-14
+; Last modified: 2012-07-07
+
+; Description:
+; Faster version of the standard strcmp function:
+; int A_strcmp(const char * s1, const char * s2);
+; Tests if two strings are equal. The strings must be zero-terminated.
+;
+; Note that this function may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Overriding standard function strcmp:
+; The alias ?OVR_strcmp is changed to _strcmp in the object file if
+; it is desired to override the standard library function strcmp.
+; Overriding is disabled because the function may read beyond the end of a
+; string, while the standard strcmp function is guaranteed to work in all cases.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for the 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 - 2012 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+%define ALLOW_OVERRIDE 0 ; Set to one if override of standard function desired
+
+global _A_strcmp: function ; Function A_strcmp
+
+; Direct entries to CPU-specific versions
+global _strcmpGeneric: function ; Generic version for processors without SSE4.2
+global _strcmpSSE42: function ; Version for processors with SSE4.2
+
+; Imported from instrset32.asm:
+extern _InstructionSet ; Instruction set for CPU dispatcher
+
+section .text
+
+; strcmp function
+
+%if ALLOW_OVERRIDE
+global ?OVR_strcmp: function
+?OVR_strcmp:
+%endif
+
+_A_strcmp: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+ jmp dword [strcmpDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE ; Position-independent code
+
+ call get_thunk_edx ; get reference point for position-independent code
+RP: ; reference point edx = offset RP
+
+; Make the following instruction with address relative to RP:
+ jmp near [edx+strcmpDispatch-RP]
+
+%ENDIF
+
+align 16
+_strcmpSSE42:
+ mov eax, [esp+4] ; string 1
+ mov edx, [esp+8] ; string 2
+ push ebx
+ mov ebx, -16 ; offset counter
+
+compareloop:
+ add ebx, 16 ; increment offset
+ movdqu xmm1, [eax+ebx] ; read 16 bytes of string 1
+ pcmpistri xmm1, [edx+ebx], 00011000B ; unsigned bytes, equal each, invert. returns index in ecx
+ jnbe compareloop ; jump if not carry flag and not zero flag
+
+ jnc equal
+notequal: ; strings are not equal
+ add ecx, ebx ; offset to first differing byte
+ movzx eax, byte [eax+ecx] ; compare bytes
+ movzx edx, byte [edx+ecx]
+ sub eax, edx
+ pop ebx
+ ret
+
+equal:
+ xor eax, eax ; strings are equal
+ pop ebx
+ ret
+
+;_strcmpSSE42: endp
+
+
+align 16
+_strcmpGeneric: ; generic version
+; This is a very simple solution. There is not much gained by using SSE2 or anything complicated
+ mov ecx, [esp+4] ; string 1
+ mov edx, [esp+8] ; string 2
+
+_compareloop:
+ mov al, [ecx]
+ cmp al, [edx]
+ jne _notequal
+ test al, al
+ jz _equal
+ inc ecx
+ inc edx
+ jmp _compareloop
+
+_equal: xor eax, eax ; strings are equal
+ ret
+
+_notequal: ; strings are not equal
+ movzx eax, byte [ecx] ; compare first differing byte
+ movzx edx, byte [edx]
+ sub eax, edx
+ ret
+
+;_strcmpGeneric end
+
+
+%IFDEF POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+ mov edx, [esp]
+ ret
+%ENDIF
+
+; CPU dispatching for strcmp. This is executed only once
+strcmpCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ ; get supported instruction set
+ call _InstructionSet
+ ; Point to generic version of strcmp
+ mov ecx, _strcmpGeneric
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strcmp
+ mov ecx, _strcmpSSE42
+Q100: mov [strcmpDispatch], ecx
+ ; Continue in appropriate version of strcmp
+ jmp ecx
+
+%ELSE ; Position-independent version
+ ; get supported instruction set
+ call _InstructionSet
+ call get_thunk_edx
+RP2: ; reference point edx
+ ; Point to generic version of strcmp
+ lea ecx, [edx+_strcmpGeneric-RP2]
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strcmp
+ lea ecx, [edx+_strcmpSSE42-RP2]
+Q100: mov [edx+strcmpDispatch-RP2], ecx
+ ; Continue in appropriate version of strcmp
+ jmp ecx
+%ENDIF
+
+SECTION .data
+
+; Pointer to appropriate version. Initially points to dispatcher
+strcmpDispatch DD strcmpCPUDispatch
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+ DD 0, 0
+%ENDIF
+
+
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names .bss$zzz etc. under Windows to make sure it is placed
+; last, but the assembler gives sections with unknown names wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+SECTION .bss
+ dq 0, 0
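
The strcmpCPUDispatch/strcmpDispatch pair above implements a one-time, self-patching CPU dispatch: the function pointer initially targets the dispatcher, which queries InstructionSet once, redirects the pointer to the best implementation, and jumps to it. A C sketch of the same pattern (helper names here are illustrative, not the library's):

    extern int InstructionSet(void);                       /* from instrset32.asm */
    extern int strcmpGeneric(const char *, const char *);  /* direct entries exported above */
    extern int strcmpSSE42(const char *, const char *);

    static int strcmp_dispatcher(const char *s1, const char *s2);
    static int (*strcmp_ptr)(const char *, const char *) = strcmp_dispatcher;

    static int strcmp_dispatcher(const char *s1, const char *s2)
    {
        /* executed only on the first call */
        strcmp_ptr = (InstructionSet() >= 10) ? strcmpSSE42 : strcmpGeneric;  /* 10 = SSE4.2 */
        return strcmp_ptr(s1, s2);
    }

    int A_strcmp_sketch(const char *s1, const char *s2)
    {
        return strcmp_ptr(s1, s2);     /* indirect call, like "jmp [strcmpDispatch]" */
    }
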
diff --git a/asmlibSrc/strcmp64.asm b/asmlibSrc/strcmp64.asm
new file mode 100755
index 0000000..2dc7738
--- /dev/null
+++ b/asmlibSrc/strcmp64.asm
@@ -0,0 +1,162 @@
+;************************* strcmp64.asm ************************************
+; Author: Agner Fog
+; Date created: 2011-07-14
+; Last modified: 2012-07-07
+
+; Description:
+; Faster version of the standard strcmp function:
+; int A_strcmp(const char * s1, const char * s2);
+; Tests if two strings are equal. The strings must be zero-terminated.
+;
+; Note that this function may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Overriding standard function strcmp:
+; The alias ?OVR_strcmp is changed to _strcmp in the object file if
+; it is desired to override the standard library function strcmp.
+; Overriding is disabled because the function may read beyond the end of a
+; string, while the standard strcmp function is guaranteed to work in all cases.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for the 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 - 2012 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+%define ALLOW_OVERRIDE 0 ; Set to one if override of standard function desired
+
+global A_strcmp: function ; Function A_strcmp
+
+; Direct entries to CPU-specific versions
+global strcmpGeneric: function ; Generic version for processors without SSE4.2
+global strcmpSSE42: function ; Version for processors with SSE4.2
+
+; Imported from instrset64.asm:
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+section .text
+
+; strcmp function
+
+%if ALLOW_OVERRIDE
+global ?OVR_strcmp: function
+?OVR_strcmp:
+%endif
+
+A_strcmp: ; function dispatching
+
+ jmp near [strcmpDispatch] ; Go to appropriate version, depending on instruction set
+
+align 16
+strcmpSSE42:
+%ifdef WINDOWS
+ push rdi
+ mov rdi, rcx
+%define rs1 rdi ; pointer to string 1
+%define rs2 rdx ; pointer to string 2
+%define par1 rcx
+%define par2 rdx
+%else ; UNIX
+%define rs1 rdi
+%define rs2 rsi
+%define par1 rdi
+%define par2 rsi
+%endif
+
+ mov rax, -16 ; offset counter
+compareloop:
+ add rax, 16 ; increment offset
+ movdqu xmm1, [rs1+rax] ; read 16 bytes of string 1
+ pcmpistri xmm1, [rs2+rax], 00011000B ; unsigned bytes, equal each, invert. returns index in ecx
+ jnbe compareloop ; jump if not carry flag and not zero flag
+
+ jnc equal
+notequal:
+ ; strings are not equal
+ add rcx, rax ; offset to first differing byte
+ movzx eax, byte [rs1+rcx] ; compare first differing byte
+ movzx edx, byte [rs2+rcx]
+ sub rax, rdx
+%ifdef WINDOWS
+ pop rdi
+%endif
+ ret
+
+equal:
+ xor eax, eax ; strings are equal
+%ifdef WINDOWS
+ pop rdi
+%endif
+ ret
+
+;strcmpSSE42: endp
+
+
+align 16
+strcmpGeneric: ; generic version
+; This is a very simple solution. There is not much gained by using SSE2 or anything complicated
+%ifdef WINDOWS
+%define ss1 rcx ; pointer to string 1
+%define ss2 rdx ; pointer to string 2
+%else ; UNIX
+%define ss1 rdi
+%define ss2 rsi
+%endif
+
+
+_compareloop:
+ mov al, [ss1]
+ cmp al, [ss2]
+ jne _notequal
+ test al, al
+ jz _equal
+ inc ss1
+ inc ss2
+ jmp _compareloop
+
+_equal: xor eax, eax ; strings are equal
+ ret
+
+_notequal: ; strings are not equal
+ movzx eax, byte [ss1] ; compare first differing byte
+ movzx edx, byte [ss2]
+ sub eax, edx
+ ret
+
+;strcmpGeneric end
+
+
+; CPU dispatching for strcmp. This is executed only once
+strcmpCPUDispatch:
+ ; get supported instruction set
+ push par1
+ push par2
+ call InstructionSet
+ pop par2
+ pop par1
+ ; Point to generic version of strcmp
+ lea r9, [strcmpGeneric]
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strcmp
+ lea r9, [strcmpSSE42]
+Q100: mov [strcmpDispatch], r9
+ ; Continue in appropriate version of strcmp
+ jmp r9
+
+SECTION .data
+
+; Pointer to appropriate version. Initially points to dispatcher
+strcmpDispatch DQ strcmpCPUDispatch
+
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names .bss$zzz etc. under Windows to make sure it is placed
+; last, but the assembler gives sections with unknown names wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+SECTION .bss
+ dq 0, 0
diff --git a/asmlibSrc/strcountset32.asm b/asmlibSrc/strcountset32.asm
new file mode 100755
index 0000000..f9770ba
--- /dev/null
+++ b/asmlibSrc/strcountset32.asm
@@ -0,0 +1,194 @@
+;************************* strcountinset32.asm *********************************
+; Author: Agner Fog
+; Date created: 2011-07-20
+; Last modified: 2011-08-21
+
+; Description:
+; size_t strCountInSet(const char * str, const char * set);
+;
+; Counts how many characters in str belong to the set defined by set.
+; Both strings are zero-terminated ASCII strings.
+;
+; Note that this function may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+global _strCountInSet: function
+
+; Direct entries to CPU-specific versions
+global _strCountInSetGeneric: function
+global _strCountInSetSSE42: function
+
+; Imported from instrset32.asm:
+extern _InstructionSet ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+; strCountInSet function
+;******************************************************************************
+
+_strCountInSet: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+ jmp near [strCountInSetDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE ; Position-independent code
+
+ call get_thunk_edx ; get reference point for position-independent code
+RP1: ; reference point edx = offset RP1
+
+; Make the following instruction with address relative to RP1:
+ jmp near [edx+strCountInSetDispatch-RP1]
+
+%ENDIF
+
+
+align 16
+_strCountInSetSSE42: ; SSE4.2 version
+ push esi
+ push edi
+ mov esi, [esp+12] ; str
+ mov edi, [esp+16] ; set
+ xor eax, eax ; match counter
+str_next:
+ movdqu xmm2, [esi] ; str
+ movdqu xmm1, [edi] ; set
+ pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+ movd ecx, xmm0
+ jns set_extends ; the set is more than 16 bytes
+ jz str_finished
+set_finished:
+ popcnt ecx, ecx
+ add eax, ecx
+ ; first 16 characters checked, continue with next 16 characters (a terminating zero would never match)
+ add esi, 16 ; next 16 bytes of str
+ jmp str_next
+
+set_and_str_finished:
+ or ecx, edx ; accumulate matches
+str_finished:
+ popcnt ecx, ecx
+ add eax, ecx
+ pop edi
+ pop esi
+ ret
+
+set_loop:
+ or ecx, edx ; accumulate matches
+set_extends:
+ add edi, 16
+ movdqu xmm1, [edi] ; next part of set
+ pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+ movd edx, xmm0
+ jns set_loop
+ jz set_and_str_finished
+ mov edi, [esp+16] ; restore set pointer
+ or ecx, edx ; accumulate matches
+ jmp set_finished
+
+;_strCountInSetSSE42 end
+
+;******************************************************************************
+; strCountInSet function generic
+;******************************************************************************
+
+align 8
+_strCountInSetGeneric: ; Generic version
+ push esi
+ push edi
+ mov esi, [esp+12] ; str pointer
+ mov edi, [esp+16] ; set pointer
+ xor eax, eax ; match counter
+str_next10:
+ mov cl, [esi] ; read one byte from str
+ test cl, cl
+ jz str_finished10 ; str finished
+set_next10:
+ mov dl, [edi]
+ test dl, dl
+ jz set_finished10
+ inc edi ; next in set
+ cmp cl, dl
+ jne set_next10
+ ; character match found, goto next character
+ inc eax ; count match
+ inc esi
+ jmp str_next10
+
+set_finished10: ; end of set, no match found
+ mov edi, [esp+16] ; restore set pointer
+ inc esi
+ jmp str_next10 ; next in string
+
+str_finished10: ; end of str, count is in eax
+ pop edi
+ pop esi
+ ret
+;_strCountInSetGeneric end
+
+
+; ********************************************************************************
+
+%IFDEF POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+ mov edx, [esp]
+ ret
+%ENDIF
+
+; ********************************************************************************
+; CPU dispatching for strCountInSet. This is executed only once
+; ********************************************************************************
+
+strCountInSetCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ ; get supported instruction set
+ call _InstructionSet
+ ; Point to generic version of strCountInSet
+ mov ecx, _strCountInSetGeneric
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strCountInSet
+ mov ecx, _strCountInSetSSE42
+Q100: mov [strCountInSetDispatch], ecx
+ ; Continue in appropriate version
+ jmp ecx
+
+%ELSE ; Position-independent version
+ ; get supported instruction set
+ call _InstructionSet
+ call get_thunk_edx
+RP11: ; reference point edx
+ ; Point to generic version
+ lea ecx, [edx+_strCountInSetGeneric-RP11]
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strCountInSet
+ lea ecx, [edx+_strCountInSetSSE42-RP11]
+Q100: mov [edx+strCountInSetDispatch-RP11], ecx
+ ; Continue in appropriate version
+ jmp ecx
+%ENDIF
+
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+strCountInSetDispatch DD strCountInSetCPUDispatch
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+ DD 0, 0
+%ENDIF
+
+SECTION .bss
+dq 0, 0
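
A reference C version of the generic loop above; the SSE4.2 path computes the same count 16 bytes at a time with pcmpistrm and popcnt. Sketch only, named strCountInSet_ref to avoid clashing with the exported symbol:

    #include <stddef.h>

    size_t strCountInSet_ref(const char *str, const char *set)
    {
        size_t count = 0;
        for (; *str; str++) {                        /* one pass over str */
            const char *p;
            for (p = set; *p; p++) {                 /* scan the set for a match */
                if (*p == *str) { count++; break; }  /* count each character at most once */
            }
        }
        return count;
    }
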
diff --git a/asmlibSrc/strcountset64.asm b/asmlibSrc/strcountset64.asm
new file mode 100755
index 0000000..909987c
--- /dev/null
+++ b/asmlibSrc/strcountset64.asm
@@ -0,0 +1,175 @@
+;************************* strcountinset64.asm *********************************
+; Author: Agner Fog
+; Date created: 2011-07-20
+; Last modified: 2011-07-20
+
+; Description:
+; size_t strCountInSet(const char * str, const char * set);
+;
+; Counts how many characters in str belong to the set defined by set.
+; Both strings are zero-terminated ASCII strings.
+;
+; Note that this function may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+default rel
+
+global strCountInSet: function
+
+; Direct entries to CPU-specific versions
+global strCountInSetGeneric: function
+global strCountInSetSSE42: function
+
+; Imported from instrset64.asm:
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+; strCountInSet function
+;******************************************************************************
+%ifdef WINDOWS
+%define par1 rcx
+%define par2 rdx
+%else
+%define par1 rdi
+%define par2 rsi
+%endif
+
+
+strCountInSet: ; function dispatching
+ jmp near [strCountInSetDispatch] ; Go to appropriate version, depending on instruction set
+
+
+align 16
+strCountInSetSSE42: ; SSE4.2 version
+%ifdef WINDOWS
+ push rsi
+ push rdi
+ mov rdi, rcx ; str
+ mov rsi, rdx ; set
+%endif
+ mov r8, rsi
+ xor eax, eax ; match counter
+str_next:
+ movdqu xmm2, [rdi] ; str
+ movdqu xmm1, [rsi] ; set
+ pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+ movd ecx, xmm0
+ jns set_extends ; the set is more than 16 bytes
+ jz str_finished
+set_finished:
+ popcnt ecx, ecx
+ add rax, rcx
+ ; first 16 characters checked, continue with next 16 characters (a terminating zero would never match)
+ add rdi, 16 ; next 16 bytes of str
+ jmp str_next
+
+set_and_str_finished:
+ or ecx, edx ; accumulate matches
+str_finished:
+ popcnt ecx, ecx
+ add rax, rcx
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ ret
+
+set_loop:
+ or ecx, edx ; accumulate matches
+set_extends:
+ add rsi, 16
+ movdqu xmm1, [rsi] ; next part of set
+ pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+ movd edx, xmm0
+ jns set_loop
+ jz set_and_str_finished
+ mov rsi, r8 ; restore set pointer
+ or ecx, edx ; accumulate matches
+ jmp set_finished
+
+;strCountInSetSSE42 end
+
+;******************************************************************************
+; strCountInSet function generic
+;******************************************************************************
+
+align 8
+strCountInSetGeneric: ; Generic version
+%ifdef WINDOWS
+ push rsi
+ push rdi
+ mov rdi, rcx ; str
+ mov rsi, rdx ; set
+%endif
+ mov r8, rsi
+ xor eax, eax ; match counter
+str_next10:
+ mov cl, [rdi] ; read one byte from str
+ test cl, cl
+ jz str_finished10 ; str finished
+set_next10:
+ mov dl, [rsi]
+ test dl, dl
+ jz set_finished10
+ inc rsi ; next in set
+ cmp cl, dl
+ jne set_next10
+ ; character match found, goto next character
+ inc rax ; count match
+ inc rdi
+ jmp str_next10
+
+set_finished10: ; end of set, no match found
+ mov rsi, r8 ; restore set pointer
+ inc rdi
+ jmp str_next10 ; next in string
+
+str_finished10: ; end of str, count is in eax
+%ifdef WINDOWS
+ pop rdi
+ pop rsi
+%endif
+ ret
+;strCountInSetGeneric end
+
+
+
+; ********************************************************************************
+; CPU dispatching for strCountInSet. This is executed only once
+; ********************************************************************************
+
+strCountInSetCPUDispatch:
+ ; get supported instruction set
+ push par1
+ push par2
+ call InstructionSet
+ pop par2
+ pop par1
+ ; Point to generic version of strCountInSet
+ lea r8, [strCountInSetGeneric]
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strCountInSet
+ lea r8, [strCountInSetSSE42]
+Q100: mov [strCountInSetDispatch], r8
+ ; Continue in appropriate version
+ jmp r8
+
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+strCountInSetDispatch DQ strCountInSetCPUDispatch
+
+SECTION .bss
+dq 0, 0
diff --git a/asmlibSrc/strcountutf832.asm b/asmlibSrc/strcountutf832.asm
new file mode 100755
index 0000000..46910b8
--- /dev/null
+++ b/asmlibSrc/strcountutf832.asm
@@ -0,0 +1,162 @@
+;************************* strcountutf832.asm ***********************************
+; Author: Agner Fog
+; Date created: 2011-07-20
+; Last modified: 2013-09-11
+
+; Description:
+; size_t strcount_UTF8(const char * str);
+; Counts the number of characters in a UTF-8 encoded string.
+;
+; This function does not check whether the string contains valid UTF-8 code; it
+; simply counts all bytes except continuation bytes 10xxxxxxB.
+;
+; Note that this function may read up to 15 bytes beyond the end of the string.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
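+;
+; Equivalent C logic (a sketch of what the generic version below computes; an
+; illustration only, not the library code itself):
+;   size_t n = 0;
+;   for (const unsigned char * p = (const unsigned char *)str; *p; p++) {
+;       n += ((*p & 0xC0) != 0x80);        // count every byte that is not 10xxxxxx
+;   }
+;   return n;
+;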
+
+global _strcount_UTF8: function
+
+; Direct entries to CPU-specific versions
+global _strcount_UTF8Generic: function
+global _strcount_UTF8SSE42: function
+
+; Imported from instrset32.asm:
+extern _InstructionSet ; Instruction set for CPU dispatcher
+
+section .data
+align 16
+byterange: times 8 DB 10000000b, 10111111b ; range for UTF-8 continuation bytes
+
+section .text
+
+;******************************************************************************
+; strcount_UTF8 function
+;******************************************************************************
+
+
+_strcount_UTF8: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+ jmp near [strcount_UTF8Dispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE ; Position-independent code
+
+ call get_thunk_edx ; get reference point for position-independent code
+RP1: ; reference point edx = offset RP1
+
+; Make the following instruction with address relative to RP1:
+ jmp near [edx+strcount_UTF8Dispatch-RP1]
+
+%ENDIF
+
+;******************************************************************************
+; strcount_UTF8 function SSE4.2 version
+;******************************************************************************
+align 16
+_strcount_UTF8SSE42: ; SSE4.2 version
+ mov edx, [esp+4] ; str
+ movdqa xmm1, [byterange] ; define range of continuation bytes to ignore
+ xor ecx, ecx ; character counter
+str_next:
+ pcmpistrm xmm1, [edx], 00110100b; check range, invert valid bits, return bit mask in xmm0
+ movd eax, xmm0
+ jz str_finished ; terminating zero found
+ popcnt eax, eax ; count
+ add ecx, eax
+ add edx, 16
+ jmp str_next
+
+str_finished:
+ popcnt eax, eax
+ add eax, ecx
+ ret
+
+
+;******************************************************************************
+; strcount_UTF8 function generic
+;******************************************************************************
+
+align 8
+_strcount_UTF8Generic:
+ mov edx, [esp+4] ; str
+ xor eax, eax ; character counter
+ xor ecx, ecx ; zero extend cl
+str_next1:
+ mov cl, [edx] ; one byte from string
+ test cl, cl
+ jz str_finished1 ; terminating zero
+ sub cl, 10000000b ; lower limit of continuation bytes
+ cmp cl, 00111111b ; upper limit - lower limit
+ seta cl ; 1 if outside limit (unsigned compare includes negative values as above)
+ add eax, ecx
+ inc edx
+ jmp str_next1
+
+str_finished1:
+ ret
+;_strcount_UTF8Generic end
+
+
+; ********************************************************************************
+
+%IFDEF POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+ mov edx, [esp]
+ ret
+%ENDIF
+
+; ********************************************************************************
+; CPU dispatching for strcount_UTF8. This is executed only once
+; ********************************************************************************
+
+strcount_UTF8CPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ ; get supported instruction set
+ call _InstructionSet
+ ; Point to generic version of strcount_UTF8
+ mov ecx, _strcount_UTF8Generic
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strcount_UTF8
+ mov ecx, _strcount_UTF8SSE42
+Q100: mov [strcount_UTF8Dispatch], ecx
+ ; Continue in appropriate version
+ jmp ecx
+
+%ELSE ; Position-independent version
+ ; get supported instruction set
+ call _InstructionSet
+ call get_thunk_edx
+RP11: ; reference point edx
+ ; Point to generic version
+ lea ecx, [edx+_strcount_UTF8Generic-RP11]
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strcount_UTF8
+ lea ecx, [edx+_strcount_UTF8SSE42-RP11]
+Q100: mov [edx+strcount_UTF8Dispatch-RP11], ecx
+ ; Continue in appropriate version
+ jmp ecx
+%ENDIF
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+strcount_UTF8Dispatch DD strcount_UTF8CPUDispatch
+
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+ DD 0, 0
+%ENDIF
+
+SECTION .bss
+dq 0, 0
diff --git a/asmlibSrc/strcountutf864.asm b/asmlibSrc/strcountutf864.asm
new file mode 100755
index 0000000..b155e57
--- /dev/null
+++ b/asmlibSrc/strcountutf864.asm
@@ -0,0 +1,127 @@
+;************************* strcountutf864.asm ***********************************
+; Author: Agner Fog
+; Date created: 2011-07-20
+; Last modified: 2013-09-11
+
+; Description:
+; size_t strcount_UTF8(const char * str);
+; Counts the number of characters in a UTF-8 encoded string.
+;
+; This function does not check whether the string contains valid UTF-8 code; it
+; simply counts all bytes except continuation bytes 10xxxxxxB.
+;
+; Note that this function may read up to 15 bytes beyond the end of the string.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; CPU dispatching included for SSE2 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
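+;
+; Usage sketch (hypothetical caller, assuming only the prototype above):
+;   size_t n = strcount_UTF8("na\xC3\xAFve");   // "naive" with a 2-byte i-diaeresis: n == 5, but 6 bytes
+;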
+default rel
+
+global strcount_UTF8: function
+
+; Direct entries to CPU-specific versions
+global strcount_UTF8Generic: function
+global strcount_UTF8SSE42: function
+
+; Imported from instrset64.asm:
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+section .data
+align 16
+byterange: times 8 DB 10000000b, 10111111b ; range for UTF-8 continuation bytes
+
+section .text
+
+;******************************************************************************
+; strcount_UTF8 function
+;******************************************************************************
+
+
+strcount_UTF8: ; function dispatching
+
+ jmp near [strcount_UTF8Dispatch] ; Go to appropriate version, depending on instruction set
+
+
+;******************************************************************************
+; strcount_UTF8 function SSE4.2 version
+;******************************************************************************
+
+%ifdef WINDOWS
+%define par1 rcx
+%else
+%define par1 rdi
+%endif
+
+align 16
+strcount_UTF8SSE42: ; SSE4.2 version
+ movdqa xmm1, [byterange] ; define range of continuation bytes to ignore
+ xor edx, edx ; character counter
+str_next:
+ pcmpistrm xmm1, [par1], 00110100b; check range, invert valid bits, return bit mask in xmm0
+ movd eax, xmm0
+ jz str_finished ; terminating zero found
+ popcnt eax, eax ; count
+ add rdx, rax
+ add par1, 16
+ jmp str_next
+
+str_finished:
+ popcnt eax, eax
+ add rax, rdx
+ ret
+
+
+;******************************************************************************
+; strcount_UTF8 function generic
+;******************************************************************************
+
+align 8
+strcount_UTF8Generic:
+ xor eax, eax ; character counter
+ xor edx, edx ; zero extend dl
+str_next1:
+ mov dl, [par1] ; one byte from string
+ test dl, dl
+ jz str_finished1 ; terminating zero
+ sub dl, 10000000b ; lower limit of continuation bytes
+ cmp dl, 00111111b ; upper limit - lower limit
+ seta dl ; 1 if outside limit (unsigned compare includes negative values as above)
+ add rax, rdx
+ inc par1
+ jmp str_next1
+
+str_finished1:
+ ret
+;strcount_UTF8Generic end
+
+
+; ********************************************************************************
+; CPU dispatching for strcount_UTF8. This is executed only once
+; ********************************************************************************
+
+strcount_UTF8CPUDispatch:
+ ; get supported instruction set
+ push par1
+ call InstructionSet
+ pop par1
+ ; Point to generic version of strcount_UTF8
+ lea rdx, [strcount_UTF8Generic]
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strcount_UTF8
+ lea rdx, [strcount_UTF8SSE42]
+Q100: mov [strcount_UTF8Dispatch], rdx
+ ; Continue in appropriate version
+ jmp rdx
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+strcount_UTF8Dispatch DQ strcount_UTF8CPUDispatch
+
+SECTION .bss
+dq 0, 0
diff --git a/asmlibSrc/strcpy32.asm b/asmlibSrc/strcpy32.asm
new file mode 100755
index 0000000..0062114
--- /dev/null
+++ b/asmlibSrc/strcpy32.asm
@@ -0,0 +1,53 @@
+;************************* strcpy32.asm ************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2011-07-01
+; Description:
+; Faster version of the standard strcpy function:
+; char * A_strcpy(char * dest, const char * src);
+; Copies zero-terminated string from src to dest, including terminating zero.
+;
+; Overriding standard function strcpy:
+; The alias ?OVR_strcpy is changed to _strcpy in the object file if
+; it is desired to override the standard library function strcpy.
+;
+; Optimization:
+; Uses optimized functions A_strlen and A_memcpy.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
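+;
+; Usage sketch (hypothetical caller; the destination buffer must have room for
+; the source string plus the terminating zero):
+;   char buf[16];
+;   A_strcpy(buf, "asmlib");            // buf now holds "asmlib" plus '\0'
+;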
+
+global _A_strcpy: function ; Function _A_strcpy
+global ?OVR_strcpy: function ; ?OVR removed if standard function strcpy overridden
+
+; Imported from strlen32.asm
+extern _A_strlen
+
+; Imported from memcpy32.asm
+extern _A_memcpy
+
+
+SECTION .text align=16
+
+; extern "C" char * A_strcpy(char * dest, const char * src) {
+; return memcpy(dest, src, strlen(src)+1);
+; }
+
+; Function entry:
+_A_strcpy:
+?OVR_strcpy:
+
+ mov eax, [esp+8] ; src
+ push eax
+ call _A_strlen ; length of src
+ pop ecx ; ecx = src. Assume unchanged by _A_strlen
+ inc eax ; include terminating zero in length
+ mov edx, [esp+4] ; dest
+ push eax ; length+1
+ push ecx ; src
+ push edx ; dest
+ call _A_memcpy ; copy
+ add esp, 12 ; clean up stack
+ ret
+
+;_A_strcpy ENDP
diff --git a/asmlibSrc/strcpy64.asm b/asmlibSrc/strcpy64.asm
new file mode 100755
index 0000000..f7a6836
--- /dev/null
+++ b/asmlibSrc/strcpy64.asm
@@ -0,0 +1,64 @@
+;************************* strcpy64.asm ************************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2011-07-01
+; Description:
+; Faster version of the standard strcpy function:
+; char * A_strcpy(char * dest, const char * src);
+; Copies zero-terminated string from src to dest, including terminating zero.
+;
+; Overriding standard function strcpy:
+; The alias ?OVR_strcpy is changed to _strcpy in the object file if
+; it is desired to override the standard library function strcpy.
+;
+; Optimization:
+; Uses optimized functions A_strlen and A_memcpy. These functions allow
+; calling without proper stack alignment.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+global A_strcpy: function ; Function A_strcpy
+global ?OVR_strcpy: function ; ?OVR removed if standard function strcpy overridden
+
+; Imported from strlen64.asm
+extern A_strlen
+
+; Imported from memcpy64.asm
+extern A_memcpy
+
+
+SECTION .text align=16
+
+; extern "C" char * A_strcpy(char * dest, const char * src) {
+; return memcpy(dest, src, strlen(src)+1);
+; }
+
+; Function entry:
+A_strcpy:
+?OVR_strcpy:
+
+%IFDEF WINDOWS
+%define Rpar1 rcx ; function parameter 1
+%define Rpar2 rdx ; function parameter 2
+%define Rpar3 r8 ; function parameter 3
+%ENDIF
+%IFDEF UNIX
+%define Rpar1 rdi ; function parameter 1
+%define Rpar2 rsi ; function parameter 2
+%define Rpar3 rdx ; function parameter 3
+%ENDIF
+
+ push Rpar1 ; dest
+ push Rpar2 ; src
+ mov Rpar1, Rpar2
+ ; (A_strlen does not require stack alignment)
+ call A_strlen ; length of src
+ lea Rpar3,[rax+1] ; include terminating zero in length
+ pop Rpar2 ; src
+ pop Rpar1 ; dest
+ jmp A_memcpy ; copy and return
+
+;A_strcpy ENDP
diff --git a/asmlibSrc/stricmp32.asm b/asmlibSrc/stricmp32.asm
new file mode 100755
index 0000000..2050b24
--- /dev/null
+++ b/asmlibSrc/stricmp32.asm
@@ -0,0 +1,70 @@
+;************************* stricmpaz32.asm **********************************
+; Author: Agner Fog
+; Date created: 2008-12-05
+; Last modified: 2011-07-01
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Description:
+; Faster version of the standard stricmp or strcasecmp function:
+; int stricmp_az(const char *string1, const char *string2);
+; Compares two zero-terminated strings without case sensitivity.
+; Does not recognize locale-specific characters. A-Z are changed
+; to a-z before comparing, while other upper-case letters are not
+; converted but considered unique.
+;
+; Optimization:
+; SSE4.2 version not implemented because the gain is small.
+;
+; Copyright (c) 2008-2011 GNU General Public License www.gnu.org/licenses/gpl.html
+;******************************************************************************
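+;
+; Behaviour sketch (illustrative values only, based on the description above):
+;   stricmp_az("Hello", "hELLO") == 0   // A-Z folded to a-z before comparing
+;   stricmp_az("abc", "abd") < 0        // negative when string1 sorts lower
+;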
+
+global _A_stricmp: function ; Function _A_stricmp
+
+SECTION .text align=16
+
+; extern "C" int stricmp_az(const char *string1, const char *string2);
+
+_A_stricmp:
+ mov ecx, [esp+4] ; string1
+ mov edx, [esp+8] ; string2
+ sub edx, ecx
+
+L10: mov al, [ecx]
+ cmp al, [ecx+edx]
+ jne L20
+ inc ecx
+ test al, al
+ jnz L10 ; continue with next byte
+
+ ; terminating zero found. Strings are equal
+ xor eax, eax
+ ret
+
+L20: ; bytes are different. check case
+ xor al, 20H ; toggle case
+ cmp al, [ecx+edx]
+ jne L30
+ ; possibly differing only by case. Check if a-z
+ or al, 20H ; upper case
+ sub al, 'a'
+ cmp al, 'z'-'a'
+ ja L30 ; not a-z
+ ; a-z and differing only by case
+ inc ecx
+ jmp L10 ; continue with next byte
+
+L30: ; bytes are different, even after changing case
+ movzx eax, byte [ecx] ; get original value again
+ sub eax, 'A'
+ cmp eax, 'Z' - 'A'
+ ja L40
+ add eax, 20H
+L40: movzx edx, byte [ecx+edx]
+ sub edx, 'A'
+ cmp edx, 'Z' - 'A'
+ ja L50
+ add edx, 20H
+L50: sub eax, edx ; subtract to get result
+ ret
+
+;_A_stricmp END
diff --git a/asmlibSrc/stricmp64.asm b/asmlibSrc/stricmp64.asm
new file mode 100755
index 0000000..ef8d152
--- /dev/null
+++ b/asmlibSrc/stricmp64.asm
@@ -0,0 +1,84 @@
+;************************* stricmpaz64.asm **********************************
+; Author: Agner Fog
+; Date created: 2008-12-05
+; Last modified: 2011-07-01
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Description:
+; Faster version of the standard stricmp or strcasecmp function:
+; int A_stricmp(const char *string1, const char *string2);
+; Compares two zero-terminated strings without case sensitivity.
+; Does not recognize locale-specific characters. A-Z are changed
+; to a-z before comparing, while other upper-case letters are not
+; converted but considered unique.
+;
+; Optimization:
+; SSE4.2 version not implemented because the gain is small.
+;
+; Copyright (c) 2008-2011 GNU General Public License www.gnu.org/licenses/gpl.html
+;******************************************************************************
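+;
+; Per-byte folding rule implemented below (C sketch of the same comparison logic;
+; the helper name 'fold' is illustrative, not part of the library):
+;   unsigned char fold(unsigned char c) {
+;       return (c >= 'A' && c <= 'Z') ? (unsigned char)(c + 0x20) : c;   // only A-Z are lower-cased
+;   }
+;   // A_stricmp returns fold(c1) - fold(c2) at the first position where the folded bytes differ.
+;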
+
+default rel
+
+global A_stricmp: function ; Function A_stricmp
+
+; ***************************************************************************
+; Define registers used for function parameters, used in 64-bit mode only
+; ***************************************************************************
+
+%IFDEF WINDOWS
+ %define par1 rcx ; first parameter
+ %define par2 rdx ; second parameter
+%ENDIF
+
+%IFDEF UNIX
+ %define par1 rdi ; first parameter
+ %define par2 rsi ; second parameter
+%ENDIF
+
+SECTION .text align=16
+
+; extern "C" int A_stricmp(const char *string1, const char *string2);
+
+A_stricmp:
+ sub par2, par1
+
+L10: mov al, [par1] ; string1
+ cmp al, [par1+par2] ; string2
+ jne L20
+ inc par1
+ test al, al
+ jnz L10 ; continue with next byte
+
+ ; terminating zero found. Strings are equal
+ xor eax, eax
+ ret
+
+L20: ; bytes are different. check case
+ xor al, 20H ; toggle case
+ cmp al, [par1+par2]
+ jne L30
+ ; possibly differing only by case. Check if a-z
+ or al, 20H ; upper case
+ sub al, 'a'
+ cmp al, 'z'-'a'
+ ja L30 ; not a-z
+ ; a-z and differing only by case
+ inc par1
+ jmp L10 ; continue with next byte
+
+L30: ; bytes are different, even after changing case
+ movzx eax, byte [par1] ; get original value again
+ sub eax, 'A'
+ cmp eax, 'Z' - 'A'
+ ja L40
+ add eax, 20H ; A-Z, make lower case
+L40: movzx edx, byte [par1+par2]
+ sub edx, 'A'
+ cmp edx, 'Z' - 'A'
+ ja L50
+ add edx, 20H ; A-Z, make lower case
+L50: sub eax, edx ; subtract to get result
+ ret
+
+;A_stricmp END
diff --git a/asmlibSrc/strlen32.asm b/asmlibSrc/strlen32.asm
new file mode 100755
index 0000000..7083590
--- /dev/null
+++ b/asmlibSrc/strlen32.asm
@@ -0,0 +1,182 @@
+;************************** strlen32.asm **********************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2008-10-16
+; Description:
+; Faster version of the standard strlen function:
+; size_t strlen(const char * str);
+; Finds the length of a zero-terminated string of bytes, optimized for speed.
+;
+; Overriding standard function strlen:
+; The alias ?OVR_strlen is changed to _strlen in the object file if
+; it is desired to override the standard library function strlen.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; Internal calls: The parameter on the stack is left unchanged for the sake
+; of calls from strcpy and strcat.
+;
+; Optimization:
+; Uses XMM registers to read 16 bytes at a time, aligned.
+; Misaligned parts of the string are read from the nearest preceding 16-byte boundary
+; and the irrelevant part masked out. It may read both before the beginning of
+; the string and after the end, but it will never load any unnecessary cache
+; line and never trigger a page fault for reading from non-existing memory
+; pages, because it never reads past the nearest following 16-byte boundary.
+; It may, though, trigger a debug watch within the same 16-byte boundary.
+; CPU dispatching included for 386 and SSE2 instruction sets.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
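+;
+; C-level sketch of the aligned-read strategy described above (uses SSE2 intrinsics
+; for illustration; variable names are examples, not the library code):
+;   const char * p = (const char *)((uintptr_t)s & ~(uintptr_t)15);   // round down to 16
+;   unsigned misalign = (unsigned)((uintptr_t)s & 15);
+;   __m128i zero  = _mm_setzero_si128();
+;   unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_load_si128((const __m128i *)p), zero));
+;   mask = (mask >> misalign) << misalign;    // discard bits before the start of the string
+;   // loop over further aligned 16-byte blocks until mask != 0;
+;   // length = (p - s) + index of first set bit in mask
+;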
+
+global _A_strlen: function ; Function _A_strlen
+global ?OVR_strlen: function ; ?OVR removed if standard function strlen overridden
+
+; Imported from instrset32.asm
+extern _InstructionSet ; Instruction set for CPU dispatcher
+
+
+SECTION .text align=16
+
+; extern "C" int strlen (const char * s);
+_A_strlen:
+?OVR_strlen:
+
+%IFNDEF POSITIONINDEPENDENT
+ jmp [strlenDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE ; Position-independent code
+
+ call get_thunk_eax ; get reference point for position-independent code
+RP: ; reference point eax = offset RP
+A020: ; Go here after CPU dispatching
+
+ ; Make the following instruction with address relative to RP:
+ cmp dword [eax-RP+strlenCPUVersion], 1
+ jb strlenCPUDispatch ; First time: strlenCPUVersion = 0, go to dispatcher
+ je strlen386 ; strlenCPUVersion = 1, go to 80386 version
+%ENDIF
+
+; SSE2 version
+strlenSSE2:
+ mov eax, [esp+4] ; get pointer to string
+ mov ecx, eax ; copy pointer
+ pxor xmm0, xmm0 ; set to zero
+ and ecx, 0FH ; lower 4 bits indicate misalignment
+ and eax, -10H ; align pointer by 16
+ movdqa xmm1, [eax] ; read from nearest preceding boundary
+ pcmpeqb xmm1, xmm0 ; compare 16 bytes with zero
+ pmovmskb edx, xmm1 ; get one bit for each byte result
+ shr edx, cl ; shift out false bits
+ shl edx, cl ; shift back again
+ bsf edx, edx ; find first 1-bit
+ jnz A200 ; found
+
+ ; Main loop, search 16 bytes at a time
+A100: add eax, 10H ; increment pointer by 16
+ movdqa xmm1, [eax] ; read 16 bytes aligned
+ pcmpeqb xmm1, xmm0 ; compare 16 bytes with zero
+ pmovmskb edx, xmm1 ; get one bit for each byte result
+ bsf edx, edx ; find first 1-bit
+ ; (moving the bsf out of the loop and using test here would be faster for long strings on old processors,
+ ; but we are assuming that most strings are short, and newer processors have higher priority)
+ jz A100 ; loop if not found
+
+A200: ; Zero-byte found. Compute string length
+ sub eax, [esp+4] ; subtract start address
+ add eax, edx ; add byte index
+ ret
+
+strlen386: ; 80386 version
+ push ebx
+ mov ecx, [esp+8] ; get pointer to string
+ mov eax, ecx ; copy pointer
+ and ecx, 3 ; lower 2 bits of address, check alignment
+ jz L2 ; string is aligned by 4. Go to loop
+ and eax, -4 ; align pointer by 4
+ mov ebx, [eax] ; read from nearest preceding boundary
+ shl ecx, 3 ; mul by 8 = displacement in bits
+ mov edx, -1
+ shl edx, cl ; make byte mask
+ not edx ; mask = 0FFH for false bytes
+ or ebx, edx ; mask out false bytes
+
+ ; check first four bytes for zero
+ lea ecx, [ebx-01010101H] ; subtract 1 from each byte
+ not ebx ; invert all bytes
+ and ecx, ebx ; and these two
+ and ecx, 80808080H ; test all sign bits
+ jnz L3 ; zero-byte found
+
+ ; Main loop, read 4 bytes aligned
+L1: add eax, 4 ; increment pointer by 4
+L2: mov ebx, [eax] ; read 4 bytes of string
+ lea ecx, [ebx-01010101H] ; subtract 1 from each byte
+ not ebx ; invert all bytes
+ and ecx, ebx ; and these two
+ and ecx, 80808080H ; test all sign bits
+ jz L1 ; no zero bytes, continue loop
+
+L3: bsf ecx, ecx ; find right-most 1-bit
+ shr ecx, 3 ; divide by 8 = byte index
+ sub eax, [esp+8] ; subtract start address
+ add eax, ecx ; add index to byte
+ pop ebx
+ ret
+
+
+; CPU dispatching for strlen. This is executed only once
+strlenCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ pushad
+ call _InstructionSet
+ ; Point to generic version of strlen
+ mov dword [strlenDispatch], strlen386
+ cmp eax, 4 ; check SSE2
+ jb M100
+ ; SSE2 supported
+ ; Point to SSE2 version of strlen
+ mov dword [strlenDispatch], strlenSSE2
+M100: popad
+ ; Continue in appropriate version of strlen
+ jmp dword [strlenDispatch]
+
+%ELSE ; Position-independent version
+ pushad
+
+ ; Make the following instruction with address relative to RP:
+ lea ebx, [eax-RP+strlenCPUVersion]
+ ; Now ebx points to strlenCPUVersion.
+
+ call _InstructionSet
+
+ mov byte [ebx], 1 ; Indicate generic version
+ cmp eax, 4 ; check SSE2
+ jb M100
+ ; SSE2 supported
+ mov byte [ebx], 2 ; Indicate SSE2 or later version
+M100: popad
+ jmp A020 ; Go back and dispatch
+
+get_thunk_eax: ; load caller address into eax for position-independent code
+ mov eax, [esp]
+ ret
+
+%ENDIF
+
+SECTION .data
+align 16
+%IFNDEF POSITIONINDEPENDENT
+; Pointer to appropriate version.
+; This initially points to strlenCPUDispatch. strlenCPUDispatch will
+; change this to the appropriate version of strlen, so that
+; strlenCPUDispatch is only executed once:
+strlenDispatch: DD strlenCPUDispatch
+%ELSE ; position-independent
+; CPU version: 0=unknown, 1=80386, 2=SSE2
+strlenCPUVersion: DD 0
+; Fix potential problem in Mac linker
+ DD 0, 0
+%ENDIF
diff --git a/asmlibSrc/strlen64.asm b/asmlibSrc/strlen64.asm
new file mode 100755
index 0000000..005fafd
--- /dev/null
+++ b/asmlibSrc/strlen64.asm
@@ -0,0 +1,84 @@
+;************************** strlen64.asm **********************************
+; Author: Agner Fog
+; Date created: 2008-07-19
+; Last modified: 2008-10-16
+; Description:
+; Faster version of the standard strlen function:
+; size_t strlen(const char * str);
+; Finds the length of a zero-terminated string of bytes, optimized for speed.
+;
+; Overriding standard function strlen:
+; The alias ?OVR_strlen is changed to _strlen in the object file if
+; it is desired to override the standard library function strlen.
+;
+; Calling conventions:
+; Stack alignment is not required. No shadow space or red zone used.
+; Called internally from strcpy and strcat without stack aligned.
+;
+; Optimization:
+; Uses XMM registers to read 16 bytes at a time, aligned.
+; Misaligned parts of the string are read from the nearest preceding 16-byte boundary
+; and the irrelevant part masked out. It may read both before the beginning of
+; the string and after the end, but it will never load any unnecessary cache
+; line and never trigger a page fault for reading from non-existing memory
+; pages, because it never reads past the nearest following 16-byte boundary.
+; It may, though, trigger a debug watch within the same 16-byte boundary.
+;
+; The latest version of this file is available at:
+; www.agner.org/optimize/asmexamples.zip
+; Copyright (c) 2009 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
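+;
+; Usage sketch (hypothetical caller, assuming only the prototype above):
+;   size_t n = A_strlen("asmlib");      // n == 6
+;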
+
+default rel
+
+global A_strlen: function ; Function A_strlen
+global ?OVR_strlen: function ; ?OVR removed if standard function strlen overridden
+
+
+SECTION .text align=16
+
+; extern "C" int strlen (const char * s);
+
+; 64-bit Windows version:
+A_strlen:
+?OVR_strlen:
+
+%IFDEF WINDOWS
+ mov rax, rcx ; get pointer to string from rcx
+ mov r8, rcx ; copy pointer
+%define Rscopy r8 ; Copy of s
+
+%ELSE ; Unix
+ mov rax, rdi ; get pointer to string from rdi
+ mov ecx, edi ; copy pointer (lower 32 bits)
+%define Rscopy rdi ; Copy of s
+%ENDIF
+
+ ; rax = s, ecx = 32 bits of s
+ pxor xmm0, xmm0 ; set to zero
+ and ecx, 0FH ; lower 4 bits indicate misalignment
+ and rax, -10H ; align pointer by 16
+ movdqa xmm1, [rax] ; read from nearest preceding boundary
+ pcmpeqb xmm1, xmm0 ; compare 16 bytes with zero
+ pmovmskb edx, xmm1 ; get one bit for each byte result
+ shr edx, cl ; shift out false bits
+ shl edx, cl ; shift back again
+ bsf edx, edx ; find first 1-bit
+ jnz L2 ; found
+
+ ; Main loop, search 16 bytes at a time
+L1: add rax, 10H ; increment pointer by 16
+ movdqa xmm1, [rax] ; read 16 bytes aligned
+ pcmpeqb xmm1, xmm0 ; compare 16 bytes with zero
+ pmovmskb edx, xmm1 ; get one bit for each byte result
+ bsf edx, edx ; find first 1-bit
+ ; (moving the bsf out of the loop and using test here would be faster for long strings on old processors,
+ ; but we are assuming that most strings are short, and newer processors have higher priority)
+ jz L1 ; loop if not found
+
+L2: ; Zero-byte found. Compute string length
+ sub rax, Rscopy ; subtract start address
+ add rax, rdx ; add byte index
+ ret
+
+;A_strlen ENDP
diff --git a/asmlibSrc/strspn32.asm b/asmlibSrc/strspn32.asm
new file mode 100755
index 0000000..eeb4b89
--- /dev/null
+++ b/asmlibSrc/strspn32.asm
@@ -0,0 +1,338 @@
+;************************* strspn32.asm ************************************
+; Author: Agner Fog
+; Date created: 2011-07-19
+; Last modified: 2011-08-21
+
+; Description:
+; Faster version of the standard strspn and strcspn functions:
+; size_t A_strspn (const char * str, const char * set);
+; size_t A_strcspn(const char * str, const char * set);
+;
+; A_strspn finds the length of the initial portion of str which consists only of
+; characters that are part of set.
+; A_strcspn finds the length of the initial portion of str which consists only of
+; characters that are not part of set.
+;
+; Note that these functions may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Overriding standard functions strspn and strcspn:
+; Overriding is disabled because the functions may read beyond the end of a string,
+; while the standard strspn and strcspn functions are guaranteed to work in all cases.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
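+;
+; Behaviour sketch (illustrative values only, assuming the prototypes above):
+;   A_strspn ("128 bytes", "0123456789") == 3   // length of the leading run of digits
+;   A_strcspn("128 bytes", " \t")        == 3   // length before the first blank
+;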
+%define ALLOW_OVERRIDE 0 ; Set to one if override of standard function desired
+
+global _A_strspn: function
+global _A_strcspn: function
+
+; Direct entries to CPU-specific versions
+global _strspnGeneric: function
+global _strcspnGeneric: function
+global _strspnSSE42: function
+global _strcspnSSE42: function
+
+; Imported from instrset32.asm:
+extern _InstructionSet ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+; strspn function
+;******************************************************************************
+
+%if ALLOW_OVERRIDE
+global ?OVR_strspn: function
+?OVR_strspn:
+%endif
+
+_A_strspn: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+ jmp near [strspnDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE ; Position-independent code
+
+ call get_thunk_edx ; get reference point for position-independent code
+RP1: ; reference point edx = offset RP1
+
+; Make the following instruction with address relative to RP1:
+ jmp near [edx+strspnDispatch-RP1]
+
+%ENDIF
+
+align 16
+_strspnSSE42: ; SSE4.2 version
+ push esi
+ push edi
+ mov esi, [esp+12] ; str
+ mov edi, [esp+16] ; set
+ xor ecx, ecx ; span counter
+str_next:
+ movdqu xmm2, [esi] ; str
+ movdqu xmm1, [edi] ; set
+ pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+ movd eax, xmm0
+ jns set_extends
+set_finished:
+ cmp ax, -1
+ jne str_finished
+ ; first 16 characters matched, continue with next 16 characters (a terminating zero would never match)
+ add esi, 16 ; next 16 bytes of str
+ add ecx, 16 ; count span
+ jmp str_next
+
+str_finished:
+ not eax
+ bsf eax, eax
+ add eax, ecx
+ pop edi
+ pop esi
+ ret
+
+set_loop:
+ or eax, edx ; accumulate matches
+set_extends: ; the set is more than 16 bytes
+ add edi, 16
+ movdqu xmm1, [edi] ; next part of set
+ pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+ movd edx, xmm0
+ jns set_loop
+ mov edi, [esp+16] ; restore set pointer
+ or eax, edx ; accumulate matches
+ jmp set_finished
+
+
+;******************************************************************************
+; strcspn function
+;******************************************************************************
+
+%if ALLOW_OVERRIDE
+global ?OVR_strcspn: function
+?OVR_strcspn:
+%endif
+
+_A_strcspn: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+ jmp near [strcspnDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE ; Position-independent code
+
+ call get_thunk_edx ; get reference point for position-independent code
+RP2: ; reference point edx = offset RP2
+
+; Make the following instruction with address relative to RP2:
+ jmp near [edx+strcspnDispatch-RP2]
+
+%ENDIF
+
+align 16
+_strcspnSSE42: ; SSE4.2 version
+ push esi
+ push edi
+ mov esi, [esp+12] ; str
+ mov edi, [esp+16] ; set
+ xor ecx, ecx ; span counter
+str_next2:
+ movdqu xmm2, [esi] ; str
+ movdqu xmm1, [edi] ; set
+ pcmpistrm xmm1, xmm2, 00110000b; find in set, invert valid bits, return bit mask in xmm0
+ movd eax, xmm0
+ jns set_extends2
+set_finished2:
+ cmp ax, -1
+ jne str_finished2
+ ; first 16 characters matched, continue with next 16 characters (a terminating zero would never match)
+ add esi, 16 ; next 16 bytes of str
+ add ecx, 16 ; count span
+ jmp str_next2
+
+str_finished2:
+ not eax
+ bsf eax, eax
+ add eax, ecx
+ pop edi
+ pop esi
+ ret
+
+set_loop2:
+ and eax, edx ; accumulate matches
+set_extends2: ; the set is more than 16 bytes
+ add edi, 16
+ movdqu xmm1, [edi] ; next part of set
+ pcmpistrm xmm1, xmm2, 00110000b; find in set, invert valid bits, return bit mask in xmm0
+ movd edx, xmm0
+ jns set_loop2
+ mov edi, [esp+16] ; restore set pointer
+ and eax, edx ; accumulate matches
+ jmp set_finished2
+
+
+;******************************************************************************
+; strspn function generic
+;******************************************************************************
+
+align 8
+_strspnGeneric: ; Generic version
+ push esi
+ push edi
+ mov esi, [esp+12] ; str pointer
+str_next10:
+ mov edi, [esp+16] ; set pointer
+ mov al, [esi] ; read one byte from str
+ test al, al
+ jz str_finished10 ; str finished
+set_next10:
+ mov dl, [edi]
+ test dl, dl
+ jz set_finished10
+ inc edi
+ cmp al, dl
+ jne set_next10
+ ; character match found, goto next character
+ inc esi
+ jmp str_next10
+
+str_finished10: ; end of str, all match
+set_finished10: ; end of set, mismatch found
+ sub esi, [esp+12] ; calculate position
+ mov eax, esi
+ pop edi
+ pop esi
+ ret
+;_strspnGeneric end
+
+align 8
+_strcspnGeneric: ; Generic version
+ push esi
+ push edi
+ mov esi, [esp+12] ; str pointer
+str_next20:
+ mov edi, [esp+16] ; set pointer
+ mov al, [esi] ; read one byte from str
+ test al, al
+ jz str_finished20 ; str finished
+set_next20:
+ mov dl, [edi]
+ test dl, dl
+ jz set_finished20
+ inc edi
+ cmp al, dl
+ jne set_next20
+ ; character match found, stop search
+ jmp str_finished20
+
+set_finished20: ; end of set, mismatch found
+ inc esi
+ jmp str_next20
+
+str_finished20: ; end of str, all match
+ sub esi, [esp+12] ; calculate position
+ mov eax, esi
+ pop edi
+ pop esi
+ ret
+;_strcspnGeneric end
+
+; ********************************************************************************
+
+%IFDEF POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+ mov edx, [esp]
+ ret
+%ENDIF
+
+; ********************************************************************************
+; CPU dispatching for strspn. This is executed only once
+; ********************************************************************************
+
+strspnCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ ; get supported instruction set
+ call _InstructionSet
+ ; Point to generic version of strspn
+ mov ecx, _strspnGeneric
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strspn
+ mov ecx, _strspnSSE42
+Q100: mov [strspnDispatch], ecx
+ ; Continue in appropriate version
+ jmp ecx
+
+%ELSE ; Position-independent version
+ ; get supported instruction set
+ call _InstructionSet
+ call get_thunk_edx
+RP11: ; reference point edx
+ ; Point to generic version
+ lea ecx, [edx+_strspnGeneric-RP11]
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strspn
+ lea ecx, [edx+_strspnSSE42-RP11]
+Q100: mov [edx+strspnDispatch-RP11], ecx
+ ; Continue in appropriate version
+ jmp ecx
+%ENDIF
+
+strcspnCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ ; get supported instruction set
+ call _InstructionSet
+ ; Point to generic version of strcspn
+ mov ecx, _strcspnGeneric
+ cmp eax, 10 ; check SSE4.2
+ jb Q200
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strcspn
+ mov ecx, _strcspnSSE42
+Q200: mov [strcspnDispatch], ecx
+ ; Continue in appropriate version
+ jmp ecx
+
+%ELSE ; Position-independent version
+ ; get supported instruction set
+ call _InstructionSet
+ call get_thunk_edx
+RP12: ; reference point edx
+ ; Point to generic version
+ lea ecx, [edx+_strcspnGeneric-RP12]
+ cmp eax, 10 ; check SSE4.2
+ jb Q200
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strcspn
+ lea ecx, [edx+_strcspnSSE42-RP12]
+Q200: mov [edx+strcspnDispatch-RP12], ecx
+ ; Continue in appropriate version
+ jmp ecx
+%ENDIF
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+strspnDispatch DD strspnCPUDispatch
+strcspnDispatch DD strcspnCPUDispatch
+
+%IFDEF POSITIONINDEPENDENT
+; Fix problem in Mac linker
+ DD 0,0,0,0
+%ENDIF
+
+
+SECTION .bss
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names .bss$zzz etc. under Windows to make sure it is placed
+; last, but the assembler gives sections with unknown names wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+ dq 0, 0
diff --git a/asmlibSrc/strspn64.asm b/asmlibSrc/strspn64.asm
new file mode 100755
index 0000000..60c0a4d
--- /dev/null
+++ b/asmlibSrc/strspn64.asm
@@ -0,0 +1,304 @@
+;************************* strspn64.asm ************************************
+; Author: Agner Fog
+; Date created: 2011-07-19
+; Last modified: 2011-07-19
+
+; Description:
+; Faster version of the standard strspn and strcspn functions:
+; size_t A_strspn (const char * str, const char * set);
+; size_t A_strcspn(const char * str, const char * set);
+;
+; A_strspn finds the length of the initial portion of str which consists only of
+; characters that are part of set.
+; A_strcspn finds the length of the initial portion of str which consists only of
+; characters that are not part of set.
+;
+; Note that these functions may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; Overriding standard functions strspn and strcspn:
+; Overriding is disabled because the functions may read beyond the end of a string,
+; while the standard strspn and strcspn functions are guaranteed to work in all cases.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for generic and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
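+;
+; Equivalent C logic for A_strspn (a sketch of the generic version below, using
+; the standard strchr for brevity; not the library code itself):
+;   size_t i = 0;
+;   while (str[i] != 0 && strchr(set, str[i]) != NULL) i++;   // stop at the first char not in set
+;   return i;
+;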
+default rel
+
+%define ALLOW_OVERRIDE 0 ; Set to one if override of standard function desired
+
+global A_strspn: function
+global A_strcspn: function
+
+; Direct entries to CPU-specific versions
+global strspnGeneric: function
+global strcspnGeneric: function
+global strspnSSE42: function
+global strcspnSSE42: function
+
+; Imported from instrset64.asm:
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+section .text
+
+;******************************************************************************
+; strspn function
+;******************************************************************************
+
+%if ALLOW_OVERRIDE
+global ?OVR_strspn: function
+?OVR_strspn:
+%endif
+
+align 16
+A_strspn: ; function dispatching
+ jmp near [strspnDispatch] ; Go to appropriate version, depending on instruction set
+
+strspnSSE42: ; SSE4.2 version
+%ifdef WINDOWS
+ push rdi
+ push rsi
+ mov rdi, rcx ; str
+ mov rsi, rdx ; set
+%endif
+ mov r8, rsi
+ xor ecx, ecx ; span counter
+str_next:
+ movdqu xmm2, [rdi] ; str
+ movdqu xmm1, [rsi] ; set
+ pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+ movd eax, xmm0
+ jns set_extends
+set_finished:
+ cmp ax, -1
+ jne str_finished
+ ; first 16 characters matched, continue with next 16 characters (a terminating zero would never match)
+ add rdi, 16 ; next 16 bytes of str
+ add rcx, 16 ; count span
+ jmp str_next
+
+str_finished:
+ not eax
+ bsf eax, eax
+ add rax, rcx
+%ifdef WINDOWS
+ pop rsi
+ pop rdi
+%endif
+ ret
+
+set_loop:
+ or eax, edx ; accumulate matches
+set_extends: ; the set is more than 16 bytes
+ add rsi, 16
+ movdqu xmm1, [rsi] ; next part of set
+ pcmpistrm xmm1, xmm2, 00000000b; find in set, return bit mask in xmm0
+ movd edx, xmm0
+ jns set_loop
+ mov rsi, r8 ; restore set pointer
+ or eax, edx ; accumulate matches
+ jmp set_finished
+
+
+;******************************************************************************
+; strcspn function
+;******************************************************************************
+
+%if ALLOW_OVERRIDE
+global ?OVR_strcspn: function
+?OVR_strcspn:
+%endif
+
+align 16
+A_strcspn: ; function dispatching
+ jmp near [strcspnDispatch] ; Go to appropriate version, depending on instruction set
+
+strcspnSSE42: ; SSE4.2 version
+%ifdef WINDOWS
+ push rdi
+ push rsi
+ mov rdi, rcx ; str
+ mov rsi, rdx ; set
+%endif
+ mov r8, rsi
+ xor ecx, ecx ; span counter
+str_next2:
+ movdqu xmm2, [rdi] ; str
+ movdqu xmm1, [rsi] ; set
+ pcmpistrm xmm1, xmm2, 00110000b; find in set, invert valid bits, return bit mask in xmm0
+ movd eax, xmm0
+ jns set_extends2
+set_finished2:
+ cmp ax, -1
+ jne str_finished2
+ ; first 16 characters matched, continue with next 16 characters (a terminating zero would never match)
+ add rdi, 16 ; next 16 bytes of str
+ add rcx, 16 ; count span
+ jmp str_next2
+
+str_finished2:
+ not eax
+ bsf eax, eax
+ add rax, rcx
+%ifdef WINDOWS
+ pop rsi
+ pop rdi
+%endif
+ ret
+
+set_loop2:
+ and eax, edx ; accumulate matches
+set_extends2: ; the set is more than 16 bytes
+ add rsi, 16
+ movdqu xmm1, [rsi] ; next part of set
+ pcmpistrm xmm1, xmm2, 00110000b; find in set, invert valid bits, return bit mask in xmm0
+ movd edx, xmm0
+ jns set_loop2
+ mov rsi, r8 ; restore set pointer
+ and eax, edx ; accumulate matches
+ jmp set_finished2
+
+
+;******************************************************************************
+; strspn function generic
+;******************************************************************************
+
+align 8
+strspnGeneric: ; Generic version
+%ifdef WINDOWS
+ push rdi
+ push rsi
+ mov rdi, rcx ; str
+ mov rsi, rdx ; set
+%endif
+ mov r8, rsi
+ mov r9, rdi
+
+str_next10:
+ mov al, [rdi] ; read one byte from str
+ test al, al
+ jz str_finished10 ; str finished
+set_next10:
+ mov dl, [rsi]
+ test dl, dl
+ jz set_finished10
+ inc rsi
+ cmp al, dl
+ jne set_next10
+ ; character match found, goto next character
+ inc rdi
+ mov rsi, r8 ; set pointer
+ jmp str_next10
+
+str_finished10: ; end of str, all match
+set_finished10: ; end of set, mismatch found
+ sub rdi, r9 ; calculate position
+ mov rax, rdi
+%ifdef WINDOWS
+ pop rsi
+ pop rdi
+%endif
+ ret
+;strspnGeneric end
+
+align 8
+strcspnGeneric: ; Generic version
+%ifdef WINDOWS
+ push rdi
+ push rsi
+ mov rdi, rcx ; str
+ mov rsi, rdx ; set
+%endif
+ mov r8, rsi
+ mov r9, rdi
+str_next20:
+ mov al, [rdi] ; read one byte from str
+ test al, al
+ jz str_finished20 ; str finished
+set_next20:
+ mov dl, [rsi]
+ test dl, dl
+ jz set_finished20
+ inc rsi
+ cmp al, dl
+ jne set_next20
+ ; character match found, stop search
+ jmp str_finished20
+
+set_finished20: ; end of set, mismatch found
+ inc rdi
+ mov rsi, r8 ; set pointer
+ jmp str_next20
+
+str_finished20: ; end of str, all match
+ sub rdi, r9 ; calculate position
+ mov rax, rdi
+%ifdef WINDOWS
+ pop rsi
+ pop rdi
+%endif
+ ret
+;strcspnGeneric end
+
+
+; ********************************************************************************
+; CPU dispatching for strspn. This is executed only once
+; ********************************************************************************
+
+%ifdef WINDOWS
+%define par1 rcx
+%define par2 rdx
+%else ; UNIX
+%define par1 rdi
+%define par2 rsi
+%endif
+
+strspnCPUDispatch:
+ ; get supported instruction set
+ push par1
+ push par2
+ call InstructionSet
+ pop par2
+ pop par1
+ ; Point to generic version of strspn
+ lea r8, [strspnGeneric]
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strspn
+ lea r8, [strspnSSE42]
+Q100: mov [strspnDispatch], r8
+ ; Continue in appropriate version
+ jmp r8
+
+
+strcspnCPUDispatch:
+ ; get supported instruction set
+ push par1
+ push par2
+ call InstructionSet
+ pop par2
+ pop par1
+ ; Point to generic version of strcspn
+ lea r8, [strcspnGeneric]
+ cmp eax, 10 ; check SSE4.2
+ jb Q200
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strcspn
+ lea r8, [strcspnSSE42]
+Q200: mov [strcspnDispatch], r8
+ ; Continue in appropriate version
+ jmp r8
+
+SECTION .data
+
+; Pointer to appropriate versions. Initially point to dispatcher
+strspnDispatch DQ strspnCPUDispatch
+strcspnDispatch DQ strcspnCPUDispatch
+
+SECTION .bss
+dq 0, 0
diff --git a/asmlibSrc/strstr32.asm b/asmlibSrc/strstr32.asm
new file mode 100755
index 0000000..8ee6450
--- /dev/null
+++ b/asmlibSrc/strstr32.asm
@@ -0,0 +1,251 @@
+;************************* strstr32.asm ************************************
+; Author: Agner Fog
+; Date created: 2011-07-14
+; Last modified: 2011-08-21
+
+; Description:
+; Faster version of the standard strstr function:
+; char * A_strstr(char * haystack, const char * needle);
+; Searches for substring needle in string haystack. Return value is pointer to
+; first occurrence of needle, or NULL if not found. The strings must be zero-terminated.
+;
+; Note that this function may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment. Avoiding this would be complicated
+; and make the function much slower: for every unaligned 16-byte read we would have to
+; check if it crosses a page boundary (4 kbytes), and if so check if the string ends
+; before the page boundary. Only if the string does not end before the page boundary
+; can we read into the next memory page.
+;
+; Overriding standard function strstr:
+; The alias ?OVR_strstr is changed to _strstr in the object file if
+; it is desired to override the standard library function strstr.
+; Overriding is disabled because the function may read beyond the end of a
+; string, while the standard strstr function is guaranteed to work in all cases.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
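+;
+; Usage sketch (illustrative, assuming only the prototype above):
+;   char * p = A_strstr((char *)"find the needle in the haystack", "needle");
+;   // p points at "needle in the haystack"; a NULL result would mean no match
+;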
+%define ALLOW_OVERRIDE 0 ; Set to one if override of standard function desired
+
+global _A_strstr: function ; Function A_strstr
+
+; Direct entries to CPU-specific versions
+global _strstrGeneric: function ; Generic version for processors without SSE4.2
+global _strstrSSE42: function ; Version for processors with SSE4.2
+
+; Imported from instrset32.asm:
+extern _InstructionSet ; Instruction set for CPU dispatcher
+
+section .text
+
+; strstr function
+
+%if ALLOW_OVERRIDE
+global ?OVR_strstr: function
+?OVR_strstr:
+%endif
+
+_A_strstr: ; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+ jmp near [strstrDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE ; Position-independent code
+
+ call get_thunk_edx ; get reference point for position-independent code
+RP: ; reference point edx = offset RP
+
+; Make the following instruction with address relative to RP:
+ jmp dword [edx+strstrDispatch-RP]
+
+%ENDIF
+
+align 16
+_strstrSSE42: ; SSE4.2 version
+ push ebx
+ push esi
+ mov esi, [esp+12] ; haystack
+ mov eax, [esp+16] ; needle
+ movdqu xmm1, [eax] ; needle
+
+align 8
+haystacknext:
+ ; [esi] = haystack
+ pcmpistrm xmm1, [esi], 00001100b ; unsigned byte search, equal ordered, return mask in xmm0
+ jc matchbegin ; found beginning of a match
+ jz nomatch ; end of haystack found, no match
+ add esi, 16
+ jmp haystacknext
+
+matchbegin:
+ jz foundshort ; haystack ends here, a short match is found
+ movd eax, xmm0 ; bit mask of possible matches
+nextindexbit:
+ bsf ecx, eax ; index of first bit in mask of possible matches
+
+ ; compare strings for full match
+ lea ebx, [esi+ecx] ; haystack + index
+ mov edx, [esp+16] ; needle
+
+compareloop: ; compare loop for long match
+ movdqu xmm2, [edx] ; paragraph of needle
+ pcmpistrm xmm2, [ebx], 00001100B ; unsigned bytes, equal ordered, modifies xmm0
+ ; (can't use "equal each, masked" because it inverts when past end of needle, but not when past end of both)
+
+ jno longmatchfail ; difference found after extending partial match
+ js longmatchsuccess ; end of needle found, and no difference
+ add edx, 16
+ add ebx, 16
+ jmp compareloop ; loop to next 16 bytes
+
+longmatchfail:
+ ; remove index bit of first partial match
+ btr eax, ecx
+ test eax, eax
+ jnz nextindexbit ; mask contains more index bits, loop to next bit in eax mask
+ ; mask exhausted for possible matches, continue to next haystack paragraph
+ add esi, 16
+ jmp haystacknext ; loop to next paragraph of haystack
+
+longmatchsuccess: ; match found over more than one paragraph
+ lea eax, [esi+ecx] ; haystack + index to begin of long match
+ pop esi
+ pop ebx
+ ret
+
+foundshort: ; match found within single paragraph
+ movd eax, xmm0 ; bit mask of matches
+ bsf eax, eax ; index of first match
+ add eax, esi ; pointer to first match
+ pop esi
+ pop ebx
+ ret
+
+nomatch: ; needle not found, return 0
+ xor eax, eax
+ pop esi
+ pop ebx
+ ret
+
+;_strstrSSE42: endp
+
+
+align 16
+_strstrGeneric: ; generic version
+ push esi
+ push edi
+ mov esi, [esp+12] ; haystack
+ mov edi, [esp+16] ; needle
+
+ mov ax, [edi]
+ test al, al
+ jz _Found ; a zero-length needle is always found
+ test ah, ah
+ jz _SingleCharNeedle
+
+_SearchLoop: ; search for first character match
+ mov cl, [esi]
+ test cl, cl
+ jz _NotFound ; end of haystack reached without finding
+ cmp al, cl
+ je _FirstCharMatch ; first character match
+_IncompleteMatch:
+ inc esi
+ jmp _SearchLoop ; loop through haystack
+
+_FirstCharMatch:
+ mov ecx, esi ; begin of match position
+_MatchLoop:
+ inc ecx
+ inc edi
+ mov al, [edi]
+ test al, al
+ jz _Found ; end of needle. match ok
+ cmp al, [ecx]
+ je _MatchLoop
+ ; match failed, recover and continue
+ mov edi, [esp+16] ; needle
+ mov al, [edi]
+ jmp _IncompleteMatch
+
+_NotFound: ; needle not found. return 0
+ xor eax, eax
+ pop edi
+ pop esi
+ ret
+
+_Found: ; needle found. return pointer to position in haystack
+ mov eax, esi
+ pop edi
+ pop esi
+ ret
+
+_SingleCharNeedle: ; Needle is a single character
+ movzx ecx, byte [esi]
+ test cl, cl
+ jz _NotFound ; end of haystack reached without finding
+ cmp al, cl
+ je _Found
+ inc esi
+ jmp _SingleCharNeedle ; loop through haystack
+
+
+%IFDEF POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+ mov edx, [esp]
+ ret
+%ENDIF
+
+; CPU dispatching for strstr. This is executed only once
+strstrCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ ; get supported instruction set
+ call _InstructionSet
+ ; Point to generic version of strstr
+ mov ecx, _strstrGeneric
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strstr
+ mov ecx, _strstrSSE42
+Q100: mov [strstrDispatch], ecx
+ ; Continue in appropriate version of strstr
+ jmp ecx
+
+%ELSE ; Position-independent version
+ ; get supported instruction set
+ call _InstructionSet
+ call get_thunk_edx
+RP2: ; reference point edx
+ ; Point to generic version of strstr
+ lea ecx, [edx+_strstrGeneric-RP2]
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strstr
+ lea ecx, [edx+_strstrSSE42-RP2]
+Q100: mov [edx+strstrDispatch-RP2], ecx
+ ; Continue in appropriate version of strstr
+ jmp ecx
+%ENDIF
+
+SECTION .data
+
+; Pointer to appropriate version. Initially points to dispatcher
+strstrDispatch DD strstrCPUDispatch
+%IFDEF POSITIONINDEPENDENT
+; Fix potential problem in Mac linker
+ DD 0, 0
+%ENDIF
+
+SECTION .bss
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names .bss$zzz etc. under Windows to make it is placed
+; last, but the assembler gives sections with unknown names wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+ dq 0, 0
diff --git a/asmlibSrc/strstr64.asm b/asmlibSrc/strstr64.asm
new file mode 100755
index 0000000..d4cc1e1
--- /dev/null
+++ b/asmlibSrc/strstr64.asm
@@ -0,0 +1,218 @@
+;************************* strstr64.asm ************************************
+; Author: Agner Fog
+; Date created: 2011-07-14
+; Last modified: 2011-07-14
+
+; Description:
+; Faster version of the standard strstr function:
+; char * A_strstr(char * haystack, const char * needle);
+; Searches for substring needle in string haystack. Return value is pointer to
+; first occurrence of needle, or NULL if not found. The strings must be zero-terminated.
+;
+; Note that this function may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment. Avoiding this would be complicated
+; and make the function much slower: for every unaligned 16-byte read we would have to
+; check if it crosses a page boundary (4 kbytes), and if so check if the string ends
+; before the page boundary. Only if the string does not end before the page boundary
+; can we read into the next memory page.
+;
+; Overriding standard function strstr:
+; The alias ?OVR_strstr is changed to _strstr in the object file if
+; it is desired to override the standard library function strstr.
+; Overriding is disabled because the function may read beyond the end of a
+; string, while the standard strstr function is guaranteed to work in all cases.
+;
+; Position-independent code is generated if POSITIONINDEPENDENT is defined.
+;
+; CPU dispatching included for generic and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
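+;
+; Equivalent C logic for the generic fallback below (sketch only, not the library code):
+;   if (*needle == 0) return (char *)haystack;   // an empty needle matches at the start
+;   for (const char * h = haystack; *h; h++) {
+;       const char * n = needle, * p = h;
+;       while (*n && *n == *p) { n++; p++; }
+;       if (*n == 0) return (char *)h;           // whole needle matched
+;   }
+;   return NULL;
+;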
+default rel
+
+%define ALLOW_OVERRIDE 0 ; Set to one if override of standard function desired
+
+global A_strstr: function ; Function A_strstr
+
+; Direct entries to CPU-specific versions
+global strstrGeneric: function ; Generic version for processors without SSE4.2
+global strstrSSE42: function ; Version for processors with SSE4.2
+
+; Imported from instrset64.asm:
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+section .text
+
+; strstr function
+
+%if ALLOW_OVERRIDE
+global ?OVR_strstr: function
+?OVR_strstr:
+%endif
+
+A_strstr: ; function dispatching
+ jmp near [strstrDispatch] ; Go to appropriate version, depending on instruction set
+
+; define register use
+%ifdef WINDOWS
+%define par1 rcx ; parameter 1, pointer to haystack
+%define par2 rdx ; parameter 2, pointer to needle
+%define bitindex r8d ; bit index in eax mask
+%define bitindexr r8 ; bit index in eax mask
+%define phay r9 ; pointer to match in haystack
+%define pnee r10 ; pointer to match in needle
+%define tempb r8b ; temporary byte
+%else
+%define par1 rdi ; parameter 1, pointer to haystack
+%define par2 rsi ; parameter 2, pointer to needle
+%define bitindex ecx ; bit index in eax mask
+%define bitindexr rcx ; bit index in eax mask
+%define phay r9 ; pointer to match in haystack
+%define pnee rdx ; pointer to match in needle
+%define tempb cl ; temporary byte
+%endif
+
+align 16
+strstrSSE42: ; SSE4.2 version
+ movdqu xmm1, [par2] ; needle
+
+;align 8
+haystacknext:
+ ; [par1] = haystack
+ pcmpistrm xmm1, [par1], 00001100b ; unsigned byte search, equal ordered, return mask in xmm0
+ jc matchbegin ; found beginning of a match
+ jz nomatch ; end of haystack found, no match
+ add par1, 16
+ jmp haystacknext
+
+matchbegin:
+ jz foundshort ; haystack ends here, a short match is found
+ movd eax, xmm0 ; bit mask of possible matches
+nextindexbit:
+ bsf bitindex, eax ; index of first bit in mask of possible matches
+
+ ; compare strings for full match
+ lea phay, [par1+bitindexr] ; haystack + index
+ mov pnee, par2 ; needle
+
+compareloop: ; compare loop for long match
+ movdqu xmm2, [pnee] ; paragraph of needle
+ pcmpistrm xmm2, [phay], 00001100B ; unsigned bytes, equal ordered, modifies xmm0
+ ; (can't use "equal each, masked" because it inverts when past end of needle, but not when past end of both)
+
+ jno longmatchfail ; difference found after extending partial match
+ js longmatchsuccess ; end of needle found, and no difference
+ add pnee, 16
+ add phay, 16
+ jmp compareloop ; loop to next 16 bytes
+
+longmatchfail:
+ ; remove index bit of first partial match
+ btr eax, bitindex
+ test eax, eax
+ jnz nextindexbit ; mask contains more index bits, loop to next bit in eax mask
+ ; mask exhausted for possible matches, continue to next haystack paragraph
+ add par1, 16
+ jmp haystacknext ; loop to next paragraph of haystack
+
+longmatchsuccess: ; match found over more than one paragraph
+ lea rax, [par1+bitindexr] ; haystack + index to begin of long match
+ ret
+
+foundshort: ; match found within single paragraph
+ movd eax, xmm0 ; bit mask of matches
+ bsf eax, eax ; index of first match
+ add rax, par1 ; pointer to first match
+ ret
+
+nomatch: ; needle not found, return 0
+ xor eax, eax
+ ret
+
+;strstrSSE42: endp
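
The structure of the SSE4.2 loop above can be restated in scalar terms: each 16-byte "paragraph" of the haystack yields a mask of candidate start positions, and each candidate is then verified against the whole needle. The sketch below only illustrates that structure and is not the asmlib code; it simplifies the "equal ordered" comparison to a first-character test and, unlike the vector code, never reads past the terminator.

    #include <stdint.h>
    #include <string.h>

    static const char *strstr_block_sketch(const char *hay, const char *nee)
    {
        size_t nlen = strlen(nee);
        if (nlen == 0) return hay;                       // empty needle matches at the start
        for (;; hay += 16) {
            uint32_t mask = 0;
            int ended = 0;
            for (int i = 0; i < 16; i++) {               // scan one paragraph of haystack
                if (hay[i] == '\0') { ended = 1; break; }
                if (hay[i] == nee[0]) mask |= 1u << i;   // candidate start position
            }
            for (int i = 0; i < 16; i++) {               // verify candidates left to right
                if ((mask & (1u << i)) == 0) continue;
                if (strncmp(hay + i, nee, nlen) == 0)
                    return hay + i;                      // full match confirmed
            }
            if (ended) return NULL;                      // terminator seen, needle not found
        }
    }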
+
+
+align 16
+strstrGeneric: ; generic version
+
+ mov ax, [par2]
+ test al, al
+ jz _Found ; a zero-length needle is always found
+ test ah, ah
+ jz _SingleCharNeedle
+
+_SearchLoop: ; search for first character match
+ mov tempb, [par1]
+ test tempb, tempb
+ jz _NotFound ; end of haystack reached without finding
+ cmp al, tempb
+ je _FirstCharMatch ; first character match
+_IncompleteMatch:
+ inc par1
+ jmp _SearchLoop ; loop through haystack
+
+_FirstCharMatch:
+ mov phay, par1 ; begin of match position
+ mov pnee, par2
+_MatchLoop:
+ inc phay
+ inc pnee
+ mov al, [pnee]
+ test al, al
+ jz _Found ; end of needle. match ok
+ cmp al, [phay]
+ je _MatchLoop
+ ; match failed, recover and continue
+ mov al, [par2]
+ jmp _IncompleteMatch
+
+_NotFound: ; needle not found. return 0
+ xor eax, eax
+ ret
+
+_Found: ; needle found. return pointer to position in haystack
+ mov rax, par1
+ ret
+
+_SingleCharNeedle: ; Needle is a single character
+ mov tempb, byte [par1]
+ test tempb, tempb
+ jz _NotFound ; end of haystack reached without finding
+ cmp al, tempb
+ je _Found
+ inc par1
+ jmp _SingleCharNeedle ; loop through haystack
+
+
+; CPU dispatching for strstr. This is executed only once
+strstrCPUDispatch:
+ ; get supported instruction set
+ push par1
+ push par2
+ call InstructionSet
+ pop par2
+ pop par1
+ ; Point to generic version of strstr
+ lea r9, [strstrGeneric]
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version of strstr
+ lea r9, [strstrSSE42]
+Q100: mov [strstrDispatch], r9
+ ; Continue in appropriate version of strstr
+ jmp r9
+
+SECTION .data
+
+; Pointer to appropriate version. Initially points to dispatcher
+strstrDispatch DQ strstrCPUDispatch
+
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names like .bss$zzz under Windows to make sure it is placed
+; last, but the assembler gives sections with unknown names the wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+SECTION .bss
+ dq 0, 0
diff --git a/asmlibSrc/strtouplow32.asm b/asmlibSrc/strtouplow32.asm
new file mode 100755
index 0000000..c0aacf2
--- /dev/null
+++ b/asmlibSrc/strtouplow32.asm
@@ -0,0 +1,285 @@
+;************************* strtouplow32.asm ************************************
+; Author: Agner Fog
+; Date created: 2011-07-17
+; Last modified: 2013-09-11
+
+; Description:
+; A_strtolower converts a string to lower case
+; A_strtoupper converts a string to upper case
+; Only characters a-z or A-Z are converted; other characters are ignored.
+; The functions save time by ignoring locale-specific characters and UTF-8
+; characters, so they don't have to look up each character in a table.
+;
+; Function prototypes:
+; extern "C" void A_strtolower(char * string);
+; extern "C" void A_strtoupper(char * string);
+;
+; Note that these functions may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; CPU dispatching included for 386 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
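
For reference, a minimal usage example (C++; it assumes these two prototypes are declared in the asmlib.h header that the test programs later in this patch include, as testalib.cpp suggests). The buffer is larger than the text, which comfortably covers the up-to-15-byte over-read noted above.

    #include <cstdio>
    #include <cstring>
    #include "asmlib.h"

    int main() {
        char buf[64];
        std::strcpy(buf, "Hello, WORLD 123");
        A_strtolower(buf);                  // "hello, world 123" - only A-Z are changed
        std::printf("%s\n", buf);
        A_strtoupper(buf);                  // "HELLO, WORLD 123" - only a-z are changed
        std::printf("%s\n", buf);
        return 0;
    }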
+
+section .data
+align 16
+
+azlow: db 'azazazazazazazaz' ; define range for lower case
+azhigh: db 'AZAZAZAZAZAZAZAZ' ; define range for upper case
+casebit: times 16 db 20h ; bit to change when changing case
+
+section .text
+
+global _A_strtolower: function
+global _A_strtoupper: function
+global _strtolowerGeneric: function
+global _strtoupperGeneric: function
+global _strtolowerSSE42: function
+global _strtoupperSSE42: function
+
+; Imported from instrset32.asm:
+extern _InstructionSet ; Instruction set for CPU dispatcher
+
+; function dispatching
+
+%IFNDEF POSITIONINDEPENDENT
+_A_strtolower:
+ jmp near [strtolowerDispatch] ; Go to appropriate version, depending on instruction set
+
+_A_strtoupper:
+ jmp near [strtoupperDispatch] ; Go to appropriate version, depending on instruction set
+
+%ELSE ; Position-independent code
+
+_A_strtolower:
+ call get_thunk_edx ; get reference point for position-independent code
+RP1: ; reference point edx = offset RP1
+; Make the following instruction with address relative to RP1:
+ jmp dword [edx+strtolowerDispatch-RP1]
+
+_A_strtoupper:
+ call get_thunk_edx ; get reference point for position-independent code
+RP2: ; reference point edx = offset RP2
+; Make the following instruction with address relative to RP2:
+ jmp dword [edx+strtoupperDispatch-RP2]
+
+%ENDIF
+
+
+_strtoupperSSE42:
+%IFNDEF POSITIONINDEPENDENT
+ movdqa xmm1, [azlow] ; define range a-z
+ movdqa xmm3, [casebit] ; bit to change
+%ELSE
+ call get_thunk_edx ; get reference point for position-independent code
+RP11:
+ movdqa xmm1, [edx+azlow-RP11] ; same, with relative address
+ movdqa xmm3, [edx+casebit-RP11]
+%ENDIF
+ jmp strupperlower
+
+_strtolowerSSE42:
+%IFNDEF POSITIONINDEPENDENT
+ movdqa xmm1, [azhigh] ; define range A-Z
+ movdqa xmm3, [casebit] ; bit to change
+%ELSE
+ call get_thunk_edx ; get reference point for position-independent code
+RP12:
+ movdqa xmm1, [edx+azhigh-RP12]; same, with relative address
+ movdqa xmm3, [edx+casebit-RP12]
+%ENDIF
+
+
+strupperlower:
+ ; common code for strtoupper and strtolower
+ mov edx, [esp+4] ; string
+next: ; loop
+ movdqu xmm2, [edx] ; read 16 bytes from string
+ pcmpistrm xmm1, xmm2, 01000100b; find bytes in range A-Z or a-z, return mask in xmm0
+ jz last ; string ends in this paragraph
+ pand xmm0, xmm3 ; mask AND case bit
+ pxor xmm2, xmm0 ; change case bit in masked bytes of string
+ movdqu [edx], xmm2 ; write changed value
+ add edx, 16
+ jmp next ; next 16 bytes
+
+last: ; Write last 0-15 bytes
+ ; While we can read past the end of the string if precautions are made, we cannot write
+ ; past the end of the string, even if the value is unchanged, because the value may have
+ ; been changed in the meantime by another thread
+ jnc finish ; nothing changed, no need to write
+ pand xmm3, xmm0 ; mask and case bit
+ pxor xmm2, xmm3 ; change case bit
+
+%if 0 ; Method with maskmovdqu is elegant, but slow because maskmovdqu uses nontemporal (uncached) write
+ push edi
+ mov edi, edx
+ maskmovdqu xmm2, xmm0
+ pop edi
+finish: ret
+
+%else ; less elegant alternative, but probably faster if data needed again soon
+ ; write 8-4-2-1 bytes, if necessary
+ pmovmskb eax, xmm0 ; create bit mask
+ cmp eax, 10000000b
+ jb L10
+ ; there are at least 8 bytes to write
+ movq [edx], xmm2
+ psrldq xmm2, 8
+ add edx, 8
+ shr eax, 8
+L10: cmp eax, 1000b
+ jb L20
+ ; there are at least 4 bytes to write
+ movd [edx], xmm2
+ psrldq xmm2, 4
+ add edx, 4
+ shr eax, 4
+L20: movd ecx, xmm2 ; use ecx for last 3 bytes
+ cmp eax, 10b
+ jb L30
+ ; there are at least 2 bytes to write
+ mov [edx], cx
+ shr ecx, 16
+ add edx, 2
+ shr eax, 2
+L30: cmp eax, 1
+ jb finish
+ ; there is one more byte to write
+ mov [edx], cl
+finish: ret
+%endif
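
The same case flip can be written with SSE4.2 intrinsics. The sketch below (compile with SSE4.2 enabled, e.g. -msse4.2) mirrors the vector loop above for A_strtoupper but is not the asmlib implementation: to stay short it handles the final partial block byte by byte instead of with the 8-4-2-1 masked stores used above, which preserves the rule that nothing is written past the end of the string. Like the assembly, its 16-byte loads may read past the terminator.

    #include <nmmintrin.h>      // SSE4.2 string intrinsics

    // unsigned bytes, range compare against "azaz...", byte mask in the result
    #define CASE_MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK)

    static void strtoupper_sse42_sketch(char *s)
    {
        const __m128i range   = _mm_setr_epi8('a','z','a','z','a','z','a','z',
                                              'a','z','a','z','a','z','a','z');
        const __m128i casebit = _mm_set1_epi8(0x20);

        for (;; s += 16) {
            __m128i chunk = _mm_loadu_si128((const __m128i *)s);
            if (_mm_cmpistrz(range, chunk, CASE_MODE)) {          // terminator in this block
                for (; *s; s++)                                   // finish the tail byte by byte
                    if (*s >= 'a' && *s <= 'z') *s ^= 0x20;
                return;
            }
            __m128i mask = _mm_cmpistrm(range, chunk, CASE_MODE); // 0xFF where byte is in a-z
            chunk = _mm_xor_si128(chunk, _mm_and_si128(mask, casebit));
            _mm_storeu_si128((__m128i *)s, chunk);
        }
    }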
+
+; 386 version
+_strtolowerGeneric:
+ mov edx, [esp+4] ; string
+A100: ; loop
+ mov al, [edx]
+ test al, al
+ jz A900 ; end of string
+ sub al, 'A'
+ cmp al, 'Z' - 'A'
+ jbe A200 ; is upper case
+ inc edx
+ jmp A100 ; loop to next character
+A200: ; convert to lower case
+ add al, 'a'
+ mov [edx], al
+ inc edx
+ jmp A100
+A900: ret
+;_strtolowerGeneric end
+
+_strtoupperGeneric:
+ mov edx, [esp+4] ; string
+B100: ; loop
+ mov al, [edx]
+ test al, al
+ jz B900 ; end of string
+ sub al, 'a'
+ cmp al, 'z' - 'a'
+ jbe B200 ; is lower case
+ inc edx
+ jmp B100 ; loop to next character
+B200: ; convert to upper case
+ add al, 'A'
+ mov [edx], al
+ inc edx
+ jmp B100
+B900: ret
+;_strtoupperGeneric end
+
+%IFDEF POSITIONINDEPENDENT
+get_thunk_edx: ; load caller address into edx for position-independent code
+ mov edx, [esp]
+ ret
+%ENDIF
+
+; CPU dispatching for strtolower. This is executed only once
+strtolowerCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ ; get supported instruction set
+ call _InstructionSet
+ ; Point to generic version
+ mov ecx, _strtolowerGeneric
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version
+ mov ecx, _strtolowerSSE42
+Q100: mov [strtolowerDispatch], ecx
+ ; Continue in appropriate version
+ jmp ecx
+
+%ELSE ; Position-independent version
+ ; get supported instruction set
+ call _InstructionSet
+ call get_thunk_edx
+RP21: ; reference point edx
+ ; Point to generic version
+ lea ecx, [edx+_strtolowerGeneric-RP21]
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version
+ lea ecx, [edx+_strtolowerSSE42-RP21]
+Q100: mov [edx+strtolowerDispatch-RP21], ecx
+ ; Continue in appropriate version
+ jmp ecx
+%ENDIF
+
+; CPU dispatching for strtoupper. This is executed only once
+strtoupperCPUDispatch:
+%IFNDEF POSITIONINDEPENDENT
+ ; get supported instruction set
+ call _InstructionSet
+ ; Point to generic version
+ mov ecx, _strtoupperGeneric
+ cmp eax, 10 ; check SSE4.2
+ jb Q200
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version
+ mov ecx, _strtoupperSSE42
+Q200: mov [strtoupperDispatch], ecx
+ ; Continue in appropriate version
+ jmp ecx
+
+%ELSE ; Position-independent version
+ ; get supported instruction set
+ call _InstructionSet
+ call get_thunk_edx
+RP22: ; reference point edx
+ ; Point to generic version
+ lea ecx, [edx+_strtoupperGeneric-RP22]
+ cmp eax, 10 ; check SSE4.2
+ jb Q200
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version
+ lea ecx, [edx+_strtoupperSSE42-RP22]
+Q200: mov [edx+strtoupperDispatch-RP22], ecx
+ ; Continue in appropriate version
+ jmp ecx
+%ENDIF
+
+
+SECTION .data
+
+; Pointer to appropriate version. Initially points to dispatcher
+strtolowerDispatch DD strtolowerCPUDispatch
+strtoupperDispatch DD strtoupperCPUDispatch
+
+%IFDEF POSITIONINDEPENDENT
+; Fix problem in Mac linker
+ DD 0,0,0,0
+%ENDIF
+
+
+SECTION .bss
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names like .bss$zzz under Windows to make sure it is placed
+; last, but the assembler gives sections with unknown names the wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+ dq 0, 0
diff --git a/asmlibSrc/strtouplow64.asm b/asmlibSrc/strtouplow64.asm
new file mode 100755
index 0000000..9ead7db
--- /dev/null
+++ b/asmlibSrc/strtouplow64.asm
@@ -0,0 +1,213 @@
+;************************* strtouplow64.asm ************************************
+; Author: Agner Fog
+; Date created: 2011-07-17
+; Last modified: 2013-09-11
+
+; Description:
+; A_strtolower converts a string to lower case
+; A_strtoupper converts a string to upper case
+; Only characters a-z or A-Z are converted; other characters are ignored.
+; The functions save time by ignoring locale-specific characters and UTF-8
+; characters, so they don't have to look up each character in a table.
+;
+; Function prototypes:
+; extern "C" void A_strtolower(char * string);
+; extern "C" void A_strtoupper(char * string);
+;
+; Note that these functions may read up to 15 bytes beyond the end of the strings.
+; This is rarely a problem but it can in principle generate a protection violation
+; if a string is placed at the end of the data segment.
+;
+; CPU dispatching included for SSE2 and SSE4.2 instruction sets.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+
+default rel
+
+section .data
+align 16
+
+azlow: db 'azazazazazazazaz' ; define range for lower case
+azhigh: db 'AZAZAZAZAZAZAZAZ' ; define range for upper case
+casebit: times 16 db 20h ; bit to change when changing case
+
+%ifdef WINDOWS
+%define par1 rcx ; register for parameter 1
+%else ; UNIX
+%define par1 rdi
+%endif
+
+section .text
+
+global A_strtolower: function
+global A_strtoupper: function
+global strtolowerGeneric: function
+global strtoupperGeneric: function
+global strtolowerSSE42: function
+global strtoupperSSE42: function
+
+; Imported from instrset64.asm:
+extern InstructionSet ; Instruction set for CPU dispatcher
+
+; function dispatching
+
+A_strtolower:
+ jmp near [strtolowerDispatch] ; Go to appropriate version, depending on instruction set
+
+A_strtoupper:
+ jmp near [strtoupperDispatch] ; Go to appropriate version, depending on instruction set
+
+
+; SSE4.2 version
+strtoupperSSE42:
+ movdqa xmm1, [azlow] ; define range a-z
+ jmp strupperlower
+strtolowerSSE42:
+ movdqa xmm1, [azhigh] ; define range A-Z
+strupperlower:
+ ; common code for strtoupper and strtolower
+ movdqa xmm3, [casebit] ; bit to change
+next: ; loop
+ movdqu xmm2, [par1] ; read 16 bytes from string
+ pcmpistrm xmm1, xmm2, 01000100b; find bytes in range A-Z or a-z, return mask in xmm0
+ jz last ; string ends in this paragraph
+ pand xmm0, xmm3 ; mask AND case bit
+ pxor xmm2, xmm0 ; change case bit in masked bytes of string
+ movdqu [par1], xmm2 ; write changed value
+ add par1, 16
+ jmp next ; next 16 bytes
+
+last: ; Write last 0-15 bytes
+ ; While we can read past the end of the string if precautions are made, we cannot write
+ ; past the end of the string, even if the value is unchanged, because the value may have
+ ; been changed in the meantime by another thread
+ jnc finish ; nothing changed, no need to write
+ pand xmm3, xmm0 ; mask and case bit
+ pxor xmm2, xmm3 ; change case bit
+
+%if 0 ; Method with maskmovdqu is elegant, but slow because maskmovdqu uses nontemporal (uncached) write
+ push rdi
+ mov rdi, par1
+ maskmovdqu xmm2, xmm0
+ pop rdi
+finish: ret
+
+%else ; less elegant alternative, but probably faster if data needed again soon
+ ; write 8-4-2-1 bytes, if necessary
+ pmovmskb eax, xmm0 ; create bit mask
+ cmp eax, 10000000b
+ jb L10
+ ; there are at least 8 bytes to write
+ movq [par1], xmm2
+ psrldq xmm2, 8
+ add par1, 8
+ shr eax, 8
+L10: cmp eax, 1000b
+ jb L20
+ ; there are at least 4 bytes to write
+ movd [par1], xmm2
+ psrldq xmm2, 4
+ add par1, 4
+ shr eax, 4
+L20: movd edx, xmm2 ; use edx for last 3 bytes
+ cmp eax, 10b
+ jb L30
+ ; there are at least 2 bytes to write
+ mov [par1], dx
+ shr edx, 16
+ add par1, 2
+ shr eax, 2
+L30: cmp eax, 1
+ jb finish
+ ; there is one more byte to write
+ mov [par1], dl
+finish: ret
+%endif
+
+; SSE2 version
+strtolowerGeneric:
+A100: ; loop
+ mov al, [par1]
+ test al, al
+ jz A900 ; end of string
+ sub al, 'A'
+ cmp al, 'Z' - 'A'
+ jbe A200 ; is upper case
+ inc par1
+ jmp A100 ; loop to next character
+A200: ; convert to lower case
+ add al, 'a'
+ mov [par1], al
+ inc par1
+ jmp A100
+A900: ret
+;strtolowerGeneric end
+
+strtoupperGeneric:
+B100: ; loop
+ mov al, [par1]
+ test al, al
+ jz B900 ; end of string
+ sub al, 'a'
+ cmp al, 'z' - 'a'
+ jbe B200 ; is lower case
+ inc par1
+ jmp B100 ; loop to next character
+B200: ; convert to upper case
+ add al, 'A'
+ mov [par1], al
+ inc par1
+ jmp B100
+B900: ret
+;strtoupperGeneric end
+
+
+; CPU dispatching for strtolower. This is executed only once
+strtolowerCPUDispatch:
+ ; get supported instruction set
+ push par1
+ call InstructionSet
+ pop par1
+ ; Point to generic version
+ lea rdx, [strtolowerGeneric]
+ cmp eax, 10 ; check SSE4.2
+ jb Q100
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version
+ lea rdx, [strtolowerSSE42]
+Q100: mov [strtolowerDispatch], rdx
+ ; Continue in appropriate version
+ jmp rdx
+
+; CPU dispatching for strtoupper. This is executed only once
+strtoupperCPUDispatch:
+ ; get supported instruction set
+ push par1
+ call InstructionSet
+ pop par1
+ ; Point to generic version
+ lea rdx, [strtoupperGeneric]
+ cmp eax, 10 ; check SSE4.2
+ jb Q200
+ ; SSE4.2 supported
+ ; Point to SSE4.2 version
+ lea rdx, [strtoupperSSE42]
+Q200: mov [strtoupperDispatch], rdx
+ ; Continue in appropriate version
+ jmp rdx
+
+
+SECTION .data
+
+; Pointer to appropriate version. Initially points to dispatcher
+strtolowerDispatch DQ strtolowerCPUDispatch
+strtoupperDispatch DQ strtoupperCPUDispatch
+
+; Append 16 bytes to end of last data section to allow reading past end of strings:
+; (We might use names like .bss$zzz under Windows to make sure it is placed
+; last, but the assembler gives sections with unknown names the wrong attributes.
+; Here, we are just relying on library data being placed after main data.
+; This can be verified by making a link map file)
+SECTION .bss
+ dq 0, 0
diff --git a/asmlibSrc/substring32.asm b/asmlibSrc/substring32.asm
new file mode 100755
index 0000000..de450d2
--- /dev/null
+++ b/asmlibSrc/substring32.asm
@@ -0,0 +1,61 @@
+;************************* substring32.asm **********************************
+; Author: Agner Fog
+; Date created: 2011-07-18
+; Last modified: 2011-07-18
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Description:
+; Makes a substring of a zero-terminated ASCII string
+;
+; C++ prototype:
+; extern "C"
+; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
+; Makes a substring from source, starting at position pos (zero-based) and length
+; len and stores it in the array dest. It is the responsibility of the programmer
+; that the size of the dest array is at least len + 1.
+; The return value is the actual length of the substring. This may be less than
+; len if the length of source is less than pos + len.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses/gpl.html
+;******************************************************************************
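
A small example of the clamping behaviour described above (C++; it assumes A_substring is declared in asmlib.h with the prototype shown, and dest is sized for the largest len passed, per the note above):

    #include <cstddef>
    #include <cstdio>
    #include "asmlib.h"

    int main() {
        const char *src = "Hello, world";             // strlen(src) == 12
        char dest[32];

        size_t n1 = A_substring(dest, src, 7, 5);     // copies "world", returns 5
        std::printf("%zu '%s'\n", n1, dest);

        size_t n2 = A_substring(dest, src, 7, 20);    // only 5 chars remain: still "world", returns 5
        std::printf("%zu '%s'\n", n2, dest);

        size_t n3 = A_substring(dest, src, 20, 3);    // pos beyond the end: empty string, returns 0
        std::printf("%zu '%s'\n", n3, dest);
        return 0;
    }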
+
+global _A_substring: function ; Function _A_substring
+
+extern _A_strlen
+extern _A_memcpy
+
+SECTION .text
+
+; extern "C" 4 8 12 16
+; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
+
+_A_substring:
+ mov ecx, [esp+8] ; source
+ push ecx
+ call _A_strlen ; eax = strlen(source)
+ pop ecx
+ mov edx, [esp+12] ; pos
+ sub eax, edx ; max length = strlen(source) - pos
+ jbe empty ; strlen(source) <= pos. Return empty string
+ mov ecx, [esp+16] ; len
+ cmp eax, ecx
+ cmova eax, ecx ; min(len, maxlen)
+ add edx, [esp+8] ; source + pos
+ mov ecx, [esp+4] ; dest
+ push eax ; length for memcpy
+ push edx ; source for memcpy
+ push ecx ; dest for memcpy
+ call _A_memcpy
+ pop ecx
+ pop edx
+ pop eax ; return final length
+ mov byte [ecx+eax], 0 ; terminating zero
+ ret
+
+empty: ; return empty string
+ mov ecx, [esp+4] ; dest
+ xor eax, eax ; return 0
+ mov byte [ecx], al
+ ret
+
+;_A_substring END
diff --git a/asmlibSrc/substring64.asm b/asmlibSrc/substring64.asm
new file mode 100755
index 0000000..f911a98
--- /dev/null
+++ b/asmlibSrc/substring64.asm
@@ -0,0 +1,73 @@
+;************************* substring64.asm **********************************
+; Author: Agner Fog
+; Date created: 2011-07-18
+; Last modified: 2011-07-18
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Description:
+; Makes a substring of a zero-terminated ASCII string
+;
+; C++ prototype:
+; extern "C"
+; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
+; Makes a substring from source, starting at position pos (zero-based) and length
+; len and stores it in the array dest. It is the responsibility of the programmer
+; that the size of the dest array is at least len + 1.
+; The return value is the actual length of the substring. This may be less than
+; len if the length of source is less than pos + len.
+;
+; Copyright (c) 2011 GNU General Public License www.gnu.org/licenses/gpl.html
+;******************************************************************************
+
+global A_substring: function ; Function _A_substring
+
+extern A_strlen
+extern A_memcpy
+
+SECTION .text
+
+; extern "C"
+; size_t A_substring(char * dest, const char * source, size_t pos, size_t len);
+
+%ifdef WINDOWS
+%define par1 rcx ; dest
+%define par2 rdx ; source
+%define par3 r8 ; pos
+%define par4 r9 ; len
+%else ; UNIX
+%define par1 rdi
+%define par2 rsi
+%define par3 rdx
+%define par4 rcx
+%endif
+
+A_substring:
+ push par1
+ push par2
+ push par3
+ push par4
+ mov par1, par2
+ call A_strlen ; rax = strlen(source)
+ pop par4
+ pop par3
+ pop par2
+ pop par1
+ sub rax, par3 ; max length = strlen(source) - pos
+ jbe empty ; strlen(source) <= pos. Return empty string
+ cmp rax, par4
+ cmova rax, par4 ; min(len, maxlen)
+ add par2, par3 ; source + pos = source for memcpy
+ mov par3, rax ; length for memcpy
+ push rax ; new length
+ call A_memcpy
+ pop rcx ; new length = return value, rax = dest
+ mov byte [rcx+rax], 0 ; terminating zero
+ mov rax, rcx ; return new length
+ ret
+
+empty: ; return empty string
+ xor eax, eax ; return 0
+ mov byte [par1], al
+ ret
+
+;A_substring END
diff --git a/asmlibSrc/testalib.cpp b/asmlibSrc/testalib.cpp
new file mode 100755
index 0000000..fa185c8
--- /dev/null
+++ b/asmlibSrc/testalib.cpp
@@ -0,0 +1,151 @@
+/*************************** testalib.cpp **********************************
+* Author: Agner Fog
+* Date created: 2007-06-14
+* Last modified: 2011-07-17
+* Project: asmlib.zip
+* Source URL: www.agner.org/optimize
+*
+* Description:
+* Simple test of asmlib library
+*
+* Instructions:
+* Compile for console mode and link with the appropriate version of asmlib
+*
+* Further documentation:
+* The file asmlib-instructions.pdf contains further documentation and
+* instructions.
+*
+* Copyright 2007-2011 by Agner Fog.
+* GNU General Public License http://www.gnu.org/licenses/gpl.html
+*****************************************************************************/
+
+#include <stdio.h>
+#include <string.h>
+#include <memory.h>
+#include <stdlib.h>
+#include "asmlib.h"
+
+
+void Failure(const char * text) {
+ // Report a test failure
+ printf("\nTest failed: %s\n", text);
+ exit(1);
+}
+
+int main () {
+
+ // test InstructionSet()
+ printf("\nInstructionSet = %i", InstructionSet());
+
+ // test cpuid_abcd
+ int abcd[4]; char s[16];
+ cpuid_abcd(abcd, 0);
+ *(int*)(s+0) = abcd[1]; // ebx
+ *(int*)(s+4) = abcd[3]; // edx
+ *(int*)(s+8) = abcd[2]; // ecx
+ s[12] = 0; // terminate string
+ printf("\nVendor string = %s", s);
+
+ // test ProcessorName()
+ printf("\nProcessorName = %s", ProcessorName());
+
+ // test CpuType
+ int vendor, family, model;
+ CpuType(&vendor, &family, &model);
+ printf("\nCpuType: vendor %i, family 0x%X, model 0x%X", vendor, family, model);
+
+ // test DataCacheSize
+ printf("\nData cache size: L1 %ikb, L2 %ikb, L3 %ikb",
+ (int)DataCacheSize(1)/1024, (int)DataCacheSize(2)/1024, (int)DataCacheSize(3)/1024);
+
+ // test ReadTSC()
+ ReadTSC();
+ int tsc = (int)ReadTSC();
+ tsc = (int)ReadTSC() - tsc;
+ printf("\nReadTSC takes %i clocks\n\n", tsc);
+
+ // test Round();
+ double d;
+ for (d = -1; d <= 1; d += 0.5) {
+ printf("Round %f = %i = %i\n", d, Round(d), Round(float(d)));
+ }
+
+ // Test memory and string functions
+ int i, n;
+ const int strsize = 256;
+ char string1[strsize], string2[strsize];
+ const char * teststring = "abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ 1234567890 @`'{}[]()<>";
+
+ // Initialize strings
+ A_memset(string1, 0, strsize);
+ A_memset(string2, 0, strsize);
+
+ // Test A_strcpy, A_strcat, A_strlen
+ A_strcpy(string1, teststring);
+ n = strsize/(int)A_strlen(teststring);
+ for (i = 0; i < n-1; i++) {
+ A_strcat(string1, teststring);
+ }
+ if (A_strlen(string1) != n * A_strlen(teststring)) Failure("A_strcpy, A_strcat, A_strlen");
+
+ // Test A_stricmp
+ A_memcpy(string2, string1, strsize);
+ string2[4] ^= 0x20; string1[30] ^= 0x20; // Change case
+ if (A_stricmp(string1, string2) != 0) Failure("A_stricmp");
+ string2[8] += 2; // Make strings different
+ if (A_stricmp(string1, string2) >= 0) Failure("A_stricmp");
+ string2[7] -= 2; // Make strings different
+ if (A_stricmp(string1, string2) <= 0) Failure("A_stricmp");
+
+ // test A_strtolower and A_strtoupper
+ A_strcpy(string1, teststring);
+ A_strcpy(string2, teststring);
+ A_strtolower(string1);
+ A_strtoupper(string2);
+ printf("\nstring converted to lower and upper case:\n%s\n%s\n%s",
+ teststring, string1, string2);
+
+ // test strspn and strcspn
+ int n1, n2;
+ const int nset = 4;
+ const char * tset[] = {"abc", "", "01234567890123456789", "abcdefghijklmnopqrstuvwxyz"};
+ for (i = 0; i < nset; i++) {
+ n1 = A_strspn(teststring, tset[i]);
+ n2 = strspn(teststring, tset[i]);
+ if (n1 != n2) Failure("A_strspn");
+ n1 = A_strcspn(teststring, tset[i]);
+ n2 = strcspn(teststring, tset[i]);
+ if (n1 != n2) Failure("A_strcspn");
+ }
+
+ // Test A_memmove with overlapping source and destination
+ A_memcpy(string2, string1, strsize);
+
+ A_memcpy(string1+5, string1+12, 12);
+ memcpy (string2+5, string2+12, 12);
+ if (A_stricmp(string1, string2) != 0) Failure("memcpy");
+
+ A_memcpy(string1+5, string1+12, 130);
+ memcpy (string2+5, string2+12, 130);
+ if (A_stricmp(string1, string2) != 0) Failure("memcpy");
+
+ A_memmove(string1+5, string1+2, 12);
+ memmove (string2+5, string2+2, 12);
+ if (A_stricmp(string1, string2) != 0) Failure("A_memmove");
+
+ A_memmove(string1+3, string1+8, 12);
+ memmove (string2+3, string2+8, 12);
+ if (A_stricmp(string1, string2) != 0) Failure("A_memmove");
+
+ A_memmove(string1+41, string1+30, 100);
+ memmove (string2+41, string2+30, 100);
+ if (A_stricmp(string1, string2) != 0) Failure("A_memmove");
+
+ A_memmove(string1+32, string1+48, 177);
+ memmove (string2+32, string2+48, 177);
+ if (A_stricmp(string1, string2) != 0) Failure("A_memmove");
+
+ printf("\n\nTests passed OK\n");
+
+ return 0;
+}
diff --git a/asmlibSrc/testmem.cpp b/asmlibSrc/testmem.cpp
new file mode 100755
index 0000000..34fe160
--- /dev/null
+++ b/asmlibSrc/testmem.cpp
@@ -0,0 +1,396 @@
+// TESTMEM.CPP Agner Fog 2011-07-04
+
+// Test file for asmlib memcpy and memmove functions
+// Instructions: Compile on any platform and link with the appropriate
+// version of the asmlib library.
+
+#include <stdio.h>
+//#include <process.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <string.h>
+#include "asmlib.h"
+
+// define function type
+typedef void * memcpyF (void * dest, const void * src, size_t count);
+typedef void * memsetF (void * dest, int c, size_t count);
+
+
+extern "C" {
+ extern int IInstrSet;
+ // function prototypes for CPU specific function versions
+ memcpyF memcpy386, memcpySSE2, memcpySSSE3, memcpyU, memcpyU256;
+ memcpyF memmove386, memmoveSSE2, memmoveSSSE3, memmoveU, memmoveU256;
+ memsetF memset386, memsetSSE2, memsetAVX;
+}
+
+// Tables of function pointers
+#if defined(_WIN64) || defined(_M_X64) || defined(__amd64)
+const int NUMFUNC = 5;
+memcpyF * memcpyTab[NUMFUNC] = {A_memcpy, memcpySSE2, memcpySSSE3, memcpyU, memcpyU256};
+memcpyF * memmoveTab[NUMFUNC] = {A_memmove, memmoveSSE2, memmoveSSSE3, memmoveU, memmoveU256};
+const char * DispatchNames[NUMFUNC] = {"Dispatched", "SSE2", "SSSE3", "Unalign", "U256"};
+int isetreq [NUMFUNC] = {0, 4, 6, 4, 11}; // instruction set required
+const int MEMSETFUNCS = 3;
+memsetF * memsetTab[MEMSETFUNCS] = {A_memset, memsetSSE2, memsetAVX};
+const char * memsetNames[MEMSETFUNCS] = {"Dispatched", "SSE2", "AVX"};
+int memsetreq [MEMSETFUNCS] = {0, 4, 11}; // instruction set required
+#else
+const int NUMFUNC = 6;
+memcpyF * memcpyTab[NUMFUNC] = {A_memcpy, memcpy386, memcpySSE2, memcpySSSE3, memcpyU, memcpyU256};
+memcpyF * memmoveTab[NUMFUNC] = {A_memmove, memmove386, memmoveSSE2, memmoveSSSE3, memmoveU, memmoveU256};
+const char * DispatchNames[NUMFUNC] = {"Dispatched", "386", "SSE2", "SSSE3", "Unalign", "U256"};
+int isetreq [NUMFUNC] = {0, 0, 4, 6, 4, 11}; // instruction set required
+const int MEMSETFUNCS = 4;
+memsetF * memsetTab[MEMSETFUNCS] = {A_memset, memset386, memsetSSE2, memsetAVX};
+const char * memsetNames[MEMSETFUNCS] = {"Dispatched", "386", "SSE2", "AVX"};
+int memsetreq [MEMSETFUNCS] = {0, 0, 4, 11}; // instruction set required
+#endif
+
+
+
+void error(const char * s, int a, int b, int c) {
+ printf("\nError %s: %i %i %i\n", s, a, b, c);
+ exit (1);
+}
+
+void error(const char * s, int i, int a, int b, int c) {
+ printf("\nError %s: %i %i %i %i\n", s, i, a, b, c);
+ exit (1);
+}
+
+int main () {
+
+ int ao, bo, os, len;
+ int version;
+ const int pagesize = 0x1000; // 4 kbytes
+ const int n = 16*pagesize;
+ char a[n], b[n], c[n];
+ int instrset = InstructionSet();
+
+ // CacheBypassLimit = 5;
+ printf("\nmemcpy cache limit = 0x%X, memset cache limit 0x%X\n",
+ (int)GetMemcpyCacheLimit(), (int)GetMemsetCacheLimit());
+
+ printf("\nTest memcpy");
+
+ int i, x = 91;
+ for (i=0; i<n; i++) {
+ x += 23;
+ a[i] = (char)x;
+ }
+
+ A_memset(b, -1, n);
+
+ SetMemcpyCacheLimit(0); // default
+
+#if 1
+ // Test memcpy for correctness
+ // Loop through versions
+ for (version = 0; version < NUMFUNC; version++) {
+
+ printf("\n%s", DispatchNames[version]);
+ if (instrset < isetreq[version]) {
+ // instruction set not supported
+ printf(" skipped"); continue;
+ }
+
+ for (len=0; len<514; len++) {
+ for (ao = 0; ao <=20; ao++) {
+ for (bo = 0; bo <=32; bo++) {
+ A_memset(b, -1, len+96);
+ (*memcpyTab[version])(b+bo, a+ao, len);
+ if (bo && b[bo-1] != -1) error("A", ao, bo, len);
+ if (b[bo+len] != -1) error("B", ao, bo, len);
+ if (len==0) continue;
+ if (b[bo] != a[ao]) error("C", ao, bo, len);
+ if (b[bo+len-1] != a[ao+len-1]) error("D", ao, bo, len);
+ if (memcmp(b+bo, a+ao, len)) error("E", ao, bo, len);
+ }
+ }
+ }
+ // check false memory dependence branches
+ len = 300;
+ A_memcpy(b, a, 3*pagesize);
+ for (ao = pagesize-300; ao < pagesize+200; ao++) {
+ for (bo = 3*pagesize; bo <=3*pagesize+33; bo++) {
+ A_memset(b+bo-64, -1, len+128);
+ (*memcpyTab[version])(b+bo, b+ao, len);
+ if (b[bo-1] != -1) error("A1", ao, bo, len);
+ if (b[bo+len] != -1) error("B1", ao, bo, len);
+ if (memcmp(b+bo, b+ao, len)) error("E1", ao, bo, len);
+ }
+ }
+ // check false memory dependence branches with overlap
+ // src > dest and overlap: must copy forwards
+ len = pagesize+1000;
+ for (ao = 2*pagesize; ao <=2*pagesize+33; ao++) {
+ for (bo = pagesize-200; bo < pagesize+300; bo++) {
+ A_memcpy(b, a, 4*pagesize);
+ A_memcpy(c, a, 4*pagesize);
+ (*memcpyTab[version])(b+bo, b+ao, len);
+ //memcpy(c+bo, c+ao, len); // Most library versions of memcpy are actually memmove
+ memcpySSE2(c+bo, c+ao, len);
+ if (memcmp(b, c, 4*pagesize)) {
+ error("E2", ao-pagesize, bo-2*pagesize, len);
+ }
+ }
+ }
+ // check false memory dependence branches with overlap
+ // dest > src and overlap: undefined behavior
+#if 1
+ len = pagesize+1000;
+ for (ao = pagesize-200; ao < pagesize+200; ao++) {
+ for (bo = 2*pagesize; bo <=2*pagesize+33; bo++) {
+ A_memcpy(b, a, 4*pagesize);
+ A_memcpy(c, a, 4*pagesize);
+ (*memcpyTab[version])(b+bo, b+ao, len);
+ //memcpy(c+bo, c+ao, len); // Most library versions of memcpy are actually memmove
+ memcpySSE2(c+bo, c+ao, len);
+ if (memcmp(b, c, 4*pagesize)) {
+ error("E3", ao-pagesize, bo-2*pagesize, len);
+ }
+ }
+ }
+#endif
+ }
+ printf("\n\nTest memmove");
+
+ // Test memmove for correctness
+ for (i=0; i<n; i++) {
+ x += 23;
+ a[i] = char(x);
+ }
+
+ // Loop through versions
+ for (version = 0; version < NUMFUNC; version++) {
+ printf("\n%s", DispatchNames[version]);
+ if (instrset < isetreq[version]) {
+ // instruction set not supported
+ printf(" skipped"); continue;
+ }
+
+ // move forward
+ for (len=0; len<400; len++) {
+ for (bo = 0; bo <=33; bo++) {
+ for (os = 0; os <= 33; os++) {
+ A_memcpy(b, a, len+100);
+ (*memmoveTab[version])(b+bo+os, b+bo, len);
+ for (i=0; i<bo+os; i++) if (b[i]!=a[i]) error("E", i, bo, os, len);
+ for (i=bo+os; i<bo+os+len; i++) if (b[i] != a[i-os]) error("F", i, bo, os, len);
+ for (;i < bo+os+len+20; i++) if (b[i]!=a[i]) error("G", i, bo, os, len);
+ }
+ }
+ }
+ // move backwards
+ for (len=0; len<400; len++) {
+ for (bo = 0; bo <=33; bo++) {
+ for (os = 0; os < 33; os++) {
+ A_memcpy(b, a, len+96);
+ (*memmoveTab[version])(b+bo, b+bo+os, len);
+ for (i=0; i<bo; i++) if (b[i]!=a[i]) error("H", i, bo, os, len);
+ for (i=bo; i<bo+len; i++) if (b[i] != a[i+os]) error("I", i, bo, os, len);
+ for (;i < bo+len+20; i++) if (b[i]!=a[i]) error("J", i, bo, os, len);
+ }
+ }
+ }
+ }
+
+ printf("\n\nSame, with non-temporal moves");
+ SetMemcpyCacheLimit(1); // bypass cache
+
+ // Loop through versions
+ for (version = 0; version < NUMFUNC; version++) {
+
+ printf("\n%s", DispatchNames[version]);
+ if (instrset < isetreq[version]) {
+ // instruction set not supported
+ printf(" skipped"); continue;
+ }
+
+ for (len=0; len<514; len++) {
+ for (ao = 0; ao <=20; ao++) {
+ for (bo = 0; bo <=32; bo++) {
+ A_memset(b, -1, len+96);
+ (*memcpyTab[version])(b+bo, a+ao, len);
+ if (bo && b[bo-1] != -1) error("A", ao, bo, len);
+ if (b[bo+len] != -1) error("B", ao, bo, len);
+ if (len==0) continue;
+ if (b[bo] != a[ao]) error("C", ao, bo, len);
+ if (b[bo+len-1] != a[ao+len-1]) error("D", ao, bo, len);
+ if (memcmp(b+bo, a+ao, len)) error("E", ao, bo, len);
+ }
+ }
+ }
+ // check false memory dependence branches
+ len = 300;
+ A_memcpy(b, a, 3*pagesize);
+ for (ao = pagesize-200; ao < pagesize+200; ao++) {
+ for (bo = 3*pagesize; bo <=3*pagesize+33; bo++) {
+ A_memset(b+bo-64, -1, len+128);
+ (*memcpyTab[version])(b+bo, b+ao, len);
+ if (b[bo-1] != -1) error("A1", ao, bo, len);
+ if (b[bo+len] != -1) error("B1", ao, bo, len);
+ if (memcmp(b+bo, b+ao, len)) error("E1", ao, bo, len);
+ }
+ }
+ // check false memory dependence branches with overlap
+ // src > dest and overlap: must copy forwards
+ len = pagesize+1000;
+ for (ao = 2*pagesize; ao <=2*pagesize+33; ao++) {
+ for (bo = pagesize-200; bo < pagesize+200; bo++) {
+ A_memcpy(b, a, 4*pagesize);
+ A_memcpy(c, a, 4*pagesize);
+ (*memcpyTab[version])(b+bo, b+ao, len);
+ //memcpy(c+bo, c+ao, len); // Most library versions of memcpy are actually memmove
+ memcpySSE2(c+bo, c+ao, len);
+ if (memcmp(b, c, 4*pagesize)) {
+ error("E2", ao-pagesize, bo-2*pagesize, len);
+ }
+ }
+ }
+ // (check false memory dependence branches with overlap. skipped)
+ }
+ printf("\n\nTest memmove");
+
+ // Test memmove for correctness
+ for (i=0; i<n; i++) {
+ x += 23;
+ a[i] = char(x);
+ }
+
+ // Loop through versions
+ for (version = 0; version < NUMFUNC; version++) {
+ printf("\n%s", DispatchNames[version]);
+ if (instrset < isetreq[version]) {
+ // instruction set not supported
+ printf(" skipped"); continue;
+ }
+
+ // move forward
+ for (len=0; len<400; len++) {
+ for (bo = 0; bo <=33; bo++) {
+ for (os = 0; os <= 33; os++) {
+ A_memcpy(b, a, len+100);
+ (*memmoveTab[version])(b+bo+os, b+bo, len);
+ for (i=0; i<bo+os; i++) if (b[i]!=a[i]) error("E", i, bo, os, len);
+ for (i=bo+os; i<bo+os+len; i++) if (b[i] != a[i-os]) error("F", i, bo, os, len);
+ for (;i < bo+os+len+20; i++) if (b[i]!=a[i]) error("G", i, bo, os, len);
+ }
+ }
+ }
+ // move backwards
+ for (len=0; len<400; len++) {
+ for (bo = 0; bo <=33; bo++) {
+ for (os = 0; os < 33; os++) {
+ A_memcpy(b, a, len+96);
+ (*memmoveTab[version])(b+bo, b+bo+os, len);
+ for (i=0; i<bo; i++) if (b[i]!=a[i]) error("H", i, bo, os, len);
+ for (i=bo; i<bo+len; i++) if (b[i] != a[i+os]) error("I", i, bo, os, len);
+ for (;i < bo+len+20; i++) if (b[i]!=a[i]) error("J", i, bo, os, len);
+ }
+ }
+ }
+ }
+#endif
+ SetMemcpyCacheLimit(0); // back to default
+ SetMemsetCacheLimit(0);
+
+ printf("\n\nTest memset");
+
+ // test memset
+ const int val1 = 0x4C, val2 = 0xA2, len2 = 1024;
+ for (version = 0; version < MEMSETFUNCS; version++) {
+ memsetF * func = memsetTab[version];
+ printf("\n%s", memsetNames[version]);
+ if (instrset < memsetreq[version]) {
+ // instruction set not supported
+ printf(" skipped"); continue;
+ }
+ for (os = 0; os < 34; os++) {
+ for (len = 0; len < 500; len++) {
+ memset(a, val1, len2);
+ memset(a+os, val2, len);
+ (*func)(b, val1, len2);
+ (*func)(b+os, val2, len);
+ if (memcmp(a, b, len2)) {
+ error("MS", version, os, len);
+ }
+ }
+ }
+ for (len=0; len<200; len++) {
+ for (os = 0; os <= 33; os++) {
+ A_memcpy(b, a, len+64);
+ A_memset(b+os, 55, len);
+ for (i=0; i<os; i++) if (b[i] != a[i]) error("K", i, os, len);
+ for (; i<os+len; i++) if (b[i] != 55) error("L", i, os, len);
+ for (; i<os+len+17; i++) if (b[i] != a[i]) error("M", i, os, len);
+ }
+ }
+ }
+
+ printf("\n\nSame, with non-temporal moves");
+ SetMemsetCacheLimit(1); // bypass cache
+
+ for (version = 0; version < MEMSETFUNCS; version++) {
+ memsetF * func = memsetTab[version];
+ printf("\n%s", memsetNames[version]);
+ if (instrset < memsetreq[version]) {
+ // instruction set not supported
+ printf(" skipped"); continue;
+ }
+ for (os = 0; os < 34; os++) {
+ for (len = 0; len < 500; len++) {
+ memset(a, val1, len2);
+ memset(a+os, val2, len);
+ (*func)(b, val1, len2);
+ (*func)(b+os, val2, len);
+ if (memcmp(a, b, len2)) {
+ error("MS", version, os, len);
+ }
+ }
+ }
+ }
+ SetMemsetCacheLimit(0); // back to default
+
+ printf("\n\nTest strlen");
+
+ // test strlen
+ for (len=0; len<400; len++) {
+ for (os = 0; os <= 32; os++) {
+ A_memset(b, 0, len+64);
+ A_memset(b+os, 'a', len);
+ x = A_strlen(b+os);
+ if (x != len) error("N", 0, os, len);
+ A_memset(b, 1, len+64);
+ b[os+len] = 0;
+ x = A_strlen(b+os);
+ if (x != len) error("O", 0, os, len);
+ }
+ }
+
+ printf("\n\nTest strcpy and strcat");
+
+ // test strcpy and strcat
+ for (i=0; i<n; i++) {
+ x += 23;
+ a[i] = char(x) | 1;
+ }
+ for (len=0; len<400; len++) {
+ for (os = 0; os <= 16; os++) {
+ for (i=0; i<33; i++) {
+ A_memmove(b, a, len+64);
+ b[os+len] = 0;
+ A_strcpy(c+5, b+os);
+ if (A_strlen(c+5) != len) error("P", 0, os, len);
+ A_memmove(b+55, a, i+4);
+ b[55+i] = 0;
+ A_strcat(c+5, b+55);
+ if (A_strlen(c+5) != len+i) error("R", 0, os, len);
+ }
+ }
+ }
+ printf("\n\nSuccess\n");
+
+ return 0;
+}
diff --git a/asmlibSrc/testrandom.cpp b/asmlibSrc/testrandom.cpp
new file mode 100755
index 0000000..4cfc59b
--- /dev/null
+++ b/asmlibSrc/testrandom.cpp
@@ -0,0 +1,130 @@
+/*************************** testrandom.cpp ******************************
+* Author: Agner Fog
+* Date created: 2013-09-09
+* Last modified: 2013-09-09
+* Project: asmlib.zip
+* Source URL: www.agner.org/optimize
+*
+* Description:
+* Test random number generators
+*
+* Instructions:
+* Compile for console mode and link with the appropriate version of asmlib
+*
+* Further documentation:
+* The file asmlib-instructions.pdf contains further documentation and
+* instructions.
+*
+* Copyright 2007-2011 by Agner Fog.
+* GNU General Public License http://www.gnu.org/licenses/gpl.html
+*****************************************************************************/
+
+#include <stdio.h>
+#include "asmlibran.h"
+#include "randomc.h"
+#include "sfmt.h"
+
+#include "mersenne.cpp"
+#include "mother.cpp"
+#include "sfmt.cpp"
+
+
+const int includeMother = 1;
+const int useInitByArray = 1;
+
+
+
+int main () {
+ int i;
+ uint32_t a, b, c;
+ const int numseeds = 5;
+ int seeds[numseeds] = {1,2,3,4,5};
+ PhysicalSeed(seeds, 1);
+ printf("\nSeed: %08X\n", seeds[0]);
+
+ CRandomMersenneA mersa(0);
+ CRandomMersenne mersc(0);
+ mersa.RandomInit(seeds[0]);
+ mersc.RandomInit(seeds[0]);
+ MersenneRandomInit(seeds[0]);
+
+ if (useInitByArray) {
+ mersa.RandomInitByArray(seeds, numseeds);
+ mersc.RandomInitByArray(seeds, numseeds);
+ MersenneRandomInitByArray(seeds, numseeds);
+ }
+
+ printf("\nMersenne:");
+ for (i=0; i<1000; i++) {
+ a = mersa.BRandom();
+ b = MersenneBRandom();
+ c = mersc.BRandom();
+ if (a != b || a != c) {
+ printf("\nerror: %08X %08X %08X", a, b, c);
+ break;
+ }
+ else if (i == 0 || i == 99) {
+ printf("\n %08X %08X %08X", a, b, c);
+ }
+ }
+ printf("\n %8i %8i %8i", mersa.IRandom(0,9999), mersc.IRandom(0,9999), MersenneIRandom(0,9999));
+ printf("\n %8i %8i %8i", mersa.IRandomX(0,9999), mersc.IRandomX(0,9999), MersenneIRandomX(0,9999));
+ printf("\n %12.8f %12.8f %12.8f", mersa.Random(), mersc.Random(), MersenneRandom());
+
+
+ CRandomMotherA motha(0);
+ CRandomMother mothc(0);
+ motha.RandomInit(seeds[0]);
+ mothc.RandomInit(seeds[0]);
+ MotherRandomInit(seeds[0]);
+
+ printf("\n\nMother:");
+ for (i=0; i<1000; i++) {
+ a = motha.BRandom();
+ b = MotherBRandom();
+ c = mothc.BRandom();
+ if (a != b || a != c) {
+ printf("\nerror: %08X %08X %08X", a, b, c);
+ break;
+ }
+ else if (i == 0 || i == 99) {
+ printf("\n %08X %08X %08X", a, b, c);
+ }
+ }
+ printf("\n %8i %8i %8i", motha.IRandom(0,9999), mothc.IRandom(0,9999), MotherIRandom(0,9999));
+ printf("\n %12.8f %12.8f %12.8f", motha.Random(), mothc.Random(), MotherRandom());
+
+ CRandomSFMTA sfmta(0, includeMother);
+ CRandomSFMT sfmtc(0, includeMother);
+ sfmta.RandomInit(1, includeMother);
+ sfmtc.RandomInit(1);
+ SFMTgenRandomInit(1,includeMother);
+
+ if (useInitByArray) {
+ sfmta.RandomInitByArray(seeds, numseeds, includeMother);
+ sfmtc.RandomInitByArray(seeds, numseeds);
+ SFMTgenRandomInitByArray(seeds, numseeds, includeMother);
+ }
+
+ printf("\n\nSFMT:");
+ for (i=0; i<1000; i++) {
+ a = sfmta.BRandom();
+ b = SFMTgenBRandom();
+ c = sfmtc.BRandom();
+ if (a != b || a != c) {
+ printf("\nerror @%i: %08X %08X %08X", i, a, b, c);
+ break;
+ }
+ else if (i == 0 || i == 99) {
+ printf("\n %08X %08X %08X", a, b, c);
+ }
+ }
+ printf("\n %8i %8i %8i", sfmta.IRandom(0,9999), sfmtc.IRandom(0,9999), SFMTgenIRandom(0,9999));
+ printf("\n %8i %8i %8i", sfmta.IRandomX(0,9999), sfmtc.IRandomX(0,9999), SFMTgenIRandomX(0,9999));
+ printf("\n %12.8f %12.8f %12.8f", sfmta.Random(), sfmtc.Random(), SFMTgenRandom());
+
+
+ printf("\n");
+
+ return 0;
+}
diff --git a/asmlibSrc/unalignedisfaster32.asm b/asmlibSrc/unalignedisfaster32.asm
new file mode 100755
index 0000000..aac78e8
--- /dev/null
+++ b/asmlibSrc/unalignedisfaster32.asm
@@ -0,0 +1,178 @@
+;************************* unalignedisfaster32.asm ******************************
+; Author: Agner Fog
+; Date created: 2011-07-09
+; Last modified: 2013-08-30
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 32 bit
+;
+; C++ prototype:
+; extern "C" int UnalignedIsFaster(void);
+;
+; Description:
+; This function finds out if an unaligned 16-byte memory read is
+; faster than an aligned read followed by an alignment shift (PALIGNR) on the
+; current CPU.
+;
+; Return value:
+; 0: Unaligned read is probably slower than alignment shift
+; 1: Unknown
+; 2: Unaligned read is probably faster than alignment shift
+;
+;
+; C++ prototype:
+; extern "C" int Store256BitIsFaster(void);
+;
+; Description:
+; This function finds out if a 32-byte memory write is
+; faster than two 16-byte writes on the current CPU.
+;
+; Return value:
+; 0: 32-byte memory write is slower or AVX not supported
+; 1: Unknown
+; 2: 32-byte memory write is faster
+;
+; Copyright (c) 2011 - 2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" int UnalignedIsFaster(void);
+
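
These two query functions are intended to steer algorithm selection at run time. A minimal example of how a caller might use them (C++; the thresholds chosen here are one possible policy, not something asmlib prescribes):

    #include <cstdio>

    extern "C" int UnalignedIsFaster(void);      // 0 = slower, 1 = unknown, 2 = faster
    extern "C" int Store256BitIsFaster(void);    // 0 = slower or no AVX, 1 = unknown, 2 = faster

    int main() {
        bool preferUnaligned = UnalignedIsFaster() > 0;      // treat "unknown" as unaligned
        bool use256BitStores = Store256BitIsFaster() >= 2;   // only when clearly faster

        std::printf("prefer unaligned loads: %s\n", preferUnaligned ? "yes" : "no");
        std::printf("use 256-bit stores:     %s\n", use256BitStores ? "yes" : "no");
        return 0;
    }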
+global _UnalignedIsFaster: function
+global _Store256BitIsFaster: function
+extern _CpuType
+extern _InstructionSet
+
+
+SECTION .text
+
+_UnalignedIsFaster:
+ push ebx
+ push 0 ; vendor
+ push 0 ; family
+ push 0 ; model
+ mov eax, esp
+ push eax ; &model
+ add eax, 4
+ push eax ; &family
+ add eax, 4
+ push eax ; &vendor
+ call _CpuType ; get vendor, family, model
+ add esp, 12
+ pop edx ; model
+ pop ecx ; family
+ pop ebx ; vendor
+ xor eax, eax ; return value
+ dec ebx
+ jz Intel
+ dec ebx
+ jz AMD
+ dec ebx
+ jz VIA
+ ; unknown vendor
+ inc eax
+ jmp Uend
+
+Intel: ; Unaligned read is faster on Intel Nehalem and later, but not Atom
+ ; Nehalem = family 6, model 1AH
+ ; Atom = family 6, model 1CH
+ ; Netburst = family 0FH
+ ; Future models are likely to be family 6, maybe > 6, model > 1C
+ cmp ecx, 6
+ jb Uend ; old Pentium 1, etc
+ cmp ecx, 0FH
+ je Uend ; old Netburst architecture
+ cmp edx, 1AH
+ jb Uend ; earlier than Nehalem
+ cmp edx, 1CH
+ je Uend ; Intel Atom
+ or eax, 2 ; Intel Nehalem and later, except Atom
+ jmp Uend
+
+AMD: ; AMD processors:
+ ; The PALIGNR instruction is slow on AMD Bobcat but fast on Jaguar
+ ; K10/Opteron = family 10H ; Use unaligned
+ ; Bobcat = family 14H ; PALIGNR is very slow. Use unaligned
+ ; Piledriver = family 15H ; Use unaligned
+ ; Jaguar = family 16H ; PALIGNR is fast. Use aligned (aligned is faster in most cases, but not all)
+ cmp ecx, 10H ; AMD K8 or earlier: use aligned
+ jb Uend
+ cmp ecx, 16H ; Jaguar: use aligned
+ je Uend
+ or eax, 2 ; AMD K10 or later: use unaligned
+ jmp Uend
+
+
+VIA: ; Unaligned read is not faster than PALIGNR on VIA Nano 2000 and 3000
+ cmp ecx, 0FH
+ jna Uend ; VIA Nano
+ inc eax ; Future versions: unknown
+ ;jmp Uend
+
+Uend: pop ebx
+ ret
+
+;_UnalignedIsFaster ENDP
+
+
+_Store256BitIsFaster:
+ call _InstructionSet
+ cmp eax, 11 ; AVX supported
+ jb S90
+ push 0 ; vendor
+ push 0 ; family
+ push 0 ; model
+ mov eax, esp
+ push eax ; &model
+ add eax, 4
+ push eax ; &family
+ add eax, 4
+ push eax ; &vendor
+ call _CpuType ; get vendor, family, model
+ add esp, 12
+ pop edx ; model
+ pop ecx ; family
+ pop eax ; vendor
+
+ cmp eax, 1 ; Intel
+ je S_Intel
+ cmp eax, 2 ; AMD
+ je S_AMD
+ cmp eax, 3 ; VIA
+ je S_VIA
+ jmp S91 ; other vendor, not known
+
+S_Intel:cmp ecx, 6
+ jne S92 ; unknown family. possibly future model
+ ; model 2AH Sandy Bridge
+ ; model 3AH Ivy Bridge
+ ; model 3CH Haswell
+ ; Sandy Bridge and Ivy Bridge are slightly faster with 128 than with 256 bit moves on large data blocks
+ ; Haswell is much faster with 256 bit moves
+ cmp edx, 3AH
+ jbe S90
+ jmp S92
+
+S_AMD: ; AMD
+ cmp ecx, 15H ; family 15h = Bulldozer, Piledriver
+ ja S92 ; assume future AMD families are faster
+ ; model 1 = Bulldozer is a little slower on 256 bit write
+ ; model 2 = Piledriver is terribly slow on 256 bit write
+ ; assume future models 3-4 are like Bulldozer
+ cmp edx, 4
+ jbe S90
+ jmp S91 ; later models: don't know
+
+S_VIA: jmp S91 ; don't know
+
+S90: xor eax, eax ; return 0
+ ret
+
+S91: mov eax, 1 ; return 1
+ ret
+
+S92: mov eax, 2 ; return 2
+ ret
+
+; _Store256BitIsFaster ENDP
+
diff --git a/asmlibSrc/unalignedisfaster64.asm b/asmlibSrc/unalignedisfaster64.asm
new file mode 100755
index 0000000..c6a5ac9
--- /dev/null
+++ b/asmlibSrc/unalignedisfaster64.asm
@@ -0,0 +1,186 @@
+;************************* unalignedisfaster64.asm ******************************
+; Author: Agner Fog
+; Date created: 2011-07-09
+; Last modified: 2013-08-30
+; Source URL: www.agner.org/optimize
+; Project: asmlib.zip
+; Language: assembly, NASM/YASM syntax, 64 bit
+;
+; C++ prototype:
+; extern "C" int UnalignedIsFaster(void);
+;
+; Description:
+; This function finds out if an unaligned 16-byte memory read is
+; faster than an aligned read followed by an alignment shift (PALIGNR) on the
+; current CPU.
+;
+; Return value:
+; 0: Unaligned read is probably slower than alignment shift
+; 1: Unknown or equal
+; 2: Unaligned read is probably faster than alignment shift
+;
+;
+; C++ prototype:
+; extern "C" int Store256BitIsFaster(void);
+;
+; Description:
+; This function finds out if a 32-byte memory write is
+; faster than two 16-byte writes on the current CPU.
+;
+; Return value:
+; 0: 32-byte memory write is slower or AVX not supported
+; 1: Unknown
+; 2: 32-byte memory write is faster
+;
+; Copyright (c) 2011 - 2013 GNU General Public License www.gnu.org/licenses
+;******************************************************************************
+;
+; C++ prototype:
+; extern "C" int UnalignedIsFaster(void);
+
+global UnalignedIsFaster: function
+global Store256BitIsFaster: function
+extern CpuType
+extern InstructionSet
+
+
+SECTION .text
+
+UnalignedIsFaster:
+
+%ifdef UNIX
+ push 0 ; vendor
+ mov rdi, rsp
+ push 0 ; family
+ mov rsi, rsp
+ push 0 ; model
+ mov rdx, rsp
+%else ; WINDOWS
+ push 0 ; vendor
+ mov rcx, rsp
+ push 0 ; family
+ mov rdx, rsp
+ push 0 ; model
+ mov r8, rsp
+%endif
+ call CpuType ; get vendor, family, model
+ pop rdx ; model
+ pop rcx ; family
+ pop r8 ; vendor
+ xor eax, eax ; return value
+ dec r8d
+ jz Intel
+ dec r8d
+ jz AMD
+ dec r8d
+ jz VIA
+ ; unknown vendor
+ inc eax
+ jmp Uend
+
+Intel: ; Unaligned read is faster on Intel Nehalem and later, but not Atom
+ ; Nehalem = family 6, model 1AH
+ ; Atom = family 6, model 1CH
+ ; Netburst = family 0FH
+ ; Future models are likely to be family 6, maybe > 6, model > 1C
+ cmp ecx, 6
+ jb Uend ; old Pentium 1, etc
+ cmp ecx, 0FH
+ je Uend ; old Netburst architecture
+ cmp edx, 1AH
+ jb Uend ; earlier than Nehalem
+ cmp edx, 1CH
+ je Uend ; Intel Atom
+ or eax, 2 ; Intel Nehalem and later, except Atom
+ jmp Uend
+
+AMD: ; AMD processors:
+ ; The PALIGNR instruction is slow on AMD Bobcat but fast on Jaguar
+ ; K10/Opteron = family 10H ; Use unaligned
+ ; Bobcat = family 14H ; PALIGNR is very slow. Use unaligned
+ ; Piledriver = family 15H ; Use unaligned
+ ; Jaguar = family 16H ; PALIGNR is fast. Use aligned (aligned is faster in most cases, but not all)
+ cmp ecx, 10H ; AMD K8 or earlier: use aligned
+ jb Uend
+ cmp ecx, 16H ; Jaguar: use aligned
+ je Uend
+ or eax, 2 ; AMD K10 or later: use unaligned
+ jmp Uend
+
+VIA: ; Unaligned read is not faster than PALIGNR on VIA Nano 2000 and 3000
+ cmp ecx, 0FH
+ jna Uend ; VIA Nano
+ inc eax ; Future versions: unknown
+ ;jmp Uend
+
+Uend: ret
+
+;UnalignedIsFaster ENDP
+
+
+Store256BitIsFaster:
+ call InstructionSet
+ cmp eax, 11 ; AVX supported
+ jb S90
+%ifdef UNIX
+ push 0 ; vendor
+ mov rdi, rsp
+ push 0 ; family
+ mov rsi, rsp
+ push 0 ; model
+ mov rdx, rsp
+%else ; WINDOWS
+ push 0 ; vendor
+ mov rcx, rsp
+ push 0 ; family
+ mov rdx, rsp
+ push 0 ; model
+ mov r8, rsp
+%endif
+ call CpuType ; get vendor, family, model
+ pop rdx ; model
+ pop rcx ; family
+ pop rax ; vendor
+
+ cmp eax, 1 ; Intel
+ je S_Intel
+ cmp eax, 2 ; AMD
+ je S_AMD
+ cmp eax, 3
+ je S_VIA
+ jmp S91 ; other vendor, not known
+
+S_Intel:cmp ecx, 6
+ jne S92 ; unknown family. possibly future model
+ ; model 2AH Sandy Bridge
+ ; model 3AH Ivy Bridge
+ ; model 3CH Haswell
+ ; Sandy Bridge and Ivy Bridge are slightly faster with 128 than with 256 bit moves on large data blocks
+ ; Haswell is much faster with 256 bit moves
+ cmp edx, 3AH
+ jbe S90
+ jmp S92
+
+S_AMD: ; AMD
+ cmp ecx, 15H ; family 15h = Bulldozer, Piledriver
+ ja S92 ; assume future AMD families are faster
+ ; family 16H = Jaguar. 256 bit write is slightly faster
+ ; model 1 = Bulldozer is a little slower on 256 bit write
+ ; model 2 = Piledriver is terribly slow on 256 bit write
+ ; model 30h = Steamroller is reasonable on 256 bit write
+ cmp edx, 30h
+ jb S90
+ jmp S91 ; Steamroller: moderate
+
+S_VIA: jmp S91 ; don't know
+
+S90: xor eax, eax ; return 0
+ ret
+
+S91: mov eax, 1 ; return 1
+ ret
+
+S92: mov eax, 2 ; return 2
+ ret
+
+; Store256BitIsFaster ENDP
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/asmlib.git