[med-svn] [snpomatic] 01/15: Imported Upstream version 0.0.20151015
Sascha Steinbiss
sascha-guest at moszumanska.debian.org
Sat Dec 12 21:59:58 UTC 2015
This is an automated email from the git hooks/post-receive script.
sascha-guest pushed a commit to branch master
in repository snpomatic.
commit 662e5b7b774797ea074f0266aa37749a91f31200
Author: Sascha Steinbiss <sascha at steinbiss.name>
Date: Sat Dec 12 20:20:15 2015 +0000
Imported Upstream version 0.0.20151015
---
Makefile | 35 +
README.md | 1 +
doc/Manual.doc | Bin 0 -> 44032 bytes
gpl-3.0.txt | 674 +++++++++++++++++
src/TAlignmentOutput.cpp | 109 +++
src/TChromosomalIndices.cpp | 480 ++++++++++++
src/TChromosome.cpp | 129 ++++
src/TChromosomeAlign.cpp | 1750 +++++++++++++++++++++++++++++++++++++++++++
src/findknownsnps.cpp | 338 +++++++++
src/global_functions.cpp | 264 +++++++
src/mapcontigs.cpp | 90 +++
src/reassemble.cpp | 884 ++++++++++++++++++++++
src/snpomatic.h | 247 ++++++
src/ungap.cpp | 274 +++++++
src/variety.cpp | 249 ++++++
15 files changed, 5524 insertions(+)
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..bb5fdb7
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,35 @@
+# Use g++
+CXX=g++
+CXXFLAGS=-g -O3
+LIBS=-lm
+
+# Use Intel compiler (faster!)
+#CXX=icpc
+#CXXFLAGS=-g -O2 -ip -align -falign-functions -Wno-deprecated -Isrc -w
+#LIBS=
+
+
+
+findknownsnps: src/findknownsnps.o src/global_functions.o src/TChromosome.o src/TChromosomalIndices.o src/TAlignmentOutput.o src/TChromosomeAlign.o
+ $(CXX) $(CXXFLAGS) $(LIBS) $^ -o $@
+
+reassemble: src/reassemble.o src/global_functions.o src/TChromosome.o src/TChromosomalIndices.o src/TAlignmentOutput.o src/TChromosomeAlign.o
+ $(CXX) $(CXXFLAGS) $(LIBS) $^ -o $@
+
+variety: src/variety.o src/global_functions.o src/TChromosome.o src/TChromosomalIndices.o src/TAlignmentOutput.o src/TChromosomeAlign.o
+ $(CXX) $(CXXFLAGS) $(LIBS) $^ -o $@
+
+mapcontigs: src/mapcontigs.o src/global_functions.o src/TChromosome.o src/TChromosomalIndices.o src/TAlignmentOutput.o src/TChromosomeAlign.o
+ $(CXX) $(CXXFLAGS) $(LIBS) $^ -o $@
+
+ungap: src/ungap.o src/global_functions.o src/TChromosome.o src/TChromosomalIndices.o src/TAlignmentOutput.o src/TChromosomeAlign.o
+ $(CXX) $(CXXFLAGS) $(LIBS) $^ -o $@
+
+.cpp.o:
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+clean:
+ rm -f *.o
+ rm -f src/*.o
+ rm -f findknownsnps
+ rm -f variety
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b75c473
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+SNP-o-matic is a fast, stringent short-read mapping software. It supports a multitude of output types and formats, for uses in filtering reads, alignments, sequence-based genotyping calls, assisted reassembly of contigs etc.
diff --git a/doc/Manual.doc b/doc/Manual.doc
new file mode 100644
index 0000000..51d3f7e
Binary files /dev/null and b/doc/Manual.doc differ
diff --git a/gpl-3.0.txt b/gpl-3.0.txt
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/gpl-3.0.txt
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/src/TAlignmentOutput.cpp b/src/TAlignmentOutput.cpp
new file mode 100644
index 0000000..4c97a7c
--- /dev/null
+++ b/src/TAlignmentOutput.cpp
@@ -0,0 +1,109 @@
+#include "snpomatic.h"
+
+#define MINOCCUR 1
+
+void TAlignmentOutput::init ( TChromosome *_chr ) {
+ chr = _chr ;
+ align_full.assign ( chr->sequence.length() , false ) ;
+
+ align_size = MAXLINES * chr->sequence.length() ;
+ align = new char[align_size] ;
+ qalign = new char[align_size] ;
+
+ for ( uint i = 0 ; i < align_size ; i++ ) align[i] = qalign[i] = ' ' ;
+}
+
+void TAlignmentOutput::add_align ( const string &seq , const string &quality , uint pos , int chromosome ) {
+ if ( align_full[pos] ) return ;
+ int lane , seql = seq.length() ;
+ for ( lane = 0 ; lane < MAXLINES ; lane++ ) {
+ if ( align[twotoone(lane,pos)] + align[twotoone(lane,pos+seql/2-1)] + align[twotoone(lane,pos+seql-1)] == 96 ) break ;
+ }
+
+ if ( lane >= MAXLINES ) {
+ align_full[pos] = true ;
+ return ; // HARD CUT OFF
+ }
+
+ for ( int i = 0 ; i < seql ; i++ ) {
+ uint p = twotoone(lane,pos+i) ;
+ if ( p >= align_size ) continue ;
+ align[p] = seq[i] ;
+ qalign[p] = quality[i] ;
+ }
+}
+
+void TAlignmentOutput::show_pileup ( FILE *pileup , bool snps_only ) {
+ int search_snps = 0 , found_snps = 0 , bogus = 0 ;
+ uint base , l , cnt , p , lastns ;
+ uchar common ;
+ char *s2 = new char[MAXLINES+5] ;
+ char *s3 = new char[MAXLINES+5] ;
+ char *t , *u ;
+
+ uint count[256] ;
+
+ for ( base = 0 ; base < chr->sequence.length() ; base++ ) {
+ if ( snps_only && !isIUPAC[chr->sequence[base]] ) continue ;
+ if ( isIUPAC[chr->sequence[base]] ) search_snps++ ;
+ count['A']=count['C']=count['G']=count['T']=0 ;
+ cnt = 0 ;
+ common = ' ' ;
+ lastns = 0 ;
+ p = twotoone(0,base) ;
+ t = s2 ;
+ u = s3 ;
+ char *c = align + p ;
+ char *q = qalign + p ;
+ for ( l = 0 ; l < MAXLINES ; l++ , c++ , q++ ) {
+ if ( *c && *c != ' ' ) {
+ lastns = l ;
+ cnt++ ;
+ count[*c]++ ;
+ common = MERGE_IUPAC ( *c , common ) ;
+ if ( *q < MINQUAL + 33 ) *t++ = to_lc[*c] ;
+ else *t++ = *c ;
+ *u++ = *q ;
+ } else {
+ *t++ = ' ' ;
+ *u++ = ' ' ;
+ }
+ }
+ s2[lastns+1] = 0 ;
+ s3[lastns+1] = 0 ;
+
+ bool confirmed_snp = false ;
+ uint n = 0 ;
+ if ( count['A'] >= MINOCCUR ) n++ ;
+ if ( count['C'] >= MINOCCUR ) n++ ;
+ if ( count['G'] >= MINOCCUR ) n++ ;
+ if ( count['T'] >= MINOCCUR ) n++ ;
+ if ( n > 1 ) {
+ confirmed_snp = true ;
+ found_snps++ ;
+ }
+
+/*
+ if ( common != ' ' && isIUPAC[common] ) {
+ confirmed_snp = true ;
+ found_snps++ ;
+ }
+*/
+
+ if ( common != ' ' && isIUPAC[common] && !isIUPAC[chr->sequence[base]] ) bogus++ ;
+
+ if ( snps_only && common == ' ' ) continue ;
+
+ char mark = confirmed_snp ? '*' : ' ' ;
+
+ string ref ;
+ if ( !chr->original_sequence.empty() ) {
+ ref += chr->original_sequence[base] ;
+ ref += "\t" ;
+ }
+ ref += chr->sequence[base] ;
+
+ fprintf ( pileup , "%s\t%s\t%d\t%c%c\t%d\t" , chr->name.c_str() , ref.c_str() , base+1 , common , mark , cnt ) ;
+ fprintf ( pileup , "%s\t%s\n" , s2 , s3 ) ;
+ }
+}
diff --git a/src/TChromosomalIndices.cpp b/src/TChromosomalIndices.cpp
new file mode 100644
index 0000000..a57e8b2
--- /dev/null
+++ b/src/TChromosomalIndices.cpp
@@ -0,0 +1,480 @@
+#include "snpomatic.h"
+
+/// Constructor.
+TChromosomalIndices::TChromosomalIndices ( int _index_length , int _index_length2 ) {
+ index_length = _index_length ;
+ index_length2 = _index_length2 ;
+ index_length_double = index_length + index_length2 ;
+ uint size = 1 ;
+ for ( int a = 0 ; a < index_length ; a++ ) size *= 4 ;
+ position.assign ( size , pwi_vector() ) ;
+ noshortcuts = 0 ; // Take shortcuts by default
+ max_snps_per_index = 8 ;
+ index_from = 0 ;
+ index_to = 0 ;
+ index_steps = 1 ;
+}
+
+/// Indexes reads for reassembly.
+void TChromosomalIndices::run_reads ( string filename ) {
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ char dummy[READ_CHAR_BUFFER] , *c ;
+ ulong count = 0 , a , from , to , l ;
+ ulong filepos = 0 ;
+
+ for ( a = 0 ; a < position.size() ; a++ ) position[a].reserve ( 95 ) ; // Speedup cheat
+
+ while ( !feof(file) ) {
+ count++ ;
+ filepos = ftell ( file ) ;
+ if ( count % 1000000 == 0 ) cerr << count << endl ;
+
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ if ( !*dummy ) break ;
+ if ( *dummy != '>' ) break ; // Not a FASTA file, apparently
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ if ( !*dummy ) break ;
+
+ l = 0 ;
+ for ( c = dummy ; *c > 13 ; c++ ) {
+ if ( isIUPAC[*c] ) { // No IUPAC reads, please...
+ *dummy = 0 ;
+ break ;
+ }
+ l++ ;
+ }
+ if ( !*dummy ) continue ;
+ *c = 0 ;
+
+ from = 3 ; // 0 = no offset
+ to = l - from - index_length_double ;
+
+ for ( a = from ; a <= to ; a++ ) add_index ( dummy + from , false , filepos , from ) ;
+ REVERSE_COMPLEMENT_CHARP ( dummy ) ;
+ for ( a = from ; a <= to ; a++ ) add_index ( dummy + from , true , filepos , from ) ;
+ }
+
+ // Sort
+ for ( a = 0 ; a < position.size() ; a++ ) sort ( position[a].begin() , position[a].end() ) ;
+
+ // Analyze
+ uint min = 5 ;
+ uint max = 100 ;
+ for ( a = 0 ; a < position.size() ; a++ ) {
+ uint b , last = 0 , cnt = 0 ;
+ for ( b = 0 ; b < position[a].size() ; b++ ) {
+ if ( position[a][b].index2 == position[a][last].index2 ) {
+ cnt++ ;
+ } else {
+ if ( cnt >= min && cnt <= max ) reassemble_reads ( file , a , last , b-1 ) ;
+ last = b ;
+ cnt = 1 ;
+ }
+ }
+ if ( cnt >= min && cnt <= max ) reassemble_reads ( file , a , last , b-1 ) ;
+ }
+}
+
+bool TChromosomalIndices::align_read2contigs ( char *read , vector <string> &vs ) {
+ uint a , vsmax = vs.size() ;
+ int l , b , c ;
+ bool ret = false ;
+ for ( l = 0 ; *(read+l) ; l++ ) ;
+ int from = -10 ;
+ int to = 20 ;
+ for ( a = 0 ; a < vsmax ; a++ ) {
+ if ( vs[a] == read ) return ret ;
+ for ( b = from ; b < to ; b++ ) {
+ bool match = true ;
+ for ( c = 0 ; match && c < l ; c++ ) {
+ if ( b + c < 0 ) continue ;
+ if ( b + c >= vs[a].length() ) continue ;
+ if ( *(read+c) != vs[a][b+c] ) match = false ;
+ }
+ if ( match ) {
+ string m ;
+ string n ;
+ for ( c = b ; c < 0 ; c++ ) n += ' ' ;
+ for ( c = 0 ; c < b ; c++ ) m += ' ' ;
+ n += vs[a] ;
+ m += read ;
+ if ( n.length() == vs[a].length() ) continue ;
+
+ while ( n.length() < m.length() ) n += ' ' ;
+ while ( m.length() < n.length() ) m += ' ' ;
+ for ( c = 0 ; c < n.length() ; c++ ) {
+ if ( n[c] == ' ' ) n[c] = m[c] ;
+ }
+ vs.push_back ( n ) ;
+
+ break ; // One match will do just fine...
+ }
+ }
+ }
+ if ( !ret ) vs.push_back ( read ) ; // New contig since it doesn't match old ones
+ return ret ;
+}
+
+void TChromosomalIndices::reassemble_reads ( FILE *file , uint i1 , index2type i2a , index2type i2b ) {
+ char dummy[READ_CHAR_BUFFER] , *c ;
+ vector <string> contigs ;
+ uint maxlen = 0 ;
+ for ( index2type i2 = i2a ; i2 <= i2b ; i2++ ) {
+ fseek ( file , position[i1][i2].position , SEEK_SET ) ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ uint l = 0 ;
+ for ( c = dummy ; *c > 13 ; l++ , c++ ) ; // All possible hits are already IUPAC-filtered, no need to do it again
+ *c = 0 ;
+ if ( l > maxlen ) maxlen = l ;
+ if ( position[i1][i2].reverse_complement ) REVERSE_COMPLEMENT_CHARP ( dummy ) ;
+
+ align_read2contigs ( dummy , contigs ) ;
+ }
+ for ( uint a = 0 ; a < contigs.size() ; a++ ) {
+ if ( contigs[a].length() <= maxlen ) continue ;
+ cout << contigs[a] << endl ;
+ }
+}
+
+void TChromosomalIndices::add_index ( char *start , bool rc , uint pos , uchar chromosome ) {
+ uint key = calculate_index ( start ) ;
+ TPositionWithIndex pwi ;
+ pwi.position = pos ;
+ pwi.index2 = calculate_index2 ( start + index_length ) ;
+ pwi.reverse_complement = rc ;
+ pwi.chromosome = chromosome ;
+ position[key].push_back ( pwi ) ;
+}
+
+
+/// Creates index for all regions, mark regions N
+void TChromosomalIndices::run_regions ( chrvec *_chrs , string filename ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Indexing regions ... " ) ; fflush(stdout); }
+ chrs = _chrs ;
+ int a , chr ;
+
+// for ( a = 0 ; a < position.size() ; a++ ) position[a].reserve ( 100 ) ; // Pre-reserve memory, rule-of-thumb
+
+// cout << endl ;
+ // Go through region file
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ char dummy[READ_CHAR_BUFFER] ;
+ while ( !feof(file) ) {
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ if ( !*dummy ) break ;
+ vector <string> parts ;
+ Tokenize ( dummy , parts , "\t" ) ;
+ if ( parts.size() < 3 ) continue ;
+
+ for ( chr = 0 ; chr < chrs->size() && (*chrs)[chr].name != parts[0] ; a++ ) ;
+ if ( chr == chrs->size() ) {
+ cout << "Unknown chromosome " << parts[0] << " in region list!\n" ;
+ continue ;
+ }
+
+// cout << dummy ;
+ uint from = atoi ( parts[1].c_str() ) - 1 ;
+ uint to = atoi ( parts[2].c_str() ) - 1 ;
+
+ // Mark region as "N"
+ for ( a = from ; a <= to ; a++ ) (*chrs)[chr].sequence[a] = 'N' ;
+
+ uint offset = 20 ;
+ string s ;
+ uint pl = (*chrs)[chr].sequence.length() - index_length_double ;
+ string *p = & ( (*chrs)[chr].sequence ) ;
+
+ for ( a = from - index_length_double - offset ; a <= to ; a++ ) {
+ if ( a < 0 ) continue ;
+ if ( a >= pl ) continue ;
+ s = p->substr ( a , index_length_double ) ;
+
+ char *sc = (char*) s.c_str() ;
+ add_sequence ( sc , sc , a , chr , false ) ;
+ add_sequence ( sc , sc , a+index_length_double , chr , true ) ;
+ }
+
+ }
+
+ // Sort all sub-lists
+ for ( a = 0 ; a < position.size() ; a++ ) {
+ sort ( position[a].begin() , position[a].end() ) ;
+ }
+
+ if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); }
+}
+
+/// Create index for beginning and end of all contigs.
+void TChromosomalIndices::run_ends ( chrvec *_chrs , int min_length ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Indexing contigs ... " ) ; fflush(stdout); }
+ chrs = _chrs ;
+ uint a , chr ;
+
+// for ( a = 0 ; a < position.size() ; a++ ) position[a].reserve ( 100 ) ; // Pre-reserve memory, rule-of-thumb
+
+ for ( chr = 0 ; chr < chrs->size() ; chr++ ) {
+ if ( (*chrs)[chr].sequence.length() < min_length ) continue ;
+ string s , *p ;
+ uint pl = (*chrs)[chr].sequence.length() - index_length_double ;
+ char *sc = (char*) (*chrs)[chr].sequence.c_str() ;
+
+ for ( uint a = 0 ; a < pl ; a++ ) {
+ add_sequence ( sc , sc , a , chr , false ) ;
+// add_sequence ( sc , sc , a+index_length_double , chr , true ) ;
+ }
+ }
+
+/*
+ if ( DEBUGGING ) { fprintf ( stdout , "sorting ... " ) ; fflush(stdout); }
+ // Sort all sub-lists
+ for ( a = 0 ; a < position.size() ; a++ ) {
+ sort ( position[a].begin() , position[a].end() ) ;
+ }
+*/
+ if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); }
+}
+
+string TChromosomalIndices::decompile ( uint index , uint bases ) {
+ string ret ;
+ while ( bases > 0 ) {
+ ret += num2char[(index&3)+1] ;
+ index >>= 2 ;
+ bases-- ;
+ }
+ reverse ( ret.begin() , ret.end() ) ;
+ return ret ;
+}
+
+
+/// Create index for all chromosomes.
+void TChromosomalIndices::run ( chrvec *_chrs ) {
+ chrs = _chrs ;
+ if ( !index_file.empty() && fopen ( index_file.c_str() , "r" ) ) {
+ load_index ( index_file ) ;
+ return ;
+ }
+
+ if ( DEBUGGING ) {
+ if ( index_from == 0 && index_to == 0 ) fprintf ( stdout , "Indexing %d chromosomes ... " , chrs->size() ) ;
+ else fprintf ( stdout , "Indexing %d chromosomes (%d-%d) ... " , chrs->size() , index_from , index_to ) ;
+ fflush(stdout);
+ }
+
+ uint a , chr ;
+
+ for ( chr = 0 ; chr < chrs->size() ; chr++ ) {
+ string s , *p ;
+ char *sc = (char*) (*chrs)[chr].sequence.c_str() ;
+ uint pl = (*chrs)[chr].sequence.length() ;
+ if ( index_to > 0 && index_to < pl ) pl = index_to ;
+ sc += index_from ;
+ for ( a = index_from ; a < pl ; a += index_steps ) {
+ int r = 0 ;
+ r += add_sequence ( sc , sc , a , chr , false ) ;
+ r += add_sequence ( sc , sc , a+index_length_double , chr , true ) ;
+ sc += index_steps ;
+ }
+ }
+
+ if ( DEBUGGING ) { fprintf ( stdout , "sorting ... " ) ; fflush(stdout); }
+
+ // Nuke huge indices; speedup: 50%
+ if ( !noshortcuts ) {
+ pwi_vector largest ;
+ for ( a = 0 ; a < position.size() ; a++ ) {
+ if ( largest.size() < 20 ) {
+ TPositionWithIndex tmp ;
+ tmp.index2 = position[a].size() ;
+ tmp.position = a ;
+ largest.push_back ( tmp ) ;
+ sort ( largest.begin() , largest.end() ) ;
+ continue ;
+ }
+ if ( largest[0].index2 >= position[a].size() ) continue ;
+
+ largest[0].index2 = position[a].size() ;
+ largest[0].position = a ;
+ sort ( largest.begin() , largest.end() ) ;
+ }
+
+ for ( a = 0 ; a < largest.size() ; a++ ) {
+ if ( largest[a].index2 < largest.back().index2 / 4 ) continue ;
+ position[largest[a].position].clear() ;
+ }
+ }
+
+ // Sort all sub-lists
+ for ( a = 0 ; a < position.size() ; a++ ) {
+ sort ( position[a].begin() , position[a].end() ) ;
+ }
+
+ if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); }
+
+ if ( !index_file.empty() ) {
+ save_index ( index_file ) ;
+ return ;
+ }
+}
+
+
+/// Recurses/iterates through sequence with possible IUPAC codes, creates all possible combinations, and adds them to the index list.
+int TChromosomalIndices::add_sequence ( char *begin , char *cur , int from , int chr , bool rc , int nos ) {
+ if ( nos > max_snps_per_index ) return 0 ;
+ int ret = 0 ;
+ while ( index_length_double > cur - begin ) {
+ if ( isIUPAC[*cur] ) {
+ if ( *cur == 'X' ) return ret ; // Contains X, aborting
+ uchar c = char2IUPAC[*cur] ;
+ uchar oc = *cur ;
+ if ( c & IUPAC_A ) { *cur = 'A' ; ret += add_sequence ( begin , cur + 1 , from , chr , rc , nos+1 ) ; }
+ if ( c & IUPAC_C ) { *cur = 'C' ; ret += add_sequence ( begin , cur + 1 , from , chr , rc , nos+1 ) ; }
+ if ( c & IUPAC_G ) { *cur = 'G' ; ret += add_sequence ( begin , cur + 1 , from , chr , rc , nos+1 ) ; }
+ if ( c & IUPAC_T ) { *cur = 'T' ; ret += add_sequence ( begin , cur + 1 , from , chr , rc , nos+1 ) ; }
+ *cur = oc ;
+ return ret ;
+ }
+ cur++ ;
+ }
+
+ char *sc = begin ;
+ if ( rc ) {
+ sc = tmp ;
+ memcpy ( tmp , begin , index_length_double ) ;
+ tmp[index_length_double] = 0 ;
+ REVERSE_COMPLEMENT_CHARP ( sc ) ;
+ }
+
+ uint key = calculate_index ( sc ) ;
+ TPositionWithIndex pwi ;
+ pwi.position = from ;
+ pwi.index2 = calculate_index2 ( sc + index_length ) ;
+ pwi.reverse_complement = rc ;
+ pwi.chromosome = chr ;
+ position[key].push_back ( pwi ) ;
+ return ret + 1 ;
+}
+
+/// sequence_kmer MUST only contain As, Cs, Gs, or Ts
+uint TChromosomalIndices::calculate_index ( const char *sequence_kmer ) {
+ uint ret = 0 ;
+ for ( int a = 0 ; a < index_length ; a++ ) {
+ ret = ( ret << 2 ) | ( ((uint)char2num[(uint)sequence_kmer[a]])-1 ) ;
+ }
+ return ret ;
+}
+
+/// sequence_kmer MUST only contain As, Cs, Gs, or Ts
+index2type TChromosomalIndices::calculate_index2 ( const char *sequence_kmer ) {
+ uint ret = 0 ;
+ for ( int a = 0 ; a < index_length2 ; a++ ) {
+ ret = ( ret << 2 ) | ( ((uint)char2num[(uint)sequence_kmer[a]])-1 ) ;
+ }
+ return ret ;
+}
+
+void TChromosomalIndices::save_index ( string filename ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Saving index file ... " ) ; fflush(stdout); }
+ FILE *file = fopen ( string ( filename.c_str() ).c_str() , "w" ) ;
+ uint a , b ;
+
+ uint temp = 1 ; // Version
+ fwrite ( &temp , sizeof ( temp ) , 1 , file ) ;
+ temp = position.size() ;
+// cout << endl << position.size() << endl << temp << endl ;
+ fwrite ( &temp , sizeof ( temp ) , 1 , file ) ;
+ for ( a = 0 ; a < position.size() ; a++ ) {
+ temp = position[a].size() ;
+ fwrite ( &temp , sizeof ( temp ) , 1 , file ) ;
+ for ( b = 0 ; b < position[a].size() ; b++ ) {
+ fwrite ( &position[a][b].index2 , sizeof ( position[a][b].index2 ) , 1 , file ) ;
+ fwrite ( &position[a][b].position , sizeof ( position[a][b].position ) , 1 , file ) ;
+ fwrite ( &position[a][b].chromosome , sizeof ( position[a][b].chromosome ) , 1 , file ) ;
+ fwrite ( &position[a][b].reverse_complement , sizeof ( position[a][b].reverse_complement ) , 1 , file ) ;
+ }
+ }
+
+ fclose ( file ) ;
+ if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); }
+}
+
+void TChromosomalIndices::load_index ( string filename ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Loading index file ... " ) ; fflush(stdout); }
+ uint temp ;
+ FILE *file = fopen ( string ( filename.c_str() ).c_str() , "r" ) ;
+ if ( !file ) exit ( 1 ) ;
+ fread ( &temp , sizeof ( temp ) , 1 , file ) ; // Version
+ fread ( &temp , sizeof ( temp ) , 1 , file ) ; // position size
+ if ( position.size() != temp ) exit ( 1 ) ;
+
+ uint a , b ;
+ for ( a = 0 ; a < position.size() ; a++ ) {
+ fread ( &temp , sizeof ( temp ) , 1 , file ) ; // Sub-index size
+ position[a].reserve ( temp ) ;
+ for ( b = 0 ; b < temp ; b++ ) {
+ TPositionWithIndex i ;
+ fread ( &i.index2 , sizeof ( i.index2 ) , 1 , file ) ;
+ fread ( &i.position , sizeof ( i.position ) , 1 , file ) ;
+ fread ( &i.chromosome , sizeof ( i.chromosome ) , 1 , file ) ;
+ fread ( &i.reverse_complement , sizeof ( i.reverse_complement ) , 1 , file ) ;
+ position[a].push_back ( i ) ;
+// if ( i.chromosome > 100 ) cerr << "!" << i.chromosome << endl ;
+ }
+ }
+
+ fclose ( file ) ;
+ if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); }
+}
+
+void TChromosomalIndices::uniqueness ( string filename ) {
+ uint a , b , c ;
+
+ for ( a = 0 ; a < chrs->size() ; a++ ) {
+ (*chrs)[a].uniqueness.resize ( (*chrs)[a].sequence.length() , 0 ) ;
+ }
+
+ for ( a = 0 ; a < position.size() ; a++ ) {
+ for ( b = 0 ; b < position[a].size() ; b++ ) {
+ for ( c = b+1 ; c < position[a].size() && position[a][b].index2 == position[a][c].index2 ; c++ ) ;
+ if ( c == b+1 ) continue ;
+ uint count = c - b ;
+ cout << count << endl ;
+ for ( c = b ; c < position[a].size() && position[a][b].index2 == position[a][c].index2 ; c++ ) {
+ uint p = position[a][c].position ;
+ uint chr = position[a][c].chromosome ;
+ if ( chr > chrs->size() ) {
+ cerr << position[a][c].chromosome << "/" << c << " : " << chr << endl ;
+ exit ( 1 ) ;
+ }
+ uint si = (*chrs)[chr].uniqueness.size() ;
+ if ( si == 0 ) {
+ (*chrs)[chr].uniqueness.resize ( (*chrs)[chr].sequence.length() , 0 ) ;
+ si = (*chrs)[chr].uniqueness.size() ;
+ }
+ for ( uint u = 0 ; u < index_length_double ; u++ ) {
+ if ( u + p >= si ) break ;
+ (*chrs)[chr].uniqueness[u+p] += count ;
+ }
+ }
+ b = c - 1 ;
+ }
+ }
+
+ for ( a = 0 ; a < chrs->size() ; a++ ) {
+ for ( b = 0 ; b < (*chrs)[a].uniqueness.size() ; b++ ) {
+ if ( (*chrs)[a].uniqueness[b] == 0 )
+ (*chrs)[a].uniqueness[b] = 1 ;
+ }
+ }
+
+ FILE *u = fopen ( filename.c_str() , "w" ) ;
+ for ( a = 0 ; a < chrs->size() ; a++ ) {
+ for ( b = 0 ; b < (*chrs)[a].uniqueness.size() ; b++ ) {
+ fprintf ( u , "%s\t%d\t%d\n" , (*chrs)[a].name.c_str() , b + 1 , (*chrs)[a].uniqueness[b] ) ;
+ }
+ }
+ fclose ( u ) ;
+}
diff --git a/src/TChromosome.cpp b/src/TChromosome.cpp
new file mode 100644
index 0000000..e6ba5a8
--- /dev/null
+++ b/src/TChromosome.cpp
@@ -0,0 +1,129 @@
+#include "snpomatic.h"
+
+/// Reads a single chromosome from a FASTA file; superceded by load_all_chromosomes().
+void TChromosome::read_chromosome ( string filename , string chromosome_name ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Reading %s from %s ... " , chromosome_name.c_str() , filename.c_str() ) ; fflush(stdout); }
+
+ name = chromosome_name ;
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ;
+ uchar store = 0 ;
+ while ( !feof(file) ) {
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ if ( *dummy == '>' ) {
+ if ( store ) break ; // Read the chromosome
+ for ( c1 = dummy+1 ; *c1 == ' ' ; c1++ ) ;
+ for ( c2 = c1 ; *c2 && *c2 != 10 && *c2 != 13 && *c2 != ' ' && *c2 != '|' ; c2++ ) ;
+ *c2 = 0 ;
+ if ( c1 == chromosome_name ) store = 1 ;
+ } else if ( store ) {
+ for ( c1 = dummy ; *c1 ; c1++ ) {
+ switch ( *c1 ) {
+ case 'a' : *c1 = 'A' ; break ;
+ case 'c' : *c1 = 'C' ; break ;
+ case 'g' : *c1 = 'G' ; break ;
+ case 't' : *c1 = 'T' ; break ;
+ }
+
+// if ( *c1 == 'A' || *c1 == 'C' || *c1 == 'G' || *c1 == 'T' ) {
+ sequence += *c1 ;
+// } else if ( *c1 != 13 && *c1 != 10 ) {
+// fprintf ( stdout , "Genome data contains incompatible character '%c' - aborting.\n" , *c1 ) ;
+// exit ( 1 ) ;
+// }
+
+ }
+ }
+ }
+ fclose ( file ) ;
+
+ if ( DEBUGGING ) { fprintf ( stdout , "read %d nucleotides.\n" , sequence.length() ) ; fflush(stdout); }
+}
+
+/// Reads a SNP list in .gff and puts the SNPs as IUPAC bases into the chromosome; superceded by incorporate_all_gff().
+void TChromosome::incorporate_known_snps_gff ( string filename ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Reading SNPS into %s from %s ... " , name.c_str() , filename.c_str() ) ; fflush(stdout); }
+
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ int snpcnt = 0 ;
+ char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ;
+ while ( !feof(file) ) {
+ vector <string> parts , parts2 ;
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ Tokenize ( dummy , parts , "\t" ) ;
+ if ( parts.size() < 7 ) continue ; // These is not the data you are looking for
+ if ( parts[0] != name ) continue ; // Wrong chromosome
+ if ( parts[2] != "SNP" ) continue ; // Not a SNP
+ if ( parts[3] != parts[4] ) continue ; // Not a single point
+ if ( parts[5] != "+" ) continue ; // Wrong strand (?)
+ int pos = atoi ( parts[3].c_str() ) ;
+
+ char merge = sequence[pos-1] ;
+ Tokenize ( parts[6].c_str() , parts2 , "\"\" \"\"" ) ;
+ bool found_reference = false ;
+ for ( int b = 0 ; b < parts2.size() ; b++ ) {
+ int l = parts2[b].length() ;
+ if ( l < 2 ) continue ;
+ if ( parts2[b][l-2] != ':' ) continue ;
+ char c = parts2[b][l-1] ;
+ if ( c == sequence[pos-1] ) found_reference = true ;
+ merge = MERGE_IUPAC(merge,c) ;
+ }
+ if ( !found_reference ) { // Strange; maybe original sequence contains "N"
+ // We should count this!
+ continue ;
+ }
+ sequence[pos-1] = merge ;
+ snpcnt++ ;
+ }
+ fclose ( file ) ;
+ if ( DEBUGGING ) { fprintf ( stdout , "incorporated %d SNPs.\n" , snpcnt ) ; fflush(stdout); }
+}
+
+/// Reads a SNP list in simple format and puts the SNPs as IUPAC bases into the chromosome.
+void TChromosome::incorporate_known_snps ( string filename ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Reading from %s ... " , filename.c_str() ) ; fflush(stdout); }
+
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ;
+ int snpcnt = 0 ;
+ while ( !feof(file) ) {
+ vector <string> parts ;
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ Tokenize ( dummy , parts , "\t" ) ;
+ if ( parts.size() != 4 ) continue ;
+ if ( name != parts[0] ) continue ;
+ int pos = atoi ( parts[1].c_str() ) ;
+ uchar orig = parts[2][0] ;
+ uchar snp = parts[3][0] ;
+ sequence[pos-1] = MERGE_IUPAC(orig,snp) ;
+ snpcnt++ ;
+ }
+ fclose ( file ) ;
+
+ if ( DEBUGGING ) { fprintf ( stdout , "incorporated %d SNPs.\n" , snpcnt ) ; fflush(stdout); }
+}
+
+/// Adds a (single) read to the coverage statistics
+void TChromosome::add_coverage ( const string &seq , uint pos , bool rc ) {
+ if ( coverage.size() == 0 ) { // Set up vector
+ TCoverage dc ;
+ dc.A = dc.C = dc.T = dc.G = dc.rc = 0 ; // Paranoia
+ coverage.resize ( sequence.length() , dc ) ;
+ }
+
+ for ( uint p = 0 ; p < seq.length() ; p++ ) {
+ if ( p + pos >= sequence.length() ) break ; // End of the line, my friend...
+ if ( rc ) coverage[p+pos].rc++ ;
+ switch ( seq[p] ) {
+ case 'A' : coverage[p+pos].A++ ; break ;
+ case 'C' : coverage[p+pos].C++ ; break ;
+ case 'G' : coverage[p+pos].G++ ; break ;
+ case 'T' : coverage[p+pos].T++ ; break ;
+ default : cerr << seq[p] << endl ;
+ } ;
+ }
+}
diff --git a/src/TChromosomeAlign.cpp b/src/TChromosomeAlign.cpp
new file mode 100644
index 0000000..e3258a9
--- /dev/null
+++ b/src/TChromosomeAlign.cpp
@@ -0,0 +1,1750 @@
+#include "snpomatic.h"
+
+string dor ( string s ) { // Do Reverse
+ reverse ( s.begin() , s.end() ) ;
+ return s ;
+}
+
+string doc ( string s ) { // Do Complementary
+ for ( int a = 0 ; a < s.length() ; a++ ) s[a] = base_rc[s[a]] ;
+ return s ;
+}
+
+string dorc ( string s ) { // Do Reverse/Complementary
+ for ( int a = 0 ; a < s.length() ; a++ ) s[a] = base_rc[s[a]] ;
+ reverse ( s.begin() , s.end() ) ;
+ return s ;
+}
+
+bool tpi_full_compare ( const TPositionWithIndex a , const TPositionWithIndex b ) {
+ return ( a.chromosome != b.chromosome ) ? a.chromosome < b.chromosome : a.position < b.position ;
+}
+
+
+
+
+/// Constructor
+TChromosomeAlign::TChromosomeAlign ( chrvec &_chrs , TChromosomalIndices &_index ) {
+ pileup = binfile_no_match = binfile_single_match = binfile_multi_match = binfile_iupac = cigar = NULL ;
+ wobble = fragmentplot = featuretable = gffout = coverage = snpsinreads = indelplot = inversions = NULL ;
+ sqlite = sam = spancontigs = faceaway = rpa = NULL ;
+ chrs = &_chrs ;
+ index = &_index ;
+ chop = 0 ;
+ wobblemax = 2 ;
+ max_align = (uint) 2 << 30 ; // Just a very large number
+ fasta = false ;
+ use_nono = false ;
+ using_bins = false ;
+ snpsinreads_first = true ;
+ multimatch = false ;
+ singlematch = false ;
+ force_one_unique_match = false ;
+ sqlite_prefix_first = true ;
+ rpa_header_written = false ;
+ single_read_length = 0 ;
+}
+
+void TChromosomeAlign::dump_coverage () {
+ if ( !coverage ) return ;
+ fprintf ( coverage , "Chromosome\tPosition\tReference\tA\tC\tG\tT\tReverse-complement\n" ) ;
+ for ( uint chr = 0 ; chr < chrs->size() ; chr++ ) {
+ if ( (*chrs)[chr].coverage.size() == 0 ) continue ; // Paranoia
+ for ( uint pos = 0 ; pos < (*chrs)[chr].sequence.length() ; pos++ ) {
+ if ( pos >= (*chrs)[chr].coverage.size() ) {
+ fprintf ( coverage , "%s\t%d\t%c\t%d\t%d\t%d\t%d\t%d\n" ,
+ (*chrs)[chr].name.c_str() ,
+ pos + 1 ,
+ (*chrs)[chr].original_sequence.length() > 0 ? (*chrs)[chr].original_sequence[pos] : (*chrs)[chr].sequence[pos] ,
+ 0 , 0 , 0 , 0 , 0 ) ;
+ }
+ fprintf ( coverage , "%s\t%d\t%c\t%d\t%d\t%d\t%d\t%d\n" ,
+ (*chrs)[chr].name.c_str() ,
+ pos + 1 ,
+ (*chrs)[chr].original_sequence.length() > 0 ? (*chrs)[chr].original_sequence[pos] : (*chrs)[chr].sequence[pos] ,
+ (*chrs)[chr].coverage[pos].A ,
+ (*chrs)[chr].coverage[pos].C ,
+ (*chrs)[chr].coverage[pos].G ,
+ (*chrs)[chr].coverage[pos].T ,
+ (*chrs)[chr].coverage[pos].rc ) ;
+ }
+ }
+ fclose ( coverage ) ;
+ coverage = NULL ; // Paranoia
+}
+
+/// Initialization (after constructor)
+void TChromosomeAlign::init () {
+ int i ;
+ if ( pileup ) for ( i = 0 ; i < chrs->size() ; i++ ) (*chrs)[i].ao.init(&((*chrs)[i])) ;
+ if ( sam ) {
+ fprintf ( sam , "@HD\tVN:1.0\n" ) ;
+ for ( i = 0 ; i < chrs->size() ; i++ ) {
+ fprintf ( sam , "@SQ\tSN:%s\tLN:%d\n" , (*chrs)[i].name.c_str() , (*chrs)[i].sequence.length() ) ;
+ }
+ }
+}
+
+void TChromosomeAlign::add_nono_list ( string filename ) {
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ;
+ while ( !feof(file) ) {
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ if ( !*dummy ) break ;
+
+ c1 = dummy ;
+ if ( *c1 == '>' || *c1 == '@' ) c1++ ;
+ while ( *c1 == ' ' ) c1++ ;
+ for ( c2 = c1 ; *c2 > 13 ; c2++ ) ;
+ *c2 = 0 ;
+ nono.push_back ( c1 ) ;
+ }
+ sort ( nono.begin() , nono.end() ) ;
+ use_nono = true ;
+}
+
+bool TChromosomeAlign::is_nono ( const string &s ) {
+ pair <TSVI , TSVI> ss = equal_range ( nono.begin() , nono.end() , s ) ;
+ return ss.first != ss.second ;
+}
+
+void TChromosomeAlign::write2bin ( const string &name , const string &sequence , const string &quality , int matches ) {
+ string out ;
+ if ( fasta ) {
+ out = ">" + name + "\n" + sequence + "\n" ;
+ } else {
+ out = "@" + name + "\n" + sequence + "\n+\n" + quality + "\n" ;
+ }
+
+ if ( matches < 0 ) { if ( binfile_iupac ) fprintf ( binfile_iupac , "%s" , out.c_str() ) ; }
+ else if ( matches == 0 ) { if ( binfile_no_match ) fprintf ( binfile_no_match , "%s" , out.c_str() ) ; }
+ else if ( matches == 1 ) { if ( binfile_single_match ) fprintf ( binfile_single_match , "%s" , out.c_str() ) ; }
+ else { if ( binfile_multi_match ) fprintf ( binfile_multi_match , "%s" , out.c_str() ) ; }
+}
+
+uint TChromosomeAlign::align_solexa_paired_read ( const string &name , const string &sequence , string quality , uint read_length_1 , uint fragment_length , uint range ) {
+ string seq1 = sequence.substr ( 0 , read_length_1 ) ;
+ string seq2 = sequence.substr ( read_length_1 , sequence.length() - read_length_1 ) ;
+
+ while ( quality.length() < sequence.length() ) quality += (char) 33+30 ;
+ string qual1 = quality.substr ( 0 , read_length_1 ) ;
+ string qual2 = quality.substr ( read_length_1 , sequence.length() - read_length_1 ) ;
+
+ pwi_vector res1 , res2 ;
+ get_align_solexa_read ( seq1.c_str() , res1 ) ;
+ if ( !sam && !wobble && !chop && !sqlite && res1.size() == 0 ) return 0 ;
+ get_align_solexa_read ( seq2.c_str() , res2 ) ;
+ if ( !sam && !wobble && !chop && !sqlite && res2.size() == 0 ) return 0 ;
+
+ sort ( res1.begin() , res1.end() , tpi_full_compare ) ;
+ sort ( res2.begin() , res2.end() , tpi_full_compare ) ;
+
+ int ret = 0 , old_fragment , old_variance ;
+
+ // Spanning contigs
+ if ( ( rpa || spancontigs ) && res1.size() == 1 && res2.size() == 1 && res1[0].chromosome != res2[0].chromosome ) {
+ if ( spancontigs ) {
+ fprintf ( spancontigs , "%s\t%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d\t%s\t%s\n" ,
+ last_solexa_name.c_str() ,
+ (*chrs)[res1[0].chromosome].name.c_str() , res1[0].position , res1[0].reverse_complement , seq1.c_str() , qual1.c_str() ,
+ (*chrs)[res2[0].chromosome].name.c_str() , res2[0].position , res2[0].reverse_complement , seq2.c_str() , qual2.c_str()
+ ) ;
+ }
+ if ( rpa ) rpa_out ( res1[0] , res2[0] , read_length_1 , seq1 , qual1 , seq2 , qual2 ) ;
+ return 0 ;
+ }
+
+ if ( ( sqlite || sam ) && res1.size() + res2.size() == 1 ) { // Unique single read
+ TPositionWithIndex r = res1.size() ? res1[0] : res2[0] ;
+ string seq = res1.size() ? seq1 : seq2 , s ;
+ if ( sqlite ) {
+ if ( r.reverse_complement ) s = generate_sqlite_paired_command ( r.chromosome , dorc ( seq ) , r.position - seq.length() , "" , 0 , '1' ) ;
+ else s = generate_sqlite_paired_command ( r.chromosome , seq , r.position , "" , 0 , '1' ) ;
+ add2sqlite_cache ( s ) ;
+ }
+ if ( sam ) {
+ string qual = res1.size() ? qual1 : qual2 , s ;
+ int readpart = res1.size() ? 1 : 2 ;
+ if ( r.reverse_complement ) add_sam ( dorc ( seq ) , dor ( qual ) , r.chromosome , r.position - seq.length() , readpart , 0 , 0 , r.reverse_complement , false , false , true ) ;
+ else add_sam ( seq , qual , r.chromosome , r.position , readpart , 0 , 0 , r.reverse_complement , false , false , true ) ;
+ }
+ return 0 ;
+ }
+
+
+ if ( sqlite ) { // Clear temporary cache, increase fragment/variance temporarily
+ sqlite_out.clear () ;
+ }
+ sam_wrote = false ;
+ ret += paired_read_combine ( res1 , res2 , fragment_length , range , read_length_1 , seq1 , qual1 , seq2 , qual2 ) ;
+ ret += paired_read_combine ( res2 , res1 , fragment_length , range , read_length_1 , seq2 , qual2 , seq1 , qual1 ) ;
+
+ if ( sqlite && sqlite_out.size() == 1 ) {
+ add2sqlite_cache ( sqlite_out[0] ) ;
+ return ret ;
+ }
+
+ // Try all fragment sizes if sqlite and no in-range find
+ if ( ( sam && !sam_wrote ) || ( sqlite && sqlite_out.size() == 0 ) ) {
+ old_fragment = fragment_length ;
+ old_variance = range ;
+ fragment_length = 90000000 ; // Long...
+ range = 90000000 ; // Looooooong...
+ bool done = false ;
+
+ paired_read_combine ( res1 , res2 , fragment_length , range , read_length_1 , seq1 , qual1 , seq2 , qual2 ) ;
+ paired_read_combine ( res2 , res1 , fragment_length , range , read_length_1 , seq2 , qual2 , seq1 , qual1 ) ;
+ if ( sqlite_out.size() == 1 ) {
+ add2sqlite_cache ( sqlite_out[0] ) ;
+ done = true ;
+ }
+
+ fragment_length = old_fragment ; // Ah. Just kidding.
+ range = old_variance ;
+ if ( done ) return 0 ;
+ }
+
+ // Try inversions
+ if ( ret == 0 && ( rpa || inversions || sqlite || ( sam && !sam_wrote ) ) ) {
+ bool fake_long_region = true ; // sqlite ;
+
+ if ( fake_long_region ) {
+ old_fragment = fragment_length ;
+ old_variance = range ;
+ fragment_length = 90000000 ; // Long...
+ range = 90000000 ; // Looooooong...
+ }
+
+ paired_read_combine ( res1 , res2 , fragment_length , range , read_length_1 , seq1 , qual1 , seq2 , qual2 , false , true , 1 ) ;
+ paired_read_combine ( res2 , res1 , fragment_length , range , read_length_1 , seq2 , qual2 , seq1 , qual1 , false , true , 2 ) ;
+
+ if ( fake_long_region ) {
+ fragment_length = old_fragment ; // Ah. Just kidding.
+ range = old_variance ;
+ }
+
+ if ( sqlite_out.size() == 1 ) {
+ add2sqlite_cache ( sqlite_out[0] ) ;
+ return 0 ;
+ }
+ }
+
+ if ( ret > 0 ) return ret ;
+
+ // Chop off read ends and try again (optional!)
+ if ( chop > 0 ) {
+ uint temp_chop = chop ;
+
+ if ( 1 ) { // Chop off read end
+ string nseq = seq1.substr ( 0 , seq1.length() - chop ) + seq2.substr ( chop , seq2.length() - chop ) ;
+ string nqual = qual1.substr ( 0 , qual1.length() - chop ) + qual2.substr ( chop , qual2.length() - chop ) ;
+ chop = 0 ;
+ ret = align_solexa_paired_read ( name , nseq , nqual , read_length_1 - temp_chop , fragment_length , range ) ;
+ chop = temp_chop ;
+ }
+
+ if ( ret == 0 ) { // Still no luck, try to chop off the beginning
+ string nseq = seq1.substr ( chop , seq1.length() - chop ) + seq2.substr ( 0 , seq2.length() - chop ) ;
+ string nqual = qual1.substr ( chop , qual1.length() - chop ) + qual2.substr ( 0 , qual2.length() - chop ) ;
+ chop = 0 ;
+ ret = align_solexa_paired_read ( name , nseq , nqual , read_length_1 - temp_chop , fragment_length , range ) ;
+ chop = temp_chop ;
+ }
+
+ if ( ret == 0 ) { // Still no luck, try to chop off some of both beginning and end
+ uint chop1 = chop / 2 ;
+ uint chop2 = chop - chop1 ;
+ string nseq = seq1.substr ( chop1 , seq1.length() - chop ) + seq2.substr ( chop2 , seq2.length() - chop ) ;
+ string nqual = qual1.substr ( chop1 , qual1.length() - chop ) + qual2.substr ( chop2 , qual2.length() - chop ) ;
+ chop = 0 ;
+ ret = align_solexa_paired_read ( name , nseq , nqual , read_length_1 - temp_chop , fragment_length , range ) ;
+ chop = temp_chop ;
+ }
+
+ }
+ else if ( wobble ) run_wobble ( seq1 , seq2 , qual1 , qual2 , res1 , res2 , fragment_length ) ; // Don't do in chopped and unchopped state
+
+ return ret ;
+}
+
+void TChromosomeAlign::get_align_solexa_read ( const char *_seq , pwi_vector &results ) {
+ string rc , rcq ;
+ uint l = strlen ( _seq ) ;
+// results.reserve ( 50 ) ;
+
+ for ( uint offset = 0 ; offset < index->index_steps ; offset++ ) {
+ uint i1 = index->calculate_index ( _seq + offset ) ;
+ if ( !index->position[i1].size() ) continue ;
+
+ // Binary search
+ TPositionWithIndex findme ;
+ findme.index2 = index->calculate_index2 ( _seq + offset + index->index_length ) ;
+ pair <TPWIi , TPWIi> pp = equal_range ( index->position[i1].begin() , index->position[i1].end() , findme ) ;
+
+ for ( TPWIi it = pp.first ; it < pp.second ; it++ ) {
+ int p = it->position ;
+ if ( it->reverse_complement ) {
+ if ( p+offset < l ) continue ;
+ if ( rc.empty() ) { // Creating the reverse-complement
+ rc = string ( _seq ) ;
+ REVERSE_COMPLEMENT_CHARP ( (char*)rc.c_str() ) ;
+ }
+ char *chrseq = (char*) (*chrs)[it->chromosome].sequence.c_str() ;
+ if ( !matchIUPAC ( rc.c_str() , chrseq+p-l+offset ) ) continue ;
+ p += offset ;
+ } else {
+ if ( p + l > (*chrs)[it->chromosome].sequence.length() ) continue ;
+ if ( p < offset ) continue ;
+ char *chrseq = (char*) (*chrs)[it->chromosome].sequence.c_str() ;
+ if ( !matchIUPAC ( _seq , chrseq+p-offset ) ) continue ;
+ p -= offset ;
+ }
+ results.push_back ( *it ) ;
+ if ( offset ) results.back().position = p ;
+ }
+ }
+}
+
+void TChromosomeAlign::run_wobble ( const string &seq1 , const string &seq2 , const string &qual1 , const string &qual2 , const pwi_vector &res1 , const pwi_vector &res2 , uint fragment_length ) {
+ uint cnt , frag , range = fragment_range ;
+ uint read_length_1 = seq1.length() ;
+ uint insertion_length = 10000 ;
+
+ if ( res1.size() + res2.size() > 10 ) return ; // Too many (number's just a whim)
+
+ if ( res1.size() * res2.size() == 0 ) { // One read matches perfectly but the other does not
+
+ if ( res1.size() + res2.size() == 1 ) { // Single match for one side, none for the other
+ int the_pos , the_chr , match_pos ;
+ string out ;
+ bool rc ;
+ if ( res1.size() == 1 ) {
+ rc = res1[0].reverse_complement ;
+ match_pos = res1[0].position ;
+ the_pos = res1[0].position + ( fragment_length - seq1.length() * 2 ) * ( rc ? -1 : 1 ) ;
+ the_chr = res1[0].chromosome ;
+ out = !rc ? dorc ( seq2 ) : seq2 ;
+ } else {
+ rc = res2[0].reverse_complement ;
+ match_pos = res2[0].position ;
+ the_pos = res2[0].position - ( fragment_length - seq1.length() * 2 ) * ( rc ? -1 : 1 ) ;
+ the_chr = res2[0].chromosome ;
+ out = rc ? dorc ( seq1 ) : seq1 ;
+ }
+ fprintf ( wobble , "MIS\t%s\t%d\t%d\t%s\n" , (*chrs)[the_chr].name.c_str() , match_pos , the_pos , out.c_str() ) ;
+ } else if ( res1.size() == 1 || res2.size() == 1 ) {
+ pwi_vector single = res1.size() == 1 ? res1 : res2 ;
+ pwi_vector multi = res1.size() == 1 ? res2 : res1 ;
+ bool good = true ;
+ int avg_pos = 0 ;
+ int avg_cnt = 0 ;
+ for ( int a = 0 ; a < multi.size() ; a++ ) {
+ if ( single[0].chromosome != multi[a].chromosome ) {
+ good = false ;
+ break ;
+ }
+ if ( abs ( ((int)single[0].position) - ((int)multi[a].position) ) > fragment_length * 2 ) {
+ good = false ;
+ break ;
+ }
+ if ( single[0].reverse_complement == multi[a].reverse_complement ) {
+ good = false ;
+ break ;
+ }
+ avg_pos += multi[a].position ;
+ avg_cnt++ ;
+ }
+ if ( good ) {
+ int the_chr = single[0].chromosome ;
+ int match_pos = single[0].position ;
+ int the_pos = avg_pos / avg_cnt ;
+ string out = res1.size() == 1 ? seq2 : seq1 ;
+ out = multi[0].reverse_complement ? dorc ( out ) : out ;
+ fprintf ( wobble , "MUL\t%s\t%d\t%d\t%s\n" , (*chrs)[the_chr].name.c_str() , match_pos , the_pos , out.c_str() ) ;
+ }
+ }
+
+ if ( sqlite ) sqlite_out.clear() ;
+
+ range = fragment_range ;
+// if ( res1.size() + res2.size() == 1 ) {
+ if ( res1.size() > 0 ) wobble4snps ( res1 , seq2 , fragment_length , range , seq1 , false ) ;
+ if ( res2.size() > 0 ) wobble4snps ( res2 , seq1 , fragment_length , range , seq2 , true ) ;
+// }
+
+ if ( sqlite_out.size() == 1 ) {
+ add2sqlite_cache ( sqlite_out[0] ) ;
+ }
+
+ return ;
+ }
+ return ;
+
+
+ uint a ;
+ cout << endl << seq1 << "\t" ;
+ for ( a = 0 ; a < res1.size() ; a++ ) cout << res1[a].chromosome << "@" << res1[a].position << ":" << res1[a].reverse_complement << ", " ;
+ cout << endl ;
+ cout << seq2 << "\t" ;
+ for ( a = 0 ; a < res2.size() ; a++ ) cout << res2[a].chromosome << "@" << res2[a].position << ":" << res2[a].reverse_complement << ", " ;
+ cout << endl ;
+
+ uint b ;
+ for ( a = 0 ; a < res1.size() ; a++ ) {
+ for ( b = 0 ; b < res2.size() ; b++ ) {
+ if ( res1[a].chromosome != res2[b].chromosome ) continue ;
+
+ TPositionWithIndex pa , pb ;
+ if ( res1[a].position > res2[b].position ) {
+ pa = res2[b] ;
+ pb = res1[a] ;
+ } else {
+ pa = res1[a] ;
+ pb = res2[b] ;
+ }
+
+ pa.position += (int) ( pa.reverse_complement ? -1 : 1 ) * seq1.length() ;
+ pb.position += (int) ( pb.reverse_complement ? -1 : 1 ) * seq2.length() ;
+
+// uint diff = pb.position - pa.position ;
+ fprintf ( wobble , "DEL\t%s\t%d\t%d\t%d\n" , (*chrs)[pa.chromosome].name.c_str() , pa.position , pb.position , pb.position - pa.position ) ;
+// cout << res1[a].position << ":" << res1[a].reverse_complement << " <=> " ;
+// cout << res2[b].position << ":" << res2[b].reverse_complement << "\t" << diff << endl ;
+ }
+ }
+
+
+return;
+
+/*
+
+ // Deletions
+ frag = fragment_length / 2 ;
+ range = frag - read_length_1 ;
+ wobble_results.clear() ;
+ cnt = paired_read_combine ( res1 , res2 , frag , range , read_length_1 , seq1 , qual1 , seq2 , qual2 , true ) ;
+ while ( wobble_results.size() ) {
+ TPositionWithIndex p1 , p2 ;
+ p2 = wobble_results.back() ; // Reverse order
+ wobble_results.pop_back() ;
+ p1 = wobble_results.back() ; // than pushing
+ wobble_results.pop_back() ;
+ if ( p1.position > p2.position ) { TPositionWithIndex p3 = p1 ; p1 = p2 ; p2 = p1 ; }
+ fprintf ( wobble , "DEL\t%s\t%d\t%d\t%d\n" , (*chrs)[p1.chromosome].name.c_str() , p1.position , p1.position + fragment_length , p2.position - p1.position ) ;
+ }
+
+
+ // Insertions
+ frag = fragment_length + insertion_length / 2 ;
+ range = insertion_length / 2 ;
+ wobble_results.clear() ;
+ cnt = paired_read_combine ( res1 , res2 , frag , range , read_length_1 , seq1 , qual1 , seq2 , qual2 , true ) ;
+ while ( wobble_results.size() ) {
+ TPositionWithIndex p1 , p2 ;
+ p2 = wobble_results.back() ; // Reverse order
+ wobble_results.pop_back() ;
+ p1 = wobble_results.back() ; // than pushing
+ wobble_results.pop_back() ;
+ if ( p1.position > p2.position ) { TPositionWithIndex p3 = p1 ; p1 = p2 ; p2 = p1 ; }
+ fprintf ( wobble , "INS\t%s\t%d\t%d\t%d\n" , (*chrs)[p1.chromosome].name.c_str() , p1.position , p1.position + fragment_length , p2.position - p1.position ) ;
+ }
+
+ */
+}
+
+void TChromosomeAlign::wobble4snps ( const pwi_vector &res , const string &seq , uint fragment_size , uint range , const string &anchor_seq , bool rc ) {
+ if ( res.empty() ) return ;
+
+ string seq_rc = dorc ( seq ) ;
+ int from , to , c ;
+ vector <string> out ;
+ char dummy[READ_CHAR_BUFFER] ;
+ int total_matches = 0 ;
+
+// string rc = dorc ( seq ) ;
+ int sl = seq.length() ;
+ for ( uint a = 0 ; a < res.size() ; a++ ) {
+ string *cseq = &(*chrs)[res[a].chromosome].sequence ;
+ if ( !(*chrs)[res[a].chromosome].original_sequence.empty() ) cseq = &(*chrs)[res[a].chromosome].original_sequence ;
+// if ( !cseq->empty() ) cseq = &(*chrs)[res[a].chromosome].sequence ;
+ if ( res[a].reverse_complement ) from = res[a].position - fragment_size - range ;
+ else from = res[a].position + fragment_size - range ;
+ if ( from < 0 ) from = 0 ;
+ to = from + range * 2 ;
+ if ( to >= cseq->length() ) to = cseq->length() - 1 ;
+ to -= sl ;
+
+ // Add general variation range
+// fprintf ( wobble , "VAR\t%s\t%d\t%d\n" , (*chrs)[res[a].chromosome].name.c_str() , (int) ((from+to)/2-seq.length()/2) , (int) ((from+to)/2+seq.length()/2) ) ;
+// out.push_back ( dummy ) ;
+
+ for ( int b = from ; b <= to ; b++ ) {
+ vector <int> pos ;
+ vector <char> o , n ;
+ pos.reserve ( wobblemax ) ;
+ o.reserve ( wobblemax ) ;
+ n.reserve ( wobblemax ) ;
+ const char *t = res[a].reverse_complement ? seq.c_str() : seq_rc.c_str() ;
+ int mismatches = 0 ;
+ const char *orig = cseq->c_str() + b ;
+ for ( c = 0 ; c < sl ; c++ , t++ , orig++ ) {
+ if ( char2IUPAC[*t] & char2IUPAC[*orig] ) continue ;
+ if ( ++mismatches > wobblemax ) break ;
+ pos.push_back ( b + c + 1 ) ;
+ o.push_back ( *orig ) ;
+ n.push_back ( *t ) ;
+ }
+ if ( mismatches == 0 ) continue ; // Weird...
+ if ( mismatches > wobblemax ) continue ;
+
+// total_matches++ ;
+// if ( total_matches > 1 ) break ;
+ if ( ++total_matches > 1 ) return ;
+
+ for ( c = 0 ; c < pos.size() ; c++ ) {
+ sprintf ( dummy , "SNP\t%s\t%d\t%c\t%c\n" , (*chrs)[res[a].chromosome].name.c_str() , pos[c] , o[c] , n[c] ) ;
+ out.push_back ( dummy ) ;
+ }
+
+ if ( sqlite ) {
+ uint opos = res[a].position ;
+ if ( res[a].reverse_complement ) opos -= anchor_seq.length() ;
+ string s = generate_sqlite_paired_command ( res[a].chromosome ,
+ res[a].reverse_complement ? dorc ( anchor_seq ) : anchor_seq ,
+ opos ,
+ res[a].reverse_complement ? seq : seq_rc ,
+ b , 'S' ) ;
+/* cout << s << endl ;
+ cout << ( res[a].reverse_complement ? "RC" : "NORM" ) << endl ;
+ cout << anchor_seq << endl ;
+ cout << dorc ( anchor_seq ) << endl ;
+ cout << dor ( anchor_seq ) << endl ;
+ cout << doc ( anchor_seq ) << endl ;
+ cout << cseq->substr ( opos , anchor_seq.length() ) << endl ;
+ cout << endl ;*/
+
+ sqlite_out.push_back ( s ) ;
+ }
+ }
+ }
+
+ if ( total_matches > 1 ) return ;
+ for ( c = 0 ; c < out.size() ; c++ ) {
+ fprintf ( wobble , "%s" , out[c].c_str() ) ;
+ }
+}
+
+string TChromosomeAlign::generate_sqlite_paired_command ( int chr , string seq1 , int pos1 , string seq2 , int pos2 , char mode , string add_keys , string add_values ) {
+ string ret ;
+ string *refp = (*chrs)[chr].original_sequence.empty() ? &((*chrs)[chr].sequence) : &((*chrs)[chr].original_sequence) ;
+ string ref1 = refp->substr ( pos1 , seq1.length() ) ;
+ string ref2 = refp->substr ( pos2 , seq2.length() ) ;
+ if ( seq1 == ref1 && seq2 == ref2 && mode == ' ' ) { // No SNPs
+ char tmp[1000] , *chrn = (char*) (*chrs)[chr].name.c_str() ;
+ sprintf ( tmp , "/*P|%s|%9d*/ INSERT INTO %s_perfect_match (read_name,pos1,pos2) VALUES (\"%s\",%d,%d);" , chrn , pos1 , chrn , last_solexa_name.c_str() , pos1+1 , pos2+1 ) ;
+ ret = tmp ;
+ } else { // SNP(s)
+ char tmp[1000] , *chrn = (char*) (*chrs)[chr].name.c_str() ;
+ int a ;
+ string out1 , out2 ;
+ string table = "_snp_match" ;
+ if ( mode == ' ' ) mode = 'S' ;
+ else if ( mode == 'I' ) table = "_inversions" ;
+ else if ( mode == '1' ) table = "_single" ;
+ if ( seq1 != ref1 ) {
+ out1 = seq1 ;
+ for ( a = 0 ; a < out1.length() ; a++ ) out1[a] = seq1[a] != ref1[a] ? seq1[a] : seq1[a] - 'A' + 'a' ;
+ }
+ if ( seq2 != ref2 && mode != '1' ) {
+ out2 = seq2 ;
+ for ( a = 0 ; a < out2.length() ; a++ ) out2[a] = seq2[a] != ref2[a] ? seq2[a] : seq2[a] - 'A' + 'a' ;
+ }
+
+ if ( mode == '1' ) {
+ sprintf ( tmp , "/*%c|%s|%9d*/ INSERT INTO %s%s (read_name,seq1,pos1) VALUES (\"%s\",\"%s\",%d);" ,
+ mode , chrn , pos1 , chrn , table.c_str() , last_solexa_name.c_str() ,
+ out1.c_str() ,
+ pos1+1 ) ;
+ } else {
+ sprintf ( tmp , "/*%c|%s|%9d*/ INSERT INTO %s%s (read_name,seq1,pos1,seq2,pos2%s) VALUES (\"%s\",\"%s\",%d,\"%s\",%d%s);" ,
+ mode , chrn , pos1 , chrn , table.c_str() ,
+ add_keys.c_str(),
+ last_solexa_name.c_str() ,
+ out1.c_str() ,
+ pos1+1 ,
+ out2.c_str() ,
+ pos2+1 ,
+ add_values.c_str()
+ ) ;
+ }
+ ret = tmp ;
+ }
+ return ret ;
+}
+
+int TChromosomeAlign::paired_read_combine ( const pwi_vector &v1 , const pwi_vector &v2 , uint fragment_length , uint range ,
+ uint read_length_1 , const string &seq1 , const string &qual1 , const string &seq2 , const string &qual2 ,
+ bool wobbling , bool inverting , int order ) {
+ int ret = 0 ;
+ vector <prc_cache> cache ;
+ uint a , b , c ;
+ uint seq1_length = seq1.length() ;
+ uint seq2_length = seq2.length() ;
+
+ if ( force_one_unique_match && v1.size() > 1 && v2.size() > 1 ) return ret ;
+
+ a = 0 ;
+ b = 0 ;
+ while ( a < v1.size() && b < v2.size() ) {
+ int chr = v1[a].chromosome ;
+ int l1 = v1[a].position + fragment_length - range - read_length_1 ;
+
+ if ( l1 < v1[a].position + read_length_1 ) l1 = v1[a].position + read_length_1 ;
+
+ while ( b < v2.size() && ( v2[b].chromosome < chr || ( v2[b].chromosome == chr && v2[b].position < l1 ) ) ) b++ ;
+ if ( b == v2.size() ) break ;
+
+ int l2 = v1[a].position + fragment_length + range ;
+
+
+ for ( c = b ; c < v2.size() && chr == v2[c].chromosome && v2[c].position <= l2 ; c++ ) {
+ if ( v2[c].position < l1 ) continue ;
+ if ( inverting && v1[a].reverse_complement == v2[c].reverse_complement ) { // INVERSION! FROM MARS!!!
+ if ( inversions ) fprintf ( inversions , "%s\t%d\t%d\n" , (*chrs)[v1[a].chromosome].name.c_str() , v1[a].position , v2[c].position ) ;
+ if ( sqlite ) {
+ string s , t ;
+ if ( v1[a].reverse_complement ) {
+ t = ",\"--\"" ;
+ t[2] = '0' + order ;
+ s = generate_sqlite_paired_command ( chr , dorc ( seq1 ) , v1[a].position - seq1_length ,
+ dorc ( seq2 ) , v2[c].position - seq2_length ,
+ 'I' , ",dir" , t ) ;
+ } else {
+ t = ",\"++\"" ;
+ t[2] = '0' + order ;
+ s = generate_sqlite_paired_command ( chr , seq1 , v1[a].position , seq2 , v2[c].position , 'I' , ",dir" , t ) ;
+ }
+ sqlite_out.push_back ( s ) ;
+ }
+
+ if ( sam ) {
+ int position_a = v1[a].reverse_complement ? v1[a].position - seq1_length : v1[a].position ;
+ int position_b = v2[c].reverse_complement ? v2[c].position - seq2_length : v2[c].position ;
+ int dist = v2[c].position > v1[a].position ? v2[c].position - v1[a].position : v1[a].position - v2[c].position ;
+ bool unique = true ; // Not true, but too complicated to fix...
+ if ( v1[a].reverse_complement ) {
+ add_sam ( dorc ( seq1 ) , dor ( qual1 ) , chr , position_a , 1 , position_b , dist , true , true , true , unique ) ;
+ add_sam ( dorc ( seq2 ) , dor ( qual2 ) , chr , position_b , 2 , position_a , dist , true , true , true , unique ) ;
+ } else {
+ add_sam ( seq1 , qual1 , chr , position_a , 1 , position_b , dist , false , true , false , unique ) ;
+ add_sam ( seq2 , qual2 , chr , position_b , 2 , position_a , dist , false , true , false , unique ) ;
+ }
+ sam_wrote = true ;
+ }
+ continue ;
+ }
+ if ( v1[a].reverse_complement == v2[c].reverse_complement ) continue ;
+
+ ret++ ;
+
+ // Add to output cache
+ prc_cache ca ;
+ ca.a = v1[a] ;
+ ca.b = v2[c] ;
+ cache.push_back ( ca ) ;
+
+ if ( gffout ) {
+ fprintf ( gffout , "\"%s\"\t\"SNP-O-MATIC\"\t\"misc_feature\"\t%d\t%d\t\"-\"\t\"%c\"\t\"-\"\t\"ID=%s_a;Gap=M%d\"\n" ,
+ (*chrs)[v1[a].chromosome].name.c_str() ,
+ v1[a].position ,
+ v1[a].position + read_length_1 ,
+ v1[a].reverse_complement ? '-' : '+' ,
+ last_solexa_name.c_str() ,
+ read_length_1
+ ) ;
+ fprintf ( gffout , "\"%s\"\t\"SNP-O-MATIC\"\t\"misc_feature\"\t%d\t%d\t\"-\"\t\"%c\"\t\"-\"\t\"ID=%s_b;Gap=M%d\"\n" ,
+ (*chrs)[v2[c].chromosome].name.c_str() ,
+ v2[c].position ,
+ v2[c].position + read_length_1 ,
+ v2[c].reverse_complement ? '-' : '+' ,
+ last_solexa_name.c_str() ,
+ read_length_1
+ ) ;
+ }
+
+
+ if ( wobbling ) {
+ TPositionWithIndex p1 , p2 ;
+ p1.chromosome = chr ;
+ p2.chromosome = chr ;
+ if ( v1[a].reverse_complement ) p1.position = v1[a].position - seq1_length ;
+ else p1.position = v1[a].position ;
+ if ( v2[c].reverse_complement ) p2.position = v2[c].position - seq2_length ;
+ else p2.position = v2[c].position ;
+ wobble_results.push_back ( p1 ) ;
+ wobble_results.push_back ( p2 ) ;
+ }
+
+ }
+ a++ ;
+ }
+
+ if ( multimatch && cache.size() > 1 ) { // Force to map the read in one of its possible locations
+ cache[0] = cache[rand()%cache.size()] ;
+ cache.resize ( 1 ) ;
+ }
+
+ if ( singlematch && cache.size() > 1 ) return ret ;
+
+ for ( uint n = 0 ; n < cache.size() ; n++ ) {
+ prc_cache *p = &cache[n] ;
+ int chr = p->a.chromosome ; // Same as p->b.chromosome
+
+ int position_a = p->a.reverse_complement ? p->a.position - seq1_length : p->a.position ;
+ int position_b = p->b.reverse_complement ? p->b.position - seq2_length : p->b.position ;
+ int dist = p->b.position > p->a.position ? p->b.position - p->a.position : p->a.position - p->b.position ;
+
+ if ( faceaway ) {
+ if ( p->a.reverse_complement && !p->b.reverse_complement && p->a.position < p->b.position ) {
+ fprintf ( faceaway , "%s\t%s" , last_solexa_name.c_str() , (*chrs)[p->a.chromosome].name.c_str() ) ;
+ fprintf ( faceaway , "\t%d" , position_a ) ;
+ fprintf ( faceaway , "\t%s" , p->a.reverse_complement ? dorc(seq1).c_str() : seq1.c_str() ) ;
+ fprintf ( faceaway , "\t%d" , position_b ) ;
+ fprintf ( faceaway , "\t%s" , p->b.reverse_complement ? dorc(seq2).c_str() : seq2.c_str() ) ;
+ fprintf ( faceaway , "\n" ) ;
+ }
+ }
+
+ if ( cigar ) {
+ add_cigar_sqlite ( seq1 , last_solexa_name , position_a , p->a.chromosome , p->a.reverse_complement ? '-' : '+' ) ;
+ add_cigar_sqlite ( seq2 , last_solexa_name , position_b , p->b.chromosome , p->b.reverse_complement ? '-' : '+' ) ;
+ }
+
+
+ if ( rpa ) {
+ rpa_out ( p->a , p->b , read_length_1 , seq1 , qual1 , seq2 , qual2 ) ;
+ }
+
+ if ( sqlite ) {
+ string s ;
+ if ( p->a.reverse_complement ) s = generate_sqlite_paired_command ( chr , dorc ( seq1 ) , position_a , seq2 , position_b ) ;
+ else s = generate_sqlite_paired_command ( chr , seq1 , position_a , dorc ( seq2 ) , position_b ) ;
+ sqlite_out.push_back ( s ) ;
+ }
+
+ if ( fragmentplot ) {
+ while ( fragment_stats.size() <= dist ) fragment_stats.push_back ( 0 ) ;
+ fragment_stats[dist]++ ;
+ }
+ if ( indelplot ) {
+ fprintf ( indelplot , "%s\t%d\t%d\t%d\n" , (*chrs)[p->a.chromosome].name.c_str() , p->a.position , p->b.position , dist ) ;
+ }
+ if ( coverage ) {
+ if ( p->a.reverse_complement ) (*chrs)[chr].add_coverage ( dorc ( seq1 ) , position_a , true ) ;
+ else (*chrs)[chr].add_coverage ( seq1 , position_a , false ) ;
+ if ( p->b.reverse_complement ) (*chrs)[chr].add_coverage ( dorc ( seq2 ) , position_b , true ) ;
+ else (*chrs)[chr].add_coverage ( seq2 , position_b , false ) ;
+ }
+ if ( sam ) {
+ bool unique = cache.size() == 1 ;
+ if ( p->a.reverse_complement ) add_sam ( dorc ( seq1 ) , dor ( qual1 ) , chr , position_a , 1 , position_b , dist , p->a.reverse_complement , true , p->b.reverse_complement , unique ) ;
+ else add_sam ( seq1 , qual1 , chr , position_a , 1 , position_b , dist , p->a.reverse_complement , true , p->b.reverse_complement , unique ) ;
+ if ( p->b.reverse_complement ) add_sam ( dorc ( seq2 ) , dor ( qual2 ) , chr , position_b , 2 , position_a , dist , p->b.reverse_complement , true , p->a.reverse_complement , unique ) ;
+ else add_sam ( seq2 , qual2 , chr , position_b , 2 , position_a , dist , p->b.reverse_complement , true , p->a.reverse_complement , unique ) ;
+ sam_wrote = true ;
+ }
+ if ( snpsinreads ) {
+ if ( p->a.reverse_complement ) add_snpsinreads ( dorc ( seq1 ) , dor ( qual1 ) , chr , position_a , 1 ) ;
+ else add_snpsinreads ( seq1 , qual1 , chr , position_a , 1 ) ;
+ if ( p->b.reverse_complement ) add_snpsinreads ( dorc ( seq2 ) , dor ( qual2 ) , chr , position_b , 2 ) ;
+ else add_snpsinreads ( seq2 , qual2 , chr , position_b , 2 ) ;
+ }
+ if ( pileup ) {
+ if ( p->a.reverse_complement ) (*chrs)[chr].ao.add_align ( dorc ( seq1 ) , dor ( qual1 ) , position_a , chr ) ;
+ else (*chrs)[chr].ao.add_align ( seq1 , qual1 , position_a , chr ) ;
+ if ( p->b.reverse_complement ) (*chrs)[chr].ao.add_align ( dorc ( seq2 ) , dor ( qual2 ) , position_b , chr ) ;
+ else (*chrs)[chr].ao.add_align ( seq2 , qual2 , position_b , chr ) ;
+ }
+ if ( featuretable ) {
+ fprintf ( featuretable , "gene\t%d-%d\n\t/product=\"%s\"\n\t/chromosome=\"%s\"\n" , p->a.position , p->a.position + read_length_1 , last_solexa_name.c_str() , (*chrs)[p->a.chromosome].name.c_str() ) ;
+ fprintf ( featuretable , "gene\t%d-%d\n\t/product=\"%s\"\n\t/chromosome=\"%s\"\n" , p->b.position , p->b.position + read_length_1 , last_solexa_name.c_str() , (*chrs)[p->b.chromosome].name.c_str() ) ;
+ }
+ }
+
+ return ret ;
+}
+
+void TChromosomeAlign::rpa_out ( const TPositionWithIndex &a , const TPositionWithIndex &b , uint read_length_1 , const string &seq1 , const string &qual1 , const string &seq2 , const string &qual2 ) {
+ if ( !rpa_header_written ) {
+ fprintf ( rpa , "#Readname\tr1_seq\tr1_qual\tr1_chr\tr1_pos\tr1_rc\tr2_seq\tr2_qual\tr2_chr\tr2_pos\tr2_rc\n" ) ;
+ rpa_header_written = true ;
+ }
+ fprintf ( rpa , "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d\n" ,
+ last_solexa_name.c_str() ,
+ seq1.c_str() ,
+ qual1.c_str() ,
+ (*chrs)[a.chromosome].name.c_str() ,
+ a.position ,
+ a.reverse_complement ? 1 : 0 ,
+ seq2.c_str() ,
+ qual2.c_str() ,
+ (*chrs)[b.chromosome].name.c_str() ,
+ b.position ,
+ b.reverse_complement ? 1 : 0
+ ) ;
+}
+
+void TChromosomeAlign::add_sam ( const string &seq , const string &quality , int chr , int pos , int readpart , int matepos , int ins_size , bool rc , bool as_pair , bool mate_rc , bool unique_match ) {
+ char cigar[100] ;
+ sprintf ( cigar , "%dM" , (int) seq.length() ) ;
+
+ unsigned int flag = 0 ;
+ flag |= 0x0001 ; // Paired in sequencing
+ if ( as_pair ) flag |= 0x0002 ; // Mapped as pair
+ else flag |= 0x0008 ; // Mate unmapped
+ if ( rc ) flag |= 0x0010 ; // On reverse strand
+ if ( as_pair ) {
+ if ( mate_rc ) flag |= 0x0020 ; // Mate RC
+ if ( 1 == readpart ) flag |= 0x0040 ; // First read in a pair
+ if ( 2 == readpart ) flag |= 0x0080 ; // Second read in a pair
+ }
+ if ( !unique_match ) flag |= 0x0100 ; // Not uniquely mapped
+
+ char mismatches[1000] ;
+ sprintf ( mismatches , "\tNM:i:%d" , count_snpsinreads ( seq , chr , pos ) ) ;
+ string tags ; // FILL ME
+// tags += "\tRG:Z:" + rgid ;
+// tags += "\tPU:Z:" + rgid ;
+ tags += "\tPG:Z:SNP-o-matic" ;
+ tags += mismatches ;
+
+ if ( pos > matepos ) ins_size = -ins_size ;
+
+ fprintf ( sam , "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s%s\n" ,
+ last_solexa_name.c_str() , // Read name
+ flag , // flag
+ (*chrs)[chr].name.c_str() , // Chromosome name
+ pos + 1 , // Alignment start position
+ 255 , // PHRED alignment score dummy
+ cigar , // CIGAR
+ "=" , // Mate
+ matepos , // Mate position
+ ins_size , // ISIZE (actually, fragment size)
+ seq.c_str() , // Sequence
+ quality.c_str() , // Quality
+ tags.c_str() // Tag
+ ) ;
+}
+
+int TChromosomeAlign::count_snpsinreads ( const string &seq , int chr , int pos ) {
+ uint sl = (*chrs)[chr].original_sequence.length() ;
+ if ( sl == 0 ) return 0 ; // No SNPs here
+
+ int ret = 0 ;
+ int seql = seq.length() , n , cnt ;
+ const char *os = (*chrs)[chr].original_sequence.c_str() ;
+ const char *ss = (*chrs)[chr].sequence.c_str() ;
+ for ( int p = 0 ; p < seql && p + pos < sl ; p++ ) {
+ if ( !isIUPAC[ss[p+pos]] || os[p+pos] == 'X' ) continue ;
+ if ( seq[p] != os[p+pos] ) ret++ ;
+ }
+ return ret ;
+}
+
+void TChromosomeAlign::add_snpsinreads ( const string &seq , const string &quality , int chr , int pos , int readpart ) {
+ uint sl = (*chrs)[chr].original_sequence.length() ;
+ if ( sl == 0 ) return ; // No SNPs here
+
+ if ( snpsinreads_first ) { // Output header
+ snpsinreads_first = false ;
+ fprintf ( snpsinreads , "#read_name\tread_part\tsnp_name\tchromosome\tposition_on_chromosome\tposition_on_read\tobserved_base\treference_base\tbase_quality\tmin_total\tavg_total\tmin_5\tavg_5\n" ) ;
+ }
+
+ int min_total = -1 , avg_total , min_5 , avg_5 ;
+ int seql = seq.length() , n , cnt ;
+
+ const char *os = (*chrs)[chr].original_sequence.c_str() ;
+ const char *ss = (*chrs)[chr].sequence.c_str() ;
+ for ( int p = 0 ; p < seql && p + pos < sl ; p++ ) {
+// if ( os[p+pos] == seq[p] || os[p+pos] == 'X' ) continue ;
+ if ( !isIUPAC[ss[p+pos]] || os[p+pos] == 'X' ) continue ;
+
+ // Preparing total quality scores
+ if ( min_total == -1 ) {
+ min_total = 1000 ; // Some high number
+ avg_total = 0 ;
+ for ( n = 0 ; n < seql ; n++ ) {
+ if ( quality[n] < min_total ) min_total = quality[n] ;
+ avg_total += quality[n] ;
+ }
+ min_total -= 33 ;
+ avg_total = avg_total / seql - 33 ;
+ }
+
+ // Prepare surrounding quality scores
+ min_5 = 1000 ; // Some high number
+ avg_5 = 0 ;
+ cnt = 0 ;
+ for ( n = ( p >= 5 ) ? p - 5 : 0 ; n < p + 5 && n < seql ; n++ ) {
+ if ( n == p ) continue ; // Do not use actual SNP base for quality
+ cnt++ ;
+ if ( quality[n] < min_5 ) min_5 = quality[n] ;
+ avg_5 += quality[n] ;
+ }
+ if ( cnt > 0 ) {
+ min_5 -= 33 ;
+ avg_5 = avg_5 / cnt - 33 ;
+ } else {
+ avg_5 = 0 ;
+ cerr << "TChromosomeAlign::add_snpsinreads : cnt == 0\n" ;
+ }
+
+ fprintf ( snpsinreads ,
+ "%s\t%d\t%s:%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n" ,
+ last_solexa_name.c_str() ,
+ readpart ,
+ (*chrs)[chr].name.c_str() ,
+ pos+p+1 ,
+ (*chrs)[chr].name.c_str() ,
+ pos+p+1 ,
+ p+1 ,
+ seq[p] ,
+ os[p+pos] ,
+ (unsigned char) quality[p] - 33 ,
+ min_total ,
+ avg_total ,
+ min_5 ,
+ avg_5
+ ) ;
+ }
+}
+
+void TChromosomeAlign::add2sqlite_cache ( string s ) {
+ if ( sqlite_cache_name.empty() ) {
+ char *c = new char[10000] ;
+ strcpy ( c , "sqlite_cache_XXXXXXXX" ) ;
+ close ( mkstemp ( c ) ) ;
+ //tmpnam ( c ) ;
+ sqlite_cache_name = c ;
+ sqlite_cache = fopen ( c , "w" ) ;
+ delete c ;
+ }
+ fprintf ( sqlite_cache , "%s\n" , s.c_str() ) ;
+}
+
+void TChromosomeAlign::finish_sqlite ( int fragment , int variance , int read_length ) {
+ if ( !sqlite ) return ;
+ int a , b ;
+ fprintf ( sqlite , "BEGIN EXCLUSIVE TRANSACTION;\n" ) ;
+
+ // Meta-data
+ fprintf ( sqlite , "CREATE TABLE meta ( key VARCHAR[64] , value VARCHAR[256] ) ;\n" ) ;
+ fprintf ( sqlite , "INSERT INTO meta ( key , value ) VALUES ( \"dbversion\" , \"3\" ) ;\n" ) ;
+ fprintf ( sqlite , "INSERT INTO meta ( key , value ) VALUES ( \"%s\" , \"%d\" ) ;\n" , "fragment" , fragment ) ;
+ fprintf ( sqlite , "INSERT INTO meta ( key , value ) VALUES ( \"%s\" , \"%d\" ) ;\n" , "variance" , variance ) ;
+ fprintf ( sqlite , "INSERT INTO meta ( key , value ) VALUES ( \"%s\" , \"%d\" ) ;\n" , "read_length" , read_length ) ;
+ fprintf ( sqlite , "INSERT INTO meta ( key , value ) VALUES ( \"%s\" , \"%s\" ) ;\n" , "name_prefix" , sqlite_prefix.c_str() ) ;
+
+ // Create tables
+ fprintf ( sqlite , "CREATE TABLE chromosomes ( name VARCHAR[256] , size INTEGER ) ;\n" ) ;
+ for ( a = 0 ; a < chrs->size() ; a++ ) {
+ fprintf ( sqlite , "CREATE TABLE %s_perfect_match ( read_name VARCHAR[32], pos1 INTEGER, pos2 INTEGER ) ;\n" , (*chrs)[a].name.c_str() ) ;
+ fprintf ( sqlite , "CREATE TABLE %s_snp_match ( read_name VARCHAR[32], pos1 INTEGER, seq1 VARCHAR[64] , pos2 INTEGER , seq2 VARCHAR[64] ) ;\n" , (*chrs)[a].name.c_str() ) ;
+ fprintf ( sqlite , "CREATE TABLE %s_inversions ( read_name VARCHAR[32], pos1 INTEGER, seq1 VARCHAR[64] , pos2 INTEGER , seq2 VARCHAR[64] , dir VARCHAR[4] ) ;\n" , (*chrs)[a].name.c_str() ) ;
+ fprintf ( sqlite , "CREATE TABLE %s_single ( read_name VARCHAR[32], pos1 INTEGER, seq1 VARCHAR[64] ) ;\n" , (*chrs)[a].name.c_str() ) ;
+ fprintf ( sqlite , "INSERT INTO chromosomes ( name , size ) VALUES ( \"%s\" , \"%d\") ;\n" , (*chrs)[a].name.c_str() , (*chrs)[a].sequence.length() ) ;
+ }
+
+ // Write contents
+// if ( DEBUGGING ) { fprintf ( stdout , "Sorting sqlite output ... " ) ; fflush(stdout); }
+// sort ( sqlite_cache.begin() , sqlite_cache.end() ) ;
+// if ( DEBUGGING ) { fprintf ( stdout , "writing ... " ) ; fflush(stdout); }
+
+ fclose ( sqlite_cache ) ;
+ sqlite_cache = fopen ( sqlite_cache_name.c_str() , "r" ) ;
+ char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ;
+
+ string pref = "\"" + sqlite_prefix ;
+// for ( a = 0 ; a < sqlite_cache.size() ; a++ ) {
+// string s = sqlite_cache[a] ;
+ while ( !feof ( sqlite_cache ) ) {
+ fgets ( dummy , READ_CHAR_BUFFER , sqlite_cache ) ;
+ for ( c1 = dummy ; *c1 > 13 ; c1++ ) ;
+ *c1 = 0 ;
+ string s ( dummy ) ;
+ if ( s.empty() ) continue ;
+
+ // Removing initial SQL comment to save disk space; we could leave it in, and sqlite would ignore it
+ b = s.find ( "*/" ) ;
+ if ( b > 0 && b < s.length() ) {
+ b += 2 ;
+ while ( s[b] == ' ' ) b++ ;
+ s = s.substr ( b ) ;
+ }
+
+ // Remove common read name prefix
+ int position = s.find ( pref ) ; // find first space
+ while ( position != string::npos ) {
+ s.replace( position, pref.length(), "\"" );
+ position = s.find( pref, position + 1 );
+ }
+
+ fprintf ( sqlite , "%s\n" , s.c_str() ) ;
+ }
+
+ // Create indices
+ fprintf ( sqlite , "COMMIT;\n" ) ;
+ fprintf ( sqlite , "BEGIN EXCLUSIVE TRANSACTION;\n" ) ;
+ for ( a = 0 ; a < chrs->size() ; a++ ) {
+ char *c = (char*) (*chrs)[a].name.c_str() ;
+ fprintf ( sqlite , "CREATE INDEX %s_perfect_index ON %s_perfect_match ( pos1 , pos2 ) ;\n" , c , c ) ;
+ fprintf ( sqlite , "CREATE INDEX %s_snp_index ON %s_snp_match ( pos1 , pos2 ) ;\n" , c , c ) ;
+ fprintf ( sqlite , "CREATE INDEX %s_inv_index ON %s_inversions ( pos1 , pos2 ) ;\n" , c , c ) ;
+ fprintf ( sqlite , "CREATE INDEX %s_sin_index ON %s_single ( pos1 ) ;\n" , c , c ) ;
+ }
+
+ // Close up
+ fprintf ( sqlite , "COMMIT;\n" ) ;
+ fclose ( sqlite_cache ) ;
+ remove ( sqlite_cache_name.c_str() ) ;
+ if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); }
+}
+
+void TChromosomeAlign::align_solexa_paired ( string filename , string filename2 , uint read_length_1 , uint fragment_length , uint range ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Reading solexa pair data from %s ... " , filename.c_str() ) ; fflush(stdout); }
+
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ if ( !file ) {
+ cerr << "Could not open FASTQ file \"" << filename << "\" - aborting." << endl ;
+ exit ( 1 ) ;
+ }
+
+ FILE *file2 = NULL ;
+ if ( !filename2.empty() ) {
+ file2 = fopen ( filename2.c_str() , "r" ) ;
+ if ( !file2 ) {
+ cerr << "Could not open FASTQ file \"" << filename2 << "\" - aborting." << endl ;
+ exit ( 1 ) ;
+ }
+ }
+
+ if ( sam ) {
+ for ( int a = 0 ; a < filename.length() ; a++ ) {
+ if ( filename[a] == '/' ) rgid.clear() ;
+ else rgid += filename[a] ;
+ }
+
+ char predicted_insert_size[100] ;
+ sprintf ( predicted_insert_size , "\tPI:%d" , fragment_length - read_length_1 * 2 ) ;
+
+/* string rg ( "@RG" ) ;
+ rg += "\tID:" + rgid ;
+ rg += "\tPU:" + rgid ;
+ rg += predicted_insert_size ;
+ rg += "\tPL:SOLEXA" ;
+ fprintf ( sam , "%s\n" , rg.c_str() ) ;
+*/
+ string pg ( "@PG\tID:SNP-o-matic" ) ;
+ fprintf ( sam , "%s\n" , pg.c_str() ) ;
+ }
+
+ char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ;
+ string lastseq , lastqual ;
+ fasta = true ;
+ bool last_was_quality = false , lastone = false , eof = false , skip = false ;
+ uint readcount = 0 , matched_reads = 0 , total_matches = 0 , skipped = 0 ;
+
+ if ( range == 0 ) range = fragment_length / 4 ;
+ fragment_range = range ;
+
+ int linecounter = 0 ;
+
+ while ( !eof || lastone ) {
+// if ( readcount > 10000 ) break ; // TESTING
+ if ( lastone ) {
+ strcpy ( dummy , "@dummy" ) ;
+ if ( fasta ) *dummy = '>' ;
+ } else {
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ if ( file2 ) {
+ int len = 0 ;
+ for ( c1 = dummy ; *c1 > 13 ; c1++ , len++ ) ;
+ if ( linecounter == 1 ) read_length_1 = len ;
+ fgets ( c1 , READ_CHAR_BUFFER , file2 ) ;
+ }
+ }
+
+ // Remove EOL
+ for ( c1 = dummy ; *c1 > 13 ; c1++ ) ;
+ *c1 = 0 ;
+
+ if ( linecounter == 0 && ( ( fasta && *dummy == '>' ) || *dummy == '@' ) ) {
+ if ( *dummy == '@' ) fasta = false ;
+
+ if ( !skip ) {
+
+ if ( sqlite && !last_solexa_name.empty() ) {
+ if ( sqlite_prefix_first ) {
+ sqlite_prefix_first = false ;
+ sqlite_prefix = last_solexa_name ;
+ } else {
+ int a ;
+ for ( a = 1 ; a < sqlite_prefix.length() && a < last_solexa_name.length() && last_solexa_name[a] == sqlite_prefix[a] ; a++ ) ;
+ if ( sqlite_prefix.length() != a ) sqlite_prefix = sqlite_prefix.substr ( 0 , a ) ;
+ }
+ }
+
+ for ( c1 = (char*) lastseq.c_str() ; *c1 && !isIUPAC[*c1] ; c1++ ) ;
+ if ( *c1 ) { // Contains IUPAC
+ if ( using_bins ) write2bin ( last_solexa_name , lastseq , lastqual , -1 ) ;
+ } else if ( !lastseq.empty() ) {
+ uint matches = align_solexa_paired_read ( last_solexa_name , lastseq , lastqual , read_length_1 , fragment_length , range ) ;
+ if ( matches > 0 ) matched_reads++ ;
+ total_matches += matches ;
+ if ( using_bins ) write2bin ( last_solexa_name , lastseq , lastqual , (int) matches ) ;
+ }
+ }
+
+ if ( lastone ) break ;
+
+ // Clear data
+ c1 = dummy ;
+ if ( *c1 == '>' || *c1 == '@' ) c1++ ;
+ while ( *c1 == ' ' ) c1++ ;
+ last_solexa_name = c1 ;
+ readcount++ ;
+ last_was_quality = false ;
+ lastseq.clear() ;
+ lastqual.clear() ;
+
+ if ( use_nono ) {
+ skip = is_nono ( last_solexa_name ) ;
+ if ( skip ) skipped++ ;
+ }
+ } else if ( *dummy == '+' ) {
+ last_was_quality = true ;
+ } else if ( last_was_quality ) {
+ lastqual += dummy ;
+ } else {
+ lastseq += dummy ;
+ }
+
+ linecounter++ ;
+ linecounter %= 4 ;
+
+ eof = feof ( file ) ;
+ if ( eof ) lastone = true ; // Fake last read
+ }
+
+ if ( fragmentplot ) {
+ for ( uint a = 0 ; a < fragment_stats.size() ; a++ ) {
+ fprintf ( fragmentplot , "%d\t%d\n" , a , fragment_stats[a] ) ;
+ }
+ }
+
+ if ( DEBUGGING ) { fprintf ( stdout , "scanned %d solexa reads, %d (%2.2f%%) matched , %d total matches, %d skipped.\n" , readcount , matched_reads , (float) matched_reads * 100 / readcount , total_matches , skipped ) ; fflush(stdout); }
+}
+
+void TChromosomeAlign::align_contigs ( chrvec &contigs , TChromosomalIndices &contig_index , uint maxerr , string output_filename ) {
+ uint a , b , c ;
+ vector <pwi_vector> vp ;
+ vector <vuint> vi ;
+ for ( a = 0 ; a < contig_index.position.size() ; a++ ) {
+ for ( b = 0 ; b < contig_index.position[a].size() ; b++ ) {
+ uint chr = contig_index.position[a][b].chromosome ;
+ while ( vp.size() <= chr ) vp.push_back ( pwi_vector () ) ;
+ while ( vi.size() <= chr ) vi.push_back ( vuint () ) ;
+ vp[chr].push_back ( contig_index.position[a][b] ) ;
+ vi[chr].push_back ( a ) ;
+ }
+ }
+
+ uint mismatch = 0 ;
+ vector <string> out ;
+ char c2[50000] ;
+
+ for ( a = 0 ; a < vp.size() ; a++ ) {
+ string refseq = contigs[a].sequence ;
+ for ( b = 0 ; b < vp[a].size() ; b++ ) {
+ vector <string> matches ;
+ uint i1 = vi[a][b] ;
+ pair <TPWIi , TPWIi> pp = equal_range ( index->position[i1].begin() , index->position[i1].end() , vp[a][b] ) ;
+
+ for ( TPWIi it = pp.first ; it < pp.second ; it++ ) {
+ uint chr = it->chromosome ;
+ string s ;
+ uint from , len = contigs[a].sequence.length() ;
+ if ( vp[a][b].reverse_complement ) {
+ if ( it->reverse_complement ) {
+ from = it->position - index->index_length_double ;
+ from -= vp[a][b].position - index->index_length_double ;
+ } else {
+ from = it->position + index->index_length_double - contigs[a].sequence.length() ;
+ from += vp[a][b].position - index->index_length_double ;
+ }
+ } else {
+ if ( it->reverse_complement ) {
+ from = it->position - contigs[a].sequence.length() ;
+ from += vp[a][b].position ;
+ } else {
+ from = it->position ;
+ from -= vp[a][b].position ;
+ }
+ }
+
+ bool rc = false ;
+ s = (*chrs)[chr].sequence.substr ( from , len ) ;
+ if ( s.length() != refseq.length() ) continue ; // Sanity check
+
+ if ( it->reverse_complement ) {
+ REVERSE_COMPLEMENT_CHARP ( (char*)s.c_str() ) ;
+ rc = !rc ;
+ }
+ if ( vp[a][b].reverse_complement ) {
+ REVERSE_COMPLEMENT_CHARP ( (char*)s.c_str() ) ;
+ rc = !rc ;
+ }
+ if ( s == refseq ) continue ; // No need to bother with a perfect match - or is there? Where did it come from?
+
+ // Check for errors
+ uint errcnt = 0 ;
+ for ( c = 0 ; errcnt <= maxerr && c < s.length() ; c++ ) {
+ if ( s[c] != refseq[c] ) {
+ errcnt++ ;
+ } else { // Lower case for all matching chars
+ s[c] = s[c] - 'A' + 'a' ;
+ }
+ }
+ if ( errcnt > maxerr ) continue ;
+
+ sprintf ( c2 , "%s\t%d\t%d\t%s\t%c" , s.c_str() , chr , from , refseq.c_str() , rc ? 'X' : ' ' ) ;
+ string s2 ( c2 ) ;
+ for ( c = 0 ; c < matches.size() && matches[c] != s2 ; c++ ) ;
+ if ( c < matches.size() ) continue ; // We have that one already
+ matches.push_back ( s2 ) ;
+ }
+
+ int lastpos , lastchr = -1 ;
+ for ( c = 0 ; c < matches.size() ; c++ ) {
+ if ( c > 0 && matches[c-1] == matches[c] ) continue ;
+
+ int d ;
+ vector <string> vs ;
+ Tokenize ( matches[c] , vs , "\t" ) ;
+ int _chr = atoi ( vs[1].c_str() ) ;
+ int _pos = atoi ( vs[2].c_str() ) ;
+ if ( lastchr == _chr && lastpos == _pos ) continue ;
+
+ lastchr = _chr ;
+ lastpos = _pos ;
+
+ bool rc = vs[4] == "X" ;
+
+ int thepos ;
+ for ( d = 0 ; d < vs[0].length() ; d++ ) {
+ if ( vs[0][d] < 'A' || vs[0][d] > 'Z' ) continue ; // Perfect match
+ if ( rc ) thepos = _pos + ( vs[0].length() - d ) ;
+ else thepos = _pos + 1 + d ;
+ char the_c = rc ? base_rc[vs[0][d]] : vs[0][d] ;
+ if ( (*chrs)[_chr].sequence[thepos-1] != the_c ) mismatch++ ;
+ if ( the_c == vs[3][d] ) continue ; // Pa-ra-no-ia
+ sprintf ( c2 , "%s\t%d\t%c\t%c\n" , (*chrs)[_chr].name.c_str() , thepos , the_c , vs[3][d] ) ;
+ out.push_back ( c2 ) ;
+ }
+ }
+
+ }
+ }
+
+ sort ( out.begin() , out.end() ) ;
+ FILE * file = fopen ( output_filename.c_str() , "w" ) ;
+ for ( a = 0 ; a < out.size() ; a++ ) {
+ if ( a > 0 && out[a] == out[a-1] ) continue ;
+ fwrite ( out[a].c_str() , 1 , out[a].length() , file ) ;
+ }
+ fclose ( file ) ;
+
+// if ( DEBUGGING ) cerr << mismatch << " mismatches\n" ;
+}
+
+
+/// Gets Solexa reads from a FASTQ file and matches them against the sequence.
+void TChromosomeAlign::align_solexa_fastq ( string filename ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Reading solexa data from %s ... " , filename.c_str() ) ; fflush(stdout); }
+
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ if ( !file ) {
+ cerr << "Could not open FASTQ file \"" << filename << "\" - aborting." << endl ;
+ exit ( 1 ) ;
+ }
+
+ char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ;
+ int solexacount = 0 , total_matches = 0 , matched_reads = 0 ;
+ string seq , q ;
+ char *c ;
+ bool invalid ;
+ fasta = false ;
+
+ if ( !pileup ) max_align = 2 ; // If we don't do the pileup but just bin, 2 matches are all we need (=> multiple_bin)
+
+ // Run through file
+ while ( !feof(file) ) {
+ invalid = false ;
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ if ( !*dummy ) break ;
+
+ // Read name
+ if ( *dummy != '@' ) { cerr << "Expected '@'\n" << dummy ; exit ( 1 ) ; }
+ for ( c = dummy ; *c > 13 ; c++ ) ;
+ *c = 0 ;
+ last_solexa_name = dummy + 1 ;
+
+ // Sequence
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ for ( c = dummy ; !isIUPAC[*c] ; c++ ) ;
+ if ( *c == 10 || *c == 13 ) *c = 0 ;
+ if ( *c ) {
+ invalid = true ;
+ while ( *c > 13 ) c++ ;
+ *c = 0 ;
+ }
+ seq = dummy ;
+
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+
+ if ( *dummy != '+' ) { cerr << "Expected '+'\n" ; exit ( 1 ) ; }
+ for ( c = dummy ; *c > 13 ; c++ ) ;
+ *c = 0 ;
+ string qname = dummy + 1 ;
+ if ( qname.empty() ) qname = last_solexa_name ;
+ if ( last_solexa_name != qname ) { cerr << "Name mismatch\n" << last_solexa_name << " : " << qname ; exit ( 1 ) ; }
+
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ for ( c = dummy ; *c > 13 ; c++ ) ;
+ *c = 0 ;
+ q = dummy ;
+
+ if ( !single_read_length ) single_read_length = seq.length() ;
+ int m = invalid ? -1 : align_solexa_read ( (char*) seq.c_str() , q ) ;
+ if ( m > 0 ) matched_reads++ ;
+ if ( m > 0 ) total_matches += m ;
+ if ( using_bins ) { // Write to bins
+ if ( last_solexa_name.empty() ) {} // Paranoia
+ else write2bin ( last_solexa_name , seq , q , m ) ;
+/* else if ( invalid ) fprintf ( binfile_iupac , "@%s\n%s\n+%s\n%s\n" , last_solexa_name.c_str() , seq.c_str() , last_solexa_name.c_str() , q.c_str() ) ;
+ else if ( m == 0 ) fprintf ( binfile_no_match , "@%s\n%s\n+%s\n%s\n" , last_solexa_name.c_str() , seq.c_str() , last_solexa_name.c_str() , q.c_str() ) ;
+ else if ( m == 1 ) fprintf ( binfile_single_match , "@%s\n%s\n+%s\n%s\n" , last_solexa_name.c_str() , seq.c_str() , last_solexa_name.c_str() , q.c_str() ) ;
+ else fprintf ( binfile_multi_match , "@%s\n%s\n+%s\n%s\n" , last_solexa_name.c_str() , seq.c_str() , last_solexa_name.c_str() , q.c_str() ) ;*/
+ last_solexa_name.clear() ;
+ }
+ if ( !invalid ) solexacount++ ;
+ }
+ fclose ( file ) ;
+
+ if ( DEBUGGING ) { fprintf ( stdout , "scanned %d solexa reads, %d (%2.2f%%) matched , %d total matches.\n" , solexacount , matched_reads , (float) matched_reads * 100 / solexacount , total_matches ) ; fflush(stdout); }
+}
+
+
+/// Gets Solexa reads from a FASTA file and matches them against the sequence. Should not be used anymore.
+void TChromosomeAlign::align_solexa_fasta ( string filename ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Reading solexa data from %s ... " , filename.c_str() ) ; fflush(stdout); }
+
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ if ( !file ) {
+ cerr << "Could not open FASTA file \"" << filename << "\" - aborting." << endl ;
+ exit ( 1 ) ;
+ }
+ char dummy[READ_CHAR_BUFFER] , *c ;
+ int solexacount = 0 , total_matches = 0 , matched_reads = 0 ;
+ bool invalid ;
+ fasta = true ;
+
+ // Artificial quality strings
+ vector <string> q ;
+ for ( int a = 0 ; a < 100 ; a++ ) q.push_back ( string ( a , (char) 30+33 ) ) ;
+
+ if ( !pileup ) max_align = 2 ; // If we don't do the pileup but just bin, 2 matches are all we need (=> multiple_bin)
+
+ // Run through file
+ while ( !feof(file) ) {
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ if ( !*dummy ) break ;
+ if ( *dummy == '>' ) {
+ if ( using_bins || cigar ) { // Writing to binfiles, storing solexa name
+ for ( c = dummy ; *c > 13 ; c++ ) ;
+ *c = 0 ;
+ last_solexa_name = dummy + 1 ;
+ }
+ continue ;
+ }
+
+ invalid = false ;
+ for ( c = dummy ; !isIUPAC[*c] ; c++ ) ;
+ if ( *c == 10 || *c == 13 ) *c = 0 ;
+
+ if ( *c ) {
+ invalid = true ;
+ while ( *c > 13 ) c++ ;
+ *c = 0 ;
+ }
+
+ if ( !invalid && !single_read_length ) single_read_length = strlen ( dummy ) ;
+ int m = invalid ? -1 : align_solexa_read ( dummy , q[strlen(dummy)] ) ;
+ if ( m > 0 ) matched_reads++ ;
+ if ( m > 0 ) total_matches += m ;
+ if ( using_bins ) { // Write to bins
+ if ( last_solexa_name.empty() ) {} // Paranoia
+ else write2bin ( last_solexa_name , string ( dummy ) , "" , m ) ;
+/* else if ( invalid ) fprintf ( binfile_iupac , ">%s\n%s\n" , last_solexa_name.c_str() , dummy ) ;
+ else if ( m == 0 ) fprintf ( binfile_no_match , ">%s\n%s\n" , last_solexa_name.c_str() , dummy ) ;
+ else if ( m == 1 ) fprintf ( binfile_single_match , ">%s\n%s\n" , last_solexa_name.c_str() , dummy ) ;
+ else fprintf ( binfile_multi_match , ">%s\n%s\n" , last_solexa_name.c_str() , dummy ) ;*/
+ last_solexa_name.clear() ;
+ }
+ solexacount++ ;
+ }
+ fclose ( file ) ;
+
+ if ( DEBUGGING ) { fprintf ( stdout , "scanned %d solexa reads, %d (%2.2f%%) matched , %d total matches.\n" , solexacount , matched_reads , (float) matched_reads * 100 / solexacount , total_matches ) ; fflush(stdout); }
+}
+
+
+int TChromosomeAlign::align_solexa_read ( char *_seq , const string &_quality , bool inv_compl ) {
+ uint ret = 0 ;
+ uint i1 = index->calculate_index ( _seq ) ;
+ if ( !index->position[i1].size() ) return 0 ;
+ string rc , rcq ;
+ uint l = strlen ( _seq ) ;
+
+ // Binary search
+ TPositionWithIndex findme ;
+ findme.index2 = index->calculate_index2 ( _seq + index->index_length ) ;
+ pair <TPWIi , TPWIi> pp = equal_range ( index->position[i1].begin() , index->position[i1].end() , findme ) ;
+ if ( sqlite ) sqlite_out.clear () ;
+
+ for ( TPWIi it = pp.first ; it < pp.second ; it++ ) {
+ uint chr = it->chromosome ;
+ char *chrseq = (char*) (*chrs)[chr].sequence.c_str() ;
+ int p = it->position ;
+ if ( it->reverse_complement ) {
+ if ( p < l ) continue ;
+ if ( rc.empty() ) { // Creating the reverse-complement
+ rc = string ( _seq ) ;
+ REVERSE_COMPLEMENT_CHARP ( (char*)rc.c_str() ) ;
+ }
+ if ( !matchIUPAC ( rc.substr(0,l-index->index_length_double).c_str() , chrseq+p-l ) ) continue ;
+ if ( cigar || sqlite ) add_cigar_sqlite ( rc , last_solexa_name , p-l , chr , '-' ) ;
+ if ( coverage ) (*chrs)[chr].add_coverage ( rc , p-l , true ) ;
+ if ( pileup ) {
+ if ( rcq.empty() ) {
+ rcq = _quality ;
+ reverse ( rcq.begin() , rcq.end() ) ;
+ }
+ (*chrs)[chr].ao.add_align ( rc , rcq , p-l , chr ) ;
+ }
+ if ( snpsinreads ) add_snpsinreads ( rc , rcq , chr , p-l , 1 ) ;
+ if ( sqlite ) sqlite_out.push_back ( generate_sqlite_paired_command ( chr , rc , p-l , "" , 0 , '1' ) ) ;
+ } else {
+ uint chrl = (*chrs)[chr].sequence.length() ;
+ if ( p + l > chrl ) continue ;
+ if ( !matchIUPAC ( _seq+index->index_length_double , chrseq+p+index->index_length_double ) ) continue ;
+ if ( pileup ) (*chrs)[chr].ao.add_align ( _seq , _quality , p , chr ) ;
+ if ( cigar || sqlite ) add_cigar_sqlite ( _seq , last_solexa_name , p , chr , '+' ) ;
+ if ( coverage ) (*chrs)[chr].add_coverage ( _seq , p , false ) ;
+ if ( snpsinreads ) add_snpsinreads ( _seq , _quality , chr , p , 1 ) ;
+ if ( sqlite ) sqlite_out.push_back ( generate_sqlite_paired_command ( chr , _seq , p , "" , 0 , '1' ) ) ;
+ }
+ ret++ ;
+ if ( ret >= max_align && !sqlite ) return ret ; // Shortcut for binning-only
+ }
+
+ if ( wobble && ret == 0 ) wobble_single_read ( _seq , _quality ) ;
+
+ if ( sqlite && sqlite_out.size() == 1 ) add2sqlite_cache ( sqlite_out[0] ) ;
+
+ return ret ;
+}
+
+string TChromosomeAlign::matchIUPAC_tolerant ( const char *c1 , const char *c2 , int tolerance ) {
+ string ret ;
+ while ( *c1 && *c2 ) {
+ if ( char2IUPAC[*c1++] & char2IUPAC[*c2++] ) {
+ ret += *(c2-1) ;
+ continue ;
+ }
+ if ( tolerance-- == 0 ) return "" ;
+ ret += *(c2-1) ;
+ }
+ return ret ;
+}
+
+
+
+void TChromosomeAlign::wobble_single_read ( char *seq , const string &_quality ) {
+ uint i1 = index->calculate_index ( seq ) ;
+ uint l = strlen ( seq ) ;
+ string rc , rcq , query , realseq ;
+ int start , dir ;
+
+ // Binary search
+ TPositionWithIndex findme ;
+ findme.index2 = index->calculate_index2 ( seq + index->index_length ) ;
+ pair <TPWIi , TPWIi> pp = equal_range ( index->position[i1].begin() , index->position[i1].end() , findme ) ;
+
+ for ( TPWIi it = pp.first ; it < pp.second ; it++ ) {
+ uint chr = it->chromosome ;
+ char *chrseq = (char*) (*chrs)[chr].sequence.c_str() ;
+ int p = it->position ;
+ string found ;
+ if ( it->reverse_complement ) {
+ if ( p < l ) continue ;
+ if ( rc.empty() ) { // Creating the reverse-complement
+ rc = string ( seq ) ;
+ REVERSE_COMPLEMENT_CHARP ( (char*)rc.c_str() ) ;
+ }
+ query = rc.substr(0,l-index->index_length_double) ;
+ start = p-l ;
+ found = matchIUPAC_tolerant ( query.c_str() , chrseq+start , wobblemax ) ;
+ if ( found.empty() ) continue ;
+ realseq = dorc(seq).substr(0,l-index->index_length_double) ;
+ } else {
+ uint chrl = (*chrs)[chr].sequence.length() ;
+ if ( p + l > chrl ) continue ;
+ query = seq+index->index_length_double ;
+ start = p+index->index_length_double ;
+ found = matchIUPAC_tolerant ( query.c_str() , chrseq+start , wobblemax ) ;
+ if ( found.empty() ) continue ;
+ realseq = query ;
+ }
+ dir = 1 ;
+ for ( int a = 0 ; a < query.length() && a < found.length() ; a++ ) {
+ char c = (*chrs)[chr].original_sequence.empty() ? (*chrs)[chr].sequence[start + a * dir] : (*chrs)[chr].original_sequence[start + a * dir] ;
+// cout << start + a * dir + 1 << "\t" << c << "\t" << query[a] << "\t" << found[a] << endl ;
+ if ( char2IUPAC[c] & char2IUPAC[query[a]] ) continue ;
+// if ( c == found[a] ) continue ;
+ fprintf ( wobble , "SNP\t%s\t%d\t%c\t%c\n" , (*chrs)[chr].name.c_str() , start + a * dir + 1 , c , query[a] ) ;
+ }
+// cout << query << " O" << endl << found << " N" << endl << endl ;
+ }
+}
+
+void TChromosomeAlign::add_cigar_sqlite ( const string &sequence , const string &name , uint position , uint chromosome , char orientation ) {
+ char cig[READ_CHAR_BUFFER] ;
+ string cigar_text ;
+
+ int a ;
+ bool iu = false ;
+ for ( a = 0 ; a < sequence.length() && !iu ; a++ ) {
+ if ( isIUPAC[(*chrs)[chromosome].sequence[a+position]] ) iu = true ;
+ }
+ if ( iu ) {
+ string s ;
+ for ( a = 0 ; a < sequence.length() ; a++ ) {
+ if ( (*chrs)[chromosome].sequence[a+position] != sequence[a] ) s += "D" ;
+ else s += "M" ;
+ }
+
+ // Compressing CIGAR
+ char last = ' ' ;
+ uint cnt = 0 ;
+ for ( a = 0 ; a < s.length() ; a++ ) {
+ if ( s[a] != last ) {
+ if ( last != ' ' ) {
+ if ( !cigar_text.empty() ) cigar_text += " " ;
+ sprintf ( cig , "%c %d" , last , cnt ) ;
+ cigar_text += cig ;
+ if ( last == 'D' ) {
+ sprintf ( cig , " I %d" , cnt ) ;
+ cigar_text += cig ;
+ }
+ }
+ last = s[a] ;
+ cnt = 1 ;
+ } else cnt++ ;
+ }
+ sprintf ( cig , " %c %d" , last , cnt ) ;
+ cigar_text += cig ;
+ if ( last == 'D' ) {
+ sprintf ( cig , " I %d" , cnt ) ;
+ cigar_text += cig ;
+ }
+
+// cout << cigar_text << endl ;
+ } else {
+ sprintf ( cig , "M %d" , (int)sequence.length() ) ;
+ cigar_text = cig ;
+ }
+
+ if ( cigar ) {
+ fprintf ( cigar , "%s %d %d %c %s %d %d + %d %s\n" ,
+ name.c_str() , // Read name
+ 1 , // Read begin
+ sequence.length() , // Read end
+ orientation , // Read orientation
+ (*chrs)[chromosome].name.c_str(), // Chromosome name
+ position + 1 , // Begin on chromosome
+ position + sequence.length() , // End on chromosome
+ 99 , // Fake Smith-Waterman score
+ cigar_text.c_str() // CIGAR commands
+ ) ;
+ }
+}
+
+void TChromosomeAlign::show_pileup ( bool snps_only ) {
+ if ( !pileup ) return ;
+ for ( int chromosome = 0 ; chromosome < chrs->size() ; chromosome++ ) (*chrs)[chromosome].ao.show_pileup ( pileup , snps_only ) ;
+// cerr << "SNPS searched : " << search_snps << ", found : " << found_snps << " (" << (found_snps*100/search_snps) << "%)" << endl ;
+// if ( bogus ) cerr << "Bogus : " << bogus << endl ;
+}
+
+
+
+void TChromosomeAlign::align_solexa_fastq_variety ( string filename ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Reading solexa data from %s ... " , filename.c_str() ) ; fflush(stdout); }
+
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ if ( !file ) {
+ cerr << "Could not open FASTQ file \"" << filename << "\" - aborting." << endl ;
+ exit ( 1 ) ;
+ }
+ char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ;
+ int solexacount = 0 , total_matches = 0 , matched_reads = 0 ;
+
+ string seq , q ;
+ char *c ;
+ bool invalid ;
+ fasta = false ;
+
+ if ( !pileup ) max_align = 2 ; // If we don't do the pileup but just bin, 2 matches are all we need (=> multiple_bin)
+
+ // Run through file
+ while ( !feof(file) ) {
+ invalid = false ;
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ if ( !*dummy ) break ;
+ // Read name
+ if ( *dummy != '@' ) { cerr << "Expected '@'\n" << dummy ; exit ( 1 ) ; }
+ for ( c = dummy ; *c > 13 ; c++ ) ;
+ *c = 0 ;
+ last_solexa_name = dummy + 1 ;
+
+ // Sequence
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ for ( c = dummy ; !isIUPAC[*c] ; c++ ) ;
+ if ( *c == 10 || *c == 13 ) *c = 0 ;
+ if ( *c ) {
+ invalid = true ;
+ while ( *c > 13 ) c++ ;
+ *c = 0 ;
+ }
+ seq = dummy ;
+
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ if ( *dummy != '+' ) { cerr << "Expected '+'\n" ; exit ( 1 ) ; }
+ for ( c = dummy ; *c > 13 ; c++ ) ;
+ *c = 0 ;
+ string qname = dummy + 1 ;
+ if ( qname.empty() ) qname = last_solexa_name ;
+ if ( last_solexa_name != qname ) { cerr << "Name mismatch\n" << last_solexa_name << " : " << qname ; exit ( 1 ) ; }
+
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ for ( c = dummy ; *c > 13 ; *c++ ) ;
+ *c = 0 ;
+ q = dummy ;
+
+ try_matching_read ( seq , q ) ;
+
+ int m = invalid ? -1 : try_matching_read ( seq , q ) ;
+ if ( m > 0 ) matched_reads++ ;
+ if ( m > 0 ) total_matches += m ;
+ if ( using_bins ) { // Write to bins
+ if ( last_solexa_name.empty() ) {} // Paranoia
+ else write2bin ( last_solexa_name , seq , q , m ) ;
+ last_solexa_name.clear() ;
+ }
+ if ( !invalid ) solexacount++ ;
+ }
+ fclose ( file ) ;
+
+ if ( DEBUGGING ) { fprintf ( stdout , "scanned %d solexa reads, %d (%2.2f%%) matched , %d total matches.\n" , solexacount , matched_reads , (float) matched_reads * 100 / solexacount , total_matches ) ; fflush(stdout); }
+}
+
+int TChromosomeAlign::try_matching_read ( const string & seq , string quality ) {
+ vector <uint> indices ;
+ uint idd = index->index_length_double ;
+
+ // Beginning
+ indices.push_back ( index->calculate_index ( seq.c_str() ) ) ;
+ indices.push_back ( index->calculate_index2 ( seq.c_str() + index->index_length ) ) ;
+
+ // Middle
+// indices.push_back ( index->calculate_index ( seq.c_str() + seq.length() / 2 + idd / 2 ) ) ;
+// indices.push_back ( index->calculate_index2 ( seq.c_str() + seq.length() / 2 + idd / 2 + index->index_length ) ) ;
+
+ // End
+ indices.push_back ( index->calculate_index ( seq.c_str() + seq.length() - 1 - idd ) ) ;
+ indices.push_back ( index->calculate_index2 ( seq.c_str() + seq.length() - 1 - idd + index->index_length ) ) ;
+
+ uint m , n ;
+ pwi_vector out ;
+
+ for ( n = 0 ; n < indices.size() ; n += 2 ) {
+ uint i1 = indices[n] ;
+ TPositionWithIndex findme ;
+ findme.index2 = indices[n+1] ;
+ pair <TPWIi , TPWIi> pp = equal_range ( index->position[i1].begin() , index->position[i1].end() , findme ) ;
+
+ for ( TPWIi it = pp.first ; it < pp.second ; it++ ) {
+ out.push_back ( *it ) ;
+ }
+ }
+
+ if ( out.size() < 2 ) return 0 ;
+
+ sort ( out.begin() , out.end() , tpi_full_compare ) ;
+
+
+ int cnt = 0 , start = -1 ;
+ for ( n = 0 ; n < out.size() ; n++ ) {
+ bool found = false ;
+ for ( m = n + 1 ; m < out.size() ; m++ ) {
+ if ( out[m].chromosome != out[n].chromosome ) break ;
+ if ( out[m].reverse_complement != out[n].reverse_complement ) break ;
+
+ int diff = out[m].position - out[n].position ;
+ if ( diff < 0 ) diff = -diff ;
+
+ if ( diff > seq.length() ) continue ;
+ found = true ;
+ start = out[m].position < out[n].position ? m : n ;
+ break ;
+ }
+ if ( !found ) continue ;
+ cnt++ ;
+// cout << out[n].chromosome << "\t" << out[n].position << "\t" << out[n].reverse_complement << endl ;
+ }
+// cout << endl ;
+
+ if ( cnt == 1 ) {
+ string ref = get_ref_substr ( out[start] , seq.length() ) ;
+ string s = out[start].reverse_complement ? dorc ( seq ) : seq ; // ????
+ for ( int a = 0 ; a < s.length() ; a++ ) if ( s[a] != ref[a] ) s[a] = s[a] - 'A' + 'a' ;
+ cout << out[start].chromosome << "\t" << out[start].position << "\t" << ref << "\t" << s << "\t" << out.size() << "\t" << cnt << "\t" << out[start].reverse_complement << endl ;
+ }
+
+ return cnt ;
+}
+
+string TChromosomeAlign::get_ref_substr ( const TPositionWithIndex &p , int length ) {
+ int start = p.position ;
+ if ( p.reverse_complement ) start -= index->index_length_double + 1 ;
+ if ( start < 0 ) return "" ;
+ string ret = (*chrs)[p.chromosome].sequence.substr ( start , length ) ;
+// if ( p.reverse_complement ) return doc ( ret ) ;
+ return ret ;
+}
diff --git a/src/findknownsnps.cpp b/src/findknownsnps.cpp
new file mode 100644
index 0000000..557c475
--- /dev/null
+++ b/src/findknownsnps.cpp
@@ -0,0 +1,338 @@
+/*! \mainpage SNP-O-MATIC
+ *
+ * \section authors Authors
+ *
+ * - Magnus Manske (mm6 at sanger.ac.uk)
+ *
+ * \section intro_sec Introduction
+ *
+ * SNP-O-MATIC can align Solexa reads against a reference sequence.
+ * Known SNPs can be added to the reference sequence as IUPAC bases.
+ * Only perfectly matching Solexa reads, with respect to the IUPAC bases, will be aligned.
+ * Reads can be binned into those that do not match the reference, those that match uniquely, and those that match two or more times.
+ *
+ * \section install_sec Installation
+ *
+ * SNP-O-MATIC comes with a Makefile. The Intel compiler is used by default for performance reasons, but the g++ variant is in the Makefile as well in the form of comments.
+ */
+
+#include "snpomatic.h"
+#include <fstream>
+
+using namespace std ;
+
+
+
+// __________________________________________________________________________________________________________________________________________________________________ MAIN
+
+/// Shows possible parameters, and an (optional) error message.
+void die_with_description ( const char *msg = NULL ) {
+ if ( msg ) cerr << msg << endl ;
+ cerr << "Parameters : \n" ;
+ cerr << "--genome=GENOME_FILE FASTA file with chromosomes (mandatory)\n" ;
+ cerr << "--fasta=FASTA_FILE FASTA file with Solexa reads (mandatory, except when --fastq or --index is used)\n" ;
+ cerr << "--fastq=FASTQ_FILE FASTQ file with Solexa reads (mandatory, except when --fasta or --index is used)\n" ;
+ cerr << "--fastq2=FASTQ_FILE2 FASTQ file with Solexa reads (optional; read mate)\n" ;
+ cerr << "--nono=FILENAME File with list of read names (!) to ignore (optional)\n" ;
+ cerr << "--regions=REGION_FILE Region file for finding new SNPs (optional) [DEPRECATED]\n" ;
+ cerr << "--snps=SNP_FILE Simple SNP file (optional)\n" ;
+ cerr << "--gff=GFF_FILE GFF file with SNPs (optional)\n" ;
+ cerr << "--uniqueness=FILE Output a uniqueness data file for the reference; no Solexa reads needed; implies --noshortcuts (optional)\n" ;
+ cerr << "--pileup=FILE Outputs complete pileup into that file (optional)\n" ;
+ cerr << "--cigar=FILE Outputs alignment in CIGAR format (optional)\n" ;
+ cerr << "--gffout=FILE Outputs reads in GFF format (optional)\n" ;
+ cerr << "--coverage=FILENAME Outputs (high depth) coverage data (optional)\n" ;
+ cerr << "--wobble=FILENAME Outputs a list of possible variations (optional; paired reads only) [UNDER CONSTRUCTION]\n" ;
+ cerr << "--fragmentplot=FILENAME Outputs a plot of fragment size distribution to a file (optional)\n" ;
+ cerr << "--snpsinreads=FILENAME Outputs a list of reads containing known SNPs to a file (optional)\n" ;
+ cerr << "--indelplot=FILENAME Outputs indel data to a file (optional)\n" ;
+ cerr << "--inversions=FILENAME For paired reads, writes read matches indicating inversions into a file (optional)\n" ;
+ cerr << "--faceaway=FILENAME For paired reads, writes read matches that \"face away\" from each other into a file (optional)\n" ;
+ cerr << "--sqlite=FILENAME Creates a sqlite text file with alignment data [EXPERIMENTAL] (optional)\n" ;
+ cerr << "--sam=FILENAME Creates a SAM alignment file (optional)\n" ;
+ cerr << "--spancontigs=FILENAME Outputs read pairs where \"half\" reads map uniquely to different contigs (optional)\n" ;
+ cerr << "--bins=FILE_PREFIX Outputs no-match, single-match and multi-match Solexa reads into prefixed files (optional)\n" ;
+ cerr << "--binmask=MASK Mask of 1s and 0s to turn off individual bins. Order: No match, single match, multi-match, IUPAC.\n" ;
+ cout << " Example: 0100 creates only single-match bin. (optional; default:1111)\n" ;
+ cerr << "--pair=NUMBER For paired reads, the length of the first part of the read (mandatory for paired reads)\n" ;
+ cerr << "--fragment=NUMBER For paired reads, the average fragment length (mandatory for paired reads)\n" ;
+ cerr << "--variance=NUMBER For paired reads, the variance of the fragment length to either side (optional; default: 1/4 of fragment size)\n" ;
+ cerr << "--wobblemax=NUMBER Maximum number of mismatches for wobble (optional; default 2; see --wobble)\n" ;
+ cerr << "--mspi=NUMBER Maximum number of SNPs per chromosomal index (optional; default:8)\n" ;
+ cerr << "--index=FILENAME Index filename (index will be created if it does not exist; optional)\n" ;
+ cerr << "--noshortcuts Will process all chrososomal regions, even those with lots'o'repeats (optional; no value)\n" ;
+ cerr << "--snpsonly Only lists found SNPs in the pileup (optional; no value)\n" ;
+ cerr << "--chromosome=NAME Discards all chromosomes but NAME prior to run (optional)\n" ;
+ cerr << "--index_from=NUMBER Starts indexing at this position on all chromosomes (optional)\n" ;
+ cerr << "--index_to=NUMBER Stops indexing at this position on all chromosomes (optional)\n" ;
+ cerr << "--chop=NUMBER For paired reads, if one but not the other matches, shorten the other by NUMBER bases (optional)\n" ;
+ cerr << "--index1=NUMBER Length of internal index 1 (optional; default:10)\n" ;
+ cerr << "--index2=NUMBER Length of internal index 2 (optional; default:16)\n" ;
+ cerr << "--memory_save=NUMBER Indexes the genome every NUMBER of positions; saves memory and runtime, but can have strange side effects (optional)\n" ;
+ cerr << "--multimatch Puts a multiple-matching read to a random position (optional) [currently paired reads only]\n" ;
+ cerr << "--singlematch Only performs additional output functions for single matches (optional) [currently paired reads only]\n" ;
+ cerr << "--foum For paired reads, at least one read has to match uniquely in the genome (force one unique match) (optional)\n" ;
+ cerr << "--mismatch The number of mismatches allowed outside the index (index1+index2) (optional)\n" ;
+ cerr << "--rpa=FILENAME Writes all read pair alignments into a file (optional)\n" ;
+// cerr << "--featuretable=OUTPUT_FILE\tOutputs a DDBJ/EMBL/GenBank feature table for matched reads (optional; paired reads only for now)\n" ;
+ exit ( 0 ) ;
+}
+
+/// Well, guess what.
+int main ( int argc , char **argv ) {
+ string genome_file , gff_file , simple_snp_file , fastq_file , fastq2_file , fasta_file , pileup_file , bin_prefix , regions_file ;
+ string cigar_file , wobble_file , nono , fragmentplot , featuretable , gffout , coverage_file , uniqueness , snpsinreads ;
+ string index_file , indelplot_file , inversions_file , chromosome , sqlite , sam_file , spancontigs , face_away , rpa ;
+ string binmask ( "1111" ) ;
+ uint wobblemax = 2 ;
+ uint mspi = 8 ;
+ uint index_from = 0 ;
+ uint index_to = 0 ;
+ uint pair_length = 0 ;
+ uint fragment_length = 0 ;
+ uint index_length_1 = 10 ;
+ uint index_length_2 = 16 ;
+ int noshortcuts = false ;
+ int snpsonly = false ;
+ int chop = 0 ;
+ uint variance = 0 ;
+ int multimatch = false ;
+ int singlematch = false ;
+ int force_one_unique_match = false ;
+ int memory_save = 1 ;
+
+ // Parameters
+ static struct option long_options[] = {
+ { "genome" , required_argument , 0 , 'g' } ,
+ { "fasta" , optional_argument , 0 , 'a' } ,
+ { "fastq" , optional_argument , 0 , 'q' } ,
+ { "fastq2" , optional_argument , 0 , 'Q' } ,
+ { "snps" , optional_argument , 0 , 's' } ,
+ { "coverage" , optional_argument , 0 , 'e' } ,
+ { "sam" , optional_argument , 0 , 'S' } ,
+ { "gff" , optional_argument , 0 , 'f' } ,
+ { "rpa" , optional_argument , 0 , 'R' } ,
+ { "faceaway" , optional_argument , 0 , 'F' } ,
+ { "pileup" , optional_argument , 0 , 'p' } ,
+ { "bins" , optional_argument , 0 , 'b' } ,
+ { "binmask" , optional_argument , 0 , 'z' } ,
+ { "index" , optional_argument , 0 , 'x' } ,
+ { "nono" , optional_argument , 0 , 'n' } ,
+ { "uniqueness" , optional_argument , 0 , '0' } ,
+ { "wobble" , optional_argument , 0 , 'w' } ,
+ { "pair" , optional_argument , 0 , 'i' } ,
+ { "fragment" , optional_argument , 0 , 't' } ,
+ { "featuretable" , optional_argument , 0 , 'u' } ,
+ { "gffout" , optional_argument , 0 , 'y' } ,
+ { "variance" , optional_argument , 0 , 'v' } ,
+ { "mspi" , optional_argument , 0 , 'm' } ,
+ { "cigar" , optional_argument , 0 , 'c' } ,
+ { "sqlite" , optional_argument , 0 , 'P' } ,
+ { "fragmentplot" , optional_argument , 0 , 'o' } ,
+ { "snpsinreads" , optional_argument , 0 , 'd' } ,
+ { "regions" , optional_argument , 0 , 'r' } ,
+ { "index1" , optional_argument , 0 , '1' } ,
+ { "index2" , optional_argument , 0 , '2' } ,
+ { "chop" , optional_argument , 0 , '3' } ,
+ { "indelplot" , optional_argument , 0 , '4' } ,
+ { "wobblemax" , optional_argument , 0 , '5' } ,
+ { "inversions" , optional_argument , 0 , '6' } ,
+ { "chromosome" , optional_argument , 0 , '7' } ,
+ { "index_from" , optional_argument , 0 , '8' } ,
+ { "index_to" , optional_argument , 0 , '9' } ,
+ { "mismatch" , optional_argument , 0 , 'M' } ,
+ { "spancontigs" , optional_argument , 0 , 'X' } ,
+ { "memory_save" , optional_argument , 0 , 'Y' } ,
+ { "multimatch" , optional_argument , &multimatch , true } ,
+ { "singlematch" , optional_argument , &singlematch , true } ,
+ { "noshortcuts" , optional_argument , &noshortcuts , true } ,
+ { "foum" , optional_argument , &force_one_unique_match , true } ,
+ { "snpsonly" , optional_argument , &snpsonly , true } ,
+ { 0 , 0 , 0 , 0 }
+ } ;
+
+
+ int c ;
+ int option_index = 0;
+ while ( -1 != ( c = getopt_long (argc, argv, "g:a::q::s::f::p::b::r::w::n::",long_options, NULL) ) ) {
+
+ switch ( c ) {
+ case 0 : break ;
+ case 'g' : genome_file = optarg ; break ;
+ case 'a' : fasta_file = optarg ; break ;
+ case 'q' : fastq_file = optarg ; break ;
+ case 'S' : sam_file = optarg ; break ;
+ case 'Q' : fastq2_file = optarg ; break ;
+ case 's' : simple_snp_file = optarg ; break ;
+ case 'f' : gff_file = optarg ; break ;
+ case 'e' : coverage_file = optarg ; break ;
+ case 'p' : pileup_file = optarg ; break ;
+ case 'b' : bin_prefix = optarg ; break ;
+ case 'z' : binmask = optarg ; break ;
+ case 'F' : face_away = optarg ; break ;
+ case '0' : uniqueness = optarg ; break ;
+ case 'w' : wobble_file = optarg ; break ;
+ case 'n' : nono = optarg ; break ;
+ case 'x' : index_file = optarg ; break ;
+ case 'i' : pair_length = atoi ( optarg ) ; break ;
+ case 'o' : fragmentplot = optarg ; break ;
+ case 'u' : featuretable = optarg ; break ;
+ case 'y' : gffout = optarg ; break ;
+ case 'X' : spancontigs = optarg ; break ;
+ case 'd' : snpsinreads = optarg ; break ;
+ case 'v' : variance = atoi ( optarg ) ; break ;
+ case 'Y' : memory_save = atoi ( optarg ) ; break ;
+ case 't' : fragment_length = atoi ( optarg ) ; break ;
+ case 'm' : mspi = atoi ( optarg ) ; break ;
+ case '1' : index_length_1 = atoi ( optarg ) ; break ;
+ case '2' : index_length_2 = atoi ( optarg ) ; break ;
+ case '3' : chop = atoi ( optarg ) ; break ;
+ case '4' : indelplot_file = optarg ; break ;
+ case '5' : wobblemax = atoi ( optarg ) ; break ;
+ case '6' : inversions_file = optarg ; break ;
+ case '7' : chromosome = optarg ; break ;
+ case '8' : index_from = atoi ( optarg ) ; break ;
+ case '9' : index_to = atoi ( optarg ) ; break ;
+ case 'M' : globally_allowed_mismatches = atoi ( optarg ) ; break ;
+ case 'c' : cigar_file = optarg ; break ;
+ case 'r' : regions_file = optarg ; break ;
+ case 'P' : sqlite = optarg ; break ;
+ case 'R' : rpa = optarg ; break ;
+ default : die_with_description("Unknown option") ;
+ }
+ }
+ binmask += "0000" ; // Ignore bins if shorter string was supplied
+
+
+ // Sanity checks
+ if ( genome_file.empty() ) die_with_description("No genome file given!") ; // Need this
+ if ( index_file.empty() ) {
+ if ( fastq_file.empty() && fasta_file.empty() && uniqueness.empty() ) die_with_description("No Solexa file given!") ; // Need one
+ if ( !fastq_file.empty() && !fasta_file.empty() ) die_with_description("Two Solexa files given!") ; // Can't have both
+ }
+ if ( !regions_file.empty() || !uniqueness.empty() ) noshortcuts = true ;
+
+ // Incorporating known SNPs and indexing chromosomes
+ init_iupac () ;
+ TChromosomalIndices index ( index_length_1 , index_length_2 ) ;
+ index.noshortcuts = noshortcuts ;
+ index.max_snps_per_index = mspi ;
+ index.index_file = index_file ;
+ index.index_from = index_from ;
+ index.index_to = index_to ;
+ vector <TChromosome> chrs ;
+ load_all_chromosomes ( genome_file , chrs ) ;
+
+ if ( !gff_file.empty() ) incorporate_all_gff ( gff_file , chrs ) ;
+ if ( !simple_snp_file.empty() ) incorporate_simple_snps ( simple_snp_file , chrs ) ;
+
+ if ( !chromosome.empty() ) {
+ for ( int a = 0 ; a < chrs.size() ; a++ ) {
+ if ( chrs[a].name == chromosome ) continue ;
+ chrs.erase ( chrs.begin() + a ) ;
+ a-- ;
+ }
+ if ( chrs.size() == 0 ) {
+ cout << "No chromosome with that name (" << chromosome << ") in file -- aborting.\n" ;
+ exit ( 1 ) ;
+ }
+ cout << "Keeping " << chrs.size() << " chromosomes" << endl ;
+ }
+
+ index.index_steps = memory_save ;
+
+ if ( !regions_file.empty() ) index.run_regions ( &chrs , regions_file ) ;
+ else index.run ( &chrs ) ;
+
+ if ( !uniqueness.empty() ) {
+ index.uniqueness ( uniqueness ) ;
+ if ( fastq_file.empty() && fasta_file.empty() ) return 0 ; // My work here is done
+ }
+
+ if ( !fastq_file.empty() && !fastq2_file.empty() && pair_length == 0 ) { // Auto-detect read length
+ ifstream in ( fastq_file.c_str() ) ;
+ char tmp[10000] ;
+ in.getline ( tmp , 10000 ) ;
+ in.getline ( tmp , 10000 ) ;
+ in.close() ;
+ pair_length = strlen ( tmp ) ;
+ cout << "Setting read length to " << pair_length << endl ;
+ }
+
+
+ // Now look at the Solexa reads and do the thing
+ TChromosomeAlign ca ( chrs , index ) ;
+ ca.wobblemax = wobblemax ;
+ ca.multimatch = multimatch ;
+ ca.singlematch = singlematch ;
+ ca.force_one_unique_match = force_one_unique_match ;
+ if ( !pileup_file.empty() ) ca.pileup = fopen ( pileup_file.c_str() , "w" ) ;
+ if ( !cigar_file.empty() ) ca.cigar = fopen ( cigar_file.c_str() , "w" ) ;
+ if ( !fragmentplot.empty() ) ca.fragmentplot = fopen ( fragmentplot.c_str() , "w" ) ;
+ if ( !indelplot_file.empty() ) ca.indelplot = fopen ( indelplot_file.c_str() , "w" ) ;
+ if ( !gffout.empty() ) ca.gffout = fopen ( gffout.c_str() , "w" ) ;
+ if ( !snpsinreads.empty() ) ca.snpsinreads = fopen ( snpsinreads.c_str() , "w" ) ;
+ if ( !face_away.empty() ) ca.faceaway = fopen ( face_away.c_str() , "w" ) ;
+ if ( !coverage_file.empty() ) ca.coverage = fopen ( coverage_file.c_str() , "w" ) ;
+ if ( !inversions_file.empty() ) ca.inversions = fopen ( inversions_file.c_str() , "w" ) ;
+ if ( !sqlite.empty() ) ca.sqlite = fopen ( sqlite.c_str() , "w" ) ;
+ if ( !sam_file.empty() ) ca.sam = fopen ( sam_file.c_str() , "w" ) ;
+ if ( !spancontigs.empty() ) ca.spancontigs = fopen ( spancontigs.c_str() , "w" ) ;
+ if ( !rpa.empty() ) ca.rpa = fopen ( rpa.c_str() , "w" ) ;
+ if ( !featuretable.empty() ) {
+ ca.featuretable = fopen ( featuretable.c_str() , "w" ) ;
+ fprintf ( ca.featuretable , "Key\tLocation/Qualifiers\n" ) ;
+ }
+ if ( !bin_prefix.empty() ) {
+ string ext = ".fasta" ;
+ if ( !fastq_file.empty() ) ext = ".fastq" ;
+ cout << "Using binmask " << binmask << endl ;
+ if ( binmask[0] == '1' ) ca.binfile_no_match = fopen ( string ( bin_prefix + "_no_match" + ext ).c_str() , "w" ) ;
+ if ( binmask[1] == '1' ) ca.binfile_single_match = fopen ( string ( bin_prefix + "_single_match" + ext ).c_str() , "w" ) ;
+ if ( binmask[2] == '1' ) ca.binfile_multi_match = fopen ( string ( bin_prefix + "_multi_match" + ext ).c_str() , "w" ) ;
+ if ( binmask[3] == '1' ) ca.binfile_iupac = fopen ( string ( bin_prefix + "_iupac" + ext ).c_str() , "w" ) ;
+ ca.using_bins = true ;
+ }
+ ca.chop = chop ;
+ ca.init () ;
+ if ( !nono.empty() ) ca.add_nono_list ( nono ) ;
+ if ( pair_length == 0 ) {
+ if ( !wobble_file.empty() ) ca.wobble = fopen ( string ( wobble_file ).c_str() , "w" ) ;
+ if ( !fasta_file.empty() ) ca.align_solexa_fasta ( fasta_file ) ;
+ if ( !fastq_file.empty() ) ca.align_solexa_fastq ( fastq_file ) ;
+ } else {
+ if ( fragment_length == 0 && variance == 0 ) { // If no fragment size is given, do entire chromosome
+ for ( uint a = 0 ; a < chrs.size() ; a++ ) {
+ if ( fragment_length < chrs[a].sequence.length() ) fragment_length = chrs[a].sequence.length() ;
+ }
+ fragment_length = fragment_length / 2 ;
+ variance = fragment_length ;
+ }
+ if ( !wobble_file.empty() ) ca.wobble = fopen ( string ( wobble_file ).c_str() , "w" ) ;
+ if ( !fasta_file.empty() ) ca.align_solexa_paired ( fasta_file , fastq2_file , pair_length , fragment_length , variance ) ;
+ if ( !fastq_file.empty() ) ca.align_solexa_paired ( fastq_file , fastq2_file , pair_length , fragment_length , variance ) ;
+ }
+ ca.show_pileup ( !regions_file.empty() || snpsonly ) ;
+ ca.dump_coverage () ;
+ ca.finish_sqlite ( fragment_length , variance , pair_length != 0 ? pair_length : ca.single_read_length ) ;
+
+ return 0 ;
+}
+
+// Example runs:
+// rm test ; make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --gff=/nfs/malaria/data2/3D7/known_snps/PfalciparumCombinedSNPs.gff --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --noshortcuts --bins=test
+// make clean ; make ; time ./findknownsnps --genome=/nfs/malaria/data2/human_sequence/Homo_sapiens.NCBI36.48.dna.chromosome.22.fa --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --noshortcuts --bins=human
+// make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --bins=q2
+// make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --gff=/nfs/malaria/data2/3D7/known_snps/PfalciparumCombinedSNPs.gff --cigar=test.cigar
+// make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=q1_no_match.fasta --snps=new_snps_60_uniq --bins=q3
+// make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/faculty/mm6/3D7_pm.fa --fastq=$MAL_SOLEXA_HOME/run368/368_s_3.fastq --pair=35 --fragment=300 --bins=q3
+// make clean ; make ; /usr/bin/time ./findknownsnps --genome=../3D7_pm.fa --snps=MERGED_CAPILLARY_MAQ.tab.subset --fastq=$MAL_SOLEXA_DATA/986_s_8/986_s_8.fastq --pair=37 --fragment=200 --variance=200 --sqlite=test.sqlite.text
+
+/*
+ // Test params, obsolete
+ genome_file = "/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa" ;
+ gff_file = "/nfs/malaria/data2/3D7/known_snps/PfalciparumCombinedSNPs.gff" ;
+ fasta_file = "/nfs/malaria/data2/3D7/all.solexa.fasta" ;
+ //simple_snp_file = "/nfs/malaria/data2/3D7/known_snps/MAL1.snps" ;
+ //fastq_file = "/nfs/malaria/data2/3D7/fastq/PFCLIN_slx.fastq" ;
+*/
diff --git a/src/global_functions.cpp b/src/global_functions.cpp
new file mode 100644
index 0000000..58358d5
--- /dev/null
+++ b/src/global_functions.cpp
@@ -0,0 +1,264 @@
+#include "snpomatic.h"
+
+// __________________________________________________________________________________________________________________________________________________________________ GLOBAL VARIABLES
+
+
+uchar char2num[256] , num2char[256] , to_lc[256] , to_uc[256] , char2IUPAC[256] , IUPAC2char[256] , isIUPAC[256] , base_rc[256] , IUPAC_variance[256] ;
+int globally_allowed_mismatches = 0 ;
+
+// __________________________________________________________________________________________________________________________________________________________________ GLOBAL FUNCTIONS
+
+
+/// Breaks up a string in a vector of strings at given sepatator
+void Tokenize(const string& str,
+ vector<string>& tokens,
+ const string& delimiters)
+{
+ // Skip delimiters at beginning.
+ string::size_type lastPos = str.find_first_not_of(delimiters, 0);
+ // Find first "non-delimiter".
+ string::size_type pos = str.find_first_of(delimiters, lastPos);
+
+ while (string::npos != pos || string::npos != lastPos)
+ {
+ // Found a token, add it to the vector.
+ tokens.push_back(str.substr(lastPos, pos - lastPos));
+ // Skip delimiters. Note the "not_of"
+ lastPos = str.find_first_not_of(delimiters, pos);
+ // Find next "non-delimiter"
+ pos = str.find_first_of(delimiters, lastPos);
+ }
+}
+
+/// Initializes IUPAC lookup tables
+void init_iupac () {
+ int a ;
+ for ( a = 0 ; a < 256 ; a++ ) char2IUPAC[a] = 0 ;
+ char2IUPAC['A'] = IUPAC_A ;
+ char2IUPAC['C'] = IUPAC_C ;
+ char2IUPAC['G'] = IUPAC_G ;
+ char2IUPAC['T'] = IUPAC_T ;
+// char2IUPAC['U'] = IUPAC_T ; // U <=> T
+ char2IUPAC['R'] = IUPAC_A|IUPAC_G ; // R Purine (A or G)
+ char2IUPAC['Y'] = IUPAC_C|IUPAC_T ; // Y Pyrimidine (C, T)
+ char2IUPAC['M'] = IUPAC_C|IUPAC_A ; // M C or A
+ char2IUPAC['K'] = IUPAC_T|IUPAC_G ; // K T, or G
+ char2IUPAC['W'] = IUPAC_T|IUPAC_A ; // W T, or A
+ char2IUPAC['S'] = IUPAC_C|IUPAC_G ; // S C or G
+ char2IUPAC['B'] = IUPAC_C|IUPAC_T|IUPAC_G ; // B C, T, or G (not A)
+ char2IUPAC['D'] = IUPAC_T|IUPAC_A|IUPAC_G ; // D A, T, or G (not C)
+ char2IUPAC['H'] = IUPAC_T|IUPAC_A|IUPAC_C ; // H A, T, or C (not G)
+ char2IUPAC['V'] = IUPAC_A|IUPAC_C|IUPAC_G ; // V A, C, or G (not T)
+ char2IUPAC['N'] = IUPAC_A|IUPAC_C|IUPAC_G|IUPAC_T ; // N Any base (A, C, G, T)
+ for ( a = 0 ; a < 256 ; a++ ) IUPAC2char[char2IUPAC[a]] = a ;
+ char2IUPAC['X'] = IUPAC_A|IUPAC_C|IUPAC_G|IUPAC_T ; // X Any base (A, C, G, T)
+
+ for ( a = 0 ; a < 256 ; a++ ) isIUPAC[a] = 1 ;
+ isIUPAC['A'] = isIUPAC['C'] = isIUPAC['G'] = isIUPAC['T'] = 0 ;
+ isIUPAC[IUPAC_A] = isIUPAC[IUPAC_C] = isIUPAC[IUPAC_G] = isIUPAC[IUPAC_T] = 0 ; // 1,2,4,8 - isIUPAC is safe for double use...
+ for ( a = 0 ; a < 256 ; a++ ) base_rc[a] = ' ' ;
+ base_rc['A'] = 'T' ;
+ base_rc['T'] = 'A' ;
+ base_rc['C'] = 'G' ;
+ base_rc['G'] = 'C' ;
+ base_rc['N'] = 'N' ;
+ base_rc['X'] = 'X' ;
+ for ( a = 0 ; a < 256 ; a++ ) IUPAC_variance[a] = 0 ;
+ for ( a = 0 ; a < 256 ; a++ ) {
+ if ( ( char2IUPAC[a] & IUPAC_A ) > 0 ) IUPAC_variance[a]++ ;
+ if ( ( char2IUPAC[a] & IUPAC_C ) > 0 ) IUPAC_variance[a]++ ;
+ if ( ( char2IUPAC[a] & IUPAC_G ) > 0 ) IUPAC_variance[a]++ ;
+ if ( ( char2IUPAC[a] & IUPAC_T ) > 0 ) IUPAC_variance[a]++ ;
+ }
+
+ for ( a = 0 ; a < 256 ; a++ ) to_lc[a] = a ;
+ for ( a = 'A' ; a <= 'Z' ; a++ ) to_lc[a] = a - 'A' + 'a' ;
+
+ for ( a = 0 ; a < 256 ; a++ ) to_uc[a] = a ;
+ for ( a = 'a' ; a <= 'z' ; a++ ) to_uc[a] = a - 'a' + 'A' ;
+
+ for ( a = 0 ; a < 256 ; a++ ) char2num[a] = 0 ;
+ char2num[(uchar)'A'] = 1 ;
+ char2num[(uchar)'C'] = 2 ;
+ char2num[(uchar)'G'] = 3 ;
+ char2num[(uchar)'T'] = 4 ;
+ char2num[(uchar)'N'] = 5 ;
+ char2num[(uchar)'X'] = 5 ;
+ for ( a = 0 ; a < 256 ; a++ ) num2char[char2num[a]] = a ;
+}
+
+/// IUPAC-tolerant basematch for two char strings
+bool matchIUPAC ( const char *c1 , const char *c2 ) {
+ int left = globally_allowed_mismatches ;
+ while ( *c1 && *c2 ) {
+ if ( char2IUPAC[*c1++] & char2IUPAC[*c2++] ) continue ;
+ if ( --left >= 0 ) continue ;
+ return false ;
+ }
+ return true ;
+}
+
+
+
+
+/// Loads an entire FASTA file into a vector of TChromosome.
+void load_all_chromosomes ( string filename , vector <TChromosome> &chrs , bool keep_original_snps ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Reading all chromosomes from %s ... " , filename.c_str() ) ; fflush(stdout); }
+ string *seq = NULL ;
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ if ( NULL == file ) {
+ cerr << "file could not be accessed - aborting.\n" ;
+ exit ( 1 ) ;
+ }
+ char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ;
+ while ( !feof(file) ) {
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ if ( *dummy == '>' ) {
+ for ( c1 = dummy+1 ; *c1 == ' ' ; c1++ ) ;
+ for ( c2 = c1 ; *c2 && *c2 != 10 && *c2 != 13 && *c2 != ' ' && *c2 != '|' ; c2++ ) ;
+ *c2 = 0 ;
+ chrs.push_back ( TChromosome() ) ;
+ chrs[chrs.size()-1].name = c1 ;
+ seq = &( chrs[chrs.size()-1].sequence ) ;
+ } else {
+ for ( c1 = dummy ; *c1 ; c1++ ) {
+ switch ( *c1 ) {
+ case 'a' : *c1 = 'A' ; break ;
+ case 'c' : *c1 = 'C' ; break ;
+ case 'g' : *c1 = 'G' ; break ;
+ case 't' : *c1 = 'T' ; break ;
+ case 'A' : break ;
+ case 'C' : break ;
+ case 'G' : break ;
+ case 'T' : break ;
+ case 10 : *c1 = 0 ; break ; // Line end
+ case 13 : *c1 = 0 ; break ; // Line end
+ default : if ( !keep_original_snps ) *c1 = 'X' ; // Setting everything else to 'X'
+ }
+// if ( !*c1 ) break ;
+ }
+ *seq += dummy ;
+ }
+ }
+ fclose ( file ) ;
+
+ if ( DEBUGGING ) { fprintf ( stdout , "read %d chromosomes.\n" , chrs.size() ) ; fflush(stdout); }
+}
+
+/// Includes SNPs from a .gff file into chromosomes.
+void incorporate_all_gff ( string filename , vector <TChromosome> &chrs ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Reading SNPS from %s ... ", filename.c_str() ) ; fflush(stdout); }
+
+ // Backup
+ for ( uint a = 0 ; a < chrs.size() ; a++ ) {
+ if ( chrs[a].original_sequence.empty() )
+ chrs[a].original_sequence = chrs[a].sequence ;
+ }
+
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ int snpcnt = 0 ;
+ char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ;
+ while ( !feof(file) ) {
+ vector <string> parts , parts2 ;
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ Tokenize ( dummy , parts , "\t" ) ;
+ if ( parts.size() < 7 ) continue ; // These is not the data you are looking for
+
+ int i ;
+ for ( i = 0 ; i < chrs.size() && chrs[i].name != parts[0] ; i++ ) ;
+ if ( i >= chrs.size() ) continue ; // Chromosome unknown, ignore this one
+
+ if ( parts[2] != "SNP" ) continue ; // Not a SNP
+ if ( parts[3] != parts[4] ) continue ; // Not a single point
+ if ( parts[5] != "+" ) continue ; // Wrong strand (?)
+ int pos = atoi ( parts[3].c_str() ) ;
+
+ char merge = chrs[i].sequence[pos-1] ;
+ Tokenize ( parts[6].c_str() , parts2 , "\"\" \"\"" ) ;
+ bool found_reference = false ;
+ for ( int b = 0 ; b < parts2.size() ; b++ ) {
+ int l = parts2[b].length() ;
+ if ( l < 2 ) continue ;
+ if ( parts2[b][l-2] != ':' ) continue ;
+ char c = parts2[b][l-1] ;
+ if ( c == chrs[i].sequence[pos-1] ) found_reference = true ;
+ merge = MERGE_IUPAC(merge,c) ;
+ }
+ if ( !found_reference ) { // Strange; maybe original sequence contains "N"
+ // We should count this!
+ continue ;
+ }
+ chrs[i].sequence[pos-1] = merge ;
+ snpcnt++ ;
+ }
+ fclose ( file ) ;
+ if ( DEBUGGING ) { fprintf ( stdout , "incorporated %d SNPs.\n" , snpcnt ) ; fflush(stdout); }
+}
+
+/// Include SNPs from a simple SNP list into chromosome.
+void incorporate_simple_snps ( string filename , vector <TChromosome> &chrs ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Reading SNPS from %s ... ", filename.c_str() ) ; fflush(stdout); }
+
+ // Backup
+ for ( uint a = 0 ; a < chrs.size() ; a++ ) {
+ if ( chrs[a].original_sequence.empty() )
+ chrs[a].original_sequence = chrs[a].sequence ;
+ }
+
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ int snpcnt = 0 , already_snp = 0 ;
+ char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ;
+ uint chr_unknown = 0 ;
+ uint reference_mismatch = 0 ;
+
+ while ( !feof(file) ) {
+ bool found_reference = false ;
+ vector <string> parts ;
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ if ( !*dummy ) break ;
+// cout << dummy << endl ;
+
+ Tokenize ( dummy , parts , "\t" ) ;
+ if ( parts.size() < 4 ) {
+ parts.clear() ;
+ Tokenize ( dummy , parts , " " ) ;
+ if ( parts.size() < 4 ) continue ; // These is not the data you are looking for
+ }
+
+ int i ;
+ for ( i = 0 ; i < chrs.size() && chrs[i].name != parts[0] ; i++ ) ;
+ if ( i >= chrs.size() ) { chr_unknown++ ; continue ; } // Chromosome unknown, ignore this one
+
+ int pos = atoi ( parts[1].c_str() ) ;
+ if ( pos < 1 || pos >= chrs[i].sequence.length() ) continue ; // Out-of-range
+ char c = parts[2][0] ;
+ char d = chrs[i].original_sequence[pos-1] ;
+ if ( d == c ) found_reference = true ;
+ else if ( d != 'A' && d != 'C' && d != 'G' && d != 'T' ) {
+ found_reference = true ;
+ already_snp++ ;
+ } else {
+ reference_mismatch++ ;
+ //cerr << "Called as " << c << ", is really " << d << endl ;
+ }
+ c = parts[3][0] ;
+ char merge = MERGE_IUPAC ( chrs[i].sequence[pos-1] , c ) ;
+ chrs[i].sequence[pos-1] = merge ;
+ snpcnt++ ;
+ }
+
+ fclose ( file ) ;
+ if ( DEBUGGING ) { fprintf ( stdout , "incorporated %d SNPs (%d unknown chromosomes, %d reference mismatches, %d already marked as SNPs in genome).\n" , snpcnt , chr_unknown , reference_mismatch , already_snp ) ; fflush(stdout); }
+}
+
+
+
+// _________________________________________________________________________________________________________________________________________________________________ OPERATORS
+
+/// Compares two indices; important for finding second-level indices only, so chromosome is not important.
+bool operator < ( const TPositionWithIndex &a , const TPositionWithIndex &b ) {
+ return a.index2 < b.index2 ;
+}
diff --git a/src/mapcontigs.cpp b/src/mapcontigs.cpp
new file mode 100644
index 0000000..a76d7ad
--- /dev/null
+++ b/src/mapcontigs.cpp
@@ -0,0 +1,90 @@
+#include "snpomatic.h"
+
+/// Shows possible parameters, and an (optional) error message.
+void die_with_description ( const char *msg = NULL ) {
+ if ( msg ) cerr << msg << endl ;
+ cerr << "Parameters : \n" ;
+ cerr << "--genome=GENOME_FILE\tFASTA file with chromosomes (mandatory)\n" ;
+ cerr << "--fasta=FASTA_FILE\tFASTA file with Solexa reads (mandatory, except when --fastq is used)\n" ;
+ cerr << "--output=SNP_FILE\tWhere the output goes (mandatory)\n" ;
+ cerr << "--mincontigsize=NUMBER\tMinimal size of contigs (optional)\n" ;
+ exit ( 0 ) ;
+}
+
+void analyze_contigs ( vector <TChromosome> &contigs ) {
+ vector <int> sizes ;
+ int a ;
+ for ( a = 0 ; a < contigs.size() ; a++ ) {
+ int l = contigs[a].sequence.length() ;
+ if ( l > 99 ) l = 100 ;
+ while ( sizes.size() <= l ) sizes.push_back ( 0 ) ;
+ sizes[l]++ ;
+ }
+
+ for ( a = 0 ; a < sizes.size() ; a++ ) {
+ if ( sizes[a] == 0 ) continue ;
+ cout << a << "\t" << sizes[a] << endl ;
+ }
+ exit ( 0 ) ;
+}
+
+int main ( int argc , char **argv ) {
+ string genome_file , fasta_file , output_file ;
+ uint min_contig_size = 0 ;
+ uint index_length_1 = 8 ;
+ uint index_length_2 = 16 ;
+
+ // Parameters
+ static struct option long_options[] = {
+ { "fasta" , optional_argument , 0 , 'a' } ,
+ { "genome" , optional_argument , 0 , 'g' } ,
+ { "output" , optional_argument , 0 , 'o' } ,
+ { "mincontigsize" , optional_argument , 0 , 'm' } ,
+ { 0 , 0 , 0 , 0 }
+ } ;
+
+ int c ;
+ int option_index = 0;
+ while ( -1 != ( c = getopt_long (argc, argv, "g:a::o::s::f::p::b::m::",long_options, NULL) ) ) {
+
+ switch ( c ) {
+ case 0 : break ;
+ case 'a' : fasta_file = optarg ; break ;
+ case 'g' : genome_file = optarg ; break ;
+ case 'o' : output_file = optarg ; break ;
+ case 'm' : min_contig_size = atoi ( optarg ) ; break ;
+ default : die_with_description("Unknown option") ;
+ }
+ }
+
+ if ( genome_file.empty() ) die_with_description("No genome file given!") ; // Need one
+ if ( fasta_file.empty() ) die_with_description("No Solexa file given!") ; // Need one
+ if ( output_file.empty() ) die_with_description("No output file given!") ; // Need one
+
+ init_iupac () ;
+
+ vector <TChromosome> contigs ;
+ load_all_chromosomes ( fasta_file , contigs ) ;
+ TChromosomalIndices contig_index ( index_length_1 , index_length_2 ) ;
+ contig_index.noshortcuts = true ;
+ contig_index.run_ends ( &contigs , min_contig_size ) ;
+
+// analyze_contigs ( contigs ) ;
+
+ vector <TChromosome> chrs ;
+ load_all_chromosomes ( genome_file , chrs ) ;
+ TChromosomalIndices genome_index ( index_length_1 , index_length_2 ) ;
+ genome_index.noshortcuts = true ;
+ genome_index.run ( &chrs ) ;
+
+ TChromosomeAlign ca ( chrs , genome_index ) ;
+ ca.align_contigs ( contigs , contig_index , 3 , output_file ) ;
+
+}
+
+/*
+./velveth temp1 21 -shortPaired ~/som++/q1_no_match.fasta
+./velvetg temp1
+
+rm mapcontigs ; make mapcontigs ; time ./mapcontigs --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=../velvet/velvet_0.5.05/temp1/contigs.fa --out=new_snps_60 --mincontigsize=60
+*/
\ No newline at end of file
diff --git a/src/reassemble.cpp b/src/reassemble.cpp
new file mode 100644
index 0000000..c3b69df
--- /dev/null
+++ b/src/reassemble.cpp
@@ -0,0 +1,884 @@
+/*! \mainpage SNP-O-MATIC
+ *
+ * \section authors Authors
+ *
+ * - Magnus Manske (mm6 at sanger.ac.uk)
+ *
+ * \section intro_sec Introduction
+ *
+ * SNP-O-MATIC can align Solexa reads against a reference sequence.
+ * Known SNPs can be added to the reference sequence as IUPAC bases.
+ * Only perfectly matching Solexa reads, with respect to the IUPAC bases, will be aligned.
+ * Reads can be binned into those that do not match the reference, those that match uniquely, and those that match two or more times.
+ *
+ * \section install_sec Installation
+ *
+ * SNP-O-MATIC comes with a Makefile. The Intel compiler is used by default for performance reasons, but the g++ variant is in the Makefile as well in the form of comments.
+ */
+
+#include "snpomatic.h"
+#include <algorithm>
+#include <map>
+#include <fstream>
+
+#define MAX_NEW 1
+
+TChromosomalIndices *idx ;
+
+class TRead ;
+
+class TPart {
+ public :
+ void do_index ( uint id , bool rc = false ) ;
+ string seq ;
+} ;
+
+class TRead {
+ public :
+ TRead () {} ;
+ TRead ( const string &seq ) ;
+ TRead ( const string &seq1 , const string &seq2 ) ;
+ inline void reset ( uint id ) ;
+ bool used ;
+ vector <uint> related ;
+ TPart p1 , p2 ;
+} ;
+
+TRead::TRead ( const string &seq ) {
+ int l = seq.length() / 2 ;
+ p1.seq = seq.substr ( 0 , l ) ;
+ p2.seq = seq.substr ( l , l ) ;
+}
+
+TRead::TRead ( const string &seq1 , const string &seq2 ) {
+ p1.seq = seq1 ;
+ p2.seq = seq2 ;
+}
+
+void TRead::reset ( uint id ) {
+ p1.do_index ( id , false ) ;
+ p2.do_index ( id , true ) ;
+ used = false ;
+ related.clear() ;
+}
+
+void TPart::do_index ( uint id , bool rc ) {
+ for ( int a = 0 ; a + idx->index_length_double < seq.length() ; a++ ) {
+ uint i1 = idx->calculate_index ( (char*) seq.c_str() + a ) ;
+ TPositionWithIndex p ;
+ p.index2 = idx->calculate_index2 ( (char*) seq.c_str() + a + idx->index_length ) ;
+ p.position = a ;
+ p.reverse_complement = rc ;
+ p.chromosome = id ;
+ idx->position[i1].push_back ( p ) ;
+ break ;
+ }
+}
+
+
+
+string pad ( int a ) {
+ string ret ;
+ while ( ret.length() < a ) ret += " " ;
+ return ret ;
+}
+
+
+
+/*
+int remove_full_sequences ( vector <TRead> &vr , vector <string> &vs ) {
+ int min = 30 ;
+ int cnt = 0 ;
+ for ( int a = 0 ; a < vr.size() ; a++ ) {
+ if ( vr[a].p1.seq.length() < min ) continue ;
+ string m = vr[a].p1.seq.substr ( vr[a].p1.seq.length() - min , min ) ;
+ int n = vr[a].p2.seq.find ( m , 0 ) ;
+ if ( n == std::string::npos ) continue ;
+ m = vr[a].p1.seq + vr[a].p2.seq.substr ( n ) ;
+ vs.push_back ( m ) ;
+ cnt++ ;
+ vr[a].p1.seq = "" ;
+ vr[a].p2.seq = "" ;
+ }
+ return cnt ;
+}
+
+void create_contigs ( vector <TRead> &vr , vector <TRead> &vn ) {
+ for ( uint r1 = 0 ; r1 < vr.size() ; r1++ ) {
+ if ( vr[r1].used ) continue ;
+ for ( uint r2x = 0 ; r2x < vr[r1].related.size() ; r2x++ ) {
+ uint r2 = vr[r1].related[r2x] ;
+ if ( vr[r1].p1.seq == vr[r2].p1.seq ) continue ;
+ if ( vr[r1].p2.seq == vr[r2].p2.seq ) continue ;
+ string t1 = combine_strings ( vr[r1].p1.seq , vr[r2].p1.seq ) ;
+ if ( t1.empty() ) continue ;
+ string t2 = combine_strings ( vr[r1].p2.seq , vr[r2].p2.seq ) ;
+ if ( t2.empty() ) continue ;
+ vr[r1].used = true ;
+ vr[r2].used = true ;
+ vn.push_back ( TRead ( t1 , t2 ) ) ;
+// cout << t1 << "\t" << t2 << endl ;
+ break ;
+ }
+ }
+}
+
+void join_single_sequences ( vector <string> &vs ) {
+ vector <string> vs2 ;
+ vs.swap ( vs2 ) ;
+ do {
+ for ( int a = 0 ; a < vs.size() ; a++ ) {
+ if ( vs[a].empty() ) continue ;
+ vs2.push_back ( vs[a] ) ;
+ }
+ vs.swap ( vs2 ) ;
+ vs2.clear() ;
+ for ( int a = 0 ; a < vs.size() ; a++ ) {
+ if ( vs[a].length() < 100 ) continue ;
+ for ( int b = a+1 ; b < vs.size() ; b++ ) {
+ if ( vs[b].empty() ) continue ;
+ string m = combine_strings ( vs[a] , vs[b] , 100 ) ;
+ if ( m.empty() ) continue ;
+ vs[a].clear() ;
+ vs[b].clear() ;
+ vs2.push_back ( m ) ;
+ }
+ }
+ } while ( vs2.size() > 0 ) ;
+}
+
+void interlink_reads ( vector <TRead> &vr ) {
+ uint p1s = idx->position.size() ;
+ for ( uint i1 = 0 ; i1 < p1s ; i1++ ) {
+ uint last = 0 ;
+ uint p2s = idx->position[i1].size() ;
+ for ( uint a = 1 ; a < p2s ; a++ ) {
+ if ( idx->position[i1][last].index2 == idx->position[i1][a].index2 ) {
+ uint n2 = idx->position[i1][a].chromosome ;
+ for ( uint b = last ; b < a ; b++ ) {
+ uint n1 = idx->position[i1][b].chromosome ;
+ vr[n2].related.push_back ( n1 ) ;
+ vr[n1].related.push_back ( n2 ) ;
+ }
+ } else {
+ last = a ;
+ }
+ }
+ }
+
+ uint cnt = 0 ;
+ for ( uint a = 0 ; a < vr.size() ; a++ ) {
+ sort ( vr[a].related.begin() , vr[a].related.end() ) ;
+ vector <uint> vi ;
+ vi.reserve ( vr[a].related.size() ) ;
+ for ( uint b = 0 ; b < vr[a].related.size() ; b++ ) {
+ if ( b > 0 && vr[a].related[b-1] == vr[a].related[b] ) continue ;
+ vi.push_back ( vr[a].related[b] ) ;
+ }
+ cnt += vi.size() ;
+ vr[a].related.swap ( vi ) ;
+ }
+ cerr << cnt << " crosslinks found\n" ;
+}
+
+
+void align_perfect ( vector <TRead> &reads , int offset ) {
+ int a , b ;
+ if ( DEBUGGING ) { fprintf ( stdout , "Indexing ... " ) ; fflush(stdout); }
+ for ( a = 0 ; a < reads.size() ; a++ ) reads[a].reset ( a ) ;
+ if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); }
+
+ if ( DEBUGGING ) { fprintf ( stdout , "Scanning with offset %d ... " , offset ) ; fflush(stdout); }
+
+ for ( uint i1 = 0 ; i1 < idx->position.size() ; i1++ ) {
+ uint i1s = idx->position[i1].size() ;
+ for ( uint a = 0 ; a < i1s ; a++ ) {
+ if ( idx->position[i1][a].position > offset ) continue ;
+ uint i2 = idx->position[i1][a].index2 ;
+ for ( uint b = a+1 ; b < i1s && idx->position[i1][b].index2 == i2 ; b++ ) {
+ if ( idx->position[i1][b].position > offset ) continue ;
+ uint r1 = idx->position[i1][a].chromosome ;
+ uint r2 = idx->position[i1][b].chromosome ;
+ if ( reads[r2].p1.seq != reads[r1].p1.seq && reads[r2].p1.seq != reads[r1].p2.seq &&
+ reads[r2].p2.seq != reads[r1].p1.seq && reads[r2].p2.seq != reads[r1].p2.seq ) continue ;
+ }
+ }
+ }
+
+ if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); }
+}
+
+string combine_strings ( const string &s1 , const string &s2 , int max = -1 ) {
+ if ( max == -1 ) max = idx->index_length_double ;
+ string ret ;
+ int s1l = s1.length() ;
+ int s2l = s2.length() ;
+ int maxa = s2l - max ;
+ int a , b ;
+ for ( a = - ( s1l - max ) ; a < maxa ; a++ ) {
+ int maxb = s2l < s1l-a ? s2l : s1l-a ;
+ for ( b = a<0?-a:0 ; b < maxb && s1[a+b] == s2[b] ; b++ ) ;
+ if ( b < maxb ) continue ;
+
+ if ( !ret.empty() ) return string () ; // Do not allow multiple hits
+
+ if ( a < 0 ) {
+ ret = s2.substr ( 0 , -a ) + s1 ;
+ if ( s1l-a < s2l ) ret += s2.substr ( s1l-a ) ;
+ } else {
+ ret = s1.substr ( 0 , a ) + s2 ;
+ if ( s2l+a < s1l ) ret += s1.substr ( s2l+a ) ;
+ }
+ }
+ return ret ;
+}
+
+*/
+
+string merge_strings ( string s1 , string s2 ) {
+ string ret ;
+ if ( s1.length() > s2.length() ) {
+ string tmp = s1 ;
+ s1 = s2 ;
+ s2 = tmp ;
+ }
+ int sl1 = s1.length() ;
+ int sl2 = s2.length() ;
+
+ int best = sl1 + sl2 ;
+ int bestpos = 0 ;
+
+ int a , b ;
+ int min_offset = 5 ;
+ for ( a = -sl1 + min_offset ; a < sl2 - min_offset ; a++ ) {
+ int match = 0 ;
+ int mismatch = 0 ;
+ for ( b = 0 ; b < sl1 ; b++ ) {
+ int p2 = a + b ;
+ if ( p2 < 0 || p2 >= sl2 ) continue ;
+ if ( s1[b] == 'N' || s2[p2] == 'N' ) continue ;
+ if ( s1[b] == s2[p2] ) match++ ;
+ else mismatch++ ;
+ }
+ if ( mismatch > match / 3 ) continue ;
+ int total = match + mismatch ;
+ if ( mismatch < best ) {
+ best = mismatch ;
+ bestpos = a ;
+ }
+ }
+
+ if ( best > 5 ) return ret ; // Hard cutoff
+
+ ret = s2 + pad ( sl1 ) ;
+ if ( bestpos < 0 ) {
+ ret = pad ( -bestpos ) + ret ;
+ bestpos = 0 ;
+ }
+
+ // Merge
+ for ( a = 0 ; a < sl1 ; a++ ) {
+ b = bestpos + a ;
+ if ( s1[a] == ret[b] ) continue ;
+ if ( ret[b] != 'N' && s1[a] != ' ' ) {
+ if ( ret[b] == ' ' ) ret[b] = s1[a] ;
+ else ret[b] = 'N' ;
+ }
+ }
+
+ // Trim blanks
+ for ( a = 0 ; ret[a] == ' ' ; a++ ) ;
+ if ( a > 0 ) ret = ret.substr ( a , ret.length()-a ) ;
+ for ( a = 0 ; a < ret.length() && ret[a] != ' ' ; a++ ) ;
+ ret = ret.substr ( 0 , a ) ;
+
+ return ret ;
+}
+
+
+void read_fastq ( string filename , vector <TRead> &vr , int min_qual = 0 ) {
+ if ( DEBUGGING ) { fprintf ( stdout , "Reading solexa pair data from %s ... " , filename.c_str() ) ; fflush(stdout); }
+
+ FILE * file = fopen ( filename.c_str() , "r" ) ;
+ if ( !file ) {
+ cerr << "Could not open FASTQ file \"" << filename << "\" - aborting." << endl ;
+ exit ( 1 ) ;
+ }
+ char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ;
+ string lastseq , lastqual , last_solexa_name ;
+ bool fasta = true ;
+ bool last_was_quality = false , lastone = false , eof = false , skip = false , goodqual = true ;
+ uint readcount = 0 , matched_reads = 0 , total_matches = 0 , skipped = 0 , badqualcnt = 0 ;
+
+ while ( !eof || lastone ) {
+ if ( lastone ) {
+ strcpy ( dummy , "@dummy" ) ;
+ if ( fasta ) *dummy = '>' ;
+ } else {
+ *dummy = 0 ;
+ fgets ( dummy , READ_CHAR_BUFFER , file ) ;
+ }
+
+ // Remove EOL
+ for ( c1 = dummy ; *c1 > 13 ; c1++ ) ;
+ *c1 = 0 ;
+
+ if ( ( fasta && *dummy == '>' ) || *dummy == '@' ) {
+ if ( *dummy == '@' ) fasta = false ;
+
+ if ( !skip ) {
+ for ( c1 = (char*) lastseq.c_str() ; *c1 && !isIUPAC[*c1] ; c1++ ) ;
+ if ( *c1 ) { // Contains IUPAC
+ } else if ( !goodqual ) {
+ badqualcnt++ ;
+ } else if ( !lastseq.empty() ) {
+ vr.push_back ( TRead ( lastseq ) ) ;
+ }
+ }
+
+ goodqual = true ;
+ if ( lastone ) break ;
+
+ // Clear data
+ c1 = dummy ;
+ if ( *c1 == '>' || *c1 == '@' ) c1++ ;
+ while ( *c1 == ' ' ) c1++ ;
+ last_solexa_name = c1 ;
+ readcount++ ;
+ last_was_quality = false ;
+ lastseq.clear() ;
+ lastqual.clear() ;
+
+ } else if ( *dummy == '+' ) {
+ last_was_quality = true ;
+ } else if ( last_was_quality ) {
+ if ( min_qual > 0 ) {
+ goodqual = true ;
+ for ( c1 = dummy ; goodqual && *c1 > 20 ; c1++ ) {
+// cout << (int) *c1-33 << ", " ;
+ goodqual = ( *c1-33 ) >= min_qual ;
+ }
+// cout << endl ;
+ }
+ last_was_quality = false ;
+// lastqual += dummy ;
+ } else {
+ lastseq += dummy ;
+ }
+
+ eof = feof ( file ) ;
+// if ( vr.size() >= 100000 ) eof = 1 ; // TESTING!!!!
+ if ( eof ) lastone = true ; // Fake last read
+ }
+
+ if ( DEBUGGING ) { fprintf ( stdout , "scanned %d solexa reads, %d (%2.2f%%) matched , %d total matches, %d skipped, %d bad quality.\n" ,
+ readcount , matched_reads , (float) matched_reads * 100 / readcount , total_matches , skipped , badqualcnt ) ; fflush(stdout); }
+}
+
+
+// __________________________________________________________________________________________________________________________________________________________________ MAIN
+
+/// Shows possible parameters, and an (optional) error message.
+void die_with_description ( const char *msg = NULL ) {
+ if ( msg ) cerr << msg << endl ;
+ cerr << "Parameters : \n" ;
+ cerr << "--genome=GENOME_FILE FASTA file with chromosomes (mandatory)\n" ;
+ cerr << "--fasta=FASTA_FILE FASTA file with Solexa reads (mandatory, except when --fastq or --index is used)\n" ;
+ cerr << "--fastq=FASTQ_FILE FASTQ file with Solexa reads (mandatory, except when --fasta or --index is used)\n" ;
+ cerr << "--nono=FILENAME File with list of read names (!) to ignore (optional)\n" ;
+ cerr << "--regions=REGION_FILE Region file for finding new SNPs (optional) [DEPRECATED]\n" ;
+ cerr << "--snps=SNP_FILE Simple SNP file (optional)\n" ;
+ cerr << "--gff=GFF_FILE GFF file with SNPs (optional)\n" ;
+ cerr << "--uniqueness=FILE Output a uniqueness data file for the reference; no Solexa reads needed; implies --noshortcuts (optional)\n" ;
+ cerr << "--pileup=FILE Outputs complete pileup into that file (optional)\n" ;
+ cerr << "--cigar=FILE Outputs alignment in CIGAR format (optional)\n" ;
+ cerr << "--gffout=FILE Outputs reads in GFF format (optional)\n" ;
+ cerr << "--coverage=FILENAME Outputs (high depth) coverage data (optional)\n" ;
+ cerr << "--wobble=FILENAME Outputs a list of possible variations (optional; paired reads only) [UNDER CONSTRUCTION]\n" ;
+ cerr << "--fragmentplot=FILENAME Outputs a plot of fragment size distribution to a file (optional)\n" ;
+ cerr << "--snpsinreads=FILENAME Outputs a list of reads containing known SNPs to a file (optional)\n" ;
+ cerr << "--indelplot=FILENAME Outputs indel data to a file (optional)\n" ;
+ cerr << "--inversions=FILENAME For paired reads, writes read matches indicating inversions into a file (optional)\n" ;
+ cerr << "--bins=FILE_PREFIX Outputs no-match, single-match and multi-match Solexa reads into prefixed files (optional)\n" ;
+ cerr << "--binmask=MASK Mask of 1s and 0s to turn off individual bins. Order: No match, single match, multi-match, IUPAC.\n" ;
+ cout << " Example: 0100 creates only single-match bin. (optional; default:1111)\n" ;
+ cerr << "--pair=NUMBER For paired reads, the length of the first part of the read (mandatory for paired reads)\n" ;
+ cerr << "--fragment=NUMBER For paired reads, the average fragment length (mandatory for paired reads)\n" ;
+ cerr << "--variance=NUMBER For paired reads, the variance of the fragment length to either side (optional; default: 1/4 of fragment size)\n" ;
+ cerr << "--wobblemax=NUMBER Maximum number of mismatches for wobble (optional; default 2; see --wobble)\n" ;
+ cerr << "--mspi=NUMBER Maximum number of SNPs per chromosomal index (optional; default:8)\n" ;
+ cerr << "--index=FILENAME Index filename (index will be created if it does not exist; optional)\n" ;
+ cerr << "--noshortcuts Will process all chrososomal regions, even those with lots'o'repeats (optional; no value)\n" ;
+ cerr << "--snpsonly Only lists found SNPs in the pileup (optional; no value)\n" ;
+ cerr << "--chromosome=NAME Discards all chromosomes but NAME prior to run (optional)\n" ;
+ cerr << "--chop=NUMBER For paired reads, if one but not the other matches, shorten the other by NUMBER bases (optional)\n" ;
+ cerr << "--index1=NUMBER Length of internal index 1 (optional; default:10)\n" ;
+ cerr << "--index2=NUMBER Length of internal index 2 (optional; default:16)\n" ;
+ cerr << "--multimatch Puts a multiple-matching read to a random position (optional) [currently paired reads only]\n" ;
+ cerr << "--foum For paired reads, at least one read has to match uniquely in the genome (force one unique match) (optional)\n" ;
+// cerr << "--featuretable=OUTPUT_FILE\tOutputs a DDBJ/EMBL/GenBank feature table for matched reads (optional; paired reads only for now)\n" ;
+ exit ( 0 ) ;
+}
+
+
+class TContig {
+ public :
+ TContig () { remove = false ; }
+ string sequence ;
+// vector <unsigned int> reads ;
+ bool remove ;
+} ;
+
+vector <TContig> contigs ;
+map <string,bool> cache ;
+vector <TPositionWithIndex> contig_idx ;
+
+void initialize_contigs ( vector <TRead> & vr , string start ) {
+ unsigned int a , b ;
+ for ( a = 0 ; a < vr.size() ; a++ ) {
+// char *cc = (char*) vr[a].p1.seq.c_str() ;
+// if ( *(cc) != 'A' || *(cc+1) != 'T' || *(cc+2) != 'G' ) continue ;
+
+
+ if ( vr[a].p1.seq.substr(0,3) == start && cache.end() == cache.find ( vr[a].p1.seq ) ) {
+ cache[vr[a].p1.seq] = true ;
+ TContig c ;
+ c.sequence = vr[a].p1.seq ;
+ contigs.push_back ( c ) ;
+ vr[a].used = true ;
+ }
+
+ if ( vr[a].p2.seq.substr(0,3) == start && cache.end() == cache.find ( vr[a].p2.seq ) ) {
+ cache[vr[a].p2.seq] = true ;
+ TContig c ;
+ c.sequence = vr[a].p2.seq ;
+ contigs.push_back ( c ) ;
+ vr[a].used = true ;
+ }
+
+ }
+ cout << contigs.size() << " contigs created (from " << vr.size() << " reads).\n" ;
+}
+
+void add_contig ( string seq , int readnum ) {
+ if ( cache.end() == cache.find ( seq ) ) {
+ TContig c ;
+ c.sequence = seq ;
+ contigs.push_back ( c ) ;
+ cache[seq] = 1 ;
+ } else {
+ }
+}
+
+
+int grow_contigs ( vector <TRead> & vr , int pos , string start ) {
+ cout << "Growing at position " << pos << " ... " ;
+ int read_length = vr[0].p1.seq.length() ;
+ int ret = 0 ;
+
+ contig_idx.clear() ;
+ for ( unsigned int a = 0 ; a < contigs.size() ; a++ ) {
+ TPositionWithIndex p ;
+ p.index2 = idx->calculate_index2 ( (char*) contigs[a].sequence.c_str() ) ;
+ p.chromosome = a ;
+ contig_idx.push_back ( p ) ;
+ }
+ sort ( contig_idx.begin() , contig_idx.end() ) ;
+
+ for ( unsigned int a = 0 ; a < contigs.size() ; a++ ) {
+ if ( contigs[a].remove ) continue ; // Marked for deletion
+ string cseq = contigs[a].sequence ;
+ if ( read_length + pos > cseq.length() + MAX_NEW ) continue ;
+
+ int len = cseq.length() - pos ;
+ if ( len > read_length ) len = read_length ;
+ string csub = cseq.substr ( pos , len ) ;
+
+ contigs[a].remove = true ;
+ bool altered = false ;
+
+ if ( csub.substr ( 0 , 3 ) == start ) { // Elongate with another "contig"
+ csub = cseq.substr ( pos , cseq.length()-pos ) ;
+ unsigned int i2 = idx->calculate_index2 ( (char*) csub.c_str() ) ;
+ unsigned int b ;
+ for ( unsigned int b2 = 0 ; b2 < contig_idx.size() ; b2++ ) {
+ if ( contig_idx[b2].index2 > i2 ) break ;
+ if ( contig_idx[b2].index2 < i2 ) continue ;
+ b = contig_idx[b2].chromosome ;
+ if ( a == b ) continue ; // No self-elongation
+ // if ( contigs[b].remove ) continue ;
+ if ( contigs[b].sequence.length() <= csub.length() ) continue ; // Contig smaller than fragment
+ if ( contigs[b].sequence.substr ( 0 , csub.length() ) == csub ) {
+ string s = cseq.substr ( 0 , pos ) + contigs[b].sequence ;
+ add_contig ( s , -1 ) ;
+ altered = true ;
+ contigs[b].remove = true ;
+ break ;
+ }
+ }
+ } else { // Try to elongate with reads
+ uint i1 = idx->calculate_index ( (char*) csub.c_str() ) ;
+ uint i2 = idx->calculate_index2 ( (char*) csub.c_str() + idx->index_length ) ;
+
+
+ for ( unsigned d = 0 ; d < idx->position[i1].size() ; d++ ) {
+ if ( idx->position[i1][d].index2 > i2 ) break ;
+ if ( idx->position[i1][d].index2 < i2 ) continue ;
+ unsigned int b = idx->position[i1][d].chromosome ;
+ if ( vr[b].used ) continue ;
+
+ string *sp = idx->position[i1][d].reverse_complement ? &(vr[b].p2.seq) : &(vr[b].p1.seq) ;
+
+ if ( sp->substr ( 0 , len ) != csub ) continue ;
+ string s = cseq.substr ( 0 , pos ) + *sp ;
+ if ( s.length() <= cseq.length() ) continue ; // No new length increase
+ add_contig ( s , b ) ;
+ vr[b].used = true ;
+ altered = true ;
+ }
+ }
+
+ if ( altered ) ret++ ;
+ else contigs[a].remove = false ;
+ }
+
+ // Cleanup
+ for ( int a = 0 ; a < contigs.size() ; a++ ) {
+ if ( !contigs[a].remove ) continue ;
+ cache.erase ( contigs[a].sequence ) ;
+ contigs[a] = contigs[contigs.size()-1] ;
+ contigs.pop_back() ;
+ a-- ;
+ }
+
+ cout << ret << " contigs grew (" << contigs.size() << " contigs total)." << endl ;
+ return ret ;
+}
+
+void assemble_contigs ( vector <TRead> &vr , vector <TChromosome> &chrs , string start ) {
+ initialize_contigs ( vr , start ) ;
+ int a = 1 , growing , growth ;
+ do {
+ growing = grow_contigs ( vr , a++ , start ) ;
+ if ( growing > 0 ) growth = 10 ;
+ else growth-- ;
+ } while ( growth > 0 ) ;
+
+
+ cout << "Writing contigs to contigs.fasta\n" ;
+ ofstream fout( "contigs.fasta", ios::out|ios::trunc );
+ for ( a = 0 ; a < contigs.size() ; a++ ) {
+ if ( contigs[a].sequence.length() == vr[0].p1.seq.length() ) continue ;
+
+ string s = contigs[a].sequence ;
+ //new_contigs.push_back ( s ) ;
+
+ // FIXME : Add to chrs
+
+ fout << ">contig_" << a << "_" << s.length() << endl ;
+ while ( !s.empty() ) {
+ if ( s.length() >= 60 ) {
+ fout << s.substr ( 0 , 60 ) << endl ;
+ s = s.substr ( 60 ) ;
+ } else {
+ fout << s << endl ;
+ break ;
+ }
+ }
+ }
+ fout.close() ;
+
+ int used = 0 ;
+ for ( a = 0 ; a < vr.size() ; a++ ) {
+ if ( vr[a].used ) used++ ;
+ vr[a].used = false ; // Resetting
+ }
+ cout << used << " of " << vr.size() << " reads (" << 100 * used / vr.size() << "%) used in contigs\n" ;
+
+ // Cleanup
+ contigs.clear() ;
+ cache.clear() ;
+ contig_idx.clear() ;
+}
+
+void write_chromosomes_to_fasta ( vector <TChromosome> &chrs ) {
+ cout << "Writing contigs to chrs.fasta\n" ;
+ ofstream fout( "chrs.fasta", ios::out|ios::trunc );
+ map <string,bool> seen ;
+ for ( int a = 0 ; a < chrs.size() ; a++ ) {
+ string s = chrs[a].sequence ;
+ if ( seen.end() != seen.find ( s ) ) continue ; // Wrote that one already
+ seen[s] = true ;
+ fout << ">" << chrs[a].name << endl ;
+ while ( !s.empty() ) {
+ if ( s.length() >= 60 ) {
+ fout << s.substr ( 0 , 60 ) << endl ;
+ s = s.substr ( 60 ) ;
+ } else {
+ fout << s << endl ;
+ break ;
+ }
+ }
+ }
+ fout.close() ;
+}
+
+void make_read_indices ( vector <TRead> &vr ) {
+ // Create indices
+ for ( uint a = 0 ; a < vr.size() ; a++ ) {
+ vr[a].reset ( a ) ;
+ }
+
+ // Sort indices
+ for ( uint a = 0 ; a < idx->position.size() ; a++ ) {
+ sort ( idx->position[a].begin() , idx->position[a].end() ) ;
+ }
+}
+
+
+
+void join_chromosomes ( vector <TRead> &vr , uint index_length_1 , uint index_length_2 , string genome_file ) {
+
+ for ( int cnt = 0 ; cnt < 100 ; cnt++ ) {
+ cout << "Chromosome joining, round " << cnt << endl ;
+
+ vector <TChromosome> chrs ;
+ TChromosomalIndices contig_index ( index_length_1 , index_length_2 ) ;
+// contig_index.noshortcuts = noshortcuts ;
+ idx = &contig_index ;
+
+ load_all_chromosomes ( genome_file , chrs , true ) ;
+ contig_index.run ( &chrs ) ;
+
+ TChromosomeAlign ca ( chrs , contig_index ) ;
+ ca.init () ;
+ for ( uint a = 0 ; a < vr.size() ; a++ ) {
+ pwi_vector res1 , res2 ;
+ ca.get_align_solexa_read ( vr[a].p1.seq.c_str() , res1 ) ;
+ if ( res1.size() != 1 ) continue ;
+ ca.get_align_solexa_read ( vr[a].p2.seq.c_str() , res2 ) ;
+ if ( res2.size() != 1 ) continue ;
+ if ( res1[0].chromosome == res2[0].chromosome ) continue ;
+ if ( res1[0].reverse_complement == res2[0].reverse_complement ) continue ;
+
+ // Abusing uniqueness vector to store connections
+ chrs[res1[0].chromosome].uniqueness.push_back ( res2[0].chromosome ) ;
+ chrs[res2[0].chromosome].uniqueness.push_back ( res1[0].chromosome ) ;
+ }
+
+ for ( uint a = 0 ; a < chrs.size() ; a++ ) {
+ sort ( chrs[a].uniqueness.begin() , chrs[a].uniqueness.end() ) ;
+ chrs[a].uniqueness.push_back ( chrs.size()+1 ) ; // Dummy, will be ignored
+ uint last = chrs.size()+1 ;
+ int cnt = -1 ;
+ for ( uint b = 0 ; b < chrs[a].uniqueness.size() ; b++ ) {
+ if ( chrs[a].uniqueness[b] == last ) {
+ cnt++ ;
+ } else {
+ if ( cnt >= 0 ) {
+ if ( cnt > 5 ) {
+ string joined = merge_strings ( chrs[a].sequence , chrs[last].sequence ) ;
+ if ( !joined.empty() ) {
+ chrs[a].sequence = joined ;
+ chrs[last].sequence = joined ;
+ }
+// cout << a << " => " << last << " (" << cnt << "x)\n" ;
+// cout << chrs[a].sequence << endl << chrs[last].sequence << endl ;
+// cout << merge_strings ( chrs[a].sequence , chrs[last].sequence ) << endl ;
+ }
+ }
+ cnt = 1 ;
+ last = chrs[a].uniqueness[b] ;
+ }
+ }
+ }
+
+ write_chromosomes_to_fasta ( chrs ) ;
+ }
+}
+
+
+
+
+/// Well, guess what.
+int main ( int argc , char **argv ) {
+ string genome_file , gff_file , simple_snp_file , fastq_file , fasta_file , pileup_file , bin_prefix , regions_file ;
+ string cigar_file , wobble_file , nono , fragmentplot , featuretable , gffout , coverage_file , uniqueness , snpsinreads ;
+ string index_file , indelplot_file , inversions_file , chromosome ;
+ string binmask ( "1111" ) ;
+ uint wobblemax = 2 ;
+ uint mspi = 8 ;
+ uint pair_length = 0 ;
+ uint fragment_length = 0 ;
+ uint index_length_1 = 10 ;
+ uint index_length_2 = 16 ;
+ int noshortcuts = false ;
+ int snpsonly = false ;
+ int chop = 0 ;
+ uint variance = 0 ;
+ int multimatch = false ;
+ int force_one_unique_match = false ;
+
+ // Parameters
+ static struct option long_options[] = {
+ { "genome" , required_argument , 0 , 'g' } ,
+ { "fasta" , optional_argument , 0 , 'a' } ,
+ { "fastq" , optional_argument , 0 , 'q' } ,
+ { "snps" , optional_argument , 0 , 's' } ,
+ { "coverage" , optional_argument , 0 , 'e' } ,
+ { "gff" , optional_argument , 0 , 'f' } ,
+ { "pileup" , optional_argument , 0 , 'p' } ,
+ { "bins" , optional_argument , 0 , 'b' } ,
+ { "binmask" , optional_argument , 0 , 'z' } ,
+ { "index" , optional_argument , 0 , 'x' } ,
+ { "nono" , optional_argument , 0 , 'n' } ,
+ { "uniqueness" , optional_argument , 0 , '0' } ,
+ { "wobble" , optional_argument , 0 , 'w' } ,
+ { "pair" , optional_argument , 0 , 'i' } ,
+ { "fragment" , optional_argument , 0 , 't' } ,
+ { "featuretable" , optional_argument , 0 , 'u' } ,
+ { "gffout" , optional_argument , 0 , 'y' } ,
+ { "variance" , optional_argument , 0 , 'v' } ,
+ { "mspi" , optional_argument , 0 , 'm' } ,
+ { "cigar" , optional_argument , 0 , 'c' } ,
+ { "fragmentplot" , optional_argument , 0 , 'o' } ,
+ { "snpsinreads" , optional_argument , 0 , 'd' } ,
+ { "regions" , optional_argument , 0 , 'r' } ,
+ { "index1" , optional_argument , 0 , '1' } ,
+ { "index2" , optional_argument , 0 , '2' } ,
+ { "chop" , optional_argument , 0 , '3' } ,
+ { "indelplot" , optional_argument , 0 , '4' } ,
+ { "wobblemax" , optional_argument , 0 , '5' } ,
+ { "inversions" , optional_argument , 0 , '6' } ,
+ { "chromosome" , optional_argument , 0 , '7' } ,
+ { "multimatch" , optional_argument , &multimatch , true } ,
+ { "noshortcuts" , optional_argument , &noshortcuts , true } ,
+ { "foum" , optional_argument , &force_one_unique_match , true } ,
+ { "snpsonly" , optional_argument , &snpsonly , true } ,
+ { 0 , 0 , 0 , 0 }
+ } ;
+
+
+ int c ;
+ int option_index = 0;
+ while ( -1 != ( c = getopt_long (argc, argv, "g:a::q::s::f::p::b::r::w::n::",long_options, NULL) ) ) {
+
+ switch ( c ) {
+ case 0 : break ;
+ case 'g' : genome_file = optarg ; break ;
+ case 'a' : fasta_file = optarg ; break ;
+ case 'q' : fastq_file = optarg ; break ;
+ case 's' : simple_snp_file = optarg ; break ;
+ case 'f' : gff_file = optarg ; break ;
+ case 'e' : coverage_file = optarg ; break ;
+ case 'p' : pileup_file = optarg ; break ;
+ case 'b' : bin_prefix = optarg ; break ;
+ case 'z' : binmask = optarg ; break ;
+ case '0' : uniqueness = optarg ; break ;
+ case 'w' : wobble_file = optarg ; break ;
+ case 'n' : nono = optarg ; break ;
+ case 'x' : index_file = optarg ; break ;
+ case 'i' : pair_length = atoi ( optarg ) ; break ;
+ case 'o' : fragmentplot = optarg ; break ;
+ case 'u' : featuretable = optarg ; break ;
+ case 'y' : gffout = optarg ; break ;
+ case 'd' : snpsinreads = optarg ; break ;
+ case 'v' : variance = atoi ( optarg ) ; break ;
+ case 't' : fragment_length = atoi ( optarg ) ; break ;
+ case 'm' : mspi = atoi ( optarg ) ; break ;
+ case '1' : index_length_1 = atoi ( optarg ) ; break ;
+ case '2' : index_length_2 = atoi ( optarg ) ; break ;
+ case '3' : chop = atoi ( optarg ) ; break ;
+ case '4' : indelplot_file = optarg ; break ;
+ case '5' : wobblemax = atoi ( optarg ) ; break ;
+ case '6' : inversions_file = optarg ; break ;
+ case '7' : chromosome = optarg ; break ;
+ case 'c' : cigar_file = optarg ; break ;
+ case 'r' : regions_file = optarg ; break ;
+ default : die_with_description("Unknown option") ;
+ }
+ }
+ binmask += "0000" ; // Ignore bins if shorter string was supplied
+
+ init_iupac () ;
+ TChromosomalIndices main_index ( index_length_1 , index_length_2 ) ;
+ main_index.noshortcuts = noshortcuts ;
+ idx = &main_index ;
+
+ // FASTA sequence? Just try to join them up and ignore everything smaller than 100 bases.
+/* if ( !fasta_file.empty() ) {
+ vector <TChromosome> chrs ;
+ load_all_chromosomes ( fasta_file , chrs ) ;
+ vector <string> vs ;
+ for ( int a = 0 ; a < chrs.size() ; a++ ) vs.push_back ( chrs[a].sequence ) ;
+ join_single_sequences ( vs ) ;
+ for ( int a = 0 ; a < vs.size() ; a++ ) {
+ if ( vs[a].length() < 100 ) continue ; // Hard cutoff
+ cout << ">CONTIG" << a << "_(" << vs[a].length() << ")" << endl << vs[a] << endl ;
+ }
+ return 0 ;
+ }*/
+
+ vector <TRead> vr ;
+
+
+ read_fastq ( fastq_file , vr , 5 ) ;
+ make_read_indices ( vr ) ;
+ vector <TChromosome> chrs ;
+ assemble_contigs ( vr , chrs , "ATG" ) ;
+
+// join_chromosomes ( vr , index_length_1 , index_length_2 , genome_file ) ;
+
+
+
+
+
+
+
+
+// align_perfect ( vr , 0 ) ;
+
+
+
+/*
+ vector <string> vs ;
+ vector <TRead> vr , vn ;
+ read_fastq ( fastq_file , vr ) ;
+ vn.swap ( vr ) ;
+ int iteration = 0 ;
+ do {
+ cerr << "Iteration " << ++iteration << endl ;
+ cerr << vn.size() << " items left" << endl ;
+ vn.swap ( vr ) ;
+ vn.clear() ;
+ vn.reserve ( vr.size() ) ;
+ interlink_reads ( vr ) ;
+ for ( uint a = 0 ; a < idx->position.size() ; a++ ) {
+ idx->position[a].clear() ;
+ }
+ create_contigs ( vr , vn ) ;
+
+ // for ( int a = 0 ; a < vr.size() ; a++ ) {
+ // if ( vr[a].used ) continue ;
+ // vn.push_back ( vr[a] ) ;
+ // }
+
+ for ( int a = 0 ; a < vn.size() ; a++ ) {
+ vn[a].reset ( a ) ;
+ }
+
+ if ( remove_full_sequences ( vn , vs ) == 0 && iteration > 5 ) break ;
+ if ( iteration > 15 ) break ; // Hard cutoff
+ } while ( 1 ) ;
+
+// join_single_sequences ( vs ) ;
+
+ for ( int a = 0 ; a < vs.size() ; a++ ) {
+ if ( vs[a].length() < 100 ) continue ; // Hard cutoff
+ cout << ">CONTIG" << a << "_(" << vs[a].length() << ")" << endl << vs[a] << endl ;
+ }*/
+ return 0 ;
+}
+
+
+// rm -f reassemble ; make clean ; make reassemble ; ./reassemble --fastq=/nfs/repository/d0071/SLX_ST_1451_BK1/1547_s_2.fastq --pair=76 --genome=contigs.fasta
diff --git a/src/snpomatic.h b/src/snpomatic.h
new file mode 100644
index 0000000..327c356
--- /dev/null
+++ b/src/snpomatic.h
@@ -0,0 +1,247 @@
+#ifndef __SNPOMATIC_H__
+#define __SNPOMATIC_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <getopt.h>
+#include <string.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include <algorithm>
+
+using namespace std ;
+
+#ifdef __APPLE__
+#define ulong unsigned long
+#endif
+
+// __________________________________________________________________________________________________________________________________________________________________ DEFINES
+
+// DEBUGGING: set to 1 to get status information over STDOUT
+#define DEBUGGING 1
+
+// READ_CHAR_BUFFER: Buffer size for reading a FASTA/FASTQ line
+#define READ_CHAR_BUFFER 1024
+
+// MINQUAL: Make bases in output lowercase if their quality falls below this threshold
+#define MINQUAL 20
+
+// MAXLINES: Max number of lines in alignment
+#define MAXLINES 70
+
+// Misc stuff
+#define IUPAC_A 1
+#define IUPAC_C 2
+#define IUPAC_G 4
+#define IUPAC_T 8
+#define IUPAC_ALL 15
+
+// Macros (to save runtime)
+#define MERGE_IUPAC(_a,_b) IUPAC2char[char2IUPAC[_a]|char2IUPAC[_b]]
+#define twotoone(__lane,__pos) ((__pos)*MAXLINES+(__lane))
+
+#define REVERSE_COMPLEMENT_CHARP(__start) { \
+ char *__c , *__d , __e ; \
+ for ( __c = __start ; *__c ; __c++ ) ; \
+ for ( __c-- , __d = __start ; __d < __c ; __c-- , __d++ ) { \
+ __e = base_rc[*__d] ; \
+ *__d = base_rc[*__c] ; \
+ *__c = __e ; \
+ } \
+ if ( __c == __d ) *__c = base_rc[*__c] ; \
+}
+
+
+// __________________________________________________________________________________________________________________________________________________________________ TYPEDEFS
+
+class TPositionWithIndex ;
+class TChromosome ;
+class TCoverage ;
+
+typedef unsigned char uchar ;
+typedef unsigned int index2type ;
+typedef vector <uint> vuint ;
+typedef vector <TPositionWithIndex> pwi_vector ;
+typedef vector <TChromosome> chrvec ;
+typedef vector <TPositionWithIndex>::iterator TPWIi ;
+typedef vector <string>::iterator TSVI ;
+typedef vector <TCoverage> TVC ;
+
+// __________________________________________________________________________________________________________________________________________________________________ GLOBAL VARIABLES
+
+
+extern uchar char2num[256] , num2char[256] , to_lc[256] , to_uc[256] , char2IUPAC[256] , IUPAC2char[256] , isIUPAC[256] , base_rc[256] , IUPAC_variance[256] ;
+extern int globally_allowed_mismatches ;
+
+// __________________________________________________________________________________________________________________________________________________________________ GLOBAL FUNCTIONS
+
+void Tokenize(const string& str, vector<string>& tokens, const string& delimiters = " ") ;
+void init_iupac () ;
+bool matchIUPAC ( const char *c1 , const char *c2 ) ;
+void load_all_chromosomes ( string filename , vector <TChromosome> &chrs , bool keep_original_snps = false ) ;
+void incorporate_all_gff ( string filename , vector <TChromosome> &chrs ) ;
+void incorporate_simple_snps ( string filename , vector <TChromosome> &chrs ) ;
+
+// __________________________________________________________________________________________________________________________________________________________________ CLASSES
+
+class TCoverage {
+ public :
+ TCoverage () { A = C = G = T = rc = 0 ; }
+ uint A , C , G , T , rc ;
+} ;
+
+/// Stores and outputs the pileup.
+class TAlignmentOutput {
+ public :
+ char *align ; ///< Two-dimensional alignment array for the bases.
+ char *qalign ; ///< Same as *align, but for quality.
+ vector <bool> align_full ; ///< Shortcut mechanism to speed up alignment of extreme depths.
+
+ void init ( TChromosome *_chr ) ; ///< Initialization of align and qualign memory.
+
+ void show_pileup ( FILE *pileup , bool snps_only ) ; ///< Outputs the alignment to the given file.
+ void add_align ( const string &seq , const string &quality , uint pos , int chromosome ) ;
+ TChromosome *chr ; ///< Pointer back to the chromosome.
+
+ protected:
+ uint align_size ;
+} ;
+
+/// Stores secondary index, position, chromosome, and reverse/complement information. Heavily used by TChromosomalIndices.
+class TPositionWithIndex {
+ public :
+ index2type index2 ; ///< The secondary index.
+ uint position ; ///< The position in the chromosome.
+ bool reverse_complement ; ///< Is this reverse/complement?
+ uint chromosome ; ///< Number of chromosome.
+} ;
+
+
+/// Calculates and stores the indices on the chromosomes.
+class TChromosomalIndices {
+ public:
+ TChromosomalIndices ( int _index_length , int _index_length2 ) ;
+ void run_reads ( string filename ) ;
+ void run ( chrvec *_chrs ) ;
+ void run_ends ( chrvec *_chrs , int min_length = 0 ) ;
+ void run_regions ( chrvec *_chrs , string filename ) ;
+ uint calculate_index ( const char *sequence_kmer ) ;
+ index2type calculate_index2 ( const char *sequence_kmer ) ;
+ void save_index ( string filename ) ;
+ void load_index ( string filename ) ;
+ void uniqueness ( string filename ) ;
+ string decompile ( uint index , uint bases ) ;
+
+ vector <pwi_vector> position ; ///< The complete index.
+ int index_length , index_length2 , index_length_double , noshortcuts ;
+ int max_snps_per_index ; ///< Maximal number of known SNPs per index (1+2); prevent excessive memory usage.
+ chrvec *chrs ; ///< Pointer to vector with all the chromosomes (TChromosome).
+ string index_file ;
+ uint index_from , index_to ;
+ uint index_steps ;
+
+ private:
+ int add_sequence ( char *begin , char *cur , int from , int chr , bool rc , int nos = 0 ) ;
+ void add_index ( char *start , bool rc , uint pos , uchar chromosome ) ;
+ void reassemble_reads ( FILE *file , uint i1 , index2type i2a , index2type i2b ) ;
+ bool align_read2contigs ( char *read , vector <string> &vs ) ;
+
+ char tmp[READ_CHAR_BUFFER] ;
+} ;
+
+/// Holds an individual chromosome.
+class TChromosome {
+ public:
+ string name , sequence , original_sequence ;
+ TAlignmentOutput ao ;
+ TVC coverage ;
+ vector <uint> uniqueness ;
+
+ void read_chromosome ( string filename , string chromosome_name ) ;
+ void incorporate_known_snps ( string filename ) ;
+ void incorporate_known_snps_gff ( string filename ) ;
+ void add_coverage ( const string &seq , uint pos , bool rc ) ;
+} ;
+
+/// Reads Solexa data from FASTA/FASTQ and finds perfect matches in the reference.
+class TChromosomeAlign {
+ public:
+ TChromosomeAlign ( chrvec &_chrs , TChromosomalIndices &_index ) ;
+ void init () ;
+ void align_solexa_fasta ( string filename ) ;
+ void align_solexa_fastq ( string filename ) ;
+ void align_solexa_paired ( string filename , string filename2 , uint read_length_1 , uint fragment_length , uint range ) ;
+ void align_solexa_fastq_variety ( string filename ) ;
+ void show_pileup ( bool snps_only = false ) ;
+ void align_contigs ( chrvec &contigs , TChromosomalIndices &contig_index , uint maxerr , string output_filename ) ;
+ void add_nono_list ( string filename ) ;
+ bool is_nono ( const string &s ) ;
+ void dump_coverage () ;
+ void finish_sqlite ( int fragment , int variance , int read_length ) ;
+ void get_align_solexa_read ( const char *_seq , pwi_vector &results ) ;
+
+ FILE *pileup , *binfile_no_match , *binfile_single_match , *binfile_multi_match , *snpsinreads ;
+ FILE *binfile_iupac , *cigar , *wobble , *fragmentplot , *featuretable , *gffout , *coverage ;
+ FILE *indelplot , *inversions , *sqlite , *sam , *spancontigs , *faceaway , *rpa ;
+ bool using_bins , multimatch , singlematch , force_one_unique_match ;
+ uint chop , wobblemax , single_read_length ;
+
+ protected:
+
+ int align_solexa_read ( char *_seq , const string &_quality , bool inv_compl = false ) ;
+ void add_cigar_sqlite ( const string &sequence , const string &name , uint position , uint chromosome , char orientation ) ;
+ virtual uint align_solexa_paired_read ( const string &name , const string &sequence , string quality , uint read_length_1 , uint fragment_length , uint range ) ;
+ int paired_read_combine ( const pwi_vector &v1 , const pwi_vector &v2 , uint fragment_length , uint range , uint read_length_1 , const string &seq1 , const string &qual1 , const string &seq2 , const string &qual2 , bool wobbling = false , bool inverting = false , int order = 0 ) ;
+ void write2bin ( const string &name , const string &sequence , const string &quality , int matches ) ;
+ void run_wobble ( const string &seq1 , const string &seq2 , const string &qual1 , const string &qual2 , const pwi_vector &res1 , const pwi_vector &res2 , uint fragment_length ) ;
+ void wobble4snps ( const pwi_vector &res , const string &seq , uint fragment_size , uint range , const string &anchor_seq , bool rc ) ;
+ void add_coverage ( const string &seq , uint pos ) ;
+ int try_matching_read ( const string & seq , string quality ) ;
+ string get_ref_substr ( const TPositionWithIndex &p , int length ) ;
+ void add_snpsinreads ( const string &seq , const string &quality , int chr , int pos , int readpart ) ;
+ int count_snpsinreads ( const string &seq , int chr , int pos ) ;
+ void add_sam ( const string &seq , const string &quality , int chr , int pos , int readpart , int matepos , int ins_size , bool rc , bool as_pair , bool mate_rc , bool unique_match ) ;
+ void wobble_single_read ( char *seq , const string &_quality ) ;
+ string matchIUPAC_tolerant ( const char *c1 , const char *c2 , int tolerance ) ;
+ string generate_sqlite_paired_command ( int chr , string seq1 , int pos1 , string seq2 , int pos2 , char mode = ' ' , string add_keys = "" , string add_values = "" ) ;
+ void add2sqlite_cache ( string s ) ;
+ void rpa_out ( const TPositionWithIndex &a , const TPositionWithIndex &b , uint read_length_1 , const string &seq1 , const string &qual1 , const string &seq2 , const string &qual2 ) ;
+
+ uint max_align ;
+ string last_solexa_name ;
+ vector <string> sqlite_out ;
+ FILE *sqlite_cache ;
+ string sqlite_cache_name ;
+ chrvec *chrs ;
+ TChromosomalIndices *index ;
+ bool fasta , use_nono , sam_wrote ;
+ pwi_vector wobble_results ;
+ vector <string> nono ;
+ uint fragment_range ;
+ vector <int> fragment_stats ;
+ bool snpsinreads_first ;
+ bool sqlite_prefix_first ;
+ bool rpa_header_written ;
+ string sqlite_prefix ;
+ string rgid ;
+} ;
+
+class prc_cache {
+ public :
+ TPositionWithIndex a , b ;
+} ;
+
+
+bool operator < ( const TPositionWithIndex &a , const TPositionWithIndex &b ) ;
+bool tpi_full_compare ( const TPositionWithIndex a , const TPositionWithIndex b ) ;
+string dor ( string s ) ;
+string doc ( string s ) ;
+string dorc ( string s ) ;
+
+#endif
diff --git a/src/ungap.cpp b/src/ungap.cpp
new file mode 100644
index 0000000..1927f1e
--- /dev/null
+++ b/src/ungap.cpp
@@ -0,0 +1,274 @@
+#include "snpomatic.h"
+#include <fstream>
+
+int maxfrag = 0 ;
+
+class TLeftovers {
+ public :
+ string seq ;
+ int pos ; // Not used ATM
+ bool used ;
+ vector <uint> keys ;
+ vector <uint> related ;
+} ;
+
+class TRegionScan : public TChromosomeAlign {
+ public:
+ TRegionScan ( chrvec &_chrs , TChromosomalIndices &_index ) : TChromosomeAlign ( _chrs , _index ) {} ;
+ void assemble_leftovers ( int cnt ) ;
+ vector <TLeftovers> leftovers ;
+
+ protected:
+ virtual uint align_solexa_paired_read ( const string &name , const string &sequence , string quality , uint read_length_1 , uint fragment_length , uint range ) ;
+ int get_overlap ( TLeftovers &l1 , TLeftovers &l2 ) ;
+ TLeftovers merge_reads ( TLeftovers &l1 , TLeftovers &l2 , int overlap ) ;
+} ;
+
+bool operator < ( const TLeftovers &a , const TLeftovers &b ) {
+ return a.seq < b.seq ;
+}
+
+TLeftovers TRegionScan::merge_reads ( TLeftovers &l1 , TLeftovers &l2 , int overlap ) {
+ TLeftovers l ;
+ l.seq = l2.seq.substr ( 0 , overlap ) + l1.seq ;
+ for ( int n = 0 ; n + 16 < l.seq.length() ; n++ ) {
+ l.keys.push_back ( index->calculate_index2 ( l.seq.c_str() + n ) ) ;
+ }
+ l.used = false ;
+ return l ;
+}
+
+int TRegionScan::get_overlap ( TLeftovers &l1 , TLeftovers &l2 ) {
+ int a, b ;
+ int o = -1 ;
+ for ( a = 0 ; a < l1.keys.size() ; a++ ) {
+ for ( b = a+1 ; b < l2.keys.size() ; b++ ) {
+ if ( l1.keys[a] != l2.keys[b] ) continue ;
+ int diff = b - a ;
+ if ( o == -1 ) o = diff ;
+ else if ( o != diff ) return -1 ; // More than one overlap
+ }
+ }
+/*
+ string s1 = l1.seq.substr ( o , l1.seq.length() - o ) ;
+ string s2 = l2.seq ;
+
+ if ( s1.length() < s2.length() ) s2 = s2.substr ( 0 , s1.length() ) ;
+ else if ( s2.length() < s1.length() ) s1 = s1.substr ( 0 , s2.length() ) ;
+ if ( s1 != s2 ) return -1 ; // No match
+*/
+ return o ;
+}
+
+void TRegionScan::assemble_leftovers ( int cnt ) {
+ uint a , b ;
+ vector <TLeftovers> lo ;
+
+ cout << "Assembling, depth " << cnt << endl ;
+
+ for ( a = 0 ; a+1 < leftovers.size() ; a++ ) {
+ for ( b = a+1 ; b < leftovers.size() ; b++ ) {
+ int o = get_overlap ( leftovers[a] , leftovers[b] ) ;
+ if ( o < 0 ) continue ; // No overlap, or more than one
+ leftovers[a].used = true ;
+ leftovers[b].used = true ;
+ lo.push_back ( merge_reads ( leftovers[a] , leftovers[b] , o ) ) ;
+// cout << leftovers[a].seq << " <=> " << leftovers[b].seq << " : " << o << endl ;
+ }
+ }
+ if ( cnt <= 0 ) return ;
+
+ for ( a = 0 ; a < leftovers.size() ; a++ ) {
+ if ( leftovers[a].used ) continue ;
+ lo.push_back ( leftovers[a] ) ;
+ }
+
+ // Add unique sequences as new source set
+ sort ( lo.begin() , lo.end() ) ;
+ leftovers.clear() ;
+ for ( a = 0 ; a < lo.size() ; a++ ) {
+ if ( a == 0 || lo[a-1].seq != lo[a].seq ) leftovers.push_back ( lo[a] ) ;
+ }
+
+ assemble_leftovers ( cnt - 1 ) ;
+}
+
+uint TRegionScan::align_solexa_paired_read ( const string &name , const string &sequence , string quality , uint read_length_1 , uint fragment_length , uint range ) {
+ string seq1 = sequence.substr ( 0 , read_length_1 ) ;
+ string seq2 = sequence.substr ( read_length_1 , sequence.length() - read_length_1 ) ;
+
+ while ( quality.length() < sequence.length() ) quality += (char) 33+30 ;
+ string qual1 = quality.substr ( 0 , read_length_1 ) ;
+ string qual2 = quality.substr ( read_length_1 , sequence.length() - read_length_1 ) ;
+
+ pwi_vector res1 , res2 ;
+ get_align_solexa_read ( seq1.c_str() , res1 ) ;
+ get_align_solexa_read ( seq2.c_str() , res2 ) ;
+ if ( res1.size() + res2.size() == 0 ) return 0 ;
+
+ sort ( res1.begin() , res1.end() , tpi_full_compare ) ;
+ sort ( res2.begin() , res2.end() , tpi_full_compare ) ;
+
+ int ret = 0 ;
+ ret += paired_read_combine ( res1 , res2 , maxfrag/2 , maxfrag/2 , read_length_1 , seq1 , qual1 , seq2 , qual2 ) ;
+ ret += paired_read_combine ( res2 , res1 , maxfrag/2 , maxfrag/2 , read_length_1 , seq2 , qual2 , seq1 , qual1 ) ;
+
+ if ( ret == 0 && res1.size() + res2.size() == 1 ) {
+ string seq ;
+ int newpos , chr ;
+ if ( res1.size() ) {
+ chr = res1[0].chromosome ;
+ if ( res1[0].reverse_complement ) { seq = seq2 ; newpos = res1[0].position - fragment_length ; }
+ else { seq = dorc ( seq2 ) ; newpos = (int) res1[0].position + fragment_length ; }
+ } else {
+ chr = res2[0].chromosome ;
+ if ( res2[0].reverse_complement ) { seq = seq1 ; newpos = (int) res2[0].position - fragment_length ; }
+ else { seq = dorc ( seq1 ) ; newpos = res2[0].position + fragment_length ; }
+ }
+ if ( newpos > 0 || newpos + seq.length() < (*chrs)[chr].sequence.length() ) {
+ TLeftovers l ;
+ l.seq = seq ;
+ l.pos = newpos ;
+ l.used = false ;
+ for ( int n = 0 ; n + 16 < seq.length() ; n++ ) {
+ l.keys.push_back ( index->calculate_index2 ( seq.c_str() + n ) ) ;
+ }
+ leftovers.push_back ( l ) ;
+ }
+ }
+
+ return ret ;
+}
+
+// __________________________________________________________________________________________________________________________________________________________________ MAIN
+
+/// Shows possible parameters, and an (optional) error message.
+void die_with_description ( const char *msg = NULL ) {
+ if ( msg ) cerr << msg << endl ;
+ cerr << "Parameters : \n" ;
+ exit ( 1 ) ;
+}
+
+/// Well, guess what.
+int main ( int argc , char **argv ) {
+ string genome_file , gff_file , simple_snp_file , fastq_file , chromosome , name , coverage_file ;
+ string bin_prefix , binmask ;
+ uint mspi = 8 ;
+ uint pair_length = 0 ;
+ uint fragment_length = 0 ;
+ uint variance = 0 ;
+ uint index_length_1 = 10 ;
+ uint index_length_2 = 16 ;
+ int noshortcuts = false ;
+ uint from = 0 , to = 0 ;
+
+ // Parameters
+ static struct option long_options[] = {
+ { "genome" , required_argument , 0 , 'g' } ,
+ { "fastq" , optional_argument , 0 , 'q' } ,
+ { "snps" , optional_argument , 0 , 's' } ,
+ { "gff" , optional_argument , 0 , 'f' } ,
+ { "pair" , optional_argument , 0 , 'i' } ,
+ { "fragment" , optional_argument , 0 , 't' } ,
+ { "variance" , optional_argument , 0 , 'v' } ,
+ { "coverage" , optional_argument , 0 , 'e' } ,
+ { "mspi" , optional_argument , 0 , 'm' } ,
+ { "chr" , optional_argument , 0 , 'c' } ,
+ { "from" , optional_argument , 0 , 'r' } ,
+ { "to" , optional_argument , 0 , 'o' } ,
+ { "name" , optional_argument , 0 , 'n' } ,
+ { "bins" , optional_argument , 0 , 'b' } ,
+ { "binmask" , optional_argument , 0 , 'z' } ,
+ { 0 , 0 , 0 , 0 }
+ } ;
+
+ int c ;
+ int option_index = 0;
+ while ( -1 != ( c = getopt_long (argc, argv, "g:a::q::s::f::p::b::r::w::n::",long_options, NULL) ) ) {
+
+ switch ( c ) {
+ case 0 : break ;
+ case 'g' : genome_file = optarg ; break ;
+ case 'q' : fastq_file = optarg ; break ;
+ case 's' : simple_snp_file = optarg ; break ;
+ case 'f' : gff_file = optarg ; break ;
+ case 'c' : chromosome = optarg ; break ;
+ case 'n' : name = optarg ; break ;
+ case 'e' : coverage_file = optarg ; break ;
+ case 'i' : pair_length = atoi ( optarg ) ; break ;
+ case 't' : fragment_length = atoi ( optarg ) ; break ;
+ case 'v' : variance = atoi ( optarg ) ; break ;
+ case 'm' : mspi = atoi ( optarg ) ; break ;
+ case 'r' : from = atoi ( optarg ) ; break ;
+ case 'o' : to = atoi ( optarg ) ; break ;
+ case 'b' : bin_prefix = optarg ; break ;
+ case 'z' : binmask = optarg ; break ;
+ default : die_with_description("Unknown option") ;
+ }
+ }
+ binmask += "0000" ; // Ignore bins if shorter string was supplied
+
+ if ( genome_file.empty() ) die_with_description("No genome file given!") ; // Need this
+ if ( fastq_file.empty() ) die_with_description("No Solexa file given!") ; // Need one
+ if ( chromosome.empty() ) die_with_description("No chromosome given!") ; // Need one
+ if ( name.empty() ) die_with_description("No name given!") ; // Need one
+ if ( from == 0 ) die_with_description("Needs --from!") ; // Need one
+ if ( to == 0 ) die_with_description("Needs --to!") ; // Need one
+
+
+ // Incorporating known SNPs and indexing chromosomes
+ init_iupac () ;
+ TChromosomalIndices index ( index_length_1 , index_length_2 ) ;
+ index.noshortcuts = noshortcuts ;
+ index.max_snps_per_index = mspi ;
+ vector <TChromosome> chrs ;
+ load_all_chromosomes ( genome_file , chrs ) ;
+
+ if ( !gff_file.empty() ) incorporate_all_gff ( gff_file , chrs ) ;
+ if ( !simple_snp_file.empty() ) incorporate_simple_snps ( simple_snp_file , chrs ) ;
+
+ // Remove all but one chromosome
+ uint a ;
+ for ( a = 0 ; a < chrs.size() ; a++ ) {
+ if ( chrs[a].name == chromosome ) continue ;
+ for ( int b = a ; b+1 < chrs.size() ; b++ ) chrs[b] = chrs[b+1] ;
+ chrs.pop_back() ;
+ a-- ;
+ }
+ chrs[0].sequence = chrs[0].sequence.substr ( from-1 , to-from ) ;
+ maxfrag = chrs[0].sequence.length() ;
+ cout << "Keeping " << chrs[0].sequence.length() << " bases from " << chromosome << endl ;
+
+ index.run ( &chrs ) ;
+
+ // Now look at the SOlexa reads and do the thing
+ TRegionScan ca ( chrs , index ) ;
+ if ( !coverage_file.empty() ) ca.coverage = fopen ( coverage_file.c_str() , "w" ) ;
+
+ ca.init () ;
+/*
+ if ( !bin_prefix.empty() ) {
+ string ext = ".fasta" ;
+ if ( !fastq_file.empty() ) ext = ".fastq" ;
+ cout << "Using binmask " << binmask << endl ;
+ if ( binmask[0] == '1' ) ca.binfile_no_match = fopen ( string ( bin_prefix + "_no_match" + ext ).c_str() , "w" ) ;
+ if ( binmask[1] == '1' ) ca.binfile_single_match = fopen ( string ( bin_prefix + "_single_match" + ext ).c_str() , "w" ) ;
+ if ( binmask[2] == '1' ) ca.binfile_multi_match = fopen ( string ( bin_prefix + "_multi_match" + ext ).c_str() , "w" ) ;
+ if ( binmask[3] == '1' ) ca.binfile_iupac = fopen ( string ( bin_prefix + "_iupac" + ext ).c_str() , "w" ) ;
+ ca.using_bins = true ;
+ }
+*/
+ ca.align_solexa_paired ( fastq_file , pair_length , fragment_length , variance ) ;
+// ca.assemble_leftovers ( 2 ) ;
+
+ string fasta_output = name + ".fasta" ;
+ fstream filestr;
+ filestr.open ( fasta_output.c_str() , fstream::out ); //| fstream::app);
+ for ( a = 0 ; a < ca.leftovers.size() ; a++ ) {
+ filestr << ">" << a << endl << ca.leftovers[a].seq << endl ;
+ }
+ filestr.close();
+
+ ca.dump_coverage() ;
+// cout << chrs[0].sequence << endl ;
+}
diff --git a/src/variety.cpp b/src/variety.cpp
new file mode 100644
index 0000000..711b812
--- /dev/null
+++ b/src/variety.cpp
@@ -0,0 +1,249 @@
+/*! \mainpage SNP-O-MATIC
+ *
+ * \section authors Authors
+ *
+ * - Magnus Manske (mm6 at sanger.ac.uk)
+ *
+ * \section intro_sec Introduction
+ *
+ * SNP-O-MATIC can align Solexa reads against a reference sequence.
+ * Known SNPs can be added to the reference sequence as IUPAC bases.
+ * Only perfectly matching Solexa reads, with respect to the IUPAC bases, will be aligned.
+ * Reads can be binned into those that do not match the reference, those that match uniquely, and those that match two or more times.
+ *
+ * \section install_sec Installation
+ *
+ * SNP-O-MATIC comes with a Makefile. The Intel compiler is used by default for performance reasons, but the g++ variant is in the Makefile as well in the form of comments.
+ */
+
+#include "snpomatic.h"
+
+
+
+
+// __________________________________________________________________________________________________________________________________________________________________ MAIN
+
+/// Shows possible parameters, and an (optional) error message.
+void die_with_description ( const char *msg = NULL ) {
+ if ( msg ) cerr << msg << endl ;
+ cerr << "Parameters : \n" ;
+ cerr << "--genome=GENOME_FILE\tFASTA file with chromosomes (mandatory)\n" ;
+ cerr << "--fasta=FASTA_FILE\tFASTA file with Solexa reads (mandatory, except when --fastq or --index is used)\n" ;
+ cerr << "--fastq=FASTQ_FILE\tFASTQ file with Solexa reads (mandatory, except when --fasta or --index is used)\n" ;
+ cerr << "--nono=FILENAME\tFile with list of read names (!) to ignore (optional)\n" ;
+ cerr << "--regions=REGION_FILE\tRegion file for finding new SNPs (optional)\n" ;
+ cerr << "--snps=SNP_FILE\t\tSimple SNP file (optional)\n" ;
+ cerr << "--gff=GFF_FILE\t\tGFF file with SNPs (optional)\n" ;
+ cerr << "--uniqueness=OUTPUT_FILE\tOutput a uniqueness data file for the reference; no Solexa reads needed; implies --noshortcuts (optional)\n" ;
+ cerr << "--pileup=OUTPUT_FILE\tOutputs complete pileup into that file (optional)\n" ;
+ cerr << "--cigar=OUTPUT_FILE\tOutputs alignment in CIGAR format (optional)\n" ;
+ cerr << "--gffout=OUTPUT_FILE\tOutputs reads in GFF format (optional)\n" ;
+// cerr << "--featuretable=OUTPUT_FILE\tOutputs a DDBJ/EMBL/GenBank feature table for matched reads (optional; paired reads only for now)\n" ;
+ cerr << "--bins=FILE_PREFIX\tOutputs no-match, single-match and multi-match Solexa reads into prefixed files (optional)\n" ;
+ cerr << "--binmask=MASK\tMask of 1s and 0s to turn off individual bins. Order: No match, single match, multi-match, IUPAC. Example: 0100 creates only single-match bin. (optional; default:1111)\n" ;
+ cerr << "--pair=NUMBER\t\tFor paired reads, the length of the first part of the read (mandatory for paired reads)\n" ;
+ cerr << "--fragment=NUMBER\tFor paired reads, the average fragment length (mandatory for paired reads)\n" ;
+ cerr << "--variance=NUMBER\tFor paired reads, the variance of the fragment length to either side (optional; default: 1/4 of fragment size)\n" ;
+ cerr << "--wobble=FILENAME\tOutputs a list of possible variations (optional; paired reads only) [UNDER CONSTRUCTION]\n" ;
+ cerr << "--coverage=FILENAME\tOutputs (high depth) coverage data (optional)\n" ;
+ cerr << "--index=FILENAME\tIndex filename (index will be created if it does not exist; optional)\n" ;
+ cerr << "--mspi=NUMBER\t\tMaximum number of SNPs per chromosomal index (optional; default:8)\n" ;
+ cerr << "--noshortcuts\t\tWill process all chrososomal regions, even those with lots'o'repeats (optional; no value)\n" ;
+ cerr << "--snpsonly\t\tOnly lists found SNPs in the pileup (optional; no value)\n" ;
+ cerr << "--fragmentplot=FILENAME\tWrites a plot of fragment size distribution to a file (optional)\n" ;
+ exit ( 0 ) ;
+}
+
+/// Well, guess what.
+int main ( int argc , char **argv ) {
+ string genome_file , fastq_file ;
+ uint index_length_1 = 4 ;
+ uint index_length_2 = 10 ;
+ int noshortcuts = true ;
+ uint mspi = 8 ;
+
+ static struct option long_options[] = {
+ { "genome" , required_argument , 0 , 'g' } ,
+ { "fastq" , optional_argument , 0 , 'q' } ,
+ { 0 , 0 , 0 , 0 }
+ } ;
+
+ int c ;
+ int option_index = 0;
+ while ( -1 != ( c = getopt_long (argc, argv, "g:a::q::s::f::p::b::r::w::n::",long_options, NULL) ) ) {
+
+ switch ( c ) {
+ case 0 : break ;
+ case 'g' : genome_file = optarg ; break ;
+ case 'q' : fastq_file = optarg ; break ;
+ }
+ }
+
+ init_iupac () ;
+ TChromosomalIndices index ( index_length_1 , index_length_2 ) ;
+ index.noshortcuts = noshortcuts ;
+ index.max_snps_per_index = mspi ;
+ vector <TChromosome> chrs ;
+ load_all_chromosomes ( genome_file , chrs ) ;
+ index.run ( &chrs ) ;
+
+ TChromosomeAlign ca ( chrs , index ) ;
+ ca.align_solexa_fastq_variety ( fastq_file ) ;
+
+
+/*
+ string genome_file , gff_file , simple_snp_file , fastq_file , fasta_file , pileup_file , bin_prefix , regions_file ;
+ string cigar_file , wobble_file , nono , fragmentplot , featuretable , gffout , coverage_file , uniqueness ;
+ string index_file ;
+ string binmask ( "1111" ) ;
+ uint mspi = 8 ;
+ uint pair_length = 0 ;
+ uint fragment_length = 0 ;
+ uint index_length_1 = 10 ;
+ uint index_length_2 = 16 ;
+ int noshortcuts = false ;
+ int snpsonly = false ;
+ uint variance = 0 ;
+
+ // Parameters
+ static struct option long_options[] = {
+ { "genome" , required_argument , 0 , 'g' } ,
+ { "fasta" , optional_argument , 0 , 'a' } ,
+ { "fastq" , optional_argument , 0 , 'q' } ,
+ { "snps" , optional_argument , 0 , 's' } ,
+ { "coverage" , optional_argument , 0 , 'e' } ,
+ { "gff" , optional_argument , 0 , 'f' } ,
+ { "pileup" , optional_argument , 0 , 'p' } ,
+ { "bins" , optional_argument , 0 , 'b' } ,
+ { "binmask" , optional_argument , 0 , 'z' } ,
+ { "index" , optional_argument , 0 , 'x' } ,
+ { "nono" , optional_argument , 0 , 'n' } ,
+ { "uniqueness" , optional_argument , 0 , '0' } ,
+ { "wobble" , optional_argument , 0 , 'w' } ,
+ { "pair" , optional_argument , 0 , 'i' } ,
+ { "fragment" , optional_argument , 0 , 't' } ,
+ { "featuretable" , optional_argument , 0 , 'u' } ,
+ { "gffout" , optional_argument , 0 , 'y' } ,
+ { "variance" , optional_argument , 0 , 'v' } ,
+ { "mspi" , optional_argument , 0 , 'm' } ,
+ { "cigar" , optional_argument , 0 , 'c' } ,
+ { "fragmentplot" , optional_argument , 0 , 'o' } ,
+ { "regions" , optional_argument , 0 , 'r' } ,
+ { "noshortcuts" , optional_argument , &noshortcuts , true } ,
+ { "snpsonly" , optional_argument , &snpsonly , true } ,
+ { 0 , 0 , 0 , 0 }
+ } ;
+
+
+ int c ;
+ int option_index = 0;
+ while ( -1 != ( c = getopt_long (argc, argv, "g:a::q::s::f::p::b::r::w::n::",long_options, NULL) ) ) {
+
+ switch ( c ) {
+ case 0 : break ;
+ case 'g' : genome_file = optarg ; break ;
+ case 'a' : fasta_file = optarg ; break ;
+ case 'q' : fastq_file = optarg ; break ;
+ case 's' : simple_snp_file = optarg ; break ;
+ case 'f' : gff_file = optarg ; break ;
+ case 'e' : coverage_file = optarg ; break ;
+ case 'p' : pileup_file = optarg ; break ;
+ case 'b' : bin_prefix = optarg ; break ;
+ case 'z' : binmask = optarg ; break ;
+ case '0' : uniqueness = optarg ; break ;
+ case 'w' : wobble_file = optarg ; break ;
+ case 'n' : nono = optarg ; break ;
+ case 'x' : index_file = optarg ; break ;
+ case 'i' : pair_length = atoi ( optarg ) ; break ;
+ case 'o' : fragmentplot = optarg ; break ;
+ case 'u' : featuretable = optarg ; break ;
+ case 'y' : gffout = optarg ; break ;
+ case 'v' : variance = atoi ( optarg ) ; break ;
+ case 't' : fragment_length = atoi ( optarg ) ; break ;
+ case 'm' : mspi = atoi ( optarg ) ; break ;
+ case 'c' : cigar_file = optarg ; break ;
+ case 'r' : regions_file = optarg ; break ;
+ default : die_with_description("Unknown option") ;
+ }
+ }
+ binmask += "0000" ; // Ignore bins if shorter string was supplied
+
+
+ // Sanity checks
+ if ( genome_file.empty() ) die_with_description("No genome file given!") ; // Need this
+ if ( index_file.empty() ) {
+ if ( fastq_file.empty() && fasta_file.empty() && uniqueness.empty() ) die_with_description("No Solexa file given!") ; // Need one
+ if ( !fastq_file.empty() && !fasta_file.empty() ) die_with_description("Two Solexa files given!") ; // Can't have both
+ }
+ if ( !regions_file.empty() || !uniqueness.empty() ) noshortcuts = true ;
+
+ // Incorporating known SNPs and indexing chromosomes
+ init_iupac () ;
+ TChromosomalIndices index ( index_length_1 , index_length_2 ) ;
+ index.noshortcuts = noshortcuts ;
+ index.max_snps_per_index = mspi ;
+ index.index_file = index_file ;
+ vector <TChromosome> chrs ;
+ load_all_chromosomes ( genome_file , chrs ) ;
+
+ if ( !gff_file.empty() ) incorporate_all_gff ( gff_file , chrs ) ;
+ if ( !simple_snp_file.empty() ) incorporate_simple_snps ( simple_snp_file , chrs ) ;
+
+ if ( !regions_file.empty() ) index.run_regions ( &chrs , regions_file ) ;
+ else index.run ( &chrs ) ;
+
+ if ( !uniqueness.empty() ) {
+ index.uniqueness ( uniqueness ) ;
+ if ( fastq_file.empty() && fasta_file.empty() ) return 0 ; // My work here is done
+ }
+
+
+ // Now look at the SOlexa reads and do the thing
+ TChromosomeAlign ca ( chrs , index ) ;
+ if ( !pileup_file.empty() ) ca.pileup = fopen ( pileup_file.c_str() , "w" ) ;
+ if ( !cigar_file.empty() ) ca.cigar = fopen ( cigar_file.c_str() , "w" ) ;
+ if ( !fragmentplot.empty() ) ca.fragmentplot = fopen ( fragmentplot.c_str() , "w" ) ;
+ if ( !gffout.empty() ) ca.gffout = fopen ( gffout.c_str() , "w" ) ;
+ if ( !coverage_file.empty() ) ca.coverage = fopen ( coverage_file.c_str() , "w" ) ;
+ if ( !featuretable.empty() ) {
+ ca.featuretable = fopen ( featuretable.c_str() , "w" ) ;
+ fprintf ( ca.featuretable , "Key\tLocation/Qualifiers\n" ) ;
+ }
+ if ( !bin_prefix.empty() ) {
+ string ext = ".fasta" ;
+ if ( !fastq_file.empty() ) ext = ".fastq" ;
+ cout << "Using binmask " << binmask << endl ;
+ if ( binmask[0] == '1' ) ca.binfile_no_match = fopen ( string ( bin_prefix + "_no_match" + ext ).c_str() , "w" ) ;
+ if ( binmask[1] == '1' ) ca.binfile_single_match = fopen ( string ( bin_prefix + "_single_match" + ext ).c_str() , "w" ) ;
+ if ( binmask[2] == '1' ) ca.binfile_multi_match = fopen ( string ( bin_prefix + "_multi_match" + ext ).c_str() , "w" ) ;
+ if ( binmask[3] == '1' ) ca.binfile_iupac = fopen ( string ( bin_prefix + "_iupac" + ext ).c_str() , "w" ) ;
+ ca.using_bins = true ;
+ }
+ ca.init () ;
+ if ( !nono.empty() ) ca.add_nono_list ( nono ) ;
+ if ( pair_length == 0 ) {
+ if ( !fasta_file.empty() ) ca.align_solexa_fasta ( fasta_file ) ;
+ if ( !fastq_file.empty() ) ca.align_solexa_fastq ( fastq_file ) ;
+ } else {
+ ca.wobble = fopen ( string ( wobble_file ).c_str() , "w" ) ;
+ if ( !fasta_file.empty() ) ca.align_solexa_paired ( fasta_file , pair_length , fragment_length , variance ) ;
+ if ( !fastq_file.empty() ) ca.align_solexa_paired ( fastq_file , pair_length , fragment_length , variance ) ;
+ }
+ ca.show_pileup ( !regions_file.empty() || snpsonly ) ;
+ ca.dump_coverage () ;
+*/
+ return 0 ;
+}
+
+// Example runs:
+// rm test ; make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --gff=/nfs/malaria/data2/3D7/known_snps/PfalciparumCombinedSNPs.gff --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --noshortcuts --bins=test
+// make clean ; make ; time ./findknownsnps --genome=/nfs/malaria/data2/human_sequence/Homo_sapiens.NCBI36.48.dna.chromosome.22.fa --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --noshortcuts --bins=human
+// make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --bins=q2
+// make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --gff=/nfs/malaria/data2/3D7/known_snps/PfalciparumCombinedSNPs.gff --cigar=test.cigar
+// make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=q1_no_match.fasta --snps=new_snps_60_uniq --bins=q3
+// make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/faculty/mm6/3D7_pm.fa --fastq=$MAL_SOLEXA_HOME/run368/368_s_3.fastq --pair=35 --fragment=300 --bins=q3
+
+
+/* TEST:
+ make clean ; make variety ; ./variety --genome=tools/MAL3.925000-927000.fasta --fastq=tools/585_s_5.bin2_no_match.fastq > v.out
+*/
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/snpomatic.git
More information about the debian-med-commit
mailing list