[med-svn] [pscan-tfbs] 12/15: New upstream version 1.2.2
Andreas Tille
tille at debian.org
Mon Dec 18 20:23:33 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository pscan-tfbs.
commit 1baba514b9de414ea7fc6e1012913b4428757f75
Author: Andreas Tille <tille at debian.org>
Date: Mon Dec 18 21:18:05 2017 +0100
New upstream version 1.2.2
---
HELP.txt | 90 +
INSTALL | 13 +
LICENSE.txt | 622 +++++
REFERENCE.txt | 5 +
debian/README.Debian | 11 -
debian/changelog | 5 -
debian/compat | 1 -
debian/control | 19 -
debian/copyright | 34 -
debian/docs | 1 -
debian/examples | 1 -
debian/pscan.dirs | 1 -
debian/pscan.doc-base | 10 -
debian/pscan.install | 1 -
debian/pscan.substvars | 2 -
debian/rules | 9 -
debian/source/format | 1 -
debian/upstream/metadata | 12 -
debian/watch | 4 -
example_matrix_file.wil | 10 +
pscan.cpp | 6050 ++++++++++++++++++++++++++++++++++++++++++++++
21 files changed, 6790 insertions(+), 112 deletions(-)
diff --git a/HELP.txt b/HELP.txt
new file mode 100644
index 0000000..599d3fb
--- /dev/null
+++ b/HELP.txt
@@ -0,0 +1,90 @@
+SYNOPSIS
+ pscan -q multifastafile -p multifastafile [options]
+ pscan -p multifastafile [options]
+ pscan -q multifastafile -M matrixfile [options]
+
+OPTIONS
+ [-q file] | Specify the multifasta file containing the foreground sequences.
+
+ [-p file] | Specify the multifasta file containing the background sequences.
+
+ [-m file] | Use it if the background data are already available in a file (see -g option).
+
+ [-M file] | Scan the foreground sequences using only the Jaspar/Transfac matrix file contained in the specified file.
+
+ [-l file] | Use the matrices contained in that file (for matrix file format see below).
+
+ [-N name] | Use only the matrix with that name (usable only in association with -l).
+
+ [-ss] | Perform single strand only analysis.
+
+ [-rs] | Perform single strand only analysis on the reverse strand.
+
+ [-split num1 num2] | Sequences are scanned only from position num1 and for num2 nucleotides.
+
+ [-trashn] | Discards sequences containing "N".
+
+ [-n] | Oligos containing "N" will not be discarded. Instead a "N" will obtain an "average" score.
+
+ [-g] | If a background sequences file is used than a file will be written containing the data calculated
+ for that background sequences and the current set of matrices.
+ From now on one can use that file (-m option) instead of the sequences file for faster processing.
+
+ [-ui file] | An index of the background file will be used to avoid duplicated sequences.
+
+ [-bi] | Build an index of the background sequences file (to be used later with the -ui option).
+ This is useful when you have duplicated sequences in your background that may introduce a bias in your results.
+
+ [-h] | Display this help.
+
+NOTES
+ The sequences to be used with Pscan have to be promoter sequences.
+ To obtain meaningful results it's critical that the background and the foreground sequences are consistent between them either in size
+ and in position (with respect to the transcription start site). For optimal results the foreground set should be a subset of the background set.
+
+ If the "-l" option is not used Pscan will try to find Jaspar/Transfac matrix files in the current folder.
+ Jaspar files have ".pfm" extension while Transfac ones have ".pro" extension.
+ If Jaspar matrix files are used than a file called "matrix_list.txt" must be present in the same folder.
+ That file contains required info about the matrices in the ".pfm" files.
+
+ For info on how Pscan works pleare refer to the paper.
+
+EXAMPLES
+
+1) pscan -p human_450_50.fasta -bi
+
+ This command will scan the file "human_450_50.fasta" using the matrices in the current folder.
+ It is handy to use that command the first time one uses a set of matrices with a given background sequences file.
+ A file called human_450_50.short_matrix will be written and it can be used from now on every time you want to use
+ the same background sequences with the same set of matrices. A file called human_450_50.index will be written too
+ and it will be useful every time you will use the same background file.
+
+2) pscan -q human_nfy_targets.fasta -m human_450_50.short_matrix -ui human_450_50.index
+
+ This command will scan the file human_nfy_targets.fasta searching for over-represented binding sites (with respect
+ to the preprocessed background contained in the "human_450_50.short_matrix" file) using the matrices in the current folder.
+ Please note that the query file "human_nfy_targets.fasta" must be a subset of the sequences contained in the background file "human_450_50.fasta"
+ in order to use the index file with the "-ui" option. This means that both the sequences and their FASTA headers used in the query file must appear
+ in the background file as well. Using the "-ui" option when the sequences contained in the query file are not a subset of the background file will
+ have undefined/unpredictable outcomes.
+ The output will be a file called "human_nfy_targets.fasta.res" where you will find all the used matrices sorted by ascending P-value.
+ The lower the P-value obtained by a matrix, the higher are the chances that the transcription factor associated to that matrix
+ is a regulator of the input promoter sequences.
+ The fields of the output are the following: "Transcription Factor Name", "Matrix ID", "Z Score", "Pvalue", "Foreground Average", "Background Average".
+
+3) pscan -q human_nfy_targets.fasta -M MA0108.pfm
+
+ This command will scan the sequences file "human_nfy_targets.fasta" using the matrix contained in "MA0108.pfm".
+ The result will be written in a file called "human_nfy_targets.fasta.ris" where you will find the sequences in input
+ sorted by a descending score (between 1 and 0). The higher the score, the better is the oligo found with respect to the used matrix.
+ The fields of the output are the following: "Sequence Header", "Score", "Position from the end of sequence", "Oligo that obtained the score",
+ "Strand where the oligo was found".
+
+4) pscan -p human_450_50.fasta -bi -l matrixfile.wil
+
+ This command is like Example #1 with the difference that the matrices set to be used is the one contained in the "matrixfile.wil" file.
+ Please look at the "example_matrix_file.wil" file included in this Pscan distribution to see the correct format for matrices file.
+
+5) pscan -q human_nfy_targets.fasta -l matrixfile.wil -N MATRIX1
+
+ This command is like Example #3 but it will use the matrix called "MATRIX1" contained in the "matrixfile.wil" file.
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..31eaec3
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,13 @@
+To use Pscan you will need a C++ compiler like gcc (http://gcc.gnu.org/) and the
+Gnu Scientific Library (http://www.gnu.org/software/gsl/) both installed in your system.
+
+To compile the binary from the source file simply type:
+
+g++ pscan.cpp -o pscan -O3 -lgsl -lgslcblas
+
+If all is ok than you will find a binary file called "pscan" in the same folder.
+To allow the use of Pscan from every folder without having to type the whole path to the binary file every time you use it,
+it could be useful to have a symbolic link to it in your "bin" folder.
+(For help on symbolic links type: man ln )
+
+Please note that this archive does not include matrices and other related stuff that may be useful to run Pscan. You can download them for your organism(s) of interest at: http://159.149.109.11/pscan/source.html .
\ No newline at end of file
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..e963df8
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,622 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
diff --git a/REFERENCE.txt b/REFERENCE.txt
new file mode 100644
index 0000000..4f1cd6e
--- /dev/null
+++ b/REFERENCE.txt
@@ -0,0 +1,5 @@
+If you find Pscan useful in your research please cite our paper:
+
+F.Zambelli, G.Pesole, G.Pavesi
+Pscan: Finding Over-represented Transcription Factor Binding Site Motifs in Sequences from Co-Regulated or Co-Expressed Genes.
+Nucleic Acids Research 2009 37(Web Server issue):W247-W252.
diff --git a/debian/README.Debian b/debian/README.Debian
deleted file mode 100644
index 19f5415..0000000
--- a/debian/README.Debian
+++ /dev/null
@@ -1,11 +0,0 @@
-pscan-tfbs for Debian
----------------------
-
-Please note that on Debian systems we can not use the name pscan because
-there is another package with the very same name. So the package is
-renamed to pscan-tfbs.
-
-Pscan depends on a transcription factor binding site description. The
-database "JASPAR" can be downloaded with the Debian 'getData' interface.
-
- -- Steffen Moeller <moeller at debian.org> Wed, 16 May 2012 18:59:59 +0200
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 7128c28..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-pscan-tfbs (1.2.2-1) UNRELEASED; urgency=low
-
- * Initial release (Closes: #673182)
-
- -- Steffen Moeller <moeller at debian.org> Wed, 16 May 2012 18:59:59 +0200
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index f599e28..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-10
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 573e250..0000000
--- a/debian/control
+++ /dev/null
@@ -1,19 +0,0 @@
-Source: pscan-tfbs
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Steffen Moeller <moeller at debian.org>
-Section: science
-Priority: optional
-Build-Depends: debhelper (>= 10),
- libgsl-dev
-Standards-Version: 3.9.8
-Vcs-Browser: https://anonscm.debian.org/viewvc/debian-med/trunk/packages/pscan/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/pscan/trunk/
-Homepage: http://www.beaconlab.it/pscan
-
-Package: pscan-tfbs
-Architecture: any
-Depends: ${shlibs:Depends},
- ${misc:Depends}
-Description: search for transcription factor binding sites
- Pscan finds Over-represented Transcription Factor Binding Site Motifs
- in Sequences from Co-Regulated or Co-Expressed Genes.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 6a0c4db..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,34 +0,0 @@
-Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: pscan
-Source: http://159.149.160.88/pscan/source.html
-Comment: IP addresses might change but when starting from the main page
- http://www.beaconlab.it
- select "English flag"
- and follow "Tools" the link to Pscan can be found
-
-Files: *
-Copyright: 2012 G.Pesole
- 2012 G.Pavesi <giulio.pavesi at unimi.it>
- 2012 F.Zambelli <federico.zambelli at unimi.it>
-License: GPL-3.0+
-
-Files: debian/*
-Copyright: 2012 Steffen Moeller <moeller at debian.org>
-License: GPL-3.0+
-
-License: GPL-3.0+
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- .
- This package is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- .
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- .
- On Debian systems, the complete text of the GNU General
- Public License version 3 can be found in "/usr/share/common-licenses/GPL-3".
diff --git a/debian/docs b/debian/docs
deleted file mode 100644
index 3d755d3..0000000
--- a/debian/docs
+++ /dev/null
@@ -1 +0,0 @@
-HELP.txt
diff --git a/debian/examples b/debian/examples
deleted file mode 100644
index 438592d..0000000
--- a/debian/examples
+++ /dev/null
@@ -1 +0,0 @@
-example_matrix_file.wil
diff --git a/debian/pscan.dirs b/debian/pscan.dirs
deleted file mode 100644
index e772481..0000000
--- a/debian/pscan.dirs
+++ /dev/null
@@ -1 +0,0 @@
-usr/bin
diff --git a/debian/pscan.doc-base b/debian/pscan.doc-base
deleted file mode 100644
index bc0bb0d..0000000
--- a/debian/pscan.doc-base
+++ /dev/null
@@ -1,10 +0,0 @@
-Document: pscan
-Title: Debian pscan Manual
-Author: G.Pesole,
- G.Pavesi <giulio.pavesi at unimi.it>,
- F.Zambelli <federico.zambelli at unimi.it>
-Abstract: The document resembles a UNIX man page.
-Section: science
-
-Format: text
-Files: /usr/share/doc/pscan/HELP.txt.gz
diff --git a/debian/pscan.install b/debian/pscan.install
deleted file mode 100644
index 13010f5..0000000
--- a/debian/pscan.install
+++ /dev/null
@@ -1 +0,0 @@
-pscan /usr/bin
diff --git a/debian/pscan.substvars b/debian/pscan.substvars
deleted file mode 100644
index 18f3275..0000000
--- a/debian/pscan.substvars
+++ /dev/null
@@ -1,2 +0,0 @@
-shlibs:Depends=libc6 (>= 2.2.5), libgcc1 (>= 1:4.1.1), libgsl0ldbl (>= 1.9), libstdc++6 (>= 4.2.1)
-misc:Depends=
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index f76658c..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/make -f
-# Uncomment this to turn on verbose mode.
-#export DH_VERBOSE=1
-
-%:
- dh $@
-
-override_dh_auto_build:
- g++ -g pscan.cpp -o pscan -O3 -lgsl -lgslcblas
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/upstream/metadata b/debian/upstream/metadata
deleted file mode 100644
index 7a741c7..0000000
--- a/debian/upstream/metadata
+++ /dev/null
@@ -1,12 +0,0 @@
-Reference:
- author: Federico Zambelli and Graziano Pesole and Giulio Pavesi
- title: "Pscan: Finding Over-represented Transcription Factor Binding Site Motifs in Sequences from Co-Regulated or Co-Expressed Genes"
- journal: Nucleic Acids Research
- year: 2009
- volume: 37
- number: Web Server Issue
- pages: W247-W252
- pmid: 19487240
- doi: 10.1093/nar/gkp464
- URL: http://nar.oxfordjournals.org/content/37/suppl_2/W247
- eprint: http://nar.oxfordjournals.org/content/37/suppl_2/W247.full.pdf+html
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index fe3eb0b..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,4 +0,0 @@
-version=4
-
-opts=dversionmangle=s/.*/0.No-Release/ \
- https://people.debian.org/~eriberto/ FakeWatchNoUpstreamReleaseForThisPackage-(\d\S+)\.gz
diff --git a/example_matrix_file.wil b/example_matrix_file.wil
new file mode 100644
index 0000000..0779d8b
--- /dev/null
+++ b/example_matrix_file.wil
@@ -0,0 +1,10 @@
+>NFY
+ 34 16 7 58 51 0 2 112 116 0 14 66 13 39 36 25
+ 37 33 51 14 4 116 113 0 0 1 65 6 20 43 9 35
+ 27 26 25 41 56 0 1 1 0 0 33 42 73 22 47 29
+ 18 41 33 3 5 0 0 3 0 115 4 2 10 12 24 27
+ >TBP
+ 61 16 352 3 354 268 360 222 155 56 83 82 82 68 77
+145 46 0 10 0 0 3 2 44 135 147 127 118 107 101
+152 18 2 2 5 0 10 44 157 150 128 128 128 139 140
+ 31 309 35 374 30 121 6 121 33 48 31 52 61 75 71
\ No newline at end of file
diff --git a/pscan.cpp b/pscan.cpp
new file mode 100644
index 0000000..701447b
--- /dev/null
+++ b/pscan.cpp
@@ -0,0 +1,6050 @@
+// Pscan 1.2.2
+// Copyright (C) 2009 Federico Zambelli and Giulio Pavesi
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+//
+// If you find Pscan useful in your research please cite our paper:
+//
+// F.Zambelli, G.Pesole, G.Pavesi
+// Pscan: Finding Over-represented Transcription Factor Binding Site Motifs in Sequences from Co-Regulated or Co-Expressed Genes.
+// Nucleic Acids Research 2009 37(Web Server issue):W247-W252.
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <sstream>
+#include <math.h>
+#include <cstdlib>
+#include <map>
+#include <gsl/gsl_randist.h>
+#include <gsl/gsl_cdf.h>
+//#include <gd.h>
+
+//#define DEBUG1
+//#define DEBUG2
+//#define DEBUG3
+//#define DEBUG4
+//#define DEBUG5
+//#define DEBUG6
+//#define DEBUG7
+//#define DEBUG8
+//#define DEBUG9
+//#define MATRIX_TRANSLATOR
+//#define BOUND_OLIGOS
+
+using namespace std;
+
+//CONSTANTS DEFINITIONS
+typedef double m_point;
+const int ALPHABET_SIZE = 4, SWITCH_TO_Z_VALUE = 500, MAX_DOTS = 150000, DA_TOP = 20;
+const char ALPHABET[4] = {'A','C','G','T'};
+const char C_ALPHABET[4] = {'T','G','C','A'};
+const m_point MIN_ADD_FREQ = 0.01, INF = 1000000000, CONSERVATION_THRESHOLD = 0.75;
+const char *matrix_files = "*.pro *.pfm";
+enum {A,C,G,T};
+char VI['T' + 1], rc_VI['T' + 1];
+//*fasta_files = "*.fasta",
+const char *m_start_identifier = "P0", *m_start_identifier_for_free_transfac = "PO", *m_data_start_identifier = "AC";
+enum {NO_FILES, BAD_MATRIX_FILE, MATRIX_NOT_ALLOCATED, MISSING_FASTA_FILE, BAD_FASTA_SEQUENCE,
+ NO_MATRIX_FILE, BAD_MATRIX, NO_JASPAR_MATRIX_LIST, JASPAR_FILE_NOT_IN_LIST, CANT_COMPUTE_SCORES, READING_OUT_OF_MATRIX,
+ NEED_COMPLETE_MATRIX, NO_BACKGROUND_AVAILABLE, NO_SEQUENCE, NO_TFBS, CANT_DO_POSITIONAL, SEQUENCE_SHORTER_THAN_WINDOW, BAD_BIG_VALUES, CANTOPENOUTPUT, TOO_MANY_EDGES, NO_BOF, BAD_FILE_FORMAT, NO_INDEX_FOUND, EVEN_PROBE, MISSING_CHIP_FILE, INCONSISTENT_CHIP, NO_MATRIX_FOR_DISTANCE_ANALYSIS}; //ERROR CODES
+const m_point step = 0.025; //bins step!!
+//END
+
+//OBJECTS DECLARATION
+#ifdef BOUND_OLIGOS
+class bound_oligos
+{
+ private:
+ void build_matrix();
+ void build_pos_cor_matrix();
+ void w2_vector(vector<string>*);
+ int char_to_index(char);
+ int observed_cooccurrences(int, char, int, char);
+ vector<string> raw_mat;
+ m_point **pos_cor_mat;
+ string NAME;
+ int **mat, x_size, y_size;
+ public:
+ vector<string> assign(vector<string>);
+ string show_pos_cor_mat();
+};
+#endif
+
+class tfbs
+{
+ private:
+ bool raw_tfbsdata_parser();
+ void j_raw_tfbsdata_parser();
+ bool raw_matrix_parser();
+ bool j_raw_matrix_parser();
+ bool matrix_normalizer();
+ void matrix_to_log();
+ void matrix_min_max();
+ void bns_sum();
+ void matrix_average();
+ void build_conservation_vector();
+ vector<bool> v_conservation;
+ vector<string> raw_matrix;
+ vector<int> bns;
+ vector<m_point> row;
+ vector<string> raw_data;
+ m_point bin_freq(int);
+ string j_raw_data;
+ m_point **matrix;
+ m_point *mat_avg;
+ m_point avg_row;
+ m_point stddev_row;
+ string file_name;
+// string raw_data;
+ string ID;
+ string NAME;
+ string AC;
+ int rowsize;
+ int x_size;
+ int bin_sum;
+ m_point max_score;
+ m_point min_score;
+ bool matrix_alloc;
+ bool norm_sec_step;
+ public:
+ bool assign(const char*);
+ bool j_assign(const char*);
+ bool w2_assign(vector<string>);
+ #ifdef BOUND_OLIGOS
+ bound_oligos bo;
+ #endif
+ string name();
+ string id();
+ string file();
+ string matrix_id();
+ string ac();
+ string show_raw_matrix();
+ #ifdef MATRIX_TRANSLATOR
+ string matrix_translator();
+ #endif
+ string show_matrix();
+ int get_bin_sum();
+ m_point max();
+ m_point min();
+ void gen_bns();
+ string show_bns();
+ string show_bns_line();
+ string show_conservation_vector(int);
+ int m_length();
+ bool assign_short_matrix(string);
+ void row_average();
+ void row_stddev();
+ m_point get_value(int,int);
+ m_point get_matrix_average(int);
+ void row_push(m_point);
+ void row_v_push(vector<m_point>);
+ int row_size();
+ int row_actual_size();
+ m_point row_get(int);
+ vector<m_point> row_get();
+ m_point row_avg_get();
+ m_point row_stddev_get();
+ m_point bin_prob(int);
+ vector<m_point>::iterator row_iter();
+ unsigned int bin_hm(int);
+ vector<int> bin_vector();
+// void draw_background();
+};
+class sequence
+{
+ private:
+ vector<vector<m_point> > seq_rel_scores;
+ vector<vector<char > > seq_r_strand;
+ vector<m_point> max_score;
+ vector<m_point> rel_score;
+ vector<m_point> rel_score2;
+ vector<int> max_pos;
+ vector<int> max_pos2;
+ vector<char> max_strand;
+ string NAME;
+ string SEQ;
+ string WRAP_NAME;
+ string ID;
+ string CHR;
+ unsigned long int ABS_START;
+ unsigned long int ABS_END;
+ char STRAND;
+ virtual void SCAN(vector<tfbs>::iterator,bool);
+ void SCAN_SS(vector<tfbs>::iterator,bool);
+ void store_complete_seq_scores(vector<m_point>::iterator,vector<m_point>::iterator,int,vector<tfbs>::iterator);
+ void store_complete_seq_scores_ss(vector<m_point>::iterator,int,vector<tfbs>::iterator);
+// void set_seq_rel_max_pos();
+ bool absolute_position_parser();
+ protected:
+ bool seq_cleaner();
+ int char_to_index(char);
+ int comp_char_to_index(char);
+ m_point max_v(vector<m_point>*,int*);
+ m_point max_v2(vector<m_point>*,int*);
+ m_point sum_v(vector<m_point>*);
+ public:
+ m_point get_seq_rel_score_at_pos(int,int);
+ m_point get_seq_rel_max_pos(int,int*,char*);
+ unsigned long int get_abs_start();
+ unsigned long int get_abs_end();
+ string get_chr();
+ char get_strand();
+ bool assign(string);
+ virtual void scan(vector<tfbs>*,int,bool);
+ string name();
+ string seq();
+ string w_name();
+ string get_id();
+ m_point max(int);
+ m_point rel(int);
+ m_point rel2(int);
+ int mpos(int);
+ int mpos2(int);
+ char mstrand(int);
+ void reverse();
+};
+
+class big_sequence : public sequence
+{
+ private:
+ vector<vector <m_point> > MAX_REL_SCORE;
+ vector<vector <int> > MAX_POS;
+ vector<vector <string> > MAX_MOTIF;
+ vector<vector<char> > MOTIF_STRAND;
+ void SCAN(vector<tfbs>::iterator,bool);
+ public:
+// vector<m_point> row_return(int);
+ void scan(vector<tfbs>*,int,bool);
+ string track_output(int);
+ string inline_output();
+};
+
+class query
+{
+ private:
+// m_point **correlation_matrix;
+ int dam, bigst_seq_size;
+ vector<sequence> QUERY;
+ vector<tfbs> QTFBS;
+ vector<m_point> ts;
+ vector<m_point> zt;
+ vector<m_point> bt;
+ vector<m_point> min_binom_bins;
+ vector<vector <m_point> > binom_bins;
+ vector<int> seq_bin_num;
+ vector<int> soil_pos;
+ void distance_analysis();
+ char comp_char(char);
+ string revcomp(string, char);
+ m_point t_den_calc(int,m_point,int,m_point);
+ int max_v_pos(vector<m_point>);
+ int min_v_pos(vector<m_point>);
+// void build_correlation_matrix(vector<tfbs>*);
+// void correlation_matrix_output(vector<tfbs>*,string);
+// void correlation_output(vector<tfbs>*,string);
+// void correlation_matrix_copy(m_point**, int);
+ vector<int> matrix_line_max_pos(m_point*,int,int);
+ vector<int> matrix_line_min_pos(m_point*,int,int);
+ string positional_output();
+ m_point seq_stats(int, m_point*);
+ char flip_strand(char);
+// void draw_distr_graph(vector<m_point>*, vector<int>*, int, string);
+ public:
+ void assign(vector<sequence>,vector<tfbs>);
+ void scan();
+ void t_stud_calc(vector<tfbs>*);
+ void z_test(vector<tfbs>*);
+ void b_test(vector<tfbs>*);
+ void bins_stuff(vector<tfbs>*);
+// void correlation(vector<tfbs>*);
+ void pearson_corr();
+ string binom_bins_line(int);
+ void output(int,vector<tfbs>*);
+// void img_output(vector<tfbs>*);
+ void zvectors_output(vector<tfbs>*);
+};
+
+class chip
+{
+ private:
+ vector<m_point> Log_Ratio;
+ unsigned short int noe;
+ vector<string> Chr;
+ vector<m_point> exp_bg_means;
+ vector<m_point> exp_fg_means;
+// vector<m_point> bin_label;
+ vector<unsigned int> bins;
+ vector<unsigned int> Start;
+ vector<unsigned int> Stop;
+ vector<m_point> Score;
+ string current_chr, current_chr_seq;
+ ofstream fout;
+ bool consistency_check(vector<string> *);
+ void stack_processor(vector<vector<string> > *);
+ void set_bins();
+ void display_bins();
+ void Exp_Push_Mean(vector<string>*);
+ void get_current_chr_seq(string, string*);
+ public:
+ chip();
+};
+
+
+//END
+
+//GLOBAL VARIABLES DECLARATION
+bool VERBOSE = false, BINS = false, COMPLETE_MATRIX = false, SHORT = false, SHOW_BINS_BINOM = false, USE_N = false, CORRELATION = false,
+ COMPLETE_CORRELATION_MATRIX = false, GENERATE_MATRIX = false, POSITIONAL = false, USE_SUM = false, BIG_SEQ = false, PEARSON = false,
+ NO_BACKGROUND = false, DOUBLE_STRAND = true, IMG_OUTPUT = false, TRASH_N = false, ZVECTOR_OUTPUT = false, USE_T = false,
+ SHOW_CONSERVATION_VECTOR = false, REVERSE_STRAND = false, BED_OUTPUT = false, DRAW_BACKGROUND = false, BUILD_INDEX = false, NO_HEADER = false, CHIP = false;
+int BIG_WINDOW = 500, BIG_STEP = 250, SEQ_EDGE_START = 0, SEQ_EDGE_END = 0, MIN_SEQ_LENGTH = 0, TSS_POS = 0;
+string q_idlist, idfile, queryfile, promfile, matrixfile, usethismatrix, usethismatrixname, matrixlist, trackbg, w2file, fasta_matrix_file, bound_oligos_file, bg_for_img, magic, index_file, distance_analysis_matrix;
+vector<string> chip_files;
+vector<m_point> bins;
+double TRACK_CUTOFF = 0, CHIP_CUTOFF = 0.05, DISTANCE_CUTOFF = 0.000001;
+unsigned int SPLIT[2] = {0,0}, PROBE = 3;
+//END
+
+//FUNCTIONS DECLARATION
+vector<string> get_file_names(const char *);
+void error_handler(int, string);
+vector<tfbs> v_tfbs_builder(vector<string>);
+vector<sequence> v_sequence_builder(vector<string>, bool);
+vector<big_sequence> v_big_sequence_builder(vector<string>, bool);
+vector<string> fasta_reader(const char*);
+vector<vector <string> > w2_mat_reader(string);
+vector<vector <string> > wil_mat_reader(string);
+void v_tfbs_add_bof(vector<tfbs>*);
+void generate_output_matrix(vector<tfbs>*, vector<sequence>*);
+void generate_output_bins(vector<tfbs>*, vector<sequence>*);
+void generate_output_big_sequence(vector<tfbs>*, vector<big_sequence>*);
+void generate_tfbs_rows(vector<tfbs>*, vector<sequence>*, bool);
+//void generate_tfbs_rows(vector<tfbs>*, vector<big_sequence>*);
+void generate_tfbs_rows_from_matrix(vector<tfbs>*);
+void generate_tfbs_rows_from_short_matrix(vector<tfbs>*,ifstream*);
+void bins_gen();
+void command_line_parser(int, char**);
+void min_seq_length_fixer(vector<tfbs>*);
+void idfile_handler(vector<sequence> *);
+string build_sequence_index(vector<sequence>*);
+string show_conservation(vector<tfbs>*);
+bool id_check(vector<string> *, vector<sequence>::iterator);
+bool qlist_check(string, vector<string>*);
+void display_help();
+//END
+
+int main(int argc, char **argv)
+{
+ vector<tfbs> TFBS;
+ vector<sequence> SEQUENCE;
+ vector<big_sequence> BIG_SEQUENCE;
+ query Q;
+
+ VI['A'] = 0;
+ VI['C'] = 1;
+ VI['G'] = 2;
+ VI['T'] = 3;
+
+ rc_VI['T'] = 0;
+ rc_VI['G'] = 1;
+ rc_VI['C'] = 2;
+ rc_VI['A'] = 3;
+
+ command_line_parser(argc, argv);
+ bins_gen();
+
+ if(CHIP)
+ {
+ chip C;
+ }
+
+ TFBS = v_tfbs_builder(get_file_names(matrix_files));
+ min_seq_length_fixer(&TFBS);
+
+ if((int)promfile.size() == 0 && (int)matrixfile.size() == 0 && usethismatrix.size() == 0 && !PEARSON && (int)usethismatrixname.size() == 0 && distance_analysis_matrix.empty())
+ // SEQUENCE = v_sequence_builder(get_file_names(fasta_files),false);
+ error_handler(NO_BACKGROUND_AVAILABLE,"");
+
+ else if((int)promfile.size() != 0 && (int)matrixfile.size() == 0)
+ {
+ vector<string> pfile;
+ pfile.push_back(promfile);
+
+ if(!BIG_SEQ)
+ {
+ SEQUENCE = v_sequence_builder(pfile,false);
+
+ if(!idfile.empty())
+ idfile_handler(&SEQUENCE);
+ }
+
+ else
+ {
+ if(BIG_STEP > BIG_WINDOW || BIG_STEP < 1)
+ error_handler(BAD_BIG_VALUES,"");
+
+ BIG_SEQUENCE = v_big_sequence_builder(pfile,false);
+ }
+ }
+
+ if((int)queryfile.size() != 0 && BIG_SEQUENCE.size() == 0)
+ {
+ vector<string> qfile;
+ vector<sequence> QUERY;
+ qfile.push_back(queryfile);
+ QUERY = v_sequence_builder(qfile,true);
+ Q.assign(QUERY,TFBS);
+ QUERY.clear();
+ qfile.clear();
+ }
+
+ else if(BIG_SEQUENCE.size() != 0)
+ {
+ cerr << "Processing..." << endl;
+ for(int S = 0; S < BIG_SEQUENCE.size(); S++)
+ BIG_SEQUENCE[S].scan(&TFBS,S,false);
+
+ generate_output_big_sequence(&TFBS, &BIG_SEQUENCE);
+ }
+
+
+ if((int)matrixfile.size() == 0 && promfile.size() != 0)
+ {
+ for(int S = 0; S < SEQUENCE.size(); S++)
+ SEQUENCE[S].scan(&TFBS,S,false);
+
+ generate_tfbs_rows(&TFBS, &SEQUENCE, false);
+
+ }
+
+ else if(matrixfile.size() != 0 && promfile.size() == 0)
+ generate_tfbs_rows_from_matrix(&TFBS);
+
+ else if(matrixfile.size() == 0 && promfile.size() == 0)
+ NO_BACKGROUND = true;
+
+ if((int)queryfile.size() != 0)
+ {
+ Q.scan();
+
+ if(PEARSON)
+ Q.pearson_corr();
+
+ if(!POSITIONAL && !NO_BACKGROUND)
+ {
+ Q.t_stud_calc(&TFBS);
+ Q.bins_stuff(&TFBS);
+ Q.z_test(&TFBS);
+ Q.b_test(&TFBS);
+ }
+
+ Q.output(TFBS[0].row_size(),&TFBS);
+
+// if(IMG_OUTPUT && !POSITIONAL)
+// Q.img_output(&TFBS);
+
+ if(ZVECTOR_OUTPUT)
+ Q.zvectors_output(&TFBS);
+ }
+
+// if(CORRELATION && TFBS.size() > 1)
+// Q.correlation(&TFBS);
+
+ if((int)matrixfile.size() == 0 && (GENERATE_MATRIX || queryfile.size() == 0))
+ generate_output_matrix(&TFBS, &SEQUENCE);
+
+ else if((int)matrixfile.size() != 0 && !SHORT && (GENERATE_MATRIX || queryfile.size() == 0))
+ {
+ COMPLETE_MATRIX = false;
+ promfile = matrixfile;
+ generate_output_matrix(&TFBS, &SEQUENCE);
+ }
+
+ if(BINS)
+ generate_output_bins(&TFBS, &SEQUENCE);
+
+ cerr << endl;
+
+ exit(EXIT_SUCCESS);
+}
+
+void display_help()
+{
+ ifstream in("HELP.txt");
+
+ if(!in)
+ {
+ cerr << "\nCan't find file: \"HELP.txt\" in this folder. Please check your installation." << endl;
+ exit(1);
+ }
+
+ string line;
+
+ while(getline(in,line))
+ cerr << line << endl;
+
+ in.close();
+
+/* cerr << endl << "SYNOPSIS" << endl
+ << "\tpscan -q multifastafile -p multifastafile [options]" << endl
+ << "\tpscan -p multifastafile [options]" << endl
+ << "\tpscan -q multifastafile -M matrixfile [options]" << endl << endl
+ << "OPTIONS" << endl
+ << "\t[-q file] | Specify the multifasta file containing the foreground sequences." << endl << endl
+ << "\t[-p file] | Specify the multifasta file containing the background sequences." << endl << endl
+ << "\t[-m file] | Use it if the background data are already available in a file (see -g option)." << endl << endl
+ << "\t[-M file] | Scan the foreground sequences using only the Jaspar/Transfac matrix file contained in the specified file." << endl << endl
+ << "\t[-l file] | Use the matrices contained in the file (for matrix file format see below)." << endl << endl
+ << "\t[-N name] | Use only the matrix with that name (usable only in association with -l)." << endl << endl
+ << "\t[-ss] | Perform single strand only analysis." << endl << endl
+ << "\t[-rs] | Perform single strand only analysis on the reverse strand." << endl << endl
+ << "\t[-split num1 num2] | Sequences are scanned only from position num1 and for num2 nucleotides." << endl << endl
+ << "\t[-trashn] | Discards sequences containing \"N\"." << endl << endl
+ << "\t[-n] | Oligos containing \"N\" will not be discarded. Instead a \"N\" will obtain an \"average\" score." << endl << endl
+ << "\t[-g] | If a background sequences file is used than a file will be written containing the data calculated" << endl <<
+ "\t\tfor that background and the current set of matrices." << endl <<
+ "\t\tFrom now on one can use that file (-m option) instead of the sequences file for faster calculations." << endl << endl
+ << "\t[-ui file] | An index of the background file will be used to avoid duplicated sequences." << endl<< endl
+ << "\t[-bi] | Build an index of the background sequences file (to be used later with the -ui option)." << endl <<
+ "\t\tThis is useful when you have duplicated sequences in your background that may introduce a bias in your results." << endl << endl
+ << "\t[-h] | Display this help." << endl << endl
+
+ << "NOTES" << endl
+ << "\tThe sequences to be used with Pscan have to be promoter sequences." << endl
+ << "\tTo obtain meaningful results it's critical that the background and the foreground sequences are consistent between them either in size" << endl
+ << "\tand in position (with respect to the transcription start site). For optimal results the foreground set should be a subset of the background set." << endl << endl
+
+ << "\tIf the \"-l\" option is not used Pscan will try to find Jaspar/Transfac matrix files in the current folder." << endl
+ << "\tJaspar files have \".pfm\" extension while Transfac ones have \".pro\" extension." << endl
+ << "\tIf Jaspar matrix files are used than a file called \"matrix_list.txt\" must be present in the same folder." << endl
+ << "\tThat file contains required info about the matrices in the \".pfm\" files." << endl << endl
+
+ << "\tFor info on how Pscan works pleare refer to the paper." << endl << endl
+
+ << "EXAMPLES" << endl << endl
+ << "1)\tpscan -p human_450_50.fasta -bi" << endl << endl
+
+ << "\tThis command will scan the file \"human_450_50.fasta\" using the matrices in the current folder." << endl
+ << "\tIt is handy to use that command the first time one uses a set of matrices with a given background sequences file." << endl
+ << "\tA file called human_450_50.short_matrix will be written and it can be used from now on every time you want to use" << endl
+ << "\tthe same background sequences with the same set of matrices. A file called human_450_50.index will be written too" << endl
+ << "\tand it will be useful every time you will use the same background file." << endl << endl
+
+ << "2)\tpscan -q human_nfy_targets.fasta -m human_450_50.short_matrix -ui human_450_50.index" << endl << endl
+
+ << "\tThis command will scan the file human_nfy_targets.fasta searching for over-represented binding sites (with respect" << endl
+ << "\tto the preprocessed background contained in the \"human_450_50.short_matrix\" file) using the matrices in the current folder." << endl
+ << "\tPlease note that the query file \"human_nfy_targets.fasta\" must be a subset of the sequences contained in the " << endl
+ << "\tbackground file \"human_450_50.fasta\""
+ << "\tin order to use the index file with the \"-ui\" option." << endl << "\tThis means that both the sequences and their FASTA headers used" << endl
+ << "\tin the query file must appear"
+ << "\tin the background file as well." << endl << "\tUsing the \"-ui\" option when the sequences contained in the query file are not a subset of" << endl
+ << "\tthe background file will"
+ << "have undefined/unpredictable outcomes."
+ << "\tThe output will be a file called \"human_nfy_targets.fasta.res\" where you will find all the used matrices sorted by ascending P-value." << endl
+ << "\tThe lower the P-value obtained by a matrix, the higher are the chances that the transcription factor associated to that matrix" << endl
+ << "\tis a regulator of the input promoter sequences." << endl
+ << "\tThe fields of the output are the following: \"Transcription Factor Name\", \"Matrix ID\", \"Z Score\", \"Pvalue\", \"Foreground Average\", \"Background Average\"." << endl << endl
+
+ << "3)\tpscan -q human_nfy_targets.fasta -M MA0108.pfm" << endl << endl
+
+ << "\tThis command will scan the sequences file \"human_nfy_targets.fasta\" using the matrix contained in \"MA0108.pfm\"." << endl
+ << "\tThe result will be written in a file called \"human_nfy_targets.fasta.ris\" where you will find the sequences in input" << endl
+ << "\tsorted by a descending score (between 1 and 0). The higher the score, the better is the oligo found with respect to the used matrix." << endl
+ << "\tThe fields of the output are the following: \"Sequence Header\", \"Score\", \"Position from the end of sequence\", \"Oligo that obtained the score\"," << endl
+ << "\tStrand where the oligo was found\"." << endl << endl
+
+ << "4)\tpscan -p human_450_50.fasta -bi -l matrixfile.wil" << endl << endl
+
+ << "\tThis command is like Example #1 with the difference that the matrices set to be used is the one contained in the \"matrixfile.wil\" file." << endl
+ << "\tPlease look at the \"example_matrix_file.wil\" file included in this Pscan distribution to see the correct format for matrices file." << endl << endl
+
+ << "5)\tpscan -q human_nfy_targets.fasta -l matrixfile.wil -N MATRIX1" << endl << endl
+
+ << "\tThis command is like Example #3 but it will use the matrix called \"MATRIX1\" contained in the \"matrixfile.wil\" file." << endl << endl;
+*/
+ exit(1);
+}
+
+void command_line_parser(int argc, char **argv)
+{
+ if(argc == 1)
+ display_help();
+
+ for(int i = 1; i < argc; i++)
+ {
+ string buf = argv[i];
+
+ if(buf == "-q")
+ {
+ if(i < argc - 1)
+ queryfile = argv[++i];
+ continue;
+ }
+
+ if(buf == "-h")
+ {
+ display_help();
+ continue;
+ }
+/* if(buf == "-Q")
+ {
+ if(i < argc - 1)
+ idfile = argv[++i];
+ continue;
+ }
+ if(buf == "-d")
+ {
+ if(i < argc - 1)
+ distance_analysis_matrix = argv[++i];
+ continue;
+ }
+*/
+ if(buf == "-ql")
+ {
+ if(i < argc - 1)
+ q_idlist = argv[++i];
+ continue;
+ }
+/*
+ if(buf == "-imgm")
+ {
+ if(i < argc - 1)
+ bg_for_img = argv[++i];
+ continue;
+ }
+ if(buf == "-magic")
+ {
+ if(i < argc - 1)
+ magic = argv[++i];
+ continue;
+ }
+ if(buf == "-track")
+ {
+ if(i < argc - 1)
+ trackbg = argv[++i];
+ continue;
+ }*/
+
+ else if(buf == "-p")
+ {
+ if(i < argc - 1)
+ promfile = argv[++i];
+ BIG_SEQ = false;
+ continue;
+ }
+/*
+ else if(buf == "-P")
+ {
+ if(i < argc - 1)
+ promfile = argv[++i];
+ BIG_SEQ = true;
+// USE_N = true;
+ continue;
+ }
+
+ else if(buf == "-chip")
+ {
+ i++;
+
+ while(i < (argc))
+ {
+ buf = argv[i];
+
+ if(buf[0] == '-')
+ {
+ i--;
+ break;
+ }
+ else
+ {
+ chip_files.push_back(buf);
+ i++;
+ }
+ }
+
+ if(!chip_files.empty())
+ CHIP = true;
+
+ continue;
+ }
+*/
+ else if(buf == "-m")
+ {
+ if(i < argc - 1)
+ matrixfile = argv[++i];
+ continue;
+ }
+ #ifdef BOUND_OLIGOS
+ else if(buf == "-bo")
+ {
+ if(i < argc - 1)
+ bound_oligos_file = argv[++i];
+
+ continue;
+ }
+ #endif
+
+/* else if(buf == "-w2")
+ {
+ if(i < argc - 1)
+ w2file = argv[++i];
+
+ continue;
+ }*/
+ else if(buf == "-l")
+ {
+ if(i < argc - 1)
+ fasta_matrix_file = argv[++i];
+
+ continue;
+ }
+ else if(buf == "-ui")
+ {
+ if(i < argc - 1)
+ index_file = argv[++i];
+
+ continue;
+ }
+ else if(buf == "-M")
+ {
+ if(i < argc - 1)
+ usethismatrix = argv[++i];
+ continue;
+ }
+ else if(buf == "-N")
+ {
+ if(i < argc - 1)
+ usethismatrixname = argv[++i];
+ continue;
+ }
+/*
+ else if(buf == "-L")
+ {
+ if(i < argc - 1)
+ matrixlist = argv[++i];
+ continue;
+ }*/
+
+/* else if(buf == "-start")
+ {
+ if(i < argc - 1)
+ {
+ SEQ_EDGE_START = atoi(argv[++i]);
+ MIN_SEQ_LENGTH = SEQ_EDGE_START;
+ }
+ continue;
+ }
+ else if(buf == "-end")
+ {
+ if(i < argc - 1)
+ {
+ SEQ_EDGE_END = atoi(argv[++i]);
+ MIN_SEQ_LENGTH = SEQ_EDGE_END;
+ }
+ continue;
+ }
+ else if(buf == "-min")
+ {
+ if(i < argc - 1)
+ MIN_SEQ_LENGTH = atoi(argv[++i]);
+ continue;
+ }*/
+ else if(buf == "-split")
+ {
+ if(i < argc - 2)
+ {
+ SPLIT[0] = atoi(argv[++i]);
+ SPLIT[1] = atoi(argv[++i]);
+ }
+ continue;
+ }
+/* else if(buf == "-w")
+ {
+ if(i < argc - 1)
+ BIG_WINDOW = atoi(argv[++i]);
+ continue;
+ }
+ else if(buf == "-step")
+ {
+ if(i< argc - 1)
+ BIG_STEP = atoi(argv[++i]);
+ continue;
+ }
+ else if(buf == "-probe")
+ {
+ if(i< argc - 1)
+ PROBE = atoi(argv[++i]);
+
+ continue;
+ }
+ else if(buf == "-tss")
+ {
+ if(i< argc - 1)
+ TSS_POS = -atoi(argv[++i]);
+ continue;
+ }
+ else if(buf == "-cut")
+ {
+ istringstream str;
+
+ if(i< argc - 1)
+ str.str(argv[++i]);
+
+ str >> TRACK_CUTOFF;
+
+ continue;
+ }
+ else if(buf == "-chip_cutoff")
+ {
+ istringstream str;
+
+ if(i < argc - 1)
+ str.str(argv[++i]);
+
+ str >> CHIP_CUTOFF;
+ }
+ else if(buf == "-d_cutoff")
+ {
+ istringstream str;
+
+ if(i < argc - 1)
+ str.str(argv[++i]);
+
+ str >> DISTANCE_CUTOFF;
+ }
+ else if(buf == "-v")
+ {
+ VERBOSE = true;
+ continue;
+ }*/
+ else if(buf == "-bi")
+ {
+ BUILD_INDEX = true;
+ continue;
+ }
+/* else if(buf == "-drawbg")
+ {
+ DRAW_BACKGROUND = true;
+ continue;
+ }
+ else if(buf == "-noheader")
+ {
+ NO_HEADER = true;
+ continue;
+ }
+ else if(buf == "-cv")
+ {
+ SHOW_CONSERVATION_VECTOR = true;
+ continue;
+ }
+ else if(buf == "-img")
+ {
+ IMG_OUTPUT = true;
+ continue;
+ }
+ else if(buf == "-zv")
+ {
+ ZVECTOR_OUTPUT = true;
+ continue;
+ }*/
+ else if(buf == "-ss")
+ {
+ DOUBLE_STRAND = false;
+ continue;
+ }
+ else if(buf == "-rs")
+ {
+ DOUBLE_STRAND = false;
+ REVERSE_STRAND = true;
+ }
+/*
+ else if(buf == "-b")
+ {
+ SHOW_BINS_BINOM = true;
+ continue;
+ }
+ else if(buf == "-bed")
+ {
+ BED_OUTPUT = true;
+ continue;
+ }
+ else if(buf == "-bins")
+ {
+ BINS = true;
+ continue;
+ }
+ else if(buf == "--complete" || buf == "-c")
+ {
+ COMPLETE_MATRIX = true;
+ continue;
+ }
+ else if(buf == "-n")
+ {
+ USE_N = true;
+ continue;
+ }*/
+ else if(buf == "-g")
+ {
+ GENERATE_MATRIX = true;
+ continue;
+ }
+/*
+ else if(buf == "-sum")
+ {
+ USE_SUM = true;
+ continue;
+ }*/
+ else if(buf == "-trashn")
+ {
+ TRASH_N = true;
+ continue;
+ }
+/* else if(buf == "-corr")
+ {
+// CORRELATION = true;
+ PEARSON = true;
+ continue;
+ }*/
+ else
+ {
+ cerr << "\nBad argument: " << buf << endl << endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ if(SEQ_EDGE_START && SEQ_EDGE_END)
+ error_handler(TOO_MANY_EDGES,"");
+
+ return;
+}
+
+
+vector<tfbs> v_tfbs_builder(vector<string> f_names)
+{
+ vector<tfbs> TFBS;
+
+ cerr << "\nReading matrix file(s)... ";
+
+ for(int i = 0; i < (int)f_names.size(); i++)
+ {
+// if(f_names[i].find(".pro") != string::npos)
+ if(f_names[i].substr(0,f_names[i].find(".pro")).size() != f_names[i].size() ||
+ (usethismatrix.size() != 0 && f_names[i].substr(0,f_names[i].find(".pfm")).size() == f_names[i].size()))
+ {
+ tfbs one_more_tfbs;
+
+ if(!one_more_tfbs.assign(f_names[i].c_str()))
+ error_handler(BAD_MATRIX_FILE,f_names[i]);
+ else
+ {
+ TFBS.push_back(one_more_tfbs);
+ continue;
+ }
+ }
+
+ }
+
+ for(int i = 0; i < (int)f_names.size(); i++)
+ {
+// if(f_names[i].find(".pfm") != string::npos) // gives problems with -O3 option using g++. DO NOT CHANGE THE BELOW IF STATEMENT!
+ if(f_names[i].substr(0,f_names[i].find(".pfm")).size() != f_names[i].size())
+ {
+ tfbs one_more_tfbs;
+
+ if(!one_more_tfbs.j_assign(f_names[i].c_str()))
+ error_handler(BAD_MATRIX_FILE,f_names[i]);
+ else
+ {
+ TFBS.push_back(one_more_tfbs);
+ continue;
+ }
+ }
+ }
+
+ //NUOVO CODICE
+ for(int i = 0; i < (int)f_names.size(); i++)
+ {
+ if(f_names[i].substr(0,f_names[i].find(".w2")).size() != f_names[i].size())
+ {
+ vector<vector<string> > w2_mat = w2_mat_reader(f_names[i]);
+
+ for(int j = 0; j < w2_mat.size(); j++)
+ {
+
+ tfbs one_more_tfbs;
+
+ if(!one_more_tfbs.w2_assign(w2_mat[j]))
+ error_handler(BAD_MATRIX_FILE,f_names[i]);
+ else
+ {
+ TFBS.push_back(one_more_tfbs);
+ continue;
+ }
+ }
+ }
+ }
+
+ for(int i = 0; i < (int)f_names.size(); i++)
+ {
+ if(f_names[i].substr(0,f_names[i].find(".wil")).size() != f_names[i].size())
+ {
+ vector<vector<string> > wil_mat = wil_mat_reader(f_names[i]);
+
+ for(int j = 0; j < wil_mat.size(); j++)
+ {
+ tfbs one_more_tfbs;
+
+ if(!one_more_tfbs.w2_assign(wil_mat[j]))
+ error_handler(BAD_MATRIX_FILE,f_names[i]);
+ else
+ {
+ TFBS.push_back(one_more_tfbs);
+ continue;
+ }
+ }
+ }
+ }
+
+ //END
+
+ #ifdef BOUND_OLIGOS
+ if(!bound_oligos_file.empty())
+ v_tfbs_add_bof(&TFBS);
+ #endif
+
+ if(SHOW_CONSERVATION_VECTOR)
+ {
+ cout << "Printing matrix conservation vectors..." << endl << show_conservation(&TFBS);
+ exit(EXIT_SUCCESS);
+ }
+
+ cerr << TFBS.size() << " matrices acquired.\n";
+
+ if(TFBS.size() == 0)
+ error_handler(NO_TFBS,"");
+
+ #ifdef DEBUG1
+ for(int db0 = 0; db0 < TFBS.size(); db0++)
+ cerr << "\nTFBS[" << db0 << "].names = " << TFBS[db0].name() << '\t' << "ID = " << TFBS[db0].id();
+ cerr << endl;
+ #endif
+
+ return TFBS;
+}
+
+#ifdef BOUND_OLIGOS
+void v_tfbs_add_bof(vector<tfbs> *TFBS)
+{
+ ifstream in(bound_oligos_file.c_str());
+
+ if(!in)
+ {
+ error_handler(NO_BOF,bound_oligos_file);
+ return;
+ }
+
+ vector<vector< string> > BO;
+ vector<string> bo;
+ string line;
+ short int count = 0;
+
+ while(getline(in,line))
+ {
+ if(line[0] == '>' && !bo.empty())
+ {
+ BO.push_back(bo);
+ bo.clear();
+ bo.push_back(line);
+ }
+ else
+ bo.push_back(line);
+ }
+
+ BO.push_back(bo);
+
+ for(int i = 0; i < BO.size(); i++)
+ {
+ tfbs one_more_tfbs;
+
+ if(!one_more_tfbs.w2_assign(one_more_tfbs.bo.assign(BO[i])))
+ error_handler(BAD_MATRIX_FILE,bound_oligos_file);
+ else
+ {
+ TFBS->push_back(one_more_tfbs);
+ count++;
+ continue;
+ }
+ }
+
+ in.close();
+
+ return;
+}
+#endif
+
+vector<sequence> v_sequence_builder(vector<string> f_names, bool q)
+{
+ vector<sequence> SEQUENCE;
+ vector<string> idlist;
+ unsigned int original_seq_num;
+
+ if(!q && !BUILD_INDEX)
+ cerr << "\nReading background sequence file... ";
+ else if(!q && BUILD_INDEX)
+ cerr << "\nBuilding index... ";
+ else
+ cerr << "\nReading query file... ";
+
+ if(q && !q_idlist.empty())
+ {
+ ifstream in(q_idlist.c_str());
+ string line;
+
+ while(getline(in,line))
+ {
+ if(!line.empty())
+ {
+ istringstream sl(line);
+ sl >> line;
+
+ idlist.push_back(line);
+ }
+ }
+
+ in.close();
+ }
+
+ if(q)
+ {
+ if(usethismatrix.size() > 0 || usethismatrixname.size() > 0 || distance_analysis_matrix.size() > 0)
+ {
+ if(promfile.size() == 0)
+ POSITIONAL = true;
+
+ USE_N = true;
+ }
+ }
+
+ for(int i = 0; i < (int)f_names.size(); i++)
+ {
+ vector<string> seqs;
+ vector<bool> used;
+ map<string,int> index;
+
+ seqs = fasta_reader(f_names[i].c_str());
+
+ original_seq_num = seqs.size();
+
+ if(!index_file.empty())
+ {
+ string iline;
+ // used.assign(seqs.size(),false);
+ unsigned int MAX_I = 0;
+
+ ifstream indexin(index_file.c_str());
+
+ if(!indexin)
+ error_handler(NO_INDEX_FOUND,index_file);
+
+ while(getline(indexin,iline))
+ {
+ if(iline.empty())
+ continue;
+
+ istringstream istr(iline);
+
+ string name;
+ int I;
+
+ istr >> name >> I;
+
+ index[name] = I;
+
+ if(I > MAX_I)
+ MAX_I = I;
+ }
+
+ indexin.close();
+ used.assign(MAX_I + 1,false);
+ }
+
+ for(int t = 0; t < (int)seqs.size(); t++)
+ {
+ sequence one_more_sequence;
+
+ #ifdef DEBUG4
+ cout << endl << seqs[t] << endl;
+ #endif
+
+ if(!one_more_sequence.assign(seqs[t]))
+ error_handler(BAD_FASTA_SEQUENCE,seqs[t]);
+ else
+ {
+ if(REVERSE_STRAND)
+ one_more_sequence.reverse();
+
+ if(!q || q_idlist.empty() || qlist_check(one_more_sequence.get_id(), &idlist) || !distance_analysis_matrix.empty())
+ {
+ if(index_file.empty() /* || (q && q_idlist.empty())*/ || (POSITIONAL && distance_analysis_matrix.empty()))
+ SEQUENCE.push_back(one_more_sequence);
+ else
+ {
+ string ibuf;
+ istringstream bstr(one_more_sequence.name());
+
+ bstr >> ibuf;
+
+ if(!used[index[ibuf]])
+ {
+ SEQUENCE.push_back(one_more_sequence);
+ used[index[ibuf]] = true;
+ }
+ else
+ continue;
+ }
+ }
+ }
+ }
+ }
+
+ if(SEQUENCE.size() == 0)
+ error_handler(NO_SEQUENCE,"");
+
+ if(!q)
+ {
+ bool rflag = false;
+
+ if(BUILD_INDEX)
+ {
+ index_file = build_sequence_index(&SEQUENCE);
+
+ cerr << "done\n";
+ BUILD_INDEX = false;
+
+ SEQUENCE.clear();
+
+ SEQUENCE = v_sequence_builder(f_names,q);
+ rflag = true;
+ }
+
+ if(!rflag)
+ {
+ if(index_file.empty())
+ cerr << SEQUENCE.size() << " background sequences acquired." << endl;
+ else
+ cerr << SEQUENCE.size() << " background sequences acquired out of " << original_seq_num << endl;
+ }
+ }
+ else
+ {
+ if(index_file.empty())
+ cerr << SEQUENCE.size() << " query sequences acquired." << endl;
+ else
+ cerr << SEQUENCE.size() << " query sequences acquired out of " << original_seq_num << endl;
+
+/* if(usethismatrix.size() > 0 || usethismatrixname.size() > 0)
+ {
+ bool flag = true;
+
+ if(flag)
+ {
+ if(promfile.size() == 0)
+ POSITIONAL = true;
+ USE_N = true;
+ }
+ }*/
+ }
+
+ #ifdef DEBUG5
+ for(int db0 = 0; db0 < (int)SEQUENCE.size(); db0++)
+ cout << endl << SEQUENCE[db0].name() << endl << SEQUENCE[db0].seq() << endl;
+ #endif
+
+ return SEQUENCE;
+}
+
+bool qlist_check(string id, vector<string> *v)
+{
+ for(int i = 0; i < v->size(); i++)
+ if(v->at(i) == id)
+ return true;
+
+ return false;
+}
+
+
+vector<big_sequence> v_big_sequence_builder(vector<string> f_names, bool q)
+{
+ vector<big_sequence> BIG_SEQUENCE;
+
+ if(!q)
+ cerr << "\nReading background sequence file... ";
+ else
+ cerr << "\nReading query file... ";
+
+ for(int i = 0; i < (int)f_names.size(); i++)
+ {
+ vector<string> seqs;
+
+ seqs = fasta_reader(f_names[i].c_str());
+
+ for(int t = 0; t < (int)seqs.size(); t++)
+ {
+ big_sequence one_more_sequence;
+
+ #ifdef DEBUG4
+ cout << endl << seqs[t] << endl;
+ #endif
+
+ if(!one_more_sequence.assign(seqs[t]))
+ error_handler(BAD_FASTA_SEQUENCE,seqs[t]);
+ else
+ {
+ if(one_more_sequence.seq().size() <= BIG_WINDOW)
+ error_handler(SEQUENCE_SHORTER_THAN_WINDOW,one_more_sequence.name());
+
+ BIG_SEQUENCE.push_back(one_more_sequence);
+ }
+ }
+ }
+
+ if(BIG_SEQUENCE.size() == 0)
+ error_handler(NO_SEQUENCE,"");
+
+ if(!q)
+ cerr << BIG_SEQUENCE.size() << " background sequences acquired." << endl;
+ else
+ cerr << BIG_SEQUENCE.size() << " query sequences acquired." << endl;
+
+ return BIG_SEQUENCE;
+}
+
+vector<string> fasta_reader(const char *f_name)
+{
+ ifstream in(f_name);
+ string line,buf;
+ vector<string> seqs;
+ bool flag = false;
+
+ if(!in)
+ {
+ string tmp = f_name;
+ error_handler(MISSING_FASTA_FILE, tmp);
+ }
+
+ while(getline(in,line))
+ {
+ if(line[0] == '>' && flag)
+ {
+ seqs.push_back(buf);
+ buf.clear();
+ buf += line;
+ buf += '\n';
+ }
+
+ else if(line[0] == '>' && !flag)
+ {
+ buf += line;
+ buf += '\n';
+ flag = true;
+ }
+
+ else
+ {
+ buf += line;
+ buf += '\n';
+ }
+ }
+
+ seqs.push_back(buf);
+ buf.clear();
+
+ in.close();
+
+ return seqs;
+}
+
+
+vector<string> get_file_names(const char *extension)
+{
+ vector<string> f_names;
+ string tmpfile_1 = ".pscan.tmp";
+ ostringstream str1,str2;
+ ifstream in;
+ string line;
+
+ if(usethismatrix.size() > 0)
+ {
+ f_names.clear();
+ f_names.push_back(usethismatrix);
+ return f_names;
+ }
+
+ if(w2file.size() > 0)
+ {
+ f_names.clear();
+ f_names.push_back(w2file);
+ return f_names;
+ }
+
+ if(fasta_matrix_file.size() > 0)
+ {
+ f_names.clear();
+ f_names.push_back(fasta_matrix_file);
+
+ ostringstream d2u;
+
+ d2u << "dos2unix " << fasta_matrix_file;
+
+ system(d2u.str().c_str());
+
+ return f_names;
+ }
+
+ if(matrixlist.size() > 0)
+ tmpfile_1 = matrixlist;
+ else
+ {
+ str1 << "ls -1 " << extension << " >" << tmpfile_1 << " 2> .tmp";
+ system(str1.str().c_str());
+ }
+
+ in.open(tmpfile_1.c_str());
+
+ if(!in)
+ {
+ string ext = extension;
+ error_handler(NO_FILES,ext);
+ }
+
+ while(getline(in,line))
+ f_names.push_back(line);
+
+ in.close();
+
+ if(matrixlist.size() == 0)
+ {
+ str2 << "rm -f " << tmpfile_1;
+ system(str2.str().c_str());
+ system("rm -f .tmp");
+ }
+
+ return f_names;
+}
+
+void error_handler(int error, string s_err)
+{
+ cerr << endl;
+
+ switch(error)
+ {
+ case NO_FILES:
+ {
+ cerr << "\nI can't find " << s_err << "files in this directory...\n";
+ break;
+ }
+ case BAD_MATRIX_FILE:
+ {
+ cerr << "\nBad matrix file: " << s_err << "...skipping.\n";
+ return;
+ break;
+ }
+ case MATRIX_NOT_ALLOCATED:
+ {
+ cerr << "\nMatrix not yet allocated...\n";
+ return;
+ break;
+ }
+ case MISSING_FASTA_FILE:
+ {
+ cerr << "\nMmmh.. i was sure " << s_err << " was here just a moment ago...\n";
+ break;
+ }
+ case BAD_FASTA_SEQUENCE:
+ {
+ if(VERBOSE)
+ cerr << "\nI think this sequence has some problem, i'm skipping it...\n" << s_err << endl;
+ else
+ cerr << "\nI think this sequence has some problem, i'm skipping it...\n" << s_err.substr(0,s_err.find("\n")) << endl;
+ return;
+ break;
+ }
+ case NO_MATRIX_FILE:
+ {
+ cerr << "\nI can't find " << matrixfile << endl;
+ break;
+ }
+ case BAD_MATRIX:
+ {
+ cerr << "\nMatrix file / matrices number mismatch!\n" << endl;
+ break;
+ }
+ case NO_JASPAR_MATRIX_LIST:
+ {
+ cerr << "\nCan't find Jaspar's matrix list file: \"matrix_list.txt\"." << endl;
+ break;
+ }
+ case JASPAR_FILE_NOT_IN_LIST:
+ {
+ cerr << "\nI can not find " << s_err << " in Jaspar's matrix list file." << endl;
+ break;
+ }
+ case CANT_COMPUTE_SCORES:
+ {
+ cerr << "\nI cant compute scores for one or more of your sequence files, try using \"-N\" option" << endl;
+ break;
+ }
+ case READING_OUT_OF_MATRIX:
+ {
+ cerr << "\nA function is reading out of matrix, some big trouble ahead...\n";
+ return;
+ break;
+ }
+ case NEED_COMPLETE_MATRIX:
+ {
+ cerr << "\nSorry, I need a complete matrix to do correlations...\n";
+ break;
+ }
+ case NO_BACKGROUND_AVAILABLE:
+ {
+ cerr << "\nSorry, you should input a background set of sequence/matrix\n";
+ break;
+ }
+ case NO_SEQUENCE:
+ {
+ cerr << "\nThere is not sequences to work on...\n";
+ break;
+ }
+ case NO_TFBS:
+ {
+ cerr << "\nI need at least one TFBS matrix to work...\n";
+ break;
+ }
+ case NO_BOF:
+ {
+ cerr << s_err << " not found." << endl;
+ break;
+ }
+ case BAD_FILE_FORMAT:
+ {
+ cerr << "\nWeird things in " << s_err << "...\n";
+ return;
+ break;
+ }
+ case CANT_DO_POSITIONAL:
+ {
+ cerr << "\nI cant do positional analysis because " << s_err << endl;
+ return;
+ break;
+ }
+ case SEQUENCE_SHORTER_THAN_WINDOW:
+ {
+ cerr << "\nSequence " << s_err << " appears to be shorter than window scan size; use -w to reduce window size.\n";
+ break;
+ }
+ case BAD_BIG_VALUES:
+ {
+ cerr << "\nWrong window or step size...\n";
+ break;
+ }
+ case CANTOPENOUTPUT:
+ {
+ cerr << "\nCan't open output file: " << s_err << endl;
+ break;
+ }
+ case TOO_MANY_EDGES:
+ {
+ cerr << "\nYou have to take a decision... -start or -stop ??" << endl;
+ break;
+ }
+ case NO_INDEX_FOUND:
+ {
+ cerr << "\nNo index file found: " << s_err << endl;
+ break;
+ }
+ case EVEN_PROBE:
+ {
+ cerr << "\nWarning: invalid (or even) value for \"-probe\": using default (3).\n";
+ PROBE = 3;
+ break;
+ return;
+ }
+ case MISSING_CHIP_FILE:
+ {
+ cerr << "\nMissing chip file: " << s_err << endl;
+ break;
+ }
+
+ case INCONSISTENT_CHIP:
+ {
+ cerr << "\nChip files inconsistency detected at line: " << s_err << endl;
+ break;
+ }
+ case NO_MATRIX_FOR_DISTANCE_ANALYSIS:
+ {
+ cerr << "\nCan't find " << s_err << " matrix. Distance analysis aborted." << endl;
+ break;
+ }
+
+ default:
+ {
+ cerr << "\nSome weird error occurred...\n";
+ break;
+ }
+ }
+
+ cerr << endl;
+
+ exit(error);
+}
+
+void generate_output_matrix(vector<tfbs> *TFBS, vector<sequence> *SEQUENCE)
+{
+ if(COMPLETE_MATRIX)
+ {
+ ofstream out;
+ if((int)promfile.size() == 0)
+ out.open("pscan.complete_matrix");
+
+ else
+ {
+ string buf = promfile.substr(0,promfile.find("."));
+ buf += ".complete_matrix";
+ out.open(buf.c_str());
+ }
+
+ if(!out)
+ error_handler(CANTOPENOUTPUT, "");
+
+ int s_indx = 0;
+
+ cerr << "\nWriting matrix output...";
+
+ out << '\t' << '\t' << '\t' << '\t';
+
+ for(vector<tfbs>::iterator i = TFBS->begin(); i != TFBS->end(); i++)
+ out << i->name().substr(0,i->name().find(" ")) << '\t';
+
+ out << endl << "MAX" << '\t' << '\t' << '\t' << '\t';
+
+ for(vector<tfbs>::iterator i = TFBS->begin(); i != TFBS->end(); i++)
+ out << i->max() << '\t';
+
+ out << endl << "MIN" << '\t' << '\t' << '\t' << '\t';
+
+ for(vector<tfbs>::iterator i = TFBS->begin(); i != TFBS->end(); i++)
+ out << i->min() << '\t';
+
+ out << endl;
+
+ for(vector<sequence>::iterator s = SEQUENCE->begin(); s != SEQUENCE->end(); s++)
+ {
+ out << s->w_name();
+ vector<tfbs>::iterator ti = TFBS->begin();
+
+ for(int t = 0; t < (int)TFBS->size(); t++)
+ {
+ out << '\t' << ti->row_get(s_indx);
+ ti++;
+ }
+
+ out << endl;
+ s_indx++;
+ }
+
+ cerr << "done\n";
+
+ out.close();
+ }
+
+ else
+ {
+ cerr << "\nWriting short matrix output...";
+
+ ofstream out;
+
+ if((int)promfile.size() == 0)
+ out.open("pscan.short_matrix");
+
+ else
+ {
+ string buf = promfile.substr(0,promfile.find(".fasta"));
+
+ if(buf.empty())
+ buf = promfile;
+
+ if(!fasta_matrix_file.empty() && !q_idlist.empty())
+ buf = q_idlist;
+
+ buf += ".short_matrix";
+ out.open(buf.c_str());
+ }
+
+ if(!out)
+ error_handler(CANTOPENOUTPUT,"");
+
+ out << "[SHORT TFBS MATRIX]" << endl;
+
+ for(vector<tfbs>::iterator i = TFBS->begin(); i != TFBS->end(); i++)
+ out << i->id() << '\t' << i->row_size() << '\t' << i->row_avg_get() << '\t' << i->row_stddev_get()
+ << '\t' << i->show_bns_line() << endl;
+
+ out.close();
+
+ cerr << "done\n";
+ }
+
+ return;
+}
+
+void generate_tfbs_rows(vector<tfbs> *TFBS, vector<sequence> *SEQUENCE, bool qmode)
+{
+ for(vector<sequence>::iterator s = SEQUENCE->begin(); s != SEQUENCE->end(); s++)
+ {
+ vector<tfbs>::iterator ti = TFBS->begin();
+
+ for(int t = 0; t < (int)TFBS->size(); t++)
+ {
+ ti->row_push(1-fabs(s->rel(t)));
+ ti++;
+ }
+ }
+
+ for(vector<tfbs>::iterator j = TFBS->begin(); j != TFBS->end(); j++)
+ {
+ j->gen_bns();
+ j->row_average();
+ j->row_stddev();
+
+// if(IMG_OUTPUT && !qmode && DRAW_BACKGROUND)
+// j->draw_background();
+ }
+
+ return;
+}
+
+/*void generate_tfbs_rows(vector<tfbs> *TFBS, vector<big_sequence> *BIG_SEQUENCE)
+{
+ vector<tfbs>::iterator t_iv = TFBS->begin();
+ int count = 0;
+
+ while(t_iv != TFBS->end())
+ {
+ vector<big_sequence>::iterator s_iv = BIG_SEQUENCE->begin();
+
+ while(s_iv != BIG_SEQUENCE->end())
+ {
+ t_iv->row_v_push(s_iv->row_return(count));
+ s_iv++;
+ }
+
+ count++;
+ t_iv++;
+ }
+
+ for(vector<tfbs>::iterator j = TFBS->begin(); j != TFBS->end(); j++)
+ {
+ j->gen_bns();
+ j->row_average();
+ j->row_stddev();
+ }
+
+ return;
+}*/
+
+void generate_tfbs_rows_from_matrix(vector<tfbs> *TFBS)
+{
+ ifstream in(matrixfile.c_str());
+
+ if(!in)
+ error_handler(NO_MATRIX_FILE,"");
+
+ cerr << "\nReading matrix file...";
+
+ string line;
+
+ while(getline(in,line))
+ {
+ if(line.find("[SHORT TFBS MATRIX]") != string::npos)
+ {
+ generate_tfbs_rows_from_short_matrix(TFBS,&in);
+ in.close();
+ SHORT = true;
+ return;
+ }
+
+ if(line[0] != '>')
+ continue;
+
+ istringstream str(line);
+ string buf;
+
+ str >> buf;
+
+ vector<tfbs>::iterator i = TFBS->begin();
+
+ while(str)
+ {
+ m_point v;
+ str >> v;
+
+ if(!str)
+ break;
+
+ if(i >= TFBS->end())
+ error_handler(BAD_MATRIX,"");
+
+ i->row_push(v);
+
+ i++;
+ }
+
+ if(i != TFBS->end())
+ error_handler(BAD_MATRIX,"");
+ }
+
+ cerr << "done\n";
+
+ for(vector<tfbs>::iterator j = TFBS->begin(); j != TFBS->end(); j++)
+ {
+ j->gen_bns();
+ j->row_average();
+ j->row_stddev();
+
+// if(IMG_OUTPUT && DRAW_BACKGROUND)
+ // j->draw_background();
+ }
+
+ return;
+}
+
+void generate_tfbs_rows_from_short_matrix(vector<tfbs> *TFBS, ifstream *in)
+{
+ string line;
+ vector<tfbs>::iterator i = TFBS->begin();
+
+ while(getline(*in,line))
+ {
+ if(line.size() != 0)
+ {
+ if(i >= TFBS->end())
+ error_handler(BAD_MATRIX,"");
+
+ if(i->assign_short_matrix(line))
+ {
+ // if(IMG_OUTPUT && DRAW_BACKGROUND)
+ // i->draw_background();
+
+ i++;
+ }
+ else
+ error_handler(BAD_MATRIX,"");
+ }
+ }
+
+ if(i != TFBS->end())
+ error_handler(BAD_MATRIX,"");
+
+ return;
+}
+
+
+
+void generate_output_bins(vector<tfbs> *TFBS, vector<sequence> *SEQUENCE)
+{
+
+ ofstream out("pscan_bins.txt");
+ cerr << "\nWriting bins output...";
+
+ if(!out)
+ error_handler(CANTOPENOUTPUT, "pscan_bins.txt");
+
+ for(vector<tfbs>::iterator j = TFBS->begin(); j != TFBS->end(); j++)
+ out << endl << j->name() << endl << j->show_bns() << "TOT\t" << j->get_bin_sum() << endl;
+
+ out.close();
+
+ cerr << "done\n";
+
+ return;
+}
+
+
+void bins_gen()
+{
+ m_point c = 0;
+
+ while(c < 1)
+ {
+ bins.push_back(c);
+ c += step;
+ }
+
+ bins.push_back(1);
+ bins.push_back(1);
+
+ return;
+}
+
+void generate_output_big_sequence(vector<tfbs> *TFBS, vector<big_sequence> *BIG_SEQUENCE)
+{
+ string outfile = promfile;
+
+ outfile += ".res";
+
+ if(trackbg.size())
+ outfile += ".bed";
+
+ ofstream out(outfile.c_str());
+
+ if(!out)
+ error_handler(CANTOPENOUTPUT, outfile);
+
+ vector<big_sequence>::iterator s_iv = BIG_SEQUENCE->begin();
+
+ if(trackbg.size())
+ {
+ while(s_iv != BIG_SEQUENCE->end())
+ {
+ vector<tfbs>::iterator t_iv = TFBS->begin();
+ int count = 0;
+
+ while(t_iv != TFBS->end())
+ {
+ out << s_iv->name() << '\t' << t_iv->name() << endl << s_iv->track_output(count) << endl << endl;
+ count++;
+ t_iv++;
+ }
+
+ s_iv++;
+ }
+ }
+ else
+ {
+ while(s_iv != BIG_SEQUENCE->end())
+ {
+ vector<tfbs>::iterator t_iv = TFBS->begin();
+
+ out << s_iv->name().substr(0,30) << endl << '\t' << '\t';
+
+ while(t_iv != TFBS->end())
+ {
+ out << t_iv->name() << '\t';
+ t_iv++;
+ }
+
+ out << endl;
+
+ out << s_iv->inline_output() << endl;
+
+ out << endl;
+
+ s_iv++;
+ }
+ }
+
+
+ cerr << endl;
+
+ exit(EXIT_SUCCESS);
+}
+
+vector<vector<string > > w2_mat_reader(string file)
+{
+ ifstream in(file.c_str());
+ vector<vector<string> > vvs;
+ vector<string> vs;
+
+ string line;
+
+ while(getline(in,line))
+ {
+ if(line.find("Matrix") == string::npos)
+ continue;
+ else
+ {
+ vs.push_back(line);
+
+ while(getline(in,line) && line[0] != '*')
+ vs.push_back(line);
+
+ vvs.push_back(vs);
+
+ vs.clear();
+ }
+ }
+
+ return vvs;
+}
+
+vector<vector<string > > wil_mat_reader(string file)
+{
+ ifstream in(file.c_str());
+ vector<vector<string> > vvs;
+ vector<string> vs;
+
+ string line;
+
+ if(q_idlist.empty())
+ {
+ while(getline(in,line))
+ {
+ if(line.find(">") != string::npos && (usethismatrixname.size() == 0 || line.find(usethismatrixname) != string::npos))
+ {
+ vs.push_back(line);
+
+ for(int i = 0; i < ALPHABET_SIZE; i++)
+ {
+ getline(in,line);
+
+ if(line.size())
+ vs.push_back(line);
+ }
+
+ vvs.push_back(vs);
+
+ vs.clear();
+ }
+
+ }
+ }
+ else
+ {
+ while(getline(in,line))
+ {
+ if(line.find(">") != string::npos && (usethismatrixname.size() == 0 || line.substr(1) == usethismatrixname))
+ {
+ vs.push_back(line);
+
+ for(int i = 0; i < ALPHABET_SIZE; i++)
+ {
+ getline(in,line);
+
+ if(line.size())
+ vs.push_back(line);
+ }
+
+ vvs.push_back(vs);
+
+ vs.clear();
+ }
+
+ }
+ }
+
+/* cerr << endl;
+ for(int i = 0; i < vvs.size(); i++)
+ for(int j = 0; j < vvs[i].size(); j++)
+ cerr << vvs[i][j] << endl;
+*/
+ return vvs;
+}
+
+void min_seq_length_fixer(vector<tfbs> *TFBS)
+{
+ int min = 0;
+
+ for(int i = 0; i < TFBS->size(); i++)
+ {
+ if(TFBS->at(i).m_length() > min)
+ min = TFBS->at(i).m_length();
+ }
+
+ if(MIN_SEQ_LENGTH < min)
+ MIN_SEQ_LENGTH = min;
+
+ if(SEQ_EDGE_START != 0 && SEQ_EDGE_START < min)
+ SEQ_EDGE_START = min;
+
+ if(SEQ_EDGE_END != 0 && SEQ_EDGE_END < min)
+ SEQ_EDGE_END = min;
+
+ if(VERBOSE)
+ cerr << endl << "Minimum length for sequences to be processed: " << min << endl;
+
+ return;
+}
+
+void idfile_handler(vector<sequence> *SEQUENCE)
+{
+ vector<string> ids;
+ vector<sequence> query;
+ string line;
+ ifstream in(idfile.c_str());
+
+ if(!in)
+ error_handler(MISSING_FASTA_FILE,idfile);
+
+ while(getline(in,line))
+ if(line[0] != '#' && !line.empty())
+ ids.push_back(line);
+
+ if((int)ids.size() >= SWITCH_TO_Z_VALUE)
+ {
+ USE_T = true;
+ cerr << "Using t-test... " << endl;
+ }
+
+ cerr << "Building query file..." << endl;
+
+ for(vector<sequence>::iterator si = SEQUENCE->begin(); si < SEQUENCE->end(); si++)
+ {
+ if(id_check(&ids, si))
+ {
+ query.push_back(*si);
+
+ if(USE_T)
+ {
+ SEQUENCE->erase(si);
+ si--;
+ }
+ }
+ }
+
+ in.close();
+
+ cerr << "Background sequences: " << SEQUENCE->size() << "\tForegroung sequences: " << query.size() << endl;
+
+ queryfile = idfile;
+
+ queryfile += ".fasta";
+
+ ofstream out(queryfile.c_str());
+
+ for(int i = 0; i < query.size(); i++)
+ out << query[i].name() << endl << query[i].seq() << endl;
+
+ out.close();
+
+ return;
+}
+
+bool id_check(vector<string> *ids, vector<sequence>::iterator si)
+{
+ for(int i = 0; i < (int)ids->size(); i++)
+ {
+ if(si->name().find(ids->at(i)) != string::npos)
+ {
+ // cerr << si->name() << endl;
+
+ if(si->name().find(ids->at(i)) + ids->at(i).size() == si->name().size())
+ return true;
+
+ if(si->name().at(si->name().find(ids->at(i)) + ids->at(i).size()) == ' ')
+ return true;
+ }
+ }
+
+ return false;
+}
+
+string show_conservation(vector<tfbs> *TFBS)
+{
+ int max_l = 0;
+ ostringstream ss;
+
+ for(int i = 0; i < (int)TFBS->size(); i++)
+ {
+ if(TFBS->at(i).m_length() > max_l)
+ max_l = TFBS->at(i).m_length();
+ }
+
+ for(int i = 0; i < (int)TFBS->size(); i++)
+ ss << TFBS->at(i).show_conservation_vector(max_l - TFBS->at(i).m_length()) << endl;
+
+ return ss.str();
+}
+
+string build_sequence_index(vector<sequence> *SEQUENCE)
+{
+ string fname = promfile;
+ fname += ".index";
+
+ ofstream out(fname.c_str());
+
+ for(int i = 0; i < SEQUENCE->size(); i++)
+ {
+ string buf;
+ istringstream str(SEQUENCE->at(i).name());
+
+ str >> buf;
+
+ out << buf << '\t';
+
+ for(int j = 0; j <= i; j++)
+ {
+ if(SEQUENCE->at(i).seq() == SEQUENCE->at(j).seq())
+ {
+ out << j << endl;
+ break;
+ }
+ }
+ }
+
+ out.close();
+
+ return fname;
+}
+
+//START OF TFBS CLASS
+
+bool tfbs::assign(const char *f_name)
+{
+ ifstream in(f_name);
+ string line;
+ bool flag1 = false;
+
+ matrix_alloc = false;
+ norm_sec_step = false;
+
+ if(!in)
+ return false;
+
+ file_name = f_name;
+
+ while(getline(in,line))
+ {
+ if(line[0] == '#')
+ continue;
+
+ if(line.substr(0,2) == m_data_start_identifier)
+ {
+ while(line.substr(0,2) != m_start_identifier && line.substr(0,2) != m_start_identifier_for_free_transfac)
+ {
+ raw_data.push_back(line);
+ getline(in,line);
+ }
+
+ continue;
+ }
+
+ else
+ raw_matrix.push_back(line);
+ }
+
+ in.close();
+
+ if(!raw_tfbsdata_parser())
+ return false;
+
+/* while(getline(in,line))
+ {
+ if(line[0] == '#')
+ continue;
+
+ if(line.find(m_start_identifier) != string::npos && !flag1)
+ {
+ raw_data = line;
+
+ if(!raw_tfbsdata_parser())
+ {
+ in.close();
+ return false;
+ }
+ }
+
+ else if(!flag1)
+ {
+ flag1 = true;
+ getline(in,line);
+ }
+
+ if(flag1)
+ raw_matrix.push_back(line);
+ }*/
+
+
+// in.close();
+
+ if(!raw_matrix_parser())
+ return false;
+
+ else
+ {
+/* if(usethismatrix.size() > 0)
+ {
+ cerr << endl << AC << endl;
+ cerr << show_matrix();
+ cerr << endl << endl;
+ }*/
+
+ if(matrix_normalizer())
+ {
+ build_conservation_vector();
+ matrix_to_log();
+ matrix_average();
+ matrix_min_max();
+ #ifdef DEBUG3
+ cout << endl << show_raw_matrix() << endl << show_matrix() << endl << "Max = " << max() << endl << "Min = " << min()
+ << endl << endl;
+ #endif
+// if(SHOW_CONSERVATION_VECTOR)
+// cout << show_conservation_vector() << endl;
+ #ifdef MATRIX_TRANSLATOR
+ cout << matrix_translator();
+ #endif
+ return true;
+ }
+ else
+ return false;
+ }
+
+}
+
+bool tfbs::w2_assign(vector<string> w2_raw_mat)
+{
+ if(w2_raw_mat.empty())
+ return false;
+
+ j_raw_data = w2_raw_mat[0];
+ matrix_alloc = false;
+ norm_sec_step = false;
+
+ w2_raw_mat.erase(w2_raw_mat.begin());
+
+ if(fasta_matrix_file.size() == 0 && bound_oligos_file.size() == 0)
+ for(int i = 0; i < w2_raw_mat.size(); i++)
+ raw_matrix.push_back(w2_raw_mat[i].substr(3));
+ else
+ for(int i = 0; i < w2_raw_mat.size(); i++)
+ raw_matrix.push_back(w2_raw_mat[i]);
+
+ j_raw_tfbsdata_parser();
+
+ if(!j_raw_matrix_parser())
+ return false;
+
+ else
+ {
+// cout << show_matrix() << endl;
+ if(matrix_normalizer())
+ {
+ build_conservation_vector();
+ matrix_to_log();
+ matrix_min_max();
+ matrix_average();
+ // cout << show_matrix() << endl << endl;
+ // if(SHOW_CONSERVATION_VECTOR)
+ // cout << show_conservation_vector() << endl;
+ #ifdef DEBUG3
+ cout << endl << show_raw_matrix() << endl << show_matrix() << endl << "Max = " << max() << endl << "Min = " << min()
+ << endl << endl;
+ #endif
+ return true;
+ }
+ else
+ return false;
+ }
+
+ return true;
+
+}
+
+bool tfbs::j_assign(const char *f_name)
+{
+ ifstream in(f_name), mlist("matrix_list.txt");
+ string line, data;
+ bool flag1 = false;
+
+ matrix_alloc = false;
+ norm_sec_step = false;
+ file_name = f_name;
+
+ data = file_name.substr(0,file_name.find(".pfm"));
+
+ if(!in)
+ return false;
+
+ if(!mlist)
+ error_handler(NO_JASPAR_MATRIX_LIST,"");
+
+ while(getline(mlist,line))
+ {
+ if(line.find(data) != string::npos)
+ {
+ AC = data;
+ data = line;
+ break;
+ }
+ }
+
+ mlist.close();
+
+ if(data == file_name.substr(0,file_name.find(".pfm")))
+ {
+/* ostringstream dd;
+ dd << data.substr(data.find("_") + 1) << '\t' << 0 << '\t' << data.substr(0,data.find("_"));
+ data = dd.str();*/
+ error_handler(JASPAR_FILE_NOT_IN_LIST,file_name);
+ }
+
+ j_raw_data = data;
+
+ j_raw_tfbsdata_parser();
+
+ while(getline(in,line))
+ raw_matrix.push_back(line);
+
+ in.close();
+
+ if(!j_raw_matrix_parser())
+ return false;
+
+ else
+ {
+/* if(usethismatrix.size() > 0)
+ {
+ cerr << endl;
+ cerr << show_matrix();
+ cerr << endl << endl;
+ }*/
+
+ if(matrix_normalizer())
+ {
+ build_conservation_vector();
+ matrix_to_log();
+ matrix_min_max();
+ matrix_average();
+ // if(SHOW_CONSERVATION_VECTOR)
+ // cout << show_conservation_vector() << endl;
+ #ifdef DEBUG3
+ cout << endl << show_raw_matrix() << endl << show_matrix() << endl << "Max = " << max() << endl << "Min = " << min()
+ << endl << endl;
+ #endif
+ return true;
+ }
+ else
+ return false;
+ }
+
+ return true;
+}
+
+void tfbs::j_raw_tfbsdata_parser()
+{
+ istringstream s(j_raw_data);
+ string buf;
+
+ if(j_raw_data[0] == '>')
+ NAME = j_raw_data.substr(1);
+ else
+ s >> ID >> buf >> NAME;
+
+ ID = NAME;
+
+ if(AC.empty())
+ AC = NAME;
+
+ return;
+}
+
+void tfbs::row_push(m_point m)
+{
+ row.push_back(m);
+}
+
+void tfbs::row_v_push(vector<m_point> v)
+{
+ for(int i = 0; i < v.size(); i++)
+ row.push_back(v[i]);
+}
+
+m_point tfbs::row_get(int i)
+{
+ return row.at(i);
+}
+
+vector<m_point> tfbs::row_get()
+{
+ return row;
+}
+
+bool tfbs::raw_tfbsdata_parser()
+{
+/* istringstream str(raw_data);
+ string buf1;
+
+ if(!str)
+ return false;
+
+ str >> buf1 >> ID;
+
+ if(buf1 != m_start_identifier)
+ return false;
+
+ ID = ID.substr(0, ID.find(";") - 1);
+
+ buf1.clear();
+
+ NAME = raw_data.substr(raw_data.find(";") + 1);
+ buf1 = NAME;
+
+ NAME = NAME.substr(1,NAME.find(";") - 1);
+
+ SPECIES = buf1.substr(buf1.find("Species:"));
+
+ do
+ {
+ SPECIES.erase(SPECIES.find("Species:"),8);
+ }
+ while(SPECIES.find("Species:") != string::npos);*/
+
+ for(int i = 0; i < (int)raw_data.size(); i++)
+ {
+ istringstream str;
+
+ // cerr << endl << "rawdata[" << i << "] = " << raw_data[i] << endl;
+
+ if(raw_data[i][0] == 'I' && raw_data[i][1] == 'D')
+ {
+ str.str(raw_data[i]);
+ str >> ID >> ID;
+ continue;
+ }
+
+ if(raw_data[i][0] == 'N' && raw_data[i][1] == 'A')
+ {
+ str.str(raw_data[i]);
+ str >> NAME >> NAME;
+ continue;
+ }
+
+ if(raw_data[i][0] == 'A' && raw_data[i][1] == 'C')
+ {
+ str.str(raw_data[i]);
+ str >> AC >> AC;
+ continue;
+ }
+ }
+
+// cerr << endl << "NAME = " << NAME << endl << "ID = " << ID << endl << "AC = " << AC << endl;
+
+ if((int)ID.size() == 0 || (int)AC.size() == 0 || (int)NAME.size() == 0)
+ return false;
+
+ return true;
+}
+
+bool tfbs::raw_matrix_parser()
+{
+ #ifdef DEBUG2
+ cout << endl << show_raw_matrix();
+ #endif
+
+ x_size = 0;
+
+ for(int i = 0; i < raw_matrix.size(); i++)
+ if(raw_matrix[i].size() > 0)
+ x_size++;
+
+ if(!x_size)
+ return false;
+
+ #ifdef DEBUG2
+ cout << "\nx_size = " << x_size << endl;
+ #endif
+
+ matrix = new m_point*[x_size];
+
+ for(int a = 0; a < x_size; a++)
+ matrix[a] = new m_point[ALPHABET_SIZE];
+
+
+ for(int t = 0; t < x_size; t++)
+ {
+ if(raw_matrix[t].size() > 0)
+ {
+ istringstream str1(raw_matrix[t]);
+ string buf;
+
+ str1 >> buf;
+
+ for(int i = 0; i < ALPHABET_SIZE; i++)
+ str1 >> matrix[t][i];
+ }
+ }
+
+ matrix_alloc = true;
+
+ #ifdef DEBUG2
+ cout << endl << show_matrix();
+ #endif
+
+ return true;
+}
+
+bool tfbs::j_raw_matrix_parser()
+{
+ vector<vector <m_point> > mlines;
+
+ for(int i = 0; i < raw_matrix.size(); i++)
+ {
+ vector<m_point> mbuf;
+ istringstream s(raw_matrix[i]);
+
+ while(s)
+ {
+ m_point buf;
+ s >> buf;
+ if(!s)
+ break;
+
+ mbuf.push_back(buf);
+ }
+
+ mlines.push_back(mbuf);
+ }
+
+ x_size = mlines[0].size();
+
+ if(!x_size)
+ return false;
+
+ if(mlines.size() != ALPHABET_SIZE)
+ return false;
+
+ matrix = new m_point*[x_size];
+
+ for(int a = 0; a < x_size; a++)
+ matrix[a] = new m_point[ALPHABET_SIZE];
+
+ for(int q = 0; q < x_size; q++)
+ for(int w = 0; w < ALPHABET_SIZE; w++)
+ matrix[q][w] = mlines[w][q];
+
+ matrix_alloc = true;
+
+ return true;
+}
+
+string tfbs::show_matrix()
+{
+ if(!matrix_alloc)
+ {
+ error_handler(MATRIX_NOT_ALLOCATED,"");
+ return("");
+ }
+
+ ostringstream out;
+
+ for(int i = 0; i < ALPHABET_SIZE; i++)
+ {
+ out << endl << ALPHABET[i] << '\t';
+
+ for(int t = 0; t < x_size; t++)
+ out << matrix[t][i] << '\t';
+ }
+
+ return out.str();
+}
+
+void tfbs::build_conservation_vector()
+{
+ for(int t = 0; t < x_size; t++)
+ {
+ bool flag = false;
+
+ for(int i = 0; i < ALPHABET_SIZE; i++)
+ {
+ if(matrix[t][i] >= CONSERVATION_THRESHOLD)
+ flag = true;
+ }
+
+ v_conservation.push_back(flag);
+ }
+
+ return;
+}
+
+string tfbs::show_conservation_vector(int fs)
+{
+ ostringstream ss;
+
+ if(x_size%2)
+ fs --;
+
+ ss << NAME ;
+
+ for(int i = NAME.size(); i < 14; i++)
+ ss << ' ';
+
+ for(int i = 0; i < fs; i++)
+ ss << ' ';
+
+// ss << ' ';
+
+ for(int i = 0; i < v_conservation.size(); i++)
+ ss << v_conservation[i] << ' ';
+
+ return ss.str();
+}
+
+string tfbs::name()
+{
+ return NAME;
+}
+
+string tfbs::id()
+{
+ return ID;
+}
+
+string tfbs::file()
+{
+ return file_name;
+}
+
+string tfbs::matrix_id()
+{
+ if(file_name.find("_") != string::npos && file_name.find(".") != string::npos)
+ {
+ int off = (int)file_name.find(".") - (int)file_name.find("_") - 1;
+ return file_name.substr(file_name.find("_") + 1, off);
+ }
+ else
+ return file_name.substr(0,file_name.find("."));
+}
+
+string tfbs::ac()
+{
+ return AC;
+}
+
+string tfbs::show_raw_matrix()
+{
+ ostringstream out;
+
+ out << file_name << endl;
+ for(int i = 0; i < raw_matrix.size(); i++)
+ out << raw_matrix[i] << endl;
+
+ return out.str();
+}
+
+#ifdef MATRIX_TRANSLATOR
+string tfbs::matrix_translator()
+{
+ ostringstream out;
+
+ out << '>' << name() << endl;
+
+ for(int i = 0; i < raw_matrix.size(); i++)
+ {
+ istringstream in(raw_matrix[i]);
+
+ string trash, s1, s2, s3, s4;
+
+ in >> trash >> s1 >> s2 >> s3 >> s4 >> trash;
+
+ out << s1 << '\t' << s2 << '\t' << s3 << '\t' << s4 << endl;
+ }
+
+ return out.str();
+}
+#endif
+
+bool tfbs::matrix_normalizer()
+{
+ if(!matrix_alloc)
+ return false;
+
+ m_point *sums = new m_point[x_size];
+
+ for(int x = 0; x < x_size; x++)
+ {
+ m_point sum = 0;
+
+ for(int y = 0; y < ALPHABET_SIZE; y++)
+ sum += matrix[x][y];
+
+ sums[x] = sum;
+ }
+
+ for(int x = 0; x < x_size; x++)
+ for(int y = 0; y < ALPHABET_SIZE; y++)
+ matrix[x][y] /= sums[x];
+
+ delete[] sums;
+
+ if(!norm_sec_step)
+ {
+ for(int x = 0; x < x_size; x++)
+ for(int y = 0; y < ALPHABET_SIZE; y++)
+ matrix[x][y] += MIN_ADD_FREQ;
+
+ norm_sec_step = true;
+
+ matrix_normalizer();
+ }
+
+ return true;
+}
+
+void tfbs::matrix_to_log()
+{
+ for(int x = 0; x < x_size; x++)
+ for(int y = 0; y < ALPHABET_SIZE; y++)
+ matrix[x][y] = log(matrix[x][y]);
+
+ return;
+}
+
+void tfbs::matrix_min_max()
+{
+ max_score = 0;
+ min_score = 0;
+
+ for(int x = 0; x < x_size; x++)
+ {
+ m_point min = INF;
+
+ for(int y = 0; y < ALPHABET_SIZE; y++)
+ {
+ if(matrix[x][y] < min)
+ min = matrix[x][y];
+ }
+
+ min_score += min;
+ }
+
+ for(int x = 0; x < x_size; x++)
+ {
+ m_point max = -INF;
+
+ for(int y = 0; y < ALPHABET_SIZE; y++)
+ {
+ if(matrix[x][y] > max)
+ max = matrix[x][y];
+ }
+
+ max_score += max;
+
+ }
+
+ return;
+}
+
+m_point tfbs::max()
+{
+ return max_score;
+}
+
+m_point tfbs::min()
+{
+ return min_score;
+}
+
+int tfbs::m_length()
+{
+ return x_size;
+}
+
+m_point tfbs::get_value(int x, int y)
+{
+/* if(x >= x_size || y >= ALPHABET_SIZE || !matrix_alloc)
+ {
+ error_handler(READING_OUT_OF_MATRIX,"");
+ return 0;
+ }
+ else*/
+ return matrix[x][y];
+}
+
+m_point tfbs::get_matrix_average(int x)
+{
+ return mat_avg[x];
+}
+
+void tfbs::gen_bns()
+{
+ int i = 0;
+
+ while(i < bins.size() - 1)
+ {
+ int count = 0;
+
+ for(int c = 0; c < row.size(); c++)
+ if(row[c] > bins[i] && row[c] <= bins[i+1])
+ count++;
+
+ bns.push_back(count);
+
+ i++;
+ }
+
+ bns_sum();
+
+ return;
+}
+
+string tfbs::show_bns()
+{
+ ostringstream str;
+
+ for(int i = 0; i < bns.size(); i++)
+ str << bins[i] << '\t' << bns[i] << endl;
+
+ return str.str();
+}
+
+string tfbs::show_bns_line()
+{
+ ostringstream str;
+
+ for(int i = 0; i < bns.size(); i++)
+ str << bns[i] << '\t';
+
+ return str.str();
+}
+
+void tfbs::bns_sum()
+{
+ int sum = 0;
+
+ for(int i = 0; i < bns.size(); i++)
+ sum += bns[i];
+
+ bin_sum = sum;
+
+ return;
+}
+
+void tfbs::row_average()
+{
+ if(row.size() == 0)
+ error_handler(CANT_COMPUTE_SCORES,"");
+
+ avg_row = 0;
+
+ for(int i = 0; (int)i < row.size(); i++)
+ avg_row += row[i];
+
+ avg_row /= (int)row.size();
+
+ rowsize = (int)row.size();
+
+ return;
+}
+
+m_point tfbs::row_avg_get()
+{
+ return avg_row;
+}
+
+void tfbs::row_stddev()
+{
+ if(row.size() == 1)
+ {
+ stddev_row = avg_row;
+ return;
+ }
+
+ stddev_row = 0;
+
+ for(int i = 0; i < row.size(); i++)
+ stddev_row += (pow((row[i] - avg_row),2));
+
+ stddev_row /= (m_point)((m_point)row.size() - 1);
+
+ stddev_row = sqrt(stddev_row);
+
+ return;
+
+}
+
+m_point tfbs::row_stddev_get()
+{
+ return stddev_row;
+}
+
+int tfbs::row_size()
+{
+ return rowsize;
+}
+
+int tfbs::row_actual_size()
+{
+ return (int)row.size();
+}
+
+vector<m_point>::iterator tfbs::row_iter()
+{
+ return row.begin();
+}
+
+int tfbs::get_bin_sum()
+{
+ return bin_sum;
+}
+
+m_point tfbs::bin_prob(int bpos)
+{
+ int psum = 0;
+
+ for(bpos; bpos < (int)bns.size(); bpos++)
+ psum += bns[bpos];
+
+ return (m_point)((m_point)psum/(m_point)bin_sum);
+}
+
+unsigned int tfbs::bin_hm(int bpos)
+{
+ unsigned int psum = 0;
+
+ for(bpos; bpos < (int)bns.size(); bpos++)
+ psum += bns[bpos];
+
+ return psum;
+}
+
+vector<int> tfbs::bin_vector()
+{
+ return bns;
+}
+
+bool tfbs::assign_short_matrix(string s)
+{
+ istringstream str(s);
+
+ str >> NAME >> rowsize >> avg_row;
+
+ if(avg_row < 0 || avg_row > 1)
+ return false;
+
+ str >> stddev_row;
+
+ if(stddev_row < 0 || stddev_row > 1)
+ return false;
+
+ while(str)
+ {
+ if(!str)
+ break;
+
+ int buf;
+
+ str >> buf;
+
+ bns.push_back(buf);
+ }
+
+ bns_sum();
+
+ return true;
+}
+
+void tfbs::matrix_average()
+{
+ mat_avg = new m_point[x_size];
+
+ for(int x = 0; x < x_size; x++)
+ {
+ m_point buf = 0;
+
+ for(int y = 0; y < ALPHABET_SIZE; y++)
+ buf += matrix[x][y];
+
+ mat_avg[x] = buf/ALPHABET_SIZE;
+ }
+
+ return;
+}
+/*
+void tfbs::draw_background()
+{
+ gdImagePtr bg;
+ int WHITE, BLACK, RED, ampl;
+ char *font_1 = "./SUSESansMono-Bold.ttf";
+
+ FILE *bg_out;
+
+ string bg_img_file;
+
+ if(!fasta_matrix_file.empty())
+ {
+ bg_img_file = fasta_matrix_file;
+ bg_img_file += ".";
+ bg_img_file += NAME;
+ }
+ else
+ bg_img_file = AC;
+
+ if(!magic.empty())
+ {
+ bg_img_file += ".";
+
+ bg_img_file += magic;
+ }
+
+ bg_img_file += ".bg.png";
+
+ bg = gdImageCreateTrueColor(1600,50);
+
+ WHITE = gdImageColorAllocate(bg,255,255,255);
+
+ BLACK = gdImageColorAllocate(bg,0,0,0);
+
+ RED = gdImageColorAllocate(bg,255,0,0);
+
+ int std_dev_pix = (int)(row_stddev_get() * 1600);
+ int avg_pix = (int)(row_avg_get() * 1600);
+ int bin_size = (int)(1600 / (1.0 / step));
+
+ char *p0 = "1", *p1 = "0", pavg[6];
+ int brect[8], l;
+
+ ostringstream buf;
+
+ buf << row_avg_get();
+
+ for(l = 0; l < buf.str().size() && l < 5; l++)
+ pavg[l] = buf.str().at(l);
+
+ pavg[l] = '\0';
+
+ for(int i = (1600 - avg_pix - std_dev_pix); i <= 1600 - avg_pix + std_dev_pix; i++)
+ {
+ if(i < 0 || i >= 1600)
+ continue;
+
+ int light = (int)((fabs(1600 - i-avg_pix)/((m_point) std_dev_pix)) * 255.0);
+
+ if(light > 200)
+ light = 200;
+
+ int color = gdImageColorAllocate(bg, light, 255, light);
+
+ gdImageLine(bg, i, 31, i, 49, color);
+ }
+
+ gdImageLine(bg, 1600 - avg_pix, 31, 1600 - avg_pix, 49, RED);
+
+ m_point max_freq = 0;
+
+ for(int i = 0; i < bns.size(); i++)
+ {
+ if(bns.at(i) > max_freq)
+ max_freq = bns.at(i);
+ }
+
+ max_freq /= (m_point)bin_sum;
+
+ ampl = (int)(235.0/max_freq);
+
+ for(int i = 1600; i > 0; i -= bin_size)
+ {
+ int blue = 20 + (int)((m_point)ampl * bin_freq((1600 - i)/bin_size)), j, red = 0;
+ char bn[6];
+
+ ostringstream buf2;
+
+ buf2 << bins[(1600 - i)/bin_size];
+
+ for(j = 0; j < buf2.str().size() && j < 5; j++)
+ bn[j] = buf2.str().at(j);
+
+ bn[j] = '\0';
+
+ if(bin_freq((1600 - i)/bin_size) == 0)
+ {
+ blue = 0;
+ red = 50;
+ }
+
+ if(blue > 255)
+ blue = 255;
+
+ int color = gdImageColorAllocate(bg, red, 0, blue);
+ gdImageFilledRectangle(bg, i - bin_size, 0, i, 30, color);
+
+ gdImageStringFT(NULL, brect, WHITE, font_1, 8, 0, i - bin_size, 20, bn);
+ gdImageStringFT(bg, brect, WHITE, font_1, 8, 0, (i - bin_size) + (bin_size - (brect[2]-brect[0])) /2, 20, bn);
+ }
+
+ gdImageRectangle(bg, 0, 0, 1599, 49,BLACK);
+
+ gdImageStringFT(NULL, brect, BLACK, font_1, 10, 0, (1600 - avg_pix - std_dev_pix), 45, pavg);
+ gdImageStringFT(bg, brect, BLACK, font_1, 10, 0, (1600 - avg_pix - std_dev_pix) + (2*std_dev_pix - (brect[2] - brect[0]))/2, 45, pavg);
+
+ bg_out = fopen(bg_img_file.c_str(),"wb");
+
+ gdImagePng(bg,bg_out);
+
+ fclose(bg_out);
+ gdImageDestroy(bg);
+
+ return;
+}
+*/
+
+m_point tfbs::bin_freq(int pos)
+{
+ return (m_point)bns.at(pos)/(m_point)bin_sum;
+}
+
+//END OF TFBS CLASS
+
+//START OF SEQUENCE CLASS
+
+bool sequence::assign(string s)
+{
+ istringstream str(s),bstr;
+ string line;
+
+ getline(str,line);
+
+ NAME = line;
+
+ bstr.str(NAME);
+
+ bstr >> WRAP_NAME; //RIPRISTINARE
+
+// WRAP_NAME = NAME; //TOGLIERE
+
+ if(WRAP_NAME.find("NM_") != string::npos)
+ ID = WRAP_NAME.substr(WRAP_NAME.find("NM_"));
+ else if(WRAP_NAME.find("NR_") != string::npos)
+ ID = WRAP_NAME.substr(WRAP_NAME.find("NR_"));
+ else if(WRAP_NAME.find("_Y") != string::npos)
+ ID = WRAP_NAME.substr(WRAP_NAME.find("_Y")+1);
+ else
+ ID = WRAP_NAME.substr(1);
+
+ while(getline(str,line))
+ SEQ += line;
+
+ if(BED_OUTPUT)
+ {
+ if(!absolute_position_parser())
+ {
+ CHR = "Unknown";
+ STRAND = '?';
+ ABS_START = 0;
+ ABS_END = (unsigned long int)SEQ.size();
+ }
+
+// cerr << NAME << endl << CHR << '\t' << ABS_START << '\t' << ABS_END << '\t' << STRAND << endl;
+// cout << CHR << '\t' << ABS_START << '\t' << ABS_END << '\t' << "." << '\t' << 1 << '\t' << STRAND << endl;
+ }
+
+ return seq_cleaner();
+}
+
+bool sequence::absolute_position_parser()
+{
+ istringstream str(NAME);
+ string buf;
+
+ if(NAME.find("TRUE") != string::npos)
+ STRAND = '-';
+ else if(NAME.find("FALSE") != string::npos)
+ STRAND = '+';
+ else
+ STRAND = '?';
+
+ str >> buf >> buf;
+
+ if(buf.find("range=") == string::npos)
+ return false;
+ else
+ buf = buf.substr(buf.find("range=")+6);
+
+ for(int i = 0; i < buf.size(); i++)
+ if(buf[i] == ':' || buf[i] == '-')
+ buf[i] = ' ';
+
+ str.str().clear();
+ str.str(buf);
+
+ str >> CHR >> ABS_START >> ABS_END;
+
+ return true;
+}
+
+bool sequence::seq_cleaner()
+{
+ if(SEQ.size() < MIN_SEQ_LENGTH)
+ return false;
+
+ for(int i = 0; i < SEQ.size(); i++)
+ {
+ bool flag = false;
+
+ SEQ[i] = toupper(SEQ[i]);
+
+ for(int a = 0; a < ALPHABET_SIZE; a++)
+ {
+ if(SEQ[i] == ALPHABET[a])
+ flag = true;
+ }
+
+ if(!flag && TRASH_N)
+ return false;
+ else if(!flag && !TRASH_N)
+ SEQ[i] = 'N';
+
+ // if(!flag && (SEQ[i] != 'N' || TRASH_N))
+ // return false;
+ }
+
+ if(SEQ_EDGE_START > 0 && SEQ.size() > SEQ_EDGE_START)
+ {
+ SEQ = SEQ.substr(0, SEQ_EDGE_START);
+ return true;
+ }
+
+ if(SEQ_EDGE_END > 0 && SEQ.size() > SEQ_EDGE_END)
+ {
+ SEQ = SEQ.substr(SEQ.size() - SEQ_EDGE_END);
+ return true;
+ }
+
+ if(SPLIT[1])
+ {
+ if(SEQ.size() < SPLIT[0] + SPLIT[1])
+ return false;
+
+ SEQ = SEQ.substr(SPLIT[0], SPLIT[1]);
+ return true;
+ }
+
+ return true;
+}
+
+string sequence::name()
+{
+ return NAME;
+}
+
+string sequence::seq()
+{
+ return SEQ;
+}
+
+void sequence::scan(vector<tfbs> *TFBS, int Snum, bool q)
+{
+
+// if(VERBOSE)
+ cerr << (char)13 << " "
+ << (char)13 << "Processing " << NAME.substr(0,24) << " (" << Snum+1 << ')';
+
+ if(DOUBLE_STRAND)
+ {
+ for(vector<tfbs>::iterator i = TFBS->begin(); i != TFBS->end(); i++)
+ SCAN(i,q);
+ }
+ else
+ {
+ for(vector<tfbs>::iterator i = TFBS->begin(); i != TFBS->end(); i++)
+ SCAN_SS(i,q);
+ }
+
+ return;
+}
+
+int sequence::char_to_index(char c)
+{
+/* for(int i = 0; i < ALPHABET_SIZE; i++)
+ if(c == ALPHABET[i])
+ return i;*/
+
+ return VI[c];
+}
+
+int sequence::comp_char_to_index(char c)
+{
+ /* for(int i = 0; i < ALPHABET_SIZE; i++)
+ if(c == C_ALPHABET[i])
+ return i;*/
+
+ return rc_VI[c];
+}
+
+void sequence::SCAN(vector<tfbs>::iterator t, bool q)
+{
+ vector<m_point> scores, scores_rc;
+ int sub_length = t->m_length();
+ m_point r_score, maxs, maxs_rc, maxs2, maxs_rc2;
+
+ for(int i = 0; i < SEQ.size() - sub_length + 1; i++)
+ {
+ string sub = SEQ.substr(i,sub_length);
+
+ #ifdef DEBUG6
+ cerr << "\nSub_str = " << sub << " Sub.size = " << sub.size() << " sub_length = " << sub_length;
+ #endif
+
+ if(sub.find("N") == string::npos && !USE_N)
+ {
+ m_point score = 0, score_rc = 0;
+
+ for(int x = 0; x < sub_length; x++)
+ {
+ score += t->get_value(x,char_to_index(sub[x]));
+ score_rc += t->get_value(x,comp_char_to_index(sub[sub_length - 1 - x]));
+ }
+
+ scores.push_back(score);
+ scores_rc.push_back(score_rc);
+ }
+
+ else if(USE_N)
+ {
+ m_point score = 0, score_rc = 0;
+
+ for(int x = 0; x < sub_length; x++)
+ {
+ if(sub[x] != 'N')
+ score += t->get_value(x,char_to_index(sub[x]));
+ else
+ score += t->get_matrix_average(x);
+
+ if(sub[sub_length - 1 - x] != 'N')
+ score_rc += t->get_value(x,comp_char_to_index(sub[sub_length - 1 - x]));
+ else
+ score_rc += t->get_matrix_average(sub_length - 1 - x);
+ }
+
+ scores.push_back(score);
+ scores_rc.push_back(score_rc);
+ }
+
+ #ifdef DEBUG6
+ cerr << " Score = " << score;
+ #endif
+ }
+
+ if(scores.size() == 0 || scores_rc.size() == 0)
+ error_handler(CANT_COMPUTE_SCORES,"");
+
+ if(POSITIONAL && q && distance_analysis_matrix.empty())
+ store_complete_seq_scores(scores.begin(), scores_rc.begin(), (int)scores.size(), t);
+
+ int pos, pos_rc, pos2, pos_rc2;
+
+ if(!USE_SUM)
+ {
+ maxs = max_v(&scores,&pos);
+ maxs_rc = max_v(&scores_rc,&pos_rc);
+
+ if(!distance_analysis_matrix.empty())
+ if(distance_analysis_matrix == t->name())
+ {
+ maxs2 = max_v2(&scores,&pos2);
+ maxs_rc2 = max_v2(&scores,&pos_rc2);
+ }
+ }
+ else
+ {
+ maxs = sum_v(&scores);
+ maxs_rc = sum_v(&scores_rc);
+ }
+
+ if(maxs >= maxs_rc)
+ {
+ max_score.push_back(maxs);
+ max_pos.push_back(pos);
+ max_strand.push_back('+');
+ }
+ else
+ {
+ max_score.push_back(maxs_rc);
+ max_pos.push_back(pos_rc);
+ max_strand.push_back('-');
+ }
+
+ r_score = max_score[max_score.size() - 1];
+
+ r_score -= t->max();
+
+ r_score /= (t->min() - t->max());
+
+ rel_score.push_back(r_score);
+
+ if(!distance_analysis_matrix.empty())
+ if(distance_analysis_matrix == t->name())
+ {
+ m_point r_score2, max2;
+
+ if(maxs2 >= maxs_rc2)
+ {
+ max2 = maxs2;
+ max_pos2.push_back(pos2);
+ }
+ else
+ {
+ max2 = maxs_rc2;
+ max_pos2.push_back(pos_rc2);
+ }
+
+ r_score2 = max2 -= t->max();
+ r_score2 /= (t->min() - t->max());
+ rel_score2.push_back(r_score2);
+ }
+
+ return;
+}
+
+void sequence::SCAN_SS(vector<tfbs>::iterator t, bool q)
+{
+ vector<m_point> scores;//, scores_rc;
+ int sub_length = t->m_length();
+ m_point r_score, maxs, maxs2;//, maxs_rc;
+
+ for(int i = 0; i < SEQ.size() - sub_length + 1; i++)
+ {
+ string sub = SEQ.substr(i,sub_length);
+
+ #ifdef DEBUG6
+ cerr << "\nSub_str = " << sub << " Sub.size = " << sub.size() << " sub_length = " << sub_length;
+ #endif
+
+ if(sub.find("N") == string::npos && !USE_N)
+ {
+ m_point score = 0;//, score_rc = 0;
+
+ for(int x = 0; x < sub_length; x++)
+ {
+ score += t->get_value(x,char_to_index(sub[x]));
+ // score_rc += t->get_value(x,comp_char_to_index(sub[sub_length - 1 - x]));
+ }
+
+ scores.push_back(score);
+ // scores_rc.push_back(score_rc);
+ }
+
+ else if(USE_N)
+ {
+ m_point score = 0;//, score_rc = 0;
+
+ for(int x = 0; x < sub_length; x++)
+ {
+ if(sub[x] != 'N')
+ score += t->get_value(x,char_to_index(sub[x]));
+ else
+ score += t->get_matrix_average(x);
+
+ /* if(sub[sub_length - 1 - x] != 'N')
+ score_rc += t->get_value(x,comp_char_to_index(sub[sub_length - 1 - x]));
+ else
+ score_rc += t->get_matrix_average(sub_length - 1 - x);*/
+ }
+
+ scores.push_back(score);
+ // scores_rc.push_back(score_rc);
+ }
+
+ #ifdef DEBUG6
+ cerr << " Score = " << score;
+ #endif
+ }
+
+ if(scores.size() == 0)// || scores_rc.size() == 0)
+ error_handler(CANT_COMPUTE_SCORES,"");
+
+ if(POSITIONAL && q && distance_analysis_matrix.empty())
+ store_complete_seq_scores_ss(scores.begin(), (int)scores.size(), t);
+
+ int pos, pos2;
+
+ if(!USE_SUM)
+ {
+ maxs = max_v(&scores,&pos);
+// maxs_rc = max_v(&scores_rc,&pos);
+ if(!distance_analysis_matrix.empty())
+ if(distance_analysis_matrix == t->name())
+ maxs2 = max_v2(&scores,&pos2);
+ }
+ else
+ {
+ maxs = sum_v(&scores);
+// maxs_rc = sum_v(&scores_rc);
+ }
+
+// if(maxs >= maxs_rc)
+ max_score.push_back(maxs);
+ max_pos.push_back(pos);
+ max_strand.push_back('+');
+// else
+// max_score.push_back(maxs_rc);
+
+ r_score = max_score[max_score.size() - 1];
+
+ r_score -= t->max();
+
+ r_score /= (t->min() - t->max());
+
+ rel_score.push_back(r_score);
+
+ if(!distance_analysis_matrix.empty())
+ if(distance_analysis_matrix == t->name())
+ {
+ m_point r_score2, max2;
+
+ max2 = maxs2;
+ max_pos2.push_back(pos2);
+
+ r_score2 = max2 -= t->max();
+ r_score2 /= (t->min() - t->max());
+ rel_score2.push_back(r_score2);
+ }
+
+
+ return;
+}
+
+void sequence::store_complete_seq_scores(vector<m_point>::iterator i_scores, vector<m_point>::iterator i_scores_rc, int size, vector<tfbs>::iterator t)
+{
+ vector<m_point> all_r_score;
+ vector<char> strand_r;
+
+ for(int i = 0; i < size; i++)
+ {
+ m_point r_score;
+
+ if(*i_scores >= *i_scores_rc)
+ {
+ r_score = *i_scores;
+ strand_r.push_back('+');
+ }
+ else
+ {
+ r_score = *i_scores_rc;
+ strand_r.push_back('-');
+ }
+
+ r_score -= t->max();
+
+ r_score /= (t->min() - t->max());
+
+ r_score = 1 - r_score;
+
+ all_r_score.push_back(r_score);
+
+ i_scores++;
+ i_scores_rc++;
+ }
+
+ seq_rel_scores.push_back(all_r_score);
+ seq_r_strand.push_back(strand_r);
+
+ return;
+}
+void sequence::store_complete_seq_scores_ss(vector<m_point>::iterator i_scores, int size, vector<tfbs>::iterator t)
+{
+ vector<m_point> all_r_score;
+ vector<char> strand_r;
+
+ for(int i = 0; i < size; i++)
+ {
+ m_point r_score;
+
+// if(*i_scores >= *i_scores_rc)
+// {
+ r_score = *i_scores;
+
+ if(!REVERSE_STRAND)
+ strand_r.push_back('+');
+ else
+ strand_r.push_back('-');
+// }
+// else
+// {
+// r_score = *i_scores_rc;
+// strand_r.push_back('-');
+// }
+
+ r_score -= t->max();
+
+ r_score /= (t->min() - t->max());
+
+ r_score = 1 - r_score;
+
+ all_r_score.push_back(r_score);
+
+ i_scores++;
+// i_scores_rc++;
+ }
+
+ seq_rel_scores.push_back(all_r_score);
+ seq_r_strand.push_back(strand_r);
+
+ return;
+}
+
+m_point sequence::max_v(vector<m_point> *scores,int *pos)
+{
+ m_point max = -INF;
+
+ int p = 0;
+
+ for(vector<m_point>::iterator i = scores->begin(); i < scores->end(); i++)
+ {
+ if(*i > max)
+ {
+ max = *i;
+ *pos = p;
+ }
+
+ p++;
+ }
+
+ return max;
+}
+
+m_point sequence::max_v2(vector<m_point> *scores,int *pos)
+{
+ m_point max = -INF;
+ vector<m_point> fscores = *scores;
+
+ int p = 0;
+
+ for(int i = 0; i < fscores.size(); i++)
+ {
+ if(fscores.at(i) > max)
+ {
+ max = fscores.at(i);
+ *pos = i;
+ }
+ }
+
+ fscores[*pos] = -INF;
+ max = -INF;
+
+ for(int i = 0; i < fscores.size(); i++)
+ {
+ if(fscores.at(i) > max)
+ {
+ max = fscores.at(i);
+ *pos = i;
+ }
+ }
+
+ return max;
+}
+
+m_point sequence::sum_v(vector<m_point> *scores)
+{
+ m_point sum = 0;
+
+ for(vector<m_point>::iterator i = scores->begin(); i < scores->end(); i++)
+ sum += *i;
+
+ sum /= (int)scores->size();
+
+ return sum;
+}
+
+m_point sequence::max(int i)
+{
+ return max_score[i];
+}
+
+m_point sequence::rel(int i)
+{
+ return rel_score[i];
+}
+
+m_point sequence::rel2(int i)
+{
+ return rel_score2.at(0);
+}
+
+int sequence::mpos(int i)
+{
+ return max_pos[i];
+}
+
+int sequence::mpos2(int i)
+{
+ return max_pos2.at(0);
+}
+
+char sequence::mstrand(int i)
+{
+ return max_strand[i];
+}
+
+string sequence::w_name()
+{
+ return WRAP_NAME;
+}
+
+string sequence::get_id()
+{
+ return ID;
+}
+
+void sequence::reverse()
+{
+ string r_seq;
+
+ for(int i = SEQ.size() - 1; i >= 0; i--)
+ r_seq.push_back(C_ALPHABET[char_to_index(SEQ[i])]);
+
+ SEQ = r_seq;
+
+ return;
+}
+
+m_point sequence::get_seq_rel_score_at_pos(int a, int b)
+{
+ return seq_rel_scores[a][b];
+}
+
+m_point sequence::get_seq_rel_max_pos(int a, int *pos, char *strand)
+{
+ m_point max;
+ max = max_v(&seq_rel_scores[a],pos);
+
+ *strand = seq_r_strand[a][*pos];
+
+ return max;
+}
+
+/*void sequence::set_seq_rel_max_pos()
+{
+ m_point max;
+ int pos;
+
+ max = max_v(&seq_rel_scores.back(),&pos);
+
+ vmaxpos.push_back(pos);
+ vmaxstrand.push_back(seq_r_strand.back()[pos]);
+
+ seq_rel_scores.clear();
+ seq_r_strand.clear();
+
+ return;
+}
+
+m_point sequence::get_seq_rel_max_pos_2(int a, int *pos, char *strand)
+{
+ *pos = max_pos[a];
+ *strand = max_strand[a];
+
+ return rel_score[a];
+}
+*/
+unsigned long int sequence::get_abs_start()
+{
+ return ABS_START;
+}
+
+unsigned long int sequence::get_abs_end()
+{
+ return ABS_END;
+}
+
+string sequence::get_chr()
+{
+ return CHR;
+}
+
+char sequence::get_strand()
+{
+ return STRAND;
+}
+
+//END OF SEQUENCE CLASS
+
+//START OF QUERY CLASS
+
+void query::assign(vector<sequence> q, vector<tfbs> t)
+{
+ QUERY = q;
+ QTFBS = t;
+
+ bigst_seq_size = 0;
+
+ for(int a = 0; a < QUERY.size(); a++)
+ {
+ if(bigst_seq_size < QUERY.at(a).seq().size())
+ bigst_seq_size = QUERY.at(a).seq().size();
+ }
+
+ return;
+}
+
+void query::scan()
+{
+ for(int i = 0; i < (int)QUERY.size(); i++)
+ QUERY[i].scan(&QTFBS,i,true);
+
+ generate_tfbs_rows(&QTFBS,&QUERY, true);
+
+ return;
+}
+
+void query::distance_analysis()
+{
+ dam = -1;
+
+ for(int a = 0; a < QTFBS.size(); a++)
+ {
+ if(QTFBS.at(a).name() == distance_analysis_matrix)
+ {
+ dam = a;
+ break;
+ }
+ }
+
+ if(dam == -1)
+ error_handler(NO_MATRIX_FOR_DISTANCE_ANALYSIS, distance_analysis_matrix);
+
+ vector<m_point> Scores, S_Scores;
+ vector<int> Pos,Bpos;
+ int dabs = 5;
+
+ for(int i = 0; i < QUERY.size(); i++)
+ {
+ int pos;
+ m_point srmp;
+// m_point srmp = QUERY.at(i).get_seq_rel_max_pos(dam, &pos, &strand);
+ srmp = 1 - QUERY.at(i).rel(dam);
+ pos = QUERY.at(i).mpos(dam);
+
+ Scores.push_back(srmp);
+ Pos.push_back(pos);
+ }
+
+ S_Scores = Scores;
+
+ for(int i = 0; i < (int)Scores.size(); i++)
+ {
+ m_point max = -INF;
+ int bpos;
+
+ for(int j = 0; j < (int)Scores.size(); j++)
+ {
+ if(Scores[j] > max)
+ {
+ max = Scores[j];
+ bpos = j;
+ }
+ }
+
+ Scores[bpos] = -INF;
+ Bpos.push_back(bpos);
+ }
+
+ for(int a = 0; a < QTFBS.size(); a++)
+ {
+ vector<m_point> TScores, S_TScores, TPos, TBpos;
+ vector<unsigned int> H_C((bigst_seq_size * 2)/dabs, 0), P_C((bigst_seq_size * 2)/dabs, 0);
+
+ for(int i = 0; i < Bpos.size() * ((float)DA_TOP/100.0); i++)
+ {
+ int pos;
+ m_point srmp;
+// m_point srmp = QUERY.at(Bpos.at(i)).get_seq_rel_max_pos(a, &pos, &strand);
+
+ if(a != dam)
+ {
+ srmp = 1 - QUERY.at(Bpos.at(i)).rel(a);
+ pos = QUERY.at(Bpos.at(i)).mpos(a);
+ }
+ else
+ {
+ srmp = 1 - QUERY.at(Bpos.at(i)).rel2(a);
+ pos = QUERY.at(Bpos.at(i)).mpos2(a);
+ }
+
+ TScores.push_back(srmp);
+ TPos.push_back(pos);
+ }
+
+ S_TScores = TScores;
+
+ for(int b = 0; b < (int)TScores.size(); b++)
+ {
+ m_point max = -INF;
+ int bpos;
+
+ for(int j = 0; j < (int)TScores.size(); j++)
+ {
+ if(TScores[j] > max)
+ {
+ max = TScores[j];
+ bpos = j;
+ }
+ }
+
+ TScores[bpos] = -INF;
+ TBpos.push_back(bpos);
+ }
+
+ cout << endl << "*** " << QTFBS.at(a).name() << " *** (" << QTFBS.at(a).id() << ")" << endl;
+
+/* for(int c = 0; c < TBpos.size(); c++)
+ {
+ cout << QUERY.at(Bpos.at(TBpos.at(c))).w_name() << '\t' << S_Scores.at(Bpos.at(TBpos.at(c))) << '\t' << S_TScores.at(TBpos.at(c)) << '\t'
+ << Pos.at(Bpos.at(TBpos.at(c))) << '\t' << TPos.at(TBpos.at(c)) << endl;
+ }
+*/
+ for(int c = 0; c < TBpos.size() * ((float)DA_TOP/100.0); c++)
+ {
+ int rpos = TPos.at(TBpos.at(c)) - Pos.at(Bpos.at(TBpos.at(c)));
+
+ H_C[(bigst_seq_size + rpos)/dabs]++;
+
+ for(int d = 0; d < QUERY.at(Bpos.at(TBpos.at(c))).seq().size(); d++)
+ {
+ rpos = d - Pos.at(Bpos.at(TBpos.at(c)));
+
+ P_C[(bigst_seq_size + rpos)/dabs]++;
+ }
+ }
+
+ int outpos = -bigst_seq_size;
+ m_point nk = 0, nt = 0, p;
+
+ for(int c = 0; c < H_C.size(); c++)
+ {
+ nk += H_C.at(c);
+ nt += P_C.at(c);
+ }
+
+ p = nk/nt;
+
+ for(int c = 0; c < H_C.size(); c++)
+ {
+ m_point pvalue = gsl_cdf_binomial_Q ((unsigned int)H_C[c], p, (unsigned int) P_C[c]);
+
+ if(pvalue <= DISTANCE_CUTOFF && P_C[c] > 0)
+ cout << outpos << '\t' << H_C[c] << '\t' << P_C[c] //<< '\t' << gsl_ran_binomial_pdf ((unsigned int)H_C[c], p, (unsigned int) P_C[c])
+ << '\t' << pvalue << endl;
+
+ outpos += dabs;
+ }
+ }
+
+ return;
+}
+
+char query::flip_strand(char c)
+{
+ if(c == '+')
+ return '-';
+ else if(c == '-')
+ return '+';
+
+ return c;
+}
+
+void query::t_stud_calc(vector<tfbs> *TFBS)
+{
+ int count = 0;
+
+ for(vector<tfbs>::iterator i = TFBS->begin(); i != TFBS->end(); i++)
+ {
+ m_point t = (QTFBS[count].row_avg_get() - i->row_avg_get());
+
+ t /= t_den_calc(i->row_size(), i->row_stddev_get(), QTFBS[count].row_size(), QTFBS[count].row_stddev_get());
+
+ count++;
+ ts.push_back(t);
+ }
+
+ return;
+}
+
+void query::z_test(vector<tfbs> *TFBS)
+{
+ int count = 0;
+
+ for(vector<tfbs>::iterator i = TFBS->begin(); i != TFBS->end(); i++)
+ {
+ m_point se = i->row_stddev_get(), z;
+
+ se /= sqrt(QTFBS[count].row_size());
+
+ z = QTFBS[count].row_avg_get() - i->row_avg_get();
+
+ z /= se;
+
+ count++;
+ zt.push_back(z);
+ }
+
+ return;
+}
+
+void query::b_test(vector<tfbs> *TFBS)
+{
+ int count = 0;
+
+ for(vector<tfbs>::iterator i = TFBS->begin(); i != TFBS->end(); i++)
+ {
+ m_point num,den;
+
+ num = QUERY.size() * pow((QTFBS[count].row_avg_get() - i->row_avg_get()),2);
+
+ den = (2 * pow(QTFBS[count].row_stddev_get(),2)) + (2/3)*(QTFBS[count].row_avg_get() - i->row_avg_get());
+
+ bt.push_back(exp(-num/den));
+
+ count++;
+ }
+
+ return;
+}
+
+m_point query::t_den_calc(int N1, m_point s1, int N2, m_point s2)
+{
+ m_point num,den,buf;
+
+ num = ((N1 - 1) * pow(s1,2));
+ num += ((N2 - 1) * pow(s2,2));
+
+ den = N1 + N2 - 2;
+
+ buf = num/den;
+
+ den = (((m_point)1/N1) + ((m_point)1/N2));
+
+ buf *= den;
+
+ return sqrt(buf);
+}
+
+void query::output(int df,vector<tfbs> *TFBS)
+{
+ string outfile;
+
+ if(!q_idlist.empty())
+ outfile = q_idlist;
+ else
+ outfile = queryfile;
+
+ if(!POSITIONAL)
+ outfile += ".res";
+ else
+ outfile += ".ris";
+
+ vector<m_point> buf_ts;
+
+ if(!POSITIONAL && NO_BACKGROUND)
+ return;
+
+ if(!idfile.empty())
+ buf_ts = ts;
+ else
+ buf_ts = zt;
+/* buf_ts = bt;
+
+ for(int i = 0; i < buf_ts.size(); i++) //ELIMINA IL VALORE DI DISEGUAGLIANZE X ZSCORE < 0 .... POI DA LEVARE!!!
+ if(ts[i] < 0)
+ buf_ts[i] = 1;
+ bt = buf_ts; //FINE
+*/
+ df += (int)QTFBS[0].row_size() - 2;
+
+ cerr << "\nWriting query output...";
+
+ ofstream out(outfile.c_str());
+
+ if(!out)
+ error_handler(CANTOPENOUTPUT, outfile);
+
+ if(!POSITIONAL || (int)promfile.size() != 0 || matrixfile.size() != 0)
+ {
+ if(!VERBOSE && !NO_HEADER)
+ out << "TF_NAME\tMATRIX_ID\tZ_SCORE\tP_VALUE\tSAMPLE_AVERAGE\tBACKGROUND_AVERAGE\n";
+
+ for(int i = 0; i < (int)QTFBS.size(); i++)
+ {
+ int m_pos = max_v_pos(buf_ts);
+// int m_pos = min_v_pos(buf_ts);
+
+ if(VERBOSE)
+ out << QTFBS[m_pos].id() << '\t' << QTFBS[m_pos].matrix_id() << '\t' << ts[m_pos] << '\t'
+ << gsl_cdf_tdist_Q (ts[m_pos], (m_point)df)
+ << '\t' << min_binom_bins[m_pos] << " (" << seq_bin_num[m_pos] << ' ' << bins[soil_pos[m_pos]] << ")"
+ << '\t' << gsl_cdf_ugaussian_Q(zt[m_pos]) << '\t' << bt[m_pos] << '\t' << QTFBS[m_pos].row_avg_get();
+ else
+ out << QTFBS[m_pos].id() << '\t' << QTFBS[m_pos].matrix_id() << '\t' << ts[m_pos] << '\t'
+ /*<< gsl_cdf_tdist_Q (ts[m_pos], (m_point)df)
+ << '\t' << min_binom_bins[m_pos] << " (" << seq_bin_num[m_pos] << ' ' << bins[soil_pos[m_pos]] << ")"
+ << '\t'*/ << gsl_cdf_ugaussian_Q(zt[m_pos]) << '\t' /*<< bt[m_pos] << '\t'*/ << QTFBS[m_pos].row_avg_get()
+ << '\t' << TFBS->at(m_pos).row_avg_get();
+
+ if(SHOW_BINS_BINOM)
+ out << "\tbins binom\t" << binom_bins_line(m_pos) << endl;
+ else
+ out << endl;
+
+ buf_ts[m_pos] = -INF;
+// buf_ts[m_pos] = INF;
+ }
+
+ out << "\nDegrees of freedom: " << df << endl;
+ }
+
+ if(POSITIONAL && distance_analysis_matrix.empty())
+ out << positional_output() << endl;
+ else if(POSITIONAL && !distance_analysis_matrix.empty())
+ {
+ distance_analysis();
+ }
+
+
+ cerr << "done\n";
+
+ out.close();
+
+ return;
+}
+
+void query::bins_stuff(vector<tfbs> *TFBS)
+{
+ vector<tfbs>::iterator iv = TFBS->begin();
+
+ for(int i = 0; i < (int)QTFBS.size(); i++)
+ {
+ vector<m_point> bino_b;
+ vector<int> lbins = QTFBS[i].bin_vector(), soil;
+ unsigned int count = 0, snum;
+
+ if(VERBOSE)
+ cerr << "\nbins_stuff for " << QTFBS[i].name();
+
+ for(int j = (int)lbins.size() - 1; j >= 0; j--)
+ {
+ for(int z = 0; z < lbins[j]; z++)
+ {
+ count++;
+ bino_b.push_back(
+ gsl_cdf_hypergeometric_Q(count - 1, (unsigned int) QUERY.size(),
+ (unsigned int)iv->row_size() - (unsigned int)QUERY.size(),
+ iv->bin_hm(j)));
+ soil.push_back(j);
+ }
+ }
+
+ binom_bins.push_back(bino_b);
+ snum = min_v_pos(bino_b) + 1;
+ min_binom_bins.push_back(bino_b[snum - 1]);
+ soil_pos.push_back(soil[snum-1]);
+ seq_bin_num.push_back(snum);
+ iv++;
+ }
+
+ return;
+}
+
+string query::binom_bins_line(int pos)
+{
+ ostringstream str;
+
+ for(int i = 0; i < (int)binom_bins[pos].size(); i++)
+ str << binom_bins[pos][i] << '\t';
+
+ return str.str();
+}
+
+int query::max_v_pos(vector<m_point> scores)
+{
+ m_point max = -INF;
+ int pos = 0;
+
+ for(int i = 0; i < scores.size(); i++)
+ if(scores[i] > max)
+ {
+ max = scores[i];
+ pos = i;
+ }
+
+ return pos;
+}
+
+int query::min_v_pos(vector<m_point> scores)
+{
+ m_point min = +INF;
+ int pos = 0;
+
+ for(int i = 0; i < scores.size(); i++)
+ if(scores[i] < min)
+ {
+ min = scores[i];
+ pos = i;
+ }
+
+ return pos;
+}
+
+/*void query::correlation(vector<tfbs> *TFBS)
+{
+ string outfile;
+ vector<tfbs>::iterator iv = TFBS->begin();
+
+ cerr << "\nDoing correlations...";
+
+ if(QTFBS.size() > 0) // CHOOSING FILENAME
+ {
+ outfile = queryfile;
+ outfile += ".crl";
+ }
+ else
+ {
+ if(promfile.size() > 0)
+ {
+ outfile = promfile;
+ outfile += ".crl";
+ }
+ else if(promfile.size() == 0 && matrixfile.size() > 0)
+ {
+ if(iv->row_actual_size() == 0)
+ error_handler(NEED_COMPLETE_MATRIX,"");
+ outfile = matrixfile;
+ outfile += ".crl";
+ }
+ else
+ outfile = "pscan.crl";
+ } //END OF CHOOSING FILENAME
+
+ build_correlation_matrix(TFBS);
+
+ if(COMPLETE_CORRELATION_MATRIX)
+ correlation_matrix_output(TFBS, outfile);
+ else
+ correlation_output(TFBS,outfile);
+
+ correlation_output(TFBS, outfile);
+
+ cerr << "done\n";
+
+ return;
+}
+
+void query::correlation_output(vector<tfbs> *TFBS, string outfile)
+{
+ vector<vector <int> > ascending, descending;
+ m_point **cm_copy;
+ int vnum = 10;
+
+ cm_copy = new m_point*[TFBS->size()];
+
+ for(int i = 0; i < TFBS->size(); i++)
+ cm_copy[i] = new m_point[TFBS->size()];
+
+ correlation_matrix_copy(cm_copy, (int)TFBS->size());
+
+ for(int x = 0; x < TFBS->size(); x++)
+ descending.push_back(matrix_line_max_pos(cm_copy[x], TFBS->size(), x));
+
+ correlation_matrix_copy(cm_copy, (int)TFBS->size());
+
+ for(int x = 0; x < TFBS->size(); x++)
+ ascending.push_back(matrix_line_min_pos(cm_copy[x], TFBS->size(), x));
+
+ for(int i = 0; i < TFBS->size(); i++)
+ delete[] cm_copy[i];
+
+ delete[] cm_copy;
+
+ ofstream out(outfile.c_str());
+ vector<tfbs>::iterator iv = TFBS->begin();
+
+ if(TFBS->size() < vnum)
+ vnum = TFBS->size();
+
+ for(int i = 0; i < TFBS->size(); i++)
+ {
+ out << "***Min for " << iv->name() << " :";
+ {
+ for(int f = 0; f < vnum; f++)
+ {
+ vector<tfbs>::iterator fv = TFBS->begin();
+
+ for(int a = 0; a < ascending[i][f]; a++)
+ fv++;
+
+ out << endl << fv->name() << '\t' << correlation_matrix[i][ascending[i][f]];
+ }
+ }
+
+ out << endl << "\n***Max for " << iv->name() << " :";
+ {
+ for(int f = 0; f < vnum; f++)
+ {
+ vector<tfbs>::iterator fv = TFBS->begin();
+
+ for(int a = 0; a < descending[i][f]; a++)
+ fv++;
+
+ out << endl << fv->name() << '\t' << correlation_matrix[i][descending[i][f]];
+ }
+ }
+
+ out << endl << endl;
+
+ iv++;
+ }
+
+ return;
+}*/
+
+vector<int> query::matrix_line_max_pos(m_point* line, int size, int p)
+{
+ vector<int> desc;
+
+ for(int h = 0; h < size; h++)
+ {
+ int x = 0;
+ m_point max = -INF;
+
+ for(int i = 0; i < size; i++)
+ {
+ if(i != p)
+ {
+ if(line[i] > max)
+ {
+ max = line[i];
+ x = i;
+ }
+ }
+ }
+
+ line[x] = -INF;
+ desc.push_back(x);
+ }
+
+ return desc;
+}
+
+vector<int> query::matrix_line_min_pos(m_point* line, int size, int p)
+{
+ vector<int> asc;
+
+ for(int h = 0; h < size; h++)
+ {
+ m_point min = INF;
+ int x = 0;
+
+ for(int i = 0; i < size; i++)
+ {
+ if(i != p)
+ {
+ if(line[i] < min)
+ {
+ min = line[i];
+ x = i;
+ }
+ }
+ }
+
+ line[x] = INF;
+ asc.push_back(x);
+ }
+
+ return asc;
+}
+
+/*void query::correlation_matrix_copy(m_point **cm_copy, int size)
+{
+ for(int x = 0; x < size; x++)
+ for(int y = 0; y < size; y++)
+ cm_copy[x][y] = correlation_matrix[x][y];
+
+ return;
+}
+
+void query::correlation_matrix_output(vector<tfbs> *TFBS, string outfile)
+{
+ ofstream out(outfile.c_str());
+
+ vector<tfbs>::iterator iv = TFBS->begin();
+
+ out << '\t';
+
+ for(int i = 0; i < TFBS->size(); i++)
+ {
+ out << iv->name() << '\t';
+ iv++;
+ }
+
+ out << endl;
+
+ iv = TFBS->begin();
+
+ for(int i = 0; i < TFBS->size(); i++)
+ {
+ out << iv->name() << '\t';
+
+ for(int h = 0; h < TFBS->size(); h++)
+ out << correlation_matrix[i][h] << '\t';
+
+ out << endl;
+ iv++;
+ }
+
+ return;
+}
+
+void query::build_correlation_matrix(vector<tfbs> *TFBS)
+{
+ correlation_matrix = new m_point*[TFBS->size()];
+
+ for(int i = 0; i < TFBS->size(); i++)
+ correlation_matrix[i] = new m_point[TFBS->size()];
+
+ if(QTFBS.size() > 0)
+ {
+ vector<tfbs>::iterator iv_a = TFBS->begin();
+
+ for(int a = 0; a < QTFBS.size(); a++)
+ {
+ vector<tfbs>::iterator iv_b = TFBS->begin();
+
+ for(int b = 0; b < QTFBS.size(); b++)
+ {
+ vector<m_point>::iterator row_1_i = QTFBS[a].row_iter();
+ vector<m_point>::iterator row_2_i = QTFBS[b].row_iter();
+ m_point sum = 0;
+
+ for(int c = 0; c < QTFBS[b].row_actual_size(); c++)
+ {
+ m_point num, den;
+
+ num = ((*row_1_i + *row_2_i) - (iv_a->row_avg_get() + iv_b->row_avg_get()));
+ num = pow(num,2);
+
+ den = pow(iv_a->row_stddev_get(),2) + pow(iv_b->row_stddev_get(),2);
+
+ if(den == 0)
+ cerr << endl << iv_a->row_stddev_get() << '\t' << iv_b->row_stddev_get() << endl;
+
+ // cerr << "\nnum = " << num << '\t' << "den = " << den << endl;
+
+ sum += num/den;
+
+ row_1_i++;
+ row_2_i++;
+ }
+
+ // cerr << "\n sum = " << sum << " dof = " << (double)QTFBS[b].row_actual_size() << endl;
+ correlation_matrix[a][b] = gsl_cdf_chisq_Q(sum,(double)QTFBS[b].row_actual_size());
+ iv_b++;
+ }
+
+ iv_a++;
+ }
+ }
+
+ else
+ {
+ vector<tfbs>::iterator iv_a = TFBS->begin();
+
+ for(int a = 0; a < TFBS->size(); a++)
+ {
+ vector<tfbs>::iterator iv_b = TFBS->begin();
+
+ for(int b = 0; b < TFBS->size(); b++)
+ {
+ vector<m_point>::iterator row_1_i = iv_a->row_iter();
+ vector<m_point>::iterator row_2_i = iv_b->row_iter();
+ m_point sum = 0;
+
+ for(int c = 0; c < iv_b->row_actual_size(); c++)
+ {
+ m_point num, den;
+
+ num = ((*row_1_i + *row_2_i) - (iv_a->row_avg_get() + iv_b->row_avg_get()));
+ num = pow(num,2);
+
+ den = pow(iv_a->row_stddev_get(),2) + pow(iv_b->row_stddev_get(),2);
+
+ sum += num/den;
+
+ row_1_i++;
+ row_2_i++;
+ }
+
+ correlation_matrix[a][b] = gsl_cdf_chisq_Q(sum,(double)iv_b->row_actual_size());
+ iv_b++;
+ }
+
+ iv_a++;
+ }
+ }
+
+ return;
+}*/
+
+string query::positional_output()
+{
+ ostringstream str;
+
+ str << endl;
+
+ if(BED_OUTPUT)
+ cout << "\ntrack name=\"" << queryfile << "\" useScore=1" << endl << endl;
+
+ for(int h = 0; h < QTFBS.size(); h++)
+ {
+ str << QTFBS[h].name() << endl << endl;
+
+ vector<m_point> Scores, bufScores;
+ vector<int> Pos,Bpos;
+ vector<char> Strand;
+
+ for(int i = 0; i < QUERY.size(); i++)
+ {
+ int pos;
+ char strand;
+ m_point srmp = QUERY[i].get_seq_rel_max_pos(h, &pos, &strand);
+
+ Scores.push_back(srmp);
+ Pos.push_back(pos);
+ Strand.push_back(strand);
+ }
+
+// if(IMG_OUTPUT)
+// draw_distr_graph(&Scores, &Pos, h, QTFBS[h].ac());
+
+ bufScores = Scores;
+
+ for(int i = 0; i < (int)Scores.size(); i++)
+ {
+ m_point max = -INF;
+ int bpos;
+
+ for(int j = 0; j < (int)Scores.size(); j++)
+ if(Scores[j] > max)
+ {
+ max = Scores[j];
+ bpos = j;
+ }
+
+
+ Scores[bpos] = -INF;
+ Bpos.push_back(bpos);
+ }
+
+ for(int i = 0; i < Bpos.size(); i++)
+ {
+ str << QUERY[Bpos[i]].w_name() << '\t' << bufScores[Bpos[i]] << '\t';
+
+ if(!REVERSE_STRAND)
+ str << Pos[Bpos[i]] - (int)QUERY[Bpos[i]].seq().size() + abs(TSS_POS);
+ else
+ str << -Pos[Bpos[i]] - QTFBS[h].m_length() + abs(TSS_POS);
+
+ if(BED_OUTPUT)
+ {
+ cout << QUERY[Bpos[i]].get_chr() << '\t';
+
+ if(QUERY[Bpos[i]].get_strand() != '-')
+ {
+ cout << QUERY[Bpos[i]].get_abs_start() + Pos[Bpos[i]] << '\t'
+ << QUERY[Bpos[i]].get_abs_start() + Pos[Bpos[i]] + QTFBS[h].m_length() - 1 << '\t';
+ }
+ else
+ {
+ cout << QUERY[Bpos[i]].get_abs_end() - Pos[Bpos[i]] - QTFBS[h].m_length() + 1 << '\t'
+ << QUERY[Bpos[i]].get_abs_end() - Pos[Bpos[i]] << '\t';
+ }
+
+ cout << revcomp(QUERY[Bpos[i]].seq().substr(Pos[Bpos[i]],QTFBS[h].m_length()),Strand[Bpos[i]]) << '\t'
+ << bufScores[Bpos[i]]*1000 << '\t';
+
+ if(QUERY[Bpos[i]].get_strand() == Strand[Bpos[i]])
+ cout << Strand[Bpos[i]] << endl;
+ else
+ cout << flip_strand(Strand[Bpos[i]]) << endl;
+ }
+
+ str << '\t'
+ << revcomp(QUERY[Bpos[i]].seq().substr(Pos[Bpos[i]],QTFBS[h].m_length()),Strand[Bpos[i]])
+ << '\t'
+ << Strand[Bpos[i]] << endl;
+ }
+ }
+
+ return str.str();
+}
+/*
+void query::draw_distr_graph(vector<m_point> *Score, vector<int> *Pos, int tf_index, string tname)
+{
+
+ int bgt = -1;
+
+ vector<tfbs> T;
+
+ if(!bg_for_img.empty() && (!matrixlist.empty() || !fasta_matrix_file.empty()))
+ {
+
+ usethismatrix.clear();
+ usethismatrixname.clear();
+
+ T = v_tfbs_builder(get_file_names(matrix_files));
+
+ ifstream pin(bg_for_img.c_str());
+
+ if(pin)
+ {
+ string line;
+ getline(pin,line);
+ generate_tfbs_rows_from_short_matrix(&T,&pin);
+ }
+
+ pin.close();
+
+ for(int i = 0; i < T.size(); i++)
+ if(T[i].ac() == tname)
+ {
+ bgt = i;
+ break;
+ }
+ }
+
+ gdImagePtr img_score_dist, img_pos_dist;
+ int WHITE, BLACK, BLUE, GREEN;
+ char *font_1 = "./SUSESansMono-Bold.ttf";
+ int max_qsize = 0;
+ int brect[8];
+
+ for(int a = 0; a < QUERY.size(); a++)
+ if(QUERY.at(a).seq().size() > max_qsize)
+ max_qsize = QUERY.at(a).seq().size();
+
+ FILE *score_out, *pos_out;
+
+ string img_score_file, img_pos_file;
+
+ img_score_file = tname;
+ img_score_file += ".";
+
+ img_pos_file = img_score_file;
+
+ if(!q_idlist.empty())
+ {
+ img_score_file += q_idlist;
+ img_pos_file += q_idlist;
+ }
+ else
+ {
+ img_score_file += queryfile;
+ img_pos_file += queryfile;
+ }
+
+ img_score_file += ".score.png";
+ img_pos_file += ".pos.png";
+
+ img_score_dist = gdImageCreateTrueColor(1600,50);
+ img_pos_dist = gdImageCreateTrueColor(max_qsize + 50, 50);
+
+ WHITE = gdImageColorAllocate(img_score_dist,255,255,255);
+ gdImageColorAllocate(img_pos_dist,255,255,255);
+
+ BLACK = gdImageColorAllocate(img_score_dist,0,0,0);
+ gdImageColorAllocate(img_pos_dist,0,0,0);
+
+ BLUE = gdImageColorAllocate(img_score_dist,0,0,255);
+ gdImageColorAllocate(img_pos_dist,0,0,255);
+
+ GREEN = gdImageColorAllocate(img_score_dist,0,255,0);
+ gdImageColorAllocate(img_pos_dist,0,255,0);
+
+ int std_dev_pix = (int)(QTFBS.at(tf_index).row_stddev_get() * 1600);
+ int avg_pix = (int)(QTFBS.at(tf_index).row_avg_get() * 1600);
+
+ gdImageLine(img_score_dist, 1600 - avg_pix, 31, 1600 - avg_pix, 49, GREEN);
+
+ for(int i = 0; i < Score->size(); i++)
+ {
+ int x_pos = (int)(1600*Score->at(i)), green = 0, red, blue = 0;
+
+ if(bgt == -1)
+ red = (int)(((Score->at(i)-QTFBS.at(tf_index).row_avg_get())/QTFBS.at(tf_index).row_stddev_get()) * 100.0);
+ else
+ red = (int)(((Score->at(i)-T.at(bgt).row_avg_get())/T.at(bgt).row_stddev_get()) * 100.0);
+
+ if(red > 255)
+ red = 255;
+
+ if(red < 0)
+ {
+ green = -red;
+ red = 0;
+ }
+
+ if(green > 255)
+ green = 255;
+
+ if(green <= 100 && red <= 100)
+ {
+ green = 150;
+ red = 150;
+ }
+
+ int color = gdImageColorAllocate(img_pos_dist,red,green,blue);
+ int color2 = gdImageColorAllocate(img_score_dist,red,green,blue);
+
+ gdImageLine(img_score_dist, 1600 - x_pos, 0, 1600 - x_pos, 30, color2);
+
+ if(bgt == -1)
+ {
+ if(Score->at(i) >= QTFBS.at(tf_index).row_avg_get())
+ gdImageLine(img_pos_dist, 5 + max_qsize - Pos->at(i), 0, 5 + max_qsize - Pos->at(i), 30, color);
+ }
+ else
+ {
+ if(Score->at(i) >= T.at(bgt).row_avg_get())
+ gdImageLine(img_pos_dist, 5 + max_qsize - Pos->at(i), 0, 5 + max_qsize - Pos->at(i), 30, color);
+ }
+ }
+
+ for(int i = TSS_POS + 5; i <= TSS_POS + max_qsize + 5; i += 50)
+ {
+ gdImageLine(img_pos_dist, i+abs(TSS_POS), 32, i+abs(TSS_POS), 49, WHITE);
+
+ char pi[5];
+ int f;
+
+ ostringstream buf3;
+
+ buf3 << -(i - 5);
+
+
+ for(f = 0; f < buf3.str().size() && f < 5; f++)
+ pi[f] = buf3.str().at(f);
+
+ pi[f] = '\0';
+
+ gdImageStringFT(img_pos_dist, brect, WHITE, font_1, 8, 0, i+2 + abs(TSS_POS), 42, pi);
+ }
+
+ gdImageRectangle(img_pos_dist, 0, 0, max_qsize + 50, 49, BLACK);
+
+ char *p0 = "1", *p1 = "0", pavg[5];
+
+ ostringstream buf;
+
+ buf << QTFBS.at(tf_index).row_avg_get();
+
+ int h;
+
+ for(h = 0; h < 4 && h < buf.str().size(); h++)
+ pavg[h] = buf.str().at(h);
+
+ pavg[h] = '\0';
+
+ if(QTFBS.at(tf_index).row_avg_get() < 0.99)
+ gdImageStringFT(img_score_dist, brect, WHITE, font_1, 10, 0, 2, 45, p0);
+
+ if(QTFBS.at(tf_index).row_avg_get() > 0.1)
+ gdImageStringFT(img_score_dist, brect, WHITE, font_1, 10, 0, 1590, 45, p1);
+
+ gdImageStringFT(img_score_dist, brect, WHITE, font_1, 10, 0, 1600 - avg_pix + 10, 45, pavg);
+
+ score_out = fopen(img_score_file.c_str(),"wb");
+ pos_out = fopen(img_pos_file.c_str(),"wb");
+
+ gdImagePng(img_score_dist,score_out);
+ gdImagePng(img_pos_dist, pos_out);
+
+ fclose(score_out);
+ fclose(pos_out);
+
+ gdImageDestroy(img_score_dist);
+ gdImageDestroy(img_pos_dist);
+
+ return;
+}
+*/
+
+char query::comp_char(char c)
+{
+ for(int i = 0; i < ALPHABET_SIZE; i++)
+ if(c == C_ALPHABET[i])
+ return ALPHABET[i];
+}
+
+
+string query::revcomp(string oligo, char strand)
+{
+ if(strand == '+' || REVERSE_STRAND)
+ return oligo;
+
+ string rc_oligo;
+
+ for(int i = (int)oligo.size() - 1; i >= 0; i--)
+ rc_oligo.push_back(comp_char(oligo[i]));
+
+ return rc_oligo;
+}
+
+void query::pearson_corr()
+{
+ if(!PEARSON || !QUERY.size() || QTFBS.size() <= 1)
+ return;
+
+ string ofile = queryfile;
+
+ ofile += ".pearson_matrix";
+
+ cerr << "\nWriting sequences correlation matrix: " << ofile << endl;
+
+ ofstream out(ofile.c_str());
+
+ if(!out)
+ error_handler(CANTOPENOUTPUT,ofile);
+
+ out << '\t';
+
+ for(int i = 0; i < QUERY.size(); i++)
+ out << QUERY[i].w_name() << '\t';
+
+ out << endl;
+
+ for(int a = 0; a < QUERY.size(); a++)
+ {
+ out << QUERY[a].w_name();
+ m_point a_avg, a_dev;
+
+ a_avg = seq_stats(a, &a_dev);
+
+ for(int b = 0; b < QUERY.size(); b++)
+ {
+ m_point b_avg, b_dev, sum = 0;
+
+ b_avg = seq_stats(b,&b_dev);
+
+ for(int c = 0; c < QTFBS.size(); c++)
+ {
+ m_point z1,z2;
+
+ z1 = (QTFBS[c].row_get(a) - a_avg)/a_dev;
+ z2 = (QTFBS[c].row_get(b) - b_avg)/b_dev;
+
+ sum += z1*z2;
+ }
+
+ out << '\t' << sum / (int)((int)QTFBS.size() - 1);
+ }
+
+ out << endl;
+ }
+
+ return;
+}
+
+m_point query::seq_stats(int pos, m_point *dev_std)
+{
+ vector<m_point> s_v;
+ m_point avg = 0;
+ *dev_std = 0;
+
+ for(int t = 0; t < QTFBS.size(); t++)
+ s_v.push_back(QTFBS[t].row_get(pos));
+
+ for(int t = 0; t < s_v.size(); t++)
+ avg += s_v[t];
+
+ avg /= (int)s_v.size();
+
+ for(int t = 0; t < s_v.size(); t++)
+ *dev_std += pow(s_v[t] - avg,2);
+
+ *dev_std /= ((int)s_v.size() - 1);
+
+ *dev_std = pow(*dev_std,0.5);
+
+ return avg;
+}
+
+void query::zvectors_output(vector<tfbs> *TFBS)
+{
+ string zv_file = queryfile;
+
+ zv_file += ".zv";
+
+ ofstream out(zv_file.c_str());
+
+ for(int j = 0; j < QUERY.size(); j++)
+ {
+ out << QUERY.at(j).name() << '\t';
+
+ for(int i = 0; i < TFBS->size(); i++)
+ out << (QTFBS.at(i).row_get(j) - TFBS->at(i).row_avg_get()) / TFBS->at(i).row_stddev_get() << '\t';
+
+ out << endl;
+ }
+
+ out.close();
+
+ return;
+}
+/*
+void query::img_output(vector<tfbs> *TFBS)
+{
+ if((int)TFBS->size() * (int)QUERY.size() > MAX_DOTS)
+ {
+ cerr << "Image too big... i will not try to draw it!\n";
+ return;
+ }
+
+ cerr << "Drawing...(" << (int)TFBS->size() * (int)QUERY.size() << ") dots... ";
+
+ vector<m_point> buf_ts;
+ vector<int> seq_order;
+
+ buf_ts = ts;
+
+ FILE *img_out, *small_img_out;
+
+ string img_file = queryfile, mat_file = queryfile, small_img_file;
+
+ if(!q_idlist.empty())
+ img_file = q_idlist;
+
+ char *font_1 = "./SUSESansMono-Bold.ttf";
+
+ small_img_file = img_file;
+
+ img_file += ".png";
+ small_img_file += ".small.png";
+ mat_file += ".vma";
+
+ const int spot_width = 35, spot_height = 20, y_label_width = 320, x_label_height = 120;
+
+ int img_width = (spot_width * QTFBS.size()) + y_label_width, img_height = (spot_height * QUERY.size()) + x_label_height, f_color,
+ red[256], green[256];
+
+ gdImagePtr im = gdImageCreateTrueColor(img_width, img_height);
+ gdImagePtr small_im = gdImageCreateTrueColor(100,100);
+
+ f_color = gdImageColorAllocate(im,255,255,255);
+
+ for(int g = 0; g < 256; g++)
+ red[g] = gdImageColorAllocate(im,g,0,0);
+
+ for(int g = 0; g < 256; g++)
+ green[g] = gdImageColorAllocate(im,0,g,0);
+
+ for(int i = 0; i < TFBS->size(); i++)
+ {
+ int brect[8], c;
+ char tname[13];
+ double PI = 3.14159;
+ int m_pos = max_v_pos(buf_ts);
+
+// cerr << QTFBS.at(m_pos).name() << endl;
+
+ for(c = 0; c < 12 && c < QTFBS.at(m_pos).name().size(); c++)
+ tname[c] = QTFBS.at(m_pos).name().at(c);
+
+ while(c < 13)
+ {
+ tname[c] = '\0';
+ c++;
+ }
+
+ gdImageStringFT(im, brect, f_color, font_1, 12.0, PI/2, (spot_width * i)+y_label_width+spot_width-10 ,x_label_height, tname);
+
+ for(int j = 0; j < QUERY.size(); j++)
+ {
+ if(!i)
+ {
+ vector<m_point> buf_row = QTFBS.at(m_pos).row_get();
+ char sname[31];
+ int d;
+
+ if(!j)
+ for(int k = 0; k < QUERY.size(); k++)
+ {
+ seq_order.push_back(max_v_pos(buf_row));
+ buf_row[seq_order.at(seq_order.size() - 1)] = -INF;
+ }
+
+ for(d = 0; d < 30 && d < QUERY.at(seq_order[j]).w_name().size(); d++)
+ sname[d] = QUERY.at(seq_order[j]).name().at(d);
+
+ while(d < 31)
+ {
+ sname[d] = '\0';
+ d++;
+ }
+
+ gdImageStringFT(im,brect, f_color, font_1, 12.0, 0, 10, 15 + x_label_height + spot_height*j, sname);
+
+ }
+
+ m_point zs = (QTFBS.at(m_pos).row_get(seq_order[j]) - TFBS->at(m_pos).row_avg_get()) / TFBS->at(m_pos).row_stddev_get();
+
+ zs = ceil(zs*100);
+
+ // vma[m_pos][seq_order[j]] = (short int)zs;
+
+ if(fabs(zs) != zs)
+ {
+ zs = fabs(zs);
+
+ if(zs > 255)
+ zs = 255;
+
+ gdImageFilledRectangle(im, y_label_width + spot_width * i, x_label_height + spot_height * j,
+ y_label_width + (spot_width * (i + 1) - 1), x_label_height + (spot_height * (j + 1) - 1),
+ green[(int)zs]);
+ }
+ else
+ {
+ if(zs > 255)
+ zs = 255;
+
+ gdImageFilledRectangle(im, y_label_width + spot_width * i, x_label_height + spot_height * j,
+ y_label_width + (spot_width * (i + 1) - 1), x_label_height + (spot_height * (j + 1) - 1),
+ red[(int)zs]);
+ }
+ }
+
+ buf_ts[m_pos] = -INF;
+
+ }
+
+
+ gdImageCopyResampled(small_im,im,0,0,0,0,100,100,img_width, img_height);
+
+ img_out = fopen(img_file.c_str(), "wb");
+
+ if(!img_out)
+ error_handler(CANTOPENOUTPUT, img_file);
+
+ gdImagePng(im,img_out);
+
+ fclose(img_out);
+
+ small_img_out = fopen(small_img_file.c_str(),"wb");
+
+ if(!small_img_out)
+ error_handler(CANTOPENOUTPUT, small_img_file);
+
+ gdImagePng(small_im,small_img_out);
+
+ fclose(small_img_out);
+
+ gdImageDestroy(im);
+ gdImageDestroy(small_im);
+
+
+ cerr << "done\n";
+
+ return;
+
+}*/
+
+//END OF QUERY CLASS
+
+//START OF BIG_SEQUENCE CLASS
+
+/*vector<m_point> big_sequence::row_return(int r)
+{
+ if(r >= MAX_REL_SCORE.size())
+ error_handler(1000,"");
+
+ return MAX_REL_SCORE[r];
+} */
+
+void big_sequence::scan(vector<tfbs> *TFBS, int Snum, bool q)
+{
+
+ cerr << (char)13 << " " << (char)13
+ << name().substr(0,24) << " (" << Snum+1 << ": " << seq().size() << ")";
+
+ for(vector<tfbs>::iterator i = TFBS->begin(); i != TFBS->end(); i++)
+ SCAN(i,q);
+
+ return;
+}
+
+void big_sequence::SCAN(vector<tfbs>::iterator t, bool q)
+{
+ vector<m_point> scores, scores_rc, Rel_score;
+ vector<int> max_pos;
+ vector<string> max_motif;
+ vector<char> motif_strand;
+ int sub_length = t->m_length(), stp = ((int)seq().size() -BIG_WINDOW) / 100, Pos,pos,pos_rc;
+ m_point r_score, maxs, maxs_rc, Max;
+ char strand;
+
+ for(int i = 0; i < BIG_WINDOW; i++)
+ {
+ string sub = seq().substr(i,sub_length);
+
+ if(sub.size() != sub_length)
+ break;
+
+ m_point score = 0, score_rc = 0;
+
+ if(USE_N)
+ {
+ for(int x = 0; x < sub_length; x++)
+ {
+ if(sub[x] != 'N')
+ score += t->get_value(x,char_to_index(sub[x]));
+ else
+ score += t->get_matrix_average(x);
+
+ if(sub[sub_length - 1 - x] != 'N')
+ score_rc += t->get_value(x,comp_char_to_index(sub[sub_length - 1 - x]));
+ else
+ score_rc += t->get_matrix_average(sub_length - 1 - x);
+ }
+ }
+
+ else if(!USE_N && sub.find("N") == string::npos)
+ {
+ for(int x = 0; x < sub_length; x++)
+ {
+ score += t->get_value(x,char_to_index(sub[x]));
+ score_rc += t->get_value(x,comp_char_to_index(sub[sub_length - 1 - x]));
+ }
+ }
+ else
+ {
+ score = t->min()*2;
+ score_rc = t->min()*2;
+ }
+ scores.push_back(score);
+ scores_rc.push_back(score_rc);
+ }
+
+ maxs = max_v(&scores,&pos);
+ maxs_rc = max_v(&scores_rc,&pos_rc);
+
+ if(!DOUBLE_STRAND)
+ maxs_rc = -INF;
+
+ if(maxs >= maxs_rc)
+ {
+ Max = maxs;
+ Pos = pos;
+ strand = '+';
+ }
+ else
+ {
+ Max = maxs_rc;
+ Pos = pos_rc;
+ strand = '-';
+ }
+
+ r_score = Max;
+
+ r_score -= t->max();
+
+ r_score /= (t->min() - t->max());
+
+ max_pos.push_back(Pos);
+ max_motif.push_back(seq().substr(Pos,sub_length));
+
+ Rel_score.push_back(r_score);
+ motif_strand.push_back(strand);
+
+ for(int i = BIG_STEP; i < seq().size() - (BIG_WINDOW - BIG_STEP + sub_length); i += BIG_STEP)
+ {
+ int count = 0;
+
+ scores.erase(scores.begin(),scores.begin() + BIG_STEP);
+ scores_rc.erase(scores_rc.begin(),scores_rc.begin() + BIG_STEP);
+
+ if(i%100000 < BIG_STEP)
+ cerr << (char)13 << "(" << (i/stp) + 1 << "%)";
+
+ while(count < BIG_STEP)
+ {
+ string sub = seq().substr(i + BIG_WINDOW - BIG_STEP + count,sub_length);
+
+ if(sub.size() < sub_length)
+ break;
+
+ m_point score = 0, score_rc = 0;
+
+ if(USE_N)
+ {
+ for(int x = 0; x < sub_length; x++)
+ {
+ if(sub[x] != 'N')
+ score += t->get_value(x,char_to_index(sub[x]));
+ else
+ score += t->get_matrix_average(x);
+
+ if(sub[sub_length - 1 - x] != 'N')
+ score_rc += t->get_value(x,comp_char_to_index(sub[sub_length - 1 - x]));
+ else
+ score_rc += t->get_matrix_average(sub_length - 1 - x);
+ }
+ }
+
+ else if(!USE_N && sub.find("N") == string::npos)
+ {
+ for(int x = 0; x < sub_length; x++)
+ {
+ score += t->get_value(x,char_to_index(sub[x]));
+ score_rc += t->get_value(x,comp_char_to_index(sub[sub_length - 1 - x]));
+ }
+ }
+ else
+ {
+ score = t->min()*2;
+ score_rc = t->min()*2;
+ }
+
+ scores.push_back(score);
+ scores_rc.push_back(score_rc);
+ count++;
+
+ #ifdef DEBUG7
+ cerr << endl << sub << '\t' << i + BIG_WINDOW - BIG_STEP + count << '\t' << i + BIG_WINDOW - BIG_STEP + count + sub_length;
+
+ if(score >= score_rc)
+ Max = score;
+ else
+ Max = score_rc;
+
+ r_score = Max;
+
+ r_score -= t->max();
+
+ r_score /= (t->min() - t->max());
+
+ cerr << '\t' << 1-r_score;
+ #endif
+ }
+
+ maxs = max_v(&scores,&pos);
+ maxs_rc = max_v(&scores_rc,&pos_rc);
+
+ if(!DOUBLE_STRAND)
+ maxs_rc = -INF;
+
+ if(maxs >= maxs_rc)
+ {
+ Max = maxs;
+ Pos = pos;
+ strand = '+';
+ }
+ else
+ {
+ Max = maxs_rc;
+ Pos = pos_rc;
+ strand = '-';
+ }
+
+
+ r_score = Max;
+
+ r_score -= t->max();
+
+ r_score /= (t->min() - t->max());
+
+ #ifdef DEBUG7
+ cerr << '\t' << 1-r_score << '\t' << i + Pos << '\t' << i << '\t' << Pos;
+ #endif
+
+ max_pos.push_back(i + Pos);
+ max_motif.push_back(seq().substr(i + Pos,sub_length));
+ Rel_score.push_back(r_score);
+ motif_strand.push_back(strand);
+ }
+
+ MAX_REL_SCORE.push_back(Rel_score);
+ MAX_POS.push_back(max_pos);
+ MAX_MOTIF.push_back(max_motif);
+ MOTIF_STRAND.push_back(motif_strand);
+
+ return;
+}
+
+string big_sequence::track_output(int r)
+{
+ ostringstream str;
+ int count = 0;
+
+ for(int i = 0; i < MAX_REL_SCORE[r].size(); i++)
+ {
+ if(1 - fabs(MAX_REL_SCORE[r][i]) > TRACK_CUTOFF)
+ str << endl << trackbg << '\t' << count + 1 << '\t' << count + BIG_WINDOW << '\t' << MAX_MOTIF[r][i] << '\t'
+ << (1 - fabs(MAX_REL_SCORE[r][i]))*1000 /*<< '\t' << MAX_POS[r][i] + 1*/ << '\t'
+ << MOTIF_STRAND[r][i];
+
+ count += BIG_STEP;
+ }
+
+ return str.str();
+}
+
+string big_sequence::inline_output()
+{
+ ostringstream str;
+ int count = 0, mrs_min_size = 10000000;
+ m_point sum;
+ vector<m_point> v_avg;
+ vector<vector<int> > v_bns;
+ vector<int> v_count;
+// vector<int> bns (100,0);
+// vector<m_point> m_sum;
+
+ for(int i = 0; i < MAX_REL_SCORE.size(); i++)
+ if(MAX_REL_SCORE[i].size() < mrs_min_size)
+ mrs_min_size = MAX_REL_SCORE[i].size();
+
+ for(int i = 0; i < MAX_REL_SCORE.size(); i++)
+ {
+ vector<int> bns (101,0);
+ v_avg.push_back(0);
+ v_count.push_back(0);
+ v_bns.push_back(bns);
+ }
+
+ for(int i = 0; i < MAX_REL_SCORE.size(); i++)
+ for(int j = 0; j < MAX_REL_SCORE[i].size(); j++)
+ {
+ if(1 - fabs (MAX_REL_SCORE[i][j]) > 0)
+ {
+ v_avg[i] += (1 - fabs (MAX_REL_SCORE[i][j])) * 100;
+ v_bns[i][(int)floor((1 - fabs(MAX_REL_SCORE[i][j]))*100)]++;
+ v_count[i]++;
+ }
+ }
+
+ for(int i = 0; i < MAX_REL_SCORE.size(); i++)
+ v_avg[i] /= (m_point)v_count[i];
+
+ for(int i = 0; i < mrs_min_size; i++)
+ {
+// sum = 0;
+
+// if(i < MAX_REL_SCORE[MAX_REL_SCORE.size() 1])
+ if(1 - fabs(MAX_REL_SCORE.at(0).at(i)) >= 0)
+ {
+ str << count + 1 << '\t' << count + BIG_WINDOW << '\t';
+
+ for(int r = 0; r < MAX_REL_SCORE.size(); r++)
+ {
+ str << 1 - fabs(MAX_REL_SCORE.at(r).at(i)) /*<< '\t' << MAX_POS[r][i] + 1*/ << '\t';
+ // << MAX_MOTIF[r][i] << '\t' << MOTIF_STRAND[r][i] << '\t';
+
+// sum += 1 - fabs(MAX_REL_SCORE.at(r).at(i));
+
+ }
+
+ str << endl;
+
+ }
+
+ count += BIG_STEP;
+// m_sum.push_back(sum);
+
+// str << sum ;
+
+/* if(i == mrs_min_size - 1)
+ {
+ int pos;
+ str << '\t' << max_v(&m_sum,&pos);
+ }*/
+
+
+ }
+
+ for(int i = 0; i < MAX_REL_SCORE.size(); i++)
+ {
+ cout << endl << endl << "Avg = " << v_avg[i] << endl << endl;
+
+ for(int j = 1; j < v_bns[i].size(); j++)
+ cout << j << '\t' << v_bns[i][j] << endl;
+ }
+
+ return str.str();
+}
+//END OF CLASS BIG_SEQUENCE
+//START OF BOUND_OLIGOS CLASS
+#ifdef BOUND_OLIGOS
+vector<string> bound_oligos::assign(vector<string> oligos)
+{
+ vector<string> w_mat;
+
+ if(oligos.size() < 2)
+ {
+ error_handler(BAD_FILE_FORMAT,bound_oligos_file);
+ return w_mat;
+ }
+
+ if(oligos[0].size() < 2)
+ {
+ error_handler(BAD_FILE_FORMAT,bound_oligos_file);
+ return w_mat;
+ }
+
+ if(oligos[0][0] != '>')
+ {
+ error_handler(BAD_FILE_FORMAT,bound_oligos_file);
+ return w_mat;
+ }
+ else
+ NAME = oligos[0].substr(1);
+
+ int size = (int)oligos[1].size();
+
+ for(int i = 1; i < (int)oligos.size(); i++)
+ {
+ if(size != (int)oligos[i].size())
+ {
+ error_handler(BAD_FILE_FORMAT,bound_oligos_file);
+ return w_mat;
+ }
+ else
+ {
+ bool gflag = true;
+
+ for(int j = 0; j < (int)oligos[i].size(); j++)
+ {
+ oligos[i][j] = toupper(oligos[i][j]);
+
+ bool flag = false;
+
+ for(int a = 0; a < ALPHABET_SIZE; a++)
+ {
+ if(oligos[i][j] == ALPHABET[a])
+ {
+ flag = true;
+ break;
+ }
+ }
+
+ if(!flag)
+ {
+ gflag = false;
+ break;
+ }
+ }
+
+ if(gflag)
+ raw_mat.push_back(oligos[i]);
+ }
+ }
+
+ w_mat.push_back(NAME);
+
+ build_matrix();
+ build_pos_cor_matrix();
+ w2_vector(&w_mat);
+
+ #ifdef DEBUG8
+ for(int i = 0; i < raw_mat.size(); i++)
+ cerr << raw_mat[i] << endl;
+ #endif
+
+ #ifdef DEBUG9
+ cerr << endl << show_pos_cor_mat();
+ #endif
+
+
+ return w_mat;
+}
+
+void bound_oligos::w2_vector(vector<string> *w2_mat)
+{
+
+ for(int i = 0; i < ALPHABET_SIZE; i++)
+ {
+ ostringstream ss;
+
+ for(int j = 0; j < x_size; j++)
+ {
+ ss << mat[j][i];
+
+ if(j < x_size - 1)
+ ss << '\t';
+ }
+
+// cerr << endl << ss.str() << endl;
+
+ w2_mat->push_back(ss.str());
+ }
+
+ return;
+}
+
+void bound_oligos::build_matrix()
+{
+ x_size = (int)raw_mat[0].size();
+ y_size = (int)raw_mat.size();
+
+ mat = new int*[x_size];
+
+ for(int i = 0; i < x_size; i++)
+ mat[i] = new int[ALPHABET_SIZE];
+
+ for(int i = 0; i < x_size; i++)
+ {
+ for(int j = 0; j < ALPHABET_SIZE; j++)
+ mat[i][j] = 0;
+
+ for(int j = 0; j < y_size; j++)
+ mat[i][char_to_index(raw_mat[j][i])]++;
+ }
+
+// for(int i = 0; i < (int)raw_mat[0].size(); i++)
+ // delete[] mat[i];
+
+// delete[] mat;
+
+ return;
+}
+
+void bound_oligos::build_pos_cor_matrix()
+{
+ pos_cor_mat = new m_point*[x_size];
+
+ for(int i = 0; i < x_size; i++)
+ pos_cor_mat[i] = new m_point[x_size];
+
+ for(int i = 0; i < x_size; i++)
+ for(int j = 0; j < x_size; j++)
+ pos_cor_mat[i][j] = 0;
+
+ for(int x = 0; x < x_size; x++)
+ {
+ for(int y = 0; y < x_size; y++)
+ {
+ for(int a = 0; a < ALPHABET_SIZE; a++)
+ {
+ for(int b = 0; b < ALPHABET_SIZE; b++)
+ {
+ m_point obs, exp, v;
+
+ obs = (m_point)observed_cooccurrences(x, ALPHABET[a], y, ALPHABET[b]);
+
+// cerr << "\nobs =" << obs << '\t' << "y_size = " << (m_point)y_size << endl;
+
+// obs /= (m_point)y_size;
+
+ exp = ((((m_point)mat[x][a] + 0.0000001)
+ / (m_point)(y_size + 0.0000004))
+ * (((m_point)mat[y][b] + 0.0000001)
+ / (m_point)(y_size + 0.0000004)))
+ * ((m_point)y_size + 0.0000004);
+
+ v = pow((obs - exp),2)/exp;
+
+ pos_cor_mat[x][y] += v;
+
+ // cerr << "x = " << x << " a = " << ALPHABET[a] << " y = " << y << " b = " << ALPHABET[b] << endl;
+ // cerr << "obs = " << obs << '\t' << "exp = " << exp << '\t' << "v= " << v << '\t' << "pcm = "
+ // << pos_cor_mat[x][y] << endl;
+ }
+ }
+
+ pos_cor_mat[x][y] = gsl_cdf_chisq_Q(pos_cor_mat[x][y], 9);
+
+ }
+ }
+
+
+
+ return;
+
+}
+
+int bound_oligos::observed_cooccurrences(int i1, char x1, int i2, char x2)
+{
+ int count = 0;
+
+ for(int i = 0; i < y_size; i++)
+ if(raw_mat[i][i1] == x1 && raw_mat[i][i2] == x2)
+ {
+ // cerr << raw_mat[i] << endl;
+ // cerr << raw_mat[i][i1] << '=' << x1 << '\t' << raw_mat[i][i2] << '=' << x2 << endl;
+ count++;
+ }
+
+// cerr << endl << i1 << '\t' << x1 << '\t' << i2 << '\t' << x2 << '\t' << count;
+
+ return count;
+}
+
+int bound_oligos::char_to_index(char c)
+{
+ for(int i = 0; i < ALPHABET_SIZE; i++)
+ if(c == ALPHABET[i])
+ return i;
+}
+
+string bound_oligos::show_pos_cor_mat()
+{
+ ostringstream ss;
+
+ for(int y = 0; y < x_size; y++)
+ {
+ for(int x = 0; x < x_size; x++)
+ ss << pos_cor_mat[x][y] << '\t';
+
+ ss << endl;
+ }
+
+ return ss.str();
+}
+#endif
+//END OF BOUND_OLIGOS CLASS
+
+//START OF CHIP CLASS
+
+chip::chip()
+{
+ unsigned int line_counter = 0, true_counter = 0;
+
+ if((PROBE % 2) == 0 || PROBE > 15 || PROBE == 0)
+ error_handler(EVEN_PROBE,"");
+
+ noe = chip_files.size();
+
+ vector<string> lines(noe);
+ vector<vector<string > > stack(noe);
+
+ exp_bg_means.resize(noe,0);
+ exp_fg_means.resize(noe,0);
+ Log_Ratio.resize(noe,0);
+
+ set_bins();
+
+ ifstream *in;
+ fout.open("chip.fasta");
+
+ in = new ifstream[noe];
+
+ for(unsigned short int a = 0; a < noe; a++)
+ {
+ in[a].open(chip_files.at(a).c_str());
+
+ if(!in[a])
+ error_handler(MISSING_CHIP_FILE,chip_files.at(a));
+ }
+
+ while(getline(in[0],lines[0]))
+ {
+ bool cflag = false;
+
+ for(unsigned short int a = 1; a < noe; a++)
+ getline(in[a],lines[a]);
+
+ if(!consistency_check(&lines))
+ {
+ ostringstream buf;
+ buf << line_counter;
+
+ error_handler(INCONSISTENT_CHIP,buf.str());
+ }
+ else
+ {
+
+ for(unsigned short int a = 0; a < noe; a++)
+ {
+ if(!lines[a].empty())
+ {
+ if(lines[a][0] != '#')
+ stack[a].push_back(lines[a]);
+ else
+ cflag = true;
+ }
+ else
+ cflag = true;
+
+ if(stack[a].size() > PROBE)
+ stack[a].erase(stack[a].begin());
+// cerr << a << '\t' << stack[a].size() << endl;
+ }
+
+ if(stack[0].size() == PROBE && !cflag)
+ stack_processor(&stack);
+ }
+
+ line_counter++;
+
+ if(!cflag)
+ {
+ true_counter++;
+ Exp_Push_Mean(&lines);
+ }
+ }
+
+ for(unsigned short int a = 0; a < noe; a++)
+ in[a].close();
+
+ delete[] in;
+
+ display_bins();
+
+ cerr << "\nChip data acquired." << endl;
+
+ cerr << "\nExperiment means:\n";
+
+ for(unsigned int a = 0; a < noe; a++)
+ {
+ exp_bg_means[a] /= true_counter;
+ exp_fg_means[a] /= true_counter;
+
+ cerr << a+1 << ":\t" << exp_bg_means[a] << '\t' << exp_fg_means[a] << '\t' << Log_Ratio[a]/true_counter << endl;
+ }
+
+ fout.close();
+
+ return;
+
+}
+
+void chip::Exp_Push_Mean(vector<string> *lines)
+{
+ for(unsigned short int a = 0; a < noe; a++)
+ {
+ istringstream in(lines->at(a));
+
+ string buf;
+ m_point bg, fg, lbg, lfg;
+
+ in >> buf >> buf >> buf >> bg >> fg;
+
+ lbg = log(bg);
+ lfg = log(fg);
+
+// cout << lines->at(a) << endl;
+// cout << log(bg) << endl;
+
+ exp_bg_means[a] += lbg;
+ exp_fg_means[a] += lfg;
+
+ Log_Ratio[a] += (lbg - lfg);
+ }
+
+ return;
+}
+
+void chip::set_bins()
+{
+ bins.push_back(0);
+
+ for(m_point a = 0; a < 30; a += 0.1)
+ {
+ bins.push_back(0);
+ // bin_label.push_back(a);
+ }
+
+ bins.push_back(0);
+
+ return;
+}
+
+bool chip::consistency_check(vector<string> *lines)
+{
+ unsigned short int ccount = 0;
+
+ for(unsigned short int a = 0; a < noe; a++)
+ {
+ if(lines->at(a).empty())
+ {
+ ccount++;
+ continue;
+ }
+ else if(lines->at(a).at(0) == '#')
+ ccount++;
+ }
+
+ if(ccount != 0 && ccount != noe)
+ return false;
+ else if(ccount == noe)
+ return true;
+
+ istringstream *vss;
+
+ vss = new istringstream[noe];
+
+ for(unsigned short int a = 0; a < noe; a++)
+ vss[a].str(lines->at(a));
+
+ for(unsigned short int a = 0; a < 3; a++)
+ {
+ vector<string> buf(noe);
+
+ for(unsigned short int b = 0; b < noe; b++)
+ vss[b] >> buf[b];
+
+ for(unsigned short int b = 1; b < noe; b++)
+ {
+ if(buf[0] != buf[b])
+ return false;
+ }
+ }
+
+ delete[] vss;
+
+ return true;
+}
+
+void chip::get_current_chr_seq(string filename, string *seq)
+{
+ const string suffix="_hg17";
+ string line;
+
+ filename += suffix;
+
+ ifstream in(filename.c_str());
+
+ if(!in)
+ {
+ cerr << "\nMissing file: " << filename << endl;
+ exit(1);
+ }
+
+ while(getline(in,line))
+ {
+ if(!line.empty())
+ if(!(line.at(0) == '>'))
+ {
+ for(unsigned int a = 0; a < line.size(); a++)
+ seq->push_back(line.at(a));
+ }
+
+ }
+
+ in.close();
+
+
+ return;
+}
+
+void chip::stack_processor(vector<vector<string > > *stack)
+{
+ istringstream vss;
+ string buf;
+ unsigned int start, stop;
+ unsigned int psize;
+ vector<m_point> v_bg, v_fg;
+ m_point mean_bg = 0, var_bg = 0, mean_fg = 0, var_fg = 0, welch, deg_free, N;
+
+ vss.str(stack->at(0).back());
+
+ vss >> buf >> start >> stop;
+
+ psize = (stop - start);
+
+ vss.str(stack->at(0).front());
+
+ vss >> buf >> start;
+
+// cerr << endl << stack->at(0)[0] << endl << stack->at(0)[1] << endl << stack->at(0)[2] << endl;
+// cerr << endl << start << '\t' << stop << '\t' << psize;
+
+ if(stop < start)
+ return;
+
+ if((stop - start) != psize * ((PROBE*2) - 1))
+ return;
+
+ Chr.push_back(buf);
+
+ if(current_chr.empty() || current_chr != buf)
+ {
+ current_chr_seq.clear();
+ get_current_chr_seq(buf, ¤t_chr_seq);
+ current_chr = buf;
+ }
+
+ Start.push_back(start + (psize * (PROBE - 1)));
+ Stop.push_back(Start.back() + psize);
+
+ for(unsigned short int a = 0; a < noe; a++)
+ for(unsigned short int b = 0; b < PROBE; b++)
+ {
+ m_point bg, fg;
+
+ vss.str(stack->at(a).at(b));
+
+ vss >> buf >> buf >> buf >> bg >> fg;
+
+ v_bg.push_back(log(bg));
+ v_fg.push_back(log(fg));
+ }
+
+ for(unsigned short int a = 0; a < v_bg.size(); a++)
+ {
+ mean_bg += v_bg.at(a);
+ mean_fg += v_fg.at(a);
+ }
+
+ N = v_bg.size();
+
+ mean_bg /= N;
+ mean_fg /= N;
+
+ for(unsigned short int a = 0; a < v_bg.size(); a++)
+ {
+ var_bg += pow((v_bg[a] - mean_bg),2);
+ var_fg += pow((v_fg[a] - mean_fg),2);
+ }
+
+ var_bg /= N;
+ var_fg /= N;
+
+ welch = (mean_bg - mean_fg)/(pow((var_bg/N) + (var_fg/N),0.5));
+
+ deg_free = pow((var_bg/N) + (var_fg/N),2);
+
+ deg_free /= (pow(var_bg,2)/(pow(N,2)*(N-1)) + pow(var_fg,2)/(pow(N,2)*(N-1)));
+
+ Score.push_back(gsl_cdf_tdist_P(welch,deg_free));
+
+ int bin_pos = (welch + 15)/0.1;
+
+ if(bin_pos < 0)
+ bins[0]++;
+
+ else if(bin_pos > bins.size() - 2)
+ bins[bins.size() - 1]++;
+
+ else
+ bins[bin_pos+1]++;
+
+
+ if(Score.back() <= CHIP_CUTOFF)
+ {
+ cerr << Chr.back() << '\t' << Start.back() << '\t' << Stop.back() << '\t' << mean_bg << '\t' << mean_fg << '\t' << var_bg << '\t' << var_fg
+ << '\t' << welch << '\t' << deg_free
+ << '\t' << Score.back() << endl;
+
+ fout << ">" << Chr.back() << ' ' << Start.back() - 450 << ' ' << Stop.back() + 450 << ' ' << Score.back() << ' ' << current_chr_seq.size() << endl
+ << current_chr_seq.substr(Start.back() - 1 - 450, Stop.back() - Start.back() + 450 + 450) << endl;
+ }
+
+ return;
+}
+
+void chip::display_bins()
+{
+ m_point pos = 0;
+ unsigned int count = 1;
+
+ cout << "<-15\t" << bins[0] << endl;
+
+ for(;pos < 30; pos += 0.1)
+ cout << pos-15 << '\t' << bins[count++] << endl;
+
+ cout << ">=15\t" << bins.back() << endl;
+
+ return;
+}
+//END OF CHIP CLASS
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pscan-tfbs.git
More information about the debian-med-commit
mailing list