[med-svn] [inspect] 01/02: Imported Upstream version 0.0.20120109
Andreas Tille
tille at debian.org
Wed Sep 30 15:18:22 UTC 2015
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository inspect.
commit 9f3a74855a2ee51615c80b105ff9164fd76db028
Author: Andreas Tille <tille at debian.org>
Date: Wed Sep 30 17:12:25 2015 +0200
Imported Upstream version 0.0.20120109
---
AdjustPTM.py | 1869 +++++++++++++++
AminoAcidMasses.txt | 22 +
BN.c | 204 ++
BN.h | 96 +
BasicStats.py | 120 +
BuildConsensusSpectrum.py | 273 +++
BuildInspect.py | 123 +
BuildMGF.py | 126 +
BuildMS2DB.c | 2101 +++++++++++++++++
BuildMS2DB.h | 40 +
BuildMS2DB.jar | Bin 0 -> 182770 bytes
CCSVM1.model | 44 +
CCSVM1.range | 12 +
CCSVM2.model | 118 +
CCSVM2.range | 24 +
CCSVM2Phos.model | 320 +++
CCSVM2Phos.range | 25 +
CMemLeak.c | 421 ++++
CMemLeak.h | 43 +
Ch2BNPEP.dat | Bin 0 -> 3376 bytes
Ch2BNPEPQ.dat | Bin 0 -> 3376 bytes
Ch3BNPEP.dat | Bin 0 -> 3376 bytes
Ch3BNPEPQ.dat | Bin 0 -> 3376 bytes
ChargeState.c | 899 ++++++++
ChargeState.h | 65 +
CombinePTMFeatures.py | 627 +++++
CompareHEKPTM.py | 808 +++++++
ComputeFDR.jar | Bin 0 -> 4364231 bytes
ComputeFScore.py | 328 +++
ComputePTMFeatures.py | 943 ++++++++
Database/CommonContaminants.fasta | 20 +
Database/TestDatabase.index | Bin 0 -> 3404 bytes
Database/TestDatabase.trie | 1 +
Errors.c | 261 +++
Errors.h | 88 +
ExonGraphAlign.c | 1195 ++++++++++
ExonGraphAlign.h | 40 +
ExplainPTMs.py | 148 ++
FDRUtils.py | 1109 +++++++++
FreeMod.c | 2720 ++++++++++++++++++++++
FreeMod.h | 91 +
GetByteOffset.py | 169 ++
Global.py | 64 +
InVitroModifications.txt | 7 +
InVivoModifications.txt | 9 +
Inspect.exe | Bin 0 -> 920576 bytes
Inspect.h | 190 ++
Inspect.sln | 19 +
Inspect.vcproj | 566 +++++
InspectToPepXML.py | 859 +++++++
IonScoring.c | 1873 +++++++++++++++
IonScoring.h | 195 ++
IsotopePatterns.txt | 1750 ++++++++++++++
LDA.c | 280 +++
LDA.h | 59 +
LDA.py | 469 ++++
Label.py | 576 +++++
Learning.py | 1276 ++++++++++
MQScoreLDA2.model | Bin 0 -> 636 bytes
MQScoreLDA3.model | Bin 0 -> 636 bytes
MQScoreSVM2.model | 269 +++
MQScoreSVM2.range | 9 +
MQScoreSVM3.model | 282 +++
MQScoreSVM3.range | 9 +
MS2DB.c | 688 ++++++
MS2DB.h | 45 +
MS2DBShuffler.jar | Bin 0 -> 178348 bytes
MSSpectrum.py | 663 ++++++
MakeImage.py | 623 +++++
Makefile | 36 +
Mods.c | 340 +++
Mods.h | 110 +
PLSUtils.py | 265 +++
PMCLDA1.model | Bin 0 -> 956 bytes
PMCLDA2.model | Bin 0 -> 2580 bytes
PMCLDA2Phos.model | Bin 0 -> 3188 bytes
PMCLDA3.model | Bin 0 -> 2580 bytes
PMCLDA3Phos.model | Bin 0 -> 3188 bytes
PRM2.bn | Bin 0 -> 10964 bytes
PRM2.dat | Bin 0 -> 2736 bytes
PRM3.bn | Bin 0 -> 10964 bytes
PRM3.dat | Bin 0 -> 2736 bytes
PRMQ2.dat | Bin 0 -> 2736 bytes
PRMQ3.dat | Bin 0 -> 2736 bytes
PTMAnalysis.py | 523 +++++
PTMChooserLM.py | 1294 +++++++++++
PTMDatabase.txt | 563 +++++
PTMSearchBigDB.py | 171 ++
PTMods.txt | 105 +
PValue.c | 662 ++++++
PValue.h | 42 +
ParentMass.c | 710 ++++++
ParentMass.h | 105 +
ParseInput.c | 1653 +++++++++++++
ParseInput.h | 44 +
ParseXML.c | 1239 ++++++++++
ParseXML.h | 46 +
ParseXML.py | 281 +++
PhosCut2.bn | Bin 0 -> 19740 bytes
PhosCut3.bn | Bin 0 -> 22240 bytes
PhosphateLocalization.py | 324 +++
PrepDB.py | 283 +++
ProteinGrouper.py | 471 ++++
PyInspect.pyd | Bin 0 -> 315392 bytes
PyInspect/PyInspect.c | 661 ++++++
PyInspect/PySpectrum.c | 1265 ++++++++++
PyInspect/PySpectrum.h | 145 ++
PyInspect/PyUtils.c | 49 +
PyInspect/PyUtils.h | 39 +
PySVM.pyd | Bin 0 -> 57344 bytes
PySVM/PySVM.c | 327 +++
PySVM/PySVM.sln | 21 +
PySVM/PySVM.vcproj | 198 ++
PySVM/svm-predict.c | 202 ++
PySVM/svm.cpp | 3087 +++++++++++++++++++++++++
PySVM/svm.h | 72 +
ReleaseFiles.txt | 234 ++
ReleasePyInspect.py | 67 +
ReleasePySVM.py | 48 +
ResultsParser.py | 152 ++
Run.c | 1492 ++++++++++++
Run.h | 41 +
RunPySVM.py | 67 +
SNP.c | 244 ++
SNP.h | 63 +
SVM.c | 644 ++++++
SVM.h | 81 +
Score.c | 862 +++++++
Score.h | 85 +
Score.py | 61 +
ScoringModel.dat | Bin 0 -> 1680 bytes
Scorpion.c | 1304 +++++++++++
Scorpion.h | 108 +
SelectProteins.py | 397 ++++
ShuffleDB.py | 285 +++
SpectralSimilarity.py | 502 ++++
Spectrum.c | 1487 ++++++++++++
Spectrum.h | 160 ++
SpliceDB.c | 4212 ++++++++++++++++++++++++++++++++++
SpliceDB.h | 150 ++
SpliceScan.c | 1003 ++++++++
SpliceScan.h | 39 +
Spliced.c | 2113 +++++++++++++++++
Spliced.h | 120 +
StripPTM.py | 117 +
Summary.py | 471 ++++
SystemTest.py | 251 ++
SystemTest/BuildSimpleChromosome.txt | 3 +
SystemTest/Shew_Short.fasta | 20 +
SystemTest/Shew_dta.txt | 1451 ++++++++++++
SystemTest/SimpleChromosome.trie | 1 +
SystemTest/SimpleGenes.gff | 5 +
SystemTest/TestCDTA.txt | 5 +
SystemTest/TestInput.txt | 26 +
SystemTest/TestInputMod.txt | 8 +
SystemTest/TestInputTag1.txt | 9 +
SystemTest/TestInputTag3.txt | 9 +
SystemTest/TestMS2.txt | 8 +
SystemTest/TestPMC.txt | 7 +
SystemTest/TestSpectra.pkl | 1773 ++++++++++++++
SystemTest/TestSpectrum.dta | 131 ++
SystemTest/Yeast.ms2 | 1149 ++++++++++
SystemTest/YeastSmall.fasta | 62 +
TAG2.bn | Bin 0 -> 15372 bytes
TAG3.bn | Bin 0 -> 15372 bytes
TagFile.c | 493 ++++
TagFile.h | 67 +
TagSkewScores.dat | Bin 0 -> 252 bytes
Tagger.c | 2148 +++++++++++++++++
Tagger.h | 199 ++
TrainPTMFeatures.py | 762 ++++++
Trie.c | 2659 +++++++++++++++++++++
Trie.h | 309 +++
TrieUtils.py | 256 +++
Utils.c | 683 ++++++
Utils.h | 345 +++
Utils.py | 1074 +++++++++
base64.c | 217 ++
base64.h | 6 +
docs/Analysis.html | 79 +
docs/Copyright.html | 47 +
docs/Database.html | 78 +
docs/InspectTutorial.pdf | Bin 0 -> 120117 bytes
docs/Installation.html | 42 +
docs/MS2DB.html | 51 +
docs/PLSTutorial.pdf | Bin 0 -> 61551 bytes
docs/RunningInspectOnTheFWGrid.pdf | Bin 0 -> 27939 bytes
docs/Searching.html | 128 ++
docs/UnrestrictedSearchTutorial.pdf | Bin 0 -> 90009 bytes
docs/index.html | 42 +
main.c | 863 +++++++
191 files changed, 73681 insertions(+)
diff --git a/AdjustPTM.py b/AdjustPTM.py
new file mode 100644
index 0000000..3cb9338
--- /dev/null
+++ b/AdjustPTM.py
@@ -0,0 +1,1869 @@
+#Title: AdjustPTM.py
+#Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+Merge and reconcile peptide species, after running ComputePTMFeatures
+and TrainPTMFeatures. Iterate over peptide species from "best" to
+"worst". For each species, consider whether there's another species
+which (1) is better, and either (2a) is the same after mod-shifting
+(including charge-state), or (2b) is compatible after mod-shifting.
+Case (2a): Try merging both species into one big cluster, determine
+whether the MQScore / PValue improves. If so, keep the merge.
+Case (2b): Try shifting the inferior species to match the superior.
+If the MQScore / PValue improves (or at least stays CLOSE), then
+keep the shift.
+
+A note on directories:
+Consensus spectra and clusters can take quite a bit of disk space. Running
+AdjustPTM changes spectra and clusters, but we'd like the liberty to
+re-run AdjustPTM. Therefore, AdjustPTM uses a set of "adjusted" directories.
+It wipes these when it starts a merge/reconcile run; it writes to them; it
+reads clusters and spectra from these directories first, if possible.
+"""
+import os
+import struct
+import shutil
+import math
+import sys
+import MSSpectrum
+import string
+import traceback
+import getopt
+import cPickle
+import BuildConsensusSpectrum
+import Learning
+import PyInspect
+import ResultsParser
+import SpectralSimilarity
+from Utils import *
+Initialize()
+import TrainPTMFeatures
+from TrainPTMFeatures import FormatBits
+from TrainPTMFeatures import FeatureBits
+
+PROFILING_RUN = 0
+
+class AnticipatedChemistry:
+ """
+ Represents a chemical adduct which we expect to see often, with relatively low
+ site specificity. Examples: M+16, *.Q-17. We want to flag the adducts
+ to highlight the remaining NON-adducts!
+ """
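+    # Roughly how the docstring examples might be represented (illustrative values
+    # only; real instances are populated by LoadKnownModifications below from the
+    # known-chemistry file):
+    #   M+16 oxidation:  Mass = 16,  AllowedResidues = "M"
+    #   *.Q-17 pyro-Glu: Mass = -17, AllowedResidues = "Q", Terminus = "N"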
+ def __init__(self):
+ # For the "allowed" members, None means "no restriction".
+ self.AllowedResidues = None
+ self.AllowedPrefix = None
+ self.AllowedSuffix = None
+ self.Terminus = None
+ self.Mass = 0
+ self.Name = ""
+
+class SiteClass:
+ """
+ Wrapper for one or more Species instances which represent the same modification-mass
+ at the same database-position.
+ """
+ def __init__(self):
+ self.ModDBPos = None
+ self.ModMass = None
+ self.SpeciesList = []
+ def __str__(self):
+ return "%+d on dbpos %s"%(self.ModMass, self.ModDBPos)
+
+class PeptideSpeciesClass:
+ """
+ Represents one (modified) peptide species; different charge states are different
+ peptide species.
+ """
+ def __init__(self):
+ self.MergedFlag = 0
+ self.ConsensusSpectrum = None
+ self.ConsensusMSSpectrum = None
+ self.BestSpectrum = None
+ self.BestMSSpectrum = None
+ self.Bits = None
+ self.ConsensusModlessMSSpectrum = None
+ def __str__(self):
+ return "<peptide species: %s>"%self.Bits
+ def FreeCachedSpectra(self):
+ """
+ Discard our PySpectrum and MSSpectrum objects, because we can't hold ALL
+ such in memory at once:
+ """
+ self.BestSpectrum = None
+ self.BestMSSpectrum = None
+ self.ConsensusSpectrum = None
+ self.ConsensusMSSpectrum = None
+ self.ConsensusModlessMSSpectrum = None
+ def GetBestSpectrum(self, Master):
+ if self.BestSpectrum:
+ return self.BestSpectrum
+ FilePath = self.Bits[FormatBits.BestSpectrumPath]
+ ColonBits = FilePath.split(":")
+ try:
+ FilePos = int(ColonBits[-1])
+ FilePath = string.join(ColonBits[:-1], ":")
+ except:
+ FilePos = 0
+ FilePath = Master.FixSpectrumPath(FilePath)
+ self.BestSpectrum = PyInspect.Spectrum(FilePath, FilePos)
+ return self.BestSpectrum
+ def GetMemberListStr(self, Master):
+ Path = os.path.join(Master.ClusterScanListDirAdjusted, self.Annotation[2], "%s.%s.txt"%(self.Annotation, self.Charge))
+ if not os.path.exists(Path):
+ Path = os.path.join(Master.ClusterScanListDir, self.Annotation[2], "%s.%s.txt"%(self.Annotation, self.Charge))
+ if not os.path.exists(Path):
+ # Punt!
+ return ""
+ File = open(Path, "rb")
+ Text = File.read()
+ File.close()
+ return Text
+ def GetConsensusSpectrumPath(self, Master):
+ Path = os.path.join(Master.ConsensusSpectraDirAdjusted, self.Annotation[2], "%s.%s.dta"%(self.Annotation, self.Charge))
+ if not os.path.exists(Path):
+ Path = os.path.join(Master.ConsensusSpectraDir, self.Annotation[2], "%s.%s.dta"%(self.Annotation, self.Charge))
+ return Path
+ def GetConsensusSpectrum(self, Master):
+ if self.ConsensusSpectrum:
+ return self.ConsensusSpectrum
+ Path = self.GetConsensusSpectrumPath(Master)
+ self.ConsensusSpectrum = PyInspect.Spectrum(Path, 0)
+ return self.ConsensusSpectrum
+ def GetConsensusMSSpectrum(self, Master):
+ if self.ConsensusMSSpectrum:
+ return self.ConsensusMSSpectrum
+ Path = self.GetConsensusSpectrumPath(Master)
+ self.ConsensusMSSpectrum = MSSpectrum.SpectrumClass()
+ self.ConsensusMSSpectrum.ReadPeaks(Path)
+ self.ConsensusMSSpectrum.FilterPeaks()
+ self.ConsensusMSSpectrum.RankPeaksByIntensity()
+ return self.ConsensusMSSpectrum
+ def GetConsensusModlessMSSpectrum(self, Master):
+ if self.ConsensusModlessMSSpectrum:
+ return self.ConsensusModlessMSSpectrum
+ Path = os.path.join(Master.ConsensusSpectraDirAdjusted, self.ModlessAnnotation[2], "%s.%s.dta"%(self.ModlessAnnotation, self.Charge))
+ if not os.path.exists(Path):
+ Path = os.path.join(Master.ConsensusSpectraDir, self.ModlessAnnotation[2], "%s.%s.dta"%(self.ModlessAnnotation, self.Charge))
+ self.ConsensusModlessMSSpectrum = MSSpectrum.SpectrumClass()
+ self.ConsensusModlessMSSpectrum.ReadPeaks(Path)
+ self.ConsensusModlessMSSpectrum.FilterPeaks()
+ self.ConsensusModlessMSSpectrum.RankPeaksByIntensity()
+ return self.ConsensusModlessMSSpectrum
+ def ParseBits(self, Bits):
+ self.Bits = Bits
+ self.Annotation = Bits[FormatBits.Peptide]
+ self.Peptide = GetPeptideFromModdedName(self.Annotation)
+ self.ModlessAnnotation = "%s.%s.%s"%(self.Peptide.Prefix, self.Peptide.Aminos, self.Peptide.Suffix)
+ self.Charge = int(Bits[FormatBits.Charge])
+ self.ModDBPos = int(Bits[FormatBits.DBPos])
+ ModIndex = self.Peptide.Modifications.keys()[0]
+ self.DBPos = self.ModDBPos - ModIndex
+ self.ModMass = self.Peptide.Modifications[ModIndex][0].Mass
+ self.ModAA = self.Peptide.Aminos[ModIndex]
+ self.DBEnd = self.DBPos + len(self.Peptide.Aminos)
+ self.ConsensusMQScore = float(Bits[FormatBits.ConsensusMQScore])
+ try:
+ self.ModelScore = float(Bits[FormatBits.ModelScore])
+ self.PValue = float(Bits[FormatBits.ModelPValue])
+ except:
+ self.ModelScore = None
+ self.PValue = None
+ # Parse old features:
+ self.Features = []
+ for FeatureIndex in range(FormatBits.FirstFeature, FormatBits.LastFeature + 1):
+ try:
+ self.Features.append(float(Bits[FeatureIndex]))
+ except:
+ self.Features.append(0)
+ self.ComputePrefixes()
+ def ComputePrefixes(self):
+ # self.Prefixes[DBPos] is the mass that this species accumulates
+ # *before* the specified residue. Examples:
+ # ~ Species.Prefixes[Species.DBPos] = 0 always,
+        # ~ Species.Prefixes[Species.DBPos + 1] is equal to the mass (with modification,
+        #   if any) of the first residue.  A worked example follows this method.
+ self.Prefixes = {}
+ self.Suffixes = {}
+ ParentMass = self.Peptide.Masses[-1] + 19
+ AccumulatedMass = 0
+ for Pos in range(len(self.Peptide.Aminos)):
+ self.Prefixes[self.DBPos + Pos] = AccumulatedMass
+ self.Suffixes[self.DBPos + Pos] = ParentMass - AccumulatedMass
+ AccumulatedMass += GetMass(self.Peptide.Aminos[Pos])
+ for Mod in self.Peptide.Modifications.get(Pos, []):
+ AccumulatedMass += Mod.Mass
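+    # A worked example of the bookkeeping above, using nominal residue masses
+    # (G=57, A=71, T=101, K=128; illustrative only, the real values come from
+    # GetMass).  For a species annotated X.GA+16TK.Y starting at database position d:
+    #   Prefixes[d]   = 0
+    #   Prefixes[d+1] = 57          (G)
+    #   Prefixes[d+2] = 57+71+16    (G, A, and the +16 modification on A)
+    #   Prefixes[d+3] = 144+101     (...plus T)
+    # and Suffixes[d+i] = ParentMass - Prefixes[d+i] at every covered position.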
+
+class PTMAdjuster(ResultsParser.SpectrumOracleMixin):
+ def __init__(self):
+ self.HeaderLines = []
+ self.CompatibilityTolerance = 3
+ self.CachedClusterPath = None
+ self.ConsensusClusterDir = "PTMScore\\Lens-99-10\\Cluster" # default
+ self.ConsensusSpectraDir = "PTMScore\\Lens-99-10\\Spectra" # default
+ self.SortByModel = 1
+ self.PeptideDict = {} # keys: (Annotation, Charge)
+ self.KnownChemistryFileName = None
+ self.OutputModelFileName2 = None
+ self.OutputModelFileName3 = None
+ self.DBStart = None
+ self.DBEnd = None
+ self.SpectrumRoot = None
+ self.CheckDirectoriesFlag = 0
+ self.MergeBlockRunsFlag = 0
+ self.KnownPTMVerboseOutputFileName = None
+ self.MaxPeptideWindowWidth = 2500
+ ResultsParser.SpectrumOracleMixin.__init__(self)
+ def PerformMergeReconcileOnWindow(self, PerformMergeFlag):
+ ###############################################################
+ # Consider merging/reconciling these peptides:
+ SortedList = []
+ self.PeptideDict = {}
+ for Species in self.WindowPeptides:
+ if self.SortByModel:
+ SortedList.append((Species.ModelScore, Species))
+ else:
+ SortedList.append((Species.ConsensusMQScore, Species))
+ Key = (Species.Annotation, Species.Charge)
+ self.PeptideDict[Key] = Species
+ # SortedList lists species from BEST to WORST.
+ SortedList.sort()
+ SortedList.reverse()
+ # Dual iteration over the peptides from the window: Species A has the
+ # lower score, species B has the higher score.
+ # Consider reconciling species A to species B:
+ for IndexA in range(len(SortedList)):
+ (ScoreA, SpeciesA) = SortedList[IndexA]
+ Str = "(%s/%s) %s %s"%(IndexA, len(SortedList), SpeciesA.Charge, SpeciesA.Annotation)
+ if PerformMergeFlag:
+ print "M", Str
+ else:
+ print "C", Str
+ if SpeciesA.MergedFlag:
+ # A has already been merged into another species.
+ continue
+ for IndexB in range(IndexA):
+ (ScoreB, SpeciesB) = SortedList[IndexB]
+ if SpeciesB.MergedFlag:
+ # B has already been merged into another species.
+ continue
+ # Compatibility checks.
+ # Charge must be the same in order to MERGE (but not to RECONCILE):
+ if SpeciesA.Charge != SpeciesB.Charge and PerformMergeFlag:
+ continue
+ # Peptides must overlap:
+ if SpeciesA.DBEnd <= SpeciesB.DBPos or SpeciesB.DBEnd <= SpeciesA.DBPos:
+ continue
+ # To reconcile, Peptide A must cover the DBposition which is modified in B:
+ if SpeciesB.ModDBPos >= SpeciesA.DBEnd or SpeciesB.ModDBPos < SpeciesA.DBPos:
+ if not PerformMergeFlag:
+ continue
+ # First, look for a MERGE:
+ # Prefix and suffix must be the same at some point:
+ SamePrefixSuffix = 0
+ for DBPos in SpeciesA.Prefixes.keys():
+ PMassA = SpeciesA.Prefixes[DBPos]
+ PMassB = SpeciesB.Prefixes.get(DBPos, -9999)
+ if abs(PMassA - PMassB) >= self.CompatibilityTolerance:
+ continue
+ SMassA = SpeciesA.Suffixes[DBPos]
+ SMassB = SpeciesB.Suffixes.get(DBPos, -9999)
+ if abs(SMassA - SMassB) >= self.CompatibilityTolerance:
+ continue
+ SamePrefixSuffix = 1
+ break
+ if SamePrefixSuffix and SpeciesA.Charge == SpeciesB.Charge:
+ # Merge is possible. If this is first-cycle, then do a merge;
+ # if not, then continue.
+ if PerformMergeFlag:
+ MergeFlag = self.AttemptMerge(SpeciesA, SpeciesB) # A into B
+ if MergeFlag:
+ SpeciesB.FreeCachedSpectra()
+ break
+ # We didn't merge A into B. But perhaps we can merge B into A!
+ MergeFlag = self.AttemptMerge(SpeciesB, SpeciesA, 1) # B into A
+ if MergeFlag:
+ SpeciesB.FreeCachedSpectra()
+ break
+ continue
+ # Merge is impossible. If this is first-cycle, bail out:
+ if PerformMergeFlag:
+ continue
+ # If species A and B are already compatible, then there's nothing to do:
+ if SpeciesA.ModDBPos == SpeciesB.ModDBPos and SpeciesA.ModMass == SpeciesB.ModMass:
+                    print "(Already reconciled to %s)"%SpeciesB.Annotation
+ continue
+ # Perhaps A could be conformed to B
+ # if the modification-masses are similar (possibly after
+ # an endpoint shift):
+ if SpeciesA.DBPos < SpeciesB.DBPos:
+ ExtraPrefixA = GetMass(self.DB[SpeciesA.DBPos:SpeciesB.DBPos])
+ else:
+ ExtraPrefixA = 0
+ if SpeciesB.DBPos < SpeciesA.DBPos:
+ ExtraPrefixB = GetMass(self.DB[SpeciesB.DBPos:SpeciesA.DBPos])
+ else:
+ ExtraPrefixB = 0
+ if SpeciesA.DBEnd > SpeciesB.DBEnd:
+ ExtraSuffixA = GetMass(self.DB[SpeciesB.DBEnd:SpeciesA.DBEnd])
+ else:
+ ExtraSuffixA = 0
+ if SpeciesB.DBEnd > SpeciesA.DBEnd:
+ ExtraSuffixB = GetMass(self.DB[SpeciesA.DBEnd:SpeciesB.DBEnd])
+ else:
+ ExtraSuffixB = 0
+ # VERBOSE:
+ for DBPos in SpeciesA.Prefixes.keys():
+ PMassA = SpeciesA.Prefixes[DBPos] + ExtraPrefixA
+ PMassB = SpeciesB.Prefixes.get(DBPos, -9999) - ExtraPrefixB
+ SMassA = SpeciesA.Suffixes[DBPos] + ExtraSuffixA
+ SMassB = SpeciesB.Suffixes.get(DBPos, -9999) - ExtraSuffixB
+ #print "DBPos %s: PreA %s PreB %s (%s)"%(DBPos, PMassA, PMassB, abs(PMassA-PMassB))
+ #print " PostA %s PostB %s (%s)"%(SMassA, SMassB, abs(SMassA-SMassB))
+ if abs(PMassA - PMassB) >= self.CompatibilityTolerance:
+ continue
+ if abs(SMassA - SMassB) >= self.CompatibilityTolerance:
+ continue
+ SamePrefixSuffix = 1
+ break
+ if not SamePrefixSuffix:
+ # irreconcilable, move on to try the next species:
+ continue
+ self.ReconcileDetailOutput.write("%s\t%s\t%s\t%s\t\n"%(SpeciesA.ModDBPos, SpeciesB.ModDBPos, SpeciesA.ModMass, SpeciesB.ModMass))
+ self.ReconcileDetailOutput.write("%s\t%s\t%s\t\t%s\t%s\t%s\t\n"%(SpeciesA.Annotation, SpeciesA.DBPos, SpeciesA.DBEnd, SpeciesB.Annotation, SpeciesB.DBPos, SpeciesB.DBEnd))
+ # Species A *could* be reconciled with B.
+ Result = self.AttemptReconcile(SpeciesA, SpeciesB)
+ SpeciesB.FreeCachedSpectra()
+ if Result:
+ # We reconciled to B. Stop now, don't re-reconcile to
+ # another (INFERIOR) species:
+ break
+ SpeciesA.FreeCachedSpectra()
+ def PerformAllMerges(self, PerformMergeFlag = 1):
+ """
+ Workhorse for the merge and reconcile procedure. Double-loop over sites,
+ from high to low scoring. Consider either MERGING or RECONCILING the
+ low-scoring site to match the high-scoring site.
+ """
+ self.HeaderLines = []
+ self.ModTypeSpectrumCount = {}
+ self.ModTypeSiteCount = {}
+ # A list of peptides within our 'window'. At each iteration,
+ # we read peptides until we hit eof, hit a new protein-name, or hit a peptide 2500 characters
+ # past the first one. Then we attempt reconciliation/merging for peptides
+ # which are "covered" by the window. Then we advance the window.
+ self.WindowPeptides = []
+ EOFFlag = 0
+ WroteHeaderFlag = 0
+ NextProteinFirstPeptide = None
+ while 1:
+ if NextProteinFirstPeptide:
+ self.WindowPeptides = [NextProteinFirstPeptide]
+ NextProteinFirstPeptide = None
+ if not len(self.WindowPeptides):
+ WindowStart = 0
+ WindowEnd = 0
+ CurrentProteinName = None
+ if EOFFlag:
+ break
+ else:
+ WindowStart = self.WindowPeptides[0].DBPos
+ WindowEnd = self.WindowPeptides[-1].DBEnd
+ CurrentProteinName = self.WindowPeptides[0].ProteinName
+ ###############################################################
+ # Parse some more peptides:
+ while (not EOFFlag) and (WindowEnd < WindowStart + self.MaxPeptideWindowWidth):
+ FileLine = self.InputFile.readline()
+ if not FileLine:
+ EOFFlag = 1
+ break
+ if FileLine[0] == "#":
+ self.HeaderLines.append(FileLine)
+ continue # skip comment line
+ FileLine = FileLine.replace("\r", "").replace("\n", "")
+ if not FileLine.strip():
+ continue # skip blank line
+ Bits = FileLine.split("\t")
+ try:
+ Species = PeptideSpeciesClass()
+ Species.ParseBits(Bits)
+ Species.ProteinName = Bits[FormatBits.ProteinName]
+ except:
+ traceback.print_exc()
+ print Bits
+ continue
+ # SKIP the species if it falls outside our block:
+ if self.DBStart != None and Species.DBPos < self.DBStart:
+ continue
+ if self.DBEnd != None and Species.DBPos >= self.DBEnd:
+ continue
+ # If the species comes from a NEW protein, finish the window and save the new species
+ # for next iteration:
+ #print "CurrentProtein '%s', new species protein '%s'"%(CurrentProteinName, Species.ProteinName)
+ if CurrentProteinName == None:
+ # We have no current-protein. Start the list:
+ CurrentProteinName = Species.ProteinName
+ WindowStart = Species.DBPos
+ else:
+ # Check whether this species matches the current protein:
+ if Species.ProteinName != CurrentProteinName:
+ NextProteinFirstPeptide = Species
+ break
+ self.WindowPeptides.append(Species)
+ WindowStart = min(WindowStart, Species.DBPos)
+ WindowEnd = max(WindowEnd, Species.DBEnd)
+ ###############################################################
+ print "->Handling %s peptides in range %s...%s\n %s"%(len(self.WindowPeptides), WindowStart, WindowEnd, CurrentProteinName)
+ self.PerformMergeReconcileOnWindow(PerformMergeFlag)
+ ###############################################################
+ # Re-sort the window peptides, so that sites fall together:
+ SortedList = []
+ for Peptide in self.WindowPeptides:
+ SortedList.append((Peptide.ModDBPos, Peptide.ModMass, Peptide))
+ SortedList.sort()
+ self.WindowPeptides = []
+ for Tuple in SortedList:
+ self.WindowPeptides.append(Tuple[-1])
+ ###############################################################
+ # Write file header now, if we haven't:
+ if not WroteHeaderFlag:
+ for HeaderLine in self.HeaderLines:
+ self.OutputFile.write(HeaderLine)
+ WroteHeaderFlag = 1
+ ###############################################################
+ # Write out, and free, peptides in (the early portion of) the window:
+ PeptideIndex = 0
+ while PeptideIndex < len(self.WindowPeptides):
+ Species = self.WindowPeptides[PeptideIndex]
+ ###print " Species %s of %s: %s-%s (window %s-%s)"%(PeptideIndex, len(self.WindowPeptides), Species.DBPos, Species.DBEnd, WindowStart, WindowEnd)
+ if Species.DBEnd > WindowEnd - 100 and (not EOFFlag and not NextProteinFirstPeptide):
+ ###print " ->Leave it in the window"
+ PeptideIndex += 1
+ continue
+ # This peptide can be dropped from the list.
+ if Species.MergedFlag:
+ pass
+ else:
+ # Update Species.Bits to reflect changes to Species.Features:
+ for Index in range(FormatBits.FirstFeature, FormatBits.LastFeature + 1):
+ Species.Bits[Index] = str(Species.Features[Index - FormatBits.FirstFeature])
+ String = string.join(Species.Bits, "\t")
+ self.OutputFile.write(String + "\n")
+ ModTypeKey = (Species.ModAA, Species.ModMass)
+ # Note NOW the number of sites and spectra for each modification-type.
+ self.ModTypeSiteCount[ModTypeKey] = self.ModTypeSiteCount.get(ModTypeKey, 0) + 1
+ self.ModTypeSpectrumCount[ModTypeKey] = self.ModTypeSpectrumCount.get(ModTypeKey, 0) + Species.Features[FeatureBits.SpectrumCount]
+ ###print " ->Drop it from the window"
+ del self.WindowPeptides[PeptideIndex]
+ def GetSiteScore(self, Site):
+ Site.PValue = 1.0
+ SortedSpeciesScores = []
+ for Species in Site.SpeciesList:
+ SortedSpeciesScores.append(Species.ModelScore)
+ if Species.Charge > 2:
+ Species.PValue = self.Model3.GetPValue(Species.ModelScore)
+ else:
+ Species.PValue = self.Model2.GetPValue(Species.ModelScore)
+ Site.PValue *= Species.PValue
+ SortedSpeciesScores.sort()
+ SortedSpeciesScores.reverse()
+ SiteScore = [-math.log(Site.PValue)]
+ SiteScore.extend(SortedSpeciesScores)
+ return SiteScore
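+    # For example (illustrative numbers): a site with two species whose model
+    # p-values are 0.01 and 0.05 gets Site.PValue = 5e-4, so the first element of
+    # SiteScore is -ln(5e-4), about 7.6, followed by the species' model scores
+    # sorted from best to worst.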
+ def LoadKnownModifications(self):
+ """
+ Parse the KnownPTM file, to get a list of chemical events which we
+ already have a (hypothetical) annotation for.
+ """
+ if not self.KnownChemistryFileName:
+ return
+ self.KnownPTMs = []
+ self.KnownPTMByMass = {}
+ File = open(self.KnownChemistryFileName, "rb")
+ # Load one PTM from each line of the file.
+ # Line format:
+ # Mass, name, AllowedResidues, Terminus, AllowedPrefix, AllowedSuffix
+ # Example: -17, pyroglutamate formation, Q, N, "", ""
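+        # A few more illustrative lines in the same column layout (typical
+        # chemistries, shown only to document the format):
+        #   16, oxidation, MW, "", "", ""
+        #   1, deamidation, NQ, "", "", ""
+        #   42, acetylation, *, N, "", ""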
+ LineNumber = 0
+ for FileLine in File.xreadlines():
+ LineNumber += 1
+ Bits = list(FileLine.strip().split("\t"))
+ if len(Bits) == 1:
+ continue
+ if FileLine[0] == "#":
+ continue
+ while len(Bits)<6:
+ Bits.append("")
+ try:
+ PTM = AnticipatedChemistry()
+ PTM.Mass = int(Bits[0])
+ except:
+ print "** Skipping invalid line %s of %s"%(LineNumber, self.KnownChemistryFileName)
+ print Bits
+ continue
+ PTM.Name = Bits[1]
+ if len(Bits[2]) == 0 or Bits[2][0] == "*":
+ PTM.AllowedResidues = None
+ else:
+ PTM.AllowedResidues = Bits[2]
+ if len(Bits[3].strip()):
+ PTM.Terminus = Bits[3].strip()
+ if len(Bits[4].strip()):
+ PTM.AllowedPrefix = Bits[4].strip()
+ if len(Bits[5].strip()):
+ PTM.AllowedSuffix = Bits[5].strip()
+ self.KnownPTMs.append(PTM)
+ if not self.KnownPTMByMass.has_key(PTM.Mass):
+ self.KnownPTMByMass[PTM.Mass] = []
+ self.KnownPTMByMass[PTM.Mass].append(PTM)
+ File.close()
+ print "Loaded %s known PTMs from '%s'."%(len(self.KnownPTMs), self.KnownChemistryFileName)
+ def AttemptKnownPTM(self, Site):
+ InitialScore = self.GetSiteScore(Site)
+ # Initialize known-ptm information for each species:
+ for Species in Site.SpeciesList:
+ Species.BestKnownPTMAnnotation = ""
+ Species.BestKnownPTMAnnotationName = ""
+ Species.BestKnownPTMAnnotationPValue = ""
+ Species.BestKnownPTMAnnotationScore = ""
+ Species.BestKnownPTMAnnotationSitePValue = ""
+ # Loop over species to find the allowed database residues.
+ ResidueCounts = {}
+ ResidueInitialCounts = {}
+ for Species in Site.SpeciesList:
+ for Pos in range(Species.DBPos, Species.DBPos + len(Species.Peptide.Aminos)):
+ ResidueCounts[Pos] = ResidueCounts.get(Pos, 0) + 1
+ ResidueInitialCounts[Species.DBPos] = ResidueInitialCounts.get(Species.DBPos, 0) + 1
+ BestEditedSiteScore = None
+ self.KPTMVerbose.write("\n===============================\n")
+ self.KPTMVerbose.write("Site %s initial score (%.3f, %.3f)\n"%(Site, InitialScore[0], InitialScore[1]))
+ Residues = ResidueCounts.items()
+ FirstCoveredResidue = min(ResidueCounts.keys())
+ LastCoveredResidue = max(ResidueCounts.keys())
+ ###############################################################################
+ # Consider shifting the modification to any (legal) residue, with any legal mass:
+ ModMass = Site.SpeciesList[0].ModMass
+        # Decide which endpoint-shifts we'll consider.
+ # If the peptides don't all share the same N-terminus, then shifting the
+ # N-terminus isn't allowed. (That would perforce SPLIT this site!)
+ Shifts = [None]
+ if ResidueCounts[FirstCoveredResidue] == len(Site.SpeciesList):
+ Shifts.extend(["N+1", "N+2"])
+ if FirstCoveredResidue > 0:
+ Shifts.append("N-1")
+ if FirstCoveredResidue > 1:
+ Shifts.append("N-2")
+ if ResidueCounts[LastCoveredResidue] == len(Site.SpeciesList):
+ Shifts.extend(["C-2", "C-1"])
+ if LastCoveredResidue < len(self.DB) - 1:
+ Shifts.append("C+1")
+ if LastCoveredResidue < len(self.DB) - 2:
+ Shifts.append("C+2")
+ for Shift in Shifts:
+            # CoreMass starts as the modification mass and is then adjusted for
+            # any endpoint shifts:
+ CoreMass = ModMass
+ # AllowedDBList is a list of the database positions where the ptm could
+ # be attached. We've already selected the range the peptide will cover,
+ # but the PTM could fall on various residues:
+ AllowedDBList = list(range(FirstCoveredResidue, LastCoveredResidue + 1))
+ if Shift in ("N-1", "N-2"):
+ AllowedDBList.append(FirstCoveredResidue - 1)
+ DropMass = Global.AminoMass.get(self.DB[FirstCoveredResidue - 1], None)
+ if not DropMass:
+ continue
+ CoreMass -= DropMass
+ if Shift == "N-2":
+ AllowedDBList.append(FirstCoveredResidue - 2)
+ DropMass = Global.AminoMass.get(self.DB[FirstCoveredResidue - 2], None)
+ if not DropMass:
+ continue
+ CoreMass -= DropMass
+ if Shift in ("N+1", "N+2"):
+ AllowedDBList.remove(FirstCoveredResidue)
+ CoreMass += Global.AminoMass.get(self.DB[FirstCoveredResidue], None)
+ if Shift == "N+2":
+ AllowedDBList.remove(FirstCoveredResidue + 1)
+ CoreMass += Global.AminoMass.get(self.DB[FirstCoveredResidue + 1], None)
+ if Shift in ("C+1", "C+2"):
+ AllowedDBList.append(LastCoveredResidue + 1)
+ DropMass = Global.AminoMass.get(self.DB[LastCoveredResidue + 1], None)
+ if not DropMass:
+ continue
+ CoreMass -= DropMass
+ if Shift == "C+2":
+ AllowedDBList.append(LastCoveredResidue + 2)
+ DropMass = Global.AminoMass.get(self.DB[LastCoveredResidue + 2], None)
+ if not DropMass:
+ continue
+ CoreMass -= DropMass
+ if Shift in ("C-1", "C-2"):
+ AllowedDBList.remove(LastCoveredResidue)
+ CoreMass += Global.AminoMass.get(self.DB[LastCoveredResidue], None)
+ if Shift == "C-2":
+ AllowedDBList.remove(LastCoveredResidue - 1)
+ CoreMass += Global.AminoMass.get(self.DB[LastCoveredResidue - 1], None)
+ if CoreMass < -250 or CoreMass > 250:
+ continue
+ CoreMass = int(round(CoreMass))
+ ShiftablePeptides = []
+ for DBPos in AllowedDBList:
+ TryMassList = (CoreMass - 2, CoreMass - 1, CoreMass, CoreMass + 1, CoreMass + 2)
+ for NearMass in TryMassList:
+ KnownPTMList = self.KnownPTMByMass.get(NearMass, [])
+ for KnownPTM in KnownPTMList:
+ # Determine whether this is a legal PTM placement.
+ # The amino acid type must be valid:
+ if KnownPTM.AllowedResidues != None and self.DB[DBPos] not in KnownPTM.AllowedResidues:
+ continue
+ # The terminus must be valid (for at least one peptide species):
+ if KnownPTM.Terminus in ("N", "^"):
+ if DBPos > FirstCoveredResidue:
+ continue
+ # The prefix and suffix residues must be valid:
+ if DBPos:
+ PrefixAA = self.DB[DBPos - 1]
+ else:
+ PrefixAA = "-"
+ if DBPos < len(self.DB) - 1:
+ SuffixAA = self.DB[DBPos + 1]
+ else:
+ SuffixAA = "-"
+ if KnownPTM.AllowedPrefix != None and PrefixAA not in KnownPTM.AllowedPrefix:
+ continue
+ if KnownPTM.AllowedSuffix != None and SuffixAA not in KnownPTM.AllowedSuffix:
+ continue
+ ############################################################
+ # Okay, this is a LEGAL placement. Determine its score:
+ Score = self.TryShiftedSite(Site, NearMass, DBPos, KnownPTM, Shift)
+ if Score > BestEditedSiteScore:
+ BestEditedSiteScore = Score
+ self.RememberOptimalKnownPTM(Site)
+ # Now, for this shift, let's try no modification at all...if
+ # our mass is not too large. Many spectra, especially from LTQ
+ # data-sets, have spurious +6 PTMs:
+ if abs(CoreMass) < 10.0:
+ Score = self.TryShiftedSite(Site, 0, DBPos, None, Shift)
+ if Score > BestEditedSiteScore:
+ BestEditedSiteScore = Score
+ self.RememberOptimalKnownPTM(Site)
+ ###############################################
+ # Loop over modification SHIFTS, POSITIONS and MASSES is now complete.
+ # Clean up memory usage:
+ for Species in Site.SpeciesList:
+ Species.FreeCachedSpectra()
+ # Edit the species bits, for output:
+ for Species in Site.SpeciesList:
+ while len(Species.Bits) <= FormatBits.KnownPTMSitePValue:
+ Species.Bits.append("")
+ Species.Bits[FormatBits.KnownPTMName] = Species.BestKnownPTMAnnotationName
+ Species.Bits[FormatBits.KnownPTMAnnotation] = Species.BestKnownPTMAnnotation
+ Species.Bits[FormatBits.KnownPTMScore] = str(Species.BestKnownPTMAnnotationScore)
+ Species.Bits[FormatBits.KnownPTMSitePValue] = str(Species.BestKnownPTMAnnotationSitePValue)
+ # Verbose output:
+ if BestEditedSiteScore:
+ # Let stdout know what we're up to:
+ Species = Site.SpeciesList[0]
+ self.KPTMVerbose.write("Result: PValue %.3f (versus %.3f)\n"%(Species.BestKnownPTMAnnotationSitePValue, Site.PValue))
+ ScoreDiff = -math.log(Species.BestKnownPTMAnnotationSitePValue) + math.log(Site.PValue)
+ self.KPTMVerbose.write("==>Score change: %s\n"%ScoreDiff)
+ for Species in Site.SpeciesList:
+ self.KPTMVerbose.write(" %s (original)\n"%Species.Annotation)
+ self.KPTMVerbose.write(" %s (%s)\n"%(Species.BestKnownPTMAnnotation, Species.BestKnownPTMAnnotationName))
+ self.KPTMVerbose.write("Score %s (vs %s)\n"%(Species.BestKnownPTMAnnotationScore, Species.ModelScore))
+ ###############################################
+ # And now, we can output the site:
+ for Species in Site.SpeciesList:
+ Str = string.join(Species.Bits, "\t")
+ self.OutputFile.write(Str + "\n")
+ def TryShiftedSite(self, Site, ModMass, ModDBPos, KnownPTM, Shift):
+ """
+ Try shifting this modification site to the specified database position
+ and mass. Consider the effects on the peptide-score of each peptide.
+ Return the resulting site-score.
+ The value of Shift can be None, N-1, N-2, N+1, N+2, C-1, C-2, C+1, C+2.
+ """
+ SitePValue = 1.0
+ SortedSpeciesScores = []
+ #print "try shift to %+d on %s%s shift %s (%s)"%(ModMass, self.DB[ModDBPos], ModDBPos, Shift, KnownPTM.Name)
+ for Species in Site.SpeciesList:
+ if Species.Charge > 2:
+ Model = self.Model3
+ else:
+ Model = self.Model2
+ # Default "null" values:
+ Species.KnownPTMAnnotation = ""
+ Species.KnownPTMAnnotationScore = ""
+ Species.KnownPTMAnnotationPValue = ""
+ Species.KnownPTMAnnotationName = ""
+ DBStart = Species.DBPos
+ DBEnd = Species.DBEnd
+ if Shift == "N-1":
+ DBStart -= 1
+ elif Shift == "N-2":
+ DBStart -= 2
+ elif Shift == "N+1":
+ DBStart += 1
+ elif Shift == "N+2":
+ DBStart += 2
+ elif Shift == "C-1":
+ DBEnd -= 1
+ elif Shift == "C-2":
+ DBEnd -= 2
+ elif Shift == "C+1":
+ DBEnd += 1
+ elif Shift == "C+2":
+ DBEnd += 2
+ if DBEnd <= ModDBPos or DBStart > ModDBPos:
+ continue
+ if KnownPTM and KnownPTM.Terminus in ("N", "^") and ModDBPos != DBStart:
+ continue
+ EditedFeatures = Species.Features[:]
+ BestSpectrum = Species.GetBestSpectrum(self)
+ NewAminos = self.DB[DBStart:DBEnd]
+ NewPrefix = self.DB[DBStart - 1]
+ NewSuffix = self.DB[DBEnd]
+ ModPos = ModDBPos - DBStart
+ if ModMass == 0:
+ NewAnnotation = NewAminos
+ else:
+ NewAnnotation = "%s%+d%s"%(NewAminos[:ModPos + 1], ModMass, NewAminos[ModPos + 1:])
+ NewAnnotation = "%s.%s.%s"%(NewPrefix, NewAnnotation, NewSuffix)
+ if NewAnnotation == Species.Annotation:
+ # Shortcut - the annotation hasn't changed, so neither will the score!
+ if Species.Charge > 2:
+ PValue = self.Model3.GetPValue(Species.ModelScore)
+ else:
+ PValue = self.Model2.GetPValue(Species.ModelScore)
+ SortedSpeciesScores.append(Species.ModelScore)
+ SitePValue *= PValue
+ # Store these temp values; if the site is GOOD then we'll edit the species:
+ Species.KnownPTMAnnotation = NewAnnotation
+ if KnownPTM:
+ Species.KnownPTMAnnotationName = KnownPTM.Name
+ else:
+ Species.KnownPTMAnnotationName = "unmodified"
+ Species.KnownPTMAnnotationScore = Species.ModelScore
+ Species.KnownPTMAnnotationPValue = PValue
+ continue
+ # Best spectrum score:
+ Tuple = BestSpectrum.ScorePeptideDetailed(NewAnnotation)
+ EditedFeatures[FeatureBits.BestMQScore] = Tuple[0]
+ #EditedFeatures[FeatureBits.BestMQScore] = BestSpectrum.ScorePeptide(NewAnnotation)
+ # Delta-score:
+ ScoreDiff = EditedFeatures[FeatureBits.BestMQScore] - Species.Features[FeatureBits.BestMQScore]
+ EditedFeatures[FeatureBits.BestDeltaScore] += ScoreDiff
+ # Consensus spectrum score:
+ #print Species
+ ConsensusSpectrum = Species.GetConsensusSpectrum(self)
+ ScoreInfo = ConsensusSpectrum.ScorePeptideDetailed(NewAnnotation)
+ EditedFeatures[FeatureBits.ConsensusMQScore] = ScoreInfo[0]
+ ScoreDiff = EditedFeatures[FeatureBits.ConsensusMQScore] - Species.Features[FeatureBits.ConsensusMQScore]
+ EditedFeatures[FeatureBits.DeltaVsBigDB] += ScoreDiff
+ EditedFeatures[FeatureBits.PeptideLength] = ScoreInfo[1]
+ EditedFeatures[FeatureBits.TotalCutScore] = ScoreInfo[2]
+ EditedFeatures[FeatureBits.MedianCutScore] = ScoreInfo[3]
+ EditedFeatures[FeatureBits.YPresent] = ScoreInfo[4]
+ EditedFeatures[FeatureBits.BPresent] = ScoreInfo[5]
+ EditedFeatures[FeatureBits.BYIntensity] = ScoreInfo[6]
+ EditedFeatures[FeatureBits.NTT] = ScoreInfo[7]
+## EditedFeatures[FeatureBits.PRMScore] = ScoreInfo[1]
+## EditedFeatures[FeatureBits.BYPresence] = ScoreInfo[2]
+## EditedFeatures[FeatureBits.TopPeakExplanation] = ScoreInfo[3]
+## EditedFeatures[FeatureBits.NTT] = ScoreInfo[4]
+ ModTypeKey = (self.DB[ModDBPos], ModMass)
+ EditedFeatures[FeatureBits.SpectraThisModType] = self.ModTypeSpectrumCount.get(ModTypeKey, 1)
+ EditedFeatures[FeatureBits.SitesThisModType] = self.ModTypeSiteCount.get(ModTypeKey, 1)
+ EditedFeatures[FeatureBits.LogSpecThisType] = math.log(EditedFeatures[FeatureBits.SpectraThisModType])
+ EditedFeatures[FeatureBits.LogSitesThisType] = math.log(EditedFeatures[FeatureBits.SitesThisModType])
+            # Spectral similarity:
+ try:
+ SisterAnnotationFlag = int(Species.Bits[FormatBits.SisterAnnotationFlag])
+ except:
+ SisterAnnotationFlag = 0
+ if SisterAnnotationFlag:
+ try:
+ ConsensusMSSpectrum = Species.GetConsensusMSSpectrum(self)
+ ModlessMSSpectrum = Species.GetConsensusModlessMSSpectrum(self)
+ Comparator = SpectralSimilarity.SpectralSimilarity(ConsensusMSSpectrum,
+ ModlessMSSpectrum, NewAnnotation, Species.ModlessAnnotation)
+ # COPIED from ComputePTMFeatures:
+ Comparator.LabelPeaks(0.5)
+ Similarity = Comparator.DotProduct(0.5)
+ EditedFeatures[FeatureBits.Dot] = Similarity
+ Similarity = Comparator.GetSharedPeakCount(0, 1)
+ EditedFeatures[FeatureBits.Shared01] = Similarity
+ Similarity = Comparator.GetSharedPeakCount(1, 1)
+ EditedFeatures[FeatureBits.Shared11] = Similarity
+ CorrelationCoefficient = Comparator.ComputeCorrelationCoefficient(1.0)
+                    EditedFeatures[FeatureBits.Correlation] = CorrelationCoefficient
+ except:
+ traceback.print_exc()
+ print "*** Unable to generate spectral-similarity features; continuing."
+ print Site, ModMass, ModDBPos, KnownPTM, Shift
+ print Species.Annotation, NewAnnotation
+ print "BITS:", Species.Bits
+ print "SisterAnnotationFlag:", SisterAnnotationFlag
+ # Features are complete - score the altered peptide!
+ ModelScore = self.ScoreInstance(Model, EditedFeatures)
+## # TEMP: Verbose output
+## self.KPTMVerbose.write("%s -> %s\n"%(Species.Annotation, NewAnnotation))
+## for Index in range(len(EditedFeatures)):
+## self.KPTMVerbose.write("%s: %.2f %.2f (%.4f)\n"%(Index, Species.Features[Index], EditedFeatures[Index], EditedFeatures[Index] - Species.Features[Index]))
+## self.KPTMVerbose.write("Score: %s versus old %s\n"%(ModelScore, Species.ModelScore))
+ PValue = Model.GetPValue(ModelScore)
+ SortedSpeciesScores.append(ModelScore)
+ SitePValue *= PValue
+ # Store these temp values; if the site is GOOD then we'll edit the species:
+ Species.KnownPTMAnnotation = NewAnnotation
+ if KnownPTM:
+ Species.KnownPTMAnnotationName = KnownPTM.Name
+ else:
+ Species.KnownPTMAnnotationName = "Unmodified"
+ Species.KnownPTMAnnotationScore = ModelScore
+ Species.KnownPTMAnnotationPValue = PValue
+ SortedSpeciesScores.sort()
+ SortedSpeciesScores.reverse()
+ for Species in Site.SpeciesList:
+ Species.KnownPTMAnnotationSitePValue = SitePValue
+ SiteScore = [-math.log(SitePValue)]
+ SiteScore.extend(SortedSpeciesScores)
+ if len(SortedSpeciesScores):
+ self.KPTMVerbose.write("%s%+d on %s (%s): score (%.3f, %.3f)\n"%(self.DB[ModDBPos], ModMass, ModDBPos, Shift, SiteScore[0], SiteScore[1]))
+ return SiteScore
+ def ScoreInstance(self, Model, Features):
+ NiceFeatures = []
+ #for FeatureIndex in TrainPTMFeatures.ValidFeatureIndices:
+ # NiceFeatures.append(Features[FeatureIndex])
+ return Model.ScoreInstance(Features)
+ def RememberOptimalKnownPTM(self, Site):
+ for Species in Site.SpeciesList:
+ Species.BestKnownPTMAnnotation = Species.KnownPTMAnnotation
+ Species.BestKnownPTMAnnotationScore = Species.KnownPTMAnnotationScore
+ Species.BestKnownPTMAnnotationPValue = Species.KnownPTMAnnotationPValue
+ Species.BestKnownPTMAnnotationSitePValue = Species.KnownPTMAnnotationSitePValue
+ Species.BestKnownPTMAnnotationName = Species.KnownPTMAnnotationName
+ def LoadCoverageLevels(self):
+ "Load peptide coverage levels, written by ComputePTMFeatures"
+ CoveragePath = os.path.join(self.TempFileDir, "Coverage.dat")
+ try:
+ CoverageFile = open(CoveragePath, "rb")
+ except:
+ traceback.print_exc()
+ print "** WARNING: Coverage levels not found at '%s'"%CoveragePath
+ self.Coverage = [1] * len(self.DB)
+ self.ModCoverage = [1] * len(self.DB)
+ return
+ self.Coverage = []
+ self.ModCoverage = []
+ BlokSize = struct.calcsize("<II")
+ for DBPos in range(len(self.DB)):
+ Blok = CoverageFile.read(BlokSize)
+ Tuple = struct.unpack("<II", Blok)
+ self.Coverage.append(Tuple[0])
+ self.ModCoverage.append(Tuple[1])
+ CoverageFile.close()
+ def SaveCoverageLevels(self):
+ "Save peptide coverage levels, which may have changed during merge+reconcile"
+ Dir = self.TempFileDir #os.path.split(self.OutputFileName)[0]
+ if self.DBEnd != None:
+ CoveragePath = os.path.join(Dir, "AdjustedCoverage.%s.%s.dat"%(self.DBStart, self.DBEnd))
+ else:
+ CoveragePath = os.path.join(Dir, "AdjustedCoverage.dat")
+ CoverageFile = open(CoveragePath, "wb")
+ for DBPos in range(len(self.DB)):
+ if self.Coverage[DBPos] < 0 or self.Coverage[DBPos] >= 65535:
+ print "* Coverage of %s is %s"%(DBPos, self.Coverage[DBPos])
+ if self.ModCoverage[DBPos] < 0 or self.ModCoverage[DBPos] >= 65535:
+ print "* ModCoverage of %s is %s"%(DBPos, self.ModCoverage[DBPos])
+ Str = struct.pack("<II", self.Coverage[DBPos], self.ModCoverage[DBPos])
+ CoverageFile.write(Str)
+ CoverageFile.close()
+ def MergeAndReconcile(self):
+ """
+ Iterate over peptide species, from best to worst. Two iterations: In the first, we
+ consider MERGING a peptide with a superior one. In the second iteration, we consider
+ RECONCILING each species to a superior one.
+ """
+ self.LoadCoverageLevels()
+ #self.ParseOriginalSpectraForModType(self.InputFileName)
+ if self.DBEnd != None:
+ MergeFileName = "MergeDetails.%s.%s.txt"%(self.DBStart, self.DBEnd)
+ ReconcileFileName = "ReconcileDetails.%s.%s.txt"%(self.DBStart, self.DBEnd)
+ else:
+ MergeFileName = "MergeDetails.txt"
+ ReconcileFileName = "ReconcileDetails.txt"
+ OutputDir = os.path.split(self.OutputFileName)[0]
+ self.MergeDetailOutput = open(os.path.join(OutputDir, MergeFileName), "wb")
+ Header = "Flag\tCharge\tMaster\tServant\tMasterMQScore\tServantMQScore\tServantRescore\tScoreChange\tNewConsScore\tOldModelScore\tNewModelScore\t"
+ self.MergeDetailOutput.write(Header + "\n")
+ self.ReconcileDetailOutput = open(os.path.join(OutputDir, ReconcileFileName), "wb")
+ # Flow for file lines:
+ # Initial input file -> MergeTemp -> ReconcileTemp -> output file
+ ########################################################
+ # FIRST cycle through points: Consider merging.
+ self.InputFile = open(self.InputFileName, "rb")
+ MergeTempPath = "%s.merge"%self.OutputFileName
+ self.OutputFile = open(MergeTempPath, "wb")
+ print ">>>PerformAllMerges 1: Read from %s, write to %s"%(self.InputFileName, MergeTempPath)
+ self.PerformAllMerges(1)
+ self.InputFile.close()
+ self.OutputFile.close()
+ ########################################################
+ # SECOND cycle through points: Consider conforming.
+ print "\n\n-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
+ print "Reconcile:"
+ self.InputFile = open(MergeTempPath, "rb")
+ ConformTempPath = "%s.conform"%self.OutputFileName
+ self.OutputFile = open(ConformTempPath, "wb")
+ print ">>>PerformAllMerges 0: Read from %s, write to %s"%(MergeTempPath, ConformTempPath)
+ self.PerformAllMerges(0)
+ self.InputFile.close()
+ self.OutputFile.close()
+ self.SaveCoverageLevels()
+ if self.DBStart != None:
+ # We're handling just one block. Therefore, we shouldn't re-score the peptides:
+ return
+ ########################################################
+ # At this point, we know the number of sites and spectra for each
+ # modification type. We need that information for when we
+ # consider changing to known PTMs. Let's pickle it.
+ self.SaveSitesByType()
+ ########################################################
+ # THIRD cycle through points: Update sites-per-mod and spectra-per-mod,
+ # and accumulate feature vectors for the model.
+ #self.InputFile = open(ConformTempPath, "rb")
+ self.ParseFeatureVectors(ConformTempPath)
+ #self.InputFile.close()
+ ########################################################
+ # FOURTH cycle through points: Write the revised score!
+ self.OutputFile = open(self.OutputFileName, "wb")
+ for HeaderLine in self.HeaderLines:
+ self.OutputFile.write(HeaderLine)
+ self.ProcessSites(ConformTempPath, "rescore")
+ def MergeBlockRuns(self):
+ """
+ Combine the output of several AdjustPTM runs for sub-blocks of the database.
+ """
+ Directory = os.path.split(self.OutputFileName)[0]
+ self.CombineBlockCoverage(self.TempFileDir)
+ self.SaveCoverageLevels()
+ # Populate self.ModTypeSpectrumCount and self.ModTypeSiteCount:
+ self.LoadModSitesByTypeBlocks(Directory)
+ self.SaveSitesByType()
+ # Concatenate block files into one large file:
+ ConcatenatedFileName = os.path.join(Directory, "ConcatenatedFeatures.txt")
+ self.ConcatenateBlockOutputFiles(Directory, ConcatenatedFileName)
+ # Parse feature-vectors, train model, output model:
+ self.ParseFeatureVectors(ConcatenatedFileName)
+ # Rescore:
+ self.OutputFile = open(self.OutputFileName, "wb")
+ for HeaderLine in self.HeaderLines:
+ self.OutputFile.write(HeaderLine)
+ self.ProcessSites(ConcatenatedFileName, "rescore")
+ self.OutputFile.close()
+ def ConcatenateBlockOutputFiles(self, Directory, OutputFileName):
+ """
+ Concatenate the merge-and-reconcile output files from various blocks of the database.
+ """
+ OutputFile = open(OutputFileName, "wb")
+ FirstFileFlag = 1
+ for FileName in os.listdir(Directory):
+ (Stub, Extension) = os.path.splitext(FileName)
+ if Extension != ".conform":
+ continue
+ FilePath = os.path.join(Directory, FileName)
+ File = open(FilePath, "rb")
+ for FileLine in File.xreadlines():
+ if FileLine[0] == "#":
+ # Header line. Write it out iff this is the first file
+ if FirstFileFlag:
+ OutputFile.write(FileLine)
+ else:
+ OutputFile.write(FileLine)
+ File.close()
+ FirstFileFlag = 0
+ print "Concatenated results from '%s' into '%s'"%(Directory, OutputFileName)
+ def LoadModSitesByTypeBlocks(self, Directory):
+ """
+ Iterate over block output files from this directory. Populate
+ self.ModTypeSiteCount and self.ModTypeSpectrumCount based on the contents.
+ """
+ self.ModTypeSpectrumCount = {}
+ self.ModTypeSiteCount = {}
+ for FileName in os.listdir(Directory):
+ (Stub, Extension) = os.path.splitext(FileName)
+ if Extension != ".conform":
+ continue
+ Path = os.path.join(Directory, FileName)
+ File = open(Path, "rb")
+ print "Read SitesByType from %s..."%Path
+ for FileLine in File.xreadlines():
+ Bits = FileLine.split("\t")
+ if FileLine[0] == "#":
+ continue
+ AA = Bits[FormatBits.ModifiedAA]
+ Mass = int(Bits[FormatBits.ModificationMass])
+ Spectra = int(float(Bits[FormatBits.SpectrumCount]))
+ Key = (AA, Mass)
+ self.ModTypeSiteCount[Key] = self.ModTypeSiteCount.get(Key, 0) + 1
+ self.ModTypeSpectrumCount[Key] = self.ModTypeSpectrumCount.get(Key, 0) + Spectra
+ def CombineBlockCoverage(self, Directory):
+ # Load the original coverage, just for reference:
+ self.LoadCoverageLevels()
+ # Iterate over coverage output files from the individual blocks:
+ for FileName in os.listdir(Directory):
+ (Stub, Extension) = os.path.splitext(FileName)
+ if Extension != ".dat":
+ continue
+ Bits = FileName.split(".")
+ # Names have the form AdjustedCoverage.START.END.dat
+ if len(Bits) < 4:
+ continue
+ DBStart = int(Bits[1])
+ DBEnd = int(Bits[2])
+ Path = os.path.join(Directory, FileName)
+ print "Read block coverage from %s..."%Path
+ CoverageFile = open(Path, "rb")
+ BlokSize = struct.calcsize("<II")
+ for DBPos in range(len(self.DB)):
+ Blok = CoverageFile.read(BlokSize)
+ Tuple = struct.unpack("<II", Blok)
+ if DBPos < DBStart or DBPos >= DBEnd:
+ continue
+ self.Coverage[DBPos] = Tuple[0]
+ self.ModCoverage[DBPos] = Tuple[1]
+ CoverageFile.close()
+ def ProcessSites(self, InputFileName, Command):
+ """
+ Parse modification-sites from the input file. Once all the peptides
+ for a site have been read, execute the command.
+ """
+ if Command == "knownptm":
+ if self.KnownPTMVerboseOutputFileName:
+ self.KPTMVerbose = open(self.KnownPTMVerboseOutputFileName, "wb")
+ else:
+ self.KPTMVerbose = sys.stdout
+ CurrentSite = None
+ InputFile = open(InputFileName, "rb")
+ for FileLine in InputFile.xreadlines():
+ FileLine = FileLine.replace("\r", "").replace("\n", "")
+ if not FileLine:
+ continue # skip blank lines
+ Bits = FileLine.split("\t")
+ if FileLine[0] == "#" or len(Bits) < 2:
+ continue
+ try:
+ Species = PeptideSpeciesClass()
+ Species.ParseBits(Bits)
+ except:
+ traceback.print_exc()
+ print Bits
+ continue
+ if CurrentSite == None or Species.ModDBPos != CurrentSite.ModDBPos or Species.ModMass != CurrentSite.ModMass:
+ # Finish the previous (if any) site:
+ if CurrentSite:
+ if Command == "rescore":
+ self.RescoreAndWriteSite(CurrentSite)
+ elif Command == "knownptm":
+ self.AttemptKnownPTM(CurrentSite)
+ CurrentSite = SiteClass()
+ CurrentSite.ModDBPos = Species.ModDBPos
+ CurrentSite.ModMass = Species.ModMass
+ # Add a species to the current site:
+ CurrentSite.SpeciesList.append(Species)
+ InputFile.close()
+ # Finish the last site:
+ if CurrentSite:
+ if Command == "rescore":
+ self.RescoreAndWriteSite(CurrentSite)
+ elif Command == "knownptm":
+ self.AttemptKnownPTM(CurrentSite)
+ def ParseFeatureVectors(self, FileName):
+ """
+ Called after merge and reconcile. Read feature-vectors, updating
+ the spectrum/site counts for modification type, and obtain scores!
+ """
+ FeatureSet2 = Learning.FeatureSetClass()
+ FeatureSet3 = Learning.FeatureSetClass()
+ File = open(FileName, "rb")
+ for FileLine in File.xreadlines():
+ Bits = FileLine.split("\t")
+ if FileLine[0] == "#":
+ continue
+ if len(Bits) < 2:
+ continue
+ Charge = int(Bits[FormatBits.Charge])
+ Vector = Learning.FeatureVector()
+ Vector.Features = []
+ for BitIndex in range(FormatBits.FirstFeature, FormatBits.LastFeature + 1):
+ try:
+                    Vector.Features.append(float(Bits[BitIndex]))
+ except:
+ Vector.Features.append(0)
+ # Tweak spectra-by-type and sites-by-type:
+ ModTypeKey = (Bits[FormatBits.ModifiedAA], int(Bits[FormatBits.ModificationMass]))
+ TotalSpectra = self.ModTypeSpectrumCount.get(ModTypeKey, 0)
+ TotalSites = self.ModTypeSiteCount.get(ModTypeKey, 0)
+ Vector.Features[FeatureBits.SpectraThisModType] = TotalSpectra
+ Vector.Features[FeatureBits.SitesThisModType] = TotalSites
+ print "Total Spectra: %d"%TotalSpectra
+ Vector.Features[FeatureBits.LogSpecThisType] = math.log(TotalSpectra)
+ Vector.Features[FeatureBits.LogSitesThisType] = math.log(TotalSites)
+ if Charge > 2:
+ FeatureSet = FeatureSet3
+ else:
+ FeatureSet = FeatureSet2
+ if int(Bits[FormatBits.TrueProteinFlag]):
+ Vector.TrueFlag = 1
+ FeatureSet.TrueVectors.append(Vector)
+ else:
+ Vector.TrueFlag = 0
+ FeatureSet.FalseVectors.append(Vector)
+ FeatureSet.AllVectors.append(Vector)
+ File.close()
+ FeatureSet2.SetCounts()
+ FeatureSet3.SetCounts()
+ self.Model2.Test(FeatureSet2)
+ self.Model3.Test(FeatureSet3)
+ if self.OutputModelFileName2:
+ self.Model2.SaveModel(self.OutputModelFileName2)
+ self.Model3.SaveModel(self.OutputModelFileName3)
+ def LoadCluster(self, Path):
+ Builder = BuildConsensusSpectrum.ConsensusBuilder()
+ Builder.UnpickleCluster(Path)
+ return Builder
+ def AttemptMerge(self, SpeciesA, SpeciesB, BWeak = 0):
+ """
+        Attempt a merge of SpeciesA into SpeciesB. If the merge is valid,
+ perform the merge, set SpeciesA.MergedFlag, and RETURN TRUE.
+ """
+ print "AttemptMerge chg%s %s into %s"%(SpeciesA.Charge, SpeciesA.Annotation, SpeciesB.Annotation)
+ ################################################
+ # Condition 1: Consensus A seems to match annotation B reasonably well
+ if BWeak:
+ # Species A has the stronger score, so don't screw it up!
+ ScoreLossLimit = 0.1
+ else:
+ ScoreLossLimit = 3
+ SpectrumA = SpeciesA.GetConsensusSpectrum(self)
+ Score = SpectrumA.ScorePeptide(SpeciesB.Annotation)
+ DetailStr = "%s\t%s\t%s\t%s\t%s\t"%(SpeciesA.Charge, SpeciesB.Annotation, SpeciesA.Annotation, SpeciesB.Features[FeatureBits.ConsensusMQScore], SpeciesA.Features[FeatureBits.ConsensusMQScore])
+ DetailStr += "%s\t%s\t"%(Score, SpeciesA.Features[FeatureBits.ConsensusMQScore] - Score)
+ if (SpeciesA.Features[FeatureBits.ConsensusMQScore] - Score) > ScoreLossLimit:
+ DetailStr = "FailAScore\t"+DetailStr
+ self.MergeDetailOutput.write(DetailStr + "\n")
+ return 0
+ ################################################
+ # Condition 2: The merged consensus spectrum is not significantly WORSE.
+ # Load in ClusterA and ClusterB (we cache the species-A cluster)
+ ClusterPathA = os.path.join(self.ConsensusClusterDir, SpeciesA.Annotation[2], "%s.%s.cls"%(SpeciesA.Annotation, SpeciesA.Charge))
+ if ClusterPathA == self.CachedClusterPath:
+ ClusterA = self.CachedCluster
+ else:
+ self.CachedCluster = self.LoadCluster(ClusterPathA)
+ self.CachedClusterPath = ClusterPathA
+ ClusterA = self.CachedCluster
+ ClusterPathB = os.path.join(self.ConsensusClusterDir, SpeciesB.Annotation[2], "%s.%s.cls"%(SpeciesB.Annotation, SpeciesB.Charge))
+ ClusterB = self.LoadCluster(ClusterPathB)
+ # If we combine these two clusters into a single consensus
+ # spectrum, what sort of MQScore do we end up with?
+ TempConsensusPath = os.path.join(self.ConsensusSpectraDir, "Consensus.dta")
+ ClusterB.AssimilateCluster(ClusterA)
+ NewConsensusSpectrum = ClusterB.ProduceConsensusSpectrum()
+ NewConsensusSpectrum.WritePeaks(TempConsensusPath)
+ # Set the file members of the spectrum, since Label.py reads them:
+ NewConsensusSpectrum.FilePath = TempConsensusPath
+ NewConsensusSpectrum.FilePos = 0
+ NewConsensusSpectrum.RankPeaksByIntensity()
+ PySpectrum = PyInspect.Spectrum(TempConsensusPath, 0)
+ ScoreInfo = PySpectrum.ScorePeptideDetailed(SpeciesB.Annotation)
+ DetailStr += "%s\t"%ScoreInfo[0]
+ if SpeciesB.Features[FeatureBits.ConsensusMQScore] - ScoreInfo[0] > 2:
+ DetailStr = "FailBConsensus\t"+DetailStr
+ self.MergeDetailOutput.write(DetailStr + "\n")
+ return 0
+ NewFeatures = SpeciesB.Features[:]
+ # Update feature values for the merged guy:
+ # Spectra:
+ NewFeatures[FeatureBits.SpectrumCount] = SpeciesA.Features[FeatureBits.SpectrumCount] + SpeciesB.Features[FeatureBits.SpectrumCount]
+ NewFeatures[FeatureBits.LogSpectrumCount] = math.log(NewFeatures[FeatureBits.SpectrumCount])
+ NewFeatures[FeatureBits.ModlessSpectrumCount] = SpeciesB.Features[FeatureBits.ModlessSpectrumCount] # unchanged
+ # BestMQ, BestDelta, PeptideCount:
+ BestSpectrumA = SpeciesA.GetBestSpectrum(self)
+ NewBestMQA = BestSpectrumA.ScorePeptide(SpeciesB.Annotation)
+ NewBestDeltaA = SpeciesA.Features[3] + (NewBestMQA - SpeciesA.Features[2])
+ print "A best MQ %.4f (was %.4f) delta %.4f (was %.4f)"%(NewBestMQA, SpeciesA.Features[2],
+ NewBestDeltaA, SpeciesA.Features[3])
+ # Best MQScore:
+ NewFeatures[FeatureBits.BestMQScore] = max(NewBestMQA, SpeciesB.Features[FeatureBits.BestMQScore])
+ # BestDelta:
+ NewFeatures[FeatureBits.BestDeltaScore] = max(NewBestDeltaA, SpeciesB.Features[FeatureBits.BestDeltaScore])
+ # Peptide length:
+ NewFeatures[FeatureBits.PeptideLength] = SpeciesB.Features[FeatureBits.PeptideLength]
+ # Peptide count:
+ NewFeatures[FeatureBits.PeptideCount] = SpeciesB.Features[FeatureBits.PeptideCount]
+ # Consensus scoring (Score, and score components):
+ NewFeatures[FeatureBits.ConsensusMQScore] = ScoreInfo[0]
+ NewFeatures[FeatureBits.PeptideLength] = ScoreInfo[1]
+ NewFeatures[FeatureBits.TotalCutScore] = ScoreInfo[2]
+ NewFeatures[FeatureBits.MedianCutScore] = ScoreInfo[3]
+ NewFeatures[FeatureBits.YPresent] = ScoreInfo[4]
+ NewFeatures[FeatureBits.BPresent] = ScoreInfo[5]
+ NewFeatures[FeatureBits.BYIntensity] = ScoreInfo[6]
+ NewFeatures[FeatureBits.NTT] = ScoreInfo[7]
+ # Adjust delta-score by the difference in consensus-mq-score:
+ NewFeatures[FeatureBits.DeltaVsBigDB] = SpeciesB.Features[FeatureBits.DeltaVsBigDB] + (ScoreInfo[0] - SpeciesB.Features[FeatureBits.ConsensusMQScore])
+ # Similarity scores for the new consensus spectrum:
+ if SpeciesB.Bits[FormatBits.SisterAnnotationFlag]:
+ ModlessSpectrum = SpeciesB.GetConsensusModlessMSSpectrum(self)
+ #print "Build comparator:"
+ Comparator = SpectralSimilarity.SpectralSimilarity(NewConsensusSpectrum,
+ ModlessSpectrum, SpeciesB.Annotation, SpeciesB.ModlessAnnotation)
+ #print "Label peaks:"
+ # COPIED from ComputePTMFeatures:
+ Comparator.LabelPeaks(0.5)
+ #print "Compute:"
+ Similarity = Comparator.DotProduct(0.5)
+ NewFeatures[FeatureBits.Dot] = Similarity
+ Similarity = Comparator.GetSharedPeakCount(0, 1)
+ NewFeatures[FeatureBits.Shared01] = Similarity
+ Similarity = Comparator.GetSharedPeakCount(1, 1)
+ NewFeatures[FeatureBits.Shared11] = Similarity
+ CorrelationCoefficient = Comparator.ComputeCorrelationCoefficient(1.0)
+ NewFeatures[FeatureBits.Correlation] = CorrelationCoefficient
+ # Ask the trained model what it thinks of this new feature-vector:
+ if SpeciesA.Charge > 2:
+ Model = self.Model3
+ else:
+ Model = self.Model2
+ NewModelScore = self.ScoreInstance(Model, NewFeatures)
+ PValue = Model.GetPValue(NewModelScore)
+ OldPValue = Model.GetPValue(SpeciesB.ModelScore)
+ print "Score of the NEW FEATURES: %.3f (%.1f%%) versus %.3f (%.1f%%) old)"%(NewModelScore, 100 * PValue, SpeciesB.ModelScore, 100 * OldPValue)
+ DetailStr += "%s\t%s\t"%(SpeciesB.ModelScore, NewModelScore)
+ ################################################
+ # Condition 3: Model score should not be dramatically worse!
+ if NewModelScore < SpeciesB.ModelScore - 0.5:
+ DetailStr = "FailModelScore\t%s"%DetailStr
+ self.MergeDetailOutput.write(DetailStr + "\n")
+ return 0
+ DetailStr = "MERGE\t" + DetailStr
+ print "MERGE %s and %s"%(SpeciesA.Annotation, SpeciesB.Annotation)
+ self.MergeDetailOutput.write(DetailStr + "\n")
+ #######################################################
+ # ALL CONDITIONS PASSED, NOW LET'S MERGE:
+ SpeciesA.MergedFlag = 1 # this species won't be written out.
+ # Remember the new "best spectrum", if it belonged to A:
+ if (NewBestMQA > SpeciesB.Features[2]):
+ SpeciesB.ConsensusMQScore = NewBestMQA
+ SpeciesB.Bits[FormatBits.BestSpectrumPath] = SpeciesA.Bits[FormatBits.BestSpectrumPath]
+ SpeciesB.Features = NewFeatures
+ ############################################
+ # Update our COVERAGE and MODDED-FRACTION:
+ for DBPos in range(SpeciesA.DBPos, SpeciesA.DBEnd):
+ if SpeciesA.Peptide.Modifications.keys():
+ self.ModCoverage[DBPos] -= int(SpeciesA.Features[FeatureBits.SpectrumCount])
+ else:
+ self.Coverage[DBPos] -= int(SpeciesA.Features[FeatureBits.SpectrumCount])
+ for DBPos in range(SpeciesB.DBPos, SpeciesB.DBEnd):
+ if SpeciesB.Peptide.Modifications.keys():
+ self.ModCoverage[DBPos] += int(SpeciesB.Features[FeatureBits.SpectrumCount])
+ else:
+ self.Coverage[DBPos] += int(SpeciesB.Features[FeatureBits.SpectrumCount])
+ SpeciesB.Features[FeatureBits.ModdedFraction] = self.ModCoverage[SpeciesB.ModDBPos] / float(self.ModCoverage[SpeciesB.ModDBPos] + self.Coverage[SpeciesB.ModDBPos])
+ ############################################
+ # Write the adjusted consensus cluster:
+ ClusterPathB = os.path.join(self.ConsensusClusterDirAdjusted, SpeciesB.Annotation[2], "%s.%s.cls"%(SpeciesB.Annotation, SpeciesB.Charge))
+ ClusterB.PickleCluster(ClusterPathB)
+ SpeciesB.ConsensusMSSpectrum = NewConsensusSpectrum
+ SpeciesB.ConsensusSpectrum = PySpectrum
+ SpeciesB.ModelScore = NewModelScore
+ # Write the adjusted consensus spectrum:
+ ConsensusPath = os.path.join(self.ConsensusSpectraDirAdjusted, SpeciesB.Annotation[2], "%s.%s.dta"%(SpeciesB.Annotation, SpeciesB.Charge))
+ NewConsensusSpectrum.WritePeaks(ConsensusPath)
+ # Write the merged list of members:
+ MemberListStr = ""
+ try:
+ MemberListStr += SpeciesA.GetMemberListStr(self)
+ MemberListStr += SpeciesB.GetMemberListStr(self)
+ except:
+ print "* ERROR!"
+ print SpeciesA
+ print SpeciesB
+ raise
+ Path = os.path.join(self.ClusterScanListDirAdjusted, SpeciesB.Annotation[2], "%s.%s.txt"%(SpeciesB.Annotation, SpeciesB.Charge))
+ ClusterMemberFile = open(Path, "wb")
+ ClusterMemberFile.write(MemberListStr)
+ ClusterMemberFile.close()
+ # Remove peptide A from self.PeptideDict:
+ Key = (SpeciesA.Annotation, SpeciesA.Charge)
+ try:
+ del self.PeptideDict[Key]
+ except:
+ pass
+ return 1
+ def AttemptReconcileFixEndpoints(self, SpeciesA, SpeciesB, OldDBPos, OldDBEnd):
+ # Adjust endpoints, if necessary for reconciliation:
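+ # Note: AminoMass.get(..., 999999) returns a huge sentinel mass when the
+ # flanking database character is not a standard residue (e.g. the "*"
+ # record separator), so that candidate shift simply fails the mass test.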
+ if SpeciesA.ModMass > SpeciesB.ModMass:
+ # Species A has a larger modification. Maybe we can ADD 1-2 residues
+ # and make the modification mass equal?
+ # Try shifting N-terminus:
+ ExtraMass = Global.AminoMass.get(self.DB[SpeciesA.DBPos - 1], 999999)
+ FullMass = int(round(SpeciesA.ModMass - ExtraMass))
+ if FullMass == SpeciesB.ModMass:
+ return (OldDBPos - 1, OldDBEnd)
+ if FullMass < SpeciesB.ModMass:
+ ExtraMass += Global.AminoMass.get(self.DB[SpeciesA.DBPos - 2], 999999)
+ FullMass = int(round(SpeciesA.ModMass - ExtraMass))
+ if FullMass == SpeciesB.ModMass:
+ return (OldDBPos - 2, OldDBEnd)
+ # Try shifting C-terminus:
+ ExtraMass = Global.AminoMass.get(self.DB[SpeciesA.DBEnd], 999999)
+ FullMass = int(round(SpeciesA.ModMass - ExtraMass))
+ if abs(FullMass - SpeciesB.ModMass) < 2:
+ return (OldDBPos, OldDBEnd + 1)
+ if FullMass < SpeciesB.ModMass:
+ ExtraMass += Global.AminoMass.get(self.DB[SpeciesA.DBEnd + 1], 999999)
+ FullMass = int(round(SpeciesA.ModMass - ExtraMass))
+ if abs(FullMass - SpeciesB.ModMass) < 2:
+ return (OldDBPos, OldDBEnd + 2)
+ if SpeciesA.ModMass < SpeciesB.ModMass:
+ # Species A has a smaller modification. Maybe we can REMOVE 1-2 residues
+ # and make the modification mass equal?
+ # Try shifting N-terminus:
+ ExtraMass = Global.AminoMass[self.DB[SpeciesA.DBPos]]
+ FullMass = int(round(SpeciesA.ModMass + ExtraMass))
+ if abs(FullMass - SpeciesB.ModMass) < 2:
+ return (OldDBPos + 1, OldDBEnd)
+ if FullMass > SpeciesB.ModMass:
+ ExtraMass += Global.AminoMass[self.DB[SpeciesA.DBPos + 1]]
+ FullMass = int(round(SpeciesA.ModMass + ExtraMass))
+ if abs(FullMass - SpeciesB.ModMass) < 2:
+ return (OldDBPos + 2, OldDBEnd)
+ # Try shifting C-terminus:
+ ExtraMass = Global.AminoMass[self.DB[SpeciesA.DBEnd - 1]]
+ FullMass = int(round(SpeciesA.ModMass + ExtraMass))
+ if abs(FullMass - SpeciesB.ModMass) < 2:
+ return (OldDBPos, OldDBEnd - 1)
+ if FullMass < SpeciesB.ModMass:
+ ExtraMass += Global.AminoMass[self.DB[SpeciesA.DBEnd - 2]]
+ FullMass = int(round(SpeciesA.ModMass + ExtraMass))
+ if abs(FullMass - SpeciesB.ModMass) < 2:
+ return (OldDBPos, OldDBEnd - 2)
+ return (OldDBPos, OldDBEnd)
+ def AttemptReconcile(self, SpeciesA, SpeciesB):
+ """
+ Attempt to reconcile species A with species B. In other words,
+ edit annotation A so that it carries the same modification as B, and
+ on the same database position. If the effects on match quality score
+ (and/or model score) are an IMPROVEMENT (or at least, not a big
+ disappointment), then perform the reconciliation, and return TRUE.
+ """
+ OldDBPos = SpeciesA.DBPos
+ OldDBEnd = SpeciesA.DBEnd
+ (NewDBPos, NewDBEnd) = self.AttemptReconcileFixEndpoints(SpeciesA, SpeciesB, SpeciesA.DBPos, SpeciesA.DBEnd)
+ NewPrefix = self.DB[NewDBPos - 1]
+ NewSuffix = self.DB[NewDBEnd]
+ ConsensusSpectrum = SpeciesA.GetConsensusSpectrum(self)
+ ModIndex = SpeciesB.ModDBPos - NewDBPos
+ ModdedAnnotation = "%s%+d%s"%(self.DB[NewDBPos:SpeciesB.ModDBPos + 1], SpeciesB.ModMass, self.DB[SpeciesB.ModDBPos + 1:NewDBEnd])
+ NewAnnotation = "%s.%s.%s"%(NewPrefix, ModdedAnnotation, NewSuffix)
+ NewAnnotation = NewAnnotation.replace("*", "-")
+ NewConsensusScore = ConsensusSpectrum.ScorePeptide(NewAnnotation)
+ ScoreDiff = NewConsensusScore - SpeciesA.ConsensusMQScore
+ self.ReconcileDetailOutput.write("%s\t%s\t%s\t%s\t%s\t\n"%(SpeciesA.Annotation, NewAnnotation, SpeciesA.ConsensusMQScore, NewConsensusScore, ScoreDiff))
+ if ScoreDiff < -0.5:
+ return 0
+ OldAnnotation = SpeciesA.Annotation
+ NewPeptide = GetPeptideFromModdedName(NewAnnotation)
+ NewModlessAnnotation = "%s.%s.%s"%(NewPeptide.Prefix, self.DB[NewDBPos:NewDBEnd], NewPeptide.Suffix)
+ # Compute new features of the 'reconciled peptide':
+ NewFeatures = SpeciesA.Features[:]
+ # Best spectrum MQScore and Delta-score:
+ PySpectrum = SpeciesA.GetBestSpectrum(self)
+ NewBestMQ = PySpectrum.ScorePeptide(NewAnnotation)
+ NewFeatures[FeatureBits.BestDeltaScore] += (NewBestMQ - NewFeatures[FeatureBits.BestMQScore])
+ NewFeatures[FeatureBits.BestMQScore] = NewBestMQ
+ PeptideLength = NewDBEnd - NewDBPos
+ NewFeatures[FeatureBits.PeptideLength] = PeptideLength
+ NewFeatures[FeatureBits.LogPeptideLength] = math.log(PeptideLength)
+ # Consensus score:
+ PySpectrum = SpeciesA.GetConsensusSpectrum(self)
+ ScoreInfo = PySpectrum.ScorePeptideDetailed(NewAnnotation)
+ NewFeatures[FeatureBits.ConsensusMQScore] = ScoreInfo[0]
+ NewFeatures[FeatureBits.PeptideLength] = ScoreInfo[1]
+ NewFeatures[FeatureBits.TotalCutScore] = ScoreInfo[2]
+ NewFeatures[FeatureBits.MedianCutScore] = ScoreInfo[3]
+ NewFeatures[FeatureBits.YPresent] = ScoreInfo[4]
+ NewFeatures[FeatureBits.BPresent] = ScoreInfo[5]
+ NewFeatures[FeatureBits.BYIntensity] = ScoreInfo[6]
+ NewFeatures[FeatureBits.NTT] = ScoreInfo[7]
+ # Adjust delta-score by the difference in consensus-mq-score:
+ NewFeatures[FeatureBits.DeltaVsBigDB] = SpeciesA.Features[FeatureBits.DeltaVsBigDB] + (ScoreInfo[0] - SpeciesA.Features[FeatureBits.ConsensusMQScore])
+ # Adjust spectra, sites for this modification type:
+ NewFeatures[FeatureBits.SpectraThisModType] = SpeciesB.Features[FeatureBits.SpectraThisModType]
+ NewFeatures[FeatureBits.SitesThisModType] = SpeciesB.Features[FeatureBits.SitesThisModType]
+ NewFeatures[FeatureBits.LogSpecThisType] = SpeciesB.Features[FeatureBits.LogSpecThisType]
+ NewFeatures[FeatureBits.LogSitesThisType] = SpeciesB.Features[FeatureBits.LogSitesThisType]
+ # Modless spectra:
+ # - If our endpoint didn't change, then we keep our old modless spectrum
+ # - If our endpoints changed and we're assimilating a target (ExistingSpecies),
+ # then we inherit *its* modless spectra
+ # - Otherwise, we LOSE our modless spectra!
+ Key = (NewAnnotation, SpeciesA.Charge)
+ ExistingSpecies = self.PeptideDict.get(Key, None)
+ # Initialize:
+ ModlessSpectrumFlag = 0
+ BestModlessSpectrumPath = ""
+ BestModlessMQScore = ""
+ if OldDBPos == NewDBPos and OldDBEnd == NewDBEnd:
+ try:
+ ModlessSpectrumFlag = int(SpeciesA.Bits[FormatBits.SisterAnnotationFlag])
+ BestModlessSpectrumPath = SpeciesA.Bits[FormatBits.BestModlessSpectrumPath]
+ BestModlessMQScore = float(SpeciesA.Bits[FormatBits.BestModlessMQScore])
+ ModlessMSSpectrum = SpeciesA.GetConsensusModlessMSSpectrum(self)
+ except:
+ pass # the modless-bits weren't set; that's fine
+ elif ExistingSpecies:
+ try:
+ ModlessSpectrumFlag = int(ExistingSpecies.Bits[FormatBits.SisterAnnotationFlag])
+ BestModlessSpectrumPath = ExistingSpecies.Bits[FormatBits.BestModlessSpectrumPath]
+ BestModlessMQScore = float(ExistingSpecies.Bits[FormatBits.BestModlessMQScore])
+ ModlessMSSpectrum = ExistingSpecies.GetConsensusModlessMSSpectrum(self)
+ except:
+ pass # the modless-bits weren't set; that's fine
+ else:
+ ModlessSpectrumFlag = ""
+ BestModlessSpectrumPath = ""
+ BestModlessMQScore = ""
+ if ModlessSpectrumFlag:
+ MSSpectrum = SpeciesA.GetConsensusMSSpectrum(self)
+ Comparator = SpectralSimilarity.SpectralSimilarity(MSSpectrum,
+ ModlessMSSpectrum, NewAnnotation, NewModlessAnnotation)
+ # COPIED from ComputePTMFeatures:
+ Comparator.LabelPeaks(0.5)
+ Similarity = Comparator.DotProduct(0.5)
+ NewFeatures[FeatureBits.Dot] = Similarity
+ Similarity = Comparator.GetSharedPeakCount(0, 1)
+ NewFeatures[FeatureBits.Shared01] = Similarity
+ Similarity = Comparator.GetSharedPeakCount(1, 1)
+ NewFeatures[FeatureBits.Shared11] = Similarity
+ CorrelationCoefficient = Comparator.ComputeCorrelationCoefficient(1.0)
+ NewFeatures[FeatureBits.Correlation] = CorrelationCoefficient
+ if SpeciesA.Charge > 2:
+ Model = self.Model3
+ else:
+ Model = self.Model2
+ NewModelScore = self.ScoreInstance(Model, NewFeatures)
+ # Finalize:
+ # If self.PeptideDict already has an entry, then this peptide
+ # is the same as another after reconciliation. But, we already
+ # skipped the opportunity to merge with that other.
+ if ExistingSpecies:
+ self.ReconcileDetailOutput.write("# Existing species has score %s vs model %s\n"%(ExistingSpecies.ModelScore, SpeciesA.ModelScore))
+ if ExistingSpecies.ModelScore < SpeciesA.ModelScore:
+ ExistingSpecies.MergedFlag = 1
+ else:
+ # We want to reconcile to the master...but that would make us the same as another
+ # peptide species, which we refused to merge with!
+ self.ReconcileDetailOutput.write("# *-> We'd like to reconcile %s to %s, but...\n"%(SpeciesA.Annotation, NewAnnotation))
+ self.ReconcileDetailOutput.write("# ...there's ALREADY a superior peptide at %s\n"%str(Key))
+ return 0
+ ################################################################################
+ # All tests passed. RECONCILE!
+ self.ReconcileDetailOutput.write("> Reconcile %s to %s\n"%(SpeciesA.Annotation, NewAnnotation))
+ # Copy over our consensus spectrum.
+ NewSpectrumPath = os.path.join(self.ConsensusSpectraDirAdjusted, NewAnnotation[2], "%s.%s.dta"%(NewAnnotation, SpeciesA.Charge))
+ OldSpectrumPath = os.path.join(self.ConsensusSpectraDirAdjusted, SpeciesA.Annotation[2], "%s.%s.dta"%(SpeciesA.Annotation, SpeciesA.Charge))
+ if os.path.exists(OldSpectrumPath):
+ # If we've already adjusted once, move the *old* adjusted to the *new* adjusted:
+ if sys.platform == "win32":
+ Command = "move \"%s\" \"%s\""%(OldSpectrumPath, NewSpectrumPath)
+ else:
+ Command = "mv \"%s\" \"%s\""%(OldSpectrumPath, NewSpectrumPath)
+ else:
+ OldSpectrumPath = os.path.join(self.ConsensusSpectraDir, SpeciesA.Annotation[2], "%s.%s.dta"%(SpeciesA.Annotation, SpeciesA.Charge))
+ if sys.platform == "win32":
+ Command = "copy \"%s\" \"%s\""%(OldSpectrumPath, NewSpectrumPath)
+ else:
+ Command = "cp \"%s\" \"%s\""%(OldSpectrumPath,NewSpectrumPath)
+ print Command
+ os.system(Command)
+ # Copy over the cluster member list:
+ MemberListStr = SpeciesA.GetMemberListStr(self)
+ Path = os.path.join(self.ClusterScanListDirAdjusted, NewAnnotation[2], "%s.%s.txt"%(NewAnnotation, SpeciesA.Charge))
+ ClusterMemberFile = open(Path, "wb")
+ ClusterMemberFile.write(MemberListStr)
+ ClusterMemberFile.close()
+ # Update features and such:
+ SpeciesA.Features = NewFeatures
+ SpeciesA.ConsensusMQScore = NewConsensusScore
+ SpeciesA.ModelScore = NewModelScore
+ SpeciesA.Peptide = NewPeptide
+ SpeciesA.ComputePrefixes()
+ SpeciesA.Annotation = NewAnnotation
+ SpeciesA.ModDBPos = SpeciesB.ModDBPos
+ SpeciesA.ModMass = SpeciesB.ModMass
+ SpeciesA.ModAA = SpeciesB.ModAA
+ SpeciesA.ModlessAnnotation = NewModlessAnnotation
+ # Revise file bits:
+ SpeciesA.Bits[FormatBits.DBPos] = str(SpeciesA.ModDBPos)
+ SpeciesA.Bits[FormatBits.ModificationMass] = str(SpeciesA.ModMass)
+ SpeciesA.Bits[FormatBits.ModifiedAA] = SpeciesB.Bits[FormatBits.ModifiedAA]
+ SpeciesA.Bits[FormatBits.ModifiedResidueNumber] = SpeciesB.Bits[FormatBits.ModifiedResidueNumber]
+ SpeciesA.Bits[FormatBits.Peptide] = NewAnnotation
+ for FeatureIndex in range(len(SpeciesA.Features)):
+ SpeciesA.Bits[FormatBits.FirstFeature + FeatureIndex] = str(SpeciesA.Features[FeatureIndex])
+ SpeciesA.Bits[FormatBits.ModelScore] = str(NewModelScore)
+ SpeciesA.Bits[FormatBits.ConsensusMQScore] = str(NewConsensusScore)
+ # Bits for modless spectra:
+ SpeciesA.Bits[FormatBits.SisterAnnotationFlag] = str(ModlessSpectrumFlag)
+ SpeciesA.Bits[FormatBits.BestModlessSpectrumPath] = BestModlessSpectrumPath
+ SpeciesA.Bits[FormatBits.BestModlessMQScore] = str(BestModlessMQScore)
+ # Remove our old PeptideDict entry:
+ try:
+ del self.PeptideDict[(OldAnnotation, SpeciesA.Charge)]
+ except:
+ pass
+ # Add a new PeptideDict entry:
+ self.PeptideDict[Key] = SpeciesA
+ ############################################
+ # Update our COVERAGE and MODDED-FRACTION:
+ for DBPos in range(OldDBPos, OldDBEnd):
+ self.ModCoverage[DBPos] -= int(SpeciesA.Features[FeatureBits.SpectrumCount])
+ for DBPos in range(SpeciesA.DBPos, SpeciesA.DBEnd):
+ self.ModCoverage[DBPos] += int(SpeciesA.Features[FeatureBits.SpectrumCount])
+ SpeciesA.Features[FeatureBits.ModdedFraction] = self.ModCoverage[SpeciesA.ModDBPos] / float(self.ModCoverage[SpeciesA.ModDBPos] + self.Coverage[SpeciesA.ModDBPos])
+ ############################################
+ return 1
+ def LoadModel(self):
+ print "load %s"%self.SavedModelFileName2
+ self.Model2 = Learning.LoadGeneralModel(self.SavedModelFileName2)
+ print "load %s"%self.SavedModelFileName3
+ self.Model3 = Learning.LoadGeneralModel(self.SavedModelFileName3)
+ #print "Model.MixtureModel", self.Model.MixtureModel
+ def GroupPeptidesBySite(self):
+ self.Sites = {} # (ModDBPos, ModMass) -> site instance
+ for Species in self.SpeciesList:
+ if Species.MergedFlag:
+ continue
+ Key = (Species.ModDBPos, Species.ModMass)
+ Site = self.Sites.get(Key, None)
+ if not Site:
+ Site = SiteClass()
+ Site.ModDBPos = Species.ModDBPos
+ Site.ModMass = Species.ModMass
+ Site.ModAA = Species.ModAA
+ self.Sites[Key] = Site
+ Site.SpeciesList.append(Species)
+ def ScorePTMSites(self):
+ """
+ Group peptide species by site, and compute the p-value (odds FALSE) for
+ each site.
+ """
+ self.GroupPeptidesBySite()
+ for Site in self.Sites.values():
+ Site.PValue = 1.0
+ for Species in Site.SpeciesList:
+ while len(Species.Bits) <= FormatBits.SitePValue:
+ Species.Bits.append("")
+ if Species.Charge > 2:
+ Model = self.Model3
+ else:
+ Model = self.Model2
+ PeptidePValue = Model.GetPValue(Species.ModelScore)
+ Species.Bits[FormatBits.ModelPValue] = str(PeptidePValue)
+ Site.PValue *= PeptidePValue
+ for Species in Site.SpeciesList:
+ Species.Bits[FormatBits.SitePValue] = str(Site.PValue)
+ def RescoreAndWriteSite(self, Site):
+ Site.PValue = 1.0
+ for Species in Site.SpeciesList:
+ ModTypeKey = (Species.Bits[FormatBits.ModifiedAA], Species.ModMass)
+ TotalSpectra = self.ModTypeSpectrumCount.get(ModTypeKey, 0)
+ TotalSites = self.ModTypeSiteCount.get(ModTypeKey, 0)
+ Species.Features[FeatureBits.SpectraThisModType] = TotalSpectra
+ Species.Features[FeatureBits.SitesThisModType] = TotalSites
+ Species.Features[FeatureBits.LogSpecThisType] = math.log(TotalSpectra)
+ Species.Features[FeatureBits.LogSitesThisType] = math.log(TotalSites)
+ Species.Bits[FormatBits.SpectraWithThisModType] = str(TotalSpectra)
+ Species.Bits[FormatBits.SitesWithThisModType] = str(TotalSites)
+ Species.Bits[FormatBits.LogSpectraThisModType] = str(math.log(TotalSpectra))
+ Species.Bits[FormatBits.LogSitesThisModType] = str(math.log(TotalSites))
+ DBPosition = int(Species.Bits[FormatBits.DBPos])
+ # Update modded%:
+ ModdedSpectra = self.ModCoverage[DBPosition]
+ ModlessSpectra = self.Coverage[DBPosition]
+ TotalSpectra = ModdedSpectra + ModlessSpectra
+ if TotalSpectra <= 0:
+ print "*** Warning: Site %s has no coverage at DB position %s"%(Species.Annotation, DBPosition)
+ Species.Bits[FormatBits.ModdedFraction] = str(ModdedSpectra / float(max(1, TotalSpectra)))
+ # Pad with empty bits if necessary:
+ while len(Species.Bits) <= FormatBits.SitePValue:
+ Species.Bits.append("")
+ if Species.Charge > 2:
+ Model = self.Model3
+ else:
+ Model = self.Model2
+ PeptidePValue = Model.GetPValue(Species.ModelScore)
+ Species.Bits[FormatBits.ModelPValue] = str(PeptidePValue)
+ Site.PValue *= PeptidePValue
+ for Species in Site.SpeciesList:
+ Species.Bits[FormatBits.SitePValue] = str(Site.PValue)
+ Str = string.join(Species.Bits, "\t")
+ self.OutputFile.write(Str + "\n")
+ def OutputPTMs(self):
+ File = open(self.OutputFileName, "wb")
+ for Line in self.HeaderLines:
+ File.write(Line)
+ # Sort the sites:
+ SortedSites = []
+ for Site in self.Sites.values():
+ SortedSites.append((Site.PValue, Site))
+ SortedSites.sort()
+ CumulativeSiteCount = 0
+ CumulativeTrueSiteCount = 0
+ CumulativeSpeciesCount = 0
+ # Report peptides, grouped by site, from best site to worst:
+ for (PValue, Site) in SortedSites:
+ BestSpecies = None
+ for Species in Site.SpeciesList:
+ if Species.MergedFlag:
+ continue
+ CumulativeSpeciesCount += 1
+ if (BestSpecies == None) or (Species.PValue < BestSpecies.PValue):
+ BestSpecies = Species
+ if not BestSpecies:
+ continue
+ CumulativeSiteCount += 1
+ if int(Species.Bits[FormatBits.TrueProteinFlag]):
+ CumulativeTrueSiteCount += 1
+ # The LENS way:
+ FalseProteinCount = CumulativeSiteCount - CumulativeTrueSiteCount
+ #FalseWithinTrue = FalseProteinCount * 0.01
+ FalseWithinTrue = FalseProteinCount
+ TrueCount = max(0, CumulativeTrueSiteCount - FalseWithinTrue)
+ SiteCount = CumulativeTrueSiteCount
+ # FDR:
+ if CumulativeTrueSiteCount <= 0:
+ FDR = 1.0
+ else:
+ # False discovery rate:
+ # The number of spurious sites which come from valid proteins
+ # divided by the number of sites that come from valid proteins
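+ # (e.g. 90 cumulative sites on true proteins and 10 on false proteins
+ # gives FDR = 10 / 90, about 0.11)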
+ FDR = FalseWithinTrue / float(CumulativeTrueSiteCount)
+ FDR = min(1.0, FDR)
+ print "pvalue %.6f sites%d species%d T%d F%d FWT %.3f SC %.3f FDR %.3f"%(\
+ Site.PValue, CumulativeSiteCount, CumulativeSpeciesCount,
+ CumulativeTrueSiteCount, FalseProteinCount, FalseWithinTrue, SiteCount, FDR)
+ for Species in Site.SpeciesList:
+ if Species.MergedFlag:
+ continue
+ try:
+ Str = string.join(Species.Bits, "\t")
+ Str += "\t%s\t"%FDR
+ except:
+ traceback.print_exc()
+ print Species.Bits
+ print map(type, Species.Bits)
+ File.write(Str + "\n")
+ File.close()
+ def ParseDB(self):
+ DBFile = open(self.DBFileName, "rb")
+ self.DB = DBFile.read()
+ DBFile.close()
+ def CheckSpectrumDirectories(self):
+ """
+ Create our adjusted-spectrum and adjusted-cluster directories, wiping out
+ old ones beforehand if we must. We do this for merge-and-reconcile;
+ we DON'T do it for biochem tweaking
+ """
+ print "Prepare spectrum directories... (-z option, set for "
+ print "single-block runs and the first block of multi-block runs...)"
+ try:
+ shutil.rmtree(self.ConsensusClusterDirAdjusted)
+ except:
+ pass
+ try:
+ shutil.rmtree(self.ConsensusSpectraDirAdjusted)
+ except:
+ pass
+ try:
+ shutil.rmtree(self.ClusterScanListDirAdjusted)
+ except:
+ pass
+ MakeDirectory(self.ConsensusClusterDirAdjusted)
+ MakeDirectory(self.ConsensusSpectraDirAdjusted)
+ MakeDirectory(self.ClusterScanListDirAdjusted)
+ Aminos = "ACDEFGHIKLMNOPQRSTUVWY"
+ for Amino in Aminos:
+ Dir = os.path.join(self.ConsensusClusterDirAdjusted, Amino)
+ MakeDirectory(Dir)
+ Dir = os.path.join(self.ConsensusSpectraDirAdjusted, Amino)
+ MakeDirectory(Dir)
+ Dir = os.path.join(self.ClusterScanListDirAdjusted, Amino)
+ MakeDirectory(Dir)
+
+ def ParseCommandLine(self, Arguments):
+ (Options, Args) = getopt.getopt(Arguments, "r:w:d:c:m:k:M:x:y:X:zev:")
+ OptionsSeen = {}
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-r":
+ # -r results file(s)
+ if not os.path.exists(Value):
+ print "** Error: couldn't find results file '%s'\n\n"%Value
+ print UsageInfo
+ sys.exit(1)
+ self.InputFileName = Value
+ elif Option == "-m":
+ self.SavedModelFileName2 = "%s.2"%Value
+ self.SavedModelFileName3 = "%s.3"%Value
+ elif Option == "-M":
+ self.OutputModelFileName2 = "%s.2"%Value
+ self.OutputModelFileName3 = "%s.3"%Value
+ elif Option == "-w":
+ self.OutputFileName = Value
+ elif Option == "-d":
+ self.DBFileName = Value
+ elif Option == "-k":
+ self.KnownChemistryFileName = Value
+ elif Option == "-c":
+ self.TempFileDir = Value
+ self.ConsensusClusterDir = os.path.join(Value, "Clusters")
+ self.ConsensusSpectraDir = os.path.join(Value, "Spectra")
+ self.ClusterScanListDir = os.path.join(Value, "ClusterMembers")
+ self.ConsensusClusterDirAdjusted = os.path.join(Value, "ClustersAdjusted")
+ self.ConsensusSpectraDirAdjusted = os.path.join(Value, "SpectraAdjusted")
+ self.ClusterScanListDirAdjusted = os.path.join(Value, "ClusterMembersAdjusted")
+ elif Option == "-x":
+ self.DBStart = int(Value)
+ elif Option == "-y":
+ self.DBEnd = int(Value)
+ elif Option == "-X":
+ self.SpectrumRoot = Value
+ elif Option == "-z":
+ self.CheckDirectoriesFlag = 1
+ elif Option == "-e":
+ # mErge block runs:
+ self.MergeBlockRunsFlag = 1
+ elif Option == "-v":
+ self.KnownPTMVerboseOutputFileName = Value
+ def SaveSitesByType(self):
+ "Save sites/spectra by modification type"
+ PicklePath = os.path.join(self.TempFileDir, "SitesByModType.dat")
+ SitesFile = open(PicklePath, "wb")
+ cPickle.dump(self.ModTypeSiteCount, SitesFile)
+ cPickle.dump(self.ModTypeSpectrumCount, SitesFile)
+ SitesFile.close()
+ def LoadSitesByType(self):
+ "Load sites/spectra by modification type"
+ PicklePath = os.path.join(self.TempFileDir, "SitesByModType.dat")
+ SitesFile = open(PicklePath, "rb")
+ self.ModTypeSiteCount = cPickle.load(SitesFile)
+ self.ModTypeSpectrumCount = cPickle.load(SitesFile)
+ SitesFile.close()
+ def LoadHeaderLines(self, FileName):
+ File = open(FileName, "rb")
+ LineNumber = 0
+ for FileLine in File.xreadlines():
+ LineNumber += 1
+ if LineNumber > 10:
+ break
+ if FileLine[0] == "#":
+ self.HeaderLines.append(FileLine)
+ File.close()
+ def Main(self):
+ if self.SpectrumRoot:
+ self.PopulateSpectrumOracle(self.SpectrumRoot)
+ print "Load model..."
+ self.LoadModel()
+ print "Parse database..."
+ self.ParseDB()
+ if self.MergeBlockRunsFlag:
+ self.MergeBlockRuns()
+ elif self.KnownChemistryFileName:
+ print "Tweak sites to match KNOWN CHEMISTRY..."
+ self.LoadKnownModifications()
+ self.LoadSitesByType()
+ self.LoadHeaderLines(self.InputFileName)
+ self.OutputFile = open(self.OutputFileName, "wb")
+ for HeaderLine in self.HeaderLines:
+ self.OutputFile.write(HeaderLine)
+ self.ProcessSites(self.InputFileName, "knownptm")
+ else:
+ print "MERGE and RECONCILE..."
+ if self.CheckDirectoriesFlag:
+ self.CheckSpectrumDirectories()
+ self.MergeAndReconcile()
+ # Re-compute spectra for mod type, since some peptides
+ # have now been re-annotated:
+ #self.ComputeTotalSpectraForModType()
+ #self.RescorePeptides()
+ #self.ScorePTMSites()
+
+ #print "Write output to %s"%(self.OutputFileName)
+ #self.OutputPTMs()
+
+UsageInfo = """
+AdjustPTM: Merge, reconcile, and tweak PTM annotations.
+
+Arguments:
+-r [FILENAME]: Feature file (from TrainPTMFeatures) to read in
+-w [FILENAME]: Output modded-peptide filename
+-d [DBFILE]: Database searched
+-c [DIR]: Cluster directory
+-k [FILENAME]: Known chemistry filename. If specified, consider altering sites
+ to match known chemical adducts; report the best site-score attainable by using
+ known chemical adducts.
+-m [FILENAME]: Peptide scoring model INPUT filename
+-M [FILENAME]: Peptide scoring model OUTPUT filename
+-x [POS]: Database start position
+-y [POS]: Database end position
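+-X [DIR]: Root directory used to locate the original spectrum files
+-z: Wipe and re-create the adjusted cluster/spectrum directories; set this for
+ single-block runs and for the first block of multi-block runs
+-e: Merge the results of multiple block runs
+-v [FILENAME]: Verbose output filename for known-PTM (-k) processing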
+"""
+
+if __name__ == "__main__":
+ if PROFILING_RUN:
+ import profile
+ profile.run("Main()")
+ else:
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "(Psyco not found - no optimization)"
+ Adjutant = PTMAdjuster()
+ Adjutant.ParseCommandLine(sys.argv[1:])
+ Adjutant.Main()
diff --git a/AminoAcidMasses.txt b/AminoAcidMasses.txt
new file mode 100644
index 0000000..46b3d7c
--- /dev/null
+++ b/AminoAcidMasses.txt
@@ -0,0 +1,22 @@
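+# Columns: full name, three-letter code, one-letter code, monoisotopic mass (Da), average mass (Da)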
+Glycine Gly G 57.02146 57.0520
+Alanine Ala A 71.03711 71.0788
+Serine Ser S 87.03203 87.0782
+Proline Pro P 97.05276 97.1167
+Valine Val V 99.06841 99.1326
+Threonine Thr T 101.04768 101.1051
+Cysteine Cys C 103.00919 103.1448
+Leucine Leu L 113.08406 113.1595
+Isoleucine Ile I 113.08406 113.1595
+Asparagine Asn N 114.04293 114.1039
+AsparticAcid Asp D 115.02694 115.0886
+Glutamine Gln Q 128.05858 128.1308
+Lysine Lys K 128.09496 128.1742
+GlutamicAcid Glu E 129.04259 129.1155
+Methionine Met M 131.04049 131.1986
+Histidine His H 137.05891 137.1412
+Phenylalanine Phe F 147.06841 147.1766
+Arginine Arg R 156.10111 156.1876
+Tyrosine Tyr Y 163.06333 163.1760
+Tryptophan Trp W 186.07931 186.2133
+#Selenocysteine Sel U 150.8 151.0
+#Pyrrolysine Pyr O 237.1 237.3
\ No newline at end of file
diff --git a/BN.c b/BN.c
new file mode 100644
index 0000000..8f559bc
--- /dev/null
+++ b/BN.c
@@ -0,0 +1,204 @@
+//Title: BN.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+// Bayesian network support functions.
+// We employ a BN for scoring PRMs (prefix residue masses).
+#include "CMemLeak.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+//#include <malloc.h>
+#include "Errors.h"
+#include "BN.h"
+#include "Utils.h"
+#include "Inspect.h"
+#include "Spectrum.h"
+#include "Trie.h"
+#include <math.h>
+
+BayesianModel* BNCharge2ScoringBN = NULL;
+BayesianModel* BNCharge3ScoringBN = NULL;
+BayesianModel* BNCharge2TaggingBN = NULL;
+BayesianModel* BNCharge3TaggingBN = NULL;
+
+void FreeBayesianModel(BayesianModel* Model);
+
+void OldInitBayesianModels()
+{
+ char FilePath[2048];
+ if (GlobalOptions->InstrumentType == INSTRUMENT_TYPE_QTOF)
+ {
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "Ch2BNPEPQ.dat");
+ BNCharge2ScoringBN = LoadBayesianModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PRMQ2.dat");
+ BNCharge2TaggingBN = LoadBayesianModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "Ch3BNPEPQ.dat");
+ BNCharge3ScoringBN = LoadBayesianModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PRMQ3.dat");
+ BNCharge3TaggingBN = LoadBayesianModel(FilePath);
+ }
+ else
+ {
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "Ch2BNPEP.dat");
+ BNCharge2ScoringBN = LoadBayesianModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "Ch3BNPEP.dat");
+ BNCharge3ScoringBN = LoadBayesianModel(FilePath);
+ if (GlobalOptions->DigestType == DIGEST_TYPE_TRYPSIN)
+ {
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PRM2.dat");
+ BNCharge2TaggingBN = LoadBayesianModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PRM3.dat");
+ BNCharge3TaggingBN = LoadBayesianModel(FilePath);
+ }
+ else
+ {
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PRM2.dat");
+ BNCharge2TaggingBN = LoadBayesianModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PRM3.dat");
+ BNCharge3TaggingBN = LoadBayesianModel(FilePath);
+ }
+ }
+}
+
+void OldFreeBayesianModels()
+{
+ if (BNCharge2ScoringBN)
+ {
+ FreeBayesianModel(BNCharge2ScoringBN);
+ BNCharge2ScoringBN = NULL;
+ }
+ if (BNCharge2TaggingBN)
+ {
+ FreeBayesianModel(BNCharge2TaggingBN);
+ BNCharge2TaggingBN = NULL;
+ }
+ if (BNCharge3ScoringBN)
+ {
+ FreeBayesianModel(BNCharge3ScoringBN);
+ BNCharge3ScoringBN = NULL;
+ }
+ if (BNCharge3TaggingBN)
+ {
+ FreeBayesianModel(BNCharge3TaggingBN);
+ BNCharge3TaggingBN = NULL;
+ }
+
+}
+
+// Compute the probability of a Bayesian node:
+// return ProbTable[ParentValue1*ParentBlock1 + ... + ParentValueN*ParentBlockN + FeatureValue]
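+// Example: a node whose single parent has block size 3 and value 2, and whose own
+// value is 1, reads ProbTable[2*3 + 1] = ProbTable[7].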
+float ComputeBNProbability(BayesianNode* BN, int NodeIndex, int* FeatureValues)
+{
+ int ProbTableIndex;
+ int Parent;
+ int ParentIndex;
+ //
+ ProbTableIndex = 0;
+ for (ParentIndex = 0; ParentIndex < 4; ParentIndex++)
+ {
+ Parent = BN->Parents[ParentIndex];
+ if (Parent >= 0)
+ {
+ //ProbTableIndex += BN->ParentBlocks[ParentIndex] * Model->Nodes[Parent].Value;
+ ProbTableIndex += BN->ParentBlocks[ParentIndex] * FeatureValues[Parent];
+ }
+ else
+ {
+ break;
+ }
+ }
+ ProbTableIndex += FeatureValues[NodeIndex];
+ return BN->ProbTable[ProbTableIndex];
+}
+
+void FreeBayesianModel(BayesianModel* Model)
+{
+ int NodeIndex;
+ BayesianNode* BN;
+ if (Model)
+ {
+ for (NodeIndex = 0; NodeIndex < Model->NodeCount; NodeIndex++)
+ {
+ BN = Model->Nodes + NodeIndex;
+ SafeFree(BN->ProbTable);
+ }
+ SafeFree(Model->Nodes);
+ SafeFree(Model);
+ }
+}
+
+BayesianModel* LoadBayesianModel(char* FileName)
+{
+ int FeatureCount;
+ int FeatureIndex;
+ FILE* File;
+ BayesianNode* BN;
+ BayesianModel* Model;
+ //
+ File = fopen(FileName, "rb");
+ if (!File)
+ {
+ REPORT_ERROR_S(3, FileName);
+ return NULL;
+ }
+ ReadBinary(&FeatureCount, sizeof(int), 1, File);
+ if (FeatureCount < 1 || FeatureCount > 100)
+ {
+ REPORT_ERROR_I(6, FeatureCount);
+ return NULL;
+ }
+ Model = (BayesianModel*)calloc(1, sizeof(BayesianModel));
+ Model->NodeCount = FeatureCount;
+ Model->Nodes = (BayesianNode*)calloc(FeatureCount, sizeof(BayesianNode));
+ for (FeatureIndex = 0; FeatureIndex < FeatureCount; FeatureIndex++)
+ {
+ BN = Model->Nodes + FeatureIndex;
+ ReadBinary(&BN->Flags, sizeof(int), 1, File);
+ ReadBinary(&BN->ValueCount, sizeof(int), 1, File);
+ ReadBinary(&BN->Name, sizeof(char), 64, File);
+ if (BN->Flags & BNODE_HAS_PARENTS)
+ {
+ ReadBinary(BN->Parents, sizeof(int), 4, File);
+ ReadBinary(BN->ParentBlocks, sizeof(int), 4, File);
+ ReadBinary(&BN->ProbTableSize, sizeof(int), 1, File);
+ if (BN->ProbTableSize <= 0 || BN->ProbTableSize > 1000)
+ {
+ REPORT_ERROR_II(7, BN->ProbTableSize, FeatureIndex);
+ }
+ BN->ProbTable = (float*)calloc(BN->ProbTableSize, sizeof(float));
+ ReadBinary(BN->ProbTable, sizeof(float), BN->ProbTableSize, File);
+
+ }
+ }
+ return Model;
+}
diff --git a/BN.h b/BN.h
new file mode 100644
index 0000000..8126b60
--- /dev/null
+++ b/BN.h
@@ -0,0 +1,96 @@
+//Title: BN.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef BN_H
+#define BN_H
+
+// Structs to support use of Bayesian Networks. We use bayesian
+// networks to score PRMs, for both tag generation and for final
+// scoring of matches. Most of the nodes in the network correspond to
+// fragment types. The edges between nodes help capture the co-occurrence
+// relations between peaks (e.g. b-h2o is more likely in presence of b),
+// as well as other factors that predict peak strength (e.g. which spectrum
+// sector the PRM lies in)
+///
+///
+// The bayesian network file has the following format:
+// There's one NodeRecord per bayesian network node. The NodeRecord
+// has flags (int), ValueCount (int; the number of possible values for the node),
+// and a Name (char64). If the node has parents, it then has:
+// Parent indices (4 ints)
+// Parent block-sizes (4 ints, used in computing positions in the probability table)
+// ProbTableSize (equals the first parent block size * the ValueCount)
+// Probability table (float array of size ProbTableSize)
+// Note that the values in the probability table are log-probabilities, so that we can
+// add them up.
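+// The file itself begins with the node count (a single int); LoadBayesianModel in
+// BN.c reads that count and then one NodeRecord per node, in exactly the order above.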
+#include "Utils.h"
+#include "Inspect.h"
+#include "Spectrum.h"
+#include "Trie.h"
+
+// Flags for BayesianNode.Flags:
+// A node has the BNODE_USE_PROB flag set if it's a leaf node, whose
+// probability is to be used. A node has the BNODE_HAS_PARENTS flag
+// set if it has one or more parent nodes.
+#define BNODE_HAS_PARENTS 1
+#define BNODE_USE_PROB 2
+
+typedef struct BayesianNode
+{
+ int Flags;
+ int Value;
+ int ValueCount;
+ char Name[64];
+ int Parents[4];
+ int ParentBlocks[4];
+ int ProbTableSize; // redundant, but useful to keep around for sanity-checks
+ float* ProbTable;
+} BayesianNode;
+
+typedef struct BayesianModel
+{
+ BayesianNode* Nodes;
+ int NodeCount;
+} BayesianModel;
+
+extern BayesianModel* BNCharge2ScoringBN;
+extern BayesianModel* BNCharge3ScoringBN;
+extern BayesianModel* BNCharge2TaggingBN;
+extern BayesianModel* BNCharge3TaggingBN;
+
+BayesianModel* LoadBayesianModel(char* FileName);
+float ComputeBNProbability(BayesianNode* BN, int NodeIndex, int* FeatureValues);
+void OldFreeBayesianModels();
+void OldInitBayesianModels();
+
+#endif // BN_H
+
diff --git a/BasicStats.py b/BasicStats.py
new file mode 100644
index 0000000..4a5b56f
--- /dev/null
+++ b/BasicStats.py
@@ -0,0 +1,120 @@
+#Title: BasicStats.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+import math
+
+def ComputeROCCurve(List):
+ """
+ Compute the ROC curve for a set of tuples of the form (reading, truthflag)
+ """
+ List.sort()
+ List.reverse()
+ AllPositive = 0
+ AllNegative = 0
+ for (Score, Truth) in List:
+ if (Truth):
+ AllPositive += 1
+ else:
+ AllNegative += 1
+ Area = 0
+ TPCount = 0
+ FPCount = 0
+ for (Score, Truth) in List:
+ if (Truth):
+ TPCount += 1
+ else:
+ FPCount += 1
+ TPRate = TPCount / float(AllPositive)
+ Area += TPRate
+ Area /= float(AllNegative)
+ return Area
+
+
+def Sum(List):
+ Total = 0
+ for Entry in List:
+ Total += Entry
+ return Total
+
+def GetMedian(List):
+ SortedList = List[:]
+ SortedList.sort()
+ Len = len(SortedList)
+ if Len % 2 == 1:
+ return SortedList[Len / 2]
+ Score = (SortedList[Len / 2] + SortedList[(Len / 2) - 1]) / 2.0
+ return Score
+
+def GetMean(List):
+ if not len(List):
+ return None
+ Mean = 0
+ for Entry in List:
+ Mean += Entry
+ Mean /= float(len(List))
+ return Mean
+
+def GetMeanStdDev(List):
+ "Computes mean, standard deviation for a list of numbers"
+ if not len(List):
+ return (0, 0)
+ Mean = 0
+ for Entry in List:
+ Mean += Entry
+ Mean /= float(len(List))
+ StdDev = 0
+ for Entry in List:
+ StdDev += (Entry - Mean) ** 2
+ StdDev = math.sqrt(StdDev / float(len(List)))
+ return (Mean, StdDev)
+
+
+def GetStdDev(List):
+ "Computes standard deviation for a list of numbers"
+ if not len(List):
+ return 0.0
+ Mean = 0
+ for Entry in List:
+ Mean += Entry
+ Mean /= float(len(List))
+ StdDev = 0
+ for Entry in List:
+ StdDev += (Entry - Mean) ** 2
+ StdDev = math.sqrt(StdDev / float(len(List)))
+ return StdDev
+
diff --git a/BuildConsensusSpectrum.py b/BuildConsensusSpectrum.py
new file mode 100644
index 0000000..ad7abdf
--- /dev/null
+++ b/BuildConsensusSpectrum.py
@@ -0,0 +1,273 @@
+#Title: BuildConsensusSpectrum.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+BuildConsensusSpectrum:
+- Given many similar spectra which we consider to be the "same" (either due to
+Inspect annotations or due to clustering), we'd like to generate a CONSENSUS
+SPECTRUM. The consensus spectrum should contain less noise than the individual
+spectra, and mass errors should be decreased. Also, the computation shouldn't
+require too much time; for efficiency, it should require just one I/O pass through
+the spectra.
+
+Current pseudocode:
+
+# Accumulate intensity and peak count:
+For spectrum S:
+ Read in peaks from disk
+ Intensity1 = max(max peak intensity, grass peak intensity * 20)
+ For peak P in spectrum S:
+ ScaledIntensity = (Intensity(P) / Intensity1)
+ MassBin = mass of P, rounded to nearest 0.1Da
+ TotalIntensity[MassBin] += ScaledIntensity
+ PeakCount[MassBin]++
+ PeakCount[MassBin-1]++
+ PeakCount[MassBin+1]++
+
+# Process intensity into a peak list:
+Iterate over peaks P from high to low peak-count:
+ if Assimilated[P], continue
+ Peak P assimilates neighboring peaks if their total intensity is lower
+ new spectrum receives this assimilated peak
+"""
+import PyInspect
+import MSSpectrum
+import os
+import sys
+import traceback
+import cPickle
+
+USE_COUNT_FLAG = 1
+
+# Scaling factors, for levels of peak presence from 0% to 100%.
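+# The factor grows from 1.0 for a peak present in none of the cluster's spectra
+# to 0.95 + 0.05 * 2**5 = 2.55 for a peak present in all of them, so peaks that
+# recur across many spectra are boosted in the consensus.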
+ScalingFactors = {}
+for X in range(0, 101):
+ #ScalingFactors[X] = 0.95 + 0.05 * (1.0 + X / 100.0)**5
+ ScalingFactors[X] = 0.95 + 0.05 * (1.0 + X / 100.0)**5
+ #print X, ScalingFactors[X]
+
+
+class ConsensusBuilder:
+ def __init__(self, Charge = None):
+ self.SpectrumCount = 0
+ self.TotalMZ = 0
+ self.Intensity = {}
+ self.PeakCount = {}
+ self.Charge = Charge
+ self.SignalPeakCount = 0
+ def DebugPrint(self):
+ Bins = self.Intensity.keys()
+ Bins.sort()
+ MinBin = min(Bins)
+ MaxBin = max(Bins)
+ for Bin in range(MinBin, MaxBin + 1):
+ Str = "%s\t%s\t%s\t"%(Bin, self.Intensity.get(Bin, 0), self.PeakCount.get(Bin, 0))
+ print Str
+
+ def AddSpectrum(self, Spectrum):
+ """
+ Add one more spectrum to the binned intensity and peak counts
+ """
+ self.TotalMZ += Spectrum.PrecursorMZ
+ self.SpectrumCount += 1
+ if not self.Charge:
+ self.Charge = Spectrum.Charge # ASSUMED: all spectra have same charge!
+ if not self.SignalPeakCount:
+ # Expect to see this many "signal" peaks:
+ self.SignalPeakCount = int(round((Spectrum.ParentMass / 100.0) * 4))
+ MaxIntensity = 0
+ IntensityList = []
+ for Peak in Spectrum.Peaks:
+ MaxIntensity = max(MaxIntensity, Peak.Intensity)
+ IntensityList.append(Peak.Intensity)
+ IntensityList.sort()
+ # Spectra with high signal-to-noise are weighted a bit more heavily:
+ if len(IntensityList) > self.SignalPeakCount:
+ MinimumPresenceIntensity = IntensityList[-self.SignalPeakCount]
+ ScalingFactor = min(20.0 / MaxIntensity, 1.0 / MinimumPresenceIntensity)
+ else:
+ ScalingFactor = 20.0 / MaxIntensity
+ MinimumPresenceIntensity = 0
+ #print "%s peaks; Spectrum has scaling factor %s (grass %s, max peak %s)"%(len(IntensityList), ScalingFactor, MinimumPresenceIntensity, MaxIntensity)
+ for Peak in Spectrum.Peaks:
+ MZBin = int(round(Peak.Mass * 10))
+ ScaledIntensity = Peak.Intensity * ScalingFactor
+ self.Intensity[MZBin] = self.Intensity.get(MZBin, 0) + ScaledIntensity
+ if Peak.Intensity > MinimumPresenceIntensity:
+ self.PeakCount[MZBin] = self.PeakCount.get(MZBin, 0) + 1
+ self.PeakCount[MZBin-1] = self.PeakCount.get(MZBin-1, 0) + 1
+ self.PeakCount[MZBin-2] = self.PeakCount.get(MZBin-2, 0) + 1
+ self.PeakCount[MZBin+1] = self.PeakCount.get(MZBin+1, 0) + 1
+ self.PeakCount[MZBin+2] = self.PeakCount.get(MZBin+2, 0) + 1
+ #self.Intensity[MZBin] = self.Intensity.get(MZBin, 0) + ScaledIntensity
+ def PickleCluster(self, PicklePath):
+ """
+ Serialize the information we've read from many spectra. This method is used
+ if we want to reserve the option to add to the cluster later.
+ """
+ File = open(PicklePath, "wb")
+ cPickle.dump(self.Charge, File)
+ cPickle.dump(self.TotalMZ, File)
+ cPickle.dump(self.SpectrumCount, File)
+ cPickle.dump(self.Intensity, File)
+ cPickle.dump(self.PeakCount, File)
+ File.close()
+ def UnpickleCluster(self, PicklePath):
+ """
+ Sister method to PickleCluster - load a cluster from disk.
+ """
+ File = open(PicklePath, "rb")
+ self.Charge = cPickle.load(File)
+ self.TotalMZ = cPickle.load(File)
+ self.SpectrumCount = cPickle.load(File)
+ self.Intensity = cPickle.load(File)
+ self.PeakCount = cPickle.load(File)
+ File.close()
+
+ def ProduceConsensusSpectrum(self):
+ Spectrum = MSSpectrum.SpectrumClass()
+ Spectrum.Charge = self.Charge
+ Spectrum.PrecursorMZ = self.TotalMZ / float(max(self.SpectrumCount, 1))
+ Spectrum.ParentMass = (Spectrum.PrecursorMZ * Spectrum.Charge) - (Spectrum.Charge - 1)*1.0078
+ # Iterate over intensity entries:
+ self.AssimilationFlag = {}
+ SortedList = []
+ for (Bin, Score) in self.Intensity.items():
+ SortedList.append((Score, Bin))
+ SortedList.sort()
+ SortedList.reverse()
+ Spectrum.Peaks = []
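+ # Walk bins from strongest to weakest; each surviving bin absorbs the
+ # intensity of its unclaimed neighbors within +/-0.3 Da (3 bins of 0.1 Da)
+ # and flags them so they cannot seed peaks of their own.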
+ for (Intensity, Bin) in SortedList:
+ if self.AssimilationFlag.get(Bin, 0):
+ continue
+ Peak = MSSpectrum.PeakClass(Bin / 10.0, Intensity)
+ for NearBin in (Bin-1, Bin-2, Bin-3, Bin+1, Bin+2, Bin+3):
+ if self.AssimilationFlag.get(NearBin, 0):
+ continue
+ self.AssimilationFlag[NearBin] = 1
+ Peak.Intensity += self.Intensity.get(NearBin, 0)
+ # Scale the intensity by the peak count, IF the cluster is large:
+ if USE_COUNT_FLAG and self.SpectrumCount > 4:
+ FractionPresent = self.PeakCount.get(Bin,0) / float(self.SpectrumCount)
+ Bin = min(100, int(round(FractionPresent * 100)))
+ #print FractionPresent, Bin, ScalingFactors[Bin]
+ Peak.Intensity *= ScalingFactors[Bin]
+ Spectrum.Peaks.append(Peak)
+ Spectrum.Peaks.sort()
+ return Spectrum
+ def AssimilateCluster(self, OtherCluster):
+ """
+ Assimilate the other consensus information into our consensus spectrum.
+ """
+ self.SpectrumCount += OtherCluster.SpectrumCount
+ self.TotalMZ += OtherCluster.TotalMZ
+ for (Key, Value) in OtherCluster.Intensity.items():
+ self.Intensity[Key] = self.Intensity.get(Key, 0) + Value
+ for (Key, Value) in OtherCluster.PeakCount.items():
+ self.PeakCount[Key] = self.PeakCount.get(Key, 0) + Value
+def TestConsensus(AnnotationsFile, Annotation, Charge = 2):
+ """
+ Build a consensus spectrum for a collection of spectra, and verify
+ that it looks good.
+ """
+ #AnnotationsFile = "ConsensusTest.txt"
+ #Annotation = "M.D+173VTIQHPWFK.R"
+ SpectrumDir = "e:\\ms\\lens\\spectra"
+ TestOutputFile = "ConsensusTest.dta"
+ Builder = ConsensusBuilder()
+ InputFile = open(AnnotationsFile, "rb")
+ BestScores = []
+ #MGFFile = open("TestTest.mgf", "wb") #%TEMP
+ #MGFFile.close() #%TEMP
+ for FileLine in InputFile.xreadlines():
+ Bits = FileLine.split("\t")
+ try:
+ FilePath = Bits[0]
+ FilePos = int(Bits[15])
+ SpectrumCharge = int(Bits[4])
+ MQScore = float(Bits[5])
+ except:
+ continue
+ if SpectrumCharge != Charge:
+ continue
+ FilePath = os.path.join(SpectrumDir, FilePath.replace("/", "\\").split("\\")[-1])
+ BestScores.append((MQScore, FilePath, FilePos))
+ Spectrum = MSSpectrum.SpectrumClass()
+ Spectrum.ReadPeaks("%s:%s"%(FilePath, FilePos))
+ Spectrum.SetCharge(Charge)
+ Builder.AddSpectrum(Spectrum)
+ # Try ari's thingy:
+ #Spectrum.WriteMGFPeaks("TestTest.mgf") #%TEMP
+ InputFile.close()
+ print "Build consensus spectrum for %s members."%Builder.SpectrumCount
+ Consensus = Builder.ProduceConsensusSpectrum()
+ Consensus.WritePeaks(TestOutputFile)
+ #Command = "MakeConsensus -mgf TestTest.mgf > %s"%(TestOutputFile) #%TEMP
+ #print Command #%TEMP
+ #os.system(Command) #%TEMP
+ PySpectrum = PyInspect.Spectrum(TestOutputFile, 0)
+ Results = PySpectrum.ScorePeptideDetailed(Annotation)
+ ConsensusScore = Results[0]
+ print "Consensus spectrum score: %.2f (%.2f, %.2f, %.2f, %d)"%(ConsensusScore, Results[1], Results[2], Results[3], Results[4])
+ # Compare the CONSENSUS score to the average for the BEST FIVE:
+ BestScores.sort()
+ TopHits = 0
+ for (Score, Path, Pos) in BestScores[-5:]:
+ PySpectrum = PyInspect.Spectrum(Path, Pos)
+ Score = PySpectrum.ScorePeptide(Annotation)
+ TopHits += Score
+ BestFiveAverage = TopHits / 5.0
+ ScoreGain = ConsensusScore - BestFiveAverage
+ print "Consensus %s vs top-5 average %s (%s)"%(ConsensusScore, BestFiveAverage, ScoreGain)
+ return ScoreGain
+
+def TestMain():
+ TestCases = [("Consensus.GNTIEIQGDDAPSLWVYGFSDR.txt", "K.GNTIEIQGDDAPSLWVYGFSDR.V", 2),
+ ("ConsensusTest.txt", "-.M+42DVTIQHPWFK.R", 1),
+ ("ConsensusTest.txt", "-.M+42DVTIQHPWFK.R", 2),
+ ("Consensus.R.QD-17DHGYISR.E.txt", "R.Q-17DDHGYISR.E", 1),
+ ("Consensus.R.QD-17DHGYISR.E.txt", "R.Q-17DDHGYISR.E", 2),
+ ]
+ ResultTotal = 0
+ ResultCount = 0
+ #TestCases = TestCases[0:1] # TEMP%!
+ for (AnnotationFile, Annotation, Charge) in TestCases:
+ ResultTotal += TestConsensus(AnnotationFile, Annotation, Charge)
+ ResultCount += 1
+ print "OVERALL RESULTS: Average MQGain is %s"%(ResultTotal / float(max(1, ResultCount)))
+
+if __name__ == "__main__":
+ # Given the filename of a cluster, print verbose info:
+ FileName = sys.argv[1]
+ Bob = ConsensusBuilder()
+ Bob.UnpickleCluster(FileName)
+ Bob.DebugPrint()
diff --git a/BuildInspect.py b/BuildInspect.py
new file mode 100644
index 0000000..be2e65c
--- /dev/null
+++ b/BuildInspect.py
@@ -0,0 +1,123 @@
+#Title: BuildInspect.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+Python script to build Inspect. An alternative to makefiles.
+"""
+import sys
+import distutils
+import distutils.command.build
+import distutils.ccompiler
+
+def BuildInspect(BuildNow = 0):
+ InspectSourceFiles = [
+ "base64.c", "BN.c", "BuildMS2DB.c", "ChargeState.c", "CMemLeak.c",
+ "Errors.c", "ExonGraphAlign.c",
+ "FreeMod.c", "IonScoring.c", "LDA.c", "main.c", "Mods.c",
+ "MS2DB.c", "ParentMass.c", "ParseInput.c",
+ "ParseXML.c", "PValue.c",
+ "Run.c", "Score.c", "Scorpion.c", "SNP.c", "Spectrum.c", "Spliced.c",
+ "SpliceDB.c", "SpliceScan.c", "SVM.c", "Tagger.c", "Trie.c", "Utils.c", "TagFile.c"
+ ]
+ ExtraIncludeDirectories = ["expat\\lib",]
+ class MyBuildClass(distutils.command.build.build):
+ def build_opt(self):
+ CC = distutils.ccompiler.new_compiler()
+ #if sys.platform != 'win32':
+ # CC.add_library('m')
+ #import os.path
+ print dir(CC)
+ CC.library_dirs.append("expat/lib/release")
+ if sys.platform == "win32":
+ CC.add_library("libexpat")
+ else:
+ CC.add_library("expat") # not "libexpat", that won't work on Linux.
+ CC.add_library("m")
+ CC.set_include_dirs(ExtraIncludeDirectories)
+ opt_obj = CC.compile(InspectSourceFiles)
+ CC.link_executable(opt_obj, "inspect")
+ def run(self):
+ self.build_opt()
+ distutils.command.build.build.run(self)
+ if BuildNow:
+ Dist = distutils.dist.Distribution()
+ Dist.parse_config_files()
+ Dist.cmdclass["build"] = MyBuildClass
+ Dist.commands = ["build"]
+ Dist.run_commands()
+ else:
+ distutils.core.setup(cmdclass = {"build":MyBuildClass,})
+
+def BuildInspectOnConvey(BuildNow = 0):
+ InspectSourceFiles = [
+ "base64.c", "BN.c", "BuildMS2DB.c", "ChargeState.c", "CMemLeak.c",
+ "Errors.c", "ExonGraphAlign.c",
+ "FreeMod.c", "IonScoring.c", "LDA.c", "main.c", "Mods.c",
+ "MS2DB.c", "ParentMass.c", "ParseInput.c",
+ "ParseXML.c", "PValue.c",
+ "Run.c", "Score.c", "Scorpion.c", "SNP.c", "Spectrum.c", "Spliced.c",
+ "SpliceDB.c", "SpliceScan.c", "SVM.c", "Tagger.c", "Trie.c", "Utils.c", "TagFile.c",
+ "cny_kernel_wrapper.c", "pdk_kernel.c", "kernel.c", "cny_util.c"]
+ ExtraIncludeDirectories = ["expat\\lib",]
+ class MyBuildClass(distutils.command.build.build):
+ def build_opt(self):
+ CC = distutils.ccompiler.new_compiler()
+ #if sys.platform != 'win32':
+ # CC.add_library('m')
+ #import os.path
+ print dir(CC)
+ CC.library_dirs.append("expat/lib/release")
+ if sys.platform == "win32":
+ CC.add_library("libexpat")
+ else:
+ CC.add_library("expat") # not "libexpat", that won't work on Linux.
+ CC.add_library("m")
+ CC.set_include_dirs(ExtraIncludeDirectories)
+ opt_obj = CC.compile(InspectSourceFiles)
+ CC.link_executable(opt_obj, "inspect")
+ def run(self):
+ self.build_opt()
+ distutils.command.build.build.run(self)
+ if BuildNow:
+ Dist = distutils.dist.Distribution()
+ Dist.parse_config_files()
+ Dist.cmdclass["build"] = MyBuildClass
+ Dist.commands = ["build"]
+ Dist.run_commands()
+ else:
+ distutils.core.setup(cmdclass = {"build":MyBuildClass,})
+
+
+if __name__ == "__main__":
+ #sys.argv = ["", "build"]
+ BuildInspect()
+
diff --git a/BuildMGF.py b/BuildMGF.py
new file mode 100644
index 0000000..25980e4
--- /dev/null
+++ b/BuildMGF.py
@@ -0,0 +1,126 @@
+#Title: BuildMGF.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+BuildMGF.py
+This is a part of the PTMAnalysis pipeline. It creates an
+.mgf file of all the consensus spectra created by ComputePTMFeatures.py
+"""
+
+import os
+import sys
+import string
+import getopt
+import MSSpectrum
+from Utils import *
+Initialize()
+from TrainPTMFeatures import FormatBits
+
+class PeptideFeatureBag:
+ pass
+
+class MGFBuilder:
+ def __init__(self):
+ self.ConsensusSpectrumDir = "ptmscore\\LensLTQ-99-5\\spectra"
+ self.PeptideFeatureFileName = "PTMScore\\LensLTQ-99-5.txt"
+ self.MGFPath = "PTMScore\\LensLTQ-99-5.mgf"
+ self.ModifiedPeptides = []
+ def ParsePeptideFeatureFile(self):
+ """
+ Parse the contents of the peptide feature-file. We need to know the
+ path to the consensus spectrum file, the consensus annotation MQScore,
+ and the index.
+ """
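+        # Each non-comment line is expected to be tab-delimited; the columns are
+        # addressed through the FormatBits indices imported from TrainPTMFeatures
+        # (e.g. FormatBits.Peptide, FormatBits.Charge, FormatBits.ConsensusMQScore).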
+ File = open(self.PeptideFeatureFileName, "rb")
+ LineNumber = 0
+ for FileLine in File.xreadlines():
+ LineNumber +=1
+ if FileLine[0] == "#":
+ continue
+ Bits = list(FileLine.replace("\r", "").replace("\n", "").split("\t"))
+ try:
+ ConsensusMQScore = float(Bits[FormatBits.ConsensusMQScore])
+ except:
+ print "** Error: Can't parse consensus MQScore from line %s!"%LineNumber
+ print Bits
+ continue
+ PeptideFeatures = PeptideFeatureBag()
+ PeptideFeatures.Bits = Bits
+ PeptideFeatures.ConsensusMQScore = ConsensusMQScore
+ NiceAnnotation = Bits[FormatBits.Peptide].replace("*", "-")
+ PeptideFeatures.Bits[FormatBits.Peptide] = NiceAnnotation
+ FirstResidue = NiceAnnotation[2]
+ Charge = Bits[FormatBits.Charge]
+ PeptideFeatures.SpectrumPath = os.path.join(self.ConsensusSpectrumDir, FirstResidue, "%s.%s.dta"%(NiceAnnotation, Charge))
+ self.ModifiedPeptides.append(PeptideFeatures)
+ File.close()
+ print "Parsed %s modified peptides from %s file lines."%(len(self.ModifiedPeptides), LineNumber)
+ def PrepareSearchMGF(self):
+ """
+ Concatenate our consensus spectra into an MGF file, for searching.
+ """
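+        # Rough sketch of the output (the exact fields are up to
+        # MSSpectrum.WriteMGFPeaks): one MGF block per consensus spectrum,
+        # labelled with its peptide annotation and scan number, e.g.
+        #   BEGIN IONS
+        #   TITLE=<annotation>.<scan number>
+        #   PEPMASS=<precursor m/z>
+        #   CHARGE=<charge>+
+        #   <m/z> <intensity>
+        #   ...
+        #   END IONS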
+ MGFFile = open(self.MGFPath, "wb")
+ ScanNumber = 1
+ for PeptideIndex in range(len(self.ModifiedPeptides)):
+ if PeptideIndex % 100 == 0:
+ print "Peptide species %s/%s..."%(PeptideIndex, len(self.ModifiedPeptides))
+ PeptideFeatures = self.ModifiedPeptides[PeptideIndex]
+ Spectrum = MSSpectrum.SpectrumClass()
+ Spectrum.ReadPeaks(PeptideFeatures.SpectrumPath)
+ Spectrum.WriteMGFPeaks(MGFFile, PeptideFeatures.Bits[FormatBits.Peptide], ScanNumber)
+ ScanNumber += 1
+ MGFFile.close()
+ def Main(self):
+ self.ParsePeptideFeatureFile()
+ self.PrepareSearchMGF()
+ def ParseCommandLine(self, Arguments):
+ (Options, Args) = getopt.getopt(Arguments, "d:m:")
+ OptionsSeen = {}
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-d":
+ self.PeptideFeatureDir = Value
+ elif Option == "-m":
+ self.MGFPath = Value
+        if not OptionsSeen.has_key("-d"):
+            print UsageInfo
+            sys.exit(-1)
+        self.ConsensusSpectrumDir = os.path.join(self.PeptideFeatureDir, "Spectra")
+        self.PeptideFeatureFileName = os.path.join(self.PeptideFeatureDir, "PTMFeatures.txt")
+
+UsageInfo = """
+BuildMGF arguments:
+ -d [DIR]: Peptide feature directory
+ -m [FILE]: Output .mgf file name
+"""
+
+if __name__ == "__main__":
+ Builder = MGFBuilder()
+ Builder.ParseCommandLine(sys.argv[1:])
+ Builder.Main()
diff --git a/BuildMS2DB.c b/BuildMS2DB.c
new file mode 100644
index 0000000..7e8d6e2
--- /dev/null
+++ b/BuildMS2DB.c
@@ -0,0 +1,2101 @@
+//Title: BuildMS2DB.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+// BuildMS2DB.c is responsible for building a splice-tolerant database
+// from an input file in .gff format. Our overall plan is as follows:
+
+//- Parse all lines of the .gff file. Discard those lines which don't come from
+// the correct chromosome and strand. Build up a linked list of exons,
+// which is indexed by a hash based on (start, end, reading-frame).
+//   NOTE: Reading frame is defined as "the genomic position of the first base pair in a codon, modulo 3" (worked example below).
+// NOTE: Records of type EST give rise to three exons (one for each reading frame),
+// which will be pruned later.
+// NOTE: We may parse SEVERAL gff files!
+//- Merge and split exons, to produce a minimal disjoint list.
+//- Prune exons with short ORFs.
+//- ITERATE, until all exons are covered:
+// - Take the first uncovered exon. Grab all the exons which it links to. Build
+// a corresponding gene record. Flag these exons as covered.
+//- Write cross-reference records for the GFFGenes.
+//
+// IMPORTANT NOTE: The data structures built in this file aren't used during a search.
+// When it comes to searching, look in MS2DB.c, which uses different (simpler) data
+// structures to track this stuff.
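+//
+// For example, a codon whose first base sits at (zero-based) genomic position
+// 301 is in reading frame 301 % 3 = 1; two exon records covering the same
+// start/end but in different frames are therefore kept as distinct hash entries.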
+
+#include "Utils.h"
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include "Trie.h"
+#include "Inspect.h"
+#include "Spliced.h"
+#include "SpliceDB.h"
+#include "SNP.h"
+#include "Errors.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Macros:
+#define MAX_NAME 256
+#define EXON_HASH_SIZE 10000
+
+#define MS2_EXON_NONE 0
+#define MS2_EXON_CUSTOMAA 1
+
+#define CODON_LENGTH 3
+
+//#define GFF_QUICKPARSE
+
+// Use this flag for a single-base exon which covers the middle
+// base of a codon, and which has only one link forward to an
+// adjacent exon.
+// The flag reflects this scenario (where B is the one-base exon):
+// AAAA---(X)---BCCCC
+// Rather than these scenarios:
+// AAAAB---(X)---CCC
+// AAAA---B---(X)---CCC
+#define MS2_EXON_CUSTOMAA_HEAD 2
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Structs:
+typedef struct MS2Exon
+{
+ int Start;
+ int End;
+ int ReadingFrame;
+ struct MS2Exon* Next;
+ struct MS2Exon* Prev;
+ char Prefix[CODON_LENGTH];
+ char Suffix[CODON_LENGTH];
+ struct MS2Edge* FirstForward;
+ struct MS2Edge* LastForward;
+ struct MS2Edge* FirstBackward;
+ struct MS2Edge* LastBackward;
+ struct MS2Gene* Gene;
+ char SeqName[MAX_NAME + 1];
+
+ int Index; // Assigned to a completed gene
+ char* Sequence;
+ // Flags for special bookkeeping. "bookkeeping" and its derivatives are perhaps the
+ // only English words with three consecutive double-letters.
+ int Flags;
+ // CustomAA is set only for length-1 exons that are part of special bridges.
+ char CustomAA;
+} MS2Exon;
+
+typedef struct MS2Edge
+{
+ struct MS2Exon* LinkTo;
+ struct MS2Exon* LinkFrom;
+ struct MS2Edge* Next;
+ // The amino acid for this edge can be set specially, up-front:
+ char SpecialAA;
+} MS2Edge;
+
+// A wrapper for an exon. This lets us stick an exon into multiple linked lists.
+typedef struct MS2ExonNode
+{
+ struct MS2ExonNode* Next;
+ struct MS2ExonNode* Prev;
+ MS2Exon* Exon;
+} MS2ExonNode;
+
+typedef struct IntNode
+{
+ struct IntNode* Next;
+ int Value;
+} IntNode;
+
+typedef struct MS2CrossReference
+{
+ struct MS2Gene* Gene;
+ struct GFFGeneClass* GFFGene;
+ IntNode* FirstExonID;
+ IntNode* LastExonID;
+ struct MS2CrossReference* Next;
+} MS2CrossReference;
+
+typedef struct MS2Gene
+{
+ MS2ExonNode* FirstExon;
+ MS2ExonNode* LastExon;
+ struct MS2Gene* Next;
+ int Index; // for debugging, mostly!
+ MS2CrossReference* FirstCrossReference;
+ MS2CrossReference* LastCrossReference;
+} MS2Gene;
+
+// Singleton class tracking high-level data like the exon hashes.
+typedef struct MS2Builder
+{
+ FILE* GenomeFile;
+ int ForwardFlag;
+ MS2ExonNode** ExonHash;
+ MS2Exon* FirstExon;
+ MS2Exon* LastExon;
+ struct GFFGeneClass* FirstGFFGene;
+ struct GFFGeneClass* LastGFFGene;
+ MS2Gene* FirstGene;
+ MS2Gene* LastGene;
+ char ChromosomeName[MAX_NAME];
+ int ExonCount;
+ int GeneCount;
+ int VerboseFlag;
+} MS2Builder;
+
+typedef struct GFFExonClass
+{
+ int Start;
+ int End;
+ int ReadingFrame;
+ struct GFFExonClass* Next;
+} GFFExonClass;
+
+typedef struct GFFGeneClass
+{
+ char Name[MAX_NAME + 1];
+ char DatabaseName[MAX_NAME + 1];
+
+ GFFExonClass* FirstExon;
+ GFFExonClass* LastExon;
+ struct GFFGeneClass* Next;
+ struct MS2CrossReference* CrossReference;
+} GFFGeneClass;
+
+typedef struct GFFParser
+{
+ // Link to our builder, where the REAL data (not transient parse-state) lives:
+ MS2Builder* Builder;
+ // Keep a link to the current gene, so we can add exons to it:
+ struct GFFGeneClass* CurrentGene;
+ // Remember our filename (mostly for error reporting)
+ char* CurrentFileName;
+ // Keep a link to the last exon of the current gene, so that we
+ // can add edges between exons as needed:
+ MS2Exon* PrevExon;
+} GFFParser;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Forward declarations:
+void DebugPrintMS2Builder(MS2Builder* Builder, char* Notes);
+void ExonInheritOneForwardEdge(MS2Exon* Exon, MS2Edge* OldEdge);
+void ExonInheritOneBackwardEdge(MS2Exon* Exon, MS2Edge* OldEdge);
+void FreeMS2CrossReference(MS2CrossReference* CR);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Constructor functions:
+MS2Exon* NewExon(int Start, int End, int ReadingFrame)
+{
+ MS2Exon* Exon;
+ Exon = (MS2Exon*)calloc(1, sizeof(MS2Exon));
+ Exon->Start = Start;
+ Exon->End = End;
+ Exon->ReadingFrame = ReadingFrame;
+ return Exon;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Destructor functions:
+
+void FreeGFFGene(GFFGeneClass* Gene)
+{
+ GFFExonClass* Exon;
+ GFFExonClass* Prev;
+ if (!Gene)
+ {
+ return;
+ }
+ FreeMS2CrossReference(Gene->CrossReference);
+ Prev = NULL;
+ for (Exon = Gene->FirstExon; Exon; Exon = Exon->Next)
+ {
+ SafeFree(Prev);
+ Prev = Exon;
+ }
+ SafeFree(Prev);
+ SafeFree(Gene);
+}
+
+// Free an MS2Exon, and its associated edges.
+void FreeMS2Exon(MS2Exon* Exon)
+{
+ MS2Edge* Edge;
+ MS2Edge* Prev;
+ //
+ if (!Exon)
+ {
+ return;
+ }
+    // Free the SEQUENCE:
+ SafeFree(Exon->Sequence);
+ Exon->Sequence = NULL;
+ // Free the list of FORWARD edges:
+ Prev = NULL;
+ for (Edge = Exon->FirstForward; Edge; Edge = Edge->Next)
+ {
+ SafeFree(Prev);
+ Prev = Edge;
+ }
+ SafeFree(Prev);
+ // Free the list of BACKWARD edges:
+ Prev = NULL;
+ for (Edge = Exon->FirstBackward; Edge; Edge = Edge->Next)
+ {
+ SafeFree(Prev);
+ Prev = Edge;
+ }
+ SafeFree(Prev);
+ SafeFree(Exon);
+}
+
+void FreeExonHash(MS2Builder* Builder)
+{
+ int HashIndex;
+ MS2ExonNode* Node;
+ MS2ExonNode* Prev;
+ //
+ if (!Builder->ExonHash)
+ {
+ return;
+ }
+ for (HashIndex = 0; HashIndex < EXON_HASH_SIZE; HashIndex++)
+ {
+ Prev = NULL;
+ for (Node = Builder->ExonHash[HashIndex]; Node; Node = Node->Next)
+ {
+ SafeFree(Prev);
+ Prev = Node;
+ }
+ SafeFree(Prev);
+ }
+ SafeFree(Builder->ExonHash);
+ Builder->ExonHash = NULL;
+}
+
+// Free an MS2CrossReference, and its associated list of integers.
+void FreeMS2CrossReference(MS2CrossReference* CR)
+{
+ IntNode* Node;
+ IntNode* Prev;
+ if (!CR)
+ {
+ return;
+ }
+ Prev = NULL;
+ for (Node = CR->FirstExonID; Node; Node = Node->Next)
+ {
+ SafeFree(Prev);
+ Prev = Node;
+ }
+ SafeFree(Prev);
+ SafeFree(CR);
+}
+
+// Free an MS2Gene, and its associated list of MS2ExonNodes
+void FreeMS2Gene(MS2Gene* Gene)
+{
+ MS2ExonNode* Node;
+ MS2ExonNode* Prev = NULL;
+ //
+ if (!Gene)
+ {
+ return;
+ }
+ for (Node = Gene->FirstExon; Node; Node = Node->Next)
+ {
+ SafeFree(Prev);
+ Prev = Node;
+ }
+    SafeFree(Prev);
+    SafeFree(Gene);
+}
+
+void FreeGFFGenes(MS2Builder* Builder)
+{
+ GFFGeneClass* Gene;
+ GFFGeneClass* Prev;
+ //
+ Prev = NULL;
+ for (Gene = Builder->FirstGFFGene; Gene; Gene = Gene->Next)
+ {
+ FreeGFFGene(Prev);
+ Prev = Gene;
+ }
+ FreeGFFGene(Prev);
+ Builder->FirstGFFGene = NULL;
+ Builder->LastGFFGene = NULL;
+}
+
+void FreeMS2Genes(MS2Builder* Builder)
+{
+ MS2Gene* Gene;
+ MS2Gene* Prev = NULL;
+ for (Gene = Builder->FirstGene; Gene; Gene = Gene->Next)
+ {
+ FreeMS2Gene(Prev);
+ Prev = Gene;
+ }
+ FreeMS2Gene(Prev);
+ Builder->FirstGene = NULL;
+ Builder->LastGene = NULL;
+}
+
+void FreeMS2Exons(MS2Builder* Builder)
+{
+ MS2Exon* Exon;
+ MS2Exon* Prev = NULL;
+ //
+ for (Exon = Builder->FirstExon; Exon; Exon = Exon->Next)
+ {
+ FreeMS2Exon(Prev);
+ Prev = Exon;
+ }
+ FreeMS2Exon(Prev);
+ Builder->FirstExon = NULL;
+ Builder->LastExon = NULL;
+
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Other functions:
+
+void MS2CrossReferenceAddID(MS2CrossReference* CR, int ID)
+{
+ IntNode* Node;
+ //
+ Node = (IntNode*)calloc(1, sizeof(IntNode));
+ Node->Value = ID;
+ if (CR->LastExonID)
+ {
+ CR->LastExonID->Next = Node;
+ }
+ else
+ {
+ CR->FirstExonID = Node;
+ }
+ CR->LastExonID = Node;
+}
+
+// Return true if we successfully cover everything from start...end
+int BuildGFFCrossReference(MS2Builder* Builder, MS2CrossReference* CR,
+ MS2Gene* Gene, int Start, int End, int ReadingFrame)
+{
+ MS2Exon* Exon;
+ MS2ExonNode* Node;
+ int Result = 0;
+ //
+ for (Node = Gene->FirstExon; Node; Node = Node->Next)
+ {
+ Exon = Node->Exon;
+ // Ignore exons with wrong reading frame:
+ if (Exon->ReadingFrame != ReadingFrame)
+ {
+ continue;
+ }
+ // Handle perfect-overlap:
+ if (Exon->Start == Start && Exon->End == End)
+ {
+ MS2CrossReferenceAddID(CR, Exon->Index);
+ return 1;
+ }
+ // Handle partial-overlap:
+ if (Builder->ForwardFlag && Exon->Start == Start && Exon->End <= End)
+ {
+ MS2CrossReferenceAddID(CR, Exon->Index);
+ return BuildGFFCrossReference(Builder, CR, Gene, Exon->End, End, ReadingFrame);
+ }
+ if (!Builder->ForwardFlag && Exon->End == End && Exon->Start >= Start)
+ {
+ MS2CrossReferenceAddID(CR, Exon->Index);
+ return BuildGFFCrossReference(Builder, CR, Gene, Start, Exon->Start, ReadingFrame);
+ }
+ }
+ return Result;
+}
+
+void BuildGFFCrossReferences(MS2Builder* Builder)
+{
+ // Iterate over GFFGenes. Inner loop over MS2Genes. When you find an MS2Gene which covers
+ // any of the GFFGene, build a cross-reference.
+ GFFGeneClass* GFFGene;
+ GFFExonClass* GFFExon;
+ MS2Gene* Gene;
+ MS2Exon* Exon;
+ MS2ExonNode* Node;
+ int OverlapFlag;
+ int CoveredFlag;
+ int CoverageComplete;
+ int Result;
+ //
+ for (GFFGene = Builder->FirstGFFGene; GFFGene; GFFGene = GFFGene->Next)
+ {
+ CoveredFlag = 0;
+ for (Gene = Builder->FirstGene; Gene; Gene = Gene->Next)
+ {
+ // First, check whether the gene overlaps this GFF gene's first exon:
+ OverlapFlag = 0;
+ GFFExon = GFFGene->FirstExon;
+ for (Node = Gene->FirstExon; Node; Node = Node->Next)
+ {
+ Exon = Node->Exon;
+ if (Exon->Start < GFFExon->End && Exon->End > GFFExon->Start && Exon->ReadingFrame == GFFExon->ReadingFrame)
+ {
+ OverlapFlag = 1;
+ }
+ }
+ if (!OverlapFlag)
+ {
+ continue;
+ }
+ // They overlap - create a cross-reference and start adding exon IDs to the list!
+ GFFGene->CrossReference = (MS2CrossReference*)calloc(1, sizeof(MS2CrossReference));
+ GFFGene->CrossReference->GFFGene = GFFGene;
+ GFFGene->CrossReference->Gene = Gene;
+ // the MS2Gene keeps a list of its cross references:
+ if (Gene->LastCrossReference)
+ {
+ Gene->LastCrossReference->Next = GFFGene->CrossReference;
+ }
+ else
+ {
+ Gene->FirstCrossReference = GFFGene->CrossReference;
+ }
+ Gene->LastCrossReference = GFFGene->CrossReference;
+ CoverageComplete = 1;
+ for (GFFExon = GFFGene->FirstExon; GFFExon; GFFExon = GFFExon->Next)
+ {
+ Result = BuildGFFCrossReference(Builder, GFFGene->CrossReference, Gene, GFFExon->Start, GFFExon->End, GFFExon->ReadingFrame);
+ if (!Result)
+ {
+ CoverageComplete = 0;
+ }
+ }
+ if (!CoverageComplete)
+ {
+ REPORT_ERROR_S(23, GFFGene->Name);
+ }
+ CoveredFlag = 1;
+ break;
+ }
+ // Sanity check: The GFF gene MUST be covered (since we did, after all, create exons for all these
+ // GFF exons!)
+ if (!CoveredFlag)
+ {
+ REPORT_ERROR_S(22, GFFGene->Name);
+ }
+ }
+}
+
+void AddMS2Exon(MS2Builder* Builder, MS2Exon* Exon)
+{
+ if (!Builder->FirstExon)
+ {
+ Builder->FirstExon = Exon;
+ }
+ else
+ {
+ Builder->LastExon->Next = Exon;
+ Exon->Prev = Builder->LastExon;
+ }
+ Builder->LastExon = Exon;
+ Builder->ExonCount++;
+}
+
+void LinkExonForward(MS2Exon* FromExon, MS2Exon* ToExon)
+{
+ // Add a link forward from FromExon to ToExon, as well as a reciprocal link back.
+ MS2Edge* Edge;
+
+ // Sanity checking: ToExon starts after FromExon
+ INSPECT_ASSERT(ToExon->Start >= FromExon->End);
+
+ // If the two exons are ALREADY linked, then return immediately:
+ for (Edge = FromExon->FirstForward; Edge; Edge = Edge->Next)
+ {
+ if (Edge->LinkTo == ToExon)
+ {
+ return;
+ }
+ }
+ // Sanity check: There's no forward link, therefore there must be no reciprocal link:
+ for (Edge = ToExon->FirstBackward; Edge; Edge = Edge->Next)
+ {
+ INSPECT_ASSERT(Edge->LinkTo != FromExon);
+ }
+ // Add an edge linking FORWARD:
+ Edge = (MS2Edge*)calloc(1, sizeof(MS2Edge));
+ Edge->LinkFrom = FromExon;
+ Edge->LinkTo = ToExon;
+ if (!FromExon->FirstForward)
+ {
+ FromExon->FirstForward = Edge;
+ }
+ else
+ {
+ FromExon->LastForward->Next = Edge;
+ }
+ FromExon->LastForward = Edge;
+
+ // Add a reciprocal edge linking BACKWARD:
+ Edge = (MS2Edge*)calloc(1, sizeof(MS2Edge));
+ Edge->LinkFrom = ToExon;
+ Edge->LinkTo = FromExon;
+ if (!ToExon->FirstBackward)
+ {
+ ToExon->FirstBackward = Edge;
+ }
+ else
+ {
+ ToExon->LastBackward->Next = Edge;
+ }
+ ToExon->LastBackward = Edge;
+}
+void RemoveBackwardEdge(MS2Exon* Exon, MS2Exon* LinkedExon)
+{
+ MS2Edge* Prev = NULL;
+ MS2Edge* Edge;
+ for (Edge = Exon->FirstBackward; Edge; Edge = Edge->Next)
+ {
+ if (Edge->LinkTo == LinkedExon)
+ {
+ // Remove this edge!
+ if (Prev)
+ {
+ Prev->Next = Edge->Next;
+ }
+ else
+ {
+ Exon->FirstBackward = Edge->Next;
+ }
+ if (Exon->LastBackward == Edge)
+ {
+ Exon->LastBackward = Prev;
+ }
+ SafeFree(Edge);
+ break;
+ }
+ Prev = Edge;
+ }
+}
+
+void RemoveForwardEdge(MS2Exon* Exon, MS2Exon* LinkedExon)
+{
+ MS2Edge* Prev = NULL;
+ MS2Edge* Edge;
+ for (Edge = Exon->FirstForward; Edge; Edge = Edge->Next)
+ {
+ if (Edge->LinkTo == LinkedExon)
+ {
+ // Remove this edge!
+ if (Prev)
+ {
+ Prev->Next = Edge->Next;
+ }
+ else
+ {
+ Exon->FirstForward = Edge->Next;
+ }
+ if (Exon->LastForward == Edge)
+ {
+ Exon->LastForward = Prev;
+ }
+ SafeFree(Edge);
+ break;
+ }
+ Prev = Edge;
+ }
+}
+
+void ExonInheritForwardEdges(MS2Exon* Exon, MS2Exon* DeadExon)
+{
+ MS2Edge* Edge;
+ MS2Edge* Prev = NULL;
+ //
+ // Sanity checking: Exon and DeadExon share their right endpoint.
+ INSPECT_ASSERT(Exon->End == DeadExon->End);
+ for (Edge = DeadExon->FirstForward; Edge; Edge = Edge->Next)
+ {
+ // Add a link forward from Exon->LinkToExon:
+ LinkExonForward(Exon, Edge->LinkTo);
+ // Remove reciprocal exon link from LinkTo back to DeadExon:
+ RemoveBackwardEdge(Edge->LinkTo, DeadExon);
+ SafeFree(Prev);
+ Prev = Edge;
+ }
+ SafeFree(Prev);
+ DeadExon->FirstForward = NULL;
+ DeadExon->LastForward = NULL;
+}
+
+void ExonInheritBackwardEdges(MS2Exon* Exon, MS2Exon* DeadExon)
+{
+ MS2Edge* Edge;
+ MS2Edge* Prev = NULL;
+ //
+    // Sanity checking: Exon and DeadExon share their left endpoint.
+ INSPECT_ASSERT(Exon->Start == DeadExon->Start);
+ for (Edge = DeadExon->FirstBackward; Edge; Edge = Edge->Next)
+ {
+ // Add a link forward from LinkToExon->Exon:
+ LinkExonForward(Edge->LinkTo, Exon);
+ // Remove reciprocal exon link from LinkTo to DeadExon:
+ RemoveForwardEdge(Edge->LinkTo, DeadExon);
+ SafeFree(Prev);
+ Prev = Edge;
+ }
+ SafeFree(Prev);
+ DeadExon->FirstBackward = NULL;
+ DeadExon->LastBackward = NULL;
+}
+
+// Given this start, end, and reading frame, look up the corresponding
+// exon in Builder->ExonHash. If the exon doesn't exist yet, then create it.
+// Return the exon.
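+// For example, an exon with Start=1000, End=1240 and reading frame 2 lands in
+// bucket (1000 + 3 * 1240 + 2) % 10000 = 4722.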
+MS2Exon* HashExon(MS2Builder* Builder, int Start, int End, int ReadingFrame)
+{
+ int HashValue;
+ MS2ExonNode* Node;
+ MS2ExonNode* Prev = NULL;
+ MS2Exon* Exon;
+ //
+ HashValue = (Start + CODON_LENGTH * End + ReadingFrame) % EXON_HASH_SIZE;
+ for (Node = Builder->ExonHash[HashValue]; Node; Node = Node->Next)
+ {
+ if (Node->Exon->Start == Start && Node->Exon->End == End && Node->Exon->ReadingFrame == ReadingFrame)
+ {
+ return (Node->Exon);
+ }
+ Prev = Node;
+ }
+ // There's no node for this exon yet. Add one:
+ Exon = (MS2Exon*)calloc(1, sizeof(MS2Exon));
+ Exon->Start = Start;
+ Exon->End = End;
+ Exon->ReadingFrame = ReadingFrame;
+ Node = (MS2ExonNode*)calloc(1, sizeof(MS2ExonNode));
+ Node->Exon = Exon;
+ if (Prev)
+ {
+ Prev->Next = Node;
+ Node->Prev = Prev;
+ }
+ else
+ {
+ Builder->ExonHash[HashValue] = Node;
+ }
+ AddMS2Exon(Builder, Exon);
+ return Exon;
+}
+
+char* GetGeneNameFromGFF(char* GFFToken)
+{
+    // Use strchr rather than strtok here, so that we don't clobber the
+    // caller's strtok state while it is still splitting the notes field:
+    char* Equals = strchr(GFFToken, '=');
+    if (!Equals)
+    {
+        return NULL;
+    }
+    *Equals = '\0';
+    if (!strcmp(GFFToken, "Parent") || !strcmp(GFFToken, "parent"))
+    {
+        return Equals + 1;
+    }
+    return NULL;
+}
+
+// Parse one line of a .gff file. Callback for ParseFileByLines
+int HandleGFFFileLine(int LineNumber, int FilePos, char* LineBuffer, void* ParseData)
+{
+ GFFParser* Parser;
+ MS2Builder* Builder;
+ GFFGeneClass* GFFGene;
+ GFFExonClass* GFFExon;
+
+ char* SeqName;
+ char* TokTemp;
+ char* GeneNameTemp;
+ char* GeneName;
+ char* DatabaseName;
+ char* DummyStr;
+ char* IntervalType;
+ int SyntaxErrorFlag = 0;
+ int Start;
+ int End;
+ int ReadingFrame;
+ MS2Exon* Exon;
+ Parser = (GFFParser*)ParseData;
+ Builder = Parser->Builder;
+ // Break the line up by tabs.
+ // Bit 0: Seq name
+ SeqName = strtok(LineBuffer, "\t");
+ // Bit 1: Source (used to populate the Database field)
+ DatabaseName = strtok(NULL, "\t");
+ if (!DatabaseName)
+ {
+ SyntaxErrorFlag = 1;
+ goto cleanup;
+ }
+ // Debugging option:
+#ifdef GFF_QUICKPARSE
+ if (LineNumber > 1000)
+ {
+ return 0;
+ }
+#endif
+ // Bit 2: interval type (est or exon)
+ IntervalType = strtok(NULL, "\t");
+ if (!IntervalType)
+ {
+ SyntaxErrorFlag = 1;
+ goto cleanup;
+ }
+ // Bit 3: start
+ DummyStr = strtok(NULL, "\t");
+ if (!DummyStr)
+ {
+ SyntaxErrorFlag = 1;
+ goto cleanup;
+ }
+ Start = atoi(DummyStr) - 1; // fix one-based numbering!
+
+ // Bit 4: end
+ DummyStr = strtok(NULL, "\t");
+ if (!DummyStr)
+ {
+ SyntaxErrorFlag = 1;
+ goto cleanup;
+ }
+ End = atoi(DummyStr);
+
+ // Error checking:
+ if (Start < 0 || End < 0 || Start >= End)
+ {
+ REPORT_ERROR_IIIS(20, Start, End, LineNumber, Parser->CurrentFileName);
+ }
+
+ // Bit 5: score (ignored)
+ DummyStr = strtok(NULL, "\t");
+ if (!DummyStr)
+ {
+ SyntaxErrorFlag = 1;
+ goto cleanup;
+ }
+ // Bit 6: forward flag
+ DummyStr = strtok(NULL, "\t");
+ if (!DummyStr || !*DummyStr)
+ {
+ SyntaxErrorFlag = 1;
+ goto cleanup;
+ }
+ // Skip over this exon, if it comes from the wrong strand:
+ if (*DummyStr == '+')
+ {
+ if (!Builder->ForwardFlag)
+ {
+ goto cleanup;
+ }
+ }
+ else if (*DummyStr == '-')
+ {
+ if (Builder->ForwardFlag)
+ {
+ goto cleanup;
+ }
+ }
+ else
+ {
+ SyntaxErrorFlag = 1;
+ goto cleanup;
+ }
+    // Bit 7: reading frame
+ DummyStr = strtok(NULL, "\t");
+ if (!DummyStr)
+ {
+ SyntaxErrorFlag = 1;
+ goto cleanup;
+ }
+ ReadingFrame = atoi(DummyStr);
+ if (ReadingFrame < 0 || ReadingFrame > 2)
+ {
+ SyntaxErrorFlag = 1;
+ goto cleanup;
+ }
+
+    // Bit 8: attributes/notes; we assume they contain the parent (gene) name
+ DummyStr = strtok(NULL,"\t");
+
+ GeneName = SeqName;
+
+ if(DummyStr)
+ {
+
+ TokTemp = strtok(DummyStr,";");
+ while(TokTemp)
+ {
+ GeneNameTemp = GetGeneNameFromGFF(TokTemp);
+ if(GeneNameTemp)
+ {
+ GeneName = GeneNameTemp;
+ break;
+ }
+ TokTemp = strtok(NULL,";");
+ }
+ }
+
+ /*printf("CurrGFFLine:\n");
+ printf(" name:%s\n",GeneName);
+ printf(" Strand:%d\n",Builder->ForwardFlag);
+ printf(" frame: %d\n",ReadingFrame);
+ */
+ /////////////////////////////////////////////////////////////////////////////////
+ // We've parsed a valid gff file line. Create a new GFFGene (if necessary), and
+ // add a new GFFExon to our current GFFGene.
+ // Fix up the reading frame. As always, reading frame is the modulus of the first base
+ // pair of a codon. In GFF format, reading frame is the number of bases to be skipped over
+ // before the first base pair of a codon.
+ if (Builder->ForwardFlag)
+ {
+ ReadingFrame = (Start + ReadingFrame) % CODON_LENGTH;
+ }
+ else
+ {
+ ReadingFrame = (End - 1 - ReadingFrame) % CODON_LENGTH;
+ }
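+    // Worked example (forward strand): a GFF record with frame=2 and a
+    // zero-based Start of 100 skips two bases, so its first complete codon
+    // begins at 102 and the internal reading frame is (100 + 2) % 3 = 0.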
+ // Create a new GFFGene, if necessary:
+ if (!Parser->CurrentGene || CompareStrings(GeneName, Parser->CurrentGene->Name))
+ {
+ GFFGene = (GFFGeneClass*)calloc(1, sizeof(GFFGeneClass));
+ strncpy(GFFGene->Name, GeneName, MAX_NAME);
+ strncpy(GFFGene->DatabaseName, DatabaseName, MAX_NAME);
+ //strncpy(GFFGene->SeqName,SeqName,MAX_NAME);
+ Parser->CurrentGene = GFFGene;
+ if (!Builder->FirstGFFGene)
+ {
+ Builder->FirstGFFGene = GFFGene;
+ }
+ else
+ {
+ Builder->LastGFFGene->Next = GFFGene;
+ }
+ Builder->LastGFFGene = GFFGene;
+ Parser->PrevExon = NULL;
+ }
+ else
+ {
+ // We're continuing along the same GFFGene, so we can link the previous exon to this one.
+ // (ASSUMPTION: Exons for the same gene are linked by introns, and come IN ORDER!)
+ }
+ // Append a new GFFExon to the current GFFGene:
+ GFFExon = (GFFExonClass*)calloc(1, sizeof(GFFExonClass));
+ GFFExon->Start = Start;
+ GFFExon->End = End;
+ GFFExon->ReadingFrame = ReadingFrame;
+ if (!Parser->CurrentGene->FirstExon)
+ {
+ Parser->CurrentGene->FirstExon = GFFExon;
+ }
+ else
+ {
+ Parser->CurrentGene->LastExon->Next = GFFExon;
+ }
+ Parser->CurrentGene->LastExon = GFFExon;
+
+ // Construct an MS2Exon:
+ Exon = HashExon(Builder, Start, End, ReadingFrame);
+
+ // Add a link, if necessary, between this exon and the previous:
+ if (Parser->PrevExon)
+ {
+ // Report an error if the exons overlap:
+ if (Parser->PrevExon->End > Exon->Start && Parser->PrevExon->Start < Exon->End)
+ {
+ REPORT_ERROR_IIII(17, Parser->PrevExon->Start, Parser->PrevExon->End, Exon->Start, Exon->End);
+ }
+ else if (Parser->PrevExon->Start < Exon->Start)
+ {
+ // Exons listed from low genome coords to high. Typical for forward strand.
+ LinkExonForward(Parser->PrevExon, Exon);
+ }
+ else
+ {
+ // Exons listed from high genome coords to low. Typical for reverse strand.
+ LinkExonForward(Exon, Parser->PrevExon);
+ }
+
+ }
+ Parser->PrevExon = Exon;
+
+ // Report syntax errors:
+cleanup:
+ if (SyntaxErrorFlag)
+ {
+ REPORT_ERROR_IS(14, LineNumber, Parser->CurrentFileName);
+ return 0;
+ }
+ return 1;
+
+}
+
+// Iterate over all GFF files. Parse each one, using the HandleGFFFileLine callback to do the real work.
+void ParseGFFFiles(MS2Builder* Builder)
+{
+ StringNode* GFFNode;
+ FILE* GFFFile;
+ GFFParser* Parser;
+
+ for (GFFNode = GlobalOptions->FirstGFFFileName; GFFNode; GFFNode = GFFNode->Next)
+ {
+ GFFFile = fopen(GFFNode->String, "rb");
+ if (!GFFFile)
+ {
+ REPORT_ERROR_S(8, GFFNode->String);
+ continue;
+ }
+ Parser = (GFFParser*)calloc(1, sizeof(GFFParser));
+ Parser->Builder = Builder;
+ Parser->CurrentFileName = GFFNode->String;
+ ParseFileByLines(GFFFile, HandleGFFFileLine, Parser, 0);
+ free(Parser);
+ fclose(GFFFile);
+ }
+}
+
+// Remove an exon from the master linked-list:
+void DeleteMS2Exon(MS2Builder* Builder, MS2Exon* Exon)
+{
+ if (Exon == Builder->FirstExon)
+ {
+ Builder->FirstExon = Builder->FirstExon->Next;
+ }
+ if (Exon == Builder->LastExon)
+ {
+ Builder->LastExon = Builder->LastExon->Prev;
+ }
+ if (Exon->Next)
+ {
+ Exon->Next->Prev = Exon->Prev;
+ }
+ if (Exon->Prev)
+ {
+ Exon->Prev->Next = Exon->Next;
+ }
+ FreeMS2Exon(Exon);
+ Builder->ExonCount--;
+}
+
+// When we're making our double-iteration over the exon linked list, it's important to keep
+// track of where we are. (Note that ExonA itself may be deleted, so we can't simply finish
+// the loop and then go to ExonA->Next!) The variable NextExonA is what ExonA should be set to
+// for the next A-loop. And normally NextExonA is simply equal to ExonA->Next. However,
+// if we delete exons A and B, *and* exon B happens to be ExonA->Next, then on the next time through
+// the loop, ExonA must be shifted two positions forward. Trust me.
+#define DELETE_EXON_B()\
+{\
+ if (NextExonA == ExonB)\
+ {\
+ NextExonA = ExonB->Next;\
+ }\
+ DeleteMS2Exon(Builder, ExonB);\
+}
+
+// Iterate over all pairs of exons in the builder. If the exons overlap, split them!
+// We handle overlap in many different special cases, each of which is straightforward.
+void SplitOverlappingExons(MS2Builder* Builder)
+{
+ MS2Exon* Exon1;
+ MS2Exon* Exon2;
+ MS2Exon* Exon3;
+ MS2Exon* ExonA;
+ MS2Exon* NextExonA;
+ MS2Exon* ExonB;
+ MS2Exon* NextExonB;
+ int ReadingFrame;
+ int OverlapFlag = 0;
+ //
+ ExonA = Builder->FirstExon;
+ while (ExonA)
+ {
+ if (OverlapFlag)
+ {
+ //DebugPrintMS2Builder(Builder, "A");
+ OverlapFlag = 0;
+ }
+
+ //printf("ExonA[%d]: %d-%d\n", ExonA->ReadingFrame, ExonA->Start, ExonA->End);
+ ReadingFrame = ExonA->ReadingFrame;
+ NextExonA = ExonA->Next;
+ // Loop B:
+ ExonB = ExonA->Next;
+ while (ExonB)
+ {
+ if (OverlapFlag)
+ {
+ //DebugPrintMS2Builder(Builder, "B");
+ OverlapFlag = 0;
+ }
+ // Compare exon A to exon B:
+ if (ExonA->ReadingFrame != ExonB->ReadingFrame)
+ {
+ ExonB = ExonB->Next;
+ continue;
+ }
+ if (ExonA->End <= ExonB->Start)
+ {
+ ExonB = ExonB->Next;
+ continue;
+ }
+ if (ExonA->Start >= ExonB->End)
+ {
+ ExonB = ExonB->Next;
+ continue;
+ }
+ NextExonB = ExonB->Next;
+ //printf(" %d-%d Overlaps with %d-%d\n", ExonA->Start, ExonA->End, ExonB->Start, ExonB->End);
+ OverlapFlag = 1;
+ ////////////////////////////////////////////////////////////////////////////////////
+ // There's overlap. Handle each case in turn.
+ if (ExonA->Start == ExonB->Start && ExonA->End == ExonB->End)
+ {
+ ExonInheritBackwardEdges(ExonA, ExonB);
+ ExonInheritForwardEdges(ExonA, ExonB);
+ DELETE_EXON_B();
+ ExonB = NextExonB;
+ continue;
+ }
+ if (ExonA->Start == ExonB->Start)
+ {
+ if (ExonA->End > ExonB->End)
+ {
+ // A-----
+ // B---
+ // 11
+ Exon1 = NewExon(ExonB->End, ExonA->End, ReadingFrame);
+ AddMS2Exon(Builder, Exon1);
+ ExonInheritForwardEdges(Exon1, ExonA);
+ ExonInheritBackwardEdges(ExonB, ExonA);
+ DeleteMS2Exon(Builder, ExonA);
+ break;
+ }
+ else
+ {
+ // A---
+ // B-----
+ // 11
+ Exon1 = NewExon(ExonA->End, ExonB->End, ReadingFrame);
+ AddMS2Exon(Builder, Exon1);
+ ExonInheritForwardEdges(Exon1, ExonB);
+ ExonInheritBackwardEdges(ExonA, ExonB);
+ DELETE_EXON_B();
+ ExonB = NextExonB;
+ continue;
+ }
+ } // end: ExonA->Start == ExonB->Start
+ if (ExonA->End == ExonB->End)
+ {
+ if (ExonA->Start < ExonB->Start)
+ {
+ // A-----
+ // B ---
+ // 11
+ Exon1 = NewExon(ExonA->Start, ExonB->Start, ReadingFrame);
+ AddMS2Exon(Builder, Exon1);
+ ExonInheritForwardEdges(ExonB, ExonA);
+ ExonInheritBackwardEdges(Exon1, ExonA);
+ DeleteMS2Exon(Builder, ExonA);
+ break;
+ }
+ else
+ {
+ // A ---
+ // B-----
+ // 11
+ Exon1 = NewExon(ExonB->Start, ExonA->Start, ReadingFrame);
+ AddMS2Exon(Builder, Exon1);
+ ExonInheritForwardEdges(ExonA, ExonB);
+ ExonInheritBackwardEdges(Exon1, ExonB);
+ DELETE_EXON_B();
+ ExonB = NextExonB;
+ continue;
+ }
+ } // end: ExonA->End == ExonB->End
+ if (ExonA->Start < ExonB->Start && ExonA->End < ExonB->End)
+ {
+ // A------
+ // B ------
+ // 111222333
+ Exon1 = NewExon(ExonA->Start, ExonB->Start, ReadingFrame);
+ AddMS2Exon(Builder, Exon1);
+ Exon2 = NewExon(ExonB->Start, ExonA->End, ReadingFrame);
+ AddMS2Exon(Builder, Exon2);
+ Exon3 = NewExon(ExonA->End, ExonB->End, ReadingFrame);
+ AddMS2Exon(Builder, Exon3);
+ ExonInheritBackwardEdges(Exon1, ExonA);
+ ExonInheritBackwardEdges(Exon2, ExonB);
+ ExonInheritForwardEdges(Exon2, ExonA);
+ ExonInheritForwardEdges(Exon3, ExonB);
+ DELETE_EXON_B();
+ ExonB = NextExonB;
+ DeleteMS2Exon(Builder, ExonA);
+ break;
+ }
+ if (ExonA->Start < ExonB->Start && ExonA->End > ExonB->End)
+ {
+ // A---------
+ // B ---
+ // 111 222
+ Exon1 = NewExon(ExonA->Start, ExonB->Start, ReadingFrame);
+ AddMS2Exon(Builder, Exon1);
+ Exon2 = NewExon(ExonB->End, ExonA->End, ReadingFrame);
+ AddMS2Exon(Builder, Exon2);
+ ExonInheritBackwardEdges(Exon1, ExonA);
+ ExonInheritForwardEdges(Exon2, ExonA);
+ DeleteMS2Exon(Builder, ExonA);
+ break;
+ }
+ if (ExonA->Start > ExonB->Start && ExonA->End > ExonB->End)
+ {
+ // A ------
+ // B------
+ // 111222333
+ Exon1 = NewExon(ExonB->Start, ExonA->Start, ReadingFrame);
+ AddMS2Exon(Builder, Exon1);
+ Exon2 = NewExon(ExonA->Start, ExonB->End, ReadingFrame);
+ AddMS2Exon(Builder, Exon2);
+ Exon3 = NewExon(ExonB->End, ExonA->End, ReadingFrame);
+ AddMS2Exon(Builder, Exon3);
+ ExonInheritBackwardEdges(Exon1, ExonB);
+ ExonInheritBackwardEdges(Exon2, ExonA);
+ ExonInheritForwardEdges(Exon3, ExonA);
+ ExonInheritForwardEdges(Exon2, ExonB);
+ DELETE_EXON_B();
+ ExonB = NextExonB;
+ DeleteMS2Exon(Builder, ExonA);
+ break;
+ }
+ if (ExonA->Start > ExonB->Start && ExonA->End < ExonB->End)
+ {
+ // A ---
+ // B---------
+ // 111 222
+ Exon1 = NewExon(ExonB->Start, ExonA->Start, ReadingFrame);
+ AddMS2Exon(Builder, Exon1);
+ Exon2 = NewExon(ExonA->End, ExonB->End, ReadingFrame);
+ AddMS2Exon(Builder, Exon2);
+ ExonInheritBackwardEdges(Exon1, ExonB);
+ ExonInheritForwardEdges(Exon2, ExonB);
+ DELETE_EXON_B();
+ ExonB = NextExonB;
+ continue;
+ }
+ INSPECT_ASSERT(0); // we'd better not reach this point!
+ }
+ ExonA = NextExonA;
+ }
+}
+
+// If two exons are adjacent (one begins just after the other ends) and have
+// compatible reading frames, then add a link between them if necessary.
+void AddAdjacentExonLinks(MS2Builder* Builder)
+{
+ MS2Exon* ExonA;
+ MS2Exon* ExonB;
+ MS2Edge* TestEdge;
+ int LinkFound;
+ //
+ for (ExonA = Builder->FirstExon; ExonA; ExonA = ExonA->Next)
+ {
+ //printf("AAEL: %d-%d\n", ExonA->Start, ExonA->End);
+ for (ExonB = Builder->FirstExon; ExonB; ExonB = ExonB->Next)
+ {
+ if (ExonA->End == ExonB->Start && ExonA->ReadingFrame == ExonB->ReadingFrame)
+ {
+ LinkFound = 0;
+ for (TestEdge = ExonA->FirstForward; TestEdge; TestEdge = TestEdge->Next)
+ {
+ if (TestEdge->LinkTo == ExonB)
+ {
+ LinkFound = 1;
+ break;
+ }
+ }
+ if (!LinkFound)
+ {
+ LinkExonForward(ExonA, ExonB);
+ }
+ }
+ }
+ }
+}
+
+// Add an MS2Exon to an MS2Gene. Also, add to the gene all the exons
+// which are (recursively) linked to by MS2Exon.
+void AddExonToGene(MS2Gene* Gene, MS2Exon* Exon)
+{
+ MS2ExonNode* Node;
+ MS2Edge* Edge;
+ //
+    // We follow edges forward as well as backward, so we may re-visit the
+    // same exons; the Gene check below stops the recursion when we do:
+ if (Exon->Gene == Gene)
+ {
+ return;
+ }
+ //printf("[[Add exon %d-%d R%d to gene %d\n", Exon->Start, Exon->End, Exon->ReadingFrame, Gene->Index);
+ //if(Exon->Gene)
+ // {
+ // printf("But exon already belongs to Gene: %d\n",Exon->Gene->Index);
+ // getchar();
+ // }
+ INSPECT_ASSERT(!Exon->Gene);
+ Exon->Gene = Gene;
+    Node = (MS2ExonNode*)calloc(1, sizeof(MS2ExonNode));
+ Node->Exon = Exon;
+ if (!Gene->FirstExon)
+ {
+ Gene->FirstExon = Node;
+ }
+ else
+ {
+ Gene->LastExon->Next = Node;
+ Node->Prev = Gene->LastExon;
+ }
+ Gene->LastExon = Node;
+ // Follow edges:
+ for (Edge = Exon->FirstForward; Edge; Edge = Edge->Next)
+ {
+ // printf("Following forward edge\n");
+ AddExonToGene(Gene, Edge->LinkTo);
+ //printf("Finished forward edge\n");
+ }
+ for (Edge = Exon->FirstBackward; Edge; Edge = Edge->Next)
+ {
+ //printf("Following reverse edge\n");
+ AddExonToGene(Gene, Edge->LinkTo);
+ //printf("Finished reverse edge\n");
+ }
+}
+
+// Assimilate all MS2Exons from the master list into MS2Genes. Iteratively:
+// Take the first exon that's not in a gene. Build a new gene, and add this exon,
+// and (recursively) add in everything the exon links to.
+void GroupExonsIntoGenes(MS2Builder* Builder)
+{
+ MS2Exon* Exon;
+ MS2Gene* Gene;
+ //
+ // Iterate over exons:
+ for (Exon = Builder->FirstExon; Exon; Exon = Exon->Next)
+ {
+ if (Exon->Gene)
+ {
+ continue;
+ }
+ // This exon doesn't have a gene yet. Create a gene to contain it:
+ Gene = (MS2Gene*)calloc(1, sizeof(MS2Gene));
+ Gene->Index = Builder->GeneCount;
+ AddExonToGene(Gene, Exon);
+
+ if (!Builder->FirstGene)
+ {
+ Builder->FirstGene = Gene;
+ }
+ else
+ {
+ Builder->LastGene->Next = Gene;
+ }
+ Builder->LastGene = Gene;
+ Builder->GeneCount++;
+ }
+ // All exons are now assigned to genes.
+}
+
+// Temp-struct for sorting exons by genome-position
+typedef struct MS2SortedExonNode
+{
+ MS2Exon* Exon;
+} MS2SortedExonNode;
+
+// Callback for qsort, to sort exons by genome-position, FORWARD strand
+int CompareMS2ExonNodesForward(const MS2SortedExonNode* NodeA, const MS2SortedExonNode* NodeB)
+{
+ if (NodeA->Exon->Start < NodeB->Exon->Start)
+ {
+ return -1;
+ }
+ if (NodeA->Exon->Start > NodeB->Exon->Start)
+ {
+ return 1;
+ }
+ if (NodeA->Exon->End < NodeB->Exon->End)
+ {
+ return -1;
+ }
+ if (NodeA->Exon->End > NodeB->Exon->End)
+ {
+ return 1;
+ }
+ return 0;
+}
+
+// Callback for qsort, to sort exons by genome-position, REVERSE strand
+int CompareMS2ExonNodesBackward(const MS2SortedExonNode* NodeA, const MS2SortedExonNode* NodeB)
+{
+ if (NodeA->Exon->Start < NodeB->Exon->Start)
+ {
+ return 1;
+ }
+ if (NodeA->Exon->Start > NodeB->Exon->Start)
+ {
+ return -1;
+ }
+ if (NodeA->Exon->End < NodeB->Exon->End)
+ {
+ return 1;
+ }
+ if (NodeA->Exon->End > NodeB->Exon->End)
+ {
+ return -1;
+ }
+ return 0;
+}
+
+// Read (and translate) the protein sequence for an exon.
+void ReadExonSequence(MS2Builder* Builder, MS2Exon* Exon)
+{
+ FILE* File;
+ int DNALength;
+ int DNABufferSize = 0;
+ char* DNABuffer = NULL;
+ char* RCBuffer = NULL;
+ int Modulo;
+ char* TranslationStart;
+ int AAIndex;
+ char* TranslateMe;
+ int SuffixPos;
+ int AALength;
+ int LengthPrefix;
+ int LengthBody;
+ int LengthSuffix;
+ int LengthFull;
+ //
+ File = GlobalOptions->OutputFile;
+
+ // Allocate a buffer to store the DNA sequence (and reverse complement):
+ DNALength = Exon->End - Exon->Start;
+ if (DNALength + 1 > DNABufferSize)
+ {
+ SafeFree(DNABuffer);
+ SafeFree(RCBuffer);
+ DNABufferSize = max(1024, DNALength + 5);
+ DNABuffer = (char*)calloc(DNABufferSize, sizeof(char));
+ RCBuffer = (char*)calloc(DNABufferSize, sizeof(char));
+ }
+
+ // Retrieve the DNA:
+ fseek(Builder->GenomeFile, Exon->Start, 0);
+ ReadBinary(DNABuffer, sizeof(char), DNALength, Builder->GenomeFile);
+ DNABuffer[DNALength] = '\0';
+ if (Builder->ForwardFlag)
+ {
+ Modulo = Exon->Start % CODON_LENGTH;
+ if (Modulo == Exon->ReadingFrame)
+ {
+ TranslationStart = DNABuffer;
+ Exon->Prefix[0] = '\0';
+ }
+ else if ((Exon->ReadingFrame + 1) % CODON_LENGTH == Modulo % CODON_LENGTH)
+ {
+ TranslationStart = DNABuffer + 2;
+ Exon->Prefix[0] = DNABuffer[0];
+ Exon->Prefix[1] = DNABuffer[1];
+ Exon->Prefix[2] = '\0';
+ }
+ else
+ {
+ TranslationStart = DNABuffer + 1;
+ Exon->Prefix[0] = DNABuffer[0];
+ Exon->Prefix[1] = '\0';
+ }
+ TranslateMe = DNABuffer + strlen(Exon->Prefix);
+ }
+ else
+ {
+ WriteReverseComplement(DNABuffer, RCBuffer);
+        Modulo = (Exon->End - 1) % CODON_LENGTH;
+ if (Modulo == Exon->ReadingFrame)
+ {
+ TranslationStart = RCBuffer;
+ Exon->Prefix[0] = '\0';
+ }
+ else if ((Exon->ReadingFrame + 1) % CODON_LENGTH == Modulo % CODON_LENGTH)
+ {
+ TranslationStart = RCBuffer + 1;
+ Exon->Prefix[0] = RCBuffer[0];
+ Exon->Prefix[1] = '\0';
+ }
+ else
+ {
+ TranslationStart = RCBuffer + 2;
+ Exon->Prefix[0] = RCBuffer[0];
+ Exon->Prefix[1] = RCBuffer[1];
+ Exon->Prefix[2] = '\0';
+ }
+ TranslateMe = RCBuffer + strlen(Exon->Prefix);
+ }
+ AALength = (DNALength - strlen(Exon->Prefix)) / CODON_LENGTH;
+ Exon->Sequence = (char*)calloc(AALength + 1, sizeof(char));
+ for (AAIndex = 0; AAIndex < AALength; AAIndex++)
+ {
+ Exon->Sequence[AAIndex] = TranslateCodon(TranslateMe);
+ if(Exon->Sequence[AAIndex] < 'A' || Exon->Sequence[AAIndex] >= 'Z')
+ {
+            printf("ExonSequence: Contains a rogue character %c at position %d-%d\n", Exon->Sequence[AAIndex], Exon->Start, Exon->End);
+ getchar();
+ }
+ TranslateMe += CODON_LENGTH;
+ }
+ // Set the suffix:
+ for (SuffixPos = 0; SuffixPos < CODON_LENGTH; SuffixPos++)
+ {
+ Exon->Suffix[SuffixPos] = *TranslateMe;
+ if (!*TranslateMe)
+ {
+ break;
+ }
+ TranslateMe++;
+ }
+ // Double-check lengths:
+ LengthPrefix = strlen(Exon->Prefix);
+ LengthSuffix = strlen(Exon->Suffix);
+ LengthBody = strlen(Exon->Sequence) * CODON_LENGTH;
+ LengthFull = LengthPrefix + LengthSuffix + LengthBody;
+ if (LengthFull != Exon->End - Exon->Start)
+ {
+ printf("** Error: Length %d != genomic span %d\n", LengthFull, Exon->End - Exon->Start);
+ }
+ SafeFree(DNABuffer);
+ SafeFree(RCBuffer);
+}
+
+// Output the <Exon> tag for this MS2Exon, along with child tags (edges, and sequence)
+void OutputMS2Exon(MS2Builder* Builder, MS2Gene* Gene, MS2Exon* Exon)
+{
+ FILE* File;
+ char SpanningCodon[4];
+ int LengthA;
+ int LengthB;
+ char AA;
+ MS2Edge* Edge;
+ MS2Exon* LinkExon;
+ //
+ File = GlobalOptions->OutputFile;
+ // Start the exon tag:
+ fprintf(File, " <Exon Index=\"%d\" Start=\"%d\" End=\"%d\"",
+ Exon->Index, Exon->Start, Exon->End);
+ fprintf(File, ">\n");
+
+ if (Exon->Sequence)
+ {
+ fprintf(File, " <ExonSequence Length=\"%d\">%s</ExonSequence>\n", strlen(Exon->Sequence), Exon->Sequence);
+ }
+ //fprintf(File, " <ExonSequence>%s</ExonSequence>\n", Exon->Sequence);
+ // Write out all the edges linking back from this exon to lower-numbered exons:
+ if (Builder->ForwardFlag)
+ {
+ Edge = Exon->FirstBackward;
+ }
+ else
+ {
+ Edge = Exon->FirstForward;
+ }
+ for (; Edge; Edge = Edge->Next)
+ {
+ // Start an <ExtendsExon> or a <LinkFrom> tag:
+ LinkExon = Edge->LinkTo;
+ if (LinkExon->Start == Exon->End || LinkExon->End == Exon->Start)
+ {
+ fprintf(File, " <ExtendsExon");
+ }
+ else
+ {
+ fprintf(File, " <LinkFrom");
+ }
+
+ // Indicate the exon index:
+ fprintf(File, " Index=\"%d\"", LinkExon->Index);
+
+ // Get the amino acid!
+ if (Exon->Flags & MS2_EXON_CUSTOMAA_HEAD)
+ {
+ AA = Exon->CustomAA;
+ }
+ else if (Exon->Flags & MS2_EXON_CUSTOMAA)
+ {
+ AA = '\0';
+ }
+ else if (LinkExon->Flags & MS2_EXON_CUSTOMAA)
+ {
+ AA = LinkExon->CustomAA;
+ }
+ else
+ {
+ // The spanning codon consists of 1 or 2 bases from this exon,
+ // and 2 or 1 bases from the linked exon.
+ AA = '\0';
+ memset(SpanningCodon, 0, sizeof(char) * 4);
+ LengthA = strlen(LinkExon->Suffix);
+ LengthB = strlen(Exon->Prefix);
+ if (LengthA + LengthB == CODON_LENGTH)
+ {
+ strcpy(SpanningCodon, LinkExon->Suffix);
+ strcat(SpanningCodon, Exon->Prefix);
+ }
+ else if (LengthA + LengthB != 0)
+ {
+ // Report an error now, if the exons have incompatible reading frames!
+ REPORT_ERROR_IIII(16, Exon->Start, Exon->End, LinkExon->Start, LinkExon->End);
+ }
+
+ if (SpanningCodon[0])
+ {
+ AA = TranslateCodon(SpanningCodon);
+ }
+ }
+ if (AA)
+ {
+ fprintf(File, " AA=\"%c\"", AA);
+ }
+
+ // End the tag:
+ fprintf(File, " />\n");
+ }
+
+ // End the exon tag:
+ fprintf(File, " </Exon>\n");
+
+}
+
+// Assign exon indexes for this gene, by first sorting the exons:
+void SortMS2GeneExons(MS2Builder* Builder, MS2Gene* Gene)
+{
+ MS2SortedExonNode* SortedExonBlock;
+ int ExonIndex;
+ int ExonCount;
+ MS2ExonNode* Node;
+
+ //
+ ExonCount = 0;
+ for (ExonIndex = 0, Node = Gene->FirstExon; Node; ExonIndex++, Node = Node->Next)
+ {
+ ExonCount++;
+ }
+
+ SortedExonBlock = (MS2SortedExonNode*)calloc(ExonCount, sizeof(MS2SortedExonNode));
+ for (ExonIndex = 0, Node = Gene->FirstExon; Node; ExonIndex++, Node = Node->Next)
+ {
+ SortedExonBlock[ExonIndex].Exon = Node->Exon;
+ }
+ if (Builder->ForwardFlag)
+ {
+ qsort(SortedExonBlock, ExonCount, sizeof(MS2SortedExonNode), (QSortCompare)CompareMS2ExonNodesForward);
+ }
+ else
+ {
+ qsort(SortedExonBlock, ExonCount, sizeof(MS2SortedExonNode), (QSortCompare)CompareMS2ExonNodesBackward);
+ }
+ for (ExonIndex = 0; ExonIndex < ExonCount; ExonIndex++)
+ {
+ SortedExonBlock[ExonIndex].Exon->Index = ExonIndex;
+ //ReadExonSequence(Builder, Gene, SortedExonBlock[ExonIndex].Exon);
+ }
+ DebugPrintMS2Builder(Builder, "Exons sorted");
+ SafeFree(SortedExonBlock);
+}
+
+// Generate XML for an MS2CrossReference. We need the GFF gene's database and accession number,
+// and we need the list of exon indices.
+void OutputMS2CrossReference(MS2Builder* Builder, FILE* File, MS2CrossReference* CR)
+{
+ IntNode* Node;
+ //
+ fprintf(File, " <CrossReference Database=\"%s\" ID=\"%s\">\n", CR->GFFGene->DatabaseName, CR->GFFGene->Name);
+ fprintf(File, " <CRExons Index=\"");
+ for (Node = CR->FirstExonID; Node; Node = Node->Next)
+ {
+ if (Node->Next)
+ {
+ fprintf(File, "%d, ", Node->Value);
+ }
+ else
+ {
+ fprintf(File, "%d", Node->Value);
+ }
+ }
+ fprintf(File, "\"/>\n");
+ fprintf(File, " </CrossReference>\n");
+}
+
+// Output the XML for this MS2Gene.
+void OutputMS2Gene(MS2Builder* Builder, MS2Gene* Gene)
+{
+ FILE* File;
+ int ExonCount;
+ int ExonIndex;
+ MS2ExonNode* Node;
+ MS2CrossReference* CR;
+
+ File = GlobalOptions->OutputFile;
+
+ // Count exons in the gene:
+ ExonCount = 0;
+ for (ExonIndex = 0, Node = Gene->FirstExon; Node; ExonIndex++, Node = Node->Next)
+ {
+ ExonCount++;
+ }
+
+ // Start the Gene tag:
+ fprintf(File, "<Gene ExonCount=\"%d\" Chromosome=\"%s\" ForwardFlag=\"%d\">\n", ExonCount, Builder->ChromosomeName, Builder->ForwardFlag);
+
+    // Loop over exons, and output an <Exon> tag for each one:
+ for (ExonIndex = 0; ExonIndex < ExonCount; ExonIndex++)
+ {
+ for (Node = Gene->FirstExon; Node; Node = Node->Next)
+ {
+ if (Node->Exon->Index != ExonIndex)
+ {
+ continue;
+ }
+ OutputMS2Exon(Builder, Gene, Node->Exon);
+ break;
+ }
+ }
+
+ // Output all cross-references for the gene:
+ for (CR = Gene->FirstCrossReference; CR; CR = CR->Next)
+ {
+ OutputMS2CrossReference(Builder, File, CR);
+ }
+
+ // Complete the Gene tag:
+ fprintf(File, "</Gene>\n\n");
+ fflush(File);
+
+}
+
+// Convert a codon into a number from 0 to 63. (We probably could just TRANSLATE the
+// codon and use the amino acid value...)
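+// Bases are encoded base-4 (A=0, C=1, G=2, T=3) with multipliers 1, 4, 16,
+// so for example the codon "ATG" hashes to 0*1 + 3*4 + 2*16 = 44.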
+int GetCodonHashValue(char* EncodeCodon)
+{
+ int Pos;
+ int Multiplier[] = {1, 4, 16};
+ int Value = 0;
+ for (Pos = 0; Pos < CODON_LENGTH; Pos++)
+ {
+ switch (EncodeCodon[Pos])
+ {
+ case 'a':
+ case 'A':
+ Value += 0 * Multiplier[Pos];
+ break;
+ case 'c':
+ case 'C':
+ Value += 1 * Multiplier[Pos];
+ break;
+ case 'g':
+ case 'G':
+ Value += 2 * Multiplier[Pos];
+ break;
+ case 't':
+ case 'T':
+ Value += 3 * Multiplier[Pos];
+ break;
+ default:
+ //printf("* Error in GetCodonHashValue('%c')\n", EncodeCodon[Pos]);
+ REPORT_ERROR_I(24, EncodeCodon[Pos]);
+ return 0;
+ }
+ }
+ return Value;
+}
+
+// Scenario: What if exon X consists of a single base pair!?
+// That's tricky if the base pair is the middle of a codon, because the
+// linked exons must look PAST this central exon to get their prefix / suffix.
+// We'll produce special degree-1 "customAA" exons, one for each codon, to
+// get from each predecessor of exon X to each successor of exon X.
+// We produce one CustomAA exon for each possible codon.
+void RepairPromiscuousSingletonExons(MS2Builder* Builder)
+{
+ MS2Exon* Exon;
+ MS2Exon* NextExon;
+ int Modulo;
+ MS2Exon* CodonExons[64];
+ MS2Edge* BackwardEdge;
+ MS2Edge* ForwardEdge;
+ MS2Edge* Edge;
+ char Codon[4];
+ char RCCodon[4];
+ char* EncodeCodon;
+ int CodonValue;
+ int BridgedFlag;
+ //
+ memset(Codon, 0, sizeof(char) * 4);
+ memset(RCCodon, 0, sizeof(char) * 4);
+ Exon = Builder->FirstExon;
+ while (1)
+ {
+ if (!Exon)
+ {
+ break;
+ }
+ // Skip this exon if its length is > 1:
+ if (Exon->End > Exon->Start + 1)
+ {
+ Exon = Exon->Next;
+ continue;
+ }
+ // Skip this exon, unless its base pair is the middle of a codon:
+ if (Builder->ForwardFlag)
+ {
+ Modulo = Exon->Start % CODON_LENGTH;
+ if ((Exon->ReadingFrame + 1) % CODON_LENGTH != Modulo)
+ {
+ Exon = Exon->Next;
+ continue;
+ }
+ }
+ else
+ {
+ Modulo = (Exon->End - 1) % CODON_LENGTH;
+            if ((Exon->ReadingFrame + CODON_LENGTH - 1) % CODON_LENGTH != Modulo) // + CODON_LENGTH avoids a negative modulo when ReadingFrame is 0
+ {
+ Exon = Exon->Next;
+ continue;
+ }
+ }
+ // Skip customAA exons built by previous passes through this loop:
+ if (Exon->CustomAA)
+ {
+ Exon = Exon->Next;
+ continue;
+ }
+ // This is the tricky case: A length-1 exon in the middle of a codon.
+ // Consider every pairing of a predecessor and a successor. For each distinct codon,
+ // build one CustomAA exon:
+ memset(CodonExons, 0, sizeof(MS2Exon*) * 64);
+ BridgedFlag = 0;
+ for (BackwardEdge = Exon->FirstBackward; BackwardEdge; BackwardEdge = BackwardEdge->Next)
+ {
+ for (ForwardEdge = Exon->FirstForward; ForwardEdge; ForwardEdge = ForwardEdge->Next)
+ {
+ if (Builder->ForwardFlag)
+ {
+ Codon[0] = BackwardEdge->LinkTo->Suffix[0];
+ Codon[1] = Exon->Prefix[0];
+ Codon[2] = ForwardEdge->LinkTo->Prefix[0];
+ EncodeCodon = Codon;
+ }
+ else
+ {
+ Codon[0] = ForwardEdge->LinkTo->Suffix[0];
+ Codon[1] = Exon->Prefix[0];
+ Codon[2] = BackwardEdge->LinkTo->Prefix[0];
+ WriteReverseComplement(Codon, RCCodon);
+ EncodeCodon = RCCodon;
+ }
+ CodonValue = GetCodonHashValue(EncodeCodon);
+ INSPECT_ASSERT(CodonValue >= 0 && CodonValue < 64);
+ // CodonExons[CodonValue] will hold the custom-aa exon:
+ if (!CodonExons[CodonValue])
+ {
+ CodonExons[CodonValue] = (MS2Exon*)calloc(1, sizeof(MS2Exon));
+ CodonExons[CodonValue]->CustomAA = TranslateCodon(EncodeCodon);
+ CodonExons[CodonValue]->Start = Exon->Start;
+ CodonExons[CodonValue]->End = Exon->End;
+ CodonExons[CodonValue]->ReadingFrame = Exon->ReadingFrame;
+ AddMS2Exon(Builder, CodonExons[CodonValue]);
+ }
+ ExonInheritOneForwardEdge(CodonExons[CodonValue], ForwardEdge);
+ ExonInheritOneBackwardEdge(CodonExons[CodonValue], BackwardEdge);
+ BridgedFlag = 1;
+ }
+ }
+
+ // Assign flags to these CustomAA exons. All this work to handle
+ // the case of a single-base-pair exon with out-degree 1 whose
+ // outgoing edge is to an adjacent exon; in that case, we want the
+ // amino acid to be placed on our incoming edges rather than
+ // on the outgoing edges.
+ for (CodonValue = 0; CodonValue < 64; CodonValue++)
+ {
+ if (CodonExons[CodonValue])
+ {
+ CodonExons[CodonValue]->Flags = MS2_EXON_CUSTOMAA;
+ if (Builder->ForwardFlag)
+ {
+ Edge = CodonExons[CodonValue]->FirstForward;
+ }
+ else
+ {
+ Edge = CodonExons[CodonValue]->FirstBackward;
+ }
+ if (!Edge || !Edge->Next)
+ {
+ continue;
+ }
+ if (Edge->LinkFrom->Start == Edge->LinkTo->End || Edge->LinkFrom->End == Edge->LinkTo->Start)
+ {
+ CodonExons[CodonValue]->Flags = MS2_EXON_CUSTOMAA_HEAD;
+ }
+ }
+ }
+ NextExon = Exon->Next;
+ if (BridgedFlag)
+ {
+ DeleteMS2Exon(Builder, Exon);
+ }
+ else
+ {
+ REPORT_WARNING_I(21, Exon->Start);
+ }
+ Exon = NextExon;
+ }
+}
+
+void ExonInheritOneForwardEdge(MS2Exon* Exon, MS2Edge* OldEdge)
+{
+ MS2Exon* DisplacedExon;
+ MS2Exon* LinkedExon;
+ //
+ DisplacedExon = OldEdge->LinkFrom;
+ LinkedExon = OldEdge->LinkTo;
+ RemoveBackwardEdge(LinkedExon, DisplacedExon);
+ RemoveForwardEdge(DisplacedExon, LinkedExon);
+ LinkExonForward(Exon, LinkedExon);
+}
+
+void ExonInheritOneBackwardEdge(MS2Exon* Exon, MS2Edge* OldEdge)
+{
+ MS2Exon* DisplacedExon;
+ MS2Exon* LinkedExon;
+ //
+ DisplacedExon = OldEdge->LinkFrom;
+ LinkedExon = OldEdge->LinkTo;
+ RemoveBackwardEdge(DisplacedExon, LinkedExon);
+ RemoveForwardEdge(LinkedExon, DisplacedExon);
+ LinkExonForward(LinkedExon, Exon);
+}
+
+// Main entry point for building MS2 database.
+void BuildMS2DB()
+{
+ MS2Builder* Builder;
+ int ForwardFlag;
+ MS2Gene* Gene;
+ MS2Exon* Exon;
+ //
+ Builder = (MS2Builder*)calloc(1, sizeof(MS2Builder));
+ // Builder->VerboseFlag = 1; // spewy!
+ // Open the genome file:
+ Builder->GenomeFile = fopen(GlobalOptions->GenomeFileName, "rb");
+ if (!Builder->GenomeFile)
+ {
+ REPORT_ERROR_S(8, GlobalOptions->GenomeFileName);
+ goto cleanup;
+ }
+ // At least one GFF file must be specified!
+ if (!GlobalOptions->FirstGFFFileName)
+ {
+ REPORT_ERROR(12);
+ goto cleanup;
+ }
+ fprintf(GlobalOptions->OutputFile, "<Database CreatedBy=\"BuildMS2DB.c\">\n");
+ // Loop: First the forward strand, then the reverse strand:
+ for (ForwardFlag = 1; ForwardFlag >= 0; ForwardFlag--)
+ {
+ Builder->ForwardFlag = ForwardFlag;
+ Builder->ExonHash = (MS2ExonNode**)calloc(EXON_HASH_SIZE, sizeof(MS2ExonNode*));
+ strncpy(Builder->ChromosomeName, GlobalOptions->ChromosomeName, 256);
+
+ // Parse exons from GFF files:
+ ParseGFFFiles(Builder);
+
+ // Bail out if we have no exons at all:
+ if (!Builder->FirstExon)
+ {
+ //REPORT_ERROR(15);
+ continue;
+ }
+
+ printf("Parsed GFF files. We now have %d exons.\n", Builder->ExonCount);
+ DebugPrintMS2Builder(Builder, "After GFF parse");
+
+ // Merge and split any overlapping exons as needed. Note that if we merge exons, then we
+ // can't report a cross-reference ("record FOO covers exons 1, 2, 3, 4, 5"). Therefore,
+ // most exons are NOT permitted to be merged. Only exons produced from EST alignments
+ // should be considered merge-able.
+ SplitOverlappingExons(Builder);
+
+ DebugPrintMS2Builder(Builder, "After exon split");
+
+ // Add edges between adjacent exons:
+ AddAdjacentExonLinks(Builder);
+
+ // Read all exon sequences. We *could* read just the exons for one gene at a time.
+ for (Exon = Builder->FirstExon; Exon; Exon = Exon->Next)
+ {
+ ReadExonSequence(Builder, Exon);
+ }
+
+ // Ensure that length-1 exons (if any exist!) have at most one back-link.
+ RepairPromiscuousSingletonExons(Builder);
+
+ // Group exons into genes:
+ GroupExonsIntoGenes(Builder);
+
+ DebugPrintMS2Builder(Builder, "After gene grouping");
+
+ // Sort exons within genes, assigning exons index numbers:
+ for (Gene = Builder->FirstGene; Gene; Gene = Gene->Next)
+ {
+ SortMS2GeneExons(Builder, Gene);
+ }
+
+ // Add cross-references to genes:
+ BuildGFFCrossReferences(Builder);
+
+ // Output XML:
+ for (Gene = Builder->FirstGene; Gene; Gene = Gene->Next)
+ {
+ OutputMS2Gene(Builder, Gene);
+ }
+ // Free our exon hash, exon lists, gene lists, etc:
+ FreeExonHash(Builder);
+ FreeMS2Genes(Builder);
+ FreeGFFGenes(Builder);
+ FreeMS2Exons(Builder);
+ }
+ fprintf(GlobalOptions->OutputFile, "\n</Database>\n");
+cleanup:
+ FreeExonHash(Builder);
+ if (Builder->GenomeFile)
+ {
+ fclose(Builder->GenomeFile);
+ }
+ SafeFree(Builder);
+}
+
+
+// Handy debugging function: Spew out *all* the exons and genes parsed so far!
+void DebugPrintMS2Builder(MS2Builder* Builder, char* Notes)
+{
+ MS2Exon* Exon;
+ MS2ExonNode* Node;
+ MS2Gene* Gene;
+ int GeneExonCount;
+ int GeneStart;
+ int GeneEnd;
+ int GeneIndex;
+ MS2Edge* Edge;
+ int ExonIndex;
+ int ExonCount = 0;
+ int ForwardEdgeCount = 0;
+ int BackwardEdgeCount = 0;
+ MS2Edge* PrevEdge;
+ //
+ if (!Builder->VerboseFlag)
+ {
+ return;
+ }
+ printf("\n=-=-{O}=-=-{O}=-=-{O}=-=-{O}=-=-{O}=-=-{O}=-=-{O}=-=-{O}=-=-{O}=-=-\n");
+ if (Notes)
+ {
+ printf("*-*-> %s\n", Notes);
+ }
+ else
+ {
+ printf("*-*-> MS2Builder state:\n");
+ }
+
+ for (Exon = Builder->FirstExon, ExonIndex = 0; Exon; Exon = Exon->Next, ExonIndex++)
+ {
+ printf(" Exon %d: %d-%d R %d", ExonIndex, Exon->Start, Exon->End, Exon->ReadingFrame);
+ if (Exon->Gene)
+ {
+ printf(" Gene %d", Exon->Gene->Index);
+ }
+ printf("\n");
+ if (Exon->Sequence)
+ {
+ printf(" Prefix '%s' Suffix '%s'\n", Exon->Prefix, Exon->Suffix);
+ INSPECT_ASSERT(strlen(Exon->Sequence) * 3 + strlen(Exon->Prefix) + strlen(Exon->Suffix) == (Exon->End - Exon->Start));
+ }
+ PrevEdge = NULL;
+ for (Edge = Exon->FirstForward; Edge; Edge = Edge->Next)
+ {
+ printf(" >>>Link to %d-%d R%d\n", Edge->LinkTo->Start, Edge->LinkTo->End, Edge->LinkTo->ReadingFrame);
+ ForwardEdgeCount++;
+ PrevEdge = Edge;
+ }
+ if (PrevEdge != Exon->LastForward)
+ {
+ printf(" *** Error: LastForward link is corrupt!\n");
+ }
+ PrevEdge = NULL;
+ for (Edge = Exon->FirstBackward; Edge; Edge = Edge->Next)
+ {
+ printf(" <<<Link from %d-%d R%d\n", Edge->LinkTo->Start, Edge->LinkTo->End, Edge->LinkTo->ReadingFrame);
+ BackwardEdgeCount++;
+ PrevEdge = Edge;
+ }
+ if (PrevEdge != Exon->LastBackward)
+ {
+ printf(" *** Error: LastForward link is corrupt!\n");
+ }
+
+ ExonCount++;
+ }
+ printf("\n");
+ for (Gene = Builder->FirstGene, GeneIndex = 0; Gene; Gene = Gene->Next, GeneIndex++)
+ {
+ GeneExonCount = 0;
+ GeneStart = Gene->FirstExon->Exon->Start;
+ GeneEnd = Gene->FirstExon->Exon->End;
+ for (Node = Gene->FirstExon; Node; Node = Node->Next)
+ {
+ GeneExonCount++;
+ GeneStart = min(GeneStart, Node->Exon->Start);
+ GeneEnd = max(GeneEnd, Node->Exon->End);
+ }
+ printf("Gene %d/%d (%d...%d) has %d exons\n", GeneIndex, Gene->Index, GeneStart, GeneEnd, GeneExonCount);
+ for (Node = Gene->FirstExon; Node; Node = Node->Next)
+ {
+ if (Node->Exon->Gene != Gene)
+ {
+ printf("** ERROR: Exon %d-%d doesn't link up!\n", Node->Exon->Start, Node->Exon->End);
+ }
+ }
+ }
+ printf("\n...total of %d exons, %d/%d edges\n", ExonCount, ForwardEdgeCount, BackwardEdgeCount);
+}
diff --git a/BuildMS2DB.h b/BuildMS2DB.h
new file mode 100644
index 0000000..e93f170
--- /dev/null
+++ b/BuildMS2DB.h
@@ -0,0 +1,40 @@
+//Title: BuildMS2DB.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef BUILD_MS2DB_H
+#define BUILD_MS2DB_H
+
+
+
+void BuildMS2DB();
+
+#endif //BUILD_MS2DB_H
diff --git a/BuildMS2DB.jar b/BuildMS2DB.jar
new file mode 100644
index 0000000..31ab930
Binary files /dev/null and b/BuildMS2DB.jar differ
diff --git a/CCSVM1.model b/CCSVM1.model
new file mode 100644
index 0000000..ebad6ff
--- /dev/null
+++ b/CCSVM1.model
@@ -0,0 +1,44 @@
+svm_type c_svc
+kernel_type rbf
+gamma 0.1
+nr_class 2
+total_sv 35
+rho -0.312301
+label 1 -1
+nr_sv 19 16
+SV
+1 1:-0.75179 2:0.607494 3:-1 4:0.320215 5:-1 6:0.72313 7:0.0811355 8:-0.114332 9:-1 10:0.357271
+0.1660259920160678 1:-0.851741 2:-0.0927604 3:0.666942 4:0.35998 5:0.595754 6:0.738728 7:-0.521993 8:-0.172721 9:-0.336393 10:0.317712
+1 1:-0.936704 2:0.47541 3:-0.526493 4:0.320844 5:-0.881809 6:0.725531 7:-0.930563 8:-0.117135 9:-0.963797 10:0.356968
+1 1:-0.644116 2:0.404764 3:-0.253063 4:0.282541 5:-0.51243 6:0.698208 7:0.0279429 8:-0.11326 9:-0.218628 10:0.348546
+0.7418988259128604 1:-0.790143 2:0.0876919 3:0.81691 4:0.418834 5:0.849416 6:0.790491 7:-0.237949 8:-0.122427 9:-1 10:0.340648
+0.5082811007489776 1:-0.620509 2:0.466751 3:-1 4:0.313391 5:0.829375 6:0.766265 7:0.538866 8:-0.0931119 9:-1 10:0.35749
+0.2762882244591353 1:-0.985363 2:0.100306 3:0.971198 4:0.325929 5:0.996571 6:0.729376 7:-1 8:-0.115009 9:0.197818 10:0.358083
+0.2358071042810414 1:-0.986348 2:0.233124 3:0.84868 4:0.326947 5:0.386482 6:0.727203 7:-1 8:-0.115112 9:0.600604 10:0.359261
+1 1:-0.637898 2:0.51292 3:-1 4:0.313375 5:-0.58639 6:0.714613 7:0.428808 8:-0.100835 9:-1 10:0.351135
+1 1:-0.569543 2:0.672606 3:-1 4:0.314482 5:-0.935458 6:0.716067 7:-0.102802 8:-0.119006 9:-0.196241 10:0.351188
+1 1:-0.618451 2:0.484406 3:-0.289553 4:0.288088 5:-0.142375 6:0.710582 7:-0.536415 8:-0.149913 9:-0.600327 10:0.314834
+1 1:-0.953112 2:0.6566 3:-1 4:0.223062 5:-0.999028 6:0.484834 7:-0.980275 8:-0.162242 9:-0.997067 10:0.306999
+1 1:-0.80244 2:0.307676 3:-0.994303 4:-1 5:-0.98863 6:-0.508631 7:-0.980039 8:-0.583873 9:-1 10:-0.278361
+1 1:-0.802308 2:0.51663 3:0.16622 4:0.323858 5:-0.45996 6:0.724424 7:-0.0455461 8:-0.115047 9:-1 10:0.356722
+0.4426695076697531 1:-0.960351 2:-0.0444583 3:0.735875 4:0.324235 5:-0.259143 6:0.724855 7:-0.412562 8:-0.116235 9:-0.380176 10:0.357039
+1 1:-0.487744 2:0.408039 3:-0.386503 4:0.248649 5:-0.64458 6:0.654016 7:-0.693115 8:-0.157841 9:-1 10:0.304833
+1 1:-0.747522 2:0.454089 3:-0.124586 4:0.311553 5:0.284901 6:0.750294 7:-0.440615 8:-0.134631 9:-0.420396 10:0.332584
+1 1:-0.696164 2:0.104826 3:-0.978551 4:-0.145873 5:-0.994744 6:-0.633391 7:-0.933211 8:-0.860357 9:-0.987323 10:-0.519298
+1 1:-0.748748 2:-0.128167 3:-0.470328 4:0.305497 5:-1 6:0.715018 7:0.291475 8:-0.0984399 9:0.343179 10:0.364289
+-1 1:-0.1627 2:0.0819672 3:0.478414 4:0.332095 5:-0.642318 6:0.677763 7:-0.770054 8:-0.330038 9:-1 10:0.338713
+-1 1:-0.137889 2:0.324858 3:-1 4:-0.0920287 5:-0.960472 6:0.435168 7:-0.570267 8:-0.167445 9:-0.898061 10:0.292412
+-1 1:-0.196224 2:0.495265 3:-0.675143 4:0.317098 5:-1 6:0.720882 7:-0.720932 8:-0.123296 9:-0.61846 10:0.353556
+-1 1:-0.327768 2:0.953552 3:-0.591787 4:0.320595 5:0.818117 6:0.735123 7:-0.343423 8:-0.115808 9:-0.23334 10:0.357507
+-1 1:-0.327305 2:0.496896 3:-0.732837 4:0.0975269 5:-0.26576 6:0.601986 7:-0.355449 8:-0.181283 9:-0.70581 10:0.246242
+-1 1:0.0231203 2:0.471475 3:-0.187915 4:0.299803 5:-0.984348 6:-0.10867 7:-0.143927 8:-0.121665 9:-0.857203 10:0.24761
+-1 1:-0.373026 2:0.43907 3:-0.853121 4:0.314152 5:-0.924011 6:0.719738 7:-0.945645 8:-0.125024 9:-0.940812 10:0.344711
+-1 1:-0.606214 2:0.28052 3:-0.788078 4:0.321041 5:-0.940023 6:0.723269 7:-0.974707 8:-0.119677 9:-0.963612 10:0.348768
+-0.9665150629728126 1:-0.253084 2:0.441708 3:-0.970703 4:0.2657 5:-0.998178 6:0.622214 7:-0.866261 8:-0.12946 9:-0.883559 10:0.344661
+-1 1:-0.32251 2:0.436116 3:-0.893977 4:0.252243 5:-0.96782 6:0.624908 7:-0.428191 8:-0.122013 9:-0.249862 10:0.354316
+-1 1:-0.0601224 2:-0.246926 3:1 4:0.323794 5:1 6:0.728849 7:-0.584148 8:-0.123886 9:-0.449 10:0.350944
+-0.9199085377320684 1:0.24048 2:0.480049 3:-0.202535 4:0.316312 5:0.0612596 6:0.730524 7:-0.902298 8:-0.710501 9:-0.968861 10:-0.483095
+-0.4845471543829543 1:-0.00856172 2:0.345378 3:-0.999429 4:-0.640491 5:-0.999609 6:-0.262666 7:-0.999091 8:-0.455881 9:-0.999846 10:-0.311333
+-1 1:-0.312826 2:0.409405 3:-0.921582 4:0.299281 5:-0.994074 6:0.640002 7:-0.763224 8:-0.117524 9:-0.94537 10:0.352891
+-1 1:-0.285649 2:0.36626 3:-0.898888 4:0.273682 5:-0.963255 6:0.685241 7:-0.91936 8:-0.143351 9:-0.98431 10:0.290957
+-1 1:0.0781984 2:0.452641 3:0.0661623 4:0.324274 5:-0.995084 6:-0.645437 7:-0.136239 8:-0.117996 9:-0.934717 10:0.192604
diff --git a/CCSVM1.range b/CCSVM1.range
new file mode 100644
index 0000000..d65d8ad
--- /dev/null
+++ b/CCSVM1.range
@@ -0,0 +1,12 @@
+x
+-1 1
+1 0 0.82135439
+2 0 0.73939395
+3 0 1
+4 -0.5379861 0.2760399
+5 0 1
+6 -0.86319005 0.13671456
+7 0 1
+8 -0.78792346 0.99241817
+9 0 1
+10 -0.74330956 0.35138521
diff --git a/CCSVM2.model b/CCSVM2.model
new file mode 100644
index 0000000..a94184b
--- /dev/null
+++ b/CCSVM2.model
@@ -0,0 +1,118 @@
+svm_type c_svc
+kernel_type rbf
+gamma 0.0454545
+nr_class 2
+total_sv 109
+rho 1.88691
+label 1 -1
+nr_sv 56 53
+SV
+1 1:-0.586343 2:0.447929 3:-0.586343 4:0.447929 5:0.586343 6:-0.447929 7:0.270654 8:-0.35777 9:-0.614851 10:-0.771449 11:-0.286179 12:0.973306 13:-0.361175 14:0.958467 15:-0.397918 16:-0.948572 17:0.484338 18:-0.934396 19:0.236794 20:-0.724109 21:0.547246 22:-0.125524
+0.5596070646832195 1:0.0682849 2:0.464911 3:0.0682849 4:0.464911 5:-0.0682842 6:-0.464911 7:-0.883563 8:0.896384 9:-0.191985 10:0.907317 11:-0.0719692 12:0.497519 13:-0.301422 14:0.688257 15:-0.268794 16:-0.839139 17:0.362837 18:-0.460607 19:0.169239 20:-1 21:0.540128 22:-0.30313
+1 1:-0.341176 2:0.368027 3:-0.341176 4:0.368027 5:0.341174 6:-0.368027 7:-0.161618 8:0.987183 9:-0.391364 10:0.999116 11:-0.0518543 12:0.994744 13:-0.117239 14:0.998121 15:0.0820365 16:-0.429618 17:0.470359 18:0.542289 19:0.306701 20:-0.937824 21:0.411759 22:0.0496908
+1 1:-0.301444 2:0.519534 3:-0.301444 4:0.519534 5:0.301443 6:-0.519534 7:-0.231671 8:0.915459 9:-0.558913 10:0.996772 11:-0.0455303 12:0.979313 13:-0.378779 14:0.960248 15:-0.411043 16:-0.817763 17:0.499885 18:-0.892501 19:0.238828 20:0.587939 21:0.556435 22:-0.739683
+0.4744165168332187 1:-0.100362 2:0.497448 3:-0.100362 4:0.497448 5:0.100362 6:-0.497448 7:-0.586211 8:0.961407 9:-0.449057 10:0.965527 11:0.711384 12:0.999177 13:-0.192454 14:0.999381 15:-0.307237 16:-0.970007 17:0.376268 18:-0.976209 19:0.192955 20:-0.990705 21:0.537001 22:-0.92019
+1 1:-0.0162669 2:0.481616 3:-0.0162669 4:0.481616 5:0.0162667 6:-0.481616 7:-0.734485 8:0.999507 9:0.668068 10:0.998986 11:-0.00859887 12:0.977547 13:-0.340345 14:0.95151 15:-0.416897 16:-0.988217 17:-0.263232 18:-1 19:0.225853 20:-0.549695 21:0.542771 22:-0.900102
+1 1:-0.0998906 2:0.149826 3:-0.0998906 4:0.149826 5:0.0998906 6:-0.149826 7:-0.587042 8:1 9:-0.529908 10:1 11:-0.264798 12:0.96059 13:0.00532483 14:1 15:-0.403677 16:-0.0983453 17:0.497203 18:-1 19:0.236032 20:-0.660344 21:0.470228 22:-0.131685
+0.1989762850809156 1:0.36994 2:0.416452 3:0.36994 4:0.416452 5:-0.369941 6:-0.416452 7:-0.591375 8:0.895658 9:-0.288629 10:0.956849 11:-0.133629 12:0.934105 13:0.125194 14:0.954301 15:0.170537 16:-0.949086 17:0.196624 18:-0.900376 19:-0.0101391 20:0.025301 21:0.551641 22:-0.30438
+1 1:0.00835184 2:0.448781 3:0.00835184 4:0.448781 5:-0.00835176 6:-0.44878 7:-0.777892 8:0.607312 9:-0.540934 10:0.110858 11:-0.264211 12:0.338182 13:-0.326846 14:0.199405 15:-0.350711 16:-0.207805 17:0.483608 18:-0.245967 19:0.219276 20:-0.384979 21:0.514448 22:-0.109859
+1 1:0.281903 2:0.444653 3:0.281903 4:0.444653 5:-0.281904 6:-0.444653 7:-0.746599 8:0.903924 9:-0.207278 10:0.780967 11:-0.0903158 12:-0.0501338 13:-0.449562 14:-0.395518 15:-0.57681 16:-0.801324 17:0.347715 18:-0.556582 19:0.154491 20:-0.577215 21:0.536102 22:-0.683604
+1 1:0.671585 2:0.477308 3:0.671585 4:0.477308 5:-0.671585 6:-0.477308 7:-0.0595257 8:0.970662 9:-0.530234 10:0.959048 11:-0.263206 12:0.844518 13:-0.0837123 14:0.925784 15:-0.17477 16:0.0445118 17:0.51009 18:-0.680938 19:0.199413 20:-0.562869 21:0.539659 22:-0.277483
+1 1:0.657934 2:0.316153 3:0.657934 4:0.316153 5:-0.657934 6:-0.316153 7:-0.0835945 8:1 9:-0.610388 10:0.93972 11:-0.280656 12:1 13:-0.381819 14:0.987189 15:-0.414015 16:0.426698 17:0.527226 18:-0.0722581 19:0.23855 20:-0.515298 21:0.546444 22:-0.116434
+1 1:-0.440958 2:0.384736 3:-0.440958 4:0.384736 5:0.440958 6:-0.384736 7:0.0143158 8:0.754663 9:-0.585509 10:0.831596 11:-0.279427 12:0.245362 13:-0.418379 14:0.202683 15:-0.438459 16:-0.0630584 17:0.505839 18:0.389734 19:0.26914 20:0.237103 21:0.562722 22:-0.388008
+1 1:-0.486414 2:0.430894 3:-0.486414 4:0.430894 5:0.486415 6:-0.430894 7:0.0944636 8:0.999547 9:-0.550971 10:0.998508 11:-0.270172 12:0.969579 13:-0.289701 14:0.948459 15:-0.373008 16:-0.931439 17:-0.0476394 18:-0.48044 19:0.217399 20:-0.965434 21:0.40285 22:-0.749482
+1 1:-0.3299 2:0.396217 3:-0.3299 4:0.396217 5:0.3299 6:-0.396217 7:-0.181497 8:0.953396 9:-0.570408 10:0.495372 11:-0.282587 12:0.998933 13:-0.395738 14:0.976972 15:-0.435797 16:-0.910217 17:0.439613 18:-0.454396 19:0.233376 20:0.227735 21:0.551649 22:0.045953
+1 1:-0.351795 2:0.256739 3:-0.351795 4:0.256739 5:0.351795 6:-0.256739 7:-0.142893 8:0.999503 9:-0.364336 10:0.992364 11:-0.194011 12:0.999445 13:-0.164377 14:0.992241 15:-0.274452 16:-0.550078 17:0.454044 18:0.0301268 19:0.240288 20:0.861797 21:0.565931 22:0.146761
+1 1:-0.12363 2:0.089939 3:-0.12363 4:0.089939 5:0.12363 6:-0.089939 7:-0.545186 8:1 9:-0.436512 10:1 11:-0.177595 12:1 13:-0.244387 14:1 15:-0.275533 16:-0.656413 17:0.380233 18:-0.791762 19:0.114865 20:0.803011 21:0.571513 22:0.367263
+1 1:0.193279 2:0.48267 3:0.193279 4:0.48267 5:-0.193281 6:-0.48267 7:-0.902856 8:0.992069 9:-0.420573 10:0.986801 11:-0.219053 12:0.984208 13:0.327109 14:0.989893 15:0.766696 16:-0.974373 17:-0.0870789 18:-1 19:-0.147046 20:-0.873146 21:0.487336 22:-0.162558
+0.1097681833559589 1:0.633849 2:0.509146 3:0.633849 4:0.509146 5:-0.63385 6:-0.509146 7:-0.126059 8:0.134206 9:-0.614563 10:0.72621 11:-0.285493 12:0.999878 13:0.103765 14:0.999389 15:0.0290541 16:0.534943 17:0.516041 18:0.859695 19:0.254284 20:-1 21:0.548412 22:-0.886915
+1 1:-0.308093 2:0.480976 3:-0.308093 4:0.480976 5:0.308094 6:-0.480976 7:-0.219946 8:0.258287 9:-0.61395 10:0.0982527 11:-0.28583 12:0.0574076 13:-0.42947 14:-0.180807 15:-0.447977 16:-0.167755 17:0.491927 18:0.931486 19:0.246749 20:0.0827929 21:0.548853 22:-0.687232
+1 1:-0.25603 2:0.0995601 3:-0.25603 4:0.0995601 5:0.25603 6:-0.0995601 7:-0.311743 8:1 9:-0.614317 10:-1 11:-0.28595 12:0.998647 13:-0.364239 14:1 15:-0.416991 16:0.800016 17:0.534311 18:0.0886501 19:0.240971 20:0.538389 21:0.548926 22:-0.870887
+1 1:0.381401 2:0.467457 3:0.381401 4:0.467457 5:-0.381403 6:-0.467457 7:-0.571166 8:0.966931 9:-0.283778 10:0.987197 11:-0.136138 12:0.996575 13:0.390136 14:0.983524 15:0.36958 16:-0.938605 17:-0.111525 18:-0.889852 19:-0.101351 20:0.409386 21:0.562073 22:-0.238093
+1 1:-0.407687 2:0.43241 3:-0.409286 4:0.15957 5:0.407687 6:-0.43241 7:-0.0443467 8:0.632951 9:-0.614584 10:1 11:-0.281641 12:0.995113 13:-0.412482 14:0.990009 15:-0.433111 16:0.975196 17:0.616761 18:-0.0532698 19:0.23941 20:-0.798875 21:0.548329 22:-0.812192
+1 1:-0.381384 2:0.48489 3:-0.38323 4:0.333027 5:0.381384 6:-0.48489 7:-0.0907228 8:0.90056 9:-0.602526 10:0.994116 11:-0.27993 12:0.919902 13:-0.387762 14:0.997017 15:-0.410666 16:0.212216 17:0.509798 18:0.615852 19:0.243923 20:-0.555275 21:0.54677 22:-0.44556
+1 1:0.657865 2:0.418559 3:0.657865 4:0.418559 5:-0.657865 6:-0.418559 7:-0.0837164 8:0.982051 9:-0.60982 10:0.974959 11:-0.283128 12:0.984598 13:-0.38622 14:0.99638 15:-0.406722 16:-0.677503 17:0.482468 18:-0.680025 19:0.226878 20:-0.807329 21:0.537338 22:-0.115688
+1 1:0.410844 2:0.793817 3:0.384586 4:0.0450936 5:-0.410844 6:-0.793817 7:-0.519256 8:0.734904 9:-0.355755 10:0.867622 11:-0.151593 12:0.32882 13:-0.0649979 14:0.364197 15:-0.176056 16:-0.713813 17:0.428602 18:-0.506334 19:0.169325 20:-0.318047 21:0.543925 22:-0.639797
+1 1:0.422156 2:0.533101 3:0.422058 4:0.51777 5:-0.422156 6:-0.533101 7:-0.49931 8:0.975474 9:-0.441186 10:0.953309 11:-0.188735 12:0.875588 13:-0.153936 14:0.923043 15:-0.138697 16:-0.764521 17:0.263615 18:-0.775987 19:0.0585216 20:-0.841292 21:0.457284 22:-0.347796
+1 1:-0.51645 2:0.230827 3:-0.51645 4:0.230827 5:0.51645 6:-0.230827 7:0.14742 8:0.987989 9:-0.565042 10:0.984247 11:-0.25387 12:0.990325 13:-0.305422 14:0.990213 15:-0.342887 16:0.378168 17:0.540062 18:0.393266 19:0.263358 20:0.259464 21:0.553371 22:0.135901
+1 1:0.238222 2:0.37266 3:0.238222 4:0.37266 5:-0.238222 6:-0.37266 7:-0.823616 8:0.915236 9:0.134308 10:0.0502518 11:-0.284304 12:0.652288 13:-0.0167093 14:-0.153565 15:-0.462807 16:-0.808788 17:0.434321 18:-0.0034517 19:0.239546 20:-0.704591 21:0.53369 22:-0.629419
+0.07285356465522599 1:-0.315793 2:0.48801 3:-0.315793 4:0.48801 5:0.315793 6:-0.48801 7:-0.206371 8:0.998677 9:-0.541924 10:0.995599 11:-0.0371308 12:0.975513 13:-0.378187 14:0.886955 15:-0.427352 16:-0.950521 17:0.490977 18:-0.348778 19:0.237224 20:0.662718 21:0.554518 22:-0.739705
+1 1:-0.36855 2:0.308094 3:-0.36855 4:0.308094 5:0.368549 6:-0.308094 7:-0.113352 8:0.87392 9:-0.543669 10:0.667273 11:-0.256033 12:0.87228 13:-0.328798 14:0.743686 15:-0.328382 16:-0.291816 17:0.45961 18:-0.296628 19:0.219137 20:-0.839547 21:0.495189 22:0.0884328
+1 1:-0.492285 2:0.431992 3:-0.492285 4:0.431992 5:0.492285 6:-0.431992 7:0.104814 8:-0.101568 9:-0.614847 10:0.863995 11:-0.274842 12:0.977854 13:-0.389846 14:0.97568 15:-0.372854 16:-0.191448 17:0.49417 18:0.344803 19:0.255009 20:-0.619481 21:0.539645 22:-0.343214
+1 1:-0.335623 2:0.169481 3:-0.335623 4:0.169481 5:0.335624 6:-0.169481 7:-0.171407 8:0.996622 9:-0.383064 10:0.991566 11:0.0107828 12:0.991459 13:-0.063775 14:0.996811 15:0.102334 16:-0.562855 17:0.380578 18:-0.276057 19:0.189847 20:-0.675418 21:0.424216 22:0.222194
+1 1:-0.60664 2:0.39407 3:-0.60664 4:0.39407 5:0.60664 6:-0.39407 7:0.306442 8:0.998864 9:-0.581221 10:1 11:-0.0263783 12:0.62573 13:-0.428906 14:1 15:-0.439548 16:0.551483 17:0.514559 18:0.114448 19:0.240437 20:-0.954679 21:0.546405 22:-0.823663
+0.5358741563675483 1:-0.0664034 2:0.383825 3:-0.0664034 4:0.383825 5:0.066403 6:-0.383825 7:-0.646086 8:1 9:-0.503616 10:1 11:-0.257686 12:0.771434 13:-0.253061 14:0.858967 15:-0.331146 16:-0.872614 17:0.290232 18:-0.93065 19:0.118993 20:-0.589183 21:0.515917 22:-0.913288
+1 1:-0.486864 2:0.422274 3:-0.486864 4:0.422274 5:0.486864 6:-0.422274 7:0.0952555 8:0.996401 9:-0.455773 10:0.999615 11:-0.085654 12:0.999806 13:0.188512 14:0.999491 15:-0.0387637 16:-0.704547 17:0.496536 18:0.175756 19:0.243346 20:0.172858 21:0.551543 22:-0.31052
+1 1:0.0364816 2:0.493327 3:0.0364816 4:0.493327 5:-0.0364819 6:-0.493327 7:-0.827489 8:0.99666 9:-0.413362 10:0.998596 11:-0.0503499 12:0.969016 13:0.133667 14:0.953747 15:-0.0522417 16:-0.954976 17:-0.545767 18:-0.960684 19:-0.276192 20:-0.714058 21:0.517738 22:-0.801364
+0.5148494730248897 1:-0.0124078 2:0.498572 3:-0.0124078 4:0.498572 5:0.0124078 6:-0.498572 7:-0.741289 8:0.967992 9:-0.529865 10:1 11:1 12:1 13:-0.42631 14:1 15:-0.441308 16:-0.933978 17:0.472606 18:-1 19:0.232945 20:0.0889109 21:0.548763 22:-0.235028
+1 1:0.302517 2:0.463415 3:0.302517 4:0.463415 5:-0.302516 6:-0.463415 7:-0.710254 8:0.51784 9:-0.392845 10:0.521332 11:-0.113785 12:0.611404 13:-0.235453 14:0.714318 15:-0.168268 16:-0.949915 17:0.251864 18:-0.670698 19:-0.0427911 20:-0.474422 21:0.499639 22:-0.464822
+1 1:0.743347 2:0.394416 3:0.743347 4:0.394416 5:-0.743347 6:-0.394416 7:0.0670029 8:0.910294 9:-0.610358 10:0.601571 11:-0.284704 12:1 13:-0.37368 14:1 15:-0.390897 16:0.0657905 17:0.511036 18:-0.10271 19:0.238667 20:-0.147246 21:0.547753 22:-0.11628
+0.917171674952913 1:-0.28129 2:0.341463 3:-0.28129 4:0.341463 5:0.28129 6:-0.341463 7:-0.267205 8:0.626618 9:-0.605168 10:0.429773 11:-0.285289 12:1 13:-0.322021 14:1 15:-0.404777 16:-0.442832 17:0.494445 18:1 19:0.25183 20:-1 21:0.536631 22:-0.840349
+1 1:-0.609026 2:0.447929 3:-0.609026 4:0.447929 5:0.609026 6:-0.447929 7:0.310648 8:0.51238 9:-0.613696 10:0.482398 11:-0.285306 12:0.57918 13:-0.426866 14:0.366864 15:-0.440886 16:-0.266691 17:0.503673 18:0.315527 19:0.269261 20:0.931967 21:0.549317 22:-0.0887005
+0.348164886379205 1:0.0692804 2:0.92874 3:0.0534675 4:-0.0479937 5:-0.0692809 6:-0.92874 7:-0.885319 8:0.983 9:-0.398756 10:0.952999 11:-0.00435766 12:0.999962 13:-0.11588 14:0.902806 15:-0.227063 16:-0.892101 17:0.328324 18:-0.828931 19:0.207083 20:-0.91052 21:0.526309 22:-0.690825
+1 1:0.578146 2:0.403377 3:0.578146 4:0.403377 5:-0.578146 6:-0.403377 7:-0.224275 8:0.588429 9:-0.6146 10:-1 11:-0.285959 12:0.930657 13:0.65313 14:0.976113 15:0.0436324 16:0.514677 17:0.521094 18:0.794236 19:0.242741 20:-1 21:0.548515 22:-1
+1 1:-0.00719075 2:0.414634 3:-0.00737814 4:0.398374 5:0.00719096 6:-0.414634 7:-0.750487 8:0.989496 9:-0.259347 10:0.999136 11:-0.111278 12:0.991868 13:-0.207436 14:0.999302 15:-0.232533 16:-0.998136 17:0.318187 18:-0.727043 19:0.0858924 20:-0.919277 21:0.53584 22:-0.303007
+1 1:0.737565 2:0.616221 3:0.737565 4:0.616221 5:-0.737565 6:-0.616221 7:0.0568082 8:0.749228 9:-0.612368 10:1 11:-0.285055 12:0.270022 13:-0.415722 14:-0.55593 15:-0.450961 16:-1 17:0.508329 18:-0.630999 19:0.238094 20:-0.999998 21:0.529147 22:-0.692216
+0.9807978899181346 1:0.126935 2:0.503618 3:0.126935 4:0.503618 5:-0.126935 6:-0.503618 7:-0.986974 8:-0.355252 9:-0.798183 10:-0.53595 11:-0.530642 12:0.965781 13:0.188843 14:0.929469 15:0.294725 16:-0.49626 17:0.191486 18:0.303603 19:0.389976 20:-0.644883 21:0.488484 22:-0.316129
+1 1:-0.303797 2:0.341463 3:-0.303797 4:0.341463 5:0.303798 6:-0.341463 7:-0.227521 8:0.996809 9:-0.556053 10:0.946744 11:-0.265229 12:0.971874 13:-0.388131 14:0.97552 15:-0.406586 16:-0.938298 17:0.495425 18:-0.336644 19:0.237716 20:-0.099517 21:0.54721 22:0.0454306
+1 1:-0.299369 2:0.304542 3:-0.299369 4:0.304542 5:0.299368 6:-0.304542 7:-0.23533 8:-0.287076 9:-0.61563 10:0.992172 11:-0.277707 12:0.3129 13:-0.423048 14:0.399398 15:-0.440061 16:-1 17:0.431147 18:-0.654444 19:0.225119 20:0.236155 21:0.552155 22:-0.649479
+1 1:0.0723979 2:0.490515 3:0.0723979 4:0.490515 5:-0.0723979 6:-0.490515 7:-0.890816 8:0.271433 9:-0.550909 10:-0.29294 11:-0.304718 12:0.822387 13:-0.189803 14:0.617052 15:-0.338845 16:-0.99735 17:0.246394 18:-0.241855 19:0.208874 20:-0.1698 21:0.543135 22:-0.580233
+1 1:0.203415 2:0.52439 3:0.200207 4:0.463415 5:-0.203415 6:-0.52439 7:-0.884987 8:0.201086 9:-0.56697 10:0.684083 11:-0.235972 12:0.867594 13:-0.0074243 14:0.774399 15:-0.10621 16:-0.900716 17:0.234968 18:-0.459144 19:0.121838 20:-0.792795 21:0.479832 22:-0.64185
+1 1:-0.471464 2:0.367427 3:-0.471464 4:0.367427 5:0.471464 6:-0.367427 7:0.0681036 8:0.684579 9:-0.61453 10:0.987199 11:-0.264995 12:1 13:-0.402389 14:0.989835 15:-0.394969 16:-0.833764 17:0.507274 18:-0.679781 19:0.238515 20:-0.948217 21:0.548065 22:-0.129608
+1 1:-0.0869371 2:0.428571 3:-0.0869371 4:0.428571 5:0.0869361 6:-0.428571 7:-0.609882 8:0.995544 9:-0.320484 10:0.965462 11:-0.181339 12:1 13:-0.227088 14:0.975314 15:-0.31391 16:-0.947786 17:-0.328383 18:-0.821197 19:0.00299785 20:-0.884176 21:0.506818 22:-0.390821
+1 1:-0.732964 2:0.371274 3:-0.732964 4:0.371274 5:0.732964 6:-0.371274 7:0.529172 8:0.999058 9:-0.614047 10:1 11:-0.284923 12:0.996677 13:-0.42765 14:0.997889 15:-0.440344 16:0.979815 17:0.512833 18:0.636184 19:0.239863 20:-0.990803 21:0.548183 22:-0.559533
+1 1:-0.431661 2:0.940564 3:-0.495001 4:-0.0362301 5:0.431661 6:-0.940564 7:-0.00207637 8:0.528697 9:-0.61112 10:0.647861 11:-0.284245 12:0.54907 13:-0.421421 14:-0.803597 15:-0.447746 16:-0.894568 17:0.506539 18:-0.606406 19:0.237431 20:0.472298 21:0.550599 22:-0.681023
+1 1:0.0866925 2:0.380346 3:0.0866925 4:0.380346 5:-0.0866931 6:-0.380346 7:-0.91602 8:0.0924395 9:-0.613444 10:0.565248 11:-0.25041 12:0.923577 13:-0.224146 14:0.850363 15:-0.3956 16:-0.934678 17:0.282952 18:-0.411323 19:0.222608 20:-0.972609 21:0.484729 22:-0.494112
+-1 1:0.314173 2:0.57364 3:0.248588 4:0.186679 5:-0.314173 6:-0.57364 7:-0.689702 8:-0.753608 9:-0.635444 10:-0.682679 11:-0.290888 12:0.700462 13:-0.290094 14:0.131287 15:-0.44125 16:-0.952021 17:0.0991982 18:-0.506693 19:0.233323 20:-0.920479 21:0.438822 22:-0.856797
+-0.9072397084667069 1:-0.819382 2:-0.0838786 3:-0.819382 4:-0.0838786 5:0.819382 6:0.0838787 7:0.68154 8:1 9:-0.614073 10:0.925916 11:-0.285324 12:1 13:-0.424977 14:1 15:-0.440096 16:-0.87566 17:0.501737 18:-0.0224184 19:0.239469 20:-0.548757 21:0.548182 22:0.234697
+-1 1:0.290014 2:0.103913 3:0.290014 4:0.103913 5:-0.290014 6:-0.103913 7:-0.732298 8:1 9:-0.594535 10:1 11:-0.27302 12:0.94954 13:-0.401871 14:0.886464 15:-0.425788 16:-0.983362 17:-0.885167 18:-0.973772 19:-0.311776 20:0.316398 21:0.552948 22:0.266855
+-1 1:0.71737 2:0.386179 3:0.71737 4:0.386179 5:-0.71737 6:-0.386179 7:0.0212014 8:1 9:-0.567279 10:1 11:-0.28248 12:0.980334 13:-0.402101 14:1 15:-0.439201 16:-0.27883 17:0.500433 18:-0.562343 19:0.230617 20:-0.981629 21:0.533765 22:-0.520955
+-1 1:0.0531524 2:0.499283 3:0.0531524 4:0.499283 5:-0.0531525 6:-0.499283 7:-0.856883 8:0.719106 9:-0.45871 10:0.75029 11:-0.180035 12:0.89694 13:-0.100122 14:0.799798 15:-0.31753 16:-0.983592 17:-0.568076 18:-0.966201 19:-0.107333 20:-0.995902 21:0.123553 22:-0.621968
+-1 1:0.746402 2:0.280488 3:0.746402 4:0.280488 5:-0.746402 6:-0.280488 7:0.0723898 8:1 9:-0.58158 10:1 11:-0.283182 12:0.842823 13:-0.425851 14:0.929143 15:-0.439283 16:-0.936503 17:0.344804 18:-0.859598 19:0.119085 20:-0.988712 21:0.527753 22:-0.225431
+-1 1:-0.76884 2:0.442267 3:-0.76884 4:0.442267 5:0.768839 6:-0.442267 7:0.592426 8:0.921646 9:-0.609914 10:0.935718 11:-0.278114 12:0.958903 13:-0.41508 14:0.962155 15:-0.436201 16:0.602626 17:0.56163 18:0.818495 19:0.279524 20:0.502347 21:0.55442 22:-0.225346
+-1 1:-0.918483 2:0.555072 3:-0.918483 4:0.555072 5:0.918483 6:-0.555072 7:0.856272 8:0.977948 9:-0.613634 10:0.963833 11:-0.285524 12:0.994831 13:-0.42505 14:0.986542 15:-0.441408 16:-0.979398 17:0.506681 18:0.867977 19:0.243613 20:-0.241779 21:0.548501 22:-0.508623
+-1 1:0.362966 2:0.571217 3:0.362629 4:0.556394 5:-0.362966 6:-0.571217 7:-0.603672 8:0.761436 9:-0.545098 10:0.365721 11:-0.27836 12:0.873096 13:-0.308675 14:0.856215 15:-0.319712 16:-0.962038 17:-0.322795 18:-0.625463 19:0.114079 20:-0.545571 21:0.479135 22:-0.400456
+-1 1:0.247296 2:0.307645 3:0.247296 4:0.307645 5:-0.247296 6:-0.307645 7:-0.807618 8:1 9:-0.611993 10:1 11:-0.284683 12:0.206361 13:-0.333169 14:-0.836851 15:-0.706334 16:-0.998879 17:-0.254898 18:-0.947889 19:-0.0790606 20:-0.59284 21:0.510645 22:-0.0997706
+-1 1:0.551959 2:0.553273 3:0.551959 4:0.553273 5:-0.551958 6:-0.553273 7:-0.270447 8:0.656892 9:-0.592363 10:0.912889 11:-0.18659 12:0.702479 13:-0.330583 14:0.45293 15:-0.398743 16:-0.975299 17:0.0483964 18:-0.995985 19:-0.404355 20:-1 21:0.52356 22:-0.249263
+-1 1:-0.714479 2:0.52439 3:-0.714479 4:0.52439 5:0.714479 6:-0.52439 7:0.496578 8:0.994425 9:-0.55654 10:0.907499 11:-0.252391 12:0.999472 13:-0.41595 14:0.995312 15:-0.4274 16:0.615873 17:0.706353 18:0.967788 19:0.677751 20:-0.0829828 21:0.548365 22:-0.318398
+-1 1:0.475181 2:0.527205 3:0.475181 4:0.527205 5:-0.475181 6:-0.527204 7:-0.405818 8:0.545675 9:-0.606504 10:0.952094 11:-0.279339 12:0.546147 13:-0.353924 14:0.869819 15:-0.410809 16:-0.910452 17:0.231864 18:-0.940585 19:-0.371012 20:-0.580494 21:0.534986 22:-0.250645
+-0.9375540531464036 1:0.273824 2:0.701368 3:0.249197 4:0.325104 5:-0.273824 6:-0.701368 7:-0.760845 8:-0.732894 9:-0.621524 10:-0.481251 11:-0.288462 12:0.953364 13:-0.355107 14:0.919803 15:-0.38208 16:-0.889975 17:0.504062 18:-1 19:0.23929 20:-0.998936 21:-1 22:-0.565464
+-1 1:0.152462 2:0.767426 3:0.0863133 4:0.00450816 5:-0.152461 6:-0.767426 7:-0.974826 8:-0.580813 9:-0.668134 10:0.898186 11:0.208259 12:0.521068 13:-0.2819 14:0.386054 15:-0.34436 16:-0.944409 17:0.25892 18:-0.438837 19:0.19293 20:-0.994983 21:0.0227629 22:-0.504649
+-1 1:-0.821065 2:0.341463 3:-0.821065 4:0.341463 5:0.821065 6:-0.341463 7:0.684508 8:0.976281 9:-0.611809 10:0.73847 11:-0.284926 12:0.969001 13:-0.413961 14:0.992339 15:-0.43337 16:0.800543 17:0.624509 18:0.0670394 19:0.244416 20:-1 21:0.544043 22:-0.316082
+-1 1:0.0874382 2:0.533101 3:0.0874382 4:0.533101 5:-0.087438 6:-0.533101 7:-0.917334 8:0.684992 9:-0.478737 10:0.21614 11:-0.264174 12:0.399124 13:-0.37267 14:-0.425209 15:-0.517443 16:-0.973893 17:-0.577042 18:-0.869095 19:0.0488588 20:-0.887083 21:0.486157 22:-0.355497
+-1 1:0.335699 2:0.588772 3:0.335699 4:0.588772 5:-0.335699 6:-0.588772 7:-0.651748 8:0.763722 9:-0.486027 10:0.873241 11:-0.187559 12:0.878274 13:-0.291976 14:0.797142 15:-0.326437 16:-0.913324 17:-0.477834 18:-0.812836 19:0.1443 20:-0.930905 21:0.430101 22:-0.398904
+-1 1:0.426947 2:0.482176 3:0.426947 4:0.482176 5:-0.426947 6:-0.482176 7:-0.490862 8:1 9:-0.545892 10:1 11:-0.280435 12:0.992943 13:-0.251762 14:1 15:-0.395434 16:-0.691613 17:0.44945 18:-0.619779 19:0.140397 20:-0.785487 21:0.44963 22:0.244653
+-1 1:-0.880866 2:0.425305 3:-0.880866 4:0.425305 5:0.880866 6:-0.425305 7:0.789947 8:0.987777 9:-0.601968 10:0.997644 11:-0.276129 12:0.996272 13:-0.396802 14:0.997961 15:-0.408024 16:0.757128 17:0.530727 18:0.580053 19:0.299234 20:-0.111389 21:0.546603 22:-0.374854
+-1 1:0.838314 2:0.401274 3:0.83142 4:0.367096 5:-0.838314 6:-0.401274 7:0.234446 8:0.944777 9:-0.601175 10:0.915073 11:-0.281699 12:0.865605 13:-0.405837 14:1 15:-0.432593 16:-0.99649 17:0.471513 18:-0.975827 19:0.208841 20:-0.81984 21:0.537619 22:-0.200444
+-1 1:0.28387 2:0.494773 3:0.28387 4:0.494773 5:-0.28387 6:-0.494773 7:-0.743131 8:1 9:-0.598043 10:1 11:-0.273415 12:1 13:-0.313937 14:1 15:-0.37725 16:-1 17:0.344304 18:-0.554773 19:0.142136 20:-1 21:0.507637 22:0.165558
+-0.2184773448480964 1:-0.931151 2:0.664408 3:-0.937486 4:0.440831 5:0.931151 6:-0.664408 7:0.878607 8:0.976887 9:-0.530169 10:0.980579 11:-0.224123 12:0.915827 13:-0.428529 14:0.725425 15:-0.442071 16:0.91644 17:0.602366 18:0.881946 19:0.428949 20:-0.398664 21:0.525115 22:-0.251924
+-1 1:0.447162 2:0.788618 3:0.266108 4:0.229675 5:-0.447162 6:-0.788618 7:-0.45522 8:0.758634 9:-0.48651 10:0.161047 11:-0.276599 12:0.503117 13:-0.360738 14:-0.560266 15:-0.507808 16:-0.697934 17:0.0567238 18:-0.807127 19:-0.00422272 20:-0.869989 21:0.518937 22:-0.411518
+-1 1:0.421399 2:0.77575 3:0.354302 4:0.447622 5:-0.421399 6:-0.77575 7:-0.500644 8:0.822446 9:-0.238664 10:0.299605 11:-0.224904 12:0.719876 13:-0.195063 14:0.726083 15:-0.278783 16:-0.891679 17:-0.198613 18:-0.907721 19:-0.0262203 20:-0.91968 21:0.209938 22:-0.728423
+-1 1:-0.542993 2:0.308745 3:-0.542993 4:0.308745 5:0.542993 6:-0.308745 7:0.19422 8:0.892032 9:-0.565442 10:0.98268 11:-0.0576717 12:0.974617 13:-0.27553 14:0.97146 15:-0.333374 16:-0.501866 17:0.390441 18:0.32146 19:0.278349 20:-0.00317282 21:0.548403 22:0.0561746
+-1 1:-0.767483 2:0.0622756 3:-0.767483 4:0.0622756 5:0.767483 6:-0.0622757 7:0.590033 8:1 9:-0.609413 10:1 11:-0.27798 12:1 13:-0.41259 14:1 15:-0.430814 16:0.106161 17:0.516786 18:-0.272557 19:0.230707 20:0.704961 21:0.564232 22:0.356582
+-1 1:0.291942 2:0.615526 3:0.290535 4:0.586677 5:-0.291942 6:-0.615526 7:-0.728899 8:-0.750747 9:-0.688631 10:-0.864176 11:-0.415871 12:0.684524 13:0.0113795 14:0.290082 15:-0.359071 16:-0.90887 17:0.202634 18:-0.980114 19:-0.301483 20:-0.904303 21:0.485313 22:-0.897096
+-0.756695633922057 1:0.432115 2:0.56883 3:0.432115 4:0.56883 5:-0.432115 6:-0.56883 7:-0.48175 8:1 9:-0.556104 10:1 11:-0.196911 12:0.568382 13:-0.240856 14:-0.277788 15:-0.476479 16:-0.971105 17:-0.28234 18:-0.966243 19:-0.281655 20:-0.944121 21:0.495331 22:-0.249043
+-1 1:-0.758062 2:0.334584 3:-0.758062 4:0.334584 5:0.758062 6:-0.334584 7:0.573424 8:0.991925 9:-0.60947 10:0.928439 11:-0.283941 12:0.968487 13:-0.421123 14:0.986012 15:-0.431774 16:-0.471938 17:0.497059 18:0.0125694 19:0.239731 20:-0.059729 21:0.548216 22:0.0804318
+-1 1:0.599813 2:0.617948 3:0.599813 4:0.617948 5:-0.599813 6:-0.617948 7:-0.186072 8:-0.499586 9:-0.619897 10:-0.186008 11:-0.28841 12:0.100056 13:-0.412439 14:1 15:-0.390079 16:0.00793922 17:0.510547 18:0.111645 19:0.248091 20:-1 21:0.521258 22:-0.267001
+-1 1:-0.832127 2:0.498842 3:-0.832127 4:0.498842 5:0.832127 6:-0.498842 7:0.704011 8:0.97619 9:-0.595345 10:0.979067 11:-0.271839 12:0.869747 13:-0.427941 14:0.956343 15:-0.439005 16:0.76817 17:0.682224 18:0.885431 19:0.454549 20:0.453709 21:0.554543 22:-0.314007
+-0.3058345899969018 1:0.454476 2:0.948652 3:0.174883 4:-0.0397946 5:-0.454475 6:-0.948652 7:-0.442325 8:0.904787 9:-0.558912 10:-0.783207 11:-0.323264 12:-0.10814 13:-0.454124 14:-0.714833 15:-0.788736 16:-0.862256 17:0.440911 18:-0.830246 19:0.132482 20:-0.749265 21:0.50943 22:-0.788272
+-1 1:-0.792715 2:0.307067 3:-0.798203 4:0.261205 5:0.792714 6:-0.307067 7:0.634521 8:0.977711 9:-0.608349 10:0.83682 11:-0.282645 12:0.932819 13:-0.419286 14:0.976412 15:-0.434809 16:0.0773 17:0.509631 18:-0.859197 19:0.22891 20:0.00155986 21:0.548643 22:-0.502924
+-1 1:0.795872 2:0.350347 3:0.795872 4:0.350347 5:-0.795872 6:-0.350347 7:0.159614 8:0.975269 9:-0.59939 10:0.986503 11:-0.274122 12:0.993335 13:-0.421467 14:0.901366 15:-0.438505 16:-0.905651 17:0.456635 18:-0.911964 19:0.191985 20:-0.938038 21:0.528195 22:-0.225585
+-0.8561146207073825 1:0.396423 2:0.637374 3:0.396423 4:0.637374 5:-0.396422 6:-0.637374 7:-0.544682 8:0.90862 9:-0.606329 10:0.997799 11:0.0645829 12:0.460201 13:-0.424133 14:0.360053 15:-0.440772 16:-0.983808 17:-0.647309 18:-0.982008 19:-0.437347 20:-0.986235 21:0.486437 22:-0.544801
+-1 1:0.380801 2:0.389033 3:0.380801 4:0.389033 5:-0.380801 6:-0.389033 7:-0.572226 8:0.940192 9:-0.595803 10:1 11:-0.242985 12:0.980484 13:-0.123342 14:1 15:-0.385484 16:-0.743772 17:0.45919 18:-0.84692 19:0.215335 20:-0.875739 21:0.489558 22:0.156744
+-1 1:-0.767483 2:0.0622756 3:-0.767483 4:0.0622756 5:0.767483 6:-0.0622757 7:0.590033 8:1 9:-0.609413 10:1 11:-0.27798 12:1 13:-0.41259 14:1 15:-0.430814 16:0.106161 17:0.516786 18:-0.272557 19:0.230707 20:0.704961 21:0.564232 22:0.356582
+-0.730563744163681 1:-0.668374 2:-0.26158 3:-0.668374 4:-0.26158 5:0.668374 6:0.26158 7:0.415288 8:-1 9:-0.614618 10:-1 11:-0.28595 12:1 13:-0.419665 14:1 15:-0.433852 16:-0.137892 17:0.489976 18:1 19:0.453025 20:-0.299368 21:0.546516 22:0.470314
+-1 1:-0.297354 2:-0.0707181 3:-0.297354 4:-0.0707181 5:0.297355 6:0.070718 7:-0.238881 8:1 9:-0.598231 10:1 11:-0.27587 12:1 13:-0.409261 14:1 15:-0.420867 16:-0.78332 17:0.454446 18:-0.354751 19:0.218161 20:0.0967076 21:0.549721 22:0.345704
+-1 1:-0.824537 2:0.492614 3:-0.824537 4:0.492614 5:0.824537 6:-0.492614 7:0.690629 8:0.73043 9:-0.612161 10:0.670696 11:-0.285496 12:0.965013 13:-0.417322 14:0.920844 15:-0.438107 16:-0.965634 17:0.477821 18:0.826109 19:0.3156 20:0.108412 21:0.549347 22:-0.564079
+-1 1:0.322354 2:0.47561 3:0.320656 4:0.445799 5:-0.322354 6:-0.47561 7:-0.675277 8:0.953242 9:-0.50744 10:0.10698 11:-0.283228 12:0.851789 13:-0.343993 14:0.817963 15:-0.398817 16:-0.963488 17:-0.435988 18:-0.92677 19:-0.295056 20:-0.781032 21:0.44056 22:-0.399254
+-1 1:-0.820881 2:-0.059829 3:-0.820881 4:-0.059829 5:0.820881 6:0.0598291 7:0.684183 8:0.979115 9:-0.604718 10:1 11:-0.283004 12:0.980862 13:-0.417066 14:1 15:-0.443738 16:-0.705948 17:0.500993 18:0.922388 19:0.423673 20:-0.805536 21:0.544161 22:-0.304094
+-1 1:0.700548 2:0.732181 3:0.669869 4:0.146105 5:-0.700548 6:-0.732181 7:-0.00845817 8:0.316352 9:-0.596465 10:0.490465 11:-0.257142 12:0.556675 13:-0.397558 14:0.236811 15:-0.439143 16:-0.832081 17:0.385703 18:-0.194489 19:0.227845 20:-0.949427 21:0.394388 22:-0.414531
+-1 1:0.541867 2:0.561704 3:0.541867 4:0.561704 5:-0.541867 6:-0.561704 7:-0.28824 8:0.911582 9:-0.441525 10:0.79721 11:-0.203541 12:0.486591 13:-0.161126 14:0.409746 15:-0.0484364 16:-0.875679 17:-0.148297 18:-0.936686 19:-0.484127 20:-0.917694 21:0.5258 22:-0.551231
+-1 1:-0.656029 2:0.272186 3:-0.656029 4:0.272186 5:0.656029 6:-0.272186 7:0.393521 8:0.92188 9:-0.578817 10:0.880935 11:-0.265524 12:0.969755 13:-0.320846 14:0.993371 15:-0.285081 16:0.179222 17:0.535012 18:-0.134057 19:0.225315 20:0.557611 21:0.577787 22:0.135846
+-1 1:0.157111 2:0.543121 3:0.157111 4:0.543121 5:-0.157111 6:-0.543121 7:-0.966629 8:-0.0528439 9:-0.619864 10:0.094418 11:-0.276867 12:0.342407 13:-0.350232 14:-0.346832 15:-0.476239 16:-0.937902 17:-0.314103 18:-0.824794 19:-0.0263368 20:-0.975459 21:0.0839253 22:-0.534444
+-1 1:-0.67354 2:0.55248 3:-0.67354 4:0.55248 5:0.673541 6:-0.55248 7:0.424397 8:0.827255 9:-0.606568 10:0.962555 11:-0.277078 12:0.738011 13:-0.418606 14:0.634877 15:-0.438127 16:0.957779 17:0.80045 18:-0.662054 19:0.220401 20:0.941966 21:0.828716 22:-0.317922
+-1 1:0.67119 2:0.743902 3:0.634946 4:0.231707 5:-0.67119 6:-0.743902 7:-0.0602221 8:0.154936 9:-0.604384 10:0.286247 11:-0.277055 12:0.661604 13:-0.364537 14:0.789637 15:-0.41548 16:-0.935762 17:0.418935 18:-0.701344 19:0.168631 20:-0.984544 21:0.436366 22:-0.412
+-1 1:-0.920398 2:-0.000478218 3:-0.920398 4:-0.000478218 5:0.920397 6:0.000478299 7:0.859647 8:1 9:-0.614377 10:0.97164 11:-0.285754 12:0.998428 13:-0.422282 14:1 15:-0.440975 16:-0.990831 17:0.494214 18:-0.794761 19:0.233376 20:0.995874 21:0.564844 22:-0.529095
+-1 1:-0.874856 2:0.285569 3:-0.874856 4:0.285569 5:0.874856 6:-0.285569 7:0.77935 8:0.996218 9:-0.579749 10:0.988024 11:-0.275913 12:0.999882 13:-0.413516 14:1 15:-0.432648 16:-0.306094 17:0.500914 18:0.930494 19:0.284092 20:-1 21:0.548445 22:-0.362289
+-1 1:-0.808501 2:0.369411 3:-0.808501 4:0.369411 5:0.8085 6:-0.369411 7:0.662354 8:1 9:-0.611906 10:0.930421 11:-0.284141 12:0.972686 13:-0.427859 14:0.984724 15:-0.442367 16:-0.0655697 17:0.509323 18:0.479945 19:0.244883 20:0.580084 21:0.554692 22:0.0803294
+-1 1:0.150323 2:0.35984 3:0.133919 4:0.286335 5:-0.150322 6:-0.35984 7:-0.978598 8:0.989614 9:-0.112132 10:0.375646 11:-0.243991 12:0.903139 13:-0.215751 14:0.790831 15:-0.266959 16:-0.939576 17:-0.0371604 18:-0.932835 19:-0.0659647 20:-0.591267 21:0.491697 22:-0.339613
diff --git a/CCSVM2.range b/CCSVM2.range
new file mode 100644
index 0000000..35a3bea
--- /dev/null
+++ b/CCSVM2.range
@@ -0,0 +1,24 @@
+x
+-1 1
+1 0 0.88008529
+2 0 0.7454545500000001
+3 0 0.88008529
+4 0 0.7454545500000001
+5 0.1199147 1
+6 0.25454545 1
+7 0.00169894 1
+8 0 1
+9 -0.17033307 0.71363628
+10 0 1
+11 -0.48724923 0.87749803
+12 0 1
+13 -0.32095683 0.80732721
+14 0 1
+15 -0.25240511 0.65639925
+16 0 1
+17 -0.37407586 0.12153328
+18 0 1
+19 -0.1868066 0.11459596
+20 0 1
+21 -0.71157825 0.20744637
+22 0.84306002 3.57725191
diff --git a/CCSVM2Phos.model b/CCSVM2Phos.model
new file mode 100644
index 0000000..7e3ad09
--- /dev/null
+++ b/CCSVM2Phos.model
@@ -0,0 +1,320 @@
+svm_type c_svc
+kernel_type rbf
+gamma 0.0434783
+nr_class 2
+total_sv 311
+rho 0.920224
+label 1 -1
+nr_sv 154 157
+SV
+1 1:-0.306225 2:0.532802 3:-0.306097 4:0.532802 5:0.306225 6:-0.532802 7:-0.272234 8:0.883795 9:-0.796615 10:-0.55418 11:-0.403206 12:0.986492 13:-0.00369742 14:0.910231 15:-0.11334 16:-0.150704 17:0.289522 18:-1 19:0.190512 20:-0.99268 21:-0.177757 22:-0.417847 23:-0.0632531
+1 1:-0.0349255 2:0.687495 3:-0.034747 4:0.687495 5:0.0349255 6:-0.687495 7:-0.769739 8:0.999893 9:-0.506393 10:0.998949 11:-0.184983 12:0.988026 13:-0.188819 14:0.988558 15:0.0156268 16:-0.995519 17:-0.171474 18:-0.992497 19:-0.227256 20:-0.936611 21:0.0848795 22:-0.108089 23:-0.0632531
+1 1:0.162597 2:0.710048 3:0.162812 4:0.710048 5:-0.162597 6:-0.710048 7:-0.868046 8:1 9:-0.779149 10:0.809524 11:-0.393677 12:0.844498 13:-0.24917 14:0.924883 15:-0.201451 16:-0.943969 17:0.191648 18:-0.882353 19:0.170514 20:-0.792148 21:0.194007 22:-0.860499 23:-0.0632531
+1 1:-0.148112 2:0.781048 3:-0.147954 4:0.781048 5:0.148112 6:-0.781048 7:-0.56218 8:0.925492 9:-0.803581 10:0.859259 11:-0.365957 12:0.722106 13:-0.370044 14:0.500133 15:-0.199406 16:-0.912046 17:0.183318 18:-0.897784 19:0.160519 20:-0.134615 21:0.216893 22:-0.37673 23:0.294318
+1 1:-0.516125 2:0.71301 3:-0.516036 4:0.71301 5:0.516125 6:-0.71301 7:0.112677 8:0.569825 9:-0.822344 10:-0.279817 11:-0.403771 12:0.878779 13:-0.406922 14:0.996107 15:-0.202869 16:-0.989414 17:0.287979 18:-0.884527 19:0.191749 20:-0.988495 21:0.208402 22:0.0415306 23:-0.0269604
+1 1:0.26407 2:0.718679 3:0.264304 4:0.718679 5:-0.26407 6:-0.718679 7:-0.681967 8:0.922756 9:-0.817504 10:0.986233 11:-0.400206 12:-0.168108 13:-0.572699 14:0.305759 15:-0.12954 16:-0.130582 17:0.240107 18:-0.606051 19:0.185066 20:-1 21:0.217473 22:-0.74586 23:-0.0632531
+1 1:0.247391 2:0.820807 3:0.247622 4:0.820807 5:-0.247391 6:-0.820807 7:-0.712553 8:0.870915 9:-0.762923 10:0.369565 11:-0.399138 12:0.973918 13:-0.222038 14:-0.298146 15:-0.244729 16:-1 17:0.256591 18:0.390625 19:0.198507 20:0.915294 21:0.230678 22:-0.761314 23:-0.0632531
+1 1:0.21859 2:0.814726 3:0.218208 4:0.790403 5:-0.21859 6:-0.814726 7:-0.765367 8:0.989133 9:-0.638863 10:0.97679 11:-0.326505 12:0.904967 13:-0.00390097 14:-0.920201 15:-0.248326 16:-0.970755 17:0.283935 18:-0.820225 19:0.190674 20:0.875278 21:0.218898 22:-0.604254 23:-0.0632531
+1 1:0.266839 2:0.707854 3:0.267073 4:0.707854 5:-0.266839 6:-0.707854 7:-0.676889 8:0.340359 9:-0.800598 10:-0.116088 11:-0.425256 12:0.98492 13:-0.0988582 14:1 15:0.19386 16:-0.74859 17:0.271083 18:-0.409987 19:0.190266 20:-0.941317 21:0.185219 22:-0.0567437 23:-0.0632531
+1 1:-0.0863616 2:0.746904 3:-0.086408 4:0.720273 5:0.0863615 6:-0.746904 7:-0.675416 8:0.982624 9:-0.622616 10:0.998664 11:-0.0607351 12:0.999703 13:-0.151633 14:0.998817 15:-0.1833 16:-0.98361 17:0.135573 18:-0.805345 19:0.16883 20:-0.474114 21:0.210039 22:-0.648321 23:-0.0632531
+1 1:0.658459 2:0.77898 3:0.657894 4:0.718507 5:-0.658459 6:-0.77898 7:0.0412571 8:-1 9:-0.834675 10:0.69236 11:-0.394934 12:0.996904 13:-0.406991 14:1 15:-0.20548 16:-1 17:0.254501 18:-1 19:0.191028 20:-0.855359 21:0.210253 22:-0.318576 23:-0.0632531
+1 1:-0.240495 2:0.854875 3:-0.240707 4:0.810917 5:0.240495 6:-0.854875 7:-0.392769 8:-0.993031 9:-0.824338 10:-0.914949 11:-0.402612 12:0.984478 13:-0.39279 14:0.977262 15:-0.205614 16:-0.997399 17:0.286937 18:-1 19:0.190992 20:-0.928168 21:0.214553 22:-0.440547 23:-0.907866
+1 1:0.188961 2:0.728386 3:0.189181 4:0.728386 5:-0.188961 6:-0.728386 7:-0.819701 8:0.985612 9:-0.519506 10:0.958438 11:0.0323777 12:0.975721 13:0.230589 14:0.934846 15:0.237687 16:-0.892632 17:0.253123 18:-0.924783 19:0.179293 20:-1 21:-0.104714 22:-0.438406 23:-0.0632531
+1 1:-0.216965 2:0.660397 3:-0.21682 4:0.660397 5:0.216964 6:-0.660397 7:-0.435919 8:-0.0357246 9:-0.824068 10:-0.511265 11:-0.406554 12:0.503939 13:-0.375255 14:0.147809 15:-0.204928 16:-0.126638 17:0.290443 18:-0.841281 19:0.187779 20:1 21:0.217734 22:-0.603584 23:0.35836
+1 1:0.126543 2:0.7712 3:0.126751 4:0.7712 5:-0.126543 6:-0.7712 7:-0.934163 8:0.470885 9:-0.809999 10:0.386412 11:-0.395955 12:0.98846 13:-0.317418 14:0.971743 15:-0.208852 16:-1 17:0.258772 18:-1 19:0.191955 20:0.144852 21:0.218132 22:-0.718059 23:-0.0632531
+1 1:0.482737 2:0.702424 3:0.482927 4:0.663609 5:-0.482737 6:-0.702424 7:-0.28098 8:-0.0253395 9:-0.824065 10:-0.96895 11:-0.428305 12:0.999925 13:-0.0203751 14:0.995765 15:-0.115595 16:-0.288267 17:0.290332 18:-0.00046284 19:0.192487 20:0.931399 21:0.219189 22:-0.87511 23:-0.0632531
+1 1:-0.120308 2:0.702424 3:-0.120145 4:0.702424 5:0.120308 6:-0.702424 7:-0.613166 8:0.996779 9:0.865883 10:0.955841 11:-0.376045 12:-0.824248 13:-0.432964 14:-0.20134 15:-0.211618 16:-1 17:0.290468 18:-1 19:0.192433 20:-0.985736 21:0.211291 22:-0.865218 23:-0.0632531
+1 1:0.336888 2:0.559248 3:0.337135 4:0.559248 5:-0.336888 6:-0.559248 7:-0.548435 8:0.577939 9:-0.447842 10:-0.910517 11:-0.536692 12:0.575363 13:0.487363 14:0.722909 15:-0.204055 16:-0.711574 17:0.288609 18:-0.0335134 19:0.192136 20:-0.946893 21:0.212278 22:-0.252141 23:-0.0632531
+1 1:-0.219208 2:0.762337 3:-0.219063 4:0.762337 5:0.219208 6:-0.762337 7:-0.431805 8:0.969195 9:-0.78788 10:0.985164 11:-0.344994 12:0.985881 13:-0.329152 14:0.931624 15:-0.161195 16:-0.988262 17:0.213492 18:-0.688027 19:0.186252 20:-0.977571 21:0.149049 22:-0.0547176 23:-0.0632531
+1 1:0.489092 2:0.684477 3:0.489367 4:0.684477 5:-0.489092 6:-0.684477 7:-0.269326 8:-0.724159 9:-0.835749 10:0.439437 11:-0.392025 12:0.995355 13:0.153908 14:0.99791 15:0.145176 16:-0.995236 17:0.273034 18:-0.99882 19:0.165674 20:0.899849 21:0.22736 22:-0.849311 23:-0.0632531
+1 1:-0.0501292 2:0.709536 3:-0.0499537 4:0.709536 5:0.0501292 6:-0.709536 7:-0.741859 8:1 9:-0.618351 10:0.999221 11:-0.242838 12:0.998154 13:-0.314072 14:0.992543 15:-0.158431 16:-0.869337 17:0.287157 18:-1 19:0.190409 20:0.215341 21:0.21862 22:-0.212347 23:-0.0632531
+1 1:0.508016 2:0.596471 3:0.508294 4:0.596471 5:-0.508016 6:-0.596471 7:-0.234624 8:0.42156 9:-0.815146 10:-1 11:-0.402245 12:0.907751 13:0.0251185 14:-0.892501 15:-0.209672 16:-0.351823 17:0.290131 18:0.94012 19:0.19969 20:1 21:0.218011 22:-0.989382 23:-0.0632531
+0.4341541218136421 1:-0.186337 2:0.775201 3:-0.186531 4:0.757659 5:0.186337 6:-0.775201 7:-0.492083 8:0.856329 9:-0.818178 10:0.485855 11:-0.399262 12:0.977313 13:-0.340842 14:0.870646 15:-0.171987 16:-0.600967 17:0.287615 18:-1 19:0.189426 20:-0.632363 21:0.214209 22:-0.295078 23:0.40084
+1 1:-0.428794 2:0.675359 3:-0.428689 4:0.675359 5:0.428794 6:-0.675359 7:-0.0474687 8:-0.855238 9:-0.825026 10:0.564168 11:-0.394448 12:-0.340595 13:-0.418474 14:0.112532 15:-0.204492 16:-0.948902 17:0.275856 18:-0.831832 19:0.186012 20:1 21:0.218246 22:-0.418392 23:0.363924
+0.8345004290608965 1:-0.219606 2:0.654477 3:-0.219462 4:0.654477 5:0.219606 6:-0.654477 7:-0.431075 8:1 9:-0.802097 10:0.961755 11:-0.400785 12:1 13:-0.382889 14:0.0393486 15:-0.209308 16:-1 17:0.284435 18:-0.284909 19:0.192418 20:1 21:0.287478 22:-0.778663 23:-0.0632531
+1 1:-0.217722 2:0.739195 3:-0.217577 4:0.739195 5:0.217722 6:-0.739195 7:-0.43453 8:0.998529 9:-0.702731 10:0.999669 11:-0.17032 12:0.99722 13:0.265464 14:0.998297 15:0.121132 16:-0.986812 17:-0.3273 18:-0.998055 19:-0.108091 20:-0.862809 21:0.209738 22:-0.545599 23:-0.0632531
+1 1:0.541535 2:0.633127 3:0.54182 4:0.633127 5:-0.541535 6:-0.633127 7:-0.173157 8:1 9:-0.559817 10:0.999773 11:-0.298788 12:0.99998 13:-0.236374 14:0.999971 15:-0.0653444 16:-0.759404 17:0.289709 18:0.410828 19:0.192591 20:0.910987 21:0.218508 22:-0.207797 23:-0.0632531
+1 1:0.119575 2:0.703533 3:0.119782 4:0.703533 5:-0.119575 6:-0.703533 7:-0.94694 8:1 9:-0.792408 10:0.259849 11:-0.206731 12:0.978056 13:0.0987599 14:0.996327 15:0.0999099 16:-0.97546 17:0.281481 18:-0.743737 19:0.170398 20:-0.18297 21:0.20432 22:-0.62244 23:-0.0632531
+1 1:0.144758 2:0.873902 3:0.14497 4:0.873902 5:-0.144758 6:-0.873902 7:-0.90076 8:0.969201 9:-0.758642 10:-0.996746 11:-0.49423 12:0.998751 13:0.0300521 14:0.785322 15:-0.201455 16:-1 17:0.290428 18:-1 19:0.192488 20:-0.771229 21:0.194938 22:-0.456922 23:-0.0632531
+1 1:-0.386043 2:0.614732 3:-0.385929 4:0.614732 5:0.386043 6:-0.614732 7:-0.125866 8:1 9:-0.805941 10:-1 11:-0.404572 12:0.889398 13:-0.342019 14:-1 15:-0.21345 16:-0.871646 17:0.239677 18:-0.941601 19:0.157741 20:-0.884363 21:0.211568 22:-0.652159 23:-0.617591
+1 1:0.0557555 2:0.637733 3:0.0559506 4:0.637733 5:-0.0557554 6:-0.637733 7:-0.936028 8:-0.0164683 9:-0.82549 10:0.144006 11:-0.390894 12:0.982952 13:-0.280561 14:0.392394 15:-0.199116 16:-0.574871 17:0.272508 18:-0.791519 19:0.174658 20:-1 21:0.200462 22:-0.556245 23:-0.0632531
+1 1:0.240387 2:0.557835 3:0.240616 4:0.557835 5:-0.240387 6:-0.557835 7:-0.725397 8:0.980389 9:-0.734141 10:1 11:-0.213811 12:0.995571 13:-0.234204 14:0.993438 15:-0.0535768 16:-0.726678 17:0.272805 18:-0.877184 19:0.189955 20:-0.0170749 21:0.217419 22:-0.0876851 23:-0.0632531
+1 1:0.00349736 2:0.826392 3:0.00101542 4:0.807777 5:-0.0034973 6:-0.826392 7:-0.840198 8:0.113419 9:-0.821843 10:-0.720149 11:-0.409123 12:-0.59719 13:-0.43442 14:-0.464864 15:-0.214859 16:-0.97627 17:-0.229407 18:-1 19:0.0784554 20:-0.849469 21:0.178391 22:-0.151814 23:-0.0632531
+1 1:-0.171103 2:0.572861 3:-0.17095 4:0.572861 5:0.171103 6:-0.572861 7:-0.520019 8:0.109346 9:-0.819378 10:0.260822 11:-0.381463 12:0.315848 13:-0.382266 14:0.194383 15:-0.195365 16:-0.949534 17:0.263764 18:-0.838685 19:0.167333 20:-0.084619 21:0.215628 22:-0.530801 23:0.339372
+1 1:-0.523278 2:0.658209 3:-0.52319 4:0.658209 5:0.523278 6:-0.658209 7:0.125795 8:0.966822 9:-0.823164 10:-1 11:-0.402194 12:0.98707 13:-0.398026 14:1 15:-0.208576 16:-0.998123 17:0.287543 18:-1 19:0.184289 20:1 21:0.218344 22:-0.745667 23:-0.0632531
+1 1:-0.28954 2:0.777822 3:-0.289408 4:0.777822 5:0.28954 6:-0.777822 7:-0.302832 8:0.996123 9:-0.743915 10:0.690819 11:-0.389177 12:0.910347 13:-0.388554 14:0.793317 15:-0.184427 16:-0.982833 17:0.0106509 18:-1 19:0.167176 20:-0.393081 21:0.162874 22:-0.148599 23:-0.0207082
+1 1:0.418861 2:0.744699 3:0.418827 4:0.72146 5:-0.418861 6:-0.744699 7:-0.398115 8:-0.276899 9:-0.832891 10:-0.462298 11:-0.413044 12:0.170994 13:-0.359996 14:-0.0541652 15:-0.227293 16:-0.984623 17:0.28776 18:-0.990695 19:0.188786 20:-0.323687 21:0.215724 22:-0.535869 23:-0.0632531
+1 1:0.107347 2:0.785115 3:0.106387 4:0.758676 5:-0.107348 6:-0.785115 7:-0.969363 8:-0.763671 9:-0.830769 10:1 11:-0.401404 12:0.745063 13:-0.259053 14:0.808729 15:-0.182904 16:-0.639518 17:0.284613 18:1 19:0.194282 20:-0.847581 21:0.217239 22:-0.398374 23:0.0552768
+1 1:0.0671629 2:0.789327 3:0.0673602 4:0.789327 5:-0.0671628 6:-0.789327 7:-0.956947 8:-0.271343 9:-0.846312 10:-0.657465 11:-0.464495 12:-0.214314 13:-0.689886 14:-0.517092 15:-1 16:-0.493849 17:0.290292 18:-0.195974 19:0.192513 20:0.945581 21:0.228781 22:-0.817859 23:-0.0632531
+1 1:0.15233 2:0.634037 3:0.152543 4:0.634037 5:-0.15233 6:-0.634037 7:-0.886875 8:0.985611 9:-0.671378 10:0.970543 11:-0.198614 12:0.903288 13:-0.239296 14:0.587315 15:-0.16984 16:-0.543036 17:0.27746 18:0.19393 19:0.203289 20:-0.301432 21:0.216377 22:-0.270404 23:-0.0632531
+1 1:-0.0453449 2:0.731436 3:-0.0451684 4:0.731436 5:0.0453447 6:-0.731437 7:-0.750632 8:-0.107448 9:-0.825215 10:0.70577 11:-0.334646 12:0.889888 13:-0.354492 14:0.847748 15:-0.173777 16:-0.787967 17:0.27012 18:-0.414971 19:0.187407 20:-0.773894 21:0.187805 22:0.0158976 23:-0.0632531
+1 1:0.0601876 2:0.632558 3:0.0603836 4:0.632558 5:-0.0601876 6:-0.632558 7:-0.944156 8:0.383126 9:-0.80594 10:0.0295566 11:-0.400374 12:0.925 13:-0.218222 14:0.946588 15:-0.145421 16:-0.986207 17:0.225916 18:-0.930769 19:0.143359 20:-0.419476 21:0.191983 22:0.0253624 23:-0.0632531
+1 1:-0.161997 2:0.797194 3:-0.161842 4:0.797194 5:0.161997 6:-0.797194 7:-0.536717 8:0.942067 9:-0.785248 10:0.989778 11:-0.374849 12:0.972677 13:-0.346326 14:0.974429 15:-0.153014 16:-0.717805 17:0.262764 18:-1 19:0.192361 20:-0.746421 21:0.212703 22:-0.0957457 23:0.403285
+1 1:-0.100918 2:0.606783 3:-0.100752 4:0.606783 5:0.100918 6:-0.606783 7:-0.648723 8:0.972445 9:-0.759378 10:0.838443 11:-0.375796 12:1 13:-0.207455 14:0.950135 15:-0.114143 16:-0.556339 17:0.246156 18:-1 19:0.181752 20:-0.719222 21:0.1549 22:0.0582286 23:-0.0632531
+1 1:0.178708 2:0.749864 3:0.178926 4:0.749864 5:-0.178708 6:-0.749864 7:-0.838503 8:0.516482 9:-0.78854 10:0.883464 11:-0.330934 12:0.818683 13:-0.152922 14:0.731571 15:-0.151949 16:-1 17:0.17631 18:-0.640448 19:0.181283 20:0.437181 21:0.229119 22:-0.294616 23:-0.0632531
+1 1:0.10186 2:0.605388 3:0.102064 4:0.605388 5:-0.10186 6:-0.605388 7:-0.979425 8:0.99038 9:-0.783032 10:0.65055 11:-0.373562 12:0.996776 13:-0.191403 14:0.975602 15:-0.179632 16:-0.74613 17:0.284487 18:-0.463557 19:0.187998 20:0.712418 21:0.2279 22:-0.304853 23:-0.0632531
+1 1:0.686141 2:0.693947 3:0.686453 4:0.693947 5:-0.686141 6:-0.693947 7:0.0920201 8:0.990604 9:-0.821792 10:0.976436 11:-0.400575 12:0.982894 13:0.0729861 14:0.0770402 15:-0.207491 16:-0.923776 17:0.290511 18:-0.623337 19:0.192383 20:-0.986807 21:0.217521 22:-0.935983 23:-0.0632531
+1 1:0.0635538 2:0.575232 3:0.0637503 4:0.575232 5:-0.0635539 6:-0.575232 7:-0.950329 8:0.957018 9:-0.802388 10:0.918807 11:-0.391858 12:0.999203 13:-0.255428 14:1 15:-0.113911 16:-0.0239476 17:0.289595 18:-0.57802 19:0.17813 20:-0.880769 21:0.205445 22:0.117657 23:-0.0632531
+1 1:0.207202 2:0.722577 3:0.207425 4:0.722577 5:-0.207202 6:-0.722577 7:-0.786252 8:0.615963 9:-0.744839 10:0.302986 11:-0.363134 12:0.431554 13:-0.142151 14:0.179229 15:-0.160378 16:-0.727251 17:0.0231157 18:-0.869577 19:0.0309888 20:-0.91894 21:0.203818 22:-0.77315 23:-0.0632531
+1 1:0.116612 2:0.751932 3:0.115789 4:0.733317 5:-0.116612 6:-0.751932 7:-0.952374 8:0.874878 9:-0.613366 10:0.717092 11:-0.284599 12:0.998957 13:-0.161959 14:0.917417 15:-0.135463 16:-0.915587 17:0.13432 18:-0.819163 19:0.00232518 20:-0.258316 21:0.206694 22:-0.232752 23:-0.0632531
+1 1:-0.387804 2:0.695552 3:-0.38769 4:0.695552 5:0.387804 6:-0.695552 7:-0.122637 8:0.973853 9:-0.811555 10:0.969349 11:-0.381948 12:0.987446 13:-0.311156 14:0.986849 15:-0.13086 16:-0.936836 17:0.257818 18:-0.965243 19:0.158503 20:-1 21:0.213802 22:-0.23223 23:-0.0632531
+1 1:0.532303 2:0.729594 3:0.532586 4:0.729594 5:-0.532303 6:-0.729594 7:-0.190086 8:1 9:-0.783825 10:1 11:-0.393876 12:1 13:-0.172228 14:0.99315 15:-0.0120141 16:-0.401428 17:0.27125 18:-0.782135 19:0.191253 20:-0.340801 21:0.217167 22:-0.690986 23:-0.0632531
+1 1:0.245824 2:0.706206 3:0.246054 4:0.706206 5:-0.245824 6:-0.706206 7:-0.715427 8:0.993616 9:-0.328227 10:0.986997 11:0.216223 12:0.991176 13:0.0550569 14:0.971326 15:0.0431528 16:-0.912875 17:0.243873 18:-0.944168 19:0.161733 20:-0.579504 21:0.20411 22:-0.107436 23:-0.0632531
+1 1:-0.00832371 2:0.672979 3:-0.00814046 4:0.672979 5:0.0083239 6:-0.672979 7:-0.818521 8:0.297334 9:-0.811032 10:0.298776 11:-0.388401 12:1 13:-0.273576 14:0.990446 15:-0.0677674 16:-0.53247 17:0.283952 18:-1 19:0.184584 20:1 21:0.223036 22:-0.688598 23:-0.0632531
+1 1:0.132734 2:0.547167 3:0.132944 4:0.547167 5:-0.132734 6:-0.547167 7:-0.922809 8:0.947276 9:-0.812591 10:0.768566 11:-0.25118 12:0.0494058 13:-0.393631 14:-0.222756 15:-0.282622 16:-0.702472 17:0.194362 18:-0.984063 19:0.0937532 20:-0.85342 21:0.214302 22:-0.563088 23:-0.0632531
+1 1:0.0314578 2:0.845684 3:0.0316484 4:0.845684 5:-0.0314578 6:-0.845684 7:-0.891472 8:0.965987 9:-0.806981 10:-0.876213 11:-0.495916 12:0.977565 13:-0.313275 14:1 15:-0.206214 16:-1 17:0.289785 18:-1 19:0.192404 20:0.977372 21:0.224864 22:-0.725741 23:-0.0632531
+1 1:0.455671 2:0.731008 3:0.45594 4:0.731008 5:-0.455671 6:-0.731008 7:-0.330613 8:0.359655 9:-0.821171 10:0.103563 11:-0.400356 12:0.519318 13:-0.268662 14:-0.12904 15:-0.253595 16:0.388154 17:0.303146 18:0.198657 19:0.209778 20:0.842271 21:0.222274 22:-0.641625 23:-0.0632531
+1 1:0.140156 2:0.687495 3:0.140367 4:0.687495 5:-0.140156 6:-0.687495 7:-0.909199 8:0.997556 9:-0.652371 10:0.503889 11:0.0201712 12:0.635657 13:0.0892422 14:0.470433 15:0.0133549 16:-0.950448 17:0.172844 18:-0.846154 19:0.156472 20:-0.757293 21:0.188762 22:-0.457396 23:-0.0632531
+1 1:0.0734864 2:0.66634 3:0.0736848 4:0.66634 5:-0.0734864 6:-0.66634 7:-0.968543 8:0.953281 9:-0.54209 10:0.947652 11:0.0939237 12:0.899415 13:0.211877 14:0.815257 15:0.0789465 16:-0.989869 17:0.201174 18:-0.892012 19:0.03788 20:-0.126484 21:0.21601 22:0.0384862 23:-0.0632531
+0.6440765177238077 1:-0.206812 2:0.711586 3:-0.206666 4:0.711586 5:0.206812 6:-0.711586 7:-0.454536 8:1 9:-0.781289 10:1 11:-0.38774 12:0.833333 13:-0.359038 14:0.977579 15:-0.185706 16:0.116635 17:0.292787 18:-0.128571 19:0.18952 20:-0.927978 21:0.202014 22:-0.00174955 23:-0.0632531
+0.566826615180688 1:-0.16809 2:0.720273 3:-0.167937 4:0.720273 5:0.16809 6:-0.720273 7:-0.525543 8:0.998939 9:-0.654107 10:0.999699 11:-0.300221 12:-0.0231844 13:-0.416526 14:-0.358835 15:-0.24606 16:-0.261239 17:0.268799 18:-0.793803 19:0.14198 20:0.766706 21:0.222821 22:-0.536773 23:0.180826
+1 1:-0.129706 2:0.698046 3:-0.129545 4:0.698046 5:0.129706 6:-0.698046 7:-0.595932 8:0.513905 9:-0.817739 10:-0.726664 11:-0.42855 12:0.695017 13:-0.343012 14:0.0655372 15:-0.207771 16:-0.60691 17:0.266154 18:-0.93658 19:0.139886 20:-0.89167 21:0.204805 22:-0.52815 23:0.0720403
+1 1:-0.0729949 2:0.577403 3:-0.0728235 4:0.577403 5:0.0729949 6:-0.577403 7:-0.699928 8:0.699864 9:-0.776629 10:0.937796 11:-0.352453 12:0.883473 13:-0.329476 14:0.663417 15:-0.184604 16:-0.861255 17:0.265568 18:-0.863081 19:0.175518 20:-0.693931 21:0.210424 22:-0.412549 23:-0.0632531
+1 1:-0.378084 2:0.483157 3:-0.377969 4:0.483157 5:0.378084 6:-0.483157 7:-0.14046 8:0.855937 9:-0.819031 10:-1 11:-0.402191 12:0.787256 13:-0.387222 14:1 15:-0.207637 16:-0.827411 17:0.276106 18:-1 19:0.162785 20:0.710157 21:0.226291 22:-0.278221 23:0.23432
+1 1:0.101276 2:0.687495 3:0.10148 4:0.687495 5:-0.101276 6:-0.687495 7:-0.980496 8:0.994648 9:-0.512788 10:0.982353 11:-0.330881 12:0.999286 13:0.0130847 14:0.998448 15:-0.0320905 16:-0.989002 17:0.0672119 18:-0.984202 19:0.0431935 20:-0.637362 21:0.215716 22:-0.508034 23:-0.0632531
+1 1:-0.237115 2:0.890075 3:-0.237079 4:0.862642 5:0.237115 6:-0.890075 7:-0.398967 8:0.998298 9:0.835756 10:-1 11:-0.402212 12:-0.293894 13:-0.415261 14:0.983201 15:-0.207746 16:-1 17:0.289603 18:-1 19:0.190196 20:1 21:0.218281 22:-0.674865 23:-0.0632531
+1 1:-0.170443 2:0.657252 3:-0.17029 4:0.657252 5:0.170443 6:-0.657252 7:-0.521229 8:0.97105 9:-0.790875 10:0.77564 11:-0.37038 12:0.998583 13:-0.240406 14:0.853846 15:-0.208412 16:-0.99897 17:0.008621 18:-0.897032 19:0.156762 20:-0.885098 21:0.204586 22:-0.0829299 23:-0.0632531
+1 1:-0.331121 2:0.761787 3:-0.330997 4:0.761787 5:0.331121 6:-0.761787 7:-0.22658 8:0.22236 9:-0.8228 10:0.693405 11:-0.396567 12:0.59197 13:-0.388919 14:0.304452 15:-0.199639 16:-1 17:0.279122 18:-0.0214689 19:0.19212 20:-0.272727 21:0.216024 22:-0.457318 23:-0.0632531
+1 1:-0.0598278 2:0.468824 3:-0.0623656 4:0.424056 5:0.0598277 6:-0.468824 7:-0.724074 8:0.994709 9:-0.647303 10:0.944828 11:-0.320249 12:0.989095 13:-0.11863 14:1 15:-0.126456 16:-0.489051 17:0.284568 18:-0.871743 19:0.163422 20:-0.995342 21:-0.199482 22:-0.574823 23:-0.0632531
+1 1:0.0483076 2:0.599891 3:0.0485014 4:0.599891 5:-0.0483076 6:-0.599891 7:-0.922371 8:-0.346535 9:-0.831795 10:0.314286 11:-0.400262 12:0.987571 13:-0.289406 14:-0.435407 15:-0.223449 16:-0.724687 17:0.223867 18:-0.263158 19:0.186611 20:-0.866667 21:0.207226 22:-0.508339 23:-0.0632531
+1 1:0.168241 2:0.857417 3:0.168457 4:0.857417 5:-0.168242 6:-0.857417 7:-0.857696 8:0.894682 9:-0.781288 10:0.983026 11:-0.280529 12:0.984492 13:-0.107579 14:0.954739 15:-0.114122 16:-0.962573 17:0.130208 18:-1 19:0.0573685 20:-0.173176 21:0.204548 22:-0.327724 23:-0.0632531
+1 1:-0.236923 2:0.660539 3:-0.236782 4:0.660539 5:0.236923 6:-0.66054 7:-0.399319 8:0.890501 9:-0.444824 10:0.84965 11:-0.21866 12:0.78618 13:-0.230125 14:0.628217 15:-0.142747 16:-0.988246 17:0.231274 18:-0.914408 19:0.120629 20:0.745898 21:0.220422 22:-0.182255 23:-0.0434058
+1 1:-0.223964 2:0.699189 3:-0.22382 4:0.699189 5:0.223964 6:-0.699189 7:-0.423083 8:0.781026 9:-0.81465 10:0.925069 11:-0.36878 12:0.699332 13:-0.238442 14:0.575473 15:-0.157387 16:-1 17:0.272486 18:-0.751666 19:0.189249 20:-0.759764 21:0.217128 22:-0.864747 23:-0.0632531
+1 1:0.136638 2:0.74585 3:0.136848 4:0.74585 5:-0.136638 6:-0.74585 7:-0.915651 8:-0.220876 9:-0.824551 10:-1 11:-0.402217 12:-0.520886 13:-0.890013 14:1 15:-0.208168 16:-0.873023 17:-0.751058 18:-0.995182 19:0.180066 20:-0.527121 21:0.217419 22:-0.853771 23:-0.0632531
+1 1:-0.312793 2:0.678192 3:-0.312666 4:0.678192 5:0.312793 6:-0.678192 7:-0.260189 8:0.929336 9:-0.71202 10:0.960239 11:-0.179472 12:0.999853 13:-0.170301 14:0.999872 15:-0.0646316 16:-0.952787 17:0.0383451 18:-0.998824 19:-0.071344 20:-0.355805 21:0.217478 22:-0.229235 23:0.0244024
+1 1:-0.0192814 2:0.739195 3:-0.019323 4:0.675186 5:0.0192814 6:-0.739195 7:-0.798427 8:0.968521 9:-0.822993 10:-1 11:-0.402897 12:0.989899 13:-0.383999 14:-0.985027 15:-0.276541 16:-0.965286 17:0.273968 18:-0.136267 19:0.19238 20:0.742633 21:0.221264 22:-0.809248 23:-0.0632531
+1 1:-0.235987 2:0.646672 3:-0.235846 4:0.646672 5:0.235988 6:-0.646672 7:-0.401035 8:-0.114108 9:-0.824104 10:0.669285 11:-0.400872 12:1 13:-0.184714 14:0.995031 15:-0.0526796 16:-0.872314 17:0.285868 18:-1 19:0.191453 20:-1 21:0.210279 22:-0.0429919 23:0.404365
+1 1:-0.0297984 2:0.656623 3:-0.029619 4:0.656623 5:0.0297985 6:-0.656623 7:-0.779141 8:0.993438 9:-0.700997 10:0.977482 11:-0.373951 12:0.990358 13:0.299084 14:0.972454 15:0.0492867 16:-0.974579 17:0.0640217 18:-1 19:0.159524 20:0.376872 21:0.219493 22:0.0257981 23:-0.0632531
+1 1:0.27701 2:0.757659 3:0.277246 4:0.757659 5:-0.27701 6:-0.75766 7:-0.658238 8:0.289403 9:-0.81095 10:0.885804 11:-0.289817 12:0.981413 13:0.0860793 14:0.996706 15:0.0800199 16:-0.97378 17:0.256406 18:-1 19:0.192488 20:-0.469311 21:0.151961 22:-0.317927 23:-0.0632531
+1 1:-0.342565 2:0.503732 3:-0.342443 4:0.503732 5:0.342565 6:-0.503732 7:-0.205595 8:0.90533 9:-0.823638 10:0.840146 11:-0.402121 12:0.827948 13:-0.407506 14:-0.96634 15:-0.210014 16:-1 17:0.161712 18:-1 19:-0.0629136 20:-0.95995 21:0.213501 22:-0.880661 23:0.254755
+1 1:0.576811 2:0.758398 3:0.577103 4:0.758398 5:-0.576811 6:-0.758398 7:-0.108467 8:0.987384 9:-0.789352 10:0.858425 11:-0.387642 12:0.981544 13:0.0196823 14:1 15:-0.136626 16:0.954226 17:0.308156 18:-1 19:0.192429 20:-0.999398 21:0.0830696 22:-0.753143 23:-0.0632531
+1 1:-0.139193 2:0.727341 3:-0.139587 4:0.70482 5:0.139193 6:-0.727342 7:-0.578535 8:-0.782106 9:-0.840057 10:-0.274142 11:-0.40687 12:0.946582 13:0.0948028 14:0.984362 15:-0.151443 16:-0.414223 17:0.278427 18:-0.817138 19:0.179467 20:-1 21:0.197958 22:-0.355559 23:-0.00919147
+1 1:0.261415 2:0.773316 3:0.261648 4:0.773316 5:-0.261414 6:-0.773316 7:-0.686837 8:-1 9:-0.824609 10:0.980584 11:-0.360146 12:0.874048 13:-0.274443 14:0.981657 15:-0.156209 16:-1 17:0.273757 18:-0.468715 19:0.182453 20:0.437444 21:0.227839 22:-0.556424 23:-0.0632531
+0.7428170623900131 1:0.152541 2:0.563311 3:0.152754 4:0.563311 5:-0.152541 6:-0.563311 7:-0.886488 8:0.899953 9:-0.640214 10:0.91024 11:-0.362241 12:0.939472 13:0.327847 14:0.938259 15:-0.0187221 16:-0.894873 17:0.094033 18:-1 19:0.171848 20:-0.650007 21:0.197296 22:-0.708569 23:-0.0632531
+1 1:-0.248337 2:0.699189 3:-0.248198 4:0.699189 5:0.248337 6:-0.699189 7:-0.378388 8:1 9:-0.823464 10:0.797255 11:-0.400856 12:0.956463 13:-0.377113 14:0.884177 15:-0.192558 16:-1 17:0.289144 18:-0.326411 19:0.192036 20:-0.721111 21:0.214353 22:-0.871277 23:-0.0632531
+1 1:0.000682715 2:0.707854 3:0.000867698 4:0.707854 5:-0.000682792 6:-0.707854 7:-0.835037 8:0.843388 9:-0.737043 10:0.693171 11:-0.255865 12:0.870157 13:-0.255591 14:0.888411 15:-0.00403872 16:-0.9059 17:0.243196 18:-0.783729 19:0.137626 20:-0.144475 21:0.212808 22:-0.213076 23:-0.0632531
+1 1:-0.0189278 2:0.71301 3:-0.0187465 4:0.71301 5:0.0189278 6:-0.71301 7:-0.799075 8:1 9:-0.771747 10:1 11:0.16858 12:0.985637 13:-0.140125 14:0.978262 15:0.010199 16:-0.895109 17:0.273922 18:-0.869634 19:0.16182 20:-0.468155 21:0.211244 22:-0.0973053 23:0.148949
+1 1:0.314176 2:0.654477 3:0.314419 4:0.654477 5:-0.314176 6:-0.654477 7:-0.590083 8:0.987678 9:-0.4149 10:0.989072 11:0.0394218 12:0.937717 13:-0.00426253 14:0.91183 15:0.0941794 16:-1 17:0.290548 18:-1 19:0.188761 20:0.265623 21:0.218638 22:-0.0956312 23:-0.0632531
+1 1:0.0374852 2:0.193204 3:0.037677 4:0.193204 5:-0.0374852 6:-0.193204 7:-0.902525 8:1 9:-0.750877 10:-0.757549 11:-0.525479 12:0.915461 13:0.281952 14:0.688736 15:-0.0798519 16:0.0488372 17:0.293726 18:-1 19:0.192488 20:-1 21:0.217526 22:-0.652002 23:-0.0632531
+1 1:-0.0401896 2:0.802376 3:-0.0400122 4:0.802376 5:0.0401897 6:-0.802376 7:-0.760086 8:0.619918 9:-0.780856 10:0.298471 11:-0.391417 12:0.782492 13:-0.16892 14:0.380992 15:-0.197757 16:-0.401451 17:0.266574 18:-0.943662 19:0.161187 20:0.355089 21:0.227519 22:-0.37085 23:-0.0632531
+1 1:0.593364 2:0.721662 3:0.593659 4:0.721662 5:-0.593364 6:-0.721662 7:-0.0781126 8:0.327007 9:-0.817028 10:0.435712 11:-0.395637 12:0.334403 13:-0.263916 14:0.333923 15:-0.133776 16:0.200581 17:0.292005 18:0.996591 19:0.192922 20:-0.292007 21:0.217495 22:-0.662365 23:-0.0632531
+1 1:-0.252656 2:0.57582 3:-0.252518 4:0.57582 5:0.252656 6:-0.57582 7:-0.370469 8:-0.870265 9:-0.838649 10:-0.993001 11:-0.424336 12:0.855285 13:-0.301459 14:0.0717378 15:-0.206151 16:-0.915042 17:0.231242 18:-1 19:0.182447 20:0.458941 21:0.221055 22:-0.182083 23:0.138912
+1 1:-0.0849132 2:0.791886 3:-0.0847441 4:0.791886 5:0.0849133 6:-0.791886 7:-0.678072 8:1 9:-0.683607 10:0.332315 11:-0.401688 12:0.855131 13:-0.330215 14:-1 15:-0.210893 16:-0.684838 17:0.267898 18:-0.792907 19:0.182893 20:-0.847203 21:0.199504 22:-0.2918 23:0.146189
+0.655182527845297 1:0.137335 2:0.862642 3:0.137101 4:0.835209 5:-0.137335 6:-0.862642 7:-0.914373 8:0.837256 9:-0.817288 10:0.987549 11:-0.387045 12:0.285412 13:-0.412263 14:-0.91026 15:-0.215793 16:0.965595 17:0.915413 18:0.897666 19:0.361581 20:-0.956581 21:0.214418 22:-0.551314 23:-0.0632531
+1 1:0.0316125 2:0.770547 3:0.0310414 4:0.751932 5:-0.0316125 6:-0.770547 7:-0.891755 8:-0.0106301 9:-0.824543 10:-0.224223 11:-0.405134 12:0.475421 13:-0.325222 14:0.344676 15:-0.146767 16:-0.875327 17:0.26912 18:0.952775 19:0.21573 20:-0.211055 21:0.216167 22:-0.279366 23:-0.053392
+1 1:-0.170342 2:0.310862 3:-0.170189 4:0.310862 5:0.170342 6:-0.310862 7:-0.521414 8:1 9:-0.79365 10:0.990465 11:-0.297924 12:1 13:-0.369465 14:1 15:-0.188219 16:0.0898204 17:0.291671 18:-0.207547 19:0.186937 20:-0.945018 21:-0.448259 22:0.336798 23:0.0575001
+1 1:0.0297766 2:0.76474 3:0.0299669 4:0.76474 5:-0.0297767 6:-0.76474 7:-0.888389 8:0.146848 9:-0.807663 10:-1 11:-0.615893 12:0.0493344 13:-0.395803 14:-1 15:-0.681691 16:0.574712 17:0.495383 18:-0.405405 19:0.191986 20:0.270015 21:0.472228 22:-0.848925 23:-0.0632531
+1 1:0.212054 2:0.858627 3:0.212278 4:0.858627 5:-0.212054 6:-0.858628 7:-0.777353 8:1 9:-0.666995 10:-0.324512 11:-0.467039 12:1 13:-0.139155 14:1 15:-0.133725 16:-0.98472 17:-0.149672 18:-0.993251 19:-0.0659815 20:0.737368 21:0.220036 22:-0.208711 23:-0.0632531
+1 1:-0.0887151 2:0.787202 3:-0.0890203 4:0.739195 5:0.0887151 6:-0.787202 7:-0.671101 8:0.663843 9:-0.731821 10:-0.201834 11:-0.406241 12:-0.21059 13:-0.453487 14:-0.43447 15:-0.25778 16:-0.441863 17:0.276802 18:1 19:0.208065 20:-0.883309 21:0.207454 22:-0.492605 23:0.369715
+1 1:-0.241389 2:0.851212 3:-0.241624 4:0.783646 5:0.241389 6:-0.851212 7:-0.39113 8:-0.924655 9:-0.895289 10:0.0273721 11:-0.400054 12:0.982632 13:-0.317019 14:0.994482 15:-0.193302 16:0.997846 17:0.337517 18:1 19:0.195872 20:-0.992619 21:-0.825363 22:-0.820041 23:-0.0632531
+1 1:0.319385 2:0.732686 3:0.319629 4:0.732686 5:-0.319385 6:-0.732686 7:-0.580531 8:1 9:-0.736953 10:-0.68749 11:-0.462725 12:-0.047668 13:-0.455591 14:-0.559951 15:-0.260023 16:-0.666956 17:0.202791 18:-0.959572 19:0.176329 20:-0.752905 21:0.140323 22:-0.73553 23:-0.0632531
+1 1:-0.130918 2:0.653341 3:-0.130758 4:0.653341 5:0.130918 6:-0.653341 7:-0.593709 8:0.703526 9:-0.817849 10:0.688669 11:-0.379211 12:0.795469 13:-0.371665 14:0.696061 15:-0.173147 16:-0.988898 17:0.236125 18:-0.789147 19:0.146776 20:-0.311967 21:0.209821 22:0.0130777 23:-0.0632531
+1 1:0.391936 2:0.783904 3:0.392077 4:0.756053 5:-0.391936 6:-0.783904 7:-0.447489 8:0.34343 9:-0.7257 10:0.479118 11:-0.332733 12:0.99375 13:-0.0345791 14:0.998113 15:-0.00424696 16:-0.980366 17:0.160478 18:-0.845197 19:0.121106 20:-0.910691 21:0.156709 22:-0.629092 23:-0.0632531
+1 1:0.267094 2:0.676631 3:0.267329 4:0.676631 5:-0.267094 6:-0.676631 7:-0.676421 8:1 9:-0.673153 10:0.173562 11:-0.392116 12:-0.0018655 13:-0.415182 14:0.89756 15:-0.182894 16:-0.471607 17:0.25765 18:-0.854545 19:0.18995 20:-0.356629 21:0.212264 22:-0.507379 23:-0.0632531
+1 1:0.0959064 2:0.849586 3:0.0944512 4:0.791053 5:-0.0959064 6:-0.849586 7:-0.990343 8:0.974251 9:-0.748831 10:0.919141 11:-0.310543 12:0.893121 13:-0.269683 14:0.778715 15:-0.132585 16:-0.930518 17:0.210268 18:-0.652816 19:0.187104 20:-0.22755 21:0.216872 22:-0.221957 23:0.197357
+1 1:-0.196263 2:0.746178 3:-0.196215 4:0.71301 5:0.196263 6:-0.746178 7:-0.473881 8:0.964844 9:-0.657281 10:0.352147 11:-0.265404 12:0.978448 13:-0.006587 14:0.928788 15:-0.13768 16:-0.945645 17:0.126833 18:-0.894325 19:0.175177 20:-0.983234 21:0.00336787 22:-0.722465 23:0.0397413
+1 1:0.197841 2:0.745457 3:0.198062 4:0.745457 5:-0.197841 6:-0.745457 7:-0.803418 8:0.954179 9:-0.793629 10:-0.489741 11:-0.438176 12:0.999942 13:-0.171646 14:0.999781 15:-0.0567855 16:-0.94005 17:0.271378 18:-0.287684 19:0.176573 20:1 21:0.220368 22:-0.952607 23:-0.0632531
+1 1:-0.316112 2:0.737195 3:-0.315986 4:0.737195 5:0.316112 6:-0.737195 7:-0.254104 8:-0.0503979 9:-0.823972 10:0.981812 11:-0.391155 12:0.975472 13:-0.354869 14:0.929745 15:-0.18953 16:-0.937282 17:0.26277 18:-0.906634 19:0.178873 20:0.693796 21:0.253616 22:-0.247211 23:0.195339
+1 1:0.203057 2:0.604775 3:0.203279 4:0.604775 5:-0.203057 6:-0.604775 7:-0.793852 8:1 9:-0.765451 10:-0.25 11:-0.404695 12:0.789996 13:0.214549 14:-0.89545 15:-0.464871 16:-0.261905 17:0.286437 18:-0.97351 19:0.161646 20:0.408696 21:0.223065 22:-0.957098 23:-0.0632531
+1 1:0.163407 2:0.746178 3:0.163622 4:0.746178 5:-0.163407 6:-0.746178 7:-0.866562 8:0.865534 9:-0.814725 10:-0.111186 11:-0.45802 12:0.141573 13:-0.314879 14:-0.0813665 15:-0.211221 16:-0.860756 17:-0.314961 18:-0.849228 19:0.109748 20:-0.978765 21:0.203833 22:-0.665168 23:-0.0632531
+1 1:0.132525 2:0.606654 3:0.131936 4:0.586825 5:-0.132525 6:-0.606654 7:-0.923193 8:0.882583 9:-0.806111 10:0.688871 11:-0.322639 12:0.919744 13:-0.109111 14:1 15:-0.000878276 16:-0.542225 17:0.239351 18:-0.365741 19:0.153461 20:-0.353748 21:0.19068 22:-0.414726 23:-0.0632531
+1 1:0.562699 2:0.801716 3:0.562988 4:0.801716 5:-0.562699 6:-0.801716 7:-0.134346 8:0.458081 9:-0.817054 10:0.437927 11:-0.386207 12:0.956342 13:0.130385 14:0.991192 15:0.141364 16:-1 17:0.246428 18:-1 19:0.157161 20:1 21:0.217602 22:-0.849358 23:-0.0632531
+0.2988193557278515 1:0.347432 2:0.878852 3:0.346528 4:0.812515 5:-0.347432 6:-0.878852 7:-0.5291 8:0.844587 9:-0.816957 10:0.976674 11:-0.394422 12:0.972622 13:-0.347815 14:0.960071 15:-0.198813 16:-0.772482 17:0.0792672 18:0.58651 19:0.193074 20:0.743247 21:0.225391 22:-0.83757 23:-0.0632531
+1 1:0.0795594 2:0.747248 3:0.0797589 4:0.747248 5:-0.0795594 6:-0.747248 7:-0.97968 8:-0.319205 9:-0.897828 10:-0.569646 11:-0.690431 12:0.953629 13:-0.104135 14:1 15:-0.0846447 16:0.410645 17:0.30153 18:-0.0269685 19:0.190532 20:-0.763702 21:0.109011 22:-0.355503 23:-0.0632531
+1 1:0.059314 2:0.752031 3:0.0595098 4:0.752031 5:-0.059314 6:-0.752031 7:-0.942554 8:0.873817 9:-0.801572 10:1 11:-0.309524 12:0.921663 13:-0.280381 14:0.792936 15:-0.155293 16:-0.829146 17:0.253466 18:-0.598662 19:0.169901 20:-0.462898 21:0.208241 22:-0.372094 23:-0.0632531
+1 1:-0.273901 2:0.493512 3:-0.273766 4:0.493512 5:0.273901 6:-0.493512 7:-0.33151 8:1 9:-0.822017 10:1 11:-0.401936 12:1 13:-0.395406 14:-0.346396 15:-0.21447 16:-0.522133 17:0.273192 18:-0.0192103 19:0.192306 20:-0.498588 21:0.209083 22:-0.939724 23:-0.71051
+1 1:-0.187247 2:0.895924 3:-0.188202 4:0.734959 5:0.187247 6:-0.895924 7:-0.490414 8:0.999854 9:0.343868 10:0.768621 11:-0.401621 12:-0.807611 13:-0.416157 14:-0.191769 15:-0.210551 16:-0.980761 17:0.285657 18:-1 19:0.189957 20:-0.325897 21:0.217286 22:-0.710228 23:-0.0632531
+1 1:0.157469 2:0.778087 3:0.157683 4:0.778087 5:-0.157469 6:-0.778087 7:-0.877451 8:0.748084 9:-0.742003 10:0.743259 11:-0.358747 12:0.734411 13:-0.144515 14:0.705709 15:-0.166213 16:-1 17:0.245888 18:-0.508607 19:0.165835 20:-1 21:0.192053 22:-0.254484 23:-0.0632531
+1 1:-0.320087 2:0.641759 3:-0.319961 4:0.641759 5:0.320087 6:-0.641759 7:-0.246815 8:-1 9:-0.823911 10:0.762277 11:-0.389191 12:0.795418 13:-0.376996 14:0.940983 15:-0.200583 16:-1 17:0.282901 18:-0.685962 19:0.190397 20:-0.804593 21:0.21726 22:-0.574855 23:0.506934
+1 1:-0.379339 2:0.731008 3:-0.379224 4:0.731008 5:0.379339 6:-0.731008 7:-0.138159 8:1 9:-0.823674 10:-0.968733 11:-0.459563 12:0.991014 13:-0.387902 14:-1 15:-0.209367 16:-1 17:0.190889 18:-0.988973 19:0.162569 20:0.857366 21:0.217784 22:-0.59747 23:0.825435
+1 1:-0.0079766 2:0.665626 3:-0.00779315 4:0.665626 5:0.00797652 6:-0.665626 7:-0.819158 8:0.987081 9:-0.791062 10:0.994786 11:-0.356565 12:0.997747 13:-0.109997 14:0.991928 15:0.0140392 16:-0.846008 17:0.26754 18:-0.51638 19:0.188775 20:-1 21:0.209277 22:-0.242523 23:-0.0632531
+1 1:-0.714753 2:0.558786 3:-0.7147 4:0.558786 5:0.714752 6:-0.558786 7:0.476917 8:0.85285 9:-0.823786 10:0.531205 11:-0.40192 12:1 13:-0.409548 14:1 15:-0.208435 16:-1 17:0.28923 18:-0.635248 19:0.192387 20:-0.17749 21:0.217473 22:-0.157493 23:-0.0632531
+1 1:0.132004 2:0.631041 3:0.132213 4:0.631041 5:-0.132004 6:-0.631041 7:-0.924148 8:0.815261 9:-0.814086 10:0.526578 11:-0.384288 12:0.989916 13:-0.154592 14:0.997116 15:-0.0198444 16:-0.933528 17:0.263562 18:0.380783 19:0.196539 20:-0.347594 21:0.214762 22:-0.4581 23:-0.0632531
+1 1:0.269008 2:0.540177 3:0.269243 4:0.540177 5:-0.269008 6:-0.540177 7:-0.672911 8:-0.442786 9:-0.829197 10:0.765933 11:-0.310489 12:0.746992 13:-0.288874 14:0.764322 15:-0.144849 16:-0.863128 17:0.26496 18:0.534545 19:0.206158 20:-0.918548 21:0.109146 22:-0.0168542 23:-0.0632531
+1 1:0.38941 2:0.668785 3:0.389667 4:0.668785 5:-0.38941 6:-0.668785 7:-0.452121 8:1 9:-0.822391 10:0.991691 11:-0.400312 12:0.999356 13:0.00416786 14:-0.895105 15:-0.209663 16:-0.522689 17:0.290426 18:-0.450382 19:0.192281 20:-1 21:0.217432 22:-0.950087 23:-0.0632531
+1 1:0.206941 2:0.681587 3:0.207164 4:0.681587 5:-0.206941 6:-0.681587 7:-0.786729 8:0.997779 9:0.307293 10:1 11:-0.394126 12:-0.311236 13:-0.417953 14:1 15:-0.200268 16:-0.944453 17:0.280156 18:-1 19:0.140211 20:-0.507052 21:0.215701 22:-0.863883 23:-0.0632531
+1 1:-0.0884673 2:0.652413 3:-0.0882989 4:0.652413 5:0.0884673 6:-0.652413 7:-0.671555 8:1 9:-0.822668 10:0.767666 11:-0.401181 12:1 13:-0.351758 14:1 15:-0.180568 16:-1 17:0.288085 18:-0.918796 19:0.191001 20:0.974217 21:0.236137 22:-0.767877 23:-0.0632531
+1 1:-0.302318 2:0.69508 3:-0.302189 4:0.69508 5:0.302318 6:-0.695081 7:-0.279399 8:0.593539 9:-0.743154 10:0.397243 11:-0.340935 12:0.438294 13:-0.264157 14:0.0909128 15:-0.181745 16:-0.140605 17:0.280876 18:-0.474279 19:0.1536 20:-1 21:0.217124 22:-0.460489 23:-0.0632531
+1 1:-0.154169 2:0.760393 3:-0.154013 4:0.760393 5:0.154169 6:-0.760393 7:-0.551072 8:1 9:-0.818338 10:0.991443 11:-0.384922 12:0.968997 13:-0.366901 14:0.517135 15:-0.203058 16:-0.808561 17:0.281905 18:-1 19:0.1868 20:-1 21:0.216772 22:-0.506543 23:0.039733
+1 1:-0.169449 2:0.637254 3:-0.169725 4:0.614732 5:0.169449 6:-0.637254 7:-0.523053 8:-0.384934 9:-0.83839 10:0.0177925 11:-0.40189 12:0.988658 13:-0.369346 14:0.0151736 15:-0.208545 16:-0.683296 17:0.288366 18:-0.406408 19:0.191617 20:-0.19827 21:0.217386 22:-0.450126 23:0.567944
+1 1:0.306101 2:0.76861 3:0.306342 4:0.76861 5:-0.306101 6:-0.76861 7:-0.604892 8:0.826528 9:-0.598333 10:0.691547 11:-0.342657 12:0.999468 13:-0.0976256 14:0.994571 15:-0.103882 16:-0.455285 17:0.273478 18:-0.65247 19:0.170639 20:-0.714286 21:0.213681 22:-0.410574 23:-0.0632531
+1 1:-0.0162082 2:0.660539 3:-0.0160263 4:0.660539 5:0.0162083 6:-0.66054 7:-0.804063 8:1 9:-0.821463 10:0.95889 11:-0.377688 12:0.979258 13:-0.0665365 14:0.988576 15:-0.0461302 16:-0.919031 17:0.268748 18:-0.380981 19:0.189391 20:-0.767838 21:0.213538 22:-0.39165 23:-0.0632531
+0.2856827907018439 1:-0.684526 2:0.110102 3:-0.684467 4:0.110102 5:0.684526 6:-0.110102 7:0.421488 8:1 9:-0.78992 10:0.964946 11:-0.382698 12:1 13:-0.40381 14:-0.580247 15:-0.209582 16:-0.930906 17:0.288998 18:-0.196429 19:0.192411 20:-0.75 21:0.217479 22:0.255658 23:0.590964
+1 1:-0.773007 2:-0.730704 3:-0.772965 4:-0.730704 5:0.773007 6:0.730704 7:0.583744 8:1 9:-0.821356 10:1 11:-0.397049 12:1 13:-0.324341 14:1 15:-0.150586 16:-0.261146 17:0.288246 18:0.687861 19:0.202492 20:0.0676692 21:0.217951 22:1 23:-0.0632531
+1 1:-0.251386 2:0.627887 3:-0.251247 4:0.627887 5:0.251386 6:-0.627887 7:-0.372798 8:1 9:-0.812278 10:1 11:-0.388124 12:0.401094 13:-0.391384 14:0.258262 15:-0.191485 16:-0.979474 17:0.257881 18:-0.745725 19:0.145144 20:-0.272054 21:0.214466 22:0.0910053 23:-0.0632531
+1 1:0.0827524 2:0.676631 3:0.0829525 4:0.676631 5:-0.0827523 6:-0.676631 7:-0.985535 8:0.969728 9:-0.358383 10:0.973743 11:0.0186958 12:0.881494 13:-0.106832 14:0.892554 15:-0.0345385 16:-1 17:0.284043 18:-0.946093 19:0.0840182 20:-0.978778 21:0.163304 22:-0.301146 23:-0.0632531
+1 1:-0.481895 2:0.692766 3:-0.481799 4:0.692766 5:0.481895 6:-0.692766 7:0.0499072 8:0.984959 9:-0.822857 10:0.997152 11:-0.39756 12:0.995699 13:-0.405762 14:0.992475 15:-0.202504 16:-0.663942 17:0.289733 18:-0.864198 19:0.191971 20:0.466052 21:0.217757 22:-0.0428539 23:-0.0632531
+1 1:-0.231255 2:0.655637 3:-0.231112 4:0.655637 5:0.231254 6:-0.655637 7:-0.409714 8:0.924196 9:-0.782716 10:0.89549 11:-0.38072 12:0.900021 13:-0.338303 14:0.859268 15:-0.165078 16:-0.890131 17:0.280767 18:-0.389578 19:0.192359 20:-0.728437 21:0.217395 22:-0.241824 23:-0.0632531
+1 1:0.0834001 2:0.47814 3:0.0836004 4:0.47814 5:-0.0834 6:-0.47814 7:-0.986723 8:0.859823 9:-0.612383 10:0.747824 11:-0.363183 12:0.716861 13:0.417414 14:-0.0356569 15:-0.212337 16:-1 17:0.268306 18:-0.97399 19:-0.0447194 20:-0.986662 21:0.163091 22:0.130959 23:-0.0632531
+1 1:0.310102 2:0.537514 3:0.310344 4:0.537514 5:-0.310102 6:-0.537515 7:-0.597555 8:0.0974247 9:-0.757168 10:0.359379 11:-0.175093 12:0.656508 13:0.178032 14:0.536458 15:0.347688 16:-0.834029 17:0.248047 18:-0.028302 19:0.187214 20:-0.852782 21:0.203985 22:0.0348462 23:-0.0632531
+1 1:0.115184 2:0.77285 3:0.11539 4:0.77285 5:-0.115184 6:-0.77285 7:-0.954992 8:1 9:-0.457656 10:1 11:0.0449038 12:1 13:-0.115993 14:0.995237 15:0.0488367 16:-0.943355 17:0.273089 18:-0.686157 19:0.177235 20:0.0650121 21:0.21785 22:-0.162361 23:-0.0632531
+0.429568614147608 1:-0.143074 2:0.634037 3:-0.143555 4:0.60508 5:0.143074 6:-0.634037 7:-0.571418 8:0.940388 9:-0.72765 10:0.773155 11:-0.395897 12:0.997778 13:-0.00774144 14:0.991556 15:-0.0250029 16:-0.830451 17:0.150096 18:-0.682776 19:0.16134 20:-0.962036 21:-0.12765 22:-0.582292 23:0.0432485
+1 1:-0.304142 2:0.710722 3:-0.304013 4:0.710722 5:0.304142 6:-0.710722 7:-0.276054 8:0.507304 9:-0.823754 10:0.992404 11:-0.389524 12:0.998026 13:-0.394616 14:0.988827 15:-0.201448 16:-1 17:0.279404 18:0.026296 19:0.192523 20:-1 21:0.216954 22:-0.247635 23:-0.0632531
+1 1:-0.586005 2:0.759998 3:-0.585929 4:0.759998 5:0.586005 6:-0.759998 7:0.240823 8:0.922656 9:-0.817053 10:0.976227 11:-0.388324 12:0.853804 13:-0.405845 14:0.595331 15:-0.207949 16:-0.660706 17:0.283402 18:-0.696888 19:0.187447 20:0.943403 21:0.234708 22:0.00929982 23:-0.0632531
+1 1:0.152267 2:0.727024 3:0.152296 4:0.70133 5:-0.152267 6:-0.727024 7:-0.886991 8:-0.940299 9:-0.825879 10:-0.33424 11:-0.402362 12:0.923168 13:-0.287365 14:0.840684 15:-0.199739 16:-0.594021 17:0.278554 18:1 19:0.192719 20:1 21:0.217588 22:-0.482103 23:-0.0632531
+1 1:-0.074882 2:0.738079 3:-0.0764778 4:0.695654 5:0.0748819 6:-0.738079 7:-0.696468 8:0.987462 9:-0.802749 10:0.994541 11:-0.333623 12:0.951638 13:-0.146965 14:0.957186 15:-0.0109602 16:-0.777402 17:0.279085 18:-0.546869 19:0.179025 20:-0.429812 21:0.211034 22:-0.382605 23:-0.0632531
+1 1:-0.201124 2:0.738524 3:-0.200976 4:0.738524 5:0.201124 6:-0.738524 7:-0.464966 8:0.999836 9:-0.806376 10:0.998834 11:-0.398746 12:0.977462 13:-0.401739 14:0.977772 15:-0.204993 16:-0.895868 17:0.221069 18:-0.929028 19:0.16461 20:1 21:0.217616 22:-0.510797 23:-0.0632531
+1 1:0.832015 2:0.665626 3:0.832237 4:0.618242 5:-0.832015 6:-0.665626 7:0.359521 8:-0.847997 9:-0.828167 10:-0.867057 11:-0.403878 12:1 13:-0.265435 14:1 15:-0.154832 16:-0.142708 17:0.290228 18:0.8437 19:0.194211 20:-1 21:0.217489 22:-0.973201 23:-0.0632531
+1 1:-0.276997 2:0.283769 3:-0.276863 4:0.283769 5:0.276997 6:-0.283769 7:-0.325833 8:1 9:-0.801578 10:1 11:-0.368446 12:1 13:-0.269996 14:1 15:-0.0355276 16:-0.632275 17:0.246484 18:-0.874972 19:0.154454 20:-0.344346 21:0.214895 22:0.350405 23:-0.0632531
+1 1:-0.217976 2:0.674441 3:-0.217832 4:0.674441 5:0.217976 6:-0.674441 7:-0.434063 8:0.992651 9:-0.796831 10:0.995306 11:-0.39314 12:1 13:-0.206171 14:1 15:-0.130954 16:-1 17:0.271701 18:-0.927443 19:0.169175 20:-0.572928 21:0.204214 22:-0.23169 23:0.128867
+1 1:0.0540225 2:0.737428 3:0.0535875 4:0.715044 5:-0.0540226 6:-0.737428 7:-0.932851 8:0.91236 9:-0.805296 10:-0.350698 11:-0.431183 12:0.482463 13:-0.306 14:-0.0888146 15:-0.220719 16:-0.316805 17:0.276154 18:-0.701077 19:0.179071 20:0.656294 21:0.21973 22:-0.366619 23:0.286769
+0.9688165698611798 1:0.330968 2:0.63838 3:0.331214 4:0.63838 5:-0.330968 6:-0.63838 7:-0.559291 8:1 9:-0.624198 10:0.999681 11:0.0854053 12:0.999812 13:-0.113073 14:0.844538 15:-0.167792 16:-0.659771 17:0.285185 18:-0.661952 19:0.191112 20:0.661482 21:0.218541 22:-0.337786 23:-0.0632531
+1 1:-0.231053 2:0.716866 3:-0.230911 4:0.716866 5:0.231053 6:-0.716866 7:-0.410083 8:0.454954 9:-0.823416 10:-0.51565 11:-0.402684 12:0.880681 13:-0.409661 14:0.706652 15:-0.207796 16:-0.989361 17:0.271186 18:-1 19:0.173258 20:-0.953465 21:0.203734 22:-0.330396 23:0.424297
+1 1:-0.472107 2:0.772638 3:-0.472009 4:0.772638 5:0.472107 6:-0.772638 7:0.0319576 8:0.875774 9:-0.823402 10:0.041742 11:-0.402138 12:0.973815 13:-0.397331 14:-0.502318 15:-0.210202 16:0.525919 17:0.290796 18:-0.984924 19:0.190403 20:0.56232 21:0.217879 22:-0.449878 23:-0.0632531
+-1 1:0.153388 2:0.864943 3:0.152959 4:0.817865 5:-0.153388 6:-0.864943 7:-0.884934 8:0.909895 9:-0.809953 10:0.996062 11:-0.395676 12:1 13:-0.273465 14:0.977478 15:-0.17772 16:-0.929831 17:0.253402 18:-1 19:0.180805 20:-1 21:0.0433982 22:-0.469804 23:-0.344711
+-1 1:-0.229871 2:0.652703 3:-0.229729 4:0.652703 5:0.229871 6:-0.652703 7:-0.412251 8:-1 9:-0.825485 10:-0.531027 11:-0.40271 12:0.922207 13:-0.115007 14:0.942864 15:-0.0128998 16:-0.614655 17:-0.0594098 18:-1 19:-0.310416 20:-0.997564 21:0.0501324 22:-0.617139 23:-0.188432
+-0.9092126510934362 1:0.00498815 2:0.886191 3:0.00517386 4:0.886191 5:-0.00498823 6:-0.886191 7:-0.842932 8:1 9:-0.821017 10:1 11:-0.398403 12:1 13:-0.413173 14:-1 15:-0.209448 16:-0.9417 17:-0.0294413 18:-0.997541 19:0.0582388 20:-0.999751 21:0.128743 22:-0.840465 23:-0.0632531
+-1 1:-0.0337745 2:0.52836 3:-0.0335958 4:0.52836 5:0.0337745 6:-0.52836 7:-0.77185 8:1 9:-0.812554 10:-1 11:-0.402191 12:0.911357 13:-0.328257 14:1 15:-0.183513 16:-1 17:0.177294 18:-0.627119 19:0.157888 20:-0.649485 21:0.118253 22:-0.741959 23:-0.0632531
+-1 1:-0.00537357 2:0.881188 3:-0.00586151 4:0.855494 5:0.00537356 6:-0.881188 7:-0.823931 8:-0.37528 9:-0.824032 10:1 11:-0.395612 12:0.907762 13:-0.407531 14:0.00699448 15:-0.209361 16:-0.97121 17:0.256732 18:0.377578 19:0.193629 20:-0.915104 21:0.214264 22:-0.502858 23:-0.0632531
+-1 1:0.0780578 2:0.53345 3:0.0782571 4:0.53345 5:-0.0780578 6:-0.53345 7:-0.976926 8:0.784173 9:-0.815473 10:-0.387755 11:-0.417639 12:0.149492 13:-0.399445 14:-1 15:-0.349829 16:-0.927641 17:0.152291 18:-0.238494 19:0.178682 20:-0.303754 21:0.205418 22:-0.699619 23:-0.0632531
+-1 1:-0.085083 2:0.803705 3:-0.0864272 4:0.775201 5:0.0850831 6:-0.803705 7:-0.677761 8:1 9:-0.82374 10:-0.48134 11:-0.40503 12:0.356594 13:-0.374107 14:-0.882419 15:-0.301936 16:-0.547432 17:0.0712362 18:-0.0220246 19:0.190936 20:0.906275 21:0.220828 22:-0.587056 23:-0.408774
+-1 1:-0.0236976 2:0.752031 3:-0.0238529 4:0.73252 5:0.0236976 6:-0.752031 7:-0.790329 8:0.782326 9:-0.820108 10:-0.292881 11:-0.40571 12:0.976323 13:-0.238472 14:0.660114 15:-0.168139 16:-0.965555 17:0.267388 18:-0.442611 19:0.17919 20:0.160667 21:0.218524 22:-0.303291 23:-0.645235
+-1 1:0.0197718 2:0.767938 3:0.0199603 4:0.767938 5:-0.0197718 6:-0.767938 7:-0.870042 8:0.607312 9:-0.81809 10:0.567405 11:-0.400052 12:0.960324 13:-0.384508 14:0.865721 15:-0.195749 16:-1 17:0.238694 18:-0.907564 19:0.186359 20:0.918589 21:0.254532 22:0.099845 23:-0.452855
+-1 1:-0.403047 2:0.757659 3:-0.403584 4:0.710883 5:0.403047 6:-0.75766 7:-0.0946841 8:1 9:-0.823866 10:0.954239 11:-0.397411 12:0.769493 13:-0.407051 14:0.599646 15:-0.206965 16:-1 17:0.270754 18:0.924588 19:0.193075 20:-0.0342843 21:0.217489 22:-0.42378 23:-0.69769
+-1 1:-0.0867144 2:0.395144 3:-0.0865456 4:0.395144 5:0.0867143 6:-0.395144 7:-0.674769 8:1 9:-0.821072 10:1 11:-0.399359 12:0.999328 13:-0.204281 14:0.992732 15:-0.197819 16:-0.960199 17:0.240766 18:-0.993491 19:0.111337 20:0.784738 21:0.235846 22:0.306655 23:-0.360083
+-1 1:0.0523725 2:0.478585 3:0.0517859 4:0.451152 5:-0.0523725 6:-0.478585 7:-0.929825 8:1 9:-0.823849 10:-0.00025892 11:-0.402196 12:0.964959 13:-0.389871 14:-0.0909091 15:-0.209389 16:-0.91389 17:0.268671 18:-0.997382 19:0.165898 20:1 21:0.217611 22:-0.75917 23:-0.541129
+-1 1:0.258657 2:0.867918 3:0.258314 4:0.847871 5:-0.258657 6:-0.867918 7:-0.691893 8:0.885222 9:-0.802932 10:0.215716 11:-0.398374 12:0.999682 13:-0.352974 14:0.97779 15:-0.203124 16:-1 17:0.260109 18:-0.199233 19:0.190683 20:-0.981555 21:0.210049 22:-0.365063 23:-0.0632531
+-1 1:0.342084 2:0.590949 3:0.342332 4:0.590949 5:-0.342084 6:-0.590949 7:-0.538907 8:1 9:-0.823672 10:1 11:-0.400829 12:0.900663 13:-0.404841 14:1 15:-0.194006 16:-0.994144 17:0.250974 18:-0.925076 19:0.179531 20:-0.938791 21:0.214408 22:-0.312872 23:-0.0632531
+-1 1:-0.0344139 2:0.609006 3:-0.0342354 4:0.609006 5:0.0344139 6:-0.609006 7:-0.770677 8:0.342466 9:-0.822419 10:1 11:-0.343021 12:0.795565 13:-0.368433 14:1 15:-0.185619 16:-0.95828 17:0.167882 18:-1 19:0.169118 20:-0.796211 21:0.15032 22:-0.0213129 23:-0.0632531
+-1 1:-0.126653 2:0.798213 3:-0.128532 4:0.76474 5:0.126653 6:-0.798213 7:-0.60153 8:0.226016 9:-0.817603 10:-0.798561 11:-0.425702 12:-0.089934 13:-0.464511 14:-0.372917 15:-0.242824 16:-0.785819 17:0.0187224 18:-0.787583 19:-0.0254431 20:0.529881 21:0.249344 22:-0.881634 23:-0.0632531
+-0.02052404560933848 1:0.375048 2:0.942112 3:0.375302 4:0.942112 5:-0.375048 6:-0.942112 7:-0.478458 8:0.929369 9:-0.808008 10:0.893641 11:-0.384385 12:0.953351 13:-0.346808 14:0.777578 15:-0.190904 16:-0.987618 17:0.0772213 18:-0.839594 19:0.162965 20:-0.940857 21:0.203114 22:-0.325972 23:-0.0632531
+-1 1:0.648348 2:0.822776 3:0.645804 4:0.770279 5:-0.648348 6:-0.822776 7:0.0227154 8:-1 9:-0.824471 10:-1 11:-0.40279 12:-0.291834 13:-0.413771 14:0.322316 15:-0.209101 16:0.0244751 17:0.2906 18:-0.257695 19:0.192285 20:0.98378 21:0.219241 22:-0.608739 23:-0.0632531
+-1 1:-0.108019 2:0.835209 3:-0.107854 4:0.835209 5:0.108019 6:-0.83521 7:-0.635701 8:0.988792 9:-0.815144 10:0.997823 11:-0.333819 12:0.239429 13:-0.380854 14:0.392573 15:-0.164401 16:-0.608519 17:0.274312 18:-0.687312 19:0.190433 20:0.586141 21:0.219758 22:-0.110262 23:-0.0632531
+-1 1:-0.551248 2:0.87173 3:-0.552142 4:0.826963 5:0.551248 6:-0.87173 7:0.177086 8:0.993059 9:-0.797783 10:0.988618 11:-0.397822 12:0.995299 13:-0.355542 14:0.991062 15:-0.185584 16:-0.688027 17:0.268461 18:-0.981231 19:0.189402 20:-0.625931 21:0.217482 22:-0.269898 23:-1
+-1 1:0.0193923 2:0.490412 3:0.0195807 4:0.490412 5:-0.0193923 6:-0.490412 7:-0.869346 8:0.859425 9:-0.817704 10:1 11:-0.400899 12:0.629893 13:-0.390289 14:0.953846 15:-0.194636 16:-0.979548 17:0.0504075 18:-0.830585 19:0.0197857 20:0.723785 21:0.223263 22:0.127673 23:-0.0632531
+-1 1:0.0855421 2:0.83905 3:0.0824222 4:0.766079 5:-0.0855421 6:-0.83905 7:-0.990651 8:0.942696 9:-0.788742 10:1 11:-0.317722 12:0.744387 13:-0.352152 14:1 15:-0.173751 16:-0.90917 17:-0.00871449 18:-0.981575 19:-0.140839 20:-1 21:-0.106647 22:-0.556389 23:-0.0632531
+-1 1:0.0552481 2:0.900251 3:0.0548969 4:0.81198 5:-0.0552481 6:-0.900251 7:-0.935098 8:0.998928 9:-0.72895 10:0.922501 11:-0.385457 12:0.999896 13:0.0061359 14:0.99302 15:-0.110662 16:-0.944823 17:0.236488 18:-1 19:0.181487 20:-1 21:0.217526 22:-0.78978 23:-0.661682
+-1 1:0.0462187 2:0.79664 3:0.0464121 4:0.79664 5:-0.0462187 6:-0.79664 7:-0.91854 8:0.885498 9:-0.800006 10:0.725025 11:-0.368631 12:0.973132 13:-0.195143 14:0.833977 15:-0.104642 16:-0.999457 17:-0.112182 18:-0.989849 19:-0.0804385 20:-0.991786 21:0.144259 22:-0.229752 23:-0.0632531
+-1 1:0.342533 2:0.832209 3:0.340995 4:0.718192 5:-0.342532 6:-0.832209 7:-0.538084 8:1 9:-0.823105 10:-0.273597 11:-0.403707 12:0.3105 13:-0.401382 14:0.0589086 15:-0.208795 16:-0.993637 17:0.139746 18:-1 19:0.186969 20:1 21:0.217608 22:-0.696378 23:-0.0632531
+-1 1:-0.0888127 2:0.603587 3:-0.0886443 4:0.603587 5:0.0888126 6:-0.603588 7:-0.670922 8:0.572243 9:-0.774784 10:0.5 11:-0.397619 12:0.601307 13:-0.335356 14:1 15:-0.0733615 16:-0.599815 17:-0.151237 18:-0.813187 19:0.15469 20:-0.975051 21:-0.26461 22:-0.868273 23:-0.0632531
+-1 1:0.313109 2:0.854875 3:0.311792 4:0.810917 5:-0.313109 6:-0.854875 7:-0.592041 8:-0.815789 9:-0.825142 10:-0.454545 11:-0.402685 12:1 13:-0.402861 14:-0.118049 15:-0.209987 16:-0.959718 17:0.275681 18:-0.23356 19:0.190884 20:0.911877 21:0.218567 22:-0.55481 23:-0.0632531
+-1 1:0.033984 2:0.730673 3:0.0341752 4:0.730673 5:-0.033984 6:-0.730673 7:-0.896104 8:1 9:-0.395802 10:1 11:-0.0811909 12:0.995344 13:0.428458 14:0.989116 15:0.0627502 16:-0.991659 17:-0.0504616 18:-0.99805 19:-0.0977683 20:-1 21:-0.514037 22:-0.0957368 23:-0.0632531
+-1 1:-0.361138 2:0.917507 3:-0.361622 4:0.862642 5:0.361138 6:-0.917508 7:-0.171535 8:0.981529 9:-0.819808 10:0.529376 11:-0.401356 12:0.93581 13:-0.408929 14:0.817766 15:-0.209052 16:-0.994106 17:0.253021 18:-0.941933 19:0.180326 20:0.982615 21:0.218816 22:-0.630748 23:-0.797713
+-1 1:0.135156 2:0.784085 3:0.135366 4:0.784085 5:-0.135156 6:-0.784085 7:-0.918368 8:0.336424 9:-0.821525 10:0.199394 11:-0.401443 12:0.999935 13:-0.280398 14:-0.757128 15:-0.209719 16:-0.982419 17:0.276259 18:-0.999396 19:0.163582 20:0.0787265 21:0.217554 22:-0.539808 23:-0.0632531
+-1 1:-0.10388 2:0.851212 3:-0.10594 4:0.783646 5:0.10388 6:-0.851212 7:-0.643291 8:0.998237 9:-0.801309 10:-0.895803 11:-0.420002 12:0.552857 13:-0.412898 14:0.315766 15:-0.209339 16:-0.987181 17:0.222402 18:-0.957838 19:0.148202 20:-0.686478 21:0.2164 22:-0.861619 23:-0.0632531
+-1 1:-0.00348877 2:0.800168 3:-0.00363322 4:0.746904 5:0.00348876 6:-0.800168 7:-0.827387 8:0.13687 9:-0.797967 10:-0.905165 11:-0.43382 12:0.185193 13:-0.211988 14:-0.0684615 15:-0.231554 16:-0.346309 17:-0.22145 18:-0.985785 19:-0.146139 20:-0.00896782 21:0.217199 22:-0.677666 23:-0.0632531
+-1 1:0.0712267 2:0.681844 3:0.0714247 4:0.681844 5:-0.0712266 6:-0.681844 7:-0.964399 8:0.999667 9:-0.314068 10:0.999166 11:-0.217965 12:0.999816 13:-0.353627 14:0.999017 15:-0.18525 16:-0.993618 17:0.26337 18:-1 19:0.192416 20:0.872518 21:0.220366 22:-0.33261 23:-0.0632531
+-1 1:0.147248 2:0.869097 3:0.146243 4:0.797557 5:-0.147248 6:-0.869097 7:-0.896194 8:-0.972856 9:-0.829263 10:0.90389 11:-0.400813 12:0.562395 13:-0.410629 14:0.426222 15:-0.191814 16:-1 17:0.275088 18:0.383562 19:0.19275 20:0.855339 21:0.22144 22:-0.911164 23:-0.0632531
+-1 1:0.381945 2:0.296777 3:0.382201 4:0.296777 5:-0.381945 6:-0.296776 7:-0.46581 8:1 9:-0.823856 10:0.97304 11:-0.392751 12:0.915577 13:-0.388546 14:0.964644 15:-0.154441 16:-0.470963 17:0.281571 18:-0.956612 19:0.171068 20:-0.800439 21:0.210296 22:0.294887 23:-0.0632531
+-1 1:-0.0178604 2:0.769639 3:-0.0180861 4:0.725144 5:0.0178604 6:-0.769639 7:-0.801033 8:-0.163941 9:-0.82426 10:0.97235 11:-0.382032 12:0.981337 13:-0.309288 14:-1 15:-0.209367 16:-0.808771 17:0.267596 18:-0.427496 19:0.192014 20:0.384354 21:0.217746 22:-0.366491 23:-0.0632531
+-1 1:0.167215 2:0.835209 3:0.167431 4:0.835209 5:-0.167215 6:-0.83521 7:-0.859579 8:0.149024 9:-0.823013 10:0.0966996 11:-0.401668 12:0.944595 13:-0.381536 14:0.955372 15:-0.19679 16:-0.90311 17:0.231706 18:0.162234 19:0.192583 20:-0.916235 21:0.193319 22:-0.12 23:-0.202858
+-1 1:-0.275409 2:0.618012 3:-0.275275 4:0.618012 5:0.275409 6:-0.618012 7:-0.328745 8:1 9:-0.820519 10:0.991694 11:-0.398747 12:0.858996 13:-0.401208 14:0.613702 15:-0.206214 16:-0.813039 17:0.269345 18:-0.71684 19:0.167234 20:-0.551912 21:0.214211 22:0.0868445 23:-0.691359
+-1 1:-0.209506 2:0.847363 3:-0.209572 4:0.824271 5:0.209506 6:-0.847363 7:-0.449595 8:-0.224368 9:-0.823901 10:-1 11:-0.407708 12:1 13:-0.376183 14:0.823611 15:-0.17924 16:0.0400426 17:0.296546 18:0.975716 19:0.25026 20:-0.103527 21:0.206033 22:-0.393136 23:-0.0632531
+-1 1:-0.012066 2:0.577073 3:-0.0118834 4:0.577073 5:0.012066 6:-0.577073 7:-0.811658 8:1 9:-0.818731 10:0.710532 11:-0.397432 12:0.950979 13:-0.0907803 14:0.956142 15:0.100392 16:-0.986929 17:-0.352961 18:-1 19:-0.246803 20:-0.403321 21:0.212539 22:-0.21481 23:-0.0632531
+-1 1:0.326064 2:0.254149 3:0.326309 4:0.254149 5:-0.326064 6:-0.254149 7:-0.568283 8:1 9:-0.80161 10:1 11:-0.390607 12:1 13:-0.291889 14:1 15:-0.180519 16:-0.99189 17:0.115624 18:-0.977232 19:0.104935 20:-0.199186 21:0.209227 22:0.385579 23:-0.0632531
+-1 1:-0.193829 2:0.807777 3:-0.19368 4:0.807777 5:0.193829 6:-0.807777 7:-0.478345 8:0.768621 9:-0.819398 10:-1 11:-0.403507 12:0.81809 13:-0.408361 14:0.795341 15:-0.194895 16:-0.996252 17:0.0626062 18:-0.984568 19:-0.133564 20:0.951867 21:0.255737 22:-0.207421 23:-0.378022
+-1 1:-0.0240427 2:0.0162941 3:-0.0238623 4:0.0162941 5:0.0240427 6:-0.0162942 7:-0.789696 8:1 9:-0.702393 10:1 11:-0.392816 12:1 13:-0.411915 14:1 15:-0.206728 16:-0.963723 17:-0.399376 18:-0.967799 19:-0.358363 20:0.327333 21:0.38334 22:0.50718 23:-0.0632531
+-1 1:0.696306 2:0.58341 3:0.692496 4:0.535085 5:-0.696306 6:-0.583411 7:0.11066 8:0.0571429 9:-0.823902 10:0.723369 11:-0.399226 12:0.926226 13:-0.281108 14:0.885418 15:-0.189214 16:0.0577496 17:0.291174 18:-0.609756 19:0.191694 20:0.122807 21:0.217535 22:-0.636985 23:-0.0632531
+-0.9480320012272011 1:-0.000557745 2:0.857681 3:-0.000372991 4:0.857681 5:0.000557668 6:-0.857681 7:-0.832762 8:0.735403 9:-0.794027 10:-0.53456 11:-0.412047 12:0.990961 13:-0.113838 14:0.790546 15:-0.130677 16:-0.329547 17:0.288198 18:-0.939431 19:0.189682 20:-1 21:0.2155 22:-0.303118 23:-0.681514
+-0.3473251243236888 1:-0.98992 2:-1 3:-0.989918 4:-1 5:0.98992 6:1 7:0.981515 8:-1 9:-0.823908 10:-1 11:-0.402191 12:1 13:-0.411874 14:-1 15:-0.209367 16:-1 17:0.290276 18:-1 19:0.192488 20:-1 21:0.217526 22:-0.536444 23:-0.0632531
+-0.9064384811467762 1:0.351976 2:0.760153 3:0.351916 4:0.741632 5:-0.351976 6:-0.760153 7:-0.520767 8:1 9:-0.818395 10:0.750875 11:-0.400518 12:0.995159 13:-0.390436 14:0.971864 15:-0.205374 16:-0.920107 17:0.236348 18:-0.967476 19:0.174579 20:-0.999799 21:-0.271949 22:-0.34571 23:-0.0632531
+-1 1:0.218363 2:0.794326 3:0.218167 4:0.723709 5:-0.218363 6:-0.794326 7:-0.765784 8:0.549546 9:-0.802558 10:-0.200297 11:-0.40299 12:0.991949 13:-0.29318 14:0.960852 15:-0.184961 16:-0.991562 17:0.172543 18:-0.953716 19:0.152186 20:1 21:0.25274 22:-0.469927 23:-0.316876
+-1 1:-0.225408 2:0.612319 3:-0.225264 4:0.612319 5:0.225408 6:-0.612319 7:-0.420436 8:1 9:-0.816309 10:-1 11:-0.403732 12:0.90824 13:-0.2106 14:-0.0286512 15:-0.209512 16:-0.845293 17:0.245158 18:-0.603243 19:0.145275 20:0.497328 21:0.22435 22:-0.0144846 23:-0.0632531
+-1 1:-0.0398959 2:0.733317 3:-0.041024 4:0.714702 5:0.0398958 6:-0.733317 7:-0.760624 8:0.443038 9:-0.819607 10:-0.479365 11:-0.413021 12:0.99693 13:-0.292486 14:0.782504 15:-0.147659 16:-0.91201 17:0.159206 18:-0.91008 19:0.0718231 20:-0.913023 21:0.00437004 22:-0.382951 23:-0.200227
+-1 1:-1 2:-0.867574 3:-1 4:-0.867574 5:1 6:0.867574 7:1 8:-1 9:-0.823908 10:-1 11:-0.402191 12:1 13:-0.41298 14:-1 15:-0.209367 16:0.98134 17:0.298093 18:1 19:0.198779 20:1 21:0.221968 22:-0.114762 23:-0.0632531
+-1 1:-0.0223285 2:0.785115 3:-0.027151 4:0.705799 5:0.0223284 6:-0.785115 7:-0.792839 8:0.93758 9:-0.783243 10:0.166381 11:-0.399063 12:0.839782 13:-0.36917 14:0.756488 15:-0.164988 16:-0.898406 17:0.197269 18:-1 19:0.183237 20:-0.589237 21:0.191472 22:-0.471667 23:0.0141191
+-1 1:0.150399 2:0.736701 3:0.149616 4:0.71301 5:-0.150399 6:-0.736701 7:-0.890415 8:0.968425 9:-0.665504 10:0.968647 11:-0.170424 12:0.942418 13:-0.363409 14:0.915455 15:-0.117065 16:-0.969606 17:-0.334322 18:-0.908973 19:0.0319943 20:-0.548387 21:0.206564 22:-0.674281 23:-0.0632531
+-1 1:0.0954275 2:0.755655 3:0.09563 4:0.755655 5:-0.0954275 6:-0.755655 7:-0.991221 8:0.978941 9:-0.816184 10:0.293615 11:-0.397506 12:0.962041 13:-0.384628 14:0.994525 15:-0.207541 16:-0.86759 17:0.22537 18:-0.733511 19:0.158628 20:-0.765866 21:0.211447 22:-0.101168 23:-0.195762
+-1 1:-0.128208 2:0.894647 3:-0.128913 4:0.829494 5:0.128208 6:-0.894647 7:-0.598679 8:0.769483 9:-0.818848 10:-0.972411 11:-0.402871 12:0.967387 13:-0.357719 14:0.847833 15:-0.196591 16:-0.991484 17:0.0285746 18:-0.997772 19:0.0383957 20:0.904275 21:0.218387 22:-0.378058 23:-0.236408
+-1 1:-0.0363361 2:0.879589 3:-0.037044 4:0.830942 5:0.0363362 6:-0.87959 7:-0.767152 8:0.289119 9:-0.820824 10:-0.640816 11:-0.402698 12:0.947489 13:-0.39705 14:0.894577 15:-0.19262 16:-0.997303 17:-0.0284193 18:-0.755049 19:0.136666 20:-0.326687 21:0.211052 22:-0.259009 23:-0.0632531
+-1 1:0.094692 2:0.582589 3:0.0948944 4:0.582589 5:-0.0946921 6:-0.582589 7:-0.99257 8:1 9:-0.809069 10:1 11:-0.371011 12:0.933014 13:-0.263154 14:0.777778 15:-0.123765 16:-0.967655 17:0.14207 18:-0.78 19:0.120036 20:-0.692308 21:0.179961 22:0.178131 23:-0.0632531
+-1 1:0.129588 2:0.746904 3:0.129592 4:0.693641 5:-0.129588 6:-0.746904 7:-0.928578 8:0.944133 9:-0.809609 10:0.988536 11:-0.178479 12:0.988988 13:-0.0111928 14:0.927463 15:-0.105927 16:-0.936029 17:0.113488 18:-0.875022 19:0.0748007 20:-0.94452 21:0.212426 22:-0.689334 23:-0.0919843
+-1 1:0.0635339 2:0.8531 3:0.0569969 4:0.807777 5:-0.0635339 6:-0.8531 7:-0.950292 8:0.790927 9:-0.642983 10:0.81263 11:-0.308299 12:-0.267973 13:-0.423345 14:0.264667 15:-0.20219 16:-0.604766 17:0.236302 18:-0.966775 19:0.0477112 20:-1 21:0.183016 22:-0.508933 23:-0.0632531
+-1 1:-0.382994 2:0.814726 3:-0.38288 4:0.814726 5:0.382993 6:-0.814726 7:-0.131457 8:0.650417 9:-0.822263 10:0.881298 11:-0.401473 12:0.698895 13:-0.411846 14:0.717273 15:-0.206561 16:-0.920976 17:0.27478 18:-0.913992 19:0.18121 20:0.519407 21:0.21842 22:-0.424612 23:-0.0632531
+-1 1:-0.170153 2:0.620137 3:-0.17 4:0.620137 5:0.170153 6:-0.620138 7:-0.521761 8:0.870006 9:-0.818018 10:0.484056 11:-0.399677 12:0.720288 13:-0.409255 14:-0.344813 15:-0.210546 16:0.212459 17:0.290953 18:-0.061839 19:0.192153 20:-0.515394 21:0.216216 22:0.029868 23:-0.79278
+-1 1:-0.286641 2:0.708811 3:-0.286509 4:0.708811 5:0.286641 6:-0.708811 7:-0.308147 8:0.240422 9:-0.821538 10:0.73181 11:-0.391961 12:-0.929155 13:-0.416731 14:0.279665 15:-0.207323 16:-0.683158 17:0.268174 18:-0.995641 19:0.0734558 20:-0.818569 21:0.210497 22:-0.572549 23:0.175766
+-1 1:0.503506 2:0.794525 3:0.503269 4:0.732686 5:-0.503506 6:-0.794526 7:-0.242893 8:0.985274 9:-0.822114 10:0.405848 11:-0.402081 12:-0.960827 13:-0.413275 14:0.953794 15:-0.206365 16:-0.90302 17:0.287831 18:-1 19:0.19185 20:0.107061 21:0.217602 22:-0.770369 23:-0.0632531
+-1 1:-0.661394 2:0.780604 3:-0.661331 4:0.780604 5:0.661394 6:-0.780604 7:0.379069 8:0.723245 9:-0.823805 10:0.904463 11:-0.401785 12:1 13:-0.403157 14:0.963663 15:-0.20909 16:-0.979149 17:0.274565 18:-0.852209 19:0.176246 20:0.940363 21:0.218543 22:-0.0678971 23:0.570645
+-1 1:-0.446707 2:0.537198 3:-0.446605 4:0.537198 5:0.446708 6:-0.537198 7:-0.0146194 8:0.916334 9:-0.823086 10:0.358706 11:-0.402064 12:0.986049 13:-0.406812 14:0.671463 15:-0.208537 16:-0.998896 17:0.261824 18:-0.954256 19:0.152662 20:0.604809 21:0.218125 22:0.128106 23:-0.746096
+-1 1:0.0519501 2:0.758012 3:0.0521446 4:0.758012 5:-0.0519501 6:-0.758012 7:-0.92905 8:1 9:-0.822623 10:0.997103 11:-0.361349 12:0.0779024 13:-0.412796 14:0.614035 15:-0.207279 16:-0.952126 17:0.282809 18:-0.0932031 19:0.192258 20:-0.866836 21:0.214434 22:-0.230974 23:-0.0632531
+-1 1:0.517986 2:0.804832 3:0.518267 4:0.804832 5:-0.517986 6:-0.804832 7:-0.21634 8:0.951089 9:-0.814541 10:0.942482 11:-0.396756 12:0.929303 13:-0.385458 14:0.953605 15:-0.20378 16:-0.9796 17:0.20036 18:-0.843682 19:0.169585 20:-0.920765 21:0.209399 22:-0.404306 23:-0.0632531
+-0.3573081161357508 1:0.679311 2:0.483381 3:0.678243 4:0.432352 5:-0.679311 6:-0.483381 7:0.0794949 8:0.233645 9:-0.823889 10:0.931034 11:-0.402167 12:0.997575 13:-0.306732 14:0.627839 15:-0.208695 16:-0.99017 17:0.280732 18:-0.841754 19:0.190388 20:-0.923397 21:0.21337 22:-0.636754 23:-0.0632531
+-1 1:0.474366 2:0.727024 3:0.474639 4:0.727024 5:-0.474366 6:-0.727024 7:-0.296329 8:0.974026 9:-0.805736 10:0.755506 11:-0.387084 12:0.960438 13:-0.337978 14:-0.791045 15:-0.209933 16:-0.992962 17:-0.00886453 18:-0.177905 19:0.183085 20:-0.965841 21:0.214085 22:-0.765011 23:-0.0632531
+-1 1:0.61692 2:0.863956 3:0.617219 4:0.863956 5:-0.61692 6:-0.863956 7:-0.0349166 8:0.710145 9:-0.82381 10:1 11:-0.401015 12:0.952613 13:-0.406584 14:0.955437 15:-0.206636 16:-0.982253 17:0.150251 18:0.404378 19:0.196999 20:0.445954 21:0.217598 22:-0.437351 23:-0.0632531
+-1 1:0.0368548 2:0.018128 3:0.0370464 4:0.018128 5:-0.0368549 6:-0.0181281 7:-0.901369 8:1 9:-0.820096 10:1 11:-0.393283 12:1 13:-0.366482 14:1 15:-0.164141 16:-0.65652 17:0.266107 18:-0.918696 19:0.156205 20:-0.827835 21:0.207235 22:0.508216 23:-0.0632531
+-0.4116942610135136 1:0.279881 2:0.805048 3:0.279738 4:0.785946 5:-0.279881 6:-0.805048 7:-0.652974 8:0.825518 9:-0.767275 10:0.65194 11:-0.382155 12:0.384136 13:-0.401216 14:-0.494098 15:-0.212531 16:-0.887641 17:0.217726 18:-0.814091 19:0.152935 20:-0.15851 21:0.21384 22:-0.36499 23:-0.0632531
+-1 1:0.0273445 2:0.835209 3:0.0275344 4:0.835209 5:-0.0273444 6:-0.83521 7:-0.883929 8:0.938154 9:-0.81812 10:0.775796 11:-0.393122 12:0.860386 13:-0.365621 14:0.829981 15:-0.154479 16:-0.998607 17:0.170531 18:-1 19:0.151494 20:-0.997003 21:0.135819 22:-0.126156 23:-0.0632531
+-1 1:-0.277523 2:0.634966 3:-0.277389 4:0.634966 5:0.277522 6:-0.634966 7:-0.324868 8:0.888411 9:-0.799556 10:0.724761 11:-0.401218 12:0.9853 13:-0.400949 14:0.786581 15:-0.205439 16:-0.971709 17:0.275214 18:-0.94317 19:0.171021 20:-0.495763 21:0.216087 22:0.0870176 23:-0.131772
+-1 1:0.31084 2:0.912021 3:0.310528 4:0.843611 5:-0.31084 6:-0.912021 7:-0.596201 8:0.882796 9:-0.823558 10:0.97698 11:-0.398053 12:0.911214 13:-0.394062 14:0.91792 15:-0.1975 16:-0.963424 17:0.287422 18:-1 19:0.171708 20:0.701014 21:0.217614 22:-0.494263 23:-0.0632531
+-1 1:-0.919117 2:-0.929624 3:-0.919102 4:-0.929624 5:0.919117 6:0.929624 7:0.851678 8:-1 9:-0.823908 10:-1 11:-0.402191 12:1 13:-0.396738 14:-1 15:-0.209367 16:0.97561 17:0.355642 18:1 19:0.233301 20:1 21:0.233607 22:-0.0728064 23:-0.0632531
+-1 1:0.213109 2:0.777414 3:0.209993 4:0.706569 5:-0.213109 6:-0.777415 7:-0.775419 8:0.756532 9:-0.820469 10:-0.0334003 11:-0.402435 12:0.967249 13:-0.143469 14:0.906262 15:-0.19797 16:-0.998807 17:0.0840736 18:-0.981802 19:0.15625 20:-0.990668 21:-0.0407439 22:-0.837851 23:-0.319777
+-1 1:-0.146724 2:0.813633 3:-0.146933 4:0.752141 5:0.146724 6:-0.813633 7:-0.564725 8:-0.411255 9:-0.823954 10:-0.431116 11:-0.40222 12:0.995842 13:-0.234316 14:0.996485 15:-0.150104 16:-0.993632 17:0.288718 18:-0.854596 19:0.191739 20:0.719703 21:0.217784 22:-0.396565 23:-0.0632531
+-1 1:-0.295783 2:0.522403 3:-0.295653 4:0.522403 5:0.295783 6:-0.522403 7:-0.291383 8:0.76378 9:-0.822076 10:0.567251 11:-0.397914 12:0.923311 13:-0.388461 14:0.380952 15:-0.209055 16:-0.920177 17:0.164339 18:-0.936306 19:0.0230552 20:-0.996535 21:0.0647395 22:-0.0234873 23:-0.0632531
+-1 1:0.0584036 2:0.547167 3:0.0585992 4:0.547167 5:-0.0584036 6:-0.547167 7:-0.940885 8:0.948938 9:-0.758493 10:1 11:-0.40049 12:0.822275 13:-0.395917 14:0.327824 15:-0.207598 16:-0.956338 17:0.124544 18:-0.307847 19:0.189034 20:-0.125 21:0.217475 22:0.126217 23:-0.0632531
+-1 1:-0.556487 2:0.781716 3:-0.556405 4:0.781716 5:0.556487 6:-0.781716 7:0.186693 8:0.429736 9:-0.823521 10:0.883682 11:-0.400342 12:0.544856 13:-0.410607 14:0.0704514 15:-0.209201 16:-0.982438 17:0.273594 18:-0.471961 19:0.186299 20:-0.204325 21:0.215713 22:-0.58228 23:-0.914899
+-1 1:0.713066 2:0.861696 3:0.713187 4:0.836534 5:-0.713065 6:-0.861696 7:0.141394 8:0.493438 9:-0.823085 10:0.82376 11:-0.401542 12:0.31939 13:-0.412609 14:0.397795 15:-0.209211 16:-0.973828 17:0.288667 18:-1 19:0.191727 20:0.590842 21:0.218244 22:-0.712347 23:-0.0632531
+-1 1:-0.0937216 2:0.641526 3:-0.0935541 4:0.641526 5:0.0937216 6:-0.641526 7:-0.66192 8:0.598899 9:-0.800762 10:-0.671085 11:-0.40505 12:0.716961 13:-0.310011 14:-0.0319012 15:-0.21029 16:-1 17:0.238332 18:-1 19:0.136438 20:-0.989249 21:0.167325 22:-0.666973 23:-0.0632531
+-1 1:0.365165 2:0.359097 3:0.365417 4:0.359097 5:-0.365165 6:-0.359098 7:-0.496581 8:0.977762 9:-0.807266 10:1 11:-0.381422 12:0.742119 13:-0.410219 14:0.576283 15:-0.207372 16:-0.996755 17:0.255217 18:-0.971229 19:0.188201 20:0.277338 21:0.218902 22:0.266064 23:-0.0632531
+-1 1:0.259165 2:0.749864 3:0.254559 4:0.722837 5:-0.259165 6:-0.749864 7:-0.690962 8:-0.307815 9:-0.833329 10:1 11:-0.401564 12:0.994051 13:-0.244029 14:1 15:-0.201723 16:-0.99228 17:-0.426441 18:-1 19:0.188669 20:-0.692308 21:0.212892 22:-0.674236 23:-0.0632531
+-1 1:0.641037 2:0.376141 3:0.637402 4:0.319133 5:-0.641037 6:-0.376141 7:0.00930961 8:1 9:-0.823793 10:0.98895 11:-0.401883 12:0.970389 13:-0.354825 14:0.970241 15:-0.202621 16:-0.840984 17:0.2496 18:-0.448276 19:0.191767 20:-0.838235 21:0.216638 22:-0.636696 23:-0.0632531
+-1 1:-0.39915 2:0.329767 3:-0.399039 4:0.329767 5:0.39915 6:-0.329766 7:-0.10183 8:0.97283 9:-0.822909 10:1 11:-0.400963 12:0.99663 13:-0.381523 14:0.991763 15:-0.20029 16:0.934361 17:0.347993 18:-0.721017 19:0.174614 20:-0.99105 21:0.216918 22:0.299444 23:-0.0632531
+-1 1:0.00633151 2:0.879589 3:0.00651754 4:0.879589 5:-0.00633145 6:-0.87959 7:-0.845396 8:0.809524 9:-0.810094 10:0.453061 11:-0.396927 12:0.982011 13:-0.349961 14:0.560976 15:-0.205276 16:-0.991308 17:-0.180848 18:-0.626768 19:0.0924479 20:-0.739316 21:0.180458 22:-0.188782 23:-0.063689
+-1 1:0.281704 2:0.861696 3:0.275865 4:0.735884 5:-0.281705 6:-0.861696 7:-0.64963 8:-0.966548 9:-0.883004 10:-0.277031 11:-0.409766 12:-0.635979 13:-0.473714 14:0.591998 15:-0.142561 16:0.233749 17:0.293906 18:-0.0959157 19:0.191795 20:-0.954895 21:-1 22:-1 23:-0.0632531
+-0.1511414311021485 1:-0.240321 2:0.858627 3:-0.240181 4:0.858627 5:0.240321 6:-0.858628 7:-0.393087 8:0.784334 9:-0.820757 10:0.325235 11:-0.401721 12:0.93089 13:-0.376623 14:0.965426 15:-0.180344 16:-0.948306 17:0.246906 18:-0.978884 19:0.132794 20:-0.99653 21:-0.206467 22:-0.0830693 23:-0.370444
+-0.5004311581539996 1:-0.0605389 2:0.817429 3:-0.0622114 4:0.716081 5:0.0605389 6:-0.817429 7:-0.72277 8:-0.0442478 9:-0.823917 10:-0.0940555 11:-0.404756 12:-0.744706 13:-0.615789 14:-0.915004 15:-0.288785 16:-0.813664 17:0.135426 18:-0.812378 19:0.150974 20:0.994245 21:0.223209 22:-0.937407 23:-0.0632531
+-1 1:-0.533776 2:0.653092 3:-0.53369 4:0.653092 5:0.533776 6:-0.653092 7:0.145045 8:0.974003 9:-0.806577 10:-0.330966 11:-0.402547 12:0.986313 13:-0.290658 14:0.816032 15:-0.20553 16:-0.951008 17:0.245202 18:-0.523339 19:0.191183 20:0.367724 21:0.220256 22:-0.263425 23:-0.837749
+-0.6859808826851237 1:-0.232813 2:0.887697 3:-0.233846 4:0.83905 5:0.232813 6:-0.887697 7:-0.406856 8:0.796904 9:-0.823236 10:-0.87761 11:-0.404877 12:0.99683 13:-0.357176 14:-0.483146 15:-0.241495 16:-0.959888 17:0.211122 18:-0.985625 19:0.0853136 20:-0.99263 21:0.106287 22:-0.519176 23:-0.0632531
+-1 1:0.306641 2:0.664862 3:0.306882 4:0.664862 5:-0.306641 6:-0.664862 7:-0.603902 8:1 9:-0.823309 10:0.0503376 11:-0.402124 12:0.856849 13:-0.39858 14:0.898892 15:-0.196458 16:-1 17:0.278287 18:-1 19:0.14965 20:-0.468098 21:0.213663 22:-0.569228 23:-0.0632531
+-1 1:-0.160808 2:0.775201 3:-0.161609 4:0.752397 5:0.160808 6:-0.775201 7:-0.538898 8:0.00342238 9:-0.823901 10:0.0702281 11:-0.397649 12:0.887742 13:-0.332376 14:0.118779 15:-0.194083 16:-0.763914 17:0.167297 18:-0.939527 19:0.0694662 20:-0.991067 21:0.0535393 22:-0.415857 23:-0.0632531
+-1 1:0.106071 2:0.735884 3:0.101146 4:0.672979 5:-0.106071 6:-0.735884 7:-0.971703 8:-0.0763105 9:-0.824035 10:0.421857 11:-0.380452 12:1 13:-0.207691 14:0.959368 15:-0.0753366 16:-0.976237 17:0.0892575 18:-0.94274 19:0.0512007 20:-0.977612 21:0.127895 22:-0.2819 23:-0.0632531
+-1 1:0.21386 2:0.693947 3:0.214084 4:0.693947 5:-0.21386 6:-0.693947 7:-0.774042 8:0.515973 9:-0.821507 10:0.158231 11:-0.401364 12:0.918312 13:-0.376853 14:0.562752 15:-0.199306 16:-0.955794 17:0.0404244 18:-0.941021 19:0.0918136 20:-0.659956 21:0.21699 22:-0.868363 23:-0.0632531
+-1 1:0.0651686 2:0.812671 3:0.0653654 4:0.812671 5:-0.0651686 6:-0.812671 7:-0.95329 8:0.985904 9:-0.698878 10:0.935025 11:-0.343781 12:0.873191 13:-0.227068 14:0.0992506 15:-0.202353 16:-0.964597 17:0.21277 18:-0.12834 19:0.191035 20:-0.979699 21:0.187554 22:-0.0858848 23:-0.0632531
+-1 1:0.269199 2:0.841404 3:0.269106 4:0.753133 5:-0.269199 6:-0.841404 7:-0.672562 8:0.974028 9:-0.820404 10:0.784946 11:-0.40114 12:0.992415 13:-0.384281 14:-0.484752 15:-0.209542 16:-0.991495 17:0.27579 18:-0.999537 19:0.17478 20:0.581229 21:0.217803 22:-0.789245 23:-0.0632531
+-1 1:-0.323326 2:0.897988 3:-0.324246 4:0.827824 5:0.323326 6:-0.897988 7:-0.240875 8:-0.152944 9:-0.823951 10:0.912276 11:-0.401247 12:0.954302 13:-0.409486 14:0.350817 15:-0.209338 16:-0.9624 17:0.288568 18:-0.822504 19:0.192045 20:0.0529275 21:0.217534 22:-0.533007 23:-0.0632531
+-1 1:0.241414 2:0.855494 3:0.240733 4:0.752718 5:-0.241414 6:-0.855494 7:-0.723514 8:-1 9:-0.823908 10:-0.9207 11:-0.402694 12:0.850181 13:-0.393426 14:0.981003 15:-0.202785 16:-0.897406 17:0.267298 18:0.966648 19:0.210524 20:-1 21:0.217526 22:-0.601006 23:-0.0632531
+-1 1:0.377947 2:0.371222 3:0.378202 4:0.371222 5:-0.377947 6:-0.371222 7:-0.473141 8:1 9:-0.809623 10:1 11:-0.395751 12:0.983827 13:-0.288896 14:1 15:-0.180748 16:-0.984733 17:0.219705 18:-0.333333 19:0.190021 20:-0.501831 21:0.193799 22:0.294949 23:-0.0632531
+-0.8410738584030928 1:0.468711 2:0.375066 3:0.468982 4:0.375066 5:-0.468711 6:-0.375066 7:-0.3067 8:1 9:-0.821851 10:-0.050505 11:-0.402225 12:-0.109299 13:-0.41424 14:0.459854 15:-0.208987 16:-0.719298 17:0.289546 18:-0.868647 19:0.172172 20:0.522843 21:0.218584 22:0.0494153 23:-0.0632531
+-1 1:-0.438328 2:0.652703 3:-0.438225 4:0.652703 5:0.438328 6:-0.652703 7:-0.029985 8:1 9:-0.823513 10:-0.372634 11:-0.402443 12:0.993308 13:-0.360648 14:0.482745 15:-0.209084 16:-0.60106 17:0.289627 18:-0.140836 19:0.19217 20:-0.461578 21:0.216949 22:-0.309265 23:-0.920594
+-1 1:-0.297602 2:0.912021 3:-0.297971 4:0.826173 5:0.297602 6:-0.912021 7:-0.288048 8:0.987341 9:-0.583361 10:0.963 11:-0.361736 12:0.770474 13:-0.388493 14:0.68266 15:-0.198635 16:-0.996345 17:0.168891 18:-0.634396 19:0.167586 20:-0.969169 21:0.205248 22:-0.427928 23:-0.706329
+-1 1:-0.0275995 2:0.312763 3:-0.0274198 4:0.312763 5:0.0275995 6:-0.312763 7:-0.783173 8:1 9:-0.818125 10:0.879781 11:-0.382609 12:0.998868 13:-0.373507 14:0.550562 15:-0.206737 16:-0.941832 17:0.235299 18:-0.788144 19:0.12637 20:0.625 21:0.2189 22:-0.296457 23:-0.0632531
+-1 1:0.0728748 2:0.688705 3:0.073073 4:0.688705 5:-0.0728748 6:-0.688705 7:-0.967422 8:0.972503 9:-0.811783 10:-0.823529 11:-0.402425 12:-0.255474 13:-0.413729 14:0.808763 15:-0.19968 16:-0.992517 17:0.279107 18:-0.720613 19:0.190912 20:-0.249428 21:0.21707 22:-0.0363074 23:0.455066
+-1 1:-0.0641361 2:0.781048 3:-0.0639631 4:0.781048 5:0.0641361 6:-0.781048 7:-0.716173 8:-0.926594 9:-0.827831 10:0.0238465 11:-0.40202 12:0.845508 13:-0.288778 14:0.611825 15:-0.172847 16:-0.47166 17:0.23767 18:-0.538906 19:0.173928 20:-1 21:0.20239 22:-0.0137481 23:-0.0632531
+-1 1:0.367861 2:0.875273 3:0.368114 4:0.875273 5:-0.367861 6:-0.875273 7:-0.491637 8:0.517816 9:-0.822899 10:-0.885977 11:-0.408103 12:0.999348 13:-0.39164 14:0.815635 15:-0.20753 16:-1 17:0.219743 18:-0.574671 19:0.190677 20:-0.411333 21:0.217329 22:-0.597081 23:-0.0632531
+-1 1:-0.0848077 2:0.718679 3:-0.0846386 4:0.718679 5:0.0848076 6:-0.718679 7:-0.678266 8:1 9:0.729065 10:0.819417 11:-0.401671 12:0.844973 13:-0.41233 14:0.959073 15:-0.200754 16:-1 17:0.290173 18:-1 19:0.192235 20:1 21:0.220755 22:-0.70531 23:-0.0632531
+-1 1:0.452558 2:0.884995 3:0.451583 4:0.803916 5:-0.452558 6:-0.884995 7:-0.336321 8:0.247717 9:-0.822655 10:0.373297 11:-0.400187 12:0.664069 13:-0.40934 14:0.449313 15:-0.208031 16:-1 17:0.270082 18:-0.271328 19:0.188877 20:0.447938 21:0.221133 22:-0.696443 23:-0.0632531
+-0.9891994540451646 1:0.878711 2:0.826799 3:0.878053 4:0.746904 5:-0.878711 6:-0.8268 7:0.445152 8:0.600688 9:-0.823813 10:0.0527023 11:-0.40219 12:0.80072 13:-0.412321 14:0.875692 15:-0.209252 16:-0.295667 17:0.286642 18:-0.892248 19:0.190151 20:-0.825476 21:0.217112 22:-0.666952 23:-0.0632531
+-1 1:-0.131505 2:0.288937 3:-0.131345 4:0.288937 5:0.131505 6:-0.288936 7:-0.592632 8:1 9:-0.79884 10:0.992843 11:-0.34782 12:0.996525 13:-0.362013 14:0.955728 15:-0.201709 16:-1 17:0.203913 18:-0.720892 19:0.183667 20:-0.145455 21:0.217163 22:0.265733 23:-0.508746
+-1 1:0.187559 2:0.886191 3:0.184302 4:0.805471 5:-0.187559 6:-0.886191 7:-0.822272 8:-0.234824 9:-0.8318 10:-0.0192313 11:-0.402769 12:0.0850332 13:-0.411078 14:0.28921 15:-0.203213 16:0.553724 17:0.385322 18:0.00305998 19:0.192704 20:-0.996758 21:-0.632059 22:-0.138765 23:-0.0632531
+-1 1:0.173072 2:0.777601 3:0.173288 4:0.777601 5:-0.173072 6:-0.777601 7:-0.848839 8:0.826087 9:-0.813724 10:1 11:-0.357769 12:0.888502 13:-0.295086 14:1 15:-0.153602 16:-1 17:0.0324854 18:-0.716129 19:0.0993861 20:-0.45098 21:0.20669 22:0.177911 23:-0.0632531
+-1 1:0.176732 2:0.796384 3:0.176657 4:0.776447 5:-0.176732 6:-0.796384 7:-0.842127 8:0.977011 9:-0.657994 10:0.981818 11:0.0471881 12:0.613188 13:-0.321404 14:0.2688 15:-0.191853 16:-1 17:0.0929446 18:-0.998646 19:0.0602873 20:-0.704811 21:0.159738 22:-0.350252 23:-0.0632531
+-1 1:0.58645 2:0.82128 3:0.586493 4:0.783471 5:-0.58645 6:-0.82128 7:-0.0907926 8:1 9:-0.808325 10:0.944571 11:-0.371069 12:0.684063 13:-0.396912 14:0.885453 15:-0.193172 16:-0.718137 17:0.25961 18:-0.158898 19:0.190456 20:0.339703 21:0.219271 22:-0.205989 23:-0.0632531
+-1 1:0.62724 2:0.791053 3:0.626737 4:0.752031 5:-0.62724 6:-0.791053 7:-0.0159923 8:0.40898 9:-0.823603 10:0.835046 11:-0.382693 12:0.904791 13:-0.40702 14:0.836845 15:-0.207687 16:-0.791806 17:0.283239 18:-0.132346 19:0.192312 20:-0.818653 21:0.216848 22:-0.419349 23:-0.0632531
+-1 1:-0.434857 2:0.804832 3:-0.434753 4:0.804832 5:0.434857 6:-0.804832 7:-0.0363498 8:0.885103 9:-0.82166 10:0.531992 11:-0.401279 12:0.910059 13:-0.400526 14:0.909134 15:-0.201394 16:-0.592385 17:0.278836 18:-0.627505 19:0.185883 20:-0.638811 21:0.214886 22:-0.358917 23:-0.776836
+-1 1:0.108886 2:0.946226 3:0.108605 4:0.860713 5:-0.108886 6:-0.946226 7:-0.966541 8:-0.783151 9:-0.830185 10:-0.638918 11:-0.403942 12:0.73989 13:-0.383866 14:0.717483 15:-0.192053 16:-0.995379 17:0.178553 18:-0.99175 19:0.142419 20:0.7409 21:0.223893 22:-0.709939 23:-0.0632531
+-0.9409472333492426 1:0.232942 2:0.813206 3:0.232788 4:0.794203 5:-0.232942 6:-0.813206 7:-0.739049 8:0.946444 9:-0.822963 10:1 11:-0.340441 12:0.87505 13:-0.377963 14:0.674252 15:-0.208335 16:-0.965814 17:0.0602979 18:-0.993555 19:0.117857 20:-0.817141 21:0.216226 22:-0.178362 23:-0.0632531
+-1 1:-0.266014 2:0.802172 3:-0.266261 4:0.684477 5:0.266014 6:-0.802172 7:-0.345973 8:0.339928 9:-0.770165 10:-0.905991 11:-0.414525 12:0.433412 13:-0.341727 14:-0.76482 15:-0.217774 16:-0.26747 17:0.215641 18:-0.996655 19:0.177917 20:0.496289 21:0.297168 22:-0.93215 23:-0.0632531
+-1 1:-0.297395 2:0.822057 3:-0.298088 4:0.797067 5:0.297395 6:-0.822057 7:-0.288426 8:-0.703012 9:-0.877516 10:-0.623832 11:-0.435427 12:0.716817 13:-0.0661462 14:0.703454 15:-0.110821 16:0.708766 17:0.343498 18:-0.860971 19:0.150806 20:-0.999175 21:-0.313029 22:-0.511407 23:-0.0632531
+-1 1:-0.121629 2:0.824456 3:-0.121467 4:0.824456 5:0.12163 6:-0.824456 7:-0.610743 8:0.194964 9:-0.821282 10:0.987906 11:-0.338792 12:0.925433 13:-0.10628 14:0.642163 15:-0.153755 16:-0.895073 17:0.142365 18:0.00328768 19:0.192496 20:-0.819913 21:-0.0492687 22:-0.6288 23:-0.375604
+-0.7962775312281117 1:0.261626 2:0.857417 3:0.261211 4:0.832597 5:-0.261626 6:-0.857417 7:-0.68645 8:-0.722041 9:-0.828794 10:0.499267 11:-0.399149 12:0.919395 13:-0.409012 14:0.981404 15:0.0671003 16:-0.344997 17:-0.247673 18:-0.524203 19:-0.275102 20:-1 21:0.0559075 22:-0.426953 23:-0.0632531
+-1 1:-0.176905 2:0.105501 3:-0.176752 4:0.105501 5:0.176904 6:-0.105501 7:-0.50938 8:1 9:-0.767196 10:1 11:-0.371838 12:1 13:-0.290631 14:1 15:-0.166767 16:-0.60237 17:0.236005 18:-0.578695 19:0.156663 20:-0.904036 21:0.097972 22:0.509861 23:-0.0632531
+-1 1:-0.977259 2:0.640378 3:-0.977255 4:0.640378 5:0.977259 6:-0.640378 7:0.958298 8:0.993924 9:-0.823038 10:1 11:-0.402052 12:0.981684 13:-0.411577 14:0.980987 15:-0.209289 16:-0.244231 17:0.290266 18:-1 19:0.191084 20:0.999959 21:1 22:-0.471949 23:-0.0632531
+-1 1:-0.0279924 2:0.655637 3:-0.0285908 4:0.635915 5:0.0279923 6:-0.655637 7:-0.782453 8:0.702624 9:-0.821827 10:0.709459 11:-0.400074 12:0.993758 13:-0.370443 14:0.69662 15:-0.205608 16:-0.978292 17:-0.241537 18:-0.976013 19:0.0352147 20:-0.746725 21:0.213632 22:-0.359522 23:-0.0632531
+-1 1:-0.000776896 2:0.34447 3:-0.000592183 4:0.34447 5:0.000777016 6:-0.34447 7:-0.83236 8:0.973593 9:-0.823447 10:1 11:-0.394724 12:0.997378 13:-0.371369 14:1 15:-0.17595 16:-0.691095 17:0.27506 18:0.611193 19:0.193724 20:-0.803678 21:0.199201 22:0.314628 23:-0.592274
+-1 1:0.125949 2:0.718192 3:0.12303 4:0.689688 5:-0.125949 6:-0.718192 7:-0.935252 8:-0.110605 9:-0.824609 10:0.498955 11:-0.396616 12:-0.482728 13:-0.443084 14:0.933535 15:-0.185436 16:-0.951793 17:0.174703 18:-0.903254 19:0.151946 20:-0.62203 21:0.199187 22:-0.552538 23:-0.0632531
+-1 1:0.0700696 2:0.504742 3:0.0702675 4:0.504742 5:-0.0700697 6:-0.504742 7:-0.962278 8:1 9:-0.790355 10:1 11:-0.392373 12:0.828283 13:-0.400422 14:1 15:-0.185462 16:-0.982449 17:0.0131345 18:-0.989503 19:-0.0206808 20:-0.552782 21:0.196429 22:0.126078 23:-0.0632531
+-1 1:0.0852691 2:0.616948 3:0.0854697 4:0.616948 5:-0.0852692 6:-0.616948 7:-0.99015 8:0.676902 9:-0.823441 10:0.744851 11:-0.396701 12:0.991963 13:-0.37864 14:0.905337 15:-0.198768 16:-0.996765 17:0.161128 18:-0.985585 19:0.149218 20:-0.924563 21:0.209675 22:-0.101227 23:-0.0632531
+-1 1:0.446439 2:0.504742 3:0.446706 4:0.504742 5:-0.446439 6:-0.504742 7:-0.347542 8:0.994143 9:-0.819983 10:1 11:-0.401827 12:0.995507 13:-0.192002 14:0.989292 15:-0.10923 16:-0.938863 17:0.230642 18:-0.975636 19:0.118727 20:-0.207251 21:0.214127 22:0.189972 23:-0.0632531
+-1 1:0.661109 2:0.875535 3:0.660975 4:0.83905 5:-0.661109 6:-0.875536 7:0.046117 8:-0.139148 9:-0.824391 10:-0.0240497 11:-0.402233 12:0.969953 13:-0.362246 14:0.805675 15:-0.206018 16:-0.950235 17:0.202486 18:-0.657143 19:0.191031 20:0.738729 21:0.22094 22:-0.348807 23:-0.0632531
+-1 1:0.00808634 2:0.837992 3:0.00671262 4:0.785115 5:-0.00808628 6:-0.837993 7:-0.848613 8:0.729338 9:-0.814357 10:0.307672 11:-0.401016 12:0.918671 13:-0.351137 14:0.080597 15:-0.209343 16:-0.827623 17:0.261981 18:-0.820619 19:0.148952 20:-0.77224 21:0.209775 22:-0.278504 23:-0.0632531
+-1 1:-0.116108 2:0.664517 3:-0.115944 4:0.664517 5:0.116108 6:-0.664518 7:-0.620868 8:0.905396 9:-0.818193 10:1 11:-0.402109 12:0.770438 13:-0.40849 14:-0.176046 15:-0.209406 16:-0.996834 17:0.0609481 18:-0.982965 19:-0.0160832 20:-0.99919 21:0.0859699 22:-0.207543 23:0.547938
+-1 1:0.281982 2:0.598315 3:0.282219 4:0.598315 5:-0.281982 6:-0.598314 7:-0.64912 8:1 9:-0.823004 10:-1 11:-0.40293 12:0.602564 13:-0.383852 14:-0.609375 15:-0.22389 16:-0.582609 17:0.239917 18:-0.940828 19:0.147513 20:-0.307692 21:0.216255 22:-0.613927 23:-0.0632531
+-1 1:0.18694 2:0.647717 3:0.180025 4:0.618988 5:-0.18694 6:-0.647717 7:-0.823407 8:1 9:-0.77409 10:-0.411043 11:-0.431933 12:0.137534 13:-0.367124 14:0.509269 15:-0.117898 16:0.0853549 17:0.303134 18:-0.818499 19:0.0529374 20:-0.960159 21:0.177248 22:-0.753842 23:-0.0632531
+-1 1:-0.023637 2:0.787574 3:-0.0234566 4:0.787574 5:0.023637 6:-0.787575 7:-0.79044 8:0.512479 9:-0.80905 10:0.450269 11:-0.396266 12:0.965669 13:-0.241235 14:0.374418 15:-0.197423 16:-0.999046 17:-0.298634 18:-0.964298 19:-0.0910548 20:-0.994376 21:-0.239514 22:-0.667281 23:-0.0632531
+-1 1:0.221918 2:0.919467 3:0.22068 4:0.882237 5:-0.221918 6:-0.919467 7:-0.759266 8:0.993083 9:-0.780626 10:0.96873 11:-0.318146 12:0.933471 13:-0.407664 14:0.851567 15:-0.190869 16:-0.984477 17:-0.0652056 18:-1 19:0.159315 20:-0.64597 21:0.199063 22:-0.371971 23:-0.0632531
+-1 1:-0.040382 2:0.906882 3:-0.0428088 4:0.881188 5:0.040382 6:-0.906882 7:-0.759733 8:1 9:-0.79969 10:0.557428 11:-0.37568 12:0.879031 13:-0.360098 14:-0.201698 15:-0.210911 16:-0.971737 17:0.0864258 18:-1 19:-0.152495 20:-0.55247 21:0.177363 22:-0.232054 23:-0.0632531
+-1 1:-0.0207953 2:0.870323 3:-0.0213622 4:0.828626 5:0.0207954 6:-0.870323 7:-0.795651 8:-0.132254 9:-0.824032 10:0.709544 11:-0.401474 12:0.752628 13:-0.407249 14:-0.963434 15:-0.21033 16:-0.0318351 17:0.290262 18:1 19:0.193116 20:-1 21:0.207366 22:-0.388177 23:-0.665424
+-0.1597538653661974 1:0.00473722 2:0.89531 3:0.00426237 4:0.811756 5:-0.00473724 6:-0.89531 7:-0.842472 8:-0.721188 9:-0.849335 10:-0.976985 11:-0.42231 12:0.677469 13:-0.351941 14:0.675752 15:-0.163691 16:-1 17:0.245742 18:-0.312889 19:0.186495 20:-0.924925 21:0.0795114 22:-0.69623 23:-0.0632531
+-0.08658350832329205 1:-0.0657449 2:0.8298 3:-0.0655722 4:0.8298 5:0.0657449 6:-0.8298 7:-0.713223 8:-1 9:-0.826666 10:0.416818 11:-0.396244 12:0.970449 13:-0.339628 14:0.963774 15:-0.164238 16:-0.979115 17:0.262996 18:-0.99907 19:0.0572506 20:-0.813874 21:0.203497 22:-0.100046 23:-0.0632531
+-1 1:0.0838574 2:0.807777 3:0.0840577 4:0.807777 5:-0.0838573 6:-0.807777 7:-0.987561 8:-0.508542 9:-0.824348 10:1 11:-0.401911 12:0.0635285 13:-0.41039 14:0.299216 15:-0.194277 16:-1 17:0.151461 18:-0.941342 19:0.177537 20:0.143331 21:0.218279 22:-0.965422 23:-0.0632531
+-1 1:0.139116 2:0.960209 3:0.139132 4:0.925789 5:-0.139116 6:-0.960209 7:-0.911107 8:1 9:-0.821201 10:0.776334 11:-0.39801 12:0.994801 13:-0.4056 14:-0.818117 15:-0.216809 16:-1 17:0.290544 18:-0.187024 19:0.186951 20:1 21:0.217714 22:-0.908665 23:-0.709439
+-1 1:0.199169 2:0.0985754 3:0.19939 4:0.0985754 5:-0.199169 6:-0.0985756 7:-0.800982 8:1 9:-0.786225 10:1 11:-0.374231 12:1 13:-0.40467 14:1 15:-0.203051 16:0.407612 17:1 18:0.397065 19:1 20:-0.998647 21:-0.438171 22:0.485602 23:-0.0632531
+-1 1:-0.212196 2:0.615364 3:-0.21205 4:0.615364 5:0.212196 6:-0.615364 7:-0.444663 8:0.108143 9:-0.822724 10:-0.878641 11:-0.420135 12:0.844799 13:-0.383102 14:0.956768 15:-0.189253 16:-0.969814 17:0.194752 18:-0.992026 19:0.118083 20:-0.466941 21:0.202933 22:0.0190595 23:-0.0632531
+-1 1:0.0669913 2:0.600508 3:0.0652963 4:0.579172 5:-0.0669914 6:-0.600508 7:-0.956633 8:0.82415 9:-0.816564 10:1 11:-0.399722 12:0.936592 13:-0.333198 14:-1 15:-0.209474 16:-0.470947 17:0.266949 18:-0.952452 19:0.17153 20:-0.947903 21:0.156711 22:-0.439143 23:-0.0632531
+-1 1:0.28422 2:0.818342 3:0.284345 4:0.79369 5:-0.284219 6:-0.818342 7:-0.645018 8:0.950029 9:-0.822843 10:0.900699 11:-0.400095 12:0.994106 13:-0.350851 14:0.984813 15:-0.19887 16:-0.979002 17:0.139485 18:-0.97954 19:0.120082 20:-0.930789 21:0.165772 22:-0.538836 23:-0.0632531
+-1 1:0.130179 2:0.775201 3:0.128901 4:0.718192 5:-0.130179 6:-0.775201 7:-0.927494 8:0.28365 9:-0.821738 10:-0.652963 11:-0.425953 12:-0.965808 13:-0.45138 14:-0.924065 15:-0.236101 16:-0.031014 17:0.270134 18:-0.350278 19:0.0335525 20:-0.0541865 21:0.213531 22:-0.732648 23:-0.0632531
+-0.808521001246749 1:0.1077 2:0.355669 3:0.107904 4:0.355669 5:-0.1077 6:-0.355669 7:-0.968717 8:1 9:-0.823301 10:1 11:-0.392813 12:1 13:-0.183414 14:1 15:-0.110712 16:-0.905227 17:0.190292 18:-0.989051 19:0.0248452 20:-0.968899 21:0.100771 22:0.357464 23:-0.237935
+-1 1:0.0586908 2:0.959099 3:0.0588015 4:0.93556 5:-0.0586909 6:-0.959099 7:-0.941411 8:0.759281 9:-0.746375 10:0.792796 11:-0.344139 12:0.864541 13:-0.317874 14:0.668812 15:-0.176746 16:-0.997848 17:-0.138442 18:-0.982069 19:-0.0135535 20:-0.969809 21:-0.100417 22:-0.453775 23:-0.0632531
+-1 1:-0.204918 2:0.615577 3:-0.204771 4:0.615577 5:0.204918 6:-0.615577 7:-0.45801 8:0.882485 9:-0.82115 10:0.944516 11:-0.398654 12:0.949687 13:-0.403939 14:1 15:-0.207032 16:-1 17:0.167804 18:-0.980302 19:0.15708 20:-0.980018 21:0.204044 22:-0.23681 23:-0.367254
+-1 1:-0.12548 2:0.66755 3:-0.125318 4:0.66755 5:0.12548 6:-0.66755 7:-0.603682 8:0.816161 9:-0.819215 10:1 11:-0.399237 12:0.846097 13:-0.393288 14:0.704457 15:-0.196879 16:-0.69609 17:0.274961 18:0.559999 19:0.196981 20:-0.77267 21:0.19395 22:0.0248107 23:-0.270327
+-1 1:0.21919 2:0.865353 3:0.217149 4:0.801716 5:-0.21919 6:-0.865354 7:-0.764267 8:0.731461 9:-0.773152 10:0.939203 11:-0.367811 12:0.978364 13:-0.0897448 14:0.988306 15:-0.0599378 16:-0.987179 17:-0.175706 18:-0.99627 19:-0.286132 20:-0.999573 21:-0.0379434 22:-0.291702 23:-0.0632531
+-1 1:-0.113137 2:0.749864 3:-0.112973 4:0.749864 5:0.113137 6:-0.749864 7:-0.626317 8:-1 9:-0.825229 10:0.381174 11:-0.401328 12:0.922936 13:-0.374256 14:-0.0395434 15:-0.20953 16:-0.995443 17:0.259146 18:-0.182804 19:0.191322 20:-0.0085764 21:0.217497 22:-0.583994 23:-0.0632531
diff --git a/CCSVM2Phos.range b/CCSVM2Phos.range
new file mode 100644
index 0000000..c19b090
--- /dev/null
+++ b/CCSVM2Phos.range
@@ -0,0 +1,25 @@
+x
+-1 1
+1 0.00089166 0.91614902
+2 0.07594935999999999 0.6241135
+3 0.00089166 0.91597986
+4 0.07594935999999999 0.6241135
+5 0.08385096 0.99910837
+6 0.37588653 0.92405063
+7 0 0.99821669
+8 0 1
+9 -0.09029388000000001 0.93523729
+10 0 1
+11 -0.26260048 0.6159429
+12 0 1
+13 -0.34809536 0.83837408
+14 0 1
+15 -0.3935675 0.6020075700000001
+16 0 1
+17 -0.4748702 0.26105013
+18 0 1
+19 -0.19537343 0.13230015
+20 0 1
+21 -0.35513827 0.22823867
+22 0.80000597 3.5263319
+23 0.16302781 0.88247979
diff --git a/CMemLeak.c b/CMemLeak.c
new file mode 100644
index 0000000..b7e33cf
--- /dev/null
+++ b/CMemLeak.c
@@ -0,0 +1,421 @@
+// CMemLeak.c and CMemLeak.h are taken from the public domain. If the
+// build flag DEBUG_MEMORY_LEAKS is set, then malloc is redefined,
+// to assist in tracking down memory leaks. Using Purify or Valgrind
+// is better, though.
+
+//#include "CMemLeak.h"
+#undef malloc
+#undef realloc
+#undef free
+#undef strdup
+#undef calloc
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+//#include <malloc.h>
+
+// Guards for checking illegal memory writes
+static const char xwbProtect[] = "DeAd";
+static const unsigned int xwbProtSize = sizeof(xwbProtect);
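+// (XWBMalloc pads each allocation so that the bytes just past the caller's
+// requested size hold this guard string; XWBFree and XWBReport flag an IMW
+// when the guard has been overwritten.)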
+
+// Filename of report file
+static const char xwbReportFilename[] = "MemLeak.txt";
+
+// Uninitialized memory - pick a value that will cause the most problems
+static const unsigned char xwbUninit = 0x55;
+
+// Freed memory - pick a value which will cause the most problems
+static const unsigned char xwbFreed = 0xAA;
+
+static const char xwbIMW[] = "IMW"; // Illegal memory write
+static const char xwbMLK[] = "MLK"; // Memory leak
+static const char xwbFNH[] = "FNH"; // Free Non Heap memory
+static const char xwbFMW[] = "FMW"; // Free Memory Write
+
+// Node for storing the allocation details
+struct XWBNode
+{
+ struct XWBNode* mPrev;
+ struct XWBNode* mNext;
+ void* mPtr;
+ unsigned int mSize;
+ const char* mFile;
+ unsigned int mLine;
+ const char* mName;
+};
+
+struct XWBList
+{
+ // Doubly linked list
+ struct XWBNode* mHead;
+ struct XWBNode* mTail;
+
+ FILE* mReport;
+ unsigned long mAllocUsed; // Max in the life of the program
+ unsigned long mAllocTotal; // Number of allocations
+ unsigned long mAllocCurrent; // Current allocation
+
+ unsigned int mFree; // 1 if memory to be freed
+ unsigned int mAllocMax; // Not yet - preallocate nodes
+ struct XWBNode* mNode; // Not yet - contiguous node storage
+ struct XWBNode* mUnused; // Not yet - chain of free nodes
+};
+
+
+// Link for storing allocation details
+static struct XWBList xwbMem =
+{
+ (struct XWBNode*) 0,
+ (struct XWBNode*) 0
+};
+
+// Forward declarations
+
+static struct XWBNode* XWBNodeNew(void);
+static void XWBNodeDelete(struct XWBNode* that);
+static void XWBNodeFree(
+ struct XWBNode* that,
+ const char* iName,
+ const char* iFile,
+ const unsigned int iLine);
+static void XWBNodeLink(
+ struct XWBNode*,
+ struct XWBNode*,
+ struct XWBNode*);
+static void XWBNodeSet(
+ struct XWBNode* that,
+ void* iPtr,
+ const unsigned int iSize,
+ const char* iFile,
+ const unsigned int iLine);
+static void XWBNodeIMWCheck(struct XWBNode* that);
+static void XWBMemNew(void);
+static struct XWBNode* XWBMemFind(
+ void* iPtr,
+ unsigned int* oSize,
+ const char** oFile,
+ unsigned int* oLine);
+static void XWBMemDump(void);
+static void XWBMemInsert(
+ void* iPtr,
+ const unsigned int iSize,
+ const char* iFile,
+ const unsigned int iLine);
+void XWBReport(const char* iTag);
+
+// Final Report
+void XWBReportFinal(void)
+{
+ XWBReport("Final Report");
+ fclose(xwbMem.mReport);
+ xwbMem.mReport = 0;
+}
+
+
+static struct XWBNode* XWBNodeNew(void)
+{
+ struct XWBNode* that = (struct XWBNode*)malloc(sizeof(struct XWBNode));
+ that->mPrev = 0;
+ that->mNext = 0;
+ that->mName = 0;
+
+ return that;
+}
+
+static void XWBNodeDelete(struct XWBNode* that)
+{
+ // Unlink
+ if (that->mPrev)
+ {
+ that->mPrev->mNext = that->mNext;
+ }
+
+ if (that->mNext)
+ {
+ that->mNext->mPrev = that->mPrev;
+ }
+
+ free(that);
+}
+
+static void XWBNodeFree(struct XWBNode* that, const char* iName, const char* iFile, const unsigned int iLine)
+{
+ that->mFile = iFile;
+ that->mLine = iLine;
+ that->mName = iName;
+}
+
+static void XWBNodeLink(struct XWBNode* that, struct XWBNode* iPrev, struct XWBNode* iNext)
+{
+ that->mPrev = iPrev;
+ if (iPrev != 0)
+ iPrev->mNext = that;
+
+ that->mNext = iNext;
+ if (iNext != 0)
+ iNext->mPrev = that;
+}
+
+static void XWBNodeSet(
+ struct XWBNode* that,
+ void* iPtr,
+ const unsigned int iSize,
+ const char* iFile,
+ const unsigned int iLine
+)
+{
+ that->mPtr = iPtr;
+ that->mSize = iSize;
+ that->mFile = iFile;
+ that->mLine = iLine;
+}
+
+static void XWBMemNew(void)
+{
+ // Set up the doubly linked list
+ xwbMem.mHead = XWBNodeNew();
+ xwbMem.mTail = XWBNodeNew();
+ XWBNodeLink(xwbMem.mHead, 0, xwbMem.mTail);
+ XWBNodeLink(xwbMem.mTail, xwbMem.mHead, 0);
+
+ // Initialize statistics
+ xwbMem.mAllocUsed = 0L;
+ xwbMem.mAllocTotal = 0L;
+ xwbMem.mAllocCurrent = 0L;
+
+ xwbMem.mFree = 1;
+
+ xwbMem.mReport = fopen(xwbReportFilename, "w");
+
+ atexit(XWBReportFinal);
+}
+
+// Dump List - used for debugging only
+static void XWBMemDump()
+{
+ int count;
+ struct XWBNode* iter = xwbMem.mHead;
+
+ for (count = 0; iter != 0; count++, iter = iter->mNext)
+ {
+ fprintf(xwbMem.mReport, "%d node %p prev %p next %p\n", count, iter, iter->mPrev, iter->mNext);
+ }
+ fprintf(xwbMem.mReport, "\n");
+}
+// Insert into the tracking list
+void XWBMemInsert(void* iPtr, const unsigned int iSize, const char* iFile,
+ const unsigned int iLine)
+{
+ struct XWBNode* node;
+ if (xwbMem.mHead == 0)
+ {
+ XWBMemNew();
+ }
+
+ // Link in the new node
+ node = XWBNodeNew();
+ XWBNodeSet(node, iPtr, iSize, iFile, iLine);
+ XWBNodeLink(node, xwbMem.mTail->mPrev, xwbMem.mTail);
+
+ xwbMem.mAllocTotal += 1;
+ xwbMem.mAllocCurrent += iSize;
+ if (xwbMem.mAllocUsed < xwbMem.mAllocCurrent)
+ {
+ xwbMem.mAllocUsed = xwbMem.mAllocCurrent;
+ }
+}
+
+// Find a memory pointer
+static struct XWBNode* XWBMemFind(void* iPtr, unsigned int* oSize,
+ const char** oFile, unsigned int* oLine)
+{
+ struct XWBNode* result = 0;
+ struct XWBNode* iter;
+
+ iter = xwbMem.mTail;
+ while ((iter = iter->mPrev) != xwbMem.mHead)
+ {
+ if (iter->mPtr == iPtr)
+ {
+ result = iter;
+ *oSize = iter->mSize;
+ *oFile = iter->mFile;
+ *oLine = iter->mLine;
+ break;
+ }
+ }
+ return result;
+}
+
+//Allocate memory
+void* XWBMalloc(unsigned int iSize, const char* iFile, const unsigned int iLine)
+{
+ register unsigned int usize;
+ unsigned char* result;
+
+ usize = ((iSize + xwbProtSize) / sizeof(unsigned int) + 1) * sizeof(unsigned int);
+ result = malloc(usize);
+ memset(result, xwbUninit, usize);
+ memcpy(&result[iSize], xwbProtect, xwbProtSize);
+
+ XWBMemInsert(result, iSize, iFile, iLine);
+ return (void*) result;
+}
+
+// re-allocate memory
+void* XWBRealloc(void* iPtr, unsigned int iSize, const char* iFile, const unsigned int iLine)
+{
+ register unsigned int usize;
+ unsigned char* result;
+ struct XWBNode* node;
+ unsigned int size, line;
+ const char* name;
+
+ usize = ((iSize + xwbProtSize) / sizeof(unsigned int) + 1) * sizeof(unsigned int);
+ result = realloc(iPtr, usize);
+ // memset (result, xwbUninit, usize);
+ memcpy(&result[iSize], xwbProtect, xwbProtSize);
+
+ // Update the allocation details
+ name = iFile;
+ line = iLine;
+ node = XWBMemFind(iPtr, &size, &name, &line);
+ if (node != 0)
+ {
+ XWBNodeSet(node, result, iSize, name, line);
+ xwbMem.mAllocCurrent -= size;
+ }
+ else
+ {
+ // Reallocating an untracked (or NULL) pointer: record the result as a new allocation.
+ XWBMemInsert(result, iSize, iFile, iLine);
+ }
+ xwbMem.mAllocCurrent += iSize;
+ if (xwbMem.mAllocUsed < xwbMem.mAllocCurrent)
+ {
+ xwbMem.mAllocUsed = xwbMem.mAllocCurrent;
+ }
+ return (void*)result;
+}
+
+// Unallocate memory
+void XWBFree(void* iPtr, const char* iDesc, const char* iFile, const unsigned int iLine)
+{
+ // Check if it is one of ours
+ const char* file;
+ unsigned int line;
+ unsigned int size;
+ struct XWBNode* node;
+
+ node = XWBMemFind(iPtr, &size, &file, &line);
+ if (node != 0)
+ {
+ unsigned char* ptr = (unsigned char*)iPtr;
+ if (memcmp(&ptr[size], xwbProtect, xwbProtSize) != 0)
+ {
+ // Illegal memory write
+ fprintf(xwbMem.mReport, "%s: %s allocated %s: %u\n", xwbIMW, iDesc, file, line);
+ fprintf(xwbMem.mReport, " : %s deallocated %s: %u\n", iDesc, iFile, iLine);
+ }
+ memset(iPtr, xwbFreed, size);
+ if (xwbMem.mFree)
+ {
+ free(iPtr);
+ XWBNodeDelete(node);
+ }
+ else
+ {
+ // Save the freed memory details
+ XWBNodeFree(node, iDesc, iFile, iLine);
+ }
+ xwbMem.mAllocCurrent -= size;
+ }
+ else
+ {
+ // Free non-heap memory
+ fprintf(xwbMem.mReport, "%s: %s deallocated %s: %u\n", xwbFNH, iDesc, iFile, iLine);
+
+ // Don't call free() on it - doing so might crash
+ }
+}
+
+// Do not free
+void XWBNoFree(void)
+{
+ if (xwbMem.mHead == 0)
+ {
+ XWBMemNew();
+ }
+ xwbMem.mFree = 0;
+}
+
+//Report
+void XWBReport(const char* iTag)
+{
+ struct XWBNode* iter;
+ unsigned char* ptr;
+ unsigned int size;
+ register unsigned int u;
+
+ if (xwbMem.mHead == 0)
+ {
+ XWBMemNew();
+ }
+
+ if (iTag)
+ {
+ fprintf (xwbMem.mReport, "\n%s\n", iTag);
+ }
+
+ // XWBListDump ();
+ iter = xwbMem.mHead;
+ while ((iter = iter->mNext) != xwbMem.mTail)
+ {
+ ptr = (unsigned char*)iter->mPtr;
+ size = iter->mSize;
+ if (iter->mName)
+ {
+ // Check that there are no FMWs
+ for (u = 0; u < size; u++)
+ {
+ if (ptr[u] != xwbFreed)
+ {
+ fprintf(xwbMem.mReport, "%s: %s freed at %s: %u\n",
+ xwbFMW, iter->mName, iter->mFile, iter->mLine);
+ break;
+ }
+ }
+ }
+ else
+ {
+ fprintf(xwbMem.mReport, "%s: %p %u bytes allocated %s: %u\n",
+ xwbMLK, iter->mPtr, iter->mSize, iter->mFile, iter->mLine);
+ if (memcmp(&ptr[size], xwbProtect, xwbProtSize) != 0)
+ {
+ // Illegal memory write
+ fprintf(xwbMem.mReport, "%s: %p allocated %s: %u\n",
+ xwbIMW, ptr, iter->mFile, iter->mLine);
+ }
+ }
+ }
+
+ // Print statistics
+ fprintf(xwbMem.mReport, "Total allocations : %ld\n",
+ xwbMem.mAllocTotal);
+ fprintf(xwbMem.mReport, "Max memory allocation: %lu (%luK)\n",
+ xwbMem.mAllocUsed, xwbMem.mAllocUsed / 1024);
+ fprintf(xwbMem.mReport, "Total leak : %ld\n\n",
+ xwbMem.mAllocCurrent);
+}
+
+// Duplicate a string
+char* XWBStrDup(const char* iOrig, const char* iFile, const unsigned int iLine)
+{
+ char* result;
+ result = XWBMalloc(strlen(iOrig) + 1, iFile, iLine);
+ strcpy(result, iOrig);
+ return result;
+}
+
+// Allocate a number of items of a specified size
+void* XWBCalloc(unsigned int iNum, unsigned int iSize, const char* iFile, const unsigned int iLine)
+{
+ void* result;
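+ // Round the per-item size up to a whole number of ints before multiplying
+ // by the item count (e.g. iSize = 10 with 4-byte int gives 12 bytes per item).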
+ unsigned int actual =(((iSize - 1)/sizeof(int)) + 1) * sizeof(int) * iNum;
+ result = XWBMalloc(actual, iFile, iLine);
+ memset(result, 0, actual);
+ return result;
+}
diff --git a/CMemLeak.h b/CMemLeak.h
new file mode 100644
index 0000000..c89d5b2
--- /dev/null
+++ b/CMemLeak.h
@@ -0,0 +1,43 @@
+// CMemLeak.c and CMemLeak.h are taken from the public domain. If the
+// build flag DEBUG_MEMORY_LEAKS is set, then malloc is redefined,
+// to assist in tracking down memory leaks. Using Purify or Valgrind
+// is better, though.
+#ifndef CMEMLEAK_H
+#define CMEMLEAK_H
+
+#include <stdlib.h>
+#include <string.h>
+
+// Used for tracking allocations
+extern void* XWBMalloc(unsigned int iSize, const char* iFile, const unsigned int iLine);
+extern void* XWBCalloc(unsigned int iNum, unsigned int iSize, const char* iFile,
+ const unsigned int iLine);
+extern char* XWBStrDup(const char* iOrig, const char* iFile, const unsigned int iLine);
+
+// Used for tracking reallocations
+extern void* XWBRealloc(void* iPrev, unsigned int iSize, const char* iFile, const unsigned int iLine);
+
+// Used for tracking deallocations
+extern void XWBFree(void* iPtr, const char* iDesc, const char* iFile, const unsigned int iLine);
+
+// Used for reporting
+extern void XWBReport(const char* iTag);
+extern void XWBReportFinal(void);
+
+// Used for detecting FMW
+extern void XWBNoFree(void);
+extern void XWBPreallocate(const int iInitialAllocations);
+
+//#define DEBUG_MEMORY_LEAKS
+
+// Define DEBUG_MEMORY_LEAKS (see above) to redefine malloc (etc.) and track memory leaks:
+#ifdef DEBUG_MEMORY_LEAKS
+#define malloc(x) XWBMalloc((x), __FILE__, __LINE__)
+#define realloc(x,size) XWBRealloc(x,(size),__FILE__,__LINE__)
+#define free(x) XWBFree(x, #x, __FILE__, __LINE__)
+#define strdup(x) XWBStrDup(x, __FILE__, __LINE__)
+#define calloc(num,size) XWBCalloc((num), (size), __FILE__, __LINE__)
+#endif
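+
+// A minimal usage sketch (illustrative only, not part of the original API):
+// with DEBUG_MEMORY_LEAKS defined and this header included after the system
+// headers, ordinary allocation calls are routed through the XWB* trackers
+// above, and any outstanding blocks are reported to MemLeak.txt at exit.
+//
+//   #include "CMemLeak.h"              // with DEBUG_MEMORY_LEAKS defined
+//
+//   void Demo(void)
+//   {
+//       char* Buffer = (char*)malloc(64);  // recorded by XWBMalloc
+//       Buffer = strdup("hello");          // the 64-byte block is now a leak (MLK)
+//       free(Buffer);                      // guard-checked and released by XWBFree
+//       XWBReport("after Demo");           // optional interim report
+//   }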
+
+#endif // CMEMLEAK_H
+
diff --git a/Ch2BNPEP.dat b/Ch2BNPEP.dat
new file mode 100644
index 0000000..2ae019f
Binary files /dev/null and b/Ch2BNPEP.dat differ
diff --git a/Ch2BNPEPQ.dat b/Ch2BNPEPQ.dat
new file mode 100644
index 0000000..288d3a1
Binary files /dev/null and b/Ch2BNPEPQ.dat differ
diff --git a/Ch3BNPEP.dat b/Ch3BNPEP.dat
new file mode 100644
index 0000000..382db8b
Binary files /dev/null and b/Ch3BNPEP.dat differ
diff --git a/Ch3BNPEPQ.dat b/Ch3BNPEPQ.dat
new file mode 100644
index 0000000..5cbb6fb
Binary files /dev/null and b/Ch3BNPEPQ.dat differ
diff --git a/ChargeState.c b/ChargeState.c
new file mode 100644
index 0000000..7b3fbd3
--- /dev/null
+++ b/ChargeState.c
@@ -0,0 +1,899 @@
+//Title: ChargeState.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+#include "CMemLeak.h"
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "Utils.h"
+#include "ChargeState.h"
+#include "Spectrum.h"
+#include "Inspect.h"
+#include "SVM.h"
+#include "Errors.h"
+#include "LDA.h"
+#include "IonScoring.h"
+
+#ifdef _WIN32
+#include <Windows.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#else
+#include <dirent.h>
+#include <sys/stat.h>
+#endif
+
+#define CC_USE_SVM
+
+#define EPSILON (float)0.00001
+
+SVMModel** PMCModel = NULL;
+
+extern LDAModel* PMCCharge1LDA;
+extern LDAModel* PMCCharge2LDA;
+extern LDAModel* PMCCharge3LDA;
+
+extern SVMModel* PMCCharge1SVM;
+extern SVMModel* PMCCharge2SVM;
+extern SVMModel* PMCCharge3SVM;
+
+extern LDAModel* CCModel1LDA;
+extern LDAModel* CCModel2LDA;
+
+extern SVMModel* CCModel1SVM;
+extern SVMModel* CCModel2SVM;
+
+extern PRMBayesianModel* PRMModelCharge2;
+
+// For converting parts-per-million:
+#define ONE_MILLION 1000000
+
+///////////////////////////////////////////////////
+// Forward declarations:
+void ConvolveMassCorrectedSpectrum(PMCInfo* Info, PMCSpectrumInfo* SpectrumInfo);
+
+///////////////////////////////////////////////////
+// Functions:
+
+// Get charge correction features. Most of the charge correction features are set
+// during parent mass correction - if the B/Y convolution assuming charge 2 is very high,
+// then it's most probable that the true spectrum charge is 2.
+void GetChargeCorrectionFeatures1(PMCSpectrumInfo* SpectrumInfo1, PMCSpectrumInfo* SpectrumInfo2,
+ PMCSpectrumInfo* SpectrumInfo3, float* Features)
+{
+ float TotalIntensity = 0;
+ float LowIntensity = 0; // Below m/z
+ float MediumIntensity = 0; // Between m/z and 2*m/z
+ float HighIntensity = 0; // Above 2*m/z
+ int LowPeakCount = 0;
+ int MediumPeakCount = 0;
+ int HighPeakCount = 0;
+ int PeakIndex;
+ int FeatureIndex = 0;
+ float Competitor;
+ int MZ;
+
+ MSSpectrum* Spectrum = SpectrumInfo1->Spectrum;
+ MZ = SpectrumInfo1->BestInfo->ParentMass;
+ //
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ TotalIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ if (Spectrum->Peaks[PeakIndex].Mass <= MZ)
+ {
+ LowPeakCount++;
+ LowIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ else if (Spectrum->Peaks[PeakIndex].Mass <= 2 * MZ)
+ {
+ MediumPeakCount++;
+ MediumIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ else
+ {
+ HighPeakCount++;
+ HighIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ }
+
+ // Feature: How much of the spectral intensity is above M/z?
+ Features[FeatureIndex++] = (MediumIntensity + HighIntensity) / (float)max(0.001, TotalIntensity);
+ Features[FeatureIndex++] = (MediumPeakCount + HighPeakCount) / (float)Spectrum->PeakCount;
+
+ // Features: How do the B/Y convolution values compare between charges 1 and 2?
+ Competitor = max(SpectrumInfo2->BestInfo->Convolve[0], SpectrumInfo3->BestInfo->Convolve[0]);
+ Features[FeatureIndex++] = SpectrumInfo1->BestInfo->Convolve[0] / max(EPSILON, SpectrumInfo1->BestInfo->Convolve[0] + Competitor);
+ Features[FeatureIndex++] = SpectrumInfo1->BestInfo->Convolve[0] - Competitor;
+ Competitor = max(SpectrumInfo2->BestInfo->Convolve[1], SpectrumInfo3->BestInfo->Convolve[1]);
+ Features[FeatureIndex++] = SpectrumInfo1->BestInfo->Convolve[1] / max(EPSILON, SpectrumInfo1->BestInfo->Convolve[1] + Competitor);
+ Features[FeatureIndex++] = SpectrumInfo1->BestInfo->Convolve[1] - Competitor;
+ Competitor = max(SpectrumInfo2->BestInfo->Convolve[2], SpectrumInfo3->BestInfo->Convolve[2]);
+ Features[FeatureIndex++] = SpectrumInfo1->BestInfo->Convolve[2] / max(EPSILON, SpectrumInfo1->BestInfo->Convolve[2] + Competitor);
+ Features[FeatureIndex++] = SpectrumInfo1->BestInfo->Convolve[2] - Competitor;
+ Competitor = max(SpectrumInfo2->BestInfo->Convolve[3], SpectrumInfo3->BestInfo->Convolve[3]);
+ Features[FeatureIndex++] = SpectrumInfo1->BestInfo->Convolve[3] / max(EPSILON, SpectrumInfo1->BestInfo->Convolve[3] + Competitor);
+ Features[FeatureIndex++] = SpectrumInfo1->BestInfo->Convolve[3] - Competitor;
+}
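+
+// Worked example of the convolution features above (illustrative numbers
+// only): if Convolve[0] under the charge-1 assumption is 6.0 and the larger
+// of the charge-2/charge-3 values is 2.0, the ratio feature is
+// 6.0 / (6.0 + 2.0) = 0.75 and the difference feature is 4.0; values near
+// 1.0 (or large positive differences) favor charge 1.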
+
+// Get charge correction features. Most of the charge correction features are set
+// during parent mass correction - if the B/Y convolution assuming charge 2 is very high,
+// then it's most probable that the true spectrum charge is 2.
+void GetChargeCorrectionFeatures2(PMCSpectrumInfo* SpectrumInfo2, PMCSpectrumInfo* SpectrumInfo3,
+ float* Features)
+{
+ float TotalIntensity = 0;
+ float MediumIntensity = 0;
+ float HighIntensity = 0;
+ float LowIntensity = 0;
+ int LowPeakCount = 0;
+ int MediumPeakCount = 0;
+ int HighPeakCount = 0;
+ int PeakIndex;
+ int FeatureIndex = 0;
+ float MZ;
+ float Balance2;
+ MSSpectrum* Spectrum = SpectrumInfo2->Spectrum;
+ //
+ MZ = SpectrumInfo2->BestInfo->ParentMass / (float)2.0;
+ //
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ TotalIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ if (Spectrum->Peaks[PeakIndex].Mass <= MZ)
+ {
+ LowPeakCount++;
+ LowIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ else if (Spectrum->Peaks[PeakIndex].Mass <= 2 * MZ)
+ {
+ MediumPeakCount++;
+ MediumIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ else
+ {
+ HighPeakCount++;
+ HighIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ }
+
+ // Feature: How much of the spectral intensity is above M/z?
+ Features[FeatureIndex++] = (MediumIntensity + HighIntensity) / TotalIntensity;
+ Features[FeatureIndex++] = (MediumPeakCount + HighPeakCount) / (float)Spectrum->PeakCount;
+ Features[FeatureIndex++] = (MediumIntensity) / TotalIntensity;
+ Features[FeatureIndex++] = (MediumPeakCount) / (float)Spectrum->PeakCount;
+ Features[FeatureIndex++] = (LowIntensity) / TotalIntensity;
+ Features[FeatureIndex++] = (LowPeakCount) / (float)Spectrum->PeakCount;
+ //Features[FeatureIndex++] = (HighIntensity) / TotalIntensity;
+ //Features[FeatureIndex++] = (HighPeakCount) / (float)Spectrum->PeakCount;
+
+ // Features: Balance between low and med-to-high:
+ Balance2 = (float)fabs((MediumIntensity + HighIntensity) - LowIntensity) / TotalIntensity;
+ Features[FeatureIndex++] = Balance2;
+
+ // Features: How do the B/Y convolution values compare between charges 2 and 3?
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[0] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve[0] + SpectrumInfo3->BestInfo->Convolve[0]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[0] - SpectrumInfo3->BestInfo->Convolve[0];
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[1] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve[1] + SpectrumInfo3->BestInfo->Convolve[1]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[1] - SpectrumInfo3->BestInfo->Convolve[1];
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[2] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve[2] + SpectrumInfo3->BestInfo->Convolve[2]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[2] - SpectrumInfo3->BestInfo->Convolve[2];
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[3] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve[3] + SpectrumInfo3->BestInfo->Convolve[3]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[3] - SpectrumInfo3->BestInfo->Convolve[3];
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve2[0] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve2[0] + SpectrumInfo3->BestInfo->Convolve2[0]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve2[0] - SpectrumInfo3->BestInfo->Convolve2[0];
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve2[1] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve2[1] + SpectrumInfo3->BestInfo->Convolve2[1]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve2[1] - SpectrumInfo3->BestInfo->Convolve2[1];
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve2[2] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve2[2] + SpectrumInfo3->BestInfo->Convolve2[2]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve2[2] - SpectrumInfo3->BestInfo->Convolve2[2];
+ //Features[FeatureIndex++] = Spectrum->PeakCount;
+ Features[FeatureIndex++] = (SpectrumInfo2->BestInfo->ParentMass / (float)(1000 * DALTON));
+}
+
+
+//Phosphorylation uses a distinct PMC model, so it needs a distinct CC model as well;
+//most notably, we use the IntensePeakIntensity and skew features
+void GetChargeCorrectionFeatures2Phos(PMCSpectrumInfo* SpectrumInfo2, PMCSpectrumInfo* SpectrumInfo3,
+ float* Features)
+{
+ float TotalIntensity = 0;
+ float MediumIntensity = 0;
+ float HighIntensity = 0;
+ float LowIntensity = 0;
+ int LowPeakCount = 0;
+ int MediumPeakCount = 0;
+ int HighPeakCount = 0;
+ int PeakIndex;
+ int FeatureIndex = 0;
+ float MZ;
+ float Balance2;
+ float PhosPeak2;
+ float PhosPeak3;
+ MSSpectrum* Spectrum = SpectrumInfo2->Spectrum;
+ //
+ MZ = SpectrumInfo2->BestInfo->ParentMass / (float)2.0;
+ //
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ TotalIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ if (Spectrum->Peaks[PeakIndex].Mass <= MZ)
+ {
+ LowPeakCount++;
+ LowIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ else if (Spectrum->Peaks[PeakIndex].Mass <= 2 * MZ)
+ {
+ MediumPeakCount++;
+ MediumIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ else
+ {
+ HighPeakCount++;
+ HighIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ }
+
+ // Feature: How much of the spectral intensity is above M/z?
+ Features[FeatureIndex++] = (MediumIntensity + HighIntensity) / TotalIntensity;
+ Features[FeatureIndex++] = (MediumPeakCount + HighPeakCount) / (float)Spectrum->PeakCount;
+ Features[FeatureIndex++] = (MediumIntensity) / TotalIntensity;
+ Features[FeatureIndex++] = (MediumPeakCount) / (float)Spectrum->PeakCount;
+ Features[FeatureIndex++] = (LowIntensity) / TotalIntensity;
+ Features[FeatureIndex++] = (LowPeakCount) / (float)Spectrum->PeakCount;
+ //Features[FeatureIndex++] = (HighIntensity) / TotalIntensity;
+ //Features[FeatureIndex++] = (HighPeakCount) / (float)Spectrum->PeakCount;
+
+ // Features: Balance between low and med-to-high:
+ Balance2 = (float)fabs((MediumIntensity + HighIntensity) - LowIntensity) / TotalIntensity;
+ Features[FeatureIndex++] = Balance2;
+
+ // Features: How do the B/Y convolution values compare between charges 2 and 3?
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[0] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve[0] + SpectrumInfo3->BestInfo->Convolve[0]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[0] - SpectrumInfo3->BestInfo->Convolve[0];
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[1] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve[1] + SpectrumInfo3->BestInfo->Convolve[1]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[1] - SpectrumInfo3->BestInfo->Convolve[1];
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[2] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve[2] + SpectrumInfo3->BestInfo->Convolve[2]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[2] - SpectrumInfo3->BestInfo->Convolve[2];
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[3] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve[3] + SpectrumInfo3->BestInfo->Convolve[3]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve[3] - SpectrumInfo3->BestInfo->Convolve[3];
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve2[0] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve2[0] + SpectrumInfo3->BestInfo->Convolve2[0]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve2[0] - SpectrumInfo3->BestInfo->Convolve2[0];
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve2[1] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve2[1] + SpectrumInfo3->BestInfo->Convolve2[1]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve2[1] - SpectrumInfo3->BestInfo->Convolve2[1];
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve2[2] / max(EPSILON, SpectrumInfo2->BestInfo->Convolve2[2] + SpectrumInfo3->BestInfo->Convolve2[2]);
+ Features[FeatureIndex++] = SpectrumInfo2->BestInfo->Convolve2[2] - SpectrumInfo3->BestInfo->Convolve2[2];
+ //Features[FeatureIndex++] = Spectrum->PeakCount;
+ Features[FeatureIndex++] = (SpectrumInfo2->BestInfo->ParentMass / (float)(1000 * DALTON));
+ //M-p peak related stuff
+ PhosPeak2 = (float) (max(0.1, SpectrumInfo2->BestInfo->IntensePeakIntensity[2]));
+ PhosPeak3 = (float) (max(0.1, SpectrumInfo3->BestInfo->IntensePeakIntensity[2]));
+ Features[FeatureIndex++] = PhosPeak2 / (PhosPeak2 + PhosPeak3);
+}
+
+
+int ChargeCorrectSpectrum(SpectrumNode* Node, float* Model1Score, float* Model2Score)
+{
+ PMCSpectrumInfo* SpectrumInfo1;
+ PMCSpectrumInfo* SpectrumInfo2;
+ PMCSpectrumInfo* SpectrumInfo3;
+ float CCFeatures1[64];
+ float CCFeatures2[64];
+ float Score1;
+ float Score2;
+ //
+ Score1 = 0;
+#ifdef CC_USE_SVM
+ LoadCCModelSVM(0);
+#else
+ LoadCCModelLDA(0);
+#endif
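+ // For each assumed charge z, the parent mass below is z * (m/z) - (z - 1) * HYDROGEN_MASS,
+ // i.e. the singly-protonated mass implied by the measured m/z.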
+ /////////////////////////////////
+ // Charge 1 PMC:
+ Node->Spectrum->Charge = 1;
+ Node->Spectrum->ParentMass = (Node->Spectrum->MZ * 1);
+ SpectrumInfo1 = GetPMCSpectrumInfo(Node->Spectrum);
+ PerformPMC(SpectrumInfo1);
+ /////////////////////////////////
+ // Charge 2 PMC:
+ Node->Spectrum->Charge = 2;
+ Node->Spectrum->ParentMass = (Node->Spectrum->MZ * 2) - HYDROGEN_MASS;
+ SpectrumInfo2 = GetPMCSpectrumInfo(Node->Spectrum);
+ PerformPMC(SpectrumInfo2);
+ /////////////////////////////////
+ // Charge 3 PMC:
+ Node->Spectrum->Charge = 3;
+ Node->Spectrum->ParentMass = (Node->Spectrum->MZ * 3) - 2 * HYDROGEN_MASS;
+ SpectrumInfo3 = GetPMCSpectrumInfo(Node->Spectrum);
+ PerformPMC(SpectrumInfo3);
+ // Get features:
+ memset(CCFeatures1, 0, sizeof(float) * 64);
+ memset(CCFeatures2, 0, sizeof(float) * 64);
+ GetChargeCorrectionFeatures1(SpectrumInfo1, SpectrumInfo2, SpectrumInfo3, CCFeatures1);
+ GetChargeCorrectionFeatures2(SpectrumInfo2, SpectrumInfo3, CCFeatures2); // change to the Phos function if needed
+#ifdef CC_USE_SVM
+ Score1 = SVMClassify(CCModel1SVM, CCFeatures1, 0);
+ Score2 = SVMClassify(CCModel2SVM, CCFeatures2, 0);
+#else
+ Score1 = ApplyLDAModel(CCModel1LDA, CCFeatures1);
+ Score2 = ApplyLDAModel(CCModel2LDA, CCFeatures2);
+#endif
+ // If the caller asked for them, return the scores from the two models:
+ if (Model1Score)
+ {
+ *Model1Score = Score1;
+ }
+ if (Model2Score)
+ {
+ *Model2Score = Score2;
+ }
+ // Free temporary structs:
+ FreePMCSpectrumInfo(SpectrumInfo1);
+ FreePMCSpectrumInfo(SpectrumInfo2);
+ FreePMCSpectrumInfo(SpectrumInfo3);
+ // Use cutoffs to determine the favorite charge state:
+ if (Score1 > 1.0)
+ {
+ return 1;
+ }
+ if (Score2 > 0.0)
+ {
+ return 2;
+ }
+ return 3;
+}
+
+// We've loaded a spectrum. Now let's adjust its parent mass and its charge to the
+// best possible.
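+// Layout of Node->Tweaks as filled in below: slot 2*(charge-1) holds the best
+// corrected parent mass for that charge, slot 2*(charge-1)+1 the runner-up, and
+// slots whose Charge is reset to 0 mark charge states ruled out by charge correction.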
+void TweakSpectrum(SpectrumNode* Node)
+{
+ MSSpectrum* Spectrum;
+ PMCSpectrumInfo* SpectrumInfo;
+ PMCSpectrumInfo* SpectrumInfo1;
+ PMCSpectrumInfo* SpectrumInfo2;
+ PMCSpectrumInfo* SpectrumInfo3;
+ float CCFeatures[64];
+ float CCScore;
+ int TweakIndex;
+ int Charge;
+ //
+ if (!Node->Spectrum || !Node->Spectrum->PeakCount)
+ {
+ return;
+ }
+ Spectrum = Node->Spectrum;
+ //fflush(stdout);
+ // If our models aren't loaded - which should NEVER happen in production - then we'll
+ // trust the input mass and charge.
+ if (!PRMModelCharge2)
+ {
+ if (!Spectrum->Charge)
+ {
+ Spectrum->Charge = 2;
+ Spectrum->ParentMass = (Spectrum->MZ * 2) - HYDROGEN_MASS;
+ //printf("NEC_ERROR: We are unable to load Model and spectrum has no charge!!!");
+ }
+ TweakIndex = (Spectrum->Charge - 1) * 2;
+ Node->Tweaks[TweakIndex].Charge = Spectrum->Charge;
+ Node->Tweaks[TweakIndex].ParentMass = Spectrum->ParentMass;
+ //printf("NEC_ERROR: We are unable to load PRMModelCharge!!!!\n");
+ return;
+ }
+
+ Node->Spectrum->ParentMass = (Spectrum->MZ * 2) - HYDROGEN_MASS;
+
+ //printf("A\n");
+ //fflush(stdout);
+ PrepareSpectrumForIonScoring(PRMModelCharge2, Node->Spectrum, 0);
+ //SpectrumComputeBinnedIntensities(Node);
+
+ if (!GlobalOptions->MultiChargeMode && Spectrum->FileChargeFlag)
+ {
+ // The spectrum has charge(s) assigned, and we're trusting the charge(s).
+
+ for (Charge = 1; Charge < 5; Charge++)
+ {
+ if (Spectrum->FileCharge[Charge])
+ {
+ //printf("Tweaking for charge %d\n",Charge);
+ Spectrum->Charge = Charge;
+ SpectrumInfo = GetPMCSpectrumInfo(Spectrum);
+
+ PerformPMC(SpectrumInfo);
+ TweakIndex = min(3, Spectrum->Charge - 1) * 2;
+ Node->Tweaks[TweakIndex].Charge = Spectrum->Charge;
+ Node->Tweaks[TweakIndex].ParentMass = SpectrumInfo->BestInfo->ParentMass;
+ //printf("NEC_ERROR: We have file charge!! Tweak [%d]: z= %d, PM=%d\n",TweakIndex,Spectrum->Charge,Node->Tweaks[TweakIndex].ParentMass);
+ if (SpectrumInfo->RunnerUpInfo)
+ {
+ Node->Tweaks[TweakIndex + 1].Charge = Spectrum->Charge;
+ Node->Tweaks[TweakIndex + 1].ParentMass = SpectrumInfo->RunnerUpInfo->ParentMass;
+ }
+ //SpectrumComputeNoiseDistributions(Node);
+ FreePMCSpectrumInfo(SpectrumInfo);
+ }
+ }
+ return;
+ }
+
+#ifdef CC_USE_SVM
+ //printf("NEC_ERROR: Using LoadCCModelSVM\n");
+ LoadCCModelSVM(0);
+#else
+ //printf("NEC_ERROR: Using LoadCCModelLDA\n");
+ LoadCCModelLDA(0);
+#endif
+
+
+ // Either the spectrum has no charge set, or we're overriding the file guess
+ // with our charge correction guess.
+
+ // Find the best parent mass if the charge is 1:
+ Node->Spectrum->Charge = 1;
+ SpectrumInfo1 = GetPMCSpectrumInfo(Spectrum);
+
+ //printf("D\n");
+ //fflush(stdout);
+
+ PerformPMC(SpectrumInfo1);
+ Node->Tweaks[0].Charge = 1;
+ Node->Tweaks[0].ParentMass = SpectrumInfo1->BestInfo->ParentMass;
+ //printf("NEC_ERROR: Tweak [0]: z= %d, PM=%d\n",Node->Tweaks[0].Charge,Node->Tweaks[0].ParentMass);
+ if (SpectrumInfo1->RunnerUpInfo)
+ {
+ Node->Tweaks[1].Charge = 1;
+ Node->Tweaks[1].ParentMass = SpectrumInfo1->RunnerUpInfo->ParentMass;
+ //printf("NEC_ERROR: Tweak [1]: z= %d, PM=%d\n",Node->Tweaks[1].Charge,Node->Tweaks[1].ParentMass);
+ }
+
+ // Find the best parent mass if the charge is 2:
+
+ //printf("E\n");
+ //fflush(stdout);
+
+ Node->Spectrum->Charge = 2;
+ SpectrumInfo2 = GetPMCSpectrumInfo(Spectrum);
+ PerformPMC(SpectrumInfo2);
+ Node->Tweaks[2].Charge = 2;
+ Node->Tweaks[2].ParentMass = SpectrumInfo2->BestInfo->ParentMass;
+ //printf("NEC_ERROR: Tweak[2]: z= %d, PM=%d\n",Node->Tweaks[2].Charge,Node->Tweaks[2].ParentMass);
+ if (SpectrumInfo2->RunnerUpInfo)
+ {
+ Node->Tweaks[3].Charge = 2;
+ Node->Tweaks[3].ParentMass = SpectrumInfo2->RunnerUpInfo->ParentMass;
+ //printf("NEC_ERROR: Tweak [3]: z= %d, PM=%d\n",Node->Tweaks[3].Charge,Node->Tweaks[3].ParentMass);
+ }
+
+ // Find the best parent mass if the charge is 3:
+
+ Node->Spectrum->Charge = 3;
+ SpectrumInfo3 = GetPMCSpectrumInfo(Spectrum);
+ PerformPMC(SpectrumInfo3);
+ Node->Tweaks[4].Charge = 3;
+ Node->Tweaks[4].ParentMass = SpectrumInfo3->BestInfo->ParentMass;
+ //printf("NEC_ERROR: Tweak [4]: z= %d, PM=%d\n",Node->Tweaks[4].Charge,Node->Tweaks[4].ParentMass);
+ if (SpectrumInfo3->RunnerUpInfo)
+ {
+ Node->Tweaks[5].Charge = 3;
+ Node->Tweaks[5].ParentMass = SpectrumInfo3->RunnerUpInfo->ParentMass;
+ //printf("NEC_ERROR: Tweak [5]: z= %d, PM=%d\n",Node->Tweaks[5].Charge,Node->Tweaks[5].ParentMass);
+ }
+ //printf("F\n");
+ //fflush(stdout);
+ GetChargeCorrectionFeatures1(SpectrumInfo1, SpectrumInfo2, SpectrumInfo3, CCFeatures);
+ CCScore = SVMClassify(CCModel1SVM, CCFeatures, 0);
+ if (CCScore > 0)
+ {
+ // It's a singly-charged spectrum:
+ Node->Tweaks[2].Charge = 0;
+ Node->Tweaks[3].Charge = 0;
+ Node->Tweaks[4].Charge = 0;
+ Node->Tweaks[5].Charge = 0;
+ }
+ else
+ {
+ // It's a multiply-charged spectrum:
+ Node->Tweaks[0].Charge = 0;
+ Node->Tweaks[1].Charge = 0;
+ if (GlobalOptions->PhosphorylationFlag)
+ {
+ GetChargeCorrectionFeatures2Phos(SpectrumInfo2, SpectrumInfo3, CCFeatures);
+ }
+ else
+ {
+ GetChargeCorrectionFeatures2(SpectrumInfo2, SpectrumInfo3, CCFeatures);
+ }
+ CCScore = SVMClassify(CCModel2SVM, CCFeatures, 0);
+ if (CCScore >= 0.5)
+ {
+ // It's clearly not charge-3:
+ Node->Tweaks[4].Charge = 0;
+ Node->Tweaks[5].Charge = 0;
+ }
+ if (CCScore <= -0.5)
+ {
+ // It's clearly not charge-2:
+ Node->Tweaks[2].Charge = 0;
+ Node->Tweaks[3].Charge = 0;
+ }
+ }
+ //printf("G\n");
+ //fflush(stdout);
+
+ // cleanup:
+ FreePMCSpectrumInfo(SpectrumInfo1);
+ FreePMCSpectrumInfo(SpectrumInfo2);
+ FreePMCSpectrumInfo(SpectrumInfo3);
+ //SpectrumComputeNoiseDistributions(Node);
+
+ return;
+}
+
+void TweakSpectrum_NEC(SpectrumNode* Node)
+{
+ MSSpectrum* Spectrum;
+ PMCSpectrumInfo* SpectrumInfo;
+ PMCSpectrumInfo* SpectrumInfo1;
+ PMCSpectrumInfo* SpectrumInfo2;
+ PMCSpectrumInfo* SpectrumInfo3;
+ float CCFeatures[64];
+ float CCScore;
+ int TweakIndex;
+ int Charge;
+ //
+ if (!Node->Spectrum || !Node->Spectrum->PeakCount)
+ {
+ return;
+ }
+ Spectrum = Node->Spectrum;
+ // If our models aren't loaded - which should NEVER happen in production - then we'll
+ // trust the input mass and charge.
+ if (!PRMModelCharge2)
+ {
+ if (!Spectrum->Charge)
+ {
+ Spectrum->Charge = 2;
+ Spectrum->ParentMass = (Spectrum->MZ * 2) - HYDROGEN_MASS;
+ }
+ TweakIndex = (Spectrum->Charge - 1) * 2;
+ Node->Tweaks[TweakIndex].Charge = Spectrum->Charge;
+ Node->Tweaks[TweakIndex].ParentMass = Spectrum->ParentMass;
+ return;
+ }
+
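+ // High mass-accuracy FT hybrid data: keep the precursor charge and parent mass
+ // from the file (defaulting to charge 2 if none was given) rather than re-deriving them.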
+ if(GlobalOptions->InstrumentType == INSTRUMENT_TYPE_FT_HYBRID)
+ {
+ if (!Spectrum->Charge)
+ {
+ Spectrum->Charge = 2;
+ Spectrum->ParentMass = (Spectrum->MZ * 2) - HYDROGEN_MASS;
+ }
+ TweakIndex = (Spectrum->Charge - 1) * 2;
+ Node->Tweaks[TweakIndex].Charge = Spectrum->Charge;
+ Node->Tweaks[TweakIndex].ParentMass = Spectrum->ParentMass;
+ return;
+
+ }
+ Node->Spectrum->ParentMass = (Spectrum->MZ * 2) - HYDROGEN_MASS;
+
+
+ PrepareSpectrumForIonScoring(PRMModelCharge2, Node->Spectrum, 0);
+ //SpectrumComputeBinnedIntensities(Node);
+
+ if (!GlobalOptions->MultiChargeMode && Spectrum->FileChargeFlag)
+ {
+ // The spectrum has charge(s) assigned, and we're trusting the charge(s).
+ for (Charge = 1; Charge < 5; Charge++)
+ {
+ if (Spectrum->FileCharge[Charge])
+ {
+ Spectrum->Charge = Charge;
+ SpectrumInfo = GetPMCSpectrumInfo(Spectrum);
+
+ PerformPMC(SpectrumInfo);
+ TweakIndex = min(3, Spectrum->Charge - 1) * 2;
+ Node->Tweaks[TweakIndex].Charge = Spectrum->Charge;
+ Node->Tweaks[TweakIndex].ParentMass = SpectrumInfo->BestInfo->ParentMass;
+
+ if (SpectrumInfo->RunnerUpInfo)
+ {
+ Node->Tweaks[TweakIndex + 1].Charge = Spectrum->Charge;
+ Node->Tweaks[TweakIndex + 1].ParentMass = SpectrumInfo->RunnerUpInfo->ParentMass;
+ }
+ //SpectrumComputeNoiseDistributions(Node);
+ FreePMCSpectrumInfo(SpectrumInfo);
+ }
+ }
+ return;
+ }
+#ifdef CC_USE_SVM
+ //printf("NEC_ERROR: Using LoadCCModelSVM\n");
+ LoadCCModelSVM(0);
+#else
+ //printf("NEC_ERROR: Using LoadCCModelLDA\n");
+ LoadCCModelLDA(0);
+#endif
+
+ // Either the spectrum has no charge set, or we're overriding the file guess
+ // with our charge correction guess.
+
+ // Find the best parent mass if the charge is 1:
+ Node->Spectrum->Charge = 1;
+ SpectrumInfo1 = GetPMCSpectrumInfo(Spectrum);
+
+
+
+ PerformPMC(SpectrumInfo1);
+ Node->Tweaks[0].Charge = 1;
+ Node->Tweaks[0].ParentMass = SpectrumInfo1->BestInfo->ParentMass;
+ if (SpectrumInfo1->RunnerUpInfo)
+ {
+ Node->Tweaks[1].Charge = 1;
+ Node->Tweaks[1].ParentMass = SpectrumInfo1->RunnerUpInfo->ParentMass;
+ }
+
+ // Find the best parent mass if the charge is 2:
+
+
+
+ Node->Spectrum->Charge = 2;
+ SpectrumInfo2 = GetPMCSpectrumInfo(Spectrum);
+ PerformPMC(SpectrumInfo2);
+ Node->Tweaks[2].Charge = 2;
+ Node->Tweaks[2].ParentMass = SpectrumInfo2->BestInfo->ParentMass;
+ if (SpectrumInfo2->RunnerUpInfo)
+ {
+ Node->Tweaks[3].Charge = 2;
+ Node->Tweaks[3].ParentMass = SpectrumInfo2->RunnerUpInfo->ParentMass;
+ }
+
+ // Find the best parent mass if the charge is 3:
+
+ Node->Spectrum->Charge = 3;
+ SpectrumInfo3 = GetPMCSpectrumInfo(Spectrum);
+ PerformPMC(SpectrumInfo3);
+ Node->Tweaks[4].Charge = 3;
+ Node->Tweaks[4].ParentMass = SpectrumInfo3->BestInfo->ParentMass;
+ if (SpectrumInfo3->RunnerUpInfo)
+ {
+ Node->Tweaks[5].Charge = 3;
+ Node->Tweaks[5].ParentMass = SpectrumInfo3->RunnerUpInfo->ParentMass;
+ }
+
+ GetChargeCorrectionFeatures1(SpectrumInfo1, SpectrumInfo2, SpectrumInfo3, CCFeatures);
+ CCScore = SVMClassify(CCModel1SVM, CCFeatures, 0);
+ if (CCScore > 0)
+ {
+ // It's a singly-charged spectrum:
+ Node->Tweaks[2].Charge = 0;
+ Node->Tweaks[3].Charge = 0;
+ Node->Tweaks[4].Charge = 0;
+ Node->Tweaks[5].Charge = 0;
+ }
+ else
+ {
+ // It's a multiply-charged spectrum:
+ Node->Tweaks[0].Charge = 0;
+ Node->Tweaks[1].Charge = 0;
+ if (GlobalOptions->PhosphorylationFlag)
+ {
+ GetChargeCorrectionFeatures2Phos(SpectrumInfo2, SpectrumInfo3, CCFeatures);
+ }
+ else
+ {
+ GetChargeCorrectionFeatures2(SpectrumInfo2, SpectrumInfo3, CCFeatures);
+ }
+ CCScore = SVMClassify(CCModel2SVM, CCFeatures, 0);
+ if (CCScore >= 0.5)
+ {
+ // It's clearly not charge-3:
+ Node->Tweaks[4].Charge = 0;
+ Node->Tweaks[5].Charge = 0;
+ }
+ if (CCScore <= -0.5)
+ {
+ // It's clearly not charge-2:
+ Node->Tweaks[2].Charge = 0;
+ Node->Tweaks[3].Charge = 0;
+ }
+ }
+
+ // cleanup:
+ FreePMCSpectrumInfo(SpectrumInfo1);
+ FreePMCSpectrumInfo(SpectrumInfo2);
+ FreePMCSpectrumInfo(SpectrumInfo3);
+ //SpectrumComputeNoiseDistributions(Node);
+ return;
+}
+
+// Iterate over lines of a training/testing oracle file, and invoke the callback function once for each.
+// Line format: Tab-delimited. Pieces are:
+// Spectrum file name (not full path), charge, parent mass, annotation
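+// A hypothetical example line (fields separated by tabs; the optional ":offset"
+// suffix on the file name is a byte offset of the spectrum within that file):
+//   Fraction01.mgf:34567 <tab> 2 <tab> 1254.662 <tab> K.SAMPLEPEPTIDER.A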
+void TrainOnOracleFile(char* OracleFileName, char* SpectrumDir, TrainingCallback Callback)
+{
+ int BytesToRead;
+ char LineBuffer[MAX_LINE_LENGTH];
+ int BufferPos = 0;
+ int BytesRead;
+ int BufferEnd = 0;
+ int LineNumber = 0;
+ char TextBuffer[BUFFER_SIZE * 2];
+ FILE* OracleFile;
+ char* SpectrumFileName;
+ char FilePath[2048];
+ int Charge;
+ int ParentMass;
+ FILE* DTAFile;
+ char* Field;
+ Peptide* Match;
+ SpectrumNode* Node;
+ InputFileNode* FNode;
+ char* ColonPos;
+ int SpectrumFilePos;
+ char* Extension;
+ //
+ OracleFile = fopen(OracleFileName, "rb");
+ if (!OracleFile)
+ {
+ printf("** Error: Unable to open training oracle '%s'.\n", OracleFileName);
+ return;
+ }
+ Node = (SpectrumNode*)calloc(1, sizeof(SpectrumNode));
+ FNode = (InputFileNode*)calloc(1, sizeof(InputFileNode));
+ LineNumber = 0;
+ while (1)
+ {
+ BytesToRead = BUFFER_SIZE - BufferEnd;
+ BytesRead = ReadBinary(TextBuffer + BufferEnd, sizeof(char), BytesToRead, OracleFile);
+ BufferEnd += BytesRead;
+ TextBuffer[BufferEnd] = '\0';
+ if (BufferPos == BufferEnd)
+ {
+ // We're done!
+ break;
+ }
+
+ // Copy a line of text to the line buffer. Skip spaces, and stop at carriage return or newline.
+ BufferPos = CopyBufferLine(TextBuffer, BufferPos, BufferEnd, LineBuffer, 0);
+ LineNumber += 1;
+
+ // Now, move the remaining text to the start of the buffer:
+ memmove(TextBuffer, TextBuffer + BufferPos, BufferEnd - BufferPos);
+ BufferEnd -= BufferPos;
+ BufferPos = 0;
+
+ // Now, process this line of text!
+ // Skip empty lines:
+ if (!LineBuffer[0])
+ {
+ continue;
+ }
+ if (LineBuffer[0] == '#')
+ {
+ continue;
+ }
+ SpectrumFileName = strtok(LineBuffer, "\t");
+ if (!SpectrumFileName)
+ {
+ continue;
+ }
+
+ SpectrumFilePos = 0;
+ ColonPos = SpectrumFileName;
+ if (SpectrumFileName[1] == ':')
+ {
+ ColonPos = SpectrumFileName + 2;
+ }
+ while (*ColonPos)
+ {
+ if (*ColonPos == ':')
+ {
+ *ColonPos = '\0';
+ SpectrumFilePos = atoi(ColonPos + 1);
+ break;
+ }
+ ColonPos++;
+ }
+
+ Extension = SpectrumFileName + strlen(SpectrumFileName) - 4;
+ if (!CompareStrings(Extension, ".mgf"))
+ {
+ FNode->Format = SPECTRUM_FORMAT_MGF;
+ }
+ else if (!CompareStrings(Extension, ".ms2"))
+ {
+ //FNode->Format = SPECTRUM_FORMAT_MS2;
+ FNode->Format = SPECTRUM_FORMAT_MS2_COLONS;
+ }
+ else if (!CompareStrings(Extension, ".mzxml"))
+ {
+ FNode->Format = SPECTRUM_FORMAT_MZXML;
+ }
+ else if (!CompareStrings(Extension, ".mzdata"))
+ {
+ FNode->Format = SPECTRUM_FORMAT_MZDATA;
+ }
+ else
+ {
+ FNode->Format = SPECTRUM_FORMAT_DTA;
+ }
+ if (SpectrumFileName[1] == ':')
+ {
+ sprintf(FilePath, "%s", SpectrumFileName);
+ }
+ else
+ {
+ sprintf(FilePath, "%s%s", SpectrumDir, SpectrumFileName);
+ }
+
+
+ DTAFile = fopen(FilePath, "rb");
+ if (!DTAFile)
+ {
+ printf("**Error: Couldn't open training/testing spectrum '%s'\n", FilePath);
+ continue;
+ }
+ fseek(DTAFile, SpectrumFilePos, 0);
+ Field = strtok(NULL, "\t");
+ if (!Field)
+ {
+ printf("** Syntax error: Line %d of %s\n", LineNumber, OracleFileName);
+ continue;
+ }
+ Charge = atoi(Field);
+ Field = strtok(NULL, "\t");
+ if (!Field)
+ {
+ printf("** Syntax error: Line %d of %s\n", LineNumber, OracleFileName);
+ continue;
+ }
+
+ ParentMass = (int)(atof(Field) * MASS_SCALE + 0.5);
+ Field = strtok(NULL, "\t");
+ if (!Field)
+ {
+ printf("** Syntax error: Line %d of %s\n", LineNumber, OracleFileName);
+ continue;
+ }
+
+ Match = GetPeptideFromAnnotation(Field);
+ Node->Spectrum = (MSSpectrum*)calloc(1, sizeof(MSSpectrum));
+ Node->Spectrum->Node = Node;
+ strcpy(FNode->FileName, FilePath);
+ Node->InputFile = FNode;
+ //strcpy(Node->FileName, FilePath);
+ SpectrumLoadFromFile(Node->Spectrum, DTAFile);
+ fclose(DTAFile);
+ (*Callback)(Node, Charge, ParentMass, Match);
+ FreeSpectrum(Node->Spectrum);
+ FreePeptideNode(Match);
+ }
+ // Cleanup: close the oracle file and release the reusable node structures.
+ fclose(OracleFile);
+ free(Node);
+ free(FNode);
+}
+
diff --git a/ChargeState.h b/ChargeState.h
new file mode 100644
index 0000000..81dc9af
--- /dev/null
+++ b/ChargeState.h
@@ -0,0 +1,65 @@
+//Title: ChargeState.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef CHARGE_STATE_H
+#define CHARGE_STATE_H
+
+
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include "Utils.h"
+#include "Inspect.h"
+#include "Spectrum.h"
+#include "ParentMass.h"
+
+// Code to support charge state determination. Our plan is:
+// - Organize a 'training+test' directory of spectra, half charge 2 and half charge 3. The directory should
+// include some QTOF results.
+// - Use an API in ChargeState.c to write out a set of features for these spectra. +1 means charge 3, in this case.
+// - Use libsvm to train a support vector machine on these features
+// - Use the resulting model to guess charge states if the charge is unlisted, or if the MultiCharge option is set. We
+// use simple heuristics to detect +1 spectra, then use the SVM to separate +2 and +3. If confidence is low, still search
+// both charge states.
+
+void TweakSpectrum(SpectrumNode* Node);
+void TweakSpectrum_NEC(SpectrumNode* Node);
+
+void GetChargeCorrectionFeatures1(PMCSpectrumInfo* SpectrumInfo1, PMCSpectrumInfo* SpectrumInfo2,
+ PMCSpectrumInfo* SpectrumInfo3, float* Features);
+void GetChargeCorrectionFeatures2(PMCSpectrumInfo* SpectrumInfo2, PMCSpectrumInfo* SpectrumInfo3,
+ float* Features);
+void GetChargeCorrectionFeatures2Phos(PMCSpectrumInfo* SpectrumInfo2, PMCSpectrumInfo* SpectrumInfo3,
+ float* Features);
+int ChargeCorrectSpectrum(SpectrumNode* Node, float* Model1Score, float* Model2Score);
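+// Minimal usage sketch (assumption: Node is a SpectrumNode whose spectrum and Tweaks
+// array have already been populated by the caller):
+//   TweakSpectrum(Node);
+// The charge-correction code in ChargeState.c zeroes the Charge field of Tweaks entries
+// it rules out, so later scoring only considers the surviving parent-mass/charge guesses.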
+#endif // CHARGE_STATE_H
diff --git a/CombinePTMFeatures.py b/CombinePTMFeatures.py
new file mode 100644
index 0000000..14e58ba
--- /dev/null
+++ b/CombinePTMFeatures.py
@@ -0,0 +1,627 @@
+#Title: CombinePTMFeatures.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+CombinePTMFeatures:
+- Parse the output of several runs of ComputePTMFeatures.
+- Accumulate consensus clusters, and accumulate coverage
+- Write the results to a merged directory
+"""
+import sys
+import os
+import math
+import getopt
+import string
+import struct
+import traceback
+import shutil
+import BuildConsensusSpectrum
+import ResultsParser
+import PyInspect
+from Utils import *
+from TrainPTMFeatures import FormatBits
+
+UsageInfo = """
+-r [PATH]: Directory containing the results-directories to merge
+-w [PATH]: Output file
+-d [PATH]: Database path
+
+Optional:
+-M [DIR]: Directory subtree where mzxml files *really* live.
+ Spectrum paths will be corrected to use these paths.
+-x: If set, prepare output directories. (Should be set for the first
+ run of a batch, and NOT for any others)
+-s [POS]: Start DBPosition
+-e [POS]: End DBPosition
+-q: Quick-parse flag
+-c [STRING]: Required filename chunk
+"""
+
+class PTMFeatureMerger(ResultsParser.SpectrumOracleMixin):
+ def __init__(self):
+ self.OutputDir = None
+ self.OutputPath = None
+ self.PTMFeatureDirectory = None
+ self.DBPath = None
+ self.Peptides = {} # (annotation, charge) -> peptide species
+ self.HeaderLines = []
+ self.HeaderLinesParsed = 0
+ self.DBStart = None
+ self.DBEnd = None
+ self.QuickParseFlag = 0
+ self.TotalSpectrumCount = 0
+ self.SpectrumRoot = None
+ ResultsParser.SpectrumOracleMixin.__init__(self)
+ def WipeDir(self, Dir):
+ try:
+ shutil.rmtree(Dir)
+ except:
+ pass
+ def ParseCommandLine(self, Arguments):
+ PrepareDirsFlag = 0
+ (Options, Args) = getopt.getopt(Arguments, "d:r:w:s:e:qxM:")
+ OptionsSeen = {}
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-r":
+ self.PTMFeatureDirectory = Value
+ elif Option == "-d":
+ self.DBPath = Value
+ elif Option == "-w":
+ self.OutputPath = Value
+ elif Option == "-s":
+ self.DBStart = int(Value)
+ elif Option == "-e":
+ self.DBEnd = int(Value)
+ elif Option == "-q":
+ self.QuickParseFlag = 1
+ elif Option == "-x":
+ PrepareDirsFlag = 1
+ elif Option == "-M":
+ self.SpectrumRoot = Value
+ else:
+ print "* Error: Unrecognized option %s"%Option
+ if not self.OutputPath:
+ print "* Please specify an output file (-w)"
+ print UsageInfo
+ sys.exit(-1)
+ self.OutputDir = os.path.split(self.OutputPath)[0]
+ #self.OutputPath = os.path.join(self.OutputDir, "PTMFeatures.txt")
+ self.ClusterDir = os.path.join(self.OutputDir, "Clusters")
+ self.SpectrumDir = os.path.join(self.OutputDir, "Spectra")
+ self.ClusterMemberDir = os.path.join(self.OutputDir, "ClusterMembers")
+ print "Prepare directories..."
+ if PrepareDirsFlag:
+ self.WipeDir(self.OutputDir)
+ MakeDirectory(self.OutputDir)
+ for Dir in (self.ClusterDir, self.SpectrumDir, self.ClusterMemberDir):
+ MakeDirectory(Dir)
+ for AA in "ACDEFGHIKLMNPQRSTVWY":
+ MakeDirectory(os.path.join(Dir, AA))
+ return 1 # success
+ def LoadDB(self):
+ # Populate self.DB with the contents of the .trie file
+ File = open(self.DBPath, "rb")
+ self.DB = File.read()
+ File.close()
+ self.Coverage = [0] * len(self.DB)
+ self.ModCoverage = [0] * len(self.DB)
+ def OutputCoverage(self):
+ CoveragePath = os.path.join(self.OutputDir, "Coverage.dat")
+ CoverageFile = open(CoveragePath, "wb")
+ for DBPos in range(len(self.DB)):
+ Str = struct.pack("<II", self.Coverage[DBPos], self.ModCoverage[DBPos])
+ CoverageFile.write(Str)
+ CoverageFile.close()
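+ # Coverage.dat layout, as written above: one little-endian pair of unsigned ints per
+ # database residue - (total spectrum coverage, modified-peptide coverage). As a sketch,
+ # the record for a given DBPos could be read back like this (F is a hypothetical handle):
+ #   RecordSize = struct.calcsize("<II")
+ #   F.seek(DBPos * RecordSize)
+ #   (Coverage, ModCoverage) = struct.unpack("<II", F.read(RecordSize))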
+ def OutputPTMFeatures(self):
+ File = open(self.OutputPath, "wb")
+ for FileLine in self.HeaderLines:
+ File.write(FileLine)
+ for Peptide in self.Peptides.values():
+ String = string.join(Peptide.Bits, "\t")
+ File.write(String + "\n")
+ File.close()
+ def GrabClusterMembers(self, Directory, Peptide):
+ InputPath = os.path.join(Directory, "ClusterMembers", Peptide.Annotation[2], "%s.%s.txt"%(Peptide.Annotation, Peptide.Charge))
+ OutputPath = os.path.join(self.ClusterMemberDir, Peptide.Annotation[2], "%s.%s.txt"%(Peptide.Annotation, Peptide.Charge))
+ InputFile = open(InputPath, "rb")
+ OutputFile = open(OutputPath, "a+b")
+ OutputFile.write(InputFile.read())
+ InputFile.close()
+ OutputFile.close()
+ def BuildNewPeptide(self, Cursor):
+ """
+ We've parsed the first of the cursors that contains this peptide species. Build a peptide
+ and start accumulating clusters.
+ """
+ Bits = Cursor.Bits
+
+ def AddNewPeptide(self, Directory, Bits):
+ """
+ We're parsing a peptide (from the ComputePTMFeatures output) which we haven't
+ seen before. Create a Peptide object and populate it.
+ """
+ Peptide = Bag()
+ Peptide.Bits = list(Bits)
+ Peptide.Charge = int(Bits[FormatBits.Charge])
+ Peptide.Annotation = Bits[FormatBits.Peptide]
+ SpectrumCount = int(Bits[FormatBits.SpectrumCount])
+ self.TotalSpectrumCount += SpectrumCount
+ Peptide.Peptide = GetPeptideFromModdedName(Peptide.Annotation)
+ Peptide.ModlessAnnotation = "%s.%s.%s"%(Peptide.Peptide.Prefix, Peptide.Peptide.Aminos, Peptide.Peptide.Suffix)
+ # Grab the cluster members right away:
+ self.GrabClusterMembers(Directory, Peptide)
+ # Add a cluster for the peptide:
+ Peptide.Cluster = BuildConsensusSpectrum.ConsensusBuilder(Peptide.Charge)
+ ClusterPath = os.path.join(Directory, "Clusters", Peptide.Annotation[2], "%s.%s.cls"%(Peptide.Annotation, Peptide.Charge))
+ Peptide.Cluster.UnpickleCluster(ClusterPath)
+ # Add a cluster for the modless peptide, if the cluster-file exists:
+ ModlessClusterPath = os.path.join(Directory, "Clusters", Peptide.Annotation[2], "%s.%s.cls"%(Peptide.ModlessAnnotation, Peptide.Charge))
+ if os.path.exists(ModlessClusterPath):
+ Peptide.ModlessCluster = BuildConsensusSpectrum.ConsensusBuilder(Peptide.Charge)
+ Peptide.ModlessCluster.UnpickleCluster(ModlessClusterPath)
+ else:
+ Peptide.ModlessCluster = None
+ # Add the peptide to our dictionary:
+ Key = (Peptide.Annotation, Peptide.Charge)
+ self.Peptides[Key] = Peptide
+ def AssimilatePeptide(self, Dir, Peptide, Bits):
+ """
+ We're parsing a peptide (from the ComputePTMFeatures output) which we
+ have already seen. Adjust the features of our Peptide object - accumulate
+ spectrum counts, et cetera.
+ """
+ # Best modless spectrum and MQScore. These may be empty for the new
+ # file bits, for the existing peptide, or both.
+ ScoreStr = Peptide.Bits[FormatBits.BestModlessMQScore]
+ if ScoreStr:
+ OldModlessMQScore = float(ScoreStr)
+ else:
+ OldModlessMQScore = None
+ ScoreStr = Bits[FormatBits.BestModlessMQScore]
+ if ScoreStr:
+ ModlessMQScore = float(ScoreStr)
+ else:
+ ModlessMQScore = None
+ if ModlessMQScore > OldModlessMQScore:
+ Peptide.Bits[FormatBits.BestModlessMQScore] = Bits[FormatBits.BestModlessMQScore]
+ Peptide.Bits[FormatBits.BestModlessSpectrumPath] = Bits[FormatBits.BestModlessSpectrumPath]
+ # Best modded spectrum, mqscore, delta-score:
+ OldBestMQScore = float(Peptide.Bits[FormatBits.BestMQScore])
+ BestMQScore = float(Bits[FormatBits.BestMQScore])
+ if BestMQScore > OldBestMQScore:
+ Peptide.Bits[FormatBits.BestMQScore] = Bits[FormatBits.BestMQScore]
+ Peptide.Bits[FormatBits.BestDeltaScore] = Bits[FormatBits.BestDeltaScore]
+ Peptide.Bits[FormatBits.BestSpectrumPath] = Bits[FormatBits.BestSpectrumPath]
+ # Spectra:
+ CurrentSpectra = int(Peptide.Bits[FormatBits.SpectrumCount])
+ NewBlockSpectra = int(Bits[FormatBits.SpectrumCount])
+ TotalSpectra = CurrentSpectra + NewBlockSpectra
+ self.TotalSpectrumCount += NewBlockSpectra
+ Peptide.Bits[FormatBits.SpectrumCount] = str(TotalSpectra)
+ # Modless spectra
+ Spectra = int(Peptide.Bits[FormatBits.ModlessSpectrumCount])
+ Spectra += int(Bits[FormatBits.ModlessSpectrumCount])
+ Peptide.Bits[FormatBits.ModlessSpectrumCount] = str(Spectra)
+ # Accumulate modded spectra into the cluster:
+ ClusterPath = os.path.join(Dir, "Clusters", Peptide.Annotation[2], "%s.%s.cls"%(Peptide.Annotation, Peptide.Charge))
+ TempCluster = BuildConsensusSpectrum.ConsensusBuilder(Peptide.Charge)
+ TempCluster.UnpickleCluster(ClusterPath)
+ Peptide.Cluster.AssimilateCluster(TempCluster)
+ # Accumulate modless spectra into the modless cluster:
+ ClusterPath = os.path.join(Dir, "Clusters", Peptide.Annotation[2], "%s.%s.cls"%(Peptide.ModlessAnnotation, Peptide.Charge))
+ if os.path.exists(ClusterPath):
+ TempCluster = BuildConsensusSpectrum.ConsensusBuilder(Peptide.Charge)
+ TempCluster.UnpickleCluster(ClusterPath)
+ if Peptide.ModlessCluster:
+ Peptide.ModlessCluster.AssimilateCluster(TempCluster)
+ else:
+ Peptide.ModlessCluster = TempCluster
+ # Consensus MQScore handled at the end
+ # Spectra/sites this mod type handled at the *VERY* end, possibly after
+ # multiple runs of CombinePTMFeatures!
+ # Log spectrum-count handled at the end
+ def MergeResultsFromFile(self, Path):
+ Dir = os.path.split(Path)[0]
+ File = open(Path, "rb")
+ LineNumber = 0
+ for FileLine in File.xreadlines():
+ LineNumber += 1
+ if LineNumber % 100 == 0:
+ print "%s line %s (%s peptides, %s spectra)"%(Path, LineNumber, len(self.Peptides.keys()), self.TotalSpectrumCount)
+ if self.QuickParseFlag:
+ break
+ if FileLine[0] == "#":
+ if not self.HeaderLinesParsed:
+ self.HeaderLines.append(FileLine)
+ continue
+ Bits = FileLine.strip().split("\t")
+ # Skip any blank lines:
+ if len(Bits) < 2:
+ continue
+ try:
+ DBPos = int(Bits[FormatBits.DBPos])
+ Annotation = Bits[FormatBits.Peptide]
+ Charge = int(Bits[FormatBits.Charge])
+ except:
+ print "* Warning: Line %s of %s isn't valid!"%(LineNumber, Path)
+ traceback.print_exc()
+ continue
+ # Ignore any peptides which don't fall within our database region of interest:
+ if self.DBStart != None and DBPos < self.DBStart:
+ continue
+ if self.DBEnd != None and DBPos >= self.DBEnd:
+ continue
+ Key = (Annotation, Charge)
+ Peptide = self.Peptides.get(Key, None)
+ if Peptide:
+ self.AssimilatePeptide(Dir, Peptide, Bits)
+ else:
+ self.AddNewPeptide(Dir, Bits)
+ File.close()
+ self.HeaderLinesParsed = 1
+ def FinalizePTMFeatures(self):
+ """
+ Some PTM feature processing, such as building and scoring a consensus spectrum,
+ should happen just once. Those steps happen here, *after* each input file
+ has been parsed.
+ """
+ for Peptide in self.Peptides.values():
+ # Update log-spectrum-count:
+ Spectra = int(Peptide.Bits[FormatBits.SpectrumCount])
+ Peptide.Bits[FormatBits.LogSpectrumCount] = str(math.log(Spectra))
+ # Write out the consensus MODLESS cluster:
+ if Peptide.ModlessCluster:
+ Path = os.path.join(self.ClusterDir, Peptide.Annotation[2], "%s.%s.cls"%(Peptide.ModlessAnnotation, Peptide.Charge))
+ Peptide.ModlessCluster.PickleCluster(Path)
+ # Write out the consensus MODLESS spectrum:
+ if Peptide.ModlessCluster:
+ Path = os.path.join(self.SpectrumDir, Peptide.Annotation[2], "%s.%s.dta"%(Peptide.ModlessAnnotation, Peptide.Charge))
+ Spectrum = Peptide.ModlessCluster.ProduceConsensusSpectrum()
+ Spectrum.WritePeaks(Path)
+ # Write out the CLUSTER:
+ Path = os.path.join(self.ClusterDir, Peptide.Annotation[2], "%s.%s.cls"%(Peptide.Annotation, Peptide.Charge))
+ Peptide.Cluster.PickleCluster(Path)
+ # Write out the consensus SPECTRUM:
+ ConsensusSpectrumPath = os.path.join(self.SpectrumDir, Peptide.Annotation[2], "%s.%s.dta"%(Peptide.Annotation, Peptide.Charge))
+ Spectrum = Peptide.Cluster.ProduceConsensusSpectrum()
+ Spectrum.WritePeaks(ConsensusSpectrumPath)
+ # Compute consensus spectrum features:
+ PySpectrum = PyInspect.Spectrum(ConsensusSpectrumPath, 0)
+ ScoreList = PySpectrum.ScorePeptideDetailed(Peptide.Annotation)
+ Peptide.Bits[FormatBits.ConsensusMQScore] = str(ScoreList[0])
+ Peptide.Bits[FormatBits.ConsensusPRMScore] = str(ScoreList[1])
+ Peptide.Bits[FormatBits.ConsensusBYPresent] = str(ScoreList[2])
+ Peptide.Bits[FormatBits.ConsensusTopPeaks] = str(ScoreList[3])
+ Peptide.Bits[FormatBits.NTT] = str(ScoreList[4])
+ # Compute comparison features:
+ if Peptide.ModlessCluster:
+ Peptide.Bits[FormatBits.SisterAnnotationFlag] = "1"
+ pass #NOTE: skip these features, since we don't really use them!
+
+ def AssimilateDatabaseCoverage(self, CoverageFilePath):
+ """
+ Read Coverage.dat from one of the ComputePTMFeatures runs.
+ Accumulate total coverage.
+ """
+ StructSize = struct.calcsize("<II")
+ File = open(CoverageFilePath, "rb")
+ for DBPos in range(len(self.DB)):
+ Block = File.read(StructSize)
+ (Coverage, ModCoverage) = struct.unpack("<II", Block)
+ self.Coverage[DBPos] += Coverage
+ self.ModCoverage[DBPos] += ModCoverage
+ if DBPos % 10000 == 0:
+ print "%s/%s..."%(DBPos, len(self.DB))
+ File.close()
+ def WriteSingletonPeptide(self, Species, Cursor):
+ """
+ Here's a peptide that was found in only one of the input files. That makes our job very
+ easy; all we need to do is write out the fileline, and copy over: cluster members,
+ cluster file, spectrum file, modless cluster file (if it exists), modless spectrum file (if
+ it exists)
+ """
+ # Write the file line:
+ OutputLine = string.join(Cursor.Bits, "\t")
+ self.OutputFile.write(OutputLine + "\n")
+ # Copy the associated files:
+ Annotation = Species.Annotation
+ Charge = Species.Charge
+ AA = Species.AA
+ SourcePath = os.path.join(Cursor.Directory, "ClusterMembers", AA, "%s.%s.txt"%(Annotation, Charge))
+ TargetPath = os.path.join(self.OutputDir, "ClusterMembers", AA, "%s.%s.txt"%(Annotation, Charge))
+ shutil.copyfile(SourcePath, TargetPath)
+ SourcePath = os.path.join(Cursor.Directory, "Clusters", AA, "%s.%s.cls"%(Annotation, Charge))
+ TargetPath = os.path.join(self.OutputDir, "Clusters", AA, "%s.%s.cls"%(Annotation, Charge))
+ shutil.copyfile(SourcePath, TargetPath)
+ SourcePath = os.path.join(Cursor.Directory, "Spectra", AA, "%s.%s.dta"%(Annotation, Charge))
+ TargetPath = os.path.join(self.OutputDir, "Spectra", AA, "%s.%s.dta"%(Annotation, Charge))
+ shutil.copyfile(SourcePath, TargetPath)
+ SourcePath = os.path.join(Cursor.Directory, "Clusters", AA, "%s.%s.cls"%(Species.ModlessAnnotation, Charge))
+ TargetPath = os.path.join(self.OutputDir, "Clusters", AA, "%s.%s.cls"%(Species.ModlessAnnotation, Charge))
+ if os.path.exists(SourcePath):
+ shutil.copyfile(SourcePath, TargetPath)
+ SourcePath = os.path.join(Cursor.Directory, "Spectra", AA, "%s.%s.dta"%(Species.ModlessAnnotation, Charge))
+ TargetPath = os.path.join(self.OutputDir, "Spectra", AA, "%s.%s.dta"%(Species.ModlessAnnotation, Charge))
+ if os.path.exists(SourcePath):
+ shutil.copyfile(SourcePath, TargetPath)
+ def BuildPeptideSpecies(self, Cursor):
+ Species = Bag()
+ Bits = Cursor.Bits
+ Species.Annotation = Bits[FormatBits.Peptide]
+ Species.AA = Species.Annotation[2]
+ Species.Peptide = GetPeptideFromModdedName(Species.Annotation)
+ Species.ModlessAnnotation = "%s.%s.%s"%(Species.Peptide.Prefix, Species.Peptide.Aminos, Species.Peptide.Suffix)
+ Species.Charge = int(Bits[FormatBits.Charge])
+ Species.Bits = Bits[:]
+ Species.Cluster = None # ConsensusBuilder, instantiated later
+ Species.ModlessCluster = None # ConsensusBuilder, instantiated later
+ # Fix the spectrum PATHS:
+ Species.Bits[FormatBits.BestSpectrumPath] = self.FixSpectrumPath(Species.Bits[FormatBits.BestSpectrumPath])
+ if Species.Bits[FormatBits.BestModlessSpectrumPath]:
+ Species.Bits[FormatBits.BestModlessSpectrumPath] = self.FixSpectrumPath(Species.Bits[FormatBits.BestModlessSpectrumPath])
+ return Species
+ def AssimilatePeptideSpectra(self, Species, Cursor):
+ """
+ Accumulate total spectra for this species.
+ """
+ ###############################
+ # Adjust features - best spectrum, number of spectra, etc.
+ # Best modless spectrum and MQScore. These may be empty for the new
+ # file bits, for the existing peptide, or both.
+ ScoreStr = Species.Bits[FormatBits.BestModlessMQScore]
+ if ScoreStr:
+ OldModlessMQScore = float(ScoreStr)
+ else:
+ OldModlessMQScore = None
+ ScoreStr = Cursor.Bits[FormatBits.BestModlessMQScore]
+ if ScoreStr:
+ ModlessMQScore = float(ScoreStr)
+ else:
+ ModlessMQScore = None
+ if ModlessMQScore > OldModlessMQScore:
+ Species.Bits[FormatBits.BestModlessMQScore] = Cursor.Bits[FormatBits.BestModlessMQScore]
+ Species.Bits[FormatBits.BestModlessSpectrumPath] = Cursor.Bits[FormatBits.BestModlessSpectrumPath]
+ # Best modded spectrum, mqscore, delta-score:
+ OldBestMQScore = float(Species.Bits[FormatBits.BestMQScore])
+ BestMQScore = float(Cursor.Bits[FormatBits.BestMQScore])
+ if BestMQScore > OldBestMQScore:
+ Species.Bits[FormatBits.BestMQScore] = Cursor.Bits[FormatBits.BestMQScore]
+ Species.Bits[FormatBits.BestDeltaScore] = Cursor.Bits[FormatBits.BestDeltaScore]
+ Species.Bits[FormatBits.BestSpectrumPath] = Cursor.Bits[FormatBits.BestSpectrumPath]
+ # Spectra:
+ CurrentSpectra = int(Species.Bits[FormatBits.SpectrumCount])
+ NewBlockSpectra = int(Cursor.Bits[FormatBits.SpectrumCount])
+ TotalSpectra = CurrentSpectra + NewBlockSpectra
+ self.TotalSpectrumCount += NewBlockSpectra
+ Species.Bits[FormatBits.SpectrumCount] = str(TotalSpectra)
+ # Log of spectrum-count:
+ Species.Bits[FormatBits.LogSpectrumCount] = str(math.log(TotalSpectra))
+ # Modless spectra
+ Spectra = int(Species.Bits[FormatBits.ModlessSpectrumCount])
+ Spectra += int(Cursor.Bits[FormatBits.ModlessSpectrumCount])
+ Species.Bits[FormatBits.ModlessSpectrumCount] = str(Spectra)
+ ###############################
+ # Accumulate a list of cluster members:
+ ClusterMemberFileName = "%s.%s.txt"%(Species.Annotation, Species.Charge)
+ ClusterMemberPath = os.path.join(self.OutputDir, "ClusterMembers", Species.AA, ClusterMemberFileName)
+ ClusterMemberFile = open(ClusterMemberPath, "a+b")
+ CursorMemberPath = os.path.join(Cursor.Directory, "ClusterMembers", Species.AA, ClusterMemberFileName)
+ CursorMemberFile = open(CursorMemberPath, "rb")
+ Text = CursorMemberFile.read()
+ ClusterMemberFile.write(Text)
+ CursorMemberFile.close()
+ ClusterMemberFile.close()
+ ###############################
+ # Accumulate members of the modded cluster:
+ ClusterPath = os.path.join(Cursor.Directory, "Clusters", Species.AA, "%s.%s.cls"%(Species.Annotation, Species.Charge))
+ CursorCluster = BuildConsensusSpectrum.ConsensusBuilder(Species.Charge)
+ CursorCluster.UnpickleCluster(ClusterPath)
+ if not Species.Cluster:
+ Species.Cluster = CursorCluster
+ else:
+ Species.Cluster.AssimilateCluster(CursorCluster)
+ ###############################
+ # Accumulate members of the modless cluster:
+ ClusterPath = os.path.join(Cursor.Directory, "Clusters", Species.AA, "%s.%s.cls"%(Species.ModlessAnnotation, Species.Charge))
+ if os.path.exists(ClusterPath):
+ CursorCluster = BuildConsensusSpectrum.ConsensusBuilder(Species.Charge)
+ CursorCluster.UnpickleCluster(ClusterPath)
+ if not Species.ModlessCluster:
+ Species.ModlessCluster = CursorCluster
+ else:
+ Species.ModlessCluster.AssimilateCluster(CursorCluster)
+ def WriteCompletedPeptide(self, Species):
+ """
+ We've read data for this peptide species from TWO OR MORE cursors. Now we'll write out
+ one line to the output file, and output our consensus spectrum.
+ """
+ # Write the file line:
+ OutputLine = string.join(Species.Bits, "\t")
+ self.OutputFile.write(OutputLine + "\n")
+ # Cluster:
+ ClusterOutputPath = os.path.join(self.OutputDir, "Clusters", Species.AA, "%s.%s.cls"%(Species.Annotation, Species.Charge))
+ Species.Cluster.PickleCluster(ClusterOutputPath)
+ # Consensus spectrum:
+ ConsensusSpectrum = Species.Cluster.ProduceConsensusSpectrum()
+ ConsensusSpectrumPath = os.path.join(self.OutputDir, "Spectra", Species.AA, "%s.%s.dta"%(Species.Annotation, Species.Charge))
+ ConsensusSpectrum.WritePeaks(ConsensusSpectrumPath)
+ if Species.ModlessCluster:
+ # Modless cluster:
+ ClusterOutputPath = os.path.join(self.OutputDir, "Clusters", Species.AA, "%s.%s.cls"%(Species.ModlessAnnotation, Species.Charge))
+ Species.ModlessCluster.PickleCluster(ClusterOutputPath)
+ # Modless consensus spectrum:
+ ConsensusSpectrum = Species.ModlessCluster.ProduceConsensusSpectrum()
+ ConsensusSpectrumPath = os.path.join(self.OutputDir, "Spectra", Species.AA, "%s.%s.dta"%(Species.ModlessAnnotation, Species.Charge))
+ ConsensusSpectrum.WritePeaks(ConsensusSpectrumPath)
+ def MergeResults(self):
+ print "Load db..."
+ self.LoadDB()
+ # Measure combined db coverage:
+ print "Combine database coverage..."
+ self.CombineDatabaseCoverage()
+ print "Populate MZXML oracle..."
+ self.PopulateMZXMLOracle(self.SpectrumRoot)
+ self.OutputFile = open(self.OutputPath, "wb")
+ for FileLine in self.HeaderLines:
+ self.OutputFile.write(FileLine)
+ class FeatureCursor:
+ """
+ Wrapper for a feature file - tracks the "next" peptide.
+ """
+ def __init__(self, Path):
+ self.Path = Path
+ self.File = open(Path, "rb")
+ self.NextKey = None
+ self.Bits = None
+ self.Directory = os.path.split(Path)[0]
+ self.HeaderLines = []
+ def Close(self):
+ self.File.close()
+ self.NextKey = None
+ def GetNextPeptide(self):
+ # Read one or more lines (skipping header or invalid lines), and remember
+ # the next peptide to be processed
+ while (1):
+ FileLine = self.File.readline()
+ if not FileLine:
+ self.NextKey = None
+ return None # EOF
+ # Skip blank or comment lines:
+ if FileLine[0] == "#":
+ self.HeaderLines.append(FileLine)
+ continue
+ if not FileLine.strip():
+ continue
+ # Attempt to parse the line:
+ Bits = FileLine.strip().split("\t")
+ try:
+ ModDBPos = int(Bits[FormatBits.DBPos])
+ ModMass = int(Bits[FormatBits.ModificationMass])
+ Annotation = Bits[FormatBits.Peptide]
+ Charge = int(Bits[FormatBits.Charge])
+ except:
+ traceback.print_exc()
+ continue # skip invalid line
+ # We know our next key, so stop now:
+ self.NextKey = (ModDBPos, ModMass, Annotation, Charge)
+ self.Bits = Bits
+ break
+ self.FeatureCursors = []
+ # List the directories that need to be parsed, and open the files:
+ for SubDirectory in os.listdir(self.PTMFeatureDirectory):
+ Dir = os.path.join(self.PTMFeatureDirectory, SubDirectory)
+ if not os.path.isdir(Dir):
+ continue
+ FeatureFilePath = os.path.join(Dir, "PTMFeatures.txt")
+ if not os.path.exists(FeatureFilePath):
+ print "* Warning: Subdirectory %s doesn't contain a feature file!"%Dir
+ continue
+ CoverageFilePath = os.path.join(Dir, "Coverage.dat")
+ if not os.path.exists(CoverageFilePath):
+ print "* Warning: Subdirectory %d doesn't contain a coverage file!"%Dir
+ continue
+ Cursor = FeatureCursor(FeatureFilePath)
+ Cursor.GetNextPeptide()
+ self.FeatureCursors.append(Cursor)
+ # Output the header lines from an (arbitrary) cursor:
+ for HeaderLine in self.FeatureCursors[0].HeaderLines:
+ self.OutputFile.write(HeaderLine)
+ # Loop through the peptides, until all cursors hit EOF. At each stage, process
+ # the peptide with the first key - and process it from any and all cursors which
+ # are now pointing at it.
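+ # For example (the key values here are hypothetical): if cursor A and cursor B are both
+ # at key (1042, 80, "K.S+80PEPR.A", 2) while cursor C sits at a larger key, the species
+ # at the smaller key is assimilated from A and B in this pass, and C waits for a later pass.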
+ while (1):
+ CursorsForThisKey = 0
+ CursorThisKey = None
+ FirstKey = None
+ print
+ for Cursor in self.FeatureCursors:
+ if Cursor.NextKey != None and (FirstKey == None or FirstKey > Cursor.NextKey):
+ FirstKey = Cursor.NextKey
+ CursorThisKey = Cursor
+ CursorsForThisKey = 0
+ if Cursor.NextKey == FirstKey:
+ CursorsForThisKey += 1
+ if Cursor.NextKey == FirstKey:
+ print "*Cursor %s: %s"%(Cursor.Path, Cursor.NextKey)
+ else:
+ print "Cursor %s: %s"%(Cursor.Path, Cursor.NextKey)
+ if FirstKey == None:
+ break
+ Species = self.BuildPeptideSpecies(CursorThisKey)
+ print "Next species is '%s', found in %s files."%(Species.Annotation, CursorsForThisKey)
+ ########################################################################
+ # Shortcut: If CursorsForThisKey == 1, then we don't need to re-build the
+ # cluster or the consensus spectrum!
+ if CursorsForThisKey == 1:
+ self.WriteSingletonPeptide(Species, CursorThisKey)
+ CursorThisKey.GetNextPeptide()
+ continue
+ ########################################################################
+ # Standard case: Two or more cursors have the same peptide. Assimilate the
+ # total spectra, and cluster members, from each one!
+ for Cursor in self.FeatureCursors:
+ if Cursor.NextKey == FirstKey:
+ self.AssimilatePeptideSpectra(Species, Cursor)
+ Cursor.GetNextPeptide()
+ # We've assimilated all spectra into the cluster; now write out the
+ # totaled spectrum count and the consensus spectra!
+ self.WriteCompletedPeptide(Species)
+ ########################################################
+ # All peptides have been written out; every cursor is at EOF.
+ # Loop over cursors and finish up:
+ for Cursor in self.FeatureCursors:
+ Cursor.Close()
+ def CombineDatabaseCoverage(self):
+ """
+ Iterate over directories, and compute the total coverage for each database residue.
+ """
+ for SubDirectory in os.listdir(self.PTMFeatureDirectory):
+ Dir = os.path.join(self.PTMFeatureDirectory, SubDirectory)
+ if not os.path.isdir(Dir):
+ continue
+ CoverageFilePath = os.path.join(Dir, "Coverage.dat")
+ if not os.path.exists(CoverageFilePath):
+ print "* Warning: Subdirectory %s doesn't contain a coverage file!"%Dir
+ continue
+ print "Assimilate from %s..."%CoverageFilePath
+ self.AssimilateDatabaseCoverage(CoverageFilePath)
+ self.OutputCoverage()
+
+if __name__ == "__main__":
+ try:
+ import psyco
+ except:
+ print "(psyco not available - no optimization for you)"
+ Merger = PTMFeatureMerger()
+ Result = Merger.ParseCommandLine(sys.argv[1:])
+ if not Result:
+ print UsageInfo
+ sys.exit(-1)
+ Merger.MergeResults()
+
diff --git a/CompareHEKPTM.py b/CompareHEKPTM.py
new file mode 100644
index 0000000..c751499
--- /dev/null
+++ b/CompareHEKPTM.py
@@ -0,0 +1,808 @@
+#Title: CompareHEKPTM.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+Compare the collection of PTMs found in the HEK293 data-set with the PTMs reported
+in external databases (HPRD and Uniprot)
+"""
+
+import os
+import getopt
+import string
+import sys
+import struct
+import traceback
+import cPickle
+import xml.dom.minidom
+import xml.sax.handler
+from xml.dom.minidom import Node
+from TrainPTMFeatures import FormatBits
+from Utils import *
+Initialize()
+
+AminoAcidLetters = "ACDEFGHIKLMNPQRSTVWY"
+
+# HEK options:
+InspectOutputFileName = r"C:\Documents and Settings\swt\Desktop\SWTPapers\PTMScoring\SupplementalTables\STHEKSitesK2.txt"
+IPIDatabasePath = os.path.join("Database", "IPISubDB.trie")
+FDRPValueCutoff = 0.270065269846
+
+# LENS options:
+##InspectOutputFileName = r"C:\Documents and Settings\swt\Desktop\SWTPapers\PTMScoring\SupplementalTables\ST1LensSitesFullK2.txt"
+##IPIDatabasePath = os.path.join("Database", "Lens99.trie")
+##FDRPValueCutoff = 0.580962281
+
+UniprotXMLFileName = r"F:\Uniprot\uniprot_sprot.xml"
+HPRDDir = r"f:\ftproot\HPRD\HPRD_XML_010107\HPRD_XML_010107"
+
+SkipModificationNames = {"proteolytic cleavage":1,
+ "disulfide bridge":1,
+ }
+
+
+def GetXMLText(NodeList, Strip = 0):
+ """
+ Gets the text associated with an XML Node
+ <a>RETURNS THIS TEXT </a>
+ """
+ BodyText = ""
+ for Node in NodeList:
+ if Node.nodeType == Node.TEXT_NODE:
+ BodyText += Node.data
+ if Strip: # strip the whitespace characters handled below
+ BodyText = BodyText.replace(" ","")
+ BodyText = BodyText.replace("\r","")
+ BodyText = BodyText.replace("\n","")
+ BodyText = BodyText.replace("\t","")
+ return BodyText
+
+def FindDBLocations(DB, Aminos):
+ """
+ Find all occurrences of this peptide in the database.
+ Return DB indices.
+ """
+ PrevPos = -1
+ LocationList = []
+ while (1):
+ Pos = DB.find(Aminos, PrevPos + 1)
+ if Pos == -1:
+ break
+ LocationList.append(Pos)
+ PrevPos = Pos
+ return LocationList
+
+
+class CompareMaster:
+ """
+ This class keeps track of the proteins in the canonical HEK database,
+ as well as the modifications from various sources.
+ """
+ def __init__(self):
+ pass
+ def InitializeModMasses(self):
+ """
+ Initialize self.ModMasses, which maps (lower-case) PTM names to masses.
+ Because the list is rather long, it has been moved to a table in
+ ExternalPTMNames.txt
+ """
+ self.ModMasses = {}
+ File = open("ExternalPTMNames.txt", "rb")
+ for FileLine in File.xreadlines():
+ Bits = FileLine.split("\t")
+ Bits = list(Bits)
+ # Repair Excel's broken columns:
+ if Bits[0][0] == '"':
+ Bits[0] = Bits[0][1:-1]
+ if FileLine[0] == "#" or len(Bits) < 2:
+ continue
+ Name = Bits[0].lower()
+ Mass = int(Bits[1])
+ self.ModMasses[Name] = Mass
+ File.close()
+ def LoadDatabase(self):
+ self.ProteinPos = []
+ self.DB = ""
+ File = open(IPIDatabasePath, "rb")
+ self.DB = File.read()
+ File.close()
+ PrevPos = -1
+ while 1:
+ NextPos = self.DB.find("*", PrevPos + 1)
+ if NextPos == -1:
+ break
+ self.ProteinPos.append(PrevPos + 1)
+ PrevPos = NextPos
+ # Read protein names, too:
+ IndexPath = os.path.splitext(IPIDatabasePath)[0] + ".index"
+ BlockSize = struct.calcsize("qi80s")
+ IndexFile = open(IndexPath, "rb")
+ self.ProteinNames = []
+ while 1:
+ Block = IndexFile.read(BlockSize)
+ if not Block:
+ break
+ Tuple = struct.unpack("qi80s", Block)
+ self.ProteinNames.append(Tuple[2])
+
+ def GetDBPosInfo(self, Pos):
+ """
+ Return (ProteinName, ProteinResidueNumber)
+ """
+ for ProteinIndex in range(len(self.ProteinPos)):
+ ProteinStart = self.ProteinPos[ProteinIndex]
+ if Pos < ProteinStart:
+ continue
+ if ProteinIndex < len(self.ProteinPos) - 1:
+ ProteinEnd = self.ProteinPos[ProteinIndex + 1]
+ if Pos >= ProteinEnd:
+ continue
+ else:
+ pass
+ ResidueNumber = Pos - ProteinStart
+ return (self.ProteinNames[ProteinIndex], ResidueNumber)
+ def FindPeptideLocations(self, Aminos):
+ """
+ Given an amino acid string, find all locations in the database.
+ Return a list of the form (ProteinIndex, ResidueNumber)
+ """
+ PrevPos = -1
+ LocationList = []
+ while (1):
+ Pos = self.DB.find(Aminos, PrevPos + 1)
+ if Pos == -1:
+ break
+ # Which protein does Pos lie in?
+ LowIndex = 0
+ HighIndex = len(self.ProteinPos) - 1
+ # Pos >= ProteinPos[LowIndex] and Pos < ProteinPos[HighIndex]
+ # Special case - last protein:
+ if Pos > self.ProteinPos[HighIndex]:
+ ProteinID = HighIndex
+ ResidueNumber = Pos - self.ProteinPos[HighIndex]
+ else:
+ while (1):
+ if LowIndex + 1 == HighIndex:
+ ProteinID = LowIndex
+ ResidueNumber = Pos - self.ProteinPos[LowIndex]
+ break
+ MidIndex = (LowIndex + HighIndex) / 2
+ if Pos >= self.ProteinPos[MidIndex]:
+ LowIndex = MidIndex
+ else:
+ HighIndex = MidIndex
+ LocationList.append((ProteinID, ResidueNumber))
+ PrevPos = Pos
+ return LocationList
+ def ParsePTMsInspect5(self):
+ return self.ParsePTMsInspect(0.05)
+ def ParsePTMsInspect(self, PValueThreshold = None):
+ if not PValueThreshold:
+ PValueThreshold = FDRPValueCutoff
+ PTMDictionary = {}
+ File = open(InspectOutputFileName, "rb")
+ LineNumber = 0
+ for FileLine in File.xreadlines():
+ LineNumber += 1
+ if LineNumber % 1000 == 0:
+ print "Line %s..."%LineNumber
+ if FileLine[0] == "#":
+ continue
+ Bits = FileLine.strip().split("\t")
+ try:
+ ProteinName = Bits[FormatBits.ProteinName]
+ #DeltaMass = int(Bits[FormatBits.ModificationMass])
+ DeltaMass = int(Bits[49])
+ Annotation = Bits[FormatBits.Peptide]
+ PeptidePValue = Bits[FormatBits.ModelPValue]
+ SitePValue = float(Bits[FormatBits.SitePValue])
+ KnownAnnotation = Bits[FormatBits.KnownPTMAnnotation]
+ KnownFlag = int(Bits[47])
+ if KnownFlag:
+ KeepAnnotation = KnownAnnotation
+ else:
+ KeepAnnotation = Annotation
+ Peptide = GetPeptideFromModdedName(KeepAnnotation)
+ except:
+ traceback.print_exc()
+ continue
+ try:
+ SitePValue = float(Bits[50])
+ except:
+ pass
+ if PValueThreshold != None and SitePValue >= PValueThreshold:
+ continue
+ # If the protein is shuffled (protein name starts with XXX), ignore the PTM:
+ if ProteinName[:3] == "XXX":
+ continue
+ if not Peptide.Modifications.keys():
+ continue # it's actually unmodified!
+ ModPos = Peptide.Modifications.keys()[0]
+ FullAminos = Peptide.Aminos
+ if Peptide.Prefix in AminoAcidLetters:
+ FullAminos = Peptide.Prefix + FullAminos
+ if Peptide.Suffix in AminoAcidLetters:
+ FullAminos = FullAminos + Peptide.Suffix
+ DBHitList = FindDBLocations(self.DB, FullAminos)
+ if not DBHitList:
+ print "*** Warning: Peptide '%s' not found in database!"%FullAminos
+ continue
+ for DBPos in DBHitList:
+ ModDBPos = DBPos + ModPos
+ if Peptide.Prefix in AminoAcidLetters:
+ ModDBPos += 1
+ if not PTMDictionary.has_key(ModDBPos):
+ PTMDictionary[ModDBPos] = []
+ # Avoid adding redundant records:
+ FoundFlag = 0
+ for (OldMass, OldName) in PTMDictionary[ModDBPos]:
+ if OldMass == DeltaMass:
+ FoundFlag = 1
+ break
+ if not FoundFlag:
+ PTMDictionary[ModDBPos].append((DeltaMass, "%+d"%DeltaMass))
+ print ModDBPos, DeltaMass, KeepAnnotation
+ File.close()
+ self.PTMDictionaryInspect = PTMDictionary
+ print "Inspect parse: Found %s modified residues in %s file lines"%(len(PTMDictionary.keys()), LineNumber)
+ return PTMDictionary
+ def ParsePTMsUniprot(self):
+ SAXParser = xml.sax.make_parser()
+ UniprotParser = UniprotXMLParser(self.DB)
+ UniprotParser.ModMasses = self.ModMasses
+ SAXParser.setContentHandler(UniprotParser)
+ print "Parse %s..."%UniprotXMLFileName
+ SAXParser.parse(UniprotXMLFileName)
+ self.PTMDictionaryUniprot = UniprotParser.PTMDictionary
+ print "Reporting UNKNOWN PTM names..."
+ UniprotParser.ReportUnknownPTMs("UnknownPTMs.Uniprot.txt")
+ return self.PTMDictionaryUniprot
+ def ParsePTMsHPRD(self):
+ SAXParser = xml.sax.make_parser()
+ HPRDParser = HPRDXMLParser(self.DB)
+ HPRDParser.ModMasses = self.ModMasses
+ SAXParser.setContentHandler(HPRDParser)
+ #print "Parse %s..."%HPRDXMLFileName
+ FileNames = os.listdir(HPRDDir)
+ for FileNameIndex in range(len(FileNames)):
+ FileName = FileNames[FileNameIndex]
+ print "%s/%s: %s"%(FileNameIndex, len(FileNames), FileName)
+ XMLFilePath = os.path.join(HPRDDir, FileName)
+ try:
+ SAXParser.parse(XMLFilePath)
+ except:
+ traceback.print_exc()
+ print "* Error parsing %s"%XMLFilePath
+ self.PTMDictionaryHPRD = HPRDParser.PTMDictionary
+ print "Reporting UNKNOWN PTM names..."
+ HPRDParser.ReportUnknownPTMs("UnknownPTMs.HPRD.txt")
+ return self.PTMDictionaryHPRD
+ def ComparePTMDictionariesOneWay(self, DictA, DictB,
+ MaxMassDiff = 2, MaxPosDiff = 2, LimitMassFlag = None):
+ """
+ Determine how many of A's PTMs are found in B:
+ """
+ HitA = 0
+ MissA = 0
+ TotalA = 0
+ for (Pos, ModList) in DictA.items():
+ for (Mass, Name) in ModList:
+ # LimitMassFlag == 1: Skip very small and very large PTMs.
+ if LimitMassFlag == 1:
+ if abs(Mass) < 5 or abs(Mass) >= 250:
+ continue
+ elif LimitMassFlag != None:
+ if Mass != LimitMassFlag:
+ continue
+ TotalA += 1
+ HitFlag = 0
+ AllowedPositions = [Pos]
+ for Diff in range(1, MaxPosDiff + 1):
+ AllowedPositions.append(Pos + Diff)
+ AllowedPositions.append(Pos - Diff)
+ for NearPos in AllowedPositions:
+ List = DictB.get(NearPos, [])
+ for (OtherMass, OtherName) in List:
+ if abs(Mass - OtherMass) <= MaxMassDiff:
+ HitFlag = 1
+ if HitFlag:
+ HitA += 1
+ else:
+ MissA += 1
+ return (HitA, MissA, TotalA)
+ def ComparePTMDictionaries(self, DictA, DictB,
+ MaxMassDiff = 2, MaxPosDiff = 2, LimitMassFlag = None):
+ """
+ Simple comparison: How many sites are shared between these two
+ dictionaries of PTMs?
+ """
+ (HitA, MissA, TotalA) = self.ComparePTMDictionariesOneWay(DictA, DictB,
+ MaxMassDiff, MaxPosDiff, LimitMassFlag)
+ (HitB, MissB, TotalB) = self.ComparePTMDictionariesOneWay(DictB, DictA,
+ MaxMassDiff, MaxPosDiff, LimitMassFlag)
+ print "ComparePTMDictionaries:"
+ SharedPercent = 100 * HitA / float(max(TotalA, 1))
+ print "A: %s total. %s (%.2f%%) shared, %s not shared)"%(TotalA, HitA, SharedPercent, MissA)
+ SharedPercent = 100 * HitB / float(max(TotalB, 1))
+ print "B: %s total. %s (%.2f%%) shared, %s not shared)"%(TotalB, HitB, SharedPercent, MissB)
+ OverallSharedPercent = (HitA + HitB) / float(max(1, TotalA + TotalB))
+ print "Overall shared percent: %.2f"%(100 * OverallSharedPercent)
+ def ParseAndOrPickle(self, ParseMethod, PickleFileName):
+ """
+ Parse PTMs from a file, OR unpickle them from a pre-parsed binary file.
+ (Parsing is slow, so we do it just once)
+ """
+ if os.path.exists(PickleFileName):
+ print "Loading PTM dictionary from %s..."%PickleFileName
+ File = open(PickleFileName, "rb")
+ Dictionary = cPickle.load(File)
+ File.close()
+ else:
+ Dictionary = ParseMethod()
+ print "Saving PTM dictionary to %s..."%PickleFileName
+ File = open(PickleFileName, "wb")
+ cPickle.dump(Dictionary, File)
+ File.close()
+ return Dictionary
+ def DebugReportPTMs(self):
+ """
+ For debugging purposes, print a file showing all the PTMs found in any source.
+ """
+ AllKeyDict = {}
+ for Key in self.PTMDictionaryInspect.keys():
+ AllKeyDict[Key] = 1
+## for Key in self.PTMDictionaryInspect5.keys():
+## AllKeyDict[Key] = 1
+ for Key in self.PTMDictionaryHPRD.keys():
+ AllKeyDict[Key] = 1
+ for Key in self.PTMDictionaryUniprot.keys():
+ AllKeyDict[Key] = 1
+ AllKeys = AllKeyDict.keys()
+ AllKeys.sort()
+ for Key in AllKeys:
+ DBStart = max(0, Key - 7)
+ DBEnd = min(len(self.DB), Key + 8)
+ Aminos = "%s.%s.%s"%(self.DB[DBStart:Key], self.DB[Key], self.DB[Key + 1:DBEnd])
+ MassInspect = self.PTMDictionaryInspect.get(Key, "")
+ #MassInspect5 = self.PTMDictionaryInspect5.get(Key, "")
+ MassHPRD = self.PTMDictionaryHPRD.get(Key, "")
+ MassUniprot = self.PTMDictionaryUniprot.get(Key, "")
+ if MassHPRD == "" and MassUniprot == "":
+ continue
+ Str = "%s\t%s\t%s\t%s\t%s\t"%(Key, Aminos, MassInspect, MassHPRD, MassUniprot)
+ print Str
+ def DebugPrintDict(self, Dict):
+ OverallCount = 0
+ for (Key, List) in Dict.items():
+ OverallCount += len(List)
+ print "Overall count:", OverallCount
+ def Main(self):
+ self.LoadDatabase()
+ self.InitializeModMasses()
+ self.PTMDictionaryInspect = self.ParseAndOrPickle(self.ParsePTMsInspect, "PTMDictionaryInspect.pickle")
+ #self.PTMDictionaryInspect5 = self.ParseAndOrPickle(self.ParsePTMsInspect5, "PTMDictionaryInspect5.pickle")
+ self.PTMDictionaryHPRD = self.ParseAndOrPickle(self.ParsePTMsHPRD, "PTMDictionaryHPRD.pickle")
+ self.PTMDictionaryUniprot = self.ParseAndOrPickle(self.ParsePTMsUniprot, "PTMDictionaryUniprot.pickle")
+ print len(self.PTMDictionaryInspect.keys())
+ #print len(self.PTMDictionaryInspect5.keys())
+ for Dict in (self.PTMDictionaryInspect, self.PTMDictionaryHPRD, self.PTMDictionaryUniprot):
+ self.DebugPrintDict(Dict)
+ #self.DebugCountPTMs(Dict)
+ print "\n\nUniprot and HPRD: EXACT"
+ self.ComparePTMDictionaries(self.PTMDictionaryUniprot, self.PTMDictionaryHPRD, 0, 0, LimitMassFlag = None)
+ print "\n\nUniprot and HPRD: EXACT, omit bad masses"
+ self.ComparePTMDictionaries(self.PTMDictionaryUniprot, self.PTMDictionaryHPRD, 0, 0, LimitMassFlag = 1)
+ print "\n\nUniprot and HPRD: omit bad masses"
+ self.ComparePTMDictionaries(self.PTMDictionaryUniprot, self.PTMDictionaryHPRD, LimitMassFlag = 1)
+
+## print "\n\nHEK293 best and uniprot, omit bad masses"
+## self.ComparePTMDictionaries(self.PTMDictionaryInspect5, self.PTMDictionaryUniprot, LimitMassFlag = 1)
+## print "\n\nHEK293 best and hprd, omit bad masses"
+## self.ComparePTMDictionaries(self.PTMDictionaryInspect5, self.PTMDictionaryHPRD, LimitMassFlag = 1)
+ print "\n\nHEK293 and uniprot, omit bad masses"
+ self.ComparePTMDictionaries(self.PTMDictionaryInspect, self.PTMDictionaryUniprot, LimitMassFlag = 1)
+ print "\n\nHEK293 and hprd, omit bad masses"
+ self.ComparePTMDictionaries(self.PTMDictionaryInspect, self.PTMDictionaryHPRD, LimitMassFlag = 1)
+ for KeyMass in (14, 28, 42, 80):
+ print "\n\nUniprot and HPRD: Mass %s"%KeyMass
+ self.ComparePTMDictionaries(self.PTMDictionaryUniprot, self.PTMDictionaryHPRD, LimitMassFlag = KeyMass)
+ print "HEK293 and Uniprot, mass %s"%KeyMass
+ self.ComparePTMDictionaries(self.PTMDictionaryInspect, self.PTMDictionaryUniprot, LimitMassFlag = KeyMass)
+ print "HEK293 and HORD, mass %s"%KeyMass
+ self.ComparePTMDictionaries(self.PTMDictionaryInspect, self.PTMDictionaryHPRD, LimitMassFlag = KeyMass)
+
+ ################################################
+ # Supplemental table: Inspect PTMs that match Uniprot *or* hprd
+ self.ReportInspectMatchedSites()
+ def ReportInspectMatchedSites(self):
+ """
+ Output a verbose report of all Inspect sites which were also seen in HPRD and/or uniprot.
+ """
+ HitA = 0
+ MissA = 0
+ TotalA = 0
+ MaxPosDiff = 2
+ MaxMassDiff = 3
+ LimitMassFlag = 0
+ ReportedAlreadyDict = {}
+ FilterFlag = 1 #%%%
+ if FilterFlag:
+ OutputFile = open("HEKPTM-InspectAndDB.txt", "wb")
+ else:
+ OutputFile = open("HEKPTM-InspectAndDB.unfiltered.txt", "wb")
+ for (Pos, ModList) in self.PTMDictionaryInspect.items():
+ for (Mass, Name) in ModList:
+ # LimitMassFlag == 1: Skip very small and very large PTMs.
+ if LimitMassFlag == 1:
+ if abs(Mass) < 5 or abs(Mass) >= 250:
+ continue
+ HPRDHitFlag = 0
+ HPRDMass = ""
+ HPRDName = ""
+ UniprotHitFlag = 0
+ UniprotMass = ""
+ UniprotName = ""
+ HPRDHitPos = ""
+ UniprotHitPos = ""
+ AllowedPositions = [Pos]
+ for Diff in range(1, MaxPosDiff + 1):
+ AllowedPositions.append(Pos + Diff)
+ AllowedPositions.append(Pos - Diff)
+ for NearPos in AllowedPositions:
+ List = self.PTMDictionaryHPRD.get(NearPos, [])
+ for (OtherMass, OtherName) in List:
+ if abs(Mass - OtherMass) <= MaxMassDiff:
+ HPRDHitFlag = 1
+ HPRDHitPos = NearPos
+ HPRDMass = OtherMass
+ HPRDName = OtherName
+ List = self.PTMDictionaryUniprot.get(NearPos, [])
+ for (OtherMass, OtherName) in List:
+ if abs(Mass - OtherMass) <= MaxMassDiff:
+ UniprotHitFlag = 1
+ UniprotHitPos = NearPos
+ UniprotMass = OtherMass
+ UniprotName = OtherName
+ if HPRDHitFlag or UniprotHitFlag:
+ ReportMass = HPRDMass
+ HitPos = HPRDHitPos
+ if ReportMass == "":
+ ReportMass = UniprotMass
+ HitPos = UniprotHitPos
+ ReportKey = (HitPos, ReportMass)
+ if ReportedAlreadyDict.has_key(ReportKey):
+ if FilterFlag:
+ continue
+ ReportedAlreadyDict[ReportKey] = 1
+ #(ProteinName, ProteinResidue) = self.GetDBPosInfo(Pos)
+ (ProteinName, ProteinResidue) = self.GetDBPosInfo(HitPos)
+ Residue = self.DB[HitPos]
+ NearAminos = self.DB[HitPos - 10:HitPos + 11]
+ Str = "%s\t%s\t%s\t%s\t"%(Pos, Mass, ProteinName, ProteinResidue)
+ Str += "%s\t%s\t"%(Residue, NearAminos)
+ Str += "%s\t%s\t"%(HPRDMass, HPRDName)
+ #Str += "%s\t%s\t%s\t"%(HPRDHitPos, HPRDMass, HPRDName)
+ Str += "%s\t%s\t"%(UniprotMass, UniprotName)
+ #Str += "%s\t%s\t%s\t"%(UniprotHitPos, UniprotMass, UniprotName)
+ OutputFile.write(Str + "\n")
+ OutputFile.close()
+
+class UXStates:
+ """
+ States for the [U]niprot [X]ml parser. State can change when we START or END a tag.
+ Most of the time we're in the SKIP state.
+ """
+ Skip = 0
+ Sequence = 1
+ Feature = 2
+ Accession = 3
+
+class TabularXMLParser(xml.sax.handler.ContentHandler):
+ """
+ Simple subclass of SAX XML parser: Employs dictionaries to look up the
+ handlers for tag start, tag end, body text. Keeps a current State.
+ """
+ def __init__(self):
+ self.startElement = self.StartElement
+ self.endElement = self.EndElement
+ self.characters = self.HandleCharacters
+ self.State = None
+ if not hasattr(self, "StartHandlers"):
+ self.StartHandlers = {}
+ if not hasattr(self, "EndHandlers"):
+ self.EndHandlers = {}
+ if not hasattr(self, "StringHandlers"):
+ self.StringHandlers = {}
+ xml.sax.handler.ContentHandler.__init__(self)
+ def StartElement(self, Name, Attributes):
+ Handler = self.StartHandlers.get(Name, None)
+ if Handler:
+ apply(Handler, (Attributes,))
+ def EndElement(self, Name):
+ Handler = self.EndHandlers.get(Name, None)
+ if Handler:
+ apply(Handler)
+ def HandleCharacters(self, String):
+ Handler = self.StringHandlers.get(self.State, None)
+ if Handler:
+ apply(Handler, (String,))
+
+
+class PTMXMLParser(TabularXMLParser):
+ """
+ Simple subclass of TabularXMLParser, adding the ability to look up modification
+ sites in self.DB; relies heavily on subclass methods!
+ """
+ def AddPendingPTMs(self):
+ # Add pending PTMs:
+ for (Name, Position) in self.PendingPTMs:
+ #print "Pending PTM:", Name, Position
+ # Get flanking amino acids:
+ StartPos = max(0, Position - 7)
+ EndPos = min(len(self.Sequence), Position + 8)
+ # If we're next to the edge, extend farther in the other direction:
+ Len = EndPos - StartPos
+ if Len < 15:
+ StartPos = max(0, EndPos - 16)
+ Len = EndPos - StartPos
+ if Len < 15:
+ EndPos = min(len(self.Sequence), StartPos + 16)
+ #StartPos = max(0, Position - 14)
+ Aminos = self.Sequence[StartPos:EndPos]
+ PrefixLength = Position - StartPos
+ if len(Aminos) < 10:
+ print "* Warning: Aminos %s...%s from %s not distinct enough!"%(StartPos, EndPos, self.Accession)
+ print "Sequence length is %s, position is %s"%(len(self.Sequence), Position)
+ # Determine the mass:
+ LowerName = Name.lower()
+ if SkipModificationNames.has_key(LowerName):
+ continue
+ Mass = self.ModMasses.get(LowerName, 0)
+ if Mass == 0:
+ # Try removing any parenthetical portions:
+ # Example: n6,n6,n6-trimethyllysine (alternate)
+ ParenPos = LowerName.find("(")
+ if ParenPos != -1:
+ PreParen = LowerName[:ParenPos].strip()
+ if SkipModificationNames.has_key(PreParen):
+ continue
+
+ #print "Try removing parens: '%s' to '%s'"%(LowerName, PreParen)
+ Mass = self.ModMasses.get(PreParen, 0)
+ if Mass == 0:
+ # Try the first bit of the mod, it might have the form "phosphoserine (by ck1)"
+ #print "try first bit: '%s' to '%s'"%(LowerName, LowerName.split()[0])
+ FirstBit = LowerName.split()[0]
+ Mass = self.ModMasses.get(FirstBit, 0)
+ if SkipModificationNames.has_key(FirstBit):
+ continue
+ if Mass == 0:
+ print "* Warning - mass not known for: %s (accession %s)"%(LowerName, self.Accession)
+ if Position - 1 < 0 or Position - 1 >= len(self.Sequence):
+ print "Found on residue %s (ILLEGAL NUMBER)"%Position
+ else:
+ print " Found on residue %s%s"%(self.Sequence[Position - 1], Position)
+ self.UnknownPTMDictionary[LowerName] = self.UnknownPTMDictionary.get(LowerName, 0) + 1
+ else:
+ #print "Adding ptm of size %s at dbpos %s"%(Mass, DBPos)
+ pass
+ # Get database positions:
+ #print "Aminos:", Aminos
+ DBHitList = FindDBLocations(self.DB, Aminos)
+ #print "Peptide %s found in %s positions"%(Aminos, len(DBHitList))
+ for AminosDBPos in DBHitList:
+ DBPos = AminosDBPos + PrefixLength
+ if not self.PTMDictionary.has_key(DBPos):
+ self.PTMDictionary[DBPos] = []
+
+ #print "ModMass %s at position %s, dbpos %s, flanking aminos from %s...%s: %s"%(\
+ # Mass, Position, DBPos, StartPos, EndPos, Aminos)
+ # Avoid adding REDUNDANT records:
+ FoundFlag = 0
+ for (OldMass, OldName) in self.PTMDictionary[DBPos]:
+ if OldMass == Mass:
+ FoundFlag = 1
+ break
+ if not FoundFlag:
+ self.PTMDictionary[DBPos].append((Mass, Name))
+ def ReportUnknownPTMs(self, OutputFileName):
+ SortedList = []
+ for (Name, Count) in self.UnknownPTMDictionary.items():
+ SortedList.append((Count, Name))
+ SortedList.sort()
+ SortedList.reverse()
+ File = open(OutputFileName, "wb")
+ for (Count, Name) in SortedList:
+ File.write("%s\t%s\t\n"%(Name, Count))
+ File.close()
+
+class UniprotXMLParser(PTMXMLParser):
+ """
+ Simple XML parser. Because the start and body and end handlers are handled by various
+ sub-functions, we use a dictionary to map tags to their handlers.
+ Note: Remember that XML parse routines return unicode, hence the calls to str().
+ """
+ def __init__(self, DB):
+ self.DB = DB
+ self.EntryCount = 0
+ self.UnknownPTMDictionary = {}
+ self.PTMDictionary = {}
+ self.StartHandlers = {"entry":self.StartEntry, "sequence": self.StartSequence,
+ "feature":self.StartFeature, "position": self.StartPosition,
+ "accession":self.StartAccession, }
+ self.EndHandlers = {"sequence": self.EndSequence, "feature":self.EndFeature,
+ "entry":self.EndEntry,"accession":self.EndAccession, }
+ self.StringHandlers = {UXStates.Sequence: self.HandleStringSequence,
+ UXStates.Accession: self.HandleStringAccession}
+ PTMXMLParser.__init__(self)
+ def StartAccession(self, Attributes):
+ self.State = UXStates.Accession
+ self.Accession = ""
+ def EndAccession(self):
+ self.State = UXStates.Skip
+ def StartSequence(self, Attributes):
+ self.Sequence = ""
+ self.State = UXStates.Sequence
+ def EndSequence(self):
+ self.State = UXStates.Skip
+ def StartEntry(self, Attributes):
+ """
+ A new top-level <entry> tag for a protein record. As we start the new record, we reset any
+ accumulated info.
+ """
+ self.Sequence = ""
+ self.Accession = ""
+ # PendingPTMs is a list of tuples of the form (Name, SequencePosition).
+ self.PendingPTMs = []
+ def EndEntry(self):
+ self.AddPendingPTMs()
+ self.EntryCount += 1
+ if self.EntryCount % 1000 == 0:
+ print "Handled entry #%d"%self.EntryCount
+ def HandleStringSequence(self, String):
+ "Handle the body of the <sequence> tag"
+ self.Sequence += str(String.strip())
+ def HandleStringAccession(self, String):
+ "Handle the body of the <accession> tag"
+ self.Accession += str(String.strip())
+ def StartFeature(self, Attributes):
+ """
+ Handle a <Feature>, ignoring it unless it's of type "modified residue".
+ """
+ Type = Attributes["type"].lower()
+ if Type != "modified residue":
+ return
+ self.CurrentModification = str(Attributes["description"])
+ self.State = UXStates.Feature
+ def EndFeature(self):
+ self.State = UXStates.Skip
+ def StartPosition(self, Attributes):
+ """
+ Handle tag of the form <position position="123"/>
+ """
+ if self.State == UXStates.Feature:
+ # Subtract 1, to go from 1-based to 0-based numbering:
+ Position = int(Attributes["position"]) - 1
+ # Add a PTM to our pending list:
+ self.PendingPTMs.append((self.CurrentModification, Position))
+ #print "Added PTM:", self.PendingPTMs[-1]
+
+
+class HPRDXStates:
+ Skipping = 0
+ Sequence = 1
+ PTMSite = 2
+
+class HPRDXMLParser(PTMXMLParser):
+ """
+ Parser for HPRD records. Similar to UniprotXMLParser.
+ """
+ def __init__(self, DB):
+ self.DB = DB
+ self.EntryCount = 0
+ self.UnknownPTMDictionary = {}
+ self.PTMDictionary = {}
+ self.State = UXStates.Skip
+ self.StartHandlers = {"protein_sequence":self.StartSequence,
+ "isoform":self.StartIsoform,
+ "protein":self.StartProtein,
+ "modification":self.StartModification,
+ "ptm_site":self.StartPTMSite,}
+ self.EndHandlers = {"isoform":self.EndIsoform,
+ "ptm_site":self.EndPTMSite,
+ "protein_sequence":self.EndSequence}
+ self.StringHandlers = {HPRDXStates.Sequence: self.HandleStringSequence,
+ HPRDXStates.PTMSite: self.HandleStringPTMSite
+ }
+ self.DummyTable = string.maketrans("", "")
+ PTMXMLParser.__init__(self)
+ def StartSequence(self, Attributes):
+ self.Sequence = ""
+ self.State = HPRDXStates.Sequence
+ def EndSequence(self):
+ self.State = HPRDXStates.Skipping
+ self.Sequence = self.Sequence.upper()
+ #print "Obtained sequence of length %s"%len(self.Sequence)
+ def HandleStringSequence(self, String):
+ "Handle the body of the <sequence> tag"
+ try:
+ Block = str(String)
+ except:
+ print "** Warning: unable to convert sequence block to str:"
+ print "%d: '%s'"%(len(String), String)
+ return
+ Block = Block.translate(self.DummyTable, " \r\n\t")
+ #Block = self.StripWhitespace(String).upper()
+ self.Sequence += Block
+ def HandleStringPTMSite(self, String):
+ self.CurrentSite += String
+## def StripWhitespace(self, String):
+## return String.translate(self.DummyTable, " \r\n\t")
+ def StartProtein(self, Attributes):
+ self.Accession = str(Attributes["id"])
+ def StartIsoform(self, Attributes):
+ """
+ Start a protein record. Clear any accumulated data:
+ """
+ self.PendingPTMs = []
+ self.Sequence = ""
+ #self.Accession = ""
+ def StartModification(self, Attributes):
+ self.CurrentModType = str(Attributes["type"])
+ #print "START modification '%s'"%self.CurrentModType
+ def StartPTMSite(self, Attributes):
+ self.State = HPRDXStates.PTMSite
+ self.CurrentSite = ""
+ def EndPTMSite(self):
+ # subtract one, to go from 1-based to 0-based numbering.
+ #print "FINISH ptm_site"
+ Position = int(self.CurrentSite) - 1
+ self.PendingPTMs.append((self.CurrentModType, Position))
+ self.State = HPRDXStates.Skipping
+ def EndIsoform(self):
+ """
+ End a protein record. Save any accumulated modifications:
+ """
+ #print "END ISOFORM: add pending PTMs"
+ self.AddPendingPTMs()
+ self.EntryCount += 1
+ if self.EntryCount % 1000 == 0:
+ print "Handled entry #%d"%self.EntryCount
+
+
+if __name__ == "__main__":
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "(Warning: psyco not found, running non-optimized)"
+ Master = CompareMaster()
+ Master.Main()
diff --git a/ComputeFDR.jar b/ComputeFDR.jar
new file mode 100644
index 0000000..f0bdb24
Binary files /dev/null and b/ComputeFDR.jar differ
diff --git a/ComputeFScore.py b/ComputeFScore.py
new file mode 100644
index 0000000..badd27e
--- /dev/null
+++ b/ComputeFScore.py
@@ -0,0 +1,328 @@
+#Title: ComputeFScore.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2010
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+This script, based on PeptideProphet, computes an F-Score. The F-Score
+for a match is a weighted sum of the length-corrected MQScore and the delta score.
+
+There is no length constraint on peptides: an F-Score is computed for every peptide.
+The F-Score column of each peptide-spectrum match (PSM) is updated, but no p-value or FDR is calculated.
+
+"""
+import os
+import sys
+import random
+import math
+import getopt
+import traceback
+import struct
+import string
+import ResultsParser
+import SelectProteins
+import Learning
+from Utils import *
+Initialize()
+
+class Defaults:
+ MQScoreWeight = 0.3
+ DeltaScoreWeight = 1.5
+ ###########################
+ BlindMQScoreWeight = 0.3
+ BlindDeltaScoreWeight = 1.5
+
+ ###########################
+
+
+BLIND_MOD_PENALTY = 1.0
+MIN_MQSCORE = -10.0
+
+# Parse the scores from at most this many output files.
+MAX_RESULTS_FILES_TO_PARSE = 100
+
+BIN_MULTIPLIER = 10.0
+SQRT2PI = math.sqrt(2 * math.pi)
+
+# Lanczos-series coefficients for a log-gamma approximation (apparently unused in this script):
+Cof = [76.18009172947146, -86.50532032941677,
+ 24.01409824083091, -1.231739572450155,
+ 0.1208650973866179e-2, -0.5395239384952e-5]
+
+class Bag:
+ pass
+
+class FScoreParser(ResultsParser.ResultsParser):
+ def __init__(self):
+
+ self.VerboseFlag = 0
+ self.MQScoreWeight = Defaults.MQScoreWeight
+ self.DeltaScoreWeight = Defaults.DeltaScoreWeight
+ self.BlindFlag = 0
+ self.MaxDeltaScoreGap = -3.5
+ self.SplitByCharge = 0
+ self.Columns = ResultsParser.Columns()
+ ResultsParser.ResultsParser.__init__(self)
+
+ def ReadDeltaScoreDistribution(self, FilePath):
+ """
+ Read delta-scores from a file, to compute the average delta-score.
+ If passed a directory, iterate over all results files in the directory.
+ """
+ #
+ self.AllSpectrumCount2 = 0
+ self.AllSpectrumCount3 = 0
+ self.MeanDeltaScore2 = 0
+ self.MeanDeltaScore3 = 0
+ self.ProcessResultsFiles(FilePath, self.ReadDeltaScoreDistributionFromFile, MAX_RESULTS_FILES_TO_PARSE)
+
+ if self.SplitByCharge == 1:
+ self.MeanDeltaScore2 /= max(1, self.AllSpectrumCount2)
+ self.MeanDeltaScore3 /= max(1, self.AllSpectrumCount3)
+ if self.VerboseFlag:
+ print "Mean delta score ch1..2: %s over %s spectra"%(self.MeanDeltaScore2, self.AllSpectrumCount2)
+ print "Mean delta score ch3: %s over %s spectra"%(self.MeanDeltaScore3, self.AllSpectrumCount3)
+ if not self.MeanDeltaScore2:
+ self.MeanDeltaScore2 = 0.001
+ if not self.MeanDeltaScore3:
+ self.MeanDeltaScore3 = 0.001
+ else:
+ self.MeanDeltaScore = (self.MeanDeltaScore2 + self.MeanDeltaScore3)/(max(1,self.AllSpectrumCount2+self.AllSpectrumCount3))
+ if self.VerboseFlag:
+ print "Mean delta score: %s over %s spectra"%(self.MeanDeltaScore, self.AllSpectrumCount2+self.AllSpectrumCount3)
+
+ def ReadDeltaScoreDistributionFromFile(self, FilePath):
+ "Read delta-scores from a single file, to compute the average delta-score."
+ print "Read delta-score distribution from %s..."%FilePath
+ try:
+ File = open(FilePath, "rb")
+ except:
+ traceback.print_exc()
+ return
+ OldSpectrum = None
+ for FileLine in File.xreadlines():
+ # Skip header lines and blank lines
+ if FileLine[0] == "#":
+ self.Columns.initializeHeaders(FileLine)
+ continue
+ if not FileLine.strip():
+ continue
+ Bits = list(FileLine.split("\t"))
+ if len(Bits) <= self.Columns.getIndex("DeltaScore"):
+ continue
+ try:
+ Charge = int(Bits[self.Columns.getIndex("Charge")])
+ MQScore = float(Bits[self.Columns.getIndex("MQScore")])
+ DeltaScore = float(Bits[self.Columns.getIndex("DeltaScore")])
+ Peptide = GetPeptideFromModdedName(Bits[self.Columns.getIndex("Annotation")])
+ Spectrum = (os.path.basename(Bits[self.Columns.getIndex("SpectrumFile")]), Bits[self.Columns.getIndex("Scan#")])
+ except:
+ traceback.print_exc()
+ print Bits
+ continue # skip lines we can't parse
+ if Spectrum == OldSpectrum:
+ continue
+
+ OldSpectrum = Spectrum
+
+ if DeltaScore < 0:
+ print "## Warning: DeltaScore < 0!", Spectrum, FilePath
+ print DeltaScore
+ print MQScore
+ print Bits
+ raw_input()
+ continue
+ if Charge < 3:
+ self.AllSpectrumCount2 += 1
+ self.MeanDeltaScore2 += DeltaScore
+
+ else:
+ self.AllSpectrumCount3 += 1
+ self.MeanDeltaScore3 += DeltaScore
+ File.close()
+ def WriteMatchesForSpectrum(self, MatchesForSpectrum, OutFile):
+
+ for Match in MatchesForSpectrum:
+ # Note: the peptide length is computed here, but no length filter is applied.
+ Length = len(Match.Peptide.Aminos)
+
+ if self.SplitByCharge:
+ if Match.Charge < 3:
+ CurrMeanDeltaScore = self.MeanDeltaScore2
+ else:
+ CurrMeanDeltaScore = self.MeanDeltaScore3
+
+ else:
+ CurrMeanDeltaScore = self.MeanDeltaScore
+
+ WeightedScore = self.MQScoreWeight * Match.MQScore + self.DeltaScoreWeight * (Match.DeltaScore / CurrMeanDeltaScore)
+ ScoreBin = int(round(WeightedScore * BIN_MULTIPLIER))
+
+ Match.Bits[self.Columns.getIndex("F-Score")] = "%s"%WeightedScore
+
+ OutFile.write(string.join(Match.Bits, "\t"))
+ OutFile.write("\n")
+
+ def WriteFixedScores(self, OutputPath):
+
+ self.WriteScoresPath = OutputPath
+ # Make the output directory, if it doesn't exist already.
+ # Assume: OutputPath is a directory if ReadScoresPath is a directory,
+ # and OutputPath is a file if ReadScoresPath is a file.
+ if os.path.isdir(self.ReadScoresPath):
+ DirName = OutputPath
+ else:
+ DirName = os.path.split(OutputPath)[0]
+ try:
+ os.makedirs(DirName)
+ except:
+ pass
+ self.ProcessResultsFiles(self.ReadScoresPath, self.WriteFixedScoresFile)
+
+
+ def WriteFixedScoresFile(self, Path):
+ if os.path.isdir(self.ReadScoresPath):
+ OutputPath = os.path.join(self.WriteScoresPath, os.path.split(Path)[1])
+ else:
+ OutputPath = self.WriteScoresPath
+
+ try:
+ InFile = open(Path, "rb")
+ OutFile = open(OutputPath, "wb")
+ LineCount = 0
+
+ OldSpectrum = None
+ MatchesForSpectrum = []
+ for FileLine in InFile:
+ # Lines starting with # are comments (e.g. header line), and are written out as-is:
+ if FileLine[0] == "#":
+ self.Columns.initializeHeaders(FileLine)
+ OutFile.write(FileLine)
+ continue
+ Bits = list(FileLine.strip().split("\t"))
+ Match = Bag()
+ try:
+ Match.Bits = Bits
+ Match.Charge = int(Bits[self.Columns.getIndex("Charge")])
+ Match.MQScore = float(Bits[self.Columns.getIndex("MQScore")])
+ Match.DeltaScore = float(Bits[self.Columns.getIndex("DeltaScore")])
+ Match.Peptide = GetPeptideFromModdedName(Bits[self.Columns.getIndex("Annotation")])
+ Match.ProteinName = Bits[self.Columns.getIndex("Protein")]
+ except:
+ continue
+ LineCount += 1
+ Spectrum = (Bits[0], Bits[1])
+ if Spectrum != OldSpectrum:
+ self.WriteMatchesForSpectrum(MatchesForSpectrum, OutFile)
+ MatchesForSpectrum = []
+ OldSpectrum = Spectrum
+ MatchesForSpectrum.append(Match)
+ # Finish the last spectrum:
+ self.WriteMatchesForSpectrum(MatchesForSpectrum, OutFile)
+ InFile.close()
+ OutFile.close()
+
+
+ except:
+ traceback.print_exc()
+ print "* Error filtering annotations from '%s' to '%s'"%(Path, OutputPath)
+
+ def Run(self):
+ self.ReadDeltaScoreDistribution(self.ReadScoresPath)
+ self.WriteFixedScores(self.WriteScoresPath)
+
+ def ParseCommandLine(self, Arguments):
+ (Options, Args) = getopt.getopt(Arguments, "r:w:cvb")
+ OptionsSeen = {}
+
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-b":
+ self.BlindFlag = 1
+ self.MQScoreWeight = Defaults.BlindMQScoreWeight
+ self.DeltaScoreWeight = Defaults.BlindDeltaScoreWeight
+
+ elif Option == "-r":
+ self.ReadScoresPath = Value
+ elif Option == "-w":
+ self.WriteScoresPath = Value
+ elif Option == "-c":
+ self.SplitByCharge = 1
+ elif Option == "-v":
+ self.VerboseFlag = 1
+ else:
+ print "** Unknown option:", Option, Value
+ # Check validity of options:
+ if not OptionsSeen.has_key("-r") or not OptionsSeen.has_key("-w"):
+ print "* Error: Missing arguments; -r and -w are both required."
+ return 0
+ # No major problems - return TRUE for success.
+ return 1
+
+UsageInfo = """
+ComputeFScore.py - Compute FScore based on match quality score (MQScore) and delta score.
+Write out an updated results file.
+
+Required Parameters:
+ -r [FILENAME] Read results from filename (and compute the mean delta-score
+ from these results). If the option value is a directory, we'll read
+ all the results-files from the directory.
+ -w [FILENAME] Write re-scored results to a file.
+
+Optional Parameters:
+ -c Split by charge (compute the F-Score separately for charges 1-2 and for charge 3).
+ -b Results are from a blind search (not recommended)
+
+Internal use only:
+ -v Verbose output (for debugging)
+
+
+Example:
+ ComputeFScore.py -r ShewanellaResults -w ShewanellaFiltered -c
+"""
+
+def Main(Parser = None):
+ global MAX_RESULTS_FILES_TO_PARSE
+
+ if not Parser:
+ Parser = FScoreParser()
+ Result = Parser.ParseCommandLine(sys.argv[1:])
+ if not Result:
+ print UsageInfo
+ return
+ Parser.Run()
+
+if __name__ == "__main__":
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "psyco not found - running without optimization"
+ #TestMain()
+ Main()
diff --git a/ComputePTMFeatures.py b/ComputePTMFeatures.py
new file mode 100644
index 0000000..2e678e1
--- /dev/null
+++ b/ComputePTMFeatures.py
@@ -0,0 +1,943 @@
+#Title: ComputePTMFeatures.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+Plan:
+Output a large collection of features for each post-translational modification accepted on a
+search of a part-bogus database. All modifications on the bogus proteins are incorrect.
+We assume that roughly an equal number of modifications on the non-bogus proteins are also incorrect. Let's compute
+a variety of features for the PTMs observed.
+
+Input:
+A collection of annotated spectra, output by SelectSites.py
+Output:
+A file listing all the observed modification sites, with various features computed.
+
+Then, we train a model to distinguish between good (correct DB) and bad (incorrect DB)
+modifications. Model types: LDA, logistic regression, SVM, etc.
+
+(Another possible experiment: Search unmodified spectra against a mutated database,
+and judge as correct precisely those modifications which "undo" the mutations.)
+"""
+
+import os
+import sys
+import struct
+import traceback
+import getopt
+import MSSpectrum
+import PyInspect
+import random
+import shutil
+import time
+import math
+import cPickle
+import BasicStats
+import ResultsParser
+import BuildConsensusSpectrum
+import SpectralSimilarity
+import StripPTM
+random.seed(1)
+from Utils import *
+from TrainPTMFeatures import FormatBits
+Initialize()
+
+AMINO_ACIDS = "ACDEFGHIKLMNOPQRSTUVWY" # O and U are included, for now.
+INVALID_MASS = 99999
+
+# Retain at most this many spectra for an unmodified peptide:
+MAX_MODLESS_CLUSTER_SIZE = 100
+
+# For running the Python profiler:
+PROFILE_FLAG = 0
+
+class PeptideSpecies:
+ """
+ Peptides[(Annotation, Charge)] -> PeptideSpecies instance
+ The PeptideSpecies remembers a list of spectra, the modification position, the modification mass.
+ """
+ InstanceCount = 0
+ def __init__(self):
+ self.HitCount = 0
+ self.ModifiedFlag = 0
+ self.ModMass = 0
+ self.DBPos = 0
+ self.ModDBPos = 0
+ self.Spectra = []
+ self.Peptide = None
+ self.SpectrumCount = 0
+ PeptideSpecies.InstanceCount += 1
+ def __del__(self):
+ if PeptideSpecies:
+ PeptideSpecies.InstanceCount -= 1
+ def __str__(self):
+ return self.Annotation
+
+class SpectrumInfoClass(ResultsParser.ResultsParser):
+ """
+ Information about a single scan. We remember only the info we'll need later.
+ """
+ InstanceCount = 0
+ def __init__(self, Bits, Trainer):
+ ResultsParser.ResultsParser.__init__(self)
+
+ self.FileNameIndex = Trainer.RememberString(Trainer.CachedFilePaths, Bits[0])
+ self.MQScore = float(Bits[Trainer.Columns.getIndex("MQScore")])
+ self.DeltaScore = float(Bits[Trainer.Columns.getIndex("DeltaScore")])
+ self.ByteOffset = int(Bits[Trainer.Columns.getIndex("SpecFilePos")])
+ self.ScanNumber = int(Bits[Trainer.Columns.getIndex("Scan#")])
+ SpectrumInfoClass.InstanceCount += 1
+ def __cmp__(self, Other):
+ """
+ Sort from BEST to WORST match.
+ """
+ if self.MQScore > Other.MQScore:
+ return -1
+ if self.MQScore < Other.MQScore:
+ return 1
+ return 0
+ def __del__(self):
+ if SpectrumInfoClass:
+ SpectrumInfoClass.InstanceCount -= 1
+
+class PTMFeatureComputer(ResultsParser.ResultsParser, ResultsParser.SpectrumOracleMixin):
+ def __init__(self):
+ self.PValueCutoff = 0.1 # default
+ self.ResultsFileName = None
+ self.DBPath = None
+ self.OutputDir = "PTM"
+ self.OutputPath = os.path.join(self.OutputDir, "PTMFeatures.txt")
+ self.ConsensusClusterDir = None
+ self.ConsensusSpectrumDir = None
+ # Peptides maps each peptide species we've observed to its accumulated data.
+ # The keys have the form (Annotation, Charge) and the values are
+ # PeptideSpecies instances, each holding a list of SpectrumInfo objects.
+ self.Peptides = {}
+ self.PTMs = {} # keys of the form (DBPos, Mass)
+ self.CoverageThreshold = 2 # at least this many spectra to consider a residue 'covered'.
+ self.QuickParseFlag = 0 # if true, then parse only the first n lines
+ self.PoolFlag = 0
+ self.ModelType = None
+ self.SisterProteins = {} # protein index -> sister protein's index
+ self.MZXMLOracle = {}
+ self.ModelTrainFilePath = "PTMFeatures.All.txt"
+ self.ModelTestFilePath = None
+ # Dictionary of unmodified peptides, for computing the coverage level:
+ self.UnmodifiedPeptides = {}
+ self.FeatureSelectionFlag = None
+ self.CachedFilePaths = []
+ self.CachedFixedFilePaths = []
+ self.StartOutputDBPos = 0
+ self.RequiredFileNameChunk = None
+ self.Columns = ResultsParser.Columns()
+ ResultsParser.ResultsParser.__init__(self)
+ ResultsParser.SpectrumOracleMixin.__init__(self)
+
+ def RememberString(self, StringList, NewString):
+ """
+ Return the index of NewString within StringList, adding to the list if necessary.
+ We keep a list of mzxml file names and store indexes into the list, to avoid
+ the memory hit required to store each occurrence of the name.
+ """
+ try:
+ Index = StringList.index(NewString)
+ return Index
+ except:
+ StringList.append(NewString)
+ return len(StringList) - 1
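+ # Editor's illustrative usage (not upstream code): repeated names share one index,
+ # so storing the index per spectrum is much cheaper than storing the string, e.g.
+ #     Paths = []
+ #     self.RememberString(Paths, "run1.mzXML")   # returns 0, Paths == ["run1.mzXML"]
+ #     self.RememberString(Paths, "run1.mzXML")   # returns 0 again, Paths unchanged
+ #     self.RememberString(Paths, "run2.mzXML")   # returns 1
+ # ("run1.mzXML" / "run2.mzXML" are placeholder file names.)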
+ def LoadDB(self):
+ """
+ Load the database searched. For future reference, we want the protein names as well.
+ """
+ # Populate self.DB with the contents of the .trie file
+ File = open(self.DBPath, "rb")
+ self.DB = File.read()
+ File.close()
+ # Populate self.ProteinNames and self.ProteinPositions by parsing the index file:
+ self.ProteinNames = []
+ self.ProteinPositions = []
+ IndexPath = os.path.splitext(self.DBPath)[0] + ".index"
+ File = open(IndexPath, "rb")
+ BlockSize = struct.calcsize("<qi80s")
+ while 1:
+ Block = File.read(BlockSize)
+ if not Block:
+ break
+ Tuple = struct.unpack("<qi80s", Block)
+ Name = Tuple[-1]
+ NullPos = Name.find("\0")
+ if NullPos != -1:
+ Name = Name[:NullPos]
+ self.ProteinNames.append(Name)
+ self.ProteinPositions.append(Tuple[1])
+ File.close()
+ # Initialize our coverage arrays:
+ self.Coverage = [0] * len(self.DB)
+ self.ModCoverage = [0] * len(self.DB)
+ self.PeptideCoverage = [0] * len(self.DB)
+ self.ModPeptideCoverage = [0] * len(self.DB)
+ # Find sister-proteins.
+ # A shuffled protein name should be the standard protein's name
+ # with the characters "XXX" prepended. (If the standard protein name
+ # is very long, the last characters may "slide off the edge")
+ for IndexA in range(len(self.ProteinNames)):
+ Name = self.ProteinNames[IndexA]
+ SisterName = None
+ if Name[:3] == "XXX":
+ SisterName = Name[3:]
+ if SisterName:
+ for IndexB in range(len(self.ProteinNames)):
+ Name = self.ProteinNames[IndexB][:77]
+ if Name == SisterName:
+ self.SisterProteins[IndexA] = IndexB
+ self.SisterProteins[IndexB] = IndexA
+ def ComputeProteinCoverage(self):
+ """
+ Compute residue-level coverage, and compute what fraction of each
+ protein is covered.
+ """
+ for Species in self.Peptides.values():
+ DBPos = Species.DBPos
+ if Species.ModifiedFlag:
+ for Pos in range(DBPos, DBPos + len(Species.Peptide.Aminos)):
+ self.ModCoverage[Pos] += Species.SpectrumCount # count modified spectra
+ self.ModPeptideCoverage[Pos] += 1 # count distinct peptide species
+ else:
+ for Pos in range(DBPos, DBPos + len(Species.Peptide.Aminos)):
+ self.Coverage[Pos] += Species.SpectrumCount # count unmodified spectra
+ self.PeptideCoverage[Pos] += 1 # count distinct unmodified peptide species
+ #########################################################
+ # Compute percentage of each protein that's covered:
+ self.ProteinCoverageLevels = []
+ for ProteinIndex in range(len(self.ProteinPositions)):
+ StartPos = self.ProteinPositions[ProteinIndex]
+ if ProteinIndex < len(self.ProteinPositions) - 1:
+ EndPos = self.ProteinPositions[ProteinIndex + 1]
+ else:
+ EndPos = len(self.DB)
+ #print "Protein %s (%s) from %s-%s"%(ProteinIndex, self.ProteinNames[ProteinIndex], StartPos, EndPos)
+ CoverFlags = 0
+ for Pos in range(StartPos, EndPos):
+ if self.Coverage[Pos] >= self.CoverageThreshold:
+ CoverFlags += 1
+ ProteinLength = EndPos - StartPos
+ #print " -> Coverage %s/%s = %s"%(CoverFlags, ProteinLength, CoverFlags / float(ProteinLength))
+ self.ProteinCoverageLevels.append(CoverFlags / float(ProteinLength))
+ # SAVE protein coverage levels:
+ CoveragePath = os.path.join(self.OutputDir, "Coverage.dat")
+ CoverageFile = open(CoveragePath, "wb")
+ for DBPos in range(len(self.DB)):
+ Str = struct.pack("<II", self.Coverage[DBPos], self.ModCoverage[DBPos])
+ CoverageFile.write(Str)
+ CoverageFile.close()
+ # Boost protein coverage levels based upon sister proteins:
+ for Index in range(len(self.ProteinNames)):
+ SisterIndex = self.SisterProteins.get(Index, None)
+ if SisterIndex == None:
+ continue
+ self.ProteinCoverageLevels[Index] = max(self.ProteinCoverageLevels[Index], self.ProteinCoverageLevels[SisterIndex])
+ #print "%s and %s are sisters, with coverage %s"%(self.ProteinNames[Index], self.ProteinNames[SisterIndex], self.ProteinCoverageLevels[Index])
+ def FixPeptideSpecies(self):
+ """
+ Iterate over all the peptide species we observed. Strip "obviously unnecessary" PTMs.
+ """
+ Keys = self.Peptides.keys()
+ for Key in Keys:
+ Annotation = Key[0]
+ Species = self.Peptides[Key]
+ Result = StripPTM.StripNeedlessModifications(self.DB, Annotation)
+ if not Result:
+ continue
+ (DBPos, FixedAnnotation) = Result
+ # If the annotation wasn't changed, then continue.
+ if FixedAnnotation == Annotation:
+ continue
+ Species.Peptide = GetPeptideFromModdedName(FixedAnnotation)
+ Species.Annotation = FixedAnnotation
+ Species.DBPos = DBPos
+ ModKeys = Species.Peptide.Modifications.keys()
+ if len(ModKeys):
+ ModIndex = ModKeys[0]
+ Species.ModifiedFlag = 1
+ Species.ModMass = int(round(Species.Peptide.Modifications[ModIndex][0].Mass))
+ Species.ModDBPos = DBPos + ModIndex
+ else:
+ Species.ModifiedFlag = 0
+ Species.ModMass = 0
+ Species.ModDBPos = None
+ del self.Peptides[Key]
+ # Either merge into the existing species with this fixed annotation,
+ # or move into the empty pigeonhole:
+ FixedKey = (FixedAnnotation, Key[1])
+ OldSpecies = self.Peptides.get(FixedKey, None)
+ if OldSpecies:
+ OldSpecies.Spectra.extend(Species.Spectra)
+ OldSpecies.SpectrumCount += Species.SpectrumCount
+ if len(OldSpecies.Spectra) > MAX_MODLESS_CLUSTER_SIZE:
+ OldSpecies.Spectra.sort()
+ OldSpecies.Spectra = OldSpecies.Spectra[:MAX_MODLESS_CLUSTER_SIZE]
+ else:
+ self.Peptides[FixedKey] = Species
+ if len(Species.Spectra) > MAX_MODLESS_CLUSTER_SIZE:
+ Species.Spectra.sort()
+ Species.Spectra = Species.Spectra[:MAX_MODLESS_CLUSTER_SIZE]
+ def ParsePTMsFromResultsFile(self, FilePath):
+ """
+ Callback for parsing one Inspect results-file. Our job here
+ is to populate self.BestSpectra, and self.PTMs.
+ Note: It's POSSIBLE that we'll spot some modified annotations
+ which can be "trivially fixed" to produce unmodified annotations.
+ Examples: T.T+101LAPTTVPITSAK.A, Y.E+163NPNFTGK.K
+ Because of this, FixPeptideSpecies() later strips needless modifications
+ and merges any species whose fixed-up annotations collide.
+ """
+ if self.RequiredFileNameChunk:
+ Pos = FilePath.find(self.RequiredFileNameChunk)
+ if Pos == -1:
+ return
+ if os.path.isdir(FilePath):
+ print "NOTE: Skipping results sub-directory '%s'"%FilePath
+ return
+ try:
+ File = open(FilePath, "rb")
+ except:
+ print "** Unable to open results file '%s'"%FilePath
+ return
+ LineNumber = 0
+ OldSpectrum = None
+ for FileLine in File.xreadlines():
+ LineNumber += 1
+ if LineNumber % 1000 == 0:
+ print " Line %s..."%LineNumber
+ if self.QuickParseFlag:
+ break
+ if FileLine[0] == "#":
+ self.Columns.initializeHeaders(FileLine)
+ continue
+ Bits = FileLine.strip().split("\t")
+ if len(Bits) < 15:
+ continue # not valid!
+ Spectrum = (Bits[0], Bits[1])
+ if Spectrum == OldSpectrum:
+ continue
+ OldSpectrum = Spectrum
+ PValue = float(Bits[self.Columns.getIndex("InspectFDR")])
+ if PValue > self.PValueCutoff:
+ continue
+ Annotation = Bits[self.Columns.getIndex("Annotation")]
+ Charge = int(Bits[self.Columns.getIndex("Charge")])
+ AnnotationKey = (Annotation, Charge)
+ ##############################################################
+ # If we've never seen this annotation before, then create a PeptideSpecies object
+ # and record it in self.Peptides
+ Species = self.Peptides.get(AnnotationKey, None)
+ if not Species:
+ Species = PeptideSpecies()
+ Species.Peptide = GetPeptideFromModdedName(Annotation)
+ Mods = []
+ for (Index, List) in Species.Peptide.Modifications.items():
+ for Mod in List:
+ Mods.append((Index, Mod))
+ if len(Mods):
+ Species.ModifiedFlag = 1
+ Species.ModMass = int(Mods[0][1].Mass)
+ Species.ModAA = Species.Peptide.Aminos[Mods[0][0]]
+ else:
+ Species.ModifiedFlag = 0
+ Species.ProteinName = Bits[self.Columns.getIndex("Protein")]
+ self.Peptides[AnnotationKey] = Species
+ # Get the database position of the peptide:
+ Species.DBPos = self.DB.find(Species.Peptide.Aminos)
+ if len(Species.Peptide.Modifications.keys()):
+ ModIndex = Species.Peptide.Modifications.keys()[0]
+ Species.ModDBPos = Species.DBPos + ModIndex
+ else:
+ Species.ModDBPos = None
+ # Get the residue-number of the peptide:
+ StarPos = self.DB.rfind("*", 0, Species.DBPos)
+ if StarPos == -1:
+ Species.ResidueNumber = Species.DBPos
+ else:
+ Species.ResidueNumber = Species.DBPos - StarPos
+ Species.Annotation = Annotation
+ Species.Charge = Charge
+ if Species.DBPos == -1:
+ print "* skipping unknown peptide: %s"%Annotation
+ del self.Peptides[AnnotationKey] # remove the Species that was just created!
+ continue
+ MQScore = float(Bits[self.Columns.getIndex("MQScore")])
+ self.AnnotationCount += 1
+ ##############################################################
+ # Populate Species.Spectra:
+ try:
+ Info = SpectrumInfoClass(Bits, self)
+ except:
+ print "** Error: Couldn't parse spectrum info from line %s of file %s"%(LineNumber, FilePath)
+ traceback.print_exc()
+ continue
+ Species.Spectra.append(Info)
+ Species.SpectrumCount += 1
+ if not Species.ModifiedFlag:
+ if len(Species.Spectra) > MAX_MODLESS_CLUSTER_SIZE:
+ Species.Spectra.sort()
+ Species.Spectra = Species.Spectra[:MAX_MODLESS_CLUSTER_SIZE]
+ else:
+ pass
+ File.close()
+ def WipeDir(self, Dir):
+ try:
+ shutil.rmtree(Dir)
+ except:
+ pass
+ def ComputeFeaturesMain(self):
+ """
+ Main method:
+ - Load the database searched
+ - Iterate over the results-file, to get a list of PTMs
+ - Iterate over the PTMs, and write out features for each one.
+ """
+ self.ConsensusClusterDir = os.path.join(self.OutputDir, "Clusters")
+ self.ClusterScanListDir = os.path.join(self.OutputDir, "ClusterMembers")
+ self.ConsensusSpectrumDir = os.path.join(self.OutputDir, "Spectra")
+ if not self.StartOutputDBPos:
+ # Make sure necessary directories exist, and clean up any OLD output:
+ print "Prepare cluster directories..."
+ self.WipeDir(self.ConsensusClusterDir)
+ self.WipeDir(self.ConsensusSpectrumDir)
+ self.WipeDir(self.ClusterScanListDir)
+ MakeDirectory(self.ConsensusClusterDir)
+ MakeDirectory(self.ConsensusSpectrumDir)
+ MakeDirectory(self.ClusterScanListDir)
+ for AA in AMINO_ACIDS:
+ PathA = os.path.join(self.ConsensusClusterDir, AA)
+ PathB = os.path.join(self.ConsensusSpectrumDir, AA)
+ PathC = os.path.join(self.ClusterScanListDir, AA)
+ for Path in (PathA, PathB, PathC):
+ MakeDirectory(Path)
+ else:
+ print "CONTINUING ComputePTMFeatures from DBPosition %s"%self.StartOutputDBPos
+ print "Load database..."
+ self.LoadDB()
+ print "Parse annotations..."
+ self.AnnotationCount = 0
+ self.PTMAnnotationCount = 0
+ self.BestModlessHits = {}
+ self.ProcessResultsFiles(self.ResultsFileName, self.ParsePTMsFromResultsFile)
+ # Fix annotations:
+ self.FixPeptideSpecies()
+ # Fix file paths:
+ for FilePath in self.CachedFilePaths:
+ FixedPath = self.FixSpectrumPath(FilePath)
+ self.CachedFixedFilePaths.append(FixedPath)
+ self.PairModifiedUnmodifiedPeptides()
+ print "Produce CONSENSUS SPECTRA for modified and unmodified peptides..."
+ StartTime = time.clock()
+ self.ProduceConsensusSpectra()
+ EndTime = time.clock()
+ print "Elapsed time: %s"%(EndTime - StartTime)
+ print "Compute protein coverage..."
+ self.ComputeProteinCoverage()
+ print "Count spectra (and sites) by PTM type..."
+ self.ComputeTotalSpectraForModType()
+ print "Generate non-redundant PTM list..."
+ self.ListDistinctPTMs()
+ print "Compute features and output PTM info..."
+ self.ComputeFeaturesAllPTMs()
+ def PairModifiedUnmodifiedPeptides(self):
+ for Species in self.Peptides.values():
+ if not Species.ModifiedFlag:
+ continue
+ ModlessAnnotation = "%s.%s.%s"%(Species.Peptide.Prefix, Species.Peptide.Aminos, Species.Peptide.Suffix)
+ ModlessKey = (ModlessAnnotation, Species.Charge)
+ ModlessSpecies = self.Peptides.get(ModlessKey, None)
+ Species.Modless = ModlessSpecies
+ def ListDistinctPTMs(self):
+ """
+ Populate self.PTMs; keys are (DBPos, Mass) and values are simple objects
+ with lists of peptide species.
+ """
+ for Species in self.Peptides.values():
+ if not Species.ModifiedFlag:
+ continue
+ Index = Species.Peptide.Modifications.keys()[0]
+ ModifiedPos = Species.DBPos + Index
+ Key = (ModifiedPos, Species.ModMass)
+ if not self.PTMs.has_key(Key):
+ PTM = Bag()
+ PTM.SpeciesList = []
+ self.PTMs[Key] = PTM
+ ModIndex = Species.Peptide.Modifications.keys()[0]
+ PTM.DBPos = Species.ModDBPos
+ PTM.SpeciesList.append(Species)
+ Species.PTM = PTM
+ def ComputeTotalSpectraForModType(self):
+ """
+ Populate a dictionary of the form (AA, Mass) -> SpectrumCount. If a modification is seen
+ at multiple sites, it is more likely to be valid.
+ """
+ self.ModTypeSpectrumCount = {}
+ self.ModTypeSiteCount = {}
+ for Species in self.Peptides.values():
+ if Species.ModifiedFlag:
+ ModTypeKey = (Species.ModAA, Species.ModMass)
+ self.ModTypeSpectrumCount[ModTypeKey] = self.ModTypeSpectrumCount.get(ModTypeKey, 0) + Species.SpectrumCount
+ self.ModTypeSiteCount[ModTypeKey] = self.ModTypeSiteCount.get(ModTypeKey, 0) + 1
+ pass
+ def OutputPTMInfoHeader(self):
+ """
+ Output column headers, plus some general-purpose information such as the number
+ of spectra parsed and the database size.
+ """
+ Header = "#Group\tDBPosition\tMass\tAminoAcid\tProtein\tResidueNumber\t"
+ Header += "Peptide\tCharge\tValidProteinFlag\tFacultativeFlag\tBestSpectrum\t"
+ Header += "BestModlessSpectrum\tBestModlessMQScore\tBigDBAnn\tBigDBScore\tSpectra\tModlessSpectra\tBestMQScore\t"
+ Header += "BestDeltaScore\tPeptideCount\tConsensusMQScore\tPeptideLength\tCutScoreTotal\t"
+ Header += "MedianCutScore\tYPresent\tBPresent\tBYIntensity\tNTT\tModdedFraction\tProteinCoverage\t"
+ Header += "SpectraThisModType\tSitesThisModType\tUnmodifiedPeptideCount\tDot0.5\tShared01\tShared11\t"
+ Header += "Correlation\tLogSpectrumCount\tLogPeptideLength\tLogSpecThisType\tLogSitesThisType\t"
+ Header += "DeltaVsBigDB\tModelScore\tModelPValue\tSitePValue\tKnownModType\tKnownModAnnotation\t"
+ Header += "KnownModScore\tKnownModSitePValue\t"
+ self.OutputFile.write(Header + "\n")
+ # Two more header lines, for feature-numbers and column-numbers:
+ Header = "#0\t1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t41\t42\t43\t44\t45\t46\t47\t48\t49\t"
+ self.OutputFile.write(Header + "\n")
+ Header = "#Feature\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t0\t1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t"
+ self.OutputFile.write(Header + "\n")
+ ProteinRecordCount = self.DB.count("*")
+ DBResidueSize = len(self.DB) - ProteinRecordCount
+ self.OutputFile.write("#DatabaseSize\t%s\t\n"%DBResidueSize)
+ self.OutputFile.write("#AnnotationCount\t%s\t\n"%self.AnnotationCount)
+ SiteCount = len(self.PTMs.values())
+ self.OutputFile.write("#SiteCount\t%s\t\n"%SiteCount)
+ def ProduceConsensusSpectra(self):
+ """
+ We adopt a brute-force strategy: Output a consensus spectrum for each
+ modified peptide species. And, if the equivalent unmodified peptide
+ was observed, then output a consensus spectrum for the unmodified
+ peptide, too.
+ We write a consensus CLUSTER of each modified peptide species.
+ Later on in processing, we may try MERGING two of these clusters
+ (e.g. EAM+16APK, EAMA+16PK)
+ """
+ ClustersBuilt = {} # keep track of clusters that have ALREADY been built
+ Keys = self.Peptides.keys()
+ for PeptideIndex in range(len(Keys)):
+ if (PeptideIndex % 100 == 0):
+ print "For peptide %s/%s..."%(PeptideIndex, len(Keys))
+ AnnotationKey = Keys[PeptideIndex]
+ (Annotation, Charge) = AnnotationKey
+ Species = self.Peptides[AnnotationKey]
+ if not Species.ModifiedFlag:
+ continue
+ Species.ClusterPath = os.path.join(self.ConsensusClusterDir, Annotation[2], "%s.%s.cls"%(Annotation.replace("*", "-"), Charge))
+ Species.ConsensusPath = os.path.join(self.ConsensusSpectrumDir, Annotation[2], "%s.%s.dta"%(Annotation.replace("*", "-"), Charge))
+ ClusterContentPath = os.path.join(self.ClusterScanListDir, Annotation[2], "%s.%s.txt"%(Annotation.replace("*", "-"), Charge))
+ ClusterContentFile = open(ClusterContentPath, "wb")
+ #print "Creating consensus file %s"%(Species.ConsensusPath)
+ #raw_input()
+ Builder = BuildConsensusSpectrum.ConsensusBuilder(Species.Charge)
+ MeanMQ = 0
+ for Info in Species.Spectra:
+ MeanMQ += Info.MQScore
+ MeanMQ /= float(len(Species.Spectra))
+ ValidSpectra = 0
+ for Info in Species.Spectra:
+ # Omit from the consensus any spectra with very poor scores:
+ if Info.MQScore < MeanMQ - 3.0:
+ continue
+ SpectrumFilePath = self.CachedFixedFilePaths[Info.FileNameIndex]
+ # Keep track of where these scans came from:
+ ClusterContentFile.write("%s\t%s\t\n"%(SpectrumFilePath, Info.ByteOffset))
+ Spectrum = MSSpectrum.SpectrumClass()
+ SpectrumFile = open(SpectrumFilePath, "rb")
+ SpectrumFile.seek(Info.ByteOffset)
+ Spectrum.ReadPeaksFromFile(SpectrumFile, SpectrumFilePath)
+ if not Spectrum.PrecursorMZ:
+ print "* Error: Unable to read spectrum from '%s:%s'"%(SpectrumFilePath, Info.ByteOffset)
+ continue
+ ValidSpectra += 1
+ Spectrum.SetCharge(Charge)
+ SpectrumFile.close()
+ Builder.AddSpectrum(Spectrum)
+ # Special (and easy) case: If we only saw one spectrum, then write it
+ # out without changing it!
+ if len(Species.Spectra) == 1:
+ Spectrum.WritePeaks(Species.ConsensusPath)
+ # Write the modded cluster to disk, since we may try to augment it later:
+ Builder.PickleCluster(Species.ClusterPath)
+ if len(Species.Spectra) > 1:
+ Spectrum = Builder.ProduceConsensusSpectrum()
+ Spectrum.WritePeaks(Species.ConsensusPath)
+ ClusterContentFile.close()
+ # If we have unmodified peptides for this species, build their cluster:
+ if Species.Modless:
+ Species.Modless.ConsensusPath = os.path.join(self.ConsensusSpectrumDir, Species.Modless.Annotation[2], "%s.%s.dta"%(Species.Modless.Annotation.replace("*", "-"), Charge))
+ Species.Modless.ClusterPath = os.path.join(self.ConsensusClusterDir, Species.Modless.Annotation[2], "%s.%s.cls"%(Species.Modless.Annotation.replace("*", "-"), Charge))
+ ModlessKey = (Species.Modless.Annotation, Species.Modless.Charge)
+ if ClustersBuilt.has_key(ModlessKey):
+ pass
+ else:
+ ModlessMeanMQ = 0
+ for Info in Species.Modless.Spectra:
+ ModlessMeanMQ += Info.MQScore
+ ModlessMeanMQ /= float(len(Species.Modless.Spectra))
+ Builder = BuildConsensusSpectrum.ConsensusBuilder(Species.Charge)
+ for Info in Species.Modless.Spectra:
+ # Omit from the consensus any spectra with very poor scores:
+ if Info.MQScore < ModlessMeanMQ - 3.0:
+ continue
+ SpectrumFilePath = self.CachedFixedFilePaths[Info.FileNameIndex]
+ Spectrum = MSSpectrum.SpectrumClass()
+ SpectrumFile = open(SpectrumFilePath, "rb")
+ SpectrumFile.seek(Info.ByteOffset)
+ Spectrum.ReadPeaksFromFile(SpectrumFile, SpectrumFilePath)
+ Spectrum.SetCharge(Charge)
+ SpectrumFile.close()
+ Builder.AddSpectrum(Spectrum)
+ Spectrum = Builder.ProduceConsensusSpectrum()
+ Spectrum.WritePeaks(Species.Modless.ConsensusPath)
+ Builder.PickleCluster(Species.Modless.ClusterPath)
+ ClustersBuilt[ModlessKey] = 1
+ def ComputeFeaturesAllPTMs(self):
+ """
+ Compute, and output, features for each modification site.
+ """
+ self.OutputFile = open(self.OutputPath, "wb")
+ self.OutputPTMInfoHeader()
+ # Use self.ConsensusCreatedFlags to flag which unmodified peptides we have
+ # already generated consensus spectra for, so that we don't do the same one twice and waste time:
+ self.ConsensusCreatedFlags = {}
+ # Order the peptides by (ModDBPos, Annotation, Charge). It's important to order things
+ # in this way so that, when we combine the *large* output files for HEK293, we can keep
+ # consistent 'cursors' moving through each of our input files.
+ print "Sorting %s peptides..."%len(self.Peptides.values())
+ SortedKeys = []
+ for Peptide in self.Peptides.values():
+ Key = (Peptide.ModDBPos, Peptide.Annotation, Peptide.Charge)
+ SortedKeys.append(Key)
+ SortedKeys.sort()
+ for KeyIndex in range(len(SortedKeys)):
+ (ModDBPos, Annotation, Charge) = SortedKeys[KeyIndex]
+ Key = (Annotation, Charge)
+ Species = self.Peptides[Key]
+ if not Species.ModifiedFlag:
+ continue
+ if Species.DBPos < self.StartOutputDBPos:
+ continue
+ print "(%s/%s) PTM: %+d on db residue %d"%(KeyIndex, len(SortedKeys), Species.ModMass, Species.DBPos + Species.Peptide.Modifications.keys()[0])
+ sys.stdout.flush()
+ try:
+ Features = self.ComputePTMFeatures(Species)
+ except:
+ traceback.print_exc()
+ print "** Error: Unable to compute PTM features for %s"%Species
+ continue
+ Str = "%s\t"%self.OutputPath
+ Str += "%s\t%+d\t%s\t"%(Species.ModDBPos, Species.ModMass, Species.ModAA)
+ Str += "%s\t"%Species.ProteinName
+ Str += "%s\t"%(Species.ResidueNumber + Species.Peptide.Modifications.keys()[0])
+ Str += "%s\t"%Species.Annotation
+ Str += "%s\t"%Species.Charge
+ for Feature in Features:
+ Str += "%s\t"%Feature
+ print Str
+ self.OutputFile.write(Str + "\n")
+ # We're done with this PTM now, so let's forget about it:
+ del self.Peptides[Key]
+ self.OutputFile.close()
+ def ComputePTMFeatures(self, Species):
+ """
+ Compute scoring-features for this peptide species, return them as a list
+ """
+ Features = []
+ # Feature: Is the PTM from a valid protein? (Note: this feature is not INPUT for the
+ # model, it's our desired output)
+ Feature = self.GetValidProteinFlag(Species)
+ Features.append(Feature)
+ # Important question: Is this PTM constitutive, or facultative? (In other words:
+ # is there a spectrum annotated with an UNMODIFIED version of this peptide?)
+ # Set flag to "1" if the PTM is facultative:
+ if Species.Modless:
+ Features.append("1")
+ else:
+ Features.append("")
+ # The best spectrum observed (meta-data, not a scoring feature)
+ BestMQScore = -999
+ BestDeltaScore = None
+ for Info in Species.Spectra:
+ if Info.MQScore > BestMQScore:
+ BestMQScore = Info.MQScore
+ BestDeltaScore = Info.DeltaScore
+ FilePath = self.CachedFixedFilePaths[Info.FileNameIndex]
+ BestMQSpectrum = ("%s:%s"%(FilePath, Info.ByteOffset))
+ Features.append(BestMQSpectrum)
+ # The best MODLESS spectrum observed (meta-data, not a scoring feature)
+ if Species.Modless:
+ BestMQScoreModless = -999
+ for Info in Species.Modless.Spectra:
+ if Info.MQScore > BestMQScoreModless:
+ BestMQScoreModless = Info.MQScore
+ FilePath = self.CachedFixedFilePaths[Info.FileNameIndex]
+ BestMQSpectrum = ("%s:%s"%(FilePath, Info.ByteOffset))
+ Features.append(BestMQSpectrum)
+ Features.append(str(BestMQScoreModless))
+ else:
+ Features.append("")
+ Features.append("")
+ # Feature: Annotation, and MQScore, from a search versus big-DB. (This feature
+ # will be spiked in later)
+ Features.append("")
+ Features.append("")
+ # Feature: Number of spectra annotated with this PTM
+ Features.append(Species.SpectrumCount)
+ # Feature: Number of spectra for the *unmodified* peptide version:
+ if Species.Modless:
+ Feature = Species.Modless.SpectrumCount
+ Features.append(Feature)
+ else:
+ Features.append(0)
+ # Feature: Best MQScore for this PTM, and the best delta-score for that scan:
+ Features.append(BestMQScore)
+ Features.append(BestDeltaScore)
+ # Feature: Number of peptide species observed for this PTM on this residue
+ Features.append(len(Species.PTM.SpeciesList))
+ # Feature: Consensus annotation score (and score-features) for this peptide
+ Species.ConsensusScore = None
+ self.GetConsensusMQScore(Species, Features)
+ # Feature: Presence of unmodified peptides covering the residue of interest
+ ModlessCount = self.Coverage[Species.PTM.DBPos]
+ ModdedSpectrumCount = self.ModCoverage[Species.PTM.DBPos]
+ ModdedFraction = ModdedSpectrumCount / float(ModlessCount + ModdedSpectrumCount)
+ Features.append(ModdedFraction)
+ # Feature: Coverage of the protein of interest (ONLY FOR FACULTATIVE!)
+ ProteinIndex = self.GetProteinIndex(Species.DBPos)
+ ProteinCoverage = self.ProteinCoverageLevels[ProteinIndex]
+ Features.append(ProteinCoverage)
+ # Feature: Number of annotations using this modification-type
+ ModTypeKey = (Species.ModAA, Species.ModMass)
+ ModTypeSpectrumCount = self.ModTypeSpectrumCount.get(ModTypeKey, 0)
+ Features.append(ModTypeSpectrumCount)
+ # Feature: Number of sites using this modification-type
+ ModTypeSiteCount = self.ModTypeSiteCount.get(ModTypeKey, 0)
+ Features.append(ModTypeSiteCount)
+ # Feature: Number of unmodified peptide species for this site
+ ModlessPeptides = self.PeptideCoverage[Species.DBPos]
+ Features.append(ModlessPeptides)
+ # Features for FACULTATIVE PTMs only:
+ # These features have been commented out, since we no longer pursue a
+ # special model for facultative PTMs.
+ if 0: #Species.Modless:
+ Comparator = SpectralSimilarity.SpectralSimilarity(Species.ConsensusPath,
+ Species.Modless.ConsensusPath, Species.Annotation, Species.Modless.Annotation)
+ Comparator.LabelPeaks(0.5)
+ Similarity = Comparator.DotProduct(0.5, HashByRank = 1)
+ Features.append(Similarity)
+ Similarity = Comparator.GetSharedPeakCount(0, 1)
+ Features.append(Similarity)
+ Similarity = Comparator.GetSharedPeakCount(1, 1)
+ Features.append(Similarity)
+ CorrelationCoefficient = Comparator.ComputeCorrelationCoefficient(1.0)
+ Features.append(CorrelationCoefficient)
+ del Comparator
+ else:
+ # This PTM is constitutive, so omit the spectrum-comparison features:
+ Features.append("") # dot
+ Features.append("") # shared-peaks
+ Features.append("") # shared-peaks
+ Features.append("") # correlation
+ # Feature: Log of spectrum-count
+ Features.append(math.log(1.0 + Species.SpectrumCount))
+ # Feature: Log of peptide-length
+ Features.append(math.log(len(Species.Peptide.Aminos)))
+ # Feature: Log of same-modtype-spectrum-count
+ Features.append(math.log(1.0 + ModTypeSpectrumCount))
+ # Feature: Log of same-modtype-site-count
+ Features.append(math.log(1.0 + ModTypeSiteCount))
+ # Feature: Delta-score versus big-db search result. To be spiked in later!
+ Features.append("")
+ # Free the PySpectrum object now:
+ Species.PySpectrum = None
+ return Features
+ def GetValidProteinFlag(self, PTM):
+ # Normally we prepend "xxx" to the bogus names:
+ if PTM.ProteinName[:3] == "XXX":
+ return 0
+ return 1
+ def GetProteinIndex(self, DBPos):
+ for ProteinIndex in range(len(self.ProteinPositions)):
+ Pos = self.ProteinPositions[ProteinIndex]
+ if Pos > DBPos:
+ return ProteinIndex - 1
+ return len(self.ProteinPositions) - 1
+ def PreComputeAminosForMasses(self):
+ """
+ PepNovo often gives us partial interpretations - e.g. a
+ peptide that starts at 250Da. We "fill in" the prefix and
+ suffix to generate a (not necessarily optimal) full-length
+ peptide.
+ """
+ Aminos = "ACDEFGHILMNPQSTVWYRK" # PREFER ending in R or K.
+ self.AAStrings = {}
+ TotalMass = 0
+ for AA1 in Aminos:
+ Mass1 = Global.AminoMass[AA1]
+ TotalMass = int(round(Mass1))
+ self.AAStrings[TotalMass] = "%c"%(AA1)
+ for AA2 in Aminos:
+ Mass2 = Global.AminoMass[AA2]
+ TotalMass = int(round(Mass1 + Mass2))
+ self.AAStrings[TotalMass] = "%c%c"%(AA1, AA2)
+ for AA3 in Aminos:
+ Mass3 = Global.AminoMass[AA3]
+ TotalMass = int(round(Mass1 + Mass2 + Mass3))
+ self.AAStrings[TotalMass] = "%c%c%c"%(AA1, AA2, AA3)
+ for AA4 in Aminos:
+ Mass4 = Global.AminoMass[AA4]
+ TotalMass = int(round(Mass1 + Mass2 + Mass3 + Mass4))
+ self.AAStrings[TotalMass] = "%c%c%c%c"%(AA1, AA2, AA3, AA4)
+ def AddSpectrumToCluster(self, InputFilePath, InputFilePos, ClusterFile, Charge):
+ """
+ Append the specified scan to an ever-growing .mgf file
+ Returns 1 if successful, 0 if failed
+ """
+ try:
+ SpectrumFile = open(InputFilePath, "rb")
+ except:
+ print "** Error: couldn't open spectrum data file %s"%InputFilePath
+ return 0
+ SpectrumFile.seek(InputFilePos)
+ Spectrum = MSSpectrum.SpectrumClass()
+ try:
+ Spectrum.ReadPeaksFromFile(SpectrumFile, InputFilePath)
+ except:
+ traceback.print_exc()
+ print "*** Can't parse:", InputFilePath, InputFilePos
+ return 0
+ SpectrumFile.close()
+ ParentMass = Spectrum.PrecursorMZ * Charge - (Charge - 1)*1.0078 #Peptide.Masses[-1] + 19
+ #MZ = (ParentMass + (Info.Charge - 1)*1.0078) / Info.Charge
+ # Now write out this spectrum to the cluster:
+ self.ClusterScanNumber += 1 # ASSUMED: The caller set this to 0 at the start of the cluster!
+ ClusterFile.write("BEGIN IONS\n")
+ ClusterFile.write("TITLE=%s:%s\n"%(InputFilePath, InputFilePos))
+ ClusterFile.write("SCAN=%s\n"%self.ClusterScanNumber)
+ ClusterFile.write("CHARGE=%s\n"%Charge)
+ ClusterFile.write("PEPMASS=%s\n"%ParentMass)
+ for Peak in Spectrum.Peaks:
+ ClusterFile.write("%s %s\n"%(Peak.Mass, Peak.Intensity))
+ ClusterFile.write("END IONS\n")
+ #ClusterFile.close()
+ return 1
+ def GetConsensusMQScore(self, Species, Features):
+ """
+ Feature: MQScore of the consensus spectrum.
+ - Write spectra to a cluster (done by ProduceConsensusSpectra)
+ - Generate a consensus-spectrum for the cluster (done by ProduceConsensusSpectra)
+ - Load the consensus-spectrum
+ - Score the spectrum
+ """
+ # Load in the consensus spectrum, and score the peptide annotation:
+ try:
+ print ">>PyConsensus spectrum:", Species.ConsensusPath
+ PySpectrum = PyInspect.Spectrum(Species.ConsensusPath, 0)
+ Species.PySpectrum = PySpectrum
+ print ">>ScorePeptideDetailed(%s)"%Species.Annotation
+ ScoreList = PySpectrum.ScorePeptideDetailed(Species.Annotation)
+ Species.ConsensusScore = ScoreList[0]
+ for ScoreItem in ScoreList:
+ Features.append(ScoreItem)
+ print "PyInspect score %s -> %s"%(Species.Annotation, ScoreList[0])
+ except:
+ traceback.print_exc()
+ for X in range(8):
+ Features.append(0)
+ def ParseCommandLine(self, Arguments):
+ (Options, Args) = getopt.getopt(Arguments, "d:r:w:s:M:lp:c:Z:")
+ OptionsSeen = {}
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-r":
+ # -r results file(s)
+ self.ResultsFileName = Value
+ elif Option == "-c":
+ self.RequiredFileNameChunk = Value
+ elif Option == "-d":
+ self.DBPath = Value
+ elif Option == "-M":
+ self.PopulateSpectrumOracle(Value)
+ elif Option == "-w":
+ self.OutputDir = Value
+ self.OutputPath = os.path.join(self.OutputDir, "PTMFeatures.txt")
+ elif Option == "-l":
+ self.QuickParseFlag = 1
+ elif Option == "-s" or Option == "-M":
+ self.PopulateSpectrumOracle(Value)
+ #self.SpectrumDir = Value
+ elif Option == "-p":
+ self.PValueCutoff = float(Value)
+ elif Option == "-Z": # secret debugging option: Start output from DB position
+ self.StartOutputDBPos = int(Value)
+ else:
+ print "* Error: Unrecognized option %s"%Option
+
+UsageInfo = """
+ComputePTMFeatures: Generate feature values for PTMs observed on a data-set.
+Run this AFTER running SelectSites, and BEFORE running TrainPTMFeatures.
+
+Arguments:
+ -r [ResultsFile]: Name of the results file (or directory)
+ -d [DBPath]: Path to the .trie file searched
+ -w [OutputDir]: Output file directory. Features are written to
+ PTMFeatures.txt within this directory. Clusters and other info
+ is written in (or below) this directory.
+ -M [RootDir]: Root directory for mzXML files.
+"""
+
+if __name__ == "__main__":
+ if not PROFILE_FLAG:
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "(psyco not installed; running unoptimized)"
+ Trainer = PTMFeatureComputer()
+ Trainer.ParseCommandLine(sys.argv[1:])
+ if not Trainer.ResultsFileName or not Trainer.DBPath:
+ print UsageInfo
+ sys.exit(-1)
+ if PROFILE_FLAG:
+ import profile
+ profile.run("Trainer.ComputeFeaturesMain()")
+ else:
+ Trainer.ComputeFeaturesMain()
+
+
diff --git a/Database/CommonContaminants.fasta b/Database/CommonContaminants.fasta
new file mode 100644
index 0000000..c0ed40b
--- /dev/null
+++ b/Database/CommonContaminants.fasta
@@ -0,0 +1,20 @@
+>TRYP_PIG Porcine trypsin - Sus scrofa (Pig).
+FPTDDDDKIVGGYTCAANSIPYQVSLNSGSHFCGGSLINSQWVVSAAHCYKSRIQVRLGEHNIDVLEGNEQFINAAKIITHPNFNGNTLDNDIMLIKLSSPATLNSRVATVSLPRSCAAAGTECLISGWGNTKSSGSSYPSLLQCLKAPVLSDSSCKSSYPGQITGNMICVGFLEGGKDSCQGDSGGPVVCNGQLQGIVSWGYGCAQKNKPGVYTKVCNYVNWIQQTIAAN
+>TRY1_BOVIN Bovine trypsin - Bos taurus (Bovine).
+FIFLALLGAAVAFPVDDDDKIVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAH
+CYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRV
+ASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNM
+FCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTI
+ASN
+>sp|P35908|K22E_HUMAN Keratin, type II cytoskeletal 2 epidermal (Cytokeratin 2e) (K2e) (CK 2e) - Homo sapiens (Human).
+MSCQISCKSRGRGGGGGGFRGFSSGSAVVSGGSRRSTSSFSCLSRHGGGGGGFGGGGFGSRSLVGLGGTKSISISVAGGGGGFGAAGGFGGRGGGFGGGSGFGGGSGFGGGSGFSGGGFGGGGFGGGRFGGFGGPGGVGGLGGPGGFGPGGYPGGIHEVSVNQSLLQPLNVKVDPEIQNVKAQEREQIKTLNNKFASFIDKVRFLEQQNQVLQTKWELLQQMNVGTRPINLEPIFQGYIDSLKRYLDGLTAERTSQNSELNNMQDLVEDYKKKYEDEINKRTAAENDFVTLKKDVDNAYMIKVELQSKVDLLNQEIEFLKVLYDAEISQIHQSVTDTNVILSMDNSRNLDLDSIIAEVKAQYEEIAQRSKEEAEALYHSKYEELQVTVGRHGDSLKEIKIEISELNRVIQRLQGEIAHVKKQCKNVQDAIADAEQRGEHALKDARNKLNDLEEALQQAKEDLARLLRDYQELMNVKLALDVEIATYRKLLEG [...]
+>sp|Q01546|K22O_HUMAN Keratin, type II cytoskeletal 2 oral (Cytokeratin 2P) (K2P) (CK 2P) - Homo sapiens (Human).
+MNRQVCKKSFSGRSQGFSGRSAVVSGSSRMSCVARSGGAGGGACGFRSGAGSFGSRSLYNLGSNKSISISVAAGSSRAGGFGGGRSSCGFAGGYGGGFGGSYGGGFGGGRGVGSGFGGAGGFGGAGGFGGPGVFGGPGSFGGPGGFGPGGFPGGIQEVIVNQSLLQPLNVEIDPQIGQVKAQEREQIKTLNNKFASFIDKVRFLEQQNKVLETKWELLQQQTTGSGPSSLEPCFESYISFLCKQLDSLLGERGNLEGELKSMQDLVEDFKKKYEDEINKRTAAENEFVGLKKDVDAAFMNKVELQAKVDSLTDEVSFLRTLYEMELSQMQSHASDTSVVLSMDNNRCLDLGSIIAEVRTQYEEIAQRSKSEAEALYQTKLGELQTTAGRHGDDLRNTKSEIMELNRMIQRLRAEIENVKKQNANLQTAIAEAEQRGEMALKDANAKLQDLQTALQKAKDDLARLLRDYQELMNVKLALDVEIATYRKLLEGE [...]
+>sp|P04264|K2C1_HUMAN Keratin, type II cytoskeletal 1 (Cytokeratin 1) (K1) (CK 1) (67 kDa cytokeratin) (Hair alpha protein) - Homo sapiens (Human).
+SRQFSSRSGYRSGGGFSSGSAGIINYQRRTTSSSTRRSGGGGGRFSSCGGGGGSFGAGGGFGSRSLVNLGGSKSISISVARGGGRGSGFGGGYGGGGFGGGGFGGGGFGGGGIGGGGFGGFGSGGGGFGGGGFGGGGYGGGYGPVCPPGGIQEVTINQSLLQPLNVEIDPEIQKVKSREREQIKSLNNQFASFIDKVRFLEQQNQVLQTKWELLQQVDTSTRTHNLEPYFESFINNLRRRVDQLKSDQSRLDSELKNMQDMVEDYRNKYEDEINKRTNAENEFVTIKKDVDGAYMTKVDLQAKLDNLQQEIDFLTALYQAELSQMQTQISETNVILSMDNNRSLDLDSIIAEVKAQNEDIAQKSKAEAESLYQSKYEELQITAGRHGDSVRNSKIEISELNRVIQRLRSEIDNVKKQISNLQQSISDAEQRGENALKDAKNKLNDLEDALQQAKEDLARLLRDYQELMNTKLALDLEIATYRTLLEGEESRM [...]
+>sp|P12035|K2C3_HUMAN Keratin, type II cytoskeletal 3 (Cytokeratin 3) (K3) (CK3) (65 kDa cytokeratin) - Homo sapiens (Human).
+MSRQASKTSGGGSQGFSGRSAVVSGSSRMSCVAHSGGAGGGAYGFRSGAGGFGSRSLYNLGGDKSISISVAAGGSRAGGFGGGRSSCAFAGGYGGGFGSGYGGGFGGGFGGGRGMGGGFGGAGGFGGAGGFGGAGGFGGPGGFGGSGGFGGPGSLGSPGGFAPGGFPGGIQEVTTNQSLLQPLKVETDPQIGQVKAQEREQIKTLNNKFASFIDKVRFLEQQNKVLETKWNLLQQQGTSSISGTNNLEPLFENHINYLRSYLDNILGERGRLDSELKNMEDLVEDFKKKYEDEINKRYAAENEFVTLKKDVDSAYMNKVELQAKVDALIDEIDFLRTLYDAELSQMQSHISDTSVVLSMDNNRSLDLDSIIAEVGAQYEDIAQRSKAEAEALYQTKLGELQTTAGRHGDDLRNTKSEIIELNRMIQRLRAEIEGVKKQNANLQTAIAQAEQHGEMALKDANAKLQELQAALQQAKDDLARLLRDYQELMNVK [...]
+>sp|P08729|K2C7_HUMAN Keratin, type II cytoskeletal 7 (Cytokeratin 7) (K7) (CK 7) (Sarcolectin) - Homo sapiens (Human).
+SIHFSSPVFTSRSAAFSGRGAQVRLSSARPGGLGSSSLYGLGASRPRVAVRSAYGGPVGAGIREVTINQSLLAPLRLDADPSLQRVRQEESEQIKTLNNKFASFIDKVRFLEQQNKLLETKWTLLQEQKSAKSSRLPDIFEAQIAGLRGQLEALQVDGGRLEQGLRTMQDVVEDFKNKYEDEINRRTAAENEFVVLKKDVDAAYMSKVELEAKVDALNDEINFLRTLNETELTELQSQISDTSVVLSMDNSRSLDLDGIIAEVKAQYEEMAKCSRAEAEAWYQTKFETLQAQAGKHGDDLRNTRNEISEMNRAIQRLQAEIDNIKNQRAKLEAAIAEAEERGELALKDARAKQEELEAALQRAKQDMARQLREYQELMSVKLALDIEIATYRKLLEGEESRLAGDGVGAVNISVMNSTGGSSSGGGIGLTLGGTMGSNALSFSSSAGPGLLKAYSIRTASASRRSARD
+>sp|P35527|K1CI_HUMAN Keratin, type I cytoskeletal 9 (Cytokeratin 9) (K9) (CK 9) - Homo sapiens (Human).
+MSCRQFSSSYLTSGGGGGGGLGSGGSIRSSYSRFSSSGGRGGGGRFSSSSGYGGGSSRVCGRGGGGSFGYSYGGGSGGGFSASSLGGGFGGGSRGFGGASGGGYSSSGGFGGGFGGGSGGGFGGGYGSGFGGLGGFGGGAGGGDGGILTANEKSTMQELNSRLASYLDKVQALEEANNDLENKIQDWYDKKGPAAIQKNYSPYYNTIDDLKDQIVDLTVGNNKTLLDIDNTRMTLDDFRIKFEMEQNLRQGVDADINGLRQVLDNLTMEKSDLEMQYETLQEELMALKKNHKEEMSQLTGQNSGDVNVEINVAPGKDLTKTLNDMRQEYEQLIAKNRKDIENQYETQITQIEHEVSSSGQEVQSSAKEVTQLRHGVQELEIELQSQLSKKAALEKSLEDTKNRYCGQLQMIQEQISNLEAQITDVRQEIECQNQEYSLLLSIKMRLEKEIETYHNLLEGGQEDFESSGAGKIGLGGRGGSGGSYGRGSRGGS [...]
diff --git a/Database/TestDatabase.index b/Database/TestDatabase.index
new file mode 100644
index 0000000..66980d6
Binary files /dev/null and b/Database/TestDatabase.index differ
diff --git a/Database/TestDatabase.trie b/Database/TestDatabase.trie
new file mode 100644
index 0000000..096ca19
--- /dev/null
+++ b/Database/TestDatabase.trie
@@ -0,0 +1 @@
+VATIVKMCLVAMALRQPLKRLNVPGEIAESISKNNHAVRRINKKVDKFQSQEQQEMDDPRQDQVHPFAKTQSIVYPFPHPIPDSLPMNIPPVTQTPIFVAPFLEPEILGMHCVKEAMGPKHKELPFPKFPVDPYTEKQSFTNFDVSNLHLPMPLLQSWMHQPHQPLPPTILFPPQRVILLKYMKVLPVPDKEVPYPQRDMSCQAFCLYEDPVIGPWRGPFPLLM*EHHWGYGKHKGPEHWHMDFPLLNGEMQSPVNIDWHRVINDPPLKPLAGVYGSATSRRMLNNGHSMNVEYHDSENKSELKDGPITGAYRIVEFHQRWGSSDDQGSEHTIDRKKYCAELHIVHWNTKYNGFGTSAQQPDGMTIVGTFLTMGDCNPAWRTVLDALDSIKTKGTSTDFPNFDPGTLLPNVIDYWMYPGSLTTPPLMETVTWIVAKEPINMSDEHLFKFRTLNFNAEGDPELIMLANWRPAQPMQNLQVRGFPK*GDVEKAK [...]
\ No newline at end of file
diff --git a/Errors.c b/Errors.c
new file mode 100644
index 0000000..0aa0b8e
--- /dev/null
+++ b/Errors.c
@@ -0,0 +1,261 @@
+//Title: Errors.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+#include "CMemLeak.h"
+#include <stdio.h>
+#include "Inspect.h"
+#include "Errors.h"
+
+void AssertionFailed(char* Assertion, char* FileName, int LineNumber)
+{
+ printf("** ASSERTION FAILED line %d file '%s':\n '%s'\n", LineNumber, FileName, Assertion);
+}
+
+typedef struct NumberedError
+{
+ int ID;
+ char* Message;
+} NumberedError;
+
+NumberedError ErrorMessages[] = {
+ {0, "Unhandled exception"},
+ {1, "Out of memory"},
+ {2, "Out of disk space"},
+ {3, "Missing required file '%s'"},
+ {4, "Internal assertion '%s'"},
+ {5, "File '%s' not found"},
+ {6, "Error in LoadBayesianModel: Bogus feature count %d"},
+ {7, "Bogus-looking probability table size %d for feature %d"},
+ {8, "Unable to open requested file '%s'"},
+ {9, "Scan number range (%d...%d) includes no spectra!"},
+ {10, "Only %d top-scoring matches for charge state; not recalibrating the FDR curve."},
+ {11, "No spectra were specified to search."},
+ {12, "No GFF files were specified as input to build an MS2DB file."},
+ {13, "Ignoring unknown command '%s' from Inspect input file"},
+ {14, "Syntax error on line %d of file %s"},
+ {15, "No valid exons found in GFF files"},
+ {16, "Linked exons %d...%d and %d...%d have incompatible reading frames"},
+ {17, "Consecutive GFF exons %d...%d and %d...%d come from same gene, but can't be linked because they overlap"},
+ {18, "Invalid command-line argument '%s'"},
+ {19, "Command-line argument '%s' requires a parameter."},
+ {20, "Invalid coordinates %d...%d on line %d of file %s"},
+ {21, "Length-1 exon at %d is a codon center, but doesn't link in and out"},
+ {22, "Unable to cover GFF gene '%s'"},
+ {23, "Coverage of GFF gene '%s' is incomplete"},
+ {24, "Unhandled instance: %d"},
+ {25, "XML parse exception: '%s'"},
+ {26, "Error linking exons %d and %d in gene '%s'"},
+ {27, "XML line %d: Unexpected tag '%s'"},
+ {28, "XML line %d: Unexpected attribute '%s' for XML tag '%s'"},
+ {29, "Exon %d of gene '%s' is too long!"},
+ {30, "Spectrum file '%s' has an abnormal extension; attempting to treat it as a .dta file"},
+ {31, "Too many peaks in spectrum (scan %d, file %s); dropping extras!"},
+ {32, "Illegal peak mass in scan %d of file %s"},
+ {33, "Syntax error on line %d of input file %s"},
+ {34, "Modifications specified, but no PTMs permitted in peptides. Use 'mods,1' to permit modified peptides."},
+ {35, "Too many PTMs specified in input file - ignoring '%s'"},
+ {36, "Invalid modification type '%s'"},
+ {37, "Illegal amino acid specificity '%s' for modification"},
+ {38, "Invalid tag length '%d': Valid values are 1 through 6"},
+ {39, "Input file parameter '%s' doesn't take a value"},
+ {40, "Input file parameter '%s' requires a string value"},
+ {41, "Input file parameter '%s' requires an integer value"},
+ {42, "Invalid mass %dDa at position %d in spectrum file '%s'"},
+ {43, "Invalid mass %dDa in spectrum file"},
+ {44, "Invalid mass ppm %d - should be in the range 1...1000"},
+ {45, "Peak for spectrum %s:%d is %dDa - possible corruption"},
+ {46, "Invalid scoring model specified: Charge must be 2 or 3"},
+ {47, "Invalid RequiredTermini value '%d' specified"},
+ {48, "Peak for spectrum %s:%d has intensity %f - possible corruption"},
+ {49, "Out of memory - failed to allocate %d bytes"},
+ {50, "Unable to write or close output file '%s'"},
+ {-1, NULL}
+};
+
+int ErrorMessageCount;
+
+void InitErrors()
+{
+ ErrorMessageCount = sizeof(ErrorMessages) / sizeof(NumberedError);
+}
+
+// Report an error - write it to GlobalOptions->ErrorFile (if GlobalOptions exists)
+// and to stderr, and increment the count of reported errors.
+void ReportError(int ErrorSeverity, int ErrorID, int SourceLine, char* SourceFileName, int ArgType,
+ const char* StrA, const char* StrB, const char* StrC,
+ int IntA, int IntB, int IntC, int IntD,
+ float FloatA, float FloatB)
+{
+ char* ErrorMessage;
+ int ErrorIndex;
+ FILE* ErrorFile;
+ FILE* ErrorFile2;
+ //
+ if (!GlobalOptions || !GlobalOptions->ErrorFile)
+ {
+ ErrorFile = stderr;
+ ErrorFile2 = NULL;
+ }
+ else
+ {
+ ErrorFile = GlobalOptions->ErrorFile;
+ ErrorFile2 = stderr;
+ }
+ if (ErrorSeverity)
+ {
+ if (GlobalOptions)
+ {
+ GlobalOptions->ErrorCount++;
+ }
+ fprintf(ErrorFile, "[E%04d] %s:%d:", ErrorID, SourceFileName, SourceLine);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, "[E%04d] %s:%d:", ErrorID, SourceFileName, SourceLine);
+ }
+
+ }
+ else
+ {
+ if (GlobalOptions)
+ {
+ GlobalOptions->WarningCount++;
+ }
+ fprintf(ErrorFile, "{W%04d} %s:%d:", ErrorID, SourceFileName, SourceLine);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, "{W%04d} %s:%d:", ErrorID, SourceFileName, SourceLine);
+ }
+ }
+ ErrorMessage = "";
+ for (ErrorIndex = 0; ErrorIndex < ErrorMessageCount; ErrorIndex++)
+ {
+ if (ErrorID == ErrorMessages[ErrorIndex].ID)
+ {
+ ErrorMessage = ErrorMessages[ErrorIndex].Message;
+ break;
+ }
+ }
+ switch (ArgType)
+ {
+ case ERROR_ARGS_S:
+ fprintf(ErrorFile, ErrorMessage, StrA);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, ErrorMessage, StrA);
+ }
+ break;
+ case ERROR_ARGS_SS:
+ fprintf(ErrorFile, ErrorMessage, StrA, StrB);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, ErrorMessage, StrA, StrB);
+ }
+ break;
+ case ERROR_ARGS_I:
+ fprintf(ErrorFile, ErrorMessage, IntA);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, ErrorMessage, IntA);
+ }
+ break;
+ case ERROR_ARGS_IS:
+ fprintf(ErrorFile, ErrorMessage, IntA, StrA);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, ErrorMessage, IntA, StrA);
+ }
+ break;
+ case ERROR_ARGS_ISS:
+ fprintf(ErrorFile, ErrorMessage, IntA, StrA, StrB);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, ErrorMessage, IntA, StrA, StrB);
+ }
+ break;
+ case ERROR_ARGS_II:
+ fprintf(ErrorFile, ErrorMessage, IntA, IntB);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, ErrorMessage, IntA, IntB);
+ }
+ break;
+ case ERROR_ARGS_IIS:
+ fprintf(ErrorFile, ErrorMessage, IntA, IntB, StrA);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, ErrorMessage, IntA, IntB, StrA);
+ }
+ break;
+ case ERROR_ARGS_III:
+ fprintf(ErrorFile, ErrorMessage, IntA, IntB, IntC);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, ErrorMessage, IntA, IntB, IntC);
+ }
+ break;
+ case ERROR_ARGS_IIII:
+ fprintf(ErrorFile, ErrorMessage, IntA, IntB, IntC, IntD);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, ErrorMessage, IntA, IntB, IntC, IntD);
+ }
+ break;
+ case ERROR_ARGS_SII:
+ fprintf(ErrorFile, ErrorMessage, StrA, IntA, IntB);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, ErrorMessage, StrA, IntA, IntB);
+ }
+ break;
+ case ERROR_ARGS_SIF:
+ fprintf(ErrorFile, ErrorMessage, StrA, IntA, FloatA);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, ErrorMessage, StrA, IntA, FloatA);
+ }
+ break;
+
+ case ERROR_ARGS_NONE:
+ default:
+ fprintf(ErrorFile, ErrorMessage);
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, ErrorMessage);
+ }
+ break;
+ }
+ fprintf(ErrorFile, "\n");
+ if (ErrorFile2)
+ {
+ fprintf(ErrorFile2, "\n");
+ }
+
+}
diff --git a/Errors.h b/Errors.h
new file mode 100644
index 0000000..451e8f1
--- /dev/null
+++ b/Errors.h
@@ -0,0 +1,88 @@
+//Title: Errors.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+//
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef ERRORS_H
+#define ERRORS_H
+
+
+void AssertionFailed(char* Assertion, char* FileName, int LineNumber);
+void InitErrors();
+// ReportError is not to be called directly. Use the REPORT_ERROR and REPORT_WARNING macros.
+void ReportError(int ErrorSeverity, int ErrorID, int SourceLine, char* SourceFileName, int Args,
+ const char* StrA, const char* StrB, const char* StrC,
+ int IntA, int IntB, int IntC, int IntD,
+ float FloatA, float FloatB);
+
+#define ERROR_ARGS_NONE 0
+#define ERROR_ARGS_S 1
+#define ERROR_ARGS_I 2
+#define ERROR_ARGS_II 3
+#define ERROR_ARGS_III 4
+#define ERROR_ARGS_IIII 5
+#define ERROR_ARGS_IS 6
+#define ERROR_ARGS_IIS 7
+#define ERROR_ARGS_IIIS 8
+#define ERROR_ARGS_SS 9
+#define ERROR_ARGS_ISS 10
+#define ERROR_ARGS_SII 11
+#define ERROR_ARGS_SIF 12
+
+#define REPORT_ERROR(ErrorID) ReportError(1, ErrorID, __LINE__, __FILE__, ERROR_ARGS_NONE, NULL, NULL, NULL, 0, 0, 0, 0, 0.0, 0.0);
+#define REPORT_ERROR_S(ErrorID, StrA) ReportError(1, ErrorID, __LINE__, __FILE__, ERROR_ARGS_S, StrA, NULL, NULL, 0, 0, 0, 0, 0.0, 0.0);
+#define REPORT_ERROR_SS(ErrorID, StrA, StrB) ReportError(1, ErrorID, __LINE__, __FILE__, ERROR_ARGS_SS, StrA, StrB, NULL, 0, 0, 0, 0, 0.0, 0.0);
+#define REPORT_ERROR_I(ErrorID, IntA) ReportError(1, ErrorID, __LINE__, __FILE__, ERROR_ARGS_I, NULL, NULL, NULL, IntA, 0, 0, 0, 0.0, 0.0);
+#define REPORT_ERROR_II(ErrorID, IntA, IntB) ReportError(1, ErrorID, __LINE__, __FILE__, ERROR_ARGS_II, NULL, NULL, NULL, IntA, IntB, 0, 0, 0.0, 0.0);
+#define REPORT_ERROR_III(ErrorID, IntA, IntB, IntC) ReportError(1, ErrorID, __LINE__, __FILE__, ERROR_ARGS_III, NULL, NULL, NULL, IntA, IntB, IntC, 0, 0.0, 0.0);
+#define REPORT_ERROR_IIII(ErrorID, IntA, IntB, IntC, IntD) ReportError(1, ErrorID, __LINE__, __FILE__, ERROR_ARGS_IIII, NULL, NULL, NULL, IntA, IntB, IntC, IntD, 0.0, 0.0);
+#define REPORT_ERROR_IS(ErrorID, IntA, StrA) ReportError(1, ErrorID, __LINE__, __FILE__, ERROR_ARGS_IS, StrA, NULL, NULL, IntA, 0, 0, 0, 0.0, 0.0);
+#define REPORT_ERROR_IIS(ErrorID, IntA, IntB, StrA) ReportError(1, ErrorID, __LINE__, __FILE__, ERROR_ARGS_IIS, StrA, NULL, NULL, IntA, IntB, 0, 0, 0.0, 0.0);
+#define REPORT_ERROR_IIIS(ErrorID, IntA, IntB, IntC, StrA) ReportError(1, ErrorID, __LINE__, __FILE__, ERROR_ARGS_IIIS, StrA, NULL, NULL, IntA, IntB, IntC, 0, 0.0, 0.0);
+#define REPORT_ERROR_ISS(ErrorID, IntA, StrA, StrB) ReportError(1, ErrorID, __LINE__, __FILE__, ERROR_ARGS_ISS, StrA, StrB, NULL, IntA, 0, 0, 0, 0.0, 0.0);
+#define REPORT_ERROR_SII(ErrorID, StrA, IntA, IntB) ReportError(1, ErrorID, __LINE__, __FILE__, ERROR_ARGS_SII, StrA, NULL, NULL, IntA, IntB, 0, 0, 0.0, 0.0);
+#define REPORT_ERROR_SIF(ErrorID, StrA, IntA, FloatA) ReportError(1, ErrorID, __LINE__, __FILE__, ERROR_ARGS_SIF, StrA, NULL, NULL, IntA, 0, 0, 0, FloatA, 0.0);
+
+#define REPORT_WARNING(ErrorID) ReportError(0, ErrorID, __LINE__, __FILE__, ERROR_ARGS_NONE, NULL, NULL, NULL, 0, 0, 0, 0, 0.0, 0.0);
+#define REPORT_WARNING_S(ErrorID, StrA) ReportError(0, ErrorID, __LINE__, __FILE__, ERROR_ARGS_S, StrA, NULL, NULL, 0, 0, 0, 0, 0.0, 0.0);
+#define REPORT_WARNING_SS(ErrorID, StrA, StrB) ReportError(0, ErrorID, __LINE__, __FILE__, ERROR_ARGS_SS, StrA, StrB, NULL, 0, 0, 0, 0, 0.0, 0.0);
+#define REPORT_WARNING_I(ErrorID, IntA) ReportError(0, ErrorID, __LINE__, __FILE__, ERROR_ARGS_I, NULL, NULL, NULL, IntA, 0, 0, 0, 0.0, 0.0);
+#define REPORT_WARNING_II(ErrorID, IntA, IntB) ReportError(0, ErrorID, __LINE__, __FILE__, ERROR_ARGS_II, NULL, NULL, NULL, IntA, IntB, 0, 0, 0.0, 0.0);
+#define REPORT_WARNING_III(ErrorID, IntA, IntB, IntC) ReportError(0, ErrorID, __LINE__, __FILE__, ERROR_ARGS_III, NULL, NULL, NULL, IntA, IntB, IntC, 0, 0.0, 0.0);
+#define REPORT_WARNING_IIII(ErrorID, IntA, IntB, IntC, IntD) ReportError(0, ErrorID, __LINE__, __FILE__, ERROR_ARGS_IIII, NULL, NULL, NULL, IntA, IntB, IntC, IntD, 0.0, 0.0);
+#define REPORT_WARNING_IS(ErrorID, IntA, StrA) ReportError(0, ErrorID, __LINE__, __FILE__, ERROR_ARGS_IS, StrA, NULL, NULL, IntA, 0, 0, 0, 0.0, 0.0);
+#define REPORT_WARNING_IIS(ErrorID, IntA, IntB, StrA) ReportError(0, ErrorID, __LINE__, __FILE__, ERROR_ARGS_IIS, StrA, NULL, NULL, IntA, IntB, 0, 0, 0.0, 0.0);
+#define REPORT_WARNING_IIIS(ErrorID, IntA, IntB, IntC, StrA) ReportError(0, ErrorID, __LINE__, __FILE__, ERROR_ARGS_IIIS, StrA, NULL, NULL, IntA, IntB, IntC, 0, 0.0, 0.0);
+#define REPORT_WARNING_ISS(ErrorID, IntA, StrA, StrB) ReportError(0, ErrorID, __LINE__, __FILE__, ERROR_ARGS_ISS, StrA, StrB, NULL, IntA, 0, 0, 0, 0.0, 0.0);
+#define REPORT_WARNING_SII(ErrorID, StrA, IntA, IntB) ReportError(0, ErrorID, __LINE__, __FILE__, ERROR_ARGS_SII, StrA, NULL, NULL, IntA, IntB, 0, 0, 0.0, 0.0);
+#define REPORT_WARNING_SIF(ErrorID, StrA, IntA, FloatA) ReportError(0, ErrorID, __LINE__, __FILE__, ERROR_ARGS_SIF, StrA, NULL, NULL, IntA, 0, 0, 0, FloatA, 0.0);
+
+#endif // ERRORS_H
diff --git a/ExonGraphAlign.c b/ExonGraphAlign.c
new file mode 100644
index 0000000..dd3a602
--- /dev/null
+++ b/ExonGraphAlign.c
@@ -0,0 +1,1195 @@
+//Title: ExonGraphAlign.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include <stdio.h>
+#include <memory.h>
+//#include <malloc.h>
+#include "Utils.h"
+#include "Spliced.h"
+
+// This file implements an alignment algorithm between a sequence and
+// an exon graph, or between two exon graphs.
+// Exon-graph alignment is very similar to the Smith-Waterman alignment
+// algorithm. The main difference is that in the recurrence relation,
+// we may have several "previous" cells to move to, due to the makeup
+// of the exon graph.
+
+#define AA_COUNT 26
+
+// IntNode: For handling lists of successors to each row / column
+typedef struct IntNode
+{
+ int Value;
+ struct IntNode* Next;
+} IntNode;
+
+
+// Forward declarations:
+int AlignExonGraphAgainstExonGraph(GeneStruct* GeneA, GeneStruct* GeneB,
+ char* ScoringMatrixFileName, int StartGapPenalty, int ExtendGapPenalty);
+void FreePrevCellTable(IntNode** PrevCell, int Size);
+
+// Default distance matrix for aligning protein sequences: Hamming distance,
+// a bonus for each match and a penalty for each mismatch. This is adequate
+// for most purposes.
+int* GenerateHammingDistanceMatrix()
+{
+ int X;
+ int Y;
+ int* Matrix;
+ //
+ Matrix = (int*)calloc(AA_COUNT*AA_COUNT, sizeof(int));
+ for (X = 0; X < AA_COUNT; X++)
+ {
+ for (Y = 0; Y < AA_COUNT; Y++)
+ {
+ if (X == Y)
+ {
+ Matrix[Y*AA_COUNT + X] = 10;
+ }
+ else
+ {
+ Matrix[Y*AA_COUNT + X] = -10;
+ }
+ }
+ }
+ return Matrix;
+}
+
+int* LoadScoringMatrix(char* ScoringMatrixFileName)
+{
+ //FILE* ScoringMatrixFile;
+ printf("** Scoring matrix support not implemented yet - use Hamming disatnce for now\n");
+ return NULL;
+}
+
+
+void XALinkRowToRow(IntNode** PrevY, int Y, int TargetY)
+{
+ IntNode* INode;
+ IntNode* NewINode;
+ NewINode = (IntNode*)calloc(1, sizeof(IntNode));
+ NewINode->Value = TargetY;
+ //printf("Back-link from row %d to target %d\n", Y, TargetY);
+ if (PrevY[Y])
+ {
+ for (INode = PrevY[Y]; INode->Next; INode = INode->Next)
+ {
+ ;
+ }
+ INode->Next = NewINode;
+ }
+ else
+ {
+ PrevY[Y] = NewINode;
+ }
+}
+
+// This function links the specified row to the specified exon.
+// The catch: If the exon has length 0, then we link to the specified exon's predecessors.
+void XALinkRowToExon(GeneStruct* Gene, IntNode** PrevY, int Y, int ExonIndex, int* ExonOffsets, int* ExonEdgeOffsets)
+{
+ ExonStruct* Exon;
+ int EdgeIndex;
+ int AAEdgeCount;
+ ExonEdge* Edge;
+ //
+ Exon = Gene->Exons + ExonIndex;
+ // Standard case: The exon is non-empty, so we link back to it.
+ if (Exon->Length)
+ {
+ XALinkRowToRow(PrevY, Y, ExonOffsets[Exon->Index] + Exon->Length - 1);
+ return;
+ }
+ // Special case: The exon is empty, so we link to the exon's predecessors.
+ AAEdgeCount = 0;
+ for (EdgeIndex = 0; EdgeIndex < Exon->BackEdgeCount; EdgeIndex++)
+ {
+ Edge = Exon->BackwardEdges + EdgeIndex;
+ if (Edge->AA)
+ {
+ XALinkRowToRow(PrevY, Y, ExonEdgeOffsets[ExonIndex] + AAEdgeCount);
+ AAEdgeCount++;
+ }
+ else
+ {
+ XALinkRowToExon(Gene, PrevY, Y, Edge->Exon->Index, ExonOffsets, ExonEdgeOffsets);
+ }
+ }
+}
+
+// Test scaffolding for graph-based alignment
+void TestExonGraphAlignment(int argc, char* argv[])
+{
+ GeneStruct* GeneA;
+ GeneStruct* GeneB;
+ FILE* GeneFile;
+ //
+ GeneFile = fopen(argv[1], "rb");
+ if (!GeneFile)
+ {
+ printf("** Error: Can't open gene file '%s'.\n", argv[1]);
+ return;
+ }
+ GeneA = LoadGene(GeneFile);
+ fclose(GeneFile);
+ GeneFile = fopen(argv[2], "rb");
+ if (!GeneFile)
+ {
+ printf("** Error: Can't open gene file '%s'.\n", argv[1]);
+ return;
+ }
+ GeneB = LoadGene(GeneFile);
+ AlignExonGraphAgainstExonGraph(GeneA, GeneB, NULL, -10, -3);
+ //AlignSequenceAgainstExonGraph(Gene, Sequence, NULL, -10, -3);
+ printf("\n\nAlignment complete.\n");
+}
+
+#define Z_STANDARD 0
+#define Z_GAP_IN_X 1
+#define Z_GAP_IN_Y 2
+
+// Count the links and the total amino acids in this exon graph. Used
+// in determining the table size in alignment.
+void GetExonGraphSize(GeneStruct* Gene, int* pLinkCount, int* pSize)
+{
+ int ExonIndex;
+ int LinkIndex;
+ ExonStruct* Exon;
+ ExonEdge* Edge;
+ //
+ *pSize = 0;
+ for (ExonIndex = 0; ExonIndex < Gene->ExonCount; ExonIndex++)
+ {
+ Exon = Gene->Exons + ExonIndex;
+ *pSize += Exon->Length;
+
+ for (LinkIndex = 0; LinkIndex < Exon->BackEdgeCount; LinkIndex++)
+ {
+ Edge = Exon->BackwardEdges + LinkIndex;
+ if (Edge->AA)
+ {
+ (*pLinkCount)++;
+ (*pSize)++;
+ }
+ }
+ }
+}
+
+int CompareExonsForward(const ExonStruct* ExonA, const ExonStruct* ExonB)
+{
+ if (ExonA->Start < ExonB->Start)
+ {
+ return -1;
+ }
+ if (ExonA->Start > ExonB->Start)
+ {
+ return 1;
+ }
+ // ExonA->Start == ExonB->Start
+ if (ExonA->End < ExonB->End)
+ {
+ return -1;
+ }
+ if (ExonA->End > ExonB->End)
+ {
+ return 1;
+ }
+ // Same coordinates? Arbitrary sort:
+ if (ExonA < ExonB)
+ {
+ return -1;
+ }
+ else
+ {
+ return 1;
+ }
+}
+int CompareExonsReverse(const ExonStruct* ExonA, const ExonStruct* ExonB)
+{
+ if (ExonA->End > ExonB->End)
+ {
+ return -1;
+ }
+ if (ExonA->End < ExonB->End)
+ {
+ return 1;
+ }
+ // ExonA->End == ExonB->End
+ if (ExonA->Start > ExonB->Start)
+ {
+ return -1;
+ }
+ if (ExonA->Start > ExonB->Start)
+ {
+ return 1;
+ }
+ // Same coordinates? Arbitrary sort:
+ if (ExonA > ExonB)
+ {
+ return 1;
+ }
+ else
+ {
+ return -1;
+ }
+}
+
+
+// Build the necessary arrays for a d.p. alignment against an exon graph:
+// - ExonEdgeOffsets[n] is the row for the first back-link-with-aa of exon n
+// - ExonOffsets[n] is the row for the first aa in exon n
+// - YSequence[n] is the nth character in the flattened graph
+// - PrevY[n] is a linked list of predecessors for row n. If n is within an exon,
+// there's just one entry, n-1. If n is the start of an exon, there may be several
+// entries. If n comes from an edge, there'll be exactly one entry, for the earlier exon.
+void FlattenExonsForAlignment(GeneStruct* Gene, int* ExonOffsets, int* ExonEdgeOffsets,
+ char* YSequence, IntNode** PrevY, char** YRowInfo)
+{
+ int Y;
+ int ExonIndex;
+ int StartExonIndex;
+ int ExonIterateDir;
+ int ExonCount = Gene->ExonCount;
+ ExonStruct* Exon;
+ int AALinkCount;
+ int LinkIndex;
+ IntNode* NewINode;
+ int Pos;
+ ExonEdge* Edge;
+ //
+ StartExonIndex = 0;
+ ExonIterateDir = 1;
+ Y = 0;
+ for (ExonIndex = StartExonIndex; ExonIndex >= 0 && ExonIndex < ExonCount; ExonIndex += ExonIterateDir)
+ {
+ Exon = Gene->Exons + ExonIndex;
+ // Add a row for each edge with AA:
+ AALinkCount = 0;
+ ExonEdgeOffsets[ExonIndex] = -1;
+ for (LinkIndex = 0; LinkIndex < Exon->BackEdgeCount; LinkIndex++)
+ {
+ // For each exon with an associated amino acid:
+ // - Add a row for the edge, with one back-link.
+ // - Remember the row number, so that the exon can be linked back to this row
+ Edge = Exon->BackwardEdges + LinkIndex;
+ if (Edge->AA)
+ {
+ if (ExonEdgeOffsets[ExonIndex] < 0)
+ {
+ ExonEdgeOffsets[ExonIndex] = Y;
+ }
+ //EdgeOffsets[AALinkCount] = Y;
+ YSequence[Y] = Edge->AA;
+ sprintf(YRowInfo[Y], "X%d backlink%d to %d", ExonIndex, LinkIndex, Edge->Exon->Index);
+ XALinkRowToExon(Gene, PrevY, Y, Edge->Exon->Index, ExonOffsets, ExonEdgeOffsets);
+ //NewINode = (IntNode*)calloc(1, sizeof(IntNode));
+ //NewINode->Value = ExonOffsets[Exon->BackExon[LinkIndex]->Index] + Exon->BackExon[LinkIndex]->Length;
+ //PrevY[Y] = NewINode;
+ AALinkCount++;
+ if (PrevY[Y])
+ {
+ printf("Y %d: exon %d link %d, back to exon %d y %d\n", Y, ExonIndex, LinkIndex, Edge->Exon->Index, PrevY[Y]->Value);
+ }
+ Y++;
+ }
+ }
+ if (!Exon->Length)
+ {
+ continue;
+ }
+ printf("Y %d: start exon %d body.\n", Y, ExonIndex);
+ // Add back-links for the first AA in the exon:
+ ExonOffsets[ExonIndex] = Y;
+ AALinkCount = 0;
+ for (LinkIndex = 0; LinkIndex < Exon->BackEdgeCount; LinkIndex++)
+ {
+ Edge = Exon->BackwardEdges + LinkIndex;
+ if (Edge->AA)
+ {
+ XALinkRowToRow(PrevY, Y, ExonEdgeOffsets[ExonIndex] + AALinkCount);
+ printf("ExonEdge offset %d. Link %d goes to AARow %d\n", ExonEdgeOffsets[ExonIndex], LinkIndex, PrevY[Y]->Value);
+ AALinkCount++;
+ }
+ else
+ {
+ XALinkRowToExon(Gene, PrevY, Y, Edge->Exon->Index, ExonOffsets, ExonEdgeOffsets);
+ printf("Y %d: start exon %d. Link %d goes to exon row %d\n", Y, ExonIndex, LinkIndex, PrevY[Y]->Value);
+ }
+ }
+ // Add one row for each AA in the exon proper:
+ for (Pos = 0; Pos < Exon->Length; Pos++)
+ {
+ sprintf(YRowInfo[Y], "X%d pos %d/%d", ExonIndex, Pos, Exon->Length);
+ YSequence[Y] = Exon->Sequence[Pos];
+ if (Pos)
+ {
+ NewINode = (IntNode*)calloc(1, sizeof(IntNode));
+ NewINode->Value = Y-1;
+ PrevY[Y] = NewINode;
+ }
+ //printf("%d %c pos %d in exon #%d\n", Y, YSequence[Y], Pos, Exon->Index);
+ Y++;
+ }
+ }
+}
+
+// Stub: no sorting is performed here; exons are used in their stored order.
+void SortGeneExons(GeneStruct* Gene)
+{
+ return;
+}
+
+// ExonGraphAlign extends the Smith-Waterman alignment algorithm to
+// handle local alignment of a sequence with an exon graph. The scoring
+// matrix (such as BLOSUM) can be specified, or Hamming distance can be
+// used. Gap penalties should also be specified. The function will
+// return the score of the best alignment, and (optionally) set 'verbose'
+// strings specifying the alignment itself, like this:
+// EAM--APK
+// *** * *
+// EAMCGARK
+// The data structure we use in implementing this alignment is a grid
+// (stored as an array), where each row has a linked list of zero or more
+// nodes specifying the legal predecessor rows.
+int AlignSequenceAgainstExonGraph(GeneStruct* Gene, char* Sequence,
+ char* ScoringMatrixFileName, int StartGapPenalty, int ExtendGapPenalty)
+{
+ int* ScoreTable;
+ int* NextX;
+ int* NextY;
+ int* NextZ;
+ int X;
+ int Y;
+ int Z;
+ IntNode** PrevY;
+ int TableIndex;
+ int PrevTableIndex;
+ int TableSize;
+ int* ScoringMatrix = NULL;
+ int* ExonOffsets;
+ int* ExonEdgeOffsets;
+ int ExonCount = 0;
+ int LinkCount = 0;
+ int SequenceLength;
+ int MaxY;
+ int AlignScore;
+ int Score;
+ char* YSequence;
+ IntNode* INode;
+ int XBlockSize;
+ int YBlockSize;
+ char ResidueA;
+ char ResidueB;
+ int BestX = 0;
+ int BestY = 0;
+ int BestZ = 0;
+ int AlignStringLength;
+ char* AlignStringA;
+ char* AlignStringB;
+ char* AlignStringC;
+ int BestScore;
+ int StartExonIndex;
+ int NearY;
+ int ExonIterateDir;
+ char** YRowInfo;
+ // Ensure gap penalties are NEGATIVE numbers. Negative is bad.
+ if (StartGapPenalty > 0)
+ {
+ StartGapPenalty = -StartGapPenalty;
+ }
+ if (ExtendGapPenalty > 0)
+ {
+ ExtendGapPenalty = -ExtendGapPenalty;
+ }
+
+ // Load the scoring matrix (or use default hamming distance)
+ if (ScoringMatrixFileName)
+ {
+ ScoringMatrix = LoadScoringMatrix(ScoringMatrixFileName);
+ }
+ if (!ScoringMatrix)
+ {
+ ScoringMatrix = GenerateHammingDistanceMatrix();
+ }
+ SequenceLength = strlen(Sequence);
+ ////////////////////////////////////////////////////////////
+ // Count the exons and edges (with aa):
+ ExonCount = Gene->ExonCount;
+
+ if (Gene->ForwardFlag)
+ {
+ ExonIterateDir = 1;
+ StartExonIndex = 0;
+ }
+ else
+ {
+ ExonIterateDir = -1;
+ StartExonIndex = Gene->ExonCount - 1;
+ }
+ GetExonGraphSize(Gene, &LinkCount, &MaxY);
+ ////////////////////////////////////////////////////////////
+ // Allocate arrays:
+ TableSize = MaxY * SequenceLength * 3;
+ ScoreTable = (int*)calloc(TableSize, sizeof(int));
+ NextX = (int*)calloc(TableSize, sizeof(int));
+ NextY = (int*)calloc(TableSize, sizeof(int));
+ NextZ = (int*)calloc(TableSize, sizeof(int));
+ PrevY = (IntNode**)calloc(MaxY, sizeof(IntNode*));
+ YSequence = (char*)calloc(MaxY, sizeof(char));
+ ExonOffsets = (int*)calloc(ExonCount, sizeof(int));
+ ExonEdgeOffsets = (int*)calloc(ExonCount, sizeof(int));
+ YRowInfo = (char**)calloc(MaxY, sizeof(char*));
+ for (Y = 0; Y < MaxY; Y++)
+ {
+ YRowInfo[Y] = (char*)calloc(64, sizeof(char));
+ }
+ ////////////////////////////////////////////////////////////
+ // Initialize the linked lists giving predecessors at each point.
+ SortGeneExons(Gene);
+ //DebugPrintGene(Gene);
+ FlattenExonsForAlignment(Gene, ExonOffsets, ExonEdgeOffsets, YSequence, PrevY, YRowInfo);
+ //////////////////////////////////////////////////////////////////////
+ // Debug print:
+ for (Y = 0; Y < MaxY; Y++)
+ {
+ printf("%d ", Y);
+ for (NearY = Y - 5; NearY < Y + 6; NearY++)
+ {
+ if (NearY >= 0 && NearY < MaxY)
+ {
+ printf("%c", YSequence[NearY]);
+ }
+ }
+ for (INode = PrevY[Y]; INode; INode = INode->Next)
+ {
+ printf(" ->%d (", INode->Value);
+ for (NearY = INode->Value - 3; NearY < INode->Value; NearY++)
+ {
+ if (NearY >= 0 && NearY < MaxY)
+ {
+ printf("%c", YSequence[NearY]);
+ }
+ }
+ printf(" %c ", YSequence[INode->Value]);
+ for (NearY = INode->Value + 1; NearY < INode->Value + 4; NearY++)
+ {
+ if (NearY >= 0 && NearY < MaxY)
+ {
+ printf("%c", YSequence[NearY]);
+ }
+ }
+ printf(")");
+ }
+ printf("\n");
+ }
+ ////////////////////////////////////////////////////////////
+ // Carry out dynamic programming:
+ XBlockSize = 3;
+ YBlockSize = XBlockSize * SequenceLength;
+ for (Y = 0; Y < MaxY; Y++)
+ {
+ ResidueB = YSequence[Y] - 'A';
+ if (ResidueB < 0 || ResidueB > 26)
+ {
+ ResidueB = 23; //'X';
+ }
+ for (X = 0; X < SequenceLength; X++)
+ {
+ ResidueA = Sequence[X] - 'A';
+ if (ResidueA < 0 || ResidueA > 26)
+ {
+ ResidueA = 23; //'X';
+ }
+ ////////////////////////////
+ // Z == 0, the alignment table:
+ TableIndex = Y*YBlockSize + X*XBlockSize + Z_STANDARD;
+ // Default: Jump in
+ BestScore = 0;
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = -1;
+ NextY[TableIndex] = -1;
+ NextZ[TableIndex] = -1;
+ // Consider aligning:
+ AlignScore = ScoringMatrix[ResidueA * AA_COUNT + ResidueB];
+ // Aligning at the edges of the world is allowed:
+ if (!X || !PrevY[Y])
+ {
+ if (AlignScore > BestScore)
+ {
+ ScoreTable[TableIndex] = AlignScore;
+ BestScore = AlignScore;
+ }
+ }
+ else
+ {
+ // Consider each predecessor row:
+ for (INode = PrevY[Y]; INode; INode = INode->Next)
+ {
+ PrevTableIndex = INode->Value * YBlockSize + (X-1)*XBlockSize + 0;
+ Score = AlignScore + ScoreTable[PrevTableIndex];
+ if (Score > BestScore)
+ {
+ BestScore = Score;
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = X - 1;
+ NextY[TableIndex] = INode->Value;
+ NextZ[TableIndex] = 0;
+ }
+ }
+ }
+ // Consider gapping in x:
+ if (X)
+ {
+ PrevTableIndex = Y * YBlockSize + (X-1) * XBlockSize + Z_GAP_IN_X;
+ Score = StartGapPenalty + ScoreTable[PrevTableIndex];
+ if (Score > BestScore)
+ {
+ BestScore = Score;
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = X - 1;
+ NextY[TableIndex] = Y;
+ NextZ[TableIndex] = Z_GAP_IN_X;
+ }
+ }
+ // Consider gapping in y:
+ for (INode = PrevY[Y]; INode; INode = INode->Next)
+ {
+ PrevTableIndex = INode->Value * YBlockSize + X * XBlockSize + Z_GAP_IN_Y;
+ Score = StartGapPenalty + ScoreTable[PrevTableIndex];
+ if (Score > BestScore)
+ {
+ BestScore = Score;
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = X;
+ NextY[TableIndex] = INode->Value;
+ NextZ[TableIndex] = Z_GAP_IN_Y;
+ }
+ }
+ //printf("At %d, %d, 0: Score %d, prev %d, %d, %d\n", X, Y, ScoreTable[TableIndex],
+ // NextX[TableIndex], NextY[TableIndex], NextZ[TableIndex]);
+ ////////////////////////////
+ // Z=1, gapping in x:
+ // By default, close the gap...but also consider extending it (unless x == 0)
+ TableIndex = Y*YBlockSize + X*XBlockSize + Z_GAP_IN_X;
+ PrevTableIndex = Y*YBlockSize + X*XBlockSize + Z_STANDARD;
+ BestScore = ScoreTable[PrevTableIndex];
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = X;
+ NextY[TableIndex] = Y;
+ NextZ[TableIndex] = Z_STANDARD;
+ if (X)
+ {
+ Score = ExtendGapPenalty + ScoreTable[Y*YBlockSize + (X-1)*XBlockSize + Z_GAP_IN_X];
+ if (Score > BestScore)
+ {
+ BestScore = Score;
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = X - 1;
+ NextY[TableIndex] = Y;
+ NextZ[TableIndex] = Z_GAP_IN_X;
+ }
+ }
+ ////////////////////////////
+ // Z=2, gapping in y:
+ // By default, close the gap...but also consider extending it
+ TableIndex = Y*YBlockSize + X*XBlockSize + Z_GAP_IN_Y;
+ PrevTableIndex = Y*YBlockSize + X*XBlockSize + Z_STANDARD;
+ BestScore = ScoreTable[PrevTableIndex];
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = X;
+ NextY[TableIndex] = Y;
+ NextZ[TableIndex] = Z_STANDARD;
+ for (INode = PrevY[Y]; INode; INode = INode->Next)
+ {
+ Score = ExtendGapPenalty + ScoreTable[INode->Value*YBlockSize + X*XBlockSize + Z_GAP_IN_Y];
+ if (Score > BestScore)
+ {
+ BestScore = Score;
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = X;
+ NextY[TableIndex] = INode->Value;
+ NextZ[TableIndex] = Z_GAP_IN_Y;
+ }
+ }
+ }
+ }
+ ////////////////////////////////////////////////////////////
+ // Find where the best alignment ends:
+ BestScore = -9999;
+ for (X = 0; X < SequenceLength; X++)
+ {
+ for (Y = 0; Y < MaxY; Y++)
+ {
+ for (Z = 0; Z < 3; Z++)
+ {
+ Score = ScoreTable[Y*YBlockSize + X*XBlockSize + Z];
+ if (Score > BestScore)
+ {
+ BestScore = Score;
+ BestX = X;
+ BestY = Y;
+ BestZ = Z;
+ }
+ }
+ }
+ }
+ ////////////////////////////////////////////////////////////
+ // Produce strings for the optimal alignment:
+ X = BestX;
+ Y = BestY;
+ Z = BestZ;
+ AlignStringLength = 0;
+ while (X >= 0)
+ {
+ TableIndex = Y*YBlockSize + X*XBlockSize + Z;
+ // Each step we take will add to the string...except closing a gap.
+ if (!Z || NextZ[TableIndex])
+ {
+ AlignStringLength++;
+ }
+ X = NextX[TableIndex];
+ Y = NextY[TableIndex];
+ Z = NextZ[TableIndex];
+ }
+
+ AlignStringA = (char*)calloc(AlignStringLength + 1, sizeof(char));
+ AlignStringB = (char*)calloc(AlignStringLength + 1, sizeof(char));
+ AlignStringC = (char*)calloc(AlignStringLength + 1, sizeof(char));
+ X = BestX;
+ Y = BestY;
+ Z = BestZ;
+ while (X >= 0)
+ {
+ AlignStringLength--;
+ TableIndex = Y*YBlockSize + X*XBlockSize + Z;
+ switch (Z)
+ {
+ case Z_STANDARD:
+ switch (NextZ[TableIndex])
+ {
+ case Z_STANDARD:
+ default:
+ ResidueA = Sequence[X];
+ ResidueB = YSequence[Y];
+ AlignStringA[AlignStringLength] = Sequence[X];
+ AlignStringC[AlignStringLength] = YSequence[Y];
+ if (ResidueA == ResidueB)
+ {
+ AlignStringB[AlignStringLength] = '*';
+ }
+ else
+ {
+ AlignStringB[AlignStringLength] = ' ';
+ }
+ printf("X %d (%c) Y %d (%c) %s\n", X, ResidueA, Y, ResidueB, YRowInfo[Y]);
+ break;
+ case Z_GAP_IN_X:
+ AlignStringA[AlignStringLength] = Sequence[X];
+ AlignStringB[AlignStringLength] = ' ';
+ AlignStringC[AlignStringLength] = '-';
+ break;
+ case Z_GAP_IN_Y:
+ AlignStringA[AlignStringLength] = '-';
+ AlignStringB[AlignStringLength] = ' ';
+ AlignStringC[AlignStringLength] = YSequence[Y];
+ break;
+ }
+ break;
+ case Z_GAP_IN_X:
+ if (NextZ[TableIndex])
+ {
+ AlignStringA[AlignStringLength] = Sequence[X];
+ AlignStringB[AlignStringLength] = ' ';
+ AlignStringC[AlignStringLength] = '-';
+ }
+ break;
+ case Z_GAP_IN_Y:
+ if (NextZ[TableIndex])
+ {
+ AlignStringA[AlignStringLength] = '-';
+ AlignStringB[AlignStringLength] = ' ';
+ AlignStringC[AlignStringLength] = YSequence[Y];
+ }
+ break;
+ }
+
+ // Each step we take will add to the string...except closing a gap.
+ if (Z && !NextZ[TableIndex])
+ {
+ AlignStringLength++;
+ }
+ X = NextX[TableIndex];
+ Y = NextY[TableIndex];
+ Z = NextZ[TableIndex];
+ }
+ printf("Alignment score %d. Alignment follows:\n", BestScore);
+ printf("%s\n", AlignStringA);
+ printf("%s\n", AlignStringB);
+ printf("%s\n", AlignStringC);
+
+ ////////////////////////////////////////////////////////////
+ // cleanup:
+ SafeFree(ScoringMatrix);
+ SafeFree(ScoreTable);
+ SafeFree(ExonOffsets);
+ SafeFree(NextX);
+ SafeFree(NextY);
+ SafeFree(NextZ);
+ SafeFree(YSequence);
+ SafeFree(ExonEdgeOffsets);
+ SafeFree(AlignStringA);
+ SafeFree(AlignStringB);
+ SafeFree(AlignStringC);
+ FreePrevCellTable(PrevY, MaxY);
+ for (Y = 0; Y < MaxY; Y++)
+ {
+ SafeFree(YRowInfo[Y]);
+ }
+ SafeFree(YRowInfo);
+
+ return BestScore;
+}
+
+// Free an array of linked-lists providing predecessor cells
+void FreePrevCellTable(IntNode** PrevCell, int Size)
+{
+ int Index;
+ IntNode* Node;
+ IntNode* Prev;
+ if (!PrevCell)
+ {
+ return;
+ }
+ for (Index = 0; Index < Size; Index++)
+ {
+ Prev = NULL;
+ for (Node = PrevCell[Index]; Node; Node = Node->Next)
+ {
+ SafeFree(Prev);
+ Prev = Node;
+ }
+ SafeFree(Prev);
+ }
+ SafeFree(PrevCell);
+}
+
+
+// Align an exon graph against another exon graph.
+int AlignExonGraphAgainstExonGraph(GeneStruct* GeneA, GeneStruct* GeneB,
+ char* ScoringMatrixFileName, int StartGapPenalty, int ExtendGapPenalty)
+{
+ int MaxX;
+ int MaxY;
+ int LinkCountA;
+ int LinkCountB;
+ int* ScoringMatrix = NULL;
+ int ExonCountA;
+ int ExonCountB;
+ int* ScoreTable;
+ int* NextX;
+ int* NextY;
+ int* NextZ;
+ IntNode** PrevX;
+ IntNode** PrevY;
+ char* XSequence;
+ char* YSequence;
+ int* ExonOffsetsA;
+ int* ExonOffsetsB;
+ int* ExonEdgeOffsetsA;
+ int* ExonEdgeOffsetsB;
+ int XBlockSize;
+ int YBlockSize;
+ IntNode* PrevNodeX;
+ IntNode* PrevNodeY;
+ int TableSize;
+ int X;
+ int Y;
+ int Z;
+ int BestX = 0;
+ int BestY = 0;
+ int BestZ = 0;
+ char ResidueA;
+ char ResidueB;
+ int Score;
+ int AlignScore;
+ int BestScore;
+ char* AlignStringA;
+ char* AlignStringB;
+ char* AlignStringC;
+ int TableIndex;
+ int PrevTableIndex;
+ int AlignStringLength;
+ char** RowInfoA;
+ char** RowInfoB;
+ // Ensure gap penalties are NEGATIVE numbers. Negative is bad.
+ if (StartGapPenalty > 0)
+ {
+ StartGapPenalty = -StartGapPenalty;
+ }
+ if (ExtendGapPenalty > 0)
+ {
+ ExtendGapPenalty = -ExtendGapPenalty;
+ }
+
+ // Load the scoring matrix (or use default hamming distance)
+
+ if (ScoringMatrixFileName)
+ {
+ ScoringMatrix = LoadScoringMatrix(ScoringMatrixFileName);
+ }
+ if (!ScoringMatrix)
+ {
+ ScoringMatrix = GenerateHammingDistanceMatrix();
+ }
+ printf("\n\nGene A:\n");
+ //DebugPrintGene(GeneA);
+ printf("\n\nGene B:\n");
+ //DebugPrintGene(GeneB);
+
+ ////////////////////////////////////////////////////////////
+ // Count the exons and edges (with aa):
+ ExonCountA = GeneA->ExonCount;
+ ExonCountB = GeneB->ExonCount;
+
+ GetExonGraphSize(GeneA, &LinkCountA, &MaxX);
+ GetExonGraphSize(GeneB, &LinkCountB, &MaxY);
+ ////////////////////////////////////////////////////////////
+ // Allocate arrays:
+ TableSize = MaxY * MaxX * 3;
+ ScoreTable = (int*)calloc(TableSize, sizeof(int));
+ NextX = (int*)calloc(TableSize, sizeof(int));
+ NextY = (int*)calloc(TableSize, sizeof(int));
+ NextZ = (int*)calloc(TableSize, sizeof(int));
+ XSequence = (char*)calloc(MaxX + 1, sizeof(char));
+ PrevX = (IntNode**)calloc(MaxX, sizeof(IntNode*));
+ YSequence = (char*)calloc(MaxY + 1, sizeof(char));
+ PrevY = (IntNode**)calloc(MaxY, sizeof(IntNode*));
+ ExonOffsetsA = (int*)calloc(ExonCountA, sizeof(int));
+ ExonEdgeOffsetsA = (int*)calloc(ExonCountA, sizeof(int));
+ ExonOffsetsB = (int*)calloc(ExonCountB, sizeof(int));
+ ExonEdgeOffsetsB = (int*)calloc(ExonCountB, sizeof(int));
+ RowInfoA = (char**)calloc(MaxX, sizeof(char*));
+ RowInfoB = (char**)calloc(MaxY, sizeof(char*));
+ for (X = 0; X < MaxX; X++)
+ {
+ RowInfoA[X] = (char*)calloc(64, sizeof(char));
+ }
+ for (Y = 0; Y < MaxY; Y++)
+ {
+ RowInfoB[Y] = (char*)calloc(64, sizeof(char));
+ }
+
+ ////////////////////////////////////////////////////////////
+ // Initialize the linked lists giving predecessors at each point.
+ SortGeneExons(GeneA);
+ SortGeneExons(GeneB);
+ FlattenExonsForAlignment(GeneA, ExonOffsetsA, ExonEdgeOffsetsA, XSequence, PrevX, RowInfoA);
+ FlattenExonsForAlignment(GeneB, ExonOffsetsB, ExonEdgeOffsetsB, YSequence, PrevY, RowInfoB);
+ ////////////////////////////////////////////////////////////
+ // Carry out dynamic programming:
+ XBlockSize = 3;
+ YBlockSize = XBlockSize * MaxX;
+ for (Y = 0; Y < MaxY; Y++)
+ {
+ ResidueB = YSequence[Y] - 'A';
+ if (ResidueB < 0 || ResidueB > 26)
+ {
+ ResidueB = 23; //'X';
+ }
+ for (X = 0; X < MaxX; X++)
+ {
+ ResidueA = XSequence[X] - 'A';
+ if (ResidueA < 0 || ResidueA > 26)
+ {
+ ResidueA = 23; //'X';
+ }
+ ////////////////////////////
+ // Z == 0, the alignment table:
+ TableIndex = Y*YBlockSize + X*XBlockSize + Z_STANDARD;
+ // Default: Jump in
+ BestScore = 0;
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = -1;
+ NextY[TableIndex] = -1;
+ NextZ[TableIndex] = -1;
+ // Consider aligning:
+ AlignScore = ScoringMatrix[ResidueA * AA_COUNT + ResidueB];
+ // Aligning at the edges of the world is allowed:
+ if (!PrevX[X] || !PrevY[Y])
+ {
+ if (AlignScore > BestScore)
+ {
+ ScoreTable[TableIndex] = AlignScore;
+ BestScore = AlignScore;
+ }
+ }
+ else
+ {
+ // Consider each predecessor cell (x, y):
+ for (PrevNodeX = PrevX[X]; PrevNodeX; PrevNodeX = PrevNodeX->Next)
+ {
+ for (PrevNodeY = PrevY[Y]; PrevNodeY; PrevNodeY = PrevNodeY->Next)
+ {
+ PrevTableIndex = PrevNodeY->Value * YBlockSize + PrevNodeX->Value * XBlockSize + 0;
+ Score = AlignScore + ScoreTable[PrevTableIndex];
+ if (Score > BestScore)
+ {
+ BestScore = Score;
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = PrevNodeX->Value;
+ NextY[TableIndex] = PrevNodeY->Value;
+ NextZ[TableIndex] = 0;
+ }
+ }
+ }
+ }
+ // Consider gapping in x:
+ for (PrevNodeX = PrevX[X]; PrevNodeX; PrevNodeX = PrevNodeX->Next)
+ {
+ PrevTableIndex = Y * YBlockSize + PrevNodeX->Value * XBlockSize + Z_GAP_IN_X;
+ Score = StartGapPenalty + ScoreTable[PrevTableIndex];
+ if (Score > BestScore)
+ {
+ BestScore = Score;
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = PrevNodeX->Value;
+ NextY[TableIndex] = Y;
+ NextZ[TableIndex] = Z_GAP_IN_X;
+ }
+ }
+ // Consider gapping in y:
+ for (PrevNodeY = PrevY[Y]; PrevNodeY; PrevNodeY = PrevNodeY->Next)
+ {
+ PrevTableIndex = PrevNodeY->Value * YBlockSize + X * XBlockSize + Z_GAP_IN_Y;
+ Score = StartGapPenalty + ScoreTable[PrevTableIndex];
+ if (Score > BestScore)
+ {
+ BestScore = Score;
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = X;
+ NextY[TableIndex] = PrevNodeY->Value;
+ NextZ[TableIndex] = Z_GAP_IN_Y;
+ }
+ }
+ //printf("At %d, %d, 0: Score %d, prev %d, %d, %d\n", X, Y, ScoreTable[TableIndex],
+ // NextX[TableIndex], NextY[TableIndex], NextZ[TableIndex]);
+ ////////////////////////////
+ // Z=1, gapping in x:
+ // By default, close the gap...but also consider extending it (unless x == 0)
+ TableIndex = Y*YBlockSize + X*XBlockSize + Z_GAP_IN_X;
+ PrevTableIndex = Y*YBlockSize + X*XBlockSize + Z_STANDARD;
+ BestScore = ScoreTable[PrevTableIndex];
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = X;
+ NextY[TableIndex] = Y;
+ NextZ[TableIndex] = Z_STANDARD;
+ for (PrevNodeX = PrevX[X]; PrevNodeX; PrevNodeX = PrevNodeX->Next)
+ {
+ Score = ExtendGapPenalty + ScoreTable[Y*YBlockSize + PrevNodeX->Value * XBlockSize + Z_GAP_IN_X];
+ if (Score > BestScore)
+ {
+ BestScore = Score;
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = PrevNodeX->Value;
+ NextY[TableIndex] = Y;
+ NextZ[TableIndex] = Z_GAP_IN_X;
+ }
+ }
+ ////////////////////////////
+ // Z=2, gapping in y:
+ // By default, close the gap...but also consider extending it
+ TableIndex = Y*YBlockSize + X*XBlockSize + Z_GAP_IN_Y;
+ PrevTableIndex = Y*YBlockSize + X*XBlockSize + Z_STANDARD;
+ BestScore = ScoreTable[PrevTableIndex];
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = X;
+ NextY[TableIndex] = Y;
+ NextZ[TableIndex] = Z_STANDARD;
+ for (PrevNodeY = PrevY[Y]; PrevNodeY; PrevNodeY = PrevNodeY->Next)
+ {
+ Score = ExtendGapPenalty + ScoreTable[PrevNodeY->Value*YBlockSize + X*XBlockSize + Z_GAP_IN_Y];
+ if (Score > BestScore)
+ {
+ BestScore = Score;
+ ScoreTable[TableIndex] = BestScore;
+ NextX[TableIndex] = X;
+ NextY[TableIndex] = PrevNodeY->Value;
+ NextZ[TableIndex] = Z_GAP_IN_Y;
+ }
+ }
+ }
+ }
+ ////////////////////////////////////////////////////////////
+ // Find where the best alignment ends:
+ BestScore = -9999;
+ for (X = 0; X < MaxX; X++)
+ {
+ for (Y = 0; Y < MaxY; Y++)
+ {
+ for (Z = 0; Z < 3; Z++)
+ {
+ Score = ScoreTable[Y*YBlockSize + X*XBlockSize + Z];
+ if (Score > BestScore)
+ {
+ BestScore = Score;
+ BestX = X;
+ BestY = Y;
+ BestZ = Z;
+ }
+ }
+ }
+ }
+ ////////////////////////////////////////////////////////////
+ // Produce strings for the optimal alignment:
+ X = BestX;
+ Y = BestY;
+ Z = BestZ;
+ AlignStringLength = 0;
+ while (X >= 0)
+ {
+ TableIndex = Y*YBlockSize + X*XBlockSize + Z;
+ // Each step we take will add to the string...except closing a gap.
+ if (!Z || NextZ[TableIndex])
+ {
+ AlignStringLength++;
+ }
+ X = NextX[TableIndex];
+ Y = NextY[TableIndex];
+ Z = NextZ[TableIndex];
+ }
+
+ AlignStringA = (char*)calloc(AlignStringLength + 1, sizeof(char));
+ AlignStringB = (char*)calloc(AlignStringLength + 1, sizeof(char));
+ AlignStringC = (char*)calloc(AlignStringLength + 1, sizeof(char));
+ X = BestX;
+ Y = BestY;
+ Z = BestZ;
+ while (X >= 0)
+ {
+ AlignStringLength--;
+ TableIndex = Y*YBlockSize + X*XBlockSize + Z;
+ switch (Z)
+ {
+ case Z_STANDARD:
+ switch (NextZ[TableIndex])
+ {
+ case Z_STANDARD:
+ default:
+ ResidueA = XSequence[X];
+ ResidueB = YSequence[Y];
+ AlignStringA[AlignStringLength] = ResidueA;
+ AlignStringC[AlignStringLength] = ResidueB;
+ if (ResidueA == ResidueB)
+ {
+ AlignStringB[AlignStringLength] = '*';
+ }
+ else
+ {
+ AlignStringB[AlignStringLength] = ' ';
+ }
+ break;
+ case Z_GAP_IN_X:
+ AlignStringA[AlignStringLength] = XSequence[X];
+ AlignStringB[AlignStringLength] = ' ';
+ AlignStringC[AlignStringLength] = '-';
+ break;
+ case Z_GAP_IN_Y:
+ AlignStringA[AlignStringLength] = '-';
+ AlignStringB[AlignStringLength] = ' ';
+ AlignStringC[AlignStringLength] = YSequence[Y];
+ break;
+ }
+ break;
+ case Z_GAP_IN_X:
+ if (NextZ[TableIndex])
+ {
+ AlignStringA[AlignStringLength] = XSequence[X];
+ AlignStringB[AlignStringLength] = ' ';
+ AlignStringC[AlignStringLength] = '-';
+ }
+ break;
+ case Z_GAP_IN_Y:
+ if (NextZ[TableIndex])
+ {
+ AlignStringA[AlignStringLength] = '-';
+ AlignStringB[AlignStringLength] = ' ';
+ AlignStringC[AlignStringLength] = YSequence[Y];
+ }
+ break;
+ }
+
+ // Each step we take will add to the string...except closing a gap.
+ if (Z && !NextZ[TableIndex])
+ {
+ AlignStringLength++;
+ }
+ X = NextX[TableIndex];
+ Y = NextY[TableIndex];
+ Z = NextZ[TableIndex];
+ }
+ printf("Alignment score %d. Alignment follows:\n", BestScore);
+ printf("%s\n", AlignStringA);
+ printf("%s\n", AlignStringB);
+ printf("%s\n", AlignStringC);
+
+ ////////////////////////////////////////////////////////////
+ // cleanup:
+ SafeFree(ScoringMatrix);
+ SafeFree(ScoreTable);
+ SafeFree(ExonOffsetsA);
+ SafeFree(ExonOffsetsB);
+ SafeFree(NextX);
+ SafeFree(NextY);
+ SafeFree(NextZ);
+ SafeFree(YSequence);
+ SafeFree(XSequence);
+ SafeFree(ExonEdgeOffsetsA);
+ SafeFree(ExonEdgeOffsetsB);
+ if (AlignStringA)
+ {
+ SafeFree(AlignStringA);
+ SafeFree(AlignStringB);
+ SafeFree(AlignStringC);
+ }
+ FreePrevCellTable(PrevY, MaxY);
+ FreePrevCellTable(PrevX, MaxX);
+ for (Y = 0; Y < MaxY; Y++)
+ {
+ SafeFree(RowInfoB[Y]);
+ }
+ SafeFree(RowInfoB);
+ for (X = 0; X < MaxX; X++)
+ {
+ SafeFree(RowInfoA[X]);
+ }
+ SafeFree(RowInfoA);
+ return BestScore;
+}
+
+
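The comments above describe the one place where this alignment departs from standard Smith-Waterman: each row of the flattened exon graph may have several predecessor rows, so the recurrence maximizes over a list of previous rows rather than a single fixed neighbor. The short Python sketch below is an illustration only, not part of the imported sources; align_to_graph and its arguments are hypothetical names, a single linear gap penalty replaces the affine gaps, and only the best local score is returned (the C code above additionally reconstructs the aligned strings).

def align_to_graph(sequence, graph_residues, predecessors,
                   match=10, mismatch=-10, gap=-10):
    # graph_residues[y] is the residue stored in row y of the flattened graph;
    # predecessors[y] lists the rows that may directly precede row y (several
    # entries where exons join, none at the start of the graph).
    rows, cols = len(graph_residues), len(sequence)
    score = [[0] * cols for _ in range(rows)]
    best = 0
    for y in range(rows):
        for x in range(cols):
            diag = match if sequence[x] == graph_residues[y] else mismatch
            cell = max(0, diag)                      # start a fresh local alignment
            for py in predecessors[y]:
                if x > 0:
                    cell = max(cell, score[py][x - 1] + diag)  # align both residues
                cell = max(cell, score[py][x] + gap)           # gap in the sequence
            if x > 0:
                cell = max(cell, score[y][x - 1] + gap)        # gap in the graph
            score[y][x] = cell
            best = max(best, cell)
    return best

# Example: rows 0-2 spell "ACD"; rows 3 ("E") and 4 ("Q") are alternative
# successors of row 2, and row 5 ("K") may follow either branch:
# align_to_graph("ACEK", "ACDEQK", [[], [0], [1], [2], [2], [3, 4]])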
diff --git a/ExonGraphAlign.h b/ExonGraphAlign.h
new file mode 100644
index 0000000..1143825
--- /dev/null
+++ b/ExonGraphAlign.h
@@ -0,0 +1,40 @@
+//Title: ExonGraphAlign.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+#ifndef EXON_GRAPH_ALIGN_H
+#define EXON_GRAPH_ALIGN_H
+
+
+
+int AlignSequenceAgainstExonGraph(GeneStruct* Gene, char* Sequence,
+ char* ScoringMatrixFileName, int StartGapPenalty, int ExtendGapPenalty);
+#endif // EXON_GRAPH_ALIGN_H
+
diff --git a/ExplainPTMs.py b/ExplainPTMs.py
new file mode 100644
index 0000000..bf2714e
--- /dev/null
+++ b/ExplainPTMs.py
@@ -0,0 +1,148 @@
+#Title: ExplainPTMs.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+Once an unrestrictive PTM search has completed, attempt to suggest
+possible (bio)chemical explanations for the modifications seen.
+"""
+import Global
+from Utils import *
+Initialize()
+
+# Mass delta (in daltons) -> list of PTMs
+AllPTMsByMass = {}
+InitializeFlag = 0
+
+def FixQuotedString(String):
+ if String and String[0] == '"' and String[-1] == '"':
+ return String[1:-1]
+ return String
+
+class PTMClass:
+ def __init__(self, SourceDB, DBID, Name, Residues, Mass):
+ self.SourceDB = SourceDB
+ self.DBID = DBID
+ self.Name = FixQuotedString(Name)
+ self.Residues = Residues
+ if not self.Residues:
+ self.Residues = None # specific to a terminus, not a residue.
+ self.Mass = Mass
+ self.Terminus = "" # valid values: "C", "N", ""
+ def __str__(self):
+ return self.Name
+ def GetURL(self):
+ if self.SourceDB.lower() == "unimod":
+ return "http://www.unimod.org/cgi/unimod.cgi?record_id=%s&display_details_view.x=7&display_details_view.y=5&display_details_view=on"%self.DBID
+ else:
+ return None
+ def GetNameWithLink(self):
+ URL = self.GetURL()
+ if URL:
+ return "<a href=\"%s\">%s</a>"%(URL, self.Name)
+ else:
+ return self.Name
+
+def LoadPTMDatabase():
+ global InitializeFlag
+ global AllPTMsByMass
+ if InitializeFlag:
+ return
+ InitializeFlag = 1
+ File = open("PTMDatabase.txt", "rb")
+ for FileLine in File.xreadlines():
+ Bits = FileLine.split("\t")
+ if FileLine[0] == "#" or len(Bits) < 5:
+ continue
+ try:
+ Mass = int(round(float(Bits[2])))
+ except:
+ # No valid mass? Probably a blank line.
+ continue
+ PTM = PTMClass(Bits[0], Bits[1], Bits[3], Bits[4], Mass)
+ if len(Bits) > 5 and Bits[5]:
+ Terminus = Bits[5][0]
+ if Terminus in ("C", "N"):
+ PTM.Terminus = Terminus
+ if not AllPTMsByMass.has_key(Mass):
+ AllPTMsByMass[Mass] = []
+ AllPTMsByMass[Mass].append(PTM)
+ File.close()
+
+def GetExplanation(AA, Mass, Terminus, BasePTM = 0):
+ """
+ Look for a known PTM that matches this residue, delta mass, and terminus.
+ If we don't find any such PTM, then look for a point mutation matching the
+ mass shift. The output of this function is an initial hypothesis, and requires
+ verification.
+ """
+ AllResidues = "ACDEFGHIKLMNPQRSTVWY"
+ LoadPTMDatabase()
+ Explanations = []
+ # If there's a base modification applied to this residue, then we should
+ # handle that case specially. Example: On cysteine, "-57" is a missing protecting
+ # group, and "-43" is a methylation!
+ if BasePTM:
+ if Mass == -BasePTM:
+ PTM = PTMClass("ProtectingGroup", None, "Missing %+d fixed mod"%BasePTM, AA, Mass)
+ Explanations.append(PTM)
+ return Explanations
+ Mass = Mass + BasePTM
+ PTMList = AllPTMsByMass.get(Mass, [])
+ for PTM in PTMList:
+ PTMOK = 0
+ if PTM.Residues == None:
+ if Terminus == PTM.Terminus:
+ #Explanations.append(PTM)
+ PTMOK = 1
+ elif (AA != None) and (AA in PTM.Residues):
+ if Terminus == PTM.Terminus or PTM.Terminus == "":
+ PTMOK = 1
+ #Explanations.append(PTM)
+ if not PTMOK:
+ continue
+ # Don't add multiple explanations with the same name! (There's some redundancy)
+ for OldExplanation in Explanations:
+ if OldExplanation.Name == PTM.Name:
+ PTMOK = 0
+ break
+ if PTMOK:
+ Explanations.append(PTM)
+ # Perhaps we can explain it with a mutation:
+ if AA != None:
+ for OtherAA in AllResidues:
+ Delta = Global.AminoMass[OtherAA] - Global.AminoMass[AA]
+ if abs(Delta - Mass) < 1.0:
+ PTM = PTMClass("Mutation", None, "Mutation from %s to %s"%(AA, OtherAA), AA, Delta)
+ Explanations.append(PTM)
+ return Explanations
+
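As a minimal usage sketch of the module above (an illustration only, not part of the imported sources, and assuming it runs from the Inspect directory so that PTMDatabase.txt and the Utils/Global helpers are importable), GetExplanation can be queried directly with a residue, an integer mass delta, and a terminus flag; the exact explanations returned depend on the PTM database shipped with this release.

import ExplainPTMs

# Ask for explanations of a +80 Da shift on serine with no terminal specificity;
# a phosphorylation entry would typically appear here if PTMDatabase.txt lists
# one, alongside any point mutations whose mass delta falls within 1 Da.
for Explanation in ExplainPTMs.GetExplanation("S", 80, ""):
    print("%s (%+d Da)" % (Explanation.Name, Explanation.Mass))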
diff --git a/FDRUtils.py b/FDRUtils.py
new file mode 100644
index 0000000..722a132
--- /dev/null
+++ b/FDRUtils.py
@@ -0,0 +1,1109 @@
+#Title: FDRUtils.py (formerly PValue.py)
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+This script, based on PeptideProphet, computes the estimated probability that
+a match is correct. This probability is derived from an F-Score. The F-Score
+for a match is a weighted sum of the length-corrected MQScore and the delta score.
+
+We fit the distribution of F-scores as a mixture of two distributions:
+A GAMMA DISTRIBUTION for false matches (with lower mean)
+A NORMAL DISTRIBUTION for true matches (with higher mean)
+Therefore, the probability that a match with a given F-Score is correct depends
+upon the overall distribution of F-Scores for the rest of the run.
+
+============================================================================
+P-values for searches with shuffled-database:
+
+Given a score cutoff, let the number of valid-protein hits above the
+cutoff be V, and let the number of invalid-protein hits above the
+cutoff be I.
+
+# PVALUE WITH REMOVAL:
+Throw out the I hits from invalid proteins.
+Let TDB and FDB be the true and false database fractions. (Note: For
+a 1:1 database, TDB and FDB are both 0.5, so TDB/FDB equals 1.0) Even
+after throwing out hits from invalid proteins, there are still some
+chance hits to true proteins. The estimated number of hits from V
+that are actually false is equal to: I*(TDB/FDB)
+
+Thus, the odds that a match above this score cutoff is correct:
+(V - I*(TDB/FDB)) / V
+
+This formulation of the p-value is the one normally used (e.g. in an
+unmodified search). Normally, we have no reason to keep any matches
+to shuffled proteins; we only generate them in the first place so that
+we can count them.
+
+# PVALUE WITHOUT REMOVAL (-H command-line option):
+Retain all hits. As above, the number of
+hits from V that are false is I*(TDB/FDB). Thus, the odds that a
+match above the score cutoff is correct:
+(V - I*(TDB/FDB)) / (I+V)
+
+"""
+import os
+import sys
+import random
+import math
+import getopt
+import traceback
+import struct
+import ResultsParser
+import SelectProteins
+import Learning
+from Utils import *
+Initialize()
+
+try:
+ from PIL import Image
+ from PIL import ImageDraw
+ from PIL import ImageFont
+ # Fonts don't seem to work on Linux. (Tried pdf, pcf, and pil formats...but no luck)
+ # So, we'll content ourselves with a default font if we must:
+ try:
+ TheFont = ImageFont.truetype("Times.ttf", 12)
+ except:
+ TheFont = ImageFont.load_default()
+except:
+ print "(PIL not installed - image generation not available)"
+ Image = None
+
+class Colors:
+ White = (255, 255, 255)
+ Grey = (155, 155, 155)
+ Background = (255, 255, 255)
+ Black = (0, 0, 0)
+ Green = (0, 155, 0)
+ Red = (155, 0, 0)
+ Blue = (0, 0, 155)
+
+class Defaults:
+ "Default F-score distribution; a starting point for E/M model fitting."
+ MeanTrue = 4.48
+ VarianceTrue = 1.50
+ MeanFalse = 0.19
+ VarianceFalse = 0.17
+ PriorProbabilityTrue = 0.25
+ GammaOffset = 0.3
+ MQScoreWeight = 0.3
+ DeltaScoreWeight = 1.5
+ ###########################
+ BlindMeanTrue = 5.0
+ BlindVarianceTrue = 11.8
+ BlindMeanFalse = -0.8
+ BlindVarianceFalse = 0.7
+ BlindPriorProbabilityTrue = 0.18
+ BlindGammaOffset = 6.0
+ BlindMQScoreWeight = 0.3
+ BlindDeltaScoreWeight = 1.5
+
+BLIND_MOD_PENALTY = 1.0
+MIN_MQSCORE = -10.0
+
+# Parse the scores from at most this many output files.
+MAX_RESULTS_FILES_TO_PARSE = 100
+
+BIN_MULTIPLIER = 10.0
+SQRT2PI = math.sqrt(2 * math.pi)
+
+Cof = [76.18009172947146, -86.50532032941677,
+ 24.01409824083091, -1.231739572450155,
+ 0.1208650973866179e-2, -0.5395239384952e-5]
+
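+# Gamma(Z) evaluates the gamma function via the Lanczos approximation: Cof holds
+# the standard Lanczos series coefficients, and the result is the exponential of
+# the usual log-gamma ("gammln") form. As a sanity check, Gamma(5.0) is
+# approximately 24.0 and Gamma(0.5) is approximately sqrt(pi).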
+def Gamma(Z):
+ X = Z
+ Y = Z
+ Temp = X + 5.5
+ Temp -= (X + 0.5) * math.log(Temp)
+ Ser = 1.000000000190015
+ for J in range(6):
+ Y += 1
+ Ser += Cof[J] / Y
+ Z = -Temp + math.log(2.5066282746310005 * Ser / X)
+ return math.exp(Z)
+
+class Bag:
+ pass
+
+class PValueParser(ResultsParser.ResultsParser):
+ def __init__(self):
+ self.RetainBadMatches = 0
+ self.LoadDistributionPath = None
+ self.ScoreHistogram2 = {}
+ self.ScoreHistogram3 = {}
+ self.ShuffledScoreHistogram2 = {}
+ self.ShuffledScoreHistogram3 = {}
+ self.MinimumPeptideLength = 7
+ self.VerboseFlag = 0
+ self.GenerateImageFlag = 0
+ self.MQScoreWeight = Defaults.MQScoreWeight
+ self.DeltaScoreWeight = Defaults.DeltaScoreWeight
+ self.GammaOffset = Defaults.GammaOffset
+ self.BlindFlag = 0
+ self.PValueCutoff = 0.1 # default
+ # aminos -> location list
+ self.PeptideDict = {}
+ self.MaxDeltaScoreGap = -3.5
+ self.DBPath = []
+ self.PerformProteinSelection = 0
+ self.ProteinPicker = None
+ self.WriteTopMatchOnly = 0
+ self.ShuffledDatabaseFraction = None
+ self.RemoveShuffledMatches = 1
+ # Overwrite existing files in -w target:
+ self.OverwriteNewScoresFlag = 1
+ self.ClusterInfoPath = None
+ self.Columns = ResultsParser.Columns()
+ ResultsParser.ResultsParser.__init__(self)
+ def ReadDeltaScoreDistribution(self, FilePath):
+ """
+ Read delta-scores from a file, to compute the average delta-score.
+ If passed a directory, iterate over all results files in the directory.
+ """
+ #
+ self.AllSpectrumCount2 = 0
+ self.AllSpectrumCount3 = 0
+ self.MeanDeltaScore2 = 0
+ self.MeanDeltaScore3 = 0
+ self.ProcessResultsFiles(FilePath, self.ReadDeltaScoreDistributionFromFile, MAX_RESULTS_FILES_TO_PARSE)
+ self.MeanDeltaScore2 /= max(1, self.AllSpectrumCount2)
+ self.MeanDeltaScore3 /= max(1, self.AllSpectrumCount3)
+ if self.VerboseFlag:
+ print "Mean delta score ch1..2: %s over %s spectra"%(self.MeanDeltaScore2, self.AllSpectrumCount2)
+ print "Mean delta score ch3: %s over %s spectra"%(self.MeanDeltaScore3, self.AllSpectrumCount3)
+ if not self.MeanDeltaScore2:
+ self.MeanDeltaScore2 = 0.001
+ if not self.MeanDeltaScore3:
+ self.MeanDeltaScore3 = 0.001
+ def ReadDeltaScoreDistributionFromFile(self, FilePath):
+ "Read delta-scores from a single file, to compute the average delta-score."
+ print "Read delta-score distribution from %s..."%FilePath
+ try:
+ File = open(FilePath, "rb")
+ except:
+ traceback.print_exc()
+ return
+ OldSpectrum = None
+ for FileLine in File.xreadlines():
+ # Skip header lines and blank lines
+ if FileLine[0] == "#":
+ self.Columns.initializeHeaders(FileLine)
+ continue
+ if not FileLine.strip():
+ continue
+ Bits = list(FileLine.split("\t"))
+ if len(Bits) <= self.Columns.getIndex("DeltaScore"):
+ continue
+ try:
+ Charge = int(Bits[self.Columns.getIndex("Charge")])
+ MQScore = float(Bits[self.Columns.getIndex("MQScore")])
+ DeltaScore = float(Bits[self.Columns.getIndex("DeltaScoreOther")])
+ Peptide = GetPeptideFromModdedName(Bits[self.Columns.getIndex("Annotation")])
+ Spectrum = (os.path.basename(Bits[self.Columns.getIndex("SpectrumFile")]), Bits[self.Columns.getIndex("Scan#")])
+ except:
+ traceback.print_exc()
+ print Bits
+ continue # header line
+ if Spectrum == OldSpectrum:
+ continue
+
+ OldSpectrum = Spectrum
+
+ Length = len(Peptide.Aminos)
+ if Length < self.MinimumPeptideLength:
+ continue
+ if DeltaScore < 0:
+ print "## Warning: DeltaScore < 0!", Spectrum, FilePath
+ print DeltaScore
+ print MQScore
+ print Bits
+ raw_input()
+ continue
+ if Charge < 3:
+ self.AllSpectrumCount2 += 1
+ self.MeanDeltaScore2 += DeltaScore
+
+ else:
+ self.AllSpectrumCount3 += 1
+ self.MeanDeltaScore3 += DeltaScore
+ File.close()
+ def ReadScoreDistributionFromFile(self, FilePath):
+ """
+ Read F-scores from a single file, to compute the score histogram.
+ """
+ print "Read score distribution from %s..."%FilePath
+ try:
+ File = open(FilePath, "rb")
+ except:
+ traceback.print_exc()
+ return
+ OldSpectrum = None
+ for FileLine in File.xreadlines():
+ # Skip header lines and blank lines
+ if FileLine[0] == "#":
+ self.Columns.initializeHeaders(FileLine)
+ continue
+ if not FileLine.strip():
+ continue
+ Bits = list(FileLine.split("\t"))
+ try:
+ Charge = int(Bits[self.Columns.getIndex("Charge")])
+ MQScore = float(Bits[self.Columns.getIndex("MQScore")])
+ DeltaScore = float(Bits[self.Columns.getIndex("DeltaScore")])
+ Peptide = GetPeptideFromModdedName(Bits[self.Columns.getIndex("Annotation")])
+ Protein = Bits[self.Columns.getIndex("ProteinName")]
+ Spectrum = (Bits[self.Columns.getIndex("SpectrumFile")], Bits[self.Columns.getIndex("Scan#")])
+ except:
+ continue # header line
+ if Spectrum == OldSpectrum:
+ continue
+ OldSpectrum = Spectrum
+ Length = len(Peptide.Aminos)
+ if Length < self.MinimumPeptideLength:
+ continue
+ if (Charge < 3):
+ MeanDeltaScore = self.MeanDeltaScore2
+ else:
+ MeanDeltaScore = self.MeanDeltaScore3
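+ # F-score for this match: a weighted combination of the raw MQScore and the
+ # delta score normalized by the run-wide mean delta score for this charge state.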
+ WeightedScore = self.MQScoreWeight * MQScore + self.DeltaScoreWeight * (DeltaScore / MeanDeltaScore)
+ ScoreBin = int(round(WeightedScore * BIN_MULTIPLIER))
+ Hit = 1
+ if self.ClusterInfoPath:
+ # Get this cluster's size:
+ ClusterFileName = Bits[0].replace("/","\\").split("\\")[-1]
+ ScanNumber = int(Bits[1])
+ ClusterSize = self.ClusterSizes.get((ClusterFileName, ScanNumber), None)
+ if not ClusterSize:
+ print "* Warning: ClusterSize not known for %s, %s"%(ClusterFileName, ScanNumber)
+ else:
+ Hit = ClusterSize
+ if Charge < 3:
+ self.ScoreHistogram2[ScoreBin] = self.ScoreHistogram2.get(ScoreBin, 0) + Hit
+ else:
+ self.ScoreHistogram3[ScoreBin] = self.ScoreHistogram3.get(ScoreBin, 0) + Hit
+ if self.ShuffledDatabaseFraction:
+ if Protein[:3] == "XXX":
+ if Charge < 3:
+ self.ShuffledScoreHistogram2[ScoreBin] = self.ShuffledScoreHistogram2.get(ScoreBin, 0) + Hit
+ else:
+ self.ShuffledScoreHistogram3[ScoreBin] = self.ShuffledScoreHistogram3.get(ScoreBin, 0) + Hit
+ File.close()
+ def ProduceScoreDistributionImage(self, ImagePath, Charge3Flag = 0):
+ """
+ Write out, to the specified path, an image with f-score on the X-axis
+ and p-value on the Y-axis. If we fit a mixture model, plot the true
+ and false (model) distributions; if we fit using shuffled proteins,
+ plot the empirical distributions.
+ """
+ if Image == None:
+ return
+
+ if Charge3Flag:
+ ScoreHistogram = self.ScoreHistogram3
+ ShuffledScoreHistogram = self.ShuffledScoreHistogram3
+ else:
+ ScoreHistogram = self.ScoreHistogram2
+ ShuffledScoreHistogram = self.ShuffledScoreHistogram2
+ # Image size:
+ self.Width = 900
+ self.Height = 500
+ self.LeftPadding = 50
+ self.RightPadding = 80
+ self.BottomPadding = 40
+ self.TopPadding = 10
+ self.PlotWidth = self.Width - (self.LeftPadding + self.RightPadding)
+ self.PlotHeight = self.Height - (self.TopPadding + self.BottomPadding)
+ self.PlotImage = Image.new("RGB", (self.Width, self.Height), Colors.Background)
+ self.Draw = ImageDraw.Draw(self.PlotImage)
+ # Find largest and smallest bins and entries:
+ self.MaxScoreHistogramEntry = 10
+ self.MaxScoreHistogramEntryValid = 10
+ self.MaxScoreHistogramEntryInvalid = 10
+ self.MaxScoreBin = -9999
+ self.MinScoreBin = 9999
+ self.TotalHistogramEntries = 0
+ for (Bin, Entry) in ScoreHistogram.items():
+ self.MaxScoreHistogramEntry = max(Entry, self.MaxScoreHistogramEntry)
+ InvalidCount = ShuffledScoreHistogram.get(Bin, 0)
+ ValidCount = max(0, Entry - InvalidCount)
+ self.MaxScoreHistogramEntryValid = max(self.MaxScoreHistogramEntryValid, ValidCount)
+ self.MaxScoreHistogramEntryInvalid = max(self.MaxScoreHistogramEntryInvalid, InvalidCount)
+ self.MaxScoreBin = max(self.MaxScoreBin, Bin)
+ self.MinScoreBin = min(self.MinScoreBin, Bin)
+ self.TotalHistogramEntries += Entry
+ #print "Bin %s: Valid %s, invalid %s"%(Bin, ValidCount, InvalidCount)
+ self.BinCount = self.MaxScoreBin - self.MinScoreBin + 1
+ # Draw the Y axis:
+ self.Draw.line((self.LeftPadding, self.TopPadding, self.LeftPadding, self.TopPadding + self.PlotHeight), Colors.Black)
+ self.Draw.line((self.Width - self.RightPadding, self.TopPadding, self.Width - self.RightPadding, self.TopPadding + self.PlotHeight), Colors.Black)
+ Fraction = 0
+ while Fraction <= 1.0:
+ Y = self.TopPadding + self.PlotHeight * (1.0 - Fraction)
+ Label = str(int(round(Fraction * self.MaxScoreHistogramEntry)))
+ self.Draw.text((self.LeftPadding - 5 - len(Label)*5, Y - 6), Label, Colors.Black)
+ self.Draw.line((self.LeftPadding - 5, Y, self.LeftPadding, Y), Colors.Black)
+ Label = str(Fraction)
+ self.Draw.text((self.Width - self.RightPadding + 10, Y - 6), Label, Colors.Black)
+ self.Draw.line((self.Width - self.RightPadding, Y, self.Width - self.RightPadding + 5, Y), Colors.Black)
+ Fraction += 0.1
+ # Draw the X axis:
+ self.Draw.line((self.LeftPadding, self.Height - self.BottomPadding, self.Width - self.RightPadding, self.Height - self.BottomPadding), Colors.Black)
+ Bin = self.MinScoreBin
+ while Bin % 10 != 0:
+ Bin += 1
+ while Bin < self.MaxScoreBin:
+ BinNumber = Bin - self.MinScoreBin
+ X = self.LeftPadding + BinNumber * self.PlotWidth / float(self.BinCount)
+ self.Draw.line((X, self.Height - self.BottomPadding - 2, X, self.Height - self.BottomPadding + 2), Colors.Black)
+ Label = "%.1f"%(Bin / BIN_MULTIPLIER)
+ self.Draw.text((X - len(Label) * 2.5, self.Height - self.BottomPadding + 2), Label, Colors.Black)
+ Bin += 10
+ if self.ShuffledDatabaseFraction != None:
+ self.ProduceImageShuffledDB(Charge3Flag)
+ else:
+ self.ProduceImageMixtureModel(Charge3Flag)
+ self.PlotImage.save(ImagePath)
+ # Free:
+ self.PlotImage = None
+ self.Draw = None
+ def ProduceImageShuffledDB(self, Charge3Flag = 0):
+ if not Image:
+ return
+
+ if Charge3Flag:
+ ScoreHistogram = self.ScoreHistogram3
+ ShuffledScoreHistogram = self.ShuffledScoreHistogram3
+ OddsTrue = self.OddsTrue3
+ else:
+ ScoreHistogram = self.ScoreHistogram2
+ ShuffledScoreHistogram = self.ShuffledScoreHistogram2
+ OddsTrue = self.OddsTrue2
+ # Draw the legend:
+ Y = self.Height - self.BottomPadding + 20
+ self.Draw.line((105, Y, 125, Y), Colors.Black)
+ self.Draw.rectangle((113, Y-2, 118, Y+2), Colors.Black)
+ self.Draw.text((130, Y - 5), "p-value", Colors.Black)
+ Y = self.Height - self.BottomPadding + 30
+ self.Draw.line((105, Y, 125, Y), Colors.Blue)
+ self.Draw.rectangle((113, Y-2, 118, Y+2), Colors.Grey)
+ self.Draw.text((130, Y - 5), "All hits", Colors.Grey)
+ Y = self.Height - self.BottomPadding + 20
+ self.Draw.line((305, Y, 325, Y), Colors.Red)
+ self.Draw.rectangle((313, Y-2, 318, Y+2), Colors.Green)
+ self.Draw.text((330, Y - 5), "Valid proteins", Colors.Green)
+ Y = self.Height - self.BottomPadding + 30
+ self.Draw.line((305, Y, 325, Y), Colors.Red)
+ self.Draw.rectangle((313, Y-2, 318, Y+2), Colors.Red)
+ self.Draw.text((330, Y - 5), "Invalid proteins", Colors.Red)
+ # Loop over bins, plotting distributions:
+ PrevYOdds = None
+ PrevYAll = None
+ PrevYTrue = None
+ PrevYFalse = None
+ PrevX = None
+ for Bin in range(self.MinScoreBin, self.MaxScoreBin + 1):
+ BinNumber = Bin - self.MinScoreBin
+ XX = self.LeftPadding + (BinNumber * self.PlotWidth / float(self.BinCount))
+ # p-value:
+ PValue = 1.0 - OddsTrue[Bin]
+ YOdds = self.Height - self.BottomPadding - self.PlotHeight * PValue
+ self.Draw.rectangle((XX - 2, YOdds - 2, XX + 2, YOdds + 2), Colors.Black)
+ if PrevYOdds != None:
+ self.Draw.line((PrevX, PrevYOdds, XX, YOdds), Colors.Black)
+ # Overall:
+ Count = ScoreHistogram.get(Bin, 0)
+ YAll = self.Height - self.BottomPadding - self.PlotHeight * Count / float(self.MaxScoreHistogramEntry)
+ self.Draw.rectangle((XX - 2, YAll - 2, XX + 2, YAll + 2), Colors.Grey)
+ if (PrevYAll):
+ self.Draw.line((PrevX, PrevYAll, XX, YAll), Colors.Grey)
+ # Invalid:
+ CountInvalid = ShuffledScoreHistogram.get(Bin, 0)
+ YFalse = self.Height - self.BottomPadding - self.PlotHeight * CountInvalid / float(self.MaxScoreHistogramEntryInvalid)
+ self.Draw.rectangle((XX - 2, YFalse - 2, XX + 2, YFalse + 2), Colors.Red)
+ if (PrevYFalse):
+ self.Draw.line((PrevX, PrevYFalse, XX, YFalse), Colors.Red)
+ # Valid:
+ CountValid = Count - CountInvalid
+ YTrue = self.Height - self.BottomPadding - self.PlotHeight * CountValid / float(self.MaxScoreHistogramEntryValid)
+ self.Draw.rectangle((XX - 2, YTrue - 2, XX + 2, YTrue + 2), Colors.Green)
+ #print "Bin %s: Valid %s/%s invalid %s/%s"%(Bin, CountValid, self.MaxScoreHistogramEntryValid, CountInvalid, self.MaxScoreHistogramEntryInvalid)
+ if (PrevYTrue):
+ self.Draw.line((PrevX, PrevYTrue, XX, YTrue), Colors.Green)
+ # Remember these values, for linking to the next in the series:
+ PrevX = XX
+ PrevYOdds = YOdds
+ PrevYAll = YAll
+ PrevYFalse = YFalse
+ PrevYTrue = YTrue
+ def ProduceImageMixtureModel(self, Charge3Flag = 0):
+ """
+ Helper for ProduceScoreDistributionImage, if we're using a mixture
+ model (not a shuffled database)
+ """
+ if Image == None:
+ return
+ if Charge3Flag:
+ ScoreHistogram = self.ScoreHistogram3
+ ShuffledScoreHistogram = self.ShuffledScoreHistogram3
+ MixtureModel = self.MixtureModel3
+ OddsTrue = self.OddsTrue3
+ else:
+ ScoreHistogram = self.ScoreHistogram2
+ ShuffledScoreHistogram = self.ShuffledScoreHistogram2
+ MixtureModel = self.MixtureModel2
+ OddsTrue = self.OddsTrue2
+ # Draw the legend:
+ Y = self.Height - self.BottomPadding + 20
+ self.Draw.line((105, Y, 125, Y), Colors.Black)
+ self.Draw.rectangle((113, Y-2, 118, Y+2), Colors.Black)
+ self.Draw.text((130, Y - 5), "Empirical score distribution", Colors.Black)
+ Y = self.Height - self.BottomPadding + 30
+ self.Draw.line((105, Y, 125, Y), Colors.Blue)
+ self.Draw.rectangle((113, Y-2, 118, Y+2), Colors.Blue)
+ self.Draw.text((130, Y - 5), "Probability true (1-pvalue)", Colors.Blue)
+ Y = self.Height - self.BottomPadding + 20
+ self.Draw.line((305, Y, 325, Y), Colors.Red)
+ self.Draw.rectangle((313, Y-2, 318, Y+2), Colors.Red)
+ self.Draw.text((330, Y - 5), "Gamma dist. (fit to false matches)", Colors.Red)
+ Y = self.Height - self.BottomPadding + 30
+ self.Draw.line((305, Y, 325, Y), Colors.Green)
+ self.Draw.rectangle((313, Y-2, 318, Y+2), Colors.Green)
+ self.Draw.text((330, Y - 5), "Normal dist. (fit to true matches)", Colors.Green)
+ Y = self.Height - self.BottomPadding + 20
+ self.Draw.line((555, Y, 575, Y), Colors.Grey)
+ self.Draw.rectangle((563, Y-2, 568, Y+2), Colors.Grey)
+ self.Draw.text((580, Y - 5), "Fitted mixture model", Colors.Grey)
+ # Draw the plot of OBSERVED SCORES:
+ PrevX = None
+ PrevY = None
+ for Bin in range(self.MinScoreBin, self.MaxScoreBin + 1):
+ BinNumber = Bin - self.MinScoreBin
+ X = self.LeftPadding + BinNumber * self.PlotWidth / float(self.BinCount)
+ Count = ScoreHistogram.get(Bin, 0)
+ Y = self.Height - self.BottomPadding - self.PlotHeight * Count / float(self.MaxScoreHistogramEntry)
+ self.Draw.rectangle((X - 2, Y - 2, X + 2, Y + 2), Colors.Black)
+ if PrevX != None:
+ self.Draw.line((PrevX, PrevY, X, Y), Colors.Black)
+ PrevX = X
+ PrevY = Y
+ #######################################################
+ # Find the scaling factor for the MERGED distribution:
+ ComboDistTotal = 0
+ for Bin in range(self.MinScoreBin, self.MaxScoreBin + 1):
+ TrueScore = Bin / BIN_MULTIPLIER
+ Pow = - ((TrueScore - MixtureModel.MeanTrue)**2) / (2 * MixtureModel.VarianceTrue)
+ TrueNormal = math.exp(Pow) / (MixtureModel.StdDevTrue * SQRT2PI)
+ GX = max(0.01, TrueScore + MixtureModel.GammaOffset)
+ FalseGamma = math.pow(GX, MixtureModel.KFalse - 1) * math.exp(-GX / MixtureModel.ThetaFalse) / MixtureModel.GammaDemonFalse
+ ComboDist = TrueNormal * MixtureModel.PriorProbabilityTrue + (1.0 - MixtureModel.PriorProbabilityTrue) * FalseGamma
+ ComboDistTotal += ComboDist
+ YFittedScalingFactor = self.TotalHistogramEntries / ComboDistTotal
+ #######################################################
+ # Draw the plot of the FALSE HIT GAMMA and TRUE HIT NORMAL and MERGED distributions:
+ PrevX = None
+ PrevYNormal = None
+ PrevYGamma = None
+ PrevYOdds = None
+ PrevYFitted = None
+ for Bin in range(self.MinScoreBin, self.MaxScoreBin + 1):
+ BinNumber = Bin - self.MinScoreBin
+ XX = self.LeftPadding + (BinNumber * self.PlotWidth / float(self.BinCount))
+ TrueScore = Bin / BIN_MULTIPLIER
+ Pow = - ((TrueScore - MixtureModel.MeanTrue)**2) / (2 * MixtureModel.VarianceTrue)
+ TrueNormal = math.exp(Pow) / (MixtureModel.StdDevTrue * SQRT2PI)
+ GX = max(0.01, TrueScore + MixtureModel.GammaOffset)
+ FalseGamma = math.pow(GX, MixtureModel.KFalse - 1) * math.exp(-GX / MixtureModel.ThetaFalse) / MixtureModel.GammaDemonFalse
+ YNormal = self.Height - self.BottomPadding - self.PlotHeight * TrueNormal
+ # Normal distribution:
+ self.Draw.rectangle((XX - 2, YNormal - 2, XX + 2, YNormal + 2), Colors.Green)
+ if PrevX != None:
+ self.Draw.line((PrevX, PrevYNormal, XX, YNormal), Colors.Green)
+ # Gamma distribution:
+ YGamma = self.Height - self.BottomPadding - self.PlotHeight * FalseGamma
+ self.Draw.rectangle((XX - 2, YGamma - 2, XX + 2, YGamma + 2), Colors.Red)
+ if PrevX != None:
+ self.Draw.line((PrevX, PrevYGamma, XX, YGamma), Colors.Red)
+ # Fitted curve:
+ ComboDist = TrueNormal * MixtureModel.PriorProbabilityTrue + (1.0 - MixtureModel.PriorProbabilityTrue) * FalseGamma
+ YFitted = ComboDist * YFittedScalingFactor / self.MaxScoreHistogramEntry
+ YFitted = self.Height - self.BottomPadding - YFitted * self.PlotHeight
+ #print TrueNormal, FalseGamma, self.AllSpectrumCount, ComboDist, YFitted
+ self.Draw.rectangle((XX - 2, YFitted - 2, XX + 2, YFitted + 2), Colors.Grey)
+ if PrevX != None:
+ self.Draw.line((PrevX, PrevYFitted, XX, YFitted), Colors.Grey)
+ # P-Value:
+ PValue = 1.0 - OddsTrue.get(Bin, 0)
+ YOdds = self.Height - self.BottomPadding - self.PlotHeight * PValue
+ self.Draw.rectangle((XX - 2, YOdds - 2, XX + 2, YOdds + 2), Colors.Blue)
+ if PrevX != None:
+ self.Draw.line((PrevX, PrevYOdds, XX, YOdds), Colors.Blue)
+ # Remember these points' coords for drawing lines next time:
+ PrevX = XX
+ PrevYNormal = YNormal
+ PrevYGamma = YGamma
+ PrevYOdds = YOdds
+ PrevYFitted = YFitted
+ def FitMixtureModel(self):
+ self.MixtureModel2 = Learning.MixtureModelClass()
+ self.MixtureModel2.Model(None, self.ScoreHistogram2)
+ self.OddsTrue2 = self.MixtureModel2.OddsTrue
+ self.MixtureModel3 = Learning.MixtureModelClass()
+ self.MixtureModel3.Model(None, self.ScoreHistogram3)
+ self.OddsTrue3 = self.MixtureModel3.OddsTrue
+ return 1
+ def SavePValueDistribution(self, Charge3Flag = 0):
+ """
+ Write out the p-value distribution (from shuffled-database counts, or from the fitted mixture model)
+ """
+ if Charge3Flag:
+ OddsTrue = self.OddsTrue3
+ MeanDeltaScore = self.MeanDeltaScore3
+ ScoreHistogram = self.ScoreHistogram3
+ ShuffledScoreHistogram = self.ShuffledScoreHistogram3
+ else:
+ OddsTrue = self.OddsTrue2
+ MeanDeltaScore = self.MeanDeltaScore2
+ ScoreHistogram = self.ScoreHistogram2
+ ShuffledScoreHistogram = self.ShuffledScoreHistogram2
+ Keys = OddsTrue.keys()
+ if not Keys:
+ return
+ MinBin = min(Keys)
+ MaxBin = max(Keys)
+ self.OutputDistributionFile.write("#MeanDeltaScore\t%s\n"%MeanDeltaScore)
+ self.OutputDistributionFile.write("#BlindFlag\t%s\n"%self.BlindFlag)
+ if self.ShuffledDatabaseFraction != None:
+ Header = "#Bin\tFDR\tTotalHits\tHitsValid\tHitsInvalid\tPeptideFDR\tPeptidesValid\tPeptidesInvalid\tProteinFDR\tProteinsValid\tProteinsInvalid\t\n"
+ else:
+ Header = "#Bin\tFDR\tTotalHits\t\n"
+ self.OutputDistributionFile.write(Header)
+ if self.ShuffledDatabaseFraction != None:
+ # Count the total number of true hits, false hits, true peptides, false peptides...
+ CumulativeTrueHits = 0
+ CumulativeFalseHits = 0
+ for Bin in range(MinBin, MaxBin + 1):
+ AllHits = ScoreHistogram.get(Bin, 0)
+ FalseHits = ShuffledScoreHistogram.get(Bin, 0)
+ CumulativeFalseHits += FalseHits
+ CumulativeTrueHits += (AllHits - FalseHits)
+ if self.ProteinPicker:
+ ######################################################
+ # Peptides:
+ ValidPeptides = {}
+ InvalidPeptides = {}
+ CumulativeTruePeptides = 0
+ CumulativeFalsePeptides = 0
+ BestScoreByProtein = {}
+ for (Peptide, Score) in self.ProteinPicker.BestScoresByPeptide.items():
+ Bin = int(round(Score / BIN_MULTIPLIER))
+ ProteinID = self.ProteinPicker.PeptideProteins.get(Peptide, None)
+ if not ProteinID:
+ print "*** Warning: Peptide '%s' was never assigned to a protein!"%Peptide
+ LocationList = self.ProteinPicker.PeptideDict[Peptide]
+ print LocationList
+ for (ProteinID, Pos) in LocationList:
+ print ProteinID, self.ProteinPicker.ProteinNames[ProteinID], self.ProteinPicker.ProteinPeptideCounts[ProteinID]
+ continue # shouldn't occur!
+ ProteinName = self.ProteinPicker.ProteinNames[ProteinID]
+ OldScore = BestScoreByProtein.get(ProteinID, -9999)
+ BestScoreByProtein[ProteinID] = max(OldScore, Score)
+ if ProteinName[:3] == "XXX":
+ InvalidPeptides[Bin] = InvalidPeptides.get(Bin, 0) + 1
+ CumulativeFalsePeptides += 1
+ else:
+ ValidPeptides[Bin] = ValidPeptides.get(Bin, 0) + 1
+ CumulativeTruePeptides += 1
+ ######################################################
+ # Proteins:
+ ValidProteins = {}
+ InvalidProteins = {}
+ CumulativeTrueProteins = 0
+ CumulativeFalseProteins = 0
+ for (ProteinID, Score) in BestScoreByProtein.items():
+ # Bin this protein's best score the same way peptide scores are binned above:
+ Bin = int(round(Score / BIN_MULTIPLIER))
+ ProteinName = self.ProteinPicker.ProteinNames[ProteinID]
+ if ProteinName[:3] == "XXX":
+ InvalidProteins[Bin] = InvalidProteins.get(Bin, 0) + 1
+ CumulativeFalseProteins += 1
+ else:
+ ValidProteins[Bin] = ValidProteins.get(Bin, 0) + 1
+ CumulativeTrueProteins += 1
+ for Bin in range(MinBin, MaxBin + 1):
+ FDR = 1.0 - OddsTrue[Bin]
+ AllHits = ScoreHistogram.get(Bin, 0)
+ self.OutputDistributionFile.write("%s\t%s\t%s\t"%(Bin, FDR, AllHits))
+ if self.ShuffledDatabaseFraction != None:
+ FalseHits = ShuffledScoreHistogram.get(Bin, 0)
+ TrueHits = AllHits - FalseHits
+ CumulativeTrueHits -= TrueHits
+ CumulativeFalseHits -= FalseHits
+ self.OutputDistributionFile.write("%s\t%s\t"%(CumulativeTrueHits, CumulativeFalseHits))
+ if self.ProteinPicker:
+ # Peptide FDR:
+ FalseWithinTrue = min(CumulativeTruePeptides, CumulativeFalsePeptides * self.ShuffledScalingFactor)
+ PeptideFDR = FalseWithinTrue / float(max(1, CumulativeTruePeptides))
+ self.OutputDistributionFile.write("%.4f\t%s\t%s\t"%(PeptideFDR, CumulativeTruePeptides, CumulativeFalsePeptides))
+ CumulativeTruePeptides -= ValidPeptides.get(Bin, 0)
+ CumulativeFalsePeptides -= InvalidPeptides.get(Bin, 0)
+ # Protein FDR:
+ FalseWithinTrue = min(CumulativeTrueProteins, CumulativeFalseProteins * self.ShuffledScalingFactor)
+ ProteinFDR = FalseWithinTrue / float(max(1, CumulativeTrueProteins))
+ self.OutputDistributionFile.write("%.4f\t%s\t%s\t"%(ProteinFDR, CumulativeTrueProteins, CumulativeFalseProteins))
+ CumulativeTrueProteins -= ValidProteins.get(Bin, 0)
+ CumulativeFalseProteins -= InvalidProteins.get(Bin, 0)
+ self.OutputDistributionFile.write("\n")
+ def LoadPValueDistribution(self, FileName):
+ Charge3Flag = 0
+
+ File = open(FileName, "rb")
+
+ for FileLine in File.xreadlines():
+ Bits = list(FileLine.strip().split("\t"))
+ if len(Bits) < 2:
+ continue
+ if FileLine[0] == "#":
+ # Header line. Parse special lines:
+ Name = Bits[0][1:]
+ if Name == "BlindFlag":
+ self.BlindFlag = int(Bits[1])
+ elif Name == "MeanDeltaScore":
+ if Charge3Flag:
+ self.MeanDeltaScore3 = float(Bits[1])
+ OddsTrue = {}
+ self.OddsTrue3 = OddsTrue
+ else:
+ self.MeanDeltaScore2 = float(Bits[1])
+ OddsTrue = {}
+ self.OddsTrue2 = OddsTrue
+
+ else:
+ print "(Skipping comment '%s', not understood)"%Bits[0]
+ continue
+ Bin = int(Bits[0])
+ OddsTrue[Bin] = 1.0 - float(Bits[1])
+ Charge3Flag = 1 #We've gotten past all the comments for charges 1 and 2, so the next time
+ #we see a '#' it will be for charge 3
+ File.close()
+ if self.BlindFlag:
+ self.MQScoreWeight = Defaults.BlindMQScoreWeight
+ self.DeltaScoreWeight = Defaults.BlindDeltaScoreWeight
+ self.GammaOffset = Defaults.BlindGammaOffset
+ else:
+ self.MQScoreWeight = Defaults.MQScoreWeight
+ self.DeltaScoreWeight = Defaults.DeltaScoreWeight
+ self.GammaOffset = Defaults.GammaOffset
+ def WriteMatchesForSpectrum(self, MatchesForSpectrum, OutFile):
+ if self.WriteTopMatchOnly:
+ MatchesForSpectrum = MatchesForSpectrum[0:1]
+ for Match in MatchesForSpectrum:
+ # If we have a shuffled database (-S option), then by default we get to throw shuffled-protein
+ # matches away for free. We don't get to keep runners-up to them, though!!
+ if Match.ProteinName[:3] == "XXX" and self.ShuffledDatabaseFraction != None and self.RemoveShuffledMatches:
+ break
+ # Skip matches with poor delta-score:
+ if Match.DeltaScore < self.MaxDeltaScoreGap and not self.RetainBadMatches:
+ continue
+ # Skip short matches:
+ Length = len(Match.Peptide.Aminos)
+ if Length < self.MinimumPeptideLength:
+ continue
+ if Match.Charge < 3:
+ MeanDeltaScore = self.MeanDeltaScore2
+ else:
+ MeanDeltaScore = self.MeanDeltaScore3
+ WeightedScore = self.MQScoreWeight * Match.MQScore + self.DeltaScoreWeight * (Match.DeltaScore / MeanDeltaScore)
+ ScoreBin = int(round(WeightedScore * BIN_MULTIPLIER))
+ if Match.Charge < 3:
+ TrueOdds = self.OddsTrue2.get(ScoreBin, None)
+ else:
+ TrueOdds = self.OddsTrue3.get(ScoreBin, None)
+ if TrueOdds == None:
+ if ScoreBin < 0:
+ TrueOdds = 0.00001
+ else:
+ TrueOdds = 0.99999
+ else:
+ TrueOdds = max(0.00001, min(TrueOdds, 0.99999))
+ Match.PValue = (1.0 - TrueOdds)
+ Match.Bits[self.Columns.getIndex("F-Score")] = "%s"%WeightedScore
+ Match.Bits[self.Columns.getIndex("InspectFDR")] = "%s"%Match.PValue
+ if self.ProteinPicker:
+ # Replace the original protein with the "correct" one:
+ ProteinID = self.ProteinPicker.PeptideProteins.get(Match.Peptide.Aminos, None)
+ if ProteinID != None:
+ Match.Bits[self.Columns.getIndex("RecordNumber")] = str(ProteinID)
+ Match.Bits[self.Columns.getIndex("Protein")] = self.ProteinPicker.ProteinNames[ProteinID]
+ if (not self.RetainBadMatches):
+ if (Match.PValue > self.PValueCutoff):
+ continue
+ # Sometimes things with a horrible MQScore get a good pvalue.
+ # We want to exclude these.
+ if Match.MQScore < MIN_MQSCORE:
+ continue
+ self.LinesAcceptedCount += 1
+ OutFile.write("\t".join(Match.Bits))
+ OutFile.write("\n")
+ def WriteFixedScores(self, OutputPath):
+ self.TotalLinesAcceptedCount = 0
+ self.TotalLinesSecondPass = 0
+ self.WriteScoresPath = OutputPath
+ # Make the output directory, if it doesn't exist already.
+ # Assume: OutputPath is a directory if ReadScoresPath is a directory,
+ # and OutputPath is a file if ReadScoresPath is a file.
+ if os.path.isdir(self.ReadScoresPath):
+ DirName = OutputPath
+ else:
+ DirName = os.path.split(OutputPath)[0]
+ try:
+ os.makedirs(DirName)
+ except:
+ pass
+ self.ProcessResultsFiles(self.ReadScoresPath, self.WriteFixedScoresFile)
+ print "Total accepted lines: %s of %s"%(self.TotalLinesAcceptedCount, self.TotalLinesSecondPass)
+ def WriteFixedScoresFile(self, Path):
+ if os.path.isdir(self.ReadScoresPath):
+ OutputPath = os.path.join(self.WriteScoresPath, os.path.split(Path)[1])
+ else:
+ OutputPath = self.WriteScoresPath
+ if (not self.OverwriteNewScoresFlag) and os.path.exists(OutputPath):
+ return
+ try:
+ InFile = open(Path, "rb")
+ OutFile = open(OutputPath, "wb")
+ LineCount = 0
+ self.LinesAcceptedCount = 0
+ OldSpectrum = None
+ MatchesForSpectrum = []
+ for FileLine in InFile.xreadlines():
+ # Lines starting with # are comments (e.g. header line), and are written out as-is:
+ if FileLine[0] == "#":
+ self.Columns.initializeHeaders(FileLine)
+ OutFile.write(FileLine)
+ continue
+ Bits = list(FileLine.strip().split("\t"))
+ Match = Bag()
+ try:
+ Match.Bits = Bits
+ Match.Charge = int(Bits[self.Columns.getIndex("Charge")])
+ Match.MQScore = float(Bits[self.Columns.getIndex("MQScore")])
+ #Match.DeltaScoreAny = float(Bits[self.Columns.DeltaScoreAny])
+ Match.DeltaScore = float(Bits[self.Columns.getIndex("DeltaScore")])
+ Match.Peptide = GetPeptideFromModdedName(Bits[self.Columns.getIndex("Annotation")])
+ Match.ProteinName = Bits[self.Columns.getIndex("Protein")]
+ except:
+ continue
+ LineCount += 1
+ Spectrum = (Bits[0], Bits[1])
+ if Spectrum != OldSpectrum:
+ self.WriteMatchesForSpectrum(MatchesForSpectrum, OutFile)
+ MatchesForSpectrum = []
+ OldSpectrum = Spectrum
+ MatchesForSpectrum.append(Match)
+ # Finish the last spectrum:
+ self.WriteMatchesForSpectrum(MatchesForSpectrum, OutFile)
+ InFile.close()
+ OutFile.close()
+ print "%s\t%s\t%s\t"%(Path, LineCount, self.LinesAcceptedCount)
+ self.TotalLinesAcceptedCount += self.LinesAcceptedCount
+ self.TotalLinesSecondPass += LineCount
+ except:
+ traceback.print_exc()
+ print "* Error filtering annotations from '%s' to '%s'"%(Path, OutputPath)
+ def ComputePValuesWithShuffled(self, Charge3Flag = 0):
+ """
+ Set self.OddsTrue using results from a partially-shuffled database.
+ Given a score cutoff we assume that, above the score cutoff, there are
+ T hits from valid proteins and F hits from invalid proteins.
+
+ # PVALUE WITH REMOVAL:
+ Let TDB and FDB be the true and false database fractions (FDB = self.ShuffledDatabaseFraction)
+ After filtering out all F hits from invalid proteins, there are still some
+ chance hits to true proteins. The estimated number of hits from T that are actually
+ false is equal to F*(TDB/FDB). Thus, the odds true for this cutoff is:
+ 1.0 - (F*(TDB/FDB) / T)
+
+ # PVALUE WITHOUT REMOVAL:
+ The odds true for the cutoff is (T - F*(TDB/FDB)) / (T+F), matching the formulation above.
+ """
+ OddsTrue = {}
+ if Charge3Flag:
+ self.OddsTrue3 = OddsTrue
+ ScoreHistogram = self.ScoreHistogram3
+ ShuffledScoreHistogram = self.ShuffledScoreHistogram3
+ else:
+ self.OddsTrue2 = OddsTrue
+ ScoreHistogram = self.ScoreHistogram2
+ ShuffledScoreHistogram = self.ShuffledScoreHistogram2
+ CumulativeHits = 0
+ CumulativeHitsTrue = 0
+ CumulativeHitsFalse = 0
+ Keys = ScoreHistogram.keys()
+ Keys.sort()
+ if not Keys:
+ # There are NO HITS for this charge state.
+ return
+ MinKey = Keys[0]
+ MaxKey = Keys[-1]
+ TrueFraction = 1.0 - self.ShuffledDatabaseFraction
+ self.ShuffledScalingFactor = TrueFraction / self.ShuffledDatabaseFraction
+ for Key in range(MaxKey, MinKey - 1, -1):
+ AllHits = ScoreHistogram.get(Key, 0)
+ FalseHits = ShuffledScoreHistogram.get(Key, 0)
+ ValidHits = AllHits - FalseHits
+ CumulativeHitsTrue += ValidHits
+ CumulativeHitsFalse += FalseHits
+ FalseWithinTrue = min(CumulativeHitsTrue, CumulativeHitsFalse * self.ShuffledScalingFactor)
+ ##NEC_MOD
+ #if FalseWithinTrue == 0:
+ # FalseWithinTrue = 1
+ if self.RemoveShuffledMatches:
+ # OddsTrue = (V - I*(TDB/FDB)) / V
+ BinOddsTrue = max(0, CumulativeHitsTrue - FalseWithinTrue) / float(max(1, CumulativeHitsTrue))
+
+ else:
+ # OddsTrue = (V - I*(TDB/FDB)) / (I+V)
+ BinOddsTrue = max(0, CumulativeHitsTrue - FalseWithinTrue) / float(max(1, CumulativeHitsTrue + CumulativeHitsFalse))
+ if self.VerboseFlag:
+ # Bin, true, false, cumtrue, cumfalse
+ Str = "%s\t%s\t%s\t%s\t%s\t"%(Key, ValidHits, FalseHits, CumulativeHitsTrue, CumulativeHitsFalse)
+ Str += "%.5f\t%.5f\t"%(BinOddsTrue, 1.0 - BinOddsTrue)
+ print Str
+ OddsTrue[Key] = BinOddsTrue
+ if self.VerboseFlag:
+ print "\n\n"
+ def SelectProteins(self, PValueCutoff, ReadScoresPath):
+ """
+ Using SelectProteins, assign each peptide to the most reasonable "owner" protein.
+ """
+ # Select the F-score cutoff:
+ FScoreCutoff2 = 9999
+ FScoreCutoff3 = 9999
+ for FScoreBin in self.OddsTrue2.keys():
+ OddsTrue = self.OddsTrue2[FScoreBin]
+ if (1.0 - OddsTrue) <= PValueCutoff:
+ if (FScoreBin / BIN_MULTIPLIER) < FScoreCutoff2:
+ FScoreCutoff2 = FScoreBin / BIN_MULTIPLIER
+ for FScoreBin in self.OddsTrue3.keys():
+ OddsTrue = self.OddsTrue3[FScoreBin]
+ if (1.0 - OddsTrue) <= PValueCutoff:
+ if (FScoreBin / BIN_MULTIPLIER) < FScoreCutoff3:
+ FScoreCutoff3 = FScoreBin / BIN_MULTIPLIER
+ self.ProteinPicker.FScoreCutoff2 = FScoreCutoff2
+ self.ProteinPicker.FScoreCutoff3 = FScoreCutoff3
+ self.ProteinPicker.MeanDeltaScore2 = self.MeanDeltaScore2
+ self.ProteinPicker.MeanDeltaScore3 = self.MeanDeltaScore3
+ self.ProcessResultsFiles(ReadScoresPath, self.ProteinPicker.ParseAnnotations)
+ # We've COUNTED the protein hits. Now ask the picker to decide which
+ # protein 'owns' each peptide:
+ self.ProteinPicker.ChooseProteins()
+ def SetOutputDistributionPath(self, Path):
+ self.OutputDistributionPath = Path
+ self.OutputDistributionFile = open(Path, "wb")
+ def ParseClusterInfo(self):
+ """
+ Parse cluster-sizes from an info file.
+ """
+ self.ClusterSizes = {}
+ File = open(self.ClusterInfoPath, "rb")
+ for FileLine in File.xreadlines():
+ Bits = FileLine.split()
+ try:
+ ScanNumber = int(Bits[1])
+ ClusterSize = int(Bits[2])
+ except:
+ print "* Skipping this line:", FileLine
+ self.ClusterSizes[(Bits[0], ScanNumber)] = ClusterSize
+ def ParseCommandLine(self, Arguments):
+ (Options, Args) = getopt.getopt(Arguments, "l:s:r:w:m:bp:vixzd:a1S:HX:")
+ OptionsSeen = {}
+ self.SaveDistributionPath = "PValues.txt" # default
+ self.ReadScoresPath = None
+ self.WriteScoresPath = None
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-l":
+ if not os.path.exists(Value):
+ print "** Error: can't read p-value distribution from file '%s'"%Value
+ return 0
+ self.LoadDistributionPath = Value
+ elif Option == "-p":
+ self.PValueCutoff = float(Value)
+ elif Option == "-s":
+ self.SaveDistributionPath = Value
+ elif Option == "-x":
+ self.RetainBadMatches = 1
+ elif Option == "-b":
+ self.BlindFlag = 1
+ self.MQScoreWeight = Defaults.BlindMQScoreWeight
+ self.DeltaScoreWeight = Defaults.BlindDeltaScoreWeight
+ self.GammaOffset = Defaults.BlindGammaOffset
+ elif Option == "-r":
+ self.ReadScoresPath = Value
+ elif Option == "-w":
+ self.WriteScoresPath = Value
+ elif Option == "-m":
+ global MAX_RESULTS_FILES_TO_PARSE
+ MAX_RESULTS_FILES_TO_PARSE = int(Value)
+ elif Option == "-v":
+ self.VerboseFlag = 1
+ elif Option == "-i":
+ self.GenerateImageFlag = 1
+ elif Option == "-d":
+ if not os.path.exists(Value):
+ print "** Error: couldn't find database file '%s'\n\n"%Value
+ print UsageInfo
+ sys.exit(1)
+ self.DBPath.append(Value)
+ elif Option == "-a":
+ self.PerformProteinSelection = 1
+ elif Option == "-1":
+ self.WriteTopMatchOnly = 1
+ elif Option == "-S":
+ self.ShuffledDatabaseFraction = float(Value)
+ if self.ShuffledDatabaseFraction <= 0 or self.ShuffledDatabaseFraction >= 1:
+ print "* Invalid value for -S: %s"%Value
+ return 0
+ elif Option == "-H":
+ self.RemoveShuffledMatches = 0
+ elif Option == "-X":
+ # Undocumented option for CLUSTER searches:
+ self.ClusterInfoPath = Value
+ self.ParseClusterInfo()
+ else:
+ print "** Unknown option:", Option, Value
+ # Check validity of options:
+ if self.PerformProteinSelection and not self.DBPath:
+ print "* Error: -a option requires -d option!"
+ return 0
+ # No major problems - return TRUE for success.
+ return 1
+
+UsageInfo = """
+FDRUtils.py - Compute probability that each match from a tandem MS
+peptide database search is correct. Write out an updated results file containing
+only the high-quality results.
+
+Parameters:
+ -r [FILENAME] Read results from filename (and fit the probability mixture
+ model to these results). If the option value is a directory, we'll read
+ all the results-files from the directory.
+ -w [FILENAME] Write re-scored results to a file.
+ -l [FILENAME] Load p-value distribution from a file (written out earlier
+ with -s option)
+
+Protein selection can be performed, replacing the protein identification
+with a parsimonious set of protein IDs (using a simple iterative
+approach). The following options are required for protein selection:
+ -a: Replace protein identifications with a "parsimonious" set of protein IDs.
+ Requires -d option!
+ -d [FILENAME] Database (.trie file) searched
+ -S [FRACTION]: (see below)
+
+Other options:
+ -S [FRACTION]: The fraction of the database consisting of shuffled
+ proteins. For instance, if you use a 1:1 mix of valid and invalid
+ proteins, use -S 0.5. If this option is set, p-values will be set using
+ the number of matches to shuffled proteins, whose names begin with XXX
+ -s [FILENAME] Save p-value distribution to a file.
+ -i Write a .png image of the distribution graph (requires PIL)
+ -p [NUM] FDR cutoff for saving results; by default, 0.1
+ -b Blind search (use different score/deltascore weighting)
+ -x If the -x flag is passed, even "bad" matches are written out (no p-value
+ filtering is performed)
+ -1 Write only the top hit for each spectrum, even if "good" runners-up exist
+
+Internal use only:
+ -v Verbose output (for debugging)
+ -H Retain matches to shuffled proteins. Used for further processing ONLY.
+
+Example:
+ FDRUtils.py -r ShewanellaResults -s ShewFDR.txt -w ShewanellaFiltered
+ -p 0.05 -d database\Shew.trie -a
+"""
+
+def Main(Parser = None):
+ global MAX_RESULTS_FILES_TO_PARSE
+
+ if not Parser:
+ Parser = PValueParser()
+ Result = Parser.ParseCommandLine(sys.argv[1:])
+ if not Result:
+ print UsageInfo
+ return
+ if Parser.DBPath and Parser.PerformProteinSelection:
+ Parser.ProteinPicker = SelectProteins.ProteinSelector()
+ Parser.ProteinPicker.LoadMultipleDB(Parser.DBPath)
+ if Parser.LoadDistributionPath:
+ print "Load p-value distribution from %s..."%Parser.LoadDistributionPath
+ Parser.LoadPValueDistribution(Parser.LoadDistributionPath)
+ elif Parser.ReadScoresPath:
+ print "Read scores from search results at %s..."%Parser.ReadScoresPath
+ Parser.ReadDeltaScoreDistribution(Parser.ReadScoresPath)
+ Parser.SetOutputDistributionPath(Parser.SaveDistributionPath)
+ ##############################
+ # Loop for F-score methods
+ Parser.ProcessResultsFiles(Parser.ReadScoresPath, Parser.ReadScoreDistributionFromFile, MAX_RESULTS_FILES_TO_PARSE)
+ if Parser.ShuffledDatabaseFraction != None:
+ print "Compute PValues with shuffled..."
+ Parser.ComputePValuesWithShuffled(0)
+ Parser.ComputePValuesWithShuffled(1)
+ else:
+ Result = Parser.FitMixtureModel()
+ if not Result:
+ sys.exit(1)
+ if Parser.PerformProteinSelection:
+ Parser.SelectProteins(Parser.PValueCutoff, Parser.ReadScoresPath)
+ print "Write p-value distribution to %s..."%Parser.SaveDistributionPath
+ (Stub, Extension) = os.path.splitext(Parser.SaveDistributionPath)
+ Parser.SavePValueDistribution(0)
+ Parser.SavePValueDistribution(1)
+ ##############################
+ Parser.OutputDistributionFile.close()
+ else:
+ print "** Please specify either a distribution file or results file."
+ print UsageInfo
+ sys.exit(1)
+ if Parser.GenerateImageFlag and Image:
+ ImagePath = os.path.splitext(Parser.SaveDistributionPath)[0] + ".2.png"
+ Parser.ProduceScoreDistributionImage(ImagePath, 0)
+ ImagePath = os.path.splitext(Parser.SaveDistributionPath)[0] + ".3.png"
+ Parser.ProduceScoreDistributionImage(ImagePath, 1)
+ if Parser.WriteScoresPath:
+ Parser.WriteFixedScores(Parser.WriteScoresPath)
+if __name__ == "__main__":
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "psyco not found - running without optimization"
+ #TestMain()
+ Main()
diff --git a/FreeMod.c b/FreeMod.c
new file mode 100644
index 0000000..a011b8a
--- /dev/null
+++ b/FreeMod.c
@@ -0,0 +1,2720 @@
+//Title: FreeMod.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+// Mod-tolerant matching of peptides to spectra. See header file FreeMod.h for overview.
+#include "CMemLeak.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "Utils.h"
+#include "Inspect.h"
+#include "Trie.h"
+#include "Mods.h"
+#include "Tagger.h"
+#include "Score.h"
+#include "FreeMod.h"
+#include "Scorpion.h"
+#include "SVM.h"
+#include "IonScoring.h"
+
+// SkewPenalty[n] is a score penalty applied to a node that is n/100 daltons
+// away from where it should be. Size 100. (Derived from a functional
+// fit to empirical histogram)
+int g_SkewPenalty[] = {0, 0, 0, 0, -1, -2, -2, -3, -4, -4, -4, -4, -4, -5,
+ -5, -6, -7, -8, -8, -9, -9, -9, -9, -9, -10, -11, -11, -12, -12,
+ -12, -12, -12, -12, -13, -13, -14, -14, -15, -15, -15, -15, -15,
+ -15, -15, -16, -16, -16, -17, -17, -17, -17, -17, -17, -17, -18,
+ -18, -18, -19, -19, -19, -19, -19, -19, -19, -20, -20, -20, -21,
+ -21, -22, -22, -22, -22, -22, -22, -23, -24, -24, -25, -25, -25,
+ -25, -25, -26, -26, -28, -29, -30, -31, -32, -32, -32, -32, -34,
+ -35, -39, -41, -48, -57, -65};
+
+int g_SkewPenaltySize = sizeof(g_SkewPenalty) / sizeof(int);
+int g_SkewPenaltyMax = sizeof(g_SkewPenalty) / sizeof(int) - 1;
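+// Illustrative lookup sketch (hypothetical variable names, not code used below):
+// a node that is Skew/100 Daltons from its expected position would be penalized
+// by something like:
+//   Index = abs(Skew); if (Index > g_SkewPenaltyMax) Index = g_SkewPenaltyMax;
+//   Score += g_SkewPenalty[Index];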
+
+// MassDeltas[AminoAcid][n] is the nth modification (normally sorted by size) possible on AminoAcid.
+// e.g. MassDeltas[0][0] is the smallest (or most negative) modification allowed on alanine
+MassDelta** MassDeltas = NULL;
+
+// MassDeltaByMass[AminoAcid][PRM] is a linked list of MassDeltaNodes corresponding to MassDeltas matching
+// PRM on amino acid.
+MassDeltaNode*** MassDeltaByMass = NULL;
+
+// For user-supplied (limited) PTMs, MassDeltaByIndex[AA*MAX_PT_MODTYPE + n] points to an instance of the PTM with
+// index n, attached to AA. Used for decorations! A special case: Always, for AA of 26 (MDBI_ALL_MODS), store
+// a pointer to a valid PTM.
+// The array AllKnownPTMods holds *one* entry for all modifications of the same type (e.g. all phosphorylations),
+// but there's a separate MassDelta instance for serine-phos, threonine-phos, and tyrosine-phos. (And this is probably
+// as it should be, since we might attach a different penalty to phosphotyrosine than to phosphoserine, to reflect the
+// fact that serines are more commonly phosphorylated)
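+// For example (illustrative indices only): if PTM number 3 were phosphorylation,
+// MassDeltaByIndex[('S' - 'A') * MAX_PT_MODTYPE + 3] would point to the serine
+// instance of that PTM, and MassDeltaByIndex[MDBI_ALL_MODS * MAX_PT_MODTYPE + 3]
+// to some valid instance of it.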
+MassDelta** MassDeltaByIndex = NULL;
+
+/////////////////////////////////////////////////////////
+// Forward declarations:
+void DebugPrintMultiModTable(TagGraph* Graph, char* Buffer, int MaxX, int MaxY, int MaxZ);
+void AddMultiModMatch(MSSpectrum* Spectrum, int CellIndex, int Bonus,
+ char* Buffer, int StartPos, int ModBlockSize, int AminoBlockSize, int BonusLength,
+ int BufferEnd);
+int ExtendMatchRightwardDuo(SearchInfo* Info, char* Buffer, int BufferEnd, int MatchMass,
+ int MaxMods, int ScoreToBeat, int FilePos, SpectrumTweak* Tweak);
+
+int MSAlignmentGeneral(SearchInfo* Info, char* Buffer, int BufferEnd, int MatchMass,
+ int MaxMods, int ScoreToBeat, int FilePos, SpectrumTweak* Tweak);
+
+void AddNewMatchDuo(SearchInfo* Info, SpectrumTweak* Tweak, char* Buffer, int Score, int* PrevCellTable, MassDelta** DeltaTable,
+ int CellIndex, MassDelta* FinalDelta, int AminoBlockSize, int AminoIndex, int EndAminoIndex,
+ int FilePos);
+
+void DebugPrintPRMScores(MSSpectrum* Spectrum, SpectrumTweak* Tweak);
+
+
+// Allocate the array MassDeltaByIndex, if it's not already allocated.
+void AllocMassDeltaByIndex()
+{
+ int MallocSize;
+ if (MassDeltaByIndex)
+ {
+ return;
+ }
+ MallocSize = (MAX_PT_MODTYPE * (AMINO_ACIDS + 1)) * sizeof(MassDelta*);
+ MassDeltaByIndex = (MassDelta**)malloc(MallocSize);
+}
+
+// Free the 2-dimensional table MassDeltaByMass. It's a big table, so don't forget to free it :)
+void FreeMassDeltaByMass()
+{
+ int AA;
+ int PRM;
+ MassDeltaNode* Node;
+ MassDeltaNode* Prev = NULL;
+ if (MassDeltaByMass)
+ {
+ for (AA = 0; AA < AMINO_ACIDS; AA++)
+ {
+ for (PRM = 0; PRM < GlobalOptions->DeltaBinCount; PRM++)
+ {
+ // MassDeltaByMass[AA][PRM] is either null, or it points to the head of a
+ // linked list of MassDeltaNode objects.
+ if (MassDeltaByMass[AA][PRM])
+ {
+ // Free each node of the list:
+ Node = MassDeltaByMass[AA][PRM];
+ Prev = NULL;
+ while (Node)
+ {
+ SafeFree(Prev);
+ Prev = Node;
+ Node = Node->Next;
+ }
+ SafeFree(Prev);
+ }
+ }
+ SafeFree(MassDeltaByMass[AA]);
+ }
+ SafeFree(MassDeltaByMass);
+ MassDeltaByMass = NULL;
+ }
+}
+
+// Free all the mods in MassDeltas array.
+void FreeMassDeltas()
+{
+ int AA;
+ if (MassDeltas)
+ {
+ for (AA = 0; AA < AMINO_ACIDS; AA++)
+ {
+ SafeFree(MassDeltas[AA]);
+ }
+ SafeFree(MassDeltas);
+ MassDeltas = NULL;
+ }
+ FreeMassDeltaByMass();
+ //SafeFree(MassDeltaByIndex);
+ //MassDeltaByIndex = NULL;
+}
+
+// Initialize the hash MassDeltaByMass. The table entry MassDeltaByMass[AA][Delta] points to a linked list of
+// mass deltas for amino acid AA matching Delta.
+// In some cases, it makes sense to consider two mass deltas of the same size. Example: Mutation
+// to Q or to K. We keep a *list* of mass deltas in all cases.
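+// Lookup sketch (UseDelta is a placeholder for whatever the caller does): to
+// enumerate every known delta for amino acid AA whose mass rounds into Bin:
+//   for (Node = MassDeltaByMass[AA][Bin]; Node; Node = Node->Next)
+//       UseDelta(Node->Delta);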
+void InitMassDeltaByMass()
+{
+ int MassDeltaIndex;
+ int Fudge;
+ int Delta;
+ int AA;
+ // We populate adjacent cells in MassDeltaByMass[AA] as well. FudgeMax == how many bins away from the "right" bin to consider.
+ // FudgeMax is usually 1, to handle roundoff error. But FudgeMax can be 2-3 if the parent mass epsilon is quite large.
+ int FudgeMax = 1 + (GlobalOptions->ParentMassEpsilon / DALTON);
+ MassDeltaNode* OldNode;
+ MassDeltaNode* NewNode;
+ //
+
+ FreeMassDeltaByMass();
+ MassDeltaByMass = (MassDeltaNode***)calloc(AMINO_ACIDS, sizeof(MassDeltaNode**));
+ for (AA = 0; AA < AMINO_ACIDS; AA++)
+ {
+ MassDeltaByMass[AA] = (MassDeltaNode**)calloc(GlobalOptions->DeltaBinCount + 1, sizeof(MassDeltaNode**));
+ for (MassDeltaIndex = 0; MassDeltaIndex < GlobalOptions->DeltaBinCount; MassDeltaIndex++)
+ {
+ if (!MassDeltas[AA][MassDeltaIndex].Flags)
+ {
+ // Null array entry.
+ break;
+ }
+ ROUND_MASS_TO_DELTA_BIN(MassDeltas[AA][MassDeltaIndex].RealDelta, Delta);
+ // Add our MassDelta to the bin (and to neighboring bins), either filling the bin
+ // or adding the new MassDelta to the end of the bin's linked list of MassDeltaNodes:
+ for (Fudge = max(0, Delta - FudgeMax); Fudge < min(GlobalOptions->DeltaBinCount, Delta + FudgeMax + 1); Fudge++)
+ {
+ NewNode = (MassDeltaNode*)calloc(1, sizeof(MassDeltaNode));
+ NewNode->Delta = &MassDeltas[AA][MassDeltaIndex];
+ //NewNode->RealDelta = NewNode->Delta->RealDelta;
+ OldNode = MassDeltaByMass[AA][Fudge];
+ if (!OldNode)
+ {
+ MassDeltaByMass[AA][Fudge] = NewNode;
+ }
+ else
+ {
+ while (OldNode->Next)
+ {
+ OldNode = OldNode->Next;
+ }
+ OldNode->Next = NewNode;
+ }
+ }
+ }
+ }
+}
+
+void debugMassDeltaByMass()
+{
+
+ int AA, MassDeltaIndex,Fudge,Delta;
+
+ int FudgeMax = 1 + (GlobalOptions->ParentMassEpsilon / DALTON);
+ printf("MassDeltaByMass:\n");
+ for(AA=0; AA < AMINO_ACIDS; AA++)
+ {
+
+ for(MassDeltaIndex=0; MassDeltaIndex < GlobalOptions->DeltaBinCount; MassDeltaIndex++)
+ {
+ if (!MassDeltas[AA][MassDeltaIndex].Flags)
+ {
+ // Null array entry.
+ break;
+ }
+ ROUND_MASS_TO_DELTA_BIN(MassDeltas[AA][MassDeltaIndex].RealDelta, Delta);
+ // Add our MassDelta to the bin (and to neighboring bins), either filling the bin
+ // or adding the new MassDelta to the end of the bin's linked list of MassDeltaNodes:
+ for (Fudge = max(0, Delta - FudgeMax); Fudge < min(GlobalOptions->DeltaBinCount, Delta + FudgeMax + 1); Fudge++)
+ {
+ // Each bin holds a linked list of MassDeltaNodes; report the head node's delta:
+ MassDelta* currDelta = MassDeltaByMass[AA][Fudge] ? MassDeltaByMass[AA][Fudge]->Delta : NULL;
+ if (!currDelta)
+ continue;
+ printf("[%c][%d][%d] : Delta=%d,RealDelta=%d,Name=%s,Index=%d\n",(char)(AA+'A'),MassDeltaIndex,Fudge,currDelta->Delta,currDelta->RealDelta,currDelta->Name,currDelta->Index);
+ }
+ }
+ }
+}
+
+
+
+// Read, from the binary file Mutations.dat, the definitions of all mass modifications we will consider.
+// (It's faster to consider a large but LIMITED set of modifications than to consider every feasible value
+// of delta. Also, this limited set lets us assign a SCORE and a NAME to each delta, which is very useful)
+// The file Mutations.dat is written out by the scaffold script PrepBlosum.py
+// Any mass delta with Flags == 0 is a dummy record, which is included simply to pad out the
+// array to a uniform size; such deltas should *never* actually be used!
+// If ReadFlag is false, then don't actually read anything from a file - just init the structure.
+// After calling this, the caller should also call InitMassDeltaByMass to init the hash.
+//ASSUMPTION: If ReadFlag is true, then we are reading from a mutations file and we only look for 26 mutations!
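+// Each record read below consists of: a scaled (integer) mass delta, a float
+// log-odds score, a 20-character name, an integer flags word, and a one-byte
+// amino acid code.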
+void LoadMassDeltas(char* FileName, int ReadFlag)
+{
+ int AA;
+ int DeltaIndex;
+ FILE* MassDeltaFile;
+
+ int ScaledMassDelta;
+ int Bin;
+ float RealMassDelta;
+ float Score;
+ char crapola[21];
+ int ModFlags = DELTA_FLAG_VALID;
+
+
+
+ FreeMassDeltas(); // Free up any pre-conceived notions
+
+ MassDeltas = (MassDelta**)calloc(AMINO_ACIDS, sizeof(MassDelta*));
+ for (AA = 0; AA < AMINO_ACIDS; AA++)
+ {
+ MassDeltas[AA] = (MassDelta*)calloc(GlobalOptions->DeltasPerAA, sizeof(MassDelta));
+ }
+ if (!ReadFlag)
+ {
+ // That was a freebie.
+ return;
+ }
+ if (!FileName || !*FileName)
+ {
+ // No file to open.
+ return;
+ }
+ MassDeltaFile = fopen(FileName, "rb");
+ if (!MassDeltaFile)
+ {
+ printf("Error: Unable to open mutation data file '%s'", FileName);
+ return;
+ }
+
+ AllPTModCount = 0;
+
+ for (AA = 0; AA < AMINO_ACIDS; AA++)
+ {
+ MassDeltas[AA] = (MassDelta*)calloc(GlobalOptions->DeltasPerAA, sizeof(MassDelta));
+ //for (DeltaIndex = 0; DeltaIndex < AMINO_ACIDS; DeltaIndex++)
+ //for (DeltaIndex = 0; DeltaIndex < GlobalOptions->DeltasPerAA; DeltaIndex++)
+ DeltaIndex = 0;
+ //printf("DeltasPerAA: %d\n",GlobalOptions->DeltasPerAA);
+ while(DeltaIndex < GlobalOptions->DeltasPerAA)
+ {
+ ReadBinary(&ScaledMassDelta, sizeof(int), 1, MassDeltaFile);
+ ReadBinary(&Score, sizeof(float), 1, MassDeltaFile);
+
+ if(Score < GlobalOptions->MinLogOddsForMutation)
+ {
+
+ ReadBinary(crapola,sizeof(char),20,MassDeltaFile);
+ //printf("NEC_DEBUG: Found a mutation with too small a log odds %f:%s\n",Score,crapola);
+ ReadBinary(&crapola,sizeof(int),1,MassDeltaFile);
+ ReadBinary(&crapola,sizeof(char),1,MassDeltaFile);
+ DeltaIndex += 1;
+ continue;
+
+ }
+
+ MassDeltas[AA][DeltaIndex].RealDelta = ScaledMassDelta;
+ RealMassDelta = ((float)(ScaledMassDelta))/MASS_SCALE;
+ ROUND_MASS_TO_DELTA_BIN(RealMassDelta, Bin);
+ MassDeltas[AA][DeltaIndex].Delta = Bin;
+ MassDeltas[AA][DeltaIndex].Score = Score;
+
+ ReadBinary(&MassDeltas[AA][DeltaIndex].Name, sizeof(char), 20, MassDeltaFile);
+ //printf("Found a good score for %f:%s with mass %d\n",Score,MassDeltas[AA][DeltaIndex].Name,ScaledMassDelta);
+ ReadBinary(&MassDeltas[AA][DeltaIndex].Flags, sizeof(int), 1, MassDeltaFile);
+ ReadBinary(&MassDeltas[AA][DeltaIndex].Amino, sizeof(char), 1, MassDeltaFile);
+ MassDeltas[AA][DeltaIndex].Flags = ModFlags;
+ MassDeltas[AA][DeltaIndex].Index = AllPTModCount;
+ MassDeltaByIndex[AA * MAX_PT_MODTYPE + AllPTModCount] = &MassDeltas[AA][DeltaIndex];
+ MassDeltaByIndex[MDBI_ALL_MODS * MAX_PT_MODTYPE + AllPTModCount] = &MassDeltas[AA][DeltaIndex];
+
+ AllKnownPTMods[AllPTModCount].Mass = ScaledMassDelta;
+ AllKnownPTMods[AllPTModCount].Flags = ModFlags;
+ AllKnownPTMods[AllPTModCount].Allowed[AA] = 1;
+ strncpy(AllKnownPTMods[AllPTModCount].Name,MassDeltas[AA][DeltaIndex].Name,5);
+ g_PTMLimit[AllPTModCount] = 1;
+ AllPTModCount ++;
+ DeltaIndex += 1;
+
+ }
+ }
+ fclose(MassDeltaFile);
+ // The caller should now invoke InitMassDeltaByMass()
+ printf("Found %d total PTMs\n",AllPTModCount);
+}
+
+// Enrich MassDeltas[] to include one modification for any (reasonable) mass change applicable to any
+// amino acid. Also rebuilds the MassDeltaByMass hash by calling InitMassDeltaByMass().
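+// For alanine, for instance, the most negative delta considered is GLYCINE_MASS
+// minus alanine's residue mass (roughly -14 Daltons); for heavier residues the
+// lower bound is additionally clamped at GlobalOptions->MinPTMDelta.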
+void AddBlindMods()
+{
+ int AA;
+ int DeltaMass;
+ int Bin;
+ int FoundFlag;
+ int Index;
+ int MaxDeltaMass;
+ //
+ for (AA = 0; AA < AMINO_ACIDS; AA++)
+ {
+ DeltaMass = PeptideMass['A' + AA];
+ if (!DeltaMass)
+ {
+ continue; // bogus amino like B or Z
+ }
+ DeltaMass = (DeltaMass / 1000) * 1000;
+ // The largest *negative* modification permitted is one that takes us down to the mass of glycine:
+ DeltaMass = GLYCINE_MASS - DeltaMass;
+ if(DeltaMass < GlobalOptions->MinPTMDelta * MASS_SCALE)
+ DeltaMass = GlobalOptions->MinPTMDelta * MASS_SCALE;
+ MaxDeltaMass = GlobalOptions->MaxPTMDelta * MASS_SCALE;
+
+ //printf("Min delta: %d\n",DeltaMass);
+ //printf("Max delta: %d\n",MaxDeltaMass);
+ while (DeltaMass < MaxDeltaMass)
+ {
+ // Don't add a mutation for mass delta ~0:
+ if (abs(DeltaMass) < MASS_SCALE)
+ {
+ DeltaMass += MASS_SCALE;
+ continue;
+ }
+ ROUND_MASS_TO_DELTA_BIN(DeltaMass, Bin);
+ FoundFlag = 0;
+ // If we already know a PTM that matches this mass closely enough, don't add another:
+ for (Index = 0; Index < GlobalOptions->DeltasPerAA; Index++)
+ {
+ if (!MassDeltas[AA][Index].Flags)
+ {
+ break;
+ }
+ if (abs(MassDeltas[AA][Index].RealDelta - DeltaMass) < HALF_DALTON)
+ {
+ FoundFlag = 1;
+ break;
+ }
+ }
+ if (!FoundFlag)
+ {
+
+ MassDeltas[AA][Index].RealDelta = DeltaMass;
+ MassDeltas[AA][Index].Delta = Bin;
+ MassDeltas[AA][Index].Flags = 1;
+ MassDeltas[AA][Index].Score = -1; // Somewhat magical score!
+ sprintf(MassDeltas[AA][Index].Name, "%+d", DeltaMass / MASS_SCALE);
+ //printf("MassDeltas[%c][%d].Delta = %d\n",(char)(AA+'65'),Index,Bin);
+ //printf("Name=%s\n",MassDeltas[AA][Index].Name);
+ }
+ DeltaMass += DALTON;
+ }
+ }
+ InitMassDeltaByMass();
+}
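+
+// Worked example (illustrative values only): with, say, MinPTMDelta = -200 and MaxPTMDelta = 250,
+// AddBlindMods() walks DeltaMass upward in ~1 Da steps for each amino acid, skipping deltas within
+// ~1 Da of zero and deltas already covered by a known PTM (within HALF_DALTON). Each new entry gets a
+// sign-prefixed name such as "+42" or "-17", Flags = 1, and the sentinel Score of -1.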
+
+// For development only: Print out the scores of all PRMs, as well as the b and y scores
+// and witness scores from which the PRMScores were derived. Requires some slow
+// business to set Spectrum->PRMDebugStrings, and so is not enabled in normal builds.
+void DebugPrintPRMScores(MSSpectrum* Spectrum, SpectrumTweak* Tweak)
+{
+ FILE* PRMFile = NULL;
+ int PRM;
+ ///
+ PRMFile = fopen("PRMScores.xls", "w");
+ if (!PRMFile)
+ {
+ printf("NO DEBUG PRINT OF PRM SCORES DONE.\n");
+ return;
+ }
+ fprintf(PRMFile, "#PRM\tMass\tScore\tBScore\tYScore\tWitnessScore\n");
+ for (PRM = 0; PRM < Tweak->PRMScoreMax; PRM++)
+ {
+ fprintf(PRMFile, "%d\t%.2f\t%d\t", PRM, PRM / 10.0, Tweak->PRMScores[PRM]);
+ //fprintf(PRMFile, "%s\n", Spectrum->PRMDebugStrings[PRM]);
+ fprintf(PRMFile, "\n");
+ }
+ fclose(PRMFile);
+}
+
+// When doing 2-mutant extension, MAX_RIGHT_EXTENSIONS needs to be large:
+#define MAX_RIGHT_EXTENSIONS 512
+Peptide LeftExtensions[MAX_RIGHT_EXTENSIONS];
+int LeftExtensionCount;
+Peptide RightExtensions[MAX_RIGHT_EXTENSIONS];
+int RightExtensionCount;
+
+Peptide* Add1ModMatch(SearchInfo* Info, char* Buffer, int BufferLength, int SuffixEndPos, int SuffixStartPos, int PrefixLength,
+ int Score, MassDelta* Delta, SpectrumTweak* Tweak, int FilePos, char ExtraPrefixChar)
+{
+ Peptide* Match;
+ int Length;
+ int Pos;
+ MSSpectrum* Spectrum = Info->Spectrum;
+ //
+ Length = SuffixEndPos - SuffixStartPos + PrefixLength + 1;
+ Match = NewPeptideNode();
+ Match->Tweak = Tweak;
+ strncpy(Match->Bases, Buffer + SuffixStartPos - PrefixLength, Length);
+
+ Match->InitialScore = Score;
+ Match->RecordNumber = Info->RecordNumber;
+ Match->FilePos = FilePos + SuffixStartPos - PrefixLength;
+ if (SuffixStartPos - PrefixLength > 0)
+ {
+ Match->PrefixAmino = Buffer[SuffixStartPos - PrefixLength - 1];
+ }
+ else
+ {
+ Match->PrefixAmino = ExtraPrefixChar;
+ }
+ Pos = SuffixStartPos - PrefixLength + Length;
+ if (Pos < BufferLength)
+ {
+ Match->SuffixAmino = Buffer[Pos];
+ }
+ if (Delta)
+ {
+ Match->AminoIndex[0] = PrefixLength;
+ Match->ModType[0] = Delta;
+ }
+ Match->DB = Info->DB;
+ GetPeptideParentMass(Match);
+ return StoreSpectralMatch(Spectrum, Match, Length, 0);
+}
+
+// SeekMatch1PTM performs a blind search with at most one PTM permitted.
+// Schematic of SeekMatch1PTM:
+// SuffixStartPos = PrefixEndPos, PTM attaches here
+// /
+// / SuffixEndPos
+// / /
+// * *
+// IKKWLSLPGEMTRPLIL
+// *
+// \--PrefixStartPos
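+//
+// Worked example (illustrative): prefixes are scored left-to-right, accumulating residue masses and PRM
+// scores into PrefixMasses[]/PrefixScores[]; suffixes are scored right-to-left by subtracting residue
+// masses from MatchMass. At the junction, Delta = (remaining suffix mass) - (prefix mass). If |Delta|
+// is within ParentMassEpsilon the peptide is stored unmodified; otherwise MassDeltaByMass[] is consulted
+// for a PTM on the junction residue that accounts for the gap.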
+
+// Kludge: If Buffer points to the middle of a long peptide,
+// ExtraPrefixChar is the character that precedes Buffer.
+#define MAX_1MOD_PEPTIDE_LEN 64
+
+int SeekMatch1PTM(SearchInfo* Info, char* Buffer, int BufferLen, int MatchMass, int ScoreToBeat,
+ SpectrumTweak* Tweak, int FilePos, char ExtraPrefixChar)
+{
+ static int* PrefixScores = NULL;
+ static int* PrefixMasses = NULL;
+ int PrefixStartPos;
+ int PrefixEndPos;
+ int PRM;
+ int PRMBin;
+ int Score;
+ int MatchScore;
+ int MaxPrefix;
+ int SkipBases;
+ int ArrayIndex;
+ int PrefixLength;
+ int MaxPrefixLength;
+ int MinPossibleDelta = max(-130000,GlobalOptions->MinPTMDelta*MASS_SCALE);
+ int MaxPossibleDelta = GlobalOptions->MaxPTMDelta * MASS_SCALE;
+ int MaxMass = MatchMass + GlobalOptions->ParentMassEpsilon + MaxPossibleDelta;
+ int Delta;
+ int DeltaBin;
+ MassDeltaNode* DeltaNode;
+ char AA;
+ int AAIndex;
+ int AbsSkew;
+ int AAMass = 0;
+ int Skew;
+ int SuffixEndPos;
+ int SuffixStartPos;
+ int MatchScoreWithDelta;
+ Peptide* Match;
+ MSSpectrum* Spectrum = Info->Spectrum;
+
+ //printf("SeekMatch1PTM:\n");
+ //printf("MinPTM: %d\n",MinPossibleDelta);
+ //printf("MaxPTM: %d\n",MaxPossibleDelta);
+ //
+ if (!BufferLen)
+ {
+ return 1;
+ }
+ if (!PrefixScores)
+ {
+ PrefixScores = (int*)calloc(512 * MAX_1MOD_PEPTIDE_LEN, sizeof(int));
+ PrefixMasses = (int*)calloc(512 * MAX_1MOD_PEPTIDE_LEN, sizeof(int));
+ }
+ // The prefix of our peptide will extend from
+ // PrefixStartPos...PrefixEndPos, NOT including PrefixEndPos
+
+ // By default, we cover up to 450 bases in each call. If we hit the end of a protein, we stop there
+ // and handle the next protein in the next call to this function.
+ SkipBases = min(BufferLen, 450);
+ for (PrefixStartPos = 0; PrefixStartPos < SkipBases; PrefixStartPos++)
+ {
+ if (Buffer[PrefixStartPos]=='*')
+ {
+ SkipBases = PrefixStartPos + 1;
+ break;
+ }
+ PRM = 0;
+ Score = 0;
+ MaxPrefix = min(SkipBases, PrefixStartPos + MAX_1MOD_PEPTIDE_LEN);
+ for (PrefixEndPos = PrefixStartPos; PrefixEndPos < MaxPrefix; PrefixEndPos++)
+ {
+ ArrayIndex = PrefixEndPos * MAX_1MOD_PEPTIDE_LEN + (PrefixEndPos - PrefixStartPos);
+ if (ArrayIndex < 0 || ArrayIndex > 512 * MAX_1MOD_PEPTIDE_LEN)
+ {
+ printf("** error: Array index for prefix is %d\n", ArrayIndex);
+ }
+ PrefixScores[ArrayIndex] = Score;
+ PrefixMasses[ArrayIndex] = PRM;
+ //printf("%d: Prefix %d-%d score %d PRM %d\n", ArrayIndex, PrefixStartPos, PrefixEndPos, Score, PRM); //Verbose1Mod
+ if (PRM > MaxMass)
+ {
+ break;
+ }
+ AAMass = PeptideMass[Buffer[PrefixEndPos]];
+ if (AAMass)
+ {
+ PRM += AAMass;
+ }
+ else
+ {
+ Score = -9999999;
+ }
+ if (PRM > MaxMass)
+ {
+ // Modless prefix is too long!
+ Score = -9999999;
+ break;
+ }
+ else
+ {
+ PRMBin = (PRM + 50) / 100;
+ if (PRMBin >= 0 && PRMBin < Tweak->PRMScoreMax)
+ {
+ Score += Tweak->PRMScores[PRMBin];
+ }
+ }
+ }
+ }
+ // Now that the prefix table's complete, consider all possible suffixes.
+ // The suffix of our peptide will extend from SuffixStartPos...SuffixEndPos, INCLUSIVE.
+ for (SuffixEndPos = SkipBases - 1; SuffixEndPos > 0; SuffixEndPos--)
+ {
+ //printf("Try ending at pos'n %d (%c)\n", SuffixEndPos, Buffer[SuffixEndPos]); //Verbose1Mod
+ PRM = MatchMass;
+ Score = 0;
+ if (Spectrum->Node->LastMatch)
+ {
+ ScoreToBeat = Spectrum->Node->LastMatch->InitialScore;
+ }
+ for (SuffixStartPos = SuffixEndPos; SuffixStartPos >= 0; SuffixStartPos--)
+ {
+ // Grow the C-terminal suffix by one residue:
+ AA = Buffer[SuffixStartPos];
+ AAIndex = AA - 'A';
+ AAMass = PeptideMass[AA];
+ if (AAMass)
+ {
+ PRM -= AAMass;
+ }
+ else
+ {
+ break; // bogus AA encountered
+ }
+
+ //NEC_DEBUG
+ //printf("Suffix %d (%c) to %d (%c), mass remaining %.2f\n", SuffixStartPos, Buffer[SuffixStartPos], SuffixEndPos, Buffer[SuffixEndPos], PRM / 1000.0); //Verbose1Mod
+ //if (PRM < -GlobalOptions->ParentMassEpsilon)
+ if(PRM < MinPossibleDelta)
+ {
+
+ break; // modless suffix is too long!
+ }
+ // Try to hook up to a prefix:
+ ArrayIndex = SuffixStartPos * MAX_1MOD_PEPTIDE_LEN;
+ MaxPrefixLength = min(MAX_1MOD_PEPTIDE_LEN, SuffixStartPos + 1);
+ for (PrefixLength = 0; PrefixLength < MaxPrefixLength; PrefixLength++)
+ {
+ Delta = PRM - PrefixMasses[ArrayIndex];
+ if (Delta < MinPossibleDelta)
+ {
+ break;
+ }
+ if (Delta < MaxPossibleDelta)
+ {
+ MatchScore = Score + PrefixScores[ArrayIndex];
+ //printf("Prefix %d-%d, suffix %d-%d, delta %.2f, score %d\n", SuffixStartPos - PrefixLength, SuffixStartPos, SuffixStartPos, SuffixEndPos, Delta / (float)DALTON, MatchScore);
+ if (MatchScore > ScoreToBeat)
+ {
+ // Look for the delta that can hook these together:
+ if (abs(Delta) < GlobalOptions->ParentMassEpsilon)
+ {
+ Match = Add1ModMatch(Info, Buffer, BufferLen, SuffixEndPos, SuffixStartPos, PrefixLength, MatchScore, NULL, Tweak, FilePos, ExtraPrefixChar);
+ // After every call that adds a match, ScoreToBeat MUST be updated.
+ ScoreToBeat = Spectrum->Node->LastMatch->InitialScore;
+ }
+ else
+ {
+ ROUND_MASS_TO_DELTA_BIN(Delta, DeltaBin);
+ DeltaNode = MassDeltaByMass[AAIndex][DeltaBin];
+ while (DeltaNode)
+ {
+ Skew = Delta - DeltaNode->Delta->RealDelta;
+ AbsSkew = abs(Skew);
+ if (AbsSkew <= GlobalOptions->Epsilon)
+ {
+ MatchScoreWithDelta = MatchScore + (int)(DeltaNode->Delta->Score * DELTA_SCORE_SCALER);
+ if (MatchScoreWithDelta > ScoreToBeat)
+ {
+ Match = Add1ModMatch(Info, Buffer, BufferLen, SuffixEndPos, SuffixStartPos, PrefixLength, MatchScoreWithDelta, DeltaNode->Delta, Tweak, FilePos, ExtraPrefixChar);
+ ScoreToBeat = Spectrum->Node->LastMatch->InitialScore;
+ }
+ }
+ DeltaNode = DeltaNode->Next;
+ }
+ // If the modification mass was small, then ALSO try the unmodified peptide:
+ if (abs(Delta) < 5 * DALTON)
+ {
+ Add1ModMatch(Info, Buffer, BufferLen, SuffixEndPos, SuffixStartPos, PrefixLength, MatchScore, NULL, Tweak, FilePos, ExtraPrefixChar);
+ ScoreToBeat = Spectrum->Node->LastMatch->InitialScore;
+
+ }
+ }
+ }
+ }
+ ArrayIndex++;
+ }
+ // If we didn't just link up, then accumulate some score:
+ if (PRM >= 0)
+ {
+ Score += Tweak->PRMScores[MASS_TO_BIN(PRM)];
+ //printf("Accumulate score %.2f from PRM %d\n", Tweak->PRMScores[MASS_TO_BIN(PRM)], PRM);
+ }
+ } // SuffixStartPos loop
+ } // SuffixEndPos loop
+ return SkipBases;
+
+}
+
+int* PTMScoreTable = NULL;
+int* PrevCellIndexTable = NULL;
+MassDelta** DeltaTable = NULL;
+int* MassDeltaTable = NULL;
+
+#define DB_BUFFER_SIZE 1024000
+#define DB_SHUNT_BOUNDARY 900000
+#define DB_READ_BOUNDARY 900000
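+
+// Buffering scheme (summary of the loop below): the database is streamed through a ~1 MB buffer.
+// Once BufferPos passes DB_SHUNT_BOUNDARY, the unread tail is memmove'd to the front of the buffer,
+// and whenever the valid data ends before DB_READ_BOUNDARY (and we're not at EOF), more bytes are
+// read from disk. This keeps a long stretch of sequence in memory so candidates never straddle a refill.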
+
+// Search a database, using *no* tag-based filtering at all. This is much slower than searching
+// with tag-based filters, but also more sensitive, particularly since we haven't handled the
+// problem of tagging in the presence of mutations.
+void SearchDatabaseTagless(SearchInfo* Info, int MaxMods, int VerboseFlag, SpectrumTweak* Tweak)
+{
+ static char* Buffer = NULL; // will be big
+ int IsEOF = 0;
+ FILE* DBFile;
+ int FilePos = 0;
+ int BufferEnd = 0;
+ int BufferPos = 0;
+ int BytesRead;
+ MSSpectrum* Spectrum = Info->Spectrum;
+ // We require all peptide candidates to be long enough to be meaningful (at least 500 Da, or large
+ // enough to equal the parent mass once the maximum mod mass is applied, whichever is larger)
+ // We also stop considering peptide candidates after they are too long to match
+ // the spectrum (even after deducting some mass due to modifications)
+ int ParentResidueMass = Spectrum->ParentMass - PARENT_MASS_BOOST;
+ int ScoreToBeat = -999999;
+ int SkipBases;
+ char PrefixChar;
+ //
+ if (!Buffer)
+ {
+ Buffer = (char*)malloc(sizeof(char) * DB_BUFFER_SIZE);
+ if (!Buffer)
+ {
+ printf("** ERROR: Unable to allocate buffer in SearchDatabaseTagless()!\n");
+ return;
+ }
+ }
+
+ // Ensure that the PRM scores of this spectrum are set, so that we can score candidates:
+ if (!Tweak->PRMScores)
+ {
+ if (VerboseFlag)
+ {
+ printf("[V] SetPRMScores()\n");
+ }
+ SetSpectrumPRMScores(Spectrum, Tweak);
+ }
+ // Open the database, and start reading:
+
+ DBFile = Info->DB->DBFile;
+
+ //DebugPrintPRMScores(Spectrum, Tweak);
+ Info->RecordNumber = 0;
+ while (1)
+ {
+ if (VerboseFlag)
+ {
+ printf("[V] Bufferpos %d BufferEnd %d IsEOF %d FilePos %d Record# %d\n", BufferPos, BufferEnd, IsEOF, FilePos, Info->RecordNumber);
+ }
+
+ // Shunt bases toward front of buffer:
+ if (BufferPos > DB_SHUNT_BOUNDARY)
+ {
+ memmove(Buffer, Buffer + BufferPos, DB_BUFFER_SIZE - BufferPos);
+ BufferEnd = DB_BUFFER_SIZE - BufferPos;
+ BufferPos = 0;
+ }
+ // Read more bases:
+ if (!IsEOF && BufferEnd < DB_READ_BOUNDARY)
+ {
+ BytesRead = ReadBinary(Buffer + BufferEnd, sizeof(char), DB_BUFFER_SIZE - BufferEnd, DBFile);
+ BufferEnd += BytesRead;
+ if (!BytesRead)
+ {
+ IsEOF = 1;
+ }
+ }
+ if (BufferPos >= BufferEnd) // hit the end of the database
+ {
+ break;
+ }
+
+ // If this isn't an amino acid, skip onward:
+ if (!PeptideMass[Buffer[BufferPos]])
+ {
+ BufferPos++;
+ FilePos++;
+ continue;
+ }
+ if (Buffer[BufferPos]=='*')
+ {
+ BufferPos++;
+ FilePos++;
+ Info->RecordNumber++;
+ continue;
+ }
+ // Try to find peptide matches from a prefix of Buffer[BufferPos:]
+ if (MaxMods > 2)
+ {
+ // The SLOW way!
+ SkipBases = MSAlignmentGeneral(Info, Buffer + BufferPos, BufferEnd - BufferPos, ParentResidueMass,
+ MaxMods, ScoreToBeat, FilePos, Tweak);
+ if (VerboseFlag)
+ {
+ printf("[V] General() return. SkipBases %d\n", SkipBases);
+ }
+
+ }
+ else if (MaxMods > 1)
+ {
+ // Extend into a match, possibly up to 2 mods.
+ SkipBases = ExtendMatchRightwardDuo(Info, Buffer + BufferPos, BufferEnd - BufferPos, ParentResidueMass,
+ min(2, MaxMods), ScoreToBeat, FilePos, Tweak);
+ }
+ else
+ {
+ // Extend into a match using at most one mod.
+ if (BufferPos)
+ {
+ PrefixChar = Buffer[BufferPos - 1];
+ }
+ else
+ {
+ PrefixChar = '*';
+ }
+ SkipBases = SeekMatch1PTM(Info, Buffer + BufferPos, BufferEnd - BufferPos, ParentResidueMass, ScoreToBeat, Tweak, FilePos, PrefixChar);
+ }
+ // RightExtensionCount is set to -1 if there's an error and this spectrum can't be searched.
+ if (RightExtensionCount < 0)
+ {
+ break;
+ }
+ BufferPos += SkipBases;
+ FilePos += SkipBases;
+ if (Buffer[BufferPos-1] == '*')
+ {
+ Info->RecordNumber++;
+ }
+ if (Spectrum->Node->MatchCount == GlobalOptions->StoreMatchCount)
+ {
+ ScoreToBeat = Spectrum->Node->LastMatch->InitialScore;
+ }
+
+ }
+ SafeFree(Buffer);
+ Buffer = NULL;
+
+ // At this point, we have a list of candidates. They've been quick-scored, but we can sort them better if
+ // we score them more meticulously. The *caller* will call ScoreSpectralMatches(Spectrum) to re-score them
+ // (we could do it here, but that would be wrong in the MultiCharge case)
+ //fclose(DBFile);
+}
+
+void DebugPrintMatch(Peptide* Match)
+{
+ int ModIndex;
+ char* Amino;
+ int Mass = 0;
+ printf("Match '%s' ", Match->Bases);
+ // Show the mods:
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Match->AminoIndex[ModIndex] < 0)
+ {
+ break;
+ }
+ printf(" %c%d:%s(%.2f)", Match->Bases[Match->AminoIndex[ModIndex]], Match->AminoIndex[ModIndex],
+ Match->ModType[ModIndex]->Name, Match->ModType[ModIndex]->RealDelta/100.0);
+ Mass += Match->ModType[ModIndex]->RealDelta;
+ }
+ for (Amino = Match->Bases; *Amino; Amino++)
+ {
+ Mass += PeptideMass[*Amino];
+ }
+ printf(" mass %.2f score %d:%.3f dcn%.2f dcno%.2f\n", Mass/(float)MASS_SCALE, Match->InitialScore, Match->MatchQualityScore, Match->DeltaCN, Match->DeltaCNOther);
+}
+
+// Print out a list of matches for the spectrum node (Spectrum->FirstMatch through Spectrum->LastMatch).
+void DebugPrintMatchList(SpectrumNode* Spectrum)
+{
+ Peptide* Match;
+ //
+ printf("Spectrum has %d matches:\n", Spectrum->MatchCount);
+ for (Match = Spectrum->FirstMatch; Match; Match = Match->Next)
+ {
+ DebugPrintMatch(Match);
+ }
+}
+
+// Re-score spectral matches. The matches in the list Spectrum->FirstMatch have been
+// quick-scored, but we can sort them better if we score them more meticulously.
+// Let's do so, and re-sort the list based on the new scores.
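+// Depending on the MQSCORE_USE_SVM compile-time flag, the new match quality score comes from either an
+// SVM model (SVMComputeMQScore) or an LDA model (LDAComputeMQScore); each match is then handed back to
+// StoreSpectralMatch() to rebuild the spectrum's match list with the updated scores.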
+void MQScoreSpectralMatches(SpectrumNode* Node)
+{
+ Peptide* PrevMatch;
+ Peptide* Match;
+ int OldScore;
+ int VerboseFlag = 0;
+ MSSpectrum* Spectrum = Node->Spectrum;
+ //
+ if (!Node->FirstMatch)
+ {
+ return; // that was easy - we scored 0 of 0 :)
+ }
+ PrevMatch = Node->FirstMatch;
+
+ Match = PrevMatch->Next;
+ Node->FirstMatch = NULL;
+ Node->LastMatch = NULL;
+ Node->MatchCount = 0;
+ while (PrevMatch)
+ {
+ PrevMatch->Prev = NULL;
+ PrevMatch->Next = NULL;
+ OldScore = PrevMatch->InitialScore;
+
+ ComputeMQScoreFeatures(Spectrum, PrevMatch, PrevMatch->ScoreFeatures, VerboseFlag);
+#ifdef MQSCORE_USE_SVM
+ PrevMatch->MatchQualityScore = SVMComputeMQScore(Spectrum, PrevMatch, PrevMatch->ScoreFeatures);
+#else
+ PrevMatch->MatchQualityScore = LDAComputeMQScore(Spectrum, PrevMatch, PrevMatch->ScoreFeatures);
+#endif
+ StoreSpectralMatch(Spectrum, PrevMatch, strlen(PrevMatch->Bases), 1);
+ PrevMatch = Match;
+ if (!Match)
+ {
+ break;
+ }
+ Match = Match->Next;
+ }
+ //SetMatchDeltaCN(Spectrum);
+}
+
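+// Prune the tag graph down to roughly 500 nodes: B and Y nodes whose score is at or below the cutoff
+// drawn from the sorted score array (NodeScores[498]) are removed, along with any edges pointing to
+// them; surviving nodes are then renumbered and the node index is rebuilt via TagGraphBuildNodeIndex().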
+void PrunePoorGraphNodes(TagGraph* Graph)
+{
+ int NodeIndex = 0;
+ float* NodeScores;
+ float CutoffNodeScore;
+ TagGraphNode* Node;
+ TagGraphNode* NextNode = NULL;
+ TagGraphEdge* Edge;
+ TagGraphEdge* NextEdge = NULL;
+ TagGraphEdge* PrevEdge = NULL;
+ //
+ // Write the node scores to array NodeScores, sort them, and select the cutoff score.
+ NodeScores = (float*)malloc(sizeof(float) * Graph->NodeCount);
+
+ for (NodeIndex = 0, Node = Graph->FirstNode; Node; Node = Node->Next,NodeIndex++)
+ {
+ NodeScores[NodeIndex] = Node->Score;
+ }
+ qsort(NodeScores, NodeIndex, sizeof(float), (QSortCompare)CompareFloats);
+ CutoffNodeScore = NodeScores[498]; // Allow two endpoint nodes to survive
+ SafeFree(NodeScores);
+ // Eliminate every node whose score is <= the cutoff. Start by eliminating all EDGES to such nodes!
+ for (Node = Graph->FirstNode; Node; Node = Node->Next)
+ {
+ PrevEdge = NULL;
+ for (Edge= Node->FirstEdge; Edge; Edge = NextEdge)
+ {
+ NextEdge = Edge->Next;
+ if (Edge->ToNode->Score <= CutoffNodeScore && (Edge->ToNode->NodeType == evGraphNodeB || Edge->ToNode->NodeType == evGraphNodeY))
+ {
+ // Free this edge:
+ if (PrevEdge)
+ {
+ PrevEdge->Next = Edge->Next;
+ }
+ if (Node->FirstEdge == Edge)
+ {
+ Node->FirstEdge = Edge->Next;
+ }
+ if (Node->LastEdge == Edge)
+ {
+ Node->LastEdge = PrevEdge;
+ }
+ SafeFree(Edge);
+ }
+ else
+ {
+ PrevEdge = Edge;
+ }
+ }
+ }
+ // Now free the nodes themselves:
+ for (Node = Graph->FirstNode; Node; Node = NextNode)
+ {
+ NextNode = Node->Next;
+ if (Node->Score <= CutoffNodeScore && (Node->NodeType == evGraphNodeB || Node->NodeType == evGraphNodeY))
+ {
+ if (Node->Prev)
+ {
+ Node->Prev->Next = Node->Next;
+ }
+ if (Node->Next)
+ {
+ Node->Next->Prev = Node->Prev;
+ }
+ if (Graph->FirstNode == Node)
+ {
+ Graph->FirstNode = Node->Next;
+ }
+ if (Graph->LastNode == Node)
+ {
+ Graph->LastNode = Node->Prev;
+ }
+ FreeTagGraphNode(Node);
+ Graph->NodeCount--;
+ }
+ }
+ if (Graph->NodeCount > 500)
+ {
+ printf("* ERROR: Failed to prune excess graph nodes!\n");
+ }
+ // Fix node numbering:
+ for (NodeIndex = 0, Node = Graph->FirstNode; Node; Node = Node->Next, NodeIndex++)
+ {
+ Node->Index = NodeIndex;
+ }
+ // And now, rebuild the node index:
+ TagGraphBuildNodeIndex(Graph);
+}
+
+// Called after populating the tag graph with nodes.
+// Now we add edges between any two nodes that can be linked by a JUMP (an amino acid, or
+// an amino acid plus a decoration, or two amino acids plus 0-1 decorations)
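+//
+// Back-edge bookkeeping (summary): each node keeps three lookup arrays indexed by the amino acid(s)
+// consumed by the jump, with AA1/AA2/AA3 encoded as (letter - 'A'):
+//     Node->BackEdge[AA1]                              -- single-residue jumps
+//     Node->BackEdgeDouble[AA1 * AMINO_ACIDS + AA2]    -- two-residue jumps (HalfMass marks the midpoint)
+//     Node->BackEdgeTriple[AA1 * 676 + AA2 * 26 + AA3] -- three-residue jumps (HalfMass, HalfMass2)
+// Edges are drawn from a single preallocated BackEdgeBuffer and chained via Edge->Next.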
+void TagGraphPopulateBackEdges(TagGraph* Graph)
+{
+ TagGraphNode* Node;
+ TagGraphNode* OtherNode;
+ TagGraphBackEdge* Edge;
+ TagGraphBackEdge* OldEdge;
+ int AA1;
+ int AA2;
+ int AA3;
+ int Mass;
+ int AA1Mass;
+ int AA2Mass;
+ int AA3Mass;
+ int Skew;
+ int AbsSkew;
+ int NextBackEdgeIndex = 0;
+ int BackEdgeBufferSize;
+ //
+
+ if (!Graph->NodeIndex)
+ {
+ TagGraphBuildNodeIndex(Graph);
+ }
+ SafeFree(Graph->BackEdgeBuffer);
+ BackEdgeBufferSize = min(5000000, 8420 * Graph->NodeCount);
+ Graph->BackEdgeBuffer = (TagGraphBackEdge*)calloc(BackEdgeBufferSize, sizeof(TagGraphBackEdge));
+ if (!Graph->BackEdgeBuffer)
+ {
+ printf("*** ERROR: Unable to allocate BackEdgeBuffer!\n");
+ fflush(stdout);
+ }
+ // NB: We can't easily realloc the BackEdgeBuffer, because there are many many pointers into it. If
+ // we overflow the buffer, we just complain and then bail out to avoid crashing.
+
+ // Ensure that there aren't too many PRMNodes. The array in ExtendMatchRightwardDuo assumes that there
+ // are at most 500. That should be *plenty*, since at most 20-30 of them can be true.
+ if (Graph->NodeCount > 500)
+ {
+ PrunePoorGraphNodes(Graph);
+ }
+
+ for (Node = Graph->FirstNode; Node; Node = Node->Next)
+ {
+ Node->BackEdge = (TagGraphBackEdge**)calloc(AMINO_ACIDS, sizeof(TagGraphBackEdge*));
+ Node->BackEdgeDouble = (TagGraphBackEdge**)calloc(AMINO_ACIDS*AMINO_ACIDS, sizeof(TagGraphBackEdge*));
+ Node->BackEdgeTriple = (TagGraphBackEdge**)calloc(AMINO_ACIDS*AMINO_ACIDS*AMINO_ACIDS, sizeof(TagGraphBackEdge*));
+ for (AA1 = 0; AA1 < AMINO_ACIDS; AA1++)
+ {
+ AA1Mass = PeptideMass[AA1 + 'A'];
+ if (!AA1Mass)
+ {
+ continue;
+ }
+ // Try to jump back by this amino acid's mass:
+ Mass = Node->Mass - AA1Mass;
+ if (Mass < -GlobalOptions->Epsilon)
+ {
+ continue;
+ }
+ Mass = max(Mass, 0);
+ OtherNode = Graph->NodeIndex[Mass / MASS_SCALE];
+ while (OtherNode)
+ {
+ Skew = OtherNode->Mass - Mass;
+ if (Skew > GlobalOptions->Epsilon)
+ {
+ break;
+ }
+ if (Skew < -GlobalOptions->Epsilon)
+ {
+ OtherNode = OtherNode->Next;
+ continue;
+ }
+ AbsSkew = abs(Skew) / 10;
+ Edge = Graph->BackEdgeBuffer + NextBackEdgeIndex;
+ NextBackEdgeIndex++;
+ if (NextBackEdgeIndex >= BackEdgeBufferSize)
+ {
+ printf("** Too many BackEdges for buffer - bailing out!\n");
+ return;
+ }
+ //Edge = (TagGraphBackEdge*)calloc(1, sizeof(TagGraphBackEdge));
+ Edge->FromNode = Node;
+ Edge->ToNode = OtherNode;
+ Edge->Skew = Skew;
+ Edge->Next = NULL;
+ if (AbsSkew > g_SkewPenaltyMax)
+ {
+ Edge->Score = g_SkewPenalty[g_SkewPenaltyMax];
+ }
+ else
+ {
+ Edge->Score = g_SkewPenalty[AbsSkew];
+ }
+ OldEdge = Node->BackEdge[AA1];
+ if (!OldEdge)
+ {
+ Node->BackEdge[AA1] = Edge;
+ }
+ else
+ {
+ while (OldEdge->Next)
+ {
+ OldEdge = OldEdge->Next;
+ }
+ OldEdge->Next = Edge;
+ }
+ OtherNode = OtherNode->Next;
+ }
+
+ // Try to jump back by a pair of amino acids:
+ for (AA2 = 0; AA2 < AMINO_ACIDS; AA2++)
+ {
+ AA2Mass = PeptideMass[AA2 + 'A'];
+ if (!AA2Mass)
+ {
+ continue;
+ }
+
+ Mass = Node->Mass - AA1Mass - AA2Mass;
+ if (Mass < -GlobalOptions->Epsilon)
+ {
+ continue;
+ }
+ Mass = max(Mass, 0);
+ OtherNode = Graph->NodeIndex[Mass / MASS_SCALE];
+ while (OtherNode)
+ {
+ Skew = OtherNode->Mass - Mass;
+ if (Skew > GlobalOptions->Epsilon)
+ {
+ break;
+ }
+ if (Skew < -GlobalOptions->Epsilon)
+ {
+ OtherNode = OtherNode->Next;
+ continue;
+ }
+ AbsSkew = abs(Skew) / 10;
+
+ Edge = Graph->BackEdgeBuffer + NextBackEdgeIndex;
+ NextBackEdgeIndex++;
+ if (NextBackEdgeIndex >= BackEdgeBufferSize)
+ {
+ printf("** Too many BackEdges for buffer - bailing out!\n");
+ return;
+ }
+
+ //Edge = (TagGraphBackEdge*)calloc(1, sizeof(TagGraphBackEdge));
+ Edge->FromNode = Node;
+ Edge->ToNode = OtherNode;
+ Edge->Skew = Skew;
+ Edge->Next = NULL;
+ Edge->HalfMass = Node->Mass - AA1Mass;
+ if (AbsSkew > g_SkewPenaltyMax)
+ {
+ Edge->Score = g_SkewPenalty[g_SkewPenaltyMax];
+ }
+ else
+ {
+ Edge->Score = g_SkewPenalty[AbsSkew];
+ }
+ OldEdge = Node->BackEdgeDouble[AA1 * AMINO_ACIDS + AA2];
+ if (!OldEdge)
+ {
+ Node->BackEdgeDouble[AA1 * AMINO_ACIDS + AA2] = Edge;
+ }
+ else
+ {
+ while (OldEdge->Next)
+ {
+ OldEdge = OldEdge->Next;
+ }
+ OldEdge->Next = Edge;
+ }
+ OtherNode = OtherNode->Next;
+ }
+
+ // Triple-jump (three amino acids)
+ for (AA3 = 0; AA3 < AMINO_ACIDS; AA3++)
+ {
+ AA3Mass = PeptideMass[AA3+'A'];
+ if (!AA3Mass)
+ {
+ continue;
+ }
+
+ Mass = Node->Mass - AA1Mass - AA2Mass - AA3Mass;
+ if (Mass < -GlobalOptions->Epsilon)
+ {
+ continue;
+ }
+ Mass = max(Mass, 0);
+ OtherNode = Graph->NodeIndex[Mass / MASS_SCALE];
+ while (OtherNode)
+ {
+ Skew = OtherNode->Mass - Mass;
+ if (Skew > GlobalOptions->Epsilon)
+ {
+ break;
+ }
+ if (Skew < -GlobalOptions->Epsilon)
+ {
+ OtherNode = OtherNode->Next;
+ continue;
+ }
+ AbsSkew = abs(Skew) / 10;
+
+ Edge = Graph->BackEdgeBuffer + NextBackEdgeIndex;
+ NextBackEdgeIndex++;
+ if (NextBackEdgeIndex >= BackEdgeBufferSize)
+ {
+ printf("** Too many BackEdges for buffer - bailing out!\n");
+ return;
+ }
+
+ //Edge = (TagGraphBackEdge*)calloc(1, sizeof(TagGraphBackEdge));
+ Edge->FromNode = Node;
+ Edge->ToNode = OtherNode;
+ Edge->Skew = Skew;
+ Edge->Next = NULL;
+ Edge->HalfMass = Node->Mass - AA1Mass;
+ Edge->HalfMass2 = Node->Mass - AA1Mass - AA2Mass;
+ if (AbsSkew > g_SkewPenaltyMax)
+ {
+ Edge->Score = g_SkewPenalty[g_SkewPenaltyMax];
+ }
+ else
+ {
+ Edge->Score = g_SkewPenalty[AbsSkew];
+ }
+ OldEdge = Node->BackEdgeTriple[AA1*676 + AA2*26 + AA3];
+ if (!OldEdge)
+ {
+ Node->BackEdgeTriple[AA1*676 + AA2*26 + AA3] = Edge;
+ }
+ else
+ {
+ while (OldEdge->Next)
+ {
+ OldEdge = OldEdge->Next;
+ }
+ OldEdge->Next = Edge;
+ }
+ OtherNode = OtherNode->Next;
+ }
+ }
+ }
+ }
+ }
+ return;
+}
+
+// The maximum dimensions of the Duo Table are 512 rows (for amino acids) and 500 columns (for the nodes).
+#define MAX_ROWS 512
+#define MAX_NODES 500
+
+// Find the end of this peptide block. Returns 1 if the block is valid, 0 if we
+// needn't bother searching this block at all.
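+// On return, *pMaxAmino is the number of usable residues in this block (the block stops at a record
+// boundary '*', a bogus residue, or after MAX_ROWS characters), and *pReturnAmino is how far the caller
+// should advance its database pointer before the next call.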
+int FindPeptideBlockEnd(MSSpectrum* Spectrum, char* Buffer,int BufferEnd, int* pMaxAmino, int* pReturnAmino)
+{
+ int AccumMass = 0;
+ int AminoIndex;
+ char Amino;
+ //
+ *pMaxAmino = MAX_ROWS - 1; // default;
+ *pReturnAmino = -1; // uninitialized
+ // Iterate over the amino acids, keeping track of the total mass (AccumMass), and watching for
+ // record boundaries.
+ for (AminoIndex = 0; AminoIndex < MAX_ROWS; AminoIndex++)
+ {
+ if (AminoIndex >= BufferEnd)
+ {
+ *pMaxAmino = AminoIndex;
+ *pReturnAmino = *pMaxAmino;
+ break;
+ }
+ Amino = Buffer[AminoIndex];
+ if (Amino == '*')
+ {
+ // No peptide can span record boundaries, so we'll stop the block here.
+ *pMaxAmino = AminoIndex;// + 1;
+ *pReturnAmino = *pMaxAmino;
+ break;
+ }
+ if (!PeptideMass[Amino])
+ {
+ // A bogus amino acid in the database. Stop the block here.
+ *pMaxAmino = AminoIndex;// + 1;
+ *pReturnAmino = *pMaxAmino;
+ break;
+ }
+ AccumMass += PeptideMass[Amino];
+ if (AminoIndex >= MAX_ROWS-1)
+ {
+ // Our block is as large as it can get.
+ *pMaxAmino = AminoIndex-1;
+ *pReturnAmino = *pMaxAmino - 20;
+ break;
+ }
+ }
+ if (*pMaxAmino < 5)
+ {
+ // Not enough amino acids to make a reasonable peptide candidate, so just exit.
+ // (The longest we could get is 4aa, which is too small)
+ *pReturnAmino = *pMaxAmino;
+ return 0;
+ }
+ // Check to see whether the amino acids we've got are large enough - with PTMs included - to match the target:
+ AccumMass += GlobalOptions->MaxPTMDelta*100*2 + PARENT_MASS_BOOST + GlobalOptions->ParentMassEpsilon;
+ if (AccumMass < Spectrum->ParentMass)
+ {
+ // There's not enough peptide sequence left to match our target.
+ *pReturnAmino = *pMaxAmino;
+ return 0;
+ }
+ if (*pReturnAmino < 0)
+ {
+ // Shift forward by most of the length of the block. (Leave some overlap, because a valid peptide
+ // may start near the end of block #1)
+ *pReturnAmino = max(1, *pMaxAmino - 20);
+ }
+ return 1;
+}
+
+void DebugPrintPrefixSuffixTable(FILE* TableFile, char* Buffer, int MaxAmino, int* ScoreTable, int* MassTable)
+{
+ int AminoIndexI;
+ int AminoIndexJ;
+ int CellIndex;
+
+ // Header line:
+ fprintf(TableFile, "< >\t");
+ for (AminoIndexJ = 0; AminoIndexJ <= MaxAmino; AminoIndexJ++)
+ {
+ if (AminoIndexJ > 0)
+ {
+ fprintf(TableFile, "[%d %c]\t", AminoIndexJ, Buffer[AminoIndexJ-1]);
+ }
+ else
+ {
+ fprintf(TableFile, "[0]\t");
+ }
+ }
+ fprintf(TableFile, "\n");
+ // Other lines:
+ for (AminoIndexI = 0; AminoIndexI <= MaxAmino; AminoIndexI++)
+ {
+ if (AminoIndexI > 0)
+ {
+ fprintf(TableFile, "[%d %c]\t", AminoIndexI, Buffer[AminoIndexI-1]);
+ }
+ else
+ {
+ fprintf(TableFile, "[0]\t");
+ }
+
+ for (AminoIndexJ = 0; AminoIndexJ <= MaxAmino; AminoIndexJ++)
+ {
+ if (AminoIndexJ < AminoIndexI)
+ {
+ fprintf(TableFile, "\t");
+ continue;
+ }
+ CellIndex = AminoIndexI*MAX_ROWS + AminoIndexJ;
+ fprintf(TableFile, "%d : %d\t", MassTable[CellIndex], ScoreTable[CellIndex]);
+ }
+ fprintf(TableFile, "\n");
+ }
+}
+
+void DebugPrintPrefixSuffixTables(int MaxAmino, char* Buffer,
+ int* PrefixTable, int* SuffixTable, int* PrefixMassTable, int* SuffixMassTable)
+{
+ FILE* TableFile;
+ TableFile = fopen("PrefixSuffix.xls", "w");
+ if (!TableFile)
+ {
+ return;
+ }
+ fprintf(TableFile, "Prefix table:\n");
+ DebugPrintPrefixSuffixTable(TableFile, Buffer, MaxAmino, PrefixTable, PrefixMassTable);
+ fprintf(TableFile, "\n\nSuffix table:\n");
+ DebugPrintPrefixSuffixTable(TableFile, Buffer, MaxAmino, SuffixTable, SuffixMassTable);
+ fclose(TableFile);
+}
+
+// Fill in the score tables PrefixTable and SuffixTable, plus the mass tables PrefixMassTable and SuffixMassTable.
+// The entry PrefixTable[i, j] is the score that one obtains by matching Buffer[i..j] against the spectrum as
+// a prefix. PrefixTable[i,i] uses one PRM score, PrefixTable[i, i + 1] uses two PRM scores, and so on.
+// Most candidate peptides will have a PrefixTable entry as part of their final score.
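+//
+// Indexing note (illustrative): tables are row-major with MAX_ROWS columns, and residue indices are
+// 1-based (residue i is Buffer[i-1]). PrefixTable[i * MAX_ROWS + j] holds the cumulative PRM score for
+// residues i..j used as an N-terminal prefix, with PrefixMassTable holding their total mass. The suffix
+// tables count down from MatchMass, so SuffixMassTable[i * MAX_ROWS + j] is the PRM just before residue
+// i when residues i..j form the C-terminal suffix.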
+void FillPrefixSuffixTables(MSSpectrum* Spectrum, SpectrumTweak* Tweak, int MatchMass, char* Buffer, int MaxAmino,
+ int* PrefixTable, int* SuffixTable, int* PrefixMassTable, int* SuffixMassTable)
+{
+ int AminoIndexI;
+ int AminoIndexJ;
+ int CellIndex;
+ int PrevCellIndex = 0;
+ int PRM;
+
+ int MaxPRM = Tweak->PRMScoreMax;
+ //
+ // Brute-force initialization. (Note: Don't use memset here, because setting every byte
+ // to -1 (a) is hacky, and (b) makes scores that can easily wrap around to become HUGE POSITIVE)
+ for (CellIndex = 0; CellIndex < MAX_ROWS * MAX_ROWS; CellIndex++)
+ {
+ PrefixTable[CellIndex] = FORBIDDEN_PATH;
+ SuffixTable[CellIndex] = FORBIDDEN_PATH;
+ PrefixMassTable[CellIndex] = -999999;
+ SuffixMassTable[CellIndex] = -999999;
+ }
+
+ for (AminoIndexI = 1; AminoIndexI <= MaxAmino; AminoIndexI++)
+ {
+ for (AminoIndexJ = AminoIndexI; AminoIndexJ <= MaxAmino; AminoIndexJ++)
+ {
+ CellIndex = AminoIndexI * MAX_ROWS + AminoIndexJ;
+ /////////////////////////////
+ // Prefix table:
+ if (AminoIndexJ == AminoIndexI)
+ {
+ PrefixMassTable[CellIndex] = PeptideMass[Buffer[AminoIndexI-1]];
+ }
+ else
+ {
+ PrevCellIndex = AminoIndexI*MAX_ROWS + (AminoIndexJ-1);
+ PrefixMassTable[CellIndex] = PrefixMassTable[PrevCellIndex] + PeptideMass[Buffer[AminoIndexJ-1]];
+ }
+ PRM = MASS_TO_BIN(PrefixMassTable[CellIndex]);
+
+ // Allow PRMs that are slightly too big or small:
+ if (PRM > -PRM_ARRAY_SLACK)
+ {
+ PRM = max(PRM, 0);
+ }
+ if (PRM < MaxPRM + 5)
+ {
+ PRM = min(MaxPRM, PRM);
+ }
+ if (PRM >= 0 && PRM <= MaxPRM)
+ {
+ PrefixTable[CellIndex] = Tweak->PRMScores[PRM];
+ if (AminoIndexJ != AminoIndexI)
+ {
+ PrefixTable[CellIndex] += PrefixTable[PrevCellIndex];
+ }
+ }
+ else
+ {
+ PrefixTable[CellIndex] = FORBIDDEN_PATH;
+ break;
+ }
+ }
+ }
+ for (AminoIndexJ = MaxAmino; AminoIndexJ; AminoIndexJ--)
+ {
+ for (AminoIndexI = AminoIndexJ; AminoIndexI; AminoIndexI--)
+ {
+ CellIndex = AminoIndexI*MAX_ROWS + AminoIndexJ;
+
+ /////////////////////////////
+ // Suffix table:
+ if (AminoIndexJ == AminoIndexI)
+ {
+ SuffixMassTable[CellIndex] = MatchMass - PeptideMass[Buffer[AminoIndexI-1]];
+ }
+ else
+ {
+ PrevCellIndex = (AminoIndexI + 1)*MAX_ROWS + AminoIndexJ;
+ SuffixMassTable[CellIndex] = SuffixMassTable[PrevCellIndex] - PeptideMass[Buffer[AminoIndexI-1]];
+ }
+ PRM = MASS_TO_BIN(SuffixMassTable[CellIndex]);
+ if (PRM > -PRM_ARRAY_SLACK)
+ {
+ PRM = max(PRM, 0);
+ }
+ if (PRM < MaxPRM+5)
+ {
+ PRM = min(MaxPRM, PRM);
+ }
+ if (PRM >= 0 && PRM <= MaxPRM)
+ {
+ SuffixTable[CellIndex] = Tweak->PRMScores[PRM];
+ if (AminoIndexI!=AminoIndexJ)
+ {
+ SuffixTable[CellIndex] += SuffixTable[PrevCellIndex];
+ }
+ }
+ else
+ {
+ SuffixTable[CellIndex] = FORBIDDEN_PATH;
+ break;
+ }
+ }
+ }
+
+}
+
+// Print the Duo table to a file, for debugging. (This is most useful when searching a very small database, since then
+// the table has a manageable height)
+void DebugPrintDTable(MSSpectrum* Spectrum, char* Buffer, int* DTable, MassDelta** DeltaTable, int* PrevCellTable, int MaxAmino)
+{
+ int AminoIndex;
+ int NodeIndex;
+ TagGraphNode* Node;
+ FILE* DTableFile = NULL;
+ int CellIndex;
+ int PrevCellIndex = 0;
+
+ int AminoBlockSize = Spectrum->Graph->NodeCount;
+ //
+ DTableFile = fopen("DTable.xls", "w");
+ if (!DTableFile)
+ {
+ return;
+ }
+ // Header:
+ fprintf(DTableFile, "\t");
+ for (NodeIndex = 0, Node = Spectrum->Graph->FirstNode; Node; Node = Node->Next, NodeIndex++)
+ {
+ fprintf(DTableFile, "%d (%.2f)\t", NodeIndex, Node->Mass / 100.0);
+ }
+ fprintf(DTableFile, "\n");
+ // Body:
+ for (AminoIndex = 0; AminoIndex < MaxAmino; AminoIndex++)
+ {
+ if (AminoIndex)
+ {
+ fprintf(DTableFile, "%c %d\t", Buffer[AminoIndex-1], AminoIndex);
+ }
+ else
+ {
+ fprintf(DTableFile, "%d\t", AminoIndex);
+ }
+ for (NodeIndex = 0, Node = Spectrum->Graph->FirstNode; Node; Node = Node->Next, NodeIndex++)
+ {
+ CellIndex = AminoIndex*AminoBlockSize + NodeIndex;
+ fprintf(DTableFile, "%d ", DTable[CellIndex]);
+ if (DeltaTable[CellIndex])
+ {
+ fprintf(DTableFile, "%s", DeltaTable[CellIndex]->Name);
+ }
+ PrevCellIndex = PrevCellTable[CellIndex];
+ if (PrevCellIndex >0)
+ {
+ fprintf(DTableFile, "-> (%d, %d)", PrevCellIndex/AminoBlockSize, PrevCellIndex%AminoBlockSize);
+ }
+ fprintf(DTableFile, "\t");
+ }
+ fprintf(DTableFile, "\n");
+ }
+
+ fclose(DTableFile);
+}
+
+void DebugPrintGeneralTable(MSSpectrum* Spectrum, char* Buffer, int MaxAmino, int MaxMods,
+ int* ScoreTable, int* PrevCellTable)
+{
+ int AminoIndex;
+ int NodeIndex;
+ int CellIndex;
+ int ModCountIndex;
+ FILE* DebugFile;
+ TagGraphNode* Node;
+ int AminoBlockSize;
+ int ZSize = MaxMods + 1;
+ //
+ DebugFile = fopen("DPTable.txt", "wb");
+ if (!DebugFile)
+ {
+ printf("Unable to open DPTable.txt - not debugprinting.\n");
+ return;
+ }
+ AminoBlockSize = Spectrum->Graph->NodeCount * ZSize;
+ for (ModCountIndex = 0; ModCountIndex < ZSize; ModCountIndex++)
+ {
+ fprintf(DebugFile, "\nZ = %d\n", ModCountIndex);
+ /////////////////////////////////
+ // Column headers:
+ fprintf(DebugFile, "\t");
+ for (AminoIndex = 0; AminoIndex < MaxAmino; AminoIndex++)
+ {
+ fprintf(DebugFile, "%d\t", AminoIndex);
+ }
+ fprintf(DebugFile, "\n");
+ fprintf(DebugFile, "\t\t");
+ for (AminoIndex = 1; AminoIndex < MaxAmino; AminoIndex++)
+ {
+ fprintf(DebugFile, "%c\t", Buffer[AminoIndex - 1]);
+ }
+ fprintf(DebugFile, "\n");
+ /////////////////////////////////
+ // Body:
+ for (NodeIndex = 0, Node = Spectrum->Graph->FirstNode; Node; Node = Node->Next, NodeIndex++)
+ {
+ // Print a ROW for this node:
+ fprintf(DebugFile, "%d (%.2f)\t", NodeIndex, Node->Mass / 100.0);
+ for (AminoIndex = 0; AminoIndex < MaxAmino; AminoIndex++)
+ {
+ CellIndex = AminoIndex * AminoBlockSize + NodeIndex * ZSize + ModCountIndex;
+ fprintf(DebugFile, "%d (c%d)\t", ScoreTable[CellIndex], CellIndex);
+ }
+ fprintf(DebugFile, "\n");
+ }
+ }
+ fclose(DebugFile);
+}
+
+
+static int* PrefixTable = NULL;
+static int* SuffixTable = NULL;
+static int* PrefixMassTable = NULL;
+static int* SuffixMassTable = NULL;
+
+// MS-Alignment algorithm, general case (handles k>2). For most purposes, this code is
+// unacceptably slow and non-selective. But, for completeness, it is implemented.
+// In practice, one should use "mods,1" or "mods,2" and find the corpus of
+// available PTMs that way.
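+//
+// Table layout (summary of the loops below): the d.p. table is a flat array indexed as
+//     CellIndex = AminoIndex * AminoBlockSize + NodeIndex * ZSize + ModCountIndex
+// where ZSize = MaxMods + 1 and AminoBlockSize = Graph->NodeCount * ZSize. Each cell stores the best
+// score for a partial peptide ending at AminoIndex whose cumulative mass lands on the given tag graph
+// node, using at most ModCountIndex PTMs (unused "mod tokens" may simply be dropped). PrevCellTable,
+// DeltaTable, and MassDeltaTable record the path, the PTM placed (if any), and the accumulated mass
+// skew so the winning candidates can be read back off the table.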
+int MSAlignmentGeneral(SearchInfo* Info, char* Buffer, int BufferEnd, int MatchMass,
+ int MaxMods, int ScoreToBeat, int FilePos, SpectrumTweak* Tweak)
+{
+ static int* PrevCellTable = NULL;
+ static int* ScoreTable = NULL;
+ // StartPointPenalty and EndPointPenalty provide a simple protease specificity.
+ int StartPointPenalty[MAX_ROWS];
+ int EndPointPenalty[MAX_ROWS];
+ int Result;
+ int ReturnAmino = -1;
+ int AminoIndex;
+ int AA;
+ int AminoBlock = 0;
+ int NodeIndex;
+ int CellIndex;
+ int SliceIndex;
+ int AA2;
+ int AA3;
+ TagGraphNode* Node;
+ TagGraphBackEdge* Edge;
+ int ModCountIndex;
+ int AminoBlockSize;
+ int ZSize = MaxMods + 1;
+ int MaxAmino;
+ int AAMass;
+ int AA2Mass;
+ int BackEdgeDoubleIndex;
+ int BackEdgeTripleIndex;
+ int PrevCellIndex;
+ int Score;
+ int PRM;
+ TagGraphNode* BackNode;
+ //char MatchBuffer[256];
+ char MatchBufferPos;
+ Peptide* Match;
+ int Mass;
+ int BackNodeIndex;
+ int BackNodeDirection;
+ int Delta;
+ int MinPossibleDelta = -130000; // W->G mutation
+ int MaxPossibleDelta = GlobalOptions->MaxPTMDelta * MASS_SCALE;
+ int DeltaBin;
+ MassDeltaNode* DeltaNode;
+ int Skew;
+ int AbsSkew;
+ int MaxPRM = Tweak->PRMScoreMax - 1;
+ int X;
+ int Y;
+ int NextY;
+ int Z;
+ int MatchPTMCount;
+ int ModIndex;
+ int TokenDropped;
+ MSSpectrum* Spectrum = Info->Spectrum;
+ //
+
+ // Allocate tables, if necessary:
+ if (!PrevCellTable)
+ {
+ PrevCellTable = (int*)malloc(sizeof(int) * MAX_ROWS * MAX_ROWS * (MaxMods + 1));
+ ScoreTable = (int*)malloc(sizeof(int) * MAX_ROWS * MAX_NODES * (MaxMods + 1));
+ DeltaTable = (MassDelta**)malloc(sizeof(MassDelta*) * MAX_ROWS * MAX_NODES * (MaxMods+1));
+ MassDeltaTable = (int*)malloc(sizeof(int) * MAX_ROWS * MAX_NODES * (MaxMods + 1));
+ }
+
+ /////////////////////////////////////////
+ // Find MaxAmino:
+ //VerboseFlag = 1;
+ if (Info->VerboseFlag)
+ {
+ printf("[V] FindPeptideBlockEnd:\n");
+ }
+ Result = FindPeptideBlockEnd(Spectrum, Buffer, BufferEnd, &MaxAmino, &ReturnAmino);
+ if (!Result)
+ {
+ // No extension necessary. Advance the database pointer:
+ return ReturnAmino;
+ }
+ if (Info->VerboseFlag)
+ {
+ printf("[V] FindPeptideBlockEnd: MaxAmino %d returnamino %d\n", MaxAmino, ReturnAmino);
+ }
+
+ // Apply a slap-on-the-wrist for using non-tryptic peptides:
+ for (AminoIndex = 0; AminoIndex < MaxAmino; AminoIndex++)
+ {
+ StartPointPenalty[AminoIndex] = 0;
+ if (AminoIndex)
+ {
+ AA = Buffer[AminoIndex - 1];
+ if (AA != 'R' && AA != 'K' && AA != '*')
+ {
+ StartPointPenalty[AminoIndex] = -500;
+ }
+ }
+
+ EndPointPenalty[AminoIndex] = 0;
+ if (AminoIndex)
+ {
+ AA = Buffer[AminoIndex - 1];
+ if ((AA != 'R' && AA != 'K') && (AminoIndex <= MaxAmino-1 && Buffer[AminoIndex + 1]!='*'))
+ {
+ EndPointPenalty[AminoIndex] = -500;
+ }
+ }
+ }
+ AminoBlockSize = Spectrum->Graph->NodeCount * ZSize;
+ // Loop over the d.p. table to populate scores and path.
+ // Iterate by amino acid index (row), by node (column), then by PTMCount (z).
+ for (AminoIndex = 0; AminoIndex <= MaxAmino; AminoBlock += AminoBlockSize, AminoIndex++)
+ {
+ CellIndex = AminoBlock;
+ AA2 = 0;
+ AA3 = 0;
+ if (AminoIndex)
+ {
+ AA = Buffer[AminoIndex-1] - 'A';
+ AAMass = PeptideMass[Buffer[AminoIndex - 1]];
+ }
+ if (AminoIndex > 1)
+ {
+ AA2 = Buffer[AminoIndex-2] - 'A';
+ AA2Mass = PeptideMass[Buffer[AminoIndex - 2]];
+ BackEdgeDoubleIndex = AA*AMINO_ACIDS + AA2;
+ }
+ if (AminoIndex > 2)
+ {
+ AA3 = Buffer[AminoIndex - 3] - 'A';
+ BackEdgeTripleIndex = AA*676 + AA2*26 + AA3;
+ }
+ for (NodeIndex = 0, Node = Spectrum->Graph->FirstNode; Node; Node = Node->Next, NodeIndex++)
+ {
+ SliceIndex = CellIndex;
+ for (ModCountIndex = 0; ModCountIndex < ZSize; ModCountIndex++)
+ {
+ // Check our cell index:
+ if (CellIndex != AminoIndex * AminoBlockSize + NodeIndex * ZSize + ModCountIndex)
+ {
+ printf("Bad cell index %d, %d, %d -> %d (%d)\n", NodeIndex, AminoIndex, ModCountIndex, CellIndex,
+ AminoIndex * AminoBlockSize + NodeIndex * ZSize + ModCountIndex);
+ }
+ ScoreTable[CellIndex] = FORBIDDEN_PATH; // default
+ DeltaTable[CellIndex] = NULL;
+ PrevCellTable[CellIndex] = -1;
+
+ ///////////////
+ // Free rides:
+ if (ModCountIndex == 0 && Node->Mass < GlobalOptions->ParentMassEpsilon)
+ {
+ ScoreTable[CellIndex] = StartPointPenalty[AminoIndex];
+ PrevCellTable[CellIndex] = -1;
+ DeltaTable[CellIndex] = NULL;
+ MassDeltaTable[CellIndex] = Node->Mass;
+ CellIndex++;
+ continue;
+ }
+ ///////////////
+ // Drop a token:
+ TokenDropped = 0;
+ if (ModCountIndex)
+ {
+ PrevCellIndex = CellIndex - 1;
+ ScoreTable[CellIndex] = ScoreTable[PrevCellIndex];
+ PrevCellTable[CellIndex] = PrevCellIndex;
+ TokenDropped = 1;
+ }
+
+ // And that's all we do on the top row:
+ if (AminoIndex == 0)
+ {
+ CellIndex++;
+ continue;
+ }
+
+ ///////////////
+ // One unmodified amino acid:
+ Edge = Node->BackEdge[AA];
+ while (Edge)
+ {
+ PrevCellIndex = AminoBlock - AminoBlockSize + (Edge->ToNode->Index * ZSize) + ModCountIndex;
+ Score = ScoreTable[PrevCellIndex] + Edge->Score;
+ if (Score > ScoreTable[CellIndex])
+ {
+ ScoreTable[CellIndex] = Score;
+ PrevCellTable[CellIndex] = PrevCellIndex;
+ DeltaTable[CellIndex] = NULL;
+ MassDeltaTable[CellIndex] = Edge->Skew + MassDeltaTable[PrevCellIndex];
+ TokenDropped = 0;
+ }
+ Edge = Edge->Next;
+ }
+
+ ///////////////
+ // Two unmodified amino acids:
+ if (AminoIndex > 1)
+ {
+ Edge = Node->BackEdgeDouble[BackEdgeDoubleIndex];
+ while (Edge)
+ {
+ PrevCellIndex = AminoBlock - AminoBlockSize - AminoBlockSize + (Edge->ToNode->Index * ZSize) + ModCountIndex;
+ // Accumulate points for the middle of the jump:
+ PRM = MASS_TO_BIN(Edge->HalfMass);
+ Score = Tweak->PRMScores[PRM] + ScoreTable[PrevCellIndex] + Edge->Score;
+ if (Score > ScoreTable[CellIndex])
+ {
+ ScoreTable[CellIndex] = Score;
+ PrevCellTable[CellIndex] = PrevCellIndex;
+ DeltaTable[CellIndex] = NULL;
+ MassDeltaTable[CellIndex] = Edge->Skew + MassDeltaTable[PrevCellIndex];
+ TokenDropped = 0;
+ }
+ Edge = Edge->Next;
+ }
+ }
+ ///////////////
+ // Three unmodified amino acids:
+ if (AminoIndex > 2)
+ {
+ Edge = Node->BackEdgeTriple[BackEdgeTripleIndex];
+ while (Edge)
+ {
+ PrevCellIndex = AminoBlock - AminoBlockSize - AminoBlockSize - AminoBlockSize + (Edge->ToNode->Index * ZSize) + ModCountIndex;
+ // Accumulate points for the middle of the jump:
+ PRM = MASS_TO_BIN(Edge->HalfMass);
+ Score = Tweak->PRMScores[PRM] + ScoreTable[PrevCellIndex] + Edge->Score;
+ PRM = MASS_TO_BIN(Edge->HalfMass2);
+ Score += Tweak->PRMScores[PRM];
+ if (Score > ScoreTable[CellIndex])
+ {
+ ScoreTable[CellIndex] = Score;
+ PrevCellTable[CellIndex] = PrevCellIndex;
+ DeltaTable[CellIndex] = NULL;
+ MassDeltaTable[CellIndex] = Edge->Skew + MassDeltaTable[PrevCellIndex];
+ TokenDropped = 0;
+ }
+ Edge = Edge->Next;
+ }
+ }
+
+ if (ModCountIndex)
+ {
+ ///////////////
+ // Modification!
+ // Remember: there may be no node corresponding to
+ // the peptide with a PTM removed. Example: Assume the peptide is
+ // AFKDEDTQAM+16PFR and we're at the node at 1152 for AFKDEDTQAM+16.
+ // We cannot place the M+16 PTM and jump to a node. We must place
+ // the M+16 PTM while placing the M amino acid in order to jump.
+ // If (due to poor fragmentation) there's no node available, then
+ // we cannot place the PTM at all, but hopefully we will place the correct
+ // PTM mass at another node.
+ Mass = Node->Mass - AAMass;
+ BackNodeIndex = NodeIndex;
+ BackNodeDirection = -1;
+ BackNode = Node;
+ while (1)
+ {
+ // Bouncing iteration: Iterate back along the node list until
+ // you hit the start of the list (or mass becomes too small).
+ // Then iterate forward along the list until you hit the end of the
+ // list (or mass becomes too large). We iterate over a "neighborhood"
+ // to save time.
+ if (BackNodeDirection < 0)
+ {
+ BackNode = BackNode->Prev;
+ BackNodeIndex--;
+ if (!BackNode)
+ {
+ BackNodeDirection = 1;
+ BackNodeIndex = NodeIndex;
+ BackNode = Node;
+ }
+ else
+ {
+ Delta = Mass - BackNode->Mass;
+ if (Delta > MaxPossibleDelta)
+ {
+ BackNodeDirection = 1;
+ BackNodeIndex = NodeIndex;
+ BackNode = Node;
+ }
+ }
+ }
+ if (BackNodeDirection > 0)
+ {
+ BackNode = BackNode->Next;
+ BackNodeIndex++;
+ if (!BackNode)
+ {
+ break;
+ }
+ else
+ {
+ Delta = Mass - BackNode->Mass;
+ if (Delta < MinPossibleDelta)
+ {
+ break;
+ }
+ }
+ }
+
+ ROUND_MASS_TO_DELTA_BIN(Delta, DeltaBin);
+ DeltaNode = MassDeltaByMass[AA][DeltaBin];
+ while (DeltaNode)
+ {
+ Skew = Delta - DeltaNode->Delta->RealDelta;
+ AbsSkew = abs(Skew) / 10;
+ if (abs(Skew) <= GlobalOptions->Epsilon)
+ {
+ PrevCellIndex = AminoBlock - AminoBlockSize + (BackNodeIndex * ZSize) + ModCountIndex - 1;
+ Score = g_SkewPenalty[AbsSkew] + (int)(DeltaNode->Delta->Score * DELTA_SCORE_SCALER) + ScoreTable[PrevCellIndex];
+ if (Score > ScoreTable[CellIndex])
+ {
+ ScoreTable[CellIndex] = Score;
+ PrevCellTable[CellIndex] = PrevCellIndex;
+ DeltaTable[CellIndex] = DeltaNode->Delta;
+ MassDeltaTable[CellIndex] = MassDeltaTable[PrevCellIndex] + Skew;
+ TokenDropped = 0;
+ }
+ }
+ DeltaNode = DeltaNode->Next;
+ }
+ } // loop over back-nodes
+
+ } // if ModCount
+ //////////////////////////////////////////////////////////////////////////////
+ // We now have our move backwards (or our FORBIDDEN_PATH). Get points for this node's PRM:
+ if (!TokenDropped)
+ {
+ Mass = MASS_TO_BIN(Node->Mass + MassDeltaTable[CellIndex]);
+ Mass = min(MaxPRM, max(0, Mass));
+ ScoreTable[CellIndex] += Tweak->PRMScores[Mass];
+ }
+ CellIndex++;
+ } // ModCountIndex loop
+ } // NodeIndex loop
+ } // AminoIndex loop
+
+ if (Info->VerboseFlag)
+ {
+ DebugPrintGeneralTable(Spectrum, Buffer, MaxAmino, MaxMods,
+ ScoreTable, PrevCellTable);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////////
+ // The d.p. table has been populated. Now we must read off the candidate(s).
+ for (Node = Spectrum->Graph->LastNode; Node; Node = Node->Prev)
+ {
+ NodeIndex = Node->Index;
+ // We want Node->Mass to be about equal to Tweak->ParentMass - 1900
+ // If it's too small, STOP. If it's too large, CONTINUE.
+ if (Node->Mass > Tweak->ParentMass - 1900 + GlobalOptions->ParentMassEpsilon)
+ {
+ continue;
+ }
+ if (Node->Mass < Tweak->ParentMass - 1900 - GlobalOptions->ParentMassEpsilon)
+ {
+ break;
+ }
+ for (AminoIndex = 0; AminoIndex < MaxAmino; AminoIndex++)
+ {
+ CellIndex = AminoIndex * AminoBlockSize + NodeIndex * ZSize + MaxMods;
+ Score = ScoreTable[CellIndex] + EndPointPenalty[AminoIndex];
+ if (Score > ScoreToBeat)
+ {
+ ///////////////////////////////////////
+ // We have a match. Read off the sequence and the PTMs.
+ X = NodeIndex;
+ Y = AminoIndex;
+ Z = MaxMods;
+ if (Info->VerboseFlag)
+ {
+ printf("\nMatch found at (%d, %d, %d)\n", X, Y, Z);
+ }
+ Match = NewPeptideNode();
+ Match->Tweak = Tweak;
+ MatchBufferPos = 0;
+ MatchPTMCount = 0;
+ while (1)
+ {
+ if (Info->VerboseFlag)
+ {
+ printf("Move to (%d, %d, %d), match is '%s'\n", X, Y, Z, Match->Bases);
+ }
+ //CellIndex = Y * AminoBlock + Y * ZSize + Z;
+ if (Y)
+ {
+ if (DeltaTable[CellIndex])
+ {
+ Match->AminoIndex[MatchPTMCount] = MatchBufferPos;
+ Match->ModType[MatchPTMCount] = DeltaTable[CellIndex];
+ MatchPTMCount++;
+ if (Info->VerboseFlag)
+ {
+ printf("Apply PTM '%s' (%d)\n", DeltaTable[CellIndex]->Name, DeltaTable[CellIndex]->RealDelta);
+ }
+ }
+ }
+ CellIndex = PrevCellTable[CellIndex];
+ if (CellIndex < 0)
+ {
+ break;
+ }
+ NextY = CellIndex / AminoBlockSize;
+ X = (CellIndex - NextY * AminoBlockSize) / ZSize;
+ Z = (CellIndex - NextY * AminoBlockSize) % ZSize;
+ while (Y > NextY)
+ {
+ Match->Bases[MatchBufferPos] = Buffer[Y - 1];
+ MatchBufferPos++;
+ Y--;
+ }
+ }
+ Match->FilePos = FilePos + Y;
+ Match->RecordNumber = Info->RecordNumber;
+ Match->Bases[MatchBufferPos] = '\0';
+ Match->SuffixAmino = Buffer[AminoIndex];
+ if (Y)
+ {
+ Match->PrefixAmino = Buffer[Y - 1];
+ }
+ ReverseString(Match->Bases);
+ for (ModIndex = 0; ModIndex < MatchPTMCount; ModIndex++)
+ {
+ Match->AminoIndex[ModIndex] = MatchBufferPos - 1 - Match->AminoIndex[ModIndex];
+ }
+ Match->DB = Info->DB;
+ //Match->Score = Score;
+ Match->InitialScore = Score;
+ Match = StoreSpectralMatch(Spectrum, Match, MatchBufferPos, 0);
+ if (Info->VerboseFlag && Match)
+ {
+ printf("Store match '%c.%s.%c' score %d endpointpenalty %d\n", Match->PrefixAmino, Match->Bases, Match->SuffixAmino, Match->InitialScore, EndPointPenalty[AminoIndex]);
+ }
+ } // final AminoIndex loop
+ } // final node loop
+ }
+ return ReturnAmino;
+}
+
+// The MS-Alignment algorithm.
+// New and improved version of the d.p. algorithm for generating candidates with 0-2 PTMs
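+//
+// Rough sketch of the flow below: a 2-D table D[AminoIndex * NodeCount + NodeIndex] holds the best
+// score for a peptide prefix ending at AminoIndex whose accumulated mass lands on the given tag graph
+// node, with at most one PTM placed on the way in (via MassDeltaByMass[]). Candidates are then read
+// off by pairing each table cell with an unmodified or singly-modified suffix from SuffixTable and
+// SuffixMassTable, so a complete candidate carries 0-2 PTMs in total.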
+int ExtendMatchRightwardDuo(SearchInfo* Info, char* Buffer, int BufferEnd, int MatchMass,
+ int MaxMods, int ScoreToBeat, int FilePos, SpectrumTweak* Tweak)
+{
+ static int* PrevCellTable = NULL;
+ static int* DTable = NULL;
+ int MaxAmino;
+ int ReturnAmino = -1;
+ int Result;
+ int NodeIndex;
+ TagGraphNode* Node;
+ int AminoBlockSize;
+ int AminoBlock;
+ int CellIndex;
+ int PrevCellIndex = 0;
+ int Score;
+ int CellMass;
+ TagGraphBackEdge* Edge;
+ int StartAminoIndex;
+ int DeltaBin;
+ int Delta;
+ int MinPossibleDelta = -13000; // W->G mutation
+ int MaxPossibleDelta = GlobalOptions->MaxPTMDelta * 100;
+ MassDeltaNode* DeltaNode;
+ int AA;
+ int AAMass = 0;
+ int AA2;
+ int AA2Mass;
+ int AA3;
+ int EndAminoIndex;
+ int ComplementMass;
+ int CellScore;
+ int AminoIndex;
+ int Skew;
+ int AbsSkew;
+ int Mass;
+ int PRM;
+ int BackEdgeDoubleIndex = 0;
+ int BackEdgeTripleIndex = 0;
+ int MaxPRM = Tweak->PRMScoreMax - 1;
+ int StartPointPenalty[MAX_ROWS];
+ int EndPointPenalty[MAX_ROWS];
+ MSSpectrum* Spectrum = Info->Spectrum;
+ //
+ // Allocate tables, if necessary:
+ if (!PrefixTable)
+ {
+ PrefixTable = (int*)malloc(sizeof(int) * MAX_ROWS * MAX_ROWS);
+ SuffixTable = (int*)malloc(sizeof(int) * MAX_ROWS * MAX_ROWS);
+ PrefixMassTable = (int*)malloc(sizeof(int) * MAX_ROWS * MAX_ROWS);
+ SuffixMassTable = (int*)malloc(sizeof(int) * MAX_ROWS * MAX_ROWS);
+ DTable = (int*)malloc(sizeof(int) * MAX_ROWS * MAX_NODES);
+ // MassDeltaTable stores mass delta used in reaching a cell of DTable:
+ MassDeltaTable = (int*)malloc(sizeof(int) * MAX_ROWS * MAX_NODES);
+ PrevCellTable = (int*)malloc(sizeof(int) * MAX_ROWS * MAX_NODES);
+ DeltaTable = (MassDelta**)malloc(sizeof(MassDelta*) * MAX_ROWS * MAX_NODES);
+ }
+ /////////////////////////////////////////
+ // Find MaxAmino:
+ Result = FindPeptideBlockEnd(Spectrum, Buffer, BufferEnd, &MaxAmino, &ReturnAmino);
+ if (!Result)
+ {
+ // No extension necessary. Advance the database pointer:
+ return ReturnAmino;
+ }
+
+ for (AminoIndex = 0; AminoIndex < MaxAmino; AminoIndex++)
+ {
+ StartPointPenalty[AminoIndex] = 0;
+ if (AminoIndex)
+ {
+ AA = Buffer[AminoIndex - 1];
+ if (AA != 'R' && AA != 'K' && AA != '*')
+ {
+ StartPointPenalty[AminoIndex] = -500;
+ }
+ }
+ EndPointPenalty[AminoIndex] = 0;
+ AA = Buffer[AminoIndex];
+ if ((AA != 'R' && AA != 'K') && (AminoIndex <= MaxAmino-1 && Buffer[AminoIndex + 1]!='*'))
+ {
+ EndPointPenalty[AminoIndex] = -500;
+ }
+ }
+ /////////////////////////////////////////
+ // Fill the Forward and Suffix tables:
+
+ FillPrefixSuffixTables(Spectrum, Tweak, MatchMass, Buffer, MaxAmino, PrefixTable, SuffixTable,
+ PrefixMassTable, SuffixMassTable);
+#ifdef VERBOSE_DEBUGGING
+ DebugPrintPrefixSuffixTables(MaxAmino, Buffer, PrefixTable, SuffixTable, PrefixMassTable, SuffixMassTable);
+#endif
+ /////////////////////////////////////////
+ // Fill table D[]
+ AminoBlockSize = Spectrum->Graph->NodeCount;
+ AminoBlock = 0;
+ for (AminoIndex = 0; AminoIndex <= MaxAmino; AminoBlock += AminoBlockSize, AminoIndex++)
+ {
+
+ CellIndex = AminoBlock;
+ AA2 = 0;
+ AA3 = 0;
+ if (AminoIndex)
+ {
+ AA = Buffer[AminoIndex-1] - 'A';
+ AAMass = PeptideMass[Buffer[AminoIndex-1]];
+ }
+ if (AminoIndex>1)
+ {
+ AA2 = Buffer[AminoIndex-2] - 'A';
+ AA2Mass = PeptideMass[Buffer[AminoIndex-2]];
+ BackEdgeDoubleIndex = AA*AMINO_ACIDS + AA2;
+ }
+ if (AminoIndex>2)
+ {
+ AA3 = Buffer[AminoIndex-3] - 'A';
+ BackEdgeTripleIndex = AA*676 + AA2*26 + AA3;
+ }
+ for (NodeIndex = 0, Node = Spectrum->Graph->FirstNode; Node; Node = Node->Next, NodeIndex++, CellIndex++)
+ {
+ DTable[CellIndex] = FORBIDDEN_PATH; // default
+ DeltaTable[CellIndex] = NULL;
+ PrevCellTable[CellIndex] = -1;
+ MassDeltaTable[CellIndex] = 0;
+
+ ///////////////
+ // Free rides:
+ if (Node->Mass < GlobalOptions->ParentMassEpsilon)
+ {
+ DTable[CellIndex] = StartPointPenalty[AminoIndex];
+ PrevCellTable[CellIndex] = -1;
+ DeltaTable[CellIndex] = NULL;
+ MassDeltaTable[CellIndex] = Node->Mass;
+ continue;
+ }
+ if (AminoIndex == 0)
+ {
+ continue; // And that's all we do on the top row.
+ }
+
+ ///////////////
+ // One unmodified amino acid:
+ Edge = Node->BackEdge[AA];
+ while (Edge)
+ {
+ PrevCellIndex = AminoBlock - AminoBlockSize + Edge->ToNode->Index;
+ Score = DTable[PrevCellIndex] + Edge->Score;
+ if (Score > DTable[CellIndex])
+ {
+ DTable[CellIndex] = Score;
+ PrevCellTable[CellIndex] = PrevCellIndex;
+ DeltaTable[CellIndex] = NULL;
+ MassDeltaTable[CellIndex] = Edge->Skew + MassDeltaTable[PrevCellIndex];
+ }
+ Edge = Edge->Next;
+ }
+
+ ///////////////
+ // Two unmodified amino acids:
+ if (AminoIndex > 1)
+ {
+ Edge = Node->BackEdgeDouble[BackEdgeDoubleIndex];
+ while (Edge)
+ {
+ PrevCellIndex = AminoBlock - AminoBlockSize - AminoBlockSize + Edge->ToNode->Index;
+ // Accumulate points for the middle of the jump:
+ PRM = MASS_TO_BIN(Edge->HalfMass);
+ Score = Tweak->PRMScores[PRM] + DTable[PrevCellIndex] + Edge->Score;
+ if (Score > DTable[CellIndex])
+ {
+ DTable[CellIndex] = Score;
+ PrevCellTable[CellIndex] = PrevCellIndex;
+ DeltaTable[CellIndex] = NULL;
+ MassDeltaTable[CellIndex] = Edge->Skew + MassDeltaTable[PrevCellIndex];
+ }
+ Edge = Edge->Next;
+ }
+ }
+ ///////////////
+ // Three unmodified amino acids:
+ if (AminoIndex > 2)
+ {
+ Edge = Node->BackEdgeTriple[BackEdgeTripleIndex];
+ while (Edge)
+ {
+ PrevCellIndex = AminoBlock - AminoBlockSize - AminoBlockSize - AminoBlockSize + Edge->ToNode->Index;
+ // Accumulate points for the middle of the jump:
+ PRM = MASS_TO_BIN(Edge->HalfMass);
+ Score = Tweak->PRMScores[PRM] + DTable[PrevCellIndex] + Edge->Score;
+ PRM = MASS_TO_BIN(Edge->HalfMass2);
+ Score += Tweak->PRMScores[PRM];
+ if (Score > DTable[CellIndex])
+ {
+ DTable[CellIndex] = Score;
+ PrevCellTable[CellIndex] = PrevCellIndex;
+ DeltaTable[CellIndex] = NULL;
+ MassDeltaTable[CellIndex] = Edge->Skew + MassDeltaTable[PrevCellIndex];
+ }
+ Edge = Edge->Next;
+ }
+ }
+
+ ///////////////
+ // No amino acid at all, or modified amino acid. Try using the prefix StartAminoIndex...EndAminoIndex.
+ // Also, try using an EMPTY prefix (the case where StartAminoIndex == AminoIndex > EndAminoIndex)
+ EndAminoIndex = AminoIndex - 1;
+ Mass = Node->Mass - AAMass;
+ for (StartAminoIndex = AminoIndex; StartAminoIndex>0; StartAminoIndex--)
+ {
+ if (StartAminoIndex == AminoIndex)
+ {
+ Delta = Mass; // Modification on the first amino acid of the peptide
+ }
+ else
+ {
+ Delta = Mass - PrefixMassTable[StartAminoIndex*MAX_ROWS + EndAminoIndex];
+ }
+ if (Delta > MaxPossibleDelta)
+ {
+ continue;
+ }
+ if (Delta < MinPossibleDelta)
+ {
+ break;
+ }
+ ROUND_MASS_TO_DELTA_BIN(Delta, DeltaBin);
+ DeltaNode = MassDeltaByMass[AA][DeltaBin];
+ while (DeltaNode)
+ {
+ Skew = Delta - DeltaNode->Delta->RealDelta;
+ //Skew = Delta - DeltaNode->RealDelta;
+ AbsSkew = abs(Skew) / 10;
+ if (AbsSkew <= GlobalOptions->Epsilon)
+ {
+ if (StartAminoIndex == AminoIndex)
+ {
+ Score = g_SkewPenalty[AbsSkew] + (int)(DeltaNode->Delta->Score * DELTA_SCORE_SCALER);
+ }
+ else
+ {
+ Score = g_SkewPenalty[AbsSkew] + (int)(DeltaNode->Delta->Score * DELTA_SCORE_SCALER + PrefixTable[StartAminoIndex*MAX_ROWS + EndAminoIndex]);
+ }
+ Score += StartPointPenalty[StartAminoIndex - 1];
+ //Score += Spectrum->PRMScores[PRM];
+ if (Score > DTable[CellIndex])
+ {
+ DTable[CellIndex] = Score;
+ PrevCellTable[CellIndex] = (StartAminoIndex-1) * AminoBlockSize;
+ DeltaTable[CellIndex] = DeltaNode->Delta;
+ MassDeltaTable[CellIndex] = Skew;
+ }
+ }
+ DeltaNode = DeltaNode->Next;
+ }
+ Skew = abs(Delta) / 10;
+ if (Skew < GlobalOptions->Epsilon)
+ {
+ if (StartAminoIndex > EndAminoIndex)
+ {
+ Score = g_SkewPenalty[Skew];
+ }
+ else
+ {
+ Score = g_SkewPenalty[Skew] + PrefixTable[StartAminoIndex*MAX_ROWS + EndAminoIndex];
+ }
+ Score += StartPointPenalty[StartAminoIndex - 1];
+ if (Score > DTable[CellIndex])
+ {
+ DTable[CellIndex] = Score;
+ PrevCellTable[CellIndex] = (StartAminoIndex-1) * AminoBlockSize;
+ DeltaTable[CellIndex] = NULL;
+ }
+ }
+ }
+ //////////////////////////////////////////////////////////////////////////////
+ // We now have our move backwards (or our FORBIDDEN_PATH). Get points for this node's PRM:
+ Mass = MASS_TO_BIN(Node->Mass + MassDeltaTable[CellIndex]);
+ Mass = min(MaxPRM, max(0, Mass));
+ DTable[CellIndex] += Tweak->PRMScores[Mass];
+ }
+ }
+#ifdef VERBOSE_DEBUGGING
+ DebugPrintDTable(Spectrum, Buffer, DTable, DeltaTable, PrevCellTable, MaxAmino);
+#endif
+
+ /////////////////////////////////////////
+ // Find candidate peptides, using tables PrefixTable, SuffixTable, and D:
+ AminoBlock = 0;
+ for (AminoIndex = 0; AminoIndex <= MaxAmino; AminoIndex++, AminoBlock += AminoBlockSize)
+ {
+ AA = Buffer[AminoIndex] - 'A'; // AminoIndex + 1 - 1
+ CellIndex = AminoBlock;
+ for (NodeIndex = 0, Node = Spectrum->Graph->FirstNode; Node; Node = Node->Next, NodeIndex++, CellIndex++)
+ {
+ CellMass = Node->Mass + MassDeltaTable[CellIndex];
+ CellScore = DTable[CellIndex];
+ ComplementMass = MatchMass - CellMass;
+ // We can end right here:
+ if (abs(ComplementMass) < GlobalOptions->FlankingMassEpsilon)
+ {
+ Spectrum->CandidatesScored++;
+ GlobalStats->CandidatesScored++;
+ Score = CellScore + EndPointPenalty[AminoIndex-1];
+ if (CellScore > ScoreToBeat)
+ {
+ AddNewMatchDuo(Info, Tweak, Buffer, CellScore, PrevCellTable, DeltaTable, CellIndex, NULL,
+ AminoBlockSize, AminoIndex, AminoIndex, FilePos);
+ if (Spectrum->Node->MatchCount == GlobalOptions->StoreMatchCount)
+ {
+ ScoreToBeat = Spectrum->Node->LastMatch->InitialScore;
+ }
+ }
+ }
+ for (EndAminoIndex = AminoIndex + 1; EndAminoIndex <= MaxAmino; EndAminoIndex++)
+ {
+ Delta = SuffixMassTable[(AminoIndex + 1)*MAX_ROWS + EndAminoIndex] - CellMass;
+
+ if (Delta > MaxPossibleDelta)
+ {
+ continue;
+ }
+ if (Delta < MinPossibleDelta)
+ {
+ break;
+ }
+ //EndAA = Buffer[EndAminoIndex] - 'A'; // aminoindex+1-1
+ // Maybe we match a suffix mass:
+ if (abs(Delta) < GlobalOptions->Epsilon)
+ {
+ Skew = abs(Delta) / 10;
+ Score = CellScore + SuffixTable[(AminoIndex + 1)*MAX_ROWS + EndAminoIndex] + g_SkewPenalty[Skew];
+ Score += EndPointPenalty[EndAminoIndex - 1];
+ Spectrum->CandidatesScored++;
+ GlobalStats->CandidatesScored++;
+ if (Score > ScoreToBeat)
+ {
+ AddNewMatchDuo(Info, Tweak, Buffer, Score, PrevCellTable, DeltaTable, CellIndex, NULL,
+ AminoBlockSize, AminoIndex, EndAminoIndex, FilePos);
+ if (Spectrum->Node->MatchCount == GlobalOptions->StoreMatchCount)
+ {
+ ScoreToBeat = Spectrum->Node->LastMatch->InitialScore;
+ }
+
+ }
+ }
+ ROUND_MASS_TO_DELTA_BIN(Delta, DeltaBin);
+ DeltaNode = MassDeltaByMass[AA][DeltaBin];
+ while (DeltaNode)
+ {
+ Skew = abs(DeltaNode->Delta->RealDelta - Delta);
+ //Skew = abs(DeltaNode->RealDelta - Delta);
+ if (Skew < GlobalOptions->Epsilon)
+ {
+ Score = CellScore + (int)(DeltaNode->Delta->Score * DELTA_SCORE_SCALER) + SuffixTable[(AminoIndex + 1)*MAX_ROWS + EndAminoIndex] + g_SkewPenalty[Skew / 10];
+ Score += EndPointPenalty[EndAminoIndex - 1];
+ Spectrum->CandidatesScored++;
+ GlobalStats->CandidatesScored++;
+ if (Score > ScoreToBeat)
+ {
+ AddNewMatchDuo(Info, Tweak, Buffer, Score, PrevCellTable, DeltaTable, CellIndex, DeltaNode->Delta,
+ AminoBlockSize, AminoIndex, EndAminoIndex, FilePos);
+ if (Spectrum->Node->MatchCount == GlobalOptions->StoreMatchCount)
+ {
+ ScoreToBeat = Spectrum->Node->LastMatch->InitialScore;
+ }
+
+ }
+ }
+ DeltaNode = DeltaNode->Next;
+ }
+ }
+ }
+ }
+ return ReturnAmino;
+}
+Peptide* ClonePeptide(Peptide* Match)
+{
+ Peptide* NewMatch = NewPeptideNode();
+ memcpy(NewMatch, Match, sizeof(Peptide));
+ return NewMatch;
+}
+
+// AddNewMatchDuo considers an unmodified peptide whenever it considers a modification of size +1..+n or -1..-n.
+// We only want to add one "undecorated" peptide, not re-add the same thing multiple times,
+// so we store the FilePos, StartAminoIndex, and EndAminoIndex of the last match.
+// We've got a new match! It ends at EndAminoIndex, and its D-path ends at the cell CellIndex.
+void AddNewMatchDuo(SearchInfo* Info, SpectrumTweak* Tweak, char* Buffer, int Score, int* PrevCellTable, MassDelta** DeltaTable,
+ int CellIndex, MassDelta* FinalDelta, int AminoBlockSize, int AminoIndex, int EndAminoIndex,
+ int FilePos)
+{
+ int StartAminoIndex;
+ int PeptideLength;
+ int OldCellIndex;
+ Peptide* Match;
+ Peptide* VariantMatch;
+ int SlideLeftIndex = -1;
+ int SlideRightIndex = -1;
+ int ModCount = 0;
+ int PlainMass;
+ int ModdedMass;
+ float RunningScore;
+ float PlainScore;
+ float ModdedScore;
+ int Diff;
+ MassDeltaNode* Node;
+ int PRM;
+ int BestDiff;
+ MSSpectrum* Spectrum = Info->Spectrum;
+ //
+ Match = NewPeptideNode();
+ Match->Tweak = Tweak;
+ // Trace back through the d.p. table to find our starting amino index:
+
+ StartAminoIndex = AminoIndex;
+ OldCellIndex = CellIndex;
+ while (OldCellIndex >= 0)
+ {
+ StartAminoIndex = OldCellIndex / AminoBlockSize;
+ if (DeltaTable[OldCellIndex])
+ {
+ Match->ModType[0] = DeltaTable[OldCellIndex];
+ Match->AminoIndex[0] = StartAminoIndex;
+ ModCount++;
+ }
+ OldCellIndex = PrevCellTable[OldCellIndex];
+ }
+ if (ModCount)
+ {
+ Match->AminoIndex[0] -= (StartAminoIndex + 1);
+ SlideLeftIndex = Match->AminoIndex[0];
+ }
+ PeptideLength = EndAminoIndex - StartAminoIndex;
+ strncpy(Match->Bases, Buffer + StartAminoIndex, PeptideLength);
+ Match->Bases[PeptideLength] = '\0';
+ Match->InitialScore = Score;
+ Match->FilePos = FilePos + StartAminoIndex;
+ Match->RecordNumber = Info->RecordNumber;
+ if (StartAminoIndex)
+ {
+ Match->PrefixAmino = Buffer[StartAminoIndex - 1];
+ }
+ if (FinalDelta)
+ {
+ Match->AminoIndex[ModCount] = AminoIndex - StartAminoIndex;
+ SlideRightIndex = Match->AminoIndex[ModCount];
+ Match->ModType[ModCount] = FinalDelta;
+ ModCount++;
+ }
+ GetPeptideParentMass(Match);
+
+ Match->SuffixAmino = Buffer[EndAminoIndex];
+#ifdef VERBOSE_DEBUGGING
+ DebugPrintMatch(Match);
+#endif
+ // STRIP DECORATION:
+ // If we placed a small PTM (mass -3...4), then be sure to consider a match with no modification.
+ // If we placed the PTM only to make the parent mass match up, then the modless peptide will get a
+ // better score, and we'll filter the spurious +1 modification. (There are a few real +1 modifications,
+ // such as deamidation of N, but spurious +1 modifications are much more common.)
+ if (FinalDelta && FinalDelta->RealDelta >= -300 && FinalDelta->RealDelta < 500)
+ {
+ VariantMatch = ClonePeptide(Match);
+ VariantMatch->InitialScore = Score;
+ VariantMatch->AminoIndex[0] = -1;
+ VariantMatch->ModType[0] = NULL;
+ VariantMatch->DB = Info->DB;
+ StoreSpectralMatch(Spectrum, VariantMatch, PeptideLength, 0);
+ }
+ // SLIDE LEFT:
+ // If we placed a PTM at the edge of our prefix, but the PTM could just as easily have been placed earlier,
+ // then do so:
+ if (SlideLeftIndex > 0)
+ {
+ PlainMass = 0;
+ //ModdedMass = Match->ModType[0]->RealDelta;
+ for (AminoIndex = 0; AminoIndex <= Match->AminoIndex[0]; AminoIndex++)
+ {
+ PlainMass += PeptideMass[Match->Bases[AminoIndex]];
+ }
+ ModdedMass = PlainMass + Match->ModType[0]->RealDelta;
+ RunningScore = (float)Match->InitialScore;
+ for (AminoIndex = Match->AminoIndex[0]; AminoIndex > 0; AminoIndex--)
+ {
+ PlainMass -= PeptideMass[Match->Bases[AminoIndex]];
+ ModdedMass -= PeptideMass[Match->Bases[AminoIndex]];
+ PRM = MASS_TO_BIN(PlainMass);
+ if (PRM < -PRM_ARRAY_SLACK || PRM >= Tweak->PRMScoreMax)
+ {
+ break;
+ }
+ PRM = max(0, PRM);
+ PlainScore = (float)Tweak->PRMScores[PRM];
+ PRM = MASS_TO_BIN(ModdedMass);
+ if (PRM < -PRM_ARRAY_SLACK || PRM >= Tweak->PRMScoreMax)
+ {
+ break;
+ }
+ PRM = max(0, PRM);
+ ModdedScore = (float)Tweak->PRMScores[PRM];
+ if (ModdedScore > 0)
+ {
+ // We've already had the chance to attach this ptm here.
+ break;
+ }
+ RunningScore += (ModdedScore - PlainScore);
+ if (RunningScore < Match->InitialScore - 100)
+ {
+ // We've hurt our score quite a bit; let's stop.
+ break;
+ }
+ // Make a variant-match:
+ VariantMatch = ClonePeptide(Match);
+ VariantMatch->InitialScore = (int)RunningScore;
+ BestDiff = -1;
+ VariantMatch->ModType[0] = NULL;
+ for (Node = MassDeltaByMass[Match->Bases[AminoIndex-1]-'A'][Match->ModType[0]->Delta]; Node; Node = Node->Next)
+ {
+ Diff = abs(Node->Delta->RealDelta - Match->ModType[0]->RealDelta);
+ if (BestDiff < 0 || Diff < BestDiff)
+ {
+ BestDiff = Diff;
+ VariantMatch->ModType[0] = Node->Delta;
+ }
+ }
+ VariantMatch->AminoIndex[0] = AminoIndex - 1;
+ if (VariantMatch->ModType[0])
+ {
+#ifdef VERBOSE_DEBUGGING
+ printf("Variant:\n");
+ DebugPrintMatch(VariantMatch);
+#endif
+ VariantMatch->DB = Info->DB;
+ StoreSpectralMatch(Spectrum, VariantMatch, PeptideLength, 0);
+ }
+ else
+ {
+ FreePeptideNode(VariantMatch);
+ }
+ }
+ }
+ // SLIDE RIGHT:
+ // If we placed a PTM at the edge of our suffix, but the PTM could just as easily have been placed later,
+ // then do so:
+ if (SlideRightIndex > 0)
+ {
+ PlainMass = 0;
+ if (ModCount>1)
+ {
+ PlainMass += Match->ModType[0]->RealDelta;
+ }
+ //ModdedMass = Match->ModType[0]->RealDelta;
+ for (AminoIndex = 0; AminoIndex < Match->AminoIndex[ModCount-1]; AminoIndex++)
+ {
+ PlainMass += PeptideMass[Match->Bases[AminoIndex]];
+ }
+ ModdedMass = PlainMass + Match->ModType[ModCount-1]->RealDelta;
+ RunningScore = (float)Match->InitialScore;
+
+ for (AminoIndex = Match->AminoIndex[ModCount-1]; Match->Bases[AminoIndex]; AminoIndex++)
+ {
+ PlainMass += PeptideMass[Match->Bases[AminoIndex]];
+ ModdedMass += PeptideMass[Match->Bases[AminoIndex]];
+ PRM = MASS_TO_BIN(PlainMass);
+ if (PRM < -PRM_ARRAY_SLACK || PRM >= Tweak->PRMScoreMax)
+ {
+ break;
+ }
+ PlainScore = (float)Tweak->PRMScores[PRM];
+ PRM = MASS_TO_BIN(ModdedMass);
+ if (PRM < -PRM_ARRAY_SLACK || PRM >= Tweak->PRMScoreMax)
+ {
+ break;
+ }
+ ModdedScore = (float)Tweak->PRMScores[PRM];
+ RunningScore += (PlainScore - ModdedScore);
+ if (RunningScore < Match->InitialScore - 100)
+ {
+ // We've hurt our score quite a bit; let's stop.
+ break;
+ }
+ if (AminoIndex > Match->AminoIndex[ModCount-1])
+ {
+ if (ModdedScore > 0)
+ {
+ // We've already had the chance to attach this ptm here.
+ break;
+ }
+
+ // Make a variant-match:
+ VariantMatch = ClonePeptide(Match);
+ VariantMatch->InitialScore = (int)RunningScore;
+ BestDiff = -1;
+ VariantMatch->ModType[ModCount-1] = NULL;
+ for (Node = MassDeltaByMass[Match->Bases[AminoIndex]-'A'][Match->ModType[ModCount-1]->Delta]; Node; Node = Node->Next)
+ {
+ Diff = abs(Node->Delta->RealDelta - Match->ModType[ModCount-1]->RealDelta);
+ if (BestDiff < 0 || Diff < BestDiff)
+ {
+ BestDiff = Diff;
+ VariantMatch->ModType[ModCount-1] = Node->Delta;
+ }
+ }
+ if (VariantMatch->ModType[ModCount-1])
+ {
+ VariantMatch->AminoIndex[ModCount-1] = AminoIndex;
+#ifdef VERBOSE_DEBUGGING
+ printf("Variant:\n");
+ DebugPrintMatch(VariantMatch);
+#endif
+ VariantMatch->DB = Info->DB;
+ StoreSpectralMatch(Spectrum, VariantMatch, PeptideLength, 0);
+ }
+ else
+ {
+ FreePeptideNode(VariantMatch);
+ }
+ }
+ }
+
+ }
+ Match->DB = Info->DB;
+ StoreSpectralMatch(Spectrum, Match, PeptideLength, 0);
+}
diff --git a/FreeMod.h b/FreeMod.h
new file mode 100644
index 0000000..593628f
--- /dev/null
+++ b/FreeMod.h
@@ -0,0 +1,91 @@
+//Title: FreeMod.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef FREE_MOD_H
+#define FREE_MOD_H
+#include "Tagger.h"
+
+// FreeMod.h includes code and classes for handling mutations and large sets
+// of modifications. (Most references to "mods" can be taken to mean "mutations and
+// post-translational modifications") This is a more powerful (but much slower) way to search
+// spectra, and is most appropriate for second-pass searching. (In multipass
+// searching, the database contains only the proteins identified with high
+// confidence during a restrictive first-pass search of a large database, like an IPI
+// species database or Swiss-Prot.)
+
+// DELTA_BIN_COUNT is the number of mass bins in the range [MIN_DELTA_AMU, MAX_DELTA_AMU], 400*10 = 4000
+// This bin count is the size of each MassDeltaByMass[AA] array.
+//#define DELTA_BIN_COUNT 4000
+
+// MASS_TO_BIN and BIN_TO_MASS convert between masses and mass-bins
+#define MASS_TO_BIN(mass) (int)((mass + 50) / 100)
+#define BIN_TO_MASS(bin) (int)((bin) * 100)
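+// Worked example: with MASS_SCALE = 1000, a mass of 123.456 Da is stored as the
+// integer 123456, and MASS_TO_BIN(123456) == (123456 + 50) / 100 == 1235, i.e.
+// 0.1-Da bins with rounding to the nearest bin; BIN_TO_MASS(1235) == 123500.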
+
+#define MDBI_ALL_MODS 26
+
+// Scaling factor, compensating for the different score ranges of
+// quick PRM-based scoring and final match-scoring.
+#define DELTA_SCORE_SCALER 200
+#define DELTA_SCORE_SCALER_FINAL 0.5
+
+// Search a database, using *no* tag-based filtering at all. This is much slower than searching
+// with tag-based filters, but also more sensitive, particularly since tagging is harder in the presence of mods.
+void SearchDatabaseTagless(SearchInfo* Info, int MaxMods, int VerboseFlag, SpectrumTweak* Tweak);
+
+// Set Spectrum->PRMScores, using the PRM scoring model. When extending in blind mode,
+// we use the scores of these PRMs as an initial score for our peptides.
+void SetPRMScores(MSSpectrum* Spectrum);
+
+// Read, from the binary file Mutations.dat, the definitions of all mass modifications we will consider.
+void LoadMassDeltas(char* FileName, int ReadFlag);
+
+// Initialize the hash MassDeltaByMass. The table entry MassDeltaByMass[AA][Delta] points to a linked list
+// of mass deltas for amino acid AA matching Delta.
+void InitMassDeltaByMass();
+
+// Re-score spectral matches. The matches in the list Spectrum->FirstMatch have been
+// quick-scored, but we can sort them better if we score them more meticulously.
+// Let's do so, and re-sort the list based on the new scores.
+void MQScoreSpectralMatches(SpectrumNode* Spectrum);
+
+// Print out a list of matches for the spectrum (Spectrum->FirstMatch through Spectrum->LastMatch).
+void DebugPrintMatchList(SpectrumNode* Spectrum);
+
+// Attach edges moving back by one, two, or three amino acid masses to nodes in the TagGraph
+void TagGraphPopulateBackEdges(TagGraph* Graph);
+
+void FreeMassDeltaByMass();
+void FreeMassDeltas();
+void AddBlindMods();
+void AllocMassDeltaByIndex();
+
+#endif // FREE_MOD_H
diff --git a/GetByteOffset.py b/GetByteOffset.py
new file mode 100644
index 0000000..20268f4
--- /dev/null
+++ b/GetByteOffset.py
@@ -0,0 +1,169 @@
+#Title: GetByteOffset.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+GetByteOffset.py
+Utility to find the byte offset of scans in a spectrum file
+Has no main
+"""
+
+import os
+import sys
+import xml.sax.handler
+import xml.sax
+
+## auxiliary for the mzxml files
+class XMLHandler(xml.sax.handler.ContentHandler):
+ def __init__(self):
+ self.inOffset = 0
+ self.mapping = {}
+ def startElement(self, name, attributes):
+ if name == "offset":
+ self.buffer = ""
+ self.scan = attributes["id"]
+ self.inOffset = 1
+ def characters(self, data):
+ if self.inOffset:
+ self.buffer += data
+ def endElement(self, name):
+ if name == "offset":
+ self.inOffset = 0
+ self.mapping[self.scan] = self.buffer
+
+class Abacus:
+ def __init__(self):
+ self.ScanOffset = {} #Scan = Offset
+
+ def GetByteOffset(self, FileName):
+ self.ScanOffset = {} #reset every time
+ (Stub, Ext) = os.path.splitext(FileName)
+ if Ext.lower() == ".mzxml":
+ return self.GetOffsetsMZXML(FileName)
+ elif Ext.lower() == ".mgf":
+ return self.GetOffsetsMGF(FileName)
+
+ def GetOffsetsMZXML(self, FilePath):
+ """Parses an individual mzXML file and saves the scan num and byte offset
+ into a dictionary called self.ScanOffset.
+ Now uses real XML parsing looking for <offset id="SCAN">OFFSET</offset>
+ DOM is slow, so I'll use sax
+ """
+ print "Opening mzXML file %s"%FilePath
+ FileName = os.path.split(FilePath)[1]
+ Parser = xml.sax.make_parser()
+ Handler = XMLHandler()
+ Parser.setContentHandler(Handler)
+ Parser.parse(FilePath)
+ for (Scan, Offset) in Handler.mapping.items():
+ ScanNumber = int(Scan)
+ Offset = int(Offset)
+ #print (Scan, Offset)
+ self.ScanOffset[ScanNumber] = Offset
+ return self.ScanOffset
+
+ def GetOffsetsMGF(self, FileName):
+ """There is no pleasant way of doing this. I suppose
+ that I can just read in line after line looking for BEGIN
+ """
+ File = open(FileName, "rb")
+ #read in a megabyte of the file at a time, and search for SCAN and BEGIN markers
+ #Text holds the data; SeamText will hold the last few bytes of a block
+ #and gets appended to the first few (to check for a tag that spans the block boundary)
+ MEG = 1024*1024
+ Text = ""
+ SeamText = ""
+ FileOffset = 0
+
+ Counter = 0
+ while 1: # read in blocks loop
+ Block = File.read(MEG)
+ if not Block: #EOF
+ break
+ Text += Block
+ Pos = -1 #set up as dummy before the loop
+ while 1: #look for scans and offsets loop
+ ScanPos = Text.find("SCAN", Pos + 1)
+ if not ScanPos == -1:
+ ## 1. Get the scan number
+ ActualNumberPos = Text.find("=", ScanPos)
+ EndNumberPos = Text.find("\n", ScanPos)
+ ScanNumber = int (Text[ActualNumberPos + 1:EndNumberPos])
+ #print ScanNumber
+ ## 2. Get the BEGIN tag
+ BeginPos = Text.rfind("BEGIN", 0, ScanPos)
+ ScanOffset = FileOffset + BeginPos
+ if not self.ScanOffset.has_key(ScanNumber):
+ self.ScanOffset[ScanNumber] = ScanOffset
+ #duplicate scan numbers have been observed in some files, for reasons unknown
+ else:
+ ##did not find a scan number. Two possibilities
+ ## Can or Cannot find a BEGIN
+ BeginPos = Text.find("BEGIN", Pos + 1)
+ if not BeginPos == -1:
+ #Begin was found, seam text to begin here
+ print "Most recent Scan was %s"%ScanNumber
+ SeamText = Text[BeginPos:]
+ break
+ else:
+ #here it is possible that the word BEGIN spans the block boundary;
+ #to prevent that case, we simply keep some seam text
+ SeamText = Text[-20:]
+ break
+ Pos = EndNumberPos
+
+ #now we've broken out of the finding loop. Need to reset some vars
+ LenBlock = len(Text)
+ Text = SeamText
+ FileOffset += LenBlock # can't use MEG here, because Text included some seam text
+ FileOffset -= len(SeamText)
+ File.close()
+ self.Validate(FileName)
+ return self.ScanOffset
+
+ def Validate(self, FileName):
+ "simple check of scanoffset"
+ File = open(FileName, "rb")
+ ErrorFound =0
+ for (ScanNumber, ScanOffset) in self.ScanOffset.items():
+ File.seek(ScanOffset)
+ Text = File.read(300)
+ Place = Text.find("BEGIN")
+ #print "Found begin at place %d"%Place
+ if not Text.find("BEGIN") == 0:
+ print "Error with scan %d"%ScanNumber
+ ErrorFound = 1
+ print Text
+ if not ErrorFound:
+ print "Validation Successful"
+
+
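+# Illustrative usage of this module (file name below is a placeholder):
+# from GetByteOffset import Abacus
+# abacus = Abacus()
+# offsets = abacus.GetByteOffset("sample.mzXML") # also accepts .mgf files
+# # offsets maps scan number -> byte offset of that scan within the file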
diff --git a/Global.py b/Global.py
new file mode 100644
index 0000000..a1448a1
--- /dev/null
+++ b/Global.py
@@ -0,0 +1,64 @@
+#Title: Global.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+Global variables for mass-spec analysis
+"""
+
+IsotopeWeights = {}
+
+# Keys are ion type names, values are the corresponding ion instances
+AllIonDict = {}
+
+# Masses for amino acids (keys: 1-letter peptide abbreviations, like "G" or "D")
+AminoMass = {}
+AminoMassRight = {}
+
+# List of all amino acid (left) masses:
+AminoMasses = []
+
+# Dictionary of post-translational modifications. Keys are modification
+# names (in lower-case).
+PTMods = {}
+
+# Truncated, 3- or 4-character keys:
+PTModByShortName = {}
+PTModList = []
+
+AminoAcids = {} # key: single-letter abbreviation ("A" -> Alanine)
+FixedMods = {"C":57.0518} # The protecting group on C is enabled, by default!
+
+#List of ModificationTypeObject (see Utils.py) the user defines as invivo or invitro
+InVivoMods = []
+InVitroMods = []
+
diff --git a/InVitroModifications.txt b/InVitroModifications.txt
new file mode 100644
index 0000000..c4fcfad
--- /dev/null
+++ b/InVitroModifications.txt
@@ -0,0 +1,7 @@
+mod,57,C,fixed #CAM
+mod,-17,CQ,nterminal #pyroglutamate, Pyro-cmC
+mod,57,*,nterminal #CAM
+mod,12,*,nterminal #chemical adduct
+mod,43,*,nterminal #carbamylation
+mod,16,MW #oxidation
+mod,-48,M #neutral loss on Met or oxM
\ No newline at end of file
diff --git a/InVivoModifications.txt b/InVivoModifications.txt
new file mode 100644
index 0000000..88194cf
--- /dev/null
+++ b/InVivoModifications.txt
@@ -0,0 +1,9 @@
+mod,80,STY,opt,phosphorylation #phosphorylation
+mod,42,*,nterminal #nterminal acetylation
+mod,14,KR #methylation
+mod,42,K #acetylation
+mod,203,ST #GlcNAc
+mod,146,ST #fucosylation
+mod,210,*,nterminal #myristoylation
+mod,16,P #hydroxyproline
+mod,28,KR #dimethylation
\ No newline at end of file
diff --git a/Inspect.exe b/Inspect.exe
new file mode 100644
index 0000000..a5fdc19
Binary files /dev/null and b/Inspect.exe differ
diff --git a/Inspect.h b/Inspect.h
new file mode 100644
index 0000000..18391b3
--- /dev/null
+++ b/Inspect.h
@@ -0,0 +1,190 @@
+//Title: Inspect.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+#ifndef INSPECT_H
+#define INSPECT_H
+
+#define INSPECT_VERSION_NUMBER "20110313"
+
+#include <stdio.h>
+#include "Utils.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// General-purpose #definitions.
+
+// Multiplier for scaling floating-point masses up to integers.
+// Represent 123.456Da as the integer 123456:
+#define MASS_SCALE 1000
+#define DALTON 1000
+#define HALF_DALTON 500
+#define DECI_DALTON 100
+
+// Mass of hydrogen, in MASS_SCALE units. (Used for, e.g., finding the PRM of a b peak)
+//#define HYDROGEN_MASS (float)1.0078
+#define HYDROGEN_MASS (int)1008
+#define TWO_HYDROGEN_MASS (int)2016
+#define GLYCINE_MASS 57000
+#define WATER_MASS 18000
+#define CAM_MASS 57000
+#define PHOSPHATE_MASS 79966
+#define PHOSPHATE_WATER_MASS 97966
+
+// The parent mass boost is equal to the difference in mass between a precursor ion and
+// the parent *residue* mass (sum of amino acid masses).
+#define PARENT_MASS_BOOST (int)19000
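+// For example (approximate monoisotopic values): a singly-protonated precursor carries
+// the residue-mass sum plus one water (~18.011 Da) and one proton (~1.007 Da), about
+// +19.018 Da in total, approximated here by the integer 19000 in MASS_SCALE units.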
+
+// Maximum length of a peptide tag that can be indexed. (Char-arrays are limited to this size)
+// (Somewhat absurdly large, because we may want to use the trie to search for peptides
+// outside the MS/MS context)
+#define MAX_TAG_LENGTH 50
+
+// How far can peaks be from theoretical prediction?
+#define DEFAULT_EPSILON 500
+
+#define DEFAULT_PARENT_MASS_EPSILON 2500
+#define DEFAULT_PARENT_MASS_PPM 2000
+#define DEFAULT_FLANKING_MASS_EPSILON 3000
+
+// How large are the tags we generate, by default? (Overridable by -x, -y)
+#define DEFAULT_TAG_LENGTH 3
+
+// Maximum number of post-translational modification types:
+#define MAX_PT_MODTYPE 42
+
+// Maximum number of post-translational mods that can EVER be allowed:
+#define MAX_PT_MODS 8
+
+// How many entries in the match hash-table. (If we have many more matches than this,
+// performance will be slowed a bit)
+#define MATCH_HASH_SIZE 1000
+
+// Trie node's child array has one entry for each letter (some slots are wasted,
+// since there are only 20 amino acids, but it makes for fast searching)
+// The index into the array is the peptide char minus 'A'. (alanine 0, cysteine 2, etc)
+#define TRIE_CHILD_COUNT 26
+
+#define FILENAME_AMINO_ACID_MASSES "AminoAcidMasses.txt"
+#define FILENAME_PTMS "PTMods.txt"
+#define FILENAME_MASTER_TAGGING_MODEL "PRM.dat"
+#define FILENAME_MASS_DELTAS "Mutations.dat"
+#define FILENAME_PVALUE "PValue.dat"
+#define FILENAME_PVALUE_TRYPTIC "PValueTryptic.dat"
+#define FILENAME_SCORING_MODEL "ScoringModel.dat"
+#define FILENAME_ISOTOPE_PATTERNS "IsotopePatterns.txt"
+#define FILENAME_INTENSITY_RANK_ODDS "IntensityRankIonOdds.txt"
+#define FILENAME_WITNESS_SCORES "IonWitnessScores.dat"
+#define FILENAME_PRM_MODEL "PRMModel.dat"
+
+#define TWEAK_COUNT 6
+
+//used as switches for fragmentation models
+#define FRAGMENTATION_NORMAL 0
+#define FRAGMENTATION_PHOSPHO 1
+
+// We may try two or three different charge/parent-mass combinations for one
+// spectrum. We use SVMs to determine parent mass and charge state, but in
+// borderline cases, we try more than one.
+typedef struct SpectrumTweak
+{
+ int ParentMass;
+ int Charge;
+ // Intensities(S, L) is the frequency of intensity level L in sector S
+ float Intensities[12]; // SECTOR_COUNT
+ int* PRMScores;
+ int PRMScoreMax;
+} SpectrumTweak;
+
+#define SPECTRUM_FORMAT_INVALID -1
+#define SPECTRUM_FORMAT_DTA 0
+#define SPECTRUM_FORMAT_PKL 1
+#define SPECTRUM_FORMAT_MS2 2
+#define SPECTRUM_FORMAT_MGF 3
+#define SPECTRUM_FORMAT_MS2_COLONS 4
+#define SPECTRUM_FORMAT_MZXML 5
+#define SPECTRUM_FORMAT_MZDATA 6
+#define SPECTRUM_FORMAT_CDTA 7
+
+// Create one InputFileNode for each file being searched.
+// If the input file is a standard .dta file, then we create one child SpectrumNode.
+// If the input file is a .ms2 file, then we create many child SpectrumNodes.
+typedef struct InputFileNode
+{
+ char FileName[MAX_FILENAME_LEN];
+ int SpectrumCount;
+ int Format; // one of the SPECTRUM_FORMAT_* codes (0 dta, 1 pkl, 2 ms2, 3 mgf, ...)
+ struct InputFileNode* Prev;
+ struct InputFileNode* Next;
+} InputFileNode;
+
+typedef struct SpectrumNode
+{
+ struct MSSpectrum* Spectrum;
+ struct SpectrumNode* Next;
+ SpectrumTweak Tweaks[TWEAK_COUNT];
+ int PMCFlag; // Set to 1 after PMC is done and our tweak-array is populated.
+ int FilePosition; // seek to here before parsing
+
+ //The scan number is a user-defined notion for each spectrum.
+ //In MGF files the scan number is a 0-based index of the spectra.
+ //In MZXML files the scan number is read from the field 'scanNum'.
+ int ScanNumber;
+
+ //The spectrum index is a 1-based indexing of MS2+ spectra in a file
+ int SpecIndex;
+ int MatchCount;
+ struct Peptide* FirstMatch;
+ struct Peptide* LastMatch;
+ InputFileNode* InputFile; // the file name (and file type)
+} SpectrumNode;
+
+// The Stats object is for keeping track of cumulative info (tags generated,
+// bytes read, spectra scored, that sort of thing)
+typedef struct InspectStats
+{
+ // Tags generated - raw count of all tripeptide paths through the PRM graph
+ long long TagsGenerated;
+ // Tag hits in the database (How many tripeptide tag matches were extended?)
+ long long TagMatches;
+ // Number of candidate peptides that were scored against the source spectrum
+ long long CandidatesScored;
+ long long TagGraphNodes;
+ long long TagGraphEdges;
+} InspectStats;
+
+extern InspectStats* GlobalStats;
+
+typedef void (*TrainingCallback)(SpectrumNode*, int, int, struct Peptide*);
+void TrainOnOracleFile(char* OracleFileName, char* SpectrumDir, TrainingCallback Callback);
+void AddSpectrumToList(InputFileNode* InputFile, int FilePos, int ScanNumber, int SpecIndex);
+
+#endif // INSPECT_H
diff --git a/Inspect.sln b/Inspect.sln
new file mode 100644
index 0000000..98c0b19
--- /dev/null
+++ b/Inspect.sln
@@ -0,0 +1,19 @@
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Inspect", "Inspect.vcproj", "{5C4CDF65-87D6-4FE9-B269-4695FD7EC35B}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Release|Win32 = Release|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {5C4CDF65-87D6-4FE9-B269-4695FD7EC35B}.Debug|Win32.ActiveCfg = Debug|Win32
+ {5C4CDF65-87D6-4FE9-B269-4695FD7EC35B}.Debug|Win32.Build.0 = Debug|Win32
+ {5C4CDF65-87D6-4FE9-B269-4695FD7EC35B}.Release|Win32.ActiveCfg = Release|Win32
+ {5C4CDF65-87D6-4FE9-B269-4695FD7EC35B}.Release|Win32.Build.0 = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/Inspect.vcproj b/Inspect.vcproj
new file mode 100644
index 0000000..0c5e0d7
--- /dev/null
+++ b/Inspect.vcproj
@@ -0,0 +1,566 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="9.00"
+ Name="Inspect"
+ ProjectGUID="{5C4CDF65-87D6-4FE9-B269-4695FD7EC35B}"
+ TargetFrameworkVersion="131072"
+ >
+ <Platforms>
+ <Platform
+ Name="Win32"
+ />
+ </Platforms>
+ <ToolFiles>
+ </ToolFiles>
+ <Configurations>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory=".\Debug"
+ IntermediateDirectory=".\Debug"
+ ConfigurationType="1"
+ InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
+ UseOfMFC="0"
+ ATLMinimizesCRunTimeLibraryUsage="false"
+ CharacterSet="2"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TypeLibraryName=".\Debug/Protri.tlb"
+ HeaderFileName=""
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="expat\lib"
+ PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="1"
+ UsePrecompiledHeader="0"
+ PrecompiledHeaderFile=".\Debug/Protri.pch"
+ AssemblerListingLocation=".\Debug/"
+ ObjectFile=".\Debug/"
+ ProgramDataBaseFileName=".\Debug/"
+ WarningLevel="3"
+ SuppressStartupBanner="true"
+ DebugInformationFormat="4"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ PreprocessorDefinitions="_DEBUG"
+ Culture="1033"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalOptions="/FIXED:NO"
+ AdditionalDependencies="libexpat.lib"
+ OutputFile=".\Inspect.exe"
+ LinkIncremental="1"
+ SuppressStartupBanner="true"
+ AdditionalLibraryDirectories="expat\lib\debug"
+ IgnoreDefaultLibraryNames=""
+ GenerateDebugInformation="true"
+ ProgramDatabaseFile=".\Debug/Protri.pdb"
+ SubSystem="1"
+ RandomizedBaseAddress="1"
+ DataExecutionPrevention="0"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory="."
+ IntermediateDirectory="."
+ ConfigurationType="1"
+ InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
+ UseOfMFC="0"
+ ATLMinimizesCRunTimeLibraryUsage="false"
+ CharacterSet="2"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TypeLibraryName=".\Release/Protri.tlb"
+ HeaderFileName=""
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ InlineFunctionExpansion="1"
+ AdditionalIncludeDirectories="expat\lib"
+ PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+ StringPooling="true"
+ RuntimeLibrary="0"
+ EnableFunctionLevelLinking="true"
+ UsePrecompiledHeader="0"
+ PrecompiledHeaderFile=".\Release/Protri.pch"
+ AssemblerListingLocation=".\Release/"
+ ObjectFile=".\Release/"
+ ProgramDataBaseFileName=".\Release/"
+ WarningLevel="3"
+ SuppressStartupBanner="true"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ PreprocessorDefinitions="NDEBUG"
+ Culture="1033"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="libexpat.lib"
+ OutputFile=".\Inspect.exe"
+ LinkIncremental="1"
+ SuppressStartupBanner="true"
+ AdditionalLibraryDirectories="expat\lib\debug"
+ IgnoreDefaultLibraryNames=""
+ ProgramDatabaseFile=".\Release/Protri.pdb"
+ SubSystem="1"
+ RandomizedBaseAddress="1"
+ DataExecutionPrevention="0"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+ >
+ <File
+ RelativePath=".\base64.c"
+ >
+ </File>
+ <File
+ RelativePath=".\base64.h"
+ >
+ </File>
+ <File
+ RelativePath=".\BN.c"
+ >
+ </File>
+ <File
+ RelativePath=".\BN.h"
+ >
+ </File>
+ <File
+ RelativePath=".\BuildMS2DB.c"
+ >
+ </File>
+ <File
+ RelativePath=".\BuildMS2DB.h"
+ >
+ </File>
+ <File
+ RelativePath=".\ChargeState.c"
+ >
+ </File>
+ <File
+ RelativePath=".\ChargeState.h"
+ >
+ </File>
+ <File
+ RelativePath=".\CMemLeak.c"
+ >
+ </File>
+ <File
+ RelativePath=".\CMemLeak.h"
+ >
+ </File>
+ <File
+ RelativePath=".\Errors.c"
+ >
+ </File>
+ <File
+ RelativePath=".\Errors.h"
+ >
+ </File>
+ <File
+ RelativePath=".\ExonGraphAlign.c"
+ >
+ </File>
+ <File
+ RelativePath=".\ExonGraphAlign.h"
+ >
+ </File>
+ <File
+ RelativePath=".\FreeMod.c"
+ >
+ </File>
+ <File
+ RelativePath=".\FreeMod.h"
+ >
+ </File>
+ <File
+ RelativePath=".\Inspect.h"
+ >
+ </File>
+ <File
+ RelativePath=".\IonScoring.c"
+ >
+ </File>
+ <File
+ RelativePath=".\IonScoring.h"
+ >
+ </File>
+ <File
+ RelativePath=".\LDA.c"
+ >
+ </File>
+ <File
+ RelativePath=".\LDA.h"
+ >
+ </File>
+ <File
+ RelativePath="main.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ PreprocessorDefinitions=""
+ BasicRuntimeChecks="3"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ PreprocessorDefinitions=""
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="Mods.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ PreprocessorDefinitions=""
+ BasicRuntimeChecks="3"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ PreprocessorDefinitions=""
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath=".\Mods.h"
+ >
+ </File>
+ <File
+ RelativePath=".\MS2DB.c"
+ >
+ </File>
+ <File
+ RelativePath=".\MS2DB.h"
+ >
+ </File>
+ <File
+ RelativePath=".\ParentMass.c"
+ >
+ </File>
+ <File
+ RelativePath=".\ParentMass.h"
+ >
+ </File>
+ <File
+ RelativePath=".\ParseInput.c"
+ >
+ </File>
+ <File
+ RelativePath=".\ParseInput.h"
+ >
+ </File>
+ <File
+ RelativePath=".\ParseXML.c"
+ >
+ </File>
+ <File
+ RelativePath=".\ParseXML.h"
+ >
+ </File>
+ <File
+ RelativePath=".\PValue.c"
+ >
+ </File>
+ <File
+ RelativePath=".\PValue.h"
+ >
+ </File>
+ <File
+ RelativePath=".\Run.c"
+ >
+ </File>
+ <File
+ RelativePath=".\Run.h"
+ >
+ </File>
+ <File
+ RelativePath="Score.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ PreprocessorDefinitions=""
+ BasicRuntimeChecks="3"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ PreprocessorDefinitions=""
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath=".\Score.h"
+ >
+ </File>
+ <File
+ RelativePath=".\Scorpion.c"
+ >
+ </File>
+ <File
+ RelativePath=".\Scorpion.h"
+ >
+ </File>
+ <File
+ RelativePath=".\SNP.c"
+ >
+ </File>
+ <File
+ RelativePath=".\SNP.h"
+ >
+ </File>
+ <File
+ RelativePath=".\Spectrum.c"
+ >
+ </File>
+ <File
+ RelativePath=".\Spectrum.h"
+ >
+ </File>
+ <File
+ RelativePath=".\Spliced.c"
+ >
+ </File>
+ <File
+ RelativePath=".\Spliced.h"
+ >
+ </File>
+ <File
+ RelativePath=".\SpliceDB.c"
+ >
+ </File>
+ <File
+ RelativePath=".\SpliceDB.h"
+ >
+ </File>
+ <File
+ RelativePath=".\SpliceScan.c"
+ >
+ </File>
+ <File
+ RelativePath=".\SpliceScan.h"
+ >
+ </File>
+ <File
+ RelativePath=".\SVM.c"
+ >
+ </File>
+ <File
+ RelativePath=".\SVM.h"
+ >
+ </File>
+ <File
+ RelativePath=".\TagFile.c"
+ >
+ </File>
+ <File
+ RelativePath=".\TagFile.h"
+ >
+ </File>
+ <File
+ RelativePath="Tagger.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ PreprocessorDefinitions=""
+ BasicRuntimeChecks="3"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ PreprocessorDefinitions=""
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath=".\Tagger.h"
+ >
+ </File>
+ <File
+ RelativePath="Trie.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ PreprocessorDefinitions=""
+ BasicRuntimeChecks="3"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ PreprocessorDefinitions=""
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath=".\Trie.h"
+ >
+ </File>
+ <File
+ RelativePath="Utils.c"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ PreprocessorDefinitions=""
+ BasicRuntimeChecks="3"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ PreprocessorDefinitions=""
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath=".\Utils.h"
+ >
+ </File>
+ </Filter>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>
diff --git a/InspectToPepXML.py b/InspectToPepXML.py
new file mode 100644
index 0000000..15163a2
--- /dev/null
+++ b/InspectToPepXML.py
@@ -0,0 +1,859 @@
+#!/usr/bin/python
+
+# Update Jan 3, 2012 by Natalie to remove dependence on Column Order
+
+UsageInfo = \
+"""InspectToPepXML.py: Converts output of InsPecT search engine
+to PepXML format. Written by Samuel Payne, Venter Institute,
+and Terry Farrah, Institute for Systems Biology, October 2008
+
+Required Parameters
+-i [Filename] - InsPecT results file from search (input)
+-o [Filename] - Converted file in PepXML (output)
+
+Optional Parameters
+-p [Filename] - InsPecT input (parameter) file
+ default: inspect.params
+-m [Dirname] - Dir containing .mzXML or .mgf spectrum file
+ default: current working directory
+-d N - write at most N hits per assumed charge
+
+Assumes InsPecT results file is TSV containing header line and
+ one record per peptide prediction sorted by scan #, then by rank.
+User must manually edit PepXML file and insert correct information
+ near top of file for precursor and fragment mass types --
+ either average or monoisotopic.
+If database file mentioned in parameter file is not in fasta
+ format (.fasta or .fa), you must create a fasta format file of
+ the same base name in the same dir. Use TrieToFASTA.py.
+This script, InspectToPepXML.py, must reside in the same directory
+ as the rest of the InsPecT code.
+"""
+
+import sys
+import os
+import glob
+import getopt
+import re
+import time
+import GetByteOffset
+import ResultsParser
+import Utils
+import Global
+from xml.sax import saxutils #xml.sax is for reading mzXML
+from xml.sax import ContentHandler
+from xml.sax import make_parser
+from xml.sax.handler import feature_namespaces
+
+global initial_dir
+global spectrum_query_count
+
+# ========================================================
+# Read tables with standard data such as amino acid masses
+# ========================================================
+
+# chdir() is a hack to make the program invokable from any dir
+# (Utils makes use of auxiliary files in same dir as code)
+initial_dir = os.getcwd()
+os.chdir(sys.path[0])
+Utils.Initialize()
+os.chdir(initial_dir)
+
+# ===========================================================
+# Define classes to hold spectra (scans) and peptides (hits)
+# ===========================================================
+
+class InspectSpectrumClass:
+ """Stores the relevant InsPecT output file data for a spectrum"""
+ def __init__(self):
+ self.ScanNumber = -1
+ self.PrecursorMz = -1.0
+ self.RetentionTime = -1.0
+ self.HitList = [] # store a hit list for each charge state
+ for i in range (1,6): self.HitList = self.HitList + [[]]
+
+ def WriteSpectrumQueries(self, PepXMLHandle, SpectrumFileName, enzyme,
+ MaxHitsPerCharge):
+ """ Write <spectrum_query> tags for this spectrum.
+
+ There is one tag for each assumed charge that has any hits.
+ """
+ global spectrum_query_count
+ SpectrumFileType = os.path.splitext(SpectrumFileName)[1].lower()
+
+ for charge in range(1,5): # for each charge state
+ if len(self.HitList[charge]) > 0: # if any hits
+ spectrum_query_count = spectrum_query_count + 1
+ SpectrumTitle="%s.%05d.%05d.%s" % \
+ (os.path.splitext(SpectrumFileName)[0],
+ self.ScanNumber,self.ScanNumber,
+ charge)
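+ # e.g. SpectrumFileName "sample.mzXML", scan 42, assumed charge 2
+ # (placeholder values) yields the title "sample.00042.00042.2"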
+ if SpectrumFileType == ".mgf":
+ PrecursorNeutralMass = self.PrecursorNeutralMass
+ else:
+ _proton_mass = 1.007276
+ PrecursorNeutralMass = \
+ (self.PrecursorMz * charge) - \
+ (charge * _proton_mass)
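+ # e.g. (placeholder values) an assumed charge of 2 and a precursor m/z
+ # of 500.0 give 2*500.0 - 2*1.007276 = 997.985448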
+ if PrecursorNeutralMass < 0:
+ PrecursorNeutralMassString = ''
+ else:
+ PrecursorNeutralMassString = \
+ ' precursor_neutral_mass="%.6f"\n' % \
+ PrecursorNeutralMass
+ if self.RetentionTime < 0:
+ RetentionTimeString = ''
+ else:
+ RetentionTimeString = \
+ ' retention_time_sec="%.2f"\n' % \
+ self.RetentionTime
+ Query = '<spectrum_query\n' + \
+ ' spectrum="%s"\n' % SpectrumTitle + \
+ ' start_scan="%s"\n' % self.ScanNumber + \
+ ' end_scan="%s"\n' % self.ScanNumber + \
+ PrecursorNeutralMassString + \
+ ' assumed_charge="%s"\n' % charge + \
+ ' index="%s"\n' % spectrum_query_count + \
+ RetentionTimeString + \
+ '>\n'
+ PepXMLHandle.write(Query)
+ PepXMLHandle.write('<search_result search_id="1">\n')
+ for i in range(min(MaxHitsPerCharge,
+ len(self.HitList[charge]))):
+ self.HitList[charge][i].PrecursorNeutralMass = \
+ PrecursorNeutralMass
+ self.HitList[charge][i].WriteSearchHit(PepXMLHandle,
+ i+1, enzyme)
+ PepXMLHandle.write('</search_result>\n')
+ PepXMLHandle.write('</spectrum_query>\n')
+
+class InspectOutputRecordClass:
+ """Stores the relevant data from a single line of InsPecT output.
+
+ Each line represents a search hit--a predicted peptide for a spectrum.
+ """
+ def __init__(self):
+ self.Spectrum = None
+ self.FileOffset = -1
+ self.Protein = ""
+ self.Charge = -1
+ self.MQScore = ""
+ self.FScore = ""
+ self.DeltaScore = ""
+ self.PValue = ""
+ self.ProteinID = ""
+ self.Prefix = ""
+ self.Peptide = ""
+ self.Suffix = ""
+ self.OptModList = []
+ self.PrecursorNeutralMass = -1.0
+
+ def WriteSearchHit(self, PepXMLHandle, rank, enzyme):
+ """ Write <search_hit> tag for this this line of InsPecT output
+ """
+ global initial_dir
+ os.chdir(sys.path[0]) # hack to make pgm invokable from any dir
+ # GetMass adds on fixed modifications, but not optional ones
+ CalcMass = Utils.GetMass(self.Peptide) + 18.01528 #add h2o mass
+ for mod in self.OptModList:
+ CalcMass = CalcMass + float(mod[2])
+ os.chdir(initial_dir)
+ MassDiff = self.PrecursorNeutralMass - CalcMass
+ # If the enzyme is trypsin, count all KR except
+ # final one, and except when followed by P (proline).
+ if enzyme.lower() == "trypsin":
+ MissedCleavages = self.Peptide[:-1].count("K") + \
+ self.Peptide[:-1].count("R") - \
+ (self.Peptide[:-1].count("KP") + self.Peptide[:-1].count("RP"))
+ elif enzyme.lower() == "none":
+ MissedCleavages = 0
+ else: MissedCleavages = -1
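+ # Worked example of the trypsin count (placeholder peptide): for "AKPLRSTK",
+ # Peptide[:-1] == "AKPLRST" contains one K, one R and one "KP", so
+ # MissedCleavages = 1 + 1 - (1 + 0) = 1.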
+ # Break up Protein into accession # and description
+ first_space = self.Protein.find(' ')
+ if first_space >= 0:
+ Protein = self.Protein[:first_space]
+ ProteinDescr = self.Protein[first_space+1:]
+ ProteinDescr = ProteinDescr.replace(">",">")
+ ProteinDescr = ProteinDescr.replace("<","<")
+ ProteinDescr = ProteinDescr.replace("&","&")
+ ProteinDescr = ProteinDescr.replace("\"",""")
+ ProteinDescr = ProteinDescr.replace("\'","‘")
+ else:
+ Protein = self.Protein
+ ProteinDescr = Protein
+ Hit = '<search_hit\n' + \
+ ' hit_rank="%s"\n' % (rank) + \
+ ' peptide="%s"\n' % (self.Peptide) + \
+ ' peptide_prev_aa="%s"\n' % (self.Prefix) + \
+ ' peptide_next_aa="%s"\n' % (self.Suffix) + \
+ ' protein="%s"\n' % (Protein) + \
+ ' protein_descr="%s"\n' % (ProteinDescr) + \
+ ' num_tot_proteins="0"\n' + \
+ ' num_matched_ions="0"\n' + \
+ ' tot_num_ions="0"\n' + \
+ ' calc_neutral_pep_mass="%s"\n' % (CalcMass) + \
+ ' massdiff="%s"\n' % (MassDiff) + \
+ ' num_tol_term="%s"\n' % "2" + \
+ ' num_missed_cleavages="%d"\n'%(MissedCleavages) + \
+ ' is_rejected="0"\n' + \
+ '>\n'
+ PepXMLHandle.write(Hit)
+ # Create a dictionary of masses of all amino acids that
+ # are modified, indexed by peptide position.
+ # First, add to the dictionary all aa's that have optional mods.
+ # Then, add fixed mods. Use monoisotopic mass for basic AA.
+ ModMassDict = {}
+ for mod in self.OptModList:
+ aa = mod[0]
+ pos = mod[1]
+ mod_mass = mod[2]
+ if pos in ModMassDict:
+ ModMassDict[pos] += float(mod_mass)
+ else:
+ ModMassDict[pos] = float(mod_mass) + Global.AminoMass[aa]
+ for i in range(len(self.Peptide)):
+ aa = self.Peptide[i]
+ pos = i + 1
+ if aa in Global.FixedMods:
+ mod_mass = Global.FixedMods[aa]
+ if pos in ModMassDict:
+ ModMassDict[pos] += float(mod_mass)
+ else:
+ ModMassDict[pos] = float(mod_mass) + \
+ Global.AminoMass[aa]
+ # Now, create a pepXML string with an element for each modified AA.
+ ModString = ''
+ for i in range(len(self.Peptide)):
+ pos = i + 1
+ if pos in ModMassDict:
+ ModString = ModString + '<mod_aminoacid_mass ' + \
+ 'position="%d" ' % pos + \
+ 'mass="%.4f" />' % ModMassDict[pos]
+ if len(ModString) > 0:
+ ModInfo = '<modification_info>%s</modification_info>\n' % \
+ ModString
+ PepXMLHandle.write(ModInfo)
+ PepXMLHandle.write(
+ '<search_score name="mqscore" value="%s"/>\n'%self.MQScore)
+ PepXMLHandle.write(
+ '<search_score name="expect" value="%s"/>\n'%self.PValue)
+ PepXMLHandle.write(
+ '<search_score name="fscore" value="%s"/>\n'%self.FScore)
+ PepXMLHandle.write(
+ '<search_score name="deltascore" value="%s"/>\n'%self.DeltaScore)
+ PepXMLHandle.write('</search_hit>\n')
+
+
+# ======================================================================
+# Virtually all the code is contained within class InspectToPepXMLClass
+# ======================================================================
+
+class InspectToPepXMLClass(ResultsParser.ResultsParser):
+ def __init__(self):
+ """Initialize fields of InspectToPepXMLClass instance to null values
+ """
+ self.InputFilePath = None
+ self.OutputFilePath = None
+ self.SpectraDir = os.getcwd()
+ self.MaxHitsPerCharge = 10000 #effectively maxint
+ self.ParamFilePath = os.path.join(os.getcwd(), "inspect.params")
+ self.ScanOffset = {}
+ self.ScanDict= {}
+ self.SpectrumFileType = ""
+ self.SpectrumFileBase = ""
+ self.Columns = ResultsParser.Columns()
+ ResultsParser.ResultsParser.__init__(self)
+
+ #---------------------------------------------------------------------
+
+ def Main(self):
+ """Convert raw InsPecT output file to PepXML
+
+ Initially designed to handle entire directories of files.
+ """
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "(psyco not found - running in non-optimized mode)"
+ # Line directly below needed only if we want to handle directories
+ #self.self.ProcessResultsFiles(self.InputFilePath,
+ # self.ConvertInspectToPepXML)
+ self.ConvertInspectToPepXML(self.InputFilePath)
+
+ #---------------------------------------------------------------------
+
+ def ConvertInspectToPepXML(self, FilePath):
+ """ Convert a single raw InsPecT output file to PepXML
+ """
+
+ global spectrum_query_count
+
+ # ------------------------------------------------------------
+ # Open input/output files and gather info from auxiliary files
+ # ------------------------------------------------------------
+
+ # Get input filename; open output file handle
+ #FileName = os.path.split(FilePath)[1]
+ #FileName = FileName.replace(".txt", ".xml")
+ #NewPath = os.path.join(self.OutputFilePath, FileName)
+ #PepXMLHandle = open(NewPath, "wb")
+ PepXMLHandle = open(self.OutputFilePath, "wb")
+
+ # Glean info from inspect params file
+ if not os.path.exists(self.ParamFilePath):
+ print >> sys.stderr, "Inspect params file %s does not exist" % \
+ self.ParamFilePath
+ sys.exit()
+ ParamFile = open(self.ParamFilePath, "r")
+ nmods_allowed_per_spectrum = 0
+ nmods_in_params = 0
+ self.mod_weight = []
+ self.mod_aa = []
+ self.mod_type = []
+ self.mod_name = []
+ self.spec_file = []
+ # reset Global.FixedMods to empty; Global.py initializes it to
+ # {"C":57.0518}, but this is a hack we don't want
+ Global.FixedMods = {}
+ self.instrument = "UNKNOWN"
+ self.protease = "trypsin"
+ self.search_db = "UNKNOWN"
+ for Line in ParamFile.readlines():
+ Line = Line.strip() #remove leading and trailing whitespace
+ if Line.lower().startswith("mods,"):
+ nmods_allowed_per_spectrum = int(Line[len("mods,"):])
+ elif Line.lower().startswith("spectra,"):
+ this_spec_file = Line[len("spectra,"):].strip()
+ self.spec_file = self.spec_file + [this_spec_file]
+ elif Line.lower().startswith("mod,"):
+ tokens = Line.split(",")
+ this_mod_weight = float(tokens[1])
+ this_mod_aa_string = tokens[2].strip()
+ if this_mod_aa_string == "*":
+ this_mod_aa_string = "ACDEFGHIKLMNPQRSTVWY"
+ this_mod_type = "opt"
+ if len(tokens) > 3:
+ this_mod_type = tokens[3].strip()
+ if len(tokens) > 4:
+ this_mod_name = tokens[4].strip()
+ else: this_mod_name = None
+ for this_mod_aa in this_mod_aa_string:
+ self.mod_weight = self.mod_weight + [this_mod_weight]
+ self.mod_aa = self.mod_aa + [this_mod_aa]
+ self.mod_type = self.mod_type + [this_mod_type]
+ if this_mod_type == "fix":
+ Global.FixedMods[this_mod_aa] = this_mod_weight
+ self.mod_name = self.mod_name + [this_mod_name]
+ nmods_in_params = nmods_in_params + 1
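+ # Illustrative parse (hypothetical params line): "mod,57.02146,C,fix,CAM"
+ # yields weight 57.02146, aa "C", type "fix", name "CAM", and adds
+ # C -> 57.02146 to Global.FixedMods; an aa field of "*" expands to one
+ # entry per standard amino acid.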
+ elif Line.lower().startswith("instrument,"):
+ self.instrument = Line[len("instrument,"):].strip()
+ elif Line.lower().startswith("protease,"):
+ self.protease = Line[len("protease,"):].strip()
+ elif Line.lower().startswith("db,"):
+ search_db = Line[len("db,"):].strip()
+ search_db_ext = os.path.splitext(search_db)[1]
+ # Find the Fasta format of the .trie file used
+ if search_db_ext not in [".fa", ".fasta"]:
+ search_db_root = os.path.splitext(search_db)[0]
+ search_db_file_list = set(
+ glob.glob("%s.*" % search_db_root))
+ #print search_db_file_list
+ ext_list = [os.path.splitext(f)[1]
+ for f in search_db_file_list]
+ #print ext_list
+ try: ext_list.remove(".index")
+ except: pass
+ try: ext_list.remove(".trie")
+ except: pass
+ #print ext_list
+ if len(ext_list) == 1:
+ search_db = search_db_root + ext_list[0]
+ elif ".fasta" in ext_list:
+ search_db = search_db_root + ".fasta"
+ elif ".fa" in ext_list:
+ search_db = search_db_root + ".fa"
+ else:
+ print >> sys.stderr, \
+ "WARNING: Can't find a RefreshParser compatible database " + \
+ "file corresponding to %s " % search_db + \
+ "(such as a .fasta or .fa file with same root); using UNKNOWN.\n" + \
+ "(%s is the database file listed in your params file.)\n" % search_db
+ self.search_db = search_db
+ self.nmods = nmods_in_params
+
+ # Read just first line of inspect output to get spectrum filename
+# InspectHandle = open(FilePath, "r")
+# for Line in InspectHandle.xreadlines():
+# if Line[0] == "#": # comments
+# continue
+# Bits = list(Line.split("\t"))
+# break
+# InspectHandle.close()
+ InspectHandle = open(FilePath, "rb")
+
+ # Glean RTs & precursor M/z's for each scan from each spectrum file
+ # Also, store the full path for each file in a dictionary
+ # keyed to the filename.
+ retentionTimeDict = dict()
+ precursorMzDict = dict()
+ spectrumPathDict = dict() ###TMF_new
+ for SpectrumFilePath in self.spec_file:
+# SpectrumFilePath = Bits[self.Columns.SpectrumFile]
+ SpectrumFileName = os.path.split(SpectrumFilePath)[1]
+# We used to force the path to be the cwd, but now we're leaving it alone. Dec-11
+# SpectrumFilePath = os.path.join(self.SpectraDir, SpectrumFileName)
+ self.SpectrumFileBase = \
+ SpectrumFilePath.replace(os.path.splitext(SpectrumFilePath)[1], "")
+ #self.SpectrumFileType = os.path.splitext(SpectrumFilePath)[1]
+ spectrumPathDict[SpectrumFileName] = SpectrumFilePath
+
+ if not os.path.exists(SpectrumFilePath):
+ print >> sys.stderr, "Spectrum file %s does not exist" % \
+ SpectrumFilePath
+ sys.exit()
+ (Stub, Ext) = os.path.splitext(SpectrumFilePath)
+ if Ext.lower() == ".mzxml":
+ self.SpectrumFileType = ".mzXML"
+ (this_retentionTimeDict, this_precursorMzDict) = \
+ self.GetSpectrumInfoFromMzXML(SpectrumFilePath)
+ retentionTimeDict.update(this_retentionTimeDict)
+ precursorMzDict.update(this_precursorMzDict)
+ elif Ext.lower() == ".mgf":
+ self.SpectrumFileType = ".mgf"
+ break
+ else:
+ print >> sys.stderr, \
+ "Spectrum file %s lacks .mzXML or .mgf extension" % \
+ SpectrumFilePath
+ sys.exit()
+
+ # ------------------------------------------------------------
+ # - Write opening info to PepXML
+ # - Process InsPecT output file line by line and write to PepXML
+ # - Write closing info to PepXML
+ # ------------------------------------------------------------
+
+ self.WritePepXMLOpening(PepXMLHandle, self.OutputFilePath)
+
+ LastScanNumber = -1
+ spectrum_query_count = 0
+
+ # Each line represents a predicted peptide for a spectrum (scan).
+ # A scan can have multiple predicted peptides (hits).
+ # All hits for a scan are grouped together in the file.
+ # Further, all scans for each spectrum file are grouped
+ # together.
+ for Line in InspectHandle.xreadlines():
+ if Line[0] == "#":
+ self.Columns.initializeHeaders(Line) #This is the header, so save it
+ continue # skip comments
+ # create a record for this line and read the fields into Bits
+ this_rec = InspectOutputRecordClass()
+ Bits = list(Line.split("\t"))
+
+ try:
+ this_rec.FileOffset = int(Bits[self.Columns.getIndex("SpecFilePos")])
+ except:
+ print "WARNING: malformed FileOffset %s in/after scan %d" % (Bits[self.Columns.getIndex("SpecFilePos")], LastScanNumber)
+ continue
+
+ try:
+ ScanNumber = int(Bits[self.Columns.getIndex("Scan#")])
+ except:
+ print "WARNING: malformed ScanNumber %s in/after scan %d" % (Bits[self.Columns.getIndex("Scan#")], LastScanNumber)
+ continue
+
+ ### TMF_new
+ try:
+ SpectrumFilePath = Bits[self.Columns.getIndex("SpectrumFile")]
+ SpectrumFile = os.path.split(SpectrumFilePath)[1]
+ except:
+ print "WARNING: malformed SpectrumFile field in/after scan %d" % (LastScanNumber)
+ continue
+
+ ScanName = SpectrumFile + "." + str(ScanNumber)
+
+ if (LastScanNumber != ScanNumber):
+ if (LastScanNumber != -1):
+ # write results for last spectrum
+ this_scan.WriteSpectrumQueries(PepXMLHandle,
+# SpectrumFileName, self.protease,
+ SpectrumFile, self.protease,
+ self.MaxHitsPerCharge)
+ # initialize new spectrum
+ this_scan = InspectSpectrumClass()
+ this_scan.ScanNumber = ScanNumber
+ this_scan.ScanName = ScanName
+ # get info about spectrum from spectrum file
+ if self.SpectrumFileType == ".mgf":
+ SpectrumFilePath = spectrumPathDict[SpectrumFile] ### TMF_new
+ (MgfPepMass, MgfRT) = \
+ self.GetSpectrumInfoFromMGF(SpectrumFilePath,
+ this_rec.FileOffset)
+ this_scan.PrecursorNeutralMass = float(MgfPepMass)
+ this_scan.RetentionTime = float(MgfRT)
+ elif self.SpectrumFileType == ".mzXML":
+ if not retentionTimeDict.has_key(this_scan.ScanName):
+ print "WARNING: RT for scan %s not found in spectrum file; retention_time_sec will not be output" % ScanName
+ this_scan.RetentionTime = -1.0
+ else:
+ this_scan.RetentionTime = \
+ retentionTimeDict[this_scan.ScanName]
+ if not precursorMzDict.has_key(this_scan.ScanName):
+ print "WARNING: m/z for scan %s not found in spectrum file; precursor_neutral_mass will not be output" % ScanName
+ this_scan.PrecursorMz = -1.0
+ else:
+ this_scan.PrecursorMz = \
+ precursorMzDict[this_scan.ScanName]
+
+ this_rec.Spectrum = this_scan
+ LastScanNumber = ScanNumber
+
+ # ---------------------------
+ # Process data about this hit
+ # ---------------------------
+ Annotation = Bits[self.Columns.getIndex("Annotation")]
+ Peptide = Annotation[2:-2]
+
+ # process peptide string --TMF
+ # I think there is already code to do this in Utils.py
+ # Sam, you may want to replace my code with a call to that.
+ def ExtractAAModifications(search, peptide):
+ '''Given peptide like TVAM+16GGKYphosLV, extract the numbers
+ and other modification symbols.
+
+ Return (a) the peptide without the mods, and
+ (b) a list of (aa, aa-pos, number) tuples --
+                       aa/aa-pos describe the aa possessing the mod.
+ '''
+ i = 0
+ mod_list = []
+ stripped_peptide = ""
+ while i < len(peptide):
+ if peptide[i].isupper():
+ stripped_peptide = stripped_peptide + peptide[i]
+ i = i + 1
+ continue
+ j = i + 1
+ while j < len(peptide) and not peptide[j].isupper():
+ j = j + 1
+ aa = peptide[i-1]
+ added_mod = peptide[i:j]
+                    added_mod_pos = len(stripped_peptide) # 1-based position of the modified residue
+ # modifications with names in the param file
+ # will be represented by their names embedded in the
+ # peptide. Look up their weights.
+ for k in range(nmods_in_params):
+ if search.mod_name[k]:
+ truncated_name = search.mod_name[k][:4]
+ this_weight = int(search.mod_weight[k])
+ if this_weight > 0:
+ weight_string = "+" + str(this_weight)
+ added_mod = added_mod.replace(truncated_name,
+ weight_string)
+ # added_mod could be a concatenation of several mods,
+ # as in AEQDNLGKSVM-5+16IPTK;
+ # store each one as a separate mod.
+ this_mod = ""
+ for i in range(len(added_mod)):
+ c = added_mod[i]
+ if (c == "+" or c == "-"):
+ # store the previous mod
+ if len(this_mod) > 0:
+ mod_list = mod_list + [(aa, added_mod_pos, this_mod)]
+ # start a new mod
+ this_mod = c
+ else:
+ this_mod = this_mod + c
+ # store the last mod
+ mod_list = mod_list + [(aa, added_mod_pos, this_mod)]
+ i = j
+ return (stripped_peptide, mod_list)
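+            # Worked example (a sketch, not taken from the original source): for
+            # the input "TVAM+16GGK" with no named mods configured, the helper
+            # returns ("TVAMGGK", [("M", 4, "+16")]) -- the stripped peptide plus
+            # one (aa, 1-based position, mod) tuple.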
+
+ (this_rec.Peptide, this_rec.OptModList) = \
+ ExtractAAModifications(self, Peptide)
+
+ # done processing peptide string
+
+ this_rec.Prefix = Annotation[0]
+ this_rec.Suffix = Annotation[-1]
+ this_rec.Protein = Bits[self.Columns.getIndex("Protein")]
+ this_rec.Charge = int(Bits[self.Columns.getIndex("Charge")])
+ this_rec.MQScore = Bits[self.Columns.getIndex("MQScore")]
+ this_rec.FScore = Bits[self.Columns.getIndex("F-Score")]
+ this_rec.DeltaScore = Bits[self.Columns.getIndex("DeltaScore")]
+ this_rec.PValue = Bits[self.Columns.getIndex("InspectFDR")]
+ this_rec.ProteinID = Bits[self.Columns.getIndex("RecordNumber")]
+
+ this_scan.HitList[this_rec.Charge] = \
+ this_scan.HitList[this_rec.Charge] + [this_rec]
+
+ # done processing a single line of InsPecT output file
+
+ # write conversion of last line of InsPecT output file
+ this_scan.WriteSpectrumQueries(PepXMLHandle, \
+# SpectrumFileName, self.protease, \
+ SpectrumFile, self.protease, \
+ self.MaxHitsPerCharge)
+
+ self.WritePepXMLClosing(PepXMLHandle)
+ InspectHandle.close()
+ PepXMLHandle.close()
+
+
+ #---------------------------------------------------------------------
+
+ def WritePepXMLOpening(self, PepXMLHandle, PepXMLFilePath):
+ """Write stuff that belongs at the top of the pepXML file"""
+ PepXMLHandle.write('<?xml version="1.0" encoding="UTF-8"?>\n')
+ PepXMLHandle.write('<?xml-stylesheet type="text/xsl" href="pepXML_std.xsl"?>\n')
+ datestr = time.strftime('%Y-%m-%dT%H:%M:%S')
+ PepXMLHandle.write(
+ '<msms_pipeline_analysis ' +
+ 'date="%s" ' % datestr +
+ 'summary_xml="%s" ' %PepXMLFilePath +
+ 'xmlns="http://regis-web.systemsbiology.net/pepXML" ' +
+ 'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ' +
+ 'xsi:schemaLocation="http://regis-web.systemsbiology.net/pepXML /tools/bin/TPP/tpp/schema/pepXML_v112.xsd" ' +
+ '>\n' )
+ #'xsi:schemaLocation="http://regis-web.systemsbiology.net/pepXML http://mascot1/mascot/xmlns/schema/pepXML_v18/pepXML_v18.xsd" ' +
+ PepXMLHandle.write(
+ '<msms_run_summary ' +
+ 'base_name="%s" ' % self.SpectrumFileBase +
+ 'search_engine="InsPecT" ' +
+ 'msManufacturer="UNKNOWN" ' +
+ 'msModel="%s" ' % self.instrument +
+ 'msIonization="UNKNOWN" ' +
+ 'msMassAnalyzer="UNKNOWN" ' +
+ 'msDetector="UNKNOWN" ' +
+ 'raw_data_type="raw" ' +
+ 'raw_data="%s" ' % self.SpectrumFileType +
+ '>\n')
+ PepXMLHandle.write('<sample_enzyme name="%s">\n' % self.protease)
+ PepXMLHandle.write('<specificity cut="KR" no_cut="P" sense="C"/>\n')
+ PepXMLHandle.write('</sample_enzyme>\n')
+ PepXMLHandle.write(
+ '<search_summary ' +
+ 'base_name="%s" ' % self.SpectrumFileBase +
+ 'search_engine="InsPecT" ' +
+ 'precursor_mass_type="monoisotopic" ' +
+ 'fragment_mass_type="monoisotopic" ' +
+ 'search_id="1" ' +
+ 'out_data_type="out" ' +
+ 'out_data=".txt" ' +
+ '>\n')
+ PepXMLHandle.write(
+ '<search_database ' +
+ 'local_path="%s" ' % self.search_db +
+ 'type="AA" ' +
+ '/>\n')
+ #'database_name="" ' +
+ #'database_release_identifier="" ' +
+ #'size_in_db_entries="" ' +
+ #'size_of_residues="" ' +
+ PepXMLHandle.write(
+ '<enzymatic_search_constraint ' +
+ 'enzyme="%s" ' % self.protease +
+ 'max_num_internal_cleavages="2" ' +
+ 'min_number_termini="2" ' +
+ '/>\n')
+ for i in range(self.nmods):
+ mod_aa = self.mod_aa[i]
+ mod_weight = self.mod_weight[i]
+ mass = mod_weight + Global.AminoMass[mod_aa]
+ if self.mod_type[i] == "opt": mod_variable="Y"
+ elif self.mod_type[i]=="fix": mod_variable="N"
+ else: mod_variable="UNKNOWN" # are there other types?
+ PepXMLHandle.write(
+ '<aminoacid_modification ' +
+ 'aminoacid="%s" ' % mod_aa +
+ 'massdiff="%.4f" ' % mod_weight +
+ 'mass="%.4f" ' % mass +
+ 'variable="%s" ' % mod_variable +
+ '/>\n')
+ PepXMLHandle.write('<parameter name="CHARGE" value="2+ and 3+"/>\n')
+ PepXMLHandle.write('<parameter name="CLE" value="Trypsin"/>\n')
+ PepXMLHandle.write('<parameter name="DB" value=""/>\n')
+ PepXMLHandle.write('<parameter name="FILE" value=""/>\n')
+ PepXMLHandle.write('<parameter name="FORMAT" value=""/>\n')
+ PepXMLHandle.write('<parameter name="FORMVER" value=""/>\n')
+ PepXMLHandle.write('<parameter name="INSTRUMENT" value="%s"/>\n' % \
+ self.instrument)
+ PepXMLHandle.write('<parameter name="ITOL" value=""/>\n')
+ PepXMLHandle.write('<parameter name="ITOLU" value="Da"/>\n')
+ PepXMLHandle.write('<parameter name="MASS" value="Monoisotopic"/>\n')
+ PepXMLHandle.write('<parameter name="REPORT" value=""/>\n')
+ PepXMLHandle.write('<parameter name="REPTYPE" value="Peptide"/>\n')
+ PepXMLHandle.write('<parameter name="RULES" value=""/>\n')
+ PepXMLHandle.write('<parameter name="SEARCH" value=""/>\n')
+ PepXMLHandle.write('<parameter name="TAXONOMY" value=""/>\n')
+ PepXMLHandle.write('<parameter name="TOL" value=""/>\n')
+ PepXMLHandle.write('<parameter name="TOLU" value="Da"/>\n')
+ PepXMLHandle.write('</search_summary>\n')
+
+ #---------------------------------------------------------------------
+
+ def WritePepXMLClosing(self, PepXMLHandle):
+ """Write stuff that belongs at the end of the pepXML file"""
+ PepXMLHandle.write('</msms_run_summary>\n')
+ PepXMLHandle.write('</msms_pipeline_analysis>\n')
+
+ #---------------------------------------------------------------------
+
+ def GetAllSpectrumInfoFromMGF(self, FilePath):
+        # Not implemented; calling this is treated as an error.
+        sys.exit(1)
+
+ #---------------------------------------------------------------------
+
+ def GetSpectrumInfoFromMGF(self, FilePath, FileOffset):
+ """ returns the spectrum title and peptide mass corresponding to
+ the spectrum at the given file offset in the given mgf file
+ """
+ File = open(FilePath, "r")
+
+ File.seek(FileOffset)
+ Mass = 0
+ RT = 0
+ Title = None
+
+ MatchMass = re.compile('^PEPMASS=(\S*)')
+ MatchRT = re.compile('^RTINSECONDS=(\S*)')
+ MatchTitle = re.compile('^TITLE=([^\n]*)')
+ # read one line at a time
+ for Line in File:
+
+ # We are not currently using the title
+ #Match = MatchTitle.match(Line)
+ #if Match != None:
+ #Title = Match.group(1)
+ #continue
+
+ # is this a mass line?
+ Match = MatchMass.match(Line)
+ if Match != None:
+ Mass = Match.group(1)
+ continue
+
+ # is this an RT line?
+ Match = MatchRT.match(Line)
+ if Match != None:
+ RT = Match.group(1)
+ continue
+
+            # Not a title, mass, or RT line. If we've already read both the
+            # mass and the RT, stop reading.
+ if Mass!=0 and RT!=0:
+ break
+
+ File.close()
+ if Mass==0 or RT==0:
+            print >> sys.stderr, "WARNING: mass and/or RT missing for spectrum at offset %s in %s" % ( FileOffset, FilePath )
+ return (Mass,RT)
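+        # Example of the header lines this scans for (a sketch of a typical MGF
+        # block; the values are illustrative only):
+        #   BEGIN IONS
+        #   TITLE=scan 25
+        #   RTINSECONDS=1200.5
+        #   PEPMASS=842.51
+        # would return ("842.51", "1200.5") -- note both come back as strings.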
+
+ #--------------------------------------------------------------------
+
+ def GetSpectrumInfoFromMzXML(self, FilePath):
+ """ compiles dictionaries of the precursorMz and retentionTime
+ for each spectrum in an mzXML file
+ """
+
+ def normalize_whitespace(text):
+ "Remove redundant whitespace from a string"
+ return ' '.join(text.split())
+
+ class MzXMLHandler(ContentHandler):
+
+ def __init__(self):
+ self.this_scan = None
+ self.this_precursorMz = None
+ self.precursorMz = dict()
+ self.retentionTime = dict()
+ self.inPrecursorMzContent = 0
+ self.FileName = os.path.split(FilePath)[1] ###TMF_new
+
+ def startElement(self, name, attrs):
+                # Only the scan and precursorMz elements are of interest; ignore everything else
+ if name == 'scan':
+                    # Look for the scan number and retention time attributes
+ num = int(normalize_whitespace(attrs.get('num', None)))
+ retentionTime = normalize_whitespace(
+ attrs.get('retentionTime', None))
+ self.this_scan = int(num)
+ self.this_scan_name = self.FileName + "." + str(num) ###TMF_new
+ self.retentionTime[self.this_scan_name] = \
+ float(retentionTime[2:-1])
+# self.retentionTime[self.this_scan] = \
+# float(retentionTime[2:-1])
+ elif name == 'precursorMz':
+ self.inPrecursorMzContent = 1
+ self.thisprecursorMz = ""
+
+ def characters(self, ch):
+ if self.inPrecursorMzContent:
+ self.thisprecursorMz = self.thisprecursorMz + ch
+
+ def endElement(self, name):
+ if name == 'precursorMz':
+ self.inPrecursorMzContent = 0
+ idx = self.this_scan_name ###TMF_new
+ self.precursorMz[idx] = float(self.thisprecursorMz)
+# i = self.this_scan
+# self.precursorMz[i] = float(self.thisprecursorMz)
+ elif name == 'scan':
+ pass
+
+ # Create an XML parser and tell it
+ # we are not interested in XML namespaces
+ MzXMLparser = make_parser()
+ MzXMLparser.setFeature(feature_namespaces, 0)
+
+ # Create a handler and tell the parser to use it
+ mh = MzXMLHandler()
+ MzXMLparser.setContentHandler(mh)
+
+ # Parse the file
+ File = open(FilePath, "r")
+ try:
+ MzXMLparser.parse(File)
+ except:
+ print >> sys.stderr, "ERROR: SAX parser cannot parse %s" % FilePath
+ sys.exit()
+
+ return (mh.retentionTime, mh.precursorMz)
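+        # Both dictionaries are keyed by "<spectrum file name>.<scan number>",
+        # e.g. "sample.mzXML.25" (an illustrative name). A retentionTime value
+        # such as "PT1200.5S" is stored as the float 1200.5; the leading "PT"
+        # and trailing "S" are stripped by the [2:-1] slice above.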
+
+ #---------------------------------------------------------------------
+
+ def ParseCommandLine(self,Arguments):
+ (Options, Args) = getopt.getopt(Arguments, "i:o:m:p:d:")
+ OptionsSeen = {}
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-i":
+ if not os.path.exists(Value):
+ print "** Error: couldn't find results file '%s'\n\n"%Value
+ print UsageInfo
+ sys.exit(1)
+ self.InputFilePath = Value
+ if Option == "-o":
+ self.OutputFilePath = Value
+ if Option == "-m":
+ self.SpectraDir = Value
+ if Option == "-p":
+ self.ParamFilePath = Value
+ if Option == "-d":
+ self.MaxHitsPerCharge = int(Value)
+ if not OptionsSeen.has_key("-i") or not OptionsSeen.has_key("-o"):
+ print UsageInfo
+ sys.exit(1)
+
+ def Finish(self):
+ self.InputFile.close()
+ self.OutputFile.close()
+
+#-------------------------------------------------------------------------
+
+if __name__ == '__main__':
+ Fix = InspectToPepXMLClass()
+ Fix.ParseCommandLine(sys.argv[1:])
+ Fix.Main()
diff --git a/IonScoring.c b/IonScoring.c
new file mode 100644
index 0000000..9153aef
--- /dev/null
+++ b/IonScoring.c
@@ -0,0 +1,1873 @@
+//Title: IonScoring.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include <stdlib.h>
+#include <math.h>
+#include "Inspect.h"
+#include "Utils.h"
+#include "Errors.h"
+#include "IonScoring.h"
+#include "Spectrum.h"
+#include "Tagger.h"
+
+// Global variables: Bayesian networks for PRM scoring (for MS-Alignment) and
+// for cut scoring (for tagging and match-scoring)
+PRMBayesianModel* PRMModelCharge2 = NULL;
+PRMBayesianModel* PRMModelCharge3 = NULL;
+PRMBayesianModel* TAGModelCharge2 = NULL;
+PRMBayesianModel* TAGModelCharge3 = NULL;
+PRMBayesianModel* PhosCutModelCharge2 = NULL;
+PRMBayesianModel* PhosCutModelCharge3 = NULL;
+
+// Forward declarations:
+int IonScoringGetPrefixContainPhos(PRMBayesianNode* Node, Peptide* Match, int AminoIndex);
+int IonScoringGetSuffixContainPhos(PRMBayesianNode* Node, Peptide* Match, int BreakIndex);
+PRMBayesianModel* GetScoringModel(Peptide* Match, int Charge);
+void AnnotateParentPeaks(MSSpectrum* Spectrum, Peptide* Match, PRMBayesianModel* Model);
+void ClaimParentPeak(MSSpectrum* Spectrum, Peptide* Match, int Mass, PRMBayesianModel* Model);
+
+// Free a node from a Bayesian network; helper for FreePRMBayesianModel
+void FreePRMBayesianNode(PRMBayesianNode* Node)
+{
+ if (!Node)
+ {
+ return;
+ }
+ SafeFree(Node->Parents);
+ SafeFree(Node->ParentBlocks);
+ SafeFree(Node->CountTable);
+ SafeFree(Node->ProbTable);
+ SafeFree(Node);
+}
+
+// Free a Bayesian network model.
+void FreePRMBayesianModel(PRMBayesianModel* Model)
+{
+ PRMBayesianNode* Node;
+ PRMBayesianNodeHolder* Holder;
+ PRMBayesianNodeHolder* PrevHolder = NULL;
+ PRMBayesianNode* Prev = NULL;
+ //
+ if (!Model)
+ {
+ return;
+ }
+ // Free the linked list of node-holders that require flanking amino acid info:
+ for (Holder = Model->FirstFlank; Holder; Holder = Holder->Next)
+ {
+ SafeFree(PrevHolder);
+ PrevHolder = Holder;
+ }
+ SafeFree(PrevHolder);
+
+ // Free the linked list of all nodes:
+ for (Node = Model->Head; Node; Node = Node->Next)
+ {
+ FreePRMBayesianNode(Prev);
+ Prev = Node;
+ }
+ FreePRMBayesianNode(Prev);
+ SafeFree(Model->Nodes);
+ SafeFree(Model);
+}
+
+// Add a node to a Bayesian network. Called from PyInspect when building up
+// a network (semi)interactively, not used in production.
+void AddPRMBayesianNode(PRMBayesianModel* Model, char* Name, int NodeType, int NodeFlag, float NodeMassOffset,
+ int FragmentType)
+{
+ PRMBayesianNode* Node;
+ //
+ // Create the node:
+ Node = (PRMBayesianNode*)calloc(1, sizeof(PRMBayesianNode));
+ Node->Type = NodeType;
+ strncpy(Node->Name, Name, 256);
+ Node->MassOffset = (int)(NodeMassOffset * DALTON);
+ Node->Flag = NodeFlag;
+ Node->Index = Model->NodeCount;
+ Node->FragmentType = FragmentType;
+ Model->NodeCount++;
+ // Insert the node into the list:
+ if (Model->Tail)
+ {
+ Model->Tail->Next = Node;
+ }
+ else
+ {
+ Model->Head = Node;
+ }
+ Model->Tail = Node;
+
+ // Insert the node into the array:
+ if (Model->Nodes)
+ {
+ Model->Nodes = (PRMBayesianNode**)realloc(Model->Nodes, Model->NodeCount * sizeof(PRMBayesianNode*));
+ }
+ else
+ {
+ Model->Nodes = (PRMBayesianNode**)calloc(sizeof(PRMBayesianNode*), 1);
+ }
+ Model->Nodes[Model->NodeCount - 1] = Node;
+
+ // Now set the value count:
+ switch (Node->Type)
+ {
+ case evPRMBPrefix:
+ case evPRMBPrefix2:
+ case evPRMBSuffix:
+ case evPRMBSuffix2:
+ // The number of values is determined by the intensity scheme:
+ switch (Model->IntensityScheme)
+ {
+ case 0:
+ case 1:
+ case 4:
+ Node->ValueCount = 4;
+ break;
+ case 2:
+ case 3:
+ Node->ValueCount = 3;
+ break;
+ default:
+ REPORT_ERROR(0);
+ break;
+ }
+ break;
+ case evSector:
+ // The number of values is 2, 3, 4, or 5, depending on our sector count:
+ switch (Node->Flag)
+ {
+ case 0:
+ Node->ValueCount = 2;
+ break;
+ case 1:
+ Node->ValueCount = 3;
+ break;
+ case 2:
+ Node->ValueCount = 4;
+ break;
+ case 3:
+ Node->ValueCount = 5;
+ break;
+ case 4:
+ Node->ValueCount = 5;
+ break;
+ default:
+ REPORT_ERROR(0);
+ break;
+ }
+ break;
+ case evFlank:
+ // The number of values depends on the flank scheme flag:
+ switch (Node->Flag)
+ {
+ case 0:
+ Node->ValueCount = 4;
+ break;
+ case 1:
+ Node->ValueCount = 4;
+ break;
+ case 2:
+ Node->ValueCount = 3;
+ break;
+ case 3:
+ Node->ValueCount = 3;
+ break;
+ default:
+ REPORT_ERROR(0);
+ break;
+ }
+ break;
+ case evPrefixAA:
+ case evSuffixAA:
+ // PrefixAA and SuffixAA nodes are simple binary nodes.
+ Node->ValueCount = 2;
+ break;
+ case evPrefixContain:
+ switch (Node->Flag)
+ {
+ case 0:
+ // Acid residue (flag)
+ Node->ValueCount = 2;
+ break;
+ case 1:
+ // Acid residue (0, 1, many)
+ Node->ValueCount = 3;
+ break;
+ case 2:
+ // Basic residue (flag)
+ Node->ValueCount = 2;
+ break;
+ case 3:
+ // Basic residue (0, 1, many)
+            // Three values (0, 1, many), matching what IonScoringGetFragmentContain returns for this flag
+            Node->ValueCount = 3;
+ break;
+ default:
+ REPORT_ERROR(0);
+ break;
+ }
+ break;
+ case evSuffixContain:
+ switch (Node->Flag)
+ {
+ case 0:
+ // Acid residue (flag)
+ Node->ValueCount = 2;
+ break;
+ case 1:
+ // Acid residue (0, 1, many)
+ Node->ValueCount = 3;
+ break;
+ case 2:
+ // Basic residue (flag)
+ Node->ValueCount = 2;
+ break;
+ case 3:
+ // Basic residue (0, 1, many)
+            // Three values (0, 1, many), matching what IonScoringGetFragmentContain returns for this flag
+            Node->ValueCount = 3;
+ break;
+ default:
+ REPORT_ERROR(0);
+ break;
+ }
+ break;
+ case evPrefixContainPhos:
+ case evSuffixContainPhos:
+ //has phosphate on the fragment (flag)
+ Node->ValueCount = 2;
+ break;
+ default:
+ printf("* Error: Unknown Node->Type in AddPRMBayesianNode\n");
+ break;
+ }
+ // Allocate initial count/probability tables, assuming no parents for the node:
+ Node->CountTable = (int*)calloc(Node->ValueCount, sizeof(int));
+ Node->ProbTable = (float*)calloc(Node->ValueCount, sizeof(float));
+ Node->TableSize = Node->ValueCount;
+}
+
+// Given a spectrum, compute the intensity-thresholds for level 0 (strongest)
+// through level n (absent).
+int ComputeSpectrumIntensityThresholds(PRMBayesianModel* Model, MSSpectrum* Spectrum)
+{
+ int ThresholdCount;
+ int CutoffRank;
+ int WeakRank;
+ int PeakIndex;
+ float SortedIntensity[200];
+ int WeakPeakCount = 0;
+ float TotalIntensity = 0;
+ float GrassIntensity;
+ float StrongPeakIntensity;
+ //
+
+ switch (Model->IntensityScheme)
+ {
+ case 0:
+ case 1:
+        // Schemes 0 and 1: top N peaks, then high, low, absent
+ ThresholdCount = 4;
+ Spectrum->IntensityThresholds = (float*)calloc(5, sizeof(float));
+ StrongPeakIntensity = -1;
+ CutoffRank = (int)(Spectrum->ParentMass / (50 * DALTON));
+ WeakRank = max(CutoffRank, Spectrum->PeakCount - 200);
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ if (Spectrum->Peaks[PeakIndex].IntensityRank >= WeakRank)
+ {
+ SortedIntensity[WeakPeakCount] = Spectrum->Peaks[PeakIndex].Intensity;
+ WeakPeakCount++;
+ }
+ else
+ {
+ if (StrongPeakIntensity < 0 || StrongPeakIntensity > Spectrum->Peaks[PeakIndex].Intensity)
+ {
+ StrongPeakIntensity = Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ }
+ TotalIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ if (WeakPeakCount == 200)
+ {
+ break;
+ }
+ }
+ if (!WeakPeakCount)
+ {
+ GrassIntensity = TotalIntensity / (2 * Spectrum->PeakCount);
+ }
+ else
+ {
+ qsort(SortedIntensity, WeakPeakCount, sizeof(float), (QSortCompare)CompareFloats);
+ GrassIntensity = SortedIntensity[WeakPeakCount / 2];
+ }
+ Spectrum->IntensityThresholds[0] = StrongPeakIntensity;
+ Spectrum->IntensityThresholds[1] = (float)min(StrongPeakIntensity * 0.5, GrassIntensity * 2);
+ //Spectrum->IntensityThresholds[2] = (float)0.5 * GrassIntensity;
+ Spectrum->IntensityThresholds[2] = 0;
+ Spectrum->IntensityThresholds[3] = -1;
+ break;
+ case 2:
+ case 3:
+        // Schemes 2 and 3: top N peaks, then present, absent
+ ThresholdCount = 3;
+ Spectrum->IntensityThresholds = (float*)calloc(5, sizeof(float));
+ StrongPeakIntensity = -1;
+ CutoffRank = (int)(Spectrum->ParentMass / (50 * DALTON));
+ WeakRank = max(CutoffRank, Spectrum->PeakCount - 200);
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ if (Spectrum->Peaks[PeakIndex].IntensityRank >= WeakRank)
+ {
+ SortedIntensity[WeakPeakCount] = Spectrum->Peaks[PeakIndex].Intensity;
+ WeakPeakCount++;
+ }
+ else
+ {
+ if (StrongPeakIntensity < 0 || StrongPeakIntensity > Spectrum->Peaks[PeakIndex].Intensity)
+ {
+ StrongPeakIntensity = Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ }
+ TotalIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ if (WeakPeakCount == 200)
+ {
+ break;
+ }
+ }
+ if (!WeakPeakCount)
+ {
+ GrassIntensity = TotalIntensity / (2 * Spectrum->PeakCount);
+ }
+ else
+ {
+ qsort(SortedIntensity, WeakPeakCount, sizeof(float), (QSortCompare)CompareFloats);
+ GrassIntensity = SortedIntensity[WeakPeakCount / 2];
+ }
+ Spectrum->IntensityThresholds[0] = StrongPeakIntensity;
+ Spectrum->IntensityThresholds[1] = 0; //GrassIntensity * 0.5;
+ //Spectrum->IntensityThresholds[2] = (float)0.5 * GrassIntensity;
+ Spectrum->IntensityThresholds[2] = -1;
+ //Spectrum->IntensityThresholds[3] = -1;
+ break;
+ case 4:
+ //Scheme 4: partitioned by ratio to grass
+ ThresholdCount = 4;
+ Spectrum->IntensityThresholds = (float*)calloc(5, sizeof(float));
+ WeakRank = (Spectrum->PeakCount / 3 ); //AverageGrass = median of bottom 1/3 of peaks
+ WeakRank = min(200, WeakRank); //at most 200, limited by array size
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ if (Spectrum->Peaks[PeakIndex].IntensityRank >= WeakRank)
+ {
+ SortedIntensity[WeakPeakCount] = Spectrum->Peaks[PeakIndex].Intensity;
+ WeakPeakCount++;
+ }
+ }
+ if (!WeakPeakCount)
+ {
+ GrassIntensity = TotalIntensity / (2 * Spectrum->PeakCount);
+ }
+ else
+ {
+ qsort(SortedIntensity, WeakPeakCount, sizeof(float), (QSortCompare)CompareFloats);
+ GrassIntensity = SortedIntensity[WeakPeakCount / 2];
+ }
+ Spectrum->IntensityThresholds[0] = GrassIntensity * (float)10.0;
+ Spectrum->IntensityThresholds[1] = GrassIntensity * 2;
+ Spectrum->IntensityThresholds[2] = GrassIntensity * (float)0.1;
+ Spectrum->IntensityThresholds[3] = -1;
+ break;
+ default:
+ REPORT_ERROR(0);
+ return 0;
+ }
+ return ThresholdCount;
+}
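+// Worked example for scheme 4 (illustrative numbers only): if the median of the
+// weakest third of peaks gives GrassIntensity = 100, the thresholds become
+// {1000, 200, 10, -1}. A bin whose summed intensity is 250 exceeds the level-1
+// threshold but not the level-0 one, so it is assigned intensity level 1.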
+
+// Prepare a spectrum for PRM and cut scoring. Compute intensity cutoffs, binned
+// intensities, and binned intensity levels.
+void PrepareSpectrumForIonScoring(PRMBayesianModel* Model, MSSpectrum* Spectrum, int ForceRefresh)
+{
+ int WeakPeakCount = 0;
+ float TotalIntensity = 0;
+ int ThresholdCount;
+ int PeakIndex;
+ int IntensityLevel;
+ int BinScalingFactor = 100; // One bin per 0.1Da
+ int CountByIntensityLevel[16];
+ int Bin;
+ int NearBin;
+ SpectralPeak* Peak;
+ int MaxParentMass;
+ float Intensity;
+ float Probability;
+ float Multiplier;
+ int Skew;
+ //
+ if (Spectrum->IntensityThresholds && !ForceRefresh)
+ {
+ return; // Already set!
+ }
+ if (!Spectrum->PeakCount)
+ {
+ return;
+ }
+ if (!Model)
+ {
+ return;
+ }
+
+ ///////////////////////////////
+ // Free any old info:
+ SafeFree(Spectrum->BinnedIntensities);
+ Spectrum->BinnedIntensities = NULL;
+ SafeFree(Spectrum->BinnedIntensitiesTight);
+ Spectrum->BinnedIntensitiesTight = NULL;
+ SafeFree(Spectrum->BinnedIntensityLevels);
+ Spectrum->BinnedIntensityLevels = NULL;
+ SafeFree(Spectrum->BinPeakIndex);
+ Spectrum->BinPeakIndex = NULL;
+ SafeFree(Spectrum->IonScoringNoiseProbabilities);
+ Spectrum->IonScoringNoiseProbabilities = NULL;
+ SafeFree(Spectrum->IntensityThresholds);
+ Spectrum->IntensityThresholds = NULL;
+ ///////////////////////////////
+ ThresholdCount = ComputeSpectrumIntensityThresholds(Model, Spectrum);
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////
+ // We know our intensity thresholds; now let's compute the binned intensities:
+ MaxParentMass = Spectrum->MZ * 3 + (2 * HYDROGEN_MASS);
+
+ Spectrum->IntensityBinCount = (MaxParentMass + DALTON) / BinScalingFactor;
+ Spectrum->BinnedIntensities = (float*)calloc(Spectrum->IntensityBinCount, sizeof(float));
+ Spectrum->BinnedIntensitiesTight = (float*)calloc(Spectrum->IntensityBinCount, sizeof(float));
+ Spectrum->BinnedIntensityLevels = (int*)calloc(Spectrum->IntensityBinCount, sizeof(int));
+
+ Spectrum->BinPeakIndex = (int*)calloc(Spectrum->IntensityBinCount, sizeof(int));
+ for (Bin = 0; Bin < Spectrum->IntensityBinCount; Bin++)
+ {
+ Spectrum->BinPeakIndex[Bin] = -1;
+ }
+ // Iterate over spectral peaks, putting intensity into bins:
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ Peak = Spectrum->Peaks + PeakIndex;
+ Bin = (Peak->Mass + 50) / BinScalingFactor;
+ for (NearBin = Bin - 6; NearBin < Bin + 7; NearBin++)
+ {
+ if (NearBin < 0 || NearBin >= Spectrum->IntensityBinCount)
+ {
+ continue;
+ }
+ Skew = abs(Peak->Mass - (NearBin * BinScalingFactor));
+ if (Skew > Model->IntensityRadius)
+ {
+ continue;
+ }
+ Multiplier = 1.0; // default
+ if (Model->IntensityScheme == 1 || Model->IntensityScheme == 3)
+ {
+ if (Skew >= Model->HalfIntensityRadius)
+ {
+ Multiplier = 0.5;
+ }
+ }
+ Spectrum->BinnedIntensities[NearBin] += Peak->Intensity * Multiplier;
+ if (Skew < INTENSITY_BIN_RADIUS_TIGHT)
+ {
+ Spectrum->BinnedIntensitiesTight[NearBin] += Peak->Intensity;
+ }
+ if (Spectrum->BinPeakIndex[NearBin] < 0)
+ {
+ Spectrum->BinPeakIndex[NearBin] = PeakIndex;
+ }
+ }
+ }
+ // Compute the intensity level (absent, lo, med, hi) for each bin:
+ //ComputeSpectrumIntensityCutoffs(Spectrum);
+ memset(CountByIntensityLevel, 0, sizeof(int) * 16);
+ for (Bin = 0; Bin < Spectrum->IntensityBinCount; Bin++)
+ {
+ Intensity = Spectrum->BinnedIntensities[Bin];
+ for (IntensityLevel = 0; IntensityLevel < 99; IntensityLevel++)
+ {
+ if (Intensity > Spectrum->IntensityThresholds[IntensityLevel])
+ {
+ Spectrum->BinnedIntensityLevels[Bin] = IntensityLevel;
+ CountByIntensityLevel[IntensityLevel]++;
+ break;
+ }
+ }
+ }
+ ////////////////////////////////////////////////////////////////////////////////////////////////
+ // Now let's compute the fraction of mass bins which attain these intensity thresholds 'by chance'.
+ // This fraction is used for scoring PRMs; the bonus for having a y peak is smaller for a very
+ // thick spectrum than for a very sparse spectrum.
+ Spectrum->IonScoringNoiseProbabilities = (float*)calloc(ThresholdCount + 1, sizeof(float));
+ for (IntensityLevel = 0; IntensityLevel < ThresholdCount; IntensityLevel++)
+ {
+ Probability = (CountByIntensityLevel[IntensityLevel] + 1) / (float)Spectrum->IntensityBinCount;
+ Spectrum->IonScoringNoiseProbabilities[IntensityLevel] = (float)log(Probability);
+ }
+}
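+// Worked example of the noise probabilities above (illustrative numbers): if 200
+// of 2000 mass bins reach intensity level 1 by chance, the stored value is
+// log((200 + 1) / 2000), roughly -2.3. PRM and cut scoring later subtract this
+// from a node's log-probability, so a matching peak is worth less in a dense
+// spectrum (where many bins are occupied anyway) than in a sparse one.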
+
+// Return the intensity level for this mass. If this is a cut, claim the peaks; otherwise,
+// just return the intensity level.
+int IonScoringGetPeakIntensity(PRMBayesianModel* Model, MSSpectrum* Spectrum, int Mass, int FragmentType, int SeizePeakAminoIndex)
+{
+ int Bin;
+ int MinMass;
+ int MaxMass;
+ float Intensity = 0;
+ int IntensityLevelIndex;
+ int PeakIndex;
+ int Skew;
+ float Multiplier;
+ //
+ Bin = (Mass + 50) / 100; // Bin width 0.1Da
+ MinMass = Mass - Model->IntensityRadius;
+ MaxMass = Mass + Model->IntensityRadius;
+
+ // If the mass is off the scale, then you get no peaks:
+ if (Bin >= Spectrum->IntensityBinCount || Bin < 0)
+ {
+ return Model->MinIntensityLevel;
+ }
+
+ // If this is a PRM (not a cut), then look up the intensity level
+ // in the spectrum's array:
+ if (SeizePeakAminoIndex < 0)
+ {
+ return Spectrum->BinnedIntensityLevels[Bin];
+ }
+
+ PeakIndex = Spectrum->BinPeakIndex[Bin];
+ if (PeakIndex >= 0)
+ {
+ for ( ; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ if (Spectrum->Peaks[PeakIndex].Mass > MaxMass)
+ {
+ break;
+ }
+ if (Spectrum->Peaks[PeakIndex].Mass < MinMass)
+ {
+ continue;
+ }
+
+ Multiplier = 1.0; // default
+ Skew = abs(Mass - Spectrum->Peaks[PeakIndex].Mass);
+ if (Model->IntensityScheme == 1 || Model->IntensityScheme == 3)
+ {
+ if (Skew >= Model->HalfIntensityRadius)
+ {
+ Multiplier = 0.5;
+ }
+ }
+ if (Spectrum->Peaks[PeakIndex].IonType)
+ {
+ // This peak has already been CLAIMED by another ion type:
+ continue;
+ }
+ Intensity += Spectrum->Peaks[PeakIndex].Intensity * Multiplier;
+ // CLAIM this spectrum:
+ Spectrum->Peaks[PeakIndex].IonType = FragmentType;
+ Spectrum->Peaks[PeakIndex].AminoIndex = SeizePeakAminoIndex;
+ }
+ }
+ for (IntensityLevelIndex = 0; IntensityLevelIndex < 99; IntensityLevelIndex++)
+ {
+ if (Intensity > Spectrum->IntensityThresholds[IntensityLevelIndex])
+ {
+ return IntensityLevelIndex;
+ }
+ }
+ return 0;
+}
+
+// Compute the sector for a given mass. The sector is a simple partition of
+// the mass range (low/high, or low/medium/high, etc).
+int IonScoringGetSector(PRMBayesianNode* Node, int ParentMass, int Mass)
+{
+ switch (Node->Flag)
+ {
+ case 0:
+ // Two sectors, LOW and HIGH:
+ if (Mass < ParentMass / 2)
+ {
+ return 0;
+ }
+ else
+ {
+ return 1;
+ }
+ break;
+ case 1:
+ // Three sectors, LOW and MEDIUM and HIGH:
+ if (Mass < ParentMass * 0.33)
+ {
+ return 0;
+ }
+ if (Mass < ParentMass * 0.66)
+ {
+ return 1;
+ }
+ return 2;
+ break;
+ case 2:
+ // Four sectors:
+ if (Mass < ParentMass * 0.25)
+ {
+ return 0;
+ }
+ if (Mass < ParentMass * 0.5)
+ {
+ return 1;
+ }
+ if (Mass < ParentMass * 0.75)
+ {
+ return 2;
+ }
+ return 3;
+ break;
+ case 3:
+ // Five sectors:
+ if (Mass < ParentMass * 0.2)
+ {
+ return 0;
+ }
+ if (Mass < ParentMass * 0.4)
+ {
+ return 1;
+ }
+ if (Mass < ParentMass * 0.6)
+ {
+ return 2;
+ }
+ if (Mass < ParentMass * 0.8)
+ {
+ return 3;
+ }
+ return 4;
+ break;
+ default:
+ REPORT_ERROR(0);
+ break;
+ }
+ return 0;
+}
+
+// Compute the value of the Flank feature. These features reflect flanking amino acids
+// which have effects on fragment intensities.
+int IonScoringGetFlank(PRMBayesianNode* Node, char Left, char Right)
+{
+ //
+ switch (Node->Flag)
+ {
+ case 0:
+ // Default B flank:
+ // G or P on left: Strong suppression
+ if (Left == 'G' || Left == 'P')
+ {
+ return 0;
+ }
+ // P on right: Augmentation
+ if (Right == 'P')
+ {
+ return 1;
+ }
+ // H or R on right: Suppression
+ if (Right == 'H' || Right == 'R')
+ {
+ return 2;
+ }
+ return 3;
+ break;
+ case 1:
+ // Default Y flank:
+ // P on right: Strong augmentation
+ if (Right == 'P')
+ {
+ return 0;
+ }
+ // K or R on right: Strong suppression
+ if (Right == 'R' || Right == 'K')
+ {
+ return 1;
+ }
+ // H on right or P on left: suppression
+ if (Left == 'P' || Right == 'H')
+ {
+ return 2;
+ }
+ return 3;
+ default:
+ REPORT_ERROR(0);
+ break;
+ }
+ return 0;
+}
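+// Example for the default b-flank node (Flag 0): a cut with Left = 'G' returns 0
+// (strong suppression) even if Right = 'P', because the suppression case is
+// checked first; a cut with Left = 'A' and Right = 'P' returns 1 (augmentation).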
+
+// Compute a feature for whether the N- or C-terminal portion of a peptide contains acidic
+// or basic residues. (Not used in production)
+int IonScoringGetFragmentContain(PRMBayesianNode* Node, Peptide* Match, int AminoIndex, int SuffixFlag)
+{
+ int MinIndex;
+ int MaxIndex;
+ int CheckIndex;
+ int Count = 0;
+ //
+ if (SuffixFlag)
+ {
+ MinIndex = AminoIndex;
+ MaxIndex = strlen(Match->Bases);
+ }
+ else
+ {
+ MinIndex = 0;
+ MaxIndex = AminoIndex;
+ }
+ for (CheckIndex = MinIndex; CheckIndex < MaxIndex; CheckIndex++)
+ {
+ switch (Match->Bases[CheckIndex])
+ {
+ case 'D':
+ case 'E':
+ if (Node->Flag == 0 || Node->Flag == 1)
+ {
+ Count++;
+ }
+ break;
+ case 'R':
+ case 'K':
+ case 'H':
+ if (Node->Flag == 2 || Node->Flag == 3)
+ {
+ Count++;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ switch (Node->Flag)
+ {
+ case 0:
+ case 2:
+ if (Count)
+ {
+ return 1;
+ }
+ else
+ {
+ return 0;
+ }
+ break;
+ case 1:
+ case 3:
+ if (Count > 1)
+ {
+ return 2;
+ }
+ else if (Count)
+ {
+ return 1;
+ }
+ else
+ {
+ return 0;
+ }
+ break;
+ default:
+ REPORT_ERROR(0);
+ break;
+ }
+ return 0; // unreachable
+}
+
+int IonScoringGetPrefixContainPhos(PRMBayesianNode* Node, Peptide* Match, int AminoIndex)
+{
+ int ModIndex;
+ int ModifiedResidueIndex = -1;
+ //
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (!Match->ModType[ModIndex])
+ {
+ break;
+ }
+ if (Match->ModType[ModIndex]->Flags & DELTA_FLAG_PHOSPHORYLATION)
+ {
+ ModifiedResidueIndex = Match->AminoIndex[ModIndex];
+ if (ModifiedResidueIndex < AminoIndex)
+ {
+ return 1;
+ }
+ }
+ }
+ //got all the way here without returning anything.
+ return 0;
+}
+
+int IonScoringGetSuffixContainPhos(PRMBayesianNode* Node, Peptide* Match, int BreakIndex)
+{
+ int ModIndex;
+ int ModifiedResidueIndex = -1;
+ //
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (!Match->ModType[ModIndex])
+ {
+ break;
+ }
+ if (Match->ModType[ModIndex]->Flags & DELTA_FLAG_PHOSPHORYLATION)
+ {
+ ModifiedResidueIndex = Match->AminoIndex[ModIndex];
+ if (BreakIndex <= ModifiedResidueIndex)
+ {
+ return 1;
+ }
+ }
+ }
+ //got all the way here without returning anything.
+ return 0;
+}
+
+// Return the value for a particular PRM or cut. This function calls the appropriate setter based
+// on the node type.
+// Important special note: AminoIndex should be -1 if we're getting PRM scores!
+int IonScoringGetNodeValue(PRMBayesianModel* Model, PRMBayesianNode* Node, MSSpectrum* Spectrum, int PRM,
+ Peptide* Match, int AminoIndex)
+{
+ int Suffix;
+ int PeptideLen;
+ char PrefixAA;
+ char SuffixAA;
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // Set values for the current PRM:
+ switch (Node->Type)
+ {
+ case evPRMBPrefix:
+ // Handle b peak, or other N-terminal fragment:
+ return IonScoringGetPeakIntensity(Model, Spectrum, PRM + Node->MassOffset, Node->FragmentType, AminoIndex);
+ //Node->Value = IonScoringSetIntensityLevel(Spectrum, Intensity);
+ break;
+ case evPRMBPrefix2:
+ // Handle doubly-charged N-terminal fragment:
+ return IonScoringGetPeakIntensity(Model, Spectrum, (PRM + Node->MassOffset + HYDROGEN_MASS) / 2, Node->FragmentType, AminoIndex);
+ //Node->Value = IonScoringSetIntensityLevel(Spectrum, Intensity);
+ break;
+ case evPRMBSuffix:
+ // Handle C-terminal fragment:
+ Suffix = Spectrum->ParentMass - PRM;
+ return IonScoringGetPeakIntensity(Model, Spectrum, Suffix + Node->MassOffset, Node->FragmentType, AminoIndex);
+ //Node->Value = IonScoringSetIntensityLevel(Spectrum, Intensity);
+ break;
+ case evPRMBSuffix2:
+ // Handle doubly-charged C-terminal fragment:
+ Suffix = Spectrum->ParentMass - PRM;
+ return IonScoringGetPeakIntensity(Model, Spectrum, (Suffix + Node->MassOffset + HYDROGEN_MASS) / 2, Node->FragmentType, AminoIndex);
+ //Node->Value = IonScoringSetIntensityLevel(Spectrum, Intensity);
+ break;
+ case evSector:
+ // Handle "sector" (which part of the mass range this mass lies in)
+ return IonScoringGetSector(Node, Spectrum->ParentMass, PRM);
+ break;
+ case evFlank:
+ // Handle "flank" (for cuts only: based on prefix and suffix amino acids)
+ // If no peptide, return 0 (always the "default" intensity)
+ if (!Match)
+ {
+ return 0;
+ }
+ PeptideLen = strlen(Match->Bases);
+ if (AminoIndex > 0)
+ {
+ PrefixAA = Match->Bases[AminoIndex - 1];
+ }
+ else
+ {
+ PrefixAA = '\0';
+ }
+ if (AminoIndex < PeptideLen)
+ {
+ SuffixAA = Match->Bases[AminoIndex];
+ }
+ else
+ {
+ SuffixAA = '\0';
+ }
+ return IonScoringGetFlank(Node, PrefixAA, SuffixAA);
+ break;
+ case evPrefixAA:
+ if (AminoIndex > 0)
+ {
+ PrefixAA = Match->Bases[AminoIndex - 1];
+ }
+ else
+ {
+ PrefixAA = '\0';
+ }
+ if ((PrefixAA - 'A') == Node->Flag)
+ {
+ return 1;
+ }
+ else
+ {
+ return 0;
+ }
+ break;
+ case evSuffixAA:
+ PeptideLen = strlen(Match->Bases);
+ if (AminoIndex < PeptideLen)
+ {
+ SuffixAA = Match->Bases[AminoIndex];
+ }
+ else
+ {
+ SuffixAA = '\0';
+ }
+ if ((SuffixAA - 'A') == Node->Flag)
+ {
+ return 1;
+ }
+ else
+ {
+ return 0;
+ }
+ break;
+ case evPrefixContain:
+ return IonScoringGetFragmentContain(Node, Match, AminoIndex, 0);
+ case evSuffixContain:
+ return IonScoringGetFragmentContain(Node, Match, AminoIndex, 1);
+ case evPrefixContainPhos:
+ return IonScoringGetPrefixContainPhos(Node, Match, AminoIndex);
+ case evSuffixContainPhos:
+ return IonScoringGetSuffixContainPhos(Node, Match, AminoIndex);
+ default:
+ REPORT_ERROR(0);
+ break;
+ }
+ return 0;
+}
+
+// For debugging purposes, print out the definition of a PRMBayesianModel.
+void DebugPrintPRMBayesianModel(PRMBayesianModel* Model)
+{
+ PRMBayesianNode* Node;
+ PRMBayesianNode* Parent;
+ int NodeIndex;
+ int ParentIndex;
+ printf(">>>DebugPrintPRMBayesianModel\n");
+ printf("CutFlag %d IntensityRadius %.2f IntensityScheme %d\n", Model->CutFlag, Model->IntensityRadius / (float)DALTON, Model->IntensityScheme);
+ for (Node = Model->Head, NodeIndex = 0; Node; Node = Node->Next, NodeIndex++)
+ {
+ printf(">>Node %d of %d %s:\n", NodeIndex, Model->NodeCount, Node->Name);
+ printf(" Type %d flag %d mass offset %.2f\n", Node->Type, Node->Flag, Node->MassOffset / (float)DALTON);
+ printf(" Valuecount %d\n", Node->ValueCount);
+ for (ParentIndex = 0; ParentIndex < Node->ParentCount; ParentIndex++)
+ {
+ Parent = Node->Parents[ParentIndex];
+ printf(" Parent %d of %d: %s\n", ParentIndex, Node->ParentCount, Parent->Name);
+ }
+ }
+ printf(">>> End of model <<<\n");
+}
+
+// The tag-scoring Bayesian network includes some features which rely upon flanking amino acids.
+// These nodes must be visited during tag generation, when the flanking amino acids are finally
+// learned. To save time, we build up a singly-linked list (Model->FirstFlank...Model->LastFlank)
+// to keep track of such nodes.
+void BuildModelFlankList(PRMBayesianModel* Model)
+{
+ int NodeIndex;
+ PRMBayesianNode* Node;
+ PRMBayesianNode* Parent;
+ PRMBayesianNodeHolder* Holder;
+ int ParentIndex;
+ //
+ // Set flank flags of all nodes:
+ for (NodeIndex = 0; NodeIndex < Model->NodeCount; NodeIndex++)
+ {
+ Node = Model->Nodes[NodeIndex];
+ Node->FlankFlag = 0; // default
+ if (Node->Type == evFlank || Node->Type == evPrefixAA || Node->Type == evSuffixAA)
+ {
+ Node->FlankFlag = 1;
+ }
+ for (ParentIndex = 0; ParentIndex < Node->ParentCount; ParentIndex++)
+ {
+ Parent = Node->Parents[ParentIndex];
+ if (Parent->Type == evFlank || Parent->Type == evPrefixAA || Parent->Type == evSuffixAA)
+ {
+ Node->FlankFlag = 1;
+ }
+ }
+ }
+ // Build linked list of nodes which rely upon flanking amino acid info:
+ for (NodeIndex = 0; NodeIndex < Model->NodeCount; NodeIndex++)
+ {
+ Node = Model->Nodes[NodeIndex];
+ if (Node->FlankFlag)
+ {
+ // Add a NodeHolder for this node:
+ Holder = (PRMBayesianNodeHolder*)calloc(1, sizeof(PRMBayesianNodeHolder));
+ Holder->Node = Node;
+ if (Model->FirstFlank)
+ {
+ Model->LastFlank->Next = Holder;
+ }
+ else
+ {
+ Model->FirstFlank = Holder;
+ }
+ Model->LastFlank = Holder;
+ }
+ }
+}
+
+// Save a PRMBayesianNode to a binary file. Helper function for SavePRMBayesianModel.
+void SavePRMBayesianNode(PRMBayesianNode* Node, FILE* ModelFile)
+{
+ int ParentIndex;
+ WriteBinary(&Node->Name, sizeof(char), 256, ModelFile);
+ WriteBinary(&Node->Type, sizeof(int), 1, ModelFile);
+ WriteBinary(&Node->Flag, sizeof(int), 1, ModelFile);
+ WriteBinary(&Node->FragmentType, sizeof(int), 1, ModelFile);
+ WriteBinary(&Node->MassOffset, sizeof(int), 1, ModelFile);
+ WriteBinary(&Node->ValueCount, sizeof(int), 1, ModelFile);
+ WriteBinary(&Node->ParentCount, sizeof(int), 1, ModelFile);
+ // Write parent indices:
+ for (ParentIndex = 0; ParentIndex < Node->ParentCount; ParentIndex++)
+ {
+ WriteBinary(&Node->Parents[ParentIndex]->Index, sizeof(int), 1, ModelFile);
+ }
+ WriteBinary(Node->ParentBlocks, sizeof(int), Node->ParentCount, ModelFile);
+ WriteBinary(&Node->TableSize, sizeof(int), 1, ModelFile);
+ WriteBinary(Node->CountTable, sizeof(int), Node->TableSize, ModelFile);
+ WriteBinary(Node->ProbTable, sizeof(float), Node->TableSize, ModelFile);
+}
+
+// Load a PRMBayesianNode from a binary file. Helper function for LoadPRMBayesianModel.
+PRMBayesianNode* LoadPRMBayesianNode(PRMBayesianModel* Model, FILE* ModelFile)
+{
+ PRMBayesianNode* Node;
+ int ParentIndex;
+ int ParentNodeIndex;
+ //
+ Node = (PRMBayesianNode*)calloc(1, sizeof(PRMBayesianNode));
+ ReadBinary(&Node->Name, sizeof(char), 256, ModelFile);
+ ReadBinary(&Node->Type, sizeof(int), 1, ModelFile);
+ ReadBinary(&Node->Flag, sizeof(int), 1, ModelFile);
+ ReadBinary(&Node->FragmentType, sizeof(int), 1, ModelFile);
+ ReadBinary(&Node->MassOffset, sizeof(int), 1, ModelFile);
+ ReadBinary(&Node->ValueCount, sizeof(int), 1, ModelFile);
+ ReadBinary(&Node->ParentCount, sizeof(int), 1, ModelFile);
+ if (Node->ParentCount < 0 || Node->ParentCount > 100)
+ {
+ REPORT_ERROR(0);
+ return NULL;
+ }
+ if (Node->ParentCount)
+ {
+ Node->Parents = (PRMBayesianNode**)calloc(Node->ParentCount, sizeof(PRMBayesianNode*));
+ for (ParentIndex = 0; ParentIndex < Node->ParentCount; ParentIndex++)
+ {
+ ReadBinary(&ParentNodeIndex, sizeof(int), 1, ModelFile);
+ if (ParentNodeIndex < 0 || ParentNodeIndex >= Model->NodeCount)
+ {
+ REPORT_ERROR(0);
+ return NULL;
+ }
+ Node->Parents[ParentIndex] = Model->Nodes[ParentNodeIndex];
+ }
+ Node->ParentBlocks = (int*)calloc(Node->ParentCount, sizeof(int));
+ ReadBinary(Node->ParentBlocks, sizeof(int), Node->ParentCount, ModelFile);
+ }
+ ReadBinary(&Node->TableSize, sizeof(int), 1, ModelFile);
+ if (Node->TableSize <= 0 || Node->TableSize > 10000)
+ {
+ REPORT_ERROR(0);
+ return NULL;
+ }
+ Node->CountTable = (int*)calloc(Node->TableSize, sizeof(int));
+ ReadBinary(Node->CountTable, sizeof(int), Node->TableSize, ModelFile);
+ Node->ProbTable = (float*)calloc(Node->TableSize, sizeof(float));
+ ReadBinary(Node->ProbTable, sizeof(float), Node->TableSize, ModelFile);
+ return Node;
+}
+
+// Save a PRMBayesian model to a binary file. In production, the model
+// is loaded (using LoadPRMBayesianModel) and then used.
+void SavePRMBayesianModel(PRMBayesianModel* Model, char* FileName)
+{
+ FILE* ModelFile;
+ PRMBayesianNode* Node;
+ //
+ if (!Model)
+ {
+ REPORT_ERROR(0);
+ return;
+ }
+ ModelFile = fopen(FileName, "wb");
+ if (!ModelFile)
+ {
+ REPORT_ERROR_S(8, FileName);
+ return;
+ }
+ WriteBinary(&Model->CutFlag, sizeof(int), 1, ModelFile);
+ WriteBinary(&Model->IntensityScheme, sizeof(int), 1, ModelFile);
+ WriteBinary(&Model->MinIntensityLevel, sizeof(int), 1, ModelFile);
+ WriteBinary(&Model->IntensityRadius, sizeof(int), 1, ModelFile);
+    WriteBinary(&Model->NoiseModel, sizeof(int), 1, ModelFile);
+ WriteBinary(Model->RandomIntensityCounts, sizeof(int), 10, ModelFile);
+ WriteBinary(Model->RandomIntensityScores, sizeof(float), 10, ModelFile);
+ WriteBinary(&Model->NodeCount, sizeof(int), 1, ModelFile);
+ for (Node = Model->Head; Node; Node = Node->Next)
+ {
+ SavePRMBayesianNode(Node, ModelFile);
+ }
+ fclose(ModelFile);
+}
+
+// Load a PRMBayesianModel from a binary file.
+PRMBayesianModel* LoadPRMBayesianModel(char* FileName)
+{
+ PRMBayesianModel* Model;
+ FILE* ModelFile;
+ int NodeIndex;
+ PRMBayesianNode* Node;
+
+ //
+ ModelFile = fopen(FileName, "rb");
+ if (!ModelFile)
+ {
+ REPORT_ERROR_S(8, FileName);
+ return NULL;
+ }
+ Model = (PRMBayesianModel*)calloc(1, sizeof(PRMBayesianModel));
+ ReadBinary(&Model->CutFlag, sizeof(int), 1, ModelFile);
+ ReadBinary(&Model->IntensityScheme, sizeof(int), 1, ModelFile);
+ ReadBinary(&Model->MinIntensityLevel, sizeof(int), 1, ModelFile);
+ ReadBinary(&Model->IntensityRadius, sizeof(int), 1, ModelFile);
+ ReadBinary(&Model->NoiseModel, sizeof(int), 1, ModelFile);
+ ReadBinary(Model->RandomIntensityCounts, sizeof(int), 10, ModelFile);
+ ReadBinary(Model->RandomIntensityScores, sizeof(float), 10, ModelFile);
+ ReadBinary(&Model->NodeCount, sizeof(int), 1, ModelFile);
+ Model->Nodes = (PRMBayesianNode**)calloc(Model->NodeCount, sizeof(PRMBayesianNode*));
+ for (NodeIndex = 0; NodeIndex < Model->NodeCount; NodeIndex++)
+ {
+ Node = LoadPRMBayesianNode(Model, ModelFile);
+ Node->Index = NodeIndex;
+ Model->Nodes[NodeIndex] = Node;
+ if (Model->Tail)
+ {
+ Model->Tail->Next = Node;
+ }
+ else
+ {
+ Model->Head = Node;
+ }
+ Model->Tail = Node;
+ }
+ BuildModelFlankList(Model);
+ fclose(ModelFile);
+ return Model;
+}
+
+// Translate the CountTables for this model's nodes into probability tables.
+// We use a "buffer" count for each node to pad out the probabilities; if our training
+// set was small, it may have left ZERO entries in some nodes, and we don't want
+// probabilities of zero (since then we can't take their natural logarithm).
+void ComputePRMBayesianModelProbabilityTables(PRMBayesianModel* Model, int PaddingCount)
+{
+ PRMBayesianNode* Node;
+ int TotalEntries;
+ int TableIndex;
+ float Probability;
+ int Count;
+ int BlockStartIndex;
+ int IntensityLevel;
+ //
+
+ // Set global noise probabilities:
+ Count = 0;
+ for (IntensityLevel = 0; IntensityLevel <= Model->MinIntensityLevel; IntensityLevel++)
+ {
+ Count += (1 + Model->RandomIntensityCounts[IntensityLevel]);
+ }
+ for (IntensityLevel = 0; IntensityLevel <= Model->MinIntensityLevel; IntensityLevel++)
+ {
+ Probability = (1 + Model->RandomIntensityCounts[IntensityLevel]) / (float)Count;
+ Model->RandomIntensityScores[IntensityLevel] = (float)log(Probability);
+ }
+
+ // Set probabilities for each node:
+ for (Node = Model->Head; Node; Node = Node->Next)
+ {
+ // Compute the probability that this node will have a value,
+ // GIVEN the values of any parent nodes:
+ for (BlockStartIndex = 0; BlockStartIndex < Node->TableSize; BlockStartIndex += Node->ValueCount)
+ {
+ TotalEntries = 0;
+ for (TableIndex = BlockStartIndex; TableIndex < BlockStartIndex + Node->ValueCount; TableIndex++)
+ {
+ TotalEntries += Node->CountTable[TableIndex] + PaddingCount;
+ }
+ for (TableIndex = BlockStartIndex; TableIndex < BlockStartIndex + Node->ValueCount; TableIndex++)
+ {
+ if (TableIndex >= Node->TableSize)
+ {
+ REPORT_ERROR(0);
+ }
+ Count = Node->CountTable[TableIndex] + PaddingCount;
+ Probability = Count / (float)TotalEntries;
+ Node->ProbTable[TableIndex] = (float)log(Probability);
+ }
+ }
+ }
+}
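+// Worked example of the padding (illustrative counts): with PaddingCount = 1 and
+// a conditional-probability block whose raw counts are {0, 3}, the smoothed
+// probabilities become 1/5 and 4/5, so the log-probabilities stay finite even
+// for values that were never observed during training.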
+
+// PRMBNGetCutScore returns the score for a cut-point.
+// It's called AFTER setting the Values array for each node, with calls to IonScoringGetNodeValue
+float PRMBNGetCutScore(MSSpectrum* Spectrum, PRMBayesianModel* Model, int AminoIndex)
+{
+ float Score = 0;
+ float NodeScore = 0;
+ int TableIndex;
+ int ParentIndex;
+ PRMBayesianNode* Node;
+ int VerboseFlag = 0;
+ //
+ for (Node = Model->Head; Node; Node = Node->Next)
+ {
+ switch (Node->Type)
+ {
+ case evPRMBPrefix:
+ case evPRMBPrefix2:
+ case evPRMBSuffix:
+ case evPRMBSuffix2:
+ TableIndex = Node->Values[AminoIndex];
+ for (ParentIndex = 0; ParentIndex < Node->ParentCount; ParentIndex++)
+ {
+ TableIndex += Node->Parents[ParentIndex]->Values[AminoIndex] * Node->ParentBlocks[ParentIndex];
+ }
+ if (TableIndex >= Node->TableSize)
+ {
+ REPORT_ERROR(0);
+ }
+ NodeScore = Node->ProbTable[TableIndex];
+ if (Model->NoiseModel)
+ {
+ // GLOBAL noise model, based on all spectra
+ NodeScore -= Model->RandomIntensityScores[Node->Values[AminoIndex]];
+ }
+ else
+ {
+ // SPECTRUM noise model:
+ NodeScore -= Spectrum->IonScoringNoiseProbabilities[Node->Values[AminoIndex]];
+ }
+ if (VerboseFlag)
+ {
+ printf(" AA %d: Node %d (%s) contributes %.3f - %.3f = %.5f\n", AminoIndex, Node->Index, Node->Name,
+ Node->ProbTable[TableIndex], Spectrum->IonScoringNoiseProbabilities[Node->Values[AminoIndex]], NodeScore);
+ }
+ Score += NodeScore;
+ break;
+ default:
+ // Other node-types don't contribute to the score.
+ break;
+ }
+ }
+ return Score;
+}
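+// A sketch of the table indexing above (layout inferred from the code): for a
+// node with 4 intensity values and ParentBlocks of, say, {4, 12}, child value 2
+// with parent values 1 and 0 lands at index 2 + 1*4 + 0*12 = 6 in the flattened
+// conditional-probability table.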
+
+// Compute the score for a PRM using a Bayesian network: sum the log-probabilities
+// over all ion fragment nodes.
+float GetIonPRMFeatures(MSSpectrum* Spectrum, SpectrumTweak* Tweak, PRMBayesianModel* Model, int PRM, int VerboseFlag)
+{
+ PRMBayesianNode* Node;
+ int ParentIndex;
+ int TableIndex;
+ float Score = 0;
+ float NodeScore;
+ //
+ // Compute each node value:
+ for (Node = Model->Head; Node; Node = Node->Next)
+ {
+ Node->Value = IonScoringGetNodeValue(Model, Node, Spectrum, PRM, NULL, -1);
+ if (VerboseFlag)
+ {
+ printf("Score(%.2f): Node %d (%s) has value %d\n", PRM / (float)DALTON, Node->Index, Node->Name, Node->Value);
+ }
+ }
+ // Compute a SCORE for this collection of values:
+ for (Node = Model->Head; Node; Node = Node->Next)
+ {
+ switch (Node->Type)
+ {
+ case evPRMBPrefix:
+ case evPRMBPrefix2:
+ case evPRMBSuffix:
+ case evPRMBSuffix2:
+ TableIndex = Node->Value;
+ for (ParentIndex = 0; ParentIndex < Node->ParentCount; ParentIndex++)
+ {
+ TableIndex += Node->Parents[ParentIndex]->Value * Node->ParentBlocks[ParentIndex];
+ }
+ if (TableIndex >= Node->TableSize)
+ {
+ REPORT_ERROR(0);
+ }
+ NodeScore = Node->ProbTable[TableIndex];
+ if (Model->NoiseModel)
+ {
+ // GLOBAL noise model, based on all spectra
+ NodeScore -= Model->RandomIntensityScores[Node->Value];
+ }
+ else
+ {
+ // SPECTRUM noise model:
+ NodeScore -= Spectrum->IonScoringNoiseProbabilities[Node->Value];
+ }
+ if (VerboseFlag)
+ {
+ printf(" Node %d (%s) contributes %.3f - %.3f = %.5f\n", Node->Index, Node->Name,
+ Node->ProbTable[TableIndex], Spectrum->IonScoringNoiseProbabilities[Node->Value], NodeScore);
+ }
+ Score += NodeScore;
+ break;
+ default:
+ // Other node-types don't contribute to the score.
+ break;
+ }
+ }
+ return Score;
+}
+
+// Iterate over all the nodes in our TagGraph, and assign a score to each.
+void TagGraphScorePRMNodes(PRMBayesianModel* Model, TagGraph* Graph, MSSpectrum* Spectrum, SpectrumTweak* Tweak)
+{
+ TagGraphNode* Node;
+
+ if (!Model)
+ {
+ if (Tweak->Charge < 3)
+ {
+ Model = TAGModelCharge2;
+ }
+ else
+ {
+ Model = TAGModelCharge3;
+ }
+ }
+
+ for (Node = Graph->FirstNode; Node; Node = Node->Next)
+ {
+ if (Node->NodeType != evGraphNodeB && Node->NodeType != evGraphNodeY)
+ {
+ Node->Score = 0;
+ continue;
+ }
+ Node->Score = GetIonPRMFeatures(Spectrum, Tweak, Model, Node->Mass, 0);
+ continue;
+ }
+}
+
+void FreeBayesianModels()
+{
+ FreePRMBayesianModel(PRMModelCharge2);
+ PRMModelCharge2 = NULL;
+ FreePRMBayesianModel(PRMModelCharge3);
+ PRMModelCharge3 = NULL;
+ FreePRMBayesianModel(TAGModelCharge2);
+ TAGModelCharge2 = NULL;
+ FreePRMBayesianModel(TAGModelCharge3);
+ TAGModelCharge3 = NULL;
+ FreePRMBayesianModel(PhosCutModelCharge2);
+ PhosCutModelCharge2 = NULL;
+ FreePRMBayesianModel(PhosCutModelCharge3);
+ PhosCutModelCharge3 = NULL;
+}
+
+// Load PRMBayesianModel objects for scoring PRMs and for scoring tags.
+void InitBayesianModels()
+{
+ char FilePath[2048];
+ // Return immediately, if models are loaded already:
+ if (PRMModelCharge2)
+ {
+ return;
+ }
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PRM2.bn");
+ PRMModelCharge2 = LoadPRMBayesianModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PRM3.bn");
+ PRMModelCharge3 = LoadPRMBayesianModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "TAG2.bn");
+ TAGModelCharge2 = LoadPRMBayesianModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "TAG3.bn");
+ TAGModelCharge3 = LoadPRMBayesianModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PhosCut2.bn");
+ PhosCutModelCharge2 = LoadPRMBayesianModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PhosCut3.bn");
+ PhosCutModelCharge3 = LoadPRMBayesianModel(FilePath);
+}
+
+// Replace a PRMScoring model with one specified in the input file (the "PRMModel" option).
+// Useful for handling new instrument types, etc.
+int ReplacePRMScoringModel(int Charge, char* FileName)
+{
+ PRMBayesianModel* Model;
+ //
+ Model = LoadPRMBayesianModel(FileName);
+ if (!Model)
+ {
+ return 0;
+ }
+ if (Charge == 2)
+ {
+ FreePRMBayesianModel(PRMModelCharge2);
+ PRMModelCharge2 = Model;
+ }
+ else if (Charge == 3)
+ {
+ FreePRMBayesianModel(PRMModelCharge3);
+ PRMModelCharge3 = Model;
+ }
+ else
+ {
+ REPORT_ERROR(0);
+ }
+
+ return 1;
+}
+
+// Replace a tag scoring model with one specified in the input file (the "TAGModel" option).
+// Useful for handling new instrument types, etc.
+int ReplaceTAGScoringModel(int Charge, char* FileName)
+{
+ PRMBayesianModel* Model;
+ //
+ Model = LoadPRMBayesianModel(FileName);
+ if (!Model)
+ {
+ return 0;
+ }
+ if (Charge == 2)
+ {
+ FreePRMBayesianModel(TAGModelCharge2);
+ TAGModelCharge2 = Model;
+ }
+ else if (Charge == 3)
+ {
+ FreePRMBayesianModel(TAGModelCharge3);
+ TAGModelCharge3 = Model;
+ }
+ else
+ {
+ REPORT_ERROR(0);
+ }
+ return 1;
+}
+
+// Set the array Tweak->PRMScores. This is used in unrestrictive ("blind") searches.
+void SetSpectrumPRMScores(MSSpectrum* Spectrum, SpectrumTweak* Tweak)
+{
+ PRMBayesianModel* Model;
+ int PRM;
+ float fScore;
+ //
+ // Ensure models are loaded:
+ if (!PRMModelCharge2)
+ {
+ InitBayesianModels();
+ }
+ Tweak->PRMScoreMax = Tweak->ParentMass;
+ if (Spectrum->Graph)
+ {
+ Tweak->PRMScoreMax = max(Tweak->PRMScoreMax, Spectrum->Graph->LastNode->Mass);
+ }
+ Tweak->PRMScoreMax = PRM_ARRAY_SLACK + (Tweak->PRMScoreMax / PRM_BIN_SIZE);
+ SafeFree(Tweak->PRMScores);
+ Tweak->PRMScores = (int*)calloc(Tweak->PRMScoreMax + 5, sizeof(int)); // extra slack in alloc
+ if (Tweak->Charge > 2)
+ {
+ Model = PRMModelCharge3;
+ }
+ else
+ {
+ Model = PRMModelCharge2;
+ }
+ for (PRM = 0; PRM < Tweak->PRMScoreMax; PRM++)
+ {
+ fScore = GetIonPRMFeatures(Spectrum, Tweak, Model, PRM * PRM_BIN_SIZE, 0);
+ //GetPRMFeatures(Spectrum, Tweak, Model, PRM * PRM_BIN_SIZE, 0);
+ Tweak->PRMScores[PRM] = (int)(fScore * 1000);
+ }
+ //DebugPrintPRMScores(Spectrum, Tweak);
+}
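+
+// Illustrative helper (hypothetical, not part of the original source): reading a
+// score back out of the array filled above, undoing the bin indexing and the
+// *1000 integer scaling used by SetSpectrumPRMScores.
+static float ExampleLookupPRMScore(SpectrumTweak* Tweak, int Mass)
+{
+ if (!Tweak->PRMScores || Mass < 0 || Mass / PRM_BIN_SIZE >= Tweak->PRMScoreMax)
+ {
+ return 0;
+ }
+ return Tweak->PRMScores[Mass / PRM_BIN_SIZE] / 1000.0f;
+}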
+
+int CountTrypticTermini(Peptide* Match)
+{
+ int NTT = 0;
+ int PeptideLength = strlen(Match->Bases);
+ switch (GlobalOptions->DigestType)
+ {
+ case DIGEST_TYPE_TRYPSIN:
+ /////////////////////////////////
+ // Number of tryptic termini:
+ NTT = 0;
+ if (Match->PrefixAmino == '\0' || Match->PrefixAmino == '-' || Match->PrefixAmino == '*')
+ {
+ NTT++;
+ }
+ else if ((Match->PrefixAmino == 'K' || Match->PrefixAmino == 'R') && Match->Bases[0] != 'P')
+ {
+ NTT++;
+ }
+ if (Match->SuffixAmino == '\0' || Match->SuffixAmino == '-' || Match->SuffixAmino == '*')
+ {
+ NTT++;
+ }
+ else if ((Match->Bases[PeptideLength - 1] == 'K' || Match->Bases[PeptideLength - 1] == 'R') && Match->SuffixAmino != 'P')
+ {
+ NTT++;
+ }
+ break;
+ case DIGEST_TYPE_UNKNOWN:
+ default:
+ NTT = 2;
+ break;
+ }
+ return NTT;
+}
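+
+// Worked example (hypothetical peptide, not from the original source) of the
+// trypsin rule above: with PrefixAmino = 'K', Bases = "ACDEFGHR" and
+// SuffixAmino = 'L', the N-terminus counts (prefix is K/R and the first base is
+// not P) and the C-terminus counts (last base is K/R and the suffix is not P),
+// so NTT = 2 (fully tryptic); a peptide matching the rule at only one boundary
+// would get NTT = 1.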
+
+void PopulateCutScores(PRMBayesianModel* Model, MSSpectrum* Spectrum, Peptide* Match, float* CutScores)
+{
+ int PRM = 0;
+ int NodeIndex;
+ PRMBayesianNode* Node;
+ int AminoIndex;
+ int ModIndex;
+ int PeptideLength = strlen(Match->Bases);
+ int PeakIndex;
+
+ // Reset all peak annotations:
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ Spectrum->Peaks[PeakIndex].IonType = evFragmentTypeNone;
+ }
+ if (Match->SpecialFragmentation)
+ { // phosphorylated spectra
+ AnnotateParentPeaks(Spectrum, Match, Model);
+ }
+
+ for (NodeIndex = 0, Node = Model->Head; Node; NodeIndex++, Node = Node->Next)
+ {
+ PRM = 0;
+ for (AminoIndex = 0; AminoIndex <= PeptideLength; AminoIndex++)
+ {
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // Set values, and accumulate table entries:
+ Node->Values[AminoIndex] = IonScoringGetNodeValue(Model, Node, Spectrum, PRM, Match, AminoIndex);
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // Add to PRM:
+ if (AminoIndex == PeptideLength)
+ {
+ break;
+ }
+ PRM += PeptideMass[Match->Bases[AminoIndex]];
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Match->AminoIndex[ModIndex] == AminoIndex)
+ {
+ PRM += Match->ModType[ModIndex]->RealDelta;
+ }
+ }
+ } // Amino loop
+ } // NodeIndex loop
+
+ // Populate the CutScores array:
+ for (AminoIndex = 0; AminoIndex <= PeptideLength; AminoIndex++)
+ {
+ CutScores[AminoIndex] = PRMBNGetCutScore(Spectrum, Model, AminoIndex);
+ }
+}
+
+// Compute MQScore features, in preparation for MQScore calculation
+int ComputeMQScoreFeatures(MSSpectrum* Spectrum, Peptide* Match, float* MQFeatures, int VerboseFlag)
+{
+ int FeatureIndex = 0;
+ PRMBayesianModel* Model;
+ int PeptideLength;
+ float CutScores[256];
+ int PRM = 0;
+ int AminoIndex;
+ int PRMCount;
+ float ScoreTotal;
+ int YFlag[256];
+ int BFlag[256];
+ int PeakIndex;
+ int PresentCount;
+ int FragmentType;
+ float PeakIntensity;
+ float TotalIntensity;
+ float IntensityY = 0;
+ float IntensityYSeries = 0;
+ float IntensityB = 0;
+ float IntensityBSeries = 0;
+ //
+ Spectrum->ParentMass = Match->ParentMass;
+ Model = GetScoringModel(Match, Spectrum->Charge);
+ PeptideLength = strlen(Match->Bases);
+ // If the peptide is very short (length 5 or less), we may not even want to bother
+ // computing features. Peptides that short are not informative!
+
+ MQFeatures[FeatureIndex++] = (float)PeptideLength; // #2
+
+ ///////////////////////////////////////
+ // Cut score features (5, 11):
+ PopulateCutScores(Model, Spectrum, Match, CutScores);
+
+ // Total/mean for CENTRAL cut scores:
+ ScoreTotal = 0;
+ PRMCount = 0;
+ for (AminoIndex = 1; AminoIndex < PeptideLength; AminoIndex++)
+ {
+ ScoreTotal += CutScores[AminoIndex];
+ PRMCount++;
+ if (VerboseFlag)
+ {
+ printf(" Cut score %d: %.2f\n", AminoIndex, CutScores[AminoIndex]);
+ }
+ }
+ MQFeatures[FeatureIndex++] = ScoreTotal; // #5
+
+ // Median cut score:
+ PRMCount = PeptideLength + 1;
+ MQFeatures[FeatureIndex++] = GetMedian(CutScores, PRMCount); // #11
+
+ // Count b and y peak presence:
+ memset(BFlag, 0, sizeof(int) * (PeptideLength + 1));
+ memset(YFlag, 0, sizeof(int) * (PeptideLength + 1));
+ TotalIntensity = 0;
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ FragmentType = Spectrum->Peaks[PeakIndex].IonType;
+ PeakIntensity = Spectrum->Peaks[PeakIndex].Intensity;
+ if (FragmentType == evParentLoss)
+ {
+ // Don't let parent-loss peaks count against phosphorylated spectra;
+ // those peaks are typically very strong.
+ PeakIntensity = 0;
+ }
+ TotalIntensity += PeakIntensity;
+ switch (FragmentType)
+ {
+ case evFragmentY:
+ IntensityY += PeakIntensity;
+ IntensityYSeries += PeakIntensity;
+ YFlag[Spectrum->Peaks[PeakIndex].AminoIndex] = 1;
+ break;
+ case evFragmentYLoss:
+ IntensityYSeries += PeakIntensity;
+ break;
+ case evFragmentB:
+ IntensityB += PeakIntensity;
+ IntensityBSeries += PeakIntensity;
+ BFlag[Spectrum->Peaks[PeakIndex].AminoIndex] = 1;
+ break;
+ case evFragmentBLoss:
+ IntensityBSeries += PeakIntensity;
+ break;
+ }
+ }
+ // Fraction of B, Y present:
+ PresentCount = 0;
+ for (AminoIndex = 0; AminoIndex <= PeptideLength; AminoIndex++)
+ {
+ PresentCount += YFlag[AminoIndex];
+ }
+ MQFeatures[FeatureIndex++] = PresentCount / (float)(PeptideLength + 1); // #12
+ PresentCount = 0;
+ for (AminoIndex = 0; AminoIndex <= PeptideLength; AminoIndex++)
+ {
+ PresentCount += BFlag[AminoIndex];
+ }
+ MQFeatures[FeatureIndex++] = PresentCount / (float)(PeptideLength + 1); // #13
+
+ // Fraction of total intensity in B and Y series:
+ MQFeatures[FeatureIndex++] = (IntensityY + IntensityB) / TotalIntensity; // #25
+
+ MQFeatures[FeatureIndex++] = (float)CountTrypticTermini(Match); // #30
+ return FeatureIndex;
+}
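+
+// Summary note (derived from the feature tags above): this variant fills seven
+// features, in order: peptide length (#2), total central cut score (#5), median
+// cut score (#11), fraction of y peaks present (#12), fraction of b peaks
+// present (#13), fraction of total intensity in b and y peaks (#25), and number
+// of tryptic termini (#30); the return value is the number of features written.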
+
+// This is currently only called for phosphorylated spectra; we claim the peaks
+// corresponding to parent-minus-phosphate and parent-minus-phosphate-minus-water.
+// The logic resembles IonScoringGetPeakIntensity, but because these peaks do not
+// correspond to an AminoIndex yet still require peak claiming, the claiming code
+// is rewritten here.
+void AnnotateParentPeaks(MSSpectrum* Spectrum, Peptide* Match, PRMBayesianModel* Model)
+{
+ int Loss;
+ // The loss of phosphate from the precursor is actually 98 Daltons (80 + 18).
+ int PMMinusPhosphate;
+ // Actually phosphate plus two waters (80 + 18 + 18), i.e. 116 Daltons.
+ int PMMinusPhosphateAndWater;
+ //
+ // Set m/z according to the current parent mass of the spectrum:
+ Spectrum->MZ = (Spectrum->ParentMass + (Spectrum->Charge - 1) * HYDROGEN_MASS) / Spectrum->Charge;
+ Loss = PHOSPHATE_WATER_MASS / Spectrum->Charge;
+ PMMinusPhosphate = Spectrum->MZ - Loss;
+ Loss = (PHOSPHATE_WATER_MASS + WATER_MASS)/Spectrum->Charge;
+ PMMinusPhosphateAndWater = Spectrum->MZ - Loss;
+ ClaimParentPeak(Spectrum, Match, PMMinusPhosphate, Model);
+ ClaimParentPeak(Spectrum, Match, PMMinusPhosphateAndWater, Model);
+ // Now look for +1 isotopes
+ Loss = PHOSPHATE_WATER_MASS / Spectrum->Charge;
+ PMMinusPhosphate = Spectrum->MZ - Loss + (HYDROGEN_MASS/Spectrum->Charge);
+ Loss = (PHOSPHATE_WATER_MASS + WATER_MASS) / Spectrum->Charge;
+ PMMinusPhosphateAndWater = Spectrum->MZ - Loss + (HYDROGEN_MASS / Spectrum->Charge);
+ ClaimParentPeak(Spectrum, Match, PMMinusPhosphate, Model);
+ ClaimParentPeak(Spectrum, Match, PMMinusPhosphateAndWater, Model);
+}
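+
+// Worked example (assumed values, not from the original source; masses are taken
+// to be integer milli-Daltons, as in the scaled masses used elsewhere here):
+// for a hypothetical phosphopeptide with ParentMass = 2000000 (2000.0 Da) at
+// Charge = 2, MZ is roughly 1000500 (m/z ~1000.5), so the claimed peaks sit
+// near MZ - 98000/2 = ~951500 (m/z ~951.5) and MZ - 116000/2 = ~942500
+// (m/z ~942.5), with the +1 isotope peaks about 0.5 m/z higher.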
+
+void ClaimParentPeak(MSSpectrum* Spectrum, Peptide* Match, int Mass, PRMBayesianModel* Model)
+{
+ int Bin;
+ int MinMass;
+ int MaxMass;
+ float Intensity = 0;
+ int PeakIndex;
+ int Skew;
+ float Multiplier;
+ //
+ Bin = (Mass + 50) / 100; // Bin width 0.1Da
+ MinMass = Mass - Model->IntensityRadius;
+ MaxMass = Mass + Model->IntensityRadius;
+
+ // If the mass is off the scale, then you get no peaks:
+ if (Bin >= Spectrum->IntensityBinCount || Bin < 0)
+ {
+ return;
+ }
+
+ PeakIndex = Spectrum->BinPeakIndex[Bin];
+ if (PeakIndex >= 0)
+ {
+ for ( ; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ if (Spectrum->Peaks[PeakIndex].Mass > MaxMass)
+ {
+ break;
+ }
+ if (Spectrum->Peaks[PeakIndex].Mass < MinMass)
+ {
+ continue;
+ }
+
+ Multiplier = 1.0; // default
+ Skew = abs(Mass - Spectrum->Peaks[PeakIndex].Mass);
+ if (Model->IntensityScheme == 1 || Model->IntensityScheme == 3)
+ {
+ if (Skew >= Model->HalfIntensityRadius)
+ {
+ Multiplier = 0.5;
+ }
+ }
+ if (Spectrum->Peaks[PeakIndex].IonType)
+ {
+ // This peak has already been CLAIMED by another ion type:
+ continue;
+ }
+ Intensity += Spectrum->Peaks[PeakIndex].Intensity * Multiplier;
+ // CLAIM this peak:
+ Spectrum->Peaks[PeakIndex].IonType = evParentLoss;
+ Spectrum->Peaks[PeakIndex].AminoIndex = -1; // Not an amino index; is this a problem?
+ }
+ }
+}
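+
+// Worked note (same milli-Dalton assumption as above): a peak near 951.5 Da is
+// stored as Mass = 951500, so Bin = (951500 + 50) / 100 = 9515, consistent with
+// the 0.1 Da bin width noted in the code above.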
+
+PRMBayesianModel* GetScoringModel(Peptide* Match, int Charge)
+{
+ int ModIndex;
+ //
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (!Match->ModType[ModIndex])
+ {
+ break;
+ }
+ if (Match->ModType[ModIndex]->Flags & DELTA_FLAG_PHOSPHORYLATION)
+ {
+ Match->SpecialFragmentation = FRAGMENTATION_PHOSPHO;
+ Match->SpecialModPosition = Match->AminoIndex[ModIndex];
+ break;
+ }
+ }
+ if (Match->SpecialFragmentation)
+ {
+ if (Charge > 2)
+ {
+ return PhosCutModelCharge3;
+ }
+ return PhosCutModelCharge2;
+ }
+ if (Charge > 2)
+ {
+ return TAGModelCharge3;
+ }
+ return TAGModelCharge2; //default
+}
+
+char* GetFragmentTypeName(int FragmentType)
+{
+ switch (FragmentType)
+ {
+ case evFragmentY:
+ return "Y";
+ case evFragmentB:
+ return "B";
+ case evFragmentYLoss:
+ return "Y loss";
+ case evFragmentBLoss:
+ return "B loss";
+ case evParentLoss:
+ return "Parent loss";
+ case evFragmentTypeNone:
+ default:
+ return "";
+ }
+}
+
diff --git a/IonScoring.h b/IonScoring.h
new file mode 100644
index 0000000..0e492c2
--- /dev/null
+++ b/IonScoring.h
@@ -0,0 +1,195 @@
+//Title: IonScoring.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef ION_SCORING_H
+#define ION_SCORING_H
+
+
+
+// New code to support scoring of PRMs, and of cuts.
+// Key ideas:
+// - Feature meta-data is read from the Bayesian network. New network topologies,
+// and even new ion types, can be used without the need to recompile code.
+// - Ion types and network topologies will be learned in an objective, repeatable way.
+#include "Spectrum.h"
+#include "Tagger.h"
+#include "Trie.h"
+
+#define UNKNOWN_AMINO 'Z'
+
+typedef enum FragmentTypes
+{
+ evFragmentTypeNone = 0,
+ evFragmentY,
+ evFragmentB,
+ evFragmentYLoss,
+ evFragmentBLoss,
+ evParentLoss // Used for parent-phosphate loss; not really a fragment, but used to claim the peak.
+} FragmentTypes;
+
+typedef enum PRMBayesianNodeType
+{
+ evPRMBInvalid = 0,
+ evPRMBPrefix,
+ evPRMBSuffix,
+ evPRMBPrefix2,
+ evPRMBSuffix2,
+ evSector,
+ evFlank,
+ evPrefixAA,
+ evSuffixAA,
+ evPrefixContain,
+ evSuffixContain,
+ evPrefixContainPhos,
+ evSuffixContainPhos
+} PRMBayesianNodeType;
+
+// A node in a Bayesian Network for scoring PRMs or cuts:
+typedef struct PRMBayesianNode
+{
+ // Index of the node in the Bayesian network (0, 1, etc.). By convention,
+ // parents will always have lower indices than their children.
+ int Index;
+ PRMBayesianNodeType Type;
+ // For ion type nodes:
+ int MassOffset;
+ int FragmentType; // from the FragmentTypes enum
+ // The Flag on a BayesianNode of special type is
+ // used to control how the node's values are computed.
+ // Examples: For type evFlank, the flag tells us whether
+ // we're looking for flanking aminos that affect b fragments,
+ // or y fragments.
+ int Flag;
+ struct PRMBayesianNode* Next;
+ // ValueCount is the number of distinct values this node can take on (2 or more).
+ // ValueCount is determined by our Type and Flag, and possibly by the network's
+ // intensity scheme.
+ int ValueCount;
+ int ParentCount;
+ struct PRMBayesianNode** Parents;
+ // ParentBlocks[n] is the multiplier for parent n's value when indexing into
+ // the CountTable/ProbTable arrays. For instance, if we have 4 possible values
+ // and one parent, then ParentBlocks[0] will be 4, and the index of a table entry
+ // is ParentValue*4 + ChildValue.
+ int* ParentBlocks;
+ // Size of CountTable and ProbTable:
+ int TableSize;
+ // Table counting the number of occurrences of a given value combination:
+ int* CountTable;
+ // Table giving natural logarithm of the probability of a given value combination:
+ float* ProbTable;
+ // Value is transiently set while scoring a PRM or cut point:
+ int Value;
+ // An array of values for cut points:
+ int Values[256];
+ // Human-readable name of the node, mostly for debugging:
+ char Name[256 + 1];
+ // Flag to indicate whether this node, or an immediate parent, requires knowledge of
+ // flanking amino acids. If this flag is set, then during tagging, we will delay
+ // full scoring of this node until tag construction.
+ int FlankFlag;
+} PRMBayesianNode;
+
+typedef struct PRMBayesianNodeHolder
+{
+ PRMBayesianNode* Node;
+ struct PRMBayesianNodeHolder* Next;
+} PRMBayesianNodeHolder;
+
+typedef struct PRMBayesianModel
+{
+ PRMBayesianNode* Head;
+ PRMBayesianNode* Tail;
+ // Array of the nodes, for quickly looking them up by index:
+ PRMBayesianNode** Nodes;
+ int NodeCount;
+ // Scheme for assigning intensity-levels to ion nodes:
+ int IntensityScheme;
+ // 0 is spectrum-specific, 1 is global
+ int NoiseModel;
+ // Radius (in daltons) of the window over which to sum intensities when
+ // finding peaks:
+ int IntensityRadius;
+ int HalfIntensityRadius;
+ // CutFlag is true if this model is used to score cut points. A few operations
+ // differ; in particular, we seize intensities for b and y peaks first, THEN consider
+ // neutral losses.
+ int CutFlag;
+ // Intensity levels are sorted from highest (0) to lowest (MinIntensityLevel).
+ int MinIntensityLevel;
+ // RandomIntensityCounts and RandomIntensityScores track how often a *random* mass
+ // has a particular intensity level. We'll try using a spectrum-specific noise
+ // model as well as this "global" noise model.
+ int RandomIntensityCounts[10];
+ float RandomIntensityScores[10];
+ // Linked list of nodes which require flanking amino acid information (either directly,
+ // or via parents):
+ PRMBayesianNodeHolder* FirstFlank;
+ PRMBayesianNodeHolder* LastFlank;
+} PRMBayesianModel;
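+
+// Worked note (not in the original header; "NoiseLogProb" is just a placeholder
+// name): since ProbTable holds natural-log probabilities, each ion node in the
+// scoring loop of IonScoring.c contributes a log-likelihood ratio,
+//   NodeScore = ProbTable[TableIndex] - NoiseLogProb[Node->Value]
+// where NoiseLogProb is RandomIntensityScores when NoiseModel is 1 (global) and
+// the spectrum's IonScoringNoiseProbabilities when NoiseModel is 0.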
+
+void AddPRMBayesianNode(PRMBayesianModel* Model, char* Name, int NodeType, int NodeFlag, float NodeMassOffset, int FragmentType);
+void FreePRMBayesianModel(PRMBayesianModel* Model);
+void FreePRMBayesianNode(PRMBayesianNode* Node);
+void PrepareSpectrumForIonScoring(PRMBayesianModel* Model, MSSpectrum* Spectrum, int ForceRefresh);
+int IonScoringGetNodeValue(PRMBayesianModel* Model, PRMBayesianNode* Node, MSSpectrum* Spectrum, int PRM,
+ Peptide* Match, int AminoIndex);
+void ComputePRMBayesianModelProbabilityTables(PRMBayesianModel* Model, int PaddingCount);
+void SavePRMBayesianModel(PRMBayesianModel* Model, char* FileName);
+PRMBayesianModel* LoadPRMBayesianModel(char* FileName);
+void DebugPrintPRMBayesianModel(PRMBayesianModel* Model);
+void TagGraphScorePRMNodes(PRMBayesianModel* Model, struct TagGraph* Graph, MSSpectrum* Spectrum, SpectrumTweak* Tweak);
+float GetIonPRMFeatures(MSSpectrum* Spectrum, SpectrumTweak* Tweak, PRMBayesianModel* Model, int PRM, int VerboseFlag);
+void BuildModelFlankList(PRMBayesianModel* Model);
+void LoadFlankingAminoEffects();
+int IonScoringGetFlank(PRMBayesianNode* Node, char Left, char Right);
+float PRMBNGetCutScore(MSSpectrum* Spectrum, PRMBayesianModel* Model, int AminoIndex);
+void InitBayesianModels();
+int ReplacePRMScoringModel(int Charge, char* FileName);
+int ReplaceTAGScoringModel(int Charge, char* FileName);
+void SetSpectrumPRMScores(MSSpectrum* Spectrum, SpectrumTweak* Tweak);
+void PopulateCutScores(PRMBayesianModel* Model, MSSpectrum* Spectrum, Peptide* Match, float* CutScores);
+int CountTrypticTermini(Peptide* Match);
+int ComputeMQScoreFeatures(MSSpectrum* Spectrum, Peptide* Match, float* MQFeatures, int VerboseFlag);
+char* GetFragmentTypeName(int FragmentType);
+void FreeBayesianModels();
+
+extern PRMBayesianModel* PRMModelCharge2;
+extern PRMBayesianModel* PRMModelCharge3;
+extern PRMBayesianModel* TAGModelCharge2;
+extern PRMBayesianModel* TAGModelCharge3;
+extern PRMBayesianModel* PhosCutModelCharge2;
+extern PRMBayesianModel* PhosCutModelCharge3;
+
+#endif // ION_SCORING_H
+
+
diff --git a/IsotopePatterns.txt b/IsotopePatterns.txt
new file mode 100644
index 0000000..95489fe
--- /dev/null
+++ b/IsotopePatterns.txt
@@ -0,0 +1,1750 @@
+0 0.0
+1 0.0
+2 0.0
+3 0.0
+4 0.0
+5 0.0
+6 0.0
+7 0.0
+8 0.0
+9 0.0
+10 0.0
+11 0.0
+12 0.0
+13 0.0
+14 0.0
+15 0.0
+16 0.0
+17 0.0
+18 0.0
+19 0.0
+20 0.0
+21 0.0
+22 0.0
+23 0.0
+24 0.0
+25 0.0
+26 0.0
+27 0.0
+28 0.0
+29 0.0
+30 0.0
+31 0.0
+32 0.0
+33 0.0
+34 0.0223878333148
+35 0.0223878333148
+36 0.0223878333148
+37 0.0223878333148
+38 0.0226878783215
+39 0.0226878783215
+40 0.0226878783215
+41 0.0226878783215
+42 0.0226878783215
+43 0.0226878783215
+44 0.0226878783215
+45 0.0226878783215
+46 0.0226878783215
+47 0.0226878783215
+48 0.0226878783215
+49 0.0226878783215
+50 0.0226878783215
+51 0.0226878783215
+52 0.0226878783215
+53 0.0228379008249
+54 0.0228379008249
+55 0.0228379008249
+56 0.0340318174823
+57 0.0340318174823
+58 0.0340318174823
+59 0.0340318174823
+60 0.0340318174823
+61 0.0340318174823
+62 0.0340318174823
+63 0.0340318174823
+64 0.0340318174823
+65 0.0340318174823
+66 0.0340318174823
+67 0.0340318174823
+68 0.0341818399857
+69 0.0341818399857
+70 0.0341818399857
+71 0.0341818399857
+72 0.0341818399857
+73 0.0341818399857
+74 0.0341818399857
+75 0.0341818399857
+76 0.0341818399857
+77 0.0341818399857
+78 0.0453757566431
+79 0.0453757566431
+80 0.0453757566431
+81 0.0453757566431
+82 0.0453757566431
+83 0.0455257791465
+84 0.0455257791465
+85 0.0455257791465
+86 0.0455257791465
+87 0.0455257791465
+88 0.0455257791465
+89 0.0455257791465
+90 0.0455257791465
+91 0.0455257791465
+92 0.0455257791465
+93 0.0455257791465
+94 0.0455257791465
+95 0.0455257791465
+96 0.0455257791465
+97 0.0455257791465
+98 0.0456758016498
+99 0.0456758016498
+100 0.0568697183072
+101 0.0568697183072
+102 0.0568697183072
+103 0.0568697183072
+104 0.0568697183072
+105 0.0568697183072
+106 0.0568697183072
+107 0.0568697183072
+108 0.0568697183072
+109 0.0568697183072
+110 0.0568697183072
+111 0.0568697183072
+112 0.0568697183072
+113 0.0570197408106
+114 0.0570197408106
+115 0.0570197408106
+116 0.0570197408106
+117 0.0570197408106
+118 0.0570197408106
+119 0.0570197408106
+120 0.0570197408106
+121 0.0570197408106
+122 0.0570197408106
+123 0.068213657468
+124 0.068213657468
+125 0.068213657468
+126 0.068213657468
+127 0.068213657468
+128 0.0683636799714
+129 0.0683636799714
+130 0.0683636799714
+131 0.0683636799714
+132 0.0683636799714
+133 0.0683636799714
+134 0.0683636799714
+135 0.0683636799714
+136 0.0683636799714
+137 0.0683636799714
+138 0.0683636799714
+139 0.0683636799714
+140 0.0683636799714
+141 0.0683636799714
+142 0.0683636799714
+143 0.0685137024748
+144 0.0685137024748
+145 0.0797076191322
+146 0.0797076191322
+147 0.0797076191322
+148 0.0797076191322
+149 0.0797076191322
+150 0.0797076191322
+151 0.0797076191322
+152 0.0797076191322
+153 0.0797076191322
+154 0.0797076191322
+155 0.0797076191322
+156 0.0797076191322
+157 0.0797076191322
+158 0.0798576416355
+159 0.0798576416355
+160 0.0798576416355
+161 0.0798576416355
+162 0.0798576416355
+163 0.0798576416355
+164 0.0798576416355
+165 0.0798576416355
+166 0.0798576416355
+167 0.0984044920894
+168 0.0984044920894
+169 0.0984044920894
+170 0.0984044920894
+171 0.0984044920894
+172 0.0984044920894
+173 0.0985545145928
+174 0.0985545145928
+175 0.0985545145928
+176 0.0985545145928
+177 0.0985545145928
+178 0.0985545145928
+179 0.0985545145928
+180 0.0985545145928
+181 0.0985545145928
+182 0.0985545145928
+183 0.0985545145928
+184 0.0985545145928
+185 0.0985545145928
+186 0.0985545145928
+187 0.0985545145928
+188 0.0987045370962
+189 0.109898453754
+190 0.109898453754
+191 0.109898453754
+192 0.109898453754
+193 0.109898453754
+194 0.109898453754
+195 0.109898453754
+196 0.109898453754
+197 0.109898453754
+198 0.109898453754
+199 0.109898453754
+200 0.109898453754
+201 0.109898453754
+202 0.109898453754
+203 0.110048476257
+204 0.110048476257
+205 0.110048476257
+206 0.110048476257
+207 0.110048476257
+208 0.110048476257
+209 0.110048476257
+210 0.110048476257
+211 0.110048476257
+212 0.121242392914
+213 0.121242392914
+214 0.121242392914
+215 0.121242392914
+216 0.121242392914
+217 0.121242392914
+218 0.121392415418
+219 0.121392415418
+220 0.121392415418
+221 0.121392415418
+222 0.121392415418
+223 0.121392415418
+224 0.121392415418
+225 0.121392415418
+226 0.121392415418
+227 0.121392415418
+228 0.121392415418
+229 0.121392415418
+230 0.121392415418
+231 0.121392415418
+232 0.121392415418
+233 0.121542437921
+234 0.132736354579
+235 0.132736354579
+236 0.132736354579
+237 0.132736354579
+238 0.132736354579
+239 0.132736354579
+240 0.132736354579
+241 0.132736354579
+242 0.132736354579
+243 0.132736354579
+244 0.132736354579
+245 0.132736354579
+246 0.132736354579
+247 0.132736354579
+248 0.132886377082
+249 0.13656284398
+250 0.13656284398
+251 0.13656284398
+252 0.13656284398
+253 0.13656284398
+254 0.13656284398
+255 0.13656284398
+256 0.147756760638
+257 0.147756760638
+258 0.147756760638
+259 0.147756760638
+260 0.147756760638
+261 0.147756760638
+262 0.147756760638
+263 0.147906783141
+264 0.147906783141
+265 0.147906783141
+266 0.147906783141
+267 0.147906783141
+268 0.147906783141
+269 0.147906783141
+270 0.147906783141
+271 0.147906783141
+272 0.147906783141
+273 0.147906783141
+274 0.147906783141
+275 0.147906783141
+276 0.147906783141
+277 0.147906783141
+278 0.159250722302
+279 0.159250722302
+280 0.159250722302
+281 0.159250722302
+282 0.159250722302
+283 0.159250722302
+284 0.159250722302
+285 0.159250722302
+286 0.159250722302
+287 0.159250722302
+288 0.159250722302
+289 0.159250722302
+290 0.159250722302
+291 0.159250722302
+292 0.159250722302
+293 0.159400744805
+294 0.159400744805
+295 0.159400744805
+296 0.159400744805
+297 0.159400744805
+298 0.159400744805
+299 0.159400744805
+300 0.170594661462
+301 0.170594661462
+302 0.170594661462
+303 0.170594661462
+304 0.170594661462
+305 0.170594661462
+306 0.170594661462
+307 0.170594661462
+308 0.170744683966
+309 0.170744683966
+310 0.170744683966
+311 0.170744683966
+312 0.170744683966
+313 0.170744683966
+314 0.170744683966
+315 0.170744683966
+316 0.170744683966
+317 0.170744683966
+318 0.170744683966
+319 0.170744683966
+320 0.170744683966
+321 0.170744683966
+322 0.170744683966
+323 0.182088623127
+324 0.182088623127
+325 0.182088623127
+326 0.182088623127
+327 0.182088623127
+328 0.182088623127
+329 0.182088623127
+330 0.182088623127
+331 0.185765090025
+332 0.185765090025
+333 0.185765090025
+334 0.185765090025
+335 0.185765090025
+336 0.185765090025
+337 0.185765090025
+338 0.185915112528
+339 0.185915112528
+340 0.185915112528
+341 0.185915112528
+342 0.185915112528
+343 0.185915112528
+344 0.185915112528
+345 0.197109029186
+346 0.197109029186
+347 0.197109029186
+348 0.197109029186
+349 0.197109029186
+350 0.197109029186
+351 0.197109029186
+352 0.197109029186
+353 0.197259051689
+354 0.197259051689
+355 0.197259051689
+356 0.197259051689
+357 0.197259051689
+358 0.197259051689
+359 0.197259051689
+360 0.197259051689
+361 0.197259051689
+362 0.197259051689
+363 0.197259051689
+364 0.197259051689
+365 0.197259051689
+366 0.197259051689
+367 0.208452968346
+368 0.20860299085
+369 0.20860299085
+370 0.20860299085
+371 0.20860299085
+372 0.20860299085
+373 0.20860299085
+374 0.20860299085
+375 0.20860299085
+376 0.20860299085
+377 0.20860299085
+378 0.20860299085
+379 0.20860299085
+380 0.20860299085
+381 0.20860299085
+382 0.20860299085
+383 0.208753013353
+384 0.208753013353
+385 0.208753013353
+386 0.208753013353
+387 0.208753013353
+388 0.208753013353
+389 0.219946930011
+390 0.219946930011
+391 0.219946930011
+392 0.219946930011
+393 0.219946930011
+394 0.219946930011
+395 0.219946930011
+396 0.219946930011
+397 0.219946930011
+398 0.220096952514
+399 0.220096952514
+400 0.220096952514
+401 0.220096952514
+402 0.220096952514
+403 0.220096952514
+404 0.220096952514
+405 0.220096952514
+406 0.220096952514
+407 0.220096952514
+408 0.220096952514
+409 0.220096952514
+410 0.220096952514
+411 0.220096952514
+412 0.231290869171
+413 0.235117358573
+414 0.235117358573
+415 0.235117358573
+416 0.235117358573
+417 0.235117358573
+418 0.235117358573
+419 0.235117358573
+420 0.235117358573
+421 0.235117358573
+422 0.235117358573
+423 0.235117358573
+424 0.235117358573
+425 0.235117358573
+426 0.235117358573
+427 0.235117358573
+428 0.235267381076
+429 0.235267381076
+430 0.235267381076
+431 0.235267381076
+432 0.235267381076
+433 0.235267381076
+434 0.246461297734
+435 0.246461297734
+436 0.246461297734
+437 0.246461297734
+438 0.246461297734
+439 0.246461297734
+440 0.246461297734
+441 0.246461297734
+442 0.246461297734
+443 0.246611320237
+444 0.246611320237
+445 0.246611320237
+446 0.246611320237
+447 0.246611320237
+448 0.246611320237
+449 0.246611320237
+450 0.246611320237
+451 0.246611320237
+452 0.246611320237
+453 0.246611320237
+454 0.246611320237
+455 0.246611320237
+456 0.257805236894
+457 0.257805236894
+458 0.257955259398
+459 0.257955259398
+460 0.257955259398
+461 0.257955259398
+462 0.257955259398
+463 0.257955259398
+464 0.257955259398
+465 0.257955259398
+466 0.257955259398
+467 0.257955259398
+468 0.257955259398
+469 0.257955259398
+470 0.257955259398
+471 0.257955259398
+472 0.257955259398
+473 0.258105281901
+474 0.258105281901
+475 0.258105281901
+476 0.258105281901
+477 0.258105281901
+478 0.269299198559
+479 0.269299198559
+480 0.269299198559
+481 0.269299198559
+482 0.269299198559
+483 0.269299198559
+484 0.269299198559
+485 0.269299198559
+486 0.269299198559
+487 0.269299198559
+488 0.269449221062
+489 0.269449221062
+490 0.269449221062
+491 0.269449221062
+492 0.269449221062
+493 0.269449221062
+494 0.269449221062
+495 0.27312568796
+496 0.27312568796
+497 0.27312568796
+498 0.27312568796
+499 0.27312568796
+500 0.284319604618
+501 0.284319604618
+502 0.284319604618
+503 0.284319604618
+504 0.284469627121
+505 0.284469627121
+506 0.284469627121
+507 0.284469627121
+508 0.284469627121
+509 0.284469627121
+510 0.284469627121
+511 0.284469627121
+512 0.284469627121
+513 0.284469627121
+514 0.284469627121
+515 0.284469627121
+516 0.284469627121
+517 0.284469627121
+518 0.284469627121
+519 0.284619649624
+520 0.284619649624
+521 0.284619649624
+522 0.284619649624
+523 0.295813566282
+524 0.295813566282
+525 0.295813566282
+526 0.295813566282
+527 0.295813566282
+528 0.295813566282
+529 0.295813566282
+530 0.295813566282
+531 0.295813566282
+532 0.295813566282
+533 0.295813566282
+534 0.295963588785
+535 0.295963588785
+536 0.295963588785
+537 0.295963588785
+538 0.295963588785
+539 0.295963588785
+540 0.295963588785
+541 0.295963588785
+542 0.295963588785
+543 0.295963588785
+544 0.295963588785
+545 0.307157505443
+546 0.307157505443
+547 0.307157505443
+548 0.307157505443
+549 0.307307527946
+550 0.307307527946
+551 0.307307527946
+552 0.307307527946
+553 0.307307527946
+554 0.307307527946
+555 0.307307527946
+556 0.307307527946
+557 0.307307527946
+558 0.307307527946
+559 0.307307527946
+560 0.307307527946
+561 0.307307527946
+562 0.307307527946
+563 0.307307527946
+564 0.307457550449
+565 0.307457550449
+566 0.307457550449
+567 0.318651467107
+568 0.318651467107
+569 0.318651467107
+570 0.318651467107
+571 0.318651467107
+572 0.318651467107
+573 0.318651467107
+574 0.318651467107
+575 0.318651467107
+576 0.318651467107
+577 0.322327934005
+578 0.322327934005
+579 0.322477956508
+580 0.322477956508
+581 0.322477956508
+582 0.322477956508
+583 0.322477956508
+584 0.322477956508
+585 0.322477956508
+586 0.322477956508
+587 0.322477956508
+588 0.322477956508
+589 0.333671873166
+590 0.333671873166
+591 0.333671873166
+592 0.333671873166
+593 0.333671873166
+594 0.333821895669
+595 0.333821895669
+596 0.333821895669
+597 0.333821895669
+598 0.333821895669
+599 0.333821895669
+600 0.333821895669
+601 0.333821895669
+602 0.333821895669
+603 0.333821895669
+604 0.333821895669
+605 0.333821895669
+606 0.333821895669
+607 0.333821895669
+608 0.333821895669
+609 0.333971918172
+610 0.333971918172
+611 0.333971918172
+612 0.34516583483
+613 0.34516583483
+614 0.34516583483
+615 0.34516583483
+616 0.34516583483
+617 0.34516583483
+618 0.34516583483
+619 0.34516583483
+620 0.34516583483
+621 0.34516583483
+622 0.34516583483
+623 0.34516583483
+624 0.345315857333
+625 0.345315857333
+626 0.345315857333
+627 0.345315857333
+628 0.345315857333
+629 0.345315857333
+630 0.345315857333
+631 0.345315857333
+632 0.345315857333
+633 0.345315857333
+634 0.356509773991
+635 0.356509773991
+636 0.356509773991
+637 0.356509773991
+638 0.356509773991
+639 0.356659796494
+640 0.356659796494
+641 0.356659796494
+642 0.356659796494
+643 0.356659796494
+644 0.356659796494
+645 0.356659796494
+646 0.356659796494
+647 0.356659796494
+648 0.356659796494
+649 0.356659796494
+650 0.356659796494
+651 0.356659796494
+652 0.356659796494
+653 0.356659796494
+654 0.356809818997
+655 0.356809818997
+656 0.368003735655
+657 0.368003735655
+658 0.368003735655
+659 0.371680202553
+660 0.371680202553
+661 0.371680202553
+662 0.371680202553
+663 0.371680202553
+664 0.371680202553
+665 0.371680202553
+666 0.371680202553
+667 0.371680202553
+668 0.371680202553
+669 0.371830225056
+670 0.371830225056
+671 0.371830225056
+672 0.371830225056
+673 0.371830225056
+674 0.371830225056
+675 0.371830225056
+676 0.371830225056
+677 0.371830225056
+678 0.383024141714
+679 0.383024141714
+680 0.383024141714
+681 0.383024141714
+682 0.383024141714
+683 0.383024141714
+684 0.383174164217
+685 0.383174164217
+686 0.383174164217
+687 0.383174164217
+688 0.383174164217
+689 0.383174164217
+690 0.383174164217
+691 0.383174164217
+692 0.383174164217
+693 0.383174164217
+694 0.383174164217
+695 0.383174164217
+696 0.383174164217
+697 0.383174164217
+698 0.383174164217
+699 0.383324186721
+700 0.394518103378
+701 0.394518103378
+702 0.394518103378
+703 0.394518103378
+704 0.394518103378
+705 0.394518103378
+706 0.394518103378
+707 0.394518103378
+708 0.394518103378
+709 0.394518103378
+710 0.394518103378
+711 0.394518103378
+712 0.394518103378
+713 0.394518103378
+714 0.394668125881
+715 0.394668125881
+716 0.394668125881
+717 0.394668125881
+718 0.394668125881
+719 0.394668125881
+720 0.394668125881
+721 0.394668125881
+722 0.394668125881
+723 0.405862042539
+724 0.405862042539
+725 0.405862042539
+726 0.405862042539
+727 0.405862042539
+728 0.405862042539
+729 0.406012065042
+730 0.406012065042
+731 0.406012065042
+732 0.406012065042
+733 0.406012065042
+734 0.406012065042
+735 0.406012065042
+736 0.406012065042
+737 0.406012065042
+738 0.406012065042
+739 0.406012065042
+740 0.406012065042
+741 0.40968853194
+742 0.40968853194
+743 0.40968853194
+744 0.409838554444
+745 0.421032471101
+746 0.421032471101
+747 0.421032471101
+748 0.421032471101
+749 0.421032471101
+750 0.421032471101
+751 0.421032471101
+752 0.421032471101
+753 0.421032471101
+754 0.421032471101
+755 0.421032471101
+756 0.421032471101
+757 0.421032471101
+758 0.421032471101
+759 0.421182493605
+760 0.421182493605
+761 0.421182493605
+762 0.421182493605
+763 0.421182493605
+764 0.421182493605
+765 0.421182493605
+766 0.421182493605
+767 0.432376410262
+768 0.432376410262
+769 0.432376410262
+770 0.432376410262
+771 0.432376410262
+772 0.432376410262
+773 0.432376410262
+774 0.432526432765
+775 0.432526432765
+776 0.432526432765
+777 0.432526432765
+778 0.432526432765
+779 0.432526432765
+780 0.432526432765
+781 0.432526432765
+782 0.432526432765
+783 0.432526432765
+784 0.432526432765
+785 0.432526432765
+786 0.432526432765
+787 0.432526432765
+788 0.432526432765
+789 0.443870371926
+790 0.443870371926
+791 0.443870371926
+792 0.443870371926
+793 0.443870371926
+794 0.443870371926
+795 0.443870371926
+796 0.443870371926
+797 0.443870371926
+798 0.443870371926
+799 0.443870371926
+800 0.443870371926
+801 0.443870371926
+802 0.443870371926
+803 0.443870371926
+804 0.444020394429
+805 0.444020394429
+806 0.444020394429
+807 0.444020394429
+808 0.444020394429
+809 0.444020394429
+810 0.444020394429
+811 0.444020394429
+812 0.455214311087
+813 0.455214311087
+814 0.455214311087
+815 0.455214311087
+816 0.455214311087
+817 0.455214311087
+818 0.455214311087
+819 0.45536433359
+820 0.45536433359
+821 0.45536433359
+822 0.45536433359
+823 0.459040800488
+824 0.459040800488
+825 0.459040800488
+826 0.459040800488
+827 0.459040800488
+828 0.459040800488
+829 0.459040800488
+830 0.459040800488
+831 0.459040800488
+832 0.459040800488
+833 0.459040800488
+834 0.470384739649
+835 0.470384739649
+836 0.470384739649
+837 0.470384739649
+838 0.470384739649
+839 0.470384739649
+840 0.470384739649
+841 0.470384739649
+842 0.470384739649
+843 0.470384739649
+844 0.470384739649
+845 0.470384739649
+846 0.470384739649
+847 0.470384739649
+848 0.470384739649
+849 0.470534762153
+850 0.470534762153
+851 0.470534762153
+852 0.470534762153
+853 0.470534762153
+854 0.470534762153
+855 0.470534762153
+856 0.48172867881
+857 0.48172867881
+858 0.48172867881
+859 0.48172867881
+860 0.48172867881
+861 0.48172867881
+862 0.48172867881
+863 0.48172867881
+864 0.481878701313
+865 0.481878701313
+866 0.481878701313
+867 0.481878701313
+868 0.481878701313
+869 0.481878701313
+870 0.481878701313
+871 0.481878701313
+872 0.481878701313
+873 0.481878701313
+874 0.481878701313
+875 0.481878701313
+876 0.481878701313
+877 0.481878701313
+878 0.493072617971
+879 0.493222640474
+880 0.493222640474
+881 0.493222640474
+882 0.493222640474
+883 0.493222640474
+884 0.493222640474
+885 0.493222640474
+886 0.493222640474
+887 0.493222640474
+888 0.493222640474
+889 0.493222640474
+890 0.493222640474
+891 0.493222640474
+892 0.493222640474
+893 0.493222640474
+894 0.493372662978
+895 0.493372662978
+896 0.493372662978
+897 0.493372662978
+898 0.493372662978
+899 0.493372662978
+900 0.504566579635
+901 0.504566579635
+902 0.504566579635
+903 0.504566579635
+904 0.504566579635
+905 0.508243046533
+906 0.508243046533
+907 0.508243046533
+908 0.508243046533
+909 0.508393069037
+910 0.508393069037
+911 0.508393069037
+912 0.508393069037
+913 0.508393069037
+914 0.508393069037
+915 0.508393069037
+916 0.508393069037
+917 0.508393069037
+918 0.508393069037
+919 0.508393069037
+920 0.508393069037
+921 0.508393069037
+922 0.508393069037
+923 0.519586985694
+924 0.519737008197
+925 0.519737008197
+926 0.519737008197
+927 0.519737008197
+928 0.519737008197
+929 0.519737008197
+930 0.519737008197
+931 0.519737008197
+932 0.519737008197
+933 0.519737008197
+934 0.519737008197
+935 0.519737008197
+936 0.519737008197
+937 0.519737008197
+938 0.519737008197
+939 0.519887030701
+940 0.519887030701
+941 0.519887030701
+942 0.519887030701
+943 0.519887030701
+944 0.519887030701
+945 0.531080947358
+946 0.531080947358
+947 0.531080947358
+948 0.531080947358
+949 0.531080947358
+950 0.531080947358
+951 0.531080947358
+952 0.531080947358
+953 0.531080947358
+954 0.531230969861
+955 0.531230969861
+956 0.531230969861
+957 0.531230969861
+958 0.531230969861
+959 0.531230969861
+960 0.531230969861
+961 0.531230969861
+962 0.531230969861
+963 0.531230969861
+964 0.531230969861
+965 0.531230969861
+966 0.531230969861
+967 0.542424886519
+968 0.542424886519
+969 0.542574909022
+970 0.542574909022
+971 0.542574909022
+972 0.542574909022
+973 0.542574909022
+974 0.542574909022
+975 0.542574909022
+976 0.542574909022
+977 0.542574909022
+978 0.542574909022
+979 0.542574909022
+980 0.542574909022
+981 0.542574909022
+982 0.542574909022
+983 0.542574909022
+984 0.542724931526
+985 0.542724931526
+986 0.546401398424
+987 0.546401398424
+988 0.546401398424
+989 0.557595315081
+990 0.557595315081
+991 0.557595315081
+992 0.557595315081
+993 0.557595315081
+994 0.557595315081
+995 0.557595315081
+996 0.557595315081
+997 0.557595315081
+998 0.557595315081
+999 0.557745337585
+1000 0.557745337585
+1001 0.557745337585
+1002 0.557745337585
+1003 0.557745337585
+1004 0.557745337585
+1005 0.557745337585
+1006 0.557745337585
+1007 0.557745337585
+1008 0.557745337585
+1009 0.557745337585
+1010 0.557745337585
+1011 0.557745337585
+1012 0.568939254242
+1013 0.568939254242
+1014 0.569089276745
+1015 0.569089276745
+1016 0.569089276745
+1017 0.569089276745
+1018 0.569089276745
+1019 0.569089276745
+1020 0.569089276745
+1021 0.569089276745
+1022 0.569089276745
+1023 0.569089276745
+1024 0.569089276745
+1025 0.569089276745
+1026 0.569089276745
+1027 0.569089276745
+1028 0.569089276745
+1029 0.569239299249
+1030 0.569239299249
+1031 0.569239299249
+1032 0.569239299249
+1033 0.569239299249
+1034 0.580433215906
+1035 0.580433215906
+1036 0.580433215906
+1037 0.580433215906
+1038 0.580433215906
+1039 0.580433215906
+1040 0.580433215906
+1041 0.580433215906
+1042 0.580433215906
+1043 0.580433215906
+1044 0.58058323841
+1045 0.58058323841
+1046 0.58058323841
+1047 0.58058323841
+1048 0.58058323841
+1049 0.58058323841
+1050 0.58058323841
+1051 0.58058323841
+1052 0.58058323841
+1053 0.58058323841
+1054 0.58058323841
+1055 0.58058323841
+1056 0.591777155067
+1057 0.591777155067
+1058 0.591777155067
+1059 0.59192717757
+1060 0.59192717757
+1061 0.59192717757
+1062 0.59192717757
+1063 0.59192717757
+1064 0.59192717757
+1065 0.59192717757
+1066 0.59192717757
+1067 0.59192717757
+1068 0.595603644469
+1069 0.595603644469
+1070 0.595603644469
+1071 0.595603644469
+1072 0.595603644469
+1073 0.595603644469
+1074 0.595753666972
+1075 0.595753666972
+1076 0.595753666972
+1077 0.595753666972
+1078 0.606947583629
+1079 0.606947583629
+1080 0.606947583629
+1081 0.606947583629
+1082 0.606947583629
+1083 0.606947583629
+1084 0.606947583629
+1085 0.606947583629
+1086 0.606947583629
+1087 0.606947583629
+1088 0.606947583629
+1089 0.607097606133
+1090 0.607097606133
+1091 0.607097606133
+1092 0.607097606133
+1093 0.607097606133
+1094 0.607097606133
+1095 0.607097606133
+1096 0.607097606133
+1097 0.607097606133
+1098 0.607097606133
+1099 0.607097606133
+1100 0.61829152279
+1101 0.61829152279
+1102 0.61829152279
+1103 0.61829152279
+1104 0.618441545294
+1105 0.618441545294
+1106 0.618441545294
+1107 0.618441545294
+1108 0.618441545294
+1109 0.618441545294
+1110 0.618441545294
+1111 0.618441545294
+1112 0.618441545294
+1113 0.618441545294
+1114 0.618441545294
+1115 0.618441545294
+1116 0.618441545294
+1117 0.618441545294
+1118 0.618441545294
+1119 0.618591567797
+1120 0.618591567797
+1121 0.618591567797
+1122 0.618591567797
+1123 0.629785484454
+1124 0.629785484454
+1125 0.629785484454
+1126 0.629785484454
+1127 0.629785484454
+1128 0.629785484454
+1129 0.629785484454
+1130 0.629785484454
+1131 0.629785484454
+1132 0.629785484454
+1133 0.629785484454
+1134 0.629935506958
+1135 0.629935506958
+1136 0.629935506958
+1137 0.629935506958
+1138 0.629935506958
+1139 0.629935506958
+1140 0.629935506958
+1141 0.629935506958
+1142 0.629935506958
+1143 0.629935506958
+1144 0.629935506958
+1145 0.641129423615
+1146 0.641129423615
+1147 0.641129423615
+1148 0.641129423615
+1149 0.641279446118
+1150 0.644955913017
+1151 0.644955913017
+1152 0.644955913017
+1153 0.644955913017
+1154 0.644955913017
+1155 0.644955913017
+1156 0.644955913017
+1157 0.644955913017
+1158 0.644955913017
+1159 0.644955913017
+1160 0.644955913017
+1161 0.644955913017
+1162 0.644955913017
+1163 0.644955913017
+1164 0.64510593552
+1165 0.64510593552
+1166 0.64510593552
+1167 0.656299852177
+1168 0.656299852177
+1169 0.656299852177
+1170 0.656299852177
+1171 0.656299852177
+1172 0.656299852177
+1173 0.656299852177
+1174 0.656299852177
+1175 0.656299852177
+1176 0.656299852177
+1177 0.656299852177
+1178 0.656299852177
+1179 0.656449874681
+1180 0.656449874681
+1181 0.656449874681
+1182 0.656449874681
+1183 0.656449874681
+1184 0.656449874681
+1185 0.656449874681
+1186 0.656449874681
+1187 0.656449874681
+1188 0.656449874681
+1189 0.667643791338
+1190 0.667643791338
+1191 0.667643791338
+1192 0.667643791338
+1193 0.667643791338
+1194 0.667793813842
+1195 0.667793813842
+1196 0.667793813842
+1197 0.667793813842
+1198 0.667793813842
+1199 0.667793813842
+1200 0.667793813842
+1201 0.667793813842
+1202 0.667793813842
+1203 0.667793813842
+1204 0.667793813842
+1205 0.667793813842
+1206 0.667793813842
+1207 0.667793813842
+1208 0.667793813842
+1209 0.667943836345
+1210 0.667943836345
+1211 0.667943836345
+1212 0.679137753002
+1213 0.679137753002
+1214 0.679137753002
+1215 0.679137753002
+1216 0.679137753002
+1217 0.679137753002
+1218 0.679137753002
+1219 0.679137753002
+1220 0.679137753002
+1221 0.679137753002
+1222 0.679137753002
+1223 0.679137753002
+1224 0.679287775506
+1225 0.679287775506
+1226 0.679287775506
+1227 0.679287775506
+1228 0.679287775506
+1229 0.679287775506
+1230 0.679287775506
+1231 0.679287775506
+1232 0.682964242404
+1233 0.682964242404
+1234 0.694158159061
+1235 0.694158159061
+1236 0.694158159061
+1237 0.694158159061
+1238 0.694158159061
+1239 0.694308181565
+1240 0.694308181565
+1241 0.694308181565
+1242 0.694308181565
+1243 0.694308181565
+1244 0.694308181565
+1245 0.694308181565
+1246 0.694308181565
+1247 0.694308181565
+1248 0.694308181565
+1249 0.694308181565
+1250 0.694308181565
+1251 0.694308181565
+1252 0.694308181565
+1253 0.694308181565
+1254 0.694458204068
+1255 0.694458204068
+1256 0.705652120726
+1257 0.705652120726
+1258 0.705652120726
+1259 0.705652120726
+1260 0.705652120726
+1261 0.705652120726
+1262 0.705652120726
+1263 0.705652120726
+1264 0.705652120726
+1265 0.705652120726
+1266 0.705652120726
+1267 0.705652120726
+1268 0.705652120726
+1269 0.705802143229
+1270 0.705802143229
+1271 0.705802143229
+1272 0.705802143229
+1273 0.705802143229
+1274 0.705802143229
+1275 0.705802143229
+1276 0.705802143229
+1277 0.705802143229
+1278 0.716996059886
+1279 0.716996059886
+1280 0.716996059886
+1281 0.716996059886
+1282 0.716996059886
+1283 0.716996059886
+1284 0.71714608239
+1285 0.71714608239
+1286 0.71714608239
+1287 0.71714608239
+1288 0.71714608239
+1289 0.71714608239
+1290 0.71714608239
+1291 0.71714608239
+1292 0.71714608239
+1293 0.71714608239
+1294 0.71714608239
+1295 0.71714608239
+1296 0.71714608239
+1297 0.71714608239
+1298 0.71714608239
+1299 0.717296104893
+1300 0.72849002155
+1301 0.72849002155
+1302 0.72849002155
+1303 0.72849002155
+1304 0.72849002155
+1305 0.72849002155
+1306 0.72849002155
+1307 0.72849002155
+1308 0.72849002155
+1309 0.72849002155
+1310 0.72849002155
+1311 0.72849002155
+1312 0.72849002155
+1313 0.72849002155
+1314 0.732316510952
+1315 0.732316510952
+1316 0.732316510952
+1317 0.732316510952
+1318 0.732316510952
+1319 0.732316510952
+1320 0.732316510952
+1321 0.732316510952
+1322 0.732316510952
+1323 0.743510427609
+1324 0.743510427609
+1325 0.743510427609
+1326 0.743510427609
+1327 0.743510427609
+1328 0.743510427609
+1329 0.743660450113
+1330 0.743660450113
+1331 0.743660450113
+1332 0.743660450113
+1333 0.743660450113
+1334 0.743660450113
+1335 0.743660450113
+1336 0.743660450113
+1337 0.743660450113
+1338 0.743660450113
+1339 0.743660450113
+1340 0.743660450113
+1341 0.743660450113
+1342 0.743660450113
+1343 0.743660450113
+1344 0.743810472616
+1345 0.755004389274
+1346 0.755004389274
+1347 0.755004389274
+1348 0.755004389274
+1349 0.755004389274
+1350 0.755004389274
+1351 0.755004389274
+1352 0.755004389274
+1353 0.755004389274
+1354 0.755004389274
+1355 0.755004389274
+1356 0.755004389274
+1357 0.755004389274
+1358 0.755004389274
+1359 0.755154411777
+1360 0.755154411777
+1361 0.755154411777
+1362 0.755154411777
+1363 0.755154411777
+1364 0.755154411777
+1365 0.755154411777
+1366 0.755154411777
+1367 0.766348328434
+1368 0.766348328434
+1369 0.766348328434
+1370 0.766348328434
+1371 0.766348328434
+1372 0.766348328434
+1373 0.766348328434
+1374 0.766498350938
+1375 0.766498350938
+1376 0.766498350938
+1377 0.766498350938
+1378 0.766498350938
+1379 0.766498350938
+1380 0.766498350938
+1381 0.766498350938
+1382 0.766498350938
+1383 0.766498350938
+1384 0.766498350938
+1385 0.766498350938
+1386 0.766498350938
+1387 0.766498350938
+1388 0.766498350938
+1389 0.777842290099
+1390 0.777842290099
+1391 0.777842290099
+1392 0.777842290099
+1393 0.777842290099
+1394 0.777842290099
+1395 0.777842290099
+1396 0.781518756997
+1397 0.781518756997
+1398 0.781518756997
+1399 0.781518756997
+1400 0.781518756997
+1401 0.781518756997
+1402 0.781518756997
+1403 0.781518756997
+1404 0.7816687795
+1405 0.7816687795
+1406 0.7816687795
+1407 0.7816687795
+1408 0.7816687795
+1409 0.7816687795
+1410 0.7816687795
+1411 0.7816687795
+1412 0.792862696158
+1413 0.792862696158
+1414 0.792862696158
+1415 0.792862696158
+1416 0.792862696158
+1417 0.792862696158
+1418 0.792862696158
+1419 0.793012718661
+1420 0.793012718661
+1421 0.793012718661
+1422 0.793012718661
+1423 0.793012718661
+1424 0.793012718661
+1425 0.793012718661
+1426 0.793012718661
+1427 0.793012718661
+1428 0.793012718661
+1429 0.793012718661
+1430 0.793012718661
+1431 0.793012718661
+1432 0.793012718661
+1433 0.793012718661
+1434 0.804356657822
+1435 0.804356657822
+1436 0.804356657822
+1437 0.804356657822
+1438 0.804356657822
+1439 0.804356657822
+1440 0.804356657822
+1441 0.804356657822
+1442 0.804356657822
+1443 0.804356657822
+1444 0.804356657822
+1445 0.804356657822
+1446 0.804356657822
+1447 0.804356657822
+1448 0.804356657822
+1449 0.804506680325
+1450 0.804506680325
+1451 0.804506680325
+1452 0.804506680325
+1453 0.804506680325
+1454 0.804506680325
+1455 0.804506680325
+1456 0.815700596982
+1457 0.815700596982
+1458 0.815700596982
+1459 0.815700596982
+1460 0.815700596982
+1461 0.815700596982
+1462 0.815700596982
+1463 0.815700596982
+1464 0.815850619486
+1465 0.815850619486
+1466 0.815850619486
+1467 0.815850619486
+1468 0.815850619486
+1469 0.815850619486
+1470 0.815850619486
+1471 0.815850619486
+1472 0.815850619486
+1473 0.815850619486
+1474 0.815850619486
+1475 0.815850619486
+1476 0.815850619486
+1477 0.815850619486
+1478 0.830721003042
+1479 0.830871025545
+1480 0.830871025545
+1481 0.830871025545
+1482 0.830871025545
+1483 0.830871025545
+1484 0.830871025545
+1485 0.830871025545
+1486 0.830871025545
+1487 0.830871025545
+1488 0.830871025545
+1489 0.830871025545
+1490 0.830871025545
+1491 0.830871025545
+1492 0.830871025545
+1493 0.830871025545
+1494 0.831021048048
+1495 0.831021048048
+1496 0.831021048048
+1497 0.831021048048
+1498 0.831021048048
+1499 0.831021048048
+1500 0.842214964706
+1501 0.842214964706
+1502 0.842214964706
+1503 0.842214964706
+1504 0.842214964706
+1505 0.842214964706
+1506 0.842214964706
+1507 0.842214964706
+1508 0.842214964706
+1509 0.842214964706
+1510 0.842364987209
+1511 0.842364987209
+1512 0.842364987209
+1513 0.842364987209
+1514 0.842364987209
+1515 0.842364987209
+1516 0.842364987209
+1517 0.842364987209
+1518 0.842364987209
+1519 0.842364987209
+1520 0.842364987209
+1521 0.842364987209
+1522 0.842364987209
+1523 0.853558903866
+1524 0.853558903866
+1525 0.85370892637
+1526 0.85370892637
+1527 0.85370892637
+1528 0.85370892637
+1529 0.85370892637
+1530 0.85370892637
+1531 0.85370892637
+1532 0.85370892637
+1533 0.85370892637
+1534 0.85370892637
+1535 0.85370892637
+1536 0.85370892637
+1537 0.85370892637
+1538 0.85370892637
+1539 0.85370892637
+1540 0.853858948873
+1541 0.853858948873
+1542 0.853858948873
+1543 0.853858948873
+1544 0.853858948873
+1545 0.865052865531
+1546 0.865052865531
+1547 0.865052865531
+1548 0.865052865531
+1549 0.865052865531
+1550 0.865052865531
+1551 0.865052865531
+1552 0.865052865531
+1553 0.865052865531
+1554 0.865052865531
+1555 0.865202888034
+1556 0.865202888034
+1557 0.865202888034
+1558 0.865202888034
+1559 0.865202888034
+1560 0.868879354932
+1561 0.868879354932
+1562 0.868879354932
+1563 0.868879354932
+1564 0.868879354932
+1565 0.868879354932
+1566 0.868879354932
+1567 0.88007327159
+1568 0.88007327159
+1569 0.88007327159
+1570 0.880223294093
+1571 0.880223294093
+1572 0.880223294093
+1573 0.880223294093
+1574 0.880223294093
+1575 0.880223294093
+1576 0.880223294093
+1577 0.880223294093
+1578 0.880223294093
+1579 0.880223294093
+1580 0.880223294093
+1581 0.880223294093
+1582 0.880223294093
+1583 0.880223294093
+1584 0.880223294093
+1585 0.880373316596
+1586 0.880373316596
+1587 0.880373316596
+1588 0.880373316596
+1589 0.891567233254
+1590 0.891567233254
+1591 0.891567233254
+1592 0.891567233254
+1593 0.891567233254
+1594 0.891567233254
+1595 0.891567233254
+1596 0.891567233254
+1597 0.891567233254
+1598 0.891567233254
+1599 0.891567233254
+1600 0.891717255757
+1601 0.891717255757
+1602 0.891717255757
+1603 0.891717255757
+1604 0.891717255757
+1605 0.891717255757
+1606 0.891717255757
+1607 0.891717255757
+1608 0.891717255757
+1609 0.891717255757
+1610 0.891717255757
+1611 0.891717255757
+1612 0.902911172415
+1613 0.902911172415
+1614 0.902911172415
+1615 0.903061194918
+1616 0.903061194918
+1617 0.903061194918
+1618 0.903061194918
+1619 0.903061194918
+1620 0.903061194918
+1621 0.903061194918
+1622 0.903061194918
+1623 0.903061194918
+1624 0.903061194918
+1625 0.903061194918
+1626 0.903061194918
+1627 0.903061194918
+1628 0.903061194918
+1629 0.903061194918
+1630 0.903211217421
+1631 0.903211217421
+1632 0.903211217421
+1633 0.903211217421
+1634 0.914405134079
+1635 0.914405134079
+1636 0.914405134079
+1637 0.914405134079
+1638 0.914405134079
+1639 0.914405134079
+1640 0.914405134079
+1641 0.914405134079
+1642 0.918081600977
+1643 0.918081600977
+1644 0.918081600977
+1645 0.91823162348
+1646 0.91823162348
+1647 0.91823162348
+1648 0.91823162348
+1649 0.91823162348
+1650 0.91823162348
+1651 0.91823162348
+1652 0.91823162348
+1653 0.91823162348
+1654 0.91823162348
+1655 0.91823162348
+1656 0.929425540138
+1657 0.929425540138
+1658 0.929425540138
+1659 0.929425540138
+1660 0.929575562641
+1661 0.929575562641
+1662 0.929575562641
+1663 0.929575562641
+1664 0.929575562641
+1665 0.929575562641
+1666 0.929575562641
+1667 0.929575562641
+1668 0.929575562641
+1669 0.929575562641
+1670 0.929575562641
+1671 0.929575562641
+1672 0.929575562641
+1673 0.929575562641
+1674 0.929575562641
+1675 0.929725585144
+1676 0.929725585144
+1677 0.929725585144
+1678 0.940919501802
+1679 0.940919501802
+1680 0.940919501802
+1681 0.940919501802
+1682 0.940919501802
+1683 0.940919501802
+1684 0.940919501802
+1685 0.940919501802
+1686 0.940919501802
+1687 0.940919501802
+1688 0.940919501802
+1689 0.940919501802
+1690 0.941069524305
+1691 0.941069524305
+1692 0.941069524305
+1693 0.941069524305
+1694 0.941069524305
+1695 0.941069524305
+1696 0.941069524305
+1697 0.941069524305
+1698 0.941069524305
+1699 0.941069524305
+1700 0.952263440963
+1701 0.952263440963
+1702 0.952263440963
+1703 0.952263440963
+1704 0.952263440963
+1705 0.952413463466
+1706 0.952413463466
+1707 0.952413463466
+1708 0.952413463466
+1709 0.952413463466
+1710 0.952413463466
+1711 0.952413463466
+1712 0.952413463466
+1713 0.952413463466
+1714 0.952413463466
+1715 0.952413463466
+1716 0.952413463466
+1717 0.952413463466
+1718 0.952413463466
+1719 0.952413463466
+1720 0.952563485969
+1721 0.952563485969
+1722 0.952563485969
+1723 0.963757402627
+1724 0.967433869525
+1725 0.967433869525
+1726 0.967433869525
+1727 0.967433869525
+1728 0.967433869525
+1729 0.967433869525
+1730 0.967433869525
+1731 0.967433869525
+1732 0.967433869525
+1733 0.967433869525
+1734 0.967433869525
+1735 0.967583892028
+1736 0.967583892028
+1737 0.967583892028
+1738 0.967583892028
+1739 0.967583892028
+1740 0.967583892028
+1741 0.967583892028
+1742 0.967583892028
+1743 0.967583892028
+1744 0.967583892028
+1745 0.978777808686
+1746 0.978777808686
+1747 0.978777808686
+1748 0.978777808686
+1749 0.978777808686
diff --git a/LDA.c b/LDA.c
new file mode 100644
index 0000000..49cc141
--- /dev/null
+++ b/LDA.c
@@ -0,0 +1,280 @@
+//Title: LDA.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+// LDA support functions.
+
+#include "CMemLeak.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "LDA.h"
+#include "Utils.h"
+#include "Inspect.h"
+#include "Errors.h"
+#include "Spectrum.h"
+#include "Trie.h"
+#include "Score.h"
+
+// Global variables:
+LDAModel* PMCCharge1LDA = NULL;
+LDAModel* PMCCharge2LDA = NULL;
+LDAModel* PMCCharge3LDA = NULL;
+
+LDAModel* CCModel1LDA = NULL;
+LDAModel* CCModel2LDA = NULL;
+
+LDAModel* MQModel2LDA = NULL;
+LDAModel* MQModel3LDA = NULL;
+
+
+void LoadCCModelLDA(int ForceRefresh)
+{
+ char FilePath[2048];
+ if (CCModel1LDA)
+ {
+ if (ForceRefresh)
+ {
+ FreeLDAModel(CCModel1LDA);
+ FreeLDAModel(CCModel2LDA);
+ }
+ else
+ {
+ return;
+ }
+ }
+ sprintf(FilePath, "%sCCLDA1.model", GlobalOptions->ResourceDir);
+ CCModel1LDA = LoadLDAModel(FilePath);
+ sprintf(FilePath, "%sCCLDA2.model", GlobalOptions->ResourceDir);
+ CCModel2LDA = LoadLDAModel(FilePath);
+}
+
+void FreeLDAModels()
+{
+ FreeLDAModel(PMCCharge1LDA);
+ PMCCharge1LDA = NULL;
+ FreeLDAModel(PMCCharge2LDA);
+ PMCCharge2LDA = NULL;
+ FreeLDAModel(PMCCharge3LDA);
+ PMCCharge3LDA = NULL;
+ FreeLDAModel(MQModel2LDA);
+ MQModel2LDA = NULL;
+ FreeLDAModel(MQModel3LDA);
+ MQModel3LDA = NULL;
+}
+
+// Load linear discriminant analysis (LDA) model for parent mass
+// correction (PMC). Special models for phosphorylation searches
+void LoadPMCLDA(int ForceLoad)
+{
+ char FilePath[2048];
+ if (PMCCharge1LDA)
+ {
+ if (ForceLoad)
+ {
+ FreeLDAModel(PMCCharge1LDA);
+ FreeLDAModel(PMCCharge2LDA);
+ FreeLDAModel(PMCCharge3LDA);
+ }
+ else
+ {
+ return;
+ }
+ }
+ sprintf(FilePath, "%sPMCLDA1.model", GlobalOptions->ResourceDir);
+ PMCCharge1LDA = LoadLDAModel(FilePath);
+ if (GlobalOptions->PhosphorylationFlag)
+ {//Load phosphorylation specific models, only different for charge 2 and 3
+ sprintf(FilePath, "%sPMCLDA2Phos.model", GlobalOptions->ResourceDir);
+ PMCCharge2LDA = LoadLDAModel(FilePath);
+ sprintf(FilePath, "%sPMCLDA3Phos.model", GlobalOptions->ResourceDir);
+ PMCCharge3LDA = LoadLDAModel(FilePath);
+ }
+ else
+ {
+ sprintf(FilePath, "%sPMCLDA2.model", GlobalOptions->ResourceDir);
+ PMCCharge2LDA = LoadLDAModel(FilePath);
+ sprintf(FilePath, "%sPMCLDA3.model", GlobalOptions->ResourceDir);
+ PMCCharge3LDA = LoadLDAModel(FilePath);
+ }
+}
+
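+// The reads below assume a binary .model file laid out as:
+//   int    FeatureCount
+//   double MinValues[FeatureCount], MaxValues[FeatureCount]
+//   double MeanVectorTrue[FeatureCount], MeanVectorFalse[FeatureCount]
+//   double ConstantTrue, ConstantFalse
+//   double CovInv[FeatureCount * FeatureCount]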
+LDAModel* LoadLDAModel(char* LDAModelFileName)
+{
+ FILE* File;
+ LDAModel* Model;
+ double Value;
+ int BytesRead;
+ //
+ File = fopen(LDAModelFileName, "rb");
+ if (!File)
+ {
+ return NULL;
+ }
+ Model = (LDAModel*)calloc(1, sizeof(LDAModel));
+ //ReadBinary(&Value, sizeof(float), 1, File);
+ ReadBinary(&Model->FeatureCount, sizeof(int), 1, File);
+ assert(Model->FeatureCount >= 1 && Model->FeatureCount < 100);
+ Model->ScaledVector = (double*)calloc(Model->FeatureCount, sizeof(double));
+ Model->TempProductVector = (double*)calloc(Model->FeatureCount, sizeof(double));
+
+ // Read min and max values:
+ Model->MinValues = (double*)calloc(Model->FeatureCount, sizeof(double));
+ ReadBinary(Model->MinValues, sizeof(double), Model->FeatureCount, File);
+ Model->MaxValues = (double*)calloc(Model->FeatureCount, sizeof(double));
+ ReadBinary(Model->MaxValues, sizeof(double), Model->FeatureCount, File);
+ // Read mean true vector and mean false vector:
+ Model->MeanVectorTrue = (double*)calloc(Model->FeatureCount, sizeof(double));
+ ReadBinary(Model->MeanVectorTrue, sizeof(double), Model->FeatureCount, File);
+ Model->MeanVectorFalse = (double*)calloc(Model->FeatureCount, sizeof(double));
+ ReadBinary(Model->MeanVectorFalse, sizeof(double), Model->FeatureCount, File);
+    // Read constant true and constant false:
+ ReadBinary(&Model->ConstantTrue, sizeof(double), 1, File);
+ ReadBinary(&Model->ConstantFalse, sizeof(double), 1, File);
+ // Read inverted covariance matrix:
+ Model->CovInv = (double*)calloc(Model->FeatureCount * Model->FeatureCount, sizeof(double));
+ ReadBinary(Model->CovInv, sizeof(double), Model->FeatureCount * Model->FeatureCount, File);
+ // Verify that we're at EOF:
+ BytesRead = ReadBinary(&Value, sizeof(float), 1, File);
+ assert(!BytesRead);
+ //printf("\nLoading LDA from %s:\n", LDAModelFileName);
+ //printf("%d features\n", Model->FeatureCount);
+ //printf("MinValues: %.4f...%.4f\n", Model->MinValues[0], Model->MinValues[Model->FeatureCount - 1]);
+ //printf("MaxValues: %.4f...%.4f\n", Model->MaxValues[0], Model->MaxValues[Model->FeatureCount - 1]);
+ //printf("MeanVectorTrue: %.4f...%.4f\n", Model->MeanVectorTrue[0], Model->MeanVectorTrue[Model->FeatureCount - 1]);
+ //printf("MeanVectorFalse: %.4f...%.4f\n", Model->MeanVectorFalse[0], Model->MeanVectorFalse[Model->FeatureCount - 1]);
+ //printf("CovInv: %.4f, %.4f, ..., %.4f, %.4f\n", Model->CovInv[0], Model->CovInv[1],
+ // Model->CovInv[Model->FeatureCount * Model->FeatureCount - 2],
+ // Model->CovInv[Model->FeatureCount * Model->FeatureCount - 1]);
+ //printf("ConstantTrue %.4f, ConstantFalse %.4f\n", Model->ConstantTrue, Model->ConstantFalse);
+ fclose(File);
+ return Model;
+}
+
+void FreeLDAModel(LDAModel* Model)
+{
+ if (!Model)
+ {
+ return;
+ }
+ SafeFree(Model->MinValues);
+ SafeFree(Model->MaxValues);
+ SafeFree(Model->CovInv);
+ SafeFree(Model->ScaledVector);
+ SafeFree(Model->TempProductVector);
+ SafeFree(Model->MeanVectorTrue);
+ SafeFree(Model->MeanVectorFalse);
+ SafeFree(Model);
+}
+
+float ApplyLDAModel(LDAModel* Model, float* Features)
+{
+ int FeatureIndex;
+ double HalfRange;
+ int ColumnIndex;
+ double ProductTrue;
+ double ProductFalse;
+ //
+ //printf("\nCFeatures %.4f...%.4f\n", Features[0], Features[Model->FeatureCount - 1]);
+ // Scale the features into [-1, 1]:
+ for (FeatureIndex = 0; FeatureIndex < Model->FeatureCount; FeatureIndex++)
+ {
+ HalfRange = (float)((Model->MaxValues[FeatureIndex] - Model->MinValues[FeatureIndex]) / 2.0);
+ Model->ScaledVector[FeatureIndex] = (float)((Features[FeatureIndex] - Model->MinValues[FeatureIndex]) / HalfRange - 1.0);
+ }
+ //printf("Scaled vector %.4f...%.4f\n", Model->ScaledVector[0], Model->ScaledVector[Model->FeatureCount - 1]);
+ // Compute the product of the inverse covariance matrix with our feature vector:
+ for (FeatureIndex = 0; FeatureIndex < Model->FeatureCount; FeatureIndex++)
+ {
+ Model->TempProductVector[FeatureIndex] = 0;
+ for (ColumnIndex = 0; ColumnIndex < Model->FeatureCount; ColumnIndex++)
+ {
+ Model->TempProductVector[FeatureIndex] += (float)(Model->ScaledVector[ColumnIndex] * Model->CovInv[FeatureIndex * Model->FeatureCount + ColumnIndex]);
+ }
+ }
+ //printf("Temp product vector vector %.4f...%.4f\n", Model->TempProductVector[0], Model->TempProductVector[Model->FeatureCount - 1]);
+
+ // Compute u0 * C-1 * X and u1 * C-1 * X
+ ProductTrue = 0;
+ ProductFalse = 0;
+ for (FeatureIndex = 0; FeatureIndex < Model->FeatureCount; FeatureIndex++)
+ {
+ ProductTrue += (float)(Model->MeanVectorTrue[FeatureIndex] * Model->TempProductVector[FeatureIndex]);
+ ProductFalse += (float)(Model->MeanVectorFalse[FeatureIndex] * Model->TempProductVector[FeatureIndex]);
+ }
+ ProductTrue += Model->ConstantTrue;
+ ProductFalse += Model->ConstantFalse;
+ //printf("ProdTrue %.4f ProdFalse %.4f result %.4f\n", ProductTrue, ProductFalse, ProductTrue - ProductFalse);
+ //ProductTrue += (float)Model->Sub;
+ //ProdFalse += (float)SubProdFalse;
+ //printf("%.2f\t%.2f\t%.2f\t\n", (ProdTrue - ProdFalse), ProdTrue, ProdFalse);
+ return (float)(ProductTrue - ProductFalse);
+}
+
+void InitPValueLDA()
+{
+ char FilePath[MAX_FILENAME_LEN];
+ //
+ if (!MQModel2LDA)
+ {
+ sprintf(FilePath, "%s%s.model", GlobalOptions->ResourceDir, "MQScoreLDA2");
+ MQModel2LDA = LoadLDAModel(FilePath);
+ }
+ if (!MQModel3LDA)
+ {
+ sprintf(FilePath, "%s%s.model", GlobalOptions->ResourceDir, "MQScoreLDA3");
+ MQModel3LDA = LoadLDAModel(FilePath);
+ }
+}
+
+float LDAComputeMQScore(MSSpectrum* Spectrum, Peptide* Match, float* MQFeatures)
+{
+ LDAModel* Model;
+ float Score;
+
+ if (Spectrum->Charge < 3)
+ {
+ Model = MQModel2LDA;
+ }
+ else
+ {
+ Model = MQModel3LDA;
+ }
+ if (!Model)
+ {
+ return 0.0;
+ }
+ Score = ApplyLDAModel(Model, MQFeatures);
+ Score = GetPenalizedScore(Spectrum, Match, Score);
+ return Score;
+
+}
diff --git a/LDA.h b/LDA.h
new file mode 100644
index 0000000..43289a1
--- /dev/null
+++ b/LDA.h
@@ -0,0 +1,59 @@
+//Title: LDA.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent@ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef LDA_H
+#define LDA_H
+
+typedef struct LDAModel
+{
+ int FeatureCount;
+ double* MinValues;
+ double* MaxValues;
+ double* CovInv;
+ double* MeanVectorTrue;
+ double* MeanVectorFalse;
+ double* ScaledVector;
+ double* TempProductVector;
+ double ConstantFalse;
+ double ConstantTrue;
+} LDAModel;
+
+LDAModel* LoadLDAModel(char* LDAModelFileName);
+void FreeLDAModel(LDAModel* Model);
+float ApplyLDAModel(LDAModel* Model, float* Features);
+void LoadPMCLDA(int ForceLoad);
+void FreeLDAModels();
+void LoadCCModelLDA(int ForceRefresh);
+void InitPValueLDA();
+
+#endif // LDA_H
+
diff --git a/LDA.py b/LDA.py
new file mode 100644
index 0000000..c20f373
--- /dev/null
+++ b/LDA.py
@@ -0,0 +1,469 @@
+#Title: LDA.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent@ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+Linear discriminant analysis
+Assumes the input file is tab-delimited, with category in the first column, and float
+values in the remaining columns.
+"""
+
+USE_NUMPY = 1
+import traceback
+try:
+ if USE_NUMPY:
+ from numpy import *
+ import numpy.linalg
+ MatrixMulitply = dot
+ InvertMatrix = numpy.linalg.inv
+ FloatType = float
+ else:
+ from Numeric import *
+ import LinearAlgebra
+ InvertMatrix = LinearAlgebra.inverse
+ MatrixMulitply = matrixmultiply
+ FloatType = Float
+except:
+ print "\n* Warning: Unable to import numpy. LDA training not available."
+ print " Please install NumPy (see http://numpy.scipy.org/ for details)"
+ print " Error details are shown here:"
+ traceback.print_exc()
+
+import math
+import os
+import sys
+import random
+import struct
+import traceback
+
+ForbiddenFeatures = [2, 3, 4, 5, 13, 31, 32, 33, 34, 43, 47] #[2,3,4]
+
+def PrintHistogram(Histogram, HistoFile):
+ Bins = Histogram.keys()
+ Bins.sort()
+ #Bins.reverse()
+ TotalBads = 0
+ TotalGoods = 0
+ print "\nHistogram results:"
+ for Bin in Bins:
+ TotalBads += Histogram[Bin][0]
+ Bads = TotalBads
+ for Bin in range(Bins[0], Bins[-1]):
+ if Histogram.has_key(Bin):
+ Bads -= Histogram[Bin][0]
+ PValue = Bads / float(TotalBads)
+ print "%s\t%s\t%s\t%s\t"%(Bin, PValue, Bads, TotalBads)
+        if HistoFile and Bin >= -70 and Bin < 150:
+ PValue = min(0.99, max(0.0001, PValue))
+ Str = struct.pack("<f", PValue)
+ #print "PValue struct:", Str
+ HistoFile.write(Str)
+
+class LDAClassifier:
+ def __init__(self):
+ pass
+ def GetCovarianceArray(self, VectorList):
+ Size = len(VectorList[0])
+ VectorCount = float(len(VectorList))
+ C = zeros((Size, Size), FloatType)
+ for Vector in VectorList:
+ for X in range(Size):
+ for Y in range(Size):
+ C[X][Y] += Vector[X] * Vector[Y] / VectorCount
+ return C
+ def LoadVectors(self, FileName, CategoryBit, FeatureList):
+ Size = len(FeatureList)
+ self.GoodVectors = []
+ self.BadVectors = []
+        # Iterate over file lines, and read vectors in:
+ File = open(FileName, "r")
+ for FileLine in File.xreadlines():
+ if FileLine[0] == "#":
+ continue # comment
+ Bits = FileLine.split("\t")
+ try:
+ Category = int(Bits[CategoryBit])
+ except:
+ continue
+ # Turn -1 vs 1 into 0 vs 1:
+ if Category < 0:
+ Category = 0
+ #for X in range(len(Bits)):
+ # print "%s: %s"%(X, Bits[X])
+ Vector = []
+ try:
+ for Index in FeatureList:
+ if Index >= len(Bits) or not Bits[Index].strip():
+ Vector.append(0)
+ else:
+ Vector.append(float(Bits[Index]))
+ except:
+ traceback.print_exc()
+ print Bits
+ continue
+ if Category:
+ self.GoodVectors.append(Vector)
+ else:
+ self.BadVectors.append(Vector)
+ print "First GoodVector:\n", self.GoodVectors[0]
+ print "First BadVector:\n", self.BadVectors[0]
+ def ScaleVectors(self):
+ """
+ Scale all vectors so that 90% of all values lie in the range [-1, 1]
+ """
+ Values = []
+ MinValues = []
+ MaxValues = []
+ FeatureCount = len(self.GoodVectors[0])
+ for X in range(FeatureCount):
+ Values.append([])
+ for Vector in self.GoodVectors:
+ for X in range(FeatureCount):
+ Values[X].append(Vector[X])
+ for Vector in self.BadVectors:
+ for X in range(FeatureCount):
+ Values[X].append(Vector[X])
+ print "Value count:", len(Values[0])
+ for X in range(FeatureCount):
+ Values[X].sort()
+ ValueCount = len(Values[X])
+ MinValues.append(Values[X][int(round(ValueCount * 0.05))])
+ MaxValues.append(Values[X][int(round(ValueCount * 0.95))])
+ print "Range:"
+ for X in range(FeatureCount):
+ print "%s: %.4f ... %.4f"%(X, MinValues[X], MaxValues[X])
+ for X in range(FeatureCount):
+ HalfRange = (MaxValues[X] - MinValues[X]) / 2.0
+ if not HalfRange:
+ continue
+ for Vector in self.BadVectors:
+ Vector[X] = (Vector[X] - MinValues[X]) / HalfRange - 1.0
+ #Vector[X] = max(-1.0, min(Vector[X], 1.0))
+ for Vector in self.GoodVectors:
+ Vector[X] = (Vector[X] - MinValues[X]) / HalfRange - 1.0
+ #Vector[X] = max(-1.0, min(Vector[X], 1.0))
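+        # Note: ScaleVectors maps the 5th-95th percentile range of each feature
+        # linearly onto [-1, 1]; values outside that range land outside [-1, 1]
+        # (the clamping above is commented out), so roughly 90% of values end up
+        # in [-1, 1], as the docstring states.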
+ def PerformLDA(self, FileName, CategoryBit, FeatureList, ScaleVectors = 1, FoldValidation = 0):
+ VerboseFlag = 1
+ Size = len(FeatureList)
+ self.LoadVectors(FileName, CategoryBit, FeatureList)
+ if ScaleVectors:
+ self.ScaleVectors()
+ if FoldValidation:
+ random.seed(1)
+ random.shuffle(self.GoodVectors)
+ random.shuffle(self.BadVectors)
+ # n-fold validation:
+ self.MasterGoodVectors = self.GoodVectors
+ self.MasterBadVectors = self.BadVectors
+ WorstAccuracy = 1.0
+ for Fold in range(max(1, FoldValidation)):
+ # Slice the master lists of good and bad vectors to separate 1/FoldValidation of
+ # them into a test set. FoldValidation can be 0, to do no such splitting.
+ self.GoodVectors = []
+ self.GoodTestVectors = []
+ for X in range(len(self.MasterGoodVectors)):
+ if FoldValidation and X % FoldValidation == Fold:
+ self.GoodTestVectors.append(self.MasterGoodVectors[X])
+ else:
+ self.GoodVectors.append(self.MasterGoodVectors[X])
+ self.BadVectors = []
+ self.BadTestVectors = []
+ for X in range(len(self.MasterBadVectors)):
+ if FoldValidation and X % FoldValidation == Fold:
+ self.BadTestVectors.append(self.MasterBadVectors[X])
+ else:
+ self.BadVectors.append(self.MasterBadVectors[X])
+ ############################################################
+ # Compute the mean vectors:
+ GoodCount = float(len(self.GoodVectors))
+ BadCount = float(len(self.BadVectors))
+ AllCount = GoodCount + BadCount
+ self.MeanGood = [0]*Size
+ self.MeanBad = [0]*Size
+ self.MeanGlobal = [0]*Size
+ for Vector in self.GoodVectors:
+ for Index in range(Size):
+ self.MeanGood[Index] += Vector[Index] / GoodCount
+ self.MeanGlobal[Index] += Vector[Index] / AllCount
+ for Vector in self.BadVectors:
+ for Index in range(Size):
+ self.MeanBad[Index] += Vector[Index] / BadCount
+ self.MeanGlobal[Index] += Vector[Index] / AllCount
+ print "MeanGood:\n ", self.MeanGood
+ print "MeanBad:\n ", self.MeanBad
+ print "MeanGlobal:\n ", self.MeanGlobal
+ ############################################################
+ # Compute the mean-corrected vectors:
+ MeanCorrectedGoodVectors = []
+ MeanCorrectedBadVectors = []
+ for Vector in self.GoodVectors:
+ NewVector = []
+ for X in range(Size):
+ NewVector.append(Vector[X] - self.MeanGlobal[X])
+ MeanCorrectedGoodVectors.append(NewVector)
+ for Vector in self.BadVectors:
+ NewVector = []
+ for X in range(Size):
+ NewVector.append(Vector[X] - self.MeanGlobal[X])
+ MeanCorrectedBadVectors.append(NewVector)
+ ############################################################
+ # Compute covariance matrices:
+ CovarArrayGood = self.GetCovarianceArray(MeanCorrectedGoodVectors)
+ if VerboseFlag:
+ print "CovarArrayGood:", CovarArrayGood
+ CovarArrayBad = self.GetCovarianceArray(MeanCorrectedBadVectors)
+ if VerboseFlag:
+ print "CovarArrayBad:", CovarArrayBad
+            # CovarArrayFull is the pooled within-group covariance matrix; it is
+            # computed componentwise as a weighted sum of CovarArrayGood and CovarArrayBad.
+ CovarArrayFull = zeros((Size, Size), FloatType)
+ for X in range(Size):
+ for Y in range(Size):
+ CovarArrayFull[X][Y] += CovarArrayGood[X][Y] * GoodCount / AllCount
+ CovarArrayFull[X][Y] += CovarArrayBad[X][Y] * BadCount / AllCount
+ if VerboseFlag:
+ print "CovarArrayFull:", CovarArrayFull
+ ############################################################
+ # Invert the covariance array:
+ try:
+ self.CI = InvertMatrix(CovarArrayFull)
+ except:
+ traceback.print_exc()
+ print "Unable to invert covariance matrix! Invalid feature set."
+ return 0
+ if VerboseFlag:
+ print "CI:", self.CI
+ self.GoodMuC = MatrixMulitply(self.CI, self.MeanGood)
+ if VerboseFlag:
+ print "GoodMuC:", self.GoodMuC
+ self.BadMuC = MatrixMulitply(self.CI, self.MeanBad)
+ if VerboseFlag:
+ print "BadMuC:", self.BadMuC
+ self.ConstantGood = -MatrixMulitply(self.MeanGood, self.GoodMuC) / 2.0
+ self.ConstantBad = -MatrixMulitply(self.MeanBad, self.BadMuC) / 2.0
+ if VerboseFlag:
+ print "Constant good %.4f constant bad %.4f"%(self.ConstantGood, self.ConstantBad)
+ #######################################################
+ if VerboseFlag:
+ # Print C initializers:
+ for X in range(Size):
+ Str = "double CovInv%s[] = {"%chr(ord("A") + X)
+ for Y in range(Size):
+ Str += "%.3f,"%self.CI[X][Y]
+ Str = Str[:-1] + "};"
+ print Str
+ Str = "double MeanVectorTrue[] = {"
+ for X in range(Size):
+ Str += "%.3f,"%self.MeanGood[X]
+ Str = Str[:-1] + "};"
+ print Str
+ Str = "double MeanVectorFalse[] = {"
+ for X in range(Size):
+ Str += "%.3f,"%self.MeanBad[X]
+ Str = Str[:-1] + "};"
+ print Str
+ print "double SubProdTrue = (float)%.3f;"%self.ConstantGood
+ print "double SubProdFalse = (float)%.3f;"%self.ConstantBad
+ print "CG and CB:", self.ConstantGood, self.ConstantBad
+ #######################################################
+ Weights = []
+ for X in range(Size):
+ Weights.append(self.GoodMuC[X] - self.BadMuC[X])
+ Str = "*-*->Weights:"
+ for X in range(Size):
+ Str += " %s: %.4f"%(X, Weights[X])
+ print Str
+ #######################################################
+ # Compute our accuracy on the testing set:
+ if FoldValidation:
+ CorrectCount = 0
+ IncorrectCount = 0
+ for Vector in self.GoodTestVectors:
+ NewVector = []
+ for X in range(Size):
+ NewVector.append(Vector[X] - self.MeanGlobal[X])
+ Reading = self.GetReading(NewVector)
+ if Reading > 0:
+ CorrectCount += 1
+ else:
+ IncorrectCount += 1
+ for Vector in self.BadTestVectors:
+ NewVector = []
+ for X in range(Size):
+ NewVector.append(Vector[X] - self.MeanGlobal[X])
+ Reading = self.GetReading(NewVector)
+ if Reading > 0:
+ IncorrectCount += 1
+ else:
+ CorrectCount += 1
+ TotalCount = CorrectCount + IncorrectCount
+ Accuracy = CorrectCount / float(TotalCount)
+ print "Cross-validation accuracy: %d of %d (%.3f%%)"%(CorrectCount, TotalCount, Accuracy*100)
+ WorstAccuracy = min(Accuracy, WorstAccuracy)
+ else:
+ # Compute accuracy on all vectors:
+ CorrectCount = 0
+ IncorrectCount = 0
+ for Vector in MeanCorrectedGoodVectors:
+                    Reading = self.GetReading(Vector)
+ if Reading > 0:
+ CorrectCount += 1
+ else:
+ IncorrectCount += 1
+ for Vector in MeanCorrectedBadVectors:
+                    Reading = self.GetReading(Vector)
+ if Reading > 0:
+ IncorrectCount += 1
+ else:
+ CorrectCount += 1
+ TotalCount = CorrectCount + IncorrectCount
+ Accuracy = CorrectCount / float(TotalCount)
+ print "Accuracy: %d of %d (%.3f%%)"%(CorrectCount, TotalCount, Accuracy*100)
+ WorstAccuracy = min(Accuracy, WorstAccuracy)
+ print "Min. cross-validation accuracy: %.3f%%"%(WorstAccuracy*100)
+ return WorstAccuracy
+ def GetReading(self, Vector):
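+        # Standard two-class linear discriminant with pooled inverse covariance CI
+        # and class means MeanGood / MeanBad:
+        #   Reading = (MeanGood . CI . x + ConstantGood) - (MeanBad . CI . x + ConstantBad)
+        # where ConstantGood = -0.5 * MeanGood . CI . MeanGood (and likewise for Bad);
+        # a positive reading favors the "good" class.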
+ CIProduct = MatrixMulitply(self.CI, Vector)
+ ReadingGood = MatrixMulitply(self.MeanGood, CIProduct) + self.ConstantGood
+ ReadingBad = MatrixMulitply(self.MeanBad, CIProduct) + self.ConstantBad
+ print
+ print "Vector:", Vector
+ print "CIProduct:", CIProduct
+ print "ReadingGood %s ReadingBad %s Net %s"%(ReadingGood, ReadingBad, ReadingGood - ReadingBad)
+ return (ReadingGood - ReadingBad)
+ def ReportROCCurve(self):
+ SortedList = []
+ MeanCorrectedGoodVectors = []
+ MeanCorrectedBadVectors = []
+ PositiveCount = len(self.GoodVectors)
+ NegativeCount = len(self.BadVectors)
+ Size = len(self.GoodVectors[0])
+ for Vector in self.GoodVectors:
+ NewVector = []
+ for X in range(Size):
+ NewVector.append(Vector[X] - self.MeanGlobal[X])
+ CIProduct = MatrixMulitply(self.CI, NewVector)
+ ReadingGood = MatrixMulitply(self.MeanGood, CIProduct) + self.ConstantGood
+ ReadingBad = MatrixMulitply(self.MeanBad, CIProduct) + self.ConstantBad
+ SortedList.append((ReadingGood - ReadingBad, 1))
+ for Vector in self.BadVectors:
+ NewVector = []
+ for X in range(Size):
+ NewVector.append(Vector[X] - self.MeanGlobal[X])
+ CIProduct = MatrixMulitply(self.CI, NewVector)
+ ReadingGood = MatrixMulitply(self.MeanGood, CIProduct) + self.ConstantGood
+ ReadingBad = MatrixMulitply(self.MeanBad, CIProduct) + self.ConstantBad
+ SortedList.append((ReadingGood - ReadingBad, 0))
+ SortedList.sort()
+ SortedList.reverse()
+        TPCount = 0
+        FPCount = 0
+        Area = 0
+        HappyTPRate = 0.0
+ ROCCurveFile = open("ROCCurve.txt", "wb")
+ for (Reading, TrueFlag) in SortedList:
+ #print Reading, TrueFlag
+ if (TrueFlag):
+ TPCount += 1
+ else:
+ FPCount += 1
+ Area += (TPCount / float(PositiveCount))
+ TPRate = TPCount / float(PositiveCount)
+ FPRate = FPCount / float(NegativeCount)
+ ROCCurveFile.write("%s\t%s\t\n"%(FPRate, TPRate))
+ if FPRate < 0.05:
+ HappyTPRate = TPRate
+ Area /= float(FPCount)
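+        # Area is the average true-positive rate observed at each false positive,
+        # i.e. the area under the step-function ROC curve traced above.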
+ print "ROC curve area:", Area
+ print "TP rate for FP < 0.05: %s"%HappyTPRate
+ return Area
+ def ProducePValueCurve(self):
+ Histogram = {}
+ HistogramShort = {}
+ HistogramMedium = {}
+ HistogramLong = {}
+ MediumCutoff = 9
+ LongCutoff = 13
+ for X in range(len(self.BadVectors)):
+ Vector = self.BadVectors[X]
+ MQScore = self.GetReading(Vector)
+ Bin = int(round(MQScore * 10))
+ if not Histogram.has_key(Bin):
+ Histogram[Bin] = [0,0]
+ if not HistogramShort.has_key(Bin):
+ HistogramShort[Bin] = [0,0]
+ if not HistogramMedium.has_key(Bin):
+ HistogramMedium[Bin] = [0,0]
+ if not HistogramLong.has_key(Bin):
+ HistogramLong[Bin] = [0,0]
+ #Len = self.BadVectorPepLengths[X]
+ Histogram[Bin][0] += 1
+ PrintHistogram(Histogram, None)
+
+def FeatureSelectMain():
+ BestAccuracy = 0
+ FeatureList = [6, 49]
+ FeatureCount = 55
+ while len(FeatureList)<12:
+ BestAccuracy = 0
+ BestList = FeatureList
+ for FeatureA in range(4, FeatureCount):
+ if FeatureA in FeatureList:
+ continue
+ if FeatureA in ForbiddenFeatures:
+ continue
+ AugmentedFeatureList = FeatureList[:]
+ AugmentedFeatureList.append(FeatureA)
+ LDA = LDAClassifier()
+ LDA.PerformLDA("TrainingSet.Table.txt", 0, AugmentedFeatureList, 1, 0)
+ Accuracy = LDA.ReportROCCurve()
+ if Accuracy > BestAccuracy:
+ BestAccuracy = Accuracy
+ BestList = AugmentedFeatureList
+ print "Feature set %s has accuracy %.4f%%"%(AugmentedFeatureList, Accuracy*100)
+ print "So far...best accuracy %.4f%%, feature set %s"%(BestAccuracy*100, BestList)
+ FeatureList = BestList
+ print "Best accuracy %s, feature list %s"%(BestAccuracy, FeatureList)
+
+
+def Main():
+ LDA = LDAClassifier()
+ LDA.PerformLDA("LDATrainingSet.txt", 0, [1, 5], 0, 0)
+ LDA.ReportROCCurve()
+ #LDA.ProducePValueCurve()
+
+if __name__ == "__main__":
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "(psyco not loaded)"
+ Main()
diff --git a/Label.py b/Label.py
new file mode 100644
index 0000000..132458c
--- /dev/null
+++ b/Label.py
@@ -0,0 +1,576 @@
+#Title: Label.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent@ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+import sys
+import os
+import getopt
+import PyInspect
+import MakeImage
+from Utils import *
+import MSSpectrum
+import GetByteOffset
+import PLSUtils
+UsageInfo = """
+Label.py - Generate a labeled spectrum, given a peptide.
+
+Required Options
+ -r [FileName] Spectrum file
+ -b [Offset] The byte offset in the file for the spectrum, as reported in
+     the Inspect output; it can be left blank for single-spectrum files
+ -a [Peptide] The annotation for the spectrum
+ -c [Charge] The charge of the peptide
+
+Additional Options
+ -w [FileName] Output file name. Default is temp.png
+ -v [FileName] Write verbose scoring details to the specified file
+ -d [Width]: Image width
+ -h [Height]: Image height
+ -s [ScanNumber]: Scan number
+ -x: Use black and white (for Printing)
+Example:
+ Label.py -r Sample346.ms2 -b 38289818 -a R.A+226LLAAFDFPFR.K
+"""
+
+class LabelClass:
+ def __init__(self):
+ self.SpectrumPath = None
+ self.SpectrumFilePos = 0
+ self.Peptide = None
+ self.OutputFileName = "temp.png"
+ self.VerboseFileName = None
+ self.LabeledPeaks = None
+ self.InspectFeatures = None
+ self.InspectFeatureNames = ["MQScore", "Length", "Total Cut Score", "Median Cut Score", "Y present", "B present", "Intensity in BY", "NTT"]
+ self.AutoPopUp = 1
+ self.PeptideHasPhosphorylation = 0
+ self.InstrumentType = "ESI-ION-TRAP" # or QTOF or FT-HYBRID
+ self.Charge = 0 #guessed or set by user.
+ self.ImageWidth = 600
+ self.ImageHeight = 400
+ self.ScanNumber = None
+ self.DoPLS = 0 #don't do this by default
+ def ParseCommandLineSimple(self, Arguments):
+ self.SpectrumPath = Arguments[0]
+ ColonPos = self.SpectrumPath.rfind(":")
+ try:
+ self.SpectrumFilePos = int(self.SpectrumPath[ColonPos + 1:])
+ self.SpectrumPath = self.SpectrumPath[:ColonPos]
+ except:
+ self.SpectrumFilePos = 0
+ self.Peptide = GetPeptideFromModdedName(Arguments[1])
+ if Arguments[1].find("phos") > 0:
+ self.PeptideHasPhosphorylation = 1
+ if len(Arguments) > 2:
+ self.OutputFileName = Arguments[2]
+ def ParseCommandLine(self, Arguments):
+ # Hack:
+ if len(Arguments) > 0 and Arguments[0][1] == ":":
+ return self.ParseCommandLineSimple(Arguments)
+ (Options, Args) = getopt.getopt(Arguments, "r:b:s:a:w:v:pi:c:d:h:s:xP")
+ OptionsSeen = {}
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-a":
+ Annotation = Value
+ self.Peptide = GetPeptideFromModdedName(Annotation)
+ if Annotation.find("phos") > 0:
+ self.PeptideHasPhosphorylation = 1
+ elif Option == "-r":
+ self.SpectrumPath = Value
+ elif Option == "-b":
+ self.SpectrumFilePos = int(Value)
+ elif Option == "-w":
+ self.OutputFileName = Value
+ elif Option == "-v":
+ self.VerboseFileName = Value
+ elif Option == "-i":
+ self.InstrumentType = Value
+ elif Option == "-c":
+ self.Charge = int(Value)
+ elif Option == "-p":
+ self.AutoPopUp = 0
+                #secret option to suppress the image from popping up on the screen
+ elif Option == "-d":
+ self.ImageWidth = int(Value)
+ elif Option == "-h":
+ self.ImageHeight = int(Value)
+ elif Option == "-s":
+ self.ScanNumber = int(Value)
+ elif Option == "-x":
+ MakeImage.SetColors(1)
+ elif Option == "-P":
+ self.DoPLS = 1
+ else:
+ raise ValueError, "* Unknown option %s"%Option
+ # Filename and annotation are required. (Byte position is optional,
+ # since there are many single-scan .dta files out there)
+ if not OptionsSeen.has_key("-a") or not OptionsSeen.has_key("-r"):
+ print UsageInfo
+ sys.exit(1)
+ def Main(self):
+ if self.ScanNumber != None: # scan number is provided in input
+ # get byte offset using scan number
+ Abacus = GetByteOffset.Abacus()
+ self.ScanOffset = Abacus.GetByteOffset(self.SpectrumPath)
+ self.SpectrumFilePos = self.ScanOffset[self.ScanNumber]
+ print "ByteOffset # = %s"%self.SpectrumFilePos
+ self.LabelPeaks()
+ #self.ConvertDoublyChargedPeakLabelsOLD()
+ self.LabeledPeaks = self.ConvertDoublyChargedPeakLabels(self.LabeledPeaks, self.Peptide)
+ #self.ConvertYPeakNumberingOLD()
+ self.LabeledPeaks = self.ConvertYPeakNumbering(self.LabeledPeaks)
+ self.ConvertParentLossLabels()
+ self.VerboseOutput()
+ Maker = MakeImage.MSImageMaker(Width = self.ImageWidth, Height = self.ImageHeight)
+ Maker.ConvertPeakAnnotationToImage(self.LabeledPeaks, self.OutputFileName, self.Peptide,
+ Width = self.ImageWidth, Height = self.ImageHeight)
+ #if self.AutoPopUp:
+ # os.startfile(self.OutputFileName)
+ def VerboseOutput(self):
+ """
+ Extra output for the curious
+ 1. Inspect scoring features
+ """
+ if not self.VerboseFileName:
+ return
+ ##1. Inspect scoring features
+ VerboseHandle = open(self.VerboseFileName, "wb")
+ VerboseHandle.write("M/z %f\n"%self.MZ)
+ VerboseHandle.write("Annotation %s\n"%self.Peptide.GetFullModdedName())
+        VerboseHandle.write("ParentMass: Hypothetical, Observed, Error: %.2f, %.2f, %.2f\n"%(self.HypotheticalParentMass, self.ObservedParentMass, self.ObservedParentMassError))
+ for Index in range(len(self.InspectFeatures)):
+ VerboseHandle.write("%s\t%.3f\n"%(self.InspectFeatureNames[Index],self.InspectFeatures[Index]))
+ ##If they want the Phosphate localization Score do it here.
+ if self.DoPLS:
+ PLS = self.CalculatePLS()
+ if PLS:
+ #here it is possible that we get a different peptide as winner
+ VerboseHandle.write("Phosphate Localization Score: %.3f\n"%PLS[0])
+ if len(PLS) > 1:
+ VerboseHandle.write("WARNING: Better annotation than input. %.4f, %s"%(PLS[1], PLS[2]))
+ else:
+ VerboseHandle.write("Phosphate Localization Score: N/A\n")
+
+ ## 2. Write out the peaks found, not found
+ String = self.GetFoundPeaksTable()
+ VerboseHandle.write("\n\nPeaksFoundTable\n%s\n\n"%String)
+
+ ## 3. Dump the peak list
+ VerboseHandle.write("Mass\tIntensity\tLabel\tAminoIndex\n")
+ for Tuple in self.LabeledPeaks:
+ Label = Tuple[2]
+ if not Label:
+ Label = "UnLabeled"
+ Str = "%f\t%f\t%s\t%d\n"%(Tuple[0], Tuple[1], Label, Tuple[3])
+ VerboseHandle.write(Str)
+ VerboseHandle.write("\n\n")
+
+ VerboseHandle.close()
+
+ def GetFoundPeaksTable(self):
+ ## Get masses from Peptide object
+ ## mark the ones found with bold something
+ IonMasses = {} #key = IonName, value\tvalue\t
+ IonsFound = {} #key = IonNameAndIndex, value = 1 if found
+ IonNamesSorted = ("b2", "b", "y", "y2")
+ ##Mark Ions found
+ for Tuple in self.LabeledPeaks:
+ Label = Tuple[2]
+ if not Label:
+ continue
+ Label = Label.lower()
+ if Label in IonNamesSorted:
+ Index = Tuple[3]
+ Key = "%s:%s"%(Label,Index)
+ IonsFound[Key] = 1
+ #print "%s\t%s"%(Key, Tuple)
+ ##get predicted values
+ ReturnString = ""
+ for IonName in IonNamesSorted:
+ IonMasses[IonName] = "%s\t"%IonName
+ Ion = Global.AllIonDict[IonName]
+ for Index in range(1, len(self.Peptide.Masses)-1):
+ Mass = self.Peptide.Masses[Index] # offset by 1, since mass 0 is in there.
+ MassForIonType = Ion.GetPeakMass(Mass,self.Peptide.GetParentMass())
+ if IonName[0] == "b":
+ Key = "%s:%s"%(IonName,Index)
+ else:
+ NewIndex = len(self.Peptide.Aminos) - Index
+ Key = "%s:%s"%(IonName,NewIndex)
+ if IonsFound.has_key(Key):
+ #print "I found %s at mass %s"%(Key,MassForIonType)
+ IonMasses[IonName]+="F%.3f\t"%MassForIonType
+ else:
+ IonMasses[IonName]+="%.3f\t"%MassForIonType
+ #print "%s, %s"%(Key, MassForIonType)
+ ## Make String now that we are done with this IonName
+ ReturnString += "%s\n"%IonMasses[IonName]
+ if IonName == "b":
+ ReturnString += " \t%s\n"%self.Peptide.GetModdedName()
+ return ReturnString
+
+ def CalculatePLS(self):
+        """This function calculates the Phosphate Localization Score, which is the ambiguity score
+        for phosphorylation placement at a specific amino acid in the sequence. This
+        is reported in Albuquerque et al. Mol Cell Prot 2008.
+        """
+ ##1. Get all the potential annotations for the peptide
+ ## If there are none, then the score is "N/A"
+ ##2. Get the score of each alternate annotation
+        ##3. Determine winner and runner up
+        ##4. Calculate PLS
+ Abacus = PLSUtils.PLSClass()
+ PotentialAnnotations = Abacus.GetAlternateAnnotations(self.Peptide)
+
+ if len(PotentialAnnotations) == 0:
+ return None
+ ## 2. Try each individual annotation, keeping track of the top and runner up
+ BestAlternateMQScore = -10
+ RunnerUpAlternateMQScore = -10
+ BestAlternatePeakList = None
+ RunnerUpAlternatePeakList = None
+ BestAlternatePeptide = None
+ RunnerUpAlternatePeptide = None
+ for Annotation in PotentialAnnotations:
+ NewPeptide = GetPeptideFromModdedName(Annotation) # needed for peak label conversion
+ NewPeptide.Prefix = self.Peptide.Prefix
+ NewPeptide.Suffix = self.Peptide.Suffix
+ ##have to load it each time, it was not getting the correct results if I reused the same object
+ PySpectrum = PyInspect.Spectrum(self.SpectrumPath, self.SpectrumFilePos)
+ PySpectrum.SetParentMass(self.HypotheticalParentMass, self.Charge)
+ Features = PySpectrum.ScorePeptideDetailed(Annotation, self.Charge)
+ PeakAnnotations = PySpectrum.LabelPeaks(Annotation, self.Charge)
+ MQScore = Features[0]
+ #print "The score is %s, %s"%(MQScore, Annotation)
+ if MQScore > BestAlternateMQScore:
+ ##swap with runner up
+ RunnerUpAlternateMQScore = BestAlternateMQScore
+ RunnerUpAlternatePeakList = BestAlternatePeakList
+ RunnerUpAlternatePeptide = BestAlternatePeptide
+ BestAlternateMQScore = MQScore
+ BestAlternatePeakList = PeakAnnotations
+ BestAlternatePeptide = NewPeptide
+ elif MQScore > RunnerUpAlternateMQScore:
+ RunnerUpAlternateMQScore = MQScore
+ RunnerUpAlternatePeakList = PeakAnnotations
+ RunnerUpAlternatePeptide = NewPeptide
+        ## 3. Determine the real winner and runner up. We assume that the original Inspect
+        ## annotation is right, unless something beats it by 0.3 units. That threshold is
+        ## hard-coded magic, but it was tested and performs better than the values in range(0, 1, 0.1);
+        ## when it is exceeded, we swap out the top annotation.
+ if BestAlternateMQScore > (self.MQScore + 0.3):
+ TopPeakList = BestAlternatePeakList
+ TopPeptide = BestAlternatePeptide
+ TopMQScore = BestAlternateMQScore
+ #now also consider the fate of the RU score
+ if RunnerUpAlternateMQScore > (self.MQScore + 0.3):
+ RunnerUpPeakList = RunnerUpAlternatePeakList
+ RunnerUpPeptide = RunnerUpAlternatePeptide
+ RunnerUpMQScore = RunnerUpAlternateMQScore
+ else:
+ RunnerUpPeakList = self.LabeledPeaks
+ RunnerUpPeptide = self.Peptide
+ RunnerUpMQScore = self.MQScore
+ else:
+ TopPeakList = self.LabeledPeaks
+ TopPeptide = self.Peptide
+ TopMQScore = self.MQScore
+ RunnerUpPeakList = BestAlternatePeakList
+ RunnerUpPeptide = BestAlternatePeptide
+ RunnerUpMQScore = BestAlternateMQScore
+ ## 4. Find the distinguishing peaks between the top 2
+ #print "Winner ", TopPeptide.GetFullModdedName()
+ #print "runner up", RunnerUpPeptide.GetFullModdedName()
+ TopPeakList = self.ConvertDoublyChargedPeakLabels(TopPeakList, TopPeptide)
+ TopPeakList = self.ConvertYPeakNumbering(TopPeakList)
+ RunnerUpPeakList = self.ConvertDoublyChargedPeakLabels(RunnerUpPeakList, RunnerUpPeptide)
+ RunnerUpPeakList = self.ConvertYPeakNumbering(RunnerUpPeakList)
+ DistinguishingPeakList = Abacus.GetDistinguishingPeaks(TopPeptide, RunnerUpPeptide)
+ #print "finding peaks for %s"%TopPeptide.GetModdedName()
+ nWinner = Abacus.GetSupportingPeaks(TopPeakList, DistinguishingPeakList)
+ #print "finding peaks for %s"%RunnerUpPeptide.GetModdedName()
+ nRunnerUp = Abacus.GetSupportingPeaks(RunnerUpPeakList, DistinguishingPeakList)
+ ## 4.5 Here we take a slight detour. If nWinner < nRunnerUp, then PLS predicts something
+        ## different from Inspect. This happens; scoring functions can have different opinions.
+ ## We simply swap the two and call it a day.
+ #print "Getting the ambuiguity score with %s, %s, %s (top, ru, total)"%(nWinner, nRunnerUp, len(DistinguishingPeakList))
+ AmbuigityScore = Abacus.ComputePLS(len(DistinguishingPeakList), nWinner, nRunnerUp)
+ if AmbuigityScore < 0:
+ ## means that nWinner < nRunnerUp
+ AmbuigityScore *= -1
+ ##now we shamelessly dump the top guy
+ TopMQScore = RunnerUpMQScore
+ TopPeptide = RunnerUpPeptide
+ #print "Ascore is %s"%AmbuigityScore
+ if not TopMQScore == self.MQScore:
+ print "WARNING::Top score was %.2f for peptide %s"%(TopMQScore, TopPeptide.GetModdedName())
+ print "\tInput was %.2f and %s"%(self.MQScore, self.Peptide.GetModdedName())
+ return (AmbuigityScore, TopMQScore, TopPeptide.GetFullModdedName())
+ return (AmbuigityScore,)
+
+
+ def ConvertParentLossLabels(self):
+ """
+        Special case for phosphorylated spectra. Change the label
+ 'Parent loss' to M-p or M-p-h2o.
+ """
+ PhosLoss = 98.0 / self.Charge
+ PhosWaterLoss = 116.0 / self.Charge
+ PhosLabel = "M-p"
+ PhosWaterLabel = "M-p-h2o"
+ Error = 3.0
+ for Index in range(len(self.LabeledPeaks)):
+ Tuple = self.LabeledPeaks[Index]
+ Label = Tuple[2]
+ if not Label == "Parent loss":
+ continue
+ Mass = Tuple[0]
+ Diff = abs(Mass - self.MZ)
+ MaybePhosLoss = abs(Diff - PhosLoss)
+ #print Tuple
+ if MaybePhosLoss < Error:
+ NewTuple = (Tuple[0], Tuple[1], PhosLabel, Tuple[3])
+ self.LabeledPeaks[Index] = NewTuple
+ #print self.LabeledPeaks[Index]
+ continue
+ MaybePhosWaterLoss = abs(Diff - PhosWaterLoss)
+ if MaybePhosWaterLoss < Error:
+ NewTuple = (Tuple[0], Tuple[1], PhosWaterLabel, Tuple[3])
+ self.LabeledPeaks[Index] = NewTuple
+ #print self.LabeledPeaks[Index]
+ def ConvertDoublyChargedPeakLabelsOLD(self):
+ """
+        The Inspect output does not distinguish between singly and doubly charged peaks,
+        so in order for labeling to go well, we have to rewrite the labels as B2, Y2, etc.
+ """
+ for Index in range(len(self.LabeledPeaks)):
+ Tuple = self.LabeledPeaks[Index]
+ Label = Tuple[2]
+ TupleMass = Tuple[0]
+ AminoIndex = Tuple[3]
+ NewLabel = None
+ PeptideMass = self.Peptide.Masses[AminoIndex]
+ if Label == "B":
+ PeptideMass += 1.0
+ if abs(PeptideMass - TupleMass) > 5:
+ NewLabel = "B2"
+ if Label == "Y":
+ PeptideMass = self.Peptide.GetParentMass() - PeptideMass
+ if abs(PeptideMass - TupleMass) > 5:
+                    #a doubly charged peak; no isotope or error is this big
+ NewLabel = "Y2"
+ if Label == "Y loss": #hacky, but I can't think of a good way
+ YPeptideMass = self.Peptide.GetParentMass() - PeptideMass
+ Found = 0
+ for CommonLoss in [17, 18, 98]:
+ YLossMass = YPeptideMass - CommonLoss
+ if abs(YLossMass - TupleMass) < 5:
+ Found = 1
+ break
+ if not Found:
+ NewLabel = "Y2 Loss"
+ if Label == "B loss": #hacky, but I can't think of a good way
+ BPeptideMass = PeptideMass + 1
+ Found = 0
+ for CommonLoss in [17, 18, 98]:
+ BLossMass = BPeptideMass - CommonLoss
+ if abs(BLossMass - TupleMass) < 5:
+ Found = 1
+ break
+ if not Found:
+ NewLabel = "B2 Loss"
+
+ if NewLabel:
+ NewTuple = (Tuple[0], Tuple[1], NewLabel, Tuple[3])
+ self.LabeledPeaks[Index] = NewTuple
+ def ConvertDoublyChargedPeakLabels(self, Peaks, Peptide):
+ """
+        The Inspect output does not distinguish between singly and doubly charged peaks,
+        so in order for labeling to go well, we have to rewrite the labels as B2, Y2, etc.
+ """
+ for Index in range(len(Peaks)):
+ Tuple = Peaks[Index]
+ Label = Tuple[2]
+ TupleMass = Tuple[0]
+ AminoIndex = Tuple[3]
+ if abs(Tuple[0] - 402) < 1:
+ Verbose = 1
+ else:
+ Verbose = 0
+ NewLabel = None
+ PeptideMass = Peptide.Masses[AminoIndex]
+ if Label == "B":
+ PeptideMass += 1.0
+ if abs(PeptideMass - TupleMass) > 5:
+ NewLabel = "B2"
+ if Label == "Y":
+ PeptideMass = Peptide.GetParentMass() - PeptideMass
+ if abs(PeptideMass - TupleMass) > 5:
+                    #a doubly charged peak; no isotope or error is this big
+ NewLabel = "Y2"
+ if Label == "Y loss": #hacky, but I can't think of a good way
+ YPeptideMass = Peptide.GetParentMass() - PeptideMass
+ Found = 0
+ for CommonLoss in [17, 18, 98]:
+ YLossMass = YPeptideMass - CommonLoss
+ if abs(YLossMass - TupleMass) < 5:
+ Found = 1
+ break
+ if not Found:
+ NewLabel = "Y2 Loss"
+ if Label == "B loss": #hacky, but I can't think of a good way
+ BPeptideMass = PeptideMass + 1
+ Found = 0
+ for CommonLoss in [17, 18, 98]:
+ BLossMass = BPeptideMass - CommonLoss
+ if abs(BLossMass - TupleMass) < 5:
+ Found = 1
+ break
+ if not Found:
+ NewLabel = "B2 Loss"
+
+ if NewLabel:
+ NewTuple = (Tuple[0], Tuple[1], NewLabel, Tuple[3])
+ Peaks[Index] = NewTuple
+ return Peaks
+
+ def ConvertYPeakNumberingOLD(self):
+ """
+        The amino indices are numbered from the N- to C-terminus, but MakeImage numbers
+ its Y peaks from y1 (nearest the C-terminus) upwards. We re-number them here.
+ """
+ TempList = self.LabeledPeaks
+ self.LabeledPeaks = [] #clean it out
+ for Tuple in TempList:
+ Label = Tuple[2]
+            if not Label: #not a labeled peak. proceed normally
+ self.LabeledPeaks.append(Tuple)
+ continue
+ if not Label[0] == "Y":
+ self.LabeledPeaks.append(Tuple)
+ continue
+            ## should only have y derivatives here. switch indices
+ AminoIndex = Tuple[-1]
+ #print Tuple
+ NewIndex = len(self.Peptide.Aminos) - AminoIndex
+ NewTuple = (Tuple[0], Tuple[1], Tuple[2], NewIndex)
+ self.LabeledPeaks.append(NewTuple)
+ def ConvertYPeakNumbering(self, Peaks):
+ """ SAME, just takes a parameter. I know it's messy.
+        The amino indices are numbered from the N- to C-terminus, but MakeImage numbers
+ its Y peaks from y1 (nearest the C-terminus) upwards. We re-number them here.
+ """
+ TempList = Peaks
+ Peaks = [] #clean it out
+ for Tuple in TempList:
+ Label = Tuple[2]
+            if not Label: #not a labeled peak. proceed normally
+ Peaks.append(Tuple)
+ continue
+ if not Label[0] == "Y":
+ Peaks.append(Tuple)
+ continue
+            ## should only have y derivatives here. switch indices
+ AminoIndex = Tuple[-1]
+ #print Tuple
+ NewIndex = len(self.Peptide.Aminos) - AminoIndex
+ NewTuple = (Tuple[0], Tuple[1], Tuple[2], NewIndex)
+ Peaks.append(NewTuple)
+ return Peaks
+
+ def LabelPeaks(self):
+ """
+ Uses PyInspect to label peaks in the spectrum according to Inspect's scoring
+ PyInspect will always be current, so let's use it.
+ """
+ ## load a spectrum, set charge, parent mass, then label the peaks
+ PySpectrum = PyInspect.Spectrum(self.SpectrumPath, self.SpectrumFilePos)
+ self.MZ = PySpectrum.GetMZ()
+ #print "m/z is %f"%self.MZ
+ ParentMass = self.Peptide.GetParentMass()
+ if not self.Charge: ## Guess charge if not input
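+            # For each candidate charge z, the implied (singly protonated) parent
+            # mass is (m/z) * z - (z - 1) * 1.0078; the charge whose implied mass
+            # is closest to the peptide's computed parent mass is chosen.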
+ BestDiff = 99999
+ for Charge in range(1, 5):
+ ParentMassFromCharge = self.MZ * Charge - (Charge - 1)*1.0078
+ Diff = abs(ParentMass - ParentMassFromCharge)
+ if Diff < BestDiff:
+ BestDiff = Diff
+ BestCharge = Charge
+ BestMass = ParentMassFromCharge
+ self.Charge = BestCharge
+ print "Appears to be charge %d with mass %.2f (oracle %.2f, error %.2f)"%(self.Charge, BestMass, ParentMass, BestDiff)
+ if BestDiff > 5:
+ print "\n** WARNING: Parent mass is off by %.2f!\n"%BestDiff
+        else: #charge given, calculate observed mass
+ BestMass = self.MZ * self.Charge - (self.Charge - 1)*1.0078
+ self.HypotheticalParentMass = ParentMass
+ self.ObservedParentMass = BestMass
+ self.ObservedParentMassError = abs(self.HypotheticalParentMass - self.ObservedParentMass)
+ PySpectrum.SetParentMass(ParentMass, self.Charge)
+ Annotation = self.Peptide.GetModdedName() # lacks prefix/suffix
+ ## self.LabeledPeaks is list of (Mass, intensity, ion, amino index)
+ self.LabeledPeaks = PySpectrum.LabelPeaks(Annotation, self.Charge)
+ if self.VerboseFileName:
+ self.InspectFeatures = PySpectrum.ScorePeptideDetailed(Annotation, self.Charge)
+ print "The MQScore for %s is %f"%(Annotation, self.InspectFeatures[0])
+ self.MQScore = self.InspectFeatures[0]
+
+
+def LabelSpectrum(Spectrum, Peptide, PeakTolerance):
+ Labeler = LabelClass()
+ Labeler.Peptide = Peptide
+ Labeler.SpectrumPath = Spectrum.FilePath
+ Labeler.SpectrumFilePos = Spectrum.FilePos
+ #print "Label.LabelSpectrum(%s:%s)"%(Spectrum.FilePath, Spectrum.FilePos)
+ Labeler.LabelPeaks()
+ # Paired iteration through Spectrum.Peaks and Labeler.LabeledPeaks:
+ IndexA = 0
+ IndexB = 0
+ while IndexA < len(Spectrum.Peaks) and IndexB < len(Labeler.LabeledPeaks):
+ Diff = Spectrum.Peaks[IndexA].Mass - Labeler.LabeledPeaks[IndexB][0]
+ if Diff > 0.01:
+ # Mass A is too large; let B catch up
+ IndexB += 1
+ continue
+        if Diff < -0.01:
+            # Mass A is too small; iterate forward
+            IndexA += 1
+            continue
+        # Masses agree to within 0.01; copy the label onto the matching peak,
+        # then advance both lists.
+        Spectrum.Peaks[IndexA].IonType = Labeler.LabeledPeaks[IndexB][2]
+        Spectrum.Peaks[IndexA].AminoIndex = Labeler.LabeledPeaks[IndexB][3]
+        IndexA += 1
+        IndexB += 1
+    return Spectrum
+
+if __name__ == "__main__":
+ Dymo = LabelClass()
+ Dymo.ParseCommandLine(sys.argv[1:])
+ Dymo.Main()
diff --git a/Learning.py b/Learning.py
new file mode 100644
index 0000000..2562565
--- /dev/null
+++ b/Learning.py
@@ -0,0 +1,1276 @@
+#Title: Learning.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent@ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+The LearnerClass is an abstract machine learner. It can be trained,
+saved, and loaded.
+"""
+import os
+import sys
+import struct
+import random
+import math
+import traceback
+import cPickle
+import LDA
+import RunPySVM
+try:
+ import PySVM
+except:
+ print "(Warning: PySVM not present!)"
+
+try:
+ from numpy import *
+ import numpy.linalg
+ FloatType = float
+ MatrixMultiply = dot
+ InvertMatrix = numpy.linalg.inv
+except:
+ print "\n* Warning: Unable to import NumPy. Logit training not available"
+ print " Please install NumPy (see http://numpy.scipy.org/ for details)"
+ print " Error details are shown here:"
+ traceback.print_exc()
+
+random.seed(1)
+
+MaxSVMFeatureCount = 500
+
+if sys.platform == "win32":
+ PATH_SVMSCALE = r"C:\libsvm\windows\svmscale.exe"
+ PATH_SVMTRAIN = r"C:\libsvm\windows\svmtrain.exe"
+else:
+ PATH_SVMSCALE = os.path.join(os.environ["HOME"], "libsvm", "svm-scale")
+ PATH_SVMTRAIN = os.path.join(os.environ["HOME"], "libsvm", "svm-train")
+
+SQRT2PI = math.sqrt(2 * math.pi)
+SQRT2 = math.sqrt(2)
+Cof = [76.18009172947146, -86.50532032941677,
+ 24.01409824083091, -1.231739572450155,
+ 0.1208650973866179e-2, -0.5395239384952e-5]
+
+def Gamma(Z):
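+    # Lanczos series approximation of the gamma function (Cof holds the standard
+    # six-term Lanczos coefficients); returns Gamma(Z) for Z > 0.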
+ X = Z
+ Y = Z
+ Temp = X + 5.5
+ Temp -= (X + 0.5) * math.log(Temp)
+ Ser = 1.000000000190015
+ for J in range(6):
+ Y += 1
+ Ser += Cof[J] / Y
+ Z = -Temp + math.log(2.5066282746310005 * Ser / X)
+ return math.exp(Z)
+
+
+
+
+class MixtureModelClass:
+ def __init__(self, BinMultiplier = 10.0):
+ self.BinMultiplier = BinMultiplier
+ def Model(self, Values, Histogram = None):
+ if Values:
+ print "Model scores. Range is %s...%s"%(min(Values), max(Values))
+ else:
+ if not Histogram.keys():
+ # There's nothing to model!
+ self.MinBin = 0
+ self.MaxBin = 0
+ self.OddsTrue = {}
+ return
+ print "Model scores. Range is %s...%s"%(min(Histogram.keys()), max(Histogram.keys()))
+ self.MaxCycleCount = 300
+ self.VerboseFlag = 0
+ if Histogram:
+ self.ScoreHistogram = Histogram
+ else:
+ self.ScoreHistogram = {}
+ for Value in Values:
+ Bin = int(round(Value * self.BinMultiplier))
+ self.ScoreHistogram[Bin] = self.ScoreHistogram.get(Bin, 0) + 1
+ Keys = self.ScoreHistogram.keys()
+ self.MinBin = min(Keys)
+ self.MaxBin = max(Keys) + 1
+ self.InitializeModel()
+ try:
+ self.ModelDistribution()
+ except:
+ print "* Warning: Unable to compute p-values via mixture model"
+ print "* Error trace follows:"
+ traceback.print_exc()
+ print "self.VarianceFalse:", self.VarianceFalse
+ print "self.VarianceTrue:", self.VarianceTrue
+ print "MeanFalse:", self.MeanFalse
+ print "GammaOffset:", self.GammaOffset
+ print "ThetaFalse:", self.ThetaFalse
+ print "KFalse:", self.KFalse
+ def GetOddsTrue(self, X):
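+        # Posterior probability that a score X comes from the "true" component of
+        # the mixture (normal for true matches, shifted gamma for false matches):
+        #   P(true | X) = pi * N(X) / (pi * N(X) + (1 - pi) * Gamma(X + GammaOffset))
+        # with pi = PriorProbabilityTrue.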
+## if self.CumulativeFlag:
+## # Set our odds using the cumulative probability p(score >= X) instead of
+## # the odds that the score is in this bin.
+## ErfArg = (X - self.MeanTrue) / (self.StdDevTrue * SQRT2)
+## NormalCDF = 0.5 + 0.5 * PyInspect.erf(ErfArg)
+## GX = max(0.01, X + self.GammaOffset)
+## GammaCDF = PyInspect.GammaIncomplete(self.KFalse, GX / self.ThetaFalse) #/ Gamma(self.KFalse)
+## TrueNormal = 1.0 - NormalCDF
+## FalseGamma = 1.0 - GammaCDF
+## else:
+ Pow = - ((X - self.MeanTrue)**2) / (2 * self.VarianceTrue)
+ TrueNormal = math.exp(Pow) / (self.StdDevTrue * SQRT2PI)
+ GX = max(0.01, X + self.GammaOffset)
+
+
+ FalseGamma = math.pow(GX, self.KFalse - 1) * math.exp(-GX / self.ThetaFalse) / self.GammaDemonFalse
+ # Special patch-up code:
+ # Toward the edges of the mixture model, odd behavior may occur where one curve falls off
+ # slower than the other. We force very low scores to get a bad odds-true, and very
+ # high scores to get a good odds-true.
+ if X < self.MeanTrue - self.VarianceTrue:
+ FalseGamma = max(FalseGamma, 0.001)
+ if X > self.MeanTrue + self.VarianceTrue:
+ TrueNormal = max(TrueNormal, 0.001)
+ OddsTrue = (TrueNormal * self.PriorProbabilityTrue) / (TrueNormal * self.PriorProbabilityTrue + FalseGamma * (1 - self.PriorProbabilityTrue))
+ return OddsTrue
+ def InitializeModel(self):
+ # Initialize mixture model:
+ MinValue = self.MinBin / self.BinMultiplier
+ MaxValue = self.MaxBin / self.BinMultiplier
+ self.MeanFalse = MinValue + (MaxValue - MinValue) * 0.25
+ self.MeanTrue = MaxValue - (MaxValue - MinValue) * 0.25
+ self.VarianceFalse = (MaxValue - MinValue) * 0.1
+ self.VarianceTrue = (MaxValue - MinValue) * 0.1
+ self.PriorProbabilityTrue = 0.1
+ if MinValue < 0:
+ self.GammaOffset = -MinValue
+ elif MinValue > 0.1:
+ self.GammaOffset = -MinValue
+ else:
+ self.GammaOffset = 0
+ self.OddsTrue = {}
+ def ModelDistribution(self):
+ self.ThetaFalse = self.VarianceFalse / (self.MeanFalse + self.GammaOffset)
+ self.StdDevTrue = math.sqrt(self.VarianceTrue)
+ self.KFalse = (self.MeanFalse + self.GammaOffset) / self.ThetaFalse
+ self.GammaDemonFalse = math.pow(self.ThetaFalse, self.KFalse) * Gamma(self.KFalse)
+ for Cycle in range(self.MaxCycleCount):
+ self.Cycle = Cycle
+ self.EstimateOddsTrue()
+ self.ComputeDistributionParameters()
+ def EstimateOddsTrue(self):
+ """
+ One half of the E/M cycle: Estimate the probability true for each bin.
+ """
+ # For each bin, compute the probability that it's true:
+ BestOddsTrue = 0
+ for Bin in range(self.MinBin, self.MaxBin):
+ X = Bin / self.BinMultiplier
+ self.OddsTrue[Bin] = self.GetOddsTrue(X)
+ # Somewhat hacky: If the left tail of the normal distribution falls off more slowly
+ # than that of the gamma distribution, the value of OddsTrue is often rather
+ # high for these (very bad!) bins. We fix that.
+ if Bin < 0:
+ self.OddsTrue[Bin] = min(self.OddsTrue[Bin], 1 / float(-Bin))
+ # Somewhat hacky: If the right tail of the normal distribution falls off too quickly,
+ # then the odds true will decay:
+ BestOddsTrue = max(BestOddsTrue, self.OddsTrue[Bin])
+ if X >= self.MeanTrue:
+ self.OddsTrue[Bin] = max(BestOddsTrue, self.OddsTrue[Bin])
+ #print "%s: %s"%(X, self.OddsTrue[Bin])
+ def ComputeDistributionParameters(self):
+ """
+ One half of the E/M cycle: Optimize the distribution parameters.
+ """
+ # Compute the new mean and variance for the true and the false distributions:
+ self.CountTrue = 0
+ self.MeanTrue = 0
+ self.CountFalse = 0
+ self.MeanFalse = 0
+ for Bin in range(self.MinBin, self.MaxBin):
+ X = Bin / self.BinMultiplier
+ Count = self.ScoreHistogram.get(Bin, 0)
+ self.MeanTrue += X * self.OddsTrue[Bin] * Count
+ self.CountTrue += self.OddsTrue[Bin] * Count
+ self.MeanFalse += X * (1.0 - self.OddsTrue[Bin]) * Count
+ self.CountFalse += (1.0 - self.OddsTrue[Bin]) * Count
+ if self.CountTrue <= 0 or self.CountFalse <= 0:
+ print "** Error: Unable to fit mixture model. Appears to be %s true and %s false matches."%(self.CountTrue, self.CountFalse)
+ return 0
+ self.MeanTrue /= self.CountTrue
+ self.MeanFalse /= self.CountFalse
+ self.PriorProbabilityTrue = self.CountTrue / (self.CountTrue + self.CountFalse)
+ # Adjust GammaOffset, if the false distribution's mean is getting close to 0:
+ if self.MeanFalse + self.GammaOffset < 0.1:
+ print "False distribution mean is small; BUMP gamma offset up"
+ self.GammaOffset += 0.5
+ ##################################
+ # Compute the new variation for the true and the false distributions:
+ self.VarianceTrue = 0
+ self.VarianceFalse = 0
+ for Bin in range(self.MinBin, self.MaxBin):
+ X = Bin / self.BinMultiplier
+ Count = self.ScoreHistogram.get(Bin, 0)
+ try:
+ self.VarianceTrue += (X - self.MeanTrue)**2 * Count * self.OddsTrue[Bin]
+ self.VarianceFalse += (X - self.MeanFalse)**2 * Count * (1.0 - self.OddsTrue[Bin])
+ except:
+ print X
+ print self.MeanTrue
+ print self.MeanFalse
+ print self.OddsTrue[Bin]
+ raise
+ self.VarianceTrue /= self.CountTrue
+ self.StdDevTrue = math.sqrt(self.VarianceTrue)
+ self.VarianceFalse /= self.CountFalse
+ #print " True mean %.4f var %.4f"%(self.MeanTrue, self.VarianceTrue)
+ #print " False mean %.4f var %.4f"%(self.MeanFalse, self.VarianceFalse)
+ self.ThetaFalse = self.VarianceFalse / (self.MeanFalse + self.GammaOffset)
+ self.KFalse = (self.MeanFalse + self.GammaOffset) / self.ThetaFalse
+ self.GammaDemonFalse = math.pow(self.ThetaFalse, self.KFalse) * Gamma(self.KFalse)
+ if self.VerboseFlag:
+ print "-----------------------"
+ print "Cycle %s report:"%self.Cycle
+ print "Theta %.4f K %.4f GammaDenominator %.8f GammaOffset %.2f"%(self.ThetaFalse, self.KFalse, self.GammaDemonFalse, self.GammaOffset)
+ print "True: Count %s mean %s variance %s"%(self.CountTrue, self.MeanTrue, self.VarianceTrue)
+ print "False: Count %s mean %s variance %s"%(self.CountFalse, self.MeanFalse, self.VarianceFalse)
+ print "Prior probability true: %s"%self.PriorProbabilityTrue
+ def PlotDistribution(self, FileName):
+ File = open(FileName, "wb")
+ Header = "Bin\tValue\tHistogram\tOddsTrue\tTrueNormal\tFalseGamma\tMixture\t"
+ File.write(Header + "\n")
+ for Bin in range(self.MinBin, self.MaxBin):
+ Str = "%s\t%s\t"%(Bin, Bin / self.BinMultiplier)
+ Str += "%s\t%s\t"%(self.ScoreHistogram.get(Bin, 0), self.OddsTrue[Bin])
+ X = Bin / self.BinMultiplier
+ # Plot gamma and normal curves:
+ Pow = - ((X - self.MeanTrue)**2) / (2 * self.VarianceTrue)
+ TrueNormal = math.exp(Pow) / (self.StdDevTrue * SQRT2PI)
+ GX = max(0.01, X + self.GammaOffset)
+ FalseGamma = math.pow(GX, self.KFalse - 1) * math.exp(-GX / self.ThetaFalse) / self.GammaDemonFalse
+ Str += "%s\t%s\t"%(TrueNormal, FalseGamma)
+## # Plot gamma and normal CDF:
+## ErfArg = (X - self.MeanTrue) / (self.StdDevTrue * SQRT2)
+## NormalCDF = 0.5 + 0.5 * PyInspect.erf(ErfArg)
+## GX = max(0.01, X + self.GammaOffset)
+## GammaCDF = PyInspect.GammaIncomplete(self.KFalse, GX / self.ThetaFalse) #/ Gamma(self.KFalse)
+## Str += "%s\t%s\t"%(NormalCDF, GammaCDF)
+ MergedMixture = TrueNormal * self.PriorProbabilityTrue
+ MergedMixture += FalseGamma * (1.0 - self.PriorProbabilityTrue)
+ Str += "%s\t"%(MergedMixture)
+ File.write(Str + "\n")
+ def PickleSelf(self, File):
+ cPickle.dump(self.BinMultiplier, File)
+ cPickle.dump(self.PriorProbabilityTrue, File)
+ cPickle.dump(self.MeanTrue, File)
+ cPickle.dump(self.VarianceTrue, File)
+ cPickle.dump(self.StdDevTrue, File)
+ cPickle.dump(self.GammaOffset, File)
+ cPickle.dump(self.KFalse, File)
+ cPickle.dump(self.ThetaFalse, File)
+ cPickle.dump(self.GammaDemonFalse, File)
+
+def UnpickleMixtureModel(File):
+ Model = MixtureModelClass()
+ Model.BinMultiplier = cPickle.load(File)
+ Model.PriorProbabilityTrue = cPickle.load(File)
+ Model.MeanTrue = cPickle.load(File)
+ Model.VarianceTrue = cPickle.load(File)
+ Model.StdDevTrue = cPickle.load(File)
+ Model.GammaOffset = cPickle.load(File)
+ Model.KFalse = cPickle.load(File)
+ Model.ThetaFalse = cPickle.load(File)
+ Model.GammaDemonFalse = cPickle.load(File)
+ return Model
+
+class FeatureVector:
+ def __init__(self):
+ self.FileBits = []
+ self.Features = []
+ self.ScaledFeatures = None
+ self.TrueFlag = 0
+ self.Score = 0 # as assigned by an owning model
+
+class FeatureSetClass:
+ """
+ A feature-set is a list of TRUE tuples, and a list of FALSE tuples. Normally there
+ is a FeatureSetClass for testing, and one for training.
+ """
+ def __init__(self):
+ self.TrueVectors = []
+ self.FalseVectors = []
+ self.AllVectors = []
+ self.TrueCount = 0
+ self.FalseCount = 0
+ self.Count = 0
+ self.PriorProbabilityFalse = 0.5
+ def SetCounts(self):
+ self.Count = len(self.AllVectors)
+ self.TrueCount = len(self.TrueVectors)
+ self.FalseCount = len(self.FalseVectors)
+ if len(self.AllVectors):
+ self.Size = len(self.AllVectors[0].Features)
+ def FindFeatureRanges(self):
+ """
+ Simple scaling function: Find min and max values to push features into [-1, 1]
+ """
+ Values = []
+ Vector = self.AllVectors[0]
+ Size = len(Vector.Features)
+ print "SIZE:", Size
+ for X in range(Size):
+ Values.append([])
+ for Vector in self.AllVectors:
+ for X in range(Size):
+ Values[X].append(Vector.Features[X])
+ self.MinValues = []
+ self.MaxValues = []
+ for X in range(Size):
+ Values[X].sort()
+ ValIndex = int(round(len(Values[X]) * 0.025))
+ MinValue = Values[X][ValIndex]
+ self.MinValues.append(MinValue)
+ ValIndex = int(round(len(Values[X]) * 0.975))
+ MaxValue = Values[X][ValIndex]
+ self.MaxValues.append(MaxValue)
+## print "Range:"
+## for X in range(Size):
+## print "%s: %.4f-%.4f"%(X, self.MinValues[X], self.MaxValues[X])
+ pass
+ def ScaleFeatures(self):
+ """
+ Simple scaling function: Pushes roughly 95% of feature values into the range [-1, 1].
+ Assumes that self.MinValues and self.MaxValues are set!
+ """
+ self.Size = len(self.AllVectors[0].Features)
+ for Vector in self.AllVectors:
+ Vector.ScaledFeatures = [0]*self.Size
+ for X in range(self.Size):
+ HalfRange = (self.MaxValues[X] - self.MinValues[X]) / 2.0
+ if not HalfRange:
+ continue
+ #print "Feature %s: Range %s...%s"%(X, self.MinValues[X], self.MaxValues[X])
+ for Vector in self.AllVectors:
+ Vector.ScaledFeatures[X] = (Vector.Features[X] - self.MinValues[X]) / HalfRange - 1.0
+ def __str__(self):
+ return "<%sT %sF>"%(len(self.TrueVectors), len(self.FalseVectors))
+ def GetPriorProbabilityFalse(self, DBTrueToFalseRatio):
+ """
+ Compute the prior probability that an arbitrary peptide is false.
+ """
+ # In a 1:1 database, there's 1 bogus peptide in a valid protein
+ # for each bogus peptide in an invalid protein; in that case, DBTrueToFalseRatio is 1.0
+ # In a 1:99 database, the ratio is 1/99.
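+ # Decoy (invalid-protein) hits scaled by the ratio estimate how many false
+ # hits are hiding among valid-protein matches, so the estimated count of
+ # genuinely true matches is TrueCount - FalseWithinTrue; everything else in
+ # the set is presumed false.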
+ FalseWithinTrue = self.FalseCount * DBTrueToFalseRatio
+ if FalseWithinTrue >= self.TrueCount:
+ # Uh-oh; there are FEWER peptides from valid proteins than we would expect
+ # to see by chance! Let's (arbitrarily) cap the prior probability false
+ # at 99%.
+ print "Warning: FalseWithinTrue = %s >= %s!"%(FalseWithinTrue, self.TrueCount)
+ self.PriorProbabilityFalse = 0.99
+ return
+ VectorCount = len(self.AllVectors)
+ self.PriorProbabilityFalse = (VectorCount - (self.TrueCount - FalseWithinTrue)) / float(VectorCount)
+ print "==>>PriorProbabilityFalse: %s"%(self.PriorProbabilityFalse)
+ def SaveTabDelimited(self, File):
+ if type(File) == type(""):
+ File = open(File, "wb")
+ CloseFlag = 1
+ else:
+ CloseFlag = 0
+ #File = open(FileName, "wb")
+ for VectorIndex in range(len(self.AllVectors)):
+ Vector = self.AllVectors[VectorIndex]
+ String = "%s\t%s\t"%(VectorIndex, Vector.TrueFlag)
+ for Value in Vector.Features:
+ String += "%s\t"%Value
+ File.write(String + "\n")
+ if CloseFlag:
+ File.close()
+
+def LoadGeneralModel(FileName):
+ File = open(FileName, "rb")
+ ModelType = cPickle.load(File)
+ File.close()
+ if ModelType == "LDA":
+ Model = LDAModel()
+ Model.LoadModel(FileName)
+ elif ModelType == "SVM":
+ Model = SVMModel()
+ Model.LoadModel(FileName)
+ elif ModelType == "LOGIT":
+ Model = LogitModel()
+ Model.LoadModel(FileName)
+ else:
+ print "** Error: Unable to load model type '%s'"%ModelType
+ return None
+ return Model
+
+class LearnerClass:
+ def __init__(self, FeatureList = None):
+ # The entries in FeatureList are indices into the
+ # available features of our training and testing sets.
+ self.FeatureList = FeatureList
+ # OddsTrue[Bin] = probability that an instance with a score
+ # in this bin or HIGHER is correct.
+ self.OddsTrue = {}
+ self.PValue = {}
+ # Bin = int(round(Score * self.BinScalingFactor))
+ self.BinScalingFactor = 10
+ self.MixtureModel = None
+ def SaveModel(self, FileName):
+ raise ValueError, "Abstract method - override in subclass!"
+ def LoadModel(self, FileName):
+ raise ValueError, "Abstract method - override in subclass!"
+ def Train(self, FeatureSet):
+ raise ValueError, "Abstract method - override in subclass!"
+ def Test(self, FeatureSet):
+ raise ValueError, "Abstract method - override in subclass!"
+ def ReportROC(self, FeatureSet, OutputFileName = None):
+ SortedList = []
+ for Vector in FeatureSet.AllVectors:
+ SortedList.append((Vector.Score, random.random(), Vector))
+ SortedList.sort()
+ SortedList.reverse()
+ OverallTrueCount = 0
+ OverallFalseCount = 0
+ # If there are very many vectors, then we'll end up with an unwieldy curve that's
+ # awkward to plot. So, thin the list by keeping only every Slice-th entry:
+ Slice = (len(SortedList) / 30000) + 1 # 1 if the list is small, 2 or larger otherwise
+ OldSortedList = SortedList
+ SortedList = []
+ print "SLICE roc-curve list: Take every %sth entry"%Slice
+ for X in range(len(OldSortedList)):
+ if X % Slice == 0:
+ SortedList.append(OldSortedList[X])
+ Vector = OldSortedList[X][-1]
+ if Vector.TrueFlag:
+ OverallTrueCount += 1
+ else:
+ OverallFalseCount += 1
+ OldSortedList = None
+ TrueCount = 0
+ FalseCount = 0
+ if OutputFileName:
+ ROCCurvePlotFile = open(OutputFileName, "wb")
+ RowCount = 0
+ ROCTPForFP = {}
+ ROCTPForFPCount = {}
+ for (Score, Dummy, Vector) in SortedList:
+ RowCount += 1
+ if Vector.TrueFlag:
+ TrueCount += 1
+ else:
+ FalseCount += 1
+ OverallTPRate = TrueCount / float(max(1, OverallTrueCount))
+ OverallFPRate = FalseCount / float(max(1, OverallFalseCount))
+ Bin = int(round(OverallFPRate * 100))
+ ROCTPForFP[Bin] = ROCTPForFP.get(Bin, 0) + OverallTPRate
+ ROCTPForFPCount[Bin] = ROCTPForFPCount.get(Bin, 0) + 1
+ if OutputFileName:
+ ROCCurvePlotFile.write("%s\t%s\t%s\t%s\t%s\t\n"%(RowCount, TrueCount, FalseCount, OverallFPRate, OverallTPRate))
+ if OutputFileName:
+ ROCCurvePlotFile.close()
+ # Compute the area under the ROC curve.
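+ # Average the TP rate within each 1% FP-rate bin, interpolate any empty bins
+ # from their neighbors, and sum 0.01 * TPrate over the 100 bins to approximate
+ # the area under the curve.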
+ for Bin in range(0, 100):
+ if ROCTPForFP.has_key(Bin):
+ ROCTPForFP[Bin] /= float(ROCTPForFPCount[Bin])
+ ROCArea = 0
+ for Bin in range(0, 100):
+ if ROCTPForFP.has_key(Bin):
+ ROCArea += 0.01 * ROCTPForFP[Bin]
+ #print "%s: %s"%(Bin, ROCTPForFP[Bin])
+ else:
+ # Interpolate between points:
+ PrevX = 0 # default
+ PrevY = 0 # default
+ for PrevBin in range(Bin - 1, -1, -1):
+ if ROCTPForFP.has_key(PrevBin):
+ PrevX = PrevBin
+ PrevY = ROCTPForFP[PrevBin]
+ break
+ NextX = 100
+ NextY = 1
+ for NextBin in range(Bin + 1, 101):
+ if ROCTPForFP.has_key(NextBin):
+ NextX = NextBin
+ NextY = ROCTPForFP[NextBin]
+ break
+ InterpolatedValue = PrevY + (Bin - PrevX) * float(NextY - PrevY) / (NextX - PrevX)
+ ROCArea += 0.01 * InterpolatedValue
+ print "ROC curve area:", ROCArea
+ def ReportAccuracy(self, FeatureSet, ROCFilePath = None):
+ """
+ Called after Test(FeatureSet), to measure how well we did at separating
+ the true and false vectors by score. Compute OddsTrue, as well.
+ """
+ SortedList = []
+ for Vector in FeatureSet.AllVectors:
+ SortedList.append((Vector.Score, Vector))
+ # sort from HIGHEST to LOWEST score:
+ SortedList.sort()
+ SortedList.reverse()
+ #self.ComputeOddsTrue(SortedList)
+ self.ComputePValues(SortedList)
+ #self.ComputePValues(SortedList, FeatureSet.PriorProbabilityFalse)
+ Rates = [0.05, 0.01, 0.1, 0.5, 0.005]
+ CountsByRate = [0] * len(Rates)
+ CumulativeTrue = 0
+ CumulativeFalse = 0
+ Cumulative = 0
+ PrevScore = None
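+ # Walk down the list from best to worst score; at each distinct score
+ # threshold, record how many true vectors have been accepted while the
+ # cumulative fraction of false vectors (the empirical FDR) is still below
+ # each target rate.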
+ for (Score, Vector) in SortedList:
+ if Vector.TrueFlag:
+ CumulativeTrue += 1
+ else:
+ CumulativeFalse += 1
+ Cumulative += 1
+ FractionFalse = CumulativeFalse / float(Cumulative)
+ if Score != PrevScore:
+ for RateIndex in range(len(Rates)):
+ if FractionFalse < Rates[RateIndex]:
+ CountsByRate[RateIndex] = CumulativeTrue
+ #print Score, FractionFalse, CumulativeTrue, CumulativeFalse
+ PrevScore = Score
+ print "Counts by FDRate: %d at 5%% %d at 1%% %d at 10%% %d at 50%%"%(CountsByRate[0], CountsByRate[1], CountsByRate[2], CountsByRate[3])
+ SensitivityByRate = []
+ for Count in CountsByRate:
+ SensitivityByRate.append(100 * Count / float(max(1, FeatureSet.TrueCount)))
+ print "FeatureSet.TrueCount:", FeatureSet.TrueCount
+ print " Sensitivity: %.2f at 5%% %.2f at 1%% %.2f at 10%% %.2f at 50%%"%(SensitivityByRate[0], SensitivityByRate[1], SensitivityByRate[2], SensitivityByRate[3])
+ if ROCFilePath:
+ self.ReportROC(FeatureSet, ROCFilePath)
+ return CountsByRate
+ def ComputeOddsTrue(self, SortedList):
+ "DEPRECATED; use ComputePValue instead"
+ if len(SortedList) < 200:
+ BlockSize = len(SortedList) / 4
+ else:
+ BlockSize = 100
+ WindowTrueSum = 0
+ WindowFalseSum = 0
+ for Entry in SortedList[:BlockSize - 1]:
+ if Entry[1].TrueFlag:
+ WindowTrueSum += 1
+ else:
+ WindowFalseSum += 1
+ for Index in range(len(SortedList)):
+ # Add one entry to the window:
+ if Index + BlockSize < len(SortedList):
+ Entry = SortedList[Index + BlockSize]
+ if Entry[1].TrueFlag:
+ WindowTrueSum += 1
+ else:
+ WindowFalseSum += 1
+ # Compute the probability-true for this window:
+ Vector = SortedList[Index][1]
+ OddsTrue = WindowTrueSum / float(WindowTrueSum + WindowFalseSum)
+ Bin = int(round(Vector.Score * self.BinScalingFactor))
+ self.OddsTrue[Bin] = OddsTrue
+ # Remove leftmost entry from the window:
+ if Index >= BlockSize:
+ Entry = SortedList[Index - BlockSize]
+ if Entry[1].TrueFlag:
+ WindowTrueSum -= 1
+ else:
+ WindowFalseSum -= 1
+ def GetPValue(self, Score):
+ if self.MixtureModel:
+ return 1.0 - self.MixtureModel.GetOddsTrue(Score)
+ Bin = int(round(Score * self.BinScalingFactor))
+ Keys = self.PValue.keys()
+ MinKey = min(Keys)
+ MaxKey = max(Keys)
+ if Bin < MinKey:
+ return self.PValue[MinKey]
+ if Bin > MaxKey:
+ return self.PValue[MaxKey]
+ return self.PValue[Bin]
+ def ComputePValuesMixtureModel(self, SortedList):
+ """
+ Our feature-set has an empirical distribution of scores. We'll approximate
+ this distribution as a mixture of two distributions: gamma (false) and
+ normal (true). Then we'll derive p-value (probability false) for each
+ score-bin.
+ """
+ Scores = []
+
+
+ for (Score, Vector) in SortedList:
+ Scores.append(Score)
+ #for Bin in range(Model.MinBin, Model.MaxBin):
+ # self.PValue[Bin] = 1.0 - Model.OddsTrue[Bin]
+ self.MixtureModel = MixtureModelClass(self.GetMMBinMultiplier())
+ self.MixtureModel.Model(Scores)
+ for (Score, Vector) in SortedList:
+ Vector.PValue = 1.0 - self.MixtureModel.GetOddsTrue(Score)
+ self.MixtureModel.PlotDistribution("PValues.txt")
+ def ComputePValues(self, SortedList):
+ self.ComputePValuesMixtureModel(SortedList)
+ def ComputePValuesEmpirical(self, SortedList, PriorProbabilityFalse):
+ """
+ DEPRECATED - called only if mixture model fails.
+
+ Input SortedList is a list of the form (ModelScore, FeatureVector), sorted from
+ highest to lowest ModelScore.
+
+ self.PValue[Bin] is the probability that a peptide P is FALSE, given a score of
+ Bin or better. Formally, it equals P(not P | S(P)>=Bin). By Bayes' Theorem, this
+ is equal to:
+ P(S(P)>=Bin | not P) * P(not P) / P(S(P)>=Bin)
+ """
+ # Passed in: PriorProbabilityFalse == P(not P)
+ # And set up dictionaries PValueTotals / PValueCounts to
+ # compute ProbRatio == P(S(P)>=Bin | not P) / P(S(P)>=Bin)
+ #PriorProbabilityFalse = 0
+ TrueInstanceCount = 0
+ HighScoringCount = 0
+ HighScoringFalseInstanceCount = 0
+ TotalInstances = len(SortedList)
+ PValueTotals = {}
+ PValueCounts = {}
+ TotalFalseInstances = 0
+ for (Score, Vector) in SortedList:
+ if not Vector.TrueFlag:
+ TotalFalseInstances += 1
+ HistoGood = {}
+ HistoBad = {}
+ TempOutputFile = open("TempPTMPValue.txt", "wb")
+ TempOutputFile.write("Bin\tHighFalse\tTotalInstances\tTotalFalseInstances\tHighScoringCount\t\n")
+ for (Score, Vector) in SortedList:
+ HighScoringCount += 1
+ if Vector.TrueFlag:
+ TrueInstanceCount += 1
+ else:
+ HighScoringFalseInstanceCount += 1
+ ProbRatio = (HighScoringFalseInstanceCount * TotalInstances) / float(TotalFalseInstances * HighScoringCount)
+ #ProbRatio = FalseInstanceCount / float(TrueInstanceCount + FalseInstanceCount)
+ Bin = int(round(Score * self.BinScalingFactor))
+ PValueTotals[Bin] = PValueTotals.get(Bin, 0) + ProbRatio
+ PValueCounts[Bin] = PValueCounts.get(Bin, 0) + 1
+ TempOutputFile.write("%s\t%s\t%s\t%s\t%s\t\n"%(Bin, HighScoringFalseInstanceCount, TotalInstances, TotalFalseInstances, HighScoringCount))
+ if Vector.TrueFlag:
+ HistoGood[Bin] = HistoGood.get(Bin, 0) + 1
+ else:
+ HistoBad[Bin] = HistoBad.get(Bin, 0) + 1
+ #PriorProbabilityFalse = FalseInstanceCount / float(FalseInstanceCount + TrueInstanceCount)
+ Keys = PValueTotals.keys()
+ Keys.sort()
+ for Bin in Keys:
+ AverageProbRatio = PValueTotals[Bin] / float(PValueCounts[Bin])
+ self.PValue[Bin] = AverageProbRatio * PriorProbabilityFalse
+ self.PValue[Bin] = max(self.PValue[Bin], 0.0001)
+ print "%s: %s"%(Bin, self.PValue[Bin])
+ TempOutputFile.write("\n\n\n")
+ for Bin in Keys:
+ TempOutputFile.write("%s\t%s\t%s\t\n"%(Bin, HistoGood.get(Bin, 0), HistoBad.get(Bin, 0)))
+ ############################################################
+ # Interpolate p-values, for missing bins:
+ Keys = self.PValue.keys()
+ Keys.sort()
+ MinKey = min(Keys)
+ MaxKey = max(Keys)
+ for Bin in range(MinKey, MaxKey):
+ if self.PValue.has_key(Bin):
+ PrevBin = Bin
+ PrevPValue = self.PValue[Bin]
+ continue
+ # Find the next bin:
+ for NextBin in range(Bin + 1, MaxKey + 1):
+ if self.PValue.has_key(NextBin):
+ NextPValue = self.PValue[NextBin]
+ break
+ # Interpolate from (PrevBin, PrevPValue) to (NextBin, NextPValue):
+ Slope = (NextPValue - PrevPValue) / float(NextBin - PrevBin)
+ Intermediate = PrevPValue + Slope * (Bin - PrevBin)
+ self.PValue[Bin] = Intermediate
+ def GetMMBinMultiplier(self):
+ return 10.0 #default
+
+
+class LDAModel(LearnerClass):
+ def __init__(self, FeatureList = None):
+ if FeatureList:
+ self.Size = len(FeatureList)
+ LearnerClass.__init__(self, FeatureList)
+ def GetCovarianceArray(self, VectorList):
+ VectorCount = float(len(VectorList))
+ C = zeros((self.Size, self.Size), FloatType)
+ for Vector in VectorList:
+ for X in range(self.Size):
+ for Y in range(self.Size):
+ C[X][Y] += Vector[X] * Vector[Y] / VectorCount
+ return C
+ def LoadModel(self, FileName):
+ File = open(FileName, "rb")
+ cPickle.load(File) # model type
+ self.FeatureList = cPickle.load(File)
+ self.PValue = cPickle.load(File)
+ self.MinValues = cPickle.load(File)
+ self.MaxValues = cPickle.load(File)
+ self.CI = cPickle.load(File)
+ self.MeanGood = cPickle.load(File)
+ self.ConstantGood = cPickle.load(File)
+ self.MeanBad = cPickle.load(File)
+ self.ConstantBad = cPickle.load(File)
+ self.Size = len(self.FeatureList)
+ self.MixtureModel = UnpickleMixtureModel(File)
+ # Verbose stuff:
+ print "\n>>>PyLoadLDAModel(%s)"%FileName
+ print "Features: %s"%self.Size
+ print "MinValues: %.4f...%.4f"%(self.MinValues[0], self.MinValues[-1])
+ print "MaxValues: %.4f...%.4f"%(self.MaxValues[0], self.MaxValues[-1])
+ print "MeanGood: %.4f...%.4f"%(self.MeanGood[0], self.MeanGood[-1])
+ print "MeanBad: %.4f...%.4f"%(self.MeanBad[0], self.MeanBad[-1])
+ if self.Size > 1:
+ print "CI: %.4f, %.4f...%.4f, %.4f"%(self.CI[0][0], self.CI[0][1],
+ self.CI[self.Size - 1][self.Size - 2],
+ self.CI[self.Size - 1][self.Size - 1]
+ )
+ print "ConstTrue %.4f, ConstFalse %.4f"%(self.ConstantGood, self.ConstantBad)
+ File.close()
+ def SaveBinaryModel(self, FileName):
+ """
+ Write out a binary representation of this model.
+ """
+ File = open(FileName, "wb")
+ File.write(struct.pack("<i", self.Size))
+ for FeatureIndex in range(self.Size):
+ File.write(struct.pack("<d", self.MinValues[FeatureIndex]))
+ for FeatureIndex in range(self.Size):
+ File.write(struct.pack("<d", self.MaxValues[FeatureIndex]))
+ for FeatureIndex in range(self.Size):
+ File.write(struct.pack("<d", self.MeanGood[FeatureIndex]))
+ for FeatureIndex in range(self.Size):
+ File.write(struct.pack("<d", self.MeanBad[FeatureIndex]))
+ File.write(struct.pack("<d", self.ConstantGood))
+ File.write(struct.pack("<d", self.ConstantBad))
+ for Row in range(self.Size):
+ for Column in range(self.Size):
+ File.write(struct.pack("<d", self.CI[Row][Column]))
+ File.close()
+ def SaveModel(self, FileName):
+ File = open(FileName, "wb")
+ cPickle.dump("LDA", File)
+ cPickle.dump(self.FeatureList, File)
+ cPickle.dump(self.PValue, File)
+ cPickle.dump(self.MinValues, File)
+ cPickle.dump(self.MaxValues, File)
+ cPickle.dump(self.CI, File)
+ cPickle.dump(self.MeanGood, File)
+ cPickle.dump(self.ConstantGood, File)
+ cPickle.dump(self.MeanBad, File)
+ cPickle.dump(self.ConstantBad, File)
+ try:
+ self.MixtureModel.PickleSelf(File)
+ except:
+ cPickle.dump(None, File)
+ File.close()
+ def ScoreInstance(self, RawFeatures):
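+ # Linear discriminant score: after rescaling the selected features into
+ # [-1, 1], each class gets a reading mu' * CI * x - 0.5 * mu' * CI * mu
+ # (the constant term was precomputed during training, CI is the pooled
+ # inverse covariance); the score is the good reading minus the bad one.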
+ Features = []
+ for FeatureIndex in self.FeatureList:
+ Features.append(RawFeatures[FeatureIndex])
+ for FeatureIndex in range(self.Size):
+ X = self.FeatureList[FeatureIndex]
+ HalfRange = (self.MaxValues[X] - self.MinValues[X]) / 2.0
+ if not HalfRange:
+ continue
+ Features[FeatureIndex] = (Features[FeatureIndex] - self.MinValues[X]) / HalfRange - 1.0
+ CIProduct = MatrixMultiply(self.CI, Features)
+ ReadingGood = MatrixMultiply(self.MeanGood, CIProduct) + self.ConstantGood
+ ReadingBad = MatrixMultiply(self.MeanBad, CIProduct) + self.ConstantBad
+ return (ReadingGood - ReadingBad)
+ def Test(self, FeatureSet):
+ # Scale features, according to our trained scaling
+ FeatureSet.MinValues = self.MinValues
+ FeatureSet.MaxValues = self.MaxValues
+ FeatureSet.ScaleFeatures()
+ # Compute scores:
+ for Vector in FeatureSet.AllVectors:
+ FixedVector = []
+ for FeatureIndex in self.FeatureList:
+ FixedVector.append(Vector.ScaledFeatures[FeatureIndex])
+ CIProduct = MatrixMultiply(self.CI, FixedVector)
+ ReadingGood = MatrixMultiply(self.MeanGood, CIProduct) + self.ConstantGood
+ ReadingBad = MatrixMultiply(self.MeanBad, CIProduct) + self.ConstantBad
+ Vector.Score = (ReadingGood - ReadingBad)
+ def Train(self, FeatureSet, VerboseFlag = 0):
+ # Get the feature range (training only):
+ FeatureSet.FindFeatureRanges()
+ self.MinValues = FeatureSet.MinValues
+ self.MaxValues = FeatureSet.MaxValues
+ # Sanity-checking: If a feature's range is a single point,
+ # then it's not useful - AND, it will generate a non-invertible
+ # matrix. So, let's filter out any such features.
+ InputFeatureList = self.FeatureList
+ self.FeatureList = []
+ for FeatureIndex in InputFeatureList:
+ if self.MinValues[FeatureIndex] < self.MaxValues[FeatureIndex]:
+ self.FeatureList.append(FeatureIndex)
+ else:
+ print "* Warning: Discarding feature '%s', every entry is %s"%(FeatureIndex, self.MinValues[FeatureIndex])
+ self.Size = len(self.FeatureList)
+ if self.Size == 0:
+ print "<< no features - bailing out >>"
+ return
+ # Scale features:
+ FeatureSet.ScaleFeatures()
+ AllVectors = []
+ TrueVectors = []
+ FalseVectors = []
+ for Vector in FeatureSet.AllVectors:
+ ScaledVector = []
+ for FeatureIndex in self.FeatureList:
+ ScaledVector.append(Vector.ScaledFeatures[FeatureIndex])
+ AllVectors.append(ScaledVector)
+ if Vector.TrueFlag:
+ TrueVectors.append(ScaledVector)
+ else:
+ FalseVectors.append(ScaledVector)
+ print "First true vector:", TrueVectors[0]
+ print "First false vector:", FalseVectors[0]
+## # Temp: Ensure the vector lists are the same size!
+## VectorCount = min(len(TrueVectors), len(FalseVectors))
+## random.shuffle(FalseVectors)
+## FalseVectors = FalseVectors[:VectorCount]
+ ############################################################
+ # Compute the mean vectors (training only):
+ self.MeanGood = [0] * self.Size
+ self.MeanBad = [0] * self.Size
+ self.MeanGlobal = [0] * self.Size
+ for Vector in TrueVectors:
+ for X in range(self.Size):
+ self.MeanGlobal[X] += Vector[X] / float(FeatureSet.Count)
+ self.MeanGood[X] += Vector[X] / float(FeatureSet.TrueCount)
+ for Vector in FalseVectors:
+ for X in range(self.Size):
+ self.MeanGlobal[X] += Vector[X] / float(FeatureSet.Count)
+ self.MeanBad[X] += Vector[X] / float(FeatureSet.FalseCount)
+ if VerboseFlag:
+ print "MeanGood:\n ",
+ for Value in self.MeanGood:
+ print "%.3f"%Value,
+ print
+ print "MeanBad:\n ",
+ for Value in self.MeanBad:
+ print "%.3f"%Value,
+ print
+ print "MeanGlobal:\n ",
+ for Value in self.MeanGlobal:
+ print "%.3f"%Value,
+ print
+ ############################################################
+ # Compute mean-corrected vectors:
+ MeanCorrectedGoodVectors = []
+ MeanCorrectedBadVectors = []
+ for Vector in TrueVectors:
+ NewVector = []
+ for X in range(self.Size):
+ NewVector.append(Vector[X] - self.MeanGlobal[X])
+ MeanCorrectedGoodVectors.append(NewVector)
+ for Vector in FalseVectors:
+ NewVector = []
+ for X in range(self.Size):
+ NewVector.append(Vector[X] - self.MeanGlobal[X])
+ MeanCorrectedBadVectors.append(NewVector)
+ ############################################################
+ # Compute covariance matrices:
+ CovarArrayGood = self.GetCovarianceArray(MeanCorrectedGoodVectors)
+ if VerboseFlag:
+ print "CovarArrayGood:", CovarArrayGood
+ CovarArrayBad = self.GetCovarianceArray(MeanCorrectedBadVectors)
+ if VerboseFlag:
+ print "CovarArrayBad:", CovarArrayBad
+ # CovarArrayFull is the pooled within-group covariance matrix; it is
+ # computed componentwise as a weighted sum of CovarArrayGood and CovarArrayBad.
+ CovarArrayFull = zeros((self.Size, self.Size), FloatType)
+ for X in range(self.Size):
+ for Y in range(self.Size):
+ CovarArrayFull[X][Y] += CovarArrayGood[X][Y] * FeatureSet.TrueCount / float(FeatureSet.Count)
+ CovarArrayFull[X][Y] += CovarArrayBad[X][Y] * FeatureSet.FalseCount / float(FeatureSet.Count)
+ if VerboseFlag:
+ print "CovarArrayFull:", CovarArrayFull
+ ############################################################
+ # Invert the covariance array:
+ try:
+ self.CI = InvertMatrix(CovarArrayFull)
+ except:
+ traceback.print_exc()
+ print "Unable to invert covariance matrix! Invalid feature set."
+ return 0
+ if VerboseFlag:
+ print "CI:", self.CI
+ self.GoodMuC = MatrixMultiply(self.CI, self.MeanGood)
+ if VerboseFlag:
+ print "GoodMuC:", self.GoodMuC
+ self.BadMuC = MatrixMultiply(self.CI, self.MeanBad)
+ if VerboseFlag:
+ print "BadMuC:", self.BadMuC
+ self.ConstantGood = -MatrixMultiply(self.MeanGood, self.GoodMuC) / 2.0
+ self.ConstantBad = -MatrixMultiply(self.MeanBad, self.BadMuC) / 2.0
+ #if VerboseFlag:
+ print "LDA Constant good %.4f constant bad %.4f"%(self.ConstantGood, self.ConstantBad)
+
+class SVMModel(LearnerClass):
+ def __init__(self, FeatureList = None):
+ self.Scaling = None
+ self.SupportVectors = None
+ self.PySVMReadyFlag = 0
+ LearnerClass.__init__(self, FeatureList)
+ def WriteSVMFeaturesToFile(self, FilePath, FeatureSet, ForceEqualCounts = 0):
+ #print "TRUE vectors %s, FALSE vectors %s"%(len(FeatureSet.TrueVectors), len(FeatureSet.FalseVectors))
+ #print "TRUE count %s, FALSE count %s"%(FeatureSet.TrueCount, FeatureSet.FalseCount)
+ File = open(FilePath, "wb")
+ if ForceEqualCounts:
+ # Shuffle the tuples:
+ TrueVectors = FeatureSet.TrueVectors[:]
+ random.shuffle(TrueVectors)
+ FalseVectors = FeatureSet.FalseVectors[:]
+ random.shuffle(FalseVectors)
+ # Try writing out equal numbers of true and false tuples:
+ MaxIndex = min(FeatureSet.TrueCount, FeatureSet.FalseCount)
+ MaxIndex = min(MaxIndex, MaxSVMFeatureCount)
+ TrueVectors = TrueVectors[:MaxIndex]
+ FalseVectors = FalseVectors[:MaxIndex]
+ else:
+ TrueVectors = FeatureSet.TrueVectors
+ FalseVectors = FeatureSet.FalseVectors
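+ # LIBSVM-style sparse text format: class label (+1 / -1) followed by
+ # 1-based featureindex:value pairs.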
+ for Vector in TrueVectors:
+ Str = "+1 "
+ for FeatureIndex in range(len(self.FeatureList)):
+ Str += "%d:%.8f "%(FeatureIndex + 1, Vector.Features[self.FeatureList[FeatureIndex]])
+ File.write(Str + "\n")
+ for Vector in FalseVectors:
+ Str = "-1 "
+ for FeatureIndex in range(len(self.FeatureList)):
+ Str += "%d:%.8f "%(FeatureIndex + 1, Vector.Features[self.FeatureList[FeatureIndex]])
+ File.write(Str + "\n")
+ File.close()
+ def Train(self, FeatureSet, VerboseFlag = 0):
+ print "TRAINSVM()...", FeatureSet
+ TempFeaturesFileName = "PTMFeatures.SVM.txt"
+ TempScalingFileName = "PTMFeaturesSVMScale.txt"
+ TempScaledFeaturesFileName = "PTMFeatures.SVMScaled.txt"
+ TempModelFileName = "PTMFeatures.SVMScaled.txt.model"
+ # Write feature vectors, forcing equal true and false instance-counts:
+ self.WriteSVMFeaturesToFile(TempFeaturesFileName, FeatureSet, 1)
+ ###############################################################
+ # SCALE the features, and remember the scaling:
+ Command = r"%s -s %s %s > %s"%(PATH_SVMSCALE, TempScalingFileName, TempFeaturesFileName, TempScaledFeaturesFileName)
+ print Command
+ os.system(Command)
+ # Read the scaling limits, for later use:
+ File = open(TempScalingFileName, "rb")
+ self.Scaling = File.read()
+ File.close()
+ os.remove(TempScalingFileName)
+ print "Train!"
+ ###############################################################
+ # TRAIN the model. We don't use cross-validation here, because a separate
+ # testing set will be used for evaluation.
+ Command = r"%s %s"%(PATH_SVMTRAIN, TempScaledFeaturesFileName)
+ print Command
+ os.system(Command)
+ File = open(TempModelFileName, "rb")
+ self.SupportVectors = File.read()
+ File.close()
+ ###############################################
+ # Clean up temp-files:
+ os.remove(TempFeaturesFileName)
+ os.remove(TempScaledFeaturesFileName)
+ os.remove(TempModelFileName)
+ def Test(self, FeatureSet):
+ if not self.Scaling or not self.SupportVectors:
+ print "Error in SVMModel.Test(): We haven't trained (or loaded) yet!"
+ return
+ TempFeaturesFileName = "TestFeatures.SVM.txt"
+ TempScalingFileName = "PTMFeaturesSVMScale.txt"
+ TempScaledFeaturesFileName = "TestFeatures.SVMScaled.txt"
+ TempModelFileName = "SVM.model"
+ TempOutputFileName = "SVMPrediction.txt"
+ TrueFlags = []
+ for Tuple in FeatureSet.TrueVectors:
+ TrueFlags.append(1)
+ for Tuple in FeatureSet.FalseVectors:
+ TrueFlags.append(0)
+ ########################################################################
+ # WRITE the testing set to file:
+ self.WriteSVMFeaturesToFile(TempFeaturesFileName, FeatureSet)
+ # Write our scaling-info and our model to files:
+ File = open(TempScalingFileName, "wb")
+ File.write(self.Scaling)
+ File.close()
+ File = open(TempModelFileName, "wb")
+ File.write(self.SupportVectors)
+ File.close()
+ # SCALE the testing set:
+ Command = r"%s -r %s %s > %s"%(PATH_SVMSCALE, TempScalingFileName, TempFeaturesFileName, TempScaledFeaturesFileName)
+ print Command
+ os.system(Command)
+ os.remove(TempFeaturesFileName)
+ os.remove(TempScalingFileName)
+ # Ok, now let's run svmpredict on all the instances in the TESTING set:
+## Command = r"%s %s %s %s"%(PATH_SVMPREDICT, TempScaledFeaturesFileName, TempModelFileName, TempOutputFileName)
+## print Command
+## os.system(Command)
+ RunPySVM.Predict(TempScaledFeaturesFileName, TempModelFileName, TempOutputFileName)
+ # Now read in the results, and assign scores to the vectors of the set:
+ File = open(TempOutputFileName, "rb")
+ InstanceIndex = 0
+ TrueIndex = 0
+ FalseIndex = 0
+ SortedList = []
+ for FileLine in File.xreadlines():
+ Score = float(FileLine)
+ if TrueFlags[InstanceIndex]:
+ Vector = FeatureSet.TrueVectors[TrueIndex]
+ TrueIndex += 1
+ else:
+ Vector = FeatureSet.FalseVectors[FalseIndex]
+ FalseIndex += 1
+ Vector.Score = float(FileLine)
+ InstanceIndex += 1
+ File.close()
+ ########################################################################
+ # Clean up temp-files:
+ os.remove(TempScaledFeaturesFileName)
+ os.remove(TempModelFileName)
+ os.remove(TempOutputFileName)
+ def SaveTextModel(self, Stub):
+ ModelPath = "%s.model"%Stub
+ File = open(ModelPath, "wb")
+ File.write(self.SupportVectors)
+ File.close()
+ ScalingPath = "%s.range"%Stub
+ File = open(ScalingPath, "wb")
+ File.write(self.Scaling)
+ File.close()
+ def SaveModel(self, FileName):
+ File = open(FileName, "wb")
+ cPickle.dump("SVM", File)
+ cPickle.dump(self.FeatureList, File)
+ cPickle.dump(self.PValue, File)
+ cPickle.dump(self.Scaling, File)
+ cPickle.dump(self.SupportVectors, File)
+ self.MixtureModel.PickleSelf(File)
+ File.close()
+ def LoadModel(self, FileName):
+ File = open(FileName, "rb")
+ cPickle.load(File) # model type
+ self.FeatureList = cPickle.load(File)
+ self.PValue = cPickle.load(File)
+ self.Scaling = cPickle.load(File)
+ self.SupportVectors = cPickle.load(File)
+ self.MixtureModel = UnpickleMixtureModel(File)
+ File.close()
+ def PreparePySVM(self):
+ """
+ Prepare PySVM to score some features using our model.
+ """
+ # Support vectors:
+ TempModelFileName = "TempModel.txt"
+ File = open(TempModelFileName, "wb")
+ File.write(self.SupportVectors)
+ File.close()
+ PySVM.LoadModel(TempModelFileName)
+ os.remove(TempModelFileName)
+ # Feature ranges:
+ TempScalingFileName = "TempScaling.txt"
+ File = open(TempScalingFileName, "wb")
+ File.write(self.Scaling)
+ File.close()
+ PySVM.LoadScaling(TempScalingFileName)
+ os.remove(TempScalingFileName)
+ # And now, we can score many vectors quickly!
+ self.PySVMReadyFlag = 1
+ def ScoreInstance(self, Features):
+ """
+ Compute the score for this instance.
+ """
+ if not self.PySVMReadyFlag:
+ self.PreparePySVM()
+ Vector = []
+ for FeatureIndex in self.FeatureList:
+ Vector.append(Features[FeatureIndex])
+ Score = PySVM.ScaleAndScore(Vector)
+ return Score
+
+class LogitModel(LearnerClass):
+ """
+ A maximum-likelihood logistic regression model. The model's parameters
+ are tuned using the Newton-Raphson algorithm. See p98 in Hastie, Tibshirani,
+ and Friedman, The Elements of Statistical Learning.
+ """
+ def GetMMBinMultiplier(self):
+ return 40.0 #default
+ def ComputePValues(self, SortedList):
+ pass # The score we output IS a probability!
+ def GetFixedTuples(self, FeatureSet):
+ # Return fixed-up tuples. Keep a random selection of
+ # true and of false tuples.
+ VectorSize = len(self.FeatureList) + 1 # add one for the CONSTANT input
+ AllTuples = [] # entries (TrueFlag, FeatureTuple)
+ KeepCount = min(FeatureSet.TrueCount, FeatureSet.FalseCount, 500)
+ random.shuffle(FeatureSet.TrueVectors)
+ random.shuffle(FeatureSet.FalseVectors)
+ for Vector in FeatureSet.TrueVectors[:KeepCount]:
+ FixedTuple = [1.0]
+ for FeatureIndex in self.FeatureList:
+ FixedTuple.append(Vector.Features[FeatureIndex])
+ AllTuples.append((1, tuple(FixedTuple)))
+ for Vector in FeatureSet.FalseVectors[:KeepCount]:
+ FixedTuple = [1.0]
+ for FeatureIndex in self.FeatureList:
+ FixedTuple.append(Vector.Features[FeatureIndex])
+ AllTuples.append((0, tuple(FixedTuple)))
+ return AllTuples
+ def Train(self, FeatureSet, VerboseFlag = 0):
+ VectorSize = len(self.FeatureList) + 1 # add one for the CONSTANT input
+ AllTuples = self.GetFixedTuples(FeatureSet)
+ random.shuffle(AllTuples)
+ #################################################################
+ # Train the model - set the weight-vector self.Beta
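+ # Iteratively reweighted least squares (Newton-Raphson) update:
+ #   p_i = sigmoid(Beta . x_i)
+ #   W = diag(p_i * (1 - p_i))
+ #   z = X*Beta + W^-1 * (y - p)   (the "adjusted response")
+ #   Beta_new = (X' W X)^-1 X' W z
+ # Iterate until the log-likelihood stops improving or 100 cycles pass.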
+ # Initialize vector self.Beta, all zeroes:
+ self.Beta = zeros(VectorSize)
+ TupleCount = len(AllTuples)
+ # Initialize vector Y, indicating which vectors are true:
+ Y = zeros(TupleCount, FloatType)
+ for I in range(TupleCount):
+ if AllTuples[I][0]:
+ Y[I] = 1.0
+ else:
+ Y[I] = 0.0
+ # Initialize the input matrix X:
+ X = zeros((TupleCount, VectorSize), FloatType)
+ for I in range(TupleCount):
+ #X[I][0] = 1.0
+ for J in range(VectorSize):
+ X[I][J] = AllTuples[I][1][J]
+ XT = transpose(X)
+ PrevLogLikelihood = None
+ CycleCount = 0
+ while 1:
+ # Compute the current log-likelihood:
+ LogLikelihood = 0
+ for I in range(TupleCount):
+ BX = MatrixMultiply(self.Beta, X[I])
+ LogLikelihood += Y[I] * BX
+ LogLikelihood -= math.log(1 + math.exp(BX))
+ if PrevLogLikelihood != None:
+ if VerboseFlag:
+ print "Log likelihood: %s (prev %s)"%(LogLikelihood, PrevLogLikelihood)
+ Improvement = PrevLogLikelihood - LogLikelihood
+ if Improvement < 0.001 and Improvement >= 0:
+ print "Reached optimum: Stop now!"
+ break
+ PrevLogLikelihood = LogLikelihood
+ # Compute the vector P:
+ P = zeros(TupleCount, FloatType)
+ for I in range(TupleCount):
+ self.BetaSum = 0
+ Tuple = AllTuples[I][1]
+ for J in range(VectorSize):
+ self.BetaSum += self.Beta[J] * Tuple[J]
+ Exp = math.exp(self.BetaSum)
+ P[I] = Exp / (1.0 + Exp)
+ # Compute the diagonal matrix W:
+ W = zeros((TupleCount, TupleCount), FloatType)
+ for I in range(TupleCount):
+ W[I][I] = P[I] * (1.0 - P[I])
+ try:
+ WI = numpy.linalg.inv(W)
+ except:
+ traceback.print_exc()
+ print "** Error: Unable to perform logistic regression due to singular matrix."
+ print "Feature list was:", self.FeatureList
+ return None
+ # Compute the "response vector" z:
+ z = MatrixMultiply(X, self.Beta)
+ Diff = Y - P
+ z += MatrixMultiply(WI, Diff)
+ # Compute the new self.Beta:
+ Product = MatrixMultiply(XT, W)
+ Product = MatrixMultiply(Product, X)
+ ProdI = numpy.linalg.inv(Product)
+ Product = MatrixMultiply(ProdI, XT)
+ Product = MatrixMultiply(Product, W)
+ NewBeta = MatrixMultiply(Product, z)
+ if VerboseFlag:
+ print "Old self.Beta:", self.Beta
+ print "New self.Beta:", NewBeta
+ self.Beta = NewBeta
+ CycleCount += 1
+ if CycleCount >= 100:
+ print "100 cycles performed; stopping now!"
+ break
+ def ScoreInstance(self, Features):
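+ # Logistic score: sigmoid(Beta . [1, features]) is already a probability in
+ # (0, 1), which is why ComputePValues() above is a no-op for this model.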
+ FixedFeatures = [1.0,]
+ for FeatureIndex in self.FeatureList:
+ FixedFeatures.append(Features[FeatureIndex])
+ BX = 0
+ for I in range(len(FixedFeatures)):
+ BX += self.Beta[I] * FixedFeatures[I]
+ try:
+ Exp = math.exp(BX)
+ except:
+ print "** exponent unreachable:", BX
+ print "Features:", FixedFeatures
+ raise ValueError, "Features out-of-range!"
+ Score = Exp / (1.0 + Exp)
+ return Score
+ def Test(self, FeatureSet):
+ for Vector in FeatureSet.AllVectors:
+ FixedFeatures = [1.0,]
+ for FeatureIndex in self.FeatureList:
+ FixedFeatures.append(Vector.Features[FeatureIndex])
+ BX = 0
+ for I in range(len(FixedFeatures)):
+ BX += self.Beta[I] * FixedFeatures[I]
+ Exp = math.exp(BX)
+ Vector.Score = Exp / (1.0 + Exp)
+ def SaveModel(self, FileName):
+ File = open(FileName, "wb")
+ cPickle.dump("LOGIT", File)
+ cPickle.dump(self.FeatureList, File)
+ cPickle.dump(self.PValue, File)
+ cPickle.dump(self.Beta, File)
+ File.close()
+ def LoadModel(self, FileName):
+ File = open(FileName, "rb")
+ cPickle.load(File) # model type
+ self.FeatureList = cPickle.load(File)
+ self.PValue = cPickle.load(File)
+ self.Beta = cPickle.load(File)
+ File.close()
+
+def Test():
+ pass
+
+if __name__ == "__main__":
+ # Command-line invocation: Test model loading/saving
+ Test()
diff --git a/MQScoreLDA2.model b/MQScoreLDA2.model
new file mode 100644
index 0000000..c8c3103
Binary files /dev/null and b/MQScoreLDA2.model differ
diff --git a/MQScoreLDA3.model b/MQScoreLDA3.model
new file mode 100644
index 0000000..92606eb
Binary files /dev/null and b/MQScoreLDA3.model differ
diff --git a/MQScoreSVM2.model b/MQScoreSVM2.model
new file mode 100644
index 0000000..28e1bcc
--- /dev/null
+++ b/MQScoreSVM2.model
@@ -0,0 +1,269 @@
+svm_type c_svc
+kernel_type rbf
+gamma 0.142857
+nr_class 2
+total_sv 260
+rho -0.532044
+label 1 -1
+nr_sv 127 133
+SV
+1 1:-0.241379 2:-0.48623 3:-0.90809 4:-0.518519 5:-0.571429 6:-0.0310774
+1 1:-0.37931 2:-0.583895 3:-0.920724 4:-0.1875 5:-0.839286 6:-0.660705 7:1
+1 1:-0.931034 2:-0.290438 3:-0.537272 4:-0.458333 5:-0.0357143 6:0.535738
+1 1:-0.310345 2:-0.5338 3:-0.907917 4:-0.362745 5:-0.243697 6:-0.847589 7:1
+1 1:-1 2:-0.276413 3:-0.675221 4:0.547619 5:-0.632653 6:0.24428 7:1
+1 1:-0.37931 2:-0.390898 3:-0.886994 4:-0.0520833 5:-0.357143 6:-0.367445
+1 1:-0.241379 2:-0.582448 3:-0.883201 4:0.324074 5:-0.428571 6:-0.525268
+1 1:-0.724138 2:-0.468985 3:-0.877878 4:0.378788 5:-0.0649351 6:0.0650669
+1 1:-0.931034 2:-0.288454 3:-0.506243 4:-0.1875 5:-0.357143 6:0.369617
+1 1:-0.37931 2:-0.307705 3:-0.897673 4:-0.1875 5:-0.517857 6:-0.207538 7:1
+1 1:-0.517241 2:-0.344343 3:-0.914243 4:0.702381 5:-0.0816326 6:-0.4745
+1 1:-0.37931 2:-0.252088 3:-0.736665 4:0.0833333 5:-0.357143 6:-0.0929526 7:1
+1 1:-0.517241 2:-0.371896 3:-0.831565 4:-0.380952 5:-0.44898 6:-0.0896593 7:1
+0.9640805968402784 1:-1 2:-0.215181 3:-0.289894 4:-0.690476 5:0.469388 6:0.726722
+1 1:0.862069 2:-0.663598 3:-0.909536 4:-0.617647 5:-0.773109 6:-0.412346 7:-1
+1 1:-0.724138 2:-0.267428 3:-0.456891 4:-0.0151515 5:-0.532468 6:0.100579 7:1
+1 1:-0.862069 2:-0.433816 3:-0.874294 4:-0.277778 5:0.428571 6:0.563044
+1 1:-0.793103 2:-0.285441 3:-0.789383 4:-0.133333 5:-0.742857 6:0.461847
+1 1:-0.724138 2:-0.20629 3:-0.701326 4:0.575758 5:-0.766234 6:-0.0335733 7:1
+1 1:-0.931034 2:-0.172582 3:-0.722064 4:-0.1875 5:-0.357143 6:0.407466 7:1
+1 1:-0.310345 2:-0.393416 3:-0.865847 4:0.27451 5:-0.243697 6:-0.319272 7:1
+1 1:-0.310345 2:-0.419821 3:-0.768026 4:0.0196078 5:0.512605 6:-0.0311913
+1 1:0.172414 2:-0.648493 3:-0.900489 4:-0.458333 5:-0.678571 6:-0.802595
+1 1:0.37931 2:-0.600103 3:-0.909536 4:-0.277778 5:-0.428571 6:-0.547839 7:-1
+1 1:-0.172414 2:-0.0824265 3:-0.651707 4:0.254386 5:-0.0526316 6:-0.282169
+1 1:0.241379 2:-0.120562 3:-0.900489 4:-0.0466667 5:-0.28 6:-0.0313917
+1 1:-0.310345 2:-0.0915925 3:-0.598523 4:0.0196078 5:-0.546218 6:0.436612
+1 1:0.724138 2:-0.829349 3:-0.925247 4:-0.255208 5:-0.517857 6:-0.666227 7:-1
+1 1:-0.793103 2:-0.354364 3:-0.530451 4:-0.35 5:-0.485714 6:0.368674 7:1
+1 1:-0.655172 2:-0.35028 3:-0.782299 4:-0.0972222 5:-0.571429 6:0.153881
+1 1:0.103448 2:-0.77115 3:-0.915565 4:-0.528986 5:-0.776398 6:-0.859992
+1 1:-0.862069 2:-0.250968 3:-0.701443 4:0.444444 5:-0.142857 6:0.444627
+1 1:0.241379 2:-0.619707 3:-0.900489 4:0.3 5:-0.485714 6:-0.049499 7:1
+1 1:-0.586207 2:-0.468397 3:-0.901703 4:0.333333 5:-0.010989 6:-0.217662 7:1
+1 1:-0.655172 2:-0.286756 3:-0.579113 4:-0.277778 5:-0.142857 6:0.252305 7:-1
+1 1:-0.517241 2:-0.243184 3:-0.524577 4:-0.22619 5:-0.0816326 6:0.32376
+1 1:-0.793103 2:-0.343474 3:-0.753986 4:-0.133333 5:-0.228571 6:-0.122669
+1 1:-0.724138 2:-0.330754 3:-0.646377 4:-0.212121 5:0.168831 6:0.160098 7:1
+1 1:-0.931034 2:-0.223299 3:-0.42933 4:-0.458333 5:0.285714 6:0.184891
+1 1:-0.724138 2:-0.230862 3:-0.514869 4:0.181818 5:-0.298701 6:0.235952
+1 1:-0.931034 2:-0.323058 3:-0.362464 4:0.354167 5:-0.678571 6:0.0569779
+1 1:-0.655172 2:-0.285406 3:-0.579994 4:0.263889 5:-0.571429 6:-0.465278
+1 1:-0.37931 2:-0.355082 3:-0.757722 4:0.489583 5:-0.196429 6:-0.20073
+1 1:-0.103448 2:-0.3892 3:-0.825284 4:0.3 5:-0.228571 6:-0.362396
+1 1:-0.517241 2:-0.338635 3:-0.803881 4:-0.690476 5:-0.0816326 6:0.0556964
+1 1:0.103448 2:-0.223735 3:-0.918002 4:0.130435 5:0.00621114 6:0.229439
+1 1:-0.862069 2:-0.236869 3:-0.320418 4:-0.277778 5:0.142857 6:-0.0131616
+1 1:-0.103448 2:-0.297841 3:-0.824209 4:-0.025 5:0.414286 6:0.158518
+1 1:-0.793103 2:-0.380259 3:-0.798152 4:-0.133333 5:-1 6:-0.461338 7:1
+1 1:-0.241379 2:-0.618069 3:-0.882975 4:0.203704 5:0.142857 6:0.0430099 7:1
+1 1:-0.586207 2:-0.245859 3:-0.623829 4:0.166667 5:-0.208791 6:-0.289937 7:1
+1 1:-0.655172 2:-0.34345 3:-0.729769 4:0.263889 5:-0.785714 6:0.492403 7:1
+1 1:-0.310345 2:0.0057331 3:-0.640964 4:0.0196078 5:0.361345 6:0.755297 7:-1
+1 1:-0.586207 2:-0.395567 3:-0.740535 4:0.333333 5:-0.802198 6:-0.237142 7:1
+1 1:-0.793103 2:-0.376799 3:-0.628935 4:0.0833333 5:-0.228571 6:-0.459936 7:1
+1 1:-0.586207 2:-0.372921 3:-0.828409 4:-0.166667 5:0.186813 6:-0.145837 7:1
+1 1:-0.793103 2:-0.63468 3:-0.935978 4:-0.35 5:-1 6:-0.688348 7:-1
+1 1:0.517241 2:-0.902187 3:-0.940958 4:-0.775862 5:-0.73399 6:-0.701773 7:-1
+1 1:-0.793103 2:-0.546968 3:-0.952616 4:-0.566667 5:-0.742857 6:0.131487
+1 1:0.37931 2:-0.476981 3:-0.900489 4:-0.037037 5:-0.52381 6:-0.0628567 7:1
+1 1:-0.586207 2:-0.332219 3:-0.7446 4:-0.5 5:0.582418 6:-0.456102 7:1
+1 1:-0.586207 2:-0.446724 3:-0.870618 4:-1.08333e-008 5:-0.406593 6:-0.150417
+1 1:-0.724138 2:-0.402211 3:-0.857637 4:-0.0151515 5:0.168831 6:0.146792 7:1
+1 1:-0.793103 2:-0.419513 3:-0.863119 4:-0.133333 5:-0.485714 6:-0.403859
+1 1:-0.862069 2:-0.336106 3:-0.7446 4:0.444444 5:-0.142857 6:-0.576431 7:1
+1 1:-0.448276 2:-0.452956 3:-0.719941 4:-0.133333 5:-0.142857 6:0.346591 7:1
+1 1:-0.448276 2:-0.178046 3:-0.696151 4:-0.133333 5:0.542857 6:0.71148 7:-1
+1 1:0.655172 2:-0.936618 3:-0.951321 4:-0.72043 5:-0.751152 6:-0.721208
+1 1:-0.310345 2:-0.224934 3:-0.589522 4:0.27451 5:-0.092437 6:-0.435616
+1 1:-0.586207 2:-0.401171 3:-0.655289 4:-1.08333e-008 5:-0.010989 6:-0.32191
+1 1:-0.0344828 2:-0.709157 3:-0.957154 4:-0.484127 5:-1 6:-0.865976
+1 1:-0.586207 2:-0.385129 3:-0.716265 4:-1.08333e-008 5:0.384615 6:-0.43438
+1 1:-1 2:-0.352717 3:-0.459725 4:-0.0714286 5:0.102041 6:0.610971
+1 1:-0.655172 2:-0.503127 3:-0.873498 4:-0.277778 5:-0.357143 6:-0.404742 7:1
+1 1:-0.310345 2:-0.50236 3:-0.809868 4:-0.107843 5:-0.243697 6:-0.237648
+1 1:0.310345 2:-0.419544 3:-0.850887 4:-0.0833333 5:-0.406593 6:-0.174632 7:1
+1 1:1 2:-0.833087 3:-0.912551 4:-0.638889 5:-0.428571 6:-0.510315 7:-1
+1 1:-0.37931 2:-0.361767 3:-0.669208 4:-0.0520833 5:-0.0357143 6:-0.506907 7:1
+1 1:-0.655172 2:-0.430158 3:-0.66526 4:0.444444 5:-0.571429 6:-0.0674941 7:1
+1 1:-0.862069 2:-0.576875 3:-0.885538 4:-0.277778 5:0.142857 6:-0.766175 7:-1
+1 1:-0.724138 2:-0.355254 3:-0.788071 4:0.378788 5:0.168831 6:-0.536547
+1 1:0.241379 2:-0.634731 3:-0.870644 4:0.3 5:-0.28 6:-0.200825 7:1
+1 1:-0.724138 2:-0.330181 3:-0.807739 4:0.378788 5:-0.0649351 6:-0.171485 7:1
+1 1:-0.310345 2:-0.218355 3:-0.837711 4:-0.235294 5:0.0588235 6:-0.214493 7:-1
+1 1:-0.655172 2:-0.215307 3:-0.777217 4:0.0833333 5:-0.142857 6:0.0727197
+1 1:-0.793103 2:-0.415335 3:-0.76492 4:-0.133333 5:0.0285714 6:-0.469124 7:1
+1 1:-0.655172 2:-0.500687 3:-0.873423 4:0.0833333 5:-0.357143 6:-0.5819 7:1
+1 1:0.37931 2:-0.795349 3:-0.918951 4:-0.358025 5:-0.809524 6:0.115482 7:1
+1 1:-0.241379 2:-0.664206 3:-0.943007 4:-0.398148 5:1.28571e-008 6:-0.652305 7:-1
+1 1:-0.655172 2:-0.385729 3:-0.653315 4:-0.0972222 5:-0.357143 6:-0.242609
+1 1:-0.793103 2:-0.275356 3:-0.69472 4:-0.133333 5:-0.485714 6:0.45331
+1 1:0.37931 2:-0.418604 3:-0.900489 4:-0.037037 5:-0.142857 6:-0.106769 7:1
+1 1:-0.862069 2:-0.261951 3:-0.623804 4:0.203704 5:-0.142857 6:-0.28015
+1 1:0.310345 2:-0.436923 3:-0.900489 4:-0.833333 5:-0.010989 6:0.00381001
+1 1:-0.793103 2:-0.535227 3:-0.877469 4:-0.783333 5:-0.742857 6:-0.0880047
+1 1:-0.931034 2:-0.284357 3:-0.523207 4:-0.458333 5:0.607143 6:0.221204 7:1
+1 1:-0.0344828 2:-0.204539 3:-0.88494 4:0.34127 5:-0.265306 6:0.0131454
+1 1:-0.655172 2:-0.267018 3:-0.588588 4:-0.277778 5:-0.357143 6:0.485339 7:1
+1 1:-0.793103 2:-0.286724 3:-0.772944 4:-0.35 5:0.285714 6:0.166651
+1 1:-0.448276 2:-0.377137 3:-0.844638 4:0.444444 5:-0.314286 6:-0.0379484
+1 1:-0.724138 2:-0.495351 3:-0.900489 4:0.181818 5:-0.766234 6:-0.406567 7:-1
+1 1:0.517241 2:-0.55354 3:-0.932627 4:-0.178161 5:-0.46798 6:0.621521 7:1
+1 1:-0.517241 2:-0.45223 3:-0.685305 4:0.0833333 5:-0.265306 6:-0.165251 7:1
+1 1:-0.310345 2:-0.26761 3:-0.900489 4:-0.107843 5:0.0588235 6:-0.19112
+1 1:-0.862069 2:-0.244531 3:-0.23871 4:0.203704 5:-0.714286 6:0.296864
+1 1:-0.862069 2:-0.30754 3:-0.47998 4:-0.277778 5:0.428571 6:-0.252382 7:1
+1 1:-0.724138 2:-0.646875 3:-0.974077 4:-0.409091 5:-0.766234 6:-0.852232
+1 1:-0.586207 2:-0.478723 3:-0.803049 4:0.5 5:-0.010989 6:-0.192285 7:1
+1 1:-0.793103 2:-0.118171 3:-0.491243 4:0.0833333 5:-0.228571 6:0.184716
+1 1:-0.862069 2:-0.370359 3:-0.602779 4:0.444444 5:-1 6:0.504577 7:1
+1 1:-0.724138 2:-0.291917 3:-0.813544 4:0.378788 5:-0.298701 6:-0.0712661
+0.9339870694824339 1:-0.586207 2:-0.261986 3:-0.366721 4:0.166667 5:-0.406593 6:0.192552
+1 1:0.172414 2:-0.667548 3:-0.900489 4:0.173611 5:-0.25 6:-0.537069 7:1
+1 1:-0.793103 2:-0.24428 3:-0.501567 4:0.0833333 5:-0.742857 6:0.412821
+1 1:-0.655172 2:-0.560635 3:-0.940958 4:-0.0972222 5:-0.785714 6:-0.412343 7:-1
+1 1:-0.793103 2:-0.211425 3:-0.576712 4:0.0833333 5:0.0285714 6:-0.0393997
+0.9270779802951742 1:-0.586207 2:-0.142073 3:-0.703038 4:0.5 5:-0.406593 6:0.186126
+1 1:-0.724138 2:-0.179104 3:-0.422208 4:0.181818 5:-0.0649351 6:-0.0716954
+1 1:-0.862069 2:-0.29406 3:-0.810751 4:0.203704 5:-0.142857 6:-7.71596e-005 7:1
+1 1:-0.448276 2:-0.363591 3:-0.700354 4:-0.277778 5:-0.142857 6:0.528887 7:1
+1 1:-0.172414 2:-0.179011 3:-0.875021 4:0.0263158 5:0.353383 6:-0.241964
+1 1:-0.586207 2:-0.215836 3:-0.849651 4:-1.08333e-008 5:-0.208791 6:-0.287691 7:1
+1 1:-0.793103 2:-0.299007 3:-0.408911 4:0.516667 5:-0.485714 6:-0.111755
+1 1:-1 2:-0.521259 3:-0.853444 4:-0.0714286 5:-0.632653 6:0.424164
+1 1:-0.793103 2:-0.335992 3:-0.637842 4:0.0833333 5:0.285714 6:-0.368557
+1 1:-0.724138 2:-0.363766 3:-0.900489 4:-0.0151515 5:-0.766234 6:-0.592984
+1 1:-0.862069 2:-0.337604 3:-0.352754 4:0.203704 5:-0.428571 6:-0.149161
+-1 1:-0.793103 2:-0.300039 3:-0.685204 4:0.0833333 5:-0.228571 6:-0.287248 7:1
+-1 1:-0.586207 2:-0.430449 3:-0.834167 4:-1.08333e-008 5:-0.604396 6:-0.331483 7:1
+-1 1:0.0344828 2:-0.584435 3:-0.851202 4:0.280303 5:-0.415584 6:-0.629043 7:1
+-1 1:-0.862069 2:-0.286357 3:-0.623804 4:-0.037037 5:-0.428571 6:0.0477259 7:-1
+-1 1:-0.0344828 2:-0.390569 3:-0.900489 4:-0.277778 5:-0.265306 6:-0.328902
+-1 1:-0.37931 2:-0.508333 3:-0.900489 4:-0.0520833 5:-0.678571 6:-0.488107 7:1
+-0.3461158665854429 1:0.517241 2:-0.760923 3:-0.909536 4:-0.402299 5:-0.46798 6:-0.676679 7:-1
+-1 1:0.172414 2:-0.609246 3:-0.884986 4:-0.00694444 5:-0.142857 6:-0.640898
+-1 1:-0.103448 2:-0.749119 3:-0.957154 4:-0.566667 5:-0.357143 6:0.390458 7:1
+-0.1986656559283165 1:-0.103448 2:-0.505162 3:-0.821382 4:-0.458333 5:-0.228571 6:-0.640563 7:1
+-1 1:-0.931034 2:-0.450195 3:-0.735707 4:-0.1875 5:-0.357143 6:-0.156017 7:1
+-1 1:-0.586207 2:-0.395101 3:-0.680834 4:-0.333333 5:0.78022 6:-0.378797 7:1
+-1 1:-0.0344828 2:-0.35478 3:-0.865847 4:-0.174603 5:-0.142857 6:-0.0275198
+-1 1:-0.241379 2:-0.526708 3:-0.911543 4:-0.037037 5:-0.714286 6:-0.559884 7:1
+-1 1:-0.655172 2:-0.200549 3:-0.76409 4:0.0833333 5:-0.571429 6:0.270576 7:-1
+-1 1:-1 2:-0.375035 3:-0.832913 4:-0.380952 5:0.102041 6:-0.0804503 7:1
+-1 1:-0.103448 2:-0.628021 3:-0.944364 4:-0.241667 5:-0.228571 6:0.0472809
+-1 1:-0.862069 2:-0.343055 3:-0.870644 4:-0.037037 5:-0.142857 6:0.345233
+-1 1:-0.241379 2:-0.442694 3:-0.87441 4:0.0833333 5:0.142857 6:-0.588923
+-1 1:-0.724138 2:-0.406114 3:-0.837092 4:-0.0151515 5:-0.532468 6:-0.00378432 7:1
+-1 1:-0.724138 2:-0.233608 3:-0.662433 4:-0.212121 5:0.168831 6:-0.0642961 7:1
+-1 1:-0.862069 2:-0.440206 3:-0.855997 4:0.203704 5:-0.142857 6:-0.389992
+-1 1:-0.862069 2:-0.456476 3:-0.681721 4:-0.277778 5:-0.142857 6:-0.403798 7:1
+-0.1188327971508159 1:-0.448276 2:-0.226053 3:-0.88494 4:-0.133333 5:-0.314286 6:-0.131822 7:-1
+-1 1:0.0344828 2:-0.489735 3:-0.894901 4:-0.409091 5:-0.298701 6:-0.524047 7:1
+-1 1:-0.655172 2:-0.287704 3:-0.897673 4:0.0833333 5:0.0714285 6:0.131022 7:-1
+-1 1:-0.655172 2:-0.48276 3:-0.828798 4:-0.277778 5:-0.571429 6:-0.111716 7:1
+-1 1:-0.655172 2:-0.392846 3:-0.8066 4:0.263889 5:-1 6:-0.542138
+-1 1:-0.448276 2:-0.373677 3:-0.909536 4:-0.133333 5:-0.485714 6:-0.164521
+-0.1911177572129832 1:-1 2:-0.390458 3:-0.834381 4:-0.380952 5:-0.632653 6:0.570309
+-1 1:-0.862069 2:-0.404916 3:-0.73092 4:-0.037037 5:-0.428571 6:-0.394964
+-1 1:-0.793103 2:-0.424249 3:-0.544062 4:-0.35 5:-0.228571 6:-0.389475
+-1 1:-0.793103 2:-0.270138 3:-0.611044 4:0.0833333 5:0.0285714 6:0.196232
+-1 1:-0.862069 2:-0.358739 3:-0.900489 4:-0.277778 5:0.428571 6:0.270363
+-1 1:-0.862069 2:-0.243284 3:-0.801284 4:-0.037037 5:-0.142857 6:0.270616 7:1
+-1 1:-0.517241 2:-0.591815 3:-0.917591 4:0.0833333 5:-0.44898 6:-0.637813 7:1
+-1 1:0.0344828 2:-0.662881 3:-0.921478 4:-0.212121 5:-0.0649351 6:-0.463562
+-1 1:0.517241 2:-0.75733 3:-0.909536 4:-0.477011 5:-0.73399 6:-0.566725 7:-1
+-1 1:-0.862069 2:-0.358723 3:-0.844736 4:-0.037037 5:-0.714286 6:0.0890594 7:1
+-1 1:-0.793103 2:-0.36291 3:-0.820885 4:-0.35 5:-0.485714 6:0.754638
+-1 1:-0.793103 2:-0.306744 3:-0.756006 4:0.0833333 5:0.285714 6:0.0400099 7:-1
+-1 1:-0.517241 2:-0.397803 3:-0.932062 4:-0.22619 5:-0.265306 6:0.123637
+-1 1:-0.0344828 2:-0.434735 3:-0.909536 4:-0.380952 5:0.102041 6:-0.418503
+-1 1:0.724138 2:-0.164899 3:-0.903678 4:0.015625 5:-0.357143 6:-0.4674
+-1 1:-0.448276 2:-0.322144 3:-0.900489 4:-0.133333 5:-0.142857 6:-0.499298
+-1 1:-0.931034 2:-0.309543 3:-0.638945 4:0.0833333 5:0.285714 6:-0.240527
+-1 1:-0.931034 2:-0.293594 3:-0.510508 4:0.0833333 5:-0.357143 6:0.157461 7:1
+-1 1:-0.862069 2:-0.45875 3:-0.915472 4:0.203704 5:0.142857 6:0.274205
+-1 1:-0.931034 2:-0.429477 3:-0.544742 4:-0.458333 5:-0.0357143 6:-0.262015 7:1
+-1 1:-1 2:-0.516554 3:-0.957154 4:0.238095 5:-0.632653 6:0.232113
+-1 1:-0.655172 2:-0.306011 3:-0.679549 4:-0.0972222 5:-0.785714 6:-0.118516 7:-1
+-1 1:-0.37931 2:-0.542896 3:-0.788843 4:-0.1875 5:0.125 6:0.12049 7:-1
+-1 1:0.37931 2:-0.674517 3:-0.908419 4:-0.277778 5:-0.333333 6:-0.372233
+-1 1:-0.517241 2:-0.380952 3:-0.92255 4:0.0833333 5:-0.44898 6:-0.51216 7:1
+-1 1:0.724138 2:-0.817156 3:-0.928262 4:-0.59375 5:-0.919643 6:-0.367139 7:-1
+-1 1:-0.172414 2:-0.54858 3:-0.900489 4:0.140351 5:-0.323308 6:0.0448816
+-1 1:-1 2:-0.316595 3:-0.901703 4:-0.0714286 5:0.102041 6:-0.541845 7:1
+-1 1:-1 2:-0.357677 3:-0.281824 4:-0.0714286 5:0.102041 6:0.610997 7:1
+-1 1:-0.517241 2:-0.435548 3:-0.883168 4:-0.0714286 5:-0.265306 6:-0.0986343
+-1 1:-0.862069 2:-0.187698 3:-0.598523 4:0.203704 5:0.714286 6:0.300126
+-1 1:0.37931 2:-0.401564 3:-0.900489 4:-0.037037 5:0.047619 6:-0.476828 7:-1
+-1 1:-0.517241 2:-0.497936 3:-0.908423 4:-0.0714286 5:-0.0816326 6:-0.195657 7:1
+-1 1:-0.586207 2:-0.404157 3:-0.730835 4:0.5 5:-0.406593 6:-0.60907
+-1 1:-0.37931 2:-0.403389 3:-0.882191 4:-0.1875 5:-0.517857 6:-0.0374236
+-1 1:-0.241379 2:-0.544544 3:-0.883168 4:-0.398148 5:-0.571429 6:-0.102519 7:1
+-1 1:-0.793103 2:-0.345087 3:-0.898462 4:-0.133333 5:0.542857 6:0.195684
+-1 1:-0.517241 2:-0.256095 3:-0.876263 4:0.0833333 5:0.102041 6:-0.182766 7:-1
+-1 1:-0.862069 2:-0.535223 3:-0.934693 4:-0.037037 5:-0.714286 6:0.384471 7:1
+-1 1:-0.37931 2:-0.324887 3:-0.796261 4:0.0833333 5:-0.678571 6:-0.0394031 7:-1
+-1 1:-0.931034 2:-0.390808 3:-0.493696 4:0.354167 5:-0.357143 6:0.0723801
+-1 1:-0.862069 2:-0.321522 3:-0.459725 4:-0.277778 5:-0.142857 6:0.0106136 7:1
+-1 1:-0.931034 2:-0.196354 3:-0.838718 4:-0.1875 5:-0.0357143 6:-0.181903 7:1
+-1 1:0.448276 2:-0.68652 3:-0.900489 4:-0.0714286 5:-0.265306 6:-0.61311 7:1
+-1 1:0.37931 2:-0.66825 3:-0.900489 4:-0.358025 5:-0.428571 6:-0.617936 7:1
+-1 1:-0.931034 2:-0.191597 3:-0.782381 4:-0.458333 5:0.285714 6:0.719136 7:-1
+-1 1:-0.862069 2:-0.301338 3:-0.71994 4:0.444444 5:-0.714286 6:0.0272952
+-1 1:-0.655172 2:-0.297984 3:-0.866199 4:-0.0972222 5:-0.142857 6:0.246378
+-0.00194321300395835 1:-0.517241 2:-0.455946 3:-0.868246 4:-0.380952 5:-1 6:0.506886
+-1 1:-0.655172 2:-0.480972 3:-0.845555 4:-0.277778 5:0.0714285 6:-0.187785
+-1 1:-0.655172 2:-0.483952 3:-0.836982 4:0.444444 5:-0.785714 6:-0.519897 7:1
+-1 1:-0.172414 2:-0.288114 3:-0.900489 4:-0.31579 5:-0.729323 6:-0.0571029
+-1 1:-0.37931 2:-0.331432 3:-0.811018 4:0.625 5:-0.196429 6:0.277337
+-1 1:-0.517241 2:-0.388103 3:-0.745131 4:-0.0714286 5:-0.265306 6:-0.207788
+-1 1:-0.241379 2:-0.543316 3:-0.84407 4:-0.037037 5:0.142857 6:-0.625793
+-1 1:-0.0344828 2:-0.481044 3:-0.900489 4:-0.174603 5:-0.387755 6:-0.39622 7:1
+-1 1:0.241379 2:-0.516918 3:-0.900489 4:-0.393333 5:-0.485714 6:-0.379752
+-1 1:-0.0344828 2:-0.528156 3:-0.900489 4:-0.793651 5:-0.510204 6:0.0612873 7:1
+-1 1:-0.931034 2:-0.393626 3:-0.631499 4:0.0833333 5:-0.0357143 6:-0.341374
+-1 1:-0.862069 2:-0.391379 3:-0.902731 4:-0.277778 5:-0.142857 6:-0.037767 7:1
+-1 1:-0.793103 2:-0.28445 3:-0.521809 4:-0.35 5:0.285714 6:-0.180078
+-0.09605236892975597 1:-0.172414 2:-0.520633 3:-0.900489 4:-0.429825 5:0.218045 6:-0.532098
+-1 1:-0.241379 2:-0.402277 3:-0.843681 4:-0.157407 5:-0.285714 6:-0.0713978 7:-1
+-1 1:-1 2:-0.19968 3:-0.598831 4:0.547619 5:0.102041 6:-0.106444
+-1 1:-1 2:-0.486976 3:-0.740206 4:0.238095 5:-0.632653 6:0.902842
+-1 1:-0.103448 2:-0.681568 3:-0.973066 4:-0.241667 5:-0.485714 6:0.216375 7:1
+-1 1:0.655172 2:-0.867517 3:-0.928277 4:-0.370968 5:-0.419355 6:-0.719936 7:1
+-1 1:-0.655172 2:-0.360159 3:-0.810736 4:-0.0972222 5:-0.571429 6:-0.596847 7:1
+-1 1:-0.517241 2:-0.494547 3:-0.880431 4:0.392857 5:-1 6:-0.243662 7:1
+-0.64657580806115 1:0.862069 2:-1 3:-0.969184 4:-0.362745 5:-0.394958 6:-0.726131
+-1 1:-0.793103 2:-0.332484 3:-0.797365 4:-0.133333 5:-0.228571 6:-0.259068
+-1 1:-0.931034 2:-0.557352 3:-0.892522 4:0.625 5:-0.678571 6:-0.350259
+-1 1:-0.793103 2:-0.467025 3:-0.860495 4:-0.35 5:-0.228571 6:0.319392 7:1
+-1 1:-0.655172 2:-0.368204 3:-0.900489 4:-0.0972222 5:-0.357143 6:-0.138288
+-1 1:-0.862069 2:-0.427017 3:-0.498743 4:-0.277778 5:-0.714286 6:0.288458
+-1 1:-0.862069 2:-0.501067 3:-0.839876 4:0.203704 5:-1 6:-0.124326 7:1
+-0.2258421797454636 1:-1 2:-0.494694 3:-0.753129 4:0.238095 5:-1 6:0.521165 7:-1
+-1 1:-0.241379 2:-0.543085 3:-0.863171 4:0.0833333 5:-0.714286 6:-0.373437
+-1 1:-0.931034 2:-0.414547 3:-0.705799 4:-0.1875 5:-0.0357143 6:-0.10521 7:1
+-1 1:-0.724138 2:-0.281936 3:-0.72282 4:-0.212121 5:-0.298701 6:-0.204346 7:1
+-1 1:-0.655172 2:-0.436776 3:-0.724993 4:-0.0972222 5:-0.571429 6:0.300358
+-1 1:-0.793103 2:-0.336565 3:-0.369472 4:-0.35 5:-1 6:0.249166
+-1 1:-0.37931 2:-0.445792 3:-0.79556 4:0.21875 5:-0.0357143 6:-0.441957 7:1
+-1 1:-0.862069 2:-0.210173 3:-0.826262 4:-0.037037 5:-0.142857 6:-0.168494
+-1 1:-0.862069 2:-0.50748 3:-0.870644 4:-0.037037 5:-0.714286 6:0.0609867 7:1
+-1 1:-1 2:-0.391642 3:-0.746024 4:-0.0714286 5:-0.632653 6:0.327364 7:1
+-1 1:-0.931034 2:-0.455162 3:-0.80779 4:-0.1875 5:-0.678571 6:0.76007 7:1
+-1 1:-0.37931 2:-0.356715 3:-0.725025 4:-0.0520833 5:-0.517857 6:-0.163341
+-1 1:-0.310345 2:-0.442106 3:-0.837711 4:-0.107843 5:-0.697479 6:0.0491281
+-1 1:0.0344828 2:-0.325442 3:-0.898997 4:-0.113636 5:-0.649351 6:-0.303956
+-1 1:-0.862069 2:-0.584626 3:-0.957154 4:-0.277778 5:-0.428571 6:0.904204
+-1 1:0.310345 2:-0.672325 3:-0.900489 4:-0.25 5:-0.10989 6:-0.48558 7:-1
+-1 1:-0.655172 2:-0.410823 3:-0.889056 4:0.0833333 5:0.0714285 6:-0.0893431 7:-1
+-1 1:-0.793103 2:-0.411392 3:-0.853012 4:-0.133333 5:-0.485714 6:-0.13411 7:1
+-1 1:-0.655172 2:-0.330315 3:-0.667955 4:-0.638889 5:0.0714285 6:-0.0845123
+-1 1:-0.862069 2:-0.421901 3:-0.809639 4:0.203704 5:-0.142857 6:-0.245588
+-1 1:-0.241379 2:-0.512065 3:-0.797986 4:-0.037037 5:0.142857 6:-0.205094 7:-1
+-1 1:-1 2:-0.470644 3:-0.76633 4:0.238095 5:-0.632653 6:0.381814
+-1 1:-0.448276 2:-0.450273 3:-0.900489 4:0.0111111 5:0.0285714 6:-0.240713
+-1 1:-0.103448 2:-0.531 3:-0.826215 4:0.0833333 5:0.157143 6:-0.187406
+-1 1:-0.241379 2:-0.60538 3:-0.892715 4:0.203704 5:-0.428571 6:-0.55211
+-1 1:-0.862069 2:-0.578037 3:-0.975431 4:0.203704 5:-0.428571 6:-0.165502 7:1
+-1 1:-1 2:-0.415978 3:-0.509378 4:-0.0714286 5:-0.265306 6:-0.400697
+-1 1:0.37931 2:-0.605814 3:-0.900489 4:-0.277778 5:-0.428571 6:-0.720478 7:1
diff --git a/MQScoreSVM2.range b/MQScoreSVM2.range
new file mode 100644
index 0000000..a62e96a
--- /dev/null
+++ b/MQScoreSVM2.range
@@ -0,0 +1,9 @@
+x
+-1 1
+1 6 35
+2 -62.2700119 172.55780029
+3 -1.98770213 13.04912186
+4 0 0.92307693
+5 0 0.77777779
+6 0.00559854 0.97112942
+7 0 2
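
MQScoreSVM2.range above (like MQScoreSVM3.range further down) follows the layout written by libsvm's svm-scale utility: an "x" marker line, the target interval for scaled values (here -1 to 1), then one row per feature index giving the minimum and maximum observed for that feature during training. As a rough, hedged sketch of how such a file is applied before scoring (generic libsvm arithmetic in the package's Python idiom, not code shipped here; the function names are made up):

    # Rescale raw feature values into [Lower, Upper] using a .range file,
    # mirroring what svm-scale does before the SVM model is evaluated.
    def LoadRangeFile(Path):
        Lines = [Line.strip() for Line in open(Path).readlines() if Line.strip()]
        (Lower, Upper) = [float(Bit) for Bit in Lines[1].split()]  # e.g. "-1 1"
        Limits = {}
        for Line in Lines[2:]:
            Bits = Line.split()
            Limits[int(Bits[0])] = (float(Bits[1]), float(Bits[2]))
        return (Lower, Upper, Limits)

    def ScaleFeatures(Features, Lower, Upper, Limits):
        Scaled = {}
        for (Index, Value) in Features.items():
            (Min, Max) = Limits[Index]
            Scaled[Index] = Lower + (Upper - Lower) * (Value - Min) / (Max - Min)
        return Scaled
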
diff --git a/MQScoreSVM3.model b/MQScoreSVM3.model
new file mode 100644
index 0000000..7b3209e
--- /dev/null
+++ b/MQScoreSVM3.model
@@ -0,0 +1,282 @@
+svm_type c_svc
+kernel_type rbf
+gamma 0.142857
+nr_class 2
+total_sv 273
+rho -0.954745
+label 1 -1
+nr_sv 134 139
+SV
+1 1:-0.272727 2:0.0899649 3:-0.55925 4:-0.0260869 5:-0.233333 6:-0.303918 7:1
+1 1:-0.454545 2:-0.148332 3:-0.890517 4:-0.49913 5:-0.693333 6:-0.360163 7:-1
+1 1:-0.490909 2:-0.0506354 3:-0.619732 4:-0.0434782 5:-0.25463 6:-0.339243 7:1
+1 1:-0.854545 2:0.0834544 3:-0.556154 4:0.0434783 5:-0.0873015 6:-0.565834 7:1
+1 1:-0.127273 2:-0.503121 3:-0.936973 4:-0.386189 5:-0.699346 6:-0.644947
+1 1:0.163636 2:-0.100406 3:-0.611421 4:-0.403727 5:-0.574074 6:-0.578468 7:-1
+1 1:-0.490909 2:-0.291929 3:-0.860365 4:-0.391304 5:-0.467593 6:-0.621126 7:-1
+1 1:0.2 2:-0.626494 3:-0.905317 4:-0.854398 5:-0.762274 6:-0.685701 7:-1
+1 1:-0.563636 2:0.115605 3:-0.577448 4:0.422925 5:-0.186869 6:-0.197135
+1 1:-0.6 2:-0.0945951 3:-0.826219 4:-0.10559 5:-0.391534 6:-0.134741 7:1
+1 1:-0.745455 2:0.0514783 3:-0.826219 4:-0.0179028 5:0.0522876 6:-0.294844 7:1
+1 1:-0.381818 2:0.294622 3:-0.405372 4:0.15942 5:-0.621399 6:-0.574228
+1 1:-0.309091 2:0.101064 3:-0.5954 4:-0.424288 5:-0.295019 6:0.21399 7:1
+1 1:-0.636364 2:0.0927785 3:-0.545102 4:0.147826 5:-0.744444 6:-0.209692
+1 1:-0.2 2:0.128124 3:-0.64662 4:0.108696 5:-0.121528 6:-0.57484 7:1
+1 1:-0.236364 2:-0.529273 3:-0.936973 4:-0.932679 5:-0.752688 6:-0.769762 7:-1
+1 1:-0.818182 2:-0.0711355 3:-0.7052 4:0.252174 5:-0.488889 6:-0.532768 7:1
+1 1:-0.636364 2:0.239167 3:-0.498404 4:0.147826 5:-0.488889 6:-0.170057
+1 1:-0.6 2:0.159823 3:-0.826219 4:0.0931678 5:0.338624 6:0.161563
+1 1:0.2 2:-0.620566 3:-0.936973 4:-0.660263 5:-0.524548 6:-0.6463 7:-1
+1 1:-0.381818 2:-0.0359829 3:-0.611421 4:-0.690821 5:-0.8107 6:-0.689936 7:-1
+1 1:-0.309091 2:-0.00291753 3:-0.611421 4:-0.424288 5:-0.559387 6:0.348496 7:1
+1 1:-0.818182 2:0.0103802 3:-0.606302 4:-0.0260869 5:-0.148148 6:-0.223739 7:1
+1 1:-0.563636 2:0.0300404 3:-0.591458 4:-0.146245 5:-0.30303 6:-0.33707 7:1
+1 1:-0.127273 2:-0.173354 3:-0.678562 4:-0.386189 5:-0.54902 6:-0.779538 7:1
+1 1:-0.781818 2:-0.186729 3:-1 4:-0.217391 5:0.118056 6:-0.252483 7:1
+1 1:-0.563636 2:-0.153324 3:-0.776537 4:-0.146245 5:-0.419192 6:-0.454266 7:1
+1 1:-0.0181818 2:-0.139491 3:-0.611421 4:-0.548766 5:-0.861862 6:-0.772751 7:-1
+1 1:-0.709091 2:0.0405295 3:-0.552922 4:0.0434783 5:-0.858025 6:-0.290584 7:-1
+1 1:-0.527273 2:0.212174 3:-0.277383 4:0.542533 5:-0.555556 6:-0.21165
+1 1:0.418182 2:-0.57029 3:-0.905317 4:-0.616681 5:-0.791383 6:-0.504048 7:-1
+1 1:-0.272727 2:-0.144955 3:-0.602385 4:-0.791304 5:-0.659259 6:-0.873601 7:-1
+1 1:-0.963636 2:0.289556 3:-0.187726 4:0.13834 5:-0.30303 6:0.612709
+1 1:0.381818 2:-0.593798 3:-0.905317 4:-0.695652 5:-0.733796 6:-0.521568 7:-1
+1 1:-0.381818 2:-0.377484 3:-0.905317 4:-0.536232 5:-1 6:0.274908 7:-1
+1 1:-0.345455 2:0.0193282 3:-0.581561 4:-0.254658 5:-0.269841 6:-0.469301 7:1
+1 1:-0.0181818 2:-0.419755 3:-0.905317 4:-0.774383 5:-0.585586 6:-0.533297 7:-1
+1 1:0.272727 2:-0.132596 3:-0.591458 4:-0.768116 5:-0.88642 6:-0.570232 7:-1
+1 1:-0.890909 2:0.266417 3:-0.555322 4:0.123746 5:-0.213675 6:0.54908
+1 1:-0.490909 2:-0.333979 3:-0.905317 4:-0.217391 5:-0.787037 6:-0.684209 7:-1
+1 1:0.0181818 2:-0.536225 3:-0.905317 4:-0.505721 5:-0.52924 6:-0.304684 7:-1
+1 1:-0.309091 2:0.0246884 3:-0.611421 4:-0.352324 5:-0.206897 6:-0.272258 7:1
+1 1:-0.127273 2:-0.387605 3:-0.905317 4:-0.570332 5:-0.77451 6:-0.460648 7:-1
+1 1:0.0909091 2:-0.575961 3:-0.905317 4:-0.895652 5:-0.808333 6:-0.656387 7:-1
+1 1:0.0909091 2:-0.102027 3:-0.591458 4:-0.686956 5:-0.616667 6:-0.735073 7:-1
+1 1:-0.0181818 2:-0.167028 3:-0.611421 4:-0.60517 5:-0.792793 6:-0.694207 7:-1
+1 1:-0.309091 2:0.0412439 3:-0.591458 4:0.00749625 5:-0.206897 6:-0.237956 7:1
+1 1:-0.890909 2:0.0751203 3:-0.591458 4:0.284281 5:-0.606838 6:-0.0229926 7:1
+1 1:-0.0909091 2:-0.113376 3:-0.591458 4:-0.880745 5:-1 6:-0.873557
+1 1:-0.309091 2:-0.297658 3:-0.843902 4:-0.568216 5:-0.911877 6:-0.52133 7:-1
+1 1:-0.345455 2:-0.050942 3:-0.596014 4:-0.776398 5:-0.452381 6:-0.788429 7:-1
+1 1:-0.381818 2:0.155907 3:-0.5954 4:-0.304348 5:-0.526749 6:0.0555391 7:1
+1 1:-0.454545 2:0.0369172 3:-0.611421 4:-0.749565 5:-0.08 6:-0.150702
+1 1:-0.163636 2:-0.219047 3:-0.702456 4:-0.177866 5:0.00673406 6:-0.582193 7:-1
+1 1:-0.563636 2:-0.0202913 3:-0.912263 4:-0.0513833 5:-0.535354 6:0.169819 7:1
+1 1:-0.127273 2:0.0487245 3:-0.60144 4:-0.0179028 5:-0.323529 6:-0.638323 7:-1
+1 1:0.0181818 2:-0.578235 3:-0.936973 4:-0.505721 5:-0.730994 6:-0.605073 7:-1
+1 1:-0.2 2:-0.455601 3:-0.905317 4:-0.608696 5:-0.920139 6:-1 7:-1
+1 1:-0.490909 2:-0.286568 3:-0.860246 4:-0.565217 5:-0.680556 6:-0.869284 7:-1
+1 1:0.454545 2:-0.697657 3:-0.905317 4:-0.833043 5:-0.948889 6:-0.773849 7:-1
+1 1:0.236364 2:-0.291848 3:-0.678562 4:-0.193676 5:-0.593434 6:-0.781461 7:-1
+1 1:-0.854545 2:0.0856328 3:-0.571902 4:-0.10559 5:-0.269841 6:-0.355254 7:1
+1 1:-0.381818 2:0.0797368 3:-0.579788 4:-0.381642 5:-0.148148 6:-0.0679593 7:1
+1 1:-0.272727 2:-0.155045 3:-0.668056 4:-0.373913 5:-0.403704 6:-0.628571 7:-1
+0.09087011095639566 1:-0.381818 2:0.058083 3:-0.584987 4:-0.381642 5:0.135802 6:-0.0235703 7:1
+1 1:-0.854545 2:-0.0110974 3:-0.611421 4:-0.254658 5:-0.634921 6:-0.145705 7:1
+1 1:-0.381818 2:-0.017075 3:-0.611421 4:-0.149758 5:-0.242798 6:-0.265751 7:1
+1 1:-0.163636 2:-0.0153693 3:-0.591458 4:-0.177866 5:-0.612795 6:-0.466029 7:-1
+1 1:-0.672727 2:0.209886 3:-0.676371 4:-0.231121 5:0.210526 6:-0.177047
+1 1:-0.709091 2:0.224216 3:-0.492565 4:0.275362 5:-0.148148 6:0.364779
+1 1:-0.854545 2:-0.0389125 3:-0.661235 4:-0.254658 5:-0.269841 6:-0.750148
+1 1:-0.890909 2:-0.0382322 3:-0.82403 4:-0.197324 5:-0.410256 6:0.176862 7:-1
+1 1:-0.418182 2:-0.109256 3:-0.806766 4:-0.357859 5:0.179487 6:0.282263
+1 1:-0.345455 2:0.12064 3:-0.591458 4:0.118012 5:0.00396823 6:-0.266596
+1 1:-0.818182 2:0.238362 3:-0.542748 4:-0.0260869 5:0.362963 6:0.371868
+1 1:-0.0181818 2:-0.501633 3:-0.905317 4:-0.717979 5:-0.585586 6:-0.628016 7:-1
+1 1:-0.490909 2:-0.254761 3:-0.905317 4:-0.652174 5:-0.893519 6:-0.538317 7:-1
+1 1:-0.309091 2:0.048215 3:-0.609596 4:0.00749625 5:-0.0306513 6:-0.351411 7:1
+1 1:-0.6 2:-0.0666729 3:-0.826219 4:-0.10559 5:-0.148148 6:-0.0614172
+1 1:-0.563636 2:-0.173841 3:-0.812676 4:-0.146245 5:0.277778 6:-0.197305 7:1
+1 1:0.236364 2:-0.713155 3:-0.936973 4:-0.905138 5:-1 6:-0.913983
+1 1:-0.381818 2:0.152434 3:-0.547043 4:0.31401 5:-0.432099 6:-0.0954
+1 1:-0.381818 2:-0.0557794 3:-0.587214 4:-0.536232 5:-0.526749 6:-0.548287 7:-1
+1 1:-0.0181818 2:-0.111359 3:-0.587214 4:-0.548766 5:-0.792793 6:-0.707813 7:-1
+1 1:-0.709091 2:-0.0426993 3:-0.591458 4:-0.42029 5:-0.858025 6:-0.675295
+1 1:-0.818182 2:0.265098 3:-0.537694 4:-0.165217 5:0.192593 6:0.304924
+1 1:-0.672727 2:-0.0963553 3:-0.75851 4:-0.121281 5:-0.730994 6:0.0472178 7:1
+1 1:-0.309091 2:-0.0503368 3:-0.591458 4:-0.208396 5:-0.118774 6:-0.370396 7:1
+1 1:-0.418182 2:0.233661 3:-0.257027 4:0.0434783 5:-0.213675 6:-0.492917 7:1
+1 1:0.0909091 2:-0.173653 3:-0.611421 4:-0.478261 5:-0.872222 6:-0.698216 7:-1
+1 1:-0.527273 2:-0.0688602 3:-0.587214 4:-0.637051 5:-1 6:-0.744235 7:-1
+1 1:-0.6 2:-0.124142 3:-0.826219 4:-0.10559 5:-0.148148 6:-0.0407903 7:1
+1 1:-0.490909 2:-0.236045 3:-0.913266 4:-0.826087 5:-0.25463 6:-0.605505
+1 1:0.309091 2:-0.790448 3:-0.905317 4:-1 5:-0.888889 6:-0.632747 7:-1
+1 1:-0.163636 2:0.0221065 3:-0.591458 4:-0.114624 5:0.00673406 6:-0.308139 7:1
+1 1:0.0181818 2:-0.0518016 3:-0.591458 4:-0.560641 5:-0.932749 6:-0.742613 7:-1
+1 1:-0.127273 2:-0.421377 3:-0.905317 4:-0.754476 5:-0.849673 6:-0.754287 7:-1
+1 1:-0.854545 2:-0.092065 3:-0.612299 4:-0.403727 5:-0.634921 6:-0.614144
+0.8219388582076175 1:-0.818182 2:0.264198 3:-0.316984 4:0.252174 5:0.192593 6:0.130613
+1 1:-0.490909 2:-0.145743 3:-0.795196 4:-0.217391 5:-0.574074 6:-0.641057 7:-1
+1 1:-0.0181818 2:-0.0894404 3:-0.611421 4:-0.548766 5:-0.585586 6:-0.404195 7:-1
+1 1:1 2:-1 3:-0.905317 4:-0.935786 5:-0.921368 6:-0.745086 7:-1
+1 1:-0.709091 2:-0.14343 3:-0.797298 4:-0.42029 5:-0.432099 6:-0.400887 7:1
+1 1:0.563636 2:-0.86391 3:-0.905317 4:-1 5:-0.855346 6:-0.916095 7:-1
+1 1:-0.672727 2:0.049684 3:-0.591458 4:-0.340961 5:-0.327485 6:-0.385873 7:1
+1 1:-0.927273 2:0.0955423 3:-0.60144 4:0.0434783 5:-0.574074 6:-0.140094 7:1
+1 1:-0.2 2:0.0159662 3:-0.5956 4:0.369565 5:-0.28125 6:-0.597218 7:1
+1 1:-0.236364 2:-0.0456932 3:-0.591458 4:-0.663394 5:-0.587814 6:-0.288249
+1 1:-0.563636 2:0.0163316 3:-0.591458 4:-0.241107 5:-0.535354 6:-0.319443 7:1
+1 1:-0.454545 2:-0.385983 3:-0.905317 4:-0.916522 5:-0.693333 6:-0.672395 7:-1
+1 1:-0.781818 2:0.122266 3:-0.591458 4:-0.217391 5:-0.520833 6:-0.175818 7:1
+1 1:-0.381818 2:0.0776673 3:-0.591458 4:0.15942 5:-0.526749 6:-0.360997
+1 1:-0.563636 2:0.150934 3:-0.77094 4:-0.241107 5:0.393939 6:0.483988 7:-1
+1 1:0.0909091 2:-0.651052 3:-0.921145 4:-1 5:-0.936111 6:-0.946851 7:-1
+1 1:-0.818182 2:0.30899 3:-0.33703 4:-0.0260869 5:0.362963 6:0.273405
+1 1:0.454545 2:-0.792387 3:-0.905317 4:-0.874783 5:-0.897778 6:-0.817506 7:-1
+1 1:-0.672727 2:-0.0224593 3:-0.826219 4:-0.231121 5:-0.327485 6:-0.114133 7:1
+1 1:-0.490909 2:0.359379 3:-0.575952 4:-0.217391 5:-0.680556 6:0.294065
+1 1:-0.854545 2:0.0704604 3:-0.570923 4:-0.254658 5:-0.0873015 6:-0.44293
+1 1:-0.527273 2:0.196481 3:-0.581406 4:0.0888469 5:0.222222 6:0.0283912
+1 1:0.0909091 2:-0.576766 3:-0.905317 4:-0.791304 5:-0.936111 6:-0.547386 7:-1
+1 1:-0.454545 2:-0.38071 3:-0.936973 4:-0.833043 5:-0.693333 6:-0.910928 7:-1
+1 1:0.0909091 2:-0.11038 3:-0.591458 4:-0.530435 5:-0.744444 6:-0.605258 7:-1
+1 1:-0.236364 2:-0.24063 3:-0.936973 4:-0.192146 5:-0.175627 6:-0.272347
+1 1:-0.454545 2:0.00996574 3:-0.81553 4:0.168696 5:0.124444 6:0.10609
+1 1:-0.309091 2:-0.401403 3:-0.905317 4:-0.856072 5:-0.559387 6:-0.549036 7:-1
+1 1:0.2 2:-0.2221 3:-0.635208 4:-0.611729 5:-0.762274 6:-0.785483 7:-1
+1 1:0.0181818 2:-0.101844 3:-0.611421 4:-0.670481 5:-0.663743 6:-0.474902 7:-1
+1 1:-0.2 2:-0.159109 3:-0.591458 4:-0.478261 5:-0.760417 6:-0.504889 7:-1
+1 1:0.127273 2:-0.167568 3:-0.611421 4:-0.541888 5:-0.688347 6:-0.777346 7:-1
+1 1:-0.636364 2:-0.0458581 3:-0.764868 4:-0.0608696 5:-0.488889 6:-0.283288 7:1
+1 1:0.0545455 2:-0.128377 3:-0.587214 4:-0.732441 5:-0.672365 6:-0.664909 7:-1
+1 1:-0.781818 2:-0.115252 3:-0.695252 4:0.173913 5:-0.361111 6:-0.580836 7:-1
+1 1:-0.381818 2:0.140729 3:-0.537694 4:0.00483096 5:-0.90535 6:-0.0959735
+-1 1:-0.418182 2:0.0246992 3:-0.611421 4:0.0434783 5:-0.017094 6:-0.62525
+-1 1:-0.0181818 2:-0.42518 3:-0.905317 4:-0.379553 5:-0.792793 6:-0.631876 7:-1
+-0.8415080727149786 1:-0.527273 2:-0.0638818 3:-0.611421 4:-0.455577 5:-0.444444 6:-0.660896 7:1
+-1 1:-0.527273 2:0.0671946 3:-0.611421 4:-0.0926276 5:-0.333333 6:-0.405899
+-1 1:-0.527273 2:0.00797 3:-0.681303 4:-0.00189033 5:-0.333333 6:-0.667839
+-1 1:-0.709091 2:0.00571178 3:-0.611421 4:-0.304348 5:-0.290123 6:-0.400502 7:1
+-1 1:-0.854545 2:-0.0139259 3:-0.60144 4:-0.10559 5:-0.634921 6:-0.463205 7:1
+-1 1:-0.0545455 2:-0.612309 3:-0.936973 4:-0.942029 5:-1 6:-0.918739 7:-1
+-1 1:-0.418182 2:-0.0729808 3:-0.826219 4:-0.197324 5:-0.311966 6:-0.149266 7:-1
+-1 1:-0.672727 2:0.00858909 3:-0.611421 4:-0.0114416 5:0.0760234 6:-0.543789 7:1
+-1 1:0.0545455 2:-0.642649 3:-0.936973 4:-0.839465 5:-0.803419 6:-0.450206 7:-1
+-1 1:0.163636 2:-0.53689 3:-0.905317 4:-0.652174 5:-0.452381 6:-0.752031 7:-1
+-1 1:-0.818182 2:0.0618789 3:-0.611421 4:-0.582609 5:-0.488889 6:0.301099
+-1 1:0.309091 2:-0.712624 3:-0.927801 4:-1 5:-0.833333 6:-0.507934 7:-1
+-1 1:0.236364 2:-0.526204 3:-0.905317 4:-0.667984 5:-0.709596 6:-0.489389 7:-1
+-1 1:-0.563636 2:-0.022088 3:-0.591458 4:-0.146245 5:-0.767677 6:-0.681754 7:1
+-1 1:-0.0909091 2:-0.496776 3:-0.905317 4:-0.880745 5:-0.926984 6:-0.767767 7:-1
+-1 1:-0.890909 2:0.00483222 3:-0.611636 4:-0.197324 5:-0.017094 6:-0.0546463
+-0.7494969301875734 1:-0.854545 2:0.163124 3:-0.60144 4:-0.10559 5:-0.0873015 6:-0.462605 7:-1
+-1 1:-0.0545455 2:-0.405538 3:-0.936973 4:-0.362319 5:-0.503086 6:-0.631079 7:-1
+-1 1:-0.0545455 2:-0.477893 3:-0.886726 4:-0.478261 5:-0.716049 6:-0.632654 7:-1
+-1 1:-0.381818 2:0.0339564 3:-0.825444 4:-0.227053 5:-0.053498 6:0.300788 7:-1
+-1 1:0.236364 2:-0.0857729 3:-0.60144 4:-0.667984 5:-0.477273 6:-0.543342
+-1 1:-0.272727 2:-0.443833 3:-0.905317 4:-0.791304 5:-1 6:-0.868337 7:-1
+-1 1:-0.709091 2:-0.0400854 3:-0.826219 4:0.15942 5:-0.716049 6:-0.102501 7:-1
+-1 1:-0.781818 2:-0.0221854 3:-0.650554 4:-0.347826 5:-0.520833 6:-0.196758 7:1
+-1 1:-0.418182 2:-0.0599039 3:-0.650441 4:-0.117057 5:-0.115385 6:-0.653608 7:1
+-1 1:-0.418182 2:-0.0606882 3:-0.642602 4:0.0434783 5:-0.508547 6:0.427144
+-0.3512983867561833 1:-0.0909091 2:-0.488606 3:-0.936973 4:-0.582609 5:-0.634921 6:-0.84491 7:-1
+-1 1:-0.127273 2:-0.567727 3:-0.936973 4:-1 5:-0.849673 6:-0.817741 7:-1
+-1 1:-0.636364 2:-0.0148505 3:-0.542748 4:-0.165217 5:-0.233333 6:-0.488847 7:1
+-1 1:-0.2 2:-0.0997141 3:-0.591458 4:-0.478261 5:-0.760417 6:-0.597278 7:-1
+-1 1:-0.163636 2:-0.0900011 3:-0.611421 4:-0.683794 5:-0.612795 6:-0.350616 7:-1
+-1 1:0.0909091 2:-0.644727 3:-0.936973 4:-1 5:-0.808333 6:-0.701207 7:-1
+-1 1:-0.927273 2:0.065699 3:-0.549035 4:-0.478261 5:0.0648148 6:0.163265
+-1 1:-0.2 2:-0.167746 3:-0.611421 4:-0.673913 5:-0.680556 6:-0.707054 7:-1
+-1 1:-0.2 2:-0.13371 3:-0.613205 4:-0.543478 5:-0.520833 6:-0.503354 7:1
+-1 1:-0.781818 2:-0.0455872 3:-0.599737 4:-0.347826 5:-0.0416667 6:-0.668734 7:1
+-1 1:-0.6 2:-0.082194 3:-0.700873 4:-0.10559 5:-0.026455 6:-0.619646
+-1 1:-0.709091 2:-0.170798 3:-0.864976 4:-0.42029 5:-0.148148 6:0.246331
+-1 1:0.0545455 2:-0.532674 3:-0.905317 4:-0.839465 5:-0.803419 6:-0.299747 7:-1
+-1 1:-0.454545 2:0.202734 3:-0.685363 4:0.168696 5:0.124444 6:-0.358694
+-1 1:-0.236364 2:-0.0876724 3:-0.611421 4:-0.528752 5:-0.752688 6:-0.391895 7:1
+-1 1:-0.672727 2:-0.0794497 3:-0.678562 4:-0.340961 5:-0.596491 6:-0.394035 7:1
+-1 1:-0.2 2:-0.181021 3:-0.611421 4:-0.478261 5:-0.361111 6:-0.783595 7:1
+-0.2088601833578218 1:-0.163636 2:-0.0591844 3:-0.618165 4:-0.683794 5:-0.457912 6:-0.543508 7:-1
+-1 1:0.0909091 2:-0.524417 3:-0.905317 4:-0.269565 5:-0.361111 6:-0.617018 7:-1
+-0.2444993380120906 1:-0.490909 2:-0.0527465 3:-0.611421 4:-0.565217 5:-0.680556 6:-0.405953 7:1
+-1 1:-0.818182 2:0.122344 3:-0.506547 4:-0.0260869 5:-0.148148 6:0.154165 7:-1
+-1 1:-0.745455 2:0.00956307 3:-0.627254 4:-0.263427 5:-0.0980392 6:-0.430725 7:1
+-1 1:0.163636 2:-0.231461 3:-0.611421 4:-0.602484 5:-0.574074 6:-0.573201 7:-1
+-1 1:-0.381818 2:0.0543607 3:-0.611421 4:-0.0724637 5:-0.526749 6:-0.38618
+-1 1:-0.236364 2:-0.10504 3:-0.611421 4:-0.663394 5:-0.917563 6:-0.707793 7:-1
+-1 1:-0.927273 2:0.101661 3:-0.330395 4:0.217391 5:-0.361111 6:-0.278765
+-1 1:-0.454545 2:-0.269005 3:-0.855685 4:-0.165217 5:-0.693333 6:0.282706 7:1
+-1 1:-0.672727 2:-0.165249 3:-0.826219 4:-0.121281 5:-0.596491 6:0.135902
+-1 1:0.236364 2:-0.571522 3:-0.929989 4:-0.810277 5:-0.593434 6:-0.639986 7:-1
+-1 1:-0.0181818 2:-0.101771 3:-0.613178 4:-0.548766 5:-0.585586 6:-0.332644 7:-1
+-1 1:-0.927273 2:0.0470043 3:-0.611421 4:-0.130435 5:-0.148148 6:0.390503
+-1 1:-0.781818 2:0.109517 3:-0.429301 4:0.173913 5:-0.520833 6:-0.496133
+-1 1:-0.636364 2:-0.359085 3:-0.936973 4:-0.478261 5:-0.233333 6:-0.453834 7:1
+-1 1:-0.454545 2:0.0248246 3:-0.611421 4:0.168696 5:-0.182222 6:-0.622496
+-1 1:-0.890909 2:-0.0157021 3:-0.826219 4:-0.0367893 5:-0.017094 6:0.313367 7:-1
+-1 1:0.0909091 2:-0.582931 3:-0.936973 4:-0.73913 5:-0.744444 6:-0.765332 7:-1
+-1 1:-0.309091 2:-0.040331 3:-0.613178 4:-0.28036 5:-0.295019 6:-0.61389
+-1 1:-0.672727 2:-0.0809354 3:-0.630219 4:-0.340961 5:-0.461988 6:-0.0198897
+-1 1:-0.781818 2:0.133051 3:-0.790454 4:-0.217391 5:-0.0416667 6:0.236305 7:-1
+-1 1:0.0181818 2:-0.132608 3:-0.611421 4:-0.78032 5:-0.798246 6:-0.717319 7:-1
+-1 1:-0.781818 2:0.0225945 3:-0.561739 4:0.0434783 5:-0.361111 6:-0.440202
+-0.6494670709789474 1:-0.636364 2:0.210515 3:-0.591458 4:-0.373913 5:-0.488889 6:0.00938966 7:-1
+-1 1:-0.709091 2:-0.0227442 3:-0.591458 4:-0.42029 5:-0.716049 6:0.0218606 7:1
+-0.01740323773261104 1:-0.709091 2:-0.105364 3:-0.674621 4:0.0434783 5:-0.716049 6:-0.375908 7:-1
+-1 1:-0.127273 2:-0.157604 3:-0.619244 4:-0.508951 5:-0.699346 6:-0.635428 7:-1
+-1 1:-0.890909 2:0.0469361 3:-0.407931 4:0.284281 5:-0.606838 6:-0.506912 7:1
+-1 1:-0.2 2:-0.232313 3:-0.881596 4:-0.478261 5:-0.680556 6:-0.764303 7:-1
+-1 1:0.0909091 2:-0.550348 3:-0.934601 4:-0.582609 5:-0.616667 6:-0.396859 7:-1
+-1 1:-0.527273 2:0.0596615 3:-0.591458 4:-0.274102 5:-0.444444 6:-0.250664
+-1 1:-0.381818 2:0.0936253 3:-0.587214 4:-0.381642 5:-0.716049 6:-0.52036 7:-1
+-1 1:-0.490909 2:-0.050703 3:-0.707439 4:-0.130435 5:-0.574074 6:-0.57135 7:1
+-1 1:-0.236364 2:0.0243524 3:-0.591458 4:-0.528752 5:-0.752688 6:-0.407722 7:1
+-1 1:-0.781818 2:-0.0646515 3:-0.826219 4:-0.347826 5:0.118056 6:0.529903 7:-1
+-1 1:-0.2 2:-0.0601066 3:-0.589336 4:-0.608696 5:-0.440972 6:-0.629401 7:-1
+-1 1:-0.309091 2:-0.111427 3:-0.649367 4:-0.352324 5:0.145594 6:-0.778008
+-1 1:-0.0181818 2:-0.559929 3:-0.905317 4:-0.548766 5:-0.792793 6:-0.651412 7:-1
+-1 1:-0.0181818 2:-0.120456 3:-0.611421 4:-0.548766 5:-0.585586 6:-0.772799 7:-1
+-1 1:-0.272727 2:0.131706 3:-0.591472 4:-0.373913 5:-0.574074 6:-0.206779 7:1
+-1 1:0.0181818 2:-0.187157 3:-0.611421 4:-0.670481 5:-0.596491 6:-0.77588 7:-1
+-1 1:-0.854545 2:0.017969 3:-0.611421 4:-0.10559 5:-0.81746 6:0.401227
+-1 1:-0.163636 2:-0.055253 3:-0.660318 4:-0.43083 5:-0.457912 6:-0.21914
+-1 1:-0.345455 2:-0.176929 3:-0.734192 4:-0.0310559 5:-0.72619 6:-0.795786 7:1
+-1 1:-0.272727 2:-0.12477 3:-0.702316 4:-0.304348 5:-0.403704 6:-0.716498 7:1
+-1 1:-0.927273 2:0.0457185 3:-0.564576 4:-0.130435 5:-0.148148 6:-0.066251 7:1
+-0.4671105187941058 1:-0.309091 2:-0.163095 3:-0.613823 4:-0.28036 5:-0.206897 6:-0.629506 7:-1
+-1 1:-0.672727 2:0.209898 3:-0.611421 4:-0.121281 5:-0.192982 6:-0.0768508
+-0.01182579585874976 1:-0.0909091 2:-0.106798 3:-0.611421 4:-0.403727 5:-0.342857 6:-0.588994
+-1 1:-0.854545 2:-0.0165895 3:-0.660564 4:-0.403727 5:-0.634921 6:-0.168617 7:1
+-1 1:-0.636364 2:0.0707515 3:-0.623186 4:0.356522 5:-0.616667 6:-0.204432
+-1 1:-0.272727 2:-0.0206975 3:-0.611421 4:-0.513043 5:-0.659259 6:-0.284838 7:1
+-1 1:-0.781818 2:-0.0197077 3:-0.591458 4:0.173913 5:-0.520833 6:-0.5623
+-1 1:-0.454545 2:-0.249197 3:-0.826219 4:0.0852174 5:-0.488889 6:-0.576205 7:-1
+-1 1:-0.709091 2:0.026341 3:-0.826219 4:-0.304348 5:-0.716049 6:0.759555
+-1 1:-0.781818 2:-0.0885713 3:-0.826219 4:0.0434783 5:-0.361111 6:-0.280461
+-1 1:0.0545455 2:-0.467114 3:-0.905317 4:-0.785953 5:-0.606838 6:-0.762643 7:-1
+-1 1:-0.781818 2:0.0130292 3:-0.618827 4:-0.217391 5:-0.840278 6:0.388225
+-1 1:0.527273 2:-0.804917 3:-0.936973 4:-0.759197 5:-0.803419 6:-0.625366 7:-1
+-1 1:-0.563636 2:-0.118494 3:-0.68463 4:-0.146245 5:-0.651515 6:-0.363501 7:1
+-1 1:-0.927273 2:0.0696857 3:-0.67799 4:0.0434783 5:0.0648148 6:-0.163748
+-1 1:0.127273 2:-0.585144 3:-0.905317 4:-0.592789 5:-0.688347 6:-0.751939 7:-1
+-1 1:-0.0181818 2:-0.551865 3:-0.905317 4:-0.943596 5:-0.930931 6:-0.657208 7:-1
+-1 1:-0.563636 2:-0.119306 3:-0.620539 4:-0.146245 5:-0.419192 6:-0.710607 7:1
+-1 1:-0.345455 2:-0.0397587 3:-0.620817 4:-0.254658 5:-0.178571 6:-0.696565
+-1 1:0.0545455 2:-0.319987 3:-0.826219 4:-0.518395 5:-0.410256 6:-0.612174 7:-1
+-1 1:-0.854545 2:0.0353539 3:-0.471227 4:-0.552795 5:0.0952381 6:-0.244297
+-1 1:-0.0909091 2:-0.491882 3:-0.905317 4:-0.880745 5:-0.853968 6:-0.789059 7:-1
+-1 1:0.490909 2:-0.8078 3:-0.905317 4:-1 5:-0.899782 6:-0.818685 7:-1
+-1 1:-0.236364 2:-0.123518 3:-0.611421 4:-0.326788 5:-0.670251 6:-0.527405 7:-1
+-1 1:-0.2 2:-0.127267 3:-0.611421 4:-0.543478 5:-0.680556 6:-0.48972 7:1
+-1 1:-0.890909 2:-0.00932866 3:-0.826219 4:-0.357859 5:-0.213675 6:0.281728 7:1
+-1 1:0.2 2:-0.513874 3:-0.889033 4:-0.660263 5:-0.643411 6:-0.63252 7:-1
+-1 1:-0.163636 2:-0.122697 3:-0.591458 4:-0.620553 5:-0.845118 6:-0.617291 7:-1
+-1 1:-0.127273 2:-0.130189 3:-0.591458 4:-0.508951 5:-0.849673 6:-0.69219 7:-1
+-1 1:-0.490909 2:0.103316 3:-0.60144 4:-0.0434782 5:-0.680556 6:-0.489871
+-1 1:-0.818182 2:-0.0500891 3:-0.591458 4:-0.304348 5:-0.318518 6:-0.5543 7:1
+-1 1:-0.309091 2:-0.0364527 3:-0.591458 4:-0.712144 5:-0.206897 6:-0.438237 7:1
+-1 1:-0.490909 2:-0.0807608 3:-0.611421 4:-0.391304 5:-0.574074 6:-0.604395 7:1
+-0.3713394347709519 1:-0.0545455 2:-0.418901 3:-0.905317 4:-0.478261 5:-0.503086 6:-0.61105 7:-1
+-1 1:-0.127273 2:-0.484589 3:-0.936973 4:-0.938619 5:-1 6:-0.677746 7:-1
+-1 1:-0.818182 2:0.168407 3:-0.555322 4:-0.0260869 5:0.0222223 6:0.180617 7:-1
+-1 1:-0.563636 2:-0.0494529 3:-0.591458 4:-0.43083 5:-0.186869 6:-0.759459 7:1
+-1 1:-0.672727 2:-0.00816984 3:-0.591458 4:0.0983982 5:-0.192982 6:-0.160027
+-1 1:-0.0181818 2:-0.14728 3:-0.611421 4:-0.323149 5:-0.447447 6:-0.685109
+-1 1:-0.0181818 2:-0.559724 3:-0.936973 4:-0.661575 5:-0.723724 6:-0.802932 7:-1
+-1 1:-0.854545 2:0.0184649 3:-0.60144 4:-0.10559 5:-0.81746 6:0.335871
+-1 1:-0.0181818 2:-0.0845103 3:-0.591458 4:-0.60517 5:-0.516516 6:-0.71556 7:1
+-1 1:0.0181818 2:-0.531072 3:-0.905317 4:-0.78032 5:-0.865497 6:-0.627418 7:-1
+-1 1:-0.127273 2:0.179987 3:-0.611421 4:-0.44757 5:-0.699346 6:-0.367645 7:-1
+-1 1:-0.381818 2:-0.00539883 3:-0.611421 4:-0.149758 5:0.135802 6:-0.694783 7:1
+-1 1:-0.345455 2:0.0139488 3:-0.611421 4:-0.180124 5:-0.361111 6:-0.55566
+-1 1:0.0909091 2:-0.135784 3:-0.591458 4:-0.791304 5:-0.297222 6:-0.646688 7:-1
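
For orientation, MQScoreSVM3.model above is a standard libsvm c-svc model: the header records the RBF kernel, gamma, rho and the class labels, and every SV row starts with its coefficient (alpha_i * y_i) followed by the support vector in sparse index:value form. Features whose scaled value is exactly zero are omitted, which is why some rows have no "7:" entry. A hedged sketch of how such a model yields a decision value (generic libsvm arithmetic, not the scoring code in this package):

    import math

    # f(x) = sum_i Coef_i * exp(-Gamma * ||SV_i - x||^2) - Rho
    # A positive value maps to the first label in the header ("label 1 -1").
    def DecisionValue(ScaledFeatures, Coefficients, SupportVectors, Gamma, Rho):
        Total = 0.0
        for (Coef, SV) in zip(Coefficients, SupportVectors):
            Dist2 = 0.0
            for Index in set(SV.keys()) | set(ScaledFeatures.keys()):
                Diff = SV.get(Index, 0.0) - ScaledFeatures.get(Index, 0.0)
                Dist2 += Diff * Diff
            Total += Coef * math.exp(-Gamma * Dist2)
        return Total - Rho
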
diff --git a/MQScoreSVM3.range b/MQScoreSVM3.range
new file mode 100644
index 0000000..e4e1a5e
--- /dev/null
+++ b/MQScoreSVM3.range
@@ -0,0 +1,9 @@
+x
+-1 1
+1 9 64
+2 -117.65477753 122.49903107
+3 -1.90801275 5.78528643
+4 0 0.95833331
+5 0 0.78260869
+6 0.02718957 0.96912074
+7 0 2
diff --git a/MS2DB.c b/MS2DB.c
new file mode 100644
index 0000000..3fc2d7f
--- /dev/null
+++ b/MS2DB.c
@@ -0,0 +1,688 @@
+//Title: MS2DB.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+// Code to parse databases in MS2DB format.
+
+#include "CMemLeak.h"
+#include "MS2DB.h"
+#include "Spliced.h"
+#include "expat.h"
+#include "Errors.h"
+
+#define MS2DB_BUFFER_SIZE 102400
+
+// Macro for basic error-checking. Assumes Cursor is set. If the given
+// expression isn't true, we set the error flag and bail out of our current
+// function. (When Cursor->ErrorFlag is set, all our callback functions
+// will terminate immediately)
+#define XML_ASSERT(expr) \
+ if (!(expr)) \
+ {\
+ Cursor->ErrorFlag = 1;\
+ REPORT_ERROR_S(25, #expr);\
+ return;\
+ }
+
+#define XML_ASSERT_RETVAL(expr) \
+ if (!(expr)) \
+ {\
+ Cursor->ErrorFlag = 1;\
+ REPORT_ERROR_S(25, #expr);\
+ return 0;\
+ }
+
+// MS2ParseState tells us which tag we are currently inside.
+// The allowed "moves" (from tags to children) are those listed
+// in the XML schema. However, in the interest of extensibility,
+// we simply *ignore* any tags we aren't expecting to see.
+typedef enum MS2ParseState
+{
+ evMS2DBNone = 0,
+ evMS2DBDatabase,
+ evMS2DBGene,
+ evMS2DBGeneLocus,
+ evMS2DBGeneNotes,
+ evMS2DBExon,
+ evMS2DBExonSequence,
+ evMS2DBExonExtends,
+ evMS2DBExonLinkFrom,
+ evMS2DBExonMod,
+ evMS2DBExonModCrossReference,
+ evMS2DBGeneCrossReference
+} MS2ParseState;
+
+typedef struct MS2ParseCursor
+{
+ SearchInfo* Info;
+ int State;
+ GeneStruct* CurrentGene;
+ ExonStruct* CurrentExon;
+ int CurrentExonIndex;
+ int CurrentLinkIndex;
+ int CurrentExonSequenceIndex;
+ int ErrorFlag;
+ TrieNode* Root;
+ XML_Parser Parser;
+ int DBNumber;
+} MS2ParseCursor;
+
+// Free an MS2ParseCursor, including its attached gene.
+void FreeMS2ParseCursor(MS2ParseCursor* Cursor)
+{
+ if (!Cursor)
+ {
+ return;
+ }
+ if (Cursor->CurrentGene)
+ {
+ FreeGene(Cursor->CurrentGene);
+ Cursor->CurrentGene = NULL;
+ }
+ SafeFree(Cursor);
+}
+
+// expat callback: Handle character data in the body of a tag.
+void MS2CharacterDataHandler(void* UserData, const XML_Char* String, int Length)
+{
+ MS2ParseCursor* Cursor;
+ int NewLength;
+ //
+ Cursor = (MS2ParseCursor*)UserData;
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+ switch (Cursor->State)
+ {
+ case evMS2DBExonSequence:
+ // Incorporate this sequence into the exon sequence.
+ XML_ASSERT(Cursor->CurrentExon);
+ //XML_ASSERT(Cursor->CurrentExon->Sequence);
+ if (!Cursor->CurrentExon->Sequence)
+ {
+ printf("* Warning: No sequence!?\n");
+ }
+ NewLength = strlen(Cursor->CurrentExon->Sequence) + Length;
+ if (NewLength > Cursor->CurrentExon->Length)
+ {
+ REPORT_ERROR_IS(29, Cursor->CurrentExonIndex, Cursor->CurrentGene->Name);
+ Cursor->ErrorFlag = 1;
+ return;
+ }
+ strncat(Cursor->CurrentExon->Sequence, String, Length);
+ break;
+ default:
+ break;
+ }
+}
+
+// Parse attributes of a Gene tag.
+void ParseGeneAttributes(MS2ParseCursor* Cursor, const char** Attributes)
+{
+ int AttributeIndex;
+ const char* Name;
+ const char* Value;
+ //
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+
+ XML_ASSERT(Cursor->CurrentGene);
+
+ for (AttributeIndex = 0; Attributes[AttributeIndex]; AttributeIndex += 2)
+ {
+ Name = Attributes[AttributeIndex];
+ Value = Attributes[AttributeIndex + 1];
+ if (!CompareStrings(Name, "Name"))
+ {
+ strncpy(Cursor->CurrentGene->Name, Value, GENE_NAME_LENGTH);
+ }
+ else if (!CompareStrings(Name, "ExonCount"))
+ {
+ Cursor->CurrentGene->ExonCount = atoi(Value);
+ }
+        else if(!CompareStrings(Name,"Chromosome"))
+        {
+            Cursor->CurrentGene->ChromosomeNumber = atoi(Value);
+        }
+        else if(!CompareStrings(Name,"ForwardFlag"))
+        {
+            Cursor->CurrentGene->ForwardFlag = atoi(Value);
+        }
+        else if (GlobalOptions->XMLStrictFlag)
+        {
+            REPORT_WARNING_ISS(28, XML_GetCurrentLineNumber(Cursor->Parser), Name, "Gene");
+        }
+ }
+ // Allocate exons:
+ XML_ASSERT(Cursor->CurrentGene->ExonCount >= 1 && Cursor->CurrentGene->ExonCount <= 10000);
+ Cursor->CurrentGene->Exons = (ExonStruct*)calloc(Cursor->CurrentGene->ExonCount, sizeof(ExonStruct));
+}
+
+// Parse attributes of a Locus tag.
+void ParseLocusAttributes(MS2ParseCursor* Cursor, const char** Attributes)
+{
+ int AttributeIndex;
+ const char* Name;
+ const char* Value;
+ //
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+
+ XML_ASSERT(Cursor->CurrentGene);
+
+ for (AttributeIndex = 0; Attributes[AttributeIndex]; AttributeIndex += 2)
+ {
+ Name = Attributes[AttributeIndex];
+ Value = Attributes[AttributeIndex + 1];
+ if (!CompareStrings(Name, "chromosome"))
+ {
+ Cursor->CurrentGene->ChromosomeNumber = atoi(Value);
+ }
+ else if (!CompareStrings(Name, "ForwardFlag"))
+ {
+ Cursor->CurrentGene->ForwardFlag = atoi(Value);
+ }
+ else if (GlobalOptions->XMLStrictFlag)
+ {
+ REPORT_WARNING_ISS(28, XML_GetCurrentLineNumber(Cursor->Parser), Name, "Locus");
+ }
+ }
+}
+
+// Parse attributes of a LinkFrom tag.
+// <LinkFrom Index="0" Score="3.14" AA="G" />
+// If ExtendsFlag is true, this exon EXTENDS the previous one (no splicing required)
+void ParseLinkFromAttributes(MS2ParseCursor* Cursor, const char** Attributes, int ExtendsFlag)
+{
+ int AttributeIndex;
+ const char* Name;
+ const char* Value;
+ char EdgeAA = '\0';
+ ExonEdge* Edge = NULL;
+ int BackExonIndex;
+ ExonStruct* Exon;
+ //
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+ Exon = Cursor->CurrentExon;
+ XML_ASSERT(Exon);
+
+ for (AttributeIndex = 0; Attributes[AttributeIndex]; AttributeIndex += 2)
+ {
+ Name = Attributes[AttributeIndex];
+ Value = Attributes[AttributeIndex + 1];
+ if (!CompareStrings(Name, "Index"))
+ {
+ BackExonIndex = atoi(Value);
+ XML_ASSERT(BackExonIndex >= 0 && BackExonIndex < Cursor->CurrentGene->ExonCount);
+ Edge = (ExonEdge*)calloc(1, sizeof(ExonEdge));
+ if (ExtendsFlag)
+ {
+ Edge->Power = 0;
+ }
+ else
+ {
+ Edge->Power = 1;
+ }
+ Edge->Exon = Cursor->CurrentGene->Exons + BackExonIndex;
+ Edge->Source = Exon;
+ // Insert the exon into the list:
+ if (Exon->BackEdgeTail)
+ {
+ Exon->BackEdgeTail->Next = Edge;
+ }
+ else
+ {
+ Exon->BackEdgeHead = Edge;
+ }
+ Exon->BackEdgeTail = Edge;
+ Exon->BackEdgeCount++;
+ Edge->Exon->ForwardEdgeCount++;
+ }
+ else if (!CompareStrings(Name, "AA"))
+ {
+ EdgeAA = Value[0];
+ }
+ else if (GlobalOptions->XMLStrictFlag)
+ {
+ REPORT_WARNING_ISS(28, XML_GetCurrentLineNumber(Cursor->Parser), Name, "Link");
+ }
+ }
+ if (Edge)
+ {
+ Edge->AA = EdgeAA;
+ }
+}
+// Parse attributes of an Exon tag.
+void ParseExonAttributes(MS2ParseCursor* Cursor, const char** Attributes)
+{
+ int AttributeIndex;
+ const char* Name;
+ const char* Value;
+ //
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+
+ XML_ASSERT(Cursor->CurrentGene);
+
+ Cursor->CurrentExonIndex = -1; // invalidate it; the attributes will fix it
+ Cursor->CurrentExon = NULL;
+
+ // First, loop through the attributes to get the index, so we can point at the correct exon:
+ for (AttributeIndex = 0; Attributes[AttributeIndex]; AttributeIndex += 2)
+ {
+ Name = Attributes[AttributeIndex];
+ Value = Attributes[AttributeIndex + 1];
+ if (!CompareStrings(Name, "index"))
+ {
+ Cursor->CurrentExonIndex = atoi(Value);
+ }
+ }
+ XML_ASSERT(Cursor->CurrentExonIndex >= 0 && Cursor->CurrentExonIndex < Cursor->CurrentGene->ExonCount);
+
+ Cursor->CurrentExon = Cursor->CurrentGene->Exons + Cursor->CurrentExonIndex;
+ Cursor->CurrentExon->Gene = Cursor->CurrentGene;
+ // Initialize the exon START and END to -1 (that is, not on a known chromosome):
+ Cursor->CurrentExon->Start = -1;
+ Cursor->CurrentExon->End = -1;
+ Cursor->CurrentExon->Index = Cursor->CurrentExonIndex;
+
+ // Now loop through and read attribute values:
+ for (AttributeIndex = 0; Attributes[AttributeIndex]; AttributeIndex += 2)
+ {
+ Name = Attributes[AttributeIndex];
+ Value = Attributes[AttributeIndex + 1];
+ if (!CompareStrings(Name, "Start"))
+ {
+ Cursor->CurrentExon->Start = atoi(Value);
+ }
+ else if (!CompareStrings(Name, "End"))
+ {
+ Cursor->CurrentExon->End = atoi(Value);
+ }
+ else if (!CompareStrings(Name, "Prefix"))
+ {
+ strncpy(Cursor->CurrentExon->Prefix, Value, 2);
+ }
+ else if (!CompareStrings(Name, "Suffix"))
+ {
+ strncpy(Cursor->CurrentExon->Suffix, Value, 2);
+ }
+ else if (!CompareStrings(Name, "Index"))
+ {
+ ;
+ }
+ else if (GlobalOptions->XMLStrictFlag)
+ {
+ REPORT_WARNING_ISS(28, XML_GetCurrentLineNumber(Cursor->Parser), Name, "Exon");
+ }
+ }
+ XML_ASSERT(Cursor->CurrentExonIndex >= 0 && Cursor->CurrentExonIndex < Cursor->CurrentGene->ExonCount);
+}
+
+// Parse attributes of an ExonSequence tag.
+void ParseExonSequenceAttributes(MS2ParseCursor* Cursor, const char** Attributes)
+{
+ int AttributeIndex;
+ const char* Name;
+ const char* Value;
+ //
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+ XML_ASSERT(Cursor->CurrentExon);
+
+ for (AttributeIndex = 0; Attributes[AttributeIndex]; AttributeIndex += 2)
+ {
+ Name = Attributes[AttributeIndex];
+ Value = Attributes[AttributeIndex + 1];
+ if (!CompareStrings(Name, "Length"))
+ {
+ Cursor->CurrentExon->Length = atoi(Value);
+ }
+ else if (GlobalOptions->XMLStrictFlag)
+ {
+ REPORT_WARNING_ISS(28, XML_GetCurrentLineNumber(Cursor->Parser), Name, "ExonSequence");
+ }
+ }
+
+ XML_ASSERT(Cursor->CurrentExon->Length >= 0 && Cursor->CurrentExon->Length < 1024*1024);
+ if (Cursor->CurrentExon->Length)
+ {
+ Cursor->CurrentExon->Sequence = (char*)calloc(sizeof(char), Cursor->CurrentExon->Length + 1);
+ }
+}
+
+// expat callback: Handle a tag and its attributes.
+void MS2StartElement(void* UserData, const char* Tag, const char** Attributes)
+{
+ MS2ParseCursor* Cursor;
+ int ExpectedTag = 0;
+ //
+ Cursor = (MS2ParseCursor*)UserData;
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+
+ // Switch on our current state, and handle the tags we expect to see in our current state.
+ // Tags we don't expect are ignored (i.e. new tags can be added without breaking the parser)
+ switch (Cursor->State)
+ {
+ case evMS2DBNone:
+ if (!CompareStrings(Tag, "Database"))
+ {
+ ExpectedTag = 1;
+ Cursor->State = evMS2DBDatabase;
+ // ignore database attributes for now
+ }
+ break;
+ case evMS2DBDatabase:
+ if (!CompareStrings(Tag, "Gene"))
+ {
+ ExpectedTag = 1;
+ XML_ASSERT(!Cursor->CurrentGene);
+ Cursor->State = evMS2DBGene;
+ Cursor->CurrentGene = (GeneStruct*)calloc(1, sizeof(GeneStruct));
+ Cursor->CurrentExonIndex = 0;
+ Cursor->CurrentGene->ChromosomeNumber = -1;
+ Cursor->CurrentGene->ForwardFlag = 1; // default
+ ParseGeneAttributes(Cursor, Attributes);
+ }
+ if (!CompareStrings(Tag, "Locus"))
+ {
+ ExpectedTag = 1;
+ Cursor->State = evMS2DBGeneLocus;
+ ParseLocusAttributes(Cursor, Attributes);
+ }
+ break;
+ case evMS2DBGene:
+ XML_ASSERT(Cursor->CurrentGene);
+ if (!CompareStrings(Tag, "Exon"))
+ {
+ ExpectedTag = 1;
+ Cursor->State = evMS2DBExon;
+ ParseExonAttributes(Cursor, Attributes);
+ }
+ if (!CompareStrings(Tag, "CrossReference"))
+ {
+ // We don't do anything with the attributes, but cross-references
+ // are "expected", so we don't raise a warning.
+ ExpectedTag = 1;
+ Cursor->State = evMS2DBGeneCrossReference;
+ }
+ break;
+ case evMS2DBGeneCrossReference:
+ if (!CompareStrings(Tag, "CRExons"))
+ {
+ ExpectedTag = 1;
+ }
+ break;
+ case evMS2DBExon:
+ XML_ASSERT(Cursor->CurrentExon);
+ if (!CompareStrings(Tag, "ExonSequence"))
+ {
+ ExpectedTag = 1;
+ Cursor->State = evMS2DBExonSequence;
+ ParseExonSequenceAttributes(Cursor, Attributes);
+ }
+ if (!CompareStrings(Tag, "ExtendsExon"))
+ {
+ ExpectedTag = 1;
+ // Don't change states, ExtendsExon has no body
+ ParseLinkFromAttributes(Cursor, Attributes, 1);
+ }
+ if (!CompareStrings(Tag, "LinkFrom"))
+ {
+ ExpectedTag = 1;
+ // Don't change states, LinkFrom has no body
+ ParseLinkFromAttributes(Cursor, Attributes, 0);
+ }
+ break;
+ default:
+ break;
+ }
+ if (!ExpectedTag)
+ {
+ REPORT_ERROR_IS(27, XML_GetCurrentLineNumber(Cursor->Parser), Tag);
+ }
+}
+
+// Confirm that this gene is, indeed, searchable.
+int IntegrityCheckXMLGene(MS2ParseCursor* Cursor)
+{
+ int ExonIndex;
+ int EdgeIndex;
+ ExonEdge* Edge;
+ ExonEdge* PrevEdge;
+ GeneStruct* Gene;
+ ExonStruct* Exon;
+ //
+ Gene = Cursor->CurrentGene;
+ XML_ASSERT_RETVAL(Gene);
+ for (ExonIndex = 0; ExonIndex < Cursor->CurrentGene->ExonCount; ExonIndex++)
+ {
+ // Confirm that we did, in fact, observe this exon:
+ Exon = Gene->Exons + ExonIndex;
+ if (!Exon->Gene)
+ {
+ printf("* Error: Exon '%d' from Gene '%s' not present!\n", ExonIndex, Gene->Name);
+ return 0;
+ }
+ }
+ // All exons have been initialized. Now let's fix up the backward edges from each exon.
+ // We MOVE the backward edges from the linked list Exon->BackEdgeHead->Next->...->Exon->BackEdgeTail
+ // into an array, Exon->BackwardEdges.
+ for (ExonIndex = 0; ExonIndex < Cursor->CurrentGene->ExonCount; ExonIndex++)
+ {
+ Exon = Gene->Exons + ExonIndex;
+ PrevEdge = NULL;
+ if (Exon->BackEdgeCount)
+ {
+ Exon->BackwardEdges = (ExonEdge*)calloc(Exon->BackEdgeCount, sizeof(ExonEdge));
+ for (EdgeIndex = 0, Edge = Exon->BackEdgeHead; Edge; EdgeIndex++, Edge = Edge->Next)
+ {
+ memcpy(Exon->BackwardEdges + EdgeIndex, Edge, sizeof(ExonEdge));
+ SafeFree(PrevEdge);
+ PrevEdge = Edge;
+ }
+ SafeFree(PrevEdge);
+ Exon->BackEdgeHead = NULL;
+ Exon->BackEdgeTail = NULL;
+ }
+ // Allocate forward-edge array:
+ Exon->ForwardEdges = (ExonEdge*)calloc(Exon->ForwardEdgeCount, sizeof(ExonEdge));
+ }
+ // Finally, we'll set the forward edges from each exon:
+ SetExonForwardEdges(Cursor->CurrentGene);
+ return 1;
+}
+
+void MS2EndElement(void* UserData, const char* Tag)
+{
+ MS2ParseCursor* Cursor;
+ int Result;
+ int Index;
+
+ //
+ Cursor = (MS2ParseCursor*)UserData;
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+ //printf("End tag '%s', current state %d\n", Tag, Cursor->State);
+ switch (Cursor->State)
+ {
+ case evMS2DBDatabase:
+ if (!CompareStrings(Tag, "Database"))
+ {
+ Cursor->State = evMS2DBNone;
+ }
+ break;
+ case evMS2DBGene:
+ if (!CompareStrings(Tag, "Gene"))
+ {
+ Cursor->State = evMS2DBDatabase;
+ // We search a gene immediately after we finish parsing it. (Note that by the
+ // time control returns to SearchMS2DB, we may have shot through 10 genes!)
+ Result = IntegrityCheckXMLGene(Cursor);
+ if (Result)
+ {
+ //printf("**Gene: %s\n",Cursor->CurrentGene->Name);
+ //printf("Root: %p\n",Cursor->Info->Root);
+ //for(Index = 0; Index < TRIE_CHILD_COUNT; ++Index)
+ // {
+ // printf(" Child[%c] = %p\n",Index + 'A',Cursor->Info->Root->Children[Index+'A']);
+ // }
+ //getchar();
+
+ //fflush(stdout);
+
+
+ SearchSplicableGene(Cursor->Info, Cursor->CurrentGene);
+ }
+ FreeGene(Cursor->CurrentGene);
+ Cursor->CurrentGene = NULL;
+ Cursor->Info->RecordNumber++;
+ }
+ break;
+ case evMS2DBExon:
+ if (!CompareStrings(Tag, "Exon"))
+ {
+ Cursor->State = evMS2DBGene;
+ //Cursor->CurrentExonIndex++;
+ }
+ break;
+ case evMS2DBExonSequence:
+ if (!CompareStrings(Tag, "ExonSequence"))
+ {
+ Cursor->State = evMS2DBExon;
+ }
+ break;
+ case evMS2DBGeneLocus:
+ if (!CompareStrings(Tag, "Locus"))
+ {
+ Cursor->State = evMS2DBGene;
+ }
+ break;
+ case evMS2DBGeneCrossReference:
+ if (!CompareStrings(Tag, "CrossReference"))
+ {
+ Cursor->State = evMS2DBGene;
+ }
+ break;
+ default:
+ printf("* Error: End-tag '%s' not handled from state %d\n", Tag, Cursor->State);
+ Cursor->ErrorFlag = 1;
+ break;
+ }
+}
+
+void SearchMS2DB(SearchInfo* Info)
+{
+ FILE* DBFile;
+ XML_Parser Parser = NULL;
+ int ParseUserData = 0;
+ int XMLParseResult;
+ int BytesRead;
+ int DoneFlag = 0;
+ int FilePos = 0;
+ void* XMLBuffer;
+ MS2ParseCursor* Cursor;
+ int Error;
+ //
+ DBFile = Info->DB->DBFile;
+ if (!DBFile)
+ {
+ printf("** Error: Unable to open database file '%s'\n", Info->DB->FileName);
+ return;
+ }
+ fseek(DBFile, 0, 0);
+ AllocSpliceStructures();
+ Cursor = (MS2ParseCursor*)calloc(sizeof(MS2ParseCursor), 1);
+ Cursor->Info = Info;
+ Parser = XML_ParserCreate(NULL);
+ Cursor->Parser = Parser;
+ XML_SetUserData(Parser, Cursor);
+ XML_SetElementHandler(Parser, MS2StartElement, MS2EndElement);
+ XML_SetCharacterDataHandler(Parser, MS2CharacterDataHandler);
+
+ while (!DoneFlag)
+ {
+ // Get a buffer (parser handles the memory):
+ XMLBuffer = XML_GetBuffer(Parser, sizeof(char) * MS2DB_BUFFER_SIZE);
+ if (!XMLBuffer)
+ {
+ printf("* Error: Unable to get XML buffer of size %d\n", MS2DB_BUFFER_SIZE);
+ break;
+ }
+
+ // Read into the buffer:
+ BytesRead = ReadBinary(XMLBuffer, sizeof(char), MS2DB_BUFFER_SIZE, DBFile);
+ if (!BytesRead)
+ {
+ // We'll call XML_Parse once more, this time with DoneFlag set to 1.
+ DoneFlag = 1;
+ }
+
+ // Parse this block o' text:
+ XMLParseResult = XML_Parse(Parser, XMLBuffer, BytesRead, DoneFlag);
+ if (!XMLParseResult)
+ {
+ printf("XML Parse error - file position ~%d\n", FilePos);
+ Error = XML_GetErrorCode(Parser);
+ printf("Error code %d description '%s'\n", Error, XML_ErrorString(Error));
+ }
+
+ // If Cursor->ErrorFlag is set, then the file isn't valid! Error out
+ // now, since recovery could be difficult.
+ if (Cursor->ErrorFlag)
+ {
+ break;
+ }
+ FilePos += BytesRead;
+ }
+
+ XML_ParserFree(Parser);
+ SafeFree(Cursor);
+}
+
+
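
MS2DB.c above drives expat with three callbacks (start tag, end tag, character data) plus the small MS2ParseState machine, so each Gene can be integrity-checked and searched the moment its closing tag arrives instead of holding the whole database in memory. The same pattern, heavily simplified for illustration and written against Python's xml.parsers.expat wrapper in the package's own Python 2 idiom (it only counts exons per gene; the real parser builds GeneStruct/ExonStruct records and calls SearchSplicableGene):

    import xml.parsers.expat

    class MS2DBWalker:
        "Toy state machine: Database -> Gene -> Exon, counting exons per gene."
        def __init__(self):
            self.State = None
            self.GeneName = None
            self.ExonCount = 0
        def StartElement(self, Tag, Attributes):
            if self.State == None and Tag == "Database":
                self.State = "Database"
            elif self.State == "Database" and Tag == "Gene":
                self.State = "Gene"
                self.GeneName = Attributes.get("Name", "?")
                self.ExonCount = 0
            elif self.State == "Gene" and Tag == "Exon":
                self.ExonCount += 1
        def EndElement(self, Tag):
            if self.State == "Gene" and Tag == "Gene":
                print "Gene %s: %d exon(s)"%(self.GeneName, self.ExonCount)
                self.State = "Database"

    def WalkMS2DB(FileName):
        Walker = MS2DBWalker()
        Parser = xml.parsers.expat.ParserCreate()
        Parser.StartElementHandler = Walker.StartElement
        Parser.EndElementHandler = Walker.EndElement
        Parser.ParseFile(open(FileName, "rb"))
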
diff --git a/MS2DB.h b/MS2DB.h
new file mode 100644
index 0000000..10cacf9
--- /dev/null
+++ b/MS2DB.h
@@ -0,0 +1,45 @@
+//Title: MS2DB.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+#ifndef MS2DB_H
+#define MS2DB_H
+
+
+
+// Code to parse databases in MS2DB format.
+#include "Inspect.h"
+#include "Trie.h"
+
+void SearchMS2DB(SearchInfo* Info);
+
+#endif
+
+
diff --git a/MS2DBShuffler.jar b/MS2DBShuffler.jar
new file mode 100644
index 0000000..8483fff
Binary files /dev/null and b/MS2DBShuffler.jar differ
diff --git a/MSSpectrum.py b/MSSpectrum.py
new file mode 100644
index 0000000..61030b8
--- /dev/null
+++ b/MSSpectrum.py
@@ -0,0 +1,663 @@
+#Title: MSSpectrum.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+Classes representing an MS/MS spectrum and its peaks.
+"""
+import traceback
+import sys
+import os
+import types
+import string
+import re
+import struct
+import math
+import base64 # for mzxml parsing
+import ParseXML
+from Utils import *
+Initialize()
+
+# Some systems have old versions of base64, without the new decode interface.
+# As a workaround, set "B64Decode" to the decoding function.
+if hasattr(base64, "b64decode"):
+ B64Decode = base64.b64decode
+ B64Encode = base64.b64encode
+else:
+ B64Decode = base64.decodestring
+ B64Encode = base64.encodestring
+
+class PeakClass:
+ """
+ One peak from an ms/ms spectrum. Mostly just a mass, but we track the intensity and
+ (perhaps putative) ion-type, as well. Note that this may be a spectral peak (in
+ which case the mass is a spectral mass) or a PRM peak (in which case the mass is
+ a prefix residue mass). Each spectral peak gives rise to n PRM peaks, where n
+ is the number of ion types available.
+ """
+ def __init__(self, Mass, Intensity):
+ self.Mass = Mass
+ self.Intensity = Intensity
+ self.IonType = None # Assigned only for PRM peaks
+ self.Score = 0 # Score, based on current filtering scheme.
+ self.PeptideIndex = None
+ self.FilterRank = None
+ self.IntensityRank = None
+ self.IsPlausibleIsotopicPeak = 0
+ self.HasPlausibleIsotopicPeak = 0
+ def __cmp__(self, Other):
+ "Sort two peak objects - compare the masses"
+ if not isinstance(Other, PeakClass):
+ return 1
+ if (self.Mass < Other.Mass):
+ return -1
+ if (self.Mass > Other.Mass):
+ return 1
+ return 0
+ def __str__(self):
+ return "<peak %s>"%self.Mass
+ def GetPeakMass(self, ParentMass):
+ "Get the corresponding spectral mass from this PRM peak"
+ return self.IonType.GetPeakMass(self.Mass, ParentMass)
+ def GetPRMMass(self, ParentMass):
+ "Get the corresponding PRM mass from this spectral peak"
+ return self.IonType.GetPRMMass(self.Mass, ParentMass)
+ def PrintMe(self):
+ print "Printing information for a PeakClass object"
+ print "Mass %f Intensity %f "%(self.Mass,self.Intensity)
+ print "IonType %s PeptideIndex %s IntensityRank %d"%(self.IonType, self.PeptideIndex, self.IntensityRank)
+
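# (Editorial aside, not part of the original file.)  The PeakClass docstring
# above distinguishes spectral peaks from PRM (prefix residue mass) peaks:
# each observed peak is reinterpreted once per candidate ion type.  As a
# hedged example using the usual singly-charged b/y relations (the exact
# constants live in the IonType objects, so treat these as approximate):
#     PRM read from a b peak:  Mass - 1.0078        (strip the charge proton)
#     PRM read from a y peak:  ParentMass - Mass    (ParentMass being the M+H mass)
# e.g. a peak at 300.16 interpreted as b gives a PRM near 299.15, while the
# same peak interpreted as y against ParentMass 998.99 gives a PRM near 698.83.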
+class SpectrumClass:
+ """
+ Mass-spec data, and some functions to filter peaks and such.
+ """
+ InstanceCount = 0
+ def __del__(self):
+ SpectrumClass.InstanceCount -= 1
+ def __init__(self, Scoring = None):
+ SpectrumClass.InstanceCount += 1
+ # Init our attributes:
+ self.Name = None
+ self.ParentMass = None
+ self.PrecursorMZ = None
+ self.PrecursorIntensity = None
+ self.Charge = 1 # default
+ self.Peaks = None # list of PeakClass instances
+ self.PRMPeaks = None # list of PeakClass instances
+ # The actual parent peptide (instance of PeptideClass), if known:
+ self.CorrectPeptide = None
+ self.Scoring = Scoring
+ def GetSignalToNoise(self):
+ "Return signal-to-noise ratio for this spectrum"
+ Intensities = []
+ for Peak in self.Peaks:
+ Intensities.append(Peak.Intensity)
+ Intensities.sort()
+ IntenseCount = min(len(Intensities), 5)
+ if not IntenseCount:
+ return 0
+ Signal = Intensities[-IntenseCount/2]
+ Noise = Intensities[len(Intensities)/2]
+ return Signal / float(Noise)
+ def GetTotalIntensity(self):
+ Intensity = 0
+ for Peak in self.Peaks:
+ Intensity += Peak.Intensity
+ return Intensity
+ def SetCharge(self, NewCharge):
+ self.Charge = NewCharge
+ self.ParentMass = self.PrecursorMZ * NewCharge - (NewCharge - 1)*1.0078
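# (Editorial aside, not part of the original file.)  SetCharge above applies
# the relation used throughout this class:
#     ParentMass = PrecursorMZ * Charge - (Charge - 1) * 1.0078
# i.e. it recovers the singly-protonated (M+H) mass from the observed m/z,
# with 1.0078 Da as the per-charge proton/hydrogen constant this file uses.
# Worked example: PrecursorMZ = 500.0 at Charge = 2 gives
#     ParentMass = 500.0 * 2 - 1 * 1.0078 = 998.9922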
+ def ReadPeaksMGF(self, File):
+ self.Peaks = []
+ for FileLine in File.xreadlines():
+ if FileLine[:8] == "END IONS":
+ break
+ if FileLine[:6] == "CHARGE":
+ # Note: "2+ and 3+" is NOT supported. Use the MultiCharge option,
+ # or include two scans in the input file.
+                self.Charge = int(FileLine[7:9].replace("+",""))
+ continue
+ if FileLine[:7] == "PEPMASS":
+ #self.ParentMass = float(FileLine[8:])
+ self.PrecursorMZ = float(FileLine[8:].split()[0])
+ continue
+ Bits = FileLine.split()
+ try:
+ Mass = float(Bits[0])
+ Intensity = float(Bits[1])
+ except:
+ continue # some other header line we don't eat.
+ Peak = PeakClass(Mass, Intensity)
+ self.Peaks.append(Peak)
+ if self.Charge == 0:
+ # Guess!
+ self.Charge = 2
+ self.ParentMass = (self.PrecursorMZ * self.Charge) - (self.Charge-1)*1.0078
+ #print "PrecursorMZ %s charge %s"%(self.PrecursorMZ, self.Charge)
+ def ReadPeaksDTA(self, File):
+ "Read a spectrum from a file, assuming .dta or .pkl or .ms2 format."
+ HeaderLine = File.readline()
+ if not HeaderLine.strip():
+ HeaderLine = File.readline()
+ if not HeaderLine.strip():
+ HeaderLine = File.readline()
+ if not HeaderLine.strip():
+ HeaderLine = File.readline()
+ if not HeaderLine.strip():
+ HeaderLine = File.readline()
+ #print "HeaderLine: '%s'"%HeaderLine.strip()
+ Bits = HeaderLine.strip().split()
+ if HeaderLine[:7]=="CHARGE=":
+ self.Charge = int(HeaderLine[7])
+ HeaderLine = File.readline()
+ self.PrecursorMZ = float(HeaderLine[8:])
+ self.ParentMass = (self.PrecursorMZ * self.Charge) - (self.Charge-1)*1.0078
+ elif Bits[0] == "Z": # MS2 format:
+ self.Charge = int(Bits[1])
+ self.ParentMass = float(Bits[2])
+ elif Bits[0] == "S": #MS2 format:
+ HeaderLine = File.readline()
+ Bits = HeaderLine.strip().split()
+ if Bits[0] == "Z":
+ self.Charge = int(Bits[1])
+ self.ParentMass = float(Bits[2])
+ else:
+ print "ERROR: Expecting a line starting with Z but instead found %s"%HeaderLine
+                sys.exit(1)
+ elif HeaderLine[0] == ":": # MS2 colon format:
+ HeaderLine = File.readline()
+ Bits = HeaderLine.strip().split()
+ self.ParentMass = float(Bits[0])
+ self.Charge = int(Bits[1]) # always an integer!
+ elif len(Bits) == 3: #PKL format:
+ self.PrecursorMZ = float(Bits[0])
+ self.Charge = int(Bits[2])
+ self.ParentMass = (self.PrecursorMZ * self.Charge) - (self.Charge-1)*1.0078
+ else:
+ self.ParentMass = float(Bits[0])
+ self.Charge = int(Bits[1]) # always an integer!
+ if self.Charge == 0:
+ # Guess!
+ self.Charge = 2
+ self.PrecursorMZ = self.ParentMass
+ self.ParentMass = (self.PrecursorMZ * self.Charge) - (self.Charge-1)*1.0078
+ #print "Prescursor MZ is %.2f, so guess a parent mass of %.2f"%(self.PrecursorMZ, self.ParentMass)
+ else:
+ self.PrecursorMZ = (self.ParentMass + (self.Charge-1)*1.0078) / self.Charge
+ self.Peaks = []
+ for FileLine in File.xreadlines():
+ Bits = FileLine.split()
+ # Skip comments:
+ if FileLine[0] == "#":
+ continue
+ Bits = FileLine.split()
+ if not Bits:
+ break
+ if Bits[0] == "Z":
+ continue # special for ms2: ignore.
+ if len(Bits) > 2:
+ break # no more!
+ try:
+ Mass = float(Bits[0])
+ Intensity = float(Bits[1])
+ except:
+ # It's over, over, over.
+ break
+ Peak = PeakClass(Mass, Intensity)
+ self.Peaks.append(Peak)
+ #File.close()
+ self.Peaks.sort() # sort by mass
+
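# (Editorial aside, not part of the original file.)  ReadPeaksDTA above sniffs
# the header line to cover several flat-file flavors; the values below are
# made up purely to show the shapes each branch expects:
#     .dta          "998.99 2"           -> M+H mass, charge
#     .pkl          "500.00 12345.6 2"   -> precursor m/z, intensity, charge
#     .ms2          "Z 2 998.99"         -> charge, M+H mass (possibly after an "S" line)
#     colon form    ":" then "998.99 2"  -> M+H mass, charge on the next line
#     MGF-style     "CHARGE=2" then "PEPMASS=500.00"
# After the header, lines are read as "mass intensity" pairs; "#" comments and
# extra "Z" lines are skipped, and parsing stops at a blank line, a line with
# more than two fields, an unparsable line, or end of file.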
+ def ReadPeakDTALine(self, FileLine):
+ FileLine = FileLine.strip()
+ Bits = FileLine.split()
+ if len(Bits) < 2:
+ return # blank (or broken) line, skip
+ Peak = PeakClass(float(Bits[0]), float(Bits[1]))
+ # If this is a labeled .dta file, read the ion types and the peptide indices:
+ if len(Bits)>2:
+ Peak.IonType = Global.AllIonDict.get(Bits[2], None)
+ if len(Bits)>3:
+ try:
+ Peak.PeptideIndex = int(Bits[3])
+ except:
+ pass # silent failure (a novel)
+ self.Peaks.append(Peak)
+ def RankPeaksByIntensity(self):
+ "Set Peak.IntensityRank for each of our peaks."
+ PeaksSortedByIntensity = []
+ for Peak in self.Peaks:
+ PeaksSortedByIntensity.append((Peak.Intensity, Peak))
+ PeaksSortedByIntensity.sort()
+ PeaksSortedByIntensity.reverse()
+ for Index in range(len(PeaksSortedByIntensity)):
+ PeaksSortedByIntensity[Index][1].IntensityRank = Index
+ def ReadPeaksPKL(self, File):
+ "Read peaks from a file in .pkl format"
+ HeaderLine = File.readline()
+ Bits = HeaderLine.split()
+ if len(Bits)!=3:
+ # .pkl files should have precursor m/z, precursor peak intensity, and
+ # guessed charge. If we don't have three pieces, then this isn't a
+ # valid .pkl file...
+ raise ValueError, "Invalid input file: Header line '%s' not a .pkl header."%HeaderLine
+ self.PrecursorMZ = float(Bits[0])
+ self.PrecursorIntensity = float(Bits[1])
+ self.Charge = int(Bits[2])
+ # We hope to be called with an actual charge. If we didn't get one at all, then guess 2.
+ if not self.Charge:
+ self.Charge = 2
+ self.ParentMass = (self.PrecursorMZ * self.Charge) - (1.0078 * (self.Charge - 1))
+ ##print "Prec %.2f times charge %s gives pm %s"%(self.PrecursorMZ, self.Charge, self.ParentMass)
+ # All subsequent lines: Mass and intensity
+ self.Peaks = []
+ for FileLine in File.xreadlines():
+ Bits = FileLine.split()
+ if len(Bits) > 2:
+ break
+ self.ReadPeakDTALine(FileLine)
+ File.close()
+ self.Peaks.sort() # sort by mass
+ def ReadPeaksFromFile(self, File, FileName):
+ # Given a file 'blah123.dta', name the spectrum 'blah123'.
+ self.Name = os.path.split(FileName)[1]
+ (self.Name, FileExtension) = os.path.splitext(self.Name)
+ FileExtension = FileExtension.lower()
+ # Strip ".mzxml:444279" to ".mzxml":
+ if FileExtension.find(":")!=-1:
+ FileExtension = FileExtension[:FileExtension.find(":")]
+ # Use the appropriate parser
+ if FileExtension == ".pkl":
+ self.ReadPeaksPKL(File)
+ elif FileExtension == ".mzxml":
+ self.ReadPeaksMZXML(File)
+ elif FileExtension == ".mzdata":
+ self.ReadPeaksMZData(File)
+ elif FileExtension == ".mgf":
+ self.ReadPeaksMGF(File)
+ else:
+ # default case: DTA
+ self.ReadPeaksDTA(File)
+ def ReadPeaks(self, FileName, FilePos = None):
+ """
+ Instantiator - Read a spectrum from a file. Sets ParentMass, Charge,
+ and Peaks list. Doesn't filter, yet.
+ """
+ if FilePos == None:
+ try:
+ ColonBits = FileName.split(":")
+ FilePos = int(ColonBits[-1])
+ FileName = string.join(ColonBits[:-1], ":")
+ except:
+ FilePos = 0
+ try:
+ File = open(FileName, "rb")
+ except:
+ print "Error in ReadPeaks(): File '%s' couldn't be opened."%FileName
+ traceback.print_exc()
+ return
+ File.seek(FilePos)
+ self.ReadPeaksFromFile(File, FileName)
+ self.FilePath = FileName
+ self.FilePos = FilePos
+ File.close()
+ def ReadPeaksMZData(self, File):
+ """
+ Parse peaks from an .mzdata format file. This format is slightly inferior
+ to .mzxml, and not as commonly used.
+ """
+ ParseXML.GetSpectrumPeaksMZData(self, File)
+ self.Charge = 2 # guess!
+ self.ParentMass = (self.PrecursorMZ * self.Charge) - (self.Charge - 1) * 1.0078
+ return
+
+ def ReadPeaksMZXML(self, File):
+ """
+ Parse peaks from an .mzXML format file. Assumes we've already scanned to the
+ desired file offset.
+ """
+ ParseXML.GetSpectrumPeaksMZXML(self, File)
+ self.Charge = 2 # guess!
+ self.ParentMass = (self.PrecursorMZ * self.Charge) - (self.Charge - 1) * 1.0078
+ return
+ def DebugPrint(self, ShowPeaks = 0):
+ """
+ Print information on our spectrum, for debugging.
+ """
+ print "Spectrum '%s' has parent mass %f,\n charge %f, and %d peaks"%(self.Name, self.ParentMass,
+ self.Charge, len(self.Peaks))
+ if self.CorrectPeptide:
+ print " True parent peptide is: %s"%self.CorrectPeptide
+ if ShowPeaks:
+ for Peak in self.Peaks:
+ if Peak.IonType:
+ print " %f\t%f\t%s"%(Peak.Mass, Peak.Intensity, Peak.IonType.Name)
+ else:
+ print " %f\t%f"%(Peak.Mass, Peak.Intensity)
+ def GetBestPeak(self, Mass, MaxIntensity = None, Epsilon = 1.0):
+ "Used in labeling. Find the best nearby peak whose intensity doesn't exceed our limit."
+ if MaxIntensity == 0:
+ return (None, None)
+ BestPeak = None
+ BestPeakError = None
+ ClosestError = None
+ for Peak in self.Peaks:
+ Error = Peak.Mass - Mass
+ if Error < -Epsilon:
+ continue
+ if Error > Epsilon:
+ break
+ if MaxIntensity and Peak.Intensity > MaxIntensity:
+ continue # forbid neutral losses which are taller than the original
+ if (BestPeak == None or BestPeak.Intensity < Peak.Intensity):
+ BestPeak = Peak
+ BestPeakError = Error
+ return (BestPeak, BestPeakError)
+
+ def GetPeak(self, Mass, Epsilon = 1.0):
+ """
+ Get the closest peak to the specified mass, with a maximum error of Epsilon.
+ """
+ ClosestPeak = None
+ ClosestError = None
+ for Peak in self.Peaks:
+ Error = abs(Peak.Mass - Mass)
+ if Error < Epsilon:
+ if (ClosestPeak == None or ClosestError > Error):
+ ClosestPeak = Peak
+ ClosestError = Error
+ if Peak.Mass > Mass:
+ break
+ return ClosestPeak
+ def GetPRMPeak(self, Mass, Epsilon = 1.0):
+ """
+ Get the closest peak to the specified mass, with a maximum error of Epsilon.
+ """
+ ClosestPeak = None
+ ClosestError = None
+ for Peak in self.PRMPeaks:
+ Error = abs(Peak.Mass - Mass)
+ if Error < Epsilon:
+ if (ClosestPeak == None or ClosestError > Error):
+ ClosestPeak = Peak
+ ClosestError = Error
+ if Peak.Mass > Mass:
+ break
+ return ClosestPeak
+ def GetPRMPeaks(self, Mass, Epsilon = 1.0):
+ """
+ Get all peaks within Epsilon of Mass
+ """
+ Peaks = []
+ for Peak in self.PRMPeaks:
+ Error = abs(Peak.Mass - Mass)
+ if Error < Epsilon:
+ Peaks.append(Peak)
+ if Peak.Mass > Mass:
+ break
+ return Peaks
+ def AssignIonTypesFromPeptide(self):
+ """
+ Assign ion types to our peaks, based on the CorrectPeptide.
+ """
+ # The true PRMPeaks are sums of peptide masses. Iterate
+ # over the length of the peptide:
+ LeftMass = 0
+ for Index in range(0, len(self.CorrectPeptide.Aminos)):
+ LeftMass += Global.AminoMass[self.CorrectPeptide.Aminos[Index]]
+ LeftMass += Global.FixedMods.get(self.CorrectPeptide.Aminos[Index], 0)
+ # For this PRMPeak, look for all the possible spectral peaks
+ # corresponding to the various ion types:
+ for IonType in AllIons:
+ Mass = IonType.GetPeakMass(LeftMass, self.ParentMass)
+ Peak = self.GetPeak(Mass, 1.0)
+ if Peak:
+ Peak.IonType = IonType
+ Peak.Pep = self.CorrectPeptide[:Index+1]
+ def ApplyWindowFilter(self, RegionCutoffs, WindowSizes, MaxRankInclusive):
+ """
+ Apply this window-filter to our peaks. RegionCutoffs describe the edges
+ of the "early", "medium" and "late" spectral portions; WindowSizes are the
+ sizes (in AMU) of the windows for these portions. MaxRankInclusive is the
+ worst intensity rank (within a window) to keep.
+ """
+ #print "Apply window:", WindowSizes, RegionCutoffs, MaxRankInclusive
+ GoodPeaks = []
+ # List of region-edges:
+ Borders = []
+ for Cutoff in RegionCutoffs:
+ Borders.append(self.ParentMass * Cutoff)
+ NextBorderIndex = 0
+ LastBorderIndex = len(RegionCutoffs)
+ WindowIndex = 0
+ BadPeakIntensityList = []
+ for Peak in self.Peaks:
+ while (NextBorderIndex < LastBorderIndex and Peak.Mass > Borders[NextBorderIndex]):
+ NextBorderIndex += 1
+ WindowIndex += 1
+ WindowSize = WindowSizes[WindowIndex]
+ MinMass = Peak.Mass - WindowSize/2
+ MaxMass = Peak.Mass + WindowSize/2
+ List = []
+ for OtherPeak in self.Peaks:
+ if OtherPeak.Mass > MaxMass:
+ break
+ if OtherPeak.Mass > MinMass:
+ List.append((OtherPeak.Intensity, OtherPeak))
+ List.sort()
+ List.reverse() # best to worst
+ if (len(List) < MaxRankInclusive+1) or (Peak.Intensity >= List[MaxRankInclusive][0]):
+ GoodPeaks.append(Peak)
+ else:
+ BadPeakIntensityList.append(Peak.Intensity)
+ #print "Kept %d of %d original peaks."%(len(GoodPeaks), len(self.Peaks))
+ self.Peaks = GoodPeaks
+ if len(BadPeakIntensityList):
+ BadPeakIntensityList.sort()
+ return BadPeakIntensityList[len(BadPeakIntensityList)/2]
+ else:
+ return -1
+ def FilterPeaks(self, WindowSize = 50, PeakCount = 6):
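+ "Keep only the top PeakCount most intense peaks within each WindowSize-Da window."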
+ self.ApplyWindowFilter([], (WindowSize,), PeakCount - 1)
+ def WritePeaks(self, FilePath):
+ "Write out a .dta file."
+ File = open(FilePath, "w")
+ File.write("%f\t%d\n"%(self.ParentMass, self.Charge))
+ for Peak in self.Peaks:
+ File.write("%f\t%f\n"%(Peak.Mass, Peak.Intensity))
+ File.close()
+ def WritePKLPeaks(self,FilePath):
+ """"
+ Append to the end of a .pkl file. Note this APPENDS a file.
+ if no precursor intensity is known, then we say zero. I hope that does not break things.
+ """
+ FileHandle = open(FilePath, "a")
+ if self.PrecursorIntensity:
+ FileHandle.write("%s %s %s\n"%(self.PrecursorMZ,self.PrecursorIntensity,self.Charge))
+ else:
+ FileHandle.write("%s 0.0 %s\n"%(self.PrecursorMZ,self.Charge))
+ for Peak in self.Peaks:
+ FileHandle.write("%f\t%f\n"%(Peak.Mass, Peak.Intensity))
+ FileHandle.write("\n") #need a blank line to separate different scans
+ FileHandle.close()
+ def WriteMGFPeaks(self, TheFile, Title = "Spectrum", ScanNumber = None):
+ """
+ Append to the end of an mgf file. Pass in an open file, or
+ (as a string) the path of a file to be APPENDED to.
+ """
+ if type(TheFile) == type(""):
+ File = open(TheFile, "a")
+ else:
+ File = TheFile
+ File.write("BEGIN IONS\n")
+ File.write("TITLE=%s\n"%Title)
+ if ScanNumber != None:
+ File.write("SCAN=%s\n"%ScanNumber)
+ File.write("CHARGE=%d\n"%self.Charge)
+ File.write("PEPMASS=%f\n"%self.PrecursorMZ)
+ for Peak in self.Peaks:
+ File.write("%f\t%f\n"%(Peak.Mass, Peak.Intensity))
+ File.write("END IONS\n")
+ if type(TheFile) == type(""):
+ File.close()
+ def WriteMZXMLPeaks(self, File, ScanNumber):
+ PeakCount = len(self.Peaks)
+ Str = """<scan num="%s" msLevel="2" peaksCount="%s" polarity="+" scanType="Full" lowMz="125" highMz="2000" """%(ScanNumber, PeakCount)
+ Str += """\n<precursorMz """
+ if self.PrecursorIntensity:
+ Str += """ precursorIntensity = "%.2f" """%self.PrecursorIntensity
+ Str += ">%.5f</precursorMz>\n"%self.PrecursorMZ
+ Str += """\n<peaks precision="32" byteOrder="network" pairOrder="m/z-int">"""
+ PeakString = ""
+ for Peak in self.Peaks:
+ PeakString += struct.pack(">ff", Peak.Mass, Peak.Intensity)
+ PeakString = B64Encode(PeakString)
+ Str += PeakString
+ Str += "</peaks>\n</scan>\n"
+ File.write(Str + "\n")
+
+ def GetTopUnexplainedPeak(self):
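+ "Return the intensity rank of the most intense peak with no ion-type label (assumes RankPeaksByIntensity was called)."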
+ TopUXRank = len(self.Peaks)
+ for Peak in self.Peaks:
+ if Peak.IonType == None and Peak.IntensityRank < TopUXRank:
+ TopUXRank = Peak.IntensityRank
+ return TopUXRank
+ def FindIsotopicPeaks(self):
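+ "Flag peaks that have a plausible isotope partner: ~1 Da heavier, with roughly the expected intensity ratio for this mass."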
+ for PeakIndex in range(len(self.Peaks)):
+ Peak = self.Peaks[PeakIndex]
+ RoundMass = int(round(Peak.Mass))
+ ExpectedFraction = Global.IsotopeWeights.get(RoundMass, None)
+ if ExpectedFraction==None:
+ continue
+ for IsotopePeakIndex in range(PeakIndex+1, len(self.Peaks)):
+ OtherPeak = self.Peaks[IsotopePeakIndex]
+ if OtherPeak.Mass < Peak.Mass + 0.8:
+ continue
+ if OtherPeak.Mass > Peak.Mass + 1.2:
+ break
+ Fraction = OtherPeak.Intensity / Peak.Intensity
+ # magic numbers ahoy:
+ if abs(Fraction - ExpectedFraction) < 0.5 or (abs(Fraction - ExpectedFraction) < 0.8 and OtherPeak.Mass > Peak.Mass + 0.9 and OtherPeak.Mass < Peak.Mass + 1.1):
+ OtherPeak.IsPlausibleIsotopicPeak = 1
+ Peak.HasPlausibleIsotopicPeak = 1
+ def GetExplainedIntensity(self):
+ """
+ Callable *after* the spectrum has been labeled. Returns the percentage
+ of total spectral intensity that has been explained by labels. All
+ things being equal, a candidate peptide with a higher explained
+ intensity is BETTER.
+ """
+ TotalIntensity = 0
+ ExplainedIntensity = 0
+ for Peak in self.Peaks:
+ TotalIntensity += Peak.Intensity
+ if Peak.IonType != None:
+ ExplainedIntensity += Peak.Intensity
+ #print "%s\t%s\t%s\t%s\t%s\t"%(Peak.Mass, Peak.Intensity, Peak.IonType, Peak.PeptideIndex, Peak.RescueFlag)
+ return ExplainedIntensity / float(max(1, TotalIntensity))
+ def GetExplainedIons(self, Peptide, DynamicRangeMin = 150, DynamicRangeMax = 2000):
+ "Return the percentage of b and y peaks present."
+ Annotated = {}
+
+ PhosphorylationFlag = 0
+ PhosB = [0]*40
+ PhosY = [0]*40
+ for (Pos, ModList) in Peptide.Modifications.items():
+ for Mod in ModList:
+ if Mod.Name == "Phosphorylation":
+ PhosphorylationFlag = 1
+ for Peak in self.Peaks:
+ if Peak.IonType:
+ Annotated[(Peak.IonType.Name, Peak.PeptideIndex)] = 1
+ Count = 0
+ Present = 0
+ TotalCutPresent = 0
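+ # PM approximates the singly-protonated parent mass: total residue mass plus ~18 (water) plus ~1 (proton).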
+ PM = 19 + Peptide.Masses[-1]
+ for Index in range(len(Peptide.Masses)):
+ CutPresent = 0
+ BMass = Peptide.Masses[Index] + 1.0078
+ if BMass > DynamicRangeMin and BMass < DynamicRangeMax:
+ Count += 1
+ BPresent = Annotated.get(("b", Index),0)
+ BPresent |= Annotated.get(("b-p", Index),0)
+ Present += BPresent
+ CutPresent |= BPresent
+ YMass = PM - Peptide.Masses[Index]
+ if YMass > DynamicRangeMin and YMass < DynamicRangeMax:
+ Count += 1
+ YPresent = Annotated.get(("y", len(Peptide.Aminos) - Index), 0)
+ YPresent |= Annotated.get(("y-p", Index),0)
+ Present += YPresent
+ CutPresent |= YPresent
+ # Count the CUT POINTS that are witnessed:
+ if (Index and Index<len(Peptide.Masses)-1) and CutPresent:
+ TotalCutPresent += 1
+ CutCount = len(Peptide.Masses) - 1
+ #print Peptide.Aminos, "%s cut points of %s"%(TotalCutPresent, CutCount)
+ return (Present, Count, Present / max(1, float(Count)),
+ TotalCutPresent, CutCount, TotalCutPresent/max(1, float(CutCount)))
+
+
+ def GetExplainedPeaks(self, MaxRank = 24):
+ """
+ Returns the percentage of the top n peaks that have been explained
+ by peak labeling, where n = MaxRank. The output of GetExplainedPeaks()
+ should be high for a good candidate peptide.
+ """
+ TotalGoodPeaks = 0
+ ExplainedGoodPeaks = 0
+ for Peak in self.Peaks:
+ if Peak.IntensityRank <= MaxRank:
+ TotalGoodPeaks += 1
+ #print Peak.IntensityRank, Peak.Mass, Peak.IonType
+ if Peak.IonType != None:
+ ExplainedGoodPeaks += 1
+ return ExplainedGoodPeaks / float(max(1, TotalGoodPeaks))
+ def GetLogMeanStdev(self):
+ """computes the mean and standard deviation of the peak intensities
+ This can be done at any time before or after filtering. just
+ make sure that you know what it means.
+ This computes things based on the LOG intensity values
+ """
+ IntensitySum = 0.0
+ NumPeaks = len(self.Peaks)
+ for Peak in self.Peaks:
+ IntensitySum += math.log(Peak.Intensity)
+ Mean = IntensitySum / NumPeaks
+ VarSum = 0.0
+ for Peak in self.Peaks:
+ Diff = math.log(Peak.Intensity) - Mean
+ VarSum += Diff*Diff
+ Variance = VarSum / NumPeaks
+ Stdev = math.sqrt(Variance)
+ return (Mean,Stdev)
+
diff --git a/MakeImage.py b/MakeImage.py
new file mode 100644
index 0000000..800d4c4
--- /dev/null
+++ b/MakeImage.py
@@ -0,0 +1,623 @@
+#Title: MakeImage.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+MakeImage.py takes a spectrum labeled with ion types (built by Label.py) and produces a graph.
+"""
+from Utils import *
+import traceback
+import MSSpectrum
+import math
+
+try:
+ from PIL import Image
+ from PIL import ImageDraw
+ from PIL import ImageFont
+ # Fonts don't seem to work on Linux. (Tried pdf, pcf, and pil formats...but no luck)
+ # So, we'll content ourselves with a default font if we must:
+ try:
+ TheFont = ImageFont.truetype("Times.ttf", 12)
+ except:
+ TheFont = ImageFont.load_default()
+except:
+ traceback.print_exc()
+ print "WARNING: Python Imaging Library (PIL) not installed.\n Image creation is NOT available."
+ Image = None
+
+
+class WebColors:
+ "Color scheme for web display. Colorful."
+ White = (255,255,255)
+ Green = (0,255,0)
+ Blue = (0,0,255)
+ PaleBlue = (10,10,80)
+ Red = (255,0,0)
+ Grey = (155,155,155)
+ #Grey = (0,0,0)
+ Background = (255, 255, 255)
+ Peak = (199, 199, 199)
+ #Peak = (0,0,0)
+ AnnotatedPeak = (0, 0, 0)
+ Axis = (155, 155, 155)
+ LabeledPeak = (0, 0, 0)
+ PeakLabel = (200, 0, 0)
+ BSeries = (55,55,200)
+ BSeriesPale = (155,155,255)
+ YSeries = (155,155,55)
+ YSeriesPale = (200,200,100)
+
+class PrintingColors:
+ "Color scheme for printing. Black-and-white, dark shades."
+ White = (255,255,255)
+ Green = (0,0,0)
+ Blue = (0,0,0)
+ PaleBlue = (80,80,80)
+ Red = (0,0,0)
+ Grey = (80,80,80)
+ Background = (255, 255, 255)
+ Peak = (200, 200, 200)
+ AnnotatedPeak = (80, 80, 80)
+ Axis = (80, 80, 80)
+ LabeledPeak = (10, 10, 10)
+ PeakLabel = (0, 0, 0)
+ BSeries = (0,0,0)
+ BSeriesPale = (180,180,180)
+ YSeries = (0,0,0)
+ YSeriesPale = (180,180,180)
+
+#Colors = PrintingColors
+Colors = WebColors
+
+def SetColors(PrintFlag):
+ global Colors
+ if PrintFlag:
+ Colors = PrintingColors
+ else:
+ Colors = WebColors
+ print "COLORS SET!", Colors.YSeries
+
+def RoundOff(Int):
+ "Round an integer to the nearest power of ten"
+ PowerOfTen = 10
+ while (1):
+ if PowerOfTen>Int:
+ return Int
+ Radix = Int % PowerOfTen
+ if Radix == 0:
+ PowerOfTen *= 10
+ continue
+ if Radix > PowerOfTen/2:
+ return (Int-Radix) + PowerOfTen
+ else:
+ return (Int-Radix)
+
+
+def GetTickWidth(MassWidth):
+ "Look for a nice round number that divides the width into about 20 pieces."
+ IdealSliceCount = 20
+ # If 6-15, round to 10; if 26-35, round to 20... If 151...
+ #print "GetTickWidth:", MassWidth
+ Width = int(round(MassWidth / IdealSliceCount))
+ SliceCount = MassWidth / Width
+ while (1):
+ # Try rounding off another digit:
+ RoundedWidth = RoundOff(Width)
+ if RoundedWidth==Width:
+ break # we can't round off any more!
+ RoundedSliceCount = MassWidth / RoundedWidth
+ if (RoundedSliceCount < IdealSliceCount / 2) or (RoundedSliceCount > IdealSliceCount * 2):
+ break # Slices are too skinny or too fat
+ Width = RoundedWidth
+ return Width
+
+
+class BaseImageMaker:
+ """
+ Graph generating class. Not very MS-specific. Subclassed by the spectrum plotter.
+ """
+ LeftPad = 30
+ RightPad = 3
+ UpperPad = 50
+ LowerPad = 20
+ def __init__(self, Width = 600, Height = 400):
+ """
+ Width is the total image width, in pixels. The plot body ranges from
+ self.LeftPad to self.Width.
+ Height is the total image height, in pixels. The plot body ranges from
+ self.UpperPad to (self.Height-self.LowerPad)
+ """
+ self.MinX = 0
+ self.MaxX = 100
+ self.MinY = 0
+ self.MaxY = 1
+ self.Width = Width
+ self.Height = Height
+ self.BaseLine = self.Height - self.LowerPad
+ self.YBreak = None
+ def GetNiceTickMark(self, Width, GoodTickCount):
+ """
+ Given a width (axis size), compute a good interval for major tick marks.
+ We want around 10 ticks, but we round the tick size up or down in order
+ to get cleaner numbers.
+ """
+ if (Width <= 0):
+ return 0.1
+ Tick = Width / float(GoodTickCount)
+ MinimumPower = int(math.log10(Tick)) - 1
+ PossibleTicks = []
+ for Power in range(MinimumPower, MinimumPower+5):
+ PossibleTicks.append(10**Power)
+ PossibleTicks.append(10**Power * 2)
+ PossibleTicks.append(10**Power * 5)
+ for Index in range(len(PossibleTicks) - 1):
+ if PossibleTicks[Index+1] > Tick:
+ return PossibleTicks[Index+1]
+ return 0.1 # hacky fall-back case!
+ def GetValueName(self, Value):
+ """
+ Take a number - potentially a very large or small one - and format it
+ compactly so that it makes a usable axis label.
+ 1.2002 -> 1.2
+ 0.149 -> 0.15
+ 10000 -> 1.0e4
+ -0.00005 -> -5e-5
+ """
+ if Value == 0:
+ return "0"
+ if Value < 0:
+ Sign = "-"
+ Value *= -1
+ else:
+ Sign = ""
+ if Value > 1000:
+ Exp = int(math.log10(Value))
+ Abscissa = 10**(math.log10(Value) - int(math.log10(Value)))
+ return "%s%.1fe%d"%(Sign,Abscissa, Exp)
+ if Value < 0.01:
+ Exp = math.floor(math.log10(Value))
+ Abscissa = 10**(math.log10(Value) - Exp)
+ return "%s%.1fe%d"%(Sign,Abscissa, Exp)
+ if abs(Value) < 1:
+ return "%s%.2f"%(Sign, Value)
+ else:
+ return "%s%.1f"%(Sign, Value)
+ def DrawYAxis(self):
+ if self.YBreak:
+ self.DrawYAxisHelper(self.MinLowY, self.MaxLowY, 8)
+ self.DrawYAxisHelper((self.MinHighY + self.MaxHighY) / 2.0, self.MaxHighY, 1)
+ self.Draw.line((self.LeftPad, self.UpperPad, self.LeftPad, self.YBreak - 2), Colors.Axis)
+ self.Draw.line((self.LeftPad - 4, self.YBreak + 4, self.LeftPad + 4, self.YBreak), Colors.Axis)
+ self.Draw.line((self.LeftPad - 4, self.YBreak, self.LeftPad + 4, self.YBreak - 4), Colors.Axis)
+ self.Draw.line((self.LeftPad, self.YBreak + 2, self.LeftPad, self.BaseLine), Colors.Axis)
+ else:
+ self.DrawYAxisHelper(self.MinY, self.MaxY, 10)
+ self.Draw.line((self.LeftPad, self.UpperPad, self.LeftPad, self.BaseLine), Colors.Axis)
+ def DrawYAxisHelper(self, MinVal, MaxVal, TickCount):
+ "Draw the y-axis (including tick-marks and labels)"
+ TickSize = self.GetNiceTickMark(MaxVal - MinVal, TickCount)
+ ##print "Y axis: %s ticks from %s to %s; tick size %s"%(TickCount, MinVal, MaxVal, TickCount)
+ MaxPowers = int(math.log10(MaxVal)) + 1
+ LastY = None
+ try:
+ IntensityLevel = MinVal
+ while IntensityLevel < MaxVal:
+ Y = self.GetY(IntensityLevel)
+ self.Draw.line((self.LeftPad-2,Y,self.LeftPad,Y),Colors.Axis)
+ Str = self.GetValueName(IntensityLevel)
+ self.Draw.text((0, Y-5), Str, Colors.Axis, font = TheFont)
+ LastY = Y
+ IntensityLevel += TickSize
+ except:
+ traceback.print_exc()
+ print 0, self.MinY, self.MaxY, TickSize
+ raise
+ def BreakY(self, SmallestOfBig, BiggestOfSmall):
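+ "Set up a broken y-axis so that one or two dominant peaks do not flatten the rest of the plot."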
+ self.MinLowY = self.MinY
+ self.MaxLowY = BiggestOfSmall * 1.05
+ self.MinHighY = max(self.MaxLowY * 1.001, SmallestOfBig * 0.95)
+ self.MaxHighY = self.MaxY
+ self.BrokenY = 1
+ self.YBreak = int((self.Height - self.UpperPad - self.LowerPad) * 0.2)
+ #print "Y break is:", self.YBreak
+ def GetY(self, YValue):
+ if self.YBreak:
+ if YValue > self.MaxLowY:
+ YPercent = (YValue - self.MinHighY) / max(1, self.MaxHighY - self.MinHighY)
+ return self.YBreak - int((self.YBreak - self.UpperPad)*YPercent)
+ else:
+ YPercent = YValue / max(1, self.MaxLowY - self.MinLowY)
+ return self.BaseLine - int((self.BaseLine - self.YBreak)*YPercent)
+ YPercent = YValue / float(max(1, self.MaxY - self.MinY))
+ return self.BaseLine - int((self.BaseLine - self.UpperPad) * YPercent)
+ def GetX(self, XValue):
+ XPercent = (XValue - self.MinX) / max(1, (self.MaxX - self.MinX))
+ TotalWidth = self.Width - (self.LeftPad + self.RightPad)
+ return self.LeftPad + int(XPercent * TotalWidth)
+ def DrawTickMarks(self):
+ TickPos = 0
+ while TickPos < self.MaxX-1.0:
+ if TickPos < self.MinX:
+ TickPos += self.TickWidth
+ continue
+ X = self.GetX(TickPos)
+ # Draw text, unless it would go over the edge:
+ TextWidth = len(str(TickPos))*6
+ TextX = X - TextWidth/2
+ if TextX < self.Width - TextWidth:
+ self.Draw.line((X, self.BaseLine, X, self.BaseLine+3), Colors.Axis)
+ self.Draw.text((TextX, self.BaseLine+2), str(TickPos), Colors.Axis, font = TheFont)
+ TickPos += self.TickWidth
+
+class MSImageMaker(BaseImageMaker):
+ def __init__(self, *args, **kw):
+ # Some options:
+ self.YBreakThreshold = 0.3333
+ #self.Labels = {} Label -> (PeakX, PeakY, Label, PeakIntensity, IntensityRank)
+ self.IntensityRank = {} # PeakIndex -> Rank
+ BaseImageMaker.__init__(self, *args, **kw)
+ def ConvertPeakAnnotationToImage(self, PeakAnnotationList, OutputFileName, Peptide = None, Width = 600, Height = 400):
+ if not Image: # catch for no PIL
+ return None
+ self.Width = Width
+ self.Height = Height
+ self.BaseLine = self.Height - self.LowerPad
+ self.PeakAnnotationList = PeakAnnotationList
+ self.GetPeakDemographics(Peptide) #computes min, max, intensityrank
+ self.PlotImage = Image.new("RGB", (Width, Height), Colors.Background) # mode, size, [startcolor]
+ self.Draw = ImageDraw.Draw(self.PlotImage)
+ self.RoofLine = Height * 0.5
+ MassWidth = self.Width - (self.LeftPad + self.RightPad)
+ # Draw baseline
+ self.Draw.line((self.LeftPad, self.BaseLine, Width - self.RightPad, self.BaseLine), Colors.Axis)
+ # Draw x axis tickmarks (and labels):
+ self.TickWidth = 200
+ self.DrawTickMarks()
+ # Draw y axis:
+ self.DrawYAxis()
+ # Draw peaks, with labels
+ self.DrawPeaks()
+ if Peptide:
+ self.DrawBSeries(Peptide)
+ self.DrawYSeries(Peptide)
+ self.DrawPeakLabels()
+ self.PlotImage.save(OutputFileName, "png")
+ def GetPeakDemographics(self, Peptide = None):
+ """
+ Because this version uses a list of peak annotations given by
+ PyInspect, and not peak objects, they don't come with an associated
+ rank. here I do a quick bubble sort and rank stuff (Ali, I wrote this bubble sort)
+ """
+ Intensities = []
+ self.MinX = 1000
+ self.MaxX = 0
+ # Sort peak, from most to least intense:
+ PeaksByIntensity = []
+ for PeakIndex in range(len(self.PeakAnnotationList)):
+ Tuple = self.PeakAnnotationList[PeakIndex]
+ PeaksByIntensity.append((Tuple[1], Tuple[0], PeakIndex))
+ ## set min and max masses
+ if Tuple[0] > self.MaxX:
+ self.MaxX = Tuple[0]
+ if Tuple[0] < self.MinX:
+ self.MinX = Tuple[0]
+ self.MaxY = max(self.MaxY, Tuple[1])
+ PeaksByIntensity.sort()
+ PeaksByIntensity.reverse()
+ ###
+ self.IntensityRank = [None] * len(self.PeakAnnotationList)
+ for IntensityRank in range(len(PeaksByIntensity)):
+ self.IntensityRank[PeaksByIntensity[IntensityRank][-1]] = IntensityRank # map original peak index to its rank
+ ## possibly reset x min and max
+ if Peptide:
+ self.MaxX = max(self.MaxX, Peptide.Masses[-1] + 10)
+ self.MinX = 0
+ FullMass = self.MaxX - self.MinX
+ # Move the left and right edges a little bit, so that peaks don't
+ # hit the edges of the image.
+ self.MinX = max(0, self.MinX - FullMass * 0.05)
+ self.MaxX = self.MaxX + FullMass * 0.05
+ self.MaxY = max(10.0, self.MaxY)
+ ##############
+ ## do some Y breakage
+ if len(PeaksByIntensity) < 2:
+ return # can't break the y-axis with fewer than two peaks
+ Intensity1 = PeaksByIntensity[0][0]
+ Intensity2 = PeaksByIntensity[1][0]
+ if (Intensity2 / Intensity1 < self.YBreakThreshold):
+ self.BreakY(Intensity1, Intensity2)
+ return
+ if len(PeaksByIntensity) < 3:
+ return
+ Intensity3 = PeaksByIntensity[2][0]
+ if Intensity3 / Intensity2 < self.YBreakThreshold:
+ self.BreakY(Intensity2, Intensity3)
+ return
+ def DrawPeaks(self):
+ self.Labels = {}
+ MaxIntensity = 0
+ # First, no-ion-type peaks:
+ for PeakTuple in self.PeakAnnotationList:
+ (Mass, Intensity, Label, AminoIndex) = PeakTuple
+ MaxIntensity = max(MaxIntensity, Intensity)
+ if Label: # don't draw in grey anything which has been labeled
+ continue
+ PeakX = self.GetX(Mass)
+ PeakY = self.GetY(Intensity)
+ self.Draw.line((PeakX, PeakY, PeakX, self.BaseLine), Colors.Peak)
+ MinLabelIntensity = MaxIntensity * 0.25 # any annotated peak above this threshold receives a text label, even if it's a neutral loss.
+ # Next, peaks with assigned ions:
+ for PeakIndex in range(len(self.PeakAnnotationList)):
+ (Mass, Intensity, Label, AminoIndex) = self.PeakAnnotationList[PeakIndex]
+ if not Label: #skip all the unlabeled peaks this round
+ continue
+ PeakX = self.GetX(Mass)
+ PeakY = self.GetY(Intensity)
+ self.Draw.line((PeakX, PeakY, PeakX, self.BaseLine), Colors.AnnotatedPeak)
+ TextLabelPeakNames = ("B", "Y", "GlcNAc", "Y2", "B2")
+ if Label in TextLabelPeakNames:
+ PeptideIndex = AminoIndex
+ if Label in ("B", "Y"):
+ Label = Label.lower()
+ if Label in ("B2", "Y2"):
+ Label = "%s2"%Label[0].lower()
+ if AminoIndex != None:
+ Label = "%s %s"%(Label, AminoIndex)
+ OldLabelInfo = self.Labels.get(Label, None)
+ if OldLabelInfo!=None and OldLabelInfo[-2] > Intensity:
+ continue
+ self.Labels[Label] = (PeakX, PeakY, Label, Intensity, self.IntensityRank[PeakIndex])
+ if Label[0] == "M":
+ ## Special case for phosphorylated spectra: the parent-loss label
+ ## has been changed to M-p or M-p-h2o.
+ OldLabelInfo = self.Labels.get(Label, None)
+ if OldLabelInfo != None and OldLabelInfo[-2] > Intensity:
+ continue
+ self.Labels[Label] = (PeakX, PeakY, Label, Intensity, self.IntensityRank[PeakIndex])
+ def CollideRectangles(self, X1, Y1, X2, Y2, Rectangles):
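+ "Return the first rectangle from Rectangles that contains a corner of (X1, Y1, X2, Y2), or None if there is no collision."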
+ for (CX1, CY1, CX2, CY2) in Rectangles:
+ if CX1 <= X1 <= CX2 and CY1 <= Y1 <= CY2:
+ return (CX1, CY1, CX2, CY2)
+ if CX1 <= X2 <= CX2 and CY1 <= Y1 <= CY2:
+ return (CX1, CY1, CX2, CY2)
+ if CX1 <= X1 <= CX2 and CY1 <= Y2 <= CY2:
+ return (CX1, CY1, CX2, CY2)
+ if CX1 <= X2 <= CX2 and CY1 <= Y2 <= CY2:
+ return (CX1, CY1, CX2, CY2)
+ return None
+ def DrawPeakLabels(self):
+ # Sort labels by priority. b and y take precedence over all else;
+ # intense peaks take precedence over others.
+ SortedLabels = []
+ for (X, Y, Label, Intensity, Rank) in self.Labels.values():
+ if Rank < 10:
+ Priority = 0
+ else:
+ Priority = 1
+ SortedLabels.append([Priority, Y, X, Label, Intensity, None, None])
+ SortedLabels.sort()
+ SortedLabels = SortedLabels[:25]
+ DirtyRectangles = []
+ for List in SortedLabels:
+ (IsBY, Y, X, Label, Intensity, Dummy, Dummy) = List
+ (Width, Height) = self.Draw.textsize(Label, font = TheFont)
+ Height *= 2 # for superscript and subscript
+ X1 = X - Width/2
+ X2 = X + Width/2
+ Y1 = Y - Height
+ Y2 = Y
+ Tuple = self.CollideRectangles(X1, Y1, X2, Y2, DirtyRectangles)
+ if Tuple == None:
+ List[5] = (X1, Y1, X2, Y2)
+ DirtyRectangles.append((X1, Y1, X2, Y2))
+ continue
+ (CX1, CY1, CX2, CY2) = Tuple
+ # Try moving this label off to the side:
+ if (X1 + X2) / 2 < (CX1 + CX2) / 2:
+ Move = (X2 - CX1) + 1
+ X1 -= Move
+ X2 -= Move
+ Y1 -= 5
+ Y2 -= 5
+ else:
+ Move = (CX2 - X1) + 1
+ X1 += Move
+ X2 += Move
+ Y1 -= 5
+ Y2 -= 5
+ Tuple = self.CollideRectangles(X1, Y1, X2, Y2, DirtyRectangles)
+ if Tuple == None:
+ List[5] = (X1, Y1, X2, Y2)
+ DirtyRectangles.append((X1, Y1, X2, Y2))
+ List[6] = ((X1 + X2) / 2, Y2, X, Y)
+ continue
+ for Index in range(len(SortedLabels)-1, -1, -1):
+ List = SortedLabels[Index]
+ if List[5]:
+ (X1, Y1, X2, Y2) = List[5]
+ self.Draw.line((List[2], List[1], List[2], self.BaseLine), Colors.LabeledPeak) # color the peak
+ for Index in range(len(SortedLabels)-1, -1, -1):
+ List = SortedLabels[Index]
+ if List[5]:
+ (X1, Y1, X2, Y2) = List[5]
+ # Most peaks are drawn using superscripts and subscripts:
+ PeakName = List[3]
+ if PeakName == "GlcNAc":
+ self.Draw.text((X1, Y1 + 5), PeakName, Colors.PeakLabel, font = TheFont)
+ else:
+ self.Draw.text((X1, Y1 + 5), PeakName[0], Colors.PeakLabel, font = TheFont)
+ NumIndex = len(PeakName) - 1
+ while PeakName[NumIndex] in ("0123456789"):
+ NumIndex -= 1
+ if NumIndex > 0:
+ SuperScript = PeakName[1:NumIndex+1].strip()
+ else:
+ SuperScript = ""
+ if SuperScript:
+ self.Draw.text((X1+7, Y1+10), SuperScript, Colors.PeakLabel, font = TheFont)
+ self.Draw.text((X1+7, Y1), PeakName[NumIndex+1:], Colors.PeakLabel, font = TheFont)
+ else:
+ self.Draw.text((X1+7, Y1+5), PeakName[NumIndex+1:], Colors.PeakLabel, font = TheFont)
+ # Draw the dotted line from the label to its peak:
+ if List[6]:
+ self.Draw.line(List[6], Colors.LabeledPeak)
+ def DrawDottedLine(self, X1, Y1, X2, Y2, Color):
+ Distance = math.sqrt((X2-X1)**2 + (Y2-Y1)**2)
+ if Distance == 0:
+ return
+ DX = (X2-X1)/Distance
+ DY = (Y2-Y1)/Distance
+ OldLineLength = 0
+ Dot = 1
+ while (1):
+ LineLength = min(Distance, OldLineLength + 5)
+ XA = int(X1 + DX*OldLineLength)
+ XB = int(X1 + DX*LineLength)
+ YA = int(Y1 + DY*OldLineLength)
+ YB = int(Y1 + DY*LineLength)
+ if Dot:
+ self.Draw.line((XA, YA, XB, YB), Color)
+ OldLineLength = LineLength
+ Dot = not Dot
+ if (LineLength == Distance):
+ break
+ def DrawBSeries(self, Peptide):
+ BHeight = 17
+ if getattr(Peptide, "Seed", None):
+ SeedIndex = Peptide.Aminos.rfind(Peptide.Seed)
+ else:
+ SeedIndex = -999
+ self.Draw.text((10, BHeight-7), "b", Colors.BSeries, font = TheFont)
+ # First, draw tickmarks for the b peaks
+ for MassIndex in range(len(Peptide.Masses)):
+ Mass = Peptide.Masses[MassIndex]
+ X = self.GetX(Mass)
+ Label = ("b %s"%MassIndex)
+ if self.Labels.has_key(Label):
+ PeakIntensity = self.Labels[Label][-2]
+ Y = self.GetY(PeakIntensity)
+ self.DrawDottedLine(X, BHeight-2, X, Y, Colors.BSeriesPale)
+ else:
+ self.Draw.line((X, BHeight-2, X, BHeight+2), Colors.BSeriesPale)
+ # Now draw horizontal lines, and amino labels:
+ for AminoIndex in range(len(Peptide.Aminos)):
+ LabelA = "b %s"%AminoIndex
+ LabelB = "b %s"%(AminoIndex+1)
+ XA = self.GetX(Peptide.Masses[AminoIndex])
+ XB = self.GetX(Peptide.Masses[AminoIndex+1])
+ HasLabelA = self.Labels.has_key(LabelA)
+ if AminoIndex == 0:
+ HasLabelA = 1
+ HasLabelB = self.Labels.has_key(LabelB)
+ if AminoIndex ==len(Peptide.Aminos)-1:
+ HasLabelB = 1
+ if HasLabelA and HasLabelB:
+ self.Draw.line((XA, BHeight, XB, BHeight), Colors.BSeries)
+ else:
+ self.DrawDottedLine(XA, BHeight, XB, BHeight, Colors.BSeriesPale)
+ if AminoIndex in (SeedIndex, SeedIndex+1, SeedIndex+2):
+ self.Draw.line((XA, BHeight-1, XB, BHeight-1), Colors.BSeries)
+ self.Draw.line((XA, BHeight, XB, BHeight), Colors.BSeries)
+ self.Draw.line((XA, BHeight+1, XB, BHeight+1), Colors.BSeries)
+
+ X = (XA+XB)/2 - 3
+ Str = Peptide.Aminos[AminoIndex]
+ if Peptide.Modifications.get(AminoIndex):
+ self.Draw.text((X-4, BHeight-14), "%s*"%Str, Colors.BSeries, font = TheFont)
+ else:
+ self.Draw.text((X, BHeight-14), Str, Colors.BSeries, font = TheFont)
+ def DrawYSeries(self, Peptide): # Copied and modded from DrawBSeries
+ YHeight = 34
+ if getattr(Peptide, "Seed", None):
+ SeedIndex = Peptide.Aminos.rfind(Peptide.Seed)
+ else:
+ SeedIndex = -999
+
+ self.Draw.text((10, YHeight-7), "y", Colors.YSeries, font = TheFont)
+ # First, draw tickmarks for the y peaks
+ PM = Peptide.Masses[-1] + 19 # parent mass
+ for MassIndex in range(len(Peptide.Masses)):
+ Mass = PM - Peptide.Masses[MassIndex]
+ X = self.GetX(Mass)
+ Label = "y %s"%(len(Peptide.Masses) - MassIndex - 1)
+ #Label = "y %s"%(MassIndex)
+ #print "Y series y %d, mass %f, label %s"%(MassIndex, Mass, Label)
+ if self.Labels.has_key(Label):
+ PeakIntensity = self.Labels[Label][-2]
+ Y = self.GetY(PeakIntensity)
+ self.DrawDottedLine(X, YHeight-2, X, Y, Colors.YSeriesPale)
+ else:
+ self.Draw.line((X, YHeight-2, X, YHeight+2), Colors.YSeriesPale)
+ # Now draw horizontal lines, and amino labels:
+ for AminoIndex in range(len(Peptide.Aminos)):
+ LabelA = "y %s"%(len(Peptide.Aminos) - AminoIndex)
+ LabelB = "y %s"%(len(Peptide.Aminos) - AminoIndex - 1)
+ XA = self.GetX(PM - Peptide.Masses[AminoIndex])
+ XB = self.GetX(PM - Peptide.Masses[AminoIndex+1])
+ HasLabelA = self.Labels.has_key(LabelA)
+ if AminoIndex == 0:
+ HasLabelA = 1
+ HasLabelB = self.Labels.has_key(LabelB)
+ if AminoIndex ==len(Peptide.Aminos)-1:
+ HasLabelB = 1
+ if HasLabelA and HasLabelB:
+ self.Draw.line((XA, YHeight, XB, YHeight), Colors.YSeries)
+ else:
+ self.DrawDottedLine(XA, YHeight, XB, YHeight, Colors.YSeriesPale)
+ if AminoIndex in (SeedIndex, SeedIndex+1, SeedIndex+2):
+ self.Draw.line((XA, YHeight-1, XB, YHeight-1), Colors.YSeries)
+ self.Draw.line((XA, YHeight, XB, YHeight), Colors.YSeries)
+ self.Draw.line((XA, YHeight+1, XB, YHeight+1), Colors.YSeries)
+
+ X = (XA+XB)/2 - 3
+ Str = Peptide.Aminos[AminoIndex]
+ if Peptide.Modifications.get(AminoIndex):
+ self.Draw.text((X-4, YHeight-14), "%s*"%Str, Colors.YSeries, font = TheFont)
+ else:
+ self.Draw.text((X, YHeight-14), Str, Colors.YSeries, font = TheFont)
+
+UsageInfo = """
+Usage:
+ MakeImage.py <LabeledSpectrum> [<OutputFileName>]
+"""
+
+if __name__ == "__main__":
+ if len(sys.argv)<2:
+ print UsageInfo
+ sys.exit(1)
+ InputFileName = sys.argv[1]
+ if len(sys.argv)>2:
+ OutputFileName = sys.argv[2]
+ else:
+ OutputFileName = os.path.splitext(InputFileName)[0] + ".png"
+ Maker = MSImageMaker()
+ Maker.ConvertSpectrumFileToImage(InputFileName, OutputFileName)
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0eb8aba
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,36 @@
+SHELL = /bin/sh
+.SUFFIXES:
+.SUFFIXES: .c .o
+CC = cc
+CFLAGS = -g -DDEBUG -D_CONSOLE -O1 -funroll-loops -lz
+LDFLAGS = -lm -lexpat
+
+OBJS = base64.o BN.o BuildMS2DB.o ChargeState.o CMemLeak.o Errors.o ExonGraphAlign.o \
+ FreeMod.o IonScoring.o \
+ LDA.o main.o Mods.o MS2DB.o ParentMass.o ParseInput.o ParseXML.o PValue.o \
+ Run.o Score.o Scorpion.o SNP.o Spectrum.o Spliced.o TagFile.o\
+ SpliceDB.o SpliceScan.o SVM.o Tagger.o Trie.o Utils.o
+
+HDRS = base64.h BN.h BuildMS2DB.h ChargeState.h CMemLeak.h Errors.h ExonGraphAlign.h FreeMod.h \
+ Inspect.h IonScoring.h LDA.h Mods.h MS2DB.h ParentMass.h ParseInput.h ParseXML.h PValue.h \
+ Run.h Score.h Scorpion.h TagFile.h \
+ SNP.h Spectrum.h Spliced.h SpliceDB.h SpliceScan.h SVM.h Tagger.h \
+ Trie.h Utils.h
+
+EXE = inspect
+
+all: $(EXE)
+
+.c.o: $(HDRS)
+ $(CC) $(CFLAGS) -c $<
+
+$(EXE): $(OBJS)
+ $(CC) $(CFLAGS) $(LDFLAGS) -o $(EXE) $(OBJS)
+
+clean-objs:
+ @rm -f $(OBJS) $(MPI_OBJS)
+
+clean-exe:
+ @rm -f $(EXE) $(MPI_EXE)
+
+clean: clean-objs clean-exe
diff --git a/Mods.c b/Mods.c
new file mode 100644
index 0000000..3bc0ef9
--- /dev/null
+++ b/Mods.c
@@ -0,0 +1,340 @@
+//Title: Mods.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include "Inspect.h"
+#include "Trie.h"
+#include "Utils.h"
+#include "FreeMod.h"
+#include "Mods.h"
+#include "Errors.h"
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <stdlib.h>
+
+// AllKnownPTMods is initialized once and only once. AllowedPTMods is a sub-array,
+// set before doing a search or batch of searches. AllPTModCount is the size
+// of array AllKnownPTMods, and AllowedPTModCount is the size of array AllowedPTMods.
+PTMod AllKnownPTMods[MAX_PT_MODTYPE];
+int AllPTModCount = 0;
+
+int g_PhosphorylationMod = -1;
+
+// PTMLimit[n] is a limit on how many modifications of type n can be placed
+// on a peptide. For each n, PTMLimit[n] <= GlobalOptions->MaxPTMods.
+int g_PTMLimit[MAX_PT_MODTYPE];
+
+int PlainOldDecorationIndex = 0;
+
+Decoration* AllDecorations = NULL;
+int AllDecorationCount = 0;
+int AllDecorationAllocation = 0;
+
+int CompareDecorations(const Decoration* A, const Decoration* B)
+{
+ if (A->Mass < B->Mass)
+ {
+ return -1;
+ }
+ if (A->Mass > B->Mass)
+ {
+ return 1;
+ }
+ return 0;
+}
+
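+// Recursively enumerate decorations: starting from the decoration at SourceDecorationIndex,
+// try appending one more PTM of each type with index >= MinPTModIndex (so each multiset of
+// mods is generated only once), as long as that PTM type still has budget in PTMRemaining
+// and ModsLeft is positive.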
+void ExpandDecorationList(int SourceDecorationIndex, int MinPTModIndex, int* PTMRemaining, int ModsLeft)
+{
+ int PTModIndex;
+ int Decor;
+ //
+ if (ModsLeft <= 0)
+ {
+ return;
+ }
+ for (PTModIndex = MinPTModIndex; PTModIndex < AllPTModCount; PTModIndex++)
+ {
+ if (PTMRemaining[PTModIndex] <= 0)
+ {
+ continue;
+ }
+ // If we have a lot of decorations, expand the memory available for them:
+ if (AllDecorationCount == AllDecorationAllocation-1)
+ {
+ AllDecorationAllocation *= 2;
+ AllDecorations = (Decoration*)realloc(AllDecorations, sizeof(Decoration) * AllDecorationAllocation);
+ }
+ Decor = AllDecorationCount;
+ AllDecorationCount++;
+ //printf("ExpandDecorationList memcpy\n");
+ //fflush(stdout);
+ memcpy(AllDecorations[Decor].Mods, AllDecorations[SourceDecorationIndex].Mods, sizeof(int) * MAX_PT_MODTYPE);
+ AllDecorations[Decor].Mods[PTModIndex]++;
+ AllDecorations[Decor].TotalMods = AllDecorations[SourceDecorationIndex].TotalMods + 1;
+ AllDecorations[Decor].Mass = AllDecorations[SourceDecorationIndex].Mass + MassDeltaByIndex[AMINO_ACIDS*MAX_PT_MODTYPE + PTModIndex]->RealDelta;
+ //printf("ExpandDecorationList memcpy done\n");
+ //fflush(stdout);
+ //printf("Added decoration %d (%.2f) ", Decor, AllDecorations[Decor].Mass);
+ //for (ModIndex = 0; ModIndex < AllPTModCount; ModIndex++)
+ //{
+ // printf("%d ", AllDecorations[Decor].Mods[ModIndex]);
+ //}
+ //printf("\n");
+
+ PTMRemaining[PTModIndex] -= 1;
+ ExpandDecorationList(Decor, PTModIndex, PTMRemaining, ModsLeft - 1);
+ PTMRemaining[PTModIndex] += 1;
+ //printf("Considering PTModIndex %d/%d\n",PTModIndex,AllPTModCount);
+ }
+}
+
+
+// Populates the PTM list as if all possible single amino acid substitutions (~400 mutations) were specified in the input file
+int PopulatePTMListWithMutations()
+{
+ int ModFlags;
+
+ char* StrAminos = NULL;
+ char* StrType = NULL;
+ char* StrName = NULL;
+ float MassDelta;
+ int ScaledMassDelta;
+
+ int FromIndex;
+ int ToIndex;
+
+ int Bin;
+ int ModIndex;
+ char ModNameBuffer[64];
+
+
+ AllPTModCount = 0;
+ //
+ if (!MassDeltas)
+ {
+ LoadMassDeltas(NULL, 0);
+ }
+ if (AllPTModCount == MAX_PT_MODTYPE)
+ {
+ // Too many!
+ REPORT_ERROR_S(35, "??");
+ return 0;
+ }
+
+
+ //printf("Starting to populatePTMList!!!\n");
+ StrAminos = (char*)calloc(2,sizeof(char));
+ StrName = (char*)calloc(5,sizeof(char));
+ for(FromIndex = 0; FromIndex < AMINO_ACIDS; ++FromIndex)
+ {
+ if(PeptideMass[FromIndex + (int)('A')] == 0)
+ continue;
+ sprintf(StrAminos,"%c",(char)(FromIndex + 'A'));
+ ModFlags = DELTA_FLAG_VALID;
+ for(ToIndex = 0; ToIndex < AMINO_ACIDS; ++ToIndex)
+ {
+ if(PeptideMass[ToIndex + (int)('A')] == 0)
+ continue;
+ if(FromIndex == ToIndex)
+ continue;
+
+ ScaledMassDelta = PeptideMass[ToIndex + (int)('A')] - PeptideMass[FromIndex +(int)('A')];
+
+ MassDelta = ((float)(ScaledMassDelta))/MASS_SCALE;
+ sprintf(StrName,"%c->%c",(char)(FromIndex + 'a'),(char)(ToIndex + 'a'));
+
+ //printf("Scaled mass of %c->%c = %d, %.3f, %s\n",(char)(FromIndex + 'A'),(char)(ToIndex+'A'),ScaledMassDelta,MassDelta,StrName);
+ // Default modification type is OPTIONAL.
+ if (!StrType)
+ {
+ StrType = "opt";
+ }
+
+ AllKnownPTMods[AllPTModCount].Flags = ModFlags;
+ strncpy(AllKnownPTMods[AllPTModCount].Name, StrName, 5);
+ AllKnownPTMods[AllPTModCount].Allowed[FromIndex] = 1;
+ // Add to the first still-available slot:
+ for (ModIndex = 0; ModIndex < GlobalOptions->DeltasPerAA; ModIndex++)
+ {
+ if (!MassDeltas[FromIndex][ModIndex].Flags)
+ {
+ strncpy(MassDeltas[FromIndex][ModIndex].Name, StrName, 20); // Name is char[21]; copying 40 would overflow via strncpy's zero padding
+ MassDeltas[FromIndex][ModIndex].RealDelta = ScaledMassDelta;
+ ROUND_MASS_TO_DELTA_BIN(MassDelta, Bin);
+ MassDeltas[FromIndex][ModIndex].Delta = Bin;
+ MassDeltas[FromIndex][ModIndex].Index = AllPTModCount;
+ MassDeltaByIndex[FromIndex * MAX_PT_MODTYPE + AllPTModCount] = &MassDeltas[FromIndex][ModIndex];
+ MassDeltaByIndex[MDBI_ALL_MODS * MAX_PT_MODTYPE + AllPTModCount] = &MassDeltas[FromIndex][ModIndex];
+ MassDeltas[FromIndex][ModIndex].Flags = ModFlags;
+ break;
+ }
+ }
+
+ AllKnownPTMods[AllPTModCount].Mass = ScaledMassDelta;
+ g_PTMLimit[AllPTModCount] = 1; // allow 1 per peptide by default
+
+
+ AllPTModCount++;
+ //printf("Total mods %d\n",AllPTModCount);
+ }
+ }
+ //printf("Populate: MaxPTMods: %d\n",GlobalOptions->MaxPTMods);
+ free(StrName);
+ free(StrAminos);
+ return 1;
+}
+
+// Entries of form IsSubDecoration[DecorIndex][OtherDecorIndex]
+int** IsSubDecoration = NULL;
+
+
+// After reading the definitions of all the post-translational modifications, we construct
+// a list of decorations.
+// Special case: If GlobalOptions->MandatoryModName is set, then we set MandatoryModIndex, and
+// we only allow decorations that *do* contain that mod.
+void BuildDecorations()
+{
+ int DecorIndex;
+ int OtherDecorIndex;
+ int ModIndex;
+ int ValidSubDecoration;
+ int PTMRemaining[MAX_PT_MODTYPE];
+ int TotalPTMsPermitted;
+ //
+
+ // Free the old IsSubDecoration array, if allocated:
+ if (IsSubDecoration)
+ {
+ for (DecorIndex = 0; DecorIndex < AllDecorationCount; DecorIndex++)
+ {
+ SafeFree(IsSubDecoration[DecorIndex]);
+ }
+ SafeFree(IsSubDecoration);
+ IsSubDecoration = NULL;
+ }
+ AllDecorationAllocation = 100;
+ SafeFree(AllDecorations); // Remove old ones!
+ AllDecorations = NULL;
+ AllDecorations = (Decoration*)calloc(AllDecorationAllocation, sizeof(Decoration));
+ // AllDecorations[0] is now prepared. (Mass 0, no mods)
+ AllDecorationCount = 1;
+ //printf("MAX_PT_MODTYPE: %d\n",MAX_PT_MODTYPE);
+ //printf("Command: memcpy(%d,%d,%d)\n",PTMRemaining, g_PTMLimit, sizeof(int) * MAX_PT_MODTYPE);
+ //fflush(stdout);
+ memcpy(PTMRemaining, g_PTMLimit, sizeof(int) * MAX_PT_MODTYPE);
+ TotalPTMsPermitted = GlobalOptions->MaxPTMods;
+ //printf("DOne memcopy\n");
+ //fflush(stdout);
+ ExpandDecorationList(0, 0, PTMRemaining, TotalPTMsPermitted);
+ qsort(AllDecorations, AllDecorationCount, sizeof(Decoration), (QSortCompare)CompareDecorations);
+
+ //DEBUG-NEC
+ /*for (DecorIndex = 0; DecorIndex < AllDecorationCount; DecorIndex++)
+ {
+ printf("AllDecorations[%d]: Mass=%d,TotalMods=%d\n",DecorIndex,AllDecorations[DecorIndex].Mass,AllDecorations[DecorIndex].TotalMods);
+ for(ModIndex = 0; ModIndex < MAX_PT_MODTYPE; ++ModIndex)
+ printf(" - MassDeltas with Index %d = %d\n",ModIndex,AllDecorations[DecorIndex].Mods[ModIndex]);
+ }
+ */
+ // Locate the index of the unmodified null-decoration. (Usually it's #0, because
+ // it has mass 0, but it's possible for PTMs to have a *negative* mass)
+ for (DecorIndex = 0; DecorIndex < AllDecorationCount; DecorIndex++)
+ {
+ if (AllDecorations[DecorIndex].TotalMods == 0)
+ {
+ PlainOldDecorationIndex = DecorIndex;
+ break;
+ }
+ }
+ for (ModIndex = 0; ModIndex < AllPTModCount; ModIndex++)
+ {
+ if (!CompareStrings(GlobalOptions->MandatoryModName, MassDeltaByIndex[AMINO_ACIDS*MAX_PT_MODTYPE + ModIndex]->Name))
+ {
+ GlobalOptions->MandatoryModIndex = ModIndex;
+ }
+ }
+
+ IsSubDecoration = (int**)calloc(AllDecorationCount, sizeof(int*));
+ for (DecorIndex = 0; DecorIndex < AllDecorationCount; DecorIndex++)
+ {
+ IsSubDecoration[DecorIndex] = (int*)calloc(AllDecorationCount, sizeof(int));
+ for (OtherDecorIndex = 0; OtherDecorIndex < AllDecorationCount; OtherDecorIndex++)
+ {
+ ValidSubDecoration = 1; // default
+ for (ModIndex = 0; ModIndex < AllPTModCount; ModIndex++)
+ {
+ if (AllDecorations[OtherDecorIndex].Mods[ModIndex] < AllDecorations[DecorIndex].Mods[ModIndex])
+ {
+ ValidSubDecoration = 0;
+ break;
+ }
+ }
+ if (ValidSubDecoration)
+ {
+ IsSubDecoration[DecorIndex][OtherDecorIndex] = 1;
+ }
+ }
+ }
+}
+
+void FreeIsSubDecoration()
+{
+ int ModIndex;
+ for (ModIndex = 0; ModIndex < AllDecorationCount; ModIndex++)
+ {
+ SafeFree(IsSubDecoration[ModIndex]);
+ IsSubDecoration[ModIndex] = NULL;
+ }
+ SafeFree(IsSubDecoration);
+ IsSubDecoration = NULL;
+}
+
+// Returns a PTM with this name. Returns NULL if no match found.
+// Case-insensitive (pHoSpHoRyLaTiOn is ok).
+MassDelta* FindPTModByName(char Amino, char* Name)
+{
+ int ModIndex;
+ int AminoIndex = Amino - 'A';
+ for (ModIndex = 0; ModIndex < GlobalOptions->DeltasPerAA; ModIndex++)
+ {
+ if (!MassDeltas[AminoIndex][ModIndex].Flags)
+ {
+ break;
+ }
+ if (!CompareStrings(MassDeltas[AminoIndex][ModIndex].Name, Name))
+ {
+ return &MassDeltas[AminoIndex][ModIndex];
+ }
+ }
+ return NULL;
+}
diff --git a/Mods.h b/Mods.h
new file mode 100644
index 0000000..e1d3917
--- /dev/null
+++ b/Mods.h
@@ -0,0 +1,110 @@
+//Title: Mods.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+#ifndef MODS_H
+#define MODS_H
+
+// Structs to support search with post-translational modifications.
+
+#include "Inspect.h"
+
+#define DELTA_FLAG_VALID 1
+#define DELTA_FLAG_PHOSPHORYLATION 2
+#define DELTA_FLAG_C_TERMINAL 4
+#define DELTA_FLAG_N_TERMINAL 8
+
+// A MassDeltaNode is part of a linked list, each node of which wraps
+// a MassDelta. Given a modification that affects multiple amino acids (e.g.
+// oxidation of M or W), we build one MassDelta struct...but there's one
+// MassDeltaNode for MassDeltas[M] and one for MassDeltas[W].
+typedef struct MassDeltaNode
+{
+ struct MassDelta* Delta;
+ struct MassDeltaNode* Next;
+} MassDeltaNode;
+
+typedef struct MassDelta
+{
+ float Score;
+ int Delta; // in bin-units
+ int RealDelta; // in actual mass-units. RealDelta = Delta * 10
+ char Name[21];
+ char Amino; // if this is a mutation to an amino acid
+ int Flags; // Used for noting which is phosphorylation. If flags == 0, this record is an end-of-array marker!
+
+ // Index of type. For instance, all phosphorylations have same index. Offset into AllKnownPTMods.
+ int Index;
+} MassDelta;
+
+extern MassDelta** MassDeltas;
+extern MassDelta** MassDeltaByIndex;
+// A decoration is a collection of post-translational modification. This includes the
+// 'empty decoration', with no modifications, and mass 0. When we examine the flanking
+// regions of a tag match to see whether the masses are valid, we consider each possible
+// decoration. (For instance, if the prefix mass is too low by 80 but phosphorylation
+// is available - and there's a phosphorylatable residue in the prefix - then we have a match
+// via the decoration of mass 80)
+
+typedef struct PTMod
+{
+ char Name[40];
+ int Mass;
+ // How many of this modification can be attached to a base? (Generally zero or one!)
+ int Allowed[TRIE_CHILD_COUNT];
+ int Flags;
+} PTMod;
+
+typedef struct Decoration
+{
+ int Mass;
+ int TotalMods;
+ int Mods[MAX_PT_MODTYPE]; // Decoration->Mods[n] is how many MassDeltas with Index of n are in this decoration.
+} Decoration;
+
+extern Decoration* AllDecorations;
+extern int AllDecorationCount;
+extern int PlainOldDecorationIndex;
+
+// AllKnownPTMods - initialized at startup
+extern PTMod AllKnownPTMods[MAX_PT_MODTYPE];
+extern int AllPTModCount;
+extern int g_PTMLimit[MAX_PT_MODTYPE];
+extern int g_PhosphorylationMod; // index of the phosphorylation PTM
+
+// Returns the MassDelta for the PTM with this name on the given amino acid. Returns NULL if no match is found.
+// Case-insensitive (pHoSpHoRyLaTiOn is ok).
+MassDelta* FindPTModByName(char Amino, char* Name);
+
+void BuildDecorations();
+void FreeIsSubDecoration();
+int PopulatePTMListWithMutations();
+extern int** IsSubDecoration;
+#endif // MODS_H
diff --git a/PLSUtils.py b/PLSUtils.py
new file mode 100644
index 0000000..df1f275
--- /dev/null
+++ b/PLSUtils.py
@@ -0,0 +1,265 @@
+#Title: PLSUtils.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""PLSUtils.py
+
+These are a set of related functions which help calculate the Phosphate
+Localization Score or PLS. The differences between this and the Ascore
+are described in Albuquerque MCP 2008.
+1. The winner and runner up are determined by the MQScore, not peptide score.
+2. We do not repeatedly filter the peaks to find the optimal
+AScore. That just takes too long. We use the default Inspect peak filtering which
+leaves ~12 peaks / 100 m/z. This corresponds to a p of 0.12 for the
+binomial.
+3. We will not do anything for peptides which contain
+more than 2 sites of phosphorylation. I don't trust those annotations
+anyway.
+"""
+
+import math
+import copy
+from Utils import*
+Initialize()
+
+class PLSClass:
+ def __init__(self):
+ self.ChooseTable = {} # (N, k) -> value of NchooseK
+ self.Factorial = {}
+ self.Factorial[0] = 1
+
+ def ComputeBinomial(self, N, n, p=0.12):
+ """ Make sure that you have populated the Choose table and Factorial table
+ I have defaulted the p = 0.12 because it's inspect peak density.
+ """
+ #print "computing binomial with N %s and n %s"%(N,n)
+ Sum = 0
+ for k in range(n, N+1): # range is exclusive at the top, so N+1 gives n <= k <= N
+ # n<=k<=N
+ Sum += self.ChooseTable[(N,k)] * pow(p,k) * pow((1-p),(N-k))
+ return Sum
+
+ def FillOutChooseTable(self, N):
+ """Simply dose all the possible N choose k for k < N
+ """
+ self.FillOutFactorialTable(N)
+ for k in range(N+1):
+ if self.ChooseTable.has_key((N,k)):
+ continue
+ NFac = self.Factorial[N]
+ kFac = self.Factorial[k]
+ NMinusKFac = self.Factorial[N-k]
+ Value = NFac / (kFac * NMinusKFac)
+ self.ChooseTable[(N, k)] = Value
+
+ def FillOutFactorialTable(self, X):
+ """Get all the factorials of numbers <= X """
+ for x in range(1, X+1):
+ if self.Factorial.has_key(x):
+ continue # already have this
+ ##here we calculate it for the value x
+ Prod = x
+ for num in range(1, x):
+ Prod *= num
+ self.Factorial[x] = Prod
+ #for Pair in self.Factorial.items():
+ # print "Factorials ",Pair
+
+ def ComputePLS(self, N, nWinner, nRunnerUp):
+ """From the Ascore Paper paper:
+ AScore = ScoreWinner - ScoreRunnerUp
+ Score = -10 * Log(P)
+ P = Sum_{k=n}^N [N choose k] p^k * (1-p)^{N-k}
+ k = iterator
+ N = number of distinguishing ions
+ n = number of distinguishing ions found
+ p = some very poorly explained probability number. It appears to be the probabilty of
+ a peak in 100 m/z. Inspect filters stuff automatically and I don't want to mess
+ with it. so this number will be PeakDensity of 100 m/z units /100
+ """
+ self.FillOutChooseTable(N) # the N, or number of potential peaks
+ TopBinomial = self.ComputeBinomial(N, nWinner)
+ RunnerUpBinomial = self.ComputeBinomial(N, nRunnerUp)
+ TopScore = -10 * math.log(TopBinomial, 10)
+ RunnerUpScore = -10 * math.log(RunnerUpBinomial, 10)
+ PLS = TopScore - RunnerUpScore # The AScore
+ return PLS
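+ # Example with hypothetical counts: PLSClass().ComputePLS(20, 8, 3) returns the
+ # difference (in -10*log10 units) between the winner's and runner-up's binomial
+ # scores; larger values mean more confident phosphosite localization.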
+
+ def GetDistinguishingPeaks(self, Peptide1, Peptide2):
+ """ Given two peptides, find all the peaks that
+ distinguish between the two phos placements, and return
+ those in a list. e.g. SphosEPTIDE vs. SEPTphosIDE
+ distinguishing B fragments = b1: Sphos or S
+ b2: SphosE or SE
+ b3: SphosEP or SEP
+ b4: SphosEPT and SEPTphos have the same mass = NOT DISTINGUISHING
+
+ The general case is: if the b fragment has a different number of phosphorylations
+ between the two annotations, then both b and y are distinguishing.
+ """
+ Differences = [] # list of indices that differ; [1] means a difference at b1 / y(n-1)
+ ModIndex1 = Peptide1.Modifications.keys() # a list of indicies
+ #print "Peptide %s"%Peptide1.GetModdedName()
+ for Index in ModIndex1:
+ #print "This is my mod on %s, %s"%(Index, Peptide1.Modifications[Index])
+ PTMList = Peptide1.Modifications[Index]
+ FoundPhos =0
+ for Item in PTMList:
+ if Item.Name == "Phosphorylation":
+ FoundPhos = 1
+ if not FoundPhos:
+ ModIndex1.remove(Index)
+ ModIndex2 = Peptide2.Modifications.keys() # a list of indicies
+ #print "Peptide %s"%Peptide2.GetModdedName()
+ for Index in ModIndex2:
+ #print "This is my mod on %s, %s"%(Index, Peptide2.Modifications[Index])
+ PTMList = Peptide2.Modifications[Index]
+ FoundPhos =0
+ for Item in PTMList:
+ if Item.Name == "Phosphorylation":
+ FoundPhos = 1
+ if not FoundPhos:
+ ModIndex2.remove(Index)
+        Count1 = 0
+        Count2 = 0
+        for Index in range(len(Peptide1.Aminos)):
+            if Index in ModIndex1:
+                Count1 += 1
+            if Index in ModIndex2:
+                Count2 += 1
+            if Count1 != Count2:
+ Differences.append(Index+1)
+        ## now we have a list of the b indices. Let's make the return list
+ DistinguishingPeaks = []
+ for B in Differences:
+ YIndex = len(Peptide1.Aminos) - B
+ #print "a distinguishing peak B%s, Y%s"%(B, YIndex)
+ DistinguishingPeaks.append("B%s"%B)
+ DistinguishingPeaks.append("Y%s"%YIndex)
+ return DistinguishingPeaks
+
+
+ def GetAlternateAnnotations(self, Peptide):
+ """Given an annotation(SAMPAYphosNE), return all
+ alternate annotations. This version should work correctly
+ in the presence of non-phosphorylation modifications,
+ e.g. SAM+16PAYphosNE
+ """
+ #Dummy = GetPeptideFromModdedName("SAM+16PAYphosNESphosT")
+ #Peptide = Dummy
+ NumPhos = Peptide.GetModdedName().count("phos")
+ AllAnnotations = []
+        if NumPhos not in [1, 2]:
+ return AllAnnotations # empty list
+        ## if the number of phos equals the number of potential residues (S/T/Y),
+        ## an AScore is impossible: there is only one possible placement
+ Count = Peptide.Aminos.count("S")
+ Count += Peptide.Aminos.count("T")
+ Count += Peptide.Aminos.count("Y")
+ if Count == NumPhos:
+            return AllAnnotations # only one possible placement, so the PLS is "N/A"
+
+ (Dephos, PhosPTM) = self.RemovePhosFromPeptide(Peptide)
+ for Index in range(len(Dephos.Aminos)):
+ if Dephos.Aminos[Index] in ["S", "T", "Y"]:
+ #place a phosphorylation
+ CreateNewLevel1 =0
+ if not Dephos.Modifications.has_key(Index):
+ CreateNewLevel1 = 1
+ Dephos.Modifications[Index] = []
+ Dephos.Modifications[Index].append(PhosPTM)
+ #do I need to place a second phosphate?
+ if NumPhos == 2:
+ for Jndex in range(Index + 1, len(Dephos.Aminos)):
+ if Dephos.Aminos[Jndex] in ["S", "T", "Y"]:
+ CreateNewLevel2 = 0
+ if not Dephos.Modifications.has_key(Jndex):
+ Dephos.Modifications[Jndex] = []
+ CreateNewLevel2 = 1
+ Dephos.Modifications[Jndex].append(PhosPTM)
+ #add string to list, remove phos and move on.
+ #print Dephos.GetModdedName()
+ AllAnnotations.append(Dephos.GetModdedName())
+ Dephos.Modifications[Jndex].pop()
+ if CreateNewLevel2:
+ del Dephos.Modifications[Jndex]
+ else:
+ #only one phos. Add string to list
+ AllAnnotations.append(Dephos.GetModdedName())
+ #print Dephos.GetModdedName()
+ #only add string to the list if it's a single phos peptide
+                #regardless, get rid of the modification now that we've done all its combinations
+ Dephos.Modifications[Index].pop() # get rid of the most recently added Phos PTM
+ if CreateNewLevel1:
+ del Dephos.Modifications[Index]
+ ## now one last thing remains. We have to remove the original annotation from the list
+ try:
+ AllAnnotations.remove(Peptide.GetModdedName())
+        except ValueError:
+            print "** Error: the original annotation was not among the generated alternates; returning an empty list."
+ return []
+ return AllAnnotations
+
+
+ def RemovePhosFromPeptide(self, Peptide):
+ """Given a Peptide object, remove the phosphorylation PTMod
+ objects and return the neutered copy
+ """
+ Clone = copy.deepcopy(Peptide)
+ PTModPhos = None
+ RemoveElement = [] #indicies in Clone.Modifications that become empty
+ for AminoIndex in Clone.Modifications:
+ ModificationList = Clone.Modifications[AminoIndex]
+            for PTMod in ModificationList[:]: # iterate over a copy; removing from the list being iterated would skip items
+ if PTMod.Name == "Phosphorylation":
+ if not PTModPhos:
+ PTModPhos = PTMod
+ ModificationList.remove(PTMod)
+ if len(ModificationList) == 0:
+ RemoveElement.append(AminoIndex)
+ ## now clone is without phosphorylations, but with other modifications
+ ## clean up the empty Modification keys
+ for Index in RemoveElement:
+ del Clone.Modifications[Index]
+ return (Clone, PTModPhos)
+
+ def GetSupportingPeaks(self, FullPeakList, DistinguishingPeakList):
+ SupportingPeaks = {} # key = b8 value = 1
+ for Tuple in FullPeakList:
+ Ion = Tuple[2]
+ Index = Tuple[3]
+ if Ion in ["B", "Y", "B2", "Y2"]:
+ Peak = "%s%s"%(Ion[0], Index)
+ if Peak in DistinguishingPeakList:
+ #print "Supporting peak found %s"%Peak
+ SupportingPeaks[Peak]= 1
+ return len(SupportingPeaks)
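+
+## A minimal usage sketch, assuming this module is used inside the Inspect source tree
+## (so the "from Utils import *" above succeeds). The counts below (N = 14 potential
+## distinguishing ions, 6 matched by the winner, 2 by the runner-up) are invented
+## purely to illustrate the binomial / PLS arithmetic, not taken from real data.
+if __name__ == "__main__":
+    Scorer = PLSClass()
+    ExamplePLS = Scorer.ComputePLS(14, 6, 2)
+    # ComputePLS(N, nWinner, nRunnerUp) = -10*log10(P(X >= 6)) + 10*log10(P(X >= 2))
+    # for X ~ Binomial(14, 0.12)
+    print "Example PLS score: %.2f"%ExamplePLS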
diff --git a/PMCLDA1.model b/PMCLDA1.model
new file mode 100644
index 0000000..4d68c06
Binary files /dev/null and b/PMCLDA1.model differ
diff --git a/PMCLDA2.model b/PMCLDA2.model
new file mode 100644
index 0000000..0dd9477
Binary files /dev/null and b/PMCLDA2.model differ
diff --git a/PMCLDA2Phos.model b/PMCLDA2Phos.model
new file mode 100644
index 0000000..5cc8d51
Binary files /dev/null and b/PMCLDA2Phos.model differ
diff --git a/PMCLDA3.model b/PMCLDA3.model
new file mode 100644
index 0000000..e8ab123
Binary files /dev/null and b/PMCLDA3.model differ
diff --git a/PMCLDA3Phos.model b/PMCLDA3Phos.model
new file mode 100644
index 0000000..e28937c
Binary files /dev/null and b/PMCLDA3Phos.model differ
diff --git a/PRM2.bn b/PRM2.bn
new file mode 100644
index 0000000..a063c46
Binary files /dev/null and b/PRM2.bn differ
diff --git a/PRM2.dat b/PRM2.dat
new file mode 100644
index 0000000..08dc4b6
Binary files /dev/null and b/PRM2.dat differ
diff --git a/PRM3.bn b/PRM3.bn
new file mode 100644
index 0000000..04b5337
Binary files /dev/null and b/PRM3.bn differ
diff --git a/PRM3.dat b/PRM3.dat
new file mode 100644
index 0000000..f12c457
Binary files /dev/null and b/PRM3.dat differ
diff --git a/PRMQ2.dat b/PRMQ2.dat
new file mode 100644
index 0000000..e68f8c9
Binary files /dev/null and b/PRMQ2.dat differ
diff --git a/PRMQ3.dat b/PRMQ3.dat
new file mode 100644
index 0000000..bfeb6aa
Binary files /dev/null and b/PRMQ3.dat differ
diff --git a/PTMAnalysis.py b/PTMAnalysis.py
new file mode 100644
index 0000000..12080f3
--- /dev/null
+++ b/PTMAnalysis.py
@@ -0,0 +1,523 @@
+#Title: PTMAnalysis.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+PTMAnalysis.py
+This wrapper script automates the steps involved in processing raw Inspect
+results into finalized PTM site identifications.
+1. FDRUtils.py - obtain a consistent pvalue across all runs
+2. ComputePTMFeatures.py - group individual annotations into peptides and
+ compute some features for each.
+3. BuildMGF.py - builds a single .mgf file out of all the consensus spectra
+ created in step 2 (in preparation for the Inspect run)
+4. Inspect run. Search clustered spectra (unmodified) against a large database
+5. PTMSearchBigDB.py - Integrates the results of the Inspect search against a
+   large database, finding a delta-score.
+6. TrainPTMFeatures.py - Computes the PValue of each site from a model
+7. AdjustPTM.py - attempts to merge and reconcile sites
+8. AdjustPTM.py (known chemistry) - attempts to find known explanations for
+ the site
+
+Depending on the size of your dataset, this program may take quite a while
+(up to 1 day per million spectra). It is reasonably easy to parallelize
+the time-intensive steps (2) and (4) above; doing so is left to the user, since
+compute clusters are heterogeneous.
+
+To perform only a subset of the steps, you can designate start and stop steps.
+This is useful if the program crashes partway through, or if you would like
+to run some steps on a grid and others locally.
+When starting from a step other than the first, the program assumes that all
+previous steps have been executed and that their outputs are in the proper directories.
+See the functions below for the expected directory and file names.
+"""
+import os
+import getopt
+import sys
+import traceback
+from Utils import *
+
+UsageInfo = """
+PTMAnalysis.py - produces a set of PTM sites.
+
+Required options:
+ -r [FileName] - The name of the results file to parse. If a directory is
+ specified, then all .txt files within the directory will be combined into
+ one report
+ -d [FileName] - The name of the database file (.trie format) searched.
+ -w [FileName] - The final output filename
+ -s [Dir] - Directory containing the MS/MS spectra
+ -B [FilePath] - Path to the large database (for unmodified "decoy" search).
+ -S [Value] - The fraction of sequences in the database that are shuffled.
+ -k [FileName]: Known chemistry filename. If specified, consider altering
+ sites to match known chemical adducts; report the best site-score
+ attainable by using known chemical adducts.
+
+Additional options:
+ -p [Value] - The p-value cutoff for spectrum annotations. Defaults to 0.1
+ -q [Value] - The p-value cutoff for PTM site annotations. Defaults to 0.05
+ -t [Path] - The path where all intermediate results will be written.
+     Defaults to PTMTempFiles
+ -i [Instrument] - The type of instrument the spectra were acquired on.
+     Defaults to ESI-ION-TRAP. Valid values are: QTOF, FT-Hybrid
+ -n [FileName] - The parameters file to use in the BigSearch (the spectra and DB
+     lines will be replaced; blind-search settings will be removed)
+
+Advanced options: to run only a subset of the steps
+ -x [StartStep] - Start with step X (assume that previous steps are done)
+ -y [StopStep] - Stop with step Y (inclusive)
+
+Protein selection can be performed, replacing the protein identifications
+with a parsimonious set of protein IDs (using a simple iterative
+approach). The following option enables protein selection:
+ -a: Replace protein identifications with a "parsimonious" set of protein IDs.
+
+"""
+
+class Steps:
+ First = 1
+ PValue = 1
+ ComputePTMFeatures = 2
+ BuildMGF = 3
+ RunInspect = 4
+ SearchBigDB = 5
+ TrainPTMFeatures = 6
+ AdjustPeptides = 7
+ AdjustToKnownChemistry = 8
+ Last = 8
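+
+# For example (hypothetical values), running with "-x 4 -y 6" would execute only the
+# Inspect search, PTMSearchBigDB, and TrainPTMFeatures steps, assuming the outputs of
+# steps 1-3 are already present under the -t temporary directory.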
+
+class WrapperClass:
+ def __init__ (self):
+        self.SpectrumPValue = 0.1 # for the FDRUtils.py (p-value) step
+ self.SitePValue = 0.05 # for the final output
+ self.BasePath = "PTMTempFiles"
+ self.PValuePath = None
+ self.PValueOutput = None
+ self.ComputePTMPath = None
+ self.ComputePTMOutput = None
+ self.TrainPTMPath = None
+ self.TrainPTMOutput = None
+ self.TrainPTMModelOutput = None
+ self.AdjustDir = None
+ self.AdjustOutputPath = None
+ self.AdjustModelOutput = None
+ self.SearchBigDBPath = None
+ self.SearchBigDBOutput = None
+ self.InputResults = None
+ self.DatabaseFile = None
+ self.PercentShuffled = None
+ self.SpectraDir = None
+ self.SelectProteins = 0 #default
+ self.FinalOutputFile = None
+ self.BigDB = None
+ self.Instrument = "ESI-ION-TRAP"
+ self.BuildMGFPath = None
+ self.InspectOutDir = None
+ self.MGFPath = None
+ self.SpawnFlag = 0
+ self.StartStep = Steps.First
+ self.StopStep = Steps.Last
+ self.KnownChemistryFileName = None
+ self.TrainPTMModelType = "svm" # default
+ self.ParamsFile = None
+ def SetupDirectories(self):
+ """
+ Below the basepath there will be a group of directories, one for
+ each major step
+ """
+ self.PValuePath = os.path.join(self.BasePath, "PValued")
+ self.ComputePTMPath = os.path.join(self.BasePath, "ComputePTMFeatures")
+ self.TrainPTMPath = os.path.join(self.BasePath, "TrainPTMFeatures")
+ self.AdjustDir = os.path.join(self.BasePath, "AdjustPTM")
+ self.SearchBigDBPath = os.path.join(self.BasePath, "SearchBigDB")
+ self.BuildMGFPath = os.path.join(self.BasePath, "BuildMGF")
+ self.InspectOutDir = os.path.join(self.BasePath, "InspectOut")
+ print "Making temporary directories in %s for all intermediate output"%self.BasePath
+ MakeDirectory(self.PValuePath)
+ MakeDirectory(self.ComputePTMPath)
+ MakeDirectory(self.TrainPTMPath)
+ MakeDirectory(self.AdjustDir)
+ MakeDirectory(self.SearchBigDBPath)
+ MakeDirectory(self.BuildMGFPath)
+ MakeDirectory(self.InspectOutDir)
+ def RunPValue(self):
+ """
+ FDRUtils.py
+ -r InputResults
+ -w OutputResults
+ -S Percent of database shuffled (optional)
+ -p pvalue cutoff
+ -s Distribution file
+ -i Distribution image
+ -H write out results from shuffled protein
+ """
+ self.PValueOutput = self.PValuePath # default, a directory for directory input
+ DistributionFile = os.path.join(self.PValuePath, "Distribution.txt")
+ if not os.path.isdir(self.InputResults): # the InputResults is a single file
+ FileName = os.path.split(self.InputResults)[1]
+ self.PValueOutput = os.path.join(self.PValueOutput, FileName)
+ PValueArgs = ""
+ if self.PercentShuffled:
+ PValueArgs = "-r %s -w %s -S %f -p %f -s %s -i -H" %(self.InputResults, self.PValueOutput, self.PercentShuffled, self.SpectrumPValue, DistributionFile)
+ else:
+ PValueArgs = "-r %s -w %s -p %f -s %s -i -H" %(self.InputResults, self.PValueOutput, self.SpectrumPValue, DistributionFile)
+ if self.SelectProteins:
+ PValueArgs += " -a -d %s"%self.DatabaseFile
+ PValueArgs += " -b "
+ if self.StartStep <= Steps.PValue and Steps.PValue <= self.StopStep:
+ print "Step %s: FDRUtils"%Steps.PValue
+ print "Arguments: %s"%PValueArgs
+ if self.SpawnFlag:
+ Command = "python FDRUtils.py %s"%PValueArgs
+ print Command
+ os.system(Command)
+ else:
+ import FDRUtils
+ ArgsList = PValueArgs.split()
+ Parser = FDRUtils.PValueParser()
+ Parser.ParseCommandLine(ArgsList)
+ FDRUtils.Main(Parser)
+ del FDRUtils
+ else:
+ print "Skipping Step %s: FDRUtils"%Steps.PValue
+ def RunComputePTMFeatures(self):
+ """
+ ComputePTMFeatures.py
+ -r InputResults
+ -w OutputDir
+ -d Database
+ -s spectra
+ """
+ self.ComputePTMOutput = os.path.join(self.ComputePTMPath, "PTMFeatures.txt")
+ Args = " -r %s -w %s -d %s -s %s"%(self.PValueOutput, self.ComputePTMPath, self.DatabaseFile, self.SpectraDir)
+ if self.StartStep <= Steps.ComputePTMFeatures and Steps.ComputePTMFeatures <= self.StopStep:
+ print "Step %s: ComputePTMFeatures"%Steps.ComputePTMFeatures
+ print "Arguments: %s"%Args
+ if self.SpawnFlag:
+ Command = "ComputePTMFeatures.py %s"%Args
+ print Command
+ os.system(Command)
+ else:
+ import ComputePTMFeatures
+ ArgsList = Args.split()
+ Computer = ComputePTMFeatures.PTMFeatureComputer()
+ Computer.ParseCommandLine(ArgsList)
+ Computer.ComputeFeaturesMain()
+ del ComputePTMFeatures
+ else:
+ print "Skipping Step %s: ComputePTMFeatures"%Steps.ComputePTMFeatures
+ def RunBuildMGF(self):
+ """
+ BuildMGF.py
+ -d PTM feature directory
+ -m MGF file to make
+ """
+ self.MGFPath = os.path.join(self.BuildMGFPath, "spectra.mgf")
+ Args = " -d %s -m %s"%(self.ComputePTMPath, self.MGFPath)
+
+ if self.StartStep <= Steps.BuildMGF and Steps.BuildMGF <= self.StopStep:
+ print "Step %s: BuildMGF"%Steps.BuildMGF
+ print "Arguments: %s"%Args
+ if self.SpawnFlag:
+ Command = "BuildMGF.py %s"%Args
+ print Command
+ os.system(Command)
+ else:
+ ArgsList = Args.split()
+ import BuildMGF
+ Builder = BuildMGF.MGFBuilder()
+ Builder.ParseCommandLine(ArgsList)
+ Builder.Main()
+ del BuildMGF
+ else:
+ print "Skipping Step %s: BuildMGF"%Steps.BuildMGF
+ def RunInspect(self):
+ """
+ Given that the mgf file was previously created, here we create an
+ input file for Inspect and run it.
+ """
+ InspectExe = None
+ if sys.platform == "win32":
+ InspectExe = "Inspect.exe"
+ else:
+ InspectExe = "./inspect"
+ InspectIn = "BigSearch.in"
+ self.InspectOutFile = os.path.join(self.InspectOutDir, "Results.txt")
+ Command = "%s -i %s -o %s"%(InspectExe, InspectIn, self.InspectOutFile)
+
+ Dict = {}
+ if self.ParamsFile:
+ File = open(self.ParamsFile,'r')
+
+ for Line in File:
+ Line = Line.strip()
+ if Line == "":
+ continue
+ Bits = Line.split(",")
+ if Dict.has_key(Bits[0].lower()):
+ Dict[Bits[0].lower()].append((",".join(Bits[1:])).lower())
+ else:
+ Dict[Bits[0].lower()] = [(",".join(Bits[1:])).lower()]
+ File.close()
+ if Dict.has_key("blind"):
+ del Dict["blind"]
+ if Dict.has_key("mods"):
+ del Dict["mods"]
+ if Dict.has_key("sequencefile"):
+ del Dict["sequencefile"]
+ if Dict.has_key("unrestrictive"):
+ del Dict["unrestrictive"]
+ if Dict.has_key("maxptmsize"):
+ del Dict["maxptmsize"]
+
+ else:
+
+ Dict["protease"] = ["Trypsin"]
+ Dict["mod"] = ["57,C,fix"]
+ Dict["tagcount"] = ["25"]
+
+ if not Dict.has_key("instrument"):
+ Dict["instrument"] = [self.Instrument]
+ Dict["db"] = [self.BigDB]
+ Dict["spectra"] = [self.MGFPath]
+
+ InFileCommands = ""
+ for Key in Dict.keys():
+ List = Dict[Key]
+ Str = ""
+ for L in List:
+ Str += "%s,%s\n"%(Key,L)
+ InFileCommands += Str
+
+ #InFileCommands = "spectra,%s\n"%self.MGFPath
+ #InFileCommands += "instrument,%s\n"%self.Instrument
+ #InFileCommands += "protease,Trypsin\n"
+ #InFileCommands += "DB,%s\n"%self.BigDB
+ #InFileCommands += "mod,57,C,fix\n"
+ #InFileCommands += "tagcount,25\n"
+ #print InFileCommands
+ #raw_input()
+ Handle = open(InspectIn, "wb")
+ Handle.write(InFileCommands)
+ Handle.close()
+ if self.StartStep <= Steps.RunInspect and Steps.RunInspect <= self.StopStep:
+ print "Step %s: Run Inspect, searching consensus spectra"%Steps.RunInspect
+ print "Arguments: %s"%Command
+ os.system(Command)
+ else:
+ print "Skipping Step %s: Run Inspect, searching consensus spectra"%Steps.RunInspect
+ def RunPTMSearchBigDB(self):
+ """
+ PTMSearchBigDB.py
+ -d PTM feature directory
+ -w Outputfile to write
+ -r Inspect Search Results (default to directory, not file)
+ """
+ self.SearchBigDBOutput = os.path.join(self.SearchBigDBPath, "Results.txt")
+ # we use the InspectOutDir in case there are multiple files within the directory.
+ Args = " -d %s -w %s -r %s"%(self.ComputePTMPath, self.SearchBigDBOutput, self.InspectOutDir)
+ if self.StartStep <= Steps.SearchBigDB and Steps.SearchBigDB <= self.StopStep:
+ print "Step %s: PTMSearchBigDB"%Steps.SearchBigDB
+ print "Arguments: %s"%Args
+ if self.SpawnFlag:
+ Command = "PTMSearchBigDB.py %s"%Args
+ print Command
+ os.system(Command)
+ else:
+ ArgsList = Args.split()
+ import PTMSearchBigDB
+ Searcher = PTMSearchBigDB.PTMSearcher()
+ Searcher.ParseCommandLine(ArgsList)
+ Searcher.Main()
+ del PTMSearchBigDB
+ else:
+ print "Skipping Step %s: PTMSearchBigDB"%Steps.SearchBigDB
+ def RunTrainPTMFeatures(self):
+ """
+ TrainPTMFeatures.py
+ -u InputResults
+ -v OutputResults
+ -m ModelType
+ -w OutputModel
+ """
+ self.TrainPTMOutput = os.path.join(self.TrainPTMPath, "Results.txt")
+ self.TrainPTMModelOutput = os.path.join(self.TrainPTMPath, "model.%s.txt"%self.TrainPTMModelType)
+ Args = "-u %s -v %s -m %s -w %s"%(self.SearchBigDBOutput, self.TrainPTMOutput, self.TrainPTMModelType, self.TrainPTMModelOutput)
+ if self.StartStep <= Steps.TrainPTMFeatures and Steps.TrainPTMFeatures <= self.StopStep:
+ print "Step %s: TrainPTMFeatures"%Steps.TrainPTMFeatures
+ print "Arguments: %s"%Args
+ if self.SpawnFlag:
+ Command = "TrainPTMFeatures.py %s"%Args
+ print Command
+ os.system(Command)
+ else:
+ ArgsList = Args.split()
+ import TrainPTMFeatures
+ Trainer = TrainPTMFeatures.PTMFeatureTrainer()
+ Trainer.ParseCommandLine(ArgsList)
+ Trainer.TrainModel()
+ del TrainPTMFeatures
+ else:
+ print "Skipping Step %s: TrainPTMFeatures"%Steps.TrainPTMFeatures
+ def AdjustPeptides(self):
+ """
+ AdjustPTM.py
+ -r InputResults
+ -w OutputResults
+ -d Database File
+ -c Cluster directory from ComputePTMFeatures
+ -m model INPUT filename
+ -M model OUTPUT filename
+ """
+ self.AdjustOutputPath = os.path.join(self.AdjustDir, "Results.txt")
+ self.AdjustModelOutput = os.path.join(self.AdjustDir, "model.%s.txt"%self.TrainPTMModelType)
+ Args = "-r %s -w %s -d %s -c %s -m %s -M %s -z "%(self.TrainPTMOutput, self.AdjustOutputPath, self.DatabaseFile, self.ComputePTMPath, self.TrainPTMModelOutput, self.AdjustModelOutput)
+ if self.StartStep <= Steps.AdjustPeptides and Steps.AdjustPeptides <= self.StopStep:
+ print "Step %s: AdjustPTM"%Steps.AdjustPeptides
+ print "Arguments: %s"%Args
+ if self.SpawnFlag:
+ Command = "AdjustPTM.py %s"%Args
+ print Command
+ os.system(Command)
+ else:
+ ArgsList = Args.split()
+ import AdjustPTM
+ Adjutant = AdjustPTM.PTMAdjuster()
+ Adjutant.ParseCommandLine(ArgsList)
+ Adjutant.Main()
+ del AdjustPTM
+ else:
+ print "Skipping Step %s: AdjustPTM"%Steps.AdjustPeptides
+ def AdjustToKnownChemistry(self):
+ """
+ AdjustPTM.py
+ -r InputResults
+ -w OutputResults
+ -m input Model
+ -d Database
+ -c Clusters
+ -k Known PTM file
+ -v verbose output file
+ """
+ if not self.KnownChemistryFileName:
+ print "* Skipping AdjustToKnownChemistry: Requires a file (-k) of 'common' modifications."
+ return
+ KnownPTMVerboseOutputPath = os.path.join(self.BasePath, "KnownPTMOutput.txt")
+ Args = "-r %s -w %s -m %s -d %s -c %s -k %s -v %s"%(self.AdjustOutputPath, self.FinalOutputFile, self.AdjustModelOutput, self.DatabaseFile, self.ComputePTMPath, self.KnownChemistryFileName, KnownPTMVerboseOutputPath)
+
+ if self.StartStep <= Steps.AdjustToKnownChemistry and Steps.AdjustToKnownChemistry <= self.StopStep:
+ print "Step %s: AdjustToKnownChemistry"%Steps.AdjustToKnownChemistry
+ print "Arguments: %s"%Args
+ if self.SpawnFlag == 1:
+ Command = "AdjustPTM.py %s"%Args
+ print Command
+ os.system(Command)
+ else:
+ ArgsList = Args.split()
+ import AdjustPTM
+ Adjutant = AdjustPTM.PTMAdjuster()
+ Adjutant.ParseCommandLine(ArgsList)
+ Adjutant.Main()
+ del AdjustPTM
+ else:
+ print "Skipping Step %s: AdjustToKnownChemistry"%Steps.AdjustToKnownChemistry
+ def ParseCommandLine(self, Arguments):
+ "Args is a list of arguments only (does not include sys.argv[0] == script name)"
+ (Options, Args) = getopt.getopt(Arguments, "r:d:w:s:p:q:t:S:B:i:x:y:k:bZm:n:")
+ OptionsSeen = {}
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-r":
+ self.InputResults = Value
+ elif Option == "-m":
+ self.TrainPTMModelType = Value # "svm" or "lda"
+ elif Option == "-d":
+ self.DatabaseFile = Value
+ elif Option == "-w":
+ self.FinalOutputFile = Value
+ elif Option == "-s":
+ self.SpectraDir = Value
+ elif Option == "-p":
+ self.SpectrumPValue = float(Value)
+ elif Option == "-q":
+ self.SitePValue = float(Value)
+ elif Option == "-t":
+ self.BasePath = Value
+ elif Option == "-S":
+ self.PercentShuffled = float (Value)
+ elif Option == "-B":
+ self.BigDB = Value
+ elif Option == "-x":
+ self.StartStep = int (Value)
+ elif Option == "-y":
+ self.StopStep = int (Value)
+ elif Option == "-k":
+ self.KnownChemistryFileName = Value
+ elif Option == "-Z":
+ self.SpawnFlag = 1
+ elif Option == "-n":
+ self.ParamsFile = Value
+ else:
+ print "** Unknown option:", Option, Value
+ if not OptionsSeen.has_key("-r") or not OptionsSeen.has_key("-d") or not OptionsSeen.has_key("-w") or not OptionsSeen.has_key("-s") or not OptionsSeen.has_key("-S"):
+ print "Missing required options (r, d, w, s, S)"
+ print UsageInfo
+ sys.exit(-1)
+ if not self.BigDB:
+ print "Missing large DB (-B)"
+ print UsageInfo
+ sys.exit(-1)
+
+def Main():
+ "Main control box for the script"
+ Wrap = WrapperClass()
+ Wrap.ParseCommandLine(sys.argv[1:])
+ Wrap.SetupDirectories()
+ #now we run the parts of the scripts one after another
+ print "\n*** Starting to run components ***"
+ Wrap.RunPValue()
+ Wrap.RunComputePTMFeatures()
+ Wrap.RunBuildMGF()
+ Wrap.RunInspect()
+ Wrap.RunPTMSearchBigDB()
+ Wrap.RunTrainPTMFeatures()
+ Wrap.AdjustPeptides()
+ Wrap.AdjustToKnownChemistry()
+
+
+if __name__ == "__main__":
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "psyco not found - running without optimization"
+ Main()
+
diff --git a/PTMChooserLM.py b/PTMChooserLM.py
new file mode 100644
index 0000000..c030b5e
--- /dev/null
+++ b/PTMChooserLM.py
@@ -0,0 +1,1294 @@
+#Title: PTMChooserLM.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+Low-memory-usage version of PTMChooser.
+"""
+import sys
+import getopt
+import struct
+import types
+import traceback
+import os
+import time
+import Label
+import MakeImage
+import ExplainPTMs
+from Utils import *
+Initialize()
+
+UsageInfo = """PTMChooser - Parse database search output, and select a parsimonious
+ set of PTMs to explain the results.
+
+Required parameters:
+ -r [FileName]: Results file to parse. If a directory is specified,
+    then ALL files in the directory will be parsed.
+ -d [FileName]: Database .trie file searched
+
+Optional parameters:
+ -s [Dir]: Summary directory to write findings to (default: PTMSummary)
+  -v [value]: p-value cutoff. Annotations with p-values above the cutoff
+     are discarded
+ -l [count]: Maximum number of lines to read from the results file(s).
+ Use this option to preview full results quickly.
+ -p Generate PTM frequency matrix. This option detects common,
+ non-site-specific modifications such as oxidized methionine. It is
+ not well-suited to finding point mutations.
+
+Options for PTM site mode:
+ -i Generate spectrum images for the representative spectra for
+ each site
+ -w [value]: p-value cutoff for selecting a site-specific PTM.
+ Defaults to the value of -v; cannot be larger.
+ -c Size of the protecting group on cysteine residues (defaults to 57).
+ -t Maximum sites to report (defaults to 1000)
+ -m Minimum size of mass delta (defaults to 3). Mass differences of
+ less than three daltons on ion trap spectra are most likely due
+ to incorrect parent mass reporting, and so are filtered.
+ -k [file]: File enumerating known PTMs, such as M+16. Used to override
+ name reporting.
+"""
+
+MaxLineCount = None # by default, read the entire results file. Override with -l option.
+
+class SiteClass:
+ "For a putative PTM."
+ def __init__(self):
+ self.Residue = "" # M1, Q155, that sort of thing
+ self.DBPos = None
+ self.Mass = None
+ self.BestPeptides = [] # sorted list (pvalue, -score, peptide) for the best 10 hits
+ self.BestModlessPeptides = [] # sorted list (pvalue, -score, peptide) for the best 10 hits
+ self.BestOtherModPeptides = [] # sorted list (pvalue, -score, peptide) for the best 10 hits
+ self.ModdedSpecies = {}
+ self.AnnotationCount = 0
+ self.ModlessAnnotationCount = 0
+ self.OtherModAnnotationCount = 0
+ self.AA = "X"
+ # Count how many annotations use N-terminus, middle, C-terminus.
+ self.TerminalCount = [0, 0, 0]
+
+class PTMClass:
+ "For known PTMs"
+ def __init__(self, Mass):
+ self.Mass = Mass
+ self.AA = {}
+ self.Terminus = ""
+ self.Name = str(Mass)
+ def GetNameWithLink(self):
+ return self.Name
+ def BuildPTModClass(self):
+ self.PTMod = PTModClass("%+d"%self.Mass)
+ self.PTMod.Mass = self.Mass
+ self.PTMod.Bases = self.AA
+
+class Processor:
+ def __init__(self):
+ # Cutoff for using spectra to propose a new modification:
+ self.PValueCutoff = 0.05
+ # Cutoff for reporting spectra:
+ self.PValueReportCutoff = 0.08
+ self.SiteList = []
+ self.PTMFrequencyMatrix = {} # (DBPos, Mass)->Count
+ self.PTMBestScoreMatrix = {} # (DBPos, Mass)->BestScore
+ self.NTerminalFlag = 0
+ self.PTMSummaryDir = "PTMSummary"
+ self.DB = ""
+ self.MinSpectraForSite = 1 # can override
+ self.MaxSiteCount = 1000 # can override
+ self.GenerateSpectrumImagesFlag = 0 # disabled by default
+ self.CysteineProtection = 57
+ self.MinimumPTMSize = 3
+ self.DeltaScoreCutoff = -2
+ self.MQScoreCutoff = -3
+ # how many peptides to report for a PTM:
+ self.ReportPeptideCount = 10
+ self.KnownPTMFileName = None
+ self.KnownPTMs = []
+ self.KnownPTMDict = {} # (AA, Mass, Terminus) -> PTMClass instance
+ self.BuildPTMFrequencyMatrix = 0
+ self.MinimumPeptideLength = 7
+ def ReadKnownPTMs(self):
+ if not self.KnownPTMFileName:
+ return
+ if not os.path.exists(self.KnownPTMFileName):
+ print "** Known PTM file '%s' not found, ignoring!"%self.KnownPTMFileName
+ return
+ File = open(self.KnownPTMFileName, "rb")
+ # Parse a line of the form: mod,+88,*,nterminal
+ for FileLine in File.xreadlines():
+ FileLine = FileLine.strip()
+ if not FileLine:
+ continue
+ Bits = FileLine.split(",")
+ if len(Bits)<3:
+ continue
+ if Bits[0].lower() != "mod":
+ continue
+ if len(Bits) > 3:
+ PTMType = Bits[3]
+ else:
+ PTMType = "opt"
+ if PTMType[:3].lower() == "fix":
+ continue
+ Mass = int(Bits[1])
+ PTM = PTMClass(Mass)
+ Aminos = Bits[2]
+ if Aminos == "*":
+ Aminos = "ACDEFGHIKLMNPQRSTVWY"
+ for Amino in Aminos:
+ PTM.AA[Amino] = 1
+ if PTMType.lower() == "nterminal":
+ PTM.Terminus = "N"
+ if PTMType.lower() == "cterminal":
+ PTM.Terminus = "C"
+ if len(Bits) > 4:
+ PTM.Name = Bits[4]
+ self.KnownPTMs.append(PTM)
+ PTM.BuildPTModClass()
+ for Amino in Aminos:
+ Key = (Amino, int(round(PTM.Mass)), PTM.Terminus)
+ self.KnownPTMDict[Key] = PTM
+ if PTM.Terminus == "":
+ # A non-terminal PTM is still legal at a terminus:
+ Key = (Amino, int(round(PTM.Mass)), "C")
+ self.KnownPTMDict[Key] = PTM
+ Key = (Amino, int(round(PTM.Mass)), "N")
+ self.KnownPTMDict[Key] = PTM
+
+ def ReadSpectrumAnnotations(self, FileName):
+ "Parse annotations from a file (or a directory containing many results files)"
+ self.LinesReadCount = 0
+ self.AnnotationFileName = FileName
+ if not os.path.exists(FileName):
+ print "* Error in PTMChooser: Results file '%s' does not exist!"%FileName
+ return
+ if os.path.isdir(FileName):
+ print "Parsing results files from directory %s..."%FileName
+ SubFileNames = os.listdir(FileName)
+ SubFileNames.sort()
+ for SubFileNameIndex in range(len(SubFileNames)):
+ SubFileName = SubFileNames[SubFileNameIndex]
+ print "File %s/%s: %s"%(SubFileNameIndex, len(SubFileNames), SubFileName)
+ Path = os.path.join(FileName, SubFileName)
+ if os.path.isdir(Path):
+ print "Skip subdirectory %s"%Path
+ else:
+ self.ReadSpectrumAnnotationsFromFile(Path)
+ if MaxLineCount != None and self.LinesReadCount > MaxLineCount:
+ break
+ else:
+ self.ReadSpectrumAnnotationsFromFile(FileName)
+ def ReadPTMWitnesses(self, FileName):
+ self.LinesReadCount = 0
+ Path = os.path.join(self.PTMSummaryDir, "PTMAnnotations.txt")
+ self.OutputAnnotationFile = open(Path, "wb")
+ "Parse annotations from a file (or a directory containing many results files)"
+ self.AnnotationFileName = FileName
+ if not os.path.exists(FileName):
+ print "* Error in PTMChooser: Results file '%s' does not exist!"%FileName
+ return
+ if os.path.isdir(FileName):
+ print "Parsing results files from directory %s..."%FileName
+ SubFileNames = os.listdir(FileName)
+ for SubFileNameIndex in range(len(SubFileNames)):
+ SubFileName = SubFileNames[SubFileNameIndex]
+ print "File %s/%s: %s"%(SubFileNameIndex, len(SubFileNames), SubFileName)
+ Path = os.path.join(FileName, SubFileName)
+ if os.path.isdir(Path):
+ print "Skip subdirectory %s"%Path
+ else:
+ self.ReadPTMWitnessesFromFile(Path)
+ if MaxLineCount != None and self.LinesReadCount > MaxLineCount:
+ break
+ else:
+ self.ReadPTMWitnessesFromFile(FileName)
+ self.OutputAnnotationFile.close()
+ def TweakIncorrectEndpoints(self, Peptide):
+ """
+ Some putative modifications can be explained away by altering the endpoints of
+ a peptide. Examples include "K.AYGSTNPINIVR-71A.T" (right endpoint off by one),
+ and "S.D+87KFSTVEQQASYGVGR.Q" (left endpoint off by one). If we can explain away
+ a modification by shifting the endpoint, then we'll do so, and in doing so get
+ rid of a major source of delta-correct annotations.
+ """
+ if not Peptide.Modifications.keys():
+ return Peptide
+ if self.KnownPTMFileName:
+ # Check whether this peptide's PTMs match known PTMs. If so, tweak them
+ # if necessary, and return the fixed PTMs.
+ UnknownPTMFlag = 0
+ NewModifications = {}
+ for (Index, ModList) in Peptide.Modifications.items():
+ Terminus = ""
+ if Index == 0:
+ Terminus = "N"
+ elif Index == len(Peptide.Aminos) - 1:
+ Terminus = "C"
+ for Mod in ModList:
+ Key = (Peptide.Aminos[Index], Mod.Mass, Terminus)
+ if self.KnownPTMDict.has_key(Key):
+ # Acceptable!
+ if not NewModifications.has_key(Index):
+ NewModifications[Index] = []
+ NewModifications[Index].append(Mod)
+ continue
+ # Look for a nearby PTM:
+ MinIndex = max(0, Index - 3)
+ MaxIndex = min(len(Peptide.Aminos) - 1, Index + 4)
+ FoundFlag = 0
+ for NearIndex in range(MinIndex, MaxIndex):
+ if FoundFlag:
+ break
+ for NearMass in range(Mod.Mass - 1, Mod.Mass + 2):
+ NearTerminus = ""
+ if NearIndex == 0:
+ NearTerminus = "N"
+ elif NearIndex == len(Peptide.Aminos) - 1:
+ NearTerminus = "C"
+ Key = (Peptide.Aminos[NearIndex], NearMass, NearTerminus)
+ PTM = self.KnownPTMDict.get(Key, None)
+ if PTM:
+ # Aha! This appears to be a delta-correct annotation.
+ if not NewModifications.has_key(NearIndex):
+ NewModifications[NearIndex] = []
+ NewModifications[NearIndex].append(PTM.PTMod)
+ FoundFlag = 1
+ break
+ if not FoundFlag:
+ UnknownPTMFlag = 1 # known PTMs don't explain this annotation!
+ # Loop is finished. Did we see any with no explanation?
+ if UnknownPTMFlag == 0:
+ OldName = Peptide.GetFullModdedName()
+ Peptide.Modifications = NewModifications
+ NewName = Peptide.GetFullModdedName()
+ #print "MASSAGED:", OldName, NewName
+ return Peptide
+ Q17Mod = PTModClass("-17")
+ Q17Mod.Mass = -17
+ #print "Tweaking incorrect annotation endpoints."
+ EditThisAnnot = 0
+ Len = len(Peptide.Aminos)
+ if not Peptide.UnexplainedModList or len(Peptide.UnexplainedModList) != 1:
+ return Peptide
+ (AA, Mass, Pos, Terminus) = Peptide.UnexplainedModList[0]
+ Endpoint = Peptide.DBPos + len(Peptide.Aminos)
+ # Try moving endpoint left, to repair things like X.XXXX-57G.X:
+ if Pos >= len(Peptide.Aminos) - 3:
+ Diff = abs(Mass + Global.AminoMass[Peptide.Aminos[-1]])
+ if Diff < 1.1:
+ NewPeptide = GetPeptideFromModdedName(Peptide.Aminos[:-1])
+ NewPeptide.DBPos = Peptide.DBPos
+ NewPeptide.Prefix = Peptide.Prefix
+ NewPeptide.Suffix = self.DB[Peptide.DBPos + len(NewPeptide.Aminos)]
+ NewPeptide.UnexplainedModList = []
+ return NewPeptide
+        # Try moving the start point left by 1-3 residues, to repair things like A.X+71XXX.X
+ if (Pos < 3 and Mass > 0):
+ ExtraMass = 0
+ for AACount in range(1, 4):
+ DBPos = Peptide.DBPos - AACount
+ if (DBPos < 0):
+ break
+ ExtraMass += Global.AminoMass.get(self.DB[DBPos], 9999)
+ Diff = abs(Mass - ExtraMass)
+ if Diff < 1.1:
+ NewPeptide = GetPeptideFromModdedName(self.DB[DBPos:Endpoint])
+ NewPeptide.DBPos = DBPos
+ NewPeptide.Prefix = self.DB[DBPos-1]
+ NewPeptide.Suffix = Peptide.Suffix
+ NewPeptide.UnexplainedModList = []
+ return NewPeptide
+ # Consider using Q-17 here, instead of a spurious +111:
+ if self.DB[DBPos] == "Q":
+ Diff = abs(Mass - (ExtraMass - 17))
+ if Diff < 1.1:
+ NewPeptide = GetPeptideFromModdedName(self.DB[DBPos:Endpoint])
+ NewPeptide.DBPos = DBPos
+ NewPeptide.Prefix = self.DB[DBPos-1]
+ NewPeptide.Suffix = Peptide.Suffix
+ NewPeptide.Modifications[0] = [Q17Mod]
+ NewPeptide.UnexplainedModList = [("Q", -17, 0, "N")]
+ NewPeptide.ComputeMasses()
+ return NewPeptide
+        # Try moving the endpoint right by 1-3 residues, to repair things like X.XXX+128.K
+ if (Pos > len(Peptide.Aminos) - 3 and Mass > 0):
+ ExtraMass = 0
+ for AACount in range(1, 4):
+ DBPos = Endpoint + AACount - 1
+ if (DBPos >= len(self.DB)):
+ break
+ ExtraMass += Global.AminoMass.get(self.DB[DBPos], 9999)
+ Diff = abs(Mass - ExtraMass)
+ if Diff < 1.1:
+ NewPeptide = GetPeptideFromModdedName(self.DB[Peptide.DBPos:DBPos + 1])
+ NewPeptide.DBPos = Peptide.DBPos
+ NewPeptide.Prefix = Peptide.Prefix
+ NewPeptide.Suffix = self.DB[DBPos + 1]
+ NewPeptide.UnexplainedModList = []
+ return NewPeptide
+ return Peptide
+ def ReadPTMWitnessesFromFile(self, FileName):
+ """
+ Read annotations (again!) from the specified file. This time around, we know
+ which PTM sites we accept, and we count how many spectra are present.
+ """
+        try:
+            File = open(FileName, "r")
+        except:
+            print "* Error in PTMChooser: Cannot open results file '%s'!"%FileName
+            return
+        LineNumber = 0
+        AnnotationCount = 0
+        OldSpectrumName = None
+        AnnotatedFlag = 0
+ for FileLine in File.xreadlines():
+ LineNumber += 1
+ self.LinesReadCount += 1
+ if LineNumber%10000 == 0:
+ print "Line#%s %s annotations accepted"%(LineNumber, AnnotationCount)
+ sys.stdout.flush()
+ if MaxLineCount != None and self.LinesReadCount > MaxLineCount:
+ break
+ if FileLine[0]=="#":
+ continue # skip comment-line
+ Bits = FileLine.strip().split("\t")
+ ##################################################################
+ # Skip invalid lines, or poor annotations:
+ if len(Bits)<16:
+ # This isn't a valid annotation line!
+ continue
+ try:
+ MQScore = float(Bits[5])
+ PValue = float(Bits[10])
+ DeltaScore = float(Bits[11])
+ except:
+ print "** Warning: Invalid annotation line in %s line %s"%(FileName, LineNumber)
+ continue
+ if PValue > self.PValueCutoff or DeltaScore < self.DeltaScoreCutoff or MQScore < self.MQScoreCutoff:
+ continue
+ SpectrumName = (Bits[0], Bits[1])
+ if SpectrumName != OldSpectrumName:
+ # It's a new spectrum; reset the AnnotatedFlag
+ AnnotatedFlag = 0
+ OldSpectrumName = SpectrumName
+ if AnnotatedFlag:
+ continue
+ Peptide = GetPeptideFromModdedName(Bits[2][2:-2])
+ Peptide.UnexplainedModList = []
+ if len(Peptide.Aminos) < self.MinimumPeptideLength:
+ continue # Short peptides are too likely to be spurious! (Length 1-4 is rubbish, 5 is marginal)
+ ##################################################################
+ Peptide.MQScore = MQScore
+ Peptide.DeltaScore = float(Bits[12])
+ Peptide.PValue = PValue
+ Peptide.SpectrumName = Bits[0].replace("/","\\").split("\\")[-1]
+ Peptide.SpectrumPath = Bits[0]
+ Peptide.ScanNumber = Bits[1]
+ Peptide.ProteinName = Bits[3]
+ Peptide.ScanByteOffset = Bits[15]
+ Peptide.DBPos = self.DB.find(Peptide.Aminos)
+ Peptide.Prefix = Bits[2][0]
+ Peptide.Suffix = Bits[2][-1]
+ if Peptide.DBPos == -1:
+ print "** Warning: Annotation '%s' for spectrum '%s' not found in database!"%(Peptide.Aminos, SpectrumName)
+ continue
+ # Accept modless peptides immediately:
+ if len(Peptide.Modifications.keys()) == 0:
+ self.AcceptPTMWitness(Peptide, Bits)
+ AnnotatedFlag = 1
+ AnnotationCount += 1
+ continue
+ # Fixup endpoints, and if we removed all mods, accept:
+ Peptide = self.TweakIncorrectEndpoints(Peptide)
+ if len(Peptide.Modifications.keys()) == 0:
+ self.AcceptPTMWitness(Peptide, Bits)
+ AnnotatedFlag = 1
+ AnnotationCount += 1
+ continue
+ # Check to see whether all the PTMs in the peptide are correct,
+ # or at least delta-correct. Note that SiteDict contains
+ # "shadow" entries already!
+ OKSiteList = []
+ InvalidPTM = 0
+ for (Index, ModList) in Peptide.Modifications.items():
+ DBPos = Peptide.DBPos + Index
+ for Mod in ModList:
+ Site = self.SiteDict.get((DBPos, Mod.Mass))
+                    # Check that Site.DBPos is in the range [Peptide.DBPos, Peptide.DBPos + len(Peptide.Aminos)),
+ # because Site.DBPos must actually fall within the peptide!
+ if Site and Site.DBPos >= Peptide.DBPos and Site.DBPos <= (Peptide.DBPos + len(Peptide.Aminos) - 1):
+ OKSiteList.append((Site, Index))
+ continue
+## # We didn't find anything at (DBPos, Mod.Mass), so consider shadows:
+## MinDBPos = DBPos - min(Index, 3)
+## MaxDBPos = DBPos + min(3, len(Peptide.Aminos) - Index - 1)
+## FoundFlag = 0
+## for NearMass in (Mod.Mass - 1, Mod.Mass, Mod.Mass + 2):
+## if FoundFlag:
+## break
+## for NearDBPos in range(MinDBPos, MaxDBPos):
+## Site = self.SiteDict.get((NearDBPos, NearMass))
+## if Site:
+## OKSiteList.append((Site, Index))
+## FoundFlag = 1
+## print "Accept close:", DBPos, Mod.Mass, Site.DBPos, Site.Mass, Peptide.GetFullModdedName()
+## break
+## if not FoundFlag:
+ InvalidPTM = 1
+ break
+ if not InvalidPTM:
+ for (Site, Index) in OKSiteList:
+ self.AcceptPTMWitness(Peptide, Bits, Site)
+ AnnotationCount += 1
+ AnnotatedFlag = 1
+ def AcceptPTMWitness(self, Peptide, Bits, AnnotatedSite = None):
+ """
+        Second pass: We found a legal peptide annotation. If AnnotatedSite is None,
+        the peptide is unmodified. Otherwise AnnotatedSite is a legal PTM site which this
+        peptide uses (although the peptide annotation may need tweaking!)
+ """
+ Bits = list(Bits)
+ # Add extra bits for the modification site:
+ if AnnotatedSite:
+ Bits.append(str(AnnotatedSite.Residue))
+ Bits.append(str(AnnotatedSite.Mass))
+ else:
+ Bits.append("")
+ Bits.append("")
+ # Add extra bit for the ORIGINAL annotation:
+ Bits.append(Bits[2])
+ # Tweak the peptide annotation, if necessary:
+ DBStart = Peptide.DBPos
+ DBEnd = Peptide.DBPos + len(Peptide.Aminos)
+ ScoredTuple = (Peptide.PValue, -Peptide.MQScore, Peptide)
+ if AnnotatedSite == None:
+ # Note this unmodified spectrum overlapping modified sites:
+ for Site in self.SiteList:
+ if Site.DBPos >= DBStart and Site.DBPos < DBEnd:
+ Site.BestModlessPeptides.append(ScoredTuple)
+ Site.BestModlessPeptides.sort()
+ Site.BestModlessPeptides = Site.BestModlessPeptides[:self.ReportPeptideCount]
+ Site.ModlessAnnotationCount += 1
+ else:
+ # Tweak the protein if necessary:
+ TweakFlag = 0
+ for (Index, ModList) in Peptide.Modifications.items():
+ if TweakFlag:
+ break
+ for Mod in ModList:
+ ModDBPos = Peptide.DBPos + Index
+ if (ModDBPos != AnnotatedSite.DBPos or Mod.Mass != AnnotatedSite.Mass):
+ # This isn't the same as the modification. If it's CLOSE, then
+ # tweak it:
+ if abs(AnnotatedSite.DBPos - ModDBPos) <= 3 and abs(AnnotatedSite.Mass - Mod.Mass) < 1.2:
+ ModList.remove(Mod)
+ if not ModList:
+ del Peptide.Modifications[Index]
+ NewIndex = AnnotatedSite.DBPos - Peptide.DBPos
+ if not Peptide.Modifications.has_key(NewIndex):
+ Peptide.Modifications[NewIndex] = []
+ NewMod = PTModClass("%+d"%AnnotatedSite.Mass)
+ NewMod.Mass = AnnotatedSite.Mass
+ Peptide.Modifications[NewIndex].append(NewMod)
+ break
+ # Note this spectrum:
+ AnnotatedSite.BestPeptides.append(ScoredTuple)
+ AnnotatedSite.BestPeptides.sort()
+ AnnotatedSite.BestPeptides = AnnotatedSite.BestPeptides[:self.ReportPeptideCount]
+ AnnotatedSite.AnnotationCount += 1
+ AnnotatedSite.ModdedSpecies[Peptide.Aminos] = AnnotatedSite.ModdedSpecies.get(Peptide.Aminos, 0) + 1
+ # Note the terminus:
+ PeptidePos = Peptide.DBPos - AnnotatedSite.DBPos
+ if PeptidePos == 0:
+ AnnotatedSite.TerminalCount[0] += 1
+ elif PeptidePos == len(Peptide.Aminos) - 1:
+ AnnotatedSite.TerminalCount[2] += 1
+ else:
+ AnnotatedSite.TerminalCount[1] += 1
+ # And note this alternative modification for other sites:
+ for Site in self.SiteList:
+ if Site != AnnotatedSite and Site.DBPos >= DBStart and Site.DBPos < DBEnd:
+ Site.BestOtherModPeptides.append(ScoredTuple)
+ Site.BestOtherModPeptides.sort()
+ Site.BestOtherModPeptides = Site.BestOtherModPeptides[:self.ReportPeptideCount]
+ Site.OtherModAnnotationCount += 1
+ Bits[2] = Peptide.GetFullModdedName()
+ Str = string.join(Bits, "\t")
+ self.OutputAnnotationFile.write(Str + "\n")
+ def ReadSpectrumAnnotationsFromFile(self, FileName):
+ """
+ Parse annotations. We've already verified that it's a file (not a directory) and it exists.
+ ASSUMPTION: All annotations for the same spectrum appear consecutively.
+ """
+        try:
+            File = open(FileName, "r")
+        except:
+            print "* Error in PTMChooser: Cannot open results file '%s'!"%FileName
+            return
+        LineNumber = 0
+        AnnotationCount = 0
+        OldSpectrumName = None
+        AnnotatedFlag = 0
+        MatrixEntryCount = 0
+ for FileLine in File.xreadlines():
+ LineNumber += 1
+ self.LinesReadCount += 1
+ if LineNumber%10000 == 0:
+ print "Line#%s %s modless %s matrix entries"%(LineNumber, AnnotationCount, MatrixEntryCount)
+ sys.stdout.flush()
+ if MaxLineCount != None and self.LinesReadCount > MaxLineCount:
+ break
+ if FileLine[0]=="#":
+ continue # skip comment-line
+ Bits = FileLine.strip().split("\t")
+ ##################################################################
+ # Skip invalid lines, or poor annotations:
+ if len(Bits)<16:
+ # This isn't a valid annotation line!
+ continue
+ try:
+ MQScore = float(Bits[5])
+ PValue = float(Bits[10])
+ DeltaScore = float(Bits[11])
+ except:
+ print "** Warning: Invalid annotation line in %s line %s"%(FileName, LineNumber)
+ continue
+ if PValue > self.PValueCutoff or DeltaScore < self.DeltaScoreCutoff:
+ ##print "%s Ignore match %s %s"%(LineNumber, PValue, DeltaScore) #%%%
+ continue
+ SpectrumName = (Bits[0], Bits[1])
+ if SpectrumName != OldSpectrumName:
+ # It's a new spectrum; reset the AnnotatedFlag
+ AnnotatedFlag = 0
+ OldSpectrumName = SpectrumName
+ Peptide = GetPeptideFromModdedName(Bits[2][2:-2])
+ Peptide.UnexplainedModList = None
+ if len(Peptide.Aminos) < self.MinimumPeptideLength:
+ ##print "%s Ignore short peptide %s"%(LineNumber, Bits[2]) #%%%
+ continue # Short peptides are too likely to be spurious! (Length 1-4 is rubbish, 5 is marginal)
+ Peptide.Prefix = Bits[2][0]
+ Peptide.Suffix = Bits[2][-1]
+ ##################################################################
+ # If this peptide is unmodified, then ignore any further (lower-scoring) peptides for
+ # the same spectrum:
+ Keys = Peptide.Modifications.keys()
+ if len(Keys) == 0:
+ AnnotatedFlag = 1
+ AnnotationCount += 1
+ ##print "%s Accept modless %s"%(LineNumber, Bits[2]) #%%%
+ continue
+ if AnnotatedFlag:
+ continue
+ Peptide.DBPos = self.DB.find(Peptide.Aminos)
+ if Peptide.DBPos == -1:
+ print "** Warning: Annotation '%s' for spectrum '%s' not found in database!"%(Peptide.Aminos, SpectrumName)
+ continue
+ UnknownPTMSeen = 0
+ for (Index, ModList) in Peptide.Modifications.items():
+ for Mod in ModList:
+ Terminus = None
+ if Index == 0:
+ Terminus = "N"
+ if Index == len(Peptide.Aminos)-1:
+ Terminus = "C"
+ Key = (Peptide.Aminos[Index], Mod.Mass, Index, Terminus)
+ if Peptide.UnexplainedModList == None:
+ Peptide.UnexplainedModList = []
+ Peptide.UnexplainedModList.append(Key)
+ Key = (Peptide.Aminos[Index], Mod.Mass, Terminus)
+ if not self.KnownPTMDict.has_key(Key):
+ UnknownPTMSeen = 1
+ # Tweak any known mistakes in peptide annotation:
+ Peptide = self.TweakIncorrectEndpoints(Peptide)
+ # If it's modless now, note that and continue:
+ if len(Peptide.Modifications.keys()) == 0:
+ AnnotatedFlag = 1
+ AnnotationCount += 1
+ #print "%s Tweaked %s to %s"%(LineNumber, Bits[2], Peptide.GetFullModdedName()) #%%%
+ continue
+ # Accumulate entries in PTMFrequencyMatrix:
+ for (Index, ModList) in Peptide.Modifications.items():
+ for Mod in ModList:
+ if self.BuildPTMFrequencyMatrix:
+ Key = (Peptide.Aminos[Index], Mod.Mass)
+ self.PTMFrequencyMatrix[Key] = self.PTMFrequencyMatrix.get(Key, 0) + 1
+ self.PTMBestScoreMatrix[Key] = max(self.PTMBestScoreMatrix.get(Key, -999), MQScore)
+ if Index == 0:
+ Key = ("^", Mod.Mass)
+ self.PTMFrequencyMatrix[Key] = self.PTMFrequencyMatrix.get(Key, 0) + 1
+ self.PTMBestScoreMatrix[Key] = max(self.PTMBestScoreMatrix.get(Key, -999), MQScore)
+ if Index == len(Peptide.Aminos) - 1:
+ Key = ("$", Mod.Mass)
+ self.PTMFrequencyMatrix[Key] = self.PTMFrequencyMatrix.get(Key, 0) + 1
+ self.PTMBestScoreMatrix[Key] = max(self.PTMBestScoreMatrix.get(Key, -999), MQScore)
+ else:
+ Key = (Peptide.DBPos + Index, Mod.Mass)
+ self.PTMFrequencyMatrix[Key] = self.PTMFrequencyMatrix.get(Key, 0) + 1
+ self.PTMBestScoreMatrix[Key] = max(self.PTMBestScoreMatrix.get(Key, -999), MQScore)
+ #print "%s Peptide %s %s key %s"%(LineNumber, Bits[2], Peptide.GetFullModdedName(), Key) #%%%
+ MatrixEntryCount += 1
+ if not UnknownPTMSeen:
+ AnnotatedFlag = 1 # ignore all subsequent annotations
+ File.close()
+ return AnnotationCount
+ def SelectSites(self):
+ """
+ Iterate: Find the largest entry in self.PTMFrequencyMatrix. Remove entries
+ from this cell and neighboring cells, and append a new SiteClass instance
+ to self.SiteList. Stop when the next entry is too small, or when we have
+ already generated enough sites.
+ """
+ while (1):
+ BestCount = 0
+ BestMQScore = -999
+ BestKey = None
+ for (Key, Count) in self.PTMFrequencyMatrix.items():
+ (AA, Mass) = Key
+ MQScore = self.PTMBestScoreMatrix.get(Key, -999)
+ # Filter out +1, -1 here:
+ if abs(Mass) >= self.MinimumPTMSize:
+ if (Count > BestCount) or (Count == BestCount and MQScore > BestMQScore):
+ BestCount = Count
+ BestMQScore = MQScore
+ BestKey = Key
+ #print BestCount, BestMQScore, Key
+ if not BestKey:
+ break
+ if BestCount < self.MinSpectraForSite:
+ print "Next PTM explains %s<%s spectra, stop now"%(BestCount, self.MinSpectraForSite)
+ break
+ (DBPos, Mass) = BestKey
+ Site = SiteClass()
+ Site.DBPos = DBPos
+ Site.Mass = Mass
+ (ProteinName, ProteinNumber, ResidueNumber) = self.GetProteinInfo(DBPos)
+ Site.ProteinName = ProteinName
+ Site.Residue = "%s%s"%(self.DB[DBPos], ResidueNumber)
+ Site.AA = self.DB[DBPos]
+ print "%s Accept PTM: %s on %s from %s"%(BestCount, Mass, Site.Residue, ProteinName[:40])
+ self.SiteList.append(Site)
+ if len(self.SiteList) >= self.MaxSiteCount:
+ print "Acquired %s sites - stop now"%self.MaxSiteCount
+ break
+ # Remove matrix entries:
+ for NearPos in range(DBPos - 3, DBPos + 4):
+ if NearPos in (DBPos-1, DBPos, DBPos+1):
+ Masses = (Mass-1, Mass, Mass+1)
+ else:
+ Masses = (Mass,)
+ for NearMass in Masses:
+ Key = (NearPos, NearMass)
+ if self.PTMFrequencyMatrix.has_key(Key):
+ print "Absorb adjacent entry:", Key
+ del self.PTMFrequencyMatrix[Key]
+ # Keep a dictionary of the accepted sites, for easy lookup:
+ self.SiteDict = {}
+        for Site in self.SiteList:
+            for NearPos in range(Site.DBPos - 3, Site.DBPos + 4):
+                # Mirror the "absorb adjacent entry" logic above: close positions also
+                # shadow masses +-1, more distant positions only the exact mass.
+                if NearPos in (Site.DBPos - 1, Site.DBPos, Site.DBPos + 1):
+                    Masses = (Site.Mass - 1, Site.Mass, Site.Mass + 1)
+                else:
+                    Masses = (Site.Mass,)
+                for NearMass in Masses:
+                    Key = (NearPos, NearMass)
+                    if not self.SiteDict.has_key(Key):
+                        self.SiteDict[Key] = Site
+ def GetProteinInfo(self, DBPos):
+ "Return the protein# and the residue# for this file position."
+ for Index in range(1, len(self.ProteinNames)):
+ if self.ProteinStartPositions[Index] > DBPos:
+ ResidueNumber = DBPos - self.ProteinStartPositions[Index - 1] + 1
+ #return (self.ProteinNames[Index - 1], ResidueNumber)
+ return (self.ProteinNames[Index - 1], Index - 1, ResidueNumber)
+ # The match must come from the last protein:
+ ResidueNumber = DBPos - self.ProteinStartPositions[-1] + 1
+ return (self.ProteinNames[-1], len(self.ProteinNames) - 1, ResidueNumber)
+ def OutputResults(self):
+ # Remove existing files AllSiteSummary and AllSiteDetails, so we start them fresh:
+ Path = os.path.join(self.PTMSummaryDir, "AllSiteSummary.html")
+ print Path
+ if os.path.exists(Path):
+ os.remove(Path)
+ Path = os.path.join(self.PTMSummaryDir, "AllSiteDetails.html")
+ print Path
+ if os.path.exists(Path):
+ os.remove(Path)
+ # Sort the sites by annotation-count:
+ SortedSites = []
+ for Site in self.SiteList:
+ if Site.AnnotationCount:
+ SortedSites.append((Site.AnnotationCount, Site))
+ SortedSites.sort()
+ SortedSites.reverse()
+ self.TotalSpectraForPTM = {} # (AA, Mass) -> Count
+ self.SitesForPTM = {} # (AA, Mass) -> list of Site instances
+ self.TerminusForPTM = {} # (AA, Mass) -> terminus-tuple
+ for (Count, Site) in SortedSites:
+ # Note this site in the PTM lists:
+ Key = (Site.AA, Site.Mass)
+ if not self.SitesForPTM.has_key(Key):
+ self.SitesForPTM[Key] = []
+ self.SitesForPTM[Key].append(Site)
+ self.TotalSpectraForPTM[Key] = self.TotalSpectraForPTM.get(Key, 0) + Site.AnnotationCount
+ if not self.TerminusForPTM.has_key(Key):
+ self.TerminusForPTM[Key] = [0, 0, 0]
+ for X in range(3):
+ self.TerminusForPTM[Key][X] += Site.TerminalCount[X]
+            # Write a table summarizing this site to the PTM page, and to the
+ # overall details page:
+ HTML = self.WriteSiteSummary(Site, 1)
+ DetailsFilePath = os.path.join(self.PTMSummaryDir, "%s%sDetails.html"%(Site.AA, Site.Mass))
+ if len(self.SitesForPTM[Key]) == 1:
+ File = open(DetailsFilePath, "w")
+ else:
+ File = open(DetailsFilePath, "a")
+ File.write(HTML)
+ File.close()
+ File = open(os.path.join(self.PTMSummaryDir, "AllSiteDetails.html"), "a")
+ File.write(HTML)
+ File.close()
+ #
+ #######################################
+ # Write the index file, which summarizes things by PTM (possibly several
+ # different sites correspond to each row)
+ IndexFilePath = os.path.join(self.PTMSummaryDir, "index.html")
+ IndexFile = open(IndexFilePath, "w")
+ IndexFile.write("<h3>PTM Summary Report</h3>\n")
+ if not SortedSites:
+ IndexFile.write("<b> * * * No PTMs found * * *</b>\n")
+ return
+ IndexFile.write("<a href=\"AllSiteSummary.html\">Summary table for all sites</a>")
+ IndexFile.write(" <a href=\"AllSiteDetails.html\">Details for all sites</a><br><br>\n")
+ IndexFile.write("<table><tr><td><b>Terminus</b></td><td><b>AA</b></td><td><b>Mass<br>delta</b></td>")
+ IndexFile.write("<td><b>Spectra</b></td><td><b>Sites</b></td><td><b>Top-site<br>spectra</b></td>")
+ IndexFile.write("<td><b>Results</b></td><td><b>Possible explanations</b></td></tr>\n")
+ ############
+ SortedPTMs = []
+ for (Key, Count) in self.TotalSpectraForPTM.items():
+ SortedPTMs.append((Count, Key))
+ SortedPTMs.sort()
+ SortedPTMs.reverse()
+ for (Count, Key) in SortedPTMs:
+ (AA, Mass) = Key
+ if self.TotalSpectraForPTM.get(Key, 0) < 1:
+                continue # Skip this, we don't have a single spectrum for it!
+ print "Write summary for %s %s"%(AA, Mass)
+ # Decide whether we think it's terminal:
+ N = self.TerminusForPTM[Key][0]
+ Body = self.TerminusForPTM[Key][1]
+ C = self.TerminusForPTM[Key][2]
+ if N > Body:
+ Terminus = "N"
+ elif C > Body:
+ Terminus = "C"
+ else:
+ Terminus = ""
+ ######################################################################
+ # Write terse records for each site for this PTM:
+ HTML = self.WriteTerseSummary(self.SitesForPTM[Key])
+ File = open(os.path.join(self.PTMSummaryDir, "%s%sSummary.html"%(AA, Mass)), "w")
+ File.write(HTML)
+ File.close()
+ File = open(os.path.join(self.PTMSummaryDir, "AllSiteSummary.html"), "a")
+ File.write(HTML)
+ File.close()
+ ######################################################################
+ # Add links to the index page:
+ DetailLink = "%s%sDetails.html"%(AA, Mass)
+ SummaryLink = "%s%sSummary.html"%(AA, Mass)
+ ExplanationList = self.GetKnownPTMExplanation(AA, Mass, Terminus)
+ if AA == "C":
+ ExplanationList.extend(ExplainPTMs.GetExplanation(AA, Mass, Terminus, BasePTM = self.CysteineProtection))
+ else:
+ ExplanationList.extend(ExplainPTMs.GetExplanation(AA, Mass, Terminus))
+ if len(ExplanationList) == 0:
+ Explanations = "Unknown"
+ else:
+ Explanations = ""
+ for Entry in ExplanationList[:3]:
+ Explanations += "%s, "%Entry.GetNameWithLink()
+ Explanations = Explanations[:-2] # remove trailing comma+space
+ IndexFile.write("<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>"%(\
+ Terminus, AA, Mass, self.TotalSpectraForPTM[Key], len(self.SitesForPTM[Key]),
+ self.SitesForPTM[Key][0].AnnotationCount))
+ IndexFile.write("<td><a href=\"%s\">Details</a> <a href=\"%s\">Summary</a></td><td>%s</td></tr>\n"%(DetailLink, SummaryLink, Explanations))
+ IndexFile.close()
+ def GetKnownPTMExplanation(self, AA, Mass, Terminus):
+ """
+ Return a list of known PTMs that fit this description. Mostly just so
+ that we can report their correct names.
+ """
+ ExplanationList = []
+ for PTM in self.KnownPTMs:
+ if PTM.Mass != Mass:
+ continue
+ if PTM.Terminus == Terminus and PTM.AA.has_key(AA):
+ ExplanationList.append(PTM)
+ return ExplanationList
+ def WriteTerseSummary(self, SiteList):
+ if not SiteList:
+ return ""
+ HTML = ""
+ AA = self.DB[SiteList[0].DBPos]
+ Mass = SiteList[0].Mass
+ HTML += "<h3>Sites for %+d on %s</h3>"%(Mass, AA)
+ TotalSpectra = 0
+ for Site in SiteList:
+ TotalSpectra += Site.AnnotationCount
+ HTML += "<b>%s spectra in all<br>\n"%TotalSpectra
+ HTML += "<table><tr><td><b>Protein</b></td><td><b>Residue</b></td><td><b>Spectra</b></td><td><b>Species</b></td><td><b>Unmodified</b></td></tr>\n"
+ for Site in SiteList:
+ (ProteinName, ProteinIndex, ResidueNumber) = self.GetProteinInfo(Site.DBPos)
+ Residue = "%s%s"%(AA, ResidueNumber)
+ HTML += "<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n"%(ProteinName, Residue, Site.AnnotationCount, len(Site.ModdedSpecies.keys()), Site.ModlessAnnotationCount)
+ HTML += "</table>"
+ return HTML
+
+ def WriteSiteSummaryLine(self, Notes, Peptide):
+ Annotation = Peptide.GetFullModdedName()
+ #print "WriteSiteSummaryLine: Notes %s Pval %s score %s Pep %s"%(Notes, Peptide.PValue, Peptide.MQScore, Peptide.GetFullModdedName())
+ WroteLine = 0
+ HTML = ""
+ if self.GenerateSpectrumImagesFlag:
+ try:
+ ImageFileName = "%s%s.png"%(Peptide.SpectrumName, Peptide.ScanNumber)
+ ImagePath = os.path.join(self.PTMSummaryDir, "Images", ImageFileName)
+ SpecFilePath = self.GetSpectrumFilePath(Peptide.SpectrumPath)
+ FileName = "%s:%s"%(SpecFilePath, Peptide.ScanByteOffset)
+ LabeledSpectrum = Label.LabelDTAFile(Peptide, FileName, None)
+ Maker = MakeImage.MSImageMaker()
+ Maker.ConvertSpectrumToImage(LabeledSpectrum, ImagePath, Peptide)
+ HTML = "<tr><td>%s</td><td>%s</td><td>%s</td><td><a href=\"Images/%s\">%s</a></td>"%(Notes, Peptide.SpectrumName, Peptide.ScanNumber, ImageFileName, Annotation)
+ HTML += "<td>%s</td><td>%s</td><td>%s</td></tr>\n"%(Peptide.MQScore, Peptide.DeltaScore, Peptide.PValue)
+ WroteLine = 1
+ except:
+ # Error generating image - perhaps the file isn't available on disk?
+ print SpecFilePath, Peptide.ScanByteOffset, Peptide.SpectrumPath, Peptide.ScanNumber
+ traceback.print_exc()
+ #pass
+ if not WroteLine:
+ HTML = "<tr><td>%s</td><td>%s</td><td>%s</td>"%(Notes, Peptide.SpectrumName, Peptide.ScanNumber)
+ HTML += "<td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n"%(Annotation, Peptide.MQScore, Peptide.DeltaScore, Peptide.PValue)
+ return HTML
+ def WriteSiteSummary(self, Site, VerboseFlag):
+ """
+ Build and return a verbose HTML summary of this putative modification site.
+ """
+ ModlessPeptides = []
+ OtherModPeptides = []
+ Site.SpectrumCount = 0
+ # Sort the peptide species:
+ SortedSpecies = []
+ for (Aminos, Count) in Site.ModdedSpecies.items():
+ SortedSpecies.append((Count, Aminos))
+ SortedSpecies.sort()
+ ###############################################################
+ # Report up to 10 annotations per species:
+ HTML = ""
+ (ProteinName, ProteinIndex, ResidueNumber) = self.GetProteinInfo(Site.DBPos)
+ SortedSpecies.sort()
+ HTML += "<h3>%+d on Residue %s of protein %s</h3>\n"%(Site.Mass, Site.Residue, ProteinName)
+ HTML += "<b>%s spectra</b> annotated this residue with %+d <br>\n"%(Site.AnnotationCount, Site.Mass)
+ HTML += "<b>%s spectra</b> cover this site without modification<br>\n"%Site.ModlessAnnotationCount
+ if Site.OtherModAnnotationCount:
+ HTML += "<b>%s spectra</b> containing different modifications cover this site<br>\n"%Site.OtherModAnnotationCount
+ HTML += "Details for the top-scoring spectra follow:<br>\n"
+ HTML += "<table><tr><td><b>Notes</b></td><td><b>Spectrum</b></td>"
+ HTML += "<td><b>Scan</b></td><td><b>Annotation</b></td>"
+ HTML += "<td><b>MQScore</b></td><td><b>Delta-score</b></td><td><b>p-value</b></td></tr>\n"
+ for (Dummy1, Dummy2, Peptide) in Site.BestPeptides:
+ HTML += self.WriteSiteSummaryLine("", Peptide)
+ ######################################################
+ # Without PTM:
+ ModlessPeptides.sort()
+ for (Dummy1, Dummy2, Peptide) in Site.BestModlessPeptides:
+ HTML += self.WriteSiteSummaryLine("No PTM", Peptide)
+ ######################################################
+ # Other PTMs:
+ OtherModPeptides.sort()
+ for (Dummy1, Dummy2, Peptide) in Site.BestOtherModPeptides:
+ HTML += self.WriteSiteSummaryLine("Other PTM", Peptide)
+ HTML += "</table><hr>"
+ return HTML
+ def ReadDatabase(self, DBPath):
+ try:
+ File = open(DBPath, "rb")
+ except:
+ print "** Unable to open database file '%s'!"%DBPath
+ raise
+ self.DB = File.read()
+ File.close()
+ # Read the database index, if found:
+ self.ProteinStartPositions = []
+ self.ProteinNames = []
+ IndexPath = os.path.splitext(DBPath)[0] + ".index"
+ if os.path.exists(IndexPath):
+ File = open(IndexPath, "rb")
+ while (1):
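+ # Each index record is 92 bytes: an 8-byte field (unused here), a 4-byte
+ # protein start position, and an 80-byte null-padded protein name.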
+ Data = File.read(92)
+ if not Data:
+ break
+ Tuple = struct.unpack("<qi80s", Data)
+ self.ProteinStartPositions.append(Tuple[1])
+ Name = Tuple[2]
+ NullPos = Name.find(chr(0))
+ if NullPos != -1:
+ Name = Name[:NullPos]
+ self.ProteinNames.append(Name)
+ File.close()
+ else:
+ print "** Error: Database index file '%s' not found!"%IndexPath
+ def GetSpectrumFilePath(self, FileName):
+ # this can be overridden, if spectra are moved
+ #Bits = FileName.replace("/", "\\").split("\\")
+ #return os.path.join(r"E:\ms\OMICS04", Bits[-2], Bits[-1])
+ return FileName
+ def PerformIterativeNSSSelection(self, MinExplanationCount):
+ self.NSSPTMList = []
+ Matrix = self.PTMFrequencyMatrix.copy()
+ while (1):
+ SortedList = []
+ for (Key, Count) in Matrix.items():
+ if Key[0] in ("$","^"):
+ continue
+ SortedList.append((Count, Key))
+ SortedList.sort()
+ SortedList.reverse()
+ if not SortedList:
+ return
+ (Count, Key) = SortedList[0]
+ if Count < MinExplanationCount:
+ return
+ # Grab nearby 'shadow' entries:
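+ # (entries at +/- 1 Da are most likely rounding or isotope shadows of the same PTM,
+ # so fold their counts into the selected entry before zeroing them out)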
+ (AA, Mass) = Key
+ for NearMass in (Mass - 1, Mass + 1):
+ NearKey = (AA, NearMass)
+ Matrix[Key] += Matrix.get(NearKey, 0)
+ Matrix[NearKey] = 0
+ Matrix[Key] = 0 # selected already
+ PTM = PTMClass(Mass)
+ PTM.AA = AA
+ self.NSSPTMList.append(PTM)
+ def PerformNonSiteSpecificPTMSelection(self, ResultsFileName):
+ print "\n\nRead spectrum annotations:"
+ self.ReadSpectrumAnnotations(ResultsFileName)
+ # Output the PTM frequency matrix:
+ FileName = os.path.join(self.PTMSummaryDir, "NonSiteSpecific.html")
+ HTMLFile = open(FileName, "w")
+ self.WriteHTMLMatrix(self.PTMFrequencyMatrix, HTMLFile)
+ #####
+ FileName = os.path.join(self.PTMSummaryDir, "NonSiteSpecific.txt")
+ TextFile = open(FileName, "w")
+ self.WriteTextMatrix(self.PTMFrequencyMatrix, TextFile)
+ TextFile.close()
+ #######################################################
+ # Now, perform iterative PTM selection. This will tidy up the matrix significantly.
+ # We'll stop selecting PTMs when the next one explains fewer entries than the
+ # light-shading cutoff computed when the HTML matrix was written:
+ OrderedEntries = []
+ for (Key, Count) in self.PTMFrequencyMatrix.items():
+ if Key[0] not in "^$":
+ OrderedEntries.append(Count)
+ OrderedEntries.sort()
+ MinExplanationCount = self.LightShadingCutoff #max(OrderedEntries[len(OrderedEntries)/2] * 4, 10)
+ self.PerformIterativeNSSSelection(MinExplanationCount)
+ #######################################################
+ # Write details on each PTM.
+ HTMLFile.write("<hr>")
+ #self.GeneratePTMFrequencyMatrix(1)
+ HTMLFile.write("<h3>Putative modifications</h3>\n")
+ # Get a list of PTMs, sorted by the number of spectra they explain:
+ SortedList = []
+ for NSSPTM in self.NSSPTMList:
+ if NSSPTM.AA:
+ Count = self.PTMFrequencyMatrix.get((NSSPTM.AA, NSSPTM.Mass), 0)
+ else:
+ if NSSPTM.Terminus == "N":
+ Count = self.PTMFrequencyMatrix.get(("^", NSSPTM.Mass), 0)
+ elif NSSPTM.Terminus == "C":
+ Count = self.PTMFrequencyMatrix.get(("$", NSSPTM.Mass), 0)
+ SortedList.append((Count, NSSPTM))
+ SortedList.sort()
+ SortedList.reverse()
+ for (Count, NSSPTM) in SortedList:
+ if Count < self.LightShadingCutoff:
+ continue # garbage PTM
+ ModStr = "%s%+d"%(NSSPTM.AA, NSSPTM.Mass)
+ HTMLFile.write("Modification %s applied to %s spectra<br>\n"%(ModStr, Count))
+ HTMLFile.write(" ")
+ if NSSPTM.AA == "C":
+ ExplanationList = ExplainPTMs.GetExplanation(NSSPTM.AA, NSSPTM.Mass, "", BasePTM = self.CysteineProtection)
+ else:
+ ExplanationList = ExplainPTMs.GetExplanation(NSSPTM.AA, NSSPTM.Mass, "")
+ if not ExplanationList:
+ HTMLFile.write("(unknown mass-delta)<br><br>\n")
+ else:
+ Str = ""
+ for Explanation in ExplanationList:
+ Str += "%s, "%Explanation.GetNameWithLink()
+ Str = Str[:-2] # remove trailing comma and space
+ HTMLFile.write("Possible annotations: %s<br><br>\n"%Str)
+## #######################################################
+## # Write the cleaned-up matrix:
+## #self.GeneratePTMFrequencyMatrix(0)
+## TabbedMatrixFileName = os.path.join(self.PTMSummaryDir, "ProcessedMatrix.txt")
+## TabbedMatrixFile = open(TabbedMatrixFileName, "w")
+## self.WriteTextMatrix(self.PTMFrequencyMatrix, TabbedMatrixFile)
+## TabbedMatrixFile.close()
+## HTMLFile.write("<h3>Resultant PTM frequency matrix</h3>")
+## self.WriteHTMLMatrix(self.Matrix, HTMLFile)
+## # Finish and cleanup:
+## Path = os.path.join(self.PTMSummaryDir, "NonSiteSpecificAnnotations.txt")
+## self.OutputAnnotations(Path)
+## HTMLFile.close()
+
+ def WriteHTMLMatrix(self, Matrix, HTMLFile):
+ """
+ Write PTM frequency matrix to a webpage, with shading on well-filled cells.
+ """
+ AAList = "^ACDEFGHIKLMNPQRSTVWY$"
+ # First, let's decide what the cutoffs are for heavy, medium, and light shading.
+ EntryList = []
+ for (Key, Value) in Matrix.items():
+ if Key[0] not in ("^$"):
+ EntryList.append(Value)
+ EntryList.sort()
+ if not len(EntryList):
+ HTMLFile.write("<b>** Error - no entries, so no PTM matrix written<br>\n")
+ self.LightShadingCutoff = 1
+ return
+ MaximumEntry = EntryList[-1]
+ HeavyShadingCutoff = max(10, MaximumEntry / 2.0)
+ MediumShadingCutoff = max(5, MaximumEntry / 10.0)
+ self.LightShadingCutoff = max(2, MaximumEntry / 100.0)
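+ # Shading tiers, roughly: heavy shading for counts above half the maximum entry,
+ # medium above a tenth, light above a hundredth (with floors of 10, 5 and 2).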
+ MedianEntry = EntryList[len(EntryList) / 2]
+ Str = "Maximum entry %s, median entry %s.<br>\nRows containing an entry of <b>%d</b> or larger are displayed.<br>\n"
+ HTMLFile.write(Str%(MaximumEntry, MedianEntry, int(self.LightShadingCutoff + 1.0)))
+ HTMLFile.write("<table><tr>")
+ ####################
+ # Write the header:
+ HeaderRow = ""
+ HeaderRow += "<td><b>Mass</b></td>"
+ for AA in AAList:
+ if AA == "^":
+ AA = " (N)"
+ elif AA == "$":
+ AA = " (C)"
+ else:
+ AA = " " + AA
+ HeaderRow += "<td><b>%s</b></td>"%AA
+ HeaderRow += "</tr>\n"
+ HTMLFile.write(HeaderRow)
+ # Get mass range:
+ MinimumMass = 999
+ MaximumMass = -999
+ for Key in Matrix.keys():
+ MinimumMass = min(MinimumMass, Key[1])
+ MaximumMass = max(MaximumMass, Key[1])
+ # Write out one row for each feasible mass:
+ RowsPrinted = 0
+ for Mass in range(MinimumMass, MaximumMass + 1):
+ # Get the total number of entries on this row. If it's low, then skip the row!
+ EntriesForThisRow = 0
+ BestEntryThisRow = 0
+ for AA in AAList[1:-1]:
+ EntriesForThisRow += Matrix.get((AA, Mass), 0)
+ BestEntryThisRow = max(BestEntryThisRow, Matrix.get((AA, Mass), 0))
+ # Only display a row if it has an entry equal to at least twice the median cell:
+ if BestEntryThisRow <= self.LightShadingCutoff:
+ continue
+ HTMLFile.write("<tr><td>%s</td>"%Mass)
+ for AA in AAList:
+ Key = (AA, Mass)
+ Count = Matrix.get(Key, 0)
+ if Count < 10:
+ CountStr = " %s"%Count
+ elif Count < 100:
+ CountStr = " %s"%Count
+ elif Count < 1000:
+ CountStr = " %s"%Count
+ else:
+ CountStr = "%s"%Count
+ if Count > HeavyShadingCutoff:
+ HTMLFile.write("<td bgcolor=\"#999999\">%s</td>"%CountStr)
+ elif Count > MediumShadingCutoff:
+ HTMLFile.write("<td bgcolor=\"#bbbbbb\">%s</td>"%CountStr)
+ elif Count > self.LightShadingCutoff:
+ HTMLFile.write("<td bgcolor=\"#dddddd\">%s</td>"%CountStr)
+ else:
+ HTMLFile.write("<td>%s</td>"%CountStr)
+ HTMLFile.write("</tr>\n")
+ RowsPrinted += 1
+ if RowsPrinted%25 == 0:
+ HTMLFile.write(HeaderRow)
+ HTMLFile.write("</table>\n")
+ def WriteTextMatrix(self, Matrix, TabbedMatrixFile):
+ """
+ Write the PTM frequency matrix in tab-delimited format (for easy parsing).
+ WriteHTMLMatrix contains similar code, producing a version for easy reading by eye.
+ """
+ AAList = "^ACDEFGHIKLMNPQRSTVWY$"
+ HeaderLine = "Mass\t"
+ for AA in AAList:
+ if AA == "^":
+ AA = "(N)"
+ if AA == "$":
+ AA = "(C)"
+ HeaderLine += "%s\t"%AA
+ HeaderLine += "\n"
+ TabbedMatrixFile.write(HeaderLine)
+ MinimumMass = 999
+ MaximumMass = -999
+ for Key in Matrix.keys():
+ MinimumMass = min(MinimumMass, Key[1])
+ MaximumMass = max(MaximumMass, Key[1])
+ # Write out one row for each feasible mass:
+ for Mass in range(MinimumMass, MaximumMass + 1):
+ Str = "%s\t"%Mass
+ for AA in AAList:
+ Key = (AA, Mass)
+ Str += "%s\t"%Matrix.get(Key, 0)
+ Str += "\n"
+ TabbedMatrixFile.write(Str)
+
+
+def Main(PTMProcessor):
+ global MaxLineCount
+ if len(sys.argv) < 3:
+ print UsageInfo
+ sys.exit(1)
+ ResultsFileName = None
+ #PTMProcessor = Processor()
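+ # Illustrative invocation (hypothetical file names):
+ #   -r InspectResults.txt -d Database\Human.trie -s PTMSummary -w 0.05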
+ (Options, Args) = getopt.getopt(sys.argv[1:], "r:d:s:c:iv:w:t:l:m:k:p")
+ OptionsSeen = {}
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-r":
+ # -r results file(s)
+ ResultsFileName = Value
+ elif Option == "-c":
+ # -c Mass of cysteine protecting group (57 by default)
+ PTMProcessor.CysteineProtection = int(Value)
+ elif Option == "-k":
+ # -k File specifying known, non-site-specific PTMs
+ PTMProcessor.KnownPTMFileName = Value
+ elif Option == "-t":
+ # -t Max number of sites to report (1000 by default)
+ PTMProcessor.MaxSiteCount = int(Value)
+ elif Option == "-v":
+ # -v p-value cutoff (0.01 by default)
+ PTMProcessor.PValueReportCutoff = float(Value)
+ if PTMProcessor.PValueReportCutoff <= 0 or PTMProcessor.PValueReportCutoff > 1:
+ print "** Error: Invalid p-value cutoff '%s'"%Value
+ print UsageInfo
+ sys.exit(1)
+ elif Option == "-w":
+ # -w p-value cutoff for the spectra used to pick a ptm (same as -v by default)
+ PTMProcessor.PValueCutoff = float(Value)
+ if PTMProcessor.PValueCutoff <= 0 or PTMProcessor.PValueCutoff > 1:
+ print "** Error: Invalid p-value cutoff '%s'"%Value
+ print UsageInfo
+ sys.exit(1)
+ elif Option == "-d":
+ # -d database
+ print "Read database:", Value
+ Path = FixupPath(Value)
+ PTMProcessor.ReadDatabase(Path)
+ elif Option == "-s":
+ # -s SummaryDir
+ PTMProcessor.PTMSummaryDir = Value
+ elif Option == "-i":
+ # -i -> generate spectrum images
+ PTMProcessor.GenerateSpectrumImagesFlag = 1
+ elif Option == "-l":
+ # -l -> Maximum number of lines to read in
+ MaxLineCount = int(Value)
+ elif Option == "-m":
+ # -m -> Minimum PTM size (defaults to 2)
+ PTMProcessor.MinimumPTMSize = int(Value)
+ elif Option == "-p":
+ # -p -> Generate PTM frequency matrix
+ PTMProcessor.BuildPTMFrequencyMatrix = 1
+ else:
+ print "Option not understood: '%s' '%s'"%(Option, Value)
+ if not OptionsSeen.get("-r"):
+ print "** Please specify a search results file (-r)"
+ print UsageInfo
+ sys.exit(1)
+ if not OptionsSeen.get("-d"):
+ print "** Please specify a database file (-d)"
+ print UsageInfo
+ sys.exit(1)
+ if not OptionsSeen.get("-w"):
+ PTMProcessor.PValueCutoff = PTMProcessor.PValueReportCutoff
+ # Make necessary directories:
+ try:
+ os.makedirs(PTMProcessor.PTMSummaryDir)
+ except:
+ pass
+ try:
+ Dir = os.path.join(PTMProcessor.PTMSummaryDir, "Images")
+ os.makedirs(Dir)
+ except:
+ pass
+ if PTMProcessor.BuildPTMFrequencyMatrix:
+ PTMProcessor.PerformNonSiteSpecificPTMSelection(ResultsFileName)
+ return
+ PTMProcessor.ReadKnownPTMs()
+ # Read annotations, generate the PTM frequency matrix:
+ print "\n\nRead spectrum annotations:"
+ sys.stdout.flush()
+ PTMProcessor.ReadSpectrumAnnotations(ResultsFileName)
+ # Select sites by 'peak finding' among large matrix entries:
+ print "\n\nSelect sites:"
+ sys.stdout.flush()
+ PTMProcessor.SelectSites()
+ # Re-read annotations, keeping a few in memory:
+ print "\n\nRead PTM witnesses:"
+ sys.stdout.flush()
+ PTMProcessor.ReadPTMWitnesses(ResultsFileName)
+ # Output our findings:
+ print "\n\nOutput results:"
+ sys.stdout.flush()
+ PTMProcessor.OutputResults()
+
+if __name__ == "__main__":
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "(psyco optimization system not loaded - running normally)"
+ Main(Processor())
diff --git a/PTMDatabase.txt b/PTMDatabase.txt
new file mode 100644
index 0000000..81e3ee3
--- /dev/null
+++ b/PTMDatabase.txt
@@ -0,0 +1,563 @@
+#Database ID Mass Name AA Terminus
+"#UniMOD PTM reference, parsed from http://www.unimod.org on 2/20/6. "
+UniMOD 1 42.0367 Acetylation N-term
+UniMOD 1 42.0367 Acetylation K
+UniMOD 1 42.0367 Acetylation C
+UniMOD 1 42.0367 Acetylation S
+UniMOD 2 -0.9848 Amidation C-term
+UniMOD 3 226.2954 Biotinylation N-term
+UniMOD 3 226.2954 Biotinylation K
+UniMOD 4 57.0513 Iodoacetamide derivative D
+UniMOD 4 57.0513 Iodoacetamide derivative H
+UniMOD 4 57.0513 Iodoacetamide derivative N-term
+UniMOD 4 57.0513 Iodoacetamide derivative K
+UniMOD 4 57.0513 Iodoacetamide derivative C
+UniMOD 4 57.0513 Iodoacetamide derivative E
+UniMOD 5 43.0247 Carbamylation C
+UniMOD 5 43.0247 Carbamylation R
+UniMOD 5 43.0247 Carbamylation N-term
+UniMOD 5 43.0247 Carbamylation K
+UniMOD 6 58.0361 Iodoacetic acid derivative C
+UniMOD 7 0.9848 Deamidation Q
+UniMOD 7 0.9848 Deamidation N
+UniMOD 8 486.6253 Gygi ICAT(TM) d0 C
+UniMOD 9 494.6746 Gygi ICAT(TM) d8 C
+UniMOD 10 -30.0922 Homoserine M
+UniMOD 11 -48.1075 Homoserine lactone M
+UniMOD 12 450.6221 Applied Biosystems original ICAT(TM) d8 C
+UniMOD 13 442.5728 Applied Biosystems original ICAT(TM) d0 C
+UniMOD 14 14.0266 Methyl ester T
+UniMOD 14 14.0266 Methyl ester S
+UniMOD 14 14.0266 Methyl ester E
+UniMOD 14 14.0266 Methyl ester D
+UniMOD 14 14.0266 Methyl ester C-term
+UniMOD 15 42.0367 N-Acetylation N-term
+UniMOD 16 28.0101 N-Formylation N-term
+UniMOD 17 99.1311 N-isopropylcarboxamidomethyl C
+UniMOD 18 1.9998 O18 label C-term
+UniMOD 19 15.9994 Oxidation W
+UniMOD 19 15.9994 Oxidation H
+UniMOD 19 15.9994 Oxidation M
+UniMOD 20 414.5196 "Biotinyl-iodoacetamidyl-3,6-dioxaoctanediamine" C
+UniMOD 21 79.9799 Phosphorylation Y
+UniMOD 21 79.9799 Phosphorylation D
+UniMOD 21 79.9799 Phosphorylation C
+UniMOD 21 79.9799 Phosphorylation H
+UniMOD 21 79.9799 Phosphorylation T
+UniMOD 21 79.9799 Phosphorylation S
+UniMOD 21 79.9799 Phosphorylation R
+UniMOD 22 79.9799 Phosphorylation without neutral loss T
+UniMOD 22 79.9799 Phosphorylation without neutral loss S
+UniMOD 23 -18.0153 Phosphorylation with prompt loss of phosphate S
+UniMOD 23 -18.0153 Phosphorylation with prompt loss of phosphate T
+UniMOD 23 -18.0153 Phosphorylation with prompt loss of phosphate Y
+UniMOD 24 71.0779 Acrylamide adduct C
+UniMOD 25 119.1207 pyridylacetyl N-term
+UniMOD 25 119.1207 pyridylacetyl K
+UniMOD 26 -17.0305 S-carbamoylmethylcysteine cyclization (N-terminus) C
+UniMOD 27 -18.0153 Pyro-glu from E E
+UniMOD 28 -17.0305 Pyro-glu from Q Q
+UniMOD 29 127.1412 N-Succinimidyl-3-morpholine acetate N-term
+UniMOD 29 127.1412 N-Succinimidyl-3-morpholine acetate K
+UniMOD 30 21.9818 Sodium adduct D
+UniMOD 30 21.9818 Sodium adduct C-term
+UniMOD 30 21.9818 Sodium adduct E
+UniMOD 31 105.1372 S-pyridylethylation C
+UniMOD 32 31.9988 Sulphone M
+UniMOD 33 0.9848 Citrullination R
+UniMOD 34 14.0266 Methylation I
+UniMOD 34 14.0266 Methylation Q
+UniMOD 34 14.0266 Methylation R
+UniMOD 34 14.0266 Methylation N-term
+UniMOD 34 14.0266 Methylation N
+UniMOD 34 14.0266 Methylation K
+UniMOD 34 14.0266 Methylation H
+UniMOD 34 14.0266 Methylation C
+UniMOD 34 14.0266 Methylation L
+UniMOD 35 15.9994 Hydroxylation R
+UniMOD 35 15.9994 Hydroxylation Y
+UniMOD 35 15.9994 Hydroxylation F
+UniMOD 35 15.9994 Hydroxylation P
+UniMOD 35 15.9994 Hydroxylation N
+UniMOD 35 15.9994 Hydroxylation K
+UniMOD 35 15.9994 Hydroxylation D
+UniMOD 36 28.0532 di-Methylation N-term
+UniMOD 36 28.0532 di-Methylation R
+UniMOD 36 28.0532 di-Methylation K
+UniMOD 36 28.0532 di-Methylation N
+UniMOD 37 42.0797 tri-Methylation R
+UniMOD 37 42.0797 tri-Methylation K
+UniMOD 38 44.0095 Gamma-carboxylation E
+UniMOD 38 44.0095 Gamma-carboxylation D
+UniMOD 39 46.0916 Beta-methylthiolation D
+UniMOD 39 46.0916 Beta-methylthiolation N
+UniMOD 40 80.0632 O-Sulfonation Y
+UniMOD 40 80.0632 O-Sulfonation T
+UniMOD 40 80.0632 O-Sulfonation S
+UniMOD 41 162.1406 Hexose R
+UniMOD 41 162.1406 Hexose C
+UniMOD 41 162.1406 Hexose T
+UniMOD 41 162.1406 Hexose W
+UniMOD 41 162.1406 Hexose N
+UniMOD 41 162.1406 Hexose N-term
+UniMOD 41 162.1406 Hexose K
+UniMOD 41 162.1406 Hexose Y
+UniMOD 42 188.3103 Lipoyl K
+UniMOD 43 203.1925 N-Acetylhexosamine T
+UniMOD 43 203.1925 N-Acetylhexosamine S
+UniMOD 43 203.1925 N-Acetylhexosamine N
+UniMOD 44 204.3511 Farnesylation C
+UniMOD 45 210.3556 Myristoylation K
+UniMOD 45 210.3556 Myristoylation G
+UniMOD 45 210.3556 Myristoylation C
+UniMOD 46 229.1266 Pyridoxal phosphate K
+UniMOD 47 238.4088 Palmitoylation T
+UniMOD 47 238.4088 Palmitoylation S
+UniMOD 47 238.4088 Palmitoylation K
+UniMOD 47 238.4088 Palmitoylation C
+UniMOD 48 272.4681 Geranyl-geranyl C
+UniMOD 49 340.333 Phosphopantetheine S
+UniMOD 50 783.5339 Flavin adenine dinucleotide Y
+UniMOD 50 783.5339 Flavin adenine dinucleotide H
+UniMOD 50 783.5339 Flavin adenine dinucleotide C
+UniMOD 51 789.3049 N-acyl diglyceride cysteine C
+UniMOD 52 42.04 Guanidination K
+UniMOD 53 156.2221 4-hydroxynonenal (HNE) C
+UniMOD 53 156.2221 4-hydroxynonenal (HNE) H
+UniMOD 53 156.2221 4-hydroxynonenal (HNE) K
+UniMOD 54 176.1241 N-glucuronylation N-term
+UniMOD 55 305.3076 glutathione disulfide C
+UniMOD 56 45.0552 "Acetate labeling reagent (N-term & K) (heavy form, +3amu)" N-term
+UniMOD 56 45.0552 "Acetate labeling reagent (N-term & K) (heavy form, +3amu)" K
+UniMOD 57 42.0367 Acetate labeling reagent light form (N-term & K) N-term
+UniMOD 57 42.0367 Acetate labeling reagent light form (N-term & K) K
+UniMOD 58 56.0633 Propionate labeling reagent light form (N-term & K) N-term
+UniMOD 58 56.0633 Propionate labeling reagent light form (N-term & K) K
+UniMOD 59 59.0412 "Propionate labeling reagent heavy form (+3amu), N-term&K" N-term
+UniMOD 59 59.0412 "Propionate labeling reagent heavy form (+3amu), N-term&K" K
+UniMOD 60 127.1842 Quaternary amine labeling reagent light form (N-term & K) N-term
+UniMOD 60 127.1842 Quaternary amine labeling reagent light form (N-term & K) K
+UniMOD 61 130.2027 "Quaternary amine labeling reagent heavy (+3amu) form, N-term & K" N-term
+UniMOD 61 130.2027 "Quaternary amine labeling reagent heavy (+3amu) form, N-term & K" K
+UniMOD 62 133.2212 "Quaternary amine labeling reagent heavy form (+6amu), N-term & K" N-term
+UniMOD 62 133.2212 "Quaternary amine labeling reagent heavy form (+6amu), N-term & K" K
+UniMOD 63 136.2397 "Quaternary amine labeling reagent heavy form (+9amu), N-term & K" N-term
+UniMOD 63 136.2397 "Quaternary amine labeling reagent heavy form (+9amu), N-term & K" K
+UniMOD 64 100.0728 Succinic anhydride labeling reagent light form (N-term & K) N-term
+UniMOD 64 100.0728 Succinic anhydride labeling reagent light form (N-term & K) K
+UniMOD 65 104.0974 "Succinic anhydride labeling reagent, heavy form (+4amu, 4H2), N-term & K" N-term
+UniMOD 65 104.0974 "Succinic anhydride labeling reagent, heavy form (+4amu, 4H2), N-term & K" K
+UniMOD 66 104.0434 "Succinic anhydride labeling reagent, heavy form (+4amu, 4C13), N-term & K" N-term
+UniMOD 66 104.0434 "Succinic anhydride labeling reagent, heavy form (+4amu, 4C13), N-term & K" K
+UniMOD 89 225.3106 Iminobiotinylation N-term
+UniMOD 89 225.3106 Iminobiotinylation K
+UniMOD 90 338.4682 ESP-Tag light d0 N-term
+UniMOD 90 338.4682 ESP-Tag light d0 K
+UniMOD 91 348.5299 ESP-Tag heavy d10 N-term
+UniMOD 91 348.5299 ESP-Tag heavy d10 K
+UniMOD 92 339.453 NHS-LC-Biotin N-term
+UniMOD 92 339.453 NHS-LC-Biotin K
+UniMOD 93 601.8021 EDT-maleimide-PEO-biotin T
+UniMOD 93 601.8021 EDT-maleimide-PEO-biotin S
+UniMOD 94 68.0773 IMID d0 K
+UniMOD 95 72.1019 IMID d4 K
+UniMOD 97 74.0964 Acrylamide d3 C
+UniMOD 105 227.2603 Applied Biosystems cleavable ICAT(TM) light C
+UniMOD 106 236.1942 Applied Biosystems cleavable ICAT(TM) heavy C
+UniMOD 107 160.2141 Addition of N-formyl met N-term
+UniMOD 108 125.1253 N-ethylmaleimide on cysteines C
+UniMOD 112 354.4676 "Oxidized lysine biotinylated with biotin-LC-hydrazide, reduced" K
+UniMOD 113 352.4518 Oxidized lysine biotinylated with biotin-LC-hydrazide K
+UniMOD 114 371.4982 "Oxidized proline biotinylated with biotin-LC-hydrazide, reduced" P
+UniMOD 115 369.4823 Oxidized Proline biotinylated with biotin-LC-hydrazide P
+UniMOD 116 310.4118 Oxidized arginine biotinylated with biotin-LC-hydrazide R
+UniMOD 117 312.4277 "Oxidized arginine biotinylated with biotin-LC-hydrazide, reduced" R
+UniMOD 118 490.7034 EDT-iodo-PEO-biotin T
+UniMOD 118 490.7034 EDT-iodo-PEO-biotin S
+UniMOD 119 316.3759 Thio Ether Formation - BTP Adduct C
+UniMOD 121 114.1026 ubiquitinylation residue K
+UniMOD 122 28.0101 Formylation N-term
+UniMOD 122 28.0101 Formylation T
+UniMOD 122 28.0101 Formylation S
+UniMOD 122 28.0101 Formylation K
+UniMOD 123 345.7754 "N-iodoacetyl, p-chlorobenzyl-12C6-glucamine" C
+UniMOD 124 351.7313 "N-iodoacetyl, p-chlorobenzyl-13C6-glucamine" C
+UniMOD 125 32.0778 reductive amination-D N-term
+UniMOD 125 32.0778 reductive amination-D K
+UniMOD 126 88.1283 thioacylation of primary amines (N-term and Lys) N-term
+UniMOD 126 88.1283 thioacylation of primary amines (N-term and Lys) K
+UniMOD 127 17.9905 fluorophenylalanine replacement of phenylalanine F
+UniMOD 128 388.3497 "5-Iodoacetamidofluorescein (Molecular Probe, Eugene, OR)" C
+UniMOD 129 125.8965 Iodination H
+UniMOD 129 125.8965 Iodination Y
+UniMOD 130 251.7931 di-Iodination Y
+UniMOD 131 377.6896 tri-Iodination Y
+UniMOD 134 208.3398 (cis-delta 5)-tetradecaenoyl G
+UniMOD 135 206.3239 "(cis,cis-delta 5, delta 8)-tetradecadienoyl" G
+UniMOD 136 104.1061 labeling reagent light form (N-term & K) N-term
+UniMOD 136 104.1061 labeling reagent light form (N-term & K) K
+UniMOD 137 1217.088 N-linked glycan core N
+UniMOD 139 233.2862 5-dimethylaminonaphthalene-1-sulfonyl K
+UniMOD 139 233.2862 5-dimethylaminonaphthalene-1-sulfonyl N-term
+UniMOD 140 -29.018 ISD a-series (C-Term) C-term
+UniMOD 141 41.0519 amidination of lysines or N-terminal amines with methyl acetimidate K
+UniMOD 141 41.0519 amidination of lysines or N-terminal amines with methyl acetimidate N-term
+UniMOD 142 349.3337 HexNAc1dHex1 N
+UniMOD 143 406.385 HexNAc2 N
+UniMOD 144 486.4218 Hex3 N
+UniMOD 145 495.4749 HexNAc1dHex2 N
+UniMOD 146 511.4743 Hex1HexNAc1dHex1 N
+UniMOD 147 552.5262 HexNAc2dHex1 N
+UniMOD 148 568.5256 Hex1HexNAc2 N
+UniMOD 149 656.5877 Hex1HexNAc1NeuAc1 N
+UniMOD 150 698.6674 HexNAc2dHex2 N
+UniMOD 151 700.6403 Hex1HexNAc2Pent1 N
+UniMOD 152 714.6668 Hex1HexNAc2dHex1 N
+UniMOD 153 730.6662 Hex2HexNAc2 N
+UniMOD 154 821.7289 Hex3HexNAc1Pent1 N
+UniMOD 155 846.7815 Hex1HexNAc2dHex1Pent1 N
+UniMOD 156 860.808 Hex1HexNAc2dHex2 N
+UniMOD 157 862.7809 Hex2HexNAc2Pent1 N
+UniMOD 158 876.8074 Hex2HexNAc2dHex1 N
+UniMOD 159 892.8068 Hex3HexNAc2 N
+UniMOD 160 947.8423 Hex1HexNAc1NeuAc2 N
+UniMOD 161 923.7806 Hex3HexNAc2P1 N
+UniMOD 162 46.895 Selenium replaces sulphur in Methionine M
+UniMOD 170 3.0077 glycosylated asparagine 18O labeling N
+UniMOD 171 159.1144 Shimadzu 13CNBS W
+UniMOD 172 153.1585 Shimadzu 12CNBS W
+UniMOD 176 218.3346 Michael addition of BHT quinone methide to Cysteine and Lysine H
+UniMOD 176 218.3346 Michael addition of BHT quinone methide to Cysteine and Lysine C
+UniMOD 176 218.3346 Michael addition of BHT quinone methide to Cysteine and Lysine K
+UniMOD 178 87.1866 phosphorylation to amine thiol T
+UniMOD 178 87.1866 phosphorylation to amine thiol S
+UniMOD 179 -15.9994 Serine to Alanine S
+UniMOD 182 -15.9994 Threonine to a-aminobutyrate T
+UniMOD 184 8.9339 C13 label Y
+UniMOD 185 88.9138 C13 label (Phosphotyrosine) Y
+UniMOD 186 132.1162 Hydroxyphenylglyoxal arginine R
+UniMOD 187 282.2476 2 Hydroxyphenylglyoxal arginine R
+UniMOD 188 5.9559 C13 label F
+UniMOD 188 5.9559 C13 label L
+UniMOD 188 5.9559 C13 label K
+UniMOD 188 5.9559 C13 label R
+UniMOD 193 3.9995 O18 label at both C-terminal oxygens C-term
+UniMOD 194 170.1674 6-aminoquinolyl-N-hydroxysuccinimidyl carbamate N-term
+UniMOD 194 170.1674 6-aminoquinolyl-N-hydroxysuccinimidyl carbamate K
+UniMOD 195 170.252 APTA-d0 C
+UniMOD 196 174.2784 APTA d3 C
+UniMOD 197 184.2786 EAPTA d0 C
+UniMOD 198 189.3094 EAPTA d5 C
+UniMOD 199 32.0778 DiMethyl-CHD2 N-term
+UniMOD 199 32.0778 DiMethyl-CHD2 K
+UniMOD 200 76.1838 EDT T
+UniMOD 200 76.1838 EDT S
+UniMOD 202 170.252 APTA- d0 with no neutral loss C
+UniMOD 203 170.252 APTA-d0 with quaternary amine loss C
+UniMOD 205 94.1112 Acrolein addition +94 K
+UniMOD 206 56.0633 Acrolein addition +56 K
+UniMOD 206 56.0633 Acrolein addition +56 H
+UniMOD 206 56.0633 Acrolein addition +56 C
+UniMOD 207 38.048 Acrolein addition +38 K
+UniMOD 208 76.096 Acrolein addition +76 K
+UniMOD 209 112.1265 Acrolein addition +112 K
+UniMOD 211 85.1045 N-ethyl iodoacetamide-d0 Y
+UniMOD 211 85.1045 N-ethyl iodoacetamide-d0 C
+UniMOD 212 90.1353 N-ethyl iodoacetamide-d5 Y
+UniMOD 212 90.1353 N-ethyl iodoacetamide-d5 C
+UniMOD 213 541.3005 ADP Ribose addition S
+UniMOD 213 541.3005 ADP Ribose addition N
+UniMOD 213 541.3005 ADP Ribose addition C
+UniMOD 213 541.3005 ADP Ribose addition R
+UniMOD 213 541.3005 ADP Ribose addition E
+UniMOD 214 144.1544 Applied Biosystems iTRAQ(TM) multiplexed quantitation chemistry Y
+UniMOD 214 144.1544 Applied Biosystems iTRAQ(TM) multiplexed quantitation chemistry N-term
+UniMOD 214 144.1544 Applied Biosystems iTRAQ(TM) multiplexed quantitation chemistry K
+UniMOD 215 0.9848 deglycosylated asparagine N
+UniMOD 243 297.1478 label Cysteine with IGBP reagent C
+UniMOD 253 70.0898 Crotonaldehyde K
+UniMOD 253 70.0898 Crotonaldehyde H
+UniMOD 253 70.0898 Crotonaldehyde C
+UniMOD 254 26.0373 Acetaldehyde +26 K
+UniMOD 254 26.0373 Acetaldehyde +26 H
+UniMOD 255 28.0532 Acetaldehyde +28 K
+UniMOD 255 28.0532 Acetaldehyde +28 H
+UniMOD 256 40.0639 Propionaldehyde +40 K
+UniMOD 256 40.0639 Propionaldehyde +40 H
+UniMOD 258 1.9998 "O18 Labeling of Serine, Threonine or Tyrosine" Y
+UniMOD 258 1.9998 "O18 Labeling of Serine, Threonine or Tyrosine" T
+UniMOD 258 1.9998 "O18 Labeling of Serine, Threonine or Tyrosine" S
+UniMOD 259 7.9427 C13 and N15 label K
+UniMOD 260 96.0455 Thiophosphorylation Y
+UniMOD 260 96.0455 Thiophosphorylation T
+UniMOD 260 96.0455 Thiophosphorylation S
+UniMOD 261 215.2495 4-sulfophenyl isothiocyanate K
+UniMOD 261 215.2495 4-sulfophenyl isothiocyanate N-term
+UniMOD 262 3.0185 Trideuteration L
+UniMOD 264 121.2028 phosphorylation to pyridyl thiol T
+UniMOD 264 121.2028 phosphorylation to pyridyl thiol S
+UniMOD 267 9.9296 C13 and N15 label R
+UniMOD 268 5.9567 C13 and N15 label V
+UniMOD 269 9.9273 C13 and N15 label F
+UniMOD 270 362.3738 nucleophilic addtion to cytopiloyne Y
+UniMOD 270 362.3738 nucleophilic addtion to cytopiloyne S
+UniMOD 270 362.3738 nucleophilic addtion to cytopiloyne R
+UniMOD 270 362.3738 nucleophilic addtion to cytopiloyne P
+UniMOD 270 362.3738 nucleophilic addtion to cytopiloyne N-term
+UniMOD 270 362.3738 nucleophilic addtion to cytopiloyne K
+UniMOD 270 362.3738 nucleophilic addtion to cytopiloyne C
+UniMOD 271 380.3891 nucleophilic addition to cytopiloyne+H2O Y
+UniMOD 271 380.3891 nucleophilic addition to cytopiloyne+H2O T
+UniMOD 271 380.3891 nucleophilic addition to cytopiloyne+H2O S
+UniMOD 271 380.3891 nucleophilic addition to cytopiloyne+H2O R
+UniMOD 271 380.3891 nucleophilic addition to cytopiloyne+H2O N-term
+UniMOD 271 380.3891 nucleophilic addition to cytopiloyne+H2O K
+UniMOD 271 380.3891 nucleophilic addition to cytopiloyne+H2O C
+UniMOD 272 136.1265 sulfonation of N-terminus N-term
+UniMOD 273 253.2512 covalent modification of lysine by cross-linking reagent K
+UniMOD 275 28.9982 S-nitrosylation C
+UniMOD 276 183.2276 Aminoethylbenzenesulfonylation N-term
+UniMOD 276 183.2276 Aminoethylbenzenesulfonylation K
+UniMOD 276 183.2276 Aminoethylbenzenesulfonylation S
+UniMOD 276 183.2276 Aminoethylbenzenesulfonylation H
+UniMOD 276 183.2276 Aminoethylbenzenesulfonylation Y
+UniMOD 277 46.0916 Methyl methanethiosulfonate C
+UniMOD 278 44.0526 Ethanolation of Cys C
+UniMOD 279 15.9994 Cysteine sulfenic acid C
+UniMOD 280 28.0532 Ethylation N-term
+UniMOD 280 28.0532 Ethylation K
+UniMOD 280 28.0532 Ethylation E
+UniMOD 281 765.5182 Cysteine modified Coenzyme A C
+UniMOD 282 14.0266 N-methylation N-term
+UniMOD 283 28.0532 N-ethylation N-term
+UniMOD 284 16.0389 Deuterium Methylation of Lysine K
+UniMOD 285 155.1744 Light Sulfanilic Acid (SA) C12 E
+UniMOD 285 155.1744 Light Sulfanilic Acid (SA) C12 D
+UniMOD 285 155.1744 Light Sulfanilic Acid (SA) C12 C-term
+UniMOD 286 161.1303 Heavy Sulfanilic Acid (SA) C13 E
+UniMOD 286 161.1303 Heavy Sulfanilic Acid (SA) C13 D
+UniMOD 286 161.1303 Heavy Sulfanilic Acid (SA) C13 C-term
+UniMOD 288 13.9835 Tryptophan oxidation to oxolactone W
+UniMOD 289 356.4835 Biotin polyethyleneoxide amine C-term
+UniMOD 289 356.4835 Biotin polyethyleneoxide amine D
+UniMOD 289 356.4835 Biotin polyethyleneoxide amine E
+UniMOD 290 428.6124 Pierce EZ-Link Biotin-HPDP C
+UniMOD 291 200.59 Mercury Mercaptan C
+UniMOD 292 322.1654 "Cross-link of (Iodo)-uracil MP with W,F,Y" W
+UniMOD 292 322.1654 "Cross-link of (Iodo)-uracil MP with W,F,Y" Y
+UniMOD 292 322.1654 "Cross-link of (Iodo)-uracil MP with W,F,Y" F
+UniMOD 293 145.1796 3-(carbamidomethylthio)propanoyl K
+UniMOD 293 145.1796 3-(carbamidomethylthio)propanoyl N-term
+UniMOD 294 326.4145 biotinoyl-iodoacetyl-ethylenediamine C
+UniMOD 295 146.1412 Fucose T
+UniMOD 295 146.1412 Fucose S
+UniMOD 298 17.0451 deuterated methyl ester E
+UniMOD 298 17.0451 deuterated methyl ester D
+UniMOD 298 17.0451 deuterated methyl ester C-term
+UniMOD 299 44.0095 Carboxylation K
+UniMOD 299 44.0095 Carboxylation W
+UniMOD 299 44.0095 Carboxylation D
+UniMOD 300 58.0361 Hydroxylethanone W
+UniMOD 301 190.1986 Monobromobimane derivative C
+UniMOD 302 170.1641 Menadione derivative K
+UniMOD 302 170.1641 Menadione derivative C
+UniMOD 303 76.1176 Cysteine mercaptoethanol C
+UniMOD 305 1445.3331 Fucosylated biantennary (-2 galactose) N
+UniMOD 306 80.0632 Sulfitolysis C
+UniMOD 307 1607.4737 Fucosylated biantennary (-1 galactose) N
+UniMOD 308 1769.6143 Fucosylated biantennary N
+UniMOD 309 1299.1919 Biantennary (-2 galactose) N
+UniMOD 310 1461.3325 Biantennary (-1 galactose) N
+UniMOD 311 1623.4731 Biantennary N
+UniMOD 312 120.1502 Cysteinylation C
+UniMOD 313 -128.1723 C terminal -K from HC of MAb K
+UniMOD 314 111.0987 N-methylmaleimide C
+UniMOD 316 78.1118 "2,5-dimethypyrrole" K
+UniMOD 317 -18.0153 D-Succinimide D
+UniMOD 318 62.0694 MDA adduct +62 K
+UniMOD 319 54.0474 MDA adduct +54 K
+UniMOD 320 143.1406 N-ethylmaleimide hydrolysis C
+UniMOD 321 -17.0073 N-Succinimide N
+UniMOD 323 713.5626 bis-N-I-sulfonerahodamine C
+UniMOD 324 87.1435 "dimethyl 3,3'-dithiobispropionimidate" R
+UniMOD 324 87.1435 "dimethyl 3,3'-dithiobispropionimidate" Q
+UniMOD 324 87.1435 "dimethyl 3,3'-dithiobispropionimidate" N
+UniMOD 324 87.1435 "dimethyl 3,3'-dithiobispropionimidate" K
+UniMOD 324 87.1435 "dimethyl 3,3'-dithiobispropionimidate" N-term
+UniMOD 325 573.7485 10-ethoxyphosphinyl-N-(biotinamidopentyl)decanamide T
+UniMOD 325 573.7485 10-ethoxyphosphinyl-N-(biotinamidopentyl)decanamide Y
+UniMOD 325 573.7485 10-ethoxyphosphinyl-N-(biotinamidopentyl)decanamide S
+UniMOD 327 44.1188 S-Ethylcystine from Serine S
+UniMOD 329 18.0377 monomethylated arginine R
+UniMOD 330 36.0754 dimethylated arginine R
+UniMOD 332 525.6658 thiophosphate labeled with biotin-HPDP Y
+UniMOD 332 525.6658 thiophosphate labeled with biotin-HPDP T
+UniMOD 332 525.6658 thiophosphate labeled with biotin-HPDP S
+UniMOD 333 448.5371 6-N-biotinylaminohexyl isopropyl phosphate S
+UniMOD 333 448.5371 6-N-biotinylaminohexyl isopropyl phosphate Y
+UniMOD 333 448.5371 6-N-biotinylaminohexyl isopropyl phosphate T
+UniMOD 334 146.1875 CAMthiopropanoyl of Lys K
+UniMOD 335 158.238 reduced 4-Hydroxynonenal K
+UniMOD 335 158.238 reduced 4-Hydroxynonenal H
+UniMOD 335 158.238 reduced 4-Hydroxynonenal C
+UniMOD 337 13.0418 Michael addition with methylamine S
+UniMOD 337 13.0418 Michael addition with methylamine T
+UniMOD 340 78.8961 bromination F
+UniMOD 340 78.8961 bromination H
+UniMOD 340 78.8961 bromination W
+UniMOD 341 -2.0159 threonine oxidation to 2-amino-3-oxo-butanoic acid T
+UniMOD 342 15.0146 Tyrosine oxidation to 2-aminotyrosine Y
+UniMOD 343 199.27 oxidized Arginine biotinylated with biotin hydrazide R
+UniMOD 344 -43.0711 Arginine oxidation to glutamic semialdehyde R
+UniMOD 345 47.9982 cysteine oxidation to cysteic acid C
+UniMOD 346 31.9988 phenylalanine oxidation to dihydroxyphenylalanine F
+UniMOD 347 31.9988 tryptophan oxidation to formylkynurenin W
+UniMOD 348 -23.0366 histidine oxidation to aspargine H
+UniMOD 349 -22.0519 histidine oxidation to aspartic acid H
+UniMOD 350 19.9881 tryptophan oxidation to hydroxykynurenin W
+UniMOD 351 3.9887 tryptophan oxidation to kynurenin W
+UniMOD 352 -1.0311 Lysine oxidation to aminoadipic semialdehyde K
+UniMOD 353 241.31 oxidized Lysine biotinylated with biotin hydrazide K
+UniMOD 354 44.9976 Oxidation to nitro Y
+UniMOD 354 44.9976 Oxidation to nitro W
+UniMOD 357 258.3405 oxidized proline biotinylated with biotin hydrazide P
+UniMOD 358 15.9994 Proline oxidation to glutamic semialdehyde P
+UniMOD 359 13.9835 proline oxidation to pyroglutamic acid P
+UniMOD 360 -30.026 Proline oxidation to pyrrolidinone P
+UniMOD 361 240.3252 oxidized Threonine biotinylated with biotin hydrazide T
+UniMOD 362 164.1394 Diisopropylphosphate S
+UniMOD 362 164.1394 Diisopropylphosphate Y
+UniMOD 363 122.0596 monoisopropyl phosphate Y
+UniMOD 363 122.0596 monoisopropyl phosphate S
+UniMOD 364 111.05 "Bruker Daltonics SERVA-ICPL(TM) quantification chemistry, heavy form" N-term
+UniMOD 364 111.05 "Bruker Daltonics SERVA-ICPL(TM) quantification chemistry, heavy form" K
+UniMOD 365 105.0941 "Bruker Daltonics SERVA-ICPL(TM) quantification chemistry, light form" N-term
+UniMOD 365 105.0941 "Bruker Daltonics SERVA-ICPL(TM) quantification chemistry, light form" K
+UniMOD 366 2.9845 Deamidation in presence of O18 Q
+UniMOD 366 2.9845 Deamidation in presence of O18 N
+UniMOD 367 -43.0711 Arginine oxidation to gamma-glutamyl semialdehyde R
+UniMOD 368 -34.0809 Dehydroalanine (from Cysteine) C
+UniMOD 369 -28.0101 Pyrrolidone from Proline P
+UniMOD 371 86.0892 Michael addition of hydroxymethylvinyl ketone to cysteine C
+UniMOD 372 -42.04 Ornithine from Arginine R
+UniMOD 373 46.895 Selenium replaces sulphur in cysteine C
+UniMOD 374 -1.0079 Half of a disulfide bridge C
+UniMOD 375 143.2068 Diphthamide H
+UniMOD 376 220.3505 hydroxyfarnesyl C
+UniMOD 377 576.9334 diacylglycerol C
+UniMOD 378 72.0627 carboxyethyl K
+UniMOD 379 87.1204 hypusine K
+UniMOD 380 266.4204 retinal K
+UniMOD 381 14.9683 alpha-amino adipic acid K
+UniMOD 382 -33.0961 pyruvic acid from N-term cys C
+UniMOD 383 -17.0305 pyruvic acid from N-term ser S
+UniMOD 384 0.9848 phenyllactyl from N-term Phe F N-term
+UniMOD 385 -17.0305 oxobutanoic acid from N term Thr T N-term
+UniMOD 386 100.0728 succinylated N-term Trp W N-term
+UniMOD 387 586.678 phycocyanobilin C
+UniMOD 388 588.6939 phycoerythrobilin C
+UniMOD 389 584.6621 phytochromobilin C
+UniMOD 390 616.4873 heme H
+UniMOD 390 616.4873 heme C
+UniMOD 391 520.2668 molybdopterin C
+UniMOD 392 29.9829 quinone W
+UniMOD 392 29.9829 quinone Y
+UniMOD 393 340.2806 glucosylgalactosyl hydroxylysine K
+UniMOD 394 123.0477 glycosylphosphatidylinositol C-term
+UniMOD 395 881.6335 phosphoribosyl dephospho-coenzyme A S
+UniMOD 396 197.1262 glycerylphosphorylethanolamine E
+UniMOD 397 469.7849 triiodo Y
+UniMOD 398 595.6815 tetraiodo Y
+UniMOD 399 -18.0153 Dehydro S
+UniMOD 399 -18.0153 Dehydro T
+UniMOD 400 -94.1112 Dehydroalanine (from Tyrosine) Y
+UniMOD 401 -2.0159 didehydro S
+UniMOD 401 -2.0159 didehydro Y
+UniMOD 402 -18.0815 oxoalanine C
+UniMOD 403 -15.0146 lactic acid from N-term Ser N-term
+UniMOD 405 329.2059 AMP binding site T
+UniMOD 405 329.2059 AMP binding site K
+UniMOD 405 329.2059 AMP binding site Y
+UniMOD 405 329.2059 AMP binding site H
+UniMOD 407 146.1427 hydroxycinnamyl C
+UniMOD 408 148.114 glycosyl-L-hydroxyproline P
+UniMOD 409 454.3279 flavin mononucleotide H
+UniMOD 409 454.3279 flavin mononucleotide C
+UniMOD 410 635.1417 S-diphytanylglycerol diether C
+UniMOD 411 119.1207 phenyl isocyanate N-term
+UniMOD 412 124.1515 d5-phenyl isocyanate N-term
+UniMOD 413 345.2053 phospho-guanosine K
+UniMOD 413 345.2053 phospho-guanosine H
+UniMOD 414 30.026 hydroxymethyl N
+UniMOD 415 1618.9096 L-selenocysteinyl molybdenum bis(molybdopterin guanine dinucleotide) C
+UniMOD 416 418.3973 dipyrrolylmethanemethyl C
+UniMOD 417 306.166 uridine phosphodiester Y
+UniMOD 417 306.166 uridine phosphodiester H
+UniMOD 418 31.9988 trihydroxyphenylalanine Y
+UniMOD 419 154.0584 glycerophospho S
+UniMOD 420 16.0656 thiocarboxylic acid G
+UniMOD 421 32.065 persulfide C
+UniMOD 422 70.0468 N-pyruvic acid 2-iminyl V
+UniMOD 422 70.0468 N-pyruvic acid 2-iminyl C
+UniMOD 422 70.0468 N-pyruvic acid 2-iminyl K
+UniMOD 423 78.96 selenyl C
+UniMOD 424 1572.0146 molybdenum bis(molybdopterin guanine dinucleotide) C
+UniMOD 424 1572.0146 molybdenum bis(molybdopterin guanine dinucleotide) D
+UniMOD 425 31.9988 dihydroxy R
+UniMOD 425 31.9988 dihydroxy P
+UniMOD 425 31.9988 dihydroxy K
+UniMOD 426 126.1962 octanoyl S
+UniMOD 426 126.1962 octanoyl T
+UniMOD 427 176.1241 glucuronosyl S
+UniMOD 428 283.1724 N-acetylglucosamine-1-phosphoryl S
+UniMOD 429 242.1205 phosphoglycosyl-D-mannose-1-phosphoryl S
+UniMOD 430 -18.0153 C-term blocking imide C-term
+UniMOD 431 236.3929 palmitoleyl C
+UniMOD 432 368.6383 cholesterol ester C-term
+UniMOD 433 264.4046 "3,4-didehydroretinylidene" K
+UniMOD 434 294.3859 "cis-14-hydroxy-10,13-dioxo-7-heptadecenoic ester" D
+UniMOD 435 109.1259 4-methyl-delta-1-pyrroline-5-carboxyl K
+UniMOD 436 614.4714 hydroxyheme E
+UniMOD 437 386.3003 (3-aminopropyl)(L-aspartyl-1-amino)phosphoryl-5-adenosine C-term
+UniMOD 438 25.0095 cyano C
+UniMOD 439 342.876 hydrogenase diiron subcluster C
+UniMOD 440 42.04 amidino C
+UniMOD 441 238.4088 N-palmitoyl N-term
+UniMOD 442 438.3285 O3-(riboflavin phosphoryl) T
+UniMOD 442 438.3285 O3-(riboflavin phosphoryl) S
+UniMOD 443 456.3438 S-(4a-FMN) C
+UniMOD 444 922.067 copper sulfido molybdopterin cytosine dinuncleotide C
+UniMOD 445 59.0871 "5-hydroxy-N6,N6,N6-trimethyl" K
+UniMOD 446 44.0095 N-carboxylation of Met N-term
+UniMOD 447 -15.9994 reduction D
+UniMOD 448 831.6871 microcin E492 siderophore ester from serine C-term
+UniMOD 449 154.2493 lipid S
+UniMOD 449 154.2493 lipid T
+UniMOD 450 129.114 monoglutamyl E
+UniMOD 450 129.114 monoglutamyl C-term
+UniMOD 451 258.228 diglutamyl C-term
+UniMOD 451 258.228 diglutamyl E
+UniMOD 452 387.3419 triglutamyl C-term
+UniMOD 452 387.3419 triglutamyl E
+UniMOD 453 516.4559 tetraglutamyl C-term
+UniMOD 453 516.4559 tetraglutamyl E
+UniMOD 454 161.1558 Hexosamine W
+UniMOD 454 161.1558 Hexosamine T
+UniMOD 454 161.1558 Hexosamine N
+UniMOD 454 161.1558 Hexosamine K
+UniMOD 455 154.2096 "One end of crosslink attached, one end free" N-term
+UniMOD 455 154.2096 "One end of crosslink attached, one end free" K
+UniMOD 456 122.1677 Both ends of crosslink attached to same peptide N-term
+UniMOD 456 122.1677 Both ends of crosslink attached to same peptide K
+UniMOD 457 175.1855 "naphthalene-2,3-dicarboxaldehyde" N-term
+UniMOD 457 175.1855 "naphthalene-2,3-dicarboxaldehyde" K
+UniMOD 464 221.2054 4-sulfophenyl isothiocyanate (Heavy C13) N-term
+UniMOD 464 221.2054 4-sulfophenyl isothiocyanate (Heavy C13) K
+UniMOD 465 32.0778 N-reductive amination-D N-term
+UniMOD 472 59.1334 aminoethylcysteine S
+UniMOD 472 59.1334 aminoethylcysteine T
+UniMOD 475 136.1265 Sulfonation of Lysine K
+UniMOD 476 128.1922 4-trimethyllammoniumbutyryl- N-term
+UniMOD 476 128.1922 4-trimethyllammoniumbutyryl- K
+UniMOD 477 137.2476 d9-4-trimethyllammoniumbutyryl- K
+UniMOD 477 137.2476 d9-4-trimethyllammoniumbutyryl- N-term
diff --git a/PTMSearchBigDB.py b/PTMSearchBigDB.py
new file mode 100644
index 0000000..b3a0adb
--- /dev/null
+++ b/PTMSearchBigDB.py
@@ -0,0 +1,171 @@
+#Title: PTMSearchBigDB.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+Context:
+ We've performed an unrestrictive search of many spectra against a small database.
+ We've found a collection of modified peptides, after taking a p-value threshold at
+ the spectrum level. We want to better distinguish between VALID and INVALID annotations.
+
+Plan:
+ Let's take each modified peptide and search its consensus spectrum against
+ a large database (Swiss-Prot). The resulting delta-score should be an informative feature
+ when it comes to distinguishing between the VALID and INVALID annotations. The delta-score
+ will be zero (actually, slightly negative) if the consensus spectrum matches an unmodified
+ peptide (e.g. an unanticipated contaminant).
+"""
+
+import os
+import sys
+import string
+import getopt
+import MSSpectrum
+from Utils import *
+Initialize()
+from TrainPTMFeatures import FormatBits
+
+class PeptideFeatureBag:
+ pass
+
+class PTMSearcher:
+ def __init__(self):
+ self.HeaderLines = []
+ self.ConsensusSpectrumDir = "ptmscore\\LensLTQ-99-5\\spectra"
+ self.PeptideFeatureFileName = "PTMScore\\LensLTQ-99-5.txt"
+ self.FixedFeatureFileName = None
+ self.ModifiedPeptides = []
+ self.InspectOut = None
+ self.PeptideFeatureDir = None # set via -d; checked in ParseCommandLine
+ def ParsePeptideFeatureFile(self):
+ """
+ Parse the contents of the peptide feature-file. We need to know the
+ path to the consensus spectrum file, the consensus annotation MQScore,
+ and the index.
+ """
+ File = open(self.PeptideFeatureFileName, "rb")
+ LineNumber = 0
+ for FileLine in File.xreadlines():
+ LineNumber +=1
+ if FileLine[0] == "#":
+ self.HeaderLines.append(FileLine)
+ continue
+ Bits = list(FileLine.replace("\r", "").replace("\n", "").split("\t"))
+ try:
+ ConsensusMQScore = float(Bits[FormatBits.ConsensusMQScore])
+ except:
+ print "** Error: Can't parse consensus MQScore from line %s!"%LineNumber
+ print Bits
+ continue
+ PeptideFeatures = PeptideFeatureBag()
+ PeptideFeatures.Bits = Bits
+ PeptideFeatures.ConsensusMQScore = ConsensusMQScore
+ NiceAnnotation = Bits[FormatBits.Peptide].replace("*", "-")
+ PeptideFeatures.Bits[FormatBits.Peptide] = NiceAnnotation
+ FirstResidue = NiceAnnotation[2]
+ Charge = Bits[FormatBits.Charge]
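+ # Consensus spectra appear to be stored as <ConsensusSpectrumDir>/<first residue>/<annotation>.<charge>.dta: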
+ PeptideFeatures.SpectrumPath = os.path.join(self.ConsensusSpectrumDir, FirstResidue, "%s.%s.dta"%(NiceAnnotation, Charge))
+ self.ModifiedPeptides.append(PeptideFeatures)
+ File.close()
+ print "Parsed %s modified peptides from %s file lines."%(len(self.ModifiedPeptides), LineNumber)
+ def ComputeDeltaScoreFeatureFile(self, FileName):
+ File = open(FileName, "rb")
+ OldSpectrum = None
+ for FileLine in File.xreadlines():
+ if FileLine[0] == "#":
+ continue
+ Bits = FileLine.split("\t")
+ Spectrum = (Bits[0], Bits[1])
+ if Spectrum == OldSpectrum:
+ continue
+ OldSpectrum = Spectrum
+ MQScore = float(Bits[5])
+ ScanNumber = int(Bits[1])
+ PeptideFeatures = self.ModifiedPeptides[ScanNumber]
+ while len(PeptideFeatures.Bits) <= FormatBits.ConsensusDeltaBigDB:
+ PeptideFeatures.Bits.append("")
+ PeptideFeatures.Bits[FormatBits.BigDBAnnotation] = Bits[2]
+ PeptideFeatures.Bits[FormatBits.BigDBMQScore] = Bits[5]
+ DeltaScore = float(PeptideFeatures.ConsensusMQScore - MQScore)
+ PeptideFeatures.Bits[FormatBits.ConsensusDeltaBigDB] = str(DeltaScore)
+ File.close()
+ def ComputeDeltaScoreFeature(self):
+ """
+ Parse annotations from the Inspect search, and update each corresponding modified-peptide
+ record with its modification-free (big-DB) annotation and delta-score.
+ """
+ # Iterate over just one result file, or a directory full of results-files:
+ if os.path.isdir(self.InspectOut):
+ for FileName in os.listdir(self.InspectOut):
+ Path = os.path.join(self.InspectOut, FileName)
+ self.ComputeDeltaScoreFeatureFile(Path)
+ else:
+ self.ComputeDeltaScoreFeatureFile(self.InspectOut)
+ # Write out the fixed feature-rows:
+ File = open(self.FixedFeatureFileName, "wb")
+ for HeaderLine in self.HeaderLines:
+ File.write(HeaderLine)
+ for Peptide in self.ModifiedPeptides:
+ FileLine = string.join(Peptide.Bits, "\t")
+ File.write(FileLine + "\n")
+ File.close()
+ def Main(self):
+ self.ParsePeptideFeatureFile()
+ self.ComputeDeltaScoreFeature()
+ def ParseCommandLine(self, Arguments):
+ (Options, Args) = getopt.getopt(Arguments, "d:w:r:")
+ OptionsSeen = {}
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-d":
+ self.PeptideFeatureDir = Value
+ elif Option == "-w":
+ self.FixedFeatureFileName = Value
+ elif Option == "-r":
+ self.InspectOut = Value
+ if not self.PeptideFeatureDir:
+ print UsageInfo
+ sys.exit(-1)
+ self.PeptideFeatureFileName = os.path.join(self.PeptideFeatureDir, "PTMFeatures.txt")
+ self.ConsensusSpectrumDir = os.path.join(self.PeptideFeatureDir, "Clusters")
+
+UsageInfo = """
+PTMSearchBigDB arguments:
+ -d [DIR]: Peptide directory. This directory should contain PTMFeatures.txt, as well
+ as the consensus spectra and clusters.
+ -w [FILE]: Output file, for peptides with delta-score included
+ -r [FILE]: Inspect output filename
+"""
+
+if __name__ == "__main__":
+ Searcher = PTMSearcher()
+ Searcher.ParseCommandLine(sys.argv[1:])
+ Searcher.Main()
diff --git a/PTMods.txt b/PTMods.txt
new file mode 100644
index 0000000..a0884a7
--- /dev/null
+++ b/PTMods.txt
@@ -0,0 +1,105 @@
+#Name Mass Monoisotopic Residues
+#Acetylation 42.0106 K
+#Alkylation 14.01564 CKRHDENQ
+#Amidation -0.984
+#S-archaeol 634.6628
+
+#Biotin 226.0776 K
+#Bromination 77.9105 HFW
+#Carbamylation 43.00581 K
+#Cholesterol 368.3443
+#plants CHDH 294.39 D
+#Citrullination 0.9840276 R
+#C-Mannosylation 162.052823
+#Deamidation 0.984 NQ
+#S-diacylglycerol cysteine 576.51171
+#Dimethylation 28.0314 CKRHDENQ
+#FAD 783.1415 CH
+#Farnesylation 204.1878 C
+#Formylation 27.9949
+#Geranyl-geranylation 272.2504 C
+#Gamma-carboxyglutamic acid 43.98983 E
+#O-GlcNAc 203.0794 ST
+#Glucosylation (glycation) 162.0528 NTK
+#Glutathionylation 305.0680814 C
+#Hydroxylation 15.9949 PKDN
+#Lipoyl 188.033 K
+#Myristoylation 210.1984
+
+#n-Octanoate 126.1044 S
+#Omega-hydroxyceramide glutamate ester 760.73082 E
+#Palmitoylation 238.2297 STCK
+#yeast PALE 236.39 C
+#Phosphatidylethanolamine amidated glycine 773.54443
+# Generally phosphorylation only affects S, T, and Y. It *can* affect CDH, but that's relatively rare.
+#Phosphorylation 79.9663 STY
+Phosphorylation 80 STY
+#Phosphorylation 79.9663 STYHCD
+Methylation 14.0157 CKRHDENQ 3
+krmethylation 14.0157 KR 2
+CMethylation 14.0157 C 3
+Biotin 339.16 K
+HelgeProbe 511.6 C
+Oxidation 16 M 2
+#MissingCarb -57 C
+CSmall -14 C
+CBig 103.143 C
+DoubleOxidation 32 M
+CysteineMod 57.0518 C
+MCysteineMod 71.067 C
+#Pyridoxal phosphate 229.014 K
+#Phosphopantetheine 339.078 S
+#Pyrrolidone carboxylic acid -17.0266 Q
+#Sulfation 79.9568 Y
+#Trimethylation 42.0471 CKRHDENQ
+Acetylation 42.0106 K
+Deamidation -17 QC
+#Hydroxylation 15.9949 PKDN
+Hydroxylation 15.9949 PK
+Sulfation 80 Y
+S-Nitrosylation 28.99017 C
+sprobe 766.7 S
+cprobe 511.6 C
+Beta-methylthiolation 45.9877118 C
+1Cysteine sulfenic acid (-SOH) 15.9949146 C
+2Cysteine sulfinic acid (-SO2H) 31.9898292 C
+OxoHist 16.0 C
+Desmosine -58 K
+EpsImine 12 K
+Citruline 1 R
+terminal 14.01564 A
+cter 14.01564 A
+nomet -14.02 A
+nt4 4.0 ACDEFGHIKLMNPQRSTVWY
+nt6 6.0 ACDEFGHIKLMNPQRSTVWY
+nt10 10.0 ACDEFGHIKLMNPQRSTVWY
+lys4 4.0 K
+lys6 6.0 K
+lys10 10.0 K
+#-57ikkb -57.0 C
+
+#1lens 1.0 CK
+
+##-18ikkb -18.0 DET
+##1ikkb 1.0 N
+##10ikkb 10.0 A
+##14ikkb 14.0 CK
+##16ikkb 16.0 MW
+##22ikkb 22.0 DE
+##25ikkb 25.0 L
+##28ikkb 28.01 K
+##32ikkb 32.01 MW
+##40ikkb 40.0 P
+##+43k 43.0 K
+##42lens 42.0 A
+##43lens 43.0 A
+##38lens 38.0 A
+##21lens 21.0 A
+cysl -2 C
+cyst 12 C
+g14 14 G
+dehydration -18 DES
+sodium 22 DE
+dopa 16 Y
+gtod 58 G
+atov 29 A
\ No newline at end of file
diff --git a/PValue.c b/PValue.c
new file mode 100644
index 0000000..f64de02
--- /dev/null
+++ b/PValue.c
@@ -0,0 +1,662 @@
+//Title: PValue.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include "PValue.h"
+#include "Score.h"
+#include "Errors.h"
+
+double GammaCof[] = {76.18009172947146, -86.50532032941677,
+ 24.01409824083091, -1.231739572450155,
+ 0.1208650973866179e-2, -0.5395239384952e-5};
+
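+// Gamma() evaluates the gamma function using the classic six-coefficient
+// Lanczos-style series (as in the familiar "gammln" routine): the sum is
+// accumulated in log space and exponentiated at the end. It is used below to
+// normalize the gamma density fitted to the false-match f-score distribution.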
+double Gamma(double Z)
+{
+ double X;
+ double Y;
+ double Temp;
+ double Ser;
+ int J;
+ //////////
+ X = Z;
+ Y = Z;
+ Temp = X + 5.5;
+ Temp -= (X + 0.5) * log(Temp);
+ Ser = 1.000000000190015;
+ for (J = 0; J < 6; J++)
+ {
+ Y += 1;
+ Ser += GammaCof[J] / Y;
+ }
+ Z = -Temp + log(2.5066282746310005 * Ser / X);
+ return exp(Z);
+}
+
+#define F_BIN_MAX 511
+#define F_BIN_OFFSET 40 //used because F-scores can be negative, but array indices cannot be
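+// For example, an f-score of -2.3 lands in bin floor(10 * -2.3) + F_BIN_OFFSET = 17
+// (clipped to [0, F_BIN_MAX]), and bin 17 maps back to (17 - 40) / 10.0 = -2.3.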
+
+void DebugPrintPValueCurve(char* FileName, int* FScoreHistogram, double* OddsTrue)
+{
+ FILE* PValueFile;
+ int FBin;
+ float X;
+ //
+ PValueFile = fopen(FileName, "wb");
+ if (!PValueFile)
+ {
+ printf("** Not debug-printing the p-value curve.\n");
+ return;
+ }
+ for (FBin = 0; FBin <= F_BIN_MAX; FBin++)
+ {
+ X = (FBin - F_BIN_OFFSET) / (float)10.0;
+ fprintf(PValueFile, "%d\t%d\t%.3f\t%.3f\t\n", FBin, FScoreHistogram[FBin], X, 1.0 - OddsTrue[FBin]);
+ }
+ fclose(PValueFile);
+}
+
+#define EM_CYCLE_COUNT 300
+#define SQRT_2_PI (float)2.5066282746310002
+
+#define MAX_BITS 30
+#define PROCESS_COMPUTE_MEAN_DELTA 0
+#define PROCESS_INITIALIZE_SCORE_HISTOGRAM 1
+#define PROCESS_WRITE_PVALUES 2
+
+#define BIT_INDEX_CHARGE 4
+#define BIT_INDEX_MQSCORE 5
+#define BIT_INDEX_PVALUE 13
+#define BIT_INDEX_FSCORE 14
+#define BIT_INDEX_DELTA_SCORE 16
+#define DEFAULT_MQ_SCORE_WEIGHT (float)0.3
+#define DEFAULT_DELTA_SCORE_WEIGHT (float)1.5
+#define BLIND_MQ_SCORE_WEIGHT (float)0.3
+#define BLIND_DELTA_SCORE_WEIGHT (float)2.25
+
+float MQScoreWeight;
+float DeltaScoreWeight;
+
+typedef struct PValueParseInfo
+{
+ FILE* OutputFile;
+ int TotalMatches;
+ float MeanDeltaScore;
+ int Action;
+ char CurrentSpectrum[512];
+} PValueParseInfo;
+
+typedef struct PValueInfo
+{
+ float MeanDeltaScore2;
+ float MeanDeltaScore3;
+ int TotalMatches2;
+ int TotalMatches3;
+ char* FinalOutputPath;
+ int FScoreHistogram2[F_BIN_MAX + 1];
+ double OddsTrue2[F_BIN_MAX + 1];
+ int FScoreHistogram3[F_BIN_MAX + 1];
+ double OddsTrue3[F_BIN_MAX + 1];
+ PValueParseInfo* ParseInfo;
+} PValueInfo;
+
+typedef struct PValueModel
+{
+ double MeanTrue;
+ double VarianceTrue;
+ double MeanFalse;
+ double VarianceFalse;
+ double PriorProbabilityTrue;
+ double ThetaFalse;
+ double KFalse;
+ double GammaOffset;
+ double StdDevTrue;
+ double GammaDemonFalse;
+ double CountTrue;
+ double CountFalse;
+} PValueModel;
+
+// CustomTok is a variant of strtok: It returns once for every occurrence of a delimiter,
+// rather than once for every contiguous block of delimiters. Why the difference?
+// When processing tab-delimited files, we want to properly handle empty columns (corresponding to
+// occurrences of \t\t in the text).
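+// For illustration, splitting "a\t\tb" on "\t" with CustomTok yields "a",
+// "" (an empty column), "b", and then NULL, whereas strtok would yield only
+// "a" and "b":
+//
+//   char Line[] = "a\t\tb";
+//   char* Tok = CustomTok(Line, "\t");   // "a"
+//   Tok = CustomTok(NULL, "\t");         // "" (empty column)
+//   Tok = CustomTok(NULL, "\t");         // "b"
+//   Tok = CustomTok(NULL, "\t");         // NULL (end of input)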
+static char* CustomTokEnd;
+static char* CustomTokNext;
+char* CustomTok(char* Buffer, char* Delimiters)
+{
+ char* CheckPos;
+ char* StringStart;
+ char* CheckDelimiter;
+ //
+ if (Buffer)
+ {
+ CustomTokEnd = Buffer + strlen(Buffer);
+ CheckPos = Buffer;
+ StringStart = Buffer;
+ }
+ else
+ {
+ CheckPos = CustomTokNext;
+ StringStart = CustomTokNext;
+ }
+
+ // If we're out of bits, then say so:
+ if (CheckPos >= CustomTokEnd)
+ {
+ return NULL;
+ }
+ // Scan forward until you see a delimiter, or until the end of the string:
+ for (; CheckPos < CustomTokEnd; CheckPos++)
+ {
+ for (CheckDelimiter = Delimiters; *CheckDelimiter; CheckDelimiter++)
+ {
+ if (*CheckPos == *CheckDelimiter)
+ {
+ *CheckPos = '\0';
+ CustomTokNext = CheckPos + 1;
+ return StringStart;
+ }
+ }
+ }
+ // We didn't see the delimiter.
+ CustomTokNext = CustomTokEnd;
+ return StringStart;
+}
+
+int PValueProcessResultsFileLine(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ PValueInfo* Info;
+ PValueParseInfo* ParseInfo;
+ char* Bits[MAX_BITS];
+ int BitCount;
+ char* Bit;
+ char Spectrum[512];
+ int TopMatchFlag;
+ float MQScore;
+ float DeltaScore;
+ float FScore;
+ int FBin;
+ float PValue;
+ char PValueBuffer[256];
+ char FScoreBuffer[256];
+ int BitIndex;
+ int Charge;
+
+ //
+ Info = (PValueInfo*)UserData;
+ ParseInfo = Info->ParseInfo;
+ if (ParseInfo->Action == PROCESS_WRITE_PVALUES)
+ {
+ INSPECT_ASSERT(ParseInfo->OutputFile);
+ }
+
+ // Handle comments:
+ if (LineBuffer[0] == '#')
+ {
+ // Comment lines are written out as-is:
+ if (ParseInfo->Action == PROCESS_WRITE_PVALUES)
+ {
+ fprintf(ParseInfo->OutputFile, "%s\n", LineBuffer);
+ }
+ return 1;
+ }
+ // Split the line into tab-delimited bits:
+ Bit = CustomTok(LineBuffer, "\t");
+ Bits[0] = Bit;
+ BitCount = 1;
+ while (1)
+ {
+ Bit = CustomTok(NULL, "\t");
+ if (!Bit)
+ {
+ break;
+ }
+ Bits[BitCount] = Bit;
+ BitCount++;
+ if (BitCount >= MAX_BITS)
+ {
+ break;
+ }
+ }
+
+    // If we don't have enough tab-bits to cover the delta-score column, then this isn't a valid line, and we skip it:
+    if (BitCount < BIT_INDEX_DELTA_SCORE + 1)
+ {
+ return 1;
+ }
+
+ // Note whether this is the top-scoring match for the spectrum:
+ sprintf(Spectrum, "%256s%50s", Bits[0], Bits[1]);
+ if (strcmp(Spectrum, ParseInfo->CurrentSpectrum))
+ {
+ TopMatchFlag = 1;
+ strncpy(ParseInfo->CurrentSpectrum, Spectrum, 512);
+ }
+ else
+ {
+ TopMatchFlag = 0;
+ }
+ Charge = atoi(Bits[BIT_INDEX_CHARGE]);
+
+ // Now take various actions:
+ switch (ParseInfo->Action)
+ {
+ case PROCESS_COMPUTE_MEAN_DELTA:
+ if (TopMatchFlag)
+ {
+ if (Charge < 3)
+ {
+ Info->MeanDeltaScore2 += (float)atof(Bits[BIT_INDEX_DELTA_SCORE]);
+ Info->TotalMatches2++;
+ }
+ else
+ {
+ Info->MeanDeltaScore3 += (float)atof(Bits[BIT_INDEX_DELTA_SCORE]);
+ Info->TotalMatches3++;
+ }
+ }
+ break;
+ case PROCESS_INITIALIZE_SCORE_HISTOGRAM:
+ if (TopMatchFlag)
+ {
+ MQScore = (float)atof(Bits[BIT_INDEX_MQSCORE]);
+ DeltaScore = (float)atof(Bits[BIT_INDEX_DELTA_SCORE]) / (Charge < 3 ? Info->MeanDeltaScore2 : Info->MeanDeltaScore3);
+ FBin = (int)floor((10 * (MQScoreWeight * MQScore + DeltaScoreWeight * DeltaScore)));
+ FBin = min(max(FBin + F_BIN_OFFSET, 0), F_BIN_MAX);
+ if (Charge < 3)
+ {
+ Info->FScoreHistogram2[FBin]++;
+ }
+ else
+ {
+ Info->FScoreHistogram3[FBin]++;
+ }
+ }
+ break;
+ case PROCESS_WRITE_PVALUES:
+ MQScore = (float)atof(Bits[BIT_INDEX_MQSCORE]);
+ DeltaScore = (float)atof(Bits[BIT_INDEX_DELTA_SCORE]) / (Charge < 3 ? Info->MeanDeltaScore2 : Info->MeanDeltaScore3);
+ FScore = (MQScoreWeight * MQScore) + (DeltaScoreWeight * DeltaScore);
+ sprintf(FScoreBuffer, "%.5f", FScore);
+ Bits[BIT_INDEX_FSCORE] = FScoreBuffer;
+ FBin = (int)(10 * FScore);
+ FBin = min(max(FBin + F_BIN_OFFSET, 0), F_BIN_MAX);
+ if (Charge < 3)
+ {
+ PValue = (float)(1.0 - Info->OddsTrue2[FBin]);
+ }
+ else
+ {
+ PValue = (float)(1.0 - Info->OddsTrue3[FBin]);
+ }
+ sprintf(PValueBuffer, "%.5f", PValue);
+ Bits[BIT_INDEX_PVALUE] = PValueBuffer;
+ for (BitIndex = 0; BitIndex < BitCount; BitIndex++)
+ {
+ fprintf(ParseInfo->OutputFile, "%s\t", Bits[BitIndex]);
+ }
+ fprintf(ParseInfo->OutputFile, "\n");
+ break;
+ default:
+ printf("** Error: Unknown action '%d' in ProcessResultsFile!\n", ParseInfo->Action);
+ return 0;
+ }
+ return 1;
+}
+
+void ProcessResultsFile(PValueInfo* Info, char* FilePath, int Action)
+{
+ FILE* File;
+ PValueParseInfo ParseInfo;
+ memset(&ParseInfo,0,sizeof(ParseInfo));
+ //
+ Info->ParseInfo = &ParseInfo;
+ File = fopen(FilePath, "rb");
+ if (!File)
+ {
+ REPORT_ERROR_S(8, FilePath);
+ return;
+ }
+ ParseInfo.Action = Action;
+ if (Action == PROCESS_WRITE_PVALUES)
+ {
+ ParseInfo.OutputFile = fopen(Info->FinalOutputPath, "wb");
+ if (!ParseInfo.OutputFile)
+ {
+ REPORT_ERROR_S(8, Info->FinalOutputPath);
+            fclose(File);
+            return;
+ }
+ }
+    ParseFileByLines(File, PValueProcessResultsFileLine, Info, 1);
+    fclose(File);
+    if (ParseInfo.OutputFile)
+    {
+        fclose(ParseInfo.OutputFile);
+    }
+}
+
+#define DEFAULT_MEAN_TRUE 4.48
+#define DEFAULT_VARIANCE_TRUE 1.50
+#define DEFAULT_MEAN_FALSE 0.19
+#define DEFAULT_VARIANCE_FALSE 0.18
+#define DEFAULT_PRIOR_PROBABILITY 0.25
+
+PValueModel* InitPValueModel(int Charge3Flag)
+{
+ PValueModel* Model;
+ //
+ Model = (PValueModel*)calloc(1, sizeof(PValueModel));
+ Model->MeanTrue = DEFAULT_MEAN_TRUE;
+ Model->VarianceTrue = DEFAULT_VARIANCE_TRUE;
+ Model->MeanFalse = DEFAULT_MEAN_FALSE;
+ Model->VarianceFalse = DEFAULT_VARIANCE_FALSE;
+ Model->PriorProbabilityTrue = DEFAULT_PRIOR_PROBABILITY;
+ Model->GammaOffset = 0;
+ Model->GammaOffset = max(Model->GammaOffset, -Model->MeanFalse + 0.1);
+ Model->ThetaFalse = Model->VarianceFalse / (Model->MeanFalse + Model->GammaOffset);
+ Model->KFalse = (Model->MeanFalse + Model->GammaOffset) / Model->ThetaFalse;
+ Model->GammaDemonFalse = pow(Model->ThetaFalse, Model->KFalse) * Gamma(Model->KFalse);
+ Model->StdDevTrue = sqrt(Model->VarianceTrue);
+ return Model;
+}
+
+int FitPValueMixtureModel(PValueInfo* Info, PValueModel* Model, int Charge3Flag)
+{
+ int FBin;
+ int* FScoreHistogram;
+ double* OddsTrue;
+ int LowestFScoreBin = -1;
+ //double GammaOffset;
+ int TotalMatches;
+ int EMCycle;
+ double X;
+ int Count;
+ int MaxBinPopulated = 0;
+ float LowestFScore = 0;
+ int CurveFitComplete;
+ double Pow;
+ double GX;
+ double TrueNormal;
+ double FalseGamma;
+ //
+
+ if (Charge3Flag)
+ {
+ FScoreHistogram = Info->FScoreHistogram3;
+ OddsTrue = Info->OddsTrue3;
+ TotalMatches = Info->TotalMatches3;
+ }
+ else
+ {
+ FScoreHistogram = Info->FScoreHistogram2;
+ OddsTrue = Info->OddsTrue2;
+ TotalMatches = Info->TotalMatches2;
+ }
+
+    // Note the lowest and highest score-bins that have any entries at all:
+ for (FBin = 0; FBin <= F_BIN_MAX; FBin++)
+ {
+ if (FScoreHistogram[FBin] > 0 && LowestFScoreBin < 0)
+ {
+ LowestFScoreBin = FBin;
+ }
+ if (FScoreHistogram[FBin])
+ {
+ MaxBinPopulated = FBin;
+ }
+ }
+
+ // Convert the lowest F-score bin# into the corresponding score:
+ LowestFScore = (LowestFScoreBin - F_BIN_OFFSET) / (float)10.0;
+ Model->GammaOffset = 0.0;
+ if (LowestFScore <= 0)
+ {
+ Model->GammaOffset = max(Model->GammaOffset, -LowestFScore + 0.1);
+ }
+ if (Model->MeanFalse <= 0)
+ {
+ Model->GammaOffset = max(Model->GammaOffset, -Model->MeanFalse + 0.1);
+ }
+
+ ////////////////////////////////////////////////////////////////////
+ // Fit the mixture model, using a gamma distribution for false match f-scores and a
+ // normal distribution for true match f-scores.
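+    // Concretely, each E-M iteration below evaluates, for the f-score x of a bin,
+    //
+    //   P(true | x) = p * Normal(x; MeanTrue, VarianceTrue)
+    //                 / (p * Normal(x; MeanTrue, VarianceTrue)
+    //                    + (1 - p) * GammaDensity(x + GammaOffset; KFalse, ThetaFalse))
+    //
+    // with p = PriorProbabilityTrue, and then re-estimates the means, variances
+    // and p from the posterior-weighted histogram counts.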
+ Model->ThetaFalse = Model->VarianceFalse / (Model->MeanFalse + Model->GammaOffset);
+ Model->KFalse = (Model->MeanFalse + Model->GammaOffset) / Model->ThetaFalse;
+ Model->GammaDemonFalse = pow(Model->ThetaFalse, Model->KFalse) * Gamma(Model->KFalse);
+ Model->StdDevTrue = sqrt(Model->VarianceTrue);
+ if (TotalMatches < 200)
+ {
+ REPORT_WARNING_I(10, TotalMatches);
+ CurveFitComplete = 0;
+ }
+ else
+ {
+ for (EMCycle = 0; EMCycle < EM_CYCLE_COUNT; EMCycle++)
+ {
+ // For each bin, compute the probability that it's true:
+ for (FBin = 0; FBin <= F_BIN_MAX; FBin++)
+ {
+ // After the last histogram entry, just inherit the last true-probability.
+ if (FBin > MaxBinPopulated)
+ {
+ OddsTrue[FBin] = OddsTrue[FBin - 1];
+ continue;
+ }
+ X = (FBin - F_BIN_OFFSET) / 10.0;
+ Pow = (X - Model->MeanTrue);
+ Pow = - (Pow * Pow / (2 * Model->VarianceTrue));
+ TrueNormal = exp(Pow) / (Model->StdDevTrue * SQRT_2_PI);
+ GX = max(0.01, X + Model->GammaOffset);
+ FalseGamma = pow(GX, Model->KFalse - 1) * exp(-GX / Model->ThetaFalse) / Model->GammaDemonFalse;
+ // Avoid underflow:
+ if (TrueNormal < 0.00001)
+ {
+ if (X > 5)
+ {
+ OddsTrue[FBin] = (float)0.99999;
+ }
+ else
+ {
+ OddsTrue[FBin] = (float)0.0;
+ }
+ }
+ else
+ {
+ OddsTrue[FBin] = (TrueNormal * Model->PriorProbabilityTrue) / (TrueNormal * Model->PriorProbabilityTrue + FalseGamma * (1 - Model->PriorProbabilityTrue));
+ }
+
+ //printf("%.8f\t%.8f\t%.8f\t%.8f\n", X, TrueNormal, FalseGamma, OddsTrue[FBin]);
+ // Because the left tail of the normal distribution falls off slowly, the value of OddsTrue can be
+ // high for negative values. Cap it.
+ if (FBin < F_BIN_OFFSET)
+ {
+ OddsTrue[FBin] = min(OddsTrue[FBin], 1.0 / (F_BIN_OFFSET - FBin));
+ }
+ }
+ /////////////////////////////////////////////////
+ // Compute the mean of the true and the false distributions:
+ Model->CountTrue = 0;
+ Model->MeanTrue = 0;
+ Model->CountFalse = 0;
+ Model->MeanFalse = 0;
+ for (FBin = 0; FBin <= F_BIN_MAX; FBin++)
+ {
+ X = (FBin - F_BIN_OFFSET) / 10.0;
+ Count = FScoreHistogram[FBin];
+ Model->MeanTrue += X * OddsTrue[FBin] * Count;
+ Model->CountTrue += OddsTrue[FBin] * Count;
+ Model->MeanFalse += X * (1.0 - OddsTrue[FBin]) * Count;
+ Model->CountFalse += (1.0 - OddsTrue[FBin]) * Count;
+ }
+ Model->MeanTrue /= Model->CountTrue;
+ Model->MeanFalse /= Model->CountFalse;
+ Model->PriorProbabilityTrue = Model->CountTrue / (Model->CountTrue + Model->CountFalse);
+ /////////////////////////////////////////////////
+ // Compute the variance of the true and the false distributions:
+ Model->VarianceTrue = 0;
+ Model->VarianceFalse = 0;
+ for (FBin = 0; FBin <= F_BIN_MAX; FBin++)
+ {
+ X = (FBin - F_BIN_OFFSET) / 10.0;
+ Count = FScoreHistogram[FBin];
+ Model->VarianceTrue += (X - Model->MeanTrue) * (X - Model->MeanTrue) * Count * OddsTrue[FBin];
+ Model->VarianceFalse += (X - Model->MeanFalse) * (X - Model->MeanFalse) * Count * (1.0 - OddsTrue[FBin]);
+ }
+ Model->VarianceTrue /= Model->CountTrue;
+ Model->StdDevTrue = sqrt(Model->VarianceTrue);
+ Model->VarianceFalse /= Model->CountFalse;
+ // Recompute other distribution parameters:
+ Model->ThetaFalse = Model->VarianceFalse / (Model->MeanFalse + Model->GammaOffset);
+ Model->KFalse = (Model->MeanFalse + Model->GammaOffset) / Model->ThetaFalse;
+ Model->GammaDemonFalse = pow(Model->ThetaFalse, Model->KFalse) * Gamma(Model->KFalse);
+ //printf("Cycle %d:\n", EMCycle);
+ //printf("True: Count %.4f mean %.4f variance %.4f\n", CountTrue, MeanTrue, VarianceTrue);
+ //printf("False: Count %.4f mean %.4f variance %.4f\n", CountFalse, MeanFalse, VarianceFalse);
+ //printf("Prior probability true: %.4f\n", PriorProbabilityTrue);
+ } // E-M cycle loop
+ CurveFitComplete = 1;
+ }
+
+ ///////////////////////////////////////
+ // Check to make sure the distribution is sensible. If curve-fitting failed
+ // due to underflow/overflow, then fall back to the default curve:
+ if (Model->GammaDemonFalse <= 0 || Model->KFalse <= 0)
+ {
+ printf("** Error fitting p-value distribution; using default. Consider running PValue.py\n");
+ CurveFitComplete = 0;
+ }
+
+ if (!CurveFitComplete)
+ {
+ // COPY-PASTA: Fill in the OddsTrue array using all default parameters.
+ Model = InitPValueModel(Charge3Flag);
+
+ // For each bin, compute the probability that it's true:
+ for (FBin = 0; FBin <= F_BIN_MAX; FBin++)
+ {
+ // After the last histogram entry, just inherit the last true-probability.
+ if (FBin > MaxBinPopulated)
+ {
+ OddsTrue[FBin] = OddsTrue[FBin - 1];
+ continue;
+ }
+ X = (FBin - F_BIN_OFFSET) / 10.0;
+ Pow = (X - Model->MeanTrue);
+ Pow = - (Pow * Pow / (2 * Model->VarianceTrue));
+ TrueNormal = exp(Pow) / (Model->StdDevTrue * SQRT_2_PI);
+ GX = max(0.01, X + Model->GammaOffset);
+ FalseGamma = pow(GX, Model->KFalse - 1) * exp(-GX / Model->ThetaFalse) / Model->GammaDemonFalse;
+ // Avoid underflow:
+ if (TrueNormal < 0.00001)
+ {
+ if (X > 5)
+ {
+ OddsTrue[FBin] = (float)0.99999;
+ }
+ else
+ {
+ OddsTrue[FBin] = (float)0.0;
+ }
+ }
+ else
+ {
+ OddsTrue[FBin] = (TrueNormal * Model->PriorProbabilityTrue) / (TrueNormal * Model->PriorProbabilityTrue + FalseGamma * (1 - Model->PriorProbabilityTrue));
+ }
+
+ //printf("%.8f\t%.8f\t%.8f\t%.8f\n", X, TrueNormal, FalseGamma, OddsTrue[FBin]);
+ // Because the left tail of the normal distribution falls off slowly, the value of OddsTrue can be
+ // high for negative values. Cap it.
+ if (FBin < F_BIN_OFFSET)
+ {
+ OddsTrue[FBin] = (float)min(OddsTrue[FBin], 1.0 / (F_BIN_OFFSET - FBin));
+ }
+ }
+ // free the temp-model:
+ SafeFree(Model);
+ } // if curve fit didn't complete
+ return 1;
+}
+
+// Iterate over all the matches, and get the distribution
+// of F-scores; use those to derive p-values. We compute
+// one distribution for charge 1 and 2 spectra, a second
+// distribution for charge 3 spectra.
+void CalculatePValues(char* ResultsFilePath, char* FinalOutputPath)
+{
+ PValueModel* Model2;
+ PValueModel* Model3;
+ PValueInfo* Info;
+ //
+ Model2 = InitPValueModel(0);
+ Model3 = InitPValueModel(1);
+ Info = (PValueInfo*)calloc(1, sizeof(PValueInfo));
+
+ if (GlobalOptions->RunMode & (RUN_MODE_MUTATION | RUN_MODE_BLIND))
+ {
+ MQScoreWeight = BLIND_MQ_SCORE_WEIGHT;
+ DeltaScoreWeight = BLIND_DELTA_SCORE_WEIGHT;
+ }
+ else
+ {
+ MQScoreWeight = DEFAULT_MQ_SCORE_WEIGHT;
+ DeltaScoreWeight = DEFAULT_DELTA_SCORE_WEIGHT;
+ }
+
+ //////////////////////////////////////////////////////////
+ // Compute mean delta-score:
+ ProcessResultsFile(Info, ResultsFilePath, PROCESS_COMPUTE_MEAN_DELTA);
+ Info->MeanDeltaScore2 /= max(1, Info->TotalMatches2);
+ Info->MeanDeltaScore2 = max(Info->MeanDeltaScore2, (float)0.01);
+ Info->MeanDeltaScore3 /= max(1, Info->TotalMatches3);
+ Info->MeanDeltaScore3 = max(Info->MeanDeltaScore3, (float)0.01);
+
+ //////////////////////////////////////////////////////////
+    // Initialize FScoreHistogram:
+ memset(Info->FScoreHistogram2, 0, sizeof(int) * (F_BIN_MAX + 1));
+ memset(Info->FScoreHistogram3, 0, sizeof(int) * (F_BIN_MAX + 1));
+ ProcessResultsFile(Info, ResultsFilePath, PROCESS_INITIALIZE_SCORE_HISTOGRAM);
+
+ //////////////////////////////////////////////////////////
+ // Fit the mixture model, populating OddsTrue:
+ FitPValueMixtureModel(Info, Model2, 0);
+ FitPValueMixtureModel(Info, Model3, 1);
+
+    // Verbose output of the p-value curve:
+ // (Disabled in production, especially for the web server!)
+ //DebugPrintPValueCurve("PValueCurve2.txt", Info->FScoreHistogram2, Info->OddsTrue2);
+ //DebugPrintPValueCurve("PValueCurve3.txt", Info->FScoreHistogram3, Info->OddsTrue3);
+
+ // Write the p-values to the final output file:
+ Info->FinalOutputPath = FinalOutputPath;
+ ProcessResultsFile(Info, ResultsFilePath, PROCESS_WRITE_PVALUES);
+ // Now we can erase the intermediate output file:
+ remove(GlobalOptions->OutputFileName);
+}
diff --git a/PValue.h b/PValue.h
new file mode 100644
index 0000000..07a5684
--- /dev/null
+++ b/PValue.h
@@ -0,0 +1,42 @@
+//Title: PValue.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+#ifndef PVALUE_H
+#define PVALUE_H
+#include "Inspect.h"
+#include "Trie.h"
+
+// PValue.c derives p-values for spectrum annotation using a mixture model for the
+// histogram of match scores. The computation is based on the PeptideProphet paper.
+
+void CalculatePValues(char* ResultsFilePath, char* FinalOutputPath);
+
+#endif // PVALUE_H
diff --git a/ParentMass.c b/ParentMass.c
new file mode 100644
index 0000000..3f4b6f5
--- /dev/null
+++ b/ParentMass.c
@@ -0,0 +1,710 @@
+//Title: ParentMass.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+// ParentMass.c: Routines for parent mass correction. The precursor mass, as supplied,
+// may be off by up to 1 Da (or more, depending on the experiment). Here we determine
+// which parent mass is correct by considering the spectrum's self-convolution: The
+// overlap between b and y peaks should be highest when the parent mass is exactly right.
+//
+// Our implementation: We construct a PMCSpectrumInfo object for the spectrum, which keeps
+// track of PMCInfo nodes. We build one PMCInfo node for each mass we're testing. We
+// compute self-convolutions for each PMCInfo node, and we also compare convolutions across
+// PMCInfo nodes and across mass offsets. Finally, we feed these features into a model
+// which assigns each PMCInfo a score, and we keep the best PMCInfo.
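+//
+// The relation being exploited: for the singly-protonated parent mass M kept in
+// SpectrumInfo->Mass, a b-fragment peak at mass b pairs with its complementary
+// y-fragment near (M + HYDROGEN_MASS - b). The self-convolution therefore sums
+//   Intensity(peak) * BinnedIntensity(M + HYDROGEN_MASS - peak + offset)
+// over all peaks, and it should be largest near offset == 0 when M is correct.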
+
+#include "CMemLeak.h"
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "Utils.h"
+#include "ChargeState.h"
+#include "Spectrum.h"
+#include "Inspect.h"
+#include "SVM.h"
+#include "Errors.h"
+#include "LDA.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#else
+#include <dirent.h>
+#include <sys/stat.h>
+#endif
+
+#define EPSILON (float)0.000001
+
+// Models, for parent mass correction in various charge states:
+extern LDAModel* PMCCharge1LDA;
+extern LDAModel* PMCCharge2LDA;
+extern LDAModel* PMCCharge3LDA;
+SVMModel* PMCCharge1SVM;
+SVMModel* PMCCharge2SVM;
+SVMModel* PMCCharge3SVM;
+
+// For converting parts-per-million:
+#define ONE_MILLION 1000000
+
+///////////////////////////////////////////////////
+// Forward declarations:
+
+///////////////////////////////////////////////////
+// Functions:
+void CharacterizePhosphatePeaks(PMCInfo* Info, PMCSpectrumInfo* SpectrumInfo, int Offset, int FeatureIndex);
+
+// Free PMCSpectrumInfo, which is only kept around during
+// parent mass and charge state correction.
+void FreePMCSpectrumInfo(PMCSpectrumInfo* SpectrumInfo)
+{
+ PMCInfo* Info;
+ PMCInfo* Prev;
+ SelfConvolutionNode* Node;
+ SelfConvolutionNode* PrevNode;
+ int HashIndex;
+ //
+ if (!SpectrumInfo)
+ {
+ return;
+ }
+ // Free PMCInfo list:
+ Prev = NULL;
+ for (Info = SpectrumInfo->Head; Info; Info = Info->Next)
+ {
+ SafeFree(Prev);
+ Prev = Info;
+ }
+ SafeFree(Prev);
+ // Free SelfConvolution list:
+ for (HashIndex = 0; HashIndex < SC_HASH_SIZE; HashIndex++)
+ {
+ PrevNode = NULL;
+ for (Node = SpectrumInfo->SCHash[HashIndex]; Node; Node = Node->Next)
+ {
+ SafeFree(PrevNode);
+ PrevNode = Node;
+ }
+ SafeFree(PrevNode);
+ }
+ // Free SelfConvolution2 list:
+ for (HashIndex = 0; HashIndex < SC_HASH_SIZE; HashIndex++)
+ {
+ PrevNode = NULL;
+ for (Node = SpectrumInfo->SC2Hash[HashIndex]; Node; Node = Node->Next)
+ {
+ SafeFree(PrevNode);
+ PrevNode = Node;
+ }
+ SafeFree(PrevNode);
+ }
+ // Free the parent:
+ SafeFree(SpectrumInfo);
+}
+
+// Build PMCInfo nodes for the masses we'll consider adjusting to. We'll add one node
+// for the core mass, and we'll add some more nodes in the neighborhood (MinMass, MaxMass).
+// The PMCInfo nodes are children of SpectrumInfo.
+void AddPMCNodes(PMCSpectrumInfo* SpectrumInfo, int CoreMass, int MinMass, int MaxMass)
+{
+ PMCInfo* Info;
+ int MassChange;
+ int Mass;
+ //
+ // Iterate from the core mass downward. When you reach the end, iterate
+ // from the core mass (+0.1Da) upward. (Use the 'two-way iteration' instead
+ // of two loops)
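+    // For example, with CoreMass corresponding to 1000.0 Da, MinMass to 999.8 Da
+    // and MaxMass to 1000.2 Da, nodes are created at 1000.0, 999.9, 999.8,
+    // then 1000.1 and 1000.2 Da.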
+ Mass = CoreMass;
+
+ MassChange = -DECI_DALTON;
+ while (1)
+ {
+ if (Mass < MinMass)
+ {
+ MassChange = DECI_DALTON;
+ Mass = CoreMass + MassChange;
+ }
+ if (Mass > MaxMass)
+ {
+ break;
+ }
+ Info = (PMCInfo*)calloc(1, sizeof(PMCInfo));
+ Info->Charge = SpectrumInfo->Charge;
+ Info->ParentMass = Mass;
+ if (!SpectrumInfo->Head)
+ {
+ SpectrumInfo->Head = Info;
+ }
+ else
+ {
+ SpectrumInfo->Tail->Next = Info;
+ }
+ SpectrumInfo->Tail = Info;
+ Mass += MassChange;
+ }
+}
+
+// Compute features for performing parent mass correction.
+// Assumes that the charge state is set!
+void ComputePMCFeatures(PMCSpectrumInfo* SpectrumInfo)
+{
+ int OffsetIndex;
+ int FeatureIndex;
+ int Charge;
+ int BestScoreIndex = 0;
+ int BestRunnerUpIndex = -1;
+ float PMRadius;
+ float AverageConvolution;
+ PMCInfo* Info;
+ MSSpectrum* Spectrum;
+ float MaxConvolution;
+ int InfoCount;
+ float Diff;
+ //
+
+
+ Spectrum = SpectrumInfo->Spectrum;
+
+ // Set the spectrum's mass:
+ Spectrum->ParentMass = Spectrum->MZ * SpectrumInfo->Charge - HYDROGEN_MASS * (SpectrumInfo->Charge - 1); // base mass
+ Charge = min(3, SpectrumInfo->Charge);
+
+
+
+ ////////////////////////////////////////////////////////////
+ // Build PMCInfo structs for allowed masses. We're always allowed a +1 or -1 isotope.
+ // And we're allowed to move around by 0.1Da until our mass error (in PPM) becomes too large.
+ PMRadius = (float)Spectrum->ParentMass;
+ PMRadius *= GlobalOptions->ParentMassPPM / (float)ONE_MILLION;
+ AddPMCNodes(SpectrumInfo, Spectrum->ParentMass,
+ (int)(Spectrum->ParentMass - PMRadius), (int)(Spectrum->ParentMass + PMRadius));
+
+ // We're always allowed a +1 and -1 shift:
+ if (PMRadius < DALTON)
+ {
+ AddPMCNodes(SpectrumInfo, Spectrum->ParentMass - DALTON,
+ (int)(Spectrum->ParentMass - DALTON - PMRadius),
+ (int)(min(Spectrum->ParentMass - DALTON + PMRadius, Spectrum->ParentMass - PMRadius)));
+ AddPMCNodes(SpectrumInfo, Spectrum->ParentMass + DALTON,
+ (int)(max(Spectrum->ParentMass + DALTON - PMRadius, Spectrum->ParentMass + PMRadius)),
+ (int)(Spectrum->ParentMass + DALTON + PMRadius));
+ }
+ // Ok, PMCInfo nodes have now been created.
+    // Perform self-convolution at various parent masses. This populates Info->Convolve and Info->Convolve2.
+ // Along the way, track the *average* and *maximum* self-convolutions.
+ InfoCount = 0;
+ for (Info = SpectrumInfo->Head; Info; Info = Info->Next)
+ {
+ ConvolveMassCorrectedSpectrum(Info, SpectrumInfo);
+ InfoCount++;
+ }
+
+ // Use the self-convolution info to populate the feature-vector for each PMCInfo:
+ for (Info = SpectrumInfo->Head; Info; Info = Info->Next)
+ {
+ FeatureIndex = 0;
+
+ // First feature is derived from the mass offset:
+ if (SpectrumInfo->Charge == 1)
+ {
+ // Absolute Mass offset
+ Info->Features[FeatureIndex++] = (float)fabs((Spectrum->ParentMass - Info->ParentMass) / (float)MASS_SCALE);
+ }
+ else
+ {
+ // Mass offset:
+ Diff = (Spectrum->ParentMass - Info->ParentMass) / (float)MASS_SCALE;
+ Info->Features[FeatureIndex++] = Diff;
+ // Absolute mass offset:
+ Info->Features[FeatureIndex++] = Diff * Diff;
+ }
+
+ ////////////////////////////////////////////////////////////////
+ // Convolution features:
+ // Find the average convolution for several masses:
+ AverageConvolution = 0;
+ MaxConvolution = 0;
+ for (OffsetIndex = 0; OffsetIndex < SELF_CONVOLVE_OFFSETS; OffsetIndex++)
+ {
+ AverageConvolution += Info->Convolve[OffsetIndex];
+ MaxConvolution = max(MaxConvolution, Info->Convolve[OffsetIndex]);
+ }
+ AverageConvolution /= (float)SELF_CONVOLVE_OFFSETS;
+ AverageConvolution = max(EPSILON, AverageConvolution);
+ MaxConvolution = max(EPSILON, MaxConvolution);
+ for (OffsetIndex = 0; OffsetIndex < SELF_CONVOLVE_OFFSETS; OffsetIndex++)
+ {
+ if (OffsetIndex < 4)
+ {
+ Info->Features[FeatureIndex++] = Info->Convolve[OffsetIndex];
+ Info->Features[FeatureIndex++] = Info->Convolve[OffsetIndex] / AverageConvolution;
+ }
+ }
+ // Convolutions of singly- and doubly-charged peaks.
+ // (These features aren't computed for charge 1!)
+ if (SpectrumInfo->Charge > 1)
+ {
+ ////////////////////////////////////////////////////////////////
+ // Convolution2 features:
+ // Find the average convolution for several masses:
+ AverageConvolution = 0;
+ MaxConvolution = 0;
+ for (OffsetIndex = 0; OffsetIndex < SELF_CONVOLVE2_OFFSETS; OffsetIndex++)
+ {
+ AverageConvolution += Info->Convolve2[OffsetIndex];
+ MaxConvolution = max(MaxConvolution, Info->Convolve2[OffsetIndex]);
+ }
+ AverageConvolution /= (float)SELF_CONVOLVE2_OFFSETS;
+ AverageConvolution = max(EPSILON, AverageConvolution);
+ MaxConvolution = max(EPSILON, MaxConvolution);
+ for (OffsetIndex = 0; OffsetIndex < SELF_CONVOLVE2_OFFSETS; OffsetIndex++)
+ {
+ if (OffsetIndex < 3)
+ {
+ Info->Features[FeatureIndex++] = Info->Convolve2[OffsetIndex];
+ Info->Features[FeatureIndex++] = Info->Convolve2[OffsetIndex] / AverageConvolution;
+ }
+ }
+ }
+ if (GlobalOptions->PhosphorylationFlag) // sam's new insertion
+    { //feature is the simple sum of M-p and M-p-H2O intensity and skew
+ CharacterizePhosphatePeaks(Info, SpectrumInfo, PHOSPHATE_WATER_MASS / Info->Charge, 0);
+ CharacterizePhosphatePeaks(Info, SpectrumInfo, (PHOSPHATE_WATER_MASS + WATER_MASS) / Info->Charge, 1);
+ Info->Features[FeatureIndex++] = Info->IntensePeakIntensity[0] + Info->IntensePeakIntensity[1];
+ Info->Features[FeatureIndex++] = (float)(Info->IntensePeakSkew[0] + Info->IntensePeakSkew[1]);
+ //save this information for the charge state model.
+ Info->IntensePeakIntensity[2] = Info->IntensePeakIntensity[0] + Info->IntensePeakIntensity[1];
+ }
+ }
+}
+
+// Get features of a possible phosphate-loss peak by looking for the most
+// intense peak within a given range of the expected mass.
+void CharacterizePhosphatePeaks(PMCInfo* Info, PMCSpectrumInfo* SpectrumInfo, int Offset, int FeatureIndex)
+{
+ MSSpectrum* Spectrum;
+ int PeakIndex = -1;
+    int MZ; // m/z implied by this PMCInfo's guess at the parent mass, not the value listed in the file.
+ int Epsilon = (int)(0.5 * DALTON);
+ int SavedPeakIndex = -1;
+ int Difference;
+ int Skew = 0;
+ float Intensity = 0;
+ float TotalIntensity = 0;
+ int ExpectedPeakMass;
+ //
+ Spectrum = SpectrumInfo->Spectrum;
+ MZ = (Info->ParentMass + (Info->Charge - 1) * HYDROGEN_MASS) / Info->Charge;
+ ExpectedPeakMass = MZ - Offset;
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ TotalIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ Difference = abs(Spectrum->Peaks[PeakIndex].Mass - ExpectedPeakMass);
+ if (Difference > Epsilon)
+ {
+ continue;
+ }
+ if (Spectrum->Peaks[PeakIndex].Intensity > Intensity)
+ {
+ Intensity = Spectrum->Peaks[PeakIndex].Intensity;
+ Skew = Difference;
+ SavedPeakIndex = PeakIndex;
+ }
+ }
+    if (SavedPeakIndex >= 0) // a peak at index 0 is a valid match
+ {
+ Info->IntensePeakIndex[FeatureIndex] = SavedPeakIndex;
+ Info->IntensePeakIntensity[FeatureIndex] = Intensity / TotalIntensity; //percent total
+ Info->IntensePeakSkew[FeatureIndex] = Skew;
+ }
+ else
+ {//nothing found, save zeros
+ Info->IntensePeakIndex[FeatureIndex] = 0;
+ Info->IntensePeakIntensity[FeatureIndex] = 0;
+ Info->IntensePeakSkew[FeatureIndex] = 0;
+ }
+}
+
+
+// Carry out parent mass correction on this spectrum.
+void PerformPMC(PMCSpectrumInfo* SpectrumInfo)
+{
+ PMCInfo* Info;
+
+ //
+#ifdef PMC_USE_SVM
+ LoadPMCSVM(0);
+#else
+ LoadPMCLDA(0);
+#endif
+ ComputePMCFeatures(SpectrumInfo);
+ // If we don't have a model (yet), then give the FIRST mass the best score:
+
+
+ switch (SpectrumInfo->Head->Charge)
+ {
+ case 1:
+ if (!PMCCharge1LDA && !PMCCharge1SVM)
+ {
+ SpectrumInfo->BestInfo = SpectrumInfo->Head;
+ SpectrumInfo->Head->SVMScore = 1.0;
+ return;
+ }
+ break;
+ case 3:
+ if (!PMCCharge3LDA && !PMCCharge3SVM)
+ {
+ SpectrumInfo->BestInfo = SpectrumInfo->Head;
+ SpectrumInfo->Head->SVMScore = 1.0;
+ return;
+ }
+ break;
+ default:
+ if (!PMCCharge2LDA && !PMCCharge2SVM)
+ {
+ SpectrumInfo->BestInfo = SpectrumInfo->Head;
+ SpectrumInfo->Head->SVMScore = 1.0;
+ return;
+ }
+ break;
+ }
+
+ // Apply the machine learning model to each one:
+ for (Info = SpectrumInfo->Head; Info; Info = Info->Next)
+ {
+ if (Info->Charge == 1)
+ {
+#ifdef PMC_USE_SVM
+ Info->SVMScore = SVMClassify(PMCCharge1SVM, Info->Features, 0);
+#else
+ Info->SVMScore = ApplyLDAModel(PMCCharge1LDA, Info->Features);
+#endif
+ }
+ else if (Info->Charge == 2)
+ {
+#ifdef PMC_USE_SVM
+ Info->SVMScore = SVMClassify(PMCCharge2SVM, Info->Features, 0);
+#else
+ Info->SVMScore = ApplyLDAModel(PMCCharge2LDA, Info->Features);
+#endif
+ }
+ else
+ {
+#ifdef PMC_USE_SVM
+ Info->SVMScore = SVMClassify(PMCCharge3SVM, Info->Features, 0);
+#else
+ Info->SVMScore = ApplyLDAModel(PMCCharge3LDA, Info->Features);
+#endif
+ }
+ }
+ // Remember the best one:
+ for (Info = SpectrumInfo->Head; Info; Info = Info->Next)
+ {
+ if (!SpectrumInfo->BestInfo || Info->SVMScore > SpectrumInfo->BestInfo->SVMScore)
+ {
+ SpectrumInfo->BestInfo = Info;
+ }
+ }
+ // Remember the second-best one:
+ for (Info = SpectrumInfo->Head; Info; Info = Info->Next)
+ {
+ //if (Info == SpectrumInfo->BestInfo)
+ //{
+ // continue;
+ //}
+ if (abs(Info->ParentMass - SpectrumInfo->BestInfo->ParentMass) <= 400)
+ {
+ continue;
+ }
+
+ if (!SpectrumInfo->RunnerUpInfo || (Info->SVMScore > SpectrumInfo->RunnerUpInfo->SVMScore))
+ {
+ SpectrumInfo->RunnerUpInfo = Info;
+ }
+ }
+}
+
+// Load parent mass correction SVM models.
+int LoadPMCSVM()
+{
+ char FilePath[1024];
+ if (PMCCharge1SVM)
+ {
+ return 1;
+ }
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PMCSVM1.model");
+ PMCCharge1SVM = ReadSVMModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PMCSVM1.range");
+ ReadSVMScaling(PMCCharge1SVM, FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PMCSVM2.model");
+ PMCCharge2SVM = ReadSVMModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PMCSVM2.range");
+ ReadSVMScaling(PMCCharge2SVM, FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PMCSVM3.model");
+ PMCCharge3SVM = ReadSVMModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "PMCSVM3.range");
+ ReadSVMScaling(PMCCharge3SVM, FilePath);
+ return 1;
+}
+
+// Build a PMCSpectrumInfo instance for the spectrum. Assumes the charge state is set.
+PMCSpectrumInfo* GetPMCSpectrumInfo(MSSpectrum* Spectrum)
+{
+ PMCSpectrumInfo* SpectrumInfo;
+ float SelfConvolve;
+ int PeakIndex;
+ int Bin;
+ float Intensity;
+ //
+ SpectrumInfo = (PMCSpectrumInfo*)calloc(1, sizeof(PMCSpectrumInfo));
+ SpectrumInfo->Spectrum = Spectrum;
+ SpectrumInfo->Charge = Spectrum->Charge;
+ SpectrumInfo->Mass = (Spectrum->MZ * Spectrum->Charge) - ((Spectrum->Charge - 1) * HYDROGEN_MASS);
+
+
+ //printf("A2\n");
+ //fflush(stdout);
+
+ // Scale spectrum peaks to a TOTAL intensity of 100:
+ Intensity = 0;
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ Intensity += Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ //printf("B2\n");
+ //fflush(stdout);
+ SpectrumInfo->PeakScalingFactor = 100 / Intensity;
+ SpectrumInfo->PeakScalingFactor *= SpectrumInfo->PeakScalingFactor;
+
+ // Compute self-convolution:
+ SelfConvolve = EPSILON;
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ Bin = (Spectrum->Peaks[PeakIndex].Mass + 50) / 100;
+ if (Bin >= 0 && Bin < Spectrum->IntensityBinCount)
+ {
+ Intensity = Spectrum->BinnedIntensitiesTight[Bin];
+ SelfConvolve += Spectrum->Peaks[PeakIndex].Intensity * Intensity * SpectrumInfo->PeakScalingFactor;
+ }
+ }
+ //printf("C2\n");
+ //fflush(stdout);
+ SpectrumInfo->SelfConvolution = SelfConvolve;
+ //printf("D2\n");
+ //fflush(stdout);
+ return SpectrumInfo;
+}
+
+float SpectrumGetSelfConvolution(MSSpectrum* Spectrum, PMCSpectrumInfo* SpectrumInfo, int Offset, int DoublyChargedFlag)
+{
+ SelfConvolutionNode* Node;
+ SelfConvolutionNode* OldNode;
+
+ int PeakIndex;
+ int OtherMass;
+ int Bin;
+ float Product;
+ int VerboseFlag = 0;
+ float Convolution;
+ int HashIndex;
+ //
+
+ HashIndex = abs(Offset / 100) % SC_HASH_SIZE;
+ // If the self-convolution has already been computed, then we simply look it up:
+ if (DoublyChargedFlag)
+ {
+ Node = SpectrumInfo->SC2Hash[HashIndex];
+ }
+ else
+ {
+ Node = SpectrumInfo->SCHash[HashIndex];
+ }
+ for (; Node; Node = Node->Next)
+ {
+ if (Node->MassOffset == Offset)
+ {
+ //printf("SGSC%d: Return already-computed for offset %d\n", DoublyChargedFlag, Offset);
+ return Node->Value;
+ }
+ }
+
+ //printf("SGSC%d: Compute for offset %d\n", DoublyChargedFlag, Offset);
+ // Compute convolution value for these parameters:
+ Convolution = 0;
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ if (DoublyChargedFlag)
+ {
+ OtherMass = SpectrumInfo->Mass + 2 * HYDROGEN_MASS - (2 * Spectrum->Peaks[PeakIndex].Mass) + Offset;
+ }
+ else
+ {
+ OtherMass = SpectrumInfo->Mass + HYDROGEN_MASS - Spectrum->Peaks[PeakIndex].Mass + Offset;
+ }
+ Bin = ((OtherMass + 50) / 100);
+ if (Bin < 0 || Bin >= Spectrum->IntensityBinCount)
+ {
+ continue;
+ }
+ Product = Spectrum->Peaks[PeakIndex].Intensity * Spectrum->BinnedIntensitiesTight[Bin] * SpectrumInfo->PeakScalingFactor;
+ if (VerboseFlag && Product)
+ {
+ printf("Peak@%.2f and binned intensity %d (%.2f) -> %.5f\n", Spectrum->Peaks[PeakIndex].Mass / (float)DALTON,
+ Bin, OtherMass / (float)DALTON, Product);
+ }
+ Convolution += Product;
+ }
+ Node = (SelfConvolutionNode*)calloc(1, sizeof(SelfConvolutionNode));
+ Node->MassOffset = Offset;
+ Node->Value = Convolution / SpectrumInfo->SelfConvolution;
+ if (DoublyChargedFlag)
+ {
+ if (SpectrumInfo->SC2Hash[HashIndex])
+ {
+ OldNode = SpectrumInfo->SC2Hash[HashIndex];
+ while (OldNode->Next)
+ {
+ OldNode = OldNode->Next;
+ }
+ OldNode->Next = Node;
+ }
+ else
+ {
+ SpectrumInfo->SC2Hash[HashIndex] = Node;
+ }
+ }
+ else
+ {
+ if (SpectrumInfo->SCHash[HashIndex])
+ {
+ OldNode = SpectrumInfo->SCHash[HashIndex];
+ while (OldNode->Next)
+ {
+ OldNode = OldNode->Next;
+ }
+ OldNode->Next = Node;
+ }
+ else
+ {
+ SpectrumInfo->SCHash[HashIndex] = Node;
+ }
+ }
+ return Node->Value;
+}
+
+
+// ConvolveMassCorrectedSpectrum computes self-convolution for the given charge
+// and parent mass.
+void ConvolveMassCorrectedSpectrum(PMCInfo* Info, PMCSpectrumInfo* SpectrumInfo)
+{
+ MSSpectrum* Spectrum;
+ int OffsetIndex;
+ int VerboseFlag = 0;
+ int OverallOffset;
+
+    // The Offsets array consists of some mass offsets where we expect LARGE self-convolution, followed by others where we expect SMALL
+ // self-convolution:
+ int Offsets[SELF_CONVOLVE_OFFSETS] = {-18 * DALTON, -17 * DALTON, 0 * DALTON, 1 * DALTON,
+ -1 * DALTON, (int)(0.5 * DALTON), (int)(-16.5 * DALTON)};
+ int Offsets2[SELF_CONVOLVE2_OFFSETS] = {(int)(0.4 * DALTON), (int)(1.2 * DALTON), (int)(-17.5 * DALTON),
+ -1 * DALTON, 4 * DALTON};
+
+ if(GlobalOptions->PhosphorylationFlag)
+ {//for phos searches, these offsets produce much better results.
+ Offsets2[0] = (int)(0.2 * DALTON);
+ Offsets2[2] = (int)(-18.0 * DALTON);
+ }
+
+ //
+ Spectrum = SpectrumInfo->Spectrum;
+ if (!Spectrum->BinnedIntensities) // move to caller!
+ {
+ REPORT_ERROR_S(4, "Error in ConvolveMassCorrectedSpectrum(): Spectrum binned intensities not set.\n");
+ return;
+ }
+ for (OffsetIndex = 0; OffsetIndex < SELF_CONVOLVE_OFFSETS; OffsetIndex++)
+ {
+ if (VerboseFlag)
+ {
+ printf("\n>>>Offset %d: %.2f\n", OffsetIndex, Offsets[OffsetIndex] / (float)DALTON);
+ }
+ OverallOffset = Offsets[OffsetIndex] + (Info->ParentMass - SpectrumInfo->Mass);
+ Info->Convolve[OffsetIndex] = SpectrumGetSelfConvolution(Spectrum, SpectrumInfo, OverallOffset, 0);
+
+ //// Compute convolution value for these parameters:
+ //Convolution = 0;
+ //for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ //{
+ // OtherMass = Info->ParentMass + HYDROGEN_MASS - Spectrum->Peaks[PeakIndex].Mass + Offsets[OffsetIndex];
+ // Bin = ((OtherMass + 50) / 100);
+ // if (Bin < 0 || Bin >= Spectrum->IntensityBinCount)
+ // {
+ // continue;
+ // }
+ // Product = Spectrum->Peaks[PeakIndex].Intensity * Spectrum->BinnedIntensitiesTight[Bin] * SpectrumInfo->PeakScalingFactor;
+ // if (VerboseFlag && Product)
+ // {
+ // printf("Peak@%.2f and binned intensity %d (%.2f) -> %.5f\n", Spectrum->Peaks[PeakIndex].Mass / (float)DALTON,
+ // Bin, OtherMass / (float)DALTON, Product);
+ // }
+ // Convolution += Product;
+ //}
+ //Info->Convolve[OffsetIndex] = Convolution / SpectrumInfo->SelfConvolution;
+ //if (VerboseFlag)
+ //{
+ // printf(">>Convolve[%d] = %.4f\n", OffsetIndex, Convolution);
+ //}
+ }
+
+ if (Spectrum->Charge > 1)//I compute these values for phos charge 2, but don't use them for the PMC model.
+ { //they do go into the ChargeCorrection model, so they are still calculated.
+ // Compute convolution of charge-1 and charge-2 peaks:
+ for (OffsetIndex = 0; OffsetIndex < SELF_CONVOLVE2_OFFSETS; OffsetIndex++)
+ {
+ OverallOffset = Offsets2[OffsetIndex] + (Info->ParentMass - SpectrumInfo->Mass);
+ Info->Convolve2[OffsetIndex] = SpectrumGetSelfConvolution(Spectrum, SpectrumInfo, OverallOffset, 1);
+ //Convolution = 0;
+ //for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ //{
+ // OtherMass = Info->ParentMass + 2 * HYDROGEN_MASS - (2 * Spectrum->Peaks[PeakIndex].Mass) + Offsets2[OffsetIndex];
+ // Bin = ((OtherMass + 50) / 100);
+ // if (Bin < 0 || Bin >= Spectrum->IntensityBinCount)
+ // {
+ // continue;
+ // }
+ // Convolution += Spectrum->Peaks[PeakIndex].Intensity * Spectrum->BinnedIntensitiesTight[Bin] * SpectrumInfo->PeakScalingFactor;
+ //}
+ //Info->Convolve2[OffsetIndex] = Convolution / SpectrumInfo->SelfConvolution;
+ }
+ }
+}
diff --git a/ParentMass.h b/ParentMass.h
new file mode 100644
index 0000000..1e5e691
--- /dev/null
+++ b/ParentMass.h
@@ -0,0 +1,105 @@
+//Title: ParentMass.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef PARENT_MASS_H
+#define PARENT_MASS_H
+
+
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include "Utils.h"
+#include "Inspect.h"
+#include "Spectrum.h"
+
+#define SELF_CONVOLVE_OFFSETS 7
+#define SELF_CONVOLVE2_OFFSETS 5
+
+// A linked list of self-convolutions for a spectrum.
+// We keep this list in the PMCSpectrumInfo, because many
+// of the PMCInfo objects will re-use the same self-convolutions;
+// it's expensive to re-compute them.
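+// Nodes are kept in small hash tables inside PMCSpectrumInfo, keyed by
+// abs(MassOffset / 100) % SC_HASH_SIZE (see SpectrumGetSelfConvolution in
+// ParentMass.c).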
+typedef struct SelfConvolutionNode
+{
+ int MassOffset;
+ float Value;
+ struct SelfConvolutionNode* Next;
+} SelfConvolutionNode;
+
+#define SC_HASH_SIZE 64
+// PMCSpectrumInfo is data used during parent mass correction; here we store
+// intermediate values which are general across the whole spectrum, so that
+// we needn't re-compute them for each PMCInfo
+typedef struct PMCSpectrumInfo
+{
+ MSSpectrum* Spectrum;
+ int Charge;
+ float PeakScalingFactor;
+ float SelfConvolution;
+ struct PMCInfo* Head;
+ struct PMCInfo* Tail;
+ struct PMCInfo* BestInfo;
+ struct PMCInfo* RunnerUpInfo;
+ int Mass; // base mass, from the file
+ SelfConvolutionNode* SCHash[SC_HASH_SIZE];
+ //SelfConvolutionNode* SCTail;
+ //SelfConvolutionNode* SC2Head;
+ SelfConvolutionNode* SC2Hash[SC_HASH_SIZE];
+} PMCSpectrumInfo;
+
+// We allocate one PMCInfo struct for each candidate parent mass. We store
+// the SVM features here, along with the mass and other bookkeeping info.
+// The PMCInfo structs are kept in a list. The final tweaks that we keep are
+// the best 1..3 PMCInfos
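+// Candidate masses are spaced DECI_DALTON (0.1 Da) apart within the parent-mass
+// PPM radius, plus the -1 Da and +1 Da isotope neighborhoods when that radius is
+// under 1 Da (see ComputePMCFeatures in ParentMass.c).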
+typedef struct PMCInfo
+{
+ int Charge;
+ int ParentMass;
+
+ float Features[64];
+ float Convolve[SELF_CONVOLVE_OFFSETS];
+ float Convolve2[SELF_CONVOLVE_OFFSETS];
+ float SVMScore;
+ struct PMCInfo* Next;
+    int IntensePeakIndex[6]; //these are for keeping track of possible M-p related peaks, which are often very intense
+    float IntensePeakIntensity[6]; //fraction of the total spectrum intensity
+    int IntensePeakSkew[6]; //index 0 = M-p, 1 = M-p-H2O, 2 = combined feature
+} PMCInfo;
+
+void PerformPMC(PMCSpectrumInfo* SpectrumInfo);
+void FreePMCSpectrumInfo(PMCSpectrumInfo* SpectrumInfo);
+void ComputePMCFeatures(PMCSpectrumInfo* SpectrumInfo);
+PMCSpectrumInfo* GetPMCSpectrumInfo(MSSpectrum* Spectrum);
+void ConvolveMassCorrectedSpectrum(PMCInfo* Info, PMCSpectrumInfo* SpectrumInfo);
+
+#endif // PARENT_MASS_H
diff --git a/ParseInput.c b/ParseInput.c
new file mode 100644
index 0000000..0698661
--- /dev/null
+++ b/ParseInput.c
@@ -0,0 +1,1653 @@
+//Title: ParseInput.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+// ParseInput.c is responsible for parsing the Inspect input file.
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <string.h>
+#include <locale.h>
+#include "Trie.h"
+#include "Utils.h"
+#include "Spectrum.h"
+#include "Mods.h"
+#include "Score.h"
+#include "Tagger.h"
+#include "FreeMod.h"
+#include "CMemLeak.h"
+#include "SVM.h"
+#include "BN.h"
+#include "Run.h"
+#include "SNP.h"
+#include "SpliceDB.h"
+#include "ChargeState.h"
+#include "Scorpion.h"
+#include "ParseXML.h"
+#include "SpliceScan.h"
+#include "Errors.h"
+#include "IonScoring.h"
+#include "TagFile.h" //ARI_MOD
+//#include "SuffixArray.h"
+
+// If the input file specifies a directory full of spectra, we must iterate over the files in that directory.
+// That works a bit differently on Windows and on Unix.
+#ifdef _WIN32
+#include <windows.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#else
+#include <dirent.h>
+#include <sys/stat.h>
+#endif
+
+// Global variables:
+extern Options* GlobalOptions;
+extern MSSpectrum* Spectrum;
+
+// Array of spectra to be searched. We put them into an array so that we can qsort
+// them. (Not crucial, but it's nice to get output in order)
+SpectrumNode* g_BigNodeArray = NULL;
+
+extern StringNode* FirstTagCheckNode;
+extern StringNode* LastTagCheckNode;
+
+void AddDatabase(DatabaseFile* Database)
+{
+ if (!Database)
+ {
+ return;
+ }
+ if (!GlobalOptions->FirstDatabase)
+ {
+ GlobalOptions->FirstDatabase = Database;
+ GlobalOptions->LastDatabase = Database;
+ }
+ else
+ {
+ GlobalOptions->LastDatabase->Next = Database;
+ GlobalOptions->LastDatabase = Database;
+ }
+}
+
+// Parse a FASTA file, and convert it into a .trie file. This is the same
+// thing as PrepDB.py.
+void PrepareSecondarySequenceFile(char* FileName)
+{
+ FILE* FastaFile;
+ FILE* DBFile;
+ FILE* IndexFile;
+ int Dummy = 0;
+ char Char;
+ int BytesRead;
+ int NameLength = 0;
+ int ReadingName = 0;
+ int TargetDBPos = 0;
+ int SourceFilePos = 0;
+ char* StarChar = "*";
+ char* NullChar = "\0";
+ char TempDBName[MAX_FILENAME_LEN + 1];
+
+ DatabaseFile* Database;
+ //
+ FastaFile = fopen(FileName, "rb");
+ if (!FastaFile)
+ {
+ printf("Couldn't open %s\n",FileName);
+        // If what we got looks like a complete path, then keep the path:
+ if (FileName[0]=='/' || FileName[0]=='.' || FileName[1]==':')
+ {
+ REPORT_ERROR_S(8, FileName);
+ return;
+
+ }
+ else
+ {
+ // Otherwise, go to $resourcedir\database
+ sprintf(TempDBName, "%sDatabase%c%s", GlobalOptions->ResourceDir, SEPARATOR, FileName);
+
+ FastaFile = fopen(TempDBName,"rb");
+ if(!FastaFile)
+ {
+ //if not in /Database, look in just the resourcedir
+ sprintf(TempDBName, "%s%s", GlobalOptions->ResourceDir,FileName);
+
+ FastaFile = fopen(TempDBName,"rb");
+ }
+ }
+ if(!FastaFile)
+ {
+ REPORT_ERROR_S(8, FileName);
+ return;
+ }
+
+ }
+
+
+
+ DBFile = fopen("AdditionalDB.trie", "wb");
+ IndexFile = fopen("AdditionalDB.index", "wb");
+ if (!DBFile || !IndexFile)
+ {
+ printf("Unable to write out processed secondary database! Skipping.\n");
+ return;
+ }
+ while (1)
+ {
+ BytesRead = ReadBinary(&Char, sizeof(char), 1, FastaFile);
+ if (!BytesRead)
+ {
+ break;
+ }
+ if (Char == '>')
+ {
+ ReadingName = 1;
+ if (TargetDBPos)
+ {
+ WriteBinary(StarChar, sizeof(char), 1, DBFile);
+ TargetDBPos++;
+ }
+ WriteBinary(&SourceFilePos, sizeof(int), 1, IndexFile);
+ // Source file pos is a long long; assume we must write another 4 bytes:
+ WriteBinary(&Dummy, sizeof(int), 1, IndexFile);
+ WriteBinary(&TargetDBPos, sizeof(int), 1, IndexFile);
+ NameLength = 0;
+ continue;
+ }
+ if (Char == '\r' || Char == '\n')
+ {
+ if (ReadingName)
+ {
+ // Pad the protein name out:
+ while (NameLength < 80)
+ {
+ WriteBinary(NullChar, sizeof(char), 1, IndexFile);
+ NameLength++;
+ }
+
+ }
+ ReadingName = 0;
+ continue;
+ }
+ if (ReadingName)
+ {
+ if (NameLength < 79)
+ {
+ WriteBinary(&Char, sizeof(char), 1, IndexFile);
+ NameLength++;
+ }
+
+ continue;
+ }
+ if (Char == ' ' || Char == '\t' || Char == '*')
+ {
+ continue;
+ }
+ WriteBinary(&Char, sizeof(char), 1, DBFile);
+ TargetDBPos++;
+ }
+ fclose(DBFile);
+ fclose(IndexFile);
+ Database = (DatabaseFile*)calloc(1, sizeof(DatabaseFile));
+ strcpy(Database->FileName, "AdditionalDB.trie");
+ Database->Type = evDBTypeTrie;
+ AddDatabase(Database);
+}
+
+// Free the linked list of TagCheckNodes
+void FreeTagCheckNodes()
+{
+ StringNode* Node;
+ StringNode* Prev = NULL;
+ for (Node = FirstTagCheckNode; Node; Node = Node->Next)
+ {
+ if (Prev)
+ {
+ SafeFree(Prev->String);
+ SafeFree(Prev);
+ }
+ Prev = Node;
+ }
+ if (Prev)
+ {
+ SafeFree(Prev->String);
+ SafeFree(Prev);
+ }
+}
+
+void FreeInputFileNodes()
+{
+ InputFileNode* Node;
+ InputFileNode* Prev = NULL;
+
+ // At this point, freeing doesn't matter much, since this function's called
+ // just before program exit.
+ for (Node = GlobalOptions->FirstFile; Node; Node = Node->Next)
+ {
+ SafeFree(Prev);
+ Prev = Node;
+ }
+ SafeFree(Prev);
+}
+
+char* ProteaseNames[] = {"none", "trypsin", "chymotrypsin", "lysc", "aspn", "gluc"};
+
+typedef struct ParseSpectraFromFileInfo
+{
+ int FirstScan;
+ int LastScan;
+    int ScanNumber; //This is a user-defined number attached to each spectrum. In mzXML files, it is read from the scanNumber field;
+    //for other file types it is a 0-based index.
+ int SpecIndex; //This is the 1-based index of the spectrum in the file. In mzXML files, the MS1 scans are not counted.
+ InputFileNode* InputFile;
+} ParseSpectraFromFileInfo;
+
+// Callback for ParseSpectraFromMS2File: Handle one line of an .ms2 spectrum file.
+int ParseSpectraFromMS2FileCallback(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ ParseSpectraFromFileInfo* Info;
+ //
+ Info = (ParseSpectraFromFileInfo*)UserData;
+ if (LineBuffer[0] == ':')
+ {
+ Info->InputFile->Format = SPECTRUM_FORMAT_MS2_COLONS;
+ Info->ScanNumber = atoi(LineBuffer + 1);
+
+ if (Info->ScanNumber >= Info->FirstScan && (Info->LastScan < 0 || Info->ScanNumber < Info->LastScan))
+ {
+ AddSpectrumToList(Info->InputFile, FilePos, Info->ScanNumber, Info->SpecIndex);
+ }
+        Info->SpecIndex++;
+ }
+ if (LineBuffer[0] == 'S' && (LineBuffer[1] == ' ' || LineBuffer[1] == '\t'))
+ {
+ // Start of a spectrum:
+ Info->ScanNumber = atoi(LineBuffer + 1);
+ Info->InputFile->Format = SPECTRUM_FORMAT_MS2;
+
+
+ if (Info->ScanNumber >= Info->FirstScan && (Info->LastScan < 0 || Info->ScanNumber < Info->LastScan))
+ {
+ AddSpectrumToList(Info->InputFile, FilePos, Info->ScanNumber, Info->SpecIndex);
+ }
+        Info->SpecIndex++;
+ }
+ if (LineBuffer[0] == 'Z' && (LineBuffer[1] == ' ' || LineBuffer[1] == '\t'))
+ {
+ //// This is the start of a spectrum:
+ //Info->InputFile->Format = SPECTRUM_FORMAT_MS2;
+ //if (Info->ScanNumber >= Info->FirstScan && (Info->LastScan < 0 || Info->ScanNumber < Info->LastScan))
+ //{
+ // AddSpectrumToList(Info->InputFile, FilePos, Info->ScanNumber);
+ //}
+ }
+ return 1;
+}
+
+// Iterate over lines in the MS2 file.
+// A line beginning with 'S' (or, in the colon-delimited variant, ':') starts a spectrum record
+// and carries the scan number; "Z [charge] [mass]" lines follow it.
+// The spectrum code knows that it should process the first Z it sees, skip any others, then process peaks
+// until it sees something that is not a peak.
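+// A typical .ms2 record looks roughly like this (values are illustrative only):
+//
+//   S   42      42      694.32
+//   Z   2       1387.63
+//   175.119  880.5
+//   ...
+//
+// In the colon-delimited variant, each record instead starts with ":<ScanNumber>".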
+void ParseSpectraFromMS2File(char* FileName, InputFileNode* InputFile, int FirstScan, int LastScan)
+{
+ FILE* MS2File;
+ ParseSpectraFromFileInfo Info;
+ //
+ MS2File = fopen(FileName, "rb");
+ if (!MS2File)
+ {
+ REPORT_ERROR_S(8, FileName);
+ return;
+ }
+ printf("Count spectra from '%s'...\n", FileName);
+ Info.FirstScan = FirstScan;
+ Info.LastScan = LastScan;
+ Info.InputFile = InputFile;
+ Info.SpecIndex = 1;
+ ParseFileByLines(MS2File, ParseSpectraFromMS2FileCallback, &Info, 0);
+ fclose(MS2File);
+}
+
+// Callback for ParseSpectraFromMGFFile: Handle one line of an .mgf spectrum file.
+int ParseSpectraFromMGFFileCallback(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ ParseSpectraFromFileInfo* Info;
+ //
+ Info = (ParseSpectraFromFileInfo*)UserData;
+ if (!strncmp(LineBuffer, "BEGIN IONS", 10))
+ {
+
+ if (Info->ScanNumber >= Info->FirstScan && (Info->LastScan < 0 || Info->ScanNumber <= Info->LastScan))
+ {
+ AddSpectrumToList(Info->InputFile, FilePos, Info->ScanNumber, Info->SpecIndex);
+ }
+ Info->SpecIndex++;
+ Info->ScanNumber++;
+ }
+ return 1;
+}
+
+// Callback for ParseSpectraFromCDTAFile: Handle one line of a .cdta file.
+// Assume: the header line begins with '=' and the second '.'-delimited token is the scan number.
+int ParseSpectraFromCDTAFileCallback(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ ParseSpectraFromFileInfo* Info;
+ char* StrA;
+ char* StrB;
+ int ScanNumber;
+ //
+ Info = (ParseSpectraFromFileInfo*)UserData;
+ if (!strncmp(LineBuffer, "=", 1)) // This denotes the beginning of a new spectrum
+ {
+ StrA = strtok(LineBuffer, ".");
+ StrB = strtok(NULL, ".");
+ if (!StrA || !StrB)
+ {
+ // Malformed header - no scan number to parse.
+ return 1;
+ }
+ ScanNumber = atoi(StrB);
+ if (ScanNumber >= Info->FirstScan && (Info->LastScan < 0 || ScanNumber <= Info->LastScan))
+ {
+ AddSpectrumToList(Info->InputFile, FilePos, ScanNumber, Info->SpecIndex);
+ }
+ Info->SpecIndex++;
+ }
+ return 1;
+}
+
+// Callback for ParseSpectraFromPKLFile: Handle one line of a .pkl spectrum file.
+// Assume: If there are three numbers, then this line is the header of a spectrum.
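+// Illustrative .pkl header line (values are made up): "445.12 23567.0 2",
+// i.e. precursor m/z, precursor intensity, and charge; the peak lines that follow carry only two fields.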
+int ParseSpectraFromPKLFileCallback(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ ParseSpectraFromFileInfo* Info;
+ char* StrA;
+ char* StrB;
+ char* StrC;
+ float FloatValue;
+ int IntValue;
+ //
+ Info = (ParseSpectraFromFileInfo*)UserData;
+
+ // First, check to see that there are three fields on this line of the file:
+ StrA = strtok(LineBuffer, WHITESPACE);
+ StrB = strtok(NULL, WHITESPACE);
+ if (!StrB)
+ {
+ return 1;
+ }
+ StrC = strtok(NULL, WHITESPACE);
+ if (!StrC)
+ {
+ return 1;
+ }
+ // Now, check to see that the three fields are valid numbers:
+ FloatValue = (float)atof(StrA);
+ if (!FloatValue)
+ {
+ return 1;
+ }
+ FloatValue = (float)atof(StrB);
+ if (!FloatValue)
+ {
+ return 1;
+ }
+ IntValue = atoi(StrC);
+
+ if (Info->ScanNumber >= Info->FirstScan && (Info->LastScan < 0 || Info->ScanNumber <= Info->LastScan))
+ {
+ AddSpectrumToList(Info->InputFile, FilePos, Info->ScanNumber, Info->SpecIndex);
+ }
+ Info->SpecIndex++;
+ Info->ScanNumber++;
+ return 1;
+}
+
+void ParseSpectraFromPKLFile(char* FileName, InputFileNode* InputFile,
+ int FirstScan, int LastScan)
+{
+ FILE* SpectrumFile;
+ ParseSpectraFromFileInfo Info;
+ //
+ SpectrumFile = fopen(FileName, "rb");
+ if (!SpectrumFile)
+ {
+ REPORT_ERROR_S(8, FileName);
+ return;
+ }
+ printf("Count spectra from '%s'...\n", FileName);
+ Info.FirstScan = FirstScan;
+ Info.LastScan = LastScan;
+ Info.InputFile = InputFile;
+ Info.ScanNumber = 0;
+ Info.SpecIndex = 1;
+ ParseFileByLines(SpectrumFile, ParseSpectraFromPKLFileCallback, &Info, 0);
+ fclose(SpectrumFile);
+}
+
+
+// Iterate over lines in the CDTA file.
+// This format is a concatenation of DTA files, where each DTA record is introduced by a line
+// beginning with '='. We try to parse a scan number from that header line, which is expected
+// to have the form =======FileName.StartScan.EndScan.Charge.dta==========
+// StartScan is taken as the scan number.
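+// For example (illustrative file name and values):
+//   ==================MyRun.02041.02041.2.dta==================
+// would be indexed as scan number 2041.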
+void ParseSpectraFromCDTAFile(char* FileName, InputFileNode* InputFile, int FirstScan, int LastScan)
+{
+ FILE* CDTAFile;
+ ParseSpectraFromFileInfo Info;
+
+ CDTAFile = fopen(FileName, "rb");
+ if (!CDTAFile)
+ {
+ REPORT_ERROR_S(8, FileName);
+ return;
+ }
+ printf("Count spectra from '%s'...\n", FileName);
+ Info.FirstScan = FirstScan;
+ Info.LastScan = LastScan;
+ Info.InputFile = InputFile;
+ Info.ScanNumber = 0;
+ Info.SpecIndex = 1;
+ ParseFileByLines(CDTAFile, ParseSpectraFromCDTAFileCallback, &Info, 0);
+ fclose(CDTAFile);
+}
+
+// Iterate over lines in the MGF file.
+// If you reach a line of the form "BEGIN IONS", that starts a spectrum record.
+// The spectrum parse code knows that it should process a CHARGE line, a PEPMASS line, then peak lines.
+// Note: Scan numbers from the MGF file are *ignored*! The first scan we see is number 0,
+// the next is number 1, etc.
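+// Illustrative MGF record (values are made up):
+//   BEGIN IONS
+//   PEPMASS=497.85
+//   CHARGE=2+
+//   129.10 1240.2
+//   ...peak lines...
+//   END IONS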
+void ParseSpectraFromMGFFile(char* FileName, InputFileNode* InputFile,
+ int FirstScan, int LastScan)
+{
+ FILE* MGFFile;
+ ParseSpectraFromFileInfo Info;
+ //
+ MGFFile = fopen(FileName, "rb");
+ if (!MGFFile)
+ {
+ REPORT_ERROR_S(8, FileName);
+ return;
+ }
+ printf("Count spectra from '%s'...\n", FileName);
+ Info.FirstScan = FirstScan;
+ Info.LastScan = LastScan;
+ Info.InputFile = InputFile;
+ Info.ScanNumber = 0;
+ Info.SpecIndex = 1;
+ ParseFileByLines(MGFFile, ParseSpectraFromMGFFileCallback, &Info, 0);
+ fclose(MGFFile);
+}
+
+void AddSpectrumNodesForFile(char* FileName, InputFileNode* InputFile,
+ int FirstScan, int LastScan)
+{
+ int Format;
+ //
+ // Based upon the file extension, decide whether and how to parse the input file
+ Format = GuessSpectrumFormatFromExtension(FileName);
+ InputFile->Format = Format;
+ switch (Format)
+ {
+ case SPECTRUM_FORMAT_MS2:
+ case SPECTRUM_FORMAT_MS2_COLONS:
+ ParseSpectraFromMS2File(FileName, InputFile, FirstScan, LastScan);
+ break;
+ case SPECTRUM_FORMAT_MZXML:
+ ParseSpectraFromMZXML(FileName, InputFile, FirstScan, LastScan);
+ break;
+ case SPECTRUM_FORMAT_MZDATA:
+ ParseSpectraFromMZData(FileName, InputFile, FirstScan, LastScan);
+ break;
+ case SPECTRUM_FORMAT_MGF:
+ ParseSpectraFromMGFFile(FileName, InputFile, FirstScan, LastScan);
+ break;
+ case SPECTRUM_FORMAT_PKL:
+ ParseSpectraFromPKLFile(FileName, InputFile, FirstScan, LastScan);
+ break;
+ case SPECTRUM_FORMAT_DTA:
+ // Let's assume that we can treat it as a .dta file.
+ AddSpectrumToList(InputFile, 0, 0,1);
+ break;
+ case SPECTRUM_FORMAT_CDTA:
+ //This is a special flavor of concatenated DTA file (ala PNNL)
+ ParseSpectraFromCDTAFile(FileName,InputFile,FirstScan,LastScan);
+ break;
+ default:
+ printf("Not parsing unknown spectrum file format:%s\n", FileName);
+ break;
+ }
+}
+
+// Add a spectrum file to our input list. If the file contains multiple spectra,
+// then we'll create several SpectrumNode instances. If FirstScan is set and >0,
+// skip all scans with scan number < FirstScan. If LastScan is set and >-1, then skip
+// all scans with scan number > LastScan. (INCLUSIVE ends)
+void AddSpectraToList(char* FileName, int FirstScan, int LastScan)
+{
+ InputFileNode* NewFile;
+ //
+ NewFile = (InputFileNode*)calloc(1, sizeof(InputFileNode));
+ strncpy(NewFile->FileName, FileName, MAX_FILENAME_LEN);
+ if (GlobalOptions->LastFile)
+ {
+ GlobalOptions->LastFile->Next = NewFile;
+ }
+ else
+ {
+ GlobalOptions->FirstFile = NewFile;
+ }
+ GlobalOptions->LastFile = NewFile;
+ AddSpectrumNodesForFile(FileName, NewFile, FirstScan, LastScan);
+ //strncpy(NewNode->FileName, FileName, MAX_FILENAME_LEN);
+
+}
+
+#ifdef _WIN32
+// The WINDOWS way to iterate over a directory:
+void ProcessInputCommandSpectra(char* FileName, int FirstScan, int LastScan)
+{
+ char DirBuffer[1024];
+ char StarBuffer[1024];
+ char FileNameBuffer[1024];
+ struct stat StatBuffer;
+ int StatResult;
+ int Len;
+ int Result;
+ int SkipFile;
+ if (!FileName || !FileName[0])
+ {
+ printf("* Error: null filename specified in 'spectra' command\n");
+ return;
+ }
+ StatResult = stat(FileName, &StatBuffer);
+ if (StatResult < 0)
+ {
+ REPORT_ERROR_S(8, FileName);
+ //printf("Unable to stat '%s' - skipping.\n", FileName);
+ return;
+ }
+ if (StatBuffer.st_mode & _S_IFDIR)
+ {
+ HANDLE hFindFile;
+ WIN32_FIND_DATA wFileFindData;
+ sprintf(DirBuffer, "%s", FileName);
+ Len = strlen(FileName);
+ if (DirBuffer[Len-1] != '\\')
+ {
+ strcat(DirBuffer, "\\");
+ }
+ sprintf(StarBuffer, "%s*.*", DirBuffer);
+ hFindFile = FindFirstFile(StarBuffer, &wFileFindData);
+ while (hFindFile != INVALID_HANDLE_VALUE)
+ {
+ SkipFile = 0;
+ if (wFileFindData.cFileName[0]=='\0')
+ {
+ SkipFile = 1;
+ }
+ if (wFileFindData.cFileName[0]=='.' && wFileFindData.cFileName[1]=='\0')
+ {
+ SkipFile = 1;
+ }
+ if (wFileFindData.cFileName[0]=='.' && wFileFindData.cFileName[1]=='.' && wFileFindData.cFileName[2]=='\0')
+ {
+ SkipFile = 1;
+ }
+ if (!SkipFile)
+ {
+ sprintf(FileNameBuffer, "%s%s", DirBuffer, wFileFindData.cFileName);
+ StatResult = stat(FileNameBuffer, &StatBuffer);
+ if (StatBuffer.st_mode & _S_IFREG)
+ {
+ //printf("Adding file to list: '%s'\n", FileNameBuffer);
+ AddSpectraToList(FileNameBuffer, FirstScan, LastScan);
+ }
+ }
+ Result = FindNextFile(hFindFile, &wFileFindData);
+ if (!Result)
+ {
+ break;
+ }
+ }
+ if (hFindFile != INVALID_HANDLE_VALUE)
+ {
+ FindClose(hFindFile);
+ }
+ }
+ else
+ {
+ AddSpectraToList(FileName, FirstScan, LastScan);
+ }
+}
+#else
+// The UNIX way to iterate over a directory:
+void ProcessInputCommandSpectra(char* FileName, int FirstScan, int LastScan)
+{
+ char DirBuffer[1024];
+ char FileNameBuffer[1024];
+ struct stat StatBuffer;
+ DIR *dirp;
+ struct dirent *ep;
+ int StatResult;
+ int Len;
+ if (!FileName || !FileName[0])
+ {
+ printf("* Error: null filename specified in 'spectra' command\n");
+ return;
+ }
+
+ StatResult = stat(FileName, &StatBuffer);
+ if (StatResult < 0)
+ {
+ REPORT_ERROR_S(8, FileName);
+ return;
+ }
+ if (S_ISDIR(StatBuffer.st_mode))
+ {
+ sprintf(DirBuffer, "%s", FileName);
+ Len = strlen(FileName);
+ if (DirBuffer[Len-1] != '/')
+ {
+ strcat(DirBuffer, "/");
+ }
+ dirp = opendir(DirBuffer);
+ if (!dirp)
+ {
+ REPORT_ERROR_S(8, DirBuffer);
+ return;
+ }
+ while ((ep = readdir(dirp)) != NULL)
+ {
+ Len = strlen(ep->d_name);
+ if (ep->d_name[0]=='\0')
+ {
+ continue;
+ }
+ if (ep->d_name[0]=='.' && ep->d_name[1]=='\0')
+ {
+ continue;
+ }
+ if (ep->d_name[0]=='.' && ep->d_name[1]=='.' && ep->d_name[2]=='\0')
+ {
+ continue;
+ }
+ sprintf(FileNameBuffer, "%s%s", DirBuffer, ep->d_name);
+ StatResult = stat(FileNameBuffer, &StatBuffer);
+ if (S_ISREG(StatBuffer.st_mode))
+ {
+ AddSpectraToList(FileNameBuffer, FirstScan, LastScan);
+ }
+ }
+ closedir(dirp);
+ }
+ else
+ {
+ AddSpectraToList(FileName, FirstScan, LastScan);
+ }
+}
+#endif
+
+#define REJECT_NULL_VALUE(name)\
+{\
+ if (!CommandValue || !CommandValue[0])\
+ {\
+ printf("* Error: Null value for '%s'\n", #name);\
+ return;\
+ }\
+}
+
+typedef int (*InputParameterParser)(char* CommandValue);
+
+#define INPUT_VALUE_TYPE_NONE 0
+#define INPUT_VALUE_TYPE_INT 1
+#define INPUT_VALUE_TYPE_STRING 2
+typedef struct InputParameter
+{
+ char* Name;
+ InputParameterParser ParseFunction;
+ int ValueType;
+} InputParameter;
+
+int ParseInputTagCheck(char* Value)
+{
+ StringNode* StrNode;
+ //
+ StrNode = (StringNode*)calloc(1, sizeof(StringNode));
+ StrNode->String = strdup(Value);
+ if (FirstTagCheckNode)
+ {
+ LastTagCheckNode->Next = StrNode;
+ }
+ else
+ {
+ FirstTagCheckNode = StrNode;
+ }
+ LastTagCheckNode = StrNode;
+ return 1;
+}
+
+int ParseInputTagsOnly(char* Value)
+{
+ GlobalOptions->RunMode |= RUN_MODE_TAGS_ONLY;
+ return 1;
+}
+
+int ParseInputExternalTagger(char* Value)
+{
+ GlobalOptions->ExternalTagger = 1;
+ ReadExternalTags(Value,1); //ARI_MOD
+ return 1;
+}
+
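+// Illustrative "spectra" commands from an input file (file name and scan numbers are made up):
+//   spectra,MyRun.mzXML          - index every MS/MS scan in the file
+//   spectra,MyRun.mzXML,100,500  - index only scans 100 through 500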
+int ParseInputSpectra(char* Value)
+{
+ char* ScanStr;
+ int FirstScan;
+ int LastScan;
+ // Spectrum file:
+ // Note: If the file is a directory, we will iterate over all the files in
+ // the directory. We don't recurse into subdirectories.
+ //ScanStr = strtok(NULL, ",");
+ FirstScan = 0; //default
+ LastScan = -1; //default
+ ScanStr = strtok(NULL, ",");
+ if (ScanStr)
+ {
+ FirstScan = atoi(ScanStr);
+ ScanStr = strtok(NULL, ",");
+ if (ScanStr)
+ {
+ LastScan = atoi(ScanStr);
+ // LastScan of -1 means no upper limit...but otherwise, LastScan should
+ // not be below FirstScan!
+ if (LastScan < FirstScan && LastScan >= 0)
+ {
+ REPORT_WARNING_II(9, FirstScan, LastScan);
+ return 0;
+ }
+ }
+ }
+ ProcessInputCommandSpectra(Value, FirstScan, LastScan);
+ return 1;
+}
+
+int ParseInputInstrument(char* CommandValue)
+{
+ // Instrument name is LCQ or QTOF or FT-Hybrid. If QTOF, use a different
+ // scoring model, and don't perform parent-mass correction.
+ if (!CompareStrings(CommandValue, "ESI-ION-TRAP"))
+ {
+ GlobalOptions->InstrumentType = INSTRUMENT_TYPE_LTQ;
+ }
+ else if (!CompareStrings(CommandValue, "QTOF"))
+ {
+ GlobalOptions->InstrumentType = INSTRUMENT_TYPE_QTOF;
+ GlobalOptions->ParentMassPPM = 100;
+ }
+ else if (!CompareStrings(CommandValue, "FT-HYBRID"))
+ {
+ GlobalOptions->InstrumentType = INSTRUMENT_TYPE_FT_HYBRID;
+ GlobalOptions->ParentMassPPM = 100;
+ }
+ else
+ {
+ printf("** Warning: unknown instrument type '%s'\n", CommandValue);
+ return 0;
+ }
+ return 1;
+}
+
+int ParseInputProtease(char* CommandValue)
+{
+ int ProteaseIndex;
+ for (ProteaseIndex = 0; ProteaseIndex < sizeof(ProteaseNames)/sizeof(char*); ProteaseIndex++)
+ {
+ if (!CompareStrings(ProteaseNames[ProteaseIndex], CommandValue))
+ {
+ GlobalOptions->DigestType = ProteaseIndex;
+ return 1;
+ }
+ }
+ printf("* Error: Protease '%s' not understood\n", CommandValue);
+ return 0;
+}
+
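+// Guess the database type from the file name extension: ".ms2db" and ".dat" (splice DB)
+// are recognized explicitly; anything else (typically ".trie") is treated as a plain trie database.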
+int GuessDBTypeFromExtension(char* FileName)
+{
+ char* Extension;
+ Extension = FileName + strlen(FileName);
+ while (Extension > FileName)
+ {
+ Extension--;
+ if (*Extension == '.')
+ {
+ if (!CompareStrings(Extension, ".ms2db"))
+ {
+ return evDBTypeMS2DB;
+ }
+ if (!CompareStrings(Extension, ".dat"))
+ {
+ return evDBTypeSpliceDB;
+ }
+ }
+ }
+ return evDBTypeTrie; // default guess
+}
+
+int ParseInputDB(char* CommandValue)
+{
+ DatabaseFile* Database;
+ FILE* TempFile;
+ char DBFileName[MAX_FILENAME_LEN + 1];
+
+ //printf("CommandValue: %s\n",CommandValue);
+ /// If what we got looks like a complete path, then keep the path:
+ if (CommandValue[0]=='/' || CommandValue[0]=='.' || CommandValue[1]==':')
+ {
+ strncpy(DBFileName, CommandValue, MAX_FILENAME_LEN);
+ }
+ else
+ {
+ // Otherwise, go to $resourcedir\database
+ sprintf(DBFileName, "%sDatabase%c%s", GlobalOptions->ResourceDir, SEPARATOR, CommandValue);
+ TempFile = fopen(DBFileName,"rb");
+ if(!TempFile)
+ {
+ //if not in /Database, look in just the resourcedir
+ sprintf(DBFileName, "%s%s", GlobalOptions->ResourceDir,CommandValue);
+ }
+ else
+ {
+ fclose(TempFile);
+ }
+ }
+ printf("DBFileName: %s\n",DBFileName);
+ //To-ju: Putting protein databases in a subfolder of the inspect executable will cause unnatural coupling.
+
+ //strncpy(DBFileName, CommandValue, MAX_FILENAME_LEN);
+
+
+ TempFile = fopen(DBFileName, "rb");
+ if (!TempFile)
+ {
+ REPORT_ERROR_S(8, DBFileName);
+ return 0;
+ }
+ else
+ {
+ fclose(TempFile);
+ }
+ Database = (DatabaseFile*)calloc(1, sizeof(DatabaseFile));
+ strcpy(Database->FileName, DBFileName);
+ Database->Type = GuessDBTypeFromExtension(Database->FileName);
+ AddDatabase(Database);
+ return 1;
+}
+
+int ParseInputPMTolerance(char* CommandValue)
+{
+ GlobalOptions->ParentMassEpsilon = (int)(strtod(CommandValue,NULL) * MASS_SCALE);
+ return 1;
+}
+
+int ParseInputReportMatches(char* CommandValue)
+{
+ GlobalOptions->ReportMatchCount = atoi(CommandValue);
+ GlobalOptions->ReportMatchCount = min(100, max(1, GlobalOptions->ReportMatchCount));
+ return 1;
+}
+
+int ParseInputRequireTermini(char* CommandValue)
+{
+ int RequireTerminiCount;
+ //
+ RequireTerminiCount = atoi(CommandValue);
+ if (RequireTerminiCount < 0 || RequireTerminiCount > 2)
+ {
+ REPORT_ERROR_I(47, RequireTerminiCount);
+ }
+
+ GlobalOptions->RequireTermini = RequireTerminiCount;
+ return 1;
+}
+
+int ParseInputRequiredMod(char* CommandValue)
+{
+ strncpy(GlobalOptions->MandatoryModName, CommandValue, 256);
+ return 1;
+}
+int ParseInputTagCount(char* CommandValue)
+{
+ GlobalOptions->GenerateTagCount = atoi(CommandValue);
+ return 1;
+}
+
+int ParseInputTagLength(char* CommandValue)
+{
+ GlobalOptions->GenerateTagLength = atoi(CommandValue);
+ if (GlobalOptions->GenerateTagLength <= 0 || GlobalOptions->GenerateTagLength > 6)
+ {
+ REPORT_ERROR_I(38, GlobalOptions->GenerateTagLength);
+ GlobalOptions->GenerateTagLength = DEFAULT_TAG_LENGTH;
+ return 0;
+ }
+ return 1;
+}
+
+int ParseInputIonTolerance(char* CommandValue)
+{
+ //if (!CompareStrings(CommandName, "IonTolerance") || !CompareStrings(CommandName, "Ion_Tolerance"))
+ GlobalOptions->Epsilon = (int)(strtod(CommandValue,NULL) * MASS_SCALE);
+ return 1;
+}
+
+int ParseInputMods(char* CommandValue)
+{
+ GlobalOptions->MaxPTMods = atoi(CommandValue);
+ return 1;
+}
+
+int ParseInputFreeMods(char* CommandValue)
+{
+ char Path[MAX_FILENAME_LEN];
+
+ GlobalOptions->MaxPTMods = atoi(CommandValue);
+ // "freemods,1" or "freemods,2" allows mutations plus a rich PTM set.
+ if (GlobalOptions->MaxPTMods && !(GlobalOptions->RunMode & RUN_MODE_BLIND))
+ {
+ GlobalOptions->RunMode |= RUN_MODE_MUTATION;
+ GlobalOptions->PhosphorylationFlag = 1;
+ //sprintf(Path, "%s%s", GlobalOptions->ResourceDir, FILENAME_MASS_DELTAS);
+ //LoadMassDeltas(Path, GlobalOptions->RunMode & RUN_MODE_MUTATION);
+ }
+ return 1;
+}
+
+int ParseInputLogOdds(char* CommandValue)
+{
+ float LogOdds = atof(CommandValue);
+ GlobalOptions->MinLogOddsForMutation = LogOdds;
+ return 1;
+}
+
+/*int ParseInputSuffixArrayBuild(char * CommandValue)
+{
+
+ int ret;
+
+ ret = buildSuffixArray(CommandValue,NULL);
+ exit(ret);
+
+}
+*/
+int ParseInputMutationMode(char* CommandValue)
+{
+ AllPTModCount = 0;
+ // The mutation count is fixed: we only allow ONE mutation per peptide,
+ // regardless of any value passed with "mutationMode".
+ GlobalOptions->MaxPTMods = 1;
+ if (GlobalOptions->MaxPTMods && !(GlobalOptions->RunMode & RUN_MODE_BLIND))
+ {
+ GlobalOptions->RunMode |= RUN_MODE_TAG_MUTATION;
+ GlobalOptions->PhosphorylationFlag = 1;
+ }
+ return 1;
+}
+
+int ParseInputPMCOnly(char* CommandValue)
+{
+ if (atoi(CommandValue))
+ {
+ GlobalOptions->RunMode |= RUN_MODE_PMC_ONLY;
+ }
+ return 1;
+}
+
+int ParseInputNoScoring(char* CommandValue)
+{
+ GlobalOptions->RunMode |= RUN_MODE_RAW_OUTPUT;
+
+ return 1;
+}
+int ParseInputTagless(char* CommandValue)
+{
+ GlobalOptions->TaglessSearchFlag = atoi(CommandValue);
+ return 1;
+}
+int ParseInputBlind(char* CommandValue)
+{
+ if (atoi(CommandValue))
+ {
+ GlobalOptions->RunMode |= RUN_MODE_BLIND;
+ }
+ return 1;
+}
+int ParseInputBlindTagging(char* CommandValue)
+{
+
+ if (atoi(CommandValue))
+ {
+
+ GlobalOptions->RunMode |= RUN_MODE_BLIND_TAG;
+
+ }
+ return 1;
+}
+
+// Maximum size, in daltons, of PTMs to consider. (Blind search only)
+int ParseInputMaxPTMSize(char* CommandValue)
+{
+ GlobalOptions->MaxPTMDelta = atoi(CommandValue);
+ if (GlobalOptions->MaxPTMDelta < 1 || GlobalOptions->MaxPTMDelta >= 2000)
+ {
+ printf("** Error: Invalid maxptmsize '%s' - please select a value between 10 and 2000Da\n", CommandValue);
+ GlobalOptions->MaxPTMDelta = 200;
+ return 0;
+ }
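+ // For example, with MinPTMDelta = -200 and MaxPTMDelta = 250 (illustrative values),
+ // the next line yields (250 - (-200)) * 10 + 1 = 4501 delta bins.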
+ GlobalOptions->DeltaBinCount = (GlobalOptions->MaxPTMDelta - GlobalOptions->MinPTMDelta) * 10 + 1;
+ GlobalOptions->DeltasPerAA = max(512, GlobalOptions->DeltaBinCount * 2);
+ return 1;
+}
+
+// Maximum size, in daltons, of PTMs to consider. (Blind search only)
+int ParseInputMinPTMSize(char* CommandValue)
+{
+ GlobalOptions->MinPTMDelta = atoi(CommandValue);
+ if (GlobalOptions->MinPTMDelta < -2000 || GlobalOptions->MinPTMDelta > 2000)
+ {
+ printf("** Error: Invalid minptmsize '%s' - please select a value between -2000 and 2000Da\n", CommandValue);
+ return 0;
+ }
+ GlobalOptions->DeltaBinCount = (GlobalOptions->MaxPTMDelta - GlobalOptions->MinPTMDelta) * 10 + 1;
+ GlobalOptions->DeltasPerAA = max(512, GlobalOptions->DeltaBinCount * 2);
+ return 1;
+}
+
+// If multicharge flag is set, then ALWAYS try charge correction on spectra. (Otherwise, do it only
+// if the source file provides no charge, or says the charge is zero)
+int ParseInputMultiCharge(char* CommandValue)
+{
+ GlobalOptions->MultiChargeMode = atoi(CommandValue);
+ return 1;
+}
+int ParseInputXMLStrict(char* CommandValue)
+{
+ GlobalOptions->XMLStrictFlag = atoi(CommandValue);
+ return 1;
+}
+
+void debugPrintPTMStuff()
+{
+ int index = 0;
+ int index2 = 0;
+ int dIndex = 0;
+ printf("AllKnownPTMods:\n");
+ for(index = 0; index < AllPTModCount; ++index)
+ {
+ printf(" [%d]: Name=%s,Mass=%d,Flags=%x\n",index,AllKnownPTMods[index].Name,AllKnownPTMods[index].Mass,AllKnownPTMods[index].Flags);
+ for(index2 = 0; index2 < TRIE_CHILD_COUNT; ++index2)
+ printf(" - Allowed on %c=%d\n",(char)(index2+'A'),AllKnownPTMods[index].Allowed[index2]);
+ }
+
+ printf("\nMassDeltas:\n");
+ for(index= 0; index < TRIE_CHILD_COUNT; ++index)
+ {
+ for (index2 = 0; index2 < GlobalOptions->DeltasPerAA; index2++)
+ {
+ if(!MassDeltas[index][index2].Flags)
+ continue;
+ printf("[%c][%d] : Delta=%d, RealDelta=%d,Name=%s,Index=%d\n",(char)(index+'A'),index2,MassDeltas[index][index2].Delta,MassDeltas[index][index2].RealDelta,MassDeltas[index][index2].Name,MassDeltas[index][index2].Index);
+ }
+ }
+ /*
+ printf("\nMassDeltasByIndex:\n");
+ for(index = 0; index < AMINO_ACIDS; ++index)
+ {
+ for(index2 = 0; index2 < MAX_PT_MODTYPE; ++index2)
+ {
+ dIndex = index*MAX_PT_MODTYPE+index2;
+ printf("[%d] (AA:%c,index:%d) : Delta=%d,RealDelta=%d,Name=%s,Index=%d\n",dIndex, (char)(index+'A'),index2,MassDeltaByIndex[dIndex]->Delta,MassDeltaByIndex[dIndex]->RealDelta,MassDeltaByIndex[dIndex]->Name,MassDeltaByIndex[dIndex]->Index);
+ }
+ }*/
+
+}
+
+int ParseInputPRMModel(char* CommandValue)
+{
+ char* StrCharge = NULL;
+ char* FileName;
+ int Charge;
+ //
+ // The value has the form "[charge],[model file]"; the charge token was already split off
+ // by the caller's strtok, so the file name is the next comma-delimited token.
+ StrCharge = CommandValue;
+ Charge = atoi(StrCharge);
+ FileName = strtok(NULL, ",");
+ if (Charge < 2 || Charge > 3 || !FileName)
+ {
+ REPORT_ERROR(46);
+ return 0;
+ }
+ return ReplacePRMScoringModel(Charge, FileName);
+}
+
+int ParseInputTAGModel(char* CommandValue)
+{
+ char* StrCharge = NULL;
+ char* FileName;
+ int Charge;
+ //
+ // As for PRMModel: the value is "[charge],[model file]", and the file name is the next token.
+ StrCharge = CommandValue;
+ Charge = atoi(StrCharge);
+ FileName = strtok(NULL, ",");
+ if (Charge < 2 || Charge > 3 || !FileName)
+ {
+ REPORT_ERROR(46);
+ return 0;
+ }
+ return ReplaceTAGScoringModel(Charge, FileName);
+}
+
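+// Illustrative "mod" commands (masses and residue lists are examples only):
+//   mod,+57.02,C,fix                      - fixed modification on cysteine
+//   mod,+79.97,STY,opt,phosphorylation    - optional mod; the name "phosphorylation" is special-cased below
+//   mod,+42.01,*,nterminal                - "*" means any residue; terminal types restrict placement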
+int ParseInputMod(char* CommandValue)
+{
+ int ModFlags;
+ char* StrMass = NULL;
+ char* StrAminos = NULL;
+ char* StrType = NULL;
+ char* StrName = NULL;
+ float MassDelta;
+ char* Amino;
+ int AminoIndex;
+ int AminoFoundFlag;
+ int Bin;
+ int ModIndex;
+ char ModNameBuffer[64];
+ //
+ if (!MassDeltas)
+ {
+ LoadMassDeltas(NULL, 0);
+ }
+ if (AllPTModCount == MAX_PT_MODTYPE)
+ {
+ // Too many!
+ REPORT_ERROR_S(35, CommandValue);
+ return 0;
+ }
+ ModFlags = DELTA_FLAG_VALID;
+ StrMass = CommandValue;
+ StrAminos = strtok(NULL, ","); // required, can be "*" for no specificity
+ if (!StrAminos || !*StrAminos)
+ {
+ printf("* Error: Modification must have amino acids specified!\n");
+ return 0;
+ }
+ StrType = strtok(NULL, ","); // optional: fix/opt/cterminal/nterminal
+ if (StrType)
+ {
+ StrName = strtok(NULL, ","); // optional: name
+ }
+ if (!StrMass || !StrAminos || !StrAminos[0])
+ {
+ printf("** Error: invalid modification in input file. Skipping!\n");
+ return 0;
+ }
+ if (strstr(StrAminos, "*"))
+ {
+ StrAminos = "ACDEFGHIKLMNPQRSTVWY";
+ }
+ MassDelta = (float)atof(StrMass);
+ if (MassDelta == 0 || MassDelta > 1000 || MassDelta < -200)
+ {
+ printf("** Error: invalid modification in input file; mass is %.2f. Skipping!\n", MassDelta);
+ return 0;
+ }
+ // Default modification type is OPTIONAL.
+ if (!StrType)
+ {
+ StrType = "opt";
+ }
+ // Default name is the mass (rounded to integer, with sign indicated)
+ if (!StrName)
+ {
+ if (MassDelta > 0)
+ {
+ sprintf(ModNameBuffer, "%+d", (int)(MassDelta + 0.5));
+ }
+ else
+ {
+ sprintf(ModNameBuffer, "%-d", (int)(MassDelta - 0.5));
+ }
+ StrName = ModNameBuffer;
+ }
+ // If it's a fixed modification, then adjust the amino acid mass:
+ if (!CompareStrings(StrType, "fix") || !CompareStrings(StrType, "fixed"))
+ {
+ for (Amino = StrAminos; *Amino; Amino++)
+ {
+ AminoIndex = *Amino - 'A';
+ if (AminoIndex >= 0 && AminoIndex < TRIE_CHILD_COUNT)
+ {
+ PeptideMass[Amino[0]] += (int)(MassDelta * MASS_SCALE);
+ // We haven't yet called PopulateJumpingHash(), so that's all we need to do
+ }
+ }
+ return 1;
+ }
+ else if (!CompareStrings(StrType, "cterminal") || !CompareStrings(StrType, "c-terminal"))
+ {
+ ModFlags |= DELTA_FLAG_C_TERMINAL;
+ }
+ else if (!CompareStrings(StrType, "nterminal") || !CompareStrings(StrType, "n-terminal"))
+ {
+ ModFlags |= DELTA_FLAG_N_TERMINAL;
+ }
+ else if (!CompareStrings(StrType, "opt") || !CompareStrings(StrType, "optional"))
+ {
+ ; // pass
+ }
+ else
+ {
+ REPORT_ERROR_S(36, StrType);
+ }
+
+ if (!CompareStrings(StrName, "phosphorylation"))
+ {
+ g_PhosphorylationMod = AllPTModCount;
+ GlobalOptions->PhosphorylationFlag = 1;
+ ModFlags |= DELTA_FLAG_PHOSPHORYLATION;
+ }
+ AllKnownPTMods[AllPTModCount].Flags = ModFlags;
+ strncpy(AllKnownPTMods[AllPTModCount].Name, StrName, 40);
+ // Add another modification to each amino acid's mod-array:
+ AminoFoundFlag = 0;
+ for (Amino = StrAminos; *Amino; Amino++)
+ {
+ AminoIndex = *Amino - 'A';
+ if (AminoIndex >= 0 && AminoIndex < TRIE_CHILD_COUNT)
+ {
+ AminoFoundFlag = 1;
+ AllKnownPTMods[AllPTModCount].Allowed[AminoIndex] = 1;
+ // Add to the first still-available slot:
+ for (ModIndex = 0; ModIndex < GlobalOptions->DeltasPerAA; ModIndex++)
+ {
+ if (!MassDeltas[AminoIndex][ModIndex].Flags)
+ {
+ strncpy(MassDeltas[AminoIndex][ModIndex].Name, StrName, 40);
+ MassDeltas[AminoIndex][ModIndex].RealDelta = (int)(MassDelta * MASS_SCALE);
+ ROUND_MASS_TO_DELTA_BIN(MassDelta, Bin);
+ MassDeltas[AminoIndex][ModIndex].Delta = Bin;
+ MassDeltas[AminoIndex][ModIndex].Index = AllPTModCount;
+ MassDeltaByIndex[AminoIndex * MAX_PT_MODTYPE + AllPTModCount] = &MassDeltas[AminoIndex][ModIndex];
+ MassDeltaByIndex[MDBI_ALL_MODS * MAX_PT_MODTYPE + AllPTModCount] = &MassDeltas[AminoIndex][ModIndex];
+ MassDeltas[AminoIndex][ModIndex].Flags = ModFlags;
+ break;
+ }
+ }
+ }
+ }
+ if (!AminoFoundFlag)
+ {
+ REPORT_ERROR_S(37, StrAminos);
+ return 0;
+ }
+ AllKnownPTMods[AllPTModCount].Mass = (int)(MassDelta * MASS_SCALE);
+ g_PTMLimit[AllPTModCount] = 2; // allow 2 per peptide by default
+ // But, only allow ONE c-terminal one:
+ if ((ModFlags & DELTA_FLAG_C_TERMINAL) || (ModFlags & DELTA_FLAG_N_TERMINAL))
+ {
+ g_PTMLimit[AllPTModCount] = 1;
+ }
+ AllPTModCount++;
+ return 1;
+}
+int ParseInputPTM(char* CommandValue)
+{
+ printf("*** The 'ptm' input command is no longer supported - please use 'mod' instead.\n");
+ printf(" (Refer to the documentation for details)\n");
+ return 0;
+}
+int ParseInputSequenceFile(char* CommandValue)
+{
+ PrepareSecondarySequenceFile(CommandValue);
+ return 1;
+}
+
+int ParseInputReadGFF(char* CommandValue)
+{
+ FILE* GFFFile;
+ StringNode* Node;
+ // Check to be sure we can read the file:
+ GFFFile = fopen(CommandValue, "rb");
+ if (!GFFFile)
+ {
+ REPORT_ERROR_S(8, CommandValue);
+ }
+ else
+ {
+ // File is ok - add it to the GFF file list.
+ fclose(GFFFile);
+ Node = (StringNode*)calloc(1, sizeof(StringNode));
+ Node->String = strdup(CommandValue);
+ if (GlobalOptions->LastGFFFileName)
+ {
+ GlobalOptions->LastGFFFileName->Next = Node;
+ }
+ else
+ {
+ GlobalOptions->FirstGFFFileName = Node;
+ }
+ GlobalOptions->LastGFFFileName = Node;
+ }
+ GlobalOptions->RunMode = RUN_MODE_PREP_MS2DB;
+ return 1;
+}
+
+int ParseInputGenomeFile(char* CommandValue)
+{
+ strncpy(GlobalOptions->GenomeFileName, CommandValue, MAX_FILENAME_LEN);
+ GlobalOptions->RunMode = RUN_MODE_PREP_MS2DB;
+ return 1;
+}
+
+int ParseInputChromosomeName(char* CommandValue)
+{
+ strncpy(GlobalOptions->ChromosomeName, CommandValue, 256);
+ return 1;
+}
+
+int ParseInputParentPPM(char* ValueString)
+{
+ int CommandValue = atoi(ValueString);
+ if (CommandValue < 1 || CommandValue > 4000)
+ {
+ REPORT_ERROR_I(44, CommandValue);
+ return 0;
+ }
+ GlobalOptions->ParentMassPPM = CommandValue;
+ return 1;
+}
+
+int ParseInputPeakPPM(char* ValueString)
+{
+ int CommandValue = atoi(ValueString);
+ if (CommandValue < 1 || CommandValue > 1000)
+ {
+ REPORT_ERROR_I(44, CommandValue);
+ return 0;
+ }
+ GlobalOptions->PeakPPM = CommandValue;
+ return 1;
+}
+
+int ParseInputNewScoring(char* Value)
+{
+ GlobalOptions->NewScoring = 1;
+ return 1;
+}
+static const InputParameter InputParameters[] =
+{
+ {"Blind", ParseInputBlind, INPUT_VALUE_TYPE_INT},
+ {"Unrestrictive", ParseInputBlind, INPUT_VALUE_TYPE_INT},
+ {"BlindTagging", ParseInputBlindTagging, INPUT_VALUE_TYPE_INT},
+ {"Database", ParseInputDB, INPUT_VALUE_TYPE_STRING},
+ {"DB", ParseInputDB, INPUT_VALUE_TYPE_STRING},
+ // {"ExternalTagger", ParseInputExternalTagger, INPUT_VALUE_TYPE_NONE}, //ARI_MOD
+ {"ExternalTagFile",ParseInputExternalTagger,INPUT_VALUE_TYPE_STRING}, //ARI_MOD
+ {"FreeMods", ParseInputFreeMods, INPUT_VALUE_TYPE_INT},
+ {"MutationMode",ParseInputMutationMode,INPUT_VALUE_TYPE_NONE},
+ {"Instrument", ParseInputInstrument, INPUT_VALUE_TYPE_STRING},
+ {"IonTolerance", ParseInputIonTolerance, INPUT_VALUE_TYPE_STRING},
+ {"MaxPTMSize", ParseInputMaxPTMSize, INPUT_VALUE_TYPE_INT},
+ {"MinPTMSize", ParseInputMinPTMSize, INPUT_VALUE_TYPE_INT},
+ {"Mod", ParseInputMod, INPUT_VALUE_TYPE_STRING},
+ {"Mods", ParseInputMods, INPUT_VALUE_TYPE_INT},
+ {"MultiCharge", ParseInputMultiCharge, INPUT_VALUE_TYPE_INT},
+ {"PMCOnly", ParseInputPMCOnly, INPUT_VALUE_TYPE_INT},
+ {"PMTolerance", ParseInputPMTolerance, INPUT_VALUE_TYPE_STRING},
+ {"PM_Tolerance", ParseInputPMTolerance, INPUT_VALUE_TYPE_STRING}, // deprecated
+ {"PRMModel", ParseInputPRMModel, INPUT_VALUE_TYPE_STRING},
+ {"Protease", ParseInputProtease, INPUT_VALUE_TYPE_STRING},
+ {"ReportMatches", ParseInputReportMatches, INPUT_VALUE_TYPE_INT},
+ {"RequireTermini", ParseInputRequireTermini, INPUT_VALUE_TYPE_INT},
+ {"RequiredMod", ParseInputRequiredMod, INPUT_VALUE_TYPE_STRING},
+ {"SequenceFile", ParseInputSequenceFile, INPUT_VALUE_TYPE_STRING},
+ {"Spectra", ParseInputSpectra, INPUT_VALUE_TYPE_STRING},
+ {"TagCheck", ParseInputTagCheck, INPUT_VALUE_TYPE_STRING},
+ {"TagCount", ParseInputTagCount, INPUT_VALUE_TYPE_INT},
+ {"TagCountB", ParseInputTagCount, INPUT_VALUE_TYPE_INT}, // deprecated
+ {"TagLength", ParseInputTagLength, INPUT_VALUE_TYPE_INT},
+ {"TAGModel", ParseInputTAGModel, INPUT_VALUE_TYPE_STRING},
+ {"Tagless", ParseInputTagless, INPUT_VALUE_TYPE_INT},
+ {"TagsOnly", ParseInputTagsOnly, INPUT_VALUE_TYPE_NONE},
+ {"XMLStrict", ParseInputXMLStrict, INPUT_VALUE_TYPE_INT},
+ {"NoScoring",ParseInputNoScoring,INPUT_VALUE_TYPE_NONE},
+
+ // Commands for preparing MS2DB files:
+ {"ReadGFF", ParseInputReadGFF, INPUT_VALUE_TYPE_STRING},
+ {"GenomeFile", ParseInputGenomeFile, INPUT_VALUE_TYPE_STRING},
+ {"ChromosomeName", ParseInputChromosomeName, INPUT_VALUE_TYPE_STRING},
+ {"ParentPPM", ParseInputParentPPM, INPUT_VALUE_TYPE_INT},
+ {"PeakPPM", ParseInputPeakPPM, INPUT_VALUE_TYPE_INT},
+ {"NewScoring",ParseInputNewScoring,INPUT_VALUE_TYPE_NONE},
+ {"MinMutationLogOdds",ParseInputLogOdds,INPUT_VALUE_TYPE_STRING},
+ //{"BuildSuffixArray",ParseInputSuffixArrayBuild,INPUT_VALUE_TYPE_STRING},
+ // Sentinel:
+ {NULL}
+};
+
+// Process one line from the inspect input file. Lines have the form "command,value".
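+// For example, an input file might contain lines such as (illustrative values):
+//   spectra,MyRun.mzXML
+//   DB,MyDatabase.trie
+//   protease,trypsin
+//   mods,1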
+int ProcessInputCommand(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ const InputParameter* Parameter;
+ int CommandMatched = 0;
+ int ValueOK = 1;
+ char* CheckChar;
+ char* CommandName;
+ char* Value;
+ //
+ CommandName = strtok(LineBuffer, ",");
+ Value = strtok(NULL, ",");
+ for (Parameter = InputParameters; Parameter->Name; Parameter++)
+ {
+ if (CompareStrings(CommandName, Parameter->Name))
+ {
+ continue;
+ }
+ CommandMatched = 1;
+ // Validate the value:
+ switch (Parameter->ValueType)
+ {
+ case INPUT_VALUE_TYPE_NONE:
+ if (Value && *Value)
+ {
+ REPORT_ERROR_S(39, CommandName);
+ ValueOK = 0;
+ }
+ break;
+ case INPUT_VALUE_TYPE_STRING:
+ if (!Value || !*Value)
+ {
+ REPORT_ERROR_S(40, CommandName);
+ ValueOK = 0;
+ }
+ break;
+ case INPUT_VALUE_TYPE_INT:
+ if (!Value || !*Value)
+ {
+ REPORT_ERROR_S(41, CommandName);
+ ValueOK = 0;
+ break;
+ }
+ for (CheckChar = Value; *CheckChar; CheckChar++)
+ {
+ // Permit a leading sign (needed for negative values such as MinPTMSize):
+ if (CheckChar == Value && (*CheckChar == '-' || *CheckChar == '+'))
+ {
+ continue;
+ }
+ if (!isdigit(*CheckChar))
+ {
+ REPORT_ERROR_S(41, CommandName);
+ ValueOK = 0;
+ break;
+ }
+ }
+ break;
+ }
+ if (ValueOK)
+ {
+ Parameter->ParseFunction(Value);
+ }
+ }
+ if (!CommandMatched)
+ {
+ REPORT_ERROR_S(13, CommandName);
+ }
+ return 1;
+}
+
+// Parse the input file; return TRUE if successful.
+int ParseInputFile()
+{
+ FILE* InputFile;
+ int ModIndex;
+
+ ///////////////////
+ InputFile = fopen(GlobalOptions->InputFileName, "rb");
+ if (!InputFile)
+ {
+ REPORT_ERROR_S(8, GlobalOptions->InputFileName);
+ return 0;
+ }
+ ParseFileByLines(InputFile, ProcessInputCommand, NULL, 0);
+ fclose(InputFile);
+
+ // PTM processing:
+ if (AllPTModCount && !GlobalOptions->MaxPTMods)
+ {
+ // This is worrisome - the user has defined modifications, but matches are not
+ // permitted to USE modifications. That is reasonable only under weird circumstances.
+ if (GlobalOptions->RunMode & RUN_MODE_TAGS_ONLY && GlobalOptions->ExternalTagger)
+ {
+ //
+ }
+ else
+ {
+ REPORT_ERROR(34);
+ }
+ }
+ for (ModIndex = 0; ModIndex < AllPTModCount; ModIndex++)
+ {
+ g_PTMLimit[ModIndex] = min(g_PTMLimit[ModIndex], GlobalOptions->MaxPTMods);
+ }
+ if (GlobalOptions->MaxPTMods > 2)
+ {
+ if (GlobalOptions->RunMode & (RUN_MODE_MUTATION | RUN_MODE_BLIND))
+ {
+ printf("** Warning: Unrestrictive search with more than two mods is NOT recommended.\n");
+ }
+ }
+ // Set the flanking mass tolerance: Equal to parent mass tolerance plus ion tolerance
+ // plus 0.1
+ GlobalOptions->FlankingMassEpsilon = GlobalOptions->ParentMassEpsilon + GlobalOptions->Epsilon + 10;
+ //debugPrintPTMStuff();
+ if (GlobalOptions->ErrorCount)
+ {
+ return 0;
+ }
+ else
+ {
+ return 1;
+ }
+
+
+}
+
+int CompareSpectrumNodes(const SpectrumNode* NodeA, const SpectrumNode* NodeB)
+{
+ int NameResult;
+ NameResult = strcmp(NodeA->InputFile->FileName, NodeB->InputFile->FileName);
+ if (NameResult)
+ {
+ return NameResult;
+ }
+ //return (strcmp(NodeA->FileName, NodeB->FileName));
+ return (NodeA->FilePosition - NodeB->FilePosition);
+}
+
+// Sort spectra by filename.
+void SortSpectra()
+{
+ SpectrumNode* Node;
+ SpectrumNode* Prev;
+ int NodeIndex;
+ int NodeCount;
+ //
+ if (!GlobalOptions->FirstSpectrum)
+ {
+ return;
+ }
+ g_BigNodeArray = (SpectrumNode*)calloc(GlobalOptions->SpectrumCount, sizeof(SpectrumNode));
+ NodeIndex = 0;
+ for (Node = GlobalOptions->FirstSpectrum; Node; Node = Node->Next)
+ {
+ memcpy(g_BigNodeArray + NodeIndex, Node, sizeof(SpectrumNode));
+ NodeIndex++;
+ }
+ NodeCount = NodeIndex;
+
+ // Free old list:
+ Prev = NULL;
+ for (Node = GlobalOptions->FirstSpectrum; Node; Node = Node->Next)
+ {
+ SafeFree(Prev);
+ Prev = Node;
+ }
+ SafeFree(Prev);
+
+ // Sort array:
+ qsort(g_BigNodeArray, NodeCount, sizeof(SpectrumNode), (QSortCompare)CompareSpectrumNodes);
+ for (NodeIndex = 0; NodeIndex < NodeCount; NodeIndex++)
+ {
+ if (NodeIndex < NodeCount-1)
+ {
+ g_BigNodeArray[NodeIndex].Next = g_BigNodeArray + NodeIndex + 1;
+ }
+ else
+ {
+ g_BigNodeArray[NodeIndex].Next = NULL;
+ }
+ }
+ GlobalOptions->FirstSpectrum = g_BigNodeArray;
+ GlobalOptions->LastSpectrum = g_BigNodeArray + (NodeCount - 1);
+}
+
+
diff --git a/ParseInput.h b/ParseInput.h
new file mode 100644
index 0000000..983e5a6
--- /dev/null
+++ b/ParseInput.h
@@ -0,0 +1,44 @@
+//Title: ParseInput.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+#ifndef PARSE_INPUT_H
+#define PARSE_INPUT_H
+
+
+
+int ParseInputFile();
+void FreeTagCheckNodes();
+void FreeInputFileNodes();
+void SortSpectra();
+int ProcessInputCommand(int LineNumber, int FilePos, char* LineBuffer, void* UserData);
+
+#endif //PARSE_INPUT_H
+
diff --git a/ParseXML.c b/ParseXML.c
new file mode 100644
index 0000000..b910b55
--- /dev/null
+++ b/ParseXML.c
@@ -0,0 +1,1239 @@
+//Title: ParseXML.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include "Spectrum.h"
+#include "Utils.h"
+#include "Inspect.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <string.h>
+#include <math.h>
+#include "CMemLeak.h"
+#include "Tagger.h"
+#include "base64.h"
+#include "Errors.h"
+#include "expat.h"
+
+#define MZXML_BUFFER_SIZE 102400
+
+typedef enum MZXMLScanState
+{
+ evMZXMLNone = 0,
+ evMZXMLPrecursorMZ,
+ evMZXMLPeaks,
+} MZXMLScanState;
+
+typedef enum MZDataScanState
+{
+ evMZDataNone = 0,
+ evMZDataIonSelection,
+ evMZDataMZArray,
+ evMZDataMZArrayBody,
+ evMZDataIntensityArray,
+ evMZDataIntensityArrayBody
+} MZDataScanState;
+
+#define MZXML_PARSE_LIST_SPECTRA 0
+#define MZXML_PARSE_OBTAIN_PEAKS 1
+
+// The MZXMLParseCursor is used for parsing MZXML and MZDATA formats.
+// It holds the expat Parser object, and a limited amount of parse state
+// (i.e. the current tag).
+typedef struct MZXMLParseCursor
+{
+ int FirstScan;
+ int LastScan;
+ int ScanNumber;
+ int SpecIndex;
+ int ErrorFlag;
+ int Charge; //NEC_MZXML files may contain precursorCharge
+ XML_Parser Parser;
+ int PeakCountAllocation;
+ int PeakBufferSize;
+ int PeakBufferPos;
+ char* PeakBuffer;
+ char* DecodedPeakBuffer;
+ float* Peaks;
+ char PrecursorMZBuffer[256];
+ InputFileNode* InputFile;
+ int State;
+ MSSpectrum* Spectrum;
+ int PeakCount;
+ int ByteOrderLittle;
+ int SpectrumPeaksCompleteFlag;
+ void* XMLBuffer;
+ int Mode;
+ int MSLevel;
+} MZXMLParseCursor;
+
+typedef struct MZDataParseCursor
+{
+ int FirstScan;
+ int LastScan;
+ int ScanNumber;
+ int SpecIndex;
+ int ErrorFlag;
+ XML_Parser Parser;
+ int PeakCountAllocation;
+ int PeakBufferSize;
+ int PeakBufferPos;
+ char* PeakBuffer;
+ char* DecodedPeakBuffer;
+ float* MZBuffer;
+ float* IntensityBuffer;
+ InputFileNode* InputFile;
+ int State;
+ MSSpectrum* Spectrum;
+ int PeakCount;
+ int ByteOrderLittle;
+ int SpectrumPeaksCompleteFlag;
+ void* XMLBuffer;
+ int Mode;
+ float* Peaks;
+ int PrecursorMZ;
+ int SpectrumStartFilePos;
+ int MSLevel;
+ int Precision;
+} MZDataParseCursor;
+
+// We build a single MZXMLCursor when needed, and free it when cleaning up.
+MZXMLParseCursor* g_MZXMLParseCursor = NULL;
+MZDataParseCursor* g_MZDataParseCursor = NULL;
+
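+// Reverse the byte order of a single EntrySize-byte value in place (used below to convert
+// between big-endian "network" order and the host's little-endian order, or vice versa).
+// For example, with EntrySize = 4 the bytes {0x3F, 0x80, 0x00, 0x00} become {0x00, 0x00, 0x80, 0x3F}.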
+void EndianByteSwap(char* Buffer, int EntrySize)
+{
+ char ByteSwap;
+ int Pos;
+
+ for (Pos = 0; Pos < (EntrySize / 2); Pos++)
+ {
+ ByteSwap = Buffer[Pos];
+ Buffer[Pos] = Buffer[EntrySize - Pos - 1];
+ Buffer[EntrySize - Pos - 1] = ByteSwap;
+ }
+}
+// expat callback: Handle character data in the body of a tag.
+// The only mzxml body we care about is <peaks>
+void MZXMLCharacterDataHandler(void* UserData, const XML_Char* String, int Length)
+{
+ MZXMLParseCursor* Cursor;
+ int PeakCopySize;
+ //
+ Cursor = (MZXMLParseCursor*)UserData;
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+ switch (Cursor->State)
+ {
+ case evMZXMLPrecursorMZ:
+ strncat(Cursor->PrecursorMZBuffer, String, min(Length, 255));
+ break;
+ case evMZXMLPeaks:
+ PeakCopySize = Length;
+ if (Cursor->PeakBufferPos + PeakCopySize >= Cursor->PeakBufferSize)
+ {
+ REPORT_ERROR(0);
+ PeakCopySize = max(0, Cursor->PeakBufferSize - Cursor->PeakBufferPos - 1);
+ }
+ memcpy(Cursor->PeakBuffer + Cursor->PeakBufferPos, String, PeakCopySize);
+ Cursor->PeakBufferPos += PeakCopySize;
+ Cursor->PeakBuffer[Cursor->PeakBufferPos] = '\0';
+ break;
+ // Default behavior is to ignore text:
+ default:
+ break;
+ }
+}
+
+void MZXMLStartScan(MZXMLParseCursor* Cursor, const char** Attributes)
+{
+ int AttributeIndex;
+ const char* Name;
+ const char* Value;
+ int MSLevel = 1;
+ int ScanNumber = -1;
+ int PeakCount = 0;
+ int FilePos;
+ //
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+ for (AttributeIndex = 0; Attributes[AttributeIndex]; AttributeIndex += 2)
+ {
+ Name = Attributes[AttributeIndex];
+ Value = Attributes[AttributeIndex + 1];
+ if (!CompareStrings(Name, "msLevel"))
+ {
+ MSLevel = atoi(Value);
+ }
+ else if (!CompareStrings(Name, "peaksCount"))
+ {
+ PeakCount = atoi(Value);
+ }
+ else if (!CompareStrings(Name, "num"))
+ {
+ ScanNumber = atoi(Value);
+ }
+ }
+
+ Cursor->ScanNumber = -1;
+ Cursor->PeakBufferPos = 0;
+ Cursor->PeakBuffer[0] = '\0';
+ Cursor->PrecursorMZBuffer[0] = '\0';
+ Cursor->PeakCount = PeakCount;
+ Cursor->MSLevel = MSLevel;
+
+
+
+
+ // If it's a level-2 scan with non-trivial peak count, then we should parse it:
+ if (MSLevel >= 2 && PeakCount >= 10)
+ {
+ if (ScanNumber >= Cursor->FirstScan && (Cursor->LastScan < 0 || ScanNumber <= Cursor->LastScan))
+ {
+ FilePos = XML_GetCurrentByteIndex(Cursor->Parser);
+ if (Cursor->Mode == MZXML_PARSE_LIST_SPECTRA)
+ {
+ AddSpectrumToList(Cursor->InputFile, FilePos, ScanNumber, Cursor->SpecIndex);
+ }
+ Cursor->ScanNumber = ScanNumber;
+ // Allocate peak buffer, if necessary:
+ if (PeakCount >= Cursor->PeakCountAllocation)
+ {
+ Cursor->PeakCountAllocation = PeakCount * 2;
+ Cursor->PeakBufferSize = sizeof(double) * 4 * Cursor->PeakCountAllocation;
+ SafeFree(Cursor->PeakBuffer);
+ Cursor->PeakBuffer = (char*)malloc(Cursor->PeakBufferSize);
+ SafeFree(Cursor->Peaks);
+ Cursor->Peaks = (float*)malloc(sizeof(float) * 2 * Cursor->PeakCountAllocation);
+ SafeFree(Cursor->DecodedPeakBuffer);
+ Cursor->DecodedPeakBuffer = (char*)malloc(Cursor->PeakBufferSize);
+ }
+ }
+ }
+ if(MSLevel >= 2)
+ Cursor->SpecIndex++;
+}
+
+// Callback for reaching </peaks> in an mzXML parser - decode the peak array!
+void MZXMLFinishPeaks(MZXMLParseCursor* Cursor, MSSpectrum* Spectrum)
+{
+ int Trail;
+ int FloatIndex;
+ int PeakCount;
+ int PeakIndex;
+ float Value;
+ float RawMass;
+ //
+
+ PeakCount = Cursor->PeakCount;
+
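+ // Each peak is an (m/z, intensity) pair of 4-byte floats, i.e. 8 bytes per peak, and base64
+ // encodes every 3 bytes as 4 characters; so the encoded peak list occupies roughly
+ // PeakCount * 8 / 3 * 4 = PeakCount * 32 / 3 characters, which is where the 32/3 factor
+ // below comes from (this assumes 32-bit peak values).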
+ Trail = (PeakCount % 3);
+ if (!(PeakCount % 3))
+ {
+ Cursor->PeakBuffer[PeakCount * 32/3] = '\0';
+ }
+ else
+ {
+ Cursor->PeakBuffer[(PeakCount * 32/3) + Trail + 1] = '\0';
+ }
+ b64_decode_mio(Cursor->PeakBuffer, Cursor->DecodedPeakBuffer);
+ for (FloatIndex = 0; FloatIndex < (2 * PeakCount); FloatIndex++)
+ {
+#ifdef BYTEORDER_LITTLE_ENDIAN
+ if (!Cursor->ByteOrderLittle)
+ {
+ EndianByteSwap(Cursor->DecodedPeakBuffer + (FloatIndex * 4), 4);
+ }
+#else
+ if (Cursor->ByteOrderLittle)
+ {
+ EndianByteSwap(Cursor->DecodedPeakBuffer + (FloatIndex * 4), 4);
+ }
+#endif
+ memcpy(Cursor->Peaks + FloatIndex, Cursor->DecodedPeakBuffer + FloatIndex * 4, 4);
+ }
+
+ Spectrum->PeakCount = PeakCount;
+ Spectrum->PeakAllocation = PeakCount;
+ Spectrum->Peaks = (SpectralPeak*)calloc(sizeof(SpectralPeak), PeakCount);
+ if (!Spectrum->Peaks)
+ {
+ REPORT_ERROR_I(49, sizeof(SpectralPeak) * PeakCount);
+ }
+
+ for (PeakIndex = 0; PeakIndex < PeakCount; PeakIndex++)
+ {
+
+ Value = Cursor->Peaks[PeakIndex * 2];
+ RawMass = Value;
+ Spectrum->Peaks[PeakIndex].Mass = (int)(Value * MASS_SCALE + 0.5);
+
+ Value = Cursor->Peaks[PeakIndex * 2 + 1];
+ Spectrum->Peaks[PeakIndex].Intensity = Value;
+
+ }
+ if (Spectrum->Peaks[0].Mass < -1 || Spectrum->Peaks[0].Mass > (GlobalOptions->DynamicRangeMax + GlobalOptions->Epsilon))
+ {
+
+ REPORT_WARNING_SII(45, Spectrum->Node->InputFile->FileName, Spectrum->Node->ScanNumber,
+ Spectrum->Peaks[0].Mass / MASS_SCALE);
+ }
+ if (Spectrum->Peaks[Spectrum->PeakCount - 1].Mass < -1 || Spectrum->Peaks[Spectrum->PeakCount - 1].Mass > (GlobalOptions->DynamicRangeMax + GlobalOptions->Epsilon))
+ {
+ REPORT_WARNING_SII(45, Spectrum->Node->InputFile->FileName, Spectrum->Node->ScanNumber,
+ Spectrum->Peaks[Spectrum->PeakCount - 1].Mass / MASS_SCALE);
+ }
+ if (Spectrum->Peaks[0].Intensity < 0)
+ {
+ REPORT_WARNING_SIF(45, Spectrum->Node->InputFile->FileName, Spectrum->Node->ScanNumber,
+ Spectrum->Peaks[0].Intensity);
+ }
+ if (Spectrum->Peaks[Spectrum->PeakCount - 1].Intensity < 0)
+ {
+ REPORT_WARNING_SIF(45, Spectrum->Node->InputFile->FileName, Spectrum->Node->ScanNumber,
+ Spectrum->Peaks[Spectrum->PeakCount - 1].Intensity);
+ }
+
+ Cursor->State = evMZXMLNone;
+ Cursor->SpectrumPeaksCompleteFlag = 1;
+ // After the end of the <peaks> flag, this scan ends.
+ // Nuke the handlers, so we can finish off the buffer in peace.
+ XML_SetElementHandler(Cursor->Parser, NULL, NULL);
+ XML_SetCharacterDataHandler(Cursor->Parser, NULL);
+ XML_SetProcessingInstructionHandler(Cursor->Parser, NULL);
+}
+
+// expat callback: End a tag.
+void MZXMLEndElement(void* UserData, const char* Tag)
+{
+ MZXMLParseCursor* Cursor;
+ MSSpectrum* Spectrum;
+ //
+ Cursor = (MZXMLParseCursor*)UserData;
+ Spectrum = Cursor->Spectrum;
+ //printf("End '%s'\n", Tag);
+ // Set the precursor m/z, if appropriate:
+ if (Cursor->State == evMZXMLPrecursorMZ)
+ {
+ if (Spectrum)
+ {
+ Spectrum->MZ = (int)(MASS_SCALE * strtod(Cursor->PrecursorMZBuffer,NULL));
+ Spectrum->FileMZ = (int)(MASS_SCALE * strtod(Cursor->PrecursorMZBuffer,NULL));
+ if(Cursor->Charge != 0 && Cursor->Charge < 6)
+ {
+ Spectrum->FileCharge[Cursor->Charge] = 1;
+ Spectrum->FileChargeFlag = 1;
+ }
+ Spectrum->Charge = Cursor->Charge;
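+ // For example (illustrative numbers, taking HYDROGEN_MASS as roughly 1.008 Da): a doubly
+ // charged precursor at m/z 500.00 gives ParentMass = 2 * 500.00 - 1 * 1.008 = 998.99 Da
+ // (stored here in MASS_SCALE units).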
+ Spectrum->ParentMass = (Spectrum->MZ * Spectrum->Charge) - (Spectrum->Charge - 1)*HYDROGEN_MASS;
+ }
+ }
+
+ // If we just finished <peaks>, and we have a spectrum, then set the peaks.
+ if (Cursor->State != evMZXMLPeaks || !Spectrum)
+ {
+ Cursor->State = evMZXMLNone;
+ return;
+ }
+ MZXMLFinishPeaks(Cursor, Spectrum);
+}
+
+void MZXMLStartPeaks(MZXMLParseCursor* Cursor, const char** Attributes)
+{
+ const char* Name;
+ const char* Value;
+ int ScanNumber = -1;
+ int PeakCount = 0;
+ int AttributeIndex;
+ //
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+ if (Cursor->MSLevel < 2)
+ {
+ return; // we don't care about peaks at level 1
+ }
+
+ Cursor->State = evMZXMLPeaks;
+ Cursor->PeakBuffer[0] = '\0';
+ Cursor->PeakBufferPos = 0;
+ for (AttributeIndex = 0; Attributes[AttributeIndex]; AttributeIndex += 2)
+ {
+ Name = Attributes[AttributeIndex];
+ Value = Attributes[AttributeIndex + 1];
+ if (!CompareStrings(Name, "byteOrder"))
+ {
+ // Parse the byte ordering:
+ if (!CompareStrings(Value, "network"))
+ {
+ Cursor->ByteOrderLittle = 0;
+ }
+ else if (!CompareStrings(Value, "little"))
+ {
+ Cursor->ByteOrderLittle = 0;
+ }
+ else if (!CompareStrings(Value, "big"))
+ {
+ Cursor->ByteOrderLittle = 0;
+ }
+ }
+ }
+}
+
+// expat callback: Handle a tag and its attributes.
+void MZXMLStartElement(void* UserData, const char* Tag, const char** Attributes)
+{
+ MZXMLParseCursor* Cursor;
+ int ExpectedTag = 0;
+
+ //NEC_Added for precursorCharge parsing
+ int AttributeIndex = 0;
+ const char* Name;
+ const char* Value;
+ //
+ Cursor = (MZXMLParseCursor*)UserData;
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+ //printf("Start '%s'\n", Tag);
+ // Switch on our current state, and handle the tags we expect to see in our current state.
+ // Tags we don't expect are ignored (i.e. new tags can be added without breaking the parser)
+ switch (Cursor->State)
+ {
+ default:
+ // If we encounter <scan>, start the new scan:
+ if (!strcmp(Tag, "scan"))
+ {
+ MZXMLStartScan(Cursor, Attributes);
+ return;
+ }
+ if (!strcmp(Tag, "precursorMz"))
+ {
+ Cursor->State = evMZXMLPrecursorMZ;
+ Cursor->Charge = 0;
+ for(AttributeIndex = 0; Attributes[AttributeIndex]; AttributeIndex += 2)
+ {
+ Name = Attributes[AttributeIndex];
+ Value = Attributes[AttributeIndex + 1];
+ if(!CompareStrings(Name, "precursorCharge"))
+ {
+ Cursor->Charge = atoi(Value);
+ }
+ }
+
+ Cursor->PrecursorMZBuffer[0] = '\0';
+ return;
+ }
+ if (!strcmp(Tag, "peaks"))
+ {
+ MZXMLStartPeaks(Cursor, Attributes);
+ return;
+ }
+ break;
+ }
+}
+
+void MZDataParseMSLevel(MZDataParseCursor* Cursor, const char** Attributes)
+{
+ const char* Name;
+ const char* Value;
+ int AttributeIndex;
+ for (AttributeIndex = 0; Attributes[AttributeIndex]; AttributeIndex += 2)
+ {
+ Name = Attributes[AttributeIndex];
+ Value = Attributes[AttributeIndex + 1];
+ if (!strcmp(Name, "msLevel"))
+ {
+ Cursor->MSLevel = atoi(Value);
+
+
+ }
+ }
+}
+
+void MZDataGetPrecursorMZ(MZDataParseCursor* Cursor, const char** Attributes)
+{
+ const char* Name;
+ const char* Value;
+ int AttributeIndex;
+ int MassChargeRatioFlag = 0;
+ double FloatValue;
+ for (AttributeIndex = 0; Attributes[AttributeIndex]; AttributeIndex += 2)
+ {
+ Name = Attributes[AttributeIndex];
+ Value = Attributes[AttributeIndex + 1];
+ if (!strcmp(Name, "name"))
+ {
+ if (!strcmp(Value, "MassToChargeRatio") || !strcmp(Value, "mz"))
+ {
+ MassChargeRatioFlag = 1;
+ }
+ continue;
+ }
+ if (!strcmp(Name, "value"))
+ {
+ FloatValue = strtod(Value,NULL);
+ }
+ }
+ if (MassChargeRatioFlag)
+ {
+ Cursor->PrecursorMZ = (int)(MASS_SCALE * FloatValue);
+ }
+}
+
+// Look up the number of peaks. Re-allocate buffers if necessary.
+void MZDataGetPeakCount(MZDataParseCursor* Cursor, const char** Attributes)
+{
+ const char* Name;
+ const char* Value;
+ int AttributeIndex;
+ for (AttributeIndex = 0; Attributes[AttributeIndex]; AttributeIndex += 2)
+ {
+ Name = Attributes[AttributeIndex];
+ Value = Attributes[AttributeIndex + 1];
+ if (!strcmp(Name, "precision"))
+ {
+ Cursor->Precision = atoi(Value);
+ }
+ else if (!strcmp(Name, "endian"))
+ {
+ Cursor->ByteOrderLittle = 0; // default
+ if (!strcmp(Value, "little"))
+ {
+ Cursor->ByteOrderLittle = 1;
+ }
+ else if (!strcmp(Value, "network"))
+ {
+ Cursor->ByteOrderLittle = 0;
+ }
+ else if (!strcmp(Value, "big"))
+ {
+ Cursor->ByteOrderLittle = 0;
+ }
+ continue;
+ }
+ if (!strcmp(Name, "length"))
+ {
+ Cursor->PeakCount = atoi(Value);
+ // Is this more peaks than we can currently handle?
+ if (Cursor->PeakCount >= Cursor->PeakCountAllocation)
+ {
+ Cursor->PeakCountAllocation = Cursor->PeakCount * 2;
+ SafeFree(Cursor->PeakBuffer);
+ Cursor->PeakBufferSize = sizeof(double) * 4 * Cursor->PeakCountAllocation;
+ Cursor->PeakBuffer = (char*)malloc(Cursor->PeakBufferSize);
+ SafeFree(Cursor->DecodedPeakBuffer);
+ Cursor->DecodedPeakBuffer = (char*)malloc(sizeof(float) * Cursor->PeakCountAllocation);
+ SafeFree(Cursor->IntensityBuffer);
+ Cursor->IntensityBuffer = (float*)malloc(sizeof(float) * Cursor->PeakCountAllocation);
+ SafeFree(Cursor->MZBuffer);
+ Cursor->MZBuffer = (float*)malloc(sizeof(float) * Cursor->PeakCountAllocation);
+ }
+ continue;
+ }
+ }
+}
+
+// expat callback: Handle a tag and its attributes.
+void MZDataStartElement(void* UserData, const char* Tag, const char** Attributes)
+{
+ MZDataParseCursor* Cursor;
+ int ExpectedTag = 0;
+ //
+ Cursor = (MZDataParseCursor*)UserData;
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+
+ // Switch on our current state, and handle the tags we expect to see in our current state.
+ // Tags we don't expect are ignored (i.e. new tags can be added without breaking the parser)
+ // If we encounter <spectrum>, start the new scan:
+ if (!strcmp(Tag, "spectrum"))
+ {
+ //MZDataStartScan(Cursor, Attributes);
+ Cursor->SpectrumStartFilePos = XML_GetCurrentByteIndex(Cursor->Parser);
+ Cursor->ScanNumber = atoi(Attributes[1]);
+ return;
+ }
+ // If we encounter <ionSelection>, update our state:
+ if (!strcmp(Tag, "ionSelection"))
+ {
+ Cursor->State = evMZDataIonSelection;
+ return;
+ }
+ // If we encounter <cvParam> within ionSelection, set precursor m/z if possible:
+ if (!strcmp(Tag, "cvParam") && Cursor->State == evMZDataIonSelection)
+ {
+ MZDataGetPrecursorMZ(Cursor, Attributes);
+ return;
+ }
+
+ if (!strcmp(Tag, "data"))
+ {
+ switch (Cursor->State)
+ {
+ case evMZDataMZArray:
+ Cursor->State = evMZDataMZArrayBody;
+ Cursor->PeakBufferPos = 0;
+ MZDataGetPeakCount(Cursor, Attributes);
+ break;
+ case evMZDataIntensityArray:
+ Cursor->State = evMZDataIntensityArrayBody;
+ Cursor->PeakBufferPos = 0;
+ MZDataGetPeakCount(Cursor, Attributes);
+ break;
+ default:
+ REPORT_ERROR(0);
+ break;
+ }
+ return;
+ }
+ if (!strcmp(Tag, "mzArrayBinary"))
+ {
+ Cursor->State = evMZDataMZArray;
+ return;
+ }
+ if (!strcmp(Tag, "intenArrayBinary"))
+ {
+ Cursor->State = evMZDataIntensityArray;
+ return;
+ }
+ if (!strcmp(Tag, "precursor"))
+ {
+ MZDataParseMSLevel(Cursor, Attributes);
+ return;
+ }
+}
+
+void MZDataCompleteSpectrum(MZDataParseCursor* Cursor, MSSpectrum* Spectrum)
+{
+ int PeakIndex;
+
+ if (Cursor->Mode == MZXML_PARSE_LIST_SPECTRA)
+ {
+ if (Cursor->PeakCount >= 10 && Cursor->MSLevel > 1)
+ {
+ AddSpectrumToList(Cursor->InputFile, Cursor->SpectrumStartFilePos, Cursor->ScanNumber, Cursor->SpecIndex);
+ Cursor->SpecIndex++;
+ }
+ return;
+ }
+
+ Spectrum->PeakCount = Cursor->PeakCount;
+ Spectrum->PeakAllocation = Cursor->PeakCount;
+ Spectrum->Peaks = (SpectralPeak*)calloc(sizeof(SpectralPeak), Cursor->PeakCount);
+ Spectrum->MZ = Cursor->PrecursorMZ;
+ Spectrum->FileMZ = Cursor->PrecursorMZ;
+
+ for (PeakIndex = 0; PeakIndex < Cursor->PeakCount; PeakIndex++)
+ {
+ Spectrum->Peaks[PeakIndex].Mass = (int)(Cursor->MZBuffer[PeakIndex] * MASS_SCALE + 0.5);
+ Spectrum->Peaks[PeakIndex].Intensity = Cursor->IntensityBuffer[PeakIndex];
+ }
+ if (Spectrum->Peaks[0].Mass < -1 || Spectrum->Peaks[0].Mass > (GlobalOptions->DynamicRangeMax + GlobalOptions->Epsilon))
+ {
+ REPORT_WARNING_SII(45, Spectrum->Node->InputFile->FileName, Spectrum->Node->ScanNumber,
+ Spectrum->Peaks[0].Mass);
+ }
+ if (Spectrum->Peaks[Spectrum->PeakCount - 1].Mass < -1 || Spectrum->Peaks[Spectrum->PeakCount - 1].Mass > (GlobalOptions->DynamicRangeMax + GlobalOptions->Epsilon))
+ {
+ REPORT_WARNING_SII(45, Spectrum->Node->InputFile->FileName, Spectrum->Node->ScanNumber,
+ Spectrum->Peaks[Spectrum->PeakCount - 1].Mass);
+ }
+ if (Spectrum->Peaks[0].Intensity < 0)
+ {
+ REPORT_WARNING_SIF(45, Spectrum->Node->InputFile->FileName, Spectrum->Node->ScanNumber,
+ Spectrum->Peaks[0].Intensity);
+ }
+ if (Spectrum->Peaks[Spectrum->PeakCount - 1].Intensity < 0)
+ {
+ REPORT_WARNING_SIF(45, Spectrum->Node->InputFile->FileName, Spectrum->Node->ScanNumber,
+ Spectrum->Peaks[Spectrum->PeakCount - 1].Intensity);
+ }
+
+ Cursor->State = evMZDataNone;
+ Cursor->SpectrumPeaksCompleteFlag = 1;
+ // Once the peak data for this scan is complete, the scan ends.
+ // Nuke the handlers, so we can finish off the buffer in peace.
+ XML_SetElementHandler(Cursor->Parser, NULL, NULL);
+ XML_SetCharacterDataHandler(Cursor->Parser, NULL);
+ XML_SetProcessingInstructionHandler(Cursor->Parser, NULL);
+
+}
+
+// MZData callback for end </data> tag:
+// - Decode the base64-encoded float array
+// - Store the floats in the MZ or Intensity array
+void MZDataProcessEncodedPeakData(MZDataParseCursor* Cursor, MSSpectrum* Spectrum)
+{
+ int PeakCount;
+ int Trail;
+ int FloatIndex;
+ int EncodedRecordSize;
+ //
+ if (Cursor->State == evMZDataIntensityArrayBody)
+ {
+ Cursor->State = evMZDataIntensityArray;
+ }
+ else if (Cursor->State == evMZDataMZArrayBody)
+ {
+ Cursor->State = evMZDataMZArray;
+ }
+ else
+ {
+ REPORT_ERROR(0);
+ }
+ if (!Spectrum)
+ {
+ return;
+ }
+ PeakCount = Cursor->PeakCount;
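+ // Null-terminate the base64 text at the largest offset the encoded data can
+ // occupy: N values of up to 8 bytes each encode to roughly 32N/3 base64
+ // characters (base64 emits 4 output characters for every 3 input bytes).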
+ Trail = (PeakCount % 3);
+ if (!(PeakCount % 3))
+ {
+ Cursor->PeakBuffer[PeakCount * 32/3] = '\0';
+ }
+ else
+ {
+ Cursor->PeakBuffer[(PeakCount * 32/3) + Trail + 1] = '\0';
+ }
+ b64_decode_mio(Cursor->PeakBuffer, Cursor->DecodedPeakBuffer);
+ if (Cursor->Precision == 32)
+ {
+ EncodedRecordSize = 4;
+ }
+ else if (Cursor->Precision == 64)
+ {
+ EncodedRecordSize = 8;
+ }
+ else
+ {
+ // Unknown precision; default to 32-bit:
+ EncodedRecordSize = 4;
+ }
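+ // Walk the decoded bytes one record at a time; if the encoded byte order
+ // differs from the host's, swap the record in place before interpreting it.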
+ for (FloatIndex = 0; FloatIndex < PeakCount; FloatIndex++)
+ {
+#ifdef BYTEORDER_LITTLE_ENDIAN
+ if (!Cursor->ByteOrderLittle)
+ {
+ EndianByteSwap(Cursor->DecodedPeakBuffer + (FloatIndex * EncodedRecordSize), EncodedRecordSize);
+ //ByteSwap = Cursor->DecodedPeakBuffer[FloatIndex*4];
+ //Cursor->DecodedPeakBuffer[FloatIndex*4] = Cursor->DecodedPeakBuffer[FloatIndex*4 + 3];
+ //Cursor->DecodedPeakBuffer[FloatIndex*4 + 3] = ByteSwap;
+ //ByteSwap = Cursor->DecodedPeakBuffer[FloatIndex*4 + 1];
+ //Cursor->DecodedPeakBuffer[FloatIndex*4 + 1] = Cursor->DecodedPeakBuffer[FloatIndex*4 + 2];
+ //Cursor->DecodedPeakBuffer[FloatIndex*4 + 2] = ByteSwap;
+ }
+#else
+ if (Cursor->ByteOrderLittle)
+ {
+ EndianByteSwap(Cursor->DecodedPeakBuffer + (FloatIndex * EncodedRecordSize), EncodedRecordSize);
+ //ByteSwap = Cursor->DecodedPeakBuffer[FloatIndex*4];
+ //Cursor->DecodedPeakBuffer[FloatIndex*4] = Cursor->DecodedPeakBuffer[FloatIndex*4 + 3];
+ //Cursor->DecodedPeakBuffer[FloatIndex*4 + 3] = ByteSwap;
+ //ByteSwap = Cursor->DecodedPeakBuffer[FloatIndex*4 + 1];
+ //Cursor->DecodedPeakBuffer[FloatIndex*4 + 1] = Cursor->DecodedPeakBuffer[FloatIndex*4 + 2];
+ //Cursor->DecodedPeakBuffer[FloatIndex*4 + 2] = ByteSwap;
+ }
+#endif
+ if (Cursor->State == evMZDataMZArrayBody || Cursor->State == evMZDataMZArray)
+ {
+ if (EncodedRecordSize == 8)
+ Cursor->MZBuffer[FloatIndex] = (float)(*((double*)(Cursor->DecodedPeakBuffer + FloatIndex * EncodedRecordSize)));
+ else
+ Cursor->MZBuffer[FloatIndex] = *((float*)(Cursor->DecodedPeakBuffer + FloatIndex * EncodedRecordSize));
+ }
+ else
+ {
+ if (EncodedRecordSize == 8)
+ Cursor->IntensityBuffer[FloatIndex] = (float)(*((double*)(Cursor->DecodedPeakBuffer + FloatIndex * EncodedRecordSize)));
+ else
+ Cursor->IntensityBuffer[FloatIndex] = *((float*)(Cursor->DecodedPeakBuffer + FloatIndex * EncodedRecordSize));
+ }
+ }
+}
+
+// expat callback: End a tag.
+void MZDataEndElement(void* UserData, const char* Tag)
+{
+ MZDataParseCursor* Cursor;
+ MSSpectrum* Spectrum;
+ //
+ Cursor = (MZDataParseCursor*)UserData;
+ Spectrum = Cursor->Spectrum;
+
+ if (!strcmp(Tag, "spectrum"))
+ {
+ MZDataCompleteSpectrum(Cursor, Spectrum);
+ Cursor->SpectrumPeaksCompleteFlag = 1;
+ return;
+ }
+ if (!strcmp(Tag, "data"))
+ {
+ MZDataProcessEncodedPeakData(Cursor, Spectrum);
+ return;
+ }
+ if (!strcmp(Tag, "intenArrayBinary"))
+ {
+ Cursor->State = evMZDataNone;
+ return;
+ }
+ if (!strcmp(Tag, "mzArrayBinary"))
+ {
+ Cursor->State = evMZDataNone;
+ return;
+ }
+ if (!strcmp(Tag, "ionSelection"))
+ {
+ Cursor->State = evMZDataNone;
+ return;
+ }
+}
+
+// expat callback: Handle character data in the body of a tag.
+// The only mzData element body we care about is <data>.
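+// expat may deliver the text of a single <data> element across several of these
+// callbacks, so each chunk is appended to PeakBuffer and the buffer is kept
+// NUL-terminated.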
+void MZDataCharacterDataHandler(void* UserData, const XML_Char* String, int Length)
+{
+ MZDataParseCursor* Cursor;
+ int PeakCopySize;
+ //
+ Cursor = (MZDataParseCursor*)UserData;
+ if (Cursor->ErrorFlag)
+ {
+ return;
+ }
+ switch (Cursor->State)
+ {
+ case evMZDataMZArrayBody:
+ case evMZDataIntensityArrayBody: // deliberate fallthrough
+ PeakCopySize = Length;
+ if (Cursor->PeakBufferPos + PeakCopySize >= Cursor->PeakBufferSize)
+ {
+ REPORT_ERROR(0);
+ PeakCopySize = max(0, Cursor->PeakBufferSize - Cursor->PeakBufferPos - 1);
+ }
+ memcpy(Cursor->PeakBuffer + Cursor->PeakBufferPos, String, PeakCopySize);
+ Cursor->PeakBufferPos += PeakCopySize;
+ Cursor->PeakBuffer[Cursor->PeakBufferPos] = '\0';
+ break;
+ // Default behavior is to ignore text:
+ default:
+ break;
+ }
+}
+
+MZDataParseCursor* GetMZDataParseCursor()
+{
+ if (g_MZDataParseCursor)
+ {
+ return g_MZDataParseCursor;
+ }
+ g_MZDataParseCursor = (MZDataParseCursor*)calloc(1, sizeof(MZDataParseCursor));
+ g_MZDataParseCursor->PeakCountAllocation = 1024;
+ g_MZDataParseCursor->PeakBufferSize = sizeof(double) * 4 * g_MZDataParseCursor->PeakCountAllocation;
+ g_MZDataParseCursor->PeakBuffer = (char*)malloc(g_MZDataParseCursor->PeakBufferSize);
+ g_MZDataParseCursor->DecodedPeakBuffer = (char*)malloc(g_MZDataParseCursor->PeakBufferSize);
+ g_MZDataParseCursor->MZBuffer = (float*)malloc(g_MZDataParseCursor->PeakCountAllocation * sizeof(float));
+ g_MZDataParseCursor->IntensityBuffer = (float*)malloc(g_MZDataParseCursor->PeakCountAllocation * sizeof(float));
+ g_MZDataParseCursor->Parser = XML_ParserCreate(NULL);
+ //g_MZDataParseCursor->XMLBuffer = XML_GetBuffer(g_MZDataParseCursor->Parser, sizeof(char) * MZXML_BUFFER_SIZE);
+ //if (!g_MZDataParseCursor->XMLBuffer)
+ //{
+ // printf("* Error: Unable to get XML buffer of size %d\n", MZXML_BUFFER_SIZE);
+ //}
+ return g_MZDataParseCursor;
+}
+
+MZXMLParseCursor* GetMZXMLParseCursor()
+{
+ if (g_MZXMLParseCursor)
+ {
+ return g_MZXMLParseCursor;
+ }
+ g_MZXMLParseCursor = (MZXMLParseCursor*)calloc(1, sizeof(MZXMLParseCursor));
+ g_MZXMLParseCursor->PeakCountAllocation = 1024;
+ g_MZXMLParseCursor->PeakBufferSize = sizeof(double) * 4 * g_MZXMLParseCursor->PeakCountAllocation;
+ g_MZXMLParseCursor->PeakBuffer = (char*)malloc(g_MZXMLParseCursor->PeakBufferSize);
+ g_MZXMLParseCursor->DecodedPeakBuffer = (char*)malloc(g_MZXMLParseCursor->PeakBufferSize);
+ g_MZXMLParseCursor->Peaks = (float*)malloc(sizeof(float) * 2 * g_MZXMLParseCursor->PeakCountAllocation);
+ g_MZXMLParseCursor->Parser = XML_ParserCreate(NULL);
+ //g_MZXMLParseCursor->XMLBuffer = XML_GetBuffer(g_MZXMLParseCursor->Parser, sizeof(char) * MZXML_BUFFER_SIZE);
+ //if (!g_MZXMLParseCursor->XMLBuffer)
+ //{
+ // printf("* Error: Unable to get XML buffer of size %d\n", MZXML_BUFFER_SIZE);
+ //}
+
+ return g_MZXMLParseCursor;
+}
+
+void FreeMZXMLParseCursor()
+{
+ if (!g_MZXMLParseCursor)
+ {
+ return;
+ }
+ SafeFree(g_MZXMLParseCursor->PeakBuffer);
+ SafeFree(g_MZXMLParseCursor->Peaks);
+ SafeFree(g_MZXMLParseCursor->DecodedPeakBuffer);
+ if (g_MZXMLParseCursor->Parser)
+ {
+ XML_ParserFree(g_MZXMLParseCursor->Parser);
+ }
+ SafeFree(g_MZXMLParseCursor);
+ g_MZXMLParseCursor = NULL;
+}
+
+void FreeMZDataParseCursor()
+{
+ if (!g_MZDataParseCursor)
+ {
+ return;
+ }
+ SafeFree(g_MZDataParseCursor->PeakBuffer);
+ SafeFree(g_MZDataParseCursor->DecodedPeakBuffer);
+ SafeFree(g_MZDataParseCursor->MZBuffer);
+ SafeFree(g_MZDataParseCursor->IntensityBuffer);
+ XML_ParserFree(g_MZDataParseCursor->Parser);
+ SafeFree(g_MZDataParseCursor);
+ g_MZDataParseCursor = NULL;
+}
+
+// Parse through an MZXML file to get a list of spectra and their byte offsets.
+void ParseSpectraFromMZXML(char* FileName, InputFileNode* InputFile, int FirstScan, int LastScan)
+{
+ FILE* MZXMLFile;
+ MZXMLParseCursor* Cursor;
+ int FilePos = 0;
+ int DoneFlag = 0;
+ //void* XMLBuffer;
+ int BytesRead;
+ int XMLParseResult;
+ int Error;
+ //
+
+ MZXMLFile = fopen(FileName, "rb");
+ if (!MZXMLFile)
+ {
+ REPORT_ERROR_S(8, FileName);
+ return;
+ }
+ printf("Parse spectra from '%s'...\n", FileName);
+ Cursor = GetMZXMLParseCursor();
+ Cursor->FirstScan = FirstScan;
+ Cursor->LastScan = LastScan;
+ Cursor->InputFile = InputFile;
+ Cursor->ErrorFlag = 0;
+ Cursor->Spectrum = NULL;
+ Cursor->SpecIndex = 1;
+ Cursor->Mode = MZXML_PARSE_LIST_SPECTRA;
+ XML_SetUserData(Cursor->Parser, Cursor);
+ XML_SetElementHandler(Cursor->Parser, MZXMLStartElement, MZXMLEndElement);
+ XML_SetCharacterDataHandler(Cursor->Parser, MZXMLCharacterDataHandler);
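+ // Push-model parse loop: fill a parser-supplied buffer from the file, hand it
+ // back to expat, and repeat.  The final XML_Parse call (zero bytes read,
+ // DoneFlag set) tells expat the document is complete.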
+ while (!DoneFlag)
+ {
+ // Get a buffer (parser handles the memory):
+ Cursor->XMLBuffer = XML_GetBuffer(Cursor->Parser, sizeof(char) * MZXML_BUFFER_SIZE);
+ if (!Cursor->XMLBuffer)
+ {
+ printf("* ParseSpectraFromMZXML Error: Unable to get XML buffer of size %d\n", MZXML_BUFFER_SIZE);
+ break;
+ }
+
+ // Read into the buffer:
+ BytesRead = ReadBinary(Cursor->XMLBuffer, sizeof(char), MZXML_BUFFER_SIZE, MZXMLFile);
+ if (!BytesRead)
+ {
+ // We'll call XML_Parse once more, this time with DoneFlag set to 1.
+ DoneFlag = 1;
+ }
+
+ // Parse this block o' text:
+ XMLParseResult = XML_Parse(Cursor->Parser, Cursor->XMLBuffer, BytesRead, DoneFlag);
+ if (!XMLParseResult)
+ {
+ printf("XML Parse error - file position ~%d\n", XML_GetCurrentByteIndex(Cursor->Parser));
+ Error = XML_GetErrorCode(Cursor->Parser);
+ printf("Error code %d description '%s'\n", Error, XML_ErrorString(Error));
+ }
+
+ // If Cursor->ErrorFlag is set, then the file isn't valid! Error out
+ // now, since recovery could be difficult.
+ if (Cursor->ErrorFlag)
+ {
+ break;
+ }
+ FilePos += BytesRead;
+ }
+
+ // Close file, free memory:
+ fclose(MZXMLFile);
+ FreeMZXMLParseCursor();
+}
+
+// Parse ONE spectrum from the file. Return true on success.
+int SpectrumLoadMZXML(MSSpectrum* Spectrum, FILE* MZXMLFile)
+{
+ MZXMLParseCursor* Cursor;
+ int FilePos = 0;
+ int DoneFlag = 0;
+ //void* XMLBuffer;
+ int BytesRead;
+ int XMLParseResult;
+ int ReturnResult = 1;
+ //
+
+ Cursor = GetMZXMLParseCursor();
+ Cursor->Spectrum = Spectrum;
+ Cursor->Mode = MZXML_PARSE_OBTAIN_PEAKS;
+ Cursor->ErrorFlag = 0;
+ Cursor->FirstScan = 0;
+ Cursor->LastScan = -1;
+ XML_ParserReset(Cursor->Parser, NULL);
+ XML_SetUserData(Cursor->Parser, Cursor);
+ XML_SetElementHandler(Cursor->Parser, MZXMLStartElement, MZXMLEndElement);
+ XML_SetCharacterDataHandler(Cursor->Parser, MZXMLCharacterDataHandler);
+
+ while (!DoneFlag)
+ {
+ // Get a buffer (parser handles the memory):
+ Cursor->XMLBuffer = XML_GetBuffer(Cursor->Parser, sizeof(char) * MZXML_BUFFER_SIZE);
+ if (!Cursor->XMLBuffer)
+ {
+ printf("* SpectrumLoadMZXML Error: Unable to get XML buffer of size %d\n", MZXML_BUFFER_SIZE);
+ break;
+ }
+
+ // Read into the buffer:
+ BytesRead = ReadBinary(Cursor->XMLBuffer, sizeof(char), MZXML_BUFFER_SIZE, MZXMLFile);
+ if (!BytesRead)
+ {
+ // We'll call XML_Parse once more, this time with DoneFlag set to 1.
+ DoneFlag = 1;
+ }
+
+ // Parse this block o' text:
+ XMLParseResult = XML_Parse(Cursor->Parser, Cursor->XMLBuffer, BytesRead, DoneFlag);
+ if (!XMLParseResult)
+ {
+ Cursor->ErrorFlag = 1;
+ // If we have peaks...let's NOT report a warning...because we're parsing a sub-document,
+ // and we'll run off the edge and get well-formedness complaints.
+ // Newer expat versions will have the ability to abort when we hit the </scan>
+ // tag ending.
+ if (!Cursor->Spectrum->PeakCount)
+ {
+ ReturnResult = 0;
+ }
+ }
+
+ // If Cursor->ErrorFlag is set, then the file isn't valid! Error out
+ // now, since recovery could be difficult.
+ if (Cursor->ErrorFlag)
+ {
+ break;
+ }
+ if (Cursor->SpectrumPeaksCompleteFlag)
+ {
+ break;
+ }
+
+ FilePos += BytesRead;
+ }
+ // Sanity check: We must have a precursor m/z!
+ if (!Cursor->Spectrum->MZ)
+ {
+ ReturnResult = 0;
+ }
+ if (Cursor->Spectrum->Charge && (Cursor->Spectrum->Charge <= 0 || Cursor->Spectrum->Charge >= 6))
+ {
+ ReturnResult = 0;
+ }
+
+ // Other checks for decent peaks (skip if no peaks were parsed at all):
+ if (Cursor->Spectrum->PeakCount)
+ {
+ if (Cursor->Spectrum->Peaks[0].Mass < -1 || Cursor->Spectrum->Peaks[0].Mass > (GlobalOptions->DynamicRangeMax + GlobalOptions->Epsilon))
+ {
+ ReturnResult = 0;
+ }
+ if (Cursor->Spectrum->Peaks[Cursor->Spectrum->PeakCount - 1].Mass < -1 || Cursor->Spectrum->Peaks[Cursor->Spectrum->PeakCount - 1].Mass > (GlobalOptions->DynamicRangeMax + GlobalOptions->Epsilon))
+ {
+ ReturnResult = 0;
+ }
+ if (Cursor->Spectrum->Peaks[0].Intensity < 0)
+ {
+ ReturnResult = 0;
+ }
+ if (Cursor->Spectrum->Peaks[Cursor->Spectrum->PeakCount - 1].Intensity < 0)
+ {
+ ReturnResult = 0;
+ }
+ }
+ return ReturnResult;
+}
+
+// Parse through an mzData file to get a list of spectra and their byte offsets.
+void ParseSpectraFromMZData(char* FileName, InputFileNode* InputFile, int FirstScan, int LastScan)
+{
+ FILE* MZXMLFile;
+ MZDataParseCursor* Cursor;
+ int FilePos = 0;
+ int DoneFlag = 0;
+ //void* XMLBuffer;
+ int BytesRead;
+ int XMLParseResult;
+ int Error;
+ //
+
+ MZXMLFile = fopen(FileName, "rb");
+ if (!MZXMLFile)
+ {
+ REPORT_ERROR_S(8, FileName);
+ return;
+ }
+ printf("Parse spectra from '%s'...\n", FileName);
+ Cursor = GetMZDataParseCursor();
+ Cursor->FirstScan = FirstScan;
+ Cursor->LastScan = LastScan;
+ Cursor->InputFile = InputFile;
+ Cursor->ErrorFlag = 0;
+ Cursor->SpecIndex = 1;
+ Cursor->Spectrum = NULL;
+ Cursor->Mode = MZXML_PARSE_LIST_SPECTRA;
+ XML_SetUserData(Cursor->Parser, Cursor);
+ XML_SetElementHandler(Cursor->Parser, MZDataStartElement, MZDataEndElement);
+ XML_SetCharacterDataHandler(Cursor->Parser, MZDataCharacterDataHandler);
+ //XMLBuffer = Cursor->XMLBuffer;
+
+ while (!DoneFlag)
+ {
+ // Get a buffer (parser handles the memory):
+ Cursor->XMLBuffer = XML_GetBuffer(Cursor->Parser, sizeof(char) * MZXML_BUFFER_SIZE);
+ if (!Cursor->XMLBuffer)
+ {
+ printf("* Error: Unable to get XML buffer of size %d\n", MZXML_BUFFER_SIZE);
+ break;
+ }
+
+ // Read into the buffer:
+ BytesRead = ReadBinary(Cursor->XMLBuffer, sizeof(char), MZXML_BUFFER_SIZE, MZXMLFile);
+ if (!BytesRead)
+ {
+ // We'll call XML_Parse once more, this time with DoneFlag set to 1.
+ DoneFlag = 1;
+ }
+
+ // Parse this block o' text:
+ XMLParseResult = XML_Parse(Cursor->Parser, Cursor->XMLBuffer, BytesRead, DoneFlag);
+ if (!XMLParseResult)
+ {
+ printf("XML Parse error - file position ~%d\n", XML_GetCurrentByteIndex(Cursor->Parser));
+ Error = XML_GetErrorCode(Cursor->Parser);
+ printf("Error code %d description '%s'\n", Error, XML_ErrorString(Error));
+ }
+
+ // If Cursor->ErrorFlag is set, then the file isn't valid! Error out
+ // now, since recovery could be difficult.
+ if (Cursor->ErrorFlag)
+ {
+ break;
+ }
+ FilePos += BytesRead;
+ }
+
+ // Close file, free memory:
+ fclose(MZXMLFile);
+ FreeMZDataParseCursor();
+}
+
+
+// Parse ONE spectrum from the file. Return true on success.
+int SpectrumLoadMZData(MSSpectrum* Spectrum, FILE* MZXMLFile)
+{
+ MZDataParseCursor* Cursor;
+ int FilePos = 0;
+ int DoneFlag = 0;
+ //void* XMLBuffer;
+ int BytesRead;
+ int XMLParseResult;
+ //
+
+ Cursor = GetMZDataParseCursor();
+ Cursor->Spectrum = Spectrum;
+ Cursor->Mode = MZXML_PARSE_OBTAIN_PEAKS;
+ Cursor->ErrorFlag = 0;
+ XML_ParserReset(Cursor->Parser, NULL);
+ XML_SetUserData(Cursor->Parser, Cursor);
+ XML_SetElementHandler(Cursor->Parser, MZDataStartElement, MZDataEndElement);
+ XML_SetCharacterDataHandler(Cursor->Parser, MZDataCharacterDataHandler);
+ while (!DoneFlag)
+ {
+ // Get a buffer (parser handles the memory):
+ Cursor->XMLBuffer = XML_GetBuffer(Cursor->Parser, sizeof(char) * MZXML_BUFFER_SIZE);
+ if (!Cursor->XMLBuffer)
+ {
+ printf("* Error: Unable to get XML buffer of size %d\n", MZXML_BUFFER_SIZE);
+ break;
+ }
+
+ // Read into the buffer:
+ BytesRead = ReadBinary(Cursor->XMLBuffer, sizeof(char), MZXML_BUFFER_SIZE, MZXMLFile);
+ if (!BytesRead)
+ {
+ // We'll call XML_Parse once more, this time with DoneFlag set to 1.
+ DoneFlag = 1;
+ }
+
+ // Parse this block o' text:
+ XMLParseResult = XML_Parse(Cursor->Parser, Cursor->XMLBuffer, BytesRead, DoneFlag);
+ if (!XMLParseResult)
+ {
+ // Let's NOT report a warning...because we're parsing a sub-document,
+ // and we'll run off the edge and get well-formedness complaints.
+ // Newer expat versions will have the ability to abort when we hit the </scan>
+ // tag ending.
+ }
+
+ // If Cursor->ErrorFlag is set, then the file isn't valid! Error out
+ // now, since recovery could be difficult.
+ if (Cursor->ErrorFlag)
+ {
+ break;
+ }
+ if (Cursor->SpectrumPeaksCompleteFlag)
+ {
+ break;
+ }
+
+ FilePos += BytesRead;
+ }
+
+ // Close file, free memory:
+ //fclose(MZXMLFile);
+ return (!Cursor->ErrorFlag || Cursor->Spectrum->PeakCount > 0);
+}
+
diff --git a/ParseXML.h b/ParseXML.h
new file mode 100644
index 0000000..46ea08a
--- /dev/null
+++ b/ParseXML.h
@@ -0,0 +1,46 @@
+//Title: ParseXML.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef PARSE_XML_H
+#define PARSE_XML_H
+
+
+
+int SpectrumLoadMZXML(MSSpectrum* Spectrum, FILE* MZXMLFile);
+void ParseSpectraFromMZXML(char* FileName, InputFileNode* InputFile, int FirstScan, int LastScan);
+void ParseSpectraFromMZData(char* FileName, InputFileNode* InputFile, int FirstScan, int LastScan);
+int SpectrumLoadMZData(MSSpectrum* Spectrum, FILE* MZDataFile);
+void FreeMZXMLParseCursor();
+void FreeMZDataParseCursor();
+
+#endif // PARSE_XML_H
+
diff --git a/ParseXML.py b/ParseXML.py
new file mode 100644
index 0000000..fc9dae6
--- /dev/null
+++ b/ParseXML.py
@@ -0,0 +1,281 @@
+#Title: ParseXML.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+MZXML and mzData peak parsing
+"""
+
+import os
+import sys
+import struct
+import xml.sax
+import xml.sax.handler
+import base64
+import MSSpectrum
+
+if hasattr(base64, "b64decode"):
+ B64Decode = base64.b64decode
+ B64Encode = base64.b64encode
+else:
+ B64Decode = base64.decodestring
+ B64Encode = base64.encodestring
+
+def GetSpectrumPeaksMZXML(Spectrum, File):
+ Spectrum.Peaks = []
+ SAXParser = xml.sax.make_parser()
+ Handler = MZXMLPeakParser(Spectrum)
+ Handler.Parser = SAXParser
+ SAXParser.setContentHandler(Handler)
+ try:
+ SAXParser.parse(File)
+ except xml.sax.SAXException, XMLException:
+ Message = XMLException.getMessage()
+ # If we got no peaks at all, re-raise the exception:
+ if not len(Spectrum.Peaks):
+ raise
+ # If we did succeed in getting peaks, then the error likely arose
+ # after the end of the peaks tag.
+ if Message == "junk after document element":
+ pass
+ elif Message == "not well-formed (invalid token)":
+ pass
+ else:
+ raise
+
+def GetSpectrumPeaksMZData(Spectrum, File):
+ Spectrum.Peaks = []
+ SAXParser = xml.sax.make_parser()
+ Handler = MZDataPeakParser(Spectrum)
+ Handler.Parser = SAXParser
+ SAXParser.setContentHandler(Handler)
+ try:
+ SAXParser.parse(File)
+ except xml.sax.SAXException, XMLException:
+ Message = XMLException.getMessage()
+ if Message == "junk after document element":
+ pass
+ elif Message == "not well-formed (invalid token)":
+ pass
+ else:
+ raise
+
+class MZXMLParseStates:
+ SpectrumComplete = -1
+ Skipping = 0
+ Peaks = 1
+ PrecursorMZ = 2
+
+class XMLDictionaryHandler(xml.sax.handler.ContentHandler):
+ """
+ A simple wrapper for the skeletal ContentHandler class. Fixes broken API names, and
+ supports the use of "triage dictionaries" self.StartHandlers and self.EndHandlers
+ to find the handlers for tags.
+ """
+ def __init__(self):
+ # Repair names:
+ self.startElement = self.StartElement
+ self.endElement = self.EndElement
+ self.characters = self.HandleCharacters
+ #
+ self.VerboseFlag = 0
+ def StartElement(self, Name, Attributes):
+ #print "Start <%s>@%s"%(Name, self.Parser._parser.CurrentByteIndex)
+ Handler = self.StartHandlers.get(Name, None)
+ if self.VerboseFlag:
+ print "<%s> %s"%(Name, Handler)
+ if Handler:
+ Handler(Attributes)
+ def EndElement(self, Name):
+ #print " End <%s>"%Name
+ Handler = self.EndHandlers.get(Name, None)
+ if self.VerboseFlag:
+ print "</%s> %s"%(Name, Handler)
+ if Handler:
+ Handler()
+ def HandleCharacters(self, String):
+ pass
+
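+# A minimal, purely illustrative use of XMLDictionaryHandler (not part of the
+# original module): a handler that counts <scan> tags in an mzXML file.  The
+# file name is hypothetical.
+#
+#   class ScanCounter(XMLDictionaryHandler):
+#       def __init__(self):
+#           self.StartHandlers = {"scan": self.StartScan}
+#           self.EndHandlers = {}
+#           self.ScanCount = 0
+#           XMLDictionaryHandler.__init__(self)
+#       def StartScan(self, Attributes):
+#           self.ScanCount += 1
+#
+#   Parser = xml.sax.make_parser()
+#   Handler = ScanCounter()
+#   Handler.Parser = Parser
+#   Parser.setContentHandler(Handler)
+#   Parser.parse("Example.mzXML")
+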
+class MZXMLPeakParser(XMLDictionaryHandler):
+ def __init__(self, Spectrum):
+ self.State = MZXMLParseStates.Skipping
+ self.StartHandlers = {"peaks":self.StartPeaks,
+ "precursorMz":self.StartPrecursorMZ
+ }
+ self.EndHandlers = {"peaks":self.EndPeaks,
+ "scan":self.EndScan,
+ "precursorMz":self.EndPrecursorMZ
+ }
+ self.Spectrum = Spectrum
+ self.PeakBuffer = ""
+ XMLDictionaryHandler.__init__(self)
+ def HandleCharacters(self, String):
+ if self.State == MZXMLParseStates.PrecursorMZ:
+ self.PrecursorMZBuffer += String
+ return
+ if self.State == MZXMLParseStates.Peaks:
+ self.PeakBuffer += String
+ return
+ def StartPrecursorMZ(self, Attributes):
+ if self.State == MZXMLParseStates.SpectrumComplete:
+ return
+ self.State = MZXMLParseStates.PrecursorMZ
+ self.PrecursorMZBuffer = ""
+ def EndPrecursorMZ(self):
+ if self.State == MZXMLParseStates.SpectrumComplete:
+ return
+ #print "Precursor MZ -> %s"%self.PrecursorMZBuffer
+ self.Spectrum.PrecursorMZ = float(self.PrecursorMZBuffer)
+ self.State = MZXMLParseStates.Skipping
+ def StartPeaks(self, Attributes):
+ if self.State == MZXMLParseStates.SpectrumComplete:
+ return
+ self.State = MZXMLParseStates.Peaks
+ self.PeakBuffer = ""
+ ByteOrder = Attributes.get("byteOrder", "network")
+ if ByteOrder == "little" or ByteOrder == "little-endian":
+ self.ByteOrder = "little"
+ else:
+ self.ByteOrder = "big"
+ def EndScan(self):
+ self.State = MZXMLParseStates.SpectrumComplete
+ def EndPeaks(self):
+ if self.State == MZXMLParseStates.SpectrumComplete:
+ return
+ DecodedPeaks = B64Decode(self.PeakBuffer)
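+ # mzXML interleaves the peak list as consecutive (m/z, intensity) float pairs
+ # (this parser assumes 32-bit precision); "network" byte order means
+ # big-endian, hence the "!" struct format when it differs from the host order.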
+ StringPos = 0
+ self.Peaks = []
+ while StringPos < len(DecodedPeaks):
+ if self.ByteOrder == sys.byteorder:
+ Mass = struct.unpack("f", DecodedPeaks[StringPos:StringPos+4])[0]
+ Intensity = struct.unpack("f", DecodedPeaks[StringPos+4:StringPos+8])[0]
+ else:
+ Mass = struct.unpack("!f", DecodedPeaks[StringPos:StringPos+4])[0]
+ Intensity = struct.unpack("!f", DecodedPeaks[StringPos+4:StringPos+8])[0]
+ Peak = MSSpectrum.PeakClass(Mass, Intensity)
+ StringPos += 8
+ #print Peak.Mass, Peak.Intensity
+ self.Spectrum.Peaks.append(Peak)
+
+class MZDataParseStates:
+ SpectrumComplete = -1
+ Skipping = 0
+ MZArray = 1
+ MZArrayData = 2
+ IntensityArray = 3
+ IntensityArrayData = 4
+
+class MZDataPeakParser(XMLDictionaryHandler):
+ def __init__(self, Spectrum):
+ self.State = MZDataParseStates.Skipping
+ self.StartHandlers = {"data":self.StartData,
+ "mzArrayBinary":self.StartMZArrayBinary,
+ "intenArrayBinary":self.StartIntensityArrayBinary,
+ "cvParam":self.StartCVParam,
+ }
+ self.EndHandlers = {"data":self.EndData,
+ "spectrum":self.EndSpectrum,
+ }
+ self.Spectrum = Spectrum
+ self.PeakBuffer = ""
+ XMLDictionaryHandler.__init__(self)
+ def StartCVParam(self, Attributes):
+ Name = Attributes.get("name", None)
+ Value = Attributes.get("value", None)
+ if Name == "mz":
+ self.Spectrum.PrecursorMZ = float(Value)
+ def StartMZArrayBinary(self, Attributes):
+ if self.State == MZDataParseStates.SpectrumComplete:
+ return
+ self.State = MZDataParseStates.MZArray
+ def StartIntensityArrayBinary(self, Attributes):
+ if self.State == MZDataParseStates.SpectrumComplete:
+ return
+ self.State = MZDataParseStates.IntensityArray
+ def HandleCharacters(self, String):
+ if self.State in (MZDataParseStates.MZArrayData, MZDataParseStates.IntensityArrayData):
+ self.PeakBuffer += String
+ def EndData(self):
+ if self.State in (MZDataParseStates.MZArrayData, MZDataParseStates.IntensityArrayData):
+ # Parse the float array:
+ FloatList = []
+ DecodedPeaks = B64Decode(self.PeakBuffer)
+ StringPos = 0
+ while StringPos < len(DecodedPeaks):
+ if self.ByteOrder == sys.byteorder:
+ if self.Precision == 64:
+ Value = struct.unpack("d", DecodedPeaks[StringPos:StringPos + 8])[0]
+ else:
+ Value = struct.unpack("f", DecodedPeaks[StringPos:StringPos + 4])[0]
+ else:
+ if self.Precision == 64:
+ Value = struct.unpack("!d", DecodedPeaks[StringPos:StringPos + 8])[0]
+ else:
+ Value = struct.unpack("!f", DecodedPeaks[StringPos:StringPos + 4])[0]
+ #Peak = MSSpectrum.PeakClass(Mass, Intensity)
+ FloatList.append(Value)
+ if self.Precision == 64:
+ StringPos += 8
+ else:
+ StringPos += 4
+ #print Peak.Mass, Peak.Intensity
+ #self.Spectrum.Peaks.append(Peak)
+ if self.State == MZDataParseStates.MZArrayData:
+ self.MZList = FloatList
+ else:
+ self.IntensityList = FloatList
+ #print "...parsed %s values!"%len(FloatList)
+ self.State = MZDataParseStates.Skipping
+ def StartData(self, Attributes):
+ if self.State == MZDataParseStates.SpectrumComplete:
+ return
+ self.Precision = int(Attributes.get("precision", "32"))
+ ByteOrder = Attributes.get("endian", "network")
+ if ByteOrder == "little" or ByteOrder == "little-endian":
+ self.ByteOrder = "little"
+ else:
+ self.ByteOrder = "big"
+ if self.State == MZDataParseStates.MZArray:
+ self.State = MZDataParseStates.MZArrayData
+ self.PeakBuffer = ""
+ return
+ if self.State == MZDataParseStates.IntensityArray:
+ self.State = MZDataParseStates.IntensityArrayData
+ self.PeakBuffer = ""
+ def EndSpectrum(self):
+ if self.State != MZDataParseStates.SpectrumComplete:
+ self.State = MZDataParseStates.SpectrumComplete
+ for PeakIndex in range(len(self.MZList)):
+ Mass = self.MZList[PeakIndex]
+ Intensity = self.IntensityList[PeakIndex]
+ Peak = MSSpectrum.PeakClass(Mass, Intensity)
+ self.Spectrum.Peaks.append(Peak)
diff --git a/PhosCut2.bn b/PhosCut2.bn
new file mode 100644
index 0000000..35c9bea
Binary files /dev/null and b/PhosCut2.bn differ
diff --git a/PhosCut3.bn b/PhosCut3.bn
new file mode 100644
index 0000000..25eaef8
Binary files /dev/null and b/PhosCut3.bn differ
diff --git a/PhosphateLocalization.py b/PhosphateLocalization.py
new file mode 100644
index 0000000..3e3dbfc
--- /dev/null
+++ b/PhosphateLocalization.py
@@ -0,0 +1,324 @@
+#Title: PhosphateLocalization.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""PhosphateLocalization.py
+
+This script is a glorified wrapper for Label.py.  It calls Label.py and
+calculates the PLS score for each spectral annotation in the input set.
+
+1. Read in the input data.  If it is not in native Inspect format, we run it
+through GetByteOffset so that we can use it like Inspect output.
+
+2. Label all possible annotations of the peptide and get each one's score (think binomial).
+
+3. Find the difference between the top two scores and report it.
+The results are reported by appending two extra columns to the data from the
+input file: the top annotation and its PLS.
+"""
+
+UsageInfo = """PhosphateLocalization.py
+Calculates the Phosphate Localization Score (PLS) for each spectral
+annotation in the input file. Make sure to read the tutorial so
+that you understand how to use it correctly.
+
+Required Options:
+ -r [FileName] File of formatted annotations
+ -m [Directory] Directory containing spectra files (not filename)
+ -w [FileName] Output of this program
+
+Additional Options:
+ -d [Directory] Directory for the images and annotated peak lists
+ created during the label process. Default "LabelSpewage"
+
+"""
+
+import os
+import sys
+import getopt
+import ResultsParser
+import GetByteOffset
+import string
+import Label
+
+class DetectiveClass(ResultsParser.ResultsParser):
+ def __init__(self):
+ self.InputFilePath = None
+ self.OutputFilePath = None
+ self.LabeledAnnotationsDir = "LabelSpewage" # for Labeled output
+ self.MZXMLDir = None
+ self.InspectFormat = 0
+ self.ScanOffset = {} # potentially large dictionary for storing the byte offset of each spectrum
+ self.OldInspectResults = {} #(file, scan) => (MQScore, Annotation) #file name only, not! path
+ self.PLSDict = {} # self.PLSDict[(SpectrumFile, Scan)] = (PLS, NewPeptide)
+ self.Columns = ResultsParser.Columns()
+ ResultsParser.ResultsParser.__init__(self)
+
+ def Main(self):
+ self.CheckInputFormat(self.InputFilePath)
+ if not self.InspectFormat:
+ self.GetByteOffsetsForSpectra()
+ MakeDirectory(self.LabeledAnnotationsDir)
+ self.LabelMe()
+ self.MakeOutput()
+
+
+ def MakeOutput(self):
+ """The results of Label.py have been put into a folder, and we now have to parse
+ those and put them back into the file that people gave us.
+ """
+ ## get all the stuff from Label
+ self.ProcessResultsFiles(self.LabeledAnnotationsDir, self.ParseLabelSpewage)
+ # start putting it into the output
+ Handle = open(self.InputFilePath, "rb")
+ OutHandle = open(self.OutputFilePath, "wb")
+ for Line in Handle.xreadlines():
+ if not Line.strip():
+ continue
+ if Line[0] == "#":
+ #header
+ OutHandle.write("%s\tBetterAnnotation\tPLS\n"%Line.strip())
+ continue
+ Bits = Line.strip().split("\t")
+ SpectrumFullPath = Bits[self.Columns.getIndex("SpectrumFile")]
+ SpectrumFile = os.path.split(SpectrumFullPath)[1]
+ Scan = Bits[self.Columns.getIndex("Scan#")]
+ Annotation = Bits[self.Columns.getIndex("Annotation")]
+ Tuple = (SpectrumFile, Scan)
+ if not self.PLSDict.has_key(Tuple):
+ print "NO KEY, %s, %s"%(SpectrumFile, Scan)
+ continue
+ (PLS, NewPeptideAnnotation) = self.PLSDict[(SpectrumFile, Scan)]
+ #now write stuff out
+ Bits.append("%s"%NewPeptideAnnotation)
+ Bits.append("%s"%PLS)
+ String = "\t".join(Bits)
+ OutHandle.write("%s\n"%String)
+ OutHandle.close()
+
+ def ParseLabelSpewage(self, FilePath):
+ """In each file I am going to grep out
+ filename, scan number, PLS, better peptide if such exists
+ """
+
+ ##in the filename are the scan number and mzxml filename
+ if not FilePath[-3:] == "txt":
+ return #skip png images
+ (Path, FileName) = os.path.split(FilePath)
+ Pos = FileName.find("mzXML") + 5
+ SpectrumFile = FileName[:Pos]
+ Dot = FileName.find(".", Pos+1)
+ Scan = FileName[Pos+1:Dot] # string value, not int
+ NewPeptide = None
+ Handle= open(FilePath, "rb")
+ PLS = "N/A" #default, shoudl get overridden for every file
+ for Line in Handle.xreadlines():
+ Line = Line.strip()
+ #hard coded magic
+ if Line[:10] == "Phosphate ":
+ #Phosphate Localization Score: 52.2
+ Colon = Line.find(":")
+ PLS = Line[Colon + 1:]
+ #print Line
+ #print "I parsed out %s"%PLS
+ if Line[:7] == "WARNING":
+ #parse out new peptide
+ ToSplit = Line.replace("WARNING: Better annotation than input.", "")
+ (BetterMQScore, NewPeptide) = ToSplit.split(",")
+ NewPeptide = NewPeptide.strip()
+ if Line[:2] == "b2":
+ #this means we've started to get into the rest of the verbose output
+ # and past what we care about
+ break
+ Handle.close()
+ Tuple = (SpectrumFile, Scan)
+ self.PLSDict[Tuple] = (PLS, NewPeptide)
+
+
+ def GetByteOffsetsForSpectra(self):
+ "Read mzXML from either a single file, or directory, creating the self.ScanOffset dictionary"
+ Abacus = GetByteOffset.Abacus()
+ if os.path.isdir(self.MZXMLDir):
+ for FileName in os.listdir(self.MZXMLDir):
+ (Stub, Extension) = os.path.splitext(FileName)
+ if Extension.lower() == ".mzxml":
+ Path = os.path.join(self.MZXMLDir, FileName)
+ ScanOffsetSingleFile = Abacus.GetByteOffset(Path)
+ for (ScanNumber, ScanOffset) in ScanOffsetSingleFile.items():
+ self.ScanOffset[(FileName, ScanNumber)] = (Path, ScanOffset)
+ else:
+ ScanOffsetSingleFile = Abacus.GetByteOffset(self.MZXMLDir)
+ FileName = os.path.split(self.MZXMLDir)[1]
+ for (ScanNumber, ScanOffset) in ScanOffsetSingleFile.items():
+ self.ScanOffset[(FileName, ScanNumber)] = (self.MZXMLDir, ScanOffset)
+ #print "Storing value (%s,%s) with key (%s, %s)"%(self.MZXMLDir, ScanOffset, FileName, ScanNumber)
+
+ def LabelMe(self):
+ Handle = open(self.InputFilePath, "rb")
+ Dymo = Label.LabelClass()
+ Count = 0
+ GoodScoreCount = 0
+ WrongChargeCount = 0
+ ScoredWorseCount = 0
+ for Line in Handle.xreadlines():
+ if Line[0] == "#":
+ self.Columns.initializeHeaders(Line)
+ continue
+ if not Line.strip():
+ continue
+ Bits = list(Line.strip().split("\t"))
+ #Charge = int(Bits[self.Columns.Charge])  # I don't think I need this anymore
+ Count +=1
+ Annotation = Bits[self.Columns.getIndex("Annotation")]
+ #print "Annotation :%s:"%Annotation
+ FileName = Bits[self.Columns.getIndex("SpectrumFile")]
+ Scan = int(Bits[self.Columns.getIndex("Scan#")])
+ if not self.InspectFormat:
+ FileNameMinusPath = os.path.split(FileName)[1]
+ (FullPathDummy, ByteOffset) = self.ScanOffset[(FileNameMinusPath, Scan)]
+ #print (FullPathDummy, ByteOffset)
+ #continue
+ else:
+ ByteOffset = int(Bits[self.Columns.getIndex("SpecFilePos")])
+ (Path,File) = os.path.split(FileName)
+ FileName = os.path.join(self.MZXMLDir, File)
+ VerboseFileName = "%s.%s.%s.verbose.txt"%(File, Scan, Annotation[2:-2])
+ ImageFileName = "%s.%s.%s.png"%(File, Scan, Annotation[2:-2])
+ VerboseFilePath = os.path.join(self.LabeledAnnotationsDir, VerboseFileName)
+ ImageFilePath = os.path.join(self.LabeledAnnotationsDir, ImageFileName)
+ ## as we've got a single Dymo object, we must be passing in full args list
+ ## -p to suppress the image popup, and -P for the PLS score
+ Args = " -r %s -b %d -a %s -v %s -w %s -p -P"%(FileName, ByteOffset, Annotation, VerboseFilePath, ImageFilePath)
+ ArgsList = Args.split()
+ #print "Parsing Results for %s, scan %s, charge %s"%(FileName, Scan, Charge)
+ #print "Args: %s"%Args
+ Dymo.ParseCommandLine(ArgsList)
+ Dymo.Main()
+
+ Handle.close()
+
+ def CheckInputFormat(self, FileName):
+ """This method serves to catch input files that are not in the
+ proper Inspect format. If this is the case, then we must convert the
+ files to Inspect format. This basically means that we put a byte offset at the
+ end.
+ Expected format (tab delimited, 3 columns):
+ SpectrumFile    Scan# (int)    Annotation (string; letters, digits, '.' and '*' only - no brackets)
+ """
+ Handle = open (self.InputFilePath, "rb")
+ ## 1. get the first line and see if it's already in Inspect Format
+ Line = Handle.readline()
+ try:
+ Bits = Line.strip().split("\t")
+ except:
+ print "####################################################"
+ print "Input file in improper format. Please read tutorial."
+ sys.exit(1)
+ #if not len(Bits) < self.Columns.getIndex("SpecFilePos"):
+ # self.InspectFormat = 1
+ # return # in inspect format. it's okay
+ ## 2. Check to see if each line of the input file has the proper format
+ Reject = 0
+ for Line in Handle.xreadlines():
+ if Line[0] == "#":
+ self.Columns.initializeHeaders(Line)
+ continue
+ try:
+ Bits = Line.strip().split("\t")
+ except:
+ print "####################################################"
+ print "Input file in improper format. Please read tutorial."
+ sys.exit(1)
+ #now check to see if column 1 is a number, and 2 is a string (with no brackets)
+ try:
+ SpectrumNumber = int(Bits[self.Columns.getIndex("Scan#")])
+ except:
+ Reject = 1
+ print "Second column must be a integer representing the spectrum number"
+ Annotation = Bits[self.Columns.getIndex("Annotation")]
+ AcceptArray = string.ascii_letters
+ AcceptArray += "." #for delimiting the prefix/suffix
+ AcceptArray += "*" # for the beginning/end of a protein. should only be in prefix/suffix
+ AcceptArray += string.digits
+ for Index in range(len(Annotation)):
+ if not Annotation[Index] in AcceptArray:
+ print "This annotation is in an improper format %s"%Annotation
+ Reject = 1
+ break
+ if Reject:
+ print "####################################################"
+ print "There were formatting problems with the input file"
+ print "We cannot proceed. Please read the tutorial."
+ sys.exit(1)
+ print "Input file %s received in the correct format"%FileName
+ def ParseCommandLine(self,Arguments):
+ (Options, Args) = getopt.getopt(Arguments, "r:w:m:d:")
+ OptionsSeen = {}
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-r":
+ # -r results file(s)
+ if not os.path.exists(Value):
+ print "** Error: couldn't find results file '%s'\n\n"%Value
+ print UsageInfo
+ sys.exit(1)
+ self.InputFilePath = Value
+ if Option == "-d":
+ self.LabeledAnnotationsDir = Value
+ if Option == "-w":
+ self.OutputFilePath = Value
+ if Option == "-m":
+ self.MZXMLDir = Value
+ if not OptionsSeen.has_key("-r") or not OptionsSeen.has_key("-m") or not OptionsSeen.has_key("-w"):
+ print UsageInfo
+ sys.exit(1)
+
+
+def MakeDirectory(Dir):
+ if os.path.exists(Dir):
+ return
+ try:
+ os.makedirs(Dir)
+ except:
+ raise
+
+
+
+if __name__ == "__main__":
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "(psyco not found - running in non-optimized mode)"
+ MacGyver = DetectiveClass()
+ MacGyver.ParseCommandLine(sys.argv[1:])
+ MacGyver.Main()
diff --git a/PrepDB.py b/PrepDB.py
new file mode 100644
index 0000000..01eb755
--- /dev/null
+++ b/PrepDB.py
@@ -0,0 +1,283 @@
+#Title: PrepDB.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+Translate a protein database to a good format for trie-based searching.
+The source database should be in either FASTA format or in swiss-prot format.
+The output database will be in "concatenated format" - protein sequences with
+asterisks delimiting the records, no whitespace.
+We also save a binary file indexing into the concatenated DB.
+
+Index file format is record-based, with one record per protein:
+- original DB position (8-byte integer); the START of a record (>)
+- concatenated DB file position (4-byte integer); the START of the record's sequence
+- protein ID (string, 80 chars, NUL-padded)
+"""
+import sys
+import struct
+import traceback
+import os
+import string
+
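+# Illustrative sketch (not used by PrepDB itself): reading one index record back.
+# Each record is 92 bytes - an 8-byte source-DB offset, a 4-byte concatenated-DB
+# offset, and an 80-character ID padded with NUL bytes.  The file name is
+# hypothetical.
+#
+#   IndexFile = open("Drome.index", "rb")
+#   RecordSize = struct.calcsize("<qi80s")   # 92 bytes
+#   (SourcePos, TriePos, ID) = struct.unpack("<qi80s", IndexFile.read(RecordSize))
+#   ID = ID.rstrip("\0")
+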
+class SwissCompressor:
+ """
+ Convert a protein database into concatenated format.
+ Processes the SwissProt database format.
+ """
+ def __init__(self, SourceFileName, SquishedFileName, IndexFileName, Species = None):
+ self.SourceFile = open(SourceFileName,"rb")
+ self.SquishedFile = open(SquishedFileName,"wb")
+ self.IndexFile = open(IndexFileName,"wb")
+ self.FASTA = 0
+ self.Species = Species
+ def Compress(self):
+ """
+ The parts of swiss-prot we care about look like this:
+SQ SEQUENCE 296 AA; 34077 MW; B0D7CD175C7A3625 CRC64;
+ FNSNMLRGSV CEEDVSLMTS IDNMIEEIDF YEKEIYKGSH SGGVIKGMDY DLEDDENDED
+ EMTEQMVEEV ADHITQDMID EVAHHVLDNI THDMAHMEEI VHGLSGDVTQ IKEIVQKVNV
+ AVEKVKHIVE TEETQKTVEP EQIEETQNTV EPEQTEETQK TVEPEQTEET QNTVEPEQIE
+ ETQKTVEPEQ TEEAQKTVEP EQTEETQKTV EPEQTEETQK TVEPEQTEET QKTVEPEQTE
+ ETQKTVEPEQ TEETQKTVEP EQTEETQKTV EPEQTEETQN TVEPEPTQET QNTVEP
+//
+ """
+ self.InSequence = 0
+ RecordNumber = 0
+ LineNumber = 0
+ CorrectSpecies = 0
+ while (1):
+ LineNumber += 1
+ SourceFilePos = self.SourceFile.tell()
+ RawFileLine = self.SourceFile.readline()
+ if not RawFileLine:
+ break # end o' file!
+ FileLine = RawFileLine.strip()
+ if self.InSequence:
+ # // marks end of sequence; anything else is sequence data.
+ # ...but in some cases, the // marker isn't present, so we
+ # stop when we see the "ID" tag from the next record.
+ #if FileLine[:2] == "//":
+ #print self.InSequence, FileLine
+ if RawFileLine[:2] != " ":
+ self.InSequence = 0
+ if self.FASTA:
+ pass
+ else:
+ self.SquishedFile.write("*")
+ RecordNumber += 1
+ else:
+ Stripped = FileLine.replace(" ","")
+ self.SquishedFile.write(Stripped)
+ else:
+ if FileLine[:3] == "OS ":
+ if self.Species == None or FileLine.lower().find(self.Species)!=-1:
+ CorrectSpecies = 1
+ else:
+ CorrectSpecies = 0
+ if FileLine[:3] == "ID ":
+ SourceFileRecordStart = SourceFilePos
+ ID = FileLine.split()[1]
+ ID = ID[:80]
+ if self.FASTA:
+ self.SquishedFile.write("\n>%s\n"%ID)
+ if FileLine[:3] == "SQ ":
+ if CorrectSpecies:
+ self.InSequence = 1
+ SquishedFilePos = self.SquishedFile.tell()
+ Str = struct.pack("<qi80s", SourceFileRecordStart, SquishedFilePos, ID)
+ self.IndexFile.write(Str)
+ if LineNumber%1000 == 0:
+ print "Processed line %d."%LineNumber
+ #self.SquishedFile.flush()
+ #self.IndexFile.flush()
+ #sys.stdin.readline()
+ print "Total records seen:", RecordNumber
+
+class FASTACompressor:
+ """
+ Convert a protein database into concatenated format.
+ Processes FASTA format.
+ """
+ def __init__(self, SourceFileName, SquishedFileName, IndexFileName, Species = None):
+ self.SourceFile = open(SourceFileName,"rb")
+ self.SquishedFile = open(SquishedFileName,"wb")
+ self.IndexFile = open(IndexFileName,"wb")
+ self.SquishedFileName = SquishedFileName
+ self.IndexFileName = IndexFileName
+ def Compress(self):
+ RecordNumber = 0
+ LineNumber = 0
+ FirstRecord = 1
+ LineNumberWarnings = 0
+ DummyTable = string.maketrans("", "")
+ while (1):
+ LineNumber += 1
+ SourceFilePos = self.SourceFile.tell()
+ FileLine = self.SourceFile.readline()
+ if not FileLine:
+ break # end o' file!
+ FileLine = FileLine.strip()
+ if not FileLine:
+ continue # empty lines (whitespace only) are skipped
+ if FileLine[0] == ">":
+ RecordNumber += 1
+ if not FirstRecord:
+ self.SquishedFile.write("*")
+ ID = FileLine[1:81].strip()
+ # Fix weird characters in the ID:
+ ID = ID.replace("\t", " ")
+ # Note: Important to call tell() *after* writing the asterisk! (Fixed a bug 1/20/5)
+ SquishedFilePos = self.SquishedFile.tell()
+ Str = struct.pack("<qi80s", SourceFilePos, SquishedFilePos, ID)
+ self.IndexFile.write(Str)
+ FirstRecord = 0
+ else:
+ WarnFlag = 0
+ FileLine = string.translate(FileLine, DummyTable, " \r\n\t*")
+ FileLine = FileLine.upper()
+ Str = ""
+ for Char in FileLine:
+ if Char not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+ WarnFlag = 1
+ else:
+ Str += Char
+ #FileLine = FileLine.replace("*","")
+ if WarnFlag and LineNumberWarnings < 10:
+ print "* Warning: line %s contains non-amino-acid characters:"%LineNumber
+ print FileLine
+ LineNumberWarnings += 1
+ if LineNumberWarnings >= 10:
+ print "(omitting further warnings)"
+ self.SquishedFile.write(Str)
+ print "Converted %s protein sequences (%s lines) to .trie format."%(RecordNumber + 1, LineNumber)
+ print "Created database file '%s'"%self.SquishedFileName
+
+class MS2DBCompressor:
+ """
+ Creates the index file for a splice graph, no modification is made to the original database
+ """
+ def __init__(self, SourceFileName, SquishedFileName, IndexFileName, Species = None):
+ self.SourceFile = open(SourceFileName,"rb")
+
+ self.IndexFile = open(IndexFileName,"wb")
+ self.IndexFileName = IndexFileName
+ def Compress(self):
+ RecordNumber = 0
+ LineNumber = 0
+ FirstRecord = 1
+ LineNumberWarnings = 0
+ DummyTable = string.maketrans("", "")
+ while (1):
+ LineNumber += 1
+ SourceFilePos = self.SourceFile.tell()
+ FileLine = self.SourceFile.readline()
+ if not FileLine:
+ break # end o' file!
+ FileLine = FileLine.strip()
+ if not FileLine:
+ continue # empty lines (whitespace only) are skipped
+ if FileLine[0:6] == "<Gene ":
+ RecordNumber += 1
+
+ ID = ""
+ Bits = FileLine[6:].split(" ")
+ for B in Bits:
+ (Item,Value) = B.split("=")
+ if Item == "Name":
+ ID = Value[1:-1]
+ break
+
+ if ID == "":
+ print "No valid ID found in %s"%FileLine
+ raw_input()
+
+ # Note: Important to call tell() *after* writing the asterisk! (Fixed a bug 1/20/5)
+ Str = struct.pack("<qi80s", SourceFilePos, SourceFilePos, ID)
+ self.IndexFile.write(Str)
+ FirstRecord = 0
+
+ print "Converted %s protein sequences (%s lines) to .ms2index format."%(RecordNumber + 1, LineNumber)
+ print "Created index file '%s'"%self.IndexFileName
+
+
+def PrintUsage():
+ print "Please supply a database filename."
+ print "Usage: PrepDB.py <format> <OriginalDB> [NewDB] [IndexFile]"
+ print "Example: Prepdb.py FASTA Drome.fasta"
+ print " The source format can be either FASTA or SWISS or MS2DB"
+ print " New DB file name defaults to original filename with .trie appended (no new file is created for MS2DB)"
+ print " Index file name defaults to original filename with .index appended (or .ms2index for MS2DB)"
+
+if __name__ == "__main__":
+ if len(sys.argv)<3:
+ PrintUsage()
+ sys.exit()
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "(psyco not found - running in non-optimized mode)"
+ # First argument: Original database file format
+ Format = sys.argv[1].lower()
+ if Format == "fasta":
+ CompressorClass = FASTACompressor
+ elif Format == "swiss":
+ CompressorClass = SwissCompressor
+ elif Format == "ms2db":
+ CompressorClass = MS2DBCompressor
+ else:
+ print "Unknown source database format '%s'"%Format
+ PrintUsage()
+ sys.exit()
+ # Second argument: Original database file
+ SourceFileName = sys.argv[2]
+ # Optional third argument: New database file name
+ if len(sys.argv) > 3:
+ SquishedFileName = sys.argv[3]
+ elif Format == "ms2db":
+ SquishedFileName = None
+ else:
+ SquishedFileName = "%s.trie"%os.path.splitext(SourceFileName)[0]
+ # Optional fourth argument: Index file name
+ if len(sys.argv) > 4:
+ IndexFileName = sys.argv[4]
+ elif Format == "ms2db":
+ IndexFileName = "%s.ms2index"%os.path.splitext(SourceFileName)[0]
+ else:
+ IndexFileName = "%s.index"%os.path.splitext(SourceFileName)[0]
+ # Use FASTACompressor for FASTA format, SwissCompressor for the weird swiss-prot format
+ # If "species" is a string, then the Swiss-prot reader will filter out any records
+ # that don't contain that string. For example, set Species = "sapiens" to grab only
+ # human proteins.
+ Species = None
+ Squasher = CompressorClass(SourceFileName, SquishedFileName, IndexFileName, Species)
+ Squasher.Compress()
diff --git a/ProteinGrouper.py b/ProteinGrouper.py
new file mode 100644
index 0000000..5250d36
--- /dev/null
+++ b/ProteinGrouper.py
@@ -0,0 +1,471 @@
+#Title: ProteinGrouper.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#Updated 1-3-2012 to allow column header based lookup (NEC)
+
+import os
+import sys
+import ResultsParser
+import TrieUtils
+import Utils
+import getopt
+
+UsageInfo = """ProteinGrouper.py version 2012.01.03
+ProteinGrouper updates the 'Protein' field for Inspect annotations, replacing
+the single protein name with a '!' delimited list of protein names. For each
+Inspect results file specified, a new file is created with the updated 'Protein'
+field.
+[REQUIRED]
+-r [File or Dir] File or directory containing Inspect annotations
+-t [File] Trie file used in the search; assumes an index file of the same name exists
+-w [Dir] Directory where updated Inspect annotations are written
+
+[OPTIONAL]:
+-p Assign peptides to a parsimonious set of proteins.
+-a Assign peptides to a parsimonious set of proteins. The assigned protein will
+appear first in the list of proteins containing the peptide.
+"""
+DELIM = "!"
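+# For example (hypothetical names), a peptide found in two proteins would have its
+# Protein field rewritten as "ProteinA!ProteinB".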
+
+class ProteinGrouper(ResultsParser.ResultsParser):
+ def __init__(self):
+ ResultsParser.ResultsParser.__init__(self)
+ self.Columns = ResultsParser.Columns()
+ Utils.Initialize()
+ self.TrieFiles = []
+ self.IndexFiles = []
+ self.DoParsimony = 0
+ self.DoParsimonyAndGroup = 0
+ self.TUtils = TrieUtils.TrieUtils()
+
+
+ self.Peptide2ProteinID = {} #Peptide sequence -> (TrieIndex,ProteinIDS)
+ self.Peptide2SpectralCount = {} #Peptide sequence -> Spectral Count
+ self.Protein2Peptides = {} # (TrieIndex,ProteinID) -> Peptide sequences
+ #self.ProteinNames = {} #(TrieIndex,ProteinID) -> ProteinName
+
+ #Populated only if parsimony
+ self.SelectedProteins = {}
+ self.HeaderLine = None
+
+
+ def Main(self):
+
+ #Load peptides
+ for FileName in self.InputFiles:
+ print "Loading peptides from %s"%FileName
+ self.LoadPeptides(FileName)
+ if self.DoParsimony == 1 or self.DoParsimonyAndGroup == 1:
+ self.ChoosePeptides()
+ for FileName in self.InputFiles:
+ self.WritePeptides(FileName)
+
+
+ #We assume that if multiple peptides are listed for a spectrum, then they appear together in the file and in decreasing order of
+ #confidence
+ def LoadPeptides(self,FileName):
+ #print "Loading peptides!!"
+ #raw_input()
+ LocalDebug = 0
+
+ RawFileName = os.path.basename(FileName)
+
+ File = open(FileName,'r')
+ #OutputFile = os.path.join(self.OutputDir,RawFileName)
+ #OutFile = open(OutputFile ,'w')
+ PeptideList = {}
+
+ PrevSpectrum = None
+
+
+ for Line in File:
+ Line = Line.strip()
+ if Line == "":
+ continue
+ if Line[0] == "#":
+ self.HeaderLine = Line
+ continue
+
+ if LocalDebug:
+ print Line
+
+ Bits = Line.split("\t")
+ ModdedPeptide = Bits[self.Columns.getIndex("Annotation")]
+ Peptide = Utils.GetPeptideFromModdedName(ModdedPeptide).Aminos
+
+ #if Peptide == "YGPLLDLPELPFPELER":
+ # LocalDebug = 1
+ #else:
+ # LocalDebug = 0
+ CurrSpectrum = (Bits[0],Bits[1])
+ if CurrSpectrum == PrevSpectrum:
+ continue
+ PrevSpectrum = CurrSpectrum
+
+ #Update the spectral count for this peptide
+ if not self.Peptide2SpectralCount.has_key(Peptide):
+ self.Peptide2SpectralCount[Peptide] = 0
+ self.Peptide2SpectralCount[Peptide] += 1
+
+ if LocalDebug:
+ print self.Peptide2SpectralCount[Peptide]
+ #raw_input()
+
+ if self.Peptide2ProteinID.has_key(Peptide):
+ if LocalDebug:
+ print "Already searched for peptide %s"%Peptide
+ raw_input()
+ continue
+
+ else:
+ PeptideList[Peptide] = 1
+ if LocalDebug:
+ print "Searching %s for the first time!"%Peptide
+
+
+ #See if we've reached enough peptides to search
+ if len(PeptideList.keys()) >= TrieUtils.MIN_TRIE_SEARCH_SIZE:
+ if LocalDebug:
+ print "Reached %s peptides to search"%(len(PeptideList.keys()))
+ #raw_input()
+
+ #Loop through each trie file that we have
+ for TrieIndex in range(0,len(self.TrieFiles)):
+ if LocalDebug:
+ print "Searching trieDB: %s"%(self.TrieFiles[TrieIndex])
+ Locations = self.TUtils.GetAllLocations(PeptideList.keys(),self.TrieFiles[TrieIndex])
+ if LocalDebug:
+ print "Finished searching"
+ #For each peptide that we searched, add its locations and proteins
+ for Pep in PeptideList.keys():
+ #if Pep == "AAGARPLTSPESLSR" or Pep == "GFFDPNTHENLTYLQLLR" or Pep == "EMAVPDVHLPDVQLPK" or Pep == "YGPLLDLPELPFPELER":
+ # LocalDebug = 1
+ #else:
+ # LocalDebug = 0
+ if LocalDebug:
+ print Pep
+ print "Total Locations: %s"%(len(Locations[Pep]))
+ if len(Locations[Pep]) == 0:
+ print "No locations found for %s"%Pep
+ continue
+ if not self.Peptide2ProteinID.has_key(Pep):
+ self.Peptide2ProteinID[Pep] = []
+ for (ID,Res) in Locations[Pep]:
+
+ if LocalDebug:
+ print "%s appear in %s at pos %s"%(Pep,ID,Res)
+ #Get the protein name
+ #if not self.ProteinNames.has_key((TrieIndex,ID)):
+ ProteinName = self.TUtils.GetProteinName(self.IndexFiles[TrieIndex],ID)
+
+ #else:
+ # ProteinName = self.ProteinNames[(TrieIndex,ID)]
+ if LocalDebug:
+ print "Hit to protine %s"%ProteinName
+ if ProteinName[0:3] == "XXX": #Skip hits to reverse DB
+ continue
+ if ProteinName.find(DELIM) >= 0:
+ print "ERROR: Protein %s contains delim %s"%(ProteinName,DELIM)
+ sys.exit(0)
+ #self.ProteinNames[(TrieIndex,ID)] = ProteinName
+
+ if self.Peptide2ProteinID[Pep].count((TrieIndex,ID)) == 0:
+ self.Peptide2ProteinID[Pep].append((TrieIndex,ID))
+
+ if not self.Protein2Peptides.has_key((TrieIndex,ID)):
+ self.Protein2Peptides[(TrieIndex,ID)] = []
+ if self.Protein2Peptides[(TrieIndex,ID)].count(Pep) == 0:
+ self.Protein2Peptides[(TrieIndex,ID)].append(Pep)
+
+ PeptideList = {}
+ if len(PeptideList.keys()) > 0:
+ if LocalDebug:
+ print "Reached %s peptides to search"%(len(PeptideList.keys()))
+
+ for TrieIndex in range(0,len(self.TrieFiles)):
+ Locations = self.TUtils.GetAllLocations(PeptideList.keys(),self.TrieFiles[TrieIndex])
+
+ for Pep in PeptideList.keys():
+ if LocalDebug:
+ print Pep
+
+ if len(Locations[Pep]) == 0:
+ print "No locations found for %s"%Pep
+ continue
+ if not self.Peptide2ProteinID.has_key(Pep):
+ self.Peptide2ProteinID[Pep] = []
+
+ for (ID,Res) in Locations[Pep]:
+
+ #Get the protein name
+ #if not self.ProteinNames.has_key((TrieIndex,ID)):
+ ProteinName = self.TUtils.GetProteinName(self.IndexFiles[TrieIndex],ID)
+
+ #else:
+ # ProteinName = self.ProteinNames[(TrieIndex,ID)]
+
+ if ProteinName[0:3] == "XXX": #Skip hits to reverse DB
+ continue
+ if ProteinName.find(DELIM) >= 0:
+ print "ERROR: Protein %s contains delim %s"%(ProteinName,DELIM)
+ sys.exit(0)
+ #self.ProteinNames[(TrieIndex,ID)] = ProteinName
+
+ if self.Peptide2ProteinID[Pep].count((TrieIndex,ID)) == 0:
+ self.Peptide2ProteinID[Pep].append((TrieIndex,ID))
+
+ if not self.Protein2Peptides.has_key((TrieIndex,ID)):
+ self.Protein2Peptides[(TrieIndex,ID)] = []
+ if self.Protein2Peptides[(TrieIndex,ID)].count(Pep) == 0:
+ self.Protein2Peptides[(TrieIndex,ID)].append(Pep)
+
+
+
+
+
+ File.close()
+
+
+ def ChoosePeptides(self):
+
+ LocalDebug = 1
+ if LocalDebug:
+ print "Total peptides: %s"%(len(self.Peptide2ProteinID.keys()))
+ print "Total protiens: %s"%(len(self.Protein2Peptides.keys()))
+
+
+ ProteinCounts = {}
+ for (TrieIndex,ID) in self.Protein2Peptides.keys():
+ SpecCount = 0
+ for Pep in self.Protein2Peptides[(TrieIndex,ID)]:
+ SpecCount += self.Peptide2SpectralCount[Pep]
+ ProteinCounts[(TrieIndex,ID)] = (len(self.Protein2Peptides[(TrieIndex,ID)]),SpecCount)
+
+ self.SelectedProteins = {}
+ self.FinalPeptideProteins = {} #peptide -> final protein selection
+
+ while (1):
+ BestCandidate = None
+ BestScore = None
+ BestProteinName = None
+ #Find the next best protein (best = most peptides)
+ for (TrieIndex,ProteinID) in ProteinCounts.keys():
+
+ #We've already added this guy
+ if self.SelectedProteins.has_key((TrieIndex,ProteinID)):
+ continue
+ Score = ProteinCounts[(TrieIndex,ProteinID)]
+ CurrProteinName = self.TUtils.GetProteinName(self.IndexFiles[TrieIndex],ProteinID)
+
+ if Score > BestScore or BestScore == None or (Score == BestScore and CurrProteinName < BestProteinName):
+ BestScore = Score
+ BestCandidate = (TrieIndex,ProteinID)
+ BestProteinName = CurrProteinName
+ #print "New Best %s, score %s"%(ProteinID,BestScore)
+ if not BestScore:
+ break
+ (PeptideCount, SpectrumCount) = BestScore
+ if PeptideCount == 0:
+ break
+ #%%%
+ ProteinName = BestProteinName
+ print "Accept protein %s (%s)\n Gets %s peptides, %s spectra"%(BestCandidate, ProteinName, PeptideCount, SpectrumCount)
+ self.SelectedProteins[BestCandidate] = BestScore
+ # Lay claim to all the (not-yet-claimed) peptides:
+ for Peptide in self.Protein2Peptides[BestCandidate]:
+ if LocalDebug:
+ print " Grab %s spectra from peptide %s"%(self.Peptide2SpectralCount[Peptide], Peptide)
+ self.FinalPeptideProteins[Peptide] = BestCandidate
+ # Other proteins (if not already accepted) lose a peptide, and some spectra:
+ for (OtherTrieIndex,OtherID) in self.Peptide2ProteinID[Peptide]:
+ if self.SelectedProteins.has_key((OtherTrieIndex,OtherID)):
+ continue
+ #if LocalDebug:
+ # print "Removing spectra from other Protein %s/%s (%s)"%(OtherTrieIndex,OtherID,self.ProteinNames[(OtherTrieIndex,OtherID)])
+ (pCount,sCount) = ProteinCounts[(OtherTrieIndex,OtherID)]
+
+ if LocalDebug:
+ print "Old counts: %s peptides %s spectra"%(pCount,sCount)
+ self.Protein2Peptides[(OtherTrieIndex,OtherID)].remove(Peptide)
+ pCount -= 1
+ sCount -= self.Peptide2SpectralCount[Peptide]
+ if LocalDebug:
+ print "New counts: %s peptides %s spectra"%(pCount,sCount)
+ ProteinCounts[(OtherTrieIndex, OtherID)] = (pCount,sCount)
+ # Sanity check - the selected proteins have peptides, the unselected proteins have 0
+ for Protein in self.Protein2Peptides.keys():
+ #ProteinName = self.ProteinNames[Protein]
+ PeptideCount = len(self.Protein2Peptides[Protein])
+ SpectrumCount = ProteinCounts.get(Protein, 0)
+ if self.SelectedProteins.has_key(Protein) and PeptideCount <= 0:
+ print "** Warning: Selected protein %s has %s peptides!"%(Protein, PeptideCount)
+ if not self.SelectedProteins.has_key(Protein) and PeptideCount != 0:
+ print "** Warning: Unelected protein %s has %s peptides!"%(Protein, PeptideCount)
+
+
+
+ def WritePeptides(self,FileName):
+ RawFileName = os.path.basename(FileName)
+ InputFile = open(FileName,'r')
+ OutputFile = os.path.join(self.OutputDir,RawFileName)
+ OutFile = open(OutputFile ,'w')
+
+ MissCount = 0
+ LineCount = 0
+ for Line in InputFile:
+ Line = Line.strip()
+ if Line == "":
+ continue
+ if Line[0] == "#":
+ OutFile.write(Line + "\n")
+ continue
+ Bits = Line.split("\t")
+ ModdedPeptide = Bits[self.Columns.getIndex("Annotation")]
+ Peptide = Utils.GetPeptideFromModdedName(ModdedPeptide).Aminos
+
+
+ #See if we are doing parsimony
+ if self.DoParsimonyAndGroup == 1:
+ if not self.Peptide2ProteinID.has_key(Peptide) or len(self.Peptide2ProteinID[Peptide]) == 0:
+ print "ERROR: Peptide %s of %s has no locations!!!"%(Peptide,Line)
+ MissCount += 1
+ continue
+
+ if not self.FinalPeptideProteins.has_key(Peptide):
+ print "ERROR: Peptide %s of %s has no selected protein!!!"%(Peptide,Line)
+ print "Formerly found in:"
+ for (TrieIndex,ID) in self.Peptide2ProteinID[Peptide]:
+ print "(%s,%s)"%(TrieIndex,ID)
+ continue
+ Protein = self.FinalPeptideProteins[Peptide]
+ (TrieIndex,ProtID) = Protein
+
+ #Add other proteins
+
+
+
+ Locations = self.Peptide2ProteinID[Peptide]
+ LocStr = self.TUtils.GetProteinName(self.IndexFiles[TrieIndex],ProtID)
+ for Prot in Locations:
+ if Prot != Protein:
+ (TIndex,PID) = Prot
+ LocStr += DELIM + self.TUtils.GetProteinName(self.IndexFiles[TIndex],PID)
+ Bits[self.Columns.getIndex("Protein")] = LocStr
+
+ elif self.DoParsimony == 1:
+ if not self.FinalPeptideProteins.has_key(Peptide):
+ print "ERROR: Peptide %s of %s has no selected protein!!!"%(Peptide,Line)
+ MissCount += 1
+ continue
+ Protein = self.FinalPeptideProteins[Peptide]
+ (TrieIndex,ProtID) = Protein
+ Bits[self.Columns.getIndex("Protein")] = self.TUtils.GetProteinName(self.IndexFiles[TrieIndex],ProtID)
+ Bits[self.Columns.getIndex("RecordNumber")] = str(Protein[1])
+ else:
+ if not self.Peptide2ProteinID.has_key(Peptide) or len(self.Peptide2ProteinID[Peptide]) == 0:
+ print "ERROR: Peptide %s of %s has no locations!!!"%(Peptide,Line)
+ MissCount += 1
+ continue
+
+ Locations = self.Peptide2ProteinID[Peptide]
+ LocStr = ""
+ for Prot in Locations:
+ (TIndex,PID) = Prot
+ LocStr += self.TUtils.GetProteinName(self.IndexFiles[TIndex],PID) + DELIM
+ LocStr = LocStr[0:-1*len(DELIM)]
+
+ Bits[self.Columns.getIndex("Protein")] = LocStr
+ Str = "\t".join(Bits)
+ OutFile.write("%s\n"%Str)
+ #print Str
+ LineCount += 1
+
+ print "Total peptides omitted: %s"%MissCount
+ print "Wrote %s lines to %s"%(LineCount,OutputFile)
+ OutFile.close()
+ InputFile.close()
+
+
+
+ def ParseCommandLine(self,Arguments):
+ (Options,Args) = getopt.getopt(Arguments,"r:w:t:pa")
+ OptionsSeen = {}
+ for (Option,Value) in Options:
+ OptionsSeen[Option] = 1
+
+ if Option == "-r":
+ if not os.path.exists(Value):
+ print "ERROR: %s is not a valid file or directory"%Value
+ sys.exit(0)
+
+ if not os.path.isdir(Value):
+ self.InputFiles = [Value]
+
+ else:
+ Files = os.listdir(Value)
+ self.InputFiles = []
+ for FileName in Files:
+ self.InputFiles.append(os.path.join(Value,FileName))
+
+ elif Option == "-w":
+ if not os.path.exists(Value):
+ os.makedirs(Value)
+ self.OutputDir = Value
+ elif Option == "-t":
+ if not os.path.isfile(Value):
+ print "ERROR: %s is not a valid database file"%Value
+ sys.exit(0)
+
+ IndexFileName = os.path.splitext(Value)[0] + ".index"
+ if not os.path.isfile(IndexFileName):
+ print "ERROR: Unable to find index file %s for trie file %s"%(IndexFileName,Value)
+ sys.exit(0)
+ self.TrieFiles.append(Value)
+ self.IndexFiles.append(IndexFileName)
+ elif Option == "-p":
+ self.DoParsimony = 1
+ elif Option == "-a":
+ self.DoParsimonyAndGroup = 1
+
+ else:
+ print "ERROR %s is not a valid argument"%Option
+
+ if not OptionsSeen.has_key("-r") or not OptionsSeen.has_key("-w") or not OptionsSeen.has_key("-t"):
+ print "ERROR: Missing arguments"
+ print UsageInfo
+ sys.exit(0)
+
+
+if __name__ == "__main__":
+ Grouper = ProteinGrouper()
+ Grouper.ParseCommandLine(sys.argv[1:])
+ Grouper.Main()
diff --git a/PyInspect.pyd b/PyInspect.pyd
new file mode 100644
index 0000000..bfe8d19
Binary files /dev/null and b/PyInspect.pyd differ
diff --git a/PyInspect/PyInspect.c b/PyInspect/PyInspect.c
new file mode 100644
index 0000000..e05aa65
--- /dev/null
+++ b/PyInspect/PyInspect.c
@@ -0,0 +1,661 @@
+//Title: PyInspect.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include "Python.h"
+#include "Trie.h"
+#include "Score.h"
+#include "PySpectrum.h"
+#include "BN.h"
+#include "PyUtils.h"
+#include "Errors.h"
+#include "FreeMod.h"
+#include "IonScoring.h"
+#include "SVM.h"
+#include "LDA.h"
+#include "ParseInput.h"
+#include "ParseXML.h"
+#include "TagFile.h"
+
+PyObject* InspectError;
+PRMBayesianModel* InteractiveBN = NULL;
+
+PyObject* PyResetIonScoring(PyObject* self, PyObject* args)
+{
+ int IntensityScheme;
+ float IntensityRadius;
+ int CutFlag = 0;
+ int NoiseModelFlag = 0;
+ //
+ if (!PyArg_ParseTuple(args, "if|ii", &IntensityScheme, &IntensityRadius, &CutFlag, &NoiseModelFlag))
+ {
+ return NULL;
+ }
+ FreePRMBayesianModel(InteractiveBN);
+ InteractiveBN = (PRMBayesianModel*)calloc(1, sizeof(PRMBayesianModel));
+ InteractiveBN->NoiseModel = NoiseModelFlag;
+ InteractiveBN->IntensityScheme = IntensityScheme;
+ switch (InteractiveBN->IntensityScheme)
+ {
+ case 0:
+ case 1:
+ case 4:
+ InteractiveBN->MinIntensityLevel = 3;
+ break;
+ case 2:
+ case 3:
+ InteractiveBN->MinIntensityLevel = 2;
+ break;
+ default:
+ REPORT_ERROR(0);
+ break;
+ }
+
+ InteractiveBN->IntensityRadius = (int)(IntensityRadius * DALTON);
+ InteractiveBN->HalfIntensityRadius = InteractiveBN->IntensityRadius / 2;
+ InteractiveBN->CutFlag = CutFlag;
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+PyObject* PyAddIonScoringNode(PyObject* self, PyObject* args)
+{
+ int NodeType;
+ int NodeInfoA;
+ char* Name;
+ int FragmentType = evFragmentTypeNone;
+ float NodeMassOffset;
+ if (!PyArg_ParseTuple(args, "sii|fi", &Name, &NodeType, &NodeInfoA, &NodeMassOffset, &FragmentType))
+ {
+ return NULL;
+ }
+ // Add a node:
+ AddPRMBayesianNode(InteractiveBN, Name, NodeType, NodeInfoA, NodeMassOffset, FragmentType);
+ // Return the node's index:
+ return PyInt_FromLong(InteractiveBN->NodeCount - 1);
+}
+
+PyObject* PySetIonScoringNodeParents(PyObject* self, PyObject* args)
+{
+ int NodeIndex;
+ PyObject* ParentIndexList;
+ PRMBayesianNode* Node;
+ PRMBayesianNode* Parent;
+ int OverallBlockSize;
+ int ParentIndex;
+ int OtherParentIndex;
+ //
+ if (!PyArg_ParseTuple(args, "iO", &NodeIndex, &ParentIndexList))
+ {
+ return NULL;
+ }
+ // Validate input:
+ if (NodeIndex < 0 || NodeIndex >= InteractiveBN->NodeCount)
+ {
+ sprintf(PythonErrorString, "Illegal node index %d in SetIonScoringNodeParents", NodeIndex);
+ ReportPythonError();
+ return NULL;
+ }
+ Node = InteractiveBN->Nodes[NodeIndex];
+
+ // Free the OLD parents, if any:
+ SafeFree(Node->Parents);
+ SafeFree(Node->ParentBlocks);
+
+ // Set the parents of this node:
+ Node->ParentCount = PyList_Size(ParentIndexList);
+ if (Node->ParentCount)
+ {
+ Node->Parents = (PRMBayesianNode**)calloc(sizeof(PRMBayesianNode*), Node->ParentCount);
+ Node->ParentBlocks = (int*)calloc(sizeof(int), Node->ParentCount);
+ }
+ for (ParentIndex = 0; ParentIndex < Node->ParentCount; ParentIndex++)
+ {
+ Parent = InteractiveBN->Nodes[PyInt_AsLong(PyList_GetItem(ParentIndexList, ParentIndex))];
+ Node->Parents[ParentIndex] = Parent;
+ }
+ // Set the parent block sizes. Node->ParentBlocks[n] is the node's own ValueCount times
+ // the value counts of parents n+1 and beyond (so ParentBlocks[ParentCount - 1] equals
+ // ValueCount). These blocks are the strides used when indexing into the count and
+ // probability tables.
+ for (ParentIndex = 0; ParentIndex < Node->ParentCount; ParentIndex++)
+ {
+ OverallBlockSize = Node->ValueCount;
+ for (OtherParentIndex = ParentIndex + 1; OtherParentIndex < Node->ParentCount; OtherParentIndex++)
+ {
+ OverallBlockSize *= Node->Parents[OtherParentIndex]->ValueCount;
+ }
+ Node->ParentBlocks[ParentIndex] = OverallBlockSize;
+ Parent = Node->Parents[ParentIndex];
+ OverallBlockSize /= Parent->ValueCount;
+ }
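+ // Worked example (illustrative values, not taken from any model): for a node with
+ // ValueCount = 2 and two parents with 3 and 4 values, ParentBlocks[0] = 4 * 2 = 8 and
+ // ParentBlocks[1] = 2, so TableSize = 3 * 8 = 24 and the entry for (parent0 = a,
+ // parent1 = b, value = v) lives at index a*8 + b*2 + v.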
+ // Allocate probability tables:
+ Node->TableSize = Node->ValueCount;
+ if (Node->ParentCount)
+ {
+ Node->TableSize = (Node->Parents[0]->ValueCount * Node->ParentBlocks[0]);
+ }
+ SafeFree(Node->CountTable);
+ Node->CountTable = (int*)calloc(Node->TableSize, sizeof(int));
+ SafeFree(Node->ProbTable);
+ Node->ProbTable = (float*)calloc(Node->TableSize, sizeof(float));
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+void TrainNoiseModelRandomMasses(PRMBayesianModel* Model, MSSpectrum* Spectrum)
+{
+ int Bin;
+ int BinIndex;
+ //
+ for (BinIndex = 0; BinIndex < 20; BinIndex++)
+ {
+ Bin = rand() % Spectrum->IntensityBinCount;
+ Model->RandomIntensityCounts[Spectrum->BinnedIntensityLevels[Bin]]++;
+ }
+}
+
+PyObject* PyTrainBNOnSpectrum(PyObject* self, PyObject* args)
+{
+ PySpectrum* SpectrumObject;
+ char* PeptideAnnotation;
+ Peptide* Match;
+ int PRM = 0;
+ int AminoCount;
+ int AminoIndex;
+ int ModIndex;
+ int NodeIndex;
+ int TableIndex;
+ int ParentIndex;
+ MSSpectrum* Spectrum;
+ PRMBayesianNode* Node;
+ //
+ if (!PyArg_ParseTuple(args, "Os", &SpectrumObject, &PeptideAnnotation))
+ {
+ return NULL;
+ }
+ Match = GetPeptideFromAnnotation(PeptideAnnotation);
+ if (!Match)
+ {
+ REPORT_ERROR(0);
+ return NULL;
+ }
+ Spectrum = SpectrumObject->Spectrum;
+ // Force the spectrum's parent mass to match the right parent mass:
+ Spectrum->ParentMass = Match->ParentMass;
+ PrepareSpectrumForIonScoring(InteractiveBN, Spectrum, 1);
+ AminoCount = strlen(Match->Bases);
+ for (NodeIndex = 0, Node = InteractiveBN->Head; Node; NodeIndex++, Node = Node->Next)
+ {
+ PRM = 0;
+ for (AminoIndex = 0; AminoIndex <= AminoCount; AminoIndex++)
+ {
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // Set values, and accumulate table entries:
+ Node->Values[AminoIndex] = IonScoringGetNodeValue(InteractiveBN, Node, Spectrum, PRM, Match, AminoIndex);
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // Accumulate PRM from the prefix so far:
+ if (AminoIndex == AminoCount)
+ {
+ break;
+ }
+ PRM += PeptideMass[Match->Bases[AminoIndex]];
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Match->AminoIndex[ModIndex] == AminoIndex)
+ {
+ PRM += Match->ModType[ModIndex]->RealDelta;
+ }
+ }
+ } // Amino loop
+ } // NodeIndex loop
+
+ // Iterate over the values arrays, accumulate counts in the frequency tables:
+ for (NodeIndex = 0; NodeIndex < InteractiveBN->NodeCount; NodeIndex++)
+ {
+ for (AminoIndex = 0; AminoIndex <= AminoCount; AminoIndex++)
+ {
+ Node = InteractiveBN->Nodes[NodeIndex];
+ TableIndex = 0;
+ for (ParentIndex = 0; ParentIndex < Node->ParentCount; ParentIndex++)
+ {
+ TableIndex += Node->Parents[ParentIndex]->Values[AminoIndex] * Node->ParentBlocks[ParentIndex];
+ }
+ TableIndex += Node->Values[AminoIndex];
+ if (TableIndex >= Node->TableSize)
+ {
+ // Panic!
+ REPORT_ERROR(0);
+ TableIndex = 0;
+ }
+ Node->CountTable[TableIndex]++;
+ }
+ }
+
+ // And, count how frequent the various intensity levels are for a random mass:
+ TrainNoiseModelRandomMasses(InteractiveBN, Spectrum);
+
+ // Cleanup:
+ FreePeptideNode(Match);
+
+ // Return:
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+PyObject* PyDebugPrintPRMBayesianModel(PyObject* self, PyObject* args)
+{
+ if (!PyArg_ParseTuple(args, ""))
+ {
+ return NULL;
+ }
+
+ DebugPrintPRMBayesianModel(InteractiveBN);
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+// Save InteractiveBN to disk:
+PyObject* PySaveBNModel(PyObject* self, PyObject* args)
+{
+ char* FileName;
+ if (!PyArg_ParseTuple(args, "s", &FileName))
+ {
+ return NULL;
+ }
+
+ SavePRMBayesianModel(InteractiveBN, FileName);
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+// Load InteractiveBN from file; return node count
+PyObject* PyLoadBNModel(PyObject* self, PyObject* args)
+{
+ char* FileName;
+ if (!PyArg_ParseTuple(args, "s", &FileName))
+ {
+ return NULL;
+ }
+
+ FreePRMBayesianModel(InteractiveBN);
+ InteractiveBN = LoadPRMBayesianModel(FileName);
+ if (!InteractiveBN)
+ {
+ return PyInt_FromLong(-1);
+ }
+ return PyInt_FromLong(InteractiveBN->NodeCount);
+}
+
+// Convert the COUNT tables of the bayesian network into PROBABILITY tables.
+PyObject* PyComputeBNProbabilityTables(PyObject* self, PyObject* args)
+{
+ if (!PyArg_ParseTuple(args, ""))
+ {
+ return NULL;
+ }
+ ComputePRMBayesianModelProbabilityTables(InteractiveBN, 1);
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+PyObject* PyComputeMutualInformation(PyObject* self, PyObject* args)
+{
+ PyObject* ReturnList;
+ PyObject* NodeEntropyList;
+ PRMBayesianNode* Node;
+ PRMBayesianNode* Parent;
+ float EntropySum[256];
+ int TableIndex;
+ float Entropy;
+ float JointEntropy;
+ int ParentIndex;
+ int TempIndex;
+ int ParentValue;
+ float Probability;
+ int Value;
+ int ValueIndex;
+ int NodeIndex;
+ float NodeEntropy[512];
+ float MutualInformation;
+ int FullTableCount;
+ if (!PyArg_ParseTuple(args, ""))
+ {
+ return NULL;
+ }
+
+ ReturnList = PyList_New(0);
+
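+ // A brief note on the quantities computed below (standard definitions, added for
+ // clarity): for a node X with parent P, the marginal entropy is
+ // H(X) = -sum_x p(x) log p(x), the joint entropy H(X,P) is computed the same way
+ // over the shared count table, and the mutual information reported is
+ // I(X;P) = H(X) + H(P) - H(X,P), also printed normalized by H(X).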
+ // Compute the entropy of each node:
+ for (NodeIndex = 0, Node = InteractiveBN->Head; Node; Node = Node->Next, NodeIndex++)
+ {
+ ////////////////////////////////////////////////////////////////
+ // Compute the node's entropy:
+ memset(EntropySum, 0, sizeof(float) * 256);
+ FullTableCount = 0;
+ for (TableIndex = 0; TableIndex < Node->TableSize; TableIndex++)
+ {
+ FullTableCount += Node->CountTable[TableIndex];
+ }
+ for (TableIndex = 0; TableIndex < Node->TableSize; TableIndex++)
+ {
+ Probability = Node->CountTable[TableIndex] / (float)FullTableCount;
+ Value = TableIndex % Node->ValueCount;
+ EntropySum[Value] += Probability;
+ }
+ Entropy = 0;
+ for (ValueIndex = 0; ValueIndex < Node->ValueCount; ValueIndex++)
+ {
+ printf("Node %d %s value %d: Odds %.6f\n", NodeIndex, Node->Name, ValueIndex, EntropySum[ValueIndex]);
+ if (EntropySum[ValueIndex] > 0.0)
+ {
+ Entropy -= EntropySum[ValueIndex] * (float)log(EntropySum[ValueIndex]);
+ }
+ }
+ NodeEntropy[NodeIndex] = Entropy;
+ }
+ for (NodeIndex = 0, Node = InteractiveBN->Head; Node; Node = Node->Next, NodeIndex++)
+ {
+ NodeEntropyList = PyList_New(0);
+ PyList_Append(ReturnList, NodeEntropyList);
+ PyList_Append(NodeEntropyList, PyInt_FromLong(NodeIndex));
+ PyList_Append(NodeEntropyList, PyString_FromString(Node->Name));
+ PyList_Append(NodeEntropyList, PyFloat_FromDouble(NodeEntropy[NodeIndex]));
+ ////////////////////////////////////////////////////////////////
+ // Compute the node's joint entropy with each parent:
+ for (ParentIndex = 0; ParentIndex < Node->ParentCount; ParentIndex++)
+ {
+ // EntropySum[ParentValue*Block+Value] = probability that parent has ParentValue
+ // and node has Value
+ Parent = Node->Parents[ParentIndex];
+ memset(EntropySum, 0, sizeof(float) * 256);
+ for (TableIndex = 0; TableIndex < Node->TableSize; TableIndex++)
+ {
+ TempIndex = TableIndex;
+ if (ParentIndex)
+ {
+ TempIndex = TempIndex % Node->ParentBlocks[ParentIndex - 1];
+ }
+ ParentValue = TempIndex / Node->ParentBlocks[ParentIndex];
+ Probability = Node->CountTable[TableIndex] / (float)FullTableCount;
+ //Probability = (float)exp(Node->ProbTable[TableIndex]);
+ Value = TableIndex % Node->ValueCount;
+ EntropySum[ParentValue * Node->ValueCount + Value] += Probability;
+ }
+ JointEntropy = 0;
+ for (ValueIndex = 0; ValueIndex < (Node->ValueCount * Parent->ValueCount); ValueIndex++)
+ {
+ ParentValue = ValueIndex / Node->ValueCount;
+ printf("Node %d %s value %d and parent (%d %s) has value %d: Odds %.6f\n", NodeIndex,
+ Node->Name,
+ ValueIndex % Node->ValueCount, Parent->Index, Parent->Name, ParentValue, EntropySum[ValueIndex]);
+ if (EntropySum[ValueIndex] > 0.0)
+ {
+ JointEntropy -= EntropySum[ValueIndex] * (float)log(EntropySum[ValueIndex]);
+ }
+ }
+ MutualInformation = (NodeEntropy[NodeIndex] + NodeEntropy[Parent->Index] - JointEntropy);
+ printf("Node %d(%s) parent %d(%s):\n", NodeIndex, Node->Name, Parent->Index, Parent->Name);
+ printf(" Child entropy %.6f, parent entropy %.6f\n", NodeEntropy[NodeIndex], NodeEntropy[Parent->Index]);
+ printf(" Joint entropy: %.6f\n", JointEntropy);
+ printf(" Mutual information: %.6f\n", MutualInformation);
+ printf(" Conditional entropy (Child|Parent): %.6f\n", JointEntropy - NodeEntropy[Parent->Index]);
+ printf(" Normalized MI: %.6f\n", MutualInformation / NodeEntropy[NodeIndex]);
+ PyList_Append(NodeEntropyList, PyFloat_FromDouble(MutualInformation / NodeEntropy[NodeIndex]));
+ }
+ Py_DECREF(NodeEntropyList);
+ }
+ return ReturnList;
+}
+
+PyObject* PyGetBNFeatureNames(PyObject* self, PyObject* args)
+{
+ PyObject* ReturnList;
+ PRMBayesianNode* Node;
+ //
+ if (!PyArg_ParseTuple(args, ""))
+ {
+ return NULL;
+ }
+ ReturnList = PyList_New(0);
+ for (Node = InteractiveBN->Head; Node; Node = Node->Next)
+ {
+ PyList_Append(ReturnList, PyString_FromString(Node->Name));
+ }
+ return ReturnList;
+}
+
+PyObject* PyComputeBNValuesForSpectrum(PyObject* self, PyObject* args)
+{
+ PySpectrum* SpectrumObject;
+ char* PeptideAnnotation;
+ Peptide* Match;
+ int PRM = 0;
+ int AminoCount;
+ int AminoIndex;
+ int ModIndex;
+ int NodeIndex;
+ MSSpectrum* Spectrum;
+ PRMBayesianNode* Node;
+ PyObject* ReturnList;
+ PyObject* NodeValueList;
+ //
+ if (!PyArg_ParseTuple(args, "Os", &SpectrumObject, &PeptideAnnotation))
+ {
+ return NULL;
+ }
+ Match = GetPeptideFromAnnotation(PeptideAnnotation);
+ if (!Match)
+ {
+ REPORT_ERROR(0);
+ return NULL;
+ }
+ Spectrum = SpectrumObject->Spectrum;
+ // Force the spectrum's parent mass to match the right parent mass:
+ Spectrum->ParentMass = Match->ParentMass;
+ PrepareSpectrumForIonScoring(InteractiveBN, Spectrum, 1);
+ AminoCount = strlen(Match->Bases);
+ for (NodeIndex = 0, Node = InteractiveBN->Head; Node; NodeIndex++, Node = Node->Next)
+ {
+ PRM = 0;
+ for (AminoIndex = 0; AminoIndex <= AminoCount; AminoIndex++)
+ {
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // Set values, and accumulate table entries:
+ Node->Values[AminoIndex] = IonScoringGetNodeValue(InteractiveBN, Node, Spectrum, PRM, Match, AminoIndex);
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // Add to PRM:
+ if (AminoIndex == AminoCount)
+ {
+ break;
+ }
+ PRM += PeptideMass[Match->Bases[AminoIndex]];
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Match->AminoIndex[ModIndex] == AminoIndex)
+ {
+ PRM += Match->ModType[ModIndex]->RealDelta;
+ }
+ }
+ } // Amino loop
+ } // NodeIndex loop
+
+ // Iterate over the values arrays, building the return-list.
+ ReturnList = PyList_New(0);
+ for (AminoIndex = 0; AminoIndex <= AminoCount; AminoIndex++)
+ {
+ NodeValueList = PyList_New(0);
+ PyList_Append(ReturnList, NodeValueList);
+ for (NodeIndex = 0; NodeIndex < InteractiveBN->NodeCount; NodeIndex++)
+ {
+ Node = InteractiveBN->Nodes[NodeIndex];
+ PyList_Append(NodeValueList, PyInt_FromLong(Node->Values[AminoIndex]));
+ //Py_DECREF(NodeValueList);
+ }
+ }
+
+ // Cleanup:
+ FreePeptideNode(Match);
+
+ // Return:
+ return ReturnList;
+}
+
+PyObject* PyFinishIonScoringNetwork(PyObject* self, PyObject* args)
+{
+ if (!PyArg_ParseTuple(args, ""))
+ {
+ return NULL;
+ }
+
+ // Perform any activities necessary to finalizing InteractiveBN:
+ BuildModelFlankList(InteractiveBN);
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+PyObject* PyReloadPMC(PyObject* self, PyObject* args)
+{
+ //
+ if (!PyArg_ParseTuple(args, "|"))
+ {
+ return NULL;
+ }
+
+ // Reload parent mass correction and charge-correction models:
+
+#ifdef PMC_USE_SVM
+ LoadPMCSVM(1);
+#else
+ LoadPMCLDA(1);
+#endif
+
+#ifdef CC_USE_SVM
+ LoadCCModelSVM(1);
+#else
+ LoadCCModelLDA(1);
+#endif
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+static PyMethodDef PyInspectMethods[] =
+{
+ {"ResetIonScoring", PyResetIonScoring, 1, "Reset the ion scoring network"},
+ {"AddIonScoringNode", PyAddIonScoringNode, 1, "Add a node to the ion scoring network"},
+ {"SetIonScoringNodeParents", PySetIonScoringNodeParents, 1, "Set the parent(s) of an ion scoring node"},
+ {"FinishIonScoringNetwork", PyFinishIonScoringNetwork, 1, "Finalize an ion scoring network"},
+ {"TrainBNOnSpectrum", PyTrainBNOnSpectrum, 1, "Accumulate counts for network nodes, given a spectrum and peptide"},
+ {"DebugPrintBNModel", PyDebugPrintPRMBayesianModel, 1, "Debug print"},
+ {"SaveBNModel", PySaveBNModel, 1, "Save model to a binary file"},
+ {"LoadBNModel", PyLoadBNModel, 1, "Load from binary file (as written by SaveBNModel)"},
+ {"ComputeBNProbabilityTables", PyComputeBNProbabilityTables, 1, "Compute probability tables for a BNModel"},
+ {"ComputeBNValuesForSpectrum", PyComputeBNValuesForSpectrum, 1, "Compute values for nodes in the BNModel"},
+ {"GetBNFeatureNames", PyGetBNFeatureNames, 1, "Return a list of names of nodes in the bayesian network"},
+ {"ComputeMutualInformation", PyComputeMutualInformation, 1, "Compute MutualInformation for nodes and their parents"},
+ {"ReloadPMC", PyReloadPMC, 1, "Reset PMC / CC models"},
+ //{"erf", PyErrorFunction, METH_VARARGS, "return the error function erf(x)"},
+ //{"GammaIncomplete", PyGammaIncomplete, METH_VARARGS, "return the incomplete gamma function g(a, x)"},
+ //{"foo", ex_foo, 1, "foo() doc string"},
+ {NULL, NULL}
+};
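+
+// Illustrative Python-side sketch of this module's use (the file names and peptide
+// annotation below are made up; only calls visible in the method table above and in
+// initPyInspect are shown):
+//   >>> import PyInspect
+//   >>> PyInspect.LoadBNModel("PRM2.bn")              # returns the node count, or -1 on failure
+//   >>> spectrum = PyInspect.Spectrum("Example.ms2")  # optional second argument: byte offset
+//   >>> PyInspect.ComputeBNValuesForSpectrum(spectrum, "R.SAMPLEPEPTIDEK.A")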
+
+// Cleanup, called by Python when unloading. Deallocate memory:
+void PyInspectCleanup(void)
+{
+ FreeMassDeltaByMass();
+ FreeMassDeltas();
+ FreeIsSubDecoration();
+ //FreeTaggingModel();
+ FreeJumpingHash();
+ FreeSVMModels();
+ FreeBayesianModels();
+ FreeLDAModels();
+ FreePRMBayesianModel(InteractiveBN);
+ InteractiveBN = NULL;
+ SafeFree(GlobalOptions);
+ GlobalOptions = NULL;
+ FreeCCModelSVM();
+ FreeTagSkewScores();
+ SafeFree(MassDeltaByIndex);
+ MassDeltaByIndex = NULL;
+ FreeMZXMLParseCursor();
+ FreeMZDataParseCursor();
+
+}
+
+PyMODINIT_FUNC initPyInspect(void)
+{
+ PyObject* Module;
+ ////////////////////
+ PySpectrumType.tp_new = PyType_GenericNew;
+ if (PyType_Ready(&PySpectrumType) < 0)
+ {
+ return;
+ }
+ Module = Py_InitModule("PyInspect", PyInspectMethods);
+
+ // Add the Error object:
+ InspectError = PyErr_NewException("Inspect.error", NULL, NULL);
+ Py_INCREF(InspectError);
+ PyModule_AddObject(Module, "error", InspectError);
+ InitErrors();
+
+ // Add the Spectrum object:
+ PyModule_AddObject(Module, "Spectrum", (PyObject *)&PySpectrumType);
+
+ // Create an ion scoring network, for interactive use:
+ InteractiveBN = (PRMBayesianModel*)calloc(1, sizeof(PRMBayesianModel));
+
+ // Perform some standard loading here, like amino acid masses.
+ AllocMassDeltaByIndex();
+ InitOptions();
+ sprintf(GlobalOptions->ResourceDir, ".%c", SEPARATOR);
+ LoadPeptideMasses(NULL);
+ PeptideMass['C'] += 57000; // ASSUMED: All cysteines carry the +57 modification.
+ LoadMassDeltas(NULL, 0);
+ InitBayesianModels();
+ SetTagSkewScores();
+ //LoadFlankingAminoEffects();
+ //LoadCCModel();
+#ifdef MQSCORE_USE_SVM
+ InitPValueSVM();
+#else
+ InitPValueLDA();
+#endif
+ PopulateJumpingHash();
+
+ // Set the blind-flag to TRUE so that modified peptides
+ // incur a score-penalty:
+ GlobalOptions->RunMode |= RUN_MODE_BLIND;
+ // Register our cleanup function to run at exit:
+ Py_AtExit(PyInspectCleanup);
+}
diff --git a/PyInspect/PySpectrum.c b/PyInspect/PySpectrum.c
new file mode 100644
index 0000000..6e650a8
--- /dev/null
+++ b/PyInspect/PySpectrum.c
@@ -0,0 +1,1265 @@
+//Title: PySpectrum.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+// PySpectrum: Python wrapper for an MSSpectrum object.
+#include "CMemLeak.h"
+#include "PySpectrum.h"
+#include "PyUtils.h"
+#include "Trie.h"
+#include "Tagger.h"
+#include "Score.h"
+#include "Mods.h"
+#include "CMemLeak.h"
+#include "FreeMod.h"
+#include "BN.h"
+#include "ChargeState.h"
+#include "Scorpion.h"
+#include "SVM.h"
+#include "IonScoring.h"
+#include "Errors.h"
+#include "TagFile.h"
+
+// Important note: These type objects MUST be defined here in the .c file, not in the header!
+// Otherwise, a copy is built for each including file, and these copies are not
+// updated during module initialization. (The result is that MSPeak objects instantiated
+// from code don't have their members set right, so their attributes can't be accessed)
+PyTypeObject PySpectrumType =
+{
+ PyObject_HEAD_INIT(NULL)
+ 0, //ob_size
+ "PyInspect.PySpectrum", //tp_name
+ sizeof(PySpectrum), //tp_basicsize
+ 0, //tp_itemsize
+ PySpectrumDealloc, //tp_dealloc
+ 0, //tp_print
+ 0, //tp_getattr
+ 0, //tp_setattr
+ 0, //tp_compare
+ 0, //tp_repr
+ 0, //tp_as_number
+ 0, //tp_as_sequence
+ 0, //tp_as_mapping
+ 0, //tp_hash
+ 0, //tp_call
+ 0, //tp_str
+ 0, //tp_getattro
+ 0, //tp_setattro
+ 0, //tp_as_buffer
+ Py_TPFLAGS_DEFAULT, //tp_flags
+ "MS spectrum", // tp_doc
+ 0, // tp_traverse
+ 0, // tp_clear
+ 0, // tp_richcompare
+ 0, // tp_weaklistoffset
+ 0, // tp_iter
+ 0, // tp_iternext
+ PySpectrumMethods, // tp_methods
+ PySpectrumMembers, // tp_members
+ PySpectrumGetSet, // tp_getset
+ 0, // tp_base
+ 0, // tp_dict
+ 0, // tp_descr_get
+ 0, // tp_descr_set
+ 0, // tp_dictoffset
+ (initproc)PySpectrumInit, // tp_init
+ 0, // tp_alloc
+ PySpectrumNew, // tp_new
+};
+
+extern PRMBayesianModel* InteractiveBN; // lives in PyInspect.c
+
+TrieTag* TagGraphGenerateTags(TagGraph* Graph, MSSpectrum* Spectrum, int* TagCount,
+ int MaximumTagCount, SpectrumTweak* Tweak, float TagEdgeScoreMultiplier,
+ PRMBayesianModel* Model);
+
+// __new__ method of PySpectrum; call this in C code to create new PySpectrum objects.
+// (It's expected that PySpectrumInit gets called too)
+PyObject* PySpectrumNew(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+ PySpectrum* self;
+ //
+ self = (PySpectrum*)type->tp_alloc(type, 0);
+ if (self != NULL)
+ {
+ // Perform non-parameterized initialization here.
+ //memset(self->NumberToMSIndex, -1, MAX_MS_SCAN);
+ }
+ return (PyObject*)self;
+}
+
+// Called when Inspect.Spectrum() is instantiated from Python code.
+// Parse the specified spectrum file!
+PyObject* PySpectrumInit(PySpectrum* self, PyObject* args, PyObject* kwds)
+{
+ char* FilePath;
+ int FilePosition = 0; // Default: byte offset 0
+ SpectrumNode* Node;
+ FILE* SpectrumFile;
+ int LoadResult;
+
+
+ //
+ // Constructor argument: the path to a mass-spec run output file
+ if (!PyArg_ParseTuple(args, "s|i", &FilePath, &FilePosition))
+ {
+ return (PyObject*)-1;
+ }
+ Node = (SpectrumNode*)calloc(1, sizeof(SpectrumNode));
+
+ Node->FilePosition = FilePosition;
+
+ Node->ScanNumber = 0;
+ Node->InputFile = (InputFileNode*)calloc(1, sizeof(InputFileNode));
+ strncpy(Node->InputFile->FileName, FilePath, MAX_FILENAME_LEN);
+ strncpy(self->FileName, FilePath, MAX_FILENAME_LEN);
+ // Guess the file format:
+ Node->InputFile->Format = GuessSpectrumFormatFromExtension(FilePath);
+
+ Node->Spectrum = (MSSpectrum*)calloc(1, sizeof(MSSpectrum));
+ Node->Spectrum->Node = Node;
+
+ // Special case: If it's a .ms2 extension, it could be "colon format" or standard MS2:
+ if (Node->InputFile->Format == SPECTRUM_FORMAT_MS2_COLONS)
+ {
+ Node->InputFile->Format = GuessSpectrumFormatFromHeader(FilePath, Node->Spectrum);
+ }
+ SpectrumFile = fopen(FilePath, "rb");
+ if (!SpectrumFile)
+ {
+ sprintf(PythonErrorString, "** Error: Unable to open spectrum file '%s'\n", FilePath);
+ ReportPythonError();
+ // In an Init function, we must return -1 to indicate that an object can't
+ // be created. (Normally we return NULL for failure!)
+ return (PyObject*)-1;
+ }
+
+ fseek(SpectrumFile, Node->FilePosition, 0);
+ LoadResult = SpectrumLoadFromFile(Node->Spectrum, SpectrumFile);
+ fclose(SpectrumFile);
+ if (!LoadResult)
+ {
+ sprintf(PythonErrorString, "** Error: Unable to parse spectrum from %s:%d\n", FilePath, Node->FilePosition);
+ ReportPythonError();
+ // In an Init function, we must return -1 to indicate that an object can't
+ // be created. (Normally we return NULL for failure!)
+ return (PyObject*)-1;
+ }
+
+ WindowFilterPeaks(Node->Spectrum, 0, 0);
+
+ IntensityRankPeaks(Node->Spectrum);
+
+ self->Spectrum = Node->Spectrum;
+ TweakSpectrum(Node);
+
+ return 0;
+}
+
+int GuessCharge(SpectrumNode* Node, int MatchMass)
+{
+ int BestDiff = -1;
+ int BestCharge;
+ int Charge;
+ int Mass;
+ int Diff;
+ //
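+ // Try each charge z from 1 to 4, compute the candidate parent mass
+ // MZ * z - (z - 1) * HYDROGEN_MASS, and keep the charge whose candidate mass
+ // lies closest to MatchMass.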
+ for (Charge = 1; Charge < 5; Charge++)
+ {
+ Mass = Node->Spectrum->MZ * Charge - (Charge - 1) * HYDROGEN_MASS;
+ Diff = abs(MatchMass - Mass);
+ if (BestDiff < 0 || Diff < BestDiff)
+ {
+ BestDiff = Diff;
+ BestCharge = Charge;
+ }
+ }
+ return BestCharge;
+}
+
+// Helper function for both LabelPeaks() and Score(); the code paths overlap
+PyObject* ScoreHelper(PySpectrum* self, char* PeptideString, int Charge, int VerboseFlag, int LabelPeaksFlag, int ReturnScoreDetails)
+{
+ Peptide* Match;
+ float MQScore;
+ SpectrumNode* Node;
+ int PeakIndex;
+ SpectralPeak* Peak;
+ PyObject* LabeledPeakList;
+ PyObject* LabeledPeakTuple;
+ PyListObject* List;
+ int FeatureIndex;
+ //
+
+ Node = self->Spectrum->Node;
+ Match = GetPeptideFromAnnotation(PeptideString);
+ if (!Match)
+ {
+ sprintf(PythonErrorString, "** Error: Unable to parse peptide annotation '%s'\n", PeptideString);
+ ReportPythonError();
+ Py_INCREF(Py_None);
+ return Py_None;
+ }
+ if (!Charge)
+ {
+ Charge = GuessCharge(Node, Match->ParentMass);
+ }
+ Node->Tweaks[0].ParentMass = Match->ParentMass;
+ Node->Tweaks[0].Charge = Charge;
+ Match->Tweak = Node->Tweaks;
+ Node->Spectrum->Charge = Charge;
+ GlobalOptions->DigestType = DIGEST_TYPE_TRYPSIN;
+ ComputeMQScoreFeatures(Node->Spectrum, Match, Match->ScoreFeatures, VerboseFlag);
+
+#ifdef MQSCORE_USE_SVM
+ MQScore = SVMComputeMQScore(Node->Spectrum, Match, Match->ScoreFeatures);
+#else
+ MQScore = LDAComputeMQScore(Node->Spectrum, Match, Match->ScoreFeatures);
+#endif
+
+ if (VerboseFlag)
+ {
+ // Print out the ion-type categorization of all peaks:
+ printf("\n");
+ printf("Score %s on spectrum %s:%d\n", PeptideString, Node->InputFile->FileName, Node->FilePosition);
+ for (PeakIndex = 0; PeakIndex < Node->Spectrum->PeakCount; PeakIndex++)
+ {
+ Peak = Node->Spectrum->Peaks + PeakIndex;
+ printf("%.2f\t%.2f\t%s\t%d\t\n", Peak->Mass / (float)MASS_SCALE, Peak->Intensity, GetFragmentTypeName(Peak->IonType), Peak->AminoIndex);
+ }
+ }
+
+ if (LabelPeaksFlag)
+ {
+ // Return a list of peaks.
+ LabeledPeakList = PyList_New(0);
+ for (PeakIndex = 0; PeakIndex < Node->Spectrum->PeakCount; PeakIndex++)
+ {
+ Peak = Node->Spectrum->Peaks + PeakIndex;
+ LabeledPeakTuple = Py_BuildValue("ffsi", Peak->Mass / (float)MASS_SCALE, Peak->Intensity, GetFragmentTypeName(Peak->IonType), Peak->AminoIndex);
+ PyList_Append(LabeledPeakList, LabeledPeakTuple);
+ // The call to PyList_Append has incremented the refcount of the
+ // tuple to 2. We're abandoning our reference to the tuple now,
+ // so we decrease its reference count:
+ Py_DECREF(LabeledPeakTuple);
+ LabeledPeakTuple = NULL; // just to be explicit about it!
+ }
+ FreePeptideNode(Match);
+ return LabeledPeakList;
+ }
+ else
+ {
+ if (ReturnScoreDetails)
+ {
+ List = (PyListObject*)PyList_New(0);
+ PyList_Append((PyObject*)List, PyFloat_FromDouble(MQScore));
+ for (FeatureIndex = 0; FeatureIndex < 7; FeatureIndex++)
+ {
+ PyList_Append((PyObject*)List, PyFloat_FromDouble(Match->ScoreFeatures[FeatureIndex]));
+ }
+ FreePeptideNode(Match);
+ //Py_DECREF(List); // Important to do this!
+ return (PyObject*)List;
+ }
+ else
+ {
+ FreePeptideNode(Match);
+ return PyFloat_FromDouble(MQScore);
+ }
+ }
+}
+
+PyObject* PySpectrumLabelPeaks(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ char* PeptideString;
+ int Charge = 0;
+ int VerboseFlag = 0;
+ //
+ if (!PyArg_ParseTuple(args, "s|ii", &PeptideString, &Charge, &VerboseFlag))
+ {
+ return NULL;
+ }
+ return ScoreHelper(self, PeptideString, Charge, VerboseFlag, 1, 0);
+}
+
+PyObject* PySpectrumScore(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ char* PeptideString;
+ int Charge = 0;
+ int VerboseFlag = 0;
+ if (!PyArg_ParseTuple(args, "s|ii", &PeptideString, &Charge, &VerboseFlag))
+ {
+ return NULL;
+ }
+ return ScoreHelper(self, PeptideString, Charge, VerboseFlag, 0, 0);
+}
+
+PyObject* PySpectrumScoreDetailed(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ char* PeptideString;
+ int Charge = 0;
+ int VerboseFlag = 0;
+ if (!PyArg_ParseTuple(args, "s|ii", &PeptideString, &Charge, &VerboseFlag))
+ {
+ return NULL;
+ }
+ return ScoreHelper(self, PeptideString, Charge, VerboseFlag, 0, 1);
+}
+
+
+// Deallocate a PySpectrum object.
+void PySpectrumDealloc(PyObject* selfobject)
+{
+ PySpectrum* self = (PySpectrum*)selfobject;
+ if (self)
+ {
+ if (self->Spectrum)
+ {
+ // The PySpectrum object wraps a Spectrum object, but also a SpectrumNode and an InputFileNode.
+ // So, free those as well:
+ if (self->Spectrum->Node->InputFile)
+ {
+ free(self->Spectrum->Node->InputFile);
+ self->Spectrum->Node->InputFile = NULL;
+ }
+ if (self->Spectrum->Node)
+ {
+ FreeSpectrumNode(self->Spectrum->Node);
+ }
+ else
+ {
+ FreeSpectrum(self->Spectrum);
+ }
+ if (self->MatchList)
+ {
+ Py_DECREF(self->MatchList);
+ }
+ }
+ self->ob_type->tp_free((PyObject*)self);
+ }
+}
+
+PyObject* PySpectrumGetPeakCount(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ if (!PyArg_ParseTuple(args, ""))
+ {
+ return NULL;
+ }
+ return Py_BuildValue("i", self->Spectrum->PeakCount);
+}
+
+//PyObject* PySpectrumGetCharge(PySpectrum* self, PyObject* args, PyObject* kwargs)
+//{
+// if (!PyArg_ParseTuple(args, ""))
+// {
+// return NULL;
+// }
+// return Py_BuildValue("i", self->Spectrum->Charge);
+//}
+
+
+// Run parent mass correction. Return a list of tuples of the
+// form (Mass, ModelScore).
+PyObject* PySpectrumCorrectParentMass(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ int CorrectChargeTemp = -1;
+ int PMToleranceTemp = -1;
+ int VerboseFlag = 0;
+ PMCSpectrumInfo* SpectrumInfo;
+ PyObject* FeatureTupleList;
+ PyObject* FeatureTuple;
+ PMCInfo* Info;
+ MSSpectrum* Spectrum = self->Spectrum;
+
+ //
+ if (!PyArg_ParseTuple(args, "|ii", &PMToleranceTemp, &CorrectChargeTemp))
+ {
+ return NULL;
+ }
+
+ SpectrumInfo = GetPMCSpectrumInfo(Spectrum);
+ PerformPMC(SpectrumInfo);
+
+ FeatureTupleList = PyList_New(0);
+ for (Info = SpectrumInfo->Head; Info; Info = Info->Next)
+ {
+ FeatureTuple = PyTuple_New(2);
+ PyTuple_SetItem(FeatureTuple, 0, PyFloat_FromDouble(Info->ParentMass / (float)MASS_SCALE));
+ PyTuple_SetItem(FeatureTuple, 1, PyFloat_FromDouble(Info->SVMScore));
+ PyList_Append(FeatureTupleList, FeatureTuple);
+ }
+
+ //return ScoreHelper(self, PeptideString, Charge, VerboseFlag, 0, 0);
+ FreePMCSpectrumInfo(SpectrumInfo);
+ return FeatureTupleList;
+}
+
+// Function to assist in training and testing parent mass correction. Given a spectrum,
+// compute its self-convolution features, and return them as a list of tuples.
+PyObject* PySpectrumGetPMCFeatures(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ PMCSpectrumInfo* SpectrumInfo;
+ PMCInfo* Info;
+ int PMCCount;
+ PyObject* FeatureTupleList;
+ PyObject* FeatureTuple;
+ int FeatureIndex;
+ int PMCFeatureCount = 64;
+ //
+
+ SpectrumInfo = GetPMCSpectrumInfo(self->Spectrum);
+ ComputePMCFeatures(SpectrumInfo);
+
+ // Count the PMCInfo nodes:
+ PMCCount = 0;
+ for (Info = SpectrumInfo->Head; Info; Info = Info->Next)
+ {
+ PMCCount++;
+ }
+ // Return a list of features.
+ FeatureTupleList = PyList_New(0);
+ for (Info = SpectrumInfo->Head; Info; Info = Info->Next)
+ {
+ FeatureTuple = PyTuple_New(PMCFeatureCount + 1);
+ PyTuple_SetItem(FeatureTuple, 0, PyFloat_FromDouble(Info->ParentMass / (float)MASS_SCALE));
+ for (FeatureIndex = 0; FeatureIndex < PMCFeatureCount; FeatureIndex++)
+ {
+ PyTuple_SetItem(FeatureTuple, FeatureIndex + 1, PyFloat_FromDouble(Info->Features[FeatureIndex]));
+ }
+ PyList_Append(FeatureTupleList, FeatureTuple);
+ Py_DECREF(FeatureTuple); // Important!
+ }
+ FreePMCSpectrumInfo(SpectrumInfo);
+ return FeatureTupleList;
+}
+
+// Function to assist in training and testing charge correction. Given a spectrum,
+// compute and return the charge-correction features.
+PyObject* PySpectrumGetCCFeatures(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ int Charge1Flag = 0;
+ PMCSpectrumInfo* SpectrumInfo1;
+ PMCSpectrumInfo* SpectrumInfo2;
+ PMCSpectrumInfo* SpectrumInfo3;
+ float CCFeatures[64];
+ int FeatureIndex;
+ PyObject* FeatureTuple;
+ //
+ if (!PyArg_ParseTuple(args, "i", &Charge1Flag))
+ {
+ return NULL;
+ }
+ memset(CCFeatures, 0, sizeof(float) * 64);
+ /////////////////////////////////
+ // Charge 1 PMC:
+ self->Spectrum->Charge = 1;
+ self->Spectrum->ParentMass = (self->Spectrum->MZ * 1);
+ SpectrumInfo1 = GetPMCSpectrumInfo(self->Spectrum);
+ PerformPMC(SpectrumInfo1);
+ /////////////////////////////////
+ // Charge 2 PMC:
+ self->Spectrum->Charge = 2;
+ self->Spectrum->ParentMass = (self->Spectrum->MZ * 2) - HYDROGEN_MASS;
+ SpectrumInfo2 = GetPMCSpectrumInfo(self->Spectrum);
+ PerformPMC(SpectrumInfo2);
+ /////////////////////////////////
+ // Charge 3 PMC:
+ self->Spectrum->Charge = 3;
+ self->Spectrum->ParentMass = (self->Spectrum->MZ * 3) - 2 * HYDROGEN_MASS;
+ SpectrumInfo3 = GetPMCSpectrumInfo(self->Spectrum);
+ PerformPMC(SpectrumInfo3);
+ if (Charge1Flag == 1)
+ {
+ //////////////////////////////////
+ // Get features:
+ GetChargeCorrectionFeatures1(SpectrumInfo1, SpectrumInfo2, SpectrumInfo3, CCFeatures);
+ }
+ else
+ {
+ GetChargeCorrectionFeatures2(SpectrumInfo2, SpectrumInfo3, CCFeatures);
+ }
+ FeatureTuple = PyTuple_New(64);
+ for (FeatureIndex = 0; FeatureIndex < 64; FeatureIndex++)
+ {
+ PyTuple_SetItem(FeatureTuple, FeatureIndex, PyFloat_FromDouble(CCFeatures[FeatureIndex]));
+ }
+
+ FreePMCSpectrumInfo(SpectrumInfo1);
+ FreePMCSpectrumInfo(SpectrumInfo2);
+ FreePMCSpectrumInfo(SpectrumInfo3);
+ return FeatureTuple;
+}
+
+
+PyObject* PySpectrumSetCharge(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ int NewCharge;
+ if (!PyArg_ParseTuple(args, "i", &NewCharge))
+ {
+ return NULL;
+ }
+ self->Spectrum->Charge = NewCharge;
+ // Reset the parent mass, based upon the M/Z:
+ self->Spectrum->ParentMass = (self->Spectrum->MZ * NewCharge) - (NewCharge - 1) * HYDROGEN_MASS;
+ // Set tweaks:
+ TweakSpectrum(self->Spectrum->Node); //sam, comment out for phos pmc
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+PyObject* PySpectrumGetMZ(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ float MZ;
+ //
+ MZ = self->Spectrum->MZ / (float)MASS_SCALE;
+ return PyFloat_FromDouble(MZ);
+}
+
+PyObject* PySpectrumGetParentMass(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ float Mass;
+ //
+ Mass = self->Spectrum->ParentMass / (float)MASS_SCALE;
+ return PyFloat_FromDouble(Mass);
+}
+
+PyObject* PySpectrumSetParentMass(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ float NewParentMass;
+ int Charge = 0;
+ MSSpectrum* Spectrum = self->Spectrum;
+ //
+ if (!PyArg_ParseTuple(args, "f|i", &NewParentMass, &Charge))
+ {
+ return NULL;
+ }
+ Spectrum->ParentMass = (int)(NewParentMass * MASS_SCALE + 0.5);
+ if (Charge)
+ {
+ Spectrum->Charge = Charge;
+ }
+ if (Spectrum->Charge)
+ {
+ Spectrum->MZ = (Spectrum->ParentMass + (Spectrum->Charge - 1) * HYDROGEN_MASS) / Spectrum->Charge;
+ }
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+PyObject* PySpectrumBYConvolve(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ // If TriplyChargedFlag is true, then we're looking for pairs with one singly-charged
+ // peak and one doubly-charged peak. Otherwise we're looking for two singly-charged peaks.
+ int TriplyChargedFlag = 0;
+ int Offset;
+ float FloatOffset = 1.0;
+ float Convolution = 0;
+ int PeakIndex;
+ int OtherMass;
+ int Bin;
+ float Intensity;
+ //
+ MSSpectrum* Spectrum = self->Spectrum;
+ //
+ if (!PyArg_ParseTuple(args, "|fi", &FloatOffset, &TriplyChargedFlag))
+ {
+ return NULL;
+ }
+
+ // Special case: If TriplyChargedFlag is -1, then compute direct self-convolution!
+ // The direct self-convolution (dot product with self) is useful for scaling the
+ // b,y convolutions.
+ if (TriplyChargedFlag < 0)
+ {
+ Convolution = 0;
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ Bin = (Spectrum->Peaks[PeakIndex].Mass + 50) / 100;
+ if (Bin >= 0 && Bin < Spectrum->IntensityBinCount)
+ {
+ Intensity = Spectrum->BinnedIntensitiesTight[Bin];
+ Convolution += Spectrum->Peaks[PeakIndex].Intensity * Intensity; // * PeakScalingFactor;
+ }
+ }
+ return PyFloat_FromDouble(Convolution);
+ }
+ Offset = (int)(FloatOffset * MASS_SCALE + 0.5);
+
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ if (TriplyChargedFlag)
+ {
+ OtherMass = Spectrum->ParentMass + 2 * HYDROGEN_MASS - (2 * Spectrum->Peaks[PeakIndex].Mass) + Offset;
+ }
+ else
+ {
+ OtherMass = Spectrum->ParentMass + HYDROGEN_MASS - Spectrum->Peaks[PeakIndex].Mass + Offset;
+ }
+ Bin = ((OtherMass + 50) / 100);
+ if (Bin < 0 || Bin >= Spectrum->IntensityBinCount)
+ {
+ continue;
+ }
+ Convolution += Spectrum->Peaks[PeakIndex].Intensity * Spectrum->BinnedIntensitiesTight[Bin];
+ }
+ return PyFloat_FromDouble(Convolution);
+}
+
+PyObject* PySpectrumCorrectCharge(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ // If TriplyChargedFlag is true, then we're looking for pairs with one singly-charged
+ // peak and one doubly-charged peak. Otherwise we're looking for two singly-charged peaks.
+ int TriplyChargedFlag = 0;
+ float FloatOffset = 1.0;
+ float Convolution = 0;
+ int Result;
+ int ReturnScoresFlag = 0;
+ float Model1Score;
+ float Model2Score;
+ //
+ MSSpectrum* Spectrum = self->Spectrum;
+ //
+ if (!PyArg_ParseTuple(args, "|i", &ReturnScoresFlag))
+ {
+ return NULL;
+ }
+ Result = ChargeCorrectSpectrum(self->Spectrum->Node, &Model1Score, &Model2Score);
+ if (ReturnScoresFlag)
+ {
+ return Py_BuildValue("ff", Model1Score, Model2Score);
+ }
+ else
+ {
+ return PyInt_FromLong(Result);
+ }
+}
+
+void PySpectrumReportTagsFromTrie(PyObject* TagList, TrieNode* Root)
+{
+ TrieTagHanger* Hanger;
+ TrieTag* Tag;
+ int ChildIndex;
+ TrieNode* Node;
+ PyObject* TagTuple;
+ //
+ for (Hanger = Root->FirstTag; Hanger; Hanger = Hanger->Next)
+ {
+ Tag = Hanger->Tag;
+ TagTuple = Py_BuildValue("fsf", Tag->PrefixMass / (float)DALTON,
+ Tag->Tag, Tag->SuffixMass / (float)DALTON);
+ PyList_Append(TagList, TagTuple);
+ }
+
+ for (ChildIndex = 0; ChildIndex < AMINO_ACIDS; ChildIndex++)
+ {
+ if (ChildIndex == 'I' - 'A' || ChildIndex == 'Q' - 'A')
+ {
+ continue;
+ }
+ Node = Root->Children[ChildIndex];
+ if (Node)
+ {
+ PySpectrumReportTagsFromTrie(TagList, Node);
+ }
+ }
+}
+
+int WriteTagsToList(TrieNode* Root, TrieTag* Tags, int MaxCount, int CurrentCount)
+{
+ //int RunningTotal;
+ int ChildIndex;
+ TrieNode* Child;
+ TrieTagHanger* Hanger;
+ //
+ if (CurrentCount >= MaxCount)
+ {
+ return CurrentCount;
+ }
+
+ for (ChildIndex = 0; ChildIndex < AMINO_ACIDS; ChildIndex++)
+ {
+ if (ChildIndex == 'I' - 'A' || ChildIndex == 'Q' - 'A')
+ {
+ continue;
+ }
+
+ Child = Root->Children[ChildIndex];
+ if (Child)
+ {
+ CurrentCount = WriteTagsToList(Child, Tags, MaxCount, CurrentCount);
+ }
+ }
+ for (Hanger = Root->FirstTag; Hanger; Hanger = Hanger->Next)
+ {
+ memcpy(Tags + CurrentCount, Hanger->Tag, sizeof(TrieTag));
+ CurrentCount++;
+ if (CurrentCount >= MaxCount)
+ {
+ return CurrentCount;
+ }
+ }
+ return CurrentCount;
+}
+
+// Generate tags for this spectrum. Optionally, use the new PRM scoring model
+// to do so.
+PyObject* PySpectrumGenerateTags(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ int CustomScoringModelFlag = 0;
+ int Charge;
+ MSSpectrum* Spectrum;
+ SpectrumTweak* Tweak;
+ TrieTag* Tags;
+ PyObject* TagList;
+ PyObject* TagTuple;
+ int TagCount;
+ PRMBayesianModel* Model;
+ int MaximumTagCount = 200;
+ int TagIndex;
+ TrieTag* Tag;
+ TrieNode* Root = NULL;
+ float TagEdgeScoreMultiplier = 1.0;
+ //static TrieTag* SortedFilteredTagList = NULL;
+ //
+ if (!PyArg_ParseTuple(args, "i|if", &Charge, &CustomScoringModelFlag, &TagEdgeScoreMultiplier))
+ {
+ return NULL;
+ }
+ Spectrum = self->Spectrum;
+ Spectrum->Charge = Charge;
+ TweakSpectrum(Spectrum->Node);
+ Tweak = Spectrum->Node->Tweaks + (Charge - 1) * 2;
+ if (Spectrum->Graph)
+ {
+ FreeTagGraph(Spectrum->Graph);
+ }
+ Spectrum->Graph = ConstructTagGraph(Spectrum);
+ TagGraphAddNodes(Spectrum->Graph, Spectrum);
+ // Look up the correct PRM scoring model:
+ if (CustomScoringModelFlag)
+ {
+ Model = InteractiveBN;
+ }
+ else
+ {
+ // Use the current production model to score the nodes:
+ if (Spectrum->Charge > 2)
+ {
+ Model = TAGModelCharge3;
+ }
+ else
+ {
+ Model = TAGModelCharge2;
+ }
+ }
+ PrepareSpectrumForIonScoring(Model, Spectrum, 1);
+ // Score PRMs using this model:
+ TagGraphScorePRMNodes(Model, Spectrum->Graph, Spectrum, Tweak);
+
+ TagGraphPopulateEdges(Spectrum->Graph);
+ Tags = TagGraphGenerateTags(Spectrum->Graph, Spectrum, &TagCount, MaximumTagCount, Tweak, TagEdgeScoreMultiplier, InteractiveBN);
+
+ // Note: This array of tags may have some duplicates. Let's just build a trie and
+ // use THAT for our output!
+ Root = BuildTrieFromTags(Tags, TagCount, Root, MaximumTagCount);
+ //if (!SortedFilteredTagList)
+ //{
+ // SortedFilteredTagList = (TrieTag*)calloc(500, sizeof(TrieTag));
+ //}
+ TagCount = WriteTagsToList(Root, Tags, 500, 0);
+ qsort(Tags, TagCount, sizeof(TrieTag), (QSortCompare)CompareTagScores);
+ TagList = PyList_New(0);
+ for (TagIndex = 0; TagIndex < TagCount; TagIndex++)
+ {
+ Tag = Tags + TagIndex;
+ TagTuple = Py_BuildValue("fsffii", Tag->PrefixMass / (float)DALTON,
+ Tag->Tag, Tag->SuffixMass / (float)DALTON, Tag->Score,
+ Tag->TotalSkew, Tag->TotalAbsSkew);
+ PyList_Append(TagList, TagTuple);
+ }
+
+ //PySpectrumReportTagsFromTrie(TagList, Root);
+ FreeTrieNode(Root);
+
+ return TagList;
+}
+
+PyObject* PySpectrumGetPRMScore(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ float Mass;
+ int CustomScoringModelFlag = 0;
+ PRMBayesianModel* Model;
+ SpectrumTweak* Tweak;
+ SpectrumNode* SpecNode;
+ int IntMass;
+ float Score;
+ int VerboseFlag = 0;
+ //
+ if (!PyArg_ParseTuple(args, "f|ii", &Mass, &CustomScoringModelFlag, &VerboseFlag))
+ {
+ return NULL;
+ }
+ if (CustomScoringModelFlag)
+ {
+ Model = InteractiveBN;
+ }
+ else
+ {
+ if (self->Spectrum->Charge < 3)
+ {
+ Model = PRMModelCharge2;
+ }
+ else
+ {
+ Model = PRMModelCharge3;
+ }
+ }
+ PrepareSpectrumForIonScoring(Model, self->Spectrum, 1);
+ IntMass = (int)(Mass * MASS_SCALE);
+ SpecNode = self->Spectrum->Node;
+ if (SpecNode->Tweaks[2].Charge)
+ {
+ Tweak = SpecNode->Tweaks + 2;
+ }
+ else if (SpecNode->Tweaks[4].Charge)
+ {
+ Tweak = SpecNode->Tweaks + 4;
+ }
+ else
+ {
+ Tweak = SpecNode->Tweaks;
+ }
+ Score = GetIonPRMFeatures(self->Spectrum, Tweak, Model, IntMass, VerboseFlag);
+ return PyFloat_FromDouble(Score);
+ //GetIonPRMFeatures(self->Spectrum, Tweak, Model, Mass, 1);
+}
+
+PyObject* PySpectrumPlotPRMScores(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ char* FileName;
+ FILE* PlotFile;
+ int PRM;
+ float Score;
+ SpectrumTweak* Tweak;
+ PRMBayesianModel* Model;
+ int UseCustomModelFlag = 0;
+ //
+ if (!PyArg_ParseTuple(args, "s|i", &FileName, &UseCustomModelFlag))
+ {
+ return NULL;
+ }
+ if (UseCustomModelFlag)
+ {
+ Model = InteractiveBN;
+ }
+ else
+ {
+ if (self->Spectrum->Charge < 3)
+ {
+ Model = PRMModelCharge2;
+ }
+ else
+ {
+ Model = PRMModelCharge3;
+ }
+ }
+
+ PlotFile = fopen(FileName, "wb");
+ if (!PlotFile)
+ {
+ sprintf(PythonErrorString, "Unable to open '%s'", FileName);
+ ReportPythonError();
+ return NULL;
+ }
+
+ PrepareSpectrumForIonScoring(Model, self->Spectrum, 1);
+ Tweak = self->Spectrum->Node->Tweaks + (self->Spectrum->Charge * 2) - 2;
+ for (PRM = 0; PRM < self->Spectrum->ParentMass; PRM += (DALTON / 10))
+ {
+ Score = GetIonPRMFeatures(self->Spectrum, Tweak, Model, PRM, 0);
+ fprintf(PlotFile, "%.2f\t%.2f\t\n", PRM / (float)DALTON, Score);
+ }
+ fclose(PlotFile);
+ Py_INCREF(Py_None);
+ return Py_None;
+}
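+
+// Output format note (illustrative): the plot file written by
+// PySpectrumPlotPRMScores is plain tab-delimited text with one
+// "<PRM mass in Da>\t<score>" pair per line, sampled every 0.1 Da from 0 up
+// to the parent mass, e.g.
+//     100.00   -2.31
+//     100.10   -2.27
+// (values illustrative only), so it can be plotted directly with gnuplot or
+// a spreadsheet.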
+
+void VerboseReportTopTag(TrieTag* Tag, Peptide* Match, MSSpectrum* Spectrum)
+{
+ int PRM;
+ int AminoIndex;
+ float PRMScore;
+ int ValidFlag;
+ int TagNodeIndex;
+ TagGraphNode* Node;
+ int Diff;
+ int PeptidePRM;
+ int AminoCount;
+ int ModIndex;
+
+ PRM = Tag->PrefixMass;
+ AminoCount = strlen(Match->Bases);
+ printf(" Tag %s %.2f (prefix %.2f, suffix %.2f)\n", Tag->Tag, Tag->Score, Tag->PrefixMass / (float)DALTON, Tag->SuffixMass / (float)DALTON);
+ for (TagNodeIndex = 0; TagNodeIndex < 4; TagNodeIndex++)
+ {
+ // PRM is our node's mass.
+ // First question: Is it correct?
+ ValidFlag = 0;
+ PeptidePRM = 0;
+ for (AminoIndex = 0; AminoIndex < AminoCount; AminoIndex++)
+ {
+ Diff = abs(PeptidePRM - PRM);
+ if (Diff < GlobalOptions->Epsilon)
+ {
+ ValidFlag = 1;
+ break;
+ }
+
+ PeptidePRM += PeptideMass[Match->Bases[AminoIndex]];
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Match->AminoIndex[ModIndex] == AminoIndex)
+ {
+ PeptidePRM += Match->ModType[ModIndex]->RealDelta;
+ }
+ }
+ }
+ // Second question: What's the score of the node?
+ Node = Tag->Nodes[TagNodeIndex];
+ PRMScore = Node->Score;
+ if (ValidFlag)
+ {
+ printf(" [RIGHT]");
+ }
+ else
+ {
+ printf(" [wrong]");
+ }
+ printf(" %.2f %.2f", PRM / (float)DALTON, PRMScore);
+ if (TagNodeIndex < 3)
+ {
+ printf(" -%c- ", Tag->Tag[TagNodeIndex]);
+ }
+ printf("\n");
+ // Increment the PRM.
+ PRM += PeptideMass[Tag->Tag[TagNodeIndex]];
+
+ }
+}
+
+void VerboseReportTrueTagPRMs(Peptide* Match, MSSpectrum* Spectrum)
+{
+ int TruePRM;
+ int PrevPRM;
+ int AminoIndex;
+ int AminoCount;
+ int ModIndex;
+ int Diff;
+ TagGraphNode* BackNode;
+ TagGraphNode* Node;
+ TagGraphNode* BestNode;
+ TagGraphNode* FirstNode;
+ TagGraphNode* OldFirstNode = NULL;
+ TagGraphEdge* BestEdge;
+ float BestEdgeScore;
+ TagGraphEdge* Edge;
+ int BestDiff;
+ float Score;
+ //
+ TruePRM = 0;
+ AminoCount = strlen(Match->Bases);
+ for (AminoIndex = 0; AminoIndex <= AminoCount; AminoIndex++)
+ {
+ BestNode = NULL;
+ FirstNode = NULL;
+
+ // Look for a close node:
+ for (Node = Spectrum->Graph->FirstNode; Node; Node = Node->Next)
+ {
+ Diff = abs(Node->Mass - TruePRM);
+ if (Diff < GlobalOptions->Epsilon)
+ {
+ if (!FirstNode)
+ {
+ FirstNode = Node;
+ }
+ if (!BestNode || Node->Score > BestNode->Score)
+ {
+ BestNode = Node;
+ BestDiff = Diff;
+ }
+ }
+ }
+
+ // Also, is there an edge to the PREVIOUS node?
+ if (TruePRM)
+ {
+ BestEdge = NULL;
+ BestEdgeScore = -9999;
+ for (BackNode = OldFirstNode; BackNode; BackNode = BackNode->Next)
+ {
+ if (BackNode->Mass > PrevPRM + GlobalOptions->Epsilon)
+ {
+ break;
+ }
+ for (Edge = BackNode->FirstEdge; Edge; Edge = Edge->Next)
+ {
+ Diff = abs(Edge->ToNode->Mass - TruePRM);
+ if (Diff < GlobalOptions->Epsilon)
+ {
+ Score = Edge->FromNode->Score + Edge->ToNode->Score;
+ if (Score > BestEdgeScore)
+ {
+ BestEdge = Edge;
+ BestEdgeScore = Score;
+ }
+ }
+ }
+ }
+ if (BestEdge)
+ {
+ printf("-Edge: From %.2f (%.2f) to %.2f (%.2f) via %c (skew %.2f)\n", BestEdge->FromNode->Mass / (float)DALTON,
+ BestEdge->FromNode->Score, BestEdge->ToNode->Mass / (float)DALTON, BestEdge->ToNode->Score,
+ BestEdge->Jump->Amino, BestEdge->Skew / (float)DALTON);
+ }
+ else
+ {
+ printf("-(no edge)\n");
+ }
+ }
+ if (BestNode)
+ {
+ printf("PRM %.2f: Best node score PRM %.2f (diff %.2f) score %.2f\n",
+ TruePRM / (float)DALTON, BestNode->Mass / (float)DALTON, BestDiff / (float)DALTON, BestNode->Score);
+ }
+ else
+ {
+ printf("PRM %.2f: (no node)\n", TruePRM / (float)DALTON);
+ }
+ // Add mass for this aa:
+ OldFirstNode = FirstNode;
+ PrevPRM = TruePRM;
+ TruePRM += PeptideMass[Match->Bases[AminoIndex]];
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Match->AminoIndex[ModIndex] == AminoIndex)
+ {
+ TruePRM += Match->ModType[ModIndex]->RealDelta;
+ }
+ }
+
+ }
+}
+
+PyObject* PySpectrumCheckTagging(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ char* Annotation;
+ Peptide* Match;
+ MSSpectrum* Spectrum;
+ PRMBayesianModel* Model;
+ SpectrumTweak* Tweak;
+ int Charge;
+ int TagCount;
+ int MaximumTagCount = 500;
+ TrieTag* Tags;
+ float TagEdgeScoreMultiplier = 10.0;
+ int TagIndex;
+ //
+ if (!PyArg_ParseTuple(args, "s", &Annotation))
+ {
+ return NULL;
+ }
+ Match = GetPeptideFromAnnotation(Annotation);
+ // Bail out if the match isn't valid:
+ if (!Match)
+ {
+ Py_INCREF(Py_None);
+ return Py_None;
+ }
+ Spectrum = self->Spectrum;
+ Charge = self->Spectrum->Charge;
+ // Look up the correct PRM scoring model:
+ Model = InteractiveBN;
+ PrepareSpectrumForIonScoring(Model, Spectrum, 1);
+ Tweak = Spectrum->Node->Tweaks + (Charge - 1) * 2;
+ // Generate some tags:
+ if (Spectrum->Graph)
+ {
+ FreeTagGraph(Spectrum->Graph);
+ }
+ Spectrum->Graph = ConstructTagGraph(Spectrum);
+ TagGraphAddNodes(Spectrum->Graph, Spectrum);
+ // Score PRMs using this model:
+ TagGraphScorePRMNodes(Model, Spectrum->Graph, Spectrum, Tweak);
+ TagGraphPopulateEdges(Spectrum->Graph);
+ Tags = TagGraphGenerateTags(Spectrum->Graph, Spectrum, &TagCount, MaximumTagCount, Tweak, TagEdgeScoreMultiplier, Model);
+ printf("\nCheck tagging: Peptide %s\n", Annotation);
+ printf("%s:%d\n", self->FileName, Spectrum->Node->FilePosition);
+ ///////////////////////////////////////////////////////////////////////
+ // Report on the tag graph node scores which correspond to the true PRMs. Are
+ // we missing a PRM entirely? Missing an edge entirely? Did we miss a tag
+ // simply because the scores were mediocre?
+ VerboseReportTrueTagPRMs(Match, Spectrum);
+
+ ///////////////////////////////////////////////////////////////////////
+ // Report the top 10 tags. Are they correct? Partially correct?
+ //printf("Top 10 tags (true peptide %s):\n", Annotation);
+ for (TagIndex = 0; TagIndex < min(10, TagCount); TagIndex++)
+ {
+ VerboseReportTopTag(Tags + TagIndex, Match, Spectrum);
+ }
+
+ // Cleanup:
+ FreePeptideNode(Match);
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+PyObject* PySpectrumGetCutScores(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ Peptide* Match;
+ char* Annotation;
+ MSSpectrum* Spectrum;
+ int NodeIndex;
+ int AminoIndex;
+ int PRM;
+ float Score;
+ int AminoCount;
+ int ModIndex;
+ PyObject* ReturnList;
+ PRMBayesianNode* Node;
+ //
+ if (!PyArg_ParseTuple(args, "s", &Annotation))
+ {
+ return NULL;
+ }
+ Spectrum = self->Spectrum;
+ Match = GetPeptideFromAnnotation(Annotation);
+ // Force the spectrum's parent mass to match the right parent mass:
+ Spectrum->ParentMass = Match->ParentMass;
+ PrepareSpectrumForIonScoring(InteractiveBN, Spectrum, 1);
+ AminoCount = strlen(Match->Bases);
+ for (NodeIndex = 0, Node = InteractiveBN->Head; Node; NodeIndex++, Node = Node->Next)
+ {
+ PRM = 0;
+ for (AminoIndex = 0; AminoIndex <= AminoCount; AminoIndex++)
+ {
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // Set values, and accumulate table entries:
+ Node->Values[AminoIndex] = IonScoringGetNodeValue(InteractiveBN, Node, Spectrum, PRM, Match, AminoIndex);
+ ///////////////////////////////////////////////////////////////////////////////////////
+ // Add to PRM:
+ if (AminoIndex == AminoCount)
+ {
+ break;
+ }
+ PRM += PeptideMass[Match->Bases[AminoIndex]];
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Match->AminoIndex[ModIndex] == AminoIndex)
+ {
+ PRM += Match->ModType[ModIndex]->RealDelta;
+ }
+ }
+ } // Amino loop
+ } // NodeIndex loop
+
+ // Iterate over the values arrays, building the return-list.
+ ReturnList = PyList_New(0);
+ for (AminoIndex = 0; AminoIndex <= AminoCount; AminoIndex++)
+ {
+ Score = PRMBNGetCutScore(Spectrum, InteractiveBN, AminoIndex);
+ PyList_Append(ReturnList, PyFloat_FromDouble(Score));
+ }
+
+ // Cleanup:
+ FreePeptideNode(Match);
+
+ // Return:
+ return ReturnList;
+}
+
+PyObject* PySpectrumGetMatchFeatures(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ PyObject* ReturnList;
+ char* Annotation;
+ Peptide* Match;
+ MSSpectrum* Spectrum;
+ float MatchFeatures[256];
+ int FeatureIndex;
+ int FeatureCount;
+ //
+ if (!PyArg_ParseTuple(args, "s", &Annotation))
+ {
+ return NULL;
+ }
+ Spectrum = self->Spectrum;
+ Match = GetPeptideFromAnnotation(Annotation);
+ FeatureCount = GetPeptideMatchFeaturesFull(Spectrum, Match, MatchFeatures);
+ FreePeptideNode(Match);
+ ReturnList = PyList_New(0);
+ for (FeatureIndex = 0; FeatureIndex < FeatureCount; FeatureIndex++)
+ {
+ PyList_Append(ReturnList, PyFloat_FromDouble(MatchFeatures[FeatureIndex]));
+ }
+
+ return ReturnList;
+}
+
+PyObject* PySpectrumPrepareIonScoring(PySpectrum* self, PyObject* args, PyObject* kwargs)
+{
+ int CustomModelFlag = 0;
+ PRMBayesianModel* Model;
+ MSSpectrum* Spectrum = self->Spectrum;
+ if (!PyArg_ParseTuple(args, "i", &CustomModelFlag))
+ {
+ return NULL;
+ }
+ if (CustomModelFlag)
+ {
+ Model = InteractiveBN;
+ }
+ else
+ {
+ if (self->Spectrum->Charge < 3)
+ {
+ Model = PRMModelCharge2;
+ }
+ else
+ {
+ Model = PRMModelCharge3;
+ }
+ }
+ PrepareSpectrumForIonScoring(Model, Spectrum, 1);
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
diff --git a/PyInspect/PySpectrum.h b/PyInspect/PySpectrum.h
new file mode 100644
index 0000000..52bf906
--- /dev/null
+++ b/PyInspect/PySpectrum.h
@@ -0,0 +1,145 @@
+//Title: PySpectrum.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef PY_SPECTRUM_H
+#define PY_SPECTRUM_H
+// PySpectrum: Python wrapper for a Spectrum object.
+#include "Python.h"
+#include "structmember.h"
+#include "Utils.h"
+#include "Spectrum.h"
+#include "TagFile.h"
+
+typedef struct
+{
+ PyObject_HEAD
+ MSSpectrum* Spectrum;
+ char FileName[MAX_FILENAME_LEN];
+ PyObject* MatchList; // list of PyPeptide instances
+ int PrevMass;
+ //struct Peptide* FirstMatch; // list of Peptide instances for matches
+ //struct Peptide* LastMatch;
+} PySpectrum;
+
+extern PyTypeObject PySpectrumType;
+
+void SpectrumSetCharge(MSSpectrum* Spectrum, int Charge);
+PyObject* PySpectrumGetPeakCount(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumInit(PySpectrum* self, PyObject *args, PyObject *kwds);
+PyObject* PySpectrumNew(PyTypeObject *type, PyObject *args, PyObject *kwds);
+PyObject* PySpectrumScore(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumScoreDetailed(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumLabelPeaks(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumCorrectParentMass(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumGetPMCFeatures(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumSetCharge(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumBYConvolve(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumSetParentMass(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumGetParentMass(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumGetCCFeatures(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumCorrectCharge(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumGenerateTags(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumGetPRMScore(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumPlotPRMScores(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumCheckTagging(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumGetCutScores(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumGetMatchFeatures(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumPrepareIonScoring(PySpectrum* self, PyObject* args, PyObject* kwargs);
+PyObject* PySpectrumGetMZ(PySpectrum* self, PyObject* args, PyObject* kwargs);
+
+void PySpectrumDealloc(PyObject* selfobject);
+
+// Methods of the PySpectrum class
+static PyMethodDef PySpectrumMethods[] =
+{
+ {"GetPeakCount", (PyCFunction)PySpectrumGetPeakCount, METH_VARARGS | METH_KEYWORDS,
+ "Return peak count for a scan"},
+ {"ScorePeptide", (PyCFunction)PySpectrumScore, METH_VARARGS | METH_KEYWORDS,
+ "Score a peptide match for this spectrum"},
+ {"ScorePeptideDetailed", (PyCFunction)PySpectrumScoreDetailed, METH_VARARGS | METH_KEYWORDS,
+ "Score a peptide match for this spectrum; return all the scoring features"},
+ {"LabelPeaks", (PyCFunction)PySpectrumLabelPeaks, METH_VARARGS | METH_KEYWORDS,
+ "Label spectrum peaks using a peptide annotation"},
+ {"CorrectParentMass", (PyCFunction)PySpectrumCorrectParentMass, METH_VARARGS | METH_KEYWORDS,
+ "Select correct charge and parent mass for the spectrum"},
+ {"GetPMCFeatures", (PyCFunction)PySpectrumGetPMCFeatures, METH_VARARGS | METH_KEYWORDS,
+ "Compute parent-mass-correction features for the spectrum"},
+ {"SetCharge", (PyCFunction)PySpectrumSetCharge, METH_VARARGS | METH_KEYWORDS,
+ "Adjust the spectrum's charge"},
+ {"BYConvolve", (PyCFunction)PySpectrumBYConvolve, METH_VARARGS | METH_KEYWORDS,
+ "Perform b/y peak convolution"},
+ {"SetParentMass", (PyCFunction)PySpectrumSetParentMass, METH_VARARGS | METH_KEYWORDS,
+ "Set the parent mass"},
+ {"GetParentMass", (PyCFunction)PySpectrumGetParentMass, METH_VARARGS | METH_KEYWORDS,
+ "Returns the parent mass"},
+ {"GetCCFeatures", (PyCFunction)PySpectrumGetCCFeatures, METH_VARARGS | METH_KEYWORDS,
+ "Compute charge correction features for the spectrum"},
+ {"CorrectCharge", (PyCFunction)PySpectrumCorrectCharge, METH_VARARGS | METH_KEYWORDS,
+ "Get the corrected charge for this spectrum"},
+ {"GenerateTags", (PyCFunction)PySpectrumGenerateTags, METH_VARARGS | METH_KEYWORDS,
+ "Generate tags for this spectrum"},
+ {"GetPRMScore", (PyCFunction)PySpectrumGetPRMScore, METH_VARARGS | METH_KEYWORDS,
+ "Get the score for a prefix residue mass (PRM)"},
+ {"PlotPRMScores", (PyCFunction)PySpectrumPlotPRMScores, METH_VARARGS | METH_KEYWORDS,
+ "Output a plot of PRM scores for this spectrum"},
+ {"CheckTagging", (PyCFunction)PySpectrumCheckTagging, METH_VARARGS | METH_KEYWORDS,
+ "Test whether this spectrum can generate a tag for a specified peptide"},
+ {"GetCutScores", (PyCFunction)PySpectrumGetCutScores, METH_VARARGS | METH_KEYWORDS,
+ "Compute cut-point scores for the specified peptide annotation"},
+ {"GetMatchFeatures", (PyCFunction)PySpectrumGetMatchFeatures, METH_VARARGS | METH_KEYWORDS,
+ "Get features for scoring a peptide match"},
+ {"PrepareIonScoring", (PyCFunction)PySpectrumPrepareIonScoring, METH_VARARGS | METH_KEYWORDS,
+ "Force a call to PrepareSpectrumForIonScoring"},
+ {"GetMZ", (PyCFunction)PySpectrumGetMZ, METH_VARARGS | METH_KEYWORDS,
+ "Get the m/z for this spectrum"},
+
+
+ {NULL},
+};
+
+
+// Members (currently none) of the PySpectrum class
+static PyMemberDef PySpectrumMembers[] =
+{
+ {NULL},
+};
+
+// Getters and setters for the PySpectrum class. (Should be used
+// if Python code will be modifying the spectrum dynamically)
+static PyGetSetDef PySpectrumGetSet[] =
+{
+ {NULL} // Sentinel
+};
+
+
+
+#endif // PY_SPECTRUM_H
diff --git a/PyInspect/PyUtils.c b/PyInspect/PyUtils.c
new file mode 100644
index 0000000..0c9b8a2
--- /dev/null
+++ b/PyInspect/PyUtils.c
@@ -0,0 +1,49 @@
+//Title: PyUtils.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include "Python.h"
+#include <stdarg.h>
+#include "CMemLeak.h"
+#include "TagFile.h"
+
+char PythonErrorString[2048];
+extern PyObject* InspectError; // defined in PyInspect.c
+
+FILE* LogFile = NULL;
+
+// Simple interface for error-reporting to Python callers:
+// Print an error to PythonErrorString, then call ReportPythonError().
+void ReportPythonError()
+{
+ PyErr_SetString(InspectError, PythonErrorString);
+}
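+
+// Usage sketch (illustrative; matches the call sites in PySpectrum.c):
+//     sprintf(PythonErrorString, "Unable to open '%s'", FileName);
+//     ReportPythonError();
+//     return NULL; // propagate the newly-set Python exception to the caller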
diff --git a/PyInspect/PyUtils.h b/PyInspect/PyUtils.h
new file mode 100644
index 0000000..584696f
--- /dev/null
+++ b/PyInspect/PyUtils.h
@@ -0,0 +1,39 @@
+//Title: PyUtils.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef PY_UTILS_H
+#define PY_UTILS_H
+
+void ReportPythonError();
+extern char PythonErrorString[2048];
+
+#endif // PY_UTILS_H
diff --git a/PySVM.pyd b/PySVM.pyd
new file mode 100644
index 0000000..d01abd7
Binary files /dev/null and b/PySVM.pyd differ
diff --git a/PySVM/PySVM.c b/PySVM/PySVM.c
new file mode 100644
index 0000000..6790aff
--- /dev/null
+++ b/PySVM/PySVM.c
@@ -0,0 +1,327 @@
+//Title: PySVM.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "Python.h"
+#include "svm.h"
+//#include "TagFile.h"
+
+#define MAX_LINE_LENGTH 2048
+
+struct svm_model* Model;
+double DecisionValues[10]; // hacky
+struct svm_node* SVMFeatures;
+int SVMFeatureAllocation;
+
+// Assume no more than 128 features!
+double MinFeatureValue[128];
+double MaxFeatureValue[128];
+
+static PyObject* ex_foo(PyObject* self, PyObject* args)
+{
+ printf("Hello, world\n");
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+// Copy one line (up to a \r or \n character) from a source buffer to a target buffer.
+// Optionally, strip out spaces. Return the position just AFTER the end of the line.
+// (If a line ends in \r\n, we'll end up processing the line, and then one empty line; that's okay)
+// If a line is very long, we stop copying, and skip over the rest of it.
+int CopyBufferLine(char* Source, int BufferPos, int BufferEnd, char* LineBuffer, int StripWhitespace)
+{
+ int LinePos = 0;
+ int LineComplete = 0;
+ int Chars = 0;
+ int Skipping = 0;
+ //
+ while (!LineComplete)
+ {
+ if (BufferPos > BufferEnd)
+ {
+ // Our line extends off the edge of the buffer. That's probably a Bad Thing.
+ printf("** Warning: Ran off the edge of the buffer in CopyBufferLine. Line too ling?\n");
+ LineBuffer[LinePos] = '\0';
+ return BufferPos;
+ }
+ switch (Source[BufferPos])
+ {
+ case ' ':
+ if (StripWhitespace)
+ {
+ BufferPos++;
+ }
+ else
+ {
+ if (!Skipping)
+ {
+ LineBuffer[LinePos++] = Source[BufferPos];
+ }
+ BufferPos++;
+ Chars++;
+ }
+ break;
+ case '\r':
+ case '\n':
+ LineBuffer[LinePos] = '\0';
+ BufferPos++;
+ LineComplete = 1;
+ break;
+ case '\0':
+ LineBuffer[LinePos] = '\0';
+ LineComplete = 1;
+ break;
+ default:
+ if (!Skipping)
+ {
+ LineBuffer[LinePos++] = Source[BufferPos];
+ }
+ BufferPos++;
+ Chars++;
+ break;
+ }
+ if (Chars == MAX_LINE_LENGTH - 1)
+ {
+ printf("** Error: Line too long! Truncating line.");
+ // Read the rest of the chars, but don't write them:
+ Chars = 0;
+ Skipping = 1;
+ }
+ }
+ return BufferPos;
+}
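+
+// Usage sketch (illustrative only; mirrors the loop in PyLoadScaling below):
+// iterate over a buffer filled by fread(), pulling out one line at a time:
+//
+//     char Line[MAX_LINE_LENGTH];
+//     int Pos = 0;
+//     while (Buffer[Pos])
+//     {
+//         Pos = CopyBufferLine(Buffer, Pos, BufferEnd, Line, 0);
+//         // ...parse Line, e.g. with strtok()...
+//     }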
+
+static PyObject* PyLoadScaling(PyObject* self, PyObject* args)
+{
+ char* FilePath;
+ char* FileText;
+ char LineBuffer[MAX_LINE_LENGTH];
+ FILE* ScaleFile;
+ int BufferPos;
+ int BufferEnd;
+ int FeatureNumber;
+ char* StrValue;
+ double MinValue;
+ double MaxValue;
+ //
+ if (!PyArg_ParseTuple(args, "s", &FilePath))
+ {
+ return NULL; // PyArg_ParseTuple has already set the exception
+ }
+ ScaleFile = fopen(FilePath, "rb");
+ if (!ScaleFile)
+ {
+ printf("** Error: Can't open file '%s'\n", FilePath);
+ return (PyObject*)-1; // Return -1 to signal that the object can't be created
+ }
+ FileText = (char*)calloc(sizeof(char), 10240);
+ BufferEnd = fread(FileText, sizeof(char), 10240, ScaleFile);
+ BufferPos = 0;
+ while (1)
+ {
+ if (!FileText[BufferPos])
+ {
+ break;
+ }
+ BufferPos = CopyBufferLine(FileText, BufferPos, BufferEnd, LineBuffer, 0);
+ //printf("Line parsed: '%s'\n", LineBuffer);
+ StrValue = strtok(LineBuffer, " \t");
+ if (!StrValue)
+ {
+ continue;
+ }
+ FeatureNumber = atoi(StrValue);
+ if (FeatureNumber <= 0)
+ {
+ continue;
+ }
+ StrValue = strtok(NULL, " \t");
+ if (!StrValue)
+ {
+ continue;
+ }
+ MinValue = atof(StrValue);
+ StrValue = strtok(NULL, " \t");
+ if (!StrValue)
+ {
+ continue;
+ }
+ MaxValue = atof(StrValue);
+ MinFeatureValue[FeatureNumber - 1] = MinValue;
+ MaxFeatureValue[FeatureNumber - 1] = MaxValue;
+ //printf("Feature %d: Range %f...%f\n", FeatureNumber, MinValue, MaxValue);
+ }
+ free(FileText);
+ fclose(ScaleFile);
+ Py_INCREF(Py_None);
+ return Py_None;
+}
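+
+// Scaling-file format note (illustrative): each data line parsed above is
+// whitespace-separated "<feature-number> <min> <max>", with 1-based feature
+// numbers in the file and 0-based storage in MinFeatureValue/MaxFeatureValue;
+// lines whose first token is not a positive integer (such as the libsvm
+// range-file header lines) are skipped. Example line (values hypothetical):
+//     3   -0.35   12.8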
+
+
+void ScaleSVMFeatures(struct svm_node* Features, int FeatureCount)
+{
+ int FeatureIndex;
+ double Value;
+ double Range;
+ //
+ for (FeatureIndex = 0; FeatureIndex < FeatureCount; FeatureIndex++)
+ {
+ Value = Features[FeatureIndex].value;
+ Range = MaxFeatureValue[FeatureIndex] - MinFeatureValue[FeatureIndex];
+ if (Value <= MinFeatureValue[FeatureIndex])
+ {
+ Features[FeatureIndex].value = -1.0;
+ continue;
+ }
+ if (Value >= MaxFeatureValue[FeatureIndex])
+ {
+ Features[FeatureIndex].value = 1.0;
+ continue;
+ }
+ Features[FeatureIndex].value = -1.0 + 2.0 * (Value - MinFeatureValue[FeatureIndex]) / Range;
+ }
+}
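+
+// Scaling note (illustrative): each feature is mapped linearly from its
+// [min, max] range onto [-1, +1], clamping outside the range:
+//     scaled = -1 + 2 * (value - min) / (max - min)
+// e.g. with min = 0 and max = 10, a raw value of 2.5 scales to -0.5.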
+
+static PyObject* PyLoadModel(PyObject* self, PyObject* args)
+{
+ char* FilePath;
+ if (!PyArg_ParseTuple(args, "s", &FilePath))
+ {
+ return NULL; // PyArg_ParseTuple has already set the exception
+ }
+
+ // Free the old model, if any:
+ if (Model)
+ {
+ svm_destroy_model(Model);
+ Model = NULL;
+ }
+
+ // Load model from specified file:
+ Model = svm_load_model(FilePath);
+
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+PyObject* PyScoreHelper(PyObject* FeatureList, int ScaleFlag)
+{
+ int FeatureIndex;
+ int FeatureCount;
+ int SequenceType;
+ //
+ if (PyList_Check(FeatureList))
+ {
+ SequenceType = 1;
+ FeatureCount = PyList_Size(FeatureList);
+ }
+ else if (PyTuple_Check(FeatureList))
+ {
+ SequenceType = 2;
+ FeatureCount = PyTuple_Size(FeatureList);
+ }
+ else
+ {
+ printf("** Error in PyScore vector: Illegal argument (not a vector or tuple)\n");
+ return (PyObject*)-1; // Return -1 to signal that the object can't be created
+ }
+
+ // Allocate SVMFeatures, if necessary:
+ if (FeatureCount >= SVMFeatureAllocation)
+ {
+ if (SVMFeatures)
+ {
+ free(SVMFeatures);
+ }
+ SVMFeatures = (struct svm_node*)malloc((FeatureCount + 1) * sizeof(struct svm_node));
+ SVMFeatureAllocation = FeatureCount + 1;
+ }
+
+ // Populate SVMFeatures:
+ for (FeatureIndex = 0; FeatureIndex < FeatureCount; FeatureIndex++)
+ {
+ SVMFeatures[FeatureIndex].index = FeatureIndex + 1;
+ if (SequenceType == 1)
+ {
+ SVMFeatures[FeatureIndex].value = PyFloat_AsDouble(PyList_GetItem(FeatureList, FeatureIndex));
+ }
+ else
+ {
+ SVMFeatures[FeatureIndex].value = PyFloat_AsDouble(PyTuple_GetItem(FeatureList, FeatureIndex));
+ }
+ }
+ if (ScaleFlag)
+ {
+ ScaleSVMFeatures(SVMFeatures, FeatureCount);
+ }
+ SVMFeatures[FeatureCount].index = -1;
+ // Predict, and return:
+ svm_predict_values(Model, SVMFeatures, DecisionValues);
+ return PyFloat_FromDouble(DecisionValues[0]);
+}
+
+static PyObject* PyScoreVector(PyObject* self, PyObject* args)
+{
+ PyObject* FeatureList;
+ //
+ if (!PyArg_ParseTuple(args, "O", &FeatureList))
+ {
+ return NULL; // PyArg_ParseTuple has already set the exception
+ }
+ return PyScoreHelper(FeatureList, 0);
+}
+
+static PyObject* PyScaleAndScoreVector(PyObject* self, PyObject* args)
+{
+ PyObject* FeatureList;
+ //
+ if (!PyArg_ParseTuple(args, "O", &FeatureList))
+ {
+ return NULL; // PyArg_ParseTuple has already set the exception
+ }
+ return PyScoreHelper(FeatureList, 1);
+}
+
+static PyMethodDef PySVM_methods[] = {
+ {"foo", ex_foo, METH_VARARGS, "foo() doc string"},
+ {"LoadModel", PyLoadModel, METH_VARARGS, "Load an SVM model from disk"},
+ {"Score", PyScoreVector, METH_VARARGS, "Score a (pre-scaled) vector"},
+ {"LoadScaling", PyLoadScaling, METH_VARARGS, "Load feature scaling parameters from a file"},
+ {"ScaleAndScore", PyScaleAndScoreVector, METH_VARARGS, "Scale and score a feature-vector"},
+ {NULL, NULL}
+};
+
+PyMODINIT_FUNC initPySVM(void)
+{
+ Py_InitModule("PySVM", PySVM_methods);
+ Model = NULL;
+ SVMFeatures = NULL;
+ SVMFeatureAllocation = 0;
+}
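+
+// Module usage note (illustrative): callers are expected to LoadModel() a
+// libsvm model file once and, if they use ScaleAndScore(), LoadScaling() the
+// matching range file first; Score() assumes the feature vector has already
+// been scaled to [-1, +1].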
diff --git a/PySVM/PySVM.sln b/PySVM/PySVM.sln
new file mode 100644
index 0000000..28e4a22
--- /dev/null
+++ b/PySVM/PySVM.sln
@@ -0,0 +1,21 @@
+Microsoft Visual Studio Solution File, Format Version 8.00
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PySVM", "PySVM.vcproj", "{A0608D6F-84ED-44AE-A2A6-A3CC7F4A4030}"
+ ProjectSection(ProjectDependencies) = postProject
+ EndProjectSection
+EndProject
+Global
+ GlobalSection(SolutionConfiguration) = preSolution
+ Debug = Debug
+ Release = Release
+ EndGlobalSection
+ GlobalSection(ProjectConfiguration) = postSolution
+ {A0608D6F-84ED-44AE-A2A6-A3CC7F4A4030}.Debug.ActiveCfg = Debug|Win32
+ {A0608D6F-84ED-44AE-A2A6-A3CC7F4A4030}.Debug.Build.0 = Debug|Win32
+ {A0608D6F-84ED-44AE-A2A6-A3CC7F4A4030}.Release.ActiveCfg = Release|Win32
+ {A0608D6F-84ED-44AE-A2A6-A3CC7F4A4030}.Release.Build.0 = Release|Win32
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ EndGlobalSection
+ GlobalSection(ExtensibilityAddIns) = postSolution
+ EndGlobalSection
+EndGlobal
diff --git a/PySVM/PySVM.vcproj b/PySVM/PySVM.vcproj
new file mode 100644
index 0000000..26567a8
--- /dev/null
+++ b/PySVM/PySVM.vcproj
@@ -0,0 +1,198 @@
+<?xml version="1.0" encoding="windows-1250"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="7.10"
+ Name="PySVM"
+ SccProjectName=""
+ SccLocalPath="">
+ <Platforms>
+ <Platform
+ Name="Win32"/>
+ </Platforms>
+ <Configurations>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory=".\Release"
+ IntermediateDirectory=".\Release"
+ ConfigurationType="2"
+ UseOfMFC="0"
+ ATLMinimizesCRunTimeLibraryUsage="FALSE">
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ InlineFunctionExpansion="1"
+ AdditionalIncludeDirectories="d:\python-2.5\Include,d:\python-2.5\PC"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS"
+ StringPooling="TRUE"
+ RuntimeLibrary="2"
+ EnableFunctionLevelLinking="TRUE"
+ UsePrecompiledHeader="2"
+ PrecompiledHeaderFile=".\Release/PySVM.pch"
+ AssemblerListingLocation=".\Release/"
+ ObjectFile=".\Release/"
+ ProgramDataBaseFileName=".\Release/"
+ WarningLevel="3"
+ SuppressStartupBanner="TRUE"
+ CompileAs="0"/>
+ <Tool
+ Name="VCCustomBuildTool"/>
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalOptions="/export:initPySVM"
+ AdditionalDependencies="odbc32.lib odbccp32.lib python25.lib"
+ OutputFile="d:\research\inspect\PySVM.pyd"
+ LinkIncremental="1"
+ SuppressStartupBanner="TRUE"
+ AdditionalLibraryDirectories="d:\python-2.5\PCbuild"
+ ModuleDefinitionFile=""
+ ProgramDatabaseFile=".\Release/PySVM.pdb"
+ SubSystem="2"
+ ImportLibrary=".\Release/PySVM.lib"
+ TargetMachine="1"/>
+ <Tool
+ Name="VCMIDLTool"
+ PreprocessorDefinitions="NDEBUG"
+ MkTypLibCompatible="TRUE"
+ SuppressStartupBanner="TRUE"
+ TargetEnvironment="1"
+ TypeLibraryName=".\Release/PySVM.tlb"
+ HeaderFileName=""/>
+ <Tool
+ Name="VCPostBuildEventTool"/>
+ <Tool
+ Name="VCPreBuildEventTool"/>
+ <Tool
+ Name="VCPreLinkEventTool"/>
+ <Tool
+ Name="VCResourceCompilerTool"
+ PreprocessorDefinitions="NDEBUG"
+ Culture="1033"/>
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"/>
+ <Tool
+ Name="VCXMLDataGeneratorTool"/>
+ <Tool
+ Name="VCWebDeploymentTool"/>
+ <Tool
+ Name="VCManagedWrapperGeneratorTool"/>
+ <Tool
+ Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+ </Configuration>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory=".\Debug"
+ IntermediateDirectory=".\Debug"
+ ConfigurationType="2"
+ UseOfMFC="0"
+ ATLMinimizesCRunTimeLibraryUsage="FALSE">
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="d:\python-2.5\Include,d:\python-2.5\PC"
+ PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS"
+ RuntimeLibrary="3"
+ UsePrecompiledHeader="2"
+ PrecompiledHeaderFile=".\Debug/PySVM.pch"
+ AssemblerListingLocation=".\Debug/"
+ ObjectFile=".\Debug/"
+ ProgramDataBaseFileName=".\Debug/"
+ WarningLevel="3"
+ SuppressStartupBanner="TRUE"
+ DebugInformationFormat="4"
+ CompileAs="0"/>
+ <Tool
+ Name="VCCustomBuildTool"/>
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalOptions="/export:initPySVM"
+ AdditionalDependencies="odbc32.lib odbccp32.lib python25_d.lib"
+ OutputFile="d:\research\inspect\PySVM_d.pyd"
+ LinkIncremental="1"
+ SuppressStartupBanner="TRUE"
+ AdditionalLibraryDirectories="d:\python-2.5\PCbuild"
+ ModuleDefinitionFile=""
+ GenerateDebugInformation="TRUE"
+ ProgramDatabaseFile=".\Debug/PySVM_d.pdb"
+ SubSystem="2"
+ ImportLibrary=".\Debug/PySVM_d.lib"
+ TargetMachine="1"/>
+ <Tool
+ Name="VCMIDLTool"
+ PreprocessorDefinitions="_DEBUG"
+ MkTypLibCompatible="TRUE"
+ SuppressStartupBanner="TRUE"
+ TargetEnvironment="1"
+ TypeLibraryName=".\Debug/PySVM.tlb"
+ HeaderFileName=""/>
+ <Tool
+ Name="VCPostBuildEventTool"/>
+ <Tool
+ Name="VCPreBuildEventTool"/>
+ <Tool
+ Name="VCPreLinkEventTool"/>
+ <Tool
+ Name="VCResourceCompilerTool"
+ PreprocessorDefinitions="_DEBUG"
+ Culture="1033"/>
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"/>
+ <Tool
+ Name="VCXMLDataGeneratorTool"/>
+ <Tool
+ Name="VCWebDeploymentTool"/>
+ <Tool
+ Name="VCManagedWrapperGeneratorTool"/>
+ <Tool
+ Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cxx;rc;def;r;odl;hpj;bat;for;f90">
+ <File
+ RelativePath="PySVM.c">
+ <FileConfiguration
+ Name="Release|Win32">
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ AdditionalIncludeDirectories=""
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;$(NoInherit)"/>
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug|Win32">
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories=""
+ PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;$(NoInherit)"/>
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath=".\svm-predict.c">
+ </File>
+ <File
+ RelativePath=".\svm.cpp">
+ </File>
+ </Filter>
+ <Filter
+ Name="Header Files"
+ Filter="h;hpp;hxx;hm;inl;fi;fd">
+ <File
+ RelativePath=".\svm.h">
+ </File>
+ </Filter>
+ <Filter
+ Name="Resource Files"
+ Filter="ico;cur;bmp;dlg;rc2;rct;bin;cnt;rtf;gif;jpg;jpeg;jpe">
+ </Filter>
+ <File
+ RelativePath="readme.txt">
+ </File>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>
diff --git a/PySVM/svm-predict.c b/PySVM/svm-predict.c
new file mode 100644
index 0000000..11db985
--- /dev/null
+++ b/PySVM/svm-predict.c
@@ -0,0 +1,202 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include "svm.h"
+
+char* line;
+int max_line_len = 1024;
+struct svm_node *x;
+int max_nr_attr = 64;
+
+struct svm_model* model;
+int predict_probability=0;
+
+void predict(FILE *input, FILE *output)
+{
+ int correct = 0;
+ int total = 0;
+ double error = 0;
+ double sumv = 0, sumy = 0, sumvv = 0, sumyy = 0, sumvy = 0;
+ double dec_values[10]; // hacky
+ int svm_type=svm_get_svm_type(model);
+ int nr_class=svm_get_nr_class(model);
+ int* labels = (int*)malloc(nr_class*sizeof(int));
+ double* prob_estimates=NULL;
+ int j;
+
+ if (predict_probability)
+ {
+ if (svm_type==NU_SVR || svm_type==EPSILON_SVR)
+ {
+ printf("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma=%g\n",svm_get_svr_probability(model));
+ }
+ else
+ {
+ svm_get_labels(model,labels);
+ prob_estimates = (double *) malloc(nr_class*sizeof(double));
+ fprintf(output,"labels");
+ for(j = 0; j < nr_class; j++)
+ {
+ fprintf(output," %d",labels[j]);
+ }
+ fprintf(output,"\n");
+ }
+ }
+ while(1)
+ {
+ int i = 0;
+ int c;
+ double target;
+ double v;
+
+ if (fscanf(input,"%lf",&target)==EOF)
+ {
+ break;
+ }
+
+ while(1)
+ {
+ if (i>=max_nr_attr-1) // need one more for index = -1
+ {
+ max_nr_attr *= 2;
+ x = (struct svm_node *) realloc(x,max_nr_attr*sizeof(struct svm_node));
+ }
+
+ do
+ {
+ c = getc(input);
+ if (c=='\n' || c==EOF) goto out2;
+ } while(isspace(c));
+ ungetc(c,input);
+ fscanf(input,"%d:%lf",&x[i].index,&x[i].value);
+ ++i;
+ }
+
+out2:
+ x[i++].index = -1;
+
+ if (predict_probability && (svm_type==C_SVC || svm_type==NU_SVC))
+ {
+ v = svm_predict_probability(model,x,prob_estimates);
+ fprintf(output,"%g ",v);
+ for(j=0;j<nr_class;j++)
+ {
+ fprintf(output,"%g ",prob_estimates[j]);
+ }
+ fprintf(output,"\n");
+ }
+ else
+ {
+ //v = svm_predict(model,x);
+ //fprintf(output,"%g\n",v);
+ svm_predict_values(model, x, dec_values);
+ //printf("%.2f %.2f\n", v, dec_values[0]);
+ fprintf(output, "%g\n", dec_values[0]); // high values should correspond to +1, that's easier to remember.
+ if (dec_values[0] < 0)
+ {
+ v = 1;
+ }
+ else
+ {
+ v = -1;
+ }
+ }
+
+ if (v == target)
+ {
+ ++correct;
+ }
+ error += (v-target)*(v-target);
+ sumv += v;
+ sumy += target;
+ sumvv += v*v;
+ sumyy += target*target;
+ sumvy += v*target;
+ ++total;
+ }
+ printf("Accuracy = %g%% (%d/%d) (classification)\n",
+ (double)correct/total*100,correct,total);
+ printf("Mean squared error = %g (regression)\n",error/total);
+ printf("Squared correlation coefficient = %g (regression)\n",
+ ((total*sumvy-sumv*sumy)*(total*sumvy-sumv*sumy))/
+ ((total*sumvv-sumv*sumv)*(total*sumyy-sumy*sumy))
+ );
+ if (predict_probability)
+ {
+ free(prob_estimates);
+ free(labels);
+ }
+}
+
+void exit_with_help()
+{
+ printf(
+ "Usage: svm-predict [options] test_file model_file output_file\n"
+ "options:\n"
+ "-b probability_estimates: whether to predict probability estimates, 0 or 1 (default 0); one-class SVM not supported yet\n"
+ );
+ exit(1);
+}
+
+int svpredict_main(int argc, char **argv)
+{
+ FILE *input, *output;
+ int i;
+
+ // parse options
+ for(i=1;i<argc;i++)
+ {
+ if (argv[i][0] != '-') break;
+ ++i;
+ switch(argv[i-1][1])
+ {
+ case 'b':
+ predict_probability = atoi(argv[i]);
+ break;
+ default:
+ fprintf(stderr,"unknown option\n");
+ exit_with_help();
+ }
+ }
+ if (i>=argc)
+ exit_with_help();
+
+ input = fopen(argv[i],"r");
+ if (input == NULL)
+ {
+ fprintf(stderr,"can't open input file %s\n",argv[i]);
+ exit(1);
+ }
+
+ output = fopen(argv[i+2],"w");
+ if (output == NULL)
+ {
+ fprintf(stderr,"can't open output file %s\n",argv[i+2]);
+ exit(1);
+ }
+
+ if ((model=svm_load_model(argv[i+1]))==0)
+ {
+ fprintf(stderr,"can't open model file %s\n",argv[i+1]);
+ exit(1);
+ }
+
+ line = (char *) malloc(max_line_len*sizeof(char));
+ x = (struct svm_node *) malloc(max_nr_attr*sizeof(struct svm_node));
+ if (predict_probability)
+ {
+ if (svm_check_probability_model(model)==0)
+ {
+ fprintf(stderr,"model does not support probabiliy estimates\n");
+ predict_probability=0;
+ }
+ }
+ predict(input,output);
+ svm_destroy_model(model);
+ free(line);
+ free(x);
+ fclose(input);
+ fclose(output);
+ return 0;
+}
diff --git a/PySVM/svm.cpp b/PySVM/svm.cpp
new file mode 100644
index 0000000..aaf2fa8
--- /dev/null
+++ b/PySVM/svm.cpp
@@ -0,0 +1,3087 @@
+
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <float.h>
+#include <string.h>
+#include <stdarg.h>
+#include "svm.h"
+typedef float Qfloat;
+typedef signed char schar;
+#ifndef min
+template <class T> inline T min(T x,T y) { return (x<y)?x:y; }
+#endif
+#ifndef max
+template <class T> inline T max(T x,T y) { return (x>y)?x:y; }
+#endif
+template <class T> inline void swap(T& x, T& y) { T t=x; x=y; y=t; }
+template <class S, class T> inline void clone(T*& dst, S* src, int n)
+{
+ dst = new T[n];
+ memcpy((void *)dst,(void *)src,sizeof(T)*n);
+}
+#define INF HUGE_VAL
+#define TAU 1e-12
+#define Malloc(type,n) (type *)malloc((n)*sizeof(type))
+#if 1
+void info(char *fmt,...)
+{
+ va_list ap;
+ va_start(ap,fmt);
+ vprintf(fmt,ap);
+ va_end(ap);
+}
+void info_flush()
+{
+ fflush(stdout);
+}
+#else
+void info(char *fmt,...) {}
+void info_flush() {}
+#endif
+
+//
+// Kernel Cache
+//
+// l is the number of total data items
+// size is the cache size limit in bytes
+//
+class Cache
+{
+public:
+ Cache(int l,int size);
+ ~Cache();
+
+ // request data [0,len)
+ // return some position p where [p,len) need to be filled
+ // (p >= len if nothing needs to be filled)
+ int get_data(const int index, Qfloat **data, int len);
+ void swap_index(int i, int j); // future_option
+private:
+ int l;
+ int size;
+ struct head_t
+ {
+ head_t *prev, *next; // a circular list
+ Qfloat *data;
+ int len; // data[0,len) is cached in this entry
+ };
+
+ head_t *head;
+ head_t lru_head;
+ void lru_delete(head_t *h);
+ void lru_insert(head_t *h);
+};
+
+Cache::Cache(int l_,int size_):l(l_),size(size_)
+{
+ head = (head_t *)calloc(l,sizeof(head_t)); // initialized to 0
+ size /= sizeof(Qfloat);
+ size -= l * sizeof(head_t) / sizeof(Qfloat);
+ size = max(size, 2*l); // cache must be large enough for two columns
+ lru_head.next = lru_head.prev = &lru_head;
+}
+
+Cache::~Cache()
+{
+ for (head_t *h = lru_head.next; h != &lru_head; h=h->next)
+ free(h->data);
+ free(head);
+}
+
+void Cache::lru_delete(head_t *h)
+{
+ // delete from current location
+ h->prev->next = h->next;
+ h->next->prev = h->prev;
+}
+
+void Cache::lru_insert(head_t *h)
+{
+ // insert to last position
+ h->next = &lru_head;
+ h->prev = lru_head.prev;
+ h->prev->next = h;
+ h->next->prev = h;
+}
+
+int Cache::get_data(const int index, Qfloat **data, int len)
+{
+ head_t *h = &head[index];
+ if (h->len) lru_delete(h);
+ int more = len - h->len;
+
+ if (more > 0)
+ {
+ // free old space
+ while(size < more)
+ {
+ head_t *old = lru_head.next;
+ lru_delete(old);
+ free(old->data);
+ size += old->len;
+ old->data = 0;
+ old->len = 0;
+ }
+
+ // allocate new space
+ h->data = (Qfloat *)realloc(h->data,sizeof(Qfloat)*len);
+ size -= more;
+ swap(h->len,len);
+ }
+
+ lru_insert(h);
+ *data = h->data;
+ return len;
+}
+
+void Cache::swap_index(int i, int j)
+{
+ if (i==j) return;
+
+ if (head[i].len) lru_delete(&head[i]);
+ if (head[j].len) lru_delete(&head[j]);
+ swap(head[i].data,head[j].data);
+ swap(head[i].len,head[j].len);
+ if (head[i].len) lru_insert(&head[i]);
+ if (head[j].len) lru_insert(&head[j]);
+
+ if (i>j) swap(i,j);
+ for (head_t *h = lru_head.next; h!=&lru_head; h=h->next)
+ {
+ if (h->len > i)
+ {
+ if (h->len > j)
+ swap(h->data[i],h->data[j]);
+ else
+ {
+ // give up
+ lru_delete(h);
+ free(h->data);
+ size += h->len;
+ h->data = 0;
+ h->len = 0;
+ }
+ }
+ }
+}
+
+//
+// Kernel evaluation
+//
+// the static method k_function is for doing single kernel evaluation
+// the constructor of Kernel prepares to calculate the l*l kernel matrix
+// the member function get_Q is for getting one column from the Q Matrix
+//
+class QMatrix {
+public:
+ virtual Qfloat *get_Q(int column, int len) const = 0;
+ virtual Qfloat *get_QD() const = 0;
+ virtual void swap_index(int i, int j) const = 0;
+};
+
+class Kernel: public QMatrix {
+public:
+ Kernel(int l, svm_node * const * x, const svm_parameter& param);
+ virtual ~Kernel();
+
+ static double k_function(const svm_node *x, const svm_node *y,
+ const svm_parameter& param);
+ virtual Qfloat *get_Q(int column, int len) const = 0;
+ virtual Qfloat *get_QD() const = 0;
+ virtual void swap_index(int i, int j) const // not so const...
+ {
+ swap(x[i],x[j]);
+ if (x_square) swap(x_square[i],x_square[j]);
+ }
+protected:
+
+ double (Kernel::*kernel_function)(int i, int j) const;
+
+private:
+ const svm_node **x;
+ double *x_square;
+
+ // svm_parameter
+ const int kernel_type;
+ const double degree;
+ const double gamma;
+ const double coef0;
+
+ static double dot(const svm_node *px, const svm_node *py);
+ double kernel_linear(int i, int j) const
+ {
+ return dot(x[i],x[j]);
+ }
+ double kernel_poly(int i, int j) const
+ {
+ return pow(gamma*dot(x[i],x[j])+coef0,degree);
+ }
+ double kernel_rbf(int i, int j) const
+ {
+ return exp(-gamma*(x_square[i]+x_square[j]-2*dot(x[i],x[j])));
+ }
+ double kernel_sigmoid(int i, int j) const
+ {
+ return tanh(gamma*dot(x[i],x[j])+coef0);
+ }
+};
+
+Kernel::Kernel(int l, svm_node * const * x_, const svm_parameter& param)
+:kernel_type(param.kernel_type), degree(param.degree),
+ gamma(param.gamma), coef0(param.coef0)
+{
+ switch(kernel_type)
+ {
+ case LINEAR:
+ kernel_function = &Kernel::kernel_linear;
+ break;
+ case POLY:
+ kernel_function = &Kernel::kernel_poly;
+ break;
+ case RBF:
+ kernel_function = &Kernel::kernel_rbf;
+ break;
+ case SIGMOID:
+ kernel_function = &Kernel::kernel_sigmoid;
+ break;
+ }
+
+ clone(x,x_,l);
+
+ if (kernel_type == RBF)
+ {
+ x_square = new double[l];
+ for (int i=0;i<l;i++)
+ x_square[i] = dot(x[i],x[i]);
+ }
+ else
+ x_square = 0;
+}
+
+Kernel::~Kernel()
+{
+ delete[] x;
+ delete[] x_square;
+}
+
+double Kernel::dot(const svm_node *px, const svm_node *py)
+{
+ double sum = 0;
+ while(px->index != -1 && py->index != -1)
+ {
+ if (px->index == py->index)
+ {
+ sum += px->value * py->value;
+ ++px;
+ ++py;
+ }
+ else
+ {
+ if (px->index > py->index)
+ ++py;
+ else
+ ++px;
+ }
+ }
+ return sum;
+}
+
+double Kernel::k_function(const svm_node *x, const svm_node *y,
+ const svm_parameter& param)
+{
+ switch(param.kernel_type)
+ {
+ case LINEAR:
+ return dot(x,y);
+ case POLY:
+ return pow(param.gamma*dot(x,y)+param.coef0,param.degree);
+ case RBF:
+ {
+ double sum = 0;
+ while(x->index != -1 && y->index !=-1)
+ {
+ if (x->index == y->index)
+ {
+ double d = x->value - y->value;
+ sum += d*d;
+ ++x;
+ ++y;
+ }
+ else
+ {
+ if (x->index > y->index)
+ {
+ sum += y->value * y->value;
+ ++y;
+ }
+ else
+ {
+ sum += x->value * x->value;
+ ++x;
+ }
+ }
+ }
+
+ while(x->index != -1)
+ {
+ sum += x->value * x->value;
+ ++x;
+ }
+
+ while(y->index != -1)
+ {
+ sum += y->value * y->value;
+ ++y;
+ }
+
+ return exp(-param.gamma*sum);
+ }
+ case SIGMOID:
+ return tanh(param.gamma*dot(x,y)+param.coef0);
+ default:
+ return 0; /* Unreachable */
+ }
+}
+
+// Generalized SMO+SVMlight algorithm
+// Solves:
+//
+// min 0.5(\alpha^T Q \alpha) + b^T \alpha
+//
+// y^T \alpha = \delta
+// y_i = +1 or -1
+// 0 <= alpha_i <= Cp for y_i = 1
+// 0 <= alpha_i <= Cn for y_i = -1
+//
+// Given:
+//
+// Q, b, y, Cp, Cn, and an initial feasible point \alpha
+// l is the size of vectors and matrices
+// eps is the stopping criterion
+//
+// solution will be put in \alpha, objective value will be put in obj
+//
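+// (Side note, not from the original sources: in the plain C-SVC case the
+// vector b is all -1's and \delta = 0, so the problem above reduces to the
+// familiar SVM dual
+//     min_alpha 0.5 * alpha^T Q alpha - e^T alpha,
+//     subject to 0 <= alpha_i <= C and y^T alpha = 0.)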
+class Solver {
+public:
+ Solver() {};
+ virtual ~Solver() {};
+
+ struct SolutionInfo {
+ double obj;
+ double rho;
+ double upper_bound_p;
+ double upper_bound_n;
+ double r; // for Solver_NU
+ };
+
+ void Solve(int l, const QMatrix& Q, const double *b_, const schar *y_,
+ double *alpha_, double Cp, double Cn, double eps,
+ SolutionInfo* si, int shrinking);
+protected:
+ int active_size;
+ schar *y;
+ double *G; // gradient of objective function
+ enum { LOWER_BOUND, UPPER_BOUND, FREE };
+ char *alpha_status; // LOWER_BOUND, UPPER_BOUND, FREE
+ double *alpha;
+ const QMatrix *Q;
+ const Qfloat *QD;
+ double eps;
+ double Cp,Cn;
+ double *b;
+ int *active_set;
+ double *G_bar; // gradient, if we treat free variables as 0
+ int l;
+ bool unshrinked; // XXX
+
+ double get_C(int i)
+ {
+ return (y[i] > 0)? Cp : Cn;
+ }
+ void update_alpha_status(int i)
+ {
+ if (alpha[i] >= get_C(i))
+ alpha_status[i] = UPPER_BOUND;
+ else if (alpha[i] <= 0)
+ alpha_status[i] = LOWER_BOUND;
+ else alpha_status[i] = FREE;
+ }
+ bool is_upper_bound(int i) { return alpha_status[i] == UPPER_BOUND; }
+ bool is_lower_bound(int i) { return alpha_status[i] == LOWER_BOUND; }
+ bool is_free(int i) { return alpha_status[i] == FREE; }
+ void swap_index(int i, int j);
+ void reconstruct_gradient();
+ virtual int select_working_set(int &i, int &j);
+ virtual int max_violating_pair(int &i, int &j);
+ virtual double calculate_rho();
+ virtual void do_shrinking();
+};
+
+void Solver::swap_index(int i, int j)
+{
+ Q->swap_index(i,j);
+ swap(y[i],y[j]);
+ swap(G[i],G[j]);
+ swap(alpha_status[i],alpha_status[j]);
+ swap(alpha[i],alpha[j]);
+ swap(b[i],b[j]);
+ swap(active_set[i],active_set[j]);
+ swap(G_bar[i],G_bar[j]);
+}
+
+void Solver::reconstruct_gradient()
+{
+ // reconstruct inactive elements of G from G_bar and free variables
+
+ if (active_size == l) return;
+
+ int i;
+ for (i=active_size;i<l;i++)
+ G[i] = G_bar[i] + b[i];
+
+ for (i=0;i<active_size;i++)
+ if (is_free(i))
+ {
+ const Qfloat *Q_i = Q->get_Q(i,l);
+ double alpha_i = alpha[i];
+ for (int j=active_size;j<l;j++)
+ G[j] += alpha_i * Q_i[j];
+ }
+}
+
+void Solver::Solve(int l, const QMatrix& Q, const double *b_, const schar *y_,
+ double *alpha_, double Cp, double Cn, double eps,
+ SolutionInfo* si, int shrinking)
+{
+ this->l = l;
+ this->Q = &Q;
+ QD=Q.get_QD();
+ clone(b, b_,l);
+ clone(y, y_,l);
+ clone(alpha,alpha_,l);
+ this->Cp = Cp;
+ this->Cn = Cn;
+ this->eps = eps;
+ unshrinked = false;
+
+ // initialize alpha_status
+ {
+ alpha_status = new char[l];
+ for (int i=0;i<l;i++)
+ update_alpha_status(i);
+ }
+
+ // initialize active set (for shrinking)
+ {
+ active_set = new int[l];
+ for (int i=0;i<l;i++)
+ active_set[i] = i;
+ active_size = l;
+ }
+
+ // initialize gradient
+ {
+ G = new double[l];
+ G_bar = new double[l];
+ int i;
+ for (i=0;i<l;i++)
+ {
+ G[i] = b[i];
+ G_bar[i] = 0;
+ }
+ for (i=0;i<l;i++)
+ if (!is_lower_bound(i))
+ {
+ const Qfloat *Q_i = Q.get_Q(i,l);
+ double alpha_i = alpha[i];
+ int j;
+ for (j=0;j<l;j++)
+ G[j] += alpha_i*Q_i[j];
+ if (is_upper_bound(i))
+ for (j=0;j<l;j++)
+ G_bar[j] += get_C(i) * Q_i[j];
+ }
+ }
+
+ // optimization step
+
+ int iter = 0;
+ int counter = min(l,1000)+1;
+
+ while(1)
+ {
+ // show progress and do shrinking
+
+ if (--counter == 0)
+ {
+ counter = min(l,1000);
+ if (shrinking) do_shrinking();
+ info("."); info_flush();
+ }
+
+ int i,j;
+ if (select_working_set(i,j)!=0)
+ {
+ // reconstruct the whole gradient
+ reconstruct_gradient();
+ // reset active set size and check
+ active_size = l;
+ info("*"); info_flush();
+ if (select_working_set(i,j)!=0)
+ break;
+ else
+ counter = 1; // do shrinking next iteration
+ }
+
+ ++iter;
+
+ // update alpha[i] and alpha[j], handle bounds carefully
+
+ const Qfloat *Q_i = Q.get_Q(i,active_size);
+ const Qfloat *Q_j = Q.get_Q(j,active_size);
+
+ double C_i = get_C(i);
+ double C_j = get_C(j);
+
+ double old_alpha_i = alpha[i];
+ double old_alpha_j = alpha[j];
+
+ if (y[i]!=y[j])
+ {
+ double quad_coef = Q_i[i]+Q_j[j]+2*Q_i[j];
+ if (quad_coef <= 0)
+ quad_coef = TAU;
+ double delta = (-G[i]-G[j])/quad_coef;
+ double diff = alpha[i] - alpha[j];
+ alpha[i] += delta;
+ alpha[j] += delta;
+
+ if (diff > 0)
+ {
+ if (alpha[j] < 0)
+ {
+ alpha[j] = 0;
+ alpha[i] = diff;
+ }
+ }
+ else
+ {
+ if (alpha[i] < 0)
+ {
+ alpha[i] = 0;
+ alpha[j] = -diff;
+ }
+ }
+ if (diff > C_i - C_j)
+ {
+ if (alpha[i] > C_i)
+ {
+ alpha[i] = C_i;
+ alpha[j] = C_i - diff;
+ }
+ }
+ else
+ {
+ if (alpha[j] > C_j)
+ {
+ alpha[j] = C_j;
+ alpha[i] = C_j + diff;
+ }
+ }
+ }
+ else
+ {
+ double quad_coef = Q_i[i]+Q_j[j]-2*Q_i[j];
+ if (quad_coef <= 0)
+ quad_coef = TAU;
+ double delta = (G[i]-G[j])/quad_coef;
+ double sum = alpha[i] + alpha[j];
+ alpha[i] -= delta;
+ alpha[j] += delta;
+
+ if (sum > C_i)
+ {
+ if (alpha[i] > C_i)
+ {
+ alpha[i] = C_i;
+ alpha[j] = sum - C_i;
+ }
+ }
+ else
+ {
+ if (alpha[j] < 0)
+ {
+ alpha[j] = 0;
+ alpha[i] = sum;
+ }
+ }
+ if (sum > C_j)
+ {
+ if (alpha[j] > C_j)
+ {
+ alpha[j] = C_j;
+ alpha[i] = sum - C_j;
+ }
+ }
+ else
+ {
+ if (alpha[i] < 0)
+ {
+ alpha[i] = 0;
+ alpha[j] = sum;
+ }
+ }
+ }
+
+ // update G
+
+ double delta_alpha_i = alpha[i] - old_alpha_i;
+ double delta_alpha_j = alpha[j] - old_alpha_j;
+
+ for (int k=0;k<active_size;k++)
+ {
+ G[k] += Q_i[k]*delta_alpha_i + Q_j[k]*delta_alpha_j;
+ }
+
+ // update alpha_status and G_bar
+
+ {
+ bool ui = is_upper_bound(i);
+ bool uj = is_upper_bound(j);
+ update_alpha_status(i);
+ update_alpha_status(j);
+ int k;
+ if (ui != is_upper_bound(i))
+ {
+ Q_i = Q.get_Q(i,l);
+ if (ui)
+ for (k=0;k<l;k++)
+ G_bar[k] -= C_i * Q_i[k];
+ else
+ for (k=0;k<l;k++)
+ G_bar[k] += C_i * Q_i[k];
+ }
+
+ if (uj != is_upper_bound(j))
+ {
+ Q_j = Q.get_Q(j,l);
+ if (uj)
+ for (k=0;k<l;k++)
+ G_bar[k] -= C_j * Q_j[k];
+ else
+ for (k=0;k<l;k++)
+ G_bar[k] += C_j * Q_j[k];
+ }
+ }
+ }
+
+ // calculate rho
+
+ si->rho = calculate_rho();
+
+ // calculate objective value
+ {
+ double v = 0;
+ int i;
+ for (i=0;i<l;i++)
+ v += alpha[i] * (G[i] + b[i]);
+
+ si->obj = v/2;
+ }
+
+ // put back the solution
+ {
+ for (int i=0;i<l;i++)
+ alpha_[active_set[i]] = alpha[i];
+ }
+
+ // juggle everything back
+ /*{
+ for (int i=0;i<l;i++)
+ while(active_set[i] != i)
+ swap_index(i,active_set[i]);
+ // or Q.swap_index(i,active_set[i]);
+ }*/
+
+ si->upper_bound_p = Cp;
+ si->upper_bound_n = Cn;
+
+ info("\noptimization finished, #iter = %d\n",iter);
+
+ delete[] b;
+ delete[] y;
+ delete[] alpha;
+ delete[] alpha_status;
+ delete[] active_set;
+ delete[] G;
+ delete[] G_bar;
+}
+
+// return 1 if already optimal, return 0 otherwise
+int Solver::select_working_set(int &out_i, int &out_j)
+{
+ // return i,j such that
+ // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha)
+ // j: minimizes the decrease of obj value
+ // (if quadratic coefficient <= 0, replace it with tau)
+ // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha)
+
+ double Gmax = -INF;
+ int Gmax_idx = -1;
+ int Gmin_idx = -1;
+ double obj_diff_min = INF;
+
+ for (int t=0;t<active_size;t++)
+ if (y[t]==+1)
+ {
+ if (!is_upper_bound(t))
+ if (-G[t] >= Gmax)
+ {
+ Gmax = -G[t];
+ Gmax_idx = t;
+ }
+ }
+ else
+ {
+ if (!is_lower_bound(t))
+ if (G[t] >= Gmax)
+ {
+ Gmax = G[t];
+ Gmax_idx = t;
+ }
+ }
+
+ int i = Gmax_idx;
+ const Qfloat *Q_i = NULL;
+ if (i != -1) // NULL Q_i not accessed: Gmax=-INF if i=-1
+ Q_i = Q->get_Q(i,active_size);
+
+ for (int j=0;j<active_size;j++)
+ {
+ if (y[j]==+1)
+ {
+ if (!is_lower_bound(j))
+ {
+ double grad_diff=Gmax+G[j];
+ if (grad_diff >= eps)
+ {
+ double obj_diff;
+ double quad_coef=Q_i[i]+QD[j]-2*y[i]*Q_i[j];
+ if (quad_coef > 0)
+ obj_diff = -(grad_diff*grad_diff)/quad_coef;
+ else
+ obj_diff = -(grad_diff*grad_diff)/TAU;
+
+ if (obj_diff <= obj_diff_min)
+ {
+ Gmin_idx=j;
+ obj_diff_min = obj_diff;
+ }
+ }
+ }
+ }
+ else
+ {
+ if (!is_upper_bound(j))
+ {
+ double grad_diff= Gmax-G[j];
+ if (grad_diff >= eps)
+ {
+ double obj_diff;
+ double quad_coef=Q_i[i]+QD[j]+2*y[i]*Q_i[j];
+ if (quad_coef > 0)
+ obj_diff = -(grad_diff*grad_diff)/quad_coef;
+ else
+ obj_diff = -(grad_diff*grad_diff)/TAU;
+
+ if (obj_diff <= obj_diff_min)
+ {
+ Gmin_idx=j;
+ obj_diff_min = obj_diff;
+ }
+ }
+ }
+ }
+ }
+
+ if (Gmin_idx == -1)
+ return 1;
+
+ out_i = Gmax_idx;
+ out_j = Gmin_idx;
+ return 0;
+}
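+// Editor's note (illustrative, not part of the original source): for each
+// candidate j the loop above scores the predicted second-order decrease of
+// the dual objective,
+//     obj_diff = -(grad_diff)^2 / quad_coef,
+// with grad_diff = (-y_i*G[i]) - (-y_j*G[j]) and
+// quad_coef = K(x_i,x_i) + K(x_j,x_j) - 2*K(x_i,x_j);
+// the j giving the largest predicted decrease is paired with the
+// maximal-violating index i chosen in the first loop.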
+
+// return 1 if already optimal, return 0 otherwise
+int Solver::max_violating_pair(int &out_i, int &out_j)
+{
+ // return i,j: maximal violating pair
+
+ double Gmax1 = -INF; // max { -y_i * grad(f)_i | i in I_up(\alpha) }
+ int Gmax1_idx = -1;
+
+ double Gmax2 = -INF; // max { y_i * grad(f)_i | i in I_low(\alpha) }
+ int Gmax2_idx = -1;
+
+ for (int i=0;i<active_size;i++)
+ {
+ if (y[i]==+1) // y = +1
+ {
+ if (!is_upper_bound(i)) // d = +1
+ {
+ if (-G[i] >= Gmax1)
+ {
+ Gmax1 = -G[i];
+ Gmax1_idx = i;
+ }
+ }
+ if (!is_lower_bound(i)) // d = -1
+ {
+ if (G[i] >= Gmax2)
+ {
+ Gmax2 = G[i];
+ Gmax2_idx = i;
+ }
+ }
+ }
+ else // y = -1
+ {
+ if (!is_upper_bound(i)) // d = +1
+ {
+ if (-G[i] >= Gmax2)
+ {
+ Gmax2 = -G[i];
+ Gmax2_idx = i;
+ }
+ }
+ if (!is_lower_bound(i)) // d = -1
+ {
+ if (G[i] >= Gmax1)
+ {
+ Gmax1 = G[i];
+ Gmax1_idx = i;
+ }
+ }
+ }
+ }
+
+ if (Gmax1+Gmax2 < eps)
+ return 1;
+
+ out_i = Gmax1_idx;
+ out_j = Gmax2_idx;
+ return 0;
+}
+
+void Solver::do_shrinking()
+{
+ int i,j,k;
+ if (max_violating_pair(i,j)!=0) return;
+ double Gm1 = -y[j]*G[j];
+ double Gm2 = y[i]*G[i];
+
+ // shrink
+
+ for (k=0;k<active_size;k++)
+ {
+ if (is_lower_bound(k))
+ {
+ if (y[k]==+1)
+ {
+ if (-G[k] >= Gm1) continue;
+ }
+ else if (-G[k] >= Gm2) continue;
+ }
+ else if (is_upper_bound(k))
+ {
+ if (y[k]==+1)
+ {
+ if (G[k] >= Gm2) continue;
+ }
+ else if (G[k] >= Gm1) continue;
+ }
+ else continue;
+
+ --active_size;
+ swap_index(k,active_size);
+ --k; // look at the newcomer
+ }
+
+ // unshrink, check all variables again before final iterations
+
+ if (unshrinked || -(Gm1 + Gm2) > eps*10) return;
+
+ unshrinked = true;
+ reconstruct_gradient();
+
+ for (k=l-1;k>=active_size;k--)
+ {
+ if (is_lower_bound(k))
+ {
+ if (y[k]==+1)
+ {
+ if (-G[k] < Gm1) continue;
+ }
+ else if (-G[k] < Gm2) continue;
+ }
+ else if (is_upper_bound(k))
+ {
+ if (y[k]==+1)
+ {
+ if (G[k] < Gm2) continue;
+ }
+ else if (G[k] < Gm1) continue;
+ }
+ else continue;
+
+ swap_index(k,active_size);
+ active_size++;
+ ++k; // look at the newcomer
+ }
+}
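+// Editor's note (illustrative): shrinking temporarily drops variables that
+// sit at a bound and look optimal relative to the current maximal violation,
+// swapping them beyond active_size so the main loop ignores them; once the
+// remaining violation is within 10*eps the gradient is reconstructed and the
+// shrunk variables are re-checked.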
+
+double Solver::calculate_rho()
+{
+ double r;
+ int nr_free = 0;
+ double ub = INF, lb = -INF, sum_free = 0;
+ for (int i=0;i<active_size;i++)
+ {
+ double yG = y[i]*G[i];
+
+ if (is_lower_bound(i))
+ {
+ if (y[i] > 0)
+ ub = min(ub,yG);
+ else
+ lb = max(lb,yG);
+ }
+ else if (is_upper_bound(i))
+ {
+ if (y[i] < 0)
+ ub = min(ub,yG);
+ else
+ lb = max(lb,yG);
+ }
+ else
+ {
+ ++nr_free;
+ sum_free += yG;
+ }
+ }
+
+ if (nr_free>0)
+ r = sum_free/nr_free;
+ else
+ r = (ub+lb)/2;
+
+ return r;
+}
+
+//
+// Solver for nu-svm classification and regression
+//
+// additional constraint: e^T \alpha = constant
+//
+class Solver_NU : public Solver
+{
+public:
+ Solver_NU() {}
+ void Solve(int l, const QMatrix& Q, const double *b, const schar *y,
+ double *alpha, double Cp, double Cn, double eps,
+ SolutionInfo* si, int shrinking)
+ {
+ this->si = si;
+ Solver::Solve(l,Q,b,y,alpha,Cp,Cn,eps,si,shrinking);
+ }
+private:
+ SolutionInfo *si;
+ int select_working_set(int &i, int &j);
+ double calculate_rho();
+ void do_shrinking();
+};
+
+// return 1 if already optimal, return 0 otherwise
+int Solver_NU::select_working_set(int &out_i, int &out_j)
+{
+ // return i,j such that y_i = y_j and
+ // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha)
+ // j: minimizes the decrease of obj value
+ // (if quadratic coefficient <= 0, replace it with tau)
+ // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha)
+
+ double Gmaxp = -INF;
+ int Gmaxp_idx = -1;
+
+ double Gmaxn = -INF;
+ int Gmaxn_idx = -1;
+
+ int Gmin_idx = -1;
+ double obj_diff_min = INF;
+
+ for (int t=0;t<active_size;t++)
+ if (y[t]==+1)
+ {
+ if (!is_upper_bound(t))
+ if (-G[t] >= Gmaxp)
+ {
+ Gmaxp = -G[t];
+ Gmaxp_idx = t;
+ }
+ }
+ else
+ {
+ if (!is_lower_bound(t))
+ if (G[t] >= Gmaxn)
+ {
+ Gmaxn = G[t];
+ Gmaxn_idx = t;
+ }
+ }
+
+ int ip = Gmaxp_idx;
+ int in = Gmaxn_idx;
+ const Qfloat *Q_ip = NULL;
+ const Qfloat *Q_in = NULL;
+ if (ip != -1) // NULL Q_ip not accessed: Gmaxp=-INF if ip=-1
+ Q_ip = Q->get_Q(ip,active_size);
+ if (in != -1)
+ Q_in = Q->get_Q(in,active_size);
+
+ for (int j=0;j<active_size;j++)
+ {
+ if (y[j]==+1)
+ {
+ if (!is_lower_bound(j))
+ {
+ double grad_diff=Gmaxp+G[j];
+ if (grad_diff >= eps)
+ {
+ double obj_diff;
+ double quad_coef = Q_ip[ip]+QD[j]-2*Q_ip[j];
+ if (quad_coef > 0)
+ obj_diff = -(grad_diff*grad_diff)/quad_coef;
+ else
+ obj_diff = -(grad_diff*grad_diff)/TAU;
+
+ if (obj_diff <= obj_diff_min)
+ {
+ Gmin_idx=j;
+ obj_diff_min = obj_diff;
+ }
+ }
+ }
+ }
+ else
+ {
+ if (!is_upper_bound(j))
+ {
+ double grad_diff=Gmaxn-G[j];
+ if (grad_diff >= eps)
+ {
+ double obj_diff;
+ double quad_coef = Q_in[in]+QD[j]-2*Q_in[j];
+ if (quad_coef > 0)
+ obj_diff = -(grad_diff*grad_diff)/quad_coef;
+ else
+ obj_diff = -(grad_diff*grad_diff)/TAU;
+
+ if (obj_diff <= obj_diff_min)
+ {
+ Gmin_idx=j;
+ obj_diff_min = obj_diff;
+ }
+ }
+ }
+ }
+ }
+
+ if (Gmin_idx == -1)
+ return 1;
+
+ if (y[Gmin_idx] == +1)
+ out_i = Gmaxp_idx;
+ else
+ out_i = Gmaxn_idx;
+ out_j = Gmin_idx;
+
+ return 0;
+}
+
+void Solver_NU::do_shrinking()
+{
+ double Gmax1 = -INF; // max { -y_i * grad(f)_i | y_i = +1, i in I_up(\alpha) }
+ double Gmax2 = -INF; // max { y_i * grad(f)_i | y_i = +1, i in I_low(\alpha) }
+ double Gmax3 = -INF; // max { -y_i * grad(f)_i | y_i = -1, i in I_up(\alpha) }
+ double Gmax4 = -INF; // max { y_i * grad(f)_i | y_i = -1, i in I_low(\alpha) }
+
+ // find maximal violating pair first
+ int k;
+ for (k=0;k<active_size;k++)
+ {
+ if (!is_upper_bound(k))
+ {
+ if (y[k]==+1)
+ {
+ if (-G[k] > Gmax1) Gmax1 = -G[k];
+ }
+ else if (-G[k] > Gmax3) Gmax3 = -G[k];
+ }
+ if (!is_lower_bound(k))
+ {
+ if (y[k]==+1)
+ {
+ if (G[k] > Gmax2) Gmax2 = G[k];
+ }
+ else if (G[k] > Gmax4) Gmax4 = G[k];
+ }
+ }
+
+ // shrinking
+
+ double Gm1 = -Gmax2;
+ double Gm2 = -Gmax1;
+ double Gm3 = -Gmax4;
+ double Gm4 = -Gmax3;
+
+ for (k=0;k<active_size;k++)
+ {
+ if (is_lower_bound(k))
+ {
+ if (y[k]==+1)
+ {
+ if (-G[k] >= Gm1) continue;
+ }
+ else if (-G[k] >= Gm3) continue;
+ }
+ else if (is_upper_bound(k))
+ {
+ if (y[k]==+1)
+ {
+ if (G[k] >= Gm2) continue;
+ }
+ else if (G[k] >= Gm4) continue;
+ }
+ else continue;
+
+ --active_size;
+ swap_index(k,active_size);
+ --k; // look at the newcomer
+ }
+
+ // unshrink, check all variables again before final iterations
+
+ if (unshrinked || max(-(Gm1+Gm2),-(Gm3+Gm4)) > eps*10) return;
+
+ unshrinked = true;
+ reconstruct_gradient();
+
+ for (k=l-1;k>=active_size;k--)
+ {
+ if (is_lower_bound(k))
+ {
+ if (y[k]==+1)
+ {
+ if (-G[k] < Gm1) continue;
+ }
+ else if (-G[k] < Gm3) continue;
+ }
+ else if (is_upper_bound(k))
+ {
+ if (y[k]==+1)
+ {
+ if (G[k] < Gm2) continue;
+ }
+ else if (G[k] < Gm4) continue;
+ }
+ else continue;
+
+ swap_index(k,active_size);
+ active_size++;
+ ++k; // look at the newcomer
+ }
+}
+
+double Solver_NU::calculate_rho()
+{
+ int nr_free1 = 0,nr_free2 = 0;
+ double ub1 = INF, ub2 = INF;
+ double lb1 = -INF, lb2 = -INF;
+ double sum_free1 = 0, sum_free2 = 0;
+
+ for (int i=0;i<active_size;i++)
+ {
+ if (y[i]==+1)
+ {
+ if (is_lower_bound(i))
+ ub1 = min(ub1,G[i]);
+ else if (is_upper_bound(i))
+ lb1 = max(lb1,G[i]);
+ else
+ {
+ ++nr_free1;
+ sum_free1 += G[i];
+ }
+ }
+ else
+ {
+ if (is_lower_bound(i))
+ ub2 = min(ub2,G[i]);
+ else if (is_upper_bound(i))
+ lb2 = max(lb2,G[i]);
+ else
+ {
+ ++nr_free2;
+ sum_free2 += G[i];
+ }
+ }
+ }
+
+ double r1,r2;
+ if (nr_free1 > 0)
+ r1 = sum_free1/nr_free1;
+ else
+ r1 = (ub1+lb1)/2;
+
+ if (nr_free2 > 0)
+ r2 = sum_free2/nr_free2;
+ else
+ r2 = (ub2+lb2)/2;
+
+ si->r = (r1+r2)/2;
+ return (r1-r2)/2;
+}
+
+//
+// Q matrices for various formulations
+//
+class SVC_Q: public Kernel
+{
+public:
+ SVC_Q(const svm_problem& prob, const svm_parameter& param, const schar *y_)
+ :Kernel(prob.l, prob.x, param)
+ {
+ clone(y,y_,prob.l);
+ cache = new Cache(prob.l,(int)(param.cache_size*(1<<20)));
+ QD = new Qfloat[prob.l];
+ for (int i=0;i<prob.l;i++)
+ QD[i]= (Qfloat)(this->*kernel_function)(i,i);
+ }
+
+ Qfloat *get_Q(int i, int len) const
+ {
+ Qfloat *data;
+ int start;
+ if ((start = cache->get_data(i,&data,len)) < len)
+ {
+ for (int j=start;j<len;j++)
+ data[j] = (Qfloat)(y[i]*y[j]*(this->*kernel_function)(i,j));
+ }
+ return data;
+ }
+
+ Qfloat *get_QD() const
+ {
+ return QD;
+ }
+
+ void swap_index(int i, int j) const
+ {
+ cache->swap_index(i,j);
+ Kernel::swap_index(i,j);
+ swap(y[i],y[j]);
+ swap(QD[i],QD[j]);
+ }
+
+ ~SVC_Q()
+ {
+ delete[] y;
+ delete cache;
+ delete[] QD;
+ }
+private:
+ schar *y;
+ Cache *cache;
+ Qfloat *QD;
+};
+
+class ONE_CLASS_Q: public Kernel
+{
+public:
+ ONE_CLASS_Q(const svm_problem& prob, const svm_parameter& param)
+ :Kernel(prob.l, prob.x, param)
+ {
+ cache = new Cache(prob.l,(int)(param.cache_size*(1<<20)));
+ QD = new Qfloat[prob.l];
+ for (int i=0;i<prob.l;i++)
+ QD[i]= (Qfloat)(this->*kernel_function)(i,i);
+ }
+
+ Qfloat *get_Q(int i, int len) const
+ {
+ Qfloat *data;
+ int start;
+ if ((start = cache->get_data(i,&data,len)) < len)
+ {
+ for (int j=start;j<len;j++)
+ data[j] = (Qfloat)(this->*kernel_function)(i,j);
+ }
+ return data;
+ }
+
+ Qfloat *get_QD() const
+ {
+ return QD;
+ }
+
+ void swap_index(int i, int j) const
+ {
+ cache->swap_index(i,j);
+ Kernel::swap_index(i,j);
+ swap(QD[i],QD[j]);
+ }
+
+ ~ONE_CLASS_Q()
+ {
+ delete cache;
+ delete[] QD;
+ }
+private:
+ Cache *cache;
+ Qfloat *QD;
+};
+
+class SVR_Q: public Kernel
+{
+public:
+ SVR_Q(const svm_problem& prob, const svm_parameter& param)
+ :Kernel(prob.l, prob.x, param)
+ {
+ l = prob.l;
+ cache = new Cache(l,(int)(param.cache_size*(1<<20)));
+ QD = new Qfloat[2*l];
+ sign = new schar[2*l];
+ index = new int[2*l];
+ for (int k=0;k<l;k++)
+ {
+ sign[k] = 1;
+ sign[k+l] = -1;
+ index[k] = k;
+ index[k+l] = k;
+ QD[k]= (Qfloat)(this->*kernel_function)(k,k);
+ QD[k+l]=QD[k];
+ }
+ buffer[0] = new Qfloat[2*l];
+ buffer[1] = new Qfloat[2*l];
+ next_buffer = 0;
+ }
+
+ void swap_index(int i, int j) const
+ {
+ swap(sign[i],sign[j]);
+ swap(index[i],index[j]);
+ swap(QD[i],QD[j]);
+ }
+
+ Qfloat *get_Q(int i, int len) const
+ {
+ Qfloat *data;
+ int real_i = index[i];
+ if (cache->get_data(real_i,&data,l) < l)
+ {
+ for (int j=0;j<l;j++)
+ data[j] = (Qfloat)(this->*kernel_function)(real_i,j);
+ }
+
+ // reorder and copy
+ Qfloat *buf = buffer[next_buffer];
+ next_buffer = 1 - next_buffer;
+ schar si = sign[i];
+ for (int j=0;j<len;j++)
+ buf[j] = si * sign[j] * data[index[j]];
+ return buf;
+ }
+
+ Qfloat *get_QD() const
+ {
+ return QD;
+ }
+
+ ~SVR_Q()
+ {
+ delete cache;
+ delete[] sign;
+ delete[] index;
+ delete[] buffer[0];
+ delete[] buffer[1];
+ delete[] QD;
+ }
+private:
+ int l;
+ Cache *cache;
+ schar *sign;
+ int *index;
+ mutable int next_buffer;
+ Qfloat *buffer[2];
+ Qfloat *QD;
+};
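+// Editor's note (illustrative): SVR is solved over 2*l variables, the first l
+// corresponding to alpha and the second l to alpha*. sign[] holds +1/-1 for
+// the two halves and index[] maps each variable back to its original data
+// point, so a single cached kernel row of length l serves both halves of the
+// reordered Q row returned by get_Q.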
+
+//
+// construct and solve various formulations
+//
+static void solve_c_svc(
+ const svm_problem *prob, const svm_parameter* param,
+ double *alpha, Solver::SolutionInfo* si, double Cp, double Cn)
+{
+ int l = prob->l;
+ double *minus_ones = new double[l];
+ schar *y = new schar[l];
+
+ int i;
+
+ for (i=0;i<l;i++)
+ {
+ alpha[i] = 0;
+ minus_ones[i] = -1;
+ if (prob->y[i] > 0) y[i] = +1; else y[i]=-1;
+ }
+
+ Solver s;
+ s.Solve(l, SVC_Q(*prob,*param,y), minus_ones, y,
+ alpha, Cp, Cn, param->eps, si, param->shrinking);
+
+ double sum_alpha=0;
+ for (i=0;i<l;i++)
+ sum_alpha += alpha[i];
+
+ if (Cp==Cn)
+ info("nu = %f\n", sum_alpha/(Cp*prob->l));
+
+ for (i=0;i<l;i++)
+ alpha[i] *= y[i];
+
+ delete[] minus_ones;
+ delete[] y;
+}
+
+static void solve_nu_svc(
+ const svm_problem *prob, const svm_parameter *param,
+ double *alpha, Solver::SolutionInfo* si)
+{
+ int i;
+ int l = prob->l;
+ double nu = param->nu;
+
+ schar *y = new schar[l];
+
+ for (i=0;i<l;i++)
+ if (prob->y[i]>0)
+ y[i] = +1;
+ else
+ y[i] = -1;
+
+ double sum_pos = nu*l/2;
+ double sum_neg = nu*l/2;
+
+ for (i=0;i<l;i++)
+ if (y[i] == +1)
+ {
+ alpha[i] = min(1.0,sum_pos);
+ sum_pos -= alpha[i];
+ }
+ else
+ {
+ alpha[i] = min(1.0,sum_neg);
+ sum_neg -= alpha[i];
+ }
+
+ double *zeros = new double[l];
+
+ for (i=0;i<l;i++)
+ zeros[i] = 0;
+
+ Solver_NU s;
+ s.Solve(l, SVC_Q(*prob,*param,y), zeros, y,
+ alpha, 1.0, 1.0, param->eps, si, param->shrinking);
+ double r = si->r;
+
+ info("C = %f\n",1/r);
+
+ for (i=0;i<l;i++)
+ alpha[i] *= y[i]/r;
+
+ si->rho /= r;
+ si->obj /= (r*r);
+ si->upper_bound_p = 1/r;
+ si->upper_bound_n = 1/r;
+
+ delete[] y;
+ delete[] zeros;
+}
+
+static void solve_one_class(
+ const svm_problem *prob, const svm_parameter *param,
+ double *alpha, Solver::SolutionInfo* si)
+{
+ int l = prob->l;
+ double *zeros = new double[l];
+ schar *ones = new schar[l];
+ int i;
+
+ int n = (int)(param->nu*prob->l); // # of alpha's at upper bound
+
+ for (i=0;i<n;i++)
+ alpha[i] = 1;
+ if (n<prob->l)
+ alpha[n] = param->nu * prob->l - n;
+ for (i=n+1;i<l;i++)
+ alpha[i] = 0;
+
+ for (i=0;i<l;i++)
+ {
+ zeros[i] = 0;
+ ones[i] = 1;
+ }
+
+ Solver s;
+ s.Solve(l, ONE_CLASS_Q(*prob,*param), zeros, ones,
+ alpha, 1.0, 1.0, param->eps, si, param->shrinking);
+
+ delete[] zeros;
+ delete[] ones;
+}
+
+static void solve_epsilon_svr(
+ const svm_problem *prob, const svm_parameter *param,
+ double *alpha, Solver::SolutionInfo* si)
+{
+ int l = prob->l;
+ double *alpha2 = new double[2*l];
+ double *linear_term = new double[2*l];
+ schar *y = new schar[2*l];
+ int i;
+
+ for (i=0;i<l;i++)
+ {
+ alpha2[i] = 0;
+ linear_term[i] = param->p - prob->y[i];
+ y[i] = 1;
+
+ alpha2[i+l] = 0;
+ linear_term[i+l] = param->p + prob->y[i];
+ y[i+l] = -1;
+ }
+
+ Solver s;
+ s.Solve(2*l, SVR_Q(*prob,*param), linear_term, y,
+ alpha2, param->C, param->C, param->eps, si, param->shrinking);
+
+ double sum_alpha = 0;
+ for (i=0;i<l;i++)
+ {
+ alpha[i] = alpha2[i] - alpha2[i+l];
+ sum_alpha += fabs(alpha[i]);
+ }
+ info("nu = %f\n",sum_alpha/(param->C*l));
+
+ delete[] alpha2;
+ delete[] linear_term;
+ delete[] y;
+}
+
+static void solve_nu_svr(
+ const svm_problem *prob, const svm_parameter *param,
+ double *alpha, Solver::SolutionInfo* si)
+{
+ int l = prob->l;
+ double C = param->C;
+ double *alpha2 = new double[2*l];
+ double *linear_term = new double[2*l];
+ schar *y = new schar[2*l];
+ int i;
+
+ double sum = C * param->nu * l / 2;
+ for (i=0;i<l;i++)
+ {
+ alpha2[i] = alpha2[i+l] = min(sum,C);
+ sum -= alpha2[i];
+
+ linear_term[i] = - prob->y[i];
+ y[i] = 1;
+
+ linear_term[i+l] = prob->y[i];
+ y[i+l] = -1;
+ }
+
+ Solver_NU s;
+ s.Solve(2*l, SVR_Q(*prob,*param), linear_term, y,
+ alpha2, C, C, param->eps, si, param->shrinking);
+
+ info("epsilon = %f\n",-si->r);
+
+ for (i=0;i<l;i++)
+ alpha[i] = alpha2[i] - alpha2[i+l];
+
+ delete[] alpha2;
+ delete[] linear_term;
+ delete[] y;
+}
+
+//
+// decision_function
+//
+struct decision_function
+{
+ double *alpha;
+ double rho;
+};
+
+decision_function svm_train_one
+(
+ const svm_problem *prob, const svm_parameter *param,
+ double Cp, double Cn)
+{
+ double *alpha = Malloc(double,prob->l);
+ Solver::SolutionInfo si;
+ switch(param->svm_type)
+ {
+ case C_SVC:
+ solve_c_svc(prob,param,alpha,&si,Cp,Cn);
+ break;
+ case NU_SVC:
+ solve_nu_svc(prob,param,alpha,&si);
+ break;
+ case ONE_CLASS:
+ solve_one_class(prob,param,alpha,&si);
+ break;
+ case EPSILON_SVR:
+ solve_epsilon_svr(prob,param,alpha,&si);
+ break;
+ case NU_SVR:
+ solve_nu_svr(prob,param,alpha,&si);
+ break;
+ }
+
+ info("obj = %f, rho = %f\n",si.obj,si.rho);
+
+ // output SVs
+
+ int nSV = 0;
+ int nBSV = 0;
+ for (int i=0;i<prob->l;i++)
+ {
+ if (fabs(alpha[i]) > 0)
+ {
+ ++nSV;
+ if (prob->y[i] > 0)
+ {
+ if (fabs(alpha[i]) >= si.upper_bound_p)
+ ++nBSV;
+ }
+ else
+ {
+ if (fabs(alpha[i]) >= si.upper_bound_n)
+ ++nBSV;
+ }
+ }
+ }
+
+ info("nSV = %d, nBSV = %d\n",nSV,nBSV);
+
+ decision_function f;
+ f.alpha = alpha;
+ f.rho = si.rho;
+ return f;
+}
+
+//
+// svm_model
+//
+struct svm_model
+{
+ svm_parameter param; // parameter
+ int nr_class; // number of classes, = 2 in regression/one class svm
+ int l; // total #SV
+ svm_node **SV; // SVs (SV[l])
+ double **sv_coef; // coefficients for SVs in decision functions (sv_coef[n-1][l])
+ double *rho; // constants in decision functions (rho[n*(n-1)/2])
+ double *probA; // pairwise probability information
+ double *probB;
+
+ // for classification only
+
+ int *label; // label of each class (label[n])
+ int *nSV; // number of SVs for each class (nSV[n])
+ // nSV[0] + nSV[1] + ... + nSV[n-1] = l
+ // XXX
+ int free_sv; // 1 if svm_model is created by svm_load_model
+ // 0 if svm_model is created by svm_train
+};
+
+// Platt's binary SVM Probabilistic Output: an improvement from Lin et al.
+void sigmoid_train(
+ int l, const double *dec_values, const double *labels,
+ double& A, double& B)
+{
+ double prior1=0, prior0 = 0;
+ int i;
+
+ for (i=0;i<l;i++)
+ if (labels[i] > 0) prior1+=1;
+ else prior0+=1;
+
+ int max_iter=100; // Maximal number of iterations
+ double min_step=1e-10; // Minimal step taken in line search
+ double sigma=1e-3; // For numerically strict PD of Hessian
+ double eps=1e-5;
+ double hiTarget=(prior1+1.0)/(prior1+2.0);
+ double loTarget=1/(prior0+2.0);
+ double *t=Malloc(double,l);
+ double fApB,p,q,h11,h22,h21,g1,g2,det,dA,dB,gd,stepsize;
+ double newA,newB,newf,d1,d2;
+ int iter;
+
+ // Initial point and initial function value
+ A=0.0; B=log((prior0+1.0)/(prior1+1.0));
+ double fval = 0.0;
+
+ for (i=0;i<l;i++)
+ {
+ if (labels[i]>0) t[i]=hiTarget;
+ else t[i]=loTarget;
+ fApB = dec_values[i]*A+B;
+ if (fApB>=0)
+ fval += t[i]*fApB + log(1+exp(-fApB));
+ else
+ fval += (t[i] - 1)*fApB +log(1+exp(fApB));
+ }
+ for (iter=0;iter<max_iter;iter++)
+ {
+ // Update Gradient and Hessian (use H' = H + sigma I)
+ h11=sigma; // numerically ensures strict PD
+ h22=sigma;
+ h21=0.0;g1=0.0;g2=0.0;
+ for (i=0;i<l;i++)
+ {
+ fApB = dec_values[i]*A+B;
+ if (fApB >= 0)
+ {
+ p=exp(-fApB)/(1.0+exp(-fApB));
+ q=1.0/(1.0+exp(-fApB));
+ }
+ else
+ {
+ p=1.0/(1.0+exp(fApB));
+ q=exp(fApB)/(1.0+exp(fApB));
+ }
+ d2=p*q;
+ h11+=dec_values[i]*dec_values[i]*d2;
+ h22+=d2;
+ h21+=dec_values[i]*d2;
+ d1=t[i]-p;
+ g1+=dec_values[i]*d1;
+ g2+=d1;
+ }
+
+ // Stopping Criteria
+ if (fabs(g1)<eps && fabs(g2)<eps)
+ break;
+
+ // Finding Newton direction: -inv(H') * g
+ det=h11*h22-h21*h21;
+ dA=-(h22*g1 - h21 * g2) / det;
+ dB=-(-h21*g1+ h11 * g2) / det;
+ gd=g1*dA+g2*dB;
+
+
+ stepsize = 1; // Line Search
+ while (stepsize >= min_step)
+ {
+ newA = A + stepsize * dA;
+ newB = B + stepsize * dB;
+
+ // New function value
+ newf = 0.0;
+ for (i=0;i<l;i++)
+ {
+ fApB = dec_values[i]*newA+newB;
+ if (fApB >= 0)
+ newf += t[i]*fApB + log(1+exp(-fApB));
+ else
+ newf += (t[i] - 1)*fApB +log(1+exp(fApB));
+ }
+ // Check sufficient decrease
+ if (newf<fval+0.0001*stepsize*gd)
+ {
+ A=newA;B=newB;fval=newf;
+ break;
+ }
+ else
+ stepsize = stepsize / 2.0;
+ }
+
+ if (stepsize < min_step)
+ {
+ info("Line search fails in two-class probability estimates\n");
+ break;
+ }
+ }
+
+ if (iter>=max_iter)
+ info("Reaching maximal iterations in two-class probability estimates\n");
+ free(t);
+}
+
+double sigmoid_predict(double decision_value, double A, double B)
+{
+ double fApB = decision_value*A+B;
+ if (fApB >= 0)
+ return exp(-fApB)/(1.0+exp(-fApB));
+ else
+ return 1.0/(1+exp(fApB)) ;
+}
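+// Illustrative sketch (editor's addition, not in the original file): the pair
+// (A,B) fitted by sigmoid_train parameterizes P(y=+1 | f) = 1/(1+exp(A*f+B)),
+// which sigmoid_predict then evaluates for a new decision value f.
+// Hypothetical usage with made-up decision values and labels:
+//     double dec[4] = {2.1, 1.3, -0.7, -1.8};  // decision values f(x)
+//     double lab[4] = {+1, +1, -1, -1};        // class labels
+//     double A, B;
+//     sigmoid_train(4, dec, lab, A, B);
+//     double p = sigmoid_predict(0.5, A, B);   // estimated P(y=+1 | f=0.5)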
+
+// Method 2 from the multiclass_prob paper by Wu, Lin, and Weng
+void multiclass_probability(int k, double **r, double *p)
+{
+ int t,j;
+ int iter = 0, max_iter=100;
+ double **Q=Malloc(double *,k);
+ double *Qp=Malloc(double,k);
+ double pQp, eps=0.005/k;
+
+ for (t=0;t<k;t++)
+ {
+ p[t]=1.0/k; // Valid if k = 1
+ Q[t]=Malloc(double,k);
+ Q[t][t]=0;
+ for (j=0;j<t;j++)
+ {
+ Q[t][t]+=r[j][t]*r[j][t];
+ Q[t][j]=Q[j][t];
+ }
+ for (j=t+1;j<k;j++)
+ {
+ Q[t][t]+=r[j][t]*r[j][t];
+ Q[t][j]=-r[j][t]*r[t][j];
+ }
+ }
+ for (iter=0;iter<max_iter;iter++)
+ {
+ // stopping condition: recalculate Qp and pQp for numerical accuracy
+ pQp=0;
+ for (t=0;t<k;t++)
+ {
+ Qp[t]=0;
+ for (j=0;j<k;j++)
+ Qp[t]+=Q[t][j]*p[j];
+ pQp+=p[t]*Qp[t];
+ }
+ double max_error=0;
+ for (t=0;t<k;t++)
+ {
+ double error=fabs(Qp[t]-pQp);
+ if (error>max_error)
+ max_error=error;
+ }
+ if (max_error<eps) break;
+
+ for (t=0;t<k;t++)
+ {
+ double diff=(-Qp[t]+pQp)/Q[t][t];
+ p[t]+=diff;
+ pQp=(pQp+diff*(diff*Q[t][t]+2*Qp[t]))/(1+diff)/(1+diff);
+ for (j=0;j<k;j++)
+ {
+ Qp[j]=(Qp[j]+diff*Q[t][j])/(1+diff);
+ p[j]/=(1+diff);
+ }
+ }
+ }
+ if (iter>=max_iter)
+ info("Exceeds max_iter in multiclass_prob\n");
+ for (t=0;t<k;t++) free(Q[t]);
+ free(Q);
+ free(Qp);
+}
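+// Editor's note (illustrative): r is a k x k matrix of pairwise probability
+// estimates, r[i][j] ~ P(y = i | y = i or j) with r[j][i] = 1 - r[i][j]
+// (see svm_predict_probability below); the iteration above couples them into
+// a single probability vector p of length k that sums to 1.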
+
+// Cross-validation decision values for probability estimates
+void svm_binary_svc_probability(
+ const svm_problem *prob, const svm_parameter *param,
+ double Cp, double Cn, double& probA, double& probB)
+{
+ int i;
+ int nr_fold = 5;
+ int *perm = Malloc(int,prob->l);
+ double *dec_values = Malloc(double,prob->l);
+
+ // random shuffle
+ for (i=0;i<prob->l;i++) perm[i]=i;
+ for (i=0;i<prob->l;i++)
+ {
+ int j = i+rand()%(prob->l-i);
+ swap(perm[i],perm[j]);
+ }
+ for (i=0;i<nr_fold;i++)
+ {
+ int begin = i*prob->l/nr_fold;
+ int end = (i+1)*prob->l/nr_fold;
+ int j,k;
+ struct svm_problem subprob;
+
+ subprob.l = prob->l-(end-begin);
+ subprob.x = Malloc(struct svm_node*,subprob.l);
+ subprob.y = Malloc(double,subprob.l);
+
+ k=0;
+ for (j=0;j<begin;j++)
+ {
+ subprob.x[k] = prob->x[perm[j]];
+ subprob.y[k] = prob->y[perm[j]];
+ ++k;
+ }
+ for (j=end;j<prob->l;j++)
+ {
+ subprob.x[k] = prob->x[perm[j]];
+ subprob.y[k] = prob->y[perm[j]];
+ ++k;
+ }
+ int p_count=0,n_count=0;
+ for (j=0;j<k;j++)
+ if (subprob.y[j]>0)
+ p_count++;
+ else
+ n_count++;
+
+ if (p_count==0 && n_count==0)
+ for (j=begin;j<end;j++)
+ dec_values[perm[j]] = 0;
+ else if (p_count > 0 && n_count == 0)
+ for (j=begin;j<end;j++)
+ dec_values[perm[j]] = 1;
+ else if (p_count == 0 && n_count > 0)
+ for (j=begin;j<end;j++)
+ dec_values[perm[j]] = -1;
+ else
+ {
+ svm_parameter subparam = *param;
+ subparam.probability=0;
+ subparam.C=1.0;
+ subparam.nr_weight=2;
+ subparam.weight_label = Malloc(int,2);
+ subparam.weight = Malloc(double,2);
+ subparam.weight_label[0]=+1;
+ subparam.weight_label[1]=-1;
+ subparam.weight[0]=Cp;
+ subparam.weight[1]=Cn;
+ struct svm_model *submodel = svm_train(&subprob,&subparam);
+ for (j=begin;j<end;j++)
+ {
+ svm_predict_values(submodel,prob->x[perm[j]],&(dec_values[perm[j]]));
+ // ensure +1/-1 ordering; this is why the generic CV subroutine is not used here
+ dec_values[perm[j]] *= submodel->label[0];
+ }
+ svm_destroy_model(submodel);
+ svm_destroy_param(&subparam);
+ free(subprob.x);
+ free(subprob.y);
+ }
+ }
+ sigmoid_train(prob->l,dec_values,prob->y,probA,probB);
+ free(dec_values);
+ free(perm);
+}
+
+// Return parameter of a Laplace distribution
+double svm_svr_probability(
+ const svm_problem *prob, const svm_parameter *param)
+{
+ int i;
+ int nr_fold = 5;
+ double *ymv = Malloc(double,prob->l);
+ double mae = 0;
+
+ svm_parameter newparam = *param;
+ newparam.probability = 0;
+ svm_cross_validation(prob,&newparam,nr_fold,ymv);
+ for (i=0;i<prob->l;i++)
+ {
+ ymv[i]=prob->y[i]-ymv[i];
+ mae += fabs(ymv[i]);
+ }
+ mae /= prob->l;
+ double std=sqrt(2*mae*mae);
+ int count=0;
+ mae=0;
+ for (i=0;i<prob->l;i++)
+ if (fabs(ymv[i]) > 5*std)
+ count=count+1;
+ else
+ mae+=fabs(ymv[i]);
+ mae /= (prob->l-count);
+ info("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma= %g\n",mae);
+ free(ymv);
+ return mae;
+}
+
+
+// label: class labels, start: starting index of each class, count: number of data points per class,
+// perm: indices into the original data, grouped by class
+// perm, of length l, must be allocated before calling this subroutine
+void svm_group_classes(const svm_problem *prob, int *nr_class_ret, int **label_ret, int **start_ret, int **count_ret, int *perm)
+{
+ int l = prob->l;
+ int max_nr_class = 16;
+ int nr_class = 0;
+ int *label = Malloc(int,max_nr_class);
+ int *count = Malloc(int,max_nr_class);
+ int *data_label = Malloc(int,l);
+ int i;
+
+ for (i=0;i<l;i++)
+ {
+ int this_label = (int)prob->y[i];
+ int j;
+ for (j=0;j<nr_class;j++)
+ {
+ if (this_label == label[j])
+ {
+ ++count[j];
+ break;
+ }
+ }
+ data_label[i] = j;
+ if (j == nr_class)
+ {
+ if (nr_class == max_nr_class)
+ {
+ max_nr_class *= 2;
+ label = (int *)realloc(label,max_nr_class*sizeof(int));
+ count = (int *)realloc(count,max_nr_class*sizeof(int));
+ }
+ label[nr_class] = this_label;
+ count[nr_class] = 1;
+ ++nr_class;
+ }
+ }
+
+ int *start = Malloc(int,nr_class);
+ start[0] = 0;
+ for (i=1;i<nr_class;i++)
+ start[i] = start[i-1]+count[i-1];
+ for (i=0;i<l;i++)
+ {
+ perm[start[data_label[i]]] = i;
+ ++start[data_label[i]];
+ }
+ start[0] = 0;
+ for (i=1;i<nr_class;i++)
+ start[i] = start[i-1]+count[i-1];
+
+ *nr_class_ret = nr_class;
+ *label_ret = label;
+ *start_ret = start;
+ *count_ret = count;
+ free(data_label);
+}
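+// Worked example (editor's addition, values hypothetical): for
+// prob->y = {5, 7, 5, 5, 7} the routine returns nr_class = 2,
+// label = {5, 7}, count = {3, 2}, start = {0, 3}, and perm lists the
+// indices of class 5 first ({0, 2, 3}) followed by those of class 7 ({1, 4}).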
+
+//
+// Interface functions
+//
+svm_model* svm_train(const svm_problem *prob, const svm_parameter *param)
+{
+ svm_model *model = Malloc(svm_model,1);
+ model->param = *param;
+ model->free_sv = 0; // XXX
+
+ if (param->svm_type == ONE_CLASS || param->svm_type == EPSILON_SVR || param->svm_type == NU_SVR)
+ {
+ // regression or one-class-svm
+ model->nr_class = 2;
+ model->label = NULL;
+ model->nSV = NULL;
+ model->probA = NULL; model->probB = NULL;
+ model->sv_coef = Malloc(double *,1);
+
+ if (param->probability &&
+ (param->svm_type == EPSILON_SVR ||
+ param->svm_type == NU_SVR))
+ {
+ model->probA = Malloc(double,1);
+ model->probA[0] = svm_svr_probability(prob,param);
+ }
+
+ decision_function f = svm_train_one(prob,param,0,0);
+ model->rho = Malloc(double,1);
+ model->rho[0] = f.rho;
+
+ int nSV = 0;
+ int i;
+ for (i=0;i<prob->l;i++)
+ {
+ if (fabs(f.alpha[i]) > 0)
+ {
+ ++nSV;
+ }
+ }
+ model->l = nSV;
+ model->SV = Malloc(svm_node*, nSV);
+ model->sv_coef[0] = Malloc(double,nSV);
+ int j = 0;
+ for (i=0;i<prob->l;i++)
+ {
+ if (fabs(f.alpha[i]) > 0)
+ {
+ model->SV[j] = prob->x[i];
+ model->sv_coef[0][j] = f.alpha[i];
+ ++j;
+ }
+ }
+ free(f.alpha);
+ }
+ else
+ {
+ // classification
+ int l = prob->l;
+ int nr_class;
+ int *label = NULL;
+ int *start = NULL;
+ int *count = NULL;
+ int *perm = Malloc(int,l);
+
+ // group training data of the same class
+ svm_group_classes(prob,&nr_class,&label,&start,&count,perm);
+ svm_node** x = Malloc(svm_node *,l);
+ int i;
+ for (i=0;i<l;i++)
+ {
+ x[i] = prob->x[perm[i]];
+ }
+
+ // calculate weighted C
+
+ double *weighted_C = Malloc(double, nr_class);
+ for (i=0;i<nr_class;i++)
+ {
+ weighted_C[i] = param->C;
+ }
+ for (i=0;i<param->nr_weight;i++)
+ {
+ int j;
+ for (j=0;j<nr_class;j++)
+ if (param->weight_label[i] == label[j])
+ break;
+ if (j == nr_class)
+ {
+ fprintf(stderr,"warning: class label %d specified in weight is not found\n", param->weight_label[i]);
+ }
+ else
+ {
+ weighted_C[j] *= param->weight[i];
+ }
+ }
+
+ // train k*(k-1)/2 models
+
+ bool *nonzero = Malloc(bool,l);
+ for (i=0;i<l;i++)
+ {
+ nonzero[i] = false;
+ }
+ decision_function *f = Malloc(decision_function,nr_class*(nr_class-1)/2);
+
+ double *probA=NULL,*probB=NULL;
+ if (param->probability)
+ {
+ probA=Malloc(double,nr_class*(nr_class-1)/2);
+ probB=Malloc(double,nr_class*(nr_class-1)/2);
+ }
+
+ int p = 0;
+ for (i=0;i<nr_class;i++)
+ for (int j=i+1;j<nr_class;j++)
+ {
+ svm_problem sub_prob;
+ int si = start[i], sj = start[j];
+ int ci = count[i], cj = count[j];
+ sub_prob.l = ci+cj;
+ sub_prob.x = Malloc(svm_node *,sub_prob.l);
+ sub_prob.y = Malloc(double,sub_prob.l);
+ int k;
+ for (k=0;k<ci;k++)
+ {
+ sub_prob.x[k] = x[si+k];
+ sub_prob.y[k] = +1;
+ }
+ for (k=0;k<cj;k++)
+ {
+ sub_prob.x[ci+k] = x[sj+k];
+ sub_prob.y[ci+k] = -1;
+ }
+
+ if (param->probability)
+ svm_binary_svc_probability(&sub_prob,param,weighted_C[i],weighted_C[j],probA[p],probB[p]);
+
+ f[p] = svm_train_one(&sub_prob,param,weighted_C[i],weighted_C[j]);
+ for (k=0;k<ci;k++)
+ if (!nonzero[si+k] && fabs(f[p].alpha[k]) > 0)
+ nonzero[si+k] = true;
+ for (k=0;k<cj;k++)
+ if (!nonzero[sj+k] && fabs(f[p].alpha[ci+k]) > 0)
+ nonzero[sj+k] = true;
+ free(sub_prob.x);
+ free(sub_prob.y);
+ ++p;
+ }
+
+ // build output
+
+ model->nr_class = nr_class;
+
+ model->label = Malloc(int,nr_class);
+ for (i=0; i<nr_class; i++)
+ {
+ model->label[i] = label[i];
+ }
+
+ model->rho = Malloc(double,nr_class*(nr_class-1)/2);
+ for (i=0;i<nr_class*(nr_class-1)/2;i++)
+ {
+ model->rho[i] = f[i].rho;
+ }
+
+ if (param->probability)
+ {
+ model->probA = Malloc(double,nr_class*(nr_class-1)/2);
+ model->probB = Malloc(double,nr_class*(nr_class-1)/2);
+ for (i=0;i<nr_class*(nr_class-1)/2;i++)
+ {
+ model->probA[i] = probA[i];
+ model->probB[i] = probB[i];
+ }
+ }
+ else
+ {
+ model->probA=NULL;
+ model->probB=NULL;
+ }
+
+ int total_sv = 0;
+ int* nz_count = Malloc(int,nr_class);
+ model->nSV = Malloc(int,nr_class);
+ for (i=0;i<nr_class;i++)
+ {
+ int nSV = 0;
+ for (int j=0;j<count[i];j++)
+ {
+ if (nonzero[start[i]+j])
+ {
+ ++nSV;
+ ++total_sv;
+ }
+ }
+ model->nSV[i] = nSV;
+ nz_count[i] = nSV;
+ }
+
+ info("Total nSV = %d\n",total_sv);
+
+ model->l = total_sv;
+ model->SV = Malloc(svm_node *,total_sv);
+ p = 0;
+ for (i=0;i<l;i++)
+ if (nonzero[i]) model->SV[p++] = x[i];
+
+ int *nz_start = Malloc(int,nr_class);
+ nz_start[0] = 0;
+ for (i=1;i<nr_class;i++)
+ nz_start[i] = nz_start[i-1]+nz_count[i-1];
+
+ model->sv_coef = Malloc(double *,nr_class-1);
+ for (i=0;i<nr_class-1;i++)
+ model->sv_coef[i] = Malloc(double,total_sv);
+
+ p = 0;
+ for (i=0;i<nr_class;i++)
+ for (int j=i+1;j<nr_class;j++)
+ {
+ // classifier (i,j): coefficients with
+ // i are in sv_coef[j-1][nz_start[i]...],
+ // j are in sv_coef[i][nz_start[j]...]
+
+ int si = start[i];
+ int sj = start[j];
+ int ci = count[i];
+ int cj = count[j];
+
+ int q = nz_start[i];
+ int k;
+ for (k=0;k<ci;k++)
+ if (nonzero[si+k])
+ model->sv_coef[j-1][q++] = f[p].alpha[k];
+ q = nz_start[j];
+ for (k=0;k<cj;k++)
+ if (nonzero[sj+k])
+ model->sv_coef[i][q++] = f[p].alpha[ci+k];
+ ++p;
+ }
+
+ free(label);
+ free(probA);
+ free(probB);
+ free(count);
+ free(perm);
+ free(start);
+ free(x);
+ free(weighted_C);
+ free(nonzero);
+ for (i=0;i<nr_class*(nr_class-1)/2;i++)
+ free(f[i].alpha);
+ free(f);
+ free(nz_count);
+ free(nz_start);
+ }
+ return model;
+}
+
+// Stratified cross validation
+void svm_cross_validation(const svm_problem *prob, const svm_parameter *param, int nr_fold, double *target)
+{
+ int i;
+ int *fold_start = Malloc(int,nr_fold+1);
+ int l = prob->l;
+ int *perm = Malloc(int,l);
+ int nr_class;
+
+ // stratified CV may not give the leave-one-out rate:
+ // splitting each class into l folds can leave some folds with zero elements
+ if ((param->svm_type == C_SVC ||
+ param->svm_type == NU_SVC) && nr_fold < l)
+ {
+ int *start = NULL;
+ int *label = NULL;
+ int *count = NULL;
+ svm_group_classes(prob,&nr_class,&label,&start,&count,perm);
+
+ // randomly shuffle, then group the data by fold using the array perm
+ int *fold_count = Malloc(int,nr_fold);
+ int c;
+ int *index = Malloc(int,l);
+ for (i=0;i<l;i++)
+ index[i]=perm[i];
+ for (c=0; c<nr_class; c++)
+ for (i=0;i<count[c];i++)
+ {
+ int j = i+rand()%(count[c]-i);
+ swap(index[start[c]+j],index[start[c]+i]);
+ }
+ for (i=0;i<nr_fold;i++)
+ {
+ fold_count[i] = 0;
+ for (c=0; c<nr_class;c++)
+ fold_count[i]+=(i+1)*count[c]/nr_fold-i*count[c]/nr_fold;
+ }
+ fold_start[0]=0;
+ for (i=1;i<=nr_fold;i++)
+ fold_start[i] = fold_start[i-1]+fold_count[i-1];
+ for (c=0; c<nr_class;c++)
+ for (i=0;i<nr_fold;i++)
+ {
+ int begin = start[c]+i*count[c]/nr_fold;
+ int end = start[c]+(i+1)*count[c]/nr_fold;
+ for (int j=begin;j<end;j++)
+ {
+ perm[fold_start[i]] = index[j];
+ fold_start[i]++;
+ }
+ }
+ fold_start[0]=0;
+ for (i=1;i<=nr_fold;i++)
+ fold_start[i] = fold_start[i-1]+fold_count[i-1];
+ free(start);
+ free(label);
+ free(count);
+ free(index);
+ free(fold_count);
+ }
+ else
+ {
+ for (i=0;i<l;i++) perm[i]=i;
+ for (i=0;i<l;i++)
+ {
+ int j = i+rand()%(l-i);
+ swap(perm[i],perm[j]);
+ }
+ for (i=0;i<=nr_fold;i++)
+ fold_start[i]=i*l/nr_fold;
+ }
+
+ for (i=0;i<nr_fold;i++)
+ {
+ int begin = fold_start[i];
+ int end = fold_start[i+1];
+ int j,k;
+ struct svm_problem subprob;
+
+ subprob.l = l-(end-begin);
+ subprob.x = Malloc(struct svm_node*,subprob.l);
+ subprob.y = Malloc(double,subprob.l);
+
+ k=0;
+ for (j=0;j<begin;j++)
+ {
+ subprob.x[k] = prob->x[perm[j]];
+ subprob.y[k] = prob->y[perm[j]];
+ ++k;
+ }
+ for (j=end;j<l;j++)
+ {
+ subprob.x[k] = prob->x[perm[j]];
+ subprob.y[k] = prob->y[perm[j]];
+ ++k;
+ }
+ struct svm_model *submodel = svm_train(&subprob,param);
+ if (param->probability &&
+ (param->svm_type == C_SVC || param->svm_type == NU_SVC))
+ {
+ double *prob_estimates=Malloc(double,svm_get_nr_class(submodel));
+ for (j=begin;j<end;j++)
+ target[perm[j]] = svm_predict_probability(submodel,prob->x[perm[j]],prob_estimates);
+ free(prob_estimates);
+ }
+ else
+ for (j=begin;j<end;j++)
+ target[perm[j]] = svm_predict(submodel,prob->x[perm[j]]);
+ svm_destroy_model(submodel);
+ free(subprob.x);
+ free(subprob.y);
+ }
+ free(fold_start);
+ free(perm);
+}
+
+
+int svm_get_svm_type(const svm_model *model)
+{
+ return model->param.svm_type;
+}
+
+int svm_get_nr_class(const svm_model *model)
+{
+ return model->nr_class;
+}
+
+void svm_get_labels(const svm_model *model, int* label)
+{
+ if (model->label != NULL)
+ {
+ for (int i=0;i<model->nr_class;i++)
+ {
+ label[i] = model->label[i];
+ }
+ }
+}
+
+double svm_get_svr_probability(const svm_model *model)
+{
+ if ((model->param.svm_type == EPSILON_SVR || model->param.svm_type == NU_SVR) &&
+ model->probA!=NULL)
+ return model->probA[0];
+ else
+ {
+ info("Model doesn't contain information for SVR probability inference\n");
+ return 0;
+ }
+}
+
+void svm_predict_values(const svm_model *model, const svm_node *x, double* dec_values)
+{
+ if (model->param.svm_type == ONE_CLASS ||
+ model->param.svm_type == EPSILON_SVR ||
+ model->param.svm_type == NU_SVR)
+ {
+ double *sv_coef = model->sv_coef[0];
+ double sum = 0;
+ for (int i=0;i<model->l;i++)
+ sum += sv_coef[i] * Kernel::k_function(x,model->SV[i],model->param);
+ sum -= model->rho[0];
+ *dec_values = sum;
+ }
+ else
+ {
+ int i;
+ int nr_class = model->nr_class;
+ int l = model->l;
+
+ double *kvalue = Malloc(double,l);
+ for (i=0;i<l;i++)
+ kvalue[i] = Kernel::k_function(x,model->SV[i],model->param);
+
+ int *start = Malloc(int,nr_class);
+ start[0] = 0;
+ for (i=1;i<nr_class;i++)
+ start[i] = start[i-1]+model->nSV[i-1];
+
+ int p=0;
+ int pos=0;
+ for (i=0;i<nr_class;i++)
+ for (int j=i+1;j<nr_class;j++)
+ {
+ double sum = 0;
+ int si = start[i];
+ int sj = start[j];
+ int ci = model->nSV[i];
+ int cj = model->nSV[j];
+
+ int k;
+ double *coef1 = model->sv_coef[j-1];
+ double *coef2 = model->sv_coef[i];
+ for (k=0;k<ci;k++)
+ sum += coef1[si+k] * kvalue[si+k];
+ for (k=0;k<cj;k++)
+ sum += coef2[sj+k] * kvalue[sj+k];
+ sum -= model->rho[p++];
+ dec_values[pos++] = sum;
+ }
+
+ free(kvalue);
+ free(start);
+ }
+}
+
+double svm_predict(const svm_model *model, const svm_node *x)
+{
+ if (model->param.svm_type == ONE_CLASS ||
+ model->param.svm_type == EPSILON_SVR ||
+ model->param.svm_type == NU_SVR)
+ {
+ double res;
+ svm_predict_values(model, x, &res);
+
+ if (model->param.svm_type == ONE_CLASS)
+ return (res>0)?1:-1;
+ else
+ return res;
+ }
+ else
+ {
+ int i;
+ int nr_class = model->nr_class;
+ double *dec_values = Malloc(double, nr_class*(nr_class-1)/2);
+ svm_predict_values(model, x, dec_values);
+
+ int *vote = Malloc(int,nr_class);
+ for (i=0;i<nr_class;i++)
+ vote[i] = 0;
+ int pos=0;
+ for (i=0;i<nr_class;i++)
+ for (int j=i+1;j<nr_class;j++)
+ {
+ if (dec_values[pos++] > 0)
+ ++vote[i];
+ else
+ ++vote[j];
+ }
+
+ int vote_max_idx = 0;
+ for (i=1;i<nr_class;i++)
+ if (vote[i] > vote[vote_max_idx])
+ vote_max_idx = i;
+ free(vote);
+ free(dec_values);
+ return model->label[vote_max_idx];
+ }
+}
+
+double svm_predict_probability(
+ const svm_model *model, const svm_node *x, double *prob_estimates)
+{
+ if ((model->param.svm_type == C_SVC || model->param.svm_type == NU_SVC) &&
+ model->probA!=NULL && model->probB!=NULL)
+ {
+ int i;
+ int nr_class = model->nr_class;
+ double *dec_values = Malloc(double, nr_class*(nr_class-1)/2);
+ svm_predict_values(model, x, dec_values);
+
+ double min_prob=1e-7;
+ double **pairwise_prob=Malloc(double *,nr_class);
+ for (i=0;i<nr_class;i++)
+ pairwise_prob[i]=Malloc(double,nr_class);
+ int k=0;
+ for (i=0;i<nr_class;i++)
+ for (int j=i+1;j<nr_class;j++)
+ {
+ pairwise_prob[i][j]=min(max(sigmoid_predict(dec_values[k],model->probA[k],model->probB[k]),min_prob),1-min_prob);
+ pairwise_prob[j][i]=1-pairwise_prob[i][j];
+ k++;
+ }
+ multiclass_probability(nr_class,pairwise_prob,prob_estimates);
+
+ int prob_max_idx = 0;
+ for (i=1;i<nr_class;i++)
+ if (prob_estimates[i] > prob_estimates[prob_max_idx])
+ prob_max_idx = i;
+ for (i=0;i<nr_class;i++)
+ free(pairwise_prob[i]);
+ free(dec_values);
+ free(pairwise_prob);
+ return model->label[prob_max_idx];
+ }
+ else
+ return svm_predict(model, x);
+}
+
+const char *svm_type_table[] =
+{
+ "c_svc","nu_svc","one_class","epsilon_svr","nu_svr",NULL
+};
+
+const char *kernel_type_table[]=
+{
+ "linear","polynomial","rbf","sigmoid",NULL
+};
+
+int svm_save_model(const char *model_file_name, const svm_model *model)
+{
+ FILE *fp = fopen(model_file_name,"w");
+ if (fp==NULL) return -1;
+
+ const svm_parameter& param = model->param;
+
+ fprintf(fp,"svm_type %s\n", svm_type_table[param.svm_type]);
+ fprintf(fp,"kernel_type %s\n", kernel_type_table[param.kernel_type]);
+
+ if (param.kernel_type == POLY)
+ fprintf(fp,"degree %g\n", param.degree);
+
+ if (param.kernel_type == POLY || param.kernel_type == RBF || param.kernel_type == SIGMOID)
+ fprintf(fp,"gamma %g\n", param.gamma);
+
+ if (param.kernel_type == POLY || param.kernel_type == SIGMOID)
+ fprintf(fp,"coef0 %g\n", param.coef0);
+
+ int nr_class = model->nr_class;
+ int l = model->l;
+ fprintf(fp, "nr_class %d\n", nr_class);
+ fprintf(fp, "total_sv %d\n",l);
+
+ {
+ fprintf(fp, "rho");
+ for (int i=0;i<nr_class*(nr_class-1)/2;i++)
+ fprintf(fp," %g",model->rho[i]);
+ fprintf(fp, "\n");
+ }
+
+ if (model->label)
+ {
+ fprintf(fp, "label");
+ for (int i=0;i<nr_class;i++)
+ fprintf(fp," %d",model->label[i]);
+ fprintf(fp, "\n");
+ }
+
+ if (model->probA) // regression has probA only
+ {
+ fprintf(fp, "probA");
+ for (int i=0;i<nr_class*(nr_class-1)/2;i++)
+ fprintf(fp," %g",model->probA[i]);
+ fprintf(fp, "\n");
+ }
+ if (model->probB)
+ {
+ fprintf(fp, "probB");
+ for (int i=0;i<nr_class*(nr_class-1)/2;i++)
+ fprintf(fp," %g",model->probB[i]);
+ fprintf(fp, "\n");
+ }
+
+ if (model->nSV)
+ {
+ fprintf(fp, "nr_sv");
+ for (int i=0;i<nr_class;i++)
+ fprintf(fp," %d",model->nSV[i]);
+ fprintf(fp, "\n");
+ }
+
+ fprintf(fp, "SV\n");
+ const double * const *sv_coef = model->sv_coef;
+ const svm_node * const *SV = model->SV;
+
+ for (int i=0;i<l;i++)
+ {
+ for (int j=0;j<nr_class-1;j++)
+ fprintf(fp, "%.16g ",sv_coef[j][i]);
+
+ const svm_node *p = SV[i];
+ while(p->index != -1)
+ {
+ fprintf(fp,"%d:%.8g ",p->index,p->value);
+ p++;
+ }
+ fprintf(fp, "\n");
+ }
+
+ fclose(fp);
+ return 0;
+}
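+// Editor's note (illustrative): the text layout written above looks like, for
+// example (values hypothetical; probA/probB lines appear only when
+// probability information is present):
+//     svm_type c_svc
+//     kernel_type rbf
+//     gamma 0.5
+//     nr_class 2
+//     total_sv 3
+//     rho -0.123
+//     label 1 -1
+//     nr_sv 2 1
+//     SV
+//     <coef> <index>:<value> <index>:<value> ...   (one line per SV)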
+
+svm_model *svm_load_model(const char *model_file_name)
+{
+ FILE *fp = fopen(model_file_name,"rb");
+ if (fp==NULL) return NULL;
+
+ // read parameters
+
+ svm_model *model = Malloc(svm_model,1);
+ svm_parameter& param = model->param;
+ model->rho = NULL;
+ model->probA = NULL;
+ model->probB = NULL;
+ model->label = NULL;
+ model->nSV = NULL;
+
+ char cmd[81];
+ while(1)
+ {
+ fscanf(fp,"%80s",cmd);
+
+ if (strcmp(cmd,"svm_type")==0)
+ {
+ fscanf(fp,"%80s",cmd);
+ int i;
+ for (i=0;svm_type_table[i];i++)
+ {
+ if (strcmp(svm_type_table[i],cmd)==0)
+ {
+ param.svm_type=i;
+ break;
+ }
+ }
+ if (svm_type_table[i] == NULL)
+ {
+ fprintf(stderr,"unknown svm type.\n");
+ free(model->rho);
+ free(model->label);
+ free(model->nSV);
+ free(model);
+ return NULL;
+ }
+ }
+ else if (strcmp(cmd,"kernel_type")==0)
+ {
+ fscanf(fp,"%80s",cmd);
+ int i;
+ for (i=0;kernel_type_table[i];i++)
+ {
+ if (strcmp(kernel_type_table[i],cmd)==0)
+ {
+ param.kernel_type=i;
+ break;
+ }
+ }
+ if (kernel_type_table[i] == NULL)
+ {
+ fprintf(stderr,"unknown kernel function.\n");
+ free(model->rho);
+ free(model->label);
+ free(model->nSV);
+ free(model);
+ return NULL;
+ }
+ }
+ else if (strcmp(cmd,"degree")==0)
+ fscanf(fp,"%lf",¶m.degree);
+ else if (strcmp(cmd,"gamma")==0)
+ fscanf(fp,"%lf",¶m.gamma);
+ else if (strcmp(cmd,"coef0")==0)
+ fscanf(fp,"%lf",¶m.coef0);
+ else if (strcmp(cmd,"nr_class")==0)
+ fscanf(fp,"%d",&model->nr_class);
+ else if (strcmp(cmd,"total_sv")==0)
+ fscanf(fp,"%d",&model->l);
+ else if (strcmp(cmd,"rho")==0)
+ {
+ int n = model->nr_class * (model->nr_class-1)/2;
+ model->rho = Malloc(double,n);
+ for (int i=0;i<n;i++)
+ fscanf(fp,"%lf",&model->rho[i]);
+ }
+ else if (strcmp(cmd,"label")==0)
+ {
+ int n = model->nr_class;
+ model->label = Malloc(int,n);
+ for (int i=0;i<n;i++)
+ fscanf(fp,"%d",&model->label[i]);
+ }
+ else if (strcmp(cmd,"probA")==0)
+ {
+ int n = model->nr_class * (model->nr_class-1)/2;
+ model->probA = Malloc(double,n);
+ for (int i=0;i<n;i++)
+ fscanf(fp,"%lf",&model->probA[i]);
+ }
+ else if (strcmp(cmd,"probB")==0)
+ {
+ int n = model->nr_class * (model->nr_class-1)/2;
+ model->probB = Malloc(double,n);
+ for (int i=0;i<n;i++)
+ fscanf(fp,"%lf",&model->probB[i]);
+ }
+ else if (strcmp(cmd,"nr_sv")==0)
+ {
+ int n = model->nr_class;
+ model->nSV = Malloc(int,n);
+ for (int i=0;i<n;i++)
+ fscanf(fp,"%d",&model->nSV[i]);
+ }
+ else if (strcmp(cmd,"SV")==0)
+ {
+ while(1)
+ {
+ int c = getc(fp);
+ if (c==EOF || c=='\n') break;
+ }
+ break;
+ }
+ else
+ {
+ fprintf(stderr,"unknown text in model file\n");
+ free(model->rho);
+ free(model->label);
+ free(model->nSV);
+ free(model);
+ return NULL;
+ }
+ }
+
+ // read sv_coef and SV
+
+ int elements = 0;
+ long pos = ftell(fp);
+
+ while(1)
+ {
+ int c = fgetc(fp);
+ switch(c)
+ {
+ case '\n':
+ // count the '-1' element
+ case ':':
+ ++elements;
+ break;
+ case EOF:
+ goto out;
+ default:
+ ;
+ }
+ }
+out:
+ fseek(fp,pos,SEEK_SET);
+
+ int m = model->nr_class - 1;
+ int l = model->l;
+ model->sv_coef = Malloc(double *,m);
+ int i;
+ for (i=0;i<m;i++)
+ model->sv_coef[i] = Malloc(double,l);
+ model->SV = Malloc(svm_node*,l);
+ svm_node *x_space=NULL;
+ if (l>0) x_space = Malloc(svm_node,elements);
+
+ int j=0;
+ for (i=0;i<l;i++)
+ {
+ model->SV[i] = &x_space[j];
+ for (int k=0;k<m;k++)
+ fscanf(fp,"%lf",&model->sv_coef[k][i]);
+ while(1)
+ {
+ int c;
+ do {
+ c = getc(fp);
+ if (c=='\n') goto out2;
+ } while(isspace(c));
+ ungetc(c,fp);
+ fscanf(fp,"%d:%lf",&(x_space[j].index),&(x_space[j].value));
+ ++j;
+ }
+out2:
+ x_space[j++].index = -1;
+ }
+
+ fclose(fp);
+
+ model->free_sv = 1; // XXX
+ return model;
+}
+
+void svm_destroy_model(svm_model* model)
+{
+ if (model->free_sv && model->l > 0)
+ free((void *)(model->SV[0]));
+ for (int i=0;i<model->nr_class-1;i++)
+ free(model->sv_coef[i]);
+ free(model->SV);
+ free(model->sv_coef);
+ free(model->rho);
+ free(model->label);
+ free(model->probA);
+ free(model->probB);
+ free(model->nSV);
+ free(model);
+}
+
+void svm_destroy_param(svm_parameter* param)
+{
+ free(param->weight_label);
+ free(param->weight);
+}
+
+const char* svm_check_parameter(const svm_problem *prob, const svm_parameter *param)
+{
+ // svm_type
+
+ int svm_type = param->svm_type;
+ if (svm_type != C_SVC &&
+ svm_type != NU_SVC &&
+ svm_type != ONE_CLASS &&
+ svm_type != EPSILON_SVR &&
+ svm_type != NU_SVR)
+ {
+ return "unknown svm type";
+ }
+
+ // kernel_type
+
+ int kernel_type = param->kernel_type;
+ if (kernel_type != LINEAR &&
+ kernel_type != POLY &&
+ kernel_type != RBF &&
+ kernel_type != SIGMOID)
+ {
+ return "unknown kernel type";
+ }
+
+ // cache_size,eps,C,nu,p,shrinking
+
+ if (param->cache_size <= 0)
+ {
+ return "cache_size <= 0";
+ }
+
+ if (param->eps <= 0)
+ {
+ return "eps <= 0";
+ }
+
+ if (svm_type == C_SVC ||
+ svm_type == EPSILON_SVR ||
+ svm_type == NU_SVR)
+ {
+ if (param->C <= 0)
+ {
+ return "C <= 0";
+ }
+ }
+
+ if (svm_type == NU_SVC ||
+ svm_type == ONE_CLASS ||
+ svm_type == NU_SVR)
+ {
+ if (param->nu < 0 || param->nu > 1)
+ return "nu < 0 or nu > 1";
+ }
+
+ if (svm_type == EPSILON_SVR)
+ {
+ if (param->p < 0)
+ {
+ return "p < 0";
+ }
+ }
+
+ if (param->shrinking != 0 &&
+ param->shrinking != 1)
+ {
+ return "shrinking != 0 and shrinking != 1";
+ }
+
+ if (param->probability != 0 &&
+ param->probability != 1)
+ {
+ return "probability != 0 and probability != 1";
+ }
+
+ if (param->probability == 1 &&
+ svm_type == ONE_CLASS)
+ {
+ return "one-class SVM probability output not supported yet";
+ }
+
+
+ // check whether nu-svc is feasible
+
+ if (svm_type == NU_SVC)
+ {
+ int l = prob->l;
+ int max_nr_class = 16;
+ int nr_class = 0;
+ int *label = Malloc(int,max_nr_class);
+ int *count = Malloc(int,max_nr_class);
+
+ int i;
+ for (i=0; i<l; i++)
+ {
+ int this_label = (int)prob->y[i];
+ int j;
+ for (j=0;j<nr_class;j++)
+ if (this_label == label[j])
+ {
+ ++count[j];
+ break;
+ }
+ if (j == nr_class)
+ {
+ if (nr_class == max_nr_class)
+ {
+ max_nr_class *= 2;
+ label = (int *)realloc(label,max_nr_class*sizeof(int));
+ count = (int *)realloc(count,max_nr_class*sizeof(int));
+ }
+ label[nr_class] = this_label;
+ count[nr_class] = 1;
+ ++nr_class;
+ }
+ }
+
+ for (i=0;i<nr_class;i++)
+ {
+ int n1 = count[i];
+ for (int j=i+1;j<nr_class;j++)
+ {
+ int n2 = count[j];
+ if (param->nu*(n1+n2)/2 > min(n1,n2))
+ {
+ free(label);
+ free(count);
+ return "specified nu is infeasible";
+ }
+ }
+ }
+ free(label);
+ free(count);
+ }
+
+ return NULL;
+}
+
+int svm_check_probability_model(const svm_model *model)
+{
+ return ((model->param.svm_type == C_SVC || model->param.svm_type == NU_SVC) &&
+ model->probA!=NULL && model->probB!=NULL) ||
+ ((model->param.svm_type == EPSILON_SVR || model->param.svm_type == NU_SVR) &&
+ model->probA!=NULL);
+}
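+// Minimal usage sketch (editor's addition, not part of the original file);
+// the feature data and parameter values are hypothetical. A caller fills an
+// svm_problem and svm_parameter, checks the parameters, trains, predicts,
+// and frees the model:
+//
+//     svm_parameter param;
+//     param.svm_type = C_SVC;       param.kernel_type = RBF;
+//     param.gamma = 0.5;            param.C = 1.0;
+//     param.cache_size = 100;       param.eps = 1e-3;
+//     param.nu = 0.5;               param.p = 0.1;
+//     param.degree = 3;             param.coef0 = 0;
+//     param.shrinking = 1;          param.probability = 0;
+//     param.nr_weight = 0;          param.weight_label = NULL;
+//     param.weight = NULL;
+//
+//     svm_problem prob;
+//     /* fill prob.l, prob.y, prob.x with training data */
+//     const char *err = svm_check_parameter(&prob, &param);
+//     if (err == NULL)
+//     {
+//         svm_model *model = svm_train(&prob, &param);
+//         double pred = svm_predict(model, prob.x[0]);
+//         svm_destroy_model(model);
+//     }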
diff --git a/PySVM/svm.h b/PySVM/svm.h
new file mode 100644
index 0000000..e613473
--- /dev/null
+++ b/PySVM/svm.h
@@ -0,0 +1,72 @@
+
+
+#ifndef _LIBSVM_H
+#define _LIBSVM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct svm_node
+{
+ int index;
+ double value;
+};
+
+struct svm_problem
+{
+ int l;
+ double *y;
+ struct svm_node **x;
+};
+
+enum { C_SVC, NU_SVC, ONE_CLASS, EPSILON_SVR, NU_SVR }; /* svm_type */
+enum { LINEAR, POLY, RBF, SIGMOID }; /* kernel_type */
+
+struct svm_parameter
+{
+ int svm_type;
+ int kernel_type;
+ double degree; /* for poly */
+ double gamma; /* for poly/rbf/sigmoid */
+ double coef0; /* for poly/sigmoid */
+
+ /* these are for training only */
+ double cache_size; /* in MB */
+ double eps; /* stopping criteria */
+ double C; /* for C_SVC, EPSILON_SVR and NU_SVR */
+ int nr_weight; /* for C_SVC */
+ int *weight_label; /* for C_SVC */
+ double* weight; /* for C_SVC */
+ double nu; /* for NU_SVC, ONE_CLASS, and NU_SVR */
+ double p; /* for EPSILON_SVR */
+ int shrinking; /* use the shrinking heuristics */
+ int probability; /* do probability estimates */
+};
+
+struct svm_model *svm_train(const struct svm_problem *prob, const struct svm_parameter *param);
+void svm_cross_validation(const struct svm_problem *prob, const struct svm_parameter *param, int nr_fold, double *target);
+
+int svm_save_model(const char *model_file_name, const struct svm_model *model);
+struct svm_model *svm_load_model(const char *model_file_name);
+
+int svm_get_svm_type(const struct svm_model *model);
+int svm_get_nr_class(const struct svm_model *model);
+void svm_get_labels(const struct svm_model *model, int *label);
+double svm_get_svr_probability(const struct svm_model *model);
+
+void svm_predict_values(const struct svm_model *model, const struct svm_node *x, double* dec_values);
+double svm_predict(const struct svm_model *model, const struct svm_node *x);
+double svm_predict_probability(const struct svm_model *model, const struct svm_node *x, double* prob_estimates);
+
+void svm_destroy_model(struct svm_model *model);
+void svm_destroy_param(struct svm_parameter *param);
+
+const char *svm_check_parameter(const struct svm_problem *prob, const struct svm_parameter *param);
+int svm_check_probability_model(const struct svm_model *model);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBSVM_H */
diff --git a/ReleaseFiles.txt b/ReleaseFiles.txt
new file mode 100644
index 0000000..5890795
--- /dev/null
+++ b/ReleaseFiles.txt
@@ -0,0 +1,234 @@
+# This document lists all the files to be included in the
+# standard release of Inspect. The script Release.py is
+# responsible for parsing this file, copying stuff as
+# needed, and zipping the release.
+
+####################################
+# Inspect source code (to be kept in synch with Makefile):
+base64.c
+BN.c
+BuildMS2DB.c
+ChargeState.c
+CMemLeak.c
+Errors.c
+ExonGraphAlign.c
+FreeMod.c
+main.c
+Mods.c
+MS2DB.c
+ParseInput.c
+ParseXML.c
+PValue.c
+Run.c
+Score.c
+Scorpion.c
+SNP.c
+Spectrum.c
+Spliced.c
+SpliceDB.c
+SpliceScan.c
+SVM.c
+Tagger.c
+Trie.c
+TagFile.c
+Utils.c
+base64.h
+BN.h
+BuildMS2DB.h
+ChargeState.h
+CMemLeak.h
+Errors.h
+ExonGraphAlign.h
+FreeMod.h
+Inspect.h
+Mods.h
+MS2DB.h
+ParseInput.h
+ParseXML.h
+PValue.h
+Run.h
+Score.h
+Scorpion.h
+SNP.h
+Spectrum.h
+Spliced.h
+SpliceDB.h
+SpliceScan.h
+SVM.h
+Tagger.h
+Trie.h
+TagFile.h
+Utils.h
+LDA.c
+LDA.h
+ParentMass.c
+ParentMass.h
+IonScoring.c
+IonScoring.h
+TagFile.c
+TagFile.h
+
+# Other build-specific files:
+Inspect.sln
+Inspect.vcproj
+Makefile
+ReleaseFiles.txt
+BuildInspect.py
+
+####################################
+# Executables and dlls:
+#Inspect.exe
+#libexpat.dll
+
+####################################
+# PyInspect stuff:
+PyInspect.pyd
+PyInspect/PyInspect.c
+PyInspect/PySpectrum.c
+PyInspect/PySpectrum.h
+PyInspect/PyUtils.c
+PyInspect/PyUtils.h
+ReleasePyInspect.py
+
+####################################
+# PySVM stuff:
+PySVM.pyd
+PySVM/PySVM.c
+PySVM/PySVM.sln
+PySVM/PySVM.vcproj
+PySVM/svm-predict.c
+PySVM/svm.cpp
+PySVM/svm.h
+ReleasePySVM.py
+
+####################################
+# Inspect data files:
+AminoAcidMasses.txt
+InVivoModifications.txt
+InVitroModifications.txt
+CCSVM1.model
+CCSVM1.range
+CCSVM2.model
+CCSVM2.range
+Ch2BNPEP.dat
+Ch2BNPEPQ.dat
+Ch3BNPEP.dat
+Ch3BNPEPQ.dat
+IsotopePatterns.txt
+PMCLDA1.model
+PMCLDA2.model
+PMCLDA3.model
+PRM2.dat
+PRM3.dat
+PRMQ2.dat
+PRMQ3.dat
+PTMods.txt
+ScoringModel.dat
+Database/CommonContaminants.fasta
+TagSkewScores.dat
+PRM2.bn
+PRM3.bn
+TAG2.bn
+TAG3.bn
+MQScoreSVM2.model
+MQScoreSVM2.range
+MQScoreSVM3.model
+MQScoreSVM3.range
+MQScoreLDA2.model
+MQScoreLDA3.model
+PhosCut2.bn
+PhosCut3.bn
+CCSVM2Phos.model
+CCSVM2Phos.range
+PMCLDA2Phos.model
+PMCLDA3Phos.model
+
+####################################
+# Documentation:
+docs/Analysis.html
+docs/Copyright.html
+docs/Database.html
+docs/index.html
+docs/Installation.html
+docs/MS2DB.html
+docs/Searching.html
+docs/UnrestrictedSearchTutorial.pdf
+docs/RunningInspectOnTheFWGrid.pdf
+docs/InspectTutorial.pdf
+docs/PLSTutorial.pdf
+
+####################################
+# Analysis scripts:
+BasicStats.py
+Global.py
+Label.py
+LDA.py
+Learning.py
+MakeImage.py
+MSSpectrum.py
+PrepDB.py
+PLSUtils.py
+FDRUtils.py
+ResultsParser.py
+Score.py
+SelectProteins.py
+ShuffleDB.py
+Summary.py
+SystemTest.py
+Utils.py
+ParseXML.py
+InspectToPepXML.py
+ProteinGrouper.py
+TrieUtils.py
+ComputeFScore.py
+ComputeFDR.jar
+BuildMS2DB.jar
+MS2DBShuffler.jar
+
+####################################
+# Old PTM analysis scripts:
+ExplainPTMs.py
+PTMChooserLM.py
+PTMDatabase.txt
+
+####################################
+# New PTM analysis scripts:
+ComputePTMFeatures.py
+CombinePTMFeatures.py
+BuildMGF.py
+PTMSearchBigDB.py
+TrainPTMFeatures.py
+AdjustPTM.py
+BuildConsensusSpectrum.py
+PTMAnalysis.py
+SpectralSimilarity.py
+RunPySVM.py
+CompareHEKPTM.py
+GetByteOffset.py
+StripPTM.py
+PhosphateLocalization.py
+
+####################################
+# System tests:
+SystemTest/TestInput.txt
+SystemTest/TestInputMod.txt
+SystemTest/TestPMC.txt
+SystemTest/TestSpectrum.dta
+SystemTest/TestSpectra.pkl
+SystemTest/TestInputTag1.txt
+SystemTest/TestInputTag3.txt
+SystemTest/SimpleChromosome.trie
+SystemTest/SimpleGenes.gff
+SystemTest/BuildSimpleChromosome.txt
+SystemTest/TestMS2.txt
+SystemTest/Yeast.ms2
+SystemTest/YeastSmall.fasta
+Database/TestDatabase.index
+Database/TestDatabase.trie
+SystemTest/Shew_Short.fasta
+SystemTest/TestCDTA.txt
+SystemTest/Shew_dta.txt
+
+
+# Not for production use:
+#TestSuite.py
diff --git a/ReleasePyInspect.py b/ReleasePyInspect.py
new file mode 100644
index 0000000..de81fb9
--- /dev/null
+++ b/ReleasePyInspect.py
@@ -0,0 +1,67 @@
+#Title: ReleasePyInspect.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+Script to build PyInspect
+"""
+import sys
+import os
+
+import distutils.core
+
+PyInspectFileNames = [
+ "PyInspect/PyInspect.c", "PyInspect/PySpectrum.c", "PyInspect/PyUtils.c",
+ "base64.c", "BN.c", "BuildMS2DB.c", "ChargeState.c", "CMemLeak.c",
+ "Errors.c", "ExonGraphAlign.c", "FreeMod.c", "IonScoring.c", "LDA.c",
+ "Mods.c", "MS2DB.c", "ParentMass.c", "ParseInput.c", "ParseXML.c", "PValue.c",
+ "Run.c", "Score.c", "Scorpion.c", "SNP.c",
+ "Spectrum.c", "Spliced.c", "SpliceDB.c",
+ "SpliceScan.c", "SVM.c", "Tagger.c", "Trie.c", "Utils.c","TagFile.c"]
+
+def Main(Arguments):
+ print "Prepping PyInspect..."
+ if sys.platform == "win32":
+ LibraryList = ["libexpat"]
+ else:
+ LibraryList = ["expat"]
+
+ PyInspectExtension = distutils.core.Extension('PyInspect',
+ sources = PyInspectFileNames,
+ include_dirs = [".", "expat/lib"],
+ library_dirs = ["expat/lib/release","pdk_wrapper"],
+ libraries = LibraryList)
+
+ distutils.core.setup(name = 'PyInspect', version = '1.0', ext_modules=[PyInspectExtension],
+ script_args = Arguments)
+
+if __name__ == "__main__":
+ Main(sys.argv[1:])
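+# Editor's note (illustrative): the command-line arguments are handed straight
+# to distutils as script_args, so a typical invocation would be
+#     python ReleasePyInspect.py build_ext --inplace
+# which compiles the C sources listed above into the PyInspect extension module.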
diff --git a/ReleasePySVM.py b/ReleasePySVM.py
new file mode 100644
index 0000000..577eaab
--- /dev/null
+++ b/ReleasePySVM.py
@@ -0,0 +1,48 @@
+#Title: ReleasePySVM.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+Script to build PySVM
+"""
+import sys
+import distutils.core
+
+def Main(Arguments):
+ print "Prepping PySVM..."
+ PySVMFileNames = ['PySVM/PySVM.c',"PySVM/svm-predict.c", "PySVM/svm.cpp",]
+ PySVMExtension = distutils.core.Extension('PySVM', sources = PySVMFileNames)
+ distutils.core.setup(name = 'PySVM', version = '1.0', ext_modules = [PySVMExtension],
+ script_args = Arguments)
+
+if __name__ == "__main__":
+ Main(sys.argv[1:])
+
diff --git a/ResultsParser.py b/ResultsParser.py
new file mode 100644
index 0000000..4a08097
--- /dev/null
+++ b/ResultsParser.py
@@ -0,0 +1,152 @@
+#Title: ResultsParser.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+Constants and methods for parsing (Inspect) search results
+"""
+import os
+import random
+class Columns:
+
+ DefaultInspectHeader = "#SpectrumFile\tScan#\tAnnotation\tProtein\tCharge\tMQScore\tLength\tTotalPRMScore\tMedianPRMScore\tFractionY\tFractionB\tIntensity\tNTT\tInspectFDR\tF-Score\tDeltaScore\tDeltaScoreOther\tRecordNumber\tDBFilePos\tSpecFilePos\tPrecursorMZ\tPrecursorMZError\tSpecIndex"
+
+
+ def __init__(self):
+ self.initializeHeaders(self.DefaultInspectHeader) # populates self.headers
+
+ def initializeHeaders(self,Header):
+ if Header[0] == '#':
+ Header = Header[1:]
+
+ self.headers = Header.lower().split("\t")
+
+ def getIndex(self,headerVal):
+
+ for i in range(0,len(self.headers)):
+ if headerVal.lower() == self.headers[i]:
+ return i
+
+ return -1
+
+
+# "Constants for which columns contain which data"
+# SpectrumFile = 0
+# ScanNumber = 1
+# Annotation = 2
+# ProteinName = 3
+# Charge = 4
+# MQScore = 5
+# Length = 6
+# NTT = 12
+# PValue = 13
+# FScore = 14
+# DeltaScoreAny = 15
+# DeltaScore = 16
+# ProteinID = 17
+# DBPos = 18
+# FileOffset = 19 #Spectrum File pos
+# ParentMZ = 20 #Corrected, associated with tweak
+# MZError = 21
+
+# #More columns for splicing
+# Chromosome = 22
+# Strand = 23
+# GenomicPost = 24
+# SplicedSequence = 25
+# Splices = 26
+# SearchDB = 27
+
+
+
+class SpectrumOracleMixin:
+ def __init__(self):
+ self.SpectrumOracle = {}
+ def FixSpectrumPath(self, Path):
+ FileName = os.path.split(Path)[-1]
+ Stub = os.path.splitext(FileName)[0]
+ return self.SpectrumOracle.get(Stub, Path)
+ def PopulateSpectrumOracle(self, RootDirectory):
+ """
+ Used when mzxml files are spread over multiple subdirectories.
+ MZXMLOracle[Stub] = full path to the corresponding MZXML file
+ Used with -M option (not with -s option)
+ """
+ if not RootDirectory or not os.path.exists(RootDirectory):
+ return
+ print "Populate oracle from %s..."%RootDirectory
+ for SubFileName in os.listdir(RootDirectory):
+ # Avoid expensive iteration through results directories:
+ if SubFileName[:7] == "Results":
+ continue
+ SubFilePath = os.path.join(RootDirectory, SubFileName)
+ if os.path.isdir(SubFilePath):
+ self.PopulateSpectrumOracle(SubFilePath)
+ continue
+ (Stub, Extension) = os.path.splitext(SubFileName)
+ Extension = Extension.lower()
+ if Extension == ".mzxml":
+ self.SpectrumOracle[Stub] = os.path.join(RootDirectory, SubFileName)
+ elif Extension == ".mgf":
+ self.SpectrumOracle[Stub] = os.path.join(RootDirectory, SubFileName)
+ elif Extension == ".ms2":
+ self.SpectrumOracle[Stub] = os.path.join(RootDirectory, SubFileName)
+
+class ResultsParser:
+ def __init__(self, *args, **kw):
+ #self.Columns = Columns
+ self.Running = 1
+ def ProcessResultsFiles(self, FilePath, Callback, MaxFilesToParse = None, QuietFlag = 0):
+ """
+ Function for applying a Callback function to one search-results file, or to every
+ search-results file in a directory.
+ """
+ print "ResultsParser:%s"%FilePath
+ FileCount = 0
+ if os.path.isdir(FilePath):
+ FileNames = os.listdir(FilePath)
+ random.shuffle(FileNames)
+ for FileNameIndex in range(len(FileNames)):
+ FileName = FileNames[FileNameIndex]
+ if not QuietFlag:
+ print "(%s/%s) %s"%(FileNameIndex, len(FileNames), FileName)
+ (Stub, Extension) = os.path.splitext(FileName)
+ if Extension.lower() not in (".txt", ".filtered", ".res", ".csv", ".out"):
+ continue
+ FileCount += 1
+ SubFilePath = os.path.join(FilePath, FileName)
+ apply(Callback, (SubFilePath,))
+ # Don't parse every single file, that will take too long!
+ if MaxFilesToParse != None and FileCount > MaxFilesToParse:
+ break
+ else:
+ apply(Callback, (FilePath,))
+
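ProcessResultsFiles dispatches a callback either to a single results file or to every file in a directory with a recognized extension, while Columns maps header names onto column indices. A minimal usage sketch (the results path is a placeholder):

    import ResultsParser

    class MQScorePrinter(ResultsParser.ResultsParser):
        def __init__(self):
            ResultsParser.ResultsParser.__init__(self)
            self.Columns = ResultsParser.Columns()
        def ParseFile(self, FilePath):
            MQScoreIndex = self.Columns.getIndex("MQScore")
            for FileLine in open(FilePath, "rb"):
                if not FileLine.strip() or FileLine[0] == "#":
                    continue
                Bits = FileLine.rstrip("\r\n").split("\t")
                if len(Bits) > MQScoreIndex:
                    print Bits[2], Bits[MQScoreIndex] # annotation and MQScore

    Parser = MQScorePrinter()
    Parser.ProcessResultsFiles("ExampleResults.txt", Parser.ParseFile)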
diff --git a/Run.c b/Run.c
new file mode 100644
index 0000000..35f9949
--- /dev/null
+++ b/Run.c
@@ -0,0 +1,1492 @@
+//Title: Run.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+#include "CMemLeak.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <string.h>
+#include <math.h>
+#include "Trie.h"
+#include "Utils.h"
+#include "Run.h"
+#include "Tagger.h"
+#include "Score.h"
+#include "FreeMod.h"
+#include "Spliced.h"
+#include "Errors.h"
+#include "BN.h"
+#include "SVM.h"
+#include "Scorpion.h"
+#include "ChargeState.h"
+#include "PValue.h"
+#include "MS2DB.h"
+#include "IonScoring.h"
+#include "TagFile.h" //ARI_MOD
+
+
+extern float g_CutScores[];
+extern PRMBayesianModel* PRMModelCharge2;
+
+// Forward Declaration
+void DebugPrintBlindTagExtensions(SearchInfo* Info);
+void AttemptParentMassPeakRemoval(MSSpectrum* Spectrum);
+void RestoreParentMassPeakRemoval(MSSpectrum* Spectrum);
+
+
+TrieTag* TagGraphGenerateTags(TagGraph* Graph, MSSpectrum* Spectrum, int* TagCount,
+ int MaximumTagCount, SpectrumTweak* Tweak, float TagEdgeScoreMultiplier,
+ PRMBayesianModel* Model);
+
+void OutputMatchesForSpectrum(SpectrumNode* Node, FILE* OutputFile)
+{
+ char MatchedPeptideVerbose[256];
+ char PeptideName[256];
+ Peptide* Match;
+ //Peptide* NextMatch;
+ int MatchNumber = 0;
+ int FeatureIndex;
+ static int FirstCall = 1;
+ PeptideSpliceNode* SpliceNode;
+
+ double FileMass;
+ double PeptideMass;
+ double PeptideMZ;
+
+ int bytecount = 0;
+
+ //
+ if (!OutputFile)
+ {
+ return;
+ }
+ PeptideName[0] = '\0';
+ SetMatchDeltaCN(Node);
+
+ //SetMatchDeltaCN(Node);
+ Match = Node->FirstMatch;
+ while (Match)
+ {
+
+ GetProteinID(Match->RecordNumber, Match->DB, PeptideName);
+
+
+ // Write a header line:
+ if (FirstCall)
+ {
+ FirstCall = 0;
+ if(fprintf(OutputFile, "#SpectrumFile\tScan#\tAnnotation\tProtein\tCharge\t")<0)
+ {
+ REPORT_ERROR_S(50,GlobalOptions->OutputFileName);
+ exit(50);
+ }
+ if(fprintf(OutputFile, "MQScore\tLength\tTotalPRMScore\tMedianPRMScore\tFractionY\tFractionB\tIntensity\tNTT\t") < 0)
+ {
+ REPORT_ERROR_S(50,GlobalOptions->OutputFileName);
+ exit(50);
+ }
+ if(fprintf(OutputFile, "InspectFDR\tF-Score\t") < 0)
+ {
+ REPORT_ERROR_S(50,GlobalOptions->OutputFileName);
+ exit(50);
+ }
+ if(fprintf(OutputFile, "DeltaScore\tDeltaScoreOther\tRecordNumber\tDBFilePos\tSpecFilePos\tPrecursorMZ\tPrecursorMZError") < 0)
+ {
+ REPORT_ERROR_S(50,GlobalOptions->OutputFileName);
+ exit(50);
+ }
+ if (GlobalOptions->FirstDatabase->Type != evDBTypeTrie)
+ {
+ if(fprintf(OutputFile, "\tChromosome\tStrand\tGenomicPos\tSplicedSequence\tSplices\tSearchedDB") < 0)
+ {
+ REPORT_ERROR_S(50,GlobalOptions->OutputFileName);
+ exit(50);
+ }
+ }
+ if(fprintf(OutputFile,"\tSpecIndex") < 0)
+ {
+ REPORT_ERROR_S(50,GlobalOptions->OutputFileName);
+ exit(50);
+ }
+ if(fprintf(OutputFile, "\n") < 0)
+ {
+ REPORT_ERROR_S(50,GlobalOptions->OutputFileName);
+ exit(50);
+ }
+ fflush(OutputFile);
+ }
+
+ //GetProteinID(Match->RecordNumber, IndexFile, PeptideName, 1);
+ WriteMatchToString(Match, MatchedPeptideVerbose, 1);
+
+ //bytecount = fprintf(OutputFile, "TEST");
+ //fprintf(OutputFile, "bytecount:%d",bytecount);
+ //printf("TESTEST\tTEST\n");
+ //fflush(OutputFile);
+ //fprintf(OutputFile, "XX%dXX\t", Node->ScanNumber);
+
+ // Which spectrum?
+ fprintf(OutputFile, "%s\t%d\t", Node->InputFile->FileName, Node->ScanNumber);
+ // What's the match?
+ fprintf(OutputFile, "%s\t%s\t%d\t%.3f\t", MatchedPeptideVerbose, PeptideName, Match->Tweak->Charge, Match->MatchQualityScore);
+ // How good is the match?
+ for (FeatureIndex = 0; FeatureIndex < MQ_FEATURE_COUNT; FeatureIndex++)
+ {
+ fprintf(OutputFile, "%.3f\t", Match->ScoreFeatures[FeatureIndex]);
+ }
+ //fprintf(OutputFile, "%.3f\t", Match->InitialScore / 1000.0);
+ fprintf(OutputFile, "%.5f\t", Match->FScore);
+ fprintf(OutputFile, "%.5f\t", Match->PValue);
+ fprintf(OutputFile, "%.3f\t", Match->DeltaCN);
+ fprintf(OutputFile, "%.3f\t", Match->DeltaCNOther);
+ //fprintf(OutputFile, "%.3f\t", Match->ParentMassError / 100.0); // Temp: Parent mass error (for FT)
+ // Extra fields, for debugging:
+ fprintf(OutputFile, "%d\t%d\t%d\t", Match->RecordNumber, Match->FilePos, Node->FilePosition);
+
+ //FileMass = ((float)Node->Spectrum->MZ) * Match->Tweak->Charge - (Match->Tweak->Charge-1)*1007.8;
+ fprintf(OutputFile,"%.3f\t",(double)Node->Spectrum->FileMZ/MASS_SCALE);
+
+ PeptideMass = (double)GetPeptideParentMass(Match);
+ PeptideMZ = (double)(PeptideMass + (Match->Tweak->Charge-1)*1007.8)/Match->Tweak->Charge;
+ PeptideMZ = PeptideMZ/MASS_SCALE;
+
+ fprintf(OutputFile,"%.3f",((double)Node->Spectrum->FileMZ/MASS_SCALE - PeptideMZ));
+
+ //NEC_DEBUG
+ //printf("%s\t%d\t%s\t",Node->InputFile->FileName, Node->ScanNumber,MatchedPeptideVerbose);
+ //printf("%.3f\t%.3f\t%d\n",Match->MatchQualityScore,PeptideMass,Match->Tweak->ParentMass);
+ //fprintf(OutputFile,"\t%.3f",(double)PeptideMass);
+ //fprintf(OutputFile,"\t%d\t%d",Match->Tweak->Charge, Match->Tweak->ParentMass);
+
+ ////////////////////////////////////////////////////////////
+ // If it's a splice-tolerant search, then output some information about the match:
+ if (Match->DB->Type != evDBTypeTrie)
+ {
+ if (Match->ChromosomeNumber >= 0)
+ {
+ fprintf(OutputFile, "\t%d", Match->ChromosomeNumber);
+ fprintf(OutputFile, "\t%d", Match->ChromosomeForwardFlag);
+ }
+ else
+ {
+ fprintf(OutputFile, "\t");
+ fprintf(OutputFile, "\t");
+ }
+ if (Match->GenomicLocationStart >= 0)
+ {
+ fprintf(OutputFile, "\t%d-%d",
+ min(Match->GenomicLocationStart, Match->GenomicLocationEnd),
+ max(Match->GenomicLocationStart, Match->GenomicLocationEnd));
+ }
+ else
+ {
+ fprintf(OutputFile, "\t");
+ }
+ fprintf(OutputFile, "\t%s", Match->SplicedBases);
+ if(Match->SpliceHead)
+ {
+ for (SpliceNode = Match->SpliceHead; SpliceNode; SpliceNode = SpliceNode->Next)
+ {
+ fprintf(OutputFile, "\t%d-%d ", SpliceNode->DonorPos, SpliceNode->AcceptorPos);
+ }
+ }
+ else
+ fprintf(OutputFile,"\t");
+
+ fprintf(OutputFile, "\t%s", Match->DB->FileName);
+ }
+ fprintf(OutputFile,"\t%d",Node->SpecIndex);
+ fprintf(OutputFile, "\n");
+ Match = Match->Next;
+ MatchNumber++;
+ if (MatchNumber >= GlobalOptions->ReportMatchCount)
+ {
+ break;
+ }
+ }
+ //printf("Wrote out %d matches for '%s'.\n", MatchNumber, Node->FileName);
+ fflush(OutputFile);
+}
+
+#define vprintf(x) if (VerboseFlag) printf(x)
+
+void MutationModeSearch(SearchInfo* Info)
+{
+ Peptide* FirstMatch = NULL;
+ Peptide* LastMatch = NULL;
+ Peptide* NextOldMatchNode;
+ Peptide* OldMatchNode;
+ Peptide* MatchNode;
+ Peptide* NextMatchNode;
+ Peptide* FreeNode;
+ Peptide* FreePrev;
+ int MatchCount = 0;
+ int VerboseFlag = 0;
+ int TweakIndex;
+ MSSpectrum* Spectrum = Info->Spectrum;
+ SpectrumNode* Node = Info->Spectrum->Node;
+
+
+ if (Spectrum->PeakCount < 10) // Demand AT LEAST ten peaks (even that many is a bit silly; 50 is more like it)
+ {
+ return;
+ }
+
+ for (TweakIndex = 0; TweakIndex < TWEAK_COUNT; TweakIndex++)
+ {
+ if (!Node->Tweaks[TweakIndex].Charge)
+ {
+ continue;
+ }
+
+ fseek(Info->DB->DBFile, 0, 0);
+ // *** PRM scores now ***
+ Spectrum->Charge = Node->Tweaks[TweakIndex].Charge;
+ Spectrum->ParentMass = Node->Tweaks[TweakIndex].ParentMass;
+ //vprintf("[V] Assign isotope neighbors\n");
+ //SpectrumAssignIsotopeNeighbors(Node->Spectrum);
+ //vprintf("[V] Find isotopic peaks\n");
+ //SpectrumFindIsotopicPeaks(Node->Spectrum);
+ FreeTagGraph(Node->Spectrum->Graph);
+ vprintf("[V] Construct tag graph\n");
+ Node->Spectrum->Graph = ConstructTagGraph(Node->Spectrum);
+ vprintf("[V] Add nodes\n");
+ TagGraphAddNodes(Node->Spectrum->Graph, Node->Spectrum);
+ vprintf("[V] Score nodes\n");
+ TagGraphScorePRMNodes(NULL, Node->Spectrum->Graph, Node->Spectrum, Node->Tweaks + TweakIndex);
+ vprintf("[V] Populate back edges\n");
+ if (GlobalOptions->MaxPTMods > 1)
+ {
+ TagGraphPopulateBackEdges(Node->Spectrum->Graph);
+ }
+ vprintf("[V] Set PRM scores\n");
+ SetSpectrumPRMScores(Node->Spectrum, Node->Tweaks + TweakIndex);
+ vprintf("[V] Tagless search 1:\n");
+
+ SearchDatabaseTagless(Info, GlobalOptions->MaxPTMods, VerboseFlag, Node->Tweaks + TweakIndex);
+ ////////////
+ vprintf("[V] Score matches:\n");
+ vprintf("[V] merge multi-charge list:\n");
+ OldMatchNode = FirstMatch;
+ if (FirstMatch)
+ {
+ NextOldMatchNode = FirstMatch->Next;
+ }
+ else
+ {
+ NextOldMatchNode = NULL;
+ }
+ MatchNode = Node->FirstMatch;
+ if (MatchNode)
+ {
+ NextMatchNode = MatchNode->Next;
+ }
+ else
+ {
+ NextMatchNode = NULL;
+ }
+ MatchCount = 0;
+ FirstMatch = NULL;
+ LastMatch = NULL;
+ while (MatchNode || OldMatchNode)
+ {
+ if (!MatchNode || (OldMatchNode && MatchNode->InitialScore < OldMatchNode->InitialScore))
+ {
+ // Add one of the old matches to the master-list:
+ if (FirstMatch)
+ {
+ LastMatch->Next = OldMatchNode;
+ OldMatchNode->Prev = LastMatch;
+ LastMatch = OldMatchNode;
+ LastMatch->Next = NULL;
+ }
+ else
+ {
+ FirstMatch = OldMatchNode;
+ LastMatch = OldMatchNode;
+ OldMatchNode->Prev = NULL;
+ OldMatchNode->Next = NULL;
+ }
+ OldMatchNode = NextOldMatchNode;
+ if (OldMatchNode)
+ {
+ NextOldMatchNode = OldMatchNode->Next;
+ }
+ else
+ {
+ NextOldMatchNode = NULL;
+ }
+
+ }
+ else
+ {
+ // Add one of the new matches to the master-list:
+ if (FirstMatch)
+ {
+ LastMatch->Next = MatchNode;
+ MatchNode->Prev = LastMatch;
+ LastMatch = MatchNode;
+ LastMatch->Next = NULL;
+ }
+ else
+ {
+ FirstMatch = MatchNode;
+ LastMatch = MatchNode;
+ MatchNode->Prev = NULL;
+ MatchNode->Next = NULL;
+ }
+ MatchNode = NextMatchNode;
+ if (MatchNode)
+ {
+ NextMatchNode = MatchNode->Next;
+ }
+ else
+ {
+ NextMatchNode = NULL;
+ }
+ }
+ MatchCount++;
+ if (MatchCount >= GlobalOptions->StoreMatchCount)
+ {
+ break;
+ }
+ }
+ // Now we can free any remaining matches from these lists:
+ FreeNode = MatchNode;
+ FreePrev = NULL;
+ while (FreeNode)
+ {
+ if (FreePrev)
+ {
+ FreePeptideNode(FreePrev);
+ }
+ FreePrev = FreeNode;
+ FreeNode = FreeNode->Next;
+ }
+ if (FreePrev)
+ {
+ FreePeptideNode(FreePrev);
+ }
+ FreeNode = OldMatchNode;
+ FreePrev = NULL;
+ while (FreeNode)
+ {
+ if (FreePrev)
+ {
+ FreePeptideNode(FreePrev);
+ }
+ FreePrev = FreeNode;
+ FreeNode = FreeNode->Next;
+ }
+ if (FreePrev)
+ {
+ FreePeptideNode(FreePrev);
+ }
+ Node->FirstMatch = NULL;
+ Node->LastMatch = NULL;
+ Node->MatchCount = 0;
+ // Check the master-list for duplicates:
+ for (MatchNode = FirstMatch; MatchNode; MatchNode = MatchNode->Next)
+ {
+ for (OldMatchNode = MatchNode->Next; OldMatchNode; OldMatchNode = OldMatchNode->Next)
+ {
+ if (OldMatchNode->RecordNumber == MatchNode->RecordNumber && !strcmp(OldMatchNode->Bases, MatchNode->Bases) &&
+ !memcmp(MatchNode->AminoIndex, OldMatchNode->AminoIndex, sizeof(int) * MAX_PT_MODS) &&
+ !memcmp(MatchNode->ModType, OldMatchNode->ModType, sizeof(int) * MAX_PT_MODS))
+ {
+ // Free OldMatchNode, it's a duplicate!
+ if (OldMatchNode->Prev)
+ {
+ OldMatchNode->Prev->Next = OldMatchNode->Next;
+ }
+ if (OldMatchNode->Next)
+ {
+ OldMatchNode->Next->Prev = OldMatchNode->Prev;
+ }
+ if (LastMatch == OldMatchNode)
+ {
+ LastMatch = OldMatchNode->Prev;
+ }
+ FreePeptideNode(OldMatchNode);
+ OldMatchNode = MatchNode->Next;
+ if (!OldMatchNode)
+ {
+ break;
+ }
+ }
+ }
+ }
+ } // tweak loop
+ Node->FirstMatch = FirstMatch;
+ Node->LastMatch = LastMatch;
+ Node->MatchCount = MatchCount;
+ vprintf("[V] Complete.\n");
+}
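The tweak loop above interleaves an ordinary merge of two score-sorted linked lists (the matches kept so far and the matches from the current parent-mass tweak), truncated at StoreMatchCount and then scanned for duplicates. The merge step restated over plain Python lists, as a sketch in which each match is reduced to a (Score, Annotation) tuple and both lists are assumed sorted best-score-first:

    def MergeMatchLists(OldMatches, NewMatches, StoreMatchCount):
        # Pop from whichever list has the better head, exactly as the
        # linked-list merge does; ties go to the newer match.
        Merged = []
        while (OldMatches or NewMatches) and len(Merged) < StoreMatchCount:
            if not NewMatches or (OldMatches and NewMatches[0][0] < OldMatches[0][0]):
                Merged.append(OldMatches.pop(0))
            else:
                Merged.append(NewMatches.pop(0))
        return Merged

    # MergeMatchLists([(3.1, "OLD")], [(4.0, "NEW"), (2.0, "NEW2")], 2)
    #   -> [(4.0, 'NEW'), (3.1, 'OLD')]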
+
+TrieNode* ConstructTagsForSpectrum(TrieNode* Root, SpectrumNode* Node, int TagCount)
+{
+ int TweakIndex;
+ MSSpectrum* Spectrum;
+ //
+ Spectrum = Node->Spectrum;
+ for (TweakIndex = 0; TweakIndex < TWEAK_COUNT; TweakIndex++)
+ {
+ if (!Node->Tweaks[TweakIndex].Charge)
+ {
+ continue;
+ }
+ //printf("Constructing tags for %d tweak %d\n",Node->ScanNumber, TweakIndex);
+ Spectrum->Charge = Node->Tweaks[TweakIndex].Charge;
+ Spectrum->ParentMass = Node->Tweaks[TweakIndex].ParentMass;
+ //SpectrumAssignIsotopeNeighbors(Spectrum);
+ //SpectrumFindIsotopicPeaks(Spectrum);
+ //sam Temp Insert
+ AttemptParentMassPeakRemoval( Spectrum);
+ //printf("PeakRemoved: %d\n",Spectrum->RemovedPeakIndex);
+ Root = GenerateTagsFromSpectrum(Spectrum, Root, TagCount, Node->Tweaks + TweakIndex);
+ //Sam Temp Insert
+ RestoreParentMassPeakRemoval(Spectrum);
+
+ }
+
+
+ return Root;
+}
+
+void OutputTagsToFile(FILE* OutputFile, char* SpectrumFileName, int ScanNumber, int SpectrumFilePos, TrieTag* TagArray, int TagCount)
+{
+ int TagIndex;
+ //TrieTagHanger* Tag;
+ TrieTag* Tag;
+ TagCount = min(TagCount, GlobalOptions->GenerateTagCount);
+ for (TagIndex = 0; TagIndex < TagCount; TagIndex++)
+ {
+ Tag = TagArray + TagIndex;
+ fprintf(OutputFile, "%s\t", SpectrumFileName);
+ fprintf(OutputFile, "%d\t", ScanNumber);
+ fprintf(OutputFile, "%d\t", SpectrumFilePos);
+ fprintf(OutputFile, "%d\t", TagIndex);
+ fprintf(OutputFile, "%.2f\t", Tag->PrefixMass / (float)MASS_SCALE);
+ fprintf(OutputFile, "%s\t", Tag->Tag);
+ fprintf(OutputFile, "%.2f\t", Tag->SuffixMass / (float)MASS_SCALE);
+ fprintf(OutputFile, "%.2f\t", Tag->Score);
+ fprintf(OutputFile, "\n");
+ }
+}
+
+int MergeIdenticalTags(TrieTag* TagArray, int TagCount)
+{
+ int TagIndexA;
+ int TagIndexB;
+ TrieTag* TagA;
+ TrieTag* TagB;
+ int Diff;
+ //
+ for (TagIndexA = 0; TagIndexA < TagCount; TagIndexA++)
+ {
+ TagA = TagArray + TagIndexA;
+ TagIndexB = TagIndexA + 1;
+ while (TagIndexB < TagCount)
+ {
+ TagB = TagArray + TagIndexB;
+ if (strcmp(TagA->Tag, TagB->Tag))
+ {
+ TagIndexB++;
+ continue;
+ }
+ Diff = abs(TagA->PrefixMass - TagB->PrefixMass);
+ if (Diff > GlobalOptions->Epsilon)
+ {
+ TagIndexB++;
+ continue;
+ }
+ Diff = abs(TagA->SuffixMass - TagB->SuffixMass);
+ if (Diff > GlobalOptions->Epsilon)
+ {
+ TagIndexB++;
+ continue;
+ }
+ // These tags are essentially identical! Remove B.
+ memmove(TagArray + TagIndexB, TagArray + TagIndexB + 1, sizeof(TrieTag) * (TagCount - TagIndexB - 1));
+ TagCount--;
+ // TagIndexB is unchanged.
+ }
+ }
+ return TagCount;
+}
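MergeIdenticalTags collapses tags that share the same sequence and whose prefix and suffix masses each agree within GlobalOptions->Epsilon, keeping the earlier (better-scoring, since the array is already sorted) copy. The same rule over (PrefixMass, Tag, SuffixMass, Score) tuples, as a sketch with masses in the scaled integer units used throughout:

    def MergeIdenticalTags(Tags, Epsilon):
        Merged = []
        for (PrefixMass, Tag, SuffixMass, Score) in Tags:
            Duplicate = False
            for (KeptPrefix, KeptTag, KeptSuffix, KeptScore) in Merged:
                if (Tag == KeptTag
                    and abs(PrefixMass - KeptPrefix) <= Epsilon
                    and abs(SuffixMass - KeptSuffix) <= Epsilon):
                    Duplicate = True
                    break
            if not Duplicate:
                Merged.append((PrefixMass, Tag, SuffixMass, Score))
        return Merged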
+
+static TrieTag* _TagsOnlyTagList = NULL;
+
+// Perform ONLY tag generation...and output the resulting tags.
+void PerformTagGeneration(void)
+{
+ SpectrumNode* SNode;
+ FILE* SpectrumFile;
+ int Result;
+ int TagCount = GlobalOptions->GenerateTagCount;
+ int TweakIndex;
+ int TotalTagCount;
+ int TagIndex;
+ int TagsGenerated;
+ SpectrumTweak* Tweak;
+ TrieTag* Tags;
+ //
+ // Write a HEADER to the output file:
+ fprintf(GlobalOptions->OutputFile, "#File\tScan\tFilePos\tTagIndex\tPrefix\tTag\tSuffix\tTagscore\t\n");
+ if (!_TagsOnlyTagList)
+ {
+ _TagsOnlyTagList = (TrieTag*)calloc(TWEAK_COUNT * TagCount + 1, sizeof(TrieTag));
+ }
+
+
+ BuildDecorations();
+ for (SNode = GlobalOptions->FirstSpectrum; SNode; SNode = SNode->Next)
+ {
+ SpectrumFile = fopen(SNode->InputFile->FileName, "rb");
+ if (SpectrumFile)
+ {
+ SNode->Spectrum = (MSSpectrum*)calloc(1, sizeof(MSSpectrum));
+ SNode->Spectrum->Node = SNode;
+ fseek(SpectrumFile, SNode->FilePosition, 0);
+ Result = SpectrumLoadFromFile(SNode->Spectrum, SpectrumFile);
+ fclose(SpectrumFile);
+ if (!Result)
+ {
+ SafeFree(SNode->Spectrum);
+ SNode->Spectrum = NULL;
+ continue;
+ }
+ else
+ {
+ WindowFilterPeaks(SNode->Spectrum, 0, 0);
+ IntensityRankPeaks(SNode->Spectrum);
+ }
+ if (!SNode->PMCFlag)
+ {
+ TweakSpectrum(SNode);
+ }
+ TotalTagCount = 0;
+ for (TweakIndex = 0; TweakIndex < TWEAK_COUNT; TweakIndex++)
+ {
+ if (!SNode->Tweaks[TweakIndex].Charge)
+ {
+ continue;
+ }
+ Tweak = SNode->Tweaks + TweakIndex;
+ SNode->Spectrum->Charge = Tweak->Charge;
+ SNode->Spectrum->ParentMass = Tweak->ParentMass;
+ //SpectrumAssignIsotopeNeighbors(SNode->Spectrum);
+ //SpectrumFindIsotopicPeaks(SNode->Spectrum);
+ SNode->Spectrum->Graph = ConstructTagGraph(SNode->Spectrum);
+ TagGraphAddNodes(SNode->Spectrum->Graph, SNode->Spectrum);
+ TagGraphScorePRMNodes(NULL, SNode->Spectrum->Graph, SNode->Spectrum, Tweak);
+ TagGraphPopulateEdges(SNode->Spectrum->Graph);
+ Tags = TagGraphGenerateTags(SNode->Spectrum->Graph, SNode->Spectrum, &TagsGenerated, TagCount, Tweak, TAG_EDGE_SCORE_MULTIPLIER, NULL);
+
+ for (TagIndex = 0; TagIndex < min(TagCount, TagsGenerated); TagIndex++)
+ {
+ memcpy(_TagsOnlyTagList + TotalTagCount, Tags + TagIndex, sizeof(TrieTag));
+ TotalTagCount++;
+ }
+ FreeTagGraph(SNode->Spectrum->Graph);
+ SNode->Spectrum->Graph = NULL;
+ } // Tweak list
+ qsort(_TagsOnlyTagList, TotalTagCount, sizeof(TrieTag), (QSortCompare)CompareTagScores);
+ TotalTagCount = MergeIdenticalTags(_TagsOnlyTagList, TotalTagCount);
+ OutputTagsToFile(GlobalOptions->OutputFile, SNode->InputFile->FileName, SNode->ScanNumber, SNode->FilePosition, _TagsOnlyTagList, TotalTagCount);
+ // Clean up the spectrum:
+ FreeSpectrum(SNode->Spectrum);
+ SNode->Spectrum = NULL;
+ }
+ }
+ SafeFree(_TagsOnlyTagList);
+ _TagsOnlyTagList = NULL;
+}
+
+#define TEMP_TAGGING_INPUT "TempTagging.dta"
+#define TEMP_TAGGING_OUTPUT "TempTags.txt"
+
+// Call upon PepNovo to generate some tags for us.
+// To do so, we need to write out a temporary .dta file!
+TrieNode* ConstructTagsExternalTagger(TrieNode* Root, SpectrumNode* Node, int TagCount)
+{
+ FILE* TempDTAFile;
+ FILE* TempTagOutputFile;
+ int TweakIndex;
+ int PeakIndex;
+ SpectralPeak* Peak;
+ char CommandLine[2048];
+ char LineBuffer[MAX_LINE_LENGTH];
+ int BytesToRead;
+ int BufferPos = 0;
+ int BytesRead;
+ int BufferEnd = 0;
+ int LineNumber = 0;
+ int PrevLineFilePos = 0;
+ int LineFilePos = 0;
+ char TextBuffer[BUFFER_SIZE * 2];
+ char* BitA;
+ char* BitB;
+ char* BitC;
+ int WithinTagsFlag = 0;
+ float PrefixMass;
+ float Probability;
+ int DuplicateFlag;
+ TrieTag* NewTag;
+ char* TempAA;
+ char AnnotationBuffer[256];
+ char ModBuffer[256];
+ int TagIndex = 0;
+ int AminoIndex;
+ MassDelta* Delta;
+ int ModBufferPos;
+ int ModIndex;
+ int TotalTagCount = 0;
+ //
+ if (!Root)
+ {
+ Root = NewTrieNode();
+ Root->FailureNode = Root;
+ }
+ // Initialization for tags-only:
+ if (GlobalOptions->RunMode & RUN_MODE_TAGS_ONLY)
+ {
+ if (!_TagsOnlyTagList)
+ {
+ _TagsOnlyTagList = (TrieTag*)calloc(TWEAK_COUNT * TagCount + 1, sizeof(TrieTag));
+ }
+ }
+
+ for (TweakIndex = 0; TweakIndex < TWEAK_COUNT; TweakIndex++)
+ {
+ // Skip this mass-tweak, if it's not a valid charge/mass combo
+ if (!Node->Tweaks[TweakIndex].Charge)
+ {
+ continue;
+ }
+ unlink(TEMP_TAGGING_INPUT);
+ TempDTAFile = fopen(TEMP_TAGGING_INPUT, "wb");
+ if (!TempDTAFile)
+ {
+ printf("** Error opening tag input file %s for writing!", TEMP_TAGGING_INPUT);
+ return Root;
+ }
+ fprintf(TempDTAFile, "%.3f %d\n", Node->Tweaks[TweakIndex].ParentMass / (float)MASS_SCALE, Node->Tweaks[TweakIndex].Charge);
+ for (PeakIndex = 0; PeakIndex < Node->Spectrum->PeakCount; PeakIndex++)
+ {
+ Peak = Node->Spectrum->Peaks + PeakIndex;
+ fprintf(TempDTAFile, "%.3f %.3f\n", Peak->Mass / (float)MASS_SCALE, Peak->Intensity);
+ }
+ fclose(TempDTAFile);
+ // Call out to pepnovo:
+ unlink(TEMP_TAGGING_OUTPUT);
+ sprintf(CommandLine, "pepnovo.exe -dta %s -model tryp_model.txt -num_tags %d > %s", TEMP_TAGGING_INPUT, TagCount, TEMP_TAGGING_OUTPUT);
+ system(CommandLine);
+ TempTagOutputFile = fopen(TEMP_TAGGING_OUTPUT, "rb");
+ if (!TempTagOutputFile)
+ {
+ printf("** Error: Unable to open tag output file '%s'\n", TEMP_TAGGING_OUTPUT);
+ return Root;
+ }
+ WithinTagsFlag = 0;
+ while (1)
+ {
+ BytesToRead = BUFFER_SIZE - BufferEnd;
+ BytesRead = ReadBinary(TextBuffer + BufferEnd, sizeof(char), BytesToRead, TempTagOutputFile);
+ BufferEnd += BytesRead;
+ TextBuffer[BufferEnd] = '\0';
+ if (BufferPos == BufferEnd)
+ {
+ // We're done!
+ break;
+ }
+ // Copy a line of text to the line buffer. Skip spaces, and stop at carriage return or newline.
+
+ BufferPos = CopyBufferLine(TextBuffer, BufferPos, BufferEnd, LineBuffer, 0);
+ LineNumber += 1;
+ PrevLineFilePos = LineFilePos;
+ LineFilePos += BufferPos;
+ //printf("Line %d starts at %d\n", LineNumber, LineFilePos);
+ // Now, move the remaining text to the start of the buffer:
+ memmove(TextBuffer, TextBuffer + BufferPos, BufferEnd - BufferPos);
+ BufferEnd -= BufferPos;
+ BufferPos = 0;
+ // Now, process this line of text!
+ if (!LineBuffer[0])
+ {
+ continue;
+ }
+ BitA = strtok(LineBuffer, "\t\r\n");
+ BitB = strtok(NULL, "\t\r\n");
+ if (!BitB)
+ {
+ continue;
+ }
+ BitC = strtok(NULL, "\t\r\n");
+ if (!BitC)
+ {
+ continue;
+ }
+ if (!strcmp(BitC, "Probability:") && !strcmp(BitB, "Tag"))
+ {
+ WithinTagsFlag = 1;
+ }
+ if (WithinTagsFlag)
+ {
+ PrefixMass = (float)atof(BitA);
+ Probability = (float)atof(BitC);
+ if (Probability < (float)0.1)
+ {
+ continue;
+ }
+ NewTag = _TagsOnlyTagList + TotalTagCount;
+ memset(NewTag, 0, sizeof(TrieTag));
+ // Special code:
+ // PepNovo may include MODIFICATIONS in its tags - so, we must parse them.
+ // We assume that (a) modifications are written in the form %+d, and (b) we
+ // know of the modification type from the inspect input file.
+ TempAA = BitB;
+ AminoIndex = 0;
+ ModBufferPos = 0;
+
+ while (*TempAA)
+ {
+ if (*TempAA >= 'A' && *TempAA <= 'Z')
+ {
+ // an amino acid - so, finish the modification-in-progress, if there is one.
+ if (ModBufferPos && AminoIndex)
+ {
+ if (NewTag->ModsUsed == MAX_PT_MODS)
+ {
+ printf("** Error tagging scan %d from file %s: Too many PTMs!\n", Node->ScanNumber, Node->InputFile->FileName);
+ break;
+ }
+ ModBuffer[ModBufferPos] = '\0';
+ Delta = FindPTModByName(NewTag->Tag[AminoIndex - 1], ModBuffer);
+ if (Delta)
+ {
+ NewTag->AminoIndex[NewTag->ModsUsed] = AminoIndex - 1;
+ NewTag->ModType[NewTag->ModsUsed] = Delta;
+ NewTag->ModsUsed++;
+ }
+ else
+ {
+ printf("** Error tagging scan %d from file %s: Modification %s not understood!\n", Node->ScanNumber, Node->InputFile->FileName, ModBuffer);
+ }
+ }
+ ModBufferPos = 0;
+ // Add the AA:
+ NewTag->Tag[AminoIndex++] = *TempAA;
+ }// aa
+ else
+ {
+ ModBuffer[ModBufferPos++] = *TempAA;
+ } // not aa
+ TempAA++;
+ }
+ NewTag->Tag[AminoIndex] = '\0';
+ // Finish any pending modification (same handling as inside the loop above)
+ if (ModBufferPos && AminoIndex)
+ {
+ if (NewTag->ModsUsed == MAX_PT_MODS)
+ {
+ printf("** Error tagging scan %d from file %s: Too many PTMs!\n", Node->ScanNumber, Node->InputFile->FileName);
+ break;
+ }
+ ModBuffer[ModBufferPos] = '\0';
+ Delta = FindPTModByName(NewTag->Tag[AminoIndex - 1], ModBuffer);
+ if (Delta)
+ {
+ NewTag->AminoIndex[NewTag->ModsUsed] = AminoIndex - 1;
+ NewTag->ModType[NewTag->ModsUsed] = Delta;
+ NewTag->ModsUsed++;
+ }
+ else
+ {
+ printf("** Error tagging scan %d from file %s: Modification %s not understood!\n", Node->ScanNumber, Node->InputFile->FileName, ModBuffer);
+ }
+ }
+
+ //strncpy(NewTag->Tag, BitB, MAX_TAG_LENGTH);
+ NewTag->Charge = Node->Tweaks[TweakIndex].Charge;
+ NewTag->ParentMass = Node->Tweaks[TweakIndex].ParentMass;
+ NewTag->PSpectrum = Node->Spectrum;
+ NewTag->Tweak = Node->Tweaks;
+ NewTag->PrefixMass = (int)(PrefixMass * MASS_SCALE + 0.5);
+ NewTag->SuffixMass = Node->Spectrum->ParentMass - NewTag->PrefixMass;
+ NewTag->Score = Probability;
+ for (TempAA = NewTag->Tag; *TempAA; TempAA++)
+ {
+ NewTag->SuffixMass -= PeptideMass[(*TempAA)];
+ }
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (NewTag->AminoIndex[ModIndex] >= 0 && NewTag->ModType[ModIndex])
+ {
+ NewTag->SuffixMass -= NewTag->ModType[ModIndex]->RealDelta;
+ }
+ }
+ TotalTagCount++;
+ } // Handle a line AFTER the tag header
+ } // Loop over file lines
+ fclose(TempTagOutputFile);
+ } // Loop over tweaks
+ qsort(_TagsOnlyTagList, TotalTagCount, sizeof(TrieTag), (QSortCompare)CompareTagScores);
+ TotalTagCount = MergeIdenticalTags(_TagsOnlyTagList, TotalTagCount);
+ TotalTagCount = min(TotalTagCount, GlobalOptions->GenerateTagCount);
+ for (TagIndex = 0; TagIndex < TotalTagCount; TagIndex++)
+ {
+ NewTag = _TagsOnlyTagList + TagIndex;
+ if (GlobalOptions->RunMode & RUN_MODE_TAGS_ONLY)
+ {
+ fprintf(GlobalOptions->OutputFile, "%s\t", Node->InputFile->FileName);
+ fprintf(GlobalOptions->OutputFile, "%d\t", Node->ScanNumber);
+ fprintf(GlobalOptions->OutputFile, "%d\t", Node->FilePosition);
+ fprintf(GlobalOptions->OutputFile, "%d\t", TagIndex);
+ fprintf(GlobalOptions->OutputFile, "%.2f\t", NewTag->PrefixMass / (float)MASS_SCALE);
+ WriteTagToString(NewTag, AnnotationBuffer, 1);
+ fprintf(GlobalOptions->OutputFile, "%s\t", AnnotationBuffer);
+ fprintf(GlobalOptions->OutputFile, "%.2f\t", NewTag->SuffixMass / (float)MASS_SCALE);
+ fprintf(GlobalOptions->OutputFile, "%.2f\t", NewTag->Score);
+ fprintf(GlobalOptions->OutputFile, "\n");
+ }
+ else
+ {
+ Root = AddTagToTrie(Root, NewTag, &DuplicateFlag);
+ }
+ }
+ return Root;
+}
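PepNovo may embed modification masses directly in a tag's sequence (for example a phosphoserine written as S+80), so the loop above peels non-letter characters off as modification text attached to the preceding residue. The same parsing reduced to a short sketch (the tag text and mass offset are hypothetical examples):

    def ParseTagWithMods(TagText):
        # Returns the bare residue string plus (ResidueIndex, ModText) pairs.
        Residues = []
        Mods = []
        ModText = ""
        for Char in TagText:
            if "A" <= Char <= "Z":
                if ModText and Residues:
                    Mods.append((len(Residues) - 1, ModText))
                ModText = ""
                Residues.append(Char)
            else:
                ModText += Char
        if ModText and Residues:
            Mods.append((len(Residues) - 1, ModText))
        return "".join(Residues), Mods

    # ParseTagWithMods("S+80GT") -> ("SGT", [(0, "+80")])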
+
+int SearchSpectrumBlockMSAlignment(SearchInfo* Info, SpectrumNode* FirstBlockSpectrum,
+ SpectrumNode* LastBlockSpectrum, DatabaseFile* DB)
+{
+ char MatchedPeptideVerbose[256];
+ SpectrumNode* BlockSpectrum;
+ int SpectraSearched = 0;
+ int TweakIndex;
+ Peptide* Match;
+ MSSpectrum*Spectrum;
+ //
+ for (BlockSpectrum = FirstBlockSpectrum; BlockSpectrum != LastBlockSpectrum; BlockSpectrum = BlockSpectrum->Next)
+ {
+ if (!BlockSpectrum->Spectrum)
+ {
+ continue;
+ }
+ WindowFilterPeaks(BlockSpectrum->Spectrum, 0, 0);
+ IntensityRankPeaks(BlockSpectrum->Spectrum);
+ if (!BlockSpectrum->PMCFlag)
+ {
+
+ TweakSpectrum(BlockSpectrum);
+ }
+
+
+ //fflush(stdout);
+
+ if (!BlockSpectrum->Spectrum)
+ {
+ continue;
+ }
+
+ Info->Spectrum = BlockSpectrum->Spectrum;
+ MutationModeSearch(Info);
+ if(!(GlobalOptions->RunMode & RUN_MODE_RAW_OUTPUT))
+ {
+ MQScoreSpectralMatches(BlockSpectrum);
+ }
+ else
+ {
+ Spectrum = BlockSpectrum->Spectrum;
+ Match = BlockSpectrum->FirstMatch;
+ while(Match)
+ {
+
+ WriteMatchToString(Match,MatchedPeptideVerbose,1);
+ fprintf(GlobalOptions->OutputFile,"%s\t%d\t%s\t%d\t%d\t%d\n",Spectrum->Node->InputFile->FileName,Spectrum->Node->ScanNumber,MatchedPeptideVerbose, Match->InitialScore,Match->FilePos, Match->Tweak->ParentMass);
+ Match = Match->Next;
+ }
+ }
+ //OutputMatchesForSpectrum(BlockSpectrum, GlobalOptions->OutputFile);
+ //FreeMatchList(BlockSpectrum);
+ // Free PRM scores:
+ for (TweakIndex = 0; TweakIndex < TWEAK_COUNT; TweakIndex++)
+ {
+ if (BlockSpectrum->Tweaks[TweakIndex].PRMScores)
+ {
+ SafeFree(BlockSpectrum->Tweaks[TweakIndex].PRMScores);
+ BlockSpectrum->Tweaks[TweakIndex].PRMScores = NULL;
+ }
+ }
+ if (BlockSpectrum->Spectrum->Graph)
+ {
+ FreeTagGraph(BlockSpectrum->Spectrum->Graph);
+ BlockSpectrum->Spectrum->Graph = NULL;
+ }
+ SpectraSearched++;
+ }
+ return SpectraSearched;
+}
+
+int SearchSpectrumBlockTrie(SearchInfo* Info, SpectrumNode* FirstBlockSpectrum, SpectrumNode* LastBlockSpectrum, DatabaseFile* DB)
+{
+ int TagCount;
+ SpectrumNode* BlockSpectrum;
+ char TagBuffer[256];
+ int SpectraSearched = 0;
+ //
+ // Construct tags for these spectra, and scan with trie:
+
+
+ TagCount = GlobalOptions->GenerateTagCount;
+ for (BlockSpectrum = FirstBlockSpectrum; BlockSpectrum != LastBlockSpectrum; BlockSpectrum = BlockSpectrum->Next)
+ {
+ if (!BlockSpectrum->Spectrum)
+ {
+ continue;
+ }
+ SpectraSearched++;
+
+ //ARI_MOD - move tag generation below
+ // We perform peak filtering AFTER calling the external tagger.
+ //if (GlobalOptions->ExternalTagger)
+ //{
+ // Info->Root = ConstructTagsExternalTagger(Info->Root, BlockSpectrum, TagCount);
+ //}
+
+
+
+ WindowFilterPeaks(BlockSpectrum->Spectrum, 0, 0);
+
+
+ IntensityRankPeaks(BlockSpectrum->Spectrum);
+
+ if (!GlobalOptions->ExternalTagger && !BlockSpectrum->PMCFlag) //ARI_MOD - no tweaking if using external tags
+ {
+
+ TweakSpectrum(BlockSpectrum);
+
+ }
+
+
+
+ if (!GlobalOptions->ExternalTagger)
+ {
+ Info->Root = ConstructTagsForSpectrum(Info->Root, BlockSpectrum, TagCount);
+ }
+ else //ARI_MOD - get tags from the TagHolder and add them to the trie,
+ //then prepare the spectrum for scoring (the part of TweakSpectrum() that is still needed here)
+ {
+ Info->Root = AddExternalTags(Info->Root,BlockSpectrum);
+ PrepareSpectrumForIonScoring(PRMModelCharge2,BlockSpectrum->Spectrum,0);
+
+ }
+ }
+ // Special case for external tagger, tags only:
+ if (GlobalOptions->ExternalTagger && (GlobalOptions->RunMode & RUN_MODE_TAGS_ONLY))
+ {
+ //
+ }
+ else
+ {
+
+ memset(TagBuffer, 0, sizeof(char)*256);
+ InitializeTrieFailureNodes(Info->Root, Info->Root, TagBuffer);
+ //printf("Scan file with trie...\n");
+ //fflush(stdout);
+ fseek(Info->DB->DBFile, 0, 0);
+ switch (DB->Type)
+ {
+ case evDBTypeMS2DB:
+ SearchMS2DB(Info);
+ break;
+ case evDBTypeSpliceDB:
+ SearchSplicableGenes(Info);
+ break;
+ case evDBTypeTrie:
+ ScanFileWithTrie(Info);
+ break;
+ default:
+ break;
+ }
+ }
+ return SpectraSearched;
+}
+// Return number of spectra searched
+int SearchSpectrumBlockAgainstDB(SpectrumNode* FirstBlockSpectrum, SpectrumNode* LastBlockSpectrum, DatabaseFile* DB)
+{
+ SearchInfo* Info;
+ int SpectraSearched;
+ //
+ Info = (SearchInfo*)calloc(1, sizeof(SearchInfo));
+ Info->DB = DB;
+
+ // The MSAlignment-style search handles both mutation-tolerant ('unrestricted') and blind PTM modes.
+ if (GlobalOptions->RunMode & (RUN_MODE_MUTATION | RUN_MODE_BLIND))
+ {
+
+ SpectraSearched = SearchSpectrumBlockMSAlignment(Info, FirstBlockSpectrum, LastBlockSpectrum, DB);
+ }
+ else
+ {
+
+ SpectraSearched = SearchSpectrumBlockTrie(Info, FirstBlockSpectrum, LastBlockSpectrum, DB);
+ //if (GlobalOptions->RunMode & RUN_MODE_BLINDTAG)
+ //{
+ // DebugPrintBlindTagExtensions(Info);
+ //}
+ }
+
+ FreeTrieNode(Info->Root);
+ Info->Root = NULL;
+ free(Info);
+ return SpectraSearched;
+}
+
+// Debug output: basic information about one-sided tag extension,
+// such as the number of DB hits per tag and the number of one-sided extensions per tag.
+//SpectrumFileName-ScanCount-prefixMass-prefixExtends-Tag-TagHits-SuffixMass-SuffixExtends-score
+void DebugPrintBlindTagExtensions(SearchInfo* Info)
+{
+ TrieNode* Root = Info->Root;
+ TrieNode* L1 = NULL;
+ TrieNode* L2 = NULL;
+ TrieNode* L3 = NULL;
+ TrieTagHanger* Hanger = NULL;
+ TrieTag* Tag = NULL;
+ int LevelOneKids;
+ int LevelTwoKids;
+ int LevelThreeKids;
+ SpectrumNode* SNode = NULL;
+ FILE* OutputFile;
+
+ printf("I GOT TO THE DEBUG\n");
+ OutputFile = fopen("BlindTaggingInfo.txt", "wb");
+ //fprintf(OutputFile, "SpectrumFileName\tScanCount\tPrefixMass\tPrefixExtends\t");
+ //fprintf(OutputFile, "Tag\tTagHits\tSuffixMass\tSuffixExtends\tScore\n");
+ if (!OutputFile)
+ {
+ printf("Unable to open the output file BlindTaggingInfo.txt\n");
+ return;
+ }
+ for (LevelOneKids = 0; LevelOneKids < TRIE_CHILD_COUNT; LevelOneKids++)
+ {
+ //every node here is of depth 1, and has a single letter word
+ if (LevelOneKids == ('I'-'A') || LevelOneKids == ('Q'-'A'))
+ { //don't print out both nodes for I and L, or for Q and K
+ continue;
+ }
+ L1 = Root->Children[LevelOneKids];
+ if (L1 != NULL)
+ {
+ for (LevelTwoKids = 0; LevelTwoKids < TRIE_CHILD_COUNT; LevelTwoKids++)
+ {
+ if (LevelTwoKids == ('I'-'A') || LevelTwoKids == ('Q'-'A'))
+ {
+ continue;
+ }
+ L2 = L1->Children[LevelTwoKids];
+ if(L2 != NULL)
+ {
+ for (LevelThreeKids = 0; LevelThreeKids < TRIE_CHILD_COUNT; LevelThreeKids++)
+ {
+ if (LevelThreeKids == ('I'-'A') || LevelThreeKids == ('Q'-'A'))
+ {
+ continue;
+ }
+ L3 = L2->Children[LevelThreeKids];
+ if (L3 != NULL)
+ {
+ //Level three kids should be a tripeptide, with a hanger and tags
+ //printf("My depth is %d\n",L3->Depth);
+ Hanger = L3->FirstTag;
+ while (Hanger) // != NULL; Go through all the hangers on a tag
+ {
+ Tag = Hanger->Tag;
+ SNode = Tag->PSpectrum->Node;
+ fprintf(OutputFile, "%s\t",SNode->InputFile->FileName);
+ fprintf(OutputFile, "%d\t",SNode->ScanNumber);
+ fprintf(OutputFile, "%.2f\t", Tag->PrefixMass / (float)MASS_SCALE);
+ fprintf(OutputFile, "%d\t",Tag->PrefixExtends);
+ fprintf(OutputFile, "%s\t", Tag->Tag);
+ fprintf(OutputFile, "%d\t", Tag->DBTagMatches);
+ fprintf(OutputFile, "%.2f\t", Tag->SuffixMass / (float)MASS_SCALE);
+ fprintf(OutputFile, "%d\t",Tag->SuffixExtends);
+ fprintf(OutputFile, "%.2f\t", Tag->Score);
+ fprintf(OutputFile, "\n");
+ fflush(OutputFile);
+
+ Hanger = Hanger->Next;
+ }// while
+ }
+ } //Level three kids
+ }
+ }// level 2 kids
+ }
+ }// level one kids
+ fclose(OutputFile);
+}
+
+char* FindExtension(char* FileName)
+{
+ char* ExtensionString;
+ //
+ ExtensionString = FileName + strlen(FileName);
+ while (ExtensionString > FileName)
+ {
+ ExtensionString--;
+ if (*ExtensionString == '.')
+ {
+ return ExtensionString;
+ }
+ }
+ return FileName + strlen(FileName);
+}
+
+// Set DB->IndexFileName, based upon the FileName and database type.
+void FindDatabaseIndexFile(DatabaseFile* DB)
+{
+ char* Extension;
+
+ strcpy(DB->IndexFileName, DB->FileName);
+ Extension = FindExtension(DB->IndexFileName);
+ sprintf(Extension, ".index");
+ DB->IndexFile = fopen(DB->IndexFileName, "rb");
+ if (DB->IndexFile)
+ {
+
+ return;
+ }
+
+ sprintf(Extension, ".ms2index");
+
+
+ DB->IndexFile = fopen(DB->IndexFileName, "rb");
+ if (DB->IndexFile)
+ {
+ return;
+ }
+
+ // No index file; that's ok.
+ return;
+}
+
+// Search all our SpectrumNodes, one block at a time.
+// Once the search is complete, compute p-values and output search results.
+void RunSearch(void)
+{
+ int BlockSize;
+ SpectrumNode* FirstBlockSpectrum;
+ SpectrumNode* LastBlockSpectrum;
+ SpectrumNode* BlockSpectrum;
+ FILE* SpectrumFile;
+ int Result;
+ TrieNode* Root = NULL;
+ int SpectraSearched = 0;
+ int ThisBlockSpectraSearched;
+ DatabaseFile* DB;
+
+ // Find index filenames:
+ for (DB = GlobalOptions->FirstDatabase; DB; DB = DB->Next)
+ {
+ FindDatabaseIndexFile(DB);
+ }
+ // Open database files:
+ for (DB = GlobalOptions->FirstDatabase; DB; DB = DB->Next)
+ {
+ if (!DB->DBFile)
+ {
+ DB->DBFile = fopen(DB->FileName, "rb");
+ }
+ if (!DB->IndexFile)
+ {
+ DB->IndexFile = fopen(DB->IndexFileName, "rb");
+ }
+ //printf("DBFile: %s\n", DB->FileName);
+ //getchar();
+ }
+
+ if (GlobalOptions->RunMode & (RUN_MODE_MUTATION | RUN_MODE_BLIND))
+ {
+ GlobalOptions->TrieBlockSize = 100;
+ GlobalOptions->StoreMatchCount = 100;
+ GlobalOptions->ReportMatchCount = 10; // in production, report at most 10 matches per spectrum, even in blind mode
+ }
+
+ //printf("About to PopulatePTMListWIthMutations...\n");
+ //if (GlobalOptions->RunMode & (RUN_MODE_TAG_MUTATION))
+ // PopulatePTMListWithMutations();
+ //
+
+ //printf("Building decorations...\n");
+ BuildDecorations();
+ //printf("Done building decorations...n");
+
+
+ FirstBlockSpectrum = GlobalOptions->FirstSpectrum;
+ while (FirstBlockSpectrum)
+ {
+ fflush(stdout);
+
+
+ // Load one block of spectrum objects:
+ BlockSize = 0;
+ LastBlockSpectrum = FirstBlockSpectrum;
+ for (BlockSize = 0; BlockSize < GlobalOptions->TrieBlockSize; BlockSize++)
+ {
+ fflush(stdout);
+
+ SpectrumFile = fopen(LastBlockSpectrum->InputFile->FileName, "rb");
+
+ if (SpectrumFile)
+ {
+ LastBlockSpectrum->Spectrum = (MSSpectrum*)calloc(1, sizeof(MSSpectrum));
+ LastBlockSpectrum->Spectrum->PeakAllocation = 0;
+ LastBlockSpectrum->Spectrum->Node = LastBlockSpectrum;
+ fseek(SpectrumFile, LastBlockSpectrum->FilePosition, 0);
+
+ Result = SpectrumLoadFromFile(LastBlockSpectrum->Spectrum, SpectrumFile);
+ //printf("Load from '%s' result %d\n", LastBlockSpectrum->InputFile->FileName, Result);
+ fclose(SpectrumFile);
+ if (!Result)
+ {
+ SafeFree(LastBlockSpectrum->Spectrum);
+ LastBlockSpectrum->Spectrum = NULL;
+ }
+ }
+ LastBlockSpectrum = LastBlockSpectrum->Next;
+ if (!LastBlockSpectrum)
+ {
+ BlockSize++;
+ break;
+ }
+ }
+ printf("Search block of %d spectra starting with %s:%d\n", BlockSize, FirstBlockSpectrum->InputFile->FileName, FirstBlockSpectrum->ScanNumber);
+ fflush(stdout);
+ ThisBlockSpectraSearched = 0;
+ for (DB = GlobalOptions->FirstDatabase; DB; DB = DB->Next)
+ {
+ ThisBlockSpectraSearched = SearchSpectrumBlockAgainstDB(FirstBlockSpectrum, LastBlockSpectrum, DB);
+ }
+ SpectraSearched += ThisBlockSpectraSearched;
+ printf("Search progress: %d / %d (%.2f%%)\n", SpectraSearched, GlobalOptions->SpectrumCount, 100 * SpectraSearched / (float)max(1, GlobalOptions->SpectrumCount));
+ fflush(stdout);
+
+ // Clean up this block, and move to the next:
+ fflush(stdout);
+ for (BlockSpectrum = FirstBlockSpectrum; BlockSpectrum != LastBlockSpectrum; BlockSpectrum = BlockSpectrum->Next)
+ {
+ if (BlockSpectrum->Spectrum)
+ {
+ if(!(GlobalOptions->RunMode & RUN_MODE_RAW_OUTPUT))
+ {
+
+ OutputMatchesForSpectrum(BlockSpectrum, GlobalOptions->OutputFile);
+ }
+ FreeSpectrum(BlockSpectrum->Spectrum);
+ FreeMatchList(BlockSpectrum);
+ BlockSpectrum->Spectrum = NULL;
+ }
+ }
+ fflush(stdout);
+
+ FreeTrieNode(Root);
+ Root = NULL;
+
+ FirstBlockSpectrum = LastBlockSpectrum;
+ fflush(stdout);
+
+ }
+ ///////////////////////////////////////////////////////////
+ // After searching, we compute p-values and output matches:
+
+ if(fclose(GlobalOptions->OutputFile))
+ {
+ REPORT_ERROR_S(50,GlobalOptions->OutputFileName);
+ return;
+
+ }
+
+
+
+ GlobalOptions->OutputFile = NULL;
+
+ /// Compute p-values, write them to final output file:
+ if (GlobalOptions->ExternalTagger && (GlobalOptions->RunMode & RUN_MODE_TAGS_ONLY))
+ {
+ // do nothing
+ }
+ else if(GlobalOptions->RunMode & RUN_MODE_RAW_OUTPUT)
+ {
+ //do nothing
+ }
+ else
+ {
+ CalculatePValues(GlobalOptions->OutputFileName, GlobalOptions->FinalOutputFileName);
+ }
+
+ // Close database files:
+ for (DB = GlobalOptions->FirstDatabase; DB; DB = DB->Next)
+ {
+ if (DB->DBFile)
+ {
+ fclose(DB->DBFile);
+ DB->DBFile = NULL;
+ }
+ if (DB->IndexFile)
+ {
+ fclose(DB->IndexFile);
+ DB->IndexFile = NULL;
+ }
+ }
+}
+
+// Special run mode: Perform parent mass correction on our input spectra. Output the
+// parent masses and charge states.
+void PerformSpectrumTweakage(void)
+{
+ SpectrumNode* Node;
+ FILE* SpectrumFile;
+ int TweakIndex;
+ int Result;
+ //
+ for (Node = GlobalOptions->FirstSpectrum; Node; Node = Node->Next)
+ {
+ SpectrumFile = fopen(Node->InputFile->FileName, "rb");
+ if (SpectrumFile)
+ {
+ fseek(SpectrumFile, Node->FilePosition, 0);
+ Node->Spectrum = (MSSpectrum*)calloc(1, sizeof(MSSpectrum));
+ Node->Spectrum->Node = Node;
+ Result = SpectrumLoadFromFile(Node->Spectrum, SpectrumFile);
+ fclose(SpectrumFile);
+ if (!Result)
+ {
+ FreeSpectrum(Node->Spectrum);
+ Node->Spectrum = NULL;
+ }
+ else
+ {
+ WindowFilterPeaks(Node->Spectrum, 0, 0);
+ IntensityRankPeaks(Node->Spectrum);
+ PrepareSpectrumForIonScoring(PRMModelCharge2, Node->Spectrum, 0);
+ //SpectrumComputeBinnedIntensities(Node);
+ Node->Spectrum->Node = Node;
+ TweakSpectrum(Node);
+
+ fprintf(GlobalOptions->OutputFile, "%s\t", Node->InputFile->FileName);
+ fprintf(GlobalOptions->OutputFile, "%d\t", Node->ScanNumber);
+ fprintf(GlobalOptions->OutputFile, "%d\t", Node->FilePosition);
+ for (TweakIndex = 0; TweakIndex < TWEAK_COUNT; TweakIndex++)
+ {
+ if (Node->Tweaks[TweakIndex].Charge)
+ {
+ fprintf(GlobalOptions->OutputFile, "%.2f\t%d\t", Node->Tweaks[TweakIndex].ParentMass / (float)DALTON, Node->Tweaks[TweakIndex].Charge);
+ }
+ }
+ fprintf(GlobalOptions->OutputFile, "\n");
+ }
+ FreeSpectrum(Node->Spectrum);
+ Node->Spectrum = NULL;
+ }
+
+ }
+}
+
+//For phosphorylated spectra, the super-prominent M-p (phosphate loss) peak can
+//confound charge state guessing and tagging, so we remove it.
+void AttemptParentMassPeakRemoval(MSSpectrum* Spectrum)
+{
+ int MostIntensePeakIndex = 0; //NEC: Added to get rid of possible use when uninitialized warning
+ int MostIntenseMass = 0; //NEC: Added to get rid of possible use when uninitialized warning
+ int PeakIndex;
+ float MostIntense = 0.0;
+ float NextMostIntense = 0.0;
+ int Diff;
+ int ExpectedDiff;
+ int ExpectedDiff2;
+ int Epsilon = HALF_DALTON;
+ int CalculatedMZ;
+ //
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ if (Spectrum->Peaks[PeakIndex].Intensity > MostIntense)
+ {
+ NextMostIntense = MostIntense;
+ MostIntense = Spectrum->Peaks[PeakIndex].Intensity;
+ MostIntensePeakIndex = PeakIndex;
+ MostIntenseMass = Spectrum->Peaks[PeakIndex].Mass;
+ }
+ else if(Spectrum->Peaks[PeakIndex].Intensity > NextMostIntense)
+ {
+ NextMostIntense = Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ }
+ //printf("Most intense %f, next %f\n",MostIntense, NextMostIntense);
+ //If the most intense peak is more than twice the runner-up and in the expected place, remove it.
+ // if (MostIntense < 2 * NextMostIntense)
+ // {
+ //Spectrum->RemovedPeakIndex = -1;//dummy holder
+ // return;
+ // }
+ //printf ("MZ of %d, charge %d\n", Spectrum->MZ, Spectrum->Charge);
+ //Set m/z with the new parentmass and charge that was just assigned in ConstructTags
+ CalculatedMZ = (Spectrum->ParentMass + (Spectrum->Charge - 1) * HYDROGEN_MASS) / Spectrum->Charge;
+ Diff = abs(CalculatedMZ - MostIntenseMass);
+ ExpectedDiff = PHOSPHATE_WATER_MASS / Spectrum->Charge;
+ ExpectedDiff2 = (PHOSPHATE_WATER_MASS + WATER_MASS) / Spectrum->Charge;
+ if (abs (Diff - ExpectedDiff) < Epsilon)
+ { //remove peak
+ Spectrum->RemovedPeakIndex = MostIntensePeakIndex;
+ Spectrum->RemovedPeakIntensity = Spectrum->Peaks[MostIntensePeakIndex].Intensity;
+ Spectrum->Peaks[MostIntensePeakIndex].Intensity = 1.0; //cut to ground
+ }
+ else if (abs(Diff - ExpectedDiff2) < Epsilon)
+ { //remove peak
+ Spectrum->RemovedPeakIndex = MostIntensePeakIndex;
+ Spectrum->RemovedPeakIntensity = Spectrum->Peaks[MostIntensePeakIndex].Intensity;
+ Spectrum->Peaks[MostIntensePeakIndex].Intensity = 1.0; //cut to ground
+ }
+ else
+ {
+ Spectrum->RemovedPeakIndex = -1;//dummy holder
+ }
+}
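The removal test reconstructs the precursor m/z from the tweaked parent mass and charge, then asks whether the most intense peak lies roughly one neutral-loss interval (phosphate plus water, or with an extra water) below it, within half a Dalton. The same arithmetic in plain Daltons as a sketch, with approximate constants for illustration; ParentMass is taken here as the singly-protonated mass M+H, matching the m/z formula used in OutputMatchesForSpectrum above:

    HYDROGEN_MASS = 1.0079            # approximate values, Daltons
    WATER_MASS = 18.0106
    PHOSPHATE_WATER_MASS = 97.9769    # H3PO4 neutral loss
    HALF_DALTON = 0.5

    def IsParentLossPeak(PeakMZ, ParentMass, Charge):
        # True if PeakMZ sits at an expected phospho neutral-loss position
        # below the precursor m/z.
        PrecursorMZ = (ParentMass + (Charge - 1) * HYDROGEN_MASS) / float(Charge)
        Diff = abs(PrecursorMZ - PeakMZ)
        for Loss in (PHOSPHATE_WATER_MASS, PHOSPHATE_WATER_MASS + WATER_MASS):
            if abs(Diff - Loss / float(Charge)) < HALF_DALTON:
                return True
        return False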
+
+void RestoreParentMassPeakRemoval(MSSpectrum* Spectrum)
+{
+ if (Spectrum->RemovedPeakIndex == -1)
+ {
+ return;
+ }
+ Spectrum->Peaks[Spectrum->RemovedPeakIndex].Intensity = Spectrum->RemovedPeakIntensity;
+}
diff --git a/Run.h b/Run.h
new file mode 100644
index 0000000..7a090eb
--- /dev/null
+++ b/Run.h
@@ -0,0 +1,41 @@
+//Title: Run.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef RUN_H
+#define RUN_H
+
+void RunSearch();
+void PerformSpectrumTweakage();
+void PerformTagGeneration();
+
+#endif // RUN_H
+
diff --git a/RunPySVM.py b/RunPySVM.py
new file mode 100644
index 0000000..6bc8741
--- /dev/null
+++ b/RunPySVM.py
@@ -0,0 +1,67 @@
+#Title: RunPySVM.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+Wrapper for PySVM
+"""
+import os
+import sys
+import traceback
+try:
+ import PySVM
+except:
+ print "(Warning: PySVM not imported - SVM training not available)"
+
+def Predict(FeaturePath, ModelPath, OutputPath):
+ PySVM.LoadModel(ModelPath)
+ InputFile = open(FeaturePath, "rb")
+ OutputFile = open(OutputPath, "wb")
+ for FileLine in InputFile.xreadlines():
+ Bits = FileLine.split()
+ FeatureVector = []
+ for Bit in Bits[1:]:
+ ColonPos = Bit.find(":")
+ if ColonPos == -1:
+ continue
+ FeatureIndex = int(Bit[:ColonPos]) - 1
+ while len(FeatureVector) <= FeatureIndex:
+ FeatureVector.append(0)
+ FeatureVector[FeatureIndex] = float(Bit[ColonPos + 1:])
+ Score = PySVM.Score(FeatureVector)
+ OutputFile.write("%s\n"%Score)
+ InputFile.close()
+ OutputFile.close()
+
+
+if __name__ == "__main__":
+ Predict("TestFeatures.SVMScaled.txt", "SVM.model", "SVMPrediction.pytxt")
diff --git a/SNP.c b/SNP.c
new file mode 100644
index 0000000..d4bfc8f
--- /dev/null
+++ b/SNP.c
@@ -0,0 +1,244 @@
+//Title: SNP.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <string.h>
+#include "Trie.h"
+#include "Utils.h"
+#include "Run.h"
+#include "Tagger.h"
+#include "Score.h"
+#include "FreeMod.h"
+#include "Spliced.h"
+#include "BN.h"
+#include "SVM.h"
+#include "Scorpion.h"
+#include "ChargeState.h"
+#include "SNP.h"
+
+// Code to support the inclusion of POLYMORPHISMS, particularly SNPs, in a protein database.
+// The motivation: If a protein has several polymorphic sites, we can include all of them in
+// a string-table proteomic database only by including multiple isoforms. Since we're already
+// using a DAG data-structure to capture alternative splicing, let's include SNPs in the DAG,
+// and capture polymorphic variability as well as splicing variability.
+
+// We use PolyNodes during database construction (not needed during search) to keep track of all
+// polymorphisms. The polynodes are read from a binary file (currently written out by
+// ParseSNPDatabase.py while parsing snp.txt from the ucsc genome browser). They're ordered
+// by genomic position.
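+// Each record in that binary file, as read by ParsePolyNodes below, appears to be (a sketch
+// inferred from the parsing code, not a formal spec):
+//   int GenomicPosition
+//   char Type (0, 1, or 2)
+//   char Alleles[Type + 2] (two, three, or four allele characters)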
+
+PolyNode* g_FirstPolyNode = NULL;
+PolyNode* g_LastPolyNode = NULL;
+
+Polymorphism* g_Polymorphisms = NULL; // array
+int g_PolymorphismCount;
+
+// Search for the FIRST polymorphism that lies within the given interval.
+// Return its index. Return -1 if there's no polymorphism in that interval.
+// Simple binary search.
+int FindPolyInInterval(int Start, int End)
+{
+ int Low;
+ int High;
+ int Mid;
+ int Pos;
+ //
+ if (!g_PolymorphismCount)
+ {
+ return -1;
+ }
+ Low = 0;
+ High = g_PolymorphismCount - 1;
+ while (1)
+ {
+ // If we're down to a minimally-sized poly-interval, check it and return:
+ if (Low + 1 >= High)
+ {
+ Pos = g_Polymorphisms[Low].Pos;
+ if (Pos >= Start && Pos < End)
+ {
+ return Low;
+ }
+ Pos = g_Polymorphisms[High].Pos;
+ if (Pos >= Start && Pos < End)
+ {
+ return High;
+ }
+ return -1;
+ }
+
+ Mid = (Low + High) / 2;
+ Pos = g_Polymorphisms[Mid].Pos;
+ if (Pos < Start)
+ {
+ Low = Mid;
+ continue;
+ }
+ if (Pos >= End)
+ {
+ High = Mid;
+ continue;
+ }
+ // We found one! Make sure we have the FIRST one.
+ for (Low = Mid; Low >= 0; Low--)
+ {
+ if (g_Polymorphisms[Low].Pos < Start)
+ {
+ return (Low + 1);
+ }
+ }
+ return 0;
+ }
+}
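+
+// Example: with polymorphisms at positions {100, 250, 900}, FindPolyInInterval(200, 1000)
+// returns 1 (the record at position 250), and FindPolyInInterval(300, 800) returns -1.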
+
+// Free the full linked list of poly nodes.
+void FreePolyNodes()
+{
+ PolyNode* Node;
+ PolyNode* Prev = NULL;
+ //
+ if (!g_FirstPolyNode)
+ {
+ return;
+ }
+ for (Node = g_FirstPolyNode; Node; Node = Node->Next)
+ {
+ SafeFree(Prev);
+ Prev = Node;
+ }
+ SafeFree(Prev);
+ g_FirstPolyNode = NULL;
+ g_LastPolyNode = NULL;
+}
+
+// Parse polymorphism nodes for the current chromosome.
+void ParsePolyNodes(char* FileName)
+{
+ FILE* File;
+ PolyNode* Node;
+ int BytesRead;
+ int GenomicPosition;
+ int RecordNumber;
+ //
+ File = fopen(FileName, "rb");
+ if (!File)
+ {
+ printf("** Error: Unable to open polymorphism database '%s'\n", FileName);
+ return;
+ }
+ RecordNumber = 0;
+ while (1)
+ {
+ BytesRead = ReadBinary(&GenomicPosition, sizeof(int), 1, File);
+ if (!BytesRead)
+ {
+ break;
+ }
+ Node = (PolyNode*)calloc(sizeof(PolyNode), 1);
+ Node->Pos = GenomicPosition;
+ ReadBinary(&Node->Type, sizeof(char), 1, File);
+ switch (Node->Type)
+ {
+ case 0:
+ ReadBinary(Node->SNP, sizeof(char), 2, File);
+ break;
+ case 1:
+ ReadBinary(Node->SNP, sizeof(char), 3, File);
+ break;
+ case 2:
+ ReadBinary(Node->SNP, sizeof(char), 4, File);
+ break;
+ default:
+ printf("** Error: Unable to parse polymorphism node %d type '%d'\n", RecordNumber, Node->Type);
+ break;
+ }
+ if (g_LastPolyNode)
+ {
+ g_LastPolyNode->Next = Node;
+ // Sanity check: These nodes MUST come in order.
+ if (g_LastPolyNode->Pos >= Node->Pos)
+ {
+ printf("** Error parsing polymorphism data: Record %d is out of order! (Start %d vs %d)\n", RecordNumber, Node->Pos, g_LastPolyNode->Pos);
+ }
+ }
+ else
+ {
+ g_FirstPolyNode = Node;
+ }
+ g_LastPolyNode = Node;
+ RecordNumber++;
+ }
+ fclose(File);
+ ////////////////////////////////////////////////////////////////////////
+ // Now, put all those nodes into an array:
+ g_PolymorphismCount = RecordNumber;
+ g_Polymorphisms = (Polymorphism*)calloc(g_PolymorphismCount, sizeof(Polymorphism));
+ RecordNumber = 0;
+ for (Node = g_FirstPolyNode; Node; Node = Node->Next)
+ {
+ g_Polymorphisms[RecordNumber].Pos = Node->Pos;
+ memcpy(g_Polymorphisms[RecordNumber].SNP, Node->SNP, sizeof(char) * 4);
+ RecordNumber++;
+ }
+ FreePolyNodes();
+}
+
+// For debugging: Print out all the polymorphism nodes.
+void DebugPrintPolyNodes(int FirstRecord, int LastRecord)
+{
+ PolyNode* Node;
+ int RecordNumber;
+ //
+ RecordNumber = 0;
+ for (Node = g_FirstPolyNode; Node; Node = Node->Next, RecordNumber++)
+ {
+ if (FirstRecord >= 0 && RecordNumber < FirstRecord)
+ {
+ continue;
+ }
+ if (LastRecord >= 0 && RecordNumber > LastRecord)
+ {
+ break;
+ }
+ printf("SNP record %d: Pos %d can be %c or %c\n", RecordNumber, Node->Pos, Node->SNP[0], Node->SNP[1]);
+ }
+}
+
+void SNPTestMain()
+{
+ ParsePolyNodes("SNP\\1.snp");
+ DebugPrintPolyNodes(-1, -1);
+}
diff --git a/SNP.h b/SNP.h
new file mode 100644
index 0000000..15e791a
--- /dev/null
+++ b/SNP.h
@@ -0,0 +1,63 @@
+//Title: SNP.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <string.h>
+#include "Trie.h"
+
+typedef struct PolyNode
+{
+ struct PolyNode* Next;
+ int Pos;
+ int Type; // 0, 1, or 2; determines how many allele characters (2, 3, or 4) are read into SNP
+ char SNP[4];
+} PolyNode;
+
+// Polymorphism: For now, assume it's always a SNP.
+// Usually only the first two allele characters of SNP are set; SNP[2] and SNP[3] are null.
+typedef struct Polymorphism
+{
+ int Pos;
+ char SNP[4];
+} Polymorphism;
+
+extern PolyNode* g_FirstPolyNode;
+extern PolyNode* g_LastPolyNode;
+extern int g_PolymorphismCount;
+extern Polymorphism* g_Polymorphisms;
+
+void ParsePolyNodes(char* FileName);
+void FreePolyNodes();
+int FindPolyInInterval(int Start, int End);
diff --git a/SVM.c b/SVM.c
new file mode 100644
index 0000000..e418b5c
--- /dev/null
+++ b/SVM.c
@@ -0,0 +1,644 @@
+//Title: SVM.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+// SVM support functions.
+// We employ SVMs to distinguish between (1) true and false peptide classifications,
+// and (2) true and false mutation assignments.
+// We also can use SVMs for charge state determination (+2 versus +3, currently; +1 is easy and for +4 and beyond we
+// have no data) and parent mass correction.
+#include "CMemLeak.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+//#include <malloc.h>
+#include "SVM.h"
+#include "Utils.h"
+#include "Inspect.h"
+#include "Spectrum.h"
+#include "Trie.h"
+#include "Scorpion.h"
+#include "BN.h"
+#include "Score.h"
+#include "Errors.h"
+#include "IonScoring.h"
+
+// Forward declarations:
+float LDAClassify(float* Features);
+
+// Global variables:
+extern PRMBayesianModel* PRMModelCharge2;
+
+SVMModel* PValueSVM = NULL;
+float g_SVMToPValueMin;
+int g_PValueBinCount;
+float* g_SVMToPValue;
+
+SVMModel* CCModel1SVM = NULL;
+SVMModel* CCModel2SVM = NULL;
+
+SVMModel* MQModel2SVM = NULL;
+SVMModel* MQModel3SVM = NULL;
+
+
+//SVMModel* PValueSVM = NULL;
+extern float g_CutScores[];
+extern float g_BAbsSkew[];
+extern float g_YAbsSkew[];
+extern float g_BSkew[];
+extern float g_YSkew[];
+extern float g_BIntensity[];
+extern float g_YIntensity[];
+
+float GetPValue(float MQScore)
+{
+ int Bin;
+ //
+ Bin = (int)((MQScore - g_SVMToPValueMin)*10 + 0.5);
+ Bin = max(Bin, 0);
+ Bin = min(Bin, g_PValueBinCount - 1);
+ return g_SVMToPValue[Bin];
+}
+
+// Given a model and an array of feature-values, perform SVM classification.
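+// The value computed is the standard RBF-kernel decision function,
+//   f(x) = sum_i w_i * exp(-Gamma * ||x - sv_i||^2) - Rho,
+// where each support vector sv_i carries a signed weight w_i and x is the (rescaled,
+// unless PreScaled is set) feature vector.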
+float SVMClassify(SVMModel* Model, float* Coords, int PreScaled)
+{
+ SupportVector* Vector;
+ double Total = 0;
+ double InnerProduct;
+ int CoordIndex;
+ double Diff;
+ double ScaledCoords[64];
+ //
+
+ if (PreScaled)
+ {
+ for (CoordIndex = 0; CoordIndex < Model->Coords; CoordIndex++)
+ {
+ ScaledCoords[CoordIndex] = Coords[CoordIndex];
+ }
+ }
+ else
+ {
+ // Scale coordinates to the range [-1, 1] based upon the extrema in the model:
+ for (CoordIndex = 0; CoordIndex < Model->Coords; CoordIndex++)
+ {
+ ScaledCoords[CoordIndex] = (Coords[CoordIndex] - Model->ScaleMin[CoordIndex]) / Model->ScaleSize[CoordIndex] - 1.0;
+ ScaledCoords[CoordIndex] = min(1, max(-1, ScaledCoords[CoordIndex]));
+ }
+ }
+
+ // Compute the SVM value by taking weighted inner products with the support vectors ('border points')
+ for (Vector = Model->FirstVector; Vector; Vector = Vector->Next)
+ {
+ InnerProduct = 0;
+ for (CoordIndex = 0; CoordIndex < Model->Coords; CoordIndex++)
+ {
+ Diff = (ScaledCoords[CoordIndex] - Vector->Coords[CoordIndex]);
+ InnerProduct += Diff * Diff;
+ }
+ InnerProduct = exp(-Model->Gamma * InnerProduct);
+ Total += Vector->Weight * InnerProduct;
+ }
+ Total -= Model->Rho;
+ return (float)Total;
+}
+
+// Free an SVMModel instance, including its list of vectors.
+void FreeSVMModel(SVMModel* Model)
+{
+ SupportVector* Vector;
+ SupportVector* Prev = NULL;
+ //printf("Free SVM model.\n");
+ if (Model)
+ {
+ for (Vector = Model->FirstVector; Vector; Vector = Vector->Next)
+ {
+ SafeFree(Prev);
+ Prev = Vector;
+ }
+ SafeFree(Prev);
+ SafeFree(Model);
+ }
+}
+
+// Free all loaded SVM models.
+void FreeSVMModels()
+{
+ FreeSVMModel(PValueSVM);
+ FreeSVMModel(MQModel2SVM);
+ FreeSVMModel(MQModel3SVM);
+}
+
+void InitPValueSVM()
+{
+ char FilePath[2048];
+
+ // NEW models:
+ if (!MQModel2SVM)
+ {
+ sprintf(FilePath, "%s%s.model", GlobalOptions->ResourceDir, "MQScoreSVM2");
+ MQModel2SVM = ReadSVMModel(FilePath);
+ sprintf(FilePath, "%s%s.range", GlobalOptions->ResourceDir, "MQScoreSVM2");
+ ReadSVMScaling(MQModel2SVM, FilePath);
+ }
+
+ if (!MQModel3SVM)
+ {
+ sprintf(FilePath, "%s%s.model", GlobalOptions->ResourceDir, "MQScoreSVM3");
+ MQModel3SVM = ReadSVMModel(FilePath);
+ sprintf(FilePath, "%s%s.range", GlobalOptions->ResourceDir, "MQScoreSVM3");
+ ReadSVMScaling(MQModel3SVM, FilePath);
+ }
+}
+
+float SVMComputeMQScore(MSSpectrum* Spectrum, Peptide* Match, float* MQFeatures)
+{
+ SVMModel* Model;
+ float Score;
+
+ if (Spectrum->Charge < 3)
+ {
+ Model = MQModel2SVM;
+ }
+ else
+ {
+ Model = MQModel3SVM;
+ }
+ if (!Model)
+ {
+ return 0.0;
+ }
+ Score = SVMClassify(Model, MQFeatures, 0);
+ Score = GetPenalizedScore(Spectrum, Match, Score);
+ return Score;
+}
+
+int ReadSVMScalingCallback(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ SVMModel* Model;
+ char* Value;
+ int CoordIndex;
+ float ScaleMin;
+ float ScaleMax;
+ //
+ Model = (SVMModel*)UserData;
+ //CoordIndex = LineNumber - 3;
+ Value = strtok(LineBuffer, " \r\n\t\0");
+ if (!Value)
+ {
+ return 1;
+ }
+ CoordIndex = atoi(Value) - 1;
+ if (CoordIndex < 0)
+ {
+ return 1;
+ }
+ Value = strtok(NULL, " \r\n\t\0");
+ if (!Value)
+ {
+ return 1;
+ }
+ ScaleMin = (float)atof(Value);
+ Value = strtok(NULL, " \r\n\t\0");
+ if (!Value)
+ {
+ return 1;
+ }
+ ScaleMax = (float)atof(Value);
+ if (ScaleMax <= ScaleMin)
+ {
+ REPORT_ERROR(0);
+ }
+ Model->ScaleMin[CoordIndex] = ScaleMin;
+ Model->ScaleMax[CoordIndex] = ScaleMax;
+ Model->ScaleSize[CoordIndex] = (Model->ScaleMax[CoordIndex] - Model->ScaleMin[CoordIndex]) / 2.0;
+ return 1;
+}
+
+// Read feature extrema (for scaling) for an SVM model.
+void ReadSVMScaling(SVMModel* Model, char* ScaleFileName)
+{
+ FILE* File;
+ //
+ File = fopen(ScaleFileName, "r");
+ if (!File)
+ {
+ REPORT_ERROR_S(8, ScaleFileName);
+ return;
+ }
+ ParseFileByLines(File, ReadSVMScalingCallback, Model, 0);
+ fclose(File);
+}
+
+typedef struct SVMParseInfo
+{
+ SVMModel* Model;
+ int InVectors;
+} SVMParseInfo;
+
+
+int ReadSVMModelCallback(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ SVMModel* Model;
+ SVMParseInfo* Info;
+ SupportVector* Vector;
+ char* Command;
+ char* Value;
+ int CoordIndex;
+ char* CoordIndexStr;
+
+ //
+ Info = (SVMParseInfo*)UserData;
+ Model = Info->Model;
+ // Process either a support vector line, or a header command.
+ if (Info->InVectors)
+ {
+ Vector = (SupportVector*)calloc(sizeof(SupportVector), 1);
+ // Weight, then a list of values.
+ Value = strtok(LineBuffer, " \r\n\t");
+ if (!Value)
+ {
+ printf("* Critical error: strtok failed in ReadSVMModel\n");
+ return 0;
+ }
+ Vector->Weight = atof(Value);
+ CoordIndex = 0;
+ while (1)
+ {
+ Value = strtok(NULL, " \r\n\t");
+ if (!Value)
+ {
+ break;
+ }
+ CoordIndexStr = Value;
+ while (*Value != ':' && *Value)
+ {
+ Value++;
+ }
+ *Value = '\0';
+ CoordIndex = atoi(CoordIndexStr) - 1;
+ Value++;
+ if (CoordIndex >= SUPPORT_VECTOR_LENGTH)
+ {
+ printf("* Error: SVM vector too long!\n");
+ break;
+ }
+ Vector->Coords[CoordIndex] = atof(Value);
+ CoordIndex++;
+ }
+ Model->Coords = CoordIndex;
+ if (Model->LastVector)
+ {
+ Model->LastVector->Next = Vector;
+ Model->LastVector = Vector;
+ }
+ else
+ {
+ Model->FirstVector = Vector;
+ Model->LastVector = Vector;
+ }
+ }
+ else
+ {
+ // A header line. We pay attention to parameters "gamma" and "rho". The line "sv" marks the end of
+ // the header, and the start of the support vectors.
+ Command = strtok(LineBuffer, " ");
+ Value = strtok(NULL, " ");
+ // First, handle commands that take no arguments:
+ if (!CompareStrings(Command, "sv"))
+ {
+ Info->InVectors = 1;
+ }
+ else
+ {
+ // The remaining commands have a mandatory argument:
+ if (!Value)
+ {
+ printf("* Invalid command line in ReadSVMModel\n");
+ return 0;
+ }
+ if (!CompareStrings(Command, "gamma"))
+ {
+ Model->Gamma = atof(Value);
+ }
+ if (!CompareStrings(Command, "rho"))
+ {
+ Model->Rho = atof(Value);
+ }
+ }
+ }
+ return 1;
+}
+
+// Read an SVM model from a .model file.
+SVMModel* ReadSVMModel(char* ModelFileName)
+{
+ SVMModel* Model;
+ FILE* File;
+ SVMParseInfo Info;
+ //
+ //printf("Reading SVM model.\n");
+ Model = (SVMModel*)calloc(sizeof(SVMModel), 1);
+ File = fopen(ModelFileName, "r");
+ if (!File)
+ {
+ REPORT_ERROR_S(8, ModelFileName);
+ return NULL;
+ }
+ Info.Model = Model;
+ Info.InVectors = 0;
+ ParseFileByLines(File, ReadSVMModelCallback, &Info, 0);
+ fclose(File);
+ return Model;
+}
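+
+// The .model file parsed above looks roughly like this (an illustrative sketch based on the
+// parser; only "gamma", "rho", and the "sv" marker are interpreted in the header):
+//   gamma 0.5
+//   rho 1.234
+//   sv
+//   0.83 1:0.12 2:-0.40 3:0.90
+//   -1.10 1:0.33 4:0.70
+// Each line after "sv" is one support vector: its weight, then 1-based index:value pairs.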
+
+void TestPValue(char* FeatureVectorFileName)
+{
+ FILE* FeatureVectorFile;
+ int* HistogramFalse;
+ int* HistogramTrue;
+ float Coords[32];
+ char* ValueString;
+ int TrueFlag;
+ int FeatureIndex;
+ float Result;
+ int HistogramBin;
+ FILE* OutputFile;
+ int FalseCount = 0;
+ int TrueCount = 0;
+ int TrueCumulative = 0;
+ int FalseCumulative = 0;
+ int BufferEnd = 0;
+ int BufferPos = 0;
+ int BytesRead;
+ int BytesToRead;
+ char* Buffer;
+ char* LineBuffer;
+ int LineNumber = 0;
+ char* FieldString;
+ char TextBuffer[BUFFER_SIZE * 2];
+ //char* ValueString;
+ //
+ FeatureVectorFile = fopen(FeatureVectorFileName, "r");
+ Buffer = (char*)malloc(sizeof(char) * 10240);
+ LineBuffer = (char*)malloc(sizeof(char)*MAX_LINE_LENGTH);
+
+ HistogramFalse = (int*)calloc(sizeof(int), 1000);
+ HistogramTrue = (int*)calloc(sizeof(int), 1000);
+ OutputFile = fopen("PValueTest.txt", "w");
+ InitPValueSVM();
+ while (1)
+ {
+ BytesToRead = BUFFER_SIZE - BufferEnd;
+ BytesRead = ReadBinary(TextBuffer + BufferEnd, sizeof(char), BytesToRead, FeatureVectorFile);
+ BufferEnd += BytesRead;
+ TextBuffer[BufferEnd] = '\0';
+ if (BufferPos == BufferEnd)
+ {
+ // We're done!
+ break;
+ }
+
+ // Copy a line of text to the line buffer. Skip spaces, and stop at carriage return or newline.
+ BufferPos = CopyBufferLine(TextBuffer, BufferPos, BufferEnd, LineBuffer, 0);
+ LineNumber += 1;
+
+ // Now, move the remaining text to the start of the buffer:
+ memmove(TextBuffer, TextBuffer + BufferPos, BufferEnd - BufferPos);
+ BufferEnd -= BufferPos;
+ BufferPos = 0;
+
+ // Now, process this line of text!
+ // Skip empty lines:
+ if (!LineBuffer[0])
+ {
+ continue;
+ }
+ if (LineBuffer[0] == '#')
+ {
+ continue;
+ }
+ // Ok, it's a feature line. Split into pieces...
+ memset(Coords, 0, sizeof(float)*32);
+ ValueString = strtok(LineBuffer, WHITESPACE);
+ TrueFlag = atoi(ValueString);
+ fprintf(OutputFile, "%d\t", TrueFlag);
+ if (TrueFlag < 0)
+ {
+ TrueFlag = 0;
+ }
+ FeatureIndex = 0;
+ while (1)
+ {
+ FieldString = strtok(NULL, WHITESPACE);
+ ValueString = FieldString;
+ if (!ValueString)
+ {
+ break;
+ }
+ while (*ValueString!=':')
+ {
+ ValueString++;
+ }
+ *ValueString = '\0';
+ FeatureIndex = atoi(FieldString) - 1;
+ ValueString++;
+ Coords[FeatureIndex++] = (float)atof(ValueString);
+ fprintf(OutputFile, "%s\t", ValueString);
+ }
+ Result = SVMClassify(PValueSVM, Coords, 1);
+ fprintf(OutputFile, "%.4f\n", Result);
+ HistogramBin = (int)(Result*10 + 0.5) + 300;
+ HistogramBin = max(0, min(999, HistogramBin));
+ if (TrueFlag)
+ {
+ HistogramTrue[HistogramBin]++;
+ TrueCount++;
+ }
+ else
+ {
+ HistogramFalse[HistogramBin]++;
+ FalseCount++;
+ }
+ }
+ FalseCount = max(FalseCount, 1); // avoid dividing by zero
+ TrueCount = max(TrueCount, 1); // avoid dividing by zero
+
+ for (HistogramBin = 0; HistogramBin < 1000; HistogramBin++)
+ {
+ TrueCumulative += HistogramTrue[HistogramBin];
+ FalseCumulative += HistogramFalse[HistogramBin];
+ fprintf(OutputFile, "%d\t%.2f\t%.2f\t%.2f\t\n",
+ HistogramBin, (HistogramBin - 300) / 10.0,
+ 100*TrueCumulative/(float)TrueCount,
+ 100*FalseCumulative/(float)FalseCount);
+ }
+}
+
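+// Two-class LDA with a shared inverse covariance matrix CovInv: the value returned below is
+// the difference of the two linear discriminants,
+//   g_true(x) - g_false(x) = (mu_true - mu_false)^T * CovInv * x + (SubProdTrue - SubProdFalse),
+// where the SubProd* constants presumably fold in the -0.5 * mu^T * CovInv * mu terms;
+// positive values favor the true-match class.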
+float LDAClassify(float* Features)
+{
+ float ScaledFeatures[6];
+ double FeatureMin[] = {-1.88, 0, 0, 0};
+ double FeatureMax[] = {3.81, 1, 1, 2};
+ float HalfRange;
+ int X;
+ int Y;
+ int FeatureCount;
+ static double* CovInv[6];
+ double* MeanVectorTrue;
+ double* MeanVectorFalse;
+ double SubProdTrue;
+ double SubProdFalse;
+ float ProdTemp[6];
+ float ProdTrue;
+ float ProdFalse;
+
+ // Constants for TRYPTIC scoring:
+ double TCovInvA[] = {6.037,-8.996,-7.351,-0.283};
+ double TCovInvB[] = {-8.996,51.379,-3.536,2.428};
+ double TCovInvC[] = {-7.351,-3.536,28.577,-0.271};
+ double TCovInvD[] = {-0.283,2.428,-0.271,2.382};
+ double TMeanVectorTrue[] = {2.048,0.022,0.187,0.622};
+ double TMeanVectorFalse[] = {-0.352,-0.668,-0.629,0.102};
+ double TSubProdTrue = (float)-10.052;
+ double TSubProdFalse = (float)-12.146;
+
+ // Constants for NON-TRYPTIC scoring:
+ double NTCovInvA[] = {6.003,-8.708,-7.383};
+ double NTCovInvB[] = {-8.708,48.904,-3.259};
+ double NTCovInvC[] = {-7.383,-3.259,28.546};
+ double NTMeanVectorTrue[] = {2.048,0.022,0.187};
+ double NTMeanVectorFalse[] = {-0.352,-0.668,-0.629};
+ double NTSubProdTrue = (float)-9.880;
+ double NTSubProdFalse = (float)-11.888;
+
+ // Choose the feature-set by digest type.
+ if (GlobalOptions->DigestType == DIGEST_TYPE_TRYPSIN)
+ {
+ MeanVectorTrue = TMeanVectorTrue;
+ MeanVectorFalse = TMeanVectorFalse;
+ SubProdTrue = TSubProdTrue;
+ SubProdFalse = TSubProdFalse;
+ CovInv[0] = TCovInvA;
+ CovInv[1] = TCovInvB;
+ CovInv[2] = TCovInvC;
+ CovInv[3] = TCovInvD;
+ FeatureCount = 4;
+ }
+ else
+ {
+ MeanVectorTrue = NTMeanVectorTrue;
+ MeanVectorFalse = NTMeanVectorFalse;
+ SubProdTrue = NTSubProdTrue;
+ SubProdFalse = NTSubProdFalse;
+ CovInv[0] = NTCovInvA;
+ CovInv[1] = NTCovInvB;
+ CovInv[2] = NTCovInvC;
+ FeatureCount = 3;
+ }
+ // Scale the features into [-1, 1]:
+ for (X = 0; X < FeatureCount; X++)
+ {
+ HalfRange = (float)((FeatureMax[X] - FeatureMin[X]) / 2.0);
+ ScaledFeatures[X] = (float)((Features[X] - FeatureMin[X]) / HalfRange - 1.0);
+ }
+ // Compute the product of the inverse covariance matrix with our feature vector:
+ for (X = 0; X < FeatureCount; X++)
+ {
+ ProdTemp[X] = 0;
+ for (Y = 0; Y < FeatureCount; Y++)
+ {
+ ProdTemp[X] += (float)(ScaledFeatures[Y] * CovInv[X][Y]);
+ }
+ }
+ // Compute u0 * C-1 * X and u1 * C-1 * X
+ ProdTrue = 0;
+ ProdFalse = 0;
+ for (X = 0; X < FeatureCount; X++)
+ {
+ ProdFalse += (float)(MeanVectorFalse[X] * ProdTemp[X]);
+ ProdTrue += (float)(MeanVectorTrue[X] * ProdTemp[X]);
+ }
+ ProdTrue += (float)SubProdTrue;
+ ProdFalse += (float)SubProdFalse;
+ //printf("%.2f\t%.2f\t%.2f\t\n", (ProdTrue - ProdFalse), ProdTrue, ProdFalse);
+ return (ProdTrue - ProdFalse);
+}
+
+void LoadCCModelSVM(int ForceRefresh)
+{
+ char FilePath[2048];
+ if (CCModel1SVM)
+ {
+ if (ForceRefresh)
+ {
+ FreeSVMModel(CCModel1SVM);
+ FreeSVMModel(CCModel2SVM);
+ }
+ else
+ {
+ return;
+ }
+ }
+
+ if(GlobalOptions->PhosphorylationFlag)
+ { //separate model for charge 2 only; not enough training data for charge 1
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "CCSVM1.model");
+ CCModel1SVM = ReadSVMModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "CCSVM1.range");
+ ReadSVMScaling(CCModel1SVM, FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "CCSVM2Phos.model");
+ CCModel2SVM = ReadSVMModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "CCSVM2Phos.range");
+ ReadSVMScaling(CCModel2SVM, FilePath);
+ }
+ else
+ {
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "CCSVM1.model");
+ CCModel1SVM = ReadSVMModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "CCSVM1.range");
+ ReadSVMScaling(CCModel1SVM, FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "CCSVM2.model");
+ CCModel2SVM = ReadSVMModel(FilePath);
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "CCSVM2.range");
+ ReadSVMScaling(CCModel2SVM, FilePath);
+ }
+}
+
+void FreeCCModelSVM()
+{
+ FreeSVMModel(CCModel1SVM);
+ CCModel1SVM = NULL;
+ FreeSVMModel(CCModel2SVM);
+ CCModel2SVM = NULL;
+
+}
diff --git a/SVM.h b/SVM.h
new file mode 100644
index 0000000..3e32429
--- /dev/null
+++ b/SVM.h
@@ -0,0 +1,81 @@
+//Title: SVM.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef SVM_H
+#define SVM_H
+// Structs to support use of SVMs:
+#include "Utils.h"
+#include "Inspect.h"
+#include "Spectrum.h"
+#include "Trie.h"
+
+// Support vectors are of this length (or shorter)
+#define SUPPORT_VECTOR_LENGTH 32
+
+typedef struct SupportVector
+{
+ //int Classification; // +1 or -1
+ double Weight;
+ double Coords[SUPPORT_VECTOR_LENGTH];
+ struct SupportVector* Next;
+} SupportVector;
+
+typedef struct SVMModel
+{
+ SupportVector* FirstVector;
+ SupportVector* LastVector;
+ int Coords;
+ double ScaleMin[SUPPORT_VECTOR_LENGTH];
+ double ScaleMax[SUPPORT_VECTOR_LENGTH];
+ double ScaleSize[SUPPORT_VECTOR_LENGTH];
+ double Beta[SUPPORT_VECTOR_LENGTH]; // for computing classifier values
+ double Beta0;
+ double Rho;
+ double Gamma; // for RBF kernel
+} SVMModel;
+
+extern SVMModel* PValueSVM;
+
+float SVMComputeMQScore(MSSpectrum* Spectrum, Peptide* Match, float* MQFeatures);
+float SVMClassify(SVMModel* Model, float* Coords, int PreScaled);
+void FreeSVMModels();
+SVMModel* ReadSVMModel(char* FileName);
+void ReadSVMScaling(SVMModel* Model, char* ScaleFileName);
+float GetPValue(float MQScore);
+float LDAClassify(float* Features);
+void TestPValue(char* FeatureVectorFileName);
+void LoadCCModelSVM(int ForceRefresh);
+void FreeCCModelSVM();
+void InitPValueSVM();
+
+#endif // SVM_H
+
diff --git a/Score.c b/Score.c
new file mode 100644
index 0000000..baba739
--- /dev/null
+++ b/Score.c
@@ -0,0 +1,862 @@
+//Title: Score.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include <string.h>
+#include <math.h>
+#include <stdlib.h>
+#include "Mods.h"
+#include "Score.h"
+#include "Spectrum.h"
+#include "Inspect.h"
+#include "Tagger.h"
+#include "SVM.h"
+#include "IonScoring.h"
+#include "ParentMass.h"
+
+////////////////////////////////////////////////////////////////////////////////////////
+// Score.c and Score.h support an 'alignment-based' scoring method which has been
+// discarded in favor of SVM-based scoring. The two methods have similar performance, but
+// the SVM was slightly better and slightly faster. This code is kept around for reference,
+// but isn't executed in practice.
+
+// The max length permitted for any peptide match:
+#define MAX_PEPTIDE_LENGTH 256
+
+// Set SHOW_DP_TABLE to enable verbose printout of the d.p. table from scoring.
+//#define SHOW_DP_TABLE 1
+
+// TheoPeak has a mass, an ion type, and a score. It's a peak
+// in the *theoretical fragmentation spectrum*. Some peaks (e.g.
+// an a-ion peak late in the peptide) have bad scores; others get very
+// good scores. The scores are log-odds scores. Scores are based on the
+// scores in the ScoringModel.
+typedef struct TheoPeak
+{
+ int Mass;
+ int IonType;
+ int LossType;
+ int Score;
+ int AntiScore;
+ // CutIndex is the number of amino acids in this fragment.
+ // For instance, in EAMAPK, b1 and y5 are the same cut point.
+ // CutIndex is always at least 1, since a fragment has at least one amino acid.
+ int CutIndex;
+ // TrueCutIndex is the index of the cut. (So, b and y fragments with same TrueCutIndex represent
+ // breakage of the same peptide bond)
+ int TrueCutIndex;
+ // AssignedPeak is built during spectral scoring, when backtracking along the DP table
+ SpectralPeak* AssignedPeak;
+} TheoPeak;
+
+// Our scoring model uses several type of cuts, and computes probabilities for each.
+// Here's a diagram of the cut points for peptide "SPECTRUM":
+// LeftEdge L1 L2 Mid Mid Mid R2 R1 RightEdge
+// S P E C T R U M
+typedef enum CutPointType
+{
+ CutPointTypeL1 = 0,
+ CutPointTypeL2,
+ CutPointTypeR1,
+ CutPointTypeR2,
+ CutPointTypeMid,
+ CutPointTypeLeftEdge,
+ CutPointTypeRightEdge,
+ CutPointTypeCount
+} CutPointType;
+
+// Skew odds are modeled by bins (0 to 0.05 Da, 0.05 to 0.1, up to a bin for 0.5+).
+#define SKEW_BIN_COUNT 10
+
+#define SECTOR_COUNT 3
+
+// Scoring model, built by DPTrainer.py
+typedef struct ScoringModel
+{
+ int BScore[CutPointTypeCount];
+ int YScore[CutPointTypeCount];
+ int BH2OBoostScore[CutPointTypeCount];
+ int BH2OScore[CutPointTypeCount];
+ int BNH3BoostScore[CutPointTypeCount];
+ int BNH3Score[CutPointTypeCount];
+ int ABoostScore[CutPointTypeCount];
+ int AScore[CutPointTypeCount];
+ int YH2OBoostScore[CutPointTypeCount];
+ int YH2OScore[CutPointTypeCount];
+ int YNH3Score[CutPointTypeCount];
+ int B2BoostScore[CutPointTypeCount];
+ int B2Score[CutPointTypeCount];
+ int Y2Score[CutPointTypeCount];
+ int AH2OScore[CutPointTypeCount];
+ int ANH3Score[CutPointTypeCount];
+ int NoisePenalty[21];
+ int PresenceScore[SECTOR_COUNT];
+ int AbsenceScore[SECTOR_COUNT];
+ int InexplicableScore[SECTOR_COUNT];
+ int SkewScore[SKEW_BIN_COUNT];
+} ScoringModel;
+
+// We have an array of models - one for each charge state (1, 2, 3+)
+ScoringModel* Models;
+
+// P-values are assigned based on a histogram of Match Quality Scores for false matches.
+// Histogram bin-count (and edges) are hard-coded.
+#define PVALUE_BIN_COUNT 300
+#define PVALUE_BIN_BOOST 100
+float g_MatchPValueShort[PVALUE_BIN_COUNT];
+float g_MatchPValueMedium[PVALUE_BIN_COUNT];
+float g_MatchPValueLong[PVALUE_BIN_COUNT];
+float g_MatchPValueLongLong[PVALUE_BIN_COUNT];
+
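+// InitPValue reads a simple binary layout: four consecutive arrays of PVALUE_BIN_COUNT floats
+// (g_MatchPValueShort, then Medium, Long, and LongLong), presumably indexed by a binned
+// match-quality score; there is no header or checksum.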
+int InitPValue(char* FileName)
+{
+ FILE* File;
+ //
+ File = fopen(FileName, "rb");
+ if (!File)
+ {
+ return 0;
+ }
+ ReadBinary(g_MatchPValueShort, sizeof(float), PVALUE_BIN_COUNT, File);
+ ReadBinary(g_MatchPValueMedium, sizeof(float), PVALUE_BIN_COUNT, File);
+ ReadBinary(g_MatchPValueLong, sizeof(float), PVALUE_BIN_COUNT, File);
+ ReadBinary(g_MatchPValueLongLong, sizeof(float), PVALUE_BIN_COUNT, File);
+ fclose(File);
+ return 1;
+}
+
+
+// For debug output: Return a description of an ion type code.
+char* GetIonTypeName(int IonType)
+{
+ switch (IonType)
+ {
+ case evIonTypeNone:
+ return "None";
+ case evIonTypeB:
+ return "B";
+ case evIonTypeY:
+ return "Y";
+ case evIonTypeA:
+ return "A";
+ case evIonTypeBH2O:
+ return "B-H2O";
+ case evIonTypeAH2O:
+ return "A-H2O";
+ case evIonTypeBNH3:
+ return "B-NH3";
+ case evIonTypeANH3:
+ return "A-NH3";
+ case evIonTypeYH2O:
+ return "Y-H2O";
+ case evIonTypeYNH3:
+ return "Y-NH3";
+ case evIonTypeB2:
+ return "B2";
+ case evIonTypeY2:
+ return "Y2";
+ case evIonTypeNoise:
+ return "<noise>";
+ case evIonTypeBPhos:
+ return "b-p";
+ case evIonTypeYPhos:
+ return "y-p";
+ default:
+ return "BROKEN****";
+ }
+}
+
+// For debug output: Return a description of an ion type code.
+char* GetShortIonTypeName(int IonType)
+{
+ switch (IonType)
+ {
+ case evIonTypeNone:
+ return "-";
+ case evIonTypeB:
+ return "b";
+ case evIonTypeY:
+ return "y";
+ case evIonTypeA:
+ return "a";
+ case evIonTypeBH2O:
+ return "b-H2O";
+ case evIonTypeAH2O:
+ return "a-H2O";
+ case evIonTypeBNH3:
+ return "b-NH3";
+ case evIonTypeANH3:
+ return "a-NH3";
+ case evIonTypeYH2O:
+ return "y-H2O";
+ case evIonTypeYNH3:
+ return "y-NH3";
+ case evIonTypeB2:
+ return "b2";
+ case evIonTypeY2:
+ return "y2";
+ case evIonTypeNoise:
+ return "<noise>";
+ case evIonTypeBPhos:
+ return "b-p";
+ case evIonTypeYPhos:
+ return "y-p";
+ default:
+ return "*";
+ }
+}
+
+// For debugging: Print a theoretical fragmentation spectrum.
+void DebugPrintPeaks(TheoPeak* Peaks, int PeakCount)
+{
+ int PeakIndex;
+ //
+ printf("\n-----Peak list:-----\n");
+ for (PeakIndex = 0; PeakIndex < PeakCount; PeakIndex++)
+ {
+ printf("%d: m/z %.2f %s %d %d\n", PeakIndex, Peaks[PeakIndex].Mass / (float)MASS_SCALE, GetIonTypeName(Peaks[PeakIndex].IonType),
+ Peaks[PeakIndex].CutIndex, Peaks[PeakIndex].Score);
+ }
+}
+
+// Theoretical spectrum builder - simple struct for remembering sector edges
+// and the current proline bonus.
+typedef struct TheoPeakBuilder
+{
+ int SectorEdgeA;
+ int SectorEdgeB;
+ int BProlineBonus;
+ int YProlineBonus;
+ int MatchLength;
+} TheoPeakBuilder;
+
+// Add a new theoretical peak to Peaks.
+void AddTheoPeak(TheoPeakBuilder* Theo, TheoPeak* Peaks, int PeakCount, int IonType, int LossType, int Mass,
+ int CutIndex, int PrefixFlag, ScoringModel* Model, int Score)
+{
+ int SectorNumber;
+ //
+ Peaks[PeakCount].IonType = IonType;
+ Peaks[PeakCount].LossType = LossType;
+ Peaks[PeakCount].Mass = Mass;
+ Peaks[PeakCount].CutIndex = CutIndex;
+ Peaks[PeakCount].Score = Score;
+ if (PrefixFlag)
+ {
+ Peaks[PeakCount].TrueCutIndex = CutIndex;
+ Peaks[PeakCount].Score = min(0, Peaks[PeakCount].Score + Theo->BProlineBonus);
+ }
+ else
+ {
+ Peaks[PeakCount].TrueCutIndex = Theo->MatchLength - CutIndex;
+ Peaks[PeakCount].Score = min(0, Peaks[PeakCount].Score + Theo->YProlineBonus);
+ }
+ if (Mass > Theo->SectorEdgeB)
+ {
+ SectorNumber = 2;
+ }
+ else if (Mass > Theo->SectorEdgeA)
+ {
+ SectorNumber = 1;
+ }
+ else
+ {
+ SectorNumber = 0;
+ }
+ // Compare against the null model right here:
+ Peaks[PeakCount].Score -= Model->PresenceScore[SectorNumber];
+}
+
+// Give a bonus for peptides that seem plausible given the cleavage type.
+// For instance: If our sample was subjected to trypsin digest, then *most* fragments will
+// end in K or R (and be preceded by a K or R), so give a bonus to such peptides.
+// The DEFAULT behavior is to assume no digest and give no points.
+// This code is NO LONGER USED in production; instead, number of tryptic termini (NTT) is used
+// as one feature for LDA.
+int ApplyDigestBonus(Peptide* Match)
+{
+ int Score = 0;
+ int AminoCount;
+ int AminoIndex;
+ // SWT 3/9/5: Use a somewhat HEAVIER penalty for broken endpoints. And, penalize two bad endpoints
+ // superadditively
+ int MissedCleavagePenalty = 100;
+ int BrokenSuffixPenalty = 550;
+ int BrokenPrefixPenalty = 550;
+ int BrokenBothPenalty = 400; // extra penalty if both endpoints broken
+ char MutantBases[256];
+ int BrokenTermini = 0;
+ int ModIndex;
+ // Write MutantBases string. This contains the *real* amino acids of the match, with any mutations
+ // applied. For instance, if Match->Bases is "EAMAPK" but match has an M->Q mutation in position 2,
+ // then MutantBases will be "EAQAPK".
+ strcpy(MutantBases, Match->Bases);
+ if (!GlobalOptions->TaglessSearchFlag)
+ {
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (!Match->ModType[ModIndex])
+ {
+ break;
+ }
+ if (Match->ModType[ModIndex]->Amino)
+ {
+ MutantBases[Match->AminoIndex[ModIndex]] = Match->ModType[ModIndex]->Amino;
+ }
+ }
+ }
+ switch (GlobalOptions->DigestType)
+ {
+ case 0:
+ // No digest (or unknown digest), so no points
+ return 0;
+ case 1:
+ // A tryptic peptide gets a minor bonus, and
+ // missed cleavages get a minor penalty.
+ AminoCount = strlen(MutantBases);
+ for (AminoIndex = 1; AminoIndex < AminoCount - 1; AminoIndex++)
+ {
+ if ((MutantBases[AminoIndex] == 'K' || MutantBases[AminoIndex] == 'R') && (MutantBases[AminoIndex + 1]!='P'))
+ Score -= MissedCleavagePenalty;
+ }
+ if (MutantBases[AminoCount - 1] != 'K' && MutantBases[AminoCount - 1] != 'R')
+ {
+ Score -= BrokenSuffixPenalty;
+ BrokenTermini++;
+ }
+ if (Match->PrefixAmino && (Match->PrefixAmino!='K' && Match->PrefixAmino!='R'))
+ {
+ Score -= BrokenPrefixPenalty;
+ BrokenTermini++;
+ }
+ if (BrokenTermini==2)
+ {
+ Score -= BrokenBothPenalty;
+ }
+ return Score;
+ case 2:
+ // Chymotrypsin: Cleaves C-terminal side of FYWL (if not followed by P)
+ AminoCount = strlen(MutantBases);
+ for (AminoIndex = 1; AminoIndex < AminoCount - 1; AminoIndex++)
+ {
+ if ((MutantBases[AminoIndex] == 'F' || MutantBases[AminoIndex] == 'Y' ||
+ MutantBases[AminoIndex] == 'W' || MutantBases[AminoIndex] == 'L') && (MutantBases[AminoIndex + 1]!='P'))
+ Score -= MissedCleavagePenalty;
+ }
+ if (MutantBases[AminoCount - 1] != 'F' && MutantBases[AminoCount - 1] != 'Y' &&
+ MutantBases[AminoCount - 1] != 'W' && MutantBases[AminoCount - 1] != 'L')
+ {
+ BrokenTermini++;
+ Score -= BrokenSuffixPenalty;
+ }
+ if (Match->PrefixAmino && (Match->PrefixAmino != 'F' && Match->PrefixAmino != 'Y' &&
+ Match->PrefixAmino != 'W' && Match->PrefixAmino != 'L'))
+ {
+ BrokenTermini++;
+ Score -= BrokenPrefixPenalty;
+ }
+ if (BrokenTermini==2)
+ {
+ Score -= BrokenBothPenalty;
+ }
+
+ return Score;
+ case 3:
+ // Lys-C - similar to trypsin. Cleaves after K if not before P.
+ // missed cleavages get a minor penalty.
+ AminoCount = strlen(MutantBases);
+ for (AminoIndex = 1; AminoIndex < AminoCount - 1; AminoIndex++)
+ {
+ if ((MutantBases[AminoIndex] == 'K') && (MutantBases[AminoIndex+1]!='P'))
+ Score -= MissedCleavagePenalty;
+ }
+ if (MutantBases[AminoCount - 1] != 'K')
+ {
+ Score -= BrokenSuffixPenalty;
+ BrokenTermini++;
+ }
+ if (Match->PrefixAmino && (Match->PrefixAmino!='K'))
+ {
+ Score -= BrokenPrefixPenalty;
+ BrokenTermini++;
+ }
+ if (BrokenTermini==2)
+ {
+ Score -= BrokenBothPenalty;
+ }
+
+ return Score;
+ case 4:
+ // Asp-N - Cleaves before (on N-terminal side of) DE
+ AminoCount = strlen(MutantBases);
+ // Penalty for missed cleavages:
+ for (AminoIndex = 1; AminoIndex < AminoCount - 1; AminoIndex++)
+ {
+ if ((MutantBases[AminoIndex] == 'D') || (MutantBases[AminoIndex]=='E'))
+ Score -= MissedCleavagePenalty;
+ }
+ if (Match->SuffixAmino && (Match->SuffixAmino!='D' && Match->SuffixAmino!='E'))
+ {
+ Score -= BrokenSuffixPenalty;
+ BrokenTermini++;
+ }
+ if (MutantBases[0]!='D' && MutantBases[0]!='E')
+ {
+ Score -= BrokenPrefixPenalty;
+ BrokenTermini++;
+ }
+ if (BrokenTermini==2)
+ {
+ Score -= BrokenBothPenalty;
+ }
+
+ return Score;
+ case 5:
+ // GluC cleaves c-terminal of E
+ AminoCount = strlen(MutantBases);
+ for (AminoIndex = 1; AminoIndex < AminoCount - 1; AminoIndex++)
+ {
+ if (MutantBases[AminoIndex] == 'E')
+ {
+ Score -= MissedCleavagePenalty;
+ }
+ }
+ if (MutantBases[AminoCount - 1] != 'E')
+ {
+ Score -= BrokenSuffixPenalty;
+ BrokenTermini++;
+ }
+ if (Match->PrefixAmino && (Match->PrefixAmino!='E'))
+ {
+ Score -= BrokenPrefixPenalty;
+ BrokenTermini++;
+ }
+ if (BrokenTermini==2)
+ {
+ Score -= BrokenBothPenalty;
+ }
+ return Score;
+ default:
+ printf("Unknown digest type '%d' encountered, no scoring adjustment applied.\n", GlobalOptions->DigestType);
+ return 0;
+ }
+}
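+
+// Worked example (trypsin, DigestType 1): "EAMAPK" preceded by 'R' incurs no penalty, while
+// "EAMAPL" preceded by 'G' loses BrokenSuffixPenalty + BrokenPrefixPenalty + BrokenBothPenalty
+// = 1500 points, since neither terminus looks tryptic.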
+
+int DiffPeptides(char* AA1, char* AA2)
+{
+ int DiffCount = 0;
+ while (*AA1 && *AA2)
+ {
+ if (*AA1 != *AA2)
+ {
+ DiffCount++;
+ }
+ AA1++;
+ AA2++;
+ }
+ return DiffCount;
+}
+
+void SetMatchDeltaCN(SpectrumNode* Spectrum)
+{
+ Peptide* Match;
+ Peptide* OtherMatch;
+ int MatchNumber = 0;
+ int IsSame;
+
+ // Init DeltaCN and DeltaCNOther:
+ for (Match = Spectrum->FirstMatch; Match; Match = Match->Next)
+ {
+ Match->DeltaCN = (float)FORBIDDEN_PATH;
+ Match->DeltaCNOther = (float)FORBIDDEN_PATH;
+ }
+
+ // Properly set DeltaCN and DeltaCNOther:
+ for (Match = Spectrum->FirstMatch; Match; Match = Match->Next)
+ {
+ MatchNumber++;
+ if (Match != Spectrum->FirstMatch)
+ {
+ Match->DeltaCN = Match->MatchQualityScore - Spectrum->FirstMatch->MatchQualityScore;
+ }
+ else
+ {
+ if (Match->Next)
+ {
+ Match->DeltaCN = Match->MatchQualityScore - Match->Next->MatchQualityScore;
+ }
+ else
+ {
+ Match->DeltaCN = max(0, Match->MatchQualityScore);
+ }
+ }
+ // If this match's DeltaCNOther was already set by a dissimilar, higher-scoring match, skip it:
+ if (Match->DeltaCNOther != FORBIDDEN_PATH)
+ {
+ continue;
+ }
+ if (Match->FamilyLeader)
+ {
+ Match->DeltaCNOther = Match->FamilyLeader->DeltaCNOther + (Match->MatchQualityScore - Match->FamilyLeader->MatchQualityScore);
+ continue;
+ }
+ if (MatchNumber > GlobalOptions->ReportMatchCount)
+ {
+ // We won't bother computing DeltaCNOther for any poorer matches, because we'll drop them anyway.
+ break;
+ }
+ for (OtherMatch = Match->Next; OtherMatch; OtherMatch = OtherMatch->Next)
+ {
+ IsSame = 0;
+ if (abs(Match->FilePos - OtherMatch->FilePos) < 3)
+ {
+ IsSame = 1;
+ }
+ if (DiffPeptides(Match->Bases, OtherMatch->Bases) < 2)
+ {
+ IsSame = 1;
+ }
+ if (DiffPeptides(Match->Bases, OtherMatch->Bases + 1) < 2)
+ {
+ IsSame = 1;
+ }
+ if (DiffPeptides(Match->Bases + 1, OtherMatch->Bases) < 2)
+ {
+ IsSame = 1;
+ }
+ if (IsSame)
+ {
+ OtherMatch->FamilyLeader = Match;
+ }
+ else
+ {
+ OtherMatch->DeltaCNOther = OtherMatch->MatchQualityScore - Match->MatchQualityScore;
+ if (Match->DeltaCNOther == FORBIDDEN_PATH)
+ {
+ Match->DeltaCNOther = Match->MatchQualityScore - OtherMatch->MatchQualityScore;
+ }
+ }
+ }
+ if (Match->DeltaCNOther == FORBIDDEN_PATH)
+ {
+ if (Match == Spectrum->LastMatch)
+ {
+ Match->DeltaCNOther = max(Match->MatchQualityScore, 0);
+ }
+ else
+ {
+ Match->DeltaCNOther = Match->MatchQualityScore - Spectrum->LastMatch->MatchQualityScore;
+ }
+ }
+ }
+}
+
+
+// Get PeptideMatchFeatures having to do with cut scores (mean, median...)
+int GetCutScorePeptideMatchFeatures(MSSpectrum* Spectrum, Peptide* Match, float* FeatureArray, PRMBayesianModel* Model)
+{
+ int FeatureIndex = 0;
+ float CutScores[256];
+ int PRMCount;
+ int AminoIndex;
+ float ScoreTotal;
+ int PeptideLength;
+ //
+ PeptideLength = strlen(Match->Bases);
+ //for (NodeIndex = 0, Node = Model->Head; Node; NodeIndex++, Node = Node->Next)
+ //{
+ // PRM = 0;
+ // for (AminoIndex = 0; AminoIndex <= PeptideLength; AminoIndex++)
+ // {
+ // ///////////////////////////////////////////////////////////////////////////////////////
+ // // Set values, and accumulate table entries:
+ // Node->Values[AminoIndex] = IonScoringGetNodeValue(Model, Node, Spectrum, PRM, Match, AminoIndex);
+ // ///////////////////////////////////////////////////////////////////////////////////////
+ // // Add to PRM:
+ // if (AminoIndex == PeptideLength)
+ // {
+ // break;
+ // }
+ // PRM += PeptideMass[Match->Bases[AminoIndex]];
+ // for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ // {
+ // if (Match->AminoIndex[ModIndex] == AminoIndex)
+ // {
+ // PRM += Match->ModType[ModIndex]->RealDelta;
+ // }
+ // }
+ // } // Amino loop
+ //} // NodeIndex loop
+
+ //// Populate the CutScores array:
+ //for (AminoIndex = 0; AminoIndex <= PeptideLength; AminoIndex++)
+ //{
+ // CutScores[AminoIndex] = PRMBNGetCutScore(Spectrum, Model, AminoIndex);
+ //}
+ PopulateCutScores(Model, Spectrum, Match, CutScores);
+
+ // Compute features based upon cut scores:
+ // Total/mean for ALL cut scores:
+ ScoreTotal = 0;
+ PRMCount = 0;
+ for (AminoIndex = 0; AminoIndex <= PeptideLength; AminoIndex++)
+ {
+ ScoreTotal += CutScores[AminoIndex];
+ PRMCount++;
+ }
+ FeatureArray[FeatureIndex++] = ScoreTotal;
+ FeatureArray[FeatureIndex++] = ScoreTotal / (float)PRMCount;
+
+ // Total/mean for CENTRAL cut scores:
+ ScoreTotal = 0;
+ PRMCount = 0;
+ for (AminoIndex = 1; AminoIndex < PeptideLength; AminoIndex++)
+ {
+ ScoreTotal += CutScores[AminoIndex];
+ PRMCount++;
+ }
+ FeatureArray[FeatureIndex++] = ScoreTotal;
+ FeatureArray[FeatureIndex++] = ScoreTotal / (float)max(1, PRMCount);
+
+ // Total/mean for INNER cut scores (trimming one more cut point from each end):
+ ScoreTotal = 0;
+ PRMCount = 0;
+ for (AminoIndex = 2; AminoIndex < (PeptideLength - 1); AminoIndex++)
+ {
+ ScoreTotal += CutScores[AminoIndex];
+ PRMCount++;
+ }
+ FeatureArray[FeatureIndex++] = ScoreTotal;
+ FeatureArray[FeatureIndex++] = ScoreTotal / (float)max(1, PRMCount);
+
+ // Median cut score:
+ PRMCount = PeptideLength + 1;
+ FeatureArray[FeatureIndex++] = GetMedian(CutScores + 2, PRMCount - 4);
+ FeatureArray[FeatureIndex++] = GetMedian(CutScores + 1, PRMCount - 2);
+ FeatureArray[FeatureIndex++] = GetMedian(CutScores, PRMCount);
+
+ return FeatureIndex;
+}
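+
+// In all, the block above contributes nine features: total and mean cut score over every cut
+// point, over the interior cut points, and over the doubly-trimmed interior, plus three
+// medians (doubly-trimmed, singly-trimmed, and untrimmed).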
+
+// Helper for GetPeptideMatchFeaturesFull: Compute features having to do with the percentage of peaks
+// and peak intensity explained by the match.
+int GetExplainedPeakPeptideMatchFeatures(MSSpectrum* Spectrum, Peptide* Match, float* FeatureArray)
+{
+ int PeakIndex;
+ float IntensityB = 0;
+ float IntensityY = 0;
+ float IntensityBSeries = 0;
+ float IntensityYSeries = 0;
+ float TotalIntensity = 0;
+ int PeakCountB = 0;
+ int PeakCountY = 0;
+ int StrongPeakCountB = 0;
+ int StrongPeakCountY = 0;
+ float WeightedPeakCountTotal = 0;
+ float WeightedPeakCountB = 0;
+ float WeightedPeakCountY = 0;
+ int StrongPeakCount;
+ int FeatureIndex = 0;
+ int FragmentType;
+ float PeakIntensity;
+ float WeightedPeakIndex;
+ int BFlag[256];
+ int YFlag[256];
+ int PeptideLength;
+ int PresentCount;
+ int AminoIndex;
+ //
+ PeptideLength = strlen(Match->Bases);
+ memset(BFlag, 0, sizeof(int) * (PeptideLength + 1));
+ memset(YFlag, 0, sizeof(int) * (PeptideLength + 1));
+ StrongPeakCount = PeptideLength * 2;
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ FragmentType = Spectrum->Peaks[PeakIndex].IonType;
+ PeakIntensity = Spectrum->Peaks[PeakIndex].Intensity;
+ TotalIntensity += PeakIntensity;
+ WeightedPeakIndex = (float)1.0 / (Spectrum->Peaks[PeakIndex].IntensityRank + 1);
+ WeightedPeakCountTotal += WeightedPeakIndex;
+ switch (FragmentType)
+ {
+ case evFragmentY:
+ PeakCountY++;
+ IntensityY += PeakIntensity;
+ IntensityYSeries += PeakIntensity;
+ WeightedPeakCountY += WeightedPeakIndex;
+ YFlag[Spectrum->Peaks[PeakIndex].AminoIndex] = 1;
+ if (Spectrum->Peaks[PeakIndex].IntensityRank < StrongPeakCount)
+ {
+ StrongPeakCountY++;
+ }
+ break;
+ case evFragmentYLoss:
+ IntensityYSeries += PeakIntensity;
+ break;
+ case evFragmentB:
+ PeakCountB++;
+ IntensityB += PeakIntensity;
+ IntensityBSeries += PeakIntensity;
+ WeightedPeakCountB += WeightedPeakIndex;
+ BFlag[Spectrum->Peaks[PeakIndex].AminoIndex] = 1;
+ if (Spectrum->Peaks[PeakIndex].IntensityRank < StrongPeakCount)
+ {
+ StrongPeakCountB++;
+ }
+ break;
+ case evFragmentBLoss:
+ IntensityBSeries += PeakIntensity;
+ break;
+ }
+ }
+ // Fraction of B, Y present:
+ PresentCount = 0;
+ for (AminoIndex = 0; AminoIndex <= PeptideLength; AminoIndex++)
+ {
+ PresentCount += YFlag[AminoIndex];
+ }
+ FeatureArray[FeatureIndex++] = PresentCount / (float)(PeptideLength + 1);
+ PresentCount = 0;
+ for (AminoIndex = 0; AminoIndex <= PeptideLength; AminoIndex++)
+ {
+ PresentCount += BFlag[AminoIndex];
+ }
+ FeatureArray[FeatureIndex++] = PresentCount / (float)(PeptideLength + 1);
+ PresentCount = 0;
+ for (AminoIndex = 1; AminoIndex < PeptideLength; AminoIndex++)
+ {
+ PresentCount += YFlag[AminoIndex];
+ }
+ FeatureArray[FeatureIndex++] = PresentCount / (float)(PeptideLength - 1);
+ PresentCount = 0;
+ for (AminoIndex = 1; AminoIndex < PeptideLength; AminoIndex++)
+ {
+ PresentCount += BFlag[AminoIndex];
+ }
+ FeatureArray[FeatureIndex++] = PresentCount / (float)(PeptideLength - 1);
+
+ // Fraction of top peaks:
+ FeatureArray[FeatureIndex++] = (StrongPeakCountY + StrongPeakCountB) / (float)StrongPeakCount;
+ FeatureArray[FeatureIndex++] = StrongPeakCountY / (float)StrongPeakCount;
+ FeatureArray[FeatureIndex++] = StrongPeakCountB / (float)StrongPeakCount;
+
+ FeatureArray[FeatureIndex++] = (WeightedPeakCountY + WeightedPeakCountB) / WeightedPeakCountTotal;
+ FeatureArray[FeatureIndex++] = WeightedPeakCountY / WeightedPeakCountTotal;
+ FeatureArray[FeatureIndex++] = WeightedPeakCountB / WeightedPeakCountTotal;
+
+ // Fraction of intensity:
+ FeatureArray[FeatureIndex++] = (IntensityY + IntensityB) / TotalIntensity;
+ FeatureArray[FeatureIndex++] = IntensityY / TotalIntensity;
+ FeatureArray[FeatureIndex++] = IntensityB / TotalIntensity;
+
+ // Fraction of intensity:
+ FeatureArray[FeatureIndex++] = (IntensityYSeries + IntensityBSeries) / TotalIntensity;
+ FeatureArray[FeatureIndex++] = IntensityYSeries / TotalIntensity;
+ FeatureArray[FeatureIndex++] = IntensityBSeries / TotalIntensity;
+
+ return FeatureIndex;
+}
+
+// Compute features rating the quality of this annotation for the spectrum.
+// Set feature values in FeatureArray, return the number of features set.
+int GetPeptideMatchFeaturesFull(MSSpectrum* Spectrum, Peptide* Match, float* FeatureArray)
+{
+ int FeatureIndex = 0;
+ int PeptideLength;
+ PRMBayesianModel* Model;
+ PMCSpectrumInfo* SpectrumInfo;
+ PMCInfo* Info;
+ //
+
+ // Length:
+ PeptideLength = strlen(Match->Bases);
+ FeatureArray[FeatureIndex++] = (float)PeptideLength;
+
+ if (Spectrum->Charge < 3)
+ {
+ Model = TAGModelCharge2;
+ }
+ else
+ {
+ Model = TAGModelCharge3;
+ }
+
+ Spectrum->ParentMass = GetPeptideParentMass(Match);
+
+ // Compute cut scores:
+ FeatureIndex += GetCutScorePeptideMatchFeatures(Spectrum, Match, FeatureArray + FeatureIndex, Model);
+
+ // Compute features based on the fraction of top peaks / intensity explained:
+ FeatureIndex += GetExplainedPeakPeptideMatchFeatures(Spectrum, Match, FeatureArray + FeatureIndex);
+
+ ///////////////////////////////
+ // Spectral convolution:
+ SpectrumInfo = GetPMCSpectrumInfo(Spectrum);
+ Info = (PMCInfo*)calloc(1, sizeof(PMCInfo));
+ Info->Charge = SpectrumInfo->Charge;
+ Info->ParentMass = Spectrum->ParentMass;
+ SpectrumInfo->Head = Info;
+ SpectrumInfo->Tail = Info;
+ ConvolveMassCorrectedSpectrum(Info, SpectrumInfo);
+ FeatureArray[FeatureIndex++] = Info->Convolve[2];
+ FeatureArray[FeatureIndex++] = Info->Convolve2[0];
+ FreePMCSpectrumInfo(SpectrumInfo);
+
+ /////////////////////////////////
+ // Number of tryptic termini:
+ FeatureArray[FeatureIndex++] = (float)CountTrypticTermini(Match);
+
+ ////////////////////////////////
+ // Fancy length feature:
+ FeatureArray[FeatureIndex++] = (float)log(max(1, PeptideLength - 5));
+ FeatureArray[FeatureIndex++] = (float)log(max(1, PeptideLength - 4));
+ FeatureArray[FeatureIndex++] = (float)log(max(1, PeptideLength - 3));
+
+ return FeatureIndex;
+}
+
+float GetPenalizedScore(MSSpectrum* Spectrum, Peptide* Match, float Score)
+{
+ int ModIndex;
+ if (strlen(Match->Bases) < MIN_VALID_PEPTIDE_LENGTH)
+ {
+ Score -= 1.0;
+ }
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Match->ModType[ModIndex])
+ {
+ Score -= 0.25;
+ }
+ }
+ return Score;
+}
diff --git a/Score.h b/Score.h
new file mode 100644
index 0000000..22a08b6
--- /dev/null
+++ b/Score.h
@@ -0,0 +1,85 @@
+//Title: Score.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef SCORE_H
+#define SCORE_H
+#include "Inspect.h"
+#include "Trie.h"
+
+// For marking forbidden paths in d.p. tables:
+#define FORBIDDEN_PATH -99999999
+
+typedef enum evIonType
+{
+ evIonTypeNone = 0,
+ evIonTypeB,
+ evIonTypeY,
+ evIonTypeA,
+ evIonTypeBH2O,
+ evIonTypeAH2O,
+ evIonTypeBNH3,
+ evIonTypeANH3,
+ evIonTypeYH2O,
+ evIonTypeYNH3,
+ evIonTypeB2,
+ evIonTypeY2,
+ evIonTypeNoise,
+ evIonTypeBPhos, // B minus a phosphorylation
+ evIonTypeYPhos, // Y minus a phosphorylation
+ evIonTypeCount,
+} evIonType;
+
+typedef enum evLossType
+{
+ evLossNone = 0,
+ evLossB,
+ evLossY,
+} evLossType;
+
+// Workhorse function of Score.c: Compare the theoretical fragmentation pattern of a peptide to
+// a spectrum, and assign a score.
+int ScoreMatch(MSSpectrum* Spectrum, Peptide* Match, int VerboseFlag);
+
+// Apply a penalty if the peptide doesn't match GlobalOptions->DigestType. We have a
+// special scoring model for tryptic peptides, but other less specific proteases - like
+// GluC - also should affect scoring based on endpoints.
+int ApplyDigestBonus(Peptide* Match);
+
+// Compute the p-value for a match, based upon explained intensity and explained peaks and b/y ladder
+// and match score.
+//void ComputeMatchConfidenceLevel(MSSpectrum* Spectrum, Peptide* Match);
+void ScoreMatchTest(int VerboseFlag);
+int InitPValue(char* FileName);
+void SetMatchDeltaCN(SpectrumNode* Spectrum);
+int GetPeptideMatchFeaturesFull(MSSpectrum* Spectrum, Peptide* Match, float* FeatureArray);
+float GetPenalizedScore(MSSpectrum* Spectrum, Peptide* Match, float Score);
+#endif // SCORE_H
diff --git a/Score.py b/Score.py
new file mode 100644
index 0000000..797e2df
--- /dev/null
+++ b/Score.py
@@ -0,0 +1,61 @@
+#Title: Score.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+Score.py: Simple wrapper for inspect scoring
+"""
+import sys
+import string
+import PyInspect
+
+def FormatTuple(Tuple):
+ Str = "("
+ for Entry in Tuple:
+ Str += "%.4g, "%Entry
+ Str = Str[:-2]
+ Str += ")"
+ return Str
+
+ColonBits = sys.argv[1].split(":")
+try:
+ FileOffset = int(ColonBits[-1])
+ FileName = string.join(ColonBits[:-1], ":")
+except:
+ FileName = sys.argv[1]
+ FileOffset = 0
+
+Spectrum = PyInspect.Spectrum(FileName, FileOffset)
+#Result = Spectrum.ScorePeptideDetailed(sys.argv[2])
+Result = Spectrum.ScorePeptideDetailed(sys.argv[2])
+Str = "MQ %.4f %s"%(Result[0], FormatTuple(Result[1:]))
+print Str
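+# Example invocation (hypothetical file and offset); the first argument is
+# "<spectrum file>:<byte offset>" and the second is the peptide annotation:
+#   python Score.py Spectra/Run1.mzXML:123456 R.ACDEFGHIK.L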
diff --git a/ScoringModel.dat b/ScoringModel.dat
new file mode 100644
index 0000000..99e6bdc
Binary files /dev/null and b/ScoringModel.dat differ
diff --git a/Scorpion.c b/Scorpion.c
new file mode 100644
index 0000000..71d499a
--- /dev/null
+++ b/Scorpion.c
@@ -0,0 +1,1304 @@
+//Title: Scorpion.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <assert.h>
+#include <math.h>
+#include "Mods.h"
+#include "Scorpion.h"
+#include "Spectrum.h"
+#include "Inspect.h"
+#include "Tagger.h"
+#include "SVM.h"
+#include "BN.h"
+#include "FreeMod.h"
+#include "IonScoring.h"
+
+#define PRM_FEATURE_COUNT 32
+#define MAX_PEPTIDE_LENGTH 256
+#define INTENSITY_LEVEL_COUNT 4
+
+int g_CutFeatures[MAX_PEPTIDE_LENGTH * CUT_FEATURE_COUNT];
+float g_VerboseCutFeatures[MAX_PEPTIDE_LENGTH * CUT_FEATURE_COUNT];
+int g_PRMFeatures[PRM_FEATURE_COUNT];
+//float g_PRMBScore; // hax
+//float g_PRMYScore; // hax
+float g_CutScores[MAX_PEPTIDE_LENGTH];
+extern PRMBayesianModel* PRMModelCharge2;
+
+int SeizePeaks(MSSpectrum* Spectrum, int TargetMass, int IonType, int AminoIndex, float* pIntensity, float* pSkew, float* pAbsSkew);
+
+FILE* g_ScorpionScoringFile = NULL;
+
+float GetExplainedPeakPercent(MSSpectrum* Spectrum, int PeakCount, int BYOnly)
+{
+ int PeaksSeen = 0;
+ int AnnotatedCount = 0;
+ int PeakIndex;
+ SpectralPeak* Peak;
+ int VerboseFlag = 0;
+ //
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ Peak = Spectrum->Peaks + PeakIndex;
+ if (PeakCount>0 && Peak->IntensityRank >= PeakCount)
+ {
+ continue;
+ }
+ PeaksSeen++;
+ switch (Peak->IonType)
+ {
+ case IonB:
+ case IonB2:
+ case IonBI:
+ case IonY:
+ case IonY2:
+ case IonYI:
+ AnnotatedCount++;
+ if (VerboseFlag)
+ {
+ printf("Peak index %d at %.2f: %d\n", Peak->IntensityRank, Peak->Mass / (float)MASS_SCALE, Peak->IonType);
+ }
+
+ break;
+ case 0:
+ if (VerboseFlag)
+ {
+ printf("* Peak index %d at %.2f NOT annotated\n", Peak->IntensityRank, Peak->Mass / (float)MASS_SCALE);
+ }
+ break; // No annotated intensity for you!
+ default:
+ if (VerboseFlag)
+ {
+ printf("Peak index %d at %.2f: %d\n", Peak->IntensityRank, Peak->Mass / (float)MASS_SCALE, Peak->IonType);
+ }
+
+ if (!BYOnly)
+ {
+ AnnotatedCount++;
+ }
+ }
+ }
+ if (!PeaksSeen)
+ {
+ return 0;
+ }
+ if (PeakCount > 0)
+ {
+ return AnnotatedCount / (float)PeakCount;
+ }
+ return AnnotatedCount / (float)PeaksSeen;
+}
+
+
+float GetExplainedIntensityPercent(MSSpectrum* Spectrum, int PeakCount, int BYOnly)
+{
+ float PeakIntensity = 0;
+ float AnnotatedIntensity = 0;
+ int PeakIndex;
+ SpectralPeak* Peak;
+ //
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ Peak = Spectrum->Peaks + PeakIndex;
+ if (PeakCount>0 && Peak->IntensityRank >= PeakCount)
+ {
+ continue;
+ }
+
+ PeakIntensity += Peak->Intensity;
+ //printf("%.2f\t%.2f\t%d\n", Peak->Mass / (float)MASS_SCALE, Peak->Intensity, Peak->IonType);
+ switch (Peak->IonType)
+ {
+ case IonB:
+ case IonB2:
+ case IonBI:
+ case IonY:
+ case IonY2:
+ case IonYI:
+ AnnotatedIntensity += Peak->Intensity;
+ break;
+ case 0:
+ break; // No annotated intensity for you!
+ default:
+ if (!BYOnly)
+ {
+ AnnotatedIntensity += Peak->Intensity;
+ }
+ }
+ }
+ if (PeakIntensity == 0)
+ {
+ return 0;
+ }
+ return AnnotatedIntensity / PeakIntensity;
+}
+
+int GetFlankBFeature(char Left, char Right)
+{
+ // H on right: Strong suppression
+ if (Right == 'H')
+ {
+ return 0;
+ }
+ // G or P on left: Strong suppression
+ if (Left == 'G' || Left == 'P')
+ {
+ return 1;
+ }
+ // K or R on left: Augmentation
+ if (Left == 'K' || Left == 'R')
+ {
+ return 2;
+ }
+ // P on right: Augmentation
+ if (Right == 'P')
+ {
+ return 3;
+ }
+
+ return 4;
+}
+
+int GetFlankYFeature(char Left, char Right)
+{
+ // K or R on right: Strong suppression
+ if (Right == 'K' || Right == 'R')
+ {
+ return 0;
+ }
+ // G or P on left: Strong suppression
+ if (Left == 'G' || Left == 'P')
+ {
+ return 1;
+ }
+ // K or R on left: Augmentation
+ if (Left == 'K' || Left == 'R')
+ {
+ return 2;
+ }
+ // P on right: Augmentation
+ if (Right == 'P')
+ {
+ return 3;
+ }
+
+ return 4;
+}
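+// For instance, GetFlankBFeature('K', 'P') returns 2 (the K/R-on-the-left rule fires
+// before the P-on-the-right rule), and GetFlankYFeature('A', 'K') returns 0.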
+
+// SECTOR_COUNT
+#define GET_SECTOR(Mass) \
+if (Mass > GlobalOptions->DynamicRangeMax || Mass < GlobalOptions->DynamicRangeMin) \
+{\
+ Sector = -1;\
+}\
+else if (Mass > SectorCutoffA) \
+{\
+ Sector = 1;\
+}\
+else\
+{\
+ Sector = 0;\
+}
+
+// SECTOR_COUNT
+#define GET_CUT_SECTOR(Mass) \
+if (Mass > GlobalOptions->DynamicRangeMax || Mass < GlobalOptions->DynamicRangeMin) \
+{\
+ Sector = -1;\
+}\
+else if (Mass > SectorCutoffA) \
+{\
+ Sector = 1;\
+}\
+else\
+{\
+ Sector = 0;\
+}
+
+// Helper macro for GetPRMFeatures. NOT applicable in peptide context, only in tagging context
+#define GET_BIN_INTENSITY(Mass) \
+ Bin = (Mass + 50) / 100;\
+ if (Bin >= 0 && Bin < Spectrum->IntensityBinCount) \
+ { \
+ IntensityLevel = Spectrum->BinnedIntensityLevels[Bin]; \
+ } \
+ else \
+ { \
+ IntensityLevel = 0; \
+ }
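+// For example, a target mass of 147500 (147.5 Da on the scaled integer mass axis) maps to
+// Bin = (147500 + 50) / 100 = 1475; each bin is 0.1 Da wide.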
+
+// Given a putative prefix residue mass for an unannotated spectrum, compute features for scoring its quality.
+// Used in building a score array for blind search, and in quick-scoring tagged search.
+// This code has SIGNIFICANT OVERLAP with the code in GetCutFeatures()
+// Here, PRM is a mass (in thousandths-of-a-dalton).
+float GetPRMFeatures(MSSpectrum* Spectrum, SpectrumTweak* Tweak, BayesianModel* Model, int PRM, int VerboseFlag)
+{
+ int ParentMass;
+ int MassB;
+ int MassY;
+ int Mass;
+ int IntensityLevel;
+ int Sector;
+ int SectorCutoffA;
+ float Score = 0;
+ int Bin;
+ //Spectrum->Charge = Tweak->Charge;
+ //Spectrum->ParentMass = Tweak->ParentMass;
+ ParentMass = Tweak->ParentMass;
+ SectorCutoffA = (int)(ParentMass * 0.5 + 0.5);
+ // SECTOR_COUNT
+ if (PRM > SectorCutoffA)
+ {
+ g_PRMFeatures[SISector] = 1;
+ }
+ else
+ {
+ g_PRMFeatures[SISector] = 0;
+ }
+ MassB = PRM + DALTON;
+ MassY = ParentMass - PRM;
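+ // e.g. with ParentMass = 1000000 (1000 Da) and PRM = 400000 (400 Da), MassY = 600000;
+ // MassB is simply the PRM shifted up by one (scaled) dalton.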
+ // Compute the vector of features. Compute parent features BEFORE computing children.
+ g_PRMFeatures[SICharge] = Spectrum->Charge;
+
+ // Alterations to PRM scoring in context of a phosphopeptide search:
+ // - Don't try to use a phosphate-loss peak as a b or y peak
+ // - Give a bonus for phosphate loss peaks, maybe
+
+ // Find the intensity level for the y peak, and store it in the feature-vector:
+ GET_BIN_INTENSITY(MassY);
+ //IntensityLevel = SeizePeaks(Spectrum, MassY, 0);
+ g_PRMFeatures[IonY] = IntensityLevel;
+
+ // If the y peak is outside dynamic range, then don't adjust the score.
+ // If it's in range: Add the y node's log-probability, and subtract the null model's log-probability.
+ GET_SECTOR(MassY);
+ if (Model && Sector >= 0)
+ {
+ Score += ComputeBNProbability(Model->Nodes + IonY, IonY, g_PRMFeatures);
+ Score -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("Y\t%.1f\t\t%d\t%.2f\t%.2f\n", MassY / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonY, IonY, g_PRMFeatures),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
+ // FOR PHOSPHOPEPTIDE TAGGING:
+ if (GlobalOptions->PhosphorylationFlag)
+ {
+ GET_BIN_INTENSITY(MassY - PHOSPHATE_WATER_MASS);
+ if (IntensityLevel)
+ {
+ Score += 0.5;
+ }
+ }
+
+ // b peak:
+ //IntensityLevel = SeizePeaks(Spectrum, MassB, 0);
+ GET_BIN_INTENSITY(MassB);
+ g_PRMFeatures[IonB] = IntensityLevel;
+
+ GET_SECTOR(MassB);
+ if (Model && Sector >= 0)
+ {
+ Score += ComputeBNProbability(Model->Nodes + IonB, IonB, g_PRMFeatures);
+ Score -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("B\t%.1f\t\t%d\t%.2f\t%.2f\n", MassB / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonB, IonB, g_PRMFeatures),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
+ // FOR PHOSPHOPEPTIDE TAGGING:
+ if (GlobalOptions->PhosphorylationFlag)
+ {
+ GET_BIN_INTENSITY(MassB - PHOSPHATE_WATER_MASS);
+ if (IntensityLevel)
+ {
+ Score += 0.5;
+ }
+ }
+
+ // y isotopic peak:
+ Mass = MassY + DALTON;
+ //IntensityLevel = SeizePeaks(Spectrum, Mass, 0);
+ GET_BIN_INTENSITY(Mass);
+ g_PRMFeatures[IonYI] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ Score += ComputeBNProbability(Model->Nodes + IonYI, IonYI, g_PRMFeatures);
+ Score -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("YI\t%.1f\t\t%d\t%.2f\t%.2f\n", Mass / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonYI, IonYI, g_PRMFeatures),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
+ // b isotopic peak:
+ Mass = MassB + DALTON;
+ //IntensityLevel = SeizePeaks(Spectrum, Mass, 0);
+ GET_BIN_INTENSITY(Mass);
+ g_PRMFeatures[IonBI] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ Score += ComputeBNProbability(Model->Nodes + IonBI, IonBI, g_PRMFeatures);
+ Score -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("BI\t%.1f\t\t%d\t%.2f\t%.2f\n", Mass / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonBI, IonBI, g_PRMFeatures),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
+ // doubly-charged y:
+ Mass = (int)((MassY + HYDROGEN_MASS)/2 + 0.5);
+ //IntensityLevel = SeizePeaks(Spectrum, Mass, 0);
+ GET_BIN_INTENSITY(Mass);
+ g_PRMFeatures[IonY2] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ Score += ComputeBNProbability(Model->Nodes + IonY2, IonY2, g_PRMFeatures);
+ Score -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("Y2\t%.1f\t\t%d\t%.2f\t%.2f\n", Mass / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonY2, IonY2, g_PRMFeatures),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
+ // doubly-charged b:
+ Mass = (int)((MassB + HYDROGEN_MASS)/2 + 0.5);
+ //IntensityLevel = SeizePeaks(Spectrum, Mass, 0);
+ GET_BIN_INTENSITY(Mass);
+ g_PRMFeatures[IonB2] = IntensityLevel;
+ GET_SECTOR(Mass); // recompute the sector for the b2 mass, matching the handling of the other ion types
+ if (Model && Sector >= 0)
+ {
+ Score += ComputeBNProbability(Model->Nodes + IonB2, IonB2, g_PRMFeatures);
+ Score -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("B2\t%.1f\t\t%d\t%.2f\t%.2f\n", Mass / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonB2, IonB2, g_PRMFeatures),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
+ // Y-H2O:
+ Mass = MassY - WATER_MASS;
+ //IntensityLevel = SeizePeaks(Spectrum, Mass, 0);
+ GET_BIN_INTENSITY(Mass);
+ g_PRMFeatures[IonYH2O] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ Score += ComputeBNProbability(Model->Nodes + IonYH2O, IonYH2O, g_PRMFeatures);
+ Score -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("Y-h2o\t%.1f\t\t%d\t%.2f\t%.2f\n", Mass / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonYH2O, IonYH2O, g_PRMFeatures),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
+ // a:
+ Mass = MassB - 27000;
+ //IntensityLevel = SeizePeaks(Spectrum, Mass, IonA);
+ GET_BIN_INTENSITY(Mass);
+ g_PRMFeatures[IonA] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ Score += ComputeBNProbability(Model->Nodes + IonA, IonA, g_PRMFeatures);
+ Score -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("a\t%.1f\t\t%d\t%.2f\t%.2f\n", Mass / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonA, IonA, g_PRMFeatures),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
+ // b-H2O:
+ Mass = MassB - WATER_MASS;
+ //IntensityLevel = SeizePeaks(Spectrum, Mass, IonBH2O);
+ GET_BIN_INTENSITY(Mass);
+ g_PRMFeatures[IonBH2O] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ Score += ComputeBNProbability(Model->Nodes + IonBH2O, IonBH2O, g_PRMFeatures);
+ Score -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("b-h2o\t%.1f\t\t%d\t%.2f\t%.2f\n", Mass / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonBH2O, IonBH2O, g_PRMFeatures),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
+ // y-NH3:
+ Mass = MassY - 17000;
+ //IntensityLevel = SeizePeaks(Spectrum, Mass, IonYNH3);
+ GET_BIN_INTENSITY(Mass);
+ g_PRMFeatures[IonYNH3] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ Score += ComputeBNProbability(Model->Nodes + IonYNH3, IonYNH3, g_PRMFeatures);
+ Score -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("y-nh3\t%.1f\t\t%d\t%.2f\t%.2f\n", Mass / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonYNH3, IonYNH3, g_PRMFeatures),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
+ // b-NH3:
+ Mass = MassB - 17000;
+ //IntensityLevel = SeizePeaks(Spectrum, Mass, IonBNH3);
+ GET_BIN_INTENSITY(Mass);
+ g_PRMFeatures[IonBNH3] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ Score += ComputeBNProbability(Model->Nodes + IonBNH3, IonBNH3, g_PRMFeatures);
+ Score -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("b-nh3\t%.1f\t\t%d\t%.2f\t%.2f\n", Mass / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonBNH3, IonBNH3, g_PRMFeatures),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
+ if (GlobalOptions->PhosphorylationFlag)
+ {
+ Mass = (Spectrum->ParentMass + HYDROGEN_MASS * (Tweak->Charge - 1) - PHOSPHATE_WATER_MASS) / 2;
+ if (abs(PRM - Mass) < 1000)
+ {
+ Score = min(Score, (float)2.0);
+ }
+ }
+
+ return Score;
+}
+
+float g_BIntensity[MAX_PEPTIDE_LENGTH];
+float g_YIntensity[MAX_PEPTIDE_LENGTH];
+float g_BSkew[MAX_PEPTIDE_LENGTH];
+float g_YSkew[MAX_PEPTIDE_LENGTH];
+float g_BAbsSkew[MAX_PEPTIDE_LENGTH];
+float g_YAbsSkew[MAX_PEPTIDE_LENGTH];
+
+#define FRAGMENTATION_NORMAL 0
+#define FRAGMENTATION_PHOSPHO 1
+
+// Given a spectrum and a peptide, generate the feature-vector for each cut-point along the backbone.
+// This is separate from GetPRMFeatures, which is done in the context of a spectrum WITHOUT a peptide.
+// (The difference: here we know the peptide - and hence the flanking amino acids - and we have 5 possible sectors rather than 3.)
+//
+// Either FeatureFile is non-null (in which case we should write our feature-vectors to the file), or
+// ScoringNetwork is non-null (in which case we should save the array of cut-point probabilities)
+void GetCutFeatures(MSSpectrum* Spectrum, SpectrumTweak* Tweak, Peptide* Match,
+ BayesianModel* Model)
+{
+ int Mass;
+ int PRM;
+ int AminoIndex;
+ int Length;
+ int CutMasses[MAX_PEPTIDE_LENGTH];
+ //int BaseFlags[MAX_PEPTIDE_LENGTH];
+ //int AcidFlags[MAX_PEPTIDE_LENGTH];
+ int ModIndex;
+ int MassB;
+ int MassY;
+ int SectorCutoffA;
+ //int SectorCutoffB;
+ //int SectorCutoffC;
+ int ParentMass;
+ int FeatureValue;
+ int Sector;
+ int IntensityLevel;
+ int VerboseFlag = 0;
+ int AminoIndexY;
+ int AminoIndexB;
+ int CutFeaturesBaseIndex;
+ //
+ Spectrum->Charge = Tweak->Charge;
+ Spectrum->ParentMass = Tweak->ParentMass;
+
+ // Check whether we're using special fragmentation models.
+ // Use phosphopeptide fragmentation rules if this is Sphos or Tphos (but not for Yphos)
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (!Match->ModType[ModIndex])
+ {
+ break;
+ }
+ if (Match->ModType[ModIndex]->Flags & DELTA_FLAG_PHOSPHORYLATION && Match->Bases[Match->AminoIndex[ModIndex]]!='Y')
+ {
+ Match->SpecialFragmentation = FRAGMENTATION_PHOSPHO;
+ Match->SpecialModPosition = Match->AminoIndex[ModIndex];
+ }
+ }
+ Length = strlen(Match->Bases);
+ Mass = 0;
+ //memset(BaseFlags, 0, sizeof(int)*MAX_PEPTIDE_LENGTH);
+ //memset(AcidFlags, 0, sizeof(int)*MAX_PEPTIDE_LENGTH);
+ // Get the array of masses:
+ for (AminoIndex = 0; AminoIndex < Length; AminoIndex++)
+ {
+ Mass += PeptideMass[Match->Bases[AminoIndex]];
+
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Match->AminoIndex[ModIndex] != AminoIndex)
+ {
+ continue;
+ }
+ if (!Match->ModType[ModIndex])
+ {
+ break;
+ }
+ Mass += Match->ModType[ModIndex]->RealDelta;
+ }
+ CutMasses[AminoIndex] = Mass;
+ }
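+ // At this point CutMasses[i] holds the accumulated residue mass (including any site-specific
+ // modifications) of the first i+1 residues, i.e. the prefix residue mass at cut i.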
+ ParentMass = Mass + PARENT_MASS_BOOST;
+ // Set dynamic-range feature:
+ for (AminoIndex = 0; AminoIndex < Length-1; AminoIndex++)
+ {
+ Mass = CutMasses[AminoIndex];
+ MassB = Mass + DALTON;
+ MassY = ParentMass - Mass;
+ if (MassB < GlobalOptions->DynamicRangeMin || MassB > GlobalOptions->DynamicRangeMax)
+ {
+ // B is out.
+ if (MassY < GlobalOptions->DynamicRangeMin || MassY > GlobalOptions->DynamicRangeMax)
+ {
+ // Both out:
+ g_CutFeatures[AminoIndex * CUT_FEATURE_COUNT] = 0;
+ }
+ else
+ {
+ // Y, no B:
+ g_CutFeatures[AminoIndex * CUT_FEATURE_COUNT] = 2;
+ }
+
+ }
+ else
+ {
+ if (MassY < GlobalOptions->DynamicRangeMin || MassY > GlobalOptions->DynamicRangeMax)
+ {
+ // B, no Y:
+ g_CutFeatures[AminoIndex*CUT_FEATURE_COUNT] = 1;
+ }
+ else
+ {
+ // Both lie inside dynamic range
+ g_CutFeatures[AminoIndex*CUT_FEATURE_COUNT] = 3;
+ }
+ }
+ }
+
+ SectorCutoffA = (int)(ParentMass * 0.5 + 0.5);
+ //SectorCutoffB = (int)(ParentMass * 0.667 + 0.5);
+ //SectorCutoffC = (int)(ParentMass * 0.667 + 0.5); // SECTOR_COUNT
+ memset(g_VerboseCutFeatures, 0, sizeof(float) * MAX_PEPTIDE_LENGTH * CUT_FEATURE_COUNT);
+ memset(g_BSkew, 0, sizeof(float) * MAX_PEPTIDE_LENGTH);
+ memset(g_YSkew, 0, sizeof(float) * MAX_PEPTIDE_LENGTH);
+ memset(g_BAbsSkew, 0, sizeof(float) * MAX_PEPTIDE_LENGTH);
+ memset(g_YAbsSkew, 0, sizeof(float) * MAX_PEPTIDE_LENGTH);
+
+ // Annotate the M-P peak, if it's a phosphopeptide:
+ if (Match->SpecialFragmentation == FRAGMENTATION_PHOSPHO)
+ {
+ Mass = (ParentMass + (Tweak->Charge - 1) * HYDROGEN_MASS - PHOSPHATE_WATER_MASS) / Tweak->Charge;
+ SeizePeaks(Spectrum, Mass, IonParentLoss, -1, NULL, NULL, NULL);
+ }
+
+ // Capture ions:
+ for (AminoIndex = 0; AminoIndex < Length; AminoIndex++)
+ {
+ // We number the CUTS from 0 up through length-1. We set AminoIndexY and AminoIndexB
+ // to the length (in amino acids) of the b and y fragments, for easy reading by humans.
+ AminoIndexY = Length - AminoIndex - 1;
+ AminoIndexB = AminoIndex + 1;
+ CutFeaturesBaseIndex = AminoIndex * CUT_FEATURE_COUNT;
+ PRM = CutMasses[AminoIndex];
+ g_CutScores[AminoIndex] = 0;
+ MassB = PRM + DALTON;
+ MassY = ParentMass - PRM;
+ // Compute the vector of features. Compute parent features BEFORE computing children.
+ g_CutFeatures[CutFeaturesBaseIndex + SICharge] = Spectrum->Charge;
+ g_CutFeatures[CutFeaturesBaseIndex + SIFlankB] = GetFlankBFeature(Match->Bases[AminoIndex], Match->Bases[AminoIndex + 1]);
+ g_CutFeatures[CutFeaturesBaseIndex + SIFlankY] = GetFlankYFeature(Match->Bases[AminoIndex], Match->Bases[AminoIndex + 1]);
+ // 2+3 value sector (first cut, last cut, and three regions)
+
+ if (PRM > ParentMass * 0.5)
+ {
+ FeatureValue = 1;
+ }
+ else
+ {
+ FeatureValue = 0;
+ }
+ g_CutFeatures[CutFeaturesBaseIndex + SISector] = FeatureValue;
+
+ // Find the intensity level for the y peak, and store it in the feature-vector:
+ IntensityLevel = SeizePeaks(Spectrum, MassY, IonY, AminoIndexY, g_YIntensity + AminoIndex, g_YSkew + AminoIndex, g_YAbsSkew + AminoIndex);
+ if (Match->SpecialFragmentation && AminoIndex < Match->SpecialModPosition)
+ {
+ IntensityLevel = max(IntensityLevel, SeizePeaks(Spectrum, MassY - PHOSPHATE_WATER_MASS, IonY, AminoIndexY, g_YIntensity + AminoIndex, g_YSkew + AminoIndex, g_YAbsSkew + AminoIndex));
+ }
+ g_CutFeatures[CutFeaturesBaseIndex + IonY] = IntensityLevel;
+ // If the y peak is outside dynamic range, then don't adjust the score.
+ // If it's in range: Add the y node's log-probability, and subtract the null model's log-probability.
+ GET_SECTOR(MassY);
+ if (Model && Sector >= 0)
+ {
+ g_CutScores[AminoIndex] += ComputeBNProbability(Model->Nodes + IonY, IonY, g_CutFeatures + CutFeaturesBaseIndex);
+ g_CutScores[AminoIndex] -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ g_VerboseCutFeatures[AminoIndex*CUT_FEATURE_COUNT + IonY] = ComputeBNProbability(Model->Nodes + IonY, IonY, g_CutFeatures + CutFeaturesBaseIndex) - Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("%s\t%s\t%d\t%.1f\tY\t%d\t%.2f\t%.2f\t\n", Spectrum->Node->InputFile->FileName,
+ Match->Bases, AminoIndex, MassY / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonY, IonY, g_CutFeatures + CutFeaturesBaseIndex),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+ // b peak:
+ IntensityLevel = SeizePeaks(Spectrum, MassB, IonB, AminoIndexB, g_BIntensity + AminoIndex, g_BSkew + AminoIndex, g_BAbsSkew + AminoIndex);
+ if (Match->SpecialFragmentation && AminoIndex >= Match->SpecialModPosition)
+ {
+ IntensityLevel = max(IntensityLevel, SeizePeaks(Spectrum, MassB - PHOSPHATE_WATER_MASS, IonB, AminoIndexB, g_BIntensity + AminoIndex, g_BSkew + AminoIndex, g_BAbsSkew + AminoIndex));
+ }
+
+ g_CutFeatures[CutFeaturesBaseIndex + IonB] = IntensityLevel;
+ GET_SECTOR(MassB);
+ if (Model && Sector >= 0)
+ {
+ g_CutScores[AminoIndex] += ComputeBNProbability(Model->Nodes + IonB, IonB, g_CutFeatures + CutFeaturesBaseIndex);
+ g_CutScores[AminoIndex] -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ g_VerboseCutFeatures[AminoIndex*CUT_FEATURE_COUNT + IonB] = ComputeBNProbability(Model->Nodes + IonB, IonB, g_CutFeatures + CutFeaturesBaseIndex) - Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("%s\t%s\t%d\t%.1f\tB\t%d\t%.2f\t%.2f\t\n", Spectrum->Node->InputFile->FileName,
+ Match->Bases, AminoIndex, MassB / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonB, IonB, g_CutFeatures + CutFeaturesBaseIndex),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+ // y isotopic peak:
+ Mass = MassY + DALTON;
+ IntensityLevel = SeizePeaks(Spectrum, Mass, IonYI, AminoIndexY, 0, 0, 0);
+ g_CutFeatures[CutFeaturesBaseIndex + IonYI] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ g_CutScores[AminoIndex] += ComputeBNProbability(Model->Nodes + IonYI, IonYI, g_CutFeatures + CutFeaturesBaseIndex);
+ g_CutScores[AminoIndex] -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ g_VerboseCutFeatures[AminoIndex*CUT_FEATURE_COUNT + IonYI] = ComputeBNProbability(Model->Nodes + IonYI, IonYI, g_CutFeatures + CutFeaturesBaseIndex) - Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("%s\t%s\t%d\t%.1f\tYI\t%d\t%.2f\t%.2f\t\n", Spectrum->Node->InputFile->FileName,
+ Match->Bases, AminoIndex, Mass / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonYI, IonYI, g_CutFeatures + CutFeaturesBaseIndex),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
+ // b isotopic peak:
+ Mass = MassB + DALTON;
+ IntensityLevel = SeizePeaks(Spectrum, Mass, IonBI, AminoIndexB, 0, 0, 0);
+ g_CutFeatures[CutFeaturesBaseIndex + IonBI] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ g_CutScores[AminoIndex] += ComputeBNProbability(Model->Nodes + IonBI, IonBI, g_CutFeatures + CutFeaturesBaseIndex);
+ g_CutScores[AminoIndex] -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ g_VerboseCutFeatures[AminoIndex*CUT_FEATURE_COUNT + IonBI] = ComputeBNProbability(Model->Nodes + IonBI, IonBI, g_CutFeatures + CutFeaturesBaseIndex) - Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("%s\t%s\t%d\t%.1f\tBI\t%d\t%.2f\t%.2f\t\n", Spectrum->Node->InputFile->FileName,
+ Match->Bases, AminoIndex, Mass / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonBI, IonBI, g_CutFeatures + CutFeaturesBaseIndex),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
+ // doubly-charged y:
+ Mass = (int)((MassY + HYDROGEN_MASS)/2 + 0.5);
+ IntensityLevel = SeizePeaks(Spectrum, Mass, IonY2, AminoIndexY, 0, 0, 0);
+ g_CutFeatures[CutFeaturesBaseIndex + IonY2] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ g_CutScores[AminoIndex] += ComputeBNProbability(Model->Nodes + IonY2, IonY2, g_CutFeatures + CutFeaturesBaseIndex);
+ g_CutScores[AminoIndex] -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ g_VerboseCutFeatures[CutFeaturesBaseIndex + IonY2] = ComputeBNProbability(Model->Nodes + IonY2, IonY2, g_CutFeatures + CutFeaturesBaseIndex) - Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ if (VerboseFlag)
+ {
+ printf("%s\t%s\t%d\t%.1f\tY2\t%d\t%.2f\t%.2f\t\n", Spectrum->Node->InputFile->FileName,
+ Match->Bases, AminoIndex, Mass / (float)MASS_SCALE, IntensityLevel,
+ ComputeBNProbability(Model->Nodes + IonY2, IonY2, g_CutFeatures + CutFeaturesBaseIndex),
+ Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel]);
+ }
+ }
+
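+ // doubly-charged b: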
+ Mass = (int)((MassB + HYDROGEN_MASS)/2 + 0.5);
+ IntensityLevel = SeizePeaks(Spectrum, Mass, IonB2, AminoIndexB, 0, 0, 0);
+ g_CutFeatures[CutFeaturesBaseIndex + IonB2] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ g_CutScores[AminoIndex] += ComputeBNProbability(Model->Nodes + IonB2, IonB2, g_CutFeatures + CutFeaturesBaseIndex);
+ g_CutScores[AminoIndex] -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ g_VerboseCutFeatures[AminoIndex*CUT_FEATURE_COUNT + IonB2] = ComputeBNProbability(Model->Nodes + IonB2, IonB2, g_CutFeatures + CutFeaturesBaseIndex) - Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ }
+
+ // Y-H2O:
+ Mass = MassY - WATER_MASS;
+ IntensityLevel = SeizePeaks(Spectrum, Mass, IonYH2O, AminoIndexY, 0, 0, 0);
+ g_CutFeatures[CutFeaturesBaseIndex + IonYH2O] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ g_CutScores[AminoIndex] += ComputeBNProbability(Model->Nodes + IonYH2O, IonYH2O, g_CutFeatures + CutFeaturesBaseIndex);
+ g_CutScores[AminoIndex] -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ g_VerboseCutFeatures[AminoIndex*CUT_FEATURE_COUNT + IonYH2O] = ComputeBNProbability(Model->Nodes + IonYH2O, IonYH2O, g_CutFeatures + CutFeaturesBaseIndex) - Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ }
+
+ // a:
+ Mass = MassB - 27000;
+ IntensityLevel = SeizePeaks(Spectrum, Mass, IonA, AminoIndexB, 0, 0, 0);
+ g_CutFeatures[CutFeaturesBaseIndex + IonA] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ g_CutScores[AminoIndex] += ComputeBNProbability(Model->Nodes + IonA, IonA, g_CutFeatures + CutFeaturesBaseIndex);
+ g_CutScores[AminoIndex] -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ g_VerboseCutFeatures[AminoIndex*CUT_FEATURE_COUNT + IonA] = ComputeBNProbability(Model->Nodes + IonA, IonA, g_CutFeatures + CutFeaturesBaseIndex) - Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ }
+
+ // b-H2O:
+ Mass = MassB - WATER_MASS;
+ IntensityLevel = SeizePeaks(Spectrum, Mass, IonBH2O, AminoIndexB, 0, 0, 0);
+ g_CutFeatures[CutFeaturesBaseIndex + IonBH2O] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ g_CutScores[AminoIndex] += ComputeBNProbability(Model->Nodes + IonBH2O, IonBH2O, g_CutFeatures + CutFeaturesBaseIndex);
+ g_CutScores[AminoIndex] -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ g_VerboseCutFeatures[AminoIndex*CUT_FEATURE_COUNT + IonBH2O] = ComputeBNProbability(Model->Nodes + IonBH2O, IonBH2O, g_CutFeatures + CutFeaturesBaseIndex) - Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ }
+
+ // y-NH3:
+ Mass = MassY - 17000;
+ IntensityLevel = SeizePeaks(Spectrum, Mass, IonYNH3, AminoIndexY, 0, 0, 0);
+ g_CutFeatures[CutFeaturesBaseIndex + IonYNH3] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ g_CutScores[AminoIndex] += ComputeBNProbability(Model->Nodes + IonYNH3, IonYNH3, g_CutFeatures + CutFeaturesBaseIndex);
+ g_CutScores[AminoIndex] -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ g_VerboseCutFeatures[AminoIndex*CUT_FEATURE_COUNT + IonYNH3] = ComputeBNProbability(Model->Nodes + IonYNH3, IonYNH3, g_CutFeatures + CutFeaturesBaseIndex) - Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ }
+
+ // b-NH3:
+ Mass = MassB - 17000;
+ IntensityLevel = SeizePeaks(Spectrum, Mass, IonBNH3, AminoIndexB, 0, 0, 0);
+ g_CutFeatures[CutFeaturesBaseIndex + IonBNH3] = IntensityLevel;
+ GET_SECTOR(Mass);
+ if (Model && Sector >= 0)
+ {
+ g_CutScores[AminoIndex] += ComputeBNProbability(Model->Nodes + IonBNH3, IonBNH3, g_CutFeatures + CutFeaturesBaseIndex);
+ g_CutScores[AminoIndex] -= Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ g_VerboseCutFeatures[AminoIndex*CUT_FEATURE_COUNT + IonBNH3] = ComputeBNProbability(Model->Nodes + IonBNH3, IonBNH3, g_CutFeatures + CutFeaturesBaseIndex) - Tweak->Intensities[Sector * INTENSITY_LEVEL_COUNT + IntensityLevel];
+ }
+
+ }
+}
+
+// Take all unlabeled peaks in a radius of the target m/z. Annotate them with this ion type,
+// and return the cumulative intensity level.
+int SeizePeaks(MSSpectrum* Spectrum, int TargetMass, int IonType, int AminoIndex, float* pIntensity, float* pSkew, float *pAbsSkew)
+{
+ int PeakIndex;
+ int MaxMass;
+ float Intensity = 0;
+ int Bin;
+ float WeightedSkew = 0;
+ float AbsWeightedSkew = 0;
+ //
+ Bin = (TargetMass + 50) / 100;
+ MaxMass = TargetMass + INTENSITY_BIN_RADIUS;
+
+ // If the mass is off the scale, then you get no peaks:
+ if (Bin >= Spectrum->IntensityBinCount || Bin < 0)
+ {
+ return 0;
+ }
+ PeakIndex = Spectrum->BinPeakIndex[Bin];
+ if (PeakIndex >= 0)
+ {
+ for ( ; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ if (Spectrum->Peaks[PeakIndex].Mass > MaxMass)
+ {
+ break;
+ }
+ Intensity += Spectrum->Peaks[PeakIndex].Intensity;
+ Spectrum->Peaks[PeakIndex].IonType = IonType;
+ Spectrum->Peaks[PeakIndex].AminoIndex = AminoIndex;
+ WeightedSkew += Spectrum->Peaks[PeakIndex].Intensity * (Spectrum->Peaks[PeakIndex].Mass - TargetMass);
+ AbsWeightedSkew += Spectrum->Peaks[PeakIndex].Intensity * abs(Spectrum->Peaks[PeakIndex].Mass - TargetMass);
+ }
+ }
+ if (pIntensity)
+ {
+ *pIntensity = Intensity;
+ *pSkew = WeightedSkew;
+ *pAbsSkew = AbsWeightedSkew;
+ }
+
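+ // Map the summed intensity onto a coarse 0-3 level using the spectrum's low/medium/high cutoffs.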
+ if (Intensity < Spectrum->IntensityCutoffLow)
+ {
+ return 0;
+ }
+ if (Intensity < Spectrum->IntensityCutoffMedium)
+ {
+ return 1;
+ }
+ if (Intensity < Spectrum->IntensityCutoffHigh)
+ {
+ return 2;
+ }
+ return 3;
+}
+
+FILE* g_TrainFile2;
+FILE* g_TrainFile3;
+
+// Callback for training the PRM scorer in peptide context.
+void TrainPepPRMCallback(SpectrumNode* Node, int Charge, int ParentMass, Peptide* Annotation)
+{
+ FILE* OutputFile;
+ int AminoIndex;
+ int Length;
+ int FeatureIndex;
+ MSSpectrum* Spectrum;
+
+ Spectrum = Node->Spectrum;
+ Length = strlen(Annotation->Bases);
+ WindowFilterPeaks(Spectrum, 0, 0);
+ IntensityRankPeaks(Spectrum);
+ // Use the charge+PM oracle:
+ Node->Tweaks[0].Charge = Charge;
+ Node->Tweaks[0].ParentMass = ParentMass;
+
+ PrepareSpectrumForIonScoring(PRMModelCharge2, Node->Spectrum, 0);
+ //SpectrumComputeBinnedIntensities(Node);
+ if (Charge < 3)
+ {
+ OutputFile = g_TrainFile2;
+ }
+ else
+ {
+ OutputFile = g_TrainFile3;
+ }
+ GetCutFeatures(Node->Spectrum, Node->Tweaks, Annotation, NULL);
+
+ for (AminoIndex = 0; AminoIndex < Length; AminoIndex++)
+ {
+ for (FeatureIndex = 0; FeatureIndex < SIMax; FeatureIndex++)
+ {
+ fprintf(OutputFile, "%d\t", g_CutFeatures[AminoIndex*CUT_FEATURE_COUNT + FeatureIndex]);
+ }
+ fprintf(OutputFile, "\n");
+ }
+ fflush(OutputFile);
+}
+
+void TrainPepPRM(char* OracleFile, char* OracleDir)
+{
+ g_TrainFile2 = fopen("TrainingFiles\\PEPPRM2.txt", "w");
+ g_TrainFile3 = fopen("TrainingFiles\\PEPPRM3.txt", "w");
+ TrainOnOracleFile(OracleFile, OracleDir, TrainPepPRMCallback);
+}
+
+void ScorpionSetPRMScores(MSSpectrum* Spectrum, SpectrumTweak* Tweak)
+{
+ BayesianModel* Model;
+ int PRM;
+ float fScore;
+ //
+ // Ensure models are loaded:
+ if (!BNCharge2TaggingBN)
+ {
+ InitBayesianModels();
+ }
+ Tweak->PRMScoreMax = Tweak->ParentMass;
+ if (Spectrum->Graph)
+ {
+ Tweak->PRMScoreMax = max(Tweak->PRMScoreMax, Spectrum->Graph->LastNode->Mass);
+ }
+ Tweak->PRMScoreMax = PRM_ARRAY_SLACK + (Tweak->PRMScoreMax / PRM_BIN_SIZE);
+ SafeFree(Tweak->PRMScores);
+ Tweak->PRMScores = (int*)calloc(Tweak->PRMScoreMax + 5, sizeof(int)); // extra slack in alloc
+ if (Tweak->Charge > 2)
+ {
+ Model = BNCharge3TaggingBN;
+ }
+ else
+ {
+ Model = BNCharge2TaggingBN;
+ }
+ for (PRM = 0; PRM < Tweak->PRMScoreMax; PRM++)
+ {
+ fScore = GetPRMFeatures(Spectrum, Tweak, Model, PRM * PRM_BIN_SIZE, 0);
+ Tweak->PRMScores[PRM] = (int)(fScore * 1000);
+ }
+}
+
+void FinishPRMTestRecord(char* RememberFileName, int* Scores, int MatchCount, int* RankHistogram, char* CandidateAnnotations)
+{
+ int TrueScore;
+ int ScoreIndex;
+ int BestScore = -9999;
+ int BestScoreIndex = 0;
+ int HistogramPoint = 0;
+ //
+ // Find the best score:
+ for (ScoreIndex = 0; ScoreIndex < MatchCount; ScoreIndex++)
+ {
+ if (Scores[ScoreIndex] > BestScore)
+ {
+ BestScore = Scores[ScoreIndex];
+ BestScoreIndex = ScoreIndex;
+ }
+ }
+ TrueScore = Scores[0];
+
+ qsort(Scores, MatchCount, sizeof(int), (QSortCompare)CompareInts);
+ for (ScoreIndex = 0; ScoreIndex < MatchCount; ScoreIndex++)
+ {
+ if (Scores[ScoreIndex] <= TrueScore)
+ {
+ // Found it!
+ RankHistogram[ScoreIndex] += 1;
+ HistogramPoint = ScoreIndex;
+ break;
+ }
+ }
+ // Verbose output:
+ printf("%s\t%s\t%s\t%d\t%d\t%d\n", RememberFileName, CandidateAnnotations, CandidateAnnotations + 128*BestScoreIndex,
+ BestScore, TrueScore, HistogramPoint);
+
+}
+
+
+// TestPepPRMCallback:
+// Print the minimum, maximum, and average score of cut-point scores for this annotation.
+// The primary goal here is to evaluate whether minor changes to the PepPRM scoring
+// model, such as changing the intensity cutoffs or adding new nodes and edges,
+// improve the model's effectiveness.
+// This function also serves as a 'sanity check' that true matches are getting
+// reasonably good PepPRM scores.
+void TestPepPRMCallback(SpectrumNode* Node, int Charge, int ParentMass, Peptide* Annotation)
+{
+ float MinPRMScore = 9999;
+ float MaxPRMScore = -9999;
+ int AminoIndex;
+ int PRMCount;
+ int PRM;
+ int Len;
+ float Score;
+ float TotalScore;
+ BayesianModel* Model;
+ ////////////////////////////////////////////////////////////////////////
+ // Main
+ Node->Tweaks[0].ParentMass = Annotation->ParentMass;
+ Node->Tweaks[0].Charge = Charge;
+ Node->Spectrum->ParentMass = Annotation->ParentMass;
+ Node->Spectrum->Charge = Charge;
+ WindowFilterPeaks(Node->Spectrum, 0, 0);
+ IntensityRankPeaks(Node->Spectrum);
+ PrepareSpectrumForIonScoring(PRMModelCharge2, Node->Spectrum, 0);
+ //SpectrumComputeBinnedIntensities(Node);
+ //SpectrumComputeNoiseDistributions(Node);
+ if (Charge > 2)
+ {
+ Model = BNCharge3ScoringBN;
+ }
+ else
+ {
+ Model = BNCharge2ScoringBN;
+ }
+ PRMCount = 0;
+ PRM = 0;
+ TotalScore = 0;
+ Len = strlen(Annotation->Bases);
+ GetCutFeatures(Node->Spectrum, Node->Tweaks, Annotation, Model);
+ for (AminoIndex = 0; AminoIndex < Len; AminoIndex++)
+ {
+ Score = g_CutScores[AminoIndex];
+ TotalScore += Score;
+ MinPRMScore = min(MinPRMScore, Score);
+ MaxPRMScore = max(MaxPRMScore, Score);
+ PRMCount++;
+ }
+ Score = TotalScore / PRMCount;
+ printf("%s\t%s\t%.2f\t%.2f\t%.2f\t\n", Node->InputFile->FileName, Annotation->Bases, Score, MinPRMScore, MaxPRMScore);
+}
+
+void TestPRMQuickScoringCallback(SpectrumNode* Node, int Charge, int ParentMass, Peptide* Annotation)
+{
+ static int* Scores;
+ static int MatchCount;
+ static char* CurrentFile;
+ static int* RankHistogram;
+ static int RowsProcessed;
+ int Score;
+ int PRM;
+ int AminoIndex;
+ int ModIndex;
+ int Len;
+ BayesianModel* Model;
+ BayesianModel* PepPRMModel;
+ int Cumulative;
+ int TotalPeptides;
+ int RankIndex;
+ int PRMCount;
+ static char CandidateAnnotations[512*128];
+ static char RememberFileName[1024];
+
+ // If Node is null, then we've been called in initialize / cleanup mode:
+ if (!Node)
+ {
+ if (!Charge)
+ {
+ CurrentFile = (char*)calloc(256, sizeof(char));
+ Scores = (int*)calloc(512, sizeof(int));
+ RankHistogram = (int*)calloc(512, sizeof(int));
+ RowsProcessed = 0;
+ return;
+ }
+ // Finish the current peptide, if any:
+ if (*CurrentFile)
+ {
+ FinishPRMTestRecord(RememberFileName, Scores, MatchCount, RankHistogram, CandidateAnnotations);
+ }
+ // Now report:
+ printf("Histogram of PRM quick score pack positions:\n");
+ TotalPeptides = 0;
+ Cumulative = 0;
+ for (RankIndex = 0; RankIndex< 512; RankIndex++)
+ {
+ TotalPeptides += RankHistogram[RankIndex];
+ }
+ for (RankIndex = 0; RankIndex< 512; RankIndex++)
+ {
+ Cumulative += RankHistogram[RankIndex];
+ printf("%d\t%d\t%d\t%.2f\t%.2f\t\n", RankIndex, RankHistogram[RankIndex], Cumulative,
+ RankHistogram[RankIndex] / (float)TotalPeptides, Cumulative / (float)TotalPeptides);
+ }
+ // Lastly, free memory:
+ SafeFree(Scores);
+ SafeFree(RankHistogram);
+ SafeFree(CurrentFile);
+ return;
+ }
+ ////////////////////////////////////////////////////////////////////////
+ // Main
+ Node->Tweaks[0].ParentMass = Annotation->ParentMass;
+ Node->Tweaks[0].Charge = Charge;
+ Node->Spectrum->ParentMass = Annotation->ParentMass;
+ Node->Spectrum->Charge = Charge;
+ WindowFilterPeaks(Node->Spectrum, 0, 0);
+ IntensityRankPeaks(Node->Spectrum);
+ PrepareSpectrumForIonScoring(PRMModelCharge2, Node->Spectrum, 0);
+ //SpectrumComputeBinnedIntensities(Node);
+ //SpectrumComputeNoiseDistributions(Node);
+ // OLD PRM SCORING:
+ //SetPRMScores(Node->Spectrum);
+ RowsProcessed++;
+ if (strcmp(CurrentFile, Node->InputFile->FileName))
+ {
+ if (MatchCount)
+ {
+ FinishPRMTestRecord(RememberFileName, Scores, MatchCount, RankHistogram, CandidateAnnotations);
+ }
+ MatchCount = 0;
+ strcpy(CurrentFile, Node->InputFile->FileName);
+ }
+ strcpy(RememberFileName, Node->InputFile->FileName); // copy directly - the file name must not be used as a format string
+ // Compute score for these PRM values:
+ if (Charge > 2)
+ {
+ PepPRMModel = BNCharge3ScoringBN;
+ Model = BNCharge3TaggingBN;
+ }
+ else
+ {
+ PepPRMModel = BNCharge2ScoringBN;
+ Model = BNCharge2TaggingBN;
+ }
+ Len = strlen(Annotation->Bases);
+ PRM = 0;
+ Score = 0;
+ PRMCount = 0;
+
+ // Verify that using flanking aminos and such really does improve things:
+ GetCutFeatures(Node->Spectrum, Node->Tweaks, Annotation, PepPRMModel);
+
+ for (AminoIndex = 0; AminoIndex < Len; AminoIndex++)
+ {
+ PRM += PeptideMass[Annotation->Bases[AminoIndex]];
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Annotation->AminoIndex[ModIndex] == AminoIndex && Annotation->ModType[ModIndex])
+ {
+ PRM += Annotation->ModType[ModIndex]->RealDelta;
+ }
+ }
+ PRMCount++;
+
+
+ Score += (int)(1000 * g_CutScores[AminoIndex]);
+ }
+ Score = Score / PRMCount;
+ // Cheese to prevent running off the edge of the array:
+ if (MatchCount < 512)
+ {
+ Scores[MatchCount] = Score;
+ strcpy(CandidateAnnotations + 128 * MatchCount, Annotation->Bases);
+ MatchCount++;
+ }
+}
+
+void TestPRMQuickScoring(char* OracleFile, char* OracleDir)
+{
+ InitBayesianModels();
+ TestPRMQuickScoringCallback(NULL, 0, 0, NULL); // initialization
+ TrainOnOracleFile(OracleFile, OracleDir, TestPRMQuickScoringCallback);
+ TestPRMQuickScoringCallback(NULL, 1, 0, NULL); // completion
+}
+
+void TestPepPRM(char* OracleFile, char* OracleDir)
+{
+ InitBayesianModels();
+ TrainOnOracleFile(OracleFile, OracleDir, TestPepPRMCallback);
+}
+
+// TestLDACallback:
+void TestLDACallback(SpectrumNode* Node, int Charge, int ParentMass, Peptide* Annotation)
+{
+ BayesianModel* Model;
+ //
+ Node->Tweaks[0].ParentMass = Annotation->ParentMass;
+ Node->Tweaks[0].Charge = Charge;
+ Annotation->Tweak = Node->Tweaks;
+ Node->Spectrum->ParentMass = Annotation->ParentMass;
+ Node->Spectrum->Charge = Charge;
+ WindowFilterPeaks(Node->Spectrum, 0, 0);
+ IntensityRankPeaks(Node->Spectrum);
+ PrepareSpectrumForIonScoring(PRMModelCharge2, Node->Spectrum, 0);
+ //SpectrumComputeBinnedIntensities(Node);
+ //SpectrumComputeNoiseDistributions(Node);
+ GlobalOptions->DigestType = DIGEST_TYPE_TRYPSIN;
+ if (Charge > 2)
+ {
+ Model = BNCharge3ScoringBN;
+ }
+ else
+ {
+ Model = BNCharge2ScoringBN;
+ }
+ ScorpionSetPRMScores(Node->Spectrum, Node->Tweaks);
+
+}
+
+void TestLDA(char* OracleFile, char* OracleDir)
+{
+ InitBayesianModels();
+ TrainOnOracleFile(OracleFile, OracleDir, TestLDACallback);
+}
+
+// For debug output: Return a description of an ion type code.
+char* GetScorpIonName(int IonType)
+{
+ switch (IonType)
+ {
+ case IonY:
+ return "y";
+ case IonB:
+ return "b";
+ case IonYI:
+ return "yi";
+ case IonBI:
+ return "bi";
+ case IonY2:
+ return "y2";
+ case IonB2:
+ return "b2";
+ case IonYH2O:
+ return "y-h2o";
+ case IonA:
+ return "a";
+ case IonBH2O:
+ return "b-h2o";
+ case IonYNH3:
+ return "y-nh3";
+ case IonBNH3:
+ return "b-nh3";
+ default:
+ return "";
+ }
+}
diff --git a/Scorpion.h b/Scorpion.h
new file mode 100644
index 0000000..e530d7b
--- /dev/null
+++ b/Scorpion.h
@@ -0,0 +1,108 @@
+//Title: Scorpion.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef SCORPION_H
+#define SCORPION_H
+
+// Scorpion - Ion-based scoring of mass spectra. Compute various features
+// for use by the match-scoring SVM.
+
+#include "Inspect.h"
+#include "Trie.h"
+#include "BN.h"
+
+#define CUT_FEATURE_COUNT 32
+
+extern int g_CutFeatures[];
+
+// Features used in scoring of cut points
+typedef enum ScorpIons
+{
+ SIDynamicRange = 0,
+ IonY,
+ IonB,
+ IonYI,
+ IonBI,
+ IonY2,
+ IonB2,
+ IonYH2O,
+ IonA,
+ IonBH2O,
+ IonYNH3,
+ IonBNH3,
+ SICharge,
+ SIFlankB, // Flanking amino acids that affect N-terminal fragments
+ SIFlankY, // Flanking amino acids that affect C-terminal fragments
+ SISector,
+ //SIBasePrefix,
+ //SIAcidPrefix,
+ //SIBaseSuffix,
+ //SIAcidSuffix,
+ //SIFlankLeft,
+ //SIFlankRight,
+ SITestA,
+ SITestB,
+ SITestC,
+ SITestD,
+ SITestE,
+ SITestF,
+ SITestG,
+ SITestH,
+ SITestI,
+ SITestJ,
+ SIMax,
+ IonParentLoss
+} ScorpIons;
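+// Note: entries up through SIMax index the trained feature vector; IonParentLoss is listed
+// after SIMax and appears to be used only as a peak-annotation label (see SeizePeaks in
+// Scorpion.c), not as a scoring feature.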
+
+typedef enum CutFeature
+{
+ CFDynamic,
+ CFCharge,
+ CFFlank,
+ CFSector,
+ CFBasic,
+ CFAcidic,
+ CFY,
+ CF
+} CutFeature;
+
+void GetCutFeatures(MSSpectrum* Spectrum, SpectrumTweak* Tweak, Peptide* Match, BayesianModel* Model);
+void ScorpionSetPRMScores(MSSpectrum* Spectrum, SpectrumTweak* Tweak);
+void TestPRMQuickScoring(char* OracleFile, char* OracleDir);
+float GetExplainedIntensityPercent(MSSpectrum* Spectrum, int PeakCount, int BYOnly);
+float GetExplainedPeakPercent(MSSpectrum* Spectrum, int PeakCount, int BYOnly);
+int GetPeptideParentMass(Peptide* Match);
+void TrainPepPRM(char* OracleFile, char* OracleDir);
+void TestLDA(char* OracleFile, char* OracleDir);
+void TestPepPRM(char* OracleFile, char* OracleDir);
+char* GetScorpIonName(int IonType);
+#endif // SCORPION_H
diff --git a/SelectProteins.py b/SelectProteins.py
new file mode 100644
index 0000000..ab30cb2
--- /dev/null
+++ b/SelectProteins.py
@@ -0,0 +1,397 @@
+#Title: SelectProteins.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+Helper class for FDRUtils.py and FDR.py:
+Given an f-score cutoff, select a parsimonious collection of proteins which
+account for most / all of the annotations.
+"""
+import os
+import sys
+import traceback
+import struct
+import ResultsParser
+from Utils import *
+Initialize()
+
+class ProteinSelector(ResultsParser.ResultsParser):
+ def __init__(self):
+ self.PeptideDict = {} # aminos -> location list
+ self.ProteinPeptideCounts = {}
+ self.ProteinSpectrumCounts = {}
+ #self.SharedProteinPeptides = {}
+ #self.SharedProteinSpectra = {}
+ self.PeptideSpectrumCounts = {}
+ self.ProteinPeptides = {} # Protein -> List of aminos
+ self.ProteinNames = {}
+ self.ProteinSequences = {}
+ self.MQScoreWeight = 0.3
+ self.DeltaScoreWeight = 1.5
+ self.MinimumPeptideLength = 7
+ self.BestScoresByPeptide = {}
+ self.PValueCutoff = None
+ self.MaxFileLines = None
+ # if RetainRepresentativeCount is set, then we remember the
+ # best n spectra for a particular annotation in the dictionary
+ # self.BestRepresentatives
+ self.RetainRepresentativeCount = None
+ self.BestRepresentatives = {}
+ self.AnnotationSpectrumCounts = {}
+ self.FScoreCutoff2 = None
+ self.FScoreCutoff3 = None
+ self.Columns = ResultsParser.Columns()
+ ResultsParser.ResultsParser.__init__(self)
+
+ def FindPeptideLocations(self, Aminos):
+ PrevPos = -1
+
+ LocationList = []
+ while (1):
+ Pos = self.DB.find(Aminos, PrevPos + 1)
+ if Pos == -1:
+ break
+
+ # Which protein does Pos lie in?
+ LowIndex = 0
+ HighIndex = len(self.ProteinPos) - 1
+ # Pos >= ProteinPos[LowIndex] and Pos < ProteinPos[HighIndex]
+ # Special case - last protein:
+
+ if Pos >= self.ProteinPos[HighIndex]:
+ ProteinID = HighIndex
+ ResidueNumber = Pos - self.ProteinPos[HighIndex]
+
+ else:
+ while (1):
+ if LowIndex+1==HighIndex:
+ ProteinID = LowIndex
+ ResidueNumber = Pos - self.ProteinPos[LowIndex]
+ break
+ MidIndex = (LowIndex + HighIndex) / 2
+ if Pos >= self.ProteinPos[MidIndex]:
+ LowIndex = MidIndex
+ else:
+ HighIndex = MidIndex
+ LocationList.append((ProteinID, ResidueNumber))
+ PrevPos = Pos
+ return LocationList
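+ # Illustrative (hypothetical) usage, once LoadDB() has populated self.DB and self.ProteinPos:
+ #   Locations = Selector.FindPeptideLocations("ACDEFGK")
+ #   # -> a list of (ProteinID, ResidueNumber) pairs, one per occurrence in the concatenated database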
+ def OldFindPeptideLocations(self, Aminos):
+ LocationList = []
+ #print "Find locations for %s..."%Aminos
+ for (ID, Sequence) in self.ProteinSequences.items():
+ Pos = Sequence.find(Aminos)
+ if Pos != -1:
+ LocationList.append((ID, Pos))
+ #print "Found at pos %s in %s"%(Pos, ID)
+ if len(LocationList) == 0:
+ print "*** WARNING: Peptide '%s' not found in the database."%Aminos
+ return LocationList
+ def LoadDB(self, DBPath):
+ DBFile = open(DBPath, "rb")
+ self.DB = DBFile.read()
+ DBFile.close()
+ IndexPath = os.path.splitext(DBPath)[0] + ".index"
+ IndexFile = open(IndexPath, "rb")
+ BlockSize = struct.calcsize("<qi80s")
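+ # Each index record is packed as "<qi80s": an 8-byte field (not used by this reader),
+ # the 4-byte start offset of the protein within the .trie database, and an
+ # 80-byte null-padded protein name.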
+ ID = 0
+ PrevID = None
+ self.ProteinPos = []
+ while (1):
+ Block = IndexFile.read(BlockSize)
+ if not Block:
+ break
+ Info = struct.unpack("<qi80s", Block)
+ Name = Info[2]
+ NullPos = Name.find("\0")
+ if NullPos != -1:
+ Name = Name[:NullPos]
+ self.ProteinNames[ID]= Name
+ StartPos = Info[1]
+ self.ProteinPos.append(StartPos)
+ if PrevID != None:
+ self.ProteinSequences[PrevID] = self.DB[self.ProteinPos[PrevID]:StartPos - 1]
+ PrevID = ID
+ ID += 1
+ self.ProteinSequences[PrevID] = self.DB[self.ProteinPos[PrevID]:]
+ def LoadMultipleDB(self, DBPathList):
+ """" Given a list of DB pathnames, load all the corresponding DB """
+ ID = 0
+ self.DB = ""
+ self.ProteinPos = []
+ for DBPath in DBPathList:
+ print "loading %s"%DBPath
+ DBFile = open(DBPath, "rb")
+ OldDB = self.DB
+ self.DB += DBFile.read() # concatenate all DBs sequentially
+ DBFile.close()
+ IndexPath = os.path.splitext(DBPath)[0] + ".index"
+ IndexFile = open(IndexPath, "rb")
+ BlockSize = struct.calcsize("<qi80s")
+ PrevID = None
+ while (1):
+ Block = IndexFile.read(BlockSize)
+ if not Block:
+ break
+ Info = struct.unpack("<qi80s", Block)
+ Name = Info[2]
+ NullPos = Name.find("\0")
+ if NullPos != -1:
+ Name = Name[:NullPos]
+ self.ProteinNames[ID]= Name
+ StartPos = Info[1] + len(OldDB) # adjust StartPos for adding a new DB
+ self.ProteinPos.append(StartPos)
+ if PrevID != None:
+ self.ProteinSequences[PrevID] = self.DB[self.ProteinPos[PrevID]:StartPos - 1]
+
+ PrevID = ID
+ ID += 1
+ self.ProteinSequences[PrevID] = self.DB[self.ProteinPos[PrevID]:]
+
+ def OldLoadDB(self, DBPath):
+ """
+ Load the database, populating self.ProteinSequences
+ """
+ print "LoadDB(%s)"%DBPath
+ IndexPath = os.path.splitext(DBPath)[0] + ".index"
+ IndexFile = open(IndexPath, "rb")
+ DBFile = open(DBPath, "rb")
+ BlockSize = struct.calcsize("<qi80s")
+ PrevName = None
+ PrevID = None
+ PrevStartPos = None
+ ID = 0
+ while (1):
+ Block = IndexFile.read(BlockSize)
+ if not Block:
+ break
+ Info = struct.unpack("<qi80s", Block)
+ Name = Info[2]
+ NullPos = Name.find("\0")
+ if NullPos != -1:
+ Name = Name[:NullPos]
+ StartPos = Info[1]
+ self.ProteinNames[ID] = Name
+ if PrevName != None:
+ DBFile.seek(PrevStartPos)
+ Sequence = DBFile.read(StartPos - PrevStartPos)
+ Sequence = Sequence.replace("*", "")
+ self.ProteinSequences[PrevID] = Sequence
+ PrevName = Name
+ PrevID = ID
+ PrevStartPos = StartPos
+ ID += 1
+ if PrevName != None:
+ DBFile.seek(StartPos)
+ Sequence = DBFile.read()
+ self.ProteinSequences[PrevID] = Sequence
+ #self.ProteinNames[PrevID] = Name
+
+ DBFile.close()
+ IndexFile.close()
+ def ChooseProteins(self):
+ """
+ Iteratively select proteins which account for all the peptides.
+ """
+ self.SelectedProteins = {} # Protein -> (Peptides, Spectra)
+ self.PeptideProteins = {} # Peptide -> final selection of protein
+ print "\n\n\n"
+ print "CHOOSE PROTEINS:"
+ for (Peptide, SpectrumCount) in self.PeptideSpectrumCounts.items():
+ for (ProteinID, Pos) in self.PeptideDict[Peptide]:
+ self.ProteinSpectrumCounts[ProteinID] = self.ProteinSpectrumCounts.get(ProteinID, 0) + SpectrumCount
+
+ print "Loaded %s peptides and %s proteins"%(len(self.PeptideSpectrumCounts.keys()), len(self.ProteinSpectrumCounts.keys()))
+ while (1):
+ BestCandidate = None
+ BestScore = None
+ for Protein in self.ProteinPeptideCounts.keys():
+ if self.SelectedProteins.has_key(Protein):
+ continue
+ PeptideCount = self.ProteinPeptideCounts[Protein]
+ SpectrumCount = self.ProteinSpectrumCounts.get(Protein, 0)
+ Score = (PeptideCount, SpectrumCount)
+ #print Protein, Score
+ if Score > BestScore or (Score == BestScore and self.ProteinNames[Protein] < self.ProteinNames[BestCandidate]):
+ BestScore = Score
+ BestCandidate = Protein
+ #print "New Best %s, score %s"%(BestCandidate,BestScore)
+ if not BestScore:
+ break
+ (PeptideCount, SpectrumCount) = BestScore
+ if PeptideCount == 0:
+ break
+ #%%%
+ print "Accept protein %s (%s)\n Gets %s peptides, %s spectra"%(BestCandidate, self.ProteinNames[BestCandidate], PeptideCount, SpectrumCount)
+
+ self.SelectedProteins[BestCandidate] = BestScore
+ # Lay claim to all the (not-yet-claimed) peptides:
+ for Peptide in self.ProteinPeptides[BestCandidate]:
+ #print Peptide
+ if not self.PeptideProteins.has_key(Peptide):
+
+ self.PeptideProteins[Peptide] = BestCandidate
+ # Other proteins (if not already accepted) lose a peptide, and some spectra:
+ for (OtherProtein, Pos) in self.PeptideDict[Peptide]:
+ if self.SelectedProteins.has_key(OtherProtein):
+ continue
+
+ self.ProteinPeptideCounts[OtherProtein] -= 1
+ self.ProteinSpectrumCounts[OtherProtein] = self.ProteinSpectrumCounts.get(OtherProtein, 0) - self.PeptideSpectrumCounts[Peptide]
+ # Sanity check - the selected proteins have peptides, the unselected proteins have 0
+ for Protein in self.ProteinPeptideCounts.keys():
+ ProteinName = self.ProteinNames[Protein]
+ PeptideCount = self.ProteinPeptideCounts[Protein]
+ SpectrumCount = self.ProteinSpectrumCounts.get(Protein, 0)
+ if self.SelectedProteins.has_key(Protein) and PeptideCount <= 0:
+ print "** Warning: Selected protein %s (%s) has %s peptides!"%(Protein, ProteinName, PeptideCount)
+ if not self.SelectedProteins.has_key(Protein) and PeptideCount != 0:
+ print "** Warning: Unelected protein %s (%s) has %s peptides!"%(Protein, ProteinName, PeptideCount)
+ def ParseAnnotations(self, FileName):
+ """
+ Parse annotations, remembering all protein locations for each peptide.
+ """
+ print "Parse %s..."%FileName
+ File = open(FileName, "rb")
+ OldSpectrum = None
+ Stub = os.path.split(FileName)[1]
+ LineNumber = 0
+ for FileLine in File:
+ LineNumber += 1
+ if LineNumber % 100 == 0:
+ print "%s %s..."%(Stub, LineNumber)
+ if self.MaxFileLines != None and LineNumber >= self.MaxFileLines:
+ return # Quick-parse, for debugging only!
+
+
+ if FileLine[0] == "#":
+ self.Columns.initializeHeaders(FileLine)
+ continue
+ if not FileLine.strip():
+ continue
+ Bits = FileLine.split("\t")
+ try:
+ Spectrum = (Bits[self.Columns.getIndex("SpectrumFile")], Bits[self.Columns.getIndex("Scan#")])
+ except:
+ continue # header line
+ if Spectrum == OldSpectrum:
+ continue
+ OldSpectrum = Spectrum
+ try:
+ MQScore = float(Bits[self.Columns.getIndex("MQScore")])
+ DeltaScore = float(Bits[self.Columns.getIndex("DeltaScoreOther")])
+ Charge = int(Bits[self.Columns.getIndex("Charge")])
+ except:
+ traceback.print_exc()
+ print Bits
+ continue
+ # Apply a threshold: EITHER f-score cutoff (default) OR p-value cutoff
+
+ if self.PValueCutoff != None:
+
+ try:
+ PValue = float(Bits[self.Columns.getIndex("InspectFDR")])
+ except:
+ traceback.print_exc()
+ print Bits
+ continue
+ PeptideScore = (-PValue, MQScore)
+ if PValue > self.PValueCutoff:
+
+ continue
+ else:
+ PValue = None # f-score mode: no p-value column is read; default it so the representative bookkeeping below never hits an unbound name
+ if Charge < 3:
+ WeightedScore = self.MQScoreWeight * MQScore + self.DeltaScoreWeight * (DeltaScore / self.MeanDeltaScore2)
+ if WeightedScore < self.FScoreCutoff2:
+
+ continue
+ else:
+ WeightedScore = self.MQScoreWeight * MQScore + self.DeltaScoreWeight * (DeltaScore / self.MeanDeltaScore3)
+ if WeightedScore < self.FScoreCutoff3:
+
+ continue
+ PeptideScore = WeightedScore
+
+
+ try:
+ Peptide = GetPeptideFromModdedName(Bits[self.Columns.getIndex("Annotation")])
+ except:
+
+ continue
+ if len(Peptide.Aminos) < self.MinimumPeptideLength:
+
+ continue
+ # Remember this peptide:
+
+ if not self.PeptideDict.get(Peptide.Aminos):
+
+ # It's a new peptide! Figure out where it falls in the database:
+ LocationList = self.FindPeptideLocations(Peptide.Aminos)
+ for (Protein, Pos) in LocationList:
+ if not self.ProteinPeptides.has_key(Protein):
+ self.ProteinPeptides[Protein] = []
+ #print "Adding peptide %s to protein %s '%s':"%(Peptide.Aminos,Protein,self.ProteinNames[Protein])
+ self.ProteinPeptides[Protein].append(Peptide.Aminos)
+
+ self.PeptideDict[Peptide.Aminos] = LocationList
+ for (ProteinNumber, Dummy) in LocationList:
+ self.ProteinPeptideCounts[ProteinNumber] = self.ProteinPeptideCounts.get(ProteinNumber, 0) + 1
+ else:
+ # We've seen this peptide before:
+ LocationList = self.PeptideDict[Peptide.Aminos]
+ OldScore = self.BestScoresByPeptide.get(Peptide.Aminos, -9999)
+ self.BestScoresByPeptide[Peptide.Aminos] = max(PeptideScore, OldScore)
+ self.PeptideSpectrumCounts[Peptide.Aminos] = self.PeptideSpectrumCounts.get(Peptide.Aminos, 0) + 1
+ ##############################################################
+ # Populate self.BestRepresentative, if requested:
+ if self.RetainRepresentativeCount:
+ Peptide.MQScore = MQScore
+ Peptide.PValue = PValue
+ Peptide.SpectrumFilePath = Bits[0]
+ Peptide.ScanNumber = int(Bits[1])
+ Peptide.SpectrumFilePos = int(Bits[self.Columns.getIndex("SpecFilePos")])
+ Key = Peptide.GetFullModdedName()
+ RepresentativeList = self.BestRepresentatives.get(Key, [])
+ Tuple = (PeptideScore, Peptide)
+ RepresentativeList.append(Tuple)
+ RepresentativeList.sort()
+ self.BestRepresentatives[Key] = RepresentativeList[-self.RetainRepresentativeCount:]
+ self.AnnotationSpectrumCounts[Key] = self.AnnotationSpectrumCounts.get(Key, 0) + 1
+
+
+if __name__ == "__main__":
+ # Test
+ Bob = ProteinSelector()
+ Bob.LoadDB("database\DictyCommon.Aug28.FS2.trie")
+ print Bob.FindPeptideLocations("GTVESEMAEQDSLLNKLNK")
+ print Bob.FindPeptideLocations("TSEGDFTLLLGQIVDNQIGDLNKSG")
+ print Bob.FindPeptideLocations("YAVFAPGLADVVIEVVAK")
diff --git a/ShuffleDB.py b/ShuffleDB.py
new file mode 100644
index 0000000..3ec586d
--- /dev/null
+++ b/ShuffleDB.py
@@ -0,0 +1,285 @@
+#Title: ShuffleDB.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+Shuffle all the records in a database. Useful for generating a database of entirely bogus
+records, to estimate the rate at which matches of a given quality arise when there is
+nothing real to match, or a database of half valid and half invalid records.
+"""
+import os
+import sys
+import string
+import getopt
+import struct
+import random
+
+UsageInfo = """
+ShuffleDB - Produce a 'red herring' database of erroneous peptide records.
+Options:
+ -r [Trie file name]: Path to input database
+ -w [FileName]: Path to output database
+ -s: If set, proteins will be REVERSED. Default behavior is to SHUFFLE.
+ -b: If set, ONLY the scrambled proteins are written out. Default
+ behavior is to write both forward and scrambled proteins.
+ -p: In shuffled mode, avoid repeating peptides of length 8 or more
+    in the shuffled database. (I and L are treated as identical;
+    Q and K are not.) Takes somewhat longer to run, and some
+    "bad words" (repeated 8-mers) will still be seen for repetitive
+    records.
+ -t: Number of shuffled copies to write out (defaults to 1)
+
+Example:
+ ShuffleDB.py -r database\Shewanella.trie -w database\ShewanellaHalf.trie -p
+"""
+
+##def EncodeQuad(Str):
+## ValA = ord(Str[0]) - 65
+## ValB = ord(Str[1]) - 65
+## ValC = ord(Str[2]) - 65
+## ValD = ord(Str[3]) - 65
+## return ValA + 32*ValB + 32*32*ValC + 32*32*32*ValD
+##def EncodeOct(Str):
+## return struct.pack("<ii", EncodeQuad(Str[:4]), EncodeQuad(Str[4:]))
+##def DecodeOct(Str):
+## (ValA, ValB) = struct.unpack("<ii", Str)
+## return DecodeQuad(ValA) + DecodeQuad(ValB)
+##def DecodeQuad(Value):
+## ValA = Value % 32
+## Value -= ValA
+## Value /= 32
+## ValB = Value % 32
+## Value -= ValB
+## Value /= 32
+## ValC = Value % 32
+## Value -= ValC
+## Value /= 32
+## ValD = Value
+## print ValA, ValB, ValC, ValD
+## return chr(ValA + 65) + chr(ValB + 65) + chr(ValC + 65) + chr(ValD + 65)
+
+class Shuffler:
+ MAX_SHUFFLE_ATTEMPTS = 100
+ PARANOIA_PEPTIDE_LENGTH = 8
+ def __init__(self):
+ self.ShuffleFlag = 1
+ self.WriteBothFlag = 1
+ self.TrieFileName = None
+ self.OutputFileName = None
+ self.BogusTimes = 1
+ self.BogusProteins = 0
+ self.ParanoiaFlag = 0
+ self.TotalBadWordCount = 0
+ def LoadProteinNames(self, IndexPath):
+ File = open(IndexPath, "rb")
+ self.ProteinNames = []
+ BlockSize = struct.calcsize("<qi80s")
+ while (1):
+ Block = File.read(BlockSize)
+ if not Block:
+ break
+ Tuple = struct.unpack("<qi80s", Block)
+ Name = Tuple[-1]
+ NullPos = Name.find('\0')
+ if NullPos != -1:
+ Name = Name[:NullPos]
+ self.ProteinNames.append(Name)
+ File.close()
+ def Main(self):
+ IndexPath = os.path.splitext(self.TrieFileName)[0] + ".index"
+ self.LoadProteinNames(IndexPath)
+ TrieFile = open(self.TrieFileName, "rb")
+ if self.ParanoiaFlag:
+ self.ForbiddenPeptides = {}
+ DB = TrieFile.read()
+ for Pos in range(len(DB) - self.PARANOIA_PEPTIDE_LENGTH):
+ if Pos % 10000 == 0:
+ print "%s (%s)..."%(Pos, len(self.ForbiddenPeptides.keys()))
+ Peptide = DB[Pos:Pos + self.PARANOIA_PEPTIDE_LENGTH].replace("I", "L")
+ if Peptide.find("X")!=-1:
+ # Peptides containing X need not be forbidden, because they will
+ # never be matched!
+ continue
+ self.ForbiddenPeptides[Peptide] = 1
+ TrieFile.seek(0)
+ print "(Note: %s peptide words are forbidden)"%len(self.ForbiddenPeptides.keys())
+ NewIndexPath = os.path.splitext(self.OutputFileName)[0] + ".index"
+ self.OutputTrieFile = open(self.OutputFileName, "wb")
+ self.OutputIndexFile = open(NewIndexPath, "wb")
+ Sequence = ""
+ ProteinIndex = 0
+ while (1):
+ Data = TrieFile.read(1024)
+ if not Data:
+ break
+ Sequence += Data
+ Pos = Sequence.find("*")
+ while (Pos != -1):
+ self.WriteProtein(Sequence[:Pos], ProteinIndex)
+ ProteinIndex += 1
+ Sequence = Sequence[Pos+1:]
+ Pos = Sequence.find("*")
+ if (Sequence):
+ self.WriteProtein(Sequence, ProteinIndex)
+ ProteinIndex += 1
+ #List = list(Sequence)
+ #List.reverse()
+ #Protein = string.join(List,"")
+ #ReversedTrieFile.write(Protein)
+ #ReversedTrieFile.write("*")
+ self.OutputTrieFile.close()
+ self.OutputIndexFile.close()
+ print "Wrote out %s proteins."%ProteinIndex
+ print "Wrote out %d bogus proteins."%self.BogusProteins
+ print "Total bad words:", self.TotalBadWordCount
+ def ShuffleProtein(self, Sequence):
+ """
+ Produce the invalid (shuffled) version of a protein.
+ """
+ Residues = list(Sequence)
+ if not self.ShuffleFlag:
+ Residues.reverse()
+ return string.join(Residues, "")
+ if not self.ParanoiaFlag:
+ random.shuffle(Residues)
+ return string.join(Residues, "")
+ # And now, the interesting case: We shall shuffle the protein, and we shall apply some
+ # heuristics along the way to minimize the number of shared k-mers.
+ BestBadWordCount = 9999
+ BestPeptideString = None
+ for AttemptIndex in range(10):
+ random.shuffle(Residues)
+ BadWordCount = 0
+ for Pos in range(len(Residues) - self.PARANOIA_PEPTIDE_LENGTH):
+ WordResidues = Residues[Pos:Pos + self.PARANOIA_PEPTIDE_LENGTH]
+ Word = string.join(WordResidues, "").replace("I", "L")
+ if self.ForbiddenPeptides.has_key(Word):
+ # Damn, this shuffling shares a word! Maybe we can re-shuffle this
+ # word and solve the problem:
+ FixedFlag = 0
+ for WordShuffleIndex in range(10):
+ random.shuffle(WordResidues)
+ FixedWord = string.join(WordResidues, "").replace("I", "L")
+ if self.ForbiddenPeptides.has_key(FixedWord):
+ # The shuffled word is no good!
+ continue
+ # We shuffled a word, and in so doing, we changed the preceding
+ # words. Let's check whether they're now forbidden:
+ BrokeOldWord = 0
+ for StepsBack in range(1, self.PARANOIA_PEPTIDE_LENGTH):
+ PrevPos = Pos - StepsBack
+ if PrevPos < 0:
+ break
+ PrevWord = Residues[PrevPos:Pos]
+ PrevWord.extend(WordResidues[:-StepsBack])
+ PrevWord = string.join(PrevWord, "").replace("I", "L")
+ if self.ForbiddenPeptides.has_key(PrevWord):
+ BrokeOldWord = 1
+ #print "Preceding word %s is '%s': FORBIDDEN!"%(StepsBack, PrevWord)
+ break
+ #print "Preceding word %s is '%s'"%(StepsBack, PrevWord)
+ if not BrokeOldWord:
+ FixedFlag = 1
+ break
+ if FixedFlag:
+ # This word (and the previous words that overlap it) is now ok.
+ Residues[Pos:Pos + self.PARANOIA_PEPTIDE_LENGTH] = WordResidues
+ else:
+ # We couldn't fix the word by shuffling it. Increment the bad word count:
+ BadWordCount += 1
+ if BadWordCount == 0:
+ #print "Protein '%s...' shuffled with no bad words"%(Sequence[:20])
+ return string.join(Residues, "")
+ if BadWordCount < BestBadWordCount:
+ BestBadWordCount = BadWordCount
+ BestPeptideString = string.join(Residues, "")
+ print "Protein '%s...' shuffled with %s bad words"%(Sequence[:20], BestBadWordCount)
+ self.TotalBadWordCount += BestBadWordCount
+ return BestPeptideString
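+ # For reference, the "bad word" test above slides a length-8 window over the shuffled
+ # residues (with I mapped to L) and looks it up in self.ForbiddenPeptides. The same check
+ # in isolation, as a sketch (Residues is a list of single characters):
+ ##def CountBadWords(Residues, ForbiddenPeptides, WordLength = 8):
+ ## Count = 0
+ ## for Pos in range(len(Residues) - WordLength):
+ ## Word = string.join(Residues[Pos:Pos + WordLength], "").replace("I", "L")
+ ## if ForbiddenPeptides.has_key(Word):
+ ## Count += 1
+ ## return Count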
+ def WriteProtein(self, Sequence, ProteinIndex):
+ """
+ Given a protein sequence, and protein index number (for looking up the name),
+ write a scrambled or reversed record to the output database. (And write the
+ original too, unless the -b flag was specified.)
+ """
+ for ShuffleIndex in range(self.BogusTimes):
+ ShuffledProtein = self.ShuffleProtein(Sequence)
+ OutputFilePos = self.OutputTrieFile.tell()
+ self.OutputTrieFile.write(ShuffledProtein)
+ self.OutputTrieFile.write("*")
+ if self.BogusTimes > 1:
+ ShuffledName = "XXX.%d.%s"%(ShuffleIndex, self.ProteinNames[ProteinIndex])
+ else:
+ ShuffledName = "XXX.%s"%self.ProteinNames[ProteinIndex]
+ Block = struct.pack("<qi80s", 0, OutputFilePos, ShuffledName)
+ self.OutputIndexFile.write(Block)
+ self.BogusProteins +=1
+ # If we're writing both the red herrings and the originals,
+ # then write the original protein as well now:
+ if self.WriteBothFlag:
+ OutputFilePos = self.OutputTrieFile.tell()
+ self.OutputTrieFile.write(Sequence)
+ self.OutputTrieFile.write("*")
+ Block = struct.pack("<qi80s", 0, OutputFilePos, self.ProteinNames[ProteinIndex])
+ self.OutputIndexFile.write(Block)
+ def ParseCommandLine(self, Arguments):
+ (Options, Args) = getopt.getopt(Arguments, "r:w:sbt:p")
+ OptionsSeen = {}
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-r":
+ self.TrieFileName = Value
+ elif Option == "-w":
+ self.OutputFileName = Value
+ elif Option == "-s":
+ self.ShuffleFlag = 0
+ elif Option == "-b":
+ self.WriteBothFlag = 0
+ elif Option == "-t":
+ self.BogusTimes = int(Value)
+ elif Option == "-p":
+ self.ParanoiaFlag = 1
+ else:
+ print "** Warning: Option %s not understood"%Option
+
+if __name__ == "__main__":
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "* Warning: psyco not found"
+ App = Shuffler()
+ App.ParseCommandLine(sys.argv[1:])
+ if not App.TrieFileName or not App.OutputFileName:
+ print UsageInfo
+ sys.exit(-1)
+ App.Main()
diff --git a/SpectralSimilarity.py b/SpectralSimilarity.py
new file mode 100644
index 0000000..0010612
--- /dev/null
+++ b/SpectralSimilarity.py
@@ -0,0 +1,502 @@
+#Title: SpectralSimilarity.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+This is an auxiliary module for ComputePTMFeatures.py. It measures the similarity between
+two spectra. The simplest case is a comparison of two spectra with the same annotation.
+We are also able to compare two spectra which differ by a PTM (provided the PTM position
+is known).
+"""
+
+import MSSpectrum
+import Label
+import math
+import os
+import sys
+from Utils import *
+Initialize()
+
+class SpectralSimilarity:
+ """
+ Container class that holds the spectra, and measures similarity
+ ASSUMPTION: Either AnnotationA and AnnotationB are the same,
+ or AnnotationA is modified and AnnotationB is NOT.
+ """
+ def __init__(self, SpectrumA, SpectrumB, AnnotationA, AnnotationB):
+ """
+ SpectrumA and SpectrumB are SpectrumClass objects from MSSpectrum.py
+ AnnotationA and AnnotationB are strings of the Inspect annotation of the
+ spectrum (if the annotation was R.THISPEPK.M this method would get
+ passed "THISPEPK")
+ """
+ self.BinMultiplier = 1.0 # default
+ # Spectra can be either MSSpectrum objects or file paths:
+ if isinstance(SpectrumA, MSSpectrum.SpectrumClass):
+ self.SpectrumA = SpectrumA
+ else:
+ Spectrum = MSSpectrum.SpectrumClass()
+ Spectrum.ReadPeaks(SpectrumA)
+ self.SpectrumA = Spectrum
+ self.SpectrumA.FilterPeaks()
+ self.SpectrumA.RankPeaksByIntensity()
+ if isinstance(SpectrumB, MSSpectrum.SpectrumClass):
+ self.SpectrumB = SpectrumB
+ else:
+ Spectrum = MSSpectrum.SpectrumClass()
+ Spectrum.ReadPeaks(SpectrumB)
+ self.SpectrumB = Spectrum
+ self.SpectrumB.FilterPeaks()
+ self.SpectrumB.RankPeaksByIntensity()
+ # Annotations can be either a string, or a peptide object:
+ if isinstance(AnnotationA, PeptideClass):
+ self.PeptideA = AnnotationA
+ else:
+ self.PeptideA = GetPeptideFromModdedName(AnnotationA)
+ if isinstance(AnnotationB, PeptideClass):
+ self.PeptideB = AnnotationB
+ else:
+ self.PeptideB = GetPeptideFromModdedName(AnnotationB)
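+ # Minimal usage sketch (hypothetical file names; spectra and annotations may also be
+ # passed in as objects, as handled above):
+ ##Sim = SpectralSimilarity("ModifiedSpectrum.dta", "UnmodifiedSpectrum.dta", "SAM+16MY", "SAMMY")
+ ##Sim.LabelPeaks()
+ ##print Sim.DotProduct()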
+ def ComputeSimilarity(self):
+ """
+ This method determines how similar two spectra are. It can use a variety of methods,
+ and honestly this may turn into a big switch box
+ """
+ DPSimScore = self.DotProduct()
+ print "The Scaled Dot Product of these two is %f"%DPSimScore
+ def LabelPeaks(self, PeakTolerance = 0.5):
+ """
+ Should be called once, *if* the annotations differ, so that we can match
+ corresponding peaks which have different masses due to PTM attachment.
+ """
+ # Label the spectra so that I can know which peaks belong to what name eg. b7
+ Label.LabelSpectrum(self.SpectrumA, self.PeptideA, PeakTolerance)
+ Label.LabelSpectrum(self.SpectrumB, self.PeptideB, PeakTolerance)
+ def DotProductSignal(self, BinMultiplier = 1.0, MaxIntensityRank = 50, EnableShift = 1, VerboseFlag = 0, HashByRank = 0):
+ """
+ Variant of Dot Product, incorporating a correction factor introduced in [Parag and Mallick, 2006ish]
+ """
+ self.MaxIntensityRank = MaxIntensityRank
+ self.BinMultiplier = BinMultiplier
+ self.HashByRank = HashByRank
+ # set up hashes
+ HashA = {}
+ HashB = {}
+ # Populate HashA and HashB:
+ if EnableShift and len(self.PeptideA.Modifications) > 0:
+ self.HashPeaksWithShift(HashA, self.SpectrumA, self.PeptideA, VerboseFlag)
+ else:
+ self.HashPeaks(HashA, self.SpectrumA)
+ if EnableShift and len(self.PeptideB.Modifications) > 0:
+ self.HashPeaksWithShift(HashB, self.SpectrumB, self.PeptideB, VerboseFlag)
+ else:
+ self.HashPeaks(HashB, self.SpectrumB)
+ #Do Dot Product
+ MaxBins = max(HashA.keys())
+ MaxBins = max(MaxBins, max(HashB.keys()))
+ DotProduct = 0
+ SumSqA = 0
+ SumSqB = 0
+ TotalIntensityA = 0
+ TotalIntensityB = 0
+ for I in range(MaxBins):
+ A = HashA.get(I, 0)
+ B = HashB.get(I, 0)
+ TotalIntensityA += A
+ TotalIntensityB += B
+ if VerboseFlag and (A or B):
+ print "%s\t%s\t%s\t%s\t"%(I, A, B, A*B)
+ DotProduct += A * B
+ SumSqA += A * A
+ SumSqB += B * B
+ #print "Dot Product is %f"%DotProduct
+ #print "sqrt thing is %f"%math.sqrt(SumSqA * SumSqB)
+ OddsCollision = 1.0 / (max(self.SpectrumA.ParentMass, self.SpectrumB.ParentMass) * BinMultiplier)
+ DotProduct -= TotalIntensityA * TotalIntensityB * OddsCollision
+ return DotProduct / math.sqrt(SumSqA * SumSqB)
+ def DotProduct(self, BinMultiplier = 1.0, MaxIntensityRank = 50, EnableShift = 1, VerboseFlag = 0, HashByRank = 0):
+ """
+ This method measures similarity between spectra by calculating their dot product.
+ It is written to work on spectra that might be PTM-shifted versions of each other,
+ e.g. SAMMY and SAM+16MY. If the peptide annotation has a PTM, then the peaks
+ are shifted back.
+ Variables:
+ HashByRank = indicates that you want a rank-based dot product instead of an intensity-based one
+ """
+ self.MaxIntensityRank = MaxIntensityRank
+ self.BinMultiplier = BinMultiplier
+ self.HashByRank = HashByRank
+ # set up hashes
+ HashA = {}
+ HashB = {}
+ # Populate HashA and HashB:
+ if EnableShift and len(self.PeptideA.Modifications) > 0:
+ self.HashPeaksWithShift(HashA, self.SpectrumA, self.PeptideA, VerboseFlag)
+ else:
+ self.HashPeaks(HashA, self.SpectrumA)
+ if EnableShift and len(self.PeptideB.Modifications) > 0:
+ self.HashPeaksWithShift(HashB, self.SpectrumB, self.PeptideB, VerboseFlag)
+ else:
+ self.HashPeaks(HashB, self.SpectrumB)
+ #Do Dot Product
+ MaxBins = max(HashA.keys())
+ MaxBins = max(MaxBins, max(HashB.keys()))
+ DotProduct = 0
+ SumSqA = 0
+ SumSqB = 0
+ for I in range(MaxBins):
+ A = HashA.get(I, 0)
+ B = HashB.get(I, 0)
+ if VerboseFlag and (A or B):
+ print "%s\t%s\t%s\t%s\t%s\t"%(I, I/self.BinMultiplier, A, B, A*B)
+ DotProduct += A * B
+ SumSqA += A * A
+ SumSqB += B * B
+ #print "Dot Product is %f"%DotProduct
+ #print "sqrt thing is %f"%math.sqrt(SumSqA * SumSqB)
+ return DotProduct / math.sqrt(SumSqA * SumSqB)
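+ # The value returned above is the cosine similarity of the two binned intensity vectors:
+ # Sum(A[i]*B[i]) / sqrt(Sum(A[i]^2) * Sum(B[i]^2)), which is 1.0 for identical binned
+ # spectra and approaches 0 when few bins overlap. Tiny check with made-up bins:
+ ##A = {100: 3.0, 200: 1.0}
+ ##B = {100: 3.0, 200: 1.0}
+ ##Dot = sum([A[Bin] * B.get(Bin, 0) for Bin in A.keys()])
+ ##Norm = math.sqrt(sum([V * V for V in A.values()]) * sum([V * V for V in B.values()]))
+ ##print Dot / Norm # 1.0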
+ def HashPeaksWithShift(self, Hash, Spectrum, Peptide, VerboseFlag = 0):
+ """
+ Takes a peptide, spectrum, and hashtable
+ and puts values into the hash so that it can be dotproducted (now there's a verb
+ from a noun!)
+ The caveat is that we shift peaks by the mass of the PTM.
+ WARNING: Currently written for only ONE PTM per peptide
+ """
+ ModIndices = Peptide.Modifications.keys()
+ ModList = Peptide.Modifications[ModIndices[0]]
+ FirstMod = ModList[0]
+ ModMass = FirstMod.Mass
+ ModIndex = ModIndices[0] + 1
+ PeptideLength = len(Peptide.Aminos)
+ for Peak in Spectrum.Peaks:
+ if self.MaxIntensityRank != None and Peak.IntensityRank > self.MaxIntensityRank:
+ continue
+ Bin = int(round(Peak.Mass * self.BinMultiplier)) # default
+ if Peak.IonType:
+ if Peak.IonType.Name in ("b", "b-h2o", "b-nh3", "b-h2o-h2o", "b-h2o-nh3", "a"):
+ if Peak.PeptideIndex >= ModIndex:
+ Bin = int(round((Peak.Mass - ModMass) * self.BinMultiplier))
+ if VerboseFlag:
+ print "Peak at %s is %s %s, shift left to %s"%(Peak.Mass, Peak.IonType.Name, Peak.PeptideIndex, Bin)
+ if Peak.IonType.Name in ("b2",):
+ if Peak.PeptideIndex >= ModIndex:
+ Bin = int(round((Peak.Mass - ModMass/2.0) * self.BinMultiplier))
+ if VerboseFlag:
+ print "Peak at %s is %s %s, shift halfleft to %s"%(Peak.Mass, Peak.IonType.Name, Peak.PeptideIndex, Bin)
+ if Peak.IonType.Name in ("y", "y-h2o", "y-nh3", "y-h2o-nh3", "y-h2o-h2o"):
+ if (PeptideLength - Peak.PeptideIndex) < ModIndex:
+ Bin = int(round((Peak.Mass - ModMass) * self.BinMultiplier))
+ if VerboseFlag:
+ print "Peak at %s is %s %s, shift right to %s"%(Peak.Mass, Peak.IonType.Name, Peak.PeptideIndex, Bin)
+ if Peak.IonType.Name in ("y2",):
+ if (PeptideLength - Peak.PeptideIndex) < ModIndex:
+ Bin = int(round((Peak.Mass - ModMass/2.0) * self.BinMultiplier))
+ if VerboseFlag:
+ print "Peak at %s is %s %s, shift halfright to %s"%(Peak.Mass, Peak.IonType.Name, Peak.PeptideIndex, Bin)
+ Value = Peak.Intensity
+ if self.HashByRank:
+ Value = 10.0 / (10 + Peak.IntensityRank)
+ Hash[Bin] = Hash.get(Bin, 0) + Value
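+ # Worked example of the shift above (illustrative numbers): for annotation "SAM+16MY",
+ # ModIndex is 3 and ModMass is 16, so a singly charged b ion with PeptideIndex 4 observed
+ # at mass 510.2 is hashed at round(510.2 - 16) = 494, and a y ion covering the modified M
+ # (PeptideLength - PeptideIndex < ModIndex) is shifted down by 16 as well. Peaks that do
+ # not span the modification, such as a b ion with PeptideIndex 2, keep their original bins,
+ # and the doubly charged ion types ("b2", "y2") are shifted by half the modification mass.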
+ def HashPeaks(self, Hash, Spectrum):
+ """
+ Hashes Peak intensities into integer bins
+ """
+ for Peak in Spectrum.Peaks:
+ if self.MaxIntensityRank != None and Peak.IntensityRank > self.MaxIntensityRank:
+ continue # only deal with the good peaks
+ Bin = int(round(Peak.Mass * self.BinMultiplier))
+ Value = Peak.Intensity
+ if self.HashByRank:
+ Value = 10.0 / (10 + Peak.IntensityRank)
+ Hash[Bin] = Hash.get(Bin, 0) + Value
+ def GetSharedPeakCount(self, PeakWeight, RankWeight, SkewMultiplier = 1.0,
+ PeakCountDivisor = 40, EnableShift = 1, VerboseFlag = 0):
+ """
+ Measure the shared peak count between two spectra.
+ Iterate over the top N peaks of spectrum A (where N = ParentMass / PeakCountDivisor).
+ If the peak is present (modulo epsilon) as one of the top N peaks in spectrum B,
+ then receive score PeakWeight + RankWeight / IntensityRank. The sum of these
+ scores is then scaled relative to the maximum attainable score.
+ """
+ SkewMultipliers = []
+ for X in range(5):
+ SkewMultipliers.append(SkewMultiplier ** X)
+ self.BinMultiplier = 1.0
+ SortedPeaksA = []
+ for Peak in self.SpectrumA.Peaks:
+ SortedPeaksA.append((Peak.Intensity, Peak))
+ SortedPeaksA.sort()
+ SortedPeaksA.reverse()
+ N = int(round(self.SpectrumA.ParentMass / float(PeakCountDivisor)))
+ #print "N = %s/%s = %s"%(self.SpectrumA.ParentMass, PeakCountDivisor, N)
+ self.SpectrumB.RankPeaksByIntensity()
+ # Populate HashedPeaks[Bin] = list of peaks in SpectrumB
+ # which fall into Bin. Only peaks
+ # with rank <= N are hashed.
+ HashedPeaks = {}
+ PeptideLength = len(self.PeptideB.Aminos)
+ ModIndices = self.PeptideB.Modifications.keys()
+ if len(ModIndices):
+ ModIndex = ModIndices[0] + 1
+ ModMass = self.PeptideB.Modifications[ModIndices[0]][0].Mass
+ else:
+ ModIndex = None
+ for Peak in self.SpectrumB.Peaks:
+ if Peak.IntensityRank > N:
+ continue
+ Peak.ShiftedMass = Peak.Mass
+ if EnableShift and ModIndex != None and Peak.IonType:
+ if Peak.IonType.Name in ("b", "b-h2o", "b-nh3", "b-h2o-h2o", "b-h2o-nh3", "a"):
+ if Peak.PeptideIndex >= ModIndex:
+ Peak.ShiftedMass -= ModMass
+ if Peak.IonType.Name in ("b2",):
+ if Peak.PeptideIndex >= ModIndex:
+ Peak.ShiftedMass -= (ModMass / 2.0)
+ if Peak.IonType.Name in ("y", "y-h2o", "y-nh3", "y-h2o-nh3", "y-h2o-h2o"):
+ if (PeptideLength - Peak.PeptideIndex) < ModIndex:
+ Peak.ShiftedMass -= ModMass
+ if Peak.IonType.Name in ("y2",):
+ if (PeptideLength - Peak.PeptideIndex) < ModIndex:
+ Peak.ShiftedMass -= (ModMass / 2.0)
+ Bin = int(round(Peak.ShiftedMass * self.BinMultiplier))
+ if not HashedPeaks.has_key(Bin):
+ HashedPeaks[Bin] = []
+ HashedPeaks[Bin].append(Peak)
+## ########################
+## # Debug peak hashing:
+## Keys = HashedPeaks.keys()
+## Keys.sort()
+## for Key in Keys:
+## Str = "%s: "%Key
+## for Peak in HashedPeaks[Key]:
+## Str += "(#%d %.1f, %.2f)"%(Peak.IntensityRank, Peak.Mass, Peak.Intensity)
+## print Str
+## ########################
+ OverallScore = 0
+ MaxScore = 0
+ PeptideLength = len(self.PeptideA.Aminos)
+ ModIndices = self.PeptideA.Modifications.keys()
+ if len(ModIndices):
+ ModIndex = ModIndices[0] + 1
+ ModMass = self.PeptideA.Modifications[ModIndices[0]][0].Mass
+ else:
+ ModIndex = None
+ for PeakIndex in range(min(N, len(SortedPeaksA))):
+ Peak = SortedPeaksA[PeakIndex][1]
+ Peak.ShiftedMass = Peak.Mass
+ if EnableShift and ModIndex != None and Peak.IonType:
+ if Peak.IonType.Name in ("b", "b-h2o", "b-nh3", "b-h2o-h2o", "b-h2o-nh3", "a"):
+ if Peak.PeptideIndex >= ModIndex:
+ Peak.ShiftedMass -= ModMass
+ if Peak.IonType.Name in ("b2",):
+ if Peak.PeptideIndex >= ModIndex:
+ Peak.ShiftedMass -= (ModMass / 2.0)
+ if Peak.IonType.Name in ("y", "y-h2o", "y-nh3", "y-h2o-nh3", "y-h2o-h2o"):
+ if (PeptideLength - Peak.PeptideIndex) < ModIndex:
+ Peak.ShiftedMass -= ModMass
+ if Peak.IonType.Name in ("y2",):
+ if (PeptideLength - Peak.PeptideIndex) < ModIndex:
+ Peak.ShiftedMass -= (ModMass / 2.0)
+ Bin = int(round(Peak.ShiftedMass * self.BinMultiplier))
+ BestPeak = None
+ BestScore = 0
+ for NearBin in (Bin - 1, Bin, Bin + 1):
+ PeakList = HashedPeaks.get(NearBin, [])
+ for PeakB in PeakList:
+ Skew = abs(Peak.ShiftedMass - PeakB.ShiftedMass)
+ SkewDeciDaltons = int(round(abs(Skew) * 10))
+ if SkewDeciDaltons >= 5:
+ continue
+ #Score = PeakWeight / float(RankWeight + Peak.IntensityRank + PeakB.IntensityRank)
+ Score = PeakWeight*10 + RankWeight*10 / (10.0 + Peak.IntensityRank)
+ Score *= SkewMultipliers[SkewDeciDaltons]
+ if Score > BestScore:
+ BestScore = Score
+ BestPeak = PeakB
+ BestPeakScoreMultiplier = SkewMultipliers[SkewDeciDaltons]
+ if VerboseFlag:
+ Str = "PeakA #%d %.1f (bin %d):"%(Peak.IntensityRank, Peak.Mass, Bin)
+ if Peak.IonType:
+ Str += " (%s %s)"%(Peak.IonType.Name, Peak.PeptideIndex)
+ print Str
+ if BestPeak:
+ Str = " Best near peak #%d %.1f ==> %s"%(BestPeak.IntensityRank, BestPeak.Mass, BestScore)
+ if BestPeak.IonType:
+ Str += " (%s %s)"%(BestPeak.IonType.Name, BestPeak.PeptideIndex)
+ print Str
+ OverallScore += BestScore
+ MaxScore += PeakWeight*10 + RankWeight*10 / (10.0 + Peak.IntensityRank)
+ #MaxScore += PeakWeight / float(RankWeight + Peak.IntensityRank + Peak.IntensityRank)
+ return OverallScore / float(MaxScore)
+ def ComputeCorrelationCoefficient(self, BinMultiplier = 1.0, MaxIntensityRank = 50, EnableShift = 1, VerboseFlag = 0, HashByRank = 0):
+ """
+ Compute similarity between two spectra by computing the
+ correlation coefficient of the binned intensities.
+ """
+ self.BinMultiplier = BinMultiplier
+ self.MaxIntensityRank = MaxIntensityRank
+ self.HashByRank = HashByRank
+ # set up hashes
+ HashA = {}
+ HashB = {}
+ # Populate HashA and HashB:
+ if EnableShift and len(self.PeptideA.Modifications) > 0:
+ self.HashPeaksWithShift(HashA, self.SpectrumA, self.PeptideA, VerboseFlag)
+ else:
+ self.HashPeaks(HashA, self.SpectrumA)
+ if EnableShift and len(self.PeptideB.Modifications) > 0:
+ self.HashPeaksWithShift(HashB, self.SpectrumB, self.PeptideB, VerboseFlag)
+ else:
+ self.HashPeaks(HashB, self.SpectrumB)
+ MinBin = min(HashA.keys())
+ MinBin = min(MinBin, min(HashB.keys()))
+ MaxBin = max(HashA.keys())
+ MaxBin = max(MaxBin, max(HashB.keys()))
+ TotalA = 0
+ TotalB = 0
+ BinCount = MaxBin - MinBin + 1
+ for Bin in range(MinBin, MaxBin + 1):
+ A = HashA.get(Bin, 0)
+ B = HashB.get(Bin, 0)
+ TotalA += A
+ TotalB += B
+ MeanA = TotalA / float(BinCount)
+ MeanB = TotalB / float(BinCount)
+ if VerboseFlag:
+ print "MeanA %s over %s bins"%(MeanA, BinCount)
+ print "MeanB %s over %s bins"%(MeanB, BinCount)
+ SigmaSumA = 0
+ SigmaSumB = 0
+ for Bin in range(MinBin, MaxBin + 1):
+ A = HashA.get(Bin, 0)
+ B = HashB.get(Bin, 0)
+ SigmaSumA += (A - MeanA)**2
+ SigmaSumB += (B - MeanB)**2
+ VarianceA = SigmaSumA / float(BinCount)
+ SigmaA = math.sqrt(VarianceA)
+ VarianceB = SigmaSumB / float(BinCount)
+ SigmaB = math.sqrt(VarianceB)
+ if VerboseFlag:
+ print "A has variance %s stddev %s"%(VarianceA, SigmaA)
+ print "B has variance %s stddev %s"%(VarianceB, SigmaB)
+ CovarianceSum = 0
+ for Bin in range(MinBin, MaxBin + 1):
+ A = HashA.get(Bin, 0)
+ B = HashB.get(Bin, 0)
+ CovarianceSum += (A - MeanA) * (B - MeanB)
+ Covariance = CovarianceSum / float(BinCount - 1)
+ CorrelationCoefficient = Covariance / (SigmaA * SigmaB)
+ if VerboseFlag:
+ print "Covariance %s, corr.coeff %s"%(Covariance, CorrelationCoefficient)
+ return CorrelationCoefficient
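+ # The statistic above is the Pearson correlation of the two binned intensity vectors:
+ # r = Cov(A, B) / (SigmaA * SigmaB), with Cov computed over the N = MaxBin - MinBin + 1
+ # bins using an N - 1 divisor while the standard deviations use N; as a result two
+ # identical spectra score slightly above 1.0 for small N.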
+
+def Test():
+ FileName = "..\mzxml\Dicty-HeavyCells-11.mzXML"
+ FileHandle = open(FileName,"rb")
+ FileHandle.seek(60145805)
+ S1 = MSSpectrum.SpectrumClass()
+ S1.ReadPeaksFromFile(FileHandle,FileName) #it also sets the PrecursorMZ
+ Annotation1 = "MKRKLLK"
+
+ FileName = "..\mzxml\Dicty-HeavyCells-13.mzXML"
+ FileHandle = open(FileName,"rb")
+ FileHandle.seek(59307113)
+ S2 = MSSpectrum.SpectrumClass()
+ S2.ReadPeaksFromFile(FileHandle,FileName) #it also sets the PrecursorMZ
+ Annotation2 = "MKRKLLK"
+
+ FileName = "..\mzxml\Dicty-HeavyCells-12.mzXML"
+ FileHandle = open(FileName,"rb")
+ FileHandle.seek(87683754)
+ S3 = MSSpectrum.SpectrumClass()
+ S3.ReadPeaksFromFile(FileHandle,FileName) #it also sets the PrecursorMZ
+ Annotation3 = "MKIFIIK"
+
+ FileName = "..\mzxml\Dicty-HeavyCells-05.mzXML"
+ FileHandle = open(FileName,"rb")
+ FileHandle.seek(102201432)
+ S5 = MSSpectrum.SpectrumClass()
+ S5.ReadPeaksFromFile(FileHandle,FileName) #it also sets the PrecursorMZ
+ #S5.FilterPeaks()
+ Annotation5 = "NWNGQPVGVPQGQYANMNYAR"
+
+ FileName = "..\mzxml\Dicty-HeavyCells-12.mzXML"
+ FileHandle = open(FileName,"rb")
+ FileHandle.seek(112303085)
+ S6 = MSSpectrum.SpectrumClass()
+ S6.ReadPeaksFromFile(FileHandle,FileName) #it also sets the PrecursorMZ
+ Annotation6 = "NWNGQPVGVPQGQYANMNYAR+14"
+
+ FileName = "..\mzxml\Dicty-HeavyCells-12.mzXML"
+ FileHandle = open(FileName,"rb")
+ FileHandle.seek(111782847)
+ S7 = MSSpectrum.SpectrumClass()
+ S7.ReadPeaksFromFile(FileHandle,FileName) #it also sets the PrecursorMZ
+ Annotation7 = "NWNGQPVGVPQGQYANMNYAR"
+
+ Simm = SpectralSimilarity(S5,S5,Annotation5,Annotation5)
+ Simm.SpectralAlignment()
+
+if __name__ == "__main__":
+ # Command-line arguments: Two spectra, then two annotations.
+ if len(sys.argv)<5:
+ print "Not enough arguments:", sys.argv
+ sys.exit(-1)
+ Comparator = SpectralSimilarity(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
+ Comparator.LabelPeaks()
+ Result = Comparator.GetSharedPeakCount(1, 0, 0.9, PeakCountDivisor = 20, VerboseFlag = 1)
+ print "Shared 1 0 0.8:", Result
+ print "\n\n"
+ Result = Comparator.GetSharedPeakCount(0, 1, 0.9, PeakCountDivisor = 20, VerboseFlag = 1)
+ print "Shared 0 1 0.8:", Result
+ print "\n\n"
+
+## Result = Comparator.GetSharedPeakCount(0, 1, PeakCountDivisor = 20, VerboseFlag = 1)
+## print "Shared 0 1 1.0:", Result
+## sys.exit(0)
+## print "Shared 0 1 0.66:", Comparator.GetSharedPeakCount(0, 1, 0.66, PeakCountDivisor = 5)
+## print "Shared 0 1 0.66:", Comparator.GetSharedPeakCount(0, 1, 0.66, PeakCountDivisor = 50)
+##
+## Result = Comparator.DotProduct(VerboseFlag = 1)
+## print "Dot product similarity score 1.0:", Result
+## Result = Comparator.DotProduct(2.0, VerboseFlag = 1)
+## print "Dot product similarity score 2.0:", Result
+ Result = Comparator.DotProduct(0.5, VerboseFlag = 1, HashByRank = 1)
+ print "Dot product similarity score 0.5:", Result
+## print "Shared 1 0:", Comparator.GetSharedPeakCount(1, 0)
+## print "Shared 0 1:", Comparator.GetSharedPeakCount(0, 1)
+##
+## print "Shared 1 1:", Comparator.GetSharedPeakCount(1, 1)
+## print "Cov/corr:", Comparator.ComputeCorrelationCoefficient()
+ Command = "label.py \"%s\" %s"%(sys.argv[1], sys.argv[3])
+ os.system(Command)
+ Command = "label.py \"%s\" %s"%(sys.argv[2], sys.argv[4])
+ os.system(Command)
diff --git a/Spectrum.c b/Spectrum.c
new file mode 100644
index 0000000..43da9eb
--- /dev/null
+++ b/Spectrum.c
@@ -0,0 +1,1487 @@
+//Title: Spectrum.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include "Spectrum.h"
+#include "Utils.h"
+#include "Inspect.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <string.h>
+#include <math.h>
+#include "Tagger.h"
+#include "Errors.h"
+#include "ParseXML.h"
+
+#define INITIAL_PEAK_COUNT 100
+#define INITIAL_PRM_PEAK_COUNT 500
+
+#define MINIMUM_ALLOWED_PARENT_MASS GLYCINE_MASS
+#define MAXIMUM_ALLOWED_PARENT_MASS 6000*DALTON
+
+// This should be MORE than enough peaks for any realistic spectrum.
+// If there are more than this, we refuse to parse them all, so there.
+#define MAX_PEAKS_PER_SPECTRUM 10000
+
+/////////////////////////////////////////////////////////////////////////////////
+// Forward declarations:
+int SpectrumLoadHeaderLine(MSSpectrum* Spectrum, char* LineBuffer);
+void AttemptParentMassPeakRemoval(MSSpectrum* Spectrum);
+
+/////////////////////////////////////////////////////////////////////////////////
+// Functions:
+
+void SpectrumComputeSignalToNoise(MSSpectrum* Spectrum)
+{
+ int IntensePeakIndex;
+ int MedianPeakIndex;
+ int PeakIndex;
+ float Signal = 0;
+ float Noise = 0;
+ //
+ IntensePeakIndex = min(5, Spectrum->PeakCount) / 2;
+ MedianPeakIndex = Spectrum->PeakCount / 2;
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ if (Spectrum->Peaks[PeakIndex].IntensityRank == IntensePeakIndex)
+ {
+ Signal = Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ if (Spectrum->Peaks[PeakIndex].IntensityRank == MedianPeakIndex)
+ {
+ Noise = Spectrum->Peaks[PeakIndex].Intensity;
+ }
+ }
+ Spectrum->SignalToNoise = Signal / (float)max(1.0, Noise);
+}
+
+// Remove peaks that are not reasonably high for their mass window.
+// If WindowWidth and KeepCount are <= 0, use reasonable defaults.
+void WindowFilterPeaks(MSSpectrum* Spectrum, float WindowWidth, int KeepCount)
+{
+ int FilterPeakIndex;
+ int NewIndex;
+ int OtherPeakIndex;
+ float* Intensities;
+ int Neighbors;
+ float WindowStart;
+ float WindowEnd;
+ int FilteredCount = 0;
+ //
+ if (Spectrum->UnfilteredPeaks)
+ {
+ // We've already performed window filtering; don't do it again!
+ return;
+ }
+ if (WindowWidth <= 0)
+ {
+ WindowWidth = DEFAULT_WINDOW_WIDTH;
+ }
+ if (KeepCount <= 0)
+ {
+ KeepCount = DEFAULT_WINDOW_KEEP_COUNT;
+ }
+
+ //
+ Intensities = (float*)calloc(Spectrum->PeakCount, sizeof(float));
+ for (FilterPeakIndex = 0; FilterPeakIndex < Spectrum->PeakCount; FilterPeakIndex++)
+ {
+ WindowStart = Spectrum->Peaks[FilterPeakIndex].Mass - (WindowWidth / (float)2.0);
+ WindowEnd = Spectrum->Peaks[FilterPeakIndex].Mass + (WindowWidth / (float)2.0);
+ Neighbors = 0;
+ for (OtherPeakIndex = 0; OtherPeakIndex < Spectrum->PeakCount; OtherPeakIndex++)
+ {
+ if (Spectrum->Peaks[OtherPeakIndex].Mass > WindowEnd)
+ {
+ break;
+ }
+ if (Spectrum->Peaks[OtherPeakIndex].Mass > WindowStart)
+ {
+ Intensities[Neighbors] = Spectrum->Peaks[OtherPeakIndex].Intensity;
+ Neighbors++;
+ }
+ }
+ qsort(Intensities, Neighbors, sizeof(float), (QSortCompare)CompareFloats);
+ if (Neighbors < KeepCount || Spectrum->Peaks[FilterPeakIndex].Intensity >= Intensities[KeepCount - 1])
+ {
+ Spectrum->Peaks[FilterPeakIndex].FilterScore = 1;
+ FilteredCount++;
+ }
+ }
+ SafeFree(Intensities);
+ // New array:
+ Spectrum->UnfilteredPeakCount = Spectrum->PeakCount;
+ Spectrum->UnfilteredPeaks = Spectrum->Peaks;
+ Spectrum->PeakCount = FilteredCount;
+ Spectrum->Peaks = (SpectralPeak*)calloc(FilteredCount, sizeof(SpectralPeak));
+ NewIndex = 0;
+ for (FilterPeakIndex = 0; FilterPeakIndex < Spectrum->UnfilteredPeakCount; FilterPeakIndex++)
+ {
+ if (Spectrum->UnfilteredPeaks[FilterPeakIndex].FilterScore)
+ {
+ memcpy(Spectrum->Peaks + NewIndex, Spectrum->UnfilteredPeaks + FilterPeakIndex, sizeof(SpectralPeak));
+ Spectrum->Peaks[NewIndex].Index = NewIndex;
+ NewIndex++;
+ }
+ }
+
+}
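+// Intent of the filter above: for each peak, gather the intensities of all peaks within a
+// window of WindowWidth (same scaled mass units as Peak.Mass) centered on that peak, and keep
+// the peak only if it ranks among the KeepCount strongest intensities in its window (peaks in
+// sparsely populated windows are always kept). The surviving peaks are then copied into a
+// fresh Peaks array while the original array is preserved as UnfilteredPeaks.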
+
+// Sort from MOST to LEAST intense:
+int ComparePeaksIntensity(const SpectralPeak* PeakA, const SpectralPeak* PeakB)
+{
+ if (PeakA->Intensity < PeakB->Intensity)
+ {
+ return 1;
+ }
+ if (PeakA->Intensity > PeakB->Intensity)
+ {
+ return -1;
+ }
+ return 0;
+
+}
+
+int ComparePeaksByMass(const SpectralPeak* PeakA, const SpectralPeak* PeakB)
+{
+ if (PeakA->Mass < PeakB->Mass)
+ {
+ return -1;
+ }
+ if (PeakA->Mass > PeakB->Mass)
+ {
+ return 1;
+ }
+ return 0;
+}
+
+void IntensityRankPeaks(MSSpectrum* Spectrum)
+{
+ int PeakIndex;
+ //
+ qsort(Spectrum->Peaks, Spectrum->PeakCount, sizeof(SpectralPeak), (QSortCompare)ComparePeaksIntensity);
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ Spectrum->Peaks[PeakIndex].IntensityRank = PeakIndex;
+ }
+ qsort(Spectrum->Peaks, Spectrum->PeakCount, sizeof(SpectralPeak), (QSortCompare)ComparePeaksByMass);
+ SpectrumComputeSignalToNoise(Spectrum);
+}
+
+
+void FreeMatchList(SpectrumNode* Spectrum)
+{
+ Peptide* MatchNode;
+ Peptide* MatchPrev = NULL;
+ for (MatchNode = Spectrum->FirstMatch; MatchNode; MatchNode = MatchNode->Next)
+ {
+ if (MatchPrev)
+ {
+ FreePeptideNode(MatchPrev);
+ }
+ MatchPrev = MatchNode;
+ }
+ if (MatchPrev)
+ {
+ FreePeptideNode(MatchPrev);
+ }
+ Spectrum->MatchCount = 0;
+ Spectrum->FirstMatch = NULL;
+ Spectrum->LastMatch = NULL;
+}
+
+void FreeSpectrum(MSSpectrum* Spectrum)
+{
+ if (!Spectrum)
+ {
+ return;
+ }
+ SafeFree(Spectrum->UnfilteredPeaks);
+ SafeFree(Spectrum->Peaks);
+ if (Spectrum->Graph)
+ {
+ FreeTagGraph(Spectrum->Graph);
+ Spectrum->Graph = NULL;
+ }
+ SafeFree(Spectrum->BinnedIntensities);
+ SafeFree(Spectrum->BinnedIntensitiesTight);
+ SafeFree(Spectrum->BinnedIntensityLevels);
+ SafeFree(Spectrum->BinPeakIndex);
+ SafeFree(Spectrum->IntensityThresholds);
+ SafeFree(Spectrum->IonScoringNoiseProbabilities);
+ SafeFree(Spectrum);
+}
+
+// Constructor: Allocate and return a spectrum
+MSSpectrum* NewSpectrum()
+{
+ MSSpectrum* Spectrum;
+ // Allocate a spectrum, with a reasonable amount of space to store peaks
+ Spectrum = (MSSpectrum*)calloc(1, sizeof(MSSpectrum));
+ Spectrum->Peaks = (SpectralPeak*)calloc(INITIAL_PEAK_COUNT, sizeof(SpectralPeak));
+ Spectrum->PeakAllocation = INITIAL_PEAK_COUNT;
+ return Spectrum;
+}
+
+int SpectrumAddPeak(MSSpectrum* Spectrum, float Mass, float Intensity)
+{
+ int OldAllocation;
+
+ // If necessary, reallocate:
+ if (Spectrum->PeakCount > MAX_PEAKS_PER_SPECTRUM)
+ {
+
+ if (Spectrum->Node->InputFile)
+ {
+ REPORT_ERROR_IS(31, Spectrum->Node->ScanNumber, Spectrum->Node->InputFile->FileName);
+ }
+ else
+ {
+ REPORT_ERROR_IS(31, Spectrum->Node->ScanNumber, "??");
+ }
+ return 0;
+ }
+ if (Spectrum->PeakCount == Spectrum->PeakAllocation)
+ {
+
+ OldAllocation = Spectrum->PeakAllocation;
+ Spectrum->PeakAllocation = max(200, Spectrum->PeakAllocation * 2);
+
+ if (OldAllocation)
+ {
+ Spectrum->Peaks = (SpectralPeak*)realloc(Spectrum->Peaks, sizeof(SpectralPeak) * Spectrum->PeakAllocation);
+ memset(Spectrum->Peaks + OldAllocation, 0, sizeof(SpectralPeak) * (Spectrum->PeakAllocation - OldAllocation));
+
+ }
+ else
+ {
+ Spectrum->Peaks = (SpectralPeak*)calloc(Spectrum->PeakAllocation, sizeof(SpectralPeak));
+ }
+ }
+ ROUND_MASS(Mass, Spectrum->Peaks[Spectrum->PeakCount].Mass);
+ //Spectrum->Peaks[Spectrum->PeakCount].Mass = Mass;
+ Spectrum->Peaks[Spectrum->PeakCount].Intensity = Intensity;
+ Spectrum->Peaks[Spectrum->PeakCount].FilterScore = 0; // init
+ Spectrum->Peaks[Spectrum->PeakCount].NoisePenalty = 0; // init
+ Spectrum->Peaks[Spectrum->PeakCount].PercentIntensity = 0; // init
+ memset(Spectrum->Peaks[Spectrum->PeakCount].IsotopeNeighbors, -1, sizeof(int)*MAX_ISOTOPE_NEIGHBORS);
+ memset(Spectrum->Peaks[Spectrum->PeakCount].NoiseNeighbors, -1, sizeof(int)*MAX_NOISY_NEIGHBORS);
+ //Log("Added peak %d to spectrum. (Alloc size is now %d)\n", Spectrum->PeakCount, Spectrum->PeakAllocation);
+ Spectrum->PeakCount++;
+ Spectrum->MaxIntensity = max(Spectrum->MaxIntensity, Intensity);
+ return 1;
+}
+
+// Handle the header line for .dta, .pkl, .ms2 formats.
+int SpectrumLoadHeaderLine(MSSpectrum* Spectrum, char* LineBuffer)
+{
+ char* StrA;
+ char* StrB;
+ char* StrC;
+ float Mass;
+ int Charge;
+ // Default case: The first line should be the parent mass and the charge.
+ StrA = strtok(LineBuffer, WHITESPACE);
+ if (!StrA || !*StrA)
+ {
+
+ return 0;
+ }
+
+ StrB = strtok(NULL, WHITESPACE);
+ if (!StrB)
+ {
+
+ return 0;
+ }
+
+ StrC = strtok(NULL, WHITESPACE);
+
+
+ // MS2 file: Z, charge, parent-mass.
+ if (!strcmp(StrA, "Z"))
+ {
+
+ Mass = (float)atof(StrC);
+ ROUND_MASS(Mass, Spectrum->ParentMass);
+ if (Spectrum->ParentMass < MINIMUM_ALLOWED_PARENT_MASS ||
+ Spectrum->ParentMass > MAXIMUM_ALLOWED_PARENT_MASS)
+ {
+ if (Spectrum->Node && Spectrum->Node->InputFile)
+ {
+ REPORT_ERROR_IIS(42, Spectrum->ParentMass / MASS_SCALE, Spectrum->Node->FilePosition, Spectrum->Node->InputFile->FileName);
+ }
+ else
+ {
+ REPORT_ERROR_I(43, Spectrum->ParentMass / MASS_SCALE);
+ }
+ return 0;
+ }
+ Charge = atoi(StrB);
+ if (Charge > 6)
+ {
+ printf("** Invalid charge '%s' - maximum is 6\n", StrB);
+ return 0;
+ }
+ Spectrum->Charge = Charge;
+ Spectrum->FileChargeFlag = 1;
+ Spectrum->FileCharge[Charge] = 1;
+ Spectrum->MZ = (Spectrum->ParentMass + (Charge - 1) * HYDROGEN_MASS) / Charge;
+ Spectrum->FileMZ = Spectrum->MZ;
+ return 1;
+ }
+
+ // Header line of a PKL file: precursor mz, precursor intensity, and charge.
+ if (StrC)
+ {
+
+ Mass = (float)atof(StrA);
+ ROUND_MASS(Mass, Spectrum->ParentMass);
+ if (Spectrum->ParentMass < MINIMUM_ALLOWED_PARENT_MASS ||
+ Spectrum->ParentMass > MAXIMUM_ALLOWED_PARENT_MASS)
+ {
+ printf("** Error in SpectrumLoadFromFile: Mass %.2f not legal.\n", Mass);
+ return 0;
+ }
+ Charge = atoi(StrC);
+ if (Charge > 6)
+ {
+ printf("** Invalid charge '%s' - maximum is 6\n", StrC);
+ return 0;
+ }
+ Spectrum->FileCharge[Charge] = 1;
+ Spectrum->FileChargeFlag = 1;
+ Spectrum->Charge = Charge;
+ Spectrum->FileMZ = Spectrum->ParentMass;
+ if (Charge)
+ {
+ Spectrum->ParentMass = (Spectrum->ParentMass * Charge) - (Charge - 1)*HYDROGEN_MASS;
+ }
+ }
+ else
+ {
+ // DTA file:
+ Mass = (float)atof(StrA);
+ if (Mass < 1)
+ {
+ // Invalid header line - the mass can't be zero!
+ return 0;
+ }
+ ROUND_MASS(Mass, Spectrum->ParentMass);
+ Charge = atoi(StrB);
+
+
+ if (!Charge)
+ {
+ Spectrum->MZ = Spectrum->ParentMass;
+ Spectrum->FileMZ = Spectrum->MZ;
+ Spectrum->ParentMass = 0;
+
+ }
+ else
+ {
+ // The file's mass is the residue mass + 19, which is the parent mass.
+ Spectrum->FileCharge[Charge] = 1;
+ Spectrum->FileChargeFlag = 1;
+ Spectrum->Charge = Charge;
+ Spectrum->FileMZ = (Spectrum->ParentMass + (Charge - 1) * HYDROGEN_MASS) / Charge;
+ Spectrum->MZ = Spectrum->FileMZ;
+ //Spectrum->ParentMass -= HYDROGEN_MASS; // remove one H+
+ //Spectrum->ParentMass = (float)atof(StrA) - HYDROGEN_MASS; // remove one H+
+ }
+ }
+ return 1;
+}
+
+int SpectrumLoadCDTAHeaderLine(MSSpectrum* Spectrum, char* LineBuffer)
+{
+
+ char* StrA;
+ char* StrB;
+
+ float Mass;
+ int Charge;
+ // Default case: The first line should be the parent mass and the charge.
+ StrA = strtok(LineBuffer, WHITESPACE);
+ if (!StrA || !*StrA)
+ {
+
+ return 0;
+ }
+
+ StrB = strtok(NULL, WHITESPACE);
+ if (!StrB)
+ {
+
+ return 0;
+ }
+
+
+ Mass = (float)atof(StrA);
+ if (Mass < 1)
+ {
+ // Invalid header line - the mass can't be zero!
+ return 0;
+ }
+ ROUND_MASS(Mass, Spectrum->ParentMass);
+ Charge = atoi(StrB);
+
+
+ if (!Charge)
+ {
+ Spectrum->MZ = Spectrum->ParentMass;
+ Spectrum->FileMZ = Spectrum->MZ;
+ Spectrum->ParentMass = 0;
+
+ }
+ else
+ {
+ if(Charge <= 0 || Charge >= 6)
+ return 0;
+ // The file's mass is the residue mass + 19, which is the parent mass.
+ Spectrum->FileCharge[Charge] = 1;
+ Spectrum->FileChargeFlag = 1;
+ Spectrum->Charge = Charge;
+ Spectrum->FileMZ = (Spectrum->ParentMass + (Charge - 1) * HYDROGEN_MASS) / Charge;
+ Spectrum->MZ = Spectrum->FileMZ;
+
+ //Spectrum->ParentMass -= HYDROGEN_MASS; // remove one H+
+ //Spectrum->ParentMass = (float)atof(StrA) - HYDROGEN_MASS; // remove one H+
+ }
+
+ return 1;
+}
+
+int GuessSpectralCharge(MSSpectrum* Spectrum)
+{
+ int PeakIndex;
+ float MeanMass = 0;
+ int Charge;
+ int BestDiff = 9999999;
+ int BestCharge = 2;
+ int ParentMass;
+ int Diff;
+ // Compute the MEDIAN peak mass:
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ if (PeakIndex >= Spectrum->PeakCount / 2)
+ {
+ MeanMass = (float)Spectrum->Peaks[PeakIndex].Mass;
+ break;
+ }
+ }
+ //MeanMass /= Spectrum->PeakCount;
+ // Use the charge that brings the parent mass as close as possible to the median peak mass x 2
+ for (Charge = 1; Charge < 10; Charge++)
+ {
+ ParentMass = (Spectrum->MZ * Charge) - (HYDROGEN_MASS * (Charge - 1));
+ Diff = abs(ParentMass - (int)(MeanMass*2));
+ if (Diff < BestDiff)
+ {
+ BestDiff = Diff;
+ BestCharge = Charge;
+ }
+ }
+ return BestCharge;
+}
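+// The guess above relies on ParentMass = MZ * Charge - HYDROGEN_MASS * (Charge - 1): each
+// candidate charge from 1 to 9 implies a parent mass, and the charge whose implied mass lies
+// closest to twice the median fragment mass wins, since b/y fragment masses tend to cluster
+// around half the precursor mass. Illustrative (unscaled) example: MZ 500 with a median
+// fragment mass near 740 implies charge 3 (parent ~1498) rather than charge 2 (parent ~999).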
+
+// Initial parent mass computation. We do it LATER if our charge is 0 (unknown).
+void SpectrumComputeParentMass(MSSpectrum* Spectrum)
+{
+ //
+ if (!Spectrum->Charge)
+ {
+ return; // We'll tweak later!
+ }
+ else
+ {
+
+ Spectrum->MZ = (Spectrum->ParentMass + (Spectrum->Charge - 1)*HYDROGEN_MASS) / Spectrum->Charge;
+ Spectrum->FileMass = Spectrum->ParentMass;
+ }
+}
+
+// Return FALSE when we're done loading
+int SpectrumHandleMS2ColonLine(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ char* ValueStr;
+ float ParentMass;
+ float Mass;
+ float Intensity;
+ MSSpectrum* Spectrum;
+ int Charge;
+ int Result;
+ //
+ Spectrum = (MSSpectrum*)UserData;
+
+ if (LineBuffer[0] == ':')
+ {
+ if (Spectrum->PeakCount)
+ {
+ // We've loaded some peaks already, so this is the scan number of
+ // the NEXT scan.
+ return 0;
+ }
+ else
+ {
+ // We know our scan number already, so we do nothing.
+ return 1;
+ }
+ }
+
+ // If we don't know our MZ yet, then we load it now. Otherwise we add a peak.
+ if (!Spectrum->MZ)
+ {
+ ValueStr = strtok(LineBuffer, WHITESPACE);
+ if (!ValueStr)
+ {
+ return 1; // INVALID LINE, stop now
+ }
+ ParentMass = (float)(atof(ValueStr) * MASS_SCALE);
+ ValueStr = strtok(NULL, WHITESPACE);
+ if (!ValueStr)
+ {
+ return 0; // INVALID LINE, stop now
+ }
+ Charge = atoi(ValueStr);
+ if (Charge)
+ {
+ if(Charge <= 0 || Charge >= 6)
+ return 0;
+ Spectrum->Charge = Charge;
+ Spectrum->FileCharge[Charge] = 1;
+ Spectrum->FileChargeFlag = 1;
+ Spectrum->ParentMass = (int)(ParentMass - HYDROGEN_MASS + 0.5);
+ Spectrum->MZ = (int)((ParentMass + (Spectrum->Charge - 1)*HYDROGEN_MASS) / (float)Spectrum->Charge + 0.5);
+ Spectrum->FileMZ = Spectrum->MZ;
+ }
+ else
+ {
+ Spectrum->ParentMass = (int)(ParentMass + 0.5);
+ Spectrum->MZ = (int)(ParentMass + 0.5);
+ Spectrum->FileMZ = Spectrum->MZ;
+ }
+ return 1;
+ }
+ // Ordinary peak
+ ValueStr = strtok(LineBuffer, WHITESPACE);
+ if (!ValueStr)
+ {
+ return 0; // INVALID LINE, stop now
+ }
+ Mass = (float)atof(ValueStr);
+ ValueStr = strtok(NULL, WHITESPACE);
+ if (!ValueStr)
+ {
+ return 0; // INVALID LINE, stop now
+ }
+ Intensity = (float)atof(ValueStr);
+ Result = SpectrumAddPeak(Spectrum, Mass, Intensity);
+ return Result;
+}
+
+int SpectrumLoadCDTACallback(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ MSSpectrum* Spectrum;
+ char * StrA;
+ char * Str;
+ float Mass;
+ float Intensity;
+
+ int ScanNumber;
+ int Charge;
+ int Result;
+
+
+ Spectrum = (MSSpectrum*)UserData;
+
+
+
+ // If the line starts with '=' then this is the header; we can get the scan number and charge
+ if(LineNumber == 1 && LineBuffer[0] == '=')
+ {
+
+ StrA = strtok(LineBuffer, ".");
+ StrA = strtok(NULL, ".");
+ ScanNumber = atoi(StrA);
+ Spectrum->Node->ScanNumber = ScanNumber;
+
+ StrA = strtok(NULL,".");
+ StrA = strtok(NULL,".");
+ Charge = atoi(StrA);
+
+ Spectrum->Charge = Charge;
+ Spectrum->FileCharge[Charge] = 1;
+ Spectrum->FileChargeFlag = 1;
+ }
+ else if(LineBuffer[0] == '=')
+ {
+ return 0;
+ }
+ else if(LineNumber == 3) //The first line after the == should be the header
+ {
+
+
+ return SpectrumLoadCDTAHeaderLine(Spectrum,LineBuffer);
+
+ }
+ else
+ {
+
+ // After the first line, we expect to see lines of the form "Mass Intensity"
+ Str = strtok(LineBuffer, WHITESPACE);
+ if (!Str)
+ {
+ return 1;
+ }
+ Mass = (float)atof(Str);
+ if (!Mass)
+ {
+ return 1;
+ }
+ Str = strtok(NULL, WHITESPACE);
+ if (!Str)
+ {
+ // This line had only one piece on it; peak lines must contain mass AND intensity. Invalid syntax!
+ //printf("**Error in file '%s': peak lines must contain mass AND intensity\n", Spectrum->Node->InputFile->FileName);
+ REPORT_ERROR_IS(33, LineNumber, Spectrum->Node->InputFile->FileName);
+ return 0;
+ }
+ Intensity = (float)atof(Str);
+
+
+ if (!Intensity)
+ {
+ // invalid intensity? Assume that a string starting with "0" really means intensity zero,
+ // god help us.
+ if (Str[0] != '0')
+ {
+ REPORT_ERROR_IS(33, LineNumber, Spectrum->Node->InputFile->FileName);
+ return 0;
+ }
+ }
+ // If there's a third piece on the line, then stop parsing now. (That happens if we run
+ // off the end of a record in a pkl file, into the start of the next record):
+ Str = strtok(NULL, WHITESPACE);
+ if (Str)
+ {
+ return 0;
+ }
+ Result = SpectrumAddPeak(Spectrum, Mass, Intensity);
+
+ return Result;
+
+
+ }
+
+
+
+}
+
+int SpectrumLoadMGFCallback(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ MSSpectrum* Spectrum;
+ float Mass;
+ float Intensity;
+ char* WordA;
+ char* WordB;
+ char* EQWordA;
+ int Result;
+ int Charge;
+ char* AndWord;
+ //
+ Spectrum = (MSSpectrum*)UserData;
+
+
+ // If we see a command we recognize, then handle it:
+ WordA = strtok(LineBuffer, WHITESPACE);
+ WordB = strtok(NULL, WHITESPACE);
+ EQWordA = strtok(WordA, "=");
+
+ if (!CompareStrings(WordA, "END"))
+ {
+ if (WordB && !CompareStrings(WordB, "IONS"))
+ {
+ // Stop parsing lines now!
+ return 0;
+ }
+ }
+ else if (!CompareStrings(EQWordA, "PEPMASS"))
+ {
+ Mass = (float)atof(LineBuffer + 8);
+ ROUND_MASS(Mass, Spectrum->MZ);
+ Spectrum->FileMZ = Spectrum->MZ;
+ if (Spectrum->MZ < MINIMUM_ALLOWED_PARENT_MASS || Spectrum->MZ > MAXIMUM_ALLOWED_PARENT_MASS)
+ {
+ // Illegal mass!
+ if (Spectrum->Node->InputFile)
+ {
+ REPORT_ERROR_IS(32, Spectrum->Node->ScanNumber, Spectrum->Node->InputFile->FileName);
+ }
+ else
+ {
+ REPORT_ERROR_IS(32, Spectrum->Node->ScanNumber, "???");
+ }
+ return 0;
+ }
+
+ }
+ else if (!CompareStrings(EQWordA, "CHARGE"))
+ {
+
+ Charge = atoi(LineBuffer + 7);
+ if (Charge)
+ {
+
+
+ Spectrum->Charge = Charge;
+ if(Charge >= 6)
+ return 0;
+ Spectrum->FileCharge[Charge] = 1;
+ Spectrum->FileChargeFlag = 1;
+ }
+ // the CHARGE line may have the form "2+ and 3+"
+ if (WordB && !CompareStrings(WordB, "and"))
+ {
+ Charge = atoi(WordB + 4);
+ if (Charge)
+ {
+ Spectrum->Charge = Charge;
+ if(Charge >= 6)
+ return 0;
+
+ Spectrum->FileCharge[Charge] = 1;
+ Spectrum->FileChargeFlag = 1;
+ }
+ }
+ }
+ else
+ {
+ // Default: Try to read an m/z and intensity
+ Mass = (float)atof(WordA);
+ if (Mass && WordB)
+ {
+ Intensity = (float)atof(WordB);
+ Result = SpectrumAddPeak(Spectrum, Mass, Intensity);
+ return Result;
+ }
+ }
+ return 1;
+}
+
+// Load a spectrum from a concatenated-dta (_dta.txt) file: a "===" header line, then the
+// parent-mass/charge line and peak lines, ending at the next "===" line.
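+// A record header typically looks something like this (illustrative only; the exact
+// decoration varies):
+//   =================== "MyRun.0123.0123.2.dta" ===================
+// The scan number (second dot-separated field) and the charge (fourth field) are pulled
+// out of that line; the next "===" line starts a new record.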
+int SpectrumLoadCDTA(MSSpectrum* Spectrum, FILE* DTAFile)
+{
+ ParseFileByLines(DTAFile, SpectrumLoadCDTACallback,Spectrum,0);
+ if(Spectrum->Charge && (Spectrum->Charge <= 0 || Spectrum->Charge >= 6))
+ return 0;
+ //Should we guess charge?
+
+ return 1;
+}
+
+// Load spectrum from an MGF file. See one or more header lines, then some
+// peaks, then an "END IONS" line.
+int SpectrumLoadMGF(MSSpectrum* Spectrum, FILE* DTAFile)
+{
+
+ ParseFileByLines(DTAFile, SpectrumLoadMGFCallback, Spectrum, 0);
+ if (Spectrum->Charge)
+ {
+ if(Spectrum->Charge <= 0 || Spectrum->Charge >= 6)
+ return 0;
+ Spectrum->ParentMass = Spectrum->MZ * Spectrum->Charge - (HYDROGEN_MASS * (Spectrum->Charge - 1));
+
+ }
+
+ return 1;
+}
+
+int GuessMS2FormatFromLine(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ MSSpectrum* Spectrum;
+ Spectrum = (MSSpectrum*)UserData;
+ if (LineBuffer[0] == ':')
+ {
+ Spectrum->Node->InputFile->Format = SPECTRUM_FORMAT_MS2_COLONS;
+ return 0;
+ }
+ if (LineBuffer[0] == 'Z')
+ {
+ Spectrum->Node->InputFile->Format = SPECTRUM_FORMAT_MS2;
+ return 0;
+ }
+ return 1;
+
+}
+
+// Return 1 if we succeeded.
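+// A plain .dta record has a header line (parent mass and charge) followed by
+// "mass intensity" peak lines; the .ms2 flavor adds "S" scan lines and one or
+// more "Z" charge lines, which are special-cased below.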
+int SpectrumLoadDTAFileLine(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ MSSpectrum* Spectrum;
+ int Result;
+ float Mass;
+ float Intensity;
+ char* Str;
+ //
+ Spectrum = (MSSpectrum*)UserData;
+ // Special case: MS2 format handles one or more "Z" lines
+ if (LineBuffer[0] == 'Z' && (LineBuffer[1] == ' ' || LineBuffer[1] == '\t'))
+ {
+ Result = SpectrumLoadHeaderLine(Spectrum, LineBuffer);
+ return Result;
+ }
+ // Special case: MS2 format skips the first "S" line, and knows the
+ // second "S" line it sees marks the end of the record
+ if (LineBuffer[0] == 'S' && (LineBuffer[1] == ' ' || LineBuffer[1] == '\t'))
+ {
+ if (LineNumber > 1)
+ {
+ return 0;
+ }
+ else
+ {
+ return 1;
+ }
+ }
+ if (LineNumber == 1)
+ {
+ Result = SpectrumLoadHeaderLine(Spectrum, LineBuffer);
+ return Result;
+ }
+
+ // After the first line, we expect to see lines of the form "Mass Intensity"
+ Str = strtok(LineBuffer, WHITESPACE);
+ if (!Str)
+ {
+ return 1;
+ }
+ Mass = (float)atof(Str);
+ if (!Mass)
+ {
+ return 1;
+ }
+ Str = strtok(NULL, WHITESPACE);
+ if (!Str)
+ {
+ // This line had only one value on it; peak lines need mass AND intensity. Invalid syntax!
+ //printf("**Error in file '%s': peak lines must contain mass AND intensity\n", Spectrum->Node->InputFile->FileName);
+ REPORT_ERROR_IS(33, LineNumber, Spectrum->Node->InputFile->FileName);
+ return 0;
+ }
+ Intensity = (float)atof(Str);
+ if (!Intensity)
+ {
+ // invalid intensity? Assume that a string starting with "0" really means intensity zero,
+ // god help us.
+ if (Str[0] != '0')
+ {
+ REPORT_ERROR_IS(33, LineNumber, Spectrum->Node->InputFile->FileName);
+ return 0;
+ }
+ }
+ // If there's a third piece on the line, then stop parsing now. (That happens if we run
+ // off the end of a record in a pkl file, into the start of the next record):
+ Str = strtok(NULL, WHITESPACE);
+ if (Str)
+ {
+ return 0;
+ }
+ Result = SpectrumAddPeak(Spectrum, Mass, Intensity);
+ return Result;
+}
+
+int GuessSpectrumFormatFromHeader(char* FilePath, MSSpectrum* Spectrum)
+{
+ FILE* MS2File;
+ //
+ MS2File = fopen(FilePath, "rb");
+ ParseFileByLines(MS2File, GuessMS2FormatFromLine, Spectrum, 0);
+ fclose(MS2File);
+ return Spectrum->Node->InputFile->Format;
+}
+
+// SpectrumLoadFromFile: Return True if the spectrum is valid, False if it's not.
+// Example of an invalid spectrum file: Sequest .out files contaminating the .dta directory.
+// Iterate over lines, handling the header specially.
+int SpectrumLoadFromFile(MSSpectrum* Spectrum, FILE* DTAFile)
+{
+ int ReturnCode = 1;
+ int MS2ChargeLineSeen = 0;
+ int i;
+ float PeakMass;
+ //
+
+ // handle XML formats separately from line-based formats:
+ switch (Spectrum->Node->InputFile->Format)
+ {
+ case SPECTRUM_FORMAT_MZXML:
+ ReturnCode = SpectrumLoadMZXML(Spectrum, DTAFile);
+ break;
+ case SPECTRUM_FORMAT_MZDATA:
+ SpectrumLoadMZData(Spectrum, DTAFile);
+ break;
+ case SPECTRUM_FORMAT_MGF:
+ ReturnCode = SpectrumLoadMGF(Spectrum, DTAFile);
+ break;
+ case SPECTRUM_FORMAT_MS2_COLONS:
+ ParseFileByLines(DTAFile, SpectrumHandleMS2ColonLine, Spectrum, 0);
+ break;
+ case SPECTRUM_FORMAT_CDTA:
+ ReturnCode = SpectrumLoadCDTA(Spectrum,DTAFile);
+ break;
+ case SPECTRUM_FORMAT_PKL:
+ case SPECTRUM_FORMAT_DTA:
+ case SPECTRUM_FORMAT_MS2:
+ default:
+ ParseFileByLines(DTAFile, SpectrumLoadDTAFileLine, Spectrum, 0);
+ break;
+ }
+ if(Spectrum->Charge && (Spectrum->Charge < 0 || Spectrum->Charge >= 6))
+ return 0;
+
+ // Unless MultiChargeMode is set, skip spectra with charge greater than 3
+ if(Spectrum->Charge && !GlobalOptions->MultiChargeMode && Spectrum->Charge > 3)
+ {
+ //printf("Ignoring Spectrum %d with charge %d\n",Spectrum->Node->ScanNumber,Spectrum->Charge);
+ return 0;
+ }
+ else
+ {
+ //printf("Keeping Spectrum %d with charge %d and %d peaks\n",Spectrum->Node->ScanNumber,Spectrum->Charge,Spectrum->PeakCount);
+ }
+
+ if (ReturnCode)
+ {
+ SpectrumComputeParentMass(Spectrum);
+ }
+ //printf("SCAN: %d\n",Spectrum->Node->ScanNumber);
+ //for(i = 0; i < Spectrum->PeakCount; ++i)
+ // {
+
+ // PeakMass = (float)(Spectrum->Peaks[i].Mass);
+ // printf("%f %f\n",PeakMass/1000, Spectrum->Peaks[i].Intensity);
+ // }
+
+
+ //if (GlobalOptions->PhosphorylationFlag)
+ //{
+ // AttemptParentMassPeakRemoval(Spectrum);
+ //}
+ return ReturnCode;
+}
+
+////For phosphorylated spectra, the superprominent M-p peak can
+////fritz the charge state guessing, and tagging. So we remove it.
+//void AttemptParentMassPeakRemoval(MSSpectrum* Spectrum)
+//{
+// int MostIntensePeakIndex;
+// int MostIntenseMass;
+// int PeakIndex;
+// float MostIntense = 0.0;
+// float NextMostIntense = 0.0;
+// int Diff;
+// int ExpectedDiff;
+// int ExpectedDiff2;
+// int Epsilon = 2 * DALTON;
+// int Charge;
+// //
+// for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+// {
+// if (Spectrum->Peaks[PeakIndex].Intensity > MostIntense)
+// {
+// NextMostIntense = MostIntense;
+// MostIntense = Spectrum->Peaks[PeakIndex].Intensity;
+// MostIntensePeakIndex = PeakIndex;
+// MostIntenseMass = Spectrum->Peaks[PeakIndex].Mass;
+// }
+// else if(Spectrum->Peaks[PeakIndex].Intensity > NextMostIntense)
+// {
+// NextMostIntense = Spectrum->Peaks[PeakIndex].Intensity;
+// }
+// }
+// //printf("Most intense %f, next %f\n",MostIntense, NextMostIntense);
+// //if more than 3 times great, and in the right place, remove peak.
+// //if (MostIntense < 2 * NextMostIntense)
+// //{
+// // return;
+// //}
+// //printf ("MZ of %d, charge %d\n", Spectrum->MZ, Spectrum->Charge);
+// // If the spectrum has a charge, then trust, otherwise try charge 2, 3
+// //Set m/z with the new parentmass and charge that was just assigned in ConstructTags
+// printf("Old MZ %f\n",Spectrum->MZ);
+// Spectrum->MZ = (Spectrum->ParentMass + (Spectrum->Charge - 1) * HYDROGEN_MASS) / Spectrum->Charge;
+// printf("New MZ %f\n",Spectrum->MZ);
+// return;
+// if (Spectrum->Charge)
+// {
+// Diff = abs(Spectrum->MZ - MostIntenseMass);
+// ExpectedDiff = PHOSPHATE_WATER_MASS / Spectrum->Charge;
+// ExpectedDiff2 = (PHOSPHATE_WATER_MASS + WATER_MASS) / Spectrum->Charge;
+// if (abs (Diff - ExpectedDiff) < Epsilon)
+// { //remove peak
+// Spectrum->RemovedPeakIndex = MostIntensePeakIndex;
+// Spectrum->RemovedPeakIntensity = Spectrum->Peaks[MostIntensePeakIndex].Intensity;
+// Spectrum->Peaks[MostIntensePeakIndex].Intensity = 1.0; //cut to ground
+// }
+// else if (abs(Diff - ExpectedDiff2) < Epsilon)
+// { //remove peak
+// Spectrum->RemovedPeakIndex = MostIntensePeakIndex;
+// Spectrum->RemovedPeakIntensity = Spectrum->Peaks[MostIntensePeakIndex].Intensity;
+// Spectrum->Peaks[MostIntensePeakIndex].Intensity = 1.0; //cut to ground
+// }
+// }
+// else
+// {
+// for (Charge = 1; Charge <= 3; Charge++)
+// {
+// Diff = abs(Spectrum->MZ - MostIntenseMass);
+// ExpectedDiff = PHOSPHATE_WATER_MASS/ Charge;
+// ExpectedDiff2 = (PHOSPHATE_WATER_MASS + WATER_MASS)/ Charge;
+// // printf("Charge %d, Diff %d, ExpectedDiff %d\n", Charge, Diff, ExpectedDiff);
+// if (abs (Diff - ExpectedDiff) < Epsilon)
+// { // remove peak
+// Spectrum->RemovedPeakIndex = MostIntensePeakIndex;
+// Spectrum->RemovedPeakIntensity = Spectrum->Peaks[MostIntensePeakIndex].Intensity;
+// Spectrum->Peaks[MostIntensePeakIndex].Intensity = 1.0; //cut to ground
+// Spectrum->Charge = Charge; // This is a big enough clue, that we are going to guess charge
+// Spectrum->MZ = MostIntenseMass + ExpectedDiff; //testing this feature
+// break;
+// }
+// else if (abs(Diff - ExpectedDiff2) < Epsilon)
+// { // remove peak
+// Spectrum->RemovedPeakIndex = MostIntensePeakIndex;
+// Spectrum->RemovedPeakIntensity = Spectrum->Peaks[MostIntensePeakIndex].Intensity;
+// Spectrum->Peaks[MostIntensePeakIndex].Intensity = 1.0; //cut to ground
+// Spectrum->Charge = Charge;
+// Spectrum->MZ = MostIntenseMass + ExpectedDiff2;
+// break;
+// }
+// } // end for
+// } // end else
+//
+//}
+
+// Called AFTER filtering. Looks 1Da to the left of peaks for potential isotope neighbors.
+void SpectrumAssignIsotopeNeighbors(MSSpectrum* Spectrum)
+{
+ // Don't worry *too* much about efficiency, as this happens only once during scoring
+ int PeakIndex;
+ int OldPeakIndex;
+ int IsotopeCount;
+ int NoiseCount;
+ int MaxMass;
+ int MinMass;
+ int OtherPeakIndex;
+ float IntensityPercent;
+ //
+ // Assign noise penalty:
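+ // The penalties below are natural-log probabilities scaled by 100
+ // (e.g. ln(0.0001) = -9.21 gives -921), binned by intensity relative to the strongest peak: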
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ IntensityPercent = Spectrum->Peaks[PeakIndex].Intensity / Spectrum->MaxIntensity;
+ Spectrum->Peaks[PeakIndex].PercentIntensity = IntensityPercent;
+ if (IntensityPercent < 0.05)
+ {
+ Spectrum->Peaks[PeakIndex].NoisePenalty = -921;//0.0001
+ }
+ else if (IntensityPercent < 0.3)
+ {
+ Spectrum->Peaks[PeakIndex].NoisePenalty = -1382; //0.000001
+ }
+ else if (IntensityPercent < 0.6)
+ {
+ Spectrum->Peaks[PeakIndex].NoisePenalty = -1842; //0.00000001
+ }
+ else
+ {
+ Spectrum->Peaks[PeakIndex].NoisePenalty = -2303; //0.0000000001
+ }
+ }
+ // First, look for isotope neighbors. Scan downward from each peak:
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ MaxMass = Spectrum->Peaks[PeakIndex].Mass - 79;
+ MinMass = Spectrum->Peaks[PeakIndex].Mass - 121;
+ IsotopeCount = 0;
+ for (OldPeakIndex = max(0, PeakIndex - 1); OldPeakIndex; OldPeakIndex--)
+ {
+ if (Spectrum->Peaks[OldPeakIndex].Mass < MinMass)
+ {
+ break;
+ }
+ if (Spectrum->Peaks[OldPeakIndex].Mass > MaxMass)
+ {
+ continue;
+ }
+ Spectrum->Peaks[PeakIndex].IsotopeNeighbors[IsotopeCount++] = OldPeakIndex;
+ }
+ }
+ // Now look for noise-neighbors (peaks which could be the same peak, but are split
+ // due to limitations in recording).
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ NoiseCount = 0;
+ MaxMass = Spectrum->Peaks[PeakIndex].Mass + 21; // 0.2 Da radius
+ MinMass = Spectrum->Peaks[PeakIndex].Mass - 21;
+ for (OtherPeakIndex = PeakIndex + 1; OtherPeakIndex < min(Spectrum->PeakCount, PeakIndex + 5); OtherPeakIndex++)
+ {
+ if (Spectrum->Peaks[OtherPeakIndex].Mass > MaxMass)
+ {
+ break;
+ }
+ Spectrum->Peaks[PeakIndex].NoiseNeighbors[NoiseCount++] = OtherPeakIndex;
+ }
+ for (OtherPeakIndex = max(0, PeakIndex - 1); OtherPeakIndex > max(-1, PeakIndex - 5); OtherPeakIndex--)
+ {
+ if (Spectrum->Peaks[OtherPeakIndex].Mass < MinMass)
+ {
+ break;
+ }
+ Spectrum->Peaks[PeakIndex].NoiseNeighbors[NoiseCount++] = OtherPeakIndex;
+ }
+ }
+}
+
+void SpectrumSetCharge(MSSpectrum* Spectrum, int Charge)
+{
+ //MZ = ((Charge-1)*1.0078 + self->Spectrum->ParentMass) / self->Spectrum->Charge;
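+ // ParentMass is kept as the singly-protonated mass (M+H): multiply the observed m/z by
+ // the charge and strip the (Charge - 1) extra hydrogen masses.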
+ Spectrum->Charge = Charge;
+ Spectrum->PMCorrectedFlag = 0;
+ Spectrum->ParentMass = (Spectrum->MZ * Charge) - (Charge - 1) * HYDROGEN_MASS;
+}
+
+// Compute the low/med/hi intensity cutoffs for the spectrum.
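+// "Grass" here is the median intensity of the weak peaks (those ranked below roughly one
+// peak per 100 Da of parent mass); the low/medium/high cutoffs are then set at 0.25x, 3x
+// and 10x that grass level.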
+void ComputeSpectrumIntensityCutoffs(MSSpectrum* Spectrum)
+{
+ int PeakIndex;
+ float GrassIntensity;
+ float TotalIntensity;
+ int CutoffRank;
+ float SortedIntensity[200];
+ int WeakPeakCount = 0;
+ //
+ TotalIntensity = 0;
+ CutoffRank = (int)(Spectrum->ParentMass / (100 * DALTON));
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ if (Spectrum->Peaks[PeakIndex].IntensityRank >= CutoffRank)
+ {
+ SortedIntensity[WeakPeakCount] = Spectrum->Peaks[PeakIndex].Intensity;
+ WeakPeakCount++;
+ }
+ TotalIntensity += Spectrum->Peaks[PeakIndex].Intensity;
+ if (WeakPeakCount == 200)
+ {
+ break;
+ }
+ }
+ if (!WeakPeakCount)
+ {
+ //printf("** Error in ComputeSpectrumIntensityCutoffs: No weak peak ranks found? Intensity ranking must be complete here.\n");
+ if (!Spectrum->PeakCount)
+ {
+ return;
+ }
+ GrassIntensity = TotalIntensity / (2 * Spectrum->PeakCount);
+ }
+ else
+ {
+ qsort(SortedIntensity, WeakPeakCount, sizeof(float), (QSortCompare)CompareFloats);
+ GrassIntensity = SortedIntensity[WeakPeakCount / 2];
+ }
+ Spectrum->IntensityCutoffLow = (float)0.25 * GrassIntensity;
+ Spectrum->IntensityCutoffMedium = 3 * GrassIntensity;
+ Spectrum->IntensityCutoffHigh = 10 * GrassIntensity;
+}
+
+//// Allocate and populate BinnedIntensities for the spectrum. Assumes that ParentMass is set.
+//void SpectrumComputeBinnedIntensities(SpectrumNode* Node) // OBSOLETE
+//{
+// int MaxParentMass = 0;
+// MSSpectrum* Spectrum;
+// int PeakIndex;
+// int Bin;
+// int NearBin;
+// SpectralPeak* Peak;
+// float Intensity;
+// int BinScalingFactor = 100; // One bin per 0.1Da
+//
+// // A spectrum has at most this many "high" peaks (one per 100Da)
+// int SuperPeakCount;
+//
+// static int* BestIntensityRank = NULL;
+// static int BestIntensityRankSize = 0;
+// //
+// Spectrum = Node->Spectrum;
+// if (!Spectrum)
+// {
+// return;
+// }
+// SuperPeakCount = Spectrum->ParentMass / (100 * DALTON);
+// MaxParentMass = Spectrum->MZ * 3;
+// Spectrum->IntensityBinCount = (MaxParentMass + DALTON) / BinScalingFactor;
+// SafeFree(Spectrum->BinnedIntensities);
+// SafeFree(Spectrum->BinnedIntensitiesTight);
+// SafeFree(Spectrum->BinnedIntensityLevels);
+// SafeFree(Spectrum->BinPeakIndex);
+// Spectrum->BinnedIntensities = (float*)calloc(Spectrum->IntensityBinCount, sizeof(float));
+// Spectrum->BinnedIntensitiesTight = (float*)calloc(Spectrum->IntensityBinCount, sizeof(float));
+// Spectrum->BinnedIntensityLevels = (int*)calloc(Spectrum->IntensityBinCount, sizeof(int));
+// Spectrum->BinPeakIndex = (int*)calloc(Spectrum->IntensityBinCount, sizeof(int));
+//
+// if (BestIntensityRankSize < Spectrum->IntensityBinCount)
+// {
+// SafeFree(BestIntensityRank);
+// BestIntensityRankSize = Spectrum->IntensityBinCount + 500;
+// BestIntensityRank = (int*)calloc(BestIntensityRankSize, sizeof(int));
+// }
+// for (Bin = 0; Bin < Spectrum->IntensityBinCount; Bin++)
+// {
+// Spectrum->BinPeakIndex[Bin] = -1;
+// BestIntensityRank[Bin] = 999;
+// }
+//
+// // Iterate over spectral peaks, putting intensity into bins:
+// for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+// {
+// Peak = Spectrum->Peaks + PeakIndex;
+// Bin = (Peak->Mass + 50) / BinScalingFactor;
+// for (NearBin = Bin - 6; NearBin < Bin + 7; NearBin++)
+// {
+// if (NearBin < 0 || NearBin >= Spectrum->IntensityBinCount)
+// {
+// continue;
+// }
+// if (abs(Peak->Mass - (NearBin * BinScalingFactor)) > INTENSITY_BIN_RADIUS)
+// {
+// continue;
+// }
+// Spectrum->BinnedIntensities[NearBin] += Peak->Intensity;
+// BestIntensityRank[Bin] = min(BestIntensityRank[Bin], Peak->IntensityRank);
+// if (Spectrum->BinPeakIndex[NearBin] < 0)
+// {
+// Spectrum->BinPeakIndex[NearBin] = PeakIndex;
+// }
+// if (abs(Peak->Mass - (NearBin * BinScalingFactor)) <= INTENSITY_BIN_RADIUS_TIGHT)
+// {
+// Spectrum->BinnedIntensitiesTight[NearBin] += Peak->Intensity;
+// }
+// }
+// }
+// // Compute the intensity level (absent, lo, med, hi) for each bin:
+// ComputeSpectrumIntensityCutoffs(Spectrum);
+// for (Bin = 0; Bin < Spectrum->IntensityBinCount; Bin++)
+// {
+// Intensity = Spectrum->BinnedIntensities[Bin];
+// if (Intensity > Spectrum->IntensityCutoffHigh && BestIntensityRank[Bin] < SuperPeakCount)
+// {
+// Spectrum->BinnedIntensityLevels[Bin] = 3;
+// }
+// else if (Intensity > Spectrum->IntensityCutoffMedium)
+// {
+// Spectrum->BinnedIntensityLevels[Bin] = 2;
+// }
+// else if (Intensity > Spectrum->IntensityCutoffLow)
+// {
+// Spectrum->BinnedIntensityLevels[Bin] = 1;
+// }
+// else
+// {
+// Spectrum->BinnedIntensityLevels[Bin] = 0;
+// }
+// }
+//}
+
+void SpectrumComputeNoiseDistributions(SpectrumNode* Node)
+{
+ MSSpectrum* Spectrum;
+ int BinCountA;
+ int BinCountB;
+ int BinCountC;
+ int BinCountD;
+ int BinCutoffA;
+ int BinCutoffB;
+ int Bin;
+ int Index;
+ int IntensityRank;
+ SpectrumTweak* Tweak;
+ int TweakIndex;
+ // Compute the distribution of intensity levels in each mass sector (currently two
+ // halves of the mass range), once for each parent-mass tweak:
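+ // Each tweak keeps 8 counts: four intensity levels (absent/low/med/high) for the low-mass
+ // half and four for the high-mass half; they are smoothed and converted to log-frequencies
+ // at the end of the loop.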
+ Spectrum = Node->Spectrum;
+ for (TweakIndex = 0; TweakIndex < TWEAK_COUNT; TweakIndex++)
+ {
+ Tweak = Node->Tweaks + TweakIndex;
+ if (!Tweak->Charge)
+ {
+ continue;
+ }
+ BinCutoffA = (int)((Node->Tweaks[TweakIndex].ParentMass * 0.3333 + 5) / 100);
+ BinCutoffB = (int)((Node->Tweaks[TweakIndex].ParentMass * 0.6667 + 5) / 100);
+ BinCountA = 0;
+ BinCountB = 0;
+ BinCountC = 0;
+ BinCountD = 0;
+ // SECTOR_COUNT
+ BinCutoffA = (int)((Node->Tweaks[TweakIndex].ParentMass * 0.5 + 5) / 100);
+ for (Index = 0; Index < 8; Index++)
+ {
+ Node->Tweaks[TweakIndex].Intensities[Index] = 1; // padding-probability
+ }
+ for (Bin = 0; Bin < Spectrum->IntensityBinCount; Bin++)
+ {
+ if (Bin >= BinCutoffA)
+ {
+ BinCountB++;
+ Tweak->Intensities[4 + Spectrum->BinnedIntensityLevels[Bin]] += 1.0;
+ }
+ else
+ {
+ BinCountA++;
+ Tweak->Intensities[0 + Spectrum->BinnedIntensityLevels[Bin]] += 1.0;
+ }
+ }
+ for (IntensityRank = 0; IntensityRank < 4; IntensityRank++)
+ {
+ Tweak->Intensities[0 + IntensityRank] = (float)log((Tweak->Intensities[0 + IntensityRank] + 2) / (BinCountA + 2));
+ Tweak->Intensities[4 + IntensityRank] = (float)log((Tweak->Intensities[4 + IntensityRank] + 2) / (BinCountB + 2));
+ }
+
+ }
+}
+
+// Add a spectrum to the list of spectra to be searched.
+void AddSpectrumToList(InputFileNode* InputFile, int FilePos, int ScanNumber, int SpecIndex)
+{
+ SpectrumNode* NewNode;
+
+ NewNode = (SpectrumNode*)calloc(1, sizeof(SpectrumNode));
+ NewNode->InputFile = InputFile;
+ if (GlobalOptions->LastSpectrum)
+ {
+ GlobalOptions->LastSpectrum->Next = NewNode;
+ }
+ else
+ {
+ GlobalOptions->FirstSpectrum = NewNode;
+ }
+ NewNode->FilePosition = FilePos;
+ NewNode->ScanNumber = ScanNumber;
+ NewNode->SpecIndex = SpecIndex;
+ GlobalOptions->LastSpectrum = NewNode;
+ GlobalOptions->SpectrumCount++;
+ InputFile->SpectrumCount++;
+}
+
+
+int GuessSpectrumFormatFromExtension(char* FileName)
+{
+ char* Extension;
+ for (Extension = FileName + strlen(FileName); Extension > FileName; Extension--)
+ {
+ if (*Extension == '.')
+ {
+ break;
+ }
+ }
+ if (!CompareStrings(Extension, ".out"))
+ {
+ // sequest gunk, ignore.
+ return SPECTRUM_FORMAT_INVALID;
+ }
+ if (!CompareStrings(Extension, ".ms2"))
+ {
+ return SPECTRUM_FORMAT_MS2_COLONS; //SPECTRUM_FORMAT_MS2;
+ }
+ if (!CompareStrings(Extension, ".mzxml"))
+ {
+ return SPECTRUM_FORMAT_MZXML;
+ }
+ if (!CompareStrings(Extension, ".mzdata"))
+ {
+ return SPECTRUM_FORMAT_MZDATA;
+ }
+ if (!CompareStrings(Extension, ".mgf"))
+ {
+ return SPECTRUM_FORMAT_MGF;
+ }
+ if (!CompareStrings(Extension, ".dta"))
+ {
+ return SPECTRUM_FORMAT_DTA;
+ }
+ if (!CompareStrings(Extension, ".pkl"))
+ {
+ return SPECTRUM_FORMAT_PKL;
+ }
+ if(!CompareStrings(Extension,".txt"))
+ {
+ //_dta.txt is a PNNL-specific way of saying concatenated DTA
+ for (; Extension > FileName; Extension--)
+ {
+ if (*Extension == '_')
+ {
+ break;
+ }
+ }
+ if(!CompareStrings(Extension,"_dta.txt"))
+ {
+
+ return SPECTRUM_FORMAT_CDTA;
+ }
+ }
+
+ // Unexpected extension. Let's ASSUME that it's a .dta file.
+ REPORT_WARNING_S(30, FileName);
+ return SPECTRUM_FORMAT_DTA;
+}
+
+void FreeSpectrumNode(SpectrumNode* Node)
+{
+ int TweakIndex;
+ //
+ if (!Node)
+ {
+ return;
+ }
+ for (TweakIndex = 0; TweakIndex < TWEAK_COUNT; TweakIndex++)
+ {
+ SafeFree(Node->Tweaks[TweakIndex].PRMScores);
+ Node->Tweaks[TweakIndex].PRMScores = NULL;
+ }
+ if (Node->Spectrum)
+ {
+ FreeSpectrum(Node->Spectrum);
+ }
+ SafeFree(Node);
+}
diff --git a/Spectrum.h b/Spectrum.h
new file mode 100644
index 0000000..5e84106
--- /dev/null
+++ b/Spectrum.h
@@ -0,0 +1,160 @@
+//Title: Spectrum.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef SPECTRUM_H
+#define SPECTRUM_H
+
+// The basic spectrum object, with array of peaks.
+// Structs and functions to support loading of spectra from several
+// text-based file formats (.dta files, .mgf files, .ms2 files).
+
+#include <stdio.h>
+#include "Inspect.h"
+
+#define DEFAULT_WINDOW_WIDTH 50000 //50Da
+#define DEFAULT_WINDOW_KEEP_COUNT 6 // 6
+
+// After filtering, there will probably be just 1 possible isotope neighbor
+#define MAX_ISOTOPE_NEIGHBORS 8
+
+#define MAX_NOISY_NEIGHBORS 8
+
+// Intensity bin radii, in thousandths of a dalton
+#define INTENSITY_BIN_RADIUS_TIGHT 150
+#define INTENSITY_BIN_RADIUS 500
+
+// Set VERBOSE_DEBUGGING to true if scoring (in mutation-tolerant mode) is broken.
+// Slows things down a bit, because we write out spreadsheets:
+// PRMScores.xls (verbose annotations for every last PRM bin).
+// DTable.xls (in 2-mod mode) The DScore[] d.p. table
+// PrefixSuffix.xls (in 2-mod mode) The PrefixTable and SuffixTable
+//#define VERBOSE_DEBUGGING
+
+typedef struct SpectralPeak
+{
+ int Mass;
+ float Intensity;
+ int IntensityRank;
+ int Rank; // binned version of IntensityRank
+ int IonType; // for PRM peaks only
+ int FilterScore;
+ int NoisePenalty;
+ float PercentIntensity;
+ int HasNeutralLosses; // 0, 1, or 2
+ int TheoPeak; // for (greedy) interpretation
+ int Score; // for (greedy) interpretation
+ // The IsotopeNeighbors array holds the indices of peaks that are potential
+ // isotopes of this peak. If a peak was assigned a Noise ion type, but it has
+ // a neighbor peak at -1Da, then we give the peak the Isotope ion type.
+ // (The +1 peak gets an IsotopeNeighbors entry)
+ int IsotopeNeighbors[MAX_ISOTOPE_NEIGHBORS];
+ // Sometimes two high-intensity peaks are separated by only 0.1 amu. That *probably*
+ // means there's one big peak that was split by the machine.
+ int NoiseNeighbors[MAX_NOISY_NEIGHBORS];
+ int Index;
+ int RescuedFlag;
+ int AminoIndex; // For labeling purposes only!
+} SpectralPeak;
+
+
+typedef struct ListNode
+{
+ struct ListNode* Prev;
+ struct ListNode* Next;
+ int Entry;
+} ListNode;
+
+typedef struct MSSpectrum
+{
+ int MZ;
+ int ParentMass;
+ float SignalToNoise;
+ //Parent MZ from the file (BEFORE correction)
+ int FileMZ;
+ // Parent mass based on the file (BEFORE correction)
+ int FileMass;
+ // The input file may indicate no charge at all (in which case we guess),
+ // a single charge (in which case we accept it, OR guess if MultiCharge is set),
+ // or multiple charges (in which case we accept it, OR guess if MultiCharge is set).
+ char FileCharge[6];
+ int FileChargeFlag;
+ int Charge;
+ int PeakCount;
+ // PeakAllocation is the size of the allocated Peaks array; >= PeakCount
+ // When we run out of space in the array, we reallocate to double size.
+ int PeakAllocation;
+ SpectralPeak* Peaks;
+ int UnfilteredPeakCount;
+ SpectralPeak* UnfilteredPeaks;
+ int PRMPeakCount;
+ float MaxIntensity; // max over all peaks
+ int PMCorrectedFlag;
+ struct TagGraph* Graph;
+ int CandidatesScored;
+ int IntensityBinCount;
+ float* BinnedIntensitiesTight; // size IntensityBinCount; used for PMC. Tighter radius
+ float* BinnedIntensities; // size IntensityBinCount
+ int* BinnedIntensityLevels; // size IntensityBinCount
+ int* BinPeakIndex; // size IntensityBinCount
+ float IntensityCutoffLow;
+ float IntensityCutoffMedium;
+ float IntensityCutoffHigh;
+ struct SpectrumNode* Node;
+ // For use by IonScoring:
+ float* IntensityThresholds;
+ float* IonScoringNoiseProbabilities;
+ //For phosphorylation trickery: we remove the superdominant M-p peak
+ int RemovedPeakIndex;
+ float RemovedPeakIntensity;
+
+#ifdef VERBOSE_DEBUGGING
+ char** PRMDebugStrings;
+#endif
+} MSSpectrum;
+
+int GuessSpectralCharge(MSSpectrum* Spectrum);
+void UnitTestSpectrum();
+void WindowFilterPeaks(MSSpectrum* Spectrum, float WindowWidth, int KeepCount);
+void SpectrumAssignIsotopeNeighbors(MSSpectrum* Spectrum);
+void IntensityRankPeaks();
+MSSpectrum* NewSpectrum();
+void FreeSpectrum(MSSpectrum* Spectrum);
+int SpectrumLoadFromFile(MSSpectrum* Spectrum, FILE* DTAFile);
+void SpectrumCorrectParentMass(MSSpectrum* Spectrum);
+void SpectrumSetCharge(MSSpectrum* Spectrum, int Charge);
+void FreeMatchList(SpectrumNode* Spectrum);
+//void SpectrumComputeBinnedIntensities(SpectrumNode* Node);
+void SpectrumComputeNoiseDistributions(SpectrumNode* Node);
+int GuessSpectrumFormatFromExtension(char* FileName);
+void FreeSpectrumNode(SpectrumNode* Node);
+int GuessSpectrumFormatFromHeader(char* FilePath, MSSpectrum* Spectrum);
+#endif // SPECTRUM_H
diff --git a/SpliceDB.c b/SpliceDB.c
new file mode 100644
index 0000000..144d834
--- /dev/null
+++ b/SpliceDB.c
@@ -0,0 +1,4212 @@
+//Title: SpliceDB.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+// SpliceDB.c constructs a splice-tolerant database, starting from a collection of INTERVALS with LINKS.
+// Translated from the original Python script, CollectExons.py, for efficiency
+#include "CMemLeak.h"
+#include "Utils.h"
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include "Trie.h"
+#include "Inspect.h"
+#include "Spliced.h"
+#include "SpliceDB.h"
+#include "SNP.h"
+
+#define MAX_INTERVALS_PER_GENE 2000
+#define MAX_INTERVAL_LENGTH 100000
+
+// Reject any splice junctions from an EST which don't have this good of
+// a splice consensus score...unless we see more than one EST supporting
+// the junction.
+#define SPLICE_SIGNAL_SCORE_CUTOFF -15
+#define DEFAULT_MINIMUM_ORF_LENGTH 50
+#define IFLAG_FRAME_0 1
+#define IFLAG_FRAME_1 2
+#define IFLAG_FRAME_2 4
+#define IFLAG_ALL_READING_FRAMES 7
+
+// We might be parsing many, many intervals from two different sources...which means
+// they'll be read out-of-order. We maintain an index g_IntervalIndex such that
+// g_IntervalIndex[n] is the first interval whose start is at least n*10000.
+// When it comes time to insert a new interval, we check g_IntervalIndex. If
+// the entry is NULL, the interval goes at the end of the global list. If the
+// entry isn't NULL, the new interval goes NEAR that interval (maybe a little earlier,
+// maybe a little later, but the scan is cheap and that's the key idea)
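+// (For example, an interval starting at genomic position 123456 lands in bin 12, since each
+// bin covers 10000 bases.)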
+IntervalNode** g_IntervalIndex = NULL;
+
+// Linked list of all intervals in a chromosome (+ orientation):
+IntervalNode* g_FirstInterval;
+IntervalNode* g_LastInterval;
+
+// Linked list of GeneNode structs for the current gene.
+GeneNode* g_GeneFirst;
+GeneNode* g_GeneLast;
+int GeneNodeCount;
+
+// Int variables for reporting statistics on the database generation algorithms:
+int g_StatsIncompleteGeneCount = 0;
+int g_StatsLargestGeneSize = 0;
+int g_StatsLargestGeneRecordNumber = 0;
+int g_StatsIntervalsBeforeMerge = 0;
+int g_StatsEdgesBeforeMerge = 0;
+int g_StatsIntervalsAfterMerge = 0;
+int g_StatsEdgesAfterMerge = 0;
+int g_StatsIntervalsAfterIntersect = 0;
+int g_StatsEdgesAfterIntersect = 0;
+int g_StatsTotalExonsWritten = 0;
+int g_StatsTotalEdgesWritten = 0;
+
+// Forward declarations:
+IntervalNode* InsertIntervalBefore(IntervalNode* Interval, IntervalNode* Before);
+IntervalNode* InsertIntervalAfter(IntervalNode* Interval, IntervalNode* After);
+int BuildAndWriteExons(FILE* GenomicFile, FILE* OutputFile, int ReverseFlag, char* GeneName, int ChromosomeNumber, int MinORFLength);
+void MaskBrokenSequence(char* Protein, int MinORFLength);
+void IntegrityCheckGene();
+void PruneShortORFs(int ReverseFlag, int MinimumORFLength);
+void DeleteExonLink(ExonNode* Exon, ExonLink* Link, int ForwardFlag);
+void PurgeNonCodingExonChunks();
+void GenomeDAGLinkBack(GenomeDAGNode* DAGNode, GenomeDAGNode* BackDAGNode, int Count);
+void FreeIntervalExons(IntervalNode* Interval);
+
+// Print the current GeneNode list to stdout, for debugging.
+void DebugPrintBuiltGene()
+{
+ GeneNode* GNode;
+ ExonNode* Exon;
+ ExonNode* OtherExon;
+ ExonLink* Link;
+ char Buffer[512];
+ int IntervalLen;
+ int TrueLen;
+ GenomeDAGNode* DAGNode;
+ GenomeDAGLink* DAGLink;
+ int DAGNodeIndex;
+ //
+ printf("\n--== Current gene ==--\n");
+ for (GNode = g_GeneFirst; GNode; GNode = GNode->Next)
+ {
+ printf(" Interval from %d to %d flag %d\n", GNode->Interval->Start, GNode->Interval->End, GNode->Interval->Flags);
+ for (DAGNodeIndex = 0; DAGNodeIndex < GNode->Interval->DAGNodeCount; DAGNodeIndex++)
+ {
+ DAGNode = GNode->Interval->DAGNodes + DAGNodeIndex;
+ // Skip any extra allocation (null DAG nodes)
+ if (!DAGNode->Sequence)
+ {
+ continue;
+ }
+ printf(" DAG node from %d to %d (%d bases)\n", DAGNode->Start, DAGNode->End , DAGNode->End - DAGNode->Start);
+ for (DAGLink = DAGNode->FirstBack; DAGLink; DAGLink = DAGLink->Next)
+ {
+ printf(" << Link back %d to DAG node %d-%d\n", DAGLink->Count, DAGLink->Node->Start, DAGLink->Node->End);
+ }
+ for (DAGLink = DAGNode->FirstForward; DAGLink; DAGLink = DAGLink->Next)
+ {
+ printf(" >> Link forw %d to DAG node %d-%d\n", DAGLink->Count, DAGLink->Node->Start, DAGLink->Node->End);
+ }
+ }
+ for (Exon = GNode->Interval->FirstExon; Exon; Exon = Exon->Next)
+ {
+ printf(" Exon from %d to %d (%dAA)\n", Exon->Start, Exon->End, Exon->Length);
+ IntervalLen = Exon->End - Exon->Start;
+ if (IntervalLen <= 0)
+ {
+ printf("** WARNING: Exon is * * E M P T Y * * \n");
+ }
+ TrueLen = strlen(Exon->Prefix) + Exon->Length*3 + strlen(Exon->Suffix);
+ if (IntervalLen != TrueLen)
+ {
+ printf("** Warning: %d-%d is length %d, but true exon length is %zd+%d+%zd\n",
+ Exon->Start, Exon->End, IntervalLen,
+ strlen(Exon->Prefix), Exon->Length*3, strlen(Exon->Suffix));
+ }
+ if (Exon->Sequence)
+ {
+ strncpy(Buffer, Exon->Sequence, 512);
+ Buffer[511] = '\0';
+ printf(" Sequence(partial): %s\n", Buffer);
+ }
+ for (Link = Exon->FirstBack; Link; Link = Link->Next)
+ {
+ OtherExon = Link->Exon;
+ printf(" Link back %d to an exon from %d to %d (%dAA)\n", Link->Power,
+ OtherExon->Start, OtherExon->End, OtherExon->Length);
+ if (OtherExon->Sequence)
+ {
+ strncpy(Buffer, OtherExon->Sequence, 50);
+ Buffer[50] = '\0';
+ printf(" Ls: %s\n", Buffer);
+ }
+ if ((OtherExon->Start != Exon->End) && (OtherExon->Start != Exon->End + 1) &&
+ (OtherExon->End != Exon->Start) && (OtherExon->End != Exon->Start - 1))
+ {
+ if (!Link->Power)
+ {
+ printf("** Warning: Link with no power!\n");
+ }
+ }
+ }
+
+ for (Link = Exon->FirstForward; Link; Link = Link->Next)
+ {
+ OtherExon = Link->Exon;
+ printf(" Link forward %d to an exon from %d to %d (%dAA)\n", Link->Power, OtherExon->Start,
+ OtherExon->End, OtherExon->Length);
+ if (OtherExon->Sequence)
+ {
+ strncpy(Buffer, OtherExon->Sequence, 50);
+ Buffer[50] = '\0';
+ printf(" Ls: %s\n", Buffer);
+ }
+ if ((OtherExon->Start != Exon->End) && (OtherExon->Start != Exon->End + 1) &&
+ (OtherExon->End != Exon->Start) && (OtherExon->End != Exon->Start - 1))
+ {
+ if (!Link->Power)
+ {
+ printf("** Warning: Link with no power!\n");
+ }
+ }
+
+ }
+ }
+ }
+}
+
+// Print all intervals to stdout. (VERY verbose, if done for the whole chromosome!)
+void DebugPrintIntervals(int IncludeLinks, int CountingFlag,
+ int CoverageStart, int CoverageEnd)
+{
+ IntervalNode* Interval;
+ EdgeNode* Edge;
+ int IntervalCount = 0;
+ int ForwardCount = 0;
+ int BackwardCount = 0;
+ int IForwardCount = 0;
+ int IBackwardCount = 0;
+
+ printf("\n\n=-=-=-=-=- Intervals =-=-=-=-=-\n");
+ for (Interval = g_FirstInterval; Interval; Interval = Interval->Next)
+ {
+ IntervalCount++;
+ // Skip output of intervals not in the range CoverageStart...CoverageEnd
+ if (CoverageStart >= 0 && Interval->End < CoverageStart)
+ {
+ continue;
+ }
+ if (CoverageEnd >= 0 && Interval->Start > CoverageEnd)
+ {
+ continue;
+ }
+ if (IncludeLinks >= 0)
+ {
+ IForwardCount = 0;
+ IBackwardCount = 0;
+ for (Edge = Interval->FirstForward; Edge; Edge = Edge->Next)
+ {
+ IForwardCount++;
+ }
+ for (Edge = Interval->FirstBack; Edge; Edge = Edge->Next)
+ {
+ IBackwardCount++;
+ }
+ printf("%d-%d %d <%d >%d\n", Interval->Start, Interval->End, Interval->Occurrences,
+ IBackwardCount, IForwardCount);
+ }
+
+ for (Edge = Interval->FirstForward; Edge; Edge = Edge->Next)
+ {
+ if (IncludeLinks > 0)
+ {
+ printf(" -> %d-%d (%d)\n", Edge->Interval->Start, Edge->Interval->End, Edge->Count);
+ }
+ if (Edge->Interval->Start < Interval->Start)
+ {
+ printf("** Corruption: Forward link goes to an interval EARLIER along the chrom\n");
+ printf("** Start %d-%d, edge to %d-%d\n", Interval->Start, Interval->End,
+ Edge->Interval->Start, Edge->Interval->End);
+ }
+ ForwardCount++;
+ }
+ for (Edge = Interval->FirstBack; Edge; Edge = Edge->Next)
+ {
+ if (IncludeLinks > 0)
+ {
+ printf(" <- %d-%d (%d)\n", Edge->Interval->Start, Edge->Interval->End, Edge->Count);
+ }
+ if (Edge->Interval->Start > Interval->Start)
+ {
+ printf("** Corruption: Forward link goes to an interval EARLIER along the chrom\n");
+ printf("** Start %d-%d, edge to %d-%d\n", Interval->Start, Interval->End,
+ Edge->Interval->Start, Edge->Interval->End);
+ }
+ BackwardCount++;
+ }
+ }
+ printf("Total: %d intervals, %d forward links, %d backward links.\n", IntervalCount, ForwardCount, BackwardCount);
+ switch (CountingFlag)
+ {
+ case 1:
+ g_StatsIntervalsBeforeMerge = IntervalCount;
+ g_StatsEdgesBeforeMerge = ForwardCount;
+ break;
+ case 2:
+ g_StatsIntervalsAfterMerge = IntervalCount;
+ g_StatsEdgesAfterMerge = ForwardCount;
+ break;
+ case 3:
+ g_StatsIntervalsAfterIntersect = IntervalCount;
+ g_StatsEdgesAfterIntersect = ForwardCount;
+ break;
+ default:
+ break;
+ }
+}
+
+// Add a new interval to the master list. Or, if that interval has already been
+// seen, increment its count. We use g_IntervalIndex to jump to *approximately* the right place
+// in the global list of intervals, then scan forward or backward to find exactly the right spot.
+IntervalNode* AddInterval(int Start, int End, int Flags)
+{
+ IntervalNode* OldInterval;
+ IntervalNode* NewInterval;
+ int Bin;
+ int IterateBin;
+ //
+ if (!g_IntervalIndex)
+ {
+ // Somewhat hacky: Hard-coded size of 25000, large enough to cover human chromosome #1
+ g_IntervalIndex = (IntervalNode**)calloc(25000, sizeof(IntervalNode*));
+ }
+ Bin = Start / 10000;
+ OldInterval = g_IntervalIndex[Bin];
+ if (!OldInterval)
+ {
+ // This interval's start position is larger than any seen before!
+ // Insert the interval at the end of the global list:
+ NewInterval = (IntervalNode*)calloc(sizeof(IntervalNode), 1);
+ NewInterval->Occurrences = 1;
+ NewInterval->Start = Start;
+ NewInterval->End = End;
+ NewInterval->Flags = Flags;
+ if (g_LastInterval)
+ {
+ g_LastInterval->Next = NewInterval;
+ }
+ NewInterval->Prev = g_LastInterval;
+ if (!g_FirstInterval)
+ {
+ g_FirstInterval = NewInterval;
+ }
+ g_LastInterval = NewInterval;
+ // Update the index:
+ for (IterateBin = Bin; IterateBin >= 0; IterateBin--)
+ {
+ if (g_IntervalIndex[IterateBin])
+ {
+ break;
+ }
+ g_IntervalIndex[IterateBin] = NewInterval;
+ }
+ return NewInterval;
+ }
+ // Next case: OldInterval is exactly right:
+ if (Start == OldInterval->Start && End == OldInterval->End)
+ {
+ OldInterval->Occurrences++;
+ OldInterval->Flags |= Flags;
+ return OldInterval;
+ }
+ // Next case: OldInterval precedes this interval.
+ if (Start > OldInterval->Start || (Start == OldInterval->Start && End > OldInterval->End))
+ {
+ // Iterate forward until OldInterval is NULL or OldInterval comes AFTER the new interval:
+ for (; OldInterval; OldInterval = OldInterval->Next)
+ {
+ if (OldInterval->Start > Start)
+ {
+ break;
+ }
+ if (OldInterval->Start == Start && OldInterval->End > End)
+ {
+ break;
+ }
+ if (OldInterval->Start == Start && OldInterval->End == End)
+ {
+ OldInterval->Occurrences++;
+ OldInterval->Flags |= Flags;
+ return OldInterval;
+ }
+ }
+ NewInterval = (IntervalNode*)calloc(sizeof(IntervalNode), 1);
+ NewInterval->Occurrences = 1;
+ NewInterval->Start = Start;
+ NewInterval->End = End;
+ NewInterval->Flags = Flags;
+ if (!OldInterval)
+ {
+ // The new interval comes at the END of the list:
+ if (g_LastInterval)
+ {
+ g_LastInterval->Next = NewInterval;
+ }
+ NewInterval->Prev = g_LastInterval;
+ g_LastInterval = NewInterval;
+ }
+ else
+ {
+ // Insert new interval just before OldInterval:
+ if (OldInterval->Prev)
+ {
+ OldInterval->Prev->Next = NewInterval;
+ }
+ NewInterval->Prev = OldInterval->Prev;
+ NewInterval->Next = OldInterval;
+ OldInterval->Prev = NewInterval;
+ }
+ return NewInterval;
+ }
+ else
+ {
+ // Last case: The new interval immediately precedes OldInterval.
+ NewInterval = (IntervalNode*)calloc(sizeof(IntervalNode), 1);
+ NewInterval->Occurrences = 1;
+ NewInterval->Start = Start;
+ NewInterval->End = End;
+ NewInterval->Flags = Flags;
+ if (OldInterval->Prev)
+ {
+ OldInterval->Prev->Next = NewInterval;
+ }
+ if (g_FirstInterval == OldInterval)
+ {
+ g_FirstInterval = NewInterval;
+ }
+ NewInterval->Prev = OldInterval->Prev;
+ NewInterval->Next = OldInterval;
+ OldInterval->Prev = NewInterval;
+ for (IterateBin = Bin; IterateBin >= 0; IterateBin--)
+ {
+ if (g_IntervalIndex[IterateBin] && (g_IntervalIndex[IterateBin]->Start < Start || (g_IntervalIndex[IterateBin]->Start == Start && g_IntervalIndex[IterateBin]->End < End)))
+ {
+ break;
+ }
+ g_IntervalIndex[IterateBin] = NewInterval;
+ }
+ return NewInterval;
+ }
+ //if (!FirstInterval)
+ //{
+ // NewInterval = (IntervalNode*)calloc(sizeof(IntervalNode), 1);
+ // NewInterval->Occurrences = 1;
+ // NewInterval->Start = StartPos;
+ // NewInterval->End = EndPos;
+ // NewInterval->Flags = Flags;
+ // FirstInterval = NewInterval;
+ // LastInterval = NewInterval;
+ // return NewInterval;
+ //}
+ //// After this loop, Interval is the last one before the new guy (or NULL, if the new guy
+ //// belongs at the start of the list), and NextInterval is the first one after the
+ //// new guy (or NULL, if the new guy belongs at the end of the list).
+ //for (Interval = LastInterval; Interval; Interval = Interval->Prev)
+ //{
+ // if (Interval->Start == StartPos)
+ // {
+ // if (Interval->End > EndPos)
+ // {
+ // NextInterval = Interval;
+ // continue;
+ // }
+ // if (Interval->End == EndPos)
+ // {
+ // Interval->Occurrences++;
+ // Interval->Flags |= Flags;
+ // return Interval;
+ // }
+ // break;
+ // }
+ // if (Interval->Start < StartPos)
+ // {
+ // break;
+ // }
+ // NextInterval = Interval;
+ //}
+
+ //NewInterval = (IntervalNode*)calloc(sizeof(IntervalNode), 1);
+ //NewInterval->Occurrences = 1;
+ //NewInterval->Start = StartPos;
+ //NewInterval->End = EndPos;
+ //NewInterval->Flags = Flags;
+
+ //if (!Interval)
+ //{
+ // FirstInterval->Prev = NewInterval;
+ // NewInterval->Next = FirstInterval;
+ // FirstInterval = NewInterval;
+ // return NewInterval;
+ //}
+ //Interval->Next = NewInterval;
+ //NewInterval->Prev = Interval;
+ //NewInterval->Next = NextInterval;
+ //if (NextInterval)
+ //{
+ // NextInterval->Prev = NewInterval;
+ //}
+ //else
+ //{
+ // LastInterval = NewInterval;
+ //}
+ //return NewInterval;
+
+}
+
+// Link forward from interval A to interval B.
+void LinkIntervals(IntervalNode* A, IntervalNode* B, int Count, float Score)
+{
+ EdgeNode* OldEdge;
+ EdgeNode* NewEdge;
+ int Linked;
+ //
+ Linked = 0;
+ for (OldEdge = A->FirstForward; OldEdge; OldEdge = OldEdge->Next)
+ {
+ if (OldEdge->Interval == B)
+ {
+ OldEdge->Count += Count;
+ Linked = 1;
+ }
+ }
+ if (!Linked)
+ {
+ NewEdge = (EdgeNode*)calloc(sizeof(EdgeNode), 1);
+ NewEdge->Count = Count;
+ NewEdge->Score = Score;
+ NewEdge->Interval = B;
+ if (!A->FirstForward)
+ {
+ A->FirstForward = NewEdge;
+ }
+ else
+ {
+ A->LastForward->Next = NewEdge;
+ NewEdge->Prev = A->LastForward;
+ }
+ A->LastForward = NewEdge;
+ }
+ Linked = 0;
+ for (OldEdge = B->FirstBack; OldEdge; OldEdge = OldEdge->Next)
+ {
+ if (OldEdge->Interval == A)
+ {
+ OldEdge->Count += Count;
+ Linked = 1;
+ }
+ }
+ if (!Linked)
+ {
+ NewEdge = (EdgeNode*)calloc(sizeof(EdgeNode), 1);
+ NewEdge->Count = Count;
+ NewEdge->Score = Score;
+ NewEdge->Interval = A;
+ if (!B->FirstBack)
+ {
+ B->FirstBack = NewEdge;
+ }
+ else
+ {
+ B->LastBack->Next = NewEdge;
+ NewEdge->Prev = B->LastBack;
+ }
+ B->LastBack = NewEdge;
+ }
+}
+
+// Copied-and-modified from ParseIntervalsESTBinaryFile.
+// Parse intervals from a binary file, with reading-frame flags attached.
+void ParseIntervalsGeneFindBinaryFile(char* FileName)
+{
+ FILE* File;
+ IntervalNode* Interval;
+ IntervalNode* BackInterval;
+ int Start;
+ int End;
+ int BytesRead;
+ int Score;
+ int Flags;
+ int FilePos = 0;
+ int TotalIntervals = 0;
+ int JunctionCount = 0;
+ int JunctionIndex;
+ int JunctionStart;
+ float JunctionScore;
+ int TotalJunctions = 0;
+ int BackIntervalFound;
+ //
+ File = fopen(FileName, "rb");
+ if (!File)
+ {
+ printf("** Error in ParseIntervalsBinaryFile: Can't open '%s'\n", FileName);
+ return;
+ }
+ while (1)
+ {
+
+ BytesRead = ReadBinary(&Start, sizeof(int), 1, File);
+ if (!BytesRead)
+ {
+ break;
+ }
+ FilePos += BytesRead;
+ BytesRead += ReadBinary(&End, sizeof(int), 1, File);
+ // Sanity check:
+ if (Start<0 || End<0 || End<=Start)
+ {
+ printf("** BARF: Gene finder output reports interval from %d to %d!\n", Start, End);
+ }
+ BytesRead += ReadBinary(&Flags, sizeof(int), 1, File);
+ BytesRead += ReadBinary(&Score, sizeof(int), 1, File);
+
+ Interval = AddInterval(Start, End, Flags);
+ TotalIntervals++;
+ //FilePos += ReadBinary(&Interval->Occurrences, sizeof(int), 1, File);
+ FilePos += ReadBinary(&JunctionCount, sizeof(int), 1, File);
+ // Read a list of junctions that END at this interval.
+ for (JunctionIndex = 0; JunctionIndex < JunctionCount; JunctionIndex++)
+ {
+ FilePos += ReadBinary(&JunctionStart, sizeof(int), 1, File);
+ //FilePos += ReadBinary(&JunctionOccurrences, sizeof(int), 1, File);
+ FilePos += ReadBinary(&JunctionScore, sizeof(float), 1, File);
+ // Right here is where we filter crummy splice junctions:
+ //if (JunctionOccurrences < 2 && JunctionScore < SPLICE_SIGNAL_SCORE_CUTOFF)
+ //{
+ // continue;
+ //}
+ TotalJunctions++;
+ // Find an interval which ends at the junction's splice point:
+ BackIntervalFound = 0;
+ for (BackInterval = Interval->Prev; BackInterval; BackInterval = BackInterval->Prev)
+ {
+ if (BackInterval->End == JunctionStart)
+ {
+ BackIntervalFound = 1;
+ LinkIntervals(BackInterval, Interval, 1, JunctionScore);
+ break;
+ }
+ }
+ if (!BackIntervalFound)
+ {
+ printf("** Warning: Found a junction with no back-interval!\n");
+ printf(" Junction %d %f\n", JunctionStart, JunctionScore);
+ printf(" Interval %d-%d\n", Interval->Start, Interval->End);
+ }
+ }
+ }
+ fclose(File);
+ printf("Read %d intervals, %d junctions.\n", TotalIntervals, TotalJunctions);
+}
+
+// Parse intervals from a binary file containing Interval records. Each Interval
+// record may contain a list of Junction records.
+// Interval record: IntervalStart, IntervalEnd, IntervalCount, JunctionCount
+// Junction record: Start, Count, Score
+// The list contains all junctions that END at the START of this interval. (That way,
+// we can look *back* through the list to find the splice donor)
+// We can filter any junctions that don't have a good occurrence-Count or a good
+// consensus splice signal Score.
+// Note that EST intervals have no particular reading frame specified.
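+// Sketch of one on-disk record as read below (ints and floats via ReadBinary):
+//   int   Start
+//   int   End
+//   int   Occurrences
+//   int   JunctionCount
+//   then, JunctionCount times:
+//     int   JunctionStart
+//     int   JunctionOccurrences
+//     float JunctionScore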
+void ParseIntervalsESTBinaryFile(char* FileName)
+{
+ FILE* File;
+ IntervalNode* Interval;
+ IntervalNode* BackInterval;
+ int Start;
+ int End;
+ int BytesRead;
+ int FilePos = 0;
+ int TotalIntervals = 0;
+ int JunctionCount;
+ int JunctionIndex;
+ int JunctionStart;
+ int JunctionOccurrences;
+ float JunctionScore;
+ int BackIntervalFound;
+ int TotalJunctions = 0;
+ //
+ File = fopen(FileName, "rb");
+ if (!File)
+ {
+ printf("** Error in ParseIntervalsBinaryFile: Can't open '%s'\n", FileName);
+ return;
+ }
+ while (1)
+ {
+
+ BytesRead = ReadBinary(&Start, sizeof(int), 1, File);
+ if (!BytesRead)
+ {
+ break;
+ }
+ FilePos += BytesRead;
+ BytesRead += ReadBinary(&End, sizeof(int), 1, File);
+ Interval = AddInterval(Start, End, IFLAG_ALL_READING_FRAMES);
+ TotalIntervals++;
+ FilePos += ReadBinary(&Interval->Occurrences, sizeof(int), 1, File);
+ FilePos += ReadBinary(&JunctionCount, sizeof(int), 1, File);
+ for (JunctionIndex = 0; JunctionIndex < JunctionCount; JunctionIndex++)
+ {
+ FilePos += ReadBinary(&JunctionStart, sizeof(int), 1, File);
+ FilePos += ReadBinary(&JunctionOccurrences, sizeof(int), 1, File);
+ FilePos += ReadBinary(&JunctionScore, sizeof(float), 1, File);
+ // Right here is where we filter crummy splice junctions:
+ if (JunctionOccurrences < 2 && JunctionScore < SPLICE_SIGNAL_SCORE_CUTOFF)
+ {
+ continue;
+ }
+ TotalJunctions++;
+ // Find an interval which ends at the junction's splice point:
+ BackIntervalFound = 0;
+ for (BackInterval = Interval->Prev; BackInterval; BackInterval = BackInterval->Prev)
+ {
+ if (BackInterval->End == JunctionStart)
+ {
+ BackIntervalFound = 1;
+ LinkIntervals(BackInterval, Interval, JunctionOccurrences, JunctionScore);
+ break;
+ }
+ }
+ if (!BackIntervalFound)
+ {
+ printf("** Warning: Found a junction with no back-interval!\n");
+ printf(" Junction %d %d %f\n", JunctionStart, JunctionOccurrences, JunctionScore);
+ printf(" Interval %d-%d\n", Interval->Start, Interval->End);
+ }
+ }
+ }
+ fclose(File);
+ printf("Read %d intervals, %d junctions.\n", TotalIntervals, TotalJunctions);
+}
+
+// B inherits all backward links from A.
+// before: after:
+// C<->A C<-\ A
+// B \->B
+void AssimilateLinksBack(IntervalNode* A, IntervalNode* B)
+{
+ EdgeNode* NodeA;
+ EdgeNode* PrevA;
+ EdgeNode* NodeB;
+ EdgeNode* NodeC;
+ EdgeNode* Next;
+ IntervalNode* C;
+ int Found;
+ int ACStrength = 0;
+ int BCStrength = 0;
+ //
+
+ for (NodeA = A->FirstBack; NodeA; NodeA = NodeA->Next)
+ {
+ ACStrength = NodeA->Count;
+ BCStrength = 0;
+ // Ensure that B has a link to this target:
+ Found = 0;
+ for (NodeB = B->FirstBack; NodeB; NodeB = NodeB->Next)
+ {
+ if (NodeB->Interval == NodeA->Interval)
+ {
+ BCStrength = NodeB->Count;
+ //NodeB->Count += ACStrength; // Counts are already full
+ Found = 1;
+ break;
+ }
+ }
+ // If B didn't already link back to the target, add an EdgeNode to B's list:
+ if (!Found)
+ {
+ NodeB = (EdgeNode*)calloc(sizeof(EdgeNode), 1);
+ NodeB->Interval = NodeA->Interval;
+ NodeB->Count = ACStrength;
+ if (B->LastBack)
+ {
+ NodeB->Prev = B->LastBack;
+ B->LastBack->Next = NodeB;
+ }
+ else
+ {
+ B->FirstBack = NodeB;
+ }
+ B->LastBack = NodeB;
+ }
+ // Switch C to point to B. It's possible that C has a pointer to B already, in which case
+ // we'll free the old one.
+ C = NodeA->Interval;
+ for (NodeC = C->FirstForward; NodeC; NodeC = NodeC->Next)
+ {
+ if (NodeC->Interval == B)
+ {
+ //FoundCount += NodeC->Count;
+ Next = NodeC->Next;
+ if (Next)
+ {
+ Next->Prev = NodeC->Prev;
+ }
+ if (NodeC->Prev)
+ {
+ NodeC->Prev->Next = Next;
+ }
+ if (C->FirstForward == NodeC)
+ {
+ C->FirstForward = NodeC->Next;
+ }
+ if (C->LastForward == NodeC)
+ {
+ C->LastForward = NodeC->Prev;
+ }
+ SafeFree(NodeC);
+ NodeC = Next;
+ if (!NodeC)
+ {
+ break;
+ }
+ }
+ }
+ Found = 0;
+ for (NodeC = C->FirstForward; NodeC; NodeC= NodeC->Next)
+ {
+ if (NodeC->Interval == A)
+ {
+ //NodeC->Count += FoundCount;
+ //NodeC->Count = ACStrength + BCStrength; // Counts are already full
+ NodeC->Count = ACStrength;
+ NodeC->Interval = B;
+ Found = 1;
+ }
+ }
+ if (!Found)
+ {
+ printf("*** Corruption! %d-%d should link forward to %d-%d\n", C->Start, C->End, A->Start, A->End);
+ }
+ }
+
+ // Free the old nodes:
+ PrevA = NULL;
+ for (NodeA = A->FirstBack; NodeA; NodeA = NodeA->Next)
+ {
+ SafeFree(PrevA);
+ PrevA = NodeA;
+ }
+ SafeFree(PrevA);
+ A->FirstBack = NULL;
+ A->LastBack = NULL;
+
+}
+
+// B inherits all the forward links from A.
+void AssimilateLinksForward(IntervalNode* A, IntervalNode* B)
+{
+ EdgeNode* NodeA;
+ EdgeNode* PrevA;
+ EdgeNode* NodeB;
+ EdgeNode* NodeC;
+ EdgeNode* Next;
+ IntervalNode* C;
+ int Found;
+ int ACStrength = 0;
+ int BCStrength = 0;
+ //
+ for (NodeA = A->FirstForward; NodeA; NodeA = NodeA->Next)
+ {
+ ACStrength = NodeA->Count;
+ BCStrength = 0;
+ // Ensure that B has a link to this target:
+ Found = 0;
+ for (NodeB = B->FirstForward; NodeB; NodeB = NodeB->Next)
+ {
+ if (NodeB->Interval == NodeA->Interval)
+ {
+ BCStrength = NodeB->Count;
+ //NodeB->Count += ACStrength; // Counts are already full
+ Found = 1;
+ break;
+ }
+ }
+ if (!Found)
+ {
+ NodeB = (EdgeNode*)calloc(sizeof(EdgeNode), 1);
+ NodeB->Interval = NodeA->Interval;
+ NodeB->Count = ACStrength;
+ if (B->LastForward)
+ {
+ NodeB->Prev = B->LastForward;
+ B->LastForward->Next = NodeB;
+ }
+ else
+ {
+ B->FirstForward = NodeB;
+ }
+ B->LastForward = NodeB;
+ }
+ // Switch C to point to B. It's possible that C has a pointer to B already, in which case
+ // we'll free the old one.
+ C = NodeA->Interval;
+ for (NodeC = C->FirstBack; NodeC; NodeC= NodeC->Next)
+ {
+ if (NodeC->Interval == B)
+ {
+ //FoundCount += NodeC->Count;
+ Next = NodeC->Next;
+ if (Next)
+ {
+ Next->Prev = NodeC->Prev;
+ }
+ if (NodeC->Prev)
+ {
+ NodeC->Prev->Next = Next;
+ }
+ if (C->FirstBack == NodeC)
+ {
+ C->FirstBack = NodeC->Next;
+ }
+ if (C->LastBack == NodeC)
+ {
+ C->LastBack = NodeC->Prev;
+ }
+ SafeFree(NodeC);
+ NodeC = Next;
+ if (!NodeC)
+ {
+ break;
+ }
+ }
+ }
+ Found = 0;
+ for (NodeC = C->FirstBack; NodeC; NodeC = NodeC->Next)
+ {
+ if (NodeC->Interval == A)
+ {
+ //NodeC->Count = ACStrength + BCStrength;// Counts are already full
+ NodeC->Count = ACStrength;
+ NodeC->Interval = B;
+ Found = 1;
+ }
+ }
+ if (!Found)
+ {
+ printf("*** Corruption! %d-%d should link backward to %d-%d\n", C->Start, C->End, A->Start, A->End);
+ }
+ }
+
+ // Free the old nodes:
+ PrevA = NULL;
+ for (NodeA = A->FirstForward; NodeA; NodeA = NodeA->Next)
+ {
+ SafeFree(PrevA);
+ PrevA = NodeA;
+ }
+ SafeFree(PrevA);
+ A->FirstForward = NULL;
+ A->LastForward = NULL;
+}
+void FreeIntervalDAG(IntervalNode* Interval)
+{
+ int DAGNodeIndex;
+ GenomeDAGNode* DAGNode;
+ GenomeDAGLink* Link;
+ GenomeDAGLink* PrevLink;
+ //
+ if (!Interval || !Interval->DAGNodes)
+ {
+ return;
+ }
+ for (DAGNodeIndex = 0; DAGNodeIndex < Interval->DAGNodeCount; DAGNodeIndex++)
+ {
+ DAGNode = Interval->DAGNodes + DAGNodeIndex;
+ SafeFree(DAGNode->Sequence);
+ SafeFree(DAGNode->Exons);
+ // Free links back:
+ PrevLink = NULL;
+ for (Link = DAGNode->FirstBack; Link; Link = Link->Next)
+ {
+ SafeFree(PrevLink);
+ PrevLink = Link;
+ }
+ SafeFree(PrevLink);
+ // Free links forward:
+ PrevLink = NULL;
+ for (Link = DAGNode->FirstForward; Link; Link = Link->Next)
+ {
+ SafeFree(PrevLink);
+ PrevLink = Link;
+ }
+ SafeFree(PrevLink);
+ }
+ SafeFree(Interval->DAGNodes);
+ Interval->DAGNodes = NULL;
+ Interval->DAGNodeCount = 0;
+}
+
+void FreeInterval(IntervalNode* Interval)
+{
+ //
+ FreeIntervalDAG(Interval);
+ FreeIntervalExons(Interval);
+
+ Interval->FirstForward = NULL;
+ Interval->FirstBack = NULL;
+ Interval->LastForward = NULL;
+ Interval->LastBack = NULL;
+ Interval->Start = -1;
+ Interval->End = -1;
+ SafeFree(Interval);
+}
+
+// Remove an interval from the master list. And USUALLY, free the
+// interval and its edges. If DontFree is true, then don't free
+// any memory yet.
+void RemoveInterval(IntervalNode* Interval, int DontFree)
+{
+ EdgeNode* Prev;
+ EdgeNode* Edge;
+ EdgeNode* NeighborEdge;
+ if (Interval == g_FirstInterval)
+ {
+ g_FirstInterval = Interval->Next;
+ }
+ if (Interval == g_LastInterval)
+ {
+ g_LastInterval = Interval->Prev;
+ }
+ if (Interval->Prev)
+ {
+ Interval->Prev->Next = Interval->Next;
+ }
+ if (Interval->Next)
+ {
+ Interval->Next->Prev = Interval->Prev;
+ }
+ if (!DontFree)
+ {
+ Prev = NULL;
+ Edge = Interval->FirstForward;
+ while (Edge)
+ {
+ SafeFree(Prev);
+ Prev = Edge;
+ // If someone points at us, free their pointer (to avoid corruption!)
+ for (NeighborEdge = Edge->Interval->FirstBack; NeighborEdge; NeighborEdge = NeighborEdge->Next)
+ {
+ if (NeighborEdge->Interval == Interval)
+ {
+ if (Edge->Interval->FirstBack == NeighborEdge)
+ {
+ Edge->Interval->FirstBack = NeighborEdge->Next;
+ }
+ if (Edge->Interval->LastBack == NeighborEdge)
+ {
+ Edge->Interval->LastBack = NeighborEdge->Prev;
+ }
+ if (NeighborEdge->Prev)
+ {
+ NeighborEdge->Prev->Next = NeighborEdge->Next;
+ }
+ if (NeighborEdge->Next)
+ {
+ NeighborEdge->Next->Prev = NeighborEdge->Prev;
+ }
+ SafeFree(NeighborEdge);
+ break;
+ }
+ }
+
+ Edge = Edge->Next;
+ }
+ SafeFree(Prev);
+ //
+ Prev = NULL;
+ Edge = Interval->FirstBack;
+ while (Edge)
+ {
+ SafeFree(Prev);
+ Prev = Edge;
+ // If someone points at us, free their pointer (to avoid corruption!)
+ for (NeighborEdge = Edge->Interval->FirstForward; NeighborEdge; NeighborEdge = NeighborEdge->Next)
+ {
+ if (NeighborEdge->Interval == Interval)
+ {
+ if (Edge->Interval->FirstForward == NeighborEdge)
+ {
+ Edge->Interval->FirstForward = NeighborEdge->Next;
+ }
+ if (Edge->Interval->LastForward == NeighborEdge)
+ {
+ Edge->Interval->LastForward = NeighborEdge->Prev;
+ }
+ if (NeighborEdge->Prev)
+ {
+ NeighborEdge->Prev->Next = NeighborEdge->Next;
+ }
+ if (NeighborEdge->Next)
+ {
+ NeighborEdge->Next->Prev = NeighborEdge->Prev;
+ }
+ SafeFree(NeighborEdge);
+ break;
+ }
+ }
+ Edge = Edge->Next;
+ }
+ SafeFree(Prev);
+ FreeInterval(Interval);
+ }
+}
+
+// Merge all redundant intervals. Intervals which overlap, and
+// have no incompatible edges, can be merged into one large(r) interval.
+// The merged interval inherits all reading frames of the subintervals.
+// (This could add some redundancy, but not much, especially if we later
+// prune short ORFs)
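+ //
+ // Example: if interval 100-200 has no forward edges and is followed in the list
+ // by 100-250, case 1 below applies: the back-links of 100-200 are assimilated
+ // into 100-250, its occurrence count and reading-frame flags are folded in, and
+ // 100-200 is removed from the master list.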
+void MergeIntervals()
+{
+ IntervalNode* MergeA;
+ IntervalNode* NextMergeA;
+ IntervalNode* MergeB;
+ int MergePerformed = 0;
+ int TotalMergesPerformed = 0;
+ //
+
+ NextMergeA = g_FirstInterval;
+ while (1)
+ {
+
+ MergeA = NextMergeA;
+ if (!MergeA)
+ {
+ break;
+ }
+ MergePerformed = 0;
+ MergeB = MergeA->Next;
+ while (1)
+ {
+ if (MergePerformed)
+ {
+ TotalMergesPerformed++;
+ //DebugPrintIntervals(-1, 0);
+ break;
+ }
+ if (!MergeB || MergeB->Start > MergeA->End)
+ {
+ NextMergeA = MergeA->Next;
+ break;
+ }
+ // Case 0: Identical!
+ if (MergeA->Start == MergeB->Start && MergeA->End == MergeB->End)
+ {
+ //printf("%% [0] Merge two identical intervals %d-%d\n", MergeA->Start, MergeA->End);
+ AssimilateLinksBack(MergeB, MergeA);
+ AssimilateLinksForward(MergeB, MergeA);
+ MergeA->Occurrences += MergeB->Occurrences;
+ MergeA->Flags |= MergeB->Flags;
+ RemoveInterval(MergeB, 0);
+ MergePerformed = 1;
+ }
+ // Case 1: Same starting point, A doesn't link forward:
+ else if (MergeA->Start == MergeB->Start && !MergeA->FirstForward)
+ {
+ //printf("%% [1] Same starting point: %d-%d, %d-%d\n", MergeA->Start, MergeA->End, MergeB->Start, MergeB->End);
+ AssimilateLinksBack(MergeA, MergeB);
+ MergeB->Occurrences += MergeA->Occurrences;
+ MergeB->Flags |= MergeA->Flags;
+ NextMergeA = MergeA->Next;
+ RemoveInterval(MergeA, 0);
+ MergePerformed = 1;
+ }
+ // Case 2: Same ending point, B doesn't link backward:
+ else if (MergeA->End == MergeB->End && !MergeB->FirstBack)
+ {
+ //printf("%% [2] Same ending point: %d-%d, %d-%d\n", MergeA->Start, MergeA->End, MergeB->Start, MergeB->End);
+ AssimilateLinksForward(MergeB, MergeA);
+ MergeA->Occurrences += MergeB->Occurrences;
+ MergeA->Flags |= MergeB->Flags;
+ NextMergeA = MergeA;
+ RemoveInterval(MergeB, 0);
+ MergePerformed = 1;
+ }
+ // Case 3: Full overlap, no links in B:
+ else if (MergeA->Start < MergeB->Start && MergeB->End < MergeA->End && !MergeB->FirstBack && !MergeB->FirstForward)
+ {
+ //printf("%% [3] full overlap: %d-%d, %d-%d\n", MergeA->Start, MergeA->End, MergeB->Start, MergeB->End);
+ MergeA->Occurrences += MergeB->Occurrences;
+ MergeA->Flags |= MergeB->Flags;
+ NextMergeA = MergeA;
+ RemoveInterval(MergeB, 0);
+ MergePerformed = 1;
+ }
+ // Case 4: 'proper' overlap, A no forward, B no backward:
+ else if (MergeB->Start > MergeA->Start && MergeB->End > MergeA->End && !MergeA->FirstForward && !MergeB->FirstBack)
+ {
+ //printf("%% [4] Proper overlap: %d-%d, %d-%d\n", MergeA->Start, MergeA->End, MergeB->Start, MergeB->End);
+ MergeA->End = MergeB->End;
+ AssimilateLinksForward(MergeB, MergeA);
+ MergeA->Occurrences += MergeB->Occurrences;
+ MergeA->Flags |= MergeB->Flags;
+ NextMergeA = MergeA;
+ RemoveInterval(MergeB, 0);
+ MergePerformed = 1;
+ }
+ else
+ {
+ // Default case: Non-mergable
+ MergeB = MergeB->Next;
+ }
+ }
+ } // Iterate MergeA
+ printf("Performed a total of %d merges.\n", TotalMergesPerformed);
+}
+
+// If two intervals intersect, then we don't want to store separate exons for each one. That would be a lot of
+// redundant sequence data! So, after calling MergeIntervals, we call IntersectIntervals().
+// The routine IntersectIntervals will produce a (near-)minimal disjoint set of intervals covering all the
+// ESTs and splice boundaries we've ever seen. The intersection interval inherits all reading frames from its
+// parents.
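+ //
+ // Example: intervals 100-300 and 150-300 share an end point, so the same-end case
+ // below splits off C = 100-150, keeps B = 150-300, links C forward to B, moves
+ // A's back-links to C and its forward links to B, and then removes A = 100-300.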
+void IntersectIntervals()
+{
+ IntervalNode* A;
+ IntervalNode* NextA;
+ IntervalNode* B;
+ IntervalNode* C;
+ IntervalNode* D;
+ int IntersectPerformed = 0;
+ //
+
+ NextA = g_FirstInterval;
+ while (1)
+ {
+ A = NextA;
+ if (!A)
+ {
+ break;
+ }
+ if (IntersectPerformed)
+ {
+ //DebugPrintIntervals(1, 0);
+ }
+ IntersectPerformed = 0;
+ B = A->Next;
+ if (!B)
+ {
+ break;
+ }
+ // Easy case: B starts after A ends. No intersection required:
+ if (B->Start >= A->End)
+ {
+ NextA = A->Next;
+ continue;
+ }
+ if (B->Start == A->Start && B->End == A->End)
+ {
+ //printf("%d-%d is identical to %d-%d\n", A->Start, A->End, B->Start, B->End);
+ AssimilateLinksForward(B, A);
+ AssimilateLinksBack(B, A);
+ A->Occurrences += B->Occurrences;
+ A->Flags |= B->Flags;
+ NextA = A;
+ RemoveInterval(B, 0);
+ IntersectPerformed = 1;
+ continue;
+ }
+ if (B->Start == A->Start)
+ {
+ // |----| A
+ // |-----------| B
+ //
+ // |----|------|
+ // A C
+ //printf("%d-%d has same START as %d-%d\n", A->Start, A->End, B->Start, B->End);
+ C = (IntervalNode*)calloc(sizeof(IntervalNode), 1);
+ C->Start = A->End;
+ C->End = B->End;
+ C->Occurrences = B->Occurrences;
+ C->Flags = B->Flags;
+ A->Flags |= B->Flags;
+ AssimilateLinksForward(B, C);
+ AssimilateLinksBack(B, A);
+ LinkIntervals(A, C, 0, 0);
+ RemoveInterval(B, 0);
+ C = InsertIntervalAfter(C, A);
+ NextA = A;
+ IntersectPerformed = 1;
+ continue;
+ }
+ if (B->End == A->End)
+ {
+ // |-----------| A
+ // |-----| B
+ //
+ // |----|------|
+ // C B
+
+ //printf("%d-%d has same END as %d-%d\n", A->Start, A->End, B->Start, B->End);
+ NextA = A->Prev;
+ C = (IntervalNode*)calloc(sizeof(IntervalNode), 1);
+ C->Start = A->Start;
+ C->End = B->Start;
+ C->Occurrences = A->Occurrences;
+ C->Flags = A->Flags;
+ B->Flags |= A->Flags;
+ AssimilateLinksForward(A, B);
+ AssimilateLinksBack(A, C);
+ LinkIntervals(C, B, 0, 0);
+ RemoveInterval(A, 0);
+ C = InsertIntervalBefore(C, B);
+ if (!NextA)
+ {
+ NextA = g_FirstInterval;
+ }
+ IntersectPerformed = 1;
+ continue;
+ }
+ // |---------------| A
+ // |---| B
+ //
+ // |-----|---|-----|
+ // C B D
+ if (B->End < A->End)
+ {
+ //printf("%d-%d CONTAINS %d-%d\n", A->Start, A->End, B->Start, B->End);
+ C = (IntervalNode*)calloc(sizeof(IntervalNode), 1);
+ C->Start = A->Start;
+ C->End = B->Start;
+ C->Flags = A->Flags;
+ D = (IntervalNode*)calloc(sizeof(IntervalNode), 1);
+ D->Start = B->End;
+ D->End = A->End;
+ D->Flags = A->Flags;
+ B->Flags |= A->Flags;
+ AssimilateLinksBack(A, C);
+ AssimilateLinksForward(A, D);
+ LinkIntervals(C, B, 0, 0);
+ LinkIntervals(B, D, 0, 0);
+ C = InsertIntervalBefore(C, B);
+ D = InsertIntervalAfter(D, B);
+ RemoveInterval(A, 0);
+ NextA = C;
+ IntersectPerformed = 1;
+ continue;
+ }
+ // |-------------| A
+ // |---------| B
+ //
+ // |--------|----|----|
+ // C B D
+ //printf("%d-%d has PROPER OVERLAP with %d-%d\n", A->Start, A->End, B->Start, B->End);
+ C = (IntervalNode*)calloc(sizeof(IntervalNode), 1);
+ C->Start = A->Start;
+ C->End = B->Start;
+ C->Occurrences = A->Occurrences;
+ C->Flags = A->Flags;
+ D = (IntervalNode*)calloc(sizeof(IntervalNode), 1);
+ D->Start = A->End;
+ D->End = B->End;
+ D->Occurrences = B->Occurrences;
+ D->Flags = B->Flags;
+ B->Flags |= A->Flags;
+ //B2 = (IntervalNode*)calloc(sizeof(IntervalNode), 1);
+ AssimilateLinksBack(A, C);
+ AssimilateLinksForward(B, D);
+ AssimilateLinksForward(A, B);
+ LinkIntervals(C, B, 0, 0);
+ LinkIntervals(B, D, 0, 0);
+ C = InsertIntervalBefore(C, B);
+ D = InsertIntervalAfter(D, B);
+ RemoveInterval(B, 1);
+ B->End = A->End;
+ RemoveInterval(A, 0);
+ B = InsertIntervalBefore(B, D);
+ NextA = C;
+ IntersectPerformed = 1;
+ continue;
+ }
+}
+
+// Insert Interval into the master list. It comes after After.
+IntervalNode* InsertIntervalAfter(IntervalNode* Interval, IntervalNode* After)
+{
+ IntervalNode* Node;
+ //
+ Node = After;
+ if (!Node)
+ {
+ Node = g_FirstInterval;
+ }
+ while (Node)
+ {
+ if (Node->Start > Interval->Start)
+ {
+ break;
+ }
+ if (Node->Start == Interval->Start)
+ {
+ if (Node->End == Interval->End)
+ {
+ AssimilateLinksForward(Interval, Node);
+ AssimilateLinksBack(Interval, Node);
+ Node->Occurrences += Interval->Occurrences;
+ SafeFree(Interval);
+ return Node;
+ }
+ if (Node->End > Interval->End)
+ {
+ break;
+ }
+ }
+ Node = Node->Next;
+ }
+ // At this point, Node is the guy that Interval will be inserted before:
+ if (!Node)
+ {
+ g_LastInterval->Next = Interval;
+ Interval->Prev = g_LastInterval;
+ g_LastInterval = Interval;
+ }
+ else
+ {
+ if (Node->Prev)
+ {
+ Node->Prev->Next = Interval;
+ }
+ Interval->Prev = Node->Prev;
+ Interval->Next = Node;
+ Node->Prev = Interval;
+ }
+ return Interval;
+}
+
+// Insert Interval into the master list. It comes before Before.
+IntervalNode* InsertIntervalBefore(IntervalNode* Interval, IntervalNode* Before)
+{
+ IntervalNode* Node;
+ //
+ Node = Before;
+ if (!Node)
+ {
+ Node = g_LastInterval;
+ }
+ while (Node)
+ {
+ if (Node->Start < Interval->Start)
+ {
+ break;
+ }
+ if (Node->Start == Interval->Start)
+ {
+ if (Node->End == Interval->End)
+ {
+ AssimilateLinksForward(Interval, Node);
+ AssimilateLinksBack(Interval, Node);
+ Node->Occurrences += Interval->Occurrences;
+ SafeFree(Interval);
+ return Node;
+ }
+ if (Node->End < Interval->End)
+ {
+ break;
+ }
+ }
+ Node = Node->Prev;
+ }
+ // At this point, Node is the guy that Interval will be inserted after:
+ if (!Node)
+ {
+ g_FirstInterval->Prev = Interval;
+ Interval->Next = g_FirstInterval;
+ g_FirstInterval = Interval;
+ }
+ else
+ {
+ if (Node->Next)
+ {
+ Node->Next->Prev = Interval;
+ }
+ Interval->Next = Node->Next;
+ Interval->Prev = Node;
+ Node->Next = Interval;
+ }
+ return Interval;
+}
+
+// Add Interval to the current gene sometime after Start
+GeneNode* AddIntervalToGeneAfter(GeneNode* Start, IntervalNode* Interval)
+{
+ GeneNode* Node;
+ GeneNode* NewNode;
+ //
+ for (Node = Start; Node; Node = Node->Next)
+ {
+ if (Node->Interval->Start == Interval->Start)
+ {
+ // Already on list, good.
+ return Node;
+ }
+ if (Node->Interval->Start > Interval->Start)
+ {
+ NewNode = (GeneNode*)calloc(sizeof(GeneNode), 1);
+ NewNode->Interval = Interval;
+ Interval->GNode = NewNode;
+ if (Node->Prev)
+ {
+ Node->Prev->Next = NewNode;
+ NewNode->Prev = Node->Prev;
+ }
+ NewNode->Next = Node;
+ Node->Prev = NewNode;
+ GeneNodeCount++;
+ return NewNode;
+ }
+ }
+ // We ran off the edge of the list without seeing something that comes after the new interval.
+ // So, the new interval becomes the last one of the gene:
+ NewNode = (GeneNode*)calloc(sizeof(GeneNode), 1);
+ NewNode->Interval = Interval;
+ Interval->GNode = NewNode;
+ NewNode->Prev = g_GeneLast;
+ g_GeneLast->Next = NewNode;
+ g_GeneLast = NewNode;
+ GeneNodeCount++;
+ return NewNode;
+}
+
+// Add Interval to the current gene sometime before Start
+GeneNode* AddIntervalToGeneBefore(GeneNode* Start, IntervalNode* Interval)
+{
+ GeneNode* Node;
+ GeneNode* NewNode;
+ //
+ for (Node = Start; Node; Node = Node->Prev)
+ {
+ if (Node->Interval->Start == Interval->Start)
+ {
+ // Already on list, good.
+ return Node;
+ }
+ if (Node->Interval->Start < Interval->Start)
+ {
+ NewNode = (GeneNode*)calloc(sizeof(GeneNode), 1);
+ NewNode->Interval = Interval;
+ Interval->GNode = NewNode;
+ if (Node->Next)
+ {
+ Node->Next->Prev = NewNode;
+ NewNode->Next = Node->Next;
+ }
+ NewNode->Prev = Node;
+ Node->Next = NewNode;
+ GeneNodeCount++;
+ return NewNode;
+ }
+ }
+ // We ran off the edge of the list without seeing something that comes before the new interval.
+ // So, the new interval becomes the first one of the gene:
+ NewNode = (GeneNode*)calloc(sizeof(GeneNode), 1);
+ NewNode->Interval = Interval;
+ Interval->GNode = NewNode;
+ NewNode->Next = g_GeneFirst;
+ g_GeneFirst->Prev = NewNode;
+ g_GeneFirst = NewNode;
+ GeneNodeCount++;
+ return NewNode;
+}
+
+// Add new GeneNodes to handle any peptides that start in Node->Interval and extend forward
+// GNode is the bookmark where we started the satisfaction effort, so when (if) we insert new nodes,
+ // we'll insert them into the gene list after that bookmark.
+int SatisfyIntervalForward(GeneNode* GNode, int CharsSoFar)
+{
+ EdgeNode* Edge;
+ int Chars;
+ GeneNode* SubGNode;
+ int RX;
+ int MinRX;
+ //
+ // If this node has already been satisfied, then return immediately:
+ if (GNode->RX + CharsSoFar > 60)
+ {
+ return GNode->RX;
+ }
+ MinRX = 9999;
+ // Iterate over all 'forward intervals' that this interval links to:
+ for (Edge = GNode->Interval->FirstForward; Edge; Edge = Edge->Next)
+ {
+ // Find (or create) the GeneNode for the forward interval:
+ SubGNode = Edge->Interval->GNode;
+ if (!SubGNode)
+ {
+ SubGNode = AddIntervalToGeneAfter(GNode, Edge->Interval);
+ }
+ RX = Edge->Interval->End - Edge->Interval->Start;
+ Chars = CharsSoFar + (Edge->Interval->End - Edge->Interval->Start);
+ if (Chars < 60)
+ {
+ // We're not yet satisfied along this path, so continue adding intervals:
+ RX += SatisfyIntervalForward(SubGNode, Chars);
+ }
+ MinRX = min(MinRX, RX);
+ }
+ // Sanity check: RX should never decrease as more intervals are added; it can only grow.
+ if (MinRX < GNode->RX)
+ {
+ printf("%d < %d???\n", MinRX, GNode->RX);
+ }
+ GNode->RX = MinRX;
+ return MinRX;
+}
+
+// Add new GeneNodes to handle any peptides that start in Node->Interval and extend backward
+int SatisfyIntervalBack(GeneNode* GNode, int CharsSoFar)
+{
+ EdgeNode* Edge;
+ int Chars;
+ GeneNode* SubGNode;
+ int LX;
+ int MinLX;
+ //
+ if (GNode->LX + CharsSoFar > 60)
+ {
+ return GNode->LX;
+ }
+ MinLX = 9999;
+ for (Edge = GNode->Interval->FirstBack; Edge; Edge = Edge->Next)
+ {
+ SubGNode = Edge->Interval->GNode;
+ if (!SubGNode)
+ {
+ SubGNode = AddIntervalToGeneBefore(GNode, Edge->Interval);
+ }
+ LX = Edge->Interval->End - Edge->Interval->Start;
+ Chars = CharsSoFar + (Edge->Interval->End - Edge->Interval->Start);
+ if (Chars < 60)
+ {
+ LX += SatisfyIntervalBack(SubGNode, Chars);
+ }
+ MinLX = min(MinLX, LX);
+ }
+ if (MinLX < GNode->LX)
+ {
+ printf("%d < %d???\n", MinLX, GNode->LX);
+ }
+
+ GNode->LX = MinLX;
+ return MinLX;
+
+}
+
+// Free all the GeneNode instances in the global list.
+void FreeGeneNodes()
+{
+ GeneNode* Prev;
+ GeneNode* Node;
+ // Free all the gene nodes:
+ Prev = NULL;
+ for (Node = g_GeneFirst; Node; Node = Node->Next)
+ {
+ Node->Interval->GNode = NULL;
+ SafeFree(Prev);
+ Prev = Node;
+ }
+ SafeFree(Prev);
+ g_GeneFirst = NULL;
+ g_GeneLast = NULL;
+ GeneNodeCount = 0;
+}
+
+// Take this interval, and 'satisfy' it by adding linked intervals until (a) there are
+// no more links to follow, or (b) we have extended a considerable distance (in amino acids!).
+// Take the resulting pool of intervals, build exons for them, and write out one
+// "gene" record.
+int SatisfyIntervalAndWriteGene(IntervalNode* NextUnsatisfied, FILE* SequenceFile, FILE* OutputFile,
+ int RecordNumber, int ChromosomeNumber, int ReverseFlag, int MinORFLength)
+{
+ GeneNode* Node;
+ GeneNode* Dissatisfied;
+ char GeneName[256];
+ int AllSatisfied;
+ char DirectionChar;
+ int ValidGeneFlag;
+ //
+
+ // First gene node wraps NextUnsatisfied:
+ g_GeneFirst = (GeneNode*)calloc(sizeof(GeneNode), 1);
+ g_GeneFirst->Interval = NextUnsatisfied;
+ NextUnsatisfied->GNode = g_GeneFirst;
+ g_GeneLast = g_GeneFirst;
+ GeneNodeCount = 1;
+
+ // Add the necessary gene nodes to satisfy:
+ Node = g_GeneFirst;
+ SatisfyIntervalForward(Node, 0);
+ SatisfyIntervalBack(Node, 0);
+ Node->Interval->Satisfied = 1;
+ // Iterate: If there are unsatisfied intervals in the gene, and the gene isn't too large, satisfy some more.
+ AllSatisfied = 0;
+ while (GeneNodeCount < MAX_INTERVALS_PER_GENE)
+ {
+ Dissatisfied = NULL;
+ // Find the first not-yet-satisfied interval:
+ for (Node = g_GeneFirst; Node; Node = Node->Next)
+ {
+ if (!Node->Interval->Satisfied)
+ {
+ Dissatisfied = Node;
+ break;
+ }
+ }
+ if (!Dissatisfied)
+ {
+ AllSatisfied = 1;
+ break; // Done!
+ }
+ SatisfyIntervalForward(Dissatisfied, 0);
+ SatisfyIntervalBack(Dissatisfied, 0);
+ Dissatisfied->Interval->Satisfied = 1;
+ }
+ if (!AllSatisfied)
+ {
+ g_StatsIncompleteGeneCount++;
+ IntegrityCheckGene();
+ }
+
+ // Write this gene out:
+ if (ReverseFlag)
+ {
+ DirectionChar = '-';
+ }
+ else
+ {
+ DirectionChar = '+';
+ }
+ sprintf(GeneName, "%d%c Gene %d, %d-%d", ChromosomeNumber, DirectionChar, RecordNumber, g_GeneFirst->Interval->Start, g_GeneLast->Interval->End);
+ // *************************
+ ValidGeneFlag = BuildAndWriteExons(SequenceFile, OutputFile, ReverseFlag, GeneName, ChromosomeNumber, MinORFLength);
+
+ FreeGeneNodes();
+ return ValidGeneFlag;
+}
+
+
+// Once the master interval list has been prepared, we can write out genes.
+// The procedure works like this:
+// We'll build a linked list of GeneNode structs, from g_GeneFirst to g_GeneLast, with size GeneNodeCount. The intervals
+// contained in this list of GeneNodes are what we'll write out as a gene record.
+// Iterate:
+// - Take the first interval not yet satisfied, A.
+// - Find all neighbors necessary in order to satisfy A, and add them to the gene.
+// - Iterate:
+// -- If the gene contains too many intervals, stop.
+// -- If every interval in the gene has now been satisfied, stop.
+// -- Otherwise, take the first interval in the gene which has not yet been flagged satisfied, and add the necessary
+// intervals to satisfy it. (It's possible that we already have the necessary intervals, and just need to
+ // discover that fact)
+// - For each interval in the active range: Construct exons
+// - Using the exon structs, write out a gene record
+// - Free the exon structs (they're bloaty, containing sequence strings) and the GeneNode list
+//
+// If the interval graph is well-behaved, and consists of small connected components, then we write one gene for each
+// connected component. If the interval graph is messy, then our iterative procedure will cover the graph in
+// a reasonably efficient way. (We're guaranteed to satisfy one interval with each gene, and we're likely to satisfy
+// several)
+int WriteGenesForIntervals(char* SequenceFileName, char* OutputFileName, int ChromosomeNumber, int ReverseFlag, int MinORFLength)
+{
+ FILE* SequenceFile;
+ FILE* OutputFile;
+ IntervalNode* NextUnsatisfied;
+ int RecordNumber = 0;
+ int ValidGeneFlag;
+ //
+ SequenceFile = fopen(SequenceFileName, "rb");
+ if (!SequenceFile)
+ {
+ printf("** ERROR: Unable to open chromosome database '%s'\n", SequenceFileName);
+ return 0;
+ }
+ OutputFile = fopen(OutputFileName, "wb");
+ if (!OutputFile)
+ {
+ printf("** ERROR: Unable to open output file '%s'\n", OutputFileName);
+ return 0;
+ }
+ // Iterate over intervals, skipping over intervals that have already been satisfied.
+ NextUnsatisfied = g_FirstInterval;
+ while (1)
+ {
+ if (!NextUnsatisfied)
+ {
+ break;
+ }
+ if (NextUnsatisfied->Satisfied)
+ {
+ NextUnsatisfied = NextUnsatisfied->Next;
+ continue;
+ }
+
+ //printf("\n - - - Satisfy the next interval: %d-%d\n", NextUnsatisfied->Start, NextUnsatisfied->End);
+ // if SatisfyIntervalAndWriteGene returns 0, then there's no real gene here (short ORFs were pruned).
+ ValidGeneFlag = SatisfyIntervalAndWriteGene(NextUnsatisfied, SequenceFile, OutputFile, RecordNumber, ChromosomeNumber, ReverseFlag, MinORFLength);
+ if (ValidGeneFlag)
+ {
+ RecordNumber++;
+ //printf("Wrote gene record %d\n", RecordNumber);
+ }
+ } // main loop for writing out genes.
+ printf("Genes have been written out. Statistics:\n");
+ printf("%d\t", ChromosomeNumber);
+ printf("%d\t", ReverseFlag);
+ printf("%d\t", RecordNumber);
+ printf("%d\t", g_StatsIncompleteGeneCount);
+ printf("%d\t", g_StatsLargestGeneSize);
+ printf("%d\t", g_StatsLargestGeneRecordNumber);
+ printf("%d\t", g_StatsIntervalsBeforeMerge);
+ printf("%d\t", g_StatsEdgesBeforeMerge);
+ printf("%d\t", g_StatsIntervalsAfterMerge);
+ printf("%d\t", g_StatsEdgesAfterMerge);
+ printf("%d\t", g_StatsIntervalsAfterIntersect);
+ printf("%d\t", g_StatsEdgesAfterIntersect);
+ printf("%d\t", g_StatsTotalExonsWritten);
+ printf("%d\t", g_StatsTotalEdgesWritten);
+ printf("\n");
+ return RecordNumber;
+}
+
+// Free exons for an interval
+void FreeIntervalExons(IntervalNode* Interval)
+{
+ ExonNode* PrevExon;
+ ExonNode* Exon;
+ ExonLink* PrevLink;
+ ExonLink* Link;
+
+ PrevExon = NULL;
+ for (Exon = Interval->FirstExon; Exon; Exon = Exon->Next)
+ {
+ // Free forward links:
+ PrevLink = NULL;
+ for (Link = Exon->FirstForward; Link; Link = Link->Next)
+ {
+ SafeFree(PrevLink);
+ PrevLink = Link;
+ }
+ SafeFree(PrevLink);
+
+ // Free backward links:
+ PrevLink = NULL;
+ for (Link = Exon->FirstBack; Link; Link = Link->Next)
+ {
+ SafeFree(PrevLink);
+ PrevLink = Link;
+ }
+ SafeFree(PrevLink);
+
+ // Free the previous exon (deferred so Exon->Next stays valid); skip when PrevExon is still NULL:
+ if (PrevExon)
+ {
+ SafeFree(PrevExon->Sequence);
+ SafeFree(PrevExon);
+ }
+ PrevExon = Exon;
+ }
+ if (PrevExon)
+ {
+ SafeFree(PrevExon->Sequence);
+ SafeFree(PrevExon);
+ }
+ Interval->FirstExon = NULL;
+ Interval->LastExon = NULL;
+}
+
+void AddExonToInterval(IntervalNode* Interval, ExonNode* Exon)
+{
+ if (Interval->LastExon)
+ {
+ Interval->LastExon->Next = Exon;
+ }
+ else
+ {
+ Interval->FirstExon = Exon;
+ }
+ Interval->LastExon = Exon;
+ Exon->Interval = Interval;
+}
+
+
+
+// Given an exon and its dna sequence, translate into amino acids.
+// The exon's prefix has already been set, but we'll set the suffix (with the 'leftovers')
+// If MinORFLength>0, then call MaskBrokenSequence to MASK OUT the interval between two stop
+// codons separated by less than MinORFLength.
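+ // For example (standard genetic code), DNA "ATGGCTAA" translates to "MA"
+ // (ATG, GCT) and the two leftover bases "AA" become the exon's suffix.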
+void GetExonSequence(ExonNode* Exon, char* DNA, int MinORFLength)
+{
+ char ProteinBuffer[MAX_INTERVAL_LENGTH];
+ int Pos;
+ int Length;
+ char* Peptide;
+ int SuffixPos;
+
+ if (!DNA || !*DNA)
+ {
+ Exon->Suffix[0] = '\0';
+ Exon->Sequence = NULL;
+ Exon->Length = 0;
+ return;
+ }
+ Length = strlen(DNA);
+ Pos = 0;
+ Peptide = ProteinBuffer;
+ while (Pos + 2 < Length)
+ {
+ *Peptide = TranslateCodon(DNA + Pos);
+ Peptide++;
+ Pos += 3;
+ }
+ *Peptide = '\0';
+ MaskBrokenSequence(ProteinBuffer, MinORFLength);
+ //Exon->Length = strlen(ProteinBuffer);
+ Exon->Length = strlen(ProteinBuffer);
+ if (Exon->Length)
+ {
+ Exon->Sequence = (char*)calloc(sizeof(char), Exon->Length + 1);
+ strcpy(Exon->Sequence, ProteinBuffer);
+ }
+ SuffixPos = 0;
+ while (Pos < Length)
+ {
+ Exon->Suffix[SuffixPos] = DNA[Pos];
+ SuffixPos++;
+ Pos++;
+ }
+ Exon->Suffix[SuffixPos] = '\0';
+}
+
+ // If an exon's protein sequence has two stop codons with fewer than MinORFLength residues in between,
+ // then CUT OUT that section. (Because we interpret genomic intervals in multiple reading
+ // frames, we often get open reading frames of very short length; we don't believe that such
+ // short peptides are reasonable!)
+// Todo: Try encoding some stop codons as selenocysteines (U).
+// Update: We no longer CUT the sequence, because that would ruin our genomic coordinates.
+// Rather, we MASK super-short reading frames!
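+ // Example with MinORFLength = 50: in a sequence like ...X<30 residues>X..., the 30
+ // residues between the two nearby stop codons are all overwritten with 'X'.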
+void MaskBrokenSequence(char* Protein, int MinORFLength)
+{
+ int AnchorPos = -1;
+ int Pos;
+ char AA;
+ int MaskPos;
+ //
+ // if MinORFLength <= 0, then don't filter.
+ if (MinORFLength <= 0)
+ {
+ return;
+ }
+ Pos = 0;
+ while (1)
+ {
+ AA = Protein[Pos];
+ if (!AA)
+ {
+ break;
+ }
+ if (AA == 'X')
+ {
+ if (AnchorPos == -1 || (Pos - AnchorPos >= MinORFLength))
+ {
+ AnchorPos = Pos;
+ }
+ else
+ {
+ for (MaskPos = AnchorPos + 1; MaskPos < Pos; MaskPos++)
+ {
+ Protein[MaskPos] = 'X';
+ }
+ AnchorPos = Pos;
+ Pos++;
+ continue;
+ }
+ }
+ Pos++;
+ }
+}
+
+// Add a forward link from exon A to exon B
+void AddExonLink(ExonNode* A, ExonNode* B, char AA, int Power)
+{
+ ExonLink* Link;
+ //
+ Link = (ExonLink*)calloc(sizeof(ExonLink), 1);
+ Link->Exon = B;
+ Link->AA = AA;
+ Link->Power = Power;
+ if (A->LastForward)
+ {
+ A->LastForward->Next = Link;
+ }
+ else
+ {
+ A->FirstForward = Link;
+ }
+ A->LastForward = Link;
+ //
+ Link = (ExonLink*)calloc(sizeof(ExonLink), 1);
+ Link->Exon = A;
+ Link->AA = AA;
+ Link->Power = Power;
+ if (B->LastBack)
+ {
+ B->LastBack->Next = Link;
+ }
+ else
+ {
+ B->FirstBack = Link;
+ }
+ B->LastBack = Link;
+}
+
+// Link up the "edge" DAG nodes for intervals, if their parent intervals are linked.
+void LinkDAGAcrossIntervals(IntervalNode* Interval, EdgeNode* Edge, int ReverseFlag)
+{
+ IntervalNode* OtherInterval;
+ GenomeDAGNode* DAGNode;
+ GenomeDAGNode* OtherDAGNode;
+ int DAGNodeIndex;
+ int OtherDAGNodeIndex;
+ //
+ OtherInterval = Edge->Interval;
+ for (DAGNodeIndex = 0; DAGNodeIndex < Interval->DAGNodeCount; DAGNodeIndex++)
+ {
+ DAGNode = Interval->DAGNodes + DAGNodeIndex;
+ if (!DAGNode->Sequence)
+ {
+ continue;
+ }
+ // We link forward only from dag nodes that touch the edge:
+ if (DAGNode->End < Interval->End)
+ {
+ continue;
+ }
+ for (OtherDAGNodeIndex = 0; OtherDAGNodeIndex < OtherInterval->DAGNodeCount; OtherDAGNodeIndex++)
+ {
+ OtherDAGNode = OtherInterval->DAGNodes + OtherDAGNodeIndex;
+ if (!OtherDAGNode->Sequence)
+ {
+ continue;
+ }
+ if (OtherDAGNode->Start > OtherInterval->Start)
+ {
+ continue;
+ }
+ GenomeDAGLinkBack(OtherDAGNode, DAGNode, Edge->Count); // link!
+ }
+ }
+}
+
+// Add links between edges, according to how the 'parent' DAG is linked.
+// Each DAG can have three exons: prefix-length 0, prefix 1, prefix 2.
+// If you're length 1, then you get one with no prefix (and length-1 suffix), one with prefix (and no suffix).
+ // If you're longer than 1, then the exons you get depend on your reading frame flags: EST-derived exons get
+// three exons, most gene-finding-derived exons get a single exon for the single plausible reading frame.
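+ // For example, an exon whose leftover suffix is "A" links to an exon whose prefix
+ // is "TG": the junction codon "ATG" is translated and the resulting amino acid is
+ // stored on the edge itself (see AddExonLink).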
+void LinkIntervalExons(IntervalNode* Interval, int ReverseFlag)
+{
+ ExonNode* Exon;
+ ExonNode* NextExon = NULL;
+ GenomeDAGNode* DAGNode;
+ GenomeDAGLink* Edge;
+ GenomeDAGLink* NextEdge;
+ int DAGNodeIndex;
+ int ExonIndex;
+ int SuffixLength;
+ char DNA[4];
+ char AA = 0;
+ int Power = 0;
+ //
+ DNA[3] = '\0';
+ for (DAGNodeIndex = 0; DAGNodeIndex < Interval->DAGNodeCount; DAGNodeIndex++)
+ {
+ DAGNode = Interval->DAGNodes + DAGNodeIndex;
+ if (ReverseFlag)
+ {
+ Edge = DAGNode->FirstBack;
+ }
+ else
+ {
+ Edge = DAGNode->FirstForward;
+ }
+ while (Edge)
+ {
+ // This DAG has one, two, or three exons to join.
+ for (ExonIndex = 0; ExonIndex < 3; ExonIndex++)
+ {
+ Exon = DAGNode->Exons[ExonIndex];
+ if (!Exon)
+ {
+ continue;
+ }
+ SuffixLength = strlen(Exon->Suffix);
+ switch (SuffixLength)
+ {
+ case 0:
+ NextExon = Edge->Node->Exons[0];
+ if (!NextExon)
+ {
+ continue;
+ }
+ AA = '\0';
+ Power = Edge->Count;
+ break;
+ case 1:
+ // A length-1 suffix. We link to a length-2 prefix, if available:
+ NextExon = Edge->Node->Exons[2];
+ if (NextExon)
+ {
+ // Combine our length-1 suffix with the length-2 prefix:
+ DNA[0] = Exon->Suffix[0];
+ DNA[1] = NextExon->Prefix[0];
+ DNA[2] = NextExon->Prefix[1];
+ AA = TranslateCodon(DNA);
+ Power = Edge->Count;
+ break;
+ }
+ else
+ {
+ // There's no length-2 prefix available. If that's because the next
+ // node is only one base long, then we "leapfrog" through it:
+ if ((Edge->Node->End == Edge->Node->Start + 1) && Edge->Node->Exons[0])
+ {
+ // The ugly case. Take our suffix char, add the forward interval's base,
+ // and then add one base from the "far interval".
+ DNA[0] = Exon->Suffix[0];
+ DNA[1] = Edge->Node->Exons[0]->Suffix[0];
+ if (ReverseFlag)
+ {
+ NextEdge = Edge->Node->FirstBack;
+ }
+ else
+ {
+ NextEdge = Edge->Node->FirstForward;
+ }
+ for (; NextEdge; NextEdge = NextEdge->Next)
+ {
+ NextExon = NextEdge->Node->Exons[1];
+ if (NextExon)
+ {
+ DNA[2] = NextExon->Prefix[0];
+ AA = TranslateCodon(DNA);
+ Power = max(Edge->Count, NextEdge->Count);
+ AddExonLink(Exon, NextExon, AA, Power);
+ }
+ }
+ }
+ continue;
+ }
+ case 2:
+ NextExon = Edge->Node->Exons[1];
+ if (!NextExon)
+ {
+ continue;
+ }
+ DNA[0] = Exon->Suffix[0];
+ DNA[1] = Exon->Suffix[1];
+ DNA[2] = NextExon->Prefix[0];
+ AA = TranslateCodon(DNA);
+ Power = Edge->Count;
+ break;
+ }
+ AddExonLink(Exon, NextExon, AA, Power);
+
+ } // exon loop
+ Edge = Edge->Next;
+ } // edge loop
+ } // DAG node loop
+}
+
+// Write out one exon record in binary format.
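+ // Layout of one record, in write order (the if (0) branch below is disabled):
+ // Start (int), End (int), sequence Length (int), interval Occurrences (int),
+ // Sequence (Length chars), Prefix (2 chars), Suffix (2 chars), back-link count (int),
+ // forward-link count (int), then for each back link: exon Index (int), Power (int),
+ // junction AA (char).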
+void WriteExonRecord(ExonNode* Exon, FILE* OutputFile, int ReverseFlag)
+{
+ int Length;
+ int BackLinkCount;
+ int ForwardLinkCount;
+ ExonLink* Link;
+ //
+ WriteBinary(&Exon->Start, sizeof(int), 1, OutputFile);
+ WriteBinary(&Exon->End, sizeof(int), 1, OutputFile);
+ Length = Exon->Length;
+ g_StatsTotalExonsWritten++;
+
+ WriteBinary(&Length, sizeof(int), 1, OutputFile);
+ WriteBinary(&Exon->Interval->Occurrences, sizeof(int), 1, OutputFile);
+ if (Length)
+ {
+ WriteBinary(Exon->Sequence, sizeof(char), Length, OutputFile);
+ }
+ BackLinkCount = 0;
+ for (Link = Exon->FirstBack; Link; Link = Link->Next)
+ {
+ BackLinkCount++;
+ }
+ ForwardLinkCount = 0;
+ for (Link = Exon->FirstForward; Link; Link = Link->Next)
+ {
+ ForwardLinkCount++;
+ }
+
+ if (0) //ReverseFlag)
+ {
+ WriteBinary(Exon->Prefix, sizeof(char), 2, OutputFile);
+ WriteBinary(Exon->Suffix, sizeof(char), 2, OutputFile);
+
+ WriteBinary(&ForwardLinkCount, sizeof(int), 1, OutputFile);
+ WriteBinary(&BackLinkCount, sizeof(int), 1, OutputFile);
+ for (Link = Exon->FirstForward; Link; Link = Link->Next)
+ {
+ g_StatsTotalEdgesWritten++;
+ WriteBinary(&Link->Exon->Index, sizeof(int), 1, OutputFile);
+ WriteBinary(&Link->Power, sizeof(int), 1, OutputFile);
+ WriteBinary(&Link->AA, sizeof(char), 1, OutputFile);
+ }
+
+ }
+ else
+ {
+ WriteBinary(Exon->Prefix, sizeof(char), 2, OutputFile);
+ WriteBinary(Exon->Suffix, sizeof(char), 2, OutputFile);
+
+ WriteBinary(&BackLinkCount, sizeof(int), 1, OutputFile);
+ WriteBinary(&ForwardLinkCount, sizeof(int), 1, OutputFile);
+ for (Link = Exon->FirstBack; Link; Link = Link->Next)
+ {
+ g_StatsTotalEdgesWritten++;
+ WriteBinary(&Link->Exon->Index, sizeof(int), 1, OutputFile);
+ WriteBinary(&Link->Power, sizeof(int), 1, OutputFile);
+ WriteBinary(&Link->AA, sizeof(char), 1, OutputFile);
+ }
+ }
+}
+
+// Given a range of intervals (from First to Last), with exons built, write
+// out the binary gene record to the splice-tolerant database file.
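+ // Record layout: gene name (256 chars, written twice), ChromosomeNumber (int),
+ // ForwardFlag (char), ExonCount (int), then one record per exon (see WriteExonRecord).
+ // For reverse-strand genes the exons are re-indexed and written from the last
+ // interval to the first.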
+int WriteGeneRecord(int ChromosomeNumber, char* GeneName, int ReverseFlag, FILE* OutputFile)
+{
+ int ExonCount = 0;
+ IntervalNode* Interval;
+ ExonNode* Exon;
+ GeneNode* Node;
+ char ForwardFlag;
+ for (Node = g_GeneFirst; Node; Node = Node->Next)
+ {
+ Interval = Node->Interval;
+ for (Exon = Interval->FirstExon; Exon; Exon = Exon->Next)
+ {
+ ExonCount++;
+ }
+ }
+ if (!ExonCount)
+ {
+ // No exons!? That can happen if we pruned them all away due to short ORFs.
+ // No need to write anything at all:
+ return 0;
+ }
+
+ WriteBinary(GeneName, sizeof(char), 256, OutputFile);
+ WriteBinary(GeneName, sizeof(char), 256, OutputFile);
+ WriteBinary(&ChromosomeNumber, sizeof(int), 1, OutputFile);
+ if (ReverseFlag)
+ {
+ ForwardFlag = 0;
+ }
+ else
+ {
+ ForwardFlag = 1;
+ }
+ WriteBinary(&ForwardFlag, sizeof(char), 1, OutputFile);
+ WriteBinary(&ExonCount, sizeof(int), 1, OutputFile);
+ if (ReverseFlag)
+ {
+ // Re-index all the exons:
+ ExonCount = 0;
+ for (Node = g_GeneLast; Node; Node = Node->Prev)
+ {
+ for (Exon = Node->Interval->FirstExon; Exon; Exon = Exon->Next)
+ {
+ Exon->Index = ExonCount;
+ ExonCount++;
+ }
+ }
+ for (Node = g_GeneLast; Node; Node = Node->Prev)
+ {
+ Interval = Node->Interval;
+ for (Exon = Interval->FirstExon; Exon; Exon = Exon->Next)
+ {
+ WriteExonRecord(Exon, OutputFile, ReverseFlag);
+ }
+ }
+ }
+ else
+ {
+ // Re-index all the exons:
+ ExonCount = 0;
+ for (Node = g_GeneFirst; Node; Node = Node->Next)
+ {
+ for (Exon = Node->Interval->FirstExon; Exon; Exon = Exon->Next)
+ {
+ Exon->Index = ExonCount;
+ ExonCount++;
+ }
+ }
+
+ for (Node = g_GeneFirst; Node; Node = Node->Next)
+ {
+ Interval = Node->Interval;
+ for (Exon = Interval->FirstExon; Exon; Exon = Exon->Next)
+ {
+ WriteExonRecord(Exon, OutputFile, ReverseFlag);
+ }
+ }
+ }
+ return 1;
+}
+
+// Verify that our exon construction is valid. The number of exon forward-links and
+// backward-links should match. And no exons should go outside the active range...
+void IntegrityCheckGene()
+{
+ int ForwardCount = 0;
+ int BackwardCount = 0;
+ int ExonCount = 0;
+ int IntervalCount = 0;
+ ExonNode* Exon;
+ ExonLink* Link;
+ ExonLink* RecipLink;
+ int FoundFlag;
+ IntervalNode* Interval;
+ GeneNode* Node;
+ EdgeNode* Edge;
+ int Count;
+ //
+ printf("\n===Integrity check: Intervals from %d to %d\n", g_GeneFirst->Interval->Start, g_GeneLast->Interval->End);
+
+ for (Node = g_GeneFirst; Node; Node = Node->Next)
+ {
+ Interval = Node->Interval;
+ if (Interval->Satisfied)
+ {
+ printf("%d - %d SATISFIED ", Interval->Start, Interval->End);
+ }
+ else
+ {
+ printf("%d - %d unsatisfied ", Interval->Start, Interval->End);
+ }
+ Count = 0;
+ for (Edge = Interval->FirstForward; Edge; Edge = Edge->Next)
+ {
+ Count++;
+ }
+ switch (Count)
+ {
+ case 0:
+ break;
+ case 1:
+ printf("to %d", Interval->FirstForward->Interval->Start);
+ break;
+ case 2:
+ printf("to %d, %d", Interval->FirstForward->Interval->Start, Interval->FirstForward->Next->Interval->Start);
+ break;
+ default:
+ printf("to %d, %d, +%d", Interval->FirstForward->Interval->Start,
+ Interval->FirstForward->Next->Interval->Start, Count-2);
+ break;
+
+ }
+ printf("\n");
+ IntervalCount++;
+ for (Exon = Interval->FirstExon; Exon; Exon = Exon->Next)
+ {
+ ExonCount++;
+ for (Link = Exon->FirstForward; Link; Link = Link->Next)
+ {
+ ForwardCount++;
+ if (!Link->Exon->Interval->GNode)
+ {
+ printf("** Warning: Exon %d links forward out of this world.\n", Exon->Index);
+ }
+ // Check for a reciprocal link, too:
+ FoundFlag = 0;
+ for (RecipLink = Link->Exon->FirstBack; RecipLink; RecipLink = RecipLink->Next)
+ {
+ if (RecipLink->Exon == Exon)
+ {
+ FoundFlag = 1;
+ if (Link->Power != RecipLink->Power)
+ {
+ printf("** Warning: Exon link %d to %d has inconsistent strength %d, %d\n",
+ Exon->Index, Link->Exon->Index, Link->Power, RecipLink->Power);
+ }
+ break;
+ }
+ }
+ if (!FoundFlag)
+ {
+ printf("** Warning: Exon %d has a non-reciprocated forward link.\n", Exon->Index);
+ }
+ }
+ for (Link = Exon->FirstBack; Link; Link = Link->Next)
+ {
+ BackwardCount++;
+ if (!Link->Exon->Interval->GNode)
+ {
+ printf("** Warning: Exon %d links backward out of this world.\n", Exon->Index);
+ }
+ // Check for a reciprocal link, too:
+ FoundFlag = 0;
+ for (RecipLink = Link->Exon->FirstForward; RecipLink; RecipLink = RecipLink->Next)
+ {
+ if (RecipLink->Exon == Exon)
+ {
+ FoundFlag = 1;
+ break;
+ }
+ }
+ if (!FoundFlag)
+ {
+ printf("** Warning: Exon %d has a non-reciprocated backward link.\n", Exon->Index);
+ }
+ }
+ } // exon loop
+ } // interval loop
+ printf("Saw %d intervals, %d exons, %d links.\n", IntervalCount, ExonCount, ForwardCount);
+ if (ForwardCount != BackwardCount)
+ {
+ printf("** Warning: Total forward links is %d != backward links %d\n", ForwardCount, BackwardCount);
+ }
+}
+
+int g_UFTotalExons = 0;
+int g_UFTotalAA = 0;
+int g_UFTotalEdges = 0;
+int g_TotalExons = 0;
+int g_TotalAA = 0;
+int g_TotalEdges = 0;
+int g_TotalTrueExons = 0;
+int g_TotalTrueEdges = 0;
+
+typedef struct ExonSortNode
+{
+ ExonNode* Exon;
+} ExonSortNode;
+
+int CompareExonNodesForward(const ExonSortNode* NodeA, const ExonSortNode* NodeB)
+{
+ if (NodeA->Exon->Start < NodeB->Exon->Start)
+ {
+ return -1;
+ }
+ if (NodeA->Exon->Start > NodeB->Exon->Start)
+ {
+ return 1;
+ }
+ // arbitrary:
+ if (NodeA->Exon < NodeB->Exon)
+ {
+ return -1;
+ }
+ else
+ {
+ return 1;
+ }
+}
+int CompareExonNodesBackward(const ExonSortNode* NodeA, const ExonSortNode* NodeB)
+{
+ if (NodeA->Exon->Start < NodeB->Exon->Start)
+ {
+ return 1;
+ }
+ if (NodeA->Exon->Start > NodeB->Exon->Start)
+ {
+ return -1;
+ }
+ // arbitrary:
+ if (NodeA->Exon < NodeB->Exon)
+ {
+ return 1;
+ }
+ else
+ {
+ return -1;
+ }
+}
+
+// It is desirable for an exon's back-links to always hit exons with LOWER index numbers.
+void SortExons(int ReverseFlag)
+{
+ ExonSortNode* ExonNodes;
+ GeneNode* GNode;
+ int ExonCount;
+ int ExonIndex;
+ ExonNode* Exon;
+ //
+ for (GNode = g_GeneFirst; GNode; GNode = GNode->Next)
+ {
+ ExonCount = 0;
+ for (Exon = GNode->Interval->FirstExon; Exon; Exon = Exon->Next)
+ {
+ ExonCount++;
+ }
+ if (ExonCount)
+ {
+ ExonNodes = (ExonSortNode*)calloc(ExonCount, sizeof(ExonSortNode));
+ ExonIndex = 0;
+ for (Exon = GNode->Interval->FirstExon; Exon; Exon = Exon->Next)
+ {
+ ExonNodes[ExonIndex].Exon = Exon;
+ ExonIndex++;
+ }
+ if (ReverseFlag)
+ {
+ qsort(ExonNodes, ExonCount, sizeof(ExonSortNode), (QSortCompare)CompareExonNodesBackward);
+ }
+ else
+ {
+ qsort(ExonNodes, ExonCount, sizeof(ExonSortNode), (QSortCompare)CompareExonNodesForward);
+ }
+ GNode->Interval->FirstExon = ExonNodes[0].Exon;
+ GNode->Interval->LastExon = ExonNodes[ExonCount - 1].Exon;
+ for (ExonIndex = 0; ExonIndex < ExonCount; ExonIndex++)
+ {
+ if (ExonIndex < ExonCount - 1)
+ {
+ ExonNodes[ExonIndex].Exon->Next = ExonNodes[ExonIndex + 1].Exon;
+ }
+ else
+ {
+ ExonNodes[ExonIndex].Exon->Next = NULL;
+ }
+ }
+ SafeFree(ExonNodes);
+ }
+ }
+}
+
+// For reporting purposes, count how many exons and edges and amino acids are in our db:
+void CountExons(int PreFilterFlag)
+{
+ GeneNode* GNode;
+ ExonNode* Exon;
+ ExonLink* Link;
+ int Pos;
+ int TrueExonFlag;
+ //
+ for (GNode = g_GeneFirst; GNode; GNode = GNode->Next)
+ {
+ for (Exon = GNode->Interval->FirstExon; Exon; Exon = Exon->Next)
+ {
+ if (PreFilterFlag)
+ {
+ g_UFTotalExons++;
+ for (Pos = 0; Pos < Exon->Length; Pos++)
+ {
+ if (Exon->Sequence[Pos]!='X')
+ {
+ g_UFTotalAA++;
+ }
+ }
+
+ }
+ else
+ {
+ g_TotalExons++;
+ TrueExonFlag = 1;
+ for (Link = Exon->FirstBack; Link; Link = Link->Next)
+ {
+ if (Link->Exon->End == Exon->Start || Link->Exon->Start == Exon->End)
+ {
+ TrueExonFlag = 0;
+ }
+ else
+ {
+ g_TotalTrueEdges++;
+ }
+ }
+ g_TotalTrueExons += TrueExonFlag;
+ for (Pos = 0; Pos < Exon->Length; Pos++)
+ {
+ if (Exon->Sequence[Pos]!='X')
+ {
+ g_TotalAA++;
+ }
+ }
+ }
+ for (Link = Exon->FirstBack; Link; Link = Link->Next)
+ {
+ if (PreFilterFlag)
+ {
+ g_UFTotalEdges++;
+ if (Link->AA)
+ {
+ g_UFTotalAA++;
+ }
+ }
+ else
+ {
+ g_TotalEdges++;
+ if (Link->AA)
+ {
+ g_TotalAA++;
+ }
+ }
+ }
+ for (Link = Exon->FirstForward; Link; Link = Link->Next)
+ {
+ if (PreFilterFlag)
+ {
+ g_UFTotalEdges++;
+ if (Link->AA)
+ {
+ g_UFTotalAA++;
+ }
+ }
+ else
+ {
+ g_TotalEdges++;
+ if (Link->AA)
+ {
+ g_TotalAA++;
+ }
+ }
+ } // iterate forward links
+ } // iterate exons
+ } // iterate GNodes
+}
+
+// Create a link between two genome DAG nodes.
+void GenomeDAGLinkBack(GenomeDAGNode* DAGNode, GenomeDAGNode* BackDAGNode, int Count)
+{
+ GenomeDAGLink* NewLink;
+ GenomeDAGLink* Link;
+ //
+ // Add the back-link:
+ NewLink = (GenomeDAGLink*)calloc(1, sizeof(GenomeDAGLink));
+ NewLink->Node = BackDAGNode;
+ NewLink->Count = Count;
+ if (!DAGNode->FirstBack)
+ {
+ DAGNode->FirstBack = NewLink;
+ }
+ else
+ {
+ for (Link = DAGNode->FirstBack; Link->Next; Link = Link->Next)
+ {
+ ;
+ }
+ Link->Next = NewLink;
+ }
+ // Add the forward-link:
+ NewLink = (GenomeDAGLink*)calloc(1, sizeof(GenomeDAGLink));
+ NewLink->Node = DAGNode;
+ NewLink->Count = Count;
+ if (!BackDAGNode->FirstForward)
+ {
+ BackDAGNode->FirstForward = NewLink;
+ }
+ else
+ {
+ for (Link = BackDAGNode->FirstForward; Link->Next; Link = Link->Next)
+ {
+ ;
+ }
+ Link->Next = NewLink;
+ }
+}
+
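+ // Return the IFLAG_FRAME_* flag for a codon boundary at (Start + Offset) on the
+ // forward strand, or at (End - 1 - Offset) on the reverse strand. For example,
+ // Start = 100 with Offset = 1 on the forward strand gives (100 + 1) % 3 == 2,
+ // i.e. IFLAG_FRAME_2.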
+int GetReadingFrameFlag(int Start, int End, int Offset, int ReverseFlag)
+{
+ int ReadingFrameFlag = 0;
+ //
+ if (ReverseFlag)
+ {
+ switch ((End - 1 - Offset) % 3)
+ {
+ case 0:
+ ReadingFrameFlag = IFLAG_FRAME_0;
+ break;
+ case 1:
+ ReadingFrameFlag = IFLAG_FRAME_1;
+ break;
+ case 2:
+ ReadingFrameFlag = IFLAG_FRAME_2;
+ break;
+ }
+ }
+ else
+ {
+ switch ((Start + Offset) % 3)
+ {
+ case 0:
+ ReadingFrameFlag = IFLAG_FRAME_0;
+ break;
+ case 1:
+ ReadingFrameFlag = IFLAG_FRAME_1;
+ break;
+ case 2:
+ ReadingFrameFlag = IFLAG_FRAME_2;
+ break;
+ }
+ }
+ return ReadingFrameFlag;
+}
+
+// Build a DAG for this genomic interval. Then we'll generate three exons (two, for length-1 nodes)
+// for each node of the DAG. The DAG is generally just one node, corresponding to genomic DNA.
+// But the DAG may have extra nodes and edges if there are SNPs that fall within the interval.
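+ // For example, an interval 1000-1020 with one SNP (alleles "AG") at position 1010
+ // yields four DAG nodes: the core track 1000-1010, two single-base allele nodes
+ // spanning 1010-1011 (one for 'A', one for 'G'), and the resumed core track
+ // 1011-1020, with edges linking each allele node to both flanking nodes.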
+void BuildDAGForInterval(IntervalNode* Interval, FILE* GenomicFile, int MinORFLength, int ReverseFlag)
+{
+ int DAGNodeCount;
+ int DAGNodeIndex;
+ int NextDAGStart;
+ GenomeDAGNode* DAGNode;
+ int Length;
+ ExonNode* Exon;
+ char RCDNABuffer[MAX_INTERVAL_LENGTH + 1];
+ int PolyIndex;
+ int FirstPolyIndex;
+ Polymorphism* Poly;
+ int PrevNodesStart = -1;
+ int PrevNodesEnd = -1;
+ int PrevNodeIndex;
+ int NewPrevNodesStart;
+ int SNPIndex;
+ int ReadingFrameFlag;
+ //
+
+ FirstPolyIndex = FindPolyInInterval(Interval->Start, Interval->End);
+
+ ////////////////////////////////////////////////////////////
+ // How many nodes in our DAG? Assume that no polymorphisms overlap, so we need 1 dag node
+ // plus, for each polymorphism, one node per SNP allele and one node for the resumed 'core track'
+ // (the core-track node may go unused when one polymorphism directly follows another, but alloc it anyway).
+ DAGNodeCount = 1;
+ if (FirstPolyIndex >= 0)
+ {
+ for (PolyIndex = FirstPolyIndex; PolyIndex < g_PolymorphismCount; PolyIndex++)
+ {
+ if (g_Polymorphisms[PolyIndex].Pos >= Interval->End)
+ {
+ break;
+ }
+ // A polymorphism means one node for each SNP allele, and (USUALLY) a node
+ // for the continued 'core track'
+ // a
+ // XXXX b xxxx
+ // c
+ DAGNodeCount++;
+ DAGNodeCount += strlen(g_Polymorphisms[PolyIndex].SNP);
+ }
+ }
+ Interval->DAGNodeCount = DAGNodeCount;
+ Interval->DAGNodes = (GenomeDAGNode*)calloc(DAGNodeCount, sizeof(GenomeDAGNode));
+ ////////////////////////////////////////////////////////////
+ // Initialize all the DAG nodes:
+ NextDAGStart = Interval->Start;
+ DAGNodeIndex = 0;
+ PolyIndex = FirstPolyIndex;
+ while (1)
+ {
+ if (PolyIndex < 0 || PolyIndex >= g_PolymorphismCount || g_Polymorphisms[PolyIndex].Pos >= Interval->End)
+ {
+ // There are no more polymorphisms.
+ if (NextDAGStart < Interval->End)
+ {
+ // Generate an interval that extends to the end:
+ DAGNode = Interval->DAGNodes + DAGNodeIndex;
+ DAGNode->Start = NextDAGStart;
+ DAGNode->End = Interval->End;
+ Length = DAGNode->End - DAGNode->Start;
+ DAGNode->Sequence = (char*)calloc(Length + 1, sizeof(char)); // +1 for null terminator
+ fseek(GenomicFile, DAGNode->Start, 0);
+ ReadBinary(DAGNode->Sequence, sizeof(char), Length, GenomicFile);
+ // If we have some nodes already, that's because there's a polymorphism. Link to the
+ // previous two nodes:
+ if (DAGNodeIndex)
+ {
+ for (PrevNodeIndex = PrevNodesStart; PrevNodeIndex <= PrevNodesEnd; PrevNodeIndex++)
+ {
+ GenomeDAGLinkBack(DAGNode, Interval->DAGNodes + PrevNodeIndex, 0);
+ }
+ }
+ }
+ // And we're done!
+ break;
+ }
+ // There is another polymorphism.
+ Poly = g_Polymorphisms + PolyIndex;
+ if (NextDAGStart < Poly->Pos)
+ {
+ // If there's non-polymorphic sequence before the next poly, then
+ // generate a DAG node for it.
+ DAGNode = Interval->DAGNodes + DAGNodeIndex;
+ DAGNode->Start = NextDAGStart;
+ DAGNode->End = Poly->Pos;
+ Length = DAGNode->End - DAGNode->Start;
+ DAGNode->Sequence = (char*)calloc(Length + 1, sizeof(char)); // +1 for null terminator
+ fseek(GenomicFile, DAGNode->Start, 0);
+ ReadBinary(DAGNode->Sequence, sizeof(char), Length, GenomicFile);
+ // If we have some nodes already, that's because there's a polymorphism. Link to the
+ // previous nodes:
+ if (DAGNodeIndex)
+ {
+ for (PrevNodeIndex = PrevNodesStart; PrevNodeIndex <= PrevNodesEnd; PrevNodeIndex++)
+ {
+ GenomeDAGLinkBack(DAGNode, Interval->DAGNodes + PrevNodeIndex, 0);
+ }
+ }
+ PrevNodesStart = DAGNodeIndex;
+ PrevNodesEnd = DAGNodeIndex;
+ DAGNodeIndex++;
+ }
+ // Nodes for the two (or more) alleles:
+ NewPrevNodesStart = DAGNodeIndex;
+ for (SNPIndex = 0; SNPIndex < 4; SNPIndex++)
+ {
+ if (!Poly->SNP[SNPIndex])
+ {
+ break;
+ }
+ DAGNode = Interval->DAGNodes + DAGNodeIndex;
+ DAGNode->Start = Poly->Pos;
+ DAGNode->End = Poly->Pos + 1;
+ DAGNode->Sequence = (char*)calloc(2, sizeof(char));
+ DAGNode->Sequence[0] = Poly->SNP[SNPIndex];
+ if (PrevNodesStart > -1)
+ {
+ for (PrevNodeIndex = PrevNodesStart; PrevNodeIndex <= PrevNodesEnd; PrevNodeIndex++)
+ {
+ GenomeDAGLinkBack(DAGNode, Interval->DAGNodes + PrevNodeIndex, 0);
+ }
+ }
+ DAGNodeIndex++;
+ }
+ PrevNodesStart = NewPrevNodesStart;
+ PrevNodesEnd = DAGNodeIndex - 1;
+ NextDAGStart = Poly->Pos + 1;
+ PolyIndex++;
+ }
+ ////////////////////////////////////////////////////////////////////////////////
+ // The DAG for the interval has now been constructed. Build exons for all DAG nodes.
+ for (DAGNodeIndex = 0; DAGNodeIndex < Interval->DAGNodeCount; DAGNodeIndex++)
+ {
+ DAGNode = Interval->DAGNodes + DAGNodeIndex;
+ if (!DAGNode->Sequence)
+ {
+ continue; // not a real DAG node.
+ }
+ if (DAGNode->End <= DAGNode->Start)
+ {
+ DAGNode = DAGNode; // no-op; presumably left as a spot for a debugger breakpoint on degenerate nodes
+ }
+ // Reverse-complement the DAG's sequence, if necessary:
+ Length = DAGNode->End - DAGNode->Start;
+ if (Length >= MAX_INTERVAL_LENGTH)
+ {
+ printf("** Warning: Genomic interval from %d to %d is MUCH too long to process; truncating!", DAGNode->Start, DAGNode->End);
+ DAGNode->Sequence[MAX_INTERVAL_LENGTH] = '\0';
+ }
+ if (ReverseFlag)
+ {
+ strcpy(RCDNABuffer, DAGNode->Sequence);
+ WriteReverseComplement(RCDNABuffer, DAGNode->Sequence);
+ }
+ DAGNode->Exons = (ExonNode**)calloc(3, sizeof(ExonNode*));
+ // Add two or three exons for this DAG node.
+ // Check the reading frame of the interval to decide where codons are supposed to begin.
+ ReadingFrameFlag = GetReadingFrameFlag(DAGNode->Start, DAGNode->End, 0, ReverseFlag);
+ if (ReadingFrameFlag & Interval->Flags)
+ {
+ // Reading Frame 0:
+ Exon = (ExonNode*)calloc(1, sizeof(ExonNode));
+ Exon->Prefix[0] = '\0';
+ GetExonSequence(Exon, DAGNode->Sequence, MinORFLength);
+ Exon->Start = DAGNode->Start;
+ Exon->End = DAGNode->End;
+ Exon->DAGNode = DAGNode;
+ DAGNode->Exons[0] = Exon;
+ AddExonToInterval(Interval, Exon);
+ }
+ ReadingFrameFlag = GetReadingFrameFlag(DAGNode->Start, DAGNode->End, 1, ReverseFlag);
+ if (ReadingFrameFlag & Interval->Flags)
+ {
+ // Reading frame 1:
+ Exon = (ExonNode*)calloc(sizeof(ExonNode), 1);
+ Exon->Prefix[0] = DAGNode->Sequence[0];
+ Exon->Prefix[1] = '\0';
+ GetExonSequence(Exon, DAGNode->Sequence + 1, MinORFLength);
+ Exon->Start = DAGNode->Start;
+ Exon->End = DAGNode->End;
+ Exon->DAGNode = DAGNode;
+ DAGNode->Exons[1] = Exon;
+ AddExonToInterval(Interval, Exon);
+ }
+ // Reading frame 2:
+ if (Length > 1)
+ {
+ ReadingFrameFlag = GetReadingFrameFlag(DAGNode->Start, DAGNode->End, 2, ReverseFlag);
+ if (ReadingFrameFlag & Interval->Flags)
+ {
+ Exon = (ExonNode*)calloc(sizeof(ExonNode), 1);
+ Exon->Prefix[0] = DAGNode->Sequence[0];
+ Exon->Prefix[1] = DAGNode->Sequence[1];
+ Exon->Prefix[2] = '\0';
+ GetExonSequence(Exon, DAGNode->Sequence + 2, MinORFLength);
+ Exon->Start = DAGNode->Start;
+ Exon->End = DAGNode->End;
+ Exon->DAGNode = DAGNode;
+ DAGNode->Exons[2] = Exon;
+ AddExonToInterval(Interval, Exon);
+ }
+ }
+ }
+}
+
+// Every interval gives rise to three exons (two, if it's only one base long).
+// If interval A links to interval B, then A's exons each link to a compatible
+// exon in B. Exception: If an exon with suffix length 1 links to an interval
+// of length 1, then we must go to the NEXT-next interval to complete a codon.
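+ // (For example, a leftover base "A", a one-base interval "T", and a following
+ // interval starting with "G" combine into the single codon "ATG".)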
+//
+// GenomicFile is the file containing the genomic sequence.
+int BuildAndWriteExons(FILE* GenomicFile, FILE* OutputFile, int ReverseFlag,
+ char* GeneName, int ChromosomeNumber, int MinORFLength)
+{
+ IntervalNode* Interval;
+ EdgeNode* Edge;
+ GeneNode* Node;
+ int IntervalCount = 0;
+ int ValidGeneFlag;
+ int VerboseFlag = 0;
+ //
+ // Construct 1-3 exons for each interval within the gene:
+ for (Node = g_GeneFirst; Node; Node = Node->Next)
+ {
+ BuildDAGForInterval(Node->Interval, GenomicFile, MinORFLength, ReverseFlag);
+ IntervalCount++;
+ }
+ if (IntervalCount > g_StatsLargestGeneSize)
+ {
+ g_StatsLargestGeneSize = IntervalCount;
+ }
+ //DebugPrintBuiltGene();
+ // Link up the DAG graphs for all the intervals:
+ for (Node = g_GeneFirst; Node; Node = Node->Next)
+ {
+ Interval = Node->Interval;
+
+ for (Edge = Interval->FirstForward; Edge; Edge = Edge->Next)
+ {
+ // Ignore edges that extend out of this gene (we'll get them in overlap)
+ if (!Edge->Interval->GNode)
+ {
+ continue;
+ }
+ LinkDAGAcrossIntervals(Interval, Edge, ReverseFlag);
+ }
+ }
+ //printf("\nLinked DAG across intervals:\n");
+ //DebugPrintBuiltGene();
+ // Link up the exons, in accordance with the DAG graph linkage:
+ for (Node = g_GeneFirst; Node; Node = Node->Next)
+ {
+ Interval = Node->Interval;
+ LinkIntervalExons(Interval, ReverseFlag);
+ }
+
+ CountExons(1);
+
+ // If an exon isn't part of any long reading frame, it can be dropped. And if it contains a stop
+ // codon, or the stop codon's prefix (or suffix), and/or some edges, can be dropped. Perform
+ // that filtering now:
+ //printf("\nLinked interval exons:\n");
+ if (VerboseFlag)
+ {
+ DebugPrintBuiltGene();
+ }
+ PruneShortORFs(ReverseFlag, MinORFLength);
+ // Exons may include 'masked-out' sequence blocks between stop codons. These
+ // sequences aren't needed for search, so delete them, splitting the exons if necessary.
+ //printf("\nPruned short ORFs:\n");
+ //DebugPrintBuiltGene();
+
+ PurgeNonCodingExonChunks(ReverseFlag);
+ //printf("\nPruned non-coding chunks:\n");
+ //DebugPrintBuiltGene();
+ CountExons(0);
+ SortExons(ReverseFlag);
+ // Write out a gene record:
+ if (VerboseFlag)
+ {
+ DebugPrintBuiltGene();
+ }
+ ValidGeneFlag = WriteGeneRecord(ChromosomeNumber, GeneName, ReverseFlag, OutputFile);
+
+ // Go back and free all the exon records:
+ for (Node = g_GeneFirst; Node; Node = Node->Next)
+ {
+ Interval = Node->Interval;
+ FreeIntervalExons(Interval);
+ FreeIntervalDAG(Interval);
+ }
+ return ValidGeneFlag;
+}
+
+// Delete an exon entirely!
+void DeleteExon(ExonNode* Exon)
+{
+ ExonNode* OtherExon;
+ ExonLink* Link;
+ ExonLink* NextLink;
+
+ // First, fix the pointers from the parent interval:
+ if (Exon->Interval->FirstExon == Exon)
+ {
+ Exon->Interval->FirstExon = Exon->Next;
+ if (Exon->Interval->LastExon == Exon)
+ {
+ Exon->Interval->LastExon = NULL;
+ }
+ }
+ else
+ {
+ for (OtherExon = Exon->Interval->FirstExon; OtherExon; OtherExon = OtherExon->Next)
+ {
+ if (OtherExon->Next == Exon)
+ {
+ OtherExon->Next = Exon->Next;
+ if (Exon->Interval->LastExon == Exon)
+ {
+ Exon->Interval->LastExon = OtherExon;
+ }
+ break;
+ }
+ }
+ }
+ // Now, free all the edges (and reciprocal edges):
+ Link = Exon->FirstBack;
+ while (Link)
+ {
+ NextLink = Link->Next;
+ DeleteExonLink(Exon, Link, 0);
+ Link = NextLink;
+ }
+ Link = Exon->FirstForward;
+ while (Link)
+ {
+ NextLink = Link->Next;
+ DeleteExonLink(Exon, Link, 1);
+ Link = NextLink;
+ }
+
+ // Now, free the exon itself:
+ SafeFree(Exon->Sequence);
+ SafeFree(Exon);
+}
+
+// Delete the specified Link from this Exon. ForwardFlag indicates
+// whether it's a forward link.
+void DeleteExonLink(ExonNode* Exon, ExonLink* Link, int ForwardFlag)
+{
+ ExonLink* OtherLink;
+ ExonNode* OtherExon;
+ ExonLink* Prev;
+ //
+ if (ForwardFlag)
+ {
+ // Update the exon's linked list of edges, removing Link:
+ for (OtherLink = Exon->FirstForward; OtherLink; OtherLink = OtherLink->Next)
+ {
+ if (OtherLink->Next == Link)
+ {
+ OtherLink->Next = Link->Next;
+ if (Exon->LastForward == Link)
+ {
+ Exon->LastForward = OtherLink;
+ }
+ break;
+ }
+ }
+ if (Exon->FirstForward == Link)
+ {
+ Exon->FirstForward = Link->Next;
+ }
+ if (Exon->LastForward == Link)
+ {
+ Exon->LastForward = NULL;
+ }
+
+ // Remove the link from the other exon:
+ OtherExon = Link->Exon;
+ Prev = NULL;
+ for (OtherLink = OtherExon->FirstBack; OtherLink; OtherLink = OtherLink->Next)
+ {
+ if (OtherLink->Exon == Exon && OtherLink->AA == Link->AA)
+ {
+ if (OtherExon->LastBack == OtherLink)
+ {
+ OtherExon->LastBack = Prev;
+ }
+ if (Prev)
+ {
+ Prev->Next = OtherLink->Next;
+ }
+ else
+ {
+ OtherExon->FirstBack = OtherLink->Next;
+ }
+ SafeFree(OtherLink);
+ break;
+ }
+ Prev = OtherLink;
+ }
+ SafeFree(Link);
+ } // forward link
+ else
+ {
+ // Update the exon's linked list of edges, removing Link:
+ for (OtherLink = Exon->FirstBack; OtherLink; OtherLink = OtherLink->Next)
+ {
+ if (OtherLink->Next == Link)
+ {
+ OtherLink->Next = Link->Next;
+ if (Exon->LastBack == Link)
+ {
+ Exon->LastBack = OtherLink;
+ }
+ break;
+ }
+ }
+ if (Exon->FirstBack == Link)
+ {
+ Exon->FirstBack = Link->Next;
+ }
+ if (Exon->LastBack == Link)
+ {
+ Exon->LastBack = NULL;
+ }
+
+ // Remove the link from the other exon:
+ OtherExon = Link->Exon;
+ Prev = NULL;
+ for (OtherLink = OtherExon->FirstForward; OtherLink; OtherLink = OtherLink->Next)
+ {
+ if (OtherLink->Exon == Exon && OtherLink->AA == Link->AA)
+ {
+ if (OtherExon->LastForward == OtherLink)
+ {
+ OtherExon->LastForward = Prev;
+ }
+ if (Prev)
+ {
+ Prev->Next = OtherLink->Next;
+ }
+ else
+ {
+ OtherExon->FirstForward = OtherLink->Next;
+ }
+ SafeFree(OtherLink);
+ break;
+ }
+ Prev = OtherLink;
+ }
+ SafeFree(Link);
+ } // backward link
+}
+
+// if Link is set, we've counted the exon itself and we're on this link.
+// if Link is null, we're entering the exon:
+int GeneFindLongestExtension(ExonNode* OldExon, ExonLink* Link, int LongEnough, int ForwardFlag)
+{
+ int Length;
+ ExonNode* Exon;
+ ExonLink* OtherLink;
+ int Extension;
+ int BestExtension;
+ int Pos;
+ //
+ if (Link && Link->AA)
+ {
+ Length = 1;
+ if (Length >= LongEnough)
+ {
+ return Length;
+ }
+ }
+ else
+ {
+ Length = 0;
+ }
+ // If Link is null we're just entering OldExon; otherwise scan the link's target exon:
+ Exon = Link ? Link->Exon : OldExon;
+
+ // Iterate over bases in the exon, and add to our length:
+ if (ForwardFlag)
+ {
+ for (Pos = 0; Pos < Exon->Length; Pos++)
+ {
+ if (Exon->Sequence[Pos] == 'X')
+ {
+ return Length;
+ }
+ Length++;
+ }
+ }
+ else
+ {
+ for (Pos = Exon->Length - 1; Pos >= 0; Pos--)
+ {
+ if (Exon->Sequence[Pos] == 'X')
+ {
+ return Length;
+ }
+ Length++;
+ }
+ }
+ if (Length >= LongEnough)
+ {
+ return Length;
+ }
+
+ // Continue following edges:
+ if (ForwardFlag)
+ {
+ OtherLink = Exon->FirstForward;
+ }
+ else
+ {
+ OtherLink = Exon->FirstBack;
+ }
+ BestExtension = 0;
+ while (OtherLink)
+ {
+ Extension = GeneFindLongestExtension(Exon, OtherLink, LongEnough - Length, ForwardFlag);
+ if (Length + Extension >= LongEnough)
+ {
+ return (Length + Extension);
+ }
+ BestExtension = max(BestExtension, Extension);
+ OtherLink = OtherLink->Next;
+ }
+ return (Length + BestExtension);
+}
+
+// For each exon:
+// if its length is zero, we're done
+// if it's all stop codons, delete the exon
+// if it's all 'real' residues, we're done
+// if it starts with one or more stop codons, delete them (and back-edges)
+// otherwise, it starts with one or more real residues. Split them to a separate exon.
+void PurgeNonCodingExonChunks(int ReverseFlag)
+{
+ GeneNode* GNode;
+ ExonNode* Exon;
+ ExonNode* NextExon = NULL;
+ ExonNode* NewExon;
+ int FirstReal;
+ int FirstStop;
+ char AA = 0;
+ ExonLink* Link;
+ ExonLink* OtherLink;
+ ExonLink* NextLink;
+ char* NewSequence;
+ int AminoIndex;
+ int AAEdgeBack;
+ int AAEdgeForward;
+ //
+ for (GNode = g_GeneFirst; GNode; GNode = GNode->Next)
+ {
+ Exon = GNode->Interval->FirstExon;
+ while (Exon)
+ {
+ // An EMPTY exon can be nuked:
+ if (Exon->Start == Exon->End)
+ {
+ NextExon = Exon->Next;
+ DeleteExon(Exon);
+ Exon = NextExon;
+ continue;
+ }
+ // This exon has no amino acids, but it has edges. Keep it:
+ if (Exon->Length == 0)
+ {
+ Exon = Exon->Next;
+ continue;
+ }
+
+ // Loop over residues to find the first stop codon (-1 if none) and the first real codon (-1 if none)
+ FirstReal = -1;
+ FirstStop = -1;
+ for (AminoIndex = 0; AminoIndex < Exon->Length; AminoIndex++)
+ {
+ AA = Exon->Sequence[AminoIndex];
+ if (AA == 'X')
+ {
+ if (FirstStop < 0)
+ {
+ FirstStop = AminoIndex;
+ }
+ }
+ else
+ {
+ if (FirstReal < 0)
+ {
+ FirstReal = AminoIndex;
+ }
+ }
+ }
+ // Count the number of AAEdges (edges with an amino acid char attached) back and forward
+ AAEdgeBack = 0;
+ AAEdgeForward = 0;
+ for (Link = Exon->FirstBack; Link; Link = Link->Next)
+ {
+ if (Link->AA)
+ {
+ AAEdgeBack++;
+ }
+ }
+ for (Link = Exon->FirstForward; Link; Link = Link->Next)
+ {
+ if (Link->AA)
+ {
+ AAEdgeForward++;
+ }
+ }
+ //printf("FirstReal %d FirstStop %d EdgeBack %d EdgeForward %d\n", FirstReal, FirstStop, AAEdgeBack, AAEdgeForward);
+ if (FirstReal == -1)
+ {
+ // This exon contains nothing but stop codons!
+ // We can't simply delete this exon, or the AA-edges leading into and out of
+ // it would be broken. Instead, truncate its sequence and delete any
+ // non-AA edges.
+ if (AAEdgeBack)
+ {
+ NewExon = (ExonNode*)calloc(1, sizeof(ExonNode));
+ NewExon->Interval = Exon->Interval;
+ NewExon->Length = 0;
+ if (ReverseFlag)
+ {
+ NewExon->Start = Exon->End - strlen(Exon->Prefix);
+ NewExon->End = Exon->End;
+ }
+ else
+ {
+ NewExon->Start = Exon->Start;
+ NewExon->End = Exon->Start + strlen(Exon->Prefix);
+ }
+ strcpy(NewExon->Prefix, Exon->Prefix);
+ // Assimilate all edges with AA into the new exon:
+ Link = Exon->FirstBack;
+ while (Link)
+ {
+ if (!Link->AA)
+ {
+ NextLink = Link->Next;
+ DeleteExonLink(Exon, Link, 0);
+ Link = NextLink;
+ continue;
+ }
+ // Fix the reciprocal links to point to the new exon:
+ for (OtherLink = Link->Exon->FirstForward; OtherLink; OtherLink = OtherLink->Next)
+ {
+ if (OtherLink->Exon == Exon && OtherLink->AA == Link->AA)
+ {
+ OtherLink->Exon = NewExon;
+ }
+ }
+ if (NewExon->FirstBack)
+ {
+ NewExon->LastBack->Next = Link;
+ }
+ else
+ {
+ NewExon->FirstBack = Link;
+ }
+ NewExon->LastBack = Link;
+ Link = Link->Next;
+ }
+ if (NewExon->LastBack)
+ {
+ NewExon->LastBack->Next = NULL;
+ }
+ Exon->FirstBack = NULL;
+ Exon->LastBack = NULL;
+ GNode->Interval->LastExon->Next = NewExon;
+ GNode->Interval->LastExon = NewExon;
+ }
+ if (AAEdgeForward)
+ {
+ NewExon = (ExonNode*)calloc(1, sizeof(ExonNode));
+ NewExon->Interval = Exon->Interval;
+ NewExon->Length = 0;
+ if (ReverseFlag)
+ {
+ NewExon->Start = Exon->Start;
+ NewExon->End = Exon->Start + strlen(Exon->Suffix);
+ strcpy(NewExon->Suffix, Exon->Suffix);
+ }
+ else
+ {
+ NewExon->Start = Exon->End - strlen(Exon->Suffix);
+ NewExon->End = Exon->End;
+ strcpy(NewExon->Suffix, Exon->Suffix);
+ }
+ // Assimilate all edges with AA into the new exon:
+ Link = Exon->FirstForward;
+ while (Link)
+ {
+ if (!Link->AA)
+ {
+ NextLink = Link->Next;
+ DeleteExonLink(Exon, Link, 1);
+ Link = NextLink;
+ continue;
+ }
+ // Fix the reciprocal links to point to the new exon:
+ for (OtherLink = Link->Exon->FirstBack; OtherLink; OtherLink = OtherLink->Next)
+ {
+ if (OtherLink->Exon == Exon && OtherLink->AA == Link->AA)
+ {
+ OtherLink->Exon = NewExon;
+ }
+ }
+ if (NewExon->FirstForward)
+ {
+ NewExon->LastForward->Next = Link;
+ }
+ else
+ {
+ NewExon->FirstForward = Link;
+ }
+ NewExon->LastForward = Link;
+ Link = Link->Next;
+ }
+ if (NewExon->LastForward)
+ {
+ NewExon->LastForward->Next = NULL;
+ }
+ Exon->FirstForward = NULL;
+ Exon->LastForward = NULL;
+ GNode->Interval->LastExon->Next = NewExon;
+ GNode->Interval->LastExon = NewExon;
+ }
+ NextExon = Exon->Next;
+ DeleteExon(Exon);
+ Exon = NextExon;
+ continue;
+ } // if exon contains only stop codons
+ if (FirstStop == -1)
+ {
+ // This exon contains no stop codons. Leave it alone, move on.
+ Exon = Exon->Next;
+ continue;
+ }
+ if (FirstStop == 0)
+ {
+ // This exon begins with one or more stop codons. Delete all back edges except
+ // those containing an amino acid:
+ if (AAEdgeBack)
+ {
+ NewExon = (ExonNode*)calloc(1, sizeof(ExonNode));
+ NewExon->Interval = Exon->Interval;
+ NewExon->Length = 0;
+ if (ReverseFlag)
+ {
+ NewExon->Start = Exon->End - strlen(Exon->Prefix);
+ NewExon->End = Exon->End;
+ }
+ else
+ {
+ NewExon->Start = Exon->Start;
+ NewExon->End = Exon->Start + strlen(Exon->Prefix);
+ }
+ strcpy(NewExon->Prefix, Exon->Prefix);
+ // Assimilate all edges with AA into the new exon:
+ Link = Exon->FirstBack;
+ while (Link)
+ {
+ if (!Link->AA)
+ {
+ NextLink = Link->Next;
+ DeleteExonLink(Exon, Link, 0);
+ Link = NextLink;
+ continue;
+ }
+ // Fix the reciprocal links to point to the new exon:
+ for (OtherLink = Link->Exon->FirstForward; OtherLink; OtherLink = OtherLink->Next)
+ {
+ if (OtherLink->Exon == Exon)
+ {
+ OtherLink->Exon = NewExon;
+ }
+ }
+ if (NewExon->FirstBack)
+ {
+ NewExon->LastBack->Next = Link;
+ }
+ else
+ {
+ NewExon->FirstBack = Link;
+ }
+ NewExon->LastBack = Link;
+ Link = Link->Next;
+ }
+ if (NewExon->LastBack)
+ {
+ NewExon->LastBack->Next = NULL;
+ }
+ Exon->FirstBack = NULL;
+ Exon->LastBack = NULL;
+ GNode->Interval->LastExon->Next = NewExon;
+ GNode->Interval->LastExon = NewExon;
+ }
+ else
+ {
+ Link = Exon->FirstBack;
+ while (Link)
+ {
+ NextLink = Link->Next;
+ DeleteExonLink(Exon, Link, 0);
+ Link = NextLink;
+ }
+ }
+ // Delete the exon's prefix, and move its start position up:
+ if (ReverseFlag)
+ {
+ Exon->End -= strlen(Exon->Prefix) + 3 * FirstReal;
+ }
+ else
+ {
+ Exon->Start += strlen(Exon->Prefix) + 3 * FirstReal;
+ }
+ Exon->Prefix[0] = '\0';
+ NewSequence = (char*)calloc(Exon->Length - FirstReal + 1, sizeof(char)); // 1 byte for null
+ strcpy(NewSequence, Exon->Sequence + FirstReal);
+ SafeFree(Exon->Sequence);
+ Exon->Sequence = NewSequence;
+ Exon->Length = Exon->Length - FirstReal;
+ // We'll revisit this exon in the next loop pass, in case it has more stop codons later on.
+ continue;
+ } // if sequence starts with stop codon
+
+ // This exon contains a stop codon, preceded by 'real' AAs. Build a new exon
+ // to hold our suffix. The old exon gets truncated and gets its genomic
+ // pos changed.
+ NewExon = (ExonNode*)calloc(1, sizeof(ExonNode));
+ NewExon->Interval = Exon->Interval;
+ NewExon->Length = Exon->Length - FirstStop - 1;
+ NewExon->Sequence = (char*)calloc(NewExon->Length + 1, sizeof(char));
+ strcpy(NewExon->Sequence, Exon->Sequence + FirstStop + 1);
+ if (ReverseFlag)
+ {
+ NewExon->Start = Exon->Start;
+ NewExon->End = Exon->End - strlen(Exon->Prefix) - (FirstStop + 1) * 3;
+ Exon->Start = NewExon->End + 3;
+ }
+ else
+ {
+ NewExon->End = Exon->End;
+ NewExon->Start = Exon->Start + strlen(Exon->Prefix) + (FirstStop + 1) * 3;
+ Exon->End = NewExon->Start - 3;
+ }
+
+ NewExon->FirstForward = Exon->FirstForward;
+ NewExon->LastForward = Exon->LastForward;
+ for (Link = Exon->FirstForward; Link; Link = Link->Next)
+ {
+ for (OtherLink = Link->Exon->FirstBack; OtherLink; OtherLink = OtherLink->Next)
+ {
+ if (OtherLink->Exon == Exon)
+ {
+ OtherLink->Exon = NewExon;
+ }
+ }
+ }
+ Exon->FirstForward = NULL;
+ Exon->LastForward = NULL;
+ Exon->Sequence[FirstStop] = '\0';
+ Exon->Length = FirstStop;
+ strcpy(NewExon->Suffix, Exon->Suffix);
+ Exon->Suffix[0] = '\0';
+ GNode->Interval->LastExon->Next = NewExon;
+ GNode->Interval->LastExon = NewExon;
+ Exon = Exon->Next;
+ continue;
+ } // Iteration over exons
+ } // Iteration over GNodes (intervals)
+} // PurgeNonCodingExonChunks
+
+int SetExonLinkExtensionLengthsBack(ExonNode* Exon, int MinimumORFLength, int IncludeBody)
+{
+ ExonLink* Link;
+ int Length;
+ int Pos;
+ //
+
+ // First case: We're starting INTO the exon sequence. We may stop partway.
+ if (IncludeBody && Exon->Sequence)
+ {
+ Length = 0;
+ for (Pos = Exon->Length - 1; Pos >= 0; Pos--)
+ {
+ if (Exon->Sequence[Pos] == 'X')
+ {
+ return Length;
+ }
+ Length++;
+ if (Length >= MinimumORFLength)
+ {
+ // That's long enough already!
+ return Length;
+ }
+ }
+ }
+ else
+ {
+ Length = 0;
+ }
+
+ if (Exon->MaxBackOverall != -1)
+ {
+ return Length + Exon->MaxBackOverall;
+ }
+
+ // Set Link->MaxLength for each backward link. We always do this when we're called with
+ // IncludeBody == 0; we MAY do it for IncludeBody == 1 (as necessary).
+ Exon->MaxBackOverall = 0;
+ for (Link = Exon->FirstBack; Link; Link = Link->Next)
+ {
+ // If the extension for the link is already known, note it and continue
+ if (Link->MaxLength != -1)
+ {
+ Exon->MaxBackOverall = max(Exon->MaxBackOverall, Link->MaxLength);
+ continue;
+ }
+ if (Link->AA)
+ {
+ Link->MaxLength = 1;
+ }
+ else
+ {
+ Link->MaxLength = 0;
+ }
+ Link->MaxLength += SetExonLinkExtensionLengthsBack(Link->Exon, MinimumORFLength, 1);
+ Exon->MaxBackOverall = max(Exon->MaxBackOverall, Link->MaxLength);
+ }
+ return Length + Exon->MaxBackOverall;
+}
+
+int SetExonLinkExtensionLengthsForward(ExonNode* Exon, int MinimumORFLength, int IncludeBody)
+{
+ ExonLink* Link;
+ int Length;
+ int Pos;
+ //
+
+ // First case: We're starting INTO the exon sequence. We may stop partway.
+ if (IncludeBody && Exon->Sequence)
+ {
+ Length = 0;
+ for (Pos = 0; Pos < Exon->Length; Pos++)
+ {
+ if (Exon->Sequence[Pos] == 'X')
+ {
+ return Length;
+ }
+ Length++;
+ if (Length >= MinimumORFLength)
+ {
+ // That's long enough already!
+ return Length;
+ }
+ }
+ }
+ else
+ {
+ Length = 0;
+ }
+
+ if (Exon->MaxForwardOverall != -1)
+ {
+ return Length + Exon->MaxForwardOverall;
+ }
+
+ // Set Link->MaxLength for each forward link. We always do this when we're called with
+ // IncludeBody == 0; we MAY do it for IncludeBody == 1 (as necessary).
+ Exon->MaxForwardOverall = 0;
+ for (Link = Exon->FirstForward; Link; Link = Link->Next)
+ {
+ // If the extension for the link is already known, note it and continue
+ if (Link->MaxLength != -1)
+ {
+ Exon->MaxForwardOverall = max(Exon->MaxForwardOverall, Link->MaxLength);
+ continue;
+ }
+ if (Link->AA)
+ {
+ Link->MaxLength = 1;
+ }
+ else
+ {
+ Link->MaxLength = 0;
+ }
+ Link->MaxLength += SetExonLinkExtensionLengthsForward(Link->Exon, MinimumORFLength, 1);
+ Exon->MaxForwardOverall = max(Exon->MaxForwardOverall, Link->MaxLength);
+ }
+ return Length + Exon->MaxForwardOverall;
+}
+
+// Short open reading frame pruning:
+// - Determine the maximum length of each reading frame attainable by linking forward along the graph F1...Fm
+// - Determine the maximum length of each reading frame attainable by linking backward along the graph B1...Bn
+// - If there are no stop codons:
+// If len + max(F1...Fm) + max(B1...Bn) < N, kill the exon and all links
+// Else:
+// If len + max(B1...Bn) + Fi < N, kill forward link i
+// If len + max(F1...Fm) + Bi < N, kill backward link i
+// - Let S be the length of the suffix (all AAs after the last stop codon)
+// If S + max(F1...Fm) < N, mask S and remove all forward links
+// Else if S + Fi < N, remove Fi
+// - Let P be the length of the prefix (all AAs up to the first stop codon)
+// If P + max(B1...Bn) < N, mask P and remove all backward links
+// Else if P + Bi < N, remove Bi
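+// Worked example (all numbers made up for illustration): with N = MinimumORFLength = 7, an exon
+// of length 3 with no stop codons, max(B1...Bn) = 1 and max(F1...Fm) = 2 gives 1 + 3 + 2 = 6 < 7,
+// so the exon and all of its links are removed. If instead max(F1...Fm) were 4 the exon would
+// survive, but any individual forward link Fi with 1 + 3 + Fi < 7 (that is, Fi <= 2) would still
+// be cut.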
+void PruneShortORFs(int ReverseFlag, int MinimumORFLength)
+{
+ GeneNode* GNode;
+ ExonNode* Exon;
+ ExonNode* NextExon = NULL;
+ ExonLink* Link;
+ ExonLink* NextLink;
+ int LinkIndex;
+ int PrefixLength;
+ int SuffixLength;
+ int Pos;
+ char* NewSequence;
+ int CutsPerformed;
+ int Flag;
+ int GeneNodeIndex = 0;
+ int ExonIndex;
+ //
+ // if MinOrfLength <= 0, then don't filter.
+ if (MinimumORFLength <= 0)
+ {
+ return;
+ }
+ // Iterate over all exons in all intervals. Init the link lengths.
+ for (GNode = g_GeneFirst; GNode; GNode = GNode->Next)
+ {
+ for (Exon = GNode->Interval->FirstExon; Exon; Exon = Exon->Next)
+ {
+ for (Link = Exon->FirstForward; Link; Link = Link->Next)
+ {
+ Link->MaxLength = -1;
+ }
+ for (Link = Exon->FirstBack; Link; Link = Link->Next)
+ {
+ Link->MaxLength = -1;
+ }
+ // Exon->MaxBackOverall is set to -1 to indicate that it hasn't been
+ // processed yet.
+ Exon->MaxBackOverall = -1;
+ Exon->MaxForwardOverall = -1;
+ }
+ }
+ // Iterate over all exons in all intervals. Set the max lengths of all
+ // their links.
+ for (GNode = g_GeneFirst; GNode; GNode = GNode->Next)
+ {
+ for (Exon = GNode->Interval->FirstExon; Exon; Exon = Exon->Next)
+ {
+ if (Exon->MaxBackOverall == -1)
+ {
+ SetExonLinkExtensionLengthsBack(Exon, MinimumORFLength, 0);
+ }
+ if (Exon->MaxForwardOverall == -1)
+ {
+ SetExonLinkExtensionLengthsForward(Exon, MinimumORFLength, 0);
+ }
+ }
+ }
+
+ for (GNode = g_GeneFirst; GNode; GNode = GNode->Next)
+ {
+ //printf("Start gene node #%d\n", GeneNodeIndex);
+ Exon = GNode->Interval->FirstExon;
+ ExonIndex = 0;
+ while (Exon)
+ {
+ //printf("Start GeneNode#%d exon#%d\n", GeneNodeIndex, ExonIndex);
+ ExonIndex++;
+
+ // Measure the exon, its prefix, and its suffix.
+ PrefixLength = 0;
+ SuffixLength = 0;
+ if (Exon->Sequence)
+ {
+ for (Pos = 0; Pos < Exon->Length; Pos++)
+ {
+ if (Exon->Sequence[Pos] == 'X')
+ {
+ break;
+ }
+ PrefixLength++;
+ }
+ for (Pos = Exon->Length - 1; Pos >= 0; Pos--)
+ {
+ if (Exon->Sequence[Pos] == 'X')
+ {
+ break;
+ }
+ SuffixLength++;
+ }
+ }
+ // Consider removing the exon entirely:
+ if (Exon->Length + Exon->MaxBackOverall + Exon->MaxForwardOverall < MinimumORFLength)
+ {
+ //printf("*Delete the exon entirely!\n");
+ // Zap! Free the exon, and its links.
+ NextExon = Exon->Next;
+ DeleteExon(Exon);
+ Exon = NextExon;
+ continue;
+ }
+
+ if (PrefixLength == Exon->Length)
+ {
+ // This exon contains no stop codons. And we cannot delete it entirely,
+ // but we can perhaps still prune a few links.
+ // Try removing forward links:
+ Link = Exon->FirstForward;
+ LinkIndex = 0;
+ while (Link)
+ {
+ if (Exon->MaxBackOverall + Exon->Length + Link->MaxLength < MinimumORFLength)
+ {
+ // This link can't be part of a full-length ORF. So, let's remove the link:
+ NextLink = Link->Next;
+ //printf("*Delete a forward link\n");
+ DeleteExonLink(Exon, Link, 1);
+ Link = NextLink;
+ LinkIndex++;
+ continue;
+ }
+ LinkIndex++;
+ Link = Link->Next;
+ }
+ // Try removing backward links:
+ Link = Exon->FirstBack;
+ LinkIndex = 0;
+ while (Link)
+ {
+ if (Exon->MaxForwardOverall + Exon->Length + Link->MaxLength < MinimumORFLength)
+ {
+ // This link can't be part of a full-length ORF. So, let's remove the link:
+ NextLink = Link->Next;
+ //printf("*Delete a backward link\n");
+ DeleteExonLink(Exon, Link, 0);
+ Link = NextLink;
+ LinkIndex++;
+ continue;
+ }
+ LinkIndex++;
+ Link = Link->Next;
+ }
+ }
+ else
+ {
+ // This exon contains at least one stop codon. We'll consider pruning the
+ // prefix (everything up to and including the first stop) and the suffix (the last stop
+ // and everything beyond it).
+ CutsPerformed = 0;
+ // First, consider removing the prefix (or some incoming links):
+ if (PrefixLength + Exon->MaxBackOverall < MinimumORFLength)
+ {
+ // We can cut the prefix! First delete all backward links:
+ Link = Exon->FirstBack;
+ while (Link)
+ {
+ NextLink = Link->Next;
+ DeleteExonLink(Exon, Link, 0);
+ Link = NextLink;
+ }
+ // If there's at least one non-stop character after the prefix,
+ // then the exon still has a sequence:
+ if (Exon->Length > PrefixLength + 1)
+ {
+ NewSequence = (char*)calloc(Exon->Length - PrefixLength, sizeof(char));
+ strcpy(NewSequence, Exon->Sequence + PrefixLength + 1);
+ Exon->Length = Exon->Length - PrefixLength - 1;
+ SafeFree(Exon->Sequence);
+ Exon->Sequence = NewSequence;
+ // Fix the genomic start coordinate:
+ if (ReverseFlag)
+ {
+ Exon->End -= strlen(Exon->Prefix) + (PrefixLength + 1)*3;
+ }
+ else
+ {
+ Exon->Start += strlen(Exon->Prefix) + (PrefixLength + 1)*3;
+ }
+ }
+ else
+ {
+ // The exon's body is gone!
+ if (ReverseFlag)
+ {
+ Exon->End -= strlen(Exon->Prefix) + Exon->Length * 3;
+ }
+ else
+ {
+ Exon->Start += strlen(Exon->Prefix) + Exon->Length * 3;
+ }
+ Exon->Length = 0;
+ SafeFree(Exon->Sequence);
+ Exon->Sequence = NULL;
+ }
+ CutsPerformed++;
+ Exon->Prefix[0] = '\0';
+ }
+ // Consider removing the suffix (or some outgoing links):
+ if (SuffixLength + Exon->MaxForwardOverall < MinimumORFLength)
+ {
+ // Delete all forward links:
+ //printf("*Delete all forward links\n");
+ Link = Exon->FirstForward;
+ while (Link)
+ {
+ NextLink = Link->Next;
+ DeleteExonLink(Exon, Link, 1);
+ Link = NextLink;
+ }
+ if (Exon->Length > SuffixLength + 1)
+ {
+ NewSequence = (char*)calloc(Exon->Length - SuffixLength, sizeof(char));
+ strncpy(NewSequence, Exon->Sequence, Exon->Length - SuffixLength - 1);
+ Exon->Length = Exon->Length - SuffixLength - 1;
+ NewSequence[Exon->Length] = '\0';
+ SafeFree(Exon->Sequence);
+ Exon->Sequence = NewSequence;
+ if (ReverseFlag)
+ {
+ Exon->Start += strlen(Exon->Suffix) + (SuffixLength + 1) * 3;
+ }
+ else
+ {
+ Exon->End -= strlen(Exon->Suffix) + (SuffixLength + 1) * 3;
+ }
+ }
+ else
+ {
+ // The exon's body is gone!
+ if (ReverseFlag)
+ {
+ Exon->Start += strlen(Exon->Suffix) + Exon->Length * 3;
+ }
+ else
+ {
+ Exon->End -= strlen(Exon->Suffix) + Exon->Length * 3;
+ }
+ Exon->Length = 0;
+ SafeFree(Exon->Sequence);
+ Exon->Sequence = NULL;
+
+ }
+ Exon->Suffix[0] = '\0';
+ CutsPerformed++;
+ }
+ // If we removed the exon body, and there's no prefix or suffix left, then cut the exon itself:
+ if (Exon->Start == Exon->End)
+ {
+ NextExon = Exon->Next;
+ DeleteExon(Exon);
+ Exon = NextExon;
+ continue;
+ }
+ // If we cut a prefix and cut a suffix, then we have no links, and it's entirely possible
+ // that no real sequence remains. Check that at least one non-stop residue is left:
+ if (CutsPerformed == 2)
+ {
+ Flag = 0;
+ for (Pos = 0; Pos < Exon->Length; Pos++)
+ {
+ if (Exon->Sequence[Pos] != 'X')
+ {
+ Flag = 1;
+ break;
+ }
+ }
+ if (!Flag)
+ {
+ NextExon = Exon->Next;
+ //printf("*Delete the exon itself\n");
+ DeleteExon(Exon);
+ Exon = NextExon;
+ continue;
+ }
+ }
+ // Even if we didn't cut the prefix (or suffix) outright, we may be able to remove
+ // some incoming and outgoing links:
+ Link = Exon->FirstBack;
+ LinkIndex = 0;
+ while (Link)
+ {
+ if (Link->MaxLength + PrefixLength < MinimumORFLength)
+ {
+ NextLink = Link->Next;
+ //printf("*Delete ONE backward link\n");
+ DeleteExonLink(Exon, Link, 0);
+ Link = NextLink;
+ LinkIndex++;
+ continue;
+ }
+ Link = Link->Next;
+ LinkIndex++;
+ }
+ Link = Exon->FirstForward;
+ LinkIndex = 0;
+ while (Link)
+ {
+ if (Link->MaxLength + SuffixLength < MinimumORFLength)
+ {
+ NextLink = Link->Next;
+ //printf("*Delete ONE forward link\n");
+ DeleteExonLink(Exon, Link, 1);
+ Link = NextLink;
+ LinkIndex++;
+ continue;
+ }
+ Link = Link->Next;
+ LinkIndex++;
+ }
+ } // if the exon contains a stop codon
+ Exon = Exon->Next;
+ } // exon loop
+ GeneNodeIndex++;
+ } // gene node loop
+}
+
+// Prepare a splice-db for a particular target gene. We'll read all the intervals for the given
+// chromosome number, perform the merge and intersect algorithms, and then construct a set
+// of genomic intervals which 'satisfy' the target.
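+// A hypothetical invocation (argument values made up for illustration): build the graph for a
+// gene on the forward strand of chromosome 1 spanning roughly positions 1000000..1050000,
+// writing the result to Gene1.dat with a minimum ORF length of 7:
+// PrepareOneGeneSpliceDB(1, 0, 1000000, 1050000, "Gene1.dat", "GENE1", 7);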
+void PrepareOneGeneSpliceDB(int ChromosomeNumber, int ReverseFlag, int IntervalStart,
+ int IntervalEnd, char* CustomFileName, char* GeneName, int MinORFLength)
+{
+ char FileName[1024];
+ char GenomeFileName[1024];
+ FILE* GenomicFile;
+ FILE* CustomFile;
+ GeneNode* GNode;
+ IntervalNode* Interval;
+ int SatisfiedOne;
+ char ReverseChar;
+ //
+ if (ReverseFlag)
+ {
+ ReverseChar = '-';
+ }
+ else
+ {
+ ReverseChar = '+';
+ }
+ sprintf(FileName, "NewSpliceDB\\%d%c.filtered", ChromosomeNumber, ReverseChar);
+ //ParseIntervalsFlatFile(FileName, -1);
+ ParseIntervalsESTBinaryFile(FileName);
+
+ /////////////////////////////////////////////////////////////////
+ //// For purposes of debugging, we can trim the list of intervals a bit. (Debug printout of a whole
+ //// chromosome is unwieldy!) In production, we MUST NOT trim, because one of the intervals in the
+ //// master-interval may be linked to a far-away interval.
+ //PruneEdge = IntervalStart - 5000;
+ //while (g_FirstInterval->End < PruneEdge)
+ //{
+ // RemoveInterval(g_FirstInterval, 0);
+ //}
+ //PruneEdge = IntervalEnd + 5000;
+ //while (g_LastInterval->Start > PruneEdge)
+ //{
+ // RemoveInterval(g_LastInterval, 0);
+ //}
+
+ printf("BEFORE merge:\n");
+ DebugPrintIntervals(1, 1, -1, -1);
+ MergeIntervals();
+ printf("AFTER merge:\n");
+ DebugPrintIntervals(1, 2, -1, -1);
+ IntersectIntervals();
+ printf("AFTER intersect:\n");
+ DebugPrintIntervals(1, 3, -1, -1);
+
+ //sprintf(GenomeFileName, "C:\\source\\Bafna\\Splice\\chromFa\\chr%d.trie", ChromosomeNumber);
+ sprintf(GenomeFileName, "e:\\Chromosome\\chr%d.trie", ChromosomeNumber);
+ GenomicFile = fopen(GenomeFileName, "rb");
+ CustomFile = fopen(CustomFileName, "wb");
+ // Create the gene node list. First, add every interval that overlaps
+ // the requested 'master interval':
+ for (Interval = g_FirstInterval; Interval; Interval = Interval->Next)
+ {
+ if (Interval->Start > IntervalEnd)
+ {
+ break;
+ }
+ if (Interval->End < IntervalStart)
+ {
+ continue;
+ }
+ GNode = (GeneNode*)calloc(1, sizeof(GeneNode));
+ GNode->Interval = Interval;
+ Interval->GNode = GNode;
+ if (g_GeneFirst)
+ {
+ g_GeneLast->Next = GNode;
+ GNode->Prev = g_GeneLast;
+ }
+ else
+ {
+ g_GeneFirst = GNode;
+ }
+ g_GeneLast = GNode;
+ }
+ // Iterate: Find the first interval overlapping the master which is not satisfied. Then, satisfy it.
+ // Break after every interval overlapping the master has been satisfied.
+ while (1)
+ {
+ SatisfiedOne = 0;
+ for (GNode = g_GeneFirst; GNode; GNode = GNode->Next)
+ {
+ if (GNode->Interval->End < IntervalStart || GNode->Interval->Start > IntervalEnd)
+ {
+ continue; // We needn't satisfy this one, since it's not in the master-interval.
+ }
+ if (!GNode->Interval->Satisfied)
+ {
+ SatisfyIntervalForward(GNode, 0);
+ SatisfyIntervalBack(GNode, 0);
+ GNode->Interval->Satisfied = 1;
+ SatisfiedOne = 1;
+ break;
+ }
+ }
+ if (!SatisfiedOne)
+ {
+ // Everyone's happy, so stop now.
+ break;
+ }
+ }
+
+ BuildAndWriteExons(GenomicFile, CustomFile, ReverseFlag, GeneName, ChromosomeNumber, MinORFLength);
+ fclose(GenomicFile);
+ fclose(CustomFile);
+
+ FreeGeneNodes();
+ // Free the interval list!
+ while (g_FirstInterval)
+ {
+ RemoveInterval(g_FirstInterval, 0);
+ }
+
+}
+
+// Parse a binary file listing genomic intervals, with links between
+// them. Convert it into an exon graph and write it out.
+void PrepareSpliceDB(int ChromosomeNumber, int ReverseFlag, int MinORFLength)
+{
+ FILE* StatsFile;
+ char ReverseChar;
+ char FileName[1024];
+ char ChromosomeFileName[1024];
+ char OutputFileName[1024];
+ int GeneCount;
+
+ if (ReverseFlag)
+ {
+ ReverseChar = '-';
+ }
+ else
+ {
+ ReverseChar = '+';
+ }
+
+ ////////////////////////////////////////////////////////////////////////////
+ // HUMAN data sources:
+ // We can parse ESTs, or gene finder output, or BOTH. (Both may be slow)
+ // ESTREF, if reference sequences are included, or EST, if only ESTs are included:
+ //sprintf(FileName, "ESTREF\\%d%c.filtered", ChromosomeNumber, ReverseChar); // %%% hard-coded path
+ sprintf(FileName, "EST\\%d%c.filtered", ChromosomeNumber, ReverseChar); // %%% hard-coded path
+ ParseIntervalsESTBinaryFile(FileName);
+ sprintf(FileName, "GeneFindDB\\%d%c.dat", ChromosomeNumber, ReverseChar); // %%% hard-coded path
+ ParseIntervalsGeneFindBinaryFile(FileName);
+ sprintf(ChromosomeFileName, "e:\\Chromosome\\chr%d.trie", ChromosomeNumber);
+ sprintf(OutputFileName, "ESTSpliceDB\\%d%c.dat", ChromosomeNumber, ReverseChar);
+
+ printf("BEFORE merge:\n");
+ DebugPrintIntervals(-1, 1, -1, -1);
+ MergeIntervals();
+ printf("AFTER merge:\n");
+ DebugPrintIntervals(-1, 2, -1, -1);
+ IntersectIntervals();
+ printf("AFTER intersect:\n");
+ DebugPrintIntervals(-1, 3, -1, -1);
+
+ GeneCount = WriteGenesForIntervals(ChromosomeFileName, OutputFileName, ChromosomeNumber, ReverseFlag, MinORFLength);
+ //WriteGenesForIntervals("C:\\source\\Bafna\\Splice\\chromFa\\chr11.trie", "ESTSpliceDB\\11-.dat", 11, 1);
+ StatsFile = fopen("SplicePrepStats.txt", "a+");
+ //fprintf(StatsFile, "Genes have been written out. Statistics:\n");
+ fprintf(StatsFile, "%d\t", ChromosomeNumber);
+ fprintf(StatsFile, "%d\t", ReverseFlag);
+ fprintf(StatsFile, "%d\t", GeneCount);
+ fprintf(StatsFile, "%d\t", g_StatsIncompleteGeneCount);
+ fprintf(StatsFile, "%d\t", g_StatsLargestGeneSize);
+ fprintf(StatsFile, "%d\t", g_StatsLargestGeneRecordNumber);
+ fprintf(StatsFile, "%d\t", g_StatsIntervalsBeforeMerge);
+ fprintf(StatsFile, "%d\t", g_StatsEdgesBeforeMerge);
+ fprintf(StatsFile, "%d\t", g_StatsIntervalsAfterMerge);
+ fprintf(StatsFile, "%d\t", g_StatsEdgesAfterMerge);
+ fprintf(StatsFile, "%d\t", g_StatsIntervalsAfterIntersect);
+ fprintf(StatsFile, "%d\t", g_StatsEdgesAfterIntersect);
+ fprintf(StatsFile, "%d\t", g_StatsTotalExonsWritten);
+ fprintf(StatsFile, "%d\t", g_StatsTotalEdgesWritten);
+ printf("Exon counts:\n");
+ fprintf(StatsFile, "\t%d\t", g_PolymorphismCount);
+ printf("%d\t%d\t%d\t\t%d\t%d\t%d\t", g_UFTotalExons, g_UFTotalEdges, g_UFTotalAA, g_TotalExons, g_TotalEdges, g_TotalAA);
+ fprintf(StatsFile, "\t%d\t%d\t%d\t\t%d\t%d\t%d\t", g_UFTotalExons, g_UFTotalEdges, g_UFTotalAA, g_TotalExons, g_TotalEdges, g_TotalAA);
+ // How many exons are there...if you count adjacent exons as a single 'real' exon?
+ fprintf(StatsFile, "\t%d\t%d\t", g_TotalTrueExons, g_TotalTrueEdges);
+ fprintf(StatsFile, "\n");
+ fclose(StatsFile);
+
+ // Free the interval list!
+ while (g_FirstInterval)
+ {
+ RemoveInterval(g_FirstInterval, 0);
+ }
+}
diff --git a/SpliceDB.h b/SpliceDB.h
new file mode 100644
index 0000000..5ee7f36
--- /dev/null
+++ b/SpliceDB.h
@@ -0,0 +1,150 @@
+//Title: SpliceDB.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef SPLICEDB_H
+#define SPLICEDB_H
+
+#include <stdio.h>
+#include "Utils.h"
+#include "Trie.h"
+
+typedef struct GenomeDAGLink
+{
+ struct GenomeDAGNode* Node;
+ struct GenomeDAGLink* Next;
+ int Count;
+} GenomeDAGLink;
+
+#define MAX_DAG_NODE_LINKS 3
+
+typedef struct GenomeDAGNode
+{
+ int Start;
+ int End;
+ char* Sequence;
+ GenomeDAGLink* FirstForward;
+ GenomeDAGLink* FirstBack;
+ //GenomeDAGNode** Next[MAX_DAG_NODE_LINKS];
+ //GenomeDAGNode** Prev[MAX_DAG_NODE_LINKS];
+ struct ExonNode** Exons;
+ //GenomeDAGLink* FirstForward;
+ //GenomeDAGLink* FirstBack;
+} GenomeDAGNode;
+
+typedef struct IntervalNode
+{
+ int Start;
+ int End; // exclusive
+ int Occurrences;
+ int Satisfied;
+ unsigned int OriginalFilePos;
+ struct EdgeNode* FirstForward;
+ struct EdgeNode* LastForward;
+ struct EdgeNode* FirstBack;
+ struct EdgeNode* LastBack;
+ struct IntervalNode* Prev;
+ struct IntervalNode* Next;
+ struct ExonNode* FirstExon;
+ struct ExonNode* LastExon;
+ struct GeneNode* GNode; // non-null while this interval is in a pending gene.
+ int DAGNodeCount;
+ GenomeDAGNode* DAGNodes;
+ int Flags; // for keeping track of which reading frames we permit!
+} IntervalNode;
+
+typedef struct GeneNode
+{
+ IntervalNode* Interval;
+ // RX is the minimum covered length of any path originating at this interval and extending forward.
+ // We set RX during the 'satisfaction' process, so that we can note the (partial) satisfaction
+ // of intervals other than the seed.
+ // The 'covered' section of a path is the portion that's already part of the gene.
+ // RX is initialized to 0.
+ // During satisfaction procedure:
+ // If RX is big enough already, return without recursing. Otherwise:
+ // RX is set to 9999 if we have no forward edges.
+ // Otherwise, RX is set to the minimum value of the outgoing edge's interval's length (plus return value of the recursive
+ // satisfaction call, if any).
+ int RX;
+ int LX;
+ struct GeneNode* Prev;
+ struct GeneNode* Next;
+} GeneNode;
+
+// ExonNode is used while constructing the database.
+// In production, use ExonStruct from Spliced.h instead.
+typedef struct ExonNode
+{
+ IntervalNode* Interval;
+ struct ExonLink* FirstForward;
+ struct ExonLink* LastForward;
+ struct ExonLink* FirstBack;
+ struct ExonLink* LastBack;
+ struct ExonNode* Next;
+ char Prefix[3]; // one or two characters, and null-terminator
+ char Suffix[3];
+ int Index;
+ int Start; // start (in genomic coordinates, usually same as the parent interval's start)
+ int End; //end (in genomic coordinates)
+ int Length; // length in amino acids (not in genomic coordinates)
+ char* Sequence;
+ GenomeDAGNode* DAGNode;
+ int MaxForwardOverall;
+ int MaxBackOverall;
+
+} ExonNode;
+
+typedef struct ExonLink
+{
+ char AA;
+ ExonNode* Exon;
+ int Power;
+ // maximum peptide length achievable with this amino acid (if any) and the next exon,
+ // until stop codon or edge of graph.
+ int MaxLength;
+ struct ExonLink* Next;
+} ExonLink;
+
+typedef struct EdgeNode
+{
+ int Count;
+ float Score;
+ IntervalNode* Interval;
+ struct EdgeNode* Prev;
+ struct EdgeNode* Next;
+} EdgeNode;
+
+void PrepareSpliceDB(int ChromosomeNumber, int ReverseFlag, int MinORFLength);
+void PrepareOneGeneSpliceDB(int ChromosomeNumber, int ReverseFlag, int IntervalStart, int IntervalEnd,
+ char* CustomFileName, char* GeneName, int MinORFLength);
+void TestSpliceDB(int argc, char** argv);
+#endif // SPLICEDB_H
diff --git a/SpliceScan.c b/SpliceScan.c
new file mode 100644
index 0000000..249723c
--- /dev/null
+++ b/SpliceScan.c
@@ -0,0 +1,1003 @@
+//Title: SpliceScan.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+//
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+#include "CMemLeak.h"
+#include "Utils.h"
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include "Errors.h"
+#include "Trie.h"
+#include "Inspect.h"
+#include "Spliced.h"
+#include "SpliceDB.h"
+
+// SpliceScan.c is not used in production. Its job is to take a standard
+// database (.trie format), and check whether the proteins in that database
+// are present in a splice-tolerant database. Because EST coverage is
+// incomplete, we may be missing some exons from a gene, or missing some
+// genes entirely. And due to polymorphisms and errors
+// in gene and protein sequencing, there will be some minor differences.
+// We want to quantify how many genes are missing, and how extensive the
+// differences are.
+
+// We do the following for each protein:
+// - Take all 8-mers from the protein, put them in a trie.
+// ASSUMPTION: If the protein is present at all, there will be 8 consecutive residues
+// present without error
+// ASSUMPTION: any proteins with length <8aa can be ignored (yes, IPI has records of length <8...)
+// - Use the trie to search each gene in the splice-tolerant database
+// - Count the number of leaves that were matched. If the rate is above (near?) our best so far,
+// flag all the characters that were matched. Remember the percentage of characters matched,
+// the span from the first matched character to the last, and which gene record gave the match.
+// ASSUMPTION: 8mers aren't repeated within a protein
+// ASSUMPTION: Chance matches are very rare. Our numbers will be distorted only slightly
+// by considering 8mers that appear at the wrong position.
+// - When you run out of genes, or when you get 95% or better match, report the best match.
+// Free the trie and start the next record.
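+//
+// For example (illustrative sequence): the protein "ACDEFGHIKL" (length 10) contributes the
+// 8-mers ACDEFGHI, CDEFGHIK and DEFGHIKL to the trie; a gene record that hits any two of those
+// leaves covers at least 9 of the 10 residues once the overlapping words are merged.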
+
+//#define SS_TRIE_DEBUG 1
+
+#define PROTEIN_NAME_BLOCK 81
+#define SS_BLOCK_SIZE 1000
+#define MAX_TRIE_NODE_COUNT 2000000
+int g_TrieNodeMatches[SS_BLOCK_SIZE];
+int TrieNodeHitFlags[MAX_TRIE_NODE_COUNT];
+int g_NextTrieLeafIndex;
+
+typedef struct SSTrieNode
+{
+ void* Children[26]; // SSTrieNodes or SSTrieLeafs
+ struct SSTrieNode* FailureNode;
+ int FailureDepth;
+#ifdef SS_TRIE_DEBUG
+ int Depth; // for debugging only!
+ char Buffer[16]; // for debugging only!
+#endif // SS_TRIE_DEBUG
+} SSTrieNode;
+
+typedef struct SSTrieLeafNode
+{
+ int ProteinNumber;
+ int ProteinPos;
+ struct SSTrieLeafNode* Next;
+} SSTrieLeafNode;
+
+typedef struct SSTrieLeaf
+{
+ struct SSTrieNode* FailureNode;
+ int FailureDepth;
+ // Index is an index into TrieNodeHitFlags (an array of MAX_TRIE_NODE_COUNT entries). We use an index
+ // instead of storing the hit flag in the leaf. Why? Because we have
+ // to reset all the flags to zero after every gene record. Doing so with
+ // memset is much faster than traversing the trie!
+ int Index;
+ SSTrieLeafNode* Head;
+#ifdef SS_TRIE_DEBUG
+ char Buffer[16]; // for debugging only!
+#endif // SS_TRIE_DEBUG
+} SSTrieLeaf;
+
+void SSTrieFailureNodeHelper(SSTrieNode* Root, char* Buffer, SSTrieNode* FailedNode, int Depth)
+{
+ int SuffixStart;
+ int BufferPos;
+ int FailureDepth;
+ SSTrieNode* FailureNode = NULL;
+ SSTrieNode* Node;
+ SSTrieLeaf* Leaf;
+ int AA;
+ //
+ ////////////////////////////////////////////////////////////////////////////
+ // Set this node's failure-node, by finding the longest proper suffix of Buffer
+ // that reaches a node:
+ if (Depth > 1)
+ {
+ for (SuffixStart = 1; SuffixStart < Depth; SuffixStart++)
+ {
+ Node = Root;
+ for (BufferPos = SuffixStart; BufferPos < Depth; BufferPos++)
+ {
+ Node = Node->Children[Buffer[BufferPos]];
+ if (!Node)
+ {
+ break;
+ }
+ }
+ if (Node)
+ {
+ // The suffix matched!
+ FailureDepth = Depth - SuffixStart;
+ FailureNode = Node;
+ break;
+ }
+ }
+ if (!FailureNode)
+ {
+ FailureNode = Root;
+ FailureDepth = 0;
+ }
+ if (Depth == 8)
+ {
+ Leaf = (SSTrieLeaf*)FailedNode;
+ Leaf->FailureDepth = FailureDepth;
+ Leaf->FailureNode = FailureNode;
+ }
+ else
+ {
+ FailedNode->FailureDepth = FailureDepth;
+ FailedNode->FailureNode = FailureNode;
+ }
+ }
+ else
+ {
+ // A depth-1 node. Always gets the root as its failure node:
+ FailedNode->FailureDepth = 0;
+ FailedNode->FailureNode = Root;
+ }
+ ////////////////////////////////////////////////////////////////////////////
+ // Set our children's failure-nodes:
+ if (Depth < 8)
+ {
+ for (AA = 0; AA < 26; AA++)
+ {
+ Node = FailedNode->Children[AA];
+ if (Node)
+ {
+ Buffer[Depth] = AA;
+ SSTrieFailureNodeHelper(Root, Buffer, Node, Depth + 1);
+ }
+ }
+ }
+}
+
+// Initialize all the failure nodes for the trie.
+void SetSSTrieFailureNodes(SSTrieNode* Root)
+{
+ char Buffer[16];
+ int AA;
+ SSTrieNode* Child;
+
+ // Root never gets a failure node:
+ Root->FailureDepth = 0;
+ Root->FailureNode = NULL;
+
+ // For other nodes: Populate a Buffer with your string. Then find the
+ // longest proper suffix of Buffer that reaches a node.
+ for (AA = 0; AA < 26; AA++)
+ {
+ Child = Root->Children[AA];
+ if (Child)
+ {
+ Buffer[0] = AA;
+ SSTrieFailureNodeHelper(Root, Buffer, Child, 1);
+ }
+ }
+}
+
+void FreeSSTrieNode(SSTrieNode* Root, int Depth)
+{
+ SSTrieLeaf* Leaf;
+ SSTrieLeafNode* Node;
+ SSTrieLeafNode* Prev = NULL;
+ int AA;
+ //
+ if (!Root)
+ {
+ return;
+ }
+ if (Depth == 8)
+ {
+ Leaf = (SSTrieLeaf*)Root;
+ for (Node = Leaf->Head; Node; Node = Node->Next)
+ {
+ SafeFree(Prev);
+ Prev = Node;
+ }
+ SafeFree(Prev);
+ SafeFree(Leaf);
+ return;
+ }
+ for (AA = 0; AA < 26; AA++)
+ {
+ if (Root->Children[AA])
+ {
+ FreeSSTrieNode(Root->Children[AA], Depth + 1);
+ }
+ }
+ SafeFree(Root);
+}
+
+SSTrieNode* ConstructSSTrie(char** Sequences, int BlockSize)
+{
+ SSTrieNode* Root;
+ SSTrieNode* CurrentNode;
+ SSTrieNode* NextNode;
+ SSTrieLeaf* Leaf;
+ SSTrieLeafNode* Node;
+ int Len;
+ int StartPos;
+ int PeptidePos;
+ int SequencePos;
+ int AA;
+ int ProteinNumber;
+ char* Sequence;
+ //
+ Root = (SSTrieNode*)calloc(1, sizeof(SSTrieNode));
+ g_NextTrieLeafIndex = 0;
+ for (ProteinNumber = 0; ProteinNumber < BlockSize; ProteinNumber++)
+ {
+ Sequence = Sequences[ProteinNumber];
+ Len = strlen(Sequence);
+ for (StartPos = 0; StartPos <= Len-8; StartPos++)
+ {
+ SequencePos = StartPos;
+ PeptidePos = 0;
+ CurrentNode = Root;
+ // Add nodes for the first n-1 positions:
+ for (PeptidePos = 0; PeptidePos < 7; PeptidePos++)
+ {
+ AA = Sequence[StartPos + PeptidePos] - 'A';
+ if (AA < 0 || AA > 25)
+ {
+ break; // invalid character in protein sequence!
+ }
+ NextNode = CurrentNode->Children[AA];
+ if (!NextNode)
+ {
+ NextNode = (SSTrieNode*)calloc(1, sizeof(SSTrieNode));
+ CurrentNode->Children[AA] = NextNode;
+#ifdef SS_TRIE_DEBUG
+ NextNode->Depth = PeptidePos + 1;
+ strncpy(NextNode->Buffer, Sequence + StartPos, PeptidePos + 1);
+ NextNode->Buffer[PeptidePos + 1] = '\0';
+#endif //SS_TRIE_DEBUG
+ }
+ CurrentNode = NextNode;
+ }
+ // Add a leaf node for the nth position:
+ AA = Sequence[StartPos + PeptidePos] - 'A';
+ if (AA < 0 || AA > 25)
+ {
+ continue; // invalid character in protein sequence!
+ }
+ Leaf = CurrentNode->Children[AA];
+ if (!Leaf)
+ {
+ Leaf = (SSTrieLeaf*)calloc(1, sizeof(SSTrieLeaf));
+ //Leaf->ProteinPos = StartPos + PeptidePos;
+ Leaf->Index = g_NextTrieLeafIndex++;
+ CurrentNode->Children[AA] = Leaf;
+ Leaf->Head = (SSTrieLeafNode*)calloc(1, sizeof(SSTrieLeafNode));
+ Leaf->Head->ProteinNumber = ProteinNumber;
+ Leaf->Head->ProteinPos = StartPos;
+#ifdef SS_TRIE_DEBUG
+ strncpy(Leaf->Buffer, Sequence + StartPos, 8);
+ Leaf->Buffer[8] = '\0';
+#endif
+ }
+ else
+ {
+ for (Node = Leaf->Head; Node->Next; Node = Node->Next)
+ {
+ ;;
+ }
+ Node->Next = (SSTrieLeafNode*)calloc(1, sizeof(SSTrieLeafNode));
+ Node->Next->ProteinNumber = ProteinNumber;
+ Node->Next->ProteinPos = StartPos;
+ }
+ } // Loop on start positions
+ } // Loop on proteins
+ return Root;
+}
+
+int SSTrieCoverSequence(SSTrieNode* Root, char* MatchFlags, int Depth, int ProteinNumber)
+{
+ SSTrieLeaf* Leaf;
+ int AA;
+ int Sum = 0;
+ SSTrieNode* Child;
+ SSTrieLeafNode* Node;
+ int X;
+ //
+ if (Depth == 8)
+ {
+ Leaf = (SSTrieLeaf*)Root;
+ if (TrieNodeHitFlags[Leaf->Index])
+ {
+ for (Node = Leaf->Head; Node; Node = Node->Next)
+ {
+ if (Node->ProteinNumber == ProteinNumber)
+ {
+ for (X = 0; X < 8; X++)
+ {
+ if (!MatchFlags[Node->ProteinPos + X])
+ {
+ Sum += 1;
+ MatchFlags[Node->ProteinPos + X] = 1;
+ }
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ for (AA = 0; AA < 26; AA++)
+ {
+ Child = Root->Children[AA];
+ if (Child)
+ {
+ Sum += SSTrieCoverSequence(Child, MatchFlags, Depth + 1, ProteinNumber);
+ }
+ }
+ }
+ return Sum;
+}
+
+
+// Recursive main function for scanning through splice-tolerant database with a trie.
+void SSDatabaseScanHelper(ExonStruct* Exon, int Len, int Pos, SSTrieNode* Node, int Depth)
+{
+ SSTrieLeaf* Leaf;
+ int NextLen = 0;
+ int EdgeIndex;
+ SSTrieNode* NextNode;
+ SSTrieLeafNode* LeafNode;
+ ExonEdge* Edge;
+ //
+ if (!Node)
+ {
+ return;
+ }
+ if (Depth == 8)
+ {
+ Leaf = (SSTrieLeaf*)Node;
+ if (!TrieNodeHitFlags[Leaf->Index])
+ {
+ //Leaf->HitFlag = 1;
+ TrieNodeHitFlags[Leaf->Index] = 1;
+ for (LeafNode = Leaf->Head; LeafNode; LeafNode = LeafNode->Next)
+ {
+ g_TrieNodeMatches[LeafNode->ProteinNumber]++;
+ }
+ }
+ return;
+ }
+ Len = Exon->Length;
+ if (Pos >= Len)
+ {
+ for (EdgeIndex = 0; EdgeIndex < Exon->ForwardEdgeCount; EdgeIndex++)
+ {
+ Edge = Exon->ForwardEdges + EdgeIndex;
+ if (Edge->AA)
+ {
+ NextNode = Node->Children[Edge->AA - 'A'];
+ if (NextNode)
+ {
+ NextLen = Edge->Exon->Length;
+ SSDatabaseScanHelper(Edge->Exon, NextLen, 0, NextNode, Depth + 1);
+ }
+ }
+ else
+ {
+ NextLen = Edge->Exon->Length;
+ SSDatabaseScanHelper(Edge->Exon, NextLen, 0, Node, Depth);
+ }
+ }
+ }
+ else
+ {
+ NextNode = Node->Children[Exon->Sequence[Pos] - 'A'];
+ if (NextNode)
+ {
+ SSDatabaseScanHelper(Exon, Len, Pos + 1, NextNode, Depth + 1);
+ }
+ }
+}
+
+void DebugPrintSSTrie(SSTrieNode* Node, int Depth, char* Buffer)
+{
+ int AA;
+ SSTrieNode* Child;
+#ifdef SS_TRIE_DEBUG
+ SSTrieNode* FailureNode;
+#endif
+ SSTrieLeafNode* LeafNode;
+ SSTrieLeaf* Leaf;
+ //
+#ifdef SS_TRIE_DEBUG
+ for (AA = 0; AA < Depth; AA++)
+ {
+ printf(" ");
+ }
+ if (Depth == 8)
+ {
+ Leaf = (SSTrieLeaf*)Node;
+ FailureNode = Leaf->FailureNode;
+ if (FailureNode)
+ {
+ printf("Leaf '%s' failure '%s' (depth %d)\n", Leaf->Buffer, FailureNode->Buffer, FailureNode->Depth);
+ }
+ else
+ {
+ printf("Leaf '%s' (NO FAILURE NODE)\n", Leaf->Buffer);
+ }
+ }
+ else
+ {
+ FailureNode = Node->FailureNode;
+ if (FailureNode)
+ {
+ printf("Node '%s' d%d failure node '%s' (depth %d)\n", Node->Buffer, Node->Depth,
+ FailureNode->Buffer, FailureNode->Depth);
+ }
+ else
+ {
+ printf("Node '%s' d%d NO FAILURE NODE\n", Node->Buffer, Node->Depth);
+ }
+ for (AA = 0; AA < 26; AA++)
+ {
+ Child = Node->Children[AA];
+ if (Child)
+ {
+ Buffer[Depth] = 'A' + AA;
+ DebugPrintSSTrie(Child, Depth + 1, Buffer);
+ }
+ }
+ }
+ return;
+#endif
+ if (Depth < 8)
+ {
+ for (AA = 0; AA < 26; AA++)
+ {
+ Child = Node->Children[AA];
+ if (Child)
+ {
+ Buffer[Depth] = 'A' + AA;
+ DebugPrintSSTrie(Child, Depth + 1, Buffer);
+ }
+ }
+ return;
+ }
+ Buffer[8] = '\0';
+ Leaf = (SSTrieLeaf*)Node;
+ for (LeafNode = Leaf->Head; LeafNode; LeafNode = LeafNode->Next)
+ {
+ printf("%s at pos %d in record #%d\n", Buffer, LeafNode->ProteinPos, LeafNode->ProteinNumber);
+ }
+}
+
+void SSDatabaseScanExon(ExonStruct* Exon, SSTrieNode* Root, int StartDepth)
+{
+ SSTrieNode* CurrentNode;
+ SSTrieNode* Child;
+ int StartPos;
+ int Pos;
+ int Depth;
+ int AA;
+ SSTrieLeaf* Leaf;
+ SSTrieLeafNode* LeafNode;
+ int LinkIndex;
+ ExonEdge* Edge;
+ //
+ CurrentNode = Root;
+ Pos = 0;
+ Depth = StartDepth;
+ //printf("\n--->Start exon scan: Exon len %d, start depth is %d\n", Exon->Length, StartDepth);
+
+ // It's possible that we started at a leaf, if an edge-AA finished us off. If so,
+ // flag the match and return:
+ if (StartDepth == 8)
+ {
+ // We're at a leaf! Flag this match:
+ Leaf = (SSTrieLeaf*)CurrentNode;
+ if (!TrieNodeHitFlags[Leaf->Index])
+ {
+ TrieNodeHitFlags[Leaf->Index] = 1;
+ for (LeafNode = Leaf->Head; LeafNode; LeafNode = LeafNode->Next)
+ {
+ g_TrieNodeMatches[LeafNode->ProteinNumber]++;
+ //printf("At start of exon %d hit word %d\n", Exon->Index, LeafNode->ProteinPos);
+ }
+
+ }
+ return;
+ }
+ StartPos = 0;
+ while (1)
+ {
+ if (Pos >= Exon->Length)
+ {
+ // We've reached the end of the exon. Follow all outgoing edges:
+ for (LinkIndex = 0; LinkIndex < Exon->ForwardEdgeCount; LinkIndex++)
+ {
+ Edge = Exon->ForwardEdges + LinkIndex;
+ AA = Edge->AA;
+ if (AA)
+ {
+ Child = CurrentNode->Children[AA - 'A'];
+ if (Child)
+ {
+ SSDatabaseScanExon(Edge->Exon, Child, Depth + 1);
+ }
+ }
+ else
+ {
+ SSDatabaseScanExon(Edge->Exon, CurrentNode, Depth);
+ }
+ }
+ // If we were already partway down the trie when we started this exon, return:
+ if (StartDepth)
+ {
+ return;
+ }
+ else
+ {
+ // Advance to the next starting point, and jump back to the root:
+ StartPos++;
+ if (StartPos >= Exon->Length)
+ {
+ return; // done with all peptides that begin in this exon!
+ }
+ CurrentNode = Root;
+ Pos = StartPos;
+ Depth = 0;
+ }
+ }
+ //printf("%c Pos %d, current node '%s' depth %d=%d\n", Exon->Sequence[Pos], Pos, CurrentNode->Buffer, CurrentNode->Depth, Depth);
+ AA = Exon->Sequence[Pos] - 'A';
+ if (CurrentNode->Children[AA])
+ {
+ CurrentNode = CurrentNode->Children[AA];
+ Depth++;
+ if (Depth == 8)
+ {
+ // We're at a leaf! Flag this match:
+ Leaf = (SSTrieLeaf*)CurrentNode;
+ if (!TrieNodeHitFlags[Leaf->Index])
+ {
+ TrieNodeHitFlags[Leaf->Index] = 1;
+ for (LeafNode = Leaf->Head; LeafNode; LeafNode = LeafNode->Next)
+ {
+ g_TrieNodeMatches[LeafNode->ProteinNumber]++;
+ //printf("At position %d in exon %d hit word %d\n", Pos, Exon->Index, LeafNode->ProteinPos);
+ }
+
+ }
+ // If our starting depth is >0, then don't go to a failure node,
+ // just return. (We only use failure nodes when doing "normal"
+ // linear parsing along an exon)
+ if (StartDepth)
+ {
+ return;
+ }
+ //Depth = Leaf->FailureDepth;
+ //CurrentNode = Leaf->FailureNode;
+ StartPos++;
+ Depth = 0;
+ CurrentNode = Root;
+ Pos = StartPos;
+ continue;
+ }
+ Pos++;
+ }
+ else
+ {
+ // Our match has ended. Stop now, or use a failure node:
+ if (StartDepth)
+ {
+ return;
+ }
+ // If we are in the root now, then we should just advance by
+ // one character:
+ if (!Depth)
+ {
+ Pos++;
+ StartPos++;
+ }
+ else
+ {
+ //// We're not in the root, so we can use a failure node.
+ //// Pos stays where it is.
+ //Depth = CurrentNode->FailureDepth;
+ //CurrentNode = CurrentNode->FailureNode;
+ StartPos++;
+ CurrentNode = Root;
+ Pos = StartPos;
+ Depth = 0;
+ }
+ }
+ }
+}
+
+typedef struct SSMatchInfo
+{
+ int Coverage;
+ int RecordNumber;
+ int ChromosomeNumber;
+ int Strand;
+ int ApproximatePosition;
+ int CoverageStart;
+ int CoverageEnd;
+} SSMatchInfo;
+
+// From high to low coverage
+int CompareSSMatchInfo(const SSMatchInfo* a, const SSMatchInfo* b)
+{
+ if (a->Coverage > b->Coverage)
+ {
+ return -1;
+ }
+ if (a->Coverage < b->Coverage)
+ {
+ return 1;
+ }
+ return 0;
+}
+
+// Keep the top n matches for each protein, where n is this number:
+#define MATCHES_PER_PROTEIN 5
+#define LAST_MATCH_FOR_PROTEIN 4
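+// (Illustration of the layout assumed below: matches for protein p live in
+// AllMatchInfo[p * MATCHES_PER_PROTEIN] through AllMatchInfo[p * MATCHES_PER_PROTEIN + LAST_MATCH_FOR_PROTEIN],
+// kept sorted from best to worst coverage, so the LAST_MATCH_FOR_PROTEIN slot always holds the
+// weakest match still worth replacing.)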
+
+// Main function: Given an array of protein Sequences (with names in NameBuffer) and the splicedb file name,
+// scan through the genes in the splice-db to find words from the proteins
+void SSDatabaseScanProteins(int FirstRecordNumber, char** Sequences, char* NameBuffer, char* SpliceDBFileName,
+ int BlockSize, FILE* OutputFile)
+{
+ int ProteinNumber;
+ int MaxSequenceLength = 0;
+ SSTrieNode* Root;
+ SSMatchInfo* AllMatchInfo;
+ SSMatchInfo* MatchInfo;
+ int SequenceLengths[SS_BLOCK_SIZE];
+ int RecordNumber = 0;
+ char* MatchFlags;
+ FILE* SpliceDBFile;
+ GeneStruct* CurrentGene;
+ ExonStruct* Exon;
+ int EdgeIndex;
+ int AA;
+ int ExonIndex;
+ int Len;
+ int Coverage;
+ SSTrieNode* Node;
+ int GeneNumber;
+ ExonEdge* Edge;
+ int MatchIndex;
+ //
+ AllMatchInfo = (SSMatchInfo*)calloc(SS_BLOCK_SIZE * MATCHES_PER_PROTEIN, sizeof(SSMatchInfo));
+ for (ProteinNumber = 0; ProteinNumber < BlockSize; ProteinNumber++)
+ {
+ SequenceLengths[ProteinNumber] = strlen(Sequences[ProteinNumber]);
+ MaxSequenceLength = max(MaxSequenceLength, SequenceLengths[ProteinNumber]);
+ }
+ MatchFlags = (char*)calloc(MaxSequenceLength, sizeof(char));
+ Root = ConstructSSTrie(Sequences, BlockSize);
+ SetSSTrieFailureNodes(Root);
+ //DebugPrintSSTrie(Root, 0, DebugBuffer);
+ SpliceDBFile = fopen(SpliceDBFileName, "rb");
+ if (!SpliceDBFile)
+ {
+ REPORT_ERROR_S(8, SpliceDBFileName);
+ return;
+ }
+ memset(TrieNodeHitFlags, 0, sizeof(int) * MAX_TRIE_NODE_COUNT);
+ GeneNumber = 0;
+ while (1)
+ {
+ GeneNumber++;
+ if (GeneNumber%100 == 0)
+ {
+ printf("%d ", GeneNumber);
+ }
+ //ResetSSTrieFlags(Root, 0);
+ memset(TrieNodeHitFlags, 0, sizeof(int) * g_NextTrieLeafIndex);
+ memset(g_TrieNodeMatches, 0, sizeof(int) * SS_BLOCK_SIZE);
+ CurrentGene = LoadGene(SpliceDBFile);
+ if (!CurrentGene)
+ {
+ break;
+ }
+ // Iterate over exons:
+ for (ExonIndex = 0; ExonIndex < CurrentGene->ExonCount; ExonIndex++)
+ {
+ Exon = CurrentGene->Exons + ExonIndex;
+ Len = Exon->Length;
+
+ // Try starting a match with the incoming edge:
+ for (EdgeIndex = 0; EdgeIndex < Exon->BackEdgeCount; EdgeIndex++)
+ {
+ Edge = Exon->BackwardEdges + EdgeIndex;
+ AA = Edge->AA - 'A';
+ if (AA >= 0 && AA < 26)
+ {
+ Node = Root->Children[AA];
+ //SSDatabaseScanHelper(Exon, Len, 0, Node, 1);
+ if (Node)
+ {
+ SSDatabaseScanExon(Exon, Node, 1);
+ }
+ }
+ }
+ SSDatabaseScanExon(Exon, Root, 0);
+ //for (Pos = 0; Pos < Len; Pos++)
+ //{
+ // SSDatabaseScanHelper(Exon, Len, Pos, Root, 0);
+ //}
+ }
+ // Rate the quality of the match, saving it if it's good:
+ for (ProteinNumber = 0; ProteinNumber < BlockSize; ProteinNumber++)
+ {
+ Coverage = g_TrieNodeMatches[ProteinNumber];
+ if (Coverage > SequenceLengths[ProteinNumber] * 0.1)
+ {
+ //printf("\nProtein #%d: %s\n", ProteinNumber, NameBuffer + (ProteinNumber * PROTEIN_NAME_BLOCK));
+ //DebugPrintGene(CurrentGene);
+ memset(MatchFlags, 0, sizeof(char) * MaxSequenceLength);
+ Coverage = SSTrieCoverSequence(Root, MatchFlags, 0, ProteinNumber);
+ //for (AA = 0; AA < SequenceLengths[ProteinNumber]; AA++)
+ //{
+ // printf("%d\t%c\t%d\t\n", AA, Sequences[ProteinNumber][AA], MatchFlags[AA]);
+ //}
+ // If this coverage is better than the lowest-saved-coverage for the protein,
+ // then replace the lowest-saved-coverage and sort the list:
+ if (Coverage > AllMatchInfo[ProteinNumber * MATCHES_PER_PROTEIN + LAST_MATCH_FOR_PROTEIN].Coverage)
+ {
+ MatchInfo = AllMatchInfo + ProteinNumber * MATCHES_PER_PROTEIN + LAST_MATCH_FOR_PROTEIN;
+ MatchInfo->Coverage = Coverage;
+ MatchInfo->RecordNumber = RecordNumber;
+ MatchInfo->ChromosomeNumber = CurrentGene->ChromosomeNumber;
+ MatchInfo->Strand = CurrentGene->ForwardFlag;
+ MatchInfo->ApproximatePosition = CurrentGene->Exons[0].Start;
+
+ for (AA = 0; AA < SequenceLengths[ProteinNumber]; AA++)
+ {
+ if (MatchFlags[AA])
+ {
+ MatchInfo->CoverageStart = AA;
+ //BestStartPos[ProteinNumber] = AA;
+ break;
+ }
+ }
+ for (AA = SequenceLengths[ProteinNumber] - 1; AA >= 0; AA--)
+ {
+ if (MatchFlags[AA])
+ {
+ MatchInfo->CoverageEnd = AA;
+ //BestEndPos[ProteinNumber] = AA;
+ break;
+ }
+ }
+ qsort(AllMatchInfo + ProteinNumber * MATCHES_PER_PROTEIN, MATCHES_PER_PROTEIN, sizeof(SSMatchInfo), (QSortCompare)CompareSSMatchInfo);
+ }
+ }
+ }
+ RecordNumber += 1;
+ FreeGene(CurrentGene);
+ // If we've got 95% of the protein, then stop now - we probably won't
+ // get any more!
+ //if (BestCoverage > 0.95*SequenceLength)
+ //{
+ // break;
+ //}
+ }
+ // Print the match:
+ for (ProteinNumber = 0; ProteinNumber < BlockSize; ProteinNumber++)
+ {
+ fprintf(OutputFile, "%d\t", FirstRecordNumber + ProteinNumber);
+ fprintf(OutputFile, "%s\t", NameBuffer + (ProteinNumber * PROTEIN_NAME_BLOCK));
+ fprintf(OutputFile, "%d\t", SequenceLengths[ProteinNumber]);
+ for (MatchIndex = 0; MatchIndex < MATCHES_PER_PROTEIN; MatchIndex++)
+ {
+ MatchInfo = AllMatchInfo + ProteinNumber * MATCHES_PER_PROTEIN + MatchIndex;
+ if (MatchInfo->Coverage)
+ {
+ fprintf(OutputFile, "%d\t", MatchInfo->ChromosomeNumber);
+ fprintf(OutputFile, "%d\t", MatchInfo->Strand);
+ fprintf(OutputFile, "%d\t", MatchInfo->ApproximatePosition);
+ fprintf(OutputFile, "%d\t", MatchInfo->Coverage);
+ }
+ }
+ fprintf(OutputFile, "\n");
+ }
+ FreeSSTrieNode(Root, 0);
+
+ // Cleanup:
+ SafeFree(MatchFlags);
+ SafeFree(AllMatchInfo);
+}
+
+typedef struct SSHashNode
+{
+ char TrueSequence[8];
+ int ProteinIndex;
+ int ProteinPos;
+ int MatchFlag;
+ struct SSHashNode* Next;
+} SSHashNode;
+
+#define SS_HASH_MAX 5000000
+// Big hash:
+SSHashNode* SSHash[SS_HASH_MAX];
+
+void ClearSSHash()
+{
+ int HashIndex;
+ SSHashNode* Node;
+ SSHashNode* Prev;
+ //
+ for (HashIndex = 0; HashIndex < SS_HASH_MAX; HashIndex++)
+ {
+ Prev = NULL;
+ Node = SSHash[HashIndex];
+ while (Node)
+ {
+ SafeFree(Prev);
+ Prev = Node;
+ Node = Node->Next;
+ }
+ SafeFree(Prev);
+ SSHash[HashIndex] = NULL;
+ }
+}
+
+#define HASH_SEQUENCE(Buffer)\
+HashValue = 0;\
+for (X = 0; X < 8; X++)\
+{\
+ HashValue += Buffer[X] * X * X;\
+ HashValue %= SS_HASH_MAX;\
+}\
+
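+// Illustrative use of HASH_SEQUENCE (a sketch only; the hash-based scan below is still a stub).
+// The macro expects the caller to declare the locals 'HashValue' and 'X', e.g.:
+// int HashValue, X;
+// char* Word = "PEPTIDES"; /* any 8 residues */
+// HASH_SEQUENCE(Word); /* HashValue is now a bucket index into SSHash[] */
+// SSHashNode* Node = SSHash[HashValue]; /* walk Node->Next to resolve collisions */
+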
+// Hashing *may* be faster than trie; it hasn't been implemented yet.
+void PopulateSSHash(char** SequenceBuffer, int BlockSize)
+{
+ int ProteinIndex;
+ int Pos;
+ int Len;
+ //
+ for (ProteinIndex = 0; ProteinIndex < BlockSize; ProteinIndex++)
+ {
+ Len = strlen(SequenceBuffer[ProteinIndex]);
+ for (Pos = 0; Pos < Len - 7; Pos++)
+ {
+ }
+ }
+
+}
+
+// For more rapid scanning of proteins...let's use a hash instead of a trie.
+void SSQDatabaseScanProteins(char** SequenceBuffer, char* NameBuffer, char* SpliceDBFileName, int BlockSize)
+{
+ ClearSSHash();
+ PopulateSSHash(SequenceBuffer, BlockSize);
+}
+
+// Main method:
+void SSDatabaseScan(char* TrieFileName, char* IndexFileName, char* SpliceDBFileName,
+ int FirstRecord, int LastRecord)
+{
+ //GeneStruct* CurrentGene;
+ //GeneStruct* LoadGene(FILE* File)
+ int DummyInt;
+ int LastFilePos = -1;
+ int FilePos;
+ FILE* TrieFile;
+ FILE* IndexFile;
+ char* SequenceBuffer[SS_BLOCK_SIZE];
+ char NameBuffer[PROTEIN_NAME_BLOCK * SS_BLOCK_SIZE];
+ int BytesRead;
+ int RecordLength;
+ int BlockIndex = 0;
+ FILE* OutputFile;
+ int RecordNumber;
+ int BlockFirstRecordNumber = 0;
+ //
+ TrieFile = fopen(TrieFileName, "rb");
+ if (!TrieFile)
+ {
+ REPORT_ERROR_S(8, TrieFileName);
+ return;
+ }
+ IndexFile = fopen(IndexFileName, "rb");
+ if (!IndexFile)
+ {
+ REPORT_ERROR_S(8, IndexFileName);
+ return;
+ }
+ OutputFile = fopen("SSDatabaseScan.txt", "wb");
+ if (!OutputFile)
+ {
+ printf("** Error: Failed to open SSDatabaseScan.txt\n");
+ return;
+ }
+ // Header:
+ fprintf(OutputFile, "RecordNumber\tProtein\tLength\tChromosome\tForwardFlag\tApproxPos\tCoverage\t\n");
+ // Read protein records from the trie database. Once you accumulate a block
+ // of them in the trie, launch a scan through the exon graph with SSDatabaseScanProteins.
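+ // (Layout inferred from the ReadBinary calls below, not from a format spec: each .index record
+ // appears to be three 32-bit ints -- two unused here, then the .trie file offset of the protein --
+ // followed by an 80-byte protein name.)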
+ RecordNumber = 0;
+ while (1)
+ {
+ BytesRead = ReadBinary(&DummyInt, sizeof(int), 1, IndexFile);
+ if (!BytesRead)
+ {
+ // End of file. Scan our last block, if we have anything in the block:
+ if (LastFilePos >= 0 && BlockIndex)
+ {
+ fseek(TrieFile, FilePos, SEEK_SET);
+ SequenceBuffer[BlockIndex - 1] = (char*)calloc(30000, sizeof(char));
+ ReadBinary(SequenceBuffer[BlockIndex - 1], sizeof(char), 30000, TrieFile);
+ SSDatabaseScanProteins(BlockFirstRecordNumber, SequenceBuffer, NameBuffer, SpliceDBFileName, BlockIndex, OutputFile);
+ }
+ break;
+ }
+ BytesRead = ReadBinary(&DummyInt, sizeof(int), 1, IndexFile);
+ BytesRead = ReadBinary(&FilePos, sizeof(int), 1, IndexFile);
+ //
+ if (LastFilePos >= 0 && RecordNumber >= FirstRecord)
+ {
+ RecordLength = FilePos - LastFilePos - 1;
+ SequenceBuffer[BlockIndex - 1] = (char*)calloc(RecordLength + 1, sizeof(char));
+ fseek(TrieFile, LastFilePos, SEEK_SET);
+ ReadBinary(SequenceBuffer[BlockIndex - 1], sizeof(char), RecordLength, TrieFile);
+ if (BlockIndex == SS_BLOCK_SIZE || (LastRecord >= 0 && RecordNumber >= LastRecord))
+ {
+ SSDatabaseScanProteins(BlockFirstRecordNumber, SequenceBuffer, NameBuffer, SpliceDBFileName, BlockIndex, OutputFile);
+ // Free this block's sequence buffers before starting the next block:
+ while (BlockIndex > 0)
+ {
+ BlockIndex--;
+ SafeFree(SequenceBuffer[BlockIndex]);
+ }
+ BlockIndex = 0;
+ // If we hit the last record, then stop now.
+ if (LastRecord >= 0 && RecordNumber >= LastRecord)
+ {
+ break;
+ }
+ }
+ }
+ LastFilePos = FilePos;
+ ReadBinary(NameBuffer + BlockIndex*PROTEIN_NAME_BLOCK, sizeof(char), 80, IndexFile);
+ NameBuffer[BlockIndex*PROTEIN_NAME_BLOCK + 80] = '\0';
+ if (RecordNumber >= FirstRecord)
+ {
+ if (BlockIndex == 0)
+ {
+ BlockFirstRecordNumber = RecordNumber;
+ }
+ BlockIndex++;
+ }
+ RecordNumber++;
+ }
+ fclose(IndexFile);
+ fclose(TrieFile);
+}
diff --git a/SpliceScan.h b/SpliceScan.h
new file mode 100644
index 0000000..63346ae
--- /dev/null
+++ b/SpliceScan.h
@@ -0,0 +1,39 @@
+//Title: SpliceScan.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef SPLICE_SCAN_H
+#define SPLICE_SCAN_H
+
+void SSDatabaseScan(char* TrieFileName, char* IndexFileName, char* SpliceDBFileName,
+ int FirstRecord, int LastRecord);
+
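+// Typical invocation (the file names here are illustrative placeholders): scan every
+// record of a trie-format database against the splice graph, writing results to
+// SSDatabaseScan.txt in the working directory:
+//
+//   SSDatabaseScan("Database.trie", "Database.index", "SpliceDB.dat", 0, -1);
+//
+// FirstRecord and LastRecord bound the range of database records scanned; passing a
+// negative LastRecord scans to the end of the database.
+//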
+#endif // SPLICE_SCAN_H
diff --git a/Spliced.c b/Spliced.c
new file mode 100644
index 0000000..c061dfe
--- /dev/null
+++ b/Spliced.c
@@ -0,0 +1,2113 @@
+//Title: Spliced.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include "Utils.h"
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include "Trie.h"
+#include "Spliced.h"
+#include "Inspect.h"
+#include "ExonGraphAlign.h"
+#include "Errors.h"
+// The left and right extensions of a tag can have at most this many successes.
+#define MAX_SIDE_EXTENSIONS 128
+
+// For keeping a linked list of genes in memory. In practice, we generally DON'T use this list;
+// instead we load and search one gene at a time. (Genomes are big!)
+GeneStruct* FirstGene;
+GeneStruct* LastGene;
+
+ExonStruct** g_TagExonArray;
+int g_TagExonArrayPos;
+ExonEdge** g_TagSpliceArray;
+int g_TagSpliceArrayPos;
+// Tags often contain residues K/Q and I/L. We wish to report the true residue found
+// in the exon, undoing Q->K and I->L substitutions as necessary. So, we log the
+// matched tag chars in g_TagBuffer.
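+// (For example, a tag read as 'NLK' may actually match exon residues 'NIQ'; in that case
+// g_TagBuffer records 'NIQ', the residues present in the exon graph.)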
+char* g_TagBuffer;
+char* g_TagBufferSpliced;
+int g_TagBufferPos;
+int g_TagBufferPosSpliced;
+
+static char* MatchedBases;
+char* ExtensionBufferLeft;
+char* ExtensionBufferRight;
+int* ExtensionLeftDecorations;
+int* ExtensionRightDecorations;
+int* ExtensionGenomicStart;
+int* ExtensionGenomicEnd;
+MSSpectrum** ExtensionSpectra;
+
+static int MH_MinMatchMass;
+static int MH_MaxMatchMass;
+static char* MH_MatchBuffer;
+static char* MH_MatchBufferSpliced;
+static int* MH_MatchDecoration;
+static int MH_MatchCount;
+static char* MH_Buffer;
+static char* MH_BufferSpliced;
+static ExonStruct** MH_MatchExons;
+static ExonEdge** MH_MatchEdges;
+static int MH_MatchExonPos;
+static int MH_MatchEdgePos;
+//static int MH_MatchSplicePos;
+
+ExonEdge* GetReciprocalExonEdge(ExonEdge* Edge, int ForwardFlag);
+
+// Free one Gene (and its exons)
+void FreeGene(GeneStruct* Gene)
+{
+ int Index;
+ //
+ if (!Gene)
+ {
+ return;
+ }
+ for (Index = 0; Index < Gene->ExonCount; Index++)
+ {
+ SafeFree(Gene->Exons[Index].ForwardEdges);
+ SafeFree(Gene->Exons[Index].BackwardEdges);
+ SafeFree(Gene->Exons[Index].Sequence);
+ }
+ SafeFree(Gene->Exons);
+ SafeFree(Gene);
+}
+
+// Free the global list of genes. (Not used in practice, since we load one at a time)
+void FreeGenes()
+{
+ GeneStruct* Gene;
+ GeneStruct* Prev = NULL;
+ //
+ for (Gene = FirstGene; Gene; Gene = Gene->Next)
+ {
+ if (Prev)
+ {
+ FreeGene(Prev);
+ }
+ Prev = Gene;
+ }
+ if (Prev)
+ {
+ FreeGene(Prev);
+ }
+ FirstGene = NULL;
+ LastGene = NULL;
+}
+
+// For debugging purposes: Print out a list of exons (with partial sequences) and edges.
+// (Mostly for verifying database generation worked)
+void DebugPrintGene(GeneStruct* Gene)
+{
+ int ExonIndex;
+ ExonStruct* Exon;
+ int EdgeIndex;
+ //
+ printf("*Gene %s (%s) has %d exons\n", Gene->Name, Gene->SprotName, Gene->ExonCount);
+ for (ExonIndex = 0; ExonIndex < Gene->ExonCount; ExonIndex++)
+ {
+ Exon = Gene->Exons + ExonIndex;
+ printf("Exon %d from %d-%d cov %d: \n", Exon->Index, Exon->Start, Exon->End, Exon->Occurrences);
+ if (Exon->Sequence)
+ {
+ printf("%s", Exon->Sequence);
+ }
+ else
+ {
+ printf("<none>");
+ }
+ printf("\n");
+ //printf(" Exon from %d-%d coverage %d sequence %s...\n", Exon->Start, Exon->End, Exon->Occurrences, Buffer);
+ for (EdgeIndex = 0; EdgeIndex < Exon->ForwardEdgeCount; EdgeIndex++)
+ {
+ printf(" >> (%d) '%c' to exon #%d %d-%d\n", Exon->ForwardEdges[EdgeIndex].Power,
+ Exon->ForwardEdges[EdgeIndex].AA, Exon->ForwardEdges[EdgeIndex].Exon->Index,
+ Exon->ForwardEdges[EdgeIndex].Exon->Start, Exon->ForwardEdges[EdgeIndex].Exon->End);
+ }
+ for (EdgeIndex = 0; EdgeIndex < Exon->BackEdgeCount; EdgeIndex++)
+ {
+ printf(" << (%d) '%c' to exon #%d %d-%d\n", Exon->BackwardEdges[EdgeIndex].Power,
+ Exon->BackwardEdges[EdgeIndex].AA, Exon->BackwardEdges[EdgeIndex].Exon->Index,
+ Exon->BackwardEdges[EdgeIndex].Exon->Start, Exon->BackwardEdges[EdgeIndex].Exon->End);
+ }
+ }
+}
+
+// For debugging: Print *all* our genes, and their exons and edges
+void DebugPrintGenes()
+{
+ GeneStruct* Gene;
+ //
+ printf("Genes:\n");
+ for (Gene = FirstGene; Gene; Gene = Gene->Next)
+ {
+ printf("\n");
+ DebugPrintGene(Gene);
+ }
+}
+
+// Load one gene from the (binary) gene file. Does some basic error checking, in case of
+// obsolete or broken file formats.
+GeneStruct* LoadGene(FILE* File)
+{
+ char Buffer[1024];
+ int Bytes;
+ GeneStruct* Gene;
+ int ExonIndex;
+ int OtherExonIndex;
+ ExonStruct* Exon;
+ ExonStruct* OtherExon;
+ int Length;
+ int EdgeIndex;
+ char AA;
+ int LinkPower;
+ //
+ Bytes = ReadBinary(Buffer, sizeof(char), GENE_NAME_LENGTH, File);
+ if (!Bytes)
+ {
+ return NULL; // eof
+ }
+ Gene = (GeneStruct*)calloc(1, sizeof(GeneStruct));
+ strncpy(Gene->Name, Buffer, GENE_NAME_LENGTH);
+ ReadBinary(Gene->SprotName, sizeof(char), GENE_NAME_LENGTH, File);
+ ReadBinary(&Gene->ChromosomeNumber, sizeof(int), 1, File);
+ if (!Gene->ChromosomeNumber)
+ {
+ printf("** Warning: No chromosome number for gene '%s'\n", Gene->Name);
+ }
+ ReadBinary(&Gene->ForwardFlag, sizeof(char), 1, File);
+ ReadBinary(&Gene->ExonCount, sizeof(int), 1, File);
+ if (Gene->ExonCount < 1 || Gene->ExonCount > MAX_GENE_EXONS)
+ {
+ printf("** Warning: suspicious exon-count %d encountered in LoadGene(). File position is %ld.\n", Gene->ExonCount, ftell(File));
+ return NULL;
+ }
+ //fread(&GIIDBlock, sizeof(int), 10, File);
+ Gene->Exons = (ExonStruct*)calloc(Gene->ExonCount, sizeof(ExonStruct));
+
+ // Read the gene's exons:
+ for (ExonIndex = 0; ExonIndex < Gene->ExonCount; ExonIndex++)
+ {
+ //printf("Filepos %d, now read exon %d of %d\n", ftell(File), ExonIndex, Gene->ExonCount);
+ Exon = Gene->Exons + ExonIndex;
+ Exon->Gene = Gene;
+ Bytes = ReadBinary(&Exon->Start, sizeof(int), 1, File);
+ if (!Bytes)
+ {
+ printf("** Error: EOF encountered while reading exon %d of gene '%s'\n", ExonIndex, Gene->Name);
+ break;
+ }
+ Exon->Index = ExonIndex;
+ ReadBinary(&Exon->End, sizeof(int), 1, File);
+ ReadBinary(&Length, sizeof(int), 1, File);
+ if (Length < 0 || Length > 10000)
+ {
+ printf("** Error: Bogus sequence length %d encountered while reading exon %d of gene '%s'\n", Length, ExonIndex, Gene->Name);
+ break;
+ }
+ ReadBinary(&Exon->Occurrences, sizeof(int), 1, File);
+ Exon->Length = Length;
+ if (Length)
+ {
+ Exon->Sequence = (char*)calloc(Length + 1, sizeof(char));
+ ReadBinary(Exon->Sequence, sizeof(char), Length, File);
+ }
+ else
+ {
+ Exon->Sequence = NULL;
+ }
+ //printf("%d '%s'\n", ExonIndex, Exon->Sequence); //
+ ReadBinary(&Exon->Prefix, sizeof(char), 2, File);
+ ReadBinary(&Exon->Suffix, sizeof(char), 2, File);
+ ReadBinary(&Exon->BackEdgeCount, sizeof(int), 1, File);
+ if (Exon->BackEdgeCount < 0 || Exon->BackEdgeCount > 500)
+ {
+ printf("** zomg broken back edge count in LoadGene() exon %d gene '%s'\n", ExonIndex, Gene->Name);
+ }
+ ReadBinary(&Exon->ForwardEdgeCount, sizeof(int), 1, File);
+ if (Exon->ForwardEdgeCount < 0 || Exon->ForwardEdgeCount > 500)
+ {
+ printf("** zomg broken forward edge count in LoadGene() exon %d gene '%s'\n", ExonIndex, Gene->Name);
+ }
+
+ if (Exon->ForwardEdgeCount)
+ {
+ Exon->ForwardEdges = (ExonEdge*)calloc(Exon->ForwardEdgeCount, sizeof(ExonEdge));
+ }
+ if (Exon->BackEdgeCount)
+ {
+ Exon->BackwardEdges = (ExonEdge*)calloc(Exon->BackEdgeCount, sizeof(ExonEdge));
+ }
+ // Read all the edges for this exon. (Read all the back-edges now; the forward-edges are filled in afterward by SetExonForwardEdges.)
+ for (EdgeIndex = 0; EdgeIndex < Exon->BackEdgeCount; EdgeIndex++)
+ {
+ Bytes = ReadBinary(&OtherExonIndex, sizeof(int), 1, File);
+ if (!Bytes)
+ {
+ printf("** Error: EOF encountered while reading exon %d edge %d of gene '%s'\n", ExonIndex, EdgeIndex, Gene->Name);
+ break;
+ }
+ ReadBinary(&LinkPower, sizeof(int), 1, File);
+ ReadBinary(&AA, sizeof(char), 1, File);
+ if (OtherExonIndex < 0 || OtherExonIndex >= Gene->ExonCount)
+ {
+ printf("** Error: Illegal exon back-link %d encountered for exon %d edge %d gene '%s'\n", OtherExonIndex, ExonIndex, EdgeIndex, Gene->Name);
+ }
+ else
+ {
+ OtherExon = Gene->Exons + OtherExonIndex;
+ Exon->BackwardEdges[EdgeIndex].Exon = OtherExon;
+ Exon->BackwardEdges[EdgeIndex].AA = AA;
+ Exon->BackwardEdges[EdgeIndex].Power = LinkPower;
+ Exon->BackwardEdges[EdgeIndex].Source = Exon;
+ }
+ }
+ } // exon loop
+ // We set all the back-links while we're reading the exons in. Now, let's go through
+ // and fix all the forward-links.
+ SetExonForwardEdges(Gene);
+ return Gene;
+}
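+// For reference, the binary gene record consumed by LoadGene() is laid out as follows:
+//   gene name (GENE_NAME_LENGTH chars), SwissProt name (GENE_NAME_LENGTH chars),
+//   chromosome number (int), forward flag (char), exon count (int);
+// then, for each exon:
+//   start (int), end (int), sequence length (int), occurrences (int),
+//   sequence (length chars, omitted when the length is zero), prefix (2 chars),
+//   suffix (2 chars), back-edge count (int), forward-edge count (int);
+// and, for each backward edge of that exon:
+//   linked exon index (int), link power (int), edge amino acid (char).
+// Forward edges are not stored on disk; they are rebuilt by SetExonForwardEdges().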
+
+// INPUT: A gene where the backward edges are populated, and the exon forward edges are allocated but *not* populated.
+// Result: Forward edges are populated.
+void SetExonForwardEdges(GeneStruct* Gene)
+{
+ int ExonIndex;
+ ExonStruct* Exon;
+ ExonStruct* OtherExon;
+ int EdgeIndex;
+ int OtherEdgeIndex;
+ int ForwardEdgeSet;
+ char AA;
+ int LinkPower;
+ //
+ for (ExonIndex = 0; ExonIndex < Gene->ExonCount; ExonIndex++)
+ {
+ Exon = Gene->Exons + ExonIndex;
+ for (EdgeIndex = 0; EdgeIndex < Exon->BackEdgeCount; EdgeIndex++)
+ {
+ // The first empty slot in the OtherExon forward arrays will now be set:
+ OtherExon = Exon->BackwardEdges[EdgeIndex].Exon;
+ AA = Exon->BackwardEdges[EdgeIndex].AA;
+ LinkPower = Exon->BackwardEdges[EdgeIndex].Power;
+ ForwardEdgeSet = 0;
+ for (OtherEdgeIndex = 0; OtherEdgeIndex < OtherExon->ForwardEdgeCount; OtherEdgeIndex++)
+ {
+ if (!OtherExon->ForwardEdges[OtherEdgeIndex].Exon)
+ {
+ OtherExon->ForwardEdges[OtherEdgeIndex].Exon = Exon;
+ OtherExon->ForwardEdges[OtherEdgeIndex].AA = AA;
+ OtherExon->ForwardEdges[OtherEdgeIndex].Power = LinkPower;
+ OtherExon->ForwardEdges[OtherEdgeIndex].Source = OtherExon;
+ ForwardEdgeSet = 1;
+ break;
+ }
+ }
+ if (!ForwardEdgeSet)
+ {
+ REPORT_ERROR_IIS(26, OtherExon->Index, Exon->Index, Gene->Name);
+ }
+ }
+ }
+}
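+// Example: if exon 3 was read with a backward edge {Exon: 1, AA: 'G', Power: 5}, the pass
+// above fills the first empty forward-edge slot of exon 1 with {Exon: 3, AA: 'G', Power: 5},
+// so the two exons end up referencing each other through reciprocal edges.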
+
+// Load genes from a binary file, built by running inspect with splicedb arguments.
+// (In practice, we don't call this - we just load ONE gene at a time!)
+void LoadGenes(char* FileName)
+{
+ FILE* File;
+ GeneStruct* Gene;
+ //
+ File = fopen(FileName, "rb");
+ if (!File)
+ {
+ printf("** Error: Unable to open gene file '%s'\n", FileName);
+ return;
+ }
+ while (1)
+ {
+ Gene = LoadGene(File);
+ if (!Gene)
+ {
+ break;
+ }
+
+ // Insert new gene into list:
+ if (LastGene)
+ {
+ LastGene->Next = Gene;
+ Gene->Prev = LastGene;
+ }
+ else
+ {
+ FirstGene = Gene;
+ }
+ LastGene = Gene;
+ }
+ fclose(File);
+}
+
+// Static structures used in splice-tolerant search. Tag extension builds up an array
+// of extension-matches for the left and for the right, then tries each combination of
+// a legal right-extension and a legal left-extension.
+char* SLeftMatchBuffer = NULL; // The AAs of the extension
+char* SLeftMatchBufferSpliced = NULL; // The AAs of the extension, with splice boundaries
+int* SLeftMatchDecoration = NULL; // The decoration to be attached over the extension
+int* SLeftGenomicPosition = NULL;
+int* SRightGenomicPosition = NULL;
+char* SLeftPrefixes = NULL; // The AA just *beyond* the extension
+char* SRightMatchBuffer = NULL;
+char* SRightMatchBufferSpliced = NULL;
+int* SRightMatchDecoration = NULL;
+char* SRightSuffixes = NULL;
+ExonStruct** SLeftExon = NULL; // The exons reached by prefix extension. SLeftExon[MatchNumber*16 + ExonIndex]
+ExonEdge** SLeftEdge = NULL; // The splice boundaries crossed by prefix extension.
+//int* SLeftSpliceScore = NULL; // The scores of splice boundaries used in prefix extension.
+int* SLeftExonCount = NULL;
+int* SLeftSpliceCount = NULL;
+ExonStruct** SRightExon = NULL; // The exons reached by suffix extension. SLeftExon[MatchNumber*16 + ExonIndex]
+ExonEdge** SRightEdge = NULL; // The splice boundaries crossed by suffix extension.
+//int* SRightSpliceScore = NULL;
+int* SRightExonCount = NULL;
+int* SRightSpliceCount = NULL;
+
+void AllocSpliceStructures()
+{
+ if (SLeftMatchBuffer)
+ {
+ return; // It seems we've already allocated them.
+ }
+
+ // The Spliced buffer is made extra-long so that we can afford to add two symbol chars
+ // per amino acid.
+ SLeftMatchBuffer = (char*)calloc(MAX_EXTENSION_LENGTH * MAX_SIDE_EXTENSIONS, sizeof(char));
+ SLeftMatchBufferSpliced = (char*)calloc(MAX_SEXTENSION_LENGTH * MAX_SIDE_EXTENSIONS, sizeof(char));
+ SLeftMatchDecoration = (int*)calloc(MAX_SIDE_EXTENSIONS + 1, sizeof(int));
+ SLeftGenomicPosition = (int*)calloc(MAX_SIDE_EXTENSIONS + 1, sizeof(int));
+ SLeftPrefixes = (char*)calloc(MAX_SIDE_EXTENSIONS, sizeof(char));
+ SRightMatchBuffer = (char*)calloc(MAX_EXTENSION_LENGTH * MAX_SIDE_EXTENSIONS, sizeof(char));
+ SRightMatchBufferSpliced = (char*)calloc(MAX_SEXTENSION_LENGTH * MAX_SIDE_EXTENSIONS, sizeof(char));
+ SRightMatchDecoration = (int*)calloc(MAX_SIDE_EXTENSIONS + 1, sizeof(int));
+ SRightGenomicPosition = (int*)calloc(MAX_SIDE_EXTENSIONS + 1, sizeof(int));
+ SRightSuffixes = (char*)calloc(MAX_SIDE_EXTENSIONS, sizeof(char));
+
+ SLeftExon = (ExonStruct**)calloc(MAX_SIDE_EXTENSIONS * MAX_EXTENSION_EXONS, sizeof(ExonStruct*));
+ SLeftEdge = (ExonEdge**)calloc(MAX_SIDE_EXTENSIONS * MAX_EXTENSION_EXONS, sizeof(ExonEdge*));
+ //SLeftSpliceScore = (int*)calloc(MAX_SIDE_EXTENSIONS * MAX_EXTENSION_EXONS, sizeof(int));
+ SLeftExonCount = (int*)calloc(MAX_SIDE_EXTENSIONS, sizeof(int));
+ SLeftSpliceCount = (int*)calloc(MAX_SIDE_EXTENSIONS, sizeof(int));
+
+ SRightExon = (ExonStruct**)calloc(MAX_SIDE_EXTENSIONS * MAX_EXTENSION_EXONS, sizeof(ExonStruct*));
+ SRightEdge = (ExonEdge**)calloc(MAX_SIDE_EXTENSIONS * MAX_EXTENSION_EXONS, sizeof(ExonEdge*));
+ //SRightSpliceScore = (int*)calloc(sizeof(int), MAX_SIDE_EXTENSIONS * MAX_EXTENSION_EXONS);
+ SRightExonCount = (int*)calloc(MAX_SIDE_EXTENSIONS, sizeof(int));
+ SRightSpliceCount = (int*)calloc(MAX_SIDE_EXTENSIONS, sizeof(int));
+
+ g_TagExonArray = (ExonStruct**)calloc(16, sizeof(ExonStruct*));
+ g_TagSpliceArray = (ExonEdge**)calloc(16, sizeof(ExonEdge*));
+ g_TagBuffer = (char*)calloc(10, sizeof(int));
+ g_TagBufferSpliced = (char*)calloc(MAX_EXTENSION_EXONS, sizeof(int));
+
+ MH_Buffer = (char*)calloc(128, sizeof(char));
+ MH_BufferSpliced = (char*)calloc(128, sizeof(char));
+ MH_MatchExons = (ExonStruct**)calloc(MAX_EXTENSION_EXONS, sizeof(ExonStruct*));
+ MH_MatchEdges = (ExonEdge**)calloc(MAX_EXTENSION_EXONS, sizeof(ExonEdge*));
+ //MH_MatchSplices = (int*)calloc(sizeof(int), MAX_EXTENSION_EXONS);
+
+ MatchedBases = (char*)calloc(512, sizeof(char));
+ ExtensionBufferLeft = (char*)calloc(MAX_EXTENSION_LENGTH*512, sizeof(char));
+ ExtensionBufferRight = (char*)calloc(MAX_EXTENSION_LENGTH*512, sizeof(char));
+ ExtensionLeftDecorations = (int*)calloc(512, sizeof(int));
+ ExtensionRightDecorations = (int*)calloc(512, sizeof(int));
+ ExtensionGenomicStart = (int*)calloc(512, sizeof(int));
+ ExtensionGenomicEnd = (int*)calloc(512, sizeof(int));
+
+ ExtensionSpectra = (MSSpectrum**)calloc(512, sizeof(MSSpectrum*));
+}
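+// The side-extension arrays allocated above are used as fixed-width slots: extension i
+// occupies SLeftMatchBuffer + i * MAX_EXTENSION_LENGTH (and likewise for the right-hand
+// and spliced buffers with MAX_SEXTENSION_LENGTH), while SLeftExon/SRightExon hold
+// MAX_EXTENSION_EXONS exon pointers per extension, indexed as [i * MAX_EXTENSION_EXONS + n].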
+
+
+// Helper function: We've successfully extended a tag either forward (Direction=1) or backward (Direction=-1)
+// along the peptide. Set the genomic endpoint, and the flanking (prefix or suffix) amino acid character.
+// What makes the job tricky is that we may have finished at the edge of an exon, either by using up an
+// incoming edge (if Pos==-1) or by using up the entire exon (if Pos+Direction falls off the edge).
+// If we used up the full exon and there's an edge, report the AA for the first edge. (TODO: Sort edges, maybe).
+// If we used up a full exon and there's nothing to link to, report char '-'
+void SetMatchPrefixSuffix(ExonStruct* Exon, int Pos, int Direction)
+{
+ int Length = 0;
+ char AA;
+ ExonEdge* Edge;
+ if (Direction > 0)
+ {
+ // Direction is 1, so set RightGenomicPosition:
+ if (Exon->Start < 0)
+ {
+ // This exon has no known genomic position:
+ SRightGenomicPosition[MH_MatchCount] = -1;
+ }
+ else if (Exon->Sequence)
+ {
+ if (Exon->Gene->ForwardFlag)
+ {
+ if (Pos > -1)
+ {
+ SRightGenomicPosition[MH_MatchCount] = Exon->Start + (Pos+1)*3 + strlen(Exon->Prefix);
+ }
+ else
+ {
+ SRightGenomicPosition[MH_MatchCount] = Exon->Start + strlen(Exon->Prefix);
+ }
+ }
+ else
+ {
+ if (Pos > -1)
+ {
+ // Yes, still add prefix length here:
+ SRightGenomicPosition[MH_MatchCount] = Exon->End - (Pos+1)*3 - strlen(Exon->Prefix);
+ }
+ else
+ {
+ SRightGenomicPosition[MH_MatchCount] = Exon->End - strlen(Exon->Prefix);
+ }
+ }
+ }
+ else
+ {
+ if (Exon->Gene->ForwardFlag)
+ {
+ SRightGenomicPosition[MH_MatchCount] = Exon->Start + strlen(Exon->Prefix);
+ }
+ else
+ {
+ SRightGenomicPosition[MH_MatchCount] = Exon->End - strlen(Exon->Prefix);
+ }
+ }
+ Length = Exon->Length;
+ if (Pos + Direction < Length)
+ {
+ SRightSuffixes[MH_MatchCount] = Exon->Sequence[Pos + Direction];
+ return;
+ }
+ // If we have a forward-edge, use the (most common) forward edge aa
+ if (Exon->ForwardEdgeCount)
+ {
+ Edge = Exon->ForwardEdges;
+ AA = Edge->AA;
+ if (!AA && Edge->Exon->Sequence)
+ {
+ AA = Edge->Exon->Sequence[0];
+ }
+ SRightSuffixes[MH_MatchCount] = AA;
+ return;
+ }
+ SRightSuffixes[MH_MatchCount] = '-';
+ return;
+ }
+ else
+ {
+ // Direction is -1. Set LeftGenomicPosition:
+ if (Exon->Start < 0)
+ {
+ SLeftGenomicPosition[MH_MatchCount] = -1;
+ }
+ else if (Exon->Sequence)
+ {
+ if (Exon->Gene->ForwardFlag)
+ {
+ if (Pos >= 0)
+ {
+ SLeftGenomicPosition[MH_MatchCount] = Exon->Start + Pos*3 + strlen(Exon->Prefix);
+ }
+ else
+ {
+ // We never used any sequence from the exon proper; we used
+ // an incoming aa-edge:
+ SLeftGenomicPosition[MH_MatchCount] = Exon->End - strlen(Exon->Suffix);
+ }
+ }
+ else
+ {
+ if (Pos >= 0)
+ {
+ // Yes, still add prefix length here:
+ SLeftGenomicPosition[MH_MatchCount] = Exon->End - Pos*3 - strlen(Exon->Prefix);
+ }
+ else
+ {
+ SLeftGenomicPosition[MH_MatchCount] = Exon->Start + strlen(Exon->Suffix);
+ }
+ }
+ }
+ else
+ {
+ if (Exon->Gene->ForwardFlag)
+ {
+ SLeftGenomicPosition[MH_MatchCount] = Exon->End - strlen(Exon->Suffix);
+ }
+ else
+ {
+ SLeftGenomicPosition[MH_MatchCount] = Exon->Start + strlen(Exon->Suffix);
+ }
+ }
+
+ if (Pos + Direction >= 0 && Exon->Sequence)
+ {
+ SLeftPrefixes[MH_MatchCount] = Exon->Sequence[Pos + Direction];
+ return;
+ }
+ else if (Exon->BackEdgeCount)
+ {
+ Edge = Exon->BackwardEdges;
+ AA = Edge->AA;
+ if (!AA && Edge->Exon->Sequence)
+ {
+ Length = strlen(Edge->Exon->Sequence);
+ AA = Edge->Exon->Sequence[Length-1];
+ }
+ SLeftPrefixes[MH_MatchCount] = AA;
+ return;
+ }
+ SLeftPrefixes[MH_MatchCount] = '-';
+ return;
+ }
+ }
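+// Worked instance of the arithmetic above (values are illustrative): for a forward-strand
+// exon with Start = 1000 and a one-base Prefix, finishing a rightward extension at Pos = 2
+// gives SRightGenomicPosition = 1000 + (2+1)*3 + 1 = 1010.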
+
+// Copy the exon list from MH_MatchExons into either SLeftExon or SRightExon,
+// initializing the left-over entries to NULL
+void MatchHelperSetExons(int Direction)
+{
+ ExonStruct** MatchExons;
+ ExonEdge** MatchEdges;
+ int Index;
+ if (Direction < 0)
+ {
+ MatchExons = SLeftExon;
+ MatchEdges = SLeftEdge;
+ //MatchSplices = SLeftSpliceScore;
+ }
+ else
+ {
+ MatchExons = SRightExon;
+ MatchEdges = SRightEdge;
+ //MatchSplices = SRightSpliceScore;
+ }
+ for (Index = 0; Index < MAX_EXTENSION_EXONS; Index++)
+ {
+ if (Index >= MH_MatchExonPos)
+ {
+ MatchExons[MH_MatchCount * MAX_EXTENSION_EXONS + Index] = NULL;
+ //MatchSplices[MH_MatchCount*MAX_EXTENSION_EXONS + Index] = -1;
+ }
+ else
+ {
+ MatchExons[MH_MatchCount * MAX_EXTENSION_EXONS + Index] = MH_MatchExons[Index];
+ //MatchSplices[MH_MatchCount*MAX_EXTENSION_EXONS + Index] = MH_MatchSplices[Index];
+ }
+ if (Index >= MH_MatchEdgePos)
+ {
+ MatchEdges[MH_MatchCount * MAX_EXTENSION_EXONS + Index] = NULL;
+ }
+ else
+ {
+ MatchEdges[MH_MatchCount * MAX_EXTENSION_EXONS + Index] = MH_MatchEdges[Index];
+ }
+ }
+}
+
+// Recursion counter - tracks calls to MatchFlankingMassSpliceHelper, so that
+// we can bail out if we take absurdly long. (If we have a 3000Da+ flanking mass
+// and a lot of SNPs, then the search time becomes unacceptable)
+int g_SpliceHelperRecursionCount = 0;
+// The largest count we ever saw before the limit was added: 1862528167
+// Second-largest 816910, 99.99% were <30000
+#define MAX_HELPER_RECURSION_COUNT 30000
+// Recursive function for matching MatchMass. We start out with decoration DecorationMassIndex, and we try
+// smaller decorations (with smaller index number) as we go. We start out with FlankingMass = 0 on the first
+// call; it's nonzero if we hit a splice junction and recurse.
+int MatchFlankingMassSpliceHelper(MSSpectrum* Spectrum, TrieTag* Tag, ExonStruct* Exon,
+ int StartPos, int Direction, int MatchMass, int ModsRemaining,
+ int DecorationMassIndex, int FlankingMass, int BufferPos, int BufferPosSpliced)
+{
+ int Pos;
+ int AAMass;
+ int Diff;
+ int AbsDiff;
+ int MandatoryDecorationChange = 0;
+ int BridgeBufferPos;
+ int BridgeDMI;
+ ExonStruct* BridgeExon;
+ int BridgeMass;
+ //char* EdgeAA;
+ int EdgeCount;
+ //ExonStruct** EdgeExon;
+ //int* EdgePower;
+ ExonEdge* Edges;
+ int EdgeIndex;
+ int Length;
+ int VerboseFlag = 0;
+ int OldMatchExonPos;
+ //int OldMatchSplicePos;
+ int OldMatchEdgePos;
+ int BridgeBufferPosSpliced;
+
+ g_SpliceHelperRecursionCount++;
+
+ //////////////////////////
+ // StartPos < 0 if we're starting at the edge of the exon and working inward.
+ if (StartPos < 0)
+ {
+ if (Direction>0)
+ {
+ Pos = 0;
+ }
+ else
+ {
+ if (Exon->Sequence)
+ {
+ Pos = Exon->Length - 1;
+ }
+ else
+ {
+ Pos = -1;
+ }
+ }
+ MH_MatchExons[MH_MatchExonPos] = Exon;
+ MH_MatchExonPos++;
+ if (MH_MatchExonPos >= MAX_EXTENSION_EXONS)
+ {
+ // Bail out! We extended across too many exons!
+ return 0;
+ }
+ }
+ else
+ {
+ // The tag includes the character at StartPos, so move to the next character:
+ Pos = StartPos + Direction;
+ }
+ Length = Exon->Length;
+
+ // First, we'll extend out as far as possible WITHOUT bridging:
+ while (1)
+ {
+ if (Pos < 0 || Pos >= Length)
+ {
+ break;
+ }
+ if (DecorationMassIndex < 0)
+ {
+ break;
+ }
+ AAMass = PeptideMass[Exon->Sequence[Pos]];
+ if (!AAMass)
+ {
+ // We've reached a stop codon.
+ DecorationMassIndex = -1;
+ break;
+ }
+ FlankingMass += AAMass;
+ MH_Buffer[BufferPos++] = Exon->Sequence[Pos];
+ MH_BufferSpliced[BufferPosSpliced++] = Exon->Sequence[Pos];
+ Diff = MatchMass - (FlankingMass + AllDecorations[DecorationMassIndex].Mass);
+ AbsDiff = abs(Diff);
+ if (AbsDiff < GlobalOptions->FlankingMassEpsilon)
+ {
+ // Aha! This is *probably* a match. Check to be sure we have the bases we need:
+ if (CheckForPTAttachmentPoints(DecorationMassIndex, MH_Buffer, 0, BufferPos - 1, 1))
+ {
+ if (VerboseFlag)
+ {
+ printf("Side is match! Dec-index %d, flank %.2f.\n", DecorationMassIndex, FlankingMass / (float)MASS_SCALE);
+ printf("Copy to buffer. Match count is %d, bufferpos is %d\n", MH_MatchCount, BufferPos);
+ }
+ strncpy(MH_MatchBuffer + MAX_EXTENSION_LENGTH * MH_MatchCount, MH_Buffer, BufferPos);
+ MH_MatchBuffer[MAX_EXTENSION_LENGTH * MH_MatchCount + BufferPos] = '\0';
+ strncpy(MH_MatchBufferSpliced + MAX_SEXTENSION_LENGTH * MH_MatchCount, MH_BufferSpliced, BufferPosSpliced);
+ MH_MatchBufferSpliced[MAX_SEXTENSION_LENGTH * MH_MatchCount + BufferPosSpliced] = '\0';
+ // Set prefix or suffix for this extension:
+ SetMatchPrefixSuffix(Exon, Pos, Direction);
+ MH_MatchDecoration[MH_MatchCount] = DecorationMassIndex;
+ MatchHelperSetExons(Direction);
+ MH_MatchCount++;
+ if (MH_MatchCount >= MAX_SIDE_EXTENSIONS)
+ {
+ return MH_MatchCount;
+ }
+ }
+ }
+ // Move the DecorationMassIndex, if needed.
+ while (MandatoryDecorationChange || FlankingMass + AllDecorations[DecorationMassIndex].Mass > MH_MinMatchMass)
+ {
+ // The flanking sequence's mass is significantly bigger than our (decorated) target mass.
+ // Move to a smaller decoration:
+ MandatoryDecorationChange = 0;
+ DecorationMassIndex--;
+ if (DecorationMassIndex<0)
+ {
+ break;
+ }
+ // Skip any decorations that include phosphorylation, if we're not on phospho mode.
+ if (!GlobalOptions->PhosphorylationFlag && g_PhosphorylationMod > -1 && AllDecorations[DecorationMassIndex].Mods[g_PhosphorylationMod])
+ {
+ MandatoryDecorationChange = 1;
+ continue;
+ }
+ if (AllDecorations[DecorationMassIndex].TotalMods > ModsRemaining)
+ {
+ continue;
+ }
+ // This decoration is acceptable. Check for a match:
+ Diff = MatchMass - (FlankingMass + AllDecorations[DecorationMassIndex].Mass);
+ AbsDiff = abs(Diff);
+ if (AbsDiff < GlobalOptions->FlankingMassEpsilon)
+ {
+ // Aha! This is *probably* a match. Check to be sure we have the bases we need:
+ if (CheckForPTAttachmentPoints(DecorationMassIndex, MH_Buffer, 0, BufferPos-1, 1))
+ {
+ if (VerboseFlag)
+ {
+ printf("Left is match! Dec-index %d, flank %.2f.\n", DecorationMassIndex, FlankingMass / (float)MASS_SCALE);
+ }
+ strncpy(MH_MatchBuffer + MAX_EXTENSION_LENGTH * MH_MatchCount, MH_Buffer, BufferPos);
+ MH_MatchBuffer[MAX_EXTENSION_LENGTH * MH_MatchCount + BufferPos] = '\0';
+ strncpy(MH_MatchBufferSpliced + MAX_SEXTENSION_LENGTH * MH_MatchCount, MH_BufferSpliced, BufferPosSpliced);
+ MH_MatchBufferSpliced[MAX_SEXTENSION_LENGTH * MH_MatchCount + BufferPosSpliced] = '\0';
+
+ // Set prefix or suffix for this extension:
+ SetMatchPrefixSuffix(Exon, Pos, Direction);
+ MH_MatchDecoration[MH_MatchCount] = DecorationMassIndex;
+ MatchHelperSetExons(Direction);
+ MH_MatchCount++;
+ if (MH_MatchCount >= MAX_SIDE_EXTENSIONS)
+ {
+ return MH_MatchCount;
+ }
+ MandatoryDecorationChange = 1;
+ }
+ }
+ }
+ Pos += Direction;
+ }
+
+ // If DMI < 0, then our flanking mass became too large or we hit a stop codon:
+ if (DecorationMassIndex<0)
+ {
+ return MH_MatchCount;
+ }
+
+ // Now: We reached the end of the exon, so next we'll try each edge:
+ if (Direction > 0)
+ {
+ Edges = Exon->ForwardEdges;
+ EdgeCount = Exon->ForwardEdgeCount;
+ }
+ else
+ {
+ Edges = Exon->BackwardEdges;
+ EdgeCount = Exon->BackEdgeCount;
+ }
+ // Save our current state (FlankingMass, BufferPos, and DecorationMassIndex). After trying each edge,
+ // we return to this state.
+ BridgeMass = FlankingMass;
+ BridgeBufferPos = BufferPos;
+ BridgeBufferPosSpliced = BufferPosSpliced;
+ BridgeDMI = DecorationMassIndex;
+ OldMatchExonPos = MH_MatchExonPos;
+ OldMatchEdgePos = MH_MatchEdgePos;
+
+ for (EdgeIndex = 0; EdgeIndex < EdgeCount; EdgeIndex++)
+ {
+ FlankingMass = BridgeMass;
+ BufferPos = BridgeBufferPos;
+ BufferPosSpliced = BridgeBufferPosSpliced;
+ DecorationMassIndex = BridgeDMI;
+ MH_MatchExonPos = OldMatchExonPos;
+ MH_MatchEdgePos = OldMatchEdgePos;
+
+ BridgeExon = Edges[EdgeIndex].Exon;
+ MH_MatchEdges[MH_MatchEdgePos] = Edges + EdgeIndex;
+ MH_MatchEdgePos++;
+ // Extend with the edge amino acid:
+ if (Edges[EdgeIndex].AA)
+ {
+ AAMass = PeptideMass[Edges[EdgeIndex].AA];
+ if (!AAMass)
+ {
+ continue; // terminator
+ }
+ FlankingMass += AAMass;
+ // If this is a "true edge" (not an adjacent-edge), then note the splicing:
+ if (Edges[EdgeIndex].Power)
+ {
+ MH_BufferSpliced[BufferPosSpliced++] = ';';
+ MH_BufferSpliced[BufferPosSpliced++] = Edges[EdgeIndex].AA;
+ MH_BufferSpliced[BufferPosSpliced++] = ';';
+ }
+ else
+ {
+ MH_BufferSpliced[BufferPosSpliced++] = Edges[EdgeIndex].AA;
+ }
+ MH_MatchExons[MH_MatchExonPos] = BridgeExon;
+ MH_MatchExonPos++;
+ if (MH_MatchExonPos >= MAX_EXTENSION_EXONS)
+ {
+ // Bail out! We extended across too many exons!
+ MH_MatchExonPos--;
+ continue;
+ }
+
+ MH_Buffer[BufferPos++] = Edges[EdgeIndex].AA; //EdgeAA[EdgeIndex];
+ Diff = MatchMass - (FlankingMass + AllDecorations[DecorationMassIndex].Mass);
+ AbsDiff = abs(Diff);
+ if (AbsDiff < GlobalOptions->FlankingMassEpsilon)
+ {
+ // Aha! This is *probably* a match. Check to be sure we have the bases we need:
+ if (CheckForPTAttachmentPoints(DecorationMassIndex, MH_Buffer, 0, BufferPos-1, 1))
+ {
+ if (VerboseFlag)
+ {
+ printf("Side is match! Dec-index %d, flank %.2f.\n", DecorationMassIndex, FlankingMass / (float)MASS_SCALE);
+ }
+ strncpy(MH_MatchBuffer + MAX_EXTENSION_LENGTH * MH_MatchCount, MH_Buffer, BufferPos);
+ MH_MatchBuffer[MAX_EXTENSION_LENGTH * MH_MatchCount + BufferPos] = '\0';
+ strncpy(MH_MatchBufferSpliced + MAX_SEXTENSION_LENGTH * MH_MatchCount, MH_BufferSpliced, BufferPosSpliced);
+ MH_MatchBufferSpliced[MAX_SEXTENSION_LENGTH * MH_MatchCount + BufferPosSpliced] = '\0';
+ // Set prefix or suffix for this extension:
+ if (Direction > 0)
+ {
+ // Direction > 0: set suffix!
+ if (BridgeExon->Sequence)
+ {
+ SRightSuffixes[MH_MatchCount] = BridgeExon->Sequence[0];
+ }
+ else
+ {
+ SRightSuffixes[MH_MatchCount] = '-';
+ }
+ if (Exon->Start < 0)
+ {
+ SRightGenomicPosition[MH_MatchCount] = -1;
+ }
+ else if (Exon->Gene->ForwardFlag)
+ {
+ SRightGenomicPosition[MH_MatchCount] = BridgeExon->Start + strlen(BridgeExon->Prefix);
+ }
+ else
+ {
+ SRightGenomicPosition[MH_MatchCount] = BridgeExon->End - strlen(BridgeExon->Prefix);
+ }
+
+ }
+ else
+ {
+ // Direction < 0: set prefix!
+ if (BridgeExon->Sequence && strlen(BridgeExon->Sequence) > 0)
+ {
+ SLeftPrefixes[MH_MatchCount] = BridgeExon->Sequence[strlen(BridgeExon->Sequence) - 1];
+ }
+ else
+ {
+ // Empty or missing sequence: there is no usable prefix residue.
+ SLeftPrefixes[MH_MatchCount] = '-';
+ }
+ if (Exon->Start < 0)
+ {
+ SLeftGenomicPosition[MH_MatchCount] = -1;
+ }
+ else if (Exon->Gene->ForwardFlag)
+ {
+ SLeftGenomicPosition[MH_MatchCount] = BridgeExon->End - strlen(BridgeExon->Suffix);
+ }
+ else
+ {
+ SLeftGenomicPosition[MH_MatchCount] = BridgeExon->Start + strlen(BridgeExon->Suffix);
+ }
+ }
+ MH_MatchDecoration[MH_MatchCount] = DecorationMassIndex;
+ MatchHelperSetExons(Direction);
+ MH_MatchCount++;
+ if (MH_MatchCount >= MAX_SIDE_EXTENSIONS)
+ {
+ return MH_MatchCount;
+ }
+ }
+ }
+ // Move the DecorationMassIndex, if needed.
+ while (MandatoryDecorationChange || FlankingMass + AllDecorations[DecorationMassIndex].Mass > MH_MinMatchMass)
+ {
+ // The flanking sequence's mass is significantly bigger than our (decorated) target mass.
+ // Move to a smaller decoration:
+ MandatoryDecorationChange = 0;
+ DecorationMassIndex--;
+ if (DecorationMassIndex<0)
+ {
+ break;
+ }
+ // Skip any decorations that include phosphorylation, if we're not on phospho mode:
+ if (!GlobalOptions->PhosphorylationFlag && g_PhosphorylationMod>-1 && AllDecorations[DecorationMassIndex].Mods[g_PhosphorylationMod])
+ {
+ MandatoryDecorationChange = 1;
+ continue;
+ }
+ if (AllDecorations[DecorationMassIndex].TotalMods > ModsRemaining)
+ {
+ continue;
+ }
+ // And, check for a match:
+ Diff = MatchMass - (FlankingMass + AllDecorations[DecorationMassIndex].Mass);
+ AbsDiff = abs(Diff);
+ if (AbsDiff < GlobalOptions->FlankingMassEpsilon)
+ {
+ // Aha! This is *probably* a match. Check to be sure we have the bases we need:
+ if (CheckForPTAttachmentPoints(DecorationMassIndex, MH_Buffer, 0, BufferPos-1, 1))
+ {
+ if (VerboseFlag)
+ {
+ printf("Left is match! Dec-index %d, flank %.2f.\n", DecorationMassIndex, FlankingMass / (float)MASS_SCALE);
+ }
+ strncpy(MH_MatchBuffer + MAX_EXTENSION_LENGTH*MH_MatchCount, MH_Buffer, BufferPos);
+ MH_MatchBuffer[MAX_EXTENSION_LENGTH * MH_MatchCount + BufferPos] = '\0';
+ strncpy(MH_MatchBufferSpliced + MAX_SEXTENSION_LENGTH * MH_MatchCount, MH_BufferSpliced, BufferPosSpliced);
+ MH_MatchBufferSpliced[MAX_SEXTENSION_LENGTH * MH_MatchCount + BufferPosSpliced] = '\0';
+ // Set prefix or suffix for this extension:
+ if (Direction > 0)
+ {
+ if (BridgeExon->Sequence)
+ {
+ SRightSuffixes[MH_MatchCount] = BridgeExon->Sequence[0];
+ }
+ else
+ {
+ SRightSuffixes[MH_MatchCount] = '-';
+ }
+ if (BridgeExon->Start < 0)
+ {
+ SRightGenomicPosition[MH_MatchCount] = -1;
+ }
+ else if (Exon->Gene->ForwardFlag)
+ {
+ SRightGenomicPosition[MH_MatchCount] = BridgeExon->Start + strlen(BridgeExon->Prefix);
+ }
+ else
+ {
+ SRightGenomicPosition[MH_MatchCount] = BridgeExon->End - strlen(BridgeExon->Prefix);
+ }
+ }
+ else
+ {
+ if (BridgeExon->Sequence)
+ {
+ SLeftPrefixes[MH_MatchCount] = BridgeExon->Sequence[strlen(BridgeExon->Sequence)-1];
+ }
+ else
+ {
+ SLeftPrefixes[MH_MatchCount] = '-';
+ }
+ if (BridgeExon->Start < 0)
+ {
+ SLeftGenomicPosition[MH_MatchCount] = -1;
+ }
+ else if (Exon->Gene->ForwardFlag)
+ {
+ SLeftGenomicPosition[MH_MatchCount] = BridgeExon->End - strlen(BridgeExon->Suffix);
+ }
+ else
+ {
+ SLeftGenomicPosition[MH_MatchCount] = BridgeExon->Start + strlen(BridgeExon->Suffix);
+ }
+ }
+
+ MH_MatchDecoration[MH_MatchCount] = DecorationMassIndex;
+ MatchHelperSetExons(Direction);
+ MH_MatchCount++;
+ if (MH_MatchCount >= MAX_SIDE_EXTENSIONS)
+ {
+ return MH_MatchCount;
+ }
+ MandatoryDecorationChange = 1;
+ }
+ }
+ }
+ MH_MatchExonPos--;
+ } // If the edge has an AA
+ else
+ {
+ if (Edges[EdgeIndex].Power)
+ {
+ MH_BufferSpliced[BufferPosSpliced++] = ':';
+ }
+ }
+
+ // Recurse! Call MatchFlankingMassSpliceHelper again:
+ MatchFlankingMassSpliceHelper(Spectrum, Tag, BridgeExon, -1, Direction, MatchMass, ModsRemaining,
+ DecorationMassIndex, FlankingMass, BufferPos, BufferPosSpliced);
+ if (MH_MatchCount >= MAX_SIDE_EXTENSIONS)
+ {
+ return MH_MatchCount;
+ }
+ if (g_SpliceHelperRecursionCount >= MAX_HELPER_RECURSION_COUNT)
+ {
+ return MH_MatchCount;
+ }
+ } // Iteration over edges
+ return MH_MatchCount;
+}
+
+int MatchFlankingMassSpliced(MSSpectrum* Spectrum, TrieTag* Tag, ExonStruct* Exon, int StartPos, int Direction,
+ int MatchMass, int ModsRemaining)
+{
+ static int DecorationMassIndex;
+ static int AAMass;
+ //
+ /////////////////////////////////////////////////////////
+ // If prefix mass is zero, that qualifies as a match always.
+ MH_MatchCount = 0;
+ if (MatchMass < GlobalOptions->FlankingMassEpsilon)
+ {
+ if (Direction < 0)
+ {
+ SLeftMatchDecoration[0] = PlainOldDecorationIndex;
+ SLeftExon[0] = NULL;
+ SLeftEdge[0] = NULL;
+ SLeftMatchBuffer[0] = '\0';
+ SLeftMatchBufferSpliced[0] = '\0';
+ SetMatchPrefixSuffix(Exon, StartPos, Direction);
+ }
+ else
+ {
+ SRightMatchDecoration[0] = PlainOldDecorationIndex;
+ SRightExon[0] = NULL;
+ SRightEdge[0] = NULL;
+ SRightMatchBuffer[0] = '\0';
+ SRightMatchBufferSpliced[0] = '\0';
+ SetMatchPrefixSuffix(Exon, StartPos, Direction);
+ }
+ return 1;
+ }
+
+ MH_MinMatchMass = MatchMass - GlobalOptions->FlankingMassEpsilon;
+ MH_MaxMatchMass = MatchMass + GlobalOptions->FlankingMassEpsilon;
+ if (Direction < 0)
+ {
+ MH_MatchBuffer = SLeftMatchBuffer;
+ MH_MatchBufferSpliced = SLeftMatchBufferSpliced;
+ MH_MatchDecoration = SLeftMatchDecoration;
+ MH_MatchExonPos = 0;
+ //MH_MatchSplices = SLeftSpliceScore;
+ //MH_MatchSplicePos = 0;
+ MH_MatchEdgePos = 0;
+ }
+ else
+ {
+ MH_MatchBuffer = SRightMatchBuffer;
+ MH_MatchBufferSpliced = SRightMatchBufferSpliced;
+ MH_MatchDecoration = SRightMatchDecoration;
+ MH_MatchExonPos = 0;
+ //MH_MatchSplices = SRightSpliceScore;
+ //MH_MatchSplicePos = 0;
+ MH_MatchEdgePos = 0;
+ }
+
+ DecorationMassIndex = AllDecorationCount - 1;
+ // Skip over any decorations that use up too many pt-mods:
+ while (1)
+ {
+ if (AllDecorations[DecorationMassIndex].TotalMods > ModsRemaining)
+ {
+ DecorationMassIndex--;
+ continue;
+ }
+ break;
+ }
+
+ MH_MatchExonPos = 0;
+ MH_MatchEdgePos = 0;
+ // Perform tag extension, following edges as needed:
+ g_SpliceHelperRecursionCount = 0;
+ MatchFlankingMassSpliceHelper(Spectrum, Tag, Exon, StartPos, Direction,
+ MatchMass, ModsRemaining, DecorationMassIndex, 0, 0, 0);
+ return MH_MatchCount;
+
+}
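+// MatchFlankingMassSpliced() returns the number of extensions recorded on the requested
+// side (capped at MAX_SIDE_EXTENSIONS). A flanking mass within FlankingMassEpsilon of zero
+// is treated as a single trivial extension: the match buffers are emptied and the plain
+// (unmodified) decoration is used.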
+
+// Copies a string to a destination, in reverse character order.
+void ReverseStringCopy(char* Target, char* Source)
+{
+ int Length;
+ char* SourceChar;
+ //
+ Length = strlen(Source);
+ for (SourceChar = Source + Length - 1; SourceChar >= Source; SourceChar--)
+ {
+ *Target = *SourceChar;
+ Target++;
+ }
+
+}
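+// Note that ReverseStringCopy() does not write a terminating '\0'; callers are expected
+// to terminate or overwrite the destination themselves, as GetSplicedMatches() does.
+// Illustrative call (Buffer is a hypothetical char array of sufficient size):
+//   ReverseStringCopy(Buffer, "EDK"); /* Buffer now begins with "KDE"; no terminator added */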
+
+#define MINIMUM_EXON_LENGTH 4
+
+// A tag has been matched. Its left edge lies at LeftExonPos in LeftExon (or at -1 if its leftmost character
+// comes from an edge). Its right edge lies at RightExonPos in RightExon (or at -1 if its rightmost character
+// comes from an edge). Try to extend out to a prefix/suffix mass match. Analogous to the GetMatches() function
+// in standard trie search. The difference is that our extension can follow an exon edge.
+void GetSplicedMatches(SearchInfo* Info, TrieNode* Node, ExonStruct* LeftExon, int LeftExonPos,
+ ExonStruct* RightExon, int RightExonPos)
+{
+ int LeftMatchCount;
+ int RightMatchCount;
+ int LeftMatchIndex;
+ int RightMatchIndex;
+ int ModIndex;
+ int Length;
+ int ModsRemaining;
+ int Pos;
+ Peptide* Match;
+ int VerboseFlag = 0;
+ int ForwardFlag;
+ MSSpectrum* Spectrum;
+ static int PTMLimit[MAX_PT_MODTYPE];
+ TrieTagHanger* TagNode;
+ int ExtensionIndex;
+ int ExtensionCount = 0;
+ int ExtensionFound;
+ int UsedTooMany;
+ int ExIndex;
+ int ExonCount;
+ int SpliceScoreCount;
+ ExonStruct* TempExon;
+ ExonStruct* AllExons[256];
+ ExonEdge* AllEdges[256];
+ char SplicedBases[256];
+ int AllEdgeCount;
+ int AllExonCount;
+ PeptideSpliceNode* SpliceTail;
+ PeptideSpliceNode* SpliceNode;
+ //int GenomicLocation;
+ PeptideSpliceNode* PrevSpliceNode;
+ int EdgeIndex;
+ ExonEdge* TempEdge;
+ int GenomicStart;
+ int GenomicEnd;
+ char* ShortExonCheck;
+ int DistanceFromLastJunction;
+ int InvalidExonFlag;
+ //////////////
+ //printf("GetSplicedMatches() called for tag %s\n", Node->FirstTag->Tag->Tag);
+ if (!Node->FirstTag)
+ {
+ return;
+ }
+ ForwardFlag = LeftExon->Gene->ForwardFlag;
+ for (TagNode = Node->FirstTag; TagNode; TagNode = TagNode->Next)
+ {
+ if (VerboseFlag)
+ {
+ printf("Matched tag '%s' (pre %.2f post %.2f).\n Left exon %d pos %d, right exon %d pos %d\n",
+ TagNode->Tag->Tag, TagNode->Tag->PrefixMass / (float)MASS_SCALE, TagNode->Tag->SuffixMass / (float)MASS_SCALE,
+ LeftExon->Start, LeftExonPos, RightExon->Start, RightExonPos);
+ }
+ /*
+ printf("TagNode: %p\n",TagNode);
+ fflush(stdout);
+ printf("Tag: %p\n",TagNode->Tag);
+ fflush(stdout);
+ printf("Seq: %s\n",TagNode->Tag->Tag);
+ fflush(stdout);
+ printf("PSpectrum: %p\n",TagNode->Tag->PSpectrum);
+ fflush(stdout);
+ */
+ Spectrum = TagNode->Tag->PSpectrum;
+ Info->Spectrum = Spectrum;
+ memcpy(PTMLimit, g_PTMLimit, sizeof(int) * AllPTModCount);
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (TagNode->Tag->AminoIndex[ModIndex] < 0)
+ {
+ break;
+ }
+ PTMLimit[TagNode->Tag->ModType[ModIndex]->Index] -= 1;
+ }
+ ModsRemaining = GlobalOptions->MaxPTMods - TagNode->Tag->ModsUsed;
+ if (ModsRemaining < 0)
+ {
+ continue;
+ }
+ LeftMatchCount = MatchFlankingMassSpliced(Spectrum, TagNode->Tag, LeftExon, LeftExonPos, -1, TagNode->Tag->PrefixMass, ModsRemaining);
+ if (LeftMatchCount == 0)
+ {
+ continue;
+ }
+ RightMatchCount = MatchFlankingMassSpliced(Spectrum, TagNode->Tag, RightExon, RightExonPos, 1, TagNode->Tag->SuffixMass, ModsRemaining);
+ if (RightMatchCount == 0)
+ {
+ continue;
+ }
+
+ // Consider each combination of left-decoration and right-decoration:
+ for (LeftMatchIndex = 0; LeftMatchIndex < LeftMatchCount; LeftMatchIndex++)
+ {
+ for (RightMatchIndex = 0; RightMatchIndex < RightMatchCount; RightMatchIndex++)
+ {
+ if (VerboseFlag)
+ {
+ printf("LMI %d RMI %d Count %d\n", LeftMatchIndex, RightMatchIndex, ExtensionCount);
+ }
+ UsedTooMany = 0;
+ for (ModIndex = 0; ModIndex < AllPTModCount; ModIndex++)
+ {
+
+ if (AllDecorations[SLeftMatchDecoration[LeftMatchIndex]].Mods[ModIndex] +
+ AllDecorations[SRightMatchDecoration[RightMatchIndex]].Mods[ModIndex] > PTMLimit[ModIndex])
+ {
+ UsedTooMany = 1;
+ break;
+ }
+ }
+ if (UsedTooMany)
+ {
+ continue;
+ }
+ if (AllDecorations[SLeftMatchDecoration[LeftMatchIndex]].TotalMods +
+ AllDecorations[SRightMatchDecoration[RightMatchIndex]].TotalMods > ModsRemaining)
+ {
+ continue;
+ }
+ if (GlobalOptions->MandatoryModIndex > -1 &&
+ !TagNode->Tag->MandatoryModUsed &&
+ AllDecorations[SLeftMatchDecoration[LeftMatchIndex]].Mods[GlobalOptions->MandatoryModIndex] == 0 &&
+ AllDecorations[SRightMatchDecoration[RightMatchIndex]].Mods[GlobalOptions->MandatoryModIndex] == 0)
+ {
+ continue; // We don't have our mandatory PTM (biotin, or whatever)
+ }
+ if (LeftExon->Gene->ForwardFlag)
+ {
+ GenomicStart = SLeftGenomicPosition[LeftMatchIndex];
+ GenomicEnd = SRightGenomicPosition[RightMatchIndex];
+ }
+ else
+ {
+ GenomicStart = SRightGenomicPosition[RightMatchIndex];
+ GenomicEnd = SLeftGenomicPosition[LeftMatchIndex];
+ }
+ // Don't produce the same extension multiple times:
+ ExtensionFound = 0;
+ for (ExtensionIndex = 0; ExtensionIndex < ExtensionCount; ExtensionIndex++)
+ {
+ if (!strcmp(ExtensionBufferLeft + ExtensionIndex*MAX_EXTENSION_LENGTH, SLeftMatchBuffer + LeftMatchIndex*MAX_EXTENSION_LENGTH)
+ && !strcmp(ExtensionBufferRight + ExtensionIndex*MAX_EXTENSION_LENGTH, SRightMatchBuffer + RightMatchIndex*MAX_EXTENSION_LENGTH)
+ && ExtensionLeftDecorations[ExtensionIndex] == SLeftMatchDecoration[LeftMatchIndex]
+ && ExtensionRightDecorations[ExtensionIndex] == SRightMatchDecoration[RightMatchIndex]
+ && ExtensionSpectra[ExtensionIndex] == TagNode->Tag->PSpectrum)
+ {
+ // Gosh, looks like we found the same peptide again (probably by starting with
+ // another valid tag). Let's check whether the genomic endpoints are the
+ // same as well:
+ if (GenomicStart == ExtensionGenomicStart[ExtensionIndex] && GenomicEnd == ExtensionGenomicEnd[ExtensionIndex])
+ {
+ ExtensionFound = 1;
+ break;
+ }
+ }
+ }
+ if (ExtensionFound)
+ {
+ continue;
+ }
+ ExtensionLeftDecorations[ExtensionCount] = SLeftMatchDecoration[LeftMatchIndex];
+ ExtensionRightDecorations[ExtensionCount] = SRightMatchDecoration[RightMatchIndex];
+ strcpy(ExtensionBufferLeft + ExtensionCount * MAX_EXTENSION_LENGTH, SLeftMatchBuffer + LeftMatchIndex * MAX_EXTENSION_LENGTH);
+ strcpy(ExtensionBufferRight + ExtensionCount * MAX_EXTENSION_LENGTH, SRightMatchBuffer + RightMatchIndex * MAX_EXTENSION_LENGTH);
+ ExtensionSpectra[ExtensionCount] = TagNode->Tag->PSpectrum;
+
+ // MatchedBases is concatenated together from five sources:
+ // [0] the prefix amino acid (SLeftPrefixes)
+ // [1..] the left extension, reversed back into N-to-C order (SLeftMatchBuffer)
+ // ... the matched tag characters (g_TagBuffer)
+ // ... the right extension (SRightMatchBuffer)
+ // [last] the suffix amino acid (SRightSuffixes)
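+ // (For instance, prefix 'R', left extension "DE", tag "VGEPK", right extension "AK",
+ // and suffix 'L' yield MatchedBases = "RDEVGEPKAKL"; the flanking residues occupy the
+ // first and last positions, with the matched peptide in between.)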
+ Pos = strlen(SLeftMatchBuffer + MAX_EXTENSION_LENGTH * LeftMatchIndex);
+ MatchedBases[0] = SLeftPrefixes[LeftMatchIndex];
+ ReverseStringCopy(MatchedBases + 1, SLeftMatchBuffer + MAX_EXTENSION_LENGTH*LeftMatchIndex);
+ g_TagBuffer[g_TagBufferPos] = '\0';
+ strcpy(MatchedBases + 1 + Pos, g_TagBuffer);
+ //strcpy(MatchedBases + 1 + Pos, TagNode->Tag->Tag);
+ strcpy(MatchedBases + 1 + Pos + strlen(TagNode->Tag->Tag), SRightMatchBuffer + MAX_EXTENSION_LENGTH*RightMatchIndex);
+ Length = strlen(MatchedBases+1);
+ MatchedBases[strlen(MatchedBases+1)+2] = '\0';
+ MatchedBases[strlen(MatchedBases+1)+1] = SRightSuffixes[RightMatchIndex];
+
+ // Set SplicedBases, and check for unacceptably short exons:
+ Pos = strlen(SLeftMatchBufferSpliced + MAX_SEXTENSION_LENGTH * LeftMatchIndex);
+ ReverseStringCopy(SplicedBases, SLeftMatchBufferSpliced + MAX_SEXTENSION_LENGTH * LeftMatchIndex);
+ g_TagBufferSpliced[g_TagBufferPosSpliced] = '\0';
+ strcpy(SplicedBases + Pos, g_TagBufferSpliced);
+ strcpy(SplicedBases + Pos + strlen(g_TagBufferSpliced), SRightMatchBufferSpliced + MAX_SEXTENSION_LENGTH * RightMatchIndex);
+ DistanceFromLastJunction = 999;
+ InvalidExonFlag = 0;
+ for (ShortExonCheck = SplicedBases; *ShortExonCheck; ShortExonCheck++)
+ {
+ switch (*ShortExonCheck)
+ {
+ case ';':
+ if (DistanceFromLastJunction < MINIMUM_EXON_LENGTH)
+ {
+ InvalidExonFlag = 1;
+ }
+ else
+ {
+ ShortExonCheck += 2; // We're at the start of ;x;, skip over aa and other ;
+ DistanceFromLastJunction = 0;
+ }
+ break;
+ case ':':
+ if (DistanceFromLastJunction < MINIMUM_EXON_LENGTH)
+ {
+ InvalidExonFlag = 1;
+ }
+ DistanceFromLastJunction = 0;
+ break;
+ default:
+ DistanceFromLastJunction++;
+ break;
+ }
+ if (InvalidExonFlag)
+ {
+ break;
+ }
+ }
+ // Reject, if unacceptably short exons were used:
+ if (InvalidExonFlag)
+ {
+ continue;
+ }
+
+ ExtensionGenomicStart[ExtensionCount] = GenomicStart;
+ ExtensionGenomicEnd[ExtensionCount] = GenomicEnd;
+ Match = AddNewMatch(Info, -1, TagNode->Tag,
+ MatchedBases + 1, Length,
+ strlen(SLeftMatchBuffer + MAX_EXTENSION_LENGTH * LeftMatchIndex),
+ SLeftMatchDecoration[LeftMatchIndex], SRightMatchDecoration[RightMatchIndex],
+ GenomicStart, GenomicEnd);
+
+ if (Match)
+ {
+ // We might have some splice nodes stored here. If so, free them:
+ if (Match->SpliceHead)
+ {
+ PrevSpliceNode = NULL;
+ for (SpliceNode = Match->SpliceHead; SpliceNode; SpliceNode = SpliceNode->Next)
+ {
+ SafeFree(PrevSpliceNode);
+ PrevSpliceNode = SpliceNode;
+ }
+ SafeFree(PrevSpliceNode);
+ Match->SpliceHead = NULL;
+ }
+ //Match->GenomicLocation = GenomicLocation;
+
+ Match->ChromosomeNumber = LeftExon->Gene->ChromosomeNumber;
+ Match->ChromosomeForwardFlag = LeftExon->Gene->ForwardFlag;
+ Match->RecordNumber = Info->RecordNumber;
+ // Copy in the list of exons and the splice scores:
+ ExonCount = 0;
+ SpliceScoreCount = 0;
+ AllExonCount = 0;
+ AllEdgeCount = 0;
+ // Read exons from the prefix:
+ for (ExIndex = 0; ExIndex < MAX_EXTENSION_EXONS; ExIndex++)
+ {
+ TempExon = SLeftExon[LeftMatchIndex * MAX_EXTENSION_EXONS + ExIndex];
+ if (!TempExon)
+ {
+ ExIndex--;
+ break;
+ }
+ }
+ while (ExIndex > -1)
+ {
+ AllExons[AllExonCount] = SLeftExon[LeftMatchIndex * MAX_EXTENSION_EXONS + ExIndex];
+ AllExonCount++;
+ ExIndex--;
+ }
+ // Read edges from the prefix:
+ for (ExIndex = 0; ExIndex < MAX_EXTENSION_EXONS; ExIndex++)
+ {
+ TempEdge = SLeftEdge[LeftMatchIndex * MAX_EXTENSION_EXONS + ExIndex];
+ if (!TempEdge)
+ {
+ ExIndex--;
+ break;
+ }
+ }
+ while (ExIndex > -1)
+ {
+ AllEdges[AllEdgeCount] = GetReciprocalExonEdge(SLeftEdge[LeftMatchIndex * MAX_EXTENSION_EXONS + ExIndex], 0);
+ AllEdgeCount++;
+ ExIndex--;
+ }
+ // Read exons from the tag:
+ for (ExIndex = 0; ExIndex < g_TagExonArrayPos; ExIndex++)
+ {
+ if (AllExonCount && (AllExons[AllExonCount-1] == g_TagExonArray[ExIndex]))
+ {
+ continue;
+ }
+ AllExons[AllExonCount] = g_TagExonArray[ExIndex];
+ AllExonCount++;
+ }
+ // Read edges from the tag:
+ for (ExIndex = 0; ExIndex < g_TagSpliceArrayPos; ExIndex++)
+ {
+ AllEdges[AllEdgeCount] = g_TagSpliceArray[ExIndex];
+ AllEdgeCount++;
+ }
+ // Read exons from the suffix:
+ for (ExIndex = 0; ExIndex < MAX_EXTENSION_EXONS; ExIndex++)
+ {
+ TempExon = SRightExon[RightMatchIndex * MAX_EXTENSION_EXONS + ExIndex];
+ if (TempExon)
+ {
+ if (AllExonCount && (AllExons[AllExonCount-1] == TempExon))
+ {
+ continue;
+ }
+ AllExons[AllExonCount] = TempExon;
+ AllExonCount++;
+ }
+ else
+ {
+ break; // After the first null exon comes undefined rubbish data
+ }
+ }
+ // Read edges from the suffix:
+ for (ExIndex = 0; ExIndex < MAX_EXTENSION_EXONS; ExIndex++)
+ {
+ TempEdge = SRightEdge[RightMatchIndex * MAX_EXTENSION_EXONS + ExIndex];
+ if (TempEdge)
+ {
+ AllEdges[AllEdgeCount] = TempEdge;
+ AllEdgeCount++;
+ }
+ else
+ {
+ break; // After the first null exon comes undefined rubbish data
+ }
+ }
+ // Store the sequence, with splice boundaries indicated:
+ SafeFree(Match->SplicedBases);
+ Match->SplicedBases = (char*)calloc(sizeof(char), 256);
+ strncpy(Match->SplicedBases, SplicedBases, 256);
+
+ // We know the exons, now we'll store all the splicing info for the match:
+ SpliceTail = NULL;
+ for (EdgeIndex = 0; EdgeIndex < AllEdgeCount; EdgeIndex++)
+ {
+ if (AllEdges[EdgeIndex]->Power)
+ {
+ SpliceNode = (PeptideSpliceNode*)calloc(sizeof(PeptideSpliceNode), 1);
+ if (ForwardFlag)
+ {
+ SpliceNode->DonorPos = AllEdges[EdgeIndex]->Source->End;
+ SpliceNode->AcceptorPos = AllEdges[EdgeIndex]->Exon->Start;
+ }
+ else
+ {
+ SpliceNode->DonorPos = AllEdges[EdgeIndex]->Source->Start;
+ SpliceNode->AcceptorPos = AllEdges[EdgeIndex]->Exon->End;
+ }
+ SpliceNode->ChromosomeNumber = LeftExon->Gene->ChromosomeNumber;
+ if (SpliceTail)
+ {
+ SpliceTail->Next = SpliceNode;
+ }
+ else
+ {
+ Match->SpliceHead = SpliceNode;
+ }
+ SpliceTail = SpliceNode;
+ }
+ }
+
+ //// %%% SANITY CHECK SPLICING %%%
+ //if (Match->SpliceHead && (!strstr(Match->SplicedBases, ";") && !strstr(Match->SplicedBases, ":")))
+ //{
+ // printf("Warning: Match found with no true splicing, but splice junction stored!\n");
+ // printf("%s %s\n", Match->Bases, Match->SplicedBases);
+ // printf("SpliceNode: %d-%d\n", Match->SpliceHead->DonorPos, Match->SpliceHead->AcceptorPos);
+ // DebugPrintGene(LeftExon->Gene);
+ //}
+ //if (!Match->SpliceHead && (strstr(Match->SplicedBases, ";") || strstr(Match->SplicedBases, ":")))
+ //{
+ // printf("Warning: Match found with true splicing, but splice junction not stored!\n");
+ // printf("%s %s\n", Match->Bases, Match->SplicedBases);
+ // //printf("SpliceNode: %d-%d\n", Match->SpliceHead->DonorPos, Match->SpliceHead->AcceptorPos);
+ // DebugPrintGene(LeftExon->Gene);
+ //}
+ } // if match
+ ExtensionCount = min(511, ExtensionCount + 1);
+ } // RightMatchIndex
+ } // LeftMatchIndex
+ } // Tag loop
+ return;
+}
+
+// Integrity checking of a gene. (For debugging use only)
+void CheckGene(GeneStruct* Gene)
+{
+ int ExonIndex;
+ int EdgeIndex;
+ ExonStruct* Exon;
+ ExonStruct* Exon2;
+ //
+ if (!Gene)
+ {
+ return;
+ }
+ for (ExonIndex = 0; ExonIndex < Gene->ExonCount; ExonIndex++)
+ {
+ Exon = Gene->Exons + ExonIndex;
+ if (Exon->Start < 0 || Exon->End < 0 || Exon->Start >= Exon->End)
+ {
+ printf("*ERROR\n");
+ }
+ for (EdgeIndex = 0; EdgeIndex < Exon->BackEdgeCount; EdgeIndex++)
+ {
+ Exon2 = Exon->BackwardEdges[EdgeIndex].Exon;
+ if (!Exon2)
+ {
+ printf("*ERROR!\n");
+ }
+ }
+ for (EdgeIndex = 0; EdgeIndex < Exon->ForwardEdgeCount; EdgeIndex++)
+ {
+ Exon2 = Exon->ForwardEdges[EdgeIndex].Exon;
+ if (!Exon2)
+ {
+ printf("*ERROR!\n");
+ }
+ }
+ }
+}
+
+// Given an exon, search it for tag matches. If AnchoredFlag is true, then we've already
+// matched part of the tag (and Root isn't the root of the entire trie)
+void GetSplicedTagMatches(SearchInfo* Info, ExonStruct* LeftExon, int LeftExonPos, ExonStruct* Exon,
+ TrieNode* Root, int AnchoredFlag)
+{
+ int AnchorMax;
+ int AnchorPos;
+ int OldExonPos;
+ int OldSplicePos;
+ char AA;
+ ExonStruct* BridgedExon;
+ TrieNode* CurrentNode;
+ TrieNode* SubNode;
+ int EdgeIndex;
+ int SequenceLength;
+ int SequencePos;
+ int OldTagBufferPos;
+ int OldTagBufferPos2;
+ int OldTagBufferPosSpliced;
+ int OldTagBufferPosSpliced2;
+
+ int Index = 0;
+ TrieTagHanger * TempTag = NULL;
+
+ //printf("New Cal!!!\n");
+ //fflush(stdout);
+
+ //
+ OldExonPos = g_TagExonArrayPos;
+ OldSplicePos = g_TagSpliceArrayPos;
+ OldTagBufferPos = g_TagBufferPos;
+ OldTagBufferPosSpliced = g_TagBufferPosSpliced;
+ SequenceLength = Exon->Length;
+ if (AnchoredFlag)
+ {
+ AnchorMax = min(1, SequenceLength); // it's possible that sequencelength is 0!
+ }
+ else
+ {
+ AnchorMax = SequenceLength;
+ }
+ //
+ // printf("Root: %p\n",Root);
+ //for(Index = 0; Index < TRIE_CHILD_COUNT; ++Index)
+ //{
+ // printf(" Child[%c] = %p\n",Index + 'A',Root->Children[Index]);
+ //}
+ //getchar();
+
+ // fflush(stdout);
+ //printf("AnchoredFlag %d sequencelen %d anchormax %d\n", AnchoredFlag, SequenceLength, AnchorMax);
+ //printf("Exon %d: %s\n",Exon->Index,Exon->Sequence);
+ for (AnchorPos = 0; AnchorPos < AnchorMax; AnchorPos++)
+ {
+ //printf("Seq char: %c\n",Exon->Sequence[AnchorPos]);
+ //fflush(stdout);
+ if (Exon->Sequence[AnchorPos] - 'A' >= 0 && Exon->Sequence[AnchorPos] - 'A' < TRIE_CHILD_COUNT)
+ {
+ CurrentNode = Root->Children[Exon->Sequence[AnchorPos] - 'A'];
+ }
+ else
+ {
+ CurrentNode = Root->Children['X' - 'A'];
+
+ printf("Searching Gene: %s Exon: %d/%d\n",Exon->Gene->Name, Exon->Index, Exon->Gene->ExonCount);
+ printf("Root: %p Transition: **%c**\n",Root,Exon->Sequence[AnchorPos]);
+ printf("ExonLength: %d\n",SequenceLength);
+ printf("Sequence: %s\n",Exon->Sequence);
+ printf("AnchorPos: %d\n",AnchorPos);
+ fflush(stdout);
+ }
+ if (!CurrentNode)
+ {
+ continue;
+ }
+ //printf("Current Node is not NULL!\n");
+
+ SequencePos = AnchorPos;
+ g_TagBufferPos = OldTagBufferPos;
+ g_TagBufferPosSpliced = OldTagBufferPosSpliced;
+ g_TagBuffer[g_TagBufferPos++] = Exon->Sequence[AnchorPos];
+ g_TagBufferSpliced[g_TagBufferPosSpliced++] = Exon->Sequence[AnchorPos];
+
+ // If we're performing a tagless search, then our tag may have length 1,
+ // so we could get matches right now:
+
+ if (CurrentNode->FirstTag)
+ {
+ if (AnchoredFlag)
+ {
+ GetSplicedMatches(Info, CurrentNode, LeftExon, LeftExonPos, Exon, AnchorPos);
+ }
+ else
+ {
+ GetSplicedMatches(Info, CurrentNode, Exon, AnchorPos, Exon, AnchorPos);
+ }
+ }
+
+ while (1)
+ {
+ SequencePos++;
+ //printf("Exon %d anchor %d sequence pos %d\n", Exon->Index, AnchorPos, SequencePos);
+ //fflush(stdout);
+ if (SequencePos >= SequenceLength)
+ {
+ //printf("Following an edge forward...\n");
+ //fflush(stdout);
+ // Try to follow any edges forward
+ OldTagBufferPos2 = g_TagBufferPos;
+ OldTagBufferPosSpliced2 = g_TagBufferPosSpliced;
+ for (EdgeIndex = 0; EdgeIndex < Exon->ForwardEdgeCount; EdgeIndex++)
+ {
+ g_TagBufferPos = OldTagBufferPos2;
+ g_TagBufferPosSpliced = OldTagBufferPosSpliced2;
+ AA = Exon->ForwardEdges[EdgeIndex].AA;
+ if (AA)
+ {
+ SubNode = CurrentNode->Children[AA-'A'];
+ g_TagBuffer[g_TagBufferPos++] = AA;
+ if (Exon->ForwardEdges[EdgeIndex].Power)
+ {
+ g_TagBufferSpliced[g_TagBufferPosSpliced++] = ';';
+ g_TagBufferSpliced[g_TagBufferPosSpliced++] = AA;
+ g_TagBufferSpliced[g_TagBufferPosSpliced++] = ';';
+ }
+ else
+ {
+ g_TagBufferSpliced[g_TagBufferPosSpliced++] = AA;
+ }
+ }
+ else
+ {
+ SubNode = CurrentNode;
+ if (Exon->ForwardEdges[EdgeIndex].Power > 0)
+ {
+ g_TagBufferSpliced[g_TagBufferPosSpliced++] = ':';
+ }
+
+ }
+ if (!SubNode)
+ {
+ continue;
+ }
+ BridgedExon = Exon->ForwardEdges[EdgeIndex].Exon;
+ if (AA)
+ {
+ g_TagExonArray[g_TagExonArrayPos++] = BridgedExon;
+ g_TagSpliceArray[g_TagSpliceArrayPos++] = Exon->ForwardEdges + EdgeIndex; //Exon->ForwardEdgePower[EdgeIndex];
+ }
+ if (SubNode->FirstTag && AA)
+ {
+ if (AnchoredFlag)
+ {
+ GetSplicedMatches(Info, SubNode, LeftExon, LeftExonPos, BridgedExon, -1);
+ }
+ else
+ {
+ GetSplicedMatches(Info, SubNode, Exon, AnchorPos, BridgedExon, -1);
+ }
+ }
+ if (!AA)
+ {
+ g_TagExonArray[g_TagExonArrayPos++] = BridgedExon;
+ g_TagSpliceArray[g_TagSpliceArrayPos++] = Exon->ForwardEdges + EdgeIndex; //Exon->ForwardEdgePower[EdgeIndex];
+ }
+
+ // We've now spanned an edge with our tag.
+ if (AnchoredFlag)
+ {
+ GetSplicedTagMatches(Info, LeftExon, LeftExonPos, BridgedExon, SubNode, 1);
+ }
+ else
+ {
+ GetSplicedTagMatches(Info, Exon, AnchorPos, BridgedExon, SubNode, 1);
+ }
+ g_TagExonArrayPos = OldExonPos;
+ g_TagSpliceArrayPos = OldSplicePos;
+ }
+ break;
+ } // following an edge forward
+ else
+ {
+ //printf("OldCurrNode: %p\n",CurrentNode);
+ CurrentNode = CurrentNode->Children[Exon->Sequence[SequencePos] - 'A'];
+ //printf("CurrentNode updated on %c!!!\n",Exon->Sequence[SequencePos]);
+ //printf("NewCurrNode: %p\n",CurrentNode);
+ //fflush(stdout);
+ g_TagBuffer[g_TagBufferPos++] = Exon->Sequence[SequencePos];
+ g_TagBufferSpliced[g_TagBufferPosSpliced++] = Exon->Sequence[SequencePos];
+ if (!CurrentNode)
+ {
+ break;
+ }
+ if (CurrentNode->FirstTag)
+ {
+ if (AnchoredFlag)
+ {
+ GetSplicedMatches(Info, CurrentNode, LeftExon, LeftExonPos, Exon, SequencePos);
+ }
+ else
+ {
+ GetSplicedMatches(Info, CurrentNode, Exon, AnchorPos, Exon, SequencePos);
+ }
+ }
+ }
+ } // sequencepos iteration
+ } // anchorpos
+}
+
+// Given an edge record for one exon, get the corresponding edge struct for the linked exon.
+// If ForwardFlag is 1, then the edge passed is a forward edge (and the reciprocal edge is a backward edge)
+// If ForwardFlag is 0, then the edge passed is a backward edge (and the reciprocal edge is a forward edge)
+ExonEdge* GetReciprocalExonEdge(ExonEdge* Edge, int ForwardFlag)
+{
+ int EdgeIndex;
+ ExonEdge* OtherEdge;
+ if (ForwardFlag)
+ {
+ for (EdgeIndex = 0; EdgeIndex < Edge->Exon->BackEdgeCount; EdgeIndex++)
+ {
+ OtherEdge = Edge->Exon->BackwardEdges + EdgeIndex;
+ if (OtherEdge->Exon == Edge->Source && OtherEdge->AA == Edge->AA)
+ {
+ return Edge->Exon->BackwardEdges + EdgeIndex;
+ }
+ }
+ }
+ else
+ {
+ for (EdgeIndex = 0; EdgeIndex < Edge->Exon->ForwardEdgeCount; EdgeIndex++)
+ {
+ OtherEdge = Edge->Exon->ForwardEdges + EdgeIndex;
+ if (OtherEdge->Exon == Edge->Source && OtherEdge->AA == Edge->AA)
+ {
+ return Edge->Exon->ForwardEdges + EdgeIndex;
+ }
+ }
+ }
+ INSPECT_ASSERT(0);
+ return NULL;
+}
+
+void SearchSplicableGene(SearchInfo* Info, GeneStruct* Gene)
+{
+ TrieNode* CurrentNode;
+ int ExonIndex;
+ int SequencePos;
+ int SequenceLength;
+ int EdgeIndex;
+ int VerboseFlag = 0;
+ ExonStruct* Exon;
+ ExonStruct* ActiveExon;
+ int TotalSequenceLength = 0;
+ int TotalExonCount = 0;
+ ExonEdge* Edge;
+ int Index;
+
+
+ // CheckGene(Gene);
+ //if (VerboseFlag)
+ //{
+ //printf("Gene %d: '%s' (%d exons)\n", Info->RecordNumber, Gene->Name, Gene->ExonCount);
+ //}
+ for (ExonIndex = 0; ExonIndex < Gene->ExonCount; ExonIndex++)
+ {
+ SequencePos = 0;
+ Exon = Gene->Exons + ExonIndex;
+ ActiveExon = Exon;
+ if (VerboseFlag)
+ {
+ printf("Search exon %d: '%s'\n", ExonIndex, Exon->Sequence);
+ }
+
+ SequenceLength = Exon->Length;
+ TotalExonCount++;
+ TotalSequenceLength += SequenceLength;
+
+ ////////////////////////////////////////////////////////////
+ // Try starting with each edge INTO the exon. XXX-T-AGXX
+ for (EdgeIndex = 0; EdgeIndex < Exon->BackEdgeCount; EdgeIndex++)
+ {
+ Edge = Exon->BackwardEdges + EdgeIndex;
+ if (!Edge->AA)
+ {
+ continue;
+ }
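+ // Debugging hook: the branch below is a deliberate no-op (self-assignment), apparently
+ // kept as a breakpoint target for one particular hard-coded exon.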
+ if (Edge->Source->Start == 31887068 && Edge->AA == 'D' && Edge->Source->Index == 463)
+ {
+ Edge = Edge;
+ }
+ g_TagExonArray[0] = Edge->Exon;
+ g_TagExonArray[1] = Exon;
+ g_TagExonArrayPos = 2;
+ g_TagSpliceArray[0] = GetReciprocalExonEdge(Edge, 0);
+ g_TagSpliceArrayPos = 1;
+ g_TagBuffer[0] = Edge->AA;
+ g_TagBufferPos = 1;
+ if (Edge->Power)
+ {
+ g_TagBufferSpliced[0] = ';';
+ g_TagBufferSpliced[1] = Edge->AA;
+ g_TagBufferSpliced[2] = ';';
+ g_TagBufferPosSpliced = 3;
+ }
+ else
+ {
+ g_TagBufferSpliced[0] = Edge->AA;
+ g_TagBufferPosSpliced = 1;
+ }
+ CurrentNode = Info->Root->Children[Edge->AA - 'A'];
+ if (CurrentNode)
+ {
+ GetSplicedTagMatches(Info, Edge->Exon, -1, Exon, CurrentNode, 1);
+ // Special for tagless search:
+ if (CurrentNode->FirstTag)
+ {
+ GetSplicedMatches(Info, CurrentNode, Edge->Exon, -1, Exon, -1);
+ }
+ }
+ }
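+ // Also try matches that begin inside this exon itself, with no incoming edge consumed: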
+ g_TagExonArray[0] = Exon;
+ g_TagExonArrayPos = 1;
+ g_TagSpliceArrayPos = 0;
+ g_TagBufferPos = 0;
+ g_TagBufferPosSpliced = 0;
+ GetSplicedTagMatches(Info, Exon, 0, Exon, Info->Root, 0);
+ } // loop over exons
+}
+
+// Main method for Spliced.c: Given a SearchInfo record, which carries the tag trie (Info->Root) and
+// an open binary splicedb (Info->DB), search for matches to the current list of spectra (list head
+// GlobalOptions->FirstSpectrum, but we get to them via back-links from tags).
+void SearchSplicableGenes(SearchInfo* Info)
+{
+ FILE* File;
+ GeneStruct* Gene;
+ int VerboseFlag = 0;
+ int RecordNumber = -1;
+ int TotalSequenceLength = 0;
+ int TotalExonCount = 0;
+ int Index;
+
+ //
+ AllocSpliceStructures();
+
+ File = Info->DB->DBFile;
+ if (!File)
+ {
+ printf("** Erorr: Unable to open gene database '%s'. No search performed.\n", Info->DB->FileName);
+ return;
+ }
+ fseek(File, 0, 0);
+ //printf("Gene: %s\n",Gene->Name);
+ printf("SEARCH SPLICEABLE GENES...\n");
+ printf("Root: %p\n",Info->Root);
+ for(Index = 0; Index < TRIE_CHILD_COUNT; ++Index)
+ {
+ printf(" Child[%c] = %p\n",Index + 'A',Info->Root->Children[Index+'A']);
+ }
+ //getchar();
+
+ fflush(stdout);
+
+
+ while (1)
+ {
+ RecordNumber++;
+ //StartTime = clock();
+ Gene = LoadGene(File);
+ if (!Gene)
+ {
+ break;
+ }
+ SearchSplicableGene(Info, Gene);
+
+ FreeGene(Gene);
+
+ } // while genes
+ printf("Searched %d genes, %d exons, %d bases\n", RecordNumber, TotalExonCount, TotalSequenceLength);
+}
+
+// For debugging: Exercise the splice functionality.
+void TestSplicing()
+{
+ if (!SLeftMatchBuffer)
+ {
+ AllocSpliceStructures();
+ }
+ LoadGenes("C:\\source\\Inspect\\Inspect\\SpliceDB\\Ch1.dat");
+ DebugPrintGenes();
+}
+
+
+// inspect test splicedb <DBPath> <Start> <End> [<DesiredProtein>]
+// Print out all genes which overlap the region of interest. Useful for asking
+// why a particular known protein isn't present in its entirety. If a protein
+// sequence is provided as well, we align that sequence against the exon graph,
+// determining how much of it is present in the database.
+void TestSpliceDB(int argc, char** argv)
+{
+ int StartPos;
+ int EndPos;
+ FILE* DBFile;
+ GeneStruct* Gene;
+ int GeneStart;
+ int GeneEnd;
+ int ExonIndex;
+ char* DesiredProtein = NULL;
+ int DesiredProteinLength;
+ FILE* ProteinFile;
+ int BytesRead;
+ //
+ if (argc < 4)
+ {
+ printf("** Not enough args - bailing out\n");
+ return;
+ }
+ DBFile = fopen(argv[3], "rb");
+ if (!DBFile)
+ {
+ printf("** Error: Can't open splicedb at '%s'\n", argv[3]);
+ return;
+ }
+ StartPos = 0;
+ EndPos = -1;
+ if (argc > 4)
+ {
+ StartPos = atoi(argv[4]);
+ }
+ if (argc > 5)
+ {
+ EndPos = atoi(argv[5]);
+ }
+ if (argc > 6)
+ {
+ // Read protein sequence:
+ ProteinFile = fopen(argv[6], "rb");
+ if (!ProteinFile)
+ {
+ printf("** Error: Can't read target protein sequence from '%s'\n", argv[6]);
+ return;
+ }
+ fseek(ProteinFile, 0, 2);
+ DesiredProteinLength = ftell(ProteinFile);
+ DesiredProtein = (char*)calloc(DesiredProteinLength + 1, sizeof(char)); // +1 leaves room for the null terminator written below
+ fseek(ProteinFile, 0, 0);
+ BytesRead = ReadBinary(DesiredProtein, sizeof(char), DesiredProteinLength, ProteinFile);
+ DesiredProtein[BytesRead] = '\0';
+ fclose(ProteinFile);
+ }
+ ////////////////////////
+ while (1)
+ {
+ Gene = LoadGene(DBFile);
+ if (!Gene)
+ {
+ break;
+ }
+ // Decide whether to print the gene:
+ GeneStart = -1;
+ GeneEnd = -1;
+ for (ExonIndex = 0; ExonIndex < Gene->ExonCount; ExonIndex++)
+ {
+ if (GeneStart < 0 || GeneStart > Gene->Exons[ExonIndex].Start)
+ {
+ GeneStart = Gene->Exons[ExonIndex].Start;
+ }
+ if (GeneEnd < 0 || GeneEnd < Gene->Exons[ExonIndex].End)
+ {
+ GeneEnd = Gene->Exons[ExonIndex].End;
+ }
+ }
+ if (GeneEnd >= StartPos && (EndPos < 0 || GeneStart < EndPos))
+ {
+ //DebugPrintGene(Gene);
+ if (DesiredProtein)
+ {
+ AlignSequenceAgainstExonGraph(Gene, DesiredProtein, NULL, -10, -1);
+ }
+ else
+ {
+ DebugPrintGene(Gene);
+ }
+ }
+ FreeGene(Gene);
+ }
+
+}
diff --git a/Spliced.h b/Spliced.h
new file mode 100644
index 0000000..5f311c9
--- /dev/null
+++ b/Spliced.h
@@ -0,0 +1,120 @@
+//Title: Spliced.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef SPLICED_H
+#define SPLICED_H
+
+#include <stdio.h>
+#include "Utils.h"
+#include "Trie.h"
+
+#define GENE_NAME_LENGTH 256
+// Maximum length (in AA) of an extension:
+#define MAX_EXTENSION_LENGTH 64
+// max length (in chars) of extension with splice chars included
+#define MAX_SEXTENSION_LENGTH 192
+// No gene can have this many exons or more:
+#define MAX_GENE_EXONS 50000
+
+#define MAX_EXTENSION_EXONS 128
+
+// Splice-aware database search code.
+// Here is our basic approach:
+// - Construct an exon-only nucleotide database, stored as a graph. Each node is an exon (in some reading frame). Edges
+// may contain one additional amino acid (to ''glue'' the extra nucleotides at the edges of the exons). This construction
+// is performed offline by the script SplicePrepare.py
+// - Using a trie of tags (built in Tagger.c, just as they are for ordinary search), search the graph.
+// Tags (and their extensions) may follow edges between nodes.
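+// Notation used for the "spliced" tag buffer built during search: an extension that crosses a
+// splice junction carrying an amino acid K is recorded as ";K;", a junction contributing no
+// amino acid is recorded as ":", and residues joined by plain adjacency edges are written
+// with no decoration.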
+
+typedef struct ExonEdge
+{
+ char AA; // 0 if this edge carries no amino acid
+ int Power; // if zero, this is an adjacent-edge and not a splice junction
+ struct ExonStruct* Source; // the source exon (DONOR, if this is a forward edge)
+ struct ExonStruct* Exon; // the target exon (ACCEPTOR, if this is a forward edge)
+
+ // We construct a linked list of exon edges when parsing from an XML file. Then, when
+ // the gene is complete, we convert the linked list to an array. The linked list
+ // uses the Next member; the finished array does not.
+ struct ExonEdge* Next;
+} ExonEdge;
+
+typedef struct ExonStruct
+{
+ int Start;
+ int End;
+ int Index;
+ char* Sequence;
+ int Length; // length of our sequence, in amino acids.
+ char Prefix[3];
+ char Suffix[3];
+ int BackEdgeCount;
+ int ForwardEdgeCount;
+ ExonEdge* ForwardEdges;
+ ExonEdge* BackwardEdges;
+ ExonEdge* BackEdgeHead; // used during XML parse only
+ ExonEdge* BackEdgeTail; // used during XML parse only
+ int Occurrences;
+ struct GeneStruct* Gene;
+} ExonStruct;
+
+// GeneStructs can be stored in a doubly-linked list.
+typedef struct GeneStruct
+{
+ char Name[GENE_NAME_LENGTH + 1];
+ char SprotName[GENE_NAME_LENGTH + 1];
+ int ChromosomeNumber;
+ int ForwardFlag;
+ int ExonCount; // Size of the Exons arrays
+ struct ExonStruct* Exons;
+ struct GeneStruct* Next;
+ struct GeneStruct* Prev;
+} GeneStruct;
+
+void TestSplicing(); // internal testing junk!
+
+// Main method for Spliced.c: Given a SearchInfo record, which carries the tag trie (Info->Root) and
+// an open binary splicedb (Info->DB), search for matches to the current list of spectra (list head
+// GlobalOptions->FirstSpectrum, but we get to them via back-links from tags).
+void SearchSplicableGenes(SearchInfo* Info);
+//void SearchSplicableGenes(TrieNode* Root, char* FileName, char** GeneNames, ScoringFunction Scorer,
+// int DBNumber);
+GeneStruct* LoadGene(FILE* File);
+void FreeGene(GeneStruct* Gene);
+void DebugPrintGene(GeneStruct* Gene);
+//void SearchSplicableGene(TrieNode* Root, GeneStruct* Gene, ScoringFunction Scorer, int DBNumber);
+void SearchSplicableGene(SearchInfo* Info, GeneStruct* Gene);
+void AllocSpliceStructures();
+void SetExonForwardEdges(GeneStruct* Gene);
+
+#endif // SPLICED_H
diff --git a/StripPTM.py b/StripPTM.py
new file mode 100644
index 0000000..56dea0b
--- /dev/null
+++ b/StripPTM.py
@@ -0,0 +1,117 @@
+#Title: StripPTM.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+A handy function to strip unnecessary modifications from a peptide.
+"""
+import Global
+from Utils import *
+Initialize()
+
+AMINO_ACIDS = "ACDEFGHIKLMNOPQRSTUVWY" # O and U are included, for now.
+INVALID_MASS = 99999
+
+def StripNeedlessModifications(DB, Annotation):
+ """
+ Replace "*" in annotations with "-".
+ Also, correct "unnecessary PTM" annotations.
+ Return (DBPos, FixedAnnotation) as a tuple.
+ We fix the following:
+ Y.A+|Y|BCD.Z -> YABCD
+ XY.A+|XY|BCD.Z -> XYABCD
+ JYZ.A+|XYZ|BCD.Z -> JYZABCD
+ X.ABCD+|Y|.Y -> ABCDY
+ X.ABCD+|YZ|.YZ -> ABCDYZ
+ X.ABCD+|YZJ|.YZH -> ABCDYZJ
+ """
+ VerboseFlag = 1
+ Peptide = GetPeptideFromModdedName(Annotation)
+ # Find where this peptide occurs within the database:
+ Aminos = Peptide.Aminos
+ if Peptide.Prefix in AMINO_ACIDS:
+ Aminos = Peptide.Prefix + Aminos
+ if Peptide.Suffix in AMINO_ACIDS:
+ Aminos += Peptide.Suffix
+ DBPos = DB.find(Aminos)
+ if Peptide.Prefix in AMINO_ACIDS:
+ DBPos += 1
+ if not Peptide.Modifications.keys():
+ # An unmodified peptide? We don't deal with those!
+ return (DBPos, Annotation.replace("*", "-"))
+ # Check whether a simple endpoint-shift can abolish all
+ # modifications:
+ ModIndex = Peptide.Modifications.keys()[0]
+ ModMass = Peptide.Modifications[ModIndex][0].Mass
+ # Try a shift to the LEFT:
+ if ModIndex < 3:
+ FlankMass = 0
+ NewAA = ""
+ for ShiftCharCount in (1, 2, 3):
+ if DBPos - ShiftCharCount < 0:
+ break
+ AA = DB[DBPos - ShiftCharCount]
+ FlankMass += Global.AminoMass.get(AA, INVALID_MASS)
+ NewAA = AA + NewAA # prepend to the new chars
+ if abs(FlankMass - ModMass) <= 2:
+ # The mass matches! Let's shift the annotation.
+ if (DBPos - ShiftCharCount > 0):
+ Prefix = DB[DBPos - ShiftCharCount - 1]
+ else:
+ Prefix = "-"
+ FixedAnnotation = "%s.%s%s.%s"%(Prefix, NewAA, Peptide.Aminos, Peptide.Suffix)
+ if VerboseFlag:
+ print "-%d The fix is in: %s to %s"%(ShiftCharCount, Annotation, FixedAnnotation)
+ return (DBPos - ShiftCharCount, FixedAnnotation.replace("*", "-"))
+ # Try a shift to the RIGHT:
+ if ModIndex >= len(Peptide.Aminos) - 3:
+ NewAA = ""
+ FlankMass = 0
+ OldEndpoint = DBPos + len(Peptide.Aminos) - 1
+ for ShiftCharCount in (1, 2, 3):
+ if OldEndpoint + ShiftCharCount >= len(DB):
+ print "Off the end of the DB with %s shifted by %d"%(Annotation, ShiftCharCount)
+ continue
+ AA = DB[OldEndpoint + ShiftCharCount]
+ FlankMass += Global.AminoMass.get(AA, INVALID_MASS)
+ NewAA += AA # append the new amino acid
+ if abs(FlankMass - ModMass) <= 2:
+ # The mass matches! Let's shift the annotation.
+ if (OldEndpoint + ShiftCharCount + 1)<len(DB):
+ Suffix = DB[OldEndpoint + ShiftCharCount + 1]
+ else:
+ Suffix = "-"
+ FixedAnnotation = "%s.%s%s.%s"%(Peptide.Prefix, Peptide.Aminos, NewAA, Suffix)
+ if VerboseFlag:
+ print "+%d The fix is in: %s to %s"%(ShiftCharCount, Annotation, FixedAnnotation)
+ return (DBPos, FixedAnnotation.replace("*", "-"))
+ # We can't edit away the PTM. Just fix any asterisks:
+ return (DBPos, Annotation.replace("*", "-"))
diff --git a/Summary.py b/Summary.py
new file mode 100644
index 0000000..ff7fa50
--- /dev/null
+++ b/Summary.py
@@ -0,0 +1,471 @@
+#Title: Summary.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+The new Summary script, modified for lower memory usage. This version
+assumes that the top annotation is the correct one, and does not consider
+re-assigning a spectrum to a close runner-up or to a homologous protein.
+
+- Iterate over annotations. Filter any that aren't a top-scoring annotation with
+good p-value.
+- Remember the best modified and the best unmodified annotation for each database interval
+- Generate the protein report: one sub-report for each protein, sorted by coverage
+- Sub-report includes a table of covered residues, followed by a list of peptide rows.
+- Each peptide row gives a spectrum count and score for the best spectrum
+- Peptide rows are sorted by protein position, then mass
+"""
+import os
+import time
+import sys
+import struct
+import traceback
+import shutil
+import getopt
+import Label
+import MakeImage
+import MSSpectrum
+import SelectProteins
+import ResultsParser
+from Utils import *
+Initialize()
+
+UsageInfo = """Summary.py - Parse search results, and generate either a webpage
+ summarizing the results, or a filtered database for unrestrictive PTM search.
+
+Required options:
+ -r [FileName] - The name of the results file to parse. If a directory is
+ specified, then all .txt files within the directory will be combined into
+ one report
+ -d [FileName] - The name of the database file (.trie format) searched.
+ (allows more than one database file; use multiple -d options)
+
+Additional options:
+ -b [FileName] - Second-pass database filename. If specified, the proteins
+ selected will be written out to a database (.fasta, .trie and .index files)
+ suitable for unrestrictive search.
+ -w [FileName] - Webpage directory. If specified, a webpage will be written
+ to the specified directory, summarizing the proteins identified, and the
+ degree of coverage.
+
+ (Note: If neither -b nor -w is given, a summary page is written to ProteinSummary/index.html by default)
+
+ -p [Value] - Cutoff p-value. Annotations with inferior p-value are ignored.
+ Defaults to 0.05.
+ -e [Count] - Minimum number of peptides that a protein must annotate in order
+ to add it to the report or the filtered database. Defaults to 1.
+ -m [Count] - Minimum number of spectra that a protein must annotate in order
+ to add it to the report or the filtered database. By default, this count
+ is set to (SpectrumCount / ProteinsInDatabase) * 2. If the protein
+ database has already been filtered, set this parameter to 1.
+ -v [Count] - Verbose spectrum output count. If set, report [Count] spectra
+ for each distinct peptide identified. This option is slower and
+ consumes more memory, but can be more informative.
+ -i [SpectraPath] - For use if verbose spectrum output (-v) is enabled.
+ Images will be generated for each annotation, if the Python Imaging
+ Library (PIL) is installed. This option generates many files on disk,
+ so it's recommended that you set the summary file (-w option) in its own
+ directory. "SpectraPath" is the path to the folder with MS2 spectra
+
+
+Examples:
+ Summary.py -r Frac1Output.txt -d Database%ssprot.trie -p 0.1
+ Summary.py -r Frac1Output.txt -d Database%ssprot.trie -w F1Summary\index.html -v
+"""%(os.sep, os.sep)
+
+class SummarizerClass(ResultsParser.ResultsParser):
+ def __init__(self):
+ # SharedProteinSpectra[ProteinA][ProteinB] = number of spectra for
+ # which the annotation is found in both protein A and protein B. When
+ # we accept one of the two proteins, the other one loses some annotations!
+ self.SharedProteinSpectra = {}
+ # SharedProteinPeptides is similar to SharedProteinSpectra, but tracks
+ # distinct peptide records
+ self.SharedProteinPeptides = {}
+ self.AnnotationCounts = {} # ProteinID -> count
+ # BestRepresentatives[(DBStart, DBEnd)] is a list of peptide
+ # instances for the best spectra for that position.
+ self.BestRepresentatives = {}
+ self.BestModRepresentatives = {}
+ self.ResultsFileName = None
+ self.DatabasePath = []
+ self.SecondPassDatabasePath = None
+ self.MinimumProteinHits = None
+ self.SummaryPagePath = None
+ self.MZXMLPath = None
+ self.PValueCutoff = 0.05
+ # Keys are peptides (after I->L substitution), values are lists of protein
+ # record numbers. We keep this dictionary so that we needn't repeatedly map
+ # the same protein to the database.
+ self.PeptideDict = {}
+ self.VerboseProteinReportCount = 0
+ self.GenerateSpectrumImagesFlag = 0
+ # Very short peptides are uninformative...skip them.
+ self.MinimumPeptideLength = 7
+ self.SpectrumCount = 0
+ self.IntervalHitCount = {}
+ self.MinimumPeptides = 1
+ ResultsParser.ResultsParser.__init__(self)
+ def GetSpectrumPath(self, Path, SpectraPath):
+ """
+ This requires a bit of trickery, because sometimes the results are
+ generated on a unix machine (creating a unix path), and this script
+ is run on a windows machine (which can't split a unix path). So I'm going
+ to hack things out on my own.
+ """
+ FileName = None
+ if Path.find("/") >= 0:
+ "results files made on a unix machine"
+ LastSlash = Path.rfind("/")
+ FileName = Path[LastSlash+1:]
+ else:
+ "results files made on a windows machine, hopefully. Any other users, go home"
+ LastBackSlash = Path.rfind("\\")
+ FileName = Path[LastBackSlash+1:]
+ if not FileName:
+ print "unable to create a path to the spectrum file %s"%Path
+ return Path
+ return os.path.join(SpectraPath,FileName)
+ def WriteSecondPassDatabase(self):
+ """
+ Write out the "present" proteins to our second-pass database.
+ self.ProteinSelector is responsible for deciding which peptides
+ belong to which proteins.
+ """
+ Bits = os.path.split(self.SecondPassDatabasePath)
+ DBPathStub = os.path.splitext(self.SecondPassDatabasePath)[0]
+ if len(Bits[0]) == 0:
+ DBPath = os.path.join("Database", "%s.trie"%DBPathStub)
+ IndexPath = os.path.join("Database", "%s.index"%DBPathStub)
+ FastaPath = os.path.join("Database", "%s.fasta"%DBPathStub)
+ else:
+ DBPath = DBPathStub + ".trie"
+ IndexPath = DBPathStub + ".index"
+ FastaPath = DBPathStub + ".fasta"
+ print "Writing second-pass database to %s..."%DBPath
+ DBFile = open(DBPath, "wb")
+ IndexFile = open(IndexPath, "wb")
+ FASTAFile = open(FastaPath, "wb")
+ DBFilePos = 0
+ for (ProteinID, ScoreTuple) in self.ProteinSelector.SelectedProteins.items():
+ (PeptideCount, SpectrumCount) = ScoreTuple
+ if SpectrumCount < self.MinimumProteinHits:
+ continue
+ if PeptideCount < self.MinimumPeptides:
+ continue
+ # Let's write out the protein. Write to the INDEX file, the
+ # TRIE file, and a FASTA file. (The FASTA file is just for
+ # humans to read)
+ ProteinName = self.ProteinSelector.ProteinNames[ProteinID]
+ ProteinSequence = self.ProteinSelector.ProteinSequences[ProteinID]
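+ # Each .index record is packed as "<qi80s": an 8-byte field (written as 0 here), the 4-byte
+ # offset of this protein within the .trie file, and the protein name padded to 80 bytes.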
+ Str = struct.pack("<qi80s", 0, DBFilePos, ProteinName[:80])
+ IndexFile.write(Str)
+ DBFile.write("%s*"%ProteinSequence)
+ DBFilePos += len(ProteinSequence) + 1
+ Pos = 0
+ FASTAFile.write(">%s\n"%ProteinName)
+ while Pos < len(ProteinSequence):
+ Chunk = ProteinSequence[Pos:Pos+70]
+ FASTAFile.write(Chunk)
+ FASTAFile.write("\n")
+ Pos += 70
+ IndexFile.close()
+ FASTAFile.close()
+ DBFile.close()
+ def GetProteinHREF(self, ProteinID, ProteinName):
+ # By default, just print the name with no hyperlinking.
+ # Subclass can override to hyperlink to IPI, swiss-prot, etc
+ return ProteinName
+ def WriteSummaryPage(self):
+ """
+ Produce the protein report. The index file contains the protein coverage
+ information. If verbose output is requested, also contains one row per peptide.
+ """
+ Dir = os.path.split(self.SummaryPagePath)[0]
+ try:
+ os.makedirs(Dir)
+ except:
+ pass
+ # Populate PeptidesForProtein. Keys are protein IDs. Values are lists
+ # of peptide annotations; the annotations are, in turn, keys for
+ # self.ProteinSelector.BestRepresentatives
+ self.PeptidesForProtein = {}
+ for (Annotation, RepList) in self.ProteinSelector.BestRepresentatives.items():
+ if len(RepList) < 1:
+ continue
+ Peptide = RepList[0][1]
+ ProteinID = self.ProteinSelector.PeptideProteins.get(Peptide.Aminos, None)
+ if ProteinID == None:
+ continue
+ PeptideList = self.PeptidesForProtein.get(ProteinID, [])
+ PeptideList.append(Annotation)
+ self.PeptidesForProtein[ProteinID] = PeptideList
+ # Sort proteins from "best" to "worst". For now, just sort by the
+ # number of distinct peptides.
+ SortedProteins = []
+ for (ProteinID, AnnotationList) in self.PeptidesForProtein.items():
+ AnnotationCount = len(AnnotationList)
+ if AnnotationCount >= self.MinimumPeptides:
+ SortedProteins.append((AnnotationCount, ProteinID))
+ SortedProteins.sort()
+ SortedProteins.reverse()
+ # Start the index file:
+ self.SummaryPageDir = os.path.split(self.SummaryPagePath)[0]
+ # Ensure the directory exists:
+ try:
+ os.makedirs(self.SummaryPageDir)
+ except:
+ pass
+ self.IndexFile = open(self.SummaryPagePath, "w")
+ self.IndexFile.write("<html><title>Protein Report</title>\n")
+ # Iterate over proteins, writing one record for each one:
+ for (PeptideCount, ProteinID) in SortedProteins:
+ ProteinName = self.ProteinSelector.ProteinNames[ProteinID]
+ if ProteinName[:3]== "XXX":
+ #print "Found a fake protein %s"%ProteinName
+ continue
+ #print "Write protein %s (%s), %s peptides"%(ProteinID, self.ProteinSelector.ProteinNames[ProteinID], PeptideCount)
+ self.WriteProteinSummary(ProteinID)
+ self.IndexFile.close()
+ def WriteProteinSummary(self, ProteinID):
+ """
+ Write summary page section for a single protein.
+ """
+ ##########################################################
+ # Determine coverage, and sort peptides by position within the protein:
+ ProteinName = self.ProteinSelector.ProteinNames[ProteinID]
+ ProteinSequence = self.ProteinSelector.ProteinSequences[ProteinID]
+ Coverage = [0] * len(ProteinSequence)
+ PeptideCount = len(self.PeptidesForProtein.get(ProteinID, []))
+ for Annotation in self.PeptidesForProtein.get(ProteinID, []):
+ Peptide = self.ProteinSelector.BestRepresentatives[Annotation][0][1]
+ MatchPos = ProteinSequence.find(Peptide.Aminos)
+ if MatchPos == -1:
+ print "** Error: Peptide '%s' assigned to incompatible protein %s '%s'"%(Peptide.Aminos, ProteinID, ProteinName)
+ continue
+ for Pos in range(MatchPos, MatchPos + len(Peptide.Aminos)):
+ Coverage[Pos] += 1
+ CoverFlags = 0
+ for CoverageCount in Coverage:
+ if CoverageCount:
+ CoverFlags += 1
+ CoverageRate = CoverFlags / float(len(ProteinSequence))
+ # Write header:
+ SpectrumCount = self.ProteinSelector.ProteinSpectrumCounts[ProteinID]
+ HREF = self.GetProteinHREF(ProteinID, ProteinName)
+ self.IndexFile.write("<h3>%s</h3>\n<b>%s peptides, %s spectra, %.1f%% coverage</b><br>\n"%(HREF, PeptideCount, SpectrumCount, CoverageRate*100))
+ # Write protein sequence:
+ ColorUncovered = "#aaaaaa"
+ ColorCovered = "#000000"
+ OldColor = ColorUncovered
+ OldBoldFlag = 0
+ BoldFlag = 0
+ self.IndexFile.write("<tt>")
+ for Pos in range(len(ProteinSequence)):
+ ResidueNumber = Pos + 1
+ if ResidueNumber%50 == 1:
+ if BoldFlag:
+ self.IndexFile.write("</b>")
+ BoldFlag = 0
+ self.IndexFile.write("</font><br>\n<font color=#000000>")
+ OldColor = ColorCovered
+ OldBoldFlag = 0
+ for Foo in range(4 - len(str(ResidueNumber))):
+ self.IndexFile.write(" ")
+ self.IndexFile.write("%d "%ResidueNumber)
+ if ResidueNumber % 10 == 1:
+ self.IndexFile.write(" ")
+ if Coverage[Pos]:
+ Color = ColorCovered
+ else:
+ Color = ColorUncovered
+ BoldFlag = 0
+ if Color != OldColor:
+ self.IndexFile.write("</font><font color=%s>"%Color)
+ OldColor = Color
+ if BoldFlag != OldBoldFlag:
+ if BoldFlag:
+ self.IndexFile.write("<b>")
+ else:
+ self.IndexFile.write("</b>")
+ OldBoldFlag = BoldFlag
+ self.IndexFile.write("%s"%ProteinSequence[Pos])
+ self.IndexFile.write("<br><br></font><font color=#000000></tt>\n\n")
+ ###############################################
+ # Write individual peptides, if requested:
+ if self.VerboseProteinReportCount:
+ # Write out peptides:
+ self.WritePeptideHeader(ProteinID, self.IndexFile)
+ SortedAnnotations = []
+ for Annotation in self.PeptidesForProtein.get(ProteinID, []):
+ RepresentativeList = self.ProteinSelector.BestRepresentatives[Annotation]
+ Peptide = RepresentativeList[0][1]
+ Pos = ProteinSequence.find(Peptide.Aminos)
+ SortedAnnotations.append((Pos, Pos + len(Peptide.Aminos) - 1, Annotation))
+ SortedAnnotations.sort()
+ for (StartPos, EndPos, Annotation) in SortedAnnotations:
+ IntervalString = "%s-%s"%(StartPos + 1, EndPos + 1)
+ TotalHitCount = self.ProteinSelector.AnnotationSpectrumCounts[Annotation]
+ RepresentativeList = self.ProteinSelector.BestRepresentatives[Annotation]
+ RepresentativeList.reverse() # they're sorted from worst-to-best; fix that.
+ for Index in range(len(RepresentativeList)):
+ Peptide = RepresentativeList[Index][-1]
+ self.WritePeptideLine(self.IndexFile, IntervalString, ProteinID, Index, Peptide, TotalHitCount)
+ self.WritePeptideFooter(self.IndexFile)
+ self.IndexFile.write("<hr>")
+ def WritePeptideFooter(self, IndexFile):
+ IndexFile.write("</table>\n")
+ def WritePeptideHeader(self, ProteinID, IndexFile):
+ IndexFile.write("<table><tr><td><b>Residues</b></td><td><b>Total Spectra</b></td><td><b>Peptide</b></td><td><b>p-value</b></td><td><b>MQScore</b></td><td><b>File</b></td><td><b>Scan</b></td></tr>")
+ def WritePeptideLine(self, File, IntervalStr, ProteinID, SpectrumIndex, Peptide, TotalHitCount):
+ Dir = os.path.split(self.SummaryPagePath)[0]
+ Annotation = Peptide.GetModdedName()
+ SpecFileName = Peptide.SpectrumFilePath.replace("/","\\").split("\\")[-1]
+ if self.GenerateSpectrumImagesFlag:
+ ImageFileName = "%s.%s.png"%(Annotation, SpectrumIndex)
+ ImageFilePath = os.path.join(Dir, ImageFileName)
+ Maker = MakeImage.MSImageMaker()
+ MSpectrum = MSSpectrum.SpectrumClass()
+ Path = self.GetSpectrumPath(Peptide.SpectrumFilePath, self.MZXMLPath)
+ FileName = "%s:%s"%(Path, Peptide.SpectrumFilePos)
+ try:
+ #SpectrumFile = Label.OpenAndSeekFile(FileName)
+ #print FileName
+ #MSpectrum.ReadPeaksFromFile(SpectrumFile, FileName)
+ #MSpectrum.RankPeaksByIntensity()
+ #SpectrumFile.close()
+ #Label.LabelSpectrum(MSpectrum, Peptide)
+ #Maker.ConvertSpectrumToImage(MSpectrum, ImageFilePath, Peptide, Width = 500, Height = 380)
+ Args = " -r %s -b %d -a %s -w %s -p"%(Path, int(Peptide.SpectrumFilePos), Annotation, ImageFilePath)
+ ArgsList = Args.split()
+ #print "Parsing Results for %s, scan %s"%(FileName, Scan)
+ Dymo = Label.LabelClass()
+ Dymo.ParseCommandLine(ArgsList)
+ #Dymo.LoadModel(0, Dymo.PeptideHasPhosphorylation)
+ Dymo.Main()
+
+ except:
+ traceback.print_exc()
+ File.write("<tr><td>%s</td><td>%s</td><td><a href=\"%s\">%s</td><td>%s</td><td>%s</td>"%(IntervalStr, TotalHitCount, ImageFileName, Peptide.GetFullModdedName(), Peptide.PValue, Peptide.MQScore))
+ File.write("<td>%s</td><td>%s</td></tr>\n"%(SpecFileName, Peptide.ScanNumber))
+ else:
+ File.write("<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td>"%(IntervalStr, TotalHitCount, Peptide.GetFullModdedName(), Peptide.PValue, Peptide.MQScore))
+ File.write("<td>%s</td><td>%s</td></tr>\n"%(SpecFileName, Peptide.ScanNumber))
+ def SetMinimumProteinHits(self):
+ ProteinCount = len(self.ProteinNames)
+ self.MinimumProteinHits = (2 * self.SpectrumCount) / ProteinCount
+ self.MinimumProteinHits = max(self.MinimumProteinHits, 2)
+ print "%s spectra with a valid annotation, %s proteins in all"%(self.SpectrumCount, ProteinCount)
+ print "Minimum hits required to accept an additional protein: ", self.MinimumProteinHits
+ def Main(self):
+ self.ProteinSelector = SelectProteins.ProteinSelector()
+ print self.PValueCutoff
+ self.ProteinSelector.PValueCutoff = self.PValueCutoff
+ self.ProteinSelector.LoadMultipleDB(self.DatabasePath)
+ # If we're expected to write out a summary page, then keep track
+ # of the top N representatives for each annotation:
+ if self.SummaryPagePath:
+ if self.VerboseProteinReportCount:
+ self.ProteinSelector.RetainRepresentativeCount = self.VerboseProteinReportCount
+ else:
+ self.ProteinSelector.RetainRepresentativeCount = 1
+ self.ProcessResultsFiles(self.ResultsFileName, self.ProteinSelector.ParseAnnotations)
+ self.ProteinSelector.ChooseProteins()
+ if self.SecondPassDatabasePath:
+ self.WriteSecondPassDatabase()
+ print "Second-pass database written to:", self.SecondPassDatabasePath
+ if self.SummaryPagePath:
+ self.WriteSummaryPage()
+ print "Summary page written to:", self.SummaryPagePath
+ def ParseCommandLine(self, Arguments):
+ (Options, Args) = getopt.getopt(Arguments, "r:d:b:p:w:m:v:i:e:")
+ OptionsSeen = {}
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-r":
+ # -r results file(s)
+ if not os.path.exists(Value):
+ print "** Error: couldn't find results file '%s'\n\n"%Value
+ print UsageInfo
+ sys.exit(1)
+ self.ResultsFileName = Value
+ elif Option == "-d":
+ # -d database
+ if not os.path.exists(Value):
+ print "** Error: couldn't find database file '%s'\n\n"%Value
+ print UsageInfo
+ sys.exit(1)
+ self.DatabasePath.append(Value)
+ elif Option == "-b":
+ # -b Second-pass database
+ self.SecondPassDatabasePath = Value
+ elif Option == "-i":
+ self.GenerateSpectrumImagesFlag = 1
+ self.MZXMLPath = Value
+ elif Option == "-m":
+ # -m Minimum number of spectra for a new protein
+ self.MinimumProteinHits = int(Value)
+ elif Option == "-e":
+ # -e Minimum number of peptides for a new protein
+ self.MinimumPeptides = int(Value)
+ elif Option == "-w":
+ # -w Summary page filename
+ self.SummaryPagePath = Value
+ elif Option == "-p":
+ # -p p-value cutoff
+ self.PValueCutoff = float(Value)
+ print self.PValueCutoff
+ elif Option == "-v":
+ # -v Verbose output flag
+ self.VerboseProteinReportCount = int(Value)
+ # Error out, if we didn't see required options:
+ if not OptionsSeen.has_key("-d") or not OptionsSeen.has_key("-r"):
+ print "** Please specify database (-d) and results file (-r)"
+ print UsageInfo
+ sys.exit(1)
+ # If neither -b nor -w was specified, assume they want a summary:
+ if not OptionsSeen.has_key("-b") and not OptionsSeen.has_key("-w"):
+ self.SummaryPagePath = os.path.join("ProteinSummary", "index.html")
+ print "** Summary page will be written to '%s'; use -w to override this"%Summarizer.SummaryPagePath
+ print "Summary page path:", self.SummaryPagePath
+
+if __name__ == "__main__":
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "(psyco not found - running in non-optimized mode)"
+ Summarizer = SummarizerClass()
+ Summarizer.ParseCommandLine(sys.argv[1:])
+ StartTime = time.clock()
+ Summarizer.Main()
+ EndTime = time.clock()
+ print "ELAPSED:", EndTime - StartTime
diff --git a/SystemTest.py b/SystemTest.py
new file mode 100644
index 0000000..d49c97a
--- /dev/null
+++ b/SystemTest.py
@@ -0,0 +1,251 @@
+#Title: SystemTest.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+SystemTest.py is the master test script for the Inspect toolkit.
+It should be run after the inspect executable has been installed to
+the current directory (and built, if necessary).
+
+Run with no command-line arguments to perform a full test.
+"""
+import os
+import sys
+import traceback
+try:
+ from Utils import *
+ Initialize()
+except:
+ print "** Error: Unable to load Utils!"
+if hasattr(os, "sysconf"):
+ IS_WINDOWS = 0
+else:
+ IS_WINDOWS = 1
+
+if IS_WINDOWS:
+ INSPECT_EXECUTABLE = "inspect"
+else:
+ INSPECT_EXECUTABLE = "./inspect"
+
+SystemTestDir = "SystemTest"
+
+class InspectRunner:
+ def __init__(self):
+ self.ErrorCount = 0
+ self.TestsRun = 0
+ self.TempOutputName = "SystemTestTemp.txt"
+ def RunTestSearch(self, InputFileName, DesiredPeptide):
+ "Run inspect, and verify that the desired peptide is the top match."
+ Command = "%s -i %s -o %s"%(INSPECT_EXECUTABLE, InputFileName, self.TempOutputName)
+ print Command
+ self.TestsRun += 1
+ try:
+ # Remove old output before running test:
+ if os.path.exists(self.TempOutputName):
+ os.remove(self.TempOutputName)
+ # Run inspect:
+
+ os.system(Command)
+ except:
+ traceback.print_exc()
+ self.ErrorCount += 1
+ return
+ self.VerifyTestSearchResults(InputFileName, self.TempOutputName, DesiredPeptide)
+ def VerifyTestSearchResults(self, InputFileName, OutputFileName, DesiredPeptide):
+ if not os.path.exists(OutputFileName):
+ print "** Error: No test output written for input '%s' to %s"%(InputFileName, OutputFileName)
+ self.ErrorCount += 1
+ return
+ File = open(OutputFileName, "rb")
+ GoodHitPosition = None
+ HitIndex = 0
+ TopHit = None
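+ # Results lines are tab-delimited; Bits[2] holds the annotation (with flanking residues)
+ # and Bits[5] holds the match score. Lines that fail the float() call are header lines.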
+ for FileLine in File.xreadlines():
+ Bits = FileLine.split("\t")
+ try:
+ Score = float(Bits[5])
+ except:
+ continue # header line
+ if not TopHit:
+ TopHit = Bits[2][2:-2]
+ HitIndex += 1
+ if Bits[2][2:-2] == DesiredPeptide:
+ GoodHitPosition = HitIndex
+ break
+ if GoodHitPosition == 1:
+ print "Test '%s' passed - top hit was '%s'"%(InputFileName, DesiredPeptide)
+ return
+ self.ErrorCount += 1
+ print "** Error for test '%s':\n Top hit was '%s'\n Desired hit '%s' was seen at position %s"%(InputFileName, TopHit, DesiredPeptide, GoodHitPosition)
+ def Summarize(self):
+ print
+ print "-=- "*18
+ print "System test summary: Ran %s tests, encountered %s errors."%(self.TestsRun, self.ErrorCount)
+ def TestParentMassCorrection(self):
+ # TestSpectra.pkl:
+ # K.VLVLDTDYK.K, K.CLMEGAGDVAFVK.H, R.TPEVDDEALEK.F
+ InputFileName = os.path.join(SystemTestDir, "TestPMC.txt")
+ Command = "%s -i %s -o %s"%(INSPECT_EXECUTABLE, InputFileName, self.TempOutputName)
+ self.TestsRun += 1
+ print Command
+ os.system(Command)
+ if not os.path.exists(self.TempOutputName):
+ print "Error: TestPMC produced no output!"
+ self.ErrorCount += 1
+ return
+ File = open(self.TempOutputName, "rb")
+ Bits = File.readline().split("\t")
+ File.close()
+ if len(Bits) < 5:
+ print "* Error: TestPMC produced invalid output!"
+ self.ErrorCount += 1
+ return
+ Mass = float(Bits[3])
+ Charge = int(Bits[4])
+ DesiredCharge = 2
+ DesiredMass = 1065.6
+ if Charge != DesiredCharge or abs(Mass - DesiredMass) > 1.0:
+ print "* Error: TestPMC produced invalid charge+mass (%s, %s), should be (%s, %s)"%(Charge, Mass, DesiredCharge, DesiredMass)
+ self.ErrorCount += 1
+ else:
+ print "TestPMC successful: Parent mass %s within tolerance"%Mass
+ print "Parent mass correction complete."
+
+ def TestTagging(self, InputFileName, Annotation, TagLength = None):
+ """
+ Run inspect in tag-generation mode. Verify that one or more of the
+ tags are correct for the target peptide (Annotation, a string).
+ """
+ DesiredPeptide = GetPeptideFromModdedName(Annotation)
+ Command = "%s -i %s -o %s"%(INSPECT_EXECUTABLE, InputFileName, self.TempOutputName)
+ self.TestsRun += 1
+ try:
+ # Remove old output before running test:
+ if os.path.exists(self.TempOutputName):
+ os.remove(self.TempOutputName)
+ # Run inspect:
+ print Command
+ os.system(Command)
+ except:
+ traceback.print_exc()
+ self.ErrorCount += 1
+ return
+ if not os.path.exists(self.TempOutputName):
+ print "** Error: No test output written for input '%s' to %s"%(InputFileName, self.TempOutputName)
+ self.ErrorCount += 1
+ return
+ ValidTagCount = 0
+ TagCount = 0
+ File = open(self.TempOutputName, "rb")
+ for FileLine in File.xreadlines():
+ Bits = FileLine.split("\t")
+ if FileLine[0] == "#" or len(Bits) < 7:
+ continue
+ #print Bits
+ TagAminos = Bits[5]
+ if TagLength != None and len(TagAminos) != TagLength:
+ print "* Error in test '%s': Tag has length %s != %s"%(InputFileName, len(TagAminos), TagLength)
+ self.ErrorCount += 1
+ Tag = PeptideClass(Bits[5])
+ Tag.PrefixMass = float(Bits[4])
+ Tag.SuffixMass = float(Bits[6])
+ if DesiredPeptide.IsValidTag(Tag):
+ ValidTagCount += 1
+ TagCount += 1
+ if not ValidTagCount:
+ print "* Test '%s' failed: No valid tags among %s attempts, for peptide %s"%(InputFileName, TagCount, DesiredPeptide.GetModdedName())
+ self.ErrorCount += 1
+ else:
+ print "Tag test successful - found %s valid tags"%ValidTagCount
+ def TestMS2DBConstruction(self):
+ InputFileName = os.path.join(SystemTestDir, "BuildSimpleChromosome.txt")
+ TempDBPath = "Temp.ms2db"
+ Command = "%s -i %s -o %s"%(INSPECT_EXECUTABLE, InputFileName, TempDBPath)
+ try:
+ print Command
+ os.system(Command)
+ except:
+ print Command
+ traceback.print_exc()
+ self.ErrorCount += 1
+ try:
+ File = open(TempDBPath)
+ except:
+ print "** MS2DB test failed: No db constructed"
+ self.ErrorCount += 1
+ return
+ MS2DB = File.read()
+ File.close()
+ #Pos = MS2DB.find("RERERERA")
+ Pos = MS2DB.find("RERE")
+ if Pos == -1:
+ print "** MS2DB test failed: Expected peptide '%s' not present"%"RERE"
+ self.ErrorCount += 1
+ # Now that the database has been constructed, let's search it:
+ # TempInputFileName = "TempMS2DB.in"
+ # TempScriptFile = open(TempInputFileName, "wb")
+ # TempScriptFile.write("db,%s\n"%os.path.abspath(TempDBPath))
+ # TempScriptFile.write("spectra,SystemTest/TestSpectrum.dta\n")
+ # TempScriptFile.write("protease,None\n")
+ # TempScriptFile.write("mod,+57,C,fix\n")
+ # TempScriptFile.close()
+ # Command = "%s -i %s -o %s"%(INSPECT_EXECUTABLE, TempInputFileName, self.TempOutputName)
+ # self.TestsRun += 1
+ #try:
+ # Remove old output before running test:
+ # if os.path.exists(self.TempOutputName):
+ # os.remove(self.TempOutputName)
+ # Run inspect:
+ #print Command
+ #os.system(Command)
+ #except:
+ # traceback.print_exc()
+ # self.ErrorCount += 1
+ # return
+ #self.VerifyTestSearchResults(TempInputFileName, self.TempOutputName, "VKEAMAPK")
+ #try:
+ # os.remove(TempInputFileName)
+ #except:
+ # pass
+ #print "MS2DB search complete"
+ def RunTests(self):
+ self.TestMS2DBConstruction()
+ self.TestTagging(os.path.join(SystemTestDir, "TestInputTag1.txt"), "VKEAMAPK", TagLength = 1)
+ self.TestTagging(os.path.join(SystemTestDir, "TestInputTag3.txt"), "VKEAMAPK", TagLength = 3)
+ self.RunTestSearch(os.path.join(SystemTestDir, "TestInput.txt"), "VKEAMGuserPK")
+ self.RunTestSearch(os.path.join(SystemTestDir, "TestInputMod.txt"), "VKEAMG+14PK")
+ self.RunTestSearch(os.path.join(SystemTestDir, "TestMS2.txt"), "AAEAATTDLTYR")
+ self.RunTestSearch(os.path.join(SystemTestDir, "TestCDTA.txt"), "EIQIAEATVPK");
+ self.TestParentMassCorrection()
+ self.Summarize()
+
+if __name__ == "__main__":
+ Runner = InspectRunner()
+ Runner.RunTests()
diff --git a/SystemTest/BuildSimpleChromosome.txt b/SystemTest/BuildSimpleChromosome.txt
new file mode 100644
index 0000000..540ec59
--- /dev/null
+++ b/SystemTest/BuildSimpleChromosome.txt
@@ -0,0 +1,3 @@
+ReadGFF,SystemTest/SimpleGenes.gff
+GenomeFile,SystemTest/SimpleChromosome.trie
+ChromosomeName,simple
diff --git a/SystemTest/Shew_Short.fasta b/SystemTest/Shew_Short.fasta
new file mode 100644
index 0000000..a47f3c5
--- /dev/null
+++ b/SystemTest/Shew_Short.fasta
@@ -0,0 +1,20 @@
+>SO_0548 DNA-binding protein, HU family
+MNKTELIAKIAENADITKAQATRALKSFEAAITESMKNGDKISIVGFGSFETTTRAARTG
+RNPQTGKEIQIAEATVPKFKAGKTLRDSVN
+>SO_3146 DNA-binding protein, H-NS family
+MSEFLEILTHGRRFKAAVKDLSVEELRDLAAKLDKILVERESMEAEELQAIAARNAKIEE
+IRQQMEAVGLSIDDLGGVAVKASSKKRAPRPAKYQIEVDGEVIQWTGQGRMPTVFKNEVN
+KGRSMDDFLI
+>SO_1126 chaperone protein, DnaK
+MGKIIGIDLGTTNSCVAVLDGGKARVLENAEGDRTTPSIIAYTDDETIVGQPAKRQAVTN
+PNNTFFAIKRLIGRRFKDDEVQRDVNIMPFKIIAADNGDAWVESRGNKMAPPQVSAEILK
+KMKKTAEDFLGEEVTEAVITVPAYFNDSQRQATKDAGRIAGLEVKRIINEPTAAALAYGI
+DKKQGDNIVAVYDLGGGTFDISIIEIDSNDGDQTFEVLATNGDTHLGGEDFDNRLINYLA
+DEFKKEQGLDLRKDPLAMQRLKEAAEKAKIELSSTNQTEVNLPYITADATGPKHLVVKIT
+RAKLESLVEDLIIRTLEPLKVALADADLSVSDINEVILVGGQTRMPKVQEAVTNFFGKEP
+RKDVNPDEAVAVGAAIQAGVLSGDVKDVLLLDVTPLSLGIETMGSVMTKLIEKNTTIPTK
+AQQVFSTADDNQSAVTIHVLQGERKQASANKSLGQFNLDGIEPAPRGMPQIEVMFDIDAD
+GILHVSATDKKTGKKQNITIKASSGLSEEEVAQMVRDAEAHAEEDKKFEELVQSRNQADG
+LVHATKKQVEEAGDALPSEDKAKIEAAMSAVETAVKGNDKEAIEKATQALIEASAKLMEI
+AQAKAQTQGGAQEGAAKQSNATADDVVDAEFEEVKDDKK
+
diff --git a/SystemTest/Shew_dta.txt b/SystemTest/Shew_dta.txt
new file mode 100644
index 0000000..d333905
--- /dev/null
+++ b/SystemTest/Shew_dta.txt
@@ -0,0 +1,1451 @@
+=================================== "QC_Shew_07_02-pt3_31Mar07_OWL_07-03-07.3186.3186.1.dta" ==================================
+1319.69421 1 scan=3186 cs=1
+379.036 4.3
+381.198 8.2
+382.280 4.6
+383.081 4.6
+384.461 18.4
+385.142 5.0
+385.989 3.5
+387.036 25.9
+389.413 3.0
+394.441 4.8
+397.267 18.2
+398.925 4.1
+400.437 5.0
+403.206 14.6
+404.342 12.7
+407.127 26.3
+408.789 17.0
+409.516 24.5
+410.246 9.5
+411.799 24.6
+412.507 39.6
+413.335 47.0
+418.426 64.2
+419.234 17.9
+419.877 11.0
+421.143 11.9
+423.140 9.1
+424.066 8.7
+425.275 8.6
+426.279 37.6
+427.542 14.0
+429.523 9.0
+430.389 16.7
+435.210 28.8
+437.190 25.6
+439.275 7.7
+440.161 3.0
+442.548 13.6
+443.429 7.0
+445.412 5.2
+450.835 74.9
+452.336 10.6
+453.324 20.4
+454.037 48.8
+456.057 39.7
+457.631 31.3
+458.542 3.8
+460.223 3.0
+462.976 17.8
+464.093 12.5
+465.239 11.1
+466.437 3.5
+467.071 9.9
+468.181 47.9
+469.084 11.9
+470.165 36.4
+471.671 45.1
+472.475 19.7
+473.296 5.4
+474.256 14.7
+478.120 3.3
+479.761 14.6
+480.520 41.8
+481.188 2.1
+482.360 22.0
+483.471 37.7
+484.328 36.0
+485.116 5.8
+486.098 14.5
+487.285 10.4
+490.121 10.1
+493.558 13.7
+494.674 17.1
+495.418 11.1
+496.228 57.9
+497.427 2.2
+498.064 6.7
+500.060 10.2
+501.391 20.8
+503.038 5.1
+504.394 9.1
+507.419 15.6
+508.318 3.2
+510.254 24.5
+511.358 62.9
+512.397 21.6
+513.267 35.2
+514.259 16.5
+516.292 40.5
+518.346 14.2
+520.083 20.5
+521.388 17.7
+522.031 11.0
+523.228 61.1
+525.239 128.6
+526.396 19.9
+528.065 3.9
+529.502 32.6
+530.351 12.9
+531.355 51.6
+532.423 6.1
+533.399 10.4
+536.156 3.7
+537.109 25.8
+538.273 45.8
+539.218 129.9
+540.331 34.9
+541.180 54.1
+542.175 8.1
+543.263 48.6
+544.065 4.3
+545.348 22.1
+546.042 4.0
+547.433 210.6
+548.230 13.3
+551.507 22.1
+553.232 117.1
+554.234 17.3
+555.243 33.3
+556.489 32.3
+557.416 20.6
+558.486 6.5
+559.345 39.5
+560.324 3.4
+561.285 37.0
+563.346 6.3
+565.057 11.6
+566.089 28.3
+567.389 37.8
+568.179 20.3
+569.226 17.8
+570.043 12.3
+571.193 21.1
+573.616 4.1
+577.264 4.6
+578.458 8.8
+579.367 25.6
+582.212 11.7
+582.870 17.4
+585.097 52.0
+586.175 25.3
+587.215 27.9
+589.419 34.8
+592.514 31.7
+593.269 8.5
+594.515 18.4
+596.365 17.9
+597.399 54.4
+598.213 50.7
+599.302 30.6
+601.149 18.9
+602.589 6.5
+603.358 32.6
+604.145 18.2
+605.882 14.4
+606.622 9.7
+607.726 8.5
+608.536 28.4
+609.303 31.0
+610.516 6.3
+611.315 18.8
+612.249 7.5
+613.302 5.4
+614.446 27.4
+615.239 6.7
+616.151 12.7
+616.790 3.4
+617.482 2.4
+619.127 5.1
+620.914 28.2
+622.178 15.0
+623.426 15.3
+624.206 102.6
+625.494 18.7
+626.392 49.5
+627.441 17.7
+628.529 55.5
+629.192 12.3
+630.154 14.6
+631.157 18.5
+632.405 18.5
+635.468 5.2
+636.203 33.7
+637.183 7.4
+638.471 52.6
+639.307 25.5
+640.354 110.2
+642.279 126.2
+643.490 20.9
+644.323 34.2
+646.478 100.4
+647.640 15.8
+650.268 6.3
+651.290 22.8
+652.184 9.5
+653.243 9.4
+654.237 188.5
+655.164 52.0
+656.363 119.7
+657.380 3.6
+658.331 16.3
+659.246 29.6
+660.373 89.4
+661.274 27.1
+664.606 20.2
+665.326 36.1
+666.117 23.8
+667.660 3.9
+668.662 15.9
+669.628 17.1
+670.284 45.0
+672.315 87.5
+673.889 34.3
+674.522 32.5
+675.242 16.3
+677.354 18.2
+678.393 17.9
+679.441 59.8
+681.037 39.6
+681.642 37.0
+682.359 43.1
+684.154 81.4
+685.299 10.0
+686.357 44.1
+687.039 9.3
+690.548 15.1
+691.449 5.3
+694.459 6.9
+695.476 57.2
+696.453 48.7
+697.286 28.9
+698.294 55.6
+699.268 27.0
+700.183 7.4
+702.315 52.4
+704.496 22.2
+705.149 3.3
+706.418 25.8
+707.075 23.5
+708.232 10.4
+709.055 20.5
+709.938 20.1
+711.108 40.0
+712.412 8.4
+713.165 29.2
+713.844 32.4
+714.991 51.8
+716.203 14.3
+717.399 48.2
+718.585 29.6
+719.302 5.4
+720.376 16.8
+721.673 13.9
+722.570 9.9
+723.340 6.9
+724.336 2.6
+725.262 101.1
+726.195 54.1
+727.373 64.2
+728.391 61.7
+729.381 41.3
+731.811 17.2
+736.004 5.0
+737.154 44.5
+738.087 44.8
+739.216 91.0
+740.052 43.4
+741.332 61.5
+742.439 55.5
+743.363 133.7
+744.613 44.4
+745.431 60.7
+746.301 23.3
+751.102 38.6
+752.792 64.2
+753.594 37.2
+754.666 63.6
+755.413 119.5
+756.543 44.4
+757.349 43.9
+757.971 8.1
+761.529 54.9
+765.391 42.9
+766.250 23.9
+767.323 84.7
+768.381 22.3
+769.410 62.1
+770.292 6.8
+771.387 105.0
+772.619 71.9
+773.370 79.3
+774.283 48.8
+778.185 12.3
+779.705 10.6
+781.583 25.5
+782.466 5.8
+783.430 31.0
+784.320 58.9
+785.461 57.0
+786.288 41.6
+787.395 28.8
+788.252 11.8
+789.139 13.5
+789.776 5.9
+791.551 2.2
+793.303 16.9
+794.060 11.8
+795.035 21.0
+796.131 90.0
+797.107 41.2
+798.452 21.0
+799.209 43.9
+800.281 27.0
+803.150 46.9
+804.444 207.6
+805.377 23.3
+807.713 7.4
+808.333 53.2
+810.057 98.5
+811.301 76.0
+812.197 24.5
+813.286 45.9
+814.273 147.4
+815.378 123.6
+816.303 3.4
+817.366 13.7
+822.894 14.4
+825.154 38.5
+826.298 70.5
+827.208 34.1
+828.306 65.9
+829.476 28.7
+830.815 12.7
+831.472 8.6
+832.940 17.9
+833.589 15.7
+835.750 9.1
+836.569 52.1
+837.445 12.1
+838.199 15.6
+839.645 37.3
+841.237 7.1
+842.523 31.5
+843.501 40.4
+844.499 71.9
+845.468 15.4
+846.257 17.9
+849.330 23.9
+851.529 26.2
+854.089 189.8
+855.297 83.1
+856.337 62.8
+857.746 12.5
+860.363 11.6
+861.145 8.7
+863.844 13.6
+864.606 7.6
+867.343 19.9
+868.169 24.2
+869.425 6.5
+870.398 12.2
+871.664 39.6
+872.415 34.2
+873.302 24.7
+873.909 12.0
+874.554 41.2
+875.794 11.0
+880.570 16.7
+882.619 21.3
+883.433 36.9
+884.370 48.0
+885.604 38.0
+886.302 72.3
+887.129 34.8
+888.852 21.3
+890.733 10.2
+895.586 55.4
+896.444 2.3
+898.212 36.6
+899.477 10.6
+900.817 19.0
+902.167 125.0
+903.170 30.2
+904.399 17.7
+907.529 22.7
+908.503 6.1
+909.558 43.1
+910.463 3.8
+911.353 8.2
+913.197 66.1
+914.224 4.1
+915.128 12.7
+916.411 49.9
+917.245 22.9
+918.455 9.0
+919.108 8.7
+920.771 2.6
+922.460 17.4
+923.908 6.6
+925.252 196.8
+926.431 130.1
+927.383 76.4
+928.372 5.6
+930.245 9.6
+931.629 34.5
+932.409 13.2
+934.795 15.2
+935.477 32.7
+936.499 41.1
+937.373 25.7
+938.570 11.4
+940.057 39.0
+941.380 9.9
+943.258 224.8
+944.245 96.7
+945.469 34.9
+946.829 6.5
+948.373 12.1
+949.374 25.1
+951.471 29.4
+952.763 20.7
+953.608 31.6
+954.801 14.8
+955.523 9.6
+956.516 35.1
+957.273 14.3
+958.379 26.1
+959.488 22.2
+960.284 16.4
+961.412 44.5
+962.246 5.8
+967.325 62.2
+968.487 15.5
+969.233 36.5
+971.361 101.5
+973.415 30.4
+974.830 22.3
+981.311 11.8
+983.151 8.7
+985.215 183.0
+986.022 37.3
+986.653 61.6
+987.631 16.7
+988.515 7.8
+993.842 6.1
+996.921 17.4
+997.686 19.7
+998.521 8.0
+1001.468 2.8
+1003.288 53.3
+1004.421 29.2
+1006.367 72.5
+1007.250 75.4
+1009.506 12.6
+1011.265 17.8
+1012.564 8.2
+1013.579 21.3
+1015.196 29.8
+1020.111 25.9
+1021.011 13.9
+1021.967 8.0
+1024.431 78.7
+1025.297 11.8
+1025.926 13.7
+1027.360 10.8
+1028.639 5.4
+1029.713 36.3
+1031.225 10.7
+1032.864 5.9
+1036.760 6.1
+1038.344 86.7
+1039.144 47.8
+1039.786 16.7
+1041.324 20.5
+1042.164 71.8
+1043.127 45.6
+1044.615 34.9
+1045.645 30.6
+1047.153 15.3
+1054.530 14.7
+1056.300 317.6
+1057.341 149.8
+1059.372 21.1
+1060.367 37.8
+1061.410 35.1
+1069.957 4.5
+1074.147 199.8
+1075.142 58.9
+1077.379 392.8
+1078.031 11.0
+1078.640 112.6
+1084.310 37.3
+1085.611 21.2
+1086.347 7.2
+1089.500 2.4
+1090.765 3.7
+1094.035 4.8
+1101.455 3.9
+1102.705 19.2
+1103.552 9.5
+1105.449 8.6
+1108.468 14.9
+1110.370 25.1
+1111.064 8.0
+1113.137 6.4
+1115.752 36.8
+1116.462 14.7
+1119.903 3.0
+1120.560 3.9
+1123.235 11.5
+1125.553 4.0
+1128.516 19.1
+1129.724 27.6
+1130.841 8.9
+1131.494 3.5
+1134.293 9.9
+1137.386 106.9
+1138.331 7.1
+1141.214 11.4
+1145.438 3.9
+1146.144 17.4
+1146.855 10.9
+1155.367 148.4
+1156.396 93.0
+1159.724 8.0
+1167.367 41.1
+1173.125 63.1
+1173.869 36.5
+1174.490 13.0
+1182.127 33.1
+1184.230 32.2
+1191.309 4.3
+1192.453 4.3
+1198.006 22.8
+1204.139 26.7
+1206.883 7.7
+1207.673 10.5
+1214.094 2.8
+1216.542 5.8
+1217.260 46.6
+1219.255 11.1
+1229.442 11.7
+1230.316 13.3
+1232.582 9.7
+1236.331 10.0
+1245.958 5.2
+1247.680 27.3
+1250.812 7.8
+1256.999 27.4
+1257.651 18.2
+1258.975 18.4
+1270.249 17.8
+1273.464 5.4
+1275.289 16.1
+1276.513 43.3
+1278.499 7.0
+1280.116 15.3
+1282.114 14.5
+1283.600 46.3
+1284.438 42.3
+1287.047 19.2
+1289.917 5.9
+1291.428 15.4
+1294.722 24.5
+1299.315 2.4
+1300.755 75.1
+1301.527 365.7
+1302.412 360.7
+1303.358 17.8
+=================================== "QC_Shew_07_02-pt3_31Mar07_OWL_07-03-07.2599.2599.1.dta" ==================================
+1198.67297 1 scan=2599 cs=1
+338.156 11.1
+343.316 7.5
+353.282 75.3
+354.724 11.5
+357.232 2.8
+359.517 3.9
+362.098 3.4
+365.116 12.6
+366.429 4.9
+371.361 45.8
+372.174 3.8
+379.583 23.3
+380.322 16.0
+383.265 14.5
+384.965 39.7
+385.588 8.7
+390.333 5.2
+391.283 10.1
+393.262 3.6
+394.225 11.2
+395.403 3.2
+397.074 24.1
+398.432 13.8
+399.372 13.1
+400.291 12.9
+406.229 22.7
+407.320 13.8
+408.424 9.5
+409.145 3.0
+413.428 18.0
+415.235 6.3
+416.028 6.0
+421.051 10.2
+423.222 59.7
+424.589 85.7
+425.297 87.4
+426.301 44.7
+427.266 19.7
+433.067 4.3
+436.425 24.5
+438.173 125.9
+439.552 24.9
+440.280 2.5
+441.028 4.5
+442.056 2.8
+443.272 17.7
+444.498 81.4
+445.535 6.9
+449.310 10.5
+450.312 11.0
+451.298 11.8
+452.010 8.4
+452.618 11.6
+454.104 61.4
+455.300 50.2
+456.435 20.6
+460.538 2.8
+463.577 5.8
+466.158 189.4
+467.255 64.4
+468.337 50.3
+469.332 10.9
+470.306 18.2
+471.322 20.0
+472.309 18.1
+475.129 4.2
+478.224 55.1
+480.379 37.8
+481.912 1.7
+484.173 152.2
+487.404 7.1
+490.391 14.3
+491.836 10.2
+493.320 26.2
+494.192 14.8
+495.813 80.7
+496.590 4.6
+497.391 48.5
+498.288 17.0
+499.469 11.5
+500.466 3.1
+502.126 13.3
+506.189 4.3
+507.274 12.7
+508.420 36.6
+509.207 5.9
+510.254 4.2
+511.405 11.9
+512.372 32.5
+513.181 36.3
+514.296 57.8
+515.380 117.1
+516.447 11.3
+517.555 8.8
+520.010 45.0
+520.762 31.8
+522.170 30.4
+523.560 10.2
+524.405 9.3
+525.480 19.1
+526.212 33.4
+527.032 10.3
+528.389 7.9
+529.527 14.1
+533.089 3.9
+535.401 2.6
+536.385 67.8
+537.252 280.9
+538.352 82.5
+539.260 16.9
+540.299 25.5
+541.210 30.0
+542.527 20.7
+548.090 7.5
+548.988 6.9
+550.068 20.9
+551.620 11.1
+553.193 163.0
+554.061 68.0
+554.849 79.2
+555.451 192.5
+556.275 30.1
+559.205 3.9
+561.629 2.1
+562.246 9.6
+563.598 18.6
+564.204 18.9
+565.421 17.7
+566.584 13.6
+567.225 184.5
+568.055 21.4
+571.387 15.8
+572.385 20.9
+573.241 15.2
+577.214 7.9
+578.129 40.0
+579.096 48.0
+580.321 38.2
+581.465 106.1
+582.513 45.9
+583.377 107.0
+584.674 36.9
+585.379 32.8
+586.088 12.3
+586.926 11.2
+591.414 28.3
+592.174 13.3
+593.057 11.2
+595.219 16.4
+596.205 55.1
+597.313 164.8
+598.240 9.1
+599.453 14.6
+601.415 10.0
+603.662 12.9
+604.541 14.8
+606.615 17.6
+607.393 76.7
+608.656 60.3
+609.463 78.6
+610.668 4.5
+612.184 8.4
+615.287 5.8
+616.028 6.2
+620.457 20.9
+621.433 61.1
+622.112 29.6
+624.156 127.9
+625.207 80.2
+626.239 130.4
+627.341 90.3
+628.199 28.0
+631.579 8.3
+636.520 25.1
+637.436 36.2
+638.281 77.4
+639.193 53.5
+641.483 29.1
+642.421 85.5
+643.438 28.0
+644.501 51.2
+645.413 17.9
+647.911 55.3
+649.431 70.0
+650.180 36.4
+651.199 20.6
+653.308 16.6
+654.316 113.9
+655.310 53.0
+656.341 48.1
+657.546 10.8
+660.324 14.9
+663.742 10.6
+664.370 41.2
+666.196 550.9
+667.261 190.3
+668.488 39.8
+669.801 24.7
+670.583 7.3
+671.416 11.2
+672.281 22.1
+674.254 26.8
+674.856 15.8
+677.899 29.8
+678.543 72.9
+679.258 14.4
+680.293 28.9
+681.155 16.7
+684.297 321.9
+685.238 65.0
+690.266 31.6
+691.115 12.7
+691.769 8.6
+693.183 32.3
+694.365 4.3
+695.293 154.4
+696.122 98.7
+697.150 72.3
+697.929 11.8
+701.349 5.8
+702.270 8.5
+704.435 21.9
+706.382 42.7
+708.134 79.7
+709.248 77.2
+710.232 79.5
+711.323 36.9
+712.078 13.7
+713.469 11.8
+714.326 53.7
+715.320 78.5
+716.498 13.5
+717.413 11.5
+719.383 110.6
+720.276 51.2
+721.274 13.1
+722.022 1.9
+724.277 12.8
+725.187 39.2
+726.254 12.9
+727.362 29.1
+728.266 64.5
+737.201 465.4
+738.328 217.8
+739.126 42.4
+743.175 10.7
+744.951 8.6
+746.504 7.9
+747.477 21.8
+748.783 19.0
+749.927 17.7
+750.946 31.0
+752.712 7.8
+753.761 22.4
+754.620 54.7
+755.309 378.1
+756.280 80.7
+757.561 7.4
+761.551 3.0
+762.850 7.7
+763.515 97.4
+764.275 40.5
+765.168 5.8
+766.163 9.1
+767.219 77.8
+768.060 32.1
+771.287 12.9
+774.126 10.3
+775.201 4.1
+777.321 5.6
+780.224 5.7
+781.255 9.9
+782.174 14.4
+783.311 156.3
+784.424 23.3
+788.443 22.7
+790.287 30.8
+791.659 45.9
+792.450 38.8
+793.379 19.3
+794.494 25.4
+795.478 13.4
+796.441 7.1
+798.086 15.2
+800.728 8.3
+802.085 7.3
+803.168 48.0
+804.715 14.8
+806.219 25.1
+807.048 35.3
+808.232 184.8
+809.351 160.9
+810.325 57.9
+810.992 9.9
+812.379 59.2
+813.444 18.1
+816.302 5.7
+820.201 248.9
+821.268 273.2
+822.228 15.4
+823.791 18.7
+824.403 298.0
+825.281 91.2
+826.244 32.2
+827.141 83.9
+828.399 45.8
+829.560 8.5
+831.332 7.8
+836.895 46.4
+838.200 1069.1
+839.328 545.2
+842.361 66.5
+843.245 15.1
+848.354 52.7
+850.007 20.3
+852.020 16.4
+854.216 8.5
+855.116 16.6
+855.761 11.4
+856.393 279.0
+857.523 68.9
+859.955 13.8
+865.649 42.6
+866.258 502.3
+867.265 248.4
+883.566 10.8
+884.410 28.0
+889.356 4.7
+891.234 59.6
+892.366 45.8
+893.448 71.7
+896.171 9.9
+901.468 7.9
+902.381 13.9
+903.292 13.1
+909.119 93.2
+910.309 34.0
+911.425 31.8
+917.327 2.1
+918.603 31.5
+919.356 306.0
+920.227 97.1
+921.226 37.4
+921.909 4.7
+925.458 14.3
+926.708 8.5
+928.938 5.3
+931.467 15.1
+935.513 30.3
+937.324 1984.2
+938.343 565.1
+938.947 6.6
+939.679 22.5
+940.551 6.6
+955.328 280.8
+956.430 147.2
+957.343 4.0
+963.951 15.3
+966.352 20.2
+967.428 8.6
+968.628 17.3
+972.277 17.1
+975.213 8.1
+977.441 3.1
+978.552 2.1
+980.831 11.7
+982.752 12.0
+990.356 15.0
+991.463 6.9
+993.403 4.8
+998.783 4.4
+999.796 12.1
+1008.368 6.0
+1010.945 12.5
+1015.960 11.4
+1020.183 16.1
+1021.619 3.9
+1023.501 24.1
+1027.984 3.4
+1032.245 13.3
+1033.117 15.6
+1034.426 17.5
+1035.539 4.9
+1042.792 9.6
+1043.700 10.3
+1045.310 4.3
+1049.479 12.9
+1050.222 7.7
+1052.456 7.8
+1053.718 21.5
+1055.391 3.5
+1058.200 18.2
+1060.490 9.0
+1065.561 3.2
+1067.359 17.1
+1068.342 4.1
+1069.259 36.7
+1070.011 39.8
+1072.364 32.0
+1073.851 9.9
+1075.034 15.6
+1079.383 9.2
+1080.358 2.1
+1083.620 40.9
+1090.258 3.6
+1095.532 3.6
+1098.125 3.2
+1099.807 3.9
+1103.615 13.1
+1104.701 5.3
+1110.164 9.6
+1111.996 46.5
+1113.114 19.5
+1115.906 4.7
+1117.053 4.1
+1122.531 39.7
+1129.947 9.2
+1131.160 20.0
+1132.609 10.3
+1134.414 20.2
+1135.541 7.8
+1136.350 4.2
+1140.338 22.1
+1145.908 1.9
+1148.870 11.2
+1153.263 31.9
+1154.066 7.5
+1154.724 17.7
+1155.775 52.4
+1160.474 24.8
+1162.516 66.0
+1163.623 57.1
+1164.722 25.2
+1165.377 11.6
+1167.206 16.4
+1173.268 12.9
+1177.439 2.1
+1178.364 29.0
+1180.520 1339.3
+1181.654 574.5
+1182.615 2.5
+1183.478 22.2
+1187.544 10.6
+1190.782 26.4
+
+=================================== "QC_Shew_07_02-pt3_31Mar07_OWL_07-03-07.2778.2778.1.dta" ==================================
+960.50385 1 scan=2778 cs=1
+270.430 3.0
+271.380 23.6
+272.488 10.8
+280.182 30.2
+281.209 16.7
+282.662 6.1
+285.355 12.0
+288.300 485.2
+289.343 65.7
+294.188 7.7
+295.022 7.4
+297.573 12.4
+298.250 98.2
+299.219 11.0
+301.575 2.1
+303.736 9.6
+308.207 18.4
+308.995 4.9
+310.219 14.2
+311.431 53.3
+312.905 25.2
+316.273 54.3
+317.317 11.5
+320.337 13.5
+323.056 15.1
+326.808 47.1
+328.186 10.8
+329.211 1.5
+330.291 4.6
+331.425 7.6
+337.802 3.8
+339.212 14.4
+340.193 31.6
+342.343 13.1
+348.159 14.0
+350.193 4.1
+351.343 6.5
+353.024 7.7
+353.667 2.5
+354.308 57.2
+355.139 15.7
+356.352 10.8
+357.111 15.9
+358.400 17.1
+359.205 31.5
+363.808 38.4
+365.363 4.5
+368.095 10.0
+369.149 6.9
+370.378 3.0
+372.134 92.0
+374.375 26.2
+378.495 24.8
+379.508 9.3
+380.211 3.8
+380.936 6.0
+382.099 109.7
+383.470 13.1
+386.185 1.7
+387.218 8.5
+388.537 12.7
+391.229 23.8
+392.669 3.0
+393.350 6.9
+396.074 60.9
+397.151 32.6
+397.779 13.4
+399.294 91.8
+400.237 57.5
+401.220 17.6
+402.093 22.5
+403.317 12.2
+408.996 12.4
+411.450 21.9
+412.389 32.7
+413.209 10.2
+414.127 18.3
+415.054 5.8
+417.312 621.7
+418.317 114.3
+421.029 16.8
+422.494 6.8
+423.273 8.3
+424.295 34.2
+425.032 2.8
+426.317 21.9
+427.202 123.4
+428.710 26.2
+429.552 16.7
+430.972 12.9
+431.754 8.2
+435.268 9.0
+436.391 3.0
+438.332 13.5
+440.347 6.7
+441.469 9.1
+442.388 6.0
+443.304 19.9
+444.408 3.9
+445.299 48.3
+447.578 2.5
+448.256 10.2
+450.334 8.8
+451.275 20.3
+452.340 3.6
+453.009 8.0
+455.192 10.5
+455.819 7.3
+457.234 21.9
+459.308 22.5
+460.331 6.3
+463.196 24.1
+464.963 23.4
+466.066 17.0
+466.902 14.5
+468.352 26.2
+469.177 60.6
+470.916 13.0
+472.398 9.0
+473.390 2.1
+474.367 13.5
+475.576 3.2
+476.321 6.0
+478.190 7.4
+480.555 10.8
+481.637 6.4
+482.655 10.1
+483.535 25.2
+485.056 9.7
+485.794 8.3
+486.415 7.3
+487.052 5.0
+489.074 7.9
+490.460 10.5
+491.139 6.8
+492.295 4.1
+493.505 14.1
+494.244 9.0
+495.321 18.4
+496.516 23.5
+497.415 23.0
+498.560 47.8
+499.566 2.7
+502.449 4.9
+503.194 9.2
+504.824 16.3
+506.347 12.2
+507.627 5.2
+508.360 4.7
+509.305 42.7
+510.270 3.2
+511.278 112.4
+512.175 52.5
+513.732 51.2
+515.689 13.1
+516.407 12.5
+520.307 6.4
+521.108 9.7
+522.209 44.5
+523.063 7.2
+524.270 30.1
+525.416 74.5
+526.465 42.4
+527.166 42.9
+528.248 40.6
+529.373 154.5
+530.219 56.0
+531.354 1.7
+534.233 18.5
+538.290 33.4
+540.122 123.6
+541.505 28.4
+542.367 117.6
+543.125 18.4
+544.210 67.8
+546.422 106.8
+547.357 21.6
+548.066 4.5
+556.611 33.4
+558.160 27.4
+558.985 9.9
+559.955 6.4
+560.940 18.9
+564.334 22.3
+565.396 20.8
+566.383 24.4
+567.210 34.8
+568.700 4.4
+569.485 7.1
+572.922 16.7
+574.253 15.1
+575.188 3.6
+576.495 16.1
+578.734 35.1
+581.401 16.7
+582.326 12.7
+584.356 10.4
+585.306 4.3
+586.380 26.8
+592.305 6.5
+593.260 12.1
+596.073 13.0
+597.347 3.6
+598.930 4.7
+599.894 12.9
+600.532 28.2
+601.409 20.2
+604.086 2.1
+606.484 16.6
+608.236 9.5
+609.427 9.3
+610.207 48.1
+612.572 24.6
+613.268 5.3
+616.364 19.2
+618.052 10.0
+619.404 21.6
+621.947 13.7
+626.086 13.2
+627.257 55.2
+628.234 155.7
+629.589 29.5
+631.991 4.8
+637.391 29.5
+638.235 73.3
+639.108 13.7
+640.453 11.1
+643.019 14.6
+644.325 15.9
+645.414 106.6
+646.508 68.8
+647.458 2.5
+653.240 22.6
+655.152 201.8
+656.118 152.3
+657.083 11.9
+658.482 9.2
+660.520 35.1
+662.559 18.2
+663.805 27.1
+666.645 3.6
+668.394 13.1
+671.328 16.2
+672.113 18.9
+673.333 178.2
+674.314 17.7
+678.354 4.5
+680.381 16.9
+681.291 3.5
+682.215 5.1
+683.197 34.9
+685.253 3.0
+688.385 11.4
+689.174 2.7
+690.713 9.6
+692.341 6.1
+693.703 4.2
+696.735 24.3
+698.364 31.7
+699.092 9.4
+700.198 23.7
+701.583 21.8
+702.888 21.6
+704.069 8.2
+705.169 9.7
+706.378 12.6
+708.739 18.1
+709.347 4.0
+710.421 15.6
+711.221 10.3
+711.961 2.7
+712.605 41.5
+713.361 28.1
+714.392 58.6
+715.181 124.9
+716.293 45.3
+718.529 10.2
+720.748 10.7
+722.517 6.9
+724.354 7.8
+726.814 51.1
+729.553 7.3
+730.270 8.2
+731.179 5.4
+732.480 270.8
+733.458 169.1
+735.014 12.4
+735.967 7.7
+736.662 11.5
+738.878 8.1
+740.261 8.2
+741.523 29.4
+742.421 21.6
+743.099 17.0
+744.441 34.0
+747.529 15.7
+748.727 16.6
+749.835 15.9
+750.706 36.4
+751.774 15.2
+752.853 7.1
+755.391 24.0
+756.139 5.6
+758.477 19.3
+760.606 6.0
+765.428 21.8
+768.355 74.1
+769.384 33.5
+770.345 3.8
+772.553 6.3
+773.365 18.3
+776.187 17.3
+776.871 3.4
+779.085 30.7
+782.713 9.1
+783.377 7.2
+784.419 5.8
+786.188 66.1
+787.118 25.4
+788.637 17.2
+789.240 7.9
+790.330 10.2
+791.848 14.4
+792.541 4.4
+794.716 16.8
+795.417 7.5
+796.656 25.1
+797.363 3.6
+798.455 4.5
+799.413 13.6
+801.389 8.8
+803.227 13.6
+804.296 3.2
+806.664 4.2
+807.556 3.6
+809.351 43.3
+810.823 12.2
+811.629 12.3
+812.522 6.2
+814.433 20.5
+815.511 25.3
+816.465 4.0
+818.372 17.2
+819.375 16.5
+820.336 10.8
+821.929 11.3
+822.638 22.4
+824.536 23.6
+826.810 54.1
+827.544 353.0
+828.449 237.1
+829.550 64.5
+830.502 22.4
+831.517 34.7
+832.957 15.7
+837.693 18.7
+838.367 8.1
+839.847 18.2
+840.506 7.9
+843.532 3.8
+845.448 7068.0
+846.486 2611.9
+847.175 9.9
+852.341 2.5
+854.213 33.0
+854.855 8.4
+855.995 46.6
+857.062 50.7
+860.595 6.2
+861.587 26.1
+863.285 28.9
+864.455 33.8
+865.896 24.9
+867.094 13.7
+867.760 40.7
+869.292 10.0
+870.184 8.6
+871.557 66.3
+874.371 28.3
+876.438 45.5
+877.888 30.1
+879.520 4.9
+880.205 47.9
+881.555 103.6
+882.498 28.9
+883.410 7.2
+884.387 6.6
+885.360 9.3
+888.819 37.1
+890.965 35.0
+894.445 9.6
+895.159 35.1
+896.580 11.6
+897.499 14.3
+898.297 40.4
+899.763 149.3
+900.461 132.7
+901.602 15.4
+902.674 3.4
+906.095 26.3
+907.052 18.3
+908.911 34.9
+912.460 32.3
+913.697 25.6
+914.352 35.4
+916.531 91.4
+918.400 144.1
+919.450 77.4
+920.055 5.6
+922.197 25.2
+923.782 39.2
+924.937 104.7
+925.575 126.7
+926.378 76.2
+928.218 23.2
+930.409 94.3
+931.053 87.0
+931.944 56.3
+933.039 10.3
+934.502 21.9
+935.546 10.1
+937.748 80.0
+940.002 127.4
+940.688 26.1
+941.494 125.5
+942.466 1434.5
+943.436 2255.4
+944.437 852.1
+945.306 25.3
+948.063 5.4
+
diff --git a/SystemTest/SimpleChromosome.trie b/SystemTest/SimpleChromosome.trie
new file mode 100644
index 0000000..09b62b6
--- /dev/null
+++ b/SystemTest/SimpleChromosome.trie
@@ -0,0 +1 @@
+AGCGGGAGAGAGAGAGAGAGAGAGAGAGAGAGCGAGAGAGCGTGAGCGCGCGCAAGCTAGCGAGCAAACCAGAGAGACAGACCGAGAGAGGGACCAGGAGAGAGACCCAGAGAGAGAAGAAGAAGCCAGAAGCCGAGCTCTGTCAGGGCTCAACCTCCAACTTGTTTCAGTTCATTCATCCTTCTCTCCTTTCCGCTCAGACTGTAGAGCTCGGTCTCTCCAAGTTTGTGCCTAAGAAGATGATAATCACACAAACAAGTCACTGTTACATGACCAGCCTTGGGATTCTTTTCCTGATTAATATTCTCCCTGGAACCACTGGTCAAGGGGAATCAAGACGACAAGAACCCGGGGACTTTGTGAAGCAGGACATTGGCGGGCTGTCTCCTAAGCATGCCCCAGATATTCCTGATGACAGCACTGACAACATCACTATCTTCACCAGAATCTTGGATCGTCTTCTGGACGGCTATGACAACCGGCTGCGACCTG [...]
\ No newline at end of file
diff --git a/SystemTest/SimpleGenes.gff b/SystemTest/SimpleGenes.gff
new file mode 100644
index 0000000..681e140
--- /dev/null
+++ b/SystemTest/SimpleGenes.gff
@@ -0,0 +1,5 @@
+SampleChromosome Natalie exon 3 14 1 + 0 Parent=Gene1
+SampleChromosome Natalie exon 23 34 1 + 0 Parent=Gene1
+SampleChromosome Natalie exon 26 34 1 + 0 Parent=Gene2
+SampleChromosome Natalie exon 56 65 1 + 0 Parent=Gene3
+SampleChromosome Natalie exon 84 97 1 + 2 Parent=Gene3
diff --git a/SystemTest/TestCDTA.txt b/SystemTest/TestCDTA.txt
new file mode 100644
index 0000000..e15363d
--- /dev/null
+++ b/SystemTest/TestCDTA.txt
@@ -0,0 +1,5 @@
+spectra,SystemTest/Shew_dta.txt
+instrument,FT-Hybrid
+protease,Trypsin
+SequenceFile,SystemTest/Shew_Short.fasta
+mod,+57,C,fix
diff --git a/SystemTest/TestInput.txt b/SystemTest/TestInput.txt
new file mode 100644
index 0000000..b9aebc2
--- /dev/null
+++ b/SystemTest/TestInput.txt
@@ -0,0 +1,26 @@
+# Spectrum file-name. Multiple 'spectra' lines can be used to search
+# several .dta or .pkl files. Specify a directory name to search every
+# file in that directory (non-recursive)
+spectra,SystemTest/TestSpectrum.dta
+
+# Instrument type (QTOF or ESI-ION-TRAP)
+instrument,ESI-ION-TRAP
+
+# Protease - nonstandard digests are penalized.
+# Options are trypsin, chymotrypsin, lysc, aspn, gluc
+protease,None
+
+# Path to the database file (as written out by PrepDB.py)
+DB,TestDatabase.trie
+
+#mod,mass,residues,fix/opt,name
+
+# Specify one modification (the test peptide was actually
+# VKEAMAPK, not VKEAMGPK, so a +14 mod will work)
+mod,+14,G,opt,user-defined
+
+# Note: MOST searches should include the +57 modification
+# for the protecting group on cysteines.
+mod,+57,C,fix
+
+mods,1
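[Editor's note, not part of the committed diff: the SystemTest input files above use Inspect's simple "key,value,..." line format with '#' comments. The sketch below is a minimal, hypothetical illustration of how such a file could be read in Python; the function name parse_inspect_input and the returned structure are assumptions for illustration and are not Inspect's own parser (which lives in ParseInput.c).]

    def parse_inspect_input(path):
        """Parse an Inspect-style input file: one 'key,value,...' entry per line,
        '#' starts a comment, blank lines are ignored. Keys such as 'spectra'
        and 'mod' may repeat, so each key maps to a list of value tuples."""
        options = {}
        with open(path) as handle:
            for raw in handle:
                line = raw.split("#", 1)[0].strip()  # drop comments and whitespace
                if not line:
                    continue
                fields = line.split(",")
                key, values = fields[0].lower(), tuple(fields[1:])
                options.setdefault(key, []).append(values)
        return options

    # Hypothetical usage against the file shown above:
    #   opts = parse_inspect_input("SystemTest/TestInput.txt")
    #   opts["spectra"] -> [("SystemTest/TestSpectrum.dta",)]
    #   opts["mod"]     -> [("+14", "G", "opt", "user-defined"), ("+57", "C", "fix")]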
diff --git a/SystemTest/TestInputMod.txt b/SystemTest/TestInputMod.txt
new file mode 100644
index 0000000..04fde94
--- /dev/null
+++ b/SystemTest/TestInputMod.txt
@@ -0,0 +1,8 @@
+spectra,SystemTest/TestSpectrum.dta
+instrument,ESI-ION-TRAP
+protease,None
+DB,TestDatabase.trie
+
+# This line causes Inspect to run a 'blind' search, where any mass modification
+# up to mass 200Da is allowed. (Very slow on large databases):
+unrestrictive,1
diff --git a/SystemTest/TestInputTag1.txt b/SystemTest/TestInputTag1.txt
new file mode 100644
index 0000000..86faee2
--- /dev/null
+++ b/SystemTest/TestInputTag1.txt
@@ -0,0 +1,9 @@
+spectra,SystemTest/TestSpectrum.dta
+instrument,ESI-ION-TRAP
+protease,None
+DB,TestDatabase.trie
+mod,+57,C,fix
+mods,1
+taglength,1
+tagsonly
+
diff --git a/SystemTest/TestInputTag3.txt b/SystemTest/TestInputTag3.txt
new file mode 100644
index 0000000..01d4888
--- /dev/null
+++ b/SystemTest/TestInputTag3.txt
@@ -0,0 +1,9 @@
+spectra,SystemTest/TestSpectrum.dta
+instrument,ESI-ION-TRAP
+protease,None
+DB,TestDatabase.trie
+mod,+57,C,fix
+mods,1
+taglength,3
+tagsonly
+
diff --git a/SystemTest/TestMS2.txt b/SystemTest/TestMS2.txt
new file mode 100644
index 0000000..ebddbf6
--- /dev/null
+++ b/SystemTest/TestMS2.txt
@@ -0,0 +1,8 @@
+spectra,SystemTest/Yeast.ms2
+instrument,ESI-ION-TRAP
+protease,Trypsin
+
+#DB,TestDatabase.trie
+
+SequenceFile,SystemTest/YeastSmall.fasta
+mod,+57,C,fix
diff --git a/SystemTest/TestPMC.txt b/SystemTest/TestPMC.txt
new file mode 100644
index 0000000..4b21462
--- /dev/null
+++ b/SystemTest/TestPMC.txt
@@ -0,0 +1,7 @@
+spectra,SystemTest/TestSpectra.pkl
+instrument,ESI-ION-TRAP
+protease,None
+DB,TestDatabase.trie
+mod,+57,C,fix
+pmconly,1
+mods,1
diff --git a/SystemTest/TestSpectra.pkl b/SystemTest/TestSpectra.pkl
new file mode 100644
index 0000000..ccdb3a1
--- /dev/null
+++ b/SystemTest/TestSpectra.pkl
@@ -0,0 +1,1773 @@
+533.28 12345.67 2
+151.2 5390.0
+172.0 5028.0
+180.5 2581.0
+184.3 65135.0
+185.1 1791417.0
+186.1 160666.0
+186.8 23.0
+202.8 13982.0
+207.1 10560.0
+209.9 5390.0
+211.1 35087.0
+211.7 6078.0
+213.1 7171183.0
+214.1 887589.0
+215.2 94536.0
+216.3 4106.0
+219.8 2130.0
+222.6 6979.0
+239.2 47363.0
+239.9 3.0
+242.1 2597.0
+245.0 6343.0
+246.5 30612.0
+247.2 798.0
+248.5 5421.0
+249.6 5878.0
+253.4 16252.0
+258.0 10514.0
+261.1 4105.0
+261.7 5927.0
+265.0 30162.0
+266.2 44759.0
+267.2 1296471.0
+268.1 201492.0
+269.2 39197.0
+274.3 13954.0
+275.2 67736.0
+275.8 1.0
+278.8 13686.0
+281.2 94777.0
+282.0 33108.0
+283.7 15660.0
+292.3 127058.0
+293.1 29770.0
+294.5 15211.0
+295.6 38326.0
+302.5 4572.0
+305.5 5286.0
+307.3 1749.0
+308.9 25083.0
+309.5 39412.0
+310.3 1184256.0
+312.2 3816479.0
+313.4 350869.0
+314.1 41337.0
+314.7 7136.0
+321.4 23893.0
+325.6 11750.0
+326.3 397383.0
+327.2 98934.0
+328.2 94218.0
+329.3 18430.0
+332.3 30533.0
+334.9 75433.0
+339.6 6176.0
+340.3 24714.0
+341.5 20109.0
+343.5 36429.0
+344.4 2796.0
+345.9 22232.0
+346.8 10606.0
+350.7 4379.0
+354.0 22273.0
+358.2 40279.0
+364.8 51812.0
+368.1 5552.0
+369.2 21720.0
+371.3 1749.0
+372.2 4457.0
+373.5 15591.0
+379.0 12822.0
+380.3 154243.0
+381.4 54914.0
+382.0 4.0
+388.9 12738.0
+389.9 17339.0
+392.5 21223.0
+394.3 45628.0
+395.5 20497.0
+397.4 1015323.0
+398.5 234354.0
+399.3 11.0
+404.9 6160.0
+407.9 107198.0
+408.9 33407.0
+409.7 7234.0
+411.1 8348.0
+412.4 6086.0
+413.3 4457.0
+414.2 5359.0
+415.4 33816.0
+416.2 9314.0
+418.6 72935.0
+420.0 42304.0
+420.7 54475.0
+422.2 8291.0
+423.0 8684.0
+424.4 47436.0
+425.4 1358187.0
+426.4 392094.0
+427.6 933483.0
+428.4 110663.0
+429.3 143489.0
+430.2 53360.0
+430.8 7010.0
+431.7 50899.0
+432.5 2511.0
+433.3 10783.0
+434.1 6218.0
+436.2 25753.0
+437.9 42137.0
+441.7 133956.0
+442.5 4980.0
+443.4 19846.0
+444.1 7.0
+445.2 35205.0
+446.0 25.0
+447.4 25510.0
+448.5 4933.0
+450.4 8161.0
+451.4 12710.0
+453.1 16538.0
+456.4 19959.0
+459.2 4379.0
+461.1 5342.0
+463.1 8366.0
+465.3 5384.0
+468.5 17921.0
+469.2 30451.0
+469.8 9.0
+470.8 16300.0
+471.6 3784.0
+474.7 1784.0
+477.3 6967.0
+481.7 14238.0
+483.8 91491.0
+484.6 171372.0
+486.5 178165.0
+487.5 36287.0
+489.1 100315.0
+489.7 5787.0
+492.3 32722.0
+493.5 3374.0
+494.3 37120.0
+495.1 43495.0
+496.4 8160.0
+497.1 18.0
+499.5 5753.0
+505.0 11091.0
+506.4 16280.0
+508.5 33703.0
+509.3 11448.0
+510.4 48115.0
+512.4 66935.0
+513.3 17408.0
+515.2 19987.0
+515.9 57252.0
+517.3 27245.0
+518.3 38028.0
+519.0 7.0
+520.5 19696.0
+521.7 21662.0
+522.3 28852.0
+523.4 44714.0
+524.5 78861.0
+525.5 137108.0
+526.5 840780.0
+527.4 206179.0
+528.4 94072.0
+529.0 35.0
+530.3 23777.0
+531.1 30463.0
+535.3 16902.0
+540.5 438350.0
+541.4 254479.0
+542.6 49349.0
+544.5 120573.0
+545.5 14799.0
+548.3 89281.0
+549.1 8424.0
+550.0 23165.0
+551.7 22014.0
+552.3 22402.0
+553.7 4543.0
+554.8 15768.0
+556.5 67909.0
+558.6 186876.0
+559.6 38382.0
+560.6 19349.0
+561.5 4506.0
+562.4 17040.0
+563.9 30871.0
+567.0 14923.0
+567.7 7231.0
+569.2 18826.0
+570.0 43252.0
+570.7 40744.0
+576.4 27868.0
+578.3 37027.0
+579.0 17.0
+581.2 11862.0
+585.8 11202.0
+588.3 34625.0
+589.0 3.0
+590.6 45434.0
+591.3 10312.0
+593.9 11394.0
+594.6 31336.0
+596.1 28878.0
+599.9 27065.0
+600.8 9783.0
+603.6 8349.0
+605.8 32512.0
+608.2 9548.0
+609.3 6418.0
+613.9 28913.0
+621.2 19801.0
+622.7 25512.0
+623.5 248741.0
+624.6 101891.0
+625.4 11783.0
+626.4 27504.0
+628.5 41288.0
+630.7 15924.0
+634.7 42605.0
+635.6 17233.0
+637.2 12494.0
+638.0 16525.0
+639.4 20236.0
+640.8 323591.0
+641.5 5714043.0
+642.5 1859248.0
+643.5 375036.0
+645.0 40057.0
+645.7 11.0
+646.7 1798.0
+649.9 4779.0
+653.4 1211.0
+657.4 47758.0
+659.5 87105.0
+660.4 5546.0
+667.4 15667.0
+671.3 8437.0
+672.5 19269.0
+673.4 54740.0
+678.3 8211.0
+679.9 6552.0
+683.8 13941.0
+685.7 3062.0
+687.6 19265.0
+690.3 16952.0
+693.0 4623.0
+694.6 6031.0
+695.3 3952.0
+696.2 9830.0
+698.9 6626.0
+700.7 28163.0
+703.5 14559.0
+707.3 50122.0
+707.9 5349.0
+708.9 68522.0
+711.7 34640.0
+718.4 10184.0
+719.2 25325.0
+720.8 6199.0
+722.5 4035.0
+723.5 4361.0
+725.3 26713.0
+726.8 107967.0
+727.6 86575.0
+728.6 28054.0
+731.2 2661.0
+736.5 252360.0
+737.5 66384.0
+738.5 223658.0
+739.5 136451.0
+740.4 90229.0
+746.7 21403.0
+753.3 13110.0
+754.4 12183936.0
+755.4 4816542.0
+756.4 2736858.0
+757.4 667465.0
+758.4 197965.0
+759.3 8041.0
+761.3 2270.0
+762.4 9530.0
+769.8 21250.0
+770.4 9397.0
+771.4 7409.0
+772.5 13269.0
+775.1 9990.0
+783.6 2687.0
+784.5 7197.0
+787.4 137576.0
+788.5 25460.0
+796.5 19556.0
+799.8 10087.0
+806.3 36471.0
+808.3 6340.0
+814.6 125632.0
+815.4 23214.0
+816.5 22919.0
+817.3 16850.0
+818.3 24779.0
+820.2 33895.0
+827.4 11089.0
+831.9 1392.0
+835.5 298945.0
+836.4 175308.0
+838.2 13674.0
+846.0 2289.0
+849.3 23645.0
+852.7 580178.0
+853.4 22303784.0
+854.3 8664932.0
+855.3 2489524.0
+856.2 217073.0
+856.8 14.0
+859.8 7362.0
+863.3 24506.0
+868.8 1503.0
+874.6 33001.0
+876.5 5401.0
+877.3 28104.0
+891.1 3538.0
+893.3 11443.0
+901.3 79610.0
+902.2 48761.0
+904.1 27722.0
+906.1 8699.0
+911.2 2501.0
+919.2 195606.0
+920.2 100094.0
+921.0 16844.0
+925.1 20719.0
+927.8 6905.0
+935.6 5998.0
+936.8 6240.0
+948.4 39712.0
+951.5 3029.0
+952.2 11436.0
+960.5 1731.0
+963.1 19209.0
+966.4 678935.0
+967.4 261603.0
+968.2 147220.0
+976.4 27633.0
+977.6 22498.0
+995.6 3603.0
+1000.1 19739.0
+1003.6 4573.0
+1011.4 2914.0
+1017.0 11894.0
+1018.4 31847.0
+1019.8 4967.0
+1028.6 3414.0
+1030.7 14837.0
+1044.4 5145.0
+1047.4 4571.0
+1055.0 5603.0
+1060.5 5752.0
+1074.8 2108.0
+1095.7 1385.0
+1097.5 19117.0
+1105.5 648.0
+1107.5 3.0
+1110.5 2426.0
+1116.7 11819.0
+1123.8 4820.0
+1165.5 4537.0
+1181.8 7919.0
+1183.4 3712.0
+1187.5 4571.0
+1202.0 4628.0
+1211.8 4269.0
+1236.6 2846.0
+1259.5 2889.0
+1311.6 2651.0
+1438.5 4573.0
+1468.2 2116.0
+1469.5 3922.0
+1485.6 4062.0
+1500.7 4107.0
+1507.8 2806.0
+1515.9 3070.0
+1538.8 3259.0
+1541.6 5431.0
+1542.8 2460.0
+1579.4 1984.0
+1582.8 30.0
+1589.0 4967.0
+
+698.84 12345.67 2
+183.2 627.0
+191.3 926.0
+198.3 251.0
+201.0 1029.0
+211.7 364.0
+214.7 499.0
+218.5 963.0
+220.1 2120.0
+221.4 462.0
+222.3 493.0
+228.9 4167.0
+229.8 2188.0
+240.0 1256.0
+244.1 1418.0
+245.1 698.0
+246.1 67354.0
+247.1 7803.0
+247.7 4.0
+253.2 337.0
+255.0 2333.0
+256.1 5114.0
+257.0 6458.0
+260.9 4314.0
+261.7 631.0
+268.1 733.0
+272.2 1383.0
+273.1 1578.0
+273.9 98821.0
+275.0 10056.0
+275.9 6406.0
+282.2 470.0
+291.7 605.0
+293.7 894.0
+296.2 2084.0
+297.0 915.0
+301.4 184.0
+313.2 1032.0
+314.2 1474.0
+316.4 1830.0
+317.9 5240.0
+319.2 3431.0
+322.2 1652.0
+326.9 1042.0
+328.9 947.0
+330.3 4.0
+330.9 1153.0
+334.1 2128.0
+337.5 2544.0
+339.9 695.0
+341.1 1345.0
+349.9 883.0
+351.1 184.0
+353.1 2859.0
+353.8 5.0
+356.7 2489.0
+358.5 2523.0
+360.3 1760.0
+361.1 4.0
+363.7 403.0
+364.7 728.0
+369.3 11218.0
+371.1 2872.0
+374.1 3840.0
+375.0 3303.0
+376.0 601.0
+379.0 237.0
+380.1 514.0
+384.9 1549.0
+385.9 2967.0
+386.6 1710.0
+388.0 6505.0
+389.0 5203.0
+390.5 556.0
+391.3 3960.0
+393.2 33193.0
+394.2 8629.0
+395.1 1751.0
+399.7 6622.0
+401.6 1149.0
+402.9 6269.0
+404.4 11858.0
+405.1 123529.0
+406.1 21010.0
+407.3 4834.0
+407.9 443.0
+412.2 3892.0
+415.3 1577.0
+416.2 1743.0
+419.9 2260.0
+420.8 650.0
+422.2 1169.0
+423.0 372.0
+425.6 7679.0
+427.0 601.0
+429.1 3690.0
+430.1 2271.0
+431.4 1395.0
+433.3 325.0
+434.8 3880.0
+440.0 1721.0
+442.0 2835.0
+442.7 1994.0
+444.1 780.0
+445.6 2095.0
+451.8 410.0
+452.8 675.0
+454.2 860.0
+454.9 1103.0
+456.2 2079.0
+458.4 813.0
+459.8 478.0
+464.1 18044.0
+465.3 10262.0
+466.2 927.0
+470.3 2689.0
+471.1 4998.0
+472.0 3849.0
+472.7 1400.0
+478.0 3597.0
+483.0 4665.0
+485.0 3838.0
+486.2 2013.0
+487.9 7448.0
+490.8 870.0
+496.4 1958.0
+497.1 550.0
+498.0 7041.0
+500.2 1733.0
+501.1 5560.0
+501.8 3276.0
+502.7 6176.0
+503.5 712.0
+504.2 450.0
+506.1 20201.0
+507.0 5120.0
+508.1 4191.0
+511.1 2177.0
+513.1 9683.0
+514.1 18983.0
+515.5 7519.0
+516.3 19908.0
+518.2 643.0
+519.2 2030.0
+524.0 2740.0
+527.6 2278.0
+529.2 1358.0
+531.1 2134.0
+532.7 4981.0
+534.0 61758.0
+535.2 23686.0
+536.2 5637.0
+541.3 3209.0
+544.4 3430.0
+545.4 7750.0
+546.7 1134.0
+553.2 8745.0
+553.9 426.0
+554.9 2320.0
+556.3 1030.0
+560.2 463.0
+560.8 3366.0
+562.4 15805.0
+563.2 39141.0
+564.4 15816.0
+567.5 1274.0
+568.7 1079.0
+569.4 2841.0
+570.9 10764.0
+571.8 2024.0
+573.1 6084.0
+574.1 10543.0
+575.3 1861.0
+576.5 3074.0
+580.5 528.0
+586.0 6214.0
+587.0 4524.0
+589.1 6692.0
+591.1 6959.0
+592.0 1908.0
+597.6 996.0
+599.0 1371.0
+600.6 788.0
+608.7 3430.0
+609.9 1677.0
+611.4 4420.0
+613.1 996.0
+614.2 1489.0
+615.7 1005.0
+617.1 6574.0
+618.1 885.0
+619.3 2173.0
+623.9 4468.0
+624.6 4207.0
+626.3 3211.0
+627.9 914.0
+631.5 3148.0
+634.0 15430.0
+640.3 3541.0
+644.1 12462.0
+645.1 8739.0
+647.1 1761.0
+649.2 536.0
+651.1 808.0
+652.8 4881.0
+654.3 409.0
+656.1 4416.0
+657.4 3642.0
+658.7 7594.0
+660.9 6871.0
+662.2 49673.0
+663.2 14529.0
+664.4 10058.0
+667.9 525.0
+668.6 3229.0
+670.4 657.0
+671.3 1239.0
+672.6 3301.0
+674.6 1529.0
+676.2 3869.0
+678.4 12761.0
+679.3 9189.0
+680.8 2296.0
+681.7 13156.0
+682.6 2288.0
+683.4 1851.0
+684.3 694.0
+685.3 746.0
+688.5 2459.0
+689.9 49906.0
+690.9 21177.0
+692.7 588.0
+693.4 1530.0
+699.4 881.0
+707.2 792.0
+708.2 336.0
+715.6 412.0
+716.2 599.0
+718.4 1560.0
+719.3 4944.0
+720.0 3617.0
+721.2 1626.0
+723.3 4438.0
+724.0 2617.0
+725.7 493.0
+726.4 535.0
+733.0 1864.0
+734.6 253.0
+735.3 160631.0
+736.3 61458.0
+737.4 20539.0
+742.8 1475.0
+749.8 651.0
+755.2 4301.0
+758.5 2435.0
+761.2 2269.0
+764.1 909.0
+771.8 1105.0
+772.9 1249.0
+775.4 3409.0
+783.6 3314.0
+785.2 319.0
+786.8 334.0
+788.2 4043.0
+794.0 120.0
+796.8 1868.0
+797.4 1216.0
+798.0 6239.0
+799.0 1.0
+800.4 2254.0
+804.6 6413.0
+805.2 436.0
+806.3 50490.0
+807.4 26601.0
+808.3 1121.0
+813.7 2621.0
+816.0 3311.0
+822.6 588.0
+832.4 3168.0
+834.2 10167.0
+835.2 18982.0
+839.1 372.0
+841.9 3148.0
+844.7 1382.0
+845.3 11.0
+846.2 2371.0
+847.2 3783.0
+851.6 2837.0
+855.8 372.0
+860.2 2830.0
+861.4 3541.0
+862.7 4041.0
+863.4 327620.0
+864.4 95558.0
+865.3 10696.0
+867.1 795.0
+876.4 467.0
+878.0 2544.0
+886.9 1710.0
+888.2 1016.0
+892.2 453.0
+895.5 2803.0
+896.3 458.0
+899.0 2935.0
+902.8 4610.0
+905.9 6288.0
+907.2 2530.0
+909.4 305.0
+913.2 4078.0
+915.2 4921.0
+916.2 12312.0
+918.9 3973.0
+925.2 581.0
+926.3 1524.0
+927.1 1245.0
+929.4 233.0
+933.3 13397.0
+934.0 8793.0
+935.2 3756.0
+938.3 1108.0
+941.9 1385.0
+946.8 603.0
+948.3 1413.0
+952.9 1945.0
+957.7 338.0
+959.5 1407.0
+960.5 410.0
+962.6 264.0
+965.3 638.0
+966.0 1807.0
+969.3 1117.0
+971.2 1626.0
+973.0 2030.0
+974.5 14086.0
+975.5 14392.0
+977.0 9641.0
+981.4 3670.0
+986.3 6219.0
+987.3 8361.0
+988.3 1124.0
+991.7 15621.0
+992.4 193384.0
+993.4 115474.0
+994.5 11689.0
+995.3 2.0
+996.5 878.0
+999.1 899.0
+1004.1 38544.0
+1005.3 23030.0
+1006.2 5906.0
+1007.4 1906.0
+1010.8 387.0
+1014.4 293.0
+1017.4 1324.0
+1031.0 958.0
+1036.8 892.0
+1050.7 824.0
+1053.0 399.0
+1062.1 736.0
+1063.7 2103.0
+1073.0 361.0
+1075.4 6551.0
+1079.4 3675.0
+1082.3 1482.0
+1087.2 2783.0
+1088.4 3752.0
+1089.4 3592.0
+1105.0 5968.0
+1105.9 1127.0
+1118.4 1646.0
+1119.9 378.0
+1123.4 262065.0
+1124.4 127052.0
+1125.5 17558.0
+1133.3 3703.0
+1134.1 3315.0
+1135.2 2533.0
+1136.2 2913.0
+1149.8 649.0
+1151.1 42506.0
+1152.2 23761.0
+1153.4 7274.0
+1155.3 468.0
+1157.2 559.0
+1166.2 2439.0
+1180.9 1177.0
+1183.5 1414.0
+1205.2 528.0
+1212.6 664.0
+1214.5 1770.0
+1224.7 374.0
+1227.9 1045.0
+1229.4 1307.0
+1232.2 7564.0
+1233.1 7841.0
+1236.4 4378.0
+1246.4 815.0
+1250.2 30353.0
+1251.3 11885.0
+1252.4 19983.0
+1253.1 1.0
+1263.6 328.0
+1280.5 550.0
+1281.6 694.0
+1284.2 365.0
+1294.2 362.0
+1307.1 493.0
+1327.6 573.0
+1333.9 590.0
+1335.3 387.0
+1340.8 401.0
+1355.2 171.0
+1367.6 506.0
+1375.5 560.0
+1381.5 730.0
+1397.6 471.0
+1420.8 1068.0
+1439.6 952.0
+1452.2 452.0
+1454.8 327.0
+1463.8 591.0
+1474.5 701.0
+1497.3 307.0
+1511.5 461.0
+1531.3 307.0
+1574.9 262.0
+1575.9 453.0
+1579.3 328.0
+1581.6 328.0
+1588.5 434.0
+1596.3 710.0
+1630.3 560.0
+1632.7 224.0
+1639.7 382.0
+1655.0 414.0
+1659.4 566.0
+1673.2 365.0
+1695.6 325.0
+1704.0 621.0
+1709.1 246.0
+1729.2 435.0
+1752.3 567.0
+1761.6 568.0
+1788.3 273.0
+1794.5 238.0
+1850.8 217.0
+1859.0 713.0
+1891.0 394.0
+1892.9 641.0
+1903.6 790.0
+1911.1 329.0
+1919.4 509.0
+1933.3 223.0
+1951.1 246.0
+1955.4 621.0
+1967.7 396.0
+1968.3 652.0
+1990.7 1107.0
+1992.9 513.0
+1994.2 470.0
+1997.5 224.0
+
+623.28 12345.67 2
+160.3 525.0
+161.8 474.0
+163.1 238.0
+164.2 784.0
+165.0 770.0
+166.9 966.0
+168.2 1496.0
+170.6 872.0
+175.2 1289.0
+175.9 1.0
+178.6 724.0
+179.4 884.0
+181.1 11524.0
+182.4 2691.0
+183.2 10614.0
+184.5 288.0
+185.3 3331.0
+187.0 1222.0
+188.2 556.0
+190.2 1793.0
+191.5 2540.0
+193.2 403.0
+194.0 1010.0
+195.2 1340.0
+196.2 2845.0
+197.2 2150.0
+199.1 106283.0
+200.3 9382.0
+201.3 5260.0
+202.3 620.0
+203.2 3152.0
+206.4 1690.0
+208.4 3678.0
+209.3 3182.0
+210.2 1791.0
+211.0 17726.0
+212.1 3600.0
+213.2 1951.0
+214.9 2445.0
+217.6 755.0
+219.3 660.0
+220.5 7284.0
+221.5 1483.0
+222.3 6851.0
+223.0 634.0
+224.2 2570.0
+225.4 966.0
+227.1 57603.0
+228.2 10228.0
+229.2 8534.0
+230.3 1186.0
+231.3 11152.0
+232.4 1950.0
+234.8 899.0
+236.3 264.0
+237.3 2032.0
+238.3 4006.0
+240.2 10595.0
+241.4 571.0
+242.2 1101.0
+243.1 3899.0
+244.1 4855.0
+245.1 4604.0
+246.6 971.0
+248.4 2328.0
+252.3 831.0
+253.1 1152.0
+254.3 5680.0
+255.2 978.0
+256.0 709.0
+257.6 723.0
+258.3 34646.0
+259.4 2880.0
+260.8 902.0
+263.1 569.0
+264.2 8156.0
+265.4 6731.0
+266.3 22564.0
+267.2 4284.0
+268.2 5078.0
+268.9 544.0
+270.9 760.0
+274.1 791.0
+275.7 861.0
+276.3 50370.0
+277.2 3198.0
+278.3 832.0
+279.6 741.0
+280.6 3643.0
+281.7 2938.0
+282.7 3986.0
+283.5 1029.0
+284.2 9222.0
+285.0 2098.0
+286.3 1899.0
+290.9 462.0
+292.2 14421.0
+293.2 13662.0
+294.5 475.0
+295.5 4315.0
+296.3 5659.0
+298.3 5108.0
+300.4 4193.0
+301.6 1334.0
+303.0 1037.0
+305.4 710.0
+306.4 563.0
+308.3 2567.0
+310.3 66666.0
+311.2 14463.0
+312.2 3249.0
+313.8 2938.0
+314.5 3.0
+316.1 14038.0
+317.9 3036.0
+320.4 473.0
+323.3 131.0
+323.9 2946.0
+324.7 1471.0
+325.3 697.0
+326.3 58805.0
+327.4 10313.0
+328.3 201506.0
+329.3 24220.0
+330.1 2704.0
+331.2 1839.0
+332.7 346.0
+336.8 1967.0
+338.0 3326.0
+338.8 685.0
+339.6 883.0
+341.2 7421.0
+342.2 3988.0
+343.6 2400.0
+344.2 3949.0
+345.0 1086.0
+345.7 2.0
+346.8 4893.0
+349.1 868.0
+349.9 323.0
+351.6 4470.0
+352.4 2376.0
+353.8 3474.0
+354.7 3694.0
+355.4 7860.0
+356.8 1184.0
+358.3 2436.0
+359.9 13481.0
+360.9 2890.0
+361.5 1709.0
+364.0 2923.0
+366.6 2571.0
+368.2 244.0
+369.3 2484.0
+370.4 352.0
+371.0 1782.0
+372.7 521.0
+377.4 1164.0
+378.5 1719.0
+381.4 7365.0
+382.6 3990.0
+383.4 14808.0
+384.5 4462.0
+385.2 1096.0
+386.9 615.0
+387.6 467.0
+388.6 2074.0
+389.3 24845.0
+389.9 26728.0
+390.8 9291.0
+391.9 3082.0
+393.0 1749.0
+394.2 695.0
+395.2 11145.0
+396.2 6669.0
+397.3 1981.0
+398.2 3599.0
+399.4 71560.0
+400.4 17896.0
+401.5 15911.0
+402.1 5723.0
+403.2 2935.0
+405.0 469.0
+407.2 528.0
+409.3 40462.0
+410.4 24794.0
+411.4 2299.0
+413.5 9817.0
+415.1 781.0
+416.2 384.0
+420.6 1731.0
+423.0 5225.0
+424.1 3777.0
+424.8 2425.0
+425.7 5819.0
+426.6 9509.0
+427.4 122732.0
+428.4 27073.0
+429.3 3819.0
+430.0 836.0
+431.3 15878.0
+432.2 2174.0
+436.9 376.0
+438.9 1503.0
+439.5 567.0
+440.4 1113.0
+441.3 39487.0
+442.2 12254.0
+443.2 6834.0
+444.3 1103.0
+445.3 618.0
+445.9 443.0
+448.6 648.0
+449.5 2222.0
+451.9 5329.0
+452.8 196.0
+453.6 1690.0
+454.9 367.0
+456.8 1867.0
+457.9 5333.0
+459.1 2449.0
+459.8 945.0
+460.5 104166.0
+461.4 35137.0
+462.4 5056.0
+463.3 3060.0
+464.0 1.0
+465.4 664.0
+466.1 214.0
+469.2 874.0
+470.4 616.0
+471.0 914.0
+471.6 3.0
+472.4 834.0
+473.4 1376.0
+475.8 619.0
+477.6 1860.0
+479.1 721.0
+480.5 666.0
+481.3 3225.0
+481.9 689.0
+483.4 5433.0
+485.3 1301.0
+487.1 1091.0
+490.1 4757.0
+491.5 3521.0
+492.5 620.0
+494.6 2030.0
+495.3 904.0
+496.3 7989.0
+497.6 1064.0
+498.4 5330.0
+499.6 1663.0
+502.4 260.0
+506.4 1829.0
+507.2 7189.0
+508.2 2316.0
+510.2 9488.0
+510.9 848.0
+513.5 1914.0
+514.6 2855.0
+516.6 6005.0
+517.2 3.0
+518.4 495.0
+520.4 2369.0
+521.4 614.0
+522.5 561.0
+524.4 15777.0
+525.6 18934.0
+526.3 4493.0
+527.6 1775.0
+528.3 4833.0
+529.3 1655.0
+530.8 2470.0
+531.5 727.0
+533.2 560.0
+534.6 4220.0
+535.4 1943.0
+536.3 1612.0
+537.5 986.0
+538.4 5951.0
+540.1 880.0
+541.0 228.0
+542.4 98225.0
+543.5 32438.0
+544.4 13297.0
+545.2 2967.0
+546.4 1140.0
+548.3 1994.0
+550.5 2290.0
+551.2 937.0
+552.3 16328.0
+553.4 11545.0
+555.0 5233.0
+556.4 30260.0
+557.3 12134.0
+558.4 8438.0
+559.4 6297.0
+560.7 1253.0
+561.4 513.0
+562.7 3151.0
+564.1 14483.0
+564.9 12193.0
+566.1 740.0
+566.8 1153.0
+567.7 1208.0
+570.2 20676.0
+571.4 38175.0
+573.2 922378.0
+574.3 64579.0
+575.6 17692.0
+577.0 7849.0
+578.1 6517.0
+578.9 3603.0
+579.9 622.0
+582.2 1268.0
+583.6 7853.0
+584.4 4915.0
+586.2 2267.0
+587.6 1651.0
+588.4 704.0
+589.4 207345.0
+590.5 50029.0
+591.4 20022.0
+592.3 9965.0
+593.1 10946.0
+594.1 3189.0
+595.2 14777.0
+596.4 15599.0
+597.2 4783.0
+598.5 1638.0
+600.5 7628.0
+601.5 16610.0
+602.3 4824.0
+603.3 2565.0
+603.9 3933.0
+605.5 22455.0
+606.4 22129.0
+607.4 5360.0
+609.1 1626.0
+610.2 4676.0
+611.2 2405.0
+614.7 84840.0
+615.6 26879.0
+616.3 7161.0
+617.2 616.0
+618.2 788.0
+618.8 697.0
+622.7 388.0
+623.4 2778.0
+627.1 456.0
+627.8 252.0
+628.8 1222.0
+631.6 364.0
+633.8 632.0
+636.8 548.0
+639.3 27398.0
+640.7 29601.0
+641.4 29635.0
+642.3 13469.0
+643.3 8764.0
+645.0 3785.0
+646.1 5966.0
+647.6 1371.0
+648.5 533.0
+651.5 1833.0
+652.3 1206.0
+653.0 1054.0
+654.8 1581.0
+655.6 2326.0
+656.3 3439.0
+657.3 122913.0
+658.4 31026.0
+659.2 10217.0
+660.3 2740.0
+662.1 784.0
+663.6 913.0
+664.2 1764.0
+665.3 1654.0
+666.3 2418.0
+667.3 7798.0
+668.2 4867.0
+669.8 368.0
+671.2 3693.0
+672.9 8019.0
+674.2 5211.0
+675.5 4863.0
+676.5 662.0
+679.8 228.0
+685.1 4815.0
+686.3 12999.0
+687.3 8808.0
+689.6 409.0
+692.5 2969.0
+694.5 2059.0
+696.3 8599.0
+697.0 94.0
+698.3 904.0
+699.2 1097.0
+701.6 580.0
+703.6 775.0
+704.4 243550.0
+705.4 96896.0
+706.4 17140.0
+707.1 1643.0
+708.4 741.0
+710.5 73.0
+714.3 846.0
+715.3 1269.0
+718.9 1477.0
+720.4 1663.0
+724.4 5125.0
+726.2 4304.0
+727.2 4928.0
+728.4 2899.0
+731.9 1261.0
+736.4 6155.0
+737.4 3929.0
+738.5 5115.0
+739.6 8838.0
+741.1 1406.0
+742.3 6696.0
+743.5 3155.0
+744.2 5402.0
+744.9 447.0
+749.1 554.0
+750.2 2775.0
+750.8 219.0
+751.6 220.0
+752.3 2305.0
+753.2 1456.0
+754.2 26266.0
+755.3 7662.0
+756.4 11450.0
+757.3 4288.0
+758.3 6443.0
+759.5 2811.0
+760.6 400.0
+761.3 879.0
+763.2 902.0
+764.5 280.0
+765.1 2227.0
+765.7 563.0
+766.4 340.0
+768.3 19166.0
+769.3 12947.0
+770.5 4653.0
+771.6 857.0
+772.4 15226.0
+773.5 2899.0
+774.2 4345.0
+775.4 1001.0
+776.4 1001.0
+777.1 1641.0
+777.9 181.0
+778.6 1934.0
+780.3 5930.0
+781.3 744.0
+783.1 1663.0
+783.8 930.0
+784.4 1247.0
+785.4 3225.0
+786.2 31474.0
+787.2 13146.0
+788.2 5296.0
+789.2 2522.0
+790.4 3731.0
+791.4 451.0
+793.6 790.0
+794.5 1402.0
+796.5 2247.0
+797.4 3.0
+799.1 327.0
+800.3 627.0
+801.4 16551.0
+802.4 8007.0
+803.2 7782.0
+806.4 941.0
+808.7 579.0
+810.1 1.0
+811.1 1253.0
+812.3 3592.0
+813.3 1521.0
+814.3 1835.0
+817.5 660.0
+818.7 37447.0
+819.4 918482.0
+820.4 386567.0
+821.4 90417.0
+822.2 3315.0
+823.3 6906.0
+823.9 1302.0
+826.0 1438.0
+827.8 2293.0
+828.5 643.0
+829.3 6143.0
+830.2 4036.0
+831.6 3486.0
+833.6 1014.0
+834.3 2026.0
+836.5 419.0
+837.3 1570.0
+838.0 2638.0
+839.2 32613.0
+840.3 13179.0
+841.1 5162.0
+842.2 1312.0
+844.5 1003.0
+847.4 3049.0
+848.8 1749.0
+849.6 6795.0
+851.3 7103.0
+852.2 5417.0
+854.2 2308.0
+855.3 5195.0
+856.6 573.0
+857.2 45461.0
+858.3 10102.0
+859.4 9343.0
+862.5 959.0
+864.0 1568.0
+865.0 1988.0
+866.2 605.0
+867.3 2921.0
+869.2 10174.0
+870.3 12332.0
+871.3 5745.0
+873.2 3474.0
+874.6 977.0
+875.3 4191.0
+879.5 7.0
+880.9 4224.0
+882.4 542.0
+883.3 2319.0
+884.2 4028.0
+885.2 1009.0
+886.2 538.0
+889.7 1025.0
+891.1 449.0
+893.6 447.0
+894.4 2641.0
+899.4 389.0
+900.5 9945.0
+901.4 11504.0
+902.4 1570.0
+903.4 1926.0
+907.2 350.0
+910.5 1673.0
+913.3 456.0
+915.3 274.0
+916.4 324.0
+917.4 1885.0
+918.4 666583.0
+919.3 254572.0
+920.3 62927.0
+921.5 325.0
+924.4 5932.0
+925.2 624.0
+926.4 2411.0
+927.2 2950.0
+928.4 5990.0
+930.5 1654.0
+931.5 2675.0
+935.2 2631.0
+940.4 658.0
+941.4 1188.0
+942.4 4579.0
+943.6 2123.0
+944.3 757.0
+948.6 539.0
+949.6 299.0
+951.5 3822.0
+952.4 24010.0
+953.3 10559.0
+954.3 10425.0
+955.4 2010.0
+956.4 306.0
+957.2 1884.0
+958.3 1589.0
+960.2 447.0
+962.4 821.0
+963.8 333.0
+967.4 2729.0
+968.4 530.0
+969.4 1495.0
+970.4 34124.0
+971.3 3452.0
+972.0 671.0
+980.1 2757.0
+981.3 3481.0
+983.4 72.0
+984.7 522.0
+986.3 4479.0
+987.4 1828.0
+988.1 1.0
+991.7 5449.0
+992.4 42.0
+993.1 264.0
+994.0 443.0
+996.2 1148.0
+998.4 8650.0
+999.4 5547.0
+1000.5 5606.0
+1001.8 560.0
+1002.9 438.0
+1003.6 1345.0
+1005.3 220.0
+1010.0 612.0
+1012.3 6434.0
+1013.9 337.0
+1015.2 4672.0
+1016.6 2412.0
+1018.0 5252.0
+1019.1 482.0
+1024.4 274.0
+1027.2 2816.0
+1028.2 883.0
+1029.4 89879.0
+1030.4 49686.0
+1031.5 9081.0
+1036.5 347.0
+1037.6 3069.0
+1038.4 129.0
+1039.4 2056.0
+1043.7 482.0
+1044.6 440.0
+1045.5 943.0
+1046.6 5299.0
+1047.3 454009.0
+1048.3 219202.0
+1049.4 48203.0
+1050.3 1679.0
+1053.9 1286.0
+1055.4 3598.0
+1056.4 1977.0
+1057.5 9525.0
+1058.4 2978.0
+1059.4 1722.0
+1063.3 2913.0
+1064.0 1416.0
+1065.2 580.0
+1069.5 2801.0
+1071.3 4739.0
+1071.9 1412.0
+1073.2 1257.0
+1076.0 632.0
+1077.0 676.0
+1079.7 438.0
+1081.3 31033.0
+1082.2 18537.0
+1083.2 4449.0
+1083.9 1.0
+1086.8 191.0
+1087.7 572.0
+1088.9 801.0
+1092.1 323.0
+1094.4 600.0
+1095.5 330.0
+1096.7 571.0
+1098.3 4122.0
+1099.2 62769.0
+1100.3 22410.0
+1101.2 11433.0
+1103.6 933.0
+1104.8 640.0
+1108.5 831.0
+1112.5 2246.0
+1113.8 939.0
+1115.2 1044.0
+1117.0 8540.0
+1118.1 3714.0
+1119.4 5337.0
+1122.9 193.0
+1123.7 400.0
+1124.4 321.0
+1126.2 1557.0
+1128.0 484.0
+1128.8 1709.0
+1130.9 287.0
+1134.0 439.0
+1136.1 237.0
+1139.3 561.0
+1140.6 2492.0
+1141.3 2847.0
+1142.3 287.0
+1144.4 30410.0
+1145.3 23573.0
+1146.0 871.0
+1146.7 3857.0
+1147.4 162.0
+1151.8 54.0
+1155.7 374.0
+1158.2 2996.0
+1159.6 833.0
+1160.4 304.0
+1162.5 304.0
+1163.5 732.0
+1168.8 180.0
+1173.1 1614.0
+1175.4 376.0
+1176.7 99.0
+1178.2 357.0
+1180.4 110.0
+1182.6 573.0
+1185.4 60.0
+1189.5 446.0
+1197.0 458.0
+1207.7 621.0
+1211.0 484.0
+1213.8 501.0
+1217.6 518.0
+1222.2 660.0
+1224.3 438.0
+1227.1 765.0
+1231.5 433.0
+1232.5 359.0
+1234.7 82.0
+1235.9 458.0
+1242.5 175.0
+1243.8 688.0
+1247.8 224.0
+1250.3 874.0
+1254.7 314.0
+1255.3 389.0
+1259.4 110.0
+1262.1 335.0
+1269.9 241.0
+1274.6 797.0
+1277.7 471.0
+1280.2 220.0
+1296.2 2069.0
+1300.4 188.0
+1301.1 437.0
+1305.5 209.0
+1306.9 561.0
+1309.1 308.0
+1310.4 604.0
+1311.1 327.0
+1313.2 470.0
+1319.9 518.0
+1322.7 366.0
+1324.4 72.0
+1334.6 261.0
+1336.5 268.0
+1344.3 1447.0
+1346.6 401.0
+1347.6 540.0
+1351.2 192.0
+1353.1 413.0
+1353.7 304.0
+1354.5 446.0
+1359.0 531.0
+1364.0 224.0
+1368.8 268.0
+1375.8 173.0
+1383.9 290.0
+1387.2 535.0
+1389.0 506.0
+1391.9 302.0
+1401.4 609.0
+1405.7 484.0
+1409.8 344.0
+1415.2 493.0
+1422.7 224.0
+1423.4 414.0
+1424.9 617.0
+1425.7 374.0
+1427.7 82.0
+1429.5 339.0
+1431.6 747.0
+1434.2 382.0
+1435.4 453.0
+1437.5 912.0
+1440.2 713.0
+1441.3 244.0
+1443.5 994.0
+1447.6 344.0
+1448.9 263.0
+1451.8 191.0
+1453.7 342.0
+1454.7 342.0
+1464.1 174.0
+1464.9 265.0
+1465.6 181.0
+1473.2 192.0
+1479.0 626.0
+1483.8 28.0
+1485.2 398.0
+1486.0 632.0
+1489.9 780.0
+1492.7 518.0
+1494.8 268.0
+1497.9 564.0
+1502.3 1218.0
+1504.2 1109.0
+1505.8 382.0
+1506.8 936.0
+1509.4 310.0
+1510.3 116.0
+1512.3 193.0
+1514.1 1031.0
+1517.3 851.0
+1518.0 783.0
+1519.8 191.0
+1523.0 558.0
+1526.3 285.0
+1527.6 458.0
+1533.2 163.0
+1542.7 1083.0
+1552.7 632.0
+1565.2 250.0
+1567.6 192.0
+1568.5 163.0
+1570.7 290.0
+1571.3 339.0
+1572.3 909.0
+1577.3 495.0
+1580.9 285.0
+1582.4 312.0
+1591.8 995.0
+1593.6 261.0
+1594.8 128.0
+1597.8 330.0
+1602.3 457.0
+1605.8 211.0
+1612.3 561.0
+1613.4 752.0
+1615.8 330.0
+1620.6 285.0
+1621.7 270.0
+1623.5 1155.0
+1624.2 537.0
+1625.2 909.0
+1628.4 63.0
+1630.3 453.0
+1631.5 433.0
+1632.6 535.0
+1636.6 280.0
+1640.8 211.0
+1641.6 301.0
+1642.6 695.0
+1645.6 377.0
+1652.3 413.0
+1656.3 758.0
+1661.2 927.0
+1662.0 245.0
+1663.7 362.0
+1665.2 373.0
+1677.2 95.0
+1684.7 422.0
+1685.7 397.0
+1686.4 1.0
+1687.5 236.0
+1688.3 258.0
+1690.3 473.0
+1695.2 459.0
+1700.6 369.0
+1701.5 1084.0
+1703.3 376.0
+1705.2 1732.0
+1706.5 344.0
+1714.7 234.0
+1719.0 350.0
+1721.6 398.0
+1730.1 51.0
+1734.0 468.0
+1736.4 372.0
+1743.4 549.0
+1744.5 578.0
+1747.0 447.0
+1749.6 870.0
+1750.7 356.0
+1756.7 1002.0
+1768.3 398.0
+1770.3 304.0
+1771.3 619.0
+1774.0 887.0
+1775.1 225.0
+1777.6 376.0
+1780.2 891.0
+1786.0 309.0
+1790.7 245.0
+1792.3 261.0
+1794.4 162.0
+1795.7 468.0
+1797.6 179.0
+1805.0 387.0
+1808.7 2.0
+1811.7 384.0
+1812.9 884.0
+1829.5 711.0
+1831.6 535.0
+1833.6 535.0
+1837.0 947.0
+1838.0 162.0
+1838.7 372.0
+1841.9 737.0
+1845.5 234.0
+1849.7 270.0
+1856.3 352.0
+1857.3 424.0
+1859.4 335.0
+1867.1 136.0
+1868.7 249.0
+1870.3 654.0
+1880.5 359.0
diff --git a/SystemTest/TestSpectrum.dta b/SystemTest/TestSpectrum.dta
new file mode 100644
index 0000000..d6fcdae
--- /dev/null
+++ b/SystemTest/TestSpectrum.dta
@@ -0,0 +1,131 @@
+873.95 2
+129.2 4817117.0
+129.9 96.0
+147.2 448662.0
+148.0 117858.0
+183.1 1749307.0
+183.7 8.0
+224.0 33559.0
+226.2 147973.0
+228.1 4392773.0
+228.7 38.0
+229.1 277275.0
+240.1 203282.0
+244.2 1289841.0
+244.9 31.0
+277.1 79783.0
+285.9 308883.0
+286.2 127712.0
+288.1 161002.0
+297.0 100733.0
+297.3 96080.0
+298.9 70539.0
+308.2 62321.0
+315.2 1303618.0
+315.4 564476.0
+316.3 101625.0
+322.0 49709.0
+322.3 136818.0
+326.3 122342.0
+327.0 73412.0
+339.2 293492.0
+339.3 218142.0
+339.9 39.0
+346.4 74730.0
+357.3 2712944.0
+358.0 14.0
+358.1 17.0
+364.0 162256.0
+368.3 55444.0
+378.7 58351.0
+379.6 232041.0
+379.7 436209.0
+380.3 91029.0
+385.3 210606.0
+385.4 246725.0
+386.6 122039.0
+387.9 10579208.0
+388.8 724544.0
+392.8 79137.0
+393.1 115509.0
+396.2 38927.0
+402.9 79175.0
+403.9 65971.0
+406.6 47412.0
+407.3 95341.0
+413.8 114220.0
+416.7 32485.0
+417.5 226840.0
+418.3 79899.0
+419.8 84798.0
+420.0 412990.0
+420.5 232510.0
+420.6 25.0
+422.9 170358.0
+423.6 65648.0
+425.3 186123.0
+427.6 131170.0
+428.5 3030179.0
+429.4 560828.0
+430.1 9.0
+431.2 174309.0
+446.4 682814.0
+447.3 92575.0
+455.2 170323.0
+485.3 67081.0
+488.1 150338.0
+488.2 74141.0
+488.8 106.0
+501.2 62882.0
+513.4 205403.0
+514.2 5.0
+517.2 6485638.0
+518.2 493244.0
+518.3 790176.0
+519.2 146916.0
+519.5 52953.0
+531.0 77063.0
+541.0 133883.0
+541.2 62920.0
+542.2 382995.0
+559.2 1572418.0
+560.0 140280.0
+560.1 19.0
+560.6 4.0
+586.5 74840.0
+604.4 67209.0
+612.4 274635.0
+613.2 44321.0
+614.1 331074.0
+628.2 736475.0
+629.3 27633.0
+629.4 194441.0
+630.3 4750656.0
+631.3 1040444.0
+631.9 119229.0
+632.4 186042.0
+645.5 180598.0
+646.2 28936448.0
+646.3 15599104.0
+647.2 5003226.0
+647.3 8997120.0
+648.2 3226851.0
+648.3 1537127.0
+649.0 179994.0
+650.8 103859.0
+656.4 130084.0
+695.4 164154.0
+727.3 188009.0
+728.3 45501.0
+729.3 82736.0
+729.5 101977.0
+745.0 69734.0
+745.3 627959.0
+745.9 95202.0
+747.5 311995.0
+756.3 87195.0
+774.2 245314.0
+774.4 578725.0
+774.8 111.0
+775.4 366855.0
+776.6 73345.0
diff --git a/SystemTest/Yeast.ms2 b/SystemTest/Yeast.ms2
new file mode 100644
index 0000000..f9eeb80
--- /dev/null
+++ b/SystemTest/Yeast.ms2
@@ -0,0 +1,1149 @@
+
+S 006099 006099 641.99
+Z 2 1282.97
+Z 3 1923.95
+186.4 35.9468955993652
+191.1 29.613317489624
+201.2 82.8019027709961
+204.0 167.65022277832
+206.1 36.6520576477051
+209.4 21.1239356994629
+218.4 42.0477523803711
+219.2 30.990306854248
+227.2 57.4513549804688
+228.1 22.0139961242676
+237.1 21.2120952606201
+239.1 35.2356872558594
+243.3 25.4867935180664
+246.4 122.990943908691
+251.1 49.2037315368652
+254.0 61.7847213745117
+256.9 151.22412109375
+267.0 36.8427696228027
+270.0 25.8793964385986
+272.1 642.616027832031
+272.9 13.9499454498291
+278.2 29.1785507202148
+280.1 69.7298736572266
+282.0 62.0832252502441
+283.4 41.801929473877
+285.2 85.9555892944336
+290.2 25.7369613647461
+296.2 40.9112319946289
+297.2 20.1072216033936
+298.2 73.1739807128906
+304.9 89.5611114501953
+308.3 33.9126281738281
+310.1 46.537483215332
+311.4 10.3522138595581
+315.0 56.7888412475586
+321.1 117.581436157227
+323.2 58.6494178771973
+325.1 448.190612792969
+326.3 107.182983398438
+327.4 110.143035888672
+337.0 38.5364837646484
+338.3 1098.029296875
+339.3 41.2637481689453
+340.3 11.1756210327148
+343.2 645.440063476563
+344.3 109.936744689941
+351.3 29.0518474578857
+355.2 33.4019088745117
+356.4 30.1679000854492
+360.5 26.5615749359131
+362.2 18.0467681884766
+369.1 298.878936767578
+371.1 12.5077867507935
+383.2 44.0830192565918
+384.8 188.012496948242
+389.4 11.3310432434082
+395.9 137.669387817383
+396.7 89.0941619873047
+399.1 81.6594543457031
+400.1 78.5207290649414
+401.2 25.1709098815918
+403.1 22.3223552703857
+404.2 121.889595031738
+406.9 49.677303314209
+409.1 43.7738418579102
+410.2 62.7529335021973
+412.3 12.8203449249268
+414.2 1419.58569335938
+415.3 332.163116455078
+421.2 10.6162090301514
+424.2 84.2117385864258
+426.1 168.869049072266
+427.3 95.0237503051758
+428.0 109.249137878418
+431.5 32.5814361572266
+439.2 2071.63598632813
+440.2 408.921997070313
+440.9 15.2504940032959
+443.3 28.6818408966064
+447.4 60.4266128540039
+448.4 11.2975292205811
+451.3 37.678295135498
+454.4 75.4133224487305
+456.4 76.3940124511719
+458.5 67.6319198608398
+460.1 24.6743640899658
+465.1 57.8467826843262
+470.1 91.4192962646484
+472.5 8.72362995147705
+474.2 88.6505813598633
+475.9 119.517150878906
+479.4 128.116409301758
+482.4 19.2141532897949
+483.2 48.2660903930664
+484.4 121.271522521973
+486.6 28.458251953125
+487.4 164.915145874023
+488.0 28.6708030700684
+490.2 15.172703742981
+491.2 22.9361953735352
+492.8 37.0322074890137
+493.7 35.3836669921875
+497.4 1556.947265625
+498.3 74.538215637207
+500.1 70.1072235107422
+510.2 83.919059753418
+512.6 30.4224014282227
+515.3 756.706909179688
+516.2 153.287338256836
+517.3 246.687286376953
+520.4 25.2067165374756
+522.4 16.1743965148926
+526.3 42.132453918457
+527.2 164.588928222656
+534.0 177.418548583984
+535.3 33.273811340332
+544.3 66.3913269042969
+550.6 61.2340927124023
+552.6 1630.35693359375
+553.4 501.324157714844
+555.0 365.45947265625
+556.4 164.618499755859
+558.2 11.3074045181274
+559.3 134.589447021484
+561.8 1224.5458984375
+562.4 167.068695068359
+568.1 111.532554626465
+570.7 337.444793701172
+571.3 384.015441894531
+572.3 301.724792480469
+573.3 11.5521078109741
+578.6 33.6315460205078
+580.1 635.729064941406
+581.4 13.2606468200684
+583.2 12.0318899154663
+584.0 128.675155639648
+587.4 51.7380027770996
+588.4 138.934951782227
+589.1 122.758239746094
+594.7 256.360260009766
+596.4 53.9915237426758
+597.6 709.977294921875
+598.5 426.9580078125
+599.4 68.6525573730469
+602.4 63.3077049255371
+606.3 302.083190917969
+607.7 35.7342338562012
+608.7 121.486000061035
+610.5 80.8682250976563
+611.4 148.939575195313
+612.6 185.941467285156
+613.6 25.428466796875
+614.8 113.093994140625
+615.7 59.2823028564453
+616.4 353.466247558594
+617.5 143.952346801758
+618.6 374.312347412109
+619.4 99.4176940917969
+623.7 3357.48388671875
+624.4 1359.62329101563
+625.4 245.867614746094
+626.1 157.986923217773
+627.2 519.459045410156
+629.3 19.5083637237549
+632.1 505.914093017578
+632.8 517.481567382813
+657.8 43.5045852661133
+667.5 1928.78002929688
+668.4 264.997253417969
+669.2 24.4608688354492
+673.2 124.966384887695
+679.4 107.868896484375
+683.2 14.2855787277222
+684.4 8.41092777252197
+693.4 13.253529548645
+694.7 131.752044677734
+695.7 245.257415771484
+700.1 34.7297515869141
+702.2 31.5073890686035
+704.5 53.2440452575684
+709.6 71.0768890380859
+710.3 26.4182605743408
+711.6 69.4583587646484
+712.8 223.488006591797
+713.4 398.023956298828
+714.1 42.4429244995117
+716.4 268.347808837891
+721.7 29.4761581420898
+723.7 30.2028484344482
+724.4 77.1617126464844
+727.7 23.8826656341553
+728.8 83.3675689697266
+730.9 61.7215995788574
+731.5 365.945495605469
+732.4 25.6780529022217
+733.5 86.0997161865234
+734.1 211.706298828125
+734.7 170.057266235352
+735.7 34.4355850219727
+736.3 21.3248996734619
+737.8 28.2123718261719
+740.5 64.4994964599609
+745.9 35.8622665405273
+749.3 139.081253051758
+750.2 90.3683624267578
+751.2 58.2744140625
+754.8 272.833679199219
+755.5 94.048957824707
+756.5 41.7789077758789
+758.6 54.7928771972656
+761.7 210.011077880859
+764.6 142.994476318359
+765.6 81.9965362548828
+766.7 35.577995300293
+768.5 3041.2265625
+769.6 572.0908203125
+771.3 45.8496932983398
+772.4 23.0608959197998
+773.0 127.99210357666
+774.3 116.940147399902
+782.7 57.8522033691406
+785.8 36.3534317016602
+787.4 58.3450508117676
+789.7 17.1753330230713
+801.5 156.923110961914
+808.5 93.7870178222656
+811.7 138.829620361328
+812.9 95.1789703369141
+815.9 49.2497062683105
+819.5 57.8614921569824
+823.4 146.516845703125
+825.5 110.653259277344
+826.3 847.304016113281
+827.5 261.314758300781
+833.6 313.015563964844
+834.3 75.0192184448242
+835.6 6.64657402038574
+836.5 29.6156463623047
+840.3 30.5793743133545
+844.2 826.542785644531
+845.5 95.272834777832
+851.5 9.39962959289551
+852.5 215.784210205078
+854.0 22.95090675354
+862.3 55.8864212036133
+868.8 310.330688476563
+869.5 4365.05078125
+870.6 973.645568847656
+880.7 109.654144287109
+881.6 418.934722900391
+882.5 242.965194702148
+883.5 30.5551948547363
+890.1 90.925910949707
+893.7 108.532112121582
+901.1 103.438674926758
+901.9 26.619327545166
+904.4 37.5361213684082
+909.4 71.6884689331055
+910.6 160.241668701172
+922.5 1927.29174804688
+923.5 521.376098632813
+925.3 93.7609786987305
+926.7 105.412010192871
+927.4 411.131469726563
+928.5 197.210342407227
+930.5 96.873405456543
+934.3 46.5086402893066
+940.5 2199.56323242188
+941.3 386.321563720703
+945.4 1276.38610839844
+946.4 184.895385742188
+948.4 14.1699657440186
+949.7 105.596832275391
+964.7 27.335994720459
+973.7 91.5315170288086
+978.5 37.3656005859375
+979.4 90.653694152832
+980.6 230.451599121094
+981.5 78.6788864135742
+983.5 18.730318069458
+992.2 62.2943420410156
+993.5 791.578002929688
+994.8 218.580230712891
+1011.5 1050.68334960938
+1012.4 75.6817169189453
+1023.6 21.024393081665
+1035.9 130.89958190918
+1039.5 44.0413208007813
+1040.7 45.8695411682129
+1055.3 61.0222091674805
+1073.4 7.92942237854004
+1084.7 90.3775177001953
+1090.0 212.279205322266
+1091.0 40.6307792663574
+1095.7 12.9097557067871
+1106.3 51.4776039123535
+1108.4 295.580932617188
+1109.5 21.5643367767334
+1120.1 46.7118492126465
+1122.7 48.6416053771973
+1123.7 101.746391296387
+1134.5 53.9113693237305
+1140.4 57.3741493225098
+1249.4 18.1081371307373
+
+S 004596 004596 833.57
+Z 2 1666.13
+Z 3 2498.69
+233.9 9.06327342987061
+241.1 5.11276388168335
+253.0 29.5022850036621
+255.2 13.4334888458252
+258.2 19.8327140808105
+259.2 48.1433982849121
+267.1 6.46290969848633
+271.3 16.4752368927002
+277.3 21.7187061309814
+278.3 7.87414360046387
+282.3 51.4676246643066
+283.0 54.5101699829102
+292.2 54.9369888305664
+294.4 32.9214401245117
+301.1 48.046501159668
+316.3 10.6048393249512
+317.0 30.692834854126
+329.2 21.925012588501
+329.9 34.8108901977539
+346.1 16.7230834960938
+348.3 9.27395248413086
+356.2 7.91190099716187
+357.3 70.3492050170898
+374.1 125.380523681641
+392.3 153.714630126953
+394.2 19.604907989502
+398.2 21.0054054260254
+400.5 73.8589553833008
+413.4 11.3629217147827
+417.3 126.462875366211
+422.0 14.8418464660645
+427.3 51.3735427856445
+430.3 17.6019592285156
+435.6 53.4573135375977
+441.2 43.047061920166
+445.3 144.098297119141
+450.0 28.6957931518555
+452.4 30.8523025512695
+458.5 6.60467672348022
+475.2 61.2120590209961
+479.5 10.9543313980103
+485.2 50.4099273681641
+486.1 59.972469329834
+487.3 108.26229095459
+501.6 50.5225563049316
+502.9 34.3544769287109
+521.4 51.9593124389648
+522.1 14.2406911849976
+526.0 19.6839714050293
+527.0 74.3941650390625
+528.5 28.6051902770996
+530.7 57.061164855957
+538.6 53.1338119506836
+540.1 49.4479827880859
+540.9 31.368989944458
+544.5 24.9528007507324
+555.8 11.3694086074829
+573.4 97.236328125
+574.3 45.6546630859375
+575.4 130.573364257813
+577.2 6.5386176109314
+584.6 6.45523118972778
+588.4 44.2757339477539
+592.2 92.0804977416992
+593.4 92.0999450683594
+597.2 67.4226150512695
+602.0 26.0033073425293
+615.0 86.8636474609375
+616.4 25.6954402923584
+623.3 90.7534637451172
+624.1 7.25228643417358
+629.4 8.37987899780273
+633.5 10.1180047988892
+644.4 123.277397155762
+645.5 26.6690483093262
+647.4 57.6000633239746
+651.2 37.3147773742676
+655.3 162.406112670898
+659.3 76.8048095703125
+670.5 29.8815269470215
+673.4 219.581573486328
+674.3 99.6774368286133
+681.0 64.9308395385742
+685.9 165.341735839844
+688.4 41.0708198547363
+694.9 462.405883789063
+698.0 69.1585311889648
+702.2 125.955169677734
+703.4 71.1014633178711
+704.5 33.5423851013184
+705.2 48.9885292053223
+708.1 66.7361297607422
+714.7 104.197769165039
+717.2 14.3000354766846
+721.2 252.020385742188
+722.2 77.3344573974609
+724.4 87.7767944335938
+728.2 68.5680694580078
+732.8 52.2872161865234
+744.3 391.912902832031
+745.6 184.961624145508
+746.5 156.845245361328
+750.7 657.696960449219
+751.4 18.371150970459
+755.5 40.9293518066406
+759.5 1001.6298828125
+760.3 185.179153442383
+766.2 18.4048862457275
+771.8 88.8187637329102
+774.9 77.6372375488281
+778.5 34.156867980957
+787.4 48.0149726867676
+788.5 61.3476409912109
+797.3 17.4006214141846
+798.8 68.5802688598633
+810.5 85.8645248413086
+814.7 314.213836669922
+815.4 734.52880859375
+816.4 413.814666748047
+817.6 52.8716354370117
+844.5 107.223091125488
+845.4 33.610710144043
+846.6 18.8185806274414
+847.5 7.90904998779297
+851.0 28.5967636108398
+855.0 36.3489379882813
+856.3 41.3064002990723
+874.7 9.3620433807373
+875.5 36.7420845031738
+885.6 9.04907894134521
+900.4 36.1689414978027
+901.3 21.0365180969238
+902.4 66.7550888061523
+921.3 313.239807128906
+922.2 124.673614501953
+923.7 60.0936737060547
+927.7 29.9614486694336
+931.4 43.4766387939453
+944.4 627.132751464844
+945.4 532.959594726563
+946.5 120.982429504395
+973.3 57.9656295776367
+974.4 77.9250793457031
+991.7 69.4336090087891
+992.4 226.383087158203
+993.1 84.7520141601563
+999.2 28.4588661193848
+1029.5 48.6811981201172
+1031.4 39.3587913513184
+1032.1 32.7809562683105
+1037.4 28.0909366607666
+1046.3 29.002513885498
+1056.1 65.2755584716797
+1059.4 108.616226196289
+1073.3 570.997436523438
+1074.5 197.580123901367
+1075.8 75.5585556030273
+1084.6 28.5858478546143
+1085.3 5.72312927246094
+1100.5 46.452522277832
+1117.6 42.2168426513672
+1118.5 16.8088397979736
+1121.3 52.8077545166016
+1122.3 76.6635971069336
+1127.2 30.0536308288574
+1144.5 549.320068359375
+1145.5 231.196197509766
+1146.7 32.1935005187988
+1154.5 13.3004770278931
+1157.7 11.2992610931396
+1170.4 25.4317169189453
+1173.6 25.4448413848877
+1178.7 83.2475204467773
+1179.8 60.0345001220703
+1180.5 5.71155214309692
+1189.2 38.1284255981445
+1190.2 17.0804290771484
+1191.6 53.1390533447266
+1202.7 66.1393280029297
+1228.5 24.0686721801758
+1247.4 30.144847869873
+1257.5 45.1692428588867
+1259.5 34.0688285827637
+1260.2 76.1449203491211
+1265.5 136.952331542969
+1267.3 109.77311706543
+1273.5 599.671142578125
+1274.4 558.226501464844
+1275.4 28.9847583770752
+1317.6 20.680456161499
+1336.3 143.956207275391
+1337.6 37.9440574645996
+1372.4 11.1991987228394
+1389.3 428.820587158203
+1390.7 23.7479400634766
+1429.7 21.7973117828369
+1491.7 44.3017616271973
+1517.1 23.8610935211182
+1518.7 95.5812835693359
+1519.5 7.1558256149292
+
+S 005975 005975 702.56
+Z 2 1404.11
+Z 3 2105.66
+200.2 244.198944091797
+211.2 125.478965759277
+212.2 16.577657699585
+214.3 11.4341650009155
+215.3 5.64610004425049
+223.2 63.2361526489258
+228.1 286.697509765625
+229.6 16.2009506225586
+242.2 44.9520416259766
+245.1 17.505184173584
+260.3 144.833358764648
+261.4 69.0646896362305
+275.4 19.417350769043
+288.2 128.053085327148
+298.1 19.3211917877197
+300.3 10.1043291091919
+304.2 76.6266784667969
+307.3 60.6780471801758
+308.0 13.6618480682373
+312.0 9.87532234191895
+313.4 25.3828125
+315.0 20.2992763519287
+324.3 65.1580352783203
+325.1 20.2132091522217
+327.2 12.8937082290649
+338.2 11.7464742660522
+339.4 55.8018798828125
+341.2 25.3004417419434
+342.2 67.9788513183594
+347.3 346.322814941406
+348.3 58.1488418579102
+353.3 15.0066108703613
+354.4 27.8334083557129
+356.0 9.63239669799805
+358.2 14.5823936462402
+360.5 5.89802646636963
+368.1 3.92963528633118
+372.3 7.24454307556152
+374.3 21.0096054077148
+376.4 4.21282577514648
+379.6 4.66910982131958
+384.8 11.3986129760742
+388.4 40.2899208068848
+395.9 73.3092880249023
+397.3 193.537811279297
+401.4 13.8330335617065
+406.1 7.93343830108643
+408.1 17.9111042022705
+416.5 51.3518180847168
+417.5 76.2232894897461
+418.4 256.286834716797
+419.4 68.0492095947266
+423.4 18.4184894561768
+425.3 5.02486658096313
+426.3 65.3859939575195
+438.1 47.4040946960449
+440.3 161.905990600586
+441.3 26.0683917999268
+442.2 17.5846996307373
+443.6 43.2222785949707
+455.3 59.9161911010742
+457.1 75.4026107788086
+458.1 78.7186737060547
+463.3 18.6236782073975
+467.2 10.6252880096436
+470.5 8.72642421722412
+472.8 87.0850143432617
+476.4 5.52875137329102
+478.4 37.4836158752441
+487.5 130.571670532227
+492.5 8.30630302429199
+494.3 32.3030166625977
+495.1 9.66674995422363
+497.3 3.92838454246521
+505.4 1313.36706542969
+506.5 135.740921020508
+509.1 38.0695419311523
+510.4 18.4730243682861
+511.4 121.727012634277
+514.5 78.4575042724609
+516.3 61.8367080688477
+521.2 36.5624313354492
+525.4 33.6884727478027
+528.7 11.8382091522217
+532.2 333.392181396484
+532.8 14.3780479431152
+533.5 45.6845512390137
+537.0 3.61257028579712
+545.2 34.7418174743652
+552.9 11.3466119766235
+554.1 102.469604492188
+555.1 10.6421909332275
+556.4 51.0505142211914
+559.0 20.3898468017578
+563.0 96.0205841064453
+567.3 10.91233253479
+570.2 54.6397972106934
+571.4 263.921142578125
+572.1 73.5406875610352
+573.3 68.0976104736328
+575.1 77.4662170410156
+578.0 40.3536758422852
+580.2 204.993774414063
+585.6 60.9190216064453
+588.9 1152.40795898438
+590.2 38.9507522583008
+592.1 25.5116691589355
+594.5 30.7449226379395
+600.1 65.2834320068359
+601.0 73.0758590698242
+602.8 26.2031745910645
+604.4 300.026977539063
+605.5 42.6035766601563
+606.1 29.0496406555176
+609.5 181.237121582031
+612.1 32.6494140625
+619.4 63.0686531066895
+621.1 62.0305824279785
+626.1 56.7526702880859
+628.4 150.734649658203
+629.5 49.8539390563965
+632.9 140.969299316406
+634.6 221.341583251953
+635.4 26.9136543273926
+636.6 49.7521667480469
+637.5 47.7426071166992
+641.7 266.860137939453
+643.6 53.9990005493164
+645.5 232.62663269043
+647.5 226.769683837891
+649.4 145.022598266602
+650.2 358.644439697266
+650.8 155.972473144531
+651.5 60.8240661621094
+652.8 55.4289054870605
+655.3 11.1546087265015
+659.5 117.91535949707
+661.1 103.795867919922
+662.4 76.0803756713867
+665.5 5.84557199478149
+666.9 6590.75390625
+667.8 114.863128662109
+668.5 20.6604976654053
+669.8 80.9661865234375
+670.8 129.979095458984
+671.7 74.4041366577148
+672.5 10.6992282867432
+673.4 7.63064670562744
+674.4 27.5398654937744
+675.4 166.632598876953
+676.4 495.025024414063
+677.5 230.132858276367
+679.7 42.198314666748
+680.5 38.5691337585449
+682.5 17.8973026275635
+684.0 288.871368408203
+684.9 1798.48559570313
+685.9 254.203155517578
+689.4 10.351300239563
+691.4 94.4514617919922
+693.1 53.3338851928711
+711.3 17.4954223632813
+714.3 27.8164100646973
+719.3 171.809494018555
+720.8 64.9053344726563
+723.6 18.8676643371582
+730.0 38.5443572998047
+731.1 5.20265102386475
+734.5 32.1008682250977
+737.9 156.770599365234
+738.7 73.6795349121094
+739.4 21.3797340393066
+740.5 44.2326507568359
+749.2 47.5809860229492
+756.5 85.7226104736328
+757.7 44.4652557373047
+760.3 73.0944290161133
+761.4 53.5566673278809
+765.5 56.8680610656738
+766.5 4.47359132766724
+773.5 16.1217212677002
+775.5 46.4773635864258
+778.0 42.6657257080078
+780.6 31.6064567565918
+782.3 15.7793560028076
+783.6 170.060028076172
+785.4 135.614166259766
+791.2 47.928882598877
+795.0 208.91325378418
+795.6 28.0148868560791
+798.2 62.7083740234375
+800.2 380.663665771484
+801.4 9.69822692871094
+803.8 110.207542419434
+810.9 50.1951560974121
+811.8 83.2831420898438
+812.5 36.3833160400391
+816.2 13.966290473938
+817.7 45.6291694641113
+818.3 61.1527786254883
+830.1 11.6600093841553
+833.3 260.473815917969
+834.0 12.3208589553833
+834.7 112.854782104492
+837.0 36.0131454467773
+838.7 45.579174041748
+840.2 80.4257125854492
+845.3 19.3566246032715
+847.2 46.8500747680664
+847.8 148.996017456055
+848.7 38.8970489501953
+851.9 459.620239257813
+852.7 194.229232788086
+854.9 85.1216812133789
+857.7 66.8775482177734
+863.2 48.8110694885254
+864.4 164.661315917969
+865.3 365.423797607422
+866.3 49.4286422729492
+871.4 66.1395568847656
+872.3 81.3633651733398
+873.5 13.4700765609741
+874.4 35.9890632629395
+875.5 18.2322235107422
+876.8 9.85540866851807
+882.3 313.851837158203
+883.3 87.6082000732422
+884.2 32.3517189025879
+886.7 98.698356628418
+890.2 59.7312812805176
+891.2 45.8622016906738
+891.8 22.1744136810303
+893.1 20.6718139648438
+899.3 278.485504150391
+900.3 80.6021499633789
+905.9 8.03078842163086
+910.3 28.1656703948975
+914.7 34.9159507751465
+915.7 47.9202575683594
+930.5 43.4959259033203
+934.4 41.5366744995117
+935.3 47.1219825744629
+937.9 40.6703262329102
+943.7 17.0642185211182
+944.6 85.5159759521484
+947.5 428.296264648438
+948.7 136.942932128906
+951.3 15.1743173599243
+952.3 23.6910362243652
+968.4 10.0206346511841
+969.4 181.718063354492
+970.3 29.9817886352539
+983.3 40.5213432312012
+986.7 75.0085754394531
+987.4 22.8729152679443
+988.4 89.006965637207
+989.6 69.4317779541016
+990.6 25.6456508636475
+1003.6 86.861572265625
+1004.5 50.9892883300781
+1005.4 12.2137088775635
+1015.9 22.3824424743652
+1022.3 98.5564117431641
+1023.4 229.349945068359
+1024.5 16.3501873016357
+1026.4 6.50220108032227
+1028.5 60.1939582824707
+1029.5 79.4005737304688
+1030.5 36.3758544921875
+1035.6 25.5759105682373
+1039.6 143.11442565918
+1040.3 171.596115112305
+1041.5 73.3590621948242
+1044.8 84.3878021240234
+1045.7 22.8279666900635
+1046.4 69.415283203125
+1057.6 210.712661743164
+1058.5 44.5951919555664
+1062.5 730.289489746094
+1063.5 267.512573242188
+1064.4 10.4151649475098
+1065.5 73.1464309692383
+1076.5 21.3651866912842
+1078.5 38.1752815246582
+1082.2 222.550704956055
+1083.5 14.8064031600952
+1086.8 39.2629013061523
+1092.1 41.5884628295898
+1099.4 321.617858886719
+1101.6 20.9681720733643
+1108.6 13.6788148880005
+1116.4 47.1218719482422
+1118.0 62.8035011291504
+1118.7 25.3689498901367
+1126.5 47.0022087097168
+1129.7 16.3078765869141
+1131.0 64.7820434570313
+1137.4 27.5963973999023
+1141.4 117.911560058594
+1143.4 34.6314277648926
+1144.1 44.4455184936523
+1144.8 12.3725433349609
+1145.4 35.4225387573242
+1146.5 21.6615524291992
+1158.6 199.043838500977
+1159.5 544.716247558594
+1160.2 59.1385192871094
+1160.8 17.697847366333
+1166.4 28.6122779846191
+1176.5 3301.90576171875
+1177.5 1429.97766113281
+1194.1 81.3943939208984
+1205.2 44.0304222106934
+1211.7 22.6078281402588
+1212.4 66.2081451416016
+1218.5 209.972213745117
+1221.8 60.764289855957
+1222.7 5.46742630004883
+1223.6 31.7276344299316
+1224.7 41.3597106933594
+1228.7 90.4682006835938
+1230.4 15.1203279495239
+1239.7 110.184677124023
+1241.1 76.9298400878906
+1257.5 32.2757186889648
+1258.7 5.31881237030029
+1259.7 42.3198509216309
+1281.4 28.8357028961182
+1300.2 133.588577270508
+1301.0 21.0918998718262
+1301.8 39.0974578857422
+1372.7 5.29332780838013
+
+S 006176 006176 521.15
+Z 2 1041.29
+Z 3 1561.43
+150.0 99.5292739868164
+152.3 24.8964080810547
+155.8 207.423416137695
+164.4 73.7251739501953
+166.1 20.3963298797607
+167.4 56.4421691894531
+170.9 108.634124755859
+173.1 173.533508300781
+174.2 37.6329040527344
+178.0 75.6242446899414
+179.0 35.9586982727051
+183.2 191.816421508789
+184.3 37.8338394165039
+186.3 207.774597167969
+187.0 207.990249633789
+189.3 40.4748420715332
+189.9 81.716911315918
+193.5 127.7421875
+195.1 521.047424316406
+197.1 103.121147155762
+198.3 21.614372253418
+201.2 312.417938232422
+202.2 124.550903320313
+204.3 43.8117828369141
+207.1 132.988067626953
+208.2 686.391052246094
+209.2 114.778541564941
+214.2 164.669555664063
+215.4 25.4923419952393
+216.7 184.227172851563
+218.5 287.131683349609
+225.1 3456.8671875
+226.2 92.256477355957
+227.0 59.1759757995605
+228.3 620.92138671875
+229.2 525.492736816406
+230.6 149.691055297852
+231.3 159.055297851563
+233.2 33.5002593994141
+237.4 61.6736602783203
+240.2 74.5882263183594
+243.3 19.8281497955322
+250.2 197.245040893555
+251.2 162.863800048828
+253.4 31.8894538879395
+264.8 57.371395111084
+268.3 141.637969970703
+274.1 172.328796386719
+275.1 25.5934238433838
+279.1 180.300064086914
+280.3 137.019592285156
+281.0 160.750122070313
+283.1 61.654468536377
+283.9 43.0586090087891
+285.1 106.889259338379
+291.1 4545.6015625
+292.2 604.569396972656
+293.2 126.150260925293
+298.2 45.4005432128906
+302.2 59.6245574951172
+303.6 21.8396987915039
+308.4 155.952682495117
+309.3 110.394332885742
+311.2 64.5288696289063
+313.2 243.498977661133
+320.1 241.904708862305
+321.7 1670.7666015625
+322.3 652.284362792969
+325.6 267.42919921875
+326.9 310.211853027344
+328.1 257.203186035156
+329.4 101.441497802734
+332.2 339.5302734375
+332.8 280.295135498047
+333.9 20.0184745788574
+335.2 44.7012939453125
+338.2 2062.89501953125
+339.0 568.604553222656
+339.6 26.5377578735352
+343.4 39.5140609741211
+347.2 105.298751831055
+350.3 1040.91418457031
+351.2 45.4548950195313
+352.2 134.943908691406
+352.9 101.220916748047
+354.2 218.063583374023
+361.2 214.029571533203
+361.9 378.6259765625
+363.5 85.0887603759766
+366.6 119.213439941406
+373.8 177.017532348633
+375.3 634.314392089844
+379.4 358.606567382813
+381.1 344.165191650391
+385.4 51.2277984619141
+388.9 95.055061340332
+392.3 245.888320922852
+392.9 34.1393585205078
+395.1 80.3456802368164
+396.6 109.19938659668
+398.5 300.292816162109
+399.2 18.2609996795654
+405.8 398.365234375
+406.6 1514.87939453125
+407.3 1132.36779785156
+408.4 40.1826972961426
+409.2 44.524471282959
+410.7 95.4533462524414
+412.8 223.126602172852
+415.0 225.890472412109
+418.8 148.28759765625
+420.3 136.622268676758
+423.5 57.5325317382813
+425.8 181.402893066406
+426.5 197.004180908203
+429.5 150.588348388672
+437.5 159.617172241211
+438.9 155.705093383789
+440.0 320.820617675781
+441.4 59.1881942749023
+443.0 25.7806797027588
+444.3 257.557678222656
+452.2 488.827575683594
+453.2 122.581436157227
+454.2 89.0110778808594
+456.2 582.955322265625
+456.9 492.3125
+460.9 190.344879150391
+462.2 29.7451362609863
+463.4 166.561279296875
+464.2 342.171936035156
+465.3 28.275318145752
+468.3 724.613342285156
+469.9 317.924926757813
+473.9 520.170349121094
+476.6 106.93537902832
+478.3 254.534317016602
+479.4 258.606475830078
+481.1 429.047546386719
+483.9 419.321502685547
+485.7 182.188278198242
+487.3 71.4511795043945
+489.5 301.888153076172
+490.4 64.9893569946289
+492.0 3018.5400390625
+492.9 19.1942615509033
+496.9 55.5130004882813
+498.0 52.1472969055176
+502.6 647.311218261719
+503.4 3316.97265625
+505.4 1220.0107421875
+506.2 1401.5634765625
+509.4 1633.67236328125
+511.1 585.366455078125
+512.2 1213.20642089844
+529.0 325.823822021484
+532.0 189.374313354492
+534.0 341.669189453125
+535.2 46.2352294921875
+539.2 106.508460998535
+546.0 71.2236404418945
+551.3 2736.69946289063
+552.3 976.499267578125
+555.4 189.501678466797
+556.4 336.451629638672
+558.4 39.9683456420898
+568.1 65.471794128418
+571.5 76.160774230957
+572.5 23.0066204071045
+575.2 36.9830474853516
+577.1 226.924942016602
+578.5 191.365570068359
+579.1 286.493865966797
+581.1 367.433197021484
+582.4 133.620468139648
+586.5 248.491760253906
+588.7 362.93359375
+590.5 503.667633056641
+592.8 145.779647827148
+596.4 259.081512451172
+597.6 415.784301757813
+598.3 244.758529663086
+599.0 471.252563476563
+612.4 1846.80407714844
+613.4 75.7025833129883
+614.1 37.2765922546387
+618.7 170.722930908203
+619.6 313.047088623047
+621.4 660.327514648438
+622.2 21.7938098907471
+624.4 313.432464599609
+626.9 130.991302490234
+628.3 97.1897354125977
+632.6 191.810455322266
+634.2 85.4735488891602
+636.0 7497.3466796875
+636.9 1406.98706054688
+638.2 35.5022239685059
+639.0 115.526031494141
+639.6 33.7445449829102
+641.0 395.794189453125
+642.2 1687.37548828125
+643.4 509.403411865234
+644.5 130.244094848633
+646.3 38.5255012512207
+650.5 2003.96423339844
+651.5 757.571533203125
+652.2 152.041275024414
+656.4 34.3828544616699
+659.2 70.6767654418945
+660.1 281.738739013672
+660.9 147.216033935547
+662.6 552.89013671875
+663.8 219.385498046875
+665.5 369.198394775391
+666.7 41.4958038330078
+667.4 75.3739318847656
+669.1 9056.111328125
+670.0 1263.83227539063
+671.2 629.555541992188
+672.9 751.114501953125
+677.4 286.827606201172
+679.3 131.191223144531
+680.2 529.069458007813
+681.5 349.415435791016
+682.5 66.8927688598633
+687.7 52.2866020202637
+699.2 6555.517578125
+700.2 1734.43823242188
+701.4 363.563201904297
+708.2 1169.42895507813
+711.1 567.533569335938
+712.3 964.083862304688
+713.4 313.545013427734
+717.1 275.078643798828
+719.2 85.1127319335938
+720.5 443.043731689453
+721.6 266.181488037109
+723.5 522.390747070313
+725.6 114.072143554688
+730.7 188.755187988281
+731.3 29.9104385375977
+732.4 260.320922851563
+745.5 124.977264404297
+746.8 309.491943359375
+749.5 3493.01147460938
+750.7 388.916625976563
+751.5 303.732635498047
+755.5 214.958847045898
+763.4 291.977966308594
+764.9 97.63037109375
+767.4 61.3502807617188
+768.8 113.80931854248
+773.7 263.193145751953
+777.6 100.272850036621
+792.6 178.006988525391
+795.5 265.086547851563
+803.2 105.245651245117
+806.5 252.971466064453
+810.4 833.338012695313
+811.5 3686.44848632813
+812.6 2232.9130859375
+813.6 1352.23278808594
+814.4 90.9366302490234
+815.7 50.749153137207
+820.4 134.235275268555
+822.3 75.2090606689453
+832.0 52.4180221557617
+839.5 191.050765991211
+840.9 209.548797607422
+844.6 123.805145263672
+862.5 747.167541503906
+863.6 135.258483886719
+871.7 19.2006816864014
+872.9 101.640571594238
+882.6 270.211181640625
+892.3 112.96102142334
+893.4 102.064804077148
+894.2 228.017761230469
+895.7 68.2846908569336
+911.4 1574.09594726563
+912.6 826.477661132813
+913.7 46.9788780212402
+917.5 97.251220703125
+919.6 764.842529296875
+920.5 233.038986206055
+927.9 19.5952644348145
+931.7 88.3806304931641
+933.6 55.7731246948242
+934.6 274.783782958984
+940.4 53.4021797180176
+963.2 40.4569473266602
+1010.6 294.230255126953
+1012.9 91.7745132446289
+1020.5 73.7383041381836
+1046.6 576.338745117188
+1048.0 32.0886535644531
diff --git a/SystemTest/YeastSmall.fasta b/SystemTest/YeastSmall.fasta
new file mode 100644
index 0000000..fcb719c
--- /dev/null
+++ b/SystemTest/YeastSmall.fasta
@@ -0,0 +1,62 @@
+>YBR118W TEF2 SGDID:S0000322, Chr II from 477627-479003, Verified ORF
+MGKEKSHINVVVIGHVDSGKSTTTGHLIYKCGGIDKRTIEKFEKEAAELGKGSFKYAWVL
+DKLKAERERGITIDIALWKFETPKYQVTVIDAPGHRDFIKNMITGTSQADCAILIIAGGV
+GEFEAGISKDGQTREHALLAFTLGVRQLIVAVNKMDSVKWDESRFQEIVKETSNFIKKVG
+YNPKTVPFVPISGWNGDNMIEATTNAPWYKGWEKETKAGVVKGKTLLEAIDAIEQPSRPT
+DKPLRLPLQDVYKIGGIGTVPVGRVETGVIKPGMVVTFAPAGVTTEVKSVEMHHEQLEQG
+VPGDNVGFNVKNVSVKEIRRGNVCGDAKNDPPKGCASFNATVIVLNHPGQISAGYSPVLD
+CHTAHIACRFDELLEKNDRRSGKKLEDHPKFLKSGDAALVKFVPSKPMCVEAFSEYPPLG
+RFAVRDMRQTVAVGVIKSVDKTEKAAKVTKAAQKAAKK*
+>YER177W BMH1 SGDID:S0000979, Chr V from 545606-546409, Verified ORF
+MSTSREDSVYLAKLAEQAERYEEMVENMKTVASSGQELSVEERNLLSVAYKNVIGARRAS
+WRIVSSIEQKEESKEKSEHQVELICSYRSKIETELTKISDDILSVLDSHLIPSATTGESK
+VFYYKMKGDYHRYLAEFSSGDAREKATNASLEAYKTASEIATTELPPTHPIRLGLALNFS
+VFYYEIQNSPDKACHLAKQAFDDAIAELDTLSEESYKDSTLIMQLLRDNLTLWTSDMSES
+GQAEDQQQQQQHQQQQPPAAAEGEAPK*
+>YGL008C PMA1 SGDID:S0002976, Chr VII from 482669-479913, reverse complement, Verified ORF
+MTDTSSSSSSSSASSVSAHQPTQEKPAKTYDDAASESSDDDDIDALIEELQSNHGVDDED
+SDNDGPVAAGEARPVPEEYLQTDPSYGLTSDEVLKRRKKYGLNQMADEKESLVVKFVMFF
+VGPIQFVMEAAAILAAGLSDWVDFGVICGLLMLNAGVGFVQEFQAGSIVDELKKTLANTA
+VVIRDGQLVEIPANEVVPGDILQLEDGTVIPTDGRIVTEDCFLQIDQSAITGESLAVDKH
+YGDQTFSSSTVKRGEGFMVVTATGDNTFVGRAAALVNKAAGGQGHFTEVLNGIGIILLVL
+VIATLLLVWTACFYRTNGIVRILRYTLGITIIGVPVGLPAVVTTTMAVGAAYLAKKQAIV
+QKLSAIESLAGVEILCSDKTGTLTKNKLSLHEPYTVEGVSPDDLMLTACLAASRKKKGLD
+AIDKAFLKSLKQYPKAKDALTKYKVLEFHPFDPVSKKVTAVVESPEGERIVCVKGAPLFV
+LKTVEEDHPIPEDVHENYENKVAELASRGFRALGVARKRGEGHWEILGVMPCMDPPRDDT
+AQTVSEARHLGLRVKMLTGDAVGIAKETCRQLGLGTNIYNAERLGLGGGGDMPGSELADF
+VENADGFAEVFPQHKYRVVEILQNRGYLVAMTGDGVNDAPSLKKADTGIAVEGATDAARS
+AADIVFLAPGLSAIIDALKTSRQIFHRMYSYVVYRIALSLHLEIFLGLWIAILDNSLDID
+LIVFIAIFADVATLAIAYDNAPYSPKPVKWNLPRLWGMSIILGIVLAIGSWITLTTMFLP
+KGGIIQNFGAMNGIMFLQISLTENWLIFITRAAGPFWSSIPSWQLAGAVFAVDIIATMFT
+LFGWWSENWTDIVTVVRVWIWSIGIFCVLGGFYYEMSTSEAFDRLMNGKPMKEKKSTRSV
+EDFMAAMQRVSTQHEKET*
+>YOR230W WTM1 SGDID:S0005756, Chr XV from 770800-772113, Verified ORF
+MPKKVWKSSTPSTYEHISSLRPKFVSRVDNVLHQRKSLTFSNVVVPDKKNNTLTSSVIYS
+QGSDIYEIDFAVPLQEAASEPVKDYGDAFEGIENTSLSPKFVYQGETVSKMAYLDKTGET
+TLLSMSKNGSLAWFKEGIKVPIHIVQELMGPATSYASIHSLTRPGDLPEKDFSLAISDFG
+ISNDTETIVKSQSNGDEEDSILKIIDNAGKPGEILRTVHVPGTTVTHTVRFFDNHIFASC
+SDDNILRFWDTRTSDKPIWVLGEPKNGKLTSFDCSQVSNNLFVTGFSTGIIKLWDARAAE
+AATTDLTYRQNGEDPIQNEIANFYHAGGDSVVDVQFSATSSSEFFTVGGTGNIYHWNTDY
+SLSKYNPDDTIAPPQDATEESQTKSLRFLHKGGSRRSPKQIGRRNTAAWHPVIENLVGTV
+DDDSLVSIYKPYTEESE*
+>YBR189W RPS9B SGDID:S0000393, Chr II from 604465-604471,604885-605465, Verified ORF
+MPRAPRTYSKTYSTPKRPYESSRLDAELKLAGEFGLKNKREIYRISFQLSKIRRAARDLL
+TRDEKDPKRLFEGNALIRRLVRVGVLSEDKKKLDYVLALKVEDFLERRLQTQVYKLGLAK
+SVHHARVLITQRHIAVGKQIVNIPSFMVRLDSEKHIDFAPTSPFGGARPGRVARRNAARK
+AEASGEAAEEAEDEE*
+>YOR361C PRT1 SGDID:S0005888, Chr XV from 1017648-1015357, reverse complement, Verified ORF
+MKNFLPRTLKNIYELYFNNISVHSIVSRNTQLKRSKIIQMTTETFEDIKLEDIPVDDIDF
+SDLEEQYKVTEEFNFDQYIVVNGAPVIPSAKVPVLKKALTSLFSKAGKVVNMEFPIDEAT
+GKTKGFLFVECGSMNDAKKIIKSFHGKRLDLKHRLFLYTMKDVERYNSDDFDTEFREPDM
+PTFVPSSSLKSWLMDDKVRDQFVLQDDVKTSVFWNSMFNEEDSLVESRENWSTNYVRFSP
+KGTYLFSYHQQGVTAWGGPNFDRLRRFYHPDVRNSSVSPNEKYLVTFSTEPIIVEEDNEF
+SPFTKKNEGHQLCIWDIASGLLMATFPVIKSPYLKWPLVRWSYNDKYCARMVGDSLIVHD
+ATKNFMPLEAKALKPSGIRDFSFAPEGVKLQPFRNGDEPSVLLAYWTPETNNSACTATIA
+EVPRGRVLKTVNLVQVSNVTLHWQNQAEFLCFNVERHTKSGKTQFSNLQICRLTERDIPV
+EKVELKDSVFEFGWEPHGNRFVTISVHEVADMNYAIPANTIRFYAPETKEKTDVIKRWSL
+VKEIPKTFANTVSWSPAGRFVVVGALVGPNMRRSDLQFYDMDYPGEKNINDNNDVSASLK
+DVAHPTYSAATNITWDPSGRYVTAWSSSLKHKVEHGYKIFNIAGNLVKEDIIAGFKNFAW
+RPRPASILSNAERKKVRKNLREWSAQFEEQDAMEADTAMRDLILHQRELLKQWTEYREKI
+GQEMEKSMNFKIFDVQPEDASDDFTTIEEIVEEVLEETKEKVE*
+
+
diff --git a/TAG2.bn b/TAG2.bn
new file mode 100644
index 0000000..12370b7
Binary files /dev/null and b/TAG2.bn differ
diff --git a/TAG3.bn b/TAG3.bn
new file mode 100644
index 0000000..f547131
Binary files /dev/null and b/TAG3.bn differ
diff --git a/TagFile.c b/TagFile.c
new file mode 100644
index 0000000..5b208b4
--- /dev/null
+++ b/TagFile.c
@@ -0,0 +1,493 @@
+//Title: TagFile.c
+//Author: Ari Frank
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+//
+//TagFile.c is responsible for parsing tag files from an external tagger
+
+#include "TagFile.h"
+#include "Trie.h"
+#include "Tagger.h"
+#include <stdio.h>
+#include "Errors.h"
+
+
+// Global variable
+ExternalTagHolder *TagHolder=NULL;
+
+// Reads all the contents of an external tag file.
+// For each scan the file supplies its parent-mass tweaks and its tags.
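+//
+// The layout implied by the parsing code below is, per scan (the values shown
+// here are illustrative, not taken from a real run):
+//   <ScanNumber> <NumTags>
+//   <Charge> <ParentMass>                              -- one line per tweak, TWEAK_COUNT in total
+//   <TweakIdx>\t<Score>\t<PrefixMass>\t<TagSequence>   -- one line per tag, e.g. "5  24.407  1988.619  SQLK"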
+
+
+void ReadExternalTags(char *TagFilePath, int verbose)
+{
+ int MaxScan=0;
+ int TotalNumTags=0;
+ FILE *InputStream;
+ int GlobalTagIndex=0;
+ int i;
+
+ int LineNumber = 0;
+ char Buff[1024];
+ int ScanNumber,NumTags;
+
+ ScanTags *ThisScanTags=NULL;
+
+ int NRead=0;
+ int Charge=0;
+ float ParentMass = 0;
+
+ char* TempAA;
+ char AnnotationBuffer[256];
+ char ModBuffer[256];
+ int AminoIndex;
+ MassDelta* Delta;
+ int ModBufferPos;
+ int ModIndex;
+ TrieTag *NewTag;
+ int TweakIdx;
+ float Score,PrefixMass;
+ char TagSeq[16];
+
+ if (verbose)
+ printf("Parsing tags from: %s\n",TagFilePath);
+
+ if (! TagHolder)
+ {
+ TagHolder = (ExternalTagHolder *)malloc(sizeof(ExternalTagHolder));
+ }
+
+ InputStream=fopen(TagFilePath,"r");
+ if (! InputStream)
+ {
+ //printf("Error: couldn't read external tag file: %s\n",TagFilePath);
+ //exit(1);
+ REPORT_ERROR_S(8, TagFilePath);
+ exit(8);
+ }
+
+ // read in two passes: first determine how many scans and tags there are;
+ // in the second pass allocate the memory for all the tags
+
+ while (1)
+ {
+ i = 0;
+ if (! fgets(Buff,1024,InputStream))
+ break;
+
+ ScanNumber = 0;
+ NumTags = 0;
+
+ if (sscanf(Buff,"%d %d",&ScanNumber,&NumTags) == 2)
+ {
+ MaxScan=ScanNumber;
+ TotalNumTags+=NumTags;
+ LineNumber += 1;
+ }
+ else
+ {
+ //printf("Error parsing tag file1:\n%s\n",Buff);
+ REPORT_ERROR_IS(14,LineNumber,TagFilePath);
+ exit(14);
+ }
+
+ // skip tweaks and tags this round
+
+ for (i=0; i<TWEAK_COUNT+NumTags; i++)
+ {
+ fgets(Buff,1024,InputStream);
+ LineNumber += 1;
+ }
+ }
+ fclose(InputStream);
+
+ if (verbose)
+ printf("Allocating memory for %d scans and %d tags...\n",MaxScan+1,TotalNumTags);
+
+ TagHolder->MaxScanNumber = MaxScan;
+ TagHolder->AllScanTags = (ScanTags *)malloc((MaxScan+1)*sizeof(ScanTags));
+ TagHolder->AllExternalTrieTags = (TrieTag *)malloc(TotalNumTags*sizeof(TrieTag));
+
+ if (! TagHolder->AllScanTags || ! TagHolder->AllExternalTrieTags)
+ {
+ //printf("Error: coudln't allocate sufficient memory to store all external tags!\n");
+ REPORT_ERROR(1);
+ exit(1);
+ }
+
+ for (i=0; i<=MaxScan; i++)
+ {
+ TagHolder->AllScanTags[i].ScanNumber=i;
+ TagHolder->AllScanTags[i].NumTags=0;
+ }
+
+ // read again, this time store tags
+ InputStream=fopen(TagFilePath,"r");
+ if (! InputStream)
+ {
+ //printf("Error: couldn't read external tag file: %s\n",TagFilePath);
+ REPORT_ERROR_S(8, TagFilePath);
+ exit(8);
+ }
+
+
+ while (1)
+ {
+ ThisScanTags=NULL;
+ i = 0;
+ NRead=0;
+ ScanNumber=0;
+ NumTags=0;
+ LineNumber = 0;
+
+ if (! fgets(Buff,1024,InputStream))
+ break;
+
+ if (sscanf(Buff,"%d %d",&ScanNumber,&NumTags) != 2)
+ {
+ //printf("Error parsing tag file2: %s\n",Buff);
+ REPORT_ERROR_IS(14,LineNumber,TagFilePath);
+ exit(14);
+
+ }
+
+ // read tweaks
+ ThisScanTags = &(TagHolder->AllScanTags[ScanNumber]);
+ for (i=0; i<TWEAK_COUNT; i++)
+ {
+ Charge=0;
+ ParentMass = 0;
+
+ fgets(Buff,1024,InputStream);
+ LineNumber += 1;
+ if (sscanf(Buff,"%d %f",&Charge,&ParentMass) != 2)
+ {
+ //printf("Error parsing tag file3: %s\n",Buff);
+ //exit(1);
+ REPORT_ERROR_IS(14,LineNumber,TagFilePath);
+ exit(14);
+ }
+
+ ThisScanTags->Tweaks[i].Charge = Charge;
+ if (Charge>0)
+ {
+ ThisScanTags->Tweaks[i].ParentMass=(int)(ParentMass* MASS_SCALE + 0.5);
+ }
+ else
+ ThisScanTags->Tweaks[i].ParentMass=0;
+ }
+
+ ThisScanTags->NumTags = NumTags;
+ ThisScanTags->Tags = &(TagHolder->AllExternalTrieTags[GlobalTagIndex]);
+
+ // read tags (use Stephen's code to parse tags)
+ for (i=0; i<NumTags; i++)
+ {
+
+ NewTag = &(TagHolder->AllExternalTrieTags[GlobalTagIndex++]);
+ fgets(Buff,1024,InputStream);
+ LineNumber += 1;
+ if (sscanf(Buff,"%d\t%f\t%f\t%s",&TweakIdx,&Score,&PrefixMass,TagSeq) != 4)
+ {
+ //printf("Error parsing tag file4: %s\n",Buff);
+ //printf("Index: %d ScanNumber: %d GTI: %d\n",i,ScanNumber,GlobalTagIndex);
+ // exit(1);
+ REPORT_ERROR_IS(14,LineNumber,TagFilePath);
+ exit(14);
+ }
+
+
+ //5 24.407 1988.619 SQLK
+ memset(NewTag, 0, sizeof(TrieTag));
+ for (ModIndex=0; ModIndex<MAX_PT_MODS; ModIndex++)
+ NewTag->AminoIndex[ModIndex]=-1;
+
+ // Special code:
+ // PepNovo may include MODIFICATIONS in its tags - so, we must parse them.
+ // We assume that (a) modifications are written in the form %+d, and (b) we
+ // already know the modification type from the Inspect input file.
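+ // Illustrative example (hypothetical tag; assumes a "+80" modification was
+ // declared in the input file): the tag string "S+80QLK" parses to Tag = "SQLK"
+ // with one modification ("+80", looked up via FindPTModByName) attached at
+ // AminoIndex 0, i.e. on the serine.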
+ TempAA = TagSeq;
+ AminoIndex = 0;
+ ModBufferPos = 0;
+
+ while (*TempAA)
+ {
+ if (*TempAA >= 'A' && *TempAA <= 'Z')
+ {
+ // an amino acid - so, finish the modification-in-progress, if there is one.
+ if (ModBufferPos && AminoIndex)
+ {
+ if (NewTag->ModsUsed == MAX_PT_MODS)
+ {
+ printf("** Error tagging scan %d from file %s: Too many PTMs!\n", ScanNumber,TagFilePath);
+ break;
+ }
+ ModBuffer[ModBufferPos] = '\0';
+ Delta = FindPTModByName(NewTag->Tag[AminoIndex - 1], ModBuffer);
+ if (Delta)
+ {
+ NewTag->AminoIndex[NewTag->ModsUsed] = AminoIndex - 1;
+ NewTag->ModType[NewTag->ModsUsed] = Delta;
+ NewTag->ModsUsed++;
+ }
+ else
+ {
+ printf("** Error tagging scan %d from file %s: Modification %s not understood!\n", ScanNumber, TagFilePath, ModBuffer);
+ break;
+ }
+ }
+ ModBufferPos = 0;
+ // Add the AA:
+ NewTag->Tag[AminoIndex++] = *TempAA;
+ }// aa
+ else
+ {
+ ModBuffer[ModBufferPos++] = *TempAA;
+ } // not aa
+ TempAA++;
+ }
+ NewTag->Tag[AminoIndex] = '\0';
+ // Finish any pending mod (COPY-PASTA FROM ABOVE)
+ if (ModBufferPos && AminoIndex)
+ {
+ if (NewTag->ModsUsed == MAX_PT_MODS)
+ {
+ printf("** Error tagging scan %d from file %s: Too many PTMs!\n", ScanNumber, TagFilePath);
+ }
+ ModBuffer[ModBufferPos] = '\0';
+ Delta = FindPTModByName(NewTag->Tag[AminoIndex - 1], ModBuffer);
+ if (Delta)
+ {
+ NewTag->AminoIndex[NewTag->ModsUsed] = AminoIndex - 1;
+ NewTag->ModType[NewTag->ModsUsed] = Delta;
+ NewTag->ModsUsed++;
+ }
+ else
+ {
+ printf("** Error tagging scan %d from file %s: Modification %s not understood!\n",ScanNumber, TagFilePath, ModBuffer);
+ }
+ }
+
+ NewTag->Charge = ThisScanTags->Tweaks[TweakIdx].Charge;
+ NewTag->ParentMass = ThisScanTags->Tweaks[TweakIdx].ParentMass;
+ NewTag->PSpectrum = NULL;
+ NewTag->Tweak = ThisScanTags->Tweaks + TweakIdx;
+ NewTag->PrefixMass = (int)(PrefixMass * MASS_SCALE + 0.5);
+ NewTag->SuffixMass = NewTag->ParentMass - NewTag->PrefixMass - PARENT_MASS_BOOST;
+ NewTag->Score = Score;
+ NewTag->TagLength =0;
+
+ for (TempAA = NewTag->Tag; *TempAA; TempAA++)
+ {
+ NewTag->SuffixMass -= PeptideMass[*TempAA];
+ NewTag->TagLength++;
+ }
+
+ NewTag->Tag[NewTag->TagLength]='\0';
+
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (NewTag->AminoIndex[ModIndex] >= 0 && NewTag->ModType[ModIndex])
+ {
+ NewTag->SuffixMass -= NewTag->ModType[ModIndex]->RealDelta;
+ }
+ }
+ }
+ }
+ fclose(InputStream);
+
+ if (verbose)
+ {
+ printf("Done reading %d tags\n",GlobalTagIndex);
+ printf("Max ScanNumber with tags %d\n",TagHolder->MaxScanNumber);
+ }
+}
+
+
+void FreeExternalTagHolder()
+{
+ if (TagHolder)
+ {
+ free(TagHolder->AllScanTags);
+ free(TagHolder->AllExternalTrieTags);
+ free(TagHolder);
+ }
+}
+
+
+
+void WriteExternalTags(char *OutFile)
+{
+ int i;
+ FILE *OutStream;
+ int TweakIdx;
+ int TagIdx;
+ ScanTags *ThisScan;
+ TrieTag * Tag;
+ int Index;
+ int ModIndex;
+
+ if (! TagHolder)
+ return;
+
+ printf("Writing tags to %s..\n",OutFile);
+
+ OutStream=fopen(OutFile,"w");
+ if (! OutStream)
+ {
+ REPORT_ERROR_S(8, OutFile);
+ exit(8);
+ //printf("Error couldn't open file for writing: %s\n",OutFile);
+ //exit(1);
+ }
+
+ for (i=0; i<=TagHolder->MaxScanNumber; i++)
+ {
+
+
+ ThisScan = &(TagHolder->AllScanTags[i]);
+
+ if (ThisScan->NumTags<=0)
+ continue;
+
+ //printf("%d %d\n",i,ThisScan->NumTags);
+ fprintf(OutStream,"%d\t%d\n",i,ThisScan->NumTags);
+
+ for (TagIdx=0; TagIdx<ThisScan->NumTags; TagIdx++)
+ {
+ Tag = ThisScan->Tags + TagIdx;
+
+
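+ // One line per tag: charge, prefix mass (Da) and parent mass (Da), followed by
+ // the tag sequence with any modification names printed inline after the
+ // modified residue.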
+ fprintf(OutStream,"%d\t%.3f\t%.2f\t",Tag->Charge, (float)(Tag->PrefixMass / (float)MASS_SCALE),
+ (float)(Tag->ParentMass / (float)MASS_SCALE));
+
+ for (Index = 0; Index < Tag->TagLength; Index++)
+ {
+ //int ModIndex;
+ fprintf(OutStream,"%c", Tag->Tag[Index]);
+
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Tag->AminoIndex[ModIndex]==Index)
+ fprintf(OutStream,"%s", Tag->ModType[ModIndex]->Name);
+ }
+ }
+ fprintf(OutStream,"\n");
+ }
+ }
+
+ fclose(OutStream);
+
+ printf("Done writing tags (Max ScanNumber with tags %d)..\n",TagHolder->MaxScanNumber);
+}
+
+
+
+TrieNode *AddExternalTags(TrieNode *Root, SpectrumNode *Node)
+{
+ int ScanNumber = Node->ScanNumber;
+ MSSpectrum* Spectrum = Node->Spectrum;
+ int DuplicateFlag;
+ int NumTags;
+ int TagIdx;
+ int TweakIdx;
+ SpectrumTweak *Tweaks;
+ TrieTag * Tags;
+ TrieTag *NewTag;
+
+
+ //
+ if (!Root)
+ {
+ Root = NewTrieNode();
+ Root->FailureNode = Root;
+ }
+
+ for (TweakIdx=0; TweakIdx<TWEAK_COUNT; TweakIdx++)
+ Node->Tweaks[TweakIdx].Charge=0;
+
+ if (ScanNumber> TagHolder->MaxScanNumber)
+ return Root;
+
+ NumTags = TagHolder->AllScanTags[ScanNumber].NumTags;
+ if (NumTags<=0)
+ return Root;
+
+ Tweaks = TagHolder->AllScanTags[ScanNumber].Tweaks;
+ for (TweakIdx=0; TweakIdx<TWEAK_COUNT; TweakIdx++)
+ {
+ Node->Tweaks[TweakIdx]= Tweaks[TweakIdx];
+ }
+
+ Tags = TagHolder->AllScanTags[ScanNumber].Tags;
+
+ // Construct a root, if we don't have one already.
+ if (!Root)
+ {
+ Root = NewTrieNode();
+ Root->FailureNode = Root;
+ }
+ for (TagIdx = 0; TagIdx < NumTags; TagIdx++)
+ {
+ NewTag = Tags + TagIdx;
+ TweakIdx = 0;
+
+ NewTag->PSpectrum = Spectrum; // Add pointers from Tag to Spectrum
+
+ // make the tag point to the spectrum's Tweak in case they need to share the
+ // same information later on
+
+ for (TweakIdx=0; TweakIdx<TWEAK_COUNT; TweakIdx++)
+ {
+ if (NewTag->Tweak->Charge == Node->Tweaks[TweakIdx].Charge &&
+ NewTag->Tweak->ParentMass == Node->Tweaks[TweakIdx].ParentMass)
+ {
+ NewTag->Tweak = Node->Tweaks + TweakIdx;
+ break;
+ }
+ }
+
+ if (TweakIdx == TWEAK_COUNT)
+ {
+ printf("BAD Error: Tweak went missing?!\n");
+ exit(1);
+ }
+
+
+
+ AddTagToTrie(Root, NewTag, &DuplicateFlag);
+ }
+ //DebugPrintTrieTags(Root);
+ return Root;
+}
+
+
+
+
diff --git a/TagFile.h b/TagFile.h
new file mode 100644
index 0000000..204dca5
--- /dev/null
+++ b/TagFile.h
@@ -0,0 +1,67 @@
+//Title: TagFile.h
+//Author: Ari Frank
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef __TAGFILE_H__
+#define __TAGFILE_H__
+
+#include "Trie.h"
+#include "Inspect.h"
+
+typedef struct ScanTags {
+ int ScanNumber;
+ int NumTags;
+
+ SpectrumTweak Tweaks[TWEAK_COUNT];
+ TrieTag *Tags;
+} ScanTags;
+
+
+typedef struct ExternalTagHolder {
+
+ int MaxScanNumber;
+
+ ScanTags *AllScanTags;
+
+ TrieTag *AllExternalTrieTags; // one allocation for all tags
+} ExternalTagHolder;
+
+TrieNode *AddExternalTags(TrieNode *Root, SpectrumNode *Spectrum);
+
+void ReadExternalTags(char *TagFilePath, int verbose);
+
+void FreeExternalTagHolder();
+
+void WriteExternalTags( char *OutFile);
+
+
+#endif
+
diff --git a/TagSkewScores.dat b/TagSkewScores.dat
new file mode 100644
index 0000000..9291304
Binary files /dev/null and b/TagSkewScores.dat differ
diff --git a/Tagger.c b/Tagger.c
new file mode 100644
index 0000000..e581ea4
--- /dev/null
+++ b/Tagger.c
@@ -0,0 +1,2148 @@
+//Title: Tagger.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+// Tag generation. Given a spectrum, generate TrieTag objects.
+// See TagTrainer.py for the generation of the tagging model based on
+// empirical ion frequencies.
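+// In outline (as reflected in the code below): every spectral peak contributes
+// candidate prefix-residue-mass (PRM) nodes (one b and one y interpretation),
+// nodes are linked by edges whose mass difference matches an amino acid (or a
+// modified amino acid), and high-scoring paths through the resulting graph are
+// emitted as TrieTag objects.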
+
+#include "CMemLeak.h"
+#include "Inspect.h"
+#include "Utils.h"
+#include "Tagger.h"
+#include "Spectrum.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "Trie.h"
+#include "Mods.h"
+#include "Score.h"
+#include "FreeMod.h"
+#include "BN.h"
+#include "Scorpion.h"
+#include "Errors.h"
+
+// For each mass (in daltons), we have an expected ratio of the +1 isotope to
+// the main peak. These ratios rise from near-0 to near 1 as mass goes from 0
+// up to 1750. (For larger masses, isotope sets are more complex!)
+#define ISOTOPE_WEIGHT_COUNT 1750
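+// (For example, a fragment near 1000 Da contains roughly 45 carbon atoms, so its
+// +1 isotope peak is expected at roughly half the height of its monoisotopic
+// peak; the exact expected ratios live in the IsotopeWeights array below.)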
+
+// How far can the isotope-peak-to-main-peak ratio differ from what we expect?
+// (This controls whether the members Is and HasPlausibleIsotopicPeak are set
+// for spectral peaks. In practice, isotope ratios can vary quite a bit, so
+// we're fairly permissive)
+#define TIGHT_ISOTOPE_RATIO_DIFFERENCE 0.5
+#define MAX_ISOTOPE_RATIO_DIFFERENCE 0.8
+
+// How many intensity-ranks do we track? (We're granular for top-10 peaks, then less so
+// for crappier peaks)
+#define INTENSITY_RANK_COUNT 22
+
+// Number of charges and sectors for which we have a tagging model. The size
+// of the array g_TruthByNodeType will equal CHARGE_COUNT * CHUNK_COUNT, similarly for other
+// model params.
+#define CHARGE_COUNT 3
+#define CHUNK_COUNT 3
+
+// JumpingHash[n] has a list of all amino acids (and modifications) that have a mass
+// rounding off to n daltons. To find, e.g., a jump matching 80.3, we'd check
+// JumpingHash[79] and JumpingHash[80] and JumpingHash[81].
+#define MAX_JUMPING_HASH 1024
+JumpNode** JumpingHash = NULL;
+
+//
+StringNode* FirstTagCheckNode;
+StringNode* LastTagCheckNode;
+
+// JumpsByAA is a 2D array containing pointers to all jumps for each amino acid.
+// We iterate over JumpsByAA when setting jump scores.
+JumpNode** JumpsByAA; //[AMINO_ACIDS * GlobalOptions->DeltasPerAA];
+
+// Skew histo is used in scoring edges - SkewHistoStep[n] is the score penalty we
+// apply to an edge that deviates from the ideal jump size by floor(n/100).
+// Hard-coded, and should probably be part of the tagging model!
+double SkewHistoStep[] = {0.0000, 0.0000, -0.0037, -0.0501, -0.1061, -0.2106, -0.2418,
+ -0.3466, -0.4198, -0.4800, -0.4861, -0.4863, -0.4926, -0.5421, -0.5893, -0.6851,
+ -0.7173, -0.8108, -0.8811, -0.9275, -0.9302, -0.9302, -0.9353, -0.9802, -1.0259,
+ -1.1027, -1.1395, -1.2253, -1.2640, -1.2921, -1.2931, -1.2931, -1.2960, -1.3207,
+ -1.3587, -1.4249, -1.4567, -1.5158, -1.5444, -1.5696, -1.5710, -1.5710, -1.5710,
+ -1.5989, -1.6153, -1.6641, -1.6884, -1.7345, -1.7544, -1.7708, -1.7714, -1.7714,
+ -1.7714, -1.7903, -1.8150, -1.8556, -1.8819, -1.9274, -1.9462, -1.9716, -1.9723,
+ -1.9723, -1.9737, -1.9976, -2.0244, -2.0640, -2.0957, -2.1574, -2.1966, -2.2151,
+ -2.2151, -2.2151, -2.2178, -2.2512, -2.2848, -2.3753, -2.4070, -2.4806, -2.5463,
+ -2.5879, -2.5891, -2.5891, -2.5917, -2.6571, -2.6951, -2.8598, -2.9352, -3.0991,
+ -3.1702, -3.2468, -3.2493, -3.2493, -3.2669, -3.4412, -3.5462, -3.9350, -4.1434,
+ -4.8738, -5.7776, -6.5501};
+
+// IsotopeWeights[n] is the expected ratio of the +1 isotope to the +0 isotope, given
+// a peptide whose weight is n daltons.
+float IsotopeWeights[ISOTOPE_WEIGHT_COUNT];
+
+// Forward declarations:
+void TagGraphAddEndpointNodes(TagGraph* Graph, MSSpectrum* Spectrum);
+void DebugPrintTagList(MSSpectrum* Spectrum, TrieTag* Tags, int TagCount);
+void PrintTagToLine(FILE* OutputFile, TrieTag* Tag);
+void DebugPrintTagGraph(MSSpectrum* Spectrum, TagGraph* Graph);
+void DebugPrintTagsForPeptide(MSSpectrum* Spectrum, TagGraph* Graph, TrieTag* Tags, int TagCount);
+TrieTag* TagGraphGenerateTags(TagGraph* Graph, MSSpectrum* Spectrum, int* TagCount,
+ int MaximumTagCount, SpectrumTweak* Tweak, float TagEdgeScoreMultiplier,
+ PRMBayesianModel* Model);
+
+
+// New (as of 3/2005) tagging model. We load one model in for each NodeType. (A NodeType is a combination of charge
+// and sector - e.g. charge 3+ and middle sector). This model is used in scoring tag graph nodes, and in scoring
+// PRMs.
+
+#define BY_RANK_TINY 20
+#define BY_RANK_MISSING 21
+#define BY_RANK_COUNT 22
+#define BY_SKEW_COUNT 5
+typedef struct TaggingModel
+{
+ int BRank[BY_RANK_COUNT];
+ int SisterBRank[BY_RANK_COUNT];
+ int SisterBSkew[BY_SKEW_COUNT];
+ int BSkew[BY_SKEW_COUNT];
+ int SkewableBRank[BY_RANK_COUNT];
+ int YRank[BY_RANK_COUNT];
+ int SisterYRank[BY_RANK_COUNT];
+ int SisterYSkew[BY_SKEW_COUNT];
+ int YSkew[BY_SKEW_COUNT];
+ int SkewableYRank[BY_RANK_COUNT];
+ int Witness[512];
+ int BIsotope[4];
+ int YIsotope[4];
+} TaggingModel;
+
+typedef struct TagMaster
+{
+ TaggingModel Models[CHARGE_COUNT * CHUNK_COUNT];
+ float PTMPenalty;
+} TagMaster;
+
+TagMaster MasterTaggingModel;
+
+// Constructor for TagGraph
+TagGraph* ConstructTagGraph(MSSpectrum* Spectrum)
+{
+ TagGraph* Graph;
+ Graph = (TagGraph*)calloc(1, sizeof(TagGraph));
+ return Graph;
+}
+
+// Destructor for a node from a TagGraph, as well as the node's edges.
+void FreeTagGraphNode(TagGraphNode* Node)
+{
+ TagGraphEdge* Edge;
+ TagGraphEdge* PrevEdge = NULL;
+ if (!Node)
+ {
+ return;
+ }
+ for (Edge = Node->FirstEdge; Edge; Edge = Edge->Next)
+ {
+ SafeFree(PrevEdge);
+ PrevEdge = Edge;
+ }
+ SafeFree(PrevEdge);
+ // Back edges:
+ SafeFree(Node->BackEdge);
+ SafeFree(Node->BackEdgeDouble);
+ SafeFree(Node->BackEdgeTriple);
+ SafeFree(Node);
+}
+
+// Destructor for a TagGraph.
+void FreeTagGraph(TagGraph* Graph)
+{
+ TagGraphNode* TagNode;
+ TagGraphNode* PrevTagNode = NULL;
+
+ //
+ if (!Graph)
+ {
+ return;
+ }
+ SafeFree(Graph->BackEdgeBuffer);
+ for (TagNode = Graph->FirstNode; TagNode; TagNode = TagNode->Next)
+ {
+ if (PrevTagNode)
+ {
+ FreeTagGraphNode(PrevTagNode);
+ }
+ PrevTagNode = TagNode;
+ }
+ if (PrevTagNode)
+ {
+ FreeTagGraphNode(PrevTagNode);
+ }
+ SafeFree(Graph->NodeIndex);
+ Graph->FirstNode = NULL;
+ Graph->LastNode = NULL;
+ SafeFree(Graph);
+}
+
+// Somewhat ugly macro for inserting a tag graph node into the list.
+// (Note that since we're not inserting phosphate loss peaks, this is
+// overkill - we will always be inserting at the end of the list in practice!)
+#define INSERT_TAGNODE_ASC(First, Last, Node)\
+{\
+ InsertAfter = (Last); \
+ while ((InsertAfter) && (InsertAfter)->Mass > (Node)->Mass) \
+ { \
+ (InsertAfter) = (InsertAfter)->Prev; \
+ } \
+ if (InsertAfter) \
+ { \
+ if ((InsertAfter)->Next) \
+ { \
+ InsertAfter->Next->Prev = Node; \
+ } \
+ Node->Next = InsertAfter->Next; \
+ InsertAfter->Next = Node; \
+ Node->Prev = InsertAfter; \
+ } \
+ else \
+ { \
+ Node->Next = First; \
+ if (First)\
+ { \
+ First->Prev = Node; \
+ } \
+ First = Node; \
+ } \
+ if (InsertAfter == Last) \
+ { \
+ Last = Node; \
+ } \
+}
+
+#define INSERT_TAGNODE_DESC(First, Last, Node)\
+{\
+InsertAfter = Last;\
+while (InsertAfter && InsertAfter->Mass < Node->Mass)\
+{\
+ InsertAfter = InsertAfter->Prev;\
+}\
+if (InsertAfter)\
+{\
+ if (InsertAfter->Next)\
+ {\
+ InsertAfter->Next->Prev = Node;\
+ }\
+ Node->Next = InsertAfter->Next;\
+ InsertAfter->Next = Node;\
+ Node->Prev = InsertAfter;\
+}\
+else\
+{\
+ Node->Next = First;\
+ if (First)\
+ {\
+ First->Prev = Node;\
+ }\
+ First = Node;\
+}\
+if (InsertAfter == Last)\
+{\
+ Last = Node;\
+}\
+}
+
+// Take a new (empty) tag graph, and add nodes to it. For each spectral peak, add 2 nodes (one b, one y).
+// Also add endpoint nodes.
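+// (Concretely, per the code below: a peak of mass m yields a b-interpretation
+// node at PRM m - HYDROGEN_MASS and a y-interpretation node at PRM ParentMass - m.)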
+void TagGraphAddNodes(TagGraph* Graph, MSSpectrum* Spectrum)
+{
+ int PeakIndex;
+ TagGraphNode* FirstBNode = NULL;
+ TagGraphNode* LastBNode = NULL;
+ TagGraphNode* FirstYNode = NULL;
+ TagGraphNode* LastYNode = NULL;
+ TagGraphNode* InsertAfter = NULL;
+ TagGraphNode* Node;
+ TagGraphNode* MergingBNode;
+ TagGraphNode* MergingYNode;
+ int NodeMass;
+ int NodeIndex;
+ int MinPRMMass = 50 * DALTON;
+ // Iterate over peaks. For each peak, add a b and a y node. We'll build two lists,
+ // b node list (FirstBNode / LastBNode) and y node list (FirstYNode / LastYNode), then
+ // merge the lists.
+ for (PeakIndex = 0; PeakIndex < Spectrum->PeakCount; PeakIndex++)
+ {
+ NodeMass = Spectrum->Peaks[PeakIndex].Mass - HYDROGEN_MASS;
+ //printf("Peak Index %d, NodeMass %d\n",PeakIndex,NodeMass);
+ // Filter any nodes whose mass is negative, or <50 but not zero, or larger than the precursor mass.
+ // (A node at, say, PRM 19 couldn't possibly be part of a true peptide, since the smallest mass-jump
+ // is 57)
+ if (NodeMass > -GlobalOptions->Epsilon && (NodeMass < GlobalOptions->Epsilon || NodeMass > MinPRMMass) && (NodeMass < Spectrum->ParentMass + GlobalOptions->ParentMassEpsilon))
+ {
+ Node = (TagGraphNode*)calloc(1, sizeof(TagGraphNode));
+ Node->NodeType = evGraphNodeB;
+ Node->OriginalPeakIndex = PeakIndex;
+ Node->IntensityRankB = Spectrum->Peaks[PeakIndex].IntensityRank;
+ Node->BIndex = PeakIndex;
+ Node->YIndex = -1;
+ Node->IntensityRankY = -1;
+ Node->Mass = NodeMass;
+ Node->IonTypeFlags = ION_FLAG_B;
+ INSERT_TAGNODE_ASC(FirstBNode, LastBNode, Node);
+ if (0)//(Spectrum->Charge > 2 && Spectrum->Peaks[PeakIndex].IntensityRank < 16)
+ { //charge 3 spectra have high intensity doubly-charged peaks. I need to put those into the graph
+ Node = (TagGraphNode*)calloc(1, sizeof(TagGraphNode));
+ Node->NodeType = evGraphNodeB;
+ Node->OriginalPeakIndex = PeakIndex;
+ Node->IntensityRankB = Spectrum->Peaks[PeakIndex].IntensityRank;
+ Node->BIndex = PeakIndex;
+ Node->YIndex = -1;
+ Node->IntensityRankY = -1;
+ Node->Mass = (NodeMass * 2) - HYDROGEN_MASS; //the single charge mass, if the peak was doubly charged
+ Node->IonTypeFlags = ION_FLAG_B;
+ INSERT_TAGNODE_ASC(FirstBNode, LastBNode, Node);
+ }
+ //printf("Peak %d intensity %f rank %d\n",Spectrum->Peaks[PeakIndex].Mass, Spectrum->Peaks[PeakIndex].Intensity, Spectrum->Peaks[PeakIndex].IntensityRank);
+ }
+ //else
+ // {
+ // printf("NodeMass is %d <= %d\n",NodeMass,-GlobalOptions->Epsilon);
+ // printf("NodeMass is %d >= %d or <= %d\n",NodeMass,GlobalOptions->Epsilon,MinPRMMass);
+ // printf("NodeMass is %d <= %d\n",NodeMass,Spectrum->ParentMass + GlobalOptions->ParentMassEpsilon);
+ // printf("ParentMass %d\n",Spectrum->ParentMass);
+ // //getch();
+ // }
+ NodeMass = Spectrum->ParentMass - Spectrum->Peaks[PeakIndex].Mass;
+ //printf("Peak Index %d, NodeMass %d\n",PeakIndex,NodeMass);
+ if (NodeMass > -GlobalOptions->Epsilon && (NodeMass < GlobalOptions->Epsilon || NodeMass > MinPRMMass) && (NodeMass < Spectrum->ParentMass + GlobalOptions->ParentMassEpsilon))
+ {
+ Node = (TagGraphNode*)calloc(1, sizeof(TagGraphNode));
+ Node->NodeType = evGraphNodeY;
+ Node->OriginalPeakIndex = PeakIndex;
+ Node->IntensityRankY = Spectrum->Peaks[PeakIndex].IntensityRank;
+ Node->IntensityRankB = -1;
+ Node->YIndex = PeakIndex;
+ Node->BIndex = -1;
+ Node->Mass = NodeMass;
+ Node->IonTypeFlags = ION_FLAG_Y;
+ INSERT_TAGNODE_DESC(FirstYNode, LastYNode, Node);
+ if (0)//(Spectrum->Charge > 2 && Spectrum->Peaks[PeakIndex].IntensityRank < 16)
+ { //charge 3 spectra have high intensity doubly-charged peaks. I need to put those into the graph
+ Node = (TagGraphNode*)calloc(1, sizeof(TagGraphNode));
+ Node->NodeType = evGraphNodeY;
+ Node->OriginalPeakIndex = PeakIndex;
+ Node->IntensityRankY = Spectrum->Peaks[PeakIndex].IntensityRank;
+ Node->IntensityRankB = -1;
+ Node->YIndex = PeakIndex;
+ Node->BIndex = -1;
+ Node->Mass = (NodeMass * 2) - HYDROGEN_MASS; //the single charge mass, if the peak was doubly charged
+ Node->IonTypeFlags = ION_FLAG_Y;
+ INSERT_TAGNODE_DESC(FirstYNode, LastYNode, Node);
+ }
+ }
+ // We could insert phosphate-loss peaks for b and y nodes at this point.
+ // There are cases (particularly for breaks next to the phosphorylation site,
+ // and for phosphoserines) when the phosphate-loss peak is MORE LIKELY than
+ // the original peak. However, if we insert phosphate-loss peaks, we end up
+ // with 4 nodes per peak rather than 2...that slows the speed of tag generation
+ // down *considerably*, and probably lowers selectivity quite a bit.
+ }
+ ///////////////////////////////////////////
+ // Merge b and y node lists into the list Graph->FirstNode...Graph->LastNode
+ MergingBNode = FirstBNode;
+ MergingYNode = LastYNode;
+ while (1)
+ {
+ if (!MergingBNode && !MergingYNode)
+ {
+ break;
+ }
+ if (!MergingBNode || (MergingYNode && (MergingBNode->Mass > MergingYNode->Mass)))
+ {
+ // Insert the y node into the master list:
+ Node = MergingYNode->Prev; // temp
+ if (!Graph->FirstNode)
+ {
+ Graph->FirstNode = MergingYNode;
+ Graph->LastNode = MergingYNode;
+ MergingYNode->Next = NULL;
+ MergingYNode->Prev = NULL;
+ }
+ else
+ {
+ MergingYNode->Prev = Graph->LastNode;
+ Graph->LastNode->Next = MergingYNode;
+ Graph->LastNode = MergingYNode;
+ }
+ MergingYNode->Next = NULL;
+ MergingYNode = Node;
+ }
+ else
+ {
+ // Insert the b node into the master list:
+ Node = MergingBNode->Next; // temp
+ if (!Graph->FirstNode)
+ {
+ Graph->FirstNode = MergingBNode;
+ Graph->LastNode = MergingBNode;
+ MergingBNode->Next = NULL;
+ MergingBNode->Prev = NULL;
+ }
+ else
+ {
+ MergingBNode->Prev = Graph->LastNode;
+ Graph->LastNode->Next = MergingBNode;
+ Graph->LastNode = MergingBNode;
+ }
+ MergingBNode->Next = NULL;
+ MergingBNode = Node;
+ }
+ Graph->NodeCount++;
+ }
+
+ TagGraphAddEndpointNodes(Graph, Spectrum);
+ for (Node = Graph->FirstNode, NodeIndex = 0; Node; Node = Node->Next, NodeIndex++)
+ {
+ Node->Index = NodeIndex;
+ }
+}
+
+// Insert another node into the tag-graph. (Used only for a few nodes, as this isn't super fast)
+void InsertTagGraphNode(TagGraph* Graph, TagGraphNode* Node)
+{
+ TagGraphNode* TempNode;
+ // Iterate backwards, until either TempNode points to a smaller PRM or we fall off the edge of the list.
+ for (TempNode = Graph->LastNode; TempNode && TempNode->Mass > Node->Mass; TempNode = TempNode->Prev)
+ {
+ ;;
+ }
+ if (!TempNode)
+ {
+ // This new node is smaller than any we've seen.
+ if (Graph->FirstNode)
+ {
+ Graph->FirstNode->Prev = Node;
+ }
+ else
+ {
+ Graph->LastNode = Node;
+ }
+ Node->Next = Graph->FirstNode;
+ Graph->FirstNode = Node;
+ }
+ else if (TempNode->Next)
+ {
+ TempNode->Next->Prev = Node;
+ Node->Next = TempNode->Next;
+ Node->Prev = TempNode;
+ TempNode->Next = Node;
+ }
+ else
+ {
+ Node->Prev = Graph->LastNode;
+ Graph->LastNode->Next = Node;
+ if (Graph->LastNode == Graph->FirstNode)
+ {
+ Graph->FirstNode = Node;
+ }
+ Graph->LastNode = Node;
+ }
+ Graph->NodeCount++;
+}
+
+// Add the "goalpost nodes" to our tag graph, mass 0 and parent mass:
+void TagGraphAddEndpointNodes(TagGraph* Graph, MSSpectrum* Spectrum)
+{
+ TagGraphNode* Node;
+ int ModType;
+
+ // LEFT edge:
+ Node = (TagGraphNode*)calloc(1, sizeof(TagGraphNode));
+ Node->Mass = 0;
+ Node->NodeType = evGraphNodeLeft;
+ Node->IonTypeFlags = ION_FLAG_B;
+ Node->OriginalPeakIndex = -1;
+
+ InsertTagGraphNode(Graph, Node);
+
+ // LEFT EDGE plus N-terminal mod:
+ for (ModType = 0; ModType < AllPTModCount; ModType++)
+ {
+ if (AllKnownPTMods[ModType].Flags & DELTA_FLAG_N_TERMINAL)
+ {
+ Node = (TagGraphNode*)calloc(1, sizeof(TagGraphNode));
+ Node->Mass = AllKnownPTMods[ModType].Mass;
+ Node->NodeType = evGraphNodeLeftMod;
+ Node->IonTypeFlags = ION_FLAG_B;
+ Node->OriginalPeakIndex = -1;
+ // The node stores a pointer to the MassDelta, so that the tag
+ // will also include the MassDelta:
+ Node->PTM = MassDeltaByIndex[MAX_PT_MODTYPE * MDBI_ALL_MODS + ModType];
+ //Node->PTM = ModType;
+ InsertTagGraphNode(Graph, Node);
+ }
+ }
+
+ Node = (TagGraphNode*)calloc(1, sizeof(TagGraphNode));
+ Node->Mass = Spectrum->ParentMass - PARENT_MASS_BOOST;
+ Node->NodeType = evGraphNodeRight;
+ Node->IonTypeFlags = ION_FLAG_Y;
+ Node->OriginalPeakIndex = -1;
+ InsertTagGraphNode(Graph, Node);
+
+ // RIGHT EDGE minus C-terminal PTM:
+ for (ModType = 0; ModType < AllPTModCount; ModType++)
+ {
+ if (AllKnownPTMods[ModType].Flags & DELTA_FLAG_C_TERMINAL)
+ {
+ Node = (TagGraphNode*)calloc(1, sizeof(TagGraphNode));
+ Node->Mass = Spectrum->ParentMass - PARENT_MASS_BOOST - AllKnownPTMods[ModType].Mass;
+ Node->NodeType = evGraphNodeRightMod;
+ Node->IonTypeFlags = ION_FLAG_Y;
+ Node->OriginalPeakIndex = -1;
+ Node->PTM = MassDeltaByIndex[MAX_PT_MODTYPE * MDBI_ALL_MODS + ModType];
+ //Node->PTM = ModType;
+ InsertTagGraphNode(Graph, Node);
+ }
+ }
+}
+
+// Print all the PRM nodes from a tag graph. (Handy for debugging tagging)
+void DebugPrintTagGraph(MSSpectrum* Spectrum, TagGraph* Graph)
+{
+ TagGraphNode* Node;
+ //
+ printf(">->Printing tag graph...\n");
+ for (Node = Graph->FirstNode; Node; Node = Node->Next)
+ {
+#ifdef DEBUG_TAG_GENERATION
+ printf("%s\n", Node->VerboseNodeInfo);
+#else
+ printf("%.2f %.2f\n", Node->Mass / (float)MASS_SCALE, Node->Score);
+ //printf("At %.2f node %d ion types %d score %.3f:\n", Node->Mass/100.0, Node->NodeType, Node->IonTypeFlags, Node->Score);
+ //printf(" b%d y%d in%.2f io%.2f is%.2f\n", Node->IntensityRankB, Node->IntensityRankY, Node->IntensityScore, Node->IonTypeScore, Node->IsotopeScore);
+
+ if (Node->BIndex > -1)
+ {
+ }
+ if (Node->YIndex > -1)
+ {
+ }
+#endif
+
+ }
+ printf("<-<End of tag graph.\n");
+}
+
+// The JumpingHash stores, for each mass (rounded to nearest integer), a list
+// of amino acids (or modified amino acids) matching the mass. When constructing
+// tags, we allow a move from node A to node B if we find a jump whose size matches
+// the mass difference between nodes A and B. (We check three hash buckets, to
+// compensate for roundoff screwery)
+JumpNode* JumpingHashAddJump(int Mass, char Amino, MassDelta* Delta)
+{
+ int HashBucket;
+ JumpNode* Node;
+ JumpNode* NewNode;
+ JumpNode* Prev;
+ // HashBucket = Mass, rounded to nearest int
+ FAST_ROUND(Mass / (float)MASS_SCALE, HashBucket);
+ if (HashBucket < 0 || HashBucket >= MAX_JUMPING_HASH)
+ {
+ printf("** ERROR: Bad mass in JumpingHashAddJump\n");
+ printf("Mass %d amino %c delta %s\n", Mass, Amino, Delta->Name);
+ return NULL;
+ }
+ // HashBucket = (int)Mass;
+ //if (Mass > HashBucket + 0.5)
+ //{
+ // HashBucket += 1;
+ //}
+ NewNode = (JumpNode*)calloc(1, sizeof(JumpNode));
+ NewNode->Amino = Amino;
+ NewNode->Mass = Mass;
+ NewNode->Delta = Delta; // The PTM for this jump, or -1 if there's no mod.
+ if (NewNode->Delta)
+ {
+ NewNode->Score = Delta->Score;
+ }
+ Node = JumpingHash[HashBucket];
+
+ if (!Node)
+ {
+ // Add a brand new entry to the hash:
+ JumpingHash[HashBucket] = NewNode;
+ }
+ else
+ {
+ // Add this jump to the end of the list.
+ // (Lists are short, so we don't bother keeping a tail pointer or two-way links)
+ for (; Node; Node = Node->Next)
+ {
+ Prev = Node;
+ }
+ Prev->Next = NewNode;
+ }
+ return NewNode;
+}
+
+// Populate the jumping hash with each amino acid, and each modified amino acid.
+void PopulateJumpingHash()
+{
+ int Amino;
+ int PTModIndex;
+ MassDelta* Delta;
+ int ModForAAIndex;
+ JumpNode* JNode;
+
+ FreeJumpingHash(); // free any old stuff
+
+ // Allocate memory:
+ JumpingHash = (JumpNode**)calloc(MAX_JUMPING_HASH, sizeof(JumpNode*));
+ SafeFree(JumpsByAA);
+ JumpsByAA = (JumpNode**)calloc(sizeof(JumpNode*), AMINO_ACIDS * GlobalOptions->DeltasPerAA);
+
+ memset(JumpsByAA, 0, sizeof(JumpNode*) * AMINO_ACIDS * GlobalOptions->DeltasPerAA);
+ for (Amino = 'A'; Amino<='Y'; Amino++)
+ {
+ if (PeptideMass[Amino]<0.01)
+ {
+ continue; // Not an amino acid ("O" or "U" or "J" or somesuch)
+ }
+ ModForAAIndex = 0;
+ // Don't build a jump node for unmodified Q or unmodified I, because they are accounted
+ // for by the jumps for unmodified K and L.
+ if (Amino != 'Q' && Amino != 'I')
+ {
+ JNode = JumpingHashAddJump(PeptideMass[Amino], (char)Amino, NULL);
+ JumpsByAA[(Amino-'A') * GlobalOptions->DeltasPerAA] = JNode;
+ ModForAAIndex = 1;
+ }
+ for (PTModIndex = 0; PTModIndex < GlobalOptions->DeltasPerAA; PTModIndex++)
+ {
+ Delta = &MassDeltas[Amino - 'A'][PTModIndex];
+
+ if (Delta->Flags)
+ {
+ if (!(Delta->Flags & (DELTA_FLAG_C_TERMINAL | DELTA_FLAG_N_TERMINAL)))
+ {
+ JNode = JumpingHashAddJump(Delta->RealDelta + PeptideMass[Amino], (char)Amino, Delta);
+ JumpsByAA[(Amino-'A') * GlobalOptions->DeltasPerAA + ModForAAIndex] = JNode;
+ ModForAAIndex++;
+ }
+ }
+ else
+ {
+ // There are no more PTMs in MassDeltas[Amino], so stop iterating:
+ break;
+ }
+ }
+ }
+}
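+
+// Illustrative sketch (not part of the search pipeline): how a caller can look up
+// the jumps matching a mass difference. The center bucket is the difference rounded
+// to the nearest Dalton, and the two neighboring buckets are also checked to tolerate
+// rounding error. The helper name is hypothetical, so it is kept commented out:
+//JumpNode* ExampleFindJump(int MassDifference)
+//{
+// int CenterBucket;
+// int Bucket;
+// JumpNode* JNode;
+// FAST_ROUND(MassDifference / (float)MASS_SCALE, CenterBucket);
+// for (Bucket = max(0, CenterBucket - 1); Bucket <= min(MAX_JUMPING_HASH - 1, CenterBucket + 1); Bucket++)
+// {
+// for (JNode = JumpingHash[Bucket]; JNode; JNode = JNode->Next)
+// {
+// if (abs(MassDifference - JNode->Mass) <= GlobalOptions->Epsilon)
+// {
+// return JNode; // first jump whose mass matches within epsilon
+// }
+// }
+// }
+// return NULL;
+//}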
+
+// Destructor for the JumpingHash contents.
+void FreeJumpingHash()
+{
+ int HashBucket;
+ JumpNode* Node;
+ JumpNode* Prev = NULL;
+ //
+ if (JumpingHash)
+ {
+ for (HashBucket = 0; HashBucket < MAX_JUMPING_HASH; HashBucket++)
+ {
+ Prev = NULL;
+ for (Node = JumpingHash[HashBucket]; Node; Node = Node->Next)
+ {
+ SafeFree(Prev);
+ Prev = Node;
+ }
+ SafeFree(Prev);
+ JumpingHash[HashBucket] = NULL;
+ }
+ SafeFree(JumpingHash);
+ JumpingHash = NULL;
+ }
+ SafeFree(JumpsByAA);
+ JumpsByAA = NULL;
+}
+
+void DebugPrintTagGraphEdges(TagGraph* Graph)
+{
+ TagGraphNode* Node;
+ TagGraphEdge* Edge;
+ for (Node = Graph->FirstNode; Node; Node = Node->Next)
+ {
+ printf("Node at %.2f: (score %.2f)\n", Node->Mass / (float)MASS_SCALE, Node->Score);
+ for (Edge = Node->FirstEdge; Edge; Edge = Edge->Next)
+ {
+ printf("-->Add '%c' (%.2f, skew %.2f) to reach %.2f\n", Edge->Jump->Amino, Edge->Jump->Mass / (float)MASS_SCALE,
+ ((Edge->ToNode->Mass - Node->Mass) - (Edge->Jump->Mass))/(float)MASS_SCALE,
+ Edge->ToNode->Mass / (float)MASS_SCALE);
+ }
+ }
+}
+
+// Called after populating the tag graph with nodes.
+// Now we add edges between any two nodes that can be linked by a JUMP (an amino acid, or
+// an amino acid plus a decoration)
+void TagGraphPopulateEdges(TagGraph* Graph)
+{
+ TagGraphNode* Node;
+ TagGraphNode* OtherNode;
+ int JumpSize;
+ // For efficiency, we *never* consider a jump smaller or larger than these boundaries.
+ // Note that glycine has mass 57.02, and tryptophan has mass 186.08.
+ // (If there are PTMs, we do consider jumps of MaxJumpSize + MaxPTMMass; that's probably overkill)
+ int MinJumpSize = GLYCINE_MASS - (DALTON * 2);
+ int MaxJumpSize;
+ int MaxAA;
+ int ModIndex;
+ int MaxSkew;
+ TagGraphEdge* Edge;
+ int IntSkew;
+ int EdgeCount = 0;
+ int ModJumpCount = 0;
+ int AA;
+ JumpNode* JNode;
+ int Bucket;
+ int HashBucket;
+ MaxAA = 0;
+ for (AA = 'A'; AA < 'X'; AA++)
+ {
+ MaxAA = max(MaxAA, PeptideMass[AA]);
+ }
+ MaxJumpSize = MaxAA;
+ for (ModIndex = 0; ModIndex < AllPTModCount; ModIndex++)
+ {
+ MaxJumpSize = (int)(max(MassDeltaByIndex[MAX_PT_MODTYPE * MDBI_ALL_MODS + ModIndex]->RealDelta + MaxAA, MaxJumpSize));
+ for (AA = 0; AA < 26; AA++)
+ {
+ ModJumpCount += AllKnownPTMods[ModIndex].Allowed[AA];
+ }
+ }
+ MaxJumpSize += GlobalOptions->ParentMassEpsilon;
+
+ MaxSkew = sizeof(SkewHistoStep) / sizeof(double) - 1;
+
+ // We do a double-loop over the graph to find all legal edges.
+ for (Node = Graph->FirstNode; Node; Node = Node->Next)
+ {
+ if (Node->NodeType == evGraphNodeRight || Node->NodeType == evGraphNodeRightMod)
+ {
+ // This is a right endpoint node, so no edges emanate from it:
+ continue;
+ }
+
+ for (OtherNode = Node->Next; OtherNode; OtherNode = OtherNode->Next)
+ {
+ if (OtherNode->NodeType == evGraphNodeLeft || OtherNode->NodeType == evGraphNodeLeftMod)
+ {
+ // This is a left endpoint node, so no edges enter it:
+ continue;
+ }
+ JumpSize = OtherNode->Mass - Node->Mass;
+ if (JumpSize < MinJumpSize)
+ {
+ continue;
+ }
+ if (JumpSize > MaxJumpSize)
+ {
+ break;
+ }
+ FAST_ROUND(JumpSize / (float)MASS_SCALE, HashBucket);
+ for (Bucket = HashBucket - 1; Bucket < HashBucket + 2; Bucket++)
+ {
+ if (Bucket < 0 || Bucket >= MAX_JUMPING_HASH)
+ {
+ continue;
+ }
+ for (JNode = JumpingHash[Bucket]; JNode; JNode = JNode->Next)
+ {
+ IntSkew = JumpSize - JNode->Mass;
+ if (abs(IntSkew) > GlobalOptions->Epsilon)
+ {
+ continue;
+ }
+ if (JNode->Delta)
+ {
+ if (GlobalOptions->TagPTMMode == 1 || GlobalOptions->MaxPTMods == 0 || JNode->Delta->Score < -5)
+ {
+ continue;
+ }
+ }
+ // Allocate a TagGraphEdge, initialize it, and add it to this node's list of edges:
+ Edge = (TagGraphEdge*)calloc(1, sizeof(TagGraphEdge));
+ Edge->Jump = JNode;
+ Edge->FromNode = Node;
+ Edge->ToNode = OtherNode;
+ Edge->Skew = IntSkew;
+ // For now, no skew scoring:
+ //if (IntSkew > MaxSkew)
+ //{
+ // Edge->Score = (float)SkewHistoStep[MaxSkew];
+ //}
+ //else
+ //{
+ // Edge->Score = (float)SkewHistoStep[IntSkew];
+ //}
+ Edge->Score = JNode->Score;
+ if (!Node->FirstEdge)
+ {
+ Node->FirstEdge = Edge;
+ Node->LastEdge = Edge;
+ }
+ else
+ {
+ Node->LastEdge->Next = Edge;
+ Node->LastEdge = Edge;
+ }
+ if (Edge->Jump->Delta)
+ {
+ Edge->Score += MasterTaggingModel.PTMPenalty;
+ }
+ //GlobalStats->TagGraphEdges++;
+ EdgeCount++;
+ } // Jnode loop
+ } // bucket loop
+ }
+ }
+}
+
+// For quick-sort of tags - list from highest to lowest score.
+int CompareTagScores(const TrieTag* TagA, const TrieTag* TagB)
+{
+ if (TagA->Score > TagB->Score)
+ {
+ return -1;
+ }
+ if (TagA->Score < TagB->Score)
+ {
+ return 1;
+ }
+ if (TagA->PrefixMass < TagB->PrefixMass)
+ {
+ return -1;
+ }
+ if (TagA->PrefixMass > TagB->PrefixMass)
+ {
+ return 1;
+ }
+ if (TagA < TagB)
+ {
+ return -1;
+ }
+ if (TagA > TagB)
+ {
+ return 1;
+ }
+ return 0;
+}
+
+int TagSkewBinCount;
+float* TagSkewScore = NULL;
+float* TagTotalAbsSkewScore = NULL;
+
+void FreeTagSkewScores()
+{
+ SafeFree(TagSkewScore);
+ TagSkewScore = NULL;
+ SafeFree(TagTotalAbsSkewScore);
+ TagTotalAbsSkewScore = NULL;
+}
+
+void SetTagSkewScores()
+{
+ char FilePath[2048];
+ FILE* TagSkewFile;
+ //
+ if (TagSkewScore)
+ {
+ return;
+ }
+ sprintf(FilePath, "%s%s", GlobalOptions->ResourceDir, "TagSkewScores.dat");
+ TagSkewFile = fopen(FilePath, "rb");
+ if (!TagSkewFile)
+ {
+ REPORT_ERROR_S(3, FilePath);
+ // To avoid crashing later, set up a length-1 array:
+ TagSkewBinCount = 1;
+ TagSkewScore = (float*)calloc(1, sizeof(float));
+ TagSkewScore[0] = 0;
+ TagTotalAbsSkewScore = (float*)calloc(1, sizeof(float));
+ TagTotalAbsSkewScore[0] = 0;
+ return;
+ }
+ // Read the number of entries:
+ ReadBinary(&TagSkewBinCount, sizeof(int), 1, TagSkewFile);
+ // Allocate arrays:
+ TagSkewScore = (float*)calloc(TagSkewBinCount, sizeof(float));
+ TagTotalAbsSkewScore = (float*)calloc(TagSkewBinCount, sizeof(float));
+ // Populate arrays:
+ ReadBinary(TagSkewScore, sizeof(float), TagSkewBinCount, TagSkewFile);
+ ReadBinary(TagTotalAbsSkewScore, sizeof(float), TagSkewBinCount, TagSkewFile);
+ fclose(TagSkewFile);
+}
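+
+// For reference, the layout of TagSkewScores.dat as read above:
+// int32 TagSkewBinCount
+// float32 TagSkewScore[TagSkewBinCount]
+// float32 TagTotalAbsSkewScore[TagSkewBinCount]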
+
+static TrieTag* AllTags = NULL;
+//// Old tag generation function, kept commented out for reference (superseded by TagGraphGenerateTags below).
+//TrieTag* TagGraphGenerateTagsOld(TagGraph* Graph, MSSpectrum* Spectrum, int* TagCount,
+// int MaximumTagCount, SpectrumTweak* Tweak, float TagEdgeScoreMultiplier)
+//{
+// TagGraphNode* TagNodes[12];
+// TagGraphEdge* TagEdges[12];
+// int NodeIndex;
+// int EdgeIndex;
+// TagGraphNode* Node;
+// TagGraphEdge* Edge;
+// TagGraphNode* LeftNode;
+// TagGraphNode* RightNode;
+// int CurrentDepth;
+// int InternalNodes;
+// float NodeScore;
+// TrieTag* Tag;
+// int TagAllocation;
+// int BacktrackFlag;
+// int AllTagCount = 0;
+// int Bin;
+// float ScoreToBeat = -9999;
+// //
+// *TagCount = 0;
+// TagAllocation = 1024;
+// if (!AllTags)
+// {
+// AllTags = (TrieTag*)calloc(sizeof(TrieTag), TagAllocation);
+// }
+// NodeIndex = 0;
+// EdgeIndex = -1;
+// BacktrackFlag = 0;
+// CurrentDepth = 0;
+// TagNodes[0] = Graph->FirstNode;
+// while (1)
+// {
+// // If we're BACKTRACKING, then move to a sibling or parent:
+// if (BacktrackFlag)
+// {
+// // Move the root of the subtree, if necessary:
+// if (CurrentDepth == 0)
+// {
+// // Move to the next 'first' node:
+// TagNodes[0] = TagNodes[0]->Next;
+// if (!TagNodes[0])
+// {
+// break;
+// }
+// BacktrackFlag = 0;
+// continue;
+// }
+// // Move to a sibling, if we can:
+// TagEdges[CurrentDepth - 1] = TagEdges[CurrentDepth - 1]->Next;
+// if (TagEdges[CurrentDepth - 1])
+// {
+// TagNodes[CurrentDepth] = TagEdges[CurrentDepth - 1]->ToNode;
+// BacktrackFlag = 0;
+// continue;
+// }
+// // No more siblings - move up one level.
+// CurrentDepth--;
+// continue;
+// }
+//
+// // Special case for level 1: Skip tag nodes with silly masses like 20Da.
+// if (CurrentDepth == 0)
+// {
+// Node = TagNodes[0];
+// if (Node->Mass > GlobalOptions->ParentMassEpsilon && Node->Mass < GLYCINE_MASS - GlobalOptions->Epsilon)
+// {
+// BacktrackFlag = 1;
+// continue;
+// }
+// }
+//
+// // If we're deep enough, report a tag and start backtracking:
+// if (CurrentDepth >= GlobalOptions->GenerateTagLength)
+// {
+// BacktrackFlag = 1;
+// LeftNode = TagNodes[0];
+// RightNode = TagNodes[CurrentDepth];
+// Tag = AllTags + (*TagCount);
+// InternalNodes = 0;
+// NodeScore = 0;
+// for (NodeIndex = 0; NodeIndex <= CurrentDepth; NodeIndex++)
+// {
+// Node = TagNodes[NodeIndex];
+// if (Node->OriginalPeakIndex > 0)
+// {
+// NodeScore += Node->Score;
+// InternalNodes++;
+// }
+// Tag->Nodes[NodeIndex] = TagNodes[NodeIndex];
+// }
+// NodeScore *= (GlobalOptions->GenerateTagLength + 1) / (float)max(1, InternalNodes);
+// Tag->Score = NodeScore;
+// Tag->ModsUsed = 0;
+// memset(Tag->ModType, 0, sizeof(MassDelta*) * MAX_PT_MODS);
+// memset(Tag->AminoIndex, -1, sizeof(int) * MAX_PT_MODS);
+// if (LeftNode->NodeType == evGraphNodeLeftMod)
+// {
+// Tag->AminoIndex[Tag->ModsUsed] = 0;
+// Tag->ModType[Tag->ModsUsed] = LeftNode->PTM;
+// Tag->ModsUsed++;
+// }
+// for (EdgeIndex = 0; EdgeIndex < CurrentDepth; EdgeIndex++)
+// {
+// Edge = TagEdges[EdgeIndex];
+// Tag->Score += TagEdges[EdgeIndex]->Score;
+// Tag->Tag[EdgeIndex] = Edge->Jump->Amino;
+// if (Edge->Jump->Delta)
+// {
+// Tag->AminoIndex[Tag->ModsUsed] = EdgeIndex;
+// Tag->ModType[Tag->ModsUsed] = Edge->Jump->Delta;
+// Tag->ModsUsed++;
+// }
+// }
+// // Set skew info:
+// Tag->TotalSkew = 0;
+// Tag->TotalAbsSkew = 0;
+// for (EdgeIndex = 0; EdgeIndex < CurrentDepth; EdgeIndex++)
+// {
+// Edge = TagEdges[EdgeIndex];
+// Tag->TotalSkew += Edge->Skew;
+// Tag->TotalAbsSkew += abs(Edge->Skew);
+// }
+// ////////////////////////////////////////////////////
+// // If the total skew is large, penalize the tag's score:
+// Bin = (int)fabs((Tag->TotalSkew / 50.0) + 0.5);
+// if (Bin >= TagSkewBinCount)
+// {
+// Bin = TagSkewBinCount - 1;
+// }
+// Tag->Score += TagSkewScore[Bin] * TagEdgeScoreMultiplier;
+// Bin = (int)fabs((Tag->TotalAbsSkew / 50.0) + 0.5);
+// if (Bin >= TagSkewBinCount)
+// {
+// Bin = TagSkewBinCount - 1;
+// }
+// Tag->Score += TagTotalAbsSkewScore[Bin] * TagEdgeScoreMultiplier;
+// ////////////////////////////////////////////////////
+// Tag->Tag[EdgeIndex] = '\0';
+// if (Tag->Score < ScoreToBeat)
+// {
+// // Abort the tag - it's not good enough!
+// continue;
+// }
+// if (RightNode->NodeType == evGraphNodeRightMod)
+// {
+// Tag->AminoIndex[Tag->ModsUsed] = CurrentDepth;
+// Tag->ModType[Tag->ModsUsed] = RightNode->PTM;
+// Tag->ModsUsed++;
+// }
+// Tag->PSpectrum = Spectrum;
+// Tag->Tweak = Tweak;
+// Tag->TagLength = CurrentDepth;
+// Tag->ParentMass = Spectrum->ParentMass;
+// Tag->Charge = Spectrum->Charge;
+// Tag->PrefixMass = TagNodes[0]->Mass;
+// Tag->SuffixMass = Spectrum->ParentMass - PARENT_MASS_BOOST - TagNodes[CurrentDepth]->Mass;
+// (*TagCount)++;
+// AllTagCount++;
+// // If we've got as many tags as we can handle, drop all but the best. (Don't
+// // just reallocate; we could end up with a *lot*!)
+// if ((*TagCount) + 5 >= TagAllocation)
+// {
+// qsort(AllTags, *TagCount, sizeof(TrieTag), (QSortCompare)CompareTagScores);
+// *TagCount = TagAllocation / 2;
+// if (MaximumTagCount >= 0)
+// {
+// ScoreToBeat = AllTags[min(TagAllocation - 5, MaximumTagCount)].Score;
+// }
+// else
+// {
+// ScoreToBeat = AllTags[*TagCount].Score;
+// }
+// }
+// continue;
+// } // If we're at tag depth
+//
+// // We're not at tag depth yet.
+// // Move to our first available child:
+// TagEdges[CurrentDepth] = TagNodes[CurrentDepth]->FirstEdge;
+// if (!TagEdges[CurrentDepth])
+// {
+// BacktrackFlag = 1;
+// continue;
+// }
+// else
+// {
+// CurrentDepth++;
+// TagNodes[CurrentDepth] = TagEdges[CurrentDepth - 1]->ToNode;
+// }
+// }
+// // Sort the tags, by score:
+// qsort(AllTags, *TagCount, sizeof(TrieTag), (QSortCompare)CompareTagScores);
+// return AllTags;
+//
+//}
+
+// Build a trie from a list of tags. Returns the trie root.
+// AllTags is the tag array, TagCount its size.
+// Since we construct one big trie for many spectra, we take Root as an
+// argument; Root is NULL on the first call.
+// If MaximumTagCount is >= 0 and less than TagCount, then we stop after adding
+// that many distinct tags.
+TrieNode* BuildTrieFromTags(TrieTag* AllTags, int TagCount, TrieNode* Root, int MaximumTagCount)
+{
+ int DuplicateFlag;
+ int TagsInTrie = 0;
+ int TagIndex;
+ TrieTag* Tag;
+
+ int Index;
+
+ //printf("BuildTrieFromTags...\n");
+ // Construct a root, if we don't have one already.
+ if (!Root)
+ {
+
+ Root = NewTrieNode();
+ Root->FailureNode = Root;
+
+ }
+ for (TagIndex = 0; TagIndex < TagCount; TagIndex++)
+ {
+ AddTagToTrie(Root, AllTags + TagIndex, &DuplicateFlag);
+ if (!DuplicateFlag)
+ {
+ TagsInTrie++;
+ Tag = AllTags + TagIndex;
+ if (MaximumTagCount >= 0 && TagsInTrie >= MaximumTagCount)
+ {
+ break;
+ }
+ }
+ }
+
+
+ //DebugPrintTrieTags(Root);
+ return Root;
+}
+
+void DebugPrintTagList(MSSpectrum* Spectrum, TrieTag* Tags, int TagCount)
+{
+ int TagIndex;
+ TrieTag* Tag;
+ int Index;
+ for (TagIndex = 0; TagIndex < TagCount; TagIndex++)
+ {
+ Tag = Tags + TagIndex;
+#ifdef DEBUG_TAG_GENERATION
+ printf("%s\n", Tag->TagScoreDetails);
+#endif
+ printf("%d: %.2f: %s %.2f %.2f\n", TagIndex, Tag->Score, Tag->Tag, Tag->PrefixMass / (float)MASS_SCALE, (Spectrum->ParentMass - PARENT_MASS_BOOST - Tag->SuffixMass) / (float)MASS_SCALE);
+ for (Index = 0; Index < Tag->TagLength; Index++)
+ {
+ printf("%c", Tag->Tag[Index]);
+ fflush(stdout);
+ if (Tag->AminoIndex[Index]>-1)
+ {
+ printf("%s", Tag->ModType[Index]->Name);
+ fflush(stdout);
+ }
+ }
+ printf("\n");
+ }
+}
+
+// Called when searching in tagless mode. (Tagless mode performs *no* database filtering; it's
+// appropriate for searching a small database, typically a database formed by an initial search run)
+// The trie, in this case, will have a child for each amino acid (prefix 0, suffix = parent mass - amino mass)
+TrieNode* GenerateDummyTags(MSSpectrum* Spectrum, TrieNode* Root)
+{
+ TrieTag* Tag;
+ char* Aminos = "ACDEFGHKLMNPRSTVWY"; // skip I and Q, because they're synonymous with L and K
+ char* Amino;
+ int DuplicateFlag;
+ int ModIndex;
+ int TweakIndex;
+ SpectrumTweak* Tweak;
+ // Set up the root, if it doesn't exist already:
+ if (!Root)
+ {
+ Root = NewTrieNode();
+ Root->FailureNode = Root;
+ }
+ for (TweakIndex = 0; TweakIndex < TWEAK_COUNT; TweakIndex++)
+ {
+ Tweak = Spectrum->Node->Tweaks + TweakIndex;
+ if (!Tweak->Charge)
+ {
+ continue;
+ }
+ // Loop over alphabet soup, add one tag per amino:
+ for (Amino = Aminos; *Amino; Amino++)
+ {
+ Tag = NewTrieTag();
+ Tag->Tag[0] = *Amino;
+ Tag->Tag[1] = '\0';
+ memset(Tag->ModType, 0, sizeof(MassDelta*) * MAX_PT_MODS);
+ memset(Tag->AminoIndex, -1, sizeof(int) * MAX_PT_MODS);
+ Tag->PSpectrum = Spectrum;
+ Tag->Charge = Tweak->Charge;
+ Tag->ParentMass = Tweak->ParentMass;
+ Tag->Tweak = Tweak;
+ Tag->PrefixMass = 0;
+ Tag->SuffixMass = Tweak->ParentMass - PeptideMass[*Amino] - PARENT_MASS_BOOST;
+ Tag->TagLength = 1;
+ //GlobalStats->TagsGenerated++;
+ AddTagToTrie(Root, Tag, &DuplicateFlag);
+ // ...ok, also allow mods on this first amino
+ for (ModIndex = 0; ModIndex < AllPTModCount; ModIndex++)
+ {
+ if (AllKnownPTMods[ModIndex].Allowed[*Amino - 'A'])
+ {
+ Tag = NewTrieTag();
+ Tag->Tag[0] = *Amino;
+ Tag->Tag[1] = '\0';
+ memset(Tag->ModType, 0, sizeof(MassDelta*) * MAX_PT_MODS);
+ memset(Tag->AminoIndex, -1, sizeof(int) * MAX_PT_MODS);
+ Tag->ModType[0] = MassDeltaByIndex[(*Amino-'A') * MAX_PT_MODTYPE + ModIndex];
+ Tag->AminoIndex[0] = 0;
+ Tag->ModsUsed = 1;
+ Tag->PSpectrum = Spectrum;
+ Tag->Charge = Tweak->Charge;
+ Tag->Tweak = Tweak;
+ Tag->ParentMass = Tweak->ParentMass;
+ Tag->PrefixMass = 0;
+ Tag->SuffixMass = Tweak->ParentMass - PeptideMass[*Amino] - PARENT_MASS_BOOST - AllKnownPTMods[ModIndex].Mass;
+ Tag->TagLength = 1;
+ //GlobalStats->TagsGenerated++;
+ AddTagToTrie(Root, Tag, &DuplicateFlag);
+ }
+ }
+ }
+ }
+ return Root;
+}
+
+TrieNode* GenerateTagsFromSpectrum(MSSpectrum* Spectrum, TrieNode* Root, int MaximumTagCount, SpectrumTweak* Tweak)
+{
+ TrieTag* Tags;
+ int TagCount;
+
+ // Note: The spectrum must be loaded and preprocessed before calling this function.
+ // For example, call these first:
+ //SpectrumFindIsotopicPeaks(Spectrum);
+ //IntensityRankPeaks(Spectrum);
+ //SpectrumCorrectParentMass(Spectrum);
+
+ //printf("GenerateTagsFromSpectrum...\n");
+ if (Spectrum->Graph)
+ {
+ FreeTagGraph(Spectrum->Graph);
+ Spectrum->Graph = NULL;
+ }
+ if (GlobalOptions->TaglessSearchFlag)
+ {
+ return GenerateDummyTags(Spectrum, Root);
+ }
+ Spectrum->Graph = ConstructTagGraph(Spectrum);
+ TagGraphAddNodes(Spectrum->Graph, Spectrum);
+ //printf("From spectrum with %d peaks, graph with %d nodes\n",Spectrum->PeakCount,Spectrum->Graph->NodeCount);
+
+ TagGraphScorePRMNodes(NULL, Spectrum->Graph, Spectrum, Tweak);
+ //DebugPrintTagGraph(Spectrum, Spectrum->Graph);
+ TagGraphPopulateEdges(Spectrum->Graph);
+
+#ifdef DEBUG_TAG_GENERATION
+ DebugPrintTagGraph(Spectrum, Spectrum->Graph);
+ DebugPrintTagGraphEdges(Spectrum->Graph); ////
+#endif
+ Tags = TagGraphGenerateTags(Spectrum->Graph, Spectrum, &TagCount, MaximumTagCount, Tweak, TAG_EDGE_SCORE_MULTIPLIER, NULL);
+
+#ifdef DEBUG_TAG_GENERATION
+ DebugPrintTagList(Spectrum, Tags, 300);
+#endif
+ DebugPrintTagsForPeptide(Spectrum, Spectrum->Graph, Tags, TagCount);
+ Root = BuildTrieFromTags(Tags, TagCount, Root, MaximumTagCount);
+ if (0)
+ {
+ DebugPrintTrieTags(Root);
+ }
+
+ // The caller should usually invoke InitializeTrieFailureNodes next. When doing a batch of
+ // spectra, however, we do InitializeTrieFailureNodes once at the end.
+
+ return Root;
+}
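+
+// Illustrative usage sketch: a typical caller accumulates tags from many preprocessed
+// spectra into one shared trie and then resolves failure links once at the end (see the
+// note above about InitializeTrieFailureNodes; its exact signature may differ). The loop
+// below is schematic, not a real driver:
+//TrieNode* Root = NULL;
+//for (/* each preprocessed spectrum and its tweak */)
+//{
+// Root = GenerateTagsFromSpectrum(Spectrum, Root, 100, Tweak);
+//}
+//InitializeTrieFailureNodes(Root);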
+
+// Build a hash (Graph->NodeIndex) for quick lookup of nodes based on mass. This is used
+// in GetBYScore, when choosing a PTM attachment point.
+void TagGraphBuildNodeIndex(TagGraph* Graph)
+{
+ TagGraphNode* Node;
+ int Bucket;
+ int BucketMax;
+ SafeFree(Graph->NodeIndex);
+ Graph->NodeIndexSize = ((int)(Graph->LastNode->Mass / DALTON)) + 1;
+ Graph->NodeIndex = (TagGraphNode**)calloc(Graph->NodeIndexSize, sizeof(TagGraphNode*));
+ for (Node = Graph->FirstNode; Node; Node = Node->Next)
+ {
+ BucketMax = (int)(min(Graph->NodeIndexSize - 1, Node->Mass / DALTON + 1));
+ for (Bucket = max(0, (int)(Node->Mass / DALTON) - 1); Bucket <= BucketMax; Bucket++)
+ {
+ if (!Graph->NodeIndex[Bucket])
+ {
+ Graph->NodeIndex[Bucket] = Node;
+ }
+ }
+ }
+}
+
+int NiceCheckAA(char AA1, char AA2)
+{
+ if (AA1 == 'I')
+ {
+ AA1 = 'L';
+ }
+ if (AA2 == 'I')
+ {
+ AA2 = 'L';
+ }
+ if (AA1 == 'Q')
+ {
+ AA1 = 'K';
+ }
+ if (AA2 == 'Q')
+ {
+ AA2 = 'K';
+ }
+ return (AA1 == AA2);
+}
+
+void DebugCheckTagMatch(int TagIndex, TrieTag* Tag, int* Masses, int MassCount, char* Peptide)
+{
+ int Pos;
+ int Diff;
+ int ParentMass;
+ int TagAAIndex;
+ //
+ ParentMass = Masses[MassCount-1];
+ for (Pos = 0; Pos < MassCount - 3; Pos++)
+ {
+ Diff = abs(Tag->PrefixMass - Masses[Pos]);
+ if (Diff > 2 * DALTON)
+ {
+ continue;
+ }
+ Diff = abs((ParentMass - Masses[Pos+3]) - Tag->SuffixMass);
+ if (Diff > 2 * DALTON)
+ {
+ continue;
+ }
+ for (TagAAIndex = 0; TagAAIndex < Tag->TagLength; TagAAIndex++)
+ {
+ if (!NiceCheckAA(Peptide[Pos + TagAAIndex + 1], Tag->Tag[TagAAIndex]))
+ {
+ break;
+ }
+ }
+ if (TagAAIndex < Tag->TagLength)
+ {
+ // At least one tag residue mismatched the peptide at this offset; not a match.
+ continue;
+ }
+ printf("Matched by tag #%d: '%s', prefix %.2f, suffix %.2f\n", TagIndex, Tag->Tag, Tag->PrefixMass / (float)MASS_SCALE, Tag->SuffixMass / (float)MASS_SCALE);
+ }
+}
+
+// Sometimes we don't generate tags for a peptide, and it's not obvious why.
+// In such cases, include a line of the form "tagcheck,PEPTIDE" in the input file.
+// Then, this function will compare the theoretical tags for this peptide against
+// the tags actually generated (and against the actual tag graph).
+void DebugPrintTagsForPeptide(MSSpectrum* Spectrum, TagGraph* Graph, TrieTag* Tags, int TagCount)
+{
+ StringNode* Node;
+ int MassCount;
+ int Masses[64];
+ char* Amino;
+ int AminoMass;
+ int AccumMass;
+ char Peptide[64];
+ int PeptideLength;
+ int TagIndex;
+ TrieTag* Tag;
+ int MassIndex;
+ TagGraphNode* GraphNode;
+ //
+ for (Node = FirstTagCheckNode; Node; Node = Node->Next)
+ {
+ printf("--- Check tagging results for %s Charge %d\n", Node->String, Spectrum->Charge);
+ MassCount = 0;
+ AccumMass = 0;
+ PeptideLength = 0;
+ // Parse the peptide string. For now, DROP all mods.
+ for (Amino = Node->String; *Amino; Amino++)
+ {
+ AminoMass = PeptideMass[*Amino];
+ if (AminoMass)
+ {
+ AccumMass += AminoMass;
+ Masses[MassCount++] = AccumMass;
+ Peptide[PeptideLength++] = *Amino;
+ }
+ }
+ Peptide[PeptideLength] = '\0';
+ ///////////////////////////////////////////////////////////
+ // Loop over tags, and see whether any tag matches the peptide:
+ for (TagIndex = 0; TagIndex < TagCount; TagIndex++)
+ {
+ Tag = Tags + TagIndex;
+ DebugCheckTagMatch(TagIndex, Tag, Masses, MassCount, Peptide);
+ }
+ ///////////////////////////////////////////////////////////
+ // Loop over PRMs in the peptide, and see how taggable they are:
+ for (MassIndex = 0; MassIndex < MassCount; MassIndex++)
+ {
+ AccumMass = Masses[MassIndex];
+ printf("Mass %d (%.2f):\n", MassIndex, AccumMass / (float)MASS_SCALE);
+ for (GraphNode = Graph->FirstNode; GraphNode; GraphNode = GraphNode->Next)
+ {
+ if (GraphNode->Mass > AccumMass + DALTON)
+ {
+ break;
+ }
+ if (GraphNode->Mass < AccumMass - DALTON)
+ {
+ continue;
+ }
+ printf(" Node at %.2f (%.2f) score %.2f\n", GraphNode->Mass / (float)MASS_SCALE,
+ (GraphNode->Mass - AccumMass) / (float)MASS_SCALE, GraphNode->Score);
+ }
+ }
+ }
+}
+TagGraphNode* TagTestGetBestNode(TagGraph* Graph, int PRM)
+{
+ int MinMass;
+ int MaxMass;
+ TagGraphNode* TGNode;
+ TagGraphNode* BestNode = NULL;
+ //
+ MinMass = PRM - 50;
+ MaxMass = PRM + 50;
+ for (TGNode = Graph->FirstNode; TGNode; TGNode = TGNode->Next)
+ {
+ if (TGNode->Mass > MaxMass)
+ {
+ break;
+ }
+ if (TGNode->Mass < MinMass)
+ {
+ continue;
+ }
+ if (!BestNode || BestNode->Score < TGNode->Score)
+ {
+ BestNode = TGNode;
+ }
+ }
+ return BestNode;
+}
+
+void TestTaggingCallback(SpectrumNode* Node, int Charge, int ParentMass, Peptide* Annotation)
+{
+ static int* TrueTagRankHistogram = NULL;
+ static int SpectrumCount = 0;
+ int Rank;
+ int Cumulative;
+ FILE* ResultsFile;
+ int TagIndex;
+ int TagCount;
+ TrieTag* Tags;
+ int FoundFlag;
+ TrieTag* TestTag;
+ int PRM[64];
+ int Mass;
+ int AminoIndex;
+ int ModIndex;
+ int TrieTagCount;
+ int MatchLength;
+ BayesianModel* Model;
+ TagGraphNode* Node0;
+ TagGraphNode* Node1;
+ TagGraphNode* Node2;
+ TagGraphNode* Node3;
+ float TagScore;
+ TrieNode* Root;
+ int DuplicateFlag;
+ int VerboseFlag = 0;
+ //
+ Root = NULL;
+ if (!Node)
+ {
+ if (!Charge)
+ {
+ // Initialization call:
+ TrueTagRankHistogram = (int*)calloc(512, sizeof(int));
+ }
+ else
+ {
+ // Completion call:
+ ResultsFile = fopen("TagTestingResults.txt", "w");
+ Cumulative = 0;
+ fprintf(ResultsFile, "Tagging results on %d spectra\n", SpectrumCount);
+ for (Rank = 0; Rank < 512; Rank++)
+ {
+ Cumulative += TrueTagRankHistogram[Rank];
+ fprintf(ResultsFile, "%d\t%d\t%.2f\t%.2f\t\n",
+ Rank, TrueTagRankHistogram[Rank], TrueTagRankHistogram[Rank] / (float)SpectrumCount,
+ Cumulative / (float)SpectrumCount);
+ }
+ //SafeFree(TrueTagRankHistogram);
+ }
+ return;
+ }
+ // Standard call: Given a spectrum, generate some tags. Remember the rank of the first true tag.
+
+ Root = NewTrieNode();
+ Root->FailureNode = Root;
+
+ Node->Tweaks[0].Charge = Charge;
+ Node->Tweaks[0].ParentMass = Annotation->ParentMass;
+ Node->Spectrum->Charge = Charge;
+ Node->Spectrum->ParentMass = Annotation->ParentMass;
+ WindowFilterPeaks(Node->Spectrum, 0, 0);
+ PrepareSpectrumForIonScoring(PRMModelCharge2, Node->Spectrum, 0);
+ //SpectrumComputeBinnedIntensities(Node);
+ //SpectrumComputeNoiseDistributions(Node);
+ //SpectrumAssignIsotopeNeighbors(Node->Spectrum);
+ //SpectrumFindIsotopicPeaks(Node->Spectrum);
+ Node->Spectrum->Graph = ConstructTagGraph(Node->Spectrum);
+ TagGraphAddNodes(Node->Spectrum->Graph, Node->Spectrum);
+ TagGraphPopulateEdges(Node->Spectrum->Graph);
+ TagGraphScorePRMNodes(NULL, Node->Spectrum->Graph, Node->Spectrum, Node->Tweaks);
+ Tags = TagGraphGenerateTags(Node->Spectrum->Graph, Node->Spectrum, &TagCount, 1024, Node->Tweaks, TAG_EDGE_SCORE_MULTIPLIER, NULL);
+ if (Charge > 2)
+ {
+ Model = BNCharge3TaggingBN;
+ }
+ else
+ {
+ Model = BNCharge2TaggingBN;
+ }
+ // Set our PRM array, so we can check tag prefix masses:
+ //printf("\nTags for: %s\n", Annotation->Bases);
+ Mass = 0;
+ MatchLength = strlen(Annotation->Bases);
+ for (AminoIndex = 0; AminoIndex < MatchLength-1; AminoIndex++)
+ {
+ switch (Annotation->Bases[AminoIndex])
+ {
+ case 'I':
+ Annotation->Bases[AminoIndex] = 'L';
+ break;
+ case 'Q':
+ Annotation->Bases[AminoIndex] = 'K';
+ break;
+ default:
+ break;
+ }
+ PRM[AminoIndex] = Mass;
+ Mass += PeptideMass[Annotation->Bases[AminoIndex]];
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Annotation->AminoIndex[ModIndex] == AminoIndex && Annotation->ModType[ModIndex])
+ {
+ Mass += Annotation->ModType[ModIndex]->RealDelta;
+ }
+ }
+ }
+ ///////////////////////////////////////////////////////////////
+ // Optional verbose debugging:
+ // For each theoretical tag, look for the best tag that can be generated.
+ if (VerboseFlag)
+ {
+ //GetPRMFeatures(Node->Spectrum, Node->Tweaks, Model, 97870);
+ //DebugPrintScorpPRMScores(Node->Spectrum, Node->Tweaks);
+ for (AminoIndex = 0; AminoIndex < MatchLength-2; AminoIndex++)
+ {
+ Node0 = TagTestGetBestNode(Node->Spectrum->Graph, PRM[AminoIndex]);
+ Node1 = TagTestGetBestNode(Node->Spectrum->Graph, PRM[AminoIndex + 1]);
+ Node2 = TagTestGetBestNode(Node->Spectrum->Graph, PRM[AminoIndex + 2]);
+ Node3 = TagTestGetBestNode(Node->Spectrum->Graph, PRM[AminoIndex + 3]);
+ printf("Theoretical tag %.2f %s:\n", PRM[AminoIndex] / (float)MASS_SCALE, Annotation->Bases + AminoIndex);
+ if (Node0)
+ {
+ TagScore = Node0->Score;
+ printf(" Node0 %.2f score %.2f\n", Node0->Mass / (float)MASS_SCALE, Node0->Score);
+ }
+ else
+ {
+ printf(" <Node0 missing>\n");
+ TagScore = -9999;
+ }
+ if (Node1)
+ {
+ TagScore += Node1->Score;
+ printf(" Node1 %.2f score %.2f\n", Node1->Mass / (float)MASS_SCALE, Node1->Score);
+ }
+ else
+ {
+ printf(" <Node1 missing>\n");
+ TagScore = -9999;
+ }
+ if (Node2)
+ {
+ TagScore += Node2->Score;
+ printf(" Node2 %.2f score %.2f\n", Node2->Mass / (float)MASS_SCALE, Node2->Score);
+ }
+ else
+ {
+ printf(" <Node2 missing>\n");
+ TagScore = -9999;
+ }
+ if (Node3)
+ {
+ TagScore += Node3->Score;
+ printf(" Node3 %.2f score %.2f\n", Node3->Mass / (float)MASS_SCALE, Node3->Score);
+ }
+ else
+ {
+ printf(" <Node3 missing>\n");
+ TagScore = -9999;
+ }
+ if (Node0 && Node0->OriginalPeakIndex < 0)
+ {
+ TagScore *= (float)1.3333;
+ }
+ if (Node3 && Node3->OriginalPeakIndex < 0)
+ {
+ TagScore *= (float)1.3333;
+ }
+
+ printf("overall: %.2f\n", TagScore);
+ }
+ }
+ ///////////////////////////////////////////////////////////////
+ // Check each tag to see whether it's correct:
+ TagCount = min(TagCount, 512);
+ FoundFlag = 0;
+ TrieTagCount = 0;
+ for (TagIndex = 0; TagIndex < TagCount; TagIndex++)
+ {
+ TestTag = Tags + TagIndex;
+ DuplicateFlag = 0;
+ AddTagToTrie(Root, TestTag, &DuplicateFlag);
+ if (!DuplicateFlag)
+ {
+ TrieTagCount++;
+ //if (TrieTagCount <= 10)
+ {
+ if (VerboseFlag)
+ {
+ printf("%.2f\t%s\t%.2f\n", TestTag->PrefixMass / (float)MASS_SCALE, TestTag->Tag, TestTag->Score);
+ }
+
+ }
+ for (AminoIndex = 0; AminoIndex < MatchLength-2; AminoIndex++)
+ {
+ if (abs(TestTag->PrefixMass - PRM[AminoIndex]) < GlobalOptions->ParentMassEpsilon)
+ {
+ if (TestTag->Tag[0] == Annotation->Bases[AminoIndex] &&
+ TestTag->Tag[1] == Annotation->Bases[AminoIndex + 1] &&
+ TestTag->Tag[2] == Annotation->Bases[AminoIndex + 2])
+ {
+ TrueTagRankHistogram[TrieTagCount]++;
+ FoundFlag = 1;
+ }
+ }
+ }
+ }
+ if (FoundFlag)
+ {
+ break;
+ }
+ }
+ if (!FoundFlag)
+ {
+ // we missed, too bad. don't need to poke the histogram.
+ //TrueTagRankHistogram[511]++;
+ }
+ FreeTrieNode(Root);
+ Root = NULL;
+
+ SpectrumCount++;
+
+}
+
+void TestTagging(char* OracleFile, char* OracleDir)
+{
+ InitBayesianModels(); // to use new PRM scoring
+ InitStats();
+ TestTaggingCallback(NULL, 0, 0, NULL); // initialization
+ TrainOnOracleFile(OracleFile, OracleDir, TestTaggingCallback);
+ TestTaggingCallback(NULL, 1, 0, NULL); // completion
+}
+
+void TrainTaggingCallback(SpectrumNode* Node, int Charge, int ParentMass, Peptide* Annotation)
+{
+ static int SpectrumCount = 0;
+ int TagIndex;
+ int TagCount;
+ TrieTag* Tags;
+ int FoundFlag;
+ TrieTag* TestTag;
+ int PRM[64];
+ int Mass;
+ int AminoIndex;
+ int ModIndex;
+ int TrieTagCount;
+ int MatchLength;
+ int FeatureIndex;
+ BayesianModel* Model;
+ TrieNode* Root;
+ int DuplicateFlag;
+ int TrueTagFlag;
+ static FILE* TagTrainingFile = NULL;
+ //
+ Root = NULL;
+
+ if (!TagTrainingFile)
+ {
+ TagTrainingFile = fopen("TagTraining.txt", "w");
+ }
+ // Standard call: Given a spectrum, generate some tags. Test the first n tags, and write
+ // out a feature-vector for each.
+
+ // ** skip modded peptides:
+ if (Annotation->ModType[0])
+ {
+ return;
+ }
+ Root = NewTrieNode();
+ Root->FailureNode = Root;
+
+ Node->Tweaks[0].Charge = Charge;
+ Node->Tweaks[0].ParentMass = Annotation->ParentMass;
+ Node->Spectrum->Charge = Charge;
+ Node->Spectrum->ParentMass = Annotation->ParentMass;
+ WindowFilterPeaks(Node->Spectrum, 0, 0);
+ PrepareSpectrumForIonScoring(PRMModelCharge2, Node->Spectrum, 0);
+ //SpectrumComputeBinnedIntensities(Node);
+ //SpectrumComputeNoiseDistributions(Node);
+ //SpectrumAssignIsotopeNeighbors(Node->Spectrum);
+ //SpectrumFindIsotopicPeaks(Node->Spectrum);
+ Node->Spectrum->Graph = ConstructTagGraph(Node->Spectrum);
+ TagGraphAddNodes(Node->Spectrum->Graph, Node->Spectrum);
+ TagGraphPopulateEdges(Node->Spectrum->Graph);
+ TagGraphScorePRMNodes(NULL, Node->Spectrum->Graph, Node->Spectrum, Node->Tweaks);
+ Tags = TagGraphGenerateTags(Node->Spectrum->Graph, Node->Spectrum, &TagCount, 1024, Node->Tweaks, TAG_EDGE_SCORE_MULTIPLIER, NULL);
+ if (Charge > 2)
+ {
+ Model = BNCharge3TaggingBN;
+ }
+ else
+ {
+ Model = BNCharge2TaggingBN;
+ }
+ // Set our PRM array, so we can check tag prefix masses:
+ Mass = 0;
+ MatchLength = strlen(Annotation->Bases);
+ for (AminoIndex = 0; AminoIndex < MatchLength-1; AminoIndex++)
+ {
+ switch (Annotation->Bases[AminoIndex])
+ {
+ case 'I':
+ Annotation->Bases[AminoIndex] = 'L';
+ break;
+ case 'Q':
+ Annotation->Bases[AminoIndex] = 'K';
+ break;
+ default:
+ break;
+ }
+ PRM[AminoIndex] = Mass;
+ Mass += PeptideMass[Annotation->Bases[AminoIndex]];
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Annotation->AminoIndex[ModIndex] == AminoIndex && Annotation->ModType[ModIndex])
+ {
+ Mass += Annotation->ModType[ModIndex]->RealDelta;
+ }
+ }
+ }
+ ///////////////////////////////////////////////////////////////
+ // Check each tag to see whether it's correct:
+ TagCount = min(TagCount, 512);
+ FoundFlag = 0;
+ TrieTagCount = 0;
+ for (TagIndex = 0; TagIndex < min(10, TagCount); TagIndex++)
+ {
+ TrueTagFlag = 0;
+ TestTag = Tags + TagIndex;
+ DuplicateFlag = 0;
+ Root = AddTagToTrie(Root, TestTag, &DuplicateFlag);
+ if (!DuplicateFlag)
+ {
+ TrieTagCount++;
+ for (AminoIndex = 0; AminoIndex < MatchLength-2; AminoIndex++)
+ {
+ if (abs(TestTag->PrefixMass - PRM[AminoIndex]) < GlobalOptions->ParentMassEpsilon)
+ {
+ if (TestTag->Tag[0] == Annotation->Bases[AminoIndex] &&
+ TestTag->Tag[1] == Annotation->Bases[AminoIndex + 1] &&
+ TestTag->Tag[2] == Annotation->Bases[AminoIndex + 2])
+ {
+ TrueTagFlag = 1;
+ }
+ }
+ }
+ }
+ if (TrueTagFlag)
+ {
+ fprintf(TagTrainingFile, "+1 ");
+ }
+ else
+ {
+ fprintf(TagTrainingFile, "-1 ");
+ }
+ FeatureIndex = 1;
+ fprintf(TagTrainingFile, "%d:%.3f ", FeatureIndex++, TestTag->Score);
+ fprintf(TagTrainingFile, "\n");
+ }
+ if (!FoundFlag)
+ {
+ // we missed, too bad. don't need to poke the histogram.
+ //TrueTagRankHistogram[511]++;
+ }
+ FreeTrieNode(Root);
+ Root = NULL;
+
+ SpectrumCount++;
+
+}
+
+void TrainTagging(char* OracleFile, char* OracleDir)
+{
+ InitBayesianModels(); // to use new PRM scoring
+ InitStats();
+ TrainOnOracleFile(OracleFile, OracleDir, TrainTaggingCallback);
+}
+
+// Using flanking amino acid info, score the remaining nodes in Model.
+float SetTaggingFlankScore(PRMBayesianModel* Model, TagGraphNode** TagNodes, TagGraphEdge** TagEdges, int Depth, int RightEndpointFlag)
+{
+ PRMBayesianNode* Node;
+ PRMBayesianNodeHolder* Holder;
+ char PrefixAA = UNKNOWN_AMINO;
+ char SuffixAA = UNKNOWN_AMINO;
+ float Score = 0;
+ int TableIndex;
+ int ParentIndex;
+ //
+ if (Depth)
+ {
+ PrefixAA = TagEdges[Depth - 1]->Jump->Amino;
+ }
+ if (!RightEndpointFlag)
+ {
+ SuffixAA = TagEdges[Depth]->Jump->Amino;
+ }
+ for (Holder = Model->FirstFlank; Holder; Holder = Holder->Next)
+ {
+ Node = Holder->Node;
+ switch (Node->Type)
+ {
+ case evFlank:
+ Node->Value = IonScoringGetFlank(Node, PrefixAA, SuffixAA);
+ break;
+ case evPrefixAA:
+ if ((PrefixAA - 'A') == Node->Flag)
+ {
+ Node->Value = 1;
+ }
+ else
+ {
+ Node->Value = 0;
+ }
+ break;
+ case evSuffixAA:
+ if ((SuffixAA - 'A') == Node->Flag)
+ {
+ Node->Value = 1;
+ }
+ else
+ {
+ Node->Value = 0;
+ }
+ break;
+ default:
+ // We already knew this node's value (based on intensity). Now we know its parents' values (based
+ // in part on flanking amino acids). ASSUME that all parents are in the FlankList.
+ TableIndex = Node->Value;
+ for (ParentIndex = 0; ParentIndex < Node->ParentCount; ParentIndex++)
+ {
+ TableIndex += Node->Parents[ParentIndex]->Value * Node->ParentBlocks[ParentIndex];
+ }
+ Score = Node->ProbTable[TableIndex];
+ // The score from the NOISE MODEL has already been integrated. So, we're done.
+ break;
+ }
+ }
+ return Score;
+}
+
+
+// New tag generation function: Generates tags of a (more-or-less) arbitrary length!
+// Incorporates a more sophisticated intensity scoring function that considers
+// amino acid effects.
+TrieTag* TagGraphGenerateTags(TagGraph* Graph, MSSpectrum* Spectrum, int* TagCount,
+ int MaximumTagCount, SpectrumTweak* Tweak, float TagEdgeScoreMultiplier,
+ struct PRMBayesianModel* Model)
+{
+ TagGraphNode* TagNodes[12];
+ TagGraphEdge* TagEdges[12];
+ int NodeIndex;
+ int EdgeIndex;
+ TagGraphNode* Node;
+ TagGraphEdge* Edge;
+ TagGraphNode* LeftNode;
+ TagGraphNode* RightNode;
+ int CurrentDepth;
+ int InternalNodes;
+ float NodeScore;
+ TrieTag* Tag;
+ int TagAllocation;
+ int BacktrackFlag;
+ int AllTagCount = 0;
+ int Bin;
+ float FlankScore[12];
+ float ScoreToBeat = -9999;
+ //
+ if (!Model)
+ {
+ if (Tweak->Charge < 3)
+ {
+ Model = TAGModelCharge2;
+ }
+ else
+ {
+ Model = TAGModelCharge3;
+ }
+ }
+ *TagCount = 0;
+ TagAllocation = 1024;
+ if (!AllTags)
+ {
+ AllTags = (TrieTag*)calloc(TagAllocation, sizeof(TrieTag));
+ }
+ NodeIndex = 0;
+ EdgeIndex = -1;
+ BacktrackFlag = 0;
+ CurrentDepth = 0;
+ TagNodes[0] = Graph->FirstNode;
+ // Main iteration: Depth-first traversal through the DAG, up to a maximum depth of
+ // GlobalOptions->GenerateTagLength, and with each possible root (TagNodes[0]).
+ while (1)
+ {
+ // If we're BACKTRACKING, then move to a sibling or parent:
+ if (BacktrackFlag)
+ {
+ // Move the root of the subtree, if necessary:
+ if (CurrentDepth == 0)
+ {
+ // Move to the next 'first' node:
+ TagNodes[0] = TagNodes[0]->Next;
+ if (!TagNodes[0])
+ {
+ break;
+ }
+ BacktrackFlag = 0;
+ continue;
+ }
+ // Move to a sibling, if we can:
+ TagEdges[CurrentDepth - 1] = TagEdges[CurrentDepth - 1]->Next;
+ if (TagEdges[CurrentDepth - 1])
+ {
+ TagNodes[CurrentDepth] = TagEdges[CurrentDepth - 1]->ToNode;
+ BacktrackFlag = 0;
+ FlankScore[CurrentDepth - 1] = SetTaggingFlankScore(Model, TagNodes, TagEdges, CurrentDepth - 1, 0);
+ continue;
+ }
+ // No more siblings - move up one level.
+ CurrentDepth--;
+ continue;
+ }
+
+ // Special case for level 1: Skip tag nodes with silly masses like 20Da.
+ if (CurrentDepth == 0)
+ {
+ Node = TagNodes[0];
+ if (Node->Mass > GlobalOptions->ParentMassEpsilon && Node->Mass < GLYCINE_MASS - GlobalOptions->Epsilon)
+ {
+ BacktrackFlag = 1;
+ continue;
+ }
+ }
+
+ // If we're deep enough, report a tag and start backtracking:
+ if (CurrentDepth >= GlobalOptions->GenerateTagLength)
+ {
+ FlankScore[CurrentDepth] = SetTaggingFlankScore(Model, TagNodes, TagEdges, CurrentDepth, 1);
+ BacktrackFlag = 1;
+ LeftNode = TagNodes[0];
+ RightNode = TagNodes[CurrentDepth];
+ Tag = AllTags + (*TagCount);
+ InternalNodes = 0;
+ NodeScore = 0;
+ for (NodeIndex = 0; NodeIndex <= CurrentDepth; NodeIndex++)
+ {
+ Node = TagNodes[NodeIndex];
+ if (Node->OriginalPeakIndex > 0)
+ {
+ NodeScore += Node->Score;
+ NodeScore += FlankScore[NodeIndex];
+ InternalNodes++;
+ }
+ Tag->Nodes[NodeIndex] = TagNodes[NodeIndex];
+ }
+ NodeScore *= (GlobalOptions->GenerateTagLength + 1) / (float)max(1, InternalNodes);
+ Tag->Score = NodeScore;
+ Tag->ModsUsed = 0;
+ memset(Tag->ModType, 0, sizeof(MassDelta*) * MAX_PT_MODS);
+ memset(Tag->AminoIndex, -1, sizeof(int) * MAX_PT_MODS);
+ if (LeftNode->NodeType == evGraphNodeLeftMod)
+ {
+ // Sanity check: The first AA must be one where this mod can
+ // be attached!
+ if (!AllKnownPTMods[LeftNode->PTM->Index].Allowed[TagEdges[0]->Jump->Amino - 'A'])
+ {
+ continue;
+ }
+ Tag->AminoIndex[Tag->ModsUsed] = 0;
+ Tag->ModType[Tag->ModsUsed] = LeftNode->PTM;
+ Tag->ModsUsed++;
+ }
+ for (EdgeIndex = 0; EdgeIndex < CurrentDepth; EdgeIndex++)
+ {
+ Edge = TagEdges[EdgeIndex];
+ Tag->Score += TagEdges[EdgeIndex]->Score;
+ Tag->Tag[EdgeIndex] = Edge->Jump->Amino;
+ if (Edge->Jump->Delta)
+ {
+ Tag->AminoIndex[Tag->ModsUsed] = EdgeIndex;
+ Tag->ModType[Tag->ModsUsed] = Edge->Jump->Delta;
+ Tag->ModsUsed++;
+ }
+ }
+ // Set skew info:
+ Tag->TotalSkew = 0;
+ Tag->TotalAbsSkew = 0;
+ for (EdgeIndex = 0; EdgeIndex < CurrentDepth; EdgeIndex++)
+ {
+ Edge = TagEdges[EdgeIndex];
+ Tag->TotalSkew += Edge->Skew;
+ Tag->TotalAbsSkew += abs(Edge->Skew);
+ }
+ ////////////////////////////////////////////////////
+ // If the total skew is large, penalize the tag's score:
+ Bin = (int)(fabs((Tag->TotalSkew / 50.0)));
+ if (Bin >= TagSkewBinCount)
+ {
+ Bin = TagSkewBinCount - 1;
+ }
+ Tag->Score += TagSkewScore[Bin] * TagEdgeScoreMultiplier;
+ Bin = (int)(fabs((Tag->TotalAbsSkew / 50.0)));
+ if (Bin >= TagSkewBinCount)
+ {
+ Bin = TagSkewBinCount - 1;
+ }
+ Tag->Score += TagTotalAbsSkewScore[Bin] * TagEdgeScoreMultiplier;
+ ////////////////////////////////////////////////////
+ Tag->Tag[EdgeIndex] = '\0';
+ if (Tag->Score < ScoreToBeat)
+ {
+ // Abort the tag - it's not good enough!
+ continue;
+ }
+ if (RightNode->NodeType == evGraphNodeRightMod)
+ {
+ // Sanity check: The last AA must be one where this mod can
+ // be attached!
+ if (!AllKnownPTMods[RightNode->PTM->Index].Allowed[TagEdges[CurrentDepth - 1]->Jump->Amino - 'A'])
+ {
+ continue;
+ }
+ Tag->AminoIndex[Tag->ModsUsed] = CurrentDepth;
+ Tag->ModType[Tag->ModsUsed] = RightNode->PTM;
+ Tag->ModsUsed++;
+ }
+ Tag->PSpectrum = Spectrum;
+ Tag->Tweak = Tweak;
+ Tag->TagLength = CurrentDepth;
+ Tag->ParentMass = Spectrum->ParentMass;
+ Tag->Charge = Spectrum->Charge;
+ Tag->PrefixMass = TagNodes[0]->Mass;
+ Tag->SuffixMass = Spectrum->ParentMass - PARENT_MASS_BOOST - TagNodes[CurrentDepth]->Mass;
+ (*TagCount)++;
+ AllTagCount++;
+ // If we've got as many tags as we can handle, drop all but the best. (Don't
+ // just reallocate; we could end up with a *lot*!)
+ if ((*TagCount) + 5 >= TagAllocation)
+ {
+ qsort(AllTags, *TagCount, sizeof(TrieTag), (QSortCompare)CompareTagScores);
+ *TagCount = TagAllocation / 2;
+ if (MaximumTagCount >= 0)
+ {
+ ScoreToBeat = AllTags[min(TagAllocation - 5, MaximumTagCount)].Score;
+ }
+ else
+ {
+ ScoreToBeat = AllTags[*TagCount].Score;
+ }
+ }
+ //printf("Added a tag for %d - %s - %d\n",Tag->PrefixMass, Tag->Tag, Tag->SuffixMass);
+ continue;
+ } // If we're at tag depth
+
+ // We're not at tag depth yet.
+ // Move to our first available child:
+ TagEdges[CurrentDepth] = TagNodes[CurrentDepth]->FirstEdge;
+ if (!TagEdges[CurrentDepth])
+ {
+ BacktrackFlag = 1;
+ continue;
+ }
+ else
+ {
+ CurrentDepth++;
+ TagNodes[CurrentDepth] = TagEdges[CurrentDepth - 1]->ToNode;
+ FlankScore[CurrentDepth - 1] = SetTaggingFlankScore(Model, TagNodes, TagEdges, CurrentDepth - 1, 0);
+ }
+ }
+ // Sort the tags, by score:
+ qsort(AllTags, *TagCount, sizeof(TrieTag), (QSortCompare)CompareTagScores);
+ return AllTags;
+
+}
diff --git a/Tagger.h b/Tagger.h
new file mode 100644
index 0000000..c3b5ad7
--- /dev/null
+++ b/Tagger.h
@@ -0,0 +1,199 @@
+//Title: Tagger.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef TAGGER_H
+#define TAGGER_H
+// Tagger.h defines objects and functions related to the building of a tag graph,
+// and the generation of short (usually tripeptide) tags from the graph. Each
+// node in the graph represents a prefix residue mass (PRM), and so the nodes can
+// be thought of as points along an m/z axis. A (directed) edge in the graph
+// represents a valid jump, where a "jump" is the mass of an amino acid (or modified
+// amino acid). The tag graph is used to construct tags (formally, paths of length
+// three). This is one approach to the local de novo interpretation problem.
+
+#include "Inspect.h"
+#include "Trie.h"
+#include "Spectrum.h"
+#include "IonScoring.h"
+
+// Each graph node has some set of witness ions (e.g. b and y peaks). We encode
+// the set of ions as a bitfield.
+// For instance, B and Y together is ION_FLAG_B | ION_FLAG_Y = 0x0011.
+#define ION_FLAG_B 0x0001
+#define ION_FLAG_BH2O 0x0002
+#define ION_FLAG_BNH3 0x0004
+#define ION_FLAG_A 0x0008
+#define ION_FLAG_Y 0x0010
+#define ION_FLAG_YH2O 0x0020
+#define ION_FLAG_YNH3 0x0040
+#define ION_FLAG_B2 0x0080
+#define ION_FLAG_Y2 0x0100
+
+// INTENSITY_RANK_COUNT is the number of entries in IntensityRankBOdds and IntensityRankYOdds
+#define INTENSITY_RANK_COUNT 22
+
+// Probability that a peak with this intensity-rank is a b peak:
+extern float* IntensityRankBOdds;
+
+// Probability that a peak with this intensity-rank is a y peak:
+extern float* IntensityRankYOdds;
+
+// Probability that a peak with reported m/z this far from the expected m/z is a true b or y peak:
+extern double SkewHistoStep[100];
+
+// The witness scores for a node are based upon the collection of ion
+// types bearing witness to a particular break-point, similar to Dancik
+// scores. Scores are empirically derived from a training dataset (currently the ISB dataset).
+// We reckon these odds separately for low, medium and high mass peaks. "Low" is "below 33% of precursor mass",
+// and "High" is "above 66% of precursor mass".
+
+// A JumpNode captures the mass, amino acid, and PTM (if any) of a valid
+// edge length for the tag graph.
+typedef struct JumpNode
+{
+ int Mass;
+ struct JumpNode* Next;
+ char Amino;
+ // ASSUME: We only permit one modification per peptide in a tag.
+ MassDelta* Delta;
+ float Score;
+} JumpNode;
+
+// The type of a graph node indicates whether it was created by interpreting a spectral peak as a b or y ion,
+// or whether it is an endpoint (evGraphNodeLeft, evGraphNodeRight). The special types evGraphNodeLeftMod,
+// evGraphNodeRightMod are created when N- and C-terminal PTMs are allowed.
+typedef enum evGraphNodeType
+{
+ evGraphNodeB = 0,
+ evGraphNodeY,
+ evGraphNodeLeft,
+ evGraphNodeLeftMod,
+ evGraphNodeRight,
+ evGraphNodeRightMod
+} evGraphNodeType;
+
+typedef struct TagGraphNode
+{
+#ifdef DEBUG_TAG_GENERATION
+ char VerboseNodeInfo[2048];
+#endif
+ int OriginalPeakIndex;
+ int BIndex;
+ int YIndex;
+ int IntensityRankB;
+ int IntensityRankY;
+ evGraphNodeType NodeType;
+ // A graph node is scored based upon its intensity score (intensity-rank of the b and y peak),
+ // its isotope score (whether the b and y peaks are apparently secondary isotopic peaks, primary
+ // peaks with children, or lone peaks) and its ion type score (the witness set).
+ float IntensityScore;
+ float IsotopeScore;
+ float IonTypeScore;
+ float Score;
+ //float ScoreB;
+ //float ScoreY;
+ int IonTypeFlags;
+ int Mass;
+ // List of edges leading forward in the graph:
+ struct TagGraphEdge* FirstEdge;
+ struct TagGraphEdge* LastEdge;
+ // Next and previous nodes (sorted by mass):
+ struct TagGraphNode* Next;
+ struct TagGraphNode* Prev;
+ MassDelta* PTM; // Is non-null only if NodeType is LeftMod or RightMod
+ // BackEdge, BackEdgeDouble, and BackEdgeTriple are set only when carrying out blind
+ // mod search. They speed up the big d.p. extension algorithm.
+ struct TagGraphBackEdge** BackEdge; //[26]; // List of edges matching an unmodified aa
+ struct TagGraphBackEdge** BackEdgeDouble; //[26*26]; // List of edges matching two unmodified aa's
+ struct TagGraphBackEdge** BackEdgeTriple; //[26*26*26]; // List of edges matching three unmodified aa's
+ int Index; // This is set AFTER all the graph nodes have been created and sorted.
+} TagGraphNode;
+
+// BackEdge points to a graph node whose mass is smaller by 1, 2, or 3 unmodified amino acid masses.
+typedef struct TagGraphBackEdge
+{
+ TagGraphNode* FromNode;
+ TagGraphNode* ToNode;
+ int Score;
+ int Skew;
+ // If this edge is a double-amino-acid jump, then HalfMass is the mass after the first amino acid.
+ int HalfMass;
+ int HalfMass2; // For triples
+ struct TagGraphBackEdge* Next;
+} TagGraphBackEdge;
+
+// Each NODE in the graph owns a list of EDGES. Each edge joins to a higher-mass node
+typedef struct TagGraphEdge
+{
+ TagGraphNode* FromNode;
+ TagGraphNode* ToNode;
+ JumpNode* Jump;
+ float Score;
+ struct TagGraphEdge* Next;
+ int Skew;
+} TagGraphEdge;
+
+// A TagGraph has pointer to its first/last nodes, an index (for quickly finding nodes for a PRM),
+// and a buffer of back edges (populated only in blind mode).
+typedef struct TagGraph
+{
+ TagGraphNode* FirstNode;
+ TagGraphNode* LastNode;
+ // Index: Points to the first node that could match a given rounded-to-amu mass
+ TagGraphNode** NodeIndex;
+ int NodeIndexSize;
+ int NodeCount; // Number of nodes in the list FirstNode...LastNode.
+ struct TagGraphBackEdge* BackEdgeBuffer;
+} TagGraph;
+
+void TagGraphBuildNodeIndex(TagGraph* Graph);
+TrieNode* GenerateTagsFromSpectrum(MSSpectrum* Spectrum, TrieNode* Root, int MaximumTagCount, SpectrumTweak* Tweak);
+void CorrectParentMass(MSSpectrum* Spectrum);
+int LoadIntensityRankOdds(char* FileName);
+int LoadWitnessScores(char* FileName);
+void PopulateJumpingHash();
+int FindIntensePeak(MSSpectrum* Spectrum, int Mass, float MaxIntensity, float* FoundIntensity);
+void SpectrumFindIsotopicPeaks(MSSpectrum* Spectrum);
+TagGraph* ConstructTagGraph(MSSpectrum* Spectrum);
+void TagGraphAddNodes(TagGraph* Graph, MSSpectrum* Spectrum);
+void TagGraphPopulateEdges(TagGraph* Graph);
+void FreeTagGraph(TagGraph* Graph);
+void FreeJumpingHash();
+void FreeTagGraphNode(TagGraphNode* Node);
+void TestTagging(char* OracleFile, char* OracleDir);
+void TrainTagging(char* OracleFile, char* OracleDir);
+int CompareTagScores(const TrieTag* TagA, const TrieTag* TagB);
+TrieNode* BuildTrieFromTags(TrieTag* AllTags, int TagCount, TrieNode* Root, int MaximumTagCount);
+void SetTagSkewScores();
+void FreeTagSkewScores();
+// declaration of TagGraphGenerateTags moved out, since it uses PRMBayesianModel
+#endif // TAGGER_H
diff --git a/TrainPTMFeatures.py b/TrainPTMFeatures.py
new file mode 100644
index 0000000..70bf3c1
--- /dev/null
+++ b/TrainPTMFeatures.py
@@ -0,0 +1,762 @@
+#Title: TrainPTMFeatures.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+"""
+Plan:
+Output a large collection of features for each post-translational modification accepted on a
+search of a part-bogus database. All modifications on the bogus proteins are incorrect.
+An equivalent number of modifications on the non-bogus proteins are incorrect. Let's compute
+a variety of features for the PTMs observed.
+
+Input:
+A collection of annotated spectra, output by SelectSites.py
+Output:
+A file listing all the observed modification sites, with various features computed.
+
+Then, we train a model to distinguish between good (correct DB) and bad (incorrect DB)
+modifications. Model types: LDA, logistic regression, SVM, etc.
+(Another experiment: Search unmodified spectra against a mutated database,
+judge correct precisely those modifications which "undo" the mutations)
+"""
+
+UsageInfo = """
+TrainPTMFeatures: Train a model on PTM features from a data set.
+Run this AFTER running ComputePTMFeatures.
+
+Arguments:
+ -m [model-type]: Train a model and report its accuracy on the specified
+ (-u) file
+ -u [FeatureFile]: Path to the feature-file written out by ComputePTMFeatures
+ -v [FeatureFile]: Write scored features out to the specified file
+ -w [FileName]: Write model to the specified file. (Set either -w OR -r)
+ -r [FileName]: Read a model from the specified file (Set either -w OR -r)
+
+Optional:
+ -f [Flag]: Perform feature selection (1 for accumulate, 2 for prune)
+ -e [TestingFile]: Path to the feature-file that serves as a testing set.
+ If not specified, then the same features (-u) will be used for testing.
+ For use with -f flag only.
+ -R [Path]: Report ROC curve to the specified file
+"""
+import os
+import sys
+import struct
+import traceback
+import getopt
+import MSSpectrum
+import PyInspect
+import random
+import shutil
+import Learning
+import BasicStats
+import ResultsParser
+import SpectralSimilarity
+random.seed(1)
+from Utils import *
+Initialize()
+try:
+ from numpy import *
+ import numpy.linalg
+ FloatType = float
+ MatrixMultiply = dot
+except:
+ print "** Warning: Unable to import Numpy. Logit training not available"
+
+ValidFeatureIndices = [2,3,5,22,23,24,26]
+
+class FeatureBits:
+ SpectrumCount = 0
+ ModlessSpectrumCount = 1
+ BestMQScore = 2
+ BestDeltaScore = 3
+ PeptideCount = 4
+ ConsensusMQScore = 5
+ PeptideLength = 6
+ TotalCutScore = 7
+ MedianCutScore = 8
+ YPresent = 9
+ BPresent = 10
+ BYIntensity = 11
+ NTT = 12
+ ModdedFraction = 13
+ SpectraThisModType = 15
+ SitesThisModType = 16
+ Dot = 18
+ Shared01 = 19
+ Shared11 = 20
+ Correlation = 21
+ LogSpectrumCount = 22
+ LogPeptideLength = 23
+ LogSpecThisType = 24
+ LogSitesThisType = 25
+ DeltaVsBigDB = 26
+
+class FormatBits:
+ DBPos = 1
+ ModificationMass = 2
+ ModifiedAA = 3
+ ProteinName = 4
+ ModifiedResidueNumber = 5
+ Peptide = 6
+ Charge = 7
+ TrueProteinFlag = 8
+ SisterAnnotationFlag = 9
+ BestSpectrumPath = 10
+ BestModlessSpectrumPath = 11
+ BestModlessMQScore = 12
+ BigDBAnnotation = 13
+ BigDBMQScore = 14
+ SpectrumCount = 15
+ ModlessSpectrumCount = 16
+ BestMQScore = 17
+ BestDeltaScore = 18
+ PeptideCount = 19
+ ConsensusMQScore = 20
+ NTT = 27
+ ModdedFraction = 28
+ SpectraWithThisModType = 30
+ SitesWithThisModType = 31
+ LogSpectrumCount = 37
+ LogSpectraThisModType = 39
+ LogSitesThisModType = 40
+ ConsensusDeltaBigDB = 41
+ FirstFeature = 15
+ LastFeature = 41
+ FeatureCount = LastFeature - FirstFeature + 1
+ ModelScore = 42 # score for the PEPTIDE SPECIES
+ ModelPValue = 43 # p-value (probability false given this score) for the PEPTIDE SPECIES
+ SitePValue = 44 # p-value (probability false given several species) for the SITE
+ KnownPTMName = 45
+ KnownPTMAnnotation = 46
+ KnownPTMScore = 47
+ KnownPTMSitePValue = 48
+
+class PTMFeatureTrainer(ResultsParser.ResultsParser):
+ def __init__(self):
+ self.ResultsFileName = None
+ self.DBPath = None
+ self.OutputPath = "PTMFeatures.txt"
+ self.TempFileDir = "PTMFeatures"
+ self.PTMs = {} # keys of the form (DBPos, Mass)
+ self.CoverageThreshold = 2 # at least this many spectra to consider a residue 'covered'.
+ self.QuickParseFlag = 0 # if true, then parse only the first n lines
+ self.SpectrumDir = None
+ self.SuperSpectrumDir = None
+ self.PoolFlag = 0
+ self.ModelType = None
+ self.SisterProteins = {} # protein index -> sister protein's index
+ self.InputFeaturePath = None
+ self.ModelTestFilePath = None
+ # Dictionary of unmodified peptides, for computing the coverage level:
+ self.UnmodifiedPeptides = {}
+ self.FeatureSelectionFlag = None
+ self.CachedProteinNames = []
+ self.CachedFilePaths = []
+ self.CachedFixedFilePaths = []
+ self.StartOutputDBPos = 0
+ self.HeaderLines = []
+ self.ReportROCPath = None
+ self.OutputFeaturePath = None
+ self.ReadModelFilePath2 = None
+ self.ReadModelFilePath3 = None
+ self.TrainingSetDBRatio = 1.0
+ self.TestingSetDBRatio = 1.0
+ self.WriteModelFilePath2 = None
+ self.WriteModelFilePath3 = None
+ ResultsParser.ResultsParser.__init__(self)
+ def TrainFacultative(self):
+ """
+ Train paired models for CONSTITUTIVE ("always") and FACULTATIVE ("sometimes") PTMs.
+ """
+ # Train a model on all PTMs, to get initial scores for all PTMs.
+ # The initial model uses only CONSTITUTIVE features, and its output
+ # is used only to provide an ORACLE for the facultative model:
+ print "TRAIN model on all features:"
+ self.Model.Train(self.TrainingSetAll)
+ print "SCORE all features:"
+ self.Model.Test(self.TrainingSetAll)
+ ##############################################################
+ print "Generate SUB-MODEL of only facultative features:"
+ # Sort facultative instances by score:
+ SortedList = []
+ for Vector in self.TrainingSetAll.AllVectors:
+ if not Vector.FileBits[FormatBits.SisterAnnotationFlag]:
+ continue
+ SortedList.append((Vector.Score, Vector))
+ SortedList.sort()
+ FacFeatureSet = Learning.FeatureSetClass()
+ ChunkSize = min(len(SortedList) / 4, 1000)
+ print "Sorted list of %s facultative features, chunk size is %s"%(len(SortedList), ChunkSize)
+ for (Score, Vector) in SortedList[:ChunkSize]:
+ NewVector = Learning.FeatureVector()
+ NewVector.FileBits = Vector.FileBits[:]
+ NewVector.Features = Vector.Features[:]
+ NewVector.TrueFlag = 0
+ FacFeatureSet.AllVectors.append(NewVector)
+ FacFeatureSet.FalseVectors.append(NewVector)
+ for (Score, Vector) in SortedList[-ChunkSize:]:
+ NewVector = Learning.FeatureVector()
+ NewVector.FileBits = Vector.FileBits[:]
+ NewVector.Features = Vector.Features[:]
+ NewVector.TrueFlag = 1
+ FacFeatureSet.AllVectors.append(NewVector)
+ FacFeatureSet.TrueVectors.append(NewVector)
+ FacFeatureSet.SetCounts()
+ FacFeatureSet.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
+ ##############################################################
+ # Write out the FACULTATIVE feature set:
+ FacTrainingFile = open("FacultativeTrainingSet.txt", "wb")
+ for HeaderLine in self.HeaderLines:
+ FacTrainingFile.write(HeaderLine)
+ for Vector in FacFeatureSet.AllVectors:
+ Bits = Vector.FileBits[:]
+ if Vector.TrueFlag:
+ Bits[FormatBits.TrueProteinFlag] = "1"
+ else:
+ Bits[FormatBits.TrueProteinFlag] = "0"
+ Str = string.join(Bits, "\t")
+ FacTrainingFile.write(Str + "\n")
+ FacTrainingFile.close()
+ ##############################################################
+ # Train the sub-model:
+ self.FacModel = self.GetModelObject(self.FeaturesF)
+ self.FacModel.Train(FacFeatureSet)
+ self.FacModel.Test(FacFeatureSet)
+ self.FacModel.ReportAccuracy(FacFeatureSet) # invokes ComputeOddsTrue
+ ##############################################################
+ # Apply the trained fac-model to *all* facultative features, and
+ # train an overall model on all *constitutive* features:
+ self.FeatureSetC = Learning.FeatureSetClass()
+ self.FeatureSetF = Learning.FeatureSetClass()
+ for Vector in self.TrainingSetAll.AllVectors:
+ if Vector.FileBits[FormatBits.SisterAnnotationFlag]:
+ FeatureSet = self.FeatureSetF
+ else:
+ FeatureSet = self.FeatureSetC
+ FeatureSet.AllVectors.append(Vector)
+ if Vector.TrueFlag:
+ FeatureSet.TrueVectors.append(Vector)
+ else:
+ FeatureSet.FalseVectors.append(Vector)
+ self.FeatureSetC.SetCounts()
+ self.FeatureSetF.SetCounts()
+ self.FeatureSetC.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
+ self.FeatureSetF.GetPriorProbabilityFalse(self.TrainingSetDBRatio)
+ # Score facultative-feature, using facultative-model:
+ self.FacModel.Test(self.FeatureSetF)
+ # Train constitutive-ONLY model, and score constitutive features:
+ self.ConModel = self.GetModelObject(self.FeaturesC)
+ self.ConModel.Train(self.FeatureSetC)
+ self.ConModel.Test(self.FeatureSetC)
+ self.ConModel.ReportAccuracy(self.FeatureSetC) # to invoke ComputeOddsTrue
+ ##############################################################
+ # Save our models:
+ if self.WriteModelFilePath:
+ (Stub, Extension) = os.path.splitext(self.WriteModelFilePath)
+ ConModelPath = "%s.con"%Stub
+ FacModelPath = "%s.fac"%Stub
+ self.ConModel.SaveModel(ConModelPath)
+ self.FacModel.SaveModel(FacModelPath)
+ ##############################################################
+ # Write out the scored features:
+ OutputFile = open(self.OutputFeaturePath, "wb")
+ for Line in self.HeaderLines:
+ OutputFile.write(Line)
+ for Vector in self.TrainingSetAll.AllVectors:
+ if Vector.FileBits[FormatBits.SisterAnnotationFlag]:
+ PValue = self.FacModel.GetPValue(Vector.Score)
+ else:
+ PValue = self.ConModel.GetPValue(Vector.Score)
+ while len(Vector.FileBits) <= FormatBits.ModelPValue:
+ Vector.FileBits.append("")
+ Vector.FileBits[FormatBits.ModelScore] = str(Vector.Score)
+ Vector.FileBits[FormatBits.ModelPValue] = str(PValue)
+ Str = string.join(Vector.FileBits, "\t")
+ OutputFile.write(Str + "\n")
+ def GetModelObject(self, Features):
+ if self.ModelType == "lda":
+ return Learning.LDAModel(Features)
+ elif self.ModelType == "svm":
+ return Learning.SVMModel(Features)
+ elif self.ModelType == "logit":
+ return Learning.LogitModel(Features)
+ else:
+ print "** Model type NOT KNOWN!", self.ModelType
+ return
+ def TrainModel(self):
+ """
+ Our training data-set is in self.InputFeaturePath.
+ Let's train a model to predict which entries come from the true database.
+ """
+ if not self.InputFeaturePath:
+ print "* Please specify an input feature-file."
+ print UsageInfo
+ sys.exit(-1)
+ # Load in features for a collection of TRUE and FALSE instances.
+ File = open(self.InputFeaturePath, "rb")
+ self.FeatureNames = {}
+ FeatureCount = FormatBits.LastFeature - FormatBits.FirstFeature + 1
+ # We have one set of features for facultative sites, and one for constitutive.
+ # Note that some features (modification rate, correlation with unmodified peptide)
+ # are applicable to F but not C.
+ #self.FeaturesF = range(FeatureCount)
+ # For constitutive modifications: Modification rate, protein coverage,
+ # and number of unmodified peptides are all off-limits. (Those features
+ # are "dead giveaways" that we have a non-shuffled protein!)
+ #self.FeaturesC = [2, 3, 5, 22, 24, 25, 26]
+ self.FeaturesC = ValidFeatureIndices[:]
+ #self.FeaturesC = range(FeatureCount)
+ self.FeaturesF = self.FeaturesC
+ self.FeaturesAll = []
+ for FeatureIndex in self.FeaturesF:
+ if FeatureIndex in self.FeaturesC:
+ self.FeaturesAll.append(FeatureIndex)
+ # We can OVERRIDE the list of features here, to forbid the use of some:
+ print "Permitted features all:", self.FeaturesAll
+ # Parse the features from the TRAINING and TESTING files. We generate
+ # one training set for CHARGE-2 spectra and one for CHARGE-3 spectra.
+ self.TrainingSet2 = Learning.FeatureSetClass()
+ self.TrainingSet2.Type = "Charge-2"
+ self.TrainingSet3 = Learning.FeatureSetClass()
+ self.TrainingSet3.Type = "Charge-3"
+ #self.TrainingSetAll = Learning.FeatureSetClass()
+ #self.TrainingSetAll.Type = "All"
+ self.ParseFeatureFile(self.InputFeaturePath, self.TrainingSet2, self.TrainingSet3,
+ self.TrainingSetDBRatio)
+ if self.ModelTestFilePath:
+ self.TestingSet2 = Learning.FeatureSetClass()
+ self.TestingSet3 = Learning.FeatureSetClass()
+ self.ParseFeatureFile(self.ModelTestFilePath, self.TestingSet2, self.TestingSet3,
+ self.TestingSetDBRatio)
+ # SPECIAL values for model, which don't actually cause training:
+ if self.ModelType == "feature":
+ print "\n\nSINGLE feature:"
+ self.TrainOneFeature(self.TrainingSet2)
+ self.TrainOneFeature(self.TrainingSet3)
+ return
+ if self.ModelType == "featurescatter":
+ print "\n\nFeature+feature scatter-plots:"
+ self.ProduceFeatureScatterPlots(self.TrainingSetAll)
+ return
+ if self.ModelType == "summary":
+ self.PerformFeatureSummary()
+ return
+ # Instantiate our model:
+ self.Model2 = self.GetModelObject(self.FeaturesAll)
+ self.Model3 = self.GetModelObject(self.FeaturesAll)
+ # Load a pre-trained model, if we received a path:
+ if self.ReadModelFilePath2:
+ self.Model2.LoadModel(self.ReadModelFilePath2)
+ self.Model3.LoadModel(self.ReadModelFilePath3)
+ #######################################################################
+ # Special value for feature selection (3) means that we train a model on
+ # all data, then use it to generate a sub-feature-set for a facultative model!
+ if self.FeatureSelectionFlag == 3:
+ self.TrainFacultative()
+ return
+ #######################################################################
+ # If we're not doing feature selection: Train on the training set,
+ # and then (if we have a testing set) test on the testing set.
+ if not self.FeatureSelectionFlag:
+ # Train the model (unless we just loaded it in):
+ if not self.ReadModelFilePath2:
+ self.Model2.Train(self.TrainingSet2)
+ self.Model3.Train(self.TrainingSet3)
+ # Compute the score of each vector:
+ if self.ModelTestFilePath:
+
+ self.Model2.Test(self.TestingSet2)
+ self.Model2.ReportAccuracy(self.TestingSet2)
+
+ self.Model3.Test(self.TestingSet3)
+ self.Model3.ReportAccuracy(self.TestingSet3)
+ self.WriteScoredFeatureSet(self.TestingSet2, self.TestingSet3)
+ else:
+
+ self.Model2.Test(self.TrainingSet2)
+ self.Model2.ReportAccuracy(self.TrainingSet2)
+ shutil.copyfile("PValues.txt", "PValues.chg2.txt")
+
+ self.Model3.Test(self.TrainingSet3)
+ self.Model3.ReportAccuracy(self.TrainingSet3)
+ shutil.copyfile("PValues.txt", "PValues.chg3.txt")
+ #if self.ReportROCPath:
+ # self.Model.ReportROC(self.TrainingSetAll, self.ReportROCPath)
+ self.WriteScoredFeatureSet(self.TrainingSet2, self.TrainingSet3)
+ if self.WriteModelFilePath2:
+ self.Model2.SaveModel(self.WriteModelFilePath2)
+ self.Model3.SaveModel(self.WriteModelFilePath3)
+ return
+ #######################################################################
+ # We're doing feature selection. We'll need to write out feature files,
+ # then call TrainMachineLearner
+ print "Feature names:", self.FeatureNames
+ print "AllFeatures:", self.FeaturesAll
+ self.WriteFeaturesToFile(self.TrainingSet2, "PTMFeatures.2.txt")
+ self.WriteFeaturesToFile(self.TrainingSet3, "PTMFeatures.3.txt")
+ # *** Additive and subtractive aren't done here, the user can do it!
+ def WriteFeaturesToFile(self, TrainingSet, FileName):
+ print "Write features to %s..."%FileName
+ File = open(FileName, "wb")
+ File.write("#Index\tValidFlag\t")
+ for Key in self.FeaturesAll:
+ File.write("%s\t"%self.FeatureNames[Key])
+ File.write("\n")
+ TrainingSet.SaveTabDelimited(File)
+ File.close()
+ def ProduceFeatureScatterPlots(self, FeatureSet):
+ """
+ Iterate over all pairs of (distinct) features. For each pair, produce a scatter-plot
+ with N true points and N false points.
+ """
+ OutputFile = open("FeatureScatterPlots.txt", "wb")
+ VectorCount = 200
+ TrueVectors = FeatureSet.TrueVectors[:]
+ random.shuffle(TrueVectors)
+ TrueVectors = TrueVectors[:VectorCount]
+ FalseVectors = FeatureSet.FalseVectors[:]
+ random.shuffle(FalseVectors)
+ FalseVectors = FalseVectors[:VectorCount]
+ # Write a HEADER:
+ HeaderStr = ""
+ for FeatureIndex in range(len(self.FeaturesAll)):
+ Feature = self.FeaturesAll[FeatureIndex]
+ HeaderStr += "T %s\tF %s\t"%(self.FeatureNames[Feature], self.FeatureNames[Feature])
+ OutputFile.write(HeaderStr + "\n")
+ # Write one row for each pair of vectors:
+ for RowIndex in range(len(TrueVectors)):
+ Str = ""
+ TrueVector = TrueVectors[RowIndex]
+ FalseVector = FalseVectors[RowIndex]
+ for Feature in self.FeaturesAll:
+ Str += "%s\t%s\t"%(TrueVector.Features[Feature], FalseVector.Features[Feature])
+ OutputFile.write(Str + "\n")
+ return
+ def WriteScoredFeatureSet(self, FeatureSet2, FeatureSet3):
+ # Write out the features with their model-scores:
+ if not self.OutputFeaturePath:
+ return
+ File = open(self.OutputFeaturePath, "wb")
+ for FileLine in self.HeaderLines:
+ File.write(FileLine)
+ SortedVectors = []
+ for Vector in FeatureSet2.AllVectors:
+ SortedVectors.append((int(Vector.FileBits[1]), Vector.FileBits[6], int(Vector.FileBits[7]), Vector))
+ for Vector in FeatureSet3.AllVectors:
+ SortedVectors.append((int(Vector.FileBits[1]), Vector.FileBits[6], int(Vector.FileBits[7]), Vector))
+ SortedVectors.sort()
+ for Tuple in SortedVectors:
+ Vector = Tuple[-1]
+ Charge = int(Tuple[2])
+ if Charge > 2:
+ Model = self.Model3
+ else:
+ Model = self.Model2
+ Bits = Vector.FileBits
+ while len(Bits) <= FormatBits.ModelPValue:
+ Bits.append("")
+ Bits[FormatBits.ModelScore] = str(Vector.Score)
+ Bits[FormatBits.ModelPValue] = str(Model.GetPValue(Vector.Score))
+ Str = string.join(Bits, "\t")
+ File.write(Str + "\n")
+ File.close()
+ return
+ # Iterate over all vectors, write them all out:
+ for Vector in FeatureSet2.AllVectors:
+ Bits = Vector.FileBits
+ while len(Bits) <= FormatBits.ModelPValue:
+ Bits.append("")
+ Bits[FormatBits.ModelScore] = str(Vector.Score)
+ Bits[FormatBits.ModelPValue] = str(self.Model2.GetPValue(Vector.Score))
+ Str = string.join(Bits, "\t")
+ File.write(Str + "\n")
+ # Iterate over all vectors, write them all out:
+ for Vector in FeatureSet3.AllVectors:
+ Bits = Vector.FileBits
+ while len(Bits) <= FormatBits.ModelPValue:
+ Bits.append("")
+ Bits[FormatBits.ModelScore] = str(Vector.Score)
+ Bits[FormatBits.ModelPValue] = str(self.Model3.GetPValue(Vector.Score))
+ Str = string.join(Bits, "\t")
+ File.write(Str + "\n")
+ File.close()
+ def ParseFeatureFile(self, FilePath, FeatureSet2, FeatureSet3, DBRatio):
+ """
+ Initialize the FeatureSet objects by parsing features from the specified FilePath.
+ Charge-1 and charge-2 vectors go to FeatureSet2; charge-3 (and higher) vectors go to FeatureSet3.
+ """
+ File = open(FilePath, "rb")
+ # Parse the header line specially:
+ HeaderLine = File.readline()
+ self.HeaderLines.append(HeaderLine)
+ Bits = HeaderLine.strip().split("\t")
+ for BitIndex in range(len(Bits)):
+ if BitIndex >= FormatBits.FirstFeature:
+ self.FeatureNames[BitIndex - FormatBits.FirstFeature] = Bits[BitIndex]
+ #if BitIndex <= FormatBits.LastFeature:
+ # print "Feature %s: %s"%(BitIndex - FormatBits.FirstFeature, Bits[BitIndex])
+ # Iterate over non-header lines:
+ LineNumber = 0
+ for FileLine in File.xreadlines():
+ LineNumber += 1
+ if FileLine[0] == "#":
+ self.HeaderLines.append(FileLine)
+ continue # skip comment line
+ if not FileLine.strip():
+ continue # skip blank line
+ Bits = FileLine.replace("\r","").replace("\n","").split("\t")
+ # If there are TOO MANY bits, then discard the extras:
+ Bits = Bits[:FormatBits.LastFeature + 1]
+ try:
+ TrueFlag = int(Bits[FormatBits.TrueProteinFlag])
+ except:
+ continue # skip; not a valid instance line
+ Charge = int(Bits[FormatBits.Charge])
+ SisterAnnotation = Bits[FormatBits.SisterAnnotationFlag]
+ Vector = Learning.FeatureVector()
+ if Charge > 2:
+ FeatureSet = FeatureSet3
+ else:
+ FeatureSet = FeatureSet2
+ try:
+ for FeatureBitIndex in range(FormatBits.FirstFeature, FormatBits.LastFeature + 1):
+ FeatureIndex = FeatureBitIndex - FormatBits.FirstFeature
+ #if FeatureIndex not in self.FeaturesAll:
+ # continue
+ if FeatureBitIndex < len(Bits) and Bits[FeatureBitIndex].strip() and Bits[FeatureBitIndex] != "None":
+ Vector.Features.append(float(Bits[FeatureBitIndex]))
+ else:
+ Vector.Features.append(0)
+ Vector.FileBits = Bits
+ Vector.TrueFlag = TrueFlag
+ if TrueFlag:
+ FeatureSet.TrueVectors.append(Vector)
+ else:
+ FeatureSet.FalseVectors.append(Vector)
+ FeatureSet.AllVectors.append(Vector)
+ except:
+ traceback.print_exc()
+ print "** Error on line %s column %s of feature file"%(LineNumber, FeatureIndex)
+ print Bits
+ File.close()
+ # Initialize counts:
+ for FeatureSet in (FeatureSet2, FeatureSet3):
+ FeatureSet.SetCounts()
+ FeatureSet.GetPriorProbabilityFalse(DBRatio)
+ print "CHARGE 1,2: Read in %s true and %s false vectors"%(FeatureSet2.TrueCount, FeatureSet2.FalseCount)
+ print "CHARGE 3+: Read in %s true and %s false vectors"%(FeatureSet3.TrueCount, FeatureSet3.FalseCount)
+ def ReportAccuracy(self, SortedList, ROCCurvePlotPath = None):
+ """
+ The list should have entries of the form (ModelScore, TrueFlag)
+ We'll sort them from high model scores to low, and report how many
+ TRUE positives we have for a given FALSE DISCOVERY RATE.
+ """
+ SortedList.sort()
+ SortedList.reverse()
+ AllTrueCount = 0
+ for Tuple in SortedList:
+ AllTrueCount += Tuple[-1]
+ AllFalseCount = len(SortedList) - AllTrueCount
+ print "SortedList has %s entries, %s true"%(len(SortedList), AllTrueCount)
+ # Iterate through the list from best to worst. Report the number of hits
+ # before false positive rate rises above 1%, and before it rises above 5%.
+ # ALSO: Compute the area under the ROC curve!
+ TrueCount = 0
+ FalseCount = 0
+ Cutoffs = (0.01, 0.03, 0.05, 0.07, 0.1)
+ HitFlags = [0] * len(Cutoffs)
+ Thresholds = [0] * len(Cutoffs)
+ BestCounts = [0] * len(Cutoffs)
+ BestCountsGenerous = [0] * len(Cutoffs)
+ PrevStuff = None
+ TopCount = 0
+ TopCountFalse = 0
+ if ROCCurvePlotPath:
+ ROCCurvePlotFile = open(ROCCurvePlotPath, "wb")
+ ROCTPForFP = {}
+ ROCTPForFPCount = {}
+ # Find the cutoff that gives a particular DISCOVERY RATE:
+ for Index in range(len(SortedList)):
+ Tuple = SortedList[Index]
+ if Tuple[-1]:
+ TrueCount += 1
+ else:
+ FalseCount += 1
+ if (TrueCount + FalseCount) <= 200:
+ TopCount = (TrueCount + FalseCount)
+ TopCountFalse = FalseCount
+ OverallTPRate = TrueCount / float(max(1, AllTrueCount))
+ OverallFPRate = FalseCount / float(max(1, AllFalseCount))
+ Bin = int(round(OverallFPRate * 100))
+ ROCTPForFP[Bin] = ROCTPForFP.get(Bin, 0) + OverallTPRate
+ ROCTPForFPCount[Bin] = ROCTPForFPCount.get(Bin, 0) + 1
+ if ROCCurvePlotPath:
+ ROCCurvePlotFile.write("%s\t%s\t%s\t%s\t%s\t\n"%(Index, TrueCount, FalseCount, OverallFPRate, OverallTPRate))
+ #print Index, Tuple[0], TrueCount, FalseCount, OverallTrueCount, OverallFalseCount, OverallTPRate, OverallFPRate
+ if Tuple[0] == PrevStuff:
+ if TopCount == (TrueCount + FalseCount - 1):
+ TopCount = (TrueCount + FalseCount)
+ TopCountFalse = FalseCount
+ continue
+ PrevStuff = Tuple[0]
+ FDRate = FalseCount / float(max(1, TrueCount))
+ FDRate = min(1.0, FDRate)
+ for CutIndex in range(len(Cutoffs)):
+ if FDRate > Cutoffs[CutIndex]:
+ HitFlags[CutIndex] = 1
+ if not HitFlags[CutIndex]:
+ BestCounts[CutIndex] = max(BestCounts[CutIndex], TrueCount)
+ Thresholds[CutIndex] = Tuple[0]
+ if FDRate <= Cutoffs[CutIndex]:
+ BestCountsGenerous[CutIndex] = max(BestCountsGenerous[CutIndex], TrueCount)
+ # Compute the area under the ROC curve.
+ for Bin in range(0, 100):
+ if ROCTPForFP.has_key(Bin):
+ ROCTPForFP[Bin] /= float(ROCTPForFPCount[Bin])
+ ROCArea = 0
+ for Bin in range(0, 100):
+ if ROCTPForFP.has_key(Bin):
+ ROCArea += 0.01 * ROCTPForFP[Bin]
+ #print "%s: %s"%(Bin, ROCTPForFP[Bin])
+ else:
+ # Interpolate between points:
+ PrevX = 0 # default
+ PrevY = 0 # default
+ for PrevBin in range(Bin - 1, -1, -1):
+ if ROCTPForFP.has_key(PrevBin):
+ PrevX = PrevBin
+ PrevY = ROCTPForFP[PrevBin]
+ break
+ NextX = 100
+ NextY = 1
+ for NextBin in range(Bin + 1, 101):
+ if ROCTPForFP.has_key(NextBin):
+ NextX = NextBin
+ NextY = ROCTPForFP[NextBin]
+ break
+ InterpolatedValue = PrevY + (Bin - PrevX) * float(NextY - PrevY) / (NextX - PrevX)
+ ROCArea += 0.01 * InterpolatedValue
+ for CutIndex in range(len(Cutoffs)):
+ Sensitivity = 100 * BestCounts[CutIndex] / float(max(1, AllTrueCount))
+ print " At %.1f%% FDRate (cutoff %.5f), got %s PTMs (sensitivity %.2f%%)"%(Cutoffs[CutIndex] * 100, Thresholds[CutIndex],
+ BestCounts[CutIndex], Sensitivity)
+ print " ->True sensitivity: %.4f%%"%(100 * BestCounts[CutIndex] / float(max(1, AllTrueCount - AllFalseCount)))
+ print "False positive rate amoung top %s sites: %s"%(TopCount, 100*TopCountFalse/float(max(1, TopCount)))
+ print "Overall, %s true and %s false features."%(TrueCount, FalseCount)
+ print "ROC curve area: %.5f"%ROCArea
+ # The 'score' we return is a tuple giving the best accuracy at several cutoffs:
+ return (BestCounts[2], BestCounts[0], BestCounts[4], BestCounts[3], BestCounts[2])
+ def PerformFeatureSummary(self):
+ for FeatureIndex in range(len(self.Features)):
+ TrueList = []
+ for Tuple in self.TrueTuples:
+ TrueList.append(Tuple[FeatureIndex])
+ TrueList.sort()
+ (TMean, TStdDev) = BasicStats.GetMeanStdDev(TrueList)
+ FalseList = []
+ for Tuple in self.FalseTuples:
+ FalseList.append(Tuple[FeatureIndex])
+ FalseList.sort()
+ (FMean, FStdDev) = BasicStats.GetMeanStdDev(FalseList)
+ print "Feature %s (%s):"%(FeatureIndex, self.FeatureNames[FeatureIndex])
+ print " True: Mean %.4f, stddev %.4f (range %.4f..%.4f)"%(TMean, TStdDev, TrueList[0], TrueList[-1])
+ print " False: Mean %.4f, stddev %.4f (range %.4f..%.4f)"%(FMean, FStdDev, FalseList[0], FalseList[-1])
+ def TrainOneFeature(self, TrainingSet):
+ """
+ Compute accuracy for a very simple-minded model:
+ Rank sites by the value of a SINGLE FEATURE (descending order)
+ """
+ for FeatureIndex in range(FormatBits.FeatureCount):
+ SortedList = []
+ for Vector in TrainingSet.TrueVectors:
+ SortedList.append((Vector.Features[FeatureIndex], random.random(), 1))
+ for Vector in TrainingSet.FalseVectors:
+ SortedList.append((Vector.Features[FeatureIndex], random.random(), 0))
+ # And report the accuracy of this lonely feature:
+ print
+ print "Feature %s (%s):"%(FeatureIndex, self.FeatureNames[FeatureIndex])
+ self.ReportAccuracy(SortedList)
+ def ParseCommandLine(self, Arguments):
+ (Options, Args) = getopt.getopt(Arguments, "m:u:v:r:w:f:e:R:D:")
+ OptionsSeen = {}
+ for (Option, Value) in Options:
+ OptionsSeen[Option] = 1
+ if Option == "-m":
+ self.ModelType = Value.lower()
+ elif Option == "-D":
+ self.TrainingSetDBRatio = float(Value)
+ elif Option == "-r":
+ if not os.path.exists(Value):
+ print "** Error: Model file '%s' not found for reading.\n"%Value
+ return 0
+ self.ReadModelFilePath2 = "%s.2"%Value
+ self.ReadModelFilePath3 = "%s.3"%Value
+ elif Option == "-w":
+ #self.WriteModelFilePath = Value
+ self.WriteModelFilePath2 = "%s.2"%Value
+ self.WriteModelFilePath3 = "%s.3"%Value
+ elif Option == "-u":
+ if not os.path.exists(Value):
+ print "** Error: Feature file '%s' not found for reading.\n"%Value
+ return 0
+ self.InputFeaturePath = Value
+ elif Option == "-v":
+ self.OutputFeaturePath = Value
+ elif Option == "-e":
+ self.ModelTestFilePath = Value
+ elif Option == "-f":
+ self.FeatureSelectionFlag = int(Value)
+ elif Option == "-R":
+ self.ReportROCPath = Value
+ else:
+ print "* Error: Unrecognized option %s"%Option
+ return 0
+ return 1 # success
+
+if __name__ == "__main__":
+ try:
+ import psyco
+ psyco.full()
+ except:
+ print "(psyco not installed; running unoptimized)"
+ Trainer = PTMFeatureTrainer()
+ Result = Trainer.ParseCommandLine(sys.argv[1:])
+ if not Result:
+ sys.exit(-1)
+ if Trainer.ModelType:
+ Trainer.TrainModel()
+ sys.exit()
+ print UsageInfo
+ sys.exit(-1)
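+# Example invocation (a sketch; the script and file names are hypothetical, only
+# the flags come from ParseCommandLine above):
+# python TrainPTMFeatures.py -m lda -u PTMFeatures.all.txt -v PTMFeatures.scored.txt -w PTMModel.txt
+# This trains charge-2 and charge-3 LDA models, saves them to PTMModel.txt.2 and
+# PTMModel.txt.3, and writes the scored feature table to PTMFeatures.scored.txt.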
+
+
diff --git a/Trie.c b/Trie.c
new file mode 100644
index 0000000..80407d8
--- /dev/null
+++ b/Trie.c
@@ -0,0 +1,2659 @@
+//Title: Trie.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include "Trie.h"
+#include "Utils.h"
+#include <memory.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h> // for fabs
+#include <ctype.h>
+#include "Spectrum.h"
+#include "Mods.h"
+#include "Score.h"
+#include "Tagger.h"
+#include "BN.h"
+#include "Scorpion.h"
+#include "Errors.h"
+#include "SVM.h"
+#include "LDA.h"
+
+// If two tags have the same peptide sequence, and are within this amount on prefix/suffix masses, then
+// consider them identical and only use the top scorer.
+// 1.5 daltons:
+#define IDENTICAL_TAG_EPSILON 1500
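+// (Masses throughout this file are stored as scaled integers, so the 1500 above
+// corresponds to 1.5 Da; compare the MASS_SCALE / DALTON factors used below.)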
+
+// Number of chars allowed in a post-translational modification name. ('phosphorylation' is a typical name)
+#define MAX_PTMOD_NAME 256
+
+#define TRIE_INDEX_RECORD_SIZE (LONG_LONG_SIZE + sizeof(int) + 80*sizeof(char))
+#define SPLICEDB_INDEX_RECORD_SIZE (LONG_LONG_SIZE + sizeof(int) + 80*sizeof(char))
+
+// Global variable storing configurable options:
+Options* GlobalOptions;
+InspectStats* GlobalStats;
+
+////////////////////////////////////////////////////////////////////////////////////////
+// Forward declarations:
+void FreeTrieTagHangerList(TrieTagHanger* Head, int FreeTags);
+void FlagMandatoryModUsage(TrieNode* Node);
+int ExtendTagMatchBlind(SearchInfo* Info, TrieNode* Node, char* Buffer, int BufferPos, int BufferEnd, int FilePos);
+int ProcessGeneHitsBlindTag();
+int InsertBlindTagMatch(BlindTagMatch* Match);
+void FreeBlindTagMatch(BlindTagMatch* This);
+void FreeAllBlindTagMatches(BlindTagMatch* This);
+int IsIdenticalBlindTagMatches(BlindTagMatch* NodeA, BlindTagMatch* NodeB);
+
+Peptide* FindMatchedPeptide(char* Bases);
+void AddPTMassesToTagTable(int TagTableSize, char* CurrentTag, float Mass, char FirstAllowedPep, int CharsLeft, int ModsLeft,
+ int Peptide, int MinMod);
+
+
+// Indexed by characters. (Use upper-case amino acid codes)
+int StandardPeptideMass[256];
+// PeptideMass may be different from StandardPeptideMass if a fixed modification
+// (e.g. +57 to all cysteine residues) has been applied.
+int PeptideMass[256];
+
+// A decoration is a collection of post-translational modifications. This includes the
+// 'empty decoration', with no modifications, and mass 0. Each decoration has an index;
+// they are ordered from smallest mass to largest.
+
+// Size of the decoration array:
+int DecorationMassCount;
+
+// Mass of each decoration:
+float* DecorationMasses;
+
+// Largest mass over all our decorations
+float DecorationMaxMass;
+
+// DecorationMassMods[DecorationIndex][n] is the index of the nth mod used in a particular decoration.
+// For decorations that use fewer than the maximum allowed number of mods, we store an index of -1.
+int** DecorationMassMods;
+
+// DecorationModCount[DecorationIndex] is the number of mods used in a decoration
+int* DecorationModCount;
+
+// PTModCount lists how many post-translational mods exist for each peptide. (Faster than iterating
+// over a full 2D table of flags). Indexed by peptide-char (entry #0 is alanine)
+int PTModCount[TRIE_CHILD_COUNT];
+
+// How many modifications are there, in all?
+int TotalPTMods;
+
+// SubDecorations tells how to get to a sub-decoration (a decoration containing fewer post-translational
+// modificaitons) from a parent decoration. SubDecorations[DecorIndex][Modification] is the index
+// of the decoration Decor with one such modification removed. SubDecorations entries are -1 if the specified
+// mod isn't part of the specified decoration.
+int** SubDecorations;
+
+// PTModMass lists the mass of each post-translational mod for each peptide. (Redundant storage,
+// for fast lookups)
+float PTModMass[TRIE_CHILD_COUNT][MAX_PT_MODTYPE];
+
+// PTModIndex lists the index of each post-translational mod for each peptide. (So, PTModIndex[0][0] is the
+// first modification available to alanine)
+int PTModIndex[TRIE_CHILD_COUNT][MAX_PT_MODTYPE];
+
+// Names of all known PTMods.
+char PTModName[MAX_PT_MODTYPE][MAX_PTMOD_NAME];
+float ModMasses[MAX_PT_MODTYPE];
+
+//BlindTagMatch Pointers for the list of matches to a single Gene
+BlindTagMatch* FirstBlindTag = NULL;
+BlindTagMatch* LastBlindTag = NULL;
+
+void InitStats()
+{
+ if (GlobalStats)
+ {
+ memset(GlobalStats, 0, sizeof(InspectStats));
+ }
+ else
+ {
+ GlobalStats = (InspectStats*)calloc(1, sizeof(InspectStats));
+ }
+}
+
+// Set global options to reasonable default values:
+void InitOptions()
+{
+ GlobalOptions = (Options*)calloc(1, sizeof(Options));
+ GlobalOptions->MaxPTMods = 0;
+ GlobalOptions->Epsilon = DEFAULT_EPSILON;
+ GlobalOptions->FlankingMassEpsilon = DEFAULT_FLANKING_MASS_EPSILON;
+ GlobalOptions->OutputFile = stdout;
+ sprintf(GlobalOptions->ErrorFileName, "inspect.err");
+ GlobalOptions->ErrorCount = 0;
+ GlobalOptions->WarningCount = 0;
+ GlobalOptions->ReportAllMatches = 1;
+ GlobalOptions->ParentMassEpsilon = DEFAULT_PARENT_MASS_EPSILON;
+ GlobalOptions->ParentMassPPM = DEFAULT_PARENT_MASS_PPM;
+ GlobalOptions->ReportMatchCount = 10; // Don't report more than 10 to the page!
+ GlobalOptions->StoreMatchCount = 100; //
+ GlobalOptions->MandatoryModIndex = -1; // By default, there is no mandatory modification
+ GlobalOptions->GenerateTagCount = 100;
+ GlobalOptions->GenerateTagLength = DEFAULT_TAG_LENGTH;
+ GlobalOptions->DynamicRangeMin = 105 * DALTON;
+ GlobalOptions->DynamicRangeMax = 2000 * DALTON;
+ GlobalOptions->TrieBlockSize = 250;
+ GlobalOptions->TagPTMMode = 2;
+ //strcpy(GlobalOptions->AminoFileName, FILENAME_AMINO_ACID_MASSES);
+ sprintf(GlobalOptions->InputFileName, "Input.txt");
+ GlobalOptions->MinPTMDelta = -200;
+ // Default maxptmdelta is 250, this allows us to find GlcNac (203) and biotin (226)
+ GlobalOptions->MaxPTMDelta = 250;
+ GlobalOptions->DeltaBinCount = (GlobalOptions->MaxPTMDelta - GlobalOptions->MinPTMDelta) * 10 + 1;
+ GlobalOptions->DeltasPerAA = max(512, GlobalOptions->DeltaBinCount * 2);
+ GlobalOptions->NewScoring = 0;
+ GlobalOptions->MinLogOddsForMutation = -100; //A sufficiently small number so that no candidates are omitted
+
+}
+
+//constructor for a new BlindTagMatch
+BlindTagMatch* NewBlindTagMatch()
+{
+ BlindTagMatch* This;
+ This = (BlindTagMatch*)calloc(1, sizeof(BlindTagMatch));
+ This->Next = NULL; //set a few pointers up for tidiness
+ This->Prev = NULL;
+ return This;
+}
+//destructor for BlindTagMatch. This frees ALL connected nodes
+//following the next pointer
+void FreeAllBlindTagMatches(BlindTagMatch* This)
+{
+ This->Tag = NULL; //free pointer to tag, but KEEP TAG
+ This->Prev = NULL;
+ if (This->Next){
+ FreeAllBlindTagMatches(This->Next);
+ }
+ This->Next = NULL;
+ SafeFree(This);
+}
+//This destructor assumes that the links have been
+//previously nullified, and the linked list fixed
+//and ready for this node to be wiped out
+void FreeBlindTagMatch(BlindTagMatch* This)
+{
+ This->Tag = NULL;//free pointer to tag, but KEEP TAG, it's part of the trie
+ This->Next = NULL;
+ This->Prev = NULL;
+ SafeFree(This);
+}
+
+
+// Constructor for a new TrieNode
+TrieNode* NewTrieNode()
+{
+ TrieNode* This;
+ int Index;
+ This = (TrieNode*)calloc(1, sizeof(TrieNode));
+
+ return This;
+}
+
+
+// Free a trie node. Also frees its tag-nodes (if any), and recursively frees its children.
+void FreeTrieNode(TrieNode* This)
+{
+ int Letter;
+ if (!This)
+ {
+ return;
+ }
+ // Free our tag nodes:
+ FreeTrieTagHangerList(This->FirstTag, 1);
+
+ // Free our children too! Free them only AFTER
+ // we iterate over them, since Node->Next must be
+ // valid at the end of each loop-cycle.
+ for (Letter = 0; Letter < TRIE_CHILD_COUNT; Letter++)
+ {
+ // Nodes I and K always point to the same child as L and Q, respectively.
+ // So...don't free them twice!
+ if (Letter == ('I'-'A') || Letter == ('K'-'A'))
+ {
+ continue;
+ }
+ if (This->Children[Letter])
+ {
+ FreeTrieNode(This->Children[Letter]);
+ }
+ }
+
+ // Ok, now free ourselves:
+ SafeFree(This);
+}
+
+// Constructor for a TrieTag
+TrieTag* NewTrieTag()
+{
+ TrieTag* This;
+ int ModIndex;
+ //
+ This = (TrieTag*)calloc(sizeof(TrieTag), 1);
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ This->AminoIndex[ModIndex] = -1;
+ }
+ This->DBTagMatches = 0;
+ This->PrefixExtends = 0;
+ This->SuffixExtends = 0;
+ return This;
+}
+
+// Destructor for a TrieTag
+void FreeTrieTag(TrieTag* This)
+{
+ SafeFree(This);
+}
+
+// Trie construction helper function.
+// We've got a node on the Trie which completely matches a tag. So,
+// add this tag to the list of TrieTagNodes on this TrieNode.
+TrieTagHanger* TrieNodeAddTagNode(TrieNode* Node, TrieTag* Tag, int* DuplicateFlag)
+{
+ TrieTagHanger* Hanger;
+ TrieTag* LocalTag;
+
+ // Look at our current list of tags for the node.
+ // DON'T add this tag if we already have the same pre-and-post masses.
+ // (We normally add tags in order from best to worst, so in the event that
+ // two tags are quite similar, we keep the one with the higher score)
+ //printf("Adding tag '%s' %.2f...\n", Tag->Tag, Tag->PrefixMass);
+ for (Hanger = Node->FirstTag; Hanger; Hanger = Hanger->Next)
+ {
+ if (Hanger->Tag->PSpectrum == Tag->PSpectrum)
+ {
+ if ((abs(Hanger->Tag->SuffixMass - Tag->SuffixMass) < IDENTICAL_TAG_EPSILON) &&
+ (abs(Hanger->Tag->PrefixMass - Tag->PrefixMass) < IDENTICAL_TAG_EPSILON))
+ {
+ // Prefer the new prefix/suffix, if this new tag scores higher:
+ if (Hanger->Tag->Score < Tag->Score)
+ {
+ Hanger->Tag->PrefixMass = Tag->PrefixMass;
+ Hanger->Tag->SuffixMass = Tag->SuffixMass;
+ }
+ *DuplicateFlag = 1;
+ return Hanger;
+ }
+ }
+ }
+
+ Hanger = NewTrieTagHanger();
+ LocalTag = NewTrieTag();
+ memcpy(LocalTag, Tag, sizeof(TrieTag));
+ Hanger->Tag = LocalTag;
+ //Hanger->Tag = Tag;
+ if (Node->LastTag)
+ {
+ Node->LastTag->Next = Hanger;
+ Hanger->Prev = Node->LastTag;
+ }
+ else
+ {
+ Node->FirstTag = Hanger;
+ }
+ Node->LastTag = Hanger;
+ *DuplicateFlag = 0;
+ return Hanger;
+}
+
+// We've got a tag - add it to the trie.
+TrieNode* AddTagToTrie(TrieNode* Root, TrieTag* Tag, int* DuplicateFlag)
+{
+ TrieNode* Node;
+ TrieNode* NextNode;
+ int Index;
+ int TagLength;
+ char TagChar;
+
+ int Index2;
+
+ //printf("Adding tag: %s\n",Tag->Tag);
+ //
+ // First, travel down the trie, matching the specified tag as far as possible (perhaps completely):
+ Index = 0;
+ TagLength = Tag->TagLength;
+ Node = Root;
+
+ //printf("Gene: %s\n",Gene->Name);
+ // printf("**Root: %p\n",Root);
+ //for(Index2 = 0; Index2 < TRIE_CHILD_COUNT; ++Index2)
+ // {
+ // printf(" Child[%c] = %p\n",Index2 + 'A',Root->Children[Index2]);
+ // }
+ //getchar();
+
+ //fflush(stdout);
+
+ while (1)
+ {
+ TagChar = Tag->Tag[Index];
+ if (TagChar == 'I')
+ {
+ TagChar = 'L';
+ }
+ else if (TagChar == 'K')
+ {
+ TagChar = 'Q';
+ }
+ // Look up our child for this letter:
+ NextNode = Node->Children[TagChar - 'A'];
+ if (!NextNode)
+ {
+
+ // Ok, we matched as far as possible - next we'll add children to match the remainder of the tag.
+ break;
+ }
+ Node = NextNode;
+ Index++;
+ // Did we match the tag completely?
+ if (Index == TagLength)
+ {
+ // Aha - this tag is in the trie! Add the tag to the list:
+ //printf("Tag is already in trie!!\n");
+ //fflush(stdout);
+ //getchar();
+ TrieNodeAddTagNode(Node, Tag, DuplicateFlag);
+ return Node;
+ }
+ }
+ // Ok, we didn't match the entire tag...so, start adding child nodes now!
+ while (Index < TagLength)
+ {
+ NextNode = NewTrieNode();
+ NextNode->Letter = Tag->Tag[Index];
+ NextNode->Depth = Index + 1;
+ Node->Children[Tag->Tag[Index] - 'A'] = NextNode;
+ //printf("Adding trans %c to node %p\n",Tag->Tag[Index],Node);
+ //printf("Child[%c] = %p = %p\n",Tag->Tag[Index],Node->Children[Tag->Tag[Index]-'A'],Root->Children[Tag->Tag[Index] - 'A']);
+
+ // Point children I and L to the same place, and K and Q to the same place.
+ switch (Tag->Tag[Index])
+ {
+ // Special case for aminos with same mass (I and L equal, K and Q are off by <.1):
+ // Child pointers for I and L, and for K and Q both point to the same place.
+ case 'I':
+ Node->Children['L'-'A'] = NextNode;
+ break;
+ case 'L':
+ Node->Children['I'-'A'] = NextNode;
+ break;
+ case 'K':
+ Node->Children['Q'-'A'] = NextNode;
+ break;
+ case 'Q':
+ Node->Children['K'-'A'] = NextNode;
+ break;
+ default:
+ break;
+ }
+ Node = NextNode;
+ Index++;
+ }
+
+ //printf("**Root: %p\n",Root);
+ //for(Index2 = 0; Index2 < TRIE_CHILD_COUNT; ++Index2)
+ // {
+ // printf(" Child[%c] = %p\n",Index2 + 'A',Root->Children[Index2]);
+ // }
+ //getchar();
+
+ // fflush(stdout);
+ TrieNodeAddTagNode(Node, Tag, DuplicateFlag);
+
+ return Node;
+}
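+// Typical call pattern (a sketch, not from the upstream sources; tag field
+// initialization is elided):
+//     int Duplicate;
+//     TrieTag* Tag = NewTrieTag();  // then fill in Tag->Tag, Tag->TagLength,
+//                                   // Tag->PrefixMass, Tag->SuffixMass, ...
+//     TrieNode* Leaf = AddTagToTrie(Root, Tag, &Duplicate);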
+
+// Constructor for a TagHanger
+TrieTagHanger* NewTrieTagHanger()
+{
+ TrieTagHanger* This;
+ //
+ This = (TrieTagHanger*)calloc(1, sizeof(TrieTagHanger));
+ return This;
+}
+
+// Destructor for a TagHanger
+void FreeTrieTagHanger(TrieTagHanger* This)
+{
+ SafeFree(This);
+}
+
+// Destructor for a TagHanger list
+void FreeTrieTagHangerList(TrieTagHanger* Head, int FreeTags)
+{
+ TrieTagHanger* Prev = NULL;
+ //
+ for (; Head; Head = Head->Next)
+ {
+ if (Prev)
+ {
+ if (FreeTags)
+ {
+ FreeTrieTag(Prev->Tag);
+ }
+ FreeTrieTagHanger(Prev);
+ }
+ Prev = Head;
+ }
+ if (Prev)
+ {
+ if (FreeTags)
+ {
+ FreeTrieTag(Prev->Tag);
+ }
+ FreeTrieTagHanger(Prev);
+ }
+}
+
+// Prints a Trie node, using indentation to denote depth.
+// The entry point is DebugPrintTrie, which calls this function.
+int DebugPrintTrieHelper(TrieNode* Root, char* TagSoFar)
+{
+ char Buffer[1024];
+ char TagBuffer[1024];
+ int Index;
+ int BufferPos;
+ int TagLength;
+ TrieTagHanger* Node;
+ int Letter;
+ int TagCount = 0;
+ //
+ TagLength = strlen(TagSoFar);
+ BufferPos = 0;
+ for (Index = 0; Index < TagLength; Index++)
+ {
+ Buffer[BufferPos++]=' ';
+ Buffer[BufferPos++]=' ';
+ Buffer[BufferPos++]=' ';
+ }
+ Buffer[BufferPos] = '\0';
+ //
+ strcpy(TagBuffer, TagSoFar);
+ if (Root->Letter)
+ {
+ BufferPos = strlen(TagBuffer);
+ TagBuffer[BufferPos++] = Root->Letter;
+ TagBuffer[BufferPos++] = '\0';
+ }
+ printf("%s%s\n", Buffer, TagBuffer);
+
+ // Print attached tags:
+ for (Node = Root->FirstTag; Node; Node = Node->Next)
+ {
+ printf("%s%s: prefix %d,suffix %d mods %d\n", Buffer, Node->Tag->Tag, Node->Tag->PrefixMass,Node->Tag->SuffixMass, Node->Tag->ModsUsed);
+ TagCount++;
+ }
+ if (Root->FailureNode)
+ {
+ printf("%s Failure: Skip %d, depth %d\n", Buffer, Root->FailureLetterSkip, Root->FailureNode->Depth);
+ }
+ else
+ {
+ printf("%s (no failure node set)\n", Buffer);
+ }
+
+ // Print children:
+ for (Letter = 0; Letter < TRIE_CHILD_COUNT; Letter++)
+ {
+ if (Root->Children[Letter])
+ {
+
+ TagCount += DebugPrintTrieHelper(Root->Children[Letter], TagBuffer);
+ }
+ }
+ return TagCount;
+}
+
+// Print out a trie and all its nodes to stdout.
+void DebugPrintTrie(TrieNode* Root)
+{
+ int TagCount;
+
+ printf("-->Trie:\n");
+ TagCount = DebugPrintTrieHelper(Root, "");
+ printf("Total tags: %d\n", TagCount);
+ printf("(end of trie nodes)\n");
+}
+
+// Set up all the failure nodes for our trie
+// The failure node for a trie node is the node you jump to when you're currently
+// matching that trie node, but then break a match because none of your children
+// match the *next* character. If another trie node matches a substring of this
+// node (not a prefix, but any other substring), we must try matching that node as
+// well.
+// Example: Suppose we have nodes ABCDE and BCD, and we're scanning text ABCDF.
+// when we reach F, we jump to ABCDE's failure node BCD, and move the anchor to B.
+void InitializeTrieFailureNodes(TrieNode* Root, TrieNode* Node, char* Tag)
+{
+ TrieNode* FailureNode;
+ int Letter;
+ int TagLength;
+ int StartIndex = 0;
+ int EndIndex = 0;
+ //
+ TagLength = strlen(Tag);
+ if (!Root)
+ {
+ return;
+ }
+ if (Node == Root)
+ {
+ // Failure on the root means the letter can't start a tag; no speedup, just step forward:
+ Root->FailureNode = Root;
+ Root->FailureLetterSkip = 1;
+ }
+ else
+ {
+ // There's a real tag. Navigate to the SHORTEST node-with-tags which matches a suffix of our tag.
+ // Try knocking off one letter, then two, and so on:
+ for (StartIndex=1; StartIndex<TagLength; StartIndex++)
+ {
+ FailureNode = Root;
+ for (EndIndex = StartIndex; EndIndex<TagLength; EndIndex++)
+ {
+ Letter = Tag[EndIndex];
+ if (Letter == 'I')
+ {
+ Letter = 'L';
+ }
+ if (Letter == 'Q')
+ {
+ Letter = 'K';
+ }
+ if (!FailureNode->Children[Letter - 'A'])
+ {
+ // We can't go deeper in the trie...and we saw NO TAGS! So, we needn't go here.
+ // Suppose you have tags GGGROOVY and GROOVY. After matching GGGROOVY, you can
+ // jump to the 3rd G (you needn't handle the 2nd, even though it matches
+ // partway down the trie). Of course, if you had GGGROOVY, GROOVY and GGROO, you would only
+ // jump to the 2nd G - because we'd have found a tag in here..
+ FailureNode = Root;
+ break;
+ }
+ FailureNode = FailureNode->Children[Letter - 'A']; // move down the tree.
+ // If there are tags, STOP NOW. (Don't jump from PANTS->ANTS if ANT has a tag)
+ if (FailureNode->FirstTag)
+ {
+ break;
+ }
+ }
+ // If we're not pointing at root, then we're pointing at a good failure node:
+ if (FailureNode != Root)
+ {
+ Node->FailureNode = FailureNode;
+ Node->FailureLetterSkip = StartIndex;
+ break;
+ }
+ }
+ if (!Node->FailureNode)
+ {
+ // Hmm...no good failure nodes found? That means we can jump forward over our full tag!
+ Node->FailureNode = Root;
+ Node->FailureLetterSkip = strlen(Tag);
+ }
+ }
+ // Now, handle all our children:
+ for (Letter = 0; Letter < TRIE_CHILD_COUNT; Letter++)
+ {
+ if (Node->Children[Letter])
+ {
+ Tag[TagLength] = 'A'+Letter;
+ Tag[TagLength+1] = '\0';
+ InitializeTrieFailureNodes(Root, Node->Children[Letter], Tag);
+ }
+ }
+}
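+// Usage sketch (illustrative, not part of the upstream source): once all tags
+// have been added via AddTagToTrie, failure links can be built from the root by
+// passing a writable scratch buffer, since the function appends letters to it
+// while recursing (the buffer size here is an arbitrary illustration):
+//     char TagScratch[256];
+//     TagScratch[0] = '\0';
+//     InitializeTrieFailureNodes(Root, Root, TagScratch);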
+
+// Constructor for a Peptide
+Peptide* NewPeptideNode()
+{
+ Peptide* This;
+ This = (Peptide*)calloc(1, sizeof(Peptide));
+ if (!This)
+ {
+ printf("** Fatal error: Unable to allocate a new peptide!\n");
+ return NULL;
+ }
+ memset(This->AminoIndex, -1, sizeof(int) * MAX_PT_MODS);
+ return This;
+}
+
+// Destructor for a Peptide
+void FreePeptideNode(Peptide* Pep)
+{
+ PeptideMatch* Node;
+ PeptideMatch* Prev = NULL;
+ PeptideSpliceNode* PSNode;
+ PeptideSpliceNode* PSPrev;
+ if (!Pep)
+ {
+ return;
+ }
+ // Free the list of PeptideSpliceNodes, starting with SpliceHead:
+ PSPrev = NULL;
+ for (PSNode = Pep->SpliceHead; PSNode; PSNode = PSNode->Next)
+ {
+ SafeFree(PSPrev);
+ PSPrev = PSNode;
+ }
+ SafeFree(PSPrev);
+ // Free the list of PeptideMatch instances, starting with First:
+ for (Node = Pep->First; Node; Node = Node->Next)
+ {
+ SafeFree(Prev);
+ Prev = Node;
+ }
+ SafeFree(Prev);
+ SafeFree(Pep->SplicedBases);
+ SafeFree(Pep->PetDelta);
+ SafeFree(Pep);
+}
+
+MassDelta* GetPeptideModFromAnnotation(Peptide* Match, char* ModBuffer, int ModCount, int AminoIndex)
+{
+ int MaxModsFromParsedPeptide = 10;
+ MassDelta* Delta;
+ //
+ if (!Match->PetDelta)
+ {
+ Match->PetDelta = (MassDelta*)calloc(MaxModsFromParsedPeptide, sizeof(MassDelta));
+ }
+ if (ModCount >= MaxModsFromParsedPeptide)
+ {
+ return NULL;
+ }
+ Delta = Match->PetDelta + ModCount;
+ Delta->Flags = DELTA_FLAG_VALID;
+ if(!CompareStrings(ModBuffer,"phos"))
+ { //special case needed: atoi("phos") would not yield a meaningful mass
+ Delta->RealDelta = 80 * DALTON;
+ Delta->Flags |= DELTA_FLAG_PHOSPHORYLATION;
+ Match->SpecialFragmentation = FRAGMENTATION_PHOSPHO; // special flags we need
+ Match->SpecialModPosition = Match->AminoIndex[ModCount];
+ }
+ else
+ {
+ Delta->RealDelta = atoi(ModBuffer) * DALTON;
+ }
+ Delta->Delta = Delta->RealDelta / 100; // tenth-of-a-dalton
+ Match->AminoIndex[ModCount] = AminoIndex - 1;
+ Match->ModType[ModCount] = Delta;
+ return Delta;
+}
+
+// Produce a peptide from an annotation string. The annotation string
+// consists of amino acids, plus - possibly - some modification masses.
+// Valid examples:
+// GPLLVQDVVFTDEMAHFDR
+// VLVLDTDY+16KK
+// SVTDC-2TSNFCLFQSNSK
+Peptide* GetPeptideFromAnnotation(char* Annotation)
+{
+ char ModBuffer[32];
+ int AminoIndex = 0;
+ int ModCount = 0;
+ int ModBufferPos;
+ Peptide* Match;
+ MassDelta* Delta;
+ int PRM = 0;
+ char* BaseAnnotation;
+ int MaxModsFromParsedPeptide = 10;
+ //
+ if (!Annotation)
+ {
+ return NULL;
+ }
+ ModBufferPos = 0;
+ Match = NewPeptideNode();
+ BaseAnnotation = Annotation;
+ if (BaseAnnotation[1] == '.')
+ {
+ Match->PrefixAmino = BaseAnnotation[0];
+ Annotation += 2;
+ }
+ while (*Annotation)
+ {
+ if ((*Annotation >= 'A' && *Annotation <= 'Z') || *Annotation == '.')
+ {
+ // It's an amino acid, or period.
+ // Finish any pending mod:
+ if (ModBufferPos)
+ {
+ ModBuffer[ModBufferPos] = '\0';
+ Delta = GetPeptideModFromAnnotation(Match, ModBuffer, ModCount, AminoIndex);
+ if (!Delta)
+ {
+ printf("*** Warning: Invalid modifications in '%s', not parsing\n", Annotation);
+ FreePeptideNode(Match);
+ return NULL;
+ }
+ PRM += Delta->RealDelta;
+ ModBufferPos = 0;
+ ModCount += 1;
+ // Bail out if we have too many PTMs to cope with:
+ if (ModCount == MAX_PT_MODS)
+ {
+ return NULL;
+ }
+ }
+ // It's a dot - set the prefix and break:
+ if (*Annotation == '.')
+ {
+ Match->SuffixAmino = *(Annotation + 1);
+ break;
+ }
+ // It's an amino acid - add the AA mass:
+ Match->Bases[AminoIndex++] = *Annotation;
+ PRM += PeptideMass[*Annotation];
+ }
+ else
+ {
+ ModBuffer[ModBufferPos++] = *Annotation;
+ }
+ Annotation++;
+ }
+ Match->Bases[AminoIndex] = '\0';
+ // Finish any pending mod:
+ if (ModBufferPos)
+ {
+ ModBuffer[ModBufferPos] = '\0';
+ Delta = GetPeptideModFromAnnotation(Match, ModBuffer, ModCount, AminoIndex);
+ if (!Delta)
+ {
+ printf("*** Warning: Invalid modifications in '%s', not parsing\n", Annotation);
+ FreePeptideNode(Match);
+ return NULL;
+ }
+ PRM += Delta->RealDelta;
+ ModBufferPos = 0;
+ ModCount += 1;
+ // Bail out if we have too many PTMs to cope with:
+ if (ModCount == MAX_PT_MODS)
+ {
+ return NULL;
+ }
+ }
+ Match->ParentMass = PRM + PARENT_MASS_BOOST;
+ return Match;
+}
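+// Usage sketch (illustrative only; assumes LoadPeptideMasses has already
+// populated PeptideMass[], and uses one of the documented annotation forms):
+//     Peptide* Pep = GetPeptideFromAnnotation("VLVLDTDY+16KK");
+//     if (Pep)
+//     {
+//         printf("Parent mass: %d\n", GetPeptideParentMass(Pep));
+//         FreePeptideNode(Pep);
+//     }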
+
+
+int GetPeptideParentMass(Peptide* Match)
+{
+ int Mass = PARENT_MASS_BOOST;
+ char* Amino;
+ int ModIndex;
+ for (Amino = Match->Bases; *Amino; Amino++)
+ {
+ Mass += PeptideMass[*Amino];
+ }
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Match->AminoIndex[ModIndex] >= 0)
+ {
+ Mass += Match->ModType[ModIndex]->RealDelta;
+ }
+ }
+ Match->ParentMass = Mass;
+ return Mass;
+}
+
+int LoadPeptideMassesCallback(int LineNumber, int FilePos, char* LineBuffer, void* UserData)
+{
+ float Mass;
+ char* Str;
+ char Letter;
+
+
+ // Name:
+ Str = strtok(LineBuffer, " ");
+ // 3-letter abbrev:
+
+ Str = strtok(NULL, " ");
+ if (!Str)
+ {
+ REPORT_ERROR(0);
+ return 0;
+ }
+
+ // 1-letter abbrev:
+ Str = strtok(NULL, " ");
+ if (!Str)
+ {
+ REPORT_ERROR(0);
+ return 0;
+ }
+
+ Letter = Str[0];
+ // mass:
+ Str = strtok(NULL, " ");
+ if (!Str)
+ {
+ REPORT_ERROR(0);
+ return 0;
+ }
+
+ Mass = (float)atof(Str);
+ ROUND_MASS(Mass, StandardPeptideMass[Letter]);
+
+ return 1;
+}
+
+// Read peptide masses from a file.
+int LoadPeptideMasses(char* FileName)
+{
+ FILE* AAFile;
+ //
+ if (!FileName)
+ {
+ // Use a sensible default:
+ FileName = FILENAME_AMINO_ACID_MASSES;
+ }
+ AAFile = fopen(FileName, "r");
+ if (!AAFile)
+ {
+ REPORT_ERROR_S(8, FileName);
+ return 0;
+ }
+ ParseFileByLines(AAFile, LoadPeptideMassesCallback, NULL, 0);
+
+ // This absurdly high mass for the delimiter, *, ensures that it won't be part of a match:
+ StandardPeptideMass[42] = 9999999;
+ memcpy(PeptideMass, StandardPeptideMass, sizeof(int) * 256);
+ return 1;
+}
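+// The amino acid mass file parsed above is expected to hold one residue per line
+// as space-separated fields: full name, 3-letter code, 1-letter code, and residue
+// mass. An illustrative line (the exact file content is an assumption; the value
+// shown is the standard monoisotopic residue mass of alanine):
+//     Alanine Ala A 71.03711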
+
+
+// We read in large chunks of the file at once. When we get past SCAN_BUFFER_A, it's time to shunt what we've got
+// to the front of the buffer. And if our buffer ends before SCAN_BUFFER_B, we try to read more data (until
+// we reach eof)
+#define SCAN_BUFFER_SIZE 5242880
+#define SCAN_BUFFER_A 5232680
+#define SCAN_BUFFER_B 5242680
+#define RECORD_END '*'
+
+// We have matched a tag in the peptide database, and the flanking series (plus some PTMods) matches our
+// flanking mass. Check to be sure these PTMods can be attached to the flanking series. (For example:
+// if the flanking sequence AAG plus a phosphate mass matches our prefix mass, that's NOT a match, because
+// neither glycine nor alanine is phosphorylatable)
+// Simplification: Assume that multiple PTMods can be attached to one base. (This assumption isn't always valid, but
+// it's nontrivial to know when it is; the user can toss out any unreasonable constructs later)
+// From Start to End, INCLUSIVE.
+int CheckForPTAttachmentPoints(int DecorationMassIndex, char* Buffer, int Start, int End, int BufferDir)
+{
+ int ModIndex;
+ int ModsLeft[MAX_PT_MODTYPE];
+ int BufferPos;
+ int Done;
+ int PeptideIndex;
+ int Legal;
+
+ memcpy(ModsLeft, AllDecorations[DecorationMassIndex].Mods, sizeof(int)*MAX_PT_MODTYPE);
+ for (BufferPos = Start; BufferPos <= End; BufferPos++)
+ {
+ Done = 1; //by default
+ PeptideIndex = Buffer[BufferPos] - 'A';
+ for (ModIndex = 0; ModIndex < AllPTModCount; ModIndex++)
+ {
+ if (ModsLeft[ModIndex])
+ {
+ Legal = 1;
+ // Avoid attaching a C-terminal PTM, if we're not at the C terminus:
+ if (AllKnownPTMods[ModIndex].Flags & DELTA_FLAG_C_TERMINAL)
+ {
+ if (BufferDir < 0 || BufferPos != End)
+ {
+ Legal = 0;
+ }
+ }
+ // Avoid attaching an N-terminal PTM, if we're not at the N terminus:
+ if (AllKnownPTMods[ModIndex].Flags & DELTA_FLAG_N_TERMINAL)
+ {
+ if (BufferDir > 0 || BufferPos != Start)
+ {
+ Legal = 0;
+ }
+ }
+ if (Legal)
+ {
+ ModsLeft[ModIndex] = max(0, ModsLeft[ModIndex] - AllKnownPTMods[ModIndex].Allowed[PeptideIndex]);
+ }
+ if (ModsLeft[ModIndex])
+ {
+ Done = 0;
+ }
+ }
+ }
+ if (Done)
+ {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+// MAX_SIDE_MODS is how many flanking matches we're allowed for an initial tag match.
+// (For instance: The preceding aminos may match with no PTMs, or we may be able to match
+// with one fewer amino and a PTM)
+#define MAX_SIDE_MODS 10
+int LeftMatchPos[MAX_SIDE_MODS];
+int LeftMatchDecoration[MAX_SIDE_MODS];
+int RightMatchPos[MAX_SIDE_MODS];
+int RightMatchDecoration[MAX_SIDE_MODS];
+
+// MatchFlankingMass is called when we matched a trie tag, and we are checking whether the
+// flanking amino acids match our prefix or suffix mass.
+// WARNING: If there are two or more decorations with the same mass, this method will FAIL, because we'll only
+// consider ONE such decoration.
+int MatchFlankingMass(MSSpectrum* Spectrum, TrieTag* Tag, char* Buffer, int StartPos, int BufferDir, int BufferEnd, int MatchMass, int ModsRemaining)
+{
+ int MatchCount = 0;
+ int Pos;
+ int Mass;
+ int Diff;
+ int AbsDiff;
+ int FlankingMass;
+ int MandatoryDecorationChange = 0;
+ int DecorationMassIndex;
+ int Verbose = 0;
+ int* MatchPos;
+ int* MatchDecoration;
+ int MinMatchMass = MatchMass - GlobalOptions->FlankingMassEpsilon;
+ //
+ if (BufferDir<0)
+ {
+ MatchPos = LeftMatchPos;
+ MatchDecoration = LeftMatchDecoration;
+ }
+ else
+ {
+ MatchPos = RightMatchPos;
+ MatchDecoration = RightMatchDecoration;
+ }
+
+ /////////////////////////////////////////////////////////
+ // If prefix mass is zero, that qualifies as a match always.
+ if (MatchMass < GlobalOptions->FlankingMassEpsilon)
+ {
+ MatchPos[MatchCount] = StartPos - BufferDir;
+ MatchDecoration[MatchCount] = PlainOldDecorationIndex;
+ return 1;
+ }
+ DecorationMassIndex = AllDecorationCount - 1;
+ // Skip over any decorations that use up too many pt-mods:
+ while (1)
+ {
+ if (AllDecorations[DecorationMassIndex].TotalMods > ModsRemaining)
+ {
+ DecorationMassIndex--;
+ continue;
+ }
+ break;
+ }
+ FlankingMass = 0;
+ for (Pos = StartPos; Pos >= 0; Pos += BufferDir)
+ {
+ if (Pos >= BufferEnd)
+ {
+ break;
+ }
+ if (Buffer[Pos] == '>' || Buffer[Pos] == '*')
+ {
+ break;
+ }
+ Mass = PeptideMass[Buffer[Pos]];
+ if (Mass == 0)
+ {
+ // Invalid peptide!
+ break;
+ }
+ FlankingMass += Mass;
+ Diff = MatchMass - (FlankingMass + AllDecorations[DecorationMassIndex].Mass);
+ AbsDiff = abs(Diff);
+ if (AbsDiff < GlobalOptions->FlankingMassEpsilon)
+ {
+ // Aha! This is *probably* a match. Check to be sure we have the bases we need:
+ if (CheckForPTAttachmentPoints(DecorationMassIndex, Buffer, min(Pos, StartPos), max(Pos, StartPos), BufferDir))
+ {
+ if (Verbose)
+ {
+ printf("Side is match! Dec-index %d, flank %.2f.\n", DecorationMassIndex, FlankingMass / (float)MASS_SCALE);
+ }
+ MatchPos[MatchCount] = Pos;
+ MatchDecoration[MatchCount] = DecorationMassIndex;
+ MatchCount++;
+ if (MatchCount == MAX_SIDE_MODS)
+ {
+ return MatchCount;
+ }
+
+ }
+ }
+ // Move the DecorationMassIndex, if needed.
+ while (MandatoryDecorationChange || FlankingMass + AllDecorations[DecorationMassIndex].Mass > MinMatchMass)
+ {
+ // The flanking sequence's mass is significantly bigger than our (decorated) target mass.
+ // Move to a smaller decoration:
+ MandatoryDecorationChange = 0;
+ DecorationMassIndex--;
+ if (DecorationMassIndex<0)
+ {
+ break;
+ }
+ // Skip any decorations that include phosphorylation, if we're not on phospho mode:
+ if (!GlobalOptions->PhosphorylationFlag && g_PhosphorylationMod>-1 && AllDecorations[DecorationMassIndex].Mods[g_PhosphorylationMod])
+ {
+ MandatoryDecorationChange = 1;
+ continue;
+ }
+ if (AllDecorations[DecorationMassIndex].TotalMods > ModsRemaining)
+ {
+ continue;
+ }
+ // And, check for a match:
+ Diff = MatchMass - (FlankingMass + AllDecorations[DecorationMassIndex].Mass);
+ AbsDiff = abs(Diff);
+ if (AbsDiff < GlobalOptions->FlankingMassEpsilon)
+ {
+ // Aha! This is *probably* a match. Check to be sure we have the bases we need:
+ if (CheckForPTAttachmentPoints(DecorationMassIndex, Buffer, min(Pos, StartPos), max(Pos, StartPos), BufferDir))
+ {
+ if (Verbose)
+ {
+ printf("Left is match! Dec-index %d, flank %.2f.\n", DecorationMassIndex, FlankingMass / (float)MASS_SCALE);
+ }
+ MatchPos[MatchCount] = Pos;
+ MatchDecoration[MatchCount] = DecorationMassIndex;
+ MatchCount++;
+ if (MatchCount == MAX_SIDE_MODS)
+ {
+ return MatchCount;
+ }
+ MandatoryDecorationChange = 1;
+ }
+ }
+ }
+ if (DecorationMassIndex<0)
+ {
+ break;
+ }
+ }
+ return MatchCount;
+}
+// We extend LEFT and RIGHT from the match region (running from BufferPos to BufferEnd, INCLUSIVE),
+// attempting to match our tag's prefix mass and Suffix mass. Extension works like this:
+// - DecoratedMassIndex starts out pointing at our largest decoration, FlankingMass starts at 0
+// - At each iteration step:
+// -- move one base further along, and add its mass to FlankingMass
+// -- If FlankingMass plus the mass of our decoration matches our tag, we have a match.
+// -- If FlankingMass plus the mass of our decoration is too LARGE, decrement DecoratedMassIndex
+// until we have a match, run out of decorations, or the mass again becomes too SMALL.
+// -- At some point, we'll run out of decorations (FlankingMass becomes larger than the tag mass), and stop.
+//FilePos and BufferPos point to the last character in the matched tag.
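+// Illustrative example (a sketch; the numbers are standard residue masses, not
+// taken from this source): for a tag with prefix mass ~238.04 Da and flanking
+// sequence ...A S <tag>, the undecorated flanking mass A+S = 71.04 + 87.03 =
+// 158.07 Da does not match, but the decoration "one phosphorylation" (+79.97 Da)
+// brings it to 238.04 Da, within the flanking-mass epsilon; CheckForPTAttachmentPoints
+// then accepts the match because serine is phosphorylatable.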
+void GetMatches(SearchInfo* Info, TrieNode* Node, char* Buffer, int BufferPos, int BufferEnd, int FilePos)
+{
+ TrieTagHanger* TagNode;
+ int ModsRemaining;
+
+ int LeftMatchCount;
+ int RightMatchCount;
+ int LeftMatchIndex;
+ int RightMatchIndex;
+ int ModIndex;
+ int UsedTooMany;
+ static int PTMLimit[MAX_PT_MODTYPE];
+ // To avoid repeated scoring:
+ int ExtensionIndex = 0;
+ int ExtensionCount = 0;
+ static int StartingPoints[512];
+ static int EndingPoints[512];
+ static int ExtensionLeftDecorations[512];
+ static int ExtensionRightDecorations[512];
+ static MSSpectrum* ExtensionSpectra[512];
+ int startOfPeptideFilePos;
+ int ExtensionFound;
+
+ int validTag = 1;
+ MSSpectrum* Spectrum;
+ //
+ if (!Node->FirstTag)
+ {
+ return;
+ }
+ //GlobalStats->TagMatches++;
+
+ //printf("Extend matches of '%s' at position %d\n", Node->FirstTag->Tag->Tag, FilePos);
+ //Log("Extend matches of '%s' at position %d\n", Node->FirstTag->Tag->Tag, FilePos);
+ // Try each tag corresponding to this TrieNode.
+ for (TagNode = Node->FirstTag; TagNode; TagNode = TagNode->Next)
+ {
+ Spectrum = TagNode->Tag->PSpectrum;
+ Info->Spectrum = Spectrum;
+ validTag = 1;
+ memcpy(PTMLimit, g_PTMLimit, sizeof(int) * AllPTModCount);
+ //If there are mods in the tag, then these must count towards the PTMLimit
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (TagNode->Tag->AminoIndex[ModIndex] < 0)
+ {
+ break;
+ }
+
+ //Also check that the PTM is valid!!!
+ if(AllKnownPTMods[TagNode->Tag->ModType[ModIndex]->Index].Allowed[Buffer[BufferPos - 2 + TagNode->Tag->AminoIndex[ModIndex]]] == 0)
+ {
+ validTag = 0;
+ }
+ PTMLimit[TagNode->Tag->ModType[ModIndex]->Index] -= 1;
+ }
+ if(validTag == 0)
+ continue;
+ ModsRemaining = GlobalOptions->MaxPTMods - TagNode->Tag->ModsUsed;
+ if (ModsRemaining < 0)
+ {
+ continue;
+ }
+ //See how many prefix matches there are. Populates LeftMatchDecorations array
+ LeftMatchCount = MatchFlankingMass(Spectrum, TagNode->Tag, Buffer, BufferPos - TagNode->Tag->TagLength, -1, BufferEnd, TagNode->Tag->PrefixMass, ModsRemaining);
+ if (LeftMatchCount == 0)
+ {
+ continue;
+ }
+ //See how many suffix matches there are. Populates RightMatchDecorations array
+ RightMatchCount = MatchFlankingMass(Spectrum, TagNode->Tag, Buffer, BufferPos + 1, 1, BufferEnd, TagNode->Tag->SuffixMass, ModsRemaining);
+ if (RightMatchCount == 0)
+ {
+ continue;
+ }
+ // Consider each combination of left-extension and right-extension:
+ for (LeftMatchIndex = 0; LeftMatchIndex<LeftMatchCount; LeftMatchIndex++)
+ {
+ for (RightMatchIndex = 0; RightMatchIndex<RightMatchCount; RightMatchIndex++)
+ {
+ UsedTooMany = 0;
+ //Check that there aren't too many of any type of modification with the selected extensions
+ for (ModIndex = 0; ModIndex < AllPTModCount; ModIndex++)
+ {
+ if (AllDecorations[LeftMatchDecoration[LeftMatchIndex]].Mods[ModIndex] +
+ AllDecorations[RightMatchDecoration[RightMatchIndex]].Mods[ModIndex] > PTMLimit[ModIndex])
+ {
+ UsedTooMany = 1;
+ break;
+ }
+ }
+ if (UsedTooMany)
+ {
+ continue;
+ }
+ //Check that the total number of mods is within the limits
+ if (AllDecorations[LeftMatchDecoration[LeftMatchIndex]].TotalMods +
+ AllDecorations[RightMatchDecoration[RightMatchIndex]].TotalMods > ModsRemaining)
+ {
+ continue;
+ }
+ if (GlobalOptions->MandatoryModIndex > -1 &&
+ !TagNode->Tag->MandatoryModUsed &&
+ AllDecorations[LeftMatchDecoration[LeftMatchIndex]].Mods[GlobalOptions->MandatoryModIndex] == 0 &&
+ AllDecorations[RightMatchDecoration[RightMatchIndex]].Mods[GlobalOptions->MandatoryModIndex] == 0)
+ {
+ continue; // We don't have our mandatory PTM (biotin, or whatever)
+ }
+ ExtensionFound = 0;
+ for (ExtensionIndex = 0; ExtensionIndex < ExtensionCount; ExtensionIndex++)
+ {
+ if (StartingPoints[ExtensionIndex] == LeftMatchPos[LeftMatchIndex] && EndingPoints[ExtensionIndex] == RightMatchPos[RightMatchIndex]
+ && ExtensionLeftDecorations[ExtensionIndex] == LeftMatchDecoration[LeftMatchIndex]
+ && ExtensionRightDecorations[ExtensionIndex] == RightMatchDecoration[RightMatchIndex]
+ && ExtensionSpectra[ExtensionIndex] == TagNode->Tag->PSpectrum)
+ {
+ ExtensionFound = 1;
+ break;
+ }
+ }
+ if (ExtensionFound)
+ {
+ continue;
+ }
+ StartingPoints[ExtensionCount] = LeftMatchPos[LeftMatchIndex];
+ EndingPoints[ExtensionCount] = RightMatchPos[RightMatchIndex];
+ ExtensionLeftDecorations[ExtensionCount] = LeftMatchDecoration[LeftMatchIndex];
+ ExtensionRightDecorations[ExtensionCount] = RightMatchDecoration[RightMatchIndex];
+ ExtensionSpectra[ExtensionCount] = TagNode->Tag->PSpectrum;
+ Info->Spectrum = TagNode->Tag->PSpectrum;
+
+ //printf("FilePos: %d\n",FilePos);
+ startOfPeptideFilePos = FilePos - TagNode->Tag->TagLength - ((BufferPos - Node->Depth + 1) - LeftMatchPos[LeftMatchIndex]) + 1;
+ AddNewMatch(Info,startOfPeptideFilePos,TagNode->Tag,
+ Buffer + LeftMatchPos[LeftMatchIndex],
+ RightMatchPos[RightMatchIndex] - LeftMatchPos[LeftMatchIndex] + 1,
+ (BufferPos - Node->Depth + 1) - LeftMatchPos[LeftMatchIndex],
+ LeftMatchDecoration[LeftMatchIndex], RightMatchDecoration[RightMatchIndex],
+ 0, 0);
+ ExtensionCount = min(511, ExtensionCount + 1); // advance (capped) so the duplicate check above can see this extension
+ }
+ }
+ }
+ return;
+}
+
+//Extending Tags for a blind search requires a separate function.
+//We keep tags where only one side (suffix or prefix) is extendable. The extension is
+//simple, because no PTMs are allowed. If both sides are extendable,
+//then it is a no-mod match and is sent to the regular scorer.
+int ExtendTagMatchBlind(SearchInfo* Info, TrieNode* Node, char* Buffer, int BufferPos, int BufferEnd, int FilePos)
+{
+ TrieTagHanger* Hanger;
+ MSSpectrum* Spectrum;
+ int LeftMatchCount;
+ int RightMatchCount;
+ int ModsRemaining = 0; //always zero for this simple extension
+ int Extensions = 0;
+
+ for (Hanger = Node->FirstTag; Hanger; Hanger = Hanger->Next)
+ {
+ Spectrum = Hanger->Tag->PSpectrum;
+ Info->Spectrum = Spectrum;
+ //by virtue of getting here, we know that this TAG (tripeptide) has matched the database
+ Hanger->Tag->DBTagMatches++;
+ LeftMatchCount = MatchFlankingMass(Spectrum, Hanger->Tag, Buffer, BufferPos - Hanger->Tag->TagLength, -1, BufferEnd, Hanger->Tag->PrefixMass, ModsRemaining);
+
+ RightMatchCount = MatchFlankingMass(Spectrum, Hanger->Tag, Buffer, BufferPos + 1, 1, BufferEnd, Hanger->Tag->SuffixMass, ModsRemaining);
+ if (LeftMatchCount + RightMatchCount == 1)
+ {
+ //set up the BlindTagMatchObject, representing this match.
+ //Match = NewBlindTagMatch();
+ //Match->Tag = Hanger->Tag;
+ //Match->TagDBLoc = BufferPos - Hanger->Tag->TagLength; //pos of the first char
+ if (LeftMatchCount)
+ {
+ Hanger->Tag->PrefixExtends ++;
+ // Match->ExtendLR = -1;
+ // Match->ExtendDBLoc = LeftMatchPos[0]; //only one match position possible, bc no mods
+ // Match->ExtendLength = Match->TagDBLoc - Match->ExtendDBLoc;
+ }
+ else
+ {
+ Hanger->Tag->SuffixExtends ++;
+ // Match->ExtendLR = 1; //right extension
+ // Match->ExtendDBLoc = RightMatchPos[0];
+ // Match->ExtendLength = Match->ExtendDBLoc - Match->TagDBLoc;
+ }
+ //InsertBlindTagMatch(Match);
+ Hanger->Tag->PrefixExtends += LeftMatchCount;
+ Hanger->Tag->SuffixExtends += RightMatchCount;
+ Extensions++;
+ }
+ else if (LeftMatchCount + RightMatchCount == 2)
+ {
+ //send to regular scorer, it's a two sided hit
+ }
+ //printf ("Extend matches of '%s' at position %d\n", Node->FirstTag->Tag->Tag, FilePos);
+ //printf ("Returned RightMatch %d, returned LeftMatch %d\n",RightMatchCount,LeftMatchCount);
+ }
+ return Extensions;
+
+}
+
+void GetProteinID(int RecordNumber, DatabaseFile* DB, char* Name)
+{
+ int Dummy[16];
+ int RecordSize;
+ if (!DB || !DB->IndexFile)
+ {
+ Name[0] = '?';
+ Name[1] = '\0';
+ return;
+ }
+ if (DB->Type == evDBTypeSpliceDB)
+ {
+ RecordSize = SPLICEDB_INDEX_RECORD_SIZE;
+ }
+ else
+ {
+ RecordSize = TRIE_INDEX_RECORD_SIZE;
+ }
+
+
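+ // Note (inferred from the reads below and the "<qi80s" struct format in TrieUtils.py, not
+ // authoritative): each index record appears to hold an 8-byte field (skipped here), a 4-byte
+ // offset into the .trie file, and an 80-character protein name.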
+ fseek(DB->IndexFile, RecordSize * RecordNumber, SEEK_SET);
+ ReadBinary(&Dummy, LONG_LONG_SIZE, 1, DB->IndexFile);
+ ReadBinary(&Dummy, sizeof(int), 1, DB->IndexFile);
+ ReadBinary(Name, sizeof(char), 80, DB->IndexFile);
+ Name[80] = '\0';
+ //Log("Record %d has ID %s\n", Pep->RecordNumber, Pep->Name);
+}
+
+
+void SortModifications(int* AminoIndex, MassDelta** ModType)
+{
+ int AminoIndexSorted[MAX_PT_MODS];
+ MassDelta* ModTypeSorted[MAX_PT_MODS];
+ int MinAminoIndex = 0;
+ int NextSortedPosition = 0;
+ int Index;
+ int MinAminoPos = 0;
+ //
+ memset(AminoIndexSorted, -1, sizeof(int)*MAX_PT_MODS);
+ memset(ModTypeSorted, 0, sizeof(MassDelta*)*MAX_PT_MODS);
+ while (1)
+ {
+ // Find the smallest amino acid index in AminoIndex, skipping
+ // over entries of -1 (which are empty)
+ MinAminoIndex = -1;
+ for (Index = 0; Index < MAX_PT_MODS; Index++)
+ {
+ if (AminoIndex[Index]>-1 && (MinAminoIndex<0 || AminoIndex[Index]<MinAminoIndex))
+ {
+ MinAminoIndex = AminoIndex[Index];
+ MinAminoPos = Index;
+ }
+ }
+ if (MinAminoIndex==-1)
+ {
+ // Everything's been moved to the sorted list. Jane, stop this crazy thing!
+ break;
+ }
+ // MOVE these entries into the sorted list:
+ AminoIndexSorted[NextSortedPosition] = AminoIndex[MinAminoPos];
+ AminoIndex[MinAminoPos] = -1;
+ ModTypeSorted[NextSortedPosition] = ModType[MinAminoPos];
+ ModType[MinAminoPos] = NULL;
+ NextSortedPosition++;
+ }
+ // Move the sorted shadows back into the real arrays:
+ memcpy(AminoIndex, AminoIndexSorted, sizeof(int)*MAX_PT_MODS);
+ memcpy(ModType, ModTypeSorted, sizeof(MassDelta*)*MAX_PT_MODS);
+ // Hooray!
+}
+
+#define SCORE_PTM_ATTACH_IMPOSSIBLE (float)-999999999.0
+
+// Diagram of the dynamic programming table for optimal mod positioning:
+// Suppose we have three decorations (zero, one or two attachments of the same PTM),
+// and the PTMs should be attached at B and C in prefix ABCDE. Then the grid
+// looks like this:
+// A B C D E
+// 0 x--x
+// |
+// 1 x--x
+// |
+// 2 x--x--x
+//
+// (Columns for amino acids, rows for decorations, vertical moves mean an attachment)
+//
+// Find the optimal way to place modifications (from FullDecoration) on a polypeptide
+// (Peptide) with length PeptideLength; store the mod-placements in AminoIndex / ModType
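+//
+// A sketch of the recurrence this table implements (the notation here is ours, not the code's):
+//   Score[pos][dec] = PRMScore(prefix mass through pos, plus mass(dec))
+//                     + max( Score[pos-1][dec],                          // attach nothing at pos
+//                            max over sub-decorations s of dec whose     // attach the missing mods at pos,
+//                                missing mods are allowed on the residue // if the residue permits them
+//                                at pos, of Score[pos-1][s] )
+//   At pos = 0 the only permitted sub-decoration is the empty one.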
+void FindOptimalPTModPositions(MSSpectrum* Spectrum, char* Peptide,
+ int PeptideLength, int FullDecoration, int BaseMass, int* AminoIndex,
+ MassDelta** ModType, int VerboseFlag, SpectrumTweak* Tweak)
+{
+ float* ScoreMatrix = NULL;
+ int* SubDecorationMatrix = NULL;
+ int PeptidePos;
+ int DecorationIndex;
+ float BestScore;
+ char Amino;
+ int ModIndex;
+ int Mass;
+ float Score;
+ int ModCount;
+ int CanBridge;
+ int ModsNeeded;
+ float BYScore;
+ int ModAdder;
+ int AminoAcidIndex;
+ int BestSubDecoration;
+ int SubDecoration;
+ PRMBayesianModel* Model;
+ ///
+ VerboseFlag = 0;
+ memset(AminoIndex, -1, sizeof(int) * MAX_PT_MODS);
+ memset(ModType, 0, sizeof(MassDelta*) * MAX_PT_MODS);
+ if (FullDecoration == PlainOldDecorationIndex)
+ {
+ return; // No mods to place!
+ }
+ if (Spectrum->Charge > 2)
+ {
+ Model = PRMModelCharge3;
+ }
+ else
+ {
+ Model = PRMModelCharge2;
+ }
+ // D.P. tables. ScoreMatrix holds the score at each cell; SubDecorationMatrix tells
+ // the previous subdecoration (and hence, how to backtrack through the table)
+ ScoreMatrix = (float*)calloc(PeptideLength * AllDecorationCount, sizeof(float));
+ SubDecorationMatrix = (int*)calloc(PeptideLength * AllDecorationCount, sizeof(int));
+
+ // Fill the dynamic programming table. Outer loop over amino acids,
+ // inner loop over decorations.
+ Mass = BaseMass;
+ for (PeptidePos = 0; PeptidePos < PeptideLength; PeptidePos++)
+ {
+ Amino = Peptide[PeptidePos];
+ AminoAcidIndex = Amino - 'A';
+ Mass += PeptideMass[Amino];
+ BestScore = 0;
+ for (DecorationIndex = 0; DecorationIndex < AllDecorationCount; DecorationIndex++)
+ {
+ if (!IsSubDecoration[DecorationIndex][FullDecoration])
+ {
+ continue;
+ }
+ BestScore = 0;
+ BestSubDecoration = DecorationIndex;
+ BYScore = GetIonPRMFeatures(Spectrum, Tweak, Model, Mass + AllDecorations[DecorationIndex].Mass, 0);
+ //BYScore = (int)(100 * GetPRMFeatures(Spectrum, Tweak, Model, Mass + AllDecorations[DecorationIndex].Mass, 0));
+ if (PeptidePos)
+ {
+ // Consider attaching nothing at this residue:
+ BestScore += ScoreMatrix[(PeptidePos - 1) * AllDecorationCount + DecorationIndex];
+ }
+ else
+ {
+ if (DecorationIndex != PlainOldDecorationIndex)
+ {
+ BestScore += SCORE_PTM_ATTACH_IMPOSSIBLE;
+ }
+ }
+ BestScore += BYScore;
+ BestSubDecoration = DecorationIndex;
+ //Log printf(" No mod here: Score %.2f\n", BestScore);
+
+ // Consider attaching a modification at this residue:
+ for (SubDecoration = 0; SubDecoration < AllDecorationCount; SubDecoration++)
+ {
+ if (SubDecoration == DecorationIndex)
+ {
+ continue;
+ }
+ if (!IsSubDecoration[SubDecoration][DecorationIndex])
+ {
+ continue;
+ }
+ CanBridge = 1;
+ for (ModIndex = 0; ModIndex < AllPTModCount; ModIndex++)
+ {
+ // This decoration must contain all the mods from the subdecoration:
+ ModsNeeded = AllDecorations[DecorationIndex].Mods[ModIndex] - AllDecorations[SubDecoration].Mods[ModIndex];
+ if (ModsNeeded < 0)
+ {
+ CanBridge = 0;
+ break;
+ }
+ // This amino acid must be able to support the modification(s):
+ //printf("ModsNeeded: %d\n",ModsNeeded);
+ //printf("AllKnownPTMods[%d].Allowed[%c]=%d\n",ModIndex,(char)(AminoAcidIndex+'A'),AllKnownPTMods[ModIndex].Allowed[AminoAcidIndex]);
+ if (ModsNeeded > AllKnownPTMods[ModIndex].Allowed[AminoAcidIndex])
+ {
+ CanBridge = 0;
+ break;
+ }
+ // If the decoration is terminal, then this attachment position must be terminal:
+ if (ModsNeeded)
+ {
+ if ((AllKnownPTMods[ModIndex].Flags & DELTA_FLAG_C_TERMINAL) && PeptidePos < (PeptideLength - 1))
+ {
+ CanBridge = 0;
+ break;
+ }
+ if ((AllKnownPTMods[ModIndex].Flags & DELTA_FLAG_N_TERMINAL) && PeptidePos)
+ {
+ CanBridge = 0;
+ break;
+ }
+ }
+ }
+ if (CanBridge)
+ {
+ if (PeptidePos)
+ {
+ Score = ScoreMatrix[(PeptidePos - 1) * AllDecorationCount + SubDecoration];
+ }
+ else
+ {
+ if (SubDecoration != PlainOldDecorationIndex)
+ {
+ Score = SCORE_PTM_ATTACH_IMPOSSIBLE; // Impossible!
+ }
+ else
+ {
+ Score = 0;
+ }
+ }
+ //Log printf(" To Sub-decoration %d: Score %d\n", SubDecoration, Score);
+ Score += BYScore;
+ if (Score >= BestScore)
+ {
+ BestScore = Score;
+ BestSubDecoration = SubDecoration;
+ }
+ }
+ }
+ if (VerboseFlag)
+ {
+ //Log printf(" PeptidePos %d decoration %d: \n Mass %d BYscore %.2f, best score %.2f, sub decoration %d\n", PeptidePos, DecorationIndex,
+ //(Mass + AllDecorations[DecorationIndex].Mass), BYScore, BestScore, BestSubDecoration);
+ }
+ ScoreMatrix[PeptidePos * AllDecorationCount + DecorationIndex] = BestScore;
+ SubDecorationMatrix[PeptidePos * AllDecorationCount + DecorationIndex] = BestSubDecoration;
+ }
+ }
+ // Fill in AminoIndex, ModType. Start at the bottom right of the DP table (last amino acid,
+ // and full decoration), work back to the top row (first amino acid, no more decorations)
+ ModCount = 0;
+ DecorationIndex = FullDecoration;
+ PeptidePos = PeptideLength - 1;
+ while (PeptidePos >= 0)
+ {
+ SubDecoration = SubDecorationMatrix[PeptidePos * AllDecorationCount + DecorationIndex];
+ if (SubDecoration != DecorationIndex)
+ {
+ for (ModIndex = 0; ModIndex < MAX_PT_MODTYPE; ModIndex++)
+ {
+ ModsNeeded = AllDecorations[DecorationIndex].Mods[ModIndex] - AllDecorations[SubDecoration].Mods[ModIndex];
+ for (ModAdder = 0; ModAdder<ModsNeeded; ModAdder++)
+ {
+ AminoIndex[ModCount] = PeptidePos;
+ AminoAcidIndex = Peptide[PeptidePos] - 'A';
+ //printf("Peptide: %s\n",Peptide);
+ //printf("Amino acid Index=%c\n",Peptide[PeptidePos]);
+
+ ModType[ModCount] = MassDeltaByIndex[AminoAcidIndex * MAX_PT_MODTYPE + ModIndex];
+ //printf("Mod Delta: %d\n",ModType[ModCount]->Delta);
+ ModCount++;
+ }
+ }
+ }
+ PeptidePos--;
+ DecorationIndex = SubDecoration;
+ }
+ // Free temp storage:
+ SafeFree(ScoreMatrix);
+ SafeFree(SubDecorationMatrix);
+}
+
+// Return TRUE if two matches are the same.
+// If we're performing an exon-graph search, then we only consider matches
+// to be the same if they have the same sequence AND genomic coordinates.
+int IsMatchDuplicate(Peptide* Match, Peptide* OldMatch, int PeptideLength)
+{
+ int CompareGenomicLocation = 1;
+
+ if (Match->DB && Match->DB->Type == evDBTypeTrie && OldMatch->DB && OldMatch->DB->Type == evDBTypeTrie)
+ {
+ CompareGenomicLocation = 0;
+ }
+ if (!CompareGenomicLocation)
+ {
+ if (!strncmp(Match->Bases, OldMatch->Bases, PeptideLength) &&
+ !memcmp(Match->AminoIndex, OldMatch->AminoIndex, sizeof(int)*MAX_PT_MODS) &&
+ !memcmp(Match->ModType, OldMatch->ModType, sizeof(MassDelta*)*MAX_PT_MODS))
+ {
+ return 1;
+ }
+ }
+ else
+ {
+ // For exon graph search, we consider a match to be different if it has a different
+ // genomic location. We may see the same peptide inside two different exons, and
+ // we may have different options for splicing.
+ if (!strncmp(Match->Bases, OldMatch->Bases, PeptideLength) &&
+ !memcmp(Match->AminoIndex, OldMatch->AminoIndex, sizeof(int)*MAX_PT_MODS) &&
+ !memcmp(Match->ModType, OldMatch->ModType, sizeof(MassDelta*)*MAX_PT_MODS) &&
+ Match->GenomicLocationStart == OldMatch->GenomicLocationStart &&
+ Match->GenomicLocationEnd == OldMatch->GenomicLocationEnd)
+ {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+// Store a match in this Spectrum's Node's match list. Don't store duplicate matches.
+// Don't store more than GlobalOptions->StoreMatchCount matches. Keep matches sorted
+// by InitialScore (or, if MQScoreFlag is set, by MatchQualityScore)
+Peptide* StoreSpectralMatch(MSSpectrum* Spectrum, Peptide* Match, int PeptideLength, int MQScoreFlag)
+{
+ Peptide* OldMatch;
+ Peptide* CrummyScoreOldMatch;
+ int VerboseFlag = 0;
+ int SameFlag = 0;
+ SpectrumNode* Node = Spectrum->Node;
+ int NTT;
+
+
+ //
+ if (GlobalOptions->RequireTermini)
+ {
+ NTT = CountTrypticTermini(Match);
+ if (NTT < GlobalOptions->RequireTermini)
+ {
+ FreePeptideNode(Match);
+ return NULL;
+ }
+ }
+ //printf("NEC_ERROR: Store match %d '%s'\n", Match->InitialScore, Match->Bases);
+ if (!Node->FirstMatch)
+ {
+ Node->FirstMatch = Match;
+ Node->LastMatch = Match;
+ Node->MatchCount++;
+ }
+ else
+ {
+ OldMatch = Node->FirstMatch;
+ while (1)
+ {
+ SameFlag = IsMatchDuplicate(Match, OldMatch, PeptideLength);
+ // Check to see whether it's the SAME as an existing match:
+ if (SameFlag)
+ {
+ // Old match is the same as our new peptide. Free the new guy, and break:
+ //printf("NEC_ERROR: This is a duplicate, do not add to list\n");
+ OldMatch->MatchQualityScore = max(OldMatch->MatchQualityScore, Match->MatchQualityScore);
+ OldMatch->InitialScore = max(OldMatch->InitialScore, Match->InitialScore);
+ SafeFree(Match);
+ Match = OldMatch;
+ //OldMatch->SeenCount++;
+ break;
+ }
+ if ((MQScoreFlag && Match->MatchQualityScore > OldMatch->MatchQualityScore) || (!MQScoreFlag && Match->InitialScore > OldMatch->InitialScore))
+ {
+ //printf("NEC_ERROR: This is a good score, adding to list\n");
+ if (Node->FirstMatch == OldMatch)
+ {
+ Node->FirstMatch = Match;
+ }
+ Match->Next = OldMatch;
+ Match->Prev = OldMatch->Prev;
+ if (OldMatch->Prev)
+ {
+ OldMatch->Prev->Next = Match;
+ }
+ OldMatch->Prev = Match;
+ Node->MatchCount++;
+ // It's possible that we've already seen this peptide, but with a lower score. (Why a lower score?
+ // probably because we searched with the WRONG parent mass before, and the RIGHT parent mass now!) So, iterate over
+ // the rest of the old matches, and if any is the same as this match, free it.
+ for (CrummyScoreOldMatch = Match->Next; CrummyScoreOldMatch; CrummyScoreOldMatch = CrummyScoreOldMatch->Next)
+ {
+ SameFlag = IsMatchDuplicate(Match, CrummyScoreOldMatch, PeptideLength);
+ if (SameFlag)
+ {
+ //printf("NEC_ERROR: This is a duplicate, but its better than the previous one\n");
+ if (Node->LastMatch == CrummyScoreOldMatch)
+ {
+ Node->LastMatch = Node->LastMatch->Prev;
+ }
+ if (CrummyScoreOldMatch->Next)
+ {
+ CrummyScoreOldMatch->Next->Prev = CrummyScoreOldMatch->Prev;
+ }
+ if (CrummyScoreOldMatch->Prev)
+ {
+ CrummyScoreOldMatch->Prev->Next = CrummyScoreOldMatch->Next;
+ }
+ FreePeptideNode(CrummyScoreOldMatch);
+ break;
+ }
+ }
+ break;
+ }
+ OldMatch = OldMatch->Next;
+ if (!OldMatch)
+ {
+ //printf("NEC_ERROR: adding to list\n");
+ // Save our new match at the end of the list.
+ Node->LastMatch->Next = Match;
+ Match->Prev = Node->LastMatch;
+ Node->LastMatch = Match;
+ Node->MatchCount++;
+ break;
+ }
+ }
+ }
+ if (Node->MatchCount > GlobalOptions->StoreMatchCount)
+ {
+ if (Match == Node->LastMatch)
+ {
+ Match = NULL;
+ }
+ OldMatch = Node->LastMatch->Prev;
+ //printf("NEC_ERROR: Removing the last match '%s'\n",Node->LastMatch->Bases);
+ FreePeptideNode(Node->LastMatch);
+ Node->LastMatch = OldMatch;
+ if (OldMatch)
+ {
+ OldMatch->Next = NULL;
+ }
+ Node->MatchCount--;
+ }
+ return Match;
+}
+
+
+// Record a new match in the global match list. If it's a duplicate peptide, then
+// don't add it again.
+Peptide* AddNewMatch(SearchInfo* Info, int FilePos, TrieTag* Tag, char* MatchedBases,
+ int PeptideLength, int TagPosition, int PrefixDecoration, int SuffixDecoration,
+ int GenomicStart, int GenomicEnd)
+{
+ Peptide* Match;
+ char MatchedPeptideVerbose[256];
+ PeptideMatch* PepInfo;
+ int AminoIndex[MAX_PT_MODS];
+ MassDelta* ModType[MAX_PT_MODS];
+ int PrefixAminoIndex[MAX_PT_MODS];
+ MassDelta* PrefixModType[MAX_PT_MODS];
+ int SuffixAminoIndex[MAX_PT_MODS];
+ MassDelta* SuffixModType[MAX_PT_MODS];
+ int Mass;
+ int SuffixStart;
+ int AminoPos;
+ int ModIndex;
+ int TotalMods = 0;
+ float ScoreToBeat;
+ //int Score;
+ int VerboseFlag;
+ char* Amino;
+ //int PrecursorMass;
+ int ParentMassError;
+ MSSpectrum* Spectrum = Info->Spectrum;
+
+ int i;
+
+ //
+ memset(PrefixAminoIndex, -1, sizeof(int) * MAX_PT_MODS);
+ memset(SuffixAminoIndex, -1, sizeof(int) * MAX_PT_MODS);
+ memset(PrefixModType, 0, sizeof(MassDelta*) * MAX_PT_MODS);
+ memset(SuffixModType, 0, sizeof(MassDelta*) * MAX_PT_MODS);
+ memset(ModType, 0, sizeof(MassDelta*) * MAX_PT_MODS);
+ memset(AminoIndex, -1, sizeof(int) * MAX_PT_MODS);
+
+ SuffixStart = TagPosition + strlen(Tag->Tag);
+
+ //Log("Prefix mods %d, suffix mods %d\n", PrefixDecoration, SuffixDecoration);
+ TotalMods = Tag->ModsUsed + AllDecorations[PrefixDecoration].TotalMods + AllDecorations[SuffixDecoration].TotalMods;
+ //////////////////////////////////////////////////
+ // Optimally place the prefix and suffix PTMs:
+ VerboseFlag = 0;
+
+ ////////////////////////////////////////////////////////////////////////////////////////
+ // Temporarily adjust the charge and parent mass to reflect this candidate:
+ Spectrum->Charge = Tag->Charge;
+ Spectrum->ParentMass = PARENT_MASS_BOOST;
+ for (AminoPos = 0, Amino = MatchedBases; AminoPos<PeptideLength; AminoPos++,Amino++)
+ {
+ Spectrum->ParentMass += PeptideMass[*Amino];
+ }
+ Spectrum->ParentMass += AllDecorations[PrefixDecoration].Mass;
+ Spectrum->ParentMass += AllDecorations[SuffixDecoration].Mass;
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (!Tag->ModType[ModIndex])
+ {
+ break;
+ }
+ Spectrum->ParentMass += Tag->ModType[ModIndex]->RealDelta;
+ }
+ ////////////////////////////////////////////////////////////////////////
+ // Reject this parent mass, if it's too far from the theoretical mass.
+ //Use the corrected parent mass from the tweak, not the file mass.
+ //PrecursorMass = Spectrum->MZ * Spectrum->Charge - (HYDROGEN_MASS * (Spectrum->Charge - 1));
+ //ParentMassError = PrecursorMass - Spectrum->ParentMass;
+ ParentMassError = Tag->Tweak->ParentMass - Spectrum->ParentMass;
+ if (abs(ParentMassError) > GlobalOptions->ParentMassEpsilon)
+ {
+ // *** Reject this match, it doesn't match the parent mass!
+ return NULL;
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////////////
+ FindOptimalPTModPositions(Spectrum, MatchedBases, TagPosition, PrefixDecoration, 0,
+ PrefixAminoIndex, PrefixModType, VerboseFlag, Tag->Tweak);
+ // Get the starting mass for our suffix match:
+ Mass = 0;
+ for (AminoPos = 0; AminoPos < TagPosition + Tag->TagLength; AminoPos++)
+ {
+ Mass += PeptideMass[MatchedBases[AminoPos]];
+ }
+ Mass += AllDecorations[PrefixDecoration].Mass;
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (!Tag->ModType[ModIndex])
+ {
+ break;
+ }
+ Mass += Tag->ModType[ModIndex]->RealDelta;
+ }
+ FindOptimalPTModPositions(Spectrum,
+ MatchedBases + TagPosition + Tag->TagLength,
+ PeptideLength - TagPosition - Tag->TagLength,
+ SuffixDecoration,
+ Mass,
+ SuffixAminoIndex,
+ SuffixModType, 0, Tag->Tweak);
+ /////////////////////////////////////////////////////////
+ // Merge all the mods into one array, then sort it:
+ TotalMods = 0;
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (!PrefixModType[ModIndex])
+ {
+ break;
+ }
+ ModType[TotalMods] = PrefixModType[ModIndex];
+ AminoIndex[TotalMods] = PrefixAminoIndex[ModIndex];
+ TotalMods++;
+ }
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (!Tag->ModType[ModIndex])
+ {
+ break;
+ }
+ ModType[TotalMods] = Tag->ModType[ModIndex];
+ AminoIndex[TotalMods] = Tag->AminoIndex[ModIndex] + TagPosition;
+ TotalMods++;
+ }
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (!SuffixModType[ModIndex])
+ {
+ break;
+ }
+ ModType[TotalMods] = SuffixModType[ModIndex];
+ AminoIndex[TotalMods] = SuffixAminoIndex[ModIndex] + TagPosition + Tag->TagLength;
+ TotalMods++;
+ }
+ SortModifications(AminoIndex, ModType);
+
+
+
+ /////////////////////////////////////////////////////////
+ // Score the match. If the score's not good enough, then toss it:
+ ScoreToBeat = -999999;
+ if (Spectrum->Node->MatchCount >= GlobalOptions->StoreMatchCount)
+ {
+ ScoreToBeat = Spectrum->Node->LastMatch->MatchQualityScore;
+ }
+ Match = NewPeptideNode();
+ Match->ParentMassError = ParentMassError;
+ Match->Tweak = Tag->Tweak;
+ Match->DB = Info->DB;
+ memcpy(Match->AminoIndex, AminoIndex, sizeof(int)*MAX_PT_MODS);
+ memcpy(Match->ModType, ModType, sizeof(MassDelta*)*MAX_PT_MODS);
+ if (FilePos)
+ {
+ Match->PrefixAmino = *(MatchedBases - 1);
+ }
+ Match->SuffixAmino = *(MatchedBases + PeptideLength);
+ strncpy(Match->Bases, MatchedBases, PeptideLength);
+ Match->FilePos = FilePos;
+ Match->RecordNumber = Info->RecordNumber;
+ VerboseFlag = 0;
+
+ GetPeptideParentMass(Match);
+
+
+ if(GlobalOptions->RunMode & RUN_MODE_RAW_OUTPUT)
+ {
+
+
+ WriteMatchToString(Match,MatchedPeptideVerbose,1);
+ fprintf(GlobalOptions->OutputFile,"%s\t%d\t%s\n",Spectrum->Node->InputFile->FileName,Spectrum->Node->ScanNumber,MatchedPeptideVerbose);
+ //printf("%s\t%d\t%s\n",Spectrum->Node->InputFile->FileName,Spectrum->Node->ScanNumber,MatchedPeptideVerbose);
+ //fflush(stdout);
+ return NULL;
+ }
+ Spectrum->CandidatesScored++;
+ Tag->CandidatesScored++;
+ // Invoke the scoring function now:
+ ComputeMQScoreFeatures(Spectrum, Match, Match->ScoreFeatures, 0);
+
+#ifdef MQSCORE_USE_SVM
+
+ Match->MatchQualityScore = SVMComputeMQScore(Spectrum, Match, Match->ScoreFeatures);
+#else
+
+ Match->MatchQualityScore = LDAComputeMQScore(Spectrum, Match, Match->ScoreFeatures);
+#endif
+ Match->InitialScore = (int)(1000 * Match->MatchQualityScore);
+
+
+
+ Match->GenomicLocationEnd = GenomicEnd;
+ Match->GenomicLocationStart = GenomicStart;
+ if (Match->MatchQualityScore < ScoreToBeat)
+ {
+ // Not good enough - forget it!
+
+ SafeFree(Match);
+ return NULL;
+ }
+ // It's good enough to add to the list:
+ //printf("NEC_ERROR:Match: %s, Tweak[z=%d,m=%d], Score: %f\n",Match->Bases,Match->Tweak->Charge, Match->Tweak->ParentMass, Match->MatchQualityScore);
+ //for(i = 0; i < 16; ++i)
+ // {
+ // printf("ScoreFeature[%d] = %f\n",i,Match->ScoreFeatures[i]);
+ // }
+
+ Match = StoreSpectralMatch(Spectrum, Match, PeptideLength, 1);
+ if (!Match)
+ {
+
+ return NULL;
+ }
+ //DebugPrintMatch(Match);
+ // Store the match details, if requested:
+ if (GlobalOptions->ReportAllMatches)
+ {
+ PepInfo = (PeptideMatch*)calloc(1, sizeof(PeptideMatch));
+ PepInfo->FilePos = FilePos;
+ PepInfo->RecordNumber = Info->RecordNumber;
+ PepInfo->Tag = Tag;
+ if (Match->Last)
+ {
+ Match->Last->Next = PepInfo;
+ }
+ else
+ {
+ Match->First = PepInfo;
+ }
+ Match->Last = PepInfo;
+ }
+ return Match;
+}
+
+// Print a list (one per line) of all the decorations we generated for
+// the available post-translational modifications.
+void DebugPrintDecoratedMassList()
+{
+ int Index;
+ int ModIndex;
+ //
+ printf("Decorated masses: (%d in all)\n", DecorationMassCount);
+ for (Index = 0; Index < DecorationMassCount; Index++)
+ {
+ printf(" %.2f: ",DecorationMasses[Index]);
+ for (ModIndex = 0; ModIndex <= GlobalOptions->MaxPTMods; ModIndex++)
+ {
+ if (DecorationMassMods[Index][ModIndex]<0)
+ {
+ // That's all the modifications in this one.
+ printf("(end)\n");
+ break;
+ }
+ printf("%d: %s (%.2f), ", DecorationMassMods[Index][ModIndex], PTModName[DecorationMassMods[Index][ModIndex]], ModMasses[DecorationMassMods[Index][ModIndex]]);
+ }
+ }
+ printf("End of decorated mass list.\n");
+}
+
+// Helper macro for quick-sort
+#define DECO_SWAP(a,b) \
+{ \
+fSwap = Masses[(a)]; \
+memcpy(TempSpace, Mods[(a)], sizeof(int) * GlobalOptions->MaxPTMods); \
+Masses[(a)] = Masses[b]; \
+memcpy(Mods[(a)], Mods[(b)], sizeof(int) * GlobalOptions->MaxPTMods); \
+Masses[(b)] = fSwap; \
+memcpy(Mods[(b)], TempSpace, sizeof(int) * GlobalOptions->MaxPTMods); \
+}
+
+// Sort decorations using QuickSort. We're sorting the array Masses, but we'll
+// also make the corresponding
+// changes to the 2D array Mods, to keep the arrays in synch.
+// Reminder: Quick-sort is done recursively. Take the first element of the array as a pivot, then
+// 'pseudo-sort' the remaining elements so that all the EARLY elements (those less than the pivot)
+// come before all the LATE elements (those larger than the pivot). (The 'pseudo-sort' is done by
+// moving a left index and a right index in from the edges of the array until they meet.)
+// Then - here's the recursion part - use quick-sort to sort the early and late elements.
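+//
+// Worked partition example (illustrative numbers only, not from any real decoration list):
+//   Masses = {5.0, 2.0, 8.0, 1.0, 9.0}, pivot = 5.0.
+//   LeftIndex walks right past 2.0 and stops at 8.0; RightIndex walks left past 9.0 and stops at 1.0.
+//   Swap them -> {5.0, 2.0, 1.0, 8.0, 9.0}. The indices then cross, so the pivot is swapped with the
+//   element at RightIndex -> {1.0, 2.0, 5.0, 8.0, 9.0}, and we recurse on the two halves.
+//   The rows of Mods are carried along with every swap; that is exactly what DECO_SWAP does.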
+void QuickSortDecoratedMasses(float* Masses, int** Mods, int Count)
+{
+ float fSwap;
+ int TempSpace[1024];
+ float Pivot;
+ int LeftIndex;
+ int RightIndex;
+ // Sorting a list of one element is easy:
+ if (Count<2)
+ {
+ return;
+ }
+ // Sorting a list of two elements is easy:
+ if (Count == 2)
+ {
+ if (Masses[0] > Masses[1])
+ {
+ DECO_SWAP(0,1);
+ }
+ return;
+ }
+ // Now the REAL case begins:
+ Pivot = Masses[0];
+ LeftIndex = 1;
+ RightIndex = Count-1;
+ while (LeftIndex < RightIndex)
+ {
+ while (Masses[LeftIndex] <= Pivot)
+ {
+ LeftIndex++;
+ if (LeftIndex == Count)
+ {
+ // Pivot element is the biggest of all!
+ DECO_SWAP(0, Count-1);
+ QuickSortDecoratedMasses(Masses, Mods, Count-1);
+ return;
+ }
+ }
+ while (Masses[RightIndex] > Pivot)
+ {
+ RightIndex--;
+ }
+ if (RightIndex == 0)
+ {
+ // Pivot element is the smallest of all!
+ QuickSortDecoratedMasses(Masses+1, Mods+1, Count-1);
+ return;
+ }
+ if (RightIndex > LeftIndex)
+ {
+ DECO_SWAP(LeftIndex, RightIndex);
+ }
+ }
+ DECO_SWAP(0, RightIndex);
+ QuickSortDecoratedMasses(Masses, Mods, RightIndex);
+ QuickSortDecoratedMasses(Masses+RightIndex+1, Mods+RightIndex+1, Count-RightIndex-1);
+
+}
+
+int PopulateDecoratedMassList(float* TotalMass, int** Mods,
+ float MassSoFar, int* UsedMods, int UsedModCount)
+{
+ int Index;
+ int MinModIndex;
+ int RecordsBuilt = 0;
+ //
+ // If our prefix is mod #1, don't do 1,0; just do 1,1 and onward. (Decorations
+ // are listed from lowest PTM-index to largest)
+ if (UsedModCount)
+ {
+ MinModIndex = UsedMods[UsedModCount-1];
+ }
+ else
+ {
+ MinModIndex = 0;
+ }
+ // Consider adding no mods at all:
+ for (Index = 0; Index < UsedModCount; Index++)
+ {
+ Mods[0][Index] = UsedMods[Index];
+ }
+ TotalMass[0] = MassSoFar;
+ RecordsBuilt++;
+ if (UsedModCount == GlobalOptions->MaxPTMods)
+ {
+ return 1;
+ }
+ // Ok: Extend with each legal (lexicographically subsequent) modification!
+ for (Index = MinModIndex; Index < TotalPTMods; Index++)
+ {
+ UsedMods[UsedModCount] = Index;
+ RecordsBuilt += PopulateDecoratedMassList(TotalMass + RecordsBuilt, Mods + RecordsBuilt,
+ MassSoFar + ModMasses[Index], UsedMods, UsedModCount+1);
+ }
+ return RecordsBuilt;
+
+}
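+// Example of the enumeration order (a sketch, assuming two PTM types A = index 0 and B = index 1,
+// and GlobalOptions->MaxPTMods = 2): the recursion above produces the decorations
+// {}, {A}, {A,A}, {A,B}, {B}, {B,B} - every multiset of at most MaxPTMods mods, listed with
+// non-decreasing PTM indices - and records each decoration's summed mass in TotalMass.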
+
+int GetDecoratedMassCount(int AvailableMods, int PermissibleModCount)
+{
+ int ModIndex;
+ int Total;
+ if (PermissibleModCount == 0)
+ {
+ return 1;
+ }
+ Total = 1; // If we add no more
+ for (ModIndex = 0; ModIndex < AvailableMods; ModIndex++)
+ {
+ Total += GetDecoratedMassCount(AvailableMods - ModIndex, PermissibleModCount - 1);
+ }
+ return Total;
+}
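+// Sanity check (illustrative, same assumptions as the example above): GetDecoratedMassCount(2, 2)
+// returns 6, matching the number of decorations that PopulateDecoratedMassList enumerates.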
+
+//Trie.c::ProcessGeneHitsBlindTag
+//This function processes all the one-sided hits to a single gene from the
+//blind tagging option, sending them to a function that finds the PTM site and the scores.
+//1. Tags (or the container) are sent to the function SeekMatch1PTM
+int ProcessGeneHitsBlindTag()
+{
+ BlindTagMatch* Match;
+ int Counter = 0;
+
+ for (Match = FirstBlindTag; Match; Match = Match->Next)
+ {
+ Counter ++;
+ }
+ printf ("Processing a gene with %d matches\n",Counter);
+ return 1;
+}
+//Trie.c::IsIdenticalBlindTagMatches
+//Returns true (1) if the two tag matches are identical,
+//else false (0).
+//Conditions for identity:
+//1. Tags come from the same spectrum and Tweak
+//2. Tags have the same extension anchor point (ExtendDBLoc)
+//3. Tags extend in the same direction.
+
+int IsIdenticalBlindTagMatches(BlindTagMatch* NodeA, BlindTagMatch* NodeB)
+{
+ if (NodeA->Tag->PSpectrum != NodeB->Tag->PSpectrum)
+ {
+ return 0;
+ }
+ if (NodeA->Tag->Tweak != NodeB->Tag->Tweak)
+ {
+ return 0;
+ }
+ if (NodeA->ExtendDBLoc != NodeB->ExtendDBLoc)
+ {
+ return 0;
+ }
+ if (NodeA->ExtendLR != NodeB->ExtendLR)
+ { //this one may be unnecessary but it is in there for completeness
+ return 0;
+ }
+ return 1;
+}
+//Trie.c::InsertBlindTagMatch
+//Inserts a match into the linked list, first testing whether it is
+//identical to an already existing entry.
+//If the new object is not inserted, it is freed here (the calling
+//function expects us to take ownership). The same applies to any
+//object in the list that the new match replaces.
+int InsertBlindTagMatch(BlindTagMatch* Match)
+{
+ BlindTagMatch* NodeA;
+ BlindTagMatch* Prev = NULL; //in case we do some swapping in the list
+ BlindTagMatch* Next = NULL;
+
+ if (FirstBlindTag == NULL) //just started
+ {
+ FirstBlindTag = Match;
+ LastBlindTag = Match;
+ return 1;
+ }
+
+ //cycle through the list, and see if there are any identical tags.
+ // if identical tags exist, then we keep only the one with the
+ //longer extension. In the absence of any twin, we put it at the end
+ for (NodeA = FirstBlindTag; NodeA; NodeA = NodeA->Next)
+ {
+ if (IsIdenticalBlindTagMatches(Match, NodeA))
+ { //decide who to keep
+ if (NodeA->ExtendLength >= Match->ExtendLength)
+ { //winner is already in the list
+ FreeBlindTagMatch(Match);
+ return 1;
+ }
+ //have to remove item in the list. swap in Match
+ if (NodeA->Prev == NULL) //first Item
+ {
+ FirstBlindTag = Match;
+ Next = NodeA->Next; //temp
+ Match->Next = Next;
+ Next->Prev = Match;
+ FreeBlindTagMatch(NodeA);
+ return 1;
+ }
+ if (NodeA->Next == NULL) //last item
+ {
+ LastBlindTag = Match;
+ Prev = NodeA->Prev;
+ Match->Prev = Prev;
+ Prev->Next = Match;
+ FreeBlindTagMatch(NodeA);
+ return 1;
+ }
+ //default else, nodeA in the middle
+ Prev = NodeA->Prev;
+ Next = NodeA->Next;
+ Match->Prev = Prev;
+ Match->Next = Next;
+ Prev->Next = Match;
+ Next->Prev = Match;
+ FreeBlindTagMatch(NodeA);
+ return 1;
+ }
+ }
+ LastBlindTag->Next = Match; //add onto the end
+ Match->Prev = LastBlindTag; //point back
+ LastBlindTag = Match; //move end
+ return 1;
+}
+// Main method: Use a trie to search a data-file. Return the number of proteins searched.
+int ScanFileWithTrie(SearchInfo* Info)
+{
+ FILE* File;
+ int FilePos = 0;
+ char* Buffer;
+ int BufferPos = 0;
+ int BufferEnd = 0;
+ int AnchorPos = -1; // -1 means that no anchor is set
+ TrieNode* Node;
+ TrieNode* NextNode;
+ int IsEOF = 0;
+ int BytesRead;
+ int OldPos;
+ int PaddingDistance = 50;
+ int Verbose = 0;
+ //
+ Info->RecordNumber = 0;
+ File = Info->DB->DBFile;
+ if (!File)
+ {
+ return 0;
+ }
+ fseek(File, 0, 0);
+ if (!Info->Root)
+ {
+ return 0;
+ }
+
+ Buffer = (char*)calloc(SCAN_BUFFER_SIZE, sizeof(char));
+ Node = Info->Root;
+ // We read the file in chunks and scan across them. We try to always keep at least 50 characters
+ // of buffer before and after the current position, so that we can look forward and back to get masses.
+ // (When we match a tag string, we examine the surrounding masses.)
+ while (1)
+ {
+
+ //printf("Anc %d, Buf %d, BufEnd %d, FilePos %d, Char%c\n", AnchorPos, BufferPos, BufferEnd, FilePos, Buffer[BufferPos]);
+ // Periodically shunt data toward the front of the buffer:
+ if (BufferPos > SCAN_BUFFER_A && AnchorPos==-1)
+ {
+ // ......ppppBbbbbbbbbbE... <- diagram (p = pad, B = buffer start, E = buffer end)
+ // ppppBbbbbbbbbbE.... <- after move
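+ // Illustrative numbers (assuming the shift condition above has triggered): with PaddingDistance = 50,
+ // BufferPos = 6000 and BufferEnd = 7000, the memmove below copies bytes [5950, 7000) to the front of
+ // the buffer, BufferEnd becomes 1050, and BufferPos becomes 50, preserving 50 bytes of left context.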
+ memmove(Buffer, Buffer + BufferPos - PaddingDistance, BufferEnd - (BufferPos - PaddingDistance));
+ BufferEnd -= (BufferPos - PaddingDistance);
+ BufferPos = PaddingDistance;
+ }
+
+ // Read more data, if we have room and we can:
+ if (BufferEnd < SCAN_BUFFER_B && !IsEOF)
+ {
+ BytesRead = ReadBinary(Buffer + BufferEnd, sizeof(char), SCAN_BUFFER_SIZE - BufferEnd, File);
+ if (!BytesRead)
+ {
+ IsEOF = 1;
+ }
+ BufferEnd += BytesRead;
+
+ }
+
+ if (AnchorPos!=-1)
+ {
+ // If we're anchored: Attempt to extend the current match.
+ if (Buffer[BufferPos] >= 'A' && Buffer[BufferPos] <= 'Z')
+ {
+ NextNode = Node->Children[Buffer[BufferPos] - 'A'];
+ }
+ else
+ {
+ NextNode = NULL;
+ }
+ // If we can extend the current match...
+ if (NextNode)
+ {
+ // Note any new matches:
+ if (NextNode->FirstTag)
+ {
+ //if(GlobalOptions->RunMode & RUN_MODE_BLINDTAG)
+ //{
+ // ExtendTagMatchBlind(Info, NextNode, Buffer, BufferPos, BufferEnd, FilePos);
+ //}
+ //else
+ //{
+ GetMatches(Info, NextNode, Buffer, BufferPos, BufferEnd, FilePos);
+ //}
+ }
+ // Travel down the trie:
+ Node = NextNode;
+ BufferPos++;
+ FilePos++;
+ }
+ else
+ {
+ // We could NOT extend the match.
+ // We're done with this anchor. Clear the anchor, and use our FailureNode to jump
+ // forward. (AnchorPos moves forward by FailureLetterSkip chars, and the BufferPos
+ // moves to the correct distance ahead of the anchor)
+ if (IS_ROOT(Node->FailureNode))
+ {
+ AnchorPos = -1;
+ }
+ else
+ {
+ AnchorPos = AnchorPos + Node->FailureLetterSkip;
+ OldPos = BufferPos;
+ BufferPos = AnchorPos + Node->FailureNode->Depth - 1;
+ FilePos += (BufferPos - OldPos);
+ // Process matches immediately:
+ if (Node->FailureNode->FirstTag)
+ {
+ // if (GlobalOptions->RunMode & RUN_MODE_BLINDTAG)
+ //{
+ // ExtendTagMatchBlind(Info, NextNode, Buffer, BufferPos, BufferEnd, FilePos);
+ //}
+ //else
+ //{
+ GetMatches(Info, Node->FailureNode, Buffer, BufferPos, BufferEnd, FilePos);
+ //}
+ }
+ BufferPos++;
+ FilePos++;
+ }
+ Node = Node->FailureNode;
+ }
+ }
+ else
+ {
+ // We're not currently anchored. Process end-of-record tags, or attempt to start a
+ // brand new match.
+ if (BufferPos>=BufferEnd || Buffer[BufferPos] == RECORD_END || !Buffer[BufferPos])
+ {
+ // END of a protein.
+ // if (GlobalOptions->RunMode & RUN_MODE_BLINDTAG)
+ //{
+ //ProcessGeneHitsBlindTag(); // Process the blind tags a gene at a time.
+ //FreeAllBlindTagMatches(FirstBlindTag); //free up the hits
+ //FirstBlindTag = NULL; //reset the pointers
+ //LastBlindTag = NULL;
+ //}
+ Info->RecordNumber++;
+ AnchorPos = -1;
+ }
+ else
+ {
+ // Now: Start a new match, if possible:
+ if (Buffer[BufferPos] >= 'A' && Buffer[BufferPos] <= 'Z')
+ {
+ NextNode = Node->Children[Buffer[BufferPos] - 'A'];
+ }
+ else
+ {
+ NextNode = NULL;
+ }
+ if (NextNode)
+ {
+ // Note any new matches. (Not likely, because
+ // at this point in the code, we're only at depth 1 in the
+ // tree; tags of length 1 aren't very good)
+ if (NextNode->FirstTag)
+ {
+ // if (GlobalOptions->RunMode & RUN_MODE_BLINDTAG)
+ //{
+ // ExtendTagMatchBlind(Info, NextNode, Buffer, BufferPos, BufferEnd, FilePos);
+ //}
+ //else
+ //{
+ GetMatches(Info, NextNode, Buffer, BufferPos, BufferEnd, FilePos);
+ //}
+ }
+ Node = NextNode;
+ AnchorPos = BufferPos;
+ }
+ }
+ BufferPos++;
+ FilePos++;
+ if (BufferPos >= BufferEnd)
+ {
+ break;
+ }
+ } // if not anchored
+ } // Master while-loop
+
+ SafeFree(Buffer);
+
+ return Info->RecordNumber + 1;
+}
+
+
+// Print just the tags from our trie:
+void DebugPrintTrieTags(TrieNode* Node)
+{
+ TrieTagHanger* Hanger;
+ int ChildIndex;
+ TrieNode* Failure;
+ char TagBuffer[256];
+ int Len;
+ int ModIndex;
+ if (!Node)
+ {
+ return;
+ }
+ for (Hanger = Node->FirstTag; Hanger; Hanger = Hanger->Next)
+ {
+ ModIndex = 0;
+ TagBuffer[0] = Hanger->Tag->Tag[0];
+ TagBuffer[1] = '\0';
+ if (Hanger->Tag->ModType[ModIndex] && Hanger->Tag->AminoIndex[ModIndex] == 0)
+ {
+ strcat(TagBuffer, Hanger->Tag->ModType[ModIndex]->Name);
+ ModIndex++;
+ }
+ Len = strlen(TagBuffer);
+ TagBuffer[Len] = Hanger->Tag->Tag[1];
+ TagBuffer[Len+1] = '\0';
+ if (Hanger->Tag->ModType[ModIndex] && Hanger->Tag->AminoIndex[ModIndex] == 1)
+ {
+ strcat(TagBuffer, Hanger->Tag->ModType[ModIndex]->Name);
+ ModIndex++;
+ }
+ Len = strlen(TagBuffer);
+ TagBuffer[Len] = Hanger->Tag->Tag[2];
+ TagBuffer[Len+1] = '\0';
+
+ if (Hanger->Tag->ModType[ModIndex] && Hanger->Tag->AminoIndex[ModIndex] == 2)
+ {
+ strcat(TagBuffer, Hanger->Tag->ModType[ModIndex]->Name);
+ ModIndex++;
+ }
+
+ //ARI_MOD - for tags of length 4
+ Len = strlen(TagBuffer);
+ TagBuffer[Len] = Hanger->Tag->Tag[3];
+ TagBuffer[Len+1] = '\0';
+
+
+ if (Hanger->Tag->ModType[ModIndex] && Hanger->Tag->AminoIndex[ModIndex] == 3)
+ {
+ strcat(TagBuffer,Hanger->Tag->ModType[ModIndex]->Name);
+ ModIndex++;
+ }
+
+
+ printf("Tag '%s' (prefix %.2f, Suffix %.2f) %.2f hits %d\n", TagBuffer,
+ Hanger->Tag->PrefixMass / (float)MASS_SCALE,
+ Hanger->Tag->SuffixMass / (float)MASS_SCALE,
+ Hanger->Tag->Score, Hanger->Tag->CandidatesScored);
+#ifdef DEBUG_TAG_GENERATION
+ printf("%s\n", Hanger->Tag->TagScoreDetails);
+#endif
+ }
+ Failure = Node->FailureNode;
+ if (Node->FirstTag && Failure)
+ {
+ printf(" Node %s has failure node depth %d letter %c.\n", Node->FirstTag->Tag->Tag, Failure->Depth, Failure->Letter);
+ }
+
+ for (ChildIndex = 0; ChildIndex < 26; ChildIndex++)
+ {
+ if (ChildIndex == 'I'-'A' || ChildIndex == 'Q'-'A')
+ {
+ continue;
+ }
+ if (Node->Children[ChildIndex])
+ {
+ DebugPrintTrieTags(Node->Children[ChildIndex]);
+ }
+ }
+}
+
+void FlagMandatoryModUsage(TrieNode* Node)
+{
+ TrieTagHanger* Hanger;
+ int CharIndex;
+ int ModIndex;
+ //
+ if (!Node)
+ {
+ return;
+ }
+ for (Hanger = Node->FirstTag; Hanger; Hanger = Hanger->Next)
+ {
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Hanger->Tag->ModType[ModIndex] && Hanger->Tag->ModType[ModIndex]->Index == GlobalOptions->MandatoryModIndex)
+ {
+ Hanger->Tag->MandatoryModUsed = 1;
+ }
+ }
+ }
+
+ for (CharIndex = 0; CharIndex < TRIE_CHILD_COUNT; CharIndex++)
+ {
+ FlagMandatoryModUsage(Node->Children[CharIndex]);
+ }
+}
+
+// COPYPASTA from WriteMatchToString.
+void WriteTagToString(TrieTag* Tag, char* Buffer, int IncludeMods)
+{
+ char* Stuff;
+ int AminoPos;
+ char NameChar;
+ int ModIndex;
+ int NameIndex;
+ //
+ Stuff = Buffer;
+
+ for (AminoPos = 0; AminoPos < strlen(Tag->Tag); AminoPos++)
+ {
+ *Stuff++ = Tag->Tag[AminoPos];
+ if (IncludeMods)
+ {
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Tag->AminoIndex[ModIndex] == AminoPos && Tag->ModType[ModIndex])
+ {
+ // Write the modification:
+ for (NameIndex = 0; NameIndex < 4; NameIndex++)
+ {
+ NameChar = Tag->ModType[ModIndex]->Name[NameIndex];
+ if (!NameChar)
+ {
+ break;
+ }
+ *Stuff++ = ConvertToLower(NameChar);
+ }
+ }
+ }
+ }
+ }
+ *Stuff = '\0';
+}
+
+// Write (to a char buffer) the string version of a peptide, including modifications.
+// For example: "EAM+16APK". Similar to the method PeptideClass.GetModdedName
+void WriteMatchToString(Peptide* Match, char* Buffer, int IncludeMods)
+{
+ char* Stuff;
+ int AminoPos;
+ char NameChar;
+ int ModIndex;
+ int NameIndex;
+ //
+ Stuff = Buffer;
+
+ if (Match->PrefixAmino)
+ {
+ *Stuff++ = Match->PrefixAmino;
+ }
+ else
+ {
+ *Stuff++ = '-';
+ }
+ *Stuff++ = '.';
+ for (AminoPos = 0; AminoPos < strlen(Match->Bases); AminoPos++)
+ {
+ *Stuff++ = Match->Bases[AminoPos];
+ if (IncludeMods)
+ {
+ for (ModIndex = 0; ModIndex < MAX_PT_MODS; ModIndex++)
+ {
+ if (Match->AminoIndex[ModIndex]==AminoPos)
+ {
+ // Write the modification:
+ for (NameIndex = 0; NameIndex < 4; NameIndex++)
+ {
+ NameChar = Match->ModType[ModIndex]->Name[NameIndex];
+ if (!NameChar)
+ {
+ break;
+ }
+ *Stuff++ = ConvertToLower(NameChar);
+ }
+ }
+ }
+ }
+ }
+ *Stuff++ = '.';
+ if (Match->SuffixAmino)
+ {
+ *Stuff++ = Match->SuffixAmino;
+ }
+ else
+ {
+ *Stuff++ = '-';
+ }
+ *Stuff = '\0';
+}
diff --git a/Trie.h b/Trie.h
new file mode 100644
index 0000000..b33ac7d
--- /dev/null
+++ b/Trie.h
@@ -0,0 +1,309 @@
+//Title: Trie.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef TRIE_H
+#define TRIE_H
+
+// Implementation of the Aho-Corasick algorithm for string search using
+// a trie automaton. Also, implementation of the d.p. tag extension algorithm
+// in the presence of PTMs.
+
+#include "Inspect.h"
+#include <stdio.h>
+#include "Utils.h"
+#include "Spectrum.h"
+#include "Mods.h"
+
+// Tags, produced from mass-spec analysis. A tag consists of a sequence of bases
+// (e.g. "QVL"), a prefix mass, and a suffix mass. Tags are stored in the nodes
+// of a trie.
+// In the simple case, leaf nodes in the trie have just one tag. But
+// there may be multiple tags with the same bases but different prefix/suffix
+// masses; such tags hang from the same trie node. (We use
+// the TrieTagHanger struct to hold lists of tags. The extra struct adds some
+// overhead; one advantage is that a tag can be in more than one list.)
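+//
+// Illustrative example (hypothetical peptide and masses): for a spectrum whose peptide is
+// ...K.AQVLDER.S..., one tag might be Tag = "QVL" with PrefixMass roughly the mass of "A" and
+// SuffixMass roughly the mass of "DER"; the same trie node "QVL" could also hang a second tag
+// with different flanking masses (say, from another spectrum), each held by its own TrieTagHanger.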
+typedef struct TrieTag
+{
+#ifdef DEBUG_TAG_GENERATION
+ char TagScoreDetails[2048];
+#endif
+ int PrefixMass;
+ int SuffixMass;
+ int Charge;
+ int ParentMass;
+ int TagLength;
+ int CandidatesScored;
+ // How far this tag's mass is from the ACTUAL mass of the peptide
+ float Error;
+ // Rank of this tag (0 being best)
+ int Rank;
+ // Score of this tag (higher is better):
+ float Score;
+ // PTMods used up in generating this tag. These count against our allowable total.
+ // AminoIndex is set to -1 for unused entries.
+ int AminoIndex[MAX_PT_MODS];
+ MassDelta* ModType[MAX_PT_MODS];
+ int ModsUsed;
+ int MandatoryModUsed; // if GlobalOptions->MandatoryModIndex is set.
+ // The tag itself:
+ char Tag[MAX_TAG_LENGTH + 1];
+ struct MSSpectrum* PSpectrum;
+ SpectrumTweak* Tweak;
+ int PrefixExtends;
+ int SuffixExtends;
+ int DBTagMatches;
+ // Some members for training edge skew measures:
+ int TotalSkew;
+ int TotalAbsSkew;
+ struct TagGraphNode* Nodes[4];
+} TrieTag;
+
+// A trie (from 'reTRIEval') is a tree where each node corresponds to a word. The root
+// corresponds to an empty string, and a node's children correspond to that node's word
+// extended by one letter. The trie data structure allows fast searches for any of the
+// words in the trie. In this case, the 'words' are short peptides, and the database
+// is swiss-prot or some other collection of peptides.
+//
+// During the search, the ANCHOR is the start of our current match (if any).
+typedef struct TrieNode
+{
+ struct TrieNode* Children[TRIE_CHILD_COUNT];
+ // Depth == length of our string.
+ // Root has depth 0, its children have depth 1...
+ int Depth;
+
+ // The failure node is an optimization which makes trie searching fast.
+ // Suppose we just finished matching the tag PANTS. Naively, we would move on
+ // to tags starting with A. But if we have a node ANT, maybe we can jump there
+ // directly. If we have no nodes starting with A, we can jump to the N.
+ // The FailureNode 'precomputes our jumps' - we move the anchor FailureLetterSkip
+ // letters forward from the old anchor, and switch to the given failure node.
+ // If FailureNode is equal to the trie root, then we CLEAR THE ANCHOR.
+ int FailureLetterSkip;
+ struct TrieNode* FailureNode;
+
+ // Our (most recently added) letter:
+ char Letter;
+ // Our list of tags:
+ struct TrieTagHanger* FirstTag;
+ struct TrieTagHanger* LastTag;
+} TrieNode;
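+
+// Illustrative failure-link example (a sketch based on the PANTS/ANT comment above): if the trie
+// contains the word ANT, the node PANT gets FailureNode = the node ANT and FailureLetterSkip = 1.
+// When a match cannot be extended past PANT, the anchor moves one letter forward (onto the A) and
+// scanning resumes at depth 3, instead of restarting from the root.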
+
+typedef struct PeptideSpliceNode
+{
+ int ChromosomeNumber;
+ int DonorPos;
+ int AcceptorPos;
+ struct PeptideSpliceNode* Next;
+} PeptideSpliceNode;
+
+// A Peptide struct represents the peptide we use to annotate a tandem mass spectrum - possibly
+// with PTMs, and prefix and suffix residues.
+typedef struct Peptide
+{
+ int ParentMassError;
+ char Bases[256];
+
+ char PrefixAmino; // The base BEFORE the peptide starts. (Useful for checking trypsin fragmentation)
+
+ char SuffixAmino; // The base AFTER the peptide ends.
+
+ // FilePos is the byte-offset in the database where the peptide starts. If the peptide is found
+ // multiple times, FilePos is the position within the file of the FIRST occurrence of the peptide.
+ int FilePos;
+
+ // RecordNumber is the protein record # where the peptide is found
+ int RecordNumber;
+
+ int InitialScore;
+
+ float MatchQualityScore;
+
+ // For the best match, DeltaCN is the difference in score between this match and its runner-up. For
+ // other matches, DeltaCN is the difference in score between them and the best match (i.e. DeltaCN is
+ // negative for them). We compute DeltaCN because a larger DeltaCN value generally indicates
+ // a better match.
+ float DeltaCN;
+
+ // DeltaCNOther is the difference in score between this peptide and the best runner-up that's NOT
+ // the same peptide. "Same" means "file-pos at most 2 steps away, or sequence has at most two diffs".
+ float DeltaCNOther;
+ struct Peptide* FamilyLeader;
+ float FScore;
+ float PValue;
+
+ // We may "own" our own mass delta, in which case we must free it when we dealloc:
+ MassDelta* PetDelta;
+
+ // Track the nth post-translational modification by setting AminoIndex[n] to the index of the
+ // modified amino acid, and ModType[n] to the modification type. Set AminoIndex to -1 for
+ // the extra records.
+ int AminoIndex[MAX_PT_MODS];
+ MassDelta* ModType[MAX_PT_MODS];
+ struct PeptideMatch* First;
+ struct PeptideMatch* Last;
+ struct Peptide* Next;
+ struct Peptide* Prev;
+ int PrefixMass; // Used only if this is a tag
+ int SuffixMass; // Used only if this is a tag
+
+ // DB is a pointer to the database which this match comes from.
+ DatabaseFile* DB;
+ PeptideSpliceNode* SpliceHead; // if splice-tolerant
+ int GenomicLocationStart; // if splice-tolerant
+ int GenomicLocationEnd; // if splice-tolerant
+ int ChromosomeNumber; // if splice-tolerant
+ int ChromosomeForwardFlag; // if splice-tolerant
+ char* SplicedBases; // if splice-tolerant
+ int ParentMass;
+ SpectrumTweak* Tweak;
+ float ScoreFeatures[16];
+ int SpecialFragmentation;
+ int SpecialModPosition;
+} Peptide;
+
+typedef struct PeptideMatch
+{
+ int FilePos;
+ int RecordNumber;
+ TrieTag* Tag;
+ struct PeptideMatch* Next;
+} PeptideMatch;
+
+typedef float (*ScoringFunction)(MSSpectrum* Spectrum, Peptide* Match, int VerboseFlag);
+
+typedef struct SearchInfo
+{
+ DatabaseFile* DB;
+ int RecordNumber;
+ //ScoringFunction Scorer;
+ MSSpectrum* Spectrum;
+ TrieNode* Root;
+ int VerboseFlag;
+} SearchInfo;
+
+//Container for information about a blind tag match.
+//These matches extend on only one side; we call that side the anchored side.
+typedef struct BlindTagMatch
+{
+ TrieTag* Tag;
+ struct BlindTagMatch* Next;
+ struct BlindTagMatch* Prev;
+ //denotes the direction of the matched (modless) extension
+ int ExtendLR; // -1 for Left, 1 for Right
+ int ExtendDBLoc; // the location in the database where the extension matches
+ int TagDBLoc; // the location in the DB where the first letter of the tag matches.
+ int ExtendLength; //length of the anchored extension XXXTAG---- means an extend len of 3
+} BlindTagMatch;
+
+// PTModCount lists how many post-translational mods exist for each amino acid:
+extern int PTModCount[TRIE_CHILD_COUNT];
+
+// PTMods lists the mass of each post-translational mod for each amino acid:
+extern float PTMods[TRIE_CHILD_COUNT][MAX_PT_MODTYPE];
+
+// Table of prefix and suffix peptide masses
+extern int PeptideMass[256];
+extern int StandardPeptideMass[256];
+
+#define IS_ROOT(node) ((node)->Depth == 0)
+
+
+// For constructing lists of TrieTags. (A single TrieTag can be part of more than
+// one list, by using more than one TrieTagHanger)
+typedef struct TrieTagHanger
+{
+ struct TrieTagHanger* Prev;
+ struct TrieTagHanger* Next;
+ TrieTag* Tag;
+} TrieTagHanger;
+
+// Constructor: TrieNode
+TrieNode* NewTrieNode();
+
+// Destructor: TrieNode
+void FreeTrieNode(TrieNode* This);
+
+// Constructor: TrieTag
+TrieTag* NewTrieTag();
+
+// Destructor: TrieTag
+void FreeTrieTag(TrieTag* This);
+
+// Add a new tag to the trie. New trie nodes will be added, if necessary, in order
+// to hold the tag. (For instance, adding "CAT" to a root node with no children would
+// add three nodes: C, CA, and CAT).
+TrieNode* AddTagToTrie(TrieNode* Root, TrieTag* Tag, int* DuplicateFlag);
+
+// Constructor: TrieTagHanger
+TrieTagHanger* NewTrieTagHanger();
+
+// Destructor: TrieTagHanger
+void FreeTrieTagHanger(TrieTagHanger* This);
+
+// Debug: Print a trie to stdout
+void DebugPrintTrie(TrieNode* Root);
+
+// Print all matches
+void PrintMatches(MSSpectrum* Spectrum, char* IndexFileName);
+
+// Load the masses for amino acids: n-terminal (left) and c-terminal (right) masses
+int LoadPeptideMasses(char* FileName);
+
+// Initialize GlobalOptions, a global variable storing configurable options.
+void InitOptions();
+
+void InitStats();
+
+// Important main method: Use a trie to search a data-file.
+int ScanFileWithTrie(SearchInfo* Info);
+
+int GetMaxTagRank(TrieNode* Root);
+//int ComparePeptideScores(const Peptide* A, const Peptide* B);
+void PrintMatch(Peptide* Match, FILE* IndexFile);
+void DebugPrintTrieTags(TrieNode* Node);
+void InitializeTrieFailureNodes(TrieNode* Root, TrieNode* Node, char* Tag);
+void FreePeptideNode(Peptide* Pep);
+void GetProteinID(int RecordNumber, DatabaseFile* DB, char* Name);
+void FlagMandatoryModUsage(TrieNode* Node);
+void WriteMatchToString(Peptide* Match, char* Buffer, int IncludeMods);
+Peptide* StoreSpectralMatch(MSSpectrum* Spectrum, Peptide* Match, int PeptideLength, int MQScoreFlag);
+Peptide* NewPeptideNode();
+int CheckForPTAttachmentPoints(int DecorationMassIndex, char* Buffer, int Start, int End, int BufferDir);
+Peptide* AddNewMatch(SearchInfo* Info, int FilePos, TrieTag* Tag, char* Peptide,
+ int PeptideLength, int TagPosition, int PrefixDecoration, int SuffixDecoration,
+ int GenomicStart, int GenomicEnd);
+Peptide* GetPeptideFromAnnotation(char* Annotation);
+int GetPeptideParentMass(Peptide* Match);
+void WriteTagToString(TrieTag* Tag, char* Buffer, int IncludeMods);
+
+#endif //TRIE_H
diff --git a/TrieUtils.py b/TrieUtils.py
new file mode 100644
index 0000000..907e9a6
--- /dev/null
+++ b/TrieUtils.py
@@ -0,0 +1,256 @@
+#Title: TrieUtils.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+import os
+import sys
+import struct
+
+TRIE_FAIL = -1
+TRIE_BLOCKSIZE = 4096
+MIN_TRIE_SEARCH_SIZE = 512
+
+def Union(List,NewElement):
+ for i in range(0,len(List)):
+ if List[i] == NewElement:
+ return List
+
+ List.append(NewElement)
+ return List
+
+def UnionList(List1,List2):
+ for i in range(0,len(List2)):
+ List1 = Union(List1,List2[i])
+ return List1
+
+class TrieUtils:
+ def __init__(self):
+ #print "New TrieUtils!!"
+ return
+
+ def GetProteinSequence(self,ProteinNamePrefix, TrieFileName, IndexFileName):
+ if not os.path.exists(TrieFileName):
+ print "ERROR: TrieUtils.GetProteinSequence: %s is not a valid file name"%TrieFileName
+ return None
+ if not os.path.exists(IndexFileName):
+ print "ERROR: TrieUtils.GetProteinSequence: %s is not a valid file name"%IndexFileName
+ return None
+
+ IndexFile = open(IndexFileName ,'r')
+
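+        # Each index record is packed as "<qi80s": an 8-byte field (presumably
+        # the record's offset in the original source database), a 4-byte offset
+        # into the .trie file, and an 80-byte null-padded protein name.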
+ BlockSize = struct.calcsize("<qi80s")
+ while (1):
+ Block = IndexFile.read(BlockSize)
+ if not Block:
+ IndexFile.close()
+ return None
+
+ Info = struct.unpack("<qi80s",Block)
+ Name = str(Info[2])
+ NullPos = Name.find("\0")
+            if NullPos != -1:
+ Name = Name[:NullPos]
+ TriePos = Info[1]
+ if Name[0:len(ProteinNamePrefix)] == ProteinNamePrefix:
+ TrieFile = open(TrieFileName,'r')
+ TrieFile.seek(TriePos)
+ Seq = ""
+ while Seq.find("*") < 0:
+ Seq += TrieFile.read(256)
+ TrieFile.close()
+ IndexFile.close()
+ return Seq[0:Seq.find("*")]
+
+ IndexFile.close()
+ return None
+
+
+ def GetProteinName(self,IndexFileName,ProteinID):
+ if not os.path.exists(IndexFileName):
+ print "ERROR: TrieUtils.GetProteinSequence: %s is not a valid file name"%IndexFileName
+ return None
+
+ IndexFile = open(IndexFileName ,'r')
+
+ BlockSize = struct.calcsize("<qi80s")
+ IndexFile.seek(BlockSize*ProteinID)
+
+ Block = IndexFile.read(BlockSize)
+ if not Block:
+ IndexFile.close()
+ return None
+
+ Info = struct.unpack("<qi80s",Block)
+ Name = Info[2]
+ NullPos = Name.find("\0")
+        if NullPos != -1:
+ Name = Name[:NullPos]
+ TriePos = Info[1]
+
+ return Name.strip()
+
+
+
+ def GetAllLocations(self,Peptides,TrieFileName):
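+        """
+        Find every occurrence of every peptide in Peptides within the .trie
+        database TrieFileName, using the keyword automaton built by BuildTrie().
+        Returns a dictionary mapping each peptide to a list of
+        (ProteinID, ResidueNum) pairs, where ResidueNum is the 0-based index of
+        the final residue of the match within that protein; proteins in the
+        .trie file are separated by '*' characters, which reset the automaton.
+        """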
+
+ (Transitions,Output,Failure) = self.BuildTrie(Peptides)
+ LocalDebug = 0
+ Locations = {}
+ for P in Peptides:
+ Locations[P] = []
+
+ TrieFile = open(TrieFileName,'r')
+ State = 0
+ ProteinID = 0
+ ResidueNum = 0
+ BlockCount = 0
+ TrieFile.seek(0,2)
+ FileBlocks = TrieFile.tell()/TRIE_BLOCKSIZE
+ TrieFile.seek(0)
+ pos = 0
+ while(1):
+
+ TrieLine = TrieFile.read(TRIE_BLOCKSIZE)
+ BlockCount += 1
+ #print TrieLine
+ if not TrieLine:
+ print "Done with this file"
+ break
+ for i in range(0,len(TrieLine)):
+ if LocalDebug:
+ print "[%s] - %s"%(i,TrieLine[i])
+ print "%s:%s"%(ProteinID,ResidueNum)
+ if TrieLine[i] == '*':
+ ResidueNum = 0
+ ProteinID += 1
+ State = 0
+ if LocalDebug:
+ print "Encountered a *, resetting"
+ #raw_input()
+ continue
+ while Transitions.get((State,TrieLine[i]),TRIE_FAIL) == TRIE_FAIL:
+ if LocalDebug:
+ print "Transition[%s,%s]->%s"%(State,TrieLine[i],TRIE_FAIL)
+ print "FailState[%s]->%s"%(State,Failure[State])
+
+ if(State == Failure[State]):
+ print "Transition[%s,%s]->%s"%(State,TrieLine[i],TRIE_FAIL)
+ print "FailState[%s]->%s"%(State,Failure[State])
+ raw_input()
+ State = Failure[State]
+
+ if LocalDebug:
+ print "Transition[%s,%s]->%s"%(State,TrieLine[i],Transitions[(State,TrieLine[i])])
+ #raw_input()
+ State = Transitions[(State,TrieLine[i])]
+ if Output.has_key(State):
+ for Pep in Output[State]:
+ if LocalDebug:
+ print "*****%s - %s:%s"%(Pep,ProteinID,ResidueNum)
+ raw_input()
+ Locations[Pep].append((ProteinID,ResidueNum))
+ ResidueNum += 1
+ print pos
+ pos += len(TrieLine)
+ if len(TrieLine) < TRIE_BLOCKSIZE:
+ break
+ if LocalDebug:
+ print "Done with block!!"
+ #raw_input()
+ if LocalDebug:
+ print "Done!"
+ raw_input()
+ print "Finished!!!"
+ return Locations
+
+
+ def BuildTrie(self,Peptides):
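+        """
+        Build an Aho-Corasick keyword automaton over the peptide strings:
+        Transitions maps (State, Character) to the next state, Output maps a
+        state to the peptides that end there, and Failure maps a state to the
+        state to fall back to on a mismatch. State 0 is the root, with a
+        self-loop for every amino acid letter that has no outgoing edge.
+        """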
+
+ #Build Transition and Output Functions
+ Transitions = {}
+ Output = {}
+ NewState = 0
+ #Str = ""
+ for Pep in Peptides:
+ State = 0
+ J = 0
+ #print Pep
+ #Str += Pep
+ while(J < len(Pep) and Transitions.get((State,Pep[J]),TRIE_FAIL) != TRIE_FAIL):
+ State = Transitions[(State,Pep[J])]
+ J += 1
+ for P in range(J,len(Pep)):
+ NewState += 1
+ Transitions[(State,Pep[P])] = NewState
+ State = NewState
+
+
+ List = Output.get(State,[])
+ List.append(Pep)
+ Output[State] = List
+ #print Str
+ #raw_input()
+ #Create a self loop at node 0, back to node 0
+ for AA in "ABCDEFGHIKLMNOPQRSTUVWXYZ":
+ S = Transitions.get((0,AA),TRIE_FAIL)
+ if S == TRIE_FAIL:
+ Transitions[(0,AA)] = 0
+
+ #Create Failure Function
+ Queue = []
+ Failure = {}
+ for AA in "ABCDEFGHIKLMNOPQRSTUVWXYZ":
+ S = Transitions.get((0,AA),TRIE_FAIL)
+ if S != 0:
+ Queue = Union(Queue,S)
+ Failure[S] = 0
+ while len(Queue) > 0:
+ R = Queue.pop(0)
+ for AA in "ABCDEFGHIKLMNOPQRSTUVWXYZ":
+ S = Transitions.get((R,AA),TRIE_FAIL)
+ if S != TRIE_FAIL:
+ Queue = Union(Queue,S)
+ State = Failure[R]
+ while(Transitions.get((State,AA),TRIE_FAIL) == TRIE_FAIL):
+ State = Failure[State]
+ Failure[S] = Transitions[(State,AA)]
+ Output[S] = UnionList(Output.get(S,[]),Output.get(Failure[S],[]))
+
+ Failure[0] = 0
+ return (Transitions,Output,Failure)
+
+
+
+
+
+if __name__=="__main__":
+ print "TrieUtils.py"
diff --git a/Utils.c b/Utils.c
new file mode 100644
index 0000000..c7a8f70
--- /dev/null
+++ b/Utils.c
@@ -0,0 +1,683 @@
+//Title: Utils.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#include "CMemLeak.h"
+#include "Utils.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdarg.h>
+
+
+
+// From high to low
+int CompareFloats(const float* a, const float* b)
+{
+ if (*a > *b)
+ {
+ return -1;
+ }
+ if (*a < *b)
+ {
+ return 1;
+ }
+ return 0;
+}
+
+// From high to low
+int CompareInts(const int* a, const int* b)
+{
+ if (*a > *b)
+ {
+ return -1;
+ }
+ if (*a < *b)
+ {
+ return 1;
+ }
+ return 0;
+}
+
+
+// Copy one line (up to a \r or \n character) from a source buffer to a target buffer.
+// Optionally, strip out spaces. Return the position just AFTER the end of the line.
+// (If a line ends in \r\n, we'll end up processing the line, and then one empty line; that's okay)
+// If a line is very long, we stop copying, and skip over the rest of it.
+int CopyBufferLine(char* Source, int BufferPos, int BufferEnd, char* LineBuffer, int StripWhitespace)
+{
+ int LinePos = 0;
+ int LineComplete = 0;
+ int Chars = 0;
+ int Skipping = 0;
+ //
+ while (!LineComplete)
+ {
+ if (BufferPos > BufferEnd)
+ {
+ // Our line extends off the edge of the buffer. That's probably a Bad Thing.
+ printf("** Warning: Ran off the edge of the buffer in CopyBufferLine. Line too ling?\n");
+ LineBuffer[LinePos] = '\0';
+ return BufferPos;
+ }
+ switch (Source[BufferPos])
+ {
+ case ' ':
+ if (StripWhitespace)
+ {
+ BufferPos++;
+ }
+ else
+ {
+ if (!Skipping)
+ {
+ LineBuffer[LinePos++] = Source[BufferPos];
+ }
+ BufferPos++;
+ Chars++;
+ }
+ break;
+ case '\r':
+ case '\n':
+ LineBuffer[LinePos] = '\0';
+ BufferPos++;
+ LineComplete = 1;
+ break;
+ case '\0':
+ LineBuffer[LinePos] = '\0';
+ LineComplete = 1;
+ break;
+ default:
+ if (!Skipping)
+ {
+ LineBuffer[LinePos++] = Source[BufferPos];
+ }
+ BufferPos++;
+ Chars++;
+ break;
+ }
+ if (Chars == MAX_LINE_LENGTH - 1)
+ {
+ printf("** Error: Line too long! Truncating line.");
+ // Read the rest of the chars, but don't write them:
+ Chars = 0;
+ Skipping = 1;
+ }
+ }
+ return BufferPos;
+}
+
+void ParseFileByLines(FILE* File, FileLineParser LineParser, void* ParseData, int ProcessCommentLines)
+{
+ char LineBuffer[MAX_LINE_LENGTH];
+ char TextBuffer[BUFFER_SIZE * 2];
+ int LineNumber = 0;
+ int FilePos;
+ int NewFilePos = 0;
+ int BytesToRead;
+ int BufferEnd = 0;
+ int BytesRead;
+ int BufferPos = 0;
+ int KeepParsingFlag;
+ //
+ if (!File)
+ {
+ return;
+ }
+ NewFilePos = ftell(File);
+ while (1)
+ {
+ FilePos = NewFilePos;
+ BytesToRead = BUFFER_SIZE - BufferEnd;
+ BytesRead = ReadBinary(TextBuffer + BufferEnd, sizeof(char), BytesToRead, File);
+ BufferEnd += BytesRead;
+ TextBuffer[BufferEnd] = '\0';
+ if (BufferPos == BufferEnd)
+ {
+ // We're done!
+ break;
+ }
+ BufferPos = CopyBufferLine(TextBuffer, BufferPos, BufferEnd, LineBuffer, 0);
+ if (!BufferPos)
+ {
+ // We encountered a null character. Force advance:
+ BufferPos++;
+ }
+ NewFilePos = FilePos + BufferPos;
+
+ LineNumber += 1;
+ // Now, move the remaining text to the start of the buffer:
+ memmove(TextBuffer, TextBuffer + BufferPos, BufferEnd - BufferPos);
+ BufferEnd -= BufferPos;
+ BufferPos = 0;
+ // Now, process this line of text!
+ // Skip empty lines:
+ if (!LineBuffer[0])
+ {
+ continue;
+ }
+ // Skip comment lines:
+ if (LineBuffer[0] == '#' && !ProcessCommentLines)
+ {
+ continue;
+ }
+ KeepParsingFlag = LineParser(LineNumber, FilePos, LineBuffer, ParseData);
+ if (!KeepParsingFlag)
+ {
+ break;
+ }
+ }
+}
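+
+// Hedged usage sketch (hypothetical callback, not part of Inspect proper): a
+// FileLineParser that counts the lines handed to it. ParseFileByLines() calls
+// the parser once per non-empty, non-comment line until it returns 0.
+//
+//   int CountLinesCallback(int LineNumber, int FilePos, char* LineBuffer, void* ParseData)
+//   {
+//       int* Counter = (int*)ParseData;
+//       *Counter += 1;
+//       return 1; // nonzero means "keep parsing"
+//   }
+//
+//   int LineCount = 0;
+//   ParseFileByLines(File, CountLinesCallback, &LineCount, 0);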
+
+// Note: despite its name, this macro normalizes A-Z to lower case; CompareStrings
+// only needs a consistent case for its comparison, so the direction is immaterial.
+#define FORCE_UPPER(X) X = ((X) >= 'A' && (X) <= 'Z' ? (X) + 'a' - 'A' : (X));
+
+// Case-insensitive string comparison. Returns 1 if A<B, -1 if A>B, 0 if they match;
+// like CompareInts and CompareFloats above, this orders strings from high to low.
+int CompareStrings(const char* StringA, const char* StringB)
+{
+ const char* CharA;
+ const char* CharB;
+ char A;
+ char B;
+ CharA = StringA;
+ CharB = StringB;
+ while (1)
+ {
+ if (!*CharA && !*CharB)
+ {
+ return 0;
+ }
+ A = *CharA;
+ B = *CharB;
+ FORCE_UPPER(A);
+ FORCE_UPPER(B);
+ //if (isupper(A)) A = ConvertToLower(A);
+ //if (isupper(B)) B = ConvertToLower(B);
+ if (A < B)
+ {
+ return 1;
+ }
+ if (A > B)
+ {
+ return -1;
+ }
+ CharA++;
+ CharB++;
+ }
+}
+
+#ifdef __ppc__
+// Reads a little-endian binary file on a big-endian system: do a raw fread,
+// then byte-swap each item in place.
+size_t ReadBinary(void* Buffer, size_t ItemSize, size_t ItemCount, FILE* File)
+{
+    size_t ItemIndex;
+    size_t ByteIndex;
+    unsigned char SwapValue;
+    char* CharBuffer;
+    size_t ItemsRead;
+
+    ItemsRead = fread(Buffer, ItemSize, ItemCount, File); // raw fread
+
+    CharBuffer = (char*)Buffer;
+
+    for (ItemIndex = 0; ItemIndex < ItemsRead; ItemIndex++)
+    {
+        for (ByteIndex = 0; ByteIndex < ItemSize >> 1; ByteIndex++)
+        {
+            // Swap the first and last bytes, then bytes 1 and max - 1, etc.
+            SwapValue = CharBuffer[ItemSize * ItemIndex + ByteIndex];
+            CharBuffer[ItemSize * ItemIndex + ByteIndex] = CharBuffer[ItemSize * ItemIndex + ItemSize - ByteIndex - 1];
+            CharBuffer[ItemSize * ItemIndex + ItemSize - ByteIndex - 1] = SwapValue;
+        }
+    }
+    return ItemsRead;
+}
+
+// We're on a big-endian system, and we must write out a little-endian file.
+size_t WriteBinary(void* Buffer, size_t ItemSize, size_t ItemCount, FILE* File)
+{
+    char ItemBuffer[256];
+    size_t ItemIndex;
+    size_t ByteIndex;
+    size_t ItemsWritten = 0;
+    char* CharBuffer = (char*)Buffer;
+    //
+    // Write a byte-swapped version of each item to ItemBuffer, then write ItemBuffer to disk.
+    for (ItemIndex = 0; ItemIndex < ItemCount; ItemIndex++)
+    {
+        for (ByteIndex = 0; ByteIndex < ItemSize; ByteIndex++)
+        {
+            ItemBuffer[ItemSize - ByteIndex - 1] = CharBuffer[ItemSize * ItemIndex + ByteIndex];
+        }
+        ItemsWritten += fwrite(ItemBuffer, ItemSize, 1, File); // raw fwrite
+    }
+    return ItemsWritten;
+}
+
+#else
+#define ReadBinary fread
+#define WriteBinary fwrite
+#endif
+
+char TranslateCodon(char* DNA)
+{
+ switch (DNA[0])
+ {
+ case 'T':
+ case 't':
+ switch (DNA[1])
+ {
+ case 'T':
+ case 't':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'F';
+ case 'C':
+ case 'c':
+ return 'F';
+ case 'A':
+ case 'a':
+ return 'L';
+ case 'G':
+ case 'g':
+ return 'L';
+ }
+ break;
+ case 'C':
+ case 'c':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'S';
+ case 'C':
+ case 'c':
+ return 'S';
+ case 'A':
+ case 'a':
+ return 'S';
+ case 'G':
+ case 'g':
+ return 'S';
+ }
+ break;
+ case 'A':
+ case 'a':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'Y';
+ case 'C':
+ case 'c':
+ return 'Y';
+ case 'A':
+ case 'a':
+ return 'X';
+ case 'G':
+ case 'g':
+ return 'X';
+ }
+ break;
+ case 'G':
+ case 'g':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'C';
+ case 'C':
+ case 'c':
+ return 'C';
+ case 'A':
+ case 'a':
+ return 'X';
+ case 'G':
+ case 'g':
+ return 'W';
+ }
+ break;
+ }
+ break;
+ case 'C':
+ case 'c':
+ switch (DNA[1])
+ {
+ case 'T':
+ case 't':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'L';
+ case 'C':
+ case 'c':
+ return 'L';
+ case 'A':
+ case 'a':
+ return 'L';
+ case 'G':
+ case 'g':
+ return 'L';
+ }
+ break;
+ case 'C':
+ case 'c':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'P';
+ case 'C':
+ case 'c':
+ return 'P';
+ case 'A':
+ case 'a':
+ return 'P';
+ case 'G':
+ case 'g':
+ return 'P';
+ }
+ break;
+ case 'A':
+ case 'a':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'H';
+ case 'C':
+ case 'c':
+ return 'H';
+ case 'A':
+ case 'a':
+ return 'Q';
+ case 'G':
+ case 'g':
+ return 'Q';
+ }
+ break;
+ case 'G':
+ case 'g':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'R';
+ case 'C':
+ case 'c':
+ return 'R';
+ case 'A':
+ case 'a':
+ return 'R';
+ case 'G':
+ case 'g':
+ return 'R';
+ }
+ break;
+ }
+ break;
+ case 'A':
+ case 'a':
+ switch (DNA[1])
+ {
+ case 'T':
+ case 't':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'I';
+ case 'C':
+ case 'c':
+ return 'I';
+ case 'A':
+ case 'a':
+ return 'I';
+ case 'G':
+ case 'g':
+ return 'M';
+ }
+ break;
+ case 'C':
+ case 'c':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'T';
+ case 'C':
+ case 'c':
+ return 'T';
+ case 'A':
+ case 'a':
+ return 'T';
+ case 'G':
+ case 'g':
+ return 'T';
+ }
+ break;
+ case 'A':
+ case 'a':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'N';
+ case 'C':
+ case 'c':
+ return 'N';
+ case 'A':
+ case 'a':
+ return 'K';
+ case 'G':
+ case 'g':
+ return 'K';
+ }
+ break;
+ case 'G':
+ case 'g':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'S';
+ case 'C':
+ case 'c':
+ return 'S';
+ case 'A':
+ case 'a':
+ return 'R';
+ case 'G':
+ case 'g':
+ return 'R';
+ }
+ break;
+ }
+ break;
+ case 'G':
+ case 'g':
+ switch (DNA[1])
+ {
+ case 'T':
+ case 't':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'V';
+ case 'C':
+ case 'c':
+ return 'V';
+ case 'A':
+ case 'a':
+ return 'V';
+ case 'G':
+ case 'g':
+ return 'V';
+ }
+ break;
+ case 'C':
+ case 'c':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'A';
+ case 'C':
+ case 'c':
+ return 'A';
+ case 'A':
+ case 'a':
+ return 'A';
+ case 'G':
+ case 'g':
+ return 'A';
+ }
+ break;
+ case 'A':
+ case 'a':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'D';
+ case 'C':
+ case 'c':
+ return 'D';
+ case 'A':
+ case 'a':
+ return 'E';
+ case 'G':
+ case 'g':
+ return 'E';
+ }
+ break;
+ case 'G':
+ case 'g':
+ switch (DNA[2])
+ {
+ case 'T':
+ case 't':
+ return 'G';
+ case 'C':
+ case 'c':
+ return 'G';
+ case 'A':
+ case 'a':
+ return 'G';
+ case 'G':
+ case 'g':
+ return 'G';
+ }
+ break;
+ }
+ break;
+ }
+ return 'X';
+}
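+// For example, TranslateCodon("ATG") returns 'M', the three stop codons
+// (TAA, TAG, TGA) return 'X', and any codon containing a character other than
+// A/C/G/T (in either case) also falls through to 'X'.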
+
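+// Write the reverse complement of the null-terminated DNA string Source into
+// Destination (which the caller must size to at least strlen(Source) characters).
+// For example, "GATTC" becomes "GAATC". Note that, as written, this function
+// does not null-terminate Destination.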
+void WriteReverseComplement(char* Source, char* Destination)
+{
+ char* A;
+ char* B;
+ A = Source;
+ while (*A)
+ {
+ A++;
+ }
+ A--;
+ B = Destination;
+ while (A >= Source)
+ {
+ switch (*A)
+ {
+ case 'C':
+ case 'c':
+ *B = 'G';
+ break;
+ case 'G':
+ case 'g':
+ *B = 'C';
+ break;
+ case 'A':
+ case 'a':
+ *B = 'T';
+ break;
+ case 'T':
+ case 't':
+ *B = 'A';
+ break;
+ }
+ A--;
+ B++;
+ }
+}
+
+// Reverse a null-terminated string in place:
+void ReverseString(char* String)
+{
+ char* A;
+ char* Z;
+ char Temp;
+ int Len;
+ if (!String)
+ {
+ return;
+ }
+ Len = strlen(String);
+ if (!Len)
+ {
+ return;
+ }
+ Z = String + Len - 1;
+ A = String;
+ while (A < Z)
+ {
+ Temp = *Z;
+ *Z = *A;
+ *A = Temp;
+ A++;
+ Z--;
+ }
+}
+
+float GetMedian(float* Values, int ValueCount)
+{
+ qsort(Values, ValueCount, sizeof(float), (QSortCompare)CompareFloats);
+ if (ValueCount % 2)
+ {
+ return Values[ValueCount / 2];
+ }
+ else
+ {
+ return (Values[ValueCount / 2] + Values[(ValueCount / 2) - 1]) / (float)2.0;
+ }
+}
diff --git a/Utils.h b/Utils.h
new file mode 100644
index 0000000..9ea29a6
--- /dev/null
+++ b/Utils.h
@@ -0,0 +1,345 @@
+//Title: Utils.h
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+#ifndef UTILS_H
+#define UTILS_H
+
+#include <memory.h>
+#include <string.h>
+#include <stdio.h>
+
+#ifndef _WIN32
+// linux lacks these:
+#define max(X,Y) (((X)>(Y)) ? (X) : (Y))
+#define min(X,Y) (((X)>(Y)) ? (Y) : (X))
+#endif
+
+#ifdef _WIN32
+#define SEPARATOR '\\'
+#define SEPARATOR_STRING "\\"
+#else
+#define SEPARATOR '/'
+#define SEPARATOR_STRING "/"
+#endif
+
+// We don't like compiler warnings. Therefore, we cast all our
+// qsort comparison callbacks, in order to avoid this:
+// Warning: "passing arg 4 of `qsort' from incompatible pointer type"
+typedef int (*QSortCompare)(const void*, const void*);
+
+// For tokenizing strings:
+#define WHITESPACE " \r\n\t"
+
+// It seems that tolower is not defined on OSX, so we define our own:
+#define ConvertToLower(c) (((c)<'A' || (c)>'Z') ? (c) : ((c)-'A' + 'a'))
+
+#define DIGEST_TYPE_UNKNOWN 0
+#define DIGEST_TYPE_TRYPSIN 1
+
+#define MIN_VALID_PEPTIDE_LENGTH 6
+
+// For debugging tag generation. (Comment it out to disable)
+//#define DEBUG_TAG_GENERATION
+
+// Note: some compilers used here report sizeof(long long) as 4, which is INCORRECT,
+// so we hard-code the size instead:
+#define LONG_LONG_SIZE 8
+
+// There are many places where we index into an array by an amino acid's index (A == 0, C == 2, and so on).
+// These arrays waste a little space because there is no entry for B, J, O, U, X, Z. But they are
+// very time-efficient.
+#define AMINO_ACIDS 26
+
+#define SCORE_TOP_MATCH_VERBOSELY 1
+#define BUFFER_SIZE 1024
+#define MAX_FILENAME_LEN 1024
+#define MAX_MATCHES 10
+
+// We divide the full m/z range into PRM bins. Masses are stored as ints, where
+// one dalton equals a mass of 1000 units, so each PRM bin is 0.1Da
+// (PRM_BIN_SIZE = 100 units) wide. There are still some places where the bin
+// size is hard-coded.
+#define PRM_BIN_SIZE 100
+
+// Number of padding entries added to the end of the PRMScores array. Useful if we want the score for a PRM that's
+// outside our mass range, but just barely.
+#define PRM_ARRAY_SLACK 10
+
+// Maximum allowed length for a line in line-based data files. (If the line is longer
+// than this, we report an error)
+#define MAX_LINE_LENGTH 2048
+
+#define TAG_EDGE_SCORE_MULTIPLIER 20
+
+#define FAST_ROUND(Float, Int)\
+{\
+ Int = (int)((Float) + 0.5);\
+}
+
+#define ROUND_MASS(FloatMass, IntMass)\
+{\
+ (IntMass) = (int)(FloatMass * 1000 + 0.5);\
+}
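+// For example, ROUND_MASS(57.02146, Mass) stores 57021 in Mass: the glycine
+// residue mass expressed in thousandths of a dalton.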
+
+#define ROUND_MASS_TO_DELTA_BIN(x, bin) \
+{\
+(bin) = (int)(((x) + 200000) / 1000.0 + 0.5);\
+}
+
+
+// Instrument type LTQ is the default.
+#define INSTRUMENT_TYPE_LTQ 0
+// QTOF: Fragmentation properties are different (b series is weaker, y series is stronger).
+// Parent masses are quite accurate, so parent mass correction is NOT performed
+#define INSTRUMENT_TYPE_QTOF 1
+// FT hybrid: Parent masses are extremely accurate, so parent mass correction is NOT performed.
+// The fragment masses can still be a bit inaccurate however.
+#define INSTRUMENT_TYPE_FT_HYBRID 2
+
+#define RUN_MODE_DEFAULT 0
+#define RUN_MODE_TAGS_ONLY 1
+#define RUN_MODE_MUTATION 2
+#define RUN_MODE_BLIND 4
+#define RUN_MODE_BLIND_TAG 8
+#define RUN_MODE_PMC_ONLY 16
+#define RUN_MODE_PREP_MS2DB 32
+#define RUN_MODE_TAG_MUTATION 64
+#define RUN_MODE_RAW_OUTPUT 128
+
+
+#define PMC_FEATURE_RAW 1
+#define PMC_FEATURE_AVG_RATIO 2
+#define PMC_FEATURE_AVG_DIFF 4
+
+typedef enum DatabaseType
+{
+ evDBTypeTrie = 0,
+ evDBTypeMS2DB,
+ evDBTypeSpliceDB
+} DatabaseType;
+
+typedef struct DatabaseFile
+{
+ char FileName[MAX_FILENAME_LEN + 1];
+ char IndexFileName[MAX_FILENAME_LEN + 1];
+ int Type;
+ struct DatabaseFile* Next;
+ FILE* DBFile;
+ FILE* IndexFile;
+} DatabaseFile;
+
+typedef struct StringNode
+{
+ struct StringNode* Next;
+ char* String;
+} StringNode;
+
+
+
+// Global options. (Set on command-line or in config-file)
+typedef struct Options
+{
+ // RunMode is a set of flags describing which overall code path to take.
+ int RunMode;
+
+ // maximum number of post-translational mods to allow in a match
+ int MaxPTMods;
+
+ // maximum allowed mass error for prefix/suffix peptides
+ int Epsilon;
+ int PeakPPM;
+
+ // maximum allowed mass error for prefix/suffix masses
+ int FlankingMassEpsilon;
+
+ // return at most this many matches in a search
+ int MaxMatches;
+
+ // -v provides extended debugging info
+ int VerboseFlag;
+
+ // amino acid input-file
+ char AminoFileName[MAX_FILENAME_LEN];
+
+ // -o output file (if not set, print matches to stdout)
+ char FinalOutputFileName[MAX_FILENAME_LEN];
+ char OutputFileName[MAX_FILENAME_LEN]; // Intermediate output, before p-value computation
+ char ErrorFileName[MAX_FILENAME_LEN];
+ int ErrorCount;
+ int WarningCount;
+ DatabaseFile* FirstDatabase;
+ DatabaseFile* LastDatabase;
+
+ // -m file listing legal post-translational modifications
+ char PTModFileName[MAX_FILENAME_LEN];
+
+ // -i input file name
+ char InputFileName[MAX_FILENAME_LEN];
+ char ResourceDir[MAX_FILENAME_LEN];
+
+    // either stdout, or the opened OutputFileName:
+ FILE* OutputFile;
+ // either stderr, or opened ErrorFileName:
+ FILE* ErrorFile;
+
+ // -t requests unit tests
+ int TestingFlag;
+
+ // if true, we remember *all* the occurrences of matched peptides.
+ int ReportAllMatches;
+
+ // How far we're allowed to tweak the parent mass of the spectrum. (Parent masses are often off
+ // by one or two amu)
+ int ParentMassEpsilon;
+ int ParentMassPPM;
+
+ struct Peptide* TruePeptide;
+
+ char MandatoryModName[256];
+
+ int MandatoryModIndex;
+
+ // How many matches to report. Defaults to 5.
+ int ReportMatchCount;
+
+ // How many matches to store for detailed scoring. Defaults to 100.
+ int StoreMatchCount;
+
+ // How many tags shall we generate, and how long shall they be?
+ int GenerateTagCount;
+ int GenerateTagLength;
+
+    // Nonzero if this is a trypsin digest, or some other type of specific digest.
+ // If DigestType != 0, then we can give a penalty for missed cleavages, and a bonus for matching termini
+ int DigestType;
+
+ // Linked list of SpectrumNodes:
+ struct SpectrumNode* FirstSpectrum;
+ struct SpectrumNode* LastSpectrum;
+
+ // Linked list of InputFiles:
+ struct InputFileNode* FirstFile;
+ struct InputFileNode* LastFile;
+
+ int SpectrumCount;
+ int DynamicRangeMin;
+ int DynamicRangeMax;
+ int TaglessSearchFlag;
+
+ // If PhosphorylationFlag, then attempt to interpret phosphorylated peptides. This has implications
+ // for tag-generation, as well as candidate scoring.
+ int PhosphorylationFlag;
+
+ int TagPTMMode; // 0 is free, 1 is forbidden, and 2 is penalized
+
+ int MultiChargeMode; // if 1, try multiple parent charge states.
+
+ int TrieBlockSize;
+ int InstrumentType;
+ // Options for unrestrictive PTM search:
+ // DeltaBinCount is the number of mass bins in the range [MinPTMDelta, MaxPTMDelta],
+ // by default it equals 400 * 10 = 4000.
+ int MinPTMDelta;
+ int MaxPTMDelta;
+ int DeltaBinCount;
+ int DeltasPerAA; // == max(DeltaBinCount*2, 512)
+ // If TRUE, then use PepNovo for tag generation (assumed to live in working directory!)
+ int ExternalTagger;
+
+ // Options for producing an .ms2db file from .gff files:
+ StringNode* FirstGFFFileName;
+ StringNode* LastGFFFileName;
+ char GenomeFileName[MAX_FILENAME_LEN + 1];
+ char ChromosomeName[256 + 1];
+
+ // If XMLStrictFlag is set, then we'll complain about any unexpected
+ // tags or attributes. This is useful when debugging .ms2db file
+ // generation. In production, this flag won't generally be set,
+ // because it is officially Allowable to add new tags and
+ // attributes to an .ms2db file.
+ int XMLStrictFlag;
+
+ // if RequireTermini is 1 or 2, then we accept only semi-tryptic or fully-tryptic matches.
+ int RequireTermini;
+
+ int NewScoring; //temporary flag while we work on a new code path for scoring
+
+
+ float MinLogOddsForMutation; //MinimumLogOddsForAMutation
+} Options;
+
+extern Options* GlobalOptions;
+
+int CopyBufferLine(char* Source, int BufferPos, int BufferEnd, char* LineBuffer, int StripWhitespace);
+int CompareFloats(const float* a, const float* b);
+int CompareInts(const int* a, const int* b);
+int CompareStrings(const char* StringA, const char* StringB);
+char TranslateCodon(char* DNA);
+void WriteReverseComplement(char* Source, char* Destination);
+void ReverseString(char* String);
+
+#ifdef __ppc__
+size_t ReadBinary(void* Buffer, size_t ItemSize, size_t ItemCount, FILE* stream);
+size_t WriteBinary(void* Buffer, size_t ItemSize, size_t ItemCount, FILE* stream);
+#define BYTEORDER_BIG_ENDIAN
+#else
+#define ReadBinary fread
+#define WriteBinary fwrite
+#define BYTEORDER_LITTLE_ENDIAN
+#endif
+
+void AssertionFailed(char* Assertion, char* FileName, int LineNumber);
+
+#define INSPECT_ASSERT(expr) \
+ if (!(expr)) \
+ AssertionFailed(#expr, __FILE__, __LINE__)
+
+#define SafeFree(Pointer)\
+ if (Pointer) \
+ {\
+ free(Pointer);\
+ }
+
+// a FileLineParser is called once per line as a callback from ParseFileByLines();
+// it receives the line number, the file offset of that line, the line text, and
+// the caller's ParseData pointer, and returns nonzero to keep parsing.
+typedef int (*FileLineParser)(int LineNumber, int FilePos, char* LineBuffer, void* ParseData);
+
+void ParseFileByLines(FILE* File, FileLineParser Parser, void* ParseData, int ProcessCommentLines);
+float GetMedian(float* Values, int ValueCount);
+
+
+//#define PMC_USE_SVM
+#define MQSCORE_USE_SVM
+
+#define MQ_FEATURE_COUNT 7
+
+#endif //UTILS_H
diff --git a/Utils.py b/Utils.py
new file mode 100644
index 0000000..2c8351d
--- /dev/null
+++ b/Utils.py
@@ -0,0 +1,1074 @@
+#Title: Utils.py
+#Author: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+#Created: 2005
+# Copyright 2007,2008,2009 The Regents of the University of California
+# All Rights Reserved
+#
+# Permission to use, copy, modify and distribute any part of this
+# program for educational, research and non-profit purposes, by non-profit
+# institutions only, without fee, and without a written agreement is hereby
+# granted, provided that the above copyright notice, this paragraph and
+# the following three paragraphs appear in all copies.
+#
+# Those desiring to incorporate this work into commercial
+# products or use for commercial purposes should contact the Technology
+# Transfer & Intellectual Property Services, University of California,
+# San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+# Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+#
+# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+# INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+# IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+# OF SUCH DAMAGE.
+#
+# THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+# OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+# ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+# REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+# EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+# THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+"""
+Various support functions and constants to support ms/ms algorithms.
+Amino acid mass importer, ion type list generation.
+"""
+import Global
+import os
+import sys
+import string
+import types
+
+if hasattr(os, "sysconf"):
+ IS_WINDOWS = 0
+else:
+ IS_WINDOWS = 1
+
+
+# IDs for all chromosome files. Usually ID == chromosome number, but we also
+# give numbers to X, Y, and all of the unplaced/unlocalized ("random") sequences
+ChromosomeMap = {"chr1":1, "chr2":2, "chr3":3, "chr4":4,
+ "chr5":5, "chr6":6, "chr7":7, "chr8":8,
+ "chr9":9, "chr10":10, "chr11":11, "chr12":12,
+ "chr13":13, "chr14":14, "chr15":15, "chr16":16,
+ "chr17":17, "chr18":18, "chr19":19, "chr20":20,
+ "chr21":21, "chr22":22, "chrX":23, "chrY":24,
+ "chrM":25, "chr1_random":26, "chr2_random":27, "chr3_random":28,
+ "chr4_random":29, "chr5_random":30, "chr6_random":31, "chr7_random":32,
+ "chr8_random":33, "chr9_random":34, "chr10_random":35, "chr11_random":36,
+ "chr12_random":37, "chr13_random":38, "chr14_random":39, "chr15_random":40,
+ "chr16_random":41, "chr17_random":42, "chr18_random":43, "chr19_random":44,
+ "chr20_random":45, "chr21_random":46, "chr22_random":47, "chrX_random":48,
+ "chrx":23, "chry":24, "chrm":25, "chrx_random":48
+ }
+
+ReverseChromosomeMap = ["", "chr1", "chr2", "chr3", "chr4", "chr5",
+ "chr6", "chr7", "chr8", "chr9", "chr10",
+ "chr11", "chr12", "chr13", "chr14", "chr15",
+ "chr16", "chr17", "chr18", "chr19", "chr20",
+ "chr21", "chr22", "chrX", "chrY", "chrM",
+ "chr1_random", "chr2_random", "chr3_random", "chr4_random", "chr5_random",
+ "chr6_random", "chr7_random", "chr8_random", "chr9_random", "chr10_random",
+ "chr10_random", "chr12_random", "chr13_random", "chr14_random", "chr15_random",
+ "chr16_random", "chr17_random", "chr18_random", "chr19_random", "chr20_random",
+ "chr21_random", "chr22_random", "chrX_random",]
+
+MassPTMods = {}
+
+class Bag:
+ "Generic argument-container class"
+ pass
+
+def FixupPath(Path):
+ if IS_WINDOWS:
+ return Path.replace("/", "\\")
+ else:
+ return Path.replace("\\", "/")
+
+class PTModClass:
+    "A type of post-translational modification, such as phosphorylation."
+    InstanceCount = 0
+ def __init__(self, Name):
+ self.Name = Name
+ # Peptides that this modification can affect:
+ self.Bases = {}
+ self.BaseString = ""
+ self.Mass = 0.0
+ self.Score = 0.0
+ PTModClass.InstanceCount += 1
+ try:
+ Mass = int(Name)
+ self.Mass = Mass
+ except:
+ pass
+ def __del__(self):
+ if PTModClass:
+ PTModClass.InstanceCount -= 1
+ def __str__(self):
+ return "<PTMod '%s'>"%self.Name
+
+def LoadPTMods():
+ """
+ Read the definitions of post-translational modifications from PTMods.txt.
+ Line format is tab-delimited, like this: "Alkylation 14.01564 CKRHDENQ"
+ (This is rarely used in practice, but is useful for search results that are annotated with names instead of masses)
+ """
+ FileName = ""
+ for path in sys.path:
+ FileName = os.path.join(path,"PTMods.txt")
+ if os.path.exists(FileName):
+ #print FileName
+ break
+ else:
+ FileName = ""
+ if FileName == "":
+ print "Utils: Unable to open PTMods.txt"
+ sys.exit(1)
+ File = open(FileName, 'r')
+ for FileLine in File.xreadlines():
+ FileLine = FileLine.strip()
+ if (not FileLine) or FileLine[0]=="#":
+ continue
+ Bits = FileLine.split("\t")
+ PTMod = PTModClass(Bits[0])
+ PTMod.Mass = float(Bits[1])
+ PTMod.BaseString = Bits[2]
+ for Char in Bits[2]:
+ PTMod.Bases[Char] = 1
+ Global.PTMods[PTMod.Name.lower()] = PTMod
+ Global.PTModByShortName[PTMod.Name[:3].lower()] = PTMod
+ Global.PTModByShortName[PTMod.Name[:4].lower()] = PTMod
+ Global.PTModList.append(PTMod)
+ File.close()
+
+class ProteinClass:
+ """
+ Class representing a single protein: a collection of peptides.
+ Can compute sequence coverage, as well as the list of modifications (PTM)
+ """
+ def __init__(self,Sequence, Type= None):
+ self.CellType = Type #use by sam for comparing cell types
+ self.Sequence = Sequence
+ self.SequenceCoverage = [0]*len(self.Sequence) #counts of spectra at each residue
+ self.Peptides = [] #list of UnmodifiedPeptide objects
+ self.PositionModSpectraDict = {} #key = (Name,Position) value = ModdedSpectra
+ self.Coverage = 0.0
+ def GenerateSequenceCoverage(self):
+ """
+ Goes through all the peptides and increments TotalSpectraCount for the
+ Residues that it covers.
+ """
+ for UPeptide in self.Peptides: #UnmodifiedPeptide
+ Start = self.Sequence.find(UPeptide.Aminos)
+ Len = len(UPeptide.Aminos)
+ for I in range(Start,Start+Len,1):
+ self.SequenceCoverage[I] += UPeptide.TotalSpectraCount
+ Covered =0
+ for I in range(len(self.Sequence)):
+ if self.SequenceCoverage[I] > 0:
+ Covered +=1
+ Coverage = Covered/ float(len(self.Sequence))
+ self.Coverage = Coverage
+ def GenerateModList(self):
+ """
+ This method runs through all the peptides in self.Peptides
+ and generates a list of modifications on residues
+ """
+ for UPep in self.Peptides:
+ PeptidePosition = self.Sequence.find(UPep.Aminos)
+ TotalSpectra = UPep.TotalSpectraCount
+ for Peptide in UPep.Peptides: # PeptideClass Object
+ for (AminoIndex, ModList) in Peptide.Modifications.items():
+ SpectraThisPeptide = UPep.SpectraCount[Peptide.GetModdedName()]
+ ModificationPosition = AminoIndex + PeptidePosition
+ for Mod in ModList: #PTModClass objects
+ Name = Mod.Name
+ Key = (Name,ModificationPosition)
+ if self.PositionModSpectraDict.has_key(Key):
+ self.PositionModSpectraDict[Key] += SpectraThisPeptide
+ else:
+ self.PositionModSpectraDict[Key] = SpectraThisPeptide
+
+ def AddAnnotation(self,Peptide,SpectrumCounts):
+ "Add a Peptide to the protein"
+ Found = 0
+ for UPeptide in self.Peptides:
+ if UPeptide.Aminos == Peptide.Aminos: # same aminos already a UPep object,
+ #perhaps this is a modified version of the same
+ UPeptide.AddAnnotation(Peptide,SpectrumCounts)
+ Found =1
+ break
+ if not Found:
+ ToAdd = UnmodifiedPeptide(Peptide,SpectrumCounts)
+ self.Peptides.append(ToAdd)
+
+class UnmodifiedPeptide:
+ """A wrapper for the PeptideClass, it contains all modified states of the
+ same amino acid sequence. Useful sometimes.
+ """
+ def __init__(self,Peptide,SpectrumCounts): #PeptideClass below
+ "constructor"
+ self.Aminos = Peptide.Aminos
+ self.UnmodifiedSpectraCount =0
+ self.TotalSpectraCount =0
+ self.Peptides = [] #an array of PeptideClass objects
+ self.SpectraCount ={} #key = fullname of peptide (no prefix/suffix), value = spectracount
+ self.AddAnnotation(Peptide,SpectrumCounts)
+ #maybe some sort of modification list of distinct modifications
+
+ def IsMe(self,Peptide):
+ if self.Aminos == Peptide.Aminos:
+ return 1
+ return 0
+
+ def PrintMe(self):
+ print "UnmodifiedPeptide Object: %s"%self.UnmodifiedSequence
+ print "Total Spectra %d, UnmodifiedSpectra %d"%(self.TotalSpectraCount,self.UnmodifiedSpectraCount)
+
+ def AddAnnotation(self,NewPeptide,SpectrumCounts):
+ "adds an annotation to my list, and updates tallies"
+ self.TotalSpectraCount += SpectrumCounts
+ if len (NewPeptide.Modifications) == 0:
+ if self.UnmodifiedSpectraCount == 0:
+ self.Peptides.append(NewPeptide)
+ self.UnmodifiedSpectraCount += SpectrumCounts
+ else:
+ #determine if we've already got this modification
+ Found =0
+ for MyPeptide in self.Peptides:
+ if MyPeptide.GetModdedName() == NewPeptide.GetModdedName():
+                    Found = 1
+ break
+ if not Found:
+ self.Peptides.append(NewPeptide)
+ self.SpectraCount[NewPeptide.GetModdedName()] = SpectrumCounts
+
+
+class ModificationTypeObject:
+ """
+ This holds information about a specific type of Modification. Remember the format
+ mod,14,KR TAB #methylation.
+ mod,DELTA,AMINOS,POSITION,whatever TAB #Name
+ """
+ def __init__(self,Latin,Name,DMass,Residues,Position):
+ self.inLatin = Latin #inVivo, inVitro
+ self.Name = Name
+ self.DeltaMass = DMass
+ self.Residues = Residues
+ if self.Residues == "*":
+ self.Residues = "ACDEFGHIKLMNPQRSTVWY"
+ self.Position = Position
+ self.InspectID = ""
+        ##Do a little processing. The way a modification shows up in the Inspect
+ ## output is with a +43, or a -17, or possibly phos. So I need to generate
+ ## the InspectIdentifier that can be compared later in the RemoveSelf function
+ if self.Name == "phosphorylation":
+ self.InspectID = "phos"
+ elif self.DeltaMass < 0:
+ self.InspectID = "%s"%self.DeltaMass
+ elif self.DeltaMass > 0:
+ "this is for positive values of DeltaMass which present a problem"
+ self.InspectID = "+" + "%s"%self.DeltaMass
+ #try:
+ # if self.DeltaMass[0] == "+":
+ # self.InspectID = "%s"%self.DeltaMass
+ # self.DeltaMass = int(self.DeltaMass[1:])
+ #except:
+ # self.InspectID = "+" + "%s"%self.DeltaMass
+
+ def PrintMe(self):
+ "Simple Debugging printer"
+ print "I am a ModificationTypeObject for %s"%self.Name
+ print "InspectID %s"%self.InspectID
+ print "AcceptableResidues %s"%self.Residues
+
+ def RemoveSelf(self,Annotation):
+ """
+ This method takes an input string and looks for modifications which correspond
+ to its identity. If any are found, it removes them from the string and returns it.
+ It will remove all copies of itself from the String
+ """
+ InMod = 0
+ StartIndex = -1
+ ModString = ""
+        I = 0 #loop iterator
+ while I < len(Annotation):
+ Letter = Annotation[I]
+ if not Letter in string.uppercase:
+ if not InMod:
+ StartIndex=I
+ InMod = 1
+ ModString += Letter
+ elif InMod:
+ #this is the first upper case letter after a modification.
+ if ModString == self.InspectID:
+ #this is my Identifier, Check position and residue
+ PositionCheck = 0 #false
+ ResidueCheck = 0
+ if self.Position == "nterminal" and StartIndex == 1:
+ PositionCheck = 1
+ elif not self.Position:
+ PositionCheck = 1 #no position specified (it should be None)
+ ### Add other position things in here as you get them ###
+ ModifiedResidue = Annotation[StartIndex-1]
+ if self.Residues.find(ModifiedResidue) >= 0:
+ #found the modified residue in self.residue string
+ ResidueCheck = 1
+ if PositionCheck and ResidueCheck:
+ Front = Annotation[:StartIndex]
+ EndIndex = StartIndex + len(self.InspectID)
+ Back = Annotation[EndIndex:]
+ Annotation = Front + Back
+ I = StartIndex-1 ##### VERY IMPORTANT to go back once the Annotation has been reset.
+ #regardless of whether this was actually me or not, still reset the vars below
+ InMod = 0
+ ModString = ""
+ I += 1 #must increment for the while loop
+ return Annotation
+
+ def IsMe(self, Identifier,Residue, Position):
+ """
+ Check to see if all the criteria match
+ """
+ if not Identifier == self.InspectID:
+ return 0
+ if self.Position == "nterminal":
+ if Position > 0: #zero indexed string
+ return 0
+ ### add other position identifiers if you have them
+ if self.Residues.find(Residue) < 0:
+ return 0 # returned a -1 for "not found"
+ return 1
+
+def LoadModifications():
+ """
+ This method reads in two files: InVivoModifications.txt and InVitroModifications.txt
+ It makes a ModificationTypeObject out of each mod listed in the files
+ (except fixed mods). These input files are expected to be of the format
+ mod,14,KR TAB #methylation.
+ mod,DELTA_MASS,AMINOS,POSITION__TAB__#Modification name
+ """
+ FileName = ""
+ for path in sys.path:
+ FileName = os.path.join(path,"InVivoModifications.txt")
+ if os.path.exists(FileName):
+ #print FileName
+ break
+ else:
+ FileName = ""
+ if FileName == "":
+ print "Utils: Unable to open InVivoModifications.txt"
+ sys.exit(1)
+ LoadModificationsFromFile(FileName, Global.InVivoMods, "InVivo")
+ FileName = ""
+ for path in sys.path:
+ FileName = os.path.join(path,"InVitroModifications.txt")
+ if os.path.exists(FileName):
+ #print FileName
+ break
+ else:
+ FileName = ""
+ if FileName == "":
+ print "Utils: Unable to open InVitroModifications.txt"
+ sys.exit(1)
+ LoadModificationsFromFile(FileName, Global.InVitroMods, "InVitro")
+
+def LoadModificationsFromFile(FileName, ModificationList, ChemistryType):
+ try:
+ File = open(FileName,"rb")
+ except:
+ #print "File '%s' not found - not loading mods"%FileName
+ return
+ for Line in File.xreadlines():
+ Line = Line.rstrip()
+ Data = Line.split("\t")
+ Name = Data[1][1:] #should get rid of the '#'
+ Latin = "InVivo"
+ InspectInput = Data[0].rstrip() #get rid of any right side junk
+ Data = InspectInput.split(",")
+ DeltaMass = int (Data[1])
+ Residues = Data[2]
+ if len(Data) > 3:
+ Position = Data[3]
+ else:
+ Position = None
+ Mod = ModificationTypeObject(ChemistryType, Name, DeltaMass, Residues, Position)
+ ModificationList.append(Mod)
+ File.close()
+
+class PeptideClass:
+ """
+ Class representing one peptide, possibly with modifications. We get one PeptideClass instance
+ for every match from the trie-based search. A PeptideClass instance can also (if its PrefixMass
+ and SuffixMass members are set) represent a tag.
+ """
+ # Track number of live instances:
+ InstanceCount = 0
+ def __init__(self, Aminos = ""):
+ "Constructor - if we have amino acids, get our masses now."
+ self.Aminos = Aminos
+ self.Masses = []
+ # Modifications map amino acid indices to a list of PTModClass instances
+ self.Modifications = {}
+ self.Score = None
+ self.ID = None
+ self.RecordNumber = None
+ self.PValue = 0
+ self.DeltaCN = 0
+ self.DeltaCNOther = 0
+ if Aminos:
+ self.ComputeMasses()
+ PeptideClass.InstanceCount += 1
+ def GetPTMBeforeAfter(self, Mass):
+ PTMBefore = {}
+ PTMAfter = {}
+ for (AminoIndex, List) in self.Modifications.items():
+ for Entry in List:
+ if Entry.Mass == Mass:
+ for OtherIndex in range(AminoIndex, len(self.Aminos)+1):
+ PTMBefore[OtherIndex] = 1
+ for OtherIndex in range(0, AminoIndex+1):
+ PTMAfter[OtherIndex] = 1
+ return (PTMBefore, PTMAfter)
+
+ def GetPhosphoBeforeAfter(self):
+ PhosBefore = {}
+ PhosAfter = {}
+ for (AminoIndex, List) in self.Modifications.items():
+ for Entry in List:
+ if Entry.Name == "Phosphorylation":
+ for OtherIndex in range(AminoIndex, len(self.Aminos)+1):
+ PhosBefore[OtherIndex] = 1
+ for OtherIndex in range(0, AminoIndex+1):
+ PhosAfter[OtherIndex] = 1
+ return (PhosBefore, PhosAfter)
+ def __del__(self):
+ if PeptideClass:
+ PeptideClass.InstanceCount -= 1
+ def GetParentMass(self):
+ if not self.Masses:
+ self.ComputeMasses()
+ return 19 + self.Masses[-1]
+ def ComputeMasses(self):
+ """
+ Populate our Masses list, based upon Aminos and Modifications. Must be called,
+ if self.Modifications is edited!
+ """
+ self.Masses = [0]
+ Mass = 0
+ for Index in range(len(self.Aminos)):
+ Amino = self.Aminos[Index]
+ AminoMass = Global.AminoMass.get(Amino, None)
+ if AminoMass == None:
+ if Amino == "X":
+ print "** Warning: Peptide '%s' contains wild-card amino X, mass is probably wrong."%(self.Aminos)
+ AminoMass = 0
+ else:
+ raise ValueError, "Bad amino '%s' in peptide '%s'"%(Amino, self.Aminos)
+ Mass += AminoMass
+ Mass += Global.FixedMods.get(Amino, 0)
+ for Mod in self.Modifications.get(Index, []):
+ Mass += Mod.Mass
+ # Warn, but don't fail here. (The trick case: We generate tag GVQ instead of GVK,
+ # and biotin can't attach to Q. Bah!)
+ #if not Mod.Bases.has_key(Amino):
+ # print "Warning: Amino '%s' in peptide '%s' has illegal modification %s at %s"%(Amino, self.Aminos, Mod.Name, Index)
+ self.Masses.append(Mass)
+ def GetPTMCount(self):
+ Total = 0
+ for Key in self.Modifications.keys():
+ Total += len(self.Modifications[Key])
+ return Total
+ def GetFullModdedName(self):
+ return "%s.%s.%s"%(self.Prefix, self.GetModdedName(), self.Suffix)
+ def GetModdedName(self):
+ "Returns the amino sequence with modifications included, like this: EAM+16APK"
+ Str = ""
+ for Index in range(len(self.Aminos)):
+ Amino = self.Aminos[Index]
+ Str += "%s"%(Amino)
+ for Mod in self.Modifications.get(Index, []):
+ Str += "%s"%(Mod.Name[:4].lower())
+ return Str
+ def __str__(self):
+ return "<Peptide '%s'>"%self.Aminos
+ def IsValidTag(self, TagPeptide, Epsilon = 2.0):
+ """
+ Returns true if TagPeptide is a valid tag for this (full-length) peptide
+ """
+ TotalResidueMass = self.Masses[-1]
+ TagLength = len(TagPeptide.Aminos)
+ TagAminos = TagPeptide.Aminos.replace("I", "L").replace("Q", "K")
+ Aminos = self.Aminos.replace("I", "L").replace("Q", "K")
+ for Pos in range(len(self.Masses)):
+ PrefixMass = self.Masses[Pos]
+ # Check flanking mass:
+ if abs(PrefixMass - TagPeptide.PrefixMass) > Epsilon:
+ #print "Pos %s: Invalid (prefix %s vs %s)"%(Pos, PrefixMass, TagPeptide.PrefixMass)
+ continue
+ # Check amino acids:
+ if Aminos[Pos:Pos + TagLength] != TagAminos:
+ #print "Pos %s: Invalid (aminos %s vs %s)"%(Pos, Aminos[Pos:Pos + TagLength], TagAminos)
+ continue
+ # Check suffix mass:
+ SuffixMass = TotalResidueMass - self.Masses[Pos + TagLength]
+ if abs(SuffixMass - TagPeptide.SuffixMass) > Epsilon:
+ #print "Pos %s: Invalid (suffix %s vs %s)"%(Pos, SuffixMass, TagPeptide.SuffixMass)
+ continue
+ return 1
+        # No alignment position was consistent with the tag's aminos and flanking masses:
+        return 0
+        #Mass = TagPeptide.PrefixMass + TagPeptide.SuffixMass + GetMass(TagPeptide.Aminos)
+
+ def IsSame(self, OtherPeptide):
+ SubstDict = {"Q": "K", "I": "L"}
+ if len(self.Aminos) != len(OtherPeptide.Aminos):
+ return 0
+ for AminoIndex in range(len(self.Aminos)):
+ OurAmino = self.Aminos[AminoIndex]
+ TheirAmino = OtherPeptide.Aminos[AminoIndex]
+ OurMods = []
+ TheirMods = []
+ for Mod in self.Modifications.get(AminoIndex, []):
+ if Mod.Name[1:3] == "->":
+ OurAmino = Mod.Name[-1].upper()
+ else:
+ OurMods.append(Mod.Mass)
+ for Mod in OtherPeptide.Modifications.get(AminoIndex, []):
+ if Mod.Name[1:3] == "->":
+ TheirAmino = Mod.Name[-1].upper()
+ else:
+ TheirMods.append(Mod.Mass)
+ OurAmino = SubstDict.get(OurAmino, OurAmino)
+ TheirAmino = SubstDict.get(TheirAmino, TheirAmino)
+ if OurAmino != TheirAmino:
+ return 0
+ OurMods.sort()
+ TheirMods.sort()
+ if OurMods != TheirMods:
+ return 0
+ return 1
+ def __cmp__(self, OtherPeptide):
+ if (not isinstance(OtherPeptide, PeptideClass)):
+ return 1
+ # Sort by score, best to worst:
+ if self.Score > OtherPeptide.Score:
+ return -1
+ if self.Score < OtherPeptide.Score:
+ return 1
+ return 0
+ def GetNTT(self):
+ """
+ Returns the number of tryptic termini. (assumes self.prefix and self.suffix
+ are set)
+ """
+ NTT = 0
+ if self.Prefix in ("-*X"):
+ NTT += 1
+ elif (self.Prefix in ("KR")) and (self.Aminos[0] !="P"):
+ NTT += 1
+ if self.Suffix in ("-*X"):
+ NTT += 1
+ elif (self.Aminos[-1] in "KR") and (self.Suffix != "P"):
+ NTT += 1
+ return NTT
+ def IsFullyTryptic(self):
+ if self.Prefix in ("-", "*"):
+ pass
+ elif (self.Prefix in ("K", "R")) and self.Aminos[0] != "P":
+ pass
+ else:
+ return 0
+ if self.Suffix in ("-", "*"):
+ pass
+ elif self.Aminos[-1] in ("K", "R") and self.Suffix != "P":
+ pass
+ else:
+ return 0
+ return 1
+ def GetNiceAnnnotation(self):
+ """
+ Return an annotation suitable for a filename. *.ABC.D turns into -.ABC.D
+ """
+ Str = "%s.%s.%s"%(self.Prefix, self.Aminos, self.Suffix)
+ return Str.replace("*", "-")
+
+def GetPeptideFromModdedName(TagName):
+ """
+ Parse a tag with form like "ATphosQ", adding PTMs at the correct spots.
+ """
+ StringPos = 0
+ Peptide = PeptideClass()
+
+ # If the name has the form K.ABCDER.G, then strip off the prefix and suffix:
+ if len(TagName) > 4 and TagName[1] == "." and TagName[-2] == ".":
+ Peptide.Prefix = TagName[0]
+ Peptide.Suffix = TagName[-1]
+ TagName = TagName[2:-2]
+
+ try:
+ while (1):
+ if StringPos >= len(TagName):
+ break
+ if TagName[StringPos] in string.uppercase:
+ Peptide.Aminos += TagName[StringPos]
+ StringPos += 1
+ else:
+ # It's a modification:
+ ModName = ""
+ while (StringPos<len(TagName) and TagName[StringPos] not in string.uppercase) and len(ModName)<4:
+ if ModName and ModName[0] in ("-","+") and TagName[StringPos] not in "0123456789k":
+ break
+ ModName += TagName[StringPos]
+ StringPos += 1
+ Mod = Global.PTModByShortName.get(ModName)
+ if len(ModName)<2:
+ print "!???", TagName, ModName
+ if not Mod and ModName[-2]==">": #Mutation is annotated as "a->g", etc.
+ Mod = PTModClass(ModName)
+ Mod.Mass = Global.AminoMass[ModName[-1].upper()] - Global.AminoMass[ModName[0].upper()]
+ if not Mod and ModName[0] in ("-","+"):
+ ModName = ModName.replace("(","")
+ # Keep a cache of "mass mods":
+ ModMass = int(ModName)
+ Mod = MassPTMods.get(ModMass, None)
+ if not Mod:
+ Mod = PTModClass(ModName)
+ Mod.Mass = ModMass
+ MassPTMods[ModMass] = Mod
+ if Mod:
+ Pos = len(Peptide.Aminos) - 1
+ if not Peptide.Modifications.has_key(Pos):
+ Peptide.Modifications[Pos] = []
+ Peptide.Modifications[Pos].append(Mod)
+ else:
+ print "** Warning: Unknown mod '%s' in '%s'"%(ModName, TagName)
+ except:
+ print TagName
+ raise
+ Peptide.ComputeMasses()
+ return Peptide
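+# For example (assuming amino acid masses and PTM tables have been loaded via
+# Initialize()), GetPeptideFromModdedName("K.EAM+16APK.G") returns a peptide
+# with Prefix "K", Suffix "G", Aminos "EAMAPK" and a +16 mass modification on
+# the M at index 2, so GetModdedName() reproduces "EAM+16APK".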
+
+class AminoClass:
+ def __init__(self, Name, ShortName, Abbreviation, LeftMass, RightMass):
+ self.Name = Name # "Histidine"
+ self.ShortName = ShortName # "His"
+ self.Abbreviation = Abbreviation # "H"
+ self.LeftMass = LeftMass
+ self.RightMass = RightMass
+ self.RequiredModification = None
+
+def LoadAminoAcids():
+ """
+ Read in the masses of all amino acids.
+ Populate dictionaries AminoMass, AminoMassRight and list AminoMasses
+ """
+ FileName = ""
+ for path in sys.path:
+ FileName = os.path.join(path,"AminoAcidMasses.txt")
+ if os.path.exists(FileName):
+ #print FileName
+ break
+ else:
+ FileName = ""
+ if FileName == "":
+ print "Utils: Unable to open AminoAcidMasses.txt"
+ sys.exit(1)
+ File = open(FileName,'r')
+ for FileLine in File.xreadlines():
+ # Line is whitespace-delimited. Pieces are:
+ # Long, short, abbrev, left-mass, right-mass
+ # Example: "Glycine Gly G 57.02146 57.0520"
+ FileLine = FileLine.strip()
+ if FileLine[0] == "#":
+ continue
+ Bits = FileLine.split(" ")
+ if len(Bits)<5:
+ continue
+ LeftMass = float(Bits[3])
+ RightMass = float(Bits[4])
+ Global.AminoMass[Bits[2]] = LeftMass
+ Global.AminoMassRight[Bits[2]] = RightMass
+ Global.AminoMasses.append(LeftMass)
+ # Put the Amino object into Global.AminoAcids:
+ Amino = AminoClass(Bits[0], Bits[1], Bits[2], LeftMass, RightMass)
+ Global.AminoAcids[Amino.Abbreviation] = Amino
+ File.close()
+ Global.AminoMasses.sort()
+
+
+def DebugPrintPTMods():
+ Keys = Global.PTMods.keys()
+ Keys.sort()
+ print "--PTMods--"
+ for Key in Keys:
+ PTMod = Global.PTMods[Key]
+ BaseString = ""
+ for Base in PTMod.Bases.keys():
+ BaseString += Base
+ print " %s mass %s bases '%s'"%(PTMod.Name, PTMod.Mass, BaseString)
+ print "-----"
+
+class IonClass:
+ """
+ Each IonClass corresponds to an ion type, such as b or y-nh3.
+ Each spectral peak gives rise to one PRM peak for each ion type;
+ these PRM peaks remember their associated ion class
+ """
+ def __init__(self, Name):
+ self.Name = Name
+ self.Opposite = None
+ self.Charge = 1
+ self.Score = 1.0
+ def __str__(self):
+ return "<ion '%s'>"%self.Name
+ def GetPRMMass(self, Mass, ParentMass):
+ """
+ Returns the prm peak for a spectrum peak of the given mass. For instance,
+ for b ions, GetPRMMass() returns the peak mass minus 1. (Because the spectral peak
+ appears 1amu to the right of the actual prefix mass)
+ """
+ return None
+ def GetPeakMass(self, Mass, ParentMass):
+ """
+ Returns the peak for a PRM of the given mass. Inverse of GetPRMMass.
+ For instance, for b ions, GetPeakMass() returns the PRM plus 1.
+ """
+ return None
+
+
+
+AllIons = []
+Global.AllIonDict = {}
+def DefineIons():
+ """
+ Define all the ion types we care about.
+ (This function is repetitive, but easy enough to maintain since the zoo of ion types
+ is pretty small...the scores should be in a datafile, though!)
+ """
+ IonB = IonClass("b")
+ IonB.GetPeakMass = lambda L, P:L+1
+ IonB.GetPRMMass = lambda M, P:M-1
+ AllIons.append(IonB)
+ #
+ IonBH = IonClass("b-h2o")
+ IonBH.GetPeakMass = lambda L, P:L-17
+ IonBH.GetPRMMass = lambda M, P:M+17
+ AllIons.append(IonBH)
+ #
+ IonBN = IonClass("b-nh3")
+ IonBN.GetPeakMass = lambda L, P:L-16
+ IonBN.GetPRMMass = lambda M, P:M+16
+ AllIons.append(IonBN)
+ #
+ Ion = IonClass("b-h2o-h2o")
+ Ion.GetPeakMass = lambda L, P:L-17-18
+ Ion.GetPRMMass = lambda M, P:M+17+18
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("b-h2o-nh3")
+ Ion.GetPeakMass = lambda L, P:L-16-18
+ Ion.GetPRMMass = lambda M, P:M+16+18
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("b-p'")
+ Ion.GetPeakMass = lambda L, P:L-79
+ Ion.GetPRMMass = lambda M, P:M+79
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("b-p")
+ Ion.GetPeakMass = lambda L, P:L-97
+ Ion.GetPRMMass = lambda M, P:M+97
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("b-p-h2o")
+ Ion.GetPeakMass = lambda L, P:L-97-18
+ Ion.GetPRMMass = lambda M, P:M+97+18
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("b-p-nh3")
+ Ion.GetPeakMass = lambda L, P:L-97-17
+ Ion.GetPRMMass = lambda M, P: M+97+17
+ AllIons.append(Ion)
+ # for oxidized methionine:
+ Ion = IonClass("b-*")
+ Ion.GetPeakMass = lambda L, P:L-63
+ Ion.GetPRMMass = lambda M, P:M+63
+ AllIons.append(Ion)
+ #
+ IonY = IonClass("y")
+ IonY.GetPeakMass = lambda L, P:P-L
+ IonY.GetPRMMass = lambda M, P:P-M
+ AllIons.append(IonY)
+ #
+ IonYH = IonClass("y-h2o")
+ IonYH.GetPeakMass = lambda L, P:P-(L+18)
+ IonYH.GetPRMMass = lambda M, P:(P-M)-18
+ AllIons.append(IonYH)
+ #
+ IonYN = IonClass("y-nh3")
+ IonYN.GetPeakMass = lambda L, P:P-(L+17)
+ IonYN.GetPRMMass = lambda M, P:(P-M)-17
+ AllIons.append(IonYN)
+ #
+ Ion = IonClass("y-h2o-nh3")
+ Ion.GetPeakMass = lambda L, P:P-(L+17+18)
+ Ion.GetPRMMass = lambda M, P:(P-M)-17-18
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("y-h2o-h2o")
+ Ion.GetPeakMass = lambda L, P:P-(L+18+18)
+ Ion.GetPRMMass = lambda M, P:(P-M)-18-18
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("y-p'")
+ Ion.GetPeakMass = lambda L, P:(P-L)-80
+ Ion.GetPRMMass = lambda M, P:P-(M+80)
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("y-p")
+ Ion.GetPeakMass = lambda L, P:(P-L)-98
+ Ion.GetPRMMass = lambda M, P:P-(M+98)
+ AllIons.append(Ion)
+ # For oxidized methionine:
+ Ion = IonClass("y-*")
+ Ion.GetPeakMass = lambda L, P:(P-L)-64
+ Ion.GetPRMMass = lambda M, P:P-(M+64)
+ AllIons.append(Ion)
+ #
+ IonA = IonClass("a")
+ IonA.GetPeakMass = lambda L, P:L-27
+ IonA.GetPRMMass = lambda M,P:M+27
+ AllIons.append(IonA)
+ #
+ IonAN = IonClass("a-nh3")
+ IonAN.GetPeakMass = lambda L, P:L-27-17
+ IonAN.GetPRMMass = lambda M,P:M+27+17
+ AllIons.append(IonAN)
+ #
+ IonAH = IonClass("a-h2o")
+ IonAH.GetPeakMass = lambda L, P:L-27-18
+ IonAH.GetPRMMass = lambda M,P:M+27+18
+ AllIons.append(IonAH)
+ #
+ Ion = IonClass("b2")
+ Ion.GetPeakMass = lambda L,P:(L/2)+1
+ Ion.GetPRMMass = lambda M,P:(M-1)*2
+ Ion.Charge = 2
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("b2-h2o")
+ Ion.GetPeakMass = lambda L,P:(L/2)+1 - 9
+ Ion.GetPRMMass = lambda M,P:(M-1)*2 + 18
+ Ion.Charge = 2
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("b2-nh3")
+ Ion.GetPeakMass = lambda L,P:(L/2)+1 - 8.5
+ Ion.GetPRMMass = lambda M,P:(M-1)*2 + 17
+ Ion.Charge = 2
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("b2-nh3-h2o")
+ Ion.GetPeakMass = lambda L,P:(L/2)+1 - 17.5
+ Ion.GetPRMMass = lambda M,P:(M-1)*2 + 35
+ Ion.Charge = 2
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("b2-p")
+ Ion.GetPeakMass = lambda L,P:(L/2)+1 - 49
+ Ion.GetPRMMass = lambda M,P:(M-1)*2 + 98
+ Ion.Charge = 2
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("y2")
+ Ion.GetPeakMass = lambda L,P:(P-L+1.0078)/2
+ Ion.GetPRMMass = lambda M,P:P - (M*2 - 1.0078)
+ Ion.Charge = 2
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("y2-h2o")
+ Ion.GetPeakMass = lambda L,P:(P-L+1.0078 - 18)/2
+ Ion.GetPRMMass = lambda M,P:P - (M*2 - 1.0078 + 18)
+ Ion.Charge = 2
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("y2-nh3")
+ Ion.GetPeakMass = lambda L,P:(P-L+1.0078 - 17)/2
+ Ion.GetPRMMass = lambda M,P:P - (M*2 - 1.0078 + 17)
+ Ion.Charge = 2
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("y2-nh3-h2o")
+ Ion.GetPeakMass = lambda L,P:(P-L+1.0078 - 17 - 18)/2
+ Ion.GetPRMMass = lambda M,P:P - (M*2 - 1.0078 + 17 + 18)
+ Ion.Charge = 2
+ AllIons.append(Ion)
+ #
+ Ion = IonClass("y2-p")
+ Ion.GetPeakMass = lambda L,P:(P-L+1.0078 - 98)/2
+ Ion.GetPRMMass= lambda M,P:P - (M*2 - 1.0078 + 98)
+ Ion.Charge =2
+ AllIons.append(Ion)
+ # For oxidized methionine:
+ Ion = IonClass("y2-*")
+ Ion.GetPeakMass = lambda L,P:(P-L+1.0078 - 64)/2
+ Ion.GetPRMMass = lambda M,P:P - (M*2 - 1.0078 + 64)
+ Ion.Charge = 2
+ AllIons.append(Ion)
+ #
+
+ for Ion in AllIons:
+ Global.AllIonDict[Ion.Name] = Ion
+
+def GetMass(Str):
+ "Return the mass of a string of amino acids. Useful in interactive mode."
+ Mass = 0
+ for Char in Str:
+ Mass += Global.AminoMass[Char]
+ Mass += Global.FixedMods.get(Char, 0)
+ return Mass
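+
+# Example of interactive use (assuming this module is imported as Utils and
+# Initialize() has been called, so the standard monoisotopic residue masses are
+# loaded and no fixed modifications are configured):
+#   >>> Utils.GetMass("PEPTIDE")
+#   781.349...   # sum of residue masses only; water and proton not included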
+
+
+
+def GetIsotopePatterns():
+ Global.IsotopeWeights = {}
+ FileName = ""
+ for path in sys.path:
+ FileName = os.path.join(path,"IsotopePatterns.txt")
+ if os.path.exists(FileName):
+ #print FileName
+ break
+ else:
+ FileName = ""
+ if FileName == "":
+ print "Utils: Unable to open IsotopePatterns.txt"
+ sys.exit(1)
+ File = open(FileName,'r')
+ for FileLine in File.xreadlines():
+ Bits = FileLine.split("\t")
+ if len(Bits) < 2:
+ continue
+        Global.IsotopeWeights[int(Bits[0])] = float(Bits[1])
+    File.close()
+
+
+INITIALIZED = 0
+DummyIon = None
+def Initialize():
+ global INITIALIZED
+ global DummyIon
+ if INITIALIZED:
+ return
+ DefineIons()
+
+ # dummy ion type, for the spectral edge peaks we put at mass 0 and at parent-mass:
+ DummyIon = IonClass("")
+ DummyIon.GetPeakMass = lambda L,P:L
+ DummyIon.GetPRMMass = lambda M,P:M
+
+ # Do this initialization once, up front:
+ LoadAminoAcids()
+ LoadPTMods()
+ LoadModifications()
+ GetIsotopePatterns()
+ INITIALIZED = 1
+
+#SAME AS INITIALIZE, BUT SPECIFY DIRECTORY FOR FILES
+
+def InitializeNonInspect(ResourceDir):
+ global INITIALIZED
+ global DummyIon
+ if INITIALIZED:
+ return
+ DefineIons()
+
+
+ # dummy ion type, for the spectral edge peaks we put at mass 0 and at parent-mass:
+ DummyIon = IonClass("")
+ DummyIon.GetPeakMass = lambda L,P:L
+ DummyIon.GetPRMMass = lambda M,P:M
+
+ # Do this initialization once, up front:
+ LoadAminoAcidsNonInspect(ResourceDir)
+ LoadPTModsNonInspect(ResourceDir)
+ LoadModificationsNonInspect(ResourceDir)
+ GetIsotopePatternsNonInspect(ResourceDir)
+ INITIALIZED = 1
+
+def LoadAminoAcidsNonInspect(ResourceDir):
+ """
+ Read in the masses of all amino acids.
+ Populate dictionaries AminoMass, AminoMassRight and list AminoMasses
+ """
+ File = open(os.path.join(ResourceDir,"AminoAcidMasses.txt"),"r")
+ for FileLine in File.xreadlines():
+ # Line is whitespace-delimited. Pieces are:
+ # Long, short, abbrev, left-mass, right-mass
+ # Example: "Glycine Gly G 57.02146 57.0520"
+ FileLine = FileLine.strip()
+        if (not FileLine) or FileLine[0] == "#":
+ continue
+ Bits = FileLine.split(" ")
+ if len(Bits)<5:
+ continue
+ LeftMass = float(Bits[3])
+ RightMass = float(Bits[4])
+ Global.AminoMass[Bits[2]] = LeftMass
+ Global.AminoMassRight[Bits[2]] = RightMass
+ Global.AminoMasses.append(LeftMass)
+ # Put the Amino object into Global.AminoAcids:
+ Amino = AminoClass(Bits[0], Bits[1], Bits[2], LeftMass, RightMass)
+ Global.AminoAcids[Amino.Abbreviation] = Amino
+ File.close()
+ Global.AminoMasses.sort()
+
+def LoadPTModsNonInspect(ResourceDir):
+ """
+ Read the definitions of post-translational modifications from PTMods.txt.
+ Line format is tab-delimited, like this: "Alkylation 14.01564 CKRHDENQ"
+ (This is rarely used in practice, but is useful for search results that are annotated with names instead of masses)
+ """
+ File = open(os.path.join(ResourceDir,"PTMods.txt"),"r")
+ for FileLine in File.xreadlines():
+ FileLine = FileLine.strip()
+ if (not FileLine) or FileLine[0]=="#":
+ continue
+ Bits = FileLine.split("\t")
+ PTMod = PTModClass(Bits[0])
+ PTMod.Mass = float(Bits[1])
+ PTMod.BaseString = Bits[2]
+ for Char in Bits[2]:
+ PTMod.Bases[Char] = 1
+ Global.PTMods[PTMod.Name.lower()] = PTMod
+ Global.PTModByShortName[PTMod.Name[:3].lower()] = PTMod
+ Global.PTModByShortName[PTMod.Name[:4].lower()] = PTMod
+ Global.PTModList.append(PTMod)
+ File.close()
+
+def LoadModificationsNonInspect(ResourceDir):
+ """
+ This method reads in two files: InVivoModifications.txt and InVitroModifications.txt
+ It makes a ModificationTypeObject out of each mod listed in the files
+    (except fixed mods). These input files are expected to contain lines of the form
+        mod,DELTA_MASS,AMINOS,POSITION<TAB>#Modification name
+    for example: mod,14,KR<TAB>#methylation
+ """
+ LoadModificationsFromFile(os.path.join(ResourceDir,"InVivoModifications.txt"), Global.InVivoMods, "InVivo")
+ LoadModificationsFromFile(os.path.join(ResourceDir,"InVitroModifications.txt"), Global.InVitroMods, "InVitro")
+
+def GetIsotopePatternsNonInspect(ResourceDir):
+ Global.IsotopeWeights = {}
+ File = open(os.path.join(ResourceDir,"IsotopePatterns.txt"), "r")
+ for FileLine in File.xreadlines():
+ Bits = FileLine.split("\t")
+ if len(Bits) < 2:
+ continue
+        Global.IsotopeWeights[int(Bits[0])] = float(Bits[1])
+    File.close()
+
+def MakeDirectory(Dir):
+ if os.path.exists(Dir):
+ return
+ try:
+ os.makedirs(Dir)
+ except:
+ raise
+
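+# Minimal usage sketch (hypothetical resource path; assumes this module is
+# imported as Utils from a script that keeps Inspect's resource files in a
+# separate directory):
+#   import Utils
+#   Utils.InitializeNonInspect("/path/to/inspect/resources")
+#   # Global.AminoMass, Global.PTMods, Global.IsotopeWeights etc. are now loaded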
diff --git a/base64.c b/base64.c
new file mode 100644
index 0000000..2f53e49
--- /dev/null
+++ b/base64.c
@@ -0,0 +1,217 @@
+// downloaded from web
+#include "CMemLeak.h"
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include "base64.h"
+
+
+int GetPosition(char buf);
+
+static const unsigned char *b64_tbl = (const unsigned char*) "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+static const unsigned char b64_pad = '=';
+
+// base64 encode a group of between 1 and 3 input chars into a group of 4 output chars
+static void encode_group(const unsigned char input[], unsigned char output[], int n)
+{
+ unsigned char ingrp[3];
+
+ ingrp[0] = n > 0 ? input[0] : 0;
+ ingrp[1] = n > 1 ? input[1] : 0;
+ ingrp[2] = n > 2 ? input[2] : 0;
+
+ // upper 6 bits of ingrp[0]
+ output[0] = n > 0 ? b64_tbl[ingrp[0] >> 2] : b64_pad;
+
+ // lower 2 bits of ingrp[0] | upper 4 bits of ingrp[1]
+ output[1] = n > 0 ? b64_tbl[((ingrp[0] & 0x3) << 4) | (ingrp[1] >> 4)] : b64_pad;
+
+ // lower 4 bits of ingrp[1] | upper 2 bits of ingrp[2]
+ output[2] = n > 1 ? b64_tbl[((ingrp[1] & 0xf) << 2) | (ingrp[2] >> 6)] : b64_pad;
+
+ // lower 6 bits of ingrp[2]
+ output[3] = n > 2 ? b64_tbl[ingrp[2] & 0x3f] : b64_pad;
+
+}
+
+// base64 decode a group of 4 input chars into a group of between 0 and
+// 3 output chars
+static void decode_group(const unsigned char input[], unsigned char output[], int* n)
+{
+ unsigned char* t1;
+ unsigned char* t2;
+ *n = 0;
+
+ if (input[0] == '=')
+ {
+ return;
+ }
+
+ t1 = (unsigned char*)strchr((const char*)b64_tbl, input[0]);
+ t2 = (unsigned char*)strchr((const char*)b64_tbl, input[1]);
+
+ output[(*n)++] = ((t1 - b64_tbl) << 2) | ((t2 - b64_tbl) >> 4);
+
+ if (input[2] == '=')
+ {
+ return;
+ }
+
+ t1 = (unsigned char*) strchr ((const char*)b64_tbl, input[2]);
+
+ output[(*n)++] = ((t2 - b64_tbl) << 4) | ((t1 - b64_tbl) >> 2);
+
+ if (input[3] == '=')
+ return;
+
+ t2 = (unsigned char*) strchr ((const char*)b64_tbl, input[3]);
+
+ output[(*n)++] = ((t1 - b64_tbl) << 6) | (t2 - b64_tbl);
+
+ return;
+}
+
+int GetPosition(char buf)
+{
+
+ if (buf > 96) // [a-z]
+ {
+ return (buf - 71);
+ }
+ else if (buf > 64) // [A-Z]
+ {
+ return (buf - 65);
+ }
+ else if (buf > 47) // [0-9]
+ {
+ return (buf + 4);
+ }
+    else if (buf == 43) // '+' is index 62 in b64_tbl
+    {
+        return 62;
+    }
+    else // buf == '/' is index 63 in b64_tbl
+    {
+        return 63;
+    }
+}
+
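+// Decode a NUL-terminated base64 string into raw bytes. Each 4-character input
+// block yields up to 3 output bytes; the alphabet lookup is inlined below as
+// arithmetic on the character codes, and decoding stops at the first '=' pad.
+// The caller must supply a dest buffer of at least (3 * strlen(src)) / 4 bytes;
+// no terminator is written and no output length is returned.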
+void b64_decode_mio(char* src, char* dest)
+{
+ char* temp;
+ int BlockCount = 0;
+
+ temp = dest;
+
+ while (*src)
+ {
+
+ int register a;
+ int register b;
+ int t1,t2,t3,t4;
+ //printf("Block %d: '%c%c%c%c'\n", BlockCount, src[0], src[1], src[2], src[3]);
+ BlockCount++;
+ t1 = src[0];
+ t2 = src[1];
+ t3 = src[2];
+ t4 = src[3];
+
+ if (t1 == 61) // if == '='
+ {
+ return;
+ }
+ if (t1 > 96) // [a-z]
+ {
+ a = (t1 - 71);
+ }
+ else if (t1 > 64) // [A-Z]
+ {
+ a = (t1 - 65);
+ }
+ else if (t1 > 47) // [0-9]
+ {
+ a = (t1 + 4);
+ }
+ else if (t1 == 43)
+ {
+ a = 62;
+ }
+ else // src[0] == '/'
+ {
+ a = 63;
+ }
+ if (t2 > 96) // [a-z]
+ {
+ b = (t2 - 71);
+ }
+ else if (t2 > 64) // [A-Z]
+ {
+ b = (t2 - 65);
+ }
+ else if (t2 > 47) // [0-9]
+ {
+ b = (t2 + 4);
+ }
+ else if (t2 == 43)
+ {
+ b = 62;
+ }
+ else // src[0] == '/'
+ {
+ b = 63;
+ }
+ *temp++ = (a << 2) | (b >> 4);
+ if (t3 == 61)
+ {
+ return;
+ }
+ if (t3 > 96) // [a-z]
+ {
+ a = (t3 - 71);
+ }
+ else if (t3 > 64) // [A-Z]
+ {
+ a = (t3 - 65);
+ }
+ else if (t3 > 47) // [0-9]
+ {
+ a = (t3 + 4);
+ }
+ else if (t3 == 43)
+ {
+ a = 62;
+ }
+ else // src[0] == '/'
+ {
+ a = 63;
+ }
+ *temp++ = (b << 4) | (a >> 2);
+ if (t4 == 61)
+ {
+ return;
+ }
+
+ if (t4 > 96) // [a-z]
+ {
+ b = (t4 - 71);
+ }
+ else if (t4 > 64) // [A-Z]
+ {
+ b = (t4 - 65);
+ }
+ else if (t4 > 47) // [0-9]
+ {
+ b = (t4 + 4);
+ }
+ else if (t4 == 43)
+ {
+ b = 62;
+ }
+ else // src[0] == '/'
+ {
+ b = 63;
+ }
+ *temp++ = ( a << 6) | ( b );
+ src += 4;
+ }
+}
diff --git a/base64.h b/base64.h
new file mode 100644
index 0000000..0d11f94
--- /dev/null
+++ b/base64.h
@@ -0,0 +1,6 @@
+#ifndef BASE64_H
+#define BASE64_H
+
+void b64_decode_mio(char* src, char* dest); /* source string first, destination buffer second, matching the definition in base64.c */
+
+#endif // BASE64_H
diff --git a/docs/Analysis.html b/docs/Analysis.html
new file mode 100644
index 0000000..238aa6a
--- /dev/null
+++ b/docs/Analysis.html
@@ -0,0 +1,79 @@
+<h1>Inspect: A Proteomics Search Toolkit</h1>
+Copyright 2007, The Regents of the University of California
+<h3>Table of Contents</h3>
+<li><a href="index.html">Overview</a>
+<li><a href="Copyright.html">Copyright information</a>
+<li><a href="Installation.html">Installation</a>
+<li><a href="Database.html">Database</a>
+<li><a href="Searching.html">Searching</a>
+<li><a href="Analysis.html">Analysis</a>
+<li><a href="InspectTutorial.pdf">Basic Tutorial</a>
+<li><a href="InspectAdvancedTutorial.pdf">Advanced Tutorial</a>
+<li><a href="UnrestrictedSearchTutorial.pdf">Unrestricted Search Tutorial</a>
+<hr>
+<h2>Analysis</h2>
+Inspect writes search results to a tab-delimited file. Up to ten search hits are written for each spectrum,
+but typically all but the first can be discarded.
+<br><br>
+The quality of each match can be determined by the F-score. The F-score is a weighted sum of two factors. First is the
+MQScore, or match quality score (in column 6). Second
+is the delta-score (in column 14), the difference in MQScore between this match and the best alternative.
+Because delta-score is highly dependent on database size and search parameters, Inspect takes the ratio of
+the delta-score to the average delta-score for all top-scoring matches.
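+<br><br>
+As a rough illustration of the combination described above (the weights shown here are
+placeholders, not Inspect's trained coefficients):
+<pre>
+# Hypothetical sketch of an F-score-style combination (Python)
+def FScore(MQScore, DeltaScore, AverageDeltaScore, W1 = 1.0, W2 = 1.0):
+    # Normalize the delta-score by the run-wide average, then take a weighted sum.
+    NormalizedDelta = DeltaScore / max(AverageDeltaScore, 1e-6)
+    return W1 * MQScore + W2 * NormalizedDelta
+</pre>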
+<br><br>
+The preferred method to compute the false discovery rate (FDR) for a collection of matches is to employ a decoy
+database. This method requires you to generate shuffled protein records before searching, using the "ShuffleDB" script
+(see the Database section for details). Then, run the ComputeFDR.jar script to compute the empirical false discovery
+rate for a given f-score cutoff.
+<br><br>
+As of January 3, 2012, the columns have been updated slightly. Below is a list of all the columns and their meaning:
+<li>SpectrumFile - The file searched
+<li>Scan# - The scan number within the file; this value is 0 for .dta files. For MGF files, the scan# is equivalent to the SpecIndex, but uses 0-based numbering.
+<li>Annotation - Peptide annotation, with prefix and suffix and (non-fixed) modifications indicated.
+Example: K.DFSQIDNAP+16EER.E
+<li>Protein - The name of the protein this peptide comes from. (Protein names are stored to the .index file
+corresponding to the database .trie file)
+<li>Charge - Precursor charge. If "multicharge" is set, or if no charge is specified in the source file, Inspect
+attempts to guess the charge.
+<li>MQScore - Match quality score, the main measure of match quality.
+<li>Length - The length of the matched peptide in amino acids.</li>
+<li>TotalPRMScore - Summed score for break points (between amino acids), based upon a Bayesian network modeling
+fragmentation propensities
+<li>MedianPRMScore - Median score for break points.</li>
+<li>FractionY - The fraction of charge 1 y ions detected</li>
+<li>FractionB - The fraction of charge 1 b ions detected</li>
+<li>Intensity - Fraction of high-intensity peaks which are b or y fragments. For a length-n peptide, the top n*3
+peaks are considered.
+<li>NTT - Number of tryptic termini (or Unused, if no protease was specified). Note that the N- and C-terminus of
+a protein are both considered to be valid termini.
+<li>InspectFDR - This is the FDR of all matches with F-score equal to or greater than this match. Since Inspect knows
+nothing about a decoy database, it is often best to run ComputeFDR.jar to compute an empirical FDR.
+<li>DeltaScore - Difference between the MQScore of this match and the best alternative
+<li>DeltaScoreOther - Difference between the MQScore of this match and the best alternative from a different locus.
+To see the difference between this and the previous column, consider a search that finds similar matches
+of the form "M+16MALGEER" and "MM+16ALGEER". In such a case, DeltaScore would be very small, but DeltaScoreOther
+might still be large.
+<li>RecordNumber - Index of the protein record in the database
+<li>DBFilePos - Byte-position of this match within the database
+<li>SpecFilePos - Offset, in the input file, of this spectrum; useful for passing to the "Label" script (see below)
+<li>PrecursorMZ - The precursor m/z given in the spectrum file.
+<li>PrecursorError - The difference (in m/z units) between the precursor m/z given in the file and the theoretical m/z of the identified peptide.
+<li>SpecIndex - The one-based index of the spectrum in the original spectrum file. Only MS2+ spectra are counted.</li>
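+<br><br>
+A minimal sketch of reading the tab-delimited output in Python. The indices below are
+taken from the order of the column list above (zero-based); verify them against the
+header line of your own output file before relying on them:
+<pre>
+# Illustrative parsing sketch; lines beginning with "#" are treated as comments.
+for Line in open("InspectResult.out"):
+    if Line.startswith("#"):
+        continue
+    Bits = Line.rstrip("\r\n").split("\t")
+    Annotation = Bits[2]       # 3rd column: Annotation
+    MQScore = float(Bits[5])   # 6th column: MQScore
+</pre>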
+<hr>
+<h2>Post-processing</h2>
+Python scripts for performing various analyses are included in the distribution.
+<b>Run a script with no command-line parameters to print a list of available arguments.</b><br>
+<li>Label.py - Given a spectrum and a peptide annotation, label the spectrum peaks with
+their associated fragments. Produces a .png image for a spectrum, with associated peptide interpretation. Requires
+the Python Imaging Library (PIL). Sample command:
+<br> <tt>Label.py Shewanella.mzXML:6200392 R.SNGSIGQNQ+14TPGR.V</tt>
+<li>ComputeFDR.jar - Given Inspect output, filter to a user-determined FDR. ComputeFDR.jar can be used for many experiments, but typical use for Inspect results would be:</li>
+<pre> java -jar ComputeFDR.jar -f InspectResult.out 3 XXX -n 1 -p 2 -s 14 1 -fdr 0.01</pre>
+<li>Summary.py - Given Inspect output, produce an html-format summary of the results. The report provides
+a "protein-level" look at the results. This script is also used when
+producing a "second-pass" protein database, containing the proteins identified with high confidence.
+<li>PTMAnalysis.py - This script examines output from MS-Alignment (Inspect run in "blind" mode), and
+highlights the most plausible evidence for PTMs. The script iteratively selects the most common
+post-translational modifications, and reports the selections. These selections require manual curation
+and/or validation.
+<hr>
diff --git a/docs/Copyright.html b/docs/Copyright.html
new file mode 100644
index 0000000..9930020
--- /dev/null
+++ b/docs/Copyright.html
@@ -0,0 +1,47 @@
+<h1>Inspect: A Proteomics Search Toolkit</h1>
+Copyright 2007, The Regents of the University of California
+<h3>Table of Contents</h3>
+<li><a href="index.html">Overview</a>
+<li><a href="Copyright.html">Copyright information</a>
+<li><a href="Installation.html">Installation</a>
+<li><a href="Database.html">Database</a>
+<li><a href="Searching.html">Searching</a>
+<li><a href="Analysis.html">Analysis</a>
+<li><a href="InspectTutorial.pdf">Basic Tutorial</a>
+<li><a href="InspectAdvancedTutorial.pdf">Advanced Tutorial</a>
+<li><a href="UnrestrictedSearchTutorial.pdf">Unrestricted Search Tutorial</a>
+
+<hr>
+<pre>
+Copyright information:
+----------------------
+
+Copyright 2007,2008,2009 The Regents of the University of California
+All Rights Reserved
+
+Permission to use, copy, modify and distribute any part of this
+program for educational, research and non-profit purposes, by non-profit
+institutions only, without fee, and without a written agreement is hereby
+granted, provided that the above copyright notice, this paragraph and
+the following three paragraphs appear in all copies.
+
+Those desiring to incorporate this work into commercial
+products or use for commercial purposes should contact the Technology
+Transfer & Intellectual Property Services, University of California,
+San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+
+IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+OF SUCH DAMAGE.
+
+THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+</pre>
diff --git a/docs/Database.html b/docs/Database.html
new file mode 100644
index 0000000..f953c6e
--- /dev/null
+++ b/docs/Database.html
@@ -0,0 +1,78 @@
+<h1>Inspect: A Proteomics Search Toolkit</h1>
+Copyright 2007, The Regents of the University of California
+<h3>Table of Contents</h3>
+<li><a href="index.html">Overview</a>
+<li><a href="Copyright.html">Copyright information</a>
+<li><a href="Installation.html">Installation</a>
+<li><a href="Database.html">Database</a>
+<li><a href="Searching.html">Searching</a>
+<li><a href="Analysis.html">Analysis</a>
+<li><a href="InspectTutorial.pdf">Basic Tutorial</a>
+<li><a href="InspectAdvancedTutorial.pdf">Advanced Tutorial</a>
+<li><a href="UnrestrictedSearchTutorial.pdf">Unrestricted Search Tutorial</a>
+
+<hr>
+<h1>Overview</h1>
+Inspect requires a database (a file of protein sequences) in order to interpret spectra. You
+can specify one or more databases in the Inspect input file. Databases can be stored in one
+of two formats: A .trie file (bare-bones format with sequence data only), or a .ms2db file
+(simple XML format with exon linkage information). These two formats are discussed below.
+
+<h1>Sequence Databases (FASTA)</h1>
+For efficiency reasons, Inspect processes FASTA files into its own internal format before
+searching. A database is stored as two files, one with the extension ".trie" (which holds peptide sequences),
+and one with the extension ".index" (which holds protein names and other meta-data). To prepare
+the database, first copy the protein sequences of interest into a FASTA file in the Database
+subdirectory. Then, from the Inspect directory, run the Python script PrepDB.py as follows:<br>
+ <tt><b>python PrepDB.py FASTA MyStuff.fasta</b></tt><br>
+Replace "MyStuff.fasta" with the name of your FASTA database. After PrepDB has run, the database
+files MyStuff.trie and MyStuff.index will be ready to search. PrepDB.py also handles
+Swiss-prot ".dat file" format as input.
+<br><br>
+Inspect can perform this processing automatically
+(see the "SequenceFile" option in the <a href="Searching.html">searching</a> documentation). Running
+PrepDB.py is the preferred method since it creates a database file which can be re-used by many
+searches.
+<br><br>
+<b>Note:</b> The database should include all proteins known to be in the sample, otherwise some spectra
+will receive incorrect (and possibly misleading) annotations. In particular, most databases should
+include trypsin (used to digest proteins) and human keratins (introduced during sample processing).
+The file "CommonContaminants.fasta", in the Inspect directory, contains several protein sequences you
+can append to your database.
+<br><br>
+<h1>Decoy records (ShuffleDB)</h1>
+Databases including "decoy proteins" (shuffled or reversed sequences) are emerging as the
+gold standard for computing false discovery rates. Inspect can compute p-values in two
+ways:
+ <li>Compute the empirical false discovery rate by counting the number of hits to
+ invalid proteins. <b>This is the recommended method.</b> Given an f-score cutoff,
+    Inspect computes the number of shuffled-protein hits above that threshold - these hits
+ are all invalid. Inspect
+ then estimates the number of invalid hits which happen to fall within valid proteins.
+ This count provides an empirical false discovery rate, which is reported as the
+ "p-value".
+ <li>By fitting the distribution of F-scores as a mixture model, in the manner of
+    PeptideProphet. This is how the initial p-values output by Inspect are computed.
+ Use PValue.py <b>without</b> the "-S" option to compute p-values using this method.
+<br><br>
+To compute empirical false discovery rates:
+<li> Use the script ShuffleDB.py to append decoy records to a database before searching. Decoy records have the
+flag "XXX" prefixed to their name.
+<li>After searching, use the script PValue.py (including the "-S" option) to carry out this analysis.
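+<br><br>
+For intuition, the decoy-counting estimate can be sketched in a few lines of Python.
+This illustrates the idea only and is not the exact calculation performed by PValue.py;
+it assumes the decoy portion of the database is the same size as the target portion and
+that decoy proteins carry the "XXX" name prefix:
+<pre>
+def EstimateFDR(Matches, ScoreCutoff):
+    # Matches is a list of (ProteinName, FScore) pairs, one per spectrum.
+    Accepted = [(Name, Score) for (Name, Score) in Matches if Score >= ScoreCutoff]
+    DecoyCount = len([Name for (Name, Score) in Accepted if Name.startswith("XXX")])
+    TargetCount = len(Accepted) - DecoyCount
+    # Each decoy hit suggests roughly one invalid hit hiding among the target hits:
+    return min(1.0, DecoyCount / float(max(TargetCount, 1)))
+</pre>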
+
+<h1>MS2DB Format</h1>
+The MS2DB file format is a simple, extensible XML format for storing proteins. The main benefits of
+using MS2DB format instead of FASTA files are:
+ <li>Reduced redundancy - Each exon is stored once, and only once
+ <li>Splice information - All isoforms (and sequence variants) corresponding to a locus are grouped
+ as one Gene, which reduces the usual confusion between proteins and records.
+ <li>Site-specific modifications - Known modifications, such as phosphorylation, can be
+ explicitly indicated. Considering these site-specific modifications is much cheaper than
+ a search that attempts to discover new modifications.
+ <li>Rich annotations - The format has places to store information such as accession numbers from
+ sequence repositories, species name, etc.
+<br><br>
+You can use the script BuildMS2DB.jar to generate an MS2DB file. As input, you will need:
+<li>One or more files in GFF3 format containing exon predictions
+<li>A FASTA file containing the sequences on which the exons are predicted</li>
+For more details on using BuildMS2DB.jar (and MS2DBShuffler.jar for building a decoy database), please read the information on proteogenomics found <a href="http://cseweb.ucsd.edu/~ncastell/Maize">here</a>.
\ No newline at end of file
diff --git a/docs/InspectTutorial.pdf b/docs/InspectTutorial.pdf
new file mode 100644
index 0000000..66b7ec2
Binary files /dev/null and b/docs/InspectTutorial.pdf differ
diff --git a/docs/Installation.html b/docs/Installation.html
new file mode 100644
index 0000000..3f5c0ff
--- /dev/null
+++ b/docs/Installation.html
@@ -0,0 +1,42 @@
+<h1>Inspect: A Proteomics Search Toolkit</h1>
+Copyright 2007, The Regents of the University of California
+<h3>Table of Contents</h3>
+<li><a href="index.html">Overview</a>
+<li><a href="Copyright.html">Copyright information</a>
+<li><a href="Installation.html">Installation</a>
+<li><a href="Database.html">Database</a>
+<li><a href="Searching.html">Searching</a>
+<li><a href="Analysis.html">Analysis</a>
+<li><a href="InspectTutorial.pdf">Basic Tutorial</a>
+<li><a href="InspectAdvancedTutorial.pdf">Advanced Tutorial</a>
+<li><a href="UnrestrictedSearchTutorial.pdf">Unrestricted Search Tutorial</a>
+
+<hr>
+<h1>Installation</h1>
+To install Inspect, first unzip the source distribution. Depending on your platform, you may need to compile it:
+<li> On Windows, use the included executable
+inspect.exe. You can build the source code on Windows using the makefile, or
+the included Visual Studio project.
+<li> On Linux, first install <a href="http://sourceforge.net/projects/expat/">expat</a>. Then,
+build Inspect using the included makefile.
+<li> Macintosh is not officially supported. Follow the instructions for Linux; they are likely to work.
+<br><br>
+Other things to do while you're installing:
+<li> Inspect requires <a href="http://www.python.org">Python</a> (version 2.1 or later) in order
+to run various analysis and utility scripts.
+<li> (Optional) You may wish to
+install the <a href="http://www.pythonware.com/products/pil/">Python Imaging Library (PIL)</a>
+for generation of simple spectrum images.
+<li> (Optional) If the <a href="http://psyco.sourceforge.net/">psyco</a>
+library is installed, it is automatically loaded to speed up analysis scripts; this is entirely
+optional.
+<li> (Recommended) The Python numeric library (numpy) is required for some analysis scripts.
+<li>The distribution includes some system tests. After installing, go to the Inspect directory, and
+run them to be sure that things are installed properly:<br>
+<tt><b>python SystemTest.py</b></tt><br>
+After the run completes, any errors will be reported. Files used by the system tests are stored
+in the "SystemTest" subdirectory. The test input file "TestInput.txt" is annotated
+with comments, and you can refer to it (or copy and modify it) when starting up searches.
+<br><br>
+If the tests fail, please <a href="mailto:spayne at ucsd.edu">submit a bug report</a> (and include
+any relevant-looking error messages).
diff --git a/docs/MS2DB.html b/docs/MS2DB.html
new file mode 100644
index 0000000..f89035e
--- /dev/null
+++ b/docs/MS2DB.html
@@ -0,0 +1,51 @@
+<h1>Inspect: A Proteomics Search Toolkit</h1>
+Copyright 2007, The Regents of the University of California
+<h3>Table of Contents</h3>
+<li><a href="index.html">Overview</a>
+<li><a href="Copyright.html">Copyright information</a>
+<li><a href="Installation.html">Installation</a>
+<li><a href="Database.html">Database</a>
+<li><a href="Searching.html">Searching</a>
+<li><a href="Analysis.html">Analysis</a>
+<li><a href="InspectTutorial.pdf">Basic Tutorial</a>
+<li><a href="InspectAdvancedTutorial.pdf">Advanced Tutorial</a>
+<li><a href="UnrestrictedSearchTutorial.pdf">Unrestricted Search Tutorial</a>
+
+<hr>
+
+MS2DB is a relatively straightforward file format. For now, the documentation is limited to
+an example.
+<br><br>
+<b>TODO:</b> Document all available tags and attributes.
+
+<h3>Abbreviated example</h3>
+<pre>
+<Database>
+<Gene ExonCount="14" Chromosome="chr1" ForwardFlag="1" Name="At1g02100">
+ <Exon Index="0" Start="389875" End="389943">
+ <ExonSequence Length="22">MAESRSNRAAVQATNDDASASK</ExonSequence>
+ </Exon>
+ <Exon Index="1" Start="390036" End="390250">
+ <ExonSequence Length="71">SCVKKGYMKDDYVHLFVKRPVRRSPIINRGYFSRWAAFRKLMSQFLLSGTSSKKQILSLGAGFDTTYFQLL</ExonSequence>
+ <LinkFrom Index="0" AA="L" />
+ </Exon>
+
+[.......]
+
+ <Exon Index="12" Start="392261" End="392300">
+ <ExonSequence Length="13">EHYCVTYAVNDAM</ExonSequence>
+ <LinkFrom Index="9" />
+ </Exon>
+ <Exon Index="13" Start="392373" End="392448">
+ <ExonSequence Length="25">GIFGDFGFTREGGGERMSSSASSPX</ExonSequence>
+ <LinkFrom Index="12" />
+ </Exon>
+  <CrossReference Database="Salk" ID="At1g02100.1">
+     <CRExons Index="0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13" />
+  </CrossReference>
+  <CrossReference Database="Salk" ID="At1g02100.2">
+     <CRExons Index="0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11" />
+ </CrossReference>
+</Gene>
+</Database>
+</pre>
\ No newline at end of file
diff --git a/docs/PLSTutorial.pdf b/docs/PLSTutorial.pdf
new file mode 100644
index 0000000..de5adbb
Binary files /dev/null and b/docs/PLSTutorial.pdf differ
diff --git a/docs/RunningInspectOnTheFWGrid.pdf b/docs/RunningInspectOnTheFWGrid.pdf
new file mode 100644
index 0000000..20fdc8a
Binary files /dev/null and b/docs/RunningInspectOnTheFWGrid.pdf differ
diff --git a/docs/Searching.html b/docs/Searching.html
new file mode 100644
index 0000000..5ec764c
--- /dev/null
+++ b/docs/Searching.html
@@ -0,0 +1,128 @@
+<h1>Inspect: A Proteomics Search Toolkit</h1>
+Copyright 2007, The Regents of the University of California
+<h3>Table of Contents</h3>
+<li><a href="index.html">Overview</a>
+<li><a href="Copyright.html">Copyright information</a>
+<li><a href="Installation.html">Installation</a>
+<li><a href="Database.html">Database</a>
+<li><a href="Searching.html">Searching</a>
+<li><a href="Analysis.html">Analysis</a>
+<li><a href="InspectTutorial.pdf">Basic Tutorial</a>
+<li><a href="InspectAdvancedTutorial.pdf">Advanced Tutorial</a>
+<li><a href="UnrestrictedSearchTutorial.pdf">Unrestricted Search Tutorial</a>
+
+<hr>
+<h1>Searching</h1>
+
+To run a search, you first create an Inspect <b>input file</b>. The input file is a text file that
+tells Inspect what to do. Each line of the
+input file has the form [COMMAND],[VALUE]. For example, one line might be "spectra,spec18.dta",
+where the command is "spectra" and the value is "spec18.dta". Inspect ignores blank lines. You can
+include comments by starting lines with a hash character (#). Here is an example
+of what an input file might look like:
+<pre>
+spectra,Fraction01.mzxml
+instrument,ESI-ION-TRAP
+protease,Trypsin
+DB,TestDatabase.trie
+# Protecting group on cysteine:
+mod,57,C,fix
+</pre>
+<br>Here are the
+available input file commands. Those you are most likely to set are listed first. The only required
+commands are one or more "spectra" commands, and either "db" or "SequenceFile". Commands are
+case-insensitive (type "Spectra" or "spectra", it doesn't matter). Values are case-insensitive with
+the exception (on Linux) of filenames. If Inspect doesn't understand a command, it will print a
+warning and ignore it.
+<br><br>
+<li><b>spectra,[FILENAME]</b> - Specifies a spectrum file to search. You can specify the name of a
+directory to search every file in that directory (non-recursively).<br>
+Preferred file formats: .mzXML and .mgf <br>
+Other accepted file formats: .mzData, .ms2, and .dta.
+Note that multiple spectra in a single .dta file are <b>not</b> supported.
+<br>
+<li><b>db,[FILENAME]</b> - Specifies the name of a database (.trie file) to search. The .trie file
+contains one or more protein sequences delimited by asterisks, with no whitespace or other data.
+Use PrepDB.py (see <a href="Database.html">Databases</a>) to prepare a database. You should specify
+at least one database. You may specify several databases; if so, each database will be searched in turn.
+<li><b>SequenceFile,[FILENAME]</b> - Specifies the name of a FASTA-format protein database to search. If
+you plan to search a large database, it is more efficient to preprocess it using PrepDB.py and use the "db"
+command instead. You can specify at most one SequenceFile.
+<br>
+<li><b>protease,[NAME]</b> - Specifies the name of a protease. "Trypsin", "None", and "Chymotrypsin" are
+the available values. If tryptic digest is specified, then matches with non-tryptic termini are penalized.
+<br>
+<li><b>mod,[MASS],[RESIDUES],[TYPE],[NAME]</b> - Specifies an amino acid modification. The delta mass
+(in daltons) and affected amino acids are required. The first four characters of the name should be
+unique. Valid values for "type" are "fix", "cterminal", "nterminal", and "opt" (the default). For a guide
+to various known modification types, consult the following databases:
+<li> <a href="http://www.abrf.org/index.cfm/dm.home">ABRF mass delta reference</a>
+<li> <a href="http://www.unimod.org">UNIMOD database</a>
+<li>RESID database of modifications
+Examples:
+<br><tt>mod,+57,C,fix</tt> - Most searches should include this line. It reflects the addition of CAM
+(carbamidomethylation, done by adding iodoacetamide) which prevents cysteines from forming disulfide bonds.
+<br><tt>mod,80,STY,opt,phosphorylation</tt>
+<br><tt>mod,16,M</tt> (Oxidation of methionine, seen in many samples)
+<br><tt>mod,43,*,nterminal</tt> (N-terminal carbamylation, common if sample is treated with urea)
+<br>
+<b>Important note:</b> When searching for phosphorylation sites, use a modification with the name "phosphorylation".
+This lets Inspect know that it should use its model of phosphopeptide fragmentation
+when generating tags and scoring matches. (Phosphorylation of serine dramatically affects fragmentation, so
+modeling it as simply an 80Da offset is typically <b>not</b> sufficient to detect sites with high sensitivity)
+<li><b>Mods,[COUNT]</b> - Number of PTMs permitted in a single peptide. Set this to 1 (or higher) if you
+ specify PTMs to search for.
+<li><b>Unrestrictive,[FLAG]</b> - If FLAG is 1, use the MS-Alignment algorithm to perform an <b>unrestrictive</b>
+search (allowing arbitrary modification masses). Running an unrestrictive search with one mod per peptide is slower than the
+normal (tag-based) search; running time is approximately 1 second per spectrum per megabyte of database. Running an unrestrictive search
+with two mods is significantly slower. We recommend performing unrestrictive searches against a small database, containing proteins
+output by an earlier search. (The "Summary.py" script can be used to generate a second-pass database
+from initial search results; see <a href="Analysis.html">Analysis</a>.) A sample blind-search input file is shown after this list of options.
+<li><b>MaxPTMSize,[SIZE]</b> - For blind search, specifies the maximum modification size (in Da) to consider.
+Defaults to 250. Larger values require more time to search.
+<li><b>PMTolerance,[MASS]</b> - Specifies the parent mass tolerance, in Daltons. A candidate's
+flanking mass can differ from the tag's flanking mass by no more than this amount. Default value
+is 2.5. Note that secondary ions are often selected for fragmentation, so parent mass errors near
+1.0Da or -1.0Da are not uncommon in typical datasets, even on FT machines.
+<li><b>ParentPPM,[MASS]</b> - Specifies a parent mass tolerance, in parts per million. Alternative to PMTolerance.
+<li><b>IonTolerance,[MASS]</b> - Error tolerance for how far ion fragments (b and y peaks) can be
+ shifted from their expected masses. Default is 0.5. Higher values produce a more sensitive but much slower search.
+<li><b>PeakPPM,[MASS]</b> - Specifies a fragment mass tolerance, in parts per million. Alternative to IonTolerance.
+<li><b>MultiCharge,[FLAG]</b> - If set to true, attempt to guess the precursor charge and mass, and consider
+multiple charge states if feasible.
+<li><b>Instrument,[TYPE]</b> - Options are ESI-ION-TRAP (default), QTOF, and FT-Hybrid. If set to ESI-ION-TRAP,
+Inspect attempts to correct the parent mass. If set to QTOF, Inspect uses a fragmentation model trained on
+QTOF data. (QTOF data typically features a stronger y ladder and weaker b ladder than other spectra).
+<li><b>RequiredMod,[NAME]</b> - The specified modification MUST be found somewhere on the peptide.
+<li><b>TagCount,[COUNT]</b> - Number of tags to generate
+<li><b>TagLength,[LENGTH]</b> - Length of peptide sequence tags. Defaults to 3. Accepted values are 1 through 6.
+<li><b>RequireTermini,[COUNT]</b> - If set to 1 or 2, require 1 or 2 valid proteolytic termini. Deprecated, because
+ the scoring model already incorporates the number of valid (tryptic) termini.
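+<br><br>
+For reference, a blind (unrestrictive) search input file might look like the following;
+the file names are placeholders and the option values are only illustrative:
+<pre>
+spectra,Fraction01.mzxml
+instrument,ESI-ION-TRAP
+protease,Trypsin
+db,SecondPassDatabase.trie
+mod,+57,C,fix
+Mods,1
+Unrestrictive,1
+MaxPTMSize,250
+</pre>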
+<h3>Non-standard options:</h3>
+<b>TagsOnly</b> - Tags are generated and written to the specified output file. No search is performed.
+
+<hr>
+<h2>Command-line arguments</h2>
+Inspect features a few command-line options. Most options are specified in an <b>input file</b>, rather
+than on the command-line. The
+command-line options are:
+ <li> <b>-i</b> Input file name. Defaults to "Input.txt"
+ <li> <b>-o</b> Output file name. Defaults to "Inspect.txt"
+ <li> <b>-e</b> Error file name. Defaults to "Inspect.err".
+ <li> <b>-r</b> The resource directory. Defaults to the current working directory. The resource directory
+is where Inspect searches for its resource files such as AminoAcidMasses.txt.
+<br><br>
+Sample usage:<br>
+On Windows: <b>Inspect -i TripureIn.txt -o TripureOut.txt</b><br>
+On Linux: <b>./inspect -i TripureIn.txt -o TripureOut.txt</b><br>
+<h3>Error Reporting</h3>
+If Inspect encounters a problem - such as a spectrum file with garbled format, or
+running out of memory - it reports the problem to the error file. One error (or warning)
+is reported per line of the file, and each error/warning type has an ID, to make them
+easier to parse. If no error file is left behind after a run, then there were no errors - this
+is a good thing!
+<br><br>Here is a sample error message, where I gave inspect an incorrect file name:<br>
+<tt>[E0008] .\ParseInput.c:725:Unable to open requested file '.\Database\TestDatbaase.trie'</tt>
+<br>
+And here is a sample warning message, where - on a small search - Inspect was not able to re-fit the p-value distribution:<br>
+<tt>{W0010} .\PValue.c:396:Few spectra were searched; not recalibrating the p-value curve.</tt><br>
diff --git a/docs/UnrestrictedSearchTutorial.pdf b/docs/UnrestrictedSearchTutorial.pdf
new file mode 100644
index 0000000..038d0ba
Binary files /dev/null and b/docs/UnrestrictedSearchTutorial.pdf differ
diff --git a/docs/index.html b/docs/index.html
new file mode 100644
index 0000000..2f11c01
--- /dev/null
+++ b/docs/index.html
@@ -0,0 +1,42 @@
+<h1>InsPecT: A Proteomics Search Toolkit</h1>
+Copyright 2007, The Regents of the University of California
+<br>Version 20120103 documentation
+<h3>Table of Contents</h3>
+<li><a href="index.html">Overview</a>
+<li><a href="Copyright.html">Copyright information</a>
+<li><a href="Installation.html">Installation</a>
+<li><a href="Database.html">Database</a>
+<li><a href="Searching.html">Searching</a>
+<li><a href="Analysis.html">Analysis</a>
+<li><a href="InspectTutorial.pdf">Basic Tutorial</a>
+<li><a href="InspectAdvancedTutorial.pdf">Advanced Tutorial</a>
+<li><a href="UnrestrictedSearchTutorial.pdf">Unrestricted Search Tutorial</a>
+<hr>
+<h1>Documentation Overview</h1>
+This is the documentation for InsPecT, a MS/MS database search tool. A general description of the
+program can be found <a href="http://proteomics.bioprojects.org/Software/Inspect.html">here</a>
+online. The documentation contains two general types of information: tutorials and docs pages.
+The tutorials walk through basic setup and usage of InsPecT and are <b>highly</b> recommended.
+The docs pages are more detailed documentation on options available within the program.
+<br><br>
+InsPecT was developed at the University of California, San Diego and the project homepage
+is <a href="http://proteomics.ucsd.edu/">here</a>. A Windows executable is
+available for download, as well as the ANSI C source code (which compiles on Windows,
+Linux, or Macintosh). Inspect is free for educational, research, and non-profit purposes.
+<br><br>
+The following publications provide additional information on InsPecT; you may wish to
+cite them if you use InsPecT search results in your research:
+<br>
+<li>S. Tanner, H. Shu, A. Frank, L.Wang, E. Zandi, M. Mumby, P.A. Pevzner, and V. Bafna.
+Inspect: Fast and accurate identification of post-translationally modified peptides
+from tandem mass spectra. Anal. Chem., 77(14):4626-4639, 2005.
+<li>D. Tsur, S. Tanner, E. Zandi, V. Bafna, and P.A. Pevzner.
+Identification of post-translational modifications via blind search of
+mass spectra. Nature Biotechnology, 23:1562-1567, 2005.
+<br><br>
+The authors and <a href="mailto:vbafna at cs.ucsd.edu">principal investigator</a> welcome questions, comments, and corrections.
+<hr>
+<h3>Bugs</h3>
+Bugs in Inspect are tracked using <a href="http://bugs.bioprojects.org">JIRA</a>. If you encounter
+problems, please submit a bug report online!
+
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..c1852b5
--- /dev/null
+++ b/main.c
@@ -0,0 +1,863 @@
+//Title: main.c
+//Authors: Stephen Tanner, Samuel Payne, Natalie Castellana, Pavel Pevzner, Vineet Bafna
+//Created: 2005
+// Copyright 2007,2008,2009 The Regents of the University of California
+// All Rights Reserved
+//
+// Permission to use, copy, modify and distribute any part of this
+// program for educational, research and non-profit purposes, by non-profit
+// institutions only, without fee, and without a written agreement is hereby
+// granted, provided that the above copyright notice, this paragraph and
+// the following three paragraphs appear in all copies.
+//
+// Those desiring to incorporate this work into commercial
+// products or use for commercial purposes should contact the Technology
+// Transfer & Intellectual Property Services, University of California,
+// San Diego, 9500 Gilman Drive, Mail Code 0910, La Jolla, CA 92093-0910,
+// Ph: (858) 534-5815, FAX: (858) 534-7345, E-MAIL:invent at ucsd.edu.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
+// FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
+// INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE, EVEN
+// IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY
+// OF SUCH DAMAGE.
+//
+// THE SOFTWARE PROVIDED HEREIN IS ON AN "AS IS" BASIS, AND THE UNIVERSITY
+// OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES,
+// ENHANCEMENTS, OR MODIFICATIONS. THE UNIVERSITY OF CALIFORNIA MAKES NO
+// REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER IMPLIED OR
+// EXPRESS, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF
+// THE SOFTWARE WILL NOT INFRINGE ANY PATENT, TRADEMARK OR OTHER RIGHTS.
+
+
+// Inspect - a tool for efficient peptide MS/MS interpretation in the
+// presence of post-translational modifications.
+
+// Inspect can use partial de novo to generate tags, then use the tags
+// to search a protein database for matching peptides. A tag
+// has a prefix mass, a sequence of peptides, and a suffix mass. Typically,
+// tags are tripeptides. We use a trie structure (Aho-Corasic algorithm) to find
+// occurrences of our tag strings in the database, then examine the flanking masses
+// to be sure they match. The flanking mass comparison is a d.p. 'hit extension'
+// algorithm.
+//
+// Inspect requires a database file in the correct format. The file
+// should contain protein sequences concatenated together, separated by asterisks.
+// No whitespace or newlines. Like this: PANTS*GWWYTT*GAAH
+// The PrepDB.py script compresses a Swiss-prot or FASTA database into
+// concatenated format. An accompanying .index file is produced, so that the
+// name of a matched protein can be reported.
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <string.h>
+#include "Trie.h"
+#include "Utils.h"
+#include "Spectrum.h"
+#include "Mods.h"
+#include "Score.h"
+#include "Tagger.h"
+#include "FreeMod.h"
+#include "CMemLeak.h"
+#include "SVM.h"
+#include "BN.h"
+#include "LDA.h"
+#include "Run.h"
+#include "SNP.h"
+#include "SpliceDB.h"
+#include "ChargeState.h"
+#include "Scorpion.h"
+#include "ParseXML.h"
+#include "SpliceScan.h"
+#include "ParseInput.h"
+#include "PValue.h"
+#include "Errors.h"
+#include "BuildMS2DB.h"
+#include "IonScoring.h"
+#include "TagFile.h" //ARI_MOD
+
+// Global variables, shared between main.c and Trie.c:
+extern Options* GlobalOptions;
+extern MSSpectrum* Spectrum;
+
+// Array of spectra to be searched. We put them into an array so that we can qsort
+// them. (Not crucial, but it's nice to get output in order)
+extern SpectrumNode* g_BigNodeArray;
+
+extern StringNode* FirstTagCheckNode;
+extern StringNode* LastTagCheckNode;
+
+
+void PrintUsageInfo()
+{
+ printf("\nSample command-line:\n");
+ printf("Inspect.exe -i Foo.in -o Foo.txt -e ErrorsFoo.txt\n");
+ printf("Command-line arguments:\n");
+ printf(" -i InputFileName: Path to a config file specifying search parameters.\n");
+ printf(" -o OutputFileName: Output file for match results. If not\n");
+ printf(" specified, output goes to stdout.\n");
+ printf(" -e ErrorFileName: Output file for errors and warnings, if any. If not\n");
+    printf("       specified, any errors go to Inspect.err; if there are no errors\n");
+ printf(" or warnings reported, this file will be erased at end of run.\n");
+ printf(" -r ResourceDir: Directory for resource files (such \n");
+ printf(" as AminoAcidMasses.txt). Defaults to current directory. \n");
+ printf(" -a AminoAcidMassesFile: Specify a file containing non-standard amino acid masses. \n");
+ printf(" Consult the documentation (Inspect.html) for further details.\n");
+}
+
+void FreeSpectra()
+{
+ SpectrumNode* Node;
+ SpectrumNode* Prev = NULL;
+ //
+ for (Node = GlobalOptions->FirstSpectrum; Node; Node = Node->Next)
+ {
+ if (Prev)
+ {
+ FreeSpectrum(Prev->Spectrum);
+ Prev->Spectrum = NULL;
+ // Important: don't free spectrum nodes, because they all come from one big array!
+ //FreeSpectrumNode(Prev);
+ }
+ Prev = Node;
+ //FreeSpectrum(Node->Spectrum);
+ }
+ if (Prev)
+ {
+ FreeSpectrum(Prev->Spectrum);
+ Prev->Spectrum = NULL;
+ }
+ GlobalOptions->FirstSpectrum = NULL;
+ GlobalOptions->LastSpectrum = NULL;
+}
+
+
+void FreeGlobalOptions()
+{
+
+ StringNode* Prev;
+ StringNode* GFFNode;
+ DatabaseFile* PrevDB;
+ DatabaseFile* DatabaseNode;
+
+ if (!GlobalOptions)
+ {
+ return;
+ }
+
+ // Free the list FirstGFFFileName...LastGFFFileName
+ Prev = NULL;
+ for (GFFNode = GlobalOptions->FirstGFFFileName; GFFNode; GFFNode = GFFNode->Next)
+ {
+ if (Prev)
+ {
+ SafeFree(Prev->String);
+ SafeFree(Prev);
+ }
+ Prev = GFFNode;
+ }
+ if (Prev)
+ {
+ SafeFree(Prev->String);
+ SafeFree(Prev);
+ }
+
+ // Free the DatabaseFile list:
+ PrevDB = NULL;
+ for (DatabaseNode = GlobalOptions->FirstDatabase; DatabaseNode; DatabaseNode = DatabaseNode->Next)
+ {
+ SafeFree(PrevDB);
+ PrevDB = DatabaseNode;
+ }
+
+ // Save the overall struct:
+ SafeFree(GlobalOptions);
+ GlobalOptions = NULL;
+}
+
+// Free various structs that we built up. (This isn't strictly necessary, since we're about
+// to exit process anyway, but it's good practice)
+// NOTE: After calling Cleanup(), you can't call Log() any more, because GlobalOptions no longer
+// points at a log file.
+void Cleanup()
+{
+ //printf("Cleaning up...\n");
+ FreeMassDeltaByMass();
+ FreeMassDeltas();
+ FreeIsSubDecoration();
+ //FreeTaggingModel();
+ FreeJumpingHash();
+ FreeSVMModels();
+ FreeBayesianModels();
+ FreeTagCheckNodes();
+ FreeInputFileNodes();
+ FreeLDAModels();
+ FreeCCModelSVM();
+ FreeTagSkewScores();
+ if (GlobalOptions)
+ {
+ FreeSpectra();
+ // Close our error file. And if we never wrote errors or warnings, erase it!
+ if (GlobalOptions->ErrorFile)
+ {
+ fclose(GlobalOptions->ErrorFile);
+ GlobalOptions->ErrorFile = NULL;
+ if (!GlobalOptions->ErrorCount && !GlobalOptions->WarningCount)
+ {
+ unlink(GlobalOptions->ErrorFileName);
+ }
+ }
+ FreeGlobalOptions();
+ }
+ SafeFree(g_BigNodeArray);
+ g_BigNodeArray = NULL;
+ SafeFree(GlobalStats);
+ GlobalStats = NULL;
+ FreeExternalTagHolder(); //ARI_MOD
+}
+
+
+// Parse the command-line arguments, and populate GlobalOptions.
+// Returns true on success, 0 if the args are invalid.
+int ReadCommandLineArgs(int argc, char** argv)
+{
+ int Index = 1;
+ int MoreArgs;
+ int Result;
+ char PeptideFilePath[2048];
+ int AASet = 0;
+ if (argc<2)
+ {
+ return 0;
+ }
+ while (Index < argc)
+ {
+ if (argv[Index][0] != '-')
+ {
+ REPORT_ERROR_S(18, argv[Index]);
+ return 0;
+ }
+ // Are there args after this one?
+ if (Index < argc-1)
+ {
+ MoreArgs = 1;
+ }
+ else
+ {
+ MoreArgs = 0;
+ }
+ switch (ConvertToLower(argv[Index][1]))
+ {
+ case 'i': // Input options file name
+ if (!MoreArgs)
+ {
+ REPORT_ERROR_S(19, "-i");
+ return 0;
+ }
+ strncpy(GlobalOptions->InputFileName, argv[Index + 1], MAX_FILENAME_LEN);
+ Index += 2;
+ break;
+ case 'o':
+ if (!MoreArgs)
+ {
+ REPORT_ERROR_S(19, "-o");
+ return 0;
+ }
+ strncpy(GlobalOptions->FinalOutputFileName, argv[Index + 1], MAX_FILENAME_LEN);
+ Index += 2;
+ break;
+ case 'e':
+ if (!MoreArgs)
+ {
+ REPORT_ERROR_S(19, "-e");
+ return 0;
+ }
+ strncpy(GlobalOptions->ErrorFileName, argv[Index + 1], MAX_FILENAME_LEN);
+ Index += 2;
+ break;
+
+ case 'r':
+ if (!MoreArgs)
+ {
+ REPORT_ERROR_S(19, "-r");
+ return 0;
+ }
+ strcpy(GlobalOptions->ResourceDir, argv[Index + 1]);
+ printf("Setting resource directory: '%s'\n", argv[Index + 1]);
+ if (*(GlobalOptions->ResourceDir + strlen(GlobalOptions->ResourceDir) - 1) != SEPARATOR)
+ {
+ strcat(GlobalOptions->ResourceDir, SEPARATOR_STRING);
+ }
+ printf("Resource directory is: '%s'\n", GlobalOptions->ResourceDir);
+ Index += 2;
+ break;
+ case 'v':
+ GlobalOptions->VerboseFlag = 1;
+ Index++;
+ break;
+        case 'a':
+            if (!MoreArgs)
+            {
+                REPORT_ERROR_S(19, "-a");
+                return 0;
+            }
+            strcpy(GlobalOptions->AminoFileName, argv[Index + 1]);
+            printf("Setting amino acid masses: '%s'\n", GlobalOptions->AminoFileName);
+            AASet = 1;
+            Index += 2;
+            break;
+ default:
+ printf("Error: I don't understand this argument '%s'.\n", argv[Index]);
+ return 0;
+ }
+ }
+
+ // Read the table of amino acid masses:
+ if(AASet == 1)
+ {
+ sprintf(PeptideFilePath, "%s", GlobalOptions->AminoFileName);
+ Result = LoadPeptideMasses(PeptideFilePath);
+
+ if(!Result)
+ {
+ sprintf(PeptideFilePath, "%s%s", GlobalOptions->ResourceDir,GlobalOptions->AminoFileName);
+ Result = LoadPeptideMasses(PeptideFilePath);
+
+ }
+ }
+ else
+ {
+ sprintf(PeptideFilePath, "%s%s",GlobalOptions->ResourceDir, FILENAME_AMINO_ACID_MASSES);
+ Result = LoadPeptideMasses(PeptideFilePath);
+ if (!Result)
+ {
+ Result = LoadPeptideMasses(NULL);
+ }
+ }
+ if (!Result)
+ {
+ printf("Error - couldn't load amino acid masses!\n");
+        return 0;
+ }
+ // If -r argument wasn't passed, then use the current working directory:
+ if (!GlobalOptions->ResourceDir[0])
+ {
+ sprintf(GlobalOptions->ResourceDir, ".%c", SEPARATOR);
+ }
+ if (GlobalOptions->InputFileName)
+ {
+ //printf("Parse input file:\n");
+ Result = ParseInputFile();
+
+ //printf("Input file parse result %d\n", Result);
+ if (!Result)
+ {
+ return 0;
+ }
+ SortSpectra();
+ }
+
+ // If no spectra were specified, then error out - unless we're running a
+ // mode that requires no spectra.
+ if (!GlobalOptions->FirstSpectrum)
+ {
+        if (!(GlobalOptions->RunMode & RUN_MODE_PREP_MS2DB))
+ {
+ REPORT_ERROR(11);
+ return 0;
+ }
+ }
+
+ if (!(*GlobalOptions->FinalOutputFileName))
+ {
+ sprintf(GlobalOptions->FinalOutputFileName, "Inspect.txt");
+ }
+
+ return 1;
+}
+
+// Perform miscellaneous chores *after* reading the input script and *before* starting to search.
+int Initialize()
+{
+ char Path[2048];
+
+ sprintf(Path, "%s%s", GlobalOptions->ResourceDir, FILENAME_MASS_DELTAS);
+ if (!MassDeltas)
+ {
+
+ if (GlobalOptions->RunMode & (RUN_MODE_BLIND|RUN_MODE_BLIND_TAG))
+ {
+ //LoadMassDeltas(Path, 0);
+ }
+ else
+ {
+ LoadMassDeltas(Path, GlobalOptions->RunMode & (RUN_MODE_MUTATION | RUN_MODE_TAG_MUTATION));
+ }
+ }
+ InitBayesianModels();
+ SetTagSkewScores();
+
+ if(GlobalOptions->RunMode & (RUN_MODE_MUTATION | RUN_MODE_TAG_MUTATION))
+ LoadMassDeltas(Path,1);
+ if (GlobalOptions->RunMode & (RUN_MODE_BLIND | RUN_MODE_BLIND_TAG))
+ {
+
+ //FreeMassDeltas();
+ LoadMassDeltas(NULL, 0);
+ AddBlindMods();
+ }
+ else
+ {
+ InitMassDeltaByMass();
+ //debugMassDeltaByMass();
+ }
+
+ PopulateJumpingHash();
+ //LoadFlankingAminoEffects();
+ //sprintf(Path, "%s%s", GlobalOptions->ResourceDir, FILENAME_SCORING_MODEL);
+ //Result = InitScoringModel(Path);
+ //if (!Result)
+ //{
+ // printf("Error loading scoring model from file '%s'\n", Path);
+ // return 0;
+ //}
+
+#ifdef MQSCORE_USE_SVM
+ InitPValueSVM();
+#else
+ InitPValueLDA();
+#endif
+
+ return 1;
+}
+
+// Offshoot of main() for handling spliced-database creation and maintenance:
+void MainSpliceDB(int argc, char** argv)
+{
+ int ChromosomeNumber;
+ int ReverseFlag;
+ char* GeneName;
+ char* CustomFileName;
+ int IntervalStart = -1;
+ int IntervalEnd = -1;
+ int MinORFLength;
+ char SNPFileName[256];
+ //
+ // inspect <chromosome> <reverseflag> [ GeneName, OutputFileName, IntervalStart, IntervalEnd ]
+
+ ChromosomeNumber = atoi(argv[1]);
+ ReverseFlag = atoi(argv[2]);
+ if (argc > 3)
+ {
+ MinORFLength = atoi(argv[3]);
+ }
+ else
+ {
+ MinORFLength = 50;//DEFAULT_MINIMUM_ORF_LENGTH;
+ }
+
+ if (MinORFLength == 0)
+ {
+ MinORFLength = -1;
+ }
+ printf("MainSpliceDB() chrom %d reverse %d minorf %d\n", ChromosomeNumber, ReverseFlag, MinORFLength);
+ // Read a linked-list of all the polymorphisms we'd like to account for:
+ sprintf(SNPFileName, "SNP\\%d.snp", ChromosomeNumber);
+ ParsePolyNodes(SNPFileName); // %%% ARABIDOPSIS: No polynodes available
+ printf("PolyNodes parsed\n");
+ if (argc > 4)
+ {
+ GeneName = argv[4];
+ CustomFileName = argv[5];
+ IntervalStart = atoi(argv[6]);
+ IntervalEnd = atoi(argv[7]);
+ PrepareOneGeneSpliceDB(ChromosomeNumber, ReverseFlag, IntervalStart, IntervalEnd, CustomFileName, GeneName, MinORFLength);
+ }
+ else
+ {
+ printf("PrepareSpliceDB...\n");
+ PrepareSpliceDB(ChromosomeNumber, ReverseFlag, MinORFLength);
+ }
+ FreePolyNodes();
+
+}
+
+// MainTraining() is called if the first command-line argument is "train".
+// Syntax is:
+// inspect.exe train [model] [OracleFile] [SpectrumDir] [extra]
+// Example:
+// inspect.exe train pmc c:\ms\TrainingSet.txt c:\ms\TrainingSet
+//
+// Output format depends on the particular model, but generally we spew out a delimited text file
+// which can be processed by a wrapper-script.
+int MainTraining(int argc, char** argv)
+{
+ char* ModelName;
+ char* OracleFile;
+ char OracleDir[1024];
+ int Len;
+ //
+ if (argc < 5)
+ {
+ printf("Error: Not enough arguments to train!\n");
+ printf("Please provide model name, oracle file, and spectrum directory.\n");
+ printf("Sample: inspect.exe train pmc c:\\ms\\TrainingSet.txt c:\\ms\\TrainingSet\n");
+ return -1;
+ }
+ InitOptions();
+ ModelName = argv[2];
+ OracleFile = argv[3];
+ // Guarantee that OracleDir ends with a delimiter:
+ strcpy(OracleDir, argv[4]);
+ Len = strlen(OracleDir);
+
+    if (Len && OracleDir[Len - 1] != SEPARATOR)
+ {
+ OracleDir[Len] = SEPARATOR;
+ OracleDir[Len + 1] = '\0';
+ }
+ // Various trainings are available:
+ if (!CompareStrings(ModelName, "pmc"))
+ {
+ //TrainPMC(OracleFile, OracleDir);
+ }
+ else if (!CompareStrings(ModelName, "cc"))
+ {
+ //TrainCC(OracleFile, OracleDir);
+ }
+ else if (!CompareStrings(ModelName, "pepprm"))
+ {
+ LoadPeptideMasses("AminoAcidMasses.txt");
+ PeptideMass['C'] += CAM_MASS; // ASSUMED: All cysteines in the training set carry the +57 modification.
+ //GlobalOptions->InstrumentType = INSTRUMENT_TYPE_QTOF;
+ TrainPepPRM(OracleFile, OracleDir);
+ }
+ else if (!CompareStrings(ModelName, "tag"))
+ {
+ LoadPeptideMasses("AminoAcidMasses.txt");
+ PeptideMass['C'] += CAM_MASS; // ASSUMED: All cysteines in the training set carry the +57 modification.
+ LoadMassDeltas(NULL, 0);
+ InitMassDeltaByMass();
+ PopulateJumpingHash();
+ TrainTagging(OracleFile, OracleDir);
+ }
+
+ else
+ {
+ printf("Unknown model name '%s' - no training performed.\n", ModelName);
+ }
+ return 0;
+}
+
+int MainTesting(int argc, char** argv)
+{
+ char* ModelName;
+ char* OracleFile;
+ char OracleDir[1024];
+ int Len;
+ //
+
+ InitOptions();
+ ModelName = argv[2];
+ OracleFile = argv[3];
+ // Guarantee that OracleDir ends with a delimiter:
+ if (argc > 4)
+ {
+ strcpy(OracleDir, argv[4]);
+ Len = strlen(OracleDir);
+        if (Len && OracleDir[Len - 1] != SEPARATOR)
+ {
+ OracleDir[Len] = SEPARATOR;
+ OracleDir[Len + 1] = '\0';
+ }
+ }
+ // Various tests are available:
+ if (!CompareStrings(ModelName, "pmc"))
+ {
+ //TestPMC(OracleFile, OracleDir);
+ }
+ else if (!CompareStrings(ModelName, "splicedbug"))
+ {
+ TestSpliceDB(argc, argv);
+ }
+ else if (!CompareStrings(ModelName, "cc"))
+ {
+ LoadPeptideMasses("AminoAcidMasses.txt");
+ PeptideMass['C'] += CAM_MASS; // ASSUMED: All cysteines in the training set carry the +57 modification.
+ //TestCC(OracleFile, OracleDir);
+ }
+ else if (!CompareStrings(ModelName, "prmq"))
+ {
+ LoadPeptideMasses("AminoAcidMasses.txt");
+ PeptideMass['C'] += CAM_MASS; // ASSUMED: All cysteines in the training set carry the +57 modification.
+ // The oracle file contains the true match for a spectrum, followed by many false matches.
+ // Compute the total (average) PRM score for each, sort them, and report the position of the
+ // true peptide within the list. (Report a histogram of these positions)
+ TestPRMQuickScoring(OracleFile, OracleDir);
+ }
+ else if (!CompareStrings(ModelName, "pepprm"))
+ {
+ LoadPeptideMasses("AminoAcidMasses.txt");
+ PeptideMass['C'] += CAM_MASS; // ASSUMED: All cysteines in the training set carry the +57 modification.
+ TestPepPRM(OracleFile, OracleDir);
+ }
+ else if (!CompareStrings(ModelName, "lda"))
+ {
+ LoadPeptideMasses("AminoAcidMasses.txt");
+ PeptideMass['C'] += CAM_MASS; // ASSUMED: All cysteines in the training set carry the +57 modification.
+ TestLDA(OracleFile, OracleDir);
+ }
+ else if (!CompareStrings(ModelName, "tag"))
+ {
+ LoadPeptideMasses("AminoAcidMasses.txt");
+ PeptideMass['C'] += CAM_MASS; // ASSUMED: All cysteines in the training set carry the +57 modification.
+ LoadMassDeltas(NULL, 0);
+ InitMassDeltaByMass();
+ PopulateJumpingHash();
+ // The oracle file contains the true match for a spectrum, followed by many false matches.
+ // Compute the total (average) PRM score for each, sort them, and report the position of the
+ // true peptide within the list. (Report a histogram of these positions)
+ TestTagging(OracleFile, OracleDir);
+ }
+ else if (!CompareStrings(ModelName, "pvalue"))
+ {
+ // Read in positive and negative feature-vectors, and produce a histogram:
+ TestPValue(OracleFile);
+ }
+ else
+ {
+ printf("Unknown model name '%s' - no testing performed.\n", ModelName);
+ }
+
+ return 0;
+
+}
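+
+// The "prmq" and "tag" tests described above score the true peptide and its decoys, sort the
+// candidates, and report where the true peptide lands in the sorted list. Illustration only:
+// a minimal sketch of that ranking step for a single spectrum, assuming the per-candidate
+// scores have already been computed (the real work happens inside TestPRMQuickScoring and
+// TestTagging, which are not shown here).
+static int CountDecoysOutscoringTruth(float TrueScore, const float* DecoyScores, int DecoyCount)
+{
+    int DecoyIndex;
+    int Rank = 0; // 0 means the true peptide scored best
+    for (DecoyIndex = 0; DecoyIndex < DecoyCount; DecoyIndex++)
+    {
+        if (DecoyScores[DecoyIndex] > TrueScore)
+        {
+            Rank++;
+        }
+    }
+    // A caller would accumulate these ranks over all spectra to build the reported histogram.
+    return Rank;
+}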
+
+// SpliceFind: Arguments are a "string-table" (.trie) protein database and a genomic database,
+// plus a range of protein record numbers. We then look through the genome to find the
+// best (approximate!) match for each protein in that range.
+int MainSpliceFind(int argc, char* argv[])
+{
+ int FirstRecord;
+ int LastRecord; // inclusive!
+ char IndexFileName[512];
+ char* Temp;
+ //
+ if (argc < 6)
+ {
+ printf("** Not enough args for splice find. Sample run:\n");
+ printf("inspect splicefind database\\ipiv313.trie ESTSpliceDB\\Genome.dat 0 1000\n");
+ return -1;
+ }
+ FirstRecord = atoi(argv[4]);
+ LastRecord = atoi(argv[5]);
+ if (LastRecord <= FirstRecord && LastRecord > -1)
+ {
+ printf("** Bad record numbers: %s to %s\n", argv[4], argv[5]);
+ return -1;
+ }
+ strcpy(IndexFileName, argv[2]);
+ for (Temp = IndexFileName + strlen(IndexFileName); Temp >= IndexFileName; Temp--)
+ {
+ if (*Temp == '.')
+ {
+ *Temp = '\0';
+ break;
+ }
+ }
+ strcat(IndexFileName, ".index");
+ SSDatabaseScan(argv[2], IndexFileName, argv[3], FirstRecord, LastRecord);
+ return 0; // Success; main() returns this value as the process exit status
+}
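+
+// The loop above scans backwards by hand to replace the database extension (e.g. ".trie") with
+// ".index". Illustration only: a minimal sketch of the same derivation using strrchr/snprintf,
+// assuming <string.h> and <stdio.h> are already included and a C99-capable compiler; this
+// helper is not part of the upstream code.
+static void BuildIndexFileName(const char* TrieFileName, char* IndexFileName, int BufferSize)
+{
+    const char* Dot = strrchr(TrieFileName, '.');
+    int StemLength = Dot ? (int)(Dot - TrieFileName) : (int)strlen(TrieFileName);
+    // Copy everything before the last '.', then append the .index suffix:
+    snprintf(IndexFileName, BufferSize, "%.*s.index", StemLength, TrieFileName);
+}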
+
+int LoadAndScoreSpectrum()
+{
+ //char* FilePath = "PTMScore\\HEKMerged\\Spectra\\H\\R.HIADLAGNSEVILPVPAFNVINGGS+244HAG.N.2.dta";
+ //char* Annotation = "R.HIADLAGNSEVILPVPAFNVINGGS+244HAG.N";
+ char* FilePath = "SystemTest\\TestSpectrum.dta";
+ char* Annotation = "VKEAMAPK";
+ MSSpectrum* Spectrum;
+ int FilePosition = 0; // Default: byte offset 0
+ SpectrumNode* Node;
+ FILE* SpectrumFile;
+ //
+ Node = (SpectrumNode*)calloc(1, sizeof(SpectrumNode));
+ Node->FilePosition = FilePosition;
+ Node->ScanNumber = 0;
+ Node->InputFile = (InputFileNode*)calloc(1, sizeof(InputFileNode));
+ strncpy(Node->InputFile->FileName, FilePath, MAX_FILENAME_LEN);
+ // Guess the file format:
+ Node->InputFile->Format = GuessSpectrumFormatFromExtension(FilePath);
+ SpectrumFile = fopen(FilePath, "rb");
+ if (!SpectrumFile)
+ {
+ printf("** Error: Unable to open spectrum file '%s'\n", FilePath);
+ free(Node->InputFile);
+ free(Node);
+ return -1;
+ }
+ fseek(SpectrumFile, Node->FilePosition, SEEK_SET);
+ Node->Spectrum = (MSSpectrum*)calloc(1, sizeof(MSSpectrum));
+ Spectrum = Node->Spectrum;
+ Node->Spectrum->Node = Node;
+ SpectrumLoadFromFile(Node->Spectrum, SpectrumFile);
+ fclose(SpectrumFile);
+ WindowFilterPeaks(Node->Spectrum, 0, 0);
+ IntensityRankPeaks(Node->Spectrum);
+ //SpectrumComputeNoiseDistributions(Node);
+ //SpectrumComputeBinnedIntensities(Node);
+ printf("Tweak and score...\n");
+ TweakSpectrum(Node);
+ ////////////////////////////////////
+ // Score:
+ ////////////////////////////////////
+ // Free:
+ // The SpectrumNode allocated above owns the MSSpectrum and an InputFileNode,
+ // so release those along with it:
+ if (Spectrum->Node->InputFile)
+ {
+ free(Spectrum->Node->InputFile);
+ Spectrum->Node->InputFile = NULL;
+ }
+ if (Spectrum->Node)
+ {
+ FreeSpectrumNode(Spectrum->Node);
+ }
+ else
+ {
+ FreeSpectrum(Spectrum);
+ }
+ return 0;
+}
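+
+// Illustration only: the teardown above frees the InputFileNode first, then hands the
+// SpectrumNode to FreeSpectrumNode (which, as the fallback branch suggests, is assumed to
+// release the wrapped MSSpectrum as well). A minimal sketch of that ownership order as a
+// helper; this is not part of the upstream code.
+static void ReleaseLoadedSpectrumNode(SpectrumNode* Node)
+{
+    if (!Node)
+    {
+        return;
+    }
+    if (Node->InputFile)
+    {
+        free(Node->InputFile);
+        Node->InputFile = NULL;
+    }
+    FreeSpectrumNode(Node); // Assumed to free Node->Spectrum and the node itself
+}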
+
+int TestMain(int argc, char* argv[])
+{
+ char Buffer[2048];
+ // For temp test scaffolding
+ InitOptions();
+ InitErrors();
+ InitStats();
+ Initialize();
+ printf(">>> Start <<<\n");
+ Cleanup();
+ printf(">>> End <<<\n");
+ ReadBinary(Buffer, sizeof(char), 1, stdin);
+ return 1;
+}
+
+// Program entry point. Dispatches to the training/testing/splicing utilities if requested;
+// otherwise parses arguments, initializes global data, and runs the requested search mode.
+int main(int argc, char** argv)
+{
+ int Result;
+ clock_t StartTime;
+ clock_t EndTime;
+ float ElapsedTime;
+ int ChromosomeNumber;
+ //
+
+ //return TestMain(argc, argv);
+ // Jump into the training/testing code, maybe:
+ if (argc > 1 && !CompareStrings(argv[1], "train"))
+ {
+ return MainTraining(argc, argv);
+ }
+ if (argc > 1 && !CompareStrings(argv[1], "test"))
+ {
+ return MainTesting(argc, argv);
+ }
+ if (argc > 1 && !CompareStrings(argv[1], "splicefind"))
+ {
+ return MainSpliceFind(argc, argv);
+ }
+
+ /////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ printf("\nInsPecT version %s\n Interpretation of Peptides with Post-translational Modifications.\n", INSPECT_VERSION_NUMBER);
+ printf(" Copyright 2007,2008,2009 The Regents of the University of California\n");
+ printf(" [See Docs directory for usage manual and copyright information]\n\n");
+ fflush(stdout);
+ // Allocate stuff:
+ AllocMassDeltaByIndex();
+
+ // Slightly hacky behavior: If the first argument is an integer, then
+ // jump to the splice-db code:
+ if (argc > 1)
+ {
+ ChromosomeNumber = atoi(argv[1]);
+ if (ChromosomeNumber)
+ {
+ MainSpliceDB(argc, argv);
+ goto cleanup;
+ }
+ }
+
+ // Set the (global) default options:
+ InitOptions();
+ InitErrors();
+ InitStats();
+
+ // Parse arguments. If ReadCommandLineArgs returns false, we didn't get
+ // valid arguments, so we print usage info and quit.
+ Result = ReadCommandLineArgs(argc, argv);
+ if (!Result)
+ {
+ PrintUsageInfo();
+ goto cleanup;
+ }
+
+ // Open the error file *after* parsing the command-line:
+ GlobalOptions->ErrorFile = fopen(GlobalOptions->ErrorFileName, "wb");
+ if (!GlobalOptions->ErrorFile)
+ {
+ GlobalOptions->ErrorFile = stderr;
+ }
+
+ printf("Initialize:\n");
+ Result = Initialize();
+ if (!Result)
+ {
+ printf("Initialization FAILED - aborting search.\n");
+ goto cleanup;
+ }
+
+
+ ///////////////////////////////////////////////////
+ // Main function: Run the search!
+ StartTime = clock();
+
+ // Set an intermediate output file name, if we're performing a search.
+ // (We write to the intermediate file, then perform p-value computation)
+ if (!(GlobalOptions->RunMode & (RUN_MODE_TAGS_ONLY | RUN_MODE_PMC_ONLY | RUN_MODE_PREP_MS2DB | RUN_MODE_RAW_OUTPUT)))
+ {
+
+ sprintf(GlobalOptions->OutputFileName, "%s.tmp", GlobalOptions->FinalOutputFileName);
+ }
+ else
+ {
+ sprintf(GlobalOptions->OutputFileName, "%s", GlobalOptions->FinalOutputFileName);
+ }
+ GlobalOptions->OutputFile = fopen(GlobalOptions->OutputFileName, "w");
+ if (!GlobalOptions->OutputFile)
+ {
+ REPORT_ERROR_S(8, GlobalOptions->OutputFileName);
+ goto cleanup;
+ }
+
+ if (GlobalOptions->RunMode & RUN_MODE_PREP_MS2DB)
+ {
+ BuildMS2DB();
+ }
+ else if (GlobalOptions->RunMode & RUN_MODE_PMC_ONLY)
+ {
+ // Just correct charges and parent masses, don't search anything:
+ PerformSpectrumTweakage();
+ }
+ else if ((GlobalOptions->RunMode & RUN_MODE_TAGS_ONLY) && !GlobalOptions->ExternalTagger)
+ {
+ PerformTagGeneration();
+ }
+ else
+ {
+ RunSearch();
+ }
+
+ EndTime = clock();
+ ElapsedTime = (float)((EndTime - StartTime) / (float)CLOCKS_PER_SEC);
+ printf("Elapsed time: %.4f seconds.\n", ElapsedTime);
+ printf("Inspect run complete.\n");
+
+cleanup:
+ Cleanup();
+ return 0;
+}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/inspect.git