[med-svn] [pipasic] 01/02: Imported Upstream version 0.0+15
Andreas Tille
tille at debian.org
Tue Sep 29 13:41:32 UTC 2015
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository pipasic.
commit 6602ce8ef450c7c7c81d1173a213025b8f8f44ea
Author: Andreas Tille <tille at debian.org>
Date: Tue Sep 29 15:21:32 2015 +0200
Imported Upstream version 0.0+15
---
LICENSE | 24 ++
PASiC.py | 82 +++++
README.txt | 144 ++++++++
TideProcessing.py | 214 +++++++++++
config_files/config_Inspect_MSSim.txt | 11 +
config_files/config_Inspect_MSSim_py.txt | 11 +
config_files/config_Inspect_py.txt | 11 +
config_files/config_MSSim_v1_9_short.ini | 275 ++++++++++++++
inspectparser.py | 117 ++++++
pipasic.py | 301 +++++++++++++++
plotting.py | 68 ++++
readMGF.py | 389 ++++++++++++++++++++
runInspect_user_config.py | 164 +++++++++
simulation_based_similarity.py | 239 ++++++++++++
trypticpeptides.py | 604 +++++++++++++++++++++++++++++++
15 files changed, 2654 insertions(+)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..58cec58
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,24 @@
+Copyright (c) 2013, Martin S. Lindner and Anke Penzlin,
+Robert Koch-Institut, Germany,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * The name of the author may not be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL MARTIN S. LINDNER OR ANKE PENZLIN BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/PASiC.py b/PASiC.py
new file mode 100644
index 0000000..7cd80c3
--- /dev/null
+++ b/PASiC.py
@@ -0,0 +1,82 @@
+"""
+Adaptation of the similarity correction method from GASiC (plus normalized abundances)
+
+ at author: Anke Penzlin, June 2013
+"""
+import numpy as np
+import scipy.optimize as opt
+
+def similarity_correction(sim, reads, N):
+ """ Calculate corrected abundances given a similarity matrix and observations using optimization.
+ Copied from GASiC; for metaproteomic experiments, read 'spectra' wherever this says 'reads'.
+
+ Input:
+ sim: [numpy.array (M,M)] with pairwise similarities between species
+ reads: [numpy.array (M,)] with number of observed reads for each species
+ N: [int] total number of reads
+
+ Output:
+ abundances: [numpy.array (M,)] estimated abundance of each species in the sample
+ """
+
+ # transform read counts to relative abundances and transpose the similarity matrix
+ A = np.transpose(sim)
+ r = reads.astype(np.float) / N
+
+ rng = range(len(reads))
+
+ # Now solve the optimization problem: min_c |Ac-r|^2 s.t. c_i >= 0 and 1-sum(c_i) >= 0
+ # construct objective function
+ def objective(c):
+ # numpy implementation of objective function |A*c-r|^2
+ return np.sum(np.square(np.dot(A,c)-r))
+
+ # construct constraints
+ def build_con(k):
+ # constraint: k'th component of c must be non-negative
+ return lambda c: c[k]
+ cons = [build_con(k) for k in rng]
+ # constraint: the sum of all components of c must not exceed 1
+ cons.append( lambda c: 1-np.sum(c) )
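+ # (scipy's fmin_cobyla treats every function in cons as an inequality
+ # constraint that must be non-negative at a feasible point, i.e. cons_k(c) >= 0)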
+
+ # initial guess
+ c_0 = np.array([0.5 for i in range(len(reads))])
+
+ # finally: optimization procedure
+ abundances = opt.fmin_cobyla(objective, c_0, cons, disp=0, rhoend=1e-10, maxfun=10000)
+
+ return abundances
+
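+# A minimal usage sketch with toy numbers (not from any real dataset):
+# two species whose proteomes share 20% of their peptides. Solving A*c = r
+# exactly gives c ~ [0.54, 0.29], so the optimizer returns approximately:
+#
+#   sim = np.array([[1.0, 0.2], [0.2, 1.0]])
+#   similarity_correction(sim, reads=np.array([600., 400.]), N=1000)
+#   # -> roughly array([0.5417, 0.2917])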
+
+def PASiC(simiMat, counts, N):
+ """
+ Call similarity_correction and additionally calculate normalized abundances.
+
+ Input:
+ simiMat: [numpy.array (M,M)] with pairwise similarities between species
+ counts: [numpy.array (M,)] with number of observed PSMs for each species
+ N: [int] total number of spectra
+
+ Output:
+ abundances: [numpy.array (M,)] estimated abundance of each species in the sample
+ normAbundances: [numpy.array (M,)] abundances normalized to one
+ """
+ abundances = similarity_correction(sim=simiMat, reads=counts, N=N)
+ normAbundances = np.divide(abundances, float(sum(abundances)))
+ return abundances, normAbundances
+
+'''if __name__ == "__main__":
+ # Idea: load numpy dump files from input paths; -d common directory
+ #c = np.load("/data/NG4/anke/spectra/acid_mine/counts_Biofilm.dat")
+ #M = np.load("/data/NG4/anke/spectra/acid_mine/unw_similarityMatrix_seq.dat")
+ #M_wtd = np.load("/data/NG4/anke/spectra/acid_mine/wtd_similarityMatrix_seq.dat")
+ corrAbunds, normCorr = PASiC(simiMat=M, counts=c, N=10001)
+ wtdCorr, n_wtdCorr = PASiC(simiMat=M_wtd, counts=c, N=10001)
+ print "observed abundances: ", np.round(np.divide(c, 10001),5)
+ print "observed, normalized: ", np.round(np.divide(c, float(sum(c))),5)
+ print "corrected abundances:", np.round(corrAbunds,5)
+ print "corrected, normalized:", np.round(normCorr,5)
+ print "with weighting"
+ print "corrected abundances:", np.round(wtdCorr,5)
+ print "corrected, normalized:", np.round(n_wtdCorr,5)
+'''
\ No newline at end of file
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..8257439
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,144 @@
+-------------------------------------------------------------------------------
+ Pipasic - peptide intensity-weighted proteome abundance similarity correction
+-------------------------------------------------------------------------------
+
+
+Abstract
+--------
+
+Metaproteomic analysis allows studying the interplay of organisms or
+functional groups and has also become increasingly popular for diagnostic
+purposes. However, difficulties arise due to the high sequence similarity
+between related organisms. Further, the state of conservation of proteins
+between species can be correlated with their expression level, which can lead
+to significant bias in results and interpretation. These challenges are
+similar but not identical to the challenges arising in the analysis of
+metagenomic samples and require specific solutions.
+We developed Pipasic (peptide intensity-weighted proteome abundance similarity
+correction) as a tool which corrects identification and spectral counting
+based quantification results using peptide similarity estimation and
+expression level weighting within a non-negative lasso framework. Pipasic has
+distinct advantages over approaches that only consider unique peptides or
+aggregate results to the lowest common ancestor, as demonstrated on examples
+of viral diagnostics and an acid mine drainage dataset.
+
+
+Requirements
+------------
+
+Pipasic was developed with Python 2.7.2; the following Python libraries are
+required:
+
+- Biopython 1.60
+- NumPy 1.7.1
+- matplotlib 1.2.1 (optional for graphical output)
+
+Pipasic may work with other software versions, but we tested it only with the
+configuration given above.
+
+Spectrum identification can be done with InsPecT or Tide. We used the following
+versions:
+
+- InsPecT version 20100804
+- Tide as part of Crux 1.36
+
+
+Installation
+------------
+
+Pipasic is a Python tool and does not require any installation (except the
+requirements above). Download the source code or check out the repository and
+run Pipasic from the command line by calling
+
+> python pipasic.py
+
+See below for more details.
+
+
+Usage
+-----
+
+Usage: pipasic.py SPECTRA DB [module options] [input and configuration options]
+
+Overall pipasic calling tool, including:
+- weighted (always) and unweighted (optional) similarity estimation
+- correction, using matrix from similarity estimation
+- peptide identification by InsPecT/Tide
+
+SPECTRA: Comma-separated string of spectrum files (mgf) - without file-extension!
+DB: Comma-separated string of reference proteomes (fasta-files) - without file-extension!
+ if -S or -I: decoy database must exist as db_name+"_decoy.fasta"
+
+Note: Pipasic requires two .fasta for each reference proteome <ref>:
+- "<ref>.fasta" containing the protein sequences only.
+- "<ref>_decoy.fasta" containing BOTH the protein AND decoy sequences. Decoy
+ sequences must be tagged in their name with "REVERSED" or "DECOY" or any
+ other tag specified by "-t".
+
+
+Options:
+ -h, --help show this help message and exit
+ -U, --Unweighted calculate unweighted similarities for all given
+ proteomes
+ -I, --Identify identify given spectra with all given proteomes
+ -T, --Tide use Tide instead of InsPecT
+ -V Visualize results using matplotlib
+ -o OUTFILE, --outfile=OUTFILE
+ Output filename for results. Also serves as trunk for
+ other result files (graphics, data). [default:
+ results.txt]
+ -s SPEC_DIR, --spec_dir=SPEC_DIR
+ Directory of SPECTRA (mgf) files. Search in current
+ directory, if not given. [default: none]
+ -d DB_DIR, --db_dir=DB_DIR
+ Directory of proteinDBs. Search for DB files in the
+ current directory, if not given. [default: none]
+ -m MODS, --mods=MODS A string containing all modifications in question,
+ modification choice by filename if not given.
+ [default: none]
+ -i INSP_DIR, --inspect_dir=INSP_DIR
+ Inspect directory. [default: none]
+ -f FDR, --fdr_cutoff=FDR
+ False discovery rate cut-off for identification lists.
+ [default: 0.05]
+ -t DECOY_TAG, --decoy_tag=DECOY_TAG
+ Tag to identify decoy sequences in the database.
+ Regular expressions may be used. This tag must be used
+ in the name of all decoy sequences in the file
+ "<DB>_decoy.fasta". [default: REVERSED|DECOY]
+ -l LABELS, --labels=LABELS
+ Comma-separated string of short names for organisms in
+ the reference proteomes. If not given, the file name
+ is used. [default: none]
+ -N N, --N_spectra=N Number of spectra in original dataset, comma-separated
+ list if multiple datasets. [default: none]
+ -c COUNTS, --C_spectra=COUNTS
+ File containing numbers of spectra found by
+ identification (Numpy Array dump). [default: none]
+ -q, --quiet don't print status messages to stdout
+
+
+
+Example run
+-----------
+
+Download the example dataset from Sourceforge!
+https://sourceforge.net/projects/pipasic/files/example.tar.gz/download
+
+Extract the archive into your pipasic installation and follow the instructions
+in example/README.
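+
+A typical call (with illustrative file and directory names) identifying two
+datasets against three reference proteomes and visualizing the corrected
+abundances could look like this:
+
+> python pipasic.py sample1,sample2 refA,refB,refC -I -V -s spectra/ -d ref/ -o results.txt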
+
+
+License
+-------
+
+Pipasic is Open Source! Please find detailed licensing information in the
+LICENSE file.
+
+
+Contact
+-------
+
+Please contact Bernhard Renard (renardb at rki.de) if you have questions
+concerning Pipasic.
+
diff --git a/TideProcessing.py b/TideProcessing.py
new file mode 100644
index 0000000..13a51b9
--- /dev/null
+++ b/TideProcessing.py
@@ -0,0 +1,214 @@
+"""
+Tide does not use spectrum numbers using mgf-files, therefore, files are split
+into single spectrum files which are processed by all Tide steps and recombined
+within output parsing into a list of peptide spectrum matches.
+
+TideProcessing: everything to produce a list of peptide spectrum matches
+with Tide (as done with Inspect before) from mgf-file + db(s);
+- split mgf into single spectra,
+- prepDB (tide-index) and prepSpec (msconvert -> spectrumrecords),
+- runTide (tide-search) and
+- combine single result files to an identification list by TideParser
+
+ at author: Anke Penzlin, April 2013
+"""
+
+import os
+import sys
+
+
+def splitMGF(infile, verbose=True):
+ """
+Split an mgf-file into files each containing a single spectrum, storing the
+spectrum numbers (IDs) in the corresponding filenames; return the overall
+number of spectra from the original file (= number of new files) and the
+directory of the new files.
+"""
+
+ from readMGF import storeAllSpec
+ if verbose: print "Reading spectra from", infile, "..."
+ specList = storeAllSpec(infile, verbose=verbose)
+ filepath = os.path.splitext(infile)[0]+"_singleSpec/"
+ if not os.path.exists(filepath): os.mkdir(filepath)
+ if verbose:
+ print "Writing single spectrum files to", filepath
+ sys.stdout.flush()
+ for num, spec in enumerate(specList):
+ filename = filepath+"%0.5i.mgf" %(num)
+ with open(filename,"w") as out:
+ out.write(spec)
+
+ return len(specList), filepath
+
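+# e.g. (illustrative path): nSpec, spec_dir = splitMGF("/path/to/sample.mgf")
+# writes /path/to/sample_singleSpec/00000.mgf, 00001.mgf, ... and returns the
+# spectrum count together with that directory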
+
+def prepDB_Tide(db_fasta, params="--enzyme=trypsin --digestion=partial-digest --max_missed_cleavages=1 --monoisotopic_precursor --mods_spec=C+57.021464,1NQ+0.984016,1M+15.994915"):
+ return os.system("tide-index %s --fasta=%s" %(params,db_fasta))
+
+def prepSpec_Tide(filepath, nSpec, verbose=False):
+ std_dir = os.getcwd()
+ os.chdir(filepath)
+ if verbose:
+ for nr in range(nSpec):
+ os.system("msconvert --spectrumrecords -o %s %s%0.5i.mgf" %(filepath, filepath, nr))
+ else:
+ import subprocess
+ with open(os.devnull, "w") as fnull:
+ for nr in range(nSpec):
+ command = "msconvert --spectrumrecords -o %s %s%0.5i.mgf" %(filepath, filepath, nr)
+ subprocess.call(command, stdout = fnull, stderr = fnull, shell = True)
+ os.chdir(std_dir)
+
+def runTide(filepath, nSpec, db_fasta, verbose=True, convComments=False):
+ """
+"""
+ nofile = []
+
+ if not os.path.exists(db_fasta+".pepix"):
+ if prepDB_Tide(db_fasta=db_fasta):
+ return ["database "+db_fasta+" not found",nofile]
+
+ if verbose:
+ print "searching all spectra in", db_fasta
+ sys.stdout.flush()
+ conv_counter = 0
+ std_dir = os.getcwd()
+ os.chdir(filepath)
+ for id in range(nSpec):
+ if not os.path.isfile("%0.5i.spectrumrecords"%id):
+ try:
+ if convComments:
+ os.system("msconvert --spectrumrecords %0.5i.mgf" %id)
+ else:
+ os.system("msconvert --spectrumrecords %0.5i.mgf > /dev/null" %id)
+ '''with open(os.devnull, "w") as fnull:
+ command = "msconvert --spectrumrecords %0.5i.mgf" %id
+ subprocess.call(command, stdout = fnull, stderr = fnull, shell = True)'''
+ conv_counter += 1
+ except:
+ nofile.append(id)
+ continue
+ os.system("tide-search --peptides=%s.pepix --proteins=%s.protix --spectra=%0.5i.spectrumrecords --mass_window=0.06 --results=protobuf > results" %(db_fasta,db_fasta,id))
+ os.system("tide-results --results_file=results.tideres --proteins=%s.protix --spectra=%0.5i.spectrumrecords --out_format=text --out_filename=results_%0.5i" %(db_fasta,id,id))
+ os.chdir(std_dir)
+ if verbose and conv_counter: print conv_counter, "mgf-files converted to spectrumrecords"
+ return ["", nofile]
+
+
+def TideParser(filepath, nSpec, db_name="", nofile=[], out_sep="\t", fdr_listcut=1, fdr_countcut=0.05, silent=True, verbose=True):
+ import re
+
+ score = 0.0
+ seq = ""
+ isDecoy = "false"
+ fdr = 10.0
+ peptides = []
+ notFound = [[],[]]
+
+ if verbose: print "reading the input files..."
+ for id in range(nSpec):
+ if id in nofile: continue
+ else:
+ try :
+ infile = open(filepath+"results_%0.5i.text"%id,"r")
+ lines = infile.readlines()
+ if len(lines) > 1:
+ tmp = lines[1].split('\t')
+ # get score
+ score = float(tmp[1])
+ # get sequence
+ seq = tmp[2].strip()
+ # get protein name
+ prot = lines[2].split('\t')[2]
+ # extract decoy information
+ if re.match("RRRRR",prot):
+ isDecoy = "true"
+ else: isDecoy = "false"
+ peptides.append([id,seq,score,isDecoy,fdr,prot])
+ else: notFound[0].append(id)
+ infile.close()
+ except:
+ notFound[1].append("%.5i"%id)
+ if verbose: print("File %.5i not found"%id)
+
+
+ peptides = sorted(peptides, key = lambda x: x[2], reverse=True)
+ #calculate FDR and write output file
+ countDecoys = 0.0
+ counter = 0.0
+ nonDecoyCount = 0.0
+ cutscore = peptides[0][2] + 0.01
+ fdr = float(0.0)
+ out_name = filepath[:-1] + "_" + db_name + "_TideOut_parsed.txt"
+ if verbose: print "writing the output file", out_name, "..."
+ outfile = open(out_name,"w")
+ # create header
+ outfile.write(out_sep+out_sep.join(["Spectrum","Peptid","Score","isDecoy","FDR","matchedProtein"])+"\n")
+ outfile.write(out_sep+out_sep.join(["--------","------","-----","-------","---","--------------"])+"\n")
+ for pep in peptides:
+ counter +=1
+ if pep[3] == "true":
+ countDecoys +=1
+ fdr = countDecoys/counter
+ if fdr > fdr_listcut: break
+ if fdr < fdr_countcut:
+ nonDecoyCount = counter-countDecoys
+ cutscore = pep[2]
+ pep[4] = fdr
+ outfile.write(out_sep+out_sep.join([str(e) for e in pep])+"\n")
+ outfile.close()
+ if silent: return out_name
+ else: return [out_name, nonDecoyCount, len(peptides), cutscore, notFound] #[output filename, nonDecoy, total, last score, list of unmatched spectra and unknown files]
+
+
+def runTide_all(spec_mgf, db_list, db_path="", verbose=True):
+ """
+spec_mgf: file of spectra in mascot generic format
+db_list: only name, no extension, no path! (_decoy.fasta must be available)
+db_path: directory of all dbs (_decoy.fasta must be available for each)
+verbose: if True (default), print status messages to stdout
+"""
+ nSpec, filepath = splitMGF(infile=spec_mgf, verbose=verbose)
+ counts = []
+ testout = []
+ for db in db_list:
+ db_fasta = db_path+db+"_decoy.fasta"
+ succ = runTide(filepath=filepath, nSpec=nSpec, db_fasta=db_fasta, verbose=verbose)
+ if succ[0]: # database not found
+ print succ[0]
+ results = []
+ counts.append(None)
+ else:
+ results = TideParser(filepath=filepath, nSpec=nSpec, db_name=db, nofile=succ[1], silent=False, verbose=verbose)
+ counts.append(results[1])
+ testout.append([results,succ[1]])
+ return counts, testout
+
+
+if __name__=="__main__":
+ usage = """%prog SPEC_MGF DB_LIST
+
+run Tide for each spectrum of SPEC_MGF and construct a list of peptide matches
+(including FDR) for each (decoy version of) database given in DB_LIST.
+
+SPEC_MGF dataset-file in mgf-format (full path!)
+DB_LIST ','-separated list of filenames (as a string) of proteome databases
+ in fasta-format (but without file extension!)
+"""
+
+ from optparse import OptionParser
+ # configure the command line parser
+ command_parser = OptionParser(usage=usage)
+ #command_parser.add_option("-f","--fdr",type="float",dest="fdr",default=0.05, help="FDR cut-off for count and cut-off score calculation [default: %default]")
+ #command_parser.add_option("-l","--list_fdr",type="float",dest="list_fdr",default=1, help="FDR list cut-off for restriction of output lines, overwrites fdr cut-off if smaller [default: %default (= no cut)]")
+ command_parser.add_option("-d","--db_path",type="string",dest="db_path",default="",help="common directory of all databases in db_list")
+ command_parser.add_option("-q","--quiet",action="store_false",dest="verbose",default=True,help="don't print status messages to stdout")
+ # parse arguments
+ options, args = command_parser.parse_args()
+ verbose=options.verbose
+
+ if len(args) == 2:
+ counts, testout = runTide_all(spec_mgf=args[0], db_list=args[1].split(","), db_path=options.db_path, verbose=verbose)
+ print "counts: ", counts
+ if verbose: print "control parameters:\n", testout
+ else:
+ command_parser.print_help()
+ sys.exit(1)
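+
+# Example call (illustrative paths), searching Rot.mgf against the databases
+# ./ref/refA_decoy.fasta and ./ref/refB_decoy.fasta:
+#   python TideProcessing.py -d ./ref/ /path/to/Rot.mgf refA,refB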
diff --git a/config_files/config_Inspect_MSSim.txt b/config_files/config_Inspect_MSSim.txt
new file mode 100644
index 0000000..1cf00e5
--- /dev/null
+++ b/config_files/config_Inspect_MSSim.txt
@@ -0,0 +1,11 @@
+spectra,/data/NG4/anke/MSSim/sampled_data/EHEC11128_sampl100MSSim.mgf
+instrument,FT-Hybrid
+protease,Trypsin
+DB,/data/NG4/anke/proteome/Shigellaflex_decoy.trie
+# Protecting group on cysteine:
+mod,57.021464,C
+mod,0.984016,NQ
+mod,15.994915,M
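+# maximum number of (variable) modifications allowed per peptide: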
+mods,2
+ParentPPM,10
+IonTolerance,0.8
diff --git a/config_files/config_Inspect_MSSim_py.txt b/config_files/config_Inspect_MSSim_py.txt
new file mode 100644
index 0000000..14ac7cc
--- /dev/null
+++ b/config_files/config_Inspect_MSSim_py.txt
@@ -0,0 +1,11 @@
+spectra,./ref/simulFiles/fer2_Biofilm_sampl100MSSim_sample1000.mgf
+instrument,FT-Hybrid
+protease,Trypsin
+DB,./ref/fer2_Biofilm_decoy.trie
+# Protecting group on cysteine:
+mod,57.021464,C
+mod,0.984016,NQ
+mod,15.994915,M
+mods,2
+ParentPPM,10
+IonTolerance,0.8
diff --git a/config_files/config_Inspect_py.txt b/config_files/config_Inspect_py.txt
new file mode 100644
index 0000000..ec5170b
--- /dev/null
+++ b/config_files/config_Inspect_py.txt
@@ -0,0 +1,11 @@
+spectra,/home/lindnerm/data/2013_06_PASiC/exp_zaripov/data/Rot.mgf
+instrument,FT-Hybrid
+protease,Trypsin
+DB,/home/lindnerm/data/2013_06_PASiC/exp_zaripov/ref/F_decoy.trie
+# Protecting group on cysteine
+mod,57.021464,C,fix
+mod,15.994915,M
+mod,42.010565,*,nterminal
+mods,2
+ParentPPM,10
+IonTolerance,0.8
diff --git a/config_files/config_MSSim_v1_9_short.ini b/config_files/config_MSSim_v1_9_short.ini
new file mode 100644
index 0000000..d9e1dc5
--- /dev/null
+++ b/config_files/config_MSSim_v1_9_short.ini
@@ -0,0 +1,275 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<PARAMETERS version="1.3" xsi:noNamespaceSchemaLocation="http://open-ms.sourceforge.net/schemas/Param_1_3.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <NODE name="MSSimulator" description="A highly configurable simulator for mass spectrometry experiments.">
+ <ITEM name="version" value="1.9.0" type="string" description="Version of the tool that generated this parameters file." tags="advanced" />
+ <NODE name="1" description="Instance '1' section for 'MSSimulator'">
+ <ITEMLIST name="in" type="string" description="Input protein sequences in FASTA format" tags="input file,required" restrictions="*.fasta">
+ </ITEMLIST>
+ <ITEM name="out" value="" type="string" description="output (simulated MS map) in mzML format" tags="output file" restrictions="*.mzML" />
+ <ITEM name="out_pm" value="" type="string" description="output (simulated MS map) in mzML format (picked GT)" tags="output file" restrictions="*.mzML" />
+ <ITEM name="out_fm" value="" type="string" description="output (simulated MS map) in featureXML format" tags="output file" restrictions="*.featureXML" />
+ <ITEM name="out_cm" value="" type="string" description="output (simulated MS map) in consensusXML format (grouping charge variants from a parent peptide from ESI)" tags="output file" restrictions="*.consensusXML" />
+ <ITEM name="out_lcm" value="" type="string" description="output (simulated MS map) in consensusXML format (grouping labeled variants)" tags="output file" restrictions="*.consensusXML" />
+ <ITEM name="out_cntm" value="" type="string" description="output (simulated MS map) in featureXML format (contaminants)" tags="output file" restrictions="*.featureXML" />
+ <ITEM name="log" value="" type="string" description="Name of log file (created only when specified)" tags="advanced" />
+ <ITEM name="debug" value="0" type="int" description="Sets the debug level" tags="advanced" />
+ <ITEM name="threads" value="2" type="int" description="Sets the number of threads allowed to be used by the TOPP tool" />
+ <ITEM name="no_progress" value="false" type="string" description="Disables progress logging to command line" tags="advanced" restrictions="true,false" />
+ <ITEM name="test" value="false" type="string" description="Enables the test mode (needed for internal use only)" tags="advanced" restrictions="true,false" />
+ <NODE name="algorithm" description="Algorithm parameters section">
+ <NODE name="MSSim" description="">
+ <NODE name="Digestion" description="">
+ <ITEM name="enzyme" value="Trypsin" type="string" description="Enzyme to use for digestion (select 'none' to skip digestion)" restrictions="Trypsin,none" />
+ <ITEM name="model" value="naive" type="string" description="The cleavage model to use for digestion. 'Trained' is based on a log likelihood model (see DOI:10.1021/pr060507u)." restrictions="trained,naive" />
+ <ITEM name="min_peptide_length" value="3" type="int" description="Minimum peptide length after digestion (shorter ones will be discarded)" restrictions="1:" />
+ <NODE name="model_trained" description="">
+ <ITEM name="threshold" value="0.5" type="float" description="Model threshold for calling a cleavage. Higher values increase the number of cleavages. -2 will give no cleavages, +4 almost full cleavage." restrictions="-2:4" />
+ </NODE>
+ <NODE name="model_naive" description="">
+ <ITEM name="missed_cleavages" value="1" type="int" description="Maximum number of missed cleavages considered. All possible resulting peptides will be created." restrictions="0:" />
+ </NODE>
+ </NODE>
+ <NODE name="RT" description="">
+ <ITEM name="rt_column" value="HPLC" type="string" description="Modelling of an RT or CE column" restrictions="none,HPLC,CE" />
+ <ITEM name="auto_scale" value="true" type="string" description="Scale predicted RT's/MT's to given 'total_gradient_time'? If 'true', for CE this means that 'CE:lenght_d', 'CE:length_total', 'CE:voltage' have no influence." restrictions="true,false" />
+ <ITEM name="total_gradient_time" value="2500" type="float" description="The duration [s] of the gradient." restrictions="1e-05:" />
+ <ITEM name="sampling_rate" value="2" type="float" description="Time interval [s] between consecutive scans" restrictions="0.01:60" />
+ <NODE name="scan_window" description="">
+ <ITEM name="min" value="500" type="float" description="Start of RT Scan Window [s]" restrictions="0:" />
+ <ITEM name="max" value="1500" type="float" description="End of RT Scan Window [s]" restrictions="1:" />
+ </NODE>
+ <NODE name="variation" description="Random component that simulates technical/biological variation">
+ <ITEM name="feature_stddev" value="3" type="int" description="Standard deviation of shift in retention time [s] from predicted model (applied to every single feature independently)" />
+ <ITEM name="affine_offset" value="0" type="int" description="Global offset in retention time [s] from predicted model" />
+ <ITEM name="affine_scale" value="1" type="int" description="Global scaling in retention time from predicted model" />
+ </NODE>
+ <NODE name="column_condition" description="">
+ <ITEM name="distortion" value="0" type="int" description="Distortion of the elution profiles. Good presets are 0 for a perfect elution profile, 1 for a slightly distorted elution profile etc... For trapping instruments (e.g. Orbitrap) distortion should be >4." restrictions="0:10" />
+ </NODE>
+ <NODE name="profile_shape" description="">
+ <NODE name="width" description="Width of the EGH elution shape, i.e. the sigma^2 parameter, which is computed using 'value' + rnd_cauchy('variance')">
+ <ITEM name="value" value="9" type="float" description="Width of the Exponential Gaussian Hybrid distribution shape of the elution profile. This does not correspond directly to the width in [s]." restrictions="0:" />
+ <ITEM name="variance" value="1.6" type="float" description="Random component of the width (set to 0 to disable randomness), i.e. scale parameter for the lorentzian variation of the variance (Note: The scale parameter has to be >= 0)." restrictions="0:" />
+ </NODE>
+ <NODE name="skewness" description="Skewness of the EGH elution shape, i.e. the tau parameter, which is computed using 'value' + rnd_cauchy('variance')">
+ <ITEM name="value" value="0.1" type="float" description="Asymmetric component of the EGH. Higher absolute(!) values lead to more skewness (negative values cause fronting, positive values cause tailing). Tau parameter of the EGH, i.e. time constant of the exponential decay of the Exponential Gaussian Hybrid distribution shape of the elution profile." />
+ <ITEM name="variance" value="0.3" type="float" description="Random component of skewness (set to 0 to disable randomness), i.e. scale parameter for the lorentzian variation of the time constant (Note: The scale parameter has to be > 0)." restrictions="0:" />
+ </NODE>
+ </NODE>
+ <NODE name="HPLC" description="">
+ <ITEM name="model_file" value="examples/simulation/RTPredict.model" type="string" description="SVM model for retention time prediction" />
+ </NODE>
+ <NODE name="CE" description="">
+ <ITEM name="pH" value="3" type="float" description="pH of buffer" restrictions="0:14" />
+ <ITEM name="alpha" value="0.5" type="float" description="Exponent Alpha used to calculate mobility" restrictions="0:1" />
+ <ITEM name="mu_eo" value="0" type="float" description="Electroosmotic flow" restrictions="0:5" />
+ <ITEM name="lenght_d" value="70" type="float" description="Length of capillary [cm] from injection site to MS" restrictions="0:1000" />
+ <ITEM name="length_total" value="75" type="float" description="Total length of capillary [cm]" restrictions="0:1000" />
+ <ITEM name="voltage" value="1000" type="float" description="Voltage applied to capillary" restrictions="0:" />
+ </NODE>
+ </NODE>
+ <NODE name="Detectability" description="">
+ <ITEM name="dt_simulation_on" value="false" type="string" description="Modelling detectibility enabled? This can serve as a filter to remove peptides which ionize badly, thus reducing peptide count" restrictions="true,false" />
+ <ITEM name="min_detect" value="0.5" type="float" description="Minimum peptide detectability accepted. Peptides with a lower score will be removed" />
+ <ITEM name="dt_model_file" value="examples/simulation/DTPredict.model" type="string" description="SVM model for peptide detectability prediction" />
+ </NODE>
+ <NODE name="Ionization" description="">
+ <NODE name="esi" description="">
+ <ITEMLIST name="ionized_residues" type="string" description="List of residues (as three letter code) that will be considered during ES ionization. The N-term is always assumed to carry a charge. This parameter will be ignored during MALDI ionization." restrictions="Ala,Cys,Asp,Glu,Phe,Gly,His,Ile,Lys,Leu,Met,Asn,Pro,Gln,Arg,Sec,Ser,Thr,Val,Trp,Tyr">
+ <LISTITEM value="Arg"/>
+ <LISTITEM value="Lys"/>
+ <LISTITEM value="His"/>
+ </ITEMLIST>
+ <ITEMLIST name="charge_impurity" type="string" description="List of charged ions that contribute to charge with weight of occurrence (their sum is scaled to 1 internally), e.g. ['H:1'] or ['H:0.7' 'Na:0.3'], ['H:4' 'Na:1'] (which internally translates to ['H:0.8' 'Na:0.2'])">
+ <LISTITEM value="H+:1"/>
+ </ITEMLIST>
+ <ITEM name="max_impurity_set_size" value="3" type="int" description="Maximal #combinations of charge impurities allowed (each generating one feature) per charge state. E.g. assuming charge=3 and this parameter is 2, then we could choose to allow '3H+, 2H+Na+' features (given a certain 'charge_impurity' constraints), but no '3H+, 2H+Na+, 3Na+'" tags="advanced" />
+ <ITEM name="ionization_probability" value="0.8" type="float" description="Probability for the binomial distribution of the ESI charge states" />
+ </NODE>
+ <NODE name="maldi" description="">
+ <ITEMLIST name="ionization_probabilities" type="float" description="List of probabilities for the different charge states during MALDI ionization (the list must sum up to 1.0)">
+ <LISTITEM value="0.9"/>
+ <LISTITEM value="0.1"/>
+ </ITEMLIST>
+ </NODE>
+ <NODE name="mz" description="">
+ <ITEM name="lower_measurement_limit" value="200" type="float" description="Lower m/z detector limit." restrictions="0:" />
+ <ITEM name="upper_measurement_limit" value="2500" type="float" description="Upper m/z detector limit." restrictions="0:" />
+ </NODE>
+ </NODE>
+ <NODE name="RawSignal" description="">
+ <ITEM name="enabled" value="true" type="string" description="Enable RAW signal simulation? (select 'false' if you only need feature-maps)" restrictions="true,false" />
+ <ITEM name="peak_shape" value="Gaussian" type="string" description="Peak Shape used around each isotope peak (be aware that the area under the curve is constant for both types, but the maximal height will differ (~ 2:3 = Lorentz:Gaussian) due to the wider base of the Lorentzian." restrictions="Gaussian,Lorentzian" />
+ <NODE name="resolution" description="">
+ <ITEM name="value" value="50000" type="int" description="Instrument resolution at 400 Th." />
+ <ITEM name="type" value="linear" type="string" description="How does resolution change with increasing m/z?! QTOFs usually show 'constant' behavior, FTs have linear degradation, and on Orbitraps the resolution decreases with square root of mass." restrictions="constant,linear,sqrt" />
+ </NODE>
+ <NODE name="baseline" description="Baseline modeling for MALDI ionization">
+ <ITEM name="scaling" value="0" type="float" description="Scale of baseline. Set to 0 to disable simulation of baseline." restrictions="0:" />
+ <ITEM name="shape" value="0.5" type="float" description="The baseline is modeled by an exponential probability density function (pdf) with f(x) = shape*e^(- shape*x)" restrictions="0:" />
+ </NODE>
+ <NODE name="mz" description="">
+ <ITEM name="sampling_points" value="3" type="int" description="Number of raw data points per FWHM of the peak." restrictions="2:" />
+ </NODE>
+ <NODE name="contaminants" description="">
+ <ITEM name="file" value="examples/simulation/contaminants.csv" type="string" description="Contaminants file with sum formula and absolute RT interval. See 'OpenMS/examples/simulation/contaminants.txt' for details." />
+ </NODE>
+ <NODE name="variation" description="Random components that simulate biological and technical variations of the simulated data.">
+ <NODE name="mz" description="Shifts in mass to charge dimension of the simulated signals.">
+ <ITEM name="error_stddev" value="0" type="float" description="Standard deviation for m/z errors. Set to 0 to disable simulation of m/z errors." />
+ <ITEM name="error_mean" value="0" type="float" description="Average systematic m/z error (Da)" />
+ </NODE>
+ <NODE name="intensity" description="Variations in intensity to model randomness in feature intensity.">
+ <ITEM name="scale" value="100" type="float" description="Constant scale factor of the feature intensity. Set to 1.0 to get the real intensity values provided in the FASTA file." restrictions="0:" />
+ <ITEM name="scale_stddev" value="0" type="float" description="Standard deviation of peak intensity (relative to the scaled peak height). Set to 0 to get simple rescaled intensities." restrictions="0:" />
+ </NODE>
+ </NODE>
+ <NODE name="noise" description="Parameters modeling noise in mass spectrometry measurements.">
+ <NODE name="shot" description="Parameters of Poisson and Exponential for shot noise modeling (set :rate OR :mean = 0 to disable).">
+ <ITEM name="rate" value="0" type="float" description="Poisson rate of shot noise per unit m/z. Set this to 0 to disable simulation of shot noise." restrictions="0:" />
+ <ITEM name="intensity-mean" value="1" type="float" description="Shot noise intensity mean (exponentially distributed with given mean)." />
+ </NODE>
+ <NODE name="white" description="Parameters of Gaussian distribution for white noise modeling (set :mean AND :stddev = 0 to disable).">
+ <ITEM name="mean" value="0" type="float" description="Mean value of white noise being added to each measured signal." />
+ <ITEM name="stddev" value="0" type="float" description="Standard deviation of white noise being added to each measured signal." />
+ </NODE>
+ <NODE name="detector" description="Parameters of Gaussian distribution for detector noise modeling (set :mean AND :stddev = 0 to disable).">
+ <ITEM name="mean" value="0" type="float" description="Mean value of the detector noise being added to the complete measurement." />
+ <ITEM name="stddev" value="0" type="float" description="Standard deviation of the detector noise being added to the complete measurement." />
+ </NODE>
+ </NODE>
+ </NODE>
+ <NODE name="RawTandemSignal" description="">
+ <ITEM name="status" value="precursor" type="string" description="Create Tandem-MS scans?" restrictions="disabled,precursor,MS^E" />
+ <ITEM name="tandem_mode" value="2" type="int" description="Algorithm to generate the tandem-MS spectra. 0 - fixed intensities, 1 - SVC prediction (abundant/missing), 2 - SVR prediction of peak intensity #br#" restrictions="0:2" />
+ <ITEM name="svm_model_set_file" value="examples/simulation/SvmModelSet.model" type="string" description="File containing the Filenames of SVM Models for different charge variants" />
+ <NODE name="Precursor" description="">
+ <ITEM name="ms2_spectra_per_rt_bin" value="5" type="int" description="Number of allowed MS/MS spectra in a retention time bin." restrictions="1:" />
+ <ITEM name="min_peak_distance" value="2" type="float" description="The minimal distance (in Da) of two peaks in one spectrum so that they can be selected." restrictions="0:" />
+ <ITEM name="selection_window" value="2" type="float" description="All peaks within a mass window (in Da) of a selected peak are also selected for fragmentation." restrictions="0:" />
+ <ITEM name="exclude_overlapping_peaks" value="true" type="string" description="If true overlapping or nearby peaks (within min_peak_distance) are excluded for selection." restrictions="true,false" />
+ <ITEMLIST name="charge_filter" type="int" description="Charges considered for MS2 fragmentation." restrictions="1:5">
+ <LISTITEM value="2"/>
+ <LISTITEM value="3"/>
+ </ITEMLIST>
+ <NODE name="Exclusion" description="">
+ <ITEM name="use_dynamic_exclusion" value="true" type="string" description="If true dynamic exclusion is applied." restrictions="true,false" />
+ <ITEM name="exclusion_time" value="100" type="float" description="The time (in seconds) a feature is excluded." restrictions="0:" />
+ </NODE>
+ </NODE>
+ <NODE name="MS_E" description="">
+ <ITEM name="add_single_spectra" value="false" type="string" description="If true, the MS2 spectra for each peptide signal are included in the output (might be a lot). They will have a meta value 'MS_E_debug_spectra' attached, so they can be filtered out. Full MS_E spectra will have 'MS_E_FullSpectrum' instead." restrictions="true,false" />
+ </NODE>
+ <NODE name="TandemSim" description="">
+ <NODE name="Simple" description="">
+ <ITEM name="add_isotopes" value="false" type="string" description="If set to 1 isotope peaks of the product ion peaks are added" restrictions="true,false" />
+ <ITEM name="max_isotope" value="2" type="int" description="Defines the maximal isotopic peak which is added, add_isotopes must be set to 1" />
+ <ITEM name="add_metainfo" value="false" type="string" description="Adds the type of peaks as metainfo to the peaks, like y8+, [M-H2O+2H]++" restrictions="true,false" />
+ <ITEM name="add_losses" value="true" type="string" description="Adds common losses to those ion expect to have them, only water and ammonia loss is considered" restrictions="true,false" />
+ <ITEM name="add_precursor_peaks" value="true" type="string" description="Adds peaks of the precursor to the spectrum, which happen to occur sometimes" restrictions="true,false" />
+ <ITEM name="add_abundant_immonium_ions" value="false" type="string" description="Add most abundant immonium ions" restrictions="true,false" />
+ <ITEM name="add_first_prefix_ion" value="false" type="string" description="If set to true e.g. b1 ions are added" restrictions="true,false" />
+ <ITEM name="add_y_ions" value="true" type="string" description="Add peaks of y-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="add_b_ions" value="true" type="string" description="Add peaks of b-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="add_a_ions" value="false" type="string" description="Add peaks of a-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="add_c_ions" value="false" type="string" description="Add peaks of c-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="add_x_ions" value="false" type="string" description="Add peaks of x-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="add_z_ions" value="false" type="string" description="Add peaks of z-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="y_intensity" value="1" type="float" description="Intensity of the y-ions" />
+ <ITEM name="b_intensity" value="1" type="float" description="Intensity of the b-ions" />
+ <ITEM name="a_intensity" value="1" type="float" description="Intensity of the a-ions" />
+ <ITEM name="c_intensity" value="1" type="float" description="Intensity of the c-ions" />
+ <ITEM name="x_intensity" value="1" type="float" description="Intensity of the x-ions" />
+ <ITEM name="z_intensity" value="1" type="float" description="Intensity of the z-ions" />
+ <ITEM name="relative_loss_intensity" value="0.1" type="float" description="Intensity of loss ions, in relation to the intact ion intensity" />
+ <ITEM name="precursor_intensity" value="1" type="float" description="Intensity of the precursor peak" />
+ <ITEM name="precursor_H2O_intensity" value="1" type="float" description="Intensity of the H2O loss peak of the precursor" />
+ <ITEM name="precursor_NH3_intensity" value="1" type="float" description="Intensity of the NH3 loss peak of the precursor" />
+ </NODE>
+ <NODE name="SVM" description="">
+ <ITEM name="add_isotopes" value="false" type="string" description="If set to 1 isotope peaks of the product ion peaks are added" restrictions="true,false" />
+ <ITEM name="max_isotope" value="2" type="int" description="Defines the maximal isotopic peak which is added, add_isotopes must be set to 1" />
+ <ITEM name="add_metainfo" value="false" type="string" description="Adds the type of peaks as metainfo to the peaks, like y8+, [M-H2O+2H]++" restrictions="true,false" />
+ <ITEM name="add_first_prefix_ion" value="false" type="string" description="If set to true e.g. b1 ions are added" restrictions="true,false" />
+ <ITEM name="hide_y_ions" value="false" type="string" description="Add peaks of y-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="hide_y2_ions" value="false" type="string" description="Add peaks of y-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="hide_b_ions" value="false" type="string" description="Add peaks of b-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="hide_b2_ions" value="false" type="string" description="Add peaks of b-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="hide_a_ions" value="false" type="string" description="Add peaks of a-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="hide_c_ions" value="false" type="string" description="Add peaks of c-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="hide_x_ions" value="false" type="string" description="Add peaks of x-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="hide_z_ions" value="false" type="string" description="Add peaks of z-ions to the spectrum" restrictions="true,false" />
+ <ITEM name="hide_losses" value="false" type="string" description="Adds common losses to those ion expect to have them, only water and ammonia loss is considered" restrictions="true,false" />
+ <ITEM name="y_intensity" value="1" type="float" description="Intensity of the y-ions" />
+ <ITEM name="b_intensity" value="1" type="float" description="Intensity of the b-ions" />
+ <ITEM name="a_intensity" value="1" type="float" description="Intensity of the a-ions" />
+ <ITEM name="c_intensity" value="1" type="float" description="Intensity of the c-ions" />
+ <ITEM name="x_intensity" value="1" type="float" description="Intensity of the x-ions" />
+ <ITEM name="z_intensity" value="1" type="float" description="Intensity of the z-ions" />
+ <ITEM name="relative_loss_intensity" value="0.1" type="float" description="Intensity of loss ions, in relation to the intact ion intensity" />
+ </NODE>
+ </NODE>
+ </NODE>
+ <NODE name="Global" description="">
+ <ITEM name="ionization_type" value="ESI" type="string" description="Type of Ionization (MALDI or ESI)" restrictions="MALDI,ESI" />
+ </NODE>
+ <NODE name="Labeling" description="">
+ <ITEM name="type" value="labelfree" type="string" description="Select the labeling type you want for your experiment" restrictions="ICPL,SILAC,itraq,labelfree,o18" />
+ <NODE name="ICPL" description="">
+ <ITEM name="ICPL_fixed_rtshift" value="0" type="float" description="Fixed retention time shift between labeled pairs. If set to 0.0 only the retention times, computed by the RT model step are used." />
+ <ITEM name="label_proteins" value="true" type="string" description="Enables protein-labeling. (select 'false' if you only need peptide-labeling)" restrictions="true,false" />
+ <ITEM name="ICPL_light_channel_label" value="UniMod:365" type="string" description="UniMod Id of the light channel ICPL label." tags="advanced" />
+ <ITEM name="ICPL_medium_channel_label" value="UniMod:687" type="string" description="UniMod Id of the medium channel ICPL label." tags="advanced" />
+ <ITEM name="ICPL_heavy_channel_label" value="UniMod:364" type="string" description="UniMod Id of the heavy channel ICPL label." tags="advanced" />
+ </NODE>
+ <NODE name="SILAC" description="">
+ <ITEM name="fixed_rtshift" value="0.0001" type="float" description="Fixed retention time shift between labeled peptides. If set to 0.0 only the retention times computed by the RT model step are used." restrictions="0:" />
+ <NODE name="medium_channel" description="Modifications for the medium SILAC channel.">
+ <ITEM name="modification_lysine" value="UniMod:481" type="string" description="Modification of Lysine in the medium SILAC channel" />
+ <ITEM name="modification_arginine" value="UniMod:188" type="string" description="Modification of Arginine in the medium SILAC channel" />
+ </NODE>
+ <NODE name="heavy_channel" description="Modifications for the heavy SILAC channel. If you want to use only 2 channels, just leave the Labels as they are and provide only 2 input files.">
+ <ITEM name="modification_lysine" value="UniMod:259" type="string" description="Modification of Lysine in the heavy SILAC channel. If left empty, two channelSILAC is assumed." />
+ <ITEM name="modification_arginine" value="UniMod:267" type="string" description="Modification of Arginine in the heavy SILAC channel. If left empty, twochannel SILAC is assumed." />
+ </NODE>
+ </NODE>
+ <NODE name="itraq" description="">
+ <ITEM name="iTRAQ" value="4plex" type="string" description="4plex or 8plex iTRAQ?" restrictions="4plex,8plex" />
+ <ITEM name="reporter_mass_shift" value="0.1" type="float" description="Allowed shift (uniformly distributed - left to right) in Da from the expected position (of e.g. 114.1, 115.1)" restrictions="0:0.5" />
+ <ITEMLIST name="channel_active_4plex" type="string" description="Four-plex only: Each channel that was used in the experiment and its description (114-117) in format <channel>:<name>, e.g. "114:myref","115:liver".">
+ <LISTITEM value="114:myReference"/>
+ </ITEMLIST>
+ <ITEMLIST name="channel_active_8plex" type="string" description="Eight-plex only: Each channel that was used in the experiment and its description (113-121) in format <channel>:<name>, e.g. "113:myref","115:liver","118:lung".">
+ <LISTITEM value="113:myReference"/>
+ </ITEMLIST>
+ <ITEMLIST name="isotope_correction_values_4plex" type="string" description="override default values (see Documentation); use the following format: <channel>:<-2Da>/<-1Da>/<+1Da>/<+2Da> ; e.g. '114:0/0.3/4/0' , '116:0.1/0.3/3/0.2' " tags="advanced">
+ <LISTITEM value="114:0/1/5.9/0.2"/>
+ <LISTITEM value="115:0/2/5.6/0.1"/>
+ <LISTITEM value="116:0/3/4.5/0.1"/>
+ <LISTITEM value="117:0.1/4/3.5/0.1"/>
+ </ITEMLIST>
+ <ITEMLIST name="isotope_correction_values_8plex" type="string" description="override default values (see Documentation); use the following format: <channel>:<-2Da>/<-1Da>/<+1Da>/<+2Da> ; e.g. '113:0/0.3/4/0' , '116:0.1/0.3/3/0.2' " tags="advanced">
+ <LISTITEM value="113:0/0/6.89/0.22"/>
+ <LISTITEM value="114:0/0.94/5.9/0.16"/>
+ <LISTITEM value="115:0/1.88/4.9/0.1"/>
+ <LISTITEM value="116:0/2.82/3.9/0.07"/>
+ <LISTITEM value="117:0.06/3.77/2.99/0"/>
+ <LISTITEM value="118:0.09/4.71/1.88/0"/>
+ <LISTITEM value="119:0.14/5.66/0.87/0"/>
+ <LISTITEM value="121:0.27/7.44/0.18/0"/>
+ </ITEMLIST>
+ <ITEM name="Y_contamination" value="0.3" type="float" description="Efficiency of labeling tyrosine ('Y') residues. 0=off, 1=full labeling" restrictions="0:1" />
+ </NODE>
+ <NODE name="o18" description="">
+ <ITEM name="labeling_efficiency" value="1" type="float" description="Describes the distribution of the labeled peptide over the different states (unlabeled, mono- and dilabeled)" restrictions="0:1" />
+ </NODE>
+ </NODE>
+ </NODE>
+ <NODE name="RandomNumberGenerators" description="Parameters for generating the random aspects (e.g. noise) in the simulated data. The generation is separated into two parts, the technical part, like noise in the raw signal, and the biological part, like systematic deviations in the predicted retention times.">
+ <ITEM name="biological" value="random" type="string" description="Controls the 'biological' randomness of the generated data (e.g. systematic effects like deviations in RT). If set to 'random' each experiment will look different. If set to 'reproducible' each experiment will have the same outcome (given that the input data is the same)." restrictions="reproducible,random" />
+ <ITEM name="technical" value="random" type="string" description="Controls the 'technical' randomness of the generated data (e.g. noise in the raw signal). If set to 'random' each experiment will look different. If set to 'reproducible' each experiment will have the same outcome (given that the input data is the same)." restrictions="reproducible,random" />
+ </NODE>
+ </NODE>
+ </NODE>
+ </NODE>
+</PARAMETERS>
diff --git a/inspectparser.py b/inspectparser.py
new file mode 100644
index 0000000..0442059
--- /dev/null
+++ b/inspectparser.py
@@ -0,0 +1,117 @@
+"""
+Extract selected information from InsPecT output files and produce a uniformly
+formatted peptide identification (PSM) list for further analyses.
+
+ at author: Anke Penzlin, June 2013 (developed since May 2012)
+"""
+# get peptides (+ corresponding protein names) from Inspect output
+# store specID_sequence_score_isDecoy_fdr_proteinName in a tabular txt-file
+
+import re
+import optparse
+import sys
+import os
+
+
+def parseInspect(inpath, fdr_listcut=1, fdr_countcut=0.05, multipleFiles=True, silent=True):
+
+ out_name = inpath[:-4]+"_parsed.txt"
+
+ score = 0.0
+ sequence = ""
+ specID = ""
+ isDecoy= ""
+ peps = []
+ nonDecoyCount = 0
+ reliable = True
+
+ #print "Reading the input ..."
+ # get [specID ,sequence ,score,isDecoy]
+ infile = open(inpath,"r")
+ for line in infile:
+ if not line.startswith('#'):
+ if multipleFiles:
+ f_name = os.path.splitext(os.path.basename(line.split("\t")[0]))[0]
+ specID = f_name + ".." + line.split("\t")[1]
+ else: specID = int(line.split("\t")[1])
+ score = float(line.split("\t")[5]) # MQScore (choose best one)
+ sequence = line.split("\t")[2].split(".")[1]
+ sequence = sequence.replace("+","")
+ sequence = re.sub('[0-9]', '', sequence)#sequence.replace('[0-9]',"")
+ protein = line.split("\t")[3]
+
+ if "REVERSED" in protein: isDecoy = "true"
+ else: isDecoy = "false"
+
+ peps.append([specID, sequence, score, isDecoy, protein])
+
+ peps = sorted(peps, key = lambda x : (x[0], x[2]), reverse = True)
+
+ lastspec = -1
+ newpeps = []
+
+ # store important peptides in list new_peps
+ for elem in peps:
+ if elem[0] != lastspec:
+ newpeps.append(elem)
+ lastspec = elem[0]
+
+ peps = []
+ newpeps = sorted(newpeps, key = lambda x : (x[2],x[3]=="false"), reverse = True)
+
+ cutscore = newpeps[0][2] + 0.01
+
+ #calculate FDR
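+ # (target-decoy estimate: walking down the score-sorted list, the FDR at
+ # each cut-off is the number of decoy hits divided by all hits seen so far)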
+ #print "Calculating FDR and writing the output in : "+ out_name
+ countDecoys = 0.0
+ counter = 0.0
+ #fdr = float(0.0)
+ outfile = open(out_name,"w")
+ outfile.write("\t Spectrum\t Peptid\t Score\t isDecoy\t FDR\t matchedProtein \n")
+ outfile.write("\t --------\t ------\t -----\t -------\t ---\t -------------- \n")
+ for elm in newpeps:
+ counter +=1
+ if elm[3] == "true":
+ countDecoys +=1
+ fdr = countDecoys/counter
+ if fdr > fdr_listcut: break
+ if fdr > fdr_countcut: reliable = False
+ if reliable:
+ nonDecoyCount = counter-countDecoys
+ cutscore = elm[2]
+ outfile.write("\t"+str(elm[0])+"\t"+elm[1]+"\t")#
+ outfile.write(str(elm[2])+"\t"+elm[3]+"\t"+str(fdr)+"\t"+elm[4])
+ outfile.write("\n")
+
+ outfile.close()
+ if silent: return out_name
+ else: return [out_name, nonDecoyCount, len(newpeps), cutscore] #[output filename, nonDecoy count, total, cut-off score]
+
+if __name__=="__main__":
+ usage = """%prog INSPECT_OUT
+
+compute FDR from InsPecT output (txt-file).
+
+"""
+
+ # configure the command line parser
+ command_parser = optparse.OptionParser(usage=usage)
+ command_parser.add_option("-f","--fdr",type="float",dest="fdr",default=0.05, help="FDR cut-off for count and cut-off score calculation [default: %default]")
+ command_parser.add_option("-l","--list_fdr",type="float",dest="list_fdr",default=1, help="FDR list cut-off for restriction of output lines, overwrites fdr cut-off if smaller [default: %default (= no cut)]")
+ command_parser.add_option("-m","--multifile",action="store_true",dest="multifile",default=False,help="use filename to tag spectrum numbers for subsequent mixture of multiple different files")
+ command_parser.add_option("-s","--silent",action="store_true",dest="short_out",default=False,help="only print output filename instead of all calculated values")#"don't print status messages to stdout")
+ # parse arguments
+ options, args = command_parser.parse_args()
+ silent = options.short_out
+
+ if len(args) == 1:
+ inspect_out = args[0]
+ if silent: print parseInspect(inpath=inspect_out, fdr_listcut=options.list_fdr, fdr_countcut=options.fdr, multipleFiles=options.multifile, silent=silent)
+ else:
+ outname, count, total, score = parseInspect(inpath=inspect_out, fdr_listcut=options.list_fdr, fdr_countcut=options.fdr, multipleFiles=options.multifile, silent=silent)
+ print "%i of %i spectra within %.2f FDR, last score: %.3f" %(count, total, options.fdr, score)
+ print "selected identification lines written to:", outname
+
+ else:
+ command_parser.print_help()
+ sys.exit(1)
diff --git a/pipasic.py b/pipasic.py
new file mode 100644
index 0000000..5106d95
--- /dev/null
+++ b/pipasic.py
@@ -0,0 +1,301 @@
+"""
+Aggregation of pipasic functionalities (by import) to a single command line tool.
+
+ at author: Anke Penzlin, June 2013
+ at lastChange: January 2014
+"""
+
+#import re
+#import random
+#from Bio import SeqIO
+import matplotlib.pyplot as plt
+import numpy as np
+import sys
+import os
+
+
+if __name__=="__main__":
+ usage = """%prog SPECTRA DB [module options] [input and configuration options]
+
+Overall pipasic calling tool, including:
+- weighted (always) and unweighted (optional) similarity estimation
+ (both sequence based)
+- correction, using matrix from similarity estimation
+- peptide identification by InsPecT or Tide
+
+SPECTRA: Comma-separated string of spectrum files (mgf) - without file-extension!
+DB: Comma-separated string of reference proteomes (fasta-files) - without file-extension!
+ if -S or -I: decoy database must exist as db_name+"_decoy.fasta"
+
+Note: Pipasic requires two .fasta for each reference proteome <ref>:
+- "<ref>.fasta" containing the protein sequences only.
+- "<ref>_decoy.fasta" containing BOTH the protein AND decoy sequences. Decoy
+ sequences must be tagged in their name with "REVERSED" or "DECOY" or any
+ other tag specified by "-t".
+"""
+ # enable command line parsing
+ from optparse import OptionParser
+ optparser = OptionParser(usage=usage)
+
+ # modules:
+ optparser.add_option("-U","--Unweighted",action="store_true",dest="unw",default=False,help="calculate unweighted similarities for all given proteomes")
+ optparser.add_option("-I","--Identify",action="store_true",dest="id",default=False,help="identify given spectra with all given proteomes")
+ optparser.add_option("-T","--Tide",action="store_true",dest="Tide",default=False,help="use Tide instead of InsPecT")
+ optparser.add_option("-V",action="store_true",dest="viz",default=False,help="Visualize results using matplotlib")
+ #optparser.add_option("-S","--Simulation",action="store_true",dest="simul",default=False,help="use simulation to calculate unweighted similarities, sequence based otherwise")
+ #optparser.add_option("-W","--Weighted",action="store_true",dest="wtd",default=True,help="calculate weighted similarities for a dataset and all given proteomes")
+
+ # input parameters:
+ optparser.add_option('-o', '--outfile', type='string', dest='outfile', default="results.txt", help='Output filename for results. Also serves as trunk for other result files (graphics, data). [default: %default]')
+ optparser.add_option('-s', '--spec_dir', type='string', dest='spec_dir', default=None, help='Directory of SPECTRA (mgf) files. Search in current directory, if not given. [default: %default]')
+ optparser.add_option('-d', '--db_dir', type='string', dest='db_dir', default=None, help='Directory of proteinDBs. Search for DB files in the current directory, if not given. [default: %default]')
+ optparser.add_option('-m', '--mods', type='string', dest='mods', default=None, help='A string containing all modifications in question; if not given, modifications are chosen by filename. [default: %default]')
+ optparser.add_option('-i', '--inspect_dir', type='string', dest='insp_dir', default=None, help='Inspect directory. [default: %default]')
+ optparser.add_option('-f', '--fdr_cutoff', type='float', dest='fdr', default=0.05, help='False discovery rate cut-off for identification lists. [default: %default]')# -l : list_fdr_cutoff ??
+ optparser.add_option('-t', '--decoy_tag', type='str', dest='decoy_tag', default="REVERSED|DECOY", help='Tag to identify decoy sequences in the database. Regular expressions may be used. This tag must be used in the name of all decoy sequences in the file "<DB>_decoy.fasta". [default: %default]')
+ #optparser.add_option('-n', '--numberspectra', type='int', dest='nspec', default=1000, help='number of samples from simulated spectra. [default: %default]')
+ optparser.add_option('-l', '--labels', type='string', dest='labels', default=None, help='Comma-separated string of short names for organisms in the reference proteomes. If not given, the file name is used. [default: %default]')
+ optparser.add_option('-N', '--N_spectra', type='string', dest='N', default=None, help='Number of spectra in original dataset, comma-separated list if multiple datasets. [default: %default]')
+ optparser.add_option('-c', '--C_spectra', type='string', dest='counts', default=None, help='File containing numbers of spectra found by identification (Numpy Array dump). [default: %default]')
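+ # The '-c' counts file is a plain numpy dump; an illustrative way to create
+ # one by hand (hypothetical counts, rows = datasets, columns = proteomes):
+ # np.array([[1200, 300], [150, 2000]]).dump("my_counts.dat")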
+
+ # flags:
+ optparser.add_option("-q","--quiet",action="store_false",dest="verbose",default=True,help="don't print status messages to stdout")
+
+ # parse options and arguments
+ options, args = optparser.parse_args()
+ #if not any([options.id, options.unw, options.simul , options.wtd]):
+ # optparser.print_help()
+ # sys.exit(1)
+
+ # process documentation
+ '''if options.time_mes:
+ import time
+ t0 = time.time()
+ print time.strftime("%Y/%m/%d, %H:%M:%S: Starting whole procedure and overall time measurement", time.localtime(t0))
+ sys.stdout.flush()'''
+
+ # scanning some options
+ verbose = options.verbose
+ unw = options.unw
+ wtd = True
+ correction = True
+ fdr = options.fdr
+ decoy_tag = options.decoy_tag
+ outfile = options.outfile
+ if not outfile:
+ print "No output filename given. Writing all output to working directory, 'output'-files\n"
+ outfile = "output.txt"
+ labels = options.labels
+ if labels:
+ labels = labels.split(",")
+ Counts = options.counts
+ config_path = os.path.dirname(os.path.realpath(__file__))+"/config_files/"
+
+ if not options.spec_dir:
+ options.spec_dir = ""
+ if not options.db_dir:
+ options.db_dir = ""
+ if not options.mods:
+ options.mods = ""
+ if not options.insp_dir:
+ options.insp_dir = ""
+
+
+ # Identify spectra [-I]
+ if options.id:
+ if len(args) == 2:
+ spectra = args[0].split(',')
+ db_list = args[1].split(',')
+ countList = []
+ if options.Tide:
+ from TideProcessing import runTide_all
+ for spec in spectra:
+ print "Started Tide for "+spec
+ sample_counts = runTide_all(spec_mgf=options.spec_dir+spec+".mgf", db_list=db_list, db_path=options.db_dir, verbose=verbose)[0]
+ countList.append(list(sample_counts))
+ else:
+ from runInspect_user_config import runInspect_config
+ insp_config = config_path+"config_Inspect_py.txt"
+ runInspect_config(spectra=spectra, DBs=db_list, spec_path=options.spec_dir, db_path=options.db_dir, inspect_dir=options.insp_dir, conf=insp_config, user_mods=options.mods) # output and/or variable? <- may not be necessary (since spectra enough)
+ from inspectparser import parseInspect
+ correction = False
+ for spec in spectra:
+ sample_counts = []
+ for DB in db_list:
+ # compose filename of spectra and database
+ inspectOut = options.spec_dir+spec +"_"+ DB +"_InspectOut.txt"
+ try:
+ count = parseInspect(inpath=inspectOut,fdr_countcut=fdr,silent=False)[1]
+ sample_counts.append(count)
+ correction = True
+ except:
+ print "Could not find PSM-file for %s and %s" %(spec,DB)
+ sample_counts.append(0)
+ countList.append(sample_counts)
+ np.array(countList).dump(os.path.splitext(outfile)[0]+"_counts.dat")
+ else:
+ print "Aborting. - Spectra and reference proteomes needed as the arguments!"
+ optparser.print_help()
+ sys.exit(1)
+
+
+ # unweighted similarity (sequence based) [-U]
+ if unw:
+ if len(args) > 1:
+ spectra = args[0].split(',')
+ db_list = args[1].split(',')
+ elif len(args) > 0:
+ db_list = args[0].split(',')
+ else:
+ print "Aborting. - Reference proteomes needed as (only or 2nd) argument!"
+ optparser.print_help()
+ sys.exit(1)
+ dbList = ",".join([options.db_dir+db+".fasta" for db in db_list])
+ from trypticpeptides import unweightedMatrix
+ simiMat_unw = unweightedMatrix(dbList=dbList, verbose=verbose)
+ np.array(simiMat_unw).dump(os.path.splitext(outfile)[0]+"_M_unw.dat")
+
+ # weighted similarity (sequence based)
+ if wtd:
+ if len(args) == 2:
+ spectra = args[0].split(',')
+ db_list = args[1].split(',')
+ dbList = ",".join([options.db_dir+db+".fasta" for db in db_list])
+ else:
+ print "Aborting. - Spectra and reference proteomes needed as the arguments!"
+ optparser.print_help()
+ sys.exit(1)
+ from trypticpeptides import weightedMatrix
+ M_list_wtd = []
+ for spec in spectra:
+ M_list_wtd.append(weightedMatrix(spectra_name=options.spec_dir+spec, dbList=dbList, fdr=fdr, init_weight=0, verbose=verbose, Tide=options.Tide, decoy_tag=decoy_tag))
+ np.array(M_list_wtd).dump(os.path.splitext(outfile)[0]+"_mList_wtd.dat")
+
+ # prep correction (1) - load/generate list of counts
+ if correction:
+ from PASiC import PASiC
+ if Counts:
+ Counts = np.load(Counts)
+ elif options.id:
+ Counts = np.array(countList)
+ elif options.Tide:
+ print "Correction impossible! - With Tide, do the identification in the same run as the correction or provide (with '-c') counts from a previous run."
+ correction = False
+ else:
+ from inspectparser import parseInspect
+ countList = []
+ for spec in spectra:
+ sample_counts = []
+ for DB in db_list:
+ # compose filename of spectra and database
+ inspectOut = options.spec_dir+spec +"_"+ DB +"_InspectOut.txt"
+ try:
+ count = parseInspect(inpath=inspectOut,fdr_countcut=fdr,silent=False)[1]
+ sample_counts.append(count)
+ except:
+ print "Could not find PSM-file for %s and %s" %(spec,DB)
+ sample_counts.append(0)
+ countList.append(sample_counts)#np.transpose(sample_counts))
+ Counts = np.array(countList)
+ Counts.dump(os.path.splitext(outfile)[0]+"_counts.dat")
+
+ # prep correction (2) - generate list of sample sizes
+ if correction:
+ if options.N:
+ N_list = [int(N) for N in options.N.split(",")]
+ if len(spectra) != len(N_list) or not all(N_list):
+ print "Number of spectra required for each input dataset!"
+ print "datasets:",len(spectra),"dataset sizes:",len(N_list)
+ correction = False
+ elif len(args) > 1:
+ def countSpec_mgf(spec):
+ s_path = os.path.join(options.spec_dir, spec+".mgf")
+ nr_spec = os.popen('grep -c "END IONS" %s'%s_path).read()
+ print "\n",s_path, nr_spec, "\n\n"
+ return int(nr_spec)
+ try:
+ N_list = [countSpec_mgf(s) for s in spectra]
+ except:
+ print "Correction impossible! - input spectra could not be counted, number required with '-N'."
+ correction = False
+ else:
+ print "Correction impossible! - Number of input spectra required with '-N'."
+ correction = False
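+ # Note: countSpec_mgf above shells out to grep; a pure-Python fallback
+ # with the same result could look like this (illustrative sketch):
+ # def countSpec_mgf(spec):
+ #     with open(os.path.join(options.spec_dir, spec+".mgf")) as f:
+ #         return sum(1 for line in f if "END IONS" in line)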
+
+ # run correction
+ if correction:
+ normCorr = []
+ norm_wtd = []
+ wtd_cnts = []
+ for i,spec in enumerate(spectra):
+ N = N_list[i]
+ if unw:
+ corrAbunds, normCorr_i = PASiC(simiMat=simiMat_unw, counts=Counts[i], N=N)
+ normCorr.append(normCorr_i)
+ if wtd:
+ wtd_Abunds, norm_wtd_i = PASiC(simiMat=M_list_wtd[i], counts=Counts[i], N=N)
+ wtd_cnts.append(wtd_Abunds * N)
+ norm_wtd.append(norm_wtd_i)
+
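+ # Shapes for orientation: Counts is (n_datasets x n_proteomes) and each
+ # similarity matrix is (n_proteomes x n_proteomes); wtd_cnts[i] rescales
+ # the corrected relative abundances back to spectrum counts via N.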
+
+ # Write all results to text file
+ oFile = open(outfile, 'w')
+
+ # process documentation
+ '''if options.time_mes:
+ oFile.write(time.strftime("%Y/%m/%d, %H:%M:%S\n", time.localtime(time.time())))
+ '''
+ ops = ["Identification", "Unweighted Similarity", "Weighted Similarity", "Correction"]
+ for i,op in enumerate([options.id, unw, wtd, correction]):
+ if op:
+ oFile.write(ops[i]+"\tdone.\n")
+
+ #resulting values
+ if correction:
+ if not labels:
+ labels = db_list
+ for i,spec in enumerate(spectra):
+ oFile.write("Results for dataset '%s':\n"%spec)
+ if unw:
+ out_table = np.array(zip(labels, Counts[i], wtd_cnts[i], norm_wtd[i], normCorr[i]))
+ oFile.write("\t".join(["Name", "Raw counts", "Corrected Counts", "Relative abundance", "Rel. abundance unweighted corr."])+"\n")
+ else:
+ out_table = np.array(zip(labels, Counts[i], wtd_cnts[i], norm_wtd[i]))
+ oFile.write("\t".join(["Name", "Raw counts", "Corrected Counts", "Relative abundance"])+"\n")
+ for line in out_table:
+ oFile.write("\t".join([str(val) for val in line])+"\n")
+ oFile.write("\n")
+
+ oFile.write("Counts saved to "+os.path.splitext(outfile)[0]+"_counts.dat\n")
+ if wtd:
+ if len(db_list)+len(spectra) < 7: oFile.write(str(M_list_wtd)+"\n")
+ oFile.write("List of weighted matrices saved to "+os.path.splitext(outfile)[0]+"_mList_wtd.dat\n")
+ if unw:
+ if len(db_list) < 7: oFile.write(str(simiMat_unw)+"\n")
+ oFile.write("Unweighted matrix saved to "+os.path.splitext(outfile)[0]+"_M_unw.dat\n")
+ oFile.write("All done.\n\n")
+ oFile.close()
+
+ # show results [-V]
+ if options.viz:
+ if correction and len(spectra) == 1: #!!!
+ from plotting import barplotPASiC
+ if not labels:
+ labels = db_list
+ if not unw:
+ normCorr = norm_wtd[0]
+ norm_wtd = []
+ else:
+ normCorr = normCorr[0]
+ norm_wtd = norm_wtd[0]
+ barplotPASiC(names=labels, obs=np.divide(Counts[0], float(sum(Counts[0]))), corr=normCorr, wtd=norm_wtd, filename=os.path.splitext(outfile)[0]+"_correction.pdf")
+ print "Graphical output written to:",os.path.splitext(outfile)[0]+"_correction.pdf"
+ elif correction:
+ print "Visualization is only provided for a single dataset."
+
+ '''if options.time_mes:
+ t1 = time.time()
+ print time.strftime("%Y/%m/%d, %H:%M:%S: all done\n", time.localtime(t1))
+ print "took %.3f seconds overall (that's %.3f minutes)"%(t1 - t0, (t1 - t0)/60)
+ print "--------------------------------------------------------------------------------\n\n\n"
+ '''
diff --git a/plotting.py b/plotting.py
new file mode 100644
index 0000000..2492638
--- /dev/null
+++ b/plotting.py
@@ -0,0 +1,68 @@
+"""
+Barplot for given observed counts and correction results (optional plus weighted
+correction) for a number of proteomes.
+
+ at author: Anke Penzlin, June 2013
+"""
+
+import sys
+
+import numpy as np
+
+def barplotPASiC(names, obs, corr, wtd=[], filename='results_correction.pdf',
+ leg_tags=['Observed fraction','Estimated fraction','Fraction estimated by weighting']):
+ """
+ names: organism names for labeling (x-axis) and legend; must have same
+ length as the data lists (number of organisms)
+ obs: observed counts, normalized to sum=1 for all organisms
+ corr: results of (unweighted) correction, normalized sum=1 for all orgs
+ wtd: results of weighted correction, normalized to sum=1 for all orgs;
+ if [] (default), only obs and corr will be plotted
+ filename: outputfile (image format like .png or .pdf), 'results_correction.pdf'
+ is created in working directory by default
+ leg_tags: description of data given in obs, corr and wtd for legend
+ """
+ import matplotlib.pyplot as plt
+ rng = np.arange(len(names))
+ cl1 = np.array([215, 25, 28])/255.
+ cl2 = np.array([253, 174, 97])/255.
+ cl3 = np.array([171, 221, 164])/255.
+ cl4 = np.array([63, 151, 206])/255.
+
+ plt.figure()
+ plt.bar(3*rng, obs, 1, color=cl1, label=leg_tags[0])
+ plt.bar(3*rng+1, corr, 1, color=cl4, label=leg_tags[1])
+ if wtd != []: plt.bar(3*rng+2, wtd,1, color=cl2, label=leg_tags[2])
+ #plt.text(0.5, unique[0]+ 0.1*unique[0], "Unique / all mapped reads", color='k', rotation=90, horizontalalignment='center', verticalalignment='bottom')
+ #plt.text(1.5, corr[0]*total+ 0.1*unique[0], "GASiC estimate", rotation=90, horizontalalignment='center', verticalalignment='bottom')
+ plt.xticks(3*rng+1, names)#, rotation=90)
+ plt.ylabel('Fraction of PSMs')
+ plt.xlim(xmin=0, xmax=3*len(names))
+ plt.ylim(ymax=1.5)#ymin=-0.05*total)
+ plt.legend(loc='upper left')
+ plt.savefig(filename, bbox_inches='tight')
+
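+# Illustrative usage of barplotPASiC (hypothetical numbers, not from any
+# real dataset):
+#   barplotPASiC(names=["orgA","orgB"], obs=[0.7,0.3], corr=[0.55,0.45],
+#                wtd=[0.6,0.4], filename="demo_correction.pdf")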
+
+if __name__ == "__main__":
+ usage = """%prog OBSERVED CORRECTED [WEIGHTED]
+
+plot PASiC results.
+"""
+
+ from optparse import OptionParser
+ command_parser = OptionParser(usage=usage)
+ command_parser.add_option("-d","--dir",type="string",dest="f_dir",default="",help="common directory of all input files. [default: No common path]")
+ command_parser.add_option("-o","--out_file",type="string",dest="outfile",default="results_correction.pdf",help="specify an output filename for the produced graphic. [default: %default]")
+ command_parser.add_option("-t","--tags",type="string",dest="tags",default="Observed fraction,Estimated fraction,Fraction estimated by weighting",help="string of ','-separated legend line names. [default: %default]")
+ command_parser.add_option("-n","--names",type="string",dest="names",default="",help="string of ','-separated proteome names. [default: org_1,..,org_n]")
+ options, args = command_parser.parse_args()
+
+ if len(args) < 2:
+ command_parser.print_help()
+ sys.exit(1)
+ obs = np.load(options.f_dir+args[0])
+ corr = np.load(options.f_dir+args[1])
+ if len(args) > 2: wtd = np.load(options.f_dir+args[2])
+ else: wtd = []
+ input_tags = options.tags.split(",")
+ if options.names: names = options.names.split(",")
+ else: names = ["org%i" for i in range(len(obs))]
+ barplotPASiC(names=names, obs=obs, corr=corr, wtd=wtd, filename=options.outfile, leg_tags=input_tags)
diff --git a/readMGF.py b/readMGF.py
new file mode 100644
index 0000000..a597011
--- /dev/null
+++ b/readMGF.py
@@ -0,0 +1,389 @@
+"""
+Processing spectra files in mascot generic format
+-------------------------------------------------
+
+Including:
+storeAllSpec:
+ spectra from mgf-file to a python list of strings
+sampleMGF:
+ use list of spectra to write a certain number of them to a new mgf-file
+filterCharge:
+ filter spectra from mgf-file, removing those with charge > 3
+plotQ (+ auxiliary function Quantile):
+ visualize match qualities from Inspect results
+getFilterIDs:
+ specialized sampling procedure according to (Inspect) quality scores;
+ spectrum IDs are sampled, facilitating selection of the corresponding
+ peptides in different formats (filterFDR for mgf,
+ trypticPeptides.filterParsed for identifications)
+filterFDR:
+ from a given mgf-file select spectra according to spectrum IDs obtained from
+ getFilterIDs
+"""
+
+import re
+import random
+import numpy as np
+#import os # former use in creating new filenames (in filterFDR)
+
+
+def storeAllSpec(mgf, verbose=True):
+ """
+ write spectra from a given mgf-file to a python list of strings.
+ Each spectrum is delimited by BEGIN IONS and END IONS statements and will
+ be represented by a string including line breaks.
+ (return: list of spectra as mgf-strings)
+
+ mgf: filename of input spectra in mascot generic format (mgf)
+ """
+ if verbose: print "reading spectra from %s ..." %mgf
+ MGF_handle = open(mgf,"r")
+ specList = [] # initialize output list
+ for line in MGF_handle:
+ spec = ""
+ if re.search("BEGIN IONS",line): #start of next spectrum
+ spec += line
+ for line2 in MGF_handle: #add lines of current spectrum description
+ spec += line2
+ if re.search("END IONS",line2): #end of current spectrum
+ spec += "\n"
+ break
+ specList.append(spec) # add full spectrum description as list entry
+ if verbose: print "found %i spectra" %len(specList)
+ return specList
+
+
+def sampleMGF(infile, n, outfile = ""):
+ """
+ randomly choose a given number of spectra from an mgf file.
+ (return: filename of sampling result)
+ using: storeAllSpec
+
+ infile: mgf-file of input spectra
+ n: desired number of randomly sampled spectra; if 0, no sampling
+ outfile: name of output file; if "", a filename including n is generated
+ """
+ spectra = storeAllSpec(infile) # write spectra from input file to a list
+ nprot = len(spectra) # overall number of provided spectra
+ #print "Found %i spectra" % nprot
+ if n == 0 or n > nprot:
+ print "Take the complete list of %i spectra" %nprot
+ return infile
+ else:
+ rnum = random.sample(range(nprot),n) # sample n list indices
+ if outfile == "": outfile = infile[:-4] + "_sample"+str(n)+".mgf"
+ print 'Writing '+str(n)+' spectra to '+outfile+" ..."
+ f_out = open(outfile,"w")
+ for z in rnum: # write file of spectra corresponding to sampled indices
+ f_out.write(spectra[z])
+ f_out.close()
+ print "done"
+ return outfile
+
+
+def filterCharge(spectra, value=3, outfile="", mgf=True):
+ """
+ from a given mgf-file select spectra with charge <= 3 (for Inspect)
+
+ spectra: mgf-file containing the spectra (default) or list object as
+ produced in storeAllSpec() with mgf=False
+ value: upper limit to restrict charge, default=3 (used with Inspect)
+ outfile: name of new mgf-file; if "", a filename is generated if an input
+ filename is given in spectra
+ mgf: specification of input type; True (default) if spectra contains an
+ mgf-file, False for a list object as produced in storeAllSpec()
+ """
+ if mgf: # read data from file
+ if outfile=="": outfile = spectra[:-4] + "_charge_max"+str(value)+".mgf"
+ spectra = storeAllSpec(spectra)
+ print "Writing spectra with charge < "+str(value+1)+" to "+outfile+" ..."
+ handle_out = open(outfile,"w")
+ for entry in spectra: # write spectra except with charge > value to outfile
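+ # assumption: the CHARGE line looks like "CHARGE=2+", so the single
+ # digit right after "CHARGE=" is compared against the cut-off value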
+ if int(entry[entry.find("CHARGE=")+7]) < value+1:
+ handle_out.write(entry)
+ handle_out.close()
+ print "done"
+ return outfile
+
+
+# java-InspectParser-Output!
+def plotQ(scoreTab, outfile="testplot.pdf", fdrfile="", java=True, fdrcut=0.05):
+ """
+ produce a histogram of Inspect quality scores with marked FDR cut-off score
+ and quantiles at lower 75% and 95%.
+ (return: name of the file containing the resulting figure)
+ requires: (the first two) output files from java-InspectParser
+ - peptide table (tableFilename) containing quality scores and
+ - summary of identification list (extractFilename) containing FDR
+ cut-off score (if not given, histogram without marked cut-off)
+ using: Quantile
+
+ scoreTab: tableFilename; path to the peptide spectrum match file including
+ a table of '|Spectrum|Peptid|Score|isDecoy|FDR|'
+ outfile: output file, different image file formats possible; if no
+ specification given, create "testplot.pdf"
+ fdrfile: extractFilename; path to the summary file including
+ 'score:'cut-off score in its second line
+ """
+
+ #import matplotlib.mlab as mlab
+ import matplotlib.pyplot as plt # required plotting library
+ scoreList = [] # initialize list of score values
+ if java: sep = "|"
+ else:
+ sep = "\t"
+ fdrScore = 120 # sentinel start value, replaced by the lowest score within the FDR cut
+ with open(scoreTab) as f: # specific for table file (skip 2 lines header)
+ next(f)
+ next(f)
+ if java:
+ for line in f: # read score value from column 3 of each identification
+ scoreList.append(float(line.split(sep)[3]))
+ else:
+ for line in f: # read score value from column 3 of each identification
+ recScore = float(line.split(sep)[3])
+ scoreList.append(recScore)
+ if float(line.split(sep)[5]) < fdrcut and recScore < fdrScore:
+ fdrScore = recScore
+ plt.clf() # initialize figure
+ n, bins, patches = plt.hist(scoreList, 50, facecolor='green', alpha=0.75) # histogram of scores
+ plt.axvline(Quantile(scoreList,0.95),color='grey',lw=1.3) # add lower 95%-quantile
+ plt.axvline(Quantile(scoreList,0.75),color='grey',lw=1.3) # add lower 75%-quantile
+ if java:
+ if fdrfile != "": # if no FDR-file given, no cut-off score will be marked
+ with open(fdrfile) as f: # get cut-off score (from fdrfile)
+ ls = f.readlines()
+ fdrScore = float(ls[1][6:]) # defined position of score in fdrfile
+ plt.axvline(fdrScore,color='b') # show cut-off score in blue
+ else: plt.axvline(fdrScore,color='b',lw=1.3) # show cut-off score in blue
+ plt.savefig(outfile) # resulting figure will not be displayed but saved to file
+ return outfile
+
+def Quantile(data, q, precision=1.0):
+ """
+ Returns the q-th quantile (q given as a fraction, e.g. 0.95) of the distribution given in the argument
+ 'data'. Uses the 'precision' parameter to control the noise level.
+ source: https://gist.github.com/1840407
+ [an alternative would be scipy.stats.mstats.mquantiles(), which does NOT
+ yield exactly the same results(!)]
+ """
+
+ N, bins = np.histogram(data, bins=int(precision*np.sqrt(len(data)))) # bin count must be an int
+ norm_cumul = 1.0*N.cumsum() / len(data)
+
+ return bins[norm_cumul > q][0]
+
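+# Quick illustrative check: for ~1000 samples from a standard normal,
+# Quantile(data, 0.95) should land near 1.64 (up to histogram noise):
+# data = np.random.randn(1000); print Quantile(data, 0.95)
+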
+
+def readIdentificationFile(infile, sep="\t"):
+ """
+ read lines from a peptide identification file and store the column split version
+ of those lines (without header) in a list.
+ (return: list of peptide identifications, each as a list of column fields)
+
+ infile: file containing identification lines (like output of
+ InspectParser_FDR_uniq.py or InspectParser.java Table)
+ sep: separator for splitting input identification lines,
+ default="\t" for python parser output, "|" for java Tab
+ """
+
+ with open(infile) as f: peptList = [line.split(sep) for line in f]
+ del peptList[0:2]
+ return peptList
+
+def getFilterIDs_direct(scoreTab, fdrcut=0.05, sep="\t", maxN=1000):
+ """
+ specialized sampling procedure for spectra from different datasets;
+ auxiliary function for filterFDR and trypticPeptides.filterParsed.
+ The FDR cut-off divides the identification list into peptides considered
+ reliable (higher score) and peptides with a lower score (not further analysed).
+ Spectra are sampled equally from both parts.
+ (return: list of sampled SpecIDs, number of samples from each list part)
+ using: readIdentificationFile
+
+ scoreTab: tableFilename; path to the peptide spectrum match file including
+ a table of 'sepSpectrumsepPeptidsepScoresepisDecoysepFDRsep[...]'
+ fdrcut: cut-off controlling FDR in identification lists
+ sep: separator for split of scoreTab input lines,
+ default="\t" for python parser output, "|" for java Tab
+ maxN: desired sampling number of spectrum IDs from each list part,
+ if one part contains fewer spectra, the maximum possible number is
+ used (default: 1000)
+ """
+ #from bootstrapping import readIdentificationFile
+ ls = readIdentificationFile(infile=scoreTab, sep=sep)
+ ls.sort(key = lambda x : float(x[3]), reverse = True)
+ # split input list into SpecIDs of entries with scores above / under FDR
+ ls1 = [] # high-score (reliable) part of identification list
+ ls2 = [] # sub-cut-off score part of identification list
+ counter = 0.0
+ countDecoys = 0.0
+ fdr = 0.0
+ for line in ls:
+ # ignore file name if prefix of id (concatenated by '..'); ids must be unique (no mixed files)
+ id = int(line[1].split("..")[1]) if len(line[1].split("..")) > 1 else int(line[1])
+ counter +=1
+ if line[4] == "true":
+ countDecoys +=1
+ fdr = countDecoys/counter
+ if fdr < fdrcut:
+ ls1.append(id)
+ else: ls2.append(id)
+ n = min(len(ls1),len(ls2),maxN) # calculate the maximum list length
+ rnum1 = random.sample(ls1,n) # sample the same number from lower and
+ rnum2 = random.sample(ls2,n) # from upper list part
+ return rnum1+rnum2, n # concatenated list of sampled SpecIDs, number of samples from each part
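+
+# Worked micro-example of the running decoy FDR used above (illustrative):
+#   scores (sorted desc): 9.1   8.7   8.2   7.9
+#   isDecoy:              no    no    no    yes
+#   running FDR:          0/1   0/2   0/3   1/4
+# with fdrcut=0.2 the first three spectrum IDs go to the reliable part (ls1),
+# the last one to ls2; n = min(len(ls1), len(ls2), maxN) IDs are then
+# sampled from each part.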
+
+'''
+# based on getFilterIDs using a score value independent from identification list
+# actually never used, since filterIDs are always taken from the same list as score
+def getFilterIDs_val(fdrScore, scoreTab, sep="\t", maxN=1000, verbose=True):
+ """
+ specialized sampling procedure for spectra from different datasets;
+ auxiliary function for filterFDR and trypticPeptides.filterParsed.
+ The FDR cut-off score divides the identification list into peptides considered
+ reliable (higher score) and peptides with a lower score (not further analysed).
+ Spectra are sampled equally from both parts.
+ (return: list of sampled SpecIDs, number of samples from each list part)
+ using: bootstrapping.readIdentificationFile
+
+ fdrScore: a float-value defining the FDR cut-off score
+ scoreTab: tableFilename; path to the peptide spectrum match file including
+ a table of 'sepSpectrumsepPeptidsepScoresepisDecoysepFDRsep[...]'
+ maxN: desired sampling number of spectrum IDs from each list part,
+ if one part contains less spectra the maximum possible number is
+ used (default: 1000)
+ sep: separator for split of scoreTab input lines,
+ default="\t" for python parser output, "|" for java Tab
+ verbose: if True, status messages and comments are printed to stdout
+ """
+ from bootstrapping import readIdentificationFile
+
+ if verbose: print "split list of spectra by cut-off score %f" %fdrScore
+ # read SpecIDs and corresponding scores from scoreTab
+ ls = readIdentificationFile(infile=scoreTab, sep=sep)
+ # split input list into SpecIDs of entries with scores above / under FDR
+ ls1 = [] # sub-cut-off score part of identification list
+ ls2 = [] # high score part of identification list
+ for en in ls:
+ if float(en[3]) < fdrScore:
+ ls1.append(en[1])
+ else: ls2.append(en[1])
+ n = min(len(ls1),len(ls2),maxN) # calculate the maximum list length
+ rnum1 = random.sample(ls1,n) # sample the same number from lower and
+ rnum2 = random.sample(ls2,n) # from upper list part
+ return rnum1+rnum2, n # concatenated list of sampled SpecIDs, number of samples from each part
+
+# auxiliary function connecting old getFilterIDs input to new getFilterIDs_val
+# actually never used
+def getCutScore(fdrfile, fdrcut=0.05, java=False, verbose=True):
+ """
+ get cut-off score from java-InspectParser output extractFile or
+ python-InspectParser identification list
+
+ fdrfile: if java, extractFilename; path to the summary file including
+ 'score:'cut-off score in its second line;
+ else, identification list filename
+ """
+ if verbose: print "get cut-off score from "+fdrfile
+ if java:
+ with open(fdrfile) as f:
+ ls = f.readlines()
+ fdrScore = float(ls[1][6:])
+ else:
+ from bootstrapping import readIdentificationFile
+ ls = readIdentificationFile(infile=fdrfile, sep="\t")
+ ls.sort(key = lambda x : float(x[3]), reverse = True)
+ counter = 0.0
+ countDecoys = 0.0
+ fdr = 0.0
+ for line in ls:
+ counter +=1
+ if line[4] == "true":
+ countDecoys +=1
+ fdr = countDecoys/counter
+ if fdr > fdrcut: break
+ fdrScore = line[3]
+ return fdrScore
+'''
+
+# deprecated since strongly relying on java-InspectParser output
+def getFilterIDs(fdrfile, scoreTab, maxN=1000, verbose=True):
+ """
+ specialized sampling procedure for spectra from different datasets;
+ auxiliary function for filterFDR and trypticPeptides.filterParsed.
+ The FDR cut-off score (from fdrfile) divides the identification list into
+ peptides considered reliable (higher score) and peptides with a lower score
+ (not further analysed). Spectra are sampled equally from both parts.
+ (return: list of sampled SpecIDs, number of samples from each list part)
+ requires: (the first two) output files from java-InspectParser
+ (extractFilename and tableFilename)
+
+ fdrfile: extractFilename; path to the summary file including
+ 'score:'cut-off score in its second line
+ scoreTab: tableFilename; path to the peptide spectrum match file including
+ a table of '|Spectrum|Peptid|Score|isDecoy|FDR|'
+ maxN: desired sampling number of spectrum IDs from each list part,
+ if one part contains fewer spectra, the maximum possible number is
+ used (default: 1000)
+ """
+ # get cut-off score (from fdrfile)
+ if verbose: print "get cut-off score from "+fdrfile
+ with open(fdrfile) as f:
+ ls = f.readlines()
+ fdrScore = float(ls[1][6:])
+ if verbose: print "split list of spectra by cut-off score %f" %fdrScore
+ # read SpecIDs and corresponding scores from scoreTab
+ with open(scoreTab) as f:
+ ls = [line.rstrip().split("|") for line in f]
+ del ls[0:2] # skip header (2 lines)
+ # split input list into SpecIDs of entries with scores above / under FDR
+ ls1 = [] # sub-cut-off score part of identification list
+ ls2 = [] # high score part of identification list
+ for en in ls:
+ if float(en[3]) < fdrScore:
+ ls1.append(en[1])
+ else: ls2.append(en[1])
+ n = min(len(ls1),len(ls2),maxN) # calculate the maximum list length
+ rnum1 = random.sample(ls1,n) # sample the same number from lower and
+ rnum2 = random.sample(ls2,n) # from upper list part
+ return rnum1+rnum2, n # concatenated list of sampled SpecIDs, number of samples from each part
+
+def filterFDR(spectra, scoreTab, fdrfile="", maxN=1000, verbose=True):
+ """
+ from a given mgf-file select spectra according to corresponding Inspect
+ scores and write them to a new mgf-file. For more information on the
+ specialized sampling procedure see getFilterIDs.
+ (return: new filename)
+ requires: (the first two) output files from java-InspectParser
+ (extractFilename and tableFilename)
+ using: storeAllSpec, getFilterIDs
+
+ spectra: mgf-file containing exactly the spectra used for Inspect analysis
+ resulting in fdrfile and scoreTab using java-InspectParser
+ (SpecIDs must be consistent)
+ scoreTab: tableFilename; path to the peptide spectrum match file including
+ a table of '|Spectrum|Peptid|Score|isDecoy|FDR|'
+ fdrfile: extractFilename; path to the summary file including
+ 'score:'cut-off score in its second line
+ maxN: desired number of samples from each part of identification list,
+ maximum in case of a shorter list part (default=1000)
+ """
+
+ # read input mgf-file and create a python list of all spectra
+ allSpectra = storeAllSpec(spectra, verbose=verbose)
+ # sample spectrum numbers (SpecIDs)
+ if fdrfile: SpecIDs, n = getFilterIDs(fdrfile=fdrfile, scoreTab=scoreTab, maxN=maxN, verbose=verbose)
+ else: SpecIDs, n = getFilterIDs_direct(scoreTab=scoreTab, maxN=maxN)
+ # include number of contained spectra into new filename
+ newfile = spectra[:-4] + "_filter" + str(2*n) + ".mgf" # since filterFDR processes only mgf-files
+ #newfile = os.path.splitext(spectra)[0] + "_filter" + str(2*n) + os.path.splitext(spectra)[1] # obsolete
+ if verbose: print 'Writing '+str(n)+"+"+str(n)+' spectra to '+newfile+" ..."
+ outfile = open(newfile,"w")
+ # take SpecIDs as indices to write sampled entries of allSpectra to file
+ for z in SpecIDs:
+ outfile.write(allSpectra[int(z)])
+ outfile.close()
+ if verbose: print "done"
+ return newfile
diff --git a/runInspect_user_config.py b/runInspect_user_config.py
new file mode 100644
index 0000000..5a3e382
--- /dev/null
+++ b/runInspect_user_config.py
@@ -0,0 +1,164 @@
+"""
+Automatized configuration and execution of Inspect peptide identification for
+a list of spectrum files and a list of reference proteomes. Specifications of
+posttranslational modifications can either be directly passed by the user or
+assigned to the dataset by its filename (if dataset group is already known).
+
+ at author: Anke Penzlin, June 2013
+"""
+
+import re
+import os
+import sys
+import optparse
+
+#from InspectParser_FDRcut import parseInspect
+from simulation_based_similarity import prepDB, run_inspect
+
+def runInspect_config(spectra, DBs, spec_path, db_path="/data/NG4/anke/proteome/",
+ inspect_dir = "/home/franziska/bin/Inspect/",
+ conf = "/data/NG4/anke/Inspect/config_Inspect_py.txt", user_mods=""):
+ """
+ run Inspect for each pair of spectrum dataset and proteome database using
+ modifications according to the dataset in the configuration file.
+ """
+
+ rngDB = range(len(DBs))
+ rngSpc = range(len(spectra))
+ #simMat = [ [0 for i in rngDB] for j in rngSpc ] # initializing output
+
+ for i in rngSpc:
+ specs = spec_path+spectra[i]+".mgf"
+ for j in rngDB:
+ db_j = db_path+DBs[j]+"_decoy.trie"
+ # create trie if necessary
+ if not os.path.exists(db_j):
+ go_on = prepDB(db_path+DBs[j]+"_decoy.fasta", path=inspect_dir)
+ if not go_on: return
+ inspect_out = specs[:-4] +"_"+DBs[j]+"_InspectOut.txt"
+
+ # prepare configfile
+ conf_out = open(conf,'w')
+ conf_out.write("spectra,"+specs+"\n")
+ conf_out.write("instrument,FT-Hybrid\n")
+ conf_out.write("protease,Trypsin\n")
+ conf_out.write("DB,"+db_j+"\n")
+ if not user_mods == "":
+ conf_out.write(user_mods)
+ elif re.search("Lacto_131",spectra[i]):
+ conf_out.write("mod,46.0916,C,fix\n")
+ conf_out.write("mod,15.994915,M\n")
+ conf_out.write("# iTraq\n")
+ conf_out.write("mod,144.1544,K,fix\n")
+ conf_out.write("mod,144.1544,*,nterminal\n")
+ print "modifications according to acc. nr. 13105-13162"
+ sys.stdout.flush()
+ elif re.search("Shigelladys",spectra[i]):
+ conf_out.write("mod,46.0916,C,fix\n")
+ conf_out.write("mod,15.994915,M\n")
+ print "modifications according to http://www.biomedcentral.com/1471-2180/11/147#sec2"
+ sys.stdout.flush()
+ else:
+ conf_out.write("# Protecting group on cysteine\n")
+ conf_out.write("mod,57.021464,C,fix\n")
+ if re.search("Bacicer_113",spectra[i]):
+ conf_out.write("mod,15.994915,M\n")
+ print "modifications according to acc. nr. 11339-11362"
+ sys.stdout.flush()
+ elif re.search("Bacisub_175",spectra[i]):
+ conf_out.write("mod,15.994915,M\n")
+ conf_out.write("mod,119.1423,C\n")
+ conf_out.write("mod,396.37,C\n")
+ print "modifications according to acc. nr. 17516-17659"
+ sys.stdout.flush()
+ elif re.search("Ecoli_12",spectra[i]):
+ conf_out.write("mod,32,M,opt\n")
+ print "modifications according to acc. nr. 12189-12199"
+ sys.stdout.flush()
+ elif re.search("Strepyo_1923",spectra[i]):
+ conf_out.write("mod,15.994915,M\n")
+ conf_out.write("mod,79.9799,STY\n")
+ print "modifications according to acc. nr. 19230/19231"
+ sys.stdout.flush()
+ elif re.search("CPXV_",spectra[i]):
+ conf_out.write("mod,15.994915,M\n")#oxidation
+ conf_out.write("mod,42.010565,*,nterminal\n")#acetylation
+ print "modifications according to standard configuration (for pox)"
+ elif re.search("MSSim",spectra[i]):
+ conf_out.write("mod,0.984016,NQ\n")
+ conf_out.write("mod,15.994915,M\n")
+ print "modifications according to (simulation) standard configuration"
+ sys.stdout.flush()
+ else:
+ conf_out.write("mod,15.994915,M\n")#oxidation
+ conf_out.write("mod,42.010565,*,nterminal\n")#acetylation
+ #conf_out.write("mod,0.984016,NQ\n")
+ print "modifications according to (unspecified) standard configuration"
+ sys.stdout.flush()
+ conf_out.write("mods,2\n")
+ if re.search("Shigelladys",spectra[i]):
+ conf_out.write("PMTolerance,1.4\n")
+ conf_out.write("IonTolerance,0.5\n")
+ conf_out.write("MultiCharge,3\n")
+ else:
+ conf_out.write("ParentPPM,10\n")
+ conf_out.write("IonTolerance,0.8\n")
+ conf_out.close()
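+
+ # For orientation, a generated config file might look like this
+ # (illustrative paths; one "key,value" pair per line):
+ # spectra,/path/to/sample.mgf
+ # instrument,FT-Hybrid
+ # protease,Trypsin
+ # DB,/path/to/ref_decoy.trie
+ # mod,57.021464,C,fix
+ # mod,15.994915,M
+ # mods,2
+ # ParentPPM,10
+ # IonTolerance,0.8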
+
+ # run Inspect
+ if re.search( "Ecoli_12", spectra[i] ):
+ AA_file = inspect_dir + "AminoAcidMasses_15N.txt"
+ if os.path.exists(AA_file):
+ run_inspect(conf, inspect_out, inspect_dir, "-a "+AA_file)
+ print "amino acid masses according to 15N (because of special e.coli data set)."
+ sys.stdout.flush()
+ else:
+ run_inspect(conf, inspect_out, inspect_dir)
+ print "WARNING: file containing amino acid masses according to 15N not found!\nDatabase search using usual file disregarding special e.coli data set)."
+ sys.stdout.flush()
+ else:
+ run_inspect(conf, inspect_out, inspect_dir)
+ # evaluate results from Inspect to calculate an FDR-matrix
+ #simMat[i][j] = parseInspect(inspect_out)[2]
+
+ #for line in simMat:
+ # print line
+
+
+if __name__=="__main__":
+ usage = """%prog SPECTRA DB_LIST -s SPEC_DIR -d DB_DIR
+
+run InsPecT (for multiple spectrum datasets and references),
+using known modification options (assigned by filename),
+and calculate FDR-corrected identification counts from InsPecT output.
+
+SPECTRA: ','-separated spectrum-filenames (mgf-format) without file extension
+DB_LIST: ','-separated proteome-filenames (fasta-format) without file extension
+
+"""
+
+ # configure the parser
+ optparser = optparse.OptionParser(usage=usage)
+ optparser.add_option('-s', '--specdir', type='string', dest='spec_dir', default="/data/NG4/anke/spectra/", help='directory of specFiles (absolute path!). [default: %default]')
+ optparser.add_option('-d', '--dbdir', type='string', dest='db_dir', default="/data/NG4/anke/proteome/", help='directory of proteinDBs (absolute path!). [default: %default]')
+ optparser.add_option('-c', '--configfile', type='string', dest='config', default="/data/NG4/anke/Inspect/config_Inspect_py.txt", help='a txt-file for Inspect configuration, will be written. [default: %default]')
+ optparser.add_option('-m', '--mods', type='string', dest='mods', default="", help='a string containing all modifications in question, modification choice by filename if "". [default: %default]')
+ optparser.add_option('-i', '--inspect_dir', type='string', dest='ins_dir', default="/home/franziska/bin/Inspect", help='directory of Inspect.exe. [default: %default]')
+
+ # parse options and arguments
+ options, args = optparser.parse_args()
+
+ if len(args) == 2:
+ spectra = args[0].split(',')
+ db_list = args[1].split(',')
+ else:
+ optparser.print_help()
+ sys.exit(1)
+
+ '''db_path = options.db_dir
+ spec_path = options.spec_dir
+ configfile = options.config # Inspect configuration file
+ mods = options.mods
+ inspect_dir = options.ins_dir'''
+ runInspect_config(spectra=spectra, DBs=db_list, spec_path=options.spec_dir, db_path=options.db_dir, inspect_dir=options.ins_dir, conf=options.config, user_mods=options.mods)
+
\ No newline at end of file
diff --git a/simulation_based_similarity.py b/simulation_based_similarity.py
new file mode 100644
index 0000000..0ba7f4c
--- /dev/null
+++ b/simulation_based_similarity.py
@@ -0,0 +1,239 @@
+# in: ProteinDBs with and without decoys, number of samples for simulation and Inspect
+# out: FDR-matrix (and several result files)
+
+# Attention: to use Inspect, this script must be run from within the Inspect
+# directory! The "cd /home/franziska/bin/Inspect" command is invoked by the
+# main routine of the script.
+
+# procedure:
+# 1. sample sequences from Protein-DB (for faster simulation)
+# 2. call the simulator with these sequences
+# 3. convert spectra to mgf-format (input format for Inspect and Biceps)
+# 4. sample spectra from simulated ones
+# 5. run Inspect
+# 6. evaluate results from Inspect to calculate an FDR-matrix
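+
+# Example invocation (illustrative paths and sample sizes):
+#   python simulation_based_similarity.py refA,refB -d /path/to/proteomes/ \
+#          -p 100 -n 1000 -c /path/to/mssim.ini -i /path/to/Inspect/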
+
+import re
+import random
+from Bio import SeqIO
+import os
+import sys
+import optparse
+import time
+
+from inspectparser import parseInspect
+
+
+# 1. sampling from DB (def)
+def samplProt(inpath, n):
+ """
+ Sampling a given number of protein sequences from a given fasta-file
+
+ inpath: fasta-file, protein DB which will be sampled from
+ n: number of samples
+ """
+
+ protIDs = [] # IDs of all proteins from db
+ print 'Reading protein IDs from '+inpath+' ...'
+ for record in SeqIO.parse(open(inpath, "rU"), "fasta") :
+ # Add ID of this record to protIDs list
+ protIDs.append(record.id)
+ nprot = len(protIDs)
+ print "Found %i proteins" % nprot
+
+ if n == 0 or n > nprot:
+ print "Take the complete list of %i proteins" %nprot
+ return inpath
+ else:
+ print 'Choosing %i proteins ...' %n
+ rprot = random.sample(protIDs,n)
+
+ if inpath[-3] == '.': outpath = inpath[:-3] + "_sample" + str(n) +".fasta"
+ else: outpath = inpath[:-6] + "_sample" + str(n) +".fasta"
+ print 'Writing selected proteins to '+outpath+" ..."
+ output_handle = open(outpath, "w")
+ for record in SeqIO.parse(open(inpath, "rU"), "fasta") :
+ if record.id in rprot: # for every selected protein
+ SeqIO.write(record, output_handle, "fasta")
+ output_handle.close()
+ print "done"
+ return outpath
+
+# 2. use OpenMS simulator (def)
+def run_MSSim(DBfasta, outmzML, ini="/data/NG4/anke/MSSim/mscconf_sebio02.ini", param=""):
+ command = "MSSimulator -in {inp} -out {out} -ini {ini} {param}".format(inp=DBfasta, out=outmzML, ini=ini, param=param)
+ print "Executing:",command
+ sys.stdout.flush()
+ os.system(command)
+ return 1
+
+# 3. convert mzML to mgf format (def)
+def convertSpectra(mzMLfile, param=""):
+ mgf = mzMLfile[:-5]+".mgf"
+ command = "FileConverter -in {infile} -in_type mzML -out {outfile} -out_type mgf {param}".format(infile=mzMLfile, outfile=mgf, param=param)
+ print "Executing: ",command
+ sys.stdout.flush()
+ os.system(command)
+ return mgf
+
+# 4. sampling from simulated spectra (def)
+from readMGF import sampleMGF as samplSpecs
+'''def samplSpecs(inpath, n):
+ """
+ Sampling a given number of spectra from a given mgf-file
+
+ inpath: mgf-file which will be sampled from
+ n: number of samples
+ """
+
+ infile = open(inpath,"r")
+ specs = [] # names of all spectra
+
+ print 'Reading specTitles ...'
+ for line in infile:
+ if re.search("TITLE",line):
+ specs.append(line.rstrip())
+ if n == 0 or n > len(specs):
+ print "Take the complete list of %i spectra" %len(specs)
+ return inpath
+ else:
+ print 'Choosing %i spectra ...' %n
+ rspecs = random.sample(specs,n)
+ infile.seek(0)
+ outpath = inpath[:-4] + "_sample" + str(n) +".mgf"
+ print 'Writing selected specs to '+outpath+" ..."
+ outfile = open(outpath,"w")
+ for line in infile:
+ if line.rstrip() in rspecs: # for every selected title write spectrum to new file
+ outfile.write("BEGIN IONS\n")
+ outfile.write(line)
+ for line2 in infile:
+ if re.search("BEGIN IONS",line2): break
+ outfile.write(line2)
+ outfile.close()
+ print "done"
+ return outpath
+'''
+
+# 5.a prepare decoyDB input for Inspect (def)
+def prepDB(fastaDB, path="/home/franziska/bin/Inspect/"):
+ if not os.path.exists(os.path.join(path,"PrepDB.py")):
+ print "InsPecT files not found, please correct path!"
+ return 0
+ command = "python {path}PrepDB.py FASTA {db}".format(path=path, db=fastaDB)
+ print "Executing: ",command
+ sys.stdout.flush()
+ os.system(command)
+ return 1
+
+# 5. match spectra against database with Inspect (def)
+def run_inspect(configfile, outputfile, path="", param=""):
+ if len(path) > 0:
+ command = "inspect -i {input} -o {output} -r {path} {param}".format(input=configfile, output=outputfile, path=path, param=param)
+ else:
+ command = "inspect -i {input} -o {output} {param}".format(input=configfile, output=outputfile, param=param)
+ print "Executing: ",command
+ sys.stdout.flush()
+ os.system(command)
+ return 1
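+
+# e.g. run_inspect("config.txt", "out.txt", path="/opt/Inspect/") executes
+# "inspect -i config.txt -o out.txt -r /opt/Inspect/" (illustrative paths);
+# the inspect binary must be on the PATH for this to work.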
+
+# 1.-6. go all steps
+def calculateSimilarityMatrix(DBs, db_path="/data/NG4/anke/proteome/", nProt=100, nSpec=1000,
+ sim_out_path="/data/NG4/anke/MSSim/sampled_data/", MSSim_ini="/data/NG4/anke/MSSim/mscconf_sebio02.ini",
+ inspect_config="/data/NG4/anke/Inspect/config_Inspect_MSSim.txt", inspect_dir="/home/franziska/bin/Inspect/"):
+ """
+ """
+ rng = range(len(DBs))
+ simMat = [ [0 for i in rng] for j in rng ] # initializing output
+
+ configfile = inspect_config[:-4]+"_py.txt" # Inspect configuration file (final version)
+
+ for i in rng:
+ # 1. sampling from DB (run)
+ prot_path = samplProt(db_path+DBs[i]+".fasta",nProt)
+ # 2. use OpenMS simulator with sampled proteins (run)
+ out_path = sim_out_path+DBs[i]+"_sampl"+str(nProt)+"MSSim.mzML"
+ run_MSSim(prot_path, out_path, ini=MSSim_ini, param="-threads 4")
+ # 3. convert mzML to mgf format
+ sampl_path = convertSpectra(out_path)
+ # 4. sampling from simulated spectra
+ spec_path = samplSpecs(sampl_path,nSpec)
+ # (*) with runInspect_user_config.runInspect_config for all DBs
+ # runInspect_config(spectra=spec_path, DBs=DBs, spec_path="", db_path=db_path, inspect_dir=inspect_dir, conf=configfile, user_mods="")
+ for j in rng:
+ # 5. calling InSpecT (*)
+ db_j = db_path+DBs[j]+"_decoy.trie"
+ # 5.a create trie if necessary
+ if not os.path.exists(db_j):
+ prepDB(db_path+DBs[j]+"_decoy.fasta",path=inspect_dir)
+ inspect_out = spec_path[:-4] +"_"+DBs[j]+"_InspectOut.txt" # (!!)
+ # prepare configfile # <-- call runInspect_user_config.runInspect_config instead!! (start at step 5, keep the j-loop only with inspect_out)
+ conf_in = open(inspect_config,'r')
+ conf_out = open(configfile,'w')
+ for line in conf_in:
+ if re.match("spectra,",line):
+ conf_out.write("spectra,"+spec_path+"\n")
+ elif re.match("DB,",line):
+ conf_out.write("DB,"+db_j+"\n")
+ else:
+ conf_out.write(line)
+
+ conf_in.close()
+ conf_out.close()
+
+ run_inspect(configfile, inspect_out, path=inspect_dir)# from 5. remove all but (!!) up to here
+ # 6. evaluate results from Inspect to calculate an FDR-matrix
+ simMat[i][j] = parseInspect(inspect_out, silent=False)[1]
+ normSimMat = [ [0 for i in rng] for j in rng ]
+ for k in rng:
+ for l in rng:
+ normSimMat[k][l] = simMat[k][l] / float(simMat[k][k]) # avoid integer division
+ return simMat, normSimMat
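+
+# Worked example of the diagonal normalization above (illustrative counts):
+#   simMat     = [[800, 200], [150, 900]]   (row k: spectra simulated from k)
+#   normSimMat = [[800/800., 200/800.], [150/900., 900/900.]]
+#              = [[1.0, 0.25], [0.1667, 1.0]]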
+
+
+if __name__=="__main__":
+ usage = """%prog DB_LIST -d DB_DIR -p NPROT -n NSPEC -i INIFILE
+
+Sample a given number of proteins from DB for simulation of spectra,
+sample a given number of spectra from simulation for running inspect
+and calculate FDR from Inspect output.
+
+"""
+
+ t0 = time.time()
+ print time.strftime("%Y/%m/%d, %H:%M:%S: Starting whole procedure and overall time measurement", time.localtime(t0))
+ sys.stdout.flush()
+ # configure the parser
+ optparser = optparse.OptionParser(usage=usage)
+ optparser.add_option('-d', '--dbdir', type='string', dest='db_dir', default="/data/NG4/anke/proteome/", help='directory of proteinDBs. [default: %default]')
+ optparser.add_option('-p', '--proteinnumber', type='int', dest='nprot', default=100, help='number of samples from proteinDB. [default: %default]')
+ optparser.add_option('-n', '--numberspectra', type='int', dest='nspec', default=1000, help='number of samples from spectra. [default: %default]')
+ optparser.add_option('-c', '--configfile', type='string', dest='MSSim_ini', default="/data/NG4/anke/MSSim/mscconf_sebio02.ini", help='configuration file for MSSImulator. [default: %default]')
+ optparser.add_option('-i', '--inspectdir', type='string', dest='insp_dir', default="/home/franziska/bin/Inspect/", help="path to 'inspect.exe'. [default: %default]")
+ optparser.add_option('-o', '--outfile', type='string', dest='out', default="/data/NG4/anke/Inspect/collectOutput.txt", help='name of file resulting matrix is written to. [default: %default]')
+ # parse options and arguments
+ options, args = optparser.parse_args()
+
+ if len(args) == 1:
+ db_list = args[0].split(',')
+ sim_out_path = "/data/NG4/anke/MSSim/sampled_data/"
+ nProt = options.nprot # nr. of sampled proteins (from DB)
+ nSpec = options.nspec # nr. of sampled spectra (from simulation)
+ config_path = "/data/NG4/anke/Inspect/config_Inspect_MSSim.txt" # Inspect configuration file (initial version)
+
+ M, M_n = calculateSimilarityMatrix(DBs = db_list, db_path = options.db_dir, nProt=nProt, nSpec=nSpec, sim_out_path=sim_out_path, MSSim_ini=options.MSSim_ini, inspect_config=config_path, inspect_dir=options.insp_dir)#print db_list
+ else:
+ optparser.print_help()
+ sys.exit(1)
+
+ outfile = open(options.out,"a")
+ outfile.write("normalized similarity matrix derived from InSpect with %i sampled spectra "%nSpec)
+ outfile.write("(from simulation with %i sampled proteins): \n"%nProt)
+ print "normalized: \n"
+ for line in M_n:
+ print line
+ outfile.write(str(line)+"\n")
+ outfile.write("\n\n\n")
+ outfile.close()
+ print time.strftime("%Y/%m/%d, %H:%M:%S: all done\n", time.localtime(time.time()))
+ print "took %.3f seconds overall (that's %.3f minutes)"%(time.time() - t0, (time.time() - t0)/60)
+ print "----------------------------------------------------------------------------------------------------\n\n\n"
diff --git a/trypticpeptides.py b/trypticpeptides.py
new file mode 100644
index 0000000..6628d94
--- /dev/null
+++ b/trypticpeptides.py
@@ -0,0 +1,604 @@
+"""
+Sequence based Similarity (Alternative to Simulator)
+----------------------------------------------------
+using tryptic peptide sequences instead of spectra
+
+Including:
+trypticCut:
+ produce trypsin digested peptides of all proteins in fasta-file
+peptideSearch:
+ search all sequences from a given fasta-file in a given database (fasta);
+ get the number of tryptic peptides from proteome A also matching proteome B
+cutNsearch:
+ combine trypticCut and peptideSearch
+weightPeptides:
+ search identified peptides in tryptic peptide list of a proteome in order to
+ weight each peptide by its frequency (using 3 normalization steps)
+weightedMatching:
+ use weighted peptides to calculate a weighted proteome similarity summing up
+ weights of tryptic peptides from proteome A also matching proteome B
+weightedMatrix:
+ perform weightPeptides and weightedMatching for a number of proteomes
+ combining all pairs of proteomes
+weightedMatrix_short:
+ short version of weightedMatrix using already weighted peptides
+filterParsed:
+ specialized sampling of peptide spectrum identifications; on the basis of
+ readMGF.filterFDR, but for identifications instead of spectra, also using
+ spectrum numbers from readMGF.getFilterIDs
+
+still contained, but out of use:
+uniqPeptides:
+ for each sequence found in a fasta-file, copy only the first occurrence to a
+ new file [actually no use in further analyses]
+decoyFilter:
+ copy only non-decoy peptide matches from InspectParser_uniq.py output or
+ Inspect output identification list to a new file
+ [actually no use in further analyses]
+getSharedDic:
+ create a dictionary of potentially ambiguous peptide sequences
+ [very time-consuming! Dictionary constructed on the way where required]
+
+
+ at author: Anke Penzlin, May 2013
+"""
+
+import numpy as np
+import os
+import re
+import optparse
+import sys
+from Bio import SeqIO
+
+def trypticCut(fastaFile, verbose=True):
+ """
+ cut all protein sequences from a given fasta-file into tryptic peptides.
+ Peptide sequences (+ protein description) are written to a new fasta file.
+ (return: path to peptide file, number of peptides)
+
+ fastaFile: protein sequences to be cut
+ """
+ nprot = 0 # number of proteins (in fastaFile)
+ peptLs = [] # tryptic peptides
+ protLs = [] # description (ID/name) of the respective protein from db
+ if verbose: print 'Reading and cutting protein sequences from '+fastaFile+' ...'
+ for record in SeqIO.parse(open(fastaFile, "rU"), "fasta") :
+ nameP = record.description
+ # cut after lysine:
+ pepsLys = [pep+"K" for pep in str(record.seq).upper().split("K")]
+ pepsLys[-1] = pepsLys[-1][:-1]
+ # remove cut before proline:
+ pepsLysP = [pepsLys[0]]
+ for pe in range(1,len(pepsLys)):
+ if re.match("P",pepsLys[pe]):
+ pepsLysP[-1] += pepsLys[pe]
+ else: pepsLysP.append(pepsLys[pe])
+ # cut after arginine:
+ for pept in pepsLysP:
+ pepsArg = [pep+"R" for pep in pept.split("R")]
+ pepsArg[-1] = pepsArg[-1][:-1]
+ # remove cut before proline:
+ pepsArgP = [pepsArg[0]]
+ for pe in range(1,len(pepsArg)):
+ if re.match("P",pepsArg[pe]):
+ pepsArgP[-1] += pepsArg[pe]
+ else: pepsArgP.append(pepsArg[pe])
+ # only accept peptides of length 7 - 25
+ for pep in pepsArgP:
+ if len(pep) > 6 and len(pep) < 26:
+ peptLs.append(pep) # store peptide sequence
+ protLs.append(nameP) # store corresponding description
+ nprot += 1
+ PeptProtLs = zip(peptLs,protLs)
+ if verbose: print "Found %i proteins, obtained %i peptides" %(nprot,len(peptLs))
+
+ outpath = os.path.dirname(fastaFile) + "/peptides_" + os.path.basename(fastaFile)
+ if verbose: print 'Writing peptides to '+outpath+" ..."
+ outF = open(outpath,"w")
+ for pep in PeptProtLs:
+ outF.write(">"+pep[1]+"\n")
+ outF.write(str(pep[0])+"\n")
+ outF.close()
+ return outpath,len(peptLs)
+
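+# Worked digestion example (illustrative): "MLEAGHKPTVNDQSIRGFWYECK" is first
+# cut after K, the cut before P is undone (so "...HKP..." stays intact), then
+# cut after R, yielding "MLEAGHKPTVNDQSIR" and "GFWYECK"; both pass the
+# 7-25 length filter and are written to the peptide fasta.
+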
+
+def peptideSearch(peptFile,dbFile,total=50000,verbose=True):
+ """
+ search all peptide sequences from a given fasta-file in a given database
+ and count matches (return: number of matches, number of searched peptides)
+
+ peptFile: peptide sequences (fasta-format)
+ dbFile: protein database (fasta-format)
+ total: number of peptides in peptFile
+ """
+
+ DB = [] # list of protein sequences from dbFile
+ cPept = 0 # number of peptides
+ cMatch = 0 # number of matches (peptide <-> DB sequence)
+
+ if verbose: print "reading DB sequences ..."
+ # store all protein sequences from DB in a list
+ for record in SeqIO.parse(open(dbFile, "rU"), "fasta") :
+ DB.append(str(record.seq).upper())
+
+ if verbose: print "reading and searching peptide sequences ..."
+ # search each peptide sequence from peptFile in DB and count matches
+ for pept in SeqIO.parse(open(peptFile, "rU"), "fasta") :
+ if verbose and cPept%100 == 0:
+ msg = "Peptide %i of %i"%(cPept,total)
+ sys.stdout.write("\r" + " "*(len(msg)+5) + "\r" + msg)
+ sys.stdout.flush()
+ cPept += 1
+ for DBi in DB:
+ if str(pept.seq).upper() in str(DBi):
+ cMatch += 1
+ break # only one match per peptide is counted
+ if verbose: print "\r%i of %i peptides found" %(cMatch,cPept)
+ return cMatch,cPept
+
+def cutNsearch(proteinFile,dbFile):
+ """
+ produce tryptic peptides from proteinFile and search them in the sequences
+ of dbFile. (return: ratio of matches/peptides)
+ """
+ peptFile, nr = trypticCut(proteinFile)
+ cMatch, cPept = peptideSearch(peptFile,dbFile,total=nr)
+ return float(cMatch)/cPept
+
+def unweightedMatrix(dbList, verbose=True):
+ """
+ for all proteomes, run pairwise peptide search to construct an unweighted
+ similarity matrix. (return: proteome similarity matrix)
+ using: trypticCut
+ peptideSearch
+
+ dbList: list of protein database files (fasta-format, full path),
+ ","-separated
+ """
+
+ dbList = dbList.split(",")
+ Mx = np.zeros([len(dbList),len(dbList)]) # proteome similarity matrix
+ for i,DBi in enumerate(dbList):
+ if verbose: print "\n"
+ # digest proteome i into tryptic peptides
+ trypticPeptides, nr = trypticCut(DBi, verbose=verbose) # database peptides
+ for j,DBj in enumerate(dbList):
+ cMatch, cPept = peptideSearch(trypticPeptides,DBj,total=nr,verbose=verbose)
+ Mx[i,j] = float(cMatch)/cPept
+ return Mx # proteome similarity matrix
+
+
+# not further used.
+# simply ignores all but the first occurrence of shared peptide sequences.
+def uniqPeptides(peptFile):
+ """
+ create a fasta-file of non-redundant peptide sequences from peptFile.
+ (return: path to created file)
+
+ peptFile: peptide sequences (fasta-format)
+ """
+
+ outfile = os.path.dirname(peptFile) + "/uniq_" + os.path.basename(peptFile)
+ newls = []
+ peps = []
+ n_peps = 0
+
+ print "reading peptide sequences ..."
+ for pept in SeqIO.parse(open(peptFile, "rU"), "fasta") :
+ #if ilqk: #later on
+ # pept = pept.replace("I", "L").replace("Q", "K")
+ if not str(pept.seq) in peps:
+ peps.append(str(pept.seq))
+ newls.append(pept)
+ n_peps += 1
+ print "%i different peptide sequences out of %i peptides" %(len(peps),n_peps)
+ output_handle = open(outfile, "w")
+ for record in newls:
+ SeqIO.write(record, output_handle, "fasta")
+ output_handle.close()
+ return outfile
+
+
+# not further used, replaced by a single line while file reading where required
+def decoyFilter(peptFile):
+ """
+ parse list of non-decoy peptide matches from Inspect output list.
+ (return: path to the new list file)
+
+ peptFile: peptide list from Inspect output or InspectParser_uniq.py output
+ (protein name must be contained in each line!)
+ """
+
+ outfile = os.path.dirname(peptFile) + "/nondecoy_" + os.path.basename(peptFile)
+ newls = []
+ n_peps = 0
+
+ print "reading sequences of identified peptides ..."
+ inF = open(peptFile,"r")
+ inF.readline()
+ inF.readline()
+ for line in inF:
+ if not "REVERSED" in line: # part of protein names for decoy entries
+ newls.append(line)
+ n_peps += 1
+ print "%i of %i peptides are non-decoy matches" %(len(newls),n_peps)
+ outF = open(outfile, "w")
+ for line in newls:
+ outF.write(line)
+ outF.close()
+ return outfile
+
+
+# not further used - long runtime!!
+def getSharedDic(peptList):
+ """
+ create a dictionary of peptide sequences and their occurrences in peptList
+ """
+ peps = np.array(peptList)
+ peps_u = np.unique(peps)
+ u_dict = dict()
+ for u in peps_u:
+ u_dict[u] = np.where(peps == u)
+ return u_dict
+
+
+# based on inspectparser.py output
+# (list of peptide hits and corresponding protein names)
+def weightPeptides(identifiedPeptides, trypticPeptides, fdr=1, init=0, ilqk=False, sep="\t", decoy_tag= "REVERSED|DECOY", verbose=True):
+ """
+ search identified peptides in tryptic peptide list of a proteome in order
+ to weight each peptide
+ (return: path to output file with line structure: "weight sequence")
+
+ identifiedPeptides: inspectparser.py output (experimental spectra)
+ trypticPeptides: typticCut result of proteome in question
+ fdr: cut-off controlling false discovery rate (1 = no cut)
+ init: initial weight for all peptides
+ ilqk: handle isoleucine as leucine and glutamine as lysine
+ (because of their equal/very similar mass), if true
+ sep: separator for split of identifiedPeptides input lines,
+ do not change unless your input type has the same
+ content as inspectparser.py output
+ decoy_tag: string that specifies how decoy sequences in the DB
+ are labeled. By default, "REVERSED" or "DECOY" are
+ accepted.
+ """
+
+ outfile = os.path.dirname(identifiedPeptides) + "/matched_init_"+ str(init) + "_" + os.path.splitext(os.path.basename(trypticPeptides))[0] + "_to_" + os.path.basename(identifiedPeptides)
+ trypList = [] # list of tryptic peptides
+ trypDic = dict() # dictionary of all peptide sequences and corresponding indices in trypList
+ protDic = dict() # dictionary of protein names and corresponding indices in trypList
+ idx = 0 # index of current peptide read from trypticPeptides used in trypList
+
+ if verbose: print "reading sequences of tryptic peptides ..."
+ for pept in SeqIO.parse(open(trypticPeptides, "rU"), "fasta") :
+ # dictionary of protein names (descriptions)
+ p_name = str(pept.description)
+ if p_name in protDic: # add peptide (by index) to its protein
+ protDic[p_name].append(idx)
+ else: # add new protein name (+ first entry) to dictionary
+ protDic[p_name] = [idx]
+ # peptide sequence
+ '''if ilqk:
+ pept = str(pept.seq).replace("I", "L").replace("Q", "K")
+ else:
+ pept = str(pept.seq)'''
+ pept = str(pept.seq)#!!! .replace("I", "L")
+ trypList.append(pept)
+ # dictionary of peptide sequences indicating unique and shared peptides
+ if pept in trypDic: # add index of current peptide to already known sequence
+ trypDic[pept].append(idx)
+ else: # add new entry to sequence dictionary
+ trypDic[pept] = list([idx])
+ idx += 1
+ if verbose: print len(trypDic), "different sequences out of", len(trypList), "peptides from", len(protDic), "proteins"
+ # array for peptide match counts
+ counts = [init]*len(trypList)
+ # counter for peptides not in tryptic peptide list
+ notfound = 0
+
+ '''# monitoring not found sequences
+ with open("/data/NG4/anke/spectra/Shigella_dysenteriae/out/testOutput.txt","w") as f:
+ f.write("")'''
+ if verbose: print "reading and searching sequences of identified peptides ..."
+ inF = open(identifiedPeptides,"r")
+ inF.readline()
+ inF.readline()
+ for line in inF:
+ if float(line.split(sep)[5]) > fdr: break # use only reliable part of peptide list
+ if not re.search(decoy_tag,line.split(sep)[6]): # use only non decoy hits
+ '''pept = line.split(sep)[2] # read identified peptide sequence
+ if ilqk:
+ pept = pept.upper().replace("I", "L").replace("Q", "K")'''
+ pept = (line.split(sep)[2])#!!! .replace("I", "L") # read identified peptide sequence
+ # split identified peptide into tryptic peptides (if necessary)
+ # cut after lysine:
+ pepsLys = [pep+"K" for pep in pept.split("K")]
+ pepsLys[-1] = pepsLys[-1][:-1]
+ # remove cut before proline:
+ pepsLysP = [pepsLys[0]]
+ for pe in range(1,len(pepsLys)):
+ if re.match("P",pepsLys[pe]):
+ pepsLysP[-1] += pepsLys[pe]
+ else: pepsLysP.append(pepsLys[pe])
+ # cut after arginine:
+ for pept in pepsLysP:
+ pepsArg = [pep+"R" for pep in pept.split("R")]
+ pepsArg[-1] = pepsArg[-1][:-1]
+ # remove cut before proline:
+ pepsArgP = [pepsArg[0]]
+ for pe in range(1,len(pepsArg)):
+ if re.match("P",pepsArg[pe]):
+ pepsArgP[-1] += pepsArg[pe]
+ else: pepsArgP.append(pepsArg[pe])
+ for pep in pepsArgP:
+ # only accept peptides of length 7 - 25
+ if len(pep) > 6 and len(pep) < 26:
+ # search each peptide in list of proteome tryptic peptides
+ if pep in trypList:
+ counts[trypList.index(pep)] += 1
+ else:
+ notfound += 1
+ '''# monitoring not found sequences
+ with open("/data/NG4/anke/spectra/Shigella_dysenteriae/out/testOutput.txt","a") as f:
+ f.write(pep + "\tfrom\t" + line.split(sep)[2] + "\n")'''
+ # calculate weights from counts
+ if verbose: print "not found: %i of %i"%(notfound,sum(counts)+notfound)
+ if sum(counts) > 0:
+ if verbose: print "recalculating counts for shared peptides ..."
+ for t_ind in trypDic.itervalues(): # equally distribute counts for peptides of the same sequence
+ # each unique sequence is visited exactly once
+ x = sum([counts[c] for c in t_ind])/float(len(t_ind))
+ for i in t_ind:
+ counts[i] = x
+
+ sum_c = sum(counts)
+ for p in protDic: # equally distribute counts for peptides of same protein
+ # normalized by the sum of all counts (sum_c)
+ p_ind = protDic.get(p)
+ x = sum([counts[c] for c in p_ind])/float(len(p_ind))/sum_c
+ for i in p_ind:
+ counts[i] = x
+ else:
+ print "Warning: no peptides identified!"
+ if verbose: print "writing resulting weights ..."
+ cPos = 0 # number of positively weighted peptides
+ outF = open(outfile,"w")
+ # output file line structure: "weight sequence"
+ for idx, seq in enumerate(trypList):
+ outF.write(str(counts[idx])+"\t"+seq+"\n")
+ if not counts[idx] == 0:
+ cPos += 1
+ outF.close()
+ if verbose: print cPos,"of",len(trypList),"peptides got positive weight."
+ return outfile
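+# Minimal standalone sketch of the cleavage rule applied in weightPeptides
+# (trypsin cuts after K or R, but not before P); re-implemented here for
+# illustration only, the name is not part of the upstream code:
+def _tryptic_fragments(seq):
+    frags = [""]
+    for i, aa in enumerate(seq):
+        frags[-1] += aa
+        # cut after K/R unless the next residue is a proline
+        if aa in "KR" and (i + 1 == len(seq) or seq[i + 1] != "P"):
+            frags.append("")
+    return [f for f in frags if f]
+# e.g. _tryptic_fragments("MKPAAKRGL") -> ["MKPAAK", "R", "GL"]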
+
+
+def weightedMatching(weightedPeptFile,dbFile,total,sum_all=True, verbose=True):
+ """
+ search all weighted peptide sequences from a given file in a given database
+ (return: weighted sum of matches, number of matches, number of peptides)
+
+ weightedPeptFile: textfile of weighted peptide sequences
+ (weightPeptides output file line structure: "weight sequence")
+ dbFile: protein database (fasta-format)
+ total: number of peptides in weightedPeptFile (only relevant for
+ progress display)
+ sum_all: add up weights of matching peptides; if false, set count of
+ matching peptides with weight > 0 to 1 (only suitable for init=0)
+ """
+
+ DB = [] #list of protein sequences from dbFile
+ cPept = 0 # number of peptides
+ cMatch = 0 # number of matches (weighted peptide <-> DB peptide)
+ pMatch = float(0) # weighted sum of matches
+
+ if verbose: print "reading DB sequences ..."
+ for record in SeqIO.parse(open(dbFile, "rU"), "fasta") :
+ DB.append(str(record.seq).upper())#.replace("I", "L"))#!!! .replace("Q", "K"))
+ if not sum_all: prot_count = [0]*len(DB)
+ if verbose: print "reading and mapping peptide sequences ..."
+ for line in open(weightedPeptFile, "rU"):
+ if verbose and cPept%100 == 0: # progress display
+ msg = "Peptide %i of %i"%(cPept,total)
+ sys.stdout.write("\r" + " "*(len(msg)+5) + "\r" + msg)
+ sys.stdout.flush()
+ cPept += 1
+ for i,DBi in enumerate(DB): # search process (weighted peptide in DB)
+ if line.split("\t")[1].rstrip().upper() in str(DBi):
+ cMatch += 1
+ if sum_all: pMatch += float(line.split("\t")[0])
+ elif (float(line.split("\t")[0]) > 0): prot_count[i] = 1
+ break
+ if not sum_all: # normalization (necessary for count data)
+ pMatch = float(sum(prot_count))/len(DB)
+
+ if verbose: print "\r%i of %i peptides found" %(cMatch,cPept)
+ return pMatch, cMatch, cPept # weighted sum of matches, #matches, #peptides
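+# Hypothetical call (path and total are illustrative; compare the worked
+# numbers in the comment block at the end of this file):
+#   pMatch, cMatch, cPept = weightedMatching(weighted_file, "proteome/Shigelladys.fasta", total=54537)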
+
+
+def weightedMatrix(spectra_name, dbList, fdr=0.05, init_weight=0, Tide=False, decoy_tag="REVERSED|DECOY", verbose=True):
+ """
+ Based on InsPecT output (or output parsed by inspectparser)
+ for dataset and all proteomes, construct a weighted similarity matrix
+ (return: proteome similarity matrix weighted by dataset)
+ using: inspectparser or corresponding output
+ trypticCut
+ weightPeptides
+ weightedMatching
+ requires: an InsPecT output file named <nameSpectra>_<nameDB>_InspectOut.txt,
+ or (preferably) a '_parsed.txt' version of this file, in the
+ directory given by spectra_name
+
+ spectra_name: name of spectra dataset used to identify peptides via Inspect
+ (path of Inspect output)
+ dbList: list of protein database files (fasta-format, full path),
+ ","-separated
+ fdr: cut-off controlling false discovery rate (1 = no cut)
+ init_weight: initial weight for all peptides (see weightPeptides), default=0
+ """
+
+ from inspectparser import parseInspect
+
+ dbList = dbList.split(",")
+ Mx = np.zeros([len(dbList),len(dbList)]) # proteome similarity matrix
+ for i,DBi in enumerate(dbList):
+ if verbose: print "\n"
+ # compose filename of spectra and database
+ if Tide:
+ identifiedPeptides = os.path.splitext(spectra_name)[0] +"_"+ os.path.splitext(os.path.basename(DBi))[0] +"_TideOut_parsed.txt"
+ if not os.path.exists(identifiedPeptides):
+ print "Could not find PSM-file for %s and %s" %(spectra_name,DBi)
+ continue
+ else:
+ inspectOut = os.path.splitext(spectra_name)[0] +"_"+ os.path.splitext(os.path.basename(DBi))[0] +"_InspectOut.txt"
+ identifiedPeptides = inspectOut[:-4]+"_parsed.txt" # dataset peptides
+ if not os.path.exists(identifiedPeptides):
+ try:
+ identifiedPeptides = parseInspect(inpath=inspectOut,fdr_countcut=fdr,multipleFiles=True,silent=True)
+ except Exception: # parseInspect failed (e.g. missing InsPecT output)
+ print "Could not find PSM-file for %s and %s" %(spectra_name,DBi)
+ print "Current task:"
+ print weightedMatrix.__doc__
+ continue
+ trypticPeptides, nr = trypticCut(DBi, verbose=verbose) # database peptides
+ weightedPeptFile = weightPeptides(identifiedPeptides, trypticPeptides, fdr=fdr, init=init_weight, decoy_tag=decoy_tag, verbose=verbose)
+ for j,DBj in enumerate(dbList):
+ pMatch, cMatch, cPept = weightedMatching(weightedPeptFile,DBj,total=nr,verbose=verbose)
+ Mx[i,j] = pMatch
+ return Mx # proteome similarity matrix
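+# Hypothetical end-to-end call (file names assumed, not prescribed by this code):
+#   Mx = weightedMatrix("out/sample1.mgf", "db/A.fasta,db/B.fasta", fdr=0.05)
+#   # Mx[i,j]: weighted fraction of proteome i's peptides also matching proteome j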
+
+
+# Shortcut matrix calculation when weights are already known from previous runs,
+# e.g. counting matching peptides with weight > 0 (sum_all=False) for weights computed with init=0.
+def weightedMatrix_short(spectra_name, dbList, sum_all=False):
+ """
+ construct a weighted similarity matrix based on files of weighted peptides
+ for each combination of dataset and proteome (db).
+ (return: proteome similarity matrix weighted by dataset)
+ using: weightedMatching
+
+ spectra_name: name of spectra dataset used to identify peptides via Inspect
+ (path of Inspect output)
+ dbList: list of protein database files (fasta-format, full path),
+ ","-separated
+ sum_all: see weightedMatching, except for default: False
+ """
+
+ dbList = dbList.split(",")
+ Mx = np.zeros([len(dbList),len(dbList)])
+ for i,DBi in enumerate(dbList):
+ print "\n"
+ identifiedPeptides = os.path.splitext(spectra_name)[0] +"_"+ os.path.splitext(os.path.basename(DBi))[0] +"_InspectOut_parsed.txt"
+ weightedPeptFile = os.path.dirname(identifiedPeptides) + "/matched_peptides_" + os.path.splitext(os.path.basename(DBi))[0] + "_to_" + os.path.basename(identifiedPeptides)
+ for j,DBj in enumerate(dbList):
+ #if DBj == DBi:
+ # Mx[i,j] = 1.
+ #else:
+ pMatch, cMatch, cPept = weightedMatching(weightedPeptFile,DBj,total=50000,sum_all=sum_all)
+ Mx[i,j] = pMatch
+ return Mx
+
+
+# sampling peptide spectrum matches using quality filter (getFilterIDs in readMGF.py)
+def filterParsed(spectra,filterIDs, verbose=True):
+ """
+ from a given inspectparser output select spectra according to
+ corresponding Inspect score using filterIDs sampled by readMGF.getFilterIDs
+ (return: path to sample file)
+
+ spectra: file of peptide spectrum matches in parsed format
+ (inspectparser output)
+ filterIDs: spectrum numbers sampled by readMGF.getFilterIDs
+ """
+ notfound = 0 # print warning if any spectrum could not be found
+ specDic = dict() # datastructure for fast access on spectra by number
+ with open(spectra) as f: allSpectra = f.readlines()
+ del allSpectra[0:2] # delete header of inspectparser output file
+ for nr,line in enumerate(allSpectra):
+ fields = line.split("\t") # spectrum number (field 1) and match score (field 3) easily accessible
+ # ignore a file-name prefix of the id (concatenated by '..'); ids must be unique (no mixed files)
+ spec_id = int(fields[1].split("..")[1]) if len(fields[1].split("..")) > 1 else int(fields[1])
+ # store the line's index in allSpectra and the split line in the dictionary, keyed by spec_id
+ specDic[spec_id] = [nr,fields]
+ # name of sample file includes '_filter' and number of samples:
+ newfile = os.path.splitext(spectra)[0] + "_filter" + str(len(filterIDs)) + os.path.splitext(spectra)[1]
+ if verbose: print 'Writing '+str(len(filterIDs))+' spectra to '+newfile+" ..."
+ outlist = [] # list of sampled dictionary entries
+ for z in filterIDs:
+ z = int(z)
+ if z in specDic: outlist.append(specDic[z])
+ else: notfound += 1
+ # restore order by match scores:
+ outlist.sort(key = lambda x : float(x[1][3]), reverse = True)
+ outfile = open(newfile,"w")
+ # write sampled lines of input file to newfile using list index for allSpectra
+ for entry in outlist: outfile.write(allSpectra[entry[0]])
+ outfile.close()
+ if notfound > 0: print "!! %i spectra could not be found" %notfound
+ return newfile
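+# Hypothetical usage (filterIDs would come from readMGF.getFilterIDs; the path
+# is illustrative):
+#   sampled = filterParsed("out/sample_InspectOut_parsed.txt", filterIDs)
+#   # -> "out/sample_InspectOut_parsed_filter<N>.txt", sorted by descending score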
+'''# slow because of specID search:
+def filterParsed(spectra,filterIDs):#, fdrfile, scoreTab, maxN=1000):
+ """
+ from a given InspectParser_FDR_uniq output select spectra according to corresponding Inspect score.
+ """
+ #from readMGF import getFilterIDs
+ #rnum, n = getFilterIDs(fdrfile=fdrfile, scoreTab=scoreTab, maxN=maxN)
+ with open(spectra) as f: allSpectra = f.readlines()
+ del allSpectra[0:2]
+ newfile = os.path.splitext(spectra)[0] + "_filter" + str(len(filterIDs)) + os.path.splitext(spectra)[1]
+ print 'Writing '+str(len(filterIDs))+' spectra to '+newfile+" ..."
+ outlist = []
+ for z in filterIDs:
+ x = np.array(map(lambda y: y.split("\t")[1].find(str(z)), allSpectra))
+ idx = np.where(x >= 0)[0]
+ if len(idx) > 0: outlist.append(allSpectra[idx[0]])
+ outlist.sort(key = lambda x : float(x.split("\t")[3]), reverse = True)
+ outfile = open(newfile,"w")
+ for line in outlist: outfile.write(line)
+ outfile.close()
+ return newfile
+'''
+
+if __name__=="__main__":
+ usage = """%prog PROTFILE DBFILE
+
+Command line interface of 'cutNsearch', producing tryptic peptides of the fasta
+sequences in PROTFILE and counting occurrences of these peptides in DBFILE.
+(Most functions of the corresponding script file are not included in this interface.)
+
+"""
+
+ # configure the command line parser
+ command_parser = optparse.OptionParser(usage=usage)
+ #command_parser.add_option('-f', '--fdr', type='float', dest='fdr', default=1, help='list will be cut at FDR-value [default: %default (means no cut)]')
+
+ # parse arguments and options
+ options, args = command_parser.parse_args()
+
+ if len(args) == 2:
+ print cutNsearch(args[0],args[1])
+ else:
+ command_parser.print_help()
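+# Example invocation (assuming this script is saved as trypticpeptides.py):
+#   python trypticpeptides.py proteome/A.fasta proteome/B.fasta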
+
+'''resulting sequence similarity (matching tryptic peptides to refDB):
+peptides_EcoliDH10B -> Shigelladys: 0.6282523791187634 (34263/54537)
+peptides_Shigelladys -> EcoliDH10B: 0.7151272577996716 (34841/48720)
+spectral similarity (MSSim): simShig = np.array([[1.0, 0.58696331], [0.71932158, 1.0]])
+
+weighted by exp. spectra "Ecoli_12189_filter20000":
+peptides_EcoliDH10B -> Shigelladys: 0.8793818269104893 (34263/54537) #weightedMatching("../../spectra/Shigella_dysenteriae/out/matched_peptides_EcoliDH10B_to_Ecoli_12189_filter20000_EcoliDH10B_InspectOut_parsed.txt","../../proteome/Shigelladys.fasta",54537)
+peptides_Shigelladys -> EcoliDH10B: 0.9236761034288927 (34841/48720) #weightedMatching("../../spectra/Shigella_dysenteriae/out/matched_peptides_Shigelladys_to_Ecoli_12189_filter20000_Shigelladys_InspectOut_parsed.txt","../../proteome/EcoliDH10B.fasta",48720)
+peptides_x -> x: all found
+
+weighted by exp. spectra "Shigelladys_1_all_filter20000":
+peptides_EcoliDH10B -> Shigelladys: (0.9528234762832758, 34263, 54537) # weightedMatching(searchExperimental("../../spectra/Shigella_dysenteriae/out/Shigelladys_1_all_filter20000_EcoliDH10B_InspectOut_parsed.txt","../../proteome/peptides_EcoliDH10B.fasta",fdr=0.05),"../../proteome/Shigelladys.fasta",54537)
+peptides_Shigelladys -> EcoliDH10B: (0.9092465576969604, 34841, 48720) # weightedMatching(searchExperimental("../../spectra/Shigella_dysenteriae/out/Shigelladys_1_all_filter20000_Shigelladys_InspectOut_parsed.txt","../../proteome/peptides_Shigelladys.fasta",fdr=0.05),"../../proteome/EcoliDH10B.fasta",48720)
+peptides_EcoliDH10B -> EcoliDH10B: (1.0000000000001164, 54537, 54537) # weightedMatching(searchExperimental("../../spectra/Shigella_dysenteriae/out/Shigelladys_1_all_filter20000_EcoliDH10B_InspectOut_parsed.txt","../../proteome/peptides_EcoliDH10B.fasta",fdr=0.05),"../../proteome/EcoliDH10B.fasta",54537)
+
+matching tryptic peptides to refDB:
+Bacicer -> Bacisub168: 0.03248105914484546
+Bacisub168 -> Bacisub: 0.037594533029612756
+spectral similarity (MSSim): simB_800_l3 = np.array([[ 1.0 , 0.01346487],[ 0.01373776, 1.0 ]])
+ simBac = np.array([[1.0, 0.024390243902439025, 0.0040650406504065045],[0.024473147518694765, 1.0, 0.003399048266485384],[0.0006747638326585695, 0.0006747638326585695, 1.0]])
+'''
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pipasic.git