[med-svn] [Git][med-team/tnseq-transit][master] 3 commits: New upstream version 3.2.0
Nilesh Patra
gitlab at salsa.debian.org
Wed Oct 28 21:34:17 GMT 2020
Nilesh Patra pushed to branch master at Debian Med / tnseq-transit
Commits:
3b803d06 by Nilesh Patra at 2020-10-29T02:47:18+05:30
New upstream version 3.2.0
- - - - -
290ac4e1 by Nilesh Patra at 2020-10-29T02:49:24+05:30
Update upstream source from tag 'upstream/3.2.0'
Update to upstream version '3.2.0'
with Debian dir 70023e8dfdb58da65004e0aa1110af083f1df406
- - - - -
960097a9 by Nilesh Patra at 2020-10-29T02:54:00+05:30
Update changelog
- - - - -
17 changed files:
- CHANGELOG.md
- debian/changelog
- src/pytransit/__init__.py
- src/pytransit/analysis/corrplot.py
- src/pytransit/analysis/gi.py
- src/pytransit/analysis/heatmap.py
- src/pytransit/analysis/pathway_enrichment.py
- + src/pytransit/data/GO_terms_for_each_Rv.obo-3-11-18.txt
- src/pytransit/data/H37Rv_COG_roles.dat
- src/pytransit/data/H37Rv_sanger_roles.dat
- + src/pytransit/data/gene_ontology.1_2.3-11-18.obo
- + src/pytransit/doc/source/_images/genetic_interaction_types.png
- src/pytransit/doc/source/transit_methods.rst
- src/pytransit/export/mean_counts.py
- + src/pytransit/genomes/genomes.html~
- src/pytransit/transit_gui.py
- + tests/H37Rv.fna
Changes:
=====================================
CHANGELOG.md
=====================================
@@ -1,6 +1,16 @@
# Change log
All notable changes to this project will be documented in this file.
+## Version 3.2.0 2020-10-26
+#### TRANSIT:
+ - improvements to pathway_enrichment analysis
+ - added '--ranking' flag for GSEA to sort genes based on LFC or SLPV
+ - implemented Ontologizer method (-M ONT), which works better for GO terms
+ - updated auxilliary files in transit data directory for different systems of functional categories (COG, Sanger, GO)
+ - added '-signif' flag to GI (Genetic Interaction analysis) (options: HDI, prob, BFDR, FWER)
+ - updated description of methods for determining significant interactions in documentation
+ - various improvements to other methods, including corrplot and heatmap
+
## Version 3.1.0 2020-03-08
#### TRANSIT:
- added 'corrplot' and 'heatmap' commands
=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+tnseq-transit (3.2.0-1) unstable; urgency=medium
+
+ * Team Upload.
+ * New upstream version 3.2.0
+
+ -- Nilesh Patra <npatra974 at gmail.com> Thu, 29 Oct 2020 02:49:44 +0530
+
tnseq-transit (3.1.0-2) unstable; urgency=medium
* Team Upload.
=====================================
src/pytransit/__init__.py
=====================================
@@ -2,6 +2,6 @@
__all__ = ["transit_tools", "tnseq_tools", "norm_tools", "stat_tools"]
-__version__ = "v3.1.0"
+__version__ = "v3.2.0"
prefix = "[TRANSIT]"
=====================================
src/pytransit/analysis/corrplot.py
=====================================
@@ -154,6 +154,8 @@ class CorrplotMethod(base.SingleConditionMethod):
genenames = ["%s/%s" % (w[0],w[1]) for w in data]
hash = {}
headers = [h.replace("Mean_","") for h in headers]
+ headers = [h.replace("-",".") for h in headers] # because of R conversion
+ headers = ["X"+x if x[0].isdigit() else x for x in headers] # because R prepends 'X' to column names starting with a digit
for i,col in enumerate(headers): hash[col] = FloatVector([x[i] for x in means])
df = DataFrame(hash) # can't figure out how to set rownames
=====================================
src/pytransit/analysis/gi.py
=====================================
@@ -22,7 +22,7 @@ import math
import random
import numpy
import scipy.stats
-import datetime
+import datetime
from pytransit.analysis import base
import pytransit
@@ -44,7 +44,7 @@ NOTE: This method requires 4 groups of datasets. Use the main interface to add d
"""
transposons = ["himar1"]
-columns = ["Orf","Name","Number of TA Sites","Mean count (Strain A Condition 1)","Mean count (Strain A Condition 2)","Mean count (Strain B Condition 1)","Mean count (Strain B Condition 2)", "Mean logFC (Strain A)", "Mean logFC (Strain B)", "Mean delta logFC","Lower Bound delta logFC","Upper Bound delta logFC", "Prob. of delta-logFC being within ROPE", "Adjusted Probability (BFDR)", "Is HDI outside ROPE?", "Type of Interaction"]
+columns = ["Orf","Name","Number of TA Sites","Mean count (Strain A Condition 1)","Mean count (Strain A Condition 2)","Mean count (Strain B Condition 1)","Mean count (Strain B Condition 2)", "Mean logFC (Strain A)", "Mean logFC (Strain B)", "Mean delta logFC","Lower Bound delta logFC","Upper Bound delta logFC", "Prob. of delta-logFC being within ROPE", "Adjusted Probability", "Is HDI outside ROPE?", "Type of Interaction"]
class GIAnalysis(base.TransitAnalysis):
@@ -143,7 +143,7 @@ class GIGUI(base.AnalysisGUI):
# Zeros Check
- (self.wxobj.giZeroCheckBox, zeroSizer) = self.defineCheckBox(giPanel, labelText="Include sites with all zeros", widgetCheck=True, widgetSize=(-1,-1), tooltipText="Includes sites that are empty (zero) accross all datasets. Unchecking this may be useful for tn5 datasets, where all nucleotides are possible insertion sites and will have a large number of empty sites (significantly slowing down computation and affecting estimates).")
+ (self.wxobj.giZeroCheckBox, zeroSizer) = self.defineCheckBox(giPanel, labelText="Include sites with all zeros", widgetCheck=True, widgetSize=(-1,-1), tooltipText="Includes sites that are empty (zero) across all datasets. Unchecking this may be useful for tn5 datasets, where all nucleotides are possible insertion sites and will have a large number of empty sites (significantly slowing down computation and affecting estimates).")
giSizer.Add(zeroSizer, 0, wx.EXPAND, 5 )
@@ -474,6 +474,7 @@ class GIMethod(base.QuadConditionMethod):
normalization="TTR",
samples=10000,
rope=0.5,
+ signif="HDI",
includeZeros=False,
replicates="Sum",
LOESS=False,
@@ -486,8 +487,7 @@ class GIMethod(base.QuadConditionMethod):
self.samples = samples
self.includeZeros = includeZeros
self.rope = rope
- self.doBFDR = True # TRI
- self.doFWER = True # TRI
+ self.signif = signif
self.NTerminus = NTerminus
self.CTerminus = CTerminus
@@ -558,6 +558,7 @@ class GIMethod(base.QuadConditionMethod):
if not output_path: return None
output_file = open(output_path, "w")
+ signif = "HDI" # need to add choice for this to the GUI
return self(ctrldataA,
ctrldataB,
@@ -568,6 +569,7 @@ class GIMethod(base.QuadConditionMethod):
normalization,
samples,
rope,
+ signif,
includeZeros,
replicates,
LOESS,
@@ -600,10 +602,10 @@ class GIMethod(base.QuadConditionMethod):
normalization = kwargs.get("n", "TTR")
samples = int(kwargs.get("s", 10000))
rope = float(kwargs.get("-rope", 0.5)) # fixed! changed int to float
+ signif = kwargs.get("signif","HDI")
replicates = kwargs.get("r", "Sum")
includeZeros = kwargs.get("iz", False)
-
LOESS = kwargs.get("l", False)
ignoreCodon = True
NTerminus = float(kwargs.get("iN", 0.00))
@@ -618,6 +620,7 @@ class GIMethod(base.QuadConditionMethod):
normalization,
samples,
rope,
+ signif,
includeZeros,
replicates,
LOESS,
@@ -674,7 +677,7 @@ class GIMethod(base.QuadConditionMethod):
var_list_b2 = []
- # Base priors on empirical observations accross genes.
+ # Base priors on empirical observations across genes.
for gene in sorted(G_A1):
if gene.n > 1:
A1_data = G_A1[gene.orf].reads.flatten()
@@ -764,7 +767,7 @@ class GIMethod(base.QuadConditionMethod):
mean_logFC_B = numpy.mean(logFC_B_post)
mean_delta_logFC = numpy.mean(delta_logFC_post)
- # Is HDI significantly different than ROPE?
+ # Is HDI significantly different than ROPE? (i.e. no overlap)
not_HDI_overlap_bit = l_delta_logFC > self.rope or u_delta_logFC < -self.rope
# Probability of posterior overlaping with ROPE
@@ -816,25 +819,35 @@ class GIMethod(base.QuadConditionMethod):
self.transit_message_inplace("Running Export Method... %1.1f%%" % (100.0*count/(N-1)))
count+=1
- data.sort(key=lambda x: x[-2])
+ # for HDI, maybe I should sort on abs(mean_delta_logFC); however, need to sort by prob to calculate BFDR
+ probcol = -2 # probROPEs
+ data.sort(key=lambda x: x[probcol])
+ sortedprobs = numpy.array([x[probcol] for x in data])
+
+ # BFDR method: Newton M.A., Noueiry A., Sarkar D., Ahlquist P. (2004). Detecting differential gene expression with a semiparametric hierarchical mixture method. Biostatistics, 5:155–176.
- if self.doBFDR or not self.doFWER:
- postprob = numpy.array(postprob)
- postprob.sort()
- bfdr = numpy.cumsum(postprob)/numpy.arange(1, len(postprob)+1)
- adjusted_prob = bfdr
+ if self.signif=="BFDR":
+ sortedprobs = numpy.array(sortedprobs)
+ #sortedprobs.sort() # why, since already sorted?
+ bfdr = numpy.cumsum(sortedprobs)/numpy.arange(1, len(sortedprobs)+1)
+ adjusted_prob = bfdr # should be same order as sorted above by probROPE
adjusted_label = "BFDR"
- elif doFWER:
- fwer = FWER_Bayes(postprob)
- fwer.sort()
+
+ elif self.signif=="FWER":
+ fwer = stat_tools.FWER_Bayes(sortedprobs)
+ #fwer.sort() # should not need this if monotonic
adjusted_prob = fwer
adjusted_label = "FWER"
# If not using adjustment for classification, sort correctly
- if not self.doBFDR and not self.doFWER:
- sorted_index = numpy.argsort([d[-1] for d in data])[::-1][:len(data)]
- adjusted_prob = [adjusted_prob[ii] for ii in sorted_index]
- data = [data[ii] for ii in sorted_index]
+ else:
+ adjusted_prob = sortedprobs
+ adjusted_label = "un"
+ # should I stable-sort by overlap_bit?
+
+# sorted_index = numpy.argsort([d[-1] for d in data])[::-1][:len(data)]
+# adjusted_prob = [adjusted_prob[ii] for ii in sorted_index]
+# data = [data[ii] for ii in sorted_index]
@@ -848,37 +861,40 @@ class GIMethod(base.QuadConditionMethod):
else:
self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))
+ now = str(datetime.datetime.now())
+ now = now[:now.rfind('.')]
+ self.output.write("#Date: "+now+"\n")
+ #self.output.write("#Runtime: %s s\n" % (time.time() - start_time))
+
self.output.write("#Control Data-A: %s\n" % (",".join(self.ctrldataA).encode('utf-8')))
self.output.write("#Control Data-B: %s\n" % (",".join(self.ctrldataB).encode('utf-8')))
self.output.write("#Experimental Data-A: %s\n" % (",".join(self.expdataA).encode('utf-8')))
self.output.write("#Experimental Data-B: %s\n" % (",".join(self.expdataB).encode('utf-8')))
self.output.write("#Annotation path: %s\n" % (self.annotation_path.encode('utf-8')))
- self.output.write("#Time: %s\n" % (time.time() - start_time))
- self.output.write("#%s\n" % "\t".join(columns))
-
+ self.output.write("#ROPE=%s, method for significance=%s\n" % (self.rope,self.signif))
+ #self.output.write("#%s\n" % "\t".join(columns))
- if self.doBFDR or self.doFWER:
- self.output.write("# Significant interactions are those whose adjusted probability of the delta-logFC falling within ROPE is < 0.05 (Adjusted using %s)\n" % (adjusted_label))
- else:
- self.output.write("# Significant interactions are those genes whose delta-logFC HDI does not overlap the ROPE\n")
- self.output.write("#\n")
+ if self.signif=="HDI":
+ self.output.write("#Significant interactions are those genes whose delta-logFC HDI does not overlap the ROPE\n")
+ elif self.signif in "prob BDFR FWER":
+ self.output.write("#Significant interactions are those whose %s-adjusted probability of the delta-logFC falling within ROPE is < 0.05.\n" % (adjusted_label))
- # Write column names
- self.output.write("#ORF\tName\tNumber of TA Sites\tMean count (Strain A Condition 1)\tMean count (Strain A Condition 2)\tMean count (Strain B Condition 1)\tMean count (Strain B Condition 2)\tMean logFC (Strain A)\tMean logFC (Strain B) \tMean delta logFC\tLower Bound delta logFC\tUpper Bound delta logFC\tProb. of delta-logFC being within ROPE\tAdjusted Probability (%s)\tIs HDI outside ROPE?\tType of Interaction\n" % adjusted_label)
+ # Write column names (redundant with self.columns)
+ self.output.write("#ORF\tName\tNumber of TA Sites\tMean count (Strain A Condition 1)\tMean count (Strain A Condition 2)\tMean count (Strain B Condition 1)\tMean count (Strain B Condition 2)\tMean logFC (Strain A)\tMean logFC (Strain B) \tMean delta logFC\tLower Bound delta logFC\tUpper Bound delta logFC\tIs HDI outside ROPE?\tProb. of delta-logFC being within ROPE\t%s-Adjusted Probability\tType of Interaction\n" % adjusted_label)
# Write gene results
for i,row in enumerate(data):
- #1 2 3 4 5 6 7 8 9 10 11 12 13 14
- orf, name, n, mean_muA1_post, mean_muA2_post, mean_muB1_post, mean_muB2_post, mean_logFC_A, mean_logFC_B, mean_delta_logFC, l_delta_logFC, u_delta_logFC, probROPE, not_HDI_overlap_bit = row
- type_of_interaction = "No Interaction"
- if ((self.doBFDR or self.doFWER) and adjusted_prob[i] < 0.05):
- type_of_interaction = self.classify_interaction(mean_delta_logFC, mean_logFC_B, mean_logFC_A)
- elif not (self.doBFDR or self.doFWER) and not_HDI_overlap_bit:
- type_of_interaction = self.classify_interaction(mean_delta_logFC, mean_logFC_B, mean_logFC_A)
+ #1 2 3 4 5 6 7 8 9 10 11 12 13 14
+ orf, name, n, mean_muA1_post, mean_muA2_post, mean_muB1_post, mean_muB2_post, mean_logFC_A, mean_logFC_B, mean_delta_logFC, l_delta_logFC, u_delta_logFC, probROPE, not_HDI_overlap_bit = row
+
+ interaction = self.classify_interaction(mean_delta_logFC, mean_logFC_B, mean_logFC_A)
+ type_of_interaction = "No Interaction"
+ if self.signif in "prob BFDR FWER" and adjusted_prob[i] < 0.05: type_of_interaction = interaction
+ if self.signif=="HDI" and not_HDI_overlap_bit: type_of_interaction = interaction
- new_row = tuple(list(row[:-1])+[adjusted_prob[i], not_HDI_overlap_bit, type_of_interaction])
- self.output.write("%s\t%s\t%d\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.8f\t%1.8f\t%s\t%s\n" % new_row)
+ new_row = tuple(list(row[:-2])+[not_HDI_overlap_bit, probROPE, adjusted_prob[i], type_of_interaction])
+ self.output.write("%s\t%s\t%d\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%s\t%1.8f\t%1.8f\t%s\n" % new_row)
self.transit_message("Adding File: %s" % (self.output.name))
@@ -907,16 +923,24 @@ class GIMethod(base.QuadConditionMethod):
GI performs a comparison among 4 groups of datasets, strain A and B assessed in conditions 1 and 2 (e.g. control vs treatment).
It looks for interactions where the response to the treatment (i.e. effect on insertion counts) depends on the strain (output variable: delta_LFC).
Provide replicates in each group as a comma-separated list of wig files.
- Significant interactions are those with "HDI outside ROPE?"=TRUE, and all genes are sorted by significance using BFDR."
+ HDI is highest density interval for posterior distribution of delta_LFC, which is like a confidence interval on difference of slopes.
+ Genes are sorted by probability of HDI overlapping with ROPE. (genes with the highest abs(mean_delta_logFC) are near the top, approximately)
+ Significant genes are indicated by 'Type of Interaction' column (No Interaction, Aggravating, Alleviating, Suppressive).
+ By default, hits are defined as "Is HDI outside of ROPE?"=TRUE (i.e. non-overlap of delta_LFC posterior distritbuion with Region of Probably Equivalence around 0)
+ Alternative methods for significance: use -signif flag with prob, BFDR, or FWER. These affect 'Type of Interaction' (i.e. which genes are labeled 'No Interaction')
Optional Arguments:
-s <integer> := Number of samples. Default: -s 10000
--rope <float> := Region of Practical Equivalence. Area around 0 (i.e. 0 +/- ROPE) that is NOT of interest. Can be thought of similar to the area of the null-hypothesis. Default: --rope 0.5
-n <string> := Normalization method. Default: -n TTR
- -iz := Include rows with zero accross conditions.
+ -iz := Include rows with zero across conditions.
-l := Perform LOESS Correction; Helps remove possible genomic position bias. Default: Turned Off.
-iN <float> := Ignore TAs occuring at given percentage (as integer) of the N terminus. Default: -iN 0
-iC <float> := Ignore TAs occuring at given percentage (as integer) of the C terminus. Default: -iC 0
+ -signif HDI := (default) Significant if HDI does not overlap ROPE; if HDI overlaps ROPE, 'Type of Interaction' is set to 'No Interaction'
+ -signif prob := Optionally, significant hits are re-defined based on probability (degree) of overlap of HDI with ROPE, prob<0.05 (no adjustment)
+ -signif BFDR := Apply "Bayesian" FDR correction (see doc) to adjust HDI-ROPE overlap probabilities so that significant hits are re-defined as BFDR<0.05
+ -signif FWER := Apply "Bayesian" FWER correction (see doc) to adjust HDI-ROPE overlap probabilities so that significant hits are re-defined as FWER<0.05
""" % (sys.argv[0])
=====================================
src/pytransit/analysis/heatmap.py
=====================================
@@ -102,41 +102,51 @@ class HeatmapMethod(base.SingleConditionMethod):
@classmethod
def fromargs(self, rawargs):
+ (args, kwargs) = transit_tools.cleanargs(rawargs)
if len(rawargs)<3: print(self.usage_string()); sys.exit(-1)
self.filetype = None
- if rawargs[0]=="-anova": self.filetype = "anova"
- elif rawargs[0]=="-zinb": self.filetype = "zinb"
+ if kwargs.get("anova",False): self.filetype = "anova"
+ elif kwargs.get("zinb",False): self.filetype = "zinb"
else: print(self.usage_string()); sys.exit(-1)
- self.infile = rawargs[1]
- self.outfile = rawargs[2]
+ self.infile = args[0]
+ self.outfile = args[1]
+ self.qval = float(kwargs.get("qval",0.05))
+ self.topk = int(kwargs.get("topk",-1))
return self(self.infile,outfile=self.outfile)
def Run(self):
- # assume first non-comment line is header; samples are
+ if self.filetype!="anova" and self.filetype!="zinb":
+ print("filetype not recognized: %s" % self.filetype); sys.exit(-1)
+
headers = None
- data,LFCs = [],[]
-
- if self.filetype=="anova" or self.filetype=="zinb":
- n = -1 # number of conditions
- for line in open(self.infile):
- w = line.rstrip().split('\t')
- if line[0]=='#' or ('pval' in line and 'padj' in line): # check for 'pval' for backwards compatibility
- headers = w; continue # keep last comment line as headers
- if n==-1:
- # ANOVA header line has names of conditions, organized as 3+2*n+3 (2 groups (means, LFCs) X n conditions)
- # ZINB header line has names of conditions, organized as 3+4*n+3 (4 groups X n conditions)
- if self.filetype=="anova": n = int((len(w)-6)/2)
- elif self.filetype=="zinb": n = int((len(headers)-6)/4)
- headers = headers[3:3+n]
- headers = [x.replace("Mean_","") for x in headers]
+ data,hits = [],[]
+ n = -1 # number of conditions
+
+ for line in open(self.infile):
+ w = line.rstrip().split('\t')
+ if line[0]=='#' or ('pval' in line and 'padj' in line): # check for 'pval' for backwards compatibility
+ headers = w; continue # keep last comment line as headers
+ # assume first non-comment line is header
+ if n==-1:
+ # ANOVA header line has names of conditions, organized as 3+2*n+3 (2 groups (means, LFCs) X n conditions)
+ # ZINB header line has names of conditions, organized as 3+4*n+3 (4 groups X n conditions)
+ if self.filetype=="anova": n = int((len(w)-6)/2)
+ elif self.filetype=="zinb": n = int((len(headers)-6)/4)
+ headers = headers[3:3+n]
+ headers = [x.replace("Mean_","") for x in headers]
+ else:
lfcs = [float(x) for x in w[3+n:3+n+n]] # take just the columns of means
qval = float(w[-2])
- if qval<0.05: data.append(w); LFCs.append(lfcs)
- else: print("filetype not recognized: %s" % self.filetype); sys.exit(-1)
- print("heatmap based on %s genes" % len(LFCs))
+ data.append((w,lfcs,qval))
- genenames = ["%s/%s" % (w[0],w[1]) for w in data]
+ data.sort(key=lambda x: x[-1])
+ hits,LFCs = [],[]
+ for k,(w,lfcs,qval) in enumerate(data):
+ if (self.topk==-1 and qval<self.qval) or (self.topk!=-1 and k<self.topk): hits.append(w); LFCs.append(lfcs)
+
+ print("heatmap based on %s genes" % len(hits))
+ genenames = ["%s/%s" % (w[0],w[1]) for w in hits]
hash = {}
headers = [h.replace("Mean_","") for h in headers]
for i,col in enumerate(headers): hash[col] = FloatVector([x[i] for x in LFCs])
@@ -167,8 +177,7 @@ dev.off()
@classmethod
def usage_string(self):
- return "usage: python3 %s heatmap -anova|-zinb <anova_or_zinb_output> <heatmap.png>" % sys.argv[0]
- # could add a flag for padj cutoff (or top n most signif genes)
+ return "usage: python3 %s heatmap <anova_or_zinb_output> <heatmap.png> -anova|-zinb [-topk <int>] [-qval <float>]\n note: genes are selected based on qval<0.05 by default" % sys.argv[0]
if __name__ == "__main__":
=====================================
src/pytransit/analysis/pathway_enrichment.py
=====================================
@@ -79,7 +79,7 @@ class PathwayGUI(base.AnalysisGUI):
class PathwayMethod(base.AnalysisMethod):
- def __init__(self,resamplingFile,associationsFile,pathwaysFile,outputFile,method,PC=0,N=10000,p=1):
+ def __init__(self,resamplingFile,associationsFile,pathwaysFile,outputFile,method,PC=0,Nperm=10000,p=0,ranking="SLPV"):
base.AnalysisMethod.__init__(self, short_name, long_name, short_desc, long_desc, open(outputFile,"w"), None) # no annotation file
self.resamplingFile = resamplingFile
self.associationsFile = associationsFile
@@ -87,9 +87,10 @@ class PathwayMethod(base.AnalysisMethod):
self.outputFile = outputFile
# self.output is the opened file, which will be set in base class
self.method = method
- self.PC = PC # for FISHER
- self.N = N # for GSEA
+ self.PC = PC # for FET
+ self.Nperm = Nperm # for GSEA
self.p = p # for GSEA
+ self.ranking = ranking # for GSEA
@classmethod
def fromGUI(self, wxobj):
@@ -102,37 +103,82 @@ class PathwayMethod(base.AnalysisMethod):
associations = args[1]
pathways = args[2]
output = args[3]
- method = kwargs.get("M", "FISHER")
- N = int(kwargs.get("N", "10000")) # for GSEA?
- p = int(kwargs.get("p","1")) # for GSEA?
- PC = int(kwargs.get("PC","2"))
+ method = kwargs.get("M", "FET")
+ PC = int(kwargs.get("PC","2")) # for FET
+ Nperm = int(kwargs.get("Nperm", "10000")) # for GSEA
+ p = float(kwargs.get("p","0")) # for GSEA
+ ranking = kwargs.get("ranking","SLPV") # for GSEA
- if method not in "FISHER GSEA ONT".split():
+ if method not in "FET GSEA ONT".split():
print("error: method %s not recognized" % method)
print(self.usage_string());
sys.exit(0)
- if method=="ONT": print("error: Ontologizer is not implemented yet"); sys.exit(0)
- return self(resamplingFile,associations,pathways,output,method,PC=PC,N=N,p=p)
+ return self(resamplingFile,associations,pathways,output,method,PC=PC,Nperm=Nperm,p=p,ranking=ranking)
@classmethod
def usage_string(self):
- return """python3 %s pathway_enrichment <resampling_file> <associations> <pathways> <output_file> [-M <FISHER|GSEA|GO>] [-PC <int>]""" % (sys.argv[0])
+ return """python3 %s pathway_enrichment <resampling_file> <associations> <pathways> <output_file> [-M <FET|GSEA|GO>] [-PC <int>] [-ranking SLPV|LFC] [-p <float>] [-Nperm <int>]
+
+Optional parameters:
+ -M FET|GSEA|ONT: method to use, FET for Fisher's Exact Test (default), GSEA for Gene Set Enrichment Analysis (Subramaniam et al, 2005), or ONT for Ontologizer (Grossman et al, 2007)
+ -ranking (for GSEA): SLPV is signed-log-p-value (default); LFC is log2-fold-change from resampling
+ -p (for GSEA): exponent to use in calculating enrichment score; recommend trying 0 or 1 (as in Subramaniam et al, 2005)
+ -Nperm (for GSEA): number of permutations to simulate for null distribution to determine p-value (default=10000)
+ -PC (for FET): pseudo-counts to use in calculating p-value based on hypergeometric distribution (default=2)
+""" % (sys.argv[0])
def Run(self):
self.transit_message("Starting Pathway Enrichment Method")
start_time = time.time()
# self.output in base class should be open by now
- self.write("# command: "+' '.join(sys.argv))
- self.write("# date: "+str(datetime.datetime.now()))
+ self.write("# command: python3 "+' '.join(sys.argv))
+ now = str(datetime.datetime.now())
+ now = now[:now.rfind('.')]
+ self.write("# date: "+now)
+ self.write("# pathway method: "+self.method)
- if self.method=="FISHER": self.fisher_exact_test()
+ if self.method=="FET": self.fisher_exact_test()
elif self.method =="GSEA": self.GSEA()
+ elif self.method =="ONT": self.Ontologizer()
else:
method = "Not a valid method"
self.progress_update("Not a valid method", 100)
+ def write(self,msg): self.output.write(msg+"\n")
+
+ def read_resampling_file(self,filename):
+ genes,hits,headers = [],[],[]
+ for line in open(filename):
+ if line[0]=='#': headers.append(line); continue
+ w = line.rstrip().split('\t')
+ genes.append(w)
+ qval = float(w[-1])
+ if qval<0.05: hits.append(w[0])
+ return genes,hits,headers
+
+ # assume these are listed as pairs (tab-sep)
+ # return bidirectional hash (genes->[terms], terms->[genes]; each can be one-to-many, hence lists)
+ def read_associations(self,filename):
+ associations = {}
+ for line in open(filename):
+ if line[0]=='#': continue
+ w = line.rstrip().split('\t')
+ # store mappings in both directions
+ for (a,b) in [(w[0],w[1]),(w[1],w[0])]:
+ if a not in associations: associations[a] = []
+ associations[a].append(b)
+ return associations
+
+ def read_pathways(self,filename):
+ pathways = {}
+ for line in open(filename):
+ if line[0]=='#': continue
+ w = line.rstrip().split('\t')
+ pathways[w[0]] = w[1]
+ return pathways
+
############### GSEA ######################
def makeindex(self,lst):
@@ -141,7 +187,6 @@ class PathwayMethod(base.AnalysisMethod):
return index
# based on GSEA paper (Subramanian et al, 2005, PNAS)
- # xxx assume that Bindex is a dictionary that maps all genes into ranks, and Bscores maps all genes to SLPV
# ranks and scores are hashes from genes into ranks and SLPV
# when p=0, ES(S) reduces to the standard K-S statistic; p=1 is used in PNAS paper
@@ -183,7 +228,8 @@ class PathwayMethod(base.AnalysisMethod):
terms2orfs = associations
allgenes = [x[0] for x in data]
- self.write("# method=GSEA, using SLPV to rank genes, Nperm=%d" % self.N)
+ self.write("# method=GSEA, Nperm=%d, p=%d" % (self.Nperm,self.p))
+ self.write("# ranking genes by %s" % self.ranking)
self.write("# total genes: %s, mean rank: %s" % (len(data),n2))
# rank by SLPV=sign(LFC)*log10(pval)
@@ -195,14 +241,21 @@ class PathwayMethod(base.AnalysisMethod):
for w in data:
orf,LFC,Pval = w[0],float(w[LFC_col]),float(w[Pval_col])
SLPV = (-1 if LFC<0 else 1)*math.log(Pval+0.000001,10)
- pairs.append((orf,SLPV))
+ if self.ranking=="SLPV": pairs.append((orf,SLPV))
+ elif self.ranking=="LFC": pairs.append((orf,LFC))
+
+ # pre-randomize ORFs, to avoid genome-position bias in case of ties in pvals (e.g. 1.0)
+ indexes = range(len(pairs))
+ indexes = numpy.random.permutation(indexes).tolist()
+ pairs = [pairs[i] for i in indexes]
+
pairs.sort(key=lambda x: x[1],reverse=True) # emulate ranking genes with *higher* correlation at top
orfs2rank,orfs2score = {},{}
for i,(orf,score) in enumerate(pairs):
orfs2score[orf] = score
orfs2rank[orf] = i
- Nperm = self.N
+ Nperm = self.Nperm
results,Total = [],len(terms)
for i,term in enumerate(terms):
sys.stdout.flush()
@@ -254,38 +307,7 @@ class PathwayMethod(base.AnalysisMethod):
self.output.write('\t'.join([str(x) for x in vals])+'\n')
self.output.close()
- #################################
-
- def read_resampling_file(self,filename):
- genes,hits,headers = [],[],[]
- for line in open(filename):
- if line[0]=='#': headers.append(line); continue
- w = line.rstrip().split('\t')
- genes.append(w)
- qval = float(w[-1])
- if qval<0.05: hits.append(w[0])
- return genes,hits,headers
-
- # assume these are listed as pairs (tab-sep)
- # return bidirectional hash (genes->[terms], terms->[genes]; each can be one-to-many, hence lists)
- def read_associations(self,filename):
- associations = {}
- for line in open(filename):
- if line[0]=='#': continue
- w = line.rstrip().split('\t')
- # store mappings in both directions
- for (a,b) in [(w[0],w[1]),(w[1],w[0])]:
- if a not in associations: associations[a] = []
- associations[a].append(b)
- return associations
-
- def read_pathways(self,filename):
- pathways = {}
- for line in open(filename):
- if line[0]=='#': continue
- w = line.rstrip().split('\t')
- pathways[w[0]] = w[1]
- return pathways
+ ########## Fisher Exact Test ###############
# HYPERGEOMETRIC
# scipy.stats.hypergeom.sf() is survival function (1-cdf), so only enriched genes will be significant
@@ -297,8 +319,6 @@ class PathwayMethod(base.AnalysisMethod):
def hypergeometric(self,k,M,n,N):
return hypergeom.sf(k,M,n,N)
- def write(self,msg): self.output.write(msg+"\n")
-
def fisher_exact_test(self):
genes,hits,headers = self.read_resampling_file(self.resamplingFile)
associations = self.read_associations(self.associationsFile)
@@ -313,7 +333,7 @@ class PathwayMethod(base.AnalysisMethod):
for gene in genes:
orf = gene[0]
if orf in associations: genes_with_associations += 1
- self.write("# method=FISHER, PC=%s" % self.PC)
+ self.write("# method=FET, PC=%s" % self.PC)
self.write("# genes with associations=%s out of %s total" % (genes_with_associations,len(genes)))
self.write("# significant genes (qval<0.05): %s" % (len(hits)))
@@ -366,6 +386,142 @@ class PathwayMethod(base.AnalysisMethod):
self.finish()
self.transit_message("Finished Pathway Enrichment Method")
+ ########## Ontologizer ###############
+
+ # this method is restricted to GO terms
+
+ # this implements the union method of:
+ # Grossman et al (2007). Improved Detection of overrepresentation of Gene-Ontology
+ # annotation with parent-child analysis. Bioinformatics, 23(22):3024-3031.
+
+ def Ontologizer(self):
+
+ def warning(s): sys.stderr.write("%s\n" % s) # use self.warning? prepend method name?
+
+ # returns True if ch is a descendant of GO (as GO terms, like "GO:0006810")
+
+ def descendant_of(ch,GO):
+ if ch==GO: return True
+ for node in parents.get(ch,[]):
+ if descendant_of(node,GO)==True: return True
+ return False
+
+ # visited is a hash
+ # assume B is above A
+ # do upward DFS following parent pointers till hit NULL
+ # return all nodes on all paths? include A and B?
+
+ def get_ancestors(A,visited):
+ if A in visited: return visited
+ visited[A] = 1
+ for parent in parents.get(A,[]): get_ancestors(parent,visited)
+ return visited
+
+ def depth(go):
+ if go not in parents: return 0
+ return 1+min([depth(x) for x in parents[go]])
+
+ ontology,parents = {},{}
+
+ GOannot = self.associationsFile
+ OBOfile = self.pathwaysFile
+
+ for line in open(OBOfile):
+ if line[:3]=="id:": id = line[4:-1]
+ if line[:5]=="is_a:":
+ parent = line.split()[1]
+ if id not in parents: parents[id] = []
+ parents[id].append(parent)
+ if len(line)<2: id = None
+ if line[:5]=="name:": ontology[id] = line[6:-1]
+
+ rv2gos,go2rvs = {},{}
+ MINTERMS,MAXTERMS = 2,300
+
+ for line in open(GOannot):
+ w = line.rstrip().split('\t')
+ rv,go = w[0],w[1]
+ if rv not in rv2gos: rv2gos[rv] = []
+ if go not in rv2gos[rv]: rv2gos[rv].append(go)
+ if go not in go2rvs: go2rvs[go] = []
+ if rv not in go2rvs[go]: go2rvs[go].append(rv)
+ # expand to all parents...
+ BP,CC,MF = ["GO:0008150","GO:0005575","GO:0003674"]
+ for g in get_ancestors(go,{}):
+ if g not in rv2gos[rv]:
+ rv2gos[rv].append(g)
+ if g not in go2rvs: go2rvs[g] = []
+ go2rvs[g].append(rv)
+
+ warning("GO terms with at least one ORF: %s" % len(go2rvs.keys())) # what about between MIN and MAX?
+ for go in go2rvs.keys():
+ if go not in ontology: warning("not found: %s" % go) # also indicate which gene?
+
+ # could use class method, but would have to adapt it:
+ # genes,hits,headers = self.read_resampling_file(self.resamplingFile)
+
+ genes,pvals = {},[]
+ allorfs,studyset = [],[]
+ for line in open(self.resamplingFile):
+ if line[0]=="#": continue
+ w = line.rstrip().split()
+ genes[w[0]] = w
+ allorfs.append(w[0])
+ pval,qval = float(w[-2]),float(w[-1])
+ if qval<0.05: studyset.append(w[0])
+ pvals.append((w[0],pval))
+ pvals.sort(key=lambda x: x[1])
+ ranks = {}
+ for i,(rv,pval) in enumerate(pvals): ranks[rv] = i+1
+ self.write("# number of resampling hits (qval<0.05): %s" % len(studyset))
+
+ counts = []
+ n,a = len(allorfs),len(studyset)
+ for go in go2rvs.keys():
+ if go not in ontology: continue
+ m = len(go2rvs[go]) # orfs in the GO terms in the genome overall
+ if m>=MINTERMS and m<=MAXTERMS:
+ b = len(list(filter(lambda x: x in go2rvs[go],studyset))) # orfs with GO term in studyset
+ if b==0: continue
+ # calc p=len(rvs for parents of go), q=len(subset of parent rvs in studyset)
+ P = set()
+ if go not in parents: continue
+ npar = len(parents[go])
+ for par in parents[go]: P = P.union(go2rvs[par])
+ p,q = len(P),len(P.intersection(studyset))
+ enrich = ((b+1)/float(q+1))/((m+1)/float(p+1)) # enrichment
+ # parents: p overall, q in studyset; GO in studyset: b out of q (subset of studyset labeled with a parent GO term)
+ # assume m orfs have GO term inside parent (same as overall)
+ # subtract 1 from b possibly because we are using 1-cdf? this is needed to get same numeric results as 'phypergeometric' in Ontologizer
+ pval = 1.0-scipy.stats.hypergeom.cdf(b-1,p,m,q)
+ counts.append([go,n,m,p,q,a,b,npar,enrich,pval])
+
+ pvals = [x[-1] for x in counts]
+ qvals = multitest.fdrcorrection(pvals)[1]
+
+ counts = [x+[y] for x,y in zip(counts,qvals)]
+ counts.sort(key=lambda x: x[-1])
+
+ self.write("# number of GO terms that are significantly enriched (qval<0.05): %s" % len(list(filter(lambda x: x[-1]<0.05,counts))))
+
+ # 1-sided pvals, only report enriched terms, not depleted
+
+ self.write('\t'.join("GO_term description total_orfs orfs_in_GO orfs_in_parents hits_in_parents hits GO_in_hits num_parent_nodes enrichment pval qval genes_in_intersection_of_hits_and_GO_term".split())) # assume self.outputFile has already been opened
+ for (go,n,m,p,q,a,b,npar,enrich,pval,qval) in counts:
+ hits = filter(lambda x: x in go2rvs[go],studyset)
+ hits = [(x,genes[x][1],ranks[x]) for x in hits]
+ hits.sort(key=lambda x: x[2]) # sort on ranks
+ hits = ["%s/%s(%s)" % (rv,gene,rank) for (rv,gene,rank) in hits]
+ hits = ','.join(hits) # add gene names
+ vals = [go,ontology[go],n,m,a,b,p,q,npar,round(enrich,3),round(pval,6),round(qval,6),hits]
+ self.write('\t'.join([str(x) for x in vals]))
+
+ self.transit_message("Adding File: %s" % (self.outputFile))
+ self.add_file(filetype="Pathway Enrichment")
+ self.finish()
+ self.transit_message("Finished Pathway Enrichment Method")
+
+
####################################################
if __name__ == "__main__":
=====================================
src/pytransit/data/GO_terms_for_each_Rv.obo-3-11-18.txt
=====================================
The diff for this file was not included because it is too large.
=====================================
src/pytransit/data/H37Rv_COG_roles.dat
=====================================
@@ -40,7 +40,6 @@ Rv0054 L ssb Replication, recombination and repair
Rv0055 J rpsR Translation, ribosomal structure and biogenesis
Rv0056 J rplI Translation, ribosomal structure and biogenesis
Rv0058 L dnaB Replication, recombination and repair
-Rv0058 L dnaB Replication, recombination and repair
Rv0060 R - General function prediction only
Rv0062 G celA1 Carbohydrate transport and metabolism
Rv0063 C - Energy production and conversion
@@ -66,7 +65,6 @@ Rv0084 C hycD Energy production and conversion
Rv0085 C hycP Energy production and conversion
Rv0086 P hycQ Inorganic ion transport and metabolism
Rv0087 C hycE Energy production and conversion
-Rv0087 C hycE Energy production and conversion
Rv0088 J - Translation, ribosomal structure and biogenesis
Rv0089 R - General function prediction only
Rv0090 P - Inorganic ion transport and metabolism
@@ -78,7 +76,6 @@ Rv0097 Q - Secondary metabolites biosynthesis, transport and catabolism
Rv0099 Q fadD10 Secondary metabolites biosynthesis, transport and catabolism
Rv0100 Q - Secondary metabolites biosynthesis, transport and catabolism
Rv0101 Q nrp Secondary metabolites biosynthesis, transport and catabolism
-Rv0101 Q nrp Secondary metabolites biosynthesis, transport and catabolism
Rv0102 S - Function unknown
Rv0103c P ctpB Inorganic ion transport and metabolism
Rv0104 T - Signal transduction mechanisms
@@ -135,11 +132,11 @@ Rv0165c K - Transcription
Rv0166 Q fadD5 Secondary metabolites biosynthesis, transport and catabolism
Rv0167 Q yrbE1A Secondary metabolites biosynthesis, transport and catabolism
Rv0168 Q yrbE1B Secondary metabolites biosynthesis, transport and catabolism
-Rv0169 T mce1A Signal transduction mechanisms
Rv0169 Q mce1A Secondary metabolites biosynthesis, transport and catabolism
+Rv0169 T mce1A Signal transduction mechanisms
Rv0170 Q mce1B Secondary metabolites biosynthesis, transport and catabolism
-Rv0171 T mce1C Signal transduction mechanisms
Rv0171 Q mce1C Secondary metabolites biosynthesis, transport and catabolism
+Rv0171 T mce1C Signal transduction mechanisms
Rv0172 Q mce1D Secondary metabolites biosynthesis, transport and catabolism
Rv0173 Q lprK Secondary metabolites biosynthesis, transport and catabolism
Rv0174 Q mce1F Secondary metabolites biosynthesis, transport and catabolism
@@ -168,7 +165,6 @@ Rv0207c S - Function unknown
Rv0208c R trmB General function prediction only
Rv0211 C pckA Energy production and conversion
Rv0212c H nadR Coenzyme transport and metabolism
-Rv0212c H nadR Coenzyme transport and metabolism
Rv0213c C - Energy production and conversion
Rv0214 Q fadD4 Secondary metabolites biosynthesis, transport and catabolism
Rv0215c I fadE3 Lipid transport and metabolism
@@ -204,15 +200,14 @@ Rv0253 R nirD General function prediction only
Rv0254c H cobU Coenzyme transport and metabolism
Rv0255c H cobQ1 Coenzyme transport and metabolism
Rv0259c S - Function unknown
-Rv0260c K - Transcription
Rv0260c H - Coenzyme transport and metabolism
+Rv0260c K - Transcription
Rv0261c P narK3 Inorganic ion transport and metabolism
Rv0262c R aac General function prediction only
Rv0263c E - Amino acid transport and metabolism
Rv0264c E - Amino acid transport and metabolism
Rv0265c P - Inorganic ion transport and metabolism
Rv0266c Q oplA Secondary metabolites biosynthesis, transport and catabolism
-Rv0266c Q oplA Secondary metabolites biosynthesis, transport and catabolism
Rv0267 P narU Inorganic ion transport and metabolism
Rv0268c D - Cell cycle control, cell division, chromosome partitioning
Rv0269c L - Replication, recombination and repair
@@ -248,8 +243,8 @@ Rv0319 O pcp Posttranslational modification, protein turnover, chaperones
Rv0321 F dcd Nucleotide transport and metabolism
Rv0322 M udgA Cell wall/membrane/envelope biogenesis
Rv0323c S - Function unknown
-Rv0324 P - Inorganic ion transport and metabolism
Rv0324 K - Transcription
+Rv0324 P - Inorganic ion transport and metabolism
Rv0325 R - General function prediction only
Rv0326 R - General function prediction only
Rv0327c Q cyp135A1 Secondary metabolites biosynthesis, transport and catabolism
@@ -261,7 +256,6 @@ Rv0334 M rmlA Cell wall/membrane/envelope biogenesis
Rv0337c E aspC Amino acid transport and metabolism
Rv0338c C - Energy production and conversion
Rv0339c K - Transcription
-Rv0339c K - Transcription
Rv0342 R iniA General function prediction only
Rv0343 R iniC General function prediction only
Rv0345 R - General function prediction only
@@ -295,10 +289,9 @@ Rv0380c J - Translation, ribosomal structure and biogenesis
Rv0382c F pyrE Nucleotide transport and metabolism
Rv0384c O clpB Posttranslational modification, protein turnover, chaperones
Rv0385 C - Energy production and conversion
-Rv0385 C - Energy production and conversion
-Rv0386 T - Signal transduction mechanisms
Rv0386 K - Transcription
Rv0386 R - General function prediction only
+Rv0386 T - Signal transduction mechanisms
Rv0389 F purT Nucleotide transport and metabolism
Rv0390 P - Inorganic ion transport and metabolism
Rv0391 E metZ Amino acid transport and metabolism
@@ -347,8 +340,8 @@ Rv0449c R - General function prediction only
Rv0450c R mmpL4 General function prediction only
Rv0452 K - Transcription
Rv0454 J - Translation, ribosomal structure and biogenesis
-Rv0456c I echA2 Lipid transport and metabolism
Rv0456A T - Signal transduction mechanisms
+Rv0456c I echA2 Lipid transport and metabolism
Rv0457c E - Amino acid transport and metabolism
Rv0458 C - Energy production and conversion
Rv0459 S - Function unknown
@@ -360,8 +353,8 @@ Rv0466 I - Lipid transport and metabolism
Rv0467 C icl Energy production and conversion
Rv0468 I fadB2 Lipid transport and metabolism
Rv0469 M umaA Cell wall/membrane/envelope biogenesis
-Rv0470c M pcaA Cell wall/membrane/envelope biogenesis
Rv0470A H - Coenzyme transport and metabolism
+Rv0470c M pcaA Cell wall/membrane/envelope biogenesis
Rv0472c K - Transcription
Rv0473 S - Function unknown
Rv0474 K - Transcription
@@ -392,7 +385,6 @@ Rv0508 C - Energy production and conversion
Rv0509 H hemA Coenzyme transport and metabolism
Rv0510 H hemC Coenzyme transport and metabolism
Rv0511 H hemD Coenzyme transport and metabolism
-Rv0511 H hemD Coenzyme transport and metabolism
Rv0512 H hemB Coenzyme transport and metabolism
Rv0516c T - Signal transduction mechanisms
Rv0517 I - Lipid transport and metabolism
@@ -437,9 +429,8 @@ Rv0566c S - Function unknown
Rv0567 R - General function prediction only
Rv0568 Q cyp135B1 Secondary metabolites biosynthesis, transport and catabolism
Rv0570 F nrdZ Nucleotide transport and metabolism
-Rv0571c R - General function prediction only
-Rv0571c R - General function prediction only
Rv0571c I - Lipid transport and metabolism
+Rv0571c R - General function prediction only
Rv0573c H - Coenzyme transport and metabolism
Rv0574c M - Cell wall/membrane/envelope biogenesis
Rv0575c C - Energy production and conversion
@@ -485,11 +476,10 @@ Rv0627 R - General function prediction only
Rv0628c S - Function unknown
Rv0629c L recD Replication, recombination and repair
Rv0630c L recB Replication, recombination and repair
-Rv0630c L recB Replication, recombination and repair
Rv0631c L recC Replication, recombination and repair
Rv0632c I echA3 Lipid transport and metabolism
-Rv0634c R - General function prediction only
Rv0634B J rpmG Translation, ribosomal structure and biogenesis
+Rv0634c R - General function prediction only
Rv0636 I - Lipid transport and metabolism
Rv0638 U secE Intracellular trafficking, secretion, and vesicular transport
Rv0639 K nusG Transcription
@@ -663,7 +653,6 @@ Rv0855 C far Energy production and conversion
Rv0858c E - Amino acid transport and metabolism
Rv0859 I fadA Lipid transport and metabolism
Rv0860 I fadB Lipid transport and metabolism
-Rv0860 I fadB Lipid transport and metabolism
Rv0861c L ercc3 Replication, recombination and repair
Rv0864 H moaC Coenzyme transport and metabolism
Rv0865 H mog Coenzyme transport and metabolism
@@ -678,8 +667,8 @@ Rv0876c R - General function prediction only
Rv0880 K - Transcription
Rv0881 J - Translation, ribosomal structure and biogenesis
Rv0884c E serC Amino acid transport and metabolism
-Rv0886 R fprB General function prediction only
Rv0886 C fprB Energy production and conversion
+Rv0886 R fprB General function prediction only
Rv0887c S - Function unknown
Rv0889c C citA Energy production and conversion
Rv0890c K - Transcription
@@ -695,7 +684,6 @@ Rv0899 M ompA Cell wall/membrane/envelope biogenesis
Rv0902c T prrB Signal transduction mechanisms
Rv0903c K prrA Transcription
Rv0904c I accD3 Lipid transport and metabolism
-Rv0904c I accD3 Lipid transport and metabolism
Rv0905 I echA6 Lipid transport and metabolism
Rv0906 R - General function prediction only
Rv0907 V - Defense mechanisms
@@ -709,8 +697,8 @@ Rv0919 R - General function prediction only
Rv0920c L - Replication, recombination and repair
Rv0921 L - Replication, recombination and repair
Rv0922 L - Replication, recombination and repair
-Rv0923c S - Function unknown
Rv0923c R - General function prediction only
+Rv0923c S - Function unknown
Rv0924c P mntH Inorganic ion transport and metabolism
Rv0925c R - General function prediction only
Rv0926c S - Function unknown
@@ -727,7 +715,6 @@ Rv0935 P pstC1 Inorganic ion transport and metabolism
Rv0936 P pstA2 Inorganic ion transport and metabolism
Rv0937c S - Function unknown
Rv0938 L - Replication, recombination and repair
-Rv0938 L - Replication, recombination and repair
Rv0939 Q - Secondary metabolites biosynthesis, transport and catabolism
Rv0939 R - General function prediction only
Rv0940c C - Energy production and conversion
@@ -753,8 +740,6 @@ Rv0969 P ctpV Inorganic ion transport and metabolism
Rv0971c I echA7 Lipid transport and metabolism
Rv0972c I fadE12 Lipid transport and metabolism
Rv0973c I accA2 Lipid transport and metabolism
-Rv0973c I accA2 Lipid transport and metabolism
-Rv0973c I accA2 Lipid transport and metabolism
Rv0974c I accD2 Lipid transport and metabolism
Rv0975c I fadE13 Lipid transport and metabolism
Rv0978c S PE_PGRS17 Function unknown
@@ -897,8 +882,8 @@ Rv1166 E lpqW Amino acid transport and metabolism
Rv1167c K - Transcription
Rv1170 S mshB Function unknown
Rv1173 R fbiC General function prediction only
-Rv1175c R fadH General function prediction only
Rv1175c C fadH Energy production and conversion
+Rv1175c R fadH General function prediction only
Rv1176c K - Transcription
Rv1177 C fdxC Energy production and conversion
Rv1178 E - Amino acid transport and metabolism
@@ -957,7 +942,6 @@ Rv1245c R - General function prediction only
Rv1246c D - Cell cycle control, cell division, chromosome partitioning
Rv1247c D - Cell cycle control, cell division, chromosome partitioning
Rv1248c C kgd Energy production and conversion
-Rv1248c C kgd Energy production and conversion
Rv1250 R - General function prediction only
Rv1251c L - Replication, recombination and repair
Rv1251c R - General function prediction only
@@ -974,7 +958,6 @@ Rv1263 J amiB2 Translation, ribosomal structure and biogenesis
Rv1264 T - Signal transduction mechanisms
Rv1266c L pknH Replication, recombination and repair
Rv1267c T embR Signal transduction mechanisms
-Rv1267c T embR Signal transduction mechanisms
Rv1272c V - Defense mechanisms
Rv1273c V - Defense mechanisms
Rv1276c T - Signal transduction mechanisms
@@ -989,10 +972,9 @@ Rv1283c P oppB Inorganic ion transport and metabolism
Rv1284 P - Inorganic ion transport and metabolism
Rv1285 H cysD Coenzyme transport and metabolism
Rv1286 P cysN Inorganic ion transport and metabolism
-Rv1286 P cysN Inorganic ion transport and metabolism
Rv1287 K - Transcription
-Rv1288 R - General function prediction only
Rv1288 M - Cell wall/membrane/envelope biogenesis
+Rv1288 R - General function prediction only
Rv1290c S - Function unknown
Rv1292 J argS Translation, ribosomal structure and biogenesis
Rv1293 E lysA Amino acid transport and metabolism
@@ -1009,7 +991,6 @@ Rv1304 C atpB Energy production and conversion
Rv1305 C atpE Energy production and conversion
Rv1306 C atpF Energy production and conversion
Rv1307 C atpH Energy production and conversion
-Rv1307 C atpH Energy production and conversion
Rv1308 C atpA Energy production and conversion
Rv1309 C atpG Energy production and conversion
Rv1310 C atpD Energy production and conversion
@@ -1018,8 +999,8 @@ Rv1313c L - Replication, recombination and repair
Rv1314c S - Function unknown
Rv1315 M murA Cell wall/membrane/envelope biogenesis
Rv1316c L ogt Replication, recombination and repair
-Rv1317c L alkA Replication, recombination and repair
Rv1317c F alkA Nucleotide transport and metabolism
+Rv1317c L alkA Replication, recombination and repair
Rv1318c T - Signal transduction mechanisms
Rv1319c T - Signal transduction mechanisms
Rv1320c T - Signal transduction mechanisms
@@ -1047,23 +1028,21 @@ Rv1344 Q - Secondary metabolites biosynthesis, transport and catabolism
Rv1345 Q fadD33 Secondary metabolites biosynthesis, transport and catabolism
Rv1346 I fadE14 Lipid transport and metabolism
Rv1347c J - Translation, ribosomal structure and biogenesis
-Rv1348 V - Defense mechanisms
Rv1348 P - Inorganic ion transport and metabolism
+Rv1348 V - Defense mechanisms
Rv1349 V - Defense mechanisms
Rv1350 R fabG General function prediction only
Rv1353c K - Transcription
Rv1354c T - Signal transduction mechanisms
-Rv1354c T - Signal transduction mechanisms
Rv1355c H moeY Coenzyme transport and metabolism
Rv1357c T - Signal transduction mechanisms
-Rv1358 T - Signal transduction mechanisms
Rv1358 K - Transcription
Rv1358 R - General function prediction only
+Rv1358 T - Signal transduction mechanisms
Rv1359 T - Signal transduction mechanisms
Rv1360 C - Energy production and conversion
-Rv1364c T - Signal transduction mechanisms
-Rv1364c T - Signal transduction mechanisms
Rv1364c K - Transcription
+Rv1364c T - Signal transduction mechanisms
Rv1365c T rsfA Signal transduction mechanisms
Rv1366 S - Function unknown
Rv1367c V - Defense mechanisms
@@ -1099,13 +1078,11 @@ Rv1406 J fmt Translation, ribosomal structure and biogenesis
Rv1407 J fmu Translation, ribosomal structure and biogenesis
Rv1408 G rpe Carbohydrate transport and metabolism
Rv1409 H ribG Coenzyme transport and metabolism
-Rv1409 H ribG Coenzyme transport and metabolism
Rv1410c R - General function prediction only
Rv1412 H ribC Coenzyme transport and metabolism
Rv1413 E - Amino acid transport and metabolism
Rv1414 E - Amino acid transport and metabolism
Rv1415 H ribA2 Coenzyme transport and metabolism
-Rv1415 H ribA2 Coenzyme transport and metabolism
Rv1416 H ribH Coenzyme transport and metabolism
Rv1420 L uvrC Replication, recombination and repair
Rv1421 R - General function prediction only
@@ -1136,8 +1113,8 @@ Rv1456c O - Posttranslational modification, protein turnover, chaperones
Rv1457c V - Defense mechanisms
Rv1458c V - Defense mechanisms
Rv1460 K - Transcription
-Rv1461 O - Posttranslational modification, protein turnover, chaperones
Rv1461 L - Replication, recombination and repair
+Rv1461 O - Posttranslational modification, protein turnover, chaperones
Rv1462 O - Posttranslational modification, protein turnover, chaperones
Rv1463 O - Posttranslational modification, protein turnover, chaperones
Rv1464 E csd Amino acid transport and metabolism
@@ -1164,14 +1141,12 @@ Rv1488 O - Posttranslational modification, protein turnover, chaperones
Rv1489A I - Lipid transport and metabolism
Rv1491c S - Function unknown
Rv1492 I mutA Lipid transport and metabolism
-Rv1492 I mutA Lipid transport and metabolism
-Rv1493 I mutB Lipid transport and metabolism
Rv1493 I mutB Lipid transport and metabolism
Rv1495 T - Signal transduction mechanisms
Rv1496 E - Amino acid transport and metabolism
Rv1497 V lipL Defense mechanisms
-Rv1498c R - General function prediction only
Rv1498A S - Function unknown
+Rv1498c R - General function prediction only
Rv1500 M - Cell wall/membrane/envelope biogenesis
Rv1501 Q - Secondary metabolites biosynthesis, transport and catabolism
Rv1502 G - Carbohydrate transport and metabolism
@@ -1272,7 +1247,6 @@ Rv1626 T - Signal transduction mechanisms
Rv1627c I - Lipid transport and metabolism
Rv1628c R - General function prediction only
Rv1629 L polA Replication, recombination and repair
-Rv1629 L polA Replication, recombination and repair
Rv1630 J rpsA Translation, ribosomal structure and biogenesis
Rv1631 H coaE Coenzyme transport and metabolism
Rv1631 S coaE Function unknown
@@ -1315,16 +1289,16 @@ Rv1668c R - General function prediction only
Rv1670 S - Function unknown
Rv1672c R - General function prediction only
Rv1673c E - Amino acid transport and metabolism
-Rv1674c P - Inorganic ion transport and metabolism
Rv1674c K - Transcription
+Rv1674c P - Inorganic ion transport and metabolism
Rv1675c T - Signal transduction mechanisms
Rv1676 E - Amino acid transport and metabolism
Rv1677 C dsbF Energy production and conversion
Rv1679 I fadE16 Lipid transport and metabolism
Rv1680 P - Inorganic ion transport and metabolism
Rv1681 R moeX General function prediction only
-Rv1683 Q - Secondary metabolites biosynthesis, transport and catabolism
Rv1683 I - Lipid transport and metabolism
+Rv1683 Q - Secondary metabolites biosynthesis, transport and catabolism
Rv1684 S - Function unknown
Rv1685c K - Transcription
Rv1686c V - Defense mechanisms
@@ -1370,8 +1344,6 @@ Rv1732c C - Energy production and conversion
Rv1734c C - Energy production and conversion
Rv1735c P - Inorganic ion transport and metabolism
Rv1736c C narX Energy production and conversion
-Rv1736c C narX Energy production and conversion
-Rv1736c C narX Energy production and conversion
Rv1737c P narK2 Inorganic ion transport and metabolism
Rv1739c P - Inorganic ion transport and metabolism
Rv1740 S - Function unknown
@@ -1389,8 +1361,8 @@ Rv1757c L - Replication, recombination and repair
Rv1762c S - Function unknown
Rv1763 L - Replication, recombination and repair
Rv1764 L - Replication, recombination and repair
-Rv1765c K - Transcription
Rv1765A L - Replication, recombination and repair
+Rv1765c K - Transcription
Rv1766 S - Function unknown
Rv1767 S - Function unknown
Rv1769 E - Amino acid transport and metabolism
@@ -1426,7 +1398,6 @@ Rv1828 K - Transcription
Rv1829 S - Function unknown
Rv1830 K - Transcription
Rv1832 E gcvB Amino acid transport and metabolism
-Rv1832 E gcvB Amino acid transport and metabolism
Rv1833c R - General function prediction only
Rv1834 R - General function prediction only
Rv1835c R - General function prediction only
@@ -1450,8 +1421,8 @@ Rv1855c C - Energy production and conversion
Rv1856c R - General function prediction only
Rv1857 P modA Inorganic ion transport and metabolism
Rv1858 P modB Inorganic ion transport and metabolism
-Rv1859 P modC Inorganic ion transport and metabolism
Rv1859 G modC Carbohydrate transport and metabolism
+Rv1859 P modC Inorganic ion transport and metabolism
Rv1861 S - Function unknown
Rv1862 R adhA General function prediction only
Rv1863c R - General function prediction only
@@ -1483,7 +1454,6 @@ Rv1899c R lppD General function prediction only
Rv1900c R lipJ General function prediction only
Rv1900c T lipJ Signal transduction mechanisms
Rv1901 R cinA General function prediction only
-Rv1901 R cinA General function prediction only
Rv1902c R nanT General function prediction only
Rv1903 S - Function unknown
Rv1904 T - Signal transduction mechanisms
@@ -1505,18 +1475,15 @@ Rv1927 S - Function unknown
Rv1928c R - General function prediction only
Rv1930c R - General function prediction only
Rv1931c K - Transcription
-Rv1931c K - Transcription
Rv1932 O tpx Posttranslational modification, protein turnover, chaperones
Rv1933c I fadE18 Lipid transport and metabolism
Rv1934c I fadE17 Lipid transport and metabolism
Rv1935c I echA13 Lipid transport and metabolism
Rv1936 C - Energy production and conversion
Rv1937 C - Energy production and conversion
-Rv1937 C - Energy production and conversion
Rv1938 R ephB General function prediction only
Rv1939 R - General function prediction only
Rv1940 H ribA1 Coenzyme transport and metabolism
-Rv1940 H ribA1 Coenzyme transport and metabolism
Rv1941 R - General function prediction only
Rv1942c T - Signal transduction mechanisms
Rv1944c U - Intracellular trafficking, secretion, and vesicular transport
@@ -1544,8 +1511,8 @@ Rv1985c K - Transcription
Rv1986 R - General function prediction only
Rv1988 J - Translation, ribosomal structure and biogenesis
Rv1989c S - Function unknown
-Rv1990c K - Transcription
Rv1990A R - General function prediction only
+Rv1990c K - Transcription
Rv1991c T - Signal transduction mechanisms
Rv1992c P ctpG Inorganic ion transport and metabolism
Rv1994c K - Transcription
@@ -1554,28 +1521,27 @@ Rv1996 T - Signal transduction mechanisms
Rv1997 P ctpF Inorganic ion transport and metabolism
Rv1998c G - Carbohydrate transport and metabolism
Rv1999c E - Amino acid transport and metabolism
-Rv2000 R - General function prediction only
Rv2000 E - Amino acid transport and metabolism
+Rv2000 R - General function prediction only
Rv2001 I - Lipid transport and metabolism
Rv2002 R fabG3 General function prediction only
Rv2003c R - General function prediction only
Rv2004c R - General function prediction only
Rv2004c S - Function unknown
Rv2005c T - Signal transduction mechanisms
-Rv2006 R otsB1 General function prediction only
-Rv2006 G otsB1 Carbohydrate transport and metabolism
Rv2006 G otsB1 Carbohydrate transport and metabolism
+Rv2006 R otsB1 General function prediction only
Rv2007c C fdxA Energy production and conversion
-Rv2008c R - General function prediction only
Rv2008c J - Translation, ribosomal structure and biogenesis
+Rv2008c R - General function prediction only
Rv2009 K - Transcription
Rv2010 R - General function prediction only
Rv2011c K - Transcription
Rv2012 R - General function prediction only
Rv2013 L - Replication, recombination and repair
Rv2014 L - Replication, recombination and repair
-Rv2017 K - Transcription
Rv2017 E - Amino acid transport and metabolism
+Rv2017 K - Transcription
Rv2018 S - Function unknown
Rv2020c R - General function prediction only
Rv2021c K - Transcription
@@ -1584,11 +1550,9 @@ Rv2024c R - General function prediction only
Rv2025c P - Inorganic ion transport and metabolism
Rv2026c T - Signal transduction mechanisms
Rv2027c T - Signal transduction mechanisms
-Rv2027c T - Signal transduction mechanisms
Rv2028c T - Signal transduction mechanisms
Rv2029c G pfkB Carbohydrate transport and metabolism
Rv2030c R - General function prediction only
-Rv2030c R - General function prediction only
Rv2031c O hspX Posttranslational modification, protein turnover, chaperones
Rv2032 C acg Energy production and conversion
Rv2034 K - Transcription
@@ -1601,10 +1565,8 @@ Rv2043c Q pncA Secondary metabolites biosynthesis, transport and catabolism
Rv2045c I lipT Lipid transport and metabolism
Rv2046 Q lppI Secondary metabolites biosynthesis, transport and catabolism
Rv2047c G - Carbohydrate transport and metabolism
-Rv2047c G - Carbohydrate transport and metabolism
Rv2048c Q pks12 Secondary metabolites biosynthesis, transport and catabolism
Rv2051c M ppm1 Cell wall/membrane/envelope biogenesis
-Rv2051c M ppm1 Cell wall/membrane/envelope biogenesis
Rv2052c R - General function prediction only
Rv2054 Q - Secondary metabolites biosynthesis, transport and catabolism
Rv2055c J rpsR Translation, ribosomal structure and biogenesis
@@ -1618,14 +1580,12 @@ Rv2062c H cobN Coenzyme transport and metabolism
Rv2064 P cobG Inorganic ion transport and metabolism
Rv2065 H cobH Coenzyme transport and metabolism
Rv2066 H cobI Coenzyme transport and metabolism
-Rv2066 H cobI Coenzyme transport and metabolism
Rv2067c R - General function prediction only
Rv2068c V blaC Defense mechanisms
Rv2069 K sigC Transcription
Rv2070c H cobK Coenzyme transport and metabolism
Rv2071c H cobM Coenzyme transport and metabolism
Rv2072c H cobL Coenzyme transport and metabolism
-Rv2072c H cobL Coenzyme transport and metabolism
Rv2073c R - General function prediction only
Rv2086 L - Replication, recombination and repair
Rv2087 L - Replication, recombination and repair
@@ -1640,7 +1600,6 @@ Rv2096c K - Transcription
Rv2101 L helZ Replication, recombination and repair
Rv2102 S - Function unknown
Rv2103c R - General function prediction only
-Rv2103c R - General function prediction only
Rv2105 L - Replication, recombination and repair
Rv2106 L - Replication, recombination and repair
Rv2109c O prcA Posttranslational modification, protein turnover, chaperones
@@ -1653,7 +1612,6 @@ Rv2120c S - Function unknown
Rv2121c E hisG Amino acid transport and metabolism
Rv2122c E hisE Amino acid transport and metabolism
Rv2124c E metH Amino acid transport and metabolism
-Rv2124c E metH Amino acid transport and metabolism
Rv2125 R - General function prediction only
Rv2127 E ansP1 Amino acid transport and metabolism
Rv2129c R - General function prediction only
@@ -1697,7 +1655,6 @@ Rv2187 I fadD15 Lipid transport and metabolism
Rv2188c M - Cell wall/membrane/envelope biogenesis
Rv2190c M - Cell wall/membrane/envelope biogenesis
Rv2191 L - Replication, recombination and repair
-Rv2191 L - Replication, recombination and repair
Rv2192c E trpD Amino acid transport and metabolism
Rv2193 C ctaE Energy production and conversion
Rv2194 C qcrC Energy production and conversion
@@ -1715,7 +1672,6 @@ Rv2211c E gcvT Amino acid transport and metabolism
Rv2212 T - Signal transduction mechanisms
Rv2213 E pepB Amino acid transport and metabolism
Rv2214c R ephD General function prediction only
-Rv2214c R ephD General function prediction only
Rv2215 C dlaT Energy production and conversion
Rv2216 R - General function prediction only
Rv2217 H lipB Coenzyme transport and metabolism
@@ -1728,11 +1684,10 @@ Rv2224c R - General function prediction only
Rv2225 H panB Coenzyme transport and metabolism
Rv2226 S - Function unknown
Rv2227 S - Function unknown
-Rv2228c L - Replication, recombination and repair
Rv2228c G - Carbohydrate transport and metabolism
+Rv2228c L - Replication, recombination and repair
Rv2229c R - General function prediction only
Rv2230c S - Function unknown
-Rv2230c S - Function unknown
Rv2231c E cobC Amino acid transport and metabolism
Rv2232 R - General function prediction only
Rv2234 T ptpA Signal transduction mechanisms
@@ -1748,8 +1703,8 @@ Rv2245 Q kasA Secondary metabolites biosynthesis, transport and catabolism
Rv2246 Q kasB Secondary metabolites biosynthesis, transport and catabolism
Rv2247 I accD6 Lipid transport and metabolism
Rv2249c C glpD1 Energy production and conversion
-Rv2250c K - Transcription
Rv2250A C - Energy production and conversion
+Rv2250c K - Transcription
Rv2251 C - Energy production and conversion
Rv2252 R - General function prediction only
Rv2257c V - Defense mechanisms
@@ -1847,7 +1802,6 @@ Rv2380c Q mbtE Secondary metabolites biosynthesis, transport and catabolism
Rv2381c Q mbtD Secondary metabolites biosynthesis, transport and catabolism
Rv2382c Q mbtC Secondary metabolites biosynthesis, transport and catabolism
Rv2383c Q mbtB Secondary metabolites biosynthesis, transport and catabolism
-Rv2383c Q mbtB Secondary metabolites biosynthesis, transport and catabolism
Rv2384 Q mbtA Secondary metabolites biosynthesis, transport and catabolism
Rv2385 I mbtJ Lipid transport and metabolism
Rv2386c H mbtI Coenzyme transport and metabolism
@@ -1885,9 +1839,8 @@ Rv2426c R - General function prediction only
Rv2427c E proA Amino acid transport and metabolism
Rv2428 O ahpC Posttranslational modification, protein turnover, chaperones
Rv2429 S ahpD Function unknown
-Rv2434c T - Signal transduction mechanisms
Rv2434c M - Cell wall/membrane/envelope biogenesis
-Rv2435c T - Signal transduction mechanisms
+Rv2434c T - Signal transduction mechanisms
Rv2435c T - Signal transduction mechanisms
Rv2436 G rbsK Carbohydrate transport and metabolism
Rv2437 O - Posttranslational modification, protein turnover, chaperones
@@ -1906,7 +1859,6 @@ Rv2449c S - Function unknown
Rv2453c H mobA Coenzyme transport and metabolism
Rv2454c C - Energy production and conversion
Rv2455c C - Energy production and conversion
-Rv2455c C - Energy production and conversion
Rv2456c R - General function prediction only
Rv2457c O clpX Posttranslational modification, protein turnover, chaperones
Rv2458 E mmuM Amino acid transport and metabolism
@@ -1929,13 +1881,13 @@ Rv2478c L - Replication, recombination and repair
Rv2479c L - Replication, recombination and repair
Rv2480c L - Replication, recombination and repair
Rv2482c I plsB2 Lipid transport and metabolism
-Rv2483c I plsC Lipid transport and metabolism
Rv2483c E plsC Amino acid transport and metabolism
+Rv2483c I plsC Lipid transport and metabolism
Rv2485c I lipQ Lipid transport and metabolism
Rv2486 I echA14 Lipid transport and metabolism
-Rv2488c T - Signal transduction mechanisms
Rv2488c K - Transcription
Rv2488c R - General function prediction only
+Rv2488c T - Signal transduction mechanisms
Rv2491 R - General function prediction only
Rv2492 F - Nucleotide transport and metabolism
Rv2494 R - General function prediction only
@@ -1956,17 +1908,14 @@ Rv2509 R - General function prediction only
Rv2510c R - General function prediction only
Rv2511 A orn RNA processing and modification
Rv2512c L - Replication, recombination and repair
-Rv2515c K - Transcription
Rv2515c E - Amino acid transport and metabolism
+Rv2515c K - Transcription
Rv2518c S lppS Function unknown
Rv2521 O bcp Posttranslational modification, protein turnover, chaperones
Rv2522c E - Amino acid transport and metabolism
Rv2523c I acpS Lipid transport and metabolism
-Rv2524c Q fas Secondary metabolites biosynthesis, transport and catabolism
-Rv2524c I fas Lipid transport and metabolism
-Rv2524c I fas Lipid transport and metabolism
-Rv2524c I fas Lipid transport and metabolism
Rv2524c I fas Lipid transport and metabolism
+Rv2524c Q fas Secondary metabolites biosynthesis, transport and catabolism
Rv2527 R - General function prediction only
Rv2528c V mrr Defense mechanisms
Rv2529 L - Replication, recombination and repair
@@ -1993,12 +1942,11 @@ Rv2559c L - Replication, recombination and repair
Rv2561 E - Amino acid transport and metabolism
Rv2563 V - Defense mechanisms
Rv2564 V glnQ Defense mechanisms
-Rv2565 T - Signal transduction mechanisms
Rv2565 R - General function prediction only
+Rv2565 T - Signal transduction mechanisms
Rv2566 E - Amino acid transport and metabolism
Rv2566 S - Function unknown
Rv2567 S - Function unknown
-Rv2567 S - Function unknown
Rv2568c S - Function unknown
Rv2569c E - Amino acid transport and metabolism
Rv2570 S - Function unknown
@@ -2006,8 +1954,8 @@ Rv2571c S - Function unknown
Rv2572c J aspS Translation, ribosomal structure and biogenesis
Rv2573 H - Coenzyme transport and metabolism
Rv2575 R - General function prediction only
-Rv2577 R - General function prediction only
Rv2577 P - Inorganic ion transport and metabolism
+Rv2577 R - General function prediction only
Rv2578c L - Replication, recombination and repair
Rv2579 R dhaA General function prediction only
Rv2580c J hisS Translation, ribosomal structure and biogenesis
@@ -2038,8 +1986,8 @@ Rv2610c M pimA Cell wall/membrane/envelope biogenesis
Rv2611c M - Cell wall/membrane/envelope biogenesis
Rv2612c I pgsA1 Lipid transport and metabolism
Rv2613c R - General function prediction only
-Rv2614c J thrS Translation, ribosomal structure and biogenesis
Rv2614A G - Carbohydrate transport and metabolism
+Rv2614c J thrS Translation, ribosomal structure and biogenesis
Rv2616 S - Function unknown
Rv2618 K - Transcription
Rv2619c S - Function unknown
@@ -2048,10 +1996,9 @@ Rv2622 R - General function prediction only
Rv2623 T TB31.7 Signal transduction mechanisms
Rv2624c T - Signal transduction mechanisms
Rv2625c R - General function prediction only
-Rv2625c R - General function prediction only
Rv2626c R - General function prediction only
-Rv2629 L - Replication, recombination and repair
Rv2629 J - Translation, ribosomal structure and biogenesis
+Rv2629 L - Replication, recombination and repair
Rv2630 S - Function unknown
Rv2631 S - Function unknown
Rv2636 V - Defense mechanisms
@@ -2061,8 +2008,8 @@ Rv2639c S - Function unknown
Rv2640c K - Transcription
Rv2641 E cadI Amino acid transport and metabolism
Rv2642 K - Transcription
-Rv2643 T arsC Signal transduction mechanisms
Rv2643 P arsC Inorganic ion transport and metabolism
+Rv2643 T arsC Signal transduction mechanisms
Rv2646 L - Replication, recombination and repair
Rv2648 L - Replication, recombination and repair
Rv2649 L - Replication, recombination and repair
@@ -2120,7 +2067,6 @@ Rv2733c J - Translation, ribosomal structure and biogenesis
Rv2734 S - Function unknown
Rv2736c R recX General function prediction only
Rv2737c L recA Replication, recombination and repair
-Rv2737c L recA Replication, recombination and repair
Rv2739c C - Energy production and conversion
Rv2740 Q - Secondary metabolites biosynthesis, transport and catabolism
Rv2744c T 35kd_ag Signal transduction mechanisms
@@ -2202,11 +2148,9 @@ Rv2842c S - Function unknown
Rv2845c J proS Translation, ribosomal structure and biogenesis
Rv2846c R efpA General function prediction only
Rv2847c H cysG Coenzyme transport and metabolism
-Rv2847c H cysG Coenzyme transport and metabolism
Rv2848c H cobB Coenzyme transport and metabolism
Rv2849c H cobO Coenzyme transport and metabolism
Rv2850c H - Coenzyme transport and metabolism
-Rv2850c H - Coenzyme transport and metabolism
Rv2851c R - General function prediction only
Rv2852c R mqo General function prediction only
Rv2854 I - Lipid transport and metabolism
@@ -2269,8 +2213,8 @@ Rv2918c O glnD Posttranslational modification, protein turnover, chaperones
Rv2919c E glnB Amino acid transport and metabolism
Rv2920c P amt Inorganic ion transport and metabolism
Rv2921c U ftsY Intracellular trafficking, secretion, and vesicular transport
-Rv2922c D smc Cell cycle control, cell division, chromosome partitioning
Rv2922A C acyP Energy production and conversion
+Rv2922c D smc Cell cycle control, cell division, chromosome partitioning
Rv2923c O - Posttranslational modification, protein turnover, chaperones
Rv2924c L fpg Replication, recombination and repair
Rv2925c K rnc Transcription
@@ -2283,7 +2227,6 @@ Rv2932 Q ppsB Secondary metabolites biosynthesis, transport and catabolism
Rv2933 Q ppsC Secondary metabolites biosynthesis, transport and catabolism
Rv2934 Q ppsD Secondary metabolites biosynthesis, transport and catabolism
Rv2935 Q ppsE Secondary metabolites biosynthesis, transport and catabolism
-Rv2935 Q ppsE Secondary metabolites biosynthesis, transport and catabolism
Rv2936 V drrA Defense mechanisms
Rv2937 V drrB Defense mechanisms
Rv2938 V drrC Defense mechanisms
@@ -2297,7 +2240,6 @@ Rv2944 L - Replication, recombination and repair
Rv2946c Q pks1 Secondary metabolites biosynthesis, transport and catabolism
Rv2947c Q pks15 Secondary metabolites biosynthesis, transport and catabolism
Rv2948c Q fadD22 Secondary metabolites biosynthesis, transport and catabolism
-Rv2948c Q fadD22 Secondary metabolites biosynthesis, transport and catabolism
Rv2949c H - Coenzyme transport and metabolism
Rv2950c Q fadD29 Secondary metabolites biosynthesis, transport and catabolism
Rv2951c C - Energy production and conversion
@@ -2318,8 +2260,8 @@ Rv2966c L - Replication, recombination and repair
Rv2967c C pca Energy production and conversion
Rv2968c S - Function unknown
Rv2969c O - Posttranslational modification, protein turnover, chaperones
-Rv2970c I lipN Lipid transport and metabolism
Rv2970A R - General function prediction only
+Rv2970c I lipN Lipid transport and metabolism
Rv2971 R - General function prediction only
Rv2973c K recG Transcription
Rv2974c R - General function prediction only
@@ -2414,8 +2356,8 @@ Rv3074 V - Defense mechanisms
Rv3075c G - Carbohydrate transport and metabolism
Rv3077 P - Inorganic ion transport and metabolism
Rv3079c C - Energy production and conversion
-Rv3080c L pknK Replication, recombination and repair
Rv3080c K pknK Transcription
+Rv3080c L pknK Replication, recombination and repair
Rv3082c K virS Transcription
Rv3083 P - Inorganic ion transport and metabolism
Rv3084 I lipR Lipid transport and metabolism
@@ -2451,7 +2393,6 @@ Rv3120 R - General function prediction only
Rv3121 Q cyp141 Secondary metabolites biosynthesis, transport and catabolism
Rv3124 T - Signal transduction mechanisms
Rv3132c T devS Signal transduction mechanisms
-Rv3132c T devS Signal transduction mechanisms
Rv3133c K devR Transcription
Rv3134c T - Signal transduction mechanisms
Rv3137 G - Carbohydrate transport and metabolism
@@ -2489,8 +2430,8 @@ Rv3176c R mesT General function prediction only
Rv3177 R - General function prediction only
Rv3179 R - General function prediction only
Rv3180c R - General function prediction only
-Rv3181c K - Transcription
Rv3181c D - Cell cycle control, cell division, chromosome partitioning
+Rv3181c K - Transcription
Rv3182 S - Function unknown
Rv3183 K - Transcription
Rv3184 L - Replication, recombination and repair
@@ -2503,13 +2444,11 @@ Rv3193c S - Function unknown
Rv3194c T - Signal transduction mechanisms
Rv3195 S - Function unknown
Rv3197 R - General function prediction only
-Rv3198c L uvrD2 Replication, recombination and repair
Rv3198A O - Posttranslational modification, protein turnover, chaperones
+Rv3198c L uvrD2 Replication, recombination and repair
Rv3199c L nudC Replication, recombination and repair
Rv3200c P - Inorganic ion transport and metabolism
Rv3201c L - Replication, recombination and repair
-Rv3201c L - Replication, recombination and repair
-Rv3202c L - Replication, recombination and repair
Rv3202c L - Replication, recombination and repair
Rv3203 R lipV General function prediction only
Rv3204 L - Replication, recombination and repair
@@ -2523,8 +2462,8 @@ Rv3215 Q entC Secondary metabolites biosynthesis, transport and catabolism
Rv3216 R - General function prediction only
Rv3218 R - General function prediction only
Rv3220c T - Signal transduction mechanisms
-Rv3221c I TB7.3 Lipid transport and metabolism
Rv3221A P - Inorganic ion transport and metabolism
+Rv3221c I TB7.3 Lipid transport and metabolism
Rv3223c K sigH Transcription
Rv3224 R - General function prediction only
Rv3224B S - Function unknown
@@ -2541,7 +2480,6 @@ Rv3237c P - Inorganic ion transport and metabolism
Rv3238c O - Posttranslational modification, protein turnover, chaperones
Rv3239c R - General function prediction only
Rv3239c T - Signal transduction mechanisms
-Rv3239c R - General function prediction only
Rv3240c U secA1 Intracellular trafficking, secretion, and vesicular transport
Rv3241c J - Translation, ribosomal structure and biogenesis
Rv3242c R - General function prediction only
@@ -2567,7 +2505,6 @@ Rv3267 K - Transcription
Rv3270 P ctpC Inorganic ion transport and metabolism
Rv3272 C - Energy production and conversion
Rv3273 P - Inorganic ion transport and metabolism
-Rv3273 P - Inorganic ion transport and metabolism
Rv3274c I fadE25 Lipid transport and metabolism
Rv3275c F purE Nucleotide transport and metabolism
Rv3276c F purK Nucleotide transport and metabolism
@@ -2610,12 +2547,11 @@ Rv3319 C sdhB Energy production and conversion
Rv3320c R - General function prediction only
Rv3322c R - General function prediction only
Rv3323c H moaX Coenzyme transport and metabolism
-Rv3323c H moaX Coenzyme transport and metabolism
Rv3324c H moaC Coenzyme transport and metabolism
Rv3325 L - Replication, recombination and repair
Rv3326 L - Replication, recombination and repair
-Rv3327 S - Function unknown
Rv3327 L - Replication, recombination and repair
+Rv3327 S - Function unknown
Rv3328c K sigJ Transcription
Rv3329 H - Coenzyme transport and metabolism
Rv3330 M dacB1 Cell wall/membrane/envelope biogenesis
@@ -2646,8 +2582,8 @@ Rv3366 J spoU Translation, ribosomal structure and biogenesis
Rv3368c C - Energy production and conversion
Rv3369 H - Coenzyme transport and metabolism
Rv3370c L dnaE2 Replication, recombination and repair
-Rv3372 R otsB2 General function prediction only
Rv3372 G otsB2 Carbohydrate transport and metabolism
+Rv3372 R otsB2 General function prediction only
Rv3373 I echA18 Lipid transport and metabolism
Rv3374 I echA18.1 Lipid transport and metabolism
Rv3375 J amiD Translation, ribosomal structure and biogenesis
@@ -2662,13 +2598,12 @@ Rv3386 L - Replication, recombination and repair
Rv3387 L - Replication, recombination and repair
Rv3389c I - Lipid transport and metabolism
Rv3390 G lpqD Carbohydrate transport and metabolism
-Rv3391 R acrA1 General function prediction only
Rv3391 Q acrA1 Secondary metabolites biosynthesis, transport and catabolism
+Rv3391 R acrA1 General function prediction only
Rv3392c M cmaA1 Cell wall/membrane/envelope biogenesis
Rv3393 F iunH Nucleotide transport and metabolism
Rv3394c L - Replication, recombination and repair
Rv3396c F guaA Nucleotide transport and metabolism
-Rv3396c F guaA Nucleotide transport and metabolism
Rv3397c I phyA Lipid transport and metabolism
Rv3398c H idsA1 Coenzyme transport and metabolism
Rv3399 Q - Secondary metabolites biosynthesis, transport and catabolism
@@ -2698,8 +2633,8 @@ Rv3428c L - Replication, recombination and repair
Rv3430c L - Replication, recombination and repair
Rv3431c L - Replication, recombination and repair
Rv3432c E gadB Amino acid transport and metabolism
-Rv3433c S - Function unknown
Rv3433c G - Carbohydrate transport and metabolism
+Rv3433c S - Function unknown
Rv3436c M glmS Cell wall/membrane/envelope biogenesis
Rv3441c G mrsA Carbohydrate transport and metabolism
Rv3442c J rpsI Translation, ribosomal structure and biogenesis
@@ -2901,8 +2836,6 @@ Rv3725 G - Carbohydrate transport and metabolism
Rv3726 R - General function prediction only
Rv3727 S - Function unknown
Rv3728 R - General function prediction only
-Rv3728 R - General function prediction only
-Rv3729 R - General function prediction only
Rv3729 R - General function prediction only
Rv3730c L - Replication, recombination and repair
Rv3731 L ligC Replication, recombination and repair
@@ -2911,7 +2844,6 @@ Rv3733c R - General function prediction only
Rv3735 S - Function unknown
Rv3736 K - Transcription
Rv3737 S - Function unknown
-Rv3737 S - Function unknown
Rv3741c P - Inorganic ion transport and metabolism
Rv3742c P - Inorganic ion transport and metabolism
Rv3743c P ctpJ Inorganic ion transport and metabolism
@@ -2941,7 +2873,6 @@ Rv3782 R - General function prediction only
Rv3783 M rfbD Cell wall/membrane/envelope biogenesis
Rv3784 G - Carbohydrate transport and metabolism
Rv3786c M - Cell wall/membrane/envelope biogenesis
-Rv3786c M - Cell wall/membrane/envelope biogenesis
Rv3787c Q - Secondary metabolites biosynthesis, transport and catabolism
Rv3788 K - Transcription
Rv3789 S - Function unknown
@@ -2952,7 +2883,6 @@ Rv3797 I fadE35 Lipid transport and metabolism
Rv3798 L - Replication, recombination and repair
Rv3799c I accD4 Lipid transport and metabolism
Rv3800c Q pks13 Secondary metabolites biosynthesis, transport and catabolism
-Rv3800c Q pks13 Secondary metabolites biosynthesis, transport and catabolism
Rv3801c Q fadD32 Secondary metabolites biosynthesis, transport and catabolism
Rv3803c R fbpD General function prediction only
Rv3804c R fbpA General function prediction only
@@ -2967,7 +2897,6 @@ Rv3815c I - Lipid transport and metabolism
Rv3816c I - Lipid transport and metabolism
Rv3817 J - Translation, ribosomal structure and biogenesis
Rv3818 R - General function prediction only
-Rv3818 R - General function prediction only
Rv3820c Q papA2 Secondary metabolites biosynthesis, transport and catabolism
Rv3823c R mmpL8 General function prediction only
Rv3824c Q papA1 Secondary metabolites biosynthesis, transport and catabolism
@@ -2994,8 +2923,6 @@ Rv3855 K ethR Transcription
Rv3856c R - General function prediction only
Rv3858c R gltD General function prediction only
Rv3859c E gltB Amino acid transport and metabolism
-Rv3859c E gltB Amino acid transport and metabolism
-Rv3859c E gltB Amino acid transport and metabolism
Rv3862c J whiB6 Translation, ribosomal structure and biogenesis
Rv3868 O - Posttranslational modification, protein turnover, chaperones
Rv3870 D - Cell cycle control, cell division, chromosome partitioning
@@ -3006,7 +2933,6 @@ Rv3883c O mycP1 Posttranslational modification, protein turnover, chaperones
Rv3884c O - Posttranslational modification, protein turnover, chaperones
Rv3886c O mycP2 Posttranslational modification, protein turnover, chaperones
Rv3888c D - Cell cycle control, cell division, chromosome partitioning
-Rv3888c D - Cell cycle control, cell division, chromosome partitioning
Rv3890c S esxC Function unknown
Rv3894c D - Cell cycle control, cell division, chromosome partitioning
Rv3896c R - General function prediction only
@@ -3019,7 +2945,6 @@ Rv3911 K sigM Transcription
Rv3913 O trxB2 Posttranslational modification, protein turnover, chaperones
Rv3914 C trxC Energy production and conversion
Rv3915 M - Cell wall/membrane/envelope biogenesis
-Rv3915 M - Cell wall/membrane/envelope biogenesis
Rv3917c K parB Transcription
Rv3918c D parA Cell cycle control, cell division, chromosome partitioning
Rv3919c M gidB Cell wall/membrane/envelope biogenesis
=====================================
src/pytransit/data/H37Rv_sanger_roles.dat
=====================================
@@ -321,40 +321,40 @@ Rv2994 III
Rv3065 III.A.6
Rv3065 III.A
Rv3065 III
-Rv0233 I.F.3 2'
+Rv0233 I.F.3
Rv0233 I.F
Rv0233 I
-Rv0321 I.F.3 2'
+Rv0321 I.F.3
Rv0321 I.F
Rv0321 I
-Rv0570 I.F.3 2'
+Rv0570 I.F.3
Rv0570 I.F
Rv0570 I
-Rv1981c I.F.3 2'
+Rv1981c I.F.3
Rv1981c I.F
Rv1981c I
-Rv2697c I.F.3 2'
+Rv2697c I.F.3
Rv2697c I.F
Rv2697c I
-Rv2764c I.F.3 2'
+Rv2764c I.F.3
Rv2764c I.F
Rv2764c I
-Rv3048c I.F.3 2'
+Rv3048c I.F.3
Rv3048c I.F
Rv3048c I
-Rv3051c I.F.3 2'
+Rv3051c I.F.3
Rv3051c I.F
Rv3051c I
-Rv3052c I.F.3 2'
+Rv3052c I.F.3
Rv3052c I.F
Rv3052c I
-Rv3053c I.F.3 2'
+Rv3053c I.F.3
Rv3053c I.F
Rv3053c I
-Rv3247c I.F.3 2'
+Rv3247c I.F.3
Rv3247c I.F
Rv3247c I
-Rv3752c I.F.3 2'
+Rv3752c I.F.3
Rv3752c I.F
Rv3752c I
Rv1121 I.B.5
=====================================
src/pytransit/data/gene_ontology.1_2.3-11-18.obo
=====================================
The diff for this file was not included because it is too large.
=====================================
src/pytransit/doc/source/_images/genetic_interaction_types.png
=====================================
Binary files /dev/null and b/src/pytransit/doc/source/_images/genetic_interaction_types.png differ
=====================================
src/pytransit/doc/source/transit_methods.rst
=====================================
@@ -1,8 +1,9 @@
.. _`analysis_methods`:
-Analysis Methods
-================
+==================
+ Analysis Methods
+==================
TRANSIT has analysis methods capable of analyzing **Himar1** and **Tn5** datasets.
@@ -13,7 +14,7 @@ Below is a description of some of the methods.
.. _gumbel:
Gumbel
-------
+======
The Gumbel can be used to determine which genes are essential in a
single condition. It does a gene-by-gene analysis of the insertions at
@@ -25,7 +26,7 @@ probability of this using a Bayesian model.
Intended only for **Himar1** datasets.
How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
| For a formal description of how this method works, see our paper [DeJesus2013]_:
@@ -34,7 +35,7 @@ How does it work?
| `Bayesian analysis of gene essentiality based on sequencing of transposon insertion libraries. <http://www.ncbi.nlm.nih.gov/pubmed/23361328>`_ *Bioinformatics*, 29(6):695-703.
Example
-~~~~~~~
+-------
::
@@ -52,7 +53,7 @@ Example
Parameters
-~~~~~~~~~~
+----------
- **Samples:** Gumbel uses Metropolis-Hastings (MH) to generate samples
of posterior distributions. The default setting is to run the
@@ -87,7 +88,7 @@ Parameters
|
Outputs and diagnostics
-~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------
The Gumbel method generates a tab-separated output file at the location
chosen by the user. This file will automatically be loaded into the
@@ -137,7 +138,7 @@ defined as follows:
|
Run-time
-~~~~~~~~
+--------
The Gumbel method takes on the order of 10 minutes for 10,000 samples.
Run-time is linearly proportional to the 'samples' parameter, or length
@@ -151,7 +152,7 @@ replicates; replicate datasets will be automatically merged.
griffin
--------
+=======
This is an earlier version of the Gumbel method that
identifies essential genes based on how unlikely 'gaps'
@@ -176,7 +177,7 @@ of estimating internal parameters.
.. _`tn5gaps`:
Tn5Gaps
---------
+=======
The Tn5Gaps method can be used to determine which genes are essential
in a single condition for **Tn5** datasets. It does an analysis of the
@@ -193,7 +194,7 @@ the Gumbel distribution.
|
How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
This method is loosely is based on the original gumbel analysis
method described in this paper:
@@ -230,7 +231,7 @@ output of our analysis.
|
Usage
-~~~~~
+-----
::
python3 ../../../transit.py tn5gaps <comma-separated .wig files> <annotation .prot_table or GFF3> <output file> [Optional Arguments]
@@ -243,7 +244,7 @@ Usage
Parameters
-~~~~~~~~~~
+----------
+ **Minimum Read:** The minimum read count that is considered a true read. Because the Gumbel method depends on determining gaps of TA sites lacking insertions, it may be suceptible to spurious reads (e.g. errors). The default value of 1 will consider all reads as true reads. A value of 2, for example, will ignore read counts of 1.
@@ -256,7 +257,7 @@ Parameters
+ **-iC:** Trimming of insertions in C-terminus (given as percentage of ORF length, e.g. "5" for 5%; default=0)
Example
-~~~~~~~
+-------
::
python3 PATH/src/transit.py tn5gaps salmonella_baseline.wig Salmonella-Ty2.prot_table salmonella_baseline_tn5gaps_trimmed.dat -m 2 -r Sum -iN 5 -iC 5
@@ -267,7 +268,7 @@ These input and output files can be downloaded from the **Example Data** section
|
Outputs and diagnostics
-~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------
The Tn5Gaps method generates a tab-separated output file at the
location chosen by the user. This file will automatically be loaded
@@ -306,7 +307,7 @@ file are defined as follows:
|
Run-time
-~~~~~~~~
+--------
The Tn5Gaps method takes on the order of 10 minutes.
Other notes: Tn5Gaps can be run on multiple replicates; replicate
datasets will be automatically merged.
@@ -321,7 +322,7 @@ datasets will be automatically merged.
.. _HMM:
HMM
----
+===
The HMM method can be used to determine the essentiality of the entire genome, as opposed to gene-level analysis of the other methods. It is capable of identifying regions that have unusually high or unusually low read counts (i.e. growth advantage or growth defect regions), in addition to the more common categories of essential and non-essential.
@@ -331,7 +332,7 @@ The HMM method can be used to determine the essentiality of the entire genome, a
|
How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
| For a formal description of how this method works, see our paper [DeJesus2013HMM]_:
|
@@ -341,7 +342,7 @@ How does it work?
Example
-~~~~~~~
+-------
::
@@ -355,7 +356,7 @@ Example
Parameters
-~~~~~~~~~~
+----------
The HMM method automatically estimates the necessary statistical
parameters from the datasets. You can change how the method handles
@@ -370,7 +371,7 @@ replicate datasets:
|
Output and Diagnostics
-~~~~~~~~~~~~~~~~~~~~~~
+----------------------
| The HMM method outputs two files. The first file provides the most
likely assignment of states for all the TA sites in the genome. Sites
@@ -441,7 +442,7 @@ Output and Diagnostics
|
Run-time
-~~~~~~~~
+--------
| The HMM method takes less than 10 minutes to complete. The parameters
of the method should not affect the running-time.
@@ -455,7 +456,7 @@ Run-time
.. _resampling:
Resampling
-----------
+==========
The resampling method is a comparative analysis the allows that can be
used to determine conditional essentiality of genes. It is based on a
@@ -474,7 +475,7 @@ of genes or known biological pathway.
|
How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
This technique has yet to be formally published in the context of
differential essentiality analysis. Briefly, the read-counts at each
@@ -490,7 +491,7 @@ original, observed difference in read-counts.
Usage
-~~~~~
+-----
::
@@ -518,7 +519,7 @@ Usage
Parameters
-~~~~~~~~~~
+----------
The resampling method is non-parametric, and therefore does not require
any parameters governing the distributions or the model. The following
@@ -575,7 +576,7 @@ parameters are available for the method:
|
Notes
-~~~~~
+-----
I recommend using -a (adaptive resampling). It runs much faster, and the p-values
will be very close to a full non-adaptive run (all 10,000 samples).
@@ -588,7 +589,7 @@ making it difficult to make confident calls on essentiality.
Doing resampling with a combined_wig file
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------------------------
Resampling can also now take a combined_wig_ file as input (containing insertion counts
for multiple sample), along with a samples_metadata_ file
@@ -608,7 +609,7 @@ If you want to compare more than two conditions, see :ref:`ZINB <zinb>`.
Doing resampling with datasets from different libraries.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--------------------------------------------------------
In most cases, comparisons are done among samples (replicates) from
the same library evaluated in two different conditions. But if the
@@ -626,7 +627,7 @@ from each library in each condition.
Doing resampling between different strains.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-------------------------------------------
The most common case is that resampling is done among replicates all
from the same Tn library, and hence all the datasets (fastq files) are
@@ -698,7 +699,7 @@ replicates in condition B will be from another library (another strain).
|
Output and Diagnostics
-~~~~~~~~~~~~~~~~~~~~~~
+----------------------
The resampling method outputs a tab-delimited file with results for each
gene in the genome. P-values are adjusted for multiple comparisons using
@@ -752,7 +753,7 @@ changed using the -PC flag (above).
|
Run-time
-~~~~~~~~
+--------
A typical run of the resampling method with 10,000 samples will take
around 45 minutes (with the histogram option ON). Using the *adaptive
@@ -764,7 +765,7 @@ resampling* option (-a), the run-time is reduced to around 10 minutes.
----
Mann-Whitney U-test (utest)
----------------------------
+===========================
This is a method for comparing datasets from a TnSeq library evaluated in
two different conditions, analogous to resampling.
@@ -788,13 +789,13 @@ A reference for this method is `(Santa Maria et al., 2014)
.. _genetic-interactions:
Genetic Interactions
---------------------
+====================
The genetic interactions (GI) method is a comparative analysis used
used to determine genetic interactions. It is a Bayesian method
that estimates the distribution of log fold-changes (logFC) in two
strain backgrounds under different conditions, and identifies significantly
-large changes in enrichment (delta-logFC) to identify those genes
+large changes in enrichment (delta_logFC) to identify those genes
that imply a genetic interaction.
@@ -805,7 +806,7 @@ that imply a genetic interaction.
|
How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
GI performs a comparison among 4 groups of datasets, strain A and B assessed in conditions 1 and 2 (e.g. control vs treatment).
It looks for interactions where the response to the treatment (i.e. effect on insertion counts) depends on the strain.
@@ -825,28 +826,92 @@ slopes between strain A and B (represented by 'delta_LFC' in the output file).
|
+Statistical Significance
+------------------------
+
+
+The computation that is done by GI is to compute the posterior distribution of the delta_LFC (or mean change in slopes)
+through Bayesian sampling.
+The primary method to determine significance of genes is whethter this the mean_delta_LFC is significantly differnt than 0.
+However, since the mean_delta_LFC is a distribution, we represent it by a Highest Density Interval, HDI, which is
+similar to a 95% confidence interval. Furthermore, rather than asking whether the HDI overlaps 0 exactly, we expand the interval
+around 0 to a Region of Probable Equivalence (ROPE), which is set to [-0.5,0.5] by default. Hence the significant genes
+are those for which the HDI does not overlap the ROPE. GI has a flag to adjust the size of the ROPE, if desired.
+
+In the GI output file, the final column give the significance call, along with type of interaction.
+If a gene is not significant, it will be marked with "**No Interaction**" (for the HDI method, meaning HDI overlaps the ROPE).
+If a gene IS significant, then its interaction will be cateogrized in 3 types (see NAR paper):
+
+ * **Aggravating** - mean_delta_LFC is negative; gene is more required in treatment than control in the B strain, compared to the A strain
+ * **Suppressive** - mean_delta_LFC is positive, and the gene was not conditionally essential in strain A (flat slope), but becomes conditionally non-essential in strain B when treated (positive slope)
+ * **Alleviating** - mean_delta_LFC is positive, but the conditional requirement (negative slope) of the gene in strain A with treatment is "cancelled" by the modification in strain B
+
+.. image:: _images/genetic_interaction_types.png
+
+A limitation of this HDI approach is that it is discrete (i.e. overlap is either True or False), but does not provide a quantitative metric
+for the degree of overlap. Thus a second method for assessing significance of genetic interactions is to compute
+the probability of overlap. The lower the probability, the more differnt the delta_LFC is from 0, indicating a more
+significant interaction. In this case, genes with prob < 0.05 are considered interactions and classified by the 3 types above,
+while genes with prob >= 0.05 are marked as "No Interaction".
+
+In addition, since we are calculating significance for thousands of genes in parallel,
+many researchers prefer to have some method for correcting for multiple tests, to control the false discovery rate.
+However, FDR correction is generally used only for frequentist analyses, and he GI method is fundamentally a Bayesian approach.
+Technically, in a Bayesian framework, FDR correction is not needed. Any adjustment for expectations about number of hits
+would be achieved through adjusting parameters for prior distributions. Nonetheless, GI includes options for
+two methods that approximate FDR correction: **BFDR** (Bayesian False Discovery Rate correction,
+`Newton M.A., Noueiry A., Sarkar D., Ahlquist P. (2004). Detecting differential gene expression with a semiparametric hierarchical
+mixture method. Biostatistics, 5:155–176. <https://pubmed.ncbi.nlm.nih.gov/15054023/>`_) and FWER (Familty-Wise
+Error Rate control). When these corrections are applied, a threshold of 0.05 for the adjusted probability of overlap
+is used for each, and this determines which
+genes are classified as interacting (1 of 3 types) or marked as "No Interaction", as above.
+
+In order to enable users to evaluate these various methods for determining significance of interactions,
+a '-signif' flag is provided for the GI method. The options are:
+
+ * **-signif HDI**: significant genes are those for which the HDI does not overlap the ROPE
+ * **-signif prob**: significant genes are those with prob < 0.05, where 'prob' is porbability that HDI overlap the ROPE (default)
+ * **-signif BFDR**: significant genes are those with adjusted prob < 0.05, where prob is adjusted by the BFDR method
+ * **-signif FWER**: significant genes are those with adjusted prob < 0.05, where prob is adjusted by the FWER method
+
+'-signif prob' is the default method.
+
+In the output file, the genes are sorted by the probability that the HDI overlaps the ROPE.
+The genes at the top are rougly the genes with the highest absolute value of mean_delta_LFC.
+
+
Usage
-~~~~~
+-----
::
- python3 transit.py GI <wigs_for_strA_cond1> <wigs_for_strA_cond2> <wigs_for_strB_cond1> <wigs_for_strB_cond2> <annotation .prot_table or GFF3> <output file> [Optional Arguments]
+ python3 /pacific/home/ioerger/transit/src/transit.py GI <wigs_for_strA_cond1> <wigs_for_strA_cond2> <wigs_for_strB_cond1> <wigs_for_strB_cond2> <annotation .prot_table or GFF3> <output file> [Optional Arguments]
+ GI performs a comparison among 4 groups of datasets, strain A and B assessed in conditions 1 and 2 (e.g. control vs treatment).
+ It looks for interactions where the response to the treatment (i.e. effect on insertion counts) depends on the strain (output variable: delta_LFC).
Provide replicates in each group as a comma-separated list of wig files.
+ HDI is highest density interval for posterior distribution of delta_LFC, which is like a confidence interval on difference of slopes.
+ Genes are sorted by probability of HDI overlapping with ROPE. (genes with the highest abs(mean_delta_logFC) are near the top, approximately)
+ Significant genes are indicated by 'Type of Interaction' column (No Interaction, Aggravating, Alleviating, Suppressive).
+ By default, hits are defined as "Is HDI outside of ROPE?"=TRUE (i.e. non-overlap of delta_LFC posterior distritbuion with Region of Probably Equivalence around 0)
+ Alternative methods for significance: use -signif flag with prob, BFDR, or FWER. These affect 'Type of Interaction' (i.e. which genes are labeled 'No Interaction')
Optional Arguments:
-s <integer> := Number of samples. Default: -s 10000
--rope <float> := Region of Practical Equivalence. Area around 0 (i.e. 0 +/- ROPE) that is NOT of interest. Can be thought of similar to the area of the null-hypothesis. Default: --rope 0.5
-n <string> := Normalization method. Default: -n TTR
- -iz := Include rows with zero accross conditions.
+ -iz := Include rows with zero across conditions.
-l := Perform LOESS Correction; Helps remove possible genomic position bias. Default: Turned Off.
-iN <float> := Ignore TAs occuring at given percentage (as integer) of the N terminus. Default: -iN 0
-iC <float> := Ignore TAs occuring at given percentage (as integer) of the C terminus. Default: -iC 0
+ -signif HDI := (default) Significant if HDI does not overlap ROPE; if HDI overlaps ROPE, 'Type of Interaction' is set to 'No Interaction'
+ -signif prob := Optionally, significant hits are re-defined based on probability (degree) of overlap of HDI with ROPE, prob<0.05 (no adjustment)
+ -signif BFDR := Apply "Bayesian" FDR correction (see doc) to adjust HDI-ROPE overlap probabilities so that significant hits are re-defined as BFDR<0.05
+ -signif FWER := Apply "Bayesian" FWER correction (see doc) to adjust HDI-ROPE overlap probabilities so that significant hits are re-defined as FWER<0.05
-You can think of 'control' and 'experimental' samples as 'untreated' vs. 'treated'.
Example
-~~~~~~~
+-------
In this example, the effect of a knockout of SigB is being evaluated for its effect on tolerance of isoniazid.
Some genes may become more essential (or less) in the presence of INH in the wild-type strain.
@@ -859,7 +924,7 @@ Note there are 2 replicates in each of the 4 groups of datasets.
Parameters
-~~~~~~~~~~
+----------
The resampling method is non-parametric, and therefore does not require
any parameters governing the distributions or the model. The following
@@ -890,17 +955,23 @@ parameters are available for the method:
as real differences. See the :ref:`Normalization <normalization>` section for a description
of normalization method available in TRANSIT.
+- **Significance Method:**
+ * -signif HDI: significant genes are those for which the HDI does not overlap the ROPE
+ * -signif prob: significant genes are those with prob < 0.05, where 'prob' is porbability that HDI overlap the ROPE (default)
+ * -signif BFDR: significant genes are those with adjusted prob < 0.05, where prob is adjusted by the BFDR method
+ * -signif FWER: significant genes are those with adjusted prob < 0.05, where prob is adjusted by the FWER method
Output and Diagnostics
-~~~~~~~~~~~~~~~~~~~~~~
+----------------------
The GI method outputs a tab-delimited file with results for each
-gene in the genome. P-values are adjusted for multiple comparisons using
-the Benjamini-Hochberg procedure (called "q-values" or "p-adj."). A
-typical threshold for conditional essentiality on is q-value < 0.05.
+gene in the genome.
+All genes are sorted by significance using the probability that the HDI overlaps the ROPE.
+Significant genes are those NOT marked with 'No Interaction' in the last column.
+
+-----------------------------------------+----------------------------------------------------+
| Column Header | Column Definition |
@@ -940,10 +1011,6 @@ typical threshold for conditional essentiality on is q-value < 0.05.
|
-Significant interactions are those with "HDI outside ROPE?"=TRUE.
-
-All genes are sorted by significance using BFDR.
-
.. rst-class:: transit_sectionend
@@ -954,7 +1021,7 @@ All genes are sorted by significance using BFDR.
.. _anova:
ANOVA
------
+=====
The Anova (Analysis of variance) method is used to determine which genes
exhibit statistically significant variability of insertion counts across multiple conditions.
@@ -966,7 +1033,7 @@ to which experimental conditions).
|
How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
The method performs the `One-way anova test <https://en.wikipedia.org/wiki/Analysis_of_variance?oldformat=true#The_F-test>`_ for each gene across conditions.
It takes into account variability of normalized transposon insertion counts among TA sites
@@ -975,7 +1042,7 @@ to determine if the differences among the mean counts for each condition are sig
Example
-~~~~~~~
+-------
::
@@ -1014,7 +1081,7 @@ The filenames should match what is shown in the header of the combined_wig (incl
Parameters
-~~~~~~~~~~
+----------
The following parameters are available for the method:
@@ -1030,7 +1097,7 @@ The following parameters are available for the method:
- **-PC** Pseudocounts to use in calculating LFCs (see below). Default: -PC 5
Output and Diagnostics
-~~~~~~~~~~~~~~~~~~~~~~
+----------------------
The anova method outputs a tab-delimited file with results for each
gene in the genome. P-values are adjusted for multiple comparisons using
@@ -1077,7 +1144,7 @@ Changing the pseudocounts will not affect the analysis of statistical significan
|
Run-time
-~~~~~~~~
+--------
A typical run of the anova method takes less than 1 minute for a combined wig file with 6 conditions, 3 replicates per condition.
@@ -1092,7 +1159,7 @@ A typical run of the anova method takes less than 1 minute for a combined wig fi
.. _zinb:
ZINB
-----
+====
The ZINB (Zero-Inflated Negative Binomial) method is used to determine
which genes exhibit statistically significant variability in either
@@ -1118,7 +1185,7 @@ See :ref:`Installation Instructions <install-zinb>`..
|
How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
| For a formal description of how this method works, see our paper [ZINB]_:
|
@@ -1127,7 +1194,7 @@ How does it work?
Example
-~~~~~~~
+-------
::
@@ -1149,7 +1216,7 @@ Example
.. _combined_wig:
Combined wig files
-~~~~~~~~~~~~~~~~~~
+------------------
Transit now supports a new file format called 'combined_wig' which basically
combines multiple wig files into one file (with multiple columns). This is
@@ -1174,7 +1241,7 @@ TTR is the default, but other relevant normalization options would be 'nonorm'
.. _samples_metadata:
Samples Metadata File
-~~~~~~~~~~~~~~~~~~~~~
+---------------------
Format of the *samples_metadata* file: a tab-separated file (which you
can edit in Excel) with 3 columns: Id, Condition, and Filename (it
@@ -1194,7 +1261,7 @@ of the combined_wig (including pathnames, if present).
chol2 cholesterol /Users/example_data/cholesterol_rep3.wig
Parameters
-~~~~~~~~~~
+----------
The following parameters are available for the method:
@@ -1208,7 +1275,7 @@ The following parameters are available for the method:
- **Covariates:** If additional covariates distinguishing the samples are available, such as library, timepoint, or genotype, they may be incorporated in the test.
Covariates and Interactions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------
While ZINB is focus on identifying variability of insertion counts across conditions,
the linear model also allows you to take other variables into account.
@@ -1281,7 +1348,7 @@ The difference between how covariates and interactions are handeled in the model
is discussed below in the section on Statistical Significance.
Categorical vs Numeric Covariates
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------------
In some cases, covariates are intended to be treated as categorical
variables, like 'batch' or 'library' or 'medium'.
@@ -1297,7 +1364,7 @@ trend in the insertion counts as the covariate value increases.
Statistical Significance - What the P-values Mean in the ZINB Output
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--------------------------------------------------------------------
Formally, the P-value is from a likelihood ratio test (LRT) between a
condition-dependent ZINB model (:math:`m_1`) and a
@@ -1391,7 +1458,7 @@ and this would be compared to the following null model (without the interaction
Output and Diagnostics
-~~~~~~~~~~~~~~~~~~~~~~
+----------------------
The ZINB method outputs a tab-delimited file with results for each
gene in the genome. P-values are adjusted for multiple comparisons using
@@ -1442,7 +1509,7 @@ Changing the pseudocounts will not affect the analysis of statistical significan
|
Run-time
-~~~~~~~~
+--------
A typical run of the ZINB method takes ~5 minutes to analze a combined wig
file with 6 conditions, 3 replicates per condition. It will, of
@@ -1457,7 +1524,7 @@ course, run more slowly if you have many more conditions.
.. _normalization:
Normalization
--------------
+=============
Proper normalization is important as it ensures that other sources of
@@ -1507,14 +1574,14 @@ briefly described below:
No normalization is performed.
Command-line
-~~~~~~~~~~~~
+------------
In addition to choosing normalization for various analyses in the GUI,
you can also call Transit to normalize wig files from the command-line,
as shown in this example:
Example
-~~~~~~~
+-------
::
@@ -1539,7 +1606,7 @@ If the input file is a combined_wig file, indicate it with a '-c' flag.
.. rst-class:: transit_clionly
Pathway Enrichment Analysis
----------------------------
+===========================
Pathway Enrichment Analysis provides a method to
identify enrichment of functionally-related genes among those that are
@@ -1548,10 +1615,10 @@ significantly more or less essential between two conditions).
The analysis is typically applied as post-processing step to the hits identified
by a comparative analysis, such as *resampling*.
Several analytical method are provided:
-Fisher's exact test (hypergeometric distribution), GSEA (Gene Set Enrichment Analysis)
+Fisher's exact test (FET, hypergeometric distribution), GSEA (Gene Set Enrichment Analysis)
by `Subramanian et al (2005) <https://www.ncbi.nlm.nih.gov/pubmed/16199517>`_,
and `Ontologizer <https://www.ncbi.nlm.nih.gov/pubmed/17848398>`_.
-For the Fisher exact test,
+For Fisher's exact test,
genes in the resampling output file with adjusted p-value < 0.05 are taken as hits,
and evaluated for overlap with functional categories of genes.
The GSEA methods use the whole list of genes, ranked in order of statistical significance
@@ -1578,17 +1645,17 @@ and is not available in the Transit GUI.
Usage
-~~~~~~~
+-----
::
- python3 src/transit.py pathway_enrichment <resampling_file> <associations> <pathways> <output_file> [-M <FISHER|GSEA|ONT>] [-PC <int>]
+ python3 src/transit.py pathway_enrichment <resampling_file> <associations> <pathways> <output_file> [-M <FET|GSEA|ONT>] [-PC <int>]
|
Parameters
-~~~~~~~~~~
+----------
- **Resampling File**
The resampling file is the one obtained after using the resampling method in Transit. (It is a tab separated file with 11 columns.) GSEA method makes usage of the last column (adjusted P-value)
- **Associations File**
@@ -1622,14 +1689,18 @@ Parameters
...
- **-M**
- Methodology to be used. FISHER is used by default (even without specifying -M).
+ Methodology to be used. FET is used by default (even without specifying -M).
- **FISHER**
+ **FET**
This implements Fisher's Exact Test (hypergeometric distribution) to determine a p-value for each pathway, based on the proportion of pathway member observed in list of hits (conditionally essential gene by resampling, padj<0.05) compared to the background proportion in the overall genome, and p-values are adjusted post-hoc by the Benjamini-Hochberg procedure to limit the FDR to 5%.
In the output file, an "enrichment score" is reported, which is the ratio of the observed number of pathway members among the hits to the expected number. Pseudocounts of 2 are included in the calculation to reduce the bias toward small pathways with only a few genes; this can be adjusted with the -PC flag (below).
- FISHER can be used with GO terms.
+ FET can be used with GO terms.
+
+ Additional flags for FET:
+
+ - **-PC <int>**: Pseudocounts used in calculating the enrichment score and p-value by hypergeometic distribution. Default: PC=2.
**GSEA**
Gene Set Enrichment Analysis. GSEA assess the significance of a pathway by looking at how the members fall in the ranking of all genes. The genes are first ranked by significance from resampling. Specifically, they are sorted by signed-log-p-value, SLPV=sign(LFC)*(log(pval)), which puts them in order so that the most significant genes with negative LFC are at the top, the most significant with positive LFC are at the bottom, and insignificant genes fall in the middle. Roughly, GSEA computes the mean rank of pathway members, and evaluates significance based on a simulated a null distribution. p-values are again adjusted at the end by BH.
@@ -1638,17 +1709,58 @@ Parameters
GSEA can be used with GO terms.
+ Additional flags for GSEA:
+
+ - **-ranking SLPV|LFC**: method used to rank all genes; SLPV is signed-log-p-value (default); LFC is log2-fold-change from resampling
+
+ - **-p <float>**: exponent to use in calculating enrichment score; recommend trying '-p 0' (default) or '-p 1' (as used in Subramaniam et al, 2005)
+
+ - **-Nperm <int>**: number of permutations to simulate for null distribution to determine p-value (default=10000)
+
+
**ONT**
- Ontologizer is a specialized method for GO terms that takes parent-child relationships into account among nodes in the GO hierarchy. This can enhance the specificity of pathways detected as significant. Hierarhical relationships among GO terms are encoded in an OBO file, which is included in the src/pytransit/data/ directory.
+ Ontologizer is a specialized method for GO terms that takes parent-child relationships into account among nodes in the GO hierarchy. This can enhance the specificity of pathways detected as significant. (The problem is that there are many GO terms in the hierarchy covering similar or identical sets of genes, and often, if one node is significantly enriched, then several of its ancestors will be too, which obscures the results with redundant hits; Ontologizer reduces the significance of nodes if their probability distribution among hits can be explained by their parents.) Hierarhical relationships among GO terms are encoded in an OBO file, which is included in the src/pytransit/data/ directory.
`Grossmann S, Bauer S, Robinson PN, Vingron M. Improved detection of overrepresentation of Gene-Ontology annotations with parent child analysis. Bioinformatics. 2007 Nov 15;23(22):3024-31. <https://www.ncbi.nlm.nih.gov/pubmed/17848398>`_
-- **-PC**
- Pseudocounts used in calculating enrichment score in output file for FISHER. Default: PC=2.
+
+
+
+Auxilliary Pathway Files in Transit Data Directory
+--------------------------------------------------
+
+::
+
+These files for pathway analysis are distributed in the Transit data directory
+(e.g. transit/src/pytransit/data/).
+
++----------+---------+--------------------+--------------------------------------+--------------------------------+
+| system | num cats| applicable methods | associations of genes with roles | pathway definitions/role names |
++==========+=========+====================+======================================+================================+
+| COG | 20 | FET*, GSEA | H37Rv_COG_roles.dat | COG_roles.dat |
++----------+---------+--------------------+--------------------------------------+--------------------------------+
+| Sanger | 153 | FET*, GSEA* | H37Rv_sanger_roles.dat | sanger_roles.dat |
++----------+---------+--------------------+--------------------------------------+--------------------------------+
+| GO | 2545 | FET, GSEA | H37Rv_GO_terms.txt | GO_term_names.dat |
++----------+---------+--------------------+--------------------------------------+--------------------------------+
+| | | ONT* | GO_terms_for_each_Rv.obo-3-11-18.txt | gene_ontology.1_2.3-11-18.obo |
++----------+---------+--------------------+--------------------------------------+--------------------------------+
+
+asterisk means 'recommended' combination of method with system of functional categories
+
+
+Current Recommendations
+-----------------------
+
+Here are the recommended combinations of pathway methods to use for different systems of functional categories:
+
+ * For COG, use '-M FET'
+ * For Sanger roles, try both FET and GSEA
+ * For GO terms, use 'M -ONT'
Examples
-~~~~~~~~
+--------
::
@@ -1667,7 +1779,7 @@ Examples
# Ontologizer is a specialized method for GO terms
> transit pathway_enrichment resampling_glyc_chol.txt $DATA/H37Rv_GO_terms.txt $DATA/GO_term_names.dat pathways_Ontologizer.txt -M ONT
-The $DATA environment variable is these examples refers to the Transit data directory, e.g. src/pytransit/data/.
+The $DATA environment variable in these examples refers to the Transit data directory, e.g. src/pytransit/data/.
.. rst-class:: transit_sectionend
@@ -1679,7 +1791,7 @@ The $DATA environment variable is these examples refers to the Transit data dire
.. rst-class:: transit_clionly
tnseq_stats
------------
+===========
You can generate the same table to statistics as on the Quality Control panel in the GUI
from the command-line using the 'tnseq_stats' command. Here is an example:
@@ -1710,7 +1822,7 @@ from the command-line using the 'tnseq_stats' command. Here is an example:
.. rst-class:: transit_clionly
corrplot
--------
+========
A useful tool when evaluating the quality of a collection of TnSeq datasets is to make a
*correlation plot* of the mean insertion counts (averaged at the gene-level) among samples.
@@ -1725,7 +1837,7 @@ and relies on the 'corrplot' R package.
See :ref:`Installation Instructions <install-zinb>`.
Usage:
-~~~~~~
+------
::
@@ -1791,7 +1903,7 @@ to be installed on your system. See :ref:`Installation Instructions <install-zi
.. rst-class:: transit_clionly
heatmap
--------
+=======
The output of ANOVA or ZINB can be used to generate a heatmap that
simultaneously clusters the significant genes and clusters the conditions,
@@ -1803,21 +1915,27 @@ and relies on the 'gplots' R package.
See :ref:`Installation Instructions <install-zinb>`.
Usage:
-~~~~~~
+------
::
- python3 src/transit.py heatmap -anova|-zinb <anova_or_zinb_output> <heatmap.png>
+ python3 src/transit.py heatmap <anova_or_zinb_output> <heatmap.png> -anova|-zinb [-topk <int>] [-qval <float]
-Note that the *first* argument is required to be either '-anova' or '-zinb', a flag to
+Note that the first optional argument (flag) is required to be either '-anova' or '-zinb', a flag to
indicate the type of file being provided as the second argument.
+By default, genes are selected for the heatmap based on qval<0.05.
+However, the user may change the selection of genes through 2 flags:
+
+ * **-qval <float>**: change qval threshold for selecting genes (default=0.05)
+ * **-topk <int>**: select top k genes ranked by significance (qval)
+
Here is an example which generates the following image showing the similarities among
several different growth conditions:
::
- > python3 src/transit.py heatmap -anova anova_iron.txt heatmap_iron_anova.png
+ > python3 src/transit.py heatmap anova_iron.txt heatmap_iron_anova.png -anova
.. image:: _images/iron_heatmap_anova_rotated.png
:width: 1000
@@ -1826,7 +1944,7 @@ several different growth conditions:
Importantly, the heatmap is based only on the subset of genes
identified as significantly varying (Padj < 0:05, typically only a few
-hundred) in order to enhance the patterns, since otherwise they would
+hundred genes) in order to enhance the patterns, since otherwise they would
be washed out by the rest of the genes in the genome, the majority of
which usually do not exhibit significant variation in counts.
=====================================
src/pytransit/export/mean_counts.py
=====================================
@@ -61,6 +61,7 @@ class MeanCountsMethod(base.SingleConditionMethod):
"""
def __init__(self,
+ combined_wig,
ctrldata,
annotation_path,
output_file,
@@ -72,7 +73,7 @@ class MeanCountsMethod(base.SingleConditionMethod):
base.SingleConditionMethod.__init__(self, short_name, long_name, description, label, ctrldata, annotation_path, output_file, normalization=normalization, LOESS=LOESS, NTerminus=NTerminus, CTerminus=CTerminus, wxobj=wxobj)
-
+ self.combined_wig = combined_wig # boolean, interprete ctrldata as combined_wig file or comma-separated list of wig files?
@classmethod
@@ -110,9 +111,10 @@ class MeanCountsMethod(base.SingleConditionMethod):
if not output_path: return None
output_file = open(output_path, "w")
+ # could add a checkbox for combined_wig
+ combined_wig = False
-
- return self(ctrldata,
+ return self(combined_wig,ctrldata,
annotationPath,
output_file,
normalization,
@@ -124,8 +126,11 @@ class MeanCountsMethod(base.SingleConditionMethod):
@classmethod
def fromargs(self, rawargs):
(args, kwargs) = transit_tools.cleanargs(rawargs)
+ print("ARGS="+str(args))
+ print("KWARGS="+str(kwargs))
- ctrldata = args[0].split(",")
+ combined_wig = kwargs.get("c",False)
+ ctrldata = args[0].split(",")
annotationPath = args[1]
outpath = args[2]
output_file = open(outpath, "w")
@@ -136,7 +141,7 @@ class MeanCountsMethod(base.SingleConditionMethod):
NTerminus = 0.0
CTerminus = 0.0
- return self(ctrldata,
+ return self(combined_wig,ctrldata,
annotationPath,
output_file,
normalization,
@@ -152,9 +157,9 @@ class MeanCountsMethod(base.SingleConditionMethod):
#Get orf data
self.transit_message("Getting Data")
- (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
- (fulldata, factors) = norm_tools.normalize_data(fulldata, self.normalization,
- self.ctrldata, self.annotation_path)
+ if self.combined_wig: (position, fulldata, datasets) = tnseq_tools.read_combined_wig(self.ctrldata[0])
+ else: (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
+ (fulldata, factors) = norm_tools.normalize_data(fulldata, self.normalization, self.ctrldata, self.annotation_path)
position = position.astype(int)
hash = transit_tools.get_pos_hash(self.annotation_path)
@@ -171,16 +176,19 @@ class MeanCountsMethod(base.SingleConditionMethod):
self.output.write("#Files:\n")
- for f in self.ctrldata:
+ names = datasets if self.combined_wig else self.ctrldata
+ for f in names:
self.output.write("#%s\n" % f)
K,Nsites = fulldata.shape
# Get Gene objects
- G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, norm=self.normalization)
+ if self.combined_wig: G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, norm=self.normalization,data=fulldata,position=position)
+ else: G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, norm=self.normalization)
N = len(G)
self.progress_range(N)
- dataset_header = "\t".join([transit_tools.fetch_name(D) for D in self.ctrldata])
+ if self.combined_wig: dataset_header = '\t'.join(datasets)
+ else: dataset_header = "\t".join([transit_tools.fetch_name(D) for D in self.ctrldata])
self.output.write("#Orf\tName\tNumber of TA sites\t%s\n" % dataset_header)
for i,gene in enumerate(G):
if gene.n > 0:
@@ -204,7 +212,7 @@ class MeanCountsMethod(base.SingleConditionMethod):
@classmethod
def usage_string(self):
- return """python %s export mean_counts <comma-separated .wig files> <annotation .prot_table> <output file>""" % (sys.argv[0])
+ return """python %s export mean_counts <comma-separated .wig files>|<combined_wig> <annotation .prot_table> <output file> [-c]\n note: append -c if inputing a combined_wig file\n""" % (sys.argv[0])
if __name__ == "__main__":
=====================================
src/pytransit/genomes/genomes.html~
=====================================
@@ -0,0 +1,36 @@
+<HTML>
+
+<table border=1>
+
+<TD><A HREF="H37Rv.fna">H37Rv.fna</A>
+<TD><A HREF="H37Rv.prot_table">H37Rv.prot_table</A>
+<TD>Stewart Cole's reference <I>M. tuberculosis</I> sequence (NC_000962.2)<TR>
+
+
+<TD><A HREF="H37RvMA2.fna">H37RvMA2.fna</A>
+<TD><A HREF="H37RvMA2.prot_table">H37RvMA2.prot_table</A>
+<TD>Sequenced strain from the Sassetti lab, with
+ sequencing errors fixed (Ioerger et al. 2010;
+ http://www.ncbi.nlm.nih.gov/pubmed/20472797).<TR>
+
+<TD><A HREF="H37RvBD.fna">H37RvBD.fna</A>
+<TD><A HREF="H37RvBD.prot_table">H37RvBD.prot_table</A>
+<TD>Sequenced by the Broad Institute (NC_018143.2)<TR>
+
+<TD><A HREF="BCG.fna">BCG.fna</A>
+<TD><A HREF="BCG.prot_table">BCG.prot_table</A>
+<TD>Reference genome for <I>M. bovis</I> BCG (NC_008769.1)<TR>
+
+<TD><A HREF="mc2_155_tamu.fna">mc2_155_tamu.fna</A>
+<TD><A HREF="mc2_155_tamu.prot_table">mc2_155_tamu.prot_table</A>
+<TD><I>M. smegmattis</> mc2 155 sequenced at TAMU<TR>
+
+<TD><TD>
+<A HREF="H37RvBD_mod3.prot_table">H37RvBD_mod3.prot_table</A>
+<TD>Annotation adjusted for an indel in the version of
+ the H37Rv Broad sequence used in the Sassetti lab (for
+ analyzing old datasets that have been previously processed
+ using this sequence).
+<TR>
+
+</table>
=====================================
src/pytransit/transit_gui.py
=====================================
@@ -382,10 +382,14 @@ class MainFrame ( wx.Frame ):
expLibSizer.Add(self.expLibTip, 0, wx.ALIGN_CENTER_VERTICAL, 5 )
- globalSizerVT.Add( nTermSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
- globalSizerVT.Add( cTermSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
- globalSizerVT.Add( ctrlLibSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
- globalSizerVT.Add( expLibSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
+ #globalSizerVT.Add( nTermSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
+ #globalSizerVT.Add( cTermSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
+ #globalSizerVT.Add( ctrlLibSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
+ #globalSizerVT.Add( expLibSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
+ globalSizerVT.Add( nTermSizer, 1, wx.EXPAND, 5 )
+ globalSizerVT.Add( cTermSizer, 1, wx.EXPAND, 5 )
+ globalSizerVT.Add( ctrlLibSizer, 1, wx.EXPAND, 5 )
+ globalSizerVT.Add( expLibSizer, 1, wx.EXPAND, 5 )
=====================================
tests/H37Rv.fna
=====================================
@@ -0,0 +1 @@
+../misc/test/tpp/H37Rv.fna
\ No newline at end of file
View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/compare/5d32ed1fa55fe9aee4d1447851cbc57064cf6bdc...960097a9170f1d03b8b46705d8761b30742e1160
--
View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/compare/5d32ed1fa55fe9aee4d1447851cbc57064cf6bdc...960097a9170f1d03b8b46705d8761b30742e1160
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201028/1ee7eb79/attachment-0001.html>
More information about the debian-med-commit
mailing list