[med-svn] [Git][med-team/tnseq-transit][upstream] New upstream version 3.2.0

Nilesh Patra gitlab at salsa.debian.org
Wed Oct 28 21:34:17 GMT 2020



Nilesh Patra pushed to branch upstream at Debian Med / tnseq-transit


Commits:
3b803d06 by Nilesh Patra at 2020-10-29T02:47:18+05:30
New upstream version 3.2.0
- - - - -


14 changed files:

- CHANGELOG.md
- src/pytransit/__init__.py
- src/pytransit/analysis/corrplot.py
- src/pytransit/analysis/gi.py
- src/pytransit/analysis/heatmap.py
- src/pytransit/analysis/pathway_enrichment.py
- + src/pytransit/data/GO_terms_for_each_Rv.obo-3-11-18.txt
- src/pytransit/data/H37Rv_COG_roles.dat
- src/pytransit/data/H37Rv_sanger_roles.dat
- + src/pytransit/data/gene_ontology.1_2.3-11-18.obo
- + src/pytransit/doc/source/_images/genetic_interaction_types.png
- src/pytransit/doc/source/transit_methods.rst
- src/pytransit/export/mean_counts.py
- src/pytransit/transit_gui.py


Changes:

=====================================
CHANGELOG.md
=====================================
@@ -1,6 +1,16 @@
 # Change log
 All notable changes to this project will be documented in this file.
 
+## Version 3.2.0 2020-10-26
+#### TRANSIT:
+ - improvements to pathway_enrichment analysis
+  - added '--ranking' flag for GSEA to sort genes based on LFC or SLPV
+  - implemented Ontologizer method (-M ONT), which works better for GO terms
+  - updated auxilliary files in transit data directory for different systems of functional categories (COG, Sanger, GO)
+ - added '-signif' flag to GI (Genetic Interaction analysis) (options: HDI, prob, BFDR, FWER)
+  - updated description of methods for determining significant interactions in documentation
+ - various improvements to other methods, including corrplot and heatmap
+
 ## Version 3.1.0 2020-03-08
 #### TRANSIT:
  - added 'corrplot' and 'heatmap' commands


=====================================
src/pytransit/__init__.py
=====================================
@@ -2,6 +2,6 @@
 __all__ = ["transit_tools", "tnseq_tools", "norm_tools", "stat_tools"]
 
 
-__version__ = "v3.1.0"
+__version__ = "v3.2.0"
 prefix = "[TRANSIT]"
 


=====================================
src/pytransit/analysis/corrplot.py
=====================================
@@ -154,6 +154,8 @@ class CorrplotMethod(base.SingleConditionMethod):
         genenames = ["%s/%s" % (w[0],w[1]) for w in data]
         hash = {}
         headers = [h.replace("Mean_","") for h in headers]
+        headers = [h.replace("-",".") for h in headers] # because of R conversion
+        headers = ["X"+x if x[0].isdigit() else x for x in headers] # because R prepends 'X' to column names starting with a digit
         for i,col in enumerate(headers): hash[col] = FloatVector([x[i] for x in means])
         df = DataFrame(hash) # can't figure out how to set rownames
 


=====================================
src/pytransit/analysis/gi.py
=====================================
@@ -22,7 +22,7 @@ import math
 import random
 import numpy
 import scipy.stats
-import datetime
+import datetime 
 
 from pytransit.analysis import base
 import pytransit
@@ -44,7 +44,7 @@ NOTE: This method requires 4 groups of datasets. Use the main interface to add d
 """
 
 transposons = ["himar1"]
-columns = ["Orf","Name","Number of TA Sites","Mean count (Strain A Condition 1)","Mean count (Strain A Condition 2)","Mean count (Strain B Condition 1)","Mean count (Strain B Condition 2)", "Mean logFC (Strain A)", "Mean logFC (Strain B)", "Mean delta logFC","Lower Bound delta logFC","Upper Bound delta logFC", "Prob. of delta-logFC being within ROPE", "Adjusted Probability (BFDR)", "Is HDI outside ROPE?", "Type of Interaction"]
+columns = ["Orf","Name","Number of TA Sites","Mean count (Strain A Condition 1)","Mean count (Strain A Condition 2)","Mean count (Strain B Condition 1)","Mean count (Strain B Condition 2)", "Mean logFC (Strain A)", "Mean logFC (Strain B)", "Mean delta logFC","Lower Bound delta logFC","Upper Bound delta logFC", "Prob. of delta-logFC being within ROPE", "Adjusted Probability", "Is HDI outside ROPE?", "Type of Interaction"]
 
 
 class GIAnalysis(base.TransitAnalysis):
@@ -143,7 +143,7 @@ class GIGUI(base.AnalysisGUI):
 
 
         # Zeros Check
-        (self.wxobj.giZeroCheckBox, zeroSizer) = self.defineCheckBox(giPanel, labelText="Include sites with all zeros", widgetCheck=True, widgetSize=(-1,-1), tooltipText="Includes sites that are empty (zero) accross all datasets. Unchecking this may be useful for tn5 datasets, where all nucleotides are possible insertion sites and will have a large number of empty sites (significantly slowing down computation and affecting estimates).")
+        (self.wxobj.giZeroCheckBox, zeroSizer) = self.defineCheckBox(giPanel, labelText="Include sites with all zeros", widgetCheck=True, widgetSize=(-1,-1), tooltipText="Includes sites that are empty (zero) across all datasets. Unchecking this may be useful for tn5 datasets, where all nucleotides are possible insertion sites and will have a large number of empty sites (significantly slowing down computation and affecting estimates).")
         giSizer.Add(zeroSizer, 0, wx.EXPAND, 5 )
 
 
@@ -474,6 +474,7 @@ class GIMethod(base.QuadConditionMethod):
                 normalization="TTR",
                 samples=10000,
                 rope=0.5,
+                signif="HDI",
                 includeZeros=False,
                 replicates="Sum",
                 LOESS=False,
@@ -486,8 +487,7 @@ class GIMethod(base.QuadConditionMethod):
         self.samples = samples
         self.includeZeros = includeZeros
         self.rope = rope
-        self.doBFDR = True # TRI
-        self.doFWER = True # TRI
+        self.signif = signif
         self.NTerminus = NTerminus
         self.CTerminus = CTerminus
 
@@ -558,6 +558,7 @@ class GIMethod(base.QuadConditionMethod):
         if not output_path: return None
         output_file = open(output_path, "w")
 
+        signif = "HDI" # need to add choice for this to the GUI
 
         return self(ctrldataA,
                 ctrldataB,
@@ -568,6 +569,7 @@ class GIMethod(base.QuadConditionMethod):
                 normalization,
                 samples,
                 rope,
+                signif,
                 includeZeros,
                 replicates,
                 LOESS,
@@ -600,10 +602,10 @@ class GIMethod(base.QuadConditionMethod):
         normalization = kwargs.get("n", "TTR")
         samples = int(kwargs.get("s", 10000))
         rope = float(kwargs.get("-rope", 0.5)) # fixed! changed int to float
+        signif = kwargs.get("signif","HDI")
         replicates = kwargs.get("r", "Sum")
         includeZeros = kwargs.get("iz", False)
 
-
         LOESS = kwargs.get("l", False)
         ignoreCodon = True
         NTerminus = float(kwargs.get("iN", 0.00))
@@ -618,6 +620,7 @@ class GIMethod(base.QuadConditionMethod):
                 normalization,
                 samples,
                 rope,
+                signif,
                 includeZeros,
                 replicates,
                 LOESS,
@@ -674,7 +677,7 @@ class GIMethod(base.QuadConditionMethod):
         var_list_b2 = []
 
 
-        # Base priors on empirical observations accross genes.
+        # Base priors on empirical observations across genes.
         for gene in sorted(G_A1):
             if gene.n > 1:
                 A1_data = G_A1[gene.orf].reads.flatten()
@@ -764,7 +767,7 @@ class GIMethod(base.QuadConditionMethod):
                 mean_logFC_B = numpy.mean(logFC_B_post)
                 mean_delta_logFC = numpy.mean(delta_logFC_post)
 
-                # Is HDI significantly different than ROPE?
+                # Is HDI significantly different than ROPE? (i.e. no overlap)
                 not_HDI_overlap_bit = l_delta_logFC > self.rope or u_delta_logFC < -self.rope
 
                 # Probability of posterior overlaping with ROPE
@@ -816,25 +819,35 @@ class GIMethod(base.QuadConditionMethod):
             self.transit_message_inplace("Running Export Method... %1.1f%%" % (100.0*count/(N-1)))
             count+=1
 
-        data.sort(key=lambda x: x[-2])
+        # for HDI, maybe I should sort on abs(mean_delta_logFC); however, need to sort by prob to calculate BFDR
+        probcol = -2 # probROPEs
+        data.sort(key=lambda x: x[probcol]) 
+        sortedprobs = numpy.array([x[probcol] for x in data]) 
+
+        # BFDR method: Newton M.A., Noueiry A., Sarkar D., Ahlquist P. (2004). Detecting differential gene expression with a semiparametric hierarchical mixture method. Biostatistics, 5:155–176.
 
-        if self.doBFDR or not self.doFWER:
-            postprob = numpy.array(postprob)
-            postprob.sort()
-            bfdr = numpy.cumsum(postprob)/numpy.arange(1, len(postprob)+1)
-            adjusted_prob = bfdr
+        if self.signif=="BFDR":
+            sortedprobs = numpy.array(sortedprobs) 
+            #sortedprobs.sort() # why, since already sorted?
+            bfdr = numpy.cumsum(sortedprobs)/numpy.arange(1, len(sortedprobs)+1)
+            adjusted_prob = bfdr # should be same order as sorted above by probROPE
             adjusted_label = "BFDR"
-        elif doFWER:
-            fwer = FWER_Bayes(postprob)
-            fwer.sort()
+
+        elif self.signif=="FWER":
+            fwer = stat_tools.FWER_Bayes(sortedprobs)
+            #fwer.sort() # should not need this if monotonic
             adjusted_prob = fwer
             adjusted_label = "FWER"
 
         # If not using adjustment for classification, sort correctly
-        if not self.doBFDR and not self.doFWER:
-            sorted_index = numpy.argsort([d[-1] for d in data])[::-1][:len(data)]
-            adjusted_prob = [adjusted_prob[ii] for ii in sorted_index]
-            data = [data[ii] for ii in sorted_index]
+        else:
+          adjusted_prob = sortedprobs
+          adjusted_label = "un"
+          # should I stable-sort by overlap_bit?
+
+#            sorted_index = numpy.argsort([d[-1] for d in data])[::-1][:len(data)]
+#            adjusted_prob = [adjusted_prob[ii] for ii in sorted_index]
+#            data = [data[ii] for ii in sorted_index]
 
 
 
@@ -848,37 +861,40 @@ class GIMethod(base.QuadConditionMethod):
         else:
             self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))
 
+        now = str(datetime.datetime.now())
+        now = now[:now.rfind('.')]
+        self.output.write("#Date: "+now+"\n")
+        #self.output.write("#Runtime: %s s\n" % (time.time() - start_time))
+
 
         self.output.write("#Control Data-A: %s\n" % (",".join(self.ctrldataA).encode('utf-8')))
         self.output.write("#Control Data-B: %s\n" % (",".join(self.ctrldataB).encode('utf-8')))
         self.output.write("#Experimental Data-A: %s\n" % (",".join(self.expdataA).encode('utf-8')))
         self.output.write("#Experimental Data-B: %s\n" % (",".join(self.expdataB).encode('utf-8')))
         self.output.write("#Annotation path: %s\n" % (self.annotation_path.encode('utf-8')))
-        self.output.write("#Time: %s\n" % (time.time() - start_time))
-        self.output.write("#%s\n" % "\t".join(columns))
-
+        self.output.write("#ROPE=%s, method for significance=%s\n" % (self.rope,self.signif))
+        #self.output.write("#%s\n" % "\t".join(columns))
 
-        if self.doBFDR or self.doFWER:
-            self.output.write("# Significant interactions are those whose adjusted probability of the delta-logFC falling within ROPE is < 0.05 (Adjusted using %s)\n" % (adjusted_label))
-        else:
-            self.output.write("# Significant interactions are those genes whose delta-logFC HDI does not overlap the ROPE\n")
-        self.output.write("#\n")
+        if self.signif=="HDI":
+            self.output.write("#Significant interactions are those genes whose delta-logFC HDI does not overlap the ROPE\n")
+        elif self.signif in "prob BDFR FWER":
+            self.output.write("#Significant interactions are those whose %s-adjusted probability of the delta-logFC falling within ROPE is < 0.05.\n" % (adjusted_label))
 
-        # Write column names
-        self.output.write("#ORF\tName\tNumber of TA Sites\tMean count (Strain A Condition 1)\tMean count (Strain A Condition 2)\tMean count (Strain B Condition 1)\tMean count (Strain B Condition 2)\tMean logFC (Strain A)\tMean logFC (Strain B) \tMean delta logFC\tLower Bound delta logFC\tUpper Bound delta logFC\tProb. of delta-logFC being within ROPE\tAdjusted Probability (%s)\tIs HDI outside ROPE?\tType of Interaction\n" % adjusted_label)
+        # Write column names (redundant with self.columns)
+        self.output.write("#ORF\tName\tNumber of TA Sites\tMean count (Strain A Condition 1)\tMean count (Strain A Condition 2)\tMean count (Strain B Condition 1)\tMean count (Strain B Condition 2)\tMean logFC (Strain A)\tMean logFC (Strain B) \tMean delta logFC\tLower Bound delta logFC\tUpper Bound delta logFC\tIs HDI outside ROPE?\tProb. of delta-logFC being within ROPE\t%s-Adjusted Probability\tType of Interaction\n" % adjusted_label)
 
         # Write gene results
         for i,row in enumerate(data):
-        #1   2    3        4                5              6               7                8            9            10              11             12            13         14
-            orf, name, n, mean_muA1_post, mean_muA2_post, mean_muB1_post, mean_muB2_post, mean_logFC_A, mean_logFC_B, mean_delta_logFC, l_delta_logFC, u_delta_logFC, probROPE, not_HDI_overlap_bit = row
-            type_of_interaction = "No Interaction"
-            if ((self.doBFDR or self.doFWER) and adjusted_prob[i] < 0.05):
-                type_of_interaction = self.classify_interaction(mean_delta_logFC, mean_logFC_B, mean_logFC_A)
-            elif not (self.doBFDR or self.doFWER) and not_HDI_overlap_bit:
-                type_of_interaction = self.classify_interaction(mean_delta_logFC, mean_logFC_B, mean_logFC_A)
+          #1   2    3        4                5              6               7                8            9            10              11             12            13         14
+          orf, name, n, mean_muA1_post, mean_muA2_post, mean_muB1_post, mean_muB2_post, mean_logFC_A, mean_logFC_B, mean_delta_logFC, l_delta_logFC, u_delta_logFC, probROPE, not_HDI_overlap_bit = row
+
+          interaction = self.classify_interaction(mean_delta_logFC, mean_logFC_B, mean_logFC_A)
+          type_of_interaction = "No Interaction"
+          if self.signif in "prob BFDR FWER" and adjusted_prob[i] < 0.05: type_of_interaction = interaction
+          if self.signif=="HDI" and not_HDI_overlap_bit: type_of_interaction = interaction
 
-            new_row = tuple(list(row[:-1])+[adjusted_prob[i], not_HDI_overlap_bit, type_of_interaction])
-            self.output.write("%s\t%s\t%d\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.8f\t%1.8f\t%s\t%s\n" % new_row)
+          new_row = tuple(list(row[:-2])+[not_HDI_overlap_bit, probROPE, adjusted_prob[i], type_of_interaction])
+          self.output.write("%s\t%s\t%d\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%s\t%1.8f\t%1.8f\t%s\n" % new_row)
 
 
         self.transit_message("Adding File: %s" % (self.output.name))
@@ -907,16 +923,24 @@ class GIMethod(base.QuadConditionMethod):
         GI performs a comparison among 4 groups of datasets, strain A and B assessed in conditions 1 and 2 (e.g. control vs treatment).
         It looks for interactions where the response to the treatment (i.e. effect on insertion counts) depends on the strain (output variable: delta_LFC).
         Provide replicates in each group as a comma-separated list of wig files.
-        Significant interactions are those with "HDI outside ROPE?"=TRUE, and all genes are sorted by significance using BFDR."
+        HDI is highest density interval for posterior distribution of delta_LFC, which is like a confidence interval on difference of slopes.
+        Genes are sorted by probability of HDI overlapping with ROPE. (genes with the highest abs(mean_delta_logFC) are near the top, approximately)
+        Significant genes are indicated by 'Type of Interaction' column (No Interaction, Aggravating, Alleviating, Suppressive).
+          By default, hits are defined as "Is HDI outside of ROPE?"=TRUE (i.e. non-overlap of delta_LFC posterior distritbuion with Region of Probably Equivalence around 0)
+          Alternative methods for significance: use -signif flag with prob, BFDR, or FWER. These affect 'Type of Interaction' (i.e. which genes are labeled 'No Interaction')
 
         Optional Arguments:
         -s <integer>    :=  Number of samples. Default: -s 10000
         --rope <float>  :=  Region of Practical Equivalence. Area around 0 (i.e. 0 +/- ROPE) that is NOT of interest. Can be thought of similar to the area of the null-hypothesis. Default: --rope 0.5
         -n <string>     :=  Normalization method. Default: -n TTR
-        -iz             :=  Include rows with zero accross conditions.
+        -iz             :=  Include rows with zero across conditions.
         -l              :=  Perform LOESS Correction; Helps remove possible genomic position bias. Default: Turned Off.
         -iN <float>     :=  Ignore TAs occuring at given percentage (as integer) of the N terminus. Default: -iN 0
         -iC <float>     :=  Ignore TAs occuring at given percentage (as integer) of the C terminus. Default: -iC 0
+        -signif HDI     :=  (default) Significant if HDI does not overlap ROPE; if HDI overlaps ROPE, 'Type of Interaction' is set to 'No Interaction'
+        -signif prob    :=  Optionally, significant hits are re-defined based on probability (degree) of overlap of HDI with ROPE, prob<0.05 (no adjustment)
+        -signif BFDR    :=  Apply "Bayesian" FDR correction (see doc) to adjust HDI-ROPE overlap probabilities so that significant hits are re-defined as BFDR<0.05
+        -signif FWER    :=  Apply "Bayesian" FWER correction (see doc) to adjust HDI-ROPE overlap probabilities so that significant hits are re-defined as FWER<0.05
         """ % (sys.argv[0])
 
 


=====================================
src/pytransit/analysis/heatmap.py
=====================================
@@ -102,41 +102,51 @@ class HeatmapMethod(base.SingleConditionMethod):
 
     @classmethod
     def fromargs(self, rawargs): 
+        (args, kwargs) = transit_tools.cleanargs(rawargs)
         if len(rawargs)<3: print(self.usage_string()); sys.exit(-1)
         self.filetype = None
-        if rawargs[0]=="-anova": self.filetype = "anova"
-        elif rawargs[0]=="-zinb": self.filetype = "zinb"
+        if kwargs.get("anova",False): self.filetype = "anova"
+        elif kwargs.get("zinb",False): self.filetype = "zinb"
         else: print(self.usage_string()); sys.exit(-1)
-        self.infile = rawargs[1]
-        self.outfile = rawargs[2]
+        self.infile = args[0]
+        self.outfile = args[1]
+        self.qval = float(kwargs.get("qval",0.05))
+        self.topk = int(kwargs.get("topk",-1))
         return self(self.infile,outfile=self.outfile)
 
     def Run(self):
 
-        # assume first non-comment line is header; samples are 
+        if self.filetype!="anova" and self.filetype!="zinb":
+          print("filetype not recognized: %s" % self.filetype); sys.exit(-1)
+        
         headers = None
-        data,LFCs = [],[]
-
-        if self.filetype=="anova" or self.filetype=="zinb":
-          n = -1 # number of conditions
-          for line in open(self.infile):
-            w = line.rstrip().split('\t')
-            if line[0]=='#' or ('pval' in line and 'padj' in line): # check for 'pval' for backwards compatibility
-              headers = w; continue # keep last comment line as headers
-            if n==-1: 
-              # ANOVA header line has names of conditions, organized as 3+2*n+3 (2 groups (means, LFCs) X n conditions)
-              # ZINB header line has names of conditions, organized as 3+4*n+3 (4 groups X n conditions)
-              if self.filetype=="anova": n = int((len(w)-6)/2) 
-              elif self.filetype=="zinb": n = int((len(headers)-6)/4) 
-              headers = headers[3:3+n]
-              headers = [x.replace("Mean_","") for x in headers]
+        data,hits = [],[]
+        n = -1 # number of conditions
+
+        for line in open(self.infile):
+          w = line.rstrip().split('\t')
+          if line[0]=='#' or ('pval' in line and 'padj' in line): # check for 'pval' for backwards compatibility
+            headers = w; continue # keep last comment line as headers
+          # assume first non-comment line is header
+          if n==-1: 
+            # ANOVA header line has names of conditions, organized as 3+2*n+3 (2 groups (means, LFCs) X n conditions)
+            # ZINB header line has names of conditions, organized as 3+4*n+3 (4 groups X n conditions)
+            if self.filetype=="anova": n = int((len(w)-6)/2) 
+            elif self.filetype=="zinb": n = int((len(headers)-6)/4) 
+            headers = headers[3:3+n]
+            headers = [x.replace("Mean_","") for x in headers]
+          else:
             lfcs = [float(x) for x in w[3+n:3+n+n]] # take just the columns of means
             qval = float(w[-2])
-            if qval<0.05: data.append(w); LFCs.append(lfcs)
-        else: print("filetype not recognized: %s" % self.filetype); sys.exit(-1)
-        print("heatmap based on %s genes" % len(LFCs))
+            data.append((w,lfcs,qval))
 
-        genenames = ["%s/%s" % (w[0],w[1]) for w in data]
+        data.sort(key=lambda x: x[-1])
+        hits,LFCs = [],[]
+        for k,(w,lfcs,qval) in enumerate(data):
+          if (self.topk==-1 and qval<self.qval) or (self.topk!=-1 and k<self.topk): hits.append(w); LFCs.append(lfcs)
+
+        print("heatmap based on %s genes" % len(hits))
+        genenames = ["%s/%s" % (w[0],w[1]) for w in hits]
         hash = {}
         headers = [h.replace("Mean_","") for h in headers]
         for i,col in enumerate(headers): hash[col] = FloatVector([x[i] for x in LFCs])
@@ -167,8 +177,7 @@ dev.off()
 
     @classmethod
     def usage_string(self):
-        return "usage: python3 %s heatmap -anova|-zinb <anova_or_zinb_output> <heatmap.png>" % sys.argv[0]
-        # could add a flag for padj cutoff (or top n most signif genes)
+        return "usage: python3 %s heatmap <anova_or_zinb_output> <heatmap.png> -anova|-zinb [-topk <int>] [-qval <float>]\n note: genes are selected based on qval<0.05 by default" % sys.argv[0]
 
 
 if __name__ == "__main__":


=====================================
src/pytransit/analysis/pathway_enrichment.py
=====================================
@@ -79,7 +79,7 @@ class PathwayGUI(base.AnalysisGUI):
 
 class PathwayMethod(base.AnalysisMethod):
 
-  def __init__(self,resamplingFile,associationsFile,pathwaysFile,outputFile,method,PC=0,N=10000,p=1):
+  def __init__(self,resamplingFile,associationsFile,pathwaysFile,outputFile,method,PC=0,Nperm=10000,p=0,ranking="SLPV"):
     base.AnalysisMethod.__init__(self, short_name, long_name, short_desc, long_desc, open(outputFile,"w"), None) # no annotation file
     self.resamplingFile = resamplingFile
     self.associationsFile = associationsFile
@@ -87,9 +87,10 @@ class PathwayMethod(base.AnalysisMethod):
     self.outputFile = outputFile 
     # self.output is the opened file, which will be set in base class
     self.method = method
-    self.PC = PC # for FISHER
-    self.N = N # for GSEA
+    self.PC = PC # for FET
+    self.Nperm = Nperm # for GSEA
     self.p = p # for GSEA
+    self.ranking = ranking # for GSEA
 
   @classmethod
   def fromGUI(self, wxobj):
@@ -102,37 +103,82 @@ class PathwayMethod(base.AnalysisMethod):
     associations = args[1]
     pathways = args[2]
     output = args[3]
-    method = kwargs.get("M", "FISHER")
-    N = int(kwargs.get("N", "10000")) # for GSEA?
-    p = int(kwargs.get("p","1")) # for GSEA?
-    PC = int(kwargs.get("PC","2"))
+    method = kwargs.get("M", "FET")
+    PC = int(kwargs.get("PC","2")) # for FET
+    Nperm = int(kwargs.get("Nperm", "10000")) # for GSEA
+    p = float(kwargs.get("p","0")) # for GSEA
+    ranking = kwargs.get("ranking","SLPV") # for GSEA
 
-    if method not in "FISHER GSEA ONT".split(): 
+    if method not in "FET GSEA ONT".split(): 
       print("error: method %s not recognized" % method)
       print(self.usage_string()); 
       sys.exit(0)
-    if method=="ONT": print("error: Ontologizer is not implemented yet"); sys.exit(0)
 
-    return self(resamplingFile,associations,pathways,output,method,PC=PC,N=N,p=p)
+    return self(resamplingFile,associations,pathways,output,method,PC=PC,Nperm=Nperm,p=p,ranking=ranking)
 
   @classmethod
   def usage_string(self):
-    return """python3 %s pathway_enrichment <resampling_file> <associations> <pathways> <output_file> [-M <FISHER|GSEA|GO>] [-PC <int>]""" % (sys.argv[0])
+    return """python3 %s pathway_enrichment <resampling_file> <associations> <pathways> <output_file> [-M <FET|GSEA|GO>] [-PC <int>] [-ranking SLPV|LFC] [-p <float>] [-Nperm <int>]
+
+Optional parameters:
+ -M FET|GSEA|ONT:     method to use, FET for Fisher's Exact Test (default), GSEA for Gene Set Enrichment Analysis (Subramaniam et al, 2005), or ONT for Ontologizer (Grossman et al, 2007)
+ -ranking (for GSEA): SLPV is signed-log-p-value (default); LFC is log2-fold-change from resampling
+ -p       (for GSEA): exponent to use in calculating enrichment score; recommend trying 0 or 1 (as in Subramaniam et al, 2005)
+ -Nperm   (for GSEA): number of permutations to simulate for null distribution to determine p-value (default=10000)
+ -PC      (for FET):  pseudo-counts to use in calculating p-value based on hypergeometric distribution (default=2)
+""" % (sys.argv[0])
 
   def Run(self):
     self.transit_message("Starting Pathway Enrichment Method")
     start_time = time.time()
 
     # self.output in base class should be open by now
-    self.write("# command: "+' '.join(sys.argv))
-    self.write("# date: "+str(datetime.datetime.now()))
+    self.write("# command: python3 "+' '.join(sys.argv))
+    now = str(datetime.datetime.now())
+    now = now[:now.rfind('.')]
+    self.write("# date: "+now)
+    self.write("# pathway method: "+self.method)
 
-    if self.method=="FISHER": self.fisher_exact_test()
+    if self.method=="FET": self.fisher_exact_test()
     elif self.method =="GSEA": self.GSEA()
+    elif self.method =="ONT": self.Ontologizer()
     else:
       method = "Not a valid method"
       self.progress_update("Not a valid method", 100)
 
+  def write(self,msg): self.output.write(msg+"\n")
+
+  def read_resampling_file(self,filename):
+    genes,hits,headers = [],[],[]
+    for line in open(filename):
+      if line[0]=='#': headers.append(line); continue
+      w = line.rstrip().split('\t')
+      genes.append(w)
+      qval = float(w[-1])
+      if qval<0.05: hits.append(w[0])
+    return genes,hits,headers
+
+  # assume these are listed as pairs (tab-sep)
+  # return bidirectional hash (genes->[terms], terms->[genes]; each can be one-to-many, hence lists)
+  def read_associations(self,filename):
+    associations = {}
+    for line in open(filename):
+      if line[0]=='#': continue
+      w = line.rstrip().split('\t')
+      # store mappings in both directions
+      for (a,b) in [(w[0],w[1]),(w[1],w[0])]:
+        if a not in associations: associations[a] = []
+        associations[a].append(b)
+    return associations
+
+  def read_pathways(self,filename):
+    pathways = {}
+    for line in open(filename):
+      if line[0]=='#': continue
+      w = line.rstrip().split('\t')
+      pathways[w[0]] = w[1]    
+    return pathways
+
   ############### GSEA ######################
 
   def makeindex(self,lst):
@@ -141,7 +187,6 @@ class PathwayMethod(base.AnalysisMethod):
     return index
     
   # based on GSEA paper (Subramanian et al, 2005, PNAS)
-  # xxx assume that Bindex is a dictionary that maps all genes into ranks, and Bscores maps all genes to SLPV
   # ranks and scores are hashes from genes into ranks and SLPV
   # when p=0, ES(S) reduces to the standard K-S statistic; p=1 is used in PNAS paper
     
@@ -183,7 +228,8 @@ class PathwayMethod(base.AnalysisMethod):
     terms2orfs = associations
     allgenes = [x[0] for x in data]
 
-    self.write("# method=GSEA, using SLPV to rank genes, Nperm=%d" % self.N)
+    self.write("# method=GSEA, Nperm=%d, p=%d" % (self.Nperm,self.p))
+    self.write("# ranking genes by %s" % self.ranking)
     self.write("# total genes: %s, mean rank: %s" % (len(data),n2))
 
     # rank by SLPV=sign(LFC)*log10(pval)
@@ -195,14 +241,21 @@ class PathwayMethod(base.AnalysisMethod):
     for w in data:
       orf,LFC,Pval = w[0],float(w[LFC_col]),float(w[Pval_col])
       SLPV = (-1 if LFC<0 else 1)*math.log(Pval+0.000001,10)
-      pairs.append((orf,SLPV))
+      if self.ranking=="SLPV": pairs.append((orf,SLPV))
+      elif self.ranking=="LFC": pairs.append((orf,LFC))
+
+    # pre-randomize ORFs, to avoid genome-position bias in case of ties in pvals (e.g. 1.0)
+    indexes = range(len(pairs))
+    indexes = numpy.random.permutation(indexes).tolist() 
+    pairs = [pairs[i] for i in indexes]
+
     pairs.sort(key=lambda x: x[1],reverse=True) # emulate ranking genes with *higher* correlation at top
     orfs2rank,orfs2score = {},{}
     for i,(orf,score) in enumerate(pairs): 
       orfs2score[orf] = score
       orfs2rank[orf] = i
 
-    Nperm = self.N
+    Nperm = self.Nperm
     results,Total = [],len(terms)
     for i,term in enumerate(terms):
       sys.stdout.flush()
@@ -254,38 +307,7 @@ class PathwayMethod(base.AnalysisMethod):
       self.output.write('\t'.join([str(x) for x in vals])+'\n')
     self.output.close()
 
-  #################################
-
-  def read_resampling_file(self,filename):
-    genes,hits,headers = [],[],[]
-    for line in open(filename):
-      if line[0]=='#': headers.append(line); continue
-      w = line.rstrip().split('\t')
-      genes.append(w)
-      qval = float(w[-1])
-      if qval<0.05: hits.append(w[0])
-    return genes,hits,headers
-
-  # assume these are listed as pairs (tab-sep)
-  # return bidirectional hash (genes->[terms], terms->[genes]; each can be one-to-many, hence lists)
-  def read_associations(self,filename):
-    associations = {}
-    for line in open(filename):
-      if line[0]=='#': continue
-      w = line.rstrip().split('\t')
-      # store mappings in both directions
-      for (a,b) in [(w[0],w[1]),(w[1],w[0])]:
-        if a not in associations: associations[a] = []
-        associations[a].append(b)
-    return associations
-
-  def read_pathways(self,filename):
-    pathways = {}
-    for line in open(filename):
-      if line[0]=='#': continue
-      w = line.rstrip().split('\t')
-      pathways[w[0]] = w[1]    
-    return pathways
+  ########## Fisher Exact Test ###############
 
   # HYPERGEOMETRIC 
   # scipy.stats.hypergeom.sf() is survival function (1-cdf), so only enriched genes will be significant
@@ -297,8 +319,6 @@ class PathwayMethod(base.AnalysisMethod):
   def hypergeometric(self,k,M,n,N):
     return hypergeom.sf(k,M,n,N)
 
-  def write(self,msg): self.output.write(msg+"\n")
-
   def fisher_exact_test(self):
     genes,hits,headers = self.read_resampling_file(self.resamplingFile)
     associations = self.read_associations(self.associationsFile)
@@ -313,7 +333,7 @@ class PathwayMethod(base.AnalysisMethod):
     for gene in genes: 
       orf = gene[0]
       if orf in associations: genes_with_associations += 1
-    self.write("# method=FISHER, PC=%s" % self.PC)
+    self.write("# method=FET, PC=%s" % self.PC)
     self.write("# genes with associations=%s out of %s total" % (genes_with_associations,len(genes)))
     self.write("# significant genes (qval<0.05): %s" % (len(hits)))
 
@@ -366,6 +386,142 @@ class PathwayMethod(base.AnalysisMethod):
     self.finish()
     self.transit_message("Finished Pathway Enrichment Method") 
 
+  ########## Ontologizer ###############
+
+  # this method is restricted to GO terms
+
+  # this implements the union method of:
+  #  Grossman et al (2007). Improved Detection of overrepresentation of Gene-Ontology 
+  #  annotation with parent-child analysis. Bioinformatics, 23(22):3024-3031.
+
+  def Ontologizer(self):
+
+    def warning(s): sys.stderr.write("%s\n" % s) # use self.warning? prepend method name?
+  
+    # returns True if ch is a descendant of GO (as GO terms, like "GO:0006810")
+    
+    def descendant_of(ch,GO):
+      if ch==GO: return True
+      for node in parents.get(ch,[]):
+        if descendant_of(node,GO)==True: return True
+      return False
+    
+    # visited is a hash
+    # assume B is above A
+    # do upward DFS following parent pointers till hit NULL
+    # return all nodes on all paths? include A and B?
+    
+    def get_ancestors(A,visited):
+      if A in visited: return visited
+      visited[A] = 1
+      for parent in parents.get(A,[]): get_ancestors(parent,visited)
+      return visited
+    
+    def depth(go):
+      if go not in parents: return 0
+      return 1+min([depth(x) for x in parents[go]])
+    
+    ontology,parents = {},{}
+  
+    GOannot = self.associationsFile
+    OBOfile = self.pathwaysFile
+
+    for line in open(OBOfile):
+      if line[:3]=="id:": id = line[4:-1]
+      if line[:5]=="is_a:":
+        parent = line.split()[1]
+        if id not in parents: parents[id] = []
+        parents[id].append(parent)
+      if len(line)<2: id = None
+      if line[:5]=="name:": ontology[id] = line[6:-1]
+  
+    rv2gos,go2rvs = {},{}
+    MINTERMS,MAXTERMS = 2,300
+  
+    for line in open(GOannot):
+      w = line.rstrip().split('\t')
+      rv,go = w[0],w[1]
+      if rv not in rv2gos: rv2gos[rv] = []
+      if go not in rv2gos[rv]: rv2gos[rv].append(go)
+      if go not in go2rvs: go2rvs[go] = []
+      if rv not in go2rvs[go]: go2rvs[go].append(rv)
+      # expand to all parents...
+      BP,CC,MF = ["GO:0008150","GO:0005575","GO:0003674"]
+      for g in get_ancestors(go,{}):
+        if g not in rv2gos[rv]: 
+          rv2gos[rv].append(g)
+          if g not in go2rvs: go2rvs[g] = []
+          go2rvs[g].append(rv)
+  
+    warning("GO terms with at least one ORF: %s" % len(go2rvs.keys())) # what about between MIN and MAX?
+    for go in go2rvs.keys():
+      if go not in ontology: warning("not found: %s" % go) # also indicate which gene?
+
+    # could use class method, but would have to adapt it: 
+    # genes,hits,headers = self.read_resampling_file(self.resamplingFile)
+  
+    genes,pvals = {},[]
+    allorfs,studyset = [],[]
+    for line in open(self.resamplingFile):
+      if line[0]=="#": continue
+      w = line.rstrip().split()
+      genes[w[0]] = w
+      allorfs.append(w[0])
+      pval,qval = float(w[-2]),float(w[-1])
+      if qval<0.05: studyset.append(w[0])
+      pvals.append((w[0],pval))
+    pvals.sort(key=lambda x: x[1])
+    ranks = {}
+    for i,(rv,pval) in enumerate(pvals): ranks[rv] = i+1
+    self.write("# number of resampling hits (qval<0.05): %s" % len(studyset))
+  
+    counts = []
+    n,a = len(allorfs),len(studyset)
+    for go in go2rvs.keys():
+      if go not in ontology: continue
+      m = len(go2rvs[go]) # orfs in the GO terms in the genome overall
+      if m>=MINTERMS and m<=MAXTERMS:
+        b = len(list(filter(lambda x: x in go2rvs[go],studyset))) # orfs with GO term in studyset
+        if b==0: continue
+        # calc p=len(rvs for parents of go), q=len(subset of parent rvs in studyset)
+        P = set()
+        if go not in parents: continue
+        npar = len(parents[go])
+        for par in parents[go]: P = P.union(go2rvs[par])
+        p,q = len(P),len(P.intersection(studyset))
+        enrich = ((b+1)/float(q+1))/((m+1)/float(p+1)) # enrichment
+        # parents: p overall, q in studyset; GO in studyset: b out of q (subset of studyset labeled with a parent GO term)
+        # assume m orfs have GO term inside parent (same as overall)         
+        # subtract 1 from b possibly because we are using 1-cdf? this is needed to get same numeric results as 'phypergeometric' in Ontologizer
+        pval = 1.0-scipy.stats.hypergeom.cdf(b-1,p,m,q) 
+        counts.append([go,n,m,p,q,a,b,npar,enrich,pval])
+    
+    pvals = [x[-1] for x in counts]
+    qvals = multitest.fdrcorrection(pvals)[1]
+  
+    counts = [x+[y] for x,y in zip(counts,qvals)]
+    counts.sort(key=lambda x: x[-1])
+
+    self.write("# number of GO terms that are significantly enriched (qval<0.05): %s" % len(list(filter(lambda x: x[-1]<0.05,counts))))
+  
+    # 1-sided pvals, only report enriched terms, not depleted
+  
+    self.write('\t'.join("GO_term description total_orfs orfs_in_GO orfs_in_parents hits_in_parents hits GO_in_hits num_parent_nodes enrichment pval qval genes_in_intersection_of_hits_and_GO_term".split())) # assume self.outputFile has already been opened
+    for (go,n,m,p,q,a,b,npar,enrich,pval,qval) in counts:
+      hits = filter(lambda x: x in go2rvs[go],studyset)
+      hits = [(x,genes[x][1],ranks[x]) for x in hits]
+      hits.sort(key=lambda x: x[2]) # sort on ranks
+      hits = ["%s/%s(%s)" % (rv,gene,rank) for (rv,gene,rank) in hits]
+      hits = ','.join(hits)    # add gene names
+      vals = [go,ontology[go],n,m,a,b,p,q,npar,round(enrich,3),round(pval,6),round(qval,6),hits]
+      self.write('\t'.join([str(x) for x in vals]))
+
+    self.transit_message("Adding File: %s" % (self.outputFile))
+    self.add_file(filetype="Pathway Enrichment")
+    self.finish()
+    self.transit_message("Finished Pathway Enrichment Method") 
+
+
 ####################################################
 
 if __name__ == "__main__":


=====================================
src/pytransit/data/GO_terms_for_each_Rv.obo-3-11-18.txt
=====================================
The diff for this file was not included because it is too large.

=====================================
src/pytransit/data/H37Rv_COG_roles.dat
=====================================
@@ -40,7 +40,6 @@ Rv0054	L	ssb	Replication, recombination and repair
 Rv0055	J	rpsR	Translation, ribosomal structure and biogenesis 
 Rv0056	J	rplI	Translation, ribosomal structure and biogenesis 
 Rv0058	L	dnaB	Replication, recombination and repair 
-Rv0058	L	dnaB	Replication, recombination and repair 
 Rv0060	R	-	General function prediction only 
 Rv0062	G	celA1	Carbohydrate transport and metabolism 
 Rv0063	C	-	Energy production and conversion 
@@ -66,7 +65,6 @@ Rv0084	C	hycD	Energy production and conversion
 Rv0085	C	hycP	Energy production and conversion 
 Rv0086	P	hycQ	Inorganic ion transport and metabolism 
 Rv0087	C	hycE	Energy production and conversion 
-Rv0087	C	hycE	Energy production and conversion 
 Rv0088	J	-	Translation, ribosomal structure and biogenesis 
 Rv0089	R	-	General function prediction only 
 Rv0090	P	-	Inorganic ion transport and metabolism 
@@ -78,7 +76,6 @@ Rv0097	Q	-	Secondary metabolites biosynthesis, transport and catabolism
 Rv0099	Q	fadD10	Secondary metabolites biosynthesis, transport and catabolism 
 Rv0100	Q	-	Secondary metabolites biosynthesis, transport and catabolism 
 Rv0101	Q	nrp	Secondary metabolites biosynthesis, transport and catabolism 
-Rv0101	Q	nrp	Secondary metabolites biosynthesis, transport and catabolism 
 Rv0102	S	-	Function unknown 
 Rv0103c	P	ctpB	Inorganic ion transport and metabolism 
 Rv0104	T	-	Signal transduction mechanisms 
@@ -135,11 +132,11 @@ Rv0165c	K	-	Transcription
 Rv0166	Q	fadD5	Secondary metabolites biosynthesis, transport and catabolism 
 Rv0167	Q	yrbE1A	Secondary metabolites biosynthesis, transport and catabolism 
 Rv0168	Q	yrbE1B	Secondary metabolites biosynthesis, transport and catabolism 
-Rv0169	T	mce1A	Signal transduction mechanisms 
 Rv0169	Q	mce1A	Secondary metabolites biosynthesis, transport and catabolism 
+Rv0169	T	mce1A	Signal transduction mechanisms 
 Rv0170	Q	mce1B	Secondary metabolites biosynthesis, transport and catabolism 
-Rv0171	T	mce1C	Signal transduction mechanisms 
 Rv0171	Q	mce1C	Secondary metabolites biosynthesis, transport and catabolism 
+Rv0171	T	mce1C	Signal transduction mechanisms 
 Rv0172	Q	mce1D	Secondary metabolites biosynthesis, transport and catabolism 
 Rv0173	Q	lprK	Secondary metabolites biosynthesis, transport and catabolism 
 Rv0174	Q	mce1F	Secondary metabolites biosynthesis, transport and catabolism 
@@ -168,7 +165,6 @@ Rv0207c	S	-	Function unknown
 Rv0208c	R	trmB	General function prediction only 
 Rv0211	C	pckA	Energy production and conversion 
 Rv0212c	H	nadR	Coenzyme transport and metabolism 
-Rv0212c	H	nadR	Coenzyme transport and metabolism 
 Rv0213c	C	-	Energy production and conversion 
 Rv0214	Q	fadD4	Secondary metabolites biosynthesis, transport and catabolism 
 Rv0215c	I	fadE3	Lipid transport and metabolism 
@@ -204,15 +200,14 @@ Rv0253	R	nirD	General function prediction only
 Rv0254c	H	cobU	Coenzyme transport and metabolism 
 Rv0255c	H	cobQ1	Coenzyme transport and metabolism 
 Rv0259c	S	-	Function unknown 
-Rv0260c	K	-	Transcription 
 Rv0260c	H	-	Coenzyme transport and metabolism 
+Rv0260c	K	-	Transcription 
 Rv0261c	P	narK3	Inorganic ion transport and metabolism 
 Rv0262c	R	aac	General function prediction only 
 Rv0263c	E	-	Amino acid transport and metabolism 
 Rv0264c	E	-	Amino acid transport and metabolism 
 Rv0265c	P	-	Inorganic ion transport and metabolism 
 Rv0266c	Q	oplA	Secondary metabolites biosynthesis, transport and catabolism 
-Rv0266c	Q	oplA	Secondary metabolites biosynthesis, transport and catabolism 
 Rv0267	P	narU	Inorganic ion transport and metabolism 
 Rv0268c	D	-	Cell cycle control, cell division, chromosome partitioning 
 Rv0269c	L	-	Replication, recombination and repair 
@@ -248,8 +243,8 @@ Rv0319	O	pcp	Posttranslational modification, protein turnover, chaperones
 Rv0321	F	dcd	Nucleotide transport and metabolism 
 Rv0322	M	udgA	Cell wall/membrane/envelope biogenesis 
 Rv0323c	S	-	Function unknown 
-Rv0324	P	-	Inorganic ion transport and metabolism 
 Rv0324	K	-	Transcription 
+Rv0324	P	-	Inorganic ion transport and metabolism 
 Rv0325	R	-	General function prediction only 
 Rv0326	R	-	General function prediction only 
 Rv0327c	Q	cyp135A1	Secondary metabolites biosynthesis, transport and catabolism 
@@ -261,7 +256,6 @@ Rv0334	M	rmlA	Cell wall/membrane/envelope biogenesis
 Rv0337c	E	aspC	Amino acid transport and metabolism 
 Rv0338c	C	-	Energy production and conversion 
 Rv0339c	K	-	Transcription 
-Rv0339c	K	-	Transcription 
 Rv0342	R	iniA	General function prediction only 
 Rv0343	R	iniC	General function prediction only 
 Rv0345	R	-	General function prediction only 
@@ -295,10 +289,9 @@ Rv0380c	J	-	Translation, ribosomal structure and biogenesis
 Rv0382c	F	pyrE	Nucleotide transport and metabolism 
 Rv0384c	O	clpB	Posttranslational modification, protein turnover, chaperones 
 Rv0385	C	-	Energy production and conversion 
-Rv0385	C	-	Energy production and conversion 
-Rv0386	T	-	Signal transduction mechanisms 
 Rv0386	K	-	Transcription 
 Rv0386	R	-	General function prediction only 
+Rv0386	T	-	Signal transduction mechanisms 
 Rv0389	F	purT	Nucleotide transport and metabolism 
 Rv0390	P	-	Inorganic ion transport and metabolism 
 Rv0391	E	metZ	Amino acid transport and metabolism 
@@ -347,8 +340,8 @@ Rv0449c	R	-	General function prediction only
 Rv0450c	R	mmpL4	General function prediction only 
 Rv0452	K	-	Transcription 
 Rv0454	J	-	Translation, ribosomal structure and biogenesis 
-Rv0456c	I	echA2	Lipid transport and metabolism 
 Rv0456A	T	-	Signal transduction mechanisms 
+Rv0456c	I	echA2	Lipid transport and metabolism 
 Rv0457c	E	-	Amino acid transport and metabolism 
 Rv0458	C	-	Energy production and conversion 
 Rv0459	S	-	Function unknown 
@@ -360,8 +353,8 @@ Rv0466	I	-	Lipid transport and metabolism
 Rv0467	C	icl	Energy production and conversion 
 Rv0468	I	fadB2	Lipid transport and metabolism 
 Rv0469	M	umaA	Cell wall/membrane/envelope biogenesis 
-Rv0470c	M	pcaA	Cell wall/membrane/envelope biogenesis 
 Rv0470A	H	-	Coenzyme transport and metabolism 
+Rv0470c	M	pcaA	Cell wall/membrane/envelope biogenesis 
 Rv0472c	K	-	Transcription 
 Rv0473	S	-	Function unknown 
 Rv0474	K	-	Transcription 
@@ -392,7 +385,6 @@ Rv0508	C	-	Energy production and conversion
 Rv0509	H	hemA	Coenzyme transport and metabolism 
 Rv0510	H	hemC	Coenzyme transport and metabolism 
 Rv0511	H	hemD	Coenzyme transport and metabolism 
-Rv0511	H	hemD	Coenzyme transport and metabolism 
 Rv0512	H	hemB	Coenzyme transport and metabolism 
 Rv0516c	T	-	Signal transduction mechanisms 
 Rv0517	I	-	Lipid transport and metabolism 
@@ -437,9 +429,8 @@ Rv0566c	S	-	Function unknown
 Rv0567	R	-	General function prediction only 
 Rv0568	Q	cyp135B1	Secondary metabolites biosynthesis, transport and catabolism 
 Rv0570	F	nrdZ	Nucleotide transport and metabolism 
-Rv0571c	R	-	General function prediction only 
-Rv0571c	R	-	General function prediction only 
 Rv0571c	I	-	Lipid transport and metabolism 
+Rv0571c	R	-	General function prediction only 
 Rv0573c	H	-	Coenzyme transport and metabolism 
 Rv0574c	M	-	Cell wall/membrane/envelope biogenesis 
 Rv0575c	C	-	Energy production and conversion 
@@ -485,11 +476,10 @@ Rv0627	R	-	General function prediction only
 Rv0628c	S	-	Function unknown 
 Rv0629c	L	recD	Replication, recombination and repair 
 Rv0630c	L	recB	Replication, recombination and repair 
-Rv0630c	L	recB	Replication, recombination and repair 
 Rv0631c	L	recC	Replication, recombination and repair 
 Rv0632c	I	echA3	Lipid transport and metabolism 
-Rv0634c	R	-	General function prediction only 
 Rv0634B	J	rpmG	Translation, ribosomal structure and biogenesis 
+Rv0634c	R	-	General function prediction only 
 Rv0636	I	-	Lipid transport and metabolism 
 Rv0638	U	secE	Intracellular trafficking, secretion, and vesicular transport 
 Rv0639	K	nusG	Transcription 
@@ -663,7 +653,6 @@ Rv0855	C	far	Energy production and conversion
 Rv0858c	E	-	Amino acid transport and metabolism 
 Rv0859	I	fadA	Lipid transport and metabolism 
 Rv0860	I	fadB	Lipid transport and metabolism 
-Rv0860	I	fadB	Lipid transport and metabolism 
 Rv0861c	L	ercc3	Replication, recombination and repair 
 Rv0864	H	moaC	Coenzyme transport and metabolism 
 Rv0865	H	mog	Coenzyme transport and metabolism 
@@ -678,8 +667,8 @@ Rv0876c	R	-	General function prediction only
 Rv0880	K	-	Transcription 
 Rv0881	J	-	Translation, ribosomal structure and biogenesis 
 Rv0884c	E	serC	Amino acid transport and metabolism 
-Rv0886	R	fprB	General function prediction only 
 Rv0886	C	fprB	Energy production and conversion 
+Rv0886	R	fprB	General function prediction only 
 Rv0887c	S	-	Function unknown 
 Rv0889c	C	citA	Energy production and conversion 
 Rv0890c	K	-	Transcription 
@@ -695,7 +684,6 @@ Rv0899	M	ompA	Cell wall/membrane/envelope biogenesis
 Rv0902c	T	prrB	Signal transduction mechanisms 
 Rv0903c	K	prrA	Transcription 
 Rv0904c	I	accD3	Lipid transport and metabolism 
-Rv0904c	I	accD3	Lipid transport and metabolism 
 Rv0905	I	echA6	Lipid transport and metabolism 
 Rv0906	R	-	General function prediction only 
 Rv0907	V	-	Defense mechanisms 
@@ -709,8 +697,8 @@ Rv0919	R	-	General function prediction only
 Rv0920c	L	-	Replication, recombination and repair 
 Rv0921	L	-	Replication, recombination and repair 
 Rv0922	L	-	Replication, recombination and repair 
-Rv0923c	S	-	Function unknown 
 Rv0923c	R	-	General function prediction only 
+Rv0923c	S	-	Function unknown 
 Rv0924c	P	mntH	Inorganic ion transport and metabolism 
 Rv0925c	R	-	General function prediction only 
 Rv0926c	S	-	Function unknown 
@@ -727,7 +715,6 @@ Rv0935	P	pstC1	Inorganic ion transport and metabolism
 Rv0936	P	pstA2	Inorganic ion transport and metabolism 
 Rv0937c	S	-	Function unknown 
 Rv0938	L	-	Replication, recombination and repair 
-Rv0938	L	-	Replication, recombination and repair 
 Rv0939	Q	-	Secondary metabolites biosynthesis, transport and catabolism 
 Rv0939	R	-	General function prediction only 
 Rv0940c	C	-	Energy production and conversion 
@@ -753,8 +740,6 @@ Rv0969	P	ctpV	Inorganic ion transport and metabolism
 Rv0971c	I	echA7	Lipid transport and metabolism 
 Rv0972c	I	fadE12	Lipid transport and metabolism 
 Rv0973c	I	accA2	Lipid transport and metabolism 
-Rv0973c	I	accA2	Lipid transport and metabolism 
-Rv0973c	I	accA2	Lipid transport and metabolism 
 Rv0974c	I	accD2	Lipid transport and metabolism 
 Rv0975c	I	fadE13	Lipid transport and metabolism 
 Rv0978c	S	PE_PGRS17	Function unknown 
@@ -897,8 +882,8 @@ Rv1166	E	lpqW	Amino acid transport and metabolism
 Rv1167c	K	-	Transcription 
 Rv1170	S	mshB	Function unknown 
 Rv1173	R	fbiC	General function prediction only 
-Rv1175c	R	fadH	General function prediction only 
 Rv1175c	C	fadH	Energy production and conversion 
+Rv1175c	R	fadH	General function prediction only 
 Rv1176c	K	-	Transcription 
 Rv1177	C	fdxC	Energy production and conversion 
 Rv1178	E	-	Amino acid transport and metabolism 
@@ -957,7 +942,6 @@ Rv1245c	R	-	General function prediction only
 Rv1246c	D	-	Cell cycle control, cell division, chromosome partitioning 
 Rv1247c	D	-	Cell cycle control, cell division, chromosome partitioning 
 Rv1248c	C	kgd	Energy production and conversion 
-Rv1248c	C	kgd	Energy production and conversion 
 Rv1250	R	-	General function prediction only 
 Rv1251c	L	-	Replication, recombination and repair 
 Rv1251c	R	-	General function prediction only 
@@ -974,7 +958,6 @@ Rv1263	J	amiB2	Translation, ribosomal structure and biogenesis
 Rv1264	T	-	Signal transduction mechanisms 
 Rv1266c	L	pknH	Replication, recombination and repair 
 Rv1267c	T	embR	Signal transduction mechanisms 
-Rv1267c	T	embR	Signal transduction mechanisms 
 Rv1272c	V	-	Defense mechanisms 
 Rv1273c	V	-	Defense mechanisms 
 Rv1276c	T	-	Signal transduction mechanisms 
@@ -989,10 +972,9 @@ Rv1283c	P	oppB	Inorganic ion transport and metabolism
 Rv1284	P	-	Inorganic ion transport and metabolism 
 Rv1285	H	cysD	Coenzyme transport and metabolism 
 Rv1286	P	cysN	Inorganic ion transport and metabolism 
-Rv1286	P	cysN	Inorganic ion transport and metabolism 
 Rv1287	K	-	Transcription 
-Rv1288	R	-	General function prediction only 
 Rv1288	M	-	Cell wall/membrane/envelope biogenesis 
+Rv1288	R	-	General function prediction only 
 Rv1290c	S	-	Function unknown 
 Rv1292	J	argS	Translation, ribosomal structure and biogenesis 
 Rv1293	E	lysA	Amino acid transport and metabolism 
@@ -1009,7 +991,6 @@ Rv1304	C	atpB	Energy production and conversion
 Rv1305	C	atpE	Energy production and conversion 
 Rv1306	C	atpF	Energy production and conversion 
 Rv1307	C	atpH	Energy production and conversion 
-Rv1307	C	atpH	Energy production and conversion 
 Rv1308	C	atpA	Energy production and conversion 
 Rv1309	C	atpG	Energy production and conversion 
 Rv1310	C	atpD	Energy production and conversion 
@@ -1018,8 +999,8 @@ Rv1313c	L	-	Replication, recombination and repair
 Rv1314c	S	-	Function unknown 
 Rv1315	M	murA	Cell wall/membrane/envelope biogenesis 
 Rv1316c	L	ogt	Replication, recombination and repair 
-Rv1317c	L	alkA	Replication, recombination and repair 
 Rv1317c	F	alkA	Nucleotide transport and metabolism 
+Rv1317c	L	alkA	Replication, recombination and repair 
 Rv1318c	T	-	Signal transduction mechanisms 
 Rv1319c	T	-	Signal transduction mechanisms 
 Rv1320c	T	-	Signal transduction mechanisms 
@@ -1047,23 +1028,21 @@ Rv1344	Q	-	Secondary metabolites biosynthesis, transport and catabolism
 Rv1345	Q	fadD33	Secondary metabolites biosynthesis, transport and catabolism 
 Rv1346	I	fadE14	Lipid transport and metabolism 
 Rv1347c	J	-	Translation, ribosomal structure and biogenesis 
-Rv1348	V	-	Defense mechanisms 
 Rv1348	P	-	Inorganic ion transport and metabolism 
+Rv1348	V	-	Defense mechanisms 
 Rv1349	V	-	Defense mechanisms 
 Rv1350	R	fabG	General function prediction only 
 Rv1353c	K	-	Transcription 
 Rv1354c	T	-	Signal transduction mechanisms 
-Rv1354c	T	-	Signal transduction mechanisms 
 Rv1355c	H	moeY	Coenzyme transport and metabolism 
 Rv1357c	T	-	Signal transduction mechanisms 
-Rv1358	T	-	Signal transduction mechanisms 
 Rv1358	K	-	Transcription 
 Rv1358	R	-	General function prediction only 
+Rv1358	T	-	Signal transduction mechanisms 
 Rv1359	T	-	Signal transduction mechanisms 
 Rv1360	C	-	Energy production and conversion 
-Rv1364c	T	-	Signal transduction mechanisms 
-Rv1364c	T	-	Signal transduction mechanisms 
 Rv1364c	K	-	Transcription 
+Rv1364c	T	-	Signal transduction mechanisms 
 Rv1365c	T	rsfA	Signal transduction mechanisms 
 Rv1366	S	-	Function unknown 
 Rv1367c	V	-	Defense mechanisms 
@@ -1099,13 +1078,11 @@ Rv1406	J	fmt	Translation, ribosomal structure and biogenesis
 Rv1407	J	fmu	Translation, ribosomal structure and biogenesis 
 Rv1408	G	rpe	Carbohydrate transport and metabolism 
 Rv1409	H	ribG	Coenzyme transport and metabolism 
-Rv1409	H	ribG	Coenzyme transport and metabolism 
 Rv1410c	R	-	General function prediction only 
 Rv1412	H	ribC	Coenzyme transport and metabolism 
 Rv1413	E	-	Amino acid transport and metabolism 
 Rv1414	E	-	Amino acid transport and metabolism 
 Rv1415	H	ribA2	Coenzyme transport and metabolism 
-Rv1415	H	ribA2	Coenzyme transport and metabolism 
 Rv1416	H	ribH	Coenzyme transport and metabolism 
 Rv1420	L	uvrC	Replication, recombination and repair 
 Rv1421	R	-	General function prediction only 
@@ -1136,8 +1113,8 @@ Rv1456c	O	-	Posttranslational modification, protein turnover, chaperones
 Rv1457c	V	-	Defense mechanisms 
 Rv1458c	V	-	Defense mechanisms 
 Rv1460	K	-	Transcription 
-Rv1461	O	-	Posttranslational modification, protein turnover, chaperones 
 Rv1461	L	-	Replication, recombination and repair 
+Rv1461	O	-	Posttranslational modification, protein turnover, chaperones 
 Rv1462	O	-	Posttranslational modification, protein turnover, chaperones 
 Rv1463	O	-	Posttranslational modification, protein turnover, chaperones 
 Rv1464	E	csd	Amino acid transport and metabolism 
@@ -1164,14 +1141,12 @@ Rv1488	O	-	Posttranslational modification, protein turnover, chaperones
 Rv1489A	I	-	Lipid transport and metabolism 
 Rv1491c	S	-	Function unknown 
 Rv1492	I	mutA	Lipid transport and metabolism 
-Rv1492	I	mutA	Lipid transport and metabolism 
-Rv1493	I	mutB	Lipid transport and metabolism 
 Rv1493	I	mutB	Lipid transport and metabolism 
 Rv1495	T	-	Signal transduction mechanisms 
 Rv1496	E	-	Amino acid transport and metabolism 
 Rv1497	V	lipL	Defense mechanisms 
-Rv1498c	R	-	General function prediction only 
 Rv1498A	S	-	Function unknown 
+Rv1498c	R	-	General function prediction only 
 Rv1500	M	-	Cell wall/membrane/envelope biogenesis 
 Rv1501	Q	-	Secondary metabolites biosynthesis, transport and catabolism 
 Rv1502	G	-	Carbohydrate transport and metabolism 
@@ -1272,7 +1247,6 @@ Rv1626	T	-	Signal transduction mechanisms
 Rv1627c	I	-	Lipid transport and metabolism 
 Rv1628c	R	-	General function prediction only 
 Rv1629	L	polA	Replication, recombination and repair 
-Rv1629	L	polA	Replication, recombination and repair 
 Rv1630	J	rpsA	Translation, ribosomal structure and biogenesis 
 Rv1631	H	coaE	Coenzyme transport and metabolism 
 Rv1631	S	coaE	Function unknown 
@@ -1315,16 +1289,16 @@ Rv1668c	R	-	General function prediction only
 Rv1670	S	-	Function unknown 
 Rv1672c	R	-	General function prediction only 
 Rv1673c	E	-	Amino acid transport and metabolism 
-Rv1674c	P	-	Inorganic ion transport and metabolism 
 Rv1674c	K	-	Transcription 
+Rv1674c	P	-	Inorganic ion transport and metabolism 
 Rv1675c	T	-	Signal transduction mechanisms 
 Rv1676	E	-	Amino acid transport and metabolism 
 Rv1677	C	dsbF	Energy production and conversion 
 Rv1679	I	fadE16	Lipid transport and metabolism 
 Rv1680	P	-	Inorganic ion transport and metabolism 
 Rv1681	R	moeX	General function prediction only 
-Rv1683	Q	-	Secondary metabolites biosynthesis, transport and catabolism 
 Rv1683	I	-	Lipid transport and metabolism 
+Rv1683	Q	-	Secondary metabolites biosynthesis, transport and catabolism 
 Rv1684	S	-	Function unknown 
 Rv1685c	K	-	Transcription 
 Rv1686c	V	-	Defense mechanisms 
@@ -1370,8 +1344,6 @@ Rv1732c	C	-	Energy production and conversion
 Rv1734c	C	-	Energy production and conversion 
 Rv1735c	P	-	Inorganic ion transport and metabolism 
 Rv1736c	C	narX	Energy production and conversion 
-Rv1736c	C	narX	Energy production and conversion 
-Rv1736c	C	narX	Energy production and conversion 
 Rv1737c	P	narK2	Inorganic ion transport and metabolism 
 Rv1739c	P	-	Inorganic ion transport and metabolism 
 Rv1740	S	-	Function unknown 
@@ -1389,8 +1361,8 @@ Rv1757c	L	-	Replication, recombination and repair
 Rv1762c	S	-	Function unknown 
 Rv1763	L	-	Replication, recombination and repair 
 Rv1764	L	-	Replication, recombination and repair 
-Rv1765c	K	-	Transcription 
 Rv1765A	L	-	Replication, recombination and repair 
+Rv1765c	K	-	Transcription 
 Rv1766	S	-	Function unknown 
 Rv1767	S	-	Function unknown 
 Rv1769	E	-	Amino acid transport and metabolism 
@@ -1426,7 +1398,6 @@ Rv1828	K	-	Transcription
 Rv1829	S	-	Function unknown 
 Rv1830	K	-	Transcription 
 Rv1832	E	gcvB	Amino acid transport and metabolism 
-Rv1832	E	gcvB	Amino acid transport and metabolism 
 Rv1833c	R	-	General function prediction only 
 Rv1834	R	-	General function prediction only 
 Rv1835c	R	-	General function prediction only 
@@ -1450,8 +1421,8 @@ Rv1855c	C	-	Energy production and conversion
 Rv1856c	R	-	General function prediction only 
 Rv1857	P	modA	Inorganic ion transport and metabolism 
 Rv1858	P	modB	Inorganic ion transport and metabolism 
-Rv1859	P	modC	Inorganic ion transport and metabolism 
 Rv1859	G	modC	Carbohydrate transport and metabolism 
+Rv1859	P	modC	Inorganic ion transport and metabolism 
 Rv1861	S	-	Function unknown 
 Rv1862	R	adhA	General function prediction only 
 Rv1863c	R	-	General function prediction only 
@@ -1483,7 +1454,6 @@ Rv1899c	R	lppD	General function prediction only
 Rv1900c	R	lipJ	General function prediction only 
 Rv1900c	T	lipJ	Signal transduction mechanisms 
 Rv1901	R	cinA	General function prediction only 
-Rv1901	R	cinA	General function prediction only 
 Rv1902c	R	nanT	General function prediction only 
 Rv1903	S	-	Function unknown 
 Rv1904	T	-	Signal transduction mechanisms 
@@ -1505,18 +1475,15 @@ Rv1927	S	-	Function unknown
 Rv1928c	R	-	General function prediction only 
 Rv1930c	R	-	General function prediction only 
 Rv1931c	K	-	Transcription 
-Rv1931c	K	-	Transcription 
 Rv1932	O	tpx	Posttranslational modification, protein turnover, chaperones 
 Rv1933c	I	fadE18	Lipid transport and metabolism 
 Rv1934c	I	fadE17	Lipid transport and metabolism 
 Rv1935c	I	echA13	Lipid transport and metabolism 
 Rv1936	C	-	Energy production and conversion 
 Rv1937	C	-	Energy production and conversion 
-Rv1937	C	-	Energy production and conversion 
 Rv1938	R	ephB	General function prediction only 
 Rv1939	R	-	General function prediction only 
 Rv1940	H	ribA1	Coenzyme transport and metabolism 
-Rv1940	H	ribA1	Coenzyme transport and metabolism 
 Rv1941	R	-	General function prediction only 
 Rv1942c	T	-	Signal transduction mechanisms 
 Rv1944c	U	-	Intracellular trafficking, secretion, and vesicular transport 
@@ -1544,8 +1511,8 @@ Rv1985c	K	-	Transcription
 Rv1986	R	-	General function prediction only 
 Rv1988	J	-	Translation, ribosomal structure and biogenesis 
 Rv1989c	S	-	Function unknown 
-Rv1990c	K	-	Transcription 
 Rv1990A	R	-	General function prediction only 
+Rv1990c	K	-	Transcription 
 Rv1991c	T	-	Signal transduction mechanisms 
 Rv1992c	P	ctpG	Inorganic ion transport and metabolism 
 Rv1994c	K	-	Transcription 
@@ -1554,28 +1521,27 @@ Rv1996	T	-	Signal transduction mechanisms
 Rv1997	P	ctpF	Inorganic ion transport and metabolism 
 Rv1998c	G	-	Carbohydrate transport and metabolism 
 Rv1999c	E	-	Amino acid transport and metabolism 
-Rv2000	R	-	General function prediction only 
 Rv2000	E	-	Amino acid transport and metabolism 
+Rv2000	R	-	General function prediction only 
 Rv2001	I	-	Lipid transport and metabolism 
 Rv2002	R	fabG3	General function prediction only 
 Rv2003c	R	-	General function prediction only 
 Rv2004c	R	-	General function prediction only 
 Rv2004c	S	-	Function unknown 
 Rv2005c	T	-	Signal transduction mechanisms 
-Rv2006	R	otsB1	General function prediction only 
-Rv2006	G	otsB1	Carbohydrate transport and metabolism 
 Rv2006	G	otsB1	Carbohydrate transport and metabolism 
+Rv2006	R	otsB1	General function prediction only 
 Rv2007c	C	fdxA	Energy production and conversion 
-Rv2008c	R	-	General function prediction only 
 Rv2008c	J	-	Translation, ribosomal structure and biogenesis 
+Rv2008c	R	-	General function prediction only 
 Rv2009	K	-	Transcription 
 Rv2010	R	-	General function prediction only 
 Rv2011c	K	-	Transcription 
 Rv2012	R	-	General function prediction only 
 Rv2013	L	-	Replication, recombination and repair 
 Rv2014	L	-	Replication, recombination and repair 
-Rv2017	K	-	Transcription 
 Rv2017	E	-	Amino acid transport and metabolism 
+Rv2017	K	-	Transcription 
 Rv2018	S	-	Function unknown 
 Rv2020c	R	-	General function prediction only 
 Rv2021c	K	-	Transcription 
@@ -1584,11 +1550,9 @@ Rv2024c	R	-	General function prediction only
 Rv2025c	P	-	Inorganic ion transport and metabolism 
 Rv2026c	T	-	Signal transduction mechanisms 
 Rv2027c	T	-	Signal transduction mechanisms 
-Rv2027c	T	-	Signal transduction mechanisms 
 Rv2028c	T	-	Signal transduction mechanisms 
 Rv2029c	G	pfkB	Carbohydrate transport and metabolism 
 Rv2030c	R	-	General function prediction only 
-Rv2030c	R	-	General function prediction only 
 Rv2031c	O	hspX	Posttranslational modification, protein turnover, chaperones 
 Rv2032	C	acg	Energy production and conversion 
 Rv2034	K	-	Transcription 
@@ -1601,10 +1565,8 @@ Rv2043c	Q	pncA	Secondary metabolites biosynthesis, transport and catabolism
 Rv2045c	I	lipT	Lipid transport and metabolism 
 Rv2046	Q	lppI	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2047c	G	-	Carbohydrate transport and metabolism 
-Rv2047c	G	-	Carbohydrate transport and metabolism 
 Rv2048c	Q	pks12	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2051c	M	ppm1	Cell wall/membrane/envelope biogenesis 
-Rv2051c	M	ppm1	Cell wall/membrane/envelope biogenesis 
 Rv2052c	R	-	General function prediction only 
 Rv2054	Q	-	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2055c	J	rpsR	Translation, ribosomal structure and biogenesis 
@@ -1618,14 +1580,12 @@ Rv2062c	H	cobN	Coenzyme transport and metabolism
 Rv2064	P	cobG	Inorganic ion transport and metabolism 
 Rv2065	H	cobH	Coenzyme transport and metabolism 
 Rv2066	H	cobI	Coenzyme transport and metabolism 
-Rv2066	H	cobI	Coenzyme transport and metabolism 
 Rv2067c	R	-	General function prediction only 
 Rv2068c	V	blaC	Defense mechanisms 
 Rv2069	K	sigC	Transcription 
 Rv2070c	H	cobK	Coenzyme transport and metabolism 
 Rv2071c	H	cobM	Coenzyme transport and metabolism 
 Rv2072c	H	cobL	Coenzyme transport and metabolism 
-Rv2072c	H	cobL	Coenzyme transport and metabolism 
 Rv2073c	R	-	General function prediction only 
 Rv2086	L	-	Replication, recombination and repair 
 Rv2087	L	-	Replication, recombination and repair 
@@ -1640,7 +1600,6 @@ Rv2096c	K	-	Transcription
 Rv2101	L	helZ	Replication, recombination and repair 
 Rv2102	S	-	Function unknown 
 Rv2103c	R	-	General function prediction only 
-Rv2103c	R	-	General function prediction only 
 Rv2105	L	-	Replication, recombination and repair 
 Rv2106	L	-	Replication, recombination and repair 
 Rv2109c	O	prcA	Posttranslational modification, protein turnover, chaperones 
@@ -1653,7 +1612,6 @@ Rv2120c	S	-	Function unknown
 Rv2121c	E	hisG	Amino acid transport and metabolism 
 Rv2122c	E	hisE	Amino acid transport and metabolism 
 Rv2124c	E	metH	Amino acid transport and metabolism 
-Rv2124c	E	metH	Amino acid transport and metabolism 
 Rv2125	R	-	General function prediction only 
 Rv2127	E	ansP1	Amino acid transport and metabolism 
 Rv2129c	R	-	General function prediction only 
@@ -1697,7 +1655,6 @@ Rv2187	I	fadD15	Lipid transport and metabolism
 Rv2188c	M	-	Cell wall/membrane/envelope biogenesis 
 Rv2190c	M	-	Cell wall/membrane/envelope biogenesis 
 Rv2191	L	-	Replication, recombination and repair 
-Rv2191	L	-	Replication, recombination and repair 
 Rv2192c	E	trpD	Amino acid transport and metabolism 
 Rv2193	C	ctaE	Energy production and conversion 
 Rv2194	C	qcrC	Energy production and conversion 
@@ -1715,7 +1672,6 @@ Rv2211c	E	gcvT	Amino acid transport and metabolism
 Rv2212	T	-	Signal transduction mechanisms 
 Rv2213	E	pepB	Amino acid transport and metabolism 
 Rv2214c	R	ephD	General function prediction only 
-Rv2214c	R	ephD	General function prediction only 
 Rv2215	C	dlaT	Energy production and conversion 
 Rv2216	R	-	General function prediction only 
 Rv2217	H	lipB	Coenzyme transport and metabolism 
@@ -1728,11 +1684,10 @@ Rv2224c	R	-	General function prediction only
 Rv2225	H	panB	Coenzyme transport and metabolism 
 Rv2226	S	-	Function unknown 
 Rv2227	S	-	Function unknown 
-Rv2228c	L	-	Replication, recombination and repair 
 Rv2228c	G	-	Carbohydrate transport and metabolism 
+Rv2228c	L	-	Replication, recombination and repair 
 Rv2229c	R	-	General function prediction only 
 Rv2230c	S	-	Function unknown 
-Rv2230c	S	-	Function unknown 
 Rv2231c	E	cobC	Amino acid transport and metabolism 
 Rv2232	R	-	General function prediction only 
 Rv2234	T	ptpA	Signal transduction mechanisms 
@@ -1748,8 +1703,8 @@ Rv2245	Q	kasA	Secondary metabolites biosynthesis, transport and catabolism
 Rv2246	Q	kasB	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2247	I	accD6	Lipid transport and metabolism 
 Rv2249c	C	glpD1	Energy production and conversion 
-Rv2250c	K	-	Transcription 
 Rv2250A	C	-	Energy production and conversion 
+Rv2250c	K	-	Transcription 
 Rv2251	C	-	Energy production and conversion 
 Rv2252	R	-	General function prediction only 
 Rv2257c	V	-	Defense mechanisms 
@@ -1847,7 +1802,6 @@ Rv2380c	Q	mbtE	Secondary metabolites biosynthesis, transport and catabolism
 Rv2381c	Q	mbtD	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2382c	Q	mbtC	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2383c	Q	mbtB	Secondary metabolites biosynthesis, transport and catabolism 
-Rv2383c	Q	mbtB	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2384	Q	mbtA	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2385	I	mbtJ	Lipid transport and metabolism 
 Rv2386c	H	mbtI	Coenzyme transport and metabolism 
@@ -1885,9 +1839,8 @@ Rv2426c	R	-	General function prediction only
 Rv2427c	E	proA	Amino acid transport and metabolism 
 Rv2428	O	ahpC	Posttranslational modification, protein turnover, chaperones 
 Rv2429	S	ahpD	Function unknown 
-Rv2434c	T	-	Signal transduction mechanisms 
 Rv2434c	M	-	Cell wall/membrane/envelope biogenesis 
-Rv2435c	T	-	Signal transduction mechanisms 
+Rv2434c	T	-	Signal transduction mechanisms 
 Rv2435c	T	-	Signal transduction mechanisms 
 Rv2436	G	rbsK	Carbohydrate transport and metabolism 
 Rv2437	O	-	Posttranslational modification, protein turnover, chaperones 
@@ -1906,7 +1859,6 @@ Rv2449c	S	-	Function unknown
 Rv2453c	H	mobA	Coenzyme transport and metabolism 
 Rv2454c	C	-	Energy production and conversion 
 Rv2455c	C	-	Energy production and conversion 
-Rv2455c	C	-	Energy production and conversion 
 Rv2456c	R	-	General function prediction only 
 Rv2457c	O	clpX	Posttranslational modification, protein turnover, chaperones 
 Rv2458	E	mmuM	Amino acid transport and metabolism 
@@ -1929,13 +1881,13 @@ Rv2478c	L	-	Replication, recombination and repair
 Rv2479c	L	-	Replication, recombination and repair 
 Rv2480c	L	-	Replication, recombination and repair 
 Rv2482c	I	plsB2	Lipid transport and metabolism 
-Rv2483c	I	plsC	Lipid transport and metabolism 
 Rv2483c	E	plsC	Amino acid transport and metabolism 
+Rv2483c	I	plsC	Lipid transport and metabolism 
 Rv2485c	I	lipQ	Lipid transport and metabolism 
 Rv2486	I	echA14	Lipid transport and metabolism 
-Rv2488c	T	-	Signal transduction mechanisms 
 Rv2488c	K	-	Transcription 
 Rv2488c	R	-	General function prediction only 
+Rv2488c	T	-	Signal transduction mechanisms 
 Rv2491	R	-	General function prediction only 
 Rv2492	F	-	Nucleotide transport and metabolism 
 Rv2494	R	-	General function prediction only 
@@ -1956,17 +1908,14 @@ Rv2509	R	-	General function prediction only
 Rv2510c	R	-	General function prediction only 
 Rv2511	A	orn	RNA processing and modification 
 Rv2512c	L	-	Replication, recombination and repair 
-Rv2515c	K	-	Transcription 
 Rv2515c	E	-	Amino acid transport and metabolism 
+Rv2515c	K	-	Transcription 
 Rv2518c	S	lppS	Function unknown 
 Rv2521	O	bcp	Posttranslational modification, protein turnover, chaperones 
 Rv2522c	E	-	Amino acid transport and metabolism 
 Rv2523c	I	acpS	Lipid transport and metabolism 
-Rv2524c	Q	fas	Secondary metabolites biosynthesis, transport and catabolism 
-Rv2524c	I	fas	Lipid transport and metabolism 
-Rv2524c	I	fas	Lipid transport and metabolism 
-Rv2524c	I	fas	Lipid transport and metabolism 
 Rv2524c	I	fas	Lipid transport and metabolism 
+Rv2524c	Q	fas	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2527	R	-	General function prediction only 
 Rv2528c	V	mrr	Defense mechanisms 
 Rv2529	L	-	Replication, recombination and repair 
@@ -1993,12 +1942,11 @@ Rv2559c	L	-	Replication, recombination and repair
 Rv2561	E	-	Amino acid transport and metabolism 
 Rv2563	V	-	Defense mechanisms 
 Rv2564	V	glnQ	Defense mechanisms 
-Rv2565	T	-	Signal transduction mechanisms 
 Rv2565	R	-	General function prediction only 
+Rv2565	T	-	Signal transduction mechanisms 
 Rv2566	E	-	Amino acid transport and metabolism 
 Rv2566	S	-	Function unknown 
 Rv2567	S	-	Function unknown 
-Rv2567	S	-	Function unknown 
 Rv2568c	S	-	Function unknown 
 Rv2569c	E	-	Amino acid transport and metabolism 
 Rv2570	S	-	Function unknown 
@@ -2006,8 +1954,8 @@ Rv2571c	S	-	Function unknown
 Rv2572c	J	aspS	Translation, ribosomal structure and biogenesis 
 Rv2573	H	-	Coenzyme transport and metabolism 
 Rv2575	R	-	General function prediction only 
-Rv2577	R	-	General function prediction only 
 Rv2577	P	-	Inorganic ion transport and metabolism 
+Rv2577	R	-	General function prediction only 
 Rv2578c	L	-	Replication, recombination and repair 
 Rv2579	R	dhaA	General function prediction only 
 Rv2580c	J	hisS	Translation, ribosomal structure and biogenesis 
@@ -2038,8 +1986,8 @@ Rv2610c	M	pimA	Cell wall/membrane/envelope biogenesis
 Rv2611c	M	-	Cell wall/membrane/envelope biogenesis 
 Rv2612c	I	pgsA1	Lipid transport and metabolism 
 Rv2613c	R	-	General function prediction only 
-Rv2614c	J	thrS	Translation, ribosomal structure and biogenesis 
 Rv2614A	G	-	Carbohydrate transport and metabolism 
+Rv2614c	J	thrS	Translation, ribosomal structure and biogenesis 
 Rv2616	S	-	Function unknown 
 Rv2618	K	-	Transcription 
 Rv2619c	S	-	Function unknown 
@@ -2048,10 +1996,9 @@ Rv2622	R	-	General function prediction only
 Rv2623	T	TB31.7	Signal transduction mechanisms 
 Rv2624c	T	-	Signal transduction mechanisms 
 Rv2625c	R	-	General function prediction only 
-Rv2625c	R	-	General function prediction only 
 Rv2626c	R	-	General function prediction only 
-Rv2629	L	-	Replication, recombination and repair 
 Rv2629	J	-	Translation, ribosomal structure and biogenesis 
+Rv2629	L	-	Replication, recombination and repair 
 Rv2630	S	-	Function unknown 
 Rv2631	S	-	Function unknown 
 Rv2636	V	-	Defense mechanisms 
@@ -2061,8 +2008,8 @@ Rv2639c	S	-	Function unknown
 Rv2640c	K	-	Transcription 
 Rv2641	E	cadI	Amino acid transport and metabolism 
 Rv2642	K	-	Transcription 
-Rv2643	T	arsC	Signal transduction mechanisms 
 Rv2643	P	arsC	Inorganic ion transport and metabolism 
+Rv2643	T	arsC	Signal transduction mechanisms 
 Rv2646	L	-	Replication, recombination and repair 
 Rv2648	L	-	Replication, recombination and repair 
 Rv2649	L	-	Replication, recombination and repair 
@@ -2120,7 +2067,6 @@ Rv2733c	J	-	Translation, ribosomal structure and biogenesis
 Rv2734	S	-	Function unknown 
 Rv2736c	R	recX	General function prediction only 
 Rv2737c	L	recA	Replication, recombination and repair 
-Rv2737c	L	recA	Replication, recombination and repair 
 Rv2739c	C	-	Energy production and conversion 
 Rv2740	Q	-	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2744c	T	35kd_ag	Signal transduction mechanisms 
@@ -2202,11 +2148,9 @@ Rv2842c	S	-	Function unknown
 Rv2845c	J	proS	Translation, ribosomal structure and biogenesis 
 Rv2846c	R	efpA	General function prediction only 
 Rv2847c	H	cysG	Coenzyme transport and metabolism 
-Rv2847c	H	cysG	Coenzyme transport and metabolism 
 Rv2848c	H	cobB	Coenzyme transport and metabolism 
 Rv2849c	H	cobO	Coenzyme transport and metabolism 
 Rv2850c	H	-	Coenzyme transport and metabolism 
-Rv2850c	H	-	Coenzyme transport and metabolism 
 Rv2851c	R	-	General function prediction only 
 Rv2852c	R	mqo	General function prediction only 
 Rv2854	I	-	Lipid transport and metabolism 
@@ -2269,8 +2213,8 @@ Rv2918c	O	glnD	Posttranslational modification, protein turnover, chaperones
 Rv2919c	E	glnB	Amino acid transport and metabolism 
 Rv2920c	P	amt	Inorganic ion transport and metabolism 
 Rv2921c	U	ftsY	Intracellular trafficking, secretion, and vesicular transport 
-Rv2922c	D	smc	Cell cycle control, cell division, chromosome partitioning 
 Rv2922A	C	acyP	Energy production and conversion 
+Rv2922c	D	smc	Cell cycle control, cell division, chromosome partitioning 
 Rv2923c	O	-	Posttranslational modification, protein turnover, chaperones 
 Rv2924c	L	fpg	Replication, recombination and repair 
 Rv2925c	K	rnc	Transcription 
@@ -2283,7 +2227,6 @@ Rv2932	Q	ppsB	Secondary metabolites biosynthesis, transport and catabolism
 Rv2933	Q	ppsC	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2934	Q	ppsD	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2935	Q	ppsE	Secondary metabolites biosynthesis, transport and catabolism 
-Rv2935	Q	ppsE	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2936	V	drrA	Defense mechanisms 
 Rv2937	V	drrB	Defense mechanisms 
 Rv2938	V	drrC	Defense mechanisms 
@@ -2297,7 +2240,6 @@ Rv2944	L	-	Replication, recombination and repair
 Rv2946c	Q	pks1	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2947c	Q	pks15	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2948c	Q	fadD22	Secondary metabolites biosynthesis, transport and catabolism 
-Rv2948c	Q	fadD22	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2949c	H	-	Coenzyme transport and metabolism 
 Rv2950c	Q	fadD29	Secondary metabolites biosynthesis, transport and catabolism 
 Rv2951c	C	-	Energy production and conversion 
@@ -2318,8 +2260,8 @@ Rv2966c	L	-	Replication, recombination and repair
 Rv2967c	C	pca	Energy production and conversion 
 Rv2968c	S	-	Function unknown 
 Rv2969c	O	-	Posttranslational modification, protein turnover, chaperones 
-Rv2970c	I	lipN	Lipid transport and metabolism 
 Rv2970A	R	-	General function prediction only 
+Rv2970c	I	lipN	Lipid transport and metabolism 
 Rv2971	R	-	General function prediction only 
 Rv2973c	K	recG	Transcription 
 Rv2974c	R	-	General function prediction only 
@@ -2414,8 +2356,8 @@ Rv3074	V	-	Defense mechanisms
 Rv3075c	G	-	Carbohydrate transport and metabolism 
 Rv3077	P	-	Inorganic ion transport and metabolism 
 Rv3079c	C	-	Energy production and conversion 
-Rv3080c	L	pknK	Replication, recombination and repair 
 Rv3080c	K	pknK	Transcription 
+Rv3080c	L	pknK	Replication, recombination and repair 
 Rv3082c	K	virS	Transcription 
 Rv3083	P	-	Inorganic ion transport and metabolism 
 Rv3084	I	lipR	Lipid transport and metabolism 
@@ -2451,7 +2393,6 @@ Rv3120	R	-	General function prediction only
 Rv3121	Q	cyp141	Secondary metabolites biosynthesis, transport and catabolism 
 Rv3124	T	-	Signal transduction mechanisms 
 Rv3132c	T	devS	Signal transduction mechanisms 
-Rv3132c	T	devS	Signal transduction mechanisms 
 Rv3133c	K	devR	Transcription 
 Rv3134c	T	-	Signal transduction mechanisms 
 Rv3137	G	-	Carbohydrate transport and metabolism 
@@ -2489,8 +2430,8 @@ Rv3176c	R	mesT	General function prediction only
 Rv3177	R	-	General function prediction only 
 Rv3179	R	-	General function prediction only 
 Rv3180c	R	-	General function prediction only 
-Rv3181c	K	-	Transcription 
 Rv3181c	D	-	Cell cycle control, cell division, chromosome partitioning 
+Rv3181c	K	-	Transcription 
 Rv3182	S	-	Function unknown 
 Rv3183	K	-	Transcription 
 Rv3184	L	-	Replication, recombination and repair 
@@ -2503,13 +2444,11 @@ Rv3193c	S	-	Function unknown
 Rv3194c	T	-	Signal transduction mechanisms 
 Rv3195	S	-	Function unknown 
 Rv3197	R	-	General function prediction only 
-Rv3198c	L	uvrD2	Replication, recombination and repair 
 Rv3198A	O	-	Posttranslational modification, protein turnover, chaperones 
+Rv3198c	L	uvrD2	Replication, recombination and repair 
 Rv3199c	L	nudC	Replication, recombination and repair 
 Rv3200c	P	-	Inorganic ion transport and metabolism 
 Rv3201c	L	-	Replication, recombination and repair 
-Rv3201c	L	-	Replication, recombination and repair 
-Rv3202c	L	-	Replication, recombination and repair 
 Rv3202c	L	-	Replication, recombination and repair 
 Rv3203	R	lipV	General function prediction only 
 Rv3204	L	-	Replication, recombination and repair 
@@ -2523,8 +2462,8 @@ Rv3215	Q	entC	Secondary metabolites biosynthesis, transport and catabolism
 Rv3216	R	-	General function prediction only 
 Rv3218	R	-	General function prediction only 
 Rv3220c	T	-	Signal transduction mechanisms 
-Rv3221c	I	TB7.3	Lipid transport and metabolism 
 Rv3221A	P	-	Inorganic ion transport and metabolism 
+Rv3221c	I	TB7.3	Lipid transport and metabolism 
 Rv3223c	K	sigH	Transcription 
 Rv3224	R	-	General function prediction only 
 Rv3224B	S	-	Function unknown 
@@ -2541,7 +2480,6 @@ Rv3237c	P	-	Inorganic ion transport and metabolism
 Rv3238c	O	-	Posttranslational modification, protein turnover, chaperones 
 Rv3239c	R	-	General function prediction only 
 Rv3239c	T	-	Signal transduction mechanisms 
-Rv3239c	R	-	General function prediction only 
 Rv3240c	U	secA1	Intracellular trafficking, secretion, and vesicular transport 
 Rv3241c	J	-	Translation, ribosomal structure and biogenesis 
 Rv3242c	R	-	General function prediction only 
@@ -2567,7 +2505,6 @@ Rv3267	K	-	Transcription
 Rv3270	P	ctpC	Inorganic ion transport and metabolism 
 Rv3272	C	-	Energy production and conversion 
 Rv3273	P	-	Inorganic ion transport and metabolism 
-Rv3273	P	-	Inorganic ion transport and metabolism 
 Rv3274c	I	fadE25	Lipid transport and metabolism 
 Rv3275c	F	purE	Nucleotide transport and metabolism 
 Rv3276c	F	purK	Nucleotide transport and metabolism 
@@ -2610,12 +2547,11 @@ Rv3319	C	sdhB	Energy production and conversion
 Rv3320c	R	-	General function prediction only 
 Rv3322c	R	-	General function prediction only 
 Rv3323c	H	moaX	Coenzyme transport and metabolism 
-Rv3323c	H	moaX	Coenzyme transport and metabolism 
 Rv3324c	H	moaC	Coenzyme transport and metabolism 
 Rv3325	L	-	Replication, recombination and repair 
 Rv3326	L	-	Replication, recombination and repair 
-Rv3327	S	-	Function unknown 
 Rv3327	L	-	Replication, recombination and repair 
+Rv3327	S	-	Function unknown 
 Rv3328c	K	sigJ	Transcription 
 Rv3329	H	-	Coenzyme transport and metabolism 
 Rv3330	M	dacB1	Cell wall/membrane/envelope biogenesis 
@@ -2646,8 +2582,8 @@ Rv3366	J	spoU	Translation, ribosomal structure and biogenesis
 Rv3368c	C	-	Energy production and conversion 
 Rv3369	H	-	Coenzyme transport and metabolism 
 Rv3370c	L	dnaE2	Replication, recombination and repair 
-Rv3372	R	otsB2	General function prediction only 
 Rv3372	G	otsB2	Carbohydrate transport and metabolism 
+Rv3372	R	otsB2	General function prediction only 
 Rv3373	I	echA18	Lipid transport and metabolism 
 Rv3374	I	echA18.1	Lipid transport and metabolism 
 Rv3375	J	amiD	Translation, ribosomal structure and biogenesis 
@@ -2662,13 +2598,12 @@ Rv3386	L	-	Replication, recombination and repair
 Rv3387	L	-	Replication, recombination and repair 
 Rv3389c	I	-	Lipid transport and metabolism 
 Rv3390	G	lpqD	Carbohydrate transport and metabolism 
-Rv3391	R	acrA1	General function prediction only 
 Rv3391	Q	acrA1	Secondary metabolites biosynthesis, transport and catabolism 
+Rv3391	R	acrA1	General function prediction only 
 Rv3392c	M	cmaA1	Cell wall/membrane/envelope biogenesis 
 Rv3393	F	iunH	Nucleotide transport and metabolism 
 Rv3394c	L	-	Replication, recombination and repair 
 Rv3396c	F	guaA	Nucleotide transport and metabolism 
-Rv3396c	F	guaA	Nucleotide transport and metabolism 
 Rv3397c	I	phyA	Lipid transport and metabolism 
 Rv3398c	H	idsA1	Coenzyme transport and metabolism 
 Rv3399	Q	-	Secondary metabolites biosynthesis, transport and catabolism 
@@ -2698,8 +2633,8 @@ Rv3428c	L	-	Replication, recombination and repair
 Rv3430c	L	-	Replication, recombination and repair 
 Rv3431c	L	-	Replication, recombination and repair 
 Rv3432c	E	gadB	Amino acid transport and metabolism 
-Rv3433c	S	-	Function unknown 
 Rv3433c	G	-	Carbohydrate transport and metabolism 
+Rv3433c	S	-	Function unknown 
 Rv3436c	M	glmS	Cell wall/membrane/envelope biogenesis 
 Rv3441c	G	mrsA	Carbohydrate transport and metabolism 
 Rv3442c	J	rpsI	Translation, ribosomal structure and biogenesis 
@@ -2901,8 +2836,6 @@ Rv3725	G	-	Carbohydrate transport and metabolism
 Rv3726	R	-	General function prediction only 
 Rv3727	S	-	Function unknown 
 Rv3728	R	-	General function prediction only 
-Rv3728	R	-	General function prediction only 
-Rv3729	R	-	General function prediction only 
 Rv3729	R	-	General function prediction only 
 Rv3730c	L	-	Replication, recombination and repair 
 Rv3731	L	ligC	Replication, recombination and repair 
@@ -2911,7 +2844,6 @@ Rv3733c	R	-	General function prediction only
 Rv3735	S	-	Function unknown 
 Rv3736	K	-	Transcription 
 Rv3737	S	-	Function unknown 
-Rv3737	S	-	Function unknown 
 Rv3741c	P	-	Inorganic ion transport and metabolism 
 Rv3742c	P	-	Inorganic ion transport and metabolism 
 Rv3743c	P	ctpJ	Inorganic ion transport and metabolism 
@@ -2941,7 +2873,6 @@ Rv3782	R	-	General function prediction only
 Rv3783	M	rfbD	Cell wall/membrane/envelope biogenesis 
 Rv3784	G	-	Carbohydrate transport and metabolism 
 Rv3786c	M	-	Cell wall/membrane/envelope biogenesis 
-Rv3786c	M	-	Cell wall/membrane/envelope biogenesis 
 Rv3787c	Q	-	Secondary metabolites biosynthesis, transport and catabolism 
 Rv3788	K	-	Transcription 
 Rv3789	S	-	Function unknown 
@@ -2952,7 +2883,6 @@ Rv3797	I	fadE35	Lipid transport and metabolism
 Rv3798	L	-	Replication, recombination and repair 
 Rv3799c	I	accD4	Lipid transport and metabolism 
 Rv3800c	Q	pks13	Secondary metabolites biosynthesis, transport and catabolism 
-Rv3800c	Q	pks13	Secondary metabolites biosynthesis, transport and catabolism 
 Rv3801c	Q	fadD32	Secondary metabolites biosynthesis, transport and catabolism 
 Rv3803c	R	fbpD	General function prediction only 
 Rv3804c	R	fbpA	General function prediction only 
@@ -2967,7 +2897,6 @@ Rv3815c	I	-	Lipid transport and metabolism
 Rv3816c	I	-	Lipid transport and metabolism 
 Rv3817	J	-	Translation, ribosomal structure and biogenesis 
 Rv3818	R	-	General function prediction only 
-Rv3818	R	-	General function prediction only 
 Rv3820c	Q	papA2	Secondary metabolites biosynthesis, transport and catabolism 
 Rv3823c	R	mmpL8	General function prediction only 
 Rv3824c	Q	papA1	Secondary metabolites biosynthesis, transport and catabolism 
@@ -2994,8 +2923,6 @@ Rv3855	K	ethR	Transcription
 Rv3856c	R	-	General function prediction only 
 Rv3858c	R	gltD	General function prediction only 
 Rv3859c	E	gltB	Amino acid transport and metabolism 
-Rv3859c	E	gltB	Amino acid transport and metabolism 
-Rv3859c	E	gltB	Amino acid transport and metabolism 
 Rv3862c	J	whiB6	Translation, ribosomal structure and biogenesis 
 Rv3868	O	-	Posttranslational modification, protein turnover, chaperones 
 Rv3870	D	-	Cell cycle control, cell division, chromosome partitioning 
@@ -3006,7 +2933,6 @@ Rv3883c	O	mycP1	Posttranslational modification, protein turnover, chaperones
 Rv3884c	O	-	Posttranslational modification, protein turnover, chaperones 
 Rv3886c	O	mycP2	Posttranslational modification, protein turnover, chaperones 
 Rv3888c	D	-	Cell cycle control, cell division, chromosome partitioning 
-Rv3888c	D	-	Cell cycle control, cell division, chromosome partitioning 
 Rv3890c	S	esxC	Function unknown 
 Rv3894c	D	-	Cell cycle control, cell division, chromosome partitioning 
 Rv3896c	R	-	General function prediction only 
@@ -3019,7 +2945,6 @@ Rv3911	K	sigM	Transcription
 Rv3913	O	trxB2	Posttranslational modification, protein turnover, chaperones 
 Rv3914	C	trxC	Energy production and conversion 
 Rv3915	M	-	Cell wall/membrane/envelope biogenesis 
-Rv3915	M	-	Cell wall/membrane/envelope biogenesis 
 Rv3917c	K	parB	Transcription 
 Rv3918c	D	parA	Cell cycle control, cell division, chromosome partitioning 
 Rv3919c	M	gidB	Cell wall/membrane/envelope biogenesis 


=====================================
src/pytransit/data/H37Rv_sanger_roles.dat
=====================================
@@ -321,40 +321,40 @@ Rv2994	III
 Rv3065	III.A.6
 Rv3065	III.A
 Rv3065	III
-Rv0233	I.F.3 2'
+Rv0233	I.F.3
 Rv0233	I.F
 Rv0233	I
-Rv0321	I.F.3 2'
+Rv0321	I.F.3
 Rv0321	I.F
 Rv0321	I
-Rv0570	I.F.3 2'
+Rv0570	I.F.3
 Rv0570	I.F
 Rv0570	I
-Rv1981c	I.F.3 2'
+Rv1981c	I.F.3
 Rv1981c	I.F
 Rv1981c	I
-Rv2697c	I.F.3 2'
+Rv2697c	I.F.3
 Rv2697c	I.F
 Rv2697c	I
-Rv2764c	I.F.3 2'
+Rv2764c	I.F.3
 Rv2764c	I.F
 Rv2764c	I
-Rv3048c	I.F.3 2'
+Rv3048c	I.F.3
 Rv3048c	I.F
 Rv3048c	I
-Rv3051c	I.F.3 2'
+Rv3051c	I.F.3
 Rv3051c	I.F
 Rv3051c	I
-Rv3052c	I.F.3 2'
+Rv3052c	I.F.3
 Rv3052c	I.F
 Rv3052c	I
-Rv3053c	I.F.3 2'
+Rv3053c	I.F.3
 Rv3053c	I.F
 Rv3053c	I
-Rv3247c	I.F.3 2'
+Rv3247c	I.F.3
 Rv3247c	I.F
 Rv3247c	I
-Rv3752c	I.F.3 2'
+Rv3752c	I.F.3
 Rv3752c	I.F
 Rv3752c	I
 Rv1121	I.B.5


=====================================
src/pytransit/data/gene_ontology.1_2.3-11-18.obo
=====================================
The diff for this file was not included because it is too large.

=====================================
src/pytransit/doc/source/_images/genetic_interaction_types.png
=====================================
Binary files /dev/null and b/src/pytransit/doc/source/_images/genetic_interaction_types.png differ


=====================================
src/pytransit/doc/source/transit_methods.rst
=====================================
@@ -1,8 +1,9 @@
 
 .. _`analysis_methods`:
 
-Analysis Methods
-================
+==================
+ Analysis Methods
+==================
 
 
 TRANSIT has analysis methods capable of analyzing **Himar1** and **Tn5** datasets.
@@ -13,7 +14,7 @@ Below is a description of some of the methods.
 .. _gumbel:
 
 Gumbel
-------
+======
 
 The Gumbel can be used to determine which genes are essential in a
 single condition. It does a gene-by-gene analysis of the insertions at
@@ -25,7 +26,7 @@ probability of this using a Bayesian model.
    Intended only for **Himar1** datasets.
 
 How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
 
 | For a formal description of how this method works, see our paper [DeJesus2013]_:
 
@@ -34,7 +35,7 @@ How does it work?
 | `Bayesian analysis of gene essentiality based on sequencing of transposon insertion libraries. <http://www.ncbi.nlm.nih.gov/pubmed/23361328>`_ *Bioinformatics*, 29(6):695-703.
 
 Example
-~~~~~~~
+-------
 
 ::
 
@@ -52,7 +53,7 @@ Example
 
 
 Parameters
-~~~~~~~~~~
+----------
 
 -  **Samples:** Gumbel uses Metropolis-Hastings (MH) to generate samples
    of posterior distributions. The default setting is to run the
@@ -87,7 +88,7 @@ Parameters
 |
 
 Outputs and diagnostics
-~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------
 
 The Gumbel method generates a tab-separated output file at the location
 chosen by the user. This file will automatically be loaded into the
@@ -137,7 +138,7 @@ defined as follows:
 |
 
 Run-time
-~~~~~~~~
+--------
 
 The Gumbel method takes on the order of 10 minutes for 10,000 samples.
 Run-time is linearly proportional to the 'samples' parameter, or length
@@ -151,7 +152,7 @@ replicates; replicate datasets will be automatically merged.
 
 
 griffin
--------
+=======
 
 This is an earlier version of the Gumbel method that
 identifies essential genes based on how unlikely 'gaps'
@@ -176,7 +177,7 @@ of estimating internal parameters.
 .. _`tn5gaps`:
 
 Tn5Gaps
---------
+=======
 
 The Tn5Gaps method can be used to determine which genes are essential
 in a single condition for **Tn5** datasets. It does an analysis of the
@@ -193,7 +194,7 @@ the Gumbel distribution.
 |
 
 How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
 
 This method is loosely is based on the original gumbel analysis
 method described in this paper:
@@ -230,7 +231,7 @@ output of our analysis.
 |
 
 Usage
-~~~~~
+-----
 ::
 
     python3 ../../../transit.py tn5gaps <comma-separated .wig files> <annotation .prot_table or GFF3> <output file> [Optional Arguments]
@@ -243,7 +244,7 @@ Usage
 
 
 Parameters
-~~~~~~~~~~
+----------
 
 
 + **Minimum Read:** The minimum read count that is considered a true read. Because the Gumbel method depends on determining gaps of TA sites lacking insertions, it may be suceptible to spurious reads (e.g. errors). The default value of 1 will consider all reads as true reads. A value of 2, for example, will ignore read counts of 1.
@@ -256,7 +257,7 @@ Parameters
 + **-iC:** Trimming of insertions in C-terminus (given as percentage of ORF length, e.g. "5" for 5%; default=0)
 
 Example
-~~~~~~~
+-------
 ::
 
     python3 PATH/src/transit.py tn5gaps salmonella_baseline.wig Salmonella-Ty2.prot_table salmonella_baseline_tn5gaps_trimmed.dat -m 2 -r Sum -iN 5 -iC 5
@@ -267,7 +268,7 @@ These input and output files can be downloaded from the **Example Data** section
 |
 
 Outputs and diagnostics
-~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------
 
 The Tn5Gaps method generates a tab-separated output file at the
 location chosen by the user. This file will automatically be loaded
@@ -306,7 +307,7 @@ file are defined as follows:
 |
 
 Run-time
-~~~~~~~~
+--------
 The Tn5Gaps method takes on the order of 10 minutes.
 Other notes: Tn5Gaps can be run on multiple replicates; replicate
 datasets will be automatically merged.
@@ -321,7 +322,7 @@ datasets will be automatically merged.
 .. _HMM:
 
 HMM
----
+===
 
 The HMM method can be used to determine the essentiality of the entire genome, as opposed to gene-level analysis of the other methods. It is capable of identifying regions that have unusually high or unusually low read counts (i.e. growth advantage or growth defect regions), in addition to the more common categories of essential and non-essential.
 
@@ -331,7 +332,7 @@ The HMM method can be used to determine the essentiality of the entire genome, a
 |
 
 How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
 
 | For a formal description of how this method works, see our paper [DeJesus2013HMM]_:
 |
@@ -341,7 +342,7 @@ How does it work?
 
 
 Example
-~~~~~~~
+-------
 
 ::
 
@@ -355,7 +356,7 @@ Example
 
 
 Parameters
-~~~~~~~~~~
+----------
 
 The HMM method automatically estimates the necessary statistical
 parameters from the datasets. You can change how the method handles
@@ -370,7 +371,7 @@ replicate datasets:
 |
 
 Output and Diagnostics
-~~~~~~~~~~~~~~~~~~~~~~
+----------------------
 
 | The HMM method outputs two files. The first file provides the most
   likely assignment of states for all the TA sites in the genome. Sites
@@ -441,7 +442,7 @@ Output and Diagnostics
 |
 
 Run-time
-~~~~~~~~
+--------
 
 | The HMM method takes less than 10 minutes to complete. The parameters
   of the method should not affect the running-time.
@@ -455,7 +456,7 @@ Run-time
 .. _resampling:
 
 Resampling
-----------
+==========
 
 The resampling method is a comparative analysis the allows that can be
 used to determine conditional essentiality of genes. It is based on a
@@ -474,7 +475,7 @@ of genes or known biological pathway.
 |
 
 How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
 
 This technique has yet to be formally published in the context of
 differential essentiality analysis. Briefly, the read-counts at each
@@ -490,7 +491,7 @@ original, observed difference in read-counts.
 
 
 Usage
-~~~~~
+-----
 
 
 ::
@@ -518,7 +519,7 @@ Usage
 
 
 Parameters
-~~~~~~~~~~
+----------
 
 The resampling method is non-parametric, and therefore does not require
 any parameters governing the distributions or the model. The following
@@ -575,7 +576,7 @@ parameters are available for the method:
 |
 
 Notes
-~~~~~
+-----
 
 I recommend using -a (adaptive resampling). It runs much faster, and the p-values
 will be very close to a full non-adaptive run (all 10,000 samples).
@@ -588,7 +589,7 @@ making it difficult to make confident calls on essentiality.
 
 
 Doing resampling with a combined_wig file
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------------------------
 
 Resampling can also now take a combined_wig_ file as input (containing insertion counts
 for multiple sample), along with a samples_metadata_ file
@@ -608,7 +609,7 @@ If you want to compare more than two conditions, see :ref:`ZINB <zinb>`.
 
 
 Doing resampling with datasets from different libraries.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--------------------------------------------------------
 
 In most cases, comparisons are done among samples (replicates) from
 the same library evaluated in two different conditions.  But if the
@@ -626,7 +627,7 @@ from each library in each condition.
 
 
 Doing resampling between different strains.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-------------------------------------------
 
 The most common case is that resampling is done among replicates all
 from the same Tn library, and hence all the datasets (fastq files) are
@@ -698,7 +699,7 @@ replicates in condition B will be from another library (another strain).
 |
 
 Output and Diagnostics
-~~~~~~~~~~~~~~~~~~~~~~
+----------------------
 
 The resampling method outputs a tab-delimited file with results for each
 gene in the genome. P-values are adjusted for multiple comparisons using
@@ -752,7 +753,7 @@ changed using the -PC flag (above).
 |
 
 Run-time
-~~~~~~~~
+--------
 
 A typical run of the resampling method with 10,000 samples will take
 around 45 minutes (with the histogram option ON). Using the *adaptive
@@ -764,7 +765,7 @@ resampling* option (-a), the run-time is reduced to around 10 minutes.
 ----
 
 Mann-Whitney U-test (utest)
----------------------------
+===========================
 
 This is a method for comparing datasets from a TnSeq library evaluated in
 two different conditions, analogous to resampling.
@@ -788,13 +789,13 @@ A reference for this method is `(Santa Maria et al., 2014)
 .. _genetic-interactions:
 
 Genetic Interactions
---------------------
+====================
 
 The genetic interactions (GI) method is a comparative analysis used
 used to determine genetic interactions. It is a Bayesian method
 that estimates the distribution of log fold-changes (logFC) in two
 strain backgrounds under different conditions, and identifies significantly
-large changes in enrichment (delta-logFC) to identify those genes
+large changes in enrichment (delta_logFC) to identify those genes
 that imply a genetic interaction.
 
 
@@ -805,7 +806,7 @@ that imply a genetic interaction.
 |
 
 How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
 
 GI performs a comparison among 4 groups of datasets, strain A and B assessed in conditions 1 and 2 (e.g. control vs treatment).
 It looks for interactions where the response to the treatment (i.e. effect on insertion counts) depends on the strain.
@@ -825,28 +826,92 @@ slopes between strain A and B (represented by 'delta_LFC' in the output file).
 |
 
 
+Statistical Significance
+------------------------
+
+
+The computation that is done by GI is to compute the posterior distribution of the delta_LFC (or mean change in slopes)
+through Bayesian sampling.
+The primary method to determine significance of genes is whethter this the mean_delta_LFC is significantly differnt than 0.
+However, since the mean_delta_LFC is a distribution, we represent it by a Highest Density Interval, HDI, which is 
+similar to a 95% confidence interval.  Furthermore, rather than asking whether the HDI overlaps 0 exactly, we expand the interval
+around 0 to a Region of Probable Equivalence (ROPE), which is set to [-0.5,0.5] by default.  Hence the significant genes
+are those for which the HDI does not overlap the ROPE.  GI has a flag to  adjust the size of the ROPE, if desired.
+
+In the GI output file, the final column give the significance call, along with type of interaction.
+If a gene is not significant, it will be marked with "**No Interaction**" (for the HDI method, meaning HDI overlaps the ROPE).
+If a gene IS significant, then its interaction will be cateogrized in 3 types (see NAR paper):
+
+ * **Aggravating** - mean_delta_LFC is negative; gene is more required in treatment than control in the B strain, compared to the A strain
+ * **Suppressive** - mean_delta_LFC is positive, and the gene was not conditionally essential in strain A (flat slope), but becomes conditionally non-essential in strain B when treated (positive slope)
+ * **Alleviating** - mean_delta_LFC is positive, but the conditional requirement (negative slope) of the gene in strain A with treatment is "cancelled" by the modification in strain B
+
+.. image:: _images/genetic_interaction_types.png
+
+A limitation of this HDI approach is that it is discrete (i.e. overlap is either True or False), but does not provide a quantitative metric
+for the degree of overlap.  Thus a second method for assessing significance of genetic interactions is to compute
+the probability of overlap.  The lower the probability, the more differnt the delta_LFC is from 0, indicating a more
+significant interaction.  In this case, genes with prob < 0.05 are considered interactions and classified by the 3 types above,
+while genes with prob >= 0.05 are marked as "No Interaction".
+
+In addition, since we are calculating significance for thousands of genes in parallel, 
+many researchers prefer to have some method for correcting for multiple tests, to control the false discovery rate.
+However, FDR correction is generally used only for frequentist analyses, and he GI method is fundamentally a Bayesian approach.
+Technically, in a Bayesian framework, FDR correction is not needed.  Any adjustment for expectations about number of hits
+would be achieved through adjusting parameters for prior distributions.  Nonetheless, GI includes options for 
+two methods that approximate FDR correction: **BFDR** (Bayesian False Discovery Rate correction,
+`Newton M.A., Noueiry A., Sarkar D., Ahlquist P. (2004). Detecting differential gene expression with a semiparametric hierarchical 
+mixture method. Biostatistics, 5:155–176. <https://pubmed.ncbi.nlm.nih.gov/15054023/>`_) and FWER (Familty-Wise
+Error Rate control).  When these corrections are applied, a threshold of 0.05 for the adjusted probability of overlap 
+is used for each, and this determines which
+genes are classified as interacting (1 of 3 types) or  marked as "No Interaction", as above.
+
+In order to enable users to evaluate these various methods for determining significance of interactions,
+a '-signif' flag is provided for the GI method.  The options are:
+
+ * **-signif HDI**: significant genes are those for which the HDI does not overlap the ROPE
+ * **-signif prob**: significant genes are those with prob < 0.05, where 'prob' is porbability that HDI overlap the ROPE (default)
+ * **-signif BFDR**: significant genes are those with adjusted prob < 0.05, where prob is adjusted by the BFDR method
+ * **-signif FWER**: significant genes are those with adjusted prob < 0.05, where prob is adjusted by the FWER method
+
+'-signif prob' is the default method.
+
+In the output file, the genes are sorted by the probability that the HDI overlaps the ROPE.
+The genes at the top are rougly the genes with the highest absolute value of mean_delta_LFC.
+
+
 Usage
-~~~~~
+-----
 
 ::
 
-  python3 transit.py GI <wigs_for_strA_cond1> <wigs_for_strA_cond2> <wigs_for_strB_cond1> <wigs_for_strB_cond2> <annotation .prot_table or GFF3> <output file> [Optional Arguments]
+  python3 /pacific/home/ioerger/transit/src/transit.py GI <wigs_for_strA_cond1> <wigs_for_strA_cond2> <wigs_for_strB_cond1> <wigs_for_strB_cond2> <annotation .prot_table or GFF3> <output file> [Optional Arguments]
 
+        GI performs a comparison among 4 groups of datasets, strain A and B assessed in conditions 1 and 2 (e.g. control vs treatment).
+        It looks for interactions where the response to the treatment (i.e. effect on insertion counts) depends on the strain (output variable: delta_LFC).
         Provide replicates in each group as a comma-separated list of wig files.
+        HDI is highest density interval for posterior distribution of delta_LFC, which is like a confidence interval on difference of slopes.
+        Genes are sorted by probability of HDI overlapping with ROPE. (genes with the highest abs(mean_delta_logFC) are near the top, approximately)
+        Significant genes are indicated by 'Type of Interaction' column (No Interaction, Aggravating, Alleviating, Suppressive).
+          By default, hits are defined as "Is HDI outside of ROPE?"=TRUE (i.e. non-overlap of delta_LFC posterior distritbuion with Region of Probably Equivalence around 0)
+          Alternative methods for significance: use -signif flag with prob, BFDR, or FWER. These affect 'Type of Interaction' (i.e. which genes are labeled 'No Interaction')
 
         Optional Arguments:
         -s <integer>    :=  Number of samples. Default: -s 10000
         --rope <float>  :=  Region of Practical Equivalence. Area around 0 (i.e. 0 +/- ROPE) that is NOT of interest. Can be thought of similar to the area of the null-hypothesis. Default: --rope 0.5
         -n <string>     :=  Normalization method. Default: -n TTR
-        -iz             :=  Include rows with zero accross conditions.
+        -iz             :=  Include rows with zero across conditions.
         -l              :=  Perform LOESS Correction; Helps remove possible genomic position bias. Default: Turned Off.
         -iN <float>     :=  Ignore TAs occuring at given percentage (as integer) of the N terminus. Default: -iN 0
         -iC <float>     :=  Ignore TAs occuring at given percentage (as integer) of the C terminus. Default: -iC 0
+        -signif HDI     :=  (default) Significant if HDI does not overlap ROPE; if HDI overlaps ROPE, 'Type of Interaction' is set to 'No Interaction'
+        -signif prob    :=  Optionally, significant hits are re-defined based on probability (degree) of overlap of HDI with ROPE, prob<0.05 (no adjustment)
+        -signif BFDR    :=  Apply "Bayesian" FDR correction (see doc) to adjust HDI-ROPE overlap probabilities so that significant hits are re-defined as BFDR<0.05
+        -signif FWER    :=  Apply "Bayesian" FWER correction (see doc) to adjust HDI-ROPE overlap probabilities so that significant hits are re-defined as FWER<0.05
 
-You can think of 'control' and 'experimental' samples as 'untreated' vs. 'treated'.
 
 Example
-~~~~~~~
+-------
 
 In this example, the effect of a knockout of SigB is being evaluated for its effect on tolerance of isoniazid.
 Some genes may become more essential (or less) in the presence of INH in the wild-type strain.
@@ -859,7 +924,7 @@ Note there are 2 replicates in each of the 4 groups of datasets.
 
 
 Parameters
-~~~~~~~~~~
+----------
 
 The resampling method is non-parametric, and therefore does not require
 any parameters governing the distributions or the model. The following
@@ -890,17 +955,23 @@ parameters are available for the method:
    as real differences. See the :ref:`Normalization <normalization>` section for a description
    of normalization method available in TRANSIT.
 
+-  **Significance Method:**
 
+ * -signif HDI: significant genes are those for which the HDI does not overlap the ROPE
+ * -signif prob: significant genes are those with prob < 0.05, where 'prob' is porbability that HDI overlap the ROPE (default)
+ * -signif BFDR: significant genes are those with adjusted prob < 0.05, where prob is adjusted by the BFDR method
+ * -signif FWER: significant genes are those with adjusted prob < 0.05, where prob is adjusted by the FWER method
 
 
 
 Output and Diagnostics
-~~~~~~~~~~~~~~~~~~~~~~
+----------------------
 
 The GI method outputs a tab-delimited file with results for each
-gene in the genome. P-values are adjusted for multiple comparisons using
-the Benjamini-Hochberg procedure (called "q-values" or "p-adj."). A
-typical threshold for conditional essentiality on is q-value < 0.05.
+gene in the genome.
+All genes are sorted by significance using the probability that the HDI overlaps the ROPE.
+Significant genes are those NOT marked with 'No Interaction' in the last column.
+
 
 +-----------------------------------------+----------------------------------------------------+
 | Column Header                           | Column Definition                                  |
@@ -940,10 +1011,6 @@ typical threshold for conditional essentiality on is q-value < 0.05.
 
 |
 
-Significant interactions are those with "HDI outside ROPE?"=TRUE.
-
-All genes are sorted by significance using BFDR.
-
 
 
 .. rst-class:: transit_sectionend
@@ -954,7 +1021,7 @@ All genes are sorted by significance using BFDR.
 .. _anova:
 
 ANOVA
------
+=====
 
 The Anova (Analysis of variance) method is used to determine which genes
 exhibit statistically significant variability of insertion counts across multiple conditions.
@@ -966,7 +1033,7 @@ to which experimental conditions).
 |
 
 How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
 
 The method performs the `One-way anova test <https://en.wikipedia.org/wiki/Analysis_of_variance?oldformat=true#The_F-test>`_ for each gene across conditions.
 It takes into account variability of normalized transposon insertion counts among TA sites
@@ -975,7 +1042,7 @@ to determine if the differences among the mean counts for each condition are sig
 
 
 Example
-~~~~~~~
+-------
 
 ::
 
@@ -1014,7 +1081,7 @@ The filenames should match what is shown in the header of the combined_wig (incl
 
 
 Parameters
-~~~~~~~~~~
+----------
 
 The following parameters are available for the method:
 
@@ -1030,7 +1097,7 @@ The following parameters are available for the method:
 -  **-PC** Pseudocounts to use in calculating LFCs (see below). Default: -PC 5
 
 Output and Diagnostics
-~~~~~~~~~~~~~~~~~~~~~~
+----------------------
 
 The anova method outputs a tab-delimited file with results for each
 gene in the genome. P-values are adjusted for multiple comparisons using
@@ -1077,7 +1144,7 @@ Changing the pseudocounts will not affect the analysis of statistical significan
 |
 
 Run-time
-~~~~~~~~
+--------
 
 A typical run of the anova method takes less than 1 minute for a combined wig file with 6 conditions, 3 replicates per condition.
 
@@ -1092,7 +1159,7 @@ A typical run of the anova method takes less than 1 minute for a combined wig fi
 .. _zinb:
 
 ZINB
-----
+====
 
 The ZINB (Zero-Inflated Negative Binomial) method is used to determine
 which genes exhibit statistically significant variability in either
@@ -1118,7 +1185,7 @@ See :ref:`Installation Instructions <install-zinb>`..
 |
 
 How does it work?
-~~~~~~~~~~~~~~~~~
+-----------------
 
 | For a formal description of how this method works, see our paper [ZINB]_: 
 |
@@ -1127,7 +1194,7 @@ How does it work?
 
 
 Example
-~~~~~~~
+-------
 
 ::
 
@@ -1149,7 +1216,7 @@ Example
 .. _combined_wig:
 
 Combined wig files
-~~~~~~~~~~~~~~~~~~
+------------------
 
 Transit now supports a new file format called 'combined_wig' which basically
 combines multiple wig files into one file (with multiple columns).  This is
@@ -1174,7 +1241,7 @@ TTR is the default, but other relevant normalization options would be 'nonorm'
 .. _samples_metadata:
 
 Samples Metadata File
-~~~~~~~~~~~~~~~~~~~~~
+---------------------
 
 Format of the *samples_metadata* file: a tab-separated file (which you
 can edit in Excel) with 3 columns: Id, Condition, and Filename (it
@@ -1194,7 +1261,7 @@ of the combined_wig (including pathnames, if present).
   chol2   cholesterol  /Users/example_data/cholesterol_rep3.wig
 
 Parameters
-~~~~~~~~~~
+----------
 
 The following parameters are available for the method:
 
@@ -1208,7 +1275,7 @@ The following parameters are available for the method:
 - **Covariates:** If additional covariates distinguishing the samples are available, such as library, timepoint, or genotype, they may be incorporated in the test.
 
 Covariates and Interactions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------
 
 While ZINB is focus on identifying variability of insertion counts across conditions,
 the linear model also allows you to take other variables into account.
@@ -1281,7 +1348,7 @@ The difference between how covariates and interactions are handeled in the model
 is discussed below in the section on Statistical Significance.  
 
 Categorical vs Numeric Covariates
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------------
 
 In some cases, covariates are intended to be treated as categorical
 variables, like 'batch' or 'library' or 'medium'.
@@ -1297,7 +1364,7 @@ trend in the insertion counts as the covariate value increases.
 
 
 Statistical Significance - What the P-values Mean in the ZINB Output
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--------------------------------------------------------------------
 
 Formally, the P-value is from a likelihood ratio test (LRT) between a
 condition-dependent ZINB model (:math:`m_1`) and a
@@ -1391,7 +1458,7 @@ and this would be compared to the following null model (without the interaction
 
 
 Output and Diagnostics
-~~~~~~~~~~~~~~~~~~~~~~
+----------------------
 
 The ZINB method outputs a tab-delimited file with results for each
 gene in the genome. P-values are adjusted for multiple comparisons using
@@ -1442,7 +1509,7 @@ Changing the pseudocounts will not affect the analysis of statistical significan
 |
 
 Run-time
-~~~~~~~~
+--------
 
 A typical run of the ZINB method takes ~5 minutes to analze a combined wig
 file with 6 conditions, 3 replicates per condition. It will, of
@@ -1457,7 +1524,7 @@ course, run more slowly if you have many more conditions.
 .. _normalization:
 
 Normalization
--------------
+=============
 
 
 Proper normalization is important as it ensures that other sources of
@@ -1507,14 +1574,14 @@ briefly described below:
     No normalization is performed.
 
 Command-line
-~~~~~~~~~~~~
+------------
 
 In addition to choosing normalization for various analyses in the GUI,
 you can also call Transit to normalize wig files from the command-line,
 as shown in this example:
 
 Example
-~~~~~~~
+-------
 
 ::
 
@@ -1539,7 +1606,7 @@ If the input file is a combined_wig file, indicate it with a '-c' flag.
 
 .. rst-class:: transit_clionly
 Pathway Enrichment Analysis
----------------------------
+===========================
 
 Pathway Enrichment Analysis provides a method to
 identify enrichment of functionally-related genes among those that are
@@ -1548,10 +1615,10 @@ significantly more or less essential between two conditions).
 The analysis is typically applied as post-processing step to the hits identified
 by a comparative analysis, such as *resampling*.
 Several analytical method are provided:
-Fisher's exact test (hypergeometric distribution), GSEA (Gene Set Enrichment Analysis)
+Fisher's exact test (FET, hypergeometric distribution), GSEA (Gene Set Enrichment Analysis)
 by `Subramanian et al (2005) <https://www.ncbi.nlm.nih.gov/pubmed/16199517>`_,
 and `Ontologizer <https://www.ncbi.nlm.nih.gov/pubmed/17848398>`_.
-For the Fisher exact test, 
+For Fisher's exact test, 
 genes in the resampling output file with adjusted p-value < 0.05 are taken as hits,
 and evaluated for overlap with functional categories of genes.
 The GSEA methods use the whole list of genes, ranked in order of statistical significance
@@ -1578,17 +1645,17 @@ and is not available in the Transit GUI.
 
 
 Usage
-~~~~~~~
+-----
 
 ::
 
-    python3 src/transit.py pathway_enrichment <resampling_file> <associations> <pathways> <output_file> [-M <FISHER|GSEA|ONT>] [-PC <int>]
+    python3 src/transit.py pathway_enrichment <resampling_file> <associations> <pathways> <output_file> [-M <FET|GSEA|ONT>] [-PC <int>]
 
 |
 
 
 Parameters
-~~~~~~~~~~
+----------
 - **Resampling File**
     The resampling file is the one obtained after using the resampling method in Transit. (It is a tab separated file with 11 columns.) GSEA method makes usage of the last column (adjusted P-value)
 - **Associations File**
@@ -1622,14 +1689,18 @@ Parameters
   ...
 
 - **-M**
-    Methodology to be used. FISHER is used by default (even without specifying -M).
+    Methodology to be used. FET is used by default (even without specifying -M).
 
-  **FISHER**
+  **FET**
     This implements Fisher's Exact Test (hypergeometric distribution) to determine a p-value for each pathway, based on the proportion of pathway member observed in list of hits (conditionally essential gene by resampling, padj<0.05) compared to the background proportion in the overall genome, and p-values are adjusted post-hoc by the Benjamini-Hochberg procedure to limit the FDR to 5%.  
 
     In the output file, an "enrichment score" is reported, which is the ratio of the observed number of pathway members among the hits to the expected number.  Pseudocounts of 2 are included in the calculation to reduce the bias toward small pathways with only a few genes; this can be adjusted with the -PC flag (below).
 
-    FISHER can be used with GO terms.
+    FET can be used with GO terms.
+
+    Additional flags for FET:
+
+    - **-PC <int>**: Pseudocounts used in calculating the enrichment score and p-value by hypergeometic distribution. Default: PC=2.
 
   **GSEA**
     Gene Set Enrichment Analysis. GSEA assess the significance of a pathway by looking at how the members fall in the ranking of all genes.  The genes are first ranked by significance from resampling.  Specifically, they are sorted by signed-log-p-value, SLPV=sign(LFC)*(log(pval)), which puts them in order so that the most significant genes with negative LFC are at the top, the most significant with positive LFC are at the bottom, and insignificant genes fall in the middle.  Roughly, GSEA computes the mean rank of pathway members, and evaluates significance based on a simulated a null distribution.  p-values are again adjusted at the end by BH.
@@ -1638,17 +1709,58 @@ Parameters
 
     GSEA can be used with GO terms.
 
+    Additional flags for GSEA:
+
+    - **-ranking SLPV|LFC**: method used to rank all genes; SLPV is signed-log-p-value (default); LFC is log2-fold-change from resampling
+
+    - **-p <float>**: exponent to use in calculating enrichment score; recommend trying '-p 0' (default) or '-p 1' (as used in Subramaniam et al, 2005)
+
+    - **-Nperm <int>**: number of permutations to simulate for null distribution to determine p-value (default=10000)
+
+
   **ONT**
-    Ontologizer is a specialized method for GO terms that takes parent-child relationships into account among nodes in the GO hierarchy.  This can enhance the specificity of pathways detected as significant.  Hierarhical relationships among GO terms are encoded in an OBO file, which is included in the src/pytransit/data/ directory.
+    Ontologizer is a specialized method for GO terms that takes parent-child relationships into account among nodes in the GO hierarchy.  This can enhance the specificity of pathways detected as significant.  (The problem is that there are many GO terms in the hierarchy covering similar or identical sets of genes, and often, if one node is significantly enriched, then several of its ancestors will be too, which obscures the results with redundant hits; Ontologizer reduces the significance of nodes if their probability distribution among hits can be explained by their parents.) Hierarhical relationships among GO terms are encoded in an OBO file, which is included in the src/pytransit/data/ directory.
 
     `Grossmann S, Bauer S, Robinson PN, Vingron M. Improved detection of overrepresentation of Gene-Ontology annotations with parent child analysis. Bioinformatics. 2007 Nov 15;23(22):3024-31. <https://www.ncbi.nlm.nih.gov/pubmed/17848398>`_
 
-- **-PC**
-   Pseudocounts used in calculating enrichment score in output file for FISHER. Default: PC=2.
+
+
+
+Auxilliary Pathway Files in Transit Data Directory
+--------------------------------------------------
+
+::
+
+These files for pathway analysis are distributed in the Transit data directory 
+(e.g. transit/src/pytransit/data/).
+
++----------+---------+--------------------+--------------------------------------+--------------------------------+
+| system   | num cats| applicable methods | associations of genes with roles     | pathway definitions/role names |
++==========+=========+====================+======================================+================================+
+| COG      | 20      | FET*, GSEA         | H37Rv_COG_roles.dat                  | COG_roles.dat                  |
++----------+---------+--------------------+--------------------------------------+--------------------------------+
+| Sanger   | 153     | FET*, GSEA*        | H37Rv_sanger_roles.dat               | sanger_roles.dat               |
++----------+---------+--------------------+--------------------------------------+--------------------------------+
+| GO       | 2545    | FET, GSEA          | H37Rv_GO_terms.txt                   | GO_term_names.dat              |
++----------+---------+--------------------+--------------------------------------+--------------------------------+
+|          |         | ONT*               | GO_terms_for_each_Rv.obo-3-11-18.txt | gene_ontology.1_2.3-11-18.obo  |
++----------+---------+--------------------+--------------------------------------+--------------------------------+
+
+asterisk means 'recommended' combination of method with system of functional categories
+
+
+Current Recommendations
+-----------------------
+
+Here are the recommended combinations of pathway methods to use for different systems of functional categories:
+
+ * For COG, use '-M FET'
+ * For Sanger roles, try both FET and GSEA
+ * For GO terms, use 'M -ONT'
 
 
 Examples
-~~~~~~~~
+--------
 
 ::
 
@@ -1667,7 +1779,7 @@ Examples
     # Ontologizer is a specialized method for GO terms
     > transit pathway_enrichment resampling_glyc_chol.txt $DATA/H37Rv_GO_terms.txt $DATA/GO_term_names.dat pathways_Ontologizer.txt -M ONT
 
-The $DATA environment variable is these examples refers to the Transit data directory, e.g. src/pytransit/data/.
+The $DATA environment variable in these examples refers to the Transit data directory, e.g. src/pytransit/data/.
 
 
 .. rst-class:: transit_sectionend
@@ -1679,7 +1791,7 @@ The $DATA environment variable is these examples refers to the Transit data dire
 
 .. rst-class:: transit_clionly
 tnseq_stats
------------
+===========
 
 You can generate the same table to statistics as on the Quality Control panel in the GUI
 from the command-line using the 'tnseq_stats' command.  Here is an example:
@@ -1710,7 +1822,7 @@ from the command-line using the 'tnseq_stats' command.  Here is an example:
 
 .. rst-class:: transit_clionly
 corrplot
--------
+========
 
 A useful tool when evaluating the quality of a collection of TnSeq datasets is to make a 
 *correlation plot* of the mean insertion counts (averaged at the gene-level) among samples.  
@@ -1725,7 +1837,7 @@ and relies on the 'corrplot' R package.
 See :ref:`Installation Instructions <install-zinb>`.
 
 Usage:
-~~~~~~
+------
 
 ::
 
@@ -1791,7 +1903,7 @@ to be installed on your system.  See :ref:`Installation Instructions <install-zi
 
 .. rst-class:: transit_clionly
 heatmap
--------
+=======
 
 The output of ANOVA or ZINB can be used to generate a heatmap that
 simultaneously clusters the significant genes and clusters the conditions,
@@ -1803,21 +1915,27 @@ and relies on the 'gplots' R package.
 See :ref:`Installation Instructions <install-zinb>`.
 
 Usage:
-~~~~~~
+------
 
 ::
 
-  python3 src/transit.py heatmap -anova|-zinb <anova_or_zinb_output> <heatmap.png>
+  python3 src/transit.py heatmap <anova_or_zinb_output> <heatmap.png> -anova|-zinb [-topk <int>] [-qval <float]
 
-Note that the *first* argument is required to be either '-anova' or '-zinb', a flag to
+Note that the first optional argument (flag) is required to be either '-anova' or '-zinb', a flag to
 indicate the type of file being provided as the second argument.
 
+By default, genes are selected for the heatmap based on qval<0.05.
+However, the user may change the selection of genes through 2 flags:
+
+ * **-qval <float>**: change qval threshold for selecting genes (default=0.05)
+ * **-topk <int>**: select top k genes ranked by significance (qval)
+
 Here is an example which generates the following image showing the similarities among
 several different growth conditions:
 
 ::
 
-  > python3 src/transit.py heatmap -anova anova_iron.txt heatmap_iron_anova.png
+  > python3 src/transit.py heatmap anova_iron.txt heatmap_iron_anova.png -anova
 
 .. image:: _images/iron_heatmap_anova_rotated.png
    :width: 1000
@@ -1826,7 +1944,7 @@ several different growth conditions:
 
 Importantly, the heatmap is based only on the subset of genes
 identified as significantly varying (Padj < 0:05, typically only a few
-hundred) in order to enhance the patterns, since otherwise they would
+hundred genes) in order to enhance the patterns, since otherwise they would
 be washed out by the rest of the genes in the genome, the majority of
 which usually do not exhibit significant variation in counts.
 


=====================================
src/pytransit/export/mean_counts.py
=====================================
@@ -61,6 +61,7 @@ class MeanCountsMethod(base.SingleConditionMethod):
  
     """
     def __init__(self,
+                combined_wig,
                 ctrldata,
                 annotation_path,
                 output_file,
@@ -72,7 +73,7 @@ class MeanCountsMethod(base.SingleConditionMethod):
 
         base.SingleConditionMethod.__init__(self, short_name, long_name, description, label, ctrldata, annotation_path, output_file, normalization=normalization, LOESS=LOESS, NTerminus=NTerminus, CTerminus=CTerminus, wxobj=wxobj)
 
-
+        self.combined_wig = combined_wig # boolean, interprete ctrldata as combined_wig file or comma-separated list of wig files?
 
 
     @classmethod
@@ -110,9 +111,10 @@ class MeanCountsMethod(base.SingleConditionMethod):
         if not output_path: return None
         output_file = open(output_path, "w")
 
+        # could add a checkbox for combined_wig
+        combined_wig = False
 
-
-        return self(ctrldata,
+        return self(combined_wig,ctrldata,
                 annotationPath,
                 output_file,
                 normalization,
@@ -124,8 +126,11 @@ class MeanCountsMethod(base.SingleConditionMethod):
     @classmethod
     def fromargs(self, rawargs): 
         (args, kwargs) = transit_tools.cleanargs(rawargs)
+        print("ARGS="+str(args))
+        print("KWARGS="+str(kwargs))
 
-        ctrldata = args[0].split(",")
+        combined_wig = kwargs.get("c",False)
+        ctrldata = args[0].split(",") 
         annotationPath = args[1]
         outpath = args[2]
         output_file = open(outpath, "w")
@@ -136,7 +141,7 @@ class MeanCountsMethod(base.SingleConditionMethod):
         NTerminus = 0.0
         CTerminus = 0.0
 
-        return self(ctrldata,
+        return self(combined_wig,ctrldata,
                 annotationPath,
                 output_file,
                 normalization,
@@ -152,9 +157,9 @@ class MeanCountsMethod(base.SingleConditionMethod):
         
         #Get orf data
         self.transit_message("Getting Data")
-        (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
-        (fulldata, factors) = norm_tools.normalize_data(fulldata, self.normalization, 
-            self.ctrldata, self.annotation_path)
+        if self.combined_wig: (position, fulldata, datasets) = tnseq_tools.read_combined_wig(self.ctrldata[0])
+        else: (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
+        (fulldata, factors) = norm_tools.normalize_data(fulldata, self.normalization, self.ctrldata, self.annotation_path)
         position = position.astype(int)
 
         hash = transit_tools.get_pos_hash(self.annotation_path)
@@ -171,16 +176,19 @@ class MeanCountsMethod(base.SingleConditionMethod):
 
 
         self.output.write("#Files:\n")
-        for f in self.ctrldata:
+        names = datasets if self.combined_wig else self.ctrldata 
+        for f in names:
             self.output.write("#%s\n" % f)
 
 
         K,Nsites = fulldata.shape
         # Get Gene objects
-        G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, norm=self.normalization)
+        if self.combined_wig: G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, norm=self.normalization,data=fulldata,position=position)
+        else: G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, norm=self.normalization)
         N = len(G)
         self.progress_range(N)
-        dataset_header = "\t".join([transit_tools.fetch_name(D) for D in self.ctrldata])
+        if self.combined_wig: dataset_header = '\t'.join(datasets)
+        else: dataset_header = "\t".join([transit_tools.fetch_name(D) for D in self.ctrldata])
         self.output.write("#Orf\tName\tNumber of TA sites\t%s\n" % dataset_header)
         for i,gene in enumerate(G):
             if gene.n > 0:
@@ -204,7 +212,7 @@ class MeanCountsMethod(base.SingleConditionMethod):
 
     @classmethod
     def usage_string(self):
-        return """python %s export mean_counts <comma-separated .wig files> <annotation .prot_table> <output file>""" % (sys.argv[0])
+        return """python %s export mean_counts <comma-separated .wig files>|<combined_wig> <annotation .prot_table> <output file> [-c]\n note: append -c if inputing a combined_wig file\n""" % (sys.argv[0])
 
 
 if __name__ == "__main__":


=====================================
src/pytransit/transit_gui.py
=====================================
@@ -382,10 +382,14 @@ class MainFrame ( wx.Frame ):
         expLibSizer.Add(self.expLibTip, 0, wx.ALIGN_CENTER_VERTICAL, 5 )
 
 
-        globalSizerVT.Add( nTermSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
-        globalSizerVT.Add( cTermSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
-        globalSizerVT.Add( ctrlLibSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
-        globalSizerVT.Add( expLibSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
+        #globalSizerVT.Add( nTermSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
+        #globalSizerVT.Add( cTermSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
+        #globalSizerVT.Add( ctrlLibSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
+        #globalSizerVT.Add( expLibSizer, 1, wx.ALIGN_CENTER_HORIZONTAL|wx.EXPAND, 5 )
+        globalSizerVT.Add( nTermSizer, 1, wx.EXPAND, 5 )
+        globalSizerVT.Add( cTermSizer, 1, wx.EXPAND, 5 )
+        globalSizerVT.Add( ctrlLibSizer, 1, wx.EXPAND, 5 )
+        globalSizerVT.Add( expLibSizer, 1, wx.EXPAND, 5 )
 
 
 



View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/commit/3b803d0613a70d5cdb6711783eca45db9e55fc51

-- 
View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/commit/3b803d0613a70d5cdb6711783eca45db9e55fc51
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201028/063f6716/attachment-0001.html>


More information about the debian-med-commit mailing list