[med-svn] [Git][med-team/tnseq-transit][upstream] New upstream version 3.3.1

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Tue Oct 24 21:08:13 BST 2023



Étienne Mollier pushed to branch upstream at Debian Med / tnseq-transit


Commits:
b554dbac by Étienne Mollier at 2023-10-24T21:10:41+02:00
New upstream version 3.3.1
- - - - -


21 changed files:

- + .readthedocs.yaml
- CHANGELOG.md
- + src/HMM_conf.py
- + src/allpairs_resampling.py
- src/pytransit/__init__.py
- src/pytransit/analysis/CGI.py
- src/pytransit/analysis/hmm.py
- + src/pytransit/data/iron_combined_wig4.txt
- + src/pytransit/data/iron_samples_metadata.txt
- + src/pytransit/doc/requirements.txt
- + src/pytransit/doc/source/example_scripts.rst
- + src/pytransit/doc/source/file_formats.rst
- src/pytransit/doc/source/index.rst
- src/pytransit/doc/source/method_HMM.rst
- src/pytransit/doc/source/method_anova.rst
- src/pytransit/doc/source/method_resampling.rst
- src/pytransit/doc/source/method_zinb.rst
- src/pytransit/doc/source/transit.rst
- src/pytransit/doc/source/transit_install.rst
- + src/pytransit_resampling.py
- tests/test_analysis_methods.py


Changes:

=====================================
.readthedocs.yaml
=====================================
@@ -0,0 +1,35 @@
+# Read the Docs configuration file for Sphinx projects
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the OS, Python version and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+    # You can also specify other tool versions:
+    # nodejs: "20"
+    # rust: "1.70"
+    # golang: "1.20"
+
+# Build documentation in the "docs/" directory with Sphinx
+sphinx:
+  configuration: src/pytransit/doc/source/conf.py
+  # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs
+  # builder: "dirhtml"
+  # Fail on all warnings to avoid broken references
+  # fail_on_warning: true
+
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats:
+#    - pdf
+#    - epub
+
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+   install:
+   - requirements: src/pytransit/doc/requirements.txt


=====================================
CHANGELOG.md
=====================================
@@ -3,14 +3,23 @@ All notable changes to this project will be documented in this file.
 
 
 
-## Version 3.3.0 2023-08-03
+## Version 3.3.1 (2023-10-18)
+#### Transit:
+
+Minor changes:
+ - changed HMM output filenames to *.sites.txt and *.genes.txt
+ - added a page on File Formats to documentation
+ - added a page on Example Scripts to documentation (for developers, to illustrate use of pytransit package)
+
+
+## Version 3.3.0 (2023-08-03)
 #### Transit:
 
 Major changes:
  - added CRISPRi-DR method for identifying chemical-genetic interactions (CGI) in CRISPRi libraries
 
 
-## Version 3.2.8 2023-07-22
+## Version 3.2.8 (2023-07-22)
 #### TPP:
 
 Small bug fixes:
@@ -19,7 +28,7 @@ Small bug fixes:
  - added link in documentation to make users aware of Transit2
 
 
-## Version 3.2.7 2022-09-22
+## Version 3.2.7 (2022-09-22)
 #### TRANSIT:
 
 Major changes:


=====================================
src/HMM_conf.py
=====================================
@@ -0,0 +1,85 @@
+import sys,numpy
+import scipy.stats
+
+data = []
+all = []
+Sats = {}
+NZmeans = {}
+TAs = {}
+Calls = {}
+for line in open(sys.argv[1]):
+  line = line.strip()
+  if line[0]=='#': continue
+  w = line.split('\t')
+  nTA = int(w[3])
+  if nTA==0: continue
+  votes = [int(x) for x in w[4:8]]
+  m = max(votes)
+  consistency = m/float(nTA)
+  id = w[0]
+  Sats[id] = float(w[-3])
+  NZmeans[id] = float(w[-2])
+  TAs[id] = nTA
+  Calls[id] = w[-1]
+  all.append(consistency)
+  data.append(w)
+
+print("# avg gene-level consistency of HMM states: %s" % (round(numpy.mean(all),4)))
+
+cons = numpy.mean(all)
+
+IDs = [x[0] for x in data]
+
+meanSats,meanNZmeans = {},{}
+stdSats,stdNZmeans = {},{}
+
+print("# state posterior probability distributions:")
+for st in ["ES","GD","NE","GA"]:
+  sub = list(filter(lambda id: Calls[id]==st,IDs))
+  meanSat = numpy.mean([Sats[id] for id in sub])
+  meanNZmean = numpy.mean([NZmeans[id] for id in sub])
+  stdSat = numpy.std([Sats[id] for id in sub])
+  stdNZmean = numpy.std([NZmeans[id] for id in sub])
+  print("#  %s: genes=%s, meanSat=%s, stdSat=%s, meanNZmean=%s, stdNZmean=%s" % (st,len(sub),round(meanSat,3),round(stdSat,3),round(meanNZmean,1),round(stdNZmean,1)))
+  meanSats[st] = meanSat
+  meanNZmeans[st] = meanNZmean
+  stdSats[st] = meanSat
+  stdNZmeans[st] = meanNZmean
+
+def calc_prob(sat,NZmean,meanSat,stdSat,meanNZmean,stdNZmean):
+  a = scipy.stats.norm.pdf(sat,loc=meanSat,scale=stdSat)
+  b = scipy.stats.norm.pdf(NZmean,loc=meanNZmean,scale=stdNZmean)
+  return a*b
+
+# second pass...
+
+for line in open(sys.argv[1]):
+  line = line.strip()
+  if line[0]=='#': print (line); continue
+  w = line.split('\t')
+  id,Call = w[0],w[-1]
+  nTA = int(w[3])
+  if nTA==0: print (line); continue
+  votes = [int(x) for x in w[4:8]]
+  m = max(votes)
+  consistency = m/float(nTA)
+  #prob = scipy.stats.binom.cdf(m,nTA,cons)
+  probs = []
+  STATES = "ES GD NE GA".split()
+  for st in STATES:
+    probs.append(calc_prob(float(w[-3]),float(w[-2]),meanSats[st],stdSats[st],meanNZmeans[st],stdNZmeans[st]))
+  totprob = sum(probs)
+  relprobs = [x/float(totprob) for x in probs]
+  conf = relprobs[STATES.index(Call)]
+  flag=""
+  if conf<0.5: flag="low-confidence"
+  if max(relprobs)<0.7:
+    if (Call=="ES" or Call=="GD") and (relprobs[0]>0.25 and relprobs[1]>0.25): flag="ambiguous"
+    if (Call=="GD" or Call=="NE") and  (relprobs[1]>0.25 and relprobs[2]>0.25) :flag="ambiguous"
+    if (Call=="NE" or Call=="GA") and (relprobs[2]>0.25 and relprobs[3]>0.25):flag="ambiguous"
+  
+  #vals = w+[nTA,m,round(consistency,3),round(prob,4)]
+  vals = w+[round(consistency,3)]+[round(x,6) for x in relprobs]
+  vals += [round(conf,4),flag]
+  print ('\t'.join([str(x) for x in vals]))
+


=====================================
src/allpairs_resampling.py
=====================================
@@ -0,0 +1,75 @@
+import sys,numpy
+import pytransit.transit_tools as transit_tools
+import pytransit.tnseq_tools as tnseq_tools
+import pytransit.stat_tools as stat_tools
+import pytransit.norm_tools as norm_tools
+from statsmodels.stats.multitest import fdrcorrection
+
+# this is a stand-alone script that uses the pytransit library to do resampling between all pairs of conditions in a combined_wig file
+# metadata file indicates which replicates belong to which conditions
+# put transit/src/ in your PYTHON_PATH (e.g. using export in bash, or setenv in csh)
+# prints output to stdout
+
+if len(sys.argv)!=4:
+  print("usage: python3 allpairs_resampling.py <combined_wig_file> <metadata_file> <prot_table>")
+  sys.exit(0)
+
+combined_wig_file = sys.argv[1]
+metadata_file = sys.argv[2]
+prot_table = sys.argv[3]
+
+#################################
+
+print("reading data")
+(sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(combined_wig_file)
+print("data.shape =",data.shape)
+
+print("normalizing using TTR")
+(data, factors) = norm_tools.normalize_data(data, "TTR")
+
+# there is a tnseq_tools.read_samples_metadata(), but it is kind of complicated
+# so just read through tab-separated file and collect sample Filenames associated with Conditions
+# metadata files have at least 3 columns: Id, Filename, Condition
+# the columns in the combined_wig are cross-referenced by Filename (from the original wigs)
+
+Conditions,Samples = [],{}
+CondCol,FnameCol = -1,-1
+for line in open(metadata_file):
+  w = line.rstrip().split('\t') # tab-separated
+  if CondCol==-1: CondCol,FnameCol = w.index("Condition"),w.index("Filename"); continue # error if headers not found
+  cond,fname = w[CondCol],w[FnameCol]
+  if cond not in Conditions: Conditions.append(cond)
+  if cond not in Samples: Samples[cond] = []
+  Samples[cond].append(fname)
+
+print("\nConditions\t: Samples")
+print("-------------------------")
+for i,cond in enumerate(Conditions):
+  print("%s:%-8s" % (i+1,cond),"\t: ",", ".join(Samples[cond]))
+ 
+#################################
+
+genes = tnseq_tools.Genes(None,prot_table,data=data, position=sites) # divides counts at all TAs sites into groups by orf
+
+print()
+print("running resampling on each pair of conditions...")
+print("reporting number of conditionally essential genes (Qval<0.05)")
+for i,condA in enumerate(Conditions):
+  for j,condB in enumerate(Conditions):
+    if i<j:
+      pvals = []
+      for gene in genes:
+        if gene.n==0: continue # skip genes with 0 TA sites
+        idxA = [filenamesInCombWig.index(s) for s in Samples[condA]]
+        idxB = [filenamesInCombWig.index(s) for s in Samples[condB]]
+        data1 = gene.reads[idxA] # counts at TA sites in gene for wigs for condition A
+        data2 = gene.reads[idxB]
+        stats = stat_tools.resampling(data1, data2, adaptive=True) # use defaults for other params
+        (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,  pval_2tail, testlist) = stats # unpack
+        pvals.append(pval_2tail)
+
+      qvals = fdrcorrection(pvals)[1]
+      numhits = sum([q<0.05 for q in qvals])
+      vals = ["%s:%-8s" % (i+1,condA),"%s:%-8s" % (j+1,condB),numhits]
+      print("\t".join([str(x) for x in vals]))
+


=====================================
src/pytransit/__init__.py
=====================================
@@ -2,6 +2,6 @@
 __all__ = ["transit_tools", "tnseq_tools", "norm_tools", "stat_tools"]
 
 
-__version__ = "v3.3.0"
+__version__ = "v3.3.1"
 prefix = "[TRANSIT]"
 


=====================================
src/pytransit/analysis/CGI.py
=====================================
@@ -125,19 +125,25 @@ class CGI_Method(base.SingleConditionMethod):
         cmd,args,kwargs = self.cmd,self.args,self.kwargs
 
         if cmd=="extract_counts":
-            if len(args)<2: print(self.usage_string())
+            if len(args)<2: 
+                print("You have provided incorrect number of args")
+                print(self.usage_string())
             fastq_file = args[0]
             ids_file = args[1]
             self.extract_counts(fastq_file, ids_file)
         
         elif cmd=="create_combined_counts":
-            if len(args)<2: print(self.usage_string())
+            if len(args)<2: 
+                print("You have provided incorrect number of args")
+                print(self.usage_string())
             headers = args[0].split(",")
             counts_file_list = args[1:]
             self.create_combined_counts(headers,counts_file_list)
 
         elif cmd=="extract_abund":
-            if len(args)<7: print(self.usage_string())
+            if len(args)<7: 
+                print("You have provided incorrect number of args")
+                print(self.usage_string())
             combined_counts_file = args[0]
             metadata_file = args[1]
             control_condition=args[2]
@@ -147,17 +153,23 @@ class CGI_Method(base.SingleConditionMethod):
             days = args[6]
             self.extract_abund(combined_counts_file,metadata_file,control_condition,sgRNA_strength_file,no_dep_abund,drug,days)
         elif cmd == "run_model":
-            if len(args)<1: print(self.usage_string())
+            if len(args)<1: 
+                print("You have provided incorrect number of args")
+                print(self.usage_string())
             ifile_path = args[0] #example frac_abund_RIF_D5.txt
             self.run_model(ifile_path)
         elif cmd == "visualize":
-            if len(args)<3: print(self.usage_string())
+            if len(args)<3: 
+                print("You have provided incorrect number of args")
+                print(self.usage_string())
             frac_abund_file= args[0]
             gene = args[1]
             fig_location = args[2]
             self.visualize(frac_abund_file, gene, fig_location)
             
-        else: print(self.usage_string())
+        else: 
+            print("You have not entered a valid command, here are the options")
+            print(self.usage_string())
 
     def reverse_complement(self, seq):
         complement = {'A':'T','T':'A','C':'G','G':'C'}
@@ -291,15 +303,16 @@ class CGI_Method(base.SingleConditionMethod):
         abund_df["sgRNA"] = abund_df.index.values.tolist()
         abund_df[["orf-gene","remaining"]] = abund_df["sgRNA"].str.split('_',n=1,expand=True)
         abund_df[["orf","gene"]]= abund_df["orf-gene"].str.split(':',expand=True)
-        abund_df = abund_df.drop(columns=["orf-gene","remaining","sgRNA"])
+        abund_df = abund_df.drop(columns=["orf-gene","remaining"])
         abund_df = abund_df.dropna()
         
         abund_df.insert(0, "sgRNA strength", abund_df.pop("sgRNA strength"))
         abund_df.insert(0, "uninduced ATC values", abund_df.pop("uninduced ATC values"))
         abund_df.insert(0, 'gene', abund_df.pop('gene'))
         abund_df.insert(0, 'orf', abund_df.pop('orf'))
+        abund_df.insert(0, 'sgRNA', abund_df.pop('sgRNA'))
 
-        abund_df_text = abund_df.to_csv(sep="\t")
+        abund_df_text = abund_df.to_csv(sep="\t", index=False)
         print(abund_df_text)
 
 


=====================================
src/pytransit/analysis/hmm.py
=====================================
@@ -230,12 +230,13 @@ class HMMMethod(base.SingleConditionMethod):
 
         #Get output path
         name = transit_tools.basename(ctrldata[0])
-        defaultFileName = "hmm_output.dat"
+        defaultFileName = "hmm_output_base"
         defaultDir = os.getcwd()
         output_path = wxobj.SaveFile(defaultDir, defaultFileName)
         if not output_path: return None
-        output_file = open(output_path, "w")
-
+        self.sites_fname = output_path+".sites.txt"
+        self.genes_fname = output_path+".genes.txt"
+        output_file = open(self.sites_fname, "w")
 
 
         return self(ctrldata,
@@ -255,7 +256,9 @@ class HMMMethod(base.SingleConditionMethod):
         ctrldata = args[0].split(",")
         annotationPath = args[1]
         outpath = args[2]
-        output_file = open(outpath, "w")
+        self.sites_fname = outpath+".sites.txt"
+        self.genes_fname = outpath+".genes.txt"
+        output_file = open(self.sites_fname, "w")
 
         replicates = kwargs.get("r", "Mean")
         normalization = kwargs.get("n", "TTR")
@@ -409,12 +412,12 @@ class HMMMethod(base.SingleConditionMethod):
 
         self.transit_message("") # Printing empty line to flush stdout 
         self.transit_message("Finished HMM - Sites Method")
-        self.transit_message("Adding File: %s" % (self.output.name))
+        self.transit_message("Adding File: %s" % (self.sites_fname))
         self.add_file(filetype="HMM - Sites")
         
         #Gene Files
         self.transit_message("Creating HMM Genes Level Output")
-        genes_path = ".".join(self.output.name.split(".")[:-1]) + "_genes." + self.output.name.split(".")[-1] 
+        genes_path = self.genes_fname
 
         tempObs = numpy.zeros((1,len(O)))
         tempObs[0,:] = O - 1
@@ -429,7 +432,8 @@ class HMMMethod(base.SingleConditionMethod):
 
     @classmethod
     def usage_string(self):
-        return """python3 %s hmm <comma-separated .wig files> <annotation .prot_table or GFF3> <output file>
+        return """python3 %s hmm <comma-separated .wig files> <annotation .prot_table or GFF3> <output_BASE_filename>
+        (will create 2 output files: BASE.sites.txt and BASE.genes.txt)
 
         Optional Arguments:
             -r <string>     :=  How to handle replicates. Sum, Mean. Default: -r Mean
@@ -556,8 +560,7 @@ class HMMMethod(base.SingleConditionMethod):
 
 
     def post_process_genes(self, data, position, states, output_path):
-
-        output = open(output_path, "w")
+        output = open(self.genes_fname, "w")
         pos2state = dict([(position[t],states[t]) for t in range(len(states))])
         theta = numpy.mean(data > 0)
         G = tnseq_tools.Genes(self.ctrldata, self.annotation_path, data=data, position=position, ignoreCodon=False, nterm=self.NTerminus, cterm=self.CTerminus)
@@ -610,17 +613,10 @@ class HMMMethod(base.SingleConditionMethod):
         output.close()
 
 
-
-    
-
-
-
-
 if __name__ == "__main__":
 
     (args, kwargs) = transit_tools.cleanargs(sys.argv)
 
-
     G = HMMMethod.fromargs(sys.argv[1:])
 
     G.console_message("Printing the member variables:")   


=====================================
src/pytransit/data/iron_combined_wig4.txt
=====================================
The diff for this file was not included because it is too large.

=====================================
src/pytransit/data/iron_samples_metadata.txt
=====================================
@@ -0,0 +1,15 @@
+Id	Filename	Condition
+HighFeMBT	HighFeMBT.wig	HighFeMBT
+HighFeMBT2	HighFeMBT2.wig	HighFeMBT
+HighFeMBT3	HighFeMBT3.wig	HighFeMBT
+LowFeMBT	LowFeMBT.wig	LowFeMBT
+LowFeMBT2	LowFeMBT2.wig	LowFeMBT
+FeCMBT	FeCMBT.wig	FeCMBT
+FeCMBT2	FeCMBT2b.wig	FeCMBT
+Hemin	Hemin.wig	Hemin
+Hemin2	Hemin2b.wig	Hemin
+Hemin3	Hemin3b.wig	Hemin
+Hemoglobin	Hemoglobin.wig	Hemoglobin
+Hemoglobin2	Hemoglobin2b.wig	Hemoglobin
+HeminMBT	HeminMBT.wig	HeminMBT
+HeminMBT2	HeminMBT2.wig	HeminMBT


=====================================
src/pytransit/doc/requirements.txt
=====================================
@@ -0,0 +1,2 @@
+sphinx-rtd-theme
+


=====================================
src/pytransit/doc/source/example_scripts.rst
=====================================
@@ -0,0 +1,244 @@
+Example Scripts (for Developers)
+===============
+
+|
+
+Here are two example python scripts for developers showing how to use the *pytransit* package.
+
+Example 1: src/pytransit_resampling.py
+~~~~~~~~~~~~~~~~~~~~~
+
+This is a stand-alone script that uses the pytransit library to do
+resampling between two groups of wig files, analogous to running
+"python3 transit.py resampling...".  It is a simplified version of
+src/pytransit/analysis/resampling.py, which has more options.  
+
+To run this, cd into your transit/src/ directory.  It prints output to
+stdout. **Be sure to put transit/src/ in your PYTHON_PATH** 
+(e.g. using export in bash, or setenv in csh).
+
+::
+
+  usage: python3 pytransit_resampling.py <comma_separated_wigs_condA> <comma_separated_wigs_condB> <prot_table>
+
+  > cd transit/src/
+  > python3 pytransit_resampling.py pytransit/data/glycerol_H37Rv_rep1.wig,pytransit/data/glycerol_H37Rv_rep2.wig pytransit/data/cholesterol_H37Rv_rep1.wig,pytransit/data/cholesterol_H37Rv_rep2.wig,pytransit/data/cholesterol_H37Rv_rep3.wig pytransit/genomes/H37Rv.prot_table 
+
+The important functions illustrated in this script are:
+
+ * **transit_tools.get_validated_data()**
+ * **norm_tools.normalize_data()**
+ * **stat_tools.resampling()**
+
+.. code-block:: python
+
+  # transit/src/pytransit_resampling.py
+  import sys,numpy
+  import pytransit.transit_tools as transit_tools
+  import pytransit.tnseq_tools as tnseq_tools
+  import pytransit.stat_tools as stat_tools
+  import pytransit.norm_tools as norm_tools
+  from statsmodels.stats.multitest import fdrcorrection
+  
+  if len(sys.argv)!=4:
+    print("usage: python3 pytransit_resampling.py <comma_separated_wigs_condA> <comma_separated_wigs_condB> <prot_table>")
+    sys.exit(0)
+  
+  wigsA = sys.argv[1].split(',')
+  wigsB = sys.argv[2].split(',')
+  nA = len(wigsA)
+  prot_table = sys.argv[3]
+  
+  (counts,sites) = transit_tools.get_validated_data(wigsA+wigsB) # data is a DxN numpy array, D=num datasets, N=num TA sites
+  print(counts.shape)
+
+  print("normalizing data with TTR")
+  (data, factors) = norm_tools.normalize_data(counts, "TTR")
+
+  # this divides the raw data (counts at all TA sites) into a smaller array of counts for each gene based on its coordinates
+ genes = tnseq_tools.Genes(None,prot_table,data=data, position=sites)
+
+  results,pvals = [],[]
+  for gene in genes:
+    ii = gene.reads # coordinates of TA sites in gene
+    if gene.n==0: continue # skip genes with 0 TA sites
+    data1 = gene.reads[:nA] # counts at TA sites in gene for wigs for condition A
+    data2 = gene.reads[nA:]
+    stats = stat_tools.resampling(data1, data2, adaptive=True) # use defaults for other params
+    (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,  pval_2tail, testlist) = stats #     unpack
+    vals = [gene.orf,gene.name,gene.n,round(mean1,1),round(mean2,1),round(log2FC,3),pval_2tail]
+    results.append(vals)
+
+  # do multiple-tests correction
+  pvals = [x[-1] for x in results]
+  qvals = fdrcorrection(pvals)[1]
+  
+  print('\t'.join("orf gene numTA meanA meanB LFC Pval Qval".split()))
+  for vals,qval in zip(results,qvals):
+    print('\t'.join([str(x) for x in vals+[qval]]))
+  print("%s/%s hits (qval<0.05)" % (sum([q<0.05 for q in qvals]),len(results)))
+  
+
+
+
+Example 2: src/allpairs_resampling.py
+~~~~~~~~~~~~~~~~~~~~~
+
+While the example above shows how to read-in and process individual
+wig files, this examples shows how to work with :ref:`combined_wig and
+sample metadata files <combined_wig>`.  It does resampling between 
+each pair of conditions, and prints out a matrix of hits
+(statistically-signficant conditionally-essential genes).
+
+To run this, cd into your transit/src/ directory.  It prints output to
+stdout. **Be sure to put transit/src/ in your PYTHON_PATH** 
+(e.g. using export in bash, or setenv in csh).
+
+The important parts illustrated in this example script are: 
+
+ * the **tnseq_tools.read_combined_wig()** function
+ * how to read the metadata file
+ * selecting counts from the data matrix for the samples associated with each condition
+
+::
+
+  import sys,numpy
+  import pytransit.transit_tools as transit_tools
+  import pytransit.tnseq_tools as tnseq_tools
+  import pytransit.stat_tools as stat_tools
+  import pytransit.norm_tools as norm_tools
+  from statsmodels.stats.multitest import fdrcorrection
+  
+  # this is a stand-alone script that uses the pytransit library to do resampling between all pairs of conditions in a combined_wig file
+  # metadata file indicates which replicates belong to which conditions
+  # put transit/src/ in your PYTHON_PATH (e.g. using export in bash, or setenv in csh)
+  # prints output to stdout
+  
+  if len(sys.argv)!=4:
+    print("usage: python3 allpairs_resampling.py <combined_wig_file> <metadata_file> <prot_table>")
+    sys.exit(0)
+  
+  combined_wig_file = sys.argv[1]
+  metadata_file = sys.argv[2]
+  prot_table = sys.argv[3]
+  
+  #################################
+  
+  print("reading data")
+  (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(combined_wig_file)
+  print("data.shape =",data.shape)
+  
+  print("normalizing using TTR")
+  (data, factors) = norm_tools.normalize_data(data, "TTR")
+  
+  # there is a tnseq_tools.read_samples_metadata(), but it is kind of complicated
+  # so just read through tab-separated file and collect sample Filenames associated with Conditions
+  # metadata files have at least 3 columns: Id, Filename, Condition
+  # the columns in the combined_wig are cross-referenced by Filename (from the original wigs)
+  
+  Conditions,Samples = [],{}
+  CondCol,FnameCol = -1,-1
+  for line in open(metadata_file):
+    w = line.rstrip().split('\t') # tab-separated
+    if CondCol==-1: CondCol,FnameCol = w.index("Condition"),w.index("Filename"); continue # error if headers not found
+    cond,fname = w[CondCol],w[FnameCol]
+    if cond not in Conditions: Conditions.append(cond)
+    if cond not in Samples: Samples[cond] = []
+    Samples[cond].append(fname)
+  
+  print("\nConditions\t: Samples")
+  print("-------------------------")
+  for i,cond in enumerate(Conditions):
+    print("%s:%-8s" % (i+1,cond),"\t: ",", ".join(Samples[cond]))
+   
+  #################################
+  
+  genes = tnseq_tools.Genes(None,prot_table,data=data, position=sites) # divides counts at all TAs sites into groups by orf
+  
+  print()
+  print("running resampling on each pair of conditions...")
+  print("reporting number of conditionally essential genes (Qval<0.05)")
+  for i,condA in enumerate(Conditions):
+    for j,condB in enumerate(Conditions):
+      if i<j:
+        pvals = []
+        for gene in genes:
+          if gene.n==0: continue # skip genes with 0 TA sites
+          idxA = [filenamesInCombWig.index(s) for s in Samples[condA]]
+          idxB = [filenamesInCombWig.index(s) for s in Samples[condB]]
+          data1 = gene.reads[idxA] # counts at TA sites in gene for wigs for condition A
+          data2 = gene.reads[idxB]
+          stats = stat_tools.resampling(data1, data2, adaptive=True) # use defaults for other params
+          (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,  pval_2tail, testlist) = stats # unpack
+          pvals.append(pval_2tail)
+  
+        qvals = fdrcorrection(pvals)[1]
+        numhits = sum([q<0.05 for q in qvals])
+        vals = ["%s:%-8s" % (i+1,condA),"%s:%-8s" % (j+1,condB),numhits]
+        print("\t".join([str(x) for x in vals]))
+  
+
+Here is the output of this script for data from growth
+of M. tuberculosis H37Rv on media containing iron supplied by
+different vehicles (e.g. mycobactin, carboxymycobactin, hemin,
+hemoglobin...), which requires genes in different pathways for uptake
+`(Zhang et al., 2020) <https://pubmed.ncbi.nlm.nih.gov/32069330/>`_.
+The raw data (wig files, with insertion counts at TA sites) have been
+combined into a **combined_wig file** and a **metatdata file** that
+describes which samples belong to which condition.  These files 
+(*iron_combined_wig4.txt* and *iron_samples_metadata.txt*) can be found in
+the transit data directory, transit/src/pytransit/data/.  You can
+also compare this to the heatmap shown on the page for :ref:`corrplot
+<corrplot>`.
+
+::
+
+  > cd transit/src/
+  > python3 allpairs_resampling.py pytransit/data/iron_combined_wig4.txt pytransit/data/iron_samples_metadata.txt pytransit/genomes/H37Rv.prot_table
+  reading data
+  data.shape = (14, 74605)
+  normalizing using TTR
+  
+  Conditions	: Samples
+  -------------------------
+  1:HighFeMBT 	:  HighFeMBT.wig, HighFeMBT2.wig, HighFeMBT3.wig
+  2:LowFeMBT 	:  LowFeMBT.wig, LowFeMBT2.wig
+  3:FeCMBT   	:  FeCMBT.wig, FeCMBT2b.wig
+  4:Hemin    	:  Hemin.wig, Hemin2b.wig, Hemin3b.wig
+  5:Hemoglobin 	:  Hemoglobin.wig, Hemoglobin2b.wig
+  6:HeminMBT 	:  HeminMBT.wig, HeminMBT2.wig
+  
+  running resampling on each pair of conditions...
+  reporting number of conditionally essential genes (Qval<0.05)
+  1:HighFeMBT	2:LowFeMBT	38
+  1:HighFeMBT	3:FeCMBT  	10
+  1:HighFeMBT	4:Hemin   	136
+  1:HighFeMBT	5:Hemoglobin	134
+  1:HighFeMBT	6:HeminMBT	30
+  2:LowFeMBT	3:FeCMBT  	35
+  2:LowFeMBT	4:Hemin   	70
+  2:LowFeMBT	5:Hemoglobin	71
+  2:LowFeMBT	6:HeminMBT	5
+  3:FeCMBT  	4:Hemin   	104
+  3:FeCMBT  	5:Hemoglobin	106
+  3:FeCMBT  	6:HeminMBT	44
+  4:Hemin   	5:Hemoglobin	4
+  4:Hemin   	6:HeminMBT	77
+  5:Hemoglobin	6:HeminMBT	89
+
+
+.. NOTE::
+
+  Note, in allpairs_resampling.py, the FDR correction 
+  to adjust P-values for testing all genes in parallel 
+  is applied within each pairwise comparison.
+  This correction is then repeated independently for each pair
+  of conditions analyzed.  Formally, it would be more rigorous to apply
+  the FDR correction one time at the end, to adjust P-values over all pairs and
+  over all genes (which would be G*N*(N-1)/2 probabilities, where G in
+  the number of genes in the genome, and N is the number of conditions).
+  This would likely further reduce the number of significant
+  conditionally-essential genes.  But for simplicity, this example
+  script does not do that, because it is just designed to illustrate
+  using the *pytransit* package.
+


=====================================
src/pytransit/doc/source/file_formats.rst
=====================================
@@ -0,0 +1,242 @@
+.. _input_files:
+
+File Formats for Transit
+=======================
+
+
+Wig Files
+---------
+
+TPP processes raw sequencing files (.fastq) to extract transposon insertion counts 
+at TA sites.
+These are stored as ".wig" files.
+Wig files primarily have 2 columns of data: a) coordinates of TA sites, 
+and b) the number of insertions observed at those sites.
+By convention, there are also 2 header lines.
+The first header line can be arbitrary info, such as where the data came from.
+The second has to contain "variableStep chrom=" followed by the name of the
+reference genome used for mapping the reads (e.g. H37Rv.fna).
+Here is an example (transit/src/pytransit/data/glycerol_H37Rv_rep2.wig):
+
+::
+  
+  # from Griffin et al, (2011).  PLOS Pathogens, e1002251.
+  variableStep chrom=H37Rv
+  60 0
+  72 0
+  102 0
+  ...
+  1522 124
+  1552 0
+  1635 201
+  1779 0
+  1782 0
+  1788 0
+  1847 0
+  1858 342
+  1921 21
+  ...
+  4411440 99
+  4411479 0
+  4411508 0
+  4411526 0
+
+  (full file has ~75,000 lines)
+
+|
+
+.. _annotation_files:
+
+Prot_Tables (Genome Annotations)
+------------------
+
+The annotation of a genome contains information about genes, such as
+coordinates, strand, locus tag, gene name, and functional description.
+Transit uses a custom format for annotations called "prot_table"s,
+e.g. H37Rv.prot_table.  Prot_tables are **tab-separated text files**
+containing the gene information in 9 specific columns:
+
+**Prot_table file format:**
+
+1. gene function description
+2. start coordinate
+3. end coordinate
+4. strand
+5. length of protein product (in amino acids)
+6. don't care
+7. don't care
+8. gene name (like "dnaA")
+9. ORF id (like Rv0001)
+
+Here is an example (transit/src/pytransit/genomes/H37Rv.prot_table):
+
+::
+
+  chromosomal replication initiation protein 	1	1524	+	507	15607143	885041	dnaA	Rv0001
+  DNA polymerase III subunit beta 	2052	3260	+	402	15607144	887092	dnaN	Rv0002
+  recombination protein F 	3280	4437	+	385	15607145	887089	recF	Rv0003
+  hypothetical protein Rv0004 	4434	4997	+	187	15607146	887088	-	Rv0004
+  DNA gyrase subunit B 	5123	7267	+	714	15607147	887081	gyrB	Rv0005
+  DNA gyrase subunit A 	7302	9818	+	838	15607148	887105	gyrA	Rv0006
+  ... 
+
+  (full file has ~4000 lines)
+  
+
+.. NOTE::
+
+  *It is crucial* that the annotation file (.prot_table) used for
+  analyses in Transit corresponds to exactly the same genome sequence
+  (.fasta or .fna) that was used to generate the .wig files with TPP,
+  because it is used to determine which TA sites are contained in which
+  genes (by coordinates). For example, H37Rv.fna is paired with
+  H37Rv.prot_table, both derived from GenBank sequence NC_000962.3.
+
+
+In many cases, users might often obtain annotations for their genome
+in **.gff (or .gff3)** file format, such as downloaded from NCBI.  .gff
+files contains essentially the same information about genes.  However,
+there is a bit more flexibility in the .gff file format (especially in
+the tags used in the right-most column), and the information about
+genes is not always encoded in a uniform way, making it difficult to
+use arbitrary .gff filess for analyses in Transit.  
+Therefore, there is a
+simple procedure in Transit to convert a .gff file to .prot_table
+format (via GUI or command-line).  This
+step only has to be done once, and then the .prot_table can be used
+for all subsequent analyses in Transit.
+(The routine specifically looks for the 'locus_tag', 'gene', and 'product'
+tags in info field of CDS records.)
+
+::
+
+  > python3 transit.py convert gff_to_prot_table <input.gff_file> <output.prot_table>
+
+
+
+.. _combined_wig:
+
+Combined_wig Files
+------------------
+
+Transit now supports a new file format called 'combined_wig' which basically
+combines multiple wig files into one file (with multiple columns).  This is
+especially useful for managing larger collections of datasets.
+
+Combined_wig files can created through the Transit GUI
+(in the menu: **File -> Export -> Selected Datasets -> to_Combined_Wig**), or via the command line:
+
+::
+
+  Usage:
+
+  python3 transit.py export combined_wig <comma-separated .wig files> <annotation .prot_table> <output file> [-n <norm>]
+
+  Example:
+
+  > cd src/pytransit/data/
+  > python3 ../../transit.py export combined_wig glycerol_H37Rv_rep1.wig,glycerol_H37Rv_rep2.wig,cholesterol_H37Rv_rep1.wig,cholesterol_H37Rv_rep2.wig,cholesterol_H37Rv_rep3.wig ../genomes/H37Rv.prot_table glyc_chol.combined_wig.txt
+
+
+You can specify the normalization method you want to use with a flag.
+TTR is the default, but other relevant normalization options would be 'nonorm'
+(i.e. preserve raw counts) and 'betageom' (this corrects for skew, but is slow).
+
+The format of a combined_wig is simply a multi-column (tab-separated) file with
+the first column being the coordinates of TA sites, followed by 
+N columns of counts (for N samples), possibly with a final column indicating
+the gene annotation information.
+A combined_wig file can have header lines, prefixed by '#'.
+
+Importantly, a combined_wig file must include sample identifiers
+(filenames) that are prefixed with **"#File: "**.  These header lines
+are automatically included by the 'export combined_wig' Transit
+command the creates a combined_wig from multiple .wig files.  These "#File:"
+header lines should be given in the same order as the sample columns,
+and the filenames are used as identifiers to cross-reference
+information about each sample in the metadata file (see below).
+
+Here is the output file from the example above (glyc_chol.combined_wig.txt):
+
+::
+
+ #Converted to CombinedWig with TRANSIT.
+ #normalization method: TTR
+ #Normalization Factors: 2.286197257382532 1.4371695843583265 2.6365107688705036 3.2329912278198036 2.865270203473558
+ #RefGenome: H37Rv
+ #File: glycerol_H37Rv_rep1.wig
+ #File: glycerol_H37Rv_rep2.wig
+ #File: cholesterol_H37Rv_rep1.wig
+ #File: cholesterol_H37Rv_rep2.wig
+ #File: cholesterol_H37Rv_rep3.wig
+ #TA_coord	glycerol_H37Rv_rep1.wig	glycerol_H37Rv_rep2.wig	cholesterol_H37Rv_rep1.wig	cholesterol_H37Rv_rep2.wig	cholesterol_H37Rv_rep3.wig
+ 60	0.0	0.0	0.0	0.0	0.0	Rv0001 (dnaA)
+ 72	0.0	0.0	0.0	0.0	0.0	Rv0001 (dnaA)
+ ...
+ 1345	0.0	0.0	0.0	0.0	0.0	Rv0001 (dnaA)
+ 1423	0.0	0.0	0.0	0.0	0.0	Rv0001 (dnaA)
+ 1522	0.0	178.2	0.0	0.0	0.0	Rv0001 (dnaA)
+ 1552	61.7	0.0	0.0	29.1	0.0	
+ 1635	32.0	288.9	142.4	22.6	0.0	
+ ...
+
+ (full file has ~75,000 lines)
+
+(DnaA is essential, which is why there are no insertion counts in the first few TA sites)
+
+Note that the ORF id and gene names are appended for TA sites in CDS regions, for convenience.
+If you open a combined_wig file in Excel (with tab as separator character), you will
+see the last line of the header (starting with "TAcoord...") provides the input wig filenames as column headers.
+
+The '#RefGenome' is extracted from information in the individual .wig files,
+which records the reference genome sequence that was used with TPP; 
+the coordinates of TA sites are determined from the genome sequence.
+The **reference genome must be the same for all wigs** being combined.
+
+
+|
+
+
+.. _metadata_files:
+
+Samples Metadata File
+---------------------
+
+The **metadata** file describes the sample ids, filenames,
+and conditions they represent (e.g. different media, growth
+conditions, knockout strains, animal passaging, etc., or whatever
+treatments and controls your TnSeq experiment involves) for all the
+samples in a combined_wig file.  
+
+Format of the *samples_metadata* file: a tab-separated file (which you
+can edit in Excel) with 3 columns: Id, Condition, and Filename (it
+must have these headers). The Condition column should be as specific
+as possible, indicating **how to group replicates**.
+You can include other columns of info, but
+do not include additional rows.  Individual rows can be commented out
+by prefixing them with a '#'.  Here is an example of a samples
+metadata file: The filenames should match what is shown in the header
+of the combined_wig (including pathnames, if present).
+
+Note: the Condition column should have a unique label for each
+distinct condition (the same label shared only among replicates).  If
+there are attributes that distinguish the conditions (such as strain,
+treatment, etc), they could be included as **additional columns**
+(e.g. covariates, like Carbon_source or Drug or Days or Strain...).
+
+::
+
+  Id      Condition    Filename
+  glyc1   glycerol     /Users/example_data/glycerol_rep1.wig
+  glyc2   glycerol     /Users/example_data/glycerol_rep2.wig
+  chol1   cholesterol  /Users/example_data/cholesterol_rep1.wig
+  chol2   cholesterol  /Users/example_data/cholesterol_rep2.wig
+  chol2   cholesterol  /Users/example_data/cholesterol_rep3.wig
+
+
+
+
+|
+
+.. rst-class:: transit_sectionend
+----


=====================================
src/pytransit/doc/source/index.rst
=====================================
@@ -79,6 +79,7 @@ TRANSIT offers a variety of features including:
    transit_install
    transit_running
    transit_features
+   file_formats
    
 .. toctree::
    :maxdepth: 3
@@ -135,6 +136,7 @@ TRANSIT offers a variety of features including:
    :caption: Code Documentation
 
    transit
+   example_scripts
 
 * :ref:`genindex`
 * :ref:`modindex`


=====================================
src/pytransit/doc/source/method_HMM.rst
=====================================
@@ -20,13 +20,15 @@ How does it work?
 |
 
 
-Example
--------
+Usage
+-----
 
 ::
 
 
-  python3 transit.py hmm <comma-separated .wig files> <annotation .prot_table or GFF3> <output file>
+  > python3 transit.py hmm <comma-separated .wig files> <annotation .prot_table or GFF3> <output_BASE_filename>
+        (will create 2 output files: BASE.sites.txt and BASE.genes.txt)
+
         Optional Arguments:
             -r <string>     :=  How to handle replicates. Sum, Mean. Default: -r Mean
             -l              :=  Perform LOESS Correction; Helps remove possible genomic position bias. Default: Off.
@@ -52,7 +54,7 @@ replicate datasets:
 Output and Diagnostics
 ----------------------
 
-| The HMM method outputs two files. The first file provides the most
+| The HMM method outputs two files. The first file (**BASE.sites.txt**) provides the most
   likely assignment of states for all the TA sites in the genome. Sites
   can belong to one of the following states: "E" (Essential), "GD"
   (Growth-Defect), "NE" (Non-Essential), or "GA" (Growth-Advantage). In
@@ -81,7 +83,7 @@ Output and Diagnostics
 +------------+-----------------------------------------------------------------------------------------------------+
 
 |
-|  The second file provides a gene-level classification for all the
+|  The second file (**BASE.genes.txt**) provides a gene-level classification for all the
   genes in the genome. Genes are classified as "E" (Essential), "GD"
   (Growth-Defect), "NE" (Non-Essential), or "GA" (Growth-Advantage)
   depending on the number of sites within the gene that belong to those


=====================================
src/pytransit/doc/source/method_anova.rst
=====================================
@@ -46,25 +46,16 @@ Example
 The output file generated by ANOVA identifies which genes exhibit statistically
 significant variability in counts across conditions (see Output and Diagnostics below).
 
-Note: the combined_wig input file can be generated from multiple wig
-files through the Transit GUI
-(File->Export->Selected_Datasets->Combined_wig), or via the 'export'
-command on the command-line (see combined_wig_).
 
-Format of the **samples metadata file**: a tab-separated file (which you can edit in Excel)
-with 3 columns: Id, Condition, and Filename (it must have these headers).  You can include
-other columns of info, but do not include additional rows.  Individual rows can be
-commented out by prefixing them with a '#'.  Here is an example of a samples metadata file:
-The filenames should match what is shown in the header of the combined_wig (including pathnames, if present).
+Input Files
+-----------
 
-::
+Anova takes a **combined_wig file** and **metadata file** as input.
+The combined_wig file contains insertion counts for multiple conditions and replicates.
+The metadata file indicates which replicates go with which experimental conditions.
+
+See description of :ref:`combined_wig and metadata file formats <combined_wig>`.
 
-  ID      Condition    Filename
-  glyc1   glycerol     /Users/example_data/glycerol_rep1.wig
-  glyc2   glycerol     /Users/example_data/glycerol_rep2.wig
-  chol1   cholesterol  /Users/example_data/cholesterol_rep1.wig
-  chol2   cholesterol  /Users/example_data/cholesterol_rep2.wig
-  chol2   cholesterol  /Users/example_data/cholesterol_rep3.wig
 
 
 


=====================================
src/pytransit/doc/source/method_resampling.rst
=====================================
@@ -189,11 +189,12 @@ making it difficult to make confident calls on essentiality.
 Doing resampling with a combined_wig file
 -----------------------------------------
 
-Resampling can also now take a combined_wig_ file as input (containing insertion counts
-for multiple sample), along with a samples_metadata_ file
+Resampling can also now take a **combined_wig** file as input (containing insertion counts
+for multiple sample), along with a **samples_metadata** file
 that describes the samples. This mode is indicated with a '-c' flag.
 If you want to compare more than two conditions, see :ref:`ZINB <zinb>`.
 
+See description of :ref:`combined_wig and metadata file formats <combined_wig>`.
 
 ::
 


=====================================
src/pytransit/doc/source/method_zinb.rst
=====================================
@@ -61,55 +61,16 @@ Example
         --gene <Orf id or Gene name>:= Run method for one gene and print model output.
 
 
-.. _combined_wig:
 
-Combined wig files
-------------------
+Input Files
+-----------
 
-Transit now supports a new file format called 'combined_wig' which basically
-combines multiple wig files into one file (with multiple columns).  This is
-used for some of the new analysis methods for larger collections of datasets, like :ref:`Anova <anova>`, :ref:`ZINB <zinb>`.
-Combined_wig files can created through the Transit GUI
-(File->Export->Selected_Datasets->Combined_wig), or via the command line.
-You can specify the normalization method you want to use with a flag.
-TTR is the default, but other relevant normalization options would be 'nonorm'
-(i.e. preserve raw counts) and 'betageom' (this corrects for skew, but is slow).
+ZINB takes a **combined_wig file** and **metadata file** as input.
+The combined_wig file contains insertion counts for multiple conditions and replicates.
+The metadata file indicates which replicates go with which experimental conditions.
 
+See description of :ref:`combined_wig and metadata file formats <combined_wig>`.
 
-::
-
-  > python3 src/transit.py export combined_wig --help
-
-  usage: python3 src/transit.py export combined_wig <comma-separated .wig files> <annotation .prot_table> <output file>
-
-  > python3 ../transit/src/transit.py export combined_wig Rv_1_H37RvRef.wig,Rv_2_H37RvRef.wig,Rv_3_H37RvRef.wig H37Rv.prot_table clinicals_combined_TTR.wig -n TTR
-
-
-
-.. _samples_metadata:
-
-Samples Metadata File
----------------------
-
-Format of the *samples_metadata* file: a tab-separated file (which you
-can edit in Excel) with 3 columns: Id, Condition, and Filename (it
-must have these headers).  You can include other columns of info, but
-do not include additional rows.  Individual rows can be commented out
-by prefixing them with a '#'.  Here is an example of a samples
-metadata file: The filenames should match what is shown in the header
-of the combined_wig (including pathnames, if present).
-
-Note: the Condition column should have a unique label for each distinct condition (the same label shared only among replicates).
-If there are attributes that distinguish the conditions (such as strain, treatment, etc), they could be included as additional columns (e.g. covariates).
-
-::
-
-  ID      Condition    Filename
-  glyc1   glycerol     /Users/example_data/glycerol_rep1.wig
-  glyc2   glycerol     /Users/example_data/glycerol_rep2.wig
-  chol1   cholesterol  /Users/example_data/cholesterol_rep1.wig
-  chol2   cholesterol  /Users/example_data/cholesterol_rep2.wig
-  chol2   cholesterol  /Users/example_data/cholesterol_rep3.wig
 
 Parameters
 ----------


=====================================
src/pytransit/doc/source/transit.rst
=====================================
@@ -1,4 +1,4 @@
-transit package
+pytransit package
 ===============
 
 |


=====================================
src/pytransit/doc/source/transit_install.rst
=====================================
@@ -246,6 +246,22 @@ wxPython 4+ can be installed using pip
 If the above command fails and you already have wxPython < 4.0 installed, you may have to manually remove it.
 See https://stackoverflow.com/questions/50688630/cannot-uninstall-wxpython-3-0-2-0-macos for details.
 
+.. NOTE::
+
+  Installing *wxPython* can be a bit finicky.  It might require installing the
+  development version of GTK first.  There are at least two versions currently, 
+  *gtk2* and *gtk3*.
+  Transit should work with both, although there can be small differences in the 
+  visual look of the GUI.  To get *wxPython* to install, you might try doing this:
+
+    > sudo apt-get install libgtk-2-dev
+
+    or
+
+    > sudo apt-get install libgtk-3-dev
+
+  depending on which version of *libgtk* you have installed.
+
 .. _transit-troubleshoot:
 
 Troubleshooting


=====================================
src/pytransit_resampling.py
=====================================
@@ -0,0 +1,48 @@
+import sys,numpy
+import pytransit.transit_tools as transit_tools
+import pytransit.tnseq_tools as tnseq_tools
+import pytransit.stat_tools as stat_tools
+import pytransit.norm_tools as norm_tools
+from statsmodels.stats.multitest import fdrcorrection
+
+# this is a stand-alone script that uses the pytransit library to do resampling between 2 groups of wig files
+# analogous to running "python3 transit.py resampling..."; see also src/pytransit/analysis/resampling.py
+# put transit/src/ in your PYTHON_PATH (e.g. using export in bash, or setenv in csh)
+# prints output to stdout
+
+if len(sys.argv)!=4:
+  print("usage: python3 pytransit_resampling.py <comma_separated_wigs_condA> <comma_separated_wigs_condB> <prot_table>")
+  sys.exit(0)
+
+wigsA = sys.argv[1].split(',')
+wigsB = sys.argv[2].split(',')
+nA = len(wigsA)
+prot_table = sys.argv[3]
+
+(counts,sites) = transit_tools.get_validated_data(wigsA+wigsB) # data is a DxN numpy array, D=num datasets, N=num TA sites
+print("counts.shape =",counts.shape)
+
+print("normalizing data with TTR")
+(data, factors) = norm_tools.normalize_data(counts, "TTR")
+
+# this divides the raw data (counts at all TA sites) into a smaller array of counts for each gene based on its coordinates
+genes = tnseq_tools.Genes(None,prot_table,data=data, position=sites)
+
+results,pvals = [],[]
+for gene in genes:
+  if gene.n==0: continue # skip genes with 0 TA sites
+  data1 = gene.reads[:nA] # counts at TA sites in gene for wigs for condition A
+  data2 = gene.reads[nA:]
+  stats = stat_tools.resampling(data1, data2, adaptive=True) # use defaults for other params
+  (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,  pval_2tail, testlist) = stats # unpack
+  vals = [gene.orf,gene.name,gene.n,round(mean1,1),round(mean2,1),round(log2FC,3),pval_2tail]
+  results.append(vals)
+
+# do FDR correction
+pvals = [x[-1] for x in results]
+qvals = fdrcorrection(pvals)[1]
+
+print('\t'.join("orf gene numTA meanA meanB LFC Pval Qval".split()))
+for vals,qval in zip(results,qvals):
+  print('\t'.join([str(x) for x in vals+[qval]]))
+print("%s/%s hits (qval<0.05)" % (sum([q<0.05 for q in qvals]),len(results)))


=====================================
tests/test_analysis_methods.py
=====================================
@@ -63,8 +63,8 @@ class TestMethods(TransitTestCase):
         args = [mini_wig, small_annotation, output]
         G = HMMMethod.fromargs(args)
         G.Run()
-        self.assertTrue(os.path.exists(output))
-        genes_path = output.rsplit(".", 1)[0] + "_genes." + output.rsplit(".", 1)[1]
+        self.assertTrue(os.path.exists(output+".sites.txt"))
+        genes_path = output+".genes.txt"
         self.assertTrue(os.path.exists(genes_path))
 
 



View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/commit/b554dbac69556f76636a212cc8bee9f1ea5f516b

-- 
View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/commit/b554dbac69556f76636a212cc8bee9f1ea5f516b
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20231024/888dae3b/attachment-0001.htm>


More information about the debian-med-commit mailing list