[med-svn] [Git][med-team/tnseq-transit][master] 5 commits: initialize changelog.

Étienne Mollier (@emollier) gitlab at salsa.debian.org
Sat Oct 22 11:59:50 BST 2022



Étienne Mollier pushed to branch master at Debian Med / tnseq-transit


Commits:
0f437cac by Étienne Mollier at 2022-10-22T12:39:20+02:00
initialize changelog.

- - - - -
681b8e92 by Étienne Mollier at 2022-10-22T12:39:54+02:00
routine-update: New upstream version

- - - - -
039a7a34 by Étienne Mollier at 2022-10-22T12:39:56+02:00
New upstream version 3.2.7
- - - - -
de1e7c11 by Étienne Mollier at 2022-10-22T12:42:35+02:00
Update upstream source from tag 'upstream/3.2.7'

Update to upstream version '3.2.7'
with Debian dir 714089807149b98fad361670647c343b91efa74a
- - - - -
e7fcdb59 by Étienne Mollier at 2022-10-22T12:59:11+02:00
ready to upload to unstable.

- - - - -


10 changed files:

- CHANGELOG.md
- debian/changelog
- src/pytransit/__init__.py
- src/pytransit/analysis/corrplot.py
- src/pytransit/analysis/heatmap.py
- src/pytransit/analysis/resampling.py
- src/pytransit/doc/source/method_resampling.rst
- src/pytransit/stat_tools.py
- src/pytransit/tnseq_tools.py
- src/pytransit/transit_gui.py


Changes:

=====================================
CHANGELOG.md
=====================================
@@ -2,7 +2,14 @@
 All notable changes to this project will be documented in this file.
 
 
-## Version 3.2.6 2021-08-03
+## Version 3.2.7 2022-09-22
+#### TRANSIT:
+
+Major changes:
+ - implemented site-restricted (S-R) resampling (as a checkbox in the GUI, and '-sr' flag on the command line)
+
+
+## Version 3.2.6 2022-08-03
 #### TRANSIT:
 
 Major changes:
@@ -14,7 +21,7 @@ Minor changes:
  - updated 'export combined_wig' to include ref genome and column headers
 
 
-## Version 3.2.5 2021-06-15
+## Version 3.2.5 2022-06-15
 #### TRANSIT:
 
 Minor changes:
@@ -23,7 +30,7 @@ Minor changes:
  - added rpy2 warning (if not installed) for corrplot and heatmap
 
 
-## Version 3.2.4 2021-06-05
+## Version 3.2.4 2022-06-05
 #### TRANSIT:
 
 Major changes:


=====================================
debian/changelog
=====================================
@@ -1,3 +1,13 @@
+tnseq-transit (3.2.7-1) unstable; urgency=medium
+
+  [ Nilesh Patra ]
+  * Remove myself from uploaders
+
+  [ Étienne Mollier ]
+  * New upstream version
+
+ -- Étienne Mollier <emollier at debian.org>  Sat, 22 Oct 2022 12:39:54 +0200
+
 tnseq-transit (3.2.6-1) unstable; urgency=medium
 
   * Remove explicit debhelper Build-Depends which was automatically added


=====================================
src/pytransit/__init__.py
=====================================
@@ -2,6 +2,6 @@
 __all__ = ["transit_tools", "tnseq_tools", "norm_tools", "stat_tools"]
 
 
-__version__ = "v3.2.6"
+__version__ = "v3.2.7"
 prefix = "[TRANSIT]"
 


=====================================
src/pytransit/analysis/corrplot.py
=====================================
@@ -146,7 +146,8 @@ class CorrplotMethod(base.SingleConditionMethod):
             if n==-1: 
               # ANOVA header line has names of conditions, organized as 3+2*n+3 (2 groups (means, LFCs) X n conditions)
               # ZINB header line has names of conditions, organized as 3+4*n+3 (4 groups X n conditions)
-              if self.filetype=="anova": n = int((len(w)-6)/2) 
+              # it would be better to read the column headers and look for "LFC_<cond>"
+              if self.filetype=="anova": n = int((len(w)-8)/2) 
               elif self.filetype=="zinb": n = int((len(headers)-6)/4) 
               headers = headers[3:3+n]
               headers = [x.replace("Mean_","") for x in headers]


=====================================
src/pytransit/analysis/heatmap.py
=====================================
@@ -137,7 +137,7 @@ class HeatmapMethod(base.SingleConditionMethod):
           if n==-1: 
             # ANOVA header line has names of conditions, organized as 3+2*n+3 (2 groups (means, LFCs) X n conditions)
             # ZINB header line has names of conditions, organized as 3+4*n+3 (4 groups X n conditions)
-            if self.filetype=="anova": n = int((len(w)-6)/2) 
+            if self.filetype=="anova": n = int((len(w)-8)/2) 
             elif self.filetype=="zinb": n = int((len(headers)-6)/4) 
             headers = headers[3:3+n]
             headers = [x.replace("Mean_","") for x in headers]
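
A side note on the (len(w)-8)/2 arithmetic changed above in corrplot.py and heatmap.py: as the in-code comment says, it would be more robust to derive the number of conditions from the header names themselves. A minimal sketch of that idea, assuming per-condition columns named "LFC_<cond>" as in the comment (the full ANOVA/ZINB column layout is not shown in this diff):

    def count_conditions(header_line, prefix="LFC_"):
        """Count columns whose names carry a per-condition prefix, e.g. LFC_<cond>."""
        cols = header_line.rstrip("\n").split("\t")
        return sum(1 for c in cols if c.startswith(prefix))

    # e.g. a hypothetical ANOVA header with two conditions A and B:
    # count_conditions("Rv\tGene\tTAs\tMean_A\tMean_B\tLFC_A\tLFC_B\tpval\tpadj\tstatus")  # -> 2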


=====================================
src/pytransit/analysis/resampling.py
=====================================
@@ -162,11 +162,18 @@ class ResamplingGUI(base.AnalysisGUI):
         (self.wxobj.resamplingHistogramCheckBox, histSizer) = self.defineCheckBox(resamplingPanel, labelText="Generate Resampling Histograms", widgetCheck=False, widgetSize=(-1,-1), tooltipText="Creates .png images with the resampling histogram for each of the ORFs. Histogram images are created in a folder with the same name as the output file.")
         resamplingSizer.Add(histSizer, 0, wx.EXPAND, 5 )
 
-
         # Zeros Check
         (self.wxobj.resamplingZeroCheckBox, zeroSizer) = self.defineCheckBox(resamplingPanel, labelText="Include sites with all zeros", widgetCheck=True, widgetSize=(-1,-1), tooltipText="Includes sites that are empty (zero) across all datasets. Unchecking this may be useful for tn5 datasets, where all nucleotides are possible insertion sites and will have a large number of empty sites (significantly slowing down computation and affecting estimates).")
         resamplingSizer.Add(zeroSizer, 0, wx.EXPAND, 5 )
 
+        # site-restricted checkbox
+        (self.wxobj.siteRestrictedCheckBox, srSizer) = self.defineCheckBox(resamplingPanel, labelText="Site-restricted resampling", widgetCheck=True, widgetSize=(-1,-1), tooltipText="Restrict permutations of insertion counts in a gene to each individual TA site, which could be more sensitive (detect more conditional-essentials) than permuting counts over all TA sites pooled (which is the default).")
+        resamplingSizer.Add(srSizer, 0, wx.EXPAND, 5 )
+
+        # winsorization checkbox
+        (self.wxobj.winsorizeCheckBox, winzSizer) = self.defineCheckBox(resamplingPanel, labelText="Winsorize", widgetCheck=False, widgetSize=(-1,-1), tooltipText="Replace the highest insertion count in each gene with the second highest count; this can help reduce the impact of noise (e.g. spurious outlier counts).")
+        resamplingSizer.Add(winzSizer, 0, wx.EXPAND, 5 )
+
 
         resamplingButton = wx.Button( resamplingPanel, wx.ID_ANY, u"Run resampling", wx.DefaultPosition, wx.DefaultSize, 0 )
         resamplingSizer.Add( resamplingButton, 0, wx.ALL|wx.ALIGN_CENTER_HORIZONTAL, 5 )
@@ -218,6 +225,7 @@ class ResamplingMethod(base.DualConditionMethod):
                 ctrl_lib_str="",
                 exp_lib_str="",
                 winz = False,
+                site_restricted = False,
                 wxobj=None, Z = False, diffStrains = False, annotation_path_exp = "", combinedWigParams = None):
 
         base.DualConditionMethod.__init__(self, short_name, long_name, short_desc, long_desc, ctrldata, expdata, annotation_path, output_file, normalization=normalization, replicates=replicates, LOESS=LOESS, NTerminus=NTerminus, CTerminus=CTerminus, wxobj=wxobj)
@@ -234,6 +242,8 @@ class ResamplingMethod(base.DualConditionMethod):
         self.annotation_path_exp = annotation_path_exp if diffStrains else annotation_path
         self.combinedWigParams = combinedWigParams
         self.winz = winz
+        self.site_restricted = site_restricted
+        self.transit_message("site_restricted=%s" % site_restricted)
 
     @classmethod
     def fromGUI(self, wxobj):
@@ -273,6 +283,8 @@ class ResamplingMethod(base.DualConditionMethod):
         doHistogram = wxobj.resamplingHistogramCheckBox.GetValue()
 
         includeZeros = wxobj.resamplingZeroCheckBox.GetValue()
+        site_restricted = wxobj.siteRestrictedCheckBox.GetValue()
+        winz = wxobj.winsorizeCheckBox.GetValue()
         pseudocount = float(wxobj.resamplingPseudocountText.GetValue())
         LOESS = wxobj.resamplingLoessCheck.GetValue()
 
@@ -312,7 +324,10 @@ class ResamplingMethod(base.DualConditionMethod):
                 NTerminus,
                 CTerminus,
                 ctrl_lib_str,
-                exp_lib_str, wxobj=wxobj, Z = False, diffStrains = diffStrains, annotation_path_exp = annotationPathExp)
+                exp_lib_str, 
+                winz = winz,
+                site_restricted = site_restricted,
+                wxobj=wxobj, Z = False, diffStrains = diffStrains, annotation_path_exp = annotationPathExp)
 
     @classmethod
     def fromargs(self, rawargs):
@@ -359,7 +374,7 @@ class ResamplingMethod(base.DualConditionMethod):
         output_file = open(output_path, "w")
 
         # check for unrecognized flags
-        flags = "-c -s -n -h -a -ez -PC -l -iN -iC --ctrl_lib --exp_lib -Z -winz".split()
+        flags = "-c -s -n -h -a -ez -PC -l -iN -iC --ctrl_lib --exp_lib -Z -winz -sr".split()
         for arg in rawargs:
           if arg[0]=='-' and arg not in flags:
             self.transit_error("flag unrecognized: %s" % arg)
@@ -373,6 +388,7 @@ class ResamplingMethod(base.DualConditionMethod):
         replicates = kwargs.get("r", "Sum")
         excludeZeros = kwargs.get("ez", False)
         includeZeros = not excludeZeros
+        site_restricted = kwargs.get("sr", False)
         pseudocount = float(kwargs.get("PC", 1.0)) # use -PC (new semantics: for LFCs) instead of -pc (old semantics: fake counts)
 
         Z = True if "Z" in kwargs else False
@@ -403,6 +419,7 @@ class ResamplingMethod(base.DualConditionMethod):
                 ctrl_lib_str,
                 exp_lib_str, 
                 winz = winz,
+                site_restricted = site_restricted,
                 Z = Z, diffStrains = diffStrains, annotation_path_exp = annotationPathExp, combinedWigParams = combinedWigParams)
 
     def preprocess_data(self, position, data):
@@ -455,6 +472,8 @@ class ResamplingMethod(base.DualConditionMethod):
             print("Error: cannot do histograms")
             self.doHistogram = False
 
+        if self.ctrl_lib_str and self.site_restricted:
+          raise Exception("Cannot do site_restricted resampling with library strings at same time")
 
         self.transit_message("Starting resampling Method")
         start_time = time.time()
@@ -538,13 +557,18 @@ class ResamplingMethod(base.DualConditionMethod):
             self.output.write("#GUI with: norm=%s, samples=%s, pseudocounts=%1.2f, adaptive=%s, histogram=%s, includeZeros=%s, output=%s\n" % (self.normalization, self.samples, self.pseudocount, self.adaptive, self.doHistogram, self.includeZeros, self.output.name.encode('utf-8')))
         else:
             self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))
-        self.output.write("#Parameters: samples=%s, norm=%s, histograms=%s, adaptive=%s, excludeZeros=%s, pseudocounts=%s, LOESS=%s, trim_Nterm=%s, trim_Cterm=%s\n" % (self.samples,self.normalization, self.doHistogram, self.adaptive,not self.includeZeros,self.pseudocount,self.LOESS,self.NTerminus,self.CTerminus))
+        self.output.write("#Parameters: samples=%s, norm=%s, histograms=%s, adaptive=%s, excludeZeros=%s, pseudocounts=%s, LOESS=%s, trim_Nterm=%s, trim_Cterm=%s, site_restricted=%s, winsorize=%s\n" % (self.samples,self.normalization, self.doHistogram, self.adaptive,not self.includeZeros,self.pseudocount,self.LOESS,self.NTerminus,self.CTerminus,self.site_restricted,self.winz))
         self.output.write("#Control Data: %s\n" % (",".join(self.ctrldata).encode('utf-8')))
         self.output.write("#Experimental Data: %s\n" % (",".join(self.expdata).encode('utf-8')))
         self.output.write("#Annotation path: %s %s\n" % (self.annotation_path.encode('utf-8'), self.annotation_path_exp.encode('utf-8') if self.diffStrains else ''))
-        self.output.write("#Time: %s\n" % (time.time() - start_time))
-        #Z = True # include Z-score column in resampling output?
+
+        nhits = len(list(filter(lambda x: x<0.05,qval)))
+        result_msg = "Number of significant conditionally essential genes (Padj<0.05): %s" % nhits
+        self.output.write("#%s\n" % result_msg)
+        self.transit_message(result_msg)
+
         global columns # consider redefining columns above (for GUI)
+        #Z = True # include Z-score column in resampling output?
         if self.Z==True: columns = ["Orf","Name","Desc","Sites","Mean Ctrl","Mean Exp","log2FC", "Sum Ctrl", "Sum Exp", "Delta Mean","p-value","Z-score","Adj. p-value"]
         self.output.write("#%s\n" % "\t".join(columns))
 
@@ -558,20 +582,26 @@ class ResamplingMethod(base.DualConditionMethod):
               if log2FC>0: z *= -1
               self.output.write("%s\t%s\t%s\t%d\t%1.1f\t%1.1f\t%1.2f\t%1.1f\t%1.2f\t%1.1f\t%1.5f\t%0.2f\t%1.5f\n" % (orf, name, desc, n, mean1, mean2, log2FC, sum1, sum2, test_obs, pval_2tail, z, qval[i]))
             else: self.output.write("%s\t%s\t%s\t%d\t%1.1f\t%1.1f\t%1.2f\t%1.1f\t%1.2f\t%1.1f\t%1.5f\t%1.5f\n" % (orf, name, desc, n, mean1, mean2, log2FC, sum1, sum2, test_obs, pval_2tail, qval[i]))
-        self.output.close()
 
-        self.transit_message("Adding File: %s" % (self.output.name))
-        self.add_file(filetype="Resampling")
+        self.output.close()
 
-    def winsorize_resampling(self, counts):
-      # input is insertion counts for gene as pre-flattened numpy array
-      counts = counts.tolist()
-      if len(counts)<3: return counts
+        if self.wxobj: # only need to do this in the GUI
+          self.transit_message("Adding File: %s" % (self.output.name)) 
+          self.add_file(filetype="Resampling")
+        self.transit_message("Time: %0.2fs" % (time.time() - start_time)) # maybe append time elapsed to the "Finished Resampling" message?
+
+    def winsorize_for_resampling(self, data):
+      # input is a 2D array of insertion counts for gene (not pre-flattened)
+      shp = data.shape
+      assert len(shp)==2, "winsorize_for_resampling() expected 2D numpy array"
+      counts = data.flatten().tolist()
+      if len(counts)<3: return data
       s = sorted(counts,reverse=True)
-      if s[1]==0: return counts # don't do anything if there is only 1 non-zero value
+      if s[1]==0: return data # don't do anything if there is only 1 non-zero value
       c2 = [s[1] if x==s[0] else x for x in counts]
-      return numpy.array(c2)
+      return numpy.array(c2).reshape(shp)
 
+      # the old way of winsorizing...
       #unique_counts = numpy.unique(counts)
       #if (len(unique_counts) < 2): return counts
       #else:
@@ -613,14 +643,14 @@ class ResamplingMethod(base.DualConditionMethod):
                     ii_exp = numpy.ones(gene_exp.n) == 1
 
                 #data1 = gene.reads[:,ii_ctrl].flatten() + self.pseudocount # we used to have an option to add pseudocounts to each observation, like this
-                data1 = gene.reads[:,ii_ctrl].flatten()
-                data2 = gene_exp.reads[:,ii_exp].flatten()
-                if self.winz: data1 = self.winsorize_resampling(data1); data2 = self.winsorize_resampling(data2)
+                data1 = gene.reads[:,ii_ctrl]###.flatten() #TRI - do not flatten, as of 9/6/22
+                data2 = gene_exp.reads[:,ii_exp]###.flatten()
+                if self.winz: data1 = self.winsorize_for_resampling(data1); data2 = self.winsorize_for_resampling(data2)
 
                 if doLibraryResampling:
-                    (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,  pval_2tail, testlist) =  stat_tools.resampling(data1, data2, S=self.samples, testFunc=stat_tools.F_mean_diff_dict, permFunc=stat_tools.F_shuffle_dict_libraries, adaptive=self.adaptive, lib_str1=self.ctrl_lib_str, lib_str2=self.exp_lib_str,PC=self.pseudocount)
+                    (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,  pval_2tail, testlist) =  stat_tools.resampling(data1, data2, S=self.samples, testFunc=stat_tools.F_mean_diff_dict, permFunc=stat_tools.F_shuffle_dict_libraries, adaptive=self.adaptive, lib_str1=self.ctrl_lib_str, lib_str2=self.exp_lib_str,PC=self.pseudocount,site_restricted=self.site_restricted)
                 else:
-                    (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,  pval_2tail, testlist) =  stat_tools.resampling(data1, data2, S=self.samples, testFunc=stat_tools.F_mean_diff_flat, permFunc=stat_tools.F_shuffle_flat, adaptive=self.adaptive, lib_str1=self.ctrl_lib_str, lib_str2=self.exp_lib_str,PC=self.pseudocount)
+                    (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,  pval_2tail, testlist) =  stat_tools.resampling(data1, data2, S=self.samples, testFunc=stat_tools.F_mean_diff_flat, permFunc=stat_tools.F_shuffle_flat, adaptive=self.adaptive, lib_str1=self.ctrl_lib_str, lib_str2=self.exp_lib_str,PC=self.pseudocount,site_restricted=self.site_restricted)
 
 
             if self.doHistogram:
@@ -689,8 +719,12 @@ class ResamplingMethod(base.DualConditionMethod):
                             If non-empty, resampling will limit permutations to within-libraries.
         -winz           :=  winsorize insertion counts for each gene in each condition 
                             (replace max cnt in each gene with 2nd highest; helps mitigate effect of outliers)
+        -sr             :=  site-restricted resampling; more sensitive, might find a few more significant conditionally essential genes
         """ % (sys.argv[0], sys.argv[0])
 
+        # for docs:
+        # site-restricted resampling: counts are permuted among samples within each TA site, rather than permuting pooled counts; more sensitive, might find a few more significant conditionally essential genes
+
 if __name__ == "__main__":
 
     (args, kwargs) = transit_tools.cleanargs(sys.argv)
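
For reference, the winsorize_for_resampling() helper added above operates on the gene's 2D count matrix (samples x TA sites) instead of a pre-flattened vector, so the shape is preserved for site-restricted permutation. A self-contained restatement with a toy worked example (the counts below are made up):

    import numpy

    def winsorize_for_resampling(data):
        # data: 2D numpy array of insertion counts for one gene (samples x TA sites)
        shp = data.shape
        assert len(shp) == 2, "winsorize_for_resampling() expected a 2D numpy array"
        counts = data.flatten().tolist()
        if len(counts) < 3: return data
        s = sorted(counts, reverse=True)
        if s[1] == 0: return data   # only one non-zero value: nothing to winsorize
        c2 = [s[1] if x == s[0] else x for x in counts]
        return numpy.array(c2).reshape(shp)

    # the outlier 90 is pulled down to the runner-up 12, and the 2x3 shape is preserved:
    # winsorize_for_resampling(numpy.array([[0, 5, 90], [3, 12, 7]]))
    #   -> array([[ 0,  5, 12],
    #             [ 3, 12,  7]])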


=====================================
src/pytransit/doc/source/method_resampling.rst
=====================================
@@ -22,15 +22,55 @@ of genes or known biological pathway.
 How does it work?
 -----------------
 
-This technique has yet to be formally published in the context of
-differential essentiality analysis. Briefly, the read-counts at each
-genes are determined for each replicate of each condition. The mean
+Briefly, the normalized read-counts at all TA sites are determined 
+for each replicate of each of two conditions. For each gene, the mean
 read-count in condition A is subtracted from the mean read-count in
-condition B, to obtain an observed difference in means. The TA
-sites are then permuted for a given number of "samples". For each one of
-these permutations, the difference in read-counts is determined. This
+condition B to obtain an observed difference in means. The 
+counts are then permuted within the gene a given number of times ("samples").
+For each of these permutations, the difference in read-counts is determined. This
 forms a null distribution, from which a p-value is calculated for the
-original, observed difference in read-counts.
+original, observed difference in read-counts. 
+(see Figure 3 in [DeJesus2015TRANSIT]_).
+
+
+Pooled vs Site-Restricted (S-R) Resampling 
+------------------------------------------
+
+**9/21/2022:** The original version of resampling implemented in
+Transit is called 'pooled' resampling because the counts are permuted
+among all samples (replicates, conditions) *and* among all the TA
+sites in a gene (i.e. randomizing counts within and between TA sites).
+This was based on the assumption that insertions at any TA site are
+equally likely (and the only differences are due to stochastic
+differences in abundance in the transposon insertion library when
+constructed).
+
+We recently developed an improved version of resampling called
+"site-restricted" (S-R) resampling.  With S-R resampling, during the
+process of permuting the counts to generate the null distribution, the
+counts at each TA site are restricted to permutations among samples
+**within the same TA site** (the counts at each site are permuted
+independently).  This restriction has the effect of reducing the
+variance in the null distribution, because there is evidence that
+there is an insertion bias for the Himar1 transposon that causes some
+TA sites to have a higher propensity for insertions and hence higher
+insertion counts than others (:ref:`Choudhery et al, 2021<Choudhery2021>`).
+
+Testing on a wide range of TnSeq datasets suggests
+that S-R resampling is **more sensitive** than regular pooled resampling.
+Typically, S-R resampling finds 2-3 times as many significant conditionally essential
+genes, mostly including the ones found by pooled resampling, but also capturing more 
+borderline cases that were close to the previous significance cutoff.
+
+It is important to note that the previous pooled version of resampling that has been
+in Transit for many years is not wrong, just overly conservative.
+
+S-R resampling is incorporated in the v3.2.7 version of Transit.
+There is an option (i.e. checkbox) for it in the GUI interface
+for the resampling method, and there is a new command-line flag (-sr) if you want 
+to use it in console mode.
+
+
 
 |
 
@@ -41,7 +81,12 @@ Usage
 
 ::
 
-  python3 transit.py resampling <comma-separated .wig control files> <comma-separated .wig experimental files> <annotation .prot_table or GFF3> <output file> [Optional Arguments]
+  > python3 transit.py resampling <comma-separated .wig control files> <comma-separated .wig experimental files> <annotation .prot_table or GFF3> <output file> [Optional Arguments]
+  ---OR---
+  > python3 transit.py resampling -c <combined wig file> <samples_metadata file> <ctrl condition name> <exp condition name> <annotation .prot_table> <output file> [Optional Arguments]
+
+  NB: The ctrl and exp condition names should match the Condition names in the samples_metadata file.
+
         Optional Arguments:
         -s <integer>    :=  Number of samples. Default: -s 10000
         -n <string>     :=  Normalization method. Default: -n TTR
@@ -63,7 +108,7 @@ Usage
                             If non-empty, resampling will limit permutations to within-libraries.
         -winz           :=  winsorize insertion counts for each gene in each condition 
                             (replace max count in each gene with 2nd highest; helps mitigate effect of outliers)
-
+        -sr             :=  site-restricted resampling; more sensitive, might find a few more significant conditionally essential genes
 
 Parameters
 ----------
@@ -72,7 +117,9 @@ The resampling method is non-parametric, and therefore does not require
 any parameters governing the distributions or the model. The following
 parameters are available for the method:
 
--  **Samples:** The number of samples (permutations) to perform. The
+-  **Samples:** The number of samples (permutations) to perform for each gene
+   (to generate the null distribution of mean differences in counts, for calculating 
+   p-values). The
    larger the number of samples, the more resolution the p-values
    calculated will have, at the expense of longer computation time. The
    resampling method runs on 10,000 samples by default.
@@ -122,7 +169,7 @@ parameters are available for the method:
 
 -  **-winz**: `winsorize <https://en.wikipedia.org/wiki/Winsorizing>`_ insertion counts for each gene in each condition. 
    Replace max count in each gene with 2nd highest.  This can help mitigate effect of outliers.
-
+-  **-sr**: use 'site-restricted' resampling (see description above)
 
 |
 
@@ -130,7 +177,7 @@ Notes
 -----
 
 I recommend using -a (adaptive resampling). It runs much faster, and the p-values
-will be very close to a full non-adaptive run (all 10,000 samples).
+will be very close to a full resampling run (doing all 10,000 samples for each gene).
 
 Occasionally, people ask if resampling can be done on intergenic regions as well.
 It could be done pretty easily (for example by making a prot_table with coordinates
@@ -306,7 +353,7 @@ changed using the -PC flag (above).
 Run-time
 --------
 
-A typical run of the resampling method with 10,000 samples will take
+A typical run of the resampling method with 10,000 samples for each gene will take
 around 45 minutes (with the histogram option ON). Using the *adaptive
 resampling* option (-a), the run-time is reduced to around 10 minutes.
 


=====================================
src/pytransit/stat_tools.py
=====================================
@@ -1,9 +1,8 @@
-import sys,math
+import sys,math,random
 import numpy
 import scipy.stats
 
 
-
 def sample_trunc_norm_post(data, S, mu0, s20, k0, nu0):
     n = len(data)
     s2 = numpy.var(data,ddof=1)
@@ -495,11 +494,22 @@ def F_sum_diff_dict(*args, **kwargs):
 
 #
 
+# input: an array with counts pooled across both conditions
+
 def F_shuffle_flat(*args, **kwargs):
     X = args[0]
     return numpy.random.permutation(X)
 
-#
+# data is 2D array of counts (samples X TA sites) including all reps in all libs in both conditions (as rows)
+# DESTRUCTIVE; caller must make copy of input arg (data) if they want to preserve it
+
+def site_restricted_permutation(data):
+  if len(data)==0: return data
+  nsamples,nTAs = data.shape[0],data.shape[1]
+  for i in range(nTAs): numpy.random.shuffle(data[:,i])
+  return data
+
+# apparently, this takes 1 arg, a dictionary D which maps library strings to pairs of count vectors
 
 def F_shuffle_dict_libraries(*args, **kwargs):
     D = args[0]
@@ -519,7 +529,7 @@ def F_shuffle_dict_libraries(*args, **kwargs):
 #
 
 def resampling(data1, data2, S=10000, testFunc=F_mean_diff_flat,
-            permFunc=F_shuffle_flat, adaptive=False, lib_str1="", lib_str2="",PC=1):
+            permFunc=F_shuffle_flat, adaptive=False, lib_str1="", lib_str2="",PC=1,site_restricted=False):
     """Does a permutation test on two sets of data.
 
     Performs the resampling / permutation test given two sets of data using a
@@ -538,6 +548,15 @@ def resampling(data1, data2, S=10000, testFunc=F_mean_diff_flat,
                 shuffle.
         adaptive: Cuts-off resampling early depending on significance.
 
+    Data arrays: (data1 and data2)
+      Regular resampling used to take 1D arrays of counts pooled (flattened) over replicates.
+        Now 2D arrays are passed in and flattened here.
+        Uses F_shuffle_flat() and F_sum_diff_flat().
+      If using library strings, then inputs are 2D arrays of counts for each sample. 
+        Character in lib_str indicates which lib it is in.  Make a dict out of these to pass to permFunc.
+        Uses F_shuffle_dict_libraries() and F_sum_diff_dict_libraries().
+      If site_restricted, keep input arrays as 2D and pass to site_restricted_permutation() and F_sum_diff_flat().
+
     Returns:
         Tuple with described values
             - test_obs -- Test statistic of observation.
@@ -569,6 +588,9 @@ def resampling(data1, data2, S=10000, testFunc=F_mean_diff_flat,
         #raise ValueError("At least one library string has a letter not used by the other:\ %s" % ", ".join(lib_diff))
         raise ValueError("At least one library string has a letter not used by the other: " + ", ".join(lib_diff))
 
+    if lib_str1 and site_restricted:
+      raise Exception("Cannot do site_restricted resampling with library strings at same time")
+
     # - Check input has some data
     assert len(data1) > 0, "Data1 cannot be empty"
     assert len(data2) > 0, "Data2 cannot be empty"
@@ -576,6 +598,11 @@ def resampling(data1, data2, S=10000, testFunc=F_mean_diff_flat,
     if isinstance(data1,list): data1 = numpy.array(data1)
     if isinstance(data2,list): data2 = numpy.array(data2)
 
+    #TRI note - now I am switching resampling() so caller passes in NON-flattened arrays of counts
+    if not site_restricted and not lib_str1: 
+      data1 = data1.flatten()
+      data2 = data2.flatten()
+
     count_ltail = 0
     count_utail = 0
     count_2tail = 0
@@ -583,12 +610,13 @@ def resampling(data1, data2, S=10000, testFunc=F_mean_diff_flat,
     test_list = []
 
     # Calculate basic statistics for the input data:
-    n1 = len(data1)
+    # flattened (pooled) if not lib_str, else multiple samples (rows) for each lib
+    n1 = len(data1) # number of samples (i.e. rows) for site_restricted or lib_str, or pooled counts if flattened
     n2 = len(data2)
 
     mean1 = 0
     if n1 > 0:
-        mean1 = numpy.mean(data1)
+        mean1 = numpy.mean(data1) # over all counts pooled across reps and libs for cond1
     mean2 = 0
     if n2 > 0:
         mean2 = numpy.mean(data2)
@@ -606,12 +634,15 @@ def resampling(data1, data2, S=10000, testFunc=F_mean_diff_flat,
         # Get number of TA sites implied
         nTAs = len(data1.flatten())//len(lib_str1)
         assert len(data2.flatten())//len(lib_str2) == nTAs, "Datasets do not have matching sites; check input data and library strings."
+
         # Get data
-        perm = get_lib_data_dict(data1, lib_str1, data2, lib_str2, nTAs)
-        test_obs = testFunc(perm)
+        # for lib_str, perm is a dict mapping letters to pairs of numpy arrays (1 for each cond)
+        perm = get_lib_data_dict(data1.flatten(), lib_str1, data2.flatten(), lib_str2, nTAs) 
+        test_obs = testFunc(perm) 
     else:
         try:
-            test_obs = testFunc(data1, data2)
+            # site_restricted uses F_sum_diff_flat() as testFunc too
+            test_obs = testFunc(data1, data2) # first call, actual value from observed counts
         except Exception as e:
             print("")
             print("!"*100)
@@ -624,10 +655,13 @@ def resampling(data1, data2, S=10000, testFunc=F_mean_diff_flat,
             print("")
             return None
 
-        perm = numpy.zeros(n1+n2)
-        perm[:n1] = data1
-        perm[n1:] = data2
-
+        if site_restricted:
+          data = numpy.concatenate((data1,data2),axis=0) # keep it as a 2D array
+          perm = data.copy() # this array will get modified with each permutation
+        else: # pool all counts (across conditions) into 1 big array
+          perm = numpy.zeros(n1+n2)
+          perm[:n1] = data1
+          perm[n1:] = data2
 
 
     count_ltail = 0
@@ -636,12 +670,14 @@ def resampling(data1, data2, S=10000, testFunc=F_mean_diff_flat,
     test_list = []
     s_performed = 0
     for s in range(S):
-        if len(perm) >0:
-            perm = permFunc(perm)
+        if mean1+mean2 > 0:
+#            perm = permFunc(perm) 
+            if site_restricted: perm = site_restricted_permutation(perm) #TRI - I could have passed this in as permFunc, but I don't want to require the caller to know this
+            else: perm = permFunc(perm) #TRI
             if not lib_str1:
                 test_sample = testFunc(perm[:n1], perm[n1:])
-            else:
-                test_sample = testFunc(perm)
+            else: # case for lib strings
+                test_sample = testFunc(perm) # how do I know how many counts are in cond1 or cond2? perm is a dict over lib strings (and conds?)
         else:
             test_sample = 0
 
@@ -720,7 +756,7 @@ def combine_lib_dicts(L1, L2):
         DATA[K] = numpy.array([L1[K], L2[K]],dtype=object)
     return DATA
 
-#
+# it looks like data1 is supposed to be pre-flattened (see parse_lib_index())
 
 def get_lib_data_dict(data1, ctrl_lib_str, data2, exp_lib_str, nTAs):
     lib1_index_dict = parse_lib_index(len(data1), ctrl_lib_str, nTAs)
@@ -847,10 +883,10 @@ if __name__ == "__main__":
  
     ii = numpy.ones(gene.n) == 1
         
-    data1 = gene.reads[:Kctrl,ii].flatten()
+    data1 = gene.reads[:Kctrl,ii].flatten() #TRI should we not flatten if doing site_restricted?
     data2 = gene.reads[Kctrl:,ii].flatten()
     
-    data_dict = get_lib_data_dict(data1, ctrl_lib_str, data2, exp_lib_str, gene.n)
+    data_dict = get_lib_data_dict(data1, ctrl_lib_str, data2, exp_lib_str, gene.n) # not used?
 
     if DO_LIB:
         (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,  pval_2tail, testlist) =  resampling(data1, data2, S=10000, testFunc=F_mean_diff_dict, permFunc=F_shuffle_dict_libraries, adaptive=False, lib_str1=ctrl_lib_str, lib_str2=exp_lib_str)
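
A hedged usage sketch of the updated resampling() signature above: with site_restricted=True the caller passes 2D (replicates x TA sites) arrays, and the function only flattens them itself in the plain pooled case. The counts are toy values, the import path is an assumption (pytransit lists stat_tools in its __all__), and permFunc is bypassed internally when site_restricted=True:

    import numpy
    from pytransit import stat_tools

    ctrl = numpy.array([[0, 3, 50], [1, 4, 60]])   # 2 ctrl replicates x 3 TA sites
    exp  = numpy.array([[0, 9, 55], [2, 7, 40]])   # 2 exp  replicates x 3 TA sites

    (test_obs, mean1, mean2, log2FC,
     pval_ltail, pval_utail, pval_2tail, testlist) = stat_tools.resampling(
        ctrl, exp,
        S=10000,
        testFunc=stat_tools.F_mean_diff_flat,
        permFunc=stat_tools.F_shuffle_flat,   # ignored when site_restricted=True
        adaptive=True,
        PC=1,
        site_restricted=True)                 # the new flag wired through from '-sr'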


=====================================
src/pytransit/tnseq_tools.py
=====================================
@@ -80,8 +80,11 @@ def read_samples_metadata(metadata_file, covarsToRead = [], interactionsToRead =
     interactionsByFileList = [{} for i in range(len(interactionsToRead))]
     headersToRead = [condition_name.lower(), "filename"]
     orderingMetadata = { 'condition': [], 'interaction': [] }
-    with open(metadata_file) as mfile:
-        lines = mfile.readlines()
+    lines = []
+    for line in open(metadata_file):
+      if len(line)<=2 or line.startswith("#"): continue
+      lines.append(line)
+    if True:
         headIndexes = [i
                 for h in headersToRead
                 for i, c in enumerate(lines[0].split())


=====================================
src/pytransit/transit_gui.py
=====================================
@@ -343,6 +343,8 @@ class MainFrame ( wx.Frame ):
         cTermSizer.Add( self.globalCTerminusText, 1, wx.ALIGN_CENTER_VERTICAL, 5 )
         cTermSizer.Add( self.globalCTerminusIcon, 1, wx.ALIGN_CENTER, 5 )
 
+        lib_tooltip = 'String of letters indicating the Tn library each dataset comes from. For example, if there are three datasets from two different libraries, then set the string to "ABB", "AAB", or "ABA". The number of letters must match the number of datasets. The same library letter codes must be used for both control and experimental samples; there has to be at least one sample of each library in both sets. If the library string is left BLANK, it will be assumed that all samples are from the same Tn library by default.'
+
         # Control Libraries text - GLOBAL
         ctrlLibSizer = wx.BoxSizer( wx.HORIZONTAL )
         self.ctrlLibLabel = wx.StaticText(self.globalPanel, wx.ID_ANY, u"Control Libraries:",
@@ -350,12 +352,7 @@ class MainFrame ( wx.Frame ):
         self.ctrlLibLabel.Wrap( -1 )
         self.ctrlLibText = wx.TextCtrl( self.globalPanel, wx.ID_ANY, "",
             wx.DefaultPosition, (-1,-1), 0 )
-        self.ctrlLibTip = pytransit.analysis.base.InfoIcon(self.globalPanel, wx.ID_ANY,
-            tooltip="String of letters representing an \
-            identifier for the libraries the datasets belong to. For example, if adding three \
-            datasets of different libraries, change the string to 'ABC'. Set of letters used  \
-            must match those in Experimental datasets. Keep empty or with all letters equal, e.g. \
-            'AAA', to do regular resampling.")
+        self.ctrlLibTip = pytransit.analysis.base.InfoIcon(self.globalPanel, wx.ID_ANY,tooltip=lib_tooltip)
 
         self.ctrlLibText.Disable()
         ctrlLibSizer.Add(self.ctrlLibLabel, 0, wx.ALIGN_CENTER_VERTICAL, 5 )
@@ -370,11 +367,7 @@ class MainFrame ( wx.Frame ):
         self.expLibLabel.Wrap( -1 )
         self.expLibText = wx.TextCtrl( self.globalPanel, wx.ID_ANY, "", wx.DefaultPosition,
             (-1,-1), 0 )
-        self.expLibTip = pytransit.analysis.base.InfoIcon(self.globalPanel, wx.ID_ANY,
-            tooltip="String of letters representing an identifier for the libraries the datasets \
-            belong to. For example, if adding three datasets of different libraries, change the \
-            string to 'ABC'. Set  of letters used must match those in Control datasets. Keep \
-            empty or with all letters equal, e.g. 'AAA', to do regular resampling.")
+        self.expLibTip = pytransit.analysis.base.InfoIcon(self.globalPanel, wx.ID_ANY,tooltip=lib_tooltip)
 
         self.expLibText.Disable()
         expLibSizer.Add(self.expLibLabel, 0, wx.ALIGN_CENTER_VERTICAL, 5 )
@@ -1138,7 +1131,7 @@ class TnSeekFrame(MainFrame):
         self.list_ctrl.Select(self.index_ctrl)
         self.index_ctrl+=1
         try:
-            self.ctrlLibText.SetValue(self.ctrlLibText.GetValue()+"A")
+            pass # self.ctrlLibText.SetValue(self.ctrlLibText.GetValue()+"A")
         except Exception as e:
             transit_tools.transit_message("Error Modifying Ctrl Lib String: %s" % e)
             exc_type, exc_obj, exc_tb = sys.exc_info()
@@ -1161,7 +1154,7 @@ class TnSeekFrame(MainFrame):
         self.index_exp+=1
 
         try:
-            self.expLibText.SetValue(self.expLibText.GetValue()+"A")
+            pass # self.expLibText.SetValue(self.expLibText.GetValue()+"A")
         except Exception as e:
             transit_tools.transit_message("Error Modifying Ctrl Lib String: %s" % e)
             exc_type, exc_obj, exc_tb = sys.exc_info()
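
The reworded library-string tooltip above encodes two rules: one letter per dataset, and every library letter must appear in both the control and experimental strings (stat_tools.resampling() raises a ValueError otherwise). A hypothetical checker, not part of the codebase, just to illustrate those rules:

    def check_lib_strings(ctrl_lib_str, exp_lib_str, n_ctrl, n_exp):
        # hypothetical validator; blank strings mean all samples share one library
        if not ctrl_lib_str and not exp_lib_str:
            return
        if len(ctrl_lib_str) != n_ctrl or len(exp_lib_str) != n_exp:
            raise ValueError("library string length must match the number of datasets")
        only_in_one = set(ctrl_lib_str) ^ set(exp_lib_str)
        if only_in_one:
            raise ValueError("libraries not present in both sets: %s" % ", ".join(sorted(only_in_one)))

    # check_lib_strings("ABB", "AB", 3, 2)   # ok: libraries A and B appear in both sets
    # check_lib_strings("ABB", "AA", 3, 2)   # raises: library B has no experimental sample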



View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/compare/df0c59be90a7ec00171e205427c24032c8a91a15...e7fcdb595572f7ea598ca27f80632375a9e0790d
