[med-svn] [Git][med-team/tnseq-transit][upstream] New upstream version 3.3.3

Wed Nov 29 19:03:40 GMT 2023


Étienne Mollier pushed to branch upstream at Debian Med / tnseq-transit


Commits:
197c36f1 by Étienne Mollier at 2023-11-29T19:38:53+01:00
New upstream version 3.3.3
- - - - -


12 changed files:

- CHANGELOG.md
- setup.cfg
- setup.py
- src/HMM_conf.py
- src/pytransit/__init__.py
- src/pytransit/analysis/hmm.py
- src/pytransit/analysis/normalize.py
- + src/pytransit/doc/source/_images/HMM_1D_distributions.png
- src/pytransit/doc/source/method_HMM.rst
- + src/pytransit/generic_tools/file_system_py.py
- src/pytransit/norm_tools.py
- src/pytransit/stat_tools.py


Changes:

=====================================
CHANGELOG.md
=====================================
@@ -2,6 +2,13 @@
 All notable changes to this project will be documented in this file.
 
 
+## Version 3.3.3 (2023-11-26)
+#### Transit:
+
+Major changes:
+ - changed the calculation of HMM confidence scores in HMM_conf.py to use 1D distributions over Mean insertion counts for each gene for each essentiality state
+ - fixed bug in LOESS correction
+
 
 ## Version 3.3.2 (2023-10-29)
 #### Transit:


=====================================
setup.cfg
=====================================
@@ -1,2 +1,2 @@
 [metadata]
-description-file = README.md
+description_file = README.md


=====================================
setup.py
=====================================
@@ -4,28 +4,66 @@ See:
 https://packaging.python.org/en/latest/distributing.html
 https://github.com/pypa/sampleproject
 """
+import sys
+import os
+from os import path
+import glob
+import shutil
+from pathlib import Path
+from shutil import rmtree
 
 # Always prefer setuptools over distutils
 from setuptools import setup, find_packages, Command
 # To use a consistent encoding
 from codecs import open
-from os import path
-from shutil import rmtree
-
-import os
 
-here = path.abspath(path.dirname(__file__))
+sys.path.append("src")
+import pytransit
+from pytransit.generic_tools import file_system_py as FS
+version =  pytransit.__version__[1:]
 
+package_name = "pytransit"
+
+def file_exclusion_function(file_path):
+    """
+        Summary:
+            this is supposed to return True if the file should be excluded from the pypi upload
+            HOWEVER for some reason some files can still be included, partly because of the MANIFEST.in file
+            The process is rather mysterious
+            
+    """
+    # no folders
+    if os.path.isdir(file_path):
+        return True
+    
+    # no __pycache__ folders
+    if (
+        '/__pycache__/' in file_path
+        or file_path.startswith('__pycache__/')
+        or file_path.endswith('/__pycache__')
+    ):
+        return True
+        
+    # only .py, .json, and .yaml from the __dependencies__ folders
+    if '/__dependencies__/' in file_path and not (file_path.endswith(".py") or file_path.endswith(".json") or file_path.endswith(".yaml")):
+        return True
+    # no test files
+    if "/ruamel/yaml/_test/" in file_path:
+        return True
+    
+    if "/doc/" in file_path:
+        return True
+    
+    
+    return False
 
 # Get the long description from the README file
+here = path.abspath(path.dirname(__file__))
 with open(path.join(here, 'README.md'), encoding='utf-8') as f:
     long_description = f.read()
 
 # Get current version
-import sys
 sys.path.insert(1, "src/")
-import pytransit
-version =  pytransit.__version__[1:]
 
 class UploadCommand(Command):
     """Support setup.py upload."""
@@ -46,7 +84,7 @@ class UploadCommand(Command):
 
     def yes_or_no(self, question):
         while True:
-            reply = str(raw_input(question +' (y/n): ')).lower().strip()
+            reply = str(input(question +' (y/n): ')).lower().strip()
             if reply == 'y':
                 return True
             return False
@@ -91,6 +129,15 @@ class UploadCommand(Command):
 
         sys.exit()
 
+package_data = {
+    # include all files/folders in the module (recursively)
+    package_name: sorted(list(set([
+        each[len(package_name)+1:]
+            for each in FS.iterate_paths_in(package_name, recursively=True)
+                if not file_exclusion_function(each)
+    ]))),
+}
+
 setup(
     name='tnseq-transit',
 
@@ -130,8 +177,6 @@ setup(
     # What does your project relate to?
     keywords=['tnseq', 'analysis', 'biology', 'genome'],
 
-    #package_dir = {'tnseq-transit': 'src/pytransit'},
-
     # You can just specify the packages manually here if your project is
     # simple. Or you can use find_packages().
     packages = find_packages('src', exclude=['contrib', 'tests']),
@@ -150,7 +195,7 @@ setup(
     # https://packaging.python.org/en/latest/requirements.html
     # 'pypubsub<4.0' and 'wxPython' are needed for GUI only, but go ahead and install them
     # the reason for restriction on pypubsub is that version>=4.0 does not work with python2 - I can probably get rid of this restriction, since everybody must be using python3 by now
-    install_requires=['setuptools', 'numpy~=1.16', 'scipy~=1.2', 'matplotlib~=3.0', 'pillow', 'scikit-learn', 'statsmodels~=0.9', 'pypubsub<4.0', 'wxPython'],
+    install_requires=['setuptools', 'numpy~=1.16', 'scipy~=1.2', 'matplotlib~=3.0', 'pillow', 'scikit-learn', 'statsmodels~=0.9', 'pypubsub', 'wxPython'],
 
     #dependency_links = [
     #	"git+https://github.com/wxWidgets/wxPython.git#egg=wxPython"
@@ -168,9 +213,7 @@ setup(
     # If there are data files included in your packages that need to be
     # installed, specify them here.  If using Python 2.6 or less, then these
     # have to be included in MANIFEST.in as well.
-    package_data={
-        'pytransit': ['pytransit/data/*', 'pytransit/doc/*.*', 'pytransit/doc/images/*', 'pytransit/genomes/*']
-    },
+    package_data=package_data,
 
     #scripts=['src/tpp.py', 'src/transit.py'],
 
@@ -193,4 +236,3 @@ setup(
         'upload': UploadCommand,
     },
 )
-


=====================================
src/HMM_conf.py
=====================================
@@ -1,6 +1,8 @@
 import sys,numpy
 import scipy.stats
 
+STATES = "ES GD NE GA".split()
+
 # first pass...
 
 headers = []
@@ -8,6 +10,7 @@ data = []
 all = []
 Sats = {}
 NZmeans = {}
+Means = {}
 TAs = {}
 Calls = {}
 for line in open(sys.argv[1]):
@@ -22,12 +25,14 @@ for line in open(sys.argv[1]):
   id = w[0]
   Sats[id] = float(w[-3])
   NZmeans[id] = float(w[-2])
+  Means[id] = Sats[id]*NZmeans[id]
   TAs[id] = nTA
   Calls[id] = w[-1]
   all.append(consistency)
   data.append(w)
 
-print("# avg gene-level consistency of HMM states: %s" % (round(numpy.mean(all),4)))
+Info = ["# HMM confidence info:"]
+Info.append("# avg gene-level consistency of HMM states: %s" % (round(numpy.mean(all),4)))
 
 cons = numpy.mean(all)
 
@@ -35,57 +40,122 @@ IDs = [x[0] for x in data]
 
 meanSats,meanNZmeans = {},{}
 stdSats,stdNZmeans = {},{}
+SatParams = {}
+NZmeanParams = {}
+MeanParams = {}
 
-print("# state posterior probability distributions:")
+Info.append("# state posterior probability distributions:")
 for st in ["ES","GD","NE","GA"]:
   sub = list(filter(lambda id: Calls[id]==st,IDs))
-  meanSat = numpy.mean([Sats[id] for id in sub])
-  meanNZmean = numpy.mean([NZmeans[id] for id in sub])
-  stdSat = numpy.std([Sats[id] for id in sub])
-  stdNZmean = numpy.std([NZmeans[id] for id in sub])
-  print("#  %s: genes=%s, meanSat=%s, stdSat=%s, meanNZmean=%s, stdNZmean=%s" % (st,len(sub),round(meanSat,3),round(stdSat,3),round(meanNZmean,1),round(stdNZmean,1)))
-  meanSats[st] = meanSat
-  meanNZmeans[st] = meanNZmean
-  stdSats[st] = meanSat
-  stdNZmeans[st] = meanNZmean
-
-def calc_prob(sat,NZmean,meanSat,stdSat,meanNZmean,stdNZmean):
-  a = scipy.stats.norm.pdf(sat,loc=meanSat,scale=stdSat)
-  b = scipy.stats.norm.pdf(NZmean,loc=meanNZmean,scale=stdNZmean)
-  return a*b
+  nzmeans = [NZmeans[id] for id in sub]
+  sats = [Sats[id] for id in sub]
+  means = [Means[id] for id in sub]
 
-# second pass...
+  meanSat = numpy.mean(sats)
+  meanNZmean = numpy.mean(nzmeans)
+  stdSat = numpy.std(sats)
+  stdNZmean = numpy.std(nzmeans)
+  medNZmeans = numpy.median(nzmeans)
+  iqrNZmeans = scipy.stats.iqr(nzmeans)
+  meanMeans = -999 if len(means)==0 else numpy.median(means) # -999 if there are no GA genes, for example
+  stdMeans = max(1.0,0.7314*scipy.stats.iqr(means)) # don't let stdev collapse to 0 for ES
 
-for line in headers[:-1]: print(line)
-newheaders = headers[-1].split('\t') # assume last comment line is column headers
-newheaders += "consis probES probGD probNE probGA conf flag".split()
-print('\t'.join(newheaders))
+  # model NZmean with robust Normal distribution: use median and IQR
+  sigma = 0.7413*iqrNZmeans
+  sigma = max(0.01,sigma) # don't let it collapse to 0
+  NZmeanParams[st] = (medNZmeans,sigma)
+
+  # model saturation as Beta distribution, fit by method of moments:
+  # https://real-statistics.com/distribution-fitting/method-of-moments/method-of-moments-beta-distribution/
+  alpha = meanSat*( (meanSat*(1.0-meanSat)/(stdSat*stdSat) - 1.0) )
+  beta = alpha*(1.0-meanSat)/meanSat
+  SatParams[st] = (alpha,beta)
+  MeanParams[st] = (meanMeans,stdMeans)
+
+  #Info.append("#   Sat[%s]:    Beta(alpha=%s,beta=%s), E[sat]=%s" % (st,round(alpha,2),round(beta,2),round(alpha/(alpha+beta),3)))
+  #Info.append("#   NZmean[%s]: Norm(mean=%s,stdev=%s)" % (st,round(NZmeanParams[st][0],2),round(NZmeanParams[st][1],2)))
+  Info.append("#   Mean[%s]:   Norm(mean=%s,stdev=%s)" % (st,round(MeanParams[st][0],2),round(MeanParams[st][1],2)))
+
+
+def normalize(L):
+  tot = sum(L)
+  return [x/float(tot) for x in L]
+
+# prob of each state is joint prob over saturation (Beta) and NZmean (Normal)
+# for all 4 states; uses globals
+
+def calc_probs3(sats,nzmean):
+  A,B = [],[]
+  for st in STATES:
+    alpha,beta = SatParams[st]
+    meanNZmean,stdNZmean = NZmeanParams[st]
+    A.append(scipy.stats.beta.pdf(sat,a=alpha,b=beta))
+    B.append(scipy.stats.norm.pdf(nzmean,loc=meanNZmean,scale=stdNZmean))
+  A = normalize(A)
+  B = normalize(B)
+  probs = [a*b for a,b in zip(A,B)]
+  return normalize(probs)
+
+# prob of each state is based Gaussian density for Mean count for each gene (combines Sat and NZmean)
+
+def calc_probs4(sats,nzmean):
+  probs = []
+  for st in STATES:
+    meanMeans,stdMeans = MeanParams[st]
+    probs.append(0 if meanMeans<0 else scipy.stats.norm.pdf(sat*nzmean,loc=meanMeans,scale=stdMeans))
+  return normalize(probs)
 
+##################################
+
+# second pass...
+
+num_low_conf,num_ambig = 0,0
+results = []
 for line in open(sys.argv[1]):
   line = line.strip()
   if line[0]=='#': continue
   w = line.split('\t')
   id,Call = w[0],w[-1]
   nTA = int(w[3])
-  if nTA==0: print (line); continue
+  if nTA==0: continue; # print (line); continue
   votes = [int(x) for x in w[4:8]]
   m = max(votes)
   consistency = m/float(nTA)
-  probs = []
-  STATES = "ES GD NE GA".split()
-  for st in STATES:
-    probs.append(calc_prob(float(w[-3]),float(w[-2]),meanSats[st],stdSats[st],meanNZmeans[st],stdNZmeans[st]))
-  totprob = sum(probs)
-  relprobs = [x/float(totprob) for x in probs]
-  conf = relprobs[STATES.index(Call)]
+  sat,NZmean = float(w[-3]),float(w[-2])
+  PC = 0.01 # shrink range of saturation form [0,1] to [0.01,0.99] to prevent nan's from Beta
+  sat = max(PC,min(1.0-PC,sat))
+  probs = calc_probs4(sat,NZmean) # normalized
+  conf = probs[STATES.index(Call)]
+
   flag=""
-  if conf<0.5: flag="low-confidence"
-  if max(relprobs)<0.7:
-    if (Call=="ES" or Call=="GD") and (relprobs[0]>0.25 and relprobs[1]>0.25): flag="ambiguous"
-    if (Call=="GD" or Call=="NE") and  (relprobs[1]>0.25 and relprobs[2]>0.25) :flag="ambiguous"
-    if (Call=="NE" or Call=="GA") and (relprobs[2]>0.25 and relprobs[3]>0.25):flag="ambiguous"
+
+  #if conf<0.5: flag="low-confidence"
+  #if max(probs)<0.7:
+  #  if (Call=="ES" or Call=="GD") and (probs[0]>0.25 and probs[1]>0.25): flag="ambiguous"
+  #  if (Call=="GD" or Call=="NE") and  (probs[1]>0.25 and probs[2]>0.25): flag="ambiguous"
+  #  if (Call=="NE" or Call=="GA") and (probs[2]>0.25 and probs[3]>0.25): flag="ambiguous"
+
+  if conf<0.2: flag = "low-confidence"
+  if conf>=0.2 and conf!=max(probs): flag = "ambiguous"
+
+  if flag=="ambiguous": num_ambig += 1
+  if flag=="low-confidence": num_low_conf += 1
   
-  vals = w+[round(consistency,3)]+[round(x,6) for x in relprobs]
+  vals = w+[round(sat*NZmean,1),round(consistency,3)]+[round(x,6) for x in probs]
   vals += [round(conf,4),flag]
+  results.append(vals)
+
+Info.append("# num low-confidence genes=%s, num ambiguous genes=%s" % (num_low_conf,num_ambig))
+
+###############
+# print results
+
+for line in headers[:-1]: print(line)
+for line in Info: print(line)
+newheaders = headers[-1].split('\t') # assume last comment line is column headers
+newheaders += "Mean consis probES probGD probNE probGA conf flag".split()
+print('\t'.join(newheaders))
+
+for vals in results:
   print ('\t'.join([str(x) for x in vals]))
 


=====================================
src/pytransit/__init__.py
=====================================
@@ -2,6 +2,6 @@
 __all__ = ["transit_tools", "tnseq_tools", "norm_tools", "stat_tools"]
 
 
-__version__ = "v3.3.2"
+__version__ = "v3.3.3"
 prefix = "[TRANSIT]"
 


=====================================
src/pytransit/analysis/hmm.py
=====================================
@@ -608,7 +608,7 @@ class HMMMethod(base.SingleConditionMethod):
         output.write("#command line: python3 %s\n" % (' '.join(sys.argv)))
         output.write("#summary of gene calls: ES=%s, GD=%s, NE=%s, GA=%s, N/A=%s\n" % tuple([counts.get(x,0) for x in "ES GD NE GA N/A".split()]))
         output.write("#key: ES=essential, GD=insertions cause growth-defect, NE=non-essential, GA=insertions confer growth-advantage, N/A=not analyzed (genes with 0 TA sites)\n")
-        output.write("#ORF\tgene\tannotation\tTAs\tES sites\tGD sites\tNE sites\tGA sites\tsaturation\tmean\tcall\n")
+        output.write("#ORF\tgene\tannotation\tTAs\tES sites\tGD sites\tNE sites\tGA sites\tsaturation\tNZmean\tcall\n")
         for line in lines: output.write(line)
         output.close()
 


=====================================
src/pytransit/analysis/normalize.py
=====================================
@@ -103,7 +103,7 @@ class NormalizeMethod(base.SingleConditionMethod):
         else:
             self.infile = args[0] # only 1 input wig file
             self.outfile = args[1] # if no arg give, could print to screen
-        self.normalization = kwargs.get("n", "TTR") # check if it is a legal method name
+        self.normalization = kwargs.get("n", "TTR") # should check if it is a legal method name
         self.combined_wig = isCombinedWig
 
         return self(self.infile,self.outfile,self.normalization)


=====================================
src/pytransit/doc/source/_images/HMM_1D_distributions.png
=====================================
Binary files /dev/null and b/src/pytransit/doc/source/_images/HMM_1D_distributions.png differ


=====================================
src/pytransit/doc/source/method_HMM.rst
=====================================
@@ -138,58 +138,63 @@ is that, while the formal state probabilities are calculated at
 individual TA sites, the essentiality calls at the gene level are made
 by taking a vote (the most frequent state among its TA sites), and
 this does not lend itself to such formal certainty quantification.
-However, the reviewers’ question prompted us to devise a novel
-approach to evaluating the confidence of the HMM calls for genes.  In
-the output files, we include the local saturation and mean insertion
-count for each gene, along with the gene-level call and state
-distribution.  We have sometimes noticed that short genes are
+However, we developed a novel
+approach to evaluating the confidence of the HMM calls for genes.  
+We have sometimes noticed that short genes are
 susceptible to being influenced by the essentiality of an adjacent
-region, which is evident by examining the insertion statistics.  For
+region, which is evident by examining the insertion statistics
+(saturation in gene, or percent of TA sites with insertion, and 
+mean insertion count at those sites).  For
 example, consider a hypothetical gene with just 2 TA sites that is
 labeled as ES by the HMM but has insertions at both sites.  It might
 be explained by proximity to a large essential gene or region, due to
 the “smoothing” the HMM does across the sequence of TA sites.  Thus we
 can sometimes recognize inaccurate calls by the HMM if the insertion
-statistics of a gene are not consistent with the call.
+statistics of a gene are not consistent with the call
+(i.e. a gene labeled as NE that has no insertions, or conversely,
+a gene labeled ES that has many insertion).
 
 In our paper on the HMM in Transit `(DeJesus et al, 2013)
-<https://pubmed.ncbi.nlm.nih.gov/24103077/>`_, 
-we showed a 2D plot of the posterior distribution of
-saturation and mean insertion counts (for a high-quality 
-TnSeq reference dataset) for the 4 essentiality states, which demonstrates that
-genes are nicely clustered, with ES genes having near-0 saturation and
-low counts at non-zero sites, NE genes have high saturation and
-counts, GD genes fall in between, and GA genes are almost fully
-saturated with excessively high counts.
-
-.. image:: _images/HMM_distributions.png
+<https://pubmed.ncbi.nlm.nih.gov/24103077/>`_, we showed a plot of
+random samples from the joint posterior distribution of local
+saturation and mean insertion counts (at non-zero sites) for the 4
+essentiality states, which nicely demonstrates that that ES genes have
+near-0 saturation and low counts at non-zero sites, NE genes have high
+saturation and counts, GD genes fall in between, and GA genes are
+almost fully saturated with excessively high counts.
+
+Following this idea, we can use the
+observed insertion counts in each gene to assess the 
+confidence in each of the essentiality calls by the HMM.
+Rather than modeling them as 2D distributions, we *combine* them into
+1D (Gaussian) distributions over the *overall mean insertion count* in
+each gene, including sites with zeros.  The mean count for essential
+(ES) genes usually around 0, typically around 5-10 for growth-defect
+(GD) genes, around 100 for non-essential (NE) genes, and >300 for
+growth-advantaged (GA) genes (assuming TTR normalization, by default).
+
+.. image:: _images/HMM_1D_distributions.png
    :width: 400
    :align: center
 
-(Figure 5 from `(DeJesus et al, 2013) <https://pubmed.ncbi.nlm.nih.gov/24103077/>`_ ) 
-
-We now calculate the distributions of insertion statistics **for each
-dataset** on which the HMM is run, 
-and use it to assess the confidence in each of the essentiality calls.
-We start by calculating the mean and standard
-deviation of saturation and insertion count over all the genes in each
-of the 4 states (ES, GD, NE, and GA).  Then, for each gene, we compute
-the probability density of its local statistics with respect to the
-product of Normal distributions for its called state.  For example,
-suppose a gene g is called state s.  Then:
+We now calculate these conditional distributions empirically for each
+dataset on which the HMM is run, and use it to assess the confidence
+in each of the essentiality calls.  We start by calculating the mean
+and standard deviation of saturation and insertion count over all the
+genes in each of the 4 states (ES, GD, NE, and GA).  The empirical
+mean and standard deviation for each state are reported in the header
+of the output file (by HMM_conf.py, see below).  Then, for each gene,
+we compute the probability density (likelihood) of its mean count with
+respect to the Normal distribution for each of the 4 states. For
+example, suppose a gene g is called state s.  Then:
 
 ::
 
-     conf(g|s) = N(sat(g)|μ_sat(s), σ_sat(s)) * N(cnt(g)|μ_cnt(s), σ_cnt(s))
+     P(g|s) = N(cnt(g)|μ_cnt(s), σ_cnt(s))
 
-This models the joint probability of the distribution of saturation
-and insertion counts for each class (simplistically) as independent,
-effectively forming “rectangular” regions around each cluster.  We can
-then calculate the probability that the gene belongs to each state
-based on these posterior distributions, and normalize them to sum
-to 1.  Finally, the “confidence” of the HMM call for each gene is the
-normalized probability of the gene’s insertion statistics given the
-posterior distribution for that essentiality state.
+The 4 probabilities are normalized to sum up to 1.
+The confidence in the HMM call for a gene is taken to be the normalized
+probability of the called state.
 
 This confidence score nicely identifies genes of low confidence, where
 the local saturation and mean insertion count seem inconsistent with
@@ -199,19 +204,21 @@ sites as well.  Some of the former are cases where the call of a short
 gene is influenced by an adjacent region.  Some of the latter include
 ambiguous cases like multi-domain proteins, where one domain is
 essential and the other is not.  We observed that there are often
-“borderline” (or ambiguous) genes, which the HMM labels as one state,
-but which are more consistent with a similar state.  For example, in
-some cases, a low-confidence ES gene might have a higher posterior
-probability of GD, based on insertion statistics.  Other cases include
-ambiguities between NE and GD, or NE and GA, especially for genes with
-mean insertion counts on the border between the clusters (posterior
-distributions) in the 2D plot.  After identifying and removing these
-borderline genes (also now indicated in the HMM output files), we find
-that most of the low-confidence genes have confidence scores in the
-approximate range of 0 to 0.25.  The number of low-confidence genes
-can range from a few hundred up to a thousand
-(for which the predicted essentiality call should be ignored), 
-depending on the
+“borderline” (or ambiguous) genes, where the 
+called state has significant probability (>0.2), but is not the most
+probable state (i.e. another state is more likely, based on the insertions in the gene).
+
+Criteria (for gene g with called state c):
+  * genes where the called state has the highest conditional probability (most likely, given their mean count) are 'confident'
+  * genes where P(g|c)>0.2, but there is another state that has higher probability are 'ambiguous'
+  * genes with P(g|c)<0.2 are 'low-confidence'
+
+
+The 'low-confidence' and 'ambiguous' genes are now indicated in the
+'flag' field added by HMM_conf.py in the output files (see below).
+
+In genomes with thousands genes, it is not uncommon for there to be a
+few hundred low-confidence and ambiguous genes each, depending on the
 saturation of the input dataset (.wig file); less-saturated datasets
 tend to have more low-confidence genes.
 
@@ -233,51 +240,43 @@ obviating the need for a second step.)
 The script adds the following columns:
 
  * **consis** - consistency (proportion of TA sites representing the majority essentiality state)
- * **probES** - conditional probability (unnormalized) of the insertions statistics (sat and mean) if the gene were essential 
- * **probGD** - conditional probability (unnormalized) of the insertions statistics (sat and mean) if the gene were growth-defect
- * **probNE** - conditional probability (unnormalized) of the insertions statistics (sat and mean) if the gene were non-essential
- * **probGA** - conditional probability (unnormalized) of the insertions statistics (sat and mean) if the gene were growth-advantaged
+  - If consistency<1.0, it means not all the TA sites in the gene agree with the essentiality call, which is made by majority vote. It is OK if a small fraction of TA sites in a gene are labeled otherwise. If it is a large fraction (consistency close to 50%), it might be a 'domain-essential' (multi-domain gene where one domain is ES and the other is NE). 
+ * **probES** - conditional probability (normalized) of the mean insertion count if the gene were essential 
+ * **probGD** - conditional probability (normalized) of the mean insertion count if the gene were growth-defect (i.e. disruption of gene causes a growth defect)
+ * **probNE** - conditional probability (normalized) of the mean insertion count if the gene were non-essential 
+ * **probGA** - conditional probability (normalized) of the mean insertion count if the gene were growth-advantaged
  * **conf** - confidence score (normalized conditional joint probability of the insertion statistics, given the actual essential call made by the HMM)
- * **flag** - genes that are ambiguous or have low confidence are labeled as such
-  * *low-confidence* means: the HMM call is inconsistent with statistics of insertion counts in gene, so the HMM call should be ignored
-  * *ambiguous* means: the HMM gives high probability to 2 adjacent essentiality states, like ES-or-GD, or GD-or-NE; these could be borderline cases where the gene could be in either category
-
-If consistency<1.0, it means not all the TA sites in the gene agree with the essentiality call, which is made by majority vote. 
-It is OK if a small fraction of TA sites in a gene are labeled otherwise.
-If it is a large fraction (consistency close to 50%), it might be a 'domain-essential'
-(multi-domain gene where one domain is ES and the other is NE).
+ * **flag** - genes that are ambiguous or have low confidence are labeled as such:
+  - *low-confidence* means the proability of the HMM call is <0.2 based on the mean insertion counts in gene, so the HMM call should be ignored
+  - *ambiguous* means the called state has prob>0.2, but there is another state with higher probability; these could be borderline cases where the gene could be in either category
 
-Here is an example to show what the additional columns look like.
+Here is an example to show what the additional columns look like. 
+The Mean insertion counts for the 4 essentiality states can be seen in the header.
 Note that MAB_0005 was called NE, but only has insertions at 1 out of 4 TA sites
 (sat=25%) with a mean count of only 7, so it is more consistent with ES; hence it is
 flagged as low-confidence (and one should ignore the NE call).
 
-Note!!! *The extra column headers still need to be added to HMM_conf.py: cons
-probES probGD probNE probGA conf flag.*
-
 ::
 
-  # avg gene-level consistency of HMM states: 0.9894
-  # state posterior probability distributions:
-  #  ES: genes=364, meanSat=0.038, stdSat=0.075, meanNZmean=12.0, stdNZmean=61.1
-  #  GD: genes=146, meanSat=0.297, stdSat=0.227, meanNZmean=9.6, stdNZmean=16.0
-  #  NE: genes=3971, meanSat=0.816, stdSat=0.169, meanNZmean=147.4, stdNZmean=110.7
-  #  GA: genes=419, meanSat=0.923, stdSat=0.11, meanNZmean=393.3, stdNZmean=210.0
   #HMM - Genes
   #command line: python3 ../../transit/src/transit.py hmm TnSeq-Ref-1.wig,TnSeq-Ref-2.wig,TnSeq-Ref-3.wig abscessus.prot_table HMM_Ref_sync3.txt -iN 5 -iC 5
   #summary of gene calls: ES=364, GD=146, NE=3971, GA=419, N/A=23
   #key: ES=essential, GD=insertions cause growth-defect, NE=non-essential, GA=insertions confer growth-advantage, N/A=not analyzed (genes with 0 TA sites)
-  #ORF     gene annotation                                TAs ES sites GD sites NE sites GA sites saturation   mean call consis probES  probGD   probNE   probGA   conf   flag
-  MAB_0001 dnaA Chromosomal replication initiator protein 24  24       0         0        0       0.0417       1.00 ES   1.0  0.22865 0.025725 0.000515 0.000169 0.8965
-  MAB_0002 dnaN DNA polymerase III, beta subunit (DnaN)   13  13       0         0        0       0.0769       1.00 ES   1.0  0.13582 0.028284 0.000536 0.000175 0.8241
-  MAB_0003 gnD  6-phosphogluconate dehydrogenase Gnd      19   0       0        19        0       0.9474      81.33 NE   1.0  0.00000 0.000000 0.001181 0.000329 0.7870
-  MAB_0004 recF DNA replication and repair protein RecF   15   0       0        15        0       0.6667      80.80 NE   1.0  0.00000 0.000000 0.001175 0.000307 0.7926
-  MAB_0005 -    hypothetical protein                       4   0       0         4        0       0.2500       7.00 NE   1.0  0.00000 0.052913 0.000661 0.000207 0.0123 low-confidence
-  MAB_0006 -    DNA gyrase (subunit B) GyrB               30  30       0         0        0       0.0000       0.00 ES   1.0  0.12864 0.020461 0.000487 0.000161 0.8590
-  MAB_0007 -    Hypothetical protein                      24   0       0         0       24       0.9167     456.64 GA   1.0  0.00000 0.000000 0.000145 0.000433 0.7487
-  MAB_0008 -    Hypothetical protein                       3   0       0         0        3       0.6667     173.00 GA   1.0  0.00000 0.000000 0.780528 0.219472 0.2195 low-confidence
-  MAB_0009 -    Hypothetical protein                       6   0       0         0        6       1.0000     495.83 GA   1.0  0.00000 0.000000 0.157296 0.842704 0.8427 
-  MAB_0010c -   Hypothetical protein                      14   0       0         0       14       0.9286     390.08 GA   1.0  0.00000 0.000000 0.435214 0.564786 0.5648 ambiguous
+  # HMM confidence info:
+  # avg gene-level consistency of HMM states: 0.9894
+  # state posterior probability distributions:
+  #   Mean[ES]:   Norm(mean=0.0,stdev=1.0)
+  #   Mean[GD]:   Norm(mean=1.73,stdev=3.36)
+  #   Mean[NE]:   Norm(mean=101.64,stdev=86.89)
+  #   Mean[GA]:   Norm(mean=340.0,stdev=182.21)
+  # num low-confidence genes=463, num ambiguous genes=448
+  #ORF      gene  annotation                              TAs  ESsites  GDsites  NEsites GAsites saturation NZmean call Mean consis probES probGD probNE probGA  conf   flag
+  MAB_0001  dnaA  Chromosomal replication initiator       24     24         0        0      0      0.0417     1.00  ES   0.0   1.0  0.7878 0.2068 0.0045 0.0007  0.7878     
+  MAB_0002  dnaN  DNA polymerase III, beta subunit (DnaN) 13     13         0        0      0      0.0769     1.00  ES   0.1   1.0  0.7866 0.2080 0.0045 0.0007  0.7866     
+  MAB_0003  gnD   6-phosphogluconate dehydrogenase Gnd    19     0          0       19      0      0.9474    81.33  NE  77.1   1.0  0.0    0.0    0.8509 0.1490  0.8509     
+  MAB_0004  recF  DNA replication and repair protein RecF 15     0          0       15      0      0.6667    80.80  NE  53.9   1.0  0.0    0.0    0.8608 0.1391  0.8608     
+  MAB_0005  -     hypothetical protein                     4     0          0        4      0      0.2500     7.00  NE   1.8   1.0  0.4152 0.5714 0.0114 0.0018  0.0114 low-confidence
+  MAB_0006  -     DNA gyrase (subunit B) GyrB (DNA topoi  30     30         0        0      0      0.0000     0.00  ES   0.0   1.0  0.7889 0.2056 0.0045 0.0007  0.789     
   ...
 
 


=====================================
src/pytransit/generic_tools/file_system_py.py
=====================================
@@ -0,0 +1,395 @@
+import sys
+import os
+import glob
+import shutil
+from pathlib import Path
+
+# python 3.5-3.8 has a problem with __file__ being relative to the initial cwd, (which isnt saved anywhere AFAIK) so when the user does a chdir() all the relative __file__ paths are meaningless
+# this is an attempt at a workaround, although it does fail if the user changed BOTH the PWD env variable and did os.chdir() before importing this module
+inital_pwd = None
+if os.name == 'nt': # "if windows"
+    inital_pwd = os.getenv("CD") # this is the windows pwd var
+else:
+    inital_pwd = os.getenv("PWD")
+
+intial_cwd = os.getcwd()
+if inital_pwd != intial_cwd:
+    initial_cwd = inital_pwd
+
+def write(data, *, to=None, path=None, force=True):
+    path = to or path
+    # make sure the path exists
+    if force: ensure_is_folder(parent_path(path))
+    with open(path, 'w') as the_file:
+        the_file.write(str(data))
+
+def read(filepath):
+    try:
+        with open(filepath,'r') as f:
+            output = f.read()
+    except:
+        output = None
+    return output    
+    
+
+def remove(path):
+    if not Path(path).is_symlink() and os.path.isdir(path):
+        shutil.rmtree(path)
+    else:
+        try:
+            os.remove(path)
+        except:
+            pass
+
+def ensure_is_folder(path, *, force=True):
+    """
+    Note: very forcefull
+    """
+    parent_path = parent_folder(path)
+    # root is always a folder
+    if parent_path == path:
+        return 
+    
+    # make sure the parent actually is a folder
+    if not is_folder(parent_path):
+        # recurse up
+        ensure_is_folder(parent_path, force=force)
+    
+    # delete anything in the way
+    if force and exists(path) and not is_folder(path):
+        remove(path)
+    
+    # now make the folder
+    os.makedirs(path, exist_ok=True)
+    return path
+
+def ensure_is_file(path, *, force=True):
+    """
+    Note: very forcefull
+    """
+    ensure_is_folder(parent_path(path), force=force)
+    # delete a folder if its in the way
+    if force and is_folder(path):
+        remove(path)
+    if not exists(path):
+        write("", to=path)
+    return path
+
+def move_out_of_the_way(path, extension=".old"):
+    if exists(path):
+        new_path = path+extension
+        move_out_of_the_way(new_path, extension)
+        move(item=path, to=os.path.dirname(new_path), new_name=os.path.basename(new_path))
+    
+def clear_a_path_for(path, overwrite=False, extension=".old"):
+    original_path = path
+    paths = []
+    while os.path.dirname(path) != path:
+        paths.append(path)
+        path = os.path.dirname(path)
+    
+    paths.reverse()
+    for each_path in paths:
+        if not exists(each_path):
+            break
+        if is_file(each_path):
+            if overwrite:
+                remove(each_path)
+            else:
+                move_out_of_the_way(each_path, extension)
+    ensure_is_folder(os.path.dirname(original_path))
+    if overwrite:
+        remove(original_path)
+    else:
+        move_out_of_the_way(original_path, extension)
+    
+    return original_path
+
+def copy(item, *, to, new_name="", force=True):
+    if new_name == "":
+        raise Exception('copy() needs a new_name= argument:\n    copy(item="location", to="directory", new_name="")\nif you want the name to be the same as before do new_name=None')
+    elif new_name is None:
+        new_name = os.path.basename(item)
+    
+    # get the full path
+    to = os.path.join(to, new_name)
+    # if theres a file in the target, remove it
+    if force and exists(to):
+        remove(to)
+    # make sure the containing folder exists
+    ensure_is_folder(os.path.dirname(to))
+    if os.path.isdir(item):
+        shutil.copytree(item, to)
+    else:
+        return shutil.copy(item, to)
+
+def move(item, *, to=None, new_name="", force=True):
+    if new_name == "":
+        raise Exception('move() needs a new_name= argument:\n    move(item="location", to="directory", new_name="")\nif you want the name to be the same as before do new_name=None')
+    elif new_name is None:
+        new_name = os.path.basename(item)
+    
+    # get the full path
+    to = os.path.join(to, new_name)
+    # make sure the containing folder exists
+    ensure_is_folder(os.path.dirname(to))
+    shutil.move(item, to)
+
+def exists(path):
+    return os.path.exists(path)
+
+def final_target_of(path):
+    # resolve symlinks
+    if os.path.islink(path):
+        have_seen = set()
+        while os.path.islink(path):
+            path = os.readlink(path)
+            if path in have_seen:
+                return None # circular broken link
+            have_seen.add(path)
+    return path
+
+def is_folder(path):
+    # resolve symlinks
+    final_target = final_target_of(path)
+    if not final_target:
+        return False
+    return os.path.isdir(final_target)
+# aliases    
+is_dir = is_directory = is_folder
+    
+def is_file(path):
+    return os.path.isfile(path)
+
+def ls(path="."):
+    if is_folder(path):
+        return os.listdir(path)
+    else:
+        return [path]
+
+def list_paths_in(path):
+    if is_folder(path):
+        return [ join(path, each) for each in os.listdir(path) ]
+    else:
+        return []
+
+def list_basenames_in(path):
+    if is_folder(path):
+        return os.listdir(path)
+    else:
+        return []
+
+def list_file_paths_in(path):
+    if is_folder(path):
+        return [ join(path, each) for each in os.listdir(path) if is_file(join(path, each)) ]
+    else:
+        return []
+
+def list_folder_paths_in(path):
+    if is_folder(path):
+        return [ join(path, each) for each in os.listdir(path) if is_folder(join(path, each)) ]
+    else:
+        return []
+
+# iterate is MUCH faster than listing for large folders
+def iterate_paths_in(path, recursively=False):
+    if is_folder(path):
+        with os.scandir(path) as iterator:
+            for entry in iterator:
+                yield os.path.join(path, entry.name)
+        if recursively:
+            for each_sub_folder in iterate_folder_paths_in(path, recursively=True):
+                yield from iterate_paths_in(each_sub_folder, recursively=True)
+
+def iterate_basenames_in(path):
+    if is_folder(path):
+        with os.scandir(path) as iterator:
+            for entry in iterator:
+                yield entry.name
+
+def iterate_file_paths_in(path, recursively=False):
+    if is_folder(path):
+        with os.scandir(path) as iterator:
+            for entry in iterator:
+                if entry.is_file():
+                    yield os.path.join(path, entry.name)
+        if recursively:
+            for each_sub_folder in iterate_folder_paths_in(path, recursively=True):
+                yield from iterate_file_paths_in(each_sub_folder, recursively=False)
+
+def iterate_folder_paths_in(path, recursively=False, have_seen=None):
+    if recursively and have_seen is None: have_seen = set()
+    if is_folder(path):
+        with os.scandir(path) as iterator:
+            for entry in iterator:
+                entry_is_folder = False
+                if entry.is_dir():
+                    entry_is_folder = True
+                    each_subpath_final_target = os.path.join(path, entry.name)
+                # check if symlink to dir
+                elif entry.is_symlink():
+                    each_subpath = os.path.join(path, entry.name)
+                    each_subpath_final_target = final_target_of(each_subpath)
+                    if is_folder(each_subpath_final_target):
+                        entry_is_folder = True
+                
+                if entry_is_folder:
+                    yield os.path.join(path, entry.name)
+                    if recursively:
+                        have_not_already_seen_this = each_subpath_final_target not in have_seen
+                        have_seen.add(each_subpath_final_target) # need to add it before recursing
+                        if have_not_already_seen_this:
+                            yield from iterate_folder_paths_in(each_subpath_final_target, recursively, have_seen)
+
+def glob(path):
+    return glob.glob(path)
+
+def touch(path):
+    ensure_is_folder(dirname(path))
+    if not exists(path):
+        write("", to=path)
+
+def touch_dir(path):
+    ensure_is_folder(path)
+
+def parent_folder(path):
+    return os.path.dirname(path)
+
+parent_path = parent_folder
+dirname = parent_folder
+
+def basename(path):
+    return os.path.basename(path)
+
+def name(path):
+    filename, file_extension = os.path.splitext(os.path.basename(path))
+    return filename
+
+def extname(path):
+    filename, file_extension = os.path.splitext(path)
+    return file_extension
+
+def without_ext(path):
+    started_with_dot_slash = path.startswith("./")
+    parent_folders = os.path.dirname(path)
+    filename, file_extension = os.path.splitext(os.path.basename(path))
+    output = os.path.join(parent_folders, filename)
+    if not started_with_dot_slash and output.startswith("./"):
+        output = output[2:]
+    return output
+
+def without_any_ext(path):
+    started_with_dot_slash = path.startswith("./")
+    parent_folders = os.path.dirname(path)
+    filename = os.path.basename(path).split('.')[0]
+    output = os.path.join(parent_folders, filename)
+    if not started_with_dot_slash and output.startswith("./"):
+        output = output[2:]
+    return output
+
+without_extension = without_ext # alias
+without_any_extension = without_any_ext # alias
+
+def path_pieces(path):
+    """
+    example:
+        *folders, file_name, file_extension = path_pieces("/this/is/a/filepath.txt")
+    """
+    folders = []
+    while 1:
+        path, folder = os.path.split(path)
+
+        if folder != "":
+            folders.append(folder)
+        else:
+            if path != "":
+                folders.append(path)
+
+            break
+    folders.reverse()
+    *folders, file = folders
+    filename, file_extension = os.path.splitext(file)
+    return [ *folders, filename, file_extension ]
+
+def join(*paths):
+    return os.path.join(*paths)
+
+def is_absolute_path(path):
+    return os.path.isabs(path)
+
+def is_relative_path(path):
+    return not os.path.isabs(path)
+
+def make_absolute_path(to, coming_from=None):
+    # if coming from cwd, its easy
+    if coming_from is None:
+        return os.path.abspath(to)
+    
+    # source needs to be absolute
+    coming_from_absolute = os.path.abspath(coming_from)
+    # if other path is  absolute, make it relative to coming_from
+    relative_path = to
+    if os.path.isabs(to):
+        relative_path = os.path.relpath(to, coming_from_absolute)
+    return os.path.join(coming_from_absolute, relative_path)
+
+def make_relative_path(*, to, coming_from=None):
+    if coming_from is None:
+        coming_from = get_cwd()
+    return os.path.relpath(to, coming_from)
+
+def get_cwd():
+    return os.getcwd()
+
+def walk_up_until(file_to_find, start_path=None):
+    here = start_path or os.getcwd()
+    if not os.path.isabs(here):
+        here = os.path.join(os.getcwd(), file_to_find)
+    
+    while 1:
+        check_path = os.path.join(here, file_to_find)
+        if os.path.exists(check_path):
+            return check_path
+        
+        # reached the top
+        if here == os.path.dirname(here):
+            return None
+        else:
+            # go up a folder
+            here = os.path.dirname(here)
+
+def local_path(*paths):
+    import os
+    import inspect
+    
+    cwd = os.getcwd()
+    # https://stackoverflow.com/questions/28021472/get-relative-path-of-caller-in-python
+    try:
+        frame = inspect.stack()[1]
+        module = inspect.getmodule(frame[0])
+        directory = os.path.dirname(module.__file__)
+    # if inside a repl (error =>) assume that the working directory is the path
+    except (AttributeError, IndexError) as error:
+        directory = cwd
+    
+    if is_absolute_path(directory):
+        return join(directory, *paths)
+    else:
+        # See note at the top
+        return join(intial_cwd, directory, *paths)
+        
+
+def line_count_of(file_path):
+    # from stack overflow "how to get a line count of a large file cheaply"
+    def _make_gen(reader):
+        while 1:
+            b = reader(2**16)
+            if not b: break
+            yield b
+    with open(file_path, "rb") as file:
+        count = sum(buf.count(b"\n") for buf in _make_gen(file.raw.read))
+    
+    return count
+
+def get_home():
+    return str(Path.home())


=====================================
src/pytransit/norm_tools.py
=====================================
@@ -94,8 +94,59 @@ class TotReadsNorm(NormMethod):
         return (data, factors)
 
 
+# Quantile Matching to ideal geometric distribution
+# intended for single wig files - what to do for collection of multiple samples? post-process with TTR
+# scale them uniformly so mean is 100 (not just NZmean; scale for saturation)
+
+class QMgeomNorm(NormMethod):
+    name = "QMgeom"
+
+    # note: like many of the other methods below, ignore wigList and assume data is already read-in
+
+    @staticmethod
+    def normalize(data, wigList=[], annotationPath=""):
+      Nsamples = data.shape[0]
+      norm_data = data.copy()
+      NZmeans = []
+      for i in range(Nsamples):
+        counts = data[i]
+        adjusted,mu = QMgeomNorm.quantile_matching(counts)
+        norm_data[i] = adjusted
+        NZmeans.append(mu)
+      (norm_data, factors) = TTRNorm.normalize(norm_data)
+      return (norm_data,NZmeans)
+
+    # given a numpy array of counts, return array of adjusted counts in same order
+
+    def quantile_matching(counts):
+      counts = counts.tolist()
+      data = list(enumerate(counts)) # list of: [(index,count)]
+
+      NZsites = list(filter(lambda x: x[1]>0,data))
+      numpy.random.shuffle(NZsites)
+      NZsites = sorted(NZsites,key=lambda x: x[1])
+
+      NZcounts = [x[1] for x in NZsites]
+      mu = numpy.mean(NZcounts)
+      p = 1/float(mu)
+      #p = 1/100. # force all samples to have mean of 100 (however, will apply TTR post-hoc)
+
+      sample = scipy.stats.geom.rvs(p,size=len(NZsites))
+      sample = sorted(sample)
+
+      # both distributions start at count of 1, but have to deal with ties, e.g. if one distn has more 1's than the other
+
+      NZsites = [(coord,count,newcnt) for (coord,count),newcnt in zip(NZsites,sample)] # append adjusted counts to each site
+      Zsites = filter(lambda x: x[1]==0,data)
+      Zsites = [(coord,count,0) for coord,count in Zsites]
+
+      allsites = sorted(Zsites+NZsites)
+      newcounts = numpy.array([x[2] for x in allsites])
+      return newcounts,mu
+
+
 class TTRNorm(NormMethod):
-    name = "emphist"
+    name = "emphist" # should be "TTR"?
 
     def empirical_theta(X):
         """Calculates the observed density of the data.
@@ -550,7 +601,7 @@ methods["zinfnb"] = ZeroInflatedNBNorm
 methods["quantile"] = QuantileNorm
 methods["aBGC"] = AdaptiveBGCNorm
 methods["emphist"] = EmpHistNorm
-
+methods["QMgeom"] = QMgeomNorm
 
 #########################
 def normalize_data(data, method="nonorm", wigList=[], annotationPath=""):


=====================================
src/pytransit/stat_tools.py
=====================================
@@ -433,7 +433,7 @@ def loess(X, Y, h=10000):
         smoothed[i] = B*x + A
     return smoothed
 
-#
+# X is coords, Y is counts
 
 def loess_correction(X, Y, h=10000, window=100):
     #TODO: Write docstring
@@ -447,11 +447,10 @@ def loess_correction(X, Y, h=10000, window=100):
 
     ysmooth = loess(x_w, y_w, h)
     mline = numpy.mean(y_w)
-    y_w * (ysmooth/mline)
 
     normalized_Y = numpy.zeros(len(Y))
     for i in range(size):
-        normalized_Y[window*i:window*(i+1)] = Y[window*i:window*(i+1)] * (ysmooth[i]/mline)
+        normalized_Y[window*i:window*(i+1)] = Y[window*i:window*(i+1)] / (ysmooth[i]/mline)
 
     return normalized_Y
 



View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/commit/197c36f1c1e6f4ce11dd83143fc7d947dc6b3250

-- 
View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/commit/197c36f1c1e6f4ce11dd83143fc7d947dc6b3250
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20231129/1317c157/attachment-0001.htm>