[med-svn] [Git][med-team/tnseq-transit][upstream] New upstream version 3.2.3

Nilesh Patra (@nilesh) gitlab at salsa.debian.org
Wed Oct 20 21:32:04 BST 2021



Nilesh Patra pushed to branch upstream at Debian Med / tnseq-transit


Commits:
fac26506 by Nilesh Patra at 2021-10-21T01:42:57+05:30
New upstream version 3.2.3
- - - - -


13 changed files:

- − .travis.yml
- CHANGELOG.md
- src/pytpp/tpp_tools.py
- src/pytransit/__init__.py
- src/pytransit/analysis/anova.py
- src/pytransit/analysis/base.py
- src/pytransit/analysis/gumbel.py
- src/pytransit/analysis/zinb.py
- src/pytransit/data/samples_metadata_cg_covar.txt
- src/pytransit/doc/source/transit_methods.rst
- src/pytransit/stat_tools.py
- src/pytransit/tnseq_tools.py
- tests/test_tpp.py


Changes:

=====================================
.travis.yml deleted
=====================================
@@ -1,16 +0,0 @@
-language: python
-
-services:
-  - docker
-
-before_install:
-  - docker build -t transit .
-
-DISPLAY: 0.0
-notifications:
-  email:
-    on_success: change # default: change
-    on_failure: change # default: always
-
-script: travis_wait 20 docker run -t transit
-


=====================================
CHANGELOG.md
=====================================
@@ -2,8 +2,15 @@
 All notable changes to this project will be documented in this file.
 
 
-
-## Version 3.2.2 2022-09-08
+## Version 3.2.3 2021-10-16
+#### TRANSIT:
+  - added Binomial essentials (EB) to Gumbel analysis (supplementing genes categorized as E), to help with low-saturation datasets
+  - modified ANOVA and ZINB so that --include-conditions and --exclude-conditions refer to original Conditions column in samples metadata file (instead of whatever is specified by --conditions) 
+	
+#### TPP:
+  - improved metrics reported in *.tn_stats by TPP, to help diagnose why reads don't map
+	
+## Version 3.2.2 2021-09-08
 #### TRANSIT:
  - fixed bug in converting gff_to_prot_table
  - fixed bug in tn5gaps (fixes some false negative calls)


=====================================
src/pytpp/tpp_tools.py
=====================================
@@ -310,7 +310,7 @@ def extract_staggered(infile,outfile,vars):
   output_failed.close()                                 # [WM] [add]
   if vars.tot_tgtta == 0:
     raise ValueError("Error: Input files did not contain any reads matching prefix sequence with %d mismatches" % vars.mm1)
-
+  vars.tot_reads = tot
 
 def message(s):
   #print("[tn_preprocess]",s)
@@ -511,7 +511,7 @@ def template_counts(ref,sam,bcfile,vars):
     genome = read_genome(ref, replicon_index)
     for i in range(len(genome)-1):
       #if genome[i:i+2].upper()=="TA":
-      if vars.transposon=="Himar1" and genome[i:i+2].upper()!="TA": continue
+      if vars.transposon=="Himar1" and genome[i:i+2].upper()!="TA": continue 
       else:
         pos = i+1
         h = hits[replicon_names[replicon_index]].get(pos,[])
@@ -1100,20 +1100,7 @@ def generate_output(vars):
     FR_corr.append(cur_FR_corr)
     BC_corr.append(cur_BC_corr)
 
-  primer = "CTAGAGGGCCCAATTCGCCCTATAGTGAGT"
-  vector = "CTAGACCGTCCAGTCTGGCAGGCCGGAAAC"
-  adapter = "GATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
-  Himar1 = "ACTTATCAGCCAACCTGTTA"
-  tot_reads,nprimer,nvector,nadapter,misprimed = 0,0,0,0,0
-  for line in open(vars.reads1):
-    if line[0]=='>': tot_reads += 1; continue
-    if primer in line: nprimer += 1
-    if vector in line: nvector += 1
-    if adapter in line: nadapter += 1
-    #if "TGTTA" in line and Himar1 not in line: misprimed += 1
-    # basically, these should correspond to insertions at non-TA sites (so the terminal TA of ...TGTTA will be different)
-    if Himar1[:-5] in line and Himar1 not in line: misprimed += 1 
-
+  tot_reads = vars.tot_reads
   read_length = get_read_length(vars.base + ".reads1")
   mean_r1_genomic = get_genomic_portion(vars.base + ".trimmed1")
   if vars.single_end==False: mean_r2_genomic = get_genomic_portion(vars.base + ".genomic2")
@@ -1133,7 +1120,7 @@ def generate_output(vars):
   output.write('# ref_genome: %s\n' % vars.ref)
   output.write('# replicon_ids: %s\n' % ','.join(vars.replicon_ids))
   output.write("# total_reads (or read pairs): %s\n" % tot_reads)
-  #output.write("# truncated_reads %s (fragments shorter than the read length; ADAP2 appears in read1)\n" % vars.truncated_reads)
+  output.write("# truncated_reads %s (genomic inserts shorter than the read length; ADAP2 appears in read1)\n" % vars.truncated_reads)
   output.write("# trimmed_reads (reads with valid Tn prefix, and insert size>20bp): %s\n" % vars.tot_tgtta)
   output.write("# reads1_mapped: %s\n" % vars.r1)
   output.write("# reads2_mapped: %s\n" % vars.r2)
@@ -1187,10 +1174,30 @@ def generate_output(vars):
     output.write("# FR_corr (Fwd templates vs. Rev templates): %0.3f\n" % FR_corr[0])
     output.write("# BC_corr (reads vs. templates, summed over both strands): %0.3f\n" % BC_corr[0])
 
-  output.write("# primer_matches: %s reads (%0.1f%%) contain %s (Himar1)\n" % (nprimer,nprimer*100/float(tot_reads),primer))
-  output.write("# vector_matches: %s reads (%0.1f%%) contain %s (phiMycoMarT7)\n" % (nvector,nvector*100/float(tot_reads),vector))
-  output.write("# adapter_matches: %s reads (%0.1f%%) contain %s (Illumina/TruSeq index)\n" % (nadapter,nadapter*100/float(tot_reads),adapter))
-  output.write("# misprimed_reads: %s reads (%0.1f%%) contain Himar1 prefix but don't end in TGTTA\n" % (misprimed,misprimed*100/float(tot_reads)))
+  primer = "CTAGAGGGCCCAATTCGCCCTATAGTGAGT"
+  vector = "CTAGACCGTCCAGTCTGGCAGGCCGGAAAC"
+  adapter = "GATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
+  ADAPTER2 = "TACCACGACCA" # rc of const2 region of R2, between barcode and genomic; these reads will be truncated here
+  Himar1 = "ACTTATCAGCCAACCTGTTA"
+  trimmed_reads,nprimer,nvector,nadapter,misprimed,ntruncated = 0,0,0,0,0,0
+  for line in open(vars.trimmed1):
+    if line[0]=='>': trimmed_reads += 1; continue
+    if primer in line: nprimer += 1
+    if vector in line: nvector += 1
+    if adapter in line: nadapter += 1
+    #if "TGTTA" in line and Himar1 not in line: misprimed += 1
+    # basically, these should correspond to insertions at non-TA sites (so the terminal TA of ...TGTTA will be different)
+    if Himar1[:-5] in line and Himar1 not in line: misprimed += 1 
+
+  output.write("# Break-down of total reads (%s):\n" % tot_reads)
+  output.write("#  %s reads (%0.1f%%) lack the expected Tn prefix\n" % (tot_reads-vars.tot_tgtta,(tot_reads-vars.tot_tgtta)*100/float(tot_reads)))
+
+  output.write("# Break-down of trimmed reads with valid Tn prefix (%s):\n" % trimmed_reads)
+  output.write("#  primer_matches: %s reads (%0.1f%%) contain %s (Himar1)\n" % (nprimer,nprimer*100/float(trimmed_reads),primer))
+  output.write("#  vector_matches: %s reads (%0.1f%%) contain %s (phiMycoMarT7)\n" % (nvector,nvector*100/float(trimmed_reads),vector))
+  output.write("#  adapter_matches: %s reads (%0.1f%%) contain %s (Illumina/TruSeq index)\n" % (nadapter,nadapter*100/float(trimmed_reads),adapter))
+  output.write("#  misprimed_reads: %s reads (%0.1f%%) contain Himar1 prefix but don't end in TGTTA\n" % (misprimed,misprimed*100/float(trimmed_reads)))
+
   output.write("# read_length: %s bp\n" % read_length)
   output.write("# mean_R1_genomic_length: %0.1f bp\n" % mean_r1_genomic)
   if vars.single_end==False: output.write("# mean_R2_genomic_length: %0.1f bp\n" % mean_r2_genomic)


=====================================
src/pytransit/__init__.py
=====================================
@@ -2,6 +2,6 @@
 __all__ = ["transit_tools", "tnseq_tools", "norm_tools", "stat_tools"]
 
 
-__version__ = "v3.2.2"
+__version__ = "v3.2.3"
 prefix = "[TRANSIT]"
 


=====================================
src/pytransit/analysis/anova.py
=====================================
@@ -34,9 +34,9 @@ class AnovaMethod(base.MultiConditionMethod):
     """
     anova
     """
-    def __init__(self, combined_wig, metadata, annotation, normalization, output_file, ignored_conditions=[], included_conditions=[], nterm=0.0, cterm=0.0, PC=1, refs=[]):
+    def __init__(self, combined_wig, metadata, annotation, normalization, output_file, excluded_conditions=[], included_conditions=[], nterm=0.0, cterm=0.0, PC=1, refs=[]):
         base.MultiConditionMethod.__init__(self, short_name, long_name, short_desc, long_desc, combined_wig, metadata, annotation, output_file,
-                normalization=normalization, ignored_conditions=ignored_conditions, included_conditions=included_conditions, nterm=nterm, cterm=cterm)
+                normalization=normalization, excluded_conditions=excluded_conditions, included_conditions=included_conditions, nterm=nterm, cterm=cterm)
         self.PC = PC
         self.refs = refs
 
@@ -61,18 +61,18 @@ class AnovaMethod(base.MultiConditionMethod):
         PC = int(kwargs.get("PC", 5))
         refs = kwargs.get("-ref",[]) # list of condition names to use a reference for calculating LFCs
         if refs!=[]: refs = refs.split(',')
-        ignored_conditions = list(filter(None, kwargs.get("-ignore-conditions", "").split(",")))
+        excluded_conditions = list(filter(None, kwargs.get("-exclude-conditions", "").split(",")))
         included_conditions = list(filter(None, kwargs.get("-include-conditions", "").split(",")))
 
         # check for unrecognized flags
-        flags = "-n --ignore-conditions --include-conditions -iN -iC -PC --ref".split()
+        flags = "-n --exclude-conditions --include-conditions -iN -iC -PC --ref".split()
         for arg in rawargs:
           if arg[0]=='-' and arg not in flags:
             self.transit_error("flag unrecognized: %s" % arg)
             print(AnovaMethod.usage_string())
             sys.exit(0)
 
-        return self(combined_wig, metadata, annotation, normalization, output_file, ignored_conditions, included_conditions, NTerminus, CTerminus, PC, refs)
+        return self(combined_wig, metadata, annotation, normalization, output_file, excluded_conditions, included_conditions, NTerminus, CTerminus, PC, refs)
 
     def wigs_to_conditions(self, conditionsByFile, filenamesInCombWig):
         """
@@ -195,8 +195,20 @@ class AnovaMethod(base.MultiConditionMethod):
             conditionsByFile,
             filenamesInCombWig)
 
-        conditionsList = self.select_conditions(conditions,self.included_conditions,self.ignored_conditions,orderingMetadata)
-        data, conditions, _, _ = self.filter_wigs_by_conditions2(data, conditions, conditionsList)
+        conditionsList = self.select_conditions(conditions,self.included_conditions,self.excluded_conditions,orderingMetadata)
+        #data, conditions, _, _ = self.filter_wigs_by_conditions2(data, conditions, conditionsList)
+
+        metadata = self.get_samples_metadata()
+        conditionNames = metadata['Condition'] # original Condition names for each sample, as ordered list        
+        fileNames = metadata['Filename'] 
+
+        data, fileNames, conditionNames, conditions, _, _ = self.filter_wigs_by_conditions3(
+                data,
+                fileNames,
+                conditionNames, # original Condition column in samples metadata file
+                self.included_conditions,
+                self.excluded_conditions,
+                conditions = conditionNames) # this is kind of redundant for ANOVA, but it is here because condition, covars, and interactions could have been manipulated for ZINB
 
         genes = tnseq_tools.read_genes(self.annotation_path)
 
@@ -238,7 +250,7 @@ class AnovaMethod(base.MultiConditionMethod):
  Optional Arguments:
   -n <string>         :=  Normalization method. Default: -n TTR
   --include-conditions <cond1,...> := Comma-separated list of conditions to use for analysis (Default: all)
-  --ignore-conditions <cond1,...> := Comma-separated list of conditions to ignore (Default: none)
+  --exclude-conditions <cond1,...> := Comma-separated list of conditions to exclude (Default: none)
   --ref <cond> := which condition(s) to use as a reference for calculating LFCs (comma-separated if multiple conditions)
   -iN <N> :=  Ignore TAs within given percentage (e.g. 5) of N terminus. Default: -iN 0
   -iC <N> :=  Ignore TAs within given percentage (e.g. 5) of C terminus. Default: -iC 0


=====================================
src/pytransit/analysis/base.py
=====================================
@@ -509,7 +509,7 @@ class MultiConditionMethod(AnalysisMethod):
     Class to be inherited by analysis methods that compare essentiality between multiple conditions (e.g Anova).
     '''
 
-    def __init__(self, short_name, long_name, short_desc, long_desc, combined_wig, metadata, annotation_path, output, normalization=None, LOESS=False, ignoreCodon=True, wxobj=None, ignored_conditions=[], included_conditions=[], nterm=0.0, cterm=0.0):
+    def __init__(self, short_name, long_name, short_desc, long_desc, combined_wig, metadata, annotation_path, output, normalization=None, LOESS=False, ignoreCodon=True, wxobj=None, excluded_conditions=[], included_conditions=[], nterm=0.0, cterm=0.0):
         AnalysisMethod.__init__(self, short_name, long_name, short_desc, long_desc, output,
                 annotation_path, wxobj)
         self.combined_wig = combined_wig
@@ -518,24 +518,24 @@ class MultiConditionMethod(AnalysisMethod):
         self.NTerminus = nterm
         self.CTerminus = cterm
         self.unknown_cond_flag = "FLAG-UNMAPPED-CONDITION-IN-WIG"
-        self.ignored_conditions = ignored_conditions
+        self.excluded_conditions = excluded_conditions
         self.included_conditions = included_conditions
 
-    def filter_wigs_by_conditions(self, data, conditions, covariates = [], interactions = [], ignored_conditions = [], included_conditions = []):
+    def filter_wigs_by_conditions(self, data, conditions, covariates = [], interactions = [], excluded_conditions = [], included_conditions = []):
         """
-            Filters conditions that are ignored/included.
+            Filters conditions that are excluded/included.
             ([[Wigdata]], [Condition], [[Covar]], [Condition], [Condition]) -> Tuple([[Wigdata]], [Condition])
         """
-        ignored_conditions, included_conditions = (set(ignored_conditions), set(included_conditions))
+        excluded_conditions, included_conditions = (set(excluded_conditions), set(included_conditions))
         d_filtered, cond_filtered, filtered_indexes = [], [], [];
 
-        if len(ignored_conditions) > 0 and len(included_conditions) > 0:
-            self.transit_error("Both ignored and included conditions have len > 0")
+        if len(excluded_conditions) > 0 and len(included_conditions) > 0:
+            self.transit_error("Both excluded and included conditions have len > 0")
             sys.exit(0)
-        elif (len(ignored_conditions) > 0):
-            self.transit_message("conditions ignored: {0}".format(ignored_conditions))
+        elif (len(excluded_conditions) > 0):
+            self.transit_message("conditions excluded: {0}".format(excluded_conditions))
             for i, c in enumerate(conditions):
-              if (c != self.unknown_cond_flag) and (c not in ignored_conditions):
+              if (c != self.unknown_cond_flag) and (c not in excluded_conditions):
                 d_filtered.append(data[i])
                 cond_filtered.append(conditions[i])
                 filtered_indexes.append(i)
@@ -565,19 +565,19 @@ class MultiConditionMethod(AnalysisMethod):
     # input: conditions are per wig; orderingMetdata comes from tnseq_tools.read_samples_metadata()
     # output: conditionsList is selected subset of conditions (unique, in preferred order)
 
-    def select_conditions(self,conditions,included_conditions,ignored_conditions,orderingMetadata): 
+    def select_conditions(self,conditions,included_conditions,excluded_conditions,orderingMetadata): 
         if len(included_conditions)>0: conditionsList = included_conditions
         else:
           conditionsList = []
           for c in orderingMetadata['condition']: # the order conds appear in metadata file, duplicated for each sample
             if c not in conditionsList: conditionsList.append(c)
-        for c in ignored_conditions:  
+        for c in excluded_conditions:  
           if c in conditionsList: conditionsList.remove(c)
         return conditionsList
 
     def filter_wigs_by_conditions2(self, data, conditions, conditionsList, covariates = [], interactions = []):
         """
-            Filters conditions that are ignored/included.
+            Filters conditions that are excluded/included.
             ([[Wigdata]], [Condition], [[Covar]], [Condition], [Condition]) -> Tuple([[Wigdata]], [Condition])
         """
         d_filtered, cond_filtered, filtered_indexes = [], [], [];
@@ -596,7 +596,48 @@ class MultiConditionMethod(AnalysisMethod):
                 numpy.array(covariates_filtered),
                 numpy.array(interactions_filtered))
 
-#
+    def filter_wigs_by_conditions3(self, data, fileNames, conditionNames, included_cond, excluded_cond, conditions, covariates = [], interactions = []):
+        """
+            Filters conditions that are excluded/included; also extract cond, covar, and interaction labels
+            conditionNames: based on original Conditions column in metadata
+            conditions: user might have specified an alternative column to analyze (list of labels parallel to wigs)
+        """
+        fileNames_filtered, condNames_filtered, d_filtered, cond_filtered, filtered_indexes = [], [], [], [], []
+
+        for i in range(len(data)):
+          if (len(included_cond)==0 or conditionNames[i] in included_cond) and conditionNames[i] not in excluded_cond:
+            d_filtered.append(data[i])
+            fileNames_filtered.append(fileNames[i])
+            condNames_filtered.append(conditionNames[i])
+            cond_filtered.append(conditions[i])
+            filtered_indexes.append(i)
+
+        covariates_filtered = [[c[i] for i in filtered_indexes] for c in covariates]
+        interactions_filtered = [[c[i] for i in filtered_indexes] for c in interactions]
+
+        return (numpy.array(d_filtered),
+                numpy.array(fileNames_filtered),
+                numpy.array(condNames_filtered),
+                numpy.array(cond_filtered),
+                numpy.array(covariates_filtered),
+                numpy.array(interactions_filtered))
+
+    # return a hash table of parallel lists, indexed by column header
+    def get_samples_metadata(self):
+      data = {}
+      header = None
+      for line in open(self.metadata):
+        if line[0]=='#': continue
+        w = line.rstrip().split('\t')
+        if header==None: 
+          header = w
+          for col in header: data[col] = []
+        else:
+          for i in range(len(header)):
+            data[header[i]].append(w[i])
+      return data
+
+#######################3
 
 class TransitAnalysis:
     def __init__(self, sn, ln, short_desc, long_desc, tn, method_class=AnalysisMethod, gui_class=AnalysisGUI, filetypes=[TransitFile]):


=====================================
src/pytransit/analysis/gumbel.py
=====================================
@@ -333,6 +333,9 @@ class GumbelMethod(base.SingleConditionMethod):
         self.transit_message("Getting Data")
         (data, position) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
         (K,N) = data.shape
+        merged = numpy.sum(data,axis=0)
+        nsites,nzeros = merged.shape[0],numpy.sum(merged==0) # perhaps I should say >minCount
+        sat = (nsites-nzeros)/float(nsites)
 
         if self.normalization and self.normalization != "nonorm":
             self.transit_message("Normalizing using: %s" % self.normalization)
@@ -412,13 +415,14 @@ class GumbelMethod(base.SingleConditionMethod):
             
             phi_old = phi_new
             #Update progress
-            text = "Running Gumbel Method... %5.1f%%" % (100.0*(count+1)/(self.samples+self.burnin))
+            text = "Running Gumbel Method with Binomial Essentiality Calls... %5.1f%%" % (100.0*(count+1)/(self.samples+self.burnin))
             self.progress_update(text, count)
 
 
         ZBAR = numpy.apply_along_axis(numpy.mean, 1, Z_sample)
         (ess_t, non_t) = stat_tools.bayesian_ess_thresholds(ZBAR)
-
+        binomial_n = math.log10(0.05)/math.log10(G.global_phi())
+        
         #Orf    k   n   r   s   zbar
         self.output.write("#Gumbel\n")
         if self.wxobj:
@@ -432,16 +436,20 @@ class GumbelMethod(base.SingleConditionMethod):
 
         self.output.write("#Data: %s\n" % (",".join(self.ctrldata).encode('utf-8'))) 
         self.output.write("#Annotation path: %s\n" % self.annotation_path.encode('utf-8')) 
-        self.output.write("#FDR Corrected thresholds: %f, %f\n" % (ess_t, non_t))
-        self.output.write("#MH Acceptance-Rate:\t%2.2f%%\n" % (100.0*acctot/count))
+        self.output.write("#Trimming of TAs near termini: N-term=%s, C-term=%s (fraction of ORF length)\n" % (self.NTerminus,self.CTerminus))
+        self.output.write("#Significance thresholds (FDR-corrected): Essential if Zbar>%f, Non-essential if Zbar<%f\n" % (ess_t, non_t))
+        self.output.write("#Metropolis-Hastings Acceptance-Rate:\t%2.2f%%\n" % (100.0*acctot/count))
         self.output.write("#Total Iterations Performed:\t%d\n" % count)
         self.output.write("#Sample Size:\t%d\n" % i)
-        self.output.write("#phi estimate:\t%f\n" % numpy.average(phi_sample))
-        self.output.write("#Time: %s\n" % (time.time() - start_time))
-        self.output.write("#%s\n" % "\t".join(columns))
+        self.output.write("#Total number of TA sites: %s\n" % nsites)
+        self.output.write("#Genome-wide saturation: %s\n" % (round(sat,3))) # datasets merged
+        self.output.write("#phi estimate:\t%f (non-insertion probability in non-essential regions)\n" % numpy.average(phi_sample))
+        self.output.write("#Minimum number of TA sites with 0 insertions to be classified as essential by Binomial: \t%0.3f\n" % binomial_n)
+        self.output.write("#Time: %s s\n" % (round(time.time()-start_time,1)))
+
         i = 0
-        data = []
-        for g in G:
+        data,calls = [],[]
+        for j,g in enumerate(G):
             if not self.good_orf(g):
                 zbar = -1.0
             else:
@@ -449,18 +457,30 @@ class GumbelMethod(base.SingleConditionMethod):
                 i+=1
             if zbar > ess_t:
                 call = "E"
+            elif G.local_sites()[j]>binomial_n and G.local_thetas()[j]==0.0:
+                call = "EB"
             elif non_t <= zbar <= ess_t:
                 call = "U"
             elif 0 <= zbar < non_t:
                 call = "NE"
             else:
                 call = "S"
+            
             data.append("%s\t%s\t%s\t%d\t%d\t%d\t%d\t%f\t%s\n" % (g.orf, g.name, g.desc, g.k, g.n, g.r, g.s, zbar, call))
+            calls.append(call)
         data.sort()
+
+        self.output.write("#Summary of Essentiality Calls:\n")
+        self.output.write("#  E  = %4s (essential based on Gumbel)\n" % (calls.count("E")))
+        self.output.write("#  EB = %4s (essential based on Binomial)\n" % (calls.count("EB")))
+        self.output.write("#  NE = %4s (non-essential)\n" % (calls.count("NE")))
+        self.output.write("#  U  = %4s (uncertain)\n" % (calls.count("U")))
+        self.output.write("#  S  = %4s (too short)\n" % (calls.count("S")))
+
+        self.output.write("#%s\n" % "\t".join(columns))
         for line in data:
             self.output.write(line)
         self.output.close()
-
         self.transit_message("") # Printing empty line to flush stdout 
         self.transit_message("Adding File: %s" % (self.output.name))
         self.add_file(filetype="Gumbel")
@@ -494,8 +514,9 @@ class GumbelMethod(base.SingleConditionMethod):
         BetaGamma = B*tnseq_tools.getGamma()
         if n<EXACT: # estimate more accurately based on expected run len, using exact calc for small genes
           exprun = self.ExpectedRuns_cached(n,p)
-          u = exprun-BetaGamma # u is mu of Gumbel (mean=mu+gamma*beta); matching of moments
-        pval = 1 - scipy.exp(scipy.stats.gumbel_r.logcdf(r,u,B))
+          u = exprun-BetaGamma # u is mu of Gumbel (mean=mu+gamma*beta); matching of moments 
+          #https://github.blog/2020-12-15-token-authentication-requirements-for-git-operations/
+        pval = 1 - numpy.exp(scipy.stats.gumbel_r.logcdf(r,u,B))
         if pval < 0.05: return(1)
         else: return(0)
 
@@ -519,7 +540,7 @@ class GumbelMethod(base.SingleConditionMethod):
         for i in range(len(N)): # estimate more accurately based on expected run len, using exact calc for small genes
           if N[i]<EXACT: mu[i] = self.ExpectedRuns_cached(int(N[i]),p)-BetaGamma
         sigma = 1.0/math.log(1.0/p);
-        h0 = ((scipy.exp(scipy.stats.gumbel_r.logpdf(R,mu,sigma))) * scipy.stats.norm.pdf(S, mu_s*R, sigma_s)  * (1-w1))
+        h0 = ((numpy.exp(scipy.stats.gumbel_r.logpdf(R,mu,sigma))) * scipy.stats.norm.pdf(S, mu_s*R, sigma_s)  * (1-w1))
         h1 = SIG * w1
         h1 += 1e-10; h0 += 1e-10 # to prevent div-by-zero; if neither class is probable, p(z1) should be ~0.5
         p_z1 = h1/(h0+h1)


=====================================
src/pytransit/analysis/zinb.py
=====================================
@@ -48,9 +48,9 @@ class ZinbMethod(base.MultiConditionMethod):
     """
     Zinb
     """
-    def __init__(self, combined_wig, metadata, annotation, normalization, output_file, ignored_conditions=[], included_conditions=[], winz=False, nterm=5.0, cterm=5.0, condition="Condition", covars=[], interactions = [], PC=1, refs=[]):
+    def __init__(self, combined_wig, metadata, annotation, normalization, output_file, excluded_conditions=[], included_conditions=[], winz=False, nterm=5.0, cterm=5.0, condition="Condition", covars=[], interactions = [], PC=1, refs=[]):
         base.MultiConditionMethod.__init__(self, short_name, long_name, short_desc, long_desc, combined_wig, metadata, annotation, output_file,
-                normalization=normalization, ignored_conditions=ignored_conditions, included_conditions=included_conditions, nterm=nterm, cterm=cterm)
+                normalization=normalization, excluded_conditions=excluded_conditions, included_conditions=included_conditions, nterm=nterm, cterm=cterm)
         self.winz = winz
         self.covars = covars
         self.interactions = interactions
@@ -96,18 +96,18 @@ class ZinbMethod(base.MultiConditionMethod):
         refs = kwargs.get("-ref",[]) # list of condition names to use a reference for calculating LFCs
         if refs!=[]: refs = refs.split(',')
         winz = True if "w" in kwargs else False
-        ignored_conditions = list(filter(None, kwargs.get("-ignore-conditions", "").split(",")))
+        excluded_conditions = list(filter(None, kwargs.get("-exclude-conditions", "").split(",")))
         included_conditions = list(filter(None, kwargs.get("-include-conditions", "").split(",")))
 
         # check for unrecognized flags
-        flags = "-n --ignore-conditions --include-conditions -iN -iC -PC --condition --covars --interactions --gene --ref".split()
+        flags = "-n --exclude-conditions --include-conditions -iN -iC -PC --condition --covars --interactions --gene --ref".split()
         for arg in rawargs:
           if arg[0]=='-' and arg not in flags:
             self.transit_error("flag unrecognized: %s" % arg)
             print(ZinbMethod.usage_string())
             sys.exit(0)
 
-        return self(combined_wig, metadata, annotation, normalization, output_file, ignored_conditions, included_conditions, winz, NTerminus, CTerminus, condition, covars, interactions, PC, refs)
+        return self(combined_wig, metadata, annotation, normalization, output_file, excluded_conditions, included_conditions, winz, NTerminus, CTerminus, condition, covars, interactions, PC, refs)
 
     def wigs_to_conditions(self, conditionsByFile, filenamesInCombWig):
         """
@@ -523,6 +523,8 @@ class ZinbMethod(base.MultiConditionMethod):
 
         condition_name = self.condition
         # if a covar is not found, this crashes; check for it?
+        # read it first with no condition specified, to get original Condition names
+        conditionsByFile1, covariatesByFileList1, interactionsByFileList1, orderingMetadata1 = tnseq_tools.read_samples_metadata(self.metadata, self.covars, self.interactions) # without specifiying condition
         conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata(self.metadata, self.covars, self.interactions, condition_name=condition_name)
 
         ## [Condition] in the order of files in combined wig
@@ -538,19 +540,37 @@ class ZinbMethod(base.MultiConditionMethod):
             interactionsByFileList,
             filenamesInCombWig)
 
-        conditionsList = self.select_conditions(conditions,self.included_conditions,self.ignored_conditions,orderingMetadata)
-        data, conditions, covariates, interactions = self.filter_wigs_by_conditions2(
+        # this was the old way to filter samples, where --include/exclude-conditions depended on which --condition was specified
+        #conditionsList = self.select_conditions(conditions,self.included_conditions,self.excluded_conditions,orderingMetadata)
+        #data, conditions, covariates, interactions = self.filter_wigs_by_conditions2(
+        #        data,
+        #        conditions,
+        #        conditionsList,
+        #        covariates = covariates,
+        #        interactions = interactions)
+        #filesByCondition  = self.invertDict(conditionsByFile)
+        #samples_used = set()
+        #for cond in conditionsList: samples_used.update(filesByCondition[cond])
+
+        metadata = self.get_samples_metadata()
+        conditionNames = metadata['Condition'] # original Condition names for each sample, as ordered list        
+        fileNames = metadata['Filename'] 
+
+        # this is the new way to filter samples, where --include/exclude-conditions refers to Condition column in metadata, regardless of whether --condition was specified
+        data, fileNames, conditionNames, conditions, covariates, interactions = self.filter_wigs_by_conditions3(
                 data,
-                conditions,
-                conditionsList,
+                fileNames,
+                conditionNames, # original Condition column in samples metadata file
+                self.included_conditions,
+                self.excluded_conditions,
+                conditions = conditions, # user might have specified a column other than Condition
                 covariates = covariates,
                 interactions = interactions)
-
+        conditionsList = list(set(conditions)) # unique condition labels, filtered
+        samples_used = set(fileNames)
+        
         # show the samples associated with each condition (and covariates or interactions, if defined), and count samples in each cross-product of vars
 
-        filesByCondition  = self.invertDict(conditionsByFile)
-        samples_used = set()
-        for cond in conditionsList: samples_used.update(filesByCondition[cond])
         vars = [condition_name]+self.covars+self.interactions
         vars2vals = {}
         vars2vals[condition_name] = list(set(conditions))
@@ -558,7 +578,7 @@ class ZinbMethod(base.MultiConditionMethod):
         for i,var in enumerate(self.interactions): vars2vals[var] = list(set(interactions[i]))
         varsByFileList = [conditionsByFile]+covariatesByFileList+interactionsByFileList
         for i,var in enumerate(vars):
-          print("\nCondition/Covariate/Interaction: %s" % vars[i])
+          print("\nsamples for Condition/Covariate/Interaction: %s" % vars[i])
           filesByVar = self.invertDict(varsByFileList[i])
           for k,v in filesByVar.items():
             samples = list(samples_used.intersection(set(v)))
@@ -584,7 +604,11 @@ class ZinbMethod(base.MultiConditionMethod):
             c1, i1 = (ic1[0], ic1[1]) if len(ic1) > 1 else (ic1[0], None)
             c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None)
 
-            if len(self.included_conditions) > 0:
+            # use --include-conditions to determine order of columns in output file
+            # this only works if an alternative --condition was not specified
+            # otherwise don't try to order them this way because it gets too complicated
+            # possibly should require --covars and --interactions to be unspecified too
+            if self.condition=="Condition" and len(self.included_conditions) > 0 and len(self.excluded_conditions)==0:
                 condDiff = (self.included_conditions.index(c1) - self.included_conditions.index(c2))
                 ## Order by interaction, if stat belongs to same condition
                 if condDiff == 0 and i1 is not None and i2 is not None:
@@ -640,8 +664,8 @@ class ZinbMethod(base.MultiConditionMethod):
 
         Optional Arguments:
         -n <string>         :=  Normalization method. Default: -n TTR
-        --ignore-conditions <cond1,cond2> :=  Comma separated list of conditions to ignore, for the analysis.
-        --include-conditions <cond1,cond2> :=  Comma separated list of conditions to include, for the analysis. Conditions not in this list, will be ignored.
+        --exclude-conditions <cond1,cond2> :=  Comma separated list of conditions to exclude, for the analysis.
+        --include-conditions <cond1,cond2> :=  Comma separated list of conditions to include, for the analysis. Conditions not in this list, will be excluded.
         --ref <cond> := which condition(s) to use as a reference for calculating LFCs (comma-separated if multiple conditions)
         -iN <float>     :=  Ignore TAs occuring within given percentage (as integer) of the N terminus. Default: -iN 5
         -iC <float>     :=  Ignore TAs occuring within given percentage (as integer) of the C terminus. Default: -iC 5


=====================================
src/pytransit/data/samples_metadata_cg_covar.txt
=====================================
@@ -1,6 +1,6 @@
-Id	NewConditionCol	Filename	Batch
-c1	Cholesterol	src/pytransit/data/cholesterol_H37Rv_rep1.wig	b1
-c2	Cholesterol	src/pytransit/data/cholesterol_H37Rv_rep2.wig	b1
-c3	Cholesterol	src/pytransit/data/cholesterol_H37Rv_rep3.wig	b2
-g1	Glycerol	src/pytransit/data/glycerol_H37Rv_rep1.wig	b2
-g2	Glycerol	src/pytransit/data/glycerol_H37Rv_rep2.wig	b2
+Id	Condition	Filename	Batch	NewConditionCol
+c1	Cholesterol	src/pytransit/data/cholesterol_H37Rv_rep1.wig	b1	X
+c2	Cholesterol	src/pytransit/data/cholesterol_H37Rv_rep2.wig	b1	X
+c3	Cholesterol	src/pytransit/data/cholesterol_H37Rv_rep3.wig	b2	X
+g1	Glycerol	src/pytransit/data/glycerol_H37Rv_rep1.wig	b2	Y
+g2	Glycerol	src/pytransit/data/glycerol_H37Rv_rep2.wig	b2	Y


=====================================
src/pytransit/doc/source/transit_methods.rst
=====================================
@@ -32,18 +32,69 @@ probability of this using a Bayesian model.
 How does it work?
 -----------------
 
-| For a formal description of how this method works, see our paper [DeJesus2013]_:
+| 
 
-|  DeJesus, M.A., Zhang, Y.J., Sassettti, C.M., Rubin, E.J.,
-  Sacchettini, J.C., and Ioerger, T.R. (2013).
-| `Bayesian analysis of gene essentiality based on sequencing of transposon insertion libraries. <http://www.ncbi.nlm.nih.gov/pubmed/23361328>`_ *Bioinformatics*, 29(6):695-703.
+This method for identifying essential genes is based on analyzing
+'gaps', or consecutive sequences of TA sites lacking insertions.
+The statistical significance of the length of a gap is determined
+using the Gumbel distribution, which is a form of an Extreme-Value distribution.
 
-Example
--------
+For a formal description of how this method works, see our paper [DeJesus2013]_:
+
+| 
+  DeJesus, M.A., Zhang, Y.J., Sassettti, C.M., Rubin, E.J.,
+  Sacchettini, J.C., and Ioerger, T.R. (2013).  `Bayesian analysis of
+  gene essentiality based on sequencing of transposon insertion
+  libraries. <http://www.ncbi.nlm.nih.gov/pubmed/23361328>`_
+  *Bioinformatics*, 29(6):695-703.
+
+|
+**Update (2021) - Binomial** 
+
+Since the Gumbel method depends on the overall
+saturation (percent of TA sites with insertions), it can sometimes
+call a lot of smaller genes 'Uncertain'.  This might reduce the total
+number of essentials detected.  For example, if the saturation is
+~30%, no genes with fewer than ~10 TA sites might confidently be
+labeled as Essential.  In particular, there are often genes with no
+insertions that look like they should obviously be called essential,
+and yet, they are too short to be confidently called essential in
+low-saturation datasets by the conservative Gumbel model.
+
+To compensate for this, we have added a simple **Binomial** model for
+detecting small genes totally lacking insertions (described in `(Choudhery et al, 2021)
+<https://www.biorxiv.org/content/10.1101/2021.07.01.450749v2>`_).  In
+the output file, they are labled as 'EB' to distinguish them from
+essentials (E) based on the Gumbel model.  The EB genes supplement
+genes marked E by Gumbel, and the combination of the 2 groups (E and
+EB) should be considered as 'essentials'.  The number of genes in
+each category is reported in a table in the header of the output file, like this:
 
 ::
 
- python3 transit.py gumbel <comma-separated .wig files> <annotation .prot_table or GFF3> <output file> [Optional Arguments]
+ (for an M. tuberculosis TnSeq dataset with 60% saturation:)
+ #Summary of Essentiality Calls:
+ #  E  =  551 (essential based on Gumbel)
+ #  EB =   94 (essential based on Binomial)
+ #  NE = 2829 (non-essential)
+ #  U  =  262 (uncertain)
+ #  S  =  254 (too short)
+
+ (for an M. tuberculosis TnSeq dataset with 40% saturation:)
+ #  E  =  194 (essential based on Gumbel)
+ #  EB =  315 (essential based on Binomial)
+ #  NE = 2441 (non-essential)
+ #  U  =  774 (uncertain)
+ #  S  =  266 (too short)
+
+
+
+Usage
+-----
+
+::
+
+  > python3 transit.py gumbel <comma-separated .wig files> <annotation .prot_table or GFF3> <output file> [Optional Arguments]
         Optional Arguments:
         -s <integer>    :=  Number of samples. Default: -s 10000
         -b <integer>    :=  Number of Burn-in samples. Default -b 500
@@ -1053,7 +1104,7 @@ Example
   python3 transit.py anova <combined wig file> <samples_metadata file> <annotation .prot_table> <output file> [Optional Arguments]
         Optional Arguments:
         -n <string>         :=  Normalization method. Default: -n TTR
-        --ignore-conditions <cond1,...> :=  Comma separated list of conditions to ignore, for the analysis. Default: None
+        --exclude-conditions <cond1,...> :=  Comma separated list of conditions to ignore, for the analysis. Default: None
         --include-conditions <cond1,...> :=  Comma separated list of conditions to include, for the analysis. Default: All
         --ref <cond> := which condition(s) to use as a reference for calculating LFCs (comma-separated if multiple conditions)
         -iN <float> :=  Ignore TAs occurring within given percentage (as integer) of the N terminus. Default: -iN 0
@@ -1092,7 +1143,7 @@ The following parameters are available for the ANOVA method:
 
 -  **\-\-include-conditions:** Includes the given set of conditions from the ZINB test. Conditions not in this list are ignored. Note: this is useful for specifying the order in which the columns are listed in the output file.
 
--  **\-\-ignore-conditions:** Can use this to drop conditions not of interest.
+-  **\-\-exclude-conditions:** Can use this to drop conditions not of interest.
 
 -  **\-\-ref:** Specify which condition to use as a reference for computing LFCs.
    By default, LFCs for each gene in each condition are calculated with respect
@@ -1218,8 +1269,8 @@ Example
   python3 transit.py zinb <combined wig file> <samples_metadata file> <annotation .prot_table> <output file> [Optional Arguments]
         Optional Arguments:
         -n <string>         :=  Normalization method. Default: -n TTR
-        --ignore-conditions <cond1,cond2> :=  Comma separated list of conditions to ignore, for the analysis. Default: None
-        --include-conditions <cond1,cond2> :=  Comma separated list of conditions to include, for the analysis. Default: All
+        --exclude-conditions <cond1,...> :=  Comma separated list of conditions to ignore, for the analysis. Default: None
+        --include-conditions <cond1,...> :=  Comma separated list of conditions to include, for the analysis. Default: All
         --ref <cond> := which condition(s) to use as a reference for calculating LFCs (comma-separated if more than one)
         -iN <float>     :=  Ignore TAs occuring within given percentage of the N terminus. Default: -iN 5
         -iC <float>     :=  Ignore TAs occuring within given percentage of the C terminus. Default: -iC 5
@@ -1281,10 +1332,10 @@ of the combined_wig (including pathnames, if present).
 Parameters
 ----------
 
-The following parameters are available for the method:
+The following parameters are available for the ZINB method:
 
 -  **\-\-include-conditions:** Includes the given set of conditions from the ZINB test. Conditions not in this list are ignored. Note: this is useful for specifying the order in which the columns are listed in the output file.
--  **\-\-ignore-conditions:** Ignores the given set of conditions from the ZINB test.
+-  **\-\-exclude-conditions:** Ignores the given set of conditions from the ZINB test.
 -  **\-\-ref:** which condition to use as a reference when computing LFCs in the output file
 -  **Normalization Method:** Determines which normalization method to
    use when comparing datasets. Proper normalization is important as it


=====================================
src/pytransit/stat_tools.py
=====================================
@@ -512,7 +512,7 @@ def F_shuffle_dict_libraries(*args, **kwargs):
         perm = numpy.random.permutation(combined)
         #print("perm", perm)
         #print("perm[:n1]", perm[:n1])
-        E[L] = numpy.array([perm[:n1], perm[n1:]])
+        E[L] = numpy.array([perm[:n1], perm[n1:]],dtype=object)
         #print("D[L]", D[L])
     return E
 
@@ -566,7 +566,8 @@ def resampling(data1, data2, S=10000, testFunc=F_mean_diff_flat,
     # - Check library strings match in some way
     lib_diff = set(lib_str1) ^ set(lib_str2)
     if lib_diff:
-        raise ValueError("At least one library string has a letter not used by the other:\ %s" % ", ".join(lib_diff))
+        #raise ValueError("At least one library string has a letter not used by the other:\ %s" % ", ".join(lib_diff))
+        raise ValueError("At least one library string has a letter not used by the other: " + ", ".join(lib_diff))
 
     # - Check input has some data
     assert len(data1) > 0, "Data1 cannot be empty"
@@ -714,7 +715,7 @@ def combine_lib_dicts(L1, L2):
     KEYS = L1.keys()
     DATA = {}
     for K in KEYS:
-        DATA[K] = numpy.array([L1[K], L2[K]])
+        DATA[K] = numpy.array([L1[K], L2[K]],dtype=object)
     return DATA
 
 #


=====================================
src/pytransit/tnseq_tools.py
=====================================
@@ -80,6 +80,9 @@ def read_samples_metadata(metadata_file, covarsToRead = [], interactionsToRead =
     interactionsByFileList = [{} for i in range(len(interactionsToRead))]
     headersToRead = [condition_name.lower(), "filename"]
     orderingMetadata = { 'condition': [], 'interaction': [] }
+    print(headersToRead)
+    print(covarsToRead)
+    print(interactionsToRead)
     with open(metadata_file) as mfile:
         lines = mfile.readlines()
         headIndexes = [i


=====================================
tests/test_tpp.py
=====================================
@@ -35,7 +35,7 @@ NOFLAG_PRIMER = [
         "# FR_corr (Fwd templates vs. Rev templates): 0.019",
         "# transposon type: Himar1",
         "# protocol type: Sassetti",
-        "# primer_matches: 8 reads (0.8%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)"
+        "#  primer_matches: 8 reads (0.8%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)"
         ]
 
 MME1_PROTOCOL = [
@@ -46,7 +46,7 @@ MME1_PROTOCOL = [
         "# NZ_mean (among templates): 1.0",
         "# transposon type: Himar1",
         "# protocol type: Mme1",
-        "# primer_matches: 8 reads (0.8%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)"
+        "#  primer_matches: 8 reads (0.8%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)"
         ]
 
 FLAG_PRIMER = [



View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/commit/fac2650606c9600626d84ca58087250253aed295

-- 
View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/commit/fac2650606c9600626d84ca58087250253aed295
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20211020/786882e3/attachment-0001.htm>


More information about the debian-med-commit mailing list