[med-svn] [Git][med-team/tnseq-transit][upstream] New upstream version 3.2.3
Nilesh Patra (@nilesh)
gitlab at salsa.debian.org
Wed Oct 20 21:32:04 BST 2021
Nilesh Patra pushed to branch upstream at Debian Med / tnseq-transit
Commits:
fac26506 by Nilesh Patra at 2021-10-21T01:42:57+05:30
New upstream version 3.2.3
- - - - -
13 changed files:
- − .travis.yml
- CHANGELOG.md
- src/pytpp/tpp_tools.py
- src/pytransit/__init__.py
- src/pytransit/analysis/anova.py
- src/pytransit/analysis/base.py
- src/pytransit/analysis/gumbel.py
- src/pytransit/analysis/zinb.py
- src/pytransit/data/samples_metadata_cg_covar.txt
- src/pytransit/doc/source/transit_methods.rst
- src/pytransit/stat_tools.py
- src/pytransit/tnseq_tools.py
- tests/test_tpp.py
Changes:
=====================================
.travis.yml deleted
=====================================
@@ -1,16 +0,0 @@
-language: python
-
-services:
- - docker
-
-before_install:
- - docker build -t transit .
-
-DISPLAY: 0.0
-notifications:
- email:
- on_success: change # default: change
- on_failure: change # default: always
-
-script: travis_wait 20 docker run -t transit
-
=====================================
CHANGELOG.md
=====================================
@@ -2,8 +2,15 @@
All notable changes to this project will be documented in this file.
-
-## Version 3.2.2 2022-09-08
+## Version 3.2.3 2021-10-16
+#### TRANSIT:
+ - added Binomial essentials (EB) to Gumbel analysis (supplementing genes categorized as E), to help with low-saturation datasets
+ - modified ANOVA and ZINB so that --include-conditions and --exclude-conditions refer to the original Condition column in the samples metadata file (instead of whatever column is specified by --condition)
+
+#### TPP:
+ - improved metrics reported in *.tn_stats by TPP, to help diagnose why reads don't map
+
+## Version 3.2.2 2021-09-08
#### TRANSIT:
- fixed bug in converting gff_to_prot_table
- fixed bug in tn5gaps (fixes some false negative calls)
=====================================
src/pytpp/tpp_tools.py
=====================================
@@ -310,7 +310,7 @@ def extract_staggered(infile,outfile,vars):
output_failed.close() # [WM] [add]
if vars.tot_tgtta == 0:
raise ValueError("Error: Input files did not contain any reads matching prefix sequence with %d mismatches" % vars.mm1)
-
+ vars.tot_reads = tot
def message(s):
#print("[tn_preprocess]",s)
@@ -511,7 +511,7 @@ def template_counts(ref,sam,bcfile,vars):
genome = read_genome(ref, replicon_index)
for i in range(len(genome)-1):
#if genome[i:i+2].upper()=="TA":
- if vars.transposon=="Himar1" and genome[i:i+2].upper()!="TA": continue
+ if vars.transposon=="Himar1" and genome[i:i+2].upper()!="TA": continue
else:
pos = i+1
h = hits[replicon_names[replicon_index]].get(pos,[])
@@ -1100,20 +1100,7 @@ def generate_output(vars):
FR_corr.append(cur_FR_corr)
BC_corr.append(cur_BC_corr)
- primer = "CTAGAGGGCCCAATTCGCCCTATAGTGAGT"
- vector = "CTAGACCGTCCAGTCTGGCAGGCCGGAAAC"
- adapter = "GATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
- Himar1 = "ACTTATCAGCCAACCTGTTA"
- tot_reads,nprimer,nvector,nadapter,misprimed = 0,0,0,0,0
- for line in open(vars.reads1):
- if line[0]=='>': tot_reads += 1; continue
- if primer in line: nprimer += 1
- if vector in line: nvector += 1
- if adapter in line: nadapter += 1
- #if "TGTTA" in line and Himar1 not in line: misprimed += 1
- # basically, these should correspond to insertions at non-TA sites (so the terminal TA of ...TGTTA will be different)
- if Himar1[:-5] in line and Himar1 not in line: misprimed += 1
-
+ tot_reads = vars.tot_reads
read_length = get_read_length(vars.base + ".reads1")
mean_r1_genomic = get_genomic_portion(vars.base + ".trimmed1")
if vars.single_end==False: mean_r2_genomic = get_genomic_portion(vars.base + ".genomic2")
@@ -1133,7 +1120,7 @@ def generate_output(vars):
output.write('# ref_genome: %s\n' % vars.ref)
output.write('# replicon_ids: %s\n' % ','.join(vars.replicon_ids))
output.write("# total_reads (or read pairs): %s\n" % tot_reads)
- #output.write("# truncated_reads %s (fragments shorter than the read length; ADAP2 appears in read1)\n" % vars.truncated_reads)
+ output.write("# truncated_reads %s (genomic inserts shorter than the read length; ADAP2 appears in read1)\n" % vars.truncated_reads)
output.write("# trimmed_reads (reads with valid Tn prefix, and insert size>20bp): %s\n" % vars.tot_tgtta)
output.write("# reads1_mapped: %s\n" % vars.r1)
output.write("# reads2_mapped: %s\n" % vars.r2)
@@ -1187,10 +1174,30 @@ def generate_output(vars):
output.write("# FR_corr (Fwd templates vs. Rev templates): %0.3f\n" % FR_corr[0])
output.write("# BC_corr (reads vs. templates, summed over both strands): %0.3f\n" % BC_corr[0])
- output.write("# primer_matches: %s reads (%0.1f%%) contain %s (Himar1)\n" % (nprimer,nprimer*100/float(tot_reads),primer))
- output.write("# vector_matches: %s reads (%0.1f%%) contain %s (phiMycoMarT7)\n" % (nvector,nvector*100/float(tot_reads),vector))
- output.write("# adapter_matches: %s reads (%0.1f%%) contain %s (Illumina/TruSeq index)\n" % (nadapter,nadapter*100/float(tot_reads),adapter))
- output.write("# misprimed_reads: %s reads (%0.1f%%) contain Himar1 prefix but don't end in TGTTA\n" % (misprimed,misprimed*100/float(tot_reads)))
+ primer = "CTAGAGGGCCCAATTCGCCCTATAGTGAGT"
+ vector = "CTAGACCGTCCAGTCTGGCAGGCCGGAAAC"
+ adapter = "GATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
+ ADAPTER2 = "TACCACGACCA" # rc of const2 region of R2, between barcode and genomic; these reads will be truncated here
+ Himar1 = "ACTTATCAGCCAACCTGTTA"
+ trimmed_reads,nprimer,nvector,nadapter,misprimed,ntruncated = 0,0,0,0,0,0
+ for line in open(vars.trimmed1):
+ if line[0]=='>': trimmed_reads += 1; continue
+ if primer in line: nprimer += 1
+ if vector in line: nvector += 1
+ if adapter in line: nadapter += 1
+ #if "TGTTA" in line and Himar1 not in line: misprimed += 1
+ # basically, these should correspond to insertions at non-TA sites (so the terminal TA of ...TGTTA will be different)
+ if Himar1[:-5] in line and Himar1 not in line: misprimed += 1
+
+ output.write("# Break-down of total reads (%s):\n" % tot_reads)
+ output.write("# %s reads (%0.1f%%) lack the expected Tn prefix\n" % (tot_reads-vars.tot_tgtta,(tot_reads-vars.tot_tgtta)*100/float(tot_reads)))
+
+ output.write("# Break-down of trimmed reads with valid Tn prefix (%s):\n" % trimmed_reads)
+ output.write("# primer_matches: %s reads (%0.1f%%) contain %s (Himar1)\n" % (nprimer,nprimer*100/float(trimmed_reads),primer))
+ output.write("# vector_matches: %s reads (%0.1f%%) contain %s (phiMycoMarT7)\n" % (nvector,nvector*100/float(trimmed_reads),vector))
+ output.write("# adapter_matches: %s reads (%0.1f%%) contain %s (Illumina/TruSeq index)\n" % (nadapter,nadapter*100/float(trimmed_reads),adapter))
+ output.write("# misprimed_reads: %s reads (%0.1f%%) contain Himar1 prefix but don't end in TGTTA\n" % (misprimed,misprimed*100/float(trimmed_reads)))
+
output.write("# read_length: %s bp\n" % read_length)
output.write("# mean_R1_genomic_length: %0.1f bp\n" % mean_r1_genomic)
if vars.single_end==False: output.write("# mean_R2_genomic_length: %0.1f bp\n" % mean_r2_genomic)
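
The relocated classification pass above now computes its percentages over trimmed reads (reads with a valid Tn prefix) rather than all reads. For reference, a minimal standalone sketch of the same logic, assuming a FASTA file of trimmed reads (the input path here is hypothetical):

    primer  = "CTAGAGGGCCCAATTCGCCCTATAGTGAGT"     # Himar1 primer
    vector  = "CTAGACCGTCCAGTCTGGCAGGCCGGAAAC"     # phiMycoMarT7 vector
    adapter = "GATCGGAAGAGCACACGTCTGAACTCCAGTCAC"  # Illumina/TruSeq index
    Himar1  = "ACTTATCAGCCAACCTGTTA"

    trimmed_reads = nprimer = nvector = nadapter = misprimed = 0
    with open("example.trimmed1") as fasta:        # hypothetical input; assumes >=1 read
        for line in fasta:
            if line[0] == '>':                     # FASTA header line -> one read
                trimmed_reads += 1
                continue
            if primer in line:  nprimer  += 1
            if vector in line:  nvector  += 1
            if adapter in line: nadapter += 1
            # Himar1 prefix present but not ending in TGTTA: mispriming,
            # i.e. an apparent insertion at a non-TA site
            if Himar1[:-5] in line and Himar1 not in line:
                misprimed += 1

    for label, n in [("primer_matches", nprimer), ("vector_matches", nvector),
                     ("adapter_matches", nadapter), ("misprimed_reads", misprimed)]:
        print("# %s: %s reads (%0.1f%%)" % (label, n, n * 100 / float(trimmed_reads)))
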
=====================================
src/pytransit/__init__.py
=====================================
@@ -2,6 +2,6 @@
__all__ = ["transit_tools", "tnseq_tools", "norm_tools", "stat_tools"]
-__version__ = "v3.2.2"
+__version__ = "v3.2.3"
prefix = "[TRANSIT]"
=====================================
src/pytransit/analysis/anova.py
=====================================
@@ -34,9 +34,9 @@ class AnovaMethod(base.MultiConditionMethod):
"""
anova
"""
- def __init__(self, combined_wig, metadata, annotation, normalization, output_file, ignored_conditions=[], included_conditions=[], nterm=0.0, cterm=0.0, PC=1, refs=[]):
+ def __init__(self, combined_wig, metadata, annotation, normalization, output_file, excluded_conditions=[], included_conditions=[], nterm=0.0, cterm=0.0, PC=1, refs=[]):
base.MultiConditionMethod.__init__(self, short_name, long_name, short_desc, long_desc, combined_wig, metadata, annotation, output_file,
- normalization=normalization, ignored_conditions=ignored_conditions, included_conditions=included_conditions, nterm=nterm, cterm=cterm)
+ normalization=normalization, excluded_conditions=excluded_conditions, included_conditions=included_conditions, nterm=nterm, cterm=cterm)
self.PC = PC
self.refs = refs
@@ -61,18 +61,18 @@ class AnovaMethod(base.MultiConditionMethod):
PC = int(kwargs.get("PC", 5))
refs = kwargs.get("-ref",[]) # list of condition names to use a reference for calculating LFCs
if refs!=[]: refs = refs.split(',')
- ignored_conditions = list(filter(None, kwargs.get("-ignore-conditions", "").split(",")))
+ excluded_conditions = list(filter(None, kwargs.get("-exclude-conditions", "").split(",")))
included_conditions = list(filter(None, kwargs.get("-include-conditions", "").split(",")))
# check for unrecognized flags
- flags = "-n --ignore-conditions --include-conditions -iN -iC -PC --ref".split()
+ flags = "-n --exclude-conditions --include-conditions -iN -iC -PC --ref".split()
for arg in rawargs:
if arg[0]=='-' and arg not in flags:
self.transit_error("flag unrecognized: %s" % arg)
print(AnovaMethod.usage_string())
sys.exit(0)
- return self(combined_wig, metadata, annotation, normalization, output_file, ignored_conditions, included_conditions, NTerminus, CTerminus, PC, refs)
+ return self(combined_wig, metadata, annotation, normalization, output_file, excluded_conditions, included_conditions, NTerminus, CTerminus, PC, refs)
def wigs_to_conditions(self, conditionsByFile, filenamesInCombWig):
"""
@@ -195,8 +195,20 @@ class AnovaMethod(base.MultiConditionMethod):
conditionsByFile,
filenamesInCombWig)
- conditionsList = self.select_conditions(conditions,self.included_conditions,self.ignored_conditions,orderingMetadata)
- data, conditions, _, _ = self.filter_wigs_by_conditions2(data, conditions, conditionsList)
+ conditionsList = self.select_conditions(conditions,self.included_conditions,self.excluded_conditions,orderingMetadata)
+ #data, conditions, _, _ = self.filter_wigs_by_conditions2(data, conditions, conditionsList)
+
+ metadata = self.get_samples_metadata()
+ conditionNames = metadata['Condition'] # original Condition names for each sample, as ordered list
+ fileNames = metadata['Filename']
+
+ data, fileNames, conditionNames, conditions, _, _ = self.filter_wigs_by_conditions3(
+ data,
+ fileNames,
+ conditionNames, # original Condition column in samples metadata file
+ self.included_conditions,
+ self.excluded_conditions,
+ conditions = conditionNames) # this is kind of redundant for ANOVA, but it is here because condition, covars, and interactions could have been manipulated for ZINB
genes = tnseq_tools.read_genes(self.annotation_path)
@@ -238,7 +250,7 @@ class AnovaMethod(base.MultiConditionMethod):
Optional Arguments:
-n <string> := Normalization method. Default: -n TTR
--include-conditions <cond1,...> := Comma-separated list of conditions to use for analysis (Default: all)
- --ignore-conditions <cond1,...> := Comma-separated list of conditions to ignore (Default: none)
+ --exclude-conditions <cond1,...> := Comma-separated list of conditions to exclude (Default: none)
--ref <cond> := which condition(s) to use as a reference for calculating LFCs (comma-separated if multiple conditions)
-iN <N> := Ignore TAs within given percentage (e.g. 5) of N terminus. Default: -iN 0
-iC <N> := Ignore TAs within given percentage (e.g. 5) of C terminus. Default: -iC 0
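
For context, select_conditions() (defined in base.py, below) picks which conditions are analyzed and in what order: an explicit --include-conditions list wins and also fixes the ordering; otherwise conditions are taken in metadata-file order, minus any excluded ones. A minimal sketch with illustrative condition names:

    def select_conditions(included, excluded, metadata_conditions):
        # included/excluded come from --include-conditions / --exclude-conditions;
        # metadata_conditions is the Condition column, one entry per sample
        if included:
            return list(included)          # include list also fixes the ordering
        ordered = []
        for c in metadata_conditions:      # first-occurrence order in the metadata file
            if c not in ordered:
                ordered.append(c)
        return [c for c in ordered if c not in excluded]

    conds = ["Cholesterol", "Cholesterol", "Glycerol", "Glycerol"]
    print(select_conditions([], ["Glycerol"], conds))                 # ['Cholesterol']
    print(select_conditions(["Glycerol", "Cholesterol"], [], conds))  # ['Glycerol', 'Cholesterol']
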
=====================================
src/pytransit/analysis/base.py
=====================================
@@ -509,7 +509,7 @@ class MultiConditionMethod(AnalysisMethod):
Class to be inherited by analysis methods that compare essentiality between multiple conditions (e.g Anova).
'''
- def __init__(self, short_name, long_name, short_desc, long_desc, combined_wig, metadata, annotation_path, output, normalization=None, LOESS=False, ignoreCodon=True, wxobj=None, ignored_conditions=[], included_conditions=[], nterm=0.0, cterm=0.0):
+ def __init__(self, short_name, long_name, short_desc, long_desc, combined_wig, metadata, annotation_path, output, normalization=None, LOESS=False, ignoreCodon=True, wxobj=None, excluded_conditions=[], included_conditions=[], nterm=0.0, cterm=0.0):
AnalysisMethod.__init__(self, short_name, long_name, short_desc, long_desc, output,
annotation_path, wxobj)
self.combined_wig = combined_wig
@@ -518,24 +518,24 @@ class MultiConditionMethod(AnalysisMethod):
self.NTerminus = nterm
self.CTerminus = cterm
self.unknown_cond_flag = "FLAG-UNMAPPED-CONDITION-IN-WIG"
- self.ignored_conditions = ignored_conditions
+ self.excluded_conditions = excluded_conditions
self.included_conditions = included_conditions
- def filter_wigs_by_conditions(self, data, conditions, covariates = [], interactions = [], ignored_conditions = [], included_conditions = []):
+ def filter_wigs_by_conditions(self, data, conditions, covariates = [], interactions = [], excluded_conditions = [], included_conditions = []):
"""
- Filters conditions that are ignored/included.
+ Filters conditions that are excluded/included.
([[Wigdata]], [Condition], [[Covar]], [Condition], [Condition]) -> Tuple([[Wigdata]], [Condition])
"""
- ignored_conditions, included_conditions = (set(ignored_conditions), set(included_conditions))
+ excluded_conditions, included_conditions = (set(excluded_conditions), set(included_conditions))
d_filtered, cond_filtered, filtered_indexes = [], [], [];
- if len(ignored_conditions) > 0 and len(included_conditions) > 0:
- self.transit_error("Both ignored and included conditions have len > 0")
+ if len(excluded_conditions) > 0 and len(included_conditions) > 0:
+ self.transit_error("Both excluded and included conditions have len > 0")
sys.exit(0)
- elif (len(ignored_conditions) > 0):
- self.transit_message("conditions ignored: {0}".format(ignored_conditions))
+ elif (len(excluded_conditions) > 0):
+ self.transit_message("conditions excluded: {0}".format(excluded_conditions))
for i, c in enumerate(conditions):
- if (c != self.unknown_cond_flag) and (c not in ignored_conditions):
+ if (c != self.unknown_cond_flag) and (c not in excluded_conditions):
d_filtered.append(data[i])
cond_filtered.append(conditions[i])
filtered_indexes.append(i)
@@ -565,19 +565,19 @@ class MultiConditionMethod(AnalysisMethod):
# input: conditions are per wig; orderingMetdata comes from tnseq_tools.read_samples_metadata()
# output: conditionsList is selected subset of conditions (unique, in preferred order)
- def select_conditions(self,conditions,included_conditions,ignored_conditions,orderingMetadata):
+ def select_conditions(self,conditions,included_conditions,excluded_conditions,orderingMetadata):
if len(included_conditions)>0: conditionsList = included_conditions
else:
conditionsList = []
for c in orderingMetadata['condition']: # the order conds appear in metadata file, duplicated for each sample
if c not in conditionsList: conditionsList.append(c)
- for c in ignored_conditions:
+ for c in excluded_conditions:
if c in conditionsList: conditionsList.remove(c)
return conditionsList
def filter_wigs_by_conditions2(self, data, conditions, conditionsList, covariates = [], interactions = []):
"""
- Filters conditions that are ignored/included.
+ Filters conditions that are excluded/included.
([[Wigdata]], [Condition], [[Covar]], [Condition], [Condition]) -> Tuple([[Wigdata]], [Condition])
"""
d_filtered, cond_filtered, filtered_indexes = [], [], [];
@@ -596,7 +596,48 @@ class MultiConditionMethod(AnalysisMethod):
numpy.array(covariates_filtered),
numpy.array(interactions_filtered))
-#
+ def filter_wigs_by_conditions3(self, data, fileNames, conditionNames, included_cond, excluded_cond, conditions, covariates = [], interactions = []):
+ """
+ Filters conditions that are excluded/included; also extract cond, covar, and interaction labels
+ conditionNames: based on original Conditions column in metadata
+ conditions: user might have specified an alternative column to analyze (list of labels parallel to wigs)
+ """
+ fileNames_filtered, condNames_filtered, d_filtered, cond_filtered, filtered_indexes = [], [], [], [], []
+
+ for i in range(len(data)):
+ if (len(included_cond)==0 or conditionNames[i] in included_cond) and conditionNames[i] not in excluded_cond:
+ d_filtered.append(data[i])
+ fileNames_filtered.append(fileNames[i])
+ condNames_filtered.append(conditionNames[i])
+ cond_filtered.append(conditions[i])
+ filtered_indexes.append(i)
+
+ covariates_filtered = [[c[i] for i in filtered_indexes] for c in covariates]
+ interactions_filtered = [[c[i] for i in filtered_indexes] for c in interactions]
+
+ return (numpy.array(d_filtered),
+ numpy.array(fileNames_filtered),
+ numpy.array(condNames_filtered),
+ numpy.array(cond_filtered),
+ numpy.array(covariates_filtered),
+ numpy.array(interactions_filtered))
+
+ # return a hash table of parallel lists, indexed by column header
+ def get_samples_metadata(self):
+ data = {}
+ header = None
+ for line in open(self.metadata):
+ if line[0]=='#': continue
+ w = line.rstrip().split('\t')
+ if header==None:
+ header = w
+ for col in header: data[col] = []
+ else:
+ for i in range(len(header)):
+ data[header[i]].append(w[i])
+ return data
+
+########################
class TransitAnalysis:
def __init__(self, sn, ln, short_desc, long_desc, tn, method_class=AnalysisMethod, gui_class=AnalysisGUI, filetypes=[TransitFile]):
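
The new get_samples_metadata() helper transposes the tab-separated metadata file into a dict of parallel column lists. A sketch of the shape it returns for the sample metadata file updated in this commit (src/pytransit/data/samples_metadata_cg_covar.txt, shown below):

    header = ["Id", "Condition", "Filename", "Batch", "NewConditionCol"]
    rows = [
        ("c1", "Cholesterol", "src/pytransit/data/cholesterol_H37Rv_rep1.wig", "b1", "X"),
        ("c2", "Cholesterol", "src/pytransit/data/cholesterol_H37Rv_rep2.wig", "b1", "X"),
        ("c3", "Cholesterol", "src/pytransit/data/cholesterol_H37Rv_rep3.wig", "b2", "X"),
        ("g1", "Glycerol",    "src/pytransit/data/glycerol_H37Rv_rep1.wig",    "b2", "Y"),
        ("g2", "Glycerol",    "src/pytransit/data/glycerol_H37Rv_rep2.wig",    "b2", "Y"),
    ]
    # get_samples_metadata() builds exactly this dict of parallel lists:
    data = {col: [row[i] for row in rows] for i, col in enumerate(header)}
    print(data["Condition"])    # ['Cholesterol', 'Cholesterol', 'Cholesterol', 'Glycerol', 'Glycerol']
    print(data["Filename"][3])  # 'src/pytransit/data/glycerol_H37Rv_rep1.wig'
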
=====================================
src/pytransit/analysis/gumbel.py
=====================================
@@ -333,6 +333,9 @@ class GumbelMethod(base.SingleConditionMethod):
self.transit_message("Getting Data")
(data, position) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
(K,N) = data.shape
+ merged = numpy.sum(data,axis=0)
+ nsites,nzeros = merged.shape[0],numpy.sum(merged==0) # perhaps I should say >minCount
+ sat = (nsites-nzeros)/float(nsites)
if self.normalization and self.normalization != "nonorm":
self.transit_message("Normalizing using: %s" % self.normalization)
@@ -412,13 +415,14 @@ class GumbelMethod(base.SingleConditionMethod):
phi_old = phi_new
#Update progress
- text = "Running Gumbel Method... %5.1f%%" % (100.0*(count+1)/(self.samples+self.burnin))
+ text = "Running Gumbel Method with Binomial Essentiality Calls... %5.1f%%" % (100.0*(count+1)/(self.samples+self.burnin))
self.progress_update(text, count)
ZBAR = numpy.apply_along_axis(numpy.mean, 1, Z_sample)
(ess_t, non_t) = stat_tools.bayesian_ess_thresholds(ZBAR)
-
+ binomial_n = math.log10(0.05)/math.log10(G.global_phi())
+
#Orf k n r s zbar
self.output.write("#Gumbel\n")
if self.wxobj:
@@ -432,16 +436,20 @@ class GumbelMethod(base.SingleConditionMethod):
self.output.write("#Data: %s\n" % (",".join(self.ctrldata).encode('utf-8')))
self.output.write("#Annotation path: %s\n" % self.annotation_path.encode('utf-8'))
- self.output.write("#FDR Corrected thresholds: %f, %f\n" % (ess_t, non_t))
- self.output.write("#MH Acceptance-Rate:\t%2.2f%%\n" % (100.0*acctot/count))
+ self.output.write("#Trimming of TAs near termini: N-term=%s, C-term=%s (fraction of ORF length)\n" % (self.NTerminus,self.CTerminus))
+ self.output.write("#Significance thresholds (FDR-corrected): Essential if Zbar>%f, Non-essential if Zbar<%f\n" % (ess_t, non_t))
+ self.output.write("#Metropolis-Hastings Acceptance-Rate:\t%2.2f%%\n" % (100.0*acctot/count))
self.output.write("#Total Iterations Performed:\t%d\n" % count)
self.output.write("#Sample Size:\t%d\n" % i)
- self.output.write("#phi estimate:\t%f\n" % numpy.average(phi_sample))
- self.output.write("#Time: %s\n" % (time.time() - start_time))
- self.output.write("#%s\n" % "\t".join(columns))
+ self.output.write("#Total number of TA sites: %s\n" % nsites)
+ self.output.write("#Genome-wide saturation: %s\n" % (round(sat,3))) # datasets merged
+ self.output.write("#phi estimate:\t%f (non-insertion probability in non-essential regions)\n" % numpy.average(phi_sample))
+ self.output.write("#Minimum number of TA sites with 0 insertions to be classified as essential by Binomial: \t%0.3f\n" % binomial_n)
+ self.output.write("#Time: %s s\n" % (round(time.time()-start_time,1)))
+
i = 0
- data = []
- for g in G:
+ data,calls = [],[]
+ for j,g in enumerate(G):
if not self.good_orf(g):
zbar = -1.0
else:
@@ -449,18 +457,30 @@ class GumbelMethod(base.SingleConditionMethod):
i+=1
if zbar > ess_t:
call = "E"
+ elif G.local_sites()[j]>binomial_n and G.local_thetas()[j]==0.0:
+ call = "EB"
elif non_t <= zbar <= ess_t:
call = "U"
elif 0 <= zbar < non_t:
call = "NE"
else:
call = "S"
+
data.append("%s\t%s\t%s\t%d\t%d\t%d\t%d\t%f\t%s\n" % (g.orf, g.name, g.desc, g.k, g.n, g.r, g.s, zbar, call))
+ calls.append(call)
data.sort()
+
+ self.output.write("#Summary of Essentiality Calls:\n")
+ self.output.write("# E = %4s (essential based on Gumbel)\n" % (calls.count("E")))
+ self.output.write("# EB = %4s (essential based on Binomial)\n" % (calls.count("EB")))
+ self.output.write("# NE = %4s (non-essential)\n" % (calls.count("NE")))
+ self.output.write("# U = %4s (uncertain)\n" % (calls.count("U")))
+ self.output.write("# S = %4s (too short)\n" % (calls.count("S")))
+
+ self.output.write("#%s\n" % "\t".join(columns))
for line in data:
self.output.write(line)
self.output.close()
-
self.transit_message("") # Printing empty line to flush stdout
self.transit_message("Adding File: %s" % (self.output.name))
self.add_file(filetype="Gumbel")
@@ -494,8 +514,9 @@ class GumbelMethod(base.SingleConditionMethod):
BetaGamma = B*tnseq_tools.getGamma()
if n<EXACT: # estimate more accurately based on expected run len, using exact calc for small genes
exprun = self.ExpectedRuns_cached(n,p)
- u = exprun-BetaGamma # u is mu of Gumbel (mean=mu+gamma*beta); matching of moments
- pval = 1 - scipy.exp(scipy.stats.gumbel_r.logcdf(r,u,B))
+ u = exprun-BetaGamma # u is mu of Gumbel (mean=mu+gamma*beta); matching of moments
+ #https://github.blog/2020-12-15-token-authentication-requirements-for-git-operations/
+ pval = 1 - numpy.exp(scipy.stats.gumbel_r.logcdf(r,u,B))
if pval < 0.05: return(1)
else: return(0)
@@ -519,7 +540,7 @@ class GumbelMethod(base.SingleConditionMethod):
for i in range(len(N)): # estimate more accurately based on expected run len, using exact calc for small genes
if N[i]<EXACT: mu[i] = self.ExpectedRuns_cached(int(N[i]),p)-BetaGamma
sigma = 1.0/math.log(1.0/p);
- h0 = ((scipy.exp(scipy.stats.gumbel_r.logpdf(R,mu,sigma))) * scipy.stats.norm.pdf(S, mu_s*R, sigma_s) * (1-w1))
+ h0 = ((numpy.exp(scipy.stats.gumbel_r.logpdf(R,mu,sigma))) * scipy.stats.norm.pdf(S, mu_s*R, sigma_s) * (1-w1))
h1 = SIG * w1
h1 += 1e-10; h0 += 1e-10 # to prevent div-by-zero; if neither class is probable, p(z1) should be ~0.5
p_z1 = h1/(h0+h1)
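
Two of the new quantities above have simple closed forms. Genome-wide saturation is the fraction of TA sites with at least one insertion after merging all replicates, and binomial_n is the smallest number of TA sites n at which an entirely empty gene is improbable under non-essentiality: phi^n < 0.05, i.e. n > log(0.05)/log(phi). A small sketch with illustrative values:

    import math
    import numpy

    # Genome-wide saturation over merged replicates (toy counts: 2 wigs x 4 TA sites)
    data = numpy.array([[0, 3, 0, 1],
                        [2, 0, 0, 4]])
    merged = numpy.sum(data, axis=0)
    sat = numpy.count_nonzero(merged) / float(merged.shape[0])
    print(sat)                          # 0.75

    # Binomial cutoff: a non-essential gene that is empty at all n of its TA
    # sites occurs with probability phi**n; require phi**n < 0.05
    def binomial_min_sites(phi, alpha=0.05):
        return math.log10(alpha) / math.log10(phi)

    print(binomial_min_sites(0.5))      # ~4.3: >=5 empty TA sites -> 'EB' at phi=0.5
    print(binomial_min_sites(0.7))      # ~8.4: lower saturation needs longer empty genes
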
=====================================
src/pytransit/analysis/zinb.py
=====================================
@@ -48,9 +48,9 @@ class ZinbMethod(base.MultiConditionMethod):
"""
Zinb
"""
- def __init__(self, combined_wig, metadata, annotation, normalization, output_file, ignored_conditions=[], included_conditions=[], winz=False, nterm=5.0, cterm=5.0, condition="Condition", covars=[], interactions = [], PC=1, refs=[]):
+ def __init__(self, combined_wig, metadata, annotation, normalization, output_file, excluded_conditions=[], included_conditions=[], winz=False, nterm=5.0, cterm=5.0, condition="Condition", covars=[], interactions = [], PC=1, refs=[]):
base.MultiConditionMethod.__init__(self, short_name, long_name, short_desc, long_desc, combined_wig, metadata, annotation, output_file,
- normalization=normalization, ignored_conditions=ignored_conditions, included_conditions=included_conditions, nterm=nterm, cterm=cterm)
+ normalization=normalization, excluded_conditions=excluded_conditions, included_conditions=included_conditions, nterm=nterm, cterm=cterm)
self.winz = winz
self.covars = covars
self.interactions = interactions
@@ -96,18 +96,18 @@ class ZinbMethod(base.MultiConditionMethod):
refs = kwargs.get("-ref",[]) # list of condition names to use a reference for calculating LFCs
if refs!=[]: refs = refs.split(',')
winz = True if "w" in kwargs else False
- ignored_conditions = list(filter(None, kwargs.get("-ignore-conditions", "").split(",")))
+ excluded_conditions = list(filter(None, kwargs.get("-exclude-conditions", "").split(",")))
included_conditions = list(filter(None, kwargs.get("-include-conditions", "").split(",")))
# check for unrecognized flags
- flags = "-n --ignore-conditions --include-conditions -iN -iC -PC --condition --covars --interactions --gene --ref".split()
+ flags = "-n --exclude-conditions --include-conditions -iN -iC -PC --condition --covars --interactions --gene --ref".split()
for arg in rawargs:
if arg[0]=='-' and arg not in flags:
self.transit_error("flag unrecognized: %s" % arg)
print(ZinbMethod.usage_string())
sys.exit(0)
- return self(combined_wig, metadata, annotation, normalization, output_file, ignored_conditions, included_conditions, winz, NTerminus, CTerminus, condition, covars, interactions, PC, refs)
+ return self(combined_wig, metadata, annotation, normalization, output_file, excluded_conditions, included_conditions, winz, NTerminus, CTerminus, condition, covars, interactions, PC, refs)
def wigs_to_conditions(self, conditionsByFile, filenamesInCombWig):
"""
@@ -523,6 +523,8 @@ class ZinbMethod(base.MultiConditionMethod):
condition_name = self.condition
# if a covar is not found, this crashes; check for it?
+ # read it first with no condition specified, to get original Condition names
+ conditionsByFile1, covariatesByFileList1, interactionsByFileList1, orderingMetadata1 = tnseq_tools.read_samples_metadata(self.metadata, self.covars, self.interactions) # without specifiying condition
conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata(self.metadata, self.covars, self.interactions, condition_name=condition_name)
## [Condition] in the order of files in combined wig
@@ -538,19 +540,37 @@ class ZinbMethod(base.MultiConditionMethod):
interactionsByFileList,
filenamesInCombWig)
- conditionsList = self.select_conditions(conditions,self.included_conditions,self.ignored_conditions,orderingMetadata)
- data, conditions, covariates, interactions = self.filter_wigs_by_conditions2(
+ # this was the old way to filter samples, where --include/exclude-conditions depended on which --condition was specified
+ #conditionsList = self.select_conditions(conditions,self.included_conditions,self.excluded_conditions,orderingMetadata)
+ #data, conditions, covariates, interactions = self.filter_wigs_by_conditions2(
+ # data,
+ # conditions,
+ # conditionsList,
+ # covariates = covariates,
+ # interactions = interactions)
+ #filesByCondition = self.invertDict(conditionsByFile)
+ #samples_used = set()
+ #for cond in conditionsList: samples_used.update(filesByCondition[cond])
+
+ metadata = self.get_samples_metadata()
+ conditionNames = metadata['Condition'] # original Condition names for each sample, as ordered list
+ fileNames = metadata['Filename']
+
+ # this is the new way to filter samples, where --include/exclude-conditions refers to Condition column in metadata, regardless of whether --condition was specified
+ data, fileNames, conditionNames, conditions, covariates, interactions = self.filter_wigs_by_conditions3(
data,
- conditions,
- conditionsList,
+ fileNames,
+ conditionNames, # original Condition column in samples metadata file
+ self.included_conditions,
+ self.excluded_conditions,
+ conditions = conditions, # user might have specified a column other than Condition
covariates = covariates,
interactions = interactions)
-
+ conditionsList = list(set(conditions)) # unique condition labels, filtered
+ samples_used = set(fileNames)
+
# show the samples associated with each condition (and covariates or interactions, if defined), and count samples in each cross-product of vars
- filesByCondition = self.invertDict(conditionsByFile)
- samples_used = set()
- for cond in conditionsList: samples_used.update(filesByCondition[cond])
vars = [condition_name]+self.covars+self.interactions
vars2vals = {}
vars2vals[condition_name] = list(set(conditions))
@@ -558,7 +578,7 @@ class ZinbMethod(base.MultiConditionMethod):
for i,var in enumerate(self.interactions): vars2vals[var] = list(set(interactions[i]))
varsByFileList = [conditionsByFile]+covariatesByFileList+interactionsByFileList
for i,var in enumerate(vars):
- print("\nCondition/Covariate/Interaction: %s" % vars[i])
+ print("\nsamples for Condition/Covariate/Interaction: %s" % vars[i])
filesByVar = self.invertDict(varsByFileList[i])
for k,v in filesByVar.items():
samples = list(samples_used.intersection(set(v)))
@@ -584,7 +604,11 @@ class ZinbMethod(base.MultiConditionMethod):
c1, i1 = (ic1[0], ic1[1]) if len(ic1) > 1 else (ic1[0], None)
c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None)
- if len(self.included_conditions) > 0:
+ # use --include-conditions to determine order of columns in output file
+ # this only works if an alternative --condition was not specified
+ # otherwise don't try to order them this way because it gets too complicated
+ # possibly should require --covars and --interactions to be unspecified too
+ if self.condition=="Condition" and len(self.included_conditions) > 0 and len(self.excluded_conditions)==0:
condDiff = (self.included_conditions.index(c1) - self.included_conditions.index(c2))
## Order by interaction, if stat belongs to same condition
if condDiff == 0 and i1 is not None and i2 is not None:
@@ -640,8 +664,8 @@ class ZinbMethod(base.MultiConditionMethod):
Optional Arguments:
-n <string> := Normalization method. Default: -n TTR
- --ignore-conditions <cond1,cond2> := Comma separated list of conditions to ignore, for the analysis.
- --include-conditions <cond1,cond2> := Comma separated list of conditions to include, for the analysis. Conditions not in this list, will be ignored.
+ --exclude-conditions <cond1,cond2> := Comma separated list of conditions to exclude, for the analysis.
+ --include-conditions <cond1,cond2> := Comma separated list of conditions to include, for the analysis. Conditions not in this list will be excluded.
--ref <cond> := which condition(s) to use as a reference for calculating LFCs (comma-separated if multiple conditions)
-iN <float> := Ignore TAs occuring within given percentage (as integer) of the N terminus. Default: -iN 5
-iC <float> := Ignore TAs occuring within given percentage (as integer) of the C terminus. Default: -iC 5
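
The net effect of filter_wigs_by_conditions3() is that --include-conditions and --exclude-conditions are matched against the original Condition column even when --condition selects a different metadata column for the analysis itself. A toy sketch using values from the sample metadata file in this commit (Condition vs. NewConditionCol):

    # One entry per sample, in combined-wig order (values from the sample
    # metadata file shown later in this diff):
    conditionNames = ["Cholesterol", "Cholesterol", "Cholesterol", "Glycerol", "Glycerol"]
    conditions     = ["X", "X", "X", "Y", "Y"]   # as if run with --condition NewConditionCol
    included, excluded = ["Cholesterol"], []

    kept = [i for i in range(len(conditions))
            if (len(included) == 0 or conditionNames[i] in included)
            and conditionNames[i] not in excluded]

    print([conditions[i] for i in kept])         # ['X', 'X', 'X']
    # Samples were filtered by their original Condition names; the analysis
    # still sees the NewConditionCol labels for the samples that survive.
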
=====================================
src/pytransit/data/samples_metadata_cg_covar.txt
=====================================
@@ -1,6 +1,6 @@
-Id NewConditionCol Filename Batch
-c1 Cholesterol src/pytransit/data/cholesterol_H37Rv_rep1.wig b1
-c2 Cholesterol src/pytransit/data/cholesterol_H37Rv_rep2.wig b1
-c3 Cholesterol src/pytransit/data/cholesterol_H37Rv_rep3.wig b2
-g1 Glycerol src/pytransit/data/glycerol_H37Rv_rep1.wig b2
-g2 Glycerol src/pytransit/data/glycerol_H37Rv_rep2.wig b2
+Id Condition Filename Batch NewConditionCol
+c1 Cholesterol src/pytransit/data/cholesterol_H37Rv_rep1.wig b1 X
+c2 Cholesterol src/pytransit/data/cholesterol_H37Rv_rep2.wig b1 X
+c3 Cholesterol src/pytransit/data/cholesterol_H37Rv_rep3.wig b2 X
+g1 Glycerol src/pytransit/data/glycerol_H37Rv_rep1.wig b2 Y
+g2 Glycerol src/pytransit/data/glycerol_H37Rv_rep2.wig b2 Y
=====================================
src/pytransit/doc/source/transit_methods.rst
=====================================
@@ -32,18 +32,69 @@ probability of this using a Bayesian model.
How does it work?
-----------------
-| For a formal description of how this method works, see our paper [DeJesus2013]_:
+|
-| DeJesus, M.A., Zhang, Y.J., Sassettti, C.M., Rubin, E.J.,
- Sacchettini, J.C., and Ioerger, T.R. (2013).
-| `Bayesian analysis of gene essentiality based on sequencing of transposon insertion libraries. <http://www.ncbi.nlm.nih.gov/pubmed/23361328>`_ *Bioinformatics*, 29(6):695-703.
+This method for identifying essential genes is based on analyzing
+'gaps', or consecutive sequences of TA sites lacking insertions.
+The statistical significance of the length of a gap is determined
+using the Gumbel distribution, which is a form of an Extreme-Value distribution.
-Example
--------
+For a formal description of how this method works, see our paper [DeJesus2013]_:
+
+|
+ DeJesus, M.A., Zhang, Y.J., Sassetti, C.M., Rubin, E.J.,
+ Sacchettini, J.C., and Ioerger, T.R. (2013). `Bayesian analysis of
+ gene essentiality based on sequencing of transposon insertion
+ libraries. <http://www.ncbi.nlm.nih.gov/pubmed/23361328>`_
+ *Bioinformatics*, 29(6):695-703.
+
+|
+**Update (2021) - Binomial**
+
+Since the Gumbel method depends on the overall
+saturation (percent of TA sites with insertions), it can sometimes
+call many smaller genes 'Uncertain', which reduces the total
+number of essentials detected. For example, at a saturation of
+~30%, genes with fewer than ~10 TA sites might never be confidently
+labeled as Essential. In particular, there are often genes with no
+insertions that look like they should obviously be called essential,
+and yet they are too short for the conservative Gumbel model to
+call essential with confidence in low-saturation datasets.
+
+To compensate for this, we have added a simple **Binomial** model for
+detecting small genes totally lacking insertions (described in `(Choudhery et al, 2021)
+<https://www.biorxiv.org/content/10.1101/2021.07.01.450749v2>`_). In
+the output file, these are labeled 'EB' to distinguish them from
+essentials (E) called by the Gumbel model. The EB genes supplement
+genes marked E by Gumbel, and the two groups together (E and
+EB) should be considered the set of essentials. The number of genes in
+each category is reported in a table in the header of the output file, like this:
::
- python3 transit.py gumbel <comma-separated .wig files> <annotation .prot_table or GFF3> <output file> [Optional Arguments]
+ (for an M. tuberculosis TnSeq dataset with 60% saturation:)
+ #Summary of Essentiality Calls:
+ # E = 551 (essential based on Gumbel)
+ # EB = 94 (essential based on Binomial)
+ # NE = 2829 (non-essential)
+ # U = 262 (uncertain)
+ # S = 254 (too short)
+
+ (for an M. tuberculosis TnSeq dataset with 40% saturation:)
+ # E = 194 (essential based on Gumbel)
+ # EB = 315 (essential based on Binomial)
+ # NE = 2441 (non-essential)
+ # U = 774 (uncertain)
+ # S = 266 (too short)
+
+
+
+Usage
+-----
+
+::
+
+ > python3 transit.py gumbel <comma-separated .wig files> <annotation .prot_table or GFF3> <output file> [Optional Arguments]
Optional Arguments:
-s <integer> := Number of samples. Default: -s 10000
-b <integer> := Number of Burn-in samples. Default -b 500
@@ -1053,7 +1104,7 @@ Example
python3 transit.py anova <combined wig file> <samples_metadata file> <annotation .prot_table> <output file> [Optional Arguments]
Optional Arguments:
-n <string> := Normalization method. Default: -n TTR
- --ignore-conditions <cond1,...> := Comma separated list of conditions to ignore, for the analysis. Default: None
+ --exclude-conditions <cond1,...> := Comma separated list of conditions to exclude, for the analysis. Default: None
--include-conditions <cond1,...> := Comma separated list of conditions to include, for the analysis. Default: All
--ref <cond> := which condition(s) to use as a reference for calculating LFCs (comma-separated if multiple conditions)
-iN <float> := Ignore TAs occurring within given percentage (as integer) of the N terminus. Default: -iN 0
@@ -1092,7 +1143,7 @@ The following parameters are available for the ANOVA method:
- **\-\-include-conditions:** Includes the given set of conditions from the ZINB test. Conditions not in this list are ignored. Note: this is useful for specifying the order in which the columns are listed in the output file.
-- **\-\-ignore-conditions:** Can use this to drop conditions not of interest.
+- **\-\-exclude-conditions:** Can use this to drop conditions not of interest.
- **\-\-ref:** Specify which condition to use as a reference for computing LFCs.
By default, LFCs for each gene in each condition are calculated with respect
@@ -1218,8 +1269,8 @@ Example
python3 transit.py zinb <combined wig file> <samples_metadata file> <annotation .prot_table> <output file> [Optional Arguments]
Optional Arguments:
-n <string> := Normalization method. Default: -n TTR
- --ignore-conditions <cond1,cond2> := Comma separated list of conditions to ignore, for the analysis. Default: None
- --include-conditions <cond1,cond2> := Comma separated list of conditions to include, for the analysis. Default: All
+ --exclude-conditions <cond1,...> := Comma separated list of conditions to exclude, for the analysis. Default: None
+ --include-conditions <cond1,...> := Comma separated list of conditions to include, for the analysis. Default: All
--ref <cond> := which condition(s) to use as a reference for calculating LFCs (comma-separated if more than one)
-iN <float> := Ignore TAs occuring within given percentage of the N terminus. Default: -iN 5
-iC <float> := Ignore TAs occuring within given percentage of the C terminus. Default: -iC 5
@@ -1281,10 +1332,10 @@ of the combined_wig (including pathnames, if present).
Parameters
----------
-The following parameters are available for the method:
+The following parameters are available for the ZINB method:
- **\-\-include-conditions:** Includes the given set of conditions from the ZINB test. Conditions not in this list are ignored. Note: this is useful for specifying the order in which the columns are listed in the output file.
-- **\-\-ignore-conditions:** Ignores the given set of conditions from the ZINB test.
+- **\-\-exclude-conditions:** Excludes the given set of conditions from the ZINB test.
- **\-\-ref:** which condition to use as a reference when computing LFCs in the output file
- **Normalization Method:** Determines which normalization method to
use when comparing datasets. Proper normalization is important as it
=====================================
src/pytransit/stat_tools.py
=====================================
@@ -512,7 +512,7 @@ def F_shuffle_dict_libraries(*args, **kwargs):
perm = numpy.random.permutation(combined)
#print("perm", perm)
#print("perm[:n1]", perm[:n1])
- E[L] = numpy.array([perm[:n1], perm[n1:]])
+ E[L] = numpy.array([perm[:n1], perm[n1:]],dtype=object)
#print("D[L]", D[L])
return E
@@ -566,7 +566,8 @@ def resampling(data1, data2, S=10000, testFunc=F_mean_diff_flat,
# - Check library strings match in some way
lib_diff = set(lib_str1) ^ set(lib_str2)
if lib_diff:
- raise ValueError("At least one library string has a letter not used by the other:\ %s" % ", ".join(lib_diff))
+ #raise ValueError("At least one library string has a letter not used by the other:\ %s" % ", ".join(lib_diff))
+ raise ValueError("At least one library string has a letter not used by the other: " + ", ".join(lib_diff))
# - Check input has some data
assert len(data1) > 0, "Data1 cannot be empty"
@@ -714,7 +715,7 @@ def combine_lib_dicts(L1, L2):
KEYS = L1.keys()
DATA = {}
for K in KEYS:
- DATA[K] = numpy.array([L1[K], L2[K]])
+ DATA[K] = numpy.array([L1[K], L2[K]],dtype=object)
return DATA
#
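
The dtype=object additions here and in F_shuffle_dict_libraries() matter because the two halves of a permuted library (perm[:n1] and perm[n1:]) can differ in length, and recent NumPy releases refuse to build a regular 2-D array from such ragged input. A minimal illustration:

    import numpy

    perm = numpy.random.permutation([5, 1, 4, 2, 8])
    n1 = 2
    # perm[:n1] has 2 elements, perm[n1:] has 3: a ragged pair.
    # numpy.array([perm[:n1], perm[n1:]]) raises ValueError on recent NumPy
    # (older releases only emitted VisibleDeprecationWarning), hence dtype=object:
    E = numpy.array([perm[:n1], perm[n1:]], dtype=object)
    print(E.shape)   # (2,) -- an object array holding the two unequal halves
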
=====================================
src/pytransit/tnseq_tools.py
=====================================
@@ -80,6 +80,9 @@ def read_samples_metadata(metadata_file, covarsToRead = [], interactionsToRead =
interactionsByFileList = [{} for i in range(len(interactionsToRead))]
headersToRead = [condition_name.lower(), "filename"]
orderingMetadata = { 'condition': [], 'interaction': [] }
+ print(headersToRead)
+ print(covarsToRead)
+ print(interactionsToRead)
with open(metadata_file) as mfile:
lines = mfile.readlines()
headIndexes = [i
=====================================
tests/test_tpp.py
=====================================
@@ -35,7 +35,7 @@ NOFLAG_PRIMER = [
"# FR_corr (Fwd templates vs. Rev templates): 0.019",
"# transposon type: Himar1",
"# protocol type: Sassetti",
- "# primer_matches: 8 reads (0.8%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)"
+ "# primer_matches: 8 reads (0.8%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)"
]
MME1_PROTOCOL = [
@@ -46,7 +46,7 @@ MME1_PROTOCOL = [
"# NZ_mean (among templates): 1.0",
"# transposon type: Himar1",
"# protocol type: Mme1",
- "# primer_matches: 8 reads (0.8%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)"
+ "# primer_matches: 8 reads (0.8%) contain CTAGAGGGCCCAATTCGCCCTATAGTGAGT (Himar1)"
]
FLAG_PRIMER = [
View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/commit/fac2650606c9600626d84ca58087250253aed295
You're receiving this email because of your account on salsa.debian.org.