[med-svn] [Git][med-team/tnseq-transit][upstream] New upstream version 3.2.6
Andreas Tille (@tille)
gitlab at salsa.debian.org
Fri Aug 26 07:00:34 BST 2022
Andreas Tille pushed to branch upstream at Debian Med / tnseq-transit
Commits:
e2e9cf2b by Andreas Tille at 2022-08-26T07:48:16+02:00
New upstream version 3.2.6
- - - - -
9 changed files:
- .gitignore
- CHANGELOG.md
- src/pytransit/__init__.py
- src/pytransit/analysis/anova.py
- src/pytransit/analysis/resampling.py
- src/pytransit/doc/source/method_anova.rst
- src/pytransit/export/combined_wig.py
- src/pytransit/stat_tools.py
- tests/test_analysis_methods.py
Changes:
=====================================
.gitignore
=====================================
@@ -1,5 +1,6 @@
# # Mac
.DS_STORE
+.DS_Store
.env
# Byte-compiled / optimized / DLL files
=====================================
CHANGELOG.md
=====================================
@@ -2,6 +2,18 @@
All notable changes to this project will be documented in this file.
+## Version 3.2.6 2021-08-03
+#### TRANSIT:
+
+Major changes:
+ - added a parameter 'alpha' to ANOVA to make the F-test less sensitive to genes with low counts, cutting down on 'irrelevant' genes with significant variability
+ - updated the online documentation to describe this
+
+Minor changes:
+ - fixed a (recently-introduced) bug that was causing the GUI to crash when running resampling
+ - updated 'export combined_wig' to include ref genome and column headers
+
+
## Version 3.2.5 2021-06-15
#### TRANSIT:
=====================================
src/pytransit/__init__.py
=====================================
@@ -2,6 +2,6 @@
__all__ = ["transit_tools", "tnseq_tools", "norm_tools", "stat_tools"]
-__version__ = "v3.2.5"
+__version__ = "v3.2.6"
prefix = "[TRANSIT]"
=====================================
src/pytransit/analysis/anova.py
=====================================
@@ -35,10 +35,11 @@ class AnovaMethod(base.MultiConditionMethod):
"""
anova
"""
- def __init__(self, combined_wig, metadata, annotation, normalization, output_file, excluded_conditions=[], included_conditions=[], nterm=0.0, cterm=0.0, PC=1, winz=False, refs=[]):
+ def __init__(self, combined_wig, metadata, annotation, normalization, output_file, excluded_conditions=[], included_conditions=[], nterm=0.0, cterm=0.0, PC=5, winz=False, refs=[], alpha=1000):
base.MultiConditionMethod.__init__(self, short_name, long_name, short_desc, long_desc, combined_wig, metadata, annotation, output_file,
normalization=normalization, excluded_conditions=excluded_conditions, included_conditions=included_conditions, nterm=nterm, cterm=cterm)
self.PC = PC
+ self.alpha = alpha
self.refs = refs
self.winz = winz
@@ -62,20 +63,21 @@ class AnovaMethod(base.MultiConditionMethod):
CTerminus = float(kwargs.get("iC", 0.0))
winz = True if "winz" in kwargs else False
PC = int(kwargs.get("PC", 5))
+ alpha = float(kwargs.get("alpha", 1000))
refs = kwargs.get("-ref",[]) # list of condition names to use a reference for calculating LFCs
if refs!=[]: refs = refs.split(',')
excluded_conditions = list(filter(None, kwargs.get("-exclude-conditions", "").split(",")))
included_conditions = list(filter(None, kwargs.get("-include-conditions", "").split(",")))
# check for unrecognized flags
- flags = "-n --exclude-conditions --include-conditions -iN -iC -PC --ref -winz".split()
+ flags = "-n --exclude-conditions --include-conditions -iN -iC -PC --ref -winz -alpha".split()
for arg in rawargs:
if arg[0]=='-' and arg not in flags:
self.transit_error("flag unrecognized: %s" % arg)
print(AnovaMethod.usage_string())
sys.exit(0)
- return self(combined_wig, metadata, annotation, normalization, output_file, excluded_conditions, included_conditions, NTerminus, CTerminus, PC, winz, refs)
+ return self(combined_wig, metadata, annotation, normalization, output_file, excluded_conditions, included_conditions, NTerminus, CTerminus, PC, winz, refs, alpha)
def wigs_to_conditions(self, conditionsByFile, filenamesInCombWig):
"""
@@ -154,25 +156,48 @@ class AnovaMethod(base.MultiConditionMethod):
count = 0
self.progress_range(len(genes))
- pvals,Rvs,status = [],[],[]
+ MSR,MSE,Fstats,pvals,Rvs,status = [],[],[],[],[],[]
for gene in genes:
count += 1
Rv = gene["rv"]
if (len(RvSiteindexesMap[Rv]) <= 1):
status.append("TA sites <= 1")
- pvals.append(1)
+ msr,mse,Fstat,pval = 0,0,-1,1
else:
countSum, countsVec = self.group_by_condition(list(map(lambda wigData: wigData[RvSiteindexesMap[Rv]], data)), conditions)
if self.winz: countsVec = self.winsorize(countsVec)
if (countSum == 0):
- pval = 1
+ msr,mse,Fstat,pval = 0,0,-1,1
status.append("No counts in all conditions")
- pvals.append(pval)
else:
- stat,pval = scipy.stats.f_oneway(*countsVec)
+ Fstat,pval = scipy.stats.f_oneway(*countsVec)
status.append("-")
- pvals.append(pval)
+ # countsVec is a list of numpy arrays, or could be a list of lists
+ # pooled counts for each condition, over TAs in gene and replicates
+ if isinstance(countsVec[0],numpy.ndarray):
+ countsVecAsArrays = countsVec
+ countsVecAsLists = [grp.tolist() for grp in countsVec]
+ else:
+ countsVecAsArrays = [numpy.array(grp) for grp in countsVec]
+ countsVecAsLists = countsVec
+ allcounts = [item for sublist in countsVecAsLists for item in sublist]
+ grandmean = numpy.mean(allcounts)
+ groupmeans = [numpy.mean(grp) for grp in countsVecAsArrays]
+ k,n = len(countsVec),len(allcounts)
+ dfBetween,dfWithin = k-1,n-k
+ msr,mse = 0,0
+ for grp in countsVecAsArrays: msr += grp.size*(numpy.mean(grp)-grandmean)**2/float(dfBetween)
+ for grp,mn in zip(countsVecAsArrays,groupmeans): mse += numpy.sum((grp-mn)**2)
+ mse /= float(dfWithin)
+ mse = mse+self.alpha ### moderation
+ Fmod = msr/float(mse)
+ Pmod = scipy.stats.f.sf(Fmod,dfBetween,dfWithin)
+ Fstat,pval = Fmod,Pmod
+ pvals.append(pval)
+ Fstats.append(Fstat)
+ MSR.append(msr)
+ MSE.append(mse)
Rvs.append(Rv)
# Update progress
@@ -184,12 +209,12 @@ class AnovaMethod(base.MultiConditionMethod):
qvals = numpy.full(pvals.shape, numpy.nan)
qvals[mask] = statsmodels.stats.multitest.fdrcorrection(pvals[mask])[1] # BH, alpha=0.05
- p,q,statusMap = {},{},{}
+ msr,mse,f,p,q,statusMap = {},{},{},{},{},{}
for i,rv in enumerate(Rvs):
- p[rv],q[rv],statusMap[rv] = pvals[i],qvals[i],status[i]
- return (p, q, statusMap)
+ msr[rv],mse[rv],f[rv],p[rv],q[rv],statusMap[rv] = MSR[i],MSE[i],Fstats[i],pvals[i],qvals[i],status[i]
+ return (msr, mse, f, p, q, statusMap)
- def calcLFCs(self,means,refs=[],PC=1):
+ def calcLFCs(self,means,refs=[],PC=5):
if len(refs)==0: refs = means # if ref condition(s) not explicitly defined, use mean of all
grandmean = numpy.mean(refs)
lfcs = [math.log((x+PC)/float(grandmean+PC),2) for x in means]
@@ -231,7 +256,7 @@ class AnovaMethod(base.MultiConditionMethod):
MeansByRv = self.means_by_rv(data, RvSiteindexesMap, genes, conditions)
self.transit_message("Running Anova")
- pvals,qvals,run_status = self.run_anova(data, genes, MeansByRv, RvSiteindexesMap, conditions)
+ MSR,MSE,Fstats,pvals,qvals,run_status = self.run_anova(data, genes, MeansByRv, RvSiteindexesMap, conditions)
self.transit_message("Adding File: %s" % (self.output))
file = open(self.output,"w")
@@ -239,9 +264,9 @@ class AnovaMethod(base.MultiConditionMethod):
heads = ("Rv Gene TAs".split() +
["Mean_%s" % x for x in conditionsList] +
["LFC_%s" % x for x in conditionsList] +
- "pval padj".split() + ["status"])
+ "MSR MSE+alpha Fstat Pval Padj".split() + ["status"])
file.write("#Console: python3 %s\n" % " ".join(sys.argv))
- file.write("#parameters: normalization=%s, trimming=%s/%s%% (N/C), pseudocounts=%s\n" % (self.normalization,self.NTerminus,self.CTerminus,self.PC))
+ file.write("#parameters: normalization=%s, trimming=%s/%s%% (N/C), pseudocounts=%s, alpha=%s\n" % (self.normalization,self.NTerminus,self.CTerminus,self.PC,self.alpha))
file.write('#'+'\t'.join(heads)+EOL)
for gene in genes:
Rv = gene["rv"]
@@ -252,7 +277,7 @@ class AnovaMethod(base.MultiConditionMethod):
vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] +
["%0.2f" % x for x in means] +
["%0.3f" % x for x in LFCs] +
- ["%f" % x for x in [pvals[Rv], qvals[Rv]]] + [run_status[Rv]])
+ ["%f" % x for x in [MSR[Rv], MSE[Rv], Fstats[Rv], pvals[Rv], qvals[Rv]]] + [run_status[Rv]])
file.write('\t'.join(vals)+EOL)
file.close()
self.transit_message("Finished Anova analysis")
@@ -268,7 +293,8 @@ class AnovaMethod(base.MultiConditionMethod):
--ref <cond> := which condition(s) to use as a reference for calculating LFCs (comma-separated if multiple conditions)
-iN <N> := Ignore TAs within given percentage (e.g. 5) of N terminus. Default: -iN 0
-iC <N> := Ignore TAs within given percentage (e.g. 5) of C terminus. Default: -iC 0
- -PC <N> := pseudocounts to use for calculating LFC. Default: -PC 5
+ -PC <N> := pseudocounts to use for calculating LFCs. Default: -PC 5
+ -alpha <N> := value added to MSE in F-test for moderated anova (makes genes with low counts less significant). Default: -alpha 1000
-winz := winsorize insertion counts for each gene in each condition (replace max cnt with 2nd highest; helps mitigate effect of outliers)"""
return usage
=====================================
src/pytransit/analysis/resampling.py
=====================================
@@ -155,7 +155,7 @@ class ResamplingGUI(base.AnalysisGUI):
resamplingSizer.Add( self.wxobj.resamplingLoessPrev, 0, wx.ALL|wx.CENTER, 5 )
# Adaptive Check
- (self.wxobj.resamplingAdaptiveCheckBox, adaptiveSizer) = self.defineCheckBox(resamplingPanel, labelText="Adaptive Resampling (Faster)", widgetCheck=False, widgetSize=(-1,-1), tooltipText="Dynamically stops permutations early if it is unlikely the ORF will be significant given the results so far. Improves performance, though p-value calculations for genes that are not differentially essential will be less accurate.")
+ (self.wxobj.resamplingAdaptiveCheckBox, adaptiveSizer) = self.defineCheckBox(resamplingPanel, labelText="Adaptive Resampling (Faster)", widgetCheck=True, widgetSize=(-1,-1), tooltipText="Dynamically stops permutations early if it is unlikely the ORF will be significant given the results so far. Improves performance, though p-value calculations for genes that are not differentially essential will be less accurate.")
resamplingSizer.Add( adaptiveSizer, 0, wx.EXPAND, 5 )
# Histogram Check
@@ -312,7 +312,7 @@ class ResamplingMethod(base.DualConditionMethod):
NTerminus,
CTerminus,
ctrl_lib_str,
- exp_lib_str, wxobj, Z = False, diffStrains = diffStrains, annotation_path_exp = annotationPathExp)
+ exp_lib_str, wxobj=wxobj, Z = False, diffStrains = diffStrains, annotation_path_exp = annotationPathExp)
@classmethod
def fromargs(self, rawargs):
=====================================
src/pytransit/doc/source/method_anova.rst
=====================================
@@ -29,7 +29,7 @@ Example
::
- python3 transit.py anova <combined wig file> <samples_metadata file> <annotation .prot_table> <output file> [Optional Arguments]
+ > python3 transit.py anova <combined wig file> <samples_metadata file> <annotation .prot_table> <output file> [Optional Arguments]
Optional Arguments:
-n <string> := Normalization method. Default: -n TTR
--exclude-conditions <cond1,...> := Comma separated list of conditions to ignore for the analysis. Default: None
@@ -37,7 +37,8 @@ Example
--ref <cond> := which condition(s) to use as a reference for calculating LFCs (comma-separated if multiple conditions) (by default, LFCs for each condition are computed relative to the grandmean across all condintions)
-iN <float> := Ignore TAs occurring within given percentage (as integer) of the N terminus. Default: -iN 0
-iC <float> := Ignore TAs occurring within given percentage (as integer) of the C terminus. Default: -iC 0
- -PC := Pseudocounts to use in calculating LFCs. Default: -PC 5
+ -PC <N> := Pseudocounts to use in calculating LFCs. Default: -PC 5
+ -alpha <N> := value added to MSE in F-test for moderated anova (makes genes with low counts less significant). Default: -alpha 1000
-winz := winsorize insertion counts for each gene in each condition
(replace max count in each gene with 2nd highest; helps mitigate effect of outliers)
@@ -82,16 +83,23 @@ The following parameters are available for the ANOVA method:
with conditions with lower counts). However, if there is a defined reference condition
in the data, it may be specified using **\-\-ref** (in which case LFCs for that condition will
be around 0, and will be positive or negative for the other conditions, depending on whether
- counts are higher or lower than the reference condintion. If there is more than one
+ counts are higher or lower than the reference condition. If there is more than one
condition to use as reference (i.e. pooled), they may be given as a comma-separated list.
-- **-n** Normalization Method. Determines which normalization method to
+- **-n**: Normalization Method. Determines which normalization method to
use when comparing datasets. Proper normalization is important as it
ensures that other sources of variability are not mistakenly treated
as real differences. See the :ref:`Normalization <normalization>` section for a description
- of normalization method available in TRANSIT.
+ of normalization method available in TRANSIT. Default: -n TTR
-- **-PC** Pseudocounts to use in calculating LFCs (see below). Default: -PC 5
+- **-PC <N>**: Pseudocounts to use in calculating LFCs (see below). Default: -PC 5
+
+- **-alpha <N>**: Value added to MSE in F-test for moderated ANOVA: F = MSR/(MSE+alpha).
+ This is helpful because genes with very low counts are occasionally ranked as significant
+ by traditional ANOVA, even though the apparent variability is probably due to noise.
+ Setting alpha to a number like 1000 helps filter out these irrelevant genes
+ by reducing their significance. If you want to emulate the
+ standard ANOVA test, you can set alpha to 0. Default: -alpha 1000
- **-winz**: `winsorize <https://en.wikipedia.org/wiki/Winsorizing>`_ insertion counts for each gene in each condition.
Replace max count in each gene with 2nd highest. This can help mitigate effect of outliers.
@@ -118,6 +126,10 @@ typical threshold for conditional essentiality on is q-value < 0.05.
+-----------------+----------------------------------------------------------------------------+
| LFCs... | Log-fold-changes of counts in each condition vs mean across all conditions |
+-----------------+----------------------------------------------------------------------------+
+| MSR | Mean-squared residual |
++-----------------+----------------------------------------------------------------------------+
+| MSE+alpha | Mean-squared error, plus moderation value |
++-----------------+----------------------------------------------------------------------------+
| p-value | P-value calculated by the Anova test. |
+-----------------+----------------------------------------------------------------------------+
| p-adj | Adjusted p-value controlling for the FDR (Benjamini-Hochberg) |
@@ -128,8 +140,8 @@ typical threshold for conditional essentiality on is q-value < 0.05.
**LFCs** (log-fold-changes):
For each condition, the LFC is calculated as the log-base-2 of the
-ratio of mean insertion count in that condition **relative to the
-mean of means across all the conditions**.
+ratio of mean insertion count in that condition **relative to the mean of means across all the conditions**.
+You can use the '--ref' flag to designate a specific condition as a reference for computing LFCs.
Pseudocount are incorporated to reduce the impact of noise on LFCs, based on the formula below.
The pseudocounts can be adjusted using the -PC flag.
Changing the pseudocounts (via -PC) can reduce the artifactual appearance of genes with
@@ -141,7 +153,6 @@ Changing the pseudocounts will not affect the analysis of statistical significan
LFC = log2((mean_insertions_in_condition + PC)/(mean_of_means_across_all_conditions + PC))
-
|
Run-time
=====================================
src/pytransit/export/combined_wig.py
=====================================
@@ -175,11 +175,23 @@ class CombinedWigMethod(base.SingleConditionMethod):
else:
self.output.write("#Normalization Factors: %s\n" % " ".join([",".join(["%s" % bx for bx in b]) for b in factors]))
+ # get ref genome from first wig file (we could check that they are all the same)
+ # by this point, we know all the wig files exist and have same number of TA sites
+ # this assumes 2nd line of each wig file is "variableStep chrom=XXX"; if not, set ref to "unknown"
+ self.ref = "unknown"
+ wig = self.ctrldata[0]
+ with open(wig) as file:
+ line = file.readline()
+ line = file.readline()
+ if line.startswith("variableStep"):
+ w = line.rstrip().split("=")
+ if len(w)>=2: self.ref = w[1]
(K,N) = fulldata.shape
+ self.output.write("#RefGenome: %s\n" % self.ref)
for f in self.ctrldata:
self.output.write("#File: %s\n" % f)
- self.output.write("#TAcoord\t%s\n" % ('\t'.join(self.ctrldata)))
+ self.output.write("#TA_coord\t%s\n" % ('\t'.join(self.ctrldata)))
for i,pos in enumerate(position):
#self.output.write("%d\t%s\t%s\n" % (position[i], "\t".join(["%1.1f" % c for c in fulldata[:,i]]),",".join(["%s (%s)" % (orf,rv2info.get(orf,["-"])[0]) for orf in hash.get(position[i], [])]) ))
=====================================
src/pytransit/stat_tools.py
=====================================
@@ -1,4 +1,4 @@
-import math
+import sys,math
import numpy
import scipy.stats
@@ -527,7 +527,7 @@ def resampling(data1, data2, S=10000, testFunc=F_mean_diff_flat,
the data.
Args:
- ar: List or numpy array with the first set of observations.
+ data1: List or numpy array with the first set of observations.
data2: List or numpy array with the second set of observations.
S: Number of permutation tests (or samples) to obtain.
testFunc: Function defining the desired test statistic. Should accept
@@ -573,6 +573,9 @@ def resampling(data1, data2, S=10000, testFunc=F_mean_diff_flat,
assert len(data1) > 0, "Data1 cannot be empty"
assert len(data2) > 0, "Data2 cannot be empty"
+ if isinstance(data1,list): data1 = numpy.array(data1)
+ if isinstance(data2,list): data2 = numpy.array(data2)
+
count_ltail = 0
count_utail = 0
count_2tail = 0
@@ -596,14 +599,13 @@ def resampling(data1, data2, S=10000, testFunc=F_mean_diff_flat,
if mean1 > 0 and mean2 > 0: log2FC = math.log((mean2)/(mean1),2)
else: log2FC = math.log((mean2+1.0)/(mean1+1.0),2)
-
# Get stats and info based on whether working with libraries or not:
nTAs = 0
if lib_str1:
+ # note: returns a generator, not a list
# Get number of TA sites implied
nTAs = len(data1.flatten())//len(lib_str1)
- assert len(data2.flatten())//len(lib_str2) == nTAs, "Datasets do not have matching sites;\
- check input data and library strings."
+ assert len(data2.flatten())//len(lib_str2) == nTAs, "Datasets do not have matching sites; check input data and library strings."
# Get data
perm = get_lib_data_dict(data1, lib_str1, data2, lib_str2, nTAs)
test_obs = testFunc(perm)
=====================================
tests/test_analysis_methods.py
=====================================
@@ -80,7 +80,7 @@ class TestMethods(TransitTestCase):
self.assertLessEqual(
abs(len(sig_qvals) - 35),
2,
- "sig_qvals expected in range: %s, actual: %d" % ("[33, 37]", len(sig_qvals)))
+ "sig_qvals expected in range: %s, actual: %d" % ("[33, 37]", len(sig_qvals))) # maybe acceptable range should be expanded to 38
def test_resampling_combined_wig(self):
# The conditions in the args should be matched case-insensitively.
@@ -142,12 +142,12 @@ class TestMethods(TransitTestCase):
sig_qvals.sort()
self.assertEqual(
len(sig_pvals),
- 32,
- "sig_pvals expected: %d, actual: %d" % (32, len(sig_pvals)))
+ 30,
+ "sig_pvals expected: %d, actual: %d" % (30, len(sig_pvals)))
self.assertEqual(
len(sig_qvals),
- 28,
- "sig_qvals expected: %d, actual: %d" % (28, len(sig_qvals)))
+ 24,
+ "sig_qvals expected: %d, actual: %d" % (24, len(sig_qvals)))
@unittest.skipUnless(hasR, "requires R, rpy2")
def test_zinb(self):
View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/commit/e2e9cf2be5f3e0f5f7271dd80ca3ca2c95d1bc0d
--
View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/commit/e2e9cf2be5f3e0f5f7271dd80ca3ca2c95d1bc0d
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220826/4ccddc2e/attachment-0001.htm>
More information about the debian-med-commit
mailing list