[med-svn] [Git][med-team/tnseq-transit][upstream] New upstream version 3.3.4
Étienne Mollier (@emollier)
gitlab at salsa.debian.org
Sun Feb 18 14:09:29 GMT 2024
Étienne Mollier pushed to branch upstream at Debian Med / tnseq-transit
Commits:
415630a7 by Étienne Mollier at 2024-02-18T14:56:10+01:00
New upstream version 3.3.4
- - - - -
14 changed files:
- CHANGELOG.md
- src/pytransit/__init__.py
- src/pytransit/analysis/CGI.py
- src/pytransit/analysis/ttnfitness.py
- src/pytransit/analysis/zinb.py
- + src/pytransit/data/CGI/RIF_D1_combined_counts.txt
- src/pytransit/data/CGI/counts_metadata.txt → src/pytransit/data/CGI/samples_metadata.txt
- src/pytransit/data/CGI/sgRNA_metadata.txt → src/pytransit/data/CGI/sgRNA_info.txt
- + src/pytransit/data/CGI/temp_cdr.txt
- + src/pytransit/data/CGI/temp_frac_abund.txt
- src/pytransit/doc/source/CGI.rst
- src/pytransit/doc/source/_images/CGI_workflow.png
- src/pytransit/doc/source/index.rst
- src/pytransit/doc/source/method_ttnfitness.rst
Changes:
=====================================
CHANGELOG.md
=====================================
@@ -2,6 +2,13 @@
All notable changes to this project will be documented in this file.
+## Version 3.3.4 (2024-02-16)
+#### Transit:
+
+Minor changes:
+ - some improvements to ttnfitness
+
+
## Version 3.3.3 (2023-11-26)
#### Transit:
=====================================
src/pytransit/__init__.py
=====================================
@@ -2,6 +2,6 @@
__all__ = ["transit_tools", "tnseq_tools", "norm_tools", "stat_tools"]
-__version__ = "v3.3.3"
+__version__ = "v3.3.4"
prefix = "[TRANSIT]"
=====================================
src/pytransit/analysis/CGI.py
=====================================
@@ -122,6 +122,8 @@ class CGI_Method(base.SingleConditionMethod):
return self()
def Run(self):
+ print("Note: CRISPRi-DR (CGI) has been migrated to Transit2. Please use Transit2 for this.")
+ sys.exit(-1)
cmd,args,kwargs = self.cmd,self.args,self.kwargs
if cmd=="extract_counts":
@@ -361,26 +363,35 @@ class CGI_Method(base.SingleConditionMethod):
drug_output.append([orf,gene,len(gene_df)]+coeffs.values.tolist()+pvals.values.tolist())
sys.stderr.flush()
- drug_out_df = pd.DataFrame(drug_output, columns=["Orf","Gene","Nobs", "intercept","ceofficient sgRNA_strength","ceofficient concentration dependence","pval intercept","pval pred_logFC","pval concentration dependence"])
-
+ drug_out_df = pd.DataFrame(drug_output, columns=["Orf","Gene","Nobs", "intercept","coefficient sgRNA_strength","coefficient concentration dependence","pval intercept","pval sgRNA_strength","pval concentration dependence"])
+ drug_out_df["intercept"] = round(drug_out_df["intercept"],6)
+ drug_out_df["coefficient sgRNA_strength"] = round(drug_out_df["coefficient sgRNA_strength"],6)
+ drug_out_df["coefficient concentration dependence"] = round(drug_out_df["coefficient concentration dependence"],6)
+ drug_out_df["pval intercept"] = round(drug_out_df["pval intercept"],6)
+ drug_out_df["pval sgRNA_strength"] = round(drug_out_df["pval sgRNA_strength"],6)
+ drug_out_df["pval concentration dependence"] = round(drug_out_df["pval concentration dependence"],6)
+
+
mask = np.isfinite(drug_out_df["pval concentration dependence"])
pval_corrected = np.full(drug_out_df["pval concentration dependence"].shape, np.nan)
pval_corrected[mask] = fdr_correction(drug_out_df["pval concentration dependence"][mask])[1]
drug_out_df["qval concentration dependence"] = pval_corrected
+ drug_out_df["qval concentration dependence"] = round(drug_out_df["qval concentration dependence"] ,6)
drug_out_df = drug_out_df.replace(np.nan,1)
- drug_out_df["Z"] = (drug_out_df["ceofficient concentration dependence"] - drug_out_df["ceofficient concentration dependence"].mean())/drug_out_df["ceofficient concentration dependence"].std()
- drug_out_df["Siginificant Interactions"] = [0] * len(drug_out_df)
- drug_out_df.loc[(drug_out_df["qval concentration dependence"]<0.05) & (drug_out_df["Z"]<-2),"Siginificant Interactions"]=-1
- drug_out_df.loc[(drug_out_df["qval concentration dependence"]<0.05) & (drug_out_df["Z"]>2),"Siginificant Interactions"]=1
- drug_out_df.insert(0, "Siginificant Interactions", drug_out_df.pop("Siginificant Interactions"))
-
- n = len(drug_out_df[drug_out_df["Siginificant Interactions"]!=0])
- depl_n = len(drug_out_df[drug_out_df["Siginificant Interactions"]== -1])
- enrich_n = len(drug_out_df[drug_out_df["Siginificant Interactions"]==1])
- sys.stderr.write("%d Total Siginificant Gene Interactions\n"%n)
- sys.stderr.write("%d Siginificant Gene Depletions\n"%depl_n)
- sys.stderr.write("%d Siginificant Gene Enrichments\n"%enrich_n)
+ drug_out_df["Z score of concentration dependence"] = (drug_out_df["coefficient concentration dependence"] - drug_out_df["coefficient concentration dependence"].mean())/drug_out_df["coefficient concentration dependence"].std()
+ drug_out_df["Z score of concentration dependence"] = round(drug_out_df["Z score of concentration dependence"], 6)
+ drug_out_df["Significant Interactions"] = [0] * len(drug_out_df)
+ drug_out_df.loc[(drug_out_df["qval concentration dependence"]<0.05) & (drug_out_df["Z score of concentration dependence"]<-2),"Significant Interactions"]=-1
+ drug_out_df.loc[(drug_out_df["qval concentration dependence"]<0.05) & (drug_out_df["Z score of concentration dependence"]>2),"Significant Interactions"]=1
+ drug_out_df.insert(0, "Significant Interactions", drug_out_df.pop("Significant Interactions"))
+
+ n = len(drug_out_df[drug_out_df["Significant Interactions"]!=0])
+ depl_n = len(drug_out_df[drug_out_df["Significant Interactions"]== -1])
+ enrich_n = len(drug_out_df[drug_out_df["Significant Interactions"]==1])
+ sys.stderr.write("%d Total Significant Gene Interactions\n"%n)
+ sys.stderr.write("%d Significant Gene Depletions\n"%depl_n)
+ sys.stderr.write("%d Significant Gene Enrichments\n"%enrich_n)
drug_out_df = drug_out_df.replace(r'\s+',np.nan,regex=True).replace('',np.nan)
drug_out_txt = drug_out_df.to_csv(sep="\t", index=False)
=====================================
src/pytransit/analysis/ttnfitness.py
=====================================
@@ -361,20 +361,11 @@ class TTNFitnessMethod(base.SingleConditionMethod):
filtered_ttn_data = filtered_ttn_data[~filtered_ttn_data["Orf"].isin(ess_genes)] #filter out ess genes
filtered_ttn_data = filtered_ttn_data[~filtered_ttn_data["Orf"].isin(uncertain_genes)] #filter out uncertain genes
filtered_ttn_data = filtered_ttn_data.reset_index(drop=True)
- ##########################################################################################
- # STLM Predictions
- #self.transit_message("\t + Making TTN based predictions using loaded STLM")
- #X = filtered_ttn_data.drop(["Orf","Name", "Coord","State","Insertion Count","Local Average","Actual LFC","Upseq TTN","Downseq TTN"],axis=1)
- #X = sm.add_constant(X)
- #model_LFC_predictions = self.STLM_reg.predict(X)
- #filtered_ttn_data["STLM Predicted LFC"]=model_LFC_predictions
- #filtered_ttn_data["STLM Predicted Counts"] = filtered_ttn_data["Local Average"].mul(numpy.power(2,filtered_ttn_data["STLM Predicted LFC"]))
##########################################################################################
#Linear Regression
- gene_one_hot_encoded= pandas.get_dummies(filtered_ttn_data["Orf"],prefix='')
+ gene_one_hot_encoded= pandas.get_dummies(filtered_ttn_data["Orf"],prefix='',dtype=int)
ttn_vectors = filtered_ttn_data.drop(["Coord","Insertion Count","Orf","Name","Local Average","Actual LFC","State","Upstream TTN","Downstream TTN"],axis=1)
- #stlm_predicted_log_counts = numpy.log10(filtered_ttn_data["STLM Predicted Counts"]+0.5)
Y = numpy.log10(filtered_ttn_data["Insertion Count"]+0.5)
@@ -385,41 +376,30 @@ class TTNFitnessMethod(base.SingleConditionMethod):
filtered_ttn_data["M1 Pred log Count"] = results1.predict(X1)
filtered_ttn_data["M1 Predicted Count"] = numpy.power(10, (filtered_ttn_data["M1 Pred log Count"]-0.5))
- #self.transit_message("\t + Fitting new mod TTN-Fitness")
- #X2 = pandas.concat([gene_one_hot_encoded,stlm_predicted_log_counts],axis=1)
- #X2 = sm.add_constant(X2)
- #results2 = sm.OLS(Y,X2).fit()
- #filtered_ttn_data["mod ttn Pred log Count"] = results2.predict(X2)
- #filtered_ttn_data["mod ttn Predicted Count"] = numpy.power(10, (filtered_ttn_data["mod ttn Pred log Count"]-0.5))
-
self.transit_message("\t + Assessing Models")
#create Models Summary df
Models_df = pandas.DataFrame(results1.params[1:-256],columns=["M1 Coef"])
Models_df["M1 Pval"] = results1.pvalues[1:-256]
Models_df["M1 Adjusted Pval"] = statsmodels.stats.multitest.fdrcorrection(results1.pvalues[1:-256],alpha=0.05)[1]
- #Models_df["mod ttn Coef"] = results2.params[1:-1]
- #Models_df["mod ttn Pval"] = results2.pvalues[1:-1]
- #Models_df["mod ttn Adjusted Pval"] = statsmodels.stats.multitest.fdrcorrection(results2.pvalues[1:-1],alpha=0.05)[1]
-
+
#creating a mask for the adjusted pvals
Models_df.loc[(Models_df["M1 Coef"]>0) & (Models_df["M1 Adjusted Pval"]<0.05),"Gene+TTN States"]="GA"
Models_df.loc[(Models_df["M1 Coef"]<0) & (Models_df["M1 Adjusted Pval"]<0.05),"Gene+TTN States"]="GD"
Models_df.loc[(Models_df["M1 Coef"]==0) & (Models_df["M1 Adjusted Pval"]<0.05),"Gene+TTN States"]="NE"
Models_df.loc[(Models_df["M1 Adjusted Pval"]>0.05),"Gene+TTN States"]="NE"
-
- #mask using mod TTN fitness
- #Models_df.loc[(Models_df["mod ttn Coef"]>0) & (Models_df["mod ttn Adjusted Pval"]<0.05),"mod ttn States"]="GA"
- #Models_df.loc[(Models_df["mod ttn Coef"]<0) & (Models_df["mod ttn Adjusted Pval"]<0.05),"mod ttn States"]="GD"
- #Models_df.loc[(Models_df["mod ttn Coef"]==0) & (Models_df["mod ttn Adjusted Pval"]<0.05),"mod ttn States"]="NE"
- #Models_df.loc[(Models_df["mod ttn Adjusted Pval"]>0.05),"mod ttn States"]="NE"
#########################################################################################
self.transit_message("Writing To Output Files")
#Write Models Information to CSV
# Columns: ORF ID, ORF Name, ORF Description,M0 Coef, M0 Adj Pval
+ TA_sites_df["Upstream TTN coef"] = [""] * len(TA_sites_df)
+ TA_sites_df["Downstream TTN coef"] = [""] * len(TA_sites_df)
+ for ttn in ttn_vectors.columns:
+ TA_sites_df.loc[TA_sites_df["Upstream TTN"]==ttn,"Upstream TTN coef"]=results1.params[ttn]
+ TA_sites_df.loc[TA_sites_df["Downstream TTN"]==ttn,"Downstream TTN coef"]=results1.params[ttn]
+ TA_sites_df["Permissiveness (STLM prediction)"] = TA_sites_df["Upstream TTN coef"] + TA_sites_df["Downstream TTN coef"]
gene_dict={} #dictionary to map information per gene
TA_sites_df["M1 Predicted Count"] = [None]*len(TA_sites_df)
- #TA_sites_df["mod ttn Predicted Count"] = [None]*len(TA_sites_df)
for g in TA_sites_df["Orf"].unique():
#ORF Name
orfName = gene_obj_dict[g].name
@@ -439,45 +419,36 @@ class TTNFitnessMethod(base.SingleConditionMethod):
coords_orf = filtered_ttn_data[filtered_ttn_data["Orf"]==g]["Coord"].values.tolist()
for c in coords_orf:
TA_sites_df.loc[(TA_sites_df["Coord"]==c),'M1 Predicted Count'] = filtered_ttn_data[filtered_ttn_data["Coord"]==c]["M1 Predicted Count"].iloc[0]
- #TA_sites_df[TA_sites_df["Coord"].isin(coords_orf)]['mod ttn Predicted Count'] = filtered_ttn_data[filtered_ttn_data["Coord"].isin(coords_orf)]["mod ttn Predicted Count"]
#M1 info
if "_"+g in Models_df.index:
M1_coef = Models_df.loc["_"+g,"M1 Coef"]
M1_adj_pval = Models_df.loc["_"+g,"M1 Adjusted Pval"]
modified_M1 = math.exp(M1_coef - statistics.median(Models_df["M1 Coef"].values.tolist()))
- #mod_M1_coef = Models_df.loc["_"+g,"mod ttn Coef"]
- #mod_M1_adj_pval = Models_df.loc["_"+g,"mod ttn Adjusted Pval"]
- #mod_modified_M1 = math.exp(mod_M1_coef - statistics.median(Models_df["mod ttn Coef"].values.tolist()))
else:
M1_coef = None
M1_adj_pval = None
modified_M1 = None
- #mod_M1_coef = None
- #mod_M1_adj_pval = None
- #mod_modified_M1 = None
-
#States
gumbel_bernoulli_call = gumbel_bernoulli_gene_calls[g]
if gumbel_bernoulli_call=="E":
gene_ttn_call = "ES"
- #mod_gene_ttn_call = "ES"
elif gumbel_bernoulli_call=="EB":
gene_ttn_call = "ESB"
- #mod_gene_ttn_call = "ESB"
else:
if "_"+g in Models_df.index:
gene_ttn_call = Models_df.loc["_"+g,"Gene+TTN States"]
- #mod_gene_ttn_call = Models_df.loc["_"+g,"mod ttn States"]
else:
gene_ttn_call = "U" #these genes are in the uncertain genes list
- #mod_gene_ttn_call = "U"
TA_sites_df.loc[(TA_sites_df["Orf"]==g), 'TTN-Fitness Assessment'] = gene_ttn_call
- #TA_sites_df.loc[(TA_sites_df["Orf"]==g), 'Mod TTN-Fitness Assessment'] = mod_gene_ttn_call
gene_dict[g] = [g,orfName,orfDescription,numTAsites,above0TAsites,local_saturation,M1_coef,M1_adj_pval, mean_actual_counts,modified_M1, gene_ttn_call]
output_df = pandas.DataFrame.from_dict(gene_dict,orient='index')
output_df.columns=["ORF ID","Name","Description","Total # TA Sites","#Sites with insertions","Gene Saturation","Gene+TTN (M1) Coef","Gene+TTN (M1) Adj Pval","Mean Insertion Count","Fitness Ratio","TTN-Fitness Assessment"]
assesment_cnt = output_df["TTN-Fitness Assessment"].value_counts()
- #mod_assesment_cnt = output_df["Mod TTN-Fitness Assessment"].value_counts()
+ if "ES" not in assesment_cnt:assesment_cnt["ES"]=0
+ if "ESB" not in assesment_cnt:assesment_cnt["ESB"]=0
+ if "GD" not in assesment_cnt:assesment_cnt["GD"]=0
+ if "GA" not in assesment_cnt:assesment_cnt["GA"]=0
+ if "NE" not in assesment_cnt:assesment_cnt["NE"]=0
self.output.write("#TTNFitness\n")
if self.wxobj:
@@ -494,15 +465,17 @@ class TTNFitnessMethod(base.SingleConditionMethod):
self.output.write("#Time: %s\n" % (time.time() - start_time))
self.output.write("#Saturation of Dataset: %s\n" % (saturation))
self.output.write("#Assesment Counts: %s ES, %s ESB, %s GD, %s GA, %s NE, %s U \n" % (assesment_cnt["ES"],assesment_cnt["ESB"],assesment_cnt["GD"],assesment_cnt["GA"],assesment_cnt["NE"],assesment_cnt["U"]))
- #self.output.write("#Mod Assesment Counts: %s ES, %s ESB, %s GD, %s GA, %s NE, %s U \n" % (mod_assesment_cnt["ES"],mod_assesment_cnt["ESB"],mod_assesment_cnt["GD"],mod_assesment_cnt["GA"],mod_assesment_cnt["NE"],mod_assesment_cnt["U"]))
- TA_sites_df = TA_sites_df[["Coord","Orf","Name","Upstream TTN","Downstream TTN","TTN-Fitness Assessment","Insertion Count","Local Average","M1 Predicted Count"]]
+ TA_sites_df = TA_sites_df[["Coord","Orf","Name","Upstream TTN","Downstream TTN","TTN-Fitness Assessment","Insertion Count","Local Average","Permissiveness (STLM prediction)","M1 Predicted Count"]]
+ TA_sites_df[["Local Average","Permissiveness (STLM prediction)","M1 Predicted Count"]] = TA_sites_df[["Local Average","Permissiveness (STLM prediction)","M1 Predicted Count"]].astype(float).round(1)
output2_data = TA_sites_df.to_csv(header=True,sep='\t' ,index=False).split('\n')
vals = '\n'.join(output2_data)
self.output2_file.write(vals)
self.output2_file.close()
+
+ output_df = output_df.round(1)
output_data = output_df.to_csv(header=True, sep="\t", index=False).split('\n')
vals = '\n'.join(output_data)
self.output.write(vals)
=====================================
src/pytransit/analysis/zinb.py
=====================================
@@ -276,7 +276,7 @@ class ZinbMethod(base.MultiConditionMethod):
nbMod0,
DEBUG = F
) {
- print("Starting ZINB in R")
+ #print("Starting ZINB in R")
suppressMessages(require(pscl))
suppressMessages(require(MASS))
melted = df
@@ -354,7 +354,7 @@ class ZinbMethod(base.MultiConditionMethod):
# this gives same answer, but I would need to extract the Pvalue...
#require(lmtest)
#print(lrtest(mod1,mod0))
- print("Finished ZINB in R")
+ #print("Finished ZINB in R")
return (c(pval, status))
}
""")
@@ -460,11 +460,11 @@ class ZinbMethod(base.MultiConditionMethod):
# r_args = [IntVector(readCounts), StrVector(condition), melted, map(lambda x: StrVector(x), covars), FloatVector(NZmean), FloatVector(logitZPerc)] + [True]
debugFlag = True if DEBUG or GENE else False
#print(f'''melted =''', str(melted))
- print("zinbMod1", str(zinbMod1))
- print("zinbMod0", str(zinbMod0))
- print("nbMod1", str(nbMod1))
- print("nbMod0", str(nbMod0))
- print("debugFlag", str(debugFlag))
+ #print("zinbMod1", str(zinbMod1))
+ #print("zinbMod0", str(zinbMod0))
+ #print("nbMod1", str(nbMod1))
+ #print("nbMod0", str(nbMod0))
+ #print("debugFlag", str(debugFlag))
pval, msg = r_zinb_signif(melted, zinbMod1, zinbMod0, nbMod1, nbMod0, debugFlag)
status.append(msg)
pvals.append(float(pval))
=====================================
src/pytransit/data/CGI/RIF_D1_combined_counts.txt
=====================================
The diff for this file was not included because it is too large.
=====================================
src/pytransit/data/CGI/counts_metadata.txt → src/pytransit/data/CGI/samples_metadata.txt
=====================================
=====================================
src/pytransit/data/CGI/sgRNA_metadata.txt → src/pytransit/data/CGI/sgRNA_info.txt
=====================================
=====================================
src/pytransit/data/CGI/temp_cdr.txt
=====================================
The diff for this file was not included because it is too large.
=====================================
src/pytransit/data/CGI/temp_frac_abund.txt
=====================================
The diff for this file was not included because it is too large.
=====================================
src/pytransit/doc/source/CGI.rst
=====================================
@@ -11,7 +11,7 @@ CRISPRi-DR is designed to analyze CRISPRi libraries from CGI experiments and ide
Workflow
--------
-Starting with fastq files, barcode counts are extracted. The user creates their own metadata file, for the counts. Fractional abundances are and used to run the CRISPRi-DR model. The output of this model is a file that lists genes with their statistacal parameters and significance. Genes with significant interactions are those with qval of condetration dependence < 0.05 and \|Z score of concentration dpendence|>2 on the slope coefficient. However, genes can be ranked by depletion by sorting the coefficient of concentration dependence in ascending order
+Starting with fastq files, barcode counts are extracted. The user creates their own metadata file for the counts. Fractional abundances are created using the counts files, the metadata file and the uninduced ATC counts file. The fractional abundances are then used to run the CRISPRi-DR model. The output of this model is a file that lists genes with their statistical parameters and significance. Genes with significant interactions are those with *qval of concentration dependence < 0.05* and *\|Z score of concentration dependence|>2*. However, genes can be ranked by depletion by sorting the coefficient of concentration dependence in ascending order.
.. image:: _images/CGI_workflow.png
@@ -45,7 +45,12 @@ This is a fairly fast process. It takes at most a minute for the combination of
**Step 2: Extract Fractional Abundances**
- This is a relatively quick process, taking less than a minute. This step is to turn the barcodes counts into relative normalized abundances. Counts are normalized within samples and calculated relative to the abundances in the uninduced ATC file, essentially fractions. The first few lines of the output file contains information about the counts files processed.
+ This is a relatively quick process, taking less than a minute. This step is to turn the barcodes counts into relative normalized abundances. Counts are normalized within samples and calculated relative to the abundances in the uninduced ATC file, essentially fractions.
+
+ Each mutant has an sgRNA mapping to a target gene that can reduce its expression (when induced with ATC, anhydrotetracycline). The uninduced ATC counts files are the read counts when the sgRNAs are mapped to a target gene but have not been induced. The uninduced counts provide a starting point to compare with the ATC-induced zero-concentration counts as well as the ATC-induced non-zero-concentration counts.
+
+
+ The first few lines of the output file contain information about the counts files processed.
::
@@ -62,7 +67,7 @@ This is a fairly fast process. It takes at most a minute for the combination of
* An equal number of replicates for all concentrations is not necessary
* see [Li, S et al. 2022, PMID: 35637331] for explanation of days_predepletion
- * Example metadata: ``transit/src/pytransit/data/CGI/counts_metadata.txt``
+ * Example metadata: ``transit/src/pytransit/data/CGI/samples_metadata.txt``
* control condition: The condition to be considered the control for this set of experiments, as specified in the "drug" column of the metadata file; typically an ATC-induced (+ ATC) condition with 0 drug concentration.
@@ -78,11 +83,36 @@ This is a fairly fast process. It takes at most a minute for the combination of
**Step 3: Run the CRISPRi-DR model**
This is a relatively quick process, taking at most 3 minutes for a dataset of ~90,000 sgRNAs. This step fits the CRISPRi-DR model (statistical analysis of concentration dependence for each gene) to each gene in the file and prints each output to the <CRISPRi-DR results file> as a tab-separated file.
+
::
> python3 ../src/transit.py CGI run_model <fractional abundance file> > <CRISPRi-DR results file>
-* Siginificant interacting genes are those with adjusted P-val (Q-val) < 0.05 and \|Z slope\| > 2, these are indicated by a "-1" for depleted and "1" for enriched in in the "Significant Interactions" column
+The output columns in this file are:
+
+* Significant Interactions - Significantly interacting genes are those with *qval of concentration dependence < 0.05* and *\|Z score of concentration dependence|>2*; these are indicated by a "-1" for depleted and "1" for enriched in the "Significant Interactions" column
+
+* Orf - the orf id of the gene
+
+* Gene - the name of the gene
+
+* Nobs - the number of sgRNAs seen in an orf
+
+* intercept - the resulting intercept of the overall fitted regression
+
+* coefficient sgRNA_strength - coefficient of the amount sgRNA strength contributes to the decrease in abundance
+
+* coefficient concentration dependence - coefficient of the amount concentration contributes to the decrease in abundance
+
+* pval intercept - the wald test based p-value of the intercept
+
+* pval sgRNA_strength - the wald test based p-value of the coefficient sgRNA_strength
+
+* pval concentration dependence - the wald test based p-value of the coefficient of concentration dependence
+
+* qval concentration dependence - FDR corrected adjusted pvalues of the coefficient of concentration dependence
+
+* Z score of concentration dependence - zscore of the coefficient of concentration dependence
.. note::
When the file is sorted on the slope of concentration dependence, the user can rank the genes based on amount of depletion.
@@ -114,42 +144,31 @@ This process is fairly quick, taking less than a minute to run. This figure visu
Tutorial
--------
-**Data : Obtain FastQ files from NCBI using the following run numbers**
-Fetch and process the following fastq files from NCBI using the SRA toolkit and place them in the ``transit/src/pytransit/data/CGI`` directory :
+This tutorial shows commands relative to this directory. Files in the ``transit/src/pytransit/data/CGI`` directory are:
-* FastQ files for the 3 replicates of control samples in this experiment. They are in a ATC-induced 0 drug concentration DMSO library with 1 day predepletion
+* samples_metadata.txt - describes the samples
+* sgRNA_info.txt - contains extrapolated LFCs for each sgRNA
+* uninduced_ATC_counts.txt - counts for uninduced ATC (no induction of target depletion) library
+* IDs.H37Rv.CRISPRi.lib.txt - ids of the sgRNAs that target the genes in H37Rv used in these experiments
+* RIF_D1_combined_counts.txt - combined counts of the RIF 1 day predepletion data for uninduced ATC, zero, low, medium and high concentrations (output of data preprocessed and Step 1 completed)
- * SRR14827863 -> extracts SRR14827863_1.fastq
- * SRR14827862 -> extracts SRR14827862_1.fastq
- * SRR14827799 -> extracts SRR14827799_1.fastq
+.. note::
-* FastQ files for 3 replicates of high concentration RIF in a 1 day pre-depletion library
+ If the user would like to evaluate the software, they can start with Step 2, using the *RIF_D1_combined_counts.txt* file in the ``transit/src/pytransit/data/CGI`` directory.
- * SRR14827727 -> extracts SRR14827727_1.fastq
- * SRR14827861 -> extracts SRR14827861_1.fastq
- * SRR14827850 -> extracts SRR14827850_1.fastq
-* FastQ files for 3 replicates of medium concentration RIF in a 1 day pre-depletion library
+**Raw Data : Obtain FastQ files from NCBI using the following run numbers**
- * SRR14827760 -> extracts SRR14827760_1.fastq
- * SRR14827749 -> extracts SRR14827749_1.fastq
- * SRR14827738 -> extracts SRR14827738_1.fastq
+Fetch and process the following into fastQ files from `NCBI <https://www.ncbi.nlm.nih.gov/bioproject/PRJNA738381/>`_ using the SRA toolkit and place them in the ``transit/src/pytransit/data/CGI`` directory :
-* FastQ files for 3 replicates of low concentration RIF in a 1 day pre-depletion library
+* Control samples (ATC-induced 0 drug concentration DMSO library with 1 day predepletion) : SRR14827863, SRR14827862, SRR14827799
- * SRR14827769 -> extracts SRR14827769_1.fastq
- * SRR14827614 -> extracts SRR14827614_1.fastq
- * SRR14827870 -> extracts SRR14827870_1.fastq
+* High concentration RIF in a 1 day pre-depletion library : SRR14827727, SRR14827861, SRR14827850
-This tutorial shows commands relative to this directory. Other files in the ``transit/src/pytransit/data/CGI`` directory are:
+* Medium concentration RIF in a 1 day pre-depletion library: SRR14827760, SRR14827749, SRR14827738
-* counts_metadata.txt - describes the samples
-* sgRNA_metadata.txt - contains extrapolated LFCs for each sgRNA
-* uninduced_ATC_counts.txt - counts for uninduced ATC (no induction of target depletion) library
-* IDs.H37Rv.CRISPRi.lib.txt - ids of the sgRNAs that target the genes in H37Rv used in these experiments
+* Low concentration RIF in a 1 day pre-depletion library: SRR14827769, SRR14827614, SRR14827870
-
-
**Preprocessing: Fastq to Count Files**
Create files of barcode counts from fastq files. Each fastq file reflects one replicate of a drug concentration; thus each will be converted into a file with two columns, sgRNA id and barcode counts.
@@ -187,11 +206,11 @@ The resulting file will have 13 columns, where the first column is sgRNA ids and
**Step 2: Extract Fractional Abundances**
.. note::
- As a part of this step, the *user must also generate a metadata file.* , ie. ``counts_metadata.txt``. Note the values in the conc_xMIC column is actual values (0.0625, 0.125, 0.25) and not categorical values ("low", "medium", "high") as seen in the counts file names.
+ As a part of this step, the *user must also generate a metadata file*, i.e. ``samples_metadata.txt``. Note that the values in the conc_xMIC column are actual values (0.0625, 0.125, 0.25) and not categorical values ("low", "medium", "high") as seen in the counts file names.
::
- > python3 ../../../transit.py CGI extract_abund RIF_D1_combined_counts.txt counts_metadata.txt DMSO sgRNA_metadata.txt uninduced_ATC_counts.txt RIF 1 > RIF_D1_frac_abund.txt
+ > python3 ../../../transit.py CGI extract_abund RIF_D1_combined_counts.txt samples_metadata.txt DMSO sgRNA_info.txt uninduced_ATC_counts.txt RIF 1 > RIF_D1_frac_abund.txt
The result of this command should be a file with a set of comments at the top, detailing the libraries used (DMSO and RIF). There should be a total of 17 columns, the last 12 of which are the calculated abundances, the first is the sgRNA ids followed by the orf/gene the sgRNA is targeting, uninduced ATC values, and sgRNA strength.
=====================================
src/pytransit/doc/source/_images/CGI_workflow.png
=====================================
Binary files a/src/pytransit/doc/source/_images/CGI_workflow.png and b/src/pytransit/doc/source/_images/CGI_workflow.png differ
=====================================
src/pytransit/doc/source/index.rst
=====================================
@@ -81,12 +81,6 @@ TRANSIT offers a variety of features including:
transit_quality_control
file_formats
-.. toctree::
- :maxdepth: 3
- :caption: [** NEW **] CRISRPi ANALYSIS METHODS
-
- CGI
-
.. toctree::
:maxdepth: 3
:caption: TnSeq ANALYSIS METHODS
=====================================
src/pytransit/doc/source/method_ttnfitness.rst
=====================================
@@ -133,28 +133,31 @@ columns in the output file and are the primary columns per gene reflecting the a
The second output file is a tab-separated file of details of the TTN Fitness method per TA Site.
-+---------------------------+-------------------------------------------------------------------------------+
-| Column Header | Column Definition |
-+===========================+===============================================================================+
-| Coord | Coordinate of TA Site |
-+---------------------------+-------------------------------------------------------------------------------+
-| Orf | Gene ID |
-+---------------------------+-------------------------------------------------------------------------------+
-| Name | Name of the Gene |
-+---------------------------+-------------------------------------------------------------------------------+
-| Uqstream TTN | Nucleotides in position 1,2,3 and 4 from the TA site |
-+---------------------------+-------------------------------------------------------------------------------+
-| Downstream TTN | Reverse Complement of Nucleotides in position -1,-2,-3 and -4 from the TA site|
-+---------------------------+-------------------------------------------------------------------------------+
-| TTN Fitness Assessment | Fitness Call for the Gene |
-+---------------------------+-------------------------------------------------------------------------------+
-| Insertion Counts | Number of Insertions at TA site |
-+---------------------------+-------------------------------------------------------------------------------+
-| Local Average | The average number of insertions +5 to -5 from the TA site |
-+---------------------------+-------------------------------------------------------------------------------+
-| M1 Predicted Counts | TTN+gene based predictions at the TA site using TTN Fitness model |
-+---------------------------+-------------------------------------------------------------------------------+
-
++----------------------------------+---------------------------------------------------------------------------------------+
+| Column Header | Column Definition |
++==================================+=======================================================================================+
+| Coord | Coordinate of TA Site |
++----------------------------------+---------------------------------------------------------------------------------------+
+| Orf | Gene ID |
++----------------------------------+---------------------------------------------------------------------------------------+
+| Name | Name of the Gene |
++----------------------------------+---------------------------------------------------------------------------------------+
+| Upstream TTN | Nucleotides in position 1,2,3 and 4 from the TA site |
++----------------------------------+---------------------------------------------------------------------------------------+
+| Downstream TTN | Reverse Complement of Nucleotides in position -1,-2,-3 and -4 from the TA site |
++----------------------------------+---------------------------------------------------------------------------------------+
+| TTN Fitness Assessment | Fitness Call for the Gene |
++----------------------------------+---------------------------------------------------------------------------------------+
+| Insertion Counts | Number of Insertions at TA site |
++----------------------------------+---------------------------------------------------------------------------------------+
+| Local Average | The average number of insertions +5 to -5 from the TA site |
++----------------------------------+---------------------------------------------------------------------------------------+
+| Permissiveness (STLM prediction) | The Permissiveness of a non-ES/ESB site based on the nucleotides surrounding the site |
++----------------------------------+---------------------------------------------------------------------------------------+
+| M1 Predicted Counts | TTN+gene based predictions at the TA site using TTN Fitness model |
++----------------------------------+---------------------------------------------------------------------------------------+
+
+The permissiveness value of each TA site is the sum of the coefficients for the upstream TTN and the downstream TTN from the fitted model. It nicely correlates with observed log counts in non-ES and non-ESB genes.
Example of running the TTN-Fitness methodology on the sample glycerol data
--------------------------------------------------------------------------
View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/commit/415630a75a72dc9fce4971602775145e7917853f
--
View it on GitLab: https://salsa.debian.org/med-team/tnseq-transit/-/commit/415630a75a72dc9fce4971602775145e7917853f
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240218/d1041e5c/attachment-0001.htm>
More information about the debian-med-commit
mailing list