[med-svn] [kineticstools] 01/04: Imported Upstream version 0.6.1+20161222

Sun Jan 15 22:28:15 UTC 2017

This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository kineticstools.

commit 75c2f1c2be66860d6c3c8b14d4a1ebd99a606ad9
Author: Afif Elghraoui <afif at debian.org>
Date:   Sun Jan 15 14:20:29 2017 -0800

    Imported Upstream version 0.6.1+20161222
---
 doc/whitepaper/ecoli-roc.png          | Bin 0 -> 26253 bytes
 doc/whitepaper/kinetics.bib           |  92 +++++++++++++++++
 doc/whitepaper/kinetics.pdf           | Bin 0 -> 399628 bytes
 doc/whitepaper/kinetics.tex           | 183 ++++++++++++++++++++++++++++++++++
 doc/whitepaper/logo.jpg               | Bin 0 -> 22613 bytes
 doc/whitepaper/position-influence.png | Bin 0 -> 88200 bytes
 6 files changed, 275 insertions(+)

diff --git a/doc/whitepaper/ecoli-roc.png b/doc/whitepaper/ecoli-roc.png
new file mode 100644
index 0000000..f291508
Binary files /dev/null and b/doc/whitepaper/ecoli-roc.png differ
diff --git a/doc/whitepaper/kinetics.bib b/doc/whitepaper/kinetics.bib
new file mode 100644
index 0000000..b8d2744
--- /dev/null
+++ b/doc/whitepaper/kinetics.bib
@@ -0,0 +1,92 @@
+ at article{ forney1973viterbi,
+  title={The viterbi algorithm},
+  author={Forney Jr, G.D.},
+  journal={Proceedings of the IEEE},
+  volume={61},
+  number={3},
+  pages={268--278},
+  year={1973},
+  publisher={IEEE}
+}
+
+ at article{ flusberg2010direct,
+  title={Direct detection of DNA methylation during single-molecule, real-time sequencing},
+  author={Flusberg, B.A. and Webster, D.R. and Lee, J.H. and Travers, K.J. and Olivares, E.C. and Clark, T.A. and Korlach, J. and Turner, S.W.},
+  journal={Nature methods},
+  volume={7},
+  number={6},
+  pages={461--465},
+  year={2010},
+  publisher={Nature Publishing Group}
+}
+
+ at article{ eid2009real,
+  title={Real-time DNA sequencing from single polymerase molecules},
+  author={Eid, J. and Fehr, A. and Gray, J. and Luong, K. and Lyle, J. and Otto, G. and Peluso, P. and Rank, D. and Baybayan, P. and Bettman, B. and others},
+  journal={Science},
+  volume={323},
+  number={5910},
+  pages={133--138},
+  year={2009},
+  publisher={American Association for the Advancement of Science}
+}
+
+ at article{ clark2012characterization,
+  title={Characterization of DNA methyltransferase specificities using single-molecule, real-time DNA sequencing},
+  author={Clark, T.A. and Murray, I.A. and Morgan, R.D. and Kislyuk, A.O. and Spittle, K.E. and Boitano, M. and Fomenkov, A. and Roberts, R.J. and Korlach, J.},
+  journal={Nucleic Acids Research},
+  volume={40},
+  number={4},
+  pages={e29--e29},
+  year={2012},
+  publisher={Oxford Univ Press}
+}
+
+ at Article{ blasr, 
+AUTHOR = {Chaisson, Mark and Tesler, Glenn},
+TITLE = {Mapping single molecule sequencing reads using Basic Local Alignment with Successive Refinement (BLASR): Theory and Application},
+JOURNAL = {BMC Bioinformatics},
+VOLUME = {13},
+YEAR = {2012},
+NUMBER = {1},
+PAGES = {238},
+URL = {http://www.biomedcentral.com/1471-2105/13/238},
+DOI = {10.1186/1471-2105-13-238},
+ISSN = {1471-2105},
+ABSTRACT = {We describe the method BLASR (Basic Local Alignment with Successive Refinement) for mapping Single Molecule Sequencing (SMS) reads that are thousands to tens of thousands of bases long with divergence between the read and genome dominated by insertion and deletion error. We also present a combinatorial model of sequencing error that motivates why our approach is effective. The results indicate that mapping SMS reads is both highly specific and rapid.},
+}
+
+
+ at book{ cart,
+author = "L. Breiman and J. H. Friedman and R. Olshen and C. J. Stone",
+title = "Classification and Regression Trees",
+publisher = "Wadsworth",
+address = "Belmont, California",
+year = "1984"
+}
+
+ at article{ gradientboost,
+   AUTHOR="Jerome H. Friedman",
+   TITLE="Greedy Function Approximation: A Gradient Boosting Machine",
+   JOURNAL="The Annals of Statistics",
+   YEAR="2001",
+  VOLUME="29",
+  PAGES="1189-1232"
+}
+
+ at article{ crf-boost,
+  AUTHOR= "Thomas G. Dietterich, Adam Ashenfelter, and Yaroslav Bulatov",
+  TITLE="Training Conditional Random Fields via Gradient Tree Boosting",
+  JOURNAL="International Conference on Machine Learning",
+  YEAR="2004",
+  VOLUME="1",
+  PAGES="1-2"
+}
+
+ at Manual{ gbm,
+  title = {gbm: Generalized Boosted Regression Models},
+  author = {Greg Ridgeway},
+  year = {2010},
+  note = {R package version 1.6-3.1},
+  url = {http://CRAN.R-project.org/package=gbm},
+}
diff --git a/doc/whitepaper/kinetics.pdf b/doc/whitepaper/kinetics.pdf
new file mode 100644
index 0000000..da899c6
Binary files /dev/null and b/doc/whitepaper/kinetics.pdf differ
diff --git a/doc/whitepaper/kinetics.tex b/doc/whitepaper/kinetics.tex
new file mode 100644
index 0000000..76943c0
--- /dev/null
+++ b/doc/whitepaper/kinetics.tex
@@ -0,0 +1,183 @@
+\documentclass[pdftex]{article}
+\usepackage{float}
+\usepackage{graphicx}
+\usepackage{hyperref}
+\usepackage{amsmath}
+
+\DeclareMathOperator*{\argmax}{argmax}
+
+\newcommand{\mC}[1]{$^\textrm{m#1}$C}
+\newcommand{\mA}{$^\textrm{m6}$A}
+
+\oddsidemargin  0.0in \evensidemargin 0.0in
+\textwidth      6in
+
+
+\begin{document}
+
+\title{\begin{center}\includegraphics[width=2in]{logo.jpg}\end{center}\vspace{0.5cm}
+Detection and Identification of Base Modifications with Single Molecule Real-Time Sequencing Data}
+\author{Patrick Marks, Onureena Banerjee, David Alexander}
+\maketitle
+
+
+% Will say the section an page
+\pagestyle{headings}
+
+\section{Introduction}
+Single Molecule Real-Time (SMRT\textsuperscript{\textregistered}) DNA sequencing\cite{eid2009real} data contains information about DNA base modifications imprinted in the kinetics of the polymerization reaction\cite{flusberg2010direct}.  This data enables single-base resolution detection of \mA, \mC{4}, and \mC{5}\cite{clark2012characterization}. We discuss the statistical approaches to the detection and identification of DNA modifications from SMRT sequencing data.  Our implementation i [...]
+
+
+\section{Preparation of IPD data}
+\label{sec:prep}
+
+This analysis assumes that we are interested in locus-specific DNA modifications - that is, samples where the same modification is present at a given locus, in some fraction of the sample molecules.  
+
+We compute a vector of IPDs that map confidently to a given genomic location. We use the alignment tool BLASR \cite{blasr} to generate alignments between reads and a reference template.   BLASR maps each read to the reference sequence. BLASR generates a Mapping QV representing the confidence that the read maps uniquely to the selected genomic interval.  Reads falling within repeats will have a low Mapping QV and are removed from the analysis to prevent incorrect modification calls inside [...]
+
+Sequencing errors in SMRT sequence data are predominately indels, which causes some ambiguity in the local placement of IPDs to genomic positions.  To avoid most of these errors we only use IPDs if the SMRT sequence matches exactly for $k$ bases around observed base. Currently we use $k=1$.
+
+These filtering steps yield a vector of IPDs confidently assigned to each genomic position.
+
+The distribution of IPDs observed at a genomic location is roughly exponential, with a long tail caused by polymerase pausing inherent to SMRT sequencing. Information about base modifications is contained in the main distribution, while the contaminating pauses just contribute noise.  We employ a simple capping procedure to reduce the influence of these outliers: 
+$I_i^{(l)} = \min(R_i^{(l)}, Q_{99})$ where $I_i^{(l)}$ is the $i$th capped IPD at genomic location $l$, $R_i^{(l)}$ is the $i$th raw IPD and $Q_{99}$ is the $99$th percentile IPD over all IPD observations.
+
+We summarize the capped IPDs at each positions with a sample mean and standard deviation of the mean:
+\begin{eqnarray} 
+\mu_l & = & \frac{1}{n_l} \sum_i R_i^{(l)} \\
+\sigma_l^2 & = & \frac{1}{n_l} \sqrt{\sum_i (R_i^{(l)} - m_l)^2 }
+\end{eqnarray}
+
+
+\section{Control IPDs}
+
+Modification detection proceeds by comparing the observed mean IPD $m_l$ with the expected mean IPD for unmodified DNA at that location. In case-control mode we sequence a sample of DNA from the same organism that has been \emph{whole genome amplified} (WGA) to preserve the DNA sequence while removing all base modifications. In this mode of operation we summarize the IPDs observed in the control sample in the same way as the case sample, then proceed to the modification detection step. I [...]
+
+\subsection{\emph{in-silico} Control}
+An alternative to the \emph{case-control} paradigm is to construct an \emph{in-silico} model to predict the mean IPD  unmodified DNA given the local sequence context. We use the Gradient Boosting Machines \cite{gradientboost} to construct a function $Q(\mathbf{c})$ that returns an estimate of the mean IPD given a DNA context $\mathbf{c} = \{c_{-8}, c_{-7}, ... ,c_0, c_1, .., c_3\}$ where $c_i \in \{A, C, G, T\}$ encodes the DNA sequence surrounding $c_0$. The IPD prediction $Q(\mathbf{c} [...]
+
+\begin{eqnarray}
+\mu_l^{(c)} & = & Q(\mathbf{Ctx}(l)) \\
+\sigma_l^{(c)} & = & f(\mu_l^{(c)})
+\end{eqnarray}
+
+\begin{figure}
+\begin{center}\includegraphics[width=5in]{position-influence.png}\end{center}
+\caption{ Influence of Base in Context Window }
+\label{fig:position-influence}
+\end{figure}
+
+
+\section{Modification Detection}
+We pose the modification detection problem as the detection of genome locations whose mean IPD differs significantly from the control mean. We use a Welch's $t$-test to test for differences in the means between the case sample and the control, derived from either a control sample or the \emph{in-silico} model above.
+
+The $t$-statistic is defined as:
+\begin{eqnarray}
+s_l & = & = \sqrt{ \sigma_l^2 + \sigma_l^{(c)2}} \\
+T_l & = &\frac{\mu_l - \mu_l^{(c)}}{s_l} 
+\end{eqnarray}
+
+and we compute the p-value of $T_l$ under the $t$-distribution, and report a Phred-transformed QV as well:
+\begin{eqnarray}
+p & = & \Pr(t > T_l) \\
+\mathrm{QV} & = & -10 \log_{10} p
+\end{eqnarray}
+
+
+\section{ Modification Identification } 
+
+\subsection { Positive Control Model }
+We extend the \emph{in-silico} model from an alphabet over DNA bases $c_i \in \{A,C,G,T\}$ to an alphabet that includes the \emph{modified} bases we aim to identify: $c_i \in \{A,C,G,T, ^\textrm{m6}A, ^\textrm{m5}C, ^\textrm{m4}C \}$. To train this model we require labeled examples of the modified bases in a reasonable diversity of background contexts.  Base modifications appearing in bacterial restriction-modification systems provide an excellent source of training data that is fairly s [...]
+
+Our positive control training set incorporates data from 11 bacteria, with the following number of unique methylated motifs for each modification type -- \mA:36, \mC{4}:7, \mC{5}:7.
+
+
+\subsection{Viterbi Decoding}
+
+Let $\mathbf{S} = s_1, s_2, s_3, ... s_n$ be the unmodified DNA sequence ($s_i \in \{A,C,G,T\}$). Let $\mathbf{M} = m_1, m_2, m_3, ..., m_n$ be a DNA sequence carrying modifications ($m_i \in \{A,C,G,T, ^\textrm{m6}A, ^\textrm{m5}C, ^\textrm{m4}C \}$). A modification removal $R(m)$ operation maps a modification to it's unmodified base - so $R(^\textrm{m6}A) = A, R(^\textrm{m5}C) = C, R(^\textrm{m4}C) = C$.  The modified sequence $\mathbf{M}$ is constrained to maintain the base identity o [...]
+
+$$
+\log \Pr(\mathbf{O} \mid \mathbf{M}) = \sum_i \log \Pr(O_i \mid \mathcal{C}(\mathbf{M}, i))
+$$
+
+where $\mathcal{C}(\mathbf{M}, i)$ is the \emph{context} function that snips out the -8 to +3 bp context from sequence $\mathbf{M}$ around position $i$.
+IPD observations $O_i$ are assumed independent given the context $\mathcal{C}(\mathbf{M}, i)$.  We seek to find the modification sequence $\hat{\mathbf{M}}$ that maximizes the likelihood of IPD observations: 
+
+$$
+\hat{\mathbf{M}} = \argmax_{\mathbf{M}} \sum_i \log \Pr(O_i \mid \mathcal{C}(\mathbf{M}, i)) 
+$$
+
+Again we use the t-distribution to model the likelihood of the observed IPDs, given a mean prediction generated by the \emph{in-silico} IPD model:
+
+\begin{eqnarray}
+T_i & = & \frac{\mu_i - Q(\mathcal{C}(\mathbf{M}, i)}{s_i} \\
+Pr(O_i \mid \mathcal{C}(\mathbf{M}, i)) & = & f(T_i)
+\end{eqnarray}
+
+where $f(T)$ is the $t$-distribution PDF.
+
+We find the maximum-likelihood modification sequence by applying the Viterbi algorithm\cite{forney1973viterbi}. At each position $i$ we define $\mathbf{H}^{(i)}$, the set of all possible modification contexts centered at $i$ with a unmodified sequence that matches the reference sequence. In order to reduce the algorithm run time, we reduce the size of $\mathbf{H}^{(i)}$ by only considering alternatives that have some supporting evidence in the nearby single-site $p$-values. Here we show  [...]
+
+$$
+\mathbf{H}^{(i)} = \{ \mathcal{C}(\mathbf{M}, i) \mid \mathbf{M}, R(m_i) = s_i \} 
+$$
+
+The Viterbi forward matrix $\alpha(H^{(i)}_j, i)$ is defined recursively: The first argument is the current state $H^{(i)}_j$ drawn from the possible modification configurations $\mathbf{H}^{(i)}$; the second argument is the position $i$.
+
+$$
+\alpha(H^{(i)}_j, i) = \max_{K \in \mathbf{H}^{(i-1)}} \alpha(K, i-1) \Pr(O_i \mid H^{(i)}_j) \; \mathbf{SM}(K, H^{(i)}_j)
+$$
+
+$\mathbf{SM}(K, L) = \mathbf{1}\{K_1=L_2, ..., K_{11}=L_{12}\}$ is the \emph{context matching function}, where $K$ and $L$ are 12 base context strings. It returns $1$ if the last 11 bases of $K$ match the first 11 bases of $L$, and $0$ otherwise. This enforces the constraint that modification sequence contexts are self-consistent for all paths through the Viterbi matrix.
+
+The standard Viterbi traceback procedure yields the maximum-likelihood modification state at each genomic position. We compute a Modification QV for each modification in the ML configuration by comparing the likelihood of the best modification sequence to the likelihood with a given modification set back to the canonical base:
+
+\begin{eqnarray}
+p_i & = & \frac{\Pr(\mathbf{O} \mid R(\hat{\mathbf{M}}, i))} {\log \Pr(\mathbf{O} \mid R(\hat{\mathbf{M}}, i)) + \log \Pr(\mathbf{O} \mid \mathbf{M}') } \\
+\mathrm{QV}_i & = & -10 \log_{10} p_e
+\end{eqnarray}
+
+where $R(\hat{\mathbf{M}}, i))$ denotes the $\hat{\mathbf{M}}$ with base $m_i$ converted to the unmodified base $R(m_i)$.
+
+\section{Results}
+\subsection{Detection Performance}
+Figure \ref{fig:ecoli-roc} shows the ROC curve for single-site detection of $^\textrm{m6}$A in \emph{E.coli}, for data from varying numbers of SMRT Cells. In these data we get the following coverage levels: 2 cells: 32x per strand, 4 cells: 65x per strand, 6 cells: 95x per strand. The \emph{E.coli} strain tested here has three methylated motifs: G\textbf{A}TC, GC\textbf{A}CNNNNNNGTT, A\textbf{A}CNNNNNNGTGC, with a total of 39430 sites matching one of those motifs.   Commonly, we observe  [...]
+
+\begin{figure}
+\begin{center}\includegraphics[width=5in]{ecoli-roc.png}\end{center}
+\caption{ ROC for $^\textrm{m6}$A detection in \emph{E.coli} genome }
+\label{fig:ecoli-roc}
+\end{figure}
+
+\subsection{Identification Performance}
+
+We tested the modification identification capability on the bacteria \emph{Desulfurobacterium thermolithotrophum}, which carries RM systems using \mA, \mC{5}, and \mC{4} modifications. The motif CA\textbf{C}C is modified with \mC{4}, GG\textbf{C}C with \mC{5}, and G\textbf{A}TC with \mA. Table \ref{table-confusion} shows that we can accurately separate \mC{4} and \mC{5} modifications, while accurately calling \mA. The TET treatment protocol that amplifies the \mC{5} signal appears to neg [...]
+
+\begin{table}
+\centering
+\begin{tabular}{c c c c c c}
+Motif & Not Detected & \mC{4} & \mC{5} & \mA & modified\_base \\
+\hline 
+CACC &   3830 &  \textbf{1222} &    10 &    0  &      114 \\
+GGCC &    235 &    0 & \textbf{1065} &    0  &       4  \\
+GATC &    66 &    0 &    0 & \textbf{4836}  &       0 \\
+None &     &   85 &   98 &   93  &       21799  \\
+\end{tabular}
+\caption{Modification Identification Confusion Matrix}
+\label{table-confusion}
+\end{table}
+
+
+\bibdata{kinetics}
+\bibliography{kinetics}
+\bibliographystyle{unsrt}
+
+\vspace{1in}
+
+\small{
+For Research Use Only. Not for use in diagnostic procedures. © Copyright 2012, Pacific Biosciences of California, Inc. All rights reserved. Information in this document is subject to change without notice. Pacific Biosciences assumes no responsibility for any errors or omissions in this document. Certain notices, terms, conditions and/or use restrictions may pertain to your use of Pacific Biosciences products and/or third party products. Please refer to the applicable Pacific Biosciences [...]
+Pacific Biosciences, the Pacific Biosciences logo, PacBio, SMRT and SMRTbell are trademarks of Pacific Biosciences in the United States and/or certain other countries. All other trademarks are the sole property of their respective owners.  }
+
+
+
+\end{document} 
\ No newline at end of file
diff --git a/doc/whitepaper/logo.jpg b/doc/whitepaper/logo.jpg
new file mode 100644
index 0000000..ddea572
Binary files /dev/null and b/doc/whitepaper/logo.jpg differ
diff --git a/doc/whitepaper/position-influence.png b/doc/whitepaper/position-influence.png
new file mode 100644
index 0000000..0e96887
Binary files /dev/null and b/doc/whitepaper/position-influence.png differ

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/kineticstools.git