lh3 · August 29, 2015 14:07
diff --git a/duprate.tex b/duprate.tex
 \documentclass[pdftex,10pt]{article}

 \usepackage{amsmath}
 \usepackage{amssymb}

 \title{PCR and mapping duplicate rate in NGS}
 \author{Heng Li}

 \begin{document}

 \maketitle

 \section{Amplicon duplicates}

 Let $N$ be the number of distinct segments (or seeds) before the
 amplification and $M$ be the total number of amplicons in the
 library. For seed $i$ ($i=1,\ldots,N$), let $k_i$ be the number of
 amplicons in the library and $k_i$ is drawn from Poinsson distribution
 ${\rm Po}(\lambda)$. When $N$ is sufficiently large, we have:
 \[
 M=\sum_{i=1}^Nk_i=N\sum_{k=0}^{\infty}kp_k=N\lambda
 \]
 where $p_k=e^{-\lambda}\lambda^k/{k!}$.

 At the sequencing step, we sample $m$ amplicons from the library. On the
 condition that:
 \begin{equation}
  m\ll M
 \end{equation}
 we can regard this procedure as sampling with replacement. For seed $i$,
 let:
 \begin{equation*}
  X_i=\left\{\begin{array}{ll}
      1 & \mbox{seed $i$ has been sampled at least once} \\
      0 & \mbox{otherwise}
    \end{array}
  \right.
 \end{equation*}
 and then:
 \begin{equation*}
  {\rm E} X_i= \Pr\{X_i=1\}=1-\Big(1-\frac{k_i}{M}\Big)^m\simeq 1-e^{-k_i m/M}
 \end{equation*}
 Let:
 \[Z=\sum_{i=1}^NX_i\]
 be the number of seeds sampled from the
 library. The fraction of duplicates $d$ is:
 \begin{eqnarray*}
  d&=&1-\frac{{\rm E}(Z)}{m}\\
  &\simeq&1-\frac{N}{m}\sum_{k=0}^{\infty}\big(1-e^{-km/M}\big)p_k\\
  &=&1-\frac{N}{m}+\frac{N e^{-\lambda}}{m}\sum_k \frac{1}{k!}\big(\lambda e^{-m/M}\big)^k\\
  &\simeq& 1-\frac{N}{m}\Big[1-e^{-\lambda}\cdot e^{\lambda(1-m/M)}\Big]
 \end{eqnarray*}
 i.e.
 \begin{equation}
  d \simeq 1 - \frac{N}{m}\Big(1-e^{-m/N}\Big)
 \end{equation}
 independent of of $\lambda$. This derivation also stands if $k_i$ is drawn from
 a uniform or a delta distribution. In addition, when $m/N$ is sufficiently
 small:
 \begin{equation}\label{equ:d2}
  d\approx \frac{m}{2N}
 \end{equation}

 This deduction assumes that i) $k_i\ll M$ which should almost always
 stand; ii) $m\ll M$ which should largely stand because otherwise the
 fraction of duplicates will far more than half given $\lambda\sim 1000$
 and iii) $k_i$ is drawn from a Poisson, a uniform or a delta distribution.

 The basic message is that to reduce PCR duplicates, we should either
 increase the original pool of distinct molecules before amplification or
 reduce the number of reads sequenced from the library. Reducing PCR
 cycles plays little role. Nonetheless, PCR frequently introduces compositional
 biases and amplification errors. Reducing PCR cycles or not applying PCR at all
 is still preferred.

 \section{Mapping duplicates}

 For simplicity, we assume a read is as short as a single base pair. For
 $m$ read pairs, define an indicator function:
 \begin{equation*}
  Y_{ij}=\left\{\begin{array}{ll}
      1 & \mbox{if at least one read pair is mapped to $(i,j)$} \\
      0 & \mbox{otherwise}
    \end{array}\right.
 \end{equation*}
 Let $\{p_k\}$ be the distribution of insert size. Then:
 \begin{equation*}
  {\rm E}Y_{ij}=\Pr\{Y_{ij}=1\}=1-\Big[1-\frac{p_{j-i}}{L-(j-i)}\Big]^m\simeq 1-e^{-p_{j-i}\cdot m/[L-(j-i)]}
 \end{equation*}
 where $L$ is the length of the reference. The fraction of random
 coincidence is:
 \begin{eqnarray*}
  d'&=&1-\frac{1}{m}\sum_{i=1}^L\sum_{j=i}^L{\rm E}Y_{ij}\\
  &\simeq&1-\frac{1}{m}\sum_{i=1}^L\sum_{j=i}^L\big(1-e^{-p_{j-i}\cdot m/(L-(j-i))}\big)\\
  &=&1-\frac{1}{m}\sum_{k=0}^{L-1}(L-k)\big[1-e^{-p_k m/(L-k)}\big]
 \end{eqnarray*}
 On the condition that $L$ is sufficient large and:
 \begin{equation}
  m\ll L
 \end{equation}
 \begin{equation}\label{equ:dd}
  d'\simeq\frac{m}{2}\sum_{k=0}^{L-1}\frac{p_k^2}{L-k}
 \end{equation}

 We can calculate/approximate Equation~\ref{equ:dd} for two types of
 distributions. Firstly, if $p_k$ is evenly distributed between
 $[k_0,k_0+k_1]$, $d'\simeq\frac{m}{2k_1L}$. Secondly, assume $k$ is
 drawn from $N(\mu,\sigma)$ with $\sigma\gg 1$:
 \begin{equation*}
  p_k=\frac{1}{\sqrt{2\pi}\sigma}\int_k^{k+1}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\,dx
  \simeq \frac{1}{\sqrt{2\pi}\sigma}e^{-\frac{(k-\mu)^2}{2\sigma^2}}
 \end{equation*}
 If $p_0\ll 1$, $\mu\ll L$ and $L\gg 1$:
 \begin{eqnarray*}
  d'&\simeq&\frac{m}{4\pi\sigma^2}\int_0^1\frac{1}{1-x}\cdot e^{-\frac{(Lx-\mu)^2}{\sigma^2}}\,dx\\
  &\simeq&\frac{m}{4\pi\sigma^2}\int_{-\infty}^{\infty}e^{-\frac{(x-\mu/L)^2}{(\sigma/L)^2}}\,dx\\
  &=&\frac{m}{4\pi\sigma^2}\cdot\frac{\sqrt{2\pi}\cdot \sqrt{2}\sigma}{L}\\
  &=&\frac{m}{2\sqrt{\pi}\sigma L}
 \end{eqnarray*}

 \end{document}
	\documentclass[pdftex,10pt]{article}

	\usepackage{amsmath}
	\usepackage{amssymb}

	\title{PCR and mapping duplicate rate in NGS}
	\author{Heng Li}

	\begin{document}

	\maketitle

	\section{Amplicon duplicates}

	Let $N$ be the number of distinct segments (or seeds) before the
	amplification and $M$ be the total number of amplicons in the
	library. For seed $i$ ($i=1,\ldots,N$), let $k_i$ be the number of
	amplicons in the library and $k_i$ is drawn from Poinsson distribution
	${\rm Po}(\lambda)$. When $N$ is sufficiently large, we have:
	\[
	M=\sum_{i=1}^Nk_i=N\sum_{k=0}^{\infty}kp_k=N\lambda
	\]
	where $p_k=e^{-\lambda}\lambda^k/{k!}$.

	At the sequencing step, we sample $m$ amplicons from the library. On the
	condition that:
	\begin{equation}
	m\ll M
	\end{equation}
	we can regard this procedure as sampling with replacement. For seed $i$,
	let:
	\begin{equation*}
	X_i=\left\{\begin{array}{ll}
	1 & \mbox{seed $i$ has been sampled at least once} \\
	0 & \mbox{otherwise}
	\end{array}
	\right.
	\end{equation*}
	and then:
	\begin{equation*}
	{\rm E} X_i= \Pr\{X_i=1\}=1-\Big(1-\frac{k_i}{M}\Big)^m\simeq 1-e^{-k_i m/M}
	\end{equation*}
	Let:
	\[Z=\sum_{i=1}^NX_i\]
	be the number of seeds sampled from the
	library. The fraction of duplicates $d$ is:
	\begin{eqnarray*}
	d&=&1-\frac{{\rm E}(Z)}{m}\\
	&\simeq&1-\frac{N}{m}\sum_{k=0}^{\infty}\big(1-e^{-km/M}\big)p_k\\
	&=&1-\frac{N}{m}+\frac{N e^{-\lambda}}{m}\sum_k \frac{1}{k!}\big(\lambda e^{-m/M}\big)^k\\
	&\simeq& 1-\frac{N}{m}\Big[1-e^{-\lambda}\cdot e^{\lambda(1-m/M)}\Big]
	\end{eqnarray*}
	i.e.
	\begin{equation}
	d \simeq 1 - \frac{N}{m}\Big(1-e^{-m/N}\Big)
	\end{equation}
	independent of of $\lambda$. This derivation also stands if $k_i$ is drawn from
	a uniform or a delta distribution. In addition, when $m/N$ is sufficiently
	small:
	\begin{equation}\label{equ:d2}
	d\approx \frac{m}{2N}
	\end{equation}

	This deduction assumes that i) $k_i\ll M$ which should almost always
	stand; ii) $m\ll M$ which should largely stand because otherwise the
	fraction of duplicates will far more than half given $\lambda\sim 1000$
	and iii) $k_i$ is drawn from a Poisson, a uniform or a delta distribution.

	The basic message is that to reduce PCR duplicates, we should either
	increase the original pool of distinct molecules before amplification or
	reduce the number of reads sequenced from the library. Reducing PCR
	cycles plays little role. Nonetheless, PCR frequently introduces compositional
	biases and amplification errors. Reducing PCR cycles or not applying PCR at all
	is still preferred.

	\section{Mapping duplicates}

	For simplicity, we assume a read is as short as a single base pair. For
	$m$ read pairs, define an indicator function:
	\begin{equation*}
	Y_{ij}=\left\{\begin{array}{ll}
	1 & \mbox{if at least one read pair is mapped to $(i,j)$} \\
	0 & \mbox{otherwise}
	\end{array}\right.
	\end{equation*}
	Let $\{p_k\}$ be the distribution of insert size. Then:
	\begin{equation*}
	{\rm E}Y_{ij}=\Pr\{Y_{ij}=1\}=1-\Big[1-\frac{p_{j-i}}{L-(j-i)}\Big]^m\simeq 1-e^{-p_{j-i}\cdot m/[L-(j-i)]}
	\end{equation*}
	where $L$ is the length of the reference. The fraction of random
	coincidence is:
	\begin{eqnarray*}
	d'&=&1-\frac{1}{m}\sum_{i=1}^L\sum_{j=i}^L{\rm E}Y_{ij}\\
	&\simeq&1-\frac{1}{m}\sum_{i=1}^L\sum_{j=i}^L\big(1-e^{-p_{j-i}\cdot m/(L-(j-i))}\big)\\
	&=&1-\frac{1}{m}\sum_{k=0}^{L-1}(L-k)\big[1-e^{-p_k m/(L-k)}\big]
	\end{eqnarray*}
	On the condition that $L$ is sufficient large and:
	\begin{equation}
	m\ll L
	\end{equation}
	\begin{equation}\label{equ:dd}
	d'\simeq\frac{m}{2}\sum_{k=0}^{L-1}\frac{p_k^2}{L-k}
	\end{equation}

	We can calculate/approximate Equation~\ref{equ:dd} for two types of
	distributions. Firstly, if $p_k$ is evenly distributed between
	$[k_0,k_0+k_1]$, $d'\simeq\frac{m}{2k_1L}$. Secondly, assume $k$ is
	drawn from $N(\mu,\sigma)$ with $\sigma\gg 1$:
	\begin{equation*}
	p_k=\frac{1}{\sqrt{2\pi}\sigma}\int_k^{k+1}e^{-\frac{(x-\mu)^2}{2\sigma^2}}\,dx
	\simeq \frac{1}{\sqrt{2\pi}\sigma}e^{-\frac{(k-\mu)^2}{2\sigma^2}}
	\end{equation*}
	If $p_0\ll 1$, $\mu\ll L$ and $L\gg 1$:
	\begin{eqnarray*}
	d'&\simeq&\frac{m}{4\pi\sigma^2}\int_0^1\frac{1}{1-x}\cdot e^{-\frac{(Lx-\mu)^2}{\sigma^2}}\,dx\\
	&\simeq&\frac{m}{4\pi\sigma^2}\int_{-\infty}^{\infty}e^{-\frac{(x-\mu/L)^2}{(\sigma/L)^2}}\,dx\\
	&=&\frac{m}{4\pi\sigma^2}\cdot\frac{\sqrt{2\pi}\cdot \sqrt{2}\sigma}{L}\\
	&=&\frac{m}{2\sqrt{\pi}\sigma L}
	\end{eqnarray*}

	\end{document}