% sample.tex

% Beamer slide deck: "Active Learning for Phenotyping Tasks".
\documentclass{beamer}
\usepackage{latexsym}
\usepackage{graphicx}
\usetheme{Warsaw}

% The optional short title carries "current/total" frame numbers after a 2em
% gap -- presumably so the theme shows slide numbers wherever it prints the
% short title (confirm against the rendered footline).
\title[Active Learning for Phenotyping Tasks\hspace{2em}\insertframenumber/\inserttotalframenumber]
{Active Learning for Phenotyping Tasks}
% The presenting author is set in bold.
\author{Dmitriy Dligach, Timothy A. Miller, and \textbf{Guergana Savova}}
\institute{Boston Children's Hospital and Harvard Medical School}
\date{\today}

\begin{document}

% Reduce the caption template to the bare caption text, which removes the
% word ``Figure'' from graphics captions.
\setbeamertemplate{caption}{\insertcaption}

% Title slide: a top-aligned frame holding \titlepage.
% The plain \maketitle call below is intentionally left disabled.
%\maketitle
\begin{frame}[t]
  \titlepage
\end{frame}

% Slide: introduces phenotyping and the passive vs. active annotation options.
\begin{frame}
  \frametitle{Introduction}
  \begin{itemize}
    % Background on the task.
    \item Phenotyping
      \begin{itemize}
        \item What's a phenotype?
        \item i2b2 and eMERGE
        \item Link EHRs to biobanks for genetic analysis
        \item Supervised learning for phenotyping
      \end{itemize}
    % The annotation bottleneck and the two ways of handling it.
    \item Manual annotation needed
      \begin{itemize}
        \item Standard approach: passive learning
        \item Alternative: active learning
      \end{itemize}
  \end{itemize}
\end{frame}

% Slide: what active learning is and the pool-based setting used here.
\begin{frame}
  \frametitle{Active Learning}
  \begin{itemize}
    \item Approach for selecting data for annotation
    \item Data selection delegated to classifier
    \item Pool-based scenario
      \begin{itemize}
        \item Lots of unlabeled data
        \item Can afford to annotate only a small amount
      \end{itemize}
    \item Little work in clinical domain
  \end{itemize}
\end{frame}

% Slide: intuition for margin sampling, using two example posterior pairs
% (a confident prediction and an uncertain one).
\begin{frame}
  \frametitle{Intuition}
  Suppose there's a little bit of labeled data
  \newline
  \begin{itemize}
    \item Classify example $\vec{x}$
      \begin{itemize}
        \item $p(c_1 | \vec{x}) = 0.95$ and $p(c_2 | \vec{x}) = 0.05$
        \item $p(c_1 | \vec{x}) = 0.55$ and $p(c_2 | \vec{x}) = 0.45$
        \newline
      \end{itemize}
    \item Margin Sampling
      \begin{itemize}
        % The quantity name goes through \text so it is set as words (with its
        % space) instead of a run of italic one-letter variables; lowercase p
        % matches the posteriors written above.
        \item $\text{Prediction Margin} = \lvert p(c_1 | \vec{x}) - p(c_2 | \vec{x}) \rvert$
        \item Annotate examples with smallest margin first
      \end{itemize}
  \end{itemize}
\end{frame}

% Slide: the active learning procedure -- seed a classifier, then iterate.
\begin{frame}
  \frametitle{How does active learning work?}
  \begin{itemize}
    % Step 1: bootstrap from a small labeled set.
    \item Seed classifier
      \begin{itemize}
        \item Annotate a small amount of data
        \item Train a classifier
      \end{itemize}
    % Step 2: the select / retrain loop.
    \item Iterative process
      \begin{itemize}
        \item Apply the classifier to the pool of unlabeled data
        \item Select an example and add it to the training set
        \item Retrain the classifier
        \item Check if we are done
      \end{itemize}
    \item The learner quickly converges on the decision boundary
  \end{itemize}
\end{frame}

% Slide: how a patient is turned into a feature vector of CUI frequencies.
\begin{frame}
  \frametitle{Data Representation}
  \begin{itemize}
    \item Unit of classification
      \begin{itemize}
        \item Single patient
      \end{itemize}
    \item Patient representation
      \begin{itemize}
        \item Set of CUIs extracted with cTAKES
        \item Abstract from lexical variability of medical terminology
        \item Filter out non-clinical vocabulary
      \end{itemize}
    \item Phenotype-specific dictionaries
    \item Patient vector $\vec{x}$
      \begin{itemize}
        % CUI is an acronym, not the product C*U*I, so it is set with \text.
        \item Element $x_n$ is frequency of $\text{CUI}_n$
      \end{itemize}
  \end{itemize}
\end{frame}

% Slide: why multinomial Naive Bayes is used and how the margin is computed.
\begin{frame}
  \frametitle{Naive Bayes}
  \begin{itemize}
    \item Need to evaluate $p(c_i|\vec{x})$
    \item Multinomial Naive Bayes
      \begin{itemize}
        \item Probabilistic classifier
        \item Supports multi-class classification
        \item Training and classification speed
      \end{itemize}
    \item Uncertainty sampling:
      % The left-hand side is a name, so it is set as text rather than as
      % individual math-italic letters glued together with \: spacing.
      \begin{equation}
        \text{prediction margin} = \lvert p(c_1|\vec{x}) - p(c_2|\vec{x}) \rvert
      \end{equation}
  \end{itemize}
\end{frame}

% Slide: the multinomial Naive Bayes posterior, followed by a legend that
% explains every symbol in the formula.
\begin{frame}
  \frametitle{Counts}
  Compute posterior probability as follows:
  \begin{equation}
    p(c_i|\vec{x}) = \frac{1}{Z}p(c_i)\prod_{n=1}^N p(\text{CUI}_n|c_i)^{x_n}
  \end{equation}
  % The legend is set in a small font (6.5pt on a 7.2pt baseline), presumably
  % so that it fits on the slide below the equation.
  \fontsize{6.5pt}{7.2pt}\selectfont
  % The leading \\ leaves one empty line between the equation and the legend.
  \\
  $p(c_i)$ is the prior probability of class $c_i$ \\
  $N$ is the number of CUIs in the phenotype-specific dictionary \\
  $\text{CUI}_n$ is the $n^{\text{th}}$ CUI in that dictionary \\
  $x_n$ is the frequency of $\text{CUI}_n$ in $\vec{x}$ \\
  $Z$ (evidence) is the scaling factor \\
  Determine $p(c_i)$ and $p(\text{CUI}_n|c_i)$ via maximum likelihood estimation
\end{frame}

% Slide: where the labeled patient data came from.
\begin{frame}
  \frametitle{Dataset creation}
  \begin{itemize}
    \item Created within the i2b2 initiative
    \item ICD-9 codes used to form initial cohort
    \item About 600 patients selected randomly
    \item Labeled by domain experts
  \end{itemize}
\end{frame}

% Slide: per-phenotype dataset size, number of classes and majority-class share.
\begin{frame}
  \frametitle{Dataset stats}
  % The table is scaled to the full line width. The % after the opening brace
  % (and after \end{tabular}) keeps the line ends from turning into spaces
  % inside the scaled box, which would otherwise be scaled along with the table.
  \resizebox{\linewidth}{!}{%
    \begin{tabular}{|l|r|r|r|}
      \hline
      Phenotype & Total Instances & Number of Classes & Proportion of Predominant Class \\
      \hline
      Ulcerative Colitis & 600 & 2 & 0.630 \\
      Crohn's Disease & 600 & 2 & 0.665 \\
      Multiple Sclerosis & 595 & 5 & 0.395 \\
      Type II Diabetes & 600 & 3 & 0.583 \\
      \hline
    \end{tabular}%
  }
\end{frame}

% Slide: how the learning curves were produced and evaluated.
\begin{frame}
  \frametitle{Evaluation}
  \begin{itemize}
    \item Learning curve generation
      \begin{itemize}
        \item Done in the style of 10-fold cross validation
      \end{itemize}
    \item Within each fold:
      \begin{itemize}
        \item Training data
        \item Pool of ``unlabeled'' examples
        \item Held-out test set
      \end{itemize}
    \item Various seed sizes
      \begin{itemize}
        % "Effect" (noun) is the intended word here, not "Affect".
        \item Effect of seed size on performance
        \item Only showing the plots for seed size = 30
        \item See the paper for other sizes
      \end{itemize}
    \item Gold labels in the pool hidden from classifier
  \end{itemize}
\end{frame}

% Slide: 2x2 grid of learning-curve plots, one per phenotype; the caption
% lists the four phenotype names.
\begin{frame}
  \frametitle{Learning Curves}
  \begin{figure}
    % \centering replaces the surrounding center environment, which only
    % added extra vertical space around the figure.
    \centering
    \includegraphics[width=0.30\textwidth]{figures/uc30.png}
    \includegraphics[width=0.30\textwidth]{figures/cd30.png} \\
    \includegraphics[width=0.30\textwidth]{figures/ms30.png}
    \includegraphics[width=0.30\textwidth]{figures/t2d30.png}
    % No \\ after the last image: a line break right before the caption
    % would leave an empty line behind.
    \caption{Ulcerative Colitis, Crohn's Disease, Multiple Sclerosis, Type II Diabetes}
  \end{figure}
\end{frame}

% Slide: enlarged view of the uc30 plot, captioned "Ulcerative Colitis".
\begin{frame}
  \frametitle{Close-up}
  \begin{figure}
    % \centering replaces the surrounding center environment, which only
    % added extra vertical space around the figure.
    \centering
    \includegraphics[width=0.7\textwidth]{figures/uc30.png}
    \caption{Ulcerative Colitis}
  \end{figure}
\end{frame}

% Slide: two columns -- takeaway bullets on the left, the uc30 plot on the right.
\begin{frame}
  \frametitle{Sample plot}
  \begin{columns}[c]
    % Left column: observations, set in \scriptsize.
    \column{.5\textwidth}
      \begin{itemize}
        \item {\scriptsize Active Learning above passive}
        \item {\scriptsize Only need $1/3$ of the data}
        \item {\scriptsize Best performance higher}
      \end{itemize}
    % Right column: the plot.
    \column{.5\textwidth}
      \begin{figure}
        % \centering replaces the surrounding center environment, which only
        % added extra vertical space around the figure.
        \centering
        \includegraphics[width=1.0\textwidth]{figures/uc30.png}
        \caption{Ulcerative Colitis}
      \end{figure}
  \end{columns}
\end{frame}

% Slide: active-minus-passive difference in area under the learning curve,
% per phenotype and seed size.
\begin{frame}
  % The subtraction is written with a math minus sign rather than a hyphen.
  \frametitle{Difference between areas under the curve (Active $-$ Passive)}
  % The table is scaled to the full line width. The % after the opening brace
  % (and after \end{tabular}) keeps the line ends from turning into spaces
  % inside the scaled box.
  \resizebox{\linewidth}{!}{%
    \begin{tabular}{|l|r|r|r|r|}
      \hline
      Seed Size & Ulcerative Colitis & Crohn's Disease & Multiple Sclerosis & Type II Diabetes \\
      \hline
      10 & 6.90 & 4.17 & 10.50 & 11.05 \\
      30 & 6.64 & 2.21 & 15.43 & 7.49 \\
      50 & 8.63 & 1.75 & 8.61 & 8.90 \\
      \hline
    \end{tabular}%
  }
\end{frame}

% Slide: closing takeaways and open questions.
\begin{frame}
  \frametitle{Conclusion}
  \begin{itemize}
    \item Annotation effort reduced by 2/3
    \item Active learning sometimes reaches better accuracy
    \item Need to know when to stop
    \item What happens if the base classifier is swapped?
  \end{itemize}
\end{frame}

% Final slide: untitled frame that only prompts for questions.
\begin{frame}
  Questions?
\end{frame}

\end{document}