forked from rasbt/PyMLSlides
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsample.tex
263 lines (242 loc) · 9.2 KB
/
sample.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
% Beamer slide deck: "Active Learning for Phenotyping Tasks".
\documentclass{beamer}
\usepackage{latexsym}
\usepackage{graphicx}
\usetheme{Warsaw}
% The optional short title carries the frame counter, so wherever the theme
% prints the short title it also shows "current/total".
\title[Active Learning for Phenotyping Tasks\hspace{2em}\insertframenumber/\inserttotalframenumber]
{Active Learning for Phenotyping Tasks}
% One author is set in bold (presumably the presenter).
\author{Dmitriy Dligach, Timothy A. Miller, and \textbf{Guergana Savova}}
\institute{Boston Children's Hospital and Harvard Medical School}
\date{\today}
\begin{document}
% Redefine the caption template to print only the caption text, which
% removes the word ``Figure'' from graphics captions.
\setbeamertemplate{caption}{\insertcaption}
% \maketitle is left disabled; the title slide is produced by \titlepage
% in the first frame instead.
%\maketitle
% Title slide, with the frame contents aligned to the top ([t]).
\begin{frame}[t]
  \titlepage
\end{frame}
% Introduction: phenotyping background and the two annotation strategies.
\begin{frame}
  \frametitle{Introduction}
  \begin{itemize}
    % Background on phenotyping
    \item Phenotyping
      \begin{itemize}
        \item What's a phenotype?
        \item i2b2 and eMERGE
        \item Link EHRs to biobanks for genetic analysis
        \item Supervised learning for phenotyping
      \end{itemize}
    % Passive versus active learning
    \item Manual annotation needed
      \begin{itemize}
        \item Standard approach: passive learning
        \item Alternative: active learning
      \end{itemize}
  \end{itemize}
\end{frame}
% Active learning in brief: the classifier picks what gets annotated.
\begin{frame}
  \frametitle{Active Learning}
  \begin{itemize}
    \item Approach for selecting data for annotation
    \item Data selection delegated to classifier
    % The pool-based setting
    \item Pool-based scenario
      \begin{itemize}
        \item Lots of unlabeled data
        \item Can afford to annotate only a small amount
      \end{itemize}
    \item Little work in clinical domain
  \end{itemize}
\end{frame}
% Intuition for margin sampling: a confident prediction (0.95 vs 0.05)
% next to an uncertain one (0.55 vs 0.45).
\begin{frame}
  \frametitle{Intuition}
  Suppose there's a little bit of labeled data
  % Manual line break kept from the original layout (adds a gap before the list).
  \newline
  \begin{itemize}
    \item Classify example $\vec{x}$
      \begin{itemize}
        \item $p(c_1 | \vec{x}) = 0.95$ and $p(c_2 | \vec{x}) = 0.05$
        \item $p(c_1 | \vec{x}) = 0.55$ and $p(c_2 | \vec{x}) = 0.45$
        % Manual line break kept from the original layout (gap after the examples).
        \newline
      \end{itemize}
    \item Margin Sampling
      \begin{itemize}
        % The words go in \text{} so they are set as text rather than as a
        % product of single-letter variables; lowercase p matches the
        % probabilities in the examples above.
        \item $\text{Prediction Margin} = \lvert p(c_1 | \vec{x}) - p(c_2 | \vec{x}) \rvert$
        \item Annotate examples with smallest margin first
      \end{itemize}
  \end{itemize}
\end{frame}
% The active-learning loop: seed a classifier, then select, add and retrain.
\begin{frame}
  \frametitle{How does active learning work?}
  \begin{itemize}
    % Step 1: bootstrap
    \item Seed classifier
      \begin{itemize}
        \item Annotate a small amount of data
        \item Train a classifier
      \end{itemize}
    % Step 2: repeat until done
    \item Iterative process
      \begin{itemize}
        \item Apply the classifier to the pool of unlabeled data
        \item Select an example and add it to the training set
        \item Retrain the classifier
        \item Check if we are done
      \end{itemize}
    \item The learner quickly converges on the decision boundary
  \end{itemize}
\end{frame}
% How a patient is turned into a feature vector of CUI frequencies.
\begin{frame}
  \frametitle{Data Representation}
  \begin{itemize}
    \item Unit of classification
      \begin{itemize}
        \item Single patient
      \end{itemize}
    \item Patient representation
      \begin{itemize}
        \item Set of CUIs extracted with cTAKES
        \item Abstract from lexical variability of medical terminology
        \item Filter out non-clinical vocabulary
      \end{itemize}
    \item Phenotype-specific dictionaries
    \item Patient vector $\vec{x}$
      \begin{itemize}
        % CUI is a multi-letter name, so it is set upright with \mathrm
        % instead of as the product C*U*I in math italics.
        \item Element $x_n$ is frequency of $\mathrm{CUI}_n$
      \end{itemize}
  \end{itemize}
\end{frame}
% Choice of base classifier and the uncertainty measure built on its posteriors.
\begin{frame}
  \frametitle{Naive Bayes}
  \begin{itemize}
    \item Need to evaluate $p(c_i|\vec{x})$
    \item Multinomial Naive Bayes
      \begin{itemize}
        \item Probabilistic classifier
        \item Supports multi-class classification
        \item Training and classification speed
      \end{itemize}
    \item Uncertainty sampling:
      \begin{equation}
        % The words go in \text{} rather than being set as math italics
        % held apart with a manual \: space.
        \text{prediction margin} = \lvert p(c_1|\vec{x}) - p(c_2|\vec{x}) \rvert
      \end{equation}
  \end{itemize}
\end{frame}
% Posterior probability formula followed by a small-print legend that
% explains each symbol.
\begin{frame}
  % No trailing backslash after the title: a backslash at the end of the line
  % is a control space and would put a stray space before the first sentence.
  \frametitle{Counts}
  Compute posterior probability as follows:
  \begin{equation}
    p(c_i|\vec{x}) = \frac{1}{Z}p(c_i)\prod_{n=1}^Np(\mathrm{CUI}_n|c_i)^{x_n}
  \end{equation}
  % Shrink the legend; the baseline skip now states its unit explicitly
  % (a bare number is read as pt, so the size is unchanged).
  \fontsize{6.5pt}{7.2pt}\selectfont
  % Manual line break kept from the original layout (gap under the equation).
  \\
  $p(c_i)$ is the prior probability of class $c_i$ \\
  $N$ is the number of CUIs in the phenotype-specific dictionary \\
  % The ordinal is written inline rather than as a subscript.
  $\mathrm{CUI}_n$ is the $n$th CUI in that dictionary \\
  $x_n$ is the frequency of $\mathrm{CUI}_n$ in $\vec{x}$ \\
  $Z$ (evidence) is the scaling factor \\
  Determine $p(c_i)$ and $p(\mathrm{CUI}_n|c_i)$ via maximum likelihood estimation
\end{frame}
% Where the labeled data came from.
\begin{frame}
  \frametitle{Dataset creation}
  \begin{itemize}
    \item Created within the i2b2 initiative
    \item ICD-9 codes used to form initial cohort
    \item About 600 patients selected randomly
    \item Labeled by domain experts
  \end{itemize}
\end{frame}
% Per-phenotype dataset size, number of classes and majority-class share.
\begin{frame}
  \frametitle{Dataset stats}
  % The table is scaled to the line width. The % after the opening brace
  % keeps the line end from becoming a space inside the scaled box, which
  % would otherwise pad the table on its left side.
  \resizebox{\linewidth}{!}{%
    \begin{tabular}{|l|r|r|r|}
      \hline
      Phenotype & Total Instances & Number of Classes & Proportion of Predominant Class \\
      \hline
      Ulcerative Colitis & 600 & 2 & 0.630 \\
      Crohn's Disease & 600 & 2 & 0.665 \\
      Multiple Sclerosis & 595 & 5 & 0.395 \\
      Type II Diabetes & 600 & 3 & 0.583 \\
      \hline
    \end{tabular}}
\end{frame}
% Evaluation protocol: how the learning curves were produced.
\begin{frame}
  \frametitle{Evaluation}
  \begin{itemize}
    \item Learning curve generation
      \begin{itemize}
        \item Done in the style of 10-fold cross validation
      \end{itemize}
    \item Within each fold:
      \begin{itemize}
        \item Training data
        \item Pool of ``unlabeled'' examples
        \item Held-out test set
      \end{itemize}
    \item Various seed sizes
      \begin{itemize}
        % Wording corrected from "Affect of seed size and performance".
        \item Effect of seed size on performance
        \item Only showing the plots for seed size = 30
        \item See the paper for other sizes
      \end{itemize}
    \item Gold labels in the pool hidden from classifier
  \end{itemize}
\end{frame}
% Two-by-two grid of learning-curve plots, one per phenotype.
\begin{frame}
  \frametitle{Learning Curves}
  \begin{figure}
    % \centering replaces the surrounding center environment, which only
    % added extra vertical space around the figure.
    \centering
    \includegraphics[width=0.30\textwidth]{figures/uc30.png}
    \includegraphics[width=0.30\textwidth]{figures/cd30.png} \\
    \includegraphics[width=0.30\textwidth]{figures/ms30.png}
    % No line break after the last image: the caption starts its own
    % paragraph, and a trailing \\ only produced an empty underfull line.
    \includegraphics[width=0.30\textwidth]{figures/t2d30.png}
    \caption{Ulcerative Colitis, Crohn's Disease, Multiple Sclerosis, Type II Diabetes}
  \end{figure}
\end{frame}
% Enlarged view of a single learning-curve plot.
\begin{frame}
  \frametitle{Close-up}
  \begin{figure}
    % \centering replaces the surrounding center environment, which only
    % added extra vertical space around the figure.
    \centering
    \includegraphics[width=0.7\textwidth]{figures/uc30.png}
    \caption{Ulcerative Colitis}
  \end{figure}
\end{frame}
% Two-column slide: takeaways on the left, the plot they refer to on the right.
\begin{frame}
  \frametitle{Sample plot}
  \begin{columns}[c]
    % Left column: observations, set in a smaller font
    \column{.5\textwidth}
    \begin{itemize}
      \item {\scriptsize Active Learning above passive}
      \item {\scriptsize Only need $1/3$ of the data}
      \item {\scriptsize Best performance higher}
    \end{itemize}
    % Right column: the plot
    \column{.5\textwidth}
    \begin{figure}
      % \centering replaces the surrounding center environment, which only
      % added extra vertical space around the figure.
      \centering
      % \linewidth is the width of the current column, so the image fills
      % the column without relying on how the column resets \textwidth.
      \includegraphics[width=\linewidth]{figures/uc30.png}
      \caption{Ulcerative Colitis}
    \end{figure}
  \end{columns}
\end{frame}
% Area-under-curve difference between active and passive learning,
% by seed size (rows) and phenotype (columns).
\begin{frame}
  % The minus sign is set in math mode; a text hyphen is not a minus.
  \frametitle{Difference between areas under the curve (Active $-$ Passive)}
  % The table is scaled to the line width. The % after the opening brace
  % keeps the line end from becoming a space inside the scaled box.
  \resizebox{\linewidth}{!}{%
    \begin{tabular}{|l|r|r|r|r|}
      \hline
      Seed Size & Ulcerative Colitis & Crohn's Disease & Multiple Sclerosis & Type II Diabetes \\
      \hline
      10 & 6.90 & 4.17 & 10.50 & 11.05 \\
      30 & 6.64 & 2.21 & 15.43 & 7.49 \\
      50 & 8.63 & 1.75 & 8.61 & 8.90 \\
      \hline
    \end{tabular}}
\end{frame}
% Closing takeaways and open questions.
\begin{frame}
  \frametitle{Conclusion}
  \begin{itemize}
    \item Annotation effort reduced by 2/3
    \item Active learning sometimes reaches better accuracy
    \item Need to know when to stop
    \item What happens if the base classifier is swapped?
  \end{itemize}
\end{frame}
% Final slide: no title, just the prompt for audience questions.
\begin{frame}
  Questions?
\end{frame}
\end{document}