\documentclass{beamer}
\usepackage{latexsym}
\usepackage{graphicx}
\usetheme{Warsaw}
\title{Chapter 5}
\subtitle{Dimensionality Reduction}
\begin{document}
\maketitle
\begin{frame}
\frametitle{Principal Component Analysis (PCA)}
\begin{columns}[c]
\column{0.5\textwidth}
\begin{itemize}
\item Find the directions of maximum variance
\item Project data onto the lower-dimensional space
\item Original features: $x_1$ and $x_2$
\item Principal components: \textbf{PC1} and \textbf{PC2}
\end{itemize}
\column{0.5\textwidth}
\includegraphics[width=\textwidth]{Code/ch05/images/05_01.png}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Mapping to a low-dimensional space}
When we use PCA for dimensionality reduction, we construct a $d \times k$ transformation matrix $\mathbf{W}$.
We then map a sample vector $\mathbf{x}$ onto a new $k$-dimensional feature subspace ($k \ll d$):
\[
\mathbf{x} = [x_1, x_2, \dots, x_d], \quad \mathbf{x} \in \mathbb{R}^d
\]
\]
\[
\downarrow \mathbf{x W}, \quad \mathbf{W} \in \mathbb{R}^{d \times k}
\]
\[
\mathbf{z} = [z_1, z_2, \dots, z_k], \quad \mathbf{z} \in \mathbb{R}^k
\]
\end{frame}
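\begin{frame}[fragile]
\frametitle{Mapping in code (sketch)}
A minimal NumPy sketch of the mapping above; the sample \texttt{x} and
the matrix \texttt{W} are random placeholders, not values from a real dataset:
\begin{verbatim}
import numpy as np

d, k = 13, 2               # original and reduced dimensionality
x = np.random.randn(d)     # one d-dimensional sample
W = np.random.randn(d, k)  # d x k transformation matrix

z = x.dot(W)               # projected sample
print(z.shape)             # (2,)
\end{verbatim}
\end{frame}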
\begin{frame}
\frametitle{Principal components}
\begin{itemize}
\item Transforming $d$-dimensional data to $k$ dimensions
\item First principal component will have the largest variance
\item Second principal component will have next largest variance
\item And so on...
\item PCA is sensitive to feature scaling, so standardize the features first
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Algorithm}
\begin{enumerate}
\item Standardize the $d$-dimensional dataset.
\item Construct the covariance matrix.
\item Decompose the covariance matrix into its eigenvectors and eigenvalues.
\item Select $k$ eigenvectors that correspond to the $k$ largest eigenvalues, where $k$ is the dimensionality of the new feature subspace $(k \le d)$.
\item Construct a projection matrix $\mathbf{W}$ from the ``top'' $k$ eigenvectors.
\item Transform the $d$-dimensional input dataset $\mathbf{X}$ using the projection matrix $\mathbf{W}$ to obtain the new $k$-dimensional feature subspace.
\end{enumerate}
\end{frame}
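\begin{frame}[fragile]
\frametitle{Algorithm: step 1 in code}
A minimal sketch of the standardization step, using scikit-learn's bundled
copy of the \textit{Wine} dataset rather than the UCI download:
\begin{verbatim}
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X, y = load_wine(return_X_y=True)   # 178 samples, 13 features
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0)

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)  # zero mean, unit variance
X_test_std = sc.transform(X_test)        # reuse training statistics
\end{verbatim}
The later code sketches reuse \texttt{X\_train\_std} defined here.
\end{frame}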
\begin{frame}
\frametitle{Variance-covariance matrix}
\begin{itemize}
\item A symmetric $d \times d$ matrix, where $d$ is the number of features
\item Pairwise covariances between the different features
\item Covariance between two features $\mathbf{x}_j$ and $\mathbf{x}_k$:
\[
\sigma_{jk} = \frac{1}{n} \sum_{i=1}^{n} \big( x_{j}^{(i)} - \mu_j \big) \big( x_{k}^{(i)} - \mu_k \big)
\]
where $\mu_j$ and $\mu_k$ are the sample means of features $j$ and $k$
\end{itemize}
\end{frame}
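\begin{frame}[fragile]
\frametitle{Covariance matrix in code}
A small sketch, assuming the standardized training data
\texttt{X\_train\_std} from the earlier sketch:
\begin{verbatim}
import numpy as np

# np.cov expects variables (features) in rows,
# so pass the transposed data matrix
cov_mat = np.cov(X_train_std.T)
print(cov_mat.shape)   # (13, 13), symmetric
\end{verbatim}
Note that \texttt{np.cov} divides by $n - 1$ by default (sample covariance),
whereas the formula on the previous slide divides by $n$.
\end{frame}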
\begin{frame}
\frametitle{What is covariance?}
\begin{itemize}
\item Measure of how much two random variables change together
\item Positive covariance
\begin{itemize}
\item Features increase together
\item Features decrease together
\item E.g., as a balloon is blown up, it gets larger in all dimensions
\end{itemize}
\item Negative covariance
\begin{itemize}
\item Features vary in opposite directions
\item Large values of one variable correspond to small values of the other
\item E.g., if a sealed balloon is squashed in one dimension, it expands in the other two
\end{itemize}
\item The magnitude of the covariance is not easy to interpret
\item The normalized version of covariance (\textit{correlation coefficient}) indicates the strength of the linear relation.
\end{itemize}
\end{frame}
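\begin{frame}[fragile]
\frametitle{Covariance and correlation: a toy example}
A toy sketch with made-up numbers, only to illustrate the sign of the
covariance and its normalized version:
\begin{verbatim}
import numpy as np

a = np.array([1.0, 2.0, 3.0, 4.0])
b = 2 * a + 1   # moves with a    -> positive covariance
c = -a          # moves against a -> negative covariance

print(np.cov(a, b)[0, 1])       # > 0
print(np.cov(a, c)[0, 1])       # < 0
print(np.corrcoef(a, b)[0, 1])  # 1.0 (perfect linear relationship)
\end{verbatim}
\end{frame}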
\begin{frame}
\frametitle{Variance-covariance matrix}
\begin{itemize}
\item For three features, covariance matrix will look like this:
\[
\Sigma = \begin{bmatrix}
\sigma_{1}^2 & \sigma_{12} & \sigma_{13} \\
\sigma_{21} & \sigma_{2}^{2} & \sigma_{23} \\
\sigma_{31} & \sigma_{32} & \sigma_{3}^{2}
\end{bmatrix}
\]
\item The eigenvectors of $\Sigma$ represent the principal components
\item The corresponding eigenvalues represent their magnitude
\begin{itemize}
\item Principal components: the directions of maximum variance
\end{itemize}
\item E.g. Wine dataset (13 dimensions)
\begin{itemize}
\item $13 \times 13$ covariance matrix
\item 13 eigenvectors
\item 13 eigenvalues
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Eigenpairs}
\begin{itemize}
\item An eigenvector $\mathbf{v}$ satisfies the condition:
\[
\Sigma \mathbf{v} = \lambda \mathbf{v}
\]
where $\lambda$ is the corresponding eigenvalue (a scalar)
\item NumPy provides \texttt{numpy.linalg.eig} (and \texttt{numpy.linalg.eigh} for symmetric matrices) to compute eigenpairs
\item We want to reduce the dimensionality
\item So we select the subset of the $k$ most informative eigenvectors
\end{itemize}
\end{frame}
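\begin{frame}[fragile]
\frametitle{Eigendecomposition in code}
A sketch using \texttt{numpy.linalg.eigh}, which is suited to symmetric
matrices such as the covariance matrix \texttt{cov\_mat} from the earlier
sketch (\texttt{numpy.linalg.eig} works as well):
\begin{verbatim}
import numpy as np

eigen_vals, eigen_vecs = np.linalg.eigh(cov_mat)

# sort the eigenpairs by decreasing eigenvalue
order = np.argsort(eigen_vals)[::-1]
eigen_vals = eigen_vals[order]
eigen_vecs = eigen_vecs[:, order]  # column i pairs with eigen_vals[i]
\end{verbatim}
\end{frame}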
\begin{frame}
\frametitle{Variance explained ratio}
\begin{itemize}
\item Variance explained ratio of an eigenvalue $\lambda_j$:
\[
\frac{\lambda_j}{\sum_{i=1}^{d} \lambda_i}
\]
\item The first two principal components together explain about 60 percent of the variance in the data
\end{itemize}
\includegraphics[scale=0.55]{Code/ch05/images/05_02.png}
\end{frame}
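\begin{frame}[fragile]
\frametitle{Variance explained ratio in code}
A sketch continuing from the sorted eigenvalues of the previous sketch;
the plot roughly corresponds to the figure on the previous slide:
\begin{verbatim}
import numpy as np
import matplotlib.pyplot as plt

var_exp = eigen_vals / eigen_vals.sum()  # individual ratios
cum_var_exp = np.cumsum(var_exp)         # cumulative ratios

plt.bar(range(1, 14), var_exp, label='individual')
plt.step(range(1, 14), cum_var_exp, where='mid', label='cumulative')
plt.xlabel('Principal component index')
plt.ylabel('Explained variance ratio')
plt.legend(loc='best')
plt.show()
\end{verbatim}
\end{frame}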
\begin{frame}
\frametitle{Feature transformation}
\begin{itemize}
\item We decomposed the covariance matrix into eigenpairs
\item Now we need to project the data onto the new space defined by the principal component axes
\item Construct a $13 \times 2$ projection matrix $\mathbf{W}$ from the top two eigenvectors
\item Transform a sample $\mathbf{x}$ onto the PCA subspace to obtain $\mathbf{x}'$, a two-dimensional vector of the two new features:
\[
\mathbf{x}' = \mathbf{xW}
\]
\item Transform the entire Wine training set ($124 \times 13$):
\[
\mathbf{X}' = \mathbf{XW}
\]
\end{itemize}
\end{frame}
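\begin{frame}[fragile]
\frametitle{Feature transformation in code}
A sketch that stacks the two leading eigenvectors (from the earlier
sketch) into the projection matrix \texttt{W} and projects the
standardized training data:
\begin{verbatim}
import numpy as np

# 13 x 2 projection matrix from the top two eigenvectors
W = np.column_stack((eigen_vecs[:, 0], eigen_vecs[:, 1]))

x_pca = X_train_std[0].dot(W)     # one sample  -> shape (2,)
X_train_pca = X_train_std.dot(W)  # whole set   -> shape (124, 2)
\end{verbatim}
The sign of each eigenvector is arbitrary, so the projected axes may
appear mirrored relative to the figures.
\end{frame}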
\begin{frame}
\frametitle{Visualize \textit{Wine} dataset in two dimensions}
\includegraphics[scale=0.55]{Code/ch05/images/05_03.png}
\end{frame}
\begin{frame}
\frametitle{Visualize \textit{Wine} dataset in two dimensions}
\begin{columns}[c]
\column{0.5\textwidth}
\includegraphics[scale=0.35]{Code/ch05/images/05_03.png}
\column{0.5\textwidth}
\begin{itemize}
\item We can now visualize the 13-dimensional dataset in two dimensions
\item The data is spread more along the first principal component, which explains about 40 percent of the variance
\item A linear classifier should be able to separate the classes well
\item Keep in mind that PCA is an \textit{unsupervised} algorithm: it does not use class label information
\end{itemize}
\end{columns}
\end{frame}
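\begin{frame}[fragile]
\frametitle{Visualization in code}
A sketch of the two-dimensional scatter plot, assuming
\texttt{X\_train\_pca} and the class labels \texttt{y\_train} from the
earlier sketches:
\begin{verbatim}
import matplotlib.pyplot as plt

for label, color, marker in zip((0, 1, 2), 'rbg', 'sxo'):
    mask = (y_train == label)
    plt.scatter(X_train_pca[mask, 0], X_train_pca[mask, 1],
                c=color, marker=marker, label=f'class {label}')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.show()
\end{verbatim}
\end{frame}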
\end{document}