-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcourse.tex
1075 lines (924 loc) · 34.4 KB
/
course.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
\documentclass{beamer}
\usepackage{graphicx}
\usepackage{multicol}
\usepackage{amsmath}
\usepackage{booktabs}
\graphicspath{{latex/}{img/}}
% for custom colors
\definecolor{maroon}{RGB}{215,0,0}
\mode<presentation>
{
% theme + colorscheme
\usetheme{Madrid}
\usecolortheme{beaver}
% customize bullets in items list
\setbeamertemplate{itemize item}{\color{red}$\bullet$}
\setbeamertemplate{itemize subitem}{\color{red}$\circ$}
% customize bullets in table of content
\setbeamerfont{section number projected}{%
family=\sffamily,series=\bfseries,size=\normalsize}
\setbeamercolor{section number projected}{bg=maroon, fg=white}
\setbeamercolor{subsection number projected}{bg=maroon}
\setbeamertemplate{section in toc}[circle]
\setbeamertemplate{subsection in toc}[square]
% remove navigation symbols
\setbeamertemplate{navigation symbols}{}
}
% customize footline: print section/subsection/date+slides_number instead of
% author/short_title/date+slides_number.
% You might need to remove it if you use another theme than Madrid.
\makeatletter
\setbeamertemplate{footline}
{
\leavevmode%
\hbox{%
\begin{beamercolorbox}[wd=.33333\paperwidth,ht=2.25ex,dp=1ex,center]
{section in head/foot}\usebeamerfont{section in head/foot}\insertsection
\end{beamercolorbox}% if no % at the end, there is a gap between subboxes
\begin{beamercolorbox}[wd=.33333\paperwidth,ht=2.25ex,dp=1ex,center]
{title in head/foot}\usebeamerfont{title in head/foot}\insertsubsection
\end{beamercolorbox}%
\begin{beamercolorbox}[wd=.33333\paperwidth,ht=2.25ex,dp=1ex,right]
{date in head/foot}\usebeamerfont{date in head/foot}\insertshortdate{}
\hspace*{2em} \insertframenumber{} / \inserttotalframenumber\hspace*{2ex}
\end{beamercolorbox}}%
\vskip0pt
}
\makeatother
% main information
\title[Algorithms for Data Analysis]{Algorithms for Data Analysis}
\author{Julien Tissier}
\date{\today}
\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%
%% F R A M E S %%
%%%%%%%%%%%%%%%%%%%%%%%
% section -> on the left in footline; subsection -> in the center in footline
\section*{\insertshortauthor} % asterisk -> do not appear in table of content
\subsection*{\insertshorttitle}
% frame with title + main information
\begin{frame}
\titlepage
\end{frame}
% summary frame
\begin{frame}
\frametitle{Overview}
\begin{multicols}{2}
\tableofcontents
\end{multicols}
\end{frame}
%------------------
% I N T R O -
%------------------
\section{Introduction to Data Analysis}
% data (analysis) definition
\subsection{Definition}
\begin{frame}
\frametitle{What is data analysis?}
\begin{block}{Data}
Pieces of information (measurement, values, facts...) that can be:
\begin{itemize}
\item structured (matrices, tabular data, RDBMS, time series...)
\item unstructured (news articles, webpages, images/video...)
\end{itemize}
\end{block}
\begin{block}{Data Analysis}
Process of preparing, transforming and using models to find more information
from data, as well as visualizing results.
\end{block}
\end{frame}
% tools (why Python?)
\subsection{Tools and libraries}
\begin{frame}
\frametitle{How to analyze data?}
There are usually two steps in data analysis. The first one is to find and
\textbf{develop models} that can extract useful information from data (with
languages like R or MATLAB). The second one is to \textbf{develop programs}
that can be used in production systems (with languages like Java or C++).
\\~\\
With a growing popularity among scientists as well as the development of
efficient libraries (numpy, pandas), \textbf{Python} became a great tool for
data analysis. Python has many advantages:
\begin{itemize}
\item great for string/data processing
\item can be used for both prototyping/production
\item has a lot of existing libraries
\item can easily integrate C/C++/FORTRAN legacy code
\item easy to read/develop
\end{itemize}
\end{frame}
% list of libraries
\begin{frame}
\frametitle{Libraries}
This course will be based on Python 3.7 (or above) and the following
libraries:
\begin{itemize}
\item IPython (7.8+): enhanced Python shell
\item numpy (1.17+): fast/efficient arrays and operations
\item pandas (0.25+): data structures (Series/DataFrame)
\item matplotlib (3.1+): plots and 2D visualization
\item scipy (1.3.1+): scientific algorithms
\item scikit-learn (0.21+): machine learning algorithms
\end{itemize}
Jupyter Notebook will also be used to give you samples of code, as they
provide a more interactive way to learn and discover how these libraries
work.
\end{frame}
% numpy
\subsection{Numpy}
\begin{frame}
\frametitle{Numpy}
Numpy (\textbf{Num}erical \textbf{Py}thon) is a high performance scientific
computing library that can be used for matrices computations, Fourier
transforms, linear algebra, statistical computations...
\\~\\
The main type of data in Numpy is the \textbf{ndarray}:
\begin{itemize}
\item n-dimensional array
\item fixed size
\item homogeneous datatypes
\item similar to C arrays (continuous block of memory)
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Why is Numpy efficient?}
As a high level language, Python is slow to do any heavy computations,
especially if very large arrays are involved. Numpy solves this problem thanks
to the ndarray datatypes:
\begin{itemize}
\item efficient memory management (continuous block)
\item use C loops instead of Python loops for computations on array
\item vectorized operations (computations are done block by block, not
element by element)
\item rely on low-level routines for some operations (BLAS/LAPACK)
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Numpy}
\begin{example}
\begin{verbatim}
import numpy as np
a = np.array([1, 2, 3, 4])
b = np.array([6, 7, 8, 9])
c = a * b
d = np.array([[1, 2], [3, 4]])
\end{verbatim}
\end{example}
\end{frame}
% pandas
\subsection{Pandas}
\begin{frame}
\frametitle{Pandas}
Pandas is a high-performance Python library used to work with data and analyze
them. It contains many pre-implemented methods to read and parse data, as well
as common statistical computations (mean, variance, correlation...).
\\~\\
Pandas has two main datatypes:
\begin{itemize}
\item Series: one-dimensional container. Indexes can be integers (like an
array) or other objects (string, date...)
\item DataFrame: tabular data, like a spreadsheet. It contains multiple
rows and multiple columns. It can be seen as a collection of Series.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Pandas}
\begin{example}[Series]
\begin{verbatim}
from pandas import Series
s1 = Series([4, 7, 9, -1])
s2 = Series([12, 3.2, "John"],
index=["mark", "gpa", "name"])
\end{verbatim}
\end{example}
\begin{example}[DataFrame]
\begin{verbatim}
from pandas import DataFrame
df = DataFrame({"key1": [1, 2, 3],
"key2": [4, 5, 6]})
\end{verbatim}
\end{example}
\end{frame}
% matplotlib
\subsection{Matplotlib}
\begin{frame}
\frametitle{Matplotlib}
Matplotlib is a plotting library used to visualize data and create graphics.
Pandas directly uses matplotlib for representation.
\begin{figure}[h]
\includegraphics[width=8cm]{img/matplotlib_1.png}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{Matplotlib}
Matplotlib is a plotting library used to visualize data and create graphics.
Pandas directly uses matplotlib for representation.
\begin{figure}[h]
\includegraphics[width=8cm]{img/matplotlib_2.png}
\end{figure}
\end{frame}
%----------------------------------------
% M A C H I N E L E A R N I N G -
%----------------------------------------
\section{Machine Learning}
% ML definition
\subsection{Definition}
\begin{frame}
\frametitle{What is Machine Learning?}
\begin{block}{Machine Learning (ML)}
ML is when a computational system can \textbf{automatically learn} and
extract \textbf{knowledge from data} without being explicitly programmed.
ML is at the intersection of statistics, artificial intelligence and
computer science.
ML is now everywhere:
\begin{itemize}
\item automatic friend tagging in your Facebook photos
\item music recommendation for Spotify/YouTube
\item self-driven cars
\item medical diagnosis in a hospital
\item and many more...
\end{itemize}
ML can be \textbf{supervised} or \textbf{unsupervised}.
\end{block}
\end{frame}
% ML history
\begin{frame}
\frametitle{What is Machine Learning?}
\begin{alertblock}{Brief history}
At the beginning, many decision-making applications (like spam detection)
were built with a lot of manually programmed if/else conditions. These
methods have some drawbacks:
\begin{itemize}
\item it requires the \textbf{knowledge of an expert} to design the rules
\item it is \textbf{very specific} to a task
\end{itemize}
With ML, you only need to feed a model with your data. The algorithm will
find the rules by itself.
\end{alertblock}
\end{frame}
% Supervised
\subsection{Supervised vs. Unsupervised}
\begin{frame}
\frametitle{Supervised Learning}
In supervised learning, you give the algorithm two kinds of data:
\begin{itemize}
\item \textbf{inputs} (an image, information about a person...)
\item \textbf{outputs} (name of the object, the salary...)
\end{itemize}
The algorithm learns to find a way to \textbf{produce the output given the
input}. The algorithm is then capable of producing the output of an input it
has never seen before. \\~\\
Supervised learning can be used to:
\begin{itemize}
\item recognize handwritten digits (input=image, output=digit)
\item identify spams (input=mail, output=yes/no)
\item predict a price (input=house infomation, output=price)
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Supervised Learning}
The inputs are in general real value vectors (sometimes it can be binary
vectors). Each dimension of this vector is
called a \textbf{feature}. \\~\\
There are two main problems that can be solved with supervised learning:
\begin{itemize}
\item \textbf{classification}: the output is the class the item belongs to.
It can be a binary classification problem (yes/no) or a multi-class
classification problem (class 1, class 2, class 3...)
\item \textbf{regression}: the output is a real value number
\end{itemize}
\end{frame}
% Unsupervised
\begin{frame}
\frametitle{Unsupervised Learning}
In unsupervised learning, there are no known output (or label), so you only
give the algorithm the inputs. For example, if you want to regroup similar
clients according to their customer habits, you do not know in advance what
are the groups.\\~\\
The main problems that can be solved with unsupervised learning are:
\begin{itemize}
\item \textbf{clustering}
\item \textbf{dimensionality reduction}
\end{itemize}
The main difficulty of unsupervised learning algorithms is to \textbf{evaluate
them}. Since we do not have any labels, there are not any "true" valid answer.
Most of the time, we need to manually evaluate if the output of the algoritm
is relevant. Sometimes, unsupervised learning algorithms are used as a
\textbf{pre-processing} step before supervised learning algorithms and can
lead to a more accurate model or a faster training.
\end{frame}
% Evaluation
\subsection{Training, test \& validation sets}
\begin{frame}
\frametitle{How to evaluate a ML algorithm?}
After training a ML algorithm with data, we need to evaluate its performance.
One way would be to feed the algorithm with the same data, compute the output
and compare it with the original output.\\~\\
This method is bad because it does not tell us the capacity of the algorithm
\textbf{to generalize} (is the algorithm able to perform well on unseen data
?). One way to solve this problem is to separate the data into 3 sets:
\begin{itemize}
\item training set: used to train the algorithm
\item validation set: used to adjust the algorithm
\item test set: used to measure the performance of the algorithm
\end{itemize}
\end{frame}
% overfitting/underfitting
\subsection{Overfitting/Underfitting}
\begin{frame}
\frametitle{Problems of ML}
\begin{itemize}
\item \textbf{underfitting}: when the model is too simple. The training
score and the test score are both bad (left)
\item \textbf{overfitting}: when the model is too complex (right). The
training score is good but the test score is bad. The model is not able to
generalize well.
\end{itemize}
\begin{center}
\includegraphics[height=4.7cm]{img/underfitting.png}
\end{center}
\end{frame}
%----------------------------
% S U P E R V I S E D -
%----------------------------
\section{Supervised Learning}
% KNN
\subsection{k-Nearest Neighbors}
\begin{frame}
\frametitle{k-Nearest Neighbors - Classification}
\textbf{Inputs}: set of training points $\mathcal{S}$, point $X$ to
classify\\
\textbf{Procedure}:
\begin{itemize}
\item find the $k$ closest points (the neighbors) to $X$ from $\mathcal{S}$
\item find the most frequent class $c$ among the neighbors
\item assign $c$ to $X$
\end{itemize}
\begin{columns}
\column{0.48\linewidth}
\parbox{\linewidth}{The $k$ closest points are defined as the points with
minimal distance (usually the Euclidean distance) to $X$. If there is a
tie in finding $c$, we can increase/decrease $k$ until the tie is broken.}
\column{0.48\linewidth}
\centering
\includegraphics[height=3.8cm]{img/knn.png}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{k-Nearest Neighbors - Regression}
\textbf{Inputs}: set of training points $\mathcal{S}$, point $X$ to predict
value
\textbf{Procedure}:
\begin{itemize}
\item find the $k$ closest points (the neighbors) to $X$ from $\mathcal{S}$
\item compute the average value $v$ of all neighbors
\item assign $v$ to $X$
\end{itemize}
If $k$ is set to 1, the value assigned to $X$ is simply the value of its
nearest neighbor.
\end{frame}
\begin{frame}
\frametitle{k-Nearest Neighbors}
The k-nearest neighbors is one of the simplest model in ML. It usually has two
main parameters: \textbf{the number of neighbors} used and \textbf{the
distance} used to find the closest points.
\\~\\
\textbf{Pros}
\begin{itemize}
\item model is easy to understand
\item does not require extended tuning to achieve good performance
\item works well as a baseline before training more complex models
\end{itemize}
\textbf{Cons}
\begin{itemize}
\item slow when many training points or many features
\item not very good on sparse data
\end{itemize}
\end{frame}
% Linear models
\subsection{Linear Models}
\begin{frame}
\frametitle{Linear Models - Regression}
Linear models are usually used to make values prediction. The output $\hat{y}$
is a \textbf{linear function} of each features $x_i$ of a given input vector
$X$ (hence the name of \textbf{linear} models).
\begin{equation*}
\hat{y} = \sum_{i = 1}^k w_i * x_i + b
\end{equation*}
\begin{columns}
\column{0.48\linewidth}
\parbox{\linewidth}{The model learns the coefficients $w_i$ and $b$. The
model is trained to minimize the \textbf{Mean Squared Error (MSE)} on all
training points $X$ in $\mathcal{S}$ (where $y$ is the correct value of
$X$).
\begin{equation*}
MSE = \frac{1}{\#\mathcal{S}} \sum_{X \in \mathcal{S}} (\hat{y} - y)^2
\end{equation*}
}
\column{0.48\linewidth}
\centering
\includegraphics[height=3.6cm]{img/linear_reg.png}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Linear Models - Regression}
\begin{itemize}
\item \underline{Training}\\
\textbf{Inputs}: set of training points $\mathcal{S}$\\
\textbf{Procedure}:
\begin{itemize}
\item compute the MSE
\item compute the partial derivatives $\frac{\partial MSE}{\partial
w_i}$ and $\frac{\partial MSE}{\partial b}$
\item solve the linear system given by the equations $\frac{\partial
MSE}{\partial w_i}=0$ and $\frac{\partial MSE}{\partial b}=0$
\item the solution is unique and gives you the $w_i$ and $b$
\end{itemize}
\textbf{Output}: learned $w_i$ and $b$
\item \underline{Predicting}\\
\textbf{Inputs}: learned $w_i$ and $b$, point $X$ to predict value\\
\textbf{Procedure}: compute $\hat{y}$ with the formula and $w_i$, $b$,
$x_i$\\
\textbf{Output}: predicted value $\hat{y}$
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Linear Models - Classification}
Linear models can also be used for either binary or multi-class classification
problems. For binary classification (1 or 0 labels), the model evaluates the
output with:
\begin{equation*}
\hat{y} =
\begin{cases}
1 & \text{if } w \cdot x + b \geq 0\\
0 & \text{otherwise}
\end{cases}
\end{equation*}
We can rewrite the condition to remove the bias $b$ with $\sum_{i = 0}^k w_i *
x_i \geq 0$ and set $x_0$ to 1 for all vectors $X$. The objective of the model
is to find a good $w$ to predict the correct label (such that $\hat{y} =
y$). Different methods exist to find $w$:
\begin{itemize}
\item perceptron
\item logistic regression
\item support vector machines (SVM)
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Linear Models - Perceptron}
We can define a loss $\ell$ (called the zero-one loss) for each point $(x, y)$
from our training set as follows:
\begin{equation*}
\ell(x, y, w) = 1_{[(w \cdot x)y \leq 0]}
\end{equation*}
The perceptron algorithm learns the $w$ weights as follows:
\begin{itemize}
\item initialize $w$ to 0
\item \textbf{for} step in [1..n]
\begin{itemize}
\item \textbf{for} x in training dataset
\begin{itemize}
\item[] \textbf{if $ (w \cdot x) \times y \leq 0$}\\
\hspace{.5cm} w = w + $\lambda$ y $\cdot$ x
\end{itemize}
\end{itemize}
\end{itemize}
The perceptron algorithm only updates the weights $w$ when it encounters a
misclassified example. If the dataset is \textbf{linearly separable}, the
perceptron algorithm is guaranteed to find an optimal solution.
\end{frame}
\begin{frame}
\frametitle{Linear Models - Logistic Regression}
Instead of directly finding the output $\hat{y}$ with the sign function, we
can compute the \textbf{probability of belonging to class 1} defined as
follows:
\begin{equation*}
P(y=1 | x) = \sigma(w \cdot x) = \frac{1}{1 + e^{-w \cdot x}} \in [0, 1]
\end{equation*}
The idea is to find $w$ such that this probability is high (near 1) for points
labeled with 1, and low (near 0) for points labeled with 0. We can define
another loss function (called negative log-likelihood) :
\begin{equation*}
\ell(x, y, w) = - y \times \log(\sigma(w \cdot x))
- (1 - y) \times \log(1 - \sigma(w \cdot x))
\end{equation*}
The model is trained to minimize the loss for all training examples. Since
this loss function is convex, we can use different algorithms like gradient
descent, L-BFGS...
\end{frame}
\begin{frame}
\frametitle{Linear Models}
Linear models have two
main parameters : \textbf{the weight of regularization} and \textbf{the
type of regularization} used.
\\~\\
\textbf{Pros}
\begin{itemize}
\item models are fast to train and predict
\item can be used with large datasets (scalability)
\item works well on sparse data
\end{itemize}
\textbf{Cons}
\begin{itemize}
\item learned coefficients are not very interpretable
\item not very good at detecting correlated features
\end{itemize}
\end{frame}
%% Decision Trees
\subsection{Decision Trees}
\begin{frame}
\frametitle{Decision Trees}
Decision trees are mostly used for classification problems. They learn a
\textbf{sequence of conditions} (if/else questions) to find the class of a vector
$x$.
Let's suppose we have the following dataset and we want to predict if the game
can be played (9 yes / 5 no) :
\begin{table}[htb]
\centering
\tiny
\begin{tabular}{@{}rcccc@{}}
\cmidrule{2-5}
& Outlook & Humidity & Wind & Play\\
\midrule
1 & Sunny & High & Weak & No \\
2 & Sunny & High & Strong & No \\
3 & Overcast & High & Weak & Yes \\
4 & Rain & High & Weak & Yes \\
5 & Rain & Normal & Weak & Yes \\
6 & Rain & Normal & Strong & No \\
7 & Overcast & Normal & Strong & Yes \\
8 & Sunny & High & Weak & No \\
9 & Sunny & Normal & Weak & Yes \\
10 & Rain & Normal & Weak & Yes \\
11 & Sunny & Normal & Strong & Yes \\
12 & Overcast & High & Strong & Yes \\
13 & Overcast & Normal & Weak & Yes \\
14 & Rain & High & Strong & No \\
\bottomrule
\end{tabular}
\normalsize
\end{table}
\end{frame}
\begin{frame}
\frametitle{Decision Trees}
Many algorithms exist to create a decision tree. One of them is ID3. At each
step, it computes the entropy of each unused attributes with :
\begin{equation*}
Entropy(S) = \sum_{x \in X} -p(x) \log_{2}p(x)
\end{equation*}
where $p(x)$ is the ratio between the number of elements in $x$ and in $S$.
The attribute that has the higher entropy is chosen as a node in the tree.
\begin{columns}
\column{0.48\linewidth}
\begin{table}[htb]
\centering
\tiny
\begin{tabular}{@{}rcccc@{}}
\cmidrule{2-5}
& Outlook & Humidity & Wind & Play\\
\midrule
1 & Sunny & High & Weak & No \\
2 & Sunny & High & Strong & No \\
3 & Overcast & High & Weak & Yes \\
4 & Rain & High & Weak & Yes \\
5 & Rain & Normal & Weak & Yes \\
6 & Rain & Normal & Strong & No \\
7 & Overcast & Normal & Strong & Yes \\
8 & Sunny & High & Weak & No \\
9 & Sunny & Normal & Weak & Yes \\
10 & Rain & Normal & Weak & Yes \\
11 & Sunny & Normal & Strong & Yes \\
12 & Overcast & High & Strong & Yes \\
13 & Overcast & Normal & Weak & Yes \\
14 & Rain & High & Strong & No \\
\bottomrule
\end{tabular}
\normalsize
\end{table}
\column{0.48\linewidth}
\centering
\includegraphics[height=3.2cm]{img/id3.png}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Decision Trees}
Decision trees have many parameters to control their size: maximum depth,
number of leaf, acceptable ratio in a node... They provide a strong
understandable tool for people not familiar with the ML field.
\\~\\
\textbf{Pros}
\begin{itemize}
\item can visually be understood
\item can be used with large datasets
\item no preprocessing needed (normalization of features)
\end{itemize}
\textbf{Cons}
\begin{itemize}
\item tend to easily overfit
\item poor generalization performance
\end{itemize}
\end{frame}
% SVM
\subsection{Support Vector Machine}
\begin{frame}
\frametitle{Support Vector Machine}
Support Vector Machine (SVM) is an algorithm used to solve
\textbf{classification problems}. The main goal is to find the \textbf{optimal
hyperplane}.\\~\\
\begin{columns}
\column{0.48\linewidth}
\centering
\includegraphics[height=4.2cm]{img/svm_goal.png}
\column{0.48\linewidth}
\begin{itemize}
\item ${H_1}$ is not correct
\item ${H_2}$ is correct, but not the best
\item ${H_3}$ is correct and optimal
\end{itemize}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{What is the optimal hyperplane?}
If the training data is linearly separable, an infinite number of correct
hyperplanes can be found. So we need to discriminate them to choose the
optimal one. We introduce the notion of \textbf{margin} as the distance
between the hyperplane and the closest points from training set.
\\~\\
The \textbf{optimal hyperplane} is the one with the \textbf{largest margin}.
The data points on the margin are called \textbf{support vectors}.
\begin{center}
\includegraphics[height=3.8cm]{img/svm_margin.png}
\end{center}
\end{frame}
\begin{frame}
\frametitle{Finding the optimal hyperplane}
\begin{columns}
\column{0.48\linewidth}
\parbox{\linewidth}{The equation of an hyperplane is:
\begin{equation}
\vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x}} - b = 0
\end{equation}
The equations of the hyperplanes representing the margins are:
\begin{equation}
\begin{cases}
\vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x}} - b = +1\\
\vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x}} - b = -1
\end{cases}
\end{equation}
}
The distance between the margins and the optimal hyperplane is:
\begin{equation}
\frac{1}{{\lvert \lvert w \rvert \rvert}^2}
\end{equation}
\column{0.48\linewidth}
\centering
\includegraphics[height=4.2cm]{img/svm_hyperplanes.png}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Finding the optimal hyperplane}
The main goal of SVM is to \textbf{maximize the margin}, this means to
\textbf{minimize the value of ${\lvert \lvert w \rvert \rvert}^2$}.
We also want to correctly classify the points w.r.t. the margin (i.e. no
points are between the two hyperplanes defining the margin). This means:
\begin{equation}
\begin{cases}
\vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x}} - b \geq +1
\text{ for labels = +1}\\
\vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x}} - b \leq -1
\text{ for labels = -1}
\end{cases}
\end{equation}
This can be rewritten as:
\begin{equation}
y \times (\vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x}} - b) \geq +1
\end{equation}
So the SVM solves (with the \textbf{Lagrangian multiplier} method):
\begin{equation}
\min_{w, b} {\lvert \lvert w \rvert \rvert}^2 \quad s.t. \quad
y_i \times (\vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x_i}} - b) \geq +1
\quad \forall ~i
\end{equation}
\end{frame}
\begin{frame}
\frametitle{Soft margin}
The assumption that our dataset is linearly separable is strong and not true
for most of the cases. We can create a new model that will find the optimal
hyperplane, but do not try to correctly classify every training point. We
introduce the \textbf{hinge loss} as:
\begin{equation}
\max (0, 1 - y_i \times
(\vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x_i}} - b))
\end{equation}
The problem to optimize becomes:
\begin{equation}
\min_{w, b} \left( {\lvert \lvert w \rvert \rvert}^2 + \lambda \sum_{i = 1}^n
\max (0, 1 - y_i \times
(\vec{\boldsymbol{w}} \cdot \vec{\boldsymbol{x_i}} - b)) \right)
\end{equation}
\end{frame}
\begin{frame}
\frametitle{Support Vector Machine}
SVM finds the optimal hyperplane by maximizing the margin. With hard margin,
no tuning is required but it does not work if the data is not linearly
separable. With soft margin, there is always a correct solution and the model
is more robust to outliers.
\\~\\
\textbf{Pros}
\begin{itemize}
\item work well on high or low dimension data
\item allow very complex boundaries (with kernel trick)
\end{itemize}
\textbf{Cons}
\begin{itemize}
\item do not scale well
\item require preprocessing and tuning
\item hard to inspect and understand
\end{itemize}
\end{frame}
% Neural nets
\subsection{Neural networks}
\begin{frame}
\frametitle{Neural Networks}
A neural network is a model used for either \textbf{classification} or
\textbf{regression} problems. The main unit of a neural network is called a
\textbf{neuron}:
\begin{center}
\includegraphics[height=2cm]{img/neuron.png}
\end{center}
Each neuron has a weight vector $w$ and a bias $b$. Its output is:
\begin{equation}
y = f(\boldsymbol{w} \cdot \boldsymbol{x} + b)
\end{equation}
\end{frame}
\begin{frame}
\frametitle{Neural Networks}
When we combine several neurons, we obtain a neural network:
\begin{center}
\includegraphics[height=5cm]{img/nn_full.png}
\end{center}
Each neuron from one layer is connect to each neurons from the following
layer. We call this the \textbf{fully connected} architecture.
\end{frame}
\begin{frame}
\frametitle{Training neural networks}
\begin{center}
\includegraphics[height=3cm]{img/nn_basic.png}
\end{center}
The goal is to be able to predict the output given the input. We can compute
the cost for one training example as the difference between the predicted
output and its label:
\begin{equation}
C_i = \frac{1}{2}
{\lvert \lvert \boldsymbol{y_i} - a^l(\boldsymbol{x_i}) \rvert \rvert}^2
\end{equation}
\end{frame}
\begin{frame}
\frametitle{Training neural networks}
We want to minimize the cost $C_i$. We can use the \textbf{gradient descent}
to do so. We compute the partial derivative of $C_i$ w.r.t. to the weights and
biases of the last layer.
\begin{equation}
\frac{\partial C_i}{\partial w_{jk}^l} \quad and \quad
\frac{\partial C_i}{\partial b_j^l}
\end{equation}
And then update the weights and biases with:
\begin{equation}
w_{jk}^l = w_{jk}^l - \lambda \times\frac{\partial C_i}{\partial w_{jk}^l}
\end{equation}
\begin{equation}
b_j^l = b_j^l - \lambda \times\frac{\partial C_i}{\partial b_j^l}
\end{equation}
\end{frame}
\begin{frame}
\frametitle{Training neural networks}
We repeat the same process on the layer before the last one, and again on the
layer before and again until the input layer. Since we are moving backward, we
are calling this the \textbf{backpropagation}.\\~\\
We repeat this process for each training examples. Once we have seen all
training examples, we call this an \textbf{epoch}. We repeat the backprogation
algorithm until a certain number of epoch is done, or until the network has a
good accuracy (or a cost below a threshold).
\end{frame}
\begin{frame}
\frametitle{Neural Networks}
Neural network is a really big subject and we only cover a fraction of it in
this course. Many more implementations exist (convolutional neural network,
deep network, recurrent neural network...). Neural networks have been proved
to be able to compute any function (so in theory, it can always predict the
right output given the input).
\\~\\
\textbf{Pros}
\begin{itemize}
\item can capture information in large datasets
\item can build complex models
\end{itemize}
\textbf{Cons}
\begin{itemize}
\item slow to train (if deep)
\item require preprocessing (homogeneous data)
\item hard to tune
\end{itemize}
\end{frame}
%--------------------------------
% U N S U P E R V I S E D -
%--------------------------------
\section{Unsupervised Learning}
% PCA
\subsection{Principal Component Analysis}
\begin{frame}
\frametitle{Representing data}
\begin{columns}
\column{0.48\linewidth}
\begin{figure}
\centering
\includegraphics[height=1.2cm]{img/pca_1dim.png}
\caption{1 dimension}
\end{figure}
\begin{figure}
\centering
\includegraphics[height=3.2cm]{img/pca_2dim.png}
\caption{2 dimensions}
\end{figure}
\column{0.48\linewidth}
\begin{figure}
\centering
\includegraphics[height=3.2cm]{img/pca_3dim.png}
\caption{3 dimensions}
\end{figure}
\parbox{\linewidth}{
\centering
4 dimensions ?\\
$n$ dimensions ?}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Reducing the dimension}
To visualize data, we need to project it into a 1, 2 or 3 dimensional space
(because we can not draw a representation if there are more than 3
dimensions). So we \textbf{need to reduce the number of dimensions}. One way
is to remove unimportant dimensions.\\~\\