% report.tex -- Machine Learning Decision Trees coursework report (MachineLearningCourse, assignment 1)
\documentclass[a4paper,12pt,oneside,final]{report}
%\usepackage{geometry}                % See geometry.pdf to learn the layout options. There are lots.
%\geometry{landscape}                % Activate for for rotated page geometry
%\usepackage[parfill]{parskip}    % Activate to begin paragraphs with an empty line rather than an indent
\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{epstopdf}
\usepackage[utf8]{inputenc}
\usepackage{titlesec}
\usepackage[titletoc]{appendix}
\usepackage[section,subsection,subsubsection]{extraplacesins}
\titleformat{\chapter}[hang]{\bf\Huge}{\thechapter}{1cm}{}

\pagestyle{plain}
% -------------------- this stuff for code --------------------

\usepackage{anysize}
\marginsize{30mm}{30mm}{20mm}{20mm}

% `formal' environment: a framed block. The frame command is assembled from a
% 1pt horizontal offset, a blue 2pt vertical rule, a 4pt vertical rule in the
% colour `formalshade', and a \colorbox in that same colour; the contents are
% placed in an adjustwidth environment with a 7pt right inset.
% NOTE(review): \MakeFramed, \FrameRestore and \endMakeFramed, the adjustwidth
% environment, and the colour `formalshade' are not defined or loaded anywhere
% in the visible preamble (presumably they come from the framed and changepage
% packages plus a \definecolor -- TODO confirm). The environment is not used in
% the visible part of this file, so the missing pieces would only surface on
% first use.
\newenvironment{formal}{%
  \def\FrameCommand{%
    \hspace{1pt}%
    {\color{blue}\vrule width 2pt}%
    {\color{formalshade}\vrule width 4pt}%
    \colorbox{formalshade}%
  }%
  \MakeFramed{\advance\hsize-\width\FrameRestore}%
  \noindent\hspace{-4.55pt}% disable indenting first paragraph
  \begin{adjustwidth}{}{7pt}%
  \vspace{2pt}\vspace{2pt}%
}
{%
  \vspace{2pt}\end{adjustwidth}\endMakeFramed%
}

% changemargin{<left>}{<right>}: typesets its contents as the single item of a
% label-less list whose left and right margins are <left> and <right>.
% Negative lengths widen the text block (used below to let wide figures run
% into the page margins).
\newenvironment{changemargin}[2]{%
  \begin{list}{}{%
    % Horizontal geometry: the margins are exactly the two arguments.
    \setlength{\leftmargin}{#1}%
    \setlength{\rightmargin}{#2}%
    % Vertical spacing: no extra space around the list, slightly stretchable
    % space between paragraphs.
    \setlength{\topsep}{0pt}%
    \setlength{\parsep}{0pt plus 1pt}%
    % Keep the surrounding paragraph indentation for first and later paragraphs.
    \setlength{\listparindent}{\parindent}%
    \setlength{\itemindent}{\parindent}%
  }%
  \item
}{%
  \end{list}%
}

\usepackage{color}
\usepackage{dsfont}
\usepackage[bitstream-charter]{mathdesign}
\usepackage[scaled]{helvet}
\usepackage{inconsolata}


\definecolor{colKeys}{rgb}{0,0,0.9}
\definecolor{colIdentifier}{rgb}{0,0,0}
\definecolor{colString}{rgb}{0.7,0,0}
\definecolor{colComments}{rgb}{0,0.6,0}
\usepackage{listings}
% Global defaults for all lstlisting environments (individual listings may
% override keys through their optional argument).
\lstset{
  % Colours for strings, keywords, identifiers and comments (defined above).
  stringstyle=\color{colString},
  keywordstyle=\color{colKeys},
  identifierstyle=\color{colIdentifier},
  commentstyle=\color{colComments},
  % Line numbers on the left, tabs expanded to 4 columns.
  numbers=left,
  tabsize=4,
  % Single-line frame around the listing; long lines are wrapped.
  frame=single,
  breaklines=true,
  % Fonts for the code itself and for the line numbers.
  basicstyle=\small\ttfamily,
  numberstyle=\tiny\ttfamily,
  % Frame/listing horizontal offsets.
  framexleftmargin=0mm,
  xleftmargin=7mm,
  xrightmargin=7mm,
  % Round all four frame corners; captions go below the listing.
  frameround={tttt},
  captionpos=b
}

%% Headers and footers
\usepackage{fancyhdr}
\usepackage[section]{placeins}
% Page style: fancyhdr headers and footers for ordinary pages, plus a
% rule-less, header-less variant of `plain' for chapter-opening pages.
\pagestyle{fancy}
\fancyhf{} % clear all header and footer fields
% Widen the header/footer by 60pt in total (two 30pt steps, as in the original).
\addtolength{\headwidth}{30pt}
\addtolength{\headwidth}{30pt}
\renewcommand{\headrulewidth}{0.4pt} % thickness of the header line
\renewcommand{\footrulewidth}{0.4pt} % thickness of the footer line
\renewcommand{\chaptermark}[1]{\markboth{#1}{#1}} % chapter name
\renewcommand{\sectionmark}[1]{\markright{\thesection\ #1}}  % section name
% Headers: the optional argument is for even pages, the mandatory one for odd
% pages; \fancyplain{}{...} leaves the field empty on `plain' pages.
% \bfseries replaces the obsolete two-letter font switch \bf.
\lhead[\fancyplain{}{\bfseries\thepage}]{\fancyplain{}{\bfseries\rightmark}} % display header
\rhead[\fancyplain{}{\bfseries\leftmark}]{\fancyplain{}{}} % display header
\fancyfoot[C]{\bfseries\thepage} % display footer (page number)
\fancyfoot[R]{\bfseries\today} % display footer (date)
% `plain' pages: no header text and no header rule. The line ends are
% protected with % so the style cannot inject stray spaces when it is applied.
\fancypagestyle{plain}{%
	\fancyhead{}\renewcommand{\headrulewidth}{0pt}%
}
% Start a new page and move to the next right-hand page, using the `plain'
% style for any blank page produced on the way.
\newcommand{\clearemptydoublepage}{\newpage{\pagestyle{plain}\cleardoublepage}}

\usepackage[T1]{fontenc}
\usepackage{enumerate}
\usepackage{afterpage,lastpage,fancyhdr}
\usepackage[includeheadfoot,margin=2.5cm]{geometry}
\geometry{letterpaper}                   % ... or a4paper or a5paper or ...

% -------------------- end of code stuff --------------------


% Graphics rule for files with extension .tif: handle them as type png, read
% the image from the corresponding .png file, and produce that file by running
% the back-quoted shell command, which calls `convert' to write
% <dirname>/<basename>.png next to the original .tif.
% NOTE(review): presumably this needs ImageMagick's `convert' on the PATH and
% shell escape enabled in the TeX engine -- confirm before relying on it.
\DeclareGraphicsRule{.tif}{png}{.png}{`convert #1 `dirname #1`/`basename #1 .tif`.png}

%\title{Machine Learning Decision Trees Coursework}
%\author{Jean Kossaifi (jk712, a5) \\Sedef Ozlen (so512, s5) \\Paul Gribelyuk (pg1312, a5) \\Romain Brault (rb812, a5)}
%\date{}                                           % Activate to display a given date or no date

% Custom title page. \makeatletter stays in force after this block; the
% matching \makeatother follows the \author declaration further down.
\makeatletter
% Horizontal fill made of a 1pt-high rule (not used by the title page below).
\def\thickhrulefill{\leavevmode \leaders \hrule height 1pt\hfill \kern \z@}
% \maketitle: institution line, title, logo, authors, then the date above a
% thin rule, all on a dedicated titlepage. Reads \@title, \@author and \@date
% as set by \title, \author and \date.
% Alignment uses the declarations \centering / \raggedright rather than the
% environment-opening commands \center / \flushright / \flushleft, which were
% previously started without ever being closed.
\renewcommand{\maketitle}{%
    \begin{titlepage}
    \let\footnotesize\small \let\footnoterule\relax \parindent \z@ \reset@font
    \null\vfil
    \vspace{-20mm}
    \begin{center}
    {\small \scshape Imperial College London}
    \end{center}
    \vspace{0.5cm}
	\begin{minipage}{\textwidth}
		\vspace{1cm}
		%\noindent\rule[0ex]{\textwidth}{4pt} \\
		\centering
		\@title
		\\ \vspace{4mm}
		%\noindent\rule[0ex]{\textwidth}{4pt} \\
	\end{minipage}
	\vspace{2cm}
	\begin{center}
		\includegraphics[width=70mm]{logo_imperial_college_london.png}
	\end{center}
	\vspace{5.4cm}
	\vspace{\stretch{1}}
	\begin{minipage}{\textwidth}
		\vspace{7mm}
		\centering
		\@author\\
	\end{minipage}
	\vspace{20mm}
		% End the paragraph holding the author box before switching to
		% left alignment (the old \flushleft did this implicitly).
		\par\raggedright
		{\small \scshape \@date }
		\vspace{0.1cm}
		\rule{\linewidth}{.5pt}
  \end{titlepage}
  % NOTE(review): the standard classes reset the footnote counter to 0 after
  % the title; 1 is kept here as in the original -- confirm it is intended.
  \setcounter{footnote}{1}
  % The page after the title page is numbered 2.
  \setcounter{page}{2}
}


\author{Jean Kossaifi (jk712, a5) \\Sedef Ozlen (so512, s5) \\Paul Gribelyuk (pg1312, a5) \\Romain Brault (rb812, a5)}
\makeatother
\title{\Huge Machine Learning Decision Trees Coursework}
\date{\today}


\begin{document}
\maketitle
\tableofcontents
\listoffigures
\chapter{Building the decision trees}
\paragraph{}
We implemented the decision tree algorithm via a modular, object-oriented approach to allow flexibility when testing and modifying the algorithm.  This also allowed us to split the work between the team members; we used GitHub for version control.
\paragraph{}
The workhorse class responsible for representing the tree is defined recursively as a \emph{tnode}, which has ``kids'', i.e.\ child nodes, which, in turn, represent subtrees of the tree.  We store the information gain and the depth of the node in the tree, which we later use in the decision-making algorithm when the tree classifier gives ambiguous results.  The project is split up into the following files:
\begin{itemize}
\item[-] \emph{ID3Driver.m}: This function takes as inputs (i) examples, (ii) attributes (as column indices), (iii) targets, (iv) a decision function, (v) the number of folds to use for validation, and (vi) a threshold value to avoid noisy data (when it is 1, it is not used), and returns the confusion matrix, $F_{\alpha}$, a classification rate, and the six binary trees (as an array of nodes).
\item[-] \emph{ID3.m}: This is the algorithm which learns the binary tree by processing the examples until leaf nodes are reached.
\item[-] \emph{tnode.m}: This is the object representation of a node in a tree.  It can have an arbitrary number of child nodes, as well as properties such as ig (information gain), op (node label), and indop (index into the attribute array that this node has split the data on).
\item[-] \emph{ChooseBestDecisionAttribute.m}: This function returns the attribute that splits the data along the lines of the maximal information gain.
\item[-] \emph{trainer.m}: This method is responsible for producing the 6 trees given a set of examples, attributes, and targets.
\item[-] \emph{binaryFromMultiple.m}: Simple method which takes in target data with multiple classifications and converts it to a binary classification.
\item[-] \emph{classify.m}: Given some trees, example data, and a decision function, this method returns an array of predicted classifications.
\item[-] \emph{igClassify.m}: Our decision function which classifies the example data based on the sum of the information gains down a tree when an ambiguous situation arises (i.e. when either more than one tree returns a positive answer, or when all trees return a negative answer).
\item[-] \emph{depthClassify.m}: Another decision function we devised which resolves ambiguities based on shortest depth in the case of more than one positive answer, and longest depth in the case of all negative answers.
\item[-] \emph{NFoldValidationMask.m}: This is a helper function which returns 2 masks of ``count'' data points, for training and testing ``N'' times.
\item[-] \emph{ConfusionMatrix.m}: This function takes in predicted and actual target data and returns a confusion matrix.
\item[-] \emph{tune.m}: This function finds the best parameter $\beta$ for the improved algorithm using cross-validation (see Appendix~\ref{ch:improvedalgo}).
\end{itemize}
\section{Code flowchart}
\begin{figure}[p]
\begin{changemargin}{-50mm}{-50mm}
\center
\includegraphics[width=0.6\linewidth,height=0.97\textheight]{Main_flowchart.jpg}
\caption{Flowchart of Decision Tree implementation.}
\end{changemargin}
\end{figure}


\section{Tree Diagrams}
\paragraph{}
The six following figures have been obtained after training on the whole clean emotion dataset.
\label{ch:trees}
\begin{figure}[!h]
\begin{changemargin}{-20mm}{-20mm}
\center
\includegraphics[scale=1]{happiness.jpg}
\caption{Happiness Tree.}
\end{changemargin}
\end{figure}

\begin{figure}[h]
\begin{changemargin}{-20mm}{-20mm}
\begin{center}
\includegraphics[scale=0.6]{anger.jpg}
\end{center}
\caption{Anger Tree.}
\end{changemargin}
\end{figure}

\begin{figure}[!h]
\begin{changemargin}{-20mm}{-20mm}
\center
\includegraphics[scale=0.6]{disgust.jpg}
\caption{Disgust Tree.}
\end{changemargin}
\end{figure}

\begin{figure}[!h]
\center
\includegraphics[scale=0.6]{fear.jpg}
\caption{Fear Tree.}
\end{figure}


\begin{figure}[!h]
\center
\includegraphics[scale=0.6]{sadness.jpg}
\caption{Sadness Tree.}
\end{figure}

\begin{figure}[!h]
\begin{changemargin}{-20mm}{-20mm}
\center
\includegraphics[scale=1]{surprise.jpg}
\caption{Surprise Tree.}
\end{changemargin}
\end{figure}
\FloatBarrier

\chapter{Resolving Ambiguity of Tree Selection}
\label{ch:ambig}
\paragraph{}
The ambiguity about deciding which binary decision tree to use arises whenever more than one tree returns a positive result or when all trees return a negative result (otherwise, the unique tree which returns a positive classification can be used to make the overall classification).  We first decided to handle these cases based on the aggregate information gain accumulated (additively) when walking down the tree in search of the result.  Our reasoning was that the tree which most clearly divides the data (and thus has the highest cumulative information gain) is more likely to be classifying the emotion correctly.

\paragraph{}
Our second approach was to resolve the ambiguity by selecting the shortest path which gave a positive result among all positive results. If two paths are of the same length, then the selection resorts to a random one.  If no trees return a positive result, then we select the one which took the longest to traverse to our ``no'' answer.  Our rationale for this is that we can have the most confidence in the shortest path, whereas the longest one should result in the most error-prone answer, thus a ``no'' is more likely to be a ``yes''.  The longer path of a tree is likely the result of noise or overfitting, thus we ascribe less confidence to its prediction.  This turned out to have a better effect than randomly selecting a tree (we obtained $\mathit{classRate} \approx 72\%$ randomly and $\mathit{classRate} \approx 74\%$ via this depth approach).
\paragraph{}
One of the advantages of the depth-based classification decision is that we can avoid the extra overhead of storing data (i.e., the information gain) at each node in each tree while we are building it.  The shortest path approach produces better results because the longer the tree is, the greater the risk of overfitting. Getting the shortest path could help produce the correct result when there is some noise further down the tree. For example, if happiness is described by AU12, AU6, and AU25, then the addition of another action unit to the path could produce a false negative in the answer; therefore, the longer path is more prone to mistakes. The drawback of the information gain method is that, since it sums the information gains along the path, we get to add the information gains defined by AU12, AU6 and AU25, which would produce a large information gain, leading to an erroneous result. Ultimately, these approaches work in a similar way, but the shortest path has more predictive power.
\paragraph{}
Lastly, for our third approach, we looked at creating what we called a \emph{model example} for each classification.  Specifically, once we have built the trees using ID3, we averaged those examples which unambiguously classified for each emotion.  We found the top $M$ (a parameter to be tuned\footnote{In our code, we used 4.}) attributes for each model example and calculated a distance between each incoming example to be classified and that model example.  The idea behind this approach was that there are only a few main action units that could adequately help to classify an emotion, and the rest are more likely to cause noise and create a misclassification.  In the cases where an ambiguity arose, the algorithm chose the classification corresponding to the minimal distance of that example from that model example.  This yielded very good results ($F_{\alpha}\approx 77\%$ and $\mathit{classRate} \approx 80\%$), but it deserves further study to verify its superiority to our other methods.

\section{Results}
\subsection{Depth classifier}
\paragraph{}
To obtain the results, we run the following code:
\newline
\begin{lstlisting}[language=MATLAB, frame=single]
[cm, err, cr, tr] = ID3Driver(x, 1:45, y, @depthClassify, 10, 1);
\end{lstlisting}
\paragraph{}
The confusion matrix received for a random run of the 10-fold cross-validation is as follows (since we randomly shuffle our array for cross-validation, we seeded the random number generator at 50000 in line 3 of \emph{ID3Driver.m} to create reproducible results):
\[
cm = \left[\begin{array}{cccccc}
9.2000  &  1.7000  &  0.4000  &  0.4000 &   1.2000    &0.2000 \\
    2.2000   &15.3000&    0.3000&    1.0000&    0.7000 &    0.3000 \\
    1.4000    &1.0000    &8.3000    &0.1000    &0.4000   & 0.7000 \\
    0.4000    &1.1000    &0.3000   &18.1000   & 0.8000   & 0.8000 \\
    1.9000    &2.2000    &1.1000    &0.5000    &7.0000    &0.4000 \\
    0.7000    &0.5000    &1.8000    &0.3000    &1.0000   &16.3000 \\

\end{array}
\right]
\]
\paragraph{}
The rows of the confusion matrix represent the actual classifications, and the columns represent the predicted classifications for each of the emotions.  The entries are the averages of the confusion matrices obtained for each fold of the 10-fold cross-validation process. It seems that our trees have classified most correctly for the 4th emotion, which is happiness. Our average classification rate is $74.20\%$. This rate is found by dividing the number of times our classifier guessed correctly by the total number of test examples, averaged over all 10 folds, which gives the percentage of correct classifications. Subtracting this value from 1 gives the error rate. Our classifier can therefore be expected to classify newly encountered test data with a classification rate of approximately $74.2\%$.
\paragraph{}
Recall Rates:
\[
recall = \left[\begin{array}{cccccc}
70.2290   &77.2727  & 69.7479  & 84.1860 &  53.4351&   79.1262 \\

\end{array}
\right]
\]
\paragraph{}
Precision Rates:
\[
precision = \left[\begin{array}{cccccc}
58.2278 &  70.1835  & 68.0328  & 88.7255  & 63.0631 &  87.1658 \\
\end{array}
\right]
\]
\paragraph{}
The $F_{1}$ values according to these recall and precision rates are:
\[
F_{1} = \left[\begin{array}{cccccc}
63.6678 &  73.5577 &  68.8797 &  86.3962 &  57.8512 &  82.9517 \\
\end{array}
\right]
\]
\paragraph{}
As expected, emotion 4 was the best classified emotion, and we achieved the highest rates for both recall and precision.
\subsection{Information gain classifier}
\paragraph{}
For the information gain classifier we run the following code :
\newline
\begin{lstlisting}[language=MATLAB, frame=single]
[cm, err, cr, tr] = ID3Driver(x, 1:45, y, @igClassify, 10, 1);
\end{lstlisting}
\paragraph{}
We find the confusion matrix :
\[
cm = \left[\begin{array}{cccccc}
    8.0000  &  1.1000  &  0.9000  &  0.6000  &  1.7000  &  0.8000 \\
    1.9000  & 13.8000  &  1.2000  &  1.0000  &  1.4000  &  0.5000 \\
    0.7000  &  0.3000  &  8.1000  &  0.5000  &  1.6000  &  0.7000 \\
    1.2000  &  1.0000  &  0.2000  & 16.6000  &  1.7000  &  0.8000 \\
    2.4000  &  1.2000  &  1.6000  &  0.8000  &  6.8000  &  0.3000 \\
    0.7000  &  0.5000  &  0.8000  &  0.6000  &  1.5000  & 16.5000 \\

\end{array}
\right]
\]
\paragraph{}
Recall Rates:
\[
recall = \left[\begin{array}{cccccc}
61.0687 &  69.6970  & 68.0672 &  77.2093 &  51.9084  & 80.0971 \\
\end{array}
\right]
\]
\paragraph{}
Precision Rates:
\[
precision = \left[\begin{array}{cccccc}
 53.6913  & 77.0950 &  63.2812 &  82.5871  & 46.2585  & 84.1837 \\
\end{array}
\right]
\]
\paragraph{}
And $F_{1}$ values:
\[
F_{1} = \left[\begin{array}{cccccc}
57.1429  & 73.2095 &  65.5870  & 79.8077  & 48.9209  & 82.0896 \\
\end{array}
\right]
\]
\paragraph{}
Additionally, the classification rate is 69.8\%. We can see that the information gain classifier does not perform as well as the depth classifier, as explained in Chapter~\ref{ch:ambig}.


\chapter{Analysis of Tree Pruning}
\paragraph{}
The \textit{pruning\_example} function starts by training a classification tree with the function \textit{treefit} on the dataset, using example data X and a target vector Y. The tree is then tested with \textit{treetest} by measuring the error in two different ways:
\begin{itemize}
\item With 10-fold cross-validation. \textit{treetest} returns a cost vector corresponding to the cross-validation error along the optimal pruning sequence for the tree. It also returns a vector containing the number of terminal nodes of the tree for each cost, and the best level of pruning (e.g., the first minimum of the cost vector).
\item With a resubstitution error. Again, the algorithm computes the cost vector along the optimal pruning sequence. The cost of the subtree is the sum over all leaves of the probability of reaching the node times the node cost, the node cost being the sum of all misclassified data\footnote{We set the error to 1 for a misclassified element, 0 otherwise.}.
The function also returns the vector of costs, the vector of terminal nodes along the optimal pruning sequence and the best level of pruning.
\end{itemize}
Once the tree has been tested with the two error measures described above, two pruned trees are created thanks to the best level of pruning computed.
\paragraph{}
Finally, the \textit{pruning\_example} function plots the two cost vectors computed with \textit{treetest} as a function of the number of terminal nodes. Figure~\ref{fig:pruning} shows the plot obtained with \textit{pruning\_example} on the clean emotions database.
The red curve, representing the resubstitution error, keeps decreasing until it reaches zero with about 200 terminal nodes. At this stage the classification tree classifies the training set perfectly, since a greater number of nodes allows greater overfitting of the model to the training data. On the other hand, the blue curve, corresponding to the cross-validation error, stops decreasing once the tree has approximately 25 terminal nodes. This behaviour is caused by the cross-validation error measure, since in cross-validation the tree is trained on a random subset of the whole data (9/10) and tested on the remaining part (1/10). As a result, overfitting the training subset does not reduce the error on the held-out part, and the error remains constant, even rising gradually.

\begin{figure}[!h]
\center
\includegraphics{pruning_example_clean.png}
\caption[Pruning example on clean data.]{Cost computed by cross-validation (blue curve) and resubstitution (red curve) on the clean emotion dataset depending on the number of terminal nodes. \label{fig:pruning}}
\end{figure}
\FloatBarrier
\paragraph{}
Thus the decreasing trend of the red curve while the blue curve slightly increases is a typical indicator of overfitting. The tree is predicting the training set perfectly, but is not able to generalize on the test set (out of sample). It is good practice to prune the tree or stop learning when the cross-validation error stops decreasing (blue box in Figure~\ref{fig:pruning}). On the clean data, the cross-validation error remains constant after 25 terminal nodes. As a result, one could believe that it is not dangerous to overfit. However, the same study on the noisy dataset (Figure~\ref{fig:pruning_noisy}) shows that with ``real life'' data, overfitting can lead to an increase of the cross-validation error.
\begin{figure}[!h]
\center
\includegraphics{pruning_example_noisy.png}
\caption[Pruning example on noisy data.]{Cost computed by cross-validation (blue curve) and resubstitution (red curve) on the noisy emotion dataset depending on the number of terminal nodes. \label{fig:pruning_noisy}}
\end{figure}

\begin{appendices}
\chapter{Algorithm Improvement}
\section{Presentation of the improvement}
\paragraph{}
\label{ch:improvedalgo}
We briefly looked at modifying the ID3 algorithm to create a more sensible stopping point for the tree. The idea is to tolerate a small error, which can be due to noise or outliers, while building the tree. Specifically, we assigned a parameter $0.75 \leq \beta\leq 1$ to stand for the threshold of the percentage of the binary target data being classified ``yes'' or ``no''.  If the threshold was breached (i.e., more than a fraction $\beta$ of the targets classify one way), we assign a leaf node rather than proceeding further. Obviously, when choosing $\beta=1$ the improved algorithm is equivalent to the base algorithm. This modified ID3 algorithm showed a significant improvement, bumping each of the other decision functions a few percentage points.  We tuned the $\beta$ parameter via cross-validation and found that $\beta = 0.98$ produces optimal results for the depth classifier and $\beta=0.97$ the best results for the information gain classifier. In addition to a better classification rate, this improvement also reduces the computation time. Ideally we should find a different $\beta$ for each tree and classifier function to obtain an optimal improvement of the $F_1$ value and classification rate. However, finding a different $\beta$ for each tree would dramatically increase the computation time of the tune function, which is already very slow. As a result, to simplify the problem and save computation time, we have decided to tune $\beta$ only for each classifier function, which already shows improvement for both classifiers.

\section{Results}
\subsection{Depth classifier}
We have the following results for $\beta=0.98$ :
\[
cm = \left[\begin{array}{cccccc}
    9.3000  &  1.4000  &  0.4000  &  0.2000  &  1.6000  &  0.2000 \\
    1.1000  & 16.4000  &  0.1000  &  0.9000  &  0.9000  &  0.4000 \\
    0.8000  &  0.9000  &  8.4000  &  0.2000  &  0.4000  &  1.2000 \\
    0.2000  &  1.2000  &  0.1000  & 18.9000  &  0.6000  &  0.5000 \\
    1.5000  &  2.6000  &  0.4000  &  0.5000  &  7.5000  &  0.6000 \\
    0.2000  &  0.7000  &  0.9000  &  0.4000  &  0.9000  & 17.5000 \\
\end{array}
\right]
\]
\paragraph{}
Recall Rates:
\[
recall = \left[\begin{array}{cccccc}
70.9924 &  82.8283  & 70.5882 &  87.9070 &  57.2519  & 84.9515\\
\end{array}
\right]
\]
\paragraph{}
Precision Rates:
\[
precision = \left[\begin{array}{cccccc}
70.9924 &  70.6897  & 81.5534  & 89.5735 &  63.0252 &  85.7843 \\
\end{array}
\right]
\]
\paragraph{}
$F_{1}$ values:
\[
F_{1} = \left[\begin{array}{cccccc}
 70.9924  & 76.2791 &  75.6757  & 88.7324  & 60.0000  & 85.3659\\
\end{array}
\right]
\]
\paragraph{}
Classification rate : 78.0\%.
\subsection{Information gain classifier}
We have computed the following confusion matrix for $\beta=0.97$ :
\[
cm = \left[\begin{array}{cccccc}
    8.4000  &  0.9000  &  1.1000  &  0.5000  &  1.7000  &  0.5000 \\
    2.2000  & 14.4000  &  0.6000  &  0.8000  &  1.2000  &  0.6000 \\
    0.7000  &  0.2000  &  7.6000  &  0.5000  &  2.1000  &  0.8000 \\
    0.9000  &  0.9000  &  0.3000  & 16.6000  &  2.4000  &  0.4000 \\
    2.4000  &  1.1000  &  1.3000  &  0.7000  &  7.3000  &  0.3000 \\
    0.5000  &  0.4000  &  0.8000  &  0.5000  &  1.9000  & 16.5000 \\
\end{array}
\right]
\]
\paragraph{}
Recall Rates:
\[
recall = \left[\begin{array}{cccccc}
 64.1221 &  72.7273 &  63.8655 &  77.2093 &  55.7252 &  80.0971\\
\end{array}
\right]
\]
\paragraph{}
Precision Rates:
\[
precision = \left[\begin{array}{cccccc}
55.6291  & 80.4469 &  64.9573  & 84.6939 &  43.9759 &  86.3874 \\
\end{array}
\right]
\]
\paragraph{}
$F_{1}$ values:
\[
F_{1} = \left[\begin{array}{cccccc}
59.5745  & 76.3926  & 64.4068  & 80.7786 &  49.1582 &  83.1234 \\
\end{array}
\right]
\]
\paragraph{}
Classification rate : 70.8\%.

\end{appendices}

%\chapter{Addendum}
%We briefly looked at modifying the ID3 algorithm to create a more sensible stopping point for the tree.  Specifically, we assigned a parameter $0.75 \leq \beta\leq 1$ to stand for the threshold of the percentage of the binary target data begin classified "yes" or "no".  If the threshold was breached (e.g. more than $\beta$ targets classify one way), we assign a leaf node rather than proceeding further.  This modified ID3 algorithm showed a significant improvement, bumping each of the other decision functions a few percentage points.  We tuned the $\beta$ parameter via cross-validation and found that $\beta = 0.98$ produces optimal results.  We wrote a script called \emph{tune.m} to search for the optimal $\beta$ using cross-validation.


\end{document}