174 lines
4.4 KiB
TeX
174 lines
4.4 KiB
TeX
\chapter{Unsupervised Learning}
|
|
|
|
\begin{definition}[Precision Medicine]
|
|
Design of treatment for a given patient, based on genomic data.
|
|
\end{definition}
|
|
|
|
\begin{definition}[Hierarchical clustering]
|
|
\end{definition}
|
|
|
|
Gene expression time series: look for genes with similar expression footprint.
|
|
|
|
\paragraph{Representation of data}
|
|
|
|
\begin{itemize}
|
|
\item Tables;
|
|
\item Trees / Graphs;
|
|
\item Time series...
|
|
\end{itemize}
|
|
|
|
\begin{figure}
|
|
\includestandalone{figures/plots/genes_expression_timeseries}
|
|
\caption{Example of gene expression time series}
|
|
\end{figure}
|
|
|
|
\section{Distances and Similarities}
|
|
|
|
\begin{property}[Distance]
|
|
\begin{description}
|
|
\item[non-negativity] $d(i, j) \geq 0$
|
|
\item[isolation] $d(i, i) = 0$
|
|
\item[symmetry] $d(i, j) = d(j, i)$
|
|
\item[triangular inequality] $d(i, j) \leq d(i, h) + d(h, j)$
|
|
\end{description}
|
|
\end{property}
|
|
|
|
\begin{definition}[Dissimilarity]
|
|
Distance without triangular inequality.
|
|
\end{definition}
|
|
|
|
\begin{definition}[Similarity]
|
|
Function $s$ from $X \times X$ to $\RR_+$ such that:
|
|
\begin{enumerate}
|
|
\item $s$ is symmetric: $(x, y) \in X \times X; s(x, y) = s(y, x)$
|
|
\item $(x, y) \in X \times X; s(x, x) = s(y, y) > s(x, y)$.
|
|
\end{enumerate}
|
|
\end{definition}
|
|
|
|
\begin{exercise}
|
|
|
|
Let $d(x, y)$ be the distance, $d(x, y) \in [0, +\infty[$.
|
|
|
|
What should be the similarity measure $S(x, y) = f(d(x, y))$ that satisfies the following property:
|
|
\[
|
|
(x, y) \in X \times X \: | \: S(x, y) > S(x, y)
|
|
\]
|
|
having $S(x, y) \leq M$, $S(x, y) \in ]0, M]$.
|
|
\end{exercise}
|
|
$d(x, y) \geq 0 \: \forall (x, y)$
|
|
\begin{equation}
|
|
S(x, y) = \frac{M}{d(x, y) + 1}
|
|
\label{eq:similarity-first}
|
|
\end{equation}
|
|
In \cref{eq:similarity-first}, $S(x, y)$ ranges from 0 to M.
|
|
\begin{eqnarray}
|
|
\lim_{n \to \infty} \frac{M}{n + 1} = 0 && \lim_{n \to 0} \frac{M}{n + 1} = M
|
|
\end{eqnarray}
|
|
|
|
|
|
\section{Data Representation}
|
|
|
|
\paragraph{Data matrix}
|
|
|
|
|
|
\paragraph{Distance matrix}
|
|
|
|
\[
|
|
\begin{bmatrix}
|
|
0 \\
|
|
d(2, 1) & 0 \\
|
|
d(3, 1) & d(3, 2) & 0 \\
|
|
\vdots & \vdots & \ddots \\
|
|
d(n, 1) & d(n,2) & \dots & \dots & 0
|
|
\end{bmatrix}
|
|
\]
|
|
|
|
|
|
\begin{table}
|
|
\centering
|
|
\begin{tabular}{c|cc}
|
|
&$s_{1}$ & $s_{2}$ \\
|
|
\hline
|
|
$p_{1}$ & 0 & 1 \\
|
|
$p_{2}$ & 1 & 0 \\
|
|
$p_{3}$ & 3 & 2 \\
|
|
\end{tabular}
|
|
\caption{Example data matrix: 2 symptoms for 3 patients.}
|
|
\end{table}
|
|
|
|
|
|
|
|
\begin{definition}[Minkowski distance]
|
|
\[
|
|
L_p (x, y) = \left(\abs{x_1 - y_1}^p + \abs{x_2 - y_2}^p + \ldots + \abs{x_d - y_d}^p\right)^{\sfrac{1}{p}} = \left(\sum_{i=1}^d \left(x_i - y_i\right)^p\right)^{\sfrac{1}{p}}
|
|
\]
|
|
where $p$ is a positive integer.
|
|
\end{definition}
|
|
|
|
\begin{definition}[Manhattan distance]
|
|
\[
|
|
L_1(x, y) = \sum_{i=1}^d \abs{x_i - y_i}
|
|
\]
|
|
\end{definition}
|
|
|
|
\begin{definition}[Euclidian distance]
|
|
Let $A$ and $B$ be two points, with $(x_{A}, y_{A})$ and $(x_{B}, y_{B})$ their respective coordinates,
|
|
\end{definition}
|
|
|
|
If $p=2$, $L_2$ is the Euclidian distance:
|
|
\begin{definition}[Euclidian distance]
|
|
\[
|
|
d(x, y) = \sqrt{\abs{x_1 - y_1}^2 + \abs{x_2 - y_2} + \ldots + \abs{x_d - y_d}^2}
|
|
\]
|
|
\end{definition}
|
|
|
|
We can add weights
|
|
|
|
\subsection{K-means}
|
|
|
|
The cost function is minimized:
|
|
\[
|
|
Cost(C) \sum_{i=1}^{k}...
|
|
\]
|
|
|
|
\begin{algorithm}[H]
|
|
Choose the number of clusters $k$.
|
|
|
|
Choose randomly $k$ means.
|
|
|
|
For each point, compute the distance between the point and each means.
|
|
We allocate the point to the cluster represented by the clostest center.
|
|
|
|
We set each means to the center of the cluster, and reiterate.
|
|
\caption{$K$-means algorithm}
|
|
\end{algorithm}
|
|
|
|
|
|
\begin{exercise}
|
|
We have six genes:
|
|
\begin{table}[H]
|
|
\centering
|
|
\begin{tabular}{ccccccc}
|
|
\toprule
|
|
& $g_{1}$ & $g_{2}$ & $g_{3}$ & $g_{4}$ & $g_{5}$ & $g_{6}$ \\
|
|
\midrule
|
|
$\times 10^{-2}$ & 10 & 12 & 9 & 15 & 17 & 18 \\
|
|
\bottomrule
|
|
\end{tabular}
|
|
\caption{Sample values for six gene expressions.}
|
|
\end{table}
|
|
|
|
With $k=2$ and $m_{1} = 10 \cdot 10^{-2}$ and $m_{2} = 9 \cdot 10^{-2}$ the two initial randomly chosen means, run the $k$-means algorithm.
|
|
\end{exercise}
|
|
|
|
\begin{figure}
|
|
\centering
|
|
\includegraphics[scale=1]{figures/trace/kmeans-trace.pdf}
|
|
\caption{$k$-means algorithm execution trace. The tables show the value of the centroids and the distance of each gene expression value to the centroid.}
|
|
\end{figure}
|
|
|
|
\begin{figure}
|
|
\centering
|
|
\includegraphics[scale=1]{figures/plots/kmeans.pdf}
|
|
\caption{$k$-means states at each of the 3 steps}
|
|
\end{figure} |