From 29dad16dfb895fe85cc6b09246f12680d034b0c5 Mon Sep 17 00:00:00 2001 From: Samuel Ortion Date: Fri, 13 Oct 2023 13:19:12 +0200 Subject: [PATCH] Moved part on linear algebra Add stuff on model validity --- .gitattributes | 7 +- content/chapters/include.tex | 2 +- content/chapters/part1/1.tex | 313 +++++++++++++++++- content/chapters/part1/2.tex | 220 +----------- content/chapters/part1/3.tex | 0 content/chapters/part2/0.tex | 2 + content/chapters/part2/1.tex | 220 ++++++++++++ content/introduction.tex | 12 +- definitions.tex | 4 + figures/plots/linear_regression.R | 26 ++ figures/plots/linear_regression_linear.pdf | 3 + .../plots/linear_regression_non_linear.pdf | 3 + figures/schemes/.gitattributes | 3 + figures/schemes/covariance.pdf | 3 + figures/schemes/covariance.tex | 35 ++ main.pdf | 4 +- 16 files changed, 627 insertions(+), 230 deletions(-) delete mode 100644 content/chapters/part1/3.tex create mode 100644 content/chapters/part2/0.tex create mode 100644 content/chapters/part2/1.tex create mode 100644 figures/plots/linear_regression.R create mode 100644 figures/plots/linear_regression_linear.pdf create mode 100644 figures/plots/linear_regression_non_linear.pdf create mode 100644 figures/schemes/.gitattributes create mode 100644 figures/schemes/covariance.pdf create mode 100644 figures/schemes/covariance.tex diff --git a/.gitattributes b/.gitattributes index b4a264a..59f2077 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,7 +1,2 @@ main.pdf filter=lfs diff=lfs merge=lfs -text -figures/schemes/regression_plan_3D.pdf filter=lfs diff=lfs merge=lfs -text -figures/schemes/vector_orthogonality.pdf filter=lfs diff=lfs merge=lfs -text -figures/schemes/base_plan.pdf filter=lfs diff=lfs merge=lfs -text -figures/schemes/coordinates_systems.pdf filter=lfs diff=lfs merge=lfs -text -figures/schemes/ordinary_least_squares.pdf filter=lfs diff=lfs merge=lfs -text -figures/schemes/orthogonal_projection.pdf filter=lfs diff=lfs merge=lfs -text +**/*.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/content/chapters/include.tex b/content/chapters/include.tex index 7922d7d..7d1c4c3 100755 --- a/content/chapters/include.tex +++ b/content/chapters/include.tex @@ -13,6 +13,6 @@ \includechapters{part1}{2} -% \includechapters{part2}{2} +\includechapters{part2}{2} % \includechapters{part3}{1} \ No newline at end of file diff --git a/content/chapters/part1/1.tex b/content/chapters/part1/1.tex index 8219935..7b29d3c 100644 --- a/content/chapters/part1/1.tex +++ b/content/chapters/part1/1.tex @@ -117,7 +117,7 @@ We want to minimize the distance between $\X\beta$ and $\Y$: \Rightarrow& \X \beta = proj^{(1, \X)} \Y\\ \Rightarrow& \forall v \in w,\, vy = v proj^w(y)\\ \Rightarrow& \forall i: \\ - & \X_i \Y = \X_i X\hat{\beta} \qquad \text{where $\hat{\beta}$ is the estimator of $\beta$} \\ + & \X_i \Y = \X_i \X\hat{\beta} \qquad \text{where $\hat{\beta}$ is the estimator of $\beta$} \\ \Rightarrow& \X^T \Y = \X^T \X \hat{\beta} \\ \Rightarrow& {\color{gray}(\X^T \X)^{-1}} \X^T \Y = {\color{gray}(\X^T \X)^{-1}} (\X^T\X) \hat{\beta} \\ \Rightarrow& \hat{\beta} = (\X^T\X)^{-1} \X^T \Y @@ -127,7 +127,7 @@ This formula comes from the orthogonal projection of $\Y$ on the vector subspace $\X \hat{\beta}$ is the closest point to $\Y$ in the subspace generated by $\X$. -If $H$ is the projection matrix of the subspace generated by $\X$, $X\Y$ is the projection on $\Y$ on this subspace, that corresponds to $\X\hat{\beta}$. 
+If $H$ is the projection matrix of the subspace generated by $\X$, $\X\Y$ is the projection on $\Y$ on this subspace, that corresponds to $\X\hat{\beta}$. \section{Sum of squares} @@ -253,6 +253,7 @@ Covariance is really sensitive to scale of variables. For instance, if we measur \begin{theorem}[Cochran Theorem (Consequence)] + \label{thm:cochran} Let $\mathbf{Z}$ be a gaussian vector: $\mathbf{Z} \sim \Norm_n(0_n, I_n)$. \begin{itemize} @@ -263,8 +264,29 @@ Covariance is really sensitive to scale of variables. For instance, if we measur \item If $Z_1, Z_2$ are orthogonal of $\mathbf{Z}$ on $V_1$ and $V_2$ i.e. $Z_1 = \Pi_{V_1}(\mathbf{Z}) = \Pi_1 \Y$ and $Z_2 = \Pi_{V_2} (\mathbf{Z}) = \Pi_2 \Y$... (\textcolor{red}{look to the slides}) \end{itemize} + + $Z_2 = \Pi_{V_1}(\Z)$ is the projection of $\Z$ on subspace $V_1$. + + \dots + + \end{theorem} +\begin{property}[Estimators properties in the linear model] + According to \autoref{thm:cochran}, + \[ + \hat{m} \text{ is independent from $\hat{\sigma}^2$} + \]\dots + \[ + \frac{\norm{\Y - \Pi_V(\Y)}^2}{...} \sim + \] + + $\hat{m} = \X \hat{\beta}$ + + $\hat{m}$ is the estimation of the mean. +\end{property} + + \begin{definition}[Chi 2 distribution] If $X_1, \ldots, X_n$ i.i.d. $\sim \Norm(0, 1)$, then;, \[ @@ -318,3 +340,290 @@ We can derive statistical test from these properties. \] where + +\paragraph{Estimation of $\sigma^2$} + +A biased estimator of $\sigma^2$ is: +\[ + \hat{\sigma^2} = ? +\] + +$S^2$ is the unbiased estimator of $\sigma^2$ +\begin{align*} + S^2 &= \frac{1}{n-q} \norm{\Y - \Pi_V(\Y)}^2 \\ + &= \frac{1}{n-q} \sum_{i=1}^n (Y_i - (\X\hat{\beta})_i)^2 +\end{align*} + +\begin{remark}[On $\hat{m}$] + \begin{align*} + &\Y = \X \beta + \varepsilon + \Leftrightarrow& \EE(\Y) = \X \beta + \end{align*} +\end{remark} + +\section{Student test of nullity of a parameter} + +Let $\beta_j$ be a parameter, the tested hypotheses are as follows: +\[ + \begin{cases} + (H_0): \beta_j = 0 \\ + (H_1): \beta_j \neq 0 + \end{cases} +\] + +Under the null hypothesis: +\[ + \frac{\hat{\beta}_j - \beta_j}{S \sqrt{(\X^T \X)^1_{j,j}}} \sim \St(n-q). +\] +The test statistic is: +\[ + W_n = \frac{\hat{\beta}_j}{S \sqrt{(\X^T\X)^{-1}_{j,j}}} \underset{H_0}{\sim} \St(n-q). +\] + +$\hat{\beta}$ is a multinormal vector. + +Let's consider a vector of 4 values: +\begin{align*} + \begin{pmatrix} + \hat{\beta}_0 \\ + \hat{\beta}_1 \\ + \hat{\beta}_2 \\ + \hat{\beta}_3 + \end{pmatrix} + \sim \Norm_4 \left( \begin{pmatrix} + \beta_0 \\ + \beta_1 \\ + \beta_2 \\ + \beta_3 + \end{pmatrix} ; + \sigma^2 \left(\X^T \X\right)^{-1} + \right) +\end{align*} + +Let $\M$ be the following model +\begin{align*} + Y_i &= \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \beta_3 X_{3i} + \varepsilon_i +\end{align*} + +Why can't we use the following model to test each of the parameters values (here for $X_2$)? +\[ + Y_i = \theta_0 + \theta_1 X_{2i} + \varepsilon_i +\] +We can't use such a model, we would probably meet a confounding factor: even if we are only interested in relationship $X_2$ with $Y$, we have to fit the whole model. + +\begin{example}[Confounding parameter] + Let $Y$ be a variable related to the lung cancer. Let $X_1$ be the smoking status, and $X_2$ the variable `alcohol' (for instance the quantity of alcohol drunk per week). + + If we only fit the model $\M: Y_i = \theta_0 + \theta_1 X_{2i} + \varepsilon_i$, we could conclude for a relationship between alcohol and lung cancer, because alcohol consumption and smoking is strongly related. 
If we had fit the model $\M: Y_i = \theta_0 + \theta_1 X_{1i} + \theta_2 X_{2i} + \varepsilon_i$, we would likely have found no significant relationship between $X_2$ and $Y$.
+\end{example}
+
+\begin{definition}[Student law]
+    Let $X$ and $Y$ be two independent random variables ($X \indep Y$) such that $X \sim \Norm(0, 1)$ and $Y \sim \chi_n^2$, then
+    \[
+        \frac{X}{\sqrt{Y/n}} \sim \St(n)
+    \]
+\end{definition}
+
+\subsection{Model comparison}
+
+\begin{definition}[Nested models]
+    A model $\M_{q'}$ is nested in a model $\M_q$ if $\M_{q'}$ can be obtained from $\M_q$ by setting some of its parameters to zero, i.e. if the subspace generated by the predictors of $\M_{q'}$ is included in the subspace generated by the predictors of $\M_q$.
+\end{definition}
+
+Let $\M_2$ and $\M_4$ be two models:
+
+$\M_2: Y_i = \beta_0 + \beta_3 X_{3i} + \varepsilon_i$
+
+$\M_4: Y_i = \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \beta_3 X_{3i} + \varepsilon_i$
+
+$\M_2$ is nested in $\M_4$.
+
+\paragraph*{Principle} We compare the residual variances of the two models, that is, the variance that is not explained by each model.
+
+The better the model, the smaller the residual variance.
+
+If everything were explained by the model, the residual variance would be zero.
+
+
+Here $\M_4$ holds all the information found in $\M_2$ plus additional information: in the worst case it is at least as good as $\M_2$.
+
+\subsection{Fisher $F$-test of model comparison}
+
+Let $\M_q$ and $\M_{q'}$ be two models such that $\dim(\M_q) = q$, $\dim(\M_{q'}) = q'$, $q > q'$ and $\M_{q'}$ is nested in $\M_q$.
+
+\paragraph{Tested hypotheses}
+\[
+\begin{cases}
+    (H_0): \M_{q'} \text{ is the proper model} \\
+    (H_1): \M_q \text{ is a better model}
+\end{cases}
+\]
+
+\begin{description}
+    \item[ESS] Explained (extra) Sum of Squares
+    \item[RSS] Residual Sum of Squares
+    \item[EMS] Explained Mean Square
+    \item[RMS] Residual Mean Square
+\end{description}
+
+\[
+    ESS = RSS(\M_{q'}) - RSS(\M_q)
+\]
+\[
+    RSS(\M) = \norm{\Y - \X\hat{\beta}}^2 = \sum_{i=1}^n \hat{\varepsilon}_i^2
+\]
+\[
+    EMS = \frac{ESS}{q - q'}
+\]
+\[
+    RMS = \frac{RSS(\M_q)}{n-q}
+\]
+
+Under the null hypothesis:
+\[
+    F = \frac{EMS}{RMS} \underset{H_0}{\sim} \Fish(q-q'; n-q)
+\]
+
+\section{Model validity}
+
+Assumptions:
+\begin{itemize}
+    \item $\X$ is a full rank matrix;
+    \item Residuals are i.i.d. $\varepsilon \sim \Norm(0_n, \sigma^2 \mathcal{I}_n)$.
+\end{itemize}
+
+We also have to look for influential observations.
+
+
+\subsection{$\X$ is full rank}
+
+To check that the rank of the matrix is $p+1$, we can compute the eigenvalues of the correlation matrix of $\X$. If there is a perfect linear relationship between two variables (two columns of $\X$), one of the eigenvalues will be zero. In practice, we never get an exactly zero eigenvalue, so we consider the condition index $\kappa = \frac{\lambda_1}{\lambda_p}$, the ratio between the largest and the smallest eigenvalues, with $\lambda_1 \geq \lambda_2 \geq \ldots \geq \lambda_p$ the eigenvalues: a large condition index indicates near-collinearity.
+
+
+If all eigenvalues are different from 0, $\X^T \X$ can be inverted, but when some of them are close to 0 the variance of the estimated parameters will be large, so the estimation of the parameters will not be reliable.
+
+\paragraph{Variance Inflation Factor}
+
+Perform a regression of each of the predictors against the other predictors.
+
+If there is a strong linear relationship between one predictor and the others, the coefficient of determination $R^2_j$ of this auxiliary regression (the proportion of variance explained by the other predictors) will be close to 1.
+
+We do this for every predictor; for predictor $j = 1, \ldots, p$, the variance inflation factor is:
+\[
+    VIF_j = \frac{1}{1-R^2_j}.
+\]
+
+\subparagraph*{Rule}
+If $VIF_j > 10$ (some references use a higher threshold, e.g. $100$), multicollinearity is suspected\dots
+
+
+In case of multicollinearity, we have to remove the variables one by one until there is no longer multicollinearity.
+Variables have to be removed based on statistical results and through discussion with the experimenters.
+
+
+\subsection{Residuals analysis}
+
+\paragraph*{Assumption}
+\[
+    \varepsilon \sim \Norm_n(0_n, \sigma^2 I_n)
+\]
+
+\paragraph{Normality of the residuals} If $\varepsilon_i$ ($i=1, \ldots, n$) could be observed, we could build a QQ-plot of $\varepsilon_i / \sigma$ against the quantiles of $\Norm(0, 1)$.
+
+Only the residual errors $\hat{e}_i$ can be observed.
+
+Let $e_i^*$ be the studentized residual, considered as an estimator of $\varepsilon_i$:
+
+\[
+    e_i^* = \frac{\hat{e}_i}{\sqrt{\hat{\sigma}^2_{(i)} (1-H_{ii})}}
+\]
+
+\begin{align*}
+    \hat{\Y} &= \X \hat{\beta} \\
+    &= \X \left( (\X^T\X)^{-1} \X^T \Y\right) \\
+    &= \underbrace{\X (\X^T\X)^{-1} \X^T}_{H} \Y
+\end{align*}
+
+\paragraph{Centered residuals} If $(1, \ldots, 1)^T$ belongs to the subspace generated by $\X$, the residuals are centered by construction.
+
+\paragraph{Independence} We do not have a statistical test for independence in R; we plot the residuals $e$ against the fitted values $\X \hat{\beta}$.
+
+\paragraph{Homoscedasticity} Plot $\sqrt{|e^*|}$ against the fitted values $\X \hat{\beta}$.
+
+
+\paragraph{Influential observations}
+
+We make the distinction between observations:
+\begin{itemize}
+    \item with a residual that is too large
+    $\rightarrow$ influence on the estimation of $\sigma^2$;
+    \item which are too isolated
+    $\rightarrow$ influence on the estimation of $\beta$.
+\end{itemize}
+
+\[
+    e_i^* \sim \St(n-p-1)
+\]
+\subparagraph*{Rule} We consider an observation to be aberrant if:
+\[
+    |e_i^*| > \F^{-1}_{\St(n-p-1)}(1-\alpha),
+\]
+the quantile of order $1-\alpha$, with $\alpha$ often set to $1/n$; alternatively the threshold is set to 2.
+
+\paragraph{Leverage} The leverage of observation $i$ is the diagonal term $H_{ii}$ of the orthogonal projection matrix $H$.
+
+\begin{property}
+    \begin{itemize}
+        \item $0 \leq H_{ii} \leq 1$
+        \item $\sum_i H_{ii} = p$
+    \end{itemize}
+\end{property}
+
+\subparagraph*{Rule} We consider that an observation is too isolated (influential) if its leverage $H_{ii}$ is ??.
+
+
+\paragraph{Non-linearity}
+
+
+\section{Model Selection}
+
+We want to select the best model with the smallest number of predictors.
+
+When models have too many explanatory variables, the power of the statistical tests decreases.
+
+Different methods:
+\begin{itemize}
+    \item comparison of nested models;
+    \item information criteria;
+    \item methods based on the prediction error.
+\end{itemize}
+
+\subsection{Information criteria}
+
+\subsubsection{Likelihood}
+
+\begin{definition}[Likelihood]
+    The probability of observing the data at hand under a particular model.
+    \[
+        L_n (\M(k))
+    \]
+\end{definition}
+
+
+\begin{definition}[Akaike Information Criterion]
+    \[
+        AIC(\M(k)) = -2 \log L_n (\M(k)) + 2k.
+    \]
+
+    $2k$ is a penalty that favours the smallest models.
+\end{definition}
+
+\begin{definition}[Bayesian Information Criterion]
+    \[
+        BIC(\M(k)) = -2 \log L_n (\M(k)) + \log(n) k.
+    \]
+    $\log(n) k$ is a penalty.
+\end{definition}
+
+Since $\log(n) > 2$ as soon as $n \geq 8$, $AIC$ usually has a smaller penalty than $BIC$; the $AIC$ criterion therefore tends to select models with more variables than the $BIC$ criterion.
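+
+\paragraph*{Illustration in R} As a minimal sketch of the tools above (the data are simulated and the object names \texttt{X1}, \texttt{X2}, \texttt{X3}, \texttt{full}, \texttt{reduced} are purely illustrative, not taken from the course), the Student tests, the $F$-test of nested models, a hand-made $VIF$, the residual diagnostics and the information criteria can all be obtained with base \texttt{R}:
+
+\begin{verbatim}
+set.seed(1)
+n  <- 100
+X1 <- rnorm(n); X2 <- rnorm(n); X3 <- rnorm(n)
+Y  <- 1 + 2 * X1 + 0.5 * X3 + rnorm(n)     # X2 has no effect here
+
+full    <- lm(Y ~ X1 + X2 + X3)            # model M_q
+reduced <- lm(Y ~ X3)                      # model M_q' nested in M_q
+
+summary(full)         # Student t-test of nullity for each beta_j
+anova(reduced, full)  # Fisher F-test comparing the nested models
+
+# Variance inflation factor of X1, computed by hand
+r2_1  <- summary(lm(X1 ~ X2 + X3))$r.squared
+vif_1 <- 1 / (1 - r2_1)
+
+# Residual diagnostics
+e_star <- rstudent(full)    # externally studentized residuals
+h      <- hatvalues(full)   # leverages: diagonal of the projection matrix H
+
+# Information criteria
+AIC(reduced, full)
+BIC(reduced, full)
+\end{verbatim}
+
+In practice, the \texttt{vif} function from the \texttt{car} package computes these factors directly.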
+ diff --git a/content/chapters/part1/2.tex b/content/chapters/part1/2.tex index 0e5f075..532bfb7 100644 --- a/content/chapters/part1/2.tex +++ b/content/chapters/part1/2.tex @@ -1,220 +1,4 @@ -\chapter{Elements of Linear Algebra} -\label{ch:elements-of-linear-algebra} +\chapter{Generalized Linear Model} -\begin{remark}[vector] - Let $u$ a vector, we will use interchangeably the following notations: $u$ and $\vec{u}$ -\end{remark} +\section{Logistic Regression} -Let $u = \begin{pmatrix} - u_1 \\ - \vdots \\ - u_n - \end{pmatrix}$ and $v = \begin{pmatrix} - v_1 \\ - \vdots \\ - v_n - \end{pmatrix}$ - -\begin{definition}[Scalar Product (Dot Product)] - \begin{align*} - \scalar{u, v} & = \begin{pmatrix} - u_1, \ldots, u_v - \end{pmatrix} - \begin{pmatrix} - v_1 \\ - \vdots \\ - v_n - \end{pmatrix} \\ - & = u_1 v_1 + u_2 v_2 + \ldots + u_n v_n - \end{align*} - - We may use $\scalar{u, v}$ or $u \cdot v$ notations. -\end{definition} -\paragraph{Dot product properties} -\begin{description} - \item[Commutative] $\scalar{u, v} = \scalar{v, u}$ - \item[Distributive] $\scalar{(u+v), w} = \scalar{u, w} + \scalar{v, w}$ - \item $\scalar{u, v} = \norm{u} \times \norm{v} \times \cos(\widehat{u, v})$ - \item $\scalar{a, a} = \norm{a}^2$ -\end{description} - -\begin{definition}[Norm] - Length of the vector. - \[ - \norm{u} = \sqrt{\scalar{u, v}} - \] - - $\norm{u, v} > 0$ -\end{definition} - -\begin{definition}[Distance] - \[ - dist(u, v) = \norm{u-v} - \] -\end{definition} - -\begin{definition}[Orthogonality] - -\end{definition} - -\begin{remark} - \[ - (dist(u, v))^2 = \norm{u - v}^2, - \] and - \[ - \scalar{v-u, v-u} - \] -\end{remark} - -\begin{figure} - \centering - \includestandalone{figures/schemes/vector_orthogonality} - \caption{Scalar product of two orthogonal vectors.} - \label{fig:scheme-orthogonal-scalar-product} -\end{figure} - -\begin{align*} - \scalar{v-u, v-u} & = \scalar{v, v} + \scalar{u, u} - 2 \scalar{u, v} \\ - & = \norm{v}^2 + \norm{u}^2 \\ - & = -2 \scalar{u, v} -\end{align*} - -\begin{align*} - \norm{u - v}^2 & = \norm{u}^2 + \norm{v}^2 - 2 \scalar{u,v} \\ - \norm{u + v}^2 & = \norm{u}^2 + \norm{v}^2 + 2 \scalar{u,v} -\end{align*} - -\begin{proposition}[Scalar product of orthogonal vectors] -\[ - u \perp v \Leftrightarrow \scalar{u, v} = 0 -\] -\end{proposition} - -\begin{proof}[Indeed] - $\norm{u-v}^2 = \norm{u+v}^2$, as illustrated in \autoref{fig:scheme-orthogonal-scalar-product}. - \begin{align*} - \Leftrightarrow & -2 \scalar{u, v} = 2 \scalar{u, v} \\ - \Leftrightarrow & 4 \scalar{u, v} = 0 \\ - \Leftrightarrow & \scalar{u, v} = 0 - \end{align*} -\end{proof} - -\begin{theorem}[Pythagorean theorem] - If $u \perp v$, then $\norm{u+v}^2 = \norm{u}^2 + \norm{v}^2$ . -\end{theorem} - -\begin{definition}[Orthogonal Projection] - -\end{definition} -Let $y = \begin{pmatrix} - y_1 \\ - . \\ - y_n - \end{pmatrix} \in \RR[n]$ and $w$ a subspace of $\RR[n]$. -$\mathcal{Y}$ can be written as the orthogonal projection of $y$ on $w$: -\[ - \mathcal{Y} = proj^w(y) + z, -\] -where -\[ - \begin{cases} - z \in w^\perp \\ - proj^w(y) \in w - \end{cases} -\] -There is only one vector $\mathcal{Y}$ that ? - -The scalar product between $z$ and (?) is zero. - -\begin{property} - $proj^w(y)$ is the closest vector to $y$ that belongs to $w$. -\end{property} - -\begin{definition}[Matrix] - A matrix is an application, that is, a function that transform a thing into another, it is a linear function. 
-\end{definition} - -\begin{example}[Matrix application] - - Let $A$ be a matrix: - \[ - A = \begin{pmatrix} - a & b \\ - c & d - \end{pmatrix} - \] and - \[ - x = \begin{pmatrix} - x_1 \\ - x_2 - \end{pmatrix} - \] - Then, - \begin{align*} - Ax & = \begin{pmatrix} - a & b \\ - c & d - \end{pmatrix} - \begin{pmatrix} - x_1 \\ - x_2 - \end{pmatrix} \\ - & = \begin{pmatrix} - a x_1 + b x_2 \\ - c x_1 + d x_2 - \end{pmatrix} - \end{align*} - - Similarly, - \begin{align*} - \begin{pmatrix} - a & b & c & d \\ - e & f & g & h \\ - i & j & k & l - \end{pmatrix} - \begin{pmatrix} - x_1 \\ - x_2 \\ - x_3 \\ - x_4 - \end{pmatrix} - = - \begin{pmatrix} - \luadirect{ - local matrix_product = require("scripts.matrix_product") - local m1 = { - {"a", "b", "c", "d"}, - {"e", "f", "g", "h"}, - {"i", "j", "k", "l"} - } - local m2 = { - {"x_1"}, - {"x_2"}, - {"x_3"}, - {"x_4"} - } - local product_matrix = matrix_product.matrix_product_repr(m1,m2) - local matrix_dump = matrix_product.dump_matrix(product_matrix) - tex.print(matrix_dump) - } - \end{pmatrix} - \end{align*} -\end{example} - -The number of columns has to be the same as the dimension of the vector to which the matrix is applied. - -\begin{definition}[Tranpose of a Matrix] - Let $A = \begin{pmatrix} - a & b \\ - c & d - \end{pmatrix}$, then $A^T = \begin{pmatrix} - a & c \\ - b & d - \end{pmatrix}$ -\end{definition} - -\begin{figure} - \centering - \includestandalone{figures/schemes/coordinates_systems} - \caption{Coordinate systems} -\end{figure} diff --git a/content/chapters/part1/3.tex b/content/chapters/part1/3.tex deleted file mode 100644 index e69de29..0000000 diff --git a/content/chapters/part2/0.tex b/content/chapters/part2/0.tex new file mode 100644 index 0000000..fbfa3b9 --- /dev/null +++ b/content/chapters/part2/0.tex @@ -0,0 +1,2 @@ +\part{Linear Algebra} + diff --git a/content/chapters/part2/1.tex b/content/chapters/part2/1.tex new file mode 100644 index 0000000..0e5f075 --- /dev/null +++ b/content/chapters/part2/1.tex @@ -0,0 +1,220 @@ +\chapter{Elements of Linear Algebra} +\label{ch:elements-of-linear-algebra} + +\begin{remark}[vector] + Let $u$ a vector, we will use interchangeably the following notations: $u$ and $\vec{u}$ +\end{remark} + +Let $u = \begin{pmatrix} + u_1 \\ + \vdots \\ + u_n + \end{pmatrix}$ and $v = \begin{pmatrix} + v_1 \\ + \vdots \\ + v_n + \end{pmatrix}$ + +\begin{definition}[Scalar Product (Dot Product)] + \begin{align*} + \scalar{u, v} & = \begin{pmatrix} + u_1, \ldots, u_v + \end{pmatrix} + \begin{pmatrix} + v_1 \\ + \vdots \\ + v_n + \end{pmatrix} \\ + & = u_1 v_1 + u_2 v_2 + \ldots + u_n v_n + \end{align*} + + We may use $\scalar{u, v}$ or $u \cdot v$ notations. +\end{definition} +\paragraph{Dot product properties} +\begin{description} + \item[Commutative] $\scalar{u, v} = \scalar{v, u}$ + \item[Distributive] $\scalar{(u+v), w} = \scalar{u, w} + \scalar{v, w}$ + \item $\scalar{u, v} = \norm{u} \times \norm{v} \times \cos(\widehat{u, v})$ + \item $\scalar{a, a} = \norm{a}^2$ +\end{description} + +\begin{definition}[Norm] + Length of the vector. 
+ \[ + \norm{u} = \sqrt{\scalar{u, v}} + \] + + $\norm{u, v} > 0$ +\end{definition} + +\begin{definition}[Distance] + \[ + dist(u, v) = \norm{u-v} + \] +\end{definition} + +\begin{definition}[Orthogonality] + +\end{definition} + +\begin{remark} + \[ + (dist(u, v))^2 = \norm{u - v}^2, + \] and + \[ + \scalar{v-u, v-u} + \] +\end{remark} + +\begin{figure} + \centering + \includestandalone{figures/schemes/vector_orthogonality} + \caption{Scalar product of two orthogonal vectors.} + \label{fig:scheme-orthogonal-scalar-product} +\end{figure} + +\begin{align*} + \scalar{v-u, v-u} & = \scalar{v, v} + \scalar{u, u} - 2 \scalar{u, v} \\ + & = \norm{v}^2 + \norm{u}^2 \\ + & = -2 \scalar{u, v} +\end{align*} + +\begin{align*} + \norm{u - v}^2 & = \norm{u}^2 + \norm{v}^2 - 2 \scalar{u,v} \\ + \norm{u + v}^2 & = \norm{u}^2 + \norm{v}^2 + 2 \scalar{u,v} +\end{align*} + +\begin{proposition}[Scalar product of orthogonal vectors] +\[ + u \perp v \Leftrightarrow \scalar{u, v} = 0 +\] +\end{proposition} + +\begin{proof}[Indeed] + $\norm{u-v}^2 = \norm{u+v}^2$, as illustrated in \autoref{fig:scheme-orthogonal-scalar-product}. + \begin{align*} + \Leftrightarrow & -2 \scalar{u, v} = 2 \scalar{u, v} \\ + \Leftrightarrow & 4 \scalar{u, v} = 0 \\ + \Leftrightarrow & \scalar{u, v} = 0 + \end{align*} +\end{proof} + +\begin{theorem}[Pythagorean theorem] + If $u \perp v$, then $\norm{u+v}^2 = \norm{u}^2 + \norm{v}^2$ . +\end{theorem} + +\begin{definition}[Orthogonal Projection] + +\end{definition} +Let $y = \begin{pmatrix} + y_1 \\ + . \\ + y_n + \end{pmatrix} \in \RR[n]$ and $w$ a subspace of $\RR[n]$. +$\mathcal{Y}$ can be written as the orthogonal projection of $y$ on $w$: +\[ + \mathcal{Y} = proj^w(y) + z, +\] +where +\[ + \begin{cases} + z \in w^\perp \\ + proj^w(y) \in w + \end{cases} +\] +There is only one vector $\mathcal{Y}$ that ? + +The scalar product between $z$ and (?) is zero. + +\begin{property} + $proj^w(y)$ is the closest vector to $y$ that belongs to $w$. +\end{property} + +\begin{definition}[Matrix] + A matrix is an application, that is, a function that transform a thing into another, it is a linear function. +\end{definition} + +\begin{example}[Matrix application] + + Let $A$ be a matrix: + \[ + A = \begin{pmatrix} + a & b \\ + c & d + \end{pmatrix} + \] and + \[ + x = \begin{pmatrix} + x_1 \\ + x_2 + \end{pmatrix} + \] + Then, + \begin{align*} + Ax & = \begin{pmatrix} + a & b \\ + c & d + \end{pmatrix} + \begin{pmatrix} + x_1 \\ + x_2 + \end{pmatrix} \\ + & = \begin{pmatrix} + a x_1 + b x_2 \\ + c x_1 + d x_2 + \end{pmatrix} + \end{align*} + + Similarly, + \begin{align*} + \begin{pmatrix} + a & b & c & d \\ + e & f & g & h \\ + i & j & k & l + \end{pmatrix} + \begin{pmatrix} + x_1 \\ + x_2 \\ + x_3 \\ + x_4 + \end{pmatrix} + = + \begin{pmatrix} + \luadirect{ + local matrix_product = require("scripts.matrix_product") + local m1 = { + {"a", "b", "c", "d"}, + {"e", "f", "g", "h"}, + {"i", "j", "k", "l"} + } + local m2 = { + {"x_1"}, + {"x_2"}, + {"x_3"}, + {"x_4"} + } + local product_matrix = matrix_product.matrix_product_repr(m1,m2) + local matrix_dump = matrix_product.dump_matrix(product_matrix) + tex.print(matrix_dump) + } + \end{pmatrix} + \end{align*} +\end{example} + +The number of columns has to be the same as the dimension of the vector to which the matrix is applied. 
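+
+\paragraph*{Numerical check} The operations above can be checked numerically; a small R sketch (the vectors and the matrix are arbitrary examples chosen here for illustration):
+
+\begin{verbatim}
+u <- c(1, 2, 2)
+v <- c(2, 0, -1)
+
+sum(u * v)        # scalar product <u, v> = 2 + 0 - 2 = 0, so u and v are orthogonal
+sqrt(sum(u * u))  # norm of u = sqrt(9) = 3
+
+A <- matrix(c(1, 0, 2, 1, 0, 3), nrow = 2)  # a 2 x 3 matrix, filled column by column
+A %*% u           # defined because ncol(A) equals length(u); the result has 2 rows
+\end{verbatim}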
+ +\begin{definition}[Tranpose of a Matrix] + Let $A = \begin{pmatrix} + a & b \\ + c & d + \end{pmatrix}$, then $A^T = \begin{pmatrix} + a & c \\ + b & d + \end{pmatrix}$ +\end{definition} + +\begin{figure} + \centering + \includestandalone{figures/schemes/coordinates_systems} + \caption{Coordinate systems} +\end{figure} diff --git a/content/introduction.tex b/content/introduction.tex index 775a081..12a8352 100644 --- a/content/introduction.tex +++ b/content/introduction.tex @@ -22,4 +22,14 @@ thus we might consider genotype either as a qualitative variable or quantitative variable. \end{example} -When the variable are quantitative, we use regression, whereas for qualitative variables, we use an analysis of variance. \ No newline at end of file +When the variable are quantitative, we use regression, whereas for qualitative variables, we use an analysis of variance. + +\begin{figure} + \begin{subfigure}{0.45\columnwidth} + \includegraphics[width=\columnwidth]{figures/plots/linear_regression_linear.pdf} + \end{subfigure} + \begin{subfigure}{0.45\columnwidth} + \includegraphics[width=\columnwidth]{figures/plots/linear_regression_non_linear.pdf} + \end{subfigure} + \caption{Illustration of two models fitting observed values} +\end{figure} \ No newline at end of file diff --git a/definitions.tex b/definitions.tex index 9387d0c..b69346b 100644 --- a/definitions.tex +++ b/definitions.tex @@ -1,6 +1,10 @@ \DeclareMathOperator{\VVar}{\mathbb{V}} % variance \DeclareMathOperator{\One}{\mathbf{1}} \DeclareMathOperator{\Cor}{\mathrm{Cor}} +\DeclareMathOperator{\St}{\mathscr{St}} \newcommand{\M}[1][]{\ensuremath{\ifstrempty{#1}{\mathcal{M}}{\mathbb{M}_{#1}}}} \newcommand{\X}{\ensuremath{\mathbf{X}}} \newcommand{\Y}{\ensuremath{\mathbf{Y}}} +\newcommand{\Z}{\ensuremath{\mathbf{Z}}} +\usepackage{unicode-math} + diff --git a/figures/plots/linear_regression.R b/figures/plots/linear_regression.R new file mode 100644 index 0000000..1e3e902 --- /dev/null +++ b/figures/plots/linear_regression.R @@ -0,0 +1,26 @@ +# Plot an affine model +n <- 250 +sd <- 0.05 +epsilon <- rnorm(n, mean = 0, sd = 2) +beta0 <- 1.25 +beta1 <- 4 +linear_model <- function(x) { + return(beta0 + beta1*x) +} +x <- runif(n, min=0, max=1) +y <- linear_model(x) + epsilon + +pdf("figures/plots/linear_regression_linear.pdf") +plot(x, y, col="#5654fa", type="p", pch=20, xlab="x", ylab="y") +abline(a = beta0, b = beta1, col="red") +dev.off() + + +non_linear_model <- function(x) { + return(beta0 + beta1 * exp(2*x)) +} +non_linear_y <- non_linear_model(x) + epsilon +pdf("figures/plots/linear_regression_non_linear.pdf") +plot(x, non_linear_y, col="#5654fa", type="p", pch=20, xlab="x", ylab="z") +curve(non_linear_model, from=0, to=1, add=T, col="red") +dev.off() diff --git a/figures/plots/linear_regression_linear.pdf b/figures/plots/linear_regression_linear.pdf new file mode 100644 index 0000000..0be2ed9 --- /dev/null +++ b/figures/plots/linear_regression_linear.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25059b14c85b0700f41d52bfb08536a101f5ab0ee0b9580aadaae3faeefcd1ae +size 19542 diff --git a/figures/plots/linear_regression_non_linear.pdf b/figures/plots/linear_regression_non_linear.pdf new file mode 100644 index 0000000..20b5677 --- /dev/null +++ b/figures/plots/linear_regression_non_linear.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02fef791e9ba93e0c8eac221ee47f79aecd5a7744945575a793ec3ebfe673c3e +size 20288 diff --git a/figures/schemes/.gitattributes b/figures/schemes/.gitattributes 
new file mode 100644 index 0000000..0799fdf --- /dev/null +++ b/figures/schemes/.gitattributes @@ -0,0 +1,3 @@ +covariance.pdf filter=lfs diff=lfs merge=lfs -text +../plots/linear_regression_linear.pdf filter=lfs diff=lfs merge=lfs -text +../plots/linear_regression_non_linear.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/figures/schemes/covariance.pdf b/figures/schemes/covariance.pdf new file mode 100644 index 0000000..773a8ca --- /dev/null +++ b/figures/schemes/covariance.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c32e65df88f221afec82d7d549ac15078d482112e8693a58e21eaaf1c8958785 +size 36303 diff --git a/figures/schemes/covariance.tex b/figures/schemes/covariance.tex new file mode 100644 index 0000000..aa487ca --- /dev/null +++ b/figures/schemes/covariance.tex @@ -0,0 +1,35 @@ +% Scheme of Covariance +\documentclass[margin=0.5cm]{standalone} +\usepackage{tikz} +\usepackage{amssymb} +\begin{document} +\begin{tikzpicture} + \usetikzlibrary{positioning} + \tikzset{ + point/.style = {circle, inner sep={.75\pgflinewidth}, opacity=1, draw, black, fill=black}, + point name/.style = {insert path={coordinate (#1)}}, + } + \begin{scope}[yshift=0] + \draw (-4, 0.5) -- (4,0.5) node[right] {$Y_i$}; + \draw (-4, -0.5) -- (4,-0.5) node[right] {$Y_j$}; + \node at (6, 0) {$\mathrm{Cov}(Y_i, Y_j) > 0$}; + \node (EYipoint) at (0,0.5) {$\times$}; + \node at (0, 1) {$\mathbb{E}(Y_i)$}; + \node (EYipoint) at (0,-0.5) {$\times$}; + \node at (0, -1) {$\mathbb{E}(Y_j)$}; + + \foreach \x in {-3, 0.5, 2.75} { + \node[point] at (\x, 0.5) {}; + } + \foreach \x in {-2, -1, 3} { + \node[point] at (\x, -0.5) {}; + } + \end{scope} + \begin{scope}[yshift=-100] + \draw (-4,0.5) -- (4,0.5) node[right] {$Y_i$}; + \draw (-4,-0.5) -- (4,-0.5) node[right] {$Y_j$}; + \node at (6, 0) {$\mathrm{Cov}(Y_i, Y_j) \approx 0$}; + \end{scope} + +\end{tikzpicture} +\end{document} \ No newline at end of file diff --git a/main.pdf b/main.pdf index 101c25b..c767f7c 100644 --- a/main.pdf +++ b/main.pdf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27d9599621738b732087974ecd8141b18411aac1ef77ff31c2ea2464ac443eb8 -size 308493 +oid sha256:03d4328d90340efbb70107a12e2d267e15a18d788928a0c1243764ff4ed279f9 +size 286931
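
A possible companion check for figures/plots/linear_regression.R (a sketch, not part of the patch; it assumes the objects x and y created by that script are still in the R session): fitting the simulated data with lm() should recover estimates close to the true values beta0 = 1.25 and beta1 = 4 used in the simulation.

fit <- lm(y ~ x)          # simple linear regression on the simulated data
coef(fit)                 # estimates of beta0 and beta1
confint(fit)              # 95% confidence intervals for the coefficients
summary(fit)$sigma        # residual standard error, estimate of sd(epsilon)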