Moved part on linear algebra

Add stuff on model validity
Samuel Ortion 2023-10-13 13:19:12 +02:00
parent 43acae64f3
commit 29dad16dfb
16 changed files with 627 additions and 230 deletions

.gitattributes vendored

@ -1,7 +1,2 @@
main.pdf filter=lfs diff=lfs merge=lfs -text
figures/schemes/regression_plan_3D.pdf filter=lfs diff=lfs merge=lfs -text
**/*.pdf filter=lfs diff=lfs merge=lfs -text
figures/schemes/vector_orthogonality.pdf filter=lfs diff=lfs merge=lfs -text
figures/schemes/base_plan.pdf filter=lfs diff=lfs merge=lfs -text
figures/schemes/coordinates_systems.pdf filter=lfs diff=lfs merge=lfs -text
figures/schemes/ordinary_least_squares.pdf filter=lfs diff=lfs merge=lfs -text
figures/schemes/orthogonal_projection.pdf filter=lfs diff=lfs merge=lfs -text


@ -13,6 +13,6 @@
\includechapters{part1}{2}
\includechapters{part2}{2}
% \includechapters{part3}{1}


@ -117,7 +117,7 @@ We want to minimize the distance between $\X\beta$ and $\Y$:
\Rightarrow& \X \beta = proj^{(1, \X)} \Y\\
\Rightarrow& \forall v \in w,\, vy = v proj^w(y)\\
\Rightarrow& \forall i: \\
& \X_i \Y = \X_i \X\hat{\beta} \qquad \text{where $\hat{\beta}$ is the estimator of $\beta$} \\
\Rightarrow& \X^T \Y = \X^T \X \hat{\beta} \\
\Rightarrow& {\color{gray}(\X^T \X)^{-1}} \X^T \Y = {\color{gray}(\X^T \X)^{-1}} (\X^T\X) \hat{\beta} \\
\Rightarrow& \hat{\beta} = (\X^T\X)^{-1} \X^T \Y
@ -127,7 +127,7 @@ This formula comes from the orthogonal projection of $\Y$ on the vector subspace
$\X \hat{\beta}$ is the closest point to $\Y$ in the subspace generated by $\X$.
If $H$ is the projection matrix onto the subspace generated by $\X$, then $H\Y$ is the projection of $\Y$ onto this subspace, which corresponds to $\X\hat{\beta}$.
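As a quick sanity check of this formula, here is a minimal R sketch (simulated data, all variable names illustrative) that computes $\hat{\beta}$ from the normal equations and compares it with the coefficients returned by \texttt{lm}:
\begin{verbatim}
# Simulated data: n observations, two predictors plus an intercept
set.seed(1)
n <- 100
x1 <- rnorm(n); x2 <- rnorm(n)
X <- cbind(1, x1, x2)                          # design matrix with intercept column
y <- 2 + 0.5 * x1 - 1.3 * x2 + rnorm(n)

beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y   # (X^T X)^{-1} X^T Y
cbind(beta_hat, coef(lm(y ~ x1 + x2)))         # the two columns should match
\end{verbatim}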
\section{Sum of squares}
@ -253,6 +253,7 @@ Covariance is really sensitive to scale of variables. For instance, if we measur
\begin{theorem}[Cochran Theorem (Consequence)]
\label{thm:cochran}
Let $\mathbf{Z}$ be a gaussian vector: $\mathbf{Z} \sim \Norm_n(0_n, I_n)$.
\begin{itemize}
@ -263,8 +264,29 @@ Covariance is really sensitive to scale of variables. For instance, if we measur
\item If $Z_1$ and $Z_2$ are the orthogonal projections of $\mathbf{Z}$ on two orthogonal subspaces $V_1$ and $V_2$, i.e. $Z_1 = \Pi_{V_1}(\mathbf{Z}) = \Pi_1 \mathbf{Z}$ and $Z_2 = \Pi_{V_2}(\mathbf{Z}) = \Pi_2 \mathbf{Z}$, then $Z_1$ and $Z_2$ are independent.
(\textcolor{red}{look to the slides})
\end{itemize}
$Z_1 = \Pi_{V_1}(\Z)$ is the projection of $\Z$ on the subspace $V_1$.
\end{theorem}
\begin{property}[Estimators properties in the linear model]
According to \autoref{thm:cochran}, $\hat{m}$ is independent from $\hat{\sigma}^2$, and
\[
\frac{\norm{\Y - \Pi_V(\Y)}^2}{\sigma^2} \sim \chi^2_{n-q},
\]
where $\hat{m} = \X \hat{\beta} = \Pi_V(\Y)$ is the estimator of the mean.
\end{property}
\begin{definition}[Chi 2 distribution]
If $X_1, \ldots, X_n$ i.i.d. $\sim \Norm(0, 1)$, then
\[
@ -318,3 +340,290 @@ We can derive statistical test from these properties.
\]
where
\paragraph{Estimation of $\sigma^2$}
A biased estimator of $\sigma^2$ is the maximum likelihood estimator
\[
\hat{\sigma}^2 = \frac{1}{n} \norm{\Y - \X\hat{\beta}}^2.
\]
$S^2$ is the unbiased estimator of $\sigma^2$:
\begin{align*}
S^2 &= \frac{1}{n-q} \norm{\Y - \Pi_V(\Y)}^2 \\
&= \frac{1}{n-q} \sum_{i=1}^n (Y_i - (\X\hat{\beta})_i)^2
\end{align*}
\begin{remark}[On $\hat{m}$]
\begin{align*}
&\Y = \X \beta + \varepsilon \\
\Leftrightarrow& \EE(\Y) = \X \beta
\end{align*}
\end{remark}
\section{Student test of nullity of a parameter}
Let $\beta_j$ be a parameter, the tested hypotheses are as follows:
\[
\begin{cases}
(H_0): \beta_j = 0 \\
(H_1): \beta_j \neq 0
\end{cases}
\]
Under the null hypothesis:
\[
\frac{\hat{\beta}_j - \beta_j}{S \sqrt{(\X^T \X)^{-1}_{j,j}}} \sim \St(n-q).
\]
The test statistic is:
\[
W_n = \frac{\hat{\beta}_j}{S \sqrt{(\X^T\X)^{-1}_{j,j}}} \underset{H_0}{\sim} \St(n-q).
\]
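For illustration, a minimal R sketch (simulated data, illustrative names) reproducing $W_n$ by hand for one coefficient and checking it against the $t$ value reported by \texttt{summary}:
\begin{verbatim}
set.seed(1)
n <- 80
x1 <- rnorm(n); x2 <- rnorm(n); x3 <- rnorm(n)
y <- 1 + 0.8 * x1 + rnorm(n)                 # beta_2 and beta_3 are truly zero
fit <- lm(y ~ x1 + x2 + x3)
j <- "x2"
W <- coef(fit)[j] / sqrt(vcov(fit)[j, j])    # vcov(fit) = S^2 (X^T X)^{-1}
W
summary(fit)$coefficients[j, "t value"]      # same value
2 * pt(-abs(W), df = df.residual(fit))       # two-sided p-value under St(n - q)
\end{verbatim}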
$\hat{\beta}$ is a multinormal vector.
Let's consider a vector of 4 values:
\begin{align*}
\begin{pmatrix}
\hat{\beta}_0 \\
\hat{\beta}_1 \\
\hat{\beta}_2 \\
\hat{\beta}_3
\end{pmatrix}
\sim \Norm_4 \left( \begin{pmatrix}
\beta_0 \\
\beta_1 \\
\beta_2 \\
\beta_3
\end{pmatrix} ;
\sigma^2 \left(\X^T \X\right)^{-1}
\right)
\end{align*}
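In R, this covariance matrix is what \texttt{vcov} returns for a fitted linear model, with $\sigma^2$ replaced by its estimate $S^2$; a small sketch on simulated data (illustrative names):
\begin{verbatim}
set.seed(1)
n <- 60
x1 <- rnorm(n); x2 <- rnorm(n); x3 <- rnorm(n)
y <- 1 + x1 - x2 + 0.5 * x3 + rnorm(n)
fit <- lm(y ~ x1 + x2 + x3)
X <- model.matrix(fit)
all.equal(vcov(fit), sigma(fit)^2 * solve(t(X) %*% X))  # TRUE
\end{verbatim}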
Let $\M$ be the following model
\begin{align*}
Y_i &= \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \beta_3 X_{3i} + \varepsilon_i
\end{align*}
Why can't we use the following model to test each of the parameter values (here for $X_2$)?
\[
Y_i = \theta_0 + \theta_1 X_{2i} + \varepsilon_i
\]
We can't use such a model: we would probably run into a confounding factor. Even if we are only interested in the relationship between $X_2$ and $Y$, we have to fit the whole model.
\begin{example}[Confounding parameter]
Let $Y$ be a variable related to lung cancer. Let $X_1$ be the smoking status, and $X_2$ the variable `alcohol' (for instance, the quantity of alcohol drunk per week).
If we only fit the model $\M: Y_i = \theta_0 + \theta_1 X_{2i} + \varepsilon_i$, we could conclude that there is a relationship between alcohol and lung cancer, because alcohol consumption and smoking are strongly related. If we had fitted the model $\M: Y_i = \theta_0 + \theta_1 X_{1i} + \theta_2 X_{2i} + \varepsilon_i$, we might instead have found no significant relationship between $X_2$ and $Y$.
\end{example}
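This confounding effect is easy to reproduce by simulation; a hedged sketch (all names and coefficients are made up for illustration) where $Y$ depends only on $X_1$ but $X_2$ is correlated with $X_1$:
\begin{verbatim}
set.seed(1)
n <- 500
x1 <- rbinom(n, 1, 0.5)                 # "smoking status" (illustrative)
x2 <- 3 * x1 + rnorm(n)                 # "alcohol", correlated with x1
y  <- 2 + 1.5 * x1 + rnorm(n)           # outcome depends on x1 only
summary(lm(y ~ x2))$coefficients        # x2 looks strongly significant
summary(lm(y ~ x1 + x2))$coefficients   # x2 is no longer significant
\end{verbatim}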
\begin{definition}[Student law]
Let $X$ and $Y$ be two random variables such that $X \indep Y$, with $X \sim \Norm(0, 1)$ and $Y \sim \chi_n^2$; then
\[
\frac{X}{\sqrt{Y / n}} \sim \St(n)
\]
\end{definition}
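The distribution of the test statistic $W_n$ above follows from this definition; a short sketch of the argument (standard linear-model facts, stated here for completeness rather than taken from the slides):
\begin{align*}
\frac{\hat{\beta}_j - \beta_j}{\sigma \sqrt{(\X^T\X)^{-1}_{j,j}}} \sim \Norm(0, 1),
\qquad
\frac{(n-q) S^2}{\sigma^2} \sim \chi^2_{n-q},
\end{align*}
and these two quantities are independent, so
\begin{align*}
\frac{\hat{\beta}_j - \beta_j}{S \sqrt{(\X^T\X)^{-1}_{j,j}}}
= \frac{\left(\hat{\beta}_j - \beta_j\right) \Big/ \left(\sigma \sqrt{(\X^T\X)^{-1}_{j,j}}\right)}
{\sqrt{\dfrac{(n-q) S^2}{\sigma^2} \Big/ (n-q)}}
\sim \St(n-q).
\end{align*}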
\subsection{Model comparison}
\begin{definition}[Nested models]
A model $\M_1$ is nested in a model $\M_2$ if $\M_1$ can be obtained from $\M_2$ by removing some of its predictors (equivalently, by constraining the corresponding parameters of $\M_2$ to zero).
\end{definition}
Let $\M_2$ and $\M_4$ be two models:
$\M_2: Y_i = \beta_0 + \beta_3 X_{3i} + \varepsilon_i$
$\M_4: Y_i = \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \beta_3 X_{3i} + \varepsilon_i$
$\M_2$ is nested in $\M_4$.
\paragraph*{Principle} We compare the residual variances of the two models, that is, the variance that is not explained by each model.
The better the model, the smaller its residual variance.
If everything were explained by the model, the residual variance would be zero.
Here $\M_4$ holds all the information found in $\M_2$ plus additional information: in the worst case, it is at least as good as $\M_2$.
\subsection{Fisher $F$-test of model comparison}
Let $\M_q$ and $\M_{q'}$ be two models such that $\dim(\M_q) = q$, $\dim(\M_{q'}) = q'$, $q > q'$ and $\M_{q'}$ is nested in $\M_q$.
\paragraph{Tested hypotheses}
\[
\begin{cases}
(H_0): \M_{q'} \text{ is the proper model} \\
(H_1): \M_q \text{ is a better model}
\end{cases}
\]
\begin{description}
\item[ESS] Estimated Sum of Squares
\item[RSS] Residual Sum of Squares
\item[EMS] Estimated Mean Square
\item[RMS] Residual Mean Square
\end{description}
\[
ESS = RSS(\M_{q'}) - RSS(\M_q)
\]
\[
RSS(\M) = \norm{\Y - \X\hat{\beta}}^2 = \sum_{i=1}^n \hat{\varepsilon}_i^2
\]
\[
EMS = \frac{ESS}{q - q'}
\]
\[
RMS = \frac{RSS(\M_q)}{n-q}
\]
Under the null hypothesis:
\[
F = \frac{EMS}{RMS} \underset{H_0}{\sim} \Fish(q-q'; n-q)
\]
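In R, this comparison of nested models is what \texttt{anova} performs when given two fitted models; a minimal sketch on simulated data (names are illustrative):
\begin{verbatim}
set.seed(1)
n <- 120
x1 <- rnorm(n); x2 <- rnorm(n); x3 <- rnorm(n)
y  <- 1 + 2 * x3 + rnorm(n)
m2 <- lm(y ~ x3)                  # nested model
m4 <- lm(y ~ x1 + x2 + x3)        # full model
anova(m2, m4)                     # F statistic and its p-value
\end{verbatim}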
\section{Model validity}
Assumptions:
\begin{itemize}
\item $\X$ is a full rank matrix;
\item Residuals are i.i.d. $\varepsilon \sim \Norm(0_n, \sigma^2 \mathcal{I}_n)$;
\end{itemize}
We also have to look for influential observations.
\subsection{$\X$ is full rank}
To check that the rank of the matrix is $p+1$, we can compute the eigenvalues of the correlation matrix of $\X$. If there were a perfect linear relationship between two variables (two columns of $\X$), one of the eigenvalues would be zero. In practice, we never get an exactly null eigenvalue, so we consider the condition index $\kappa = \frac{\lambda_1}{\lambda_p}$, with $\lambda_1 \geq \lambda_2 \geq \ldots \geq \lambda_p$ the eigenvalues: a large $\kappa$ indicates near-collinearity.
If all eigenvalues are different from 0, $\X^T \X$ can be inverted, but when some of them are close to zero the variance of the estimated parameters is large, and the estimation of the parameters is not reliable.
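A possible R sketch for this check (illustrative data; the predictors are gathered in a matrix without the intercept column before computing the correlation matrix):
\begin{verbatim}
set.seed(1)
n <- 100
x1 <- rnorm(n)
x2 <- x1 + rnorm(n, sd = 0.05)    # nearly collinear with x1
x3 <- rnorm(n)
X <- cbind(x1, x2, x3)
lambda <- eigen(cor(X))$values    # eigenvalues, returned in decreasing order
kappa_index <- lambda[1] / lambda[length(lambda)]
kappa_index                       # large value signals near-collinearity
\end{verbatim}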
\paragraph{Variance Inflation Factor}
Perform a regression of each predictor against the other predictors.
If there is a strong linear relationship between predictor $j$ and the others, the coefficient of determination $R^2_j$ of this auxiliary regression (the amount of variance of predictor $j$ explained by the other predictors) is close to one.
We do this for every predictor, and for predictor $j = 1, \ldots, p$ the variance inflation factor is:
\[
VIF_j = \frac{1}{1-R^2_j}.
\]
\subparagraph*{Rule}
Multicollinearity is suspected when $VIF_j > 10$ (or, with a more tolerant rule, when $VIF_j > 100$).
In case of multicollinearity, we remove the variables one by one until there is no longer multicollinearity.
Variables have to be removed based on statistical results and through discussion with the experimenters.
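A sketch of the computation without any external package (reusing the illustrative \texttt{x1}, \texttt{x2}, \texttt{x3} from the sketch above; the \texttt{vif} function of the \texttt{car} package should give the same values for a fitted \texttt{lm}):
\begin{verbatim}
# Manual VIF: regress each predictor on the others
predictors <- data.frame(x1 = x1, x2 = x2, x3 = x3)
vif <- sapply(names(predictors), function(j) {
  r2 <- summary(lm(reformulate(setdiff(names(predictors), j), response = j),
                   data = predictors))$r.squared
  1 / (1 - r2)
})
vif        # x1 and x2 should have a very large VIF here
\end{verbatim}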
\subsection{Residuals analysis}
\paragraph*{Assumption}
\[
\varepsilon \sim \Norm_n(0_n, \sigma^2 I_n)
\]
\paragraph{Normality of the residuals} If the $\varepsilon_i$ ($i=1, \ldots, n$) could be observed, we could build a QQ-plot of $\varepsilon_i / \sigma$ against the quantiles of $\Norm(0, 1)$.
Only the residuals $\hat{e}_i$ can be observed.
Let $e_i^*$ be the studentized residual, used as an estimator of $\varepsilon_i$:
\[
e_i^* = \frac{\hat{e}_i}{\sqrt{\hat{\sigma}^2_{(i)} (1-H_{ii})}},
\]
where $\hat{\sigma}^2_{(i)}$ is the residual variance estimated without observation $i$.
\begin{align*}
\hat{\Y} &= \X \hat{\beta} \\
&= \X \left( (\X^T\X)^{-1} \X^T \Y\right) \\
&= \underbrace{\X (\X^T\X)^{-1} \X^T}_{H} \Y
\end{align*}
\paragraph{Centered residuals} If $(1, \ldots, 1)^T$ belongs to the column space of $\X$, the residuals have zero mean by construction.
\paragraph{Independence} We do not have a statistical test for independence in R; instead, we plot the residuals $\hat{e}$ against the fitted values $\X \hat{\beta}$.
\paragraph{Homoscedasticity} Plot $\sqrt{|e_i^*|}$ against the fitted values $\X \hat{\beta}$.
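In practice these checks are done graphically; a hedged R sketch of the usual plots (the model \texttt{fit} is refitted on simulated data so that the chunk is self-contained):
\begin{verbatim}
set.seed(1)
n <- 100
x <- runif(n)
y <- 1 + 2 * x + rnorm(n)
fit <- lm(y ~ x)

r_student <- rstudent(fit)                # studentized residuals e_i^*
qqnorm(r_student); qqline(r_student)      # normality check
plot(fitted(fit), residuals(fit))         # independence / structure check
plot(fitted(fit), sqrt(abs(r_student)))   # homoscedasticity (scale-location)
\end{verbatim}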
\paragraph{Influential observations}
We make the distinction between observations:
\begin{itemize}
\item with a residual that is too large,
$\rightarrow$ which influences the estimation of $\sigma^2$;
\item which are too isolated,
$\rightarrow$ which influences the estimation of $\beta$.
\end{itemize}
\[
e_i^* \sim \St(n-p-1)
\]
\subparagraph*{Rule} We consider an observation to be aberrant if:
\[
|e_i^*| > \F^{-1}_{\St(n-p-1)}(1-\alpha),
\]
the quantile of order $1-\alpha$, with $\alpha$ often set to $1/n$; alternatively, the threshold is simply set to 2.
\paragraph{Leverage} The leverage is the diagonal term $H_{ii}$ of the orthogonal projection (hat) matrix $H$.
\begin{property}
\begin{itemize}
\item $0 \leq H_{ii} \leq 1$
\item $\sum_i H_{ii} = p$
\end{itemize}
\end{property}
\subparagraph*{Rule} We consider that the observation is aberrant if the leverage is ??.
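A sketch of how these quantities are obtained in R, reusing the model \texttt{fit} from the previous sketch (the cut-offs used here, $2p/n$ for the leverage and the $1-1/n$ Student quantile for the residuals, are common rules of thumb, not taken from the lecture):
\begin{verbatim}
h <- hatvalues(fit)                       # leverages H_ii
e_star <- rstudent(fit)                   # studentized residuals
p <- length(coef(fit))
n <- nobs(fit)

which(h > 2 * p / n)                      # isolated observations (assumed cut-off)
alpha <- 1 / n
which(abs(e_star) > qt(1 - alpha, df = n - p - 1))  # too-large residuals
\end{verbatim}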
\paragraph{Non-linearity}
\section{Model Selection}
We want to select the best model with the smallest number of predictors.
When models have too many explanatory variables, the power of the statistical tests decreases.
Different methods:
\begin{itemize}
\item Comparison of nested models;
\item Information criteria;
\item Method based on the prediction error.
\end{itemize}
\subsection{Information criteria}
\subsubsection{Likelihood}
\begin{definition}[Likelihood]
The probability of observing the data we observed, under a particular model.
\[
L_n (\M(k))
\]
\end{definition}
\begin{definition}[Akaike Information Criterion]
\[
AIC(\M(k)) = -2 \log L_n (\M(k)) + 2k.
\]
$2k$ is a penalty term ($k$ being the number of parameters), which favours smaller models.
\end{definition}
\begin{definition}[Bayesian Information Criterion]
\[
BIC(\M(k)) = -2 \log L_n (\M(k)) + \log(n) k.
\]
$\log(n) k$ is a penalty.
\end{definition}
The $AIC$ penalty is usually smaller than the $BIC$ penalty (as soon as $\log(n) > 2$), so the $AIC$ criterion tends to select models with more variables than the $BIC$ criterion.
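In R, \texttt{AIC} and \texttt{BIC} apply directly to fitted models; a small sketch comparing two candidate models on simulated data (illustrative names):
\begin{verbatim}
set.seed(1)
n <- 100
x1 <- rnorm(n); x2 <- rnorm(n)
y  <- 1 + 2 * x1 + rnorm(n)        # x2 is useless
m_small <- lm(y ~ x1)
m_big   <- lm(y ~ x1 + x2)
AIC(m_small, m_big)                # smaller is better
BIC(m_small, m_big)                # BIC penalizes the extra variable more
\end{verbatim}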


@ -1,220 +1,4 @@
\chapter{Generalized Linear Model}
\section{Logistic Regression}


@ -0,0 +1,2 @@
\part{Linear Algebra}


@ -0,0 +1,220 @@
\chapter{Elements of Linear Algebra}
\label{ch:elements-of-linear-algebra}
\begin{remark}[vector]
Let $u$ be a vector; we will use the notations $u$ and $\vec{u}$ interchangeably.
\end{remark}
Let $u = \begin{pmatrix}
u_1 \\
\vdots \\
u_n
\end{pmatrix}$ and $v = \begin{pmatrix}
v_1 \\
\vdots \\
v_n
\end{pmatrix}$
\begin{definition}[Scalar Product (Dot Product)]
\begin{align*}
\scalar{u, v} & = \begin{pmatrix}
u_1, \ldots, u_n
\end{pmatrix}
\begin{pmatrix}
v_1 \\
\vdots \\
v_n
\end{pmatrix} \\
& = u_1 v_1 + u_2 v_2 + \ldots + u_n v_n
\end{align*}
We may use $\scalar{u, v}$ or $u \cdot v$ notations.
\end{definition}
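As a quick worked example (with arbitrarily chosen vectors), for $u = (1, 2, 3)^T$ and $v = (4, 0, -1)^T$:
\[
\scalar{u, v} = 1 \times 4 + 2 \times 0 + 3 \times (-1) = 1.
\]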
\paragraph{Dot product properties}
\begin{description}
\item[Commutative] $\scalar{u, v} = \scalar{v, u}$
\item[Distributive] $\scalar{(u+v), w} = \scalar{u, w} + \scalar{v, w}$
\item $\scalar{u, v} = \norm{u} \times \norm{v} \times \cos(\widehat{u, v})$
\item $\scalar{a, a} = \norm{a}^2$
\end{description}
\begin{definition}[Norm]
The length of the vector:
\[
\norm{u} = \sqrt{\scalar{u, u}}.
\]
$\norm{u} \geq 0$.
\end{definition}
\begin{definition}[Distance]
\[
dist(u, v) = \norm{u-v}
\]
\end{definition}
\begin{definition}[Orthogonality]
Two vectors $u$ and $v$ are orthogonal, denoted $u \perp v$, when they form a right angle.
\end{definition}
\begin{remark}
\[
(dist(u, v))^2 = \norm{u - v}^2 = \scalar{v-u, v-u}.
\]
\end{remark}
\begin{figure}
\centering
\includestandalone{figures/schemes/vector_orthogonality}
\caption{Scalar product of two orthogonal vectors.}
\label{fig:scheme-orthogonal-scalar-product}
\end{figure}
\begin{align*}
\scalar{v-u, v-u} & = \scalar{v, v} + \scalar{u, u} - 2 \scalar{u, v} \\
& = \norm{v}^2 + \norm{u}^2 - 2 \scalar{u, v}
\end{align*}
\begin{align*}
\norm{u - v}^2 & = \norm{u}^2 + \norm{v}^2 - 2 \scalar{u,v} \\
\norm{u + v}^2 & = \norm{u}^2 + \norm{v}^2 + 2 \scalar{u,v}
\end{align*}
\begin{proposition}[Scalar product of orthogonal vectors]
\[
u \perp v \Leftrightarrow \scalar{u, v} = 0
\]
\end{proposition}
\begin{proof}[Indeed]
$\norm{u-v}^2 = \norm{u+v}^2$, as illustrated in \autoref{fig:scheme-orthogonal-scalar-product}.
\begin{align*}
\Leftrightarrow & -2 \scalar{u, v} = 2 \scalar{u, v} \\
\Leftrightarrow & 4 \scalar{u, v} = 0 \\
\Leftrightarrow & \scalar{u, v} = 0
\end{align*}
\end{proof}
\begin{theorem}[Pythagorean theorem]
If $u \perp v$, then $\norm{u+v}^2 = \norm{u}^2 + \norm{v}^2$ .
\end{theorem}
\begin{definition}[Orthogonal Projection]
\end{definition}
Let $y = \begin{pmatrix}
y_1 \\
\vdots \\
y_n
\end{pmatrix} \in \RR[n]$ and $w$ a subspace of $\RR[n]$.
$y$ can be written as the sum of its orthogonal projection on $w$ and a component orthogonal to $w$:
\[
y = proj^w(y) + z,
\]
where
\[
\begin{cases}
z \in w^\perp \\
proj^w(y) \in w
\end{cases}
\]
This decomposition is unique.
The scalar product between $z$ and any vector of $w$ is zero.
\begin{property}
$proj^w(y)$ is the closest vector to $y$ that belongs to $w$.
\end{property}
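A small R sketch of this property (illustrative vectors; the subspace $w$ is spanned by the columns of a matrix \texttt{W}):
\begin{verbatim}
set.seed(1)
W <- cbind(c(1, 1, 1, 1), c(0, 1, 2, 3))        # basis of the subspace w
y <- rnorm(4)
P <- W %*% solve(t(W) %*% W) %*% t(W)           # orthogonal projection matrix
proj_y <- P %*% y                               # proj^w(y)
z <- y - proj_y                                 # component in w^perp
round(t(W) %*% z, 10)                           # ~ 0: z is orthogonal to w
\end{verbatim}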
\begin{definition}[Matrix]
A matrix represents a linear application, that is, a linear function that transforms one vector into another.
\end{definition}
\begin{example}[Matrix application]
Let $A$ be a matrix:
\[
A = \begin{pmatrix}
a & b \\
c & d
\end{pmatrix}
\] and
\[
x = \begin{pmatrix}
x_1 \\
x_2
\end{pmatrix}
\]
Then,
\begin{align*}
Ax & = \begin{pmatrix}
a & b \\
c & d
\end{pmatrix}
\begin{pmatrix}
x_1 \\
x_2
\end{pmatrix} \\
& = \begin{pmatrix}
a x_1 + b x_2 \\
c x_1 + d x_2
\end{pmatrix}
\end{align*}
Similarly,
\begin{align*}
\begin{pmatrix}
a & b & c & d \\
e & f & g & h \\
i & j & k & l
\end{pmatrix}
\begin{pmatrix}
x_1 \\
x_2 \\
x_3 \\
x_4
\end{pmatrix}
=
\begin{pmatrix}
\luadirect{
local matrix_product = require("scripts.matrix_product")
local m1 = {
{"a", "b", "c", "d"},
{"e", "f", "g", "h"},
{"i", "j", "k", "l"}
}
local m2 = {
{"x_1"},
{"x_2"},
{"x_3"},
{"x_4"}
}
local product_matrix = matrix_product.matrix_product_repr(m1,m2)
local matrix_dump = matrix_product.dump_matrix(product_matrix)
tex.print(matrix_dump)
}
\end{pmatrix}
\end{align*}
\end{example}
The number of columns has to be the same as the dimension of the vector to which the matrix is applied.
\begin{definition}[Transpose of a Matrix]
Let $A = \begin{pmatrix}
a & b \\
c & d
\end{pmatrix}$, then $A^T = \begin{pmatrix}
a & c \\
b & d
\end{pmatrix}$
\end{definition}
\begin{figure}
\centering
\includestandalone{figures/schemes/coordinates_systems}
\caption{Coordinate systems}
\end{figure}


@ -22,4 +22,14 @@
thus we might consider genotype either as a qualitative variable or a quantitative variable.
\end{example}
When the variables are quantitative, we use regression, whereas for qualitative variables, we use an analysis of variance.
\begin{figure}
\begin{subfigure}{0.45\columnwidth}
\includegraphics[width=\columnwidth]{figures/plots/linear_regression_linear.pdf}
\end{subfigure}
\begin{subfigure}{0.45\columnwidth}
\includegraphics[width=\columnwidth]{figures/plots/linear_regression_non_linear.pdf}
\end{subfigure}
\caption{Illustration of two models fitting observed values}
\end{figure}


@ -1,6 +1,10 @@
\DeclareMathOperator{\VVar}{\mathbb{V}} % variance
\DeclareMathOperator{\One}{\mathbf{1}}
\DeclareMathOperator{\Cor}{\mathrm{Cor}}
\DeclareMathOperator{\St}{\mathscr{St}}
\newcommand{\M}[1][]{\ensuremath{\ifstrempty{#1}{\mathcal{M}}{\mathbb{M}_{#1}}}}
\newcommand{\X}{\ensuremath{\mathbf{X}}}
\newcommand{\Y}{\ensuremath{\mathbf{Y}}}
\newcommand{\Z}{\ensuremath{\mathbf{Z}}}
\usepackage{unicode-math}


@ -0,0 +1,26 @@
# Plot an affine model
n <- 250
sd <- 2
epsilon <- rnorm(n, mean = 0, sd = sd)  # gaussian noise shared by both models
beta0 <- 1.25
beta1 <- 4
linear_model <- function(x) {
return(beta0 + beta1*x)
}
x <- runif(n, min=0, max=1)
y <- linear_model(x) + epsilon
pdf("figures/plots/linear_regression_linear.pdf")
plot(x, y, col="#5654fa", type="p", pch=20, xlab="x", ylab="y")
abline(a = beta0, b = beta1, col="red")
dev.off()
non_linear_model <- function(x) {
return(beta0 + beta1 * exp(2*x))
}
non_linear_y <- non_linear_model(x) + epsilon
pdf("figures/plots/linear_regression_non_linear.pdf")
plot(x, non_linear_y, col="#5654fa", type="p", pch=20, xlab="x", ylab="z")
curve(non_linear_model, from=0, to=1, add=T, col="red")
dev.off()

figures/plots/linear_regression_linear.pdf (Stored with Git LFS) Normal file

figures/plots/linear_regression_non_linear.pdf (Stored with Git LFS) Normal file

figures/schemes/.gitattributes vendored Normal file

@ -0,0 +1,3 @@
covariance.pdf filter=lfs diff=lfs merge=lfs -text
../plots/linear_regression_linear.pdf filter=lfs diff=lfs merge=lfs -text
../plots/linear_regression_non_linear.pdf filter=lfs diff=lfs merge=lfs -text

figures/schemes/covariance.pdf (Stored with Git LFS) Normal file


@ -0,0 +1,35 @@
% Scheme of Covariance
\documentclass[margin=0.5cm]{standalone}
\usepackage{tikz}
\usepackage{amssymb}
\begin{document}
\begin{tikzpicture}
\usetikzlibrary{positioning}
\tikzset{
point/.style = {circle, inner sep={.75\pgflinewidth}, opacity=1, draw, black, fill=black},
point name/.style = {insert path={coordinate (#1)}},
}
\begin{scope}[yshift=0]
\draw (-4, 0.5) -- (4,0.5) node[right] {$Y_i$};
\draw (-4, -0.5) -- (4,-0.5) node[right] {$Y_j$};
\node at (6, 0) {$\mathrm{Cov}(Y_i, Y_j) > 0$};
\node (EYipoint) at (0,0.5) {$\times$};
\node at (0, 1) {$\mathbb{E}(Y_i)$};
\node (EYjpoint) at (0,-0.5) {$\times$};
\node at (0, -1) {$\mathbb{E}(Y_j)$};
\foreach \x in {-3, 0.5, 2.75} {
\node[point] at (\x, 0.5) {};
}
\foreach \x in {-2, -1, 3} {
\node[point] at (\x, -0.5) {};
}
\end{scope}
\begin{scope}[yshift=-100]
\draw (-4,0.5) -- (4,0.5) node[right] {$Y_i$};
\draw (-4,-0.5) -- (4,-0.5) node[right] {$Y_j$};
\node at (6, 0) {$\mathrm{Cov}(Y_i, Y_j) \approx 0$};
\end{scope}
\end{tikzpicture}
\end{document}

main.pdf (Stored with Git LFS)