Compare commits
2 Commits: 43acae64f3 ... c552aa24f4

Author | SHA1 | Date |
---|---|---|
Samuel Ortion | c552aa24f4 | |
Samuel Ortion | 29dad16dfb | |
@ -1,7 +1,2 @@
|
||||||
main.pdf filter=lfs diff=lfs merge=lfs -text
|
main.pdf filter=lfs diff=lfs merge=lfs -text
|
||||||
figures/schemes/regression_plan_3D.pdf filter=lfs diff=lfs merge=lfs -text
|
**/*.pdf filter=lfs diff=lfs merge=lfs -text
|
||||||
figures/schemes/vector_orthogonality.pdf filter=lfs diff=lfs merge=lfs -text
|
|
||||||
figures/schemes/base_plan.pdf filter=lfs diff=lfs merge=lfs -text
|
|
||||||
figures/schemes/coordinates_systems.pdf filter=lfs diff=lfs merge=lfs -text
|
|
||||||
figures/schemes/ordinary_least_squares.pdf filter=lfs diff=lfs merge=lfs -text
|
|
||||||
figures/schemes/orthogonal_projection.pdf filter=lfs diff=lfs merge=lfs -text
|
|
||||||
|
|
|
@ -11,8 +11,8 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
\includechapters{part1}{2}
|
\includechapters{part1}{1}
|
||||||
|
|
||||||
% \includechapters{part2}{2}
|
\includechapters{part2}{2}
|
||||||
|
|
||||||
% \includechapters{part3}{1}
|
% \includechapters{part3}{1}
|
|
@ -60,12 +60,6 @@ In order to estimate the parameters, we can use penalties (additional terms).
|
||||||
|
|
||||||
Lasso regression, Elastic Net, etc.
|
Lasso regression, Elastic Net, etc.
|
||||||
|
|
||||||
\subsection{Statistical Analysis Workflow}
|
|
||||||
|
|
||||||
\begin{enumerate}[label={\bfseries\color{primary}Step \arabic*.}]
|
|
||||||
\item Graphical representation;
|
|
||||||
\item ...
|
|
||||||
\end{enumerate}
|
|
||||||
\[
|
\[
|
||||||
Y = X \beta + \varepsilon,
|
Y = X \beta + \varepsilon,
|
||||||
\]
|
\]
|
||||||
|
@ -117,7 +111,7 @@ We want to minimize the distance between $\X\beta$ and $\Y$:
|
||||||
\Rightarrow& \X \beta = proj^{(1, \X)} \Y\\
|
\Rightarrow& \X \beta = proj^{(1, \X)} \Y\\
|
||||||
\Rightarrow& \forall v \in w,\, vy = v proj^w(y)\\
|
\Rightarrow& \forall v \in w,\, vy = v proj^w(y)\\
|
||||||
\Rightarrow& \forall i: \\
|
\Rightarrow& \forall i: \\
|
||||||
& \X_i \Y = \X_i X\hat{\beta} \qquad \text{where $\hat{\beta}$ is the estimator of $\beta$} \\
|
& \X_i \Y = \X_i \X\hat{\beta} \qquad \text{where $\hat{\beta}$ is the estimator of $\beta$} \\
|
||||||
\Rightarrow& \X^T \Y = \X^T \X \hat{\beta} \\
|
\Rightarrow& \X^T \Y = \X^T \X \hat{\beta} \\
|
||||||
\Rightarrow& {\color{gray}(\X^T \X)^{-1}} \X^T \Y = {\color{gray}(\X^T \X)^{-1}} (\X^T\X) \hat{\beta} \\
|
\Rightarrow& {\color{gray}(\X^T \X)^{-1}} \X^T \Y = {\color{gray}(\X^T \X)^{-1}} (\X^T\X) \hat{\beta} \\
|
||||||
\Rightarrow& \hat{\beta} = (\X^T\X)^{-1} \X^T \Y
|
\Rightarrow& \hat{\beta} = (\X^T\X)^{-1} \X^T \Y
|
||||||
|
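As a minimal R sketch of this closed form (simulated data and variable names are hypothetical, not from the course), we can compute $\hat{\beta} = (\X^T\X)^{-1} \X^T \Y$ by hand and check it against lm():

set.seed(1)
n <- 100
x1 <- runif(n); x2 <- runif(n)
y <- 1 + 2 * x1 - x2 + rnorm(n)
X <- cbind(1, x1, x2)                         # design matrix with intercept column
beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y  # (X^T X)^{-1} X^T Y
beta_hat
coef(lm(y ~ x1 + x2))                         # same estimates

Both computations should agree up to numerical precision.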
@ -127,7 +121,7 @@ This formula comes from the orthogonal projection of $\Y$ on the vector subspace
|
||||||
|
|
||||||
$\X \hat{\beta}$ is the closest point to $\Y$ in the subspace generated by $\X$.
|
$\X \hat{\beta}$ is the closest point to $\Y$ in the subspace generated by $\X$.
|
||||||
|
|
||||||
If $H$ is the projection matrix of the subspace generated by $\X$, $X\Y$ is the projection on $\Y$ on this subspace, that corresponds to $\X\hat{\beta}$.
|
If $H$ is the projection matrix of the subspace generated by $\X$, $H\Y$ is the projection of $\Y$ on this subspace, which corresponds to $\X\hat{\beta}$.
|
||||||
|
|
||||||
\section{Sum of squares}
|
\section{Sum of squares}
|
||||||
|
|
||||||
|
@ -145,21 +139,18 @@ $\Y - \X \hat{\beta} \perp \X \hat{\beta} - \Y \One$ if $\One \in V$, so
|
||||||
|
|
||||||
\begin{figure}
|
\begin{figure}
|
||||||
\centering
|
\centering
|
||||||
\includestandalone{figures/schemes/orthogonal_projection}
|
\includegraphics{figures/schemes/orthogonal_projection.pdf}
|
||||||
\caption{Orthogonal projection of $\Y$ on plan generated by the base described by $\X$. $\color{blue}a$ corresponds to $\norm{\X\hat{\beta} - \bar{\Y}}^2$ and $\color{blue}b$ corresponds to $\hat{\varepsilon} = \norm{\Y - \hat{\beta}\X}^2$} and $\color{blue}c$ corresponds to $\norm{Y - \bar{Y}}^2$.
|
\caption{Orthogonal projection of $\Y$ on the plane generated by the basis described by $\X$. $\color{blue}a$ corresponds to $\norm{\X\hat{\beta} - \bar{\Y}}^2$, $\color{blue}b$ corresponds to $\hat{\varepsilon} = \norm{\Y - \X\hat{\beta}}^2$ and $\color{blue}c$ corresponds to $\norm{\Y - \bar{\Y}}^2$.}
|
||||||
\label{fig:scheme-orthogonal-projection}
|
\label{fig:scheme-orthogonal-projection}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
|
||||||
\begin{figure}
|
\begin{figure}
|
||||||
\centering
|
\centering
|
||||||
\includestandalone{figures/schemes/ordinary_least_squares}
|
\includegraphics{figures/schemes/ordinary_least_squares.pdf}
|
||||||
\caption{Ordinary least squares and regression line with simulated data.}
|
\caption{Ordinary least squares and regression line with simulated data.}
|
||||||
\label{fig:ordinary-least-squares}
|
\label{fig:ordinary-least-squares}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
\begin{definition}[Model dimension]
|
\begin{definition}[Model dimension]
|
||||||
Let $\M$ be a model.
|
Let $\M$ be a model.
|
||||||
The dimension of $\M$ is the dimension of the subspace generated by $\X$, that is the number of parameters in the $\beta$ vector.
|
The dimension of $\M$ is the dimension of the subspace generated by $\X$, that is the number of parameters in the $\beta$ vector.
|
||||||
|
@ -169,22 +160,21 @@ $\Y - \X \hat{\beta} \perp \X \hat{\beta} - \Y \One$ if $\One \in V$, so
|
||||||
|
|
||||||
\section{Gaussian vectors}
|
\section{Gaussian vectors}
|
||||||
|
|
||||||
|
|
||||||
\begin{definition}[Normal distribution]
|
\begin{definition}[Normal distribution]
|
||||||
|
$X \sim \Norm(\mu, \sigma^{2})$, with density function $f$
|
||||||
|
\[
|
||||||
|
f(x) = \frac{1}{\sigma \sqrt{2\pi}}e^{-\frac{1}{2}(\frac{x-\mu}{\sigma})^{2}}
|
||||||
|
\]
|
||||||
\end{definition}
|
\end{definition}
|
||||||
|
|
||||||
|
|
||||||
\begin{definition}[Gaussian vector]
|
\begin{definition}[Gaussian vector]
|
||||||
A random vector $\Y \in \RR[n]$ is a gaussian vector if every linear combination of its component is ...
|
A random vector $\Y \in \RR[n]$ is a Gaussian vector if every linear combination of its components is a Gaussian random variable.
|
||||||
\end{definition}
|
\end{definition}
|
||||||
|
|
||||||
\begin{property}
|
\begin{property}
|
||||||
$m = \EE(Y) = (m_1, \ldots, m_n)^T$, where $m_i = \EE(Y_i)$
|
$m = \EE(Y) = (m_1, \ldots, m_n)^T$, where $m_i = \EE(Y_i)$
|
||||||
|
|
||||||
|
|
||||||
...
|
|
||||||
|
|
||||||
\[
|
\[
|
||||||
\Y \sim \Norm_n(m, \Sigma)
|
\Y \sim \Norm_n(m, \Sigma)
|
||||||
\]
|
\]
|
||||||
|
@ -193,8 +183,6 @@ $\Y - \X \hat{\beta} \perp \X \hat{\beta} - \Y \One$ if $\One \in V$, so
|
||||||
\Sigma = \E\left[(\Y -m)(\Y - m)^T\right].
|
\Sigma = \E\left[(\Y -m)(\Y - m)^T\right].
|
||||||
\]
|
\]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
\end{property}
|
\end{property}
|
||||||
|
|
||||||
\begin{remark}
|
\begin{remark}
|
||||||
|
@ -253,6 +241,7 @@ Covariance is really sensitive to scale of variables. For instance, if we measur
|
||||||
|
|
||||||
|
|
||||||
\begin{theorem}[Cochran Theorem (Consequence)]
|
\begin{theorem}[Cochran Theorem (Consequence)]
|
||||||
|
\label{thm:cochran}
|
||||||
Let $\mathbf{Z}$ be a gaussian vector: $\mathbf{Z} \sim \Norm_n(0_n, I_n)$.
|
Let $\mathbf{Z}$ be a gaussian vector: $\mathbf{Z} \sim \Norm_n(0_n, I_n)$.
|
||||||
|
|
||||||
\begin{itemize}
|
\begin{itemize}
|
||||||
|
@ -260,11 +249,33 @@ Covariance is really sensitive to scale of variables. For instance, if we measur
|
||||||
\[
|
\[
|
||||||
\RR[n] = V_1 \overset{\perp}{\oplus} V_2.
|
\RR[n] = V_1 \overset{\perp}{\oplus} V_2.
|
||||||
\]
|
\]
|
||||||
\item If $Z_1, Z_2$ are orthogonal of $\mathbf{Z}$ on $V_1$ and $V_2$ i.e. $Z_1 = \Pi_{V_1}(\mathbf{Z}) = \Pi_1 \Y$ and $Z_2 = \Pi_{V_2} (\mathbf{Z}) = \Pi_2 \Y$...
|
\item If $Z_1, Z_2$ are the orthogonal projections of $\mathbf{Z}$ on $V_1$ and $V_2$, i.e. $Z_1 = \Pi_{V_1}(\mathbf{Z}) = \Pi_1 \mathbf{Z}$ and $Z_2 = \Pi_{V_2} (\mathbf{Z}) = \Pi_2 \mathbf{Z}$ ($\Pi_{1}$ and $\Pi_{2}$ being the projection matrices),
|
||||||
(\textcolor{red}{look to the slides})
|
then:
|
||||||
|
\item $z_{1}$, $Z_{2}$ are independent gaussian vectors, $Z_{1} \sim \Norm_{n_{1}} (0_{n}, \Pi_{1})$ and $Z_{2} \sim \Norm(0_{n_{2}}, \Pi_{2})$.
|
||||||
|
|
||||||
|
In particular $\norm{Z_{1}}^{2} \sim \chi^{2}(n_{1})$ and $\norm{Z_{2}}^{2} \sim \chi^{2}(n_{2})$, where $n_{1} = \dim(V_{1})$ and $n_{2} = \dim(V_{2})$.
|
||||||
\end{itemize}
|
\end{itemize}
|
||||||
|
|
||||||
|
$Z_{1} = \Pi_{V_1}(\Z)$ is the projection of $\Z$ on the subspace $V_1$.
|
||||||
|
|
||||||
|
\dots
|
||||||
\end{theorem}
|
\end{theorem}
|
||||||
|
|
||||||
|
\begin{property}[Estimators properties in the linear model]
|
||||||
|
According to \autoref{thm:cochran},
|
||||||
|
\[
|
||||||
|
\hat{m} \text{ is independent from $\hat{\sigma}^2$}
|
||||||
|
\]
|
||||||
|
\[
|
||||||
|
\norm{\Y - \Pi_V(\Y)}^2 = \norm{\varepsilon - \Pi_{V}(\varepsilon)}^{2} = \norm{\Pi_{V}^{\perp} (\varepsilon)}^{2}
|
||||||
|
\]
|
||||||
|
|
||||||
|
$\hat{m} = \X \hat{\beta}$
|
||||||
|
|
||||||
|
$\hat{m}$ is the estimation of the mean.
|
||||||
|
\end{property}
|
||||||
|
|
||||||
|
|
||||||
\begin{definition}[Chi 2 distribution]
|
\begin{definition}[Chi 2 distribution]
|
||||||
If $X_1, \ldots, X_n$ i.i.d. $\sim \Norm(0, 1)$, then;,
|
If $X_1, \ldots, X_n$ are i.i.d. $\sim \Norm(0, 1)$, then
|
||||||
\[
|
\[
|
||||||
|
@ -281,40 +292,362 @@ Covariance is really sensitive to scale of variables. For instance, if we measur
|
||||||
|
|
||||||
\begin{align*}
|
\begin{align*}
|
||||||
\hat{m} &= \X \hat{\beta} = \X(\X^T\X)^{-1} \X^T \Y \\
|
\hat{m} &= \X \hat{\beta} = \X(\X^T\X)^{-1} \X^T \Y \\
|
||||||
\text{so} \\
|
\intertext{so}
|
||||||
&= \Pi_V \Y
|
&= \Pi_V \Y
|
||||||
\end{align*}
|
\end{align*}
|
||||||
|
|
||||||
According to Cochran theorem, we can deduce that the estimator of the predicted value $\hat{m}$ is independent $\hat{\sigma}^2$
|
According to the Cochran theorem, we can deduce that the estimator of the predicted value $\hat{m}$ is independent of $\hat{\sigma}^2$.
|
||||||
|
|
||||||
All the sum of squares follows a $\chi^2$ distribution:
|
All the sum of squares follows a $\chi^2$ distribution.
|
||||||
\[
|
|
||||||
...
|
|
||||||
\]
|
|
||||||
|
|
||||||
\begin{property}
|
|
||||||
|
|
||||||
\end{property}
|
\subsection{Estimators properties}
|
||||||
|
|
||||||
\subsection{Estimators consistency}
|
|
||||||
|
|
||||||
If $q < n$,
|
|
||||||
\begin{itemize}
|
\begin{itemize}
|
||||||
\item $\hat{\sigma}^2 \overunderset{\PP}{n\to\infty} \sigma^{*2}$.
|
\item $\hat{m}$ is an unbiased estimator of $m$;
|
||||||
\item If $(\X^T\X)^{-1}$...
|
\item $\EE(\hat{\sigma}^{2}) = \sigma^{2}(n-q)/n$, so $\hat{\sigma}^{2}$ is a biased estimator of $\sigma^{2}$.
|
||||||
\item ...
|
\[
|
||||||
|
S^{2} = \frac{1}{n-q} \norm{\Y - \Pi_{V}(\Y)}^{2}
|
||||||
|
\]
|
||||||
|
is an unbiased estimator of $\sigma^{2}$.
|
||||||
\end{itemize}
|
\end{itemize}
|
||||||
|
|
||||||
We can derive statistical test from these properties.
|
We can derive statistical tests from these properties.
|
||||||
|
|
||||||
|
|
||||||
\section{Statistical tests}
|
\section{Statistical tests}
|
||||||
|
|
||||||
\subsection{Student $t$-test}
|
\subsection{Student $t$-test}
|
||||||
|
|
||||||
|
|
||||||
\[
|
\[
|
||||||
\frac{\hat{\theta}-\theta}{\sqrt{\frac{\widehat{\VVar}(\hat{\theta})}{n}}} \underset{H_0}{\sim} t
|
\frac{\hat{\theta}-\theta}{\sqrt{\frac{\widehat{\VVar}(\hat{\theta})}{n}}} \underset{H_0}{\sim} t_{n-q}
|
||||||
\]
|
\]
|
||||||
|
|
||||||
where
|
where
|
||||||
|
|
||||||
|
\paragraph{Estimation of $\sigma^2$}
|
||||||
|
|
||||||
|
A biased estimator of $\sigma^2$ is:
|
||||||
|
\[
|
||||||
|
\hat{\sigma}^2 = \frac{1}{n} \norm{\Y - \Pi_V(\Y)}^2
|
||||||
|
\]
|
||||||
|
|
||||||
|
$S^2$ is the unbiased estimator of $\sigma^2$
|
||||||
|
\begin{align*}
|
||||||
|
S^2 &= \frac{1}{n-q} \norm{\Y - \Pi_V(\Y)}^2 \\
|
||||||
|
&= \frac{1}{n-q} \sum_{i=1}^n (Y_i - (\X\hat{\beta})_i)^2
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
\begin{remark}[On $\hat{m}$]
|
||||||
|
\begin{align*}
|
||||||
|
&\Y = \X \beta + \varepsilon \\
|
||||||
|
\Leftrightarrow& \EE(\Y) = \X \beta
|
||||||
|
\end{align*}
|
||||||
|
\end{remark}
|
||||||
|
|
||||||
|
\section{Student test of nullity of a parameter}
|
||||||
|
|
||||||
|
Let $\beta_j$ be a parameter, the tested hypotheses are as follows:
|
||||||
|
\[
|
||||||
|
\begin{cases}
|
||||||
|
(H_0): \beta_j = 0 \\
|
||||||
|
(H_1): \beta_j \neq 0
|
||||||
|
\end{cases}
|
||||||
|
\]
|
||||||
|
|
||||||
|
Under the null hypothesis:
|
||||||
|
\[
|
||||||
|
\frac{\hat{\beta}_j - \beta_j}{S \sqrt{(\X^T \X)^{-1}_{j,j}}} \sim \St(n-q).
|
||||||
|
\]
|
||||||
|
The test statistic is:
|
||||||
|
\[
|
||||||
|
W_n = \frac{\hat{\beta}_j}{S \sqrt{(\X^T\X)^{-1}_{j,j}}} \underset{H_0}{\sim} \St(n-q).
|
||||||
|
\]
|
||||||
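As an illustration (a sketch on hypothetical simulated data, not the course's example), summary() on an lm fit reports, for each coefficient, exactly this statistic $W_n$ (the "t value" column) and the corresponding p-value under $\St(n-q)$:

set.seed(2)
n <- 100
x1 <- runif(n); x2 <- runif(n)
y <- 1 + 0.5 * x1 + rnorm(n)    # x2 has no real effect
fit <- lm(y ~ x1 + x2)
coef(summary(fit))              # Estimate, Std. Error, t value, Pr(>|t|)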
|
|
||||||
|
$\hat{\beta}$ is a multinormal vector.
|
||||||
|
|
||||||
|
Let's consider a vector of 4 values:
|
||||||
|
\begin{align*}
|
||||||
|
\begin{pmatrix}
|
||||||
|
\hat{\beta}_0 \\
|
||||||
|
\hat{\beta}_1 \\
|
||||||
|
\hat{\beta}_2 \\
|
||||||
|
\hat{\beta}_3
|
||||||
|
\end{pmatrix}
|
||||||
|
\sim \Norm_4 \left( \begin{pmatrix}
|
||||||
|
\beta_0 \\
|
||||||
|
\beta_1 \\
|
||||||
|
\beta_2 \\
|
||||||
|
\beta_3
|
||||||
|
\end{pmatrix} ;
|
||||||
|
\sigma^2 \left(\X^T \X\right)^{-1}
|
||||||
|
\right)
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
Let $\M$ be the following model
|
||||||
|
\begin{align*}
|
||||||
|
Y_i &= \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \beta_3 X_{3i} + \varepsilon_i
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
Why can't we use the following model to test each of the parameters values (here for $X_2$)?
|
||||||
|
\[
|
||||||
|
Y_i = \theta_0 + \theta_1 X_{2i} + \varepsilon_i
|
||||||
|
\]
|
||||||
|
We can't use such a model: we would probably run into a confounding factor. Even if we are only interested in the relationship between $X_2$ and $Y$, we have to fit the whole model.
|
||||||
|
|
||||||
|
\begin{example}[Confounding parameter]
|
||||||
|
Let $Y$ be a variable related to the lung cancer. Let $X_1$ be the smoking status, and $X_2$ the variable `alcohol' (for instance the quantity of alcohol drunk per week).
|
||||||
|
|
||||||
|
If we only fit the model $\M: Y_i = \theta_0 + \theta_1 X_{2i} + \varepsilon_i$, we could conclude that there is a relationship between alcohol and lung cancer, because alcohol consumption and smoking are strongly related. If we had fitted the model $\M: Y_i = \theta_0 + \theta_1 X_{1i} + \theta_2 X_{2i} + \varepsilon_i$, we might indeed have found no significant relationship between $X_2$ and $Y$.
|
||||||
|
\end{example}
|
||||||
|
|
||||||
|
\begin{definition}[Student law]
|
||||||
|
Let $X$ and $Y$ be two random variables such that $X \indep Y$, $X \sim \Norm(0, 1)$ and $Y \sim \chi_n^2$, then
|
||||||
|
\[
|
||||||
|
\frac{X}{\sqrt{Y/n}} \sim \St(n)
|
||||||
|
\]
|
||||||
|
\end{definition}
|
||||||
|
|
||||||
|
\subsection{Model comparison}
|
||||||
|
|
||||||
|
\begin{definition}[Nested models]
|
||||||
|
A model $\M_{q'}$ is nested in a model $\M_q$ if the subspace generated by the predictors of $\M_{q'}$ is included in the subspace generated by the predictors of $\M_q$ (for instance, when the predictors of $\M_{q'}$ are a subset of those of $\M_q$).
||||||
|
\end{definition}
|
||||||
|
|
||||||
|
Let $\M_2$ and $\M_4$ be two models:
|
||||||
|
|
||||||
|
$\M_2: Y_i = \beta_0 + \beta_3 X_{3_i} + \varepsilon_i$
|
||||||
|
|
||||||
|
$\M_4: Y_i = \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \beta_3 X_{3i} + \varepsilon_i$
|
||||||
|
|
||||||
|
$\M_2$ is nested in $\M_4$.
|
||||||
|
|
||||||
|
\paragraph*{Principle} We compare the residual variances of the two models, that is, the variance that is not explained by the model.
|
||||||
|
|
||||||
|
The better the model, the smaller the residual variance.
|
||||||
|
|
||||||
|
If everything is explained by the model, the residual variance would be null.
|
||||||
|
|
||||||
|
|
||||||
|
Here $\M_4$ holds all the information found in $\M_2$ plus additional information. In the worst case, it is at least as good as $\M_2$.
|
||||||
|
|
||||||
|
\subsection{Fisher $F$-test of model comparison}
|
||||||
|
|
||||||
|
Let $\M_q$ and $\M_{q'}$ be two models such that $\dim(\M_q) = q$, $\dim(\M_{q'}) = q'$, $q > q'$ and $\M_{q'}$ is nested in $\M_q$.
|
||||||
|
|
||||||
|
\paragraph{Tested hypotheses}
|
||||||
|
\[
|
||||||
|
\begin{cases}
|
||||||
|
(H_0): \M_{q'} \text{ is the proper model} \\
|
||||||
|
(H_1): \M_q \text{ is a better model}
|
||||||
|
\end{cases}
|
||||||
|
\]
|
||||||
|
|
||||||
|
\begin{description}
|
||||||
|
\item[ESS] Estimated Sum of Squares
|
||||||
|
\item[RSS] Residual Sum of Squares
|
||||||
|
\item[EMS] Estimated Mean Square
|
||||||
|
\item[RMS] Residual Mean Square
|
||||||
|
\end{description}
|
||||||
|
|
||||||
|
\[
|
||||||
|
ESS = RSS(\M_{q'}) - RSS(\M_q)
|
||||||
|
\]
|
||||||
|
\[
|
||||||
|
RSS(\M) = \norm{\Y - \X\hat{\beta}}^2 = \sum_{i=1}^n \hat{\varepsilon}_i^2
|
||||||
|
\]
|
||||||
|
\[
|
||||||
|
EMS = \frac{ESS}{q - q'}
|
||||||
|
\]
|
||||||
|
\[
|
||||||
|
RMS = \frac{RSS(\M_q)}{n-q}
|
||||||
|
\]
|
||||||
|
|
||||||
|
Under the null hypotheses:
|
||||||
|
\[
|
||||||
|
F = \frac{EMS}{RMS} \underset{H_0}{\sim} \Fish(q-q'; n-q)
|
||||||
|
\]
|
||||||
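A hedged R sketch of this comparison (hypothetical simulated data): anova() applied to two nested lm fits computes exactly this $F$ statistic and its p-value.

set.seed(3)
d <- data.frame(x1 = runif(50), x2 = runif(50), x3 = runif(50))
d$y <- 1 + 2 * d$x1 + rnorm(50)
m2 <- lm(y ~ x3, data = d)             # M_2: nested model
m4 <- lm(y ~ x1 + x2 + x3, data = d)   # M_4: full model
anova(m2, m4)                          # F = EMS / RMS with (q - q', n - q) df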
|
|
||||||
|
\section{Model validity}
|
||||||
|
|
||||||
|
Assumptions:
|
||||||
|
\begin{itemize}
|
||||||
|
\item $\X$ is a full rank matrix;
|
||||||
|
\item Residuals are i.i.d. $\varepsilon \sim \Norm(0_n, \sigma^2 \mathcal{I}_n)$;
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
We have also to look for influential variables.
|
||||||
|
|
||||||
|
|
||||||
|
\subsection{$\X$ is full rank}
|
||||||
|
|
||||||
|
To check that the rank of the matrix is $p+1$, we can compute the eigenvalues of the correlation matrix of $\X$. If there were a perfect linear relationship between two variables (two columns of $\X$), one of the eigenvalues would be zero. In practice, we never get an exactly null eigenvalue, so we consider the condition index $\kappa = \frac{\lambda_1}{\lambda_p}$, the ratio between the largest and the smallest eigenvalues, with $\lambda_1 \geq \lambda_2 \geq \ldots \geq \lambda_p$ the eigenvalues.
|
||||||
|
|
||||||
|
|
||||||
|
If all eigenvalues are different from 0, $\X^T \X$ can be inverted; but if some eigenvalues are close to 0, the variance of the estimated parameters will be large, and the estimation of the parameters will not be reliable.
|
||||||
|
|
||||||
|
\paragraph{Variance Inflation Factor}
|
||||||
|
|
||||||
|
Perform a regression of each of the predictors against the other predictors.
|
||||||
|
|
||||||
|
If there is a strong linear relationship between a predictor and the others, the coefficient of determination $R^2_j$ of this auxiliary regression (the amount of variance explained by the model) will be high, which means that there is a strong relationship between the predictors.
|
||||||
|
|
||||||
|
We do this for all parameters, and for parameter $j = 1, \ldots, p$, the variance inflation factor would be:
|
||||||
|
\[
|
||||||
|
VIF_j = \frac{1}{1-R^2_j}.
|
||||||
|
\]
|
||||||
|
|
||||||
|
\subparagraph*{Rule}
|
||||||
|
If $VIF > 10$ or $VIF > 100$\dots
|
||||||
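A minimal R sketch of the VIF computation (simulated data with an artificially strong collinearity; names are hypothetical), without any extra package:

set.seed(4)
n <- 100
x1 <- runif(n)
x2 <- x1 + rnorm(n, sd = 0.1)                # strongly correlated with x1
x3 <- runif(n)
r2_1 <- summary(lm(x1 ~ x2 + x3))$r.squared  # R^2_1 of the auxiliary regression
1 / (1 - r2_1)                               # VIF_1, large here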
|
|
||||||
|
|
||||||
|
In case of multicollinearity, we have to remove the variable one by one until there is no longer multicollinearity.
|
||||||
|
Variables have to be removed based on statistical results and through discussion with experimenters.
|
||||||
|
|
||||||
|
|
||||||
|
\subsection{Residuals analysis}
|
||||||
|
|
||||||
|
\paragraph*{Assumption}
|
||||||
|
\[
|
||||||
|
\varepsilon \sim \Norm_n(0_n, \sigma^2 I_n)
|
||||||
|
\]
|
||||||
|
|
||||||
|
\paragraph{Normality of the residuals} If $\varepsilon_i$ ($i=1, \ldots, n$) could be observed we could build a QQ-plot of $\varepsilon_i / \sigma$ against quantiles of $\Norm(0, 1)$.
|
||||||
|
|
||||||
|
Only the residual errors $\hat{e}_i$ can be observed:
|
||||||
|
|
||||||
|
Let $e_i^*$ be the studentized residual, considered as estimators of $\varepsilon_i$
|
||||||
|
|
||||||
|
\[
|
||||||
|
e_i^* = \frac{\hat{e}_i}{\sqrt{\hat{\sigma}^2_{(i)} (1 - H_{ii})}}
|
||||||
|
\]
|
||||||
|
|
||||||
|
\begin{align*}
|
||||||
|
\hat{Y} &= X \hat{\beta} \\
|
||||||
|
&= X \left( (X^TX)^{-1} X^T Y\right) \\
|
||||||
|
&= \underbrace{X (X^TX)^{-1} X^T}_{H} Y
|
||||||
|
\end{align*}
|
||||||
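In R, the studentized residuals $e_i^*$ and the leverages $H_{ii}$ of a fitted lm model are directly available; a sketch on hypothetical simulated data:

set.seed(5)
n <- 100
x <- runif(n)
y <- 1 + 2 * x + rnorm(n)
fit <- lm(y ~ x)
e_star <- rstudent(fit)              # studentized residuals e_i^*
h <- hatvalues(fit)                  # leverages H_ii (diagonal of H)
plot(fitted(fit), e_star)            # residuals against fitted values
qqnorm(e_star); qqline(e_star)       # normality check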
|
|
||||||
|
\paragraph{Centered residuals} If $(1, \ldots, 1)^T$ belongs to the subspace generated by $\X$, the residuals are centered by construction.
|
||||||
|
|
||||||
|
\paragraph{Independence} We do not have a statistical test for independence in R, so we plot the residuals $\hat{e}$ against the fitted values $\X \hat{\beta}$.
|
||||||
|
|
||||||
|
\paragraph{Homoscedasticity} Plot $\sqrt{|e^*|}$ against the fitted values $\X \hat{\beta}$.
|
||||||
|
|
||||||
|
|
||||||
|
\paragraph{Influential observations}
|
||||||
|
|
||||||
|
We make the distinction between observations:
|
||||||
|
\begin{itemize}
|
||||||
|
\item With too large residual
|
||||||
|
$\rightarrow$ Influence on the estimation of $\sigma^2$
|
||||||
|
\item Which are too isolated
|
||||||
|
$\rightarrow$ Influence on the estimation of $\beta$
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\[
|
||||||
|
e_i^* \sim \St(n-p-1)
|
||||||
|
\]
|
||||||
|
\subparagraph*{Rule} We consider an observation to be aberrant if:
|
||||||
|
\[
|
||||||
|
e_i^* > \F^{-1}_{\St(n-p-1)}(1-\alpha)
|
||||||
|
\]
|
||||||
|
i.e. the quantile of order $1-\alpha$ of the $\St(n-p-1)$ distribution, with $\alpha$ often set to $1/n$; alternatively, the threshold is simply set to 2.
|
||||||
|
|
||||||
|
\paragraph{Leverage} The leverage is the diagonal term $H_{ii}$ of the orthogonal projection (hat) matrix $H$.
|
||||||
|
|
||||||
|
\begin{property}
|
||||||
|
\begin{itemize}
|
||||||
|
\item $0 \leq H_{ii} \leq 1$
|
||||||
|
\item $\sum_i H_{ii} = p$
|
||||||
|
\end{itemize}
|
||||||
|
\end{property}
|
||||||
|
|
||||||
|
\subparagraph*{Rule} We consider that the observation is aberrant if the leverage is ??.
|
||||||
|
|
||||||
|
|
||||||
|
\paragraph{Non-linearity}
|
||||||
|
|
||||||
|
|
||||||
|
\section{Model Selection}
|
||||||
|
|
||||||
|
We want to select the best model with the smallest number of predictors.
|
||||||
|
|
||||||
|
When models have too many explanatory variables, the power of statistical tests decreases.
|
||||||
|
|
||||||
|
Different methods:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Comparison of nested models;
|
||||||
|
\item Information criteria;
|
||||||
|
\item Method based on the prediction error.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsection{Information criteria}
|
||||||
|
|
||||||
|
\subsubsection{Likelihood}
|
||||||
|
|
||||||
|
\begin{definition}[Likelihood]
|
||||||
|
The probability of observing the data we observed, under a particular model.
|
||||||
|
\[
|
||||||
|
L_n (\M(k))
|
||||||
|
\]
|
||||||
|
\end{definition}
|
||||||
|
|
||||||
|
|
||||||
|
\begin{definition}[Akaike Information Criterion]
|
||||||
|
\[
|
||||||
|
AIC(\M(k)) = -2 \log L_n (\M(k)) + 2k.
|
||||||
|
\]
|
||||||
|
|
||||||
|
$2k$ is a penalty term that favours smaller models.
|
||||||
|
\end{definition}
|
||||||
|
|
||||||
|
\begin{definition}[Bayesian Information Criterion]
|
||||||
|
\[
|
||||||
|
BIC(\M(k)) = -2 \log L_n (\M(k)) + \log(n) k.
|
||||||
|
\]
|
||||||
|
$\log(n) k$ is a penalty.
|
||||||
|
\end{definition}
|
||||||
|
|
||||||
|
Usually $AIC$ has a smaller penalty than $BIC$, so the $AIC$ criterion tends to select models with more variables than the $BIC$ criterion.
|
||||||
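A short R sketch of both criteria (hypothetical data frame d; the response is pure noise here, so smaller models should win):

set.seed(6)
d <- data.frame(y = rnorm(50), x1 = runif(50), x2 = runif(50), x3 = runif(50))
fit_small <- lm(y ~ x1, data = d)
fit_large <- lm(y ~ x1 + x2 + x3, data = d)
AIC(fit_small, fit_large)   # -2 log L + 2k
BIC(fit_small, fit_large)   # -2 log L + log(n) k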
|
|
||||||
|
\subsection{Stepwise}
|
||||||
|
|
||||||
|
\begin{description}
|
||||||
|
\item[forward] Add new predictor iteratively, beginning with the most contributing predictors.
|
||||||
|
\item[backward] Remove predictors iteratively.
|
||||||
|
\item[stepwise] Combination of forward and backward selection. We start with no predictors and add predictors one at a time; before adding a new predictor, we check whether all previously included predictors remain meaningful.
|
||||||
|
\end{description}
|
||||||
|
|
||||||
|
The problem with these iterative procedures is that we perform a test at each step, so we should in principle adjust the confidence level for multiple testing.
|
||||||
|
|
||||||
|
In practice, the multiple testing problem is not taken into account in these approaches.
|
||||||
|
|
||||||
|
We can use information criteria or model comparison in these methods.
|
||||||
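Continuing the sketch above (same hypothetical data frame d), step() implements these strategies, using AIC by default:

full <- lm(y ~ x1 + x2 + x3, data = d)
step(full, direction = "backward")                # backward elimination
step(lm(y ~ 1, data = d), direction = "forward",
     scope = ~ x1 + x2 + x3)                      # forward selection from the null model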
|
|
||||||
|
\section{Predictions}
|
||||||
|
|
||||||
|
Let $X_i$ the $i$-th row of the matrix $\X$. The observed value $Y_i$ can be estimated by:
|
||||||
|
\[
|
||||||
|
\hat{Y}_i = (\X \hat{\beta})_i = X_i \hat{\beta}
|
||||||
|
\]
|
||||||
|
|
||||||
|
\begin{align*}
|
||||||
|
\EE (\hat{Y}_i) &= (\X \beta)_i = X_i \beta \\
|
||||||
|
\sigma^{-1} (\hat{\beta} - \beta) \sim \Norm (0_{p+1}, (\X^T \X)^{-1}), \qquad \text{and} \\
|
||||||
|
\Var(\hat{Y}_i) &= \sigma^2 X_i (\X^T \X)^{-1} X_i^T = \sigma^2 H_{ii} \\
|
||||||
|
S^2 = \norm{...}
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
|
||||||
|
\paragraph{Prediction Confidence Interval}
|
||||||
|
We can build confidence interval for predicted values $(\X \hat{\beta})_i$
|
||||||
|
|
||||||
|
\dots
|
||||||
|
|
||||||
|
\paragraph{Prediction error of $Y$}
|
||||||
|
|
||||||
|
|
||||||
|
\paragraph{Prediction interval for a new observation $Y_{n+1}$}
|
||||||
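A minimal R sketch of both intervals (hypothetical simulated data): predict() returns the confidence interval for the predicted mean and the wider prediction interval for a new observation $Y_{n+1}$.

set.seed(8)
d <- data.frame(x = runif(50))
d$y <- 1 + 2 * d$x + rnorm(50)
fit <- lm(y ~ x, data = d)
new_obs <- data.frame(x = 0.5)
predict(fit, newdata = new_obs, interval = "confidence")  # CI for the mean X_{n+1} beta
predict(fit, newdata = new_obs, interval = "prediction")  # PI for Y_{n+1} itself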
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,220 +1,186 @@
|
||||||
\chapter{Elements of Linear Algebra}
|
\chapter{Generalized Linear Model}
|
||||||
\label{ch:elements-of-linear-algebra}
|
|
||||||
|
|
||||||
\begin{remark}[vector]
|
\begin{example}
|
||||||
Let $u$ a vector, we will use interchangeably the following notations: $u$ and $\vec{u}$
|
|
||||||
\end{remark}
|
|
||||||
|
|
||||||
Let $u = \begin{pmatrix}
|
\begin{description}
|
||||||
u_1 \\
|
\item[Ex. 1 - Credit Card Default]
|
||||||
\vdots \\
|
Let $Y_i$ be a boolean random variable following a Bernoulli distribution.
|
||||||
u_n
|
\item[Ex. 2 - Horseshoe Crabs]
|
||||||
\end{pmatrix}$ and $v = \begin{pmatrix}
|
Let $Y_i$ be the number of satellite males.
|
||||||
v_1 \\
|
|
||||||
\vdots \\
|
|
||||||
v_n
|
|
||||||
\end{pmatrix}$
|
|
||||||
|
|
||||||
\begin{definition}[Scalar Product (Dot Product)]
|
$Y_i$ can be described as following a Poisson distribution.
|
||||||
\begin{align*}
|
\end{description}
|
||||||
\scalar{u, v} & = \begin{pmatrix}
|
|
||||||
u_1, \ldots, u_v
|
|
||||||
\end{pmatrix}
|
|
||||||
\begin{pmatrix}
|
|
||||||
v_1 \\
|
|
||||||
\vdots \\
|
|
||||||
v_n
|
|
||||||
\end{pmatrix} \\
|
|
||||||
& = u_1 v_1 + u_2 v_2 + \ldots + u_n v_n
|
|
||||||
\end{align*}
|
|
||||||
|
|
||||||
We may use $\scalar{u, v}$ or $u \cdot v$ notations.
|
|
||||||
\end{definition}
|
|
||||||
\paragraph{Dot product properties}
|
|
||||||
\begin{description}
|
|
||||||
\item[Commutative] $\scalar{u, v} = \scalar{v, u}$
|
|
||||||
\item[Distributive] $\scalar{(u+v), w} = \scalar{u, w} + \scalar{v, w}$
|
|
||||||
\item $\scalar{u, v} = \norm{u} \times \norm{v} \times \cos(\widehat{u, v})$
|
|
||||||
\item $\scalar{a, a} = \norm{a}^2$
|
|
||||||
\end{description}
|
|
||||||
|
|
||||||
\begin{definition}[Norm]
|
|
||||||
Length of the vector.
|
|
||||||
\[
|
|
||||||
\norm{u} = \sqrt{\scalar{u, v}}
|
|
||||||
\]
|
|
||||||
|
|
||||||
$\norm{u, v} > 0$
|
|
||||||
\end{definition}
|
|
||||||
|
|
||||||
\begin{definition}[Distance]
|
|
||||||
\[
|
|
||||||
dist(u, v) = \norm{u-v}
|
|
||||||
\]
|
|
||||||
\end{definition}
|
|
||||||
|
|
||||||
\begin{definition}[Orthogonality]
|
|
||||||
|
|
||||||
\end{definition}
|
|
||||||
|
|
||||||
\begin{remark}
|
|
||||||
\[
|
|
||||||
(dist(u, v))^2 = \norm{u - v}^2,
|
|
||||||
\] and
|
|
||||||
\[
|
|
||||||
\scalar{v-u, v-u}
|
|
||||||
\]
|
|
||||||
\end{remark}
|
|
||||||
|
|
||||||
\begin{figure}
|
|
||||||
\centering
|
|
||||||
\includestandalone{figures/schemes/vector_orthogonality}
|
|
||||||
\caption{Scalar product of two orthogonal vectors.}
|
|
||||||
\label{fig:scheme-orthogonal-scalar-product}
|
|
||||||
\end{figure}
|
|
||||||
|
|
||||||
\begin{align*}
|
|
||||||
\scalar{v-u, v-u} & = \scalar{v, v} + \scalar{u, u} - 2 \scalar{u, v} \\
|
|
||||||
& = \norm{v}^2 + \norm{u}^2 \\
|
|
||||||
& = -2 \scalar{u, v}
|
|
||||||
\end{align*}
|
|
||||||
|
|
||||||
\begin{align*}
|
|
||||||
\norm{u - v}^2 & = \norm{u}^2 + \norm{v}^2 - 2 \scalar{u,v} \\
|
|
||||||
\norm{u + v}^2 & = \norm{u}^2 + \norm{v}^2 + 2 \scalar{u,v}
|
|
||||||
\end{align*}
|
|
||||||
|
|
||||||
\begin{proposition}[Scalar product of orthogonal vectors]
|
|
||||||
\[
|
|
||||||
u \perp v \Leftrightarrow \scalar{u, v} = 0
|
|
||||||
\]
|
|
||||||
\end{proposition}
|
|
||||||
|
|
||||||
\begin{proof}[Indeed]
|
|
||||||
$\norm{u-v}^2 = \norm{u+v}^2$, as illustrated in \autoref{fig:scheme-orthogonal-scalar-product}.
|
|
||||||
\begin{align*}
|
|
||||||
\Leftrightarrow & -2 \scalar{u, v} = 2 \scalar{u, v} \\
|
|
||||||
\Leftrightarrow & 4 \scalar{u, v} = 0 \\
|
|
||||||
\Leftrightarrow & \scalar{u, v} = 0
|
|
||||||
\end{align*}
|
|
||||||
\end{proof}
|
|
||||||
|
|
||||||
\begin{theorem}[Pythagorean theorem]
|
|
||||||
If $u \perp v$, then $\norm{u+v}^2 = \norm{u}^2 + \norm{v}^2$ .
|
|
||||||
\end{theorem}
|
|
||||||
|
|
||||||
\begin{definition}[Orthogonal Projection]
|
|
||||||
|
|
||||||
\end{definition}
|
|
||||||
Let $y = \begin{pmatrix}
|
|
||||||
y_1 \\
|
|
||||||
. \\
|
|
||||||
y_n
|
|
||||||
\end{pmatrix} \in \RR[n]$ and $w$ a subspace of $\RR[n]$.
|
|
||||||
$\mathcal{Y}$ can be written as the orthogonal projection of $y$ on $w$:
|
|
||||||
\[
|
|
||||||
\mathcal{Y} = proj^w(y) + z,
|
|
||||||
\]
|
|
||||||
where
|
|
||||||
\[
|
|
||||||
\begin{cases}
|
|
||||||
z \in w^\perp \\
|
|
||||||
proj^w(y) \in w
|
|
||||||
\end{cases}
|
|
||||||
\]
|
|
||||||
There is only one vector $\mathcal{Y}$ that ?
|
|
||||||
|
|
||||||
The scalar product between $z$ and (?) is zero.
|
|
||||||
|
|
||||||
\begin{property}
|
|
||||||
$proj^w(y)$ is the closest vector to $y$ that belongs to $w$.
|
|
||||||
\end{property}
|
|
||||||
|
|
||||||
\begin{definition}[Matrix]
|
|
||||||
A matrix is an application, that is, a function that transform a thing into another, it is a linear function.
|
|
||||||
\end{definition}
|
|
||||||
|
|
||||||
\begin{example}[Matrix application]
|
|
||||||
|
|
||||||
Let $A$ be a matrix:
|
|
||||||
\[
|
|
||||||
A = \begin{pmatrix}
|
|
||||||
a & b \\
|
|
||||||
c & d
|
|
||||||
\end{pmatrix}
|
|
||||||
\] and
|
|
||||||
\[
|
|
||||||
x = \begin{pmatrix}
|
|
||||||
x_1 \\
|
|
||||||
x_2
|
|
||||||
\end{pmatrix}
|
|
||||||
\]
|
|
||||||
Then,
|
|
||||||
\begin{align*}
|
|
||||||
Ax & = \begin{pmatrix}
|
|
||||||
a & b \\
|
|
||||||
c & d
|
|
||||||
\end{pmatrix}
|
|
||||||
\begin{pmatrix}
|
|
||||||
x_1 \\
|
|
||||||
x_2
|
|
||||||
\end{pmatrix} \\
|
|
||||||
& = \begin{pmatrix}
|
|
||||||
a x_1 + b x_2 \\
|
|
||||||
c x_1 + d x_2
|
|
||||||
\end{pmatrix}
|
|
||||||
\end{align*}
|
|
||||||
|
|
||||||
Similarly,
|
|
||||||
\begin{align*}
|
|
||||||
\begin{pmatrix}
|
|
||||||
a & b & c & d \\
|
|
||||||
e & f & g & h \\
|
|
||||||
i & j & k & l
|
|
||||||
\end{pmatrix}
|
|
||||||
\begin{pmatrix}
|
|
||||||
x_1 \\
|
|
||||||
x_2 \\
|
|
||||||
x_3 \\
|
|
||||||
x_4
|
|
||||||
\end{pmatrix}
|
|
||||||
=
|
|
||||||
\begin{pmatrix}
|
|
||||||
\luadirect{
|
|
||||||
local matrix_product = require("scripts.matrix_product")
|
|
||||||
local m1 = {
|
|
||||||
{"a", "b", "c", "d"},
|
|
||||||
{"e", "f", "g", "h"},
|
|
||||||
{"i", "j", "k", "l"}
|
|
||||||
}
|
|
||||||
local m2 = {
|
|
||||||
{"x_1"},
|
|
||||||
{"x_2"},
|
|
||||||
{"x_3"},
|
|
||||||
{"x_4"}
|
|
||||||
}
|
|
||||||
local product_matrix = matrix_product.matrix_product_repr(m1,m2)
|
|
||||||
local matrix_dump = matrix_product.dump_matrix(product_matrix)
|
|
||||||
tex.print(matrix_dump)
|
|
||||||
}
|
|
||||||
\end{pmatrix}
|
|
||||||
\end{align*}
|
|
||||||
\end{example}
|
\end{example}
|
||||||
|
|
||||||
The number of columns has to be the same as the dimension of the vector to which the matrix is applied.
|
\begin{remark}
|
||||||
|
A Poisson distribution can be viewed as an approximation of binomial distribution when $n$ is high and $p$ low.
|
||||||
|
\end{remark}
|
||||||
|
|
||||||
\begin{definition}[Tranpose of a Matrix]
|
|
||||||
Let $A = \begin{pmatrix}
|
|
||||||
a & b \\
|
|
||||||
c & d
|
|
||||||
\end{pmatrix}$, then $A^T = \begin{pmatrix}
|
|
||||||
a & c \\
|
|
||||||
b & d
|
|
||||||
\end{pmatrix}$
|
|
||||||
\end{definition}
|
|
||||||
|
|
||||||
\begin{figure}
|
We will consider the following relation:
|
||||||
\centering
|
\[
|
||||||
\includestandalone{figures/schemes/coordinates_systems}
|
\EE(Y_i) = g^{-1} X_i \beta,
|
||||||
\caption{Coordinate systems}
|
\]
|
||||||
\end{figure}
|
equivalently:
|
||||||
|
\[
|
||||||
|
g(\EE(Y_i)) = X_i \beta.
|
||||||
|
\]
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\item $\beta$ is estimated by the maximum likelihood;
|
||||||
|
\item $g$ is called the link function.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\begin{remark}
|
||||||
|
In the standard linear model, the OLS estimator is the maximum likelihood estimator.
|
||||||
|
\end{remark}
|
||||||
|
|
||||||
|
\section{Logistic Regression}
|
||||||
|
|
||||||
|
\begin{align*}
|
||||||
|
& \log\left(\frac{\Pi}{1 - \Pi}\right) = \X \beta \\
|
||||||
|
\Leftrightarrow & e^{\ln \frac{\Pi}{1 - \Pi}} = e^{\X \beta} \\
|
||||||
|
\Leftrightarrow & \frac{\Pi}{1 - \Pi} = e^{\X \beta} \\
|
||||||
|
\Leftrightarrow & \Pi = (1 - \Pi) e^{\X\beta} \\
|
||||||
|
\Leftrightarrow & \Pi = e^{\X \beta} - \Pi e^{\X\beta} \\
|
||||||
|
\Leftrightarrow & \Pi + \Pi e^{\X\beta} = e^{\X \beta} \\
|
||||||
|
\Leftrightarrow & \Pi (1 + e^{\X\beta}) = e^{\X \beta} \\
|
||||||
|
\Leftrightarrow & \Pi = \frac{e^{\X\beta}}{1 + e^{\X \beta}}
|
||||||
|
\end{align*}
|
||||||
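A hedged R sketch of a logistic regression (simulated data, hypothetical coefficients): glm() with the binomial family and logit link estimates $\beta$ by maximum likelihood.

set.seed(9)
n <- 200
x <- rnorm(n)
eta <- -1 + 2 * x
p <- exp(eta) / (1 + exp(eta))      # Pi = e^{X beta} / (1 + e^{X beta})
y <- rbinom(n, size = 1, prob = p)
fit <- glm(y ~ x, family = binomial(link = "logit"))
summary(fit)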
|
|
||||||
|
|
||||||
|
\section{Maximum Likelihood estimator}
|
||||||
|
|
||||||
|
Likelihood: the probability of observing the data we observed, under the model (in practice we work with its logarithm, the log-likelihood).
|
||||||
|
|
||||||
|
Estimate $\beta$ by $\hat{\beta}$ such that $\forall \beta \in \RR[p+1]$:
|
||||||
|
\[
|
||||||
|
L_n (\hat{\beta}) \geq L_n (\beta)
|
||||||
|
\]
|
||||||
|
|
||||||
|
These estimators are consistent, but not necessarily unbiased.
|
||||||
|
|
||||||
|
|
||||||
|
\section{Test for each single coordinate}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
\begin{example}[Payment Default]
|
||||||
|
Let $Y_i$ be the default value for individual $i$.
|
||||||
|
|
||||||
|
\[
|
||||||
|
\log (\frac{\Pi (X)}{1 - \Pi (X)}) = \beta_0 + \beta_1 \text{student} + \beta_2 \text{balance} + \beta_3 \text{income}
|
||||||
|
\]
|
||||||
|
|
||||||
|
In this example, only $\beta_0$ and $\beta_2$ are significantly different from 0.
|
||||||
|
\end{example}
|
||||||
|
|
||||||
|
\begin{remark}
|
||||||
|
We do not add $\varepsilon_i$, because $\log(\frac{\Pi (X)}{1 - \Pi (X)})$ corresponds to the expectation.
|
||||||
|
\end{remark}
|
||||||
|
|
||||||
|
\subsection{Comparison of nested models}
|
||||||
|
|
||||||
|
To test $H_0:\: \beta_1 = \ldots = \beta_p = 0$ (the null model with intercept only), we use the likelihood ratio test:
|
||||||
|
\[
|
||||||
|
T_n = -2 \log (\mathcal{L}^{\texttt{null}}) + 2 \log (\mathcal{L}(\hat{\beta})) \underset{H_0}{\overunderset{\mathcal{L}}{n \to \infty}{\longrightarrow}} \chi^2(p).
|
||||||
|
\]
|
||||||
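Reusing y and x from the logistic sketch above, anova() with test = "Chisq" performs this likelihood ratio test against the null (intercept-only) model:

fit_null <- glm(y ~ 1, family = binomial)
fit_full <- glm(y ~ x, family = binomial)
anova(fit_null, fit_full, test = "Chisq")   # T_n compared to a chi^2(p) distribution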
|
|
||||||
|
\begin{remark}[Family of Tests]
|
||||||
|
\begin{itemize}
|
||||||
|
\item Comparison of estimated values and values under the null hypothesis;
|
||||||
|
\item Likelihood ratio test;
|
||||||
|
\item Based on the slope on the derivative.
|
||||||
|
\end{itemize}
|
||||||
|
\end{remark}
|
||||||
|
|
||||||
|
\section{Relative risk}
|
||||||
|
|
||||||
|
The relative risk is the probability of having the disease conditional on the predictor value $X_{i_1}$, over the probability of having the disease conditional on the predictor value $X_{i_2}$.
|
||||||
|
|
||||||
|
\[
|
||||||
|
RR(j) = \frac{\Prob(Y_{i_1} = 1 \: | \: X_{i_1})}{\Prob(Y_{i_2} = 1 \: | \: X_{i_2})} = \frac{\EE(Y_{i_1})}{\EE(Y_{i_2})}.
|
||||||
|
\]
|
||||||
|
|
||||||
|
$\pi(X_i)$ is the probability of having the disease, according to $X_i$.
|
||||||
|
|
||||||
|
The relative risk can be written as\dots
|
||||||
|
|
||||||
|
\section{Odds}
|
||||||
|
|
||||||
|
Quantity providing a measure of the likelihood of a particular outcome:
|
||||||
|
\[
|
||||||
|
odd = \frac{\pi(X_i)}{1 - \pi(X_i)}
|
||||||
|
\]
|
||||||
|
|
||||||
|
\[
|
||||||
|
odds = \exp(X_i \beta)
|
||||||
|
\]
|
||||||
|
The odds is the ratio of the probability of having the disease (if $Y$ represents the disease) to the probability of not having it.
|
||||||
|
|
||||||
|
\section{Odds Ratio}
|
||||||
|
|
||||||
|
\begin{align*}
|
||||||
|
OR(j) = \frac{odds(X_{i_1})}{odds(X_{i_2})} & = \frac{\frac{\pi(X_{i_1})}{1 - \pi(X_{i_1})}}{\frac{\pi(X_{i_2})}{1 - \pi(X_{i_2})}}
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
The OR can be written as:
|
||||||
|
\[
|
||||||
|
OR(j) = \exp(\beta_j)
|
||||||
|
\]
|
||||||
|
|
||||||
|
\begin{exercise}
|
||||||
|
Show that $OR(j) = \exp(\beta_j)$.
|
||||||
|
\end{exercise}
|
||||||
|
|
||||||
|
\begin{align*}
|
||||||
|
OR(j) & = \frac{odds(X_{i_1})}{odds(X_{i_2})} \\
|
||||||
|
& = \frac{\exp(X_{i_1} \beta)}{\exp(X_{i_2} \beta)} \\
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
\[
|
||||||
|
\log \left(
|
||||||
|
\frac{\Prob(Y=1 \: |\: X_{i_1})}{1 - \Prob(Y=1 \: |\: X_{i_1})}\right)
|
||||||
|
= \beta_0 + \beta_1 X_1^{(1)} + \beta_2 X_2^{(1)} + \ldots + \beta_p X_p^{(1)}
|
||||||
|
\]
|
||||||
|
Similarly
|
||||||
|
\[
|
||||||
|
\log \left(
|
||||||
|
\frac{\Prob(Y=1 \: |\: X_{i_2})}{1 - \Prob(Y=1 \: |\: X_{i_2})}\right)
|
||||||
|
= \beta_0 + \beta_1 X_1^{(2)} + \beta_2 X_2^{(2)} + \ldots + \beta_p X_p^{(2)}
|
||||||
|
\]
|
||||||
|
We substract both equations:
|
||||||
|
|
||||||
|
\begin{align*}
|
||||||
|
&\log \left(
|
||||||
|
\frac{\Prob(Y=1 \: |\: X_{i_1})}{1 - \Prob(Y=1 \: |\: X_{i_1})} \right) - \log \left(\frac{\Prob(Y=1 \: |\: X_{i_2})}{1 - \Prob(Y=1 \: |\: X_{i_2})}\right) \\
|
||||||
|
& = \beta_0 + \beta_1 X_1^{(1)} + \beta_2 X_2^{(1)} + \ldots + \beta_p X_p^{(1)} - \left(\beta_0 + \beta_1 X_1^{(2)} + \beta_2 X_2^{(2)} + \ldots + \beta_p X_p^{(2)}\right) \\
|
||||||
|
& = \log OR(j) \\
|
||||||
|
& = \cancel{(\beta_0 - \beta_0)} + \beta_1 \cancel{(X_1^{(1)} - X_1^{(2)})} + \beta_2 \cancel{(X_2^{(1)} - X_2^{(2)})} + \ldots + \beta_j \cancelto{1}{(X_j^{(1)} - X_j^{(2)})} + \ldots + \beta_p \cancel{(X_p^{(1)} - X_p^{(2)})} \\
|
||||||
|
&\Leftrightarrow \log (OR_j) = \beta_j \\
|
||||||
|
&\Leftrightarrow OR(j) = \exp(\beta_j)
|
||||||
|
\end{align*}
|
||||||
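Continuing the same hypothetical logistic fit, the estimated odds ratios and Wald confidence intervals are obtained by exponentiating the coefficients:

exp(coef(fit_full))                  # OR(j) = exp(beta_j)
exp(confint.default(fit_full))       # Wald confidence intervals on the OR scale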
|
|
||||||
|
OR is not equal to RR, except in the particular case where the disease probability is very small (then $1 - \pi \approx 1$ and $OR \approx RR$).
|
||||||
|
|
||||||
|
If OR is significantly different from 1, the $\exp(\beta_j)$ is significantly different from 1, thus $\beta_j$ is significantly different from 0.
|
||||||
|
|
||||||
|
If a predictor has more than two classes, the difference $X_{i_1} - X_{i_2}$ is not directly interpretable. We have to take a reference class, and successively compare each class with this reference class.
|
||||||
|
|
||||||
|
$\hat{\pi}(X_{+}) = \widehat{\Prob}(Y = 1 \: | \: X_{+})$ is the predicted probability for a new individual with covariates $X_{+}$.
|
||||||
|
|
||||||
|
|
||||||
|
\section{Poisson model}
|
||||||
|
|
||||||
|
Let $Y_{i} \sim \mathcal{P}(\lambda_{i})$, corresponding to a count.
|
||||||
|
|
||||||
|
\begin{align*}
|
||||||
|
\EE(Y_{i}) & = g^{-1}(X_{i} \beta) \\
|
||||||
|
\Leftrightarrow g(\EE(Y_{i})) = X_{i} \beta
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
where $g(x) = \ln(x)$, and $g^{-1}(x) = e^{x}$.
|
||||||
|
|
||||||
|
\[
|
||||||
|
\lambda_{i} = \EE(Y_{i}) = \Var(Y_{i})
|
||||||
|
\]
|
||||||
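A minimal R sketch of a Poisson regression (simulated counts, hypothetical coefficients), using the canonical log link:

set.seed(12)
n <- 200
x <- runif(n)
lambda <- exp(0.5 + 1.5 * x)         # log E(Y_i) = X_i beta
y <- rpois(n, lambda)
fit_pois <- glm(y ~ x, family = poisson(link = "log"))
summary(fit_pois)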
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
\chapter{Tests Reminders}
|
||||||
|
|
||||||
|
\section{$\chi^2$ test of independence}
|
||||||
|
|
||||||
|
|
||||||
|
\section{$\chi^2$ test of goodness of fit}
|
||||||
|
|
||||||
|
Check whether the observations are consistent with a particular distribution.
|
||||||
|
|
||||||
|
\begin{example}[Mendel experiments]
|
||||||
|
Let $AB$, $Ab$, $aB$, $ab$ be the four possible genotypes of peas: colors and grain shape.
|
||||||
|
\begin{tabular}{cccc}
|
||||||
|
\toprule
|
||||||
|
AB & Ab & aB & ab \\
|
||||||
|
\midrule
|
||||||
|
315 & 108 & 101 & 32 \\
|
||||||
|
\bottomrule
|
||||||
|
\end{tabular}
|
||||||
|
\end{example}
|
||||||
|
|
||||||
|
The test statistics is:
|
||||||
|
\[
|
||||||
|
D_{k,n} = \sum_{i=1}^{k} \frac{(N_i - np_i)^2}{np_i} \underset{H_0}{\overunderset{\mathcal{L}}{n \to \infty}{\longrightarrow}} \chi^2_{k-1}
|
||||||
|
\]
|
||||||
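A hedged R sketch of this test on the counts above, assuming the classical Mendelian 9:3:3:1 expected ratio (this ratio is an assumption added here, not stated in the notes):

observed <- c(AB = 315, Ab = 108, aB = 101, ab = 32)
expected_p <- c(9, 3, 3, 1) / 16
chisq.test(observed, p = expected_p)   # D compared to chi^2 with k - 1 = 3 df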
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
\part{Linear Algebra}
|
||||||
|
|
|
@ -0,0 +1,220 @@
|
||||||
|
\chapter{Elements of Linear Algebra}
|
||||||
|
\label{ch:elements-of-linear-algebra}
|
||||||
|
|
||||||
|
\begin{remark}[vector]
|
||||||
|
Let $u$ a vector, we will use interchangeably the following notations: $u$ and $\vec{u}$
|
||||||
|
\end{remark}
|
||||||
|
|
||||||
|
Let $u = \begin{pmatrix}
|
||||||
|
u_1 \\
|
||||||
|
\vdots \\
|
||||||
|
u_n
|
||||||
|
\end{pmatrix}$ and $v = \begin{pmatrix}
|
||||||
|
v_1 \\
|
||||||
|
\vdots \\
|
||||||
|
v_n
|
||||||
|
\end{pmatrix}$
|
||||||
|
|
||||||
|
\begin{definition}[Scalar Product (Dot Product)]
|
||||||
|
\begin{align*}
|
||||||
|
\scalar{u, v} & = \begin{pmatrix}
|
||||||
|
u_1, \ldots, u_n
|
||||||
|
\end{pmatrix}
|
||||||
|
\begin{pmatrix}
|
||||||
|
v_1 \\
|
||||||
|
\vdots \\
|
||||||
|
v_n
|
||||||
|
\end{pmatrix} \\
|
||||||
|
& = u_1 v_1 + u_2 v_2 + \ldots + u_n v_n
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
We may use $\scalar{u, v}$ or $u \cdot v$ notations.
|
||||||
|
\end{definition}
|
||||||
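A quick R illustration of the definition (arbitrary example vectors):

u <- c(1, 2, 3)
v <- c(4, -1, 2)
sum(u * v)          # scalar product <u, v>
drop(t(u) %*% v)    # same value, as a matrix product
sqrt(sum(u * u))    # norm of u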
|
\paragraph{Dot product properties}
|
||||||
|
\begin{description}
|
||||||
|
\item[Commutative] $\scalar{u, v} = \scalar{v, u}$
|
||||||
|
\item[Distributive] $\scalar{(u+v), w} = \scalar{u, w} + \scalar{v, w}$
|
||||||
|
\item $\scalar{u, v} = \norm{u} \times \norm{v} \times \cos(\widehat{u, v})$
|
||||||
|
\item $\scalar{a, a} = \norm{a}^2$
|
||||||
|
\end{description}
|
||||||
|
|
||||||
|
\begin{definition}[Norm]
|
||||||
|
Length of the vector.
|
||||||
|
\[
|
||||||
|
\norm{u} = \sqrt{\scalar{u, u}}
|
||||||
|
\]
|
||||||
|
|
||||||
|
$\norm{u} \geq 0$
|
||||||
|
\end{definition}
|
||||||
|
|
||||||
|
\begin{definition}[Distance]
|
||||||
|
\[
|
||||||
|
dist(u, v) = \norm{u-v}
|
||||||
|
\]
|
||||||
|
\end{definition}
|
||||||
|
|
||||||
|
\begin{definition}[Orthogonality]
|
||||||
|
|
||||||
|
\end{definition}
|
||||||
|
|
||||||
|
\begin{remark}
|
||||||
|
\[
|
||||||
|
(dist(u, v))^2 = \norm{u - v}^2 = \scalar{v-u, v-u}
\]
|
||||||
|
\end{remark}
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics{figures/schemes/vector_orthogonality.pdf}
|
||||||
|
\caption{Scalar product of two orthogonal vectors.}
|
||||||
|
\label{fig:scheme-orthogonal-scalar-product}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\begin{align*}
|
||||||
|
\scalar{v-u, v-u} & = \scalar{v, v} + \scalar{u, u} - 2 \scalar{u, v} \\
|
||||||
|
& = \norm{v}^2 + \norm{u}^2 - 2 \scalar{u, v}
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
\begin{align*}
|
||||||
|
\norm{u - v}^2 & = \norm{u}^2 + \norm{v}^2 - 2 \scalar{u,v} \\
|
||||||
|
\norm{u + v}^2 & = \norm{u}^2 + \norm{v}^2 + 2 \scalar{u,v}
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
\begin{proposition}[Scalar product of orthogonal vectors]
|
||||||
|
\[
|
||||||
|
u \perp v \Leftrightarrow \scalar{u, v} = 0
|
||||||
|
\]
|
||||||
|
\end{proposition}
|
||||||
|
|
||||||
|
\begin{proof}[Indeed]
|
||||||
|
$\norm{u-v}^2 = \norm{u+v}^2$, as illustrated in \autoref{fig:scheme-orthogonal-scalar-product}.
|
||||||
|
\begin{align*}
|
||||||
|
\Leftrightarrow & -2 \scalar{u, v} = 2 \scalar{u, v} \\
|
||||||
|
\Leftrightarrow & 4 \scalar{u, v} = 0 \\
|
||||||
|
\Leftrightarrow & \scalar{u, v} = 0
|
||||||
|
\end{align*}
|
||||||
|
\end{proof}
|
||||||
|
|
||||||
|
\begin{theorem}[Pythagorean theorem]
|
||||||
|
If $u \perp v$, then $\norm{u+v}^2 = \norm{u}^2 + \norm{v}^2$ .
|
||||||
|
\end{theorem}
|
||||||
|
|
||||||
|
\begin{definition}[Orthogonal Projection]
|
||||||
|
|
||||||
|
\end{definition}
|
||||||
|
Let $y = \begin{pmatrix}
|
||||||
|
y_1 \\
|
||||||
|
. \\
|
||||||
|
y_n
|
||||||
|
\end{pmatrix} \in \RR[n]$ and $w$ a subspace of $\RR[n]$.
|
||||||
|
$y$ can be decomposed using its orthogonal projection on $w$:
|
||||||
|
\[
|
||||||
|
y = proj^w(y) + z,
|
||||||
|
\]
|
||||||
|
where
|
||||||
|
\[
|
||||||
|
\begin{cases}
|
||||||
|
z \in w^\perp \\
|
||||||
|
proj^w(y) \in w
|
||||||
|
\end{cases}
|
||||||
|
\]
|
||||||
|
There is only one such decomposition: $proj^w(y)$ and $z$ are unique.
|
||||||
|
|
||||||
|
The scalar product between $z$ and any vector of $w$ is zero.
|
||||||
|
|
||||||
|
\begin{property}
|
||||||
|
$proj^w(y)$ is the closest vector to $y$ that belongs to $w$.
|
||||||
|
\end{property}
|
||||||
|
|
||||||
|
\begin{definition}[Matrix]
|
||||||
|
A matrix represents a linear map, that is, a function that transforms a vector into another vector in a linear way.
|
||||||
|
\end{definition}
|
||||||
|
|
||||||
|
\begin{example}[Matrix application]
|
||||||
|
|
||||||
|
Let $A$ be a matrix:
|
||||||
|
\[
|
||||||
|
A = \begin{pmatrix}
|
||||||
|
a & b \\
|
||||||
|
c & d
|
||||||
|
\end{pmatrix}
|
||||||
|
\] and
|
||||||
|
\[
|
||||||
|
x = \begin{pmatrix}
|
||||||
|
x_1 \\
|
||||||
|
x_2
|
||||||
|
\end{pmatrix}
|
||||||
|
\]
|
||||||
|
Then,
|
||||||
|
\begin{align*}
|
||||||
|
Ax & = \begin{pmatrix}
|
||||||
|
a & b \\
|
||||||
|
c & d
|
||||||
|
\end{pmatrix}
|
||||||
|
\begin{pmatrix}
|
||||||
|
x_1 \\
|
||||||
|
x_2
|
||||||
|
\end{pmatrix} \\
|
||||||
|
& = \begin{pmatrix}
|
||||||
|
a x_1 + b x_2 \\
|
||||||
|
c x_1 + d x_2
|
||||||
|
\end{pmatrix}
|
||||||
|
\end{align*}
|
||||||
|
|
||||||
|
Similarly,
|
||||||
|
\begin{align*}
|
||||||
|
\begin{pmatrix}
|
||||||
|
a & b & c & d \\
|
||||||
|
e & f & g & h \\
|
||||||
|
i & j & k & l
|
||||||
|
\end{pmatrix}
|
||||||
|
\begin{pmatrix}
|
||||||
|
x_1 \\
|
||||||
|
x_2 \\
|
||||||
|
x_3 \\
|
||||||
|
x_4
|
||||||
|
\end{pmatrix}
|
||||||
|
=
|
||||||
|
\begin{pmatrix}
|
||||||
|
\luadirect{
|
||||||
|
local matrix_product = require("scripts.matrix_product")
|
||||||
|
local m1 = {
|
||||||
|
{"a", "b", "c", "d"},
|
||||||
|
{"e", "f", "g", "h"},
|
||||||
|
{"i", "j", "k", "l"}
|
||||||
|
}
|
||||||
|
local m2 = {
|
||||||
|
{"x_1"},
|
||||||
|
{"x_2"},
|
||||||
|
{"x_3"},
|
||||||
|
{"x_4"}
|
||||||
|
}
|
||||||
|
local product_matrix = matrix_product.matrix_product_repr(m1,m2)
|
||||||
|
local matrix_dump = matrix_product.dump_matrix(product_matrix)
|
||||||
|
tex.print(matrix_dump)
|
||||||
|
}
|
||||||
|
\end{pmatrix}
|
||||||
|
\end{align*}
|
||||||
|
\end{example}
|
||||||
|
|
||||||
|
The number of columns has to be the same as the dimension of the vector to which the matrix is applied.
|
||||||
|
|
||||||
|
\begin{definition}[Transpose of a Matrix]
|
||||||
|
Let $A = \begin{pmatrix}
|
||||||
|
a & b \\
|
||||||
|
c & d
|
||||||
|
\end{pmatrix}$, then $A^T = \begin{pmatrix}
|
||||||
|
a & c \\
|
||||||
|
b & d
|
||||||
|
\end{pmatrix}$
|
||||||
|
\end{definition}
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics{figures/schemes/coordinates_systems.pdf}
|
||||||
|
\caption{Coordinate systems}
|
||||||
|
\end{figure}
|
|
@ -22,4 +22,14 @@
|
||||||
thus we might consider genotype either as a qualitative variable or quantitative variable.
|
thus we might consider genotype either as a qualitative variable or quantitative variable.
|
||||||
\end{example}
|
\end{example}
|
||||||
|
|
||||||
When the variable are quantitative, we use regression, whereas for qualitative variables, we use an analysis of variance.
|
When the variables are quantitative, we use regression, whereas for qualitative variables, we use an analysis of variance.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\begin{subfigure}{0.45\columnwidth}
|
||||||
|
\includegraphics[width=\columnwidth]{figures/plots/linear_regression_linear.pdf}
|
||||||
|
\end{subfigure}
|
||||||
|
\begin{subfigure}{0.45\columnwidth}
|
||||||
|
\includegraphics[width=\columnwidth]{figures/plots/linear_regression_non_linear.pdf}
|
||||||
|
\end{subfigure}
|
||||||
|
\caption{Illustration of two models fitting observed values}
|
||||||
|
\end{figure}
|
|
@ -1,6 +1,10 @@
|
||||||
\DeclareMathOperator{\VVar}{\mathbb{V}} % variance
|
\DeclareMathOperator{\VVar}{\mathbb{V}} % variance
|
||||||
\DeclareMathOperator{\One}{\mathbf{1}}
|
\DeclareMathOperator{\One}{\mathbf{1}}
|
||||||
\DeclareMathOperator{\Cor}{\mathrm{Cor}}
|
\DeclareMathOperator{\Cor}{\mathrm{Cor}}
|
||||||
|
\DeclareMathOperator{\St}{\mathscr{St}}
|
||||||
\newcommand{\M}[1][]{\ensuremath{\ifstrempty{#1}{\mathcal{M}}{\mathbb{M}_{#1}}}}
|
\newcommand{\M}[1][]{\ensuremath{\ifstrempty{#1}{\mathcal{M}}{\mathbb{M}_{#1}}}}
|
||||||
\newcommand{\X}{\ensuremath{\mathbf{X}}}
|
\newcommand{\X}{\ensuremath{\mathbf{X}}}
|
||||||
\newcommand{\Y}{\ensuremath{\mathbf{Y}}}
|
\newcommand{\Y}{\ensuremath{\mathbf{Y}}}
|
||||||
|
\newcommand{\Z}{\ensuremath{\mathbf{Z}}}
|
||||||
|
\usepackage{unicode-math}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
# Plot an affine model
|
||||||
|
n <- 250
|
||||||
|
sd <- 2
|
||||||
|
epsilon <- rnorm(n, mean = 0, sd = sd)
|
||||||
|
beta0 <- 1.25
|
||||||
|
beta1 <- 4
|
||||||
|
linear_model <- function(x) {
|
||||||
|
return(beta0 + beta1*x)
|
||||||
|
}
|
||||||
|
x <- runif(n, min=0, max=1)
|
||||||
|
y <- linear_model(x) + epsilon
|
||||||
|
|
||||||
|
pdf("figures/plots/linear_regression_linear.pdf")
|
||||||
|
plot(x, y, col="#5654fa", type="p", pch=20, xlab="x", ylab="y")
|
||||||
|
abline(a = beta0, b = beta1, col="red")
|
||||||
|
dev.off()
|
||||||
|
|
||||||
|
|
||||||
|
non_linear_model <- function(x) {
|
||||||
|
return(beta0 + beta1 * exp(2*x))
|
||||||
|
}
|
||||||
|
non_linear_y <- non_linear_model(x) + epsilon
|
||||||
|
pdf("figures/plots/linear_regression_non_linear.pdf")
|
||||||
|
plot(x, non_linear_y, col="#5654fa", type="p", pch=20, xlab="x", ylab="z")
|
||||||
|
curve(non_linear_model, from=0, to=1, add=T, col="red")
|
||||||
|
dev.off()
|
|
@ -0,0 +1,23 @@
|
||||||
|
\documentclass[margin=0.5cm]{standalone}
|
||||||
|
\usepackage{tikz}
|
||||||
|
\usepackage{pgfplots}
|
||||||
|
\pgfplotsset{compat=1.18}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
\begin{tikzpicture}
|
||||||
|
\begin{axis}[
|
||||||
|
title={Logit function},
|
||||||
|
xlabel={$x$},
|
||||||
|
ylabel={$y$},
|
||||||
|
domain=-5:5,
|
||||||
|
samples=200,
|
||||||
|
legend style={at={(0.95,0.05)},anchor=south east}
|
||||||
|
]
|
||||||
|
\newcommand{\Lvar}{1}
|
||||||
|
\newcommand{\kvar}{1}
|
||||||
|
\newcommand{\xvar}{0}
|
||||||
|
\addplot [blue] {\Lvar / (1 + exp(-\kvar*(x-\xvar)))};
|
||||||
|
\addlegendentry{$L = \Lvar, k=\kvar, x_0=\xvar$};
|
||||||
|
\end{axis}
|
||||||
|
\end{tikzpicture}
|
||||||
|
\end{document}
|
|
@ -0,0 +1,3 @@
|
||||||
|
covariance.pdf filter=lfs diff=lfs merge=lfs -text
|
||||||
|
../plots/linear_regression_linear.pdf filter=lfs diff=lfs merge=lfs -text
|
||||||
|
../plots/linear_regression_non_linear.pdf filter=lfs diff=lfs merge=lfs -text
|
|
@ -0,0 +1,35 @@
|
||||||
|
% Scheme of Covariance
|
||||||
|
\documentclass[margin=0.5cm]{standalone}
|
||||||
|
\usepackage{tikz}
|
||||||
|
\usepackage{amssymb}
|
||||||
|
\begin{document}
|
||||||
|
\begin{tikzpicture}
|
||||||
|
\usetikzlibrary{positioning}
|
||||||
|
\tikzset{
|
||||||
|
point/.style = {circle, inner sep={.75\pgflinewidth}, opacity=1, draw, black, fill=black},
|
||||||
|
point name/.style = {insert path={coordinate (#1)}},
|
||||||
|
}
|
||||||
|
\begin{scope}[yshift=0]
|
||||||
|
\draw (-4, 0.5) -- (4,0.5) node[right] {$Y_i$};
|
||||||
|
\draw (-4, -0.5) -- (4,-0.5) node[right] {$Y_j$};
|
||||||
|
\node at (6, 0) {$\mathrm{Cov}(Y_i, Y_j) > 0$};
|
||||||
|
\node (EYipoint) at (0,0.5) {$\times$};
|
||||||
|
\node at (0, 1) {$\mathbb{E}(Y_i)$};
|
||||||
|
\node (EYipoint) at (0,-0.5) {$\times$};
|
||||||
|
\node at (0, -1) {$\mathbb{E}(Y_j)$};
|
||||||
|
|
||||||
|
\foreach \x in {-3, 0.5, 2.75} {
|
||||||
|
\node[point] at (\x, 0.5) {};
|
||||||
|
}
|
||||||
|
\foreach \x in {-2, -1, 3} {
|
||||||
|
\node[point] at (\x, -0.5) {};
|
||||||
|
}
|
||||||
|
\end{scope}
|
||||||
|
\begin{scope}[yshift=-100]
|
||||||
|
\draw (-4,0.5) -- (4,0.5) node[right] {$Y_i$};
|
||||||
|
\draw (-4,-0.5) -- (4,-0.5) node[right] {$Y_j$};
|
||||||
|
\node at (6, 0) {$\mathrm{Cov}(Y_i, Y_j) \approx 0$};
|
||||||
|
\end{scope}
|
||||||
|
|
||||||
|
\end{tikzpicture}
|
||||||
|
\end{document}
|
|
@ -3,4 +3,5 @@
|
||||||
\usepackage{standalone}
|
\usepackage{standalone}
|
||||||
\usepackage{tikz-3dplot}
|
\usepackage{tikz-3dplot}
|
||||||
\usepackage{tkz-euclide}
|
\usepackage{tkz-euclide}
|
||||||
\usepackage{nicematrix}
|
\usepackage{nicematrix}
|
||||||
|
\usepackage{luacode}
|