From 29dad16dfb895fe85cc6b09246f12680d034b0c5 Mon Sep 17 00:00:00 2001 From: Samuel Ortion Date: Fri, 13 Oct 2023 13:19:12 +0200 Subject: [PATCH] Moved part on linear algebra Add stuff on model validity --- .gitattributes | 7 +- content/chapters/include.tex | 2 +- content/chapters/part1/1.tex | 313 +++++++++++++++++- content/chapters/part1/2.tex | 220 +----------- content/chapters/part1/3.tex | 0 content/chapters/part2/0.tex | 2 + content/chapters/part2/1.tex | 220 ++++++++++++ content/introduction.tex | 12 +- definitions.tex | 4 + figures/plots/linear_regression.R | 26 ++ figures/plots/linear_regression_linear.pdf | 3 + .../plots/linear_regression_non_linear.pdf | 3 + figures/schemes/.gitattributes | 3 + figures/schemes/covariance.pdf | 3 + figures/schemes/covariance.tex | 35 ++ main.pdf | 4 +- 16 files changed, 627 insertions(+), 230 deletions(-) delete mode 100644 content/chapters/part1/3.tex create mode 100644 content/chapters/part2/0.tex create mode 100644 content/chapters/part2/1.tex create mode 100644 figures/plots/linear_regression.R create mode 100644 figures/plots/linear_regression_linear.pdf create mode 100644 figures/plots/linear_regression_non_linear.pdf create mode 100644 figures/schemes/.gitattributes create mode 100644 figures/schemes/covariance.pdf create mode 100644 figures/schemes/covariance.tex diff --git a/.gitattributes b/.gitattributes index b4a264a..59f2077 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,7 +1,2 @@ main.pdf filter=lfs diff=lfs merge=lfs -text -figures/schemes/regression_plan_3D.pdf filter=lfs diff=lfs merge=lfs -text -figures/schemes/vector_orthogonality.pdf filter=lfs diff=lfs merge=lfs -text -figures/schemes/base_plan.pdf filter=lfs diff=lfs merge=lfs -text -figures/schemes/coordinates_systems.pdf filter=lfs diff=lfs merge=lfs -text -figures/schemes/ordinary_least_squares.pdf filter=lfs diff=lfs merge=lfs -text -figures/schemes/orthogonal_projection.pdf filter=lfs diff=lfs merge=lfs -text +**/*.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/content/chapters/include.tex b/content/chapters/include.tex index 7922d7d..7d1c4c3 100755 --- a/content/chapters/include.tex +++ b/content/chapters/include.tex @@ -13,6 +13,6 @@ \includechapters{part1}{2} -% \includechapters{part2}{2} +\includechapters{part2}{2} % \includechapters{part3}{1} \ No newline at end of file diff --git a/content/chapters/part1/1.tex b/content/chapters/part1/1.tex index 8219935..7b29d3c 100644 --- a/content/chapters/part1/1.tex +++ b/content/chapters/part1/1.tex @@ -117,7 +117,7 @@ We want to minimize the distance between $\X\beta$ and $\Y$: \Rightarrow& \X \beta = proj^{(1, \X)} \Y\\ \Rightarrow& \forall v \in w,\, vy = v proj^w(y)\\ \Rightarrow& \forall i: \\ - & \X_i \Y = \X_i X\hat{\beta} \qquad \text{where $\hat{\beta}$ is the estimator of $\beta$} \\ + & \X_i \Y = \X_i \X\hat{\beta} \qquad \text{where $\hat{\beta}$ is the estimator of $\beta$} \\ \Rightarrow& \X^T \Y = \X^T \X \hat{\beta} \\ \Rightarrow& {\color{gray}(\X^T \X)^{-1}} \X^T \Y = {\color{gray}(\X^T \X)^{-1}} (\X^T\X) \hat{\beta} \\ \Rightarrow& \hat{\beta} = (\X^T\X)^{-1} \X^T \Y @@ -127,7 +127,7 @@ This formula comes from the orthogonal projection of $\Y$ on the vector subspace $\X \hat{\beta}$ is the closest point to $\Y$ in the subspace generated by $\X$. -If $H$ is the projection matrix of the subspace generated by $\X$, $X\Y$ is the projection on $\Y$ on this subspace, that corresponds to $\X\hat{\beta}$. 
+If $H$ is the projection matrix of the subspace generated by $\X$, $\X\Y$ is the projection on $\Y$ on this subspace, that corresponds to $\X\hat{\beta}$. \section{Sum of squares} @@ -253,6 +253,7 @@ Covariance is really sensitive to scale of variables. For instance, if we measur \begin{theorem}[Cochran Theorem (Consequence)] + \label{thm:cochran} Let $\mathbf{Z}$ be a gaussian vector: $\mathbf{Z} \sim \Norm_n(0_n, I_n)$. \begin{itemize} @@ -263,8 +264,29 @@ Covariance is really sensitive to scale of variables. For instance, if we measur \item If $Z_1, Z_2$ are orthogonal of $\mathbf{Z}$ on $V_1$ and $V_2$ i.e. $Z_1 = \Pi_{V_1}(\mathbf{Z}) = \Pi_1 \Y$ and $Z_2 = \Pi_{V_2} (\mathbf{Z}) = \Pi_2 \Y$... (\textcolor{red}{look to the slides}) \end{itemize} + + $Z_2 = \Pi_{V_1}(\Z)$ is the projection of $\Z$ on subspace $V_1$. + + \dots + + \end{theorem} +\begin{property}[Estimators properties in the linear model] + According to \autoref{thm:cochran}, + \[ + \hat{m} \text{ is independent from $\hat{\sigma}^2$} + \]\dots + \[ + \frac{\norm{\Y - \Pi_V(\Y)}^2}{...} \sim + \] + + $\hat{m} = \X \hat{\beta}$ + + $\hat{m}$ is the estimation of the mean. +\end{property} + + \begin{definition}[Chi 2 distribution] If $X_1, \ldots, X_n$ i.i.d. $\sim \Norm(0, 1)$, then;, \[ @@ -318,3 +340,290 @@ We can derive statistical test from these properties. \] where + +\paragraph{Estimation of $\sigma^2$} + +A biased estimator of $\sigma^2$ is: +\[ + \hat{\sigma^2} = ? +\] + +$S^2$ is the unbiased estimator of $\sigma^2$ +\begin{align*} + S^2 &= \frac{1}{n-q} \norm{\Y - \Pi_V(\Y)}^2 \\ + &= \frac{1}{n-q} \sum_{i=1}^n (Y_i - (\X\hat{\beta})_i)^2 +\end{align*} + +\begin{remark}[On $\hat{m}$] + \begin{align*} + &\Y = \X \beta + \varepsilon + \Leftrightarrow& \EE(\Y) = \X \beta + \end{align*} +\end{remark} + +\section{Student test of nullity of a parameter} + +Let $\beta_j$ be a parameter, the tested hypotheses are as follows: +\[ + \begin{cases} + (H_0): \beta_j = 0 \\ + (H_1): \beta_j \neq 0 + \end{cases} +\] + +Under the null hypothesis: +\[ + \frac{\hat{\beta}_j - \beta_j}{S \sqrt{(\X^T \X)^1_{j,j}}} \sim \St(n-q). +\] +The test statistic is: +\[ + W_n = \frac{\hat{\beta}_j}{S \sqrt{(\X^T\X)^{-1}_{j,j}}} \underset{H_0}{\sim} \St(n-q). +\] + +$\hat{\beta}$ is a multinormal vector. + +Let's consider a vector of 4 values: +\begin{align*} + \begin{pmatrix} + \hat{\beta}_0 \\ + \hat{\beta}_1 \\ + \hat{\beta}_2 \\ + \hat{\beta}_3 + \end{pmatrix} + \sim \Norm_4 \left( \begin{pmatrix} + \beta_0 \\ + \beta_1 \\ + \beta_2 \\ + \beta_3 + \end{pmatrix} ; + \sigma^2 \left(\X^T \X\right)^{-1} + \right) +\end{align*} + +Let $\M$ be the following model +\begin{align*} + Y_i &= \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \beta_3 X_{3i} + \varepsilon_i +\end{align*} + +Why can't we use the following model to test each of the parameters values (here for $X_2$)? +\[ + Y_i = \theta_0 + \theta_1 X_{2i} + \varepsilon_i +\] +We can't use such a model, we would probably meet a confounding factor: even if we are only interested in relationship $X_2$ with $Y$, we have to fit the whole model. + +\begin{example}[Confounding parameter] + Let $Y$ be a variable related to the lung cancer. Let $X_1$ be the smoking status, and $X_2$ the variable `alcohol' (for instance the quantity of alcohol drunk per week). + + If we only fit the model $\M: Y_i = \theta_0 + \theta_1 X_{2i} + \varepsilon_i$, we could conclude for a relationship between alcohol and lung cancer, because alcohol consumption and smoking is strongly related. 
If we had fit the model $\M: Y_i = \theta_0 + \theta_1 X_{1i} + \theta_2 X_{2i} + \varepsilon_i$, we would likely have found no significant relationship between $X_2$ and $Y$.
+\end{example}
+
+\begin{definition}[Student law]
+    Let $X$ and $Y$ be two independent random variables ($X \indep Y$) such that $X \sim \Norm(0, 1)$ and $Y \sim \chi_n^2$, then
+    \[
+        \frac{X}{\sqrt{Y/n}} \sim \St(n)
+    \]
+\end{definition}
+
+\subsection{Model comparison}
+
+\begin{definition}[Nested models]
+    A model $\M_{q'}$ is nested in a model $\M_q$ if $\M_{q'}$ can be obtained from $\M_q$ by setting some of its parameters to zero, i.e. if the subspace generated by the predictors of $\M_{q'}$ is included in the subspace generated by the predictors of $\M_q$.
+\end{definition}
+
+Let $\M_2$ and $\M_4$ be two models:
+
+$\M_2: Y_i = \beta_0 + \beta_3 X_{3i} + \varepsilon_i$
+
+$\M_4: Y_i = \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \beta_3 X_{3i} + \varepsilon_i$
+
+$\M_2$ is nested in $\M_4$.
+
+\paragraph*{Principle} We compare the residual variances of the two models, that is, the variance that is not explained by each model.
+
+The better the model, the smaller the residual variance.
+
+If everything were explained by the model, the residual variance would be zero.
+
+
+Here $\M_4$ holds all the information found in $\M_2$ plus additional information: in the worst case it is at least as good as $\M_2$.
+
+\subsection{Fisher $F$-test of model comparison}
+
+Let $\M_q$ and $\M_{q'}$ be two models such that $\dim(\M_q) = q$, $\dim(\M_{q'}) = q'$, $q > q'$ and $\M_{q'}$ is nested in $\M_q$.
+
+\paragraph{Tested hypotheses}
+\[
+\begin{cases}
+    (H_0): \M_{q'} \text{ is the proper model} \\
+    (H_1): \M_q \text{ is a better model}
+\end{cases}
+\]
+
+\begin{description}
+    \item[ESS] Explained (extra) Sum of Squares
+    \item[RSS] Residual Sum of Squares
+    \item[EMS] Explained Mean Square
+    \item[RMS] Residual Mean Square
+\end{description}
+
+\[
+    ESS = RSS(\M_{q'}) - RSS(\M_q)
+\]
+\[
+    RSS(\M) = \norm{\Y - \X\hat{\beta}}^2 = \sum_{i=1}^n \hat{\varepsilon}_i^2
+\]
+\[
+    EMS = \frac{ESS}{q - q'}
+\]
+\[
+    RMS = \frac{RSS(\M_q)}{n-q}
+\]
+
+Under the null hypothesis:
+\[
+    F = \frac{EMS}{RMS} \underset{H_0}{\sim} \Fish(q-q'; n-q)
+\]
+
+\section{Model validity}
+
+Assumptions:
+\begin{itemize}
+    \item $\X$ is a full rank matrix;
+    \item Residuals are i.i.d. $\varepsilon \sim \Norm(0_n, \sigma^2 \mathcal{I}_n)$.
+\end{itemize}
+
+We also have to look for influential observations.
+
+
+\subsection{$\X$ is full rank}
+
+To check that the rank of the matrix is $p+1$, we can compute the eigenvalues of the correlation matrix of $\X$. If there is a perfect linear relationship between two variables (two columns of $\X$), one of the eigenvalues will be zero. In practice, we never get an exactly zero eigenvalue, so we consider the condition index $\kappa = \frac{\lambda_1}{\lambda_p}$, the ratio between the largest and the smallest eigenvalues, with $\lambda_1 \geq \lambda_2 \geq \ldots \geq \lambda_p$ the eigenvalues: a large condition index indicates near-collinearity.
+
+
+If all eigenvalues are different from 0, $\X^T \X$ can be inverted, but when some of them are close to 0 the variance of the estimated parameters will be large, so the estimation of the parameters will not be reliable.
+
+\paragraph{Variance Inflation Factor}
+
+Perform a regression of each of the predictors against the other predictors.
+
+If there is a strong linear relationship between one predictor and the others, the coefficient of determination $R^2_j$ of this auxiliary regression (the proportion of variance explained by the other predictors) will be close to 1.
+
+We do this for every predictor; for predictor $j = 1, \ldots, p$, the variance inflation factor is:
+\[
+    VIF_j = \frac{1}{1-R^2_j}.
+\]
+
+\subparagraph*{Rule}
+If $VIF_j > 10$ (some references use a higher threshold, e.g. $100$), multicollinearity is suspected\dots
+
+
+In case of multicollinearity, we have to remove the variables one by one until there is no longer multicollinearity.
+Variables have to be removed based on statistical results and through discussion with the experimenters.
+
+
+\subsection{Residuals analysis}
+
+\paragraph*{Assumption}
+\[
+    \varepsilon \sim \Norm_n(0_n, \sigma^2 I_n)
+\]
+
+\paragraph{Normality of the residuals} If $\varepsilon_i$ ($i=1, \ldots, n$) could be observed, we could build a QQ-plot of $\varepsilon_i / \sigma$ against the quantiles of $\Norm(0, 1)$.
+
+Only the residual errors $\hat{e}_i$ can be observed.
+
+Let $e_i^*$ be the studentized residual, considered as an estimator of $\varepsilon_i$:
+
+\[
+    e_i^* = \frac{\hat{e}_i}{\sqrt{\hat{\sigma}^2_{(i)} (1-H_{ii})}}
+\]
+
+\begin{align*}
+    \hat{\Y} &= \X \hat{\beta} \\
+    &= \X \left( (\X^T\X)^{-1} \X^T \Y\right) \\
+    &= \underbrace{\X (\X^T\X)^{-1} \X^T}_{H} \Y
+\end{align*}
+
+\paragraph{Centered residuals} If $(1, \ldots, 1)^T$ belongs to the subspace generated by $\X$, the residuals are centered by construction.
+
+\paragraph{Independence} We do not have a statistical test for independence in R; we plot the residuals $e$ against the fitted values $\X \hat{\beta}$.
+
+\paragraph{Homoscedasticity} Plot $\sqrt{|e^*|}$ against the fitted values $\X \hat{\beta}$.
+
+
+\paragraph{Influential observations}
+
+We make the distinction between observations:
+\begin{itemize}
+    \item with a residual that is too large
+    $\rightarrow$ influence on the estimation of $\sigma^2$;
+    \item which are too isolated
+    $\rightarrow$ influence on the estimation of $\beta$.
+\end{itemize}
+
+\[
+    e_i^* \sim \St(n-p-1)
+\]
+\subparagraph*{Rule} We consider an observation to be aberrant if:
+\[
+    |e_i^*| > \F^{-1}_{\St(n-p-1)}(1-\alpha),
+\]
+the quantile of order $1-\alpha$, with $\alpha$ often set to $1/n$; alternatively the threshold is set to 2.
+
+\paragraph{Leverage} The leverage of observation $i$ is the diagonal term $H_{ii}$ of the orthogonal projection matrix $H$.
+
+\begin{property}
+    \begin{itemize}
+        \item $0 \leq H_{ii} \leq 1$
+        \item $\sum_i H_{ii} = p$
+    \end{itemize}
+\end{property}
+
+\subparagraph*{Rule} We consider that an observation is too isolated (influential) if its leverage $H_{ii}$ is ??.
+
+
+\paragraph{Non-linearity}
+
+
+\section{Model Selection}
+
+We want to select the best model with the smallest number of predictors.
+
+When models have too many explanatory variables, the power of the statistical tests decreases.
+
+Different methods:
+\begin{itemize}
+    \item comparison of nested models;
+    \item information criteria;
+    \item methods based on the prediction error.
+\end{itemize}
+
+\subsection{Information criteria}
+
+\subsubsection{Likelihood}
+
+\begin{definition}[Likelihood]
+    The probability of observing the data at hand under a particular model.
+    \[
+        L_n (\M(k))
+    \]
+\end{definition}
+
+
+\begin{definition}[Akaike Information Criterion]
+    \[
+        AIC(\M(k)) = -2 \log L_n (\M(k)) + 2k.
+    \]
+
+    $2k$ is a penalty that favours the smallest models.
+\end{definition}
+
+\begin{definition}[Bayesian Information Criterion]
+    \[
+        BIC(\M(k)) = -2 \log L_n (\M(k)) + \log(n) k.
+    \]
+    $\log(n) k$ is a penalty.
+\end{definition}
+
+Since $\log(n) > 2$ as soon as $n \geq 8$, $AIC$ usually has a smaller penalty than $BIC$; the $AIC$ criterion therefore tends to select models with more variables than the $BIC$ criterion.
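+
+\paragraph*{Illustration in R} As a minimal sketch of the tools above (the data are simulated and the object names \texttt{X1}, \texttt{X2}, \texttt{X3}, \texttt{full}, \texttt{reduced} are purely illustrative, not taken from the course), the Student tests, the $F$-test of nested models, a hand-made $VIF$, the residual diagnostics and the information criteria can all be obtained with base \texttt{R}:
+
+\begin{verbatim}
+set.seed(1)
+n  <- 100
+X1 <- rnorm(n); X2 <- rnorm(n); X3 <- rnorm(n)
+Y  <- 1 + 2 * X1 + 0.5 * X3 + rnorm(n)     # X2 has no effect here
+
+full    <- lm(Y ~ X1 + X2 + X3)            # model M_q
+reduced <- lm(Y ~ X3)                      # model M_q' nested in M_q
+
+summary(full)         # Student t-test of nullity for each beta_j
+anova(reduced, full)  # Fisher F-test comparing the nested models
+
+# Variance inflation factor of X1, computed by hand
+r2_1  <- summary(lm(X1 ~ X2 + X3))$r.squared
+vif_1 <- 1 / (1 - r2_1)
+
+# Residual diagnostics
+e_star <- rstudent(full)    # externally studentized residuals
+h      <- hatvalues(full)   # leverages: diagonal of the projection matrix H
+
+# Information criteria
+AIC(reduced, full)
+BIC(reduced, full)
+\end{verbatim}
+
+In practice, the \texttt{vif} function from the \texttt{car} package computes these factors directly.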
+ diff --git a/content/chapters/part1/2.tex b/content/chapters/part1/2.tex index 0e5f075..532bfb7 100644 --- a/content/chapters/part1/2.tex +++ b/content/chapters/part1/2.tex @@ -1,220 +1,4 @@ -\chapter{Elements of Linear Algebra} -\label{ch:elements-of-linear-algebra} +\chapter{Generalized Linear Model} -\begin{remark}[vector] - Let $u$ a vector, we will use interchangeably the following notations: $u$ and $\vec{u}$ -\end{remark} +\section{Logistic Regression} -Let $u = \begin{pmatrix} - u_1 \\ - \vdots \\ - u_n - \end{pmatrix}$ and $v = \begin{pmatrix} - v_1 \\ - \vdots \\ - v_n - \end{pmatrix}$ - -\begin{definition}[Scalar Product (Dot Product)] - \begin{align*} - \scalar{u, v} & = \begin{pmatrix} - u_1, \ldots, u_v - \end{pmatrix} - \begin{pmatrix} - v_1 \\ - \vdots \\ - v_n - \end{pmatrix} \\ - & = u_1 v_1 + u_2 v_2 + \ldots + u_n v_n - \end{align*} - - We may use $\scalar{u, v}$ or $u \cdot v$ notations. -\end{definition} -\paragraph{Dot product properties} -\begin{description} - \item[Commutative] $\scalar{u, v} = \scalar{v, u}$ - \item[Distributive] $\scalar{(u+v), w} = \scalar{u, w} + \scalar{v, w}$ - \item $\scalar{u, v} = \norm{u} \times \norm{v} \times \cos(\widehat{u, v})$ - \item $\scalar{a, a} = \norm{a}^2$ -\end{description} - -\begin{definition}[Norm] - Length of the vector. - \[ - \norm{u} = \sqrt{\scalar{u, v}} - \] - - $\norm{u, v} > 0$ -\end{definition} - -\begin{definition}[Distance] - \[ - dist(u, v) = \norm{u-v} - \] -\end{definition} - -\begin{definition}[Orthogonality] - -\end{definition} - -\begin{remark} - \[ - (dist(u, v))^2 = \norm{u - v}^2, - \] and - \[ - \scalar{v-u, v-u} - \] -\end{remark} - -\begin{figure} - \centering - \includestandalone{figures/schemes/vector_orthogonality} - \caption{Scalar product of two orthogonal vectors.} - \label{fig:scheme-orthogonal-scalar-product} -\end{figure} - -\begin{align*} - \scalar{v-u, v-u} & = \scalar{v, v} + \scalar{u, u} - 2 \scalar{u, v} \\ - & = \norm{v}^2 + \norm{u}^2 \\ - & = -2 \scalar{u, v} -\end{align*} - -\begin{align*} - \norm{u - v}^2 & = \norm{u}^2 + \norm{v}^2 - 2 \scalar{u,v} \\ - \norm{u + v}^2 & = \norm{u}^2 + \norm{v}^2 + 2 \scalar{u,v} -\end{align*} - -\begin{proposition}[Scalar product of orthogonal vectors] -\[ - u \perp v \Leftrightarrow \scalar{u, v} = 0 -\] -\end{proposition} - -\begin{proof}[Indeed] - $\norm{u-v}^2 = \norm{u+v}^2$, as illustrated in \autoref{fig:scheme-orthogonal-scalar-product}. - \begin{align*} - \Leftrightarrow & -2 \scalar{u, v} = 2 \scalar{u, v} \\ - \Leftrightarrow & 4 \scalar{u, v} = 0 \\ - \Leftrightarrow & \scalar{u, v} = 0 - \end{align*} -\end{proof} - -\begin{theorem}[Pythagorean theorem] - If $u \perp v$, then $\norm{u+v}^2 = \norm{u}^2 + \norm{v}^2$ . -\end{theorem} - -\begin{definition}[Orthogonal Projection] - -\end{definition} -Let $y = \begin{pmatrix} - y_1 \\ - . \\ - y_n - \end{pmatrix} \in \RR[n]$ and $w$ a subspace of $\RR[n]$. -$\mathcal{Y}$ can be written as the orthogonal projection of $y$ on $w$: -\[ - \mathcal{Y} = proj^w(y) + z, -\] -where -\[ - \begin{cases} - z \in w^\perp \\ - proj^w(y) \in w - \end{cases} -\] -There is only one vector $\mathcal{Y}$ that ? - -The scalar product between $z$ and (?) is zero. - -\begin{property} - $proj^w(y)$ is the closest vector to $y$ that belongs to $w$. -\end{property} - -\begin{definition}[Matrix] - A matrix is an application, that is, a function that transform a thing into another, it is a linear function. 
-\end{definition} - -\begin{example}[Matrix application] - - Let $A$ be a matrix: - \[ - A = \begin{pmatrix} - a & b \\ - c & d - \end{pmatrix} - \] and - \[ - x = \begin{pmatrix} - x_1 \\ - x_2 - \end{pmatrix} - \] - Then, - \begin{align*} - Ax & = \begin{pmatrix} - a & b \\ - c & d - \end{pmatrix} - \begin{pmatrix} - x_1 \\ - x_2 - \end{pmatrix} \\ - & = \begin{pmatrix} - a x_1 + b x_2 \\ - c x_1 + d x_2 - \end{pmatrix} - \end{align*} - - Similarly, - \begin{align*} - \begin{pmatrix} - a & b & c & d \\ - e & f & g & h \\ - i & j & k & l - \end{pmatrix} - \begin{pmatrix} - x_1 \\ - x_2 \\ - x_3 \\ - x_4 - \end{pmatrix} - = - \begin{pmatrix} - \luadirect{ - local matrix_product = require("scripts.matrix_product") - local m1 = { - {"a", "b", "c", "d"}, - {"e", "f", "g", "h"}, - {"i", "j", "k", "l"} - } - local m2 = { - {"x_1"}, - {"x_2"}, - {"x_3"}, - {"x_4"} - } - local product_matrix = matrix_product.matrix_product_repr(m1,m2) - local matrix_dump = matrix_product.dump_matrix(product_matrix) - tex.print(matrix_dump) - } - \end{pmatrix} - \end{align*} -\end{example} - -The number of columns has to be the same as the dimension of the vector to which the matrix is applied. - -\begin{definition}[Tranpose of a Matrix] - Let $A = \begin{pmatrix} - a & b \\ - c & d - \end{pmatrix}$, then $A^T = \begin{pmatrix} - a & c \\ - b & d - \end{pmatrix}$ -\end{definition} - -\begin{figure} - \centering - \includestandalone{figures/schemes/coordinates_systems} - \caption{Coordinate systems} -\end{figure} diff --git a/content/chapters/part1/3.tex b/content/chapters/part1/3.tex deleted file mode 100644 index e69de29..0000000 diff --git a/content/chapters/part2/0.tex b/content/chapters/part2/0.tex new file mode 100644 index 0000000..fbfa3b9 --- /dev/null +++ b/content/chapters/part2/0.tex @@ -0,0 +1,2 @@ +\part{Linear Algebra} + diff --git a/content/chapters/part2/1.tex b/content/chapters/part2/1.tex new file mode 100644 index 0000000..0e5f075 --- /dev/null +++ b/content/chapters/part2/1.tex @@ -0,0 +1,220 @@ +\chapter{Elements of Linear Algebra} +\label{ch:elements-of-linear-algebra} + +\begin{remark}[vector] + Let $u$ a vector, we will use interchangeably the following notations: $u$ and $\vec{u}$ +\end{remark} + +Let $u = \begin{pmatrix} + u_1 \\ + \vdots \\ + u_n + \end{pmatrix}$ and $v = \begin{pmatrix} + v_1 \\ + \vdots \\ + v_n + \end{pmatrix}$ + +\begin{definition}[Scalar Product (Dot Product)] + \begin{align*} + \scalar{u, v} & = \begin{pmatrix} + u_1, \ldots, u_v + \end{pmatrix} + \begin{pmatrix} + v_1 \\ + \vdots \\ + v_n + \end{pmatrix} \\ + & = u_1 v_1 + u_2 v_2 + \ldots + u_n v_n + \end{align*} + + We may use $\scalar{u, v}$ or $u \cdot v$ notations. +\end{definition} +\paragraph{Dot product properties} +\begin{description} + \item[Commutative] $\scalar{u, v} = \scalar{v, u}$ + \item[Distributive] $\scalar{(u+v), w} = \scalar{u, w} + \scalar{v, w}$ + \item $\scalar{u, v} = \norm{u} \times \norm{v} \times \cos(\widehat{u, v})$ + \item $\scalar{a, a} = \norm{a}^2$ +\end{description} + +\begin{definition}[Norm] + Length of the vector. 
+ \[ + \norm{u} = \sqrt{\scalar{u, v}} + \] + + $\norm{u, v} > 0$ +\end{definition} + +\begin{definition}[Distance] + \[ + dist(u, v) = \norm{u-v} + \] +\end{definition} + +\begin{definition}[Orthogonality] + +\end{definition} + +\begin{remark} + \[ + (dist(u, v))^2 = \norm{u - v}^2, + \] and + \[ + \scalar{v-u, v-u} + \] +\end{remark} + +\begin{figure} + \centering + \includestandalone{figures/schemes/vector_orthogonality} + \caption{Scalar product of two orthogonal vectors.} + \label{fig:scheme-orthogonal-scalar-product} +\end{figure} + +\begin{align*} + \scalar{v-u, v-u} & = \scalar{v, v} + \scalar{u, u} - 2 \scalar{u, v} \\ + & = \norm{v}^2 + \norm{u}^2 \\ + & = -2 \scalar{u, v} +\end{align*} + +\begin{align*} + \norm{u - v}^2 & = \norm{u}^2 + \norm{v}^2 - 2 \scalar{u,v} \\ + \norm{u + v}^2 & = \norm{u}^2 + \norm{v}^2 + 2 \scalar{u,v} +\end{align*} + +\begin{proposition}[Scalar product of orthogonal vectors] +\[ + u \perp v \Leftrightarrow \scalar{u, v} = 0 +\] +\end{proposition} + +\begin{proof}[Indeed] + $\norm{u-v}^2 = \norm{u+v}^2$, as illustrated in \autoref{fig:scheme-orthogonal-scalar-product}. + \begin{align*} + \Leftrightarrow & -2 \scalar{u, v} = 2 \scalar{u, v} \\ + \Leftrightarrow & 4 \scalar{u, v} = 0 \\ + \Leftrightarrow & \scalar{u, v} = 0 + \end{align*} +\end{proof} + +\begin{theorem}[Pythagorean theorem] + If $u \perp v$, then $\norm{u+v}^2 = \norm{u}^2 + \norm{v}^2$ . +\end{theorem} + +\begin{definition}[Orthogonal Projection] + +\end{definition} +Let $y = \begin{pmatrix} + y_1 \\ + . \\ + y_n + \end{pmatrix} \in \RR[n]$ and $w$ a subspace of $\RR[n]$. +$\mathcal{Y}$ can be written as the orthogonal projection of $y$ on $w$: +\[ + \mathcal{Y} = proj^w(y) + z, +\] +where +\[ + \begin{cases} + z \in w^\perp \\ + proj^w(y) \in w + \end{cases} +\] +There is only one vector $\mathcal{Y}$ that ? + +The scalar product between $z$ and (?) is zero. + +\begin{property} + $proj^w(y)$ is the closest vector to $y$ that belongs to $w$. +\end{property} + +\begin{definition}[Matrix] + A matrix is an application, that is, a function that transform a thing into another, it is a linear function. +\end{definition} + +\begin{example}[Matrix application] + + Let $A$ be a matrix: + \[ + A = \begin{pmatrix} + a & b \\ + c & d + \end{pmatrix} + \] and + \[ + x = \begin{pmatrix} + x_1 \\ + x_2 + \end{pmatrix} + \] + Then, + \begin{align*} + Ax & = \begin{pmatrix} + a & b \\ + c & d + \end{pmatrix} + \begin{pmatrix} + x_1 \\ + x_2 + \end{pmatrix} \\ + & = \begin{pmatrix} + a x_1 + b x_2 \\ + c x_1 + d x_2 + \end{pmatrix} + \end{align*} + + Similarly, + \begin{align*} + \begin{pmatrix} + a & b & c & d \\ + e & f & g & h \\ + i & j & k & l + \end{pmatrix} + \begin{pmatrix} + x_1 \\ + x_2 \\ + x_3 \\ + x_4 + \end{pmatrix} + = + \begin{pmatrix} + \luadirect{ + local matrix_product = require("scripts.matrix_product") + local m1 = { + {"a", "b", "c", "d"}, + {"e", "f", "g", "h"}, + {"i", "j", "k", "l"} + } + local m2 = { + {"x_1"}, + {"x_2"}, + {"x_3"}, + {"x_4"} + } + local product_matrix = matrix_product.matrix_product_repr(m1,m2) + local matrix_dump = matrix_product.dump_matrix(product_matrix) + tex.print(matrix_dump) + } + \end{pmatrix} + \end{align*} +\end{example} + +The number of columns has to be the same as the dimension of the vector to which the matrix is applied. 
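+
+\paragraph*{Numerical check} The operations above can be checked numerically; a small R sketch (the vectors and the matrix are arbitrary examples chosen here for illustration):
+
+\begin{verbatim}
+u <- c(1, 2, 2)
+v <- c(2, 0, -1)
+
+sum(u * v)        # scalar product <u, v> = 2 + 0 - 2 = 0, so u and v are orthogonal
+sqrt(sum(u * u))  # norm of u = sqrt(9) = 3
+
+A <- matrix(c(1, 0, 2, 1, 0, 3), nrow = 2)  # a 2 x 3 matrix, filled column by column
+A %*% u           # defined because ncol(A) equals length(u); the result has 2 rows
+\end{verbatim}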
+ +\begin{definition}[Tranpose of a Matrix] + Let $A = \begin{pmatrix} + a & b \\ + c & d + \end{pmatrix}$, then $A^T = \begin{pmatrix} + a & c \\ + b & d + \end{pmatrix}$ +\end{definition} + +\begin{figure} + \centering + \includestandalone{figures/schemes/coordinates_systems} + \caption{Coordinate systems} +\end{figure} diff --git a/content/introduction.tex b/content/introduction.tex index 775a081..12a8352 100644 --- a/content/introduction.tex +++ b/content/introduction.tex @@ -22,4 +22,14 @@ thus we might consider genotype either as a qualitative variable or quantitative variable. \end{example} -When the variable are quantitative, we use regression, whereas for qualitative variables, we use an analysis of variance. \ No newline at end of file +When the variable are quantitative, we use regression, whereas for qualitative variables, we use an analysis of variance. + +\begin{figure} + \begin{subfigure}{0.45\columnwidth} + \includegraphics[width=\columnwidth]{figures/plots/linear_regression_linear.pdf} + \end{subfigure} + \begin{subfigure}{0.45\columnwidth} + \includegraphics[width=\columnwidth]{figures/plots/linear_regression_non_linear.pdf} + \end{subfigure} + \caption{Illustration of two models fitting observed values} +\end{figure} \ No newline at end of file diff --git a/definitions.tex b/definitions.tex index 9387d0c..b69346b 100644 --- a/definitions.tex +++ b/definitions.tex @@ -1,6 +1,10 @@ \DeclareMathOperator{\VVar}{\mathbb{V}} % variance \DeclareMathOperator{\One}{\mathbf{1}} \DeclareMathOperator{\Cor}{\mathrm{Cor}} +\DeclareMathOperator{\St}{\mathscr{St}} \newcommand{\M}[1][]{\ensuremath{\ifstrempty{#1}{\mathcal{M}}{\mathbb{M}_{#1}}}} \newcommand{\X}{\ensuremath{\mathbf{X}}} \newcommand{\Y}{\ensuremath{\mathbf{Y}}} +\newcommand{\Z}{\ensuremath{\mathbf{Z}}} +\usepackage{unicode-math} + diff --git a/figures/plots/linear_regression.R b/figures/plots/linear_regression.R new file mode 100644 index 0000000..1e3e902 --- /dev/null +++ b/figures/plots/linear_regression.R @@ -0,0 +1,26 @@ +# Plot an affine model +n <- 250 +sd <- 0.05 +epsilon <- rnorm(n, mean = 0, sd = 2) +beta0 <- 1.25 +beta1 <- 4 +linear_model <- function(x) { + return(beta0 + beta1*x) +} +x <- runif(n, min=0, max=1) +y <- linear_model(x) + epsilon + +pdf("figures/plots/linear_regression_linear.pdf") +plot(x, y, col="#5654fa", type="p", pch=20, xlab="x", ylab="y") +abline(a = beta0, b = beta1, col="red") +dev.off() + + +non_linear_model <- function(x) { + return(beta0 + beta1 * exp(2*x)) +} +non_linear_y <- non_linear_model(x) + epsilon +pdf("figures/plots/linear_regression_non_linear.pdf") +plot(x, non_linear_y, col="#5654fa", type="p", pch=20, xlab="x", ylab="z") +curve(non_linear_model, from=0, to=1, add=T, col="red") +dev.off() diff --git a/figures/plots/linear_regression_linear.pdf b/figures/plots/linear_regression_linear.pdf new file mode 100644 index 0000000..0be2ed9 --- /dev/null +++ b/figures/plots/linear_regression_linear.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25059b14c85b0700f41d52bfb08536a101f5ab0ee0b9580aadaae3faeefcd1ae +size 19542 diff --git a/figures/plots/linear_regression_non_linear.pdf b/figures/plots/linear_regression_non_linear.pdf new file mode 100644 index 0000000..20b5677 --- /dev/null +++ b/figures/plots/linear_regression_non_linear.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02fef791e9ba93e0c8eac221ee47f79aecd5a7744945575a793ec3ebfe673c3e +size 20288 diff --git a/figures/schemes/.gitattributes b/figures/schemes/.gitattributes 
new file mode 100644 index 0000000..0799fdf --- /dev/null +++ b/figures/schemes/.gitattributes @@ -0,0 +1,3 @@ +covariance.pdf filter=lfs diff=lfs merge=lfs -text +../plots/linear_regression_linear.pdf filter=lfs diff=lfs merge=lfs -text +../plots/linear_regression_non_linear.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/figures/schemes/covariance.pdf b/figures/schemes/covariance.pdf new file mode 100644 index 0000000..773a8ca --- /dev/null +++ b/figures/schemes/covariance.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c32e65df88f221afec82d7d549ac15078d482112e8693a58e21eaaf1c8958785 +size 36303 diff --git a/figures/schemes/covariance.tex b/figures/schemes/covariance.tex new file mode 100644 index 0000000..aa487ca --- /dev/null +++ b/figures/schemes/covariance.tex @@ -0,0 +1,35 @@ +% Scheme of Covariance +\documentclass[margin=0.5cm]{standalone} +\usepackage{tikz} +\usepackage{amssymb} +\begin{document} +\begin{tikzpicture} + \usetikzlibrary{positioning} + \tikzset{ + point/.style = {circle, inner sep={.75\pgflinewidth}, opacity=1, draw, black, fill=black}, + point name/.style = {insert path={coordinate (#1)}}, + } + \begin{scope}[yshift=0] + \draw (-4, 0.5) -- (4,0.5) node[right] {$Y_i$}; + \draw (-4, -0.5) -- (4,-0.5) node[right] {$Y_j$}; + \node at (6, 0) {$\mathrm{Cov}(Y_i, Y_j) > 0$}; + \node (EYipoint) at (0,0.5) {$\times$}; + \node at (0, 1) {$\mathbb{E}(Y_i)$}; + \node (EYipoint) at (0,-0.5) {$\times$}; + \node at (0, -1) {$\mathbb{E}(Y_j)$}; + + \foreach \x in {-3, 0.5, 2.75} { + \node[point] at (\x, 0.5) {}; + } + \foreach \x in {-2, -1, 3} { + \node[point] at (\x, -0.5) {}; + } + \end{scope} + \begin{scope}[yshift=-100] + \draw (-4,0.5) -- (4,0.5) node[right] {$Y_i$}; + \draw (-4,-0.5) -- (4,-0.5) node[right] {$Y_j$}; + \node at (6, 0) {$\mathrm{Cov}(Y_i, Y_j) \approx 0$}; + \end{scope} + +\end{tikzpicture} +\end{document} \ No newline at end of file diff --git a/main.pdf b/main.pdf index 101c25b..c767f7c 100644 --- a/main.pdf +++ b/main.pdf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27d9599621738b732087974ecd8141b18411aac1ef77ff31c2ea2464ac443eb8 -size 308493 +oid sha256:03d4328d90340efbb70107a12e2d267e15a18d788928a0c1243764ff4ed279f9 +size 286931
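
A possible companion check for figures/plots/linear_regression.R (a sketch, not part of the patch; it assumes the objects x and y created by that script are still in the R session): fitting the simulated data with lm() should recover estimates close to the true values beta0 = 1.25 and beta1 = 4 used in the simulation.

fit <- lm(y ~ x)          # simple linear regression on the simulated data
coef(fit)                 # estimates of beta0 and beta1
confint(fit)              # 95% confidence intervals for the coefficients
summary(fit)$sigma        # residual standard error, estimate of sd(epsilon)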