\chapter{Linear Model}

\section{Simple Linear Regression}
\[
  Y_i = \beta_0 + \beta_1 X_i + \varepsilon_i
\]
\[
  \Y = \X \beta + \varepsilon.
\]
\[
  \begin{pmatrix} Y_1 \\ Y_2 \\ \vdots \\ Y_n \end{pmatrix}
  =
  \begin{pmatrix} 1 & X_1 \\ 1 & X_2 \\ \vdots & \vdots \\ 1 & X_n \end{pmatrix}
  \begin{pmatrix} \beta_0 \\ \beta_1 \end{pmatrix}
  +
  \begin{pmatrix} \varepsilon_1 \\ \varepsilon_2 \\ \vdots \\ \varepsilon_n \end{pmatrix}
\]
\paragraph*{Assumptions}
\begin{enumerate}[label={\color{primary}{($A_\arabic*$)}}]
  \item $\varepsilon_i$ are independent;
  \item $\varepsilon_i$ are identically distributed;
  \item $\varepsilon_i \sim \Norm(0, \sigma^2)$, with the same variance $\sigma^2$ for all observations (homoscedasticity).
\end{enumerate}

\section{Generalized Linear Model}
\[
  g(\EE(Y)) = \X \beta
\]
where the link function $g$ can be, for instance:
\begin{itemize}
  \item Logistic regression: $g(v) = \log \left(\frac{v}{1-v}\right)$, e.g.\ for binary (Boolean) responses,
  \item Poisson regression: $g(v) = \log(v)$, e.g.\ for count (discrete) responses.
\end{itemize}

\subsection{Penalized Regression}
When the number of explanatory variables is large, in particular when it exceeds the number of observations, i.e.\ $p \gg n$ ($p$ being the number of explanatory variables and $n$ the number of observations), the parameters cannot be estimated by ordinary least squares. In order to estimate them, we can add penalties (additional terms) to the criterion being minimized: Lasso regression, Elastic Net, etc.

\subsection{Statistical Analysis Workflow}
\begin{enumerate}[label={\bfseries\color{primary}Step \arabic*.}]
  \item Graphical representation;
  \item ...
\end{enumerate}
For instance, with $n = 4$ observations and two explanatory variables, the model
\[
  \Y = \X \beta + \varepsilon,
\]
is written equivalently as
\[
  \begin{pmatrix} y_1 \\ y_2 \\ y_3 \\ y_4 \end{pmatrix}
  =
  \begin{pmatrix}
    1 & x_{11} & x_{12} \\
    1 & x_{21} & x_{22} \\
    1 & x_{31} & x_{32} \\
    1 & x_{41} & x_{42}
  \end{pmatrix}
  \begin{pmatrix} \beta_0 \\ \beta_1 \\ \beta_2 \end{pmatrix}
  +
  \begin{pmatrix} \varepsilon_1 \\ \varepsilon_2 \\ \varepsilon_3 \\ \varepsilon_4 \end{pmatrix}.
\]

\section{Parameter Estimation}
\subsection{Simple Linear Regression}
\subsection{General Case}
If $\X^T\X$ is invertible, the OLS estimator is:
\begin{equation}
  \hat{\beta} = (\X^T\X)^{-1} \X^T \Y
\end{equation}

\subsection{Ordinary Least Squares Algorithm}
We want to minimize the distance between $\X\beta$ and $\Y$:
\[
  \min_{\beta} \norm{\Y - \X\beta}^2
\]
(See \autoref{ch:elements-of-linear-algebra}).
\begin{align*}
  \Rightarrow\;& \X \hat{\beta} = \Pi_V \Y, \qquad \text{where $V$ is the subspace spanned by the columns of $\X$}\\
  \Rightarrow\;& \forall v \in V,\; v^T \Y = v^T \Pi_V \Y\\
  \Rightarrow\;& \text{for every column $\X_i$ of $\X$:}\quad \X_i^T \Y = \X_i^T \X \hat{\beta} \qquad \text{where $\hat{\beta}$ is the estimator of $\beta$} \\
  \Rightarrow\;& \X^T \Y = \X^T \X \hat{\beta} \\
  \Rightarrow\;& {\color{gray}(\X^T \X)^{-1}} \X^T \Y = {\color{gray}(\X^T \X)^{-1}} (\X^T\X) \hat{\beta} \\
  \Rightarrow\;& \hat{\beta} = (\X^T\X)^{-1} \X^T \Y
\end{align*}
This formula comes from the orthogonal projection of $\Y$ onto the vector subspace spanned by the explanatory variables $\X$: $\X \hat{\beta}$ is the closest point to $\Y$ in the subspace generated by $\X$. If $H$ is the projection matrix onto the subspace generated by $\X$, then $H\Y$ is the projection of $\Y$ onto this subspace, and it corresponds to $\X\hat{\beta}$.
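
A minimal numerical sketch, assuming a Python environment with NumPy and simulated data (all variable names are illustrative), of the closed-form estimator $\hat{\beta} = (\X^T\X)^{-1} \X^T \Y$:

\begin{verbatim}
import numpy as np

# Illustrative sketch: simulated data, not taken from the course material.
rng = np.random.default_rng(0)

# Design matrix: an intercept column and one explanatory variable.
n = 50
x = rng.normal(size=n)
X = np.column_stack([np.ones(n), x])

# Arbitrary "true" parameters, used only to simulate the response.
beta_true = np.array([1.0, 2.0])
y = X @ beta_true + rng.normal(scale=0.5, size=n)

# OLS estimator: solve the normal equations X^T X beta = X^T y
# (numerically preferable to explicitly inverting X^T X).
beta_hat = np.linalg.solve(X.T @ X, X.T @ y)

# Fitted values: orthogonal projection of y onto the column space of X.
y_hat = X @ beta_hat
\end{verbatim}

Solving the normal equations (or using a least-squares routine such as \texttt{np.linalg.lstsq}) avoids forming $(\X^T\X)^{-1}$ explicitly, which is better behaved numerically.
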
\section{Sum of Squares}
If $\One \in V$, then $\Y - \X \hat{\beta} \perp \X \hat{\beta} - \bar{\Y} \One$, so
\[
  \underbrace{\norm{\Y - \bar{\Y}\One}^2}_{\text{Total SS}}
  = \underbrace{\norm{\Y - \X \hat{\beta}}^2}_{\text{Residual SS}}
  + \underbrace{\norm{\X \hat{\beta} - \bar{\Y} \One}^2}_{\text{Explained SS}}
\]

\section{Coefficient of Determination: \texorpdfstring{$R^2$}{R\textsuperscript{2}}}
\begin{definition}[$R^2$]
  \[
    0 \leq R^2
    = \frac{\norm{\X\hat{\beta} - \bar{\Y}\One}^2}{\norm{\Y - \bar{\Y}\One}^2}
    = 1 - \frac{\norm{\Y - \X\hat{\beta}}^2}{\norm{\Y - \bar{\Y}\One}^2}
    \leq 1
  \]
  is the proportion of the variation of $\Y$ explained by the model.
\end{definition}

\begin{figure}
  \centering
  \includestandalone{figures/schemes/orthogonal_projection}
  \caption{Orthogonal projection of $\Y$ onto the plane generated by the basis described by $\X$. $\color{blue}a$ corresponds to $\norm{\X\hat{\beta} - \bar{\Y}\One}^2$, $\color{blue}b$ corresponds to $\norm{\hat{\varepsilon}}^2 = \norm{\Y - \X\hat{\beta}}^2$ and $\color{blue}c$ corresponds to $\norm{\Y - \bar{\Y}\One}^2$.}
  \label{fig:scheme-orthogonal-projection}
\end{figure}

\begin{figure}
  \centering
  \includestandalone{figures/schemes/ordinary_least_squares}
  \caption{Ordinary least squares and regression line with simulated data.}
  \label{fig:ordinary-least-squares}
\end{figure}

\begin{definition}[Model dimension]
  Let $\M$ be a model. The dimension of $\M$ is the dimension of the subspace generated by $\X$, that is the number of parameters in the $\beta$ vector.
  \textit{Nb.} The dimension of the model is not the total number of parameters, as $\sigma^2$ is also a model parameter but is not counted in the dimension.
\end{definition}

\section{Gaussian vectors}
\begin{definition}[Normal distribution]
  A random variable $Y$ follows the normal distribution $\Norm(m, \sigma^2)$, with mean $m$ and variance $\sigma^2$, if its density is
  \[
    f(y) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(y - m)^2}{2\sigma^2}\right).
  \]
\end{definition}
\begin{definition}[Gaussian vector]
  A random vector $\Y \in \RR[n]$ is a Gaussian vector if every linear combination of its components follows a (univariate) normal distribution.
\end{definition}
\begin{property}
  A Gaussian vector is characterized by its mean vector $m = \EE(\Y) = (m_1, \ldots, m_n)^T$, where $m_i = \EE(Y_i)$, and we write
  \[
    \Y \sim \Norm_n(m, \Sigma)
  \]
  where $\Sigma$ is the variance-covariance matrix:
  \[
    \Sigma = \EE\left[(\Y - m)(\Y - m)^T\right].
  \]
\end{property}
\begin{remark}
  \[
    \Cov(Y_i, Y_i) = \Var(Y_i)
  \]
\end{remark}
\begin{definition}[Covariance]
  \[
    \Cov(Y_i, Y_j) = \EE\left((Y_i-\EE(Y_i))(Y_j-\EE(Y_j))\right)
  \]
\end{definition}
When two variables are strongly related, the covariance is large in absolute value. If two variables $X, Y$ are independent, then $\Cov(X, Y) = 0$ (the converse does not hold in general).
\begin{definition}[Correlation coefficient]
  \[
    \Cor(Y_i, Y_j)
    = \frac{\EE\left((Y_i-\EE(Y_i))(Y_j-\EE(Y_j))\right)}
           {\sqrt{\EE\left((Y_i - \EE(Y_i))^2\right) \cdot \EE\left((Y_j - \EE(Y_j))^2\right)}}
    = \frac{\Cov(Y_i, Y_j)}{\sqrt{\Var(Y_i)\Var(Y_j)}}
  \]
\end{definition}
The covariance is very sensitive to the scale of the variables. For instance, if a distance is measured in millimetres, the covariance is larger than if the same distance is expressed in metres. The correlation coefficient, which is a kind of normalized covariance, is therefore useful to compare values across scales (see the numerical sketch below).
\begin{remark}
  \begin{align*}
    \Cov(Y_i, Y_i) &= \EE((Y_i - \EE(Y_i)) (Y_i - \EE(Y_i))) \\
                   &= \EE((Y_i - \EE(Y_i))^2) \\
                   &= \Var(Y_i)
  \end{align*}
\end{remark}
\begin{equation}
  \Sigma =
  \begin{pNiceMatrix}
    \VVar(Y_1) & & & &\\
    & \Ddots & & & \\
    & \Cov(Y_i, Y_j) & \VVar(Y_i) & & \\
    & & & \Ddots & \\
    & & & & \VVar(Y_n)
  \end{pNiceMatrix}
\end{equation}
\begin{definition}[Identity matrix]
  \[
    \mathcal{I}_n =
    \begin{pNiceMatrix}
      1 & 0 & 0 \\
      0 & \Ddots & 0\\
      0 & 0 & 1
    \end{pNiceMatrix}
  \]
\end{definition}
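
A minimal numerical sketch, assuming Python with NumPy and purely illustrative simulated data, of the remark above: the covariance depends on the measurement units, while the correlation coefficient does not.

\begin{verbatim}
import numpy as np

# Illustrative sketch: simulated data, not taken from the course material.
rng = np.random.default_rng(1)

# Two related variables: a distance (in metres) and a noisy linear function of it.
dist_m = rng.normal(loc=10.0, scale=2.0, size=1000)
other  = 3.0 * dist_m + rng.normal(scale=1.0, size=1000)

# The same distances expressed in millimetres.
dist_mm = 1000.0 * dist_m

cov_m  = np.cov(dist_m, other)[0, 1]        # covariance with metres
cov_mm = np.cov(dist_mm, other)[0, 1]       # about 1000 times larger
cor_m  = np.corrcoef(dist_m, other)[0, 1]   # correlation with metres
cor_mm = np.corrcoef(dist_mm, other)[0, 1]  # identical to cor_m (scale-free)
\end{verbatim}

Rescaling one variable multiplies the covariance by the same factor, while the correlation coefficient is left unchanged.
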
\begin{theorem}[Cochran Theorem (Consequence)]
  Let $\mathbf{Z}$ be a Gaussian vector: $\mathbf{Z} \sim \Norm_n(0_n, I_n)$.
  \begin{itemize}
    \item If $V_1, V_2$ are orthogonal subspaces of $\RR[n]$ with dimensions $n_1, n_2$ such that
      \[
        \RR[n] = V_1 \overset{\perp}{\oplus} V_2,
      \]
    \item and if $Z_1, Z_2$ are the orthogonal projections of $\mathbf{Z}$ on $V_1$ and $V_2$, i.e.\ $Z_1 = \Pi_{V_1}(\mathbf{Z}) = \Pi_1 \mathbf{Z}$ and $Z_2 = \Pi_{V_2}(\mathbf{Z}) = \Pi_2 \mathbf{Z}$,
  \end{itemize}
  then $Z_1$ and $Z_2$ are independent, with $\norm{Z_1}^2 \sim \chi^2_{n_1}$ and $\norm{Z_2}^2 \sim \chi^2_{n_2}$.
\end{theorem}

\begin{definition}[$\chi^2$ distribution]
  If $X_1, \ldots, X_n$ are i.i.d.\ $\sim \Norm(0, 1)$, then
  \[
    X_1^2 + \ldots + X_n^2 \sim \chi_n^2.
  \]
\end{definition}

\subsection{Estimator's properties}
The orthogonal projection matrix onto $V$, the subspace spanned by the columns of $\X$, is
\[
  \Pi_V = \X(\X^T\X)^{-1} \X^T,
\]
so
\begin{align*}
  \hat{m} = \X \hat{\beta} &= \X(\X^T\X)^{-1} \X^T \Y \\
                           &= \Pi_V \Y.
\end{align*}
According to the Cochran theorem, the estimator of the predicted values $\hat{m}$ is independent of $\hat{\sigma}^2$. The sums of squares, properly rescaled, follow $\chi^2$ distributions; in particular,
\[
  \frac{\norm{\Y - \X\hat{\beta}}^2}{\sigma^2} \sim \chi^2_{n - q},
\]
where $q$ is the dimension of the model.
\begin{property}
\end{property}

\subsection{Estimators' consistency}
If $q < n$,
\begin{itemize}
  \item $\hat{\sigma}^2 \xrightarrow[n\to\infty]{\PP} \sigma^{*2}$.
  \item If $(\X^T\X)^{-1}$...
  \item ...
\end{itemize}
We can derive statistical tests from these properties.

\section{Statistical tests}
\subsection{Student $t$-test}
\[
  \frac{\hat{\theta}-\theta}{\sqrt{\frac{\widehat{\VVar}(\hat{\theta})}{n}}} \underset{H_0}{\sim} t
\]
where $t$ denotes a Student distribution, whose number of degrees of freedom is given by the residual degrees of freedom of the model.
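
A minimal numerical sketch, assuming Python with NumPy and SciPy and using simulated data (all names are illustrative), of the $t$-statistic for testing $H_0 : \beta_1 = 0$ in the linear model of the previous sections:

\begin{verbatim}
import numpy as np
from scipy import stats

# Illustrative sketch: simulated data, not taken from the course material.
rng = np.random.default_rng(2)

n = 50
x = rng.normal(size=n)
X = np.column_stack([np.ones(n), x])
y = 1.0 + 2.0 * x + rng.normal(scale=0.5, size=n)

q = X.shape[1]                                # model dimension (length of beta)
beta_hat = np.linalg.solve(X.T @ X, X.T @ y)  # OLS estimator
residuals = y - X @ beta_hat
sigma2_hat = residuals @ residuals / (n - q)  # unbiased estimator of sigma^2
var_beta_hat = sigma2_hat * np.linalg.inv(X.T @ X)

# t-statistic for H0: beta_1 = 0 and its two-sided p-value,
# using a Student distribution with n - q degrees of freedom.
t_stat = beta_hat[1] / np.sqrt(var_beta_hat[1, 1])
p_value = 2 * stats.t.sf(abs(t_stat), df=n - q)
\end{verbatim}

Under $H_0$ and the Gaussian model assumptions, this statistic follows a Student distribution with $n - q$ degrees of freedom.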