multivariate-statistics/content/chapters/part1/1.tex

654 lines
20 KiB
TeX
Raw Permalink Normal View History

2023-11-16 16:47:14 +01:00
\chapter{Linear Model}
\section{Simple Linear Regression}
\[
Y_i = \beta_0 + \beta_1 X_i + \varepsilon_i
\]
\[
\Y = \X \beta + \varepsilon.
\]
\[
\begin{pmatrix}
Y_1 \\
Y_2 \\
\vdots \\
Y_n
\end{pmatrix}
=
\begin{pmatrix}
1 & X_1 \\
1 & X_2 \\
\vdots & \vdots \\
1 & X_n
\end{pmatrix}
\begin{pmatrix}
\beta_0 \\
\beta_1
\end{pmatrix}
+
\begin{pmatrix}
\varepsilon_1 \\
\varepsilon_2 \\
\vdots
\varepsilon_n
\end{pmatrix}
\]
\paragraph*{Assumptions}
\begin{enumerate}[label={\color{primary}{($A_\arabic*$)}}]
\item $\varepsilon_i$ are independent;
\item $\varepsilon_i$ are identically distributed;
\item $\varepsilon_i$ are i.i.d $\sim \Norm(0, \sigma^2)$ (homoscedasticity).
\end{enumerate}
\section{Generalized Linear Model}
\[
g(\EE(Y)) = X \beta
\]
with $g$ being
\begin{itemize}
\item Logistic regression: $g(v) = \log \left(\frac{v}{1-v}\right)$, for instance for boolean values,
\item Poisson regression: $g(v) = \log(v)$, for instance for discrete variables.
\end{itemize}
\subsection{Penalized Regression}
When the number of variables is large, e.g, when the number of explanatory variable is above the number of observations, if $p >> n$ ($p$: the number of explanatory variable, $n$ is the number of observations), we cannot estimate the parameters.
In order to estimate the parameters, we can use penalties (additional terms).
Lasso regression, Elastic Net, etc.
\[
Y = X \beta + \varepsilon,
\]
is noted equivalently as
\[
\begin{pmatrix}
y_1 \\
y_2 \\
y_3 \\
y_4
\end{pmatrix}
= \begin{pmatrix}
1 & x_{11} & x_{12} \\
1 & x_{21} & x_{22} \\
1 & x_{31} & x_{32} \\
1 & x_{41} & x_{42}
\end{pmatrix}
\begin{pmatrix}
\beta_0 \\
\beta_1 \\
\beta_2
\end{pmatrix} +
\begin{pmatrix}
\varepsilon_1 \\
\varepsilon_2 \\
\varepsilon_3 \\
\varepsilon_4
\end{pmatrix}.
\]
\section{Parameter Estimation}
\subsection{Simple Linear Regression}
\subsection{General Case}
If $\X^T\X$ is invertible, the OLS estimator is:
\begin{equation}
\hat{\beta} = (\X^T\X)^{-1} \X^T \Y
\end{equation}
\subsection{Ordinary Least Square Algorithm}
We want to minimize the distance between $\X\beta$ and $\Y$:
\[
\min \norm{\Y - \X\beta}^2
\]
(See \autoref{ch:elements-of-linear-algebra}).
\begin{align*}
\Rightarrow& \X \beta = proj^{(1, \X)} \Y\\
\Rightarrow& \forall v \in w,\, vy = v proj^w(y)\\
\Rightarrow& \forall i: \\
& \X_i \Y = \X_i \X\hat{\beta} \qquad \text{where $\hat{\beta}$ is the estimator of $\beta$} \\
\Rightarrow& \X^T \Y = \X^T \X \hat{\beta} \\
\Rightarrow& {\color{gray}(\X^T \X)^{-1}} \X^T \Y = {\color{gray}(\X^T \X)^{-1}} (\X^T\X) \hat{\beta} \\
\Rightarrow& \hat{\beta} = (\X^T\X)^{-1} \X^T \Y
\end{align*}
This formula comes from the orthogonal projection of $\Y$ on the vector subspace defined by the explanatory variables $\X$
$\X \hat{\beta}$ is the closest point to $\Y$ in the subspace generated by $\X$.
If $H$ is the projection matrix of the subspace generated by $\X$, $\X\Y$ is the projection on $\Y$ on this subspace, that corresponds to $\X\hat{\beta}$.
\section{Sum of squares}
$\Y - \X \hat{\beta} \perp \X \hat{\beta} - \Y \One$ if $\One \in V$, so
\[
\underbrace{\norm{\Y - \bar{\Y}\One}}_{\text{Total SS}} = \underbrace{\norm{\Y - \X \hat{\beta}}^2}_{\text{Residual SS}} + \underbrace{\norm{\X \hat{\beta} - \bar{\Y} \One}^2}_{\text{Explicated SS}}
\]
\section{Coefficient of Determination: \texorpdfstring{$R^2$}{R\textsuperscript{2}}}
\begin{definition}[$R^2$]
\[
0 \leq R^2 = \frac{\norm{\X\hat{\beta} - \bar{\Y}\One}^2}{\norm{\Y - \bar{\Y}\One}^2} = 1 - \frac{\norm{\Y - \X\hat{\beta}}^2}{\norm{\Y - \bar{\Y}\One}^2} \leq 1
\] proportion of variation of $\Y$ explained by the model.
\end{definition}
\begin{figure}
\centering
\includegraphics{figures/schemes/orthogonal_projection.pdf}
\caption{Orthogonal projection of $\Y$ on plan generated by the base described by $\X$. $\color{blue}a$ corresponds to $\norm{\X\hat{\beta} - \bar{\Y}}^2$ and $\color{blue}b$ corresponds to $\hat{\varepsilon} = \norm{\Y - \hat{\beta}\X}^2$} and $\color{blue}c$ corresponds to $\norm{Y - \bar{Y}}^2$.
\label{fig:scheme-orthogonal-projection}
\end{figure}
\begin{figure}
\centering
\includegraphics{figures/schemes/ordinary_least_squares.pdf}
\caption{Ordinary least squares and regression line with simulated data.}
\label{fig:ordinary-least-squares}
\end{figure}
\begin{definition}[Model dimension]
Let $\M$ be a model.
The dimension of $\M$ is the dimension of the subspace generated by $\X$, that is the number of parameters in the $\beta$ vector.
\textit{Nb.} The dimension of the model is not the number of parameter, as $\sigma^2$ is one of the model parameters.
\end{definition}
\section{Gaussian vectors}
\begin{definition}[Normal distribution]
$X \sim \Norm(\mu, \sigma^{2})$, with density function $f$
\[
f(x) = \frac{1}{\sigma \sqrt{2\pi}}e^{-\frac{1}{2}(\frac{x-\mu}{\sigma})^{2}}
\]
\end{definition}
\begin{definition}[Gaussian vector]
A random vector $\Y \in \RR[n]$ is a gaussian vector if every linear combination of its component is a gaussian random variable.
\end{definition}
\begin{property}
$m = \EE(Y) = (m_1, \ldots, m_n)^T$, where $m_i = \EE(Y_i)$
\[
\Y \sim \Norm_n(m, \Sigma)
\]
where $\Sigma$ is the variance-covariance matrix!
\[
\Sigma = \E\left[(\Y -m)(\Y - m)^T\right].
\]
\end{property}
\begin{remark}
\[
\Cov(Y_i, Y_i) = \Var(Y_i)
\]
\end{remark}
\begin{definition}[Covariance]
\[
\Cov(Y_i, Y_j) = \EE\left((Y_i-\EE(Y_j))(Y_j-\EE(Y_j))\right)
\]
\end{definition}
When two variable are linked, the covariance is large.
If two variables $X, Y$ are independent, $\Cov(X, Y) = 0$.
\begin{definition}[Correlation coefficient]
\[
\Cor(Y_i, Y_j) = \frac{\EE\left((Y_i-\EE(Y_j))(Y_j-\EE(Y_j))\right)}{\sqrt{\EE(Y_i - \EE(Y_i)) \cdot \EE(Y_j - \EE(Y_j))}}
\]
\end{definition}
Covariance is really sensitive to scale of variables. For instance, if we measure distance in millimeters, the covariance would be larger than in the case of a measure expressed in metters. Thus the correlation coefficient, which is a sort of normalized covariance is useful, to be able to compare the values.
\begin{remark}
\begin{align*}
\Cov(Y_i, Y_i) &= \EE((Y_i - \EE(Y_i)) (Y_i - \EE(Y_i))) \\
&= \EE((Y_i - \EE(Y_i))^2) \\
&= \Var(Y_i)
\end{align*}
\end{remark}
\begin{equation}
\Sigma = \begin{pNiceMatrix}
\VVar(Y_1) & & & &\\
& \Ddots & & & \\
& \Cov(Y_i, Y_j) & \VVar(Y_i) & & \\
& & & \Ddots & \\
& & & & \VVar(Y_n)
\end{pNiceMatrix}
\end{equation}
\begin{definition}[Identity matrix]
\[
\mathcal{I}_n = \begin{pNiceMatrix}
1 & 0 & 0 \\
0 & \Ddots & 0\\
0 & 0 & 1
\end{pNiceMatrix}
\]
\end{definition}
\begin{theorem}[Cochran Theorem (Consequence)]
\label{thm:cochran}
Let $\mathbf{Z}$ be a gaussian vector: $\mathbf{Z} \sim \Norm_n(0_n, I_n)$.
\begin{itemize}
\item If $V_1, V_n$ are orthogonal subspaces of $\RR[n]$ with dimensions $n_1, n_2$ such that
\[
\RR[n] = V_1 \overset{\perp}{\oplus} V_2.
\]
\item If $Z_1, Z_2$ are orthogonal of $\mathbf{Z}$ on $V_1$ and $V_2$ i.e. $Z_1 = \Pi_{V_1}(\mathbf{Z}) = \Pi_1 \Y$ and $Z_2 = \Pi_{V_2} (\mathbf{Z}) = \Pi_2 \Y$ ($\Pi_{1}$ and $\Pi_{2}$ being projection matrices)
then:
\item $z_{1}$, $Z_{2}$ are independent gaussian vectors, $Z_{1} \sim \Norm_{n_{1}} (0_{n}, \Pi_{1})$ and $Z_{2} \sim \Norm(0_{n_{2}}, \Pi_{2})$.
In particular $\norm{Z_{1}} \sim \chi^{2}(n_{1})$ and $\norm{Z_{2}} \sim \chi^{2}(n_{2})$.
\end{itemize}
$Z_2 = \Pi_{V_1}(\Z)$ is the projection of $\Z$ on subspace $V_1$.
\dots
\end{theorem}
\begin{property}[Estimators properties in the linear model]
According to \autoref{thm:cochran},
\[
\hat{m} \text{ is independent from $\hat{\sigma}^2$}
\]
\[
\norm{\Y - \Pi_V(\Y)}^2 = \norm{\varepsilon - \Pi_{V}(\varepsilon)}^{2} = \norm{\Pi_{V}^{\perp} (\varepsilon)}^{2}
\]
$\hat{m} = \X \hat{\beta}$
$\hat{m}$ is the estimation of the mean.
\end{property}
\begin{definition}[Chi 2 distribution]
If $X_1, \ldots, X_n$ i.i.d. $\sim \Norm(0, 1)$, then;,
\[
X_1^2 + \ldots X_n^2 \sim \chi_n^2
\]
\end{definition}
\subsection{Estimator's properties}
\[
\Pi_V = \X(\X^T\X)^{-1} \X^T
\]
\begin{align*}
\hat{m} &= \X \hat{\beta} = \X(\X^T\X)^{-1} \X^T \Y \\
\intertext{so} \\
&= \Pi_V \Y
\end{align*}
According to Cochran theorem, we can deduce that the estimator of the predicted value $\hat{m}$ is independent $\hat{\sigma}^2$
All the sum of squares follows a $\chi^2$ distribution.
\subsection{Estimators properties}
\begin{itemize}
\item $\hat{m}$ is unbiased and estimator of $m$;
\item $\EE(\hat{\sigma}^{2}) = \sigma^{2}(n-q)/n$ $\hat{\sigma}^{2}$ is a biased estimator of $\sigma^{2}$.
\[
S^{2} = \frac{1}{n-q} \norm{\Y - \Pi_{V}}^{2}
\]
is an unbiased estimator of $\sigma²$.
\end{itemize}
We can derive statistical test from these properties.
\section{Statistical tests}
\subsection{Student $t$-test}
\[
\frac{\hat{\theta}-\theta}{\sqrt{\frac{\widehat{\VVar}(\hat{\theta})}{n}}} \underset{H_0}{\sim} t_{n-q}
\]
where
\paragraph{Estimation of $\sigma^2$}
A biased estimator of $\sigma^2$ is:
\[
\hat{\sigma^2} = ?
\]
$S^2$ is the unbiased estimator of $\sigma^2$
\begin{align*}
S^2 &= \frac{1}{n-q} \norm{\Y - \Pi_V(\Y)}^2 \\
&= \frac{1}{n-q} \sum_{i=1}^n (Y_i - (\X\hat{\beta})_i)^2
\end{align*}
\begin{remark}[On $\hat{m}$]
\begin{align*}
&\Y = \X \beta + \varepsilon
\Leftrightarrow& \EE(\Y) = \X \beta
\end{align*}
\end{remark}
\section{Student test of nullity of a parameter}
Let $\beta_j$ be a parameter, the tested hypotheses are as follows:
\[
\begin{cases}
(H_0): \beta_j = 0 \\
(H_1): \beta_j \neq 0
\end{cases}
\]
Under the null hypothesis:
\[
\frac{\hat{\beta}_j - \beta_j}{S \sqrt{(\X^T \X)^1_{j,j}}} \sim \St(n-q).
\]
The test statistic is:
\[
W_n = \frac{\hat{\beta}_j}{S \sqrt{(\X^T\X)^{-1}_{j,j}}} \underset{H_0}{\sim} \St(n-q).
\]
$\hat{\beta}$ is a multinormal vector.
Let's consider a vector of 4 values:
\begin{align*}
\begin{pmatrix}
\hat{\beta}_0 \\
\hat{\beta}_1 \\
\hat{\beta}_2 \\
\hat{\beta}_3
\end{pmatrix}
\sim \Norm_4 \left( \begin{pmatrix}
\beta_0 \\
\beta_1 \\
\beta_2 \\
\beta_3
\end{pmatrix} ;
\sigma^2 \left(\X^T \X\right)^{-1}
\right)
\end{align*}
Let $\M$ be the following model
\begin{align*}
Y_i &= \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \beta_3 X_{3i} + \varepsilon_i
\end{align*}
Why can't we use the following model to test each of the parameters values (here for $X_2$)?
\[
Y_i = \theta_0 + \theta_1 X_{2i} + \varepsilon_i
\]
We can't use such a model, we would probably meet a confounding factor: even if we are only interested in relationship $X_2$ with $Y$, we have to fit the whole model.
\begin{example}[Confounding parameter]
Let $Y$ be a variable related to the lung cancer. Let $X_1$ be the smoking status, and $X_2$ the variable `alcohol' (for instance the quantity of alcohol drunk per week).
If we only fit the model $\M: Y_i = \theta_0 + \theta_1 X_{2i} + \varepsilon_i$, we could conclude for a relationship between alcohol and lung cancer, because alcohol consumption and smoking is strongly related. If we had fit the model $\M = Y_i = \theta_0 + \theta_1 X_{1i} + \theta_2 X_{2i} + \varepsilon_i$, we could indeed have found no significant relationship between $X_2$ and $Y$.
\end{example}
\begin{definition}[Student law]
Let $X$ and $Y$ be two random variables such as $X \indep Y$, and such that $X \sim \Norm(0, 1)$ and $Y \sim \chi_n^2$, then
\[
\frac{X}{\sqrt{Y}} \sim \St(n)
\]
\end{definition}
\subsection{Model comparison}
\begin{definition}[Nested models]
\end{definition}
Let $\M_2$ and $\M_4$ be two models:
$\M_2: Y_i = \beta_0 + \beta_3 X_{3_i} + \varepsilon_i$
$\M_4: Y_i = \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \beta_3 X_{3i} + \varepsilon_i$
$\M_2$ is nested in $\M_4$.
\paragraph*{Principle} We compare the residual variances of the two models, that is, the variance that is not explained by the model.
The better the model is, the smallest the variance would be.
If everything is explained by the model, the residual variance would be null.
Here $\M_4$ holds all the information found in $\M_2$ plus other informations. In the worst case It would be at least as good as $\M_2$.
\subsection{Fisher $F$-test of model comparison}
Let $\M_q$ and $\M_{q'}$ be two models such as $\dim(\M_q) = q$, $\dim(\M_{q'}) = q'$, $q > q'$ and $\M_{q'}$ is nested in $\M_q$.
\paragraph{Tested hypotheses}
\[
\begin{cases}
(H_0): \M_{q'} \text{ is the proper model} \\
(H_1): \M_q \text{ is a better model}
\end{cases}
\]
\begin{description}
\item[ESS] Estimated Sum of Squares
\item[RSS] Residual Sum of Squares
\item[EMS] Estimates Mean Square
\item[RMS] Residual Mean Square
\end{description}
\[
ESS = RSS(\M_{q'}) - RSS(\M_q)
\]
\[
RSS(\M) = \norm{\Y - \X\hat{\beta}} = \sum_{i=1}^n \hat{\varepsilon}_i^2
\]
\[
EMS = \frac{ESS}{q - q'}
\]
\[
RMS = \frac{RSS(\M_q)}{n-q}
\]
Under the null hypotheses:
\[
F = \frac{EMS}{RMS} \underset{H_0}{\sim} \Fish(q-q'; n-q)
\]
\section{Model validity}
Assumptions:
\begin{itemize}
\item $\X$ is a full rank matrix;
\item Residuals are i.i.d. $\varepsilon \sim \Norm(0_n, \sigma^2 \mathcal{I}_n)$;
\end{itemize}
We have also to look for influential variables.
\subsection{$\X$ is full rank}
To check that the rank of the matrix is $p+1$, we can calculate the eigen value of the correlation value of the matrix. If there is a perfect relationship between two variables (two columns of $\X$), one of the eigen value would be null. In practice, we never get a null eigen value. We consider the condition index as the ratio between the largest and the smallest eigenvalues, if the condition index $\kappa = \frac{\lambda_1}{\lambda_p}$, with $\lambda_1 \geq \lambda_2 \geq \ldots \geq \lambda_p$ the eigenvalues.
If all eigenvalues is different from 0, $\X^T \X$ can be inverted, but the estimated parameter variance would be large, thus the estimation of the parameters would be not relevant (not good enough).
\paragraph{Variance Inflation Factor}
Perform a regression of each of the predictors against the other predictors.
If there is a strong linear relationship between a parameter and the others, it would reflect that the coefficient of determination $R^2$ (the amount of variance explained by the model) for this model, which would mean that there is a strong relationship between the parameters.
We do this for all parameters, and for parameter $j = 1, \ldots, p$, the variance inflation factor would be:
\[
VIF_j = \frac{1}{1-R^2_j}.
\]
\subparagraph*{Rule}
If $VIF > 10$ or $VIF > 100$\dots
In case of multicollinearity, we have to remove the variable one by one until there is no longer multicollinearity.
Variables have to be removed based on statistical results and through discussion with experimenters.
\subsection{Residuals analysis}
\paragraph*{Assumption}
\[
\varepsilon \sim \Norm_n(0_n, \sigma^2 I_n)
\]
\paragraph{Normality of the residuals} If $\varepsilon_i$ ($i=1, \ldots, n$) could be observed we could build a QQ-plot of $\varepsilon_i / \sigma$ against quantiles of $\Norm(0, 1)$.
Only the residual errors $\hat{e}_i$ can be observed:
Let $e_i^*$ be the studentized residual, considered as estimators of $\varepsilon_i$
\[
e_i^* = \frac{\hat{e}_i}{\sqrt{\sigma^2_{(i)(1-H_{ii})}}}
\]
\begin{align*}
\hat{Y} &= X \hat{\beta} \\
&= X \left( (X^TX)^{-1} X^T Y\right) \\
&= \underbrace{X (X^TX)^{-1} X^T}_{H} Y
\end{align*}
\paragraph{Centered residuals} If $(1, \ldots, 1)^T$ belongs to $\X$ $\EE(\varepsilon) = 0$, by construction.
\paragraph{Independence} We do not have a statistical test for independence in R, we would plot the residuals $e$ against $\X \hat{\beta}$.
\paragraph{Homoscedastiscity} Plot the $\sqrt{e^*}$ against $\X \hat{\beta}$.
\paragraph{Influential observations}
We make the distinction between observations:
\begin{itemize}
\item With too large residual
$\rightarrow$ Influence on the estimation of $\sigma^2$
\item Which are too isolated
$\rightarrow$ Influence on the estimation of $\beta$
\end{itemize}
\[
e_i^* \sim \St(n-p-1)
\]
\subparagraph*{Rule} We consider an observation to be aberrant if:
\[
e_i^* > \F^{-1}_{\St(n-p-1)}(1-\alpha)
\]
quantile of order $1-\alpha$, $\alpha$ being often set as $1/n$, or we set the threshold to 2.
\paragraph{Leverage} Leverage is the diagonal term of the orthogonal projection matrix(?) $H_{ii}$.
\begin{property}
\begin{itemize}
\item $0 \leq H_{ii} \leq 1$
\item $\sum_i H_ii = p$
\end{itemize}
\end{property}
\subparagraph*{Rule} We consider that the observation is aberrant if the leverage is ??.
\paragraph{Non-linearity}
\section{Model Selection}
We want to select the best model with the smallest number of predictors.
When models have too many explicative variables, the power of statistical tests decreases.
Different methods:
\begin{itemize}
\item Comparison of nested models;
\item Information criteria;
\item Method based on the prediction error.
\end{itemize}
\subsection{Information criteria}
\subsubsection{Likelihood}
\begin{definition}[Likelihood]
Probability to observe what we observed for a particular model.
\[
L_n (\M(k))
\]
\end{definition}
\begin{definition}[Akaike Information Criterion]
\[
AIC(\M(k)) = -2 \log L_n (\M(k)) + 2k.
\]
$2k$ is a penalty, leading to privilege the smallest model.
\end{definition}
\begin{definition}[Bayesian Information Criterion]
\[
BIC(\M(k)) = -2 \log L_n (\M(k)) + \log(n) k.
\]
$\log(n) k$ is a penalty.
\end{definition}
Usually $AIC$ have smaller penalty than $BIC$, thus $AIC$ criterion tends to select models with more variables than $BIC$ criterion.
\subsection{Stepwise}
\begin{description}
\item[forward] Add new predictor iteratively, beginning with the most contributing predictors.
\item[backward] Remove predictors iteratively.
\item[stepwise] Combination of forward and backward selection. We start by no predictors. We add predictor. Before adding the predictor, we check whether all previously predictors remain meaningful.
\end{description}
The problem with this iterative regression, is that at each step we make a test. We have to reduce the confidence level for multiple test.
In practice, the multiple testing problem is not taken into account in these approaches.
We can use information criteria or model comparison in these methods.
\section{Predictions}
Let $X_i$ the $i$-th row of the matrix $\X$. The observed value $Y_i$ can be estimated by:
\[
\hat{Y}_i = (\X \hat{\beta})_i = X_i \hat{\beta}
\]
\begin{align*}
\EE (\hat{Y}_i) &= (\X \beta)_i = X_i \beta \\
\sigma^{-1} (\X \hat{\beta} - \X \beta) \sim \Norm (0_{p+1}, (\X^T \X)^{-1}), \qquad \text{and} \\
\Var(\hat{Y}_i) = ... \\
S^2 = \norm{...}
\end{align*}
\paragraph{Prediction Confidence Interval}
We can build confidence interval for predicted values $(\X \hat{\beta})_i$
\dots
\paragraph{Prediction error of $Y$}
\paragraph{Prediction interval for a new observation $Y_{n+1}$}