\chapter{Linear Model}
\section{Simple Linear Regression}

The simple linear regression model writes, for each observation $i$:
\[
Y_i = \beta_0 + \beta_1 X_i + \varepsilon_i, \qquad i = 1, \ldots, n,
\]
or, in matrix form,
\[
\Y = \X \beta + \varepsilon,
\]
that is,
\[
\begin{pmatrix}
Y_1 \\
Y_2 \\
\vdots \\
Y_n
\end{pmatrix}
=
\begin{pmatrix}
1 & X_1 \\
1 & X_2 \\
\vdots & \vdots \\
1 & X_n
\end{pmatrix}
\begin{pmatrix}
\beta_0 \\
\beta_1
\end{pmatrix}
+
\begin{pmatrix}
\varepsilon_1 \\
\varepsilon_2 \\
\vdots \\
\varepsilon_n
\end{pmatrix}.
\]

\paragraph*{Assumptions}
\begin{enumerate}[label={\color{primary}{($A_\arabic*$)}}]
	\item the $\varepsilon_i$ are independent;
	\item the $\varepsilon_i$ are identically distributed;
	\item the $\varepsilon_i$ are i.i.d.\ $\sim \Norm(0, \sigma^2)$ (homoscedasticity: common variance $\sigma^2$).
\end{enumerate}

\section{Generalized Linear Model}

\[
g(\EE(\Y)) = \X \beta
\]
where the link function $g$ can be, for instance:
\begin{itemize}
	\item Logistic regression: $g(v) = \log \left(\frac{v}{1-v}\right)$, e.g.\ for binary (boolean) responses;
	\item Poisson regression: $g(v) = \log(v)$, e.g.\ for count data.
\end{itemize}
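
For illustration, a minimal Python sketch of how such models can be fitted with the \texttt{statsmodels} package (simulated data; all variable names are arbitrary):
\begin{verbatim}
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
n = 200
x = rng.normal(size=n)
X = sm.add_constant(x)                    # design matrix with intercept

# Logistic regression: binary response, logit link
p = 1 / (1 + np.exp(-(0.5 + 1.2 * x)))
y_bin = rng.binomial(1, p)
logit_fit = sm.GLM(y_bin, X, family=sm.families.Binomial()).fit()

# Poisson regression: count response, log link
lam = np.exp(0.2 + 0.8 * x)
y_count = rng.poisson(lam)
pois_fit = sm.GLM(y_count, X, family=sm.families.Poisson()).fit()

print(logit_fit.params, pois_fit.params)  # estimated coefficients
\end{verbatim}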

\subsection{Penalized Regression}

When the number of explanatory variables is large, in particular when it exceeds the number of observations, i.e.\ $p \gg n$ (with $p$ the number of explanatory variables and $n$ the number of observations), the parameters cannot be estimated by ordinary least squares.

In order to estimate the parameters, we can add penalty terms to the least-squares criterion.

Examples: Lasso regression, Elastic Net, etc.
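
For illustration, a minimal sketch with \texttt{scikit-learn} in a $p \gg n$ setting (the penalty strength \texttt{alpha} is an arbitrary value chosen for the example):
\begin{verbatim}
import numpy as np
from sklearn.linear_model import Lasso, ElasticNet

rng = np.random.default_rng(0)
n, p = 50, 200                            # p >> n
X = rng.normal(size=(n, p))
beta = np.zeros(p)
beta[:5] = [2.0, -1.5, 1.0, 0.5, -0.5]    # only 5 truly active variables
y = X @ beta + rng.normal(scale=0.5, size=n)

lasso = Lasso(alpha=0.1).fit(X, y)                      # L1 penalty
enet = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X, y)    # L1 + L2 penalty

print((lasso.coef_ != 0).sum(), "coefficients kept by the Lasso")
\end{verbatim}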

The model
\[
\Y = \X \beta + \varepsilon
\]
is written equivalently, e.g.\ with $n = 4$ observations and two explanatory variables, as
\[
\begin{pmatrix}
y_1 \\
y_2 \\
y_3 \\
y_4
\end{pmatrix}
=
\begin{pmatrix}
1 & x_{11} & x_{12} \\
1 & x_{21} & x_{22} \\
1 & x_{31} & x_{32} \\
1 & x_{41} & x_{42}
\end{pmatrix}
\begin{pmatrix}
\beta_0 \\
\beta_1 \\
\beta_2
\end{pmatrix} +
\begin{pmatrix}
\varepsilon_1 \\
\varepsilon_2 \\
\varepsilon_3 \\
\varepsilon_4
\end{pmatrix}.
\]

\section{Parameter Estimation}

\subsection{Simple Linear Regression}

\subsection{General Case}

If $\X^T\X$ is invertible, the OLS estimator is:
\begin{equation}
\hat{\beta} = (\X^T\X)^{-1} \X^T \Y
\end{equation}
\subsection{Ordinary Least Squares Algorithm}

We want to minimize the distance between $\X\beta$ and $\Y$:
\[
\min_{\beta} \norm{\Y - \X\beta}^2
\]
(See \autoref{ch:elements-of-linear-algebra}).
\begin{align*}
\Rightarrow& \X \hat{\beta} = \Pi_{V}(\Y) \qquad \text{where $V$ is the subspace spanned by the columns of $\X$}\\
\Rightarrow& \forall v \in V,\, v^T \Y = v^T \Pi_{V}(\Y)\\
\Rightarrow& \forall j: \\
& \X_j^T \Y = \X_j^T \X\hat{\beta} \qquad \text{where $\X_j$ is the $j$-th column of $\X$ and $\hat{\beta}$ is the estimator of $\beta$} \\
\Rightarrow& \X^T \Y = \X^T \X \hat{\beta} \\
\Rightarrow& {\color{gray}(\X^T \X)^{-1}} \X^T \Y = {\color{gray}(\X^T \X)^{-1}} (\X^T\X) \hat{\beta} \\
\Rightarrow& \hat{\beta} = (\X^T\X)^{-1} \X^T \Y
\end{align*}

This formula comes from the orthogonal projection of $\Y$ onto the vector subspace spanned by the explanatory variables (the columns of $\X$).

$\X \hat{\beta}$ is the closest point to $\Y$ in the subspace generated by $\X$.

If $H$ is the projection matrix onto the subspace generated by $\X$, then $H\Y$ is the projection of $\Y$ onto this subspace, which corresponds to $\X\hat{\beta}$.
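
For illustration, a minimal NumPy sketch of this computation on simulated data (solving the normal equations rather than inverting $\X^T\X$ explicitly):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n = 100
x = rng.normal(size=n)
X = np.column_stack([np.ones(n), x])      # design matrix (intercept, x)
y = 2.0 + 3.0 * x + rng.normal(scale=0.5, size=n)

# OLS estimator: solve (X^T X) beta = X^T y
beta_hat = np.linalg.solve(X.T @ X, X.T @ y)

y_hat = X @ beta_hat                      # fitted values = projection of y
residuals = y - y_hat
print(beta_hat)
print(X.T @ residuals)                    # ~0: residuals orthogonal to columns of X
\end{verbatim}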

\section{Sum of squares}

$\Y - \X \hat{\beta} \perp \X \hat{\beta} - \bar{\Y}\One$ if $\One \in V$, so
\[
\underbrace{\norm{\Y - \bar{\Y}\One}^2}_{\text{Total SS}} = \underbrace{\norm{\Y - \X \hat{\beta}}^2}_{\text{Residual SS}} + \underbrace{\norm{\X \hat{\beta} - \bar{\Y} \One}^2}_{\text{Explained SS}}
\]

\section{Coefficient of Determination: \texorpdfstring{$R^2$}{R\textsuperscript{2}}}
\begin{definition}[$R^2$]
\[
0 \leq R^2 = \frac{\norm{\X\hat{\beta} - \bar{\Y}\One}^2}{\norm{\Y - \bar{\Y}\One}^2} = 1 - \frac{\norm{\Y - \X\hat{\beta}}^2}{\norm{\Y - \bar{\Y}\One}^2} \leq 1
\]
It is the proportion of the variation of $\Y$ explained by the model.
\end{definition}
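
For illustration, a short NumPy check of the sum-of-squares decomposition and of $R^2$, continuing the sketch above (same simulated \texttt{X}, \texttt{y}, \texttt{beta\_hat}):
\begin{verbatim}
import numpy as np

y_hat = X @ beta_hat
y_bar = y.mean()

tss = np.sum((y - y_bar) ** 2)            # Total SS
rss = np.sum((y - y_hat) ** 2)            # Residual SS
ess = np.sum((y_hat - y_bar) ** 2)        # Explained SS

print(np.isclose(tss, rss + ess))         # True: orthogonal decomposition
print("R^2 =", ess / tss, "=", 1 - rss / tss)
\end{verbatim}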

\begin{figure}
	\centering
	\includegraphics{figures/schemes/orthogonal_projection.pdf}
	\caption{Orthogonal projection of $\Y$ on the plane generated by the basis described by $\X$. $\color{blue}a$ corresponds to $\norm{\X\hat{\beta} - \bar{\Y}\One}^2$, $\color{blue}b$ corresponds to $\norm{\hat{\varepsilon}}^2 = \norm{\Y - \X\hat{\beta}}^2$ and $\color{blue}c$ corresponds to $\norm{\Y - \bar{\Y}\One}^2$.}
	\label{fig:scheme-orthogonal-projection}
\end{figure}

\begin{figure}
	\centering
	\includegraphics{figures/schemes/ordinary_least_squares.pdf}
	\caption{Ordinary least squares and regression line with simulated data.}
	\label{fig:ordinary-least-squares}
\end{figure}

\begin{definition}[Model dimension]
	Let $\M$ be a model.
	The dimension of $\M$ is the dimension of the subspace generated by $\X$, that is, the number of parameters in the $\beta$ vector.

	\textit{Nb.} The dimension of the model is not the total number of parameters of the model, since $\sigma^2$ is also a model parameter.
\end{definition}

\section{Gaussian vectors}

\begin{definition}[Normal distribution]
	$X \sim \Norm(\mu, \sigma^{2})$, with density function $f$
	\[
	f(x) = \frac{1}{\sigma \sqrt{2\pi}}e^{-\frac{1}{2}\left(\frac{x-\mu}{\sigma}\right)^{2}}
	\]
\end{definition}

\begin{definition}[Gaussian vector]
	A random vector $\Y \in \RR[n]$ is a Gaussian vector if every linear combination of its components is a Gaussian random variable.
\end{definition}

\begin{property}
	$m = \EE(\Y) = (m_1, \ldots, m_n)^T$, where $m_i = \EE(Y_i)$, and
	\[
	\Y \sim \Norm_n(m, \Sigma)
	\]
	where $\Sigma$ is the variance-covariance matrix
	\[
	\Sigma = \EE\left[(\Y -m)(\Y - m)^T\right].
	\]
\end{property}

\begin{remark}
	\[
	\Cov(Y_i, Y_i) = \Var(Y_i)
	\]
\end{remark}

\begin{definition}[Covariance]
	\[
	\Cov(Y_i, Y_j) = \EE\left((Y_i-\EE(Y_i))(Y_j-\EE(Y_j))\right)
	\]
\end{definition}

When two variables are strongly related, the covariance is large (in absolute value).

If two variables $X, Y$ are independent, $\Cov(X, Y) = 0$ (the converse is not true in general).

\begin{definition}[Correlation coefficient]
	\[
	\Cor(Y_i, Y_j) = \frac{\Cov(Y_i, Y_j)}{\sqrt{\Var(Y_i)\,\Var(Y_j)}} = \frac{\EE\left((Y_i-\EE(Y_i))(Y_j-\EE(Y_j))\right)}{\sqrt{\EE\left((Y_i - \EE(Y_i))^2\right) \cdot \EE\left((Y_j - \EE(Y_j))^2\right)}}
	\]
\end{definition}

Covariance is very sensitive to the scale of the variables. For instance, if we measure a distance in millimeters, the covariance will be larger than if the same distance is expressed in meters. The correlation coefficient, which is a normalized covariance, is therefore useful to compare values across scales.
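
For illustration, a small NumPy check of this scale effect (simulated heights and weights; the relation and units are arbitrary):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
height_m = rng.normal(1.75, 0.1, size=500)              # heights in meters
weight = 40 + 20 * height_m + rng.normal(0, 5, 500)     # arbitrary linear relation

height_mm = 1000 * height_m                             # same data in millimeters

print(np.cov(height_m, weight)[0, 1])       # covariance, meters
print(np.cov(height_mm, weight)[0, 1])      # 1000 times larger, millimeters
print(np.corrcoef(height_m, weight)[0, 1])  # correlation, unchanged
print(np.corrcoef(height_mm, weight)[0, 1])
\end{verbatim}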

\begin{remark}
	\begin{align*}
	\Cov(Y_i, Y_i) &= \EE((Y_i - \EE(Y_i)) (Y_i - \EE(Y_i))) \\
	&= \EE((Y_i - \EE(Y_i))^2) \\
	&= \Var(Y_i)
	\end{align*}
\end{remark}

\begin{equation}
\Sigma = \begin{pNiceMatrix}
\VVar(Y_1) & & & &\\
& \Ddots & & & \\
& \Cov(Y_i, Y_j) & \VVar(Y_i) & & \\
& & & \Ddots & \\
& & & & \VVar(Y_n)
\end{pNiceMatrix}
\end{equation}

\begin{definition}[Identity matrix]
	\[
	\mathcal{I}_n = \begin{pNiceMatrix}
	1 & 0 & 0 \\
	0 & \Ddots & 0\\
	0 & 0 & 1
	\end{pNiceMatrix}
	\]
\end{definition}

\begin{theorem}[Cochran Theorem (Consequence)]
	\label{thm:cochran}
	Let $\mathbf{Z}$ be a Gaussian vector: $\mathbf{Z} \sim \Norm_n(0_n, I_n)$.

	\begin{itemize}
		\item If $V_1, V_2$ are orthogonal subspaces of $\RR[n]$ with dimensions $n_1, n_2$ such that
		\[
		\RR[n] = V_1 \overset{\perp}{\oplus} V_2,
		\]
		\item if $Z_1, Z_2$ are the orthogonal projections of $\mathbf{Z}$ on $V_1$ and $V_2$, i.e.\ $Z_1 = \Pi_{V_1}(\mathbf{Z}) = \Pi_1 \mathbf{Z}$ and $Z_2 = \Pi_{V_2} (\mathbf{Z}) = \Pi_2 \mathbf{Z}$ ($\Pi_{1}$ and $\Pi_{2}$ being projection matrices),

		then:
		\item $Z_{1}$, $Z_{2}$ are independent Gaussian vectors, $Z_{1} \sim \Norm_{n} (0_{n}, \Pi_{1})$ and $Z_{2} \sim \Norm_n(0_{n}, \Pi_{2})$.

		In particular $\norm{Z_{1}}^2 \sim \chi^{2}(n_{1})$ and $\norm{Z_{2}}^2 \sim \chi^{2}(n_{2})$.
	\end{itemize}

	$Z_j = \Pi_{V_j}(\mathbf{Z})$ is the projection of $\mathbf{Z}$ on the subspace $V_j$.

	\dots
\end{theorem}

\begin{property}[Estimators properties in the linear model]
	According to \autoref{thm:cochran},
	\[
	\hat{m} \text{ is independent of } \hat{\sigma}^2
	\]
	and
	\[
	\norm{\Y - \Pi_V(\Y)}^2 = \norm{\varepsilon - \Pi_{V}(\varepsilon)}^{2} = \norm{\Pi_{V}^{\perp} (\varepsilon)}^{2}
	\]

	$\hat{m} = \X \hat{\beta}$

	$\hat{m}$ is the estimator of the mean.
\end{property}

\begin{definition}[Chi 2 distribution]
	If $X_1, \ldots, X_n$ are i.i.d.\ $\sim \Norm(0, 1)$, then
	\[
	X_1^2 + \ldots + X_n^2 \sim \chi_n^2
	\]
\end{definition}

\subsection{Estimator's properties}

\[
\Pi_V = \X(\X^T\X)^{-1} \X^T
\]

\begin{align*}
\hat{m} &= \X \hat{\beta} = \X(\X^T\X)^{-1} \X^T \Y \\
&= \Pi_V \Y
\end{align*}

According to the Cochran theorem, we can deduce that the estimator of the predicted values $\hat{m}$ is independent of $\hat{\sigma}^2$.

Each sum of squares, divided by $\sigma^2$, follows a $\chi^2$ distribution.

\subsection{Estimators properties}

\begin{itemize}
	\item $\hat{m}$ is an unbiased estimator of $m$;
	\item $\EE(\hat{\sigma}^{2}) = \sigma^{2}(n-q)/n$, so $\hat{\sigma}^{2}$ is a biased estimator of $\sigma^{2}$;
	\[
	S^{2} = \frac{1}{n-q} \norm{\Y - \Pi_{V}(\Y)}^{2}
	\]
	is an unbiased estimator of $\sigma^2$.
\end{itemize}

We can derive statistical tests from these properties.

\section{Statistical tests}

\subsection{Student $t$-test}

\[
\frac{\hat{\theta}-\theta}{\sqrt{\widehat{\VVar}(\hat{\theta})}} \underset{H_0}{\sim} \St(n-q)
\]

where $\widehat{\VVar}(\hat{\theta})$ is an estimator of the variance of $\hat{\theta}$.

\paragraph{Estimation of $\sigma^2$}

A biased estimator of $\sigma^2$ is:
\[
\hat{\sigma}^2 = \frac{1}{n} \norm{\Y - \Pi_V(\Y)}^2
\]

$S^2$ is the unbiased estimator of $\sigma^2$:
\begin{align*}
S^2 &= \frac{1}{n-q} \norm{\Y - \Pi_V(\Y)}^2 \\
&= \frac{1}{n-q} \sum_{i=1}^n (Y_i - (\X\hat{\beta})_i)^2
\end{align*}

\begin{remark}[On $\hat{m}$]
	\begin{align*}
	\Y = \X \beta + \varepsilon
	\quad &\Leftrightarrow \quad \EE(\Y) = \X \beta \qquad \text{(since $\EE(\varepsilon) = 0_n$)}
	\end{align*}
\end{remark}

\section{Student test of nullity of a parameter}

Let $\beta_j$ be a parameter; the tested hypotheses are as follows:
\[
\begin{cases}
(H_0): \beta_j = 0 \\
(H_1): \beta_j \neq 0
\end{cases}
\]

In general:
\[
\frac{\hat{\beta}_j - \beta_j}{S \sqrt{(\X^T \X)^{-1}_{j,j}}} \sim \St(n-q).
\]
The test statistic is:
\[
W_n = \frac{\hat{\beta}_j}{S \sqrt{(\X^T\X)^{-1}_{j,j}}} \underset{H_0}{\sim} \St(n-q).
\]
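
For illustration, a minimal NumPy/SciPy sketch of this test, continuing the OLS example above (\texttt{X}, \texttt{y}, \texttt{beta\_hat} as before; here $q$ is the number of columns of $\X$):
\begin{verbatim}
import numpy as np
from scipy import stats

n, q = X.shape
residuals = y - X @ beta_hat
S2 = residuals @ residuals / (n - q)           # unbiased estimator of sigma^2
XtX_inv = np.linalg.inv(X.T @ X)

j = 1                                          # test H0: beta_1 = 0
se_j = np.sqrt(S2 * XtX_inv[j, j])
W = beta_hat[j] / se_j                         # test statistic
p_value = 2 * stats.t.sf(abs(W), df=n - q)     # two-sided p-value
print(W, p_value)
\end{verbatim}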

$\hat{\beta}$ is a Gaussian (multivariate normal) vector.

Let's consider a vector of 4 values:
\begin{align*}
\begin{pmatrix}
\hat{\beta}_0 \\
\hat{\beta}_1 \\
\hat{\beta}_2 \\
\hat{\beta}_3
\end{pmatrix}
\sim \Norm_4 \left( \begin{pmatrix}
\beta_0 \\
\beta_1 \\
\beta_2 \\
\beta_3
\end{pmatrix} ;
\sigma^2 \left(\X^T \X\right)^{-1}
\right)
\end{align*}

Let $\M$ be the following model
\begin{align*}
Y_i &= \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \beta_3 X_{3i} + \varepsilon_i
\end{align*}

Why can't we use the following model to test each of the parameter values (here for $X_2$)?
\[
Y_i = \theta_0 + \theta_1 X_{2i} + \varepsilon_i
\]
We cannot use such a model because we would probably run into a confounding factor: even if we are only interested in the relationship between $X_2$ and $Y$, we have to fit the full model.

\begin{example}[Confounding factor]
	Let $Y$ be a variable related to lung cancer. Let $X_1$ be the smoking status, and $X_2$ the variable `alcohol' (for instance the quantity of alcohol drunk per week).

	If we only fit the model $\M: Y_i = \theta_0 + \theta_1 X_{2i} + \varepsilon_i$, we could conclude that there is a relationship between alcohol and lung cancer, because alcohol consumption and smoking are strongly related. If we had fitted the full model $\M: Y_i = \theta_0 + \theta_1 X_{1i} + \theta_2 X_{2i} + \varepsilon_i$, we might instead have found no significant relationship between $X_2$ and $Y$.
\end{example}

\begin{definition}[Student law]
	Let $X$ and $Y$ be two random variables such that $X \indep Y$, $X \sim \Norm(0, 1)$ and $Y \sim \chi_n^2$; then
	\[
	\frac{X}{\sqrt{Y/n}} \sim \St(n)
	\]
\end{definition}

\subsection{Model comparison}

\begin{definition}[Nested models]
	A model $\M_{q'}$ is said to be nested in a model $\M_q$ if the subspace generated by the explanatory variables of $\M_{q'}$ is contained in the subspace generated by those of $\M_q$ (for instance, if $\M_{q'}$ is obtained from $\M_q$ by removing some explanatory variables).
\end{definition}

Let $\M_2$ and $\M_4$ be two models:

$\M_2: Y_i = \beta_0 + \beta_3 X_{3i} + \varepsilon_i$

$\M_4: Y_i = \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \beta_3 X_{3i} + \varepsilon_i$

$\M_2$ is nested in $\M_4$.

\paragraph*{Principle} We compare the residual variances of the two models, that is, the variance that is not explained by each model.

The better the model, the smaller this residual variance.

If everything were explained by the model, the residual variance would be zero.

Here $\M_4$ holds all the information found in $\M_2$ plus additional information. In the worst case, it is at least as good as $\M_2$.

\subsection{Fisher $F$-test of model comparison}

Let $\M_q$ and $\M_{q'}$ be two models such that $\dim(\M_q) = q$, $\dim(\M_{q'}) = q'$, $q > q'$ and $\M_{q'}$ is nested in $\M_q$.

\paragraph{Tested hypotheses}
\[
\begin{cases}
(H_0): \M_{q'} \text{ is the proper model} \\
(H_1): \M_q \text{ is a better model}
\end{cases}
\]

\begin{description}
	\item[ESS] Explained (extra) Sum of Squares
	\item[RSS] Residual Sum of Squares
	\item[EMS] Explained Mean Square
	\item[RMS] Residual Mean Square
\end{description}

\[
ESS = RSS(\M_{q'}) - RSS(\M_q)
\]
\[
RSS(\M) = \norm{\Y - \X\hat{\beta}}^2 = \sum_{i=1}^n \hat{\varepsilon}_i^2
\]
\[
EMS = \frac{ESS}{q - q'}
\]
\[
RMS = \frac{RSS(\M_q)}{n-q}
\]

Under the null hypothesis:
\[
F = \frac{EMS}{RMS} \underset{H_0}{\sim} \Fish(q-q'; n-q)
\]
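
For illustration, a minimal NumPy/SciPy sketch of this comparison on simulated data (the nested designs \texttt{X\_small} and \texttt{X\_full} are arbitrary examples):
\begin{verbatim}
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
n = 100
X1, X2, X3 = rng.normal(size=(3, n))
y = 1.0 + 2.0 * X3 + rng.normal(size=n)                # only X3 is truly active

X_small = np.column_stack([np.ones(n), X3])            # nested model, q' columns
X_full = np.column_stack([np.ones(n), X1, X2, X3])     # full model, q columns

def rss(X, y):
    beta = np.linalg.solve(X.T @ X, X.T @ y)
    r = y - X @ beta
    return r @ r

q_small, q_full = X_small.shape[1], X_full.shape[1]
ess = rss(X_small, y) - rss(X_full, y)
F = (ess / (q_full - q_small)) / (rss(X_full, y) / (n - q_full))
p_value = stats.f.sf(F, q_full - q_small, n - q_full)
print(F, p_value)
\end{verbatim}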

\section{Model validity}

Assumptions:
\begin{itemize}
	\item $\X$ is a full rank matrix;
	\item the residuals are i.i.d.: $\varepsilon \sim \Norm(0_n, \sigma^2 \mathcal{I}_n)$.
\end{itemize}

We also have to look for influential observations.

\subsection{$\X$ is full rank}

To check that the rank of the matrix is $p+1$, we can compute the eigenvalues of the correlation matrix of $\X$. If there is a perfect linear relationship between two variables (two columns of $\X$), one of the eigenvalues is zero. In practice, we never get an exactly zero eigenvalue, so we instead consider the condition index $\kappa = \frac{\lambda_1}{\lambda_p}$, the ratio between the largest and the smallest eigenvalues, with $\lambda_1 \geq \lambda_2 \geq \ldots \geq \lambda_p$ the eigenvalues; a large $\kappa$ indicates near-collinearity.

If all eigenvalues are different from 0, $\X^T \X$ can be inverted, but if some of them are close to 0 the variances of the estimated parameters will be large, and the estimates will not be reliable.

\paragraph{Variance Inflation Factor}

Perform a regression of each predictor against the other predictors.

If there is a strong linear relationship between a predictor and the others, the coefficient of determination $R^2_j$ of this auxiliary regression (the proportion of variance of predictor $j$ explained by the other predictors) will be close to 1, which means that there is a strong relationship between the predictors.

We do this for all predictors, and for predictor $j = 1, \ldots, p$, the variance inflation factor is:
\[
VIF_j = \frac{1}{1-R^2_j}.
\]
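
For illustration, a minimal NumPy sketch computing the $VIF_j$ by auxiliary regressions (simulated collinear predictors; the function name \texttt{vif} is arbitrary):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n = 200
x1 = rng.normal(size=n)
x2 = x1 + rng.normal(scale=0.1, size=n)      # nearly collinear with x1
x3 = rng.normal(size=n)
Z = np.column_stack([x1, x2, x3])            # predictors (without intercept)

def vif(Z, j):
    """VIF of predictor j: regress column j on the other columns."""
    others = np.delete(Z, j, axis=1)
    X_aux = np.column_stack([np.ones(len(Z)), others])
    beta = np.linalg.solve(X_aux.T @ X_aux, X_aux.T @ Z[:, j])
    resid = Z[:, j] - X_aux @ beta
    r2 = 1 - resid @ resid / np.sum((Z[:, j] - Z[:, j].mean()) ** 2)
    return 1 / (1 - r2)

print([vif(Z, j) for j in range(Z.shape[1])])   # large VIF for x1 and x2
\end{verbatim}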

\subparagraph*{Rule}
A common rule is to suspect a multicollinearity problem when $VIF_j > 10$ (a more permissive threshold of $100$ is sometimes used)\dots

In case of multicollinearity, we have to remove the variables one by one until there is no longer multicollinearity.
Variables have to be removed based on statistical results and through discussion with the experimenters.

\subsection{Residuals analysis}

\paragraph*{Assumption}
\[
\varepsilon \sim \Norm_n(0_n, \sigma^2 I_n)
\]

\paragraph{Normality of the residuals} If the $\varepsilon_i$ ($i=1, \ldots, n$) could be observed, we could build a QQ-plot of $\varepsilon_i / \sigma$ against the quantiles of $\Norm(0, 1)$.

Only the residuals $\hat{e}_i$ can be observed.

Let $e_i^*$ be the studentized residuals, considered as estimators of the $\varepsilon_i$:
\[
e_i^* = \frac{\hat{e}_i}{\sqrt{\hat{\sigma}^2_{(i)}(1-H_{ii})}}
\]
where $\hat{\sigma}^2_{(i)}$ is the estimate of $\sigma^2$ obtained without observation $i$, and $H$ is the projection (hat) matrix:
\begin{align*}
\hat{\Y} &= \X \hat{\beta} \\
&= \X \left( (\X^T\X)^{-1} \X^T \Y\right) \\
&= \underbrace{\X (\X^T\X)^{-1} \X^T}_{H} \Y
\end{align*}
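
For illustration, a minimal NumPy sketch computing the hat matrix, the leverages $H_{ii}$ and studentized residuals, continuing the OLS example above (\texttt{X}, \texttt{y} as before; this sketch uses the global estimate $S^2$ instead of $\hat{\sigma}^2_{(i)}$, as an approximation):
\begin{verbatim}
import numpy as np

n, q = X.shape
H = X @ np.linalg.inv(X.T @ X) @ X.T        # hat (projection) matrix
leverage = np.diag(H)                       # H_ii

e_hat = y - H @ y                           # residuals
S2 = e_hat @ e_hat / (n - q)                # unbiased estimate of sigma^2
e_star = e_hat / np.sqrt(S2 * (1 - leverage))   # (approx.) studentized residuals

print(leverage.sum())                       # = trace(H) = number of columns of X
print(np.abs(e_star).max())                 # flag observations with |e*| > 2
\end{verbatim}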

\paragraph{Centered residuals} If $\One = (1, \ldots, 1)^T$ belongs to the subspace generated by the columns of $\X$, the residuals have zero empirical mean ($\sum_i \hat{e}_i = 0$) by construction.

\paragraph{Independence} We do not have a statistical test for independence in R; in practice, we plot the residuals $\hat{e}$ against the fitted values $\X \hat{\beta}$.

\paragraph{Homoscedasticity} Plot $\sqrt{|e_i^*|}$ against $\X \hat{\beta}$.

\paragraph{Influential observations}

We make the distinction between observations:
\begin{itemize}
	\item with a too large residual

	$\rightarrow$ influence on the estimation of $\sigma^2$;
	\item which are too isolated

	$\rightarrow$ influence on the estimation of $\beta$.
\end{itemize}

\[
e_i^* \sim \St(n-p-1)
\]
\subparagraph*{Rule} We consider an observation to be aberrant if
\[
|e_i^*| > \F^{-1}_{\St(n-p-1)}(1-\alpha),
\]
the quantile of order $1-\alpha$ of the $\St(n-p-1)$ distribution, $\alpha$ often being set to $1/n$; alternatively, the threshold is simply set to 2.

\paragraph{Leverage} The leverage of observation $i$ is the diagonal term $H_{ii}$ of the orthogonal projection (hat) matrix.

\begin{property}
	\begin{itemize}
		\item $0 \leq H_{ii} \leq 1$
		\item $\sum_i H_{ii} = p$
	\end{itemize}
\end{property}

\subparagraph*{Rule} We consider that an observation is aberrant if its leverage is ??.

\paragraph{Non-linearity}
\section{Model Selection}

We want to select the best model with the smallest number of predictors.

When a model has too many explanatory variables, the power of the statistical tests decreases.

Different methods:
\begin{itemize}
	\item comparison of nested models;
	\item information criteria;
	\item methods based on the prediction error.
\end{itemize}

\subsection{Information criteria}

\subsubsection{Likelihood}

\begin{definition}[Likelihood]
	The probability of observing the data actually observed, under a given model $\M(k)$ with $k$ parameters:
	\[
	L_n (\M(k))
	\]
\end{definition}

\begin{definition}[Akaike Information Criterion]
	\[
	AIC(\M(k)) = -2 \log L_n (\M(k)) + 2k.
	\]

	$2k$ is a penalty which favours the smallest models.
\end{definition}

\begin{definition}[Bayesian Information Criterion]
	\[
	BIC(\M(k)) = -2 \log L_n (\M(k)) + \log(n) k.
	\]
	$\log(n) k$ is a penalty.
\end{definition}

Usually the $AIC$ penalty is smaller than the $BIC$ penalty, so the $AIC$ criterion tends to select models with more variables than the $BIC$ criterion.
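
For illustration, a minimal NumPy sketch of the AIC and BIC of a Gaussian linear model, computed from the residual sum of squares (the log-likelihood is evaluated at the maximum-likelihood estimates $\hat{\beta}$ and $\hat{\sigma}^2 = RSS/n$; here $k$ counts the regression coefficients, and the designs \texttt{X\_small}, \texttt{X\_full} come from the $F$-test sketch above):
\begin{verbatim}
import numpy as np

def gaussian_ic(X, y):
    """AIC and BIC of the Gaussian linear model y = X beta + eps."""
    n, k = X.shape
    beta = np.linalg.solve(X.T @ X, X.T @ y)
    rss = np.sum((y - X @ beta) ** 2)
    # maximized log-likelihood, with sigma^2_ML = rss / n
    loglik = -0.5 * n * (np.log(2 * np.pi) + np.log(rss / n) + 1)
    aic = -2 * loglik + 2 * k
    bic = -2 * loglik + np.log(n) * k
    return aic, bic

print(gaussian_ic(X_small, y))   # smaller (better) criteria expected here
print(gaussian_ic(X_full, y))
\end{verbatim}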

\subsection{Stepwise}

\begin{description}
	\item[forward] Add new predictors iteratively, beginning with the most contributing ones.
	\item[backward] Remove predictors iteratively.
	\item[stepwise] Combination of forward and backward selection. We start with no predictors and add predictors one at a time; before adding a new predictor, we check whether all previously included predictors remain meaningful.
\end{description}

The problem with these iterative procedures is that a test is performed at each step, so the confidence level should be adjusted for multiple testing.

In practice, the multiple testing problem is not taken into account in these approaches.

We can use information criteria or model comparison in these methods.

\section{Predictions}

Let $X_i$ be the $i$-th row of the matrix $\X$. The observed value $Y_i$ can be estimated by:
\[
\hat{Y}_i = (\X \hat{\beta})_i = X_i \hat{\beta}
\]

\begin{align*}
\EE (\hat{Y}_i) &= (\X \beta)_i = X_i \beta \\
\sigma^{-1} (\hat{\beta} - \beta) &\sim \Norm (0_{p+1}, (\X^T \X)^{-1}), \qquad \text{and} \\
\Var(\hat{Y}_i) &= \sigma^2 X_i (\X^T \X)^{-1} X_i^T \\
S^2 &= \frac{1}{n-q}\norm{\Y - \X\hat{\beta}}^2
\end{align*}

\paragraph{Prediction Confidence Interval}
We can build a confidence interval for the predicted values $(\X \hat{\beta})_i$

\dots

\paragraph{Prediction error of $Y$}

\paragraph{Prediction interval for a new observation $Y_{n+1}$}