feat: Addendum on cross validation
parent c552aa24f4
commit 7bcd3f9289
@ -11,8 +11,8 @@
}
}
\includechapters{part1}{1}
\includechapters{part1}{4}
\includechapters{part2}{2}
% \includechapters{part3}{1}
% \includechapters{part3}{1}

@ -1,15 +1,15 @@
\chapter{Tests Reminders}

\section{$\chi^2$ test of independence}
\section{\texorpdfstring{$\chi^2$}{chi2} test of independence}

\section{$\chi^2$ test of goodness of fit}
\section{\texorpdfstring{$\chi^2$}{chi2} test of goodness of fit}

Check whether the observations are consistent with a given distribution.

\begin{example}[Mendel experiments]
Let $AB$, $Ab$, $aB$, $ab$ be the four possible genotypes of peas, combining color and grain shape.
\begin{tabular}
\begin{tabular}{cccc}
\toprule
AB & Ab & aB & ab \\
\midrule

@ -20,6 +20,6 @@ Check whether the observations are consistent with a given distribution.

The test statistic is:
\[
D_{k,n} = \sum_{i=1}^{k} \frac{(N_i - np_i)^2}{np_i} \underoverset{H_0}{\mathcal{L}} \chi^2_{(n-1)(q-1)??}
D_{k,n} = \sum_{i=1}^{k} \frac{(N_i - np_i)^2}{np_i} \overunderset{\mathcal{L}}{H_0}{n \longrightarrow \infty} \chi^2_{k-1}
\]
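As a quick illustration (the counts below are Mendel's classic published values, supplied here only as an example; they may differ from the table used in these notes), the dihybrid cross predicts proportions $p = (9/16, 3/16, 3/16, 1/16)$ and the observed counts are $(315, 101, 108, 32)$ with $n = 556$:
\[
D_{4,556} = \frac{(315 - 312.75)^2}{312.75} + \frac{(101 - 104.25)^2}{104.25} + \frac{(108 - 104.25)^2}{104.25} + \frac{(32 - 34.75)^2}{34.75} \approx 0.47,
\]
which is far below the $95\%$ quantile of the $\chi^2_{3}$ distribution ($\approx 7.81$), so $H_0$ is not rejected.
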
@ -0,0 +1,125 @@
\chapter{Regularized regressions}

Let $\Y$ be a vector of observations and $\X$ a matrix of dimension $n \times (p+1)$.
Suppose the true model is:
\[
\Y = \X^{m^{*}} \beta^{m^{*}} + \varepsilon^{m^{*}} = \X^{*} \beta^{*} + \varepsilon^{*}.
\]
If $p$ is large compared to $n$:
\begin{itemize}
\item $\hat{\beta} = (\X^{T}\X)^{-1} \X^{T} \Y$ is not defined, since $\X^{T}\X$ is not invertible.

$m^{*}$ is the number of true predictors, that is, the number of predictors with non-zero coefficients.

\item
\item
\end{itemize}

\section{Ridge regression}

Instead of minimizing the mean squared error alone, we minimize the following regularized expression:
\[
\hat{\beta}^{\text{ridge}}_{\lambda} = \argmin_{\beta \in \RR[p]} \norm{Y - X \beta}^{2} + \lambda \sum_{j=1}^{p} \beta_{j}^{2}
\]
where $\lambda \geq 0$ is used to calibrate the regularization; it is a way to favor solutions with small parameter values.
The penalty
\[
\sum_{j=1}^{p} \beta_{j}^{2} = \norm{\beta}_{2}^{2}
\]
is the classical squared Euclidean norm of the parameter vector.
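
A standard complement, not written out in these notes: assuming the intercept is handled separately (for instance by centering the columns of $\X$), the ridge criterion above has an explicit minimizer, and adding $\lambda I_{p}$ makes the matrix $\X^{T}\X + \lambda I_{p}$ invertible even when $p > n$, which addresses the problem mentioned at the beginning of the chapter:
\[
\hat{\beta}^{\text{ridge}}_{\lambda} = (\X^{T}\X + \lambda I_{p})^{-1} \X^{T} \Y, \qquad \lambda > 0.
\]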

\section{Cross validation}

\subsection{Leave-one-out \textit{jackknife}}

\begin{example}
Let $\M_{0}$ be the model $Y_{i} = \beta_{0} + \beta_{1} X_{1i} + \beta_{2}X_{2i} + \beta_{3} X_{3i} + \varepsilon_{i}$, with $n = 5$ observations.

In matrix form, the model is:
\[
\begin{pmatrix} y_{1} \\ y_{2} \\ y_{3} \\ y_{4} \\ y_{5} \end{pmatrix} =
\beta_{0} \begin{pmatrix} 1 \\ 1 \\ 1 \\ 1 \\ 1 \end{pmatrix}
+ \beta_{1} \begin{pmatrix} x_{11} \\ x_{12} \\ x_{13} \\ x_{14} \\ x_{15} \end{pmatrix}
+ \beta_{2} \begin{pmatrix} x_{21} \\ x_{22} \\ x_{23} \\ x_{24} \\ x_{25} \end{pmatrix}
+ \beta_{3} \begin{pmatrix} x_{31} \\ x_{32} \\ x_{33} \\ x_{34} \\ x_{35} \end{pmatrix}
+ \begin{pmatrix} \varepsilon_{1} \\ \varepsilon_{2} \\ \varepsilon_{3} \\ \varepsilon_{4} \\ \varepsilon_{5} \end{pmatrix}
\]
\def\x{$\times$}
Each row of the table below corresponds to one leave-one-out fit: the left-out observation is marked with a dot, and the observations used for fitting with \x.

\begin{tabular}{ccccc}
\toprule
1 & 2 & 3 & 4 & 5 \\
\midrule
. & \x & \x & \x & \x \\
\x & . & \x & \x & \x \\
\x & \x & . & \x & \x \\
\x & \x & \x & . & \x \\
\x & \x & \x & \x & . \\
\bottomrule
\end{tabular}
\end{example}

For each candidate value of $\lambda$, we fit the model on each dataset obtained by removing one observation, and evaluate the prediction error on the left-out observation.
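
Written as a criterion (a standard formulation, not stated explicitly in these notes), with $x_{i}$ the $i$-th row of $\X$ and $\hat{\beta}^{(-i)}_{\lambda}$ the estimate obtained without observation $i$:
\[
\mathrm{CV}_{\mathrm{loo}}(\lambda) = \frac{1}{n} \sum_{i=1}^{n} \left( y_{i} - x_{i}^{T} \hat{\beta}^{(-i)}_{\lambda} \right)^{2},
\]
and we keep the value of $\lambda$ that minimizes $\mathrm{CV}_{\mathrm{loo}}(\lambda)$.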

\subsection{K-fold cross-validation}

The observations are split into $K$ subsets, each subset serving once as the validation set: we will have as many tables as subsets.

We choose $\lambda$ such that the generalization error is the smallest.
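
A common way to write the corresponding criterion (a standard formulation, not taken from these notes), with $I_{1}, \dots, I_{K}$ the $K$ validation subsets and $\hat{\beta}^{(-I_{k})}_{\lambda}$ the estimate obtained without the observations of $I_{k}$:
\[
\mathrm{CV}_{K}(\lambda) = \frac{1}{K} \sum_{k=1}^{K} \frac{1}{\abs{I_{k}}} \sum_{i \in I_{k}} \left( y_{i} - x_{i}^{T} \hat{\beta}^{(-I_{k})}_{\lambda} \right)^{2}.
\]
Leave-one-out is the special case $K = n$.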

\section{Lasso regression}

The difference with the Ridge regression lies in the penalty:
\[
\hat{\beta}_{\lambda}^{\text{lasso}} = \argmin_{\beta \in \RR[p]} \norm{Y-X\beta}^{2} + \lambda \sum_{j=1}^{p} \abs{\beta_{j}}
\]
where
\[
\sum_{j=1}^{p} \abs{\beta_j} = \norm{\beta}_1
\]
is the $\ell_1$ norm of the parameter vector.

Instead of shrinking every parameter smoothly, the parameters enter the model one by one as the penalty decreases, and some parameters can be set exactly to $0$.

Lasso regression can therefore be used to perform variable selection.
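
To see why some coefficients are set exactly to zero, a standard special case (an extra assumption, not made in these notes) is the orthonormal design $\X^{T}\X = I_{p}$, for which the lasso solution is the soft-thresholded least squares estimate:
\[
\hat{\beta}^{\text{lasso}}_{\lambda, j} = \operatorname{sign}\bigl(\hat{\beta}^{\text{ols}}_{j}\bigr) \left( \abs{\hat{\beta}^{\text{ols}}_{j}} - \frac{\lambda}{2} \right)_{+},
\]
so every least squares coefficient smaller than $\lambda/2$ in absolute value is set to $0$, whereas ridge regression only rescales it by $1/(1+\lambda)$.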

We can use the same methods (K-fold and Leave-one-out) to select the $\lambda$ value.

\section{Elastic Net}

The elastic net combines the Ridge and Lasso penalties:
\[
\hat{\beta}_{\lambda_{1},\lambda_{2}}^{\text{en}} = \argmin_{\beta \in \RR[p]} \norm{Y-X\beta}^{2} + \lambda_{1} \norm{\beta}_{1} + \lambda_{2} \norm{\beta}_{2}^{2}
\]
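
As a sanity check (added here, but it follows directly from the definitions above), each penalty is recovered by switching the other off:
\[
\hat{\beta}^{\text{en}}_{0, \lambda_{2}} = \hat{\beta}^{\text{ridge}}_{\lambda_{2}}, \qquad \hat{\beta}^{\text{en}}_{\lambda_{1}, 0} = \hat{\beta}^{\text{lasso}}_{\lambda_{1}}.
\]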

\begin{remark}
In the case of Lasso, Elastic Net, or Ridge regression, we can no longer perform statistical tests on the parameters.
\end{remark}

@ -6,5 +6,7 @@
\newcommand{\X}{\ensuremath{\mathbf{X}}}
\newcommand{\Y}{\ensuremath{\mathbf{Y}}}
\newcommand{\Z}{\ensuremath{\mathbf{Z}}}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\usepackage{unicode-math}

@ -4,4 +4,4 @@
\usepackage{tikz-3dplot}
\usepackage{tkz-euclide}
\usepackage{nicematrix}
\usepackage{luacode}
\usepackage{luacode}