feat: Add notes on generalized linear models

This commit is contained in:
Samuel Ortion 2023-11-10 13:39:13 +01:00
parent 29dad16dfb
commit c552aa24f4
41 changed files with 300 additions and 45 deletions

0
.gitattributes vendored Normal file → Executable file

0
Makefile Normal file → Executable file

@ -11,7 +11,7 @@
} }
} }
\includechapters{part1}{2} \includechapters{part1}{1}
\includechapters{part2}{2} \includechapters{part2}{2}

0
content/chapters/part1/0.tex Normal file → Executable file

106
content/chapters/part1/1.tex Normal file → Executable file

@ -60,12 +60,6 @@ In order to estimate the parameters, we can use penalties (additional terms).
Lasso regression, Elastic Net, etc. Lasso regression, Elastic Net, etc.
\subsection{Statistical Analysis Workflow}
\begin{enumerate}[label={\bfseries\color{primary}Step \arabic*.}]
\item Graphical representation;
\item ...
\end{enumerate}
\[ \[
Y = X \beta + \varepsilon, Y = X \beta + \varepsilon,
\] \]
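A minimal R sketch of fitting this model by ordinary least squares (the data frame \texttt{df} and the columns \texttt{y}, \texttt{x1}, \texttt{x2} are hypothetical placeholders); the penalised variants mentioned above (Lasso, Elastic Net) are available through e.g. the \texttt{glmnet} package.
\begin{verbatim}
# Ordinary least squares fit of Y = X beta + epsilon
fit <- lm(y ~ x1 + x2, data = df)
summary(fit)      # estimates, standard errors, t-tests
coef(fit)         # hat(beta)
residuals(fit)    # hat(epsilon)
\end{verbatim}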
@ -145,21 +139,18 @@ $\Y - \X \hat{\beta} \perp \X \hat{\beta} - \Y \One$ if $\One \in V$, so
\begin{figure} \begin{figure}
\centering \centering
\includestandalone{figures/schemes/orthogonal_projection} \includegraphics{figures/schemes/orthogonal_projection.pdf}
\caption{Orthogonal projection of $\Y$ onto the plane spanned by the basis described by $\X$. $\color{blue}a$ corresponds to $\norm{\X\hat{\beta} - \bar{\Y}}^2$, $\color{blue}b$ corresponds to $\hat{\varepsilon} = \norm{\Y - \X\hat{\beta}}^2$ and $\color{blue}c$ corresponds to $\norm{\Y - \bar{\Y}}^2$.}
\label{fig:scheme-orthogonal-projection} \label{fig:scheme-orthogonal-projection}
\end{figure} \end{figure}
\begin{figure} \begin{figure}
\centering \centering
\includestandalone{figures/schemes/ordinary_least_squares} \includegraphics{figures/schemes/ordinary_least_squares.pdf}
\caption{Ordinary least squares and regression line with simulated data.} \caption{Ordinary least squares and regression line with simulated data.}
\label{fig:ordinary-least-squares} \label{fig:ordinary-least-squares}
\end{figure} \end{figure}
\begin{definition}[Model dimension] \begin{definition}[Model dimension]
Let $\M$ be a model. Let $\M$ be a model.
The dimension of $\M$ is the dimension of the subspace generated by $\X$, that is the number of parameters in the $\beta$ vector. The dimension of $\M$ is the dimension of the subspace generated by $\X$, that is the number of parameters in the $\beta$ vector.
@ -169,22 +160,21 @@ $\Y - \X \hat{\beta} \perp \X \hat{\beta} - \Y \One$ if $\One \in V$, so
\section{Gaussian vectors} \section{Gaussian vectors}
\begin{definition}[Normal distribution] \begin{definition}[Normal distribution]
$X \sim \Norm(\mu, \sigma^{2})$, with density function $f$
\[
f(x) = \frac{1}{\sigma \sqrt{2\pi}}e^{-\frac{1}{2}(\frac{x-\mu}{\sigma})^{2}}
\]
\end{definition} \end{definition}
\begin{definition}[Gaussian vector] \begin{definition}[Gaussian vector]
A random vector $\Y \in \RR[n]$ is a Gaussian vector if every linear combination of its components is a Gaussian random variable.
\end{definition} \end{definition}
\begin{property} \begin{property}
$m = \EE(Y) = (m_1, \ldots, m_n)^T$, where $m_i = \EE(Y_i)$ $m = \EE(Y) = (m_1, \ldots, m_n)^T$, where $m_i = \EE(Y_i)$
...
\[ \[
\Y \sim \Norm_n(m, \Sigma) \Y \sim \Norm_n(m, \Sigma)
\] \]
@ -193,8 +183,6 @@ $\Y - \X \hat{\beta} \perp \X \hat{\beta} - \Y \One$ if $\One \in V$, so
\Sigma = \E\left[(\Y -m)(\Y - m)^T\right]. \Sigma = \E\left[(\Y -m)(\Y - m)^T\right].
\] \]
\end{property} \end{property}
\begin{remark} \begin{remark}
@ -261,24 +249,25 @@ Covariance is really sensitive to scale of variables. For instance, if we measur
\[ \[
\RR[n] = V_1 \overset{\perp}{\oplus} V_2. \RR[n] = V_1 \overset{\perp}{\oplus} V_2.
\] \]
\item If $Z_1, Z_2$ are the orthogonal projections of $\mathbf{Z}$ on $V_1$ and $V_2$, i.e. $Z_1 = \Pi_{V_1}(\mathbf{Z}) = \Pi_1 \Z$ and $Z_2 = \Pi_{V_2} (\mathbf{Z}) = \Pi_2 \Z$ ($\Pi_{1}$ and $\Pi_{2}$ being the projection matrices), then:
\item $Z_{1}$ and $Z_{2}$ are independent Gaussian vectors, $Z_{1} \sim \Norm_{n} (0_{n}, \Pi_{1})$ and $Z_{2} \sim \Norm_{n}(0_{n}, \Pi_{2})$.
In particular $\norm{Z_{1}}^{2} \sim \chi^{2}(n_{1})$ and $\norm{Z_{2}}^{2} \sim \chi^{2}(n_{2})$, where $n_{1} = \dim V_{1}$ and $n_{2} = \dim V_{2}$.
\end{itemize} \end{itemize}
$Z_1 = \Pi_{V_1}(\Z)$ is the projection of $\Z$ on the subspace $V_1$.
\dots \dots
\end{theorem} \end{theorem}
\begin{property}[Estimators properties in the linear model] \begin{property}[Estimators properties in the linear model]
According to \autoref{thm:cochran}, According to \autoref{thm:cochran},
\[ \[
\hat{m} \text{ is independent from $\hat{\sigma}^2$} \hat{m} \text{ is independent from $\hat{\sigma}^2$}
\]\dots \]
\[ \[
\frac{\norm{\Y - \Pi_V(\Y)}^2}{...} \sim \norm{\Y - \Pi_V(\Y)}^2 = \norm{\varepsilon - \Pi_{V}(\varepsilon)}^{2} = \norm{\Pi_{V}^{\perp} (\varepsilon)}^{2}
\] \]
$\hat{m} = \X \hat{\beta}$ $\hat{m} = \X \hat{\beta}$
@ -303,40 +292,34 @@ Covariance is really sensitive to scale of variables. For instance, if we measur
\begin{align*} \begin{align*}
\hat{m} &= \X \hat{\beta} = \X(\X^T\X)^{-1} \X^T \Y \\ \hat{m} &= \X \hat{\beta} = \X(\X^T\X)^{-1} \X^T \Y \\
\intertext{so}
&= \Pi_V \Y &= \Pi_V \Y
\end{align*} \end{align*}
According to Cochran's theorem, we can deduce that the estimator of the predicted value, $\hat{m}$, is independent of $\hat{\sigma}^2$.
All the sums of squares follow a $\chi^2$ distribution.
\[
...
\]
\begin{property}
\end{property} \subsection{Estimators properties}
\subsection{Estimators consistency}
If $q < n$,
\begin{itemize} \begin{itemize}
\item $\hat{m}$ is an unbiased estimator of $m$;
\item $\EE(\hat{\sigma}^{2}) = \sigma^{2}(n-q)/n$, so $\hat{\sigma}^{2}$ is a biased estimator of $\sigma^{2}$;
\[
S^{2} = \frac{1}{n-q} \norm{\Y - \Pi_{V}(\Y)}^{2}
\]
is an unbiased estimator of $\sigma^{2}$.
\end{itemize} \end{itemize}
We can derive statistical tests from these properties.
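A short R sketch (with simulated data) checking that the closed-form estimator $\hat{\beta} = (\X^T\X)^{-1}\X^T\Y$ and the unbiased variance estimator $S^2$ match what \texttt{lm()} reports; all names are illustrative.
\begin{verbatim}
set.seed(1)
n <- 100; q <- 3                          # q = dimension of the model
X <- cbind(1, rnorm(n), rnorm(n))         # design matrix with intercept
beta <- c(1, 2, -0.5)
y <- drop(X %*% beta) + rnorm(n, sd = 1.5)

beta_hat <- solve(t(X) %*% X, t(X) %*% y) # (X'X)^{-1} X'y
m_hat    <- drop(X %*% beta_hat)          # fitted values = Pi_V(y)
S2       <- sum((y - m_hat)^2) / (n - q)  # unbiased estimator of sigma^2

fit <- lm(y ~ X - 1)                      # the same fit via lm()
c(S2, summary(fit)$sigma^2)               # the two estimates agree
\end{verbatim}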
\section{Statistical tests} \section{Statistical tests}
\subsection{Student $t$-test} \subsection{Student $t$-test}
\[ \[
\frac{\hat{\theta}-\theta}{\sqrt{\frac{\widehat{\VVar}(\hat{\theta})}{n}}} \underset{H_0}{\sim} t \frac{\hat{\theta}-\theta}{\sqrt{\frac{\widehat{\VVar}(\hat{\theta})}{n}}} \underset{H_0}{\sim} t_{n-q}
\] \]
where where
@ -627,3 +610,44 @@ Different methods:
Usually $AIC$ has a smaller penalty than $BIC$; thus the $AIC$ criterion tends to select models with more variables than the $BIC$ criterion.
\subsection{Stepwise}
\begin{description}
\item[forward] Add predictors iteratively, beginning with the most contributing ones.
\item[backward] Remove predictors iteratively.
\item[stepwise] Combination of forward and backward selection. We start with no predictors and add them one at a time; before adding a new predictor, we check whether all previously added predictors remain meaningful.
\end{description}
The problem with this iterative procedure is that we perform a test at each step, so the confidence level should be adjusted for multiple testing.
In practice, the multiple testing problem is not taken into account in these approaches.
We can use information criteria or model comparison in these methods.
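A minimal R sketch of these selection strategies with the built-in \texttt{step()} function, which is driven by an information criterion rather than by repeated tests (the model \texttt{full} and the data frame \texttt{df} are hypothetical):
\begin{verbatim}
full <- lm(y ~ x1 + x2 + x3, data = df)   # largest candidate model
null <- lm(y ~ 1, data = df)              # intercept-only model

fwd  <- step(null, scope = formula(full), direction = "forward")
bwd  <- step(full, direction = "backward")
both <- step(null, scope = formula(full), direction = "both")

# k = log(n) replaces the AIC penalty by the BIC penalty
bwd_bic <- step(full, direction = "backward", k = log(nrow(df)))
\end{verbatim}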
\section{Predictions}
Let $X_i$ be the $i$-th row of the matrix $\X$. The observed value $Y_i$ can be estimated by: Let $X_i$ be the $i$-th row of the matrix $\X$. The observed value $Y_i$ can be estimated by:
\[
\hat{Y}_i = (\X \hat{\beta})_i = X_i \hat{\beta}
\]
\begin{align*}
\EE (\hat{Y}_i) &= (\X \beta)_i = X_i \beta \\
\sigma^{-1} (\hat{\beta} - \beta) &\sim \Norm (0_{p+1}, (\X^T \X)^{-1}), \qquad \text{and} \\
\Var(\hat{Y}_i) = ... \\
S^2 = \norm{...}
\end{align*}
\paragraph{Prediction Confidence Interval}
We can build a confidence interval for the predicted values $(\X \hat{\beta})_i$
\dots
\paragraph{Prediction error of $Y$}
\paragraph{Prediction interval for a new observation $Y_{n+1}$}
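A hedged R sketch of both intervals with \texttt{predict()} (model and data names are illustrative): the confidence interval targets the mean response $X_i \beta$, while the prediction interval accounts for the extra noise of a new observation $Y_{n+1}$ and is therefore wider.
\begin{verbatim}
fit <- lm(y ~ x1 + x2, data = df)
new_obs <- data.frame(x1 = 0.5, x2 = -1)

# confidence interval for the mean response at new_obs
predict(fit, newdata = new_obs, interval = "confidence", level = 0.95)

# prediction interval for a new observation Y_{n+1}
predict(fit, newdata = new_obs, interval = "prediction", level = 0.95)
\end{verbatim}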

182
content/chapters/part1/2.tex Normal file → Executable file

@ -1,4 +1,186 @@
\chapter{Generalized Linear Model} \chapter{Generalized Linear Model}
\begin{example}
\begin{description}
\item[Ex. 1 - Credit Card Default]
Let $Y_i$ be a boolean random variable following a Bernoulli distribution.
\item[Ex. 2 - Horseshoe Crabs]
Let $Y_i$ be the number of satellite males.
$Y_i$ can be described as following a Poisson distribution.
\end{description}
\end{example}
\begin{remark}
A Poisson distribution can be viewed as an approximation of the binomial distribution when $n$ is large and $p$ is small.
\end{remark}
We will consider the following relation:
\[
\EE(Y_i) = g^{-1}(X_i \beta),
\]
equivalently:
\[
g(\EE(Y_i)) = X_i \beta.
\]
\begin{itemize}
\item $\beta$ is estimated by the maximum likelihood;
\item $g$ is called the link function.
\end{itemize}
\begin{remark}
In the standard linear model, the OLS estimator coincides with the maximum likelihood estimator.
\end{remark}
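An R sketch of the two motivating examples with \texttt{glm()}, which performs the maximum likelihood estimation of $\beta$ for a chosen family and link function (the data frames \texttt{default\_df} and \texttt{crabs\_df} and their columns are hypothetical):
\begin{verbatim}
# Ex. 1: Bernoulli response, logit link (logistic regression)
fit_logit <- glm(default ~ student + balance + income,
                 family = binomial(link = "logit"), data = default_df)

# Ex. 2: count response, log link (Poisson regression)
fit_pois <- glm(satellites ~ width,
                family = poisson(link = "log"), data = crabs_df)

summary(fit_logit)   # maximum likelihood estimates and Wald tests
\end{verbatim}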
\section{Logistic Regression} \section{Logistic Regression}
\begin{align*}
& \log\left(\frac{\Pi}{1 - \Pi}\right) = \X \beta \\
\Leftrightarrow \; & e^{\ln \frac{\Pi}{1 - \Pi}} = e^{\X \beta} \\
\Leftrightarrow \; & \frac{\Pi}{1 - \Pi} = e^{\X \beta} \\
\Leftrightarrow \; & \Pi = (1 - \Pi) e^{\X\beta} \\
\Leftrightarrow \; & \Pi = e^{\X \beta} - \Pi e^{\X\beta} \\
\Leftrightarrow \; & \Pi + \Pi e^{\X\beta} = e^{\X \beta} \\
\Leftrightarrow \; & \Pi (1 + e^{\X\beta}) = e^{\X \beta} \\
\Leftrightarrow \; & \Pi = \frac{e^{\X\beta}}{1 + e^{\X \beta}}
\end{align*}
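A quick numerical check of the last identity in R: the built-in \texttt{plogis()} is exactly the map $x \mapsto e^{x}/(1 + e^{x})$, so both expressions agree.
\begin{verbatim}
xb <- seq(-3, 3, by = 0.5)                        # a few values of X beta
max(abs(plogis(xb) - exp(xb) / (1 + exp(xb))))    # numerically zero
\end{verbatim}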
\section{Maximum Likelihood estimator}
Log-likelihood: the logarithm of the probability of observing the data that we actually observed, seen as a function of the parameters.
Estimate $\beta$ by $\hat{\beta}$ such that $\forall \beta \in \RR[p+1]$:
\[
L_n (\hat{\beta}) \geq L_n (\beta)
\]
These estimators are consistent, but not necessarily unbiased.
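For the logistic case, assuming independent observations $Y_i \in \{0, 1\}$ with $\Prob(Y_i = 1 \: | \: X_i) = e^{X_i \beta}/(1 + e^{X_i \beta})$, the log-likelihood to be maximised is
\[
L_n(\beta) = \sum_{i=1}^{n} \left[ Y_i X_i \beta - \log\left(1 + e^{X_i \beta}\right) \right].
\]
There is no closed-form maximiser; $\hat{\beta}$ is computed numerically (e.g. by Newton--Raphson / iteratively reweighted least squares).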
\section{Test for each single coordinate}
\begin{example}[Payment Default]
Let $Y_i$ be the default value for individual $i$.
\[
\log (\frac{\Pi (X)}{1 - \Pi (X)}) = \beta_0 + \beta_1 \text{student} + \beta_2 \text{balance} + \beta_3 \text{income}
\]
In this example, only $\beta_0$ and $\beta_2$ are significantly different from 0.
\end{example}
\begin{remark}
We do not add $\varepsilon_i$, because $\log(\frac{\Pi (X)}{1 - \Pi (X)})$ corresponds to the expectation.
\end{remark}
\subsection{Comparison of nested models}
To test $H_0:\: \beta_1 = \ldots = \beta_p = 0$, we use the likelihood ratio test:
\[
T_n = -2 \log (\mathcal{L}^{\texttt{null}}) + 2 \log (\mathcal{L}(\hat{\beta})) \underset{H_0}{\overunderset{\mathcal{L}}{n \to \infty}{\longrightarrow}} \chi^2(p).
\]
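A sketch of this likelihood ratio test in R, comparing the null (intercept-only) model with the full model (names as in the earlier hypothetical example):
\begin{verbatim}
fit_null <- glm(default ~ 1, family = binomial, data = default_df)
fit_full <- glm(default ~ student + balance + income,
                family = binomial, data = default_df)

# deviance difference = T_n, compared to a chi-squared with p df
anova(fit_null, fit_full, test = "Chisq")
\end{verbatim}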
\begin{remark}[Family of Tests]
\begin{itemize}
\item Comparison of estimated values and values under the null hypothesis;
\item Likelihood ratio test;
\item Based on the slope on the derivative.
\end{itemize}
\end{remark}
\section{Relative risk}
$RR(j)$ is the probability of having the disease conditional on the predictor $X_{i_1}$, over the probability of having the disease conditional on the predictor $X_{i_2}$.
\[
RR(j) = \frac{\Prob(Y_{i_1} = 1 \: | \: X_{i_1})}{\Prob(Y_{i_2} = 1 \: | \: X_{i_2})} = \frac{\EE(Y_{i_1})}{\EE(Y_{i_2})}.
\]
$\pi(X_i)$ is the probability of having the disease, according to $X_i$.
The relative risk can be written as\dots
\section{Odds}
Quantity providing a measure of the likelihood of a particular outcome:
\[
\text{odds}(X_i) = \frac{\pi(X_i)}{1 - \pi(X_i)},
\]
which, under the logistic model, gives
\[
\text{odds}(X_i) = \exp(X_i \beta).
\]
The odds are the ratio of the probability of having the disease (if $Y$ represents the disease status) to the probability of not having it.
\section{Odds Ratio}
\begin{align*}
OR(j) = \frac{\text{odds}(X_{i_1})}{\text{odds}(X_{i_2})} & = \frac{\frac{\pi(X_{i_1})}{1 - \pi(X_{i_1})}}{\frac{\pi(X_{i_2})}{1 - \pi(X_{i_2})}}
\end{align*}
The OR can be written as:
\[
OR(j) = \exp(\beta_j)
\]
\begin{exercise}
Show that $OR(j) = \exp(\beta_j)$.
\end{exercise}
\begin{align*}
OR(j) & = \frac{\text{odds}(X_{i_1})}{\text{odds}(X_{i_2})} \\
& = \frac{\exp(X_{i_1} \beta)}{\exp(X_{i_2} \beta)}
\end{align*}
\[
\log \left(
\frac{\Prob(Y=1 \: |\: X_{i_1})}{1 - \Prob(Y=1 \: |\: X_{i_1})}\right)
= \beta_0 + \beta_1 X_1^{(1)} + \beta_2 X_2^{(1)} + \ldots + \beta_p X_p^{(1)}
\]
Similarly
\[
\log \left(
\frac{\Prob(Y=1 \: |\: X_{i_2})}{1 - \Prob(Y=1 \: |\: X_{i_2})}\right)
= \beta_0 + \beta_1 X_1^{(2)} + \beta_2 X_2^{(2)} + \ldots + \beta_p X_p^{(2)}
\]
We subtract the two equations, assuming the two individuals are identical except for the $j$-th predictor, with $X_j^{(1)} - X_j^{(2)} = 1$:
\begin{align*}
&\log \left(
\frac{\Prob(Y=1 \: |\: X_{i_1})}{1 - \Prob(Y=1 \: |\: X_{i_1})} \right) - \log \left(\frac{\Prob(Y=1 \: |\: X_{i_2})}{1 - \Prob(Y=1 \: |\: X_{i_2})}\right) \\
& = \beta_0 + \beta_1 X_1^{(1)} + \beta_2 X_2^{(1)} + \ldots + \beta_p X_p^{(1)} - \left(\beta_0 + \beta_1 X_1^{(2)} + \beta_2 X_2^{(2)} + \ldots + \beta_p X_p^{(2)}\right) \\
& = \log OR(j) \\
& = \cancel{(\beta_0 - \beta_0)} + \beta_1 \cancel{(X_1^{(1)} - X_1^{(2)})} + \beta_2 \cancel{(X_2^{(1)} - X_2^{(2)})} + \ldots + \beta_j \cancelto{1}{(X_j^{(1)} - X_j^{(2)})} + \ldots + \beta_p \cancel{(X_p^{(1)} - X_p^{(2)})} \\
&\Leftrightarrow \log (OR_j) = \beta_j \\
&\Leftrightarrow OR(j) = \exp(\beta_j)
\end{align*}
OR is not equal to RR in general; the two are close only when the outcome probability is small (rare disease).
If $OR(j)$ is significantly different from 1, then $\exp(\beta_j)$ is significantly different from 1, thus $\beta_j$ is significantly different from 0.
If a predictor has more than two classes, a difference such as $X_{i_1} - X_{i_2}$ is no longer meaningful on its own: we have to choose a reference class and compare each of the other classes with this reference class.
$\hat{\pi}(X_{+}) = \widehat{\Prob}(Y = 1 \: | \: X_{+})$ is the estimated probability for a new individual with covariates $X_{+}$.
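In R (continuing the hypothetical \texttt{fit\_logit} example), the estimated odds ratios $\exp(\hat{\beta}_j)$ and the estimated probability for a new individual can be obtained as follows:
\begin{verbatim}
# odds ratios with Wald confidence intervals
exp(cbind(OR = coef(fit_logit), confint.default(fit_logit)))

# estimated probability pi(X_+) for a new individual
new_ind <- data.frame(student = "Yes", balance = 1500, income = 40000)
predict(fit_logit, newdata = new_ind, type = "response")
\end{verbatim}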
\section{Poisson model}
Let $Y_{i} \sim \mathcal{P}(\lambda_{i})$, corresponding to a count.
\begin{align*}
\EE(Y_{i}) & = g^{-1}(X_{i} \beta) \\
\Leftrightarrow g(\EE(Y_{i})) = X_{i} \beta
\end{align*}
where $g(x) = \ln(x)$, and $g^{-1}(x) = e^{x}$.
\[
\lambda_{i} = \EE(Y_{i}) = \Var(Y_{i})
\]
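An R sketch of the Poisson model for the (hypothetical) horseshoe crab data; with the log link, $\exp(\hat{\beta}_j)$ acts multiplicatively on $\lambda_i = \EE(Y_i)$:
\begin{verbatim}
fit_pois <- glm(satellites ~ width + weight,
                family = poisson(link = "log"), data = crabs_df)
summary(fit_pois)
exp(coef(fit_pois))   # multiplicative effects on the expected count

# the model assumes E(Y_i) = Var(Y_i); strong overdispersion would
# suggest e.g. a quasi-Poisson or negative binomial model instead
\end{verbatim}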

25
content/chapters/part1/3.tex Executable file

@ -0,0 +1,25 @@
\chapter{Tests Reminders}
\section{$\chi^2$ test of independence}
\section{$\chi^2$ test of goodness of fit}
Check whether the observations are consistent with a particular distribution.
\begin{example}[Mendel experiments]
Let $AB$, $Ab$, $aB$, $ab$ be the four possible genotypes of peas, combining color and grain shape.
\begin{tabular}{cccc}
\toprule
AB & Ab & aB & ab \\
\midrule
315 & 108 & 101 & 32 \\
\bottomrule
\end{tabular}
\end{example}
The test statistic is:
\[
D_{k,n} = \sum_{i=1}^{k} \frac{(N_i - np_i)^2}{np_i} \underset{H_0}{\overunderset{\mathcal{L}}{n \to \infty}{\longrightarrow}} \chi^2(k-1)
\]
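The Mendel counts above can be checked against the classical 9:3:3:1 expected ratio with the built-in goodness-of-fit test in R:
\begin{verbatim}
observed <- c(AB = 315, Ab = 108, aB = 101, ab = 32)
chisq.test(observed, p = c(9, 3, 3, 1) / 16)
# X-squared is about 0.47 on k - 1 = 3 degrees of freedom:
# no evidence against the 9:3:3:1 hypothesis
\end{verbatim}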

0
content/chapters/part2/0.tex Normal file → Executable file

4
content/chapters/part2/1.tex Normal file → Executable file

@ -68,7 +68,7 @@ Let $u = \begin{pmatrix}
\begin{figure} \begin{figure}
\centering \centering
\includestandalone{figures/schemes/vector_orthogonality} \includegraphics{figures/schemes/vector_orthogonality.pdf}
\caption{Scalar product of two orthogonal vectors.} \caption{Scalar product of two orthogonal vectors.}
\label{fig:scheme-orthogonal-scalar-product} \label{fig:scheme-orthogonal-scalar-product}
\end{figure} \end{figure}
@ -215,6 +215,6 @@ The number of columns has to be the same as the dimension of the vector to which
\begin{figure} \begin{figure}
\centering \centering
\includestandalone{figures/schemes/coordinates_systems} \includegraphics{figures/schemes/coordinates_systems.pdf}
\caption{Coordinate systems} \caption{Coordinate systems}
\end{figure} \end{figure}

0
content/conclusion.tex Normal file → Executable file

0
content/introduction.tex Normal file → Executable file

0
definitions.tex Normal file → Executable file

0
figures/plots/linear_regression.R Normal file → Executable file

BIN
figures/plots/linear_regression_linear.pdf Normal file → Executable file

BIN
figures/plots/linear_regression_non_linear.pdf Normal file → Executable file

BIN
figures/plots/logistic_curve.pdf Executable file


@ -0,0 +1,23 @@
\documentclass[margin=0.5cm]{standalone}
\usepackage{tikz}
\usepackage{pgfplots}
\pgfplotsset{compat=1.18}
\begin{document}
\begin{tikzpicture}
\begin{axis}[
title={Logistic function},
xlabel={$x$},
ylabel={$y$},
domain=-5:5,
samples=200,
legend style={at={(0.95,0.05)},anchor=south east}
]
\newcommand{\Lvar}{1}
\newcommand{\kvar}{1}
\newcommand{\xvar}{0}
\addplot [blue] {\Lvar / (1 + exp(-\kvar*(x-\xvar)))};
\addlegendentry{$L = \Lvar, k=\kvar, x_0=\xvar$};
\end{axis}
\end{tikzpicture}
\end{document}

0
figures/schemes/.gitattributes vendored Normal file → Executable file

BIN
figures/schemes/base_plan.pdf Normal file → Executable file

0
figures/schemes/base_plan.tex Normal file → Executable file

BIN
figures/schemes/coordinates_systems.pdf Normal file → Executable file

0
figures/schemes/coordinates_systems.tex Normal file → Executable file

BIN
figures/schemes/covariance.pdf Normal file → Executable file

0
figures/schemes/covariance.tex Normal file → Executable file

BIN
figures/schemes/ordinary_least_squares.pdf Normal file → Executable file

0
figures/schemes/ordinary_least_squares.png Normal file → Executable file

0
figures/schemes/ordinary_least_squares.svg Normal file → Executable file

0
figures/schemes/ordinary_least_squares.tex Normal file → Executable file

BIN
figures/schemes/orthogonal_projection.pdf Normal file → Executable file

0
figures/schemes/orthogonal_projection.tex Normal file → Executable file

BIN
figures/schemes/regression_plan_3D.pdf Normal file → Executable file

0
figures/schemes/regression_plan_3D.tex Normal file → Executable file

BIN
figures/schemes/vector_orthogonality.pdf Normal file → Executable file

0
figures/schemes/vector_orthogonality.tex Normal file → Executable file

0
glossary.tex Normal file → Executable file

BIN
main.pdf

0
main.tex Normal file → Executable file

3
preamble.tex Normal file → Executable file

@ -3,4 +3,5 @@
\usepackage{standalone} \usepackage{standalone}
\usepackage{tikz-3dplot} \usepackage{tikz-3dplot}
\usepackage{tkz-euclide} \usepackage{tkz-euclide}
\usepackage{nicematrix} \usepackage{nicematrix}
\usepackage{luacode}

0
references.bib Normal file → Executable file

0
scripts/matrix_product.lua Normal file → Executable file