feat: Add notes on generalized linear models

This commit is contained in:
Samuel Ortion 2023-11-10 13:39:13 +01:00
parent 29dad16dfb
commit c552aa24f4
41 changed files with 300 additions and 45 deletions

0
.gitattributes vendored Normal file → Executable file

0
Makefile Normal file → Executable file

@ -11,7 +11,7 @@
} }
} }
\includechapters{part1}{2} \includechapters{part1}{1}
\includechapters{part2}{2} \includechapters{part2}{2}

0
content/chapters/part1/0.tex Normal file → Executable file

106
content/chapters/part1/1.tex Normal file → Executable file

@ -60,12 +60,6 @@ In order to estimate the parameters, we can use penalties (additional terms).
Lasso regression, Elastic Net, etc. Lasso regression, Elastic Net, etc.
\subsection{Statistical Analysis Workflow}
\begin{enumerate}[label={\bfseries\color{primary}Step \arabic*.}]
\item Graphical representation;
\item ...
\end{enumerate}
\[ \[
Y = X \beta + \varepsilon, Y = X \beta + \varepsilon,
\] \]
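A minimal R sketch of fitting this model by ordinary least squares (the data frame \texttt{df} and the columns \texttt{y}, \texttt{x1}, \texttt{x2} are hypothetical placeholders); the penalised variants mentioned above (Lasso, Elastic Net) are available through e.g. the \texttt{glmnet} package.
\begin{verbatim}
# Ordinary least squares fit of Y = X beta + epsilon
fit <- lm(y ~ x1 + x2, data = df)
summary(fit)      # estimates, standard errors, t-tests
coef(fit)         # hat(beta)
residuals(fit)    # hat(epsilon)
\end{verbatim}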
@ -145,21 +139,18 @@ $\Y - \X \hat{\beta} \perp \X \hat{\beta} - \Y \One$ if $\One \in V$, so
\begin{figure} \begin{figure}
\centering \centering
\includestandalone{figures/schemes/orthogonal_projection} \includegraphics{figures/schemes/orthogonal_projection.pdf}
\caption{Orthogonal projection of $\Y$ onto the plane spanned by the basis described by $\X$. $\color{blue}a$ corresponds to $\norm{\X\hat{\beta} - \bar{\Y}}^2$, $\color{blue}b$ corresponds to $\hat{\varepsilon} = \norm{\Y - \X\hat{\beta}}^2$ and $\color{blue}c$ corresponds to $\norm{\Y - \bar{\Y}}^2$.}
\label{fig:scheme-orthogonal-projection} \label{fig:scheme-orthogonal-projection}
\end{figure} \end{figure}
\begin{figure} \begin{figure}
\centering \centering
\includestandalone{figures/schemes/ordinary_least_squares} \includegraphics{figures/schemes/ordinary_least_squares.pdf}
\caption{Ordinary least squares and regression line with simulated data.} \caption{Ordinary least squares and regression line with simulated data.}
\label{fig:ordinary-least-squares} \label{fig:ordinary-least-squares}
\end{figure} \end{figure}
\begin{definition}[Model dimension] \begin{definition}[Model dimension]
Let $\M$ be a model. Let $\M$ be a model.
The dimension of $\M$ is the dimension of the subspace generated by $\X$, that is the number of parameters in the $\beta$ vector. The dimension of $\M$ is the dimension of the subspace generated by $\X$, that is the number of parameters in the $\beta$ vector.
@ -169,22 +160,21 @@ $\Y - \X \hat{\beta} \perp \X \hat{\beta} - \Y \One$ if $\One \in V$, so
\section{Gaussian vectors} \section{Gaussian vectors}
\begin{definition}[Normal distribution] \begin{definition}[Normal distribution]
$X \sim \Norm(\mu, \sigma^{2})$, with density function $f$
\[
f(x) = \frac{1}{\sigma \sqrt{2\pi}}e^{-\frac{1}{2}(\frac{x-\mu}{\sigma})^{2}}
\]
\end{definition} \end{definition}
\begin{definition}[Gaussian vector] \begin{definition}[Gaussian vector]
A random vector $\Y \in \RR[n]$ is a Gaussian vector if every linear combination of its components is a Gaussian random variable.
\end{definition} \end{definition}
\begin{property} \begin{property}
$m = \EE(Y) = (m_1, \ldots, m_n)^T$, where $m_i = \EE(Y_i)$ $m = \EE(Y) = (m_1, \ldots, m_n)^T$, where $m_i = \EE(Y_i)$
...
\[ \[
\Y \sim \Norm_n(m, \Sigma) \Y \sim \Norm_n(m, \Sigma)
\] \]
@ -193,8 +183,6 @@ $\Y - \X \hat{\beta} \perp \X \hat{\beta} - \Y \One$ if $\One \in V$, so
\Sigma = \E\left[(\Y -m)(\Y - m)^T\right]. \Sigma = \E\left[(\Y -m)(\Y - m)^T\right].
\] \]
\end{property} \end{property}
\begin{remark} \begin{remark}
@ -261,24 +249,25 @@ Covariance is really sensitive to scale of variables. For instance, if we measur
\[ \[
\RR[n] = V_1 \overset{\perp}{\oplus} V_2. \RR[n] = V_1 \overset{\perp}{\oplus} V_2.
\] \]
\item If $Z_1, Z_2$ are the orthogonal projections of $\mathbf{Z}$ on $V_1$ and $V_2$, i.e. $Z_1 = \Pi_{V_1}(\mathbf{Z}) = \Pi_1 \Z$ and $Z_2 = \Pi_{V_2} (\mathbf{Z}) = \Pi_2 \Z$ ($\Pi_{1}$ and $\Pi_{2}$ being the projection matrices), then:
\item $Z_{1}$ and $Z_{2}$ are independent Gaussian vectors, $Z_{1} \sim \Norm_{n} (0_{n}, \Pi_{1})$ and $Z_{2} \sim \Norm_{n}(0_{n}, \Pi_{2})$.
In particular $\norm{Z_{1}}^{2} \sim \chi^{2}(n_{1})$ and $\norm{Z_{2}}^{2} \sim \chi^{2}(n_{2})$, where $n_{1} = \dim V_{1}$ and $n_{2} = \dim V_{2}$.
\end{itemize} \end{itemize}
$Z_1 = \Pi_{V_1}(\Z)$ is the projection of $\Z$ on the subspace $V_1$.
\dots \dots
\end{theorem} \end{theorem}
\begin{property}[Estimators properties in the linear model] \begin{property}[Estimators properties in the linear model]
According to \autoref{thm:cochran}, According to \autoref{thm:cochran},
\[ \[
\hat{m} \text{ is independent from $\hat{\sigma}^2$} \hat{m} \text{ is independent from $\hat{\sigma}^2$}
\]\dots \]
\[ \[
\frac{\norm{\Y - \Pi_V(\Y)}^2}{...} \sim \norm{\Y - \Pi_V(\Y)}^2 = \norm{\varepsilon - \Pi_{V}(\varepsilon)}^{2} = \norm{\Pi_{V}^{\perp} (\varepsilon)}^{2}
\] \]
$\hat{m} = \X \hat{\beta}$ $\hat{m} = \X \hat{\beta}$
@ -303,40 +292,34 @@ Covariance is really sensitive to scale of variables. For instance, if we measur
\begin{align*} \begin{align*}
\hat{m} &= \X \hat{\beta} = \X(\X^T\X)^{-1} \X^T \Y \\ \hat{m} &= \X \hat{\beta} = \X(\X^T\X)^{-1} \X^T \Y \\
\intertext{so}
&= \Pi_V \Y &= \Pi_V \Y
\end{align*} \end{align*}
According to Cochran's theorem, we can deduce that the estimator of the predicted value, $\hat{m}$, is independent of $\hat{\sigma}^2$.
All the sums of squares follow a $\chi^2$ distribution.
\[
...
\]
\begin{property}
\end{property} \subsection{Estimators properties}
\subsection{Estimators consistency}
If $q < n$,
\begin{itemize} \begin{itemize}
\item $\hat{m}$ is an unbiased estimator of $m$;
\item $\EE(\hat{\sigma}^{2}) = \sigma^{2}(n-q)/n$, so $\hat{\sigma}^{2}$ is a biased estimator of $\sigma^{2}$;
\[
S^{2} = \frac{1}{n-q} \norm{\Y - \Pi_{V}(\Y)}^{2}
\]
is an unbiased estimator of $\sigma^{2}$.
\end{itemize} \end{itemize}
We can derive statistical tests from these properties.
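A short R sketch (with simulated data) checking that the closed-form estimator $\hat{\beta} = (\X^T\X)^{-1}\X^T\Y$ and the unbiased variance estimator $S^2$ match what \texttt{lm()} reports; all names are illustrative.
\begin{verbatim}
set.seed(1)
n <- 100; q <- 3                          # q = dimension of the model
X <- cbind(1, rnorm(n), rnorm(n))         # design matrix with intercept
beta <- c(1, 2, -0.5)
y <- drop(X %*% beta) + rnorm(n, sd = 1.5)

beta_hat <- solve(t(X) %*% X, t(X) %*% y) # (X'X)^{-1} X'y
m_hat    <- drop(X %*% beta_hat)          # fitted values = Pi_V(y)
S2       <- sum((y - m_hat)^2) / (n - q)  # unbiased estimator of sigma^2

fit <- lm(y ~ X - 1)                      # the same fit via lm()
c(S2, summary(fit)$sigma^2)               # the two estimates agree
\end{verbatim}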
\section{Statistical tests} \section{Statistical tests}
\subsection{Student $t$-test} \subsection{Student $t$-test}
\[ \[
\frac{\hat{\theta}-\theta}{\sqrt{\frac{\widehat{\VVar}(\hat{\theta})}{n}}} \underset{H_0}{\sim} t \frac{\hat{\theta}-\theta}{\sqrt{\frac{\widehat{\VVar}(\hat{\theta})}{n}}} \underset{H_0}{\sim} t_{n-q}
\] \]
where where
@ -627,3 +610,44 @@ Different methods:
Usually $AIC$ has a smaller penalty than $BIC$; thus the $AIC$ criterion tends to select models with more variables than the $BIC$ criterion.
\subsection{Stepwise}
\begin{description}
\item[forward] Add predictors iteratively, beginning with the most contributing ones.
\item[backward] Remove predictors iteratively.
\item[stepwise] Combination of forward and backward selection. We start with no predictors and add them one at a time; before adding a new predictor, we check whether all previously added predictors remain meaningful.
\end{description}
The problem with this iterative procedure is that we perform a test at each step, so the confidence level should be adjusted for multiple testing.
In practice, the multiple testing problem is not taken into account in these approaches.
We can use information criteria or model comparison in these methods.
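A minimal R sketch of these selection strategies with the built-in \texttt{step()} function, which is driven by an information criterion rather than by repeated tests (the model \texttt{full} and the data frame \texttt{df} are hypothetical):
\begin{verbatim}
full <- lm(y ~ x1 + x2 + x3, data = df)   # largest candidate model
null <- lm(y ~ 1, data = df)              # intercept-only model

fwd  <- step(null, scope = formula(full), direction = "forward")
bwd  <- step(full, direction = "backward")
both <- step(null, scope = formula(full), direction = "both")

# k = log(n) replaces the AIC penalty by the BIC penalty
bwd_bic <- step(full, direction = "backward", k = log(nrow(df)))
\end{verbatim}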
\section{Predictions}
Let $X_i$ be the $i$-th row of the matrix $\X$. The observed value $Y_i$ can be estimated by: Let $X_i$ be the $i$-th row of the matrix $\X$. The observed value $Y_i$ can be estimated by:
\[
\hat{Y}_i = (\X \hat{\beta})_i = X_i \hat{\beta}
\]
\begin{align*}
\EE (\hat{Y}_i) &= (\X \beta)_i = X_i \beta \\
\sigma^{-1} (\hat{\beta} - \beta) &\sim \Norm (0_{p+1}, (\X^T \X)^{-1}), \qquad \text{and} \\
\Var(\hat{Y}_i) = ... \\
S^2 = \norm{...}
\end{align*}
\paragraph{Prediction Confidence Interval}
We can build a confidence interval for the predicted values $(\X \hat{\beta})_i$
\dots
\paragraph{Prediction error of $Y$}
\paragraph{Prediction interval for a new observation $Y_{n+1}$}
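A hedged R sketch of both intervals with \texttt{predict()} (model and data names are illustrative): the confidence interval targets the mean response $X_i \beta$, while the prediction interval accounts for the extra noise of a new observation $Y_{n+1}$ and is therefore wider.
\begin{verbatim}
fit <- lm(y ~ x1 + x2, data = df)
new_obs <- data.frame(x1 = 0.5, x2 = -1)

# confidence interval for the mean response at new_obs
predict(fit, newdata = new_obs, interval = "confidence", level = 0.95)

# prediction interval for a new observation Y_{n+1}
predict(fit, newdata = new_obs, interval = "prediction", level = 0.95)
\end{verbatim}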

182
content/chapters/part1/2.tex Normal file → Executable file

@ -1,4 +1,186 @@
\chapter{Generalized Linear Model} \chapter{Generalized Linear Model}
\begin{example}
\begin{description}
\item[Ex. 1 - Credit Card Default]
Let $Y_i$ be a boolean random variable following a Bernoulli distribution.
\item[Ex. 2 - Horseshoe Crabs]
Let $Y_i$ be the number of satellite males.
$Y_i$ can be described as following a Poisson distribution.
\end{description}
\end{example}
\begin{remark}
A Poisson distribution can be viewed as an approximation of the binomial distribution when $n$ is large and $p$ is small.
\end{remark}
We will consider the following relation:
\[
\EE(Y_i) = g^{-1}(X_i \beta),
\]
equivalently:
\[
g(\EE(Y_i)) = X_i \beta.
\]
\begin{itemize}
\item $\beta$ is estimated by the maximum likelihood;
\item $g$ is called the link function.
\end{itemize}
\begin{remark}
In the standard linear model, the OLS estimator coincides with the maximum likelihood estimator.
\end{remark}
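An R sketch of the two motivating examples with \texttt{glm()}, which performs the maximum likelihood estimation of $\beta$ for a chosen family and link function (the data frames \texttt{default\_df} and \texttt{crabs\_df} and their columns are hypothetical):
\begin{verbatim}
# Ex. 1: Bernoulli response, logit link (logistic regression)
fit_logit <- glm(default ~ student + balance + income,
                 family = binomial(link = "logit"), data = default_df)

# Ex. 2: count response, log link (Poisson regression)
fit_pois <- glm(satellites ~ width,
                family = poisson(link = "log"), data = crabs_df)

summary(fit_logit)   # maximum likelihood estimates and Wald tests
\end{verbatim}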
\section{Logistic Regression} \section{Logistic Regression}
\begin{align*}
& \log\left(\frac{\Pi}{1 - \Pi}\right) = \X \beta \\
\Leftrightarrow \; & e^{\ln \frac{\Pi}{1 - \Pi}} = e^{\X \beta} \\
\Leftrightarrow \; & \frac{\Pi}{1 - \Pi} = e^{\X \beta} \\
\Leftrightarrow \; & \Pi = (1 - \Pi) e^{\X\beta} \\
\Leftrightarrow \; & \Pi = e^{\X \beta} - \Pi e^{\X\beta} \\
\Leftrightarrow \; & \Pi + \Pi e^{\X\beta} = e^{\X \beta} \\
\Leftrightarrow \; & \Pi (1 + e^{\X\beta}) = e^{\X \beta} \\
\Leftrightarrow \; & \Pi = \frac{e^{\X\beta}}{1 + e^{\X \beta}}
\end{align*}
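A quick numerical check of the last identity in R: the built-in \texttt{plogis()} is exactly the map $x \mapsto e^{x}/(1 + e^{x})$, so both expressions agree.
\begin{verbatim}
xb <- seq(-3, 3, by = 0.5)                        # a few values of X beta
max(abs(plogis(xb) - exp(xb) / (1 + exp(xb))))    # numerically zero
\end{verbatim}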
\section{Maximum Likelihood estimator}
Log-likelihood: the logarithm of the probability of observing the data that we actually observed, seen as a function of the parameters.
Estimate $\beta$ by $\hat{\beta}$ such that $\forall \beta \in \RR[p+1]$:
\[
L_n (\hat{\beta}) \geq L_n (\beta)
\]
These estimators are consistent, but not necessarily unbiased.
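For the logistic case, assuming independent observations $Y_i \in \{0, 1\}$ with $\Prob(Y_i = 1 \: | \: X_i) = e^{X_i \beta}/(1 + e^{X_i \beta})$, the log-likelihood to be maximised is
\[
L_n(\beta) = \sum_{i=1}^{n} \left[ Y_i X_i \beta - \log\left(1 + e^{X_i \beta}\right) \right].
\]
There is no closed-form maximiser; $\hat{\beta}$ is computed numerically (e.g. by Newton--Raphson / iteratively reweighted least squares).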
\section{Test for each single coordinate}
\begin{example}[Payment Default]
Let $Y_i$ be the default value for individual $i$.
\[
\log (\frac{\Pi (X)}{1 - \Pi (X)}) = \beta_0 + \beta_1 \text{student} + \beta_2 \text{balance} + \beta_3 \text{income}
\]
In this example, only $\beta_0$ and $\beta_2$ are significantly different from 0.
\end{example}
\begin{remark}
We do not add $\varepsilon_i$, because $\log(\frac{\Pi (X)}{1 - \Pi (X)})$ corresponds to the expectation.
\end{remark}
\subsection{Comparison of nested models}
To test $H_0:\: \beta_1 = \ldots = \beta_p = 0$, we use the likelihood ratio test:
\[
T_n = -2 \log (\mathcal{L}^{\texttt{null}}) + 2 \log (\mathcal{L}(\hat{\beta})) \underset{H_0}{\overunderset{\mathcal{L}}{n \to \infty}{\longrightarrow}} \chi^2(p).
\]
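A sketch of this likelihood ratio test in R, comparing the null (intercept-only) model with the full model (names as in the earlier hypothetical example):
\begin{verbatim}
fit_null <- glm(default ~ 1, family = binomial, data = default_df)
fit_full <- glm(default ~ student + balance + income,
                family = binomial, data = default_df)

# deviance difference = T_n, compared to a chi-squared with p df
anova(fit_null, fit_full, test = "Chisq")
\end{verbatim}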
\begin{remark}[Family of Tests]
\begin{itemize}
\item Comparison of estimated values and values under the null hypothesis;
\item Likelihood ratio test;
\item Based on the slope on the derivative.
\end{itemize}
\end{remark}
\section{Relative risk}
$RR(j)$ is the probability of having the disease conditional on the predictor $X_{i_1}$, over the probability of having the disease conditional on the predictor $X_{i_2}$.
\[
RR(j) = \frac{\Prob(Y_{i_1} = 1 \: | \: X_{i_1})}{\Prob(Y_{i_2} = 1 \: | \: X_{i_2})} = \frac{\EE(Y_{i_1})}{\EE(Y_{i_2})}.
\]
$\pi(X_i)$ is the probability of having the disease, according to $X_i$.
The relative risk can be written as\dots
\section{Odds}
Quantity providing a measure of the likelihood of a particular outcome:
\[
\text{odds}(X_i) = \frac{\pi(X_i)}{1 - \pi(X_i)},
\]
which, under the logistic model, gives
\[
\text{odds}(X_i) = \exp(X_i \beta).
\]
The odds are the ratio of the probability of having the disease (if $Y$ represents the disease status) to the probability of not having it.
\section{Odds Ratio}
\begin{align*}
OR(j) = \frac{\text{odds}(X_{i_1})}{\text{odds}(X_{i_2})} & = \frac{\frac{\pi(X_{i_1})}{1 - \pi(X_{i_1})}}{\frac{\pi(X_{i_2})}{1 - \pi(X_{i_2})}}
\end{align*}
The OR can be written as:
\[
OR(j) = \exp(\beta_j)
\]
\begin{exercise}
Show that $OR(j) = \exp(\beta_j)$.
\end{exercise}
\begin{align*}
OR(j) & = \frac{\text{odds}(X_{i_1})}{\text{odds}(X_{i_2})} \\
& = \frac{\exp(X_{i_1} \beta)}{\exp(X_{i_2} \beta)}
\end{align*}
\[
\log \left(
\frac{\Prob(Y=1 \: |\: X_{i_1})}{1 - \Prob(Y=1 \: |\: X_{i_1})}\right)
= \beta_0 + \beta_1 X_1^{(1)} + \beta_2 X_2^{(1)} + \ldots + \beta_p X_p^{(1)}
\]
Similarly
\[
\log \left(
\frac{\Prob(Y=1 \: |\: X_{i_2})}{1 - \Prob(Y=1 \: |\: X_{i_2})}\right)
= \beta_0 + \beta_1 X_1^{(2)} + \beta_2 X_2^{(2)} + \ldots + \beta_p X_p^{(2)}
\]
We subtract the two equations, assuming the two individuals are identical except for the $j$-th predictor, with $X_j^{(1)} - X_j^{(2)} = 1$:
\begin{align*}
&\log \left(
\frac{\Prob(Y=1 \: |\: X_{i_1})}{1 - \Prob(Y=1 \: |\: X_{i_1})} \right) - \log \left(\frac{\Prob(Y=1 \: |\: X_{i_2})}{1 - \Prob(Y=1 \: |\: X_{i_2})}\right) \\
& = \beta_0 + \beta_1 X_1^{(1)} + \beta_2 X_2^{(1)} + \ldots + \beta_p X_p^{(1)} - \left(\beta_0 + \beta_1 X_1^{(2)} + \beta_2 X_2^{(2)} + \ldots + \beta_p X_p^{(2)}\right) \\
& = \log OR(j) \\
& = \cancel{(\beta_0 - \beta_0)} + \beta_1 \cancel{(X_1^{(1)} - X_1^{(2)})} + \beta_2 \cancel{(X_2^{(1)} - X_2^{(2)})} + \ldots + \beta_j \cancelto{1}{(X_j^{(1)} - X_j^{(2)})} + \ldots + \beta_p \cancel{(X_p^{(1)} - X_p^{(2)})} \\
&\Leftrightarrow \log (OR_j) = \beta_j \\
&\Leftrightarrow OR(j) = \exp(\beta_j)
\end{align*}
OR is not equal to RR in general; the two are close only when the outcome probability is small (rare disease).
If $OR(j)$ is significantly different from 1, then $\exp(\beta_j)$ is significantly different from 1, thus $\beta_j$ is significantly different from 0.
If a predictor has more than two classes, a difference such as $X_{i_1} - X_{i_2}$ is no longer meaningful on its own: we have to choose a reference class and compare each of the other classes with this reference class.
$\hat{\pi}(X_{+}) = \widehat{\Prob}(Y = 1 \: | \: X_{+})$ is the estimated probability for a new individual with covariates $X_{+}$.
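In R (continuing the hypothetical \texttt{fit\_logit} example), the estimated odds ratios $\exp(\hat{\beta}_j)$ and the estimated probability for a new individual can be obtained as follows:
\begin{verbatim}
# odds ratios with Wald confidence intervals
exp(cbind(OR = coef(fit_logit), confint.default(fit_logit)))

# estimated probability pi(X_+) for a new individual
new_ind <- data.frame(student = "Yes", balance = 1500, income = 40000)
predict(fit_logit, newdata = new_ind, type = "response")
\end{verbatim}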
\section{Poisson model}
Let $Y_{i} \sim \mathcal{P}(\lambda_{i})$, corresponding to a count.
\begin{align*}
\EE(Y_{i}) & = g^{-1}(X_{i} \beta) \\
\Leftrightarrow g(\EE(Y_{i})) = X_{i} \beta
\end{align*}
where $g(x) = \ln(x)$, and $g^{-1}(x) = e^{x}$.
\[
\lambda_{i} = \EE(Y_{i}) = \Var(Y_{i})
\]
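An R sketch of the Poisson model for the (hypothetical) horseshoe crab data; with the log link, $\exp(\hat{\beta}_j)$ acts multiplicatively on $\lambda_i = \EE(Y_i)$:
\begin{verbatim}
fit_pois <- glm(satellites ~ width + weight,
                family = poisson(link = "log"), data = crabs_df)
summary(fit_pois)
exp(coef(fit_pois))   # multiplicative effects on the expected count

# the model assumes E(Y_i) = Var(Y_i); strong overdispersion would
# suggest e.g. a quasi-Poisson or negative binomial model instead
\end{verbatim}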

25
content/chapters/part1/3.tex Executable file

@ -0,0 +1,25 @@
\chapter{Tests Reminders}
\section{$\chi^2$ test of independence}
\section{$\chi^2$ test of goodness of fit}
Check whether the observations are consistent with a particular distribution.
\begin{example}[Mendel experiments]
Let $AB$, $Ab$, $aB$, $ab$ be the four possible genotypes of peas, combining color and grain shape.
\begin{tabular}{cccc}
\toprule
AB & Ab & aB & ab \\
\midrule
315 & 108 & 101 & 32 \\
\bottomrule
\end{tabular}
\end{example}
The test statistic is:
\[
D_{k,n} = \sum_{i=1}^{k} \frac{(N_i - np_i)^2}{np_i} \underset{H_0}{\overunderset{\mathcal{L}}{n \to \infty}{\longrightarrow}} \chi^2(k-1)
\]
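The Mendel counts above can be checked against the classical 9:3:3:1 expected ratio with the built-in goodness-of-fit test in R:
\begin{verbatim}
observed <- c(AB = 315, Ab = 108, aB = 101, ab = 32)
chisq.test(observed, p = c(9, 3, 3, 1) / 16)
# X-squared is about 0.47 on k - 1 = 3 degrees of freedom:
# no evidence against the 9:3:3:1 hypothesis
\end{verbatim}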

0
content/chapters/part2/0.tex Normal file → Executable file

4
content/chapters/part2/1.tex Normal file → Executable file

@ -68,7 +68,7 @@ Let $u = \begin{pmatrix}
\begin{figure} \begin{figure}
\centering \centering
\includestandalone{figures/schemes/vector_orthogonality} \includegraphics{figures/schemes/vector_orthogonality.pdf}
\caption{Scalar product of two orthogonal vectors.} \caption{Scalar product of two orthogonal vectors.}
\label{fig:scheme-orthogonal-scalar-product} \label{fig:scheme-orthogonal-scalar-product}
\end{figure} \end{figure}
@ -215,6 +215,6 @@ The number of columns has to be the same as the dimension of the vector to which
\begin{figure} \begin{figure}
\centering \centering
\includestandalone{figures/schemes/coordinates_systems} \includegraphics{figures/schemes/coordinates_systems.pdf}
\caption{Coordinate systems} \caption{Coordinate systems}
\end{figure} \end{figure}

0
content/conclusion.tex Normal file → Executable file

0
content/introduction.tex Normal file → Executable file

0
definitions.tex Normal file → Executable file

0
figures/plots/linear_regression.R Normal file → Executable file

BIN
figures/plots/linear_regression_linear.pdf Normal file → Executable file

BIN
figures/plots/linear_regression_non_linear.pdf Normal file → Executable file

BIN
figures/plots/logistic_curve.pdf Executable file


@ -0,0 +1,23 @@
\documentclass[margin=0.5cm]{standalone}
\usepackage{tikz}
\usepackage{pgfplots}
\pgfplotsset{compat=1.18}
\begin{document}
\begin{tikzpicture}
\begin{axis}[
title={Logistic function},
xlabel={$x$},
ylabel={$y$},
domain=-5:5,
samples=200,
legend style={at={(0.95,0.05)},anchor=south east}
]
\newcommand{\Lvar}{1}
\newcommand{\kvar}{1}
\newcommand{\xvar}{0}
\addplot [blue] {\Lvar / (1 + exp(-\kvar*(x-\xvar)))};
\addlegendentry{$L = \Lvar, k=\kvar, x_0=\xvar$};
\end{axis}
\end{tikzpicture}
\end{document}

0
figures/schemes/.gitattributes vendored Normal file → Executable file

BIN
figures/schemes/base_plan.pdf Normal file → Executable file

0
figures/schemes/base_plan.tex Normal file → Executable file

BIN
figures/schemes/coordinates_systems.pdf Normal file → Executable file

0
figures/schemes/coordinates_systems.tex Normal file → Executable file

BIN
figures/schemes/covariance.pdf Normal file → Executable file

0
figures/schemes/covariance.tex Normal file → Executable file

BIN
figures/schemes/ordinary_least_squares.pdf Normal file → Executable file

0
figures/schemes/ordinary_least_squares.png Normal file → Executable file

0
figures/schemes/ordinary_least_squares.svg Normal file → Executable file

0
figures/schemes/ordinary_least_squares.tex Normal file → Executable file

BIN
figures/schemes/orthogonal_projection.pdf Normal file → Executable file

0
figures/schemes/orthogonal_projection.tex Normal file → Executable file

BIN
figures/schemes/regression_plan_3D.pdf Normal file → Executable file

0
figures/schemes/regression_plan_3D.tex Normal file → Executable file

BIN
figures/schemes/vector_orthogonality.pdf Normal file → Executable file

0
figures/schemes/vector_orthogonality.tex Normal file → Executable file

0
glossary.tex Normal file → Executable file

BIN
main.pdf

0
main.tex Normal file → Executable file

3
preamble.tex Normal file → Executable file

@ -3,4 +3,5 @@
\usepackage{standalone} \usepackage{standalone}
\usepackage{tikz-3dplot} \usepackage{tikz-3dplot}
\usepackage{tkz-euclide} \usepackage{tkz-euclide}
\usepackage{nicematrix} \usepackage{nicematrix}
\usepackage{luacode}

0
references.bib Normal file → Executable file

0
scripts/matrix_product.lua Normal file → Executable file