% comparative-genomics-project/docs/presentation/main.tex

% Slide deck built with the beamer document class.
\documentclass{beamer}
% booktabs supplies the \toprule, \midrule and \bottomrule table rules.
\usepackage{booktabs}

% Title-page metadata; it is printed by \titlepage in the first frame.
\title{Duplicate Genes \& Tandemly Arrayed Genes in \textit{Glycine max} (soy)}
\subtitle{First results}
% \and separates the individual authors.
\author{Naïa Périnelle \and Samuel Ortion}
\institute{Université d'Évry Paris-Saclay}
% Fixed date, so the title page does not change with the compilation day.
\date{2024-11-15}

\begin{document}

% Title slide generated from the \title, \subtitle, \author, \institute and \date metadata.
\begin{frame}
    \titlepage
\end{frame}

\begin{frame}{Cultivated soy plant: \textit{Glycine max}}
% Introductory slide: a one-line description followed by two centred photographs.
A species of legume native to East Asia.

% The blank line above ends the text paragraph so that only the images are
% centred. \centering is applied when a paragraph ends, so the paragraph must
% be closed (\par) while the group is still open; otherwise the setting is
% discarded at the closing brace and the images stay left-aligned.
{
\centering
% NOTE(review): no width/height is given here, so the picture is inserted at
% its natural size -- confirm that it fits on the slide.
\includegraphics{media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg}
\vspace{1em}
% \includegraphics[width=0.5\textwidth]{media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png}\footnote{Gepts et al. 2005}
\includegraphics[width=0.25\textwidth]{media/Soybeanvarieties.jpeg}
\par
}

\end{frame}

\begin{frame}
    \frametitle{Soybean industrial interest}
    % Bullet list: one production figure and two uses of the crop.
    \begin{itemize}
        \item 353 million tonnes of soybean produced in 2020
        \item Cattle feed
        \item Human food
    \end{itemize}
\end{frame}

\begin{frame}
    \frametitle{\textit{Glycine max} genome statistics}
    % Genome counts; the gene count is given both with and without supercontigs.
    \begin{itemize}
        \item 20 chromosomes
        \item 55897 protein coding genes (including supercontigs)
        \item 55589 protein coding genes (excluding supercontigs)
        \item 88412 protein isoforms
    \end{itemize}
\end{frame}

\begin{frame}{Datasets filter criteria and pipeline steps}
    % Thresholds defining the two datasets (low and high stringency).
    % The comparison sign is typeset in math mode: this file loads no fontenc
    % package, and under pdfLaTeX's default OT1 text encoding a bare > in text
    % mode is printed as an inverted question mark.
    \begin{tabular}{ccc}
    \toprule
    dataset stringency & low & high \\
    \midrule
    coverage & $>$ 30\% & $>$ 40\% \\
    identity & $>$ 30\% & $>$ 50\% \\
    \bottomrule
    \end{tabular}
    % The step list is absent from the first slide of the frame; from slide 2
    % onwards each \item<n-> is uncovered on slide n and stays visible.
    \only<2->{
    
    Steps:
    \begin{enumerate}
        \item<2-> Keep the longest isoform protein sequence per gene,
        \item<3-> Run BLASTp ``all against all'' on the proteome,
        \item<4-> Remove proteins of supercontigs,
        \item<5-> Filter the HSP\footnote{High-scoring Segment Pair} based on the dataset criteria  (coverage and identity percentages),
        \item<6-> Run Markov Clustering (\texttt{mcl} with default parameters) on the homology graph built with the highest \texttt{bitscore} values per homologous genes pairs,
        \item<7-> Detect \texttt{Tandemly Arrayed Genes} with a Rust program based on gene positions and gene families.
    \end{enumerate}
    }
\end{frame}

\begin{frame}
    \frametitle{Gene families size}
    % Two family-size histograms side by side, one per dataset:
    % coverage 30 / identity 30 first, then coverage 40 / identity 50.
    \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage30_identity30.pdf}
    \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage40_identity50.pdf}
\end{frame}

\begin{frame}{Duplicate genes statistics}
    % Summary counts for the low- and high-stringency datasets.
    % Rule layout follows the booktabs convention used by the other table of
    % this deck: \toprule above the header row, \midrule between header and
    % body, \bottomrule after the last row.
    \begin{tabular}{lcc}
        \toprule
        dataset stringency & low & high \\
        \midrule
        number of duplicate genes & 50254 (89.9\%) & 46769 (83.7\%) \\
        number of families & 8426 & 11997 \\
        number of singletons & 5643 (10.1\%) & 9128 (16.3\%) \\
        number of TAG\textsubscript{0} & 3208 & 2500 \\
        number of TAG\textsubscript{1} & 3481 & 2652 \\
        \bottomrule
    \end{tabular}
\end{frame}

\end{document}
Update project 2025-01-18 15:55:12 +01:00			`\documentclass{beamer}`
			`\usepackage{booktabs}`

			`\title{Duplicate Genes \& Tandemly Arrayed Genes in \textit{Glycine max} (soy)}`
			`\subtitle{First results}`
			`\author{Naïa Périnelle \and Samuel Ortion}`
			`\institute{Université d'Évry Paris-Saclay}`
			`\date{2024-11-15}`

			`\begin{document}`

			`\frame{\titlepage}`

			`\begin{frame}{Cultivated soy plant: \textit{Glycine max}}`
			`A species of legume native to East Asia.`
			`{`
			`\centering`
			`\includegraphics{media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg}`
			`\vspace{1em}`
			`% \includegraphics[width=0.5\textwidth]{media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png}\footnote{Gepts et al. 2005}`
			`\includegraphics[width=0.25\textwidth]{media/Soybeanvarieties.jpeg}`
			`}`


			`\end{frame}`

			`\begin{frame}{Soybean industrial interest}`
			`\begin{itemize}`
			`\item 353 million tonnes of soybean produced in 2020`
			`\item Cattle feed`
			`\item Human food`
			`\end{itemize}`

			`\end{frame}`

			`\begin{frame}{\textit{Glycine max} genome statistics}`
			`\begin{itemize}`
			`\item 20 chromosomes`
			`\item 55897 protein coding genes (including supercontigs)`
			`\item 55589 protein coding genes (excluding supercontigs)`
			`\item 88412 protein isoforms`
			`\end{itemize}`
			`\end{frame}`

			`\begin{frame}{Datasets filter criteria and pipeline steps}`
			`\begin{tabular}{ccc}`
			`\toprule`
			`dataset stringency & low & high \\`
			`\midrule`
			`coverage & > 30\% & > 40\% \\`
			`identity & > 30\% & > 50\% \\`
			`\bottomrule`
			`\end{tabular}`
			`\only<2->{`

			`Steps:`
			`\begin{enumerate}`
			`\item<2-> Keep the longest isoform protein sequence per gene,`
			\item<3-> Run BLASTp ``all against all'' on the proteome,
			`\item<4-> Remove proteins of supercontigs,`
			`\item<5-> Filter the HSP\footnote{High-scoring Segment Pair} based on the dataset criteria (coverage and identity percentages),`
			`\item<6-> Run Markov Clustering (\texttt{mcl} with default parameters) on the homology graph built with the highest \texttt{bitscore} values per homologous genes pairs,`
			`\item<7-> Detect \texttt{Tandemly Arrayed Genes} with a Rust program based on gene positions and gene families.`
			`\end{enumerate}`
			`}`
			`\end{frame}`

			`\begin{frame}{Gene families size}`
			`\includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage30_identity30.pdf}`
			`\includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage40_identity50.pdf}`
			`\end{frame}`

			`\begin{frame}{Duplicate genes statistics}`
			`\begin{tabular}{lcc}`
			`dataset stringency & low & high \\`
			`\toprule`
			`number of duplicate genes & 50254 (89.9\%) & 46769 (83.7\%) \\`
			`number of families & 8426 & 11997 \\`
			`number of singletons & 5643 (10.1\%) & 9128 (16.3\%) \\`
			`number of TAG\textsubscript{0} & 3208 & 2500 \\`
			`number of TAG\textsubscript{1} & 3481 & 2652 \\`
			`\bottomrule`
			`\end{tabular}`
			`\end{frame}`

			`\end{document}`