% Beamer slide deck: first results on duplicate genes and tandemly arrayed
% genes (TAG) in the Glycine max (soybean) genome.
\documentclass{beamer}

\usepackage{booktabs} % \toprule / \midrule / \bottomrule used by both tables

\title{Duplicate Genes \& Tandemly Arrayed Genes in \textit{Glycine max} (soy)}
\subtitle{First results}
\author{Naïa Périnelle \and Samuel Ortion}
\institute{Université d'Évry Paris-Saclay}
\date{2024-11-15}

\begin{document}

\frame{\titlepage}

% ---------------------------------------------------------------------------
% Introduction: the species
% ---------------------------------------------------------------------------
\begin{frame}{Cultivated soy plant: \textit{Glycine max}}
  A species of legume native to East Asia.

  {
    \centering
    % NOTE(review): no width/height is given for this photo, so it is included
    % at its natural size and may overflow the slide -- confirm and add a
    % size option if needed.
    \includegraphics{media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg}
    \vspace{1em}
    % Disabled figure (legume family tree) together with its source footnote:
    % \includegraphics[width=0.5\textwidth]{media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png}\footnote{Gepts et al. 2005}
    \includegraphics[width=0.25\textwidth]{media/Soybeanvarieties.jpeg}
    % \centering only applies if the paragraph ends while it is still in
    % effect, i.e. before the group closes.
    \par
  }
\end{frame}

\begin{frame}{Soybean industrial interest}
  \begin{itemize}
    \item 353 million tonnes of soybean produced in 2020
    \item Cattle feed
    \item Human food
  \end{itemize}
\end{frame}

% ---------------------------------------------------------------------------
% Genome overview
% ---------------------------------------------------------------------------
\begin{frame}{\textit{Glycine max} genome statistics}
  \begin{itemize}
    \item 20 chromosomes
    \item 55897 protein coding genes (including supercontigs)
    \item 55589 protein coding genes (excluding supercontigs)
    \item 88412 protein isoforms
  \end{itemize}
\end{frame}

% ---------------------------------------------------------------------------
% Method: filter thresholds of the two datasets and pipeline steps.
% The table is visible from the first overlay; the steps are then uncovered
% one per overlay (slides 2 to 7).
% ---------------------------------------------------------------------------
\begin{frame}{Dataset filter criteria and pipeline steps}
  \begin{tabular}{ccc}
    \toprule
    dataset stringency & low & high \\
    \midrule
    % ">" is set in math mode: in text mode with the default OT1 font
    % encoding it would be printed as an inverted question mark.
    coverage & $>$ 30\% & $>$ 40\% \\
    identity & $>$ 30\% & $>$ 50\% \\
    \bottomrule
  \end{tabular}

  \only<2->{
    Steps:
    \begin{enumerate}
      \item<2-> Keep the longest isoform protein sequence per gene,
      \item<3-> Run BLASTp ``all against all'' on the proteome,
      \item<4-> Remove proteins of supercontigs,
      \item<5-> Filter the HSP\footnote{High-scoring Segment Pair} based on
        the dataset criteria (coverage and identity percentages),
      \item<6-> Run Markov Clustering (\texttt{mcl} with default parameters)
        on the homology graph built with the highest \texttt{bitscore} values
        per homologous gene pair,
      \item<7-> Detect \texttt{Tandemly Arrayed Genes} with a Rust program
        based on gene positions and gene families.
    \end{enumerate}
  }
\end{frame}

% ---------------------------------------------------------------------------
% Results
% ---------------------------------------------------------------------------
\begin{frame}{Gene family sizes}
  % Two histograms side by side; the file names carry the coverage/identity
  % thresholds (30/30 and 40/50) of the low and high stringency datasets.
  \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage30_identity30.pdf}
  \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage40_identity50.pdf}
\end{frame}

\begin{frame}{Duplicate gene statistics}
  % Same rule layout as the criteria table: header row between \toprule and
  % \midrule, body closed by \bottomrule.
  \begin{tabular}{lcc}
    \toprule
    dataset stringency & low & high \\
    \midrule
    number of duplicate genes & 50254 (89.9\%) & 46769 (83.7\%) \\
    number of families & 8426 & 11997 \\
    number of singletons & 5643 (10.1\%) & 9128 (16.3\%) \\
    number of TAG\textsubscript{0} & 3208 & 2500 \\
    number of TAG\textsubscript{1} & 3481 & 2652 \\
    \bottomrule
  \end{tabular}
\end{frame}

\end{document}