% comparative-genomics-project/docs/presentation/main.tex
%
% 86 lines
% 2.9 KiB
% TeX
% Raw Normal View History
%
% 2025-01-18 15:55:12 +01:00
% Slide deck built with the beamer class.
\documentclass{beamer}

% booktabs supplies the \toprule, \midrule and \bottomrule table rules.
\usepackage{booktabs}

% What the talk is about.
\title{Duplicate Genes \& Tandemly Arrayed Genes in \textit{Glycine max} (soy)}
\subtitle{First results}
\date{2024-11-15}

% Who gives it.
\author{Naïa Périnelle \and Samuel Ortion}
\institute{Université d'Évry Paris-Saclay}
\begin{document}
% Title slide, filled in from the title metadata declared in the preamble.
\begin{frame}
  \titlepage
\end{frame}
% Introduces the species: one sentence followed by two centred photographs.
\begin{frame}{Cultivated soy plant: \textit{Glycine max}}
  A species of legume native to East Asia.

  % \centering only applies to paragraphs that END inside the group, so each
  % image paragraph is closed with an explicit \par before the brace closes.
  % The blank line above keeps the sentence out of the centred paragraph.
  {
    \centering
    % Bounded in both directions (aspect ratio kept) so that the photograph
    % cannot overflow the slide at its natural size.
    \includegraphics[width=0.5\textwidth,height=0.4\textheight,keepaspectratio]{media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg}
    \par\vspace{1em}
    % \includegraphics[width=0.5\textwidth]{media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png}\footnote{Gepts et al. 2005}
    \includegraphics[width=0.25\textwidth]{media/Soybeanvarieties.jpeg}
    \par
  }
\end{frame}
% Production volume and main uses of soybean.
\begin{frame}
  \frametitle{Soybean industrial interest}
  \begin{itemize}
    \item 353 million tonnes of soybean produced in 2020
    \item Cattle feed
    \item Human food
  \end{itemize}
\end{frame}
% Headline counts for the genome annotation used in the analysis.
\begin{frame}{\textit{Glycine max} genome statistics}
  \begin{itemize}
    \item 20 chromosomes
    % "protein-coding" is a compound modifier and takes a hyphen.
    \item 55897 protein-coding genes (including supercontigs)
    \item 55589 protein-coding genes (excluding supercontigs)
    \item 88412 protein isoforms
  \end{itemize}
\end{frame}
% Thresholds defining the two datasets, followed by the pipeline steps,
% which are revealed one overlay at a time.
\begin{frame}{Dataset filter criteria and pipeline steps}
  % The comparison signs are set in math mode: in text mode with the default
  % OT1 font encoding, a bare > is printed as an inverted question mark.
  % The label column is left-aligned, like the statistics table later on.
  \begin{tabular}{lcc}
    \toprule
    dataset stringency & low & high \\
    \midrule
    coverage & $>$\,30\% & $>$\,40\% \\
    identity & $>$\,30\% & $>$\,50\% \\
    \bottomrule
  \end{tabular}

  % The blank line above ends the table's paragraph, so that "Steps:" starts
  % on its own line instead of being set beside the table.
  % Nothing below is typeset on the first slide.
  \only<2->{
    Steps:
    \begin{enumerate}
      \item<2-> Keep the longest isoform protein sequence per gene,
      \item<3-> Run BLASTp ``all against all'' on the proteome,
      \item<4-> Remove proteins of supercontigs,
      % The footnote carries the same overlay specification as its item, so
      % that its text does not show up before the item itself does.
      \item<5-> Filter the HSP\footnote<5->{High-scoring Segment Pair} based on the dataset criteria (coverage and identity percentages),
      \item<6-> Run Markov Clustering (\texttt{mcl} with default parameters) on the homology graph built with the highest \texttt{bitscore} value per pair of homologous genes,
      % The term is not code, so it is emphasised rather than set in typewriter.
      \item<7-> Detect \emph{Tandemly Arrayed Genes} with a Rust program based on gene positions and gene families.
    \end{enumerate}
  }
\end{frame}
% Two family-size histograms side by side. Going by the file names, the first
% is the coverage 30 / identity 30 dataset and the second the
% coverage 40 / identity 50 dataset.
\begin{frame}{Gene family sizes}
  % 2 x 0.47\textwidth plus the inter-word space fits on a single line.
  \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage30_identity30.pdf}
  \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage40_identity50.pdf}
\end{frame}
% Summary counts for the low- and high-stringency datasets.
\begin{frame}{Duplicate genes statistics}
  % booktabs layout: the header row sits between \toprule and \midrule,
  % the same structure as the filter-criteria table earlier in the deck.
  \begin{tabular}{lcc}
    \toprule
    dataset stringency & low & high \\
    \midrule
    number of duplicate genes & 50254 (89.9\%) & 46769 (83.7\%) \\
    number of families & 8426 & 11997 \\
    number of singletons & 5643 (10.1\%) & 9128 (16.3\%) \\
    number of TAG\textsubscript{0} & 3208 & 2500 \\
    number of TAG\textsubscript{1} & 3481 & 2652 \\
    \bottomrule
  \end{tabular}
\end{frame}
\end{document}