% Beamer slide deck: first results on duplicate genes and tandemly arrayed
% genes in Glycine max.
\documentclass{beamer}

% booktabs provides the \toprule / \midrule / \bottomrule table rules.
\usepackage{booktabs}

% Title-page metadata.
\title{Duplicate Genes \& Tandemly Arrayed Genes in \textit{Glycine max} (soy)}
\subtitle{First results}
\author{Naïa Périnelle \and Samuel Ortion}
\institute{Université d'Évry Paris-Saclay}
\date{2024-11-15}
\begin{document}

% Title slide: \titlepage typesets the title, subtitle, author, institute
% and date declared in the preamble.
\begin{frame}
  \titlepage
\end{frame}
% Slide: one-line description of the species followed by two pictures.
\begin{frame}{Cultivated soy plant: \textit{Glycine max}}
  A species of legume native to East Asia.

  % The blank line above keeps the sentence out of the centred paragraph.
  {
    \centering
    % Explicit width (same as the second picture) so the photograph is scaled
    % to the slide instead of being included at its natural size.
    \includegraphics[width=0.25\textwidth]{media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg}
    \vspace{1em}
    % \includegraphics[width=0.5\textwidth]{media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png}\footnote{Gepts et al. 2005}
    \includegraphics[width=0.25\textwidth]{media/Soybeanvarieties.jpeg}
    % \centering only applies if the paragraph ends while it is still in
    % force, so end the paragraph before the group closes.
    \par
  }
\end{frame}
% Slide: production figure and main uses of soybean.
\begin{frame}{Soybean industrial interest}
  \begin{itemize}
    \item 353 million tonnes of soybean produced in 2020
    \item Cattle feed
    \item Human food
  \end{itemize}
\end{frame}
% Slide: headline counts for the Glycine max genome annotation.
\begin{frame}{\textit{Glycine max} genome statistics}
  \begin{itemize}
    \item 20 chromosomes
    \item 55897 protein-coding genes (including supercontigs)
    \item 55589 protein-coding genes (excluding supercontigs)
    \item 88412 protein isoforms
  \end{itemize}
\end{frame}
% Slide: thresholds that define the low- and high-stringency datasets
% (slide 1), then the pipeline steps uncovered one per overlay (slides 2-7).
\begin{frame}{Datasets filter criteria and pipeline steps}
  \begin{tabular}{ccc}
    \toprule
    dataset stringency & low & high \\
    \midrule
    % ">" is set in math mode: in text mode under pdfLaTeX with the default
    % OT1 font encoding it is printed as an inverted question mark.
    coverage & $>$ 30\% & $>$ 40\% \\
    identity & $>$ 30\% & $>$ 50\% \\
    \bottomrule
  \end{tabular}
  \only<2->{

    Steps:
    \begin{enumerate}
      \item<2-> Keep the longest isoform protein sequence per gene,
      \item<3-> Run BLASTp ``all against all'' on the proteome,
      \item<4-> Remove proteins of supercontigs,
      % The footnote carries the same overlay specification as its item, so
      % its text does not appear on slides where the item is still hidden.
      \item<5-> Filter the HSP\footnote<5->{High-scoring Segment Pair} based on the dataset criteria (coverage and identity percentages),
      \item<6-> Run Markov Clustering (\texttt{mcl} with default parameters) on the homology graph built with the highest \texttt{bitscore} value per homologous gene pair,
      % \emph rather than \texttt: this is a term, not a program or field name.
      \item<7-> Detect \emph{Tandemly Arrayed Genes} with a Rust program based on gene positions and gene families.
    \end{enumerate}
  }
\end{frame}
% Slide: two family-size histograms side by side; the file names carry the
% coverage/identity thresholds of each dataset (30/30 and 40/50).
\begin{frame}{Gene families size}
  \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage30_identity30.pdf}
  \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage40_identity50.pdf}
\end{frame}
% Slide: summary counts for the low- and high-stringency datasets.
\begin{frame}{Duplicate genes statistics}
  \begin{tabular}{lcc}
    % booktabs layout: the header row sits between \toprule and \midrule,
    % as in the filter-criteria table.
    \toprule
    dataset stringency & low & high \\
    \midrule
    number of duplicate genes & 50254 (89.9\%) & 46769 (83.7\%) \\
    number of families & 8426 & 11997 \\
    number of singletons & 5643 (10.1\%) & 9128 (16.3\%) \\
    number of TAG\textsubscript{0} & 3208 & 2500 \\
    number of TAG\textsubscript{1} & 3481 & 2652 \\
    \bottomrule
  \end{tabular}
\end{frame}
\end{document}