Update project
This commit is contained in:
parent
33e898da2e
commit
5ad94110d7
|
@ -0,0 +1,86 @@
|
||||||
|
\documentclass{beamer}
|
||||||
|
\usepackage{booktabs}
|
||||||
|
|
||||||
|
\title{Duplicate Genes \& Tandemly Arrayed Genes in \textit{Glycine max} (soy)}
|
||||||
|
\subtitle{First results}
|
||||||
|
\author{Naïa Périnelle \and Samuel Ortion}
|
||||||
|
\institute{Université d'Évry Paris-Saclay}
|
||||||
|
\date{2024-11-15}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
|
||||||
|
\frame{\titlepage}
|
||||||
|
|
||||||
|
\begin{frame}{Cultivated soy plant: \textit{Glycine max}}
|
||||||
|
A species of legume native to East Asia.
|
||||||
|
{
|
||||||
|
\centering
|
||||||
|
\includegraphics{media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg}
|
||||||
|
\vspace{1em}
|
||||||
|
% \includegraphics[width=0.5\textwidth]{media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png}\footnote{Gepts et al. 2005}
|
||||||
|
\includegraphics[width=0.25\textwidth]{media/Soybeanvarieties.jpeg}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}{Soybean industrial interest}
|
||||||
|
\begin{itemize}
|
||||||
|
\item 353 million tonnes of soybean produced in 2020
|
||||||
|
\item Cattle feed
|
||||||
|
\item Human food
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}{\textit{Glycine max} genome statistics}
|
||||||
|
\begin{itemize}
|
||||||
|
\item 20 chromosomes
|
||||||
|
\item 55897 protein coding genes (including supercontigs)
|
||||||
|
\item 55589 protein coding genes (excluding supercontigs)
|
||||||
|
\item 88412 protein isoforms
|
||||||
|
\end{itemize}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}{Datasets filter criteria and pipeline steps}
|
||||||
|
\begin{tabular}{ccc}
|
||||||
|
\toprule
|
||||||
|
dataset stringency & low & high \\
|
||||||
|
\midrule
|
||||||
|
coverage & > 30\% & > 40\% \\
|
||||||
|
identity & > 30\% & > 50\% \\
|
||||||
|
\bottomrule
|
||||||
|
\end{tabular}
|
||||||
|
\only<2->{
|
||||||
|
|
||||||
|
Steps:
|
||||||
|
\begin{enumerate}
|
||||||
|
\item<2-> Keep the longest isoform protein sequence per gene,
|
||||||
|
\item<3-> Run BLASTp ``all against all'' on the proteome,
|
||||||
|
\item<4-> Remove proteins of supercontigs,
|
||||||
|
\item<5-> Filter the HSP\footnote{High-scoring Segment Pair} based on the dataset criteria (coverage and identity percentages),
|
||||||
|
\item<6-> Run Markov Clustering (\texttt{mcl} with default parameters) on the homology graph built with the highest \texttt{bitscore} values per homologous genes pairs,
|
||||||
|
\item<7-> Detect \texttt{Tandemly Arrayed Genes} with a Rust program based on gene positions and gene families.
|
||||||
|
\end{enumerate}
|
||||||
|
}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}{Gene families size}
|
||||||
|
\includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage30_identity30.pdf}
|
||||||
|
\includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage40_identity50.pdf}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\begin{frame}{Duplicate genes statistics}
|
||||||
|
\begin{tabular}{lcc}
|
||||||
|
dataset stringency & low & high \\
|
||||||
|
\toprule
|
||||||
|
number of duplicate genes & 50254 (89.9\%) & 46769 (83.7\%) \\
|
||||||
|
number of families & 8426 & 11997 \\
|
||||||
|
number of singletons & 5643 (10.1\%) & 9128 (16.3\%) \\
|
||||||
|
number of TAG\textsubscript{0} & 3208 & 2500 \\
|
||||||
|
number of TAG\textsubscript{1} & 3481 & 2652 \\
|
||||||
|
\bottomrule
|
||||||
|
\end{tabular}
|
||||||
|
\end{frame}
|
||||||
|
|
||||||
|
\end{document}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
After Width: | Height: | Size: 147 KiB |
Binary file not shown.
After Width: | Height: | Size: 53 KiB |
Binary file not shown.
After Width: | Height: | Size: 912 KiB |
599
notebook.org
599
notebook.org
|
@ -1,9 +1,11 @@
|
||||||
|
# -*- org-export-use-babel: nil -*-
|
||||||
#+title: Comparative Genomics Project
|
#+title: Comparative Genomics Project
|
||||||
#+subtitle: Duplicate Genes in /Glycine max/
|
#+subtitle: Duplicate Genes in /Glycine max/
|
||||||
#+date: 2024-2025
|
#+date: 2024-2025
|
||||||
#+author: Samuel Ortion
|
#+author: Samuel Ortion
|
||||||
#+LATEX_CLASS: scrartcl
|
#+LATEX_CLASS: scrartcl
|
||||||
#+LATEX_HEADER: \titlehead{M2 GENIOMHE}
|
#+LATEX_HEADER: \titlehead{M2 GENIOMHE}
|
||||||
|
#+LATEX_HEADER: \usepackage{mus}
|
||||||
|
|
||||||
* General infos
|
* General infos
|
||||||
|
|
||||||
|
@ -48,15 +50,13 @@ min_identity=50
|
||||||
/Glycine max/ is the soy plant.
|
/Glycine max/ is the soy plant.
|
||||||
|
|
||||||
* Download the source data
|
* Download the source data
|
||||||
|
- https://plants.ensembl.org/Glycine_max/Info/Index
|
||||||
|
- https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-60/fasta/glycine_max/pep/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz
|
||||||
|
|
||||||
https://plants.ensembl.org/Glycine_max/Info/Index
|
* Duplicate gene families
|
||||||
|
** Develop the Nextflow workflow
|
||||||
|
|
||||||
https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-60/fasta/glycine_max/pep/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz
|
*** Filtering the proteome to keep the longest isoform
|
||||||
|
|
||||||
|
|
||||||
* Develop the Nextflow workflow
|
|
||||||
|
|
||||||
** Filtering the proteome to keep the longest isoform
|
|
||||||
|
|
||||||
I used three awk families_and_TAGs/scripts to filter the FASTA file to keep only the longest isoforms:
|
I used three awk families_and_TAGs/scripts to filter the FASTA file to keep only the longest isoforms:
|
||||||
1. =protein_lengths.awk= associate each FASTA record with the length of its sequence,
|
1. =protein_lengths.awk= associate each FASTA record with the length of its sequence,
|
||||||
|
@ -89,19 +89,19 @@ zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 55897
|
: 55897
|
||||||
|
|
||||||
** Filtering out Chloroplasts and Mitochondria proteins
|
*** Filtering out Chloroplasts and Mitochondria proteins
|
||||||
|
|
||||||
From the Ensembl Plants index page we can see that there is no chloroplast nor mitochondria proteins in the sequenced genome of /Glycine max/.
|
From the Ensembl Plants index page we can see that there is no chloroplast nor mitochondria proteins in the sequenced genome of /Glycine max/.
|
||||||
|
|
||||||
** Filter out supercontigs
|
*** Filter out supercontigs
|
||||||
|
|
||||||
<2024-10-28 Mon>
|
<2024-10-28 Mon>
|
||||||
|
|
||||||
#+include: workflow/families_and_TAGs/scripts/remove_supercontigs.awk src awk
|
#+include: workflow/families_and_TAGs/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk src awk
|
||||||
|
|
||||||
#+begin_src bash
|
#+begin_src bash
|
||||||
zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \
|
zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \
|
||||||
| awk -f ./workflow/families_and_TAGs/scripts/remove_supercontigs.awk \
|
| awk -f ./workflow/families_and_TAGs/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk \
|
||||||
> ./tmp/Glycine_max.Glycine_max_v2.1.pep.all.nosupercontig.fa
|
> ./tmp/Glycine_max.Glycine_max_v2.1.pep.all.nosupercontig.fa
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
|
@ -143,7 +143,7 @@ expr $ALL_ISOFORMS - $NO_SUPERCONTIGS
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 435
|
: 435
|
||||||
|
|
||||||
** BLASTp All Against All
|
*** BLASTp All Against All
|
||||||
|
|
||||||
#+begin_src bash
|
#+begin_src bash
|
||||||
makeblastdb -in "$proteome" -dbtype prot -out 'tmp/blastdb' -title "Glycine max"
|
makeblastdb -in "$proteome" -dbtype prot -out 'tmp/blastdb' -title "Glycine max"
|
||||||
|
@ -153,7 +153,7 @@ makeblastdb -in "$proteome" -dbtype prot -out 'tmp/blastdb' -title "Glycine max"
|
||||||
blastp -query "$proteome" -db 'tmp/blastdb' -outfmt 6 -out "tmp/Glycine_max.blastp.tsv"
|
blastp -query "$proteome" -db 'tmp/blastdb' -outfmt 6 -out "tmp/Glycine_max.blastp.tsv"
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
** Clustering
|
*** Clustering
|
||||||
|
|
||||||
Using =mcl=.
|
Using =mcl=.
|
||||||
|
|
||||||
|
@ -182,8 +182,7 @@ nextflow run main.nf -profile conda -resume
|
||||||
|
|
||||||
from the =./workflow/= folder
|
from the =./workflow/= folder
|
||||||
|
|
||||||
|
*** From the given BLASTp format 6 file
|
||||||
* From the given BLASTp format 6 file
|
|
||||||
<2024-11-02 Sat>
|
<2024-11-02 Sat>
|
||||||
/How many BLASTp HSP have been reported?/
|
/How many BLASTp HSP have been reported?/
|
||||||
#+begin_src bash
|
#+begin_src bash
|
||||||
|
@ -193,7 +192,7 @@ cat ../data/Glycine_max_Blastp_longIsoforme | wc -l
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 4456730
|
: 4456730
|
||||||
|
|
||||||
** Remove HSP on proteins found on supercontigs
|
*** Remove HSP on proteins found on supercontigs
|
||||||
|
|
||||||
Extract the isoform ID whose gene is on supercontigs.
|
Extract the isoform ID whose gene is on supercontigs.
|
||||||
#+begin_src bash
|
#+begin_src bash
|
||||||
|
@ -213,7 +212,7 @@ Remove HSP with subject or query being on a supercontig
|
||||||
|
|
||||||
=filter_blastp_sequence_id.awk=
|
=filter_blastp_sequence_id.awk=
|
||||||
|
|
||||||
#+include: workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk src awk
|
#+include: workflow/families_and_TAGs/scripts/filter_blastp_sequence_id_remove.awk src awk
|
||||||
|
|
||||||
#+begin_src bash
|
#+begin_src bash
|
||||||
awk -f workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk ./tmp/protein_on_supercontigs.list ../data/Glycine_max_Blastp_longIsoforme > ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig
|
awk -f workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk ./tmp/protein_on_supercontigs.list ../data/Glycine_max_Blastp_longIsoforme > ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig
|
||||||
|
@ -241,7 +240,7 @@ cat ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig | wc -l
|
||||||
: 4420132
|
: 4420132
|
||||||
|
|
||||||
|
|
||||||
** Filter by coverage and identity percentage
|
*** Filter by coverage and identity percentage
|
||||||
|
|
||||||
*** Add protein length and gene name columns
|
*** Add protein length and gene name columns
|
||||||
|
|
||||||
|
@ -312,7 +311,7 @@ cat ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.tsv | wc -l
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 342180
|
: 342180
|
||||||
|
|
||||||
** Clustering
|
*** Clustering
|
||||||
<2024-11-03 Sun>
|
<2024-11-03 Sun>
|
||||||
*** Extract the homology graph
|
*** Extract the homology graph
|
||||||
|
|
||||||
|
@ -345,6 +344,13 @@ head -n 1 "tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc"
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
| GLYMA_10G098500 | GLYMA_U008600 | 154 |
|
| GLYMA_10G098500 | GLYMA_U008600 | 154 |
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
wc -l "./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc"
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
: 1520850 ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc
|
||||||
|
|
||||||
*** Filter the homology graph to keep only the heaviest weight
|
*** Filter the homology graph to keep only the heaviest weight
|
||||||
|
|
||||||
|
|
||||||
|
@ -475,7 +481,7 @@ What is the interval of duplicate gene count? There are between 48109 and 49391
|
||||||
|
|
||||||
What is the number of families?
|
What is the number of families?
|
||||||
|
|
||||||
#+begin_src bash
|
#+begin_src bash :exports both
|
||||||
tsv="tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv"
|
tsv="tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv"
|
||||||
cut -f2 "${tsv}" | sort | uniq | wc -l
|
cut -f2 "${tsv}" | sort | uniq | wc -l
|
||||||
#+end_src
|
#+end_src
|
||||||
|
@ -483,7 +489,7 @@ cut -f2 "${tsv}" | sort | uniq | wc -l
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 8426
|
: 8426
|
||||||
|
|
||||||
#+begin_src bash
|
#+begin_src bash :exports both
|
||||||
tsv="tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl.tsv"
|
tsv="tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl.tsv"
|
||||||
cut -f2 "${tsv}" | sort | uniq | wc -l
|
cut -f2 "${tsv}" | sort | uniq | wc -l
|
||||||
#+end_src
|
#+end_src
|
||||||
|
@ -508,10 +514,9 @@ expr $ALL - $DUPLICATED
|
||||||
|
|
||||||
#+RESULTS: singletons-high-stringency
|
#+RESULTS: singletons-high-stringency
|
||||||
: 9128
|
: 9128
|
||||||
|
|
||||||
* Amount of Tandemly Arrayed Genes
|
* Amount of Tandemly Arrayed Genes
|
||||||
|
|
||||||
*** Extract gene position from a GFF3 file
|
** Extract gene position from a GFF3 file
|
||||||
|
|
||||||
#+include: workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk src awk
|
#+include: workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk src awk
|
||||||
|
|
||||||
|
@ -540,7 +545,7 @@ head "./tmp/Glycine_max.gene_positions.tsv"
|
||||||
| GLYMA_01G001000 | 1 | 196256 | 201895 |
|
| GLYMA_01G001000 | 1 | 196256 | 201895 |
|
||||||
|
|
||||||
#+name: gene-positions-count
|
#+name: gene-positions-count
|
||||||
#+begin_src bash
|
#+begin_src bash :exports both
|
||||||
cat "./tmp/Glycine_max.gene_positions.tsv" \
|
cat "./tmp/Glycine_max.gene_positions.tsv" \
|
||||||
| wc -l
|
| wc -l
|
||||||
#+end_src
|
#+end_src
|
||||||
|
@ -557,7 +562,7 @@ expr $GENES - $POSITIONS
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 308
|
: 308
|
||||||
|
|
||||||
*** Extract TAGs
|
** Extract TAGs
|
||||||
|
|
||||||
#+name: extract-TAGs
|
#+name: extract-TAGs
|
||||||
#+begin_src bash
|
#+begin_src bash
|
||||||
|
@ -593,20 +598,20 @@ head "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv"
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
| gene | family | tag0 | tag1 |
|
| gene | family | tag0 | tag1 |
|
||||||
| GLYMA_07G257400 | 3109 | - | - |
|
| GLYMA_19G258600 | spacer3040 | - | - |
|
||||||
| GLYMA_17G203900 | 2067 | - | - |
|
| GLYMA_08G277500 | 68 | - | - |
|
||||||
| GLYMA_20G062000 | 509 | - | - |
|
| GLYMA_08G342200 | 363 | 2467 | 2749 |
|
||||||
| GLYMA_16G099400 | 40 | - | - |
|
| GLYMA_08G072200 | 3499 | 2317 | 2592 |
|
||||||
| GLYMA_14G045400 | 1136 | 804 | 896 |
|
| GLYMA_15G143500 | spacer1810 | - | - |
|
||||||
| GLYMA_11G189200 | 15 | - | - |
|
| GLYMA_09G191100 | 3783 | - | - |
|
||||||
| GLYMA_02G191500 | 1310 | - | - |
|
| GLYMA_02G090200 | 7931 | - | - |
|
||||||
| GLYMA_02G308200 | 520 | - | - |
|
| GLYMA_17G091400 | 584 | - | - |
|
||||||
| GLYMA_14G205200 | 8 | - | - |
|
| GLYMA_13G199100 | 79 | - | 621 |
|
||||||
|
|
||||||
How many TAGs for definition 0?
|
How many TAGs for definition 0?
|
||||||
|
|
||||||
#+begin_src bash
|
#+begin_src bash :exports both
|
||||||
awk 'BEGIN {
|
awk 'BEGIN {
|
||||||
max=0
|
max=0
|
||||||
}
|
}
|
||||||
|
@ -622,11 +627,11 @@ awk 'BEGIN {
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 3208
|
: 2620
|
||||||
|
|
||||||
For TAG1?
|
For TAG1?
|
||||||
|
|
||||||
#+begin_src bash
|
#+begin_src bash :exports both
|
||||||
awk 'BEGIN {
|
awk 'BEGIN {
|
||||||
max=0
|
max=0
|
||||||
}
|
}
|
||||||
|
@ -642,10 +647,10 @@ awk 'BEGIN {
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 3481
|
: 2916
|
||||||
|
|
||||||
|
|
||||||
#+begin_src bash
|
#+begin_src bash :exports both
|
||||||
awk 'BEGIN {
|
awk 'BEGIN {
|
||||||
max=0
|
max=0
|
||||||
}
|
}
|
||||||
|
@ -661,7 +666,7 @@ awk 'BEGIN {
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 2500
|
: 2157
|
||||||
|
|
||||||
|
|
||||||
#+begin_src bash
|
#+begin_src bash
|
||||||
|
@ -680,16 +685,16 @@ awk 'BEGIN {
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 2652
|
: 2438
|
||||||
|
|
||||||
|
|
||||||
*** Size of the greatest TAG
|
** Size of the greatest TAG
|
||||||
|
|
||||||
**** High stringency
|
*** High stringency
|
||||||
|
|
||||||
/TAG₀/
|
/TAG₀/
|
||||||
|
|
||||||
#+begin_src bash
|
#+begin_src bash :exports both
|
||||||
awk '
|
awk '
|
||||||
{
|
{
|
||||||
tag_index=$3
|
tag_index=$3
|
||||||
|
@ -710,11 +715,11 @@ awk '
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 421 53
|
: 2134 32
|
||||||
|
|
||||||
/TAG₁/
|
/TAG₁/
|
||||||
|
|
||||||
#+begin_src bash
|
#+begin_src bash :exports both
|
||||||
awk '
|
awk '
|
||||||
{
|
{
|
||||||
tag_index=$4
|
tag_index=$4
|
||||||
|
@ -735,14 +740,14 @@ awk '
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 459 76
|
: 416 43
|
||||||
|
|
||||||
**** Low stringency
|
*** Low stringency
|
||||||
|
|
||||||
|
|
||||||
/TAG₀/
|
/TAG₀/
|
||||||
|
|
||||||
#+begin_src bash
|
#+begin_src bash :exports both
|
||||||
awk '
|
awk '
|
||||||
{
|
{
|
||||||
tag_index=$3
|
tag_index=$3
|
||||||
|
@ -763,11 +768,11 @@ awk '
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 579 76
|
: 2591 32
|
||||||
|
|
||||||
/TAG₁/
|
/TAG₁/
|
||||||
|
|
||||||
#+begin_src bash
|
#+begin_src bash :exports both
|
||||||
awk '
|
awk '
|
||||||
{
|
{
|
||||||
tag_index=$4
|
tag_index=$4
|
||||||
|
@ -788,9 +793,37 @@ awk '
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
: 646 76
|
: 525 43
|
||||||
|
|
||||||
* DONE Complete slides
|
|
||||||
|
** Amount of TAGs
|
||||||
|
|
||||||
|
How many TAG genes are there for low and high stringency datasets?
|
||||||
|
|
||||||
|
#+name: count-tags
|
||||||
|
#+begin_src bash
|
||||||
|
awk 'BEGIN { count0=0; count1; } { if ($3 != "-") count0++; if($4 != "-") count1++; } END { print count0, count1 }' $tags
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS: count-tags
|
||||||
|
|
||||||
|
#+begin_src bash :noweb strip-export
|
||||||
|
tags=./tmp/Glycine_max_TAGs_coverage30_identity30.tsv
|
||||||
|
<<count-tags>>
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
: 6680 7876
|
||||||
|
|
||||||
|
#+begin_src bash :noweb strip-export
|
||||||
|
tags=./tmp/Glycine_max_TAGs_coverage40_identity50.tsv
|
||||||
|
<<count-tags>>
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
: 5397 6480
|
||||||
|
|
||||||
|
* Complete slides
|
||||||
<2024-11-23 Sat>
|
<2024-11-23 Sat>
|
||||||
|
|
||||||
* Family sizes
|
* Family sizes
|
||||||
|
@ -812,16 +845,16 @@ head -1 ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl \
|
||||||
: 159
|
: 159
|
||||||
|
|
||||||
|
|
||||||
* TODO $K_s$ For all duplicated pairs within a family
|
* $K_s$ For all duplicated pairs within a family
|
||||||
|
|
||||||
Let $A$ be a family of $n$ duplicate members. The number of duplicate pairs is $n(n-1) / 2$.
|
Let $A$ be a family of $n$ duplicate members. The number of duplicate pairs is $n(n-1) / 2$.
|
||||||
|
|
||||||
PAML package:
|
Install PAML, Pal2Nal and Clustal packages with conda:
|
||||||
#+begin_src bash
|
#+begin_src bash
|
||||||
conda install paml pal2nal clustalw -c bioconda
|
conda install paml pal2nal clustalw -c bioconda
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
** Use a Nextflow workflow
|
** Testing each step
|
||||||
<2024-12-22 Sun>
|
<2024-12-22 Sun>
|
||||||
|
|
||||||
*** All pair of duplicate genes
|
*** All pair of duplicate genes
|
||||||
|
@ -831,16 +864,14 @@ The algorithm is simple.
|
||||||
We assume the gene-families mapping file is sorted by family index.
|
We assume the gene-families mapping file is sorted by family index.
|
||||||
For each family, loop for i from 0 to the size of the family minus one, loop for j from i plus one to the size of the family and output the pair gene i - gene j.
|
For each family, loop for i from 0 to the size of the family minus one, loop for j from i plus one to the size of the family and output the pair gene i - gene j.
|
||||||
|
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
#+begin_src bash :exports both
|
||||||
#+begin_src bash
|
|
||||||
cat ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv | ./rust/pairs/target/release/pairs > ./tmp/Glycine_max_duplicate_gene_pairs.tsv
|
cat ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv | ./rust/pairs/target/release/pairs > ./tmp/Glycine_max_duplicate_gene_pairs.tsv
|
||||||
head ./tmp/Glycine_max_duplicate_gene_pairs.tsv
|
head ./tmp/Glycine_max_duplicate_gene_pairs.tsv
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
| GLYMA_07G053000 | GLYMA_10G247700 |
|
| GLYMA_07G053000 | GLYMA_10G247700 |
|
||||||
| GLYMA_07G053000 | GLYMA_12G188000 |
|
| GLYMA_07G053000 | GLYMA_12G188000 |
|
||||||
| GLYMA_07G053000 | GLYMA_07G159200 |
|
| GLYMA_07G053000 | GLYMA_07G159200 |
|
||||||
| GLYMA_07G053000 | GLYMA_13G313300 |
|
| GLYMA_07G053000 | GLYMA_13G313300 |
|
||||||
|
@ -862,9 +893,7 @@ gene_id_1="GLYMA_07G053000"
|
||||||
gene_id_2="GLYMA_10G247700"
|
gene_id_2="GLYMA_10G247700"
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
CDS:
|
- CDS:
|
||||||
|
|
||||||
|
|
||||||
#+name: pair-gene-cds-extract
|
#+name: pair-gene-cds-extract
|
||||||
#+begin_src bash :noweb strip-export
|
#+begin_src bash :noweb strip-export
|
||||||
output_file="./tmp/${gene_id_1}_${gene_id_2}.cds.fst"
|
output_file="./tmp/${gene_id_1}_${gene_id_2}.cds.fst"
|
||||||
|
@ -900,7 +929,7 @@ done
|
||||||
|
|
||||||
#+RESULTS:
|
#+RESULTS:
|
||||||
|
|
||||||
Proteins:
|
- Proteins:
|
||||||
|
|
||||||
#+name: pair-gene-protein-extract
|
#+name: pair-gene-protein-extract
|
||||||
#+begin_src bash :noweb strip-export
|
#+begin_src bash :noweb strip-export
|
||||||
|
@ -930,7 +959,6 @@ done
|
||||||
|
|
||||||
#+RESULTS: pair-gene-protein-extract
|
#+RESULTS: pair-gene-protein-extract
|
||||||
|
|
||||||
|
|
||||||
#+begin_src bash :noweb strip-export
|
#+begin_src bash :noweb strip-export
|
||||||
<<pair-gene-id>>
|
<<pair-gene-id>>
|
||||||
<<pair-gene-protein-extract>>
|
<<pair-gene-protein-extract>>
|
||||||
|
@ -956,7 +984,6 @@ coding_sequence="./tmp/${gene_id_1}_${gene_id_2}.cds.fst"
|
||||||
pal2nal "${protein_alignment}" "${coding_sequence}" -output paml > "./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy"
|
pal2nal "${protein_alignment}" "${coding_sequence}" -output paml > "./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy"
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
|
|
||||||
/YN00 Ka-Ks computation/
|
/YN00 Ka-Ks computation/
|
||||||
|
|
||||||
#+begin_src bash :noweb strip-export
|
#+begin_src bash :noweb strip-export
|
||||||
|
@ -968,7 +995,7 @@ yn00
|
||||||
|
|
||||||
/Extraction of Ka-Ks values/
|
/Extraction of Ka-Ks values/
|
||||||
|
|
||||||
#+begin_src bash :noweb strip-export
|
#+begin_src bash :noweb strip-export :exports both
|
||||||
<<pair-gene-id>>
|
<<pair-gene-id>>
|
||||||
yn_file="./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy.yn"
|
yn_file="./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy.yn"
|
||||||
awk '
|
awk '
|
||||||
|
@ -1000,10 +1027,440 @@ awk '
|
||||||
*** Run the Nextflow workflow
|
*** Run the Nextflow workflow
|
||||||
<2024-12-23 Mon>
|
<2024-12-23 Mon>
|
||||||
|
|
||||||
|
The Nextflow workflow is stuck on the first steps, because the channel containing the queue of jobs for each gene pair is too huge in memory.
|
||||||
|
|
||||||
* TODO Does /Glycine max/ have big TAGs and which function are the big TAG gene implied?
|
*** Back to basics
|
||||||
|
|
||||||
|
I wrote a bash script that compute the Ka/Ks not loading the whole pairs file in memory.
|
||||||
|
|
||||||
|
<2024-12-26 Thu> The script is still running.
|
||||||
|
|
||||||
|
I did not handle the case where several coding sequences were present in the CDS file for the same gene, so the script ignores the proteins with several isoforms.
|
||||||
|
|
||||||
|
Let us filter the CDS to match the proteome file:
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
awk '/^>/ { gsub(">", "", $1); print $1 }' ./tmp/proteome_filtered.fa > ./tmp/proteome_filtered.list
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
cat ./tmp/proteome_filtered.list | wc -l
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
: 55589 ./tmp/proteome_filtered.list
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
awk -f ./workflow/families_and_TAGs/scripts/filter_records_fasta.awk ./tmp/proteome_filtered.list ../data/Glycine_max.Glycine_max_v2.1.cds.all.fa > ./tmp/cds_filtered.fa
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
|
||||||
|
#+begin_src bash :exports both
|
||||||
|
grep -c "^>" ./tmp/cds_filtered.fa
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
: 55589
|
||||||
|
|
||||||
|
* Are TAG pairs different in age from non-TAG pairs ?
|
||||||
|
|
||||||
|
Run the bash script [[./workflow/KaKs/ugly.sh]].
|
||||||
|
|
||||||
|
<2025-01-07 Tue>
|
||||||
|
Computation ended after about a week.
|
||||||
|
|
||||||
|
<2025-01-08 Wed>
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
awk -F',' '$4 < 1' ./workflow/KaKs/results/complete_uniq.csv > ./workflow/KaKs/results/complete_uniq_ks_below1.csv
|
||||||
|
cat ./workflow/KaKs/results/complete_uniq_ks_below1.csv| wc -l
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
: 294226
|
||||||
|
|
||||||
|
#+begin_src R :session *R* :results graphics file :file results/plots/Glycine_max_Ks_distribution.png :width 8 :height 8 :res 200 :units cm
|
||||||
|
library(ggplot2)
|
||||||
|
library(scales)
|
||||||
|
|
||||||
|
data <- read.table("./workflow/KaKs/complete2.csv", sep=",", header=TRUE)
|
||||||
|
colnames(data) <- c("gene_a","gene_b","ka","ks")
|
||||||
|
|
||||||
|
theme_set(theme_gray(base_size=8))
|
||||||
|
gg <- ggplot(data, aes(x=ks))
|
||||||
|
gg <- gg + geom_density()
|
||||||
|
gg <- gg + scale_x_continuous("Age (Ks)")
|
||||||
|
gg <- gg + scale_y_continuous("Density")
|
||||||
|
gg <- gg + xlim(0, 1)
|
||||||
|
gg <- gg + ggtitle("Proportion of duplicate gene pairs age")
|
||||||
|
gg <- gg + theme(plot.title=element_text(hjust=0.5))
|
||||||
|
gg
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
[[file:results/plots/Glycine_max_Ks_distribution.png]]
|
||||||
|
|
||||||
|
#+caption: Density of pair with a given Ks. (Inspired from Blanc and Wolfe, 2004)
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
[[file:results/plots/Glycine_max_Ks_distribution.png]]
|
||||||
|
|
||||||
|
Then, let us plot the same data separating TAG from non-TAG pairs.
|
||||||
|
|
||||||
|
#+begin_src R :session *R* :results silent
|
||||||
|
data <- read.csv("./workflow/KaKs/complete2.csv")
|
||||||
|
colnames(data) <- c("gene_a", "gene_b", "ka", "ks")
|
||||||
|
tag_df2 <- read.table("./tmp/Glycine_max_TAGs_coverage30_identity30.tsv", header=TRUE)
|
||||||
|
tag_def <- "tag1"
|
||||||
|
data$category <- apply(data, 1, function(row) {
|
||||||
|
gene_a <- row["gene_a"]
|
||||||
|
gene_b <- row["gene_b"]
|
||||||
|
tag_a <- tag_df2[tag_df2["gene"] == gene_a, tag_def]
|
||||||
|
tag_b <- tag_df2[tag_df2["gene"] == gene_b, tag_def]
|
||||||
|
# Filter out pairs tag character(0)
|
||||||
|
if (length(tag_a) == 0 || length(tag_b) == 0) {
|
||||||
|
return("notag")
|
||||||
|
} else if (tag_a == "-" || tag_b == "-") {
|
||||||
|
return("notag")
|
||||||
|
} else if (tag_a == tag_b) {
|
||||||
|
return("tag")
|
||||||
|
} else {
|
||||||
|
return("notag")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
|
||||||
|
#+begin_src R :session *R* :results graphics file :file results/plots/ks_density_tag_and_not_tag.png :width 8 :height 8 :res 200 :units cm
|
||||||
|
library(ggplot2)
|
||||||
|
library(scales)
|
||||||
|
theme_set(theme_gray(base_size = 6))
|
||||||
|
gg <- ggplot(data)
|
||||||
|
gg <- gg + geom_density(mapping=aes(x=ks, color=category))
|
||||||
|
gg <- gg + scale_x_discrete("Age (Ks)")
|
||||||
|
gg <- gg + scale_y_continuous("Density")
|
||||||
|
gg <- gg + xlim(0, 5)
|
||||||
|
gg <- gg + ggtitle("Proportion of duplicate gene pairs age (TAG and non-TAG)")
|
||||||
|
gg <- gg + theme(plot.title = element_text(hjust = 0.5))
|
||||||
|
gg
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
[[file:results/plots/ks_density_tag_and_not_tag.png]]
|
||||||
|
|
||||||
|
We will now test whether TAG gene duplication is more recent than non-TAG gene pairs.
|
||||||
|
As the distributions of the Ks values is not normally distributed, I will not use the Student \(t\)-test, but the Wilcoxon-Mann-Whitney U test.
|
||||||
|
|
||||||
|
The hypotheses are
|
||||||
|
\[
|
||||||
|
\begin{cases}
|
||||||
|
(H_0) & \text{for randomly selected values $X$ and $Y$ from `non-TAG' and `TAG' respectively, the probability of $X$ to be greater than $Y$ is equal to the probability of $Y$ being greater than $X$} \\
|
||||||
|
(H_1) & \text{for randomly selected values $X$ and $Y$ from `non-TAG' and `TAG' respectively, the probability of $X$ to be greater than $Y$ is greater than the probability of $Y$ being greater than $X$}
|
||||||
|
\end{cases}
|
||||||
|
\]
|
||||||
|
|
||||||
|
#+begin_src R :session *R* :results output :exports both
|
||||||
|
tag_ks <- data[data$category == "tag", "ks"]
|
||||||
|
non_tag_ks <- data[data$category == "notag", "ks"]
|
||||||
|
wilcox.test(non_tag_ks, tag_ks, alternative="greater")
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
:
|
||||||
|
: Wilcoxon rank sum test with continuity correction
|
||||||
|
:
|
||||||
|
: data: non_tag_ks and tag_ks
|
||||||
|
: W = 358006116, p-value < 2.2e-16
|
||||||
|
: alternative hypothesis: true location shift is greater than 0
|
||||||
|
|
||||||
|
The \(p\)-value is lower than $0.05$ so at level \(\alpha = 5 \%\), we reject the null hypotheses. Given two duplicate gene pairs, the probably that a TAG gene pair duplication event occurred before the non-TAG gene pair duplication event is greater than the otherway around, remaining TAG genes tends to be more recently duplicated.
|
||||||
|
|
||||||
|
* Are genes inside a TAG orientated in the same way more often than random?
|
||||||
|
|
||||||
|
We did not keep the orientation of the genes in the TAGs file. Let extract the gene orientation first, and join this information to the TAGs file.
|
||||||
|
We will do our analysis on the less stringent dataset. (coverage >30%, identity >30%).
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
awk 'BEGIN {
|
||||||
|
OFS="\t"
|
||||||
|
}
|
||||||
|
/^>/ {
|
||||||
|
locus=$3
|
||||||
|
split(locus, arr, ":")
|
||||||
|
orientation=arr[6]
|
||||||
|
gene=$4
|
||||||
|
sub("gene:", "", gene)
|
||||||
|
print gene, orientation
|
||||||
|
}' ./tmp/proteome_filtered.fa > ./tmp/filtered_gene_orientation.tsv
|
||||||
|
head ./tmp/filtered_gene_orientation.tsv
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
| GLYMA_01G141900 | -1 |
|
||||||
|
| GLYMA_01G234000 | -1 |
|
||||||
|
| GLYMA_01G157500 | -1 |
|
||||||
|
| GLYMA_01G031500 | -1 |
|
||||||
|
| GLYMA_01G132000 | -1 |
|
||||||
|
| GLYMA_01G038100 | -1 |
|
||||||
|
| GLYMA_01G184300 | 1 |
|
||||||
|
| GLYMA_01G231700 | 1 |
|
||||||
|
| GLYMA_01G031400 | 1 |
|
||||||
|
| GLYMA_01G008100 | 1 |
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
head ./tmp/Glycine_max
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
tail -n +1 ./tmp/filtered_gene_orientation.tsv | sort -d > ./tmp/filtered_gene_orientation_s.tsv
|
||||||
|
tail -n +1 ./tmp/Glycine_max_TAGs_coverage30_identity30.tsv | sort -d > ./tmp/Glycine_max_TAGs_coverage30_identity30_s.tsv
|
||||||
|
join -t$'\t' -1 1 -2 1 ./tmp/filtered_gene_orientation_s.tsv ./tmp/Glycine_max_TAGs_coverage30_identity30_s.tsv > ./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
head ./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
| GLYMA_01G000100 | -1 | 5506 | - | - |
|
||||||
|
| GLYMA_01G000200 | -1 | spacer1 | - | - |
|
||||||
|
| GLYMA_01G000300 | 1 | spacer2 | - | - |
|
||||||
|
| GLYMA_01G000400 | -1 | 146 | - | 1 |
|
||||||
|
| GLYMA_01G000500 | 1 | spacer3 | - | - |
|
||||||
|
| GLYMA_01G000600 | 1 | 146 | - | 1 |
|
||||||
|
| GLYMA_01G000700 | 1 | spacer4 | - | - |
|
||||||
|
| GLYMA_01G000800 | 1 | 3898 | - | - |
|
||||||
|
| GLYMA_01G000900 | 1 | 7954 | - | - |
|
||||||
|
| GLYMA_01G001000 | 1 | 7875 | - | - |
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
|
||||||
|
#+begin_src R :session *R* :colnames yes
|
||||||
|
tag_df = read.table("./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv",
|
||||||
|
sep="\t")
|
||||||
|
colnames(tag_df) <- c("gene", "orientation", "family", "tag0", "tag1")
|
||||||
|
head(tag_df)
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
| gene | orientation | family | tag0 | tag1 |
|
||||||
|
|-----------------+-------------+---------+------+------|
|
||||||
|
| GLYMA_01G000100 | -1 | 5506 | - | - |
|
||||||
|
| GLYMA_01G000200 | -1 | spacer1 | - | - |
|
||||||
|
| GLYMA_01G000300 | 1 | spacer2 | - | - |
|
||||||
|
| GLYMA_01G000400 | -1 | 146 | - | 1 |
|
||||||
|
| GLYMA_01G000500 | 1 | spacer3 | - | - |
|
||||||
|
| GLYMA_01G000600 | 1 | 146 | - | 1 |
|
||||||
|
|
||||||
|
We perform a Fischer test on the contingency table:
|
||||||
|
| | Coherent | Convergent | Divergent |
|
||||||
|
|---------+----------+------------+-----------|
|
||||||
|
| pair TAG0 | | | |
|
||||||
|
| pair not TAG0 | | | |
|
||||||
|
|
||||||
|
|
||||||
* TODO Are TAG pairs different in age from non-TAG pairs?
|
|
||||||
|
|
||||||
* TODO Are genes inside a TAG orientated in the same way more often than random?
|
#+begin_src R :session *R*
|
||||||
|
get_strand <- function(gene) {
|
||||||
|
return(
|
||||||
|
tag_df[tag_df[,"gene"] == gene, "orientation"]
|
||||||
|
)
|
||||||
|
}
|
||||||
|
get_convergence <- function(gene_a, gene_b) {
|
||||||
|
strand_a <- get_strand(gene_a)
|
||||||
|
strand_b <- get_strand(gene_b)
|
||||||
|
if (strand_a == strand_b) {
|
||||||
|
return('coherent')
|
||||||
|
} else if (strand_a == -1) {
|
||||||
|
return('divergent')
|
||||||
|
} else if (strand_a == 1) {
|
||||||
|
return('convergent')
|
||||||
|
} else {
|
||||||
|
message("Error: I do not understand why this does not fall in these case (coherent, divergent, convergent, what else?)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
|
||||||
|
#+begin_src R :session *R* :colnames yes :rownames yes
|
||||||
|
contingency_table <- matrix(0, nrow=2, ncol=3)
|
||||||
|
colnames(contingency_table) <- c("coherent", "convergent", "divergent")
|
||||||
|
rownames(contingency_table) <- c("tag", "nottag")
|
||||||
|
definition <- 0
|
||||||
|
for (i in 1:(nrow(tag_df) - (definition + 1))) {
|
||||||
|
j <- i + 1
|
||||||
|
gene_i <- tag_df[i, "gene"]
|
||||||
|
gene_j <- tag_df[j, "gene"]
|
||||||
|
convergence <- get_convergence(gene_i, gene_j)
|
||||||
|
if (tag_df[i, "tag0"] == "-" || tag_df[j, "tag0"] == "-") {
|
||||||
|
category <- "nottag"
|
||||||
|
} else if (tag_df[i, "tag0"] == tag_df[j, "tag0"]) {
|
||||||
|
category <- "tag"
|
||||||
|
} else {
|
||||||
|
category <- "nottag"
|
||||||
|
}
|
||||||
|
contingency_table[category, convergence] <- contingency_table[category, convergence] + 1
|
||||||
|
}
|
||||||
|
contingency_table
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
| | coherent | convergent | divergent |
|
||||||
|
|--------+----------+------------+-----------|
|
||||||
|
| tag | 3353 | 342 | 361 |
|
||||||
|
| nottag | 25699 | 12926 | 12907 |
|
||||||
|
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
: org_babel_R_eoe
|
||||||
|
|
||||||
|
#+begin_src R :session *R* :results output
|
||||||
|
fisher.test(contingency_table, workspace=2e6)
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
:
|
||||||
|
: Fisher's Exact Test for Count Data
|
||||||
|
:
|
||||||
|
: data: contingency_table
|
||||||
|
: p-value < 2.2e-16
|
||||||
|
: alternative hypothesis: two.sided
|
||||||
|
|
||||||
|
|
||||||
|
\(p\)-value < 2.2e-16, so at level $\alpha = 0.05$, we reject the null hypothesis: the orientation of a pair of gene is significantly different between TAG and non TAG pairs.
|
||||||
|
More specifically, TAG genes tends to be more coherent than non TAG genes.
|
||||||
|
* Does /Glycine max/ have big TAGs and which function are the big TAG gene implied?
|
||||||
|
<2024-12-26 Thu>
|
||||||
|
|
||||||
|
Uses Gprofiler2
|
||||||
|
|
||||||
|
#+begin_src R :session *R* :results silent
|
||||||
|
library(gprofiler2)
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
First we want to extract the list of genes belonging to the largest TAG:
|
||||||
|
|
||||||
|
We once again focus on the least stringent dataset, with the TAG₀ definition.
|
||||||
|
#+begin_src R :session *R*
|
||||||
|
tag_df_tag <- tag_df[tag_df["tag1"] != "-",]
|
||||||
|
tag1_count <- table(tag_df_tag["tag1"])
|
||||||
|
largest_tag1 <- names(tag1_count[which(tag1_count == max(tag1_count))])
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
: 525
|
||||||
|
|
||||||
|
#+begin_src R :session *R*
|
||||||
|
largest_tag1_genes <- tag_df_tag[tag_df_tag["tag1"] == largest_tag1, "gene"]
|
||||||
|
gostres <- gost(query=largest_tag1_genes, organism="gmax")
|
||||||
|
head(gostres$result)
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
|
||||||
|
#+begin_src R :session *R* :results graphics file :file results/plots/gprofiler_tag0579_enrichment_plot.png :width 8 :height 8 :res 200 :units cm
|
||||||
|
p <- gostplot(gostres, capped=FALSE, interactive=FALSE)
|
||||||
|
p
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
[[file:results/plots/gprofiler_tag0579_enrichment_plot.png]]
|
||||||
|
|
||||||
|
Our genes are mostly involved in auxin related signaling pathways, and response to hormones.
|
||||||
|
|
||||||
|
#+begin_example
|
||||||
|
term_name
|
||||||
|
1 regulation of cellular respiration
|
||||||
|
2 regulation of generation of precursor metabolites and energy
|
||||||
|
3 cellular respiration
|
||||||
|
4 energy derivation by oxidation of organic compounds
|
||||||
|
5 generation of precursor metabolites and energy
|
||||||
|
6 regulation of cellular metabolic process
|
||||||
|
#+end_example
|
||||||
|
|
||||||
|
* TAG size in function of TAG definition
|
||||||
|
<2024-12-27 Fri>
|
||||||
|
#+begin_src bash
|
||||||
|
./rust/tagfinder/target/release/tagfinder --families ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv --positions ./tmp/Glycine_max.gene_positions.tsv --definitions 0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,15,16,17,18,19,20,25,30,35,40,45,50 > "./tmp/Glycine_max_TAGs_coverage30_identity30_v2.tsv"
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
|
||||||
|
#+begin_src R :session *R* :results graphics file :file results/plots/nb_TAG_against_definition.png :width 8 :height 8 :res 200 :units cm :exports both
|
||||||
|
library(ggplot2)
|
||||||
|
library(scales)
|
||||||
|
|
||||||
|
data <- read.table("./tmp/Glycine_max_TAGs_coverage30_identity30_v2.tsv", na.strings=c("-"), header=TRUE)
|
||||||
|
|
||||||
|
definitions <- c(0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,15,16,17,18,19,20,25,30,35,40,45,50)
|
||||||
|
|
||||||
|
nb_TAGs <- sapply(definitions, function(definition) {
|
||||||
|
tags <- data[,paste0("tag", definition)]
|
||||||
|
tags <- tags[! is.na(tags)]
|
||||||
|
return(max(tags))
|
||||||
|
})
|
||||||
|
|
||||||
|
data <- data.frame(list(definition=definitions, nb=nb_TAGs))
|
||||||
|
|
||||||
|
theme_set(theme_gray(base_size = 8))
|
||||||
|
gg <- ggplot(data, aes(x=definition, y=nb))
|
||||||
|
gg <- gg + geom_point()
|
||||||
|
gg <- gg + scale_x_continuous("Definition")
|
||||||
|
gg <- gg + scale_y_continuous("TAGs")
|
||||||
|
gg <- gg + ggtitle("Number of TAGs for a TAG definition")
|
||||||
|
gg <- gg + theme(plot.title = element_text(hjust = 0.5))
|
||||||
|
gg
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
[[file:results/plots/nb_TAG_against_definition.pdf]]
|
||||||
|
|
||||||
|
* Computing some statistics on a selection of plants
|
||||||
|
|
||||||
|
<2024-12-29 Sun>
|
||||||
|
|
||||||
|
We want to extract the following statistics for the whole set of plant on eCampus:
|
||||||
|
- number of families
|
||||||
|
- number of duplicate genes
|
||||||
|
- number of singletons
|
||||||
|
- number of TAG0
|
||||||
|
|
||||||
|
|
||||||
|
We have to filter out some genes that are from Mitochondria or Chloroplasts (for /Oryza sativa ssp. Japonica/ for instance).
|
||||||
|
|
||||||
|
First we extract the protein ID that corresponds to mitochondrial or chloroplastic genes.
|
||||||
|
#+begin_src bash
|
||||||
|
fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Oryza_sativa.IRGSP-1.0.pep.all.fa.gz"
|
||||||
|
zcat "${fasta}" | awk '
|
||||||
|
/^>/ {
|
||||||
|
locus=$3
|
||||||
|
split(":", arr, locus)
|
||||||
|
chromosome=arr[3]
|
||||||
|
if (chromosome == "Mt" || chromosome == "Pt" || chromosome ~ /^supercontig*/) {
|
||||||
|
protein=$1
|
||||||
|
gsub(">", "", protein)
|
||||||
|
print protein
|
||||||
|
}
|
||||||
|
}' > ./tmp/Oryza_sativa_faulty_proteins.list
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
|
||||||
|
#+begin_src bash
|
||||||
|
head ./tmp/Oryza_sativa_faulty_proteins.list
|
||||||
|
#+end_src
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
|
||||||
|
Then, remove all BLASTp hits that matches a protein ID in either query or subject.
|
||||||
|
|
||||||
|
We include this step in a Nextflow workflow. It does not compute the BLASTp all against all, It does rather use the TSV file provided on eCampus.
|
||||||
|
|
|
@ -15,7 +15,6 @@ fn main() {
|
||||||
let mut parts = line.split("\t");
|
let mut parts = line.split("\t");
|
||||||
let gene = parts.next().unwrap().to_string();
|
let gene = parts.next().unwrap().to_string();
|
||||||
let family = parts.next().unwrap().to_string();
|
let family = parts.next().unwrap().to_string();
|
||||||
println!("{}", family);
|
|
||||||
if family != family_index {
|
if family != family_index {
|
||||||
family_index = family;
|
family_index = family;
|
||||||
if family_genes.len() > 1 {
|
if family_genes.len() > 1 {
|
||||||
|
@ -23,7 +22,9 @@ fn main() {
|
||||||
}
|
}
|
||||||
family_genes.clear();
|
family_genes.clear();
|
||||||
}
|
}
|
||||||
family_genes.push(gene);
|
if gene != "" {
|
||||||
|
family_genes.push(gene);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if family_genes.len() > 1 {
|
if family_genes.len() > 1 {
|
||||||
print_gene_pairs(&family_genes);
|
print_gene_pairs(&family_genes);
|
||||||
|
|
|
@ -1,2 +1 @@
|
||||||
.nextflow.log*
|
.nextflow*
|
||||||
|
|
||||||
|
|
|
@ -8,16 +8,22 @@ for gene_id in ${gene_id_1} ${gene_id_2}; do
|
||||||
awk -v gene_id="${gene_id}" '
|
awk -v gene_id="${gene_id}" '
|
||||||
BEGIN {
|
BEGIN {
|
||||||
on_gene=0
|
on_gene=0
|
||||||
|
visited=0
|
||||||
}
|
}
|
||||||
/^[^>]/ && on_gene == 1 {
|
/^[^>]/ && on_gene == 1 {
|
||||||
|
|
||||||
print $0
|
print $0
|
||||||
}
|
}
|
||||||
/^>/ {
|
/^>/ {
|
||||||
|
if (visited == 1) {
|
||||||
|
exit
|
||||||
|
}
|
||||||
gene = $4
|
gene = $4
|
||||||
gsub("gene:", "", gene)
|
gsub("gene:", "", gene)
|
||||||
if (gene == gene_id) {
|
if (gene == gene_id) {
|
||||||
on_gene=1
|
on_gene=1
|
||||||
print $0
|
print $0
|
||||||
|
visited=1
|
||||||
} else {
|
} else {
|
||||||
on_gene=0
|
on_gene=0
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,35 +1,28 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
# Bash version of the Nextflow script that should run faster
|
|
||||||
output_file="Glycine_max_duplicate_genes_KaKs.tsv"
|
|
||||||
proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
|
|
||||||
cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.cds.all.fa"
|
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
mkdir -p tmp
|
NB_THREADS=16
|
||||||
|
|
||||||
|
output_file="Glycine_max_duplicate_genes_KaKs_v3.csv"
|
||||||
|
proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
|
||||||
|
cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/cds_filtered.fa"
|
||||||
|
input_file="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.csv"
|
||||||
|
|
||||||
echo "seqfile = ./tmp/cds.ali.phy
|
|
||||||
outfile = ./tmp/yn
|
|
||||||
verbose = 0
|
|
||||||
icode = 0
|
|
||||||
weighting = 0
|
|
||||||
commonf3x4 = 0" > yn00.ctl
|
|
||||||
compute_kaks() {
|
compute_kaks() {
|
||||||
local gene_id_1
|
local gene_id_1
|
||||||
local gene_id_2
|
local gene_id_2
|
||||||
gene_id_1=$1
|
gene_id_1="${1}"
|
||||||
gene_id_2=$2
|
gene_id_2="${2}"
|
||||||
# Extract the proteins
|
# Extract the proteins
|
||||||
./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${proteome_fasta} "./tmp/prot.fst"
|
../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${proteome_fasta}" "./prot.fst"
|
||||||
# Extract the CDS
|
# Extract the CDS
|
||||||
./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${cds_fasta} "./tmp/cds.fst"
|
../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${cds_fasta}" "./cds.fst"
|
||||||
|
|
||||||
# Run clustalw2
|
# Run clustalw2
|
||||||
clustalw2 -quiet -align -infile="./tmp/prot.fst" -outfile="./tmp/prot.ali.aln"
|
clustalw2 -quiet -align -infile="./prot.fst" -outfile="./prot.ali.aln"
|
||||||
|
|
||||||
# Run Pal2Nal
|
# Run Pal2Nal
|
||||||
pal2nal.pl "./tmp/prot.ali.aln" "./tmp/cds.fst" -output paml > "./tmp/cds.ali.phy"
|
pal2nal.pl "./prot.ali.aln" "./cds.fst" -output paml >"./cds.ali.phy"
|
||||||
|
|
||||||
# Run yn00
|
# Run yn00
|
||||||
yn00
|
yn00
|
||||||
|
@ -55,22 +48,49 @@ compute_kaks() {
|
||||||
skip -= 1
|
skip -= 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
' "./tmp/yn"
|
' "./yn"
|
||||||
}
|
}
|
||||||
echo "gene_a,gene_b,ka,ks" > "${output_file}"
|
|
||||||
input="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.tsv"
|
thread() {
|
||||||
n=$(wc -l "${input}")
|
local thread_id="${1}"
|
||||||
i=0
|
local start="${2}"
|
||||||
while IFS= read -r line
|
local end="${3}"
|
||||||
do
|
|
||||||
echo $i / $n
|
mkdir -p "tmp/${thread_id}"
|
||||||
gene_a=$(echo $line | awk '{print $1}')
|
pushd "tmp/${thread_id}" || return
|
||||||
gene_b=$(echo $line | awk '{print $2}')
|
|
||||||
i=$(expr $i + 1)
|
echo "seqfile = ./cds.ali.phy
|
||||||
kaks=$(compute_kaks $gene_a $gene_b | tail -1 | awk 'NF == 2')
|
outfile = ./yn
|
||||||
if [[ ! -z $kaks ]]
|
verbose = 0
|
||||||
then
|
icode = 0
|
||||||
arr=(${kaks//\t/ })
|
weighting = 0
|
||||||
echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >> "${output_file}"
|
commonf3x4 = 0" >yn00.ctl
|
||||||
fi
|
echo "" >"part.csv"
|
||||||
done < "${input}"
|
for line_index in $(seq $start $end); do
|
||||||
|
line=$(head -n "${line_index}" "${input_file}" | tail -n 1)
|
||||||
|
gene_a=$(echo "${line}" | awk '{print $1}')
|
||||||
|
gene_b=$(echo "${line}" | awk '{print $2}')
|
||||||
|
kaks=$(compute_kaks "${gene_a}" "${gene_b}" | tail -1 | awk 'NF == 2')
|
||||||
|
if [[ ! -z $kaks ]]; then
|
||||||
|
arr=(${kaks//\t/ })
|
||||||
|
echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >>"part.csv"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
popd || return
|
||||||
|
}
|
||||||
|
|
||||||
|
main() {
|
||||||
|
part=$(($(cat "${input_file}" | wc -l) / ${NB_THREADS}))
|
||||||
|
for thread_id in $(seq 0 $NB_THREADS); do
|
||||||
|
start=$((thread_id * part))
|
||||||
|
end=$(($((thread_id + 1)) * part))
|
||||||
|
thread $thread_id $start $end &
|
||||||
|
done
|
||||||
|
wait
|
||||||
|
echo "gene_a,gene_b,ka,ks" >"${output_file}"
|
||||||
|
for thread_id in $(seq 0 $NB_THREADS); do
|
||||||
|
cat ./tmp/$thread_id/part.csv >>"${output_file}"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
main
|
||||||
|
|
Loading…
Reference in New Issue