Update project

This commit is contained in:
Samuel Ortion 2025-01-18 15:55:12 +01:00
parent 33e898da2e
commit 5ad94110d7
Signed by: sortion
GPG Key ID: 9B02406F8C4FB765
11 changed files with 681 additions and 112 deletions

View File

@ -0,0 +1,86 @@
\documentclass{beamer}
\usepackage{booktabs}
\title{Duplicate Genes \& Tandemly Arrayed Genes in \textit{Glycine max} (soy)}
\subtitle{First results}
\author{Naïa Périnelle \and Samuel Ortion}
\institute{Université d'Évry Paris-Saclay}
\date{2024-11-15}
\begin{document}
\frame{\titlepage}
\begin{frame}{Cultivated soy plant: \textit{Glycine max}}
A species of legume native to East Asia.
{
\centering
\includegraphics{media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg}
\vspace{1em}
% \includegraphics[width=0.5\textwidth]{media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png}\footnote{Gepts et al. 2005}
\includegraphics[width=0.25\textwidth]{media/Soybeanvarieties.jpeg}
}
\end{frame}
\begin{frame}{Soybean industrial interest}
\begin{itemize}
\item 353 million tonnes of soybean produced in 2020
\item Cattle feed
\item Human food
\end{itemize}
\end{frame}
\begin{frame}{\textit{Glycine max} genome statistics}
\begin{itemize}
\item 20 chromosomes
\item 55897 protein coding genes (including supercontigs)
\item 55589 protein coding genes (excluding supercontigs)
\item 88412 protein isoforms
\end{itemize}
\end{frame}
\begin{frame}{Datasets filter criteria and pipeline steps}
\begin{tabular}{ccc}
\toprule
dataset stringency & low & high \\
\midrule
coverage & > 30\% & > 40\% \\
identity & > 30\% & > 50\% \\
\bottomrule
\end{tabular}
\only<2->{
Steps:
\begin{enumerate}
\item<2-> Keep the longest isoform protein sequence per gene,
\item<3-> Run BLASTp ``all against all'' on the proteome,
\item<4-> Remove proteins of supercontigs,
\item<5-> Filter the HSP\footnote{High-scoring Segment Pair} based on the dataset criteria (coverage and identity percentages),
\item<6-> Run Markov Clustering (\texttt{mcl} with default parameters) on the homology graph built with the highest \texttt{bitscore} values per homologous genes pairs,
\item<7-> Detect \texttt{Tandemly Arrayed Genes} with a Rust program based on gene positions and gene families.
\end{enumerate}
}
\end{frame}
\begin{frame}{Gene families size}
\includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage30_identity30.pdf}
\includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage40_identity50.pdf}
\end{frame}
\begin{frame}{Duplicate genes statistics}
\begin{tabular}{lcc}
dataset stringency & low & high \\
\toprule
number of duplicate genes & 50254 (89.9\%) & 46769 (83.7\%) \\
number of families & 8426 & 11997 \\
number of singletons & 5643 (10.1\%) & 9128 (16.3\%) \\
number of TAG\textsubscript{0} & 3208 & 2500 \\
number of TAG\textsubscript{1} & 3481 & 2652 \\
\bottomrule
\end{tabular}
\end{frame}
\end{document}

Binary file not shown.

After

Width:  |  Height:  |  Size: 147 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 912 KiB

View File

@ -1,9 +1,11 @@
# -*- org-export-use-babel: nil -*-
#+title: Comparative Genomics Project
#+subtitle: Duplicate Genes in /Glycine max/
#+date: 2024-2025
#+author: Samuel Ortion
#+LATEX_CLASS: scrartcl
#+LATEX_HEADER: \titlehead{M2 GENIOMHE}
#+LATEX_HEADER: \usepackage{mus}
* General infos
@ -48,15 +50,13 @@ min_identity=50
/Glycine max/ is the soy plant.
* Download the source data
- https://plants.ensembl.org/Glycine_max/Info/Index
- https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-60/fasta/glycine_max/pep/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz
https://plants.ensembl.org/Glycine_max/Info/Index
* Duplicate gene families
** Develop the Nextflow workflow
https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-60/fasta/glycine_max/pep/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz
* Develop the Nextflow workflow
** Filtering the proteome to keep the longest isoform
*** Filtering the proteome to keep the longest isoform
I used three awk families_and_TAGs/scripts to filter the FASTA file to keep only the longest isoforms:
1. =protein_lengths.awk= associate each FASTA record with the length of its sequence,
@ -89,19 +89,19 @@ zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \
#+RESULTS:
: 55897
** Filtering out Chloroplasts and Mitochondria proteins
*** Filtering out Chloroplasts and Mitochondria proteins
From the Ensembl Plants index page we can see that there is no chloroplast nor mitochondria proteins in the sequenced genome of /Glycine max/.
** Filter out supercontigs
*** Filter out supercontigs
<2024-10-28 Mon>
#+include: workflow/families_and_TAGs/scripts/remove_supercontigs.awk src awk
#+include: workflow/families_and_TAGs/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk src awk
#+begin_src bash
zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \
| awk -f ./workflow/families_and_TAGs/scripts/remove_supercontigs.awk \
| awk -f ./workflow/families_and_TAGs/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk \
> ./tmp/Glycine_max.Glycine_max_v2.1.pep.all.nosupercontig.fa
#+end_src
@ -143,7 +143,7 @@ expr $ALL_ISOFORMS - $NO_SUPERCONTIGS
#+RESULTS:
: 435
** BLASTp All Against All
*** BLASTp All Against All
#+begin_src bash
makeblastdb -in "$proteome" -dbtype prot -out 'tmp/blastdb' -title "Glycine max"
@ -153,7 +153,7 @@ makeblastdb -in "$proteome" -dbtype prot -out 'tmp/blastdb' -title "Glycine max"
blastp -query "$proteome" -db 'tmp/blastdb' -outfmt 6 -out "tmp/Glycine_max.blastp.tsv"
#+end_src
** Clustering
*** Clustering
Using =mcl=.
@ -182,8 +182,7 @@ nextflow run main.nf -profile conda -resume
from the =./workflow/= folder
* From the given BLASTp format 6 file
*** From the given BLASTp format 6 file
<2024-11-02 Sat>
/How many BLASTp HSP have been reported?/
#+begin_src bash
@ -193,7 +192,7 @@ cat ../data/Glycine_max_Blastp_longIsoforme | wc -l
#+RESULTS:
: 4456730
** Remove HSP on proteins found on supercontigs
*** Remove HSP on proteins found on supercontigs
Extract the isoform ID whose gene is on supercontigs.
#+begin_src bash
@ -213,7 +212,7 @@ Remove HSP with subject or query being on a supercontig
=filter_blastp_sequence_id.awk=
#+include: workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk src awk
#+include: workflow/families_and_TAGs/scripts/filter_blastp_sequence_id_remove.awk src awk
#+begin_src bash
awk -f workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk ./tmp/protein_on_supercontigs.list ../data/Glycine_max_Blastp_longIsoforme > ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig
@ -241,7 +240,7 @@ cat ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig | wc -l
: 4420132
** Filter by coverage and identity percentage
*** Filter by coverage and identity percentage
*** Add protein length and gene name columns
@ -312,7 +311,7 @@ cat ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.tsv | wc -l
#+RESULTS:
: 342180
** Clustering
*** Clustering
<2024-11-03 Sun>
*** Extract the homology graph
@ -345,6 +344,13 @@ head -n 1 "tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc"
#+RESULTS:
| GLYMA_10G098500 | GLYMA_U008600 | 154 |
#+begin_src bash
wc -l "./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc"
#+end_src
#+RESULTS:
: 1520850 ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc
*** Filter the homology graph to keep only the heaviest weight
@ -475,7 +481,7 @@ What is the interval of duplicate gene count? There are between 48109 and 49391
What is the number of families?
#+begin_src bash
#+begin_src bash :exports both
tsv="tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv"
cut -f2 "${tsv}" | sort | uniq | wc -l
#+end_src
@ -483,7 +489,7 @@ cut -f2 "${tsv}" | sort | uniq | wc -l
#+RESULTS:
: 8426
#+begin_src bash
#+begin_src bash :exports both
tsv="tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl.tsv"
cut -f2 "${tsv}" | sort | uniq | wc -l
#+end_src
@ -508,10 +514,9 @@ expr $ALL - $DUPLICATED
#+RESULTS: singletons-high-stringency
: 9128
* Amount of Tandemly Arrayed Genes
*** Extract gene position from a GFF3 file
** Extract gene position from a GFF3 file
#+include: workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk src awk
@ -540,7 +545,7 @@ head "./tmp/Glycine_max.gene_positions.tsv"
| GLYMA_01G001000 | 1 | 196256 | 201895 |
#+name: gene-positions-count
#+begin_src bash
#+begin_src bash :exports both
cat "./tmp/Glycine_max.gene_positions.tsv" \
| wc -l
#+end_src
@ -557,7 +562,7 @@ expr $GENES - $POSITIONS
#+RESULTS:
: 308
*** Extract TAGs
** Extract TAGs
#+name: extract-TAGs
#+begin_src bash
@ -593,20 +598,20 @@ head "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv"
#+end_src
#+RESULTS:
| gene | family | tag0 | tag1 |
| GLYMA_07G257400 | 3109 | - | - |
| GLYMA_17G203900 | 2067 | - | - |
| GLYMA_20G062000 | 509 | - | - |
| GLYMA_16G099400 | 40 | - | - |
| GLYMA_14G045400 | 1136 | 804 | 896 |
| GLYMA_11G189200 | 15 | - | - |
| GLYMA_02G191500 | 1310 | - | - |
| GLYMA_02G308200 | 520 | - | - |
| GLYMA_14G205200 | 8 | - | - |
| gene | family | tag0 | tag1 |
| GLYMA_19G258600 | spacer3040 | - | - |
| GLYMA_08G277500 | 68 | - | - |
| GLYMA_08G342200 | 363 | 2467 | 2749 |
| GLYMA_08G072200 | 3499 | 2317 | 2592 |
| GLYMA_15G143500 | spacer1810 | - | - |
| GLYMA_09G191100 | 3783 | - | - |
| GLYMA_02G090200 | 7931 | - | - |
| GLYMA_17G091400 | 584 | - | - |
| GLYMA_13G199100 | 79 | - | 621 |
How many TAGs for definition 0?
#+begin_src bash
#+begin_src bash :exports both
awk 'BEGIN {
max=0
}
@ -622,11 +627,11 @@ awk 'BEGIN {
#+end_src
#+RESULTS:
: 3208
: 2620
For TAG1?
#+begin_src bash
#+begin_src bash :exports both
awk 'BEGIN {
max=0
}
@ -642,10 +647,10 @@ awk 'BEGIN {
#+end_src
#+RESULTS:
: 3481
: 2916
#+begin_src bash
#+begin_src bash :exports both
awk 'BEGIN {
max=0
}
@ -661,7 +666,7 @@ awk 'BEGIN {
#+end_src
#+RESULTS:
: 2500
: 2157
#+begin_src bash
@ -680,16 +685,16 @@ awk 'BEGIN {
#+end_src
#+RESULTS:
: 2652
: 2438
*** Size of the greatest TAG
** Size of the greatest TAG
**** High stringency
*** High stringency
/TAG₀/
#+begin_src bash
#+begin_src bash :exports both
awk '
{
tag_index=$3
@ -710,11 +715,11 @@ awk '
#+end_src
#+RESULTS:
: 421 53
: 2134 32
/TAG₁/
#+begin_src bash
#+begin_src bash :exports both
awk '
{
tag_index=$4
@ -735,14 +740,14 @@ awk '
#+end_src
#+RESULTS:
: 459 76
: 416 43
**** Low stringency
*** Low stringency
/TAG₀/
#+begin_src bash
#+begin_src bash :exports both
awk '
{
tag_index=$3
@ -763,11 +768,11 @@ awk '
#+end_src
#+RESULTS:
: 579 76
: 2591 32
/TAG₁/
#+begin_src bash
#+begin_src bash :exports both
awk '
{
tag_index=$4
@ -788,9 +793,37 @@ awk '
#+end_src
#+RESULTS:
: 646 76
: 525 43
* DONE Complete slides
** Amount of TAGs
How many TAG genes are there for low and high stringency datasets?
#+name: count-tags
#+begin_src bash
awk 'BEGIN { count0=0; count1; } { if ($3 != "-") count0++; if($4 != "-") count1++; } END { print count0, count1 }' $tags
#+end_src
#+RESULTS: count-tags
#+begin_src bash :noweb strip-export
tags=./tmp/Glycine_max_TAGs_coverage30_identity30.tsv
<<count-tags>>
#+end_src
#+RESULTS:
: 6680 7876
#+begin_src bash :noweb strip-export
tags=./tmp/Glycine_max_TAGs_coverage40_identity50.tsv
<<count-tags>>
#+end_src
#+RESULTS:
: 5397 6480
* Complete slides
<2024-11-23 Sat>
* Family sizes
@ -812,16 +845,16 @@ head -1 ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl \
: 159
* TODO $K_s$ For all duplicated pairs within a family
* $K_s$ For all duplicated pairs within a family
Let $A$ be a family of $n$ duplicate members. The number of duplicate pairs is $n(n-1) / 2$.
PAML package:
Install PAML, Pal2Nal and Clustal packages with conda:
#+begin_src bash
conda install paml pal2nal clustalw -c bioconda
#+end_src
** Use a Nextflow workflow
** Testing each step
<2024-12-22 Sun>
*** All pair of duplicate genes
@ -831,16 +864,14 @@ The algorithm is simple.
We assume the gene-families mapping file is sorted by family index.
For each family, loop for i from 0 to the size of the family minus one, loop for j from i plus one to the size of the family and output the pair gene i - gene j.
Example:
#+begin_src bash
#+begin_src bash :exports both
cat ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv | ./rust/pairs/target/release/pairs > ./tmp/Glycine_max_duplicate_gene_pairs.tsv
head ./tmp/Glycine_max_duplicate_gene_pairs.tsv
#+end_src
#+RESULTS:
| GLYMA_07G053000 | GLYMA_10G247700 |
| GLYMA_07G053000 | GLYMA_10G247700 |
| GLYMA_07G053000 | GLYMA_12G188000 |
| GLYMA_07G053000 | GLYMA_07G159200 |
| GLYMA_07G053000 | GLYMA_13G313300 |
@ -862,9 +893,7 @@ gene_id_1="GLYMA_07G053000"
gene_id_2="GLYMA_10G247700"
#+end_src
CDS:
- CDS:
#+name: pair-gene-cds-extract
#+begin_src bash :noweb strip-export
output_file="./tmp/${gene_id_1}_${gene_id_2}.cds.fst"
@ -900,7 +929,7 @@ done
#+RESULTS:
Proteins:
- Proteins:
#+name: pair-gene-protein-extract
#+begin_src bash :noweb strip-export
@ -930,7 +959,6 @@ done
#+RESULTS: pair-gene-protein-extract
#+begin_src bash :noweb strip-export
<<pair-gene-id>>
<<pair-gene-protein-extract>>
@ -956,7 +984,6 @@ coding_sequence="./tmp/${gene_id_1}_${gene_id_2}.cds.fst"
pal2nal "${protein_alignment}" "${coding_sequence}" -output paml > "./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy"
#+end_src
/YN00 Ka-Ks computation/
#+begin_src bash :noweb strip-export
@ -968,7 +995,7 @@ yn00
/Extraction of Ka-Ks values/
#+begin_src bash :noweb strip-export
#+begin_src bash :noweb strip-export :exports both
<<pair-gene-id>>
yn_file="./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy.yn"
awk '
@ -1000,10 +1027,440 @@ awk '
*** Run the Nextflow workflow
<2024-12-23 Mon>
The Nextflow workflow is stuck on the first steps, because the channel containing the queue of jobs for each gene pair is too huge in memory.
* TODO Does /Glycine max/ have big TAGs and which function are the big TAG gene implied?
*** Back to basics
I wrote a bash script that compute the Ka/Ks not loading the whole pairs file in memory.
<2024-12-26 Thu> The script is still running.
I did not handle the case where several coding sequences were present in the CDS file for the same gene, so the script ignores the proteins with several isoforms.
Let us filter the CDS to match the proteome file:
#+begin_src bash
awk '/^>/ { gsub(">", "", $1); print $1 }' ./tmp/proteome_filtered.fa > ./tmp/proteome_filtered.list
#+end_src
#+RESULTS:
#+begin_src bash
cat ./tmp/proteome_filtered.list | wc -l
#+end_src
#+RESULTS:
: 55589 ./tmp/proteome_filtered.list
#+begin_src bash
awk -f ./workflow/families_and_TAGs/scripts/filter_records_fasta.awk ./tmp/proteome_filtered.list ../data/Glycine_max.Glycine_max_v2.1.cds.all.fa > ./tmp/cds_filtered.fa
#+end_src
#+RESULTS:
#+begin_src bash :exports both
grep -c "^>" ./tmp/cds_filtered.fa
#+end_src
#+RESULTS:
: 55589
* Are TAG pairs different in age from non-TAG pairs ?
Run the bash script [[./workflow/KaKs/ugly.sh]].
<2025-01-07 Tue>
Computation ended after about a week.
<2025-01-08 Wed>
#+begin_src bash
awk -F',' '$4 < 1' ./workflow/KaKs/results/complete_uniq.csv > ./workflow/KaKs/results/complete_uniq_ks_below1.csv
cat ./workflow/KaKs/results/complete_uniq_ks_below1.csv| wc -l
#+end_src
#+RESULTS:
: 294226
#+begin_src R :session *R* :results graphics file :file results/plots/Glycine_max_Ks_distribution.png :width 8 :height 8 :res 200 :units cm
library(ggplot2)
library(scales)
data <- read.table("./workflow/KaKs/complete2.csv", sep=",", header=TRUE)
colnames(data) <- c("gene_a","gene_b","ka","ks")
theme_set(theme_gray(base_size=8))
gg <- ggplot(data, aes(x=ks))
gg <- gg + geom_density()
gg <- gg + scale_x_continuous("Age (Ks)")
gg <- gg + scale_y_continuous("Density")
gg <- gg + xlim(0, 1)
gg <- gg + ggtitle("Proportion of duplicate gene pairs age")
gg <- gg + theme(plot.title=element_text(hjust=0.5))
gg
#+end_src
#+RESULTS:
[[file:results/plots/Glycine_max_Ks_distribution.png]]
#+caption: Density of pair with a given Ks. (Inspired from Blanc and Wolfe, 2004)
#+RESULTS:
[[file:results/plots/Glycine_max_Ks_distribution.png]]
Then, let us plot the same data separating TAG from non-TAG pairs.
#+begin_src R :session *R* :results silent
data <- read.csv("./workflow/KaKs/complete2.csv")
colnames(data) <- c("gene_a", "gene_b", "ka", "ks")
tag_df2 <- read.table("./tmp/Glycine_max_TAGs_coverage30_identity30.tsv", header=TRUE)
tag_def <- "tag1"
data$category <- apply(data, 1, function(row) {
gene_a <- row["gene_a"]
gene_b <- row["gene_b"]
tag_a <- tag_df2[tag_df2["gene"] == gene_a, tag_def]
tag_b <- tag_df2[tag_df2["gene"] == gene_b, tag_def]
# Filter out pairs tag character(0)
if (length(tag_a) == 0 || length(tag_b) == 0) {
return("notag")
} else if (tag_a == "-" || tag_b == "-") {
return("notag")
} else if (tag_a == tag_b) {
return("tag")
} else {
return("notag")
}
})
#+end_src
#+RESULTS:
#+begin_src R :session *R* :results graphics file :file results/plots/ks_density_tag_and_not_tag.png :width 8 :height 8 :res 200 :units cm
library(ggplot2)
library(scales)
theme_set(theme_gray(base_size = 6))
gg <- ggplot(data)
gg <- gg + geom_density(mapping=aes(x=ks, color=category))
gg <- gg + scale_x_discrete("Age (Ks)")
gg <- gg + scale_y_continuous("Density")
gg <- gg + xlim(0, 5)
gg <- gg + ggtitle("Proportion of duplicate gene pairs age (TAG and non-TAG)")
gg <- gg + theme(plot.title = element_text(hjust = 0.5))
gg
#+end_src
#+RESULTS:
[[file:results/plots/ks_density_tag_and_not_tag.png]]
We will now test whether TAG gene duplication is more recent than non-TAG gene pairs.
As the distributions of the Ks values is not normally distributed, I will not use the Student \(t\)-test, but the Wilcoxon-Mann-Whitney U test.
The hypotheses are
\[
\begin{cases}
(H_0) & \text{for randomly selected values $X$ and $Y$ from `non-TAG' and `TAG' respectively, the probability of $X$ to be greater than $Y$ is equal to the probability of $Y$ being greater than $X$} \\
(H_1) & \text{for randomly selected values $X$ and $Y$ from `non-TAG' and `TAG' respectively, the probability of $X$ to be greater than $Y$ is greater than the probability of $Y$ being greater than $X$}
\end{cases}
\]
#+begin_src R :session *R* :results output :exports both
tag_ks <- data[data$category == "tag", "ks"]
non_tag_ks <- data[data$category == "notag", "ks"]
wilcox.test(non_tag_ks, tag_ks, alternative="greater")
#+end_src
#+RESULTS:
:
: Wilcoxon rank sum test with continuity correction
:
: data: non_tag_ks and tag_ks
: W = 358006116, p-value < 2.2e-16
: alternative hypothesis: true location shift is greater than 0
The \(p\)-value is lower than $0.05$ so at level \(\alpha = 5 \%\), we reject the null hypotheses. Given two duplicate gene pairs, the probably that a TAG gene pair duplication event occurred before the non-TAG gene pair duplication event is greater than the otherway around, remaining TAG genes tends to be more recently duplicated.
* Are genes inside a TAG orientated in the same way more often than random?
We did not keep the orientation of the genes in the TAGs file. Let extract the gene orientation first, and join this information to the TAGs file.
We will do our analysis on the less stringent dataset. (coverage >30%, identity >30%).
#+begin_src bash
awk 'BEGIN {
OFS="\t"
}
/^>/ {
locus=$3
split(locus, arr, ":")
orientation=arr[6]
gene=$4
sub("gene:", "", gene)
print gene, orientation
}' ./tmp/proteome_filtered.fa > ./tmp/filtered_gene_orientation.tsv
head ./tmp/filtered_gene_orientation.tsv
#+end_src
#+RESULTS:
| GLYMA_01G141900 | -1 |
| GLYMA_01G234000 | -1 |
| GLYMA_01G157500 | -1 |
| GLYMA_01G031500 | -1 |
| GLYMA_01G132000 | -1 |
| GLYMA_01G038100 | -1 |
| GLYMA_01G184300 | 1 |
| GLYMA_01G231700 | 1 |
| GLYMA_01G031400 | 1 |
| GLYMA_01G008100 | 1 |
#+begin_src bash
head ./tmp/Glycine_max
#+end_src
#+RESULTS:
#+begin_src bash
tail -n +1 ./tmp/filtered_gene_orientation.tsv | sort -d > ./tmp/filtered_gene_orientation_s.tsv
tail -n +1 ./tmp/Glycine_max_TAGs_coverage30_identity30.tsv | sort -d > ./tmp/Glycine_max_TAGs_coverage30_identity30_s.tsv
join -t$'\t' -1 1 -2 1 ./tmp/filtered_gene_orientation_s.tsv ./tmp/Glycine_max_TAGs_coverage30_identity30_s.tsv > ./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv
#+end_src
#+RESULTS:
#+begin_src bash
head ./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv
#+end_src
#+RESULTS:
| GLYMA_01G000100 | -1 | 5506 | - | - |
| GLYMA_01G000200 | -1 | spacer1 | - | - |
| GLYMA_01G000300 | 1 | spacer2 | - | - |
| GLYMA_01G000400 | -1 | 146 | - | 1 |
| GLYMA_01G000500 | 1 | spacer3 | - | - |
| GLYMA_01G000600 | 1 | 146 | - | 1 |
| GLYMA_01G000700 | 1 | spacer4 | - | - |
| GLYMA_01G000800 | 1 | 3898 | - | - |
| GLYMA_01G000900 | 1 | 7954 | - | - |
| GLYMA_01G001000 | 1 | 7875 | - | - |
#+RESULTS:
#+begin_src R :session *R* :colnames yes
tag_df = read.table("./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv",
sep="\t")
colnames(tag_df) <- c("gene", "orientation", "family", "tag0", "tag1")
head(tag_df)
#+end_src
#+RESULTS:
| gene | orientation | family | tag0 | tag1 |
|-----------------+-------------+---------+------+------|
| GLYMA_01G000100 | -1 | 5506 | - | - |
| GLYMA_01G000200 | -1 | spacer1 | - | - |
| GLYMA_01G000300 | 1 | spacer2 | - | - |
| GLYMA_01G000400 | -1 | 146 | - | 1 |
| GLYMA_01G000500 | 1 | spacer3 | - | - |
| GLYMA_01G000600 | 1 | 146 | - | 1 |
We perform a Fischer test on the contingency table:
| | Coherent | Convergent | Divergent |
|---------+----------+------------+-----------|
| pair TAG0 | | | |
| pair not TAG0 | | | |
* TODO Are TAG pairs different in age from non-TAG pairs?
* TODO Are genes inside a TAG orientated in the same way more often than random?
#+begin_src R :session *R*
get_strand <- function(gene) {
return(
tag_df[tag_df[,"gene"] == gene, "orientation"]
)
}
get_convergence <- function(gene_a, gene_b) {
strand_a <- get_strand(gene_a)
strand_b <- get_strand(gene_b)
if (strand_a == strand_b) {
return('coherent')
} else if (strand_a == -1) {
return('divergent')
} else if (strand_a == 1) {
return('convergent')
} else {
message("Error: I do not understand why this does not fall in these case (coherent, divergent, convergent, what else?)")
}
}
#+end_src
#+RESULTS:
#+begin_src R :session *R* :colnames yes :rownames yes
contingency_table <- matrix(0, nrow=2, ncol=3)
colnames(contingency_table) <- c("coherent", "convergent", "divergent")
rownames(contingency_table) <- c("tag", "nottag")
definition <- 0
for (i in 1:(nrow(tag_df) - (definition + 1))) {
j <- i + 1
gene_i <- tag_df[i, "gene"]
gene_j <- tag_df[j, "gene"]
convergence <- get_convergence(gene_i, gene_j)
if (tag_df[i, "tag0"] == "-" || tag_df[j, "tag0"] == "-") {
category <- "nottag"
} else if (tag_df[i, "tag0"] == tag_df[j, "tag0"]) {
category <- "tag"
} else {
category <- "nottag"
}
contingency_table[category, convergence] <- contingency_table[category, convergence] + 1
}
contingency_table
#+end_src
#+RESULTS:
| | coherent | convergent | divergent |
|--------+----------+------------+-----------|
| tag | 3353 | 342 | 361 |
| nottag | 25699 | 12926 | 12907 |
#+RESULTS:
: org_babel_R_eoe
#+begin_src R :session *R* :results output
fisher.test(contingency_table, workspace=2e6)
#+end_src
#+RESULTS:
:
: Fisher's Exact Test for Count Data
:
: data: contingency_table
: p-value < 2.2e-16
: alternative hypothesis: two.sided
\(p\)-value < 2.2e-16, so at level $\alpha = 0.05$, we reject the null hypothesis: the orientation of a pair of gene is significantly different between TAG and non TAG pairs.
More specifically, TAG genes tends to be more coherent than non TAG genes.
* Does /Glycine max/ have big TAGs and which function are the big TAG gene implied?
<2024-12-26 Thu>
Uses Gprofiler2
#+begin_src R :session *R* :results silent
library(gprofiler2)
#+end_src
First we want to extract the list of genes belonging to the largest TAG:
We once again focus on the least stringent dataset, with the TAG₀ definition.
#+begin_src R :session *R*
tag_df_tag <- tag_df[tag_df["tag1"] != "-",]
tag1_count <- table(tag_df_tag["tag1"])
largest_tag1 <- names(tag1_count[which(tag1_count == max(tag1_count))])
#+end_src
#+RESULTS:
: 525
#+begin_src R :session *R*
largest_tag1_genes <- tag_df_tag[tag_df_tag["tag1"] == largest_tag1, "gene"]
gostres <- gost(query=largest_tag1_genes, organism="gmax")
head(gostres$result)
#+end_src
#+RESULTS:
#+begin_src R :session *R* :results graphics file :file results/plots/gprofiler_tag0579_enrichment_plot.png :width 8 :height 8 :res 200 :units cm
p <- gostplot(gostres, capped=FALSE, interactive=FALSE)
p
#+end_src
#+RESULTS:
[[file:results/plots/gprofiler_tag0579_enrichment_plot.png]]
Our genes are mostly involved in auxin related signaling pathways, and response to hormones.
#+begin_example
term_name
1 regulation of cellular respiration
2 regulation of generation of precursor metabolites and energy
3 cellular respiration
4 energy derivation by oxidation of organic compounds
5 generation of precursor metabolites and energy
6 regulation of cellular metabolic process
#+end_example
* TAG size in function of TAG definition
<2024-12-27 Fri>
#+begin_src bash
./rust/tagfinder/target/release/tagfinder --families ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv --positions ./tmp/Glycine_max.gene_positions.tsv --definitions 0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,15,16,17,18,19,20,25,30,35,40,45,50 > "./tmp/Glycine_max_TAGs_coverage30_identity30_v2.tsv"
#+end_src
#+RESULTS:
#+begin_src R :session *R* :results graphics file :file results/plots/nb_TAG_against_definition.png :width 8 :height 8 :res 200 :units cm :exports both
library(ggplot2)
library(scales)
data <- read.table("./tmp/Glycine_max_TAGs_coverage30_identity30_v2.tsv", na.strings=c("-"), header=TRUE)
definitions <- c(0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,15,16,17,18,19,20,25,30,35,40,45,50)
nb_TAGs <- sapply(definitions, function(definition) {
tags <- data[,paste0("tag", definition)]
tags <- tags[! is.na(tags)]
return(max(tags))
})
data <- data.frame(list(definition=definitions, nb=nb_TAGs))
theme_set(theme_gray(base_size = 8))
gg <- ggplot(data, aes(x=definition, y=nb))
gg <- gg + geom_point()
gg <- gg + scale_x_continuous("Definition")
gg <- gg + scale_y_continuous("TAGs")
gg <- gg + ggtitle("Number of TAGs for a TAG definition")
gg <- gg + theme(plot.title = element_text(hjust = 0.5))
gg
#+end_src
#+RESULTS:
[[file:results/plots/nb_TAG_against_definition.pdf]]
* Computing some statistics on a selection of plants
<2024-12-29 Sun>
We want to extract the following statistics for the whole set of plant on eCampus:
- number of families
- number of duplicate genes
- number of singletons
- number of TAG0
We have to filter out some genes that are from Mitochondria or Chloroplasts (for /Oryza sativa ssp. Japonica/ for instance).
First we extract the protein ID that corresponds to mitochondrial or chloroplastic genes.
#+begin_src bash
fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Oryza_sativa.IRGSP-1.0.pep.all.fa.gz"
zcat "${fasta}" | awk '
/^>/ {
locus=$3
split(":", arr, locus)
chromosome=arr[3]
if (chromosome == "Mt" || chromosome == "Pt" || chromosome ~ /^supercontig*/) {
protein=$1
gsub(">", "", protein)
print protein
}
}' > ./tmp/Oryza_sativa_faulty_proteins.list
#+end_src
#+RESULTS:
#+begin_src bash
head ./tmp/Oryza_sativa_faulty_proteins.list
#+end_src
#+RESULTS:
Then, remove all BLASTp hits that matches a protein ID in either query or subject.
We include this step in a Nextflow workflow. It does not compute the BLASTp all against all, It does rather use the TSV file provided on eCampus.

View File

@ -15,7 +15,6 @@ fn main() {
let mut parts = line.split("\t");
let gene = parts.next().unwrap().to_string();
let family = parts.next().unwrap().to_string();
println!("{}", family);
if family != family_index {
family_index = family;
if family_genes.len() > 1 {
@ -23,7 +22,9 @@ fn main() {
}
family_genes.clear();
}
family_genes.push(gene);
if gene != "" {
family_genes.push(gene);
}
}
if family_genes.len() > 1 {
print_gene_pairs(&family_genes);

3
workflow/.gitignore vendored
View File

@ -1,2 +1 @@
.nextflow.log*
.nextflow*

View File

@ -8,16 +8,22 @@ for gene_id in ${gene_id_1} ${gene_id_2}; do
awk -v gene_id="${gene_id}" '
BEGIN {
on_gene=0
visited=0
}
/^[^>]/ && on_gene == 1 {
print $0
}
/^>/ {
if (visited == 1) {
exit
}
gene = $4
gsub("gene:", "", gene)
if (gene == gene_id) {
on_gene=1
print $0
visited=1
} else {
on_gene=0
}

View File

@ -1,35 +1,28 @@
#!/usr/bin/env bash
# Bash version of the Nextflow script that should run faster
output_file="Glycine_max_duplicate_genes_KaKs.tsv"
proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.cds.all.fa"
set -euo pipefail
mkdir -p tmp
NB_THREADS=16
output_file="Glycine_max_duplicate_genes_KaKs_v3.csv"
proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/cds_filtered.fa"
input_file="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.csv"
echo "seqfile = ./tmp/cds.ali.phy
outfile = ./tmp/yn
verbose = 0
icode = 0
weighting = 0
commonf3x4 = 0" > yn00.ctl
compute_kaks() {
local gene_id_1
local gene_id_2
gene_id_1=$1
gene_id_2=$2
gene_id_1="${1}"
gene_id_2="${2}"
# Extract the proteins
./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${proteome_fasta} "./tmp/prot.fst"
../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${proteome_fasta}" "./prot.fst"
# Extract the CDS
./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${cds_fasta} "./tmp/cds.fst"
../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${cds_fasta}" "./cds.fst"
# Run clustalw2
clustalw2 -quiet -align -infile="./tmp/prot.fst" -outfile="./tmp/prot.ali.aln"
clustalw2 -quiet -align -infile="./prot.fst" -outfile="./prot.ali.aln"
# Run Pal2Nal
pal2nal.pl "./tmp/prot.ali.aln" "./tmp/cds.fst" -output paml > "./tmp/cds.ali.phy"
pal2nal.pl "./prot.ali.aln" "./cds.fst" -output paml >"./cds.ali.phy"
# Run yn00
yn00
@ -55,22 +48,49 @@ compute_kaks() {
skip -= 1
}
}
' "./tmp/yn"
' "./yn"
}
echo "gene_a,gene_b,ka,ks" > "${output_file}"
input="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.tsv"
n=$(wc -l "${input}")
i=0
while IFS= read -r line
do
echo $i / $n
gene_a=$(echo $line | awk '{print $1}')
gene_b=$(echo $line | awk '{print $2}')
i=$(expr $i + 1)
kaks=$(compute_kaks $gene_a $gene_b | tail -1 | awk 'NF == 2')
if [[ ! -z $kaks ]]
then
arr=(${kaks//\t/ })
echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >> "${output_file}"
fi
done < "${input}"
thread() {
local thread_id="${1}"
local start="${2}"
local end="${3}"
mkdir -p "tmp/${thread_id}"
pushd "tmp/${thread_id}" || return
echo "seqfile = ./cds.ali.phy
outfile = ./yn
verbose = 0
icode = 0
weighting = 0
commonf3x4 = 0" >yn00.ctl
echo "" >"part.csv"
for line_index in $(seq $start $end); do
line=$(head -n "${line_index}" "${input_file}" | tail -n 1)
gene_a=$(echo "${line}" | awk '{print $1}')
gene_b=$(echo "${line}" | awk '{print $2}')
kaks=$(compute_kaks "${gene_a}" "${gene_b}" | tail -1 | awk 'NF == 2')
if [[ ! -z $kaks ]]; then
arr=(${kaks//\t/ })
echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >>"part.csv"
fi
done
popd || return
}
main() {
part=$(($(cat "${input_file}" | wc -l) / ${NB_THREADS}))
for thread_id in $(seq 0 $NB_THREADS); do
start=$((thread_id * part))
end=$(($((thread_id + 1)) * part))
thread $thread_id $start $end &
done
wait
echo "gene_a,gene_b,ka,ks" >"${output_file}"
for thread_id in $(seq 0 $NB_THREADS); do
cat ./tmp/$thread_id/part.csv >>"${output_file}"
done
}
main