Update project

2025-01-18 15:55:12 +01:00 · 2025-01-18 15:55:12 +01:00 · 5ad94110d7
parent 33e898da2e
commit 5ad94110d7
11 changed files with 681 additions and 112 deletions
--- a/docs/presentation/main.tex
+++ b/docs/presentation/main.tex
@ -0,0 +1,86 @@
+\documentclass{beamer}
+\usepackage{booktabs}
+
+\title{Duplicate Genes \& Tandemly Arrayed Genes in \textit{Glycine max} (soy)}
+\subtitle{First results}
+\author{Naïa Périnelle \and Samuel Ortion}
+\institute{Université d'Évry Paris-Saclay}
+\date{2024-11-15}
+
+\begin{document}
+
+\frame{\titlepage}
+
+\begin{frame}{Cultivated soy plant: \textit{Glycine max}}
+A species of legume native to East Asia.
+{
+\centering
+\includegraphics{media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg}
+\vspace{1em}
+% \includegraphics[width=0.5\textwidth]{media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png}\footnote{Gepts et al. 2005}
+\includegraphics[width=0.25\textwidth]{media/Soybeanvarieties.jpeg}
+}
+
+
+\end{frame}
+
+\begin{frame}{Soybean industrial interest}
+\begin{itemize}
+    \item 353 million tonnes of soybean produced in 2020 
+    \item Cattle feed 
+    \item Human food
+\end{itemize}
+
+\end{frame}
+
+\begin{frame}{\textit{Glycine max} genome statistics}
+    \begin{itemize}
+        \item 20 chromosomes
+        \item 55897 protein coding genes (including supercontigs)
+        \item 55589 protein coding genes (excluding supercontigs)
+        \item 88412 protein isoforms
+    \end{itemize}
+\end{frame}
+
+\begin{frame}{Datasets filter criteria and pipeline steps}
+    \begin{tabular}{ccc}
+    \toprule
+    dataset stringency & low & high \\
+    \midrule
+    coverage & > 30\% & > 40\% \\
+    identity & > 30\% & > 50\% \\
+    \bottomrule
+    \end{tabular}
+    \only<2->{
+    
+    Steps:
+    \begin{enumerate}
+        \item<2-> Keep the longest isoform protein sequence per gene,
+        \item<3-> Run BLASTp ``all against all'' on the proteome,
+        \item<4-> Remove proteins of supercontigs,
+        \item<5-> Filter the HSP\footnote{High-scoring Segment Pair} based on the dataset criteria  (coverage and identity percentages),
+        \item<6-> Run Markov Clustering (\texttt{mcl} with default parameters) on the homology graph built with the highest \texttt{bitscore} values per homologous genes pairs,
+        \item<7-> Detect \texttt{Tandemly Arrayed Genes} with a Rust program based on gene positions and gene families.
+    \end{enumerate}
+    }
+\end{frame}
+
+\begin{frame}{Gene families size}
+    \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage30_identity30.pdf}
+    \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage40_identity50.pdf}
+\end{frame}
+
+\begin{frame}{Duplicate genes statistics}
+    \begin{tabular}{lcc}
+        dataset stringency & low & high \\
+        \toprule
+        number of duplicate genes & 50254 (89.9\%) & 46769 (83.7\%) \\ 
+        number of families & 8426 & 11997 \\
+        number of singletons & 5643 (10.1\%) & 9128 (16.3\%) \\
+        number of TAG\textsubscript{0} & 3208 & 2500 \\
+        number of TAG\textsubscript{1} & 3481 & 2652 \\
+        \bottomrule
+    \end{tabular}
+\end{frame}
+
+\end{document}
--- a/docs/presentation/media/Glycine_max_family_size_hist_coverage30_identity30.pdf
+++ b/docs/presentation/media/Glycine_max_family_size_hist_coverage30_identity30.pdf
--- a/docs/presentation/media/Glycine_max_family_size_hist_coverage40_identity50.pdf
+++ b/docs/presentation/media/Glycine_max_family_size_hist_coverage40_identity50.pdf
--- a/docs/presentation/media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg
+++ b/docs/presentation/media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg
--- a/docs/presentation/media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png
+++ b/docs/presentation/media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png
--- a/docs/presentation/media/Soybeanvarieties.jpeg
+++ b/docs/presentation/media/Soybeanvarieties.jpeg
--- a/notebook.org
+++ b/notebook.org
@ -1,9 +1,11 @@
+# -*- org-export-use-babel: nil -*-
 #+title: Comparative Genomics Project
 #+subtitle: Duplicate Genes in /Glycine max/
 #+date: 2024-2025
 #+author: Samuel Ortion
 #+LATEX_CLASS: scrartcl
 #+LATEX_HEADER: \titlehead{M2 GENIOMHE}
+#+LATEX_HEADER: \usepackage{mus}

 * General infos

@ -48,15 +50,13 @@ min_identity=50
 /Glycine max/ is the soy plant.

 * Download the source data
+- https://plants.ensembl.org/Glycine_max/Info/Index
+- https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-60/fasta/glycine_max/pep/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz

-https://plants.ensembl.org/Glycine_max/Info/Index
+* Duplicate gene families
+** Develop the Nextflow workflow

-https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-60/fasta/glycine_max/pep/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz
-
-
-* Develop the Nextflow workflow
-
-** Filtering the proteome to keep the longest isoform
+*** Filtering the proteome to keep the longest isoform

 I used three awk families_and_TAGs/scripts to filter the FASTA file to keep only the longest isoforms:
 1. =protein_lengths.awk= associate each FASTA record with the length of its sequence,
@ -89,19 +89,19 @@ zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \
 #+RESULTS:
 : 55897

-** Filtering out Chloroplasts and Mitochondria proteins
+*** Filtering out Chloroplasts and Mitochondria proteins

 From the Ensembl Plants index page we can see that there is no chloroplast nor mitochondria proteins in the sequenced genome of /Glycine max/.

-** Filter out supercontigs
+*** Filter out supercontigs

 <2024-10-28 Mon>

-#+include: workflow/families_and_TAGs/scripts/remove_supercontigs.awk src awk
+#+include: workflow/families_and_TAGs/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk src awk

 #+begin_src bash
 zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \
-    | awk -f ./workflow/families_and_TAGs/scripts/remove_supercontigs.awk \
+    | awk -f ./workflow/families_and_TAGs/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk \
    > ./tmp/Glycine_max.Glycine_max_v2.1.pep.all.nosupercontig.fa
 #+end_src

@ -143,7 +143,7 @@ expr $ALL_ISOFORMS - $NO_SUPERCONTIGS
 #+RESULTS:
 : 435

-** BLASTp All Against All
+*** BLASTp All Against All

 #+begin_src bash
 makeblastdb -in "$proteome" -dbtype prot -out 'tmp/blastdb' -title "Glycine max"
@ -153,7 +153,7 @@ makeblastdb -in "$proteome" -dbtype prot -out 'tmp/blastdb' -title "Glycine max"
 blastp -query "$proteome" -db 'tmp/blastdb' -outfmt 6 -out "tmp/Glycine_max.blastp.tsv"
 #+end_src

-** Clustering
+*** Clustering

 Using =mcl=.

@ -182,8 +182,7 @@ nextflow run main.nf -profile conda -resume

 from the =./workflow/= folder

-
-* From the given BLASTp format 6 file
+*** From the given BLASTp format 6 file
 <2024-11-02 Sat>
 /How many BLASTp HSP have been reported?/
 #+begin_src bash
@ -193,7 +192,7 @@ cat ../data/Glycine_max_Blastp_longIsoforme | wc -l
 #+RESULTS:
 : 4456730

-** Remove HSP on proteins found on supercontigs
+*** Remove HSP on proteins found on supercontigs

 Extract the isoform ID whose gene is on supercontigs.
 #+begin_src bash
@ -213,7 +212,7 @@ Remove HSP with subject or query being on a supercontig

 =filter_blastp_sequence_id.awk=

-#+include: workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk src awk
+#+include: workflow/families_and_TAGs/scripts/filter_blastp_sequence_id_remove.awk src awk

 #+begin_src bash
 awk -f workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk ./tmp/protein_on_supercontigs.list ../data/Glycine_max_Blastp_longIsoforme > ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig
@ -241,7 +240,7 @@ cat ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig | wc -l
 : 4420132


-** Filter by coverage and identity percentage
+*** Filter by coverage and identity percentage

 *** Add protein length and gene name columns

@ -312,7 +311,7 @@ cat ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.tsv | wc -l
 #+RESULTS:
 : 342180

-** Clustering
+*** Clustering
 <2024-11-03 Sun>
 *** Extract the homology graph

@ -345,6 +344,13 @@ head -n 1 "tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc"
 #+RESULTS:
 | GLYMA_10G098500 | GLYMA_U008600 | 154 |

+#+begin_src bash
+wc -l "./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc"
+#+end_src
+
+#+RESULTS:
+: 1520850 ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc
+
 *** Filter the homology graph to keep only the heaviest weight


@ -475,7 +481,7 @@ What is the interval of duplicate gene count? There are between 48109 and 49391

 What is the number of families?

-#+begin_src bash
+#+begin_src bash :exports both
 tsv="tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv"
 cut -f2 "${tsv}" | sort | uniq | wc -l
 #+end_src
@ -483,7 +489,7 @@ cut -f2 "${tsv}" | sort | uniq | wc -l
 #+RESULTS:
 : 8426

-#+begin_src bash
+#+begin_src bash :exports both
 tsv="tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl.tsv"
 cut -f2 "${tsv}" | sort | uniq | wc -l
 #+end_src
@ -508,10 +514,9 @@ expr $ALL - $DUPLICATED

 #+RESULTS: singletons-high-stringency
 : 9128
-
 * Amount of Tandemly Arrayed Genes

-*** Extract gene position from a GFF3 file
+** Extract gene position from a GFF3 file

 #+include: workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk src awk

@ -540,7 +545,7 @@ head "./tmp/Glycine_max.gene_positions.tsv"
 | GLYMA_01G001000 | 1 | 196256 | 201895 |

 #+name: gene-positions-count
-#+begin_src bash
+#+begin_src bash :exports both
 cat "./tmp/Glycine_max.gene_positions.tsv" \
    | wc -l
 #+end_src
@ -557,7 +562,7 @@ expr $GENES - $POSITIONS
 #+RESULTS:
 : 308

-*** Extract TAGs
+** Extract TAGs

 #+name: extract-TAGs
 #+begin_src bash
@ -593,20 +598,20 @@ head "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv"
 #+end_src

 #+RESULTS:
-| gene            | family | tag0 | tag1 |
-| GLYMA_07G257400 |   3109 | -    | -    |
-| GLYMA_17G203900 |   2067 | -    | -    |
-| GLYMA_20G062000 |    509 | -    | -    |
-| GLYMA_16G099400 |     40 | -    | -    |
-| GLYMA_14G045400 |   1136 | 804  | 896  |
-| GLYMA_11G189200 |     15 | -    | -    |
-| GLYMA_02G191500 |   1310 | -    | -    |
-| GLYMA_02G308200 |    520 | -    | -    |
-| GLYMA_14G205200 |      8 | -    | -    |
+| gene            |     family | tag0 | tag1 |
+| GLYMA_19G258600 | spacer3040 | -    | -    |
+| GLYMA_08G277500 |         68 | -    | -    |
+| GLYMA_08G342200 |        363 | 2467 | 2749 |
+| GLYMA_08G072200 |       3499 | 2317 | 2592 |
+| GLYMA_15G143500 | spacer1810 | -    | -    |
+| GLYMA_09G191100 |       3783 | -    | -    |
+| GLYMA_02G090200 |       7931 | -    | -    |
+| GLYMA_17G091400 |        584 | -    | -    |
+| GLYMA_13G199100 |         79 | -    | 621  |

 How many TAGs for definition 0?

-#+begin_src bash
+#+begin_src bash :exports both
 awk 'BEGIN {
           max=0
    }
@ -622,11 +627,11 @@ awk 'BEGIN {
 #+end_src

 #+RESULTS:
-: 3208
+: 2620

 For TAG1?

-#+begin_src bash
+#+begin_src bash :exports both
 awk 'BEGIN {
           max=0
    }
@ -642,10 +647,10 @@ awk 'BEGIN {
 #+end_src

 #+RESULTS:
-: 3481
+: 2916


-#+begin_src bash
+#+begin_src bash :exports both
 awk 'BEGIN {
           max=0
    }
@ -661,7 +666,7 @@ awk 'BEGIN {
 #+end_src

 #+RESULTS:
-: 2500
+: 2157


 #+begin_src bash
@ -680,16 +685,16 @@ awk 'BEGIN {
 #+end_src

 #+RESULTS:
-: 2652
+: 2438


-*** Size of the greatest TAG
+** Size of the greatest TAG

-**** High stringency
+*** High stringency

 /TAG₀/

-#+begin_src bash
+#+begin_src bash :exports both
 awk '
    {
    tag_index=$3
@ -710,11 +715,11 @@ awk '
 #+end_src

 #+RESULTS:
-: 421 53
+: 2134 32

 /TAG₁/

-#+begin_src bash
+#+begin_src bash :exports both
 awk '
    {
    tag_index=$4
@ -735,14 +740,14 @@ awk '
 #+end_src

 #+RESULTS:
-: 459 76
+: 416 43

-**** Low stringency
+*** Low stringency


 /TAG₀/

-#+begin_src bash
+#+begin_src bash :exports both
 awk '
    {
    tag_index=$3
@ -763,11 +768,11 @@ awk '
 #+end_src

 #+RESULTS:
-: 579 76
+: 2591 32

 /TAG₁/

-#+begin_src bash
+#+begin_src bash :exports both
 awk '
    {
    tag_index=$4
@ -788,9 +793,37 @@ awk '
 #+end_src

 #+RESULTS:
-: 646 76
+: 525 43

-* DONE Complete slides
+
+** Amount of TAGs
+
+How many TAG genes are there for low and high stringency datasets?
+
+#+name: count-tags
+#+begin_src bash
+awk 'BEGIN { count0=0; count1; } { if ($3 != "-") count0++; if($4 != "-") count1++; } END { print count0, count1 }' $tags
+#+end_src
+
+#+RESULTS: count-tags
+
+#+begin_src bash :noweb strip-export
+tags=./tmp/Glycine_max_TAGs_coverage30_identity30.tsv
+<<count-tags>>
+#+end_src
+
+#+RESULTS:
+: 6680 7876
+
+#+begin_src bash :noweb strip-export
+tags=./tmp/Glycine_max_TAGs_coverage40_identity50.tsv
+<<count-tags>>
+#+end_src
+
+#+RESULTS:
+: 5397 6480
+
+* Complete slides
 <2024-11-23 Sat>

 * Family sizes
@ -812,16 +845,16 @@ head -1 ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl \
 : 159


-* TODO $K_s$ For all duplicated pairs within a family
+* $K_s$ For all duplicated pairs within a family

 Let $A$ be a family of $n$ duplicate members. The number of duplicate pairs is $n(n-1) / 2$.

-PAML package:
+Install PAML, Pal2Nal and Clustal packages with conda:
 #+begin_src bash
 conda install paml pal2nal clustalw -c bioconda
 #+end_src

-** Use a Nextflow workflow
+** Testing each step
 <2024-12-22 Sun>

 *** All pair of duplicate genes
@ -831,16 +864,14 @@ The algorithm is simple.
 We assume the gene-families mapping file is sorted by family index.
 For each family, loop for i from 0 to the size of the family minus one, loop for j from i plus one to the size of the family and output the pair gene i - gene j.

-
 Example:
-
-#+begin_src bash
+#+begin_src bash :exports both
 cat ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv | ./rust/pairs/target/release/pairs > ./tmp/Glycine_max_duplicate_gene_pairs.tsv
 head ./tmp/Glycine_max_duplicate_gene_pairs.tsv
 #+end_src

 #+RESULTS:
-| GLYMA_07G053000   | GLYMA_10G247700 |
+| GLYMA_07G053000 | GLYMA_10G247700 |
 | GLYMA_07G053000 | GLYMA_12G188000 |
 | GLYMA_07G053000 | GLYMA_07G159200 |
 | GLYMA_07G053000 | GLYMA_13G313300 |
@ -862,9 +893,7 @@ gene_id_1="GLYMA_07G053000"
 gene_id_2="GLYMA_10G247700"
 #+end_src

-CDS:
-
-
+- CDS:
 #+name: pair-gene-cds-extract
 #+begin_src bash :noweb strip-export
 output_file="./tmp/${gene_id_1}_${gene_id_2}.cds.fst"
@ -900,7 +929,7 @@ done

 #+RESULTS:

-Proteins:
+- Proteins:

 #+name: pair-gene-protein-extract
 #+begin_src bash :noweb strip-export
@ -930,7 +959,6 @@ done

 #+RESULTS: pair-gene-protein-extract

-
 #+begin_src bash :noweb strip-export
 <<pair-gene-id>>
 <<pair-gene-protein-extract>>
@ -956,7 +984,6 @@ coding_sequence="./tmp/${gene_id_1}_${gene_id_2}.cds.fst"
 pal2nal "${protein_alignment}" "${coding_sequence}" -output paml > "./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy"
 #+end_src

-
 /YN00 Ka-Ks computation/

 #+begin_src bash :noweb strip-export
@ -968,7 +995,7 @@ yn00

 /Extraction of Ka-Ks values/

-#+begin_src bash :noweb strip-export
+#+begin_src bash :noweb strip-export :exports both
 <<pair-gene-id>>
 yn_file="./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy.yn"
 awk '
@ -1000,10 +1027,440 @@ awk '
 *** Run the Nextflow workflow
 <2024-12-23 Mon>

+The  Nextflow workflow is stuck on the first steps, because the channel containing the queue of jobs for each gene pair is too huge in memory.

-* TODO Does /Glycine max/ have big TAGs and which function are the big TAG gene implied?
+*** Back to basics
+
+I wrote a bash script that compute the Ka/Ks not loading the whole pairs file in memory.
+
+<2024-12-26 Thu> The script is still running.
+
+I did not handle the case where several coding sequences were present in the CDS file for the same gene, so the script ignores the proteins with several isoforms.
+
+Let us filter the CDS to match the proteome file:
+
+#+begin_src bash
+awk '/^>/ { gsub(">", "", $1); print $1 }' ./tmp/proteome_filtered.fa > ./tmp/proteome_filtered.list
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash
+cat ./tmp/proteome_filtered.list | wc -l
+#+end_src
+
+#+RESULTS:
+: 55589 ./tmp/proteome_filtered.list
+
+#+begin_src bash
+awk -f ./workflow/families_and_TAGs/scripts/filter_records_fasta.awk ./tmp/proteome_filtered.list ../data/Glycine_max.Glycine_max_v2.1.cds.all.fa > ./tmp/cds_filtered.fa
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash :exports both
+grep -c "^>" ./tmp/cds_filtered.fa
+#+end_src
+
+#+RESULTS:
+: 55589
+
+* Are TAG pairs different in age from non-TAG pairs ?
+
+Run the bash script [[./workflow/KaKs/ugly.sh]].
+
+<2025-01-07 Tue>
+Computation ended after about a week.
+
+<2025-01-08 Wed>
+
+#+begin_src bash
+awk -F',' '$4 < 1' ./workflow/KaKs/results/complete_uniq.csv > ./workflow/KaKs/results/complete_uniq_ks_below1.csv
+cat ./workflow/KaKs/results/complete_uniq_ks_below1.csv| wc -l
+#+end_src
+
+#+RESULTS:
+: 294226
+
+#+begin_src R :session *R* :results graphics file :file results/plots/Glycine_max_Ks_distribution.png :width 8 :height 8 :res 200 :units cm
+library(ggplot2)
+library(scales)
+
+data <- read.table("./workflow/KaKs/complete2.csv", sep=",", header=TRUE)
+colnames(data) <- c("gene_a","gene_b","ka","ks")
+
+theme_set(theme_gray(base_size=8))
+gg <- ggplot(data, aes(x=ks))
+gg <- gg + geom_density()
+gg <- gg + scale_x_continuous("Age (Ks)")
+gg <- gg + scale_y_continuous("Density")
+gg <- gg + xlim(0, 1)
+gg <- gg + ggtitle("Proportion of duplicate gene pairs age")
+gg <- gg + theme(plot.title=element_text(hjust=0.5))
+gg
+#+end_src
+
+#+RESULTS:
+[[file:results/plots/Glycine_max_Ks_distribution.png]]
+
+#+caption: Density of pair with a given Ks. (Inspired from Blanc and Wolfe, 2004)
+
+#+RESULTS:
+[[file:results/plots/Glycine_max_Ks_distribution.png]]
+
+Then, let us plot the same data separating TAG from non-TAG pairs.
+
+#+begin_src R :session *R* :results silent
+data <- read.csv("./workflow/KaKs/complete2.csv")
+colnames(data) <- c("gene_a", "gene_b", "ka", "ks")
+tag_df2 <- read.table("./tmp/Glycine_max_TAGs_coverage30_identity30.tsv", header=TRUE)
+tag_def <- "tag1"
+data$category <- apply(data, 1, function(row) {
+  gene_a <- row["gene_a"]
+  gene_b <- row["gene_b"]
+  tag_a <- tag_df2[tag_df2["gene"] == gene_a, tag_def]
+  tag_b <- tag_df2[tag_df2["gene"] == gene_b, tag_def]
+  # Filter out pairs tag character(0)
+    if (length(tag_a) == 0 || length(tag_b) == 0) {
+        return("notag")
+    } else if (tag_a == "-" || tag_b == "-") {
+        return("notag")
+    } else if (tag_a == tag_b) {
+        return("tag")
+    } else {
+        return("notag")
+    }
+})
+#+end_src
+
+#+RESULTS:
+
+#+begin_src R :session *R* :results graphics file :file results/plots/ks_density_tag_and_not_tag.png :width 8 :height 8 :res 200 :units cm
+library(ggplot2)
+library(scales)
+theme_set(theme_gray(base_size = 6))
+gg <- ggplot(data)
+gg <- gg + geom_density(mapping=aes(x=ks, color=category))
+gg <- gg + scale_x_discrete("Age (Ks)")
+gg <- gg + scale_y_continuous("Density")
+gg <- gg + xlim(0, 5)
+gg <- gg + ggtitle("Proportion of duplicate gene pairs age (TAG and non-TAG)")
+gg <- gg + theme(plot.title = element_text(hjust = 0.5))
+gg
+#+end_src
+
+#+RESULTS:
+[[file:results/plots/ks_density_tag_and_not_tag.png]]
+
+We will now test whether TAG gene duplication is more recent than non-TAG gene pairs.
+As the distributions of the Ks values is not normally distributed, I will not use the Student \(t\)-test, but the Wilcoxon-Mann-Whitney U test.
+
+The hypotheses are
+\[
+\begin{cases}
+(H_0) & \text{for randomly selected values $X$ and $Y$ from `non-TAG' and `TAG' respectively, the probability of $X$ to be greater than $Y$ is equal to the probability of $Y$ being greater than $X$} \\
+(H_1) & \text{for randomly selected values $X$ and $Y$ from `non-TAG' and `TAG' respectively, the probability of $X$ to be greater than $Y$ is greater than the probability of $Y$ being greater than $X$}
+\end{cases}
+\]
+
+#+begin_src R :session *R* :results output :exports both
+tag_ks <- data[data$category == "tag", "ks"]
+non_tag_ks <- data[data$category == "notag", "ks"]
+wilcox.test(non_tag_ks, tag_ks, alternative="greater")
+#+end_src
+
+#+RESULTS:
+:
+: 	Wilcoxon rank sum test with continuity correction
+:
+: data:  non_tag_ks and tag_ks
+: W = 358006116, p-value < 2.2e-16
+: alternative hypothesis: true location shift is greater than 0
+
+The \(p\)-value is lower than $0.05$ so at level \(\alpha = 5 \%\), we reject the null hypotheses. Given two duplicate gene pairs, the probably that a TAG gene pair duplication event occurred before the non-TAG gene pair duplication event is greater than the otherway around, remaining TAG genes tends to be more recently duplicated.
+
+* Are genes inside a TAG orientated in the same way more often than random?
+
+We did not keep the orientation of the genes in the TAGs file. Let extract the gene orientation first, and join this information to the TAGs file.
+We will do our analysis on the less stringent dataset. (coverage >30%, identity >30%).
+
+#+begin_src bash
+awk 'BEGIN {
+    OFS="\t"
+    }
+ /^>/ {
+          locus=$3
+          split(locus, arr, ":")
+          orientation=arr[6]
+          gene=$4
+          sub("gene:", "", gene)
+          print gene, orientation
+    }' ./tmp/proteome_filtered.fa > ./tmp/filtered_gene_orientation.tsv
+head ./tmp/filtered_gene_orientation.tsv
+#+end_src
+
+#+RESULTS:
+| GLYMA_01G141900 | -1 |
+| GLYMA_01G234000 | -1 |
+| GLYMA_01G157500 | -1 |
+| GLYMA_01G031500 | -1 |
+| GLYMA_01G132000 | -1 |
+| GLYMA_01G038100 | -1 |
+| GLYMA_01G184300 |  1 |
+| GLYMA_01G231700 |  1 |
+| GLYMA_01G031400 |  1 |
+| GLYMA_01G008100 |  1 |
+
+#+begin_src bash
+head ./tmp/Glycine_max
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash
+tail -n +1 ./tmp/filtered_gene_orientation.tsv | sort -d > ./tmp/filtered_gene_orientation_s.tsv
+tail -n +1 ./tmp/Glycine_max_TAGs_coverage30_identity30.tsv  | sort -d > ./tmp/Glycine_max_TAGs_coverage30_identity30_s.tsv
+join -t$'\t' -1 1 -2 1 ./tmp/filtered_gene_orientation_s.tsv ./tmp/Glycine_max_TAGs_coverage30_identity30_s.tsv > ./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash
+head ./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv
+#+end_src
+
+#+RESULTS:
+| GLYMA_01G000100 | -1 |    5506 | - | - |
+| GLYMA_01G000200 | -1 | spacer1 | - | - |
+| GLYMA_01G000300 |  1 | spacer2 | - | - |
+| GLYMA_01G000400 | -1 |     146 | - | 1 |
+| GLYMA_01G000500 |  1 | spacer3 | - | - |
+| GLYMA_01G000600 |  1 |     146 | - | 1 |
+| GLYMA_01G000700 |  1 | spacer4 | - | - |
+| GLYMA_01G000800 |  1 |    3898 | - | - |
+| GLYMA_01G000900 |  1 |    7954 | - | - |
+| GLYMA_01G001000 |  1 |    7875 | - | - |
+
+#+RESULTS:
+
+#+begin_src R :session *R* :colnames yes
+tag_df = read.table("./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv",
+                    sep="\t")
+colnames(tag_df) <- c("gene", "orientation", "family", "tag0", "tag1")
+head(tag_df)
+#+end_src
+
+#+RESULTS:
+| gene            | orientation | family  | tag0 | tag1 |
+|-----------------+-------------+---------+------+------|
+| GLYMA_01G000100 |          -1 | 5506    | -    | -    |
+| GLYMA_01G000200 |          -1 | spacer1 | -    | -    |
+| GLYMA_01G000300 |           1 | spacer2 | -    | -    |
+| GLYMA_01G000400 |          -1 | 146     | -    | 1    |
+| GLYMA_01G000500 |           1 | spacer3 | -    | -    |
+| GLYMA_01G000600 |           1 | 146     | -    | 1    |
+
+We perform a Fischer test on the contingency table:
+|         | Coherent | Convergent | Divergent |
+|---------+----------+------------+-----------|
+| pair TAG0 |          |            |           |
+| pair not TAG0 |          |            |           |


-* TODO Are TAG pairs different in age from non-TAG pairs?

-* TODO Are genes inside a TAG orientated in the same way more often than random?
+#+begin_src R :session *R*
+get_strand <- function(gene) {
+    return(
+        tag_df[tag_df[,"gene"] == gene, "orientation"]
+    )
+}
+get_convergence <- function(gene_a, gene_b) {
+    strand_a <- get_strand(gene_a)
+    strand_b <- get_strand(gene_b)
+    if (strand_a == strand_b) {
+        return('coherent')
+    } else if (strand_a == -1) {
+        return('divergent')
+    } else if (strand_a == 1) {
+        return('convergent')
+    } else {
+        message("Error: I do not understand why this does not fall in these case (coherent, divergent, convergent, what else?)")
+    }
+}
+#+end_src
+
+#+RESULTS:
+
+#+begin_src R :session *R* :colnames yes :rownames yes
+contingency_table <- matrix(0, nrow=2, ncol=3)
+colnames(contingency_table) <- c("coherent", "convergent", "divergent")
+rownames(contingency_table) <- c("tag", "nottag")
+definition <- 0
+for (i in 1:(nrow(tag_df) - (definition + 1))) {
+  j <- i + 1
+  gene_i <- tag_df[i, "gene"]
+  gene_j <- tag_df[j, "gene"]
+  convergence <- get_convergence(gene_i, gene_j)
+  if (tag_df[i, "tag0"] == "-" || tag_df[j, "tag0"] == "-") {
+    category <- "nottag"
+  } else if (tag_df[i, "tag0"] == tag_df[j, "tag0"]) {
+    category <- "tag"
+  } else {
+    category <- "nottag"
+  }
+  contingency_table[category, convergence] <- contingency_table[category, convergence] + 1
+}
+contingency_table
+#+end_src
+
+#+RESULTS:
+|        | coherent | convergent | divergent |
+|--------+----------+------------+-----------|
+| tag    |     3353 |        342 |       361 |
+| nottag |    25699 |      12926 |     12907 |
+
+
+#+RESULTS:
+: org_babel_R_eoe
+
+#+begin_src R :session *R* :results output
+fisher.test(contingency_table, workspace=2e6)
+#+end_src
+
+#+RESULTS:
+:
+: 	Fisher's Exact Test for Count Data
+:
+: data:  contingency_table
+: p-value < 2.2e-16
+: alternative hypothesis: two.sided
+
+
+\(p\)-value < 2.2e-16, so at level $\alpha = 0.05$, we reject the null hypothesis: the orientation of a pair of gene is significantly different between TAG and non TAG pairs.
+More specifically, TAG genes tends to be more coherent than non TAG genes.
+* Does /Glycine max/ have big TAGs and which function are the big TAG gene implied?
+<2024-12-26 Thu>
+
+Uses Gprofiler2
+
+#+begin_src R :session *R* :results silent
+library(gprofiler2)
+#+end_src
+
+First we want to extract the list of genes belonging to the largest TAG:
+
+We once again focus on the least stringent dataset, with the TAG₀ definition.
+#+begin_src R :session *R*
+tag_df_tag <- tag_df[tag_df["tag1"] != "-",]
+tag1_count <- table(tag_df_tag["tag1"])
+largest_tag1 <- names(tag1_count[which(tag1_count == max(tag1_count))])
+#+end_src
+
+#+RESULTS:
+: 525
+
+#+begin_src R :session *R*
+largest_tag1_genes <- tag_df_tag[tag_df_tag["tag1"] == largest_tag1, "gene"]
+gostres <- gost(query=largest_tag1_genes, organism="gmax")
+head(gostres$result)
+#+end_src
+
+#+RESULTS:
+
+#+begin_src R :session *R*  :results graphics file :file results/plots/gprofiler_tag0579_enrichment_plot.png :width 8 :height 8 :res 200 :units cm
+p <- gostplot(gostres, capped=FALSE, interactive=FALSE)
+p
+#+end_src
+
+#+RESULTS:
+[[file:results/plots/gprofiler_tag0579_enrichment_plot.png]]
+
+Our genes are mostly involved in auxin related signaling pathways, and response to hormones.
+
+#+begin_example
+                                                     term_name
+1                           regulation of cellular respiration
+2 regulation of generation of precursor metabolites and energy
+3                                         cellular respiration
+4          energy derivation by oxidation of organic compounds
+5               generation of precursor metabolites and energy
+6                     regulation of cellular metabolic process
+#+end_example
+
+* TAG size in function of TAG definition
+<2024-12-27 Fri>
+#+begin_src bash
+./rust/tagfinder/target/release/tagfinder --families  ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv --positions ./tmp/Glycine_max.gene_positions.tsv --definitions 0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,15,16,17,18,19,20,25,30,35,40,45,50 > "./tmp/Glycine_max_TAGs_coverage30_identity30_v2.tsv"
+#+end_src
+
+#+RESULTS:
+
+#+begin_src R :session *R*  :results graphics file  :file results/plots/nb_TAG_against_definition.png :width 8 :height 8 :res 200 :units cm :exports both
+library(ggplot2)
+library(scales)
+
+data <- read.table("./tmp/Glycine_max_TAGs_coverage30_identity30_v2.tsv", na.strings=c("-"), header=TRUE)
+
+definitions <- c(0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,15,16,17,18,19,20,25,30,35,40,45,50)
+
+nb_TAGs <- sapply(definitions, function(definition) {
+  tags <- data[,paste0("tag", definition)]
+  tags <- tags[! is.na(tags)]
+  return(max(tags))
+    })
+
+data <- data.frame(list(definition=definitions, nb=nb_TAGs))
+
+theme_set(theme_gray(base_size = 8))
+gg <- ggplot(data, aes(x=definition, y=nb))
+gg <- gg + geom_point()
+gg <- gg + scale_x_continuous("Definition")
+gg <- gg + scale_y_continuous("TAGs")
+gg <- gg + ggtitle("Number of TAGs for a TAG definition")
+gg <- gg + theme(plot.title = element_text(hjust = 0.5))
+                      gg
+#+end_src
+
+#+RESULTS:
+[[file:results/plots/nb_TAG_against_definition.pdf]]
+
+* Computing some statistics on a selection of plants
+
+<2024-12-29 Sun>
+
+We want to extract the following statistics for the whole set of plant on eCampus:
+- number of families
+- number of duplicate genes
+- number of singletons
+- number of TAG0
+
+
+We have to filter out some genes that are from Mitochondria or Chloroplasts (for /Oryza sativa ssp. Japonica/ for instance).
+
+First we extract the protein ID that corresponds to mitochondrial or chloroplastic genes.
+#+begin_src bash
+fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Oryza_sativa.IRGSP-1.0.pep.all.fa.gz"
+zcat "${fasta}" | awk '
+ /^>/ {
+     locus=$3
+     split(":", arr, locus)
+     chromosome=arr[3]
+     if (chromosome == "Mt" || chromosome == "Pt" || chromosome ~ /^supercontig*/) {
+        protein=$1
+        gsub(">", "", protein)
+        print protein
+     }
+    }' > ./tmp/Oryza_sativa_faulty_proteins.list
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash
+head ./tmp/Oryza_sativa_faulty_proteins.list
+#+end_src
+
+#+RESULTS:
+
+Then, remove all BLASTp hits that matches a protein ID in either query or subject.
+
+We include this step in a Nextflow workflow. It does not compute the BLASTp all against all, It does rather use the TSV file provided on eCampus.
--- a/rust/pairs/src/main.rs
+++ b/rust/pairs/src/main.rs
@ -15,7 +15,6 @@ fn main() {
        let mut parts = line.split("\t");
        let gene = parts.next().unwrap().to_string();
        let family = parts.next().unwrap().to_string();
-        println!("{}", family);
        if family != family_index {
            family_index = family;
            if family_genes.len() > 1 {
@ -23,7 +22,9 @@ fn main() {
            }                    
            family_genes.clear();
        }
-        family_genes.push(gene);
+        if gene != "" {
+            family_genes.push(gene);
+        }
    }
    if family_genes.len() > 1 {
        print_gene_pairs(&family_genes);
--- a/workflow/.gitignore
+++ b/workflow/.gitignore
@ -1,2 +1 @@
-.nextflow.log*
-
+.nextflow*
--- a/workflow/KaKs/scripts/extract_fasta_records.sh
+++ b/workflow/KaKs/scripts/extract_fasta_records.sh
@ -8,16 +8,22 @@ for gene_id in ${gene_id_1} ${gene_id_2}; do
    awk -v gene_id="${gene_id}" '
        BEGIN {
            on_gene=0
+            visited=0
        }
        /^[^>]/ && on_gene == 1 {
+            
            print $0
        }
        /^>/ {
+            if (visited == 1) {
+                    exit
+            }
            gene = $4
            gsub("gene:", "", gene)
            if (gene == gene_id) {
                on_gene=1
                print $0
+                visited=1
            } else {
                on_gene=0
            }
--- a/workflow/KaKs/standalone.sh
+++ b/workflow/KaKs/standalone.sh
@ -1,35 +1,28 @@
 #!/usr/bin/env bash
-
-# Bash version of the Nextflow script that should run faster
-output_file="Glycine_max_duplicate_genes_KaKs.tsv"
-proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
-cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.cds.all.fa"
-
 set -euo pipefail

-mkdir -p tmp
+NB_THREADS=16
+
+output_file="Glycine_max_duplicate_genes_KaKs_v3.csv"
+proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
+cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/cds_filtered.fa"
+input_file="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.csv"

-echo "seqfile = ./tmp/cds.ali.phy
-outfile = ./tmp/yn
-verbose = 0
-icode = 0
-weighting = 0
-commonf3x4 = 0" > yn00.ctl
 compute_kaks() {
    local gene_id_1
    local gene_id_2
-    gene_id_1=$1
-    gene_id_2=$2
+    gene_id_1="${1}"
+    gene_id_2="${2}"
    # Extract the proteins
-    ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${proteome_fasta} "./tmp/prot.fst"
+    ../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${proteome_fasta}" "./prot.fst"
    # Extract the CDS
-    ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${cds_fasta} "./tmp/cds.fst"
+    ../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${cds_fasta}" "./cds.fst"

    # Run clustalw2
-    clustalw2 -quiet -align -infile="./tmp/prot.fst" -outfile="./tmp/prot.ali.aln"
+    clustalw2 -quiet -align -infile="./prot.fst" -outfile="./prot.ali.aln"

    # Run Pal2Nal
-    pal2nal.pl "./tmp/prot.ali.aln" "./tmp/cds.fst" -output paml > "./tmp/cds.ali.phy"
+    pal2nal.pl "./prot.ali.aln" "./cds.fst" -output paml >"./cds.ali.phy"

    # Run yn00
    yn00
@ -55,22 +48,49 @@ compute_kaks() {
            skip -= 1
        }
    }
-    ' "./tmp/yn"
+    ' "./yn"
 }
-echo "gene_a,gene_b,ka,ks" > "${output_file}"
-input="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.tsv"
-n=$(wc -l "${input}")
-i=0
-while IFS= read -r line
-do
-    echo $i / $n
-    gene_a=$(echo $line | awk '{print $1}')
-    gene_b=$(echo $line | awk '{print $2}')
-    i=$(expr $i + 1)
-    kaks=$(compute_kaks $gene_a $gene_b | tail -1 | awk 'NF == 2')
-    if [[ ! -z $kaks ]]
-       then
-       arr=(${kaks//\t/ })
-       echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >> "${output_file}"
-    fi
-done < "${input}"
+
+thread() {
+    local thread_id="${1}"
+    local start="${2}"
+    local end="${3}"
+
+    mkdir -p "tmp/${thread_id}"
+    pushd "tmp/${thread_id}" || return
+
+    echo "seqfile = ./cds.ali.phy
+outfile = ./yn
+verbose = 0
+icode = 0
+weighting = 0
+commonf3x4 = 0" >yn00.ctl
+    echo "" >"part.csv"
+    for line_index in $(seq $start $end); do
+        line=$(head -n "${line_index}" "${input_file}" | tail -n 1)
+        gene_a=$(echo "${line}" | awk '{print $1}')
+        gene_b=$(echo "${line}" | awk '{print $2}')
+        kaks=$(compute_kaks "${gene_a}" "${gene_b}" | tail -1 | awk 'NF == 2')
+        if [[ ! -z $kaks ]]; then
+            arr=(${kaks//\t/ })
+            echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >>"part.csv"
+        fi
+    done
+    popd || return
+}
+
+main() {
+    part=$(($(cat "${input_file}" | wc -l) / ${NB_THREADS}))
+    for thread_id in $(seq 0 $NB_THREADS); do
+        start=$((thread_id * part))
+        end=$(($((thread_id + 1)) * part))
+        thread $thread_id $start $end &
+    done
+    wait
+    echo "gene_a,gene_b,ka,ks" >"${output_file}"
+    for thread_id in $(seq 0 $NB_THREADS); do
+        cat ./tmp/$thread_id/part.csv >>"${output_file}"
+    done
+}
+
+main