Update project

2025-01-18 15:55:12 +01:00 · 2025-01-18 15:55:12 +01:00 · 5ad94110d7
parent 33e898da2e
commit 5ad94110d7
11 changed files with 681 additions and 112 deletions
--- a/docs/presentation/main.tex
+++ b/docs/presentation/main.tex
@ -0,0 +1,86 @@
 \documentclass{beamer}
 \usepackage{booktabs}
 \title{Duplicate Genes \& Tandemly Arrayed Genes in \textit{Glycine max} (soy)}
 \subtitle{First results}
 \author{Naïa Périnelle \and Samuel Ortion}
 \institute{Université d'Évry Paris-Saclay}
 \date{2024-11-15}
 \begin{document}
 \frame{\titlepage}
 \begin{frame}{Cultivated soy plant: \textit{Glycine max}}
 A species of legume native to East Asia.
 {
 \centering
 \includegraphics{media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg}
 \vspace{1em}
 % \includegraphics[width=0.5\textwidth]{media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png}\footnote{Gepts et al. 2005}
 \includegraphics[width=0.25\textwidth]{media/Soybeanvarieties.jpeg}
 }
 \end{frame}
 \begin{frame}{Soybean industrial interest}
 \begin{itemize}
    \item 353 million tonnes of soybean produced in 2020 
    \item Cattle feed 
    \item Human food
 \end{itemize}
 \end{frame}
 \begin{frame}{\textit{Glycine max} genome statistics}
    \begin{itemize}
        \item 20 chromosomes
        \item 55897 protein coding genes (including supercontigs)
        \item 55589 protein coding genes (excluding supercontigs)
        \item 88412 protein isoforms
    \end{itemize}
 \end{frame}
 \begin{frame}{Datasets filter criteria and pipeline steps}
    % Coverage and identity thresholds of the low and high stringency datasets.
    \begin{tabular}{ccc}
    \toprule
    dataset stringency & low & high \\
    \midrule
    % `>' is typeset in math mode: in text mode, under the default OT1 font
    % encoding, it would be printed as an inverted question mark.
    coverage & $>$ 30\% & $>$ 40\% \\
    identity & $>$ 30\% & $>$ 50\% \\
    \bottomrule
    \end{tabular}
    \only<2->{
    Steps:
    \begin{enumerate}
        \item<2-> Keep the longest isoform protein sequence per gene,
        \item<3-> Run BLASTp ``all against all'' on the proteome,
        \item<4-> Remove proteins of supercontigs,
        \item<5-> Filter the HSP\footnote{High-scoring Segment Pair} based on the dataset criteria  (coverage and identity percentages),
        \item<6-> Run Markov Clustering (\texttt{mcl} with default parameters) on the homology graph built with the highest \texttt{bitscore} values per homologous genes pairs,
        \item<7-> Detect \texttt{Tandemly Arrayed Genes} with a Rust program based on gene positions and gene families.
    \end{enumerate}
    }
 \end{frame}
 \begin{frame}{Gene families size}
    \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage30_identity30.pdf}
    \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage40_identity50.pdf}
 \end{frame}
 \begin{frame}{Duplicate genes statistics}
    \begin{tabular}{lcc}
        % booktabs layout: top rule above the header row and mid rule below
        % it, as in the filter criteria table of this presentation.
        \toprule
        dataset stringency & low & high \\
        \midrule
        number of duplicate genes & 50254 (89.9\%) & 46769 (83.7\%) \\
        number of families & 8426 & 11997 \\
        number of singletons & 5643 (10.1\%) & 9128 (16.3\%) \\
        number of TAG\textsubscript{0} & 3208 & 2500 \\
        number of TAG\textsubscript{1} & 3481 & 2652 \\
        \bottomrule
    \end{tabular}
 \end{frame}
 \end{document}
--- a/docs/presentation/media/Glycine_max_family_size_hist_coverage30_identity30.pdf
+++ b/docs/presentation/media/Glycine_max_family_size_hist_coverage30_identity30.pdf
--- a/docs/presentation/media/Glycine_max_family_size_hist_coverage40_identity50.pdf
+++ b/docs/presentation/media/Glycine_max_family_size_hist_coverage40_identity50.pdf
--- a/docs/presentation/media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg
+++ b/docs/presentation/media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg
--- a/docs/presentation/media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png
+++ b/docs/presentation/media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png
--- a/docs/presentation/media/Soybeanvarieties.jpeg
+++ b/docs/presentation/media/Soybeanvarieties.jpeg
--- a/notebook.org
+++ b/notebook.org
@ -1,9 +1,11 @@
 # -*- org-export-use-babel: nil -*-
 #+title: Comparative Genomics Project
 #+subtitle: Duplicate Genes in /Glycine max/
 #+date: 2024-2025
 #+author: Samuel Ortion
 #+LATEX_CLASS: scrartcl
 #+LATEX_HEADER: \titlehead{M2 GENIOMHE}
 #+LATEX_HEADER: \usepackage{mus}
 * General infos
@ -48,15 +50,13 @@ min_identity=50
 /Glycine max/ is the soy plant.
 * Download the source data
 - https://plants.ensembl.org/Glycine_max/Info/Index
 - https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-60/fasta/glycine_max/pep/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz
-https://plants.ensembl.org/Glycine_max/Info/Index
+* Duplicate gene families
 ** Develop the Nextflow workflow
-https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-60/fasta/glycine_max/pep/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz
+*** Filtering the proteome to keep the longest isoform
 * Develop the Nextflow workflow
 ** Filtering the proteome to keep the longest isoform
 I used three awk scripts (from =workflow/families_and_TAGs/scripts/=) to filter the FASTA file to keep only the longest isoforms:
 1. =protein_lengths.awk= associate each FASTA record with the length of its sequence,
@ -89,19 +89,19 @@ zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \
 #+RESULTS:
 : 55897
-** Filtering out Chloroplasts and Mitochondria proteins
+*** Filtering out Chloroplasts and Mitochondria proteins
 From the Ensembl Plants index page we can see that there are neither chloroplast nor mitochondrial proteins in the sequenced genome of /Glycine max/.
-** Filter out supercontigs
+*** Filter out supercontigs
 <2024-10-28 Mon>
-#+include: workflow/families_and_TAGs/scripts/remove_supercontigs.awk src awk
+#+include: workflow/families_and_TAGs/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk src awk
 #+begin_src bash
 zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \
-    | awk -f ./workflow/families_and_TAGs/scripts/remove_supercontigs.awk \
+    | awk -f ./workflow/families_and_TAGs/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk \
    > ./tmp/Glycine_max.Glycine_max_v2.1.pep.all.nosupercontig.fa
 #+end_src
@ -143,7 +143,7 @@ expr $ALL_ISOFORMS - $NO_SUPERCONTIGS
 #+RESULTS:
 : 435
-** BLASTp All Against All
+*** BLASTp All Against All
 #+begin_src bash
 makeblastdb -in "$proteome" -dbtype prot -out 'tmp/blastdb' -title "Glycine max"
@ -153,7 +153,7 @@ makeblastdb -in "$proteome" -dbtype prot -out 'tmp/blastdb' -title "Glycine max"
 blastp -query "$proteome" -db 'tmp/blastdb' -outfmt 6 -out "tmp/Glycine_max.blastp.tsv"
 #+end_src
-** Clustering
+*** Clustering
 Using =mcl=.
@ -182,8 +182,7 @@ nextflow run main.nf -profile conda -resume
 from the =./workflow/= folder
-
+*** From the given BLASTp format 6 file
 * From the given BLASTp format 6 file
 <2024-11-02 Sat>
 /How many BLASTp HSP have been reported?/
 #+begin_src bash
@ -193,7 +192,7 @@ cat ../data/Glycine_max_Blastp_longIsoforme | wc -l
 #+RESULTS:
 : 4456730
-** Remove HSP on proteins found on supercontigs
+*** Remove HSP on proteins found on supercontigs
 Extract the isoform ID whose gene is on supercontigs.
 #+begin_src bash
@ -213,7 +212,7 @@ Remove HSP with subject or query being on a supercontig
 =filter_blastp_sequence_id.awk=
-#+include: workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk src awk
+#+include: workflow/families_and_TAGs/scripts/filter_blastp_sequence_id_remove.awk src awk
 #+begin_src bash
 awk -f workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk ./tmp/protein_on_supercontigs.list ../data/Glycine_max_Blastp_longIsoforme > ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig
@ -241,7 +240,7 @@ cat ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig | wc -l
 : 4420132
-** Filter by coverage and identity percentage
+*** Filter by coverage and identity percentage
 *** Add protein length and gene name columns
@ -312,7 +311,7 @@ cat ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.tsv | wc -l
 #+RESULTS:
 : 342180
-** Clustering
+*** Clustering
 <2024-11-03 Sun>
 *** Extract the homology graph
@ -345,6 +344,13 @@ head -n 1 "tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc"
 #+RESULTS:
 | GLYMA_10G098500 | GLYMA_U008600 | 154 |
 #+begin_src bash
 wc -l "./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc"
 #+end_src
 #+RESULTS:
 : 1520850 ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc
 *** Filter the homology graph to keep only the heaviest weight
@ -475,7 +481,7 @@ What is the interval of duplicate gene count? There are between 48109 and 49391
 What is the number of families?
-#+begin_src bash
+#+begin_src bash :exports both
 tsv="tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv"
 cut -f2 "${tsv}" | sort | uniq | wc -l
 #+end_src
@ -483,7 +489,7 @@ cut -f2 "${tsv}" | sort | uniq | wc -l
 #+RESULTS:
 : 8426
-#+begin_src bash
+#+begin_src bash :exports both
 tsv="tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl.tsv"
 cut -f2 "${tsv}" | sort | uniq | wc -l
 #+end_src
@ -508,10 +514,9 @@ expr $ALL - $DUPLICATED
 #+RESULTS: singletons-high-stringency
 : 9128
 * Amount of Tandemly Arrayed Genes
-*** Extract gene position from a GFF3 file
+** Extract gene position from a GFF3 file
 #+include: workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk src awk
@ -540,7 +545,7 @@ head "./tmp/Glycine_max.gene_positions.tsv"
 | GLYMA_01G001000 | 1 | 196256 | 201895 |
 #+name: gene-positions-count
-#+begin_src bash
+#+begin_src bash :exports both
 cat "./tmp/Glycine_max.gene_positions.tsv" \
    | wc -l
 #+end_src
@ -557,7 +562,7 @@ expr $GENES - $POSITIONS
 #+RESULTS:
 : 308
-*** Extract TAGs
+** Extract TAGs
 #+name: extract-TAGs
 #+begin_src bash
@ -593,20 +598,20 @@ head "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv"
 #+end_src
 #+RESULTS:
-| gene            | family | tag0 | tag1 |
+| gene            |     family | tag0 | tag1 |
-| GLYMA_07G257400 |   3109 | -    | -    |
+| GLYMA_19G258600 | spacer3040 | -    | -    |
-| GLYMA_17G203900 |   2067 | -    | -    |
+| GLYMA_08G277500 |         68 | -    | -    |
-| GLYMA_20G062000 |    509 | -    | -    |
+| GLYMA_08G342200 |        363 | 2467 | 2749 |
-| GLYMA_16G099400 |     40 | -    | -    |
+| GLYMA_08G072200 |       3499 | 2317 | 2592 |
-| GLYMA_14G045400 |   1136 | 804  | 896  |
+| GLYMA_15G143500 | spacer1810 | -    | -    |
-| GLYMA_11G189200 |     15 | -    | -    |
+| GLYMA_09G191100 |       3783 | -    | -    |
-| GLYMA_02G191500 |   1310 | -    | -    |
+| GLYMA_02G090200 |       7931 | -    | -    |
-| GLYMA_02G308200 |    520 | -    | -    |
+| GLYMA_17G091400 |        584 | -    | -    |
-| GLYMA_14G205200 |      8 | -    | -    |
+| GLYMA_13G199100 |         79 | -    | 621  |
 How many TAGs for definition 0?
-#+begin_src bash
+#+begin_src bash :exports both
 awk 'BEGIN {
           max=0
    }
@ -622,11 +627,11 @@ awk 'BEGIN {
 #+end_src
 #+RESULTS:
-: 3208
+: 2620
 For TAG1?
-#+begin_src bash
+#+begin_src bash :exports both
 awk 'BEGIN {
           max=0
    }
@ -642,10 +647,10 @@ awk 'BEGIN {
 #+end_src
 #+RESULTS:
-: 3481
+: 2916
-#+begin_src bash
+#+begin_src bash :exports both
 awk 'BEGIN {
           max=0
    }
@ -661,7 +666,7 @@ awk 'BEGIN {
 #+end_src
 #+RESULTS:
-: 2500
+: 2157
 #+begin_src bash
@ -680,16 +685,16 @@ awk 'BEGIN {
 #+end_src
 #+RESULTS:
-: 2652
+: 2438
-*** Size of the greatest TAG
+** Size of the greatest TAG
-**** High stringency
+*** High stringency
 /TAG₀/
-#+begin_src bash
+#+begin_src bash :exports both
 awk '
    {
    tag_index=$3
@ -710,11 +715,11 @@ awk '
 #+end_src
 #+RESULTS:
-: 421 53
+: 2134 32
 /TAG₁/
-#+begin_src bash
+#+begin_src bash :exports both
 awk '
    {
    tag_index=$4
@ -735,14 +740,14 @@ awk '
 #+end_src
 #+RESULTS:
-: 459 76
+: 416 43
-**** Low stringency
+*** Low stringency
 /TAG₀/
-#+begin_src bash
+#+begin_src bash :exports both
 awk '
    {
    tag_index=$3
@ -763,11 +768,11 @@ awk '
 #+end_src
 #+RESULTS:
-: 579 76
+: 2591 32
 /TAG₁/
-#+begin_src bash
+#+begin_src bash :exports both
 awk '
    {
    tag_index=$4
@ -788,9 +793,37 @@ awk '
 #+end_src
 #+RESULTS:
-: 646 76
+: 525 43
-* DONE Complete slides
+
 ** Amount of TAGs
 How many TAG genes are there for low and high stringency datasets?
 #+name: count-tags
 #+begin_src bash
 awk '
     # Count the rows whose third field (TAG0 index) and fourth field
     # (TAG1 index) differ from "-", the marker used for genes without a TAG.
     # NOTE(review): a header row, if the file has one, is counted as well
     # because its fields differ from "-"; confirm whether it must be skipped.
     # Both counters start at 0 so that a number is always printed.
     BEGIN { count0 = 0; count1 = 0 }
     $3 != "-" { count0++ }
     $4 != "-" { count1++ }
     END { print count0, count1 }
 ' "${tags}"
 #+end_src
 #+RESULTS: count-tags
 #+begin_src bash :noweb strip-export
 tags=./tmp/Glycine_max_TAGs_coverage30_identity30.tsv
 <<count-tags>>
 #+end_src
 #+RESULTS:
 : 6680 7876
 #+begin_src bash :noweb strip-export
 tags=./tmp/Glycine_max_TAGs_coverage40_identity50.tsv
 <<count-tags>>
 #+end_src
 #+RESULTS:
 : 5397 6480
 * Complete slides
 <2024-11-23 Sat>
 * Family sizes
@ -812,16 +845,16 @@ head -1 ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl \
 : 159
-* TODO $K_s$ For all duplicated pairs within a family
+* $K_s$ For all duplicated pairs within a family
 Let $A$ be a family of $n$ duplicate members. The number of duplicate pairs is $n(n-1) / 2$.
-PAML package:
+Install PAML, Pal2Nal and Clustal packages with conda:
 #+begin_src bash
 conda install paml pal2nal clustalw -c bioconda
 #+end_src
-** Use a Nextflow workflow
+** Testing each step
 <2024-12-22 Sun>
 *** All pair of duplicate genes
@ -831,16 +864,14 @@ The algorithm is simple.
 We assume the gene-families mapping file is sorted by family index.
 For each family, loop for i from 0 to the size of the family minus one, loop for j from i plus one to the size of the family and output the pair gene i - gene j.
 Example:
-
+#+begin_src bash :exports both
 #+begin_src bash
 cat ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv | ./rust/pairs/target/release/pairs > ./tmp/Glycine_max_duplicate_gene_pairs.tsv
 head ./tmp/Glycine_max_duplicate_gene_pairs.tsv
 #+end_src
 #+RESULTS:
-| GLYMA_07G053000   | GLYMA_10G247700 |
+| GLYMA_07G053000 | GLYMA_10G247700 |
 | GLYMA_07G053000 | GLYMA_12G188000 |
 | GLYMA_07G053000 | GLYMA_07G159200 |
 | GLYMA_07G053000 | GLYMA_13G313300 |
@ -862,9 +893,7 @@ gene_id_1="GLYMA_07G053000"
 gene_id_2="GLYMA_10G247700"
 #+end_src
-CDS:
+- CDS:
 #+name: pair-gene-cds-extract
 #+begin_src bash :noweb strip-export
 output_file="./tmp/${gene_id_1}_${gene_id_2}.cds.fst"
@ -900,7 +929,7 @@ done
 #+RESULTS:
-Proteins:
+- Proteins:
 #+name: pair-gene-protein-extract
 #+begin_src bash :noweb strip-export
@ -930,7 +959,6 @@ done
 #+RESULTS: pair-gene-protein-extract
 #+begin_src bash :noweb strip-export
 <<pair-gene-id>>
 <<pair-gene-protein-extract>>
@ -956,7 +984,6 @@ coding_sequence="./tmp/${gene_id_1}_${gene_id_2}.cds.fst"
 pal2nal "${protein_alignment}" "${coding_sequence}" -output paml > "./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy"
 #+end_src
 /YN00 Ka-Ks computation/
 #+begin_src bash :noweb strip-export
@ -968,7 +995,7 @@ yn00
 /Extraction of Ka-Ks values/
-#+begin_src bash :noweb strip-export
+#+begin_src bash :noweb strip-export :exports both
 <<pair-gene-id>>
 yn_file="./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy.yn"
 awk '
@ -1000,10 +1027,440 @@ awk '
 *** Run the Nextflow workflow
 <2024-12-23 Mon>
 The Nextflow workflow is stuck on the first steps, because the channel containing the queue of jobs for each gene pair is too large to fit in memory.
-* TODO Does /Glycine max/ have big TAGs and which function are the big TAG gene implied?
+*** Back to basics
 I wrote a bash script that computes the Ka/Ks values without loading the whole pairs file into memory.
 <2024-12-26 Thu> The script is still running.
 I did not handle the case where several coding sequences were present in the CDS file for the same gene, so the script ignores the proteins with several isoforms.
 Let us filter the CDS to match the proteome file:
 #+begin_src bash
 awk '/^>/ { gsub(">", "", $1); print $1 }' ./tmp/proteome_filtered.fa > ./tmp/proteome_filtered.list
 #+end_src
 #+RESULTS:
 #+begin_src bash
 cat ./tmp/proteome_filtered.list | wc -l
 #+end_src
 #+RESULTS:
 : 55589 ./tmp/proteome_filtered.list
 #+begin_src bash
 awk -f ./workflow/families_and_TAGs/scripts/filter_records_fasta.awk ./tmp/proteome_filtered.list ../data/Glycine_max.Glycine_max_v2.1.cds.all.fa > ./tmp/cds_filtered.fa
 #+end_src
 #+RESULTS:
 #+begin_src bash :exports both
 grep -c "^>" ./tmp/cds_filtered.fa
 #+end_src
 #+RESULTS:
 : 55589
 * Are TAG pairs different in age from non-TAG pairs ?
 Run the bash script [[./workflow/KaKs/ugly.sh]].
 <2025-01-07 Tue>
 Computation ended after about a week.
 <2025-01-08 Wed>
 #+begin_src bash
 awk -F',' '$4 < 1' ./workflow/KaKs/results/complete_uniq.csv > ./workflow/KaKs/results/complete_uniq_ks_below1.csv
 cat ./workflow/KaKs/results/complete_uniq_ks_below1.csv| wc -l
 #+end_src
 #+RESULTS:
 : 294226
 #+begin_src R :session *R* :results graphics file :file results/plots/Glycine_max_Ks_distribution.png :width 8 :height 8 :res 200 :units cm
 library(ggplot2)
 library(scales)
 # Density of the Ks values of the duplicate gene pairs, restricted to [0, 1].
 data <- read.table("./workflow/KaKs/complete2.csv", sep=",", header=TRUE)
 colnames(data) <- c("gene_a","gene_b","ka","ks")
 theme_set(theme_gray(base_size=8))
 gg <- ggplot(data, aes(x=ks))
 gg <- gg + geom_density()
 # The limits are given to the named scale itself: a separate xlim() call adds
 # a second x scale that replaces this one and drops the "Age (Ks)" axis title.
 gg <- gg + scale_x_continuous("Age (Ks)", limits=c(0, 1))
 gg <- gg + scale_y_continuous("Density")
 gg <- gg + ggtitle("Proportion of duplicate gene pairs age")
 gg <- gg + theme(plot.title=element_text(hjust=0.5))
 gg
 #+end_src
 #+RESULTS:
 [[file:results/plots/Glycine_max_Ks_distribution.png]]
 #+caption: Density of pair with a given Ks. (Inspired from Blanc and Wolfe, 2004)
 #+RESULTS:
 [[file:results/plots/Glycine_max_Ks_distribution.png]]
 Then, let us plot the same data separating TAG from non-TAG pairs.
 #+begin_src R :session *R* :results silent
 data <- read.csv("./workflow/KaKs/complete2.csv")
 colnames(data) <- c("gene_a", "gene_b", "ka", "ks")
 tag_df2 <- read.table("./tmp/Glycine_max_TAGs_coverage30_identity30.tsv", header=TRUE)
 tag_def <- "tag1"
 # Look up the TAG index of both genes of every pair in one vectorised pass
 # instead of scanning the whole TAGs table once per pair. match() yields NA
 # for a gene that is absent from the table (and uses the first row if a gene
 # were listed several times).
 tag_a <- as.character(tag_df2[match(data$gene_a, tag_df2$gene), tag_def])
 tag_b <- as.character(tag_df2[match(data$gene_b, tag_df2$gene), tag_def])
 # A pair is "tag" only when both genes are found, neither carries the "-"
 # marker (no TAG) and both carry the same TAG index; otherwise it is "notag".
 same_tag <- !is.na(tag_a) & !is.na(tag_b) &
     tag_a != "-" & tag_b != "-" & tag_a == tag_b
 data$category <- ifelse(same_tag, "tag", "notag")
 #+end_src
 #+RESULTS:
 #+begin_src R :session *R* :results graphics file :file results/plots/ks_density_tag_and_not_tag.png :width 8 :height 8 :res 200 :units cm
 library(ggplot2)
 library(scales)
 # Ks density of the duplicate gene pairs, one curve per category.
 theme_set(theme_gray(base_size = 6))
 gg <- ggplot(data)
 gg <- gg + geom_density(mapping=aes(x=ks, color=category))
 # ks is numeric, so the x axis needs a continuous scale. The limits are set on
 # that same scale: a separate xlim() call would replace it and drop the
 # "Age (Ks)" axis title.
 gg <- gg + scale_x_continuous("Age (Ks)", limits = c(0, 5))
 gg <- gg + scale_y_continuous("Density")
 gg <- gg + ggtitle("Proportion of duplicate gene pairs age (TAG and non-TAG)")
 gg <- gg + theme(plot.title = element_text(hjust = 0.5))
 gg
 #+end_src
 #+RESULTS:
 [[file:results/plots/ks_density_tag_and_not_tag.png]]
 We will now test whether TAG gene duplication is more recent than non-TAG gene pairs.
 As the Ks values are not normally distributed, I will not use the Student \(t\)-test, but the Wilcoxon-Mann-Whitney U test.
 The hypotheses are
 \[
 \begin{cases}
 (H_0) & \text{for randomly selected values $X$ and $Y$ from `non-TAG' and `TAG' respectively, the probability of $X$ to be greater than $Y$ is equal to the probability of $Y$ being greater than $X$} \\
 (H_1) & \text{for randomly selected values $X$ and $Y$ from `non-TAG' and `TAG' respectively, the probability of $X$ to be greater than $Y$ is greater than the probability of $Y$ being greater than $X$}
 \end{cases}
 \]
 #+begin_src R :session *R* :results output :exports both
 tag_ks <- data[data$category == "tag", "ks"]
 non_tag_ks <- data[data$category == "notag", "ks"]
 wilcox.test(non_tag_ks, tag_ks, alternative="greater")
 #+end_src
 #+RESULTS:
 :
 : 	Wilcoxon rank sum test with continuity correction
 :
 : data:  non_tag_ks and tag_ks
 : W = 358006116, p-value < 2.2e-16
 : alternative hypothesis: true location shift is greater than 0
 The \(p\)-value is lower than $0.05$, so at level \(\alpha = 5 \%\) we reject the null hypothesis. Given one non-TAG gene pair and one TAG gene pair, the probability that the non-TAG pair has the larger Ks (i.e. that its duplication event is the older one) is greater than the other way around: TAG genes tend to be more recently duplicated.
 * Are genes inside a TAG orientated in the same way more often than random?
 We did not keep the orientation of the genes in the TAGs file. Let us extract the gene orientation first, and join this information to the TAGs file.
 We will do our analysis on the less stringent dataset. (coverage >30%, identity >30%).
 #+begin_src bash
 awk 'BEGIN {
    OFS="\t"
    }
 /^>/ {
          locus=$3
          split(locus, arr, ":")
          orientation=arr[6]
          gene=$4
          sub("gene:", "", gene)
          print gene, orientation
    }' ./tmp/proteome_filtered.fa > ./tmp/filtered_gene_orientation.tsv
 head ./tmp/filtered_gene_orientation.tsv
 #+end_src
 #+RESULTS:
 | GLYMA_01G141900 | -1 |
 | GLYMA_01G234000 | -1 |
 | GLYMA_01G157500 | -1 |
 | GLYMA_01G031500 | -1 |
 | GLYMA_01G132000 | -1 |
 | GLYMA_01G038100 | -1 |
 | GLYMA_01G184300 |  1 |
 | GLYMA_01G231700 |  1 |
 | GLYMA_01G031400 |  1 |
 | GLYMA_01G008100 |  1 |
 #+begin_src bash
 head ./tmp/Glycine_max
 #+end_src
 #+RESULTS:
 #+begin_src bash
 # join needs both inputs sorted on the join field (the gene identifier).
 # The orientation table is written without a header, so it is sorted as is.
 sort -d ./tmp/filtered_gene_orientation.tsv > ./tmp/filtered_gene_orientation_s.tsv
 # The TAGs table starts with a header row (gene, family, tag0, tag1):
 # `tail -n +2` skips it, whereas `tail -n +1` keeps every line.
 tail -n +2 ./tmp/Glycine_max_TAGs_coverage30_identity30.tsv | sort -d > ./tmp/Glycine_max_TAGs_coverage30_identity30_s.tsv
 join -t$'\t' -1 1 -2 1 ./tmp/filtered_gene_orientation_s.tsv ./tmp/Glycine_max_TAGs_coverage30_identity30_s.tsv > ./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv
 #+end_src
 #+RESULTS:
 #+begin_src bash
 head ./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv
 #+end_src
 #+RESULTS:
 | GLYMA_01G000100 | -1 |    5506 | - | - |
 | GLYMA_01G000200 | -1 | spacer1 | - | - |
 | GLYMA_01G000300 |  1 | spacer2 | - | - |
 | GLYMA_01G000400 | -1 |     146 | - | 1 |
 | GLYMA_01G000500 |  1 | spacer3 | - | - |
 | GLYMA_01G000600 |  1 |     146 | - | 1 |
 | GLYMA_01G000700 |  1 | spacer4 | - | - |
 | GLYMA_01G000800 |  1 |    3898 | - | - |
 | GLYMA_01G000900 |  1 |    7954 | - | - |
 | GLYMA_01G001000 |  1 |    7875 | - | - |
 #+RESULTS:
 #+begin_src R :session *R* :colnames yes
 tag_df = read.table("./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv",
                    sep="\t")
 colnames(tag_df) <- c("gene", "orientation", "family", "tag0", "tag1")
 head(tag_df)
 #+end_src
 #+RESULTS:
 | gene            | orientation | family  | tag0 | tag1 |
 |-----------------+-------------+---------+------+------|
 | GLYMA_01G000100 |          -1 | 5506    | -    | -    |
 | GLYMA_01G000200 |          -1 | spacer1 | -    | -    |
 | GLYMA_01G000300 |           1 | spacer2 | -    | -    |
 | GLYMA_01G000400 |          -1 | 146     | -    | 1    |
 | GLYMA_01G000500 |           1 | spacer3 | -    | -    |
 | GLYMA_01G000600 |           1 | 146     | -    | 1    |
 We perform Fisher's exact test on the contingency table:
 |         | Coherent | Convergent | Divergent |
 |---------+----------+------------+-----------|
 | pair TAG0 |          |            |           |
 | pair not TAG0 |          |            |           |
 * TODO Are TAG pairs different in age from non-TAG pairs?
-* TODO Are genes inside a TAG orientated in the same way more often than random?
+#+begin_src R :session *R*
 get_strand <- function(gene) {
    return(
        tag_df[tag_df[,"gene"] == gene, "orientation"]
    )
 }
 get_convergence <- function(gene_a, gene_b) {
    strand_a <- get_strand(gene_a)
    strand_b <- get_strand(gene_b)
    if (strand_a == strand_b) {
        return('coherent')
    } else if (strand_a == -1) {
        return('divergent')
    } else if (strand_a == 1) {
        return('convergent')
    } else {
        message("Error: I do not understand why this does not fall in these case (coherent, divergent, convergent, what else?)")
    }
 }
 #+end_src
 #+RESULTS:
 #+begin_src R :session *R* :colnames yes :rownames yes
 contingency_table <- matrix(0, nrow=2, ncol=3)
 colnames(contingency_table) <- c("coherent", "convergent", "divergent")
 rownames(contingency_table) <- c("tag", "nottag")
 definition <- 0
 for (i in 1:(nrow(tag_df) - (definition + 1))) {
  j <- i + 1
  gene_i <- tag_df[i, "gene"]
  gene_j <- tag_df[j, "gene"]
  convergence <- get_convergence(gene_i, gene_j)
  if (tag_df[i, "tag0"] == "-" || tag_df[j, "tag0"] == "-") {
    category <- "nottag"
  } else if (tag_df[i, "tag0"] == tag_df[j, "tag0"]) {
    category <- "tag"
  } else {
    category <- "nottag"
  }
  contingency_table[category, convergence] <- contingency_table[category, convergence] + 1
 }
 contingency_table
 #+end_src
 #+RESULTS:
 |        | coherent | convergent | divergent |
 |--------+----------+------------+-----------|
 | tag    |     3353 |        342 |       361 |
 | nottag |    25699 |      12926 |     12907 |
 #+RESULTS:
 : org_babel_R_eoe
 #+begin_src R :session *R* :results output
 fisher.test(contingency_table, workspace=2e6)
 #+end_src
 #+RESULTS:
 :
 : 	Fisher's Exact Test for Count Data
 :
 : data:  contingency_table
 : p-value < 2.2e-16
 : alternative hypothesis: two.sided
 \(p\)-value < 2.2e-16, so at level $\alpha = 0.05$, we reject the null hypothesis: the orientation of a pair of genes is significantly different between TAG and non-TAG pairs.
 More specifically, TAG genes tend to be more coherent than non-TAG genes.
 * Does /Glycine max/ have big TAGs and which function are the big TAG gene implied?
 <2024-12-26 Thu>
 Uses Gprofiler2
 #+begin_src R :session *R* :results silent
 library(gprofiler2)
 #+end_src
 First we want to extract the list of genes belonging to the largest TAG:
 We once again focus on the least stringent dataset, with the TAG₁ definition (the code below selects the =tag1= column).
 #+begin_src R :session *R*
 tag_df_tag <- tag_df[tag_df["tag1"] != "-",]
 tag1_count <- table(tag_df_tag["tag1"])
 largest_tag1 <- names(tag1_count[which(tag1_count == max(tag1_count))])
 #+end_src
 #+RESULTS:
 : 525
 #+begin_src R :session *R*
 largest_tag1_genes <- tag_df_tag[tag_df_tag["tag1"] == largest_tag1, "gene"]
 gostres <- gost(query=largest_tag1_genes, organism="gmax")
 head(gostres$result)
 #+end_src
 #+RESULTS:
 #+begin_src R :session *R*  :results graphics file :file results/plots/gprofiler_tag0579_enrichment_plot.png :width 8 :height 8 :res 200 :units cm
 p <- gostplot(gostres, capped=FALSE, interactive=FALSE)
 p
 #+end_src
 #+RESULTS:
 [[file:results/plots/gprofiler_tag0579_enrichment_plot.png]]
 Our genes are mostly involved in auxin related signaling pathways, and response to hormones.
 #+begin_example
                                                     term_name
 1                           regulation of cellular respiration
 2 regulation of generation of precursor metabolites and energy
 3                                         cellular respiration
 4          energy derivation by oxidation of organic compounds
 5               generation of precursor metabolites and energy
 6                     regulation of cellular metabolic process
 #+end_example
 * TAG size in function of TAG definition
 <2024-12-27 Fri>
 #+begin_src bash
 # Definitions 0 to 20 in steps of 1, then 25 to 50 in steps of 5.
 ./rust/tagfinder/target/release/tagfinder --families  ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv --positions ./tmp/Glycine_max.gene_positions.tsv --definitions 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,25,30,35,40,45,50 > "./tmp/Glycine_max_TAGs_coverage30_identity30_v2.tsv"
 #+end_src
 #+RESULTS:
 #+begin_src R :session *R*  :results graphics file  :file results/plots/nb_TAG_against_definition.png :width 8 :height 8 :res 200 :units cm :exports both
 library(ggplot2)
 library(scales)
 data <- read.table("./tmp/Glycine_max_TAGs_coverage30_identity30_v2.tsv", na.strings=c("-"), header=TRUE)
 # Read the TAG definitions from the tag<N> column names of the table instead
 # of repeating the list given to tagfinder, so that the two cannot disagree
 # (the repeated list contained 15 twice and no 14).
 tag_columns <- grep("^tag[0-9]+$", colnames(data), value=TRUE)
 definitions <- sort(as.integer(sub("^tag", "", tag_columns)))
 # The highest TAG index found in a column is used as the number of TAGs.
 # NOTE(review): assumes tagfinder numbers the TAGs consecutively -- confirm.
 nb_TAGs <- sapply(definitions, function(definition) {
     tags <- data[, paste0("tag", definition)]
     tags <- tags[!is.na(tags)]
     return(max(tags))
 })
 data <- data.frame(list(definition=definitions, nb=nb_TAGs))
 theme_set(theme_gray(base_size = 8))
 gg <- ggplot(data, aes(x=definition, y=nb))
 gg <- gg + geom_point()
 gg <- gg + scale_x_continuous("Definition")
 gg <- gg + scale_y_continuous("TAGs")
 gg <- gg + ggtitle("Number of TAGs for a TAG definition")
 gg <- gg + theme(plot.title = element_text(hjust = 0.5))
 gg
 #+end_src
 #+RESULTS:
 [[file:results/plots/nb_TAG_against_definition.png]]
 * Computing some statistics on a selection of plants
 <2024-12-29 Sun>
 We want to extract the following statistics for the whole set of plants on eCampus:
 - number of families
 - number of duplicate genes
 - number of singletons
 - number of TAG0
 We have to filter out some genes that are from Mitochondria or Chloroplasts (for /Oryza sativa ssp. Japonica/ for instance).
 First we extract the protein ID that corresponds to mitochondrial or chloroplastic genes.
 #+begin_src bash
 fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Oryza_sativa.IRGSP-1.0.pep.all.fa.gz"
 # Print the identifier (first header field, without ">") of every FASTA
 # record whose locus names Mt, Pt or something starting with supercontig.
 zcat "${fasta}" | awk '
 /^>/ {
     # The locus is the third header field; its third colon-separated element
     # is the one tested below.
     locus=$3
     # split() takes the string first, then the array, then the separator.
     split(locus, arr, ":")
     chromosome=arr[3]
     if (chromosome == "Mt" || chromosome == "Pt" || chromosome ~ /^supercontig*/) {
        protein=$1
        gsub(">", "", protein)
        print protein
     }
    }' > ./tmp/Oryza_sativa_faulty_proteins.list
 #+end_src
 #+RESULTS:
 #+begin_src bash
 head ./tmp/Oryza_sativa_faulty_proteins.list
 #+end_src
 #+RESULTS:
 Then, remove all BLASTp hits that match one of these protein IDs in either the query or the subject.
 We include this step in a Nextflow workflow. It does not compute the BLASTp all against all; instead, it uses the TSV file provided on eCampus.
--- a/rust/pairs/src/main.rs
+++ b/rust/pairs/src/main.rs
@ -15,7 +15,6 @@ fn main() {
        let mut parts = line.split("\t");
        let gene = parts.next().unwrap().to_string();
        let family = parts.next().unwrap().to_string();
        println!("{}", family);
        if family != family_index {
            family_index = family;
            if family_genes.len() > 1 {
@ -23,7 +22,9 @@ fn main() {
            }                    
            family_genes.clear();
        }
-        family_genes.push(gene);
+        if gene != "" {
            family_genes.push(gene);
        }
    }
    if family_genes.len() > 1 {
        print_gene_pairs(&family_genes);
--- a/workflow/.gitignore
+++ b/workflow/.gitignore
@ -1,2 +1 @@
-.nextflow.log*
+.nextflow*
--- a/workflow/KaKs/scripts/extract_fasta_records.sh
+++ b/workflow/KaKs/scripts/extract_fasta_records.sh
@ -8,16 +8,22 @@ for gene_id in ${gene_id_1} ${gene_id_2}; do
    awk -v gene_id="${gene_id}" '
        BEGIN {
            on_gene=0
            visited=0
        }
        /^[^>]/ && on_gene == 1 {
            print $0
        }
        /^>/ {
            if (visited == 1) {
                    exit
            }
            gene = $4
            gsub("gene:", "", gene)
            if (gene == gene_id) {
                on_gene=1
                print $0
                visited=1
            } else {
                on_gene=0
            }
--- a/workflow/KaKs/standalone.sh
+++ b/workflow/KaKs/standalone.sh
@ -1,35 +1,28 @@
 #!/usr/bin/env bash
 # Bash version of the Nextflow script that should run faster
 output_file="Glycine_max_duplicate_genes_KaKs.tsv"
 proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
 cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.cds.all.fa"
 set -euo pipefail
-mkdir -p tmp
+NB_THREADS=16
 output_file="Glycine_max_duplicate_genes_KaKs_v3.csv"
 proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
 cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/cds_filtered.fa"
 input_file="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.csv"
 echo "seqfile = ./tmp/cds.ali.phy
 outfile = ./tmp/yn
 verbose = 0
 icode = 0
 weighting = 0
 commonf3x4 = 0" > yn00.ctl
 compute_kaks() {
    local gene_id_1
    local gene_id_2
-    gene_id_1=$1
+    gene_id_1="${1}"
-    gene_id_2=$2
+    gene_id_2="${2}"
    # Extract the proteins
-    ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${proteome_fasta} "./tmp/prot.fst"
+    ../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${proteome_fasta}" "./prot.fst"
    # Extract the CDS
-    ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${cds_fasta} "./tmp/cds.fst"
+    ../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${cds_fasta}" "./cds.fst"
    # Run clustalw2
-    clustalw2 -quiet -align -infile="./tmp/prot.fst" -outfile="./tmp/prot.ali.aln"
+    clustalw2 -quiet -align -infile="./prot.fst" -outfile="./prot.ali.aln"
    # Run Pal2Nal
-    pal2nal.pl "./tmp/prot.ali.aln" "./tmp/cds.fst" -output paml > "./tmp/cds.ali.phy"
+    pal2nal.pl "./prot.ali.aln" "./cds.fst" -output paml >"./cds.ali.phy"
    # Run yn00
    yn00
@ -55,22 +48,49 @@ compute_kaks() {
            skip -= 1
        }
    }
-    ' "./tmp/yn"
+    ' "./yn"
 }
-echo "gene_a,gene_b,ka,ks" > "${output_file}"
+
-input="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.tsv"
+thread() {
-n=$(wc -l "${input}")
+    local thread_id="${1}"
-i=0
+    local start="${2}"
-while IFS= read -r line
+    local end="${3}"
-do
+
-    echo $i / $n
+    mkdir -p "tmp/${thread_id}"
-    gene_a=$(echo $line | awk '{print $1}')
+    pushd "tmp/${thread_id}" || return
-    gene_b=$(echo $line | awk '{print $2}')
+
-    i=$(expr $i + 1)
+    echo "seqfile = ./cds.ali.phy
-    kaks=$(compute_kaks $gene_a $gene_b | tail -1 | awk 'NF == 2')
+outfile = ./yn
-    if [[ ! -z $kaks ]]
+verbose = 0
-       then
+icode = 0
-       arr=(${kaks//\t/ })
+weighting = 0
-       echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >> "${output_file}"
+commonf3x4 = 0" >yn00.ctl
-    fi
+    echo "" >"part.csv"
-done < "${input}"
+    for line_index in $(seq $start $end); do
        line=$(head -n "${line_index}" "${input_file}" | tail -n 1)
        gene_a=$(echo "${line}" | awk '{print $1}')
        gene_b=$(echo "${line}" | awk '{print $2}')
        kaks=$(compute_kaks "${gene_a}" "${gene_b}" | tail -1 | awk 'NF == 2')
        if [[ ! -z $kaks ]]; then
            arr=(${kaks//\t/ })
            echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >>"part.csv"
        fi
    done
    popd || return
 }
 # Split the lines of the pair list into NB_THREADS contiguous ranges, process
 # the ranges in parallel with thread(), then assemble the per-job results.
 # Globals: input_file, NB_THREADS, output_file (all read).
 # Outputs: writes the CSV header followed by the content of every
 #          tmp/<job id>/part.csv to output_file; warnings go to stderr.
 main() {
     local total part thread_id start end pid
     local failed=0
     local -a pids=()

     total=$(wc -l <"${input_file}")
     # Ceiling division, so that the ranges cover every line, remainder included.
     part=$(((total + NB_THREADS - 1) / NB_THREADS))

     # Ranges are 1-based and inclusive, because thread() walks them with seq
     # and fetches each line with head/tail: [1, part], [part + 1, 2 * part], ...
     # They never overlap and never go past the last line of the input.
     for ((thread_id = 0; thread_id < NB_THREADS; thread_id++)); do
         start=$((thread_id * part + 1))
         end=$(((thread_id + 1) * part))
         if ((end > total)); then
             end=${total}
         fi
         thread "${thread_id}" "${start}" "${end}" &
         pids+=("$!")
     done

     # Wait for each job individually so that a failed one is reported.
     for pid in "${pids[@]}"; do
         if ! wait "${pid}"; then
             failed=$((failed + 1))
         fi
     done
     if ((failed > 0)); then
         echo "warning: ${failed} job(s) failed; ${output_file} may be incomplete" >&2
     fi

     echo "gene_a,gene_b,ka,ks" >"${output_file}"
     for ((thread_id = 0; thread_id < NB_THREADS; thread_id++)); do
         # thread() initialises part.csv with an empty line; leave it out.
         awk 'NF' "./tmp/${thread_id}/part.csv" >>"${output_file}"
     done
 }
 main