diff --git a/docs/presentation/main.tex b/docs/presentation/main.tex new file mode 100644 index 0000000..8edea95 --- /dev/null +++ b/docs/presentation/main.tex @@ -0,0 +1,86 @@ +\documentclass{beamer} +\usepackage{booktabs} + +\title{Duplicate Genes \& Tandemly Arrayed Genes in \textit{Glycine max} (soy)} +\subtitle{First results} +\author{Naïa Périnelle \and Samuel Ortion} +\institute{Université d'Évry Paris-Saclay} +\date{2024-11-15} + +\begin{document} + +\frame{\titlepage} + +\begin{frame}{Cultivated soy plant: \textit{Glycine max}} +A species of legume native to East Asia. +{ +\centering +\includegraphics{media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg} +\vspace{1em} +% \includegraphics[width=0.5\textwidth]{media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png}\footnote{Gepts et al. 2005} +\includegraphics[width=0.25\textwidth]{media/Soybeanvarieties.jpeg} +} + + +\end{frame} + +\begin{frame}{Soybean industrial interest} +\begin{itemize} + \item 353 million tonnes of soybean produced in 2020 + \item Cattle feed + \item Human food +\end{itemize} + +\end{frame} + +\begin{frame}{\textit{Glycine max} genome statistics} + \begin{itemize} + \item 20 chromosomes + \item 55897 protein coding genes (including supercontigs) + \item 55589 protein coding genes (excluding supercontigs) + \item 88412 protein isoforms + \end{itemize} +\end{frame} + +\begin{frame}{Datasets filter criteria and pipeline steps} + \begin{tabular}{ccc} + \toprule + dataset stringency & low & high \\ + \midrule + coverage & > 30\% & > 40\% \\ + identity & > 30\% & > 50\% \\ + \bottomrule + \end{tabular} + \only<2->{ + + Steps: + \begin{enumerate} + \item<2-> Keep the longest isoform protein sequence per gene, + \item<3-> Run BLASTp ``all against all'' on the proteome, + \item<4-> Remove proteins of supercontigs, + \item<5-> Filter the HSP\footnote{High-scoring Segment Pair} based on the dataset criteria (coverage and identity percentages), + \item<6-> Run Markov Clustering (\texttt{mcl} with default parameters) on the homology graph built with the highest \texttt{bitscore} values per homologous genes pairs, + \item<7-> Detect \texttt{Tandemly Arrayed Genes} with a Rust program based on gene positions and gene families. + \end{enumerate} + } +\end{frame} + +\begin{frame}{Gene families size} + \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage30_identity30.pdf} + \includegraphics[width=0.47\textwidth]{media/Glycine_max_family_size_hist_coverage40_identity50.pdf} +\end{frame} + +\begin{frame}{Duplicate genes statistics} + \begin{tabular}{lcc} + dataset stringency & low & high \\ + \toprule + number of duplicate genes & 50254 (89.9\%) & 46769 (83.7\%) \\ + number of families & 8426 & 11997 \\ + number of singletons & 5643 (10.1\%) & 9128 (16.3\%) \\ + number of TAG\textsubscript{0} & 3208 & 2500 \\ + number of TAG\textsubscript{1} & 3481 & 2652 \\ + \bottomrule + \end{tabular} +\end{frame} + +\end{document} \ No newline at end of file diff --git a/docs/presentation/media/Glycine_max_family_size_hist_coverage30_identity30.pdf b/docs/presentation/media/Glycine_max_family_size_hist_coverage30_identity30.pdf new file mode 100644 index 0000000..5d5a633 Binary files /dev/null and b/docs/presentation/media/Glycine_max_family_size_hist_coverage30_identity30.pdf differ diff --git a/docs/presentation/media/Glycine_max_family_size_hist_coverage40_identity50.pdf b/docs/presentation/media/Glycine_max_family_size_hist_coverage40_identity50.pdf new file mode 100644 index 0000000..b1b0c63 Binary files /dev/null and b/docs/presentation/media/Glycine_max_family_size_hist_coverage40_identity50.pdf differ diff --git a/docs/presentation/media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg b/docs/presentation/media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg new file mode 100644 index 0000000..090ba6a Binary files /dev/null and b/docs/presentation/media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg differ diff --git a/docs/presentation/media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png b/docs/presentation/media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png new file mode 100644 index 0000000..30393e1 Binary files /dev/null and b/docs/presentation/media/Simplified-schematic-tree-of-legume-family-modified-from-Doyle-and-Luckow-2003-The.png differ diff --git a/docs/presentation/media/Soybeanvarieties.jpeg b/docs/presentation/media/Soybeanvarieties.jpeg new file mode 100644 index 0000000..a94a60c Binary files /dev/null and b/docs/presentation/media/Soybeanvarieties.jpeg differ diff --git a/notebook.org b/notebook.org index d7fd400..151fad3 100644 --- a/notebook.org +++ b/notebook.org @@ -1,9 +1,11 @@ +# -*- org-export-use-babel: nil -*- #+title: Comparative Genomics Project #+subtitle: Duplicate Genes in /Glycine max/ #+date: 2024-2025 #+author: Samuel Ortion #+LATEX_CLASS: scrartcl #+LATEX_HEADER: \titlehead{M2 GENIOMHE} +#+LATEX_HEADER: \usepackage{mus} * General infos @@ -48,15 +50,13 @@ min_identity=50 /Glycine max/ is the soy plant. * Download the source data +- https://plants.ensembl.org/Glycine_max/Info/Index +- https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-60/fasta/glycine_max/pep/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz -https://plants.ensembl.org/Glycine_max/Info/Index +* Duplicate gene families +** Develop the Nextflow workflow -https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-60/fasta/glycine_max/pep/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz - - -* Develop the Nextflow workflow - -** Filtering the proteome to keep the longest isoform +*** Filtering the proteome to keep the longest isoform I used three awk families_and_TAGs/scripts to filter the FASTA file to keep only the longest isoforms: 1. =protein_lengths.awk= associate each FASTA record with the length of its sequence, @@ -89,19 +89,19 @@ zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \ #+RESULTS: : 55897 -** Filtering out Chloroplasts and Mitochondria proteins +*** Filtering out Chloroplasts and Mitochondria proteins From the Ensembl Plants index page we can see that there is no chloroplast nor mitochondria proteins in the sequenced genome of /Glycine max/. -** Filter out supercontigs +*** Filter out supercontigs <2024-10-28 Mon> -#+include: workflow/families_and_TAGs/scripts/remove_supercontigs.awk src awk +#+include: workflow/families_and_TAGs/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk src awk #+begin_src bash zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \ - | awk -f ./workflow/families_and_TAGs/scripts/remove_supercontigs.awk \ + | awk -f ./workflow/families_and_TAGs/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk \ > ./tmp/Glycine_max.Glycine_max_v2.1.pep.all.nosupercontig.fa #+end_src @@ -143,7 +143,7 @@ expr $ALL_ISOFORMS - $NO_SUPERCONTIGS #+RESULTS: : 435 -** BLASTp All Against All +*** BLASTp All Against All #+begin_src bash makeblastdb -in "$proteome" -dbtype prot -out 'tmp/blastdb' -title "Glycine max" @@ -153,7 +153,7 @@ makeblastdb -in "$proteome" -dbtype prot -out 'tmp/blastdb' -title "Glycine max" blastp -query "$proteome" -db 'tmp/blastdb' -outfmt 6 -out "tmp/Glycine_max.blastp.tsv" #+end_src -** Clustering +*** Clustering Using =mcl=. @@ -182,8 +182,7 @@ nextflow run main.nf -profile conda -resume from the =./workflow/= folder - -* From the given BLASTp format 6 file +*** From the given BLASTp format 6 file <2024-11-02 Sat> /How many BLASTp HSP have been reported?/ #+begin_src bash @@ -193,7 +192,7 @@ cat ../data/Glycine_max_Blastp_longIsoforme | wc -l #+RESULTS: : 4456730 -** Remove HSP on proteins found on supercontigs +*** Remove HSP on proteins found on supercontigs Extract the isoform ID whose gene is on supercontigs. #+begin_src bash @@ -213,7 +212,7 @@ Remove HSP with subject or query being on a supercontig =filter_blastp_sequence_id.awk= -#+include: workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk src awk +#+include: workflow/families_and_TAGs/scripts/filter_blastp_sequence_id_remove.awk src awk #+begin_src bash awk -f workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk ./tmp/protein_on_supercontigs.list ../data/Glycine_max_Blastp_longIsoforme > ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig @@ -241,7 +240,7 @@ cat ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig | wc -l : 4420132 -** Filter by coverage and identity percentage +*** Filter by coverage and identity percentage *** Add protein length and gene name columns @@ -312,7 +311,7 @@ cat ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.tsv | wc -l #+RESULTS: : 342180 -** Clustering +*** Clustering <2024-11-03 Sun> *** Extract the homology graph @@ -345,6 +344,13 @@ head -n 1 "tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc" #+RESULTS: | GLYMA_10G098500 | GLYMA_U008600 | 154 | +#+begin_src bash +wc -l "./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc" +#+end_src + +#+RESULTS: +: 1520850 ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc + *** Filter the homology graph to keep only the heaviest weight @@ -475,7 +481,7 @@ What is the interval of duplicate gene count? There are between 48109 and 49391 What is the number of families? -#+begin_src bash +#+begin_src bash :exports both tsv="tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv" cut -f2 "${tsv}" | sort | uniq | wc -l #+end_src @@ -483,7 +489,7 @@ cut -f2 "${tsv}" | sort | uniq | wc -l #+RESULTS: : 8426 -#+begin_src bash +#+begin_src bash :exports both tsv="tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl.tsv" cut -f2 "${tsv}" | sort | uniq | wc -l #+end_src @@ -508,10 +514,9 @@ expr $ALL - $DUPLICATED #+RESULTS: singletons-high-stringency : 9128 - * Amount of Tandemly Arrayed Genes -*** Extract gene position from a GFF3 file +** Extract gene position from a GFF3 file #+include: workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk src awk @@ -540,7 +545,7 @@ head "./tmp/Glycine_max.gene_positions.tsv" | GLYMA_01G001000 | 1 | 196256 | 201895 | #+name: gene-positions-count -#+begin_src bash +#+begin_src bash :exports both cat "./tmp/Glycine_max.gene_positions.tsv" \ | wc -l #+end_src @@ -557,7 +562,7 @@ expr $GENES - $POSITIONS #+RESULTS: : 308 -*** Extract TAGs +** Extract TAGs #+name: extract-TAGs #+begin_src bash @@ -593,20 +598,20 @@ head "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv" #+end_src #+RESULTS: -| gene | family | tag0 | tag1 | -| GLYMA_07G257400 | 3109 | - | - | -| GLYMA_17G203900 | 2067 | - | - | -| GLYMA_20G062000 | 509 | - | - | -| GLYMA_16G099400 | 40 | - | - | -| GLYMA_14G045400 | 1136 | 804 | 896 | -| GLYMA_11G189200 | 15 | - | - | -| GLYMA_02G191500 | 1310 | - | - | -| GLYMA_02G308200 | 520 | - | - | -| GLYMA_14G205200 | 8 | - | - | +| gene | family | tag0 | tag1 | +| GLYMA_19G258600 | spacer3040 | - | - | +| GLYMA_08G277500 | 68 | - | - | +| GLYMA_08G342200 | 363 | 2467 | 2749 | +| GLYMA_08G072200 | 3499 | 2317 | 2592 | +| GLYMA_15G143500 | spacer1810 | - | - | +| GLYMA_09G191100 | 3783 | - | - | +| GLYMA_02G090200 | 7931 | - | - | +| GLYMA_17G091400 | 584 | - | - | +| GLYMA_13G199100 | 79 | - | 621 | How many TAGs for definition 0? -#+begin_src bash +#+begin_src bash :exports both awk 'BEGIN { max=0 } @@ -622,11 +627,11 @@ awk 'BEGIN { #+end_src #+RESULTS: -: 3208 +: 2620 For TAG1? -#+begin_src bash +#+begin_src bash :exports both awk 'BEGIN { max=0 } @@ -642,10 +647,10 @@ awk 'BEGIN { #+end_src #+RESULTS: -: 3481 +: 2916 -#+begin_src bash +#+begin_src bash :exports both awk 'BEGIN { max=0 } @@ -661,7 +666,7 @@ awk 'BEGIN { #+end_src #+RESULTS: -: 2500 +: 2157 #+begin_src bash @@ -680,16 +685,16 @@ awk 'BEGIN { #+end_src #+RESULTS: -: 2652 +: 2438 -*** Size of the greatest TAG +** Size of the greatest TAG -**** High stringency +*** High stringency /TAG₀/ -#+begin_src bash +#+begin_src bash :exports both awk ' { tag_index=$3 @@ -710,11 +715,11 @@ awk ' #+end_src #+RESULTS: -: 421 53 +: 2134 32 /TAG₁/ -#+begin_src bash +#+begin_src bash :exports both awk ' { tag_index=$4 @@ -735,14 +740,14 @@ awk ' #+end_src #+RESULTS: -: 459 76 +: 416 43 -**** Low stringency +*** Low stringency /TAG₀/ -#+begin_src bash +#+begin_src bash :exports both awk ' { tag_index=$3 @@ -763,11 +768,11 @@ awk ' #+end_src #+RESULTS: -: 579 76 +: 2591 32 /TAG₁/ -#+begin_src bash +#+begin_src bash :exports both awk ' { tag_index=$4 @@ -788,9 +793,37 @@ awk ' #+end_src #+RESULTS: -: 646 76 +: 525 43 -* DONE Complete slides + +** Amount of TAGs + +How many TAG genes are there for low and high stringency datasets? + +#+name: count-tags +#+begin_src bash +awk 'BEGIN { count0=0; count1; } { if ($3 != "-") count0++; if($4 != "-") count1++; } END { print count0, count1 }' $tags +#+end_src + +#+RESULTS: count-tags + +#+begin_src bash :noweb strip-export +tags=./tmp/Glycine_max_TAGs_coverage30_identity30.tsv +<> +#+end_src + +#+RESULTS: +: 6680 7876 + +#+begin_src bash :noweb strip-export +tags=./tmp/Glycine_max_TAGs_coverage40_identity50.tsv +<> +#+end_src + +#+RESULTS: +: 5397 6480 + +* Complete slides <2024-11-23 Sat> * Family sizes @@ -812,16 +845,16 @@ head -1 ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl \ : 159 -* TODO $K_s$ For all duplicated pairs within a family +* $K_s$ For all duplicated pairs within a family Let $A$ be a family of $n$ duplicate members. The number of duplicate pairs is $n(n-1) / 2$. -PAML package: +Install PAML, Pal2Nal and Clustal packages with conda: #+begin_src bash conda install paml pal2nal clustalw -c bioconda #+end_src -** Use a Nextflow workflow +** Testing each step <2024-12-22 Sun> *** All pair of duplicate genes @@ -831,16 +864,14 @@ The algorithm is simple. We assume the gene-families mapping file is sorted by family index. For each family, loop for i from 0 to the size of the family minus one, loop for j from i plus one to the size of the family and output the pair gene i - gene j. - Example: - -#+begin_src bash +#+begin_src bash :exports both cat ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv | ./rust/pairs/target/release/pairs > ./tmp/Glycine_max_duplicate_gene_pairs.tsv head ./tmp/Glycine_max_duplicate_gene_pairs.tsv #+end_src #+RESULTS: -| GLYMA_07G053000 | GLYMA_10G247700 | +| GLYMA_07G053000 | GLYMA_10G247700 | | GLYMA_07G053000 | GLYMA_12G188000 | | GLYMA_07G053000 | GLYMA_07G159200 | | GLYMA_07G053000 | GLYMA_13G313300 | @@ -862,9 +893,7 @@ gene_id_1="GLYMA_07G053000" gene_id_2="GLYMA_10G247700" #+end_src -CDS: - - +- CDS: #+name: pair-gene-cds-extract #+begin_src bash :noweb strip-export output_file="./tmp/${gene_id_1}_${gene_id_2}.cds.fst" @@ -900,7 +929,7 @@ done #+RESULTS: -Proteins: +- Proteins: #+name: pair-gene-protein-extract #+begin_src bash :noweb strip-export @@ -930,7 +959,6 @@ done #+RESULTS: pair-gene-protein-extract - #+begin_src bash :noweb strip-export <> <> @@ -956,7 +984,6 @@ coding_sequence="./tmp/${gene_id_1}_${gene_id_2}.cds.fst" pal2nal "${protein_alignment}" "${coding_sequence}" -output paml > "./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy" #+end_src - /YN00 Ka-Ks computation/ #+begin_src bash :noweb strip-export @@ -968,7 +995,7 @@ yn00 /Extraction of Ka-Ks values/ -#+begin_src bash :noweb strip-export +#+begin_src bash :noweb strip-export :exports both <> yn_file="./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy.yn" awk ' @@ -1000,10 +1027,440 @@ awk ' *** Run the Nextflow workflow <2024-12-23 Mon> +The Nextflow workflow is stuck on the first steps, because the channel containing the queue of jobs for each gene pair is too huge in memory. -* TODO Does /Glycine max/ have big TAGs and which function are the big TAG gene implied? +*** Back to basics + +I wrote a bash script that compute the Ka/Ks not loading the whole pairs file in memory. + +<2024-12-26 Thu> The script is still running. + +I did not handle the case where several coding sequences were present in the CDS file for the same gene, so the script ignores the proteins with several isoforms. + +Let us filter the CDS to match the proteome file: + +#+begin_src bash +awk '/^>/ { gsub(">", "", $1); print $1 }' ./tmp/proteome_filtered.fa > ./tmp/proteome_filtered.list +#+end_src + +#+RESULTS: + +#+begin_src bash +cat ./tmp/proteome_filtered.list | wc -l +#+end_src + +#+RESULTS: +: 55589 ./tmp/proteome_filtered.list + +#+begin_src bash +awk -f ./workflow/families_and_TAGs/scripts/filter_records_fasta.awk ./tmp/proteome_filtered.list ../data/Glycine_max.Glycine_max_v2.1.cds.all.fa > ./tmp/cds_filtered.fa +#+end_src + +#+RESULTS: + +#+begin_src bash :exports both +grep -c "^>" ./tmp/cds_filtered.fa +#+end_src + +#+RESULTS: +: 55589 + +* Are TAG pairs different in age from non-TAG pairs ? + +Run the bash script [[./workflow/KaKs/ugly.sh]]. + +<2025-01-07 Tue> +Computation ended after about a week. + +<2025-01-08 Wed> + +#+begin_src bash +awk -F',' '$4 < 1' ./workflow/KaKs/results/complete_uniq.csv > ./workflow/KaKs/results/complete_uniq_ks_below1.csv +cat ./workflow/KaKs/results/complete_uniq_ks_below1.csv| wc -l +#+end_src + +#+RESULTS: +: 294226 + +#+begin_src R :session *R* :results graphics file :file results/plots/Glycine_max_Ks_distribution.png :width 8 :height 8 :res 200 :units cm +library(ggplot2) +library(scales) + +data <- read.table("./workflow/KaKs/complete2.csv", sep=",", header=TRUE) +colnames(data) <- c("gene_a","gene_b","ka","ks") + +theme_set(theme_gray(base_size=8)) +gg <- ggplot(data, aes(x=ks)) +gg <- gg + geom_density() +gg <- gg + scale_x_continuous("Age (Ks)") +gg <- gg + scale_y_continuous("Density") +gg <- gg + xlim(0, 1) +gg <- gg + ggtitle("Proportion of duplicate gene pairs age") +gg <- gg + theme(plot.title=element_text(hjust=0.5)) +gg +#+end_src + +#+RESULTS: +[[file:results/plots/Glycine_max_Ks_distribution.png]] + +#+caption: Density of pair with a given Ks. (Inspired from Blanc and Wolfe, 2004) + +#+RESULTS: +[[file:results/plots/Glycine_max_Ks_distribution.png]] + +Then, let us plot the same data separating TAG from non-TAG pairs. + +#+begin_src R :session *R* :results silent +data <- read.csv("./workflow/KaKs/complete2.csv") +colnames(data) <- c("gene_a", "gene_b", "ka", "ks") +tag_df2 <- read.table("./tmp/Glycine_max_TAGs_coverage30_identity30.tsv", header=TRUE) +tag_def <- "tag1" +data$category <- apply(data, 1, function(row) { + gene_a <- row["gene_a"] + gene_b <- row["gene_b"] + tag_a <- tag_df2[tag_df2["gene"] == gene_a, tag_def] + tag_b <- tag_df2[tag_df2["gene"] == gene_b, tag_def] + # Filter out pairs tag character(0) + if (length(tag_a) == 0 || length(tag_b) == 0) { + return("notag") + } else if (tag_a == "-" || tag_b == "-") { + return("notag") + } else if (tag_a == tag_b) { + return("tag") + } else { + return("notag") + } +}) +#+end_src + +#+RESULTS: + +#+begin_src R :session *R* :results graphics file :file results/plots/ks_density_tag_and_not_tag.png :width 8 :height 8 :res 200 :units cm +library(ggplot2) +library(scales) +theme_set(theme_gray(base_size = 6)) +gg <- ggplot(data) +gg <- gg + geom_density(mapping=aes(x=ks, color=category)) +gg <- gg + scale_x_discrete("Age (Ks)") +gg <- gg + scale_y_continuous("Density") +gg <- gg + xlim(0, 5) +gg <- gg + ggtitle("Proportion of duplicate gene pairs age (TAG and non-TAG)") +gg <- gg + theme(plot.title = element_text(hjust = 0.5)) +gg +#+end_src + +#+RESULTS: +[[file:results/plots/ks_density_tag_and_not_tag.png]] + +We will now test whether TAG gene duplication is more recent than non-TAG gene pairs. +As the distributions of the Ks values is not normally distributed, I will not use the Student \(t\)-test, but the Wilcoxon-Mann-Whitney U test. + +The hypotheses are +\[ +\begin{cases} +(H_0) & \text{for randomly selected values $X$ and $Y$ from `non-TAG' and `TAG' respectively, the probability of $X$ to be greater than $Y$ is equal to the probability of $Y$ being greater than $X$} \\ +(H_1) & \text{for randomly selected values $X$ and $Y$ from `non-TAG' and `TAG' respectively, the probability of $X$ to be greater than $Y$ is greater than the probability of $Y$ being greater than $X$} +\end{cases} +\] + +#+begin_src R :session *R* :results output :exports both +tag_ks <- data[data$category == "tag", "ks"] +non_tag_ks <- data[data$category == "notag", "ks"] +wilcox.test(non_tag_ks, tag_ks, alternative="greater") +#+end_src + +#+RESULTS: +: +: Wilcoxon rank sum test with continuity correction +: +: data: non_tag_ks and tag_ks +: W = 358006116, p-value < 2.2e-16 +: alternative hypothesis: true location shift is greater than 0 + +The \(p\)-value is lower than $0.05$ so at level \(\alpha = 5 \%\), we reject the null hypotheses. Given two duplicate gene pairs, the probably that a TAG gene pair duplication event occurred before the non-TAG gene pair duplication event is greater than the otherway around, remaining TAG genes tends to be more recently duplicated. + +* Are genes inside a TAG orientated in the same way more often than random? + +We did not keep the orientation of the genes in the TAGs file. Let extract the gene orientation first, and join this information to the TAGs file. +We will do our analysis on the less stringent dataset. (coverage >30%, identity >30%). + +#+begin_src bash +awk 'BEGIN { + OFS="\t" + } + /^>/ { + locus=$3 + split(locus, arr, ":") + orientation=arr[6] + gene=$4 + sub("gene:", "", gene) + print gene, orientation + }' ./tmp/proteome_filtered.fa > ./tmp/filtered_gene_orientation.tsv +head ./tmp/filtered_gene_orientation.tsv +#+end_src + +#+RESULTS: +| GLYMA_01G141900 | -1 | +| GLYMA_01G234000 | -1 | +| GLYMA_01G157500 | -1 | +| GLYMA_01G031500 | -1 | +| GLYMA_01G132000 | -1 | +| GLYMA_01G038100 | -1 | +| GLYMA_01G184300 | 1 | +| GLYMA_01G231700 | 1 | +| GLYMA_01G031400 | 1 | +| GLYMA_01G008100 | 1 | + +#+begin_src bash +head ./tmp/Glycine_max +#+end_src + +#+RESULTS: + +#+begin_src bash +tail -n +1 ./tmp/filtered_gene_orientation.tsv | sort -d > ./tmp/filtered_gene_orientation_s.tsv +tail -n +1 ./tmp/Glycine_max_TAGs_coverage30_identity30.tsv | sort -d > ./tmp/Glycine_max_TAGs_coverage30_identity30_s.tsv +join -t$'\t' -1 1 -2 1 ./tmp/filtered_gene_orientation_s.tsv ./tmp/Glycine_max_TAGs_coverage30_identity30_s.tsv > ./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv +#+end_src + +#+RESULTS: + +#+begin_src bash +head ./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv +#+end_src + +#+RESULTS: +| GLYMA_01G000100 | -1 | 5506 | - | - | +| GLYMA_01G000200 | -1 | spacer1 | - | - | +| GLYMA_01G000300 | 1 | spacer2 | - | - | +| GLYMA_01G000400 | -1 | 146 | - | 1 | +| GLYMA_01G000500 | 1 | spacer3 | - | - | +| GLYMA_01G000600 | 1 | 146 | - | 1 | +| GLYMA_01G000700 | 1 | spacer4 | - | - | +| GLYMA_01G000800 | 1 | 3898 | - | - | +| GLYMA_01G000900 | 1 | 7954 | - | - | +| GLYMA_01G001000 | 1 | 7875 | - | - | + +#+RESULTS: + +#+begin_src R :session *R* :colnames yes +tag_df = read.table("./tmp/Glycine_max_TAGs_coverage30_identity30_oriented.tsv", + sep="\t") +colnames(tag_df) <- c("gene", "orientation", "family", "tag0", "tag1") +head(tag_df) +#+end_src + +#+RESULTS: +| gene | orientation | family | tag0 | tag1 | +|-----------------+-------------+---------+------+------| +| GLYMA_01G000100 | -1 | 5506 | - | - | +| GLYMA_01G000200 | -1 | spacer1 | - | - | +| GLYMA_01G000300 | 1 | spacer2 | - | - | +| GLYMA_01G000400 | -1 | 146 | - | 1 | +| GLYMA_01G000500 | 1 | spacer3 | - | - | +| GLYMA_01G000600 | 1 | 146 | - | 1 | + +We perform a Fischer test on the contingency table: +| | Coherent | Convergent | Divergent | +|---------+----------+------------+-----------| +| pair TAG0 | | | | +| pair not TAG0 | | | | -* TODO Are TAG pairs different in age from non-TAG pairs? -* TODO Are genes inside a TAG orientated in the same way more often than random? +#+begin_src R :session *R* +get_strand <- function(gene) { + return( + tag_df[tag_df[,"gene"] == gene, "orientation"] + ) +} +get_convergence <- function(gene_a, gene_b) { + strand_a <- get_strand(gene_a) + strand_b <- get_strand(gene_b) + if (strand_a == strand_b) { + return('coherent') + } else if (strand_a == -1) { + return('divergent') + } else if (strand_a == 1) { + return('convergent') + } else { + message("Error: I do not understand why this does not fall in these case (coherent, divergent, convergent, what else?)") + } +} +#+end_src + +#+RESULTS: + +#+begin_src R :session *R* :colnames yes :rownames yes +contingency_table <- matrix(0, nrow=2, ncol=3) +colnames(contingency_table) <- c("coherent", "convergent", "divergent") +rownames(contingency_table) <- c("tag", "nottag") +definition <- 0 +for (i in 1:(nrow(tag_df) - (definition + 1))) { + j <- i + 1 + gene_i <- tag_df[i, "gene"] + gene_j <- tag_df[j, "gene"] + convergence <- get_convergence(gene_i, gene_j) + if (tag_df[i, "tag0"] == "-" || tag_df[j, "tag0"] == "-") { + category <- "nottag" + } else if (tag_df[i, "tag0"] == tag_df[j, "tag0"]) { + category <- "tag" + } else { + category <- "nottag" + } + contingency_table[category, convergence] <- contingency_table[category, convergence] + 1 +} +contingency_table +#+end_src + +#+RESULTS: +| | coherent | convergent | divergent | +|--------+----------+------------+-----------| +| tag | 3353 | 342 | 361 | +| nottag | 25699 | 12926 | 12907 | + + +#+RESULTS: +: org_babel_R_eoe + +#+begin_src R :session *R* :results output +fisher.test(contingency_table, workspace=2e6) +#+end_src + +#+RESULTS: +: +: Fisher's Exact Test for Count Data +: +: data: contingency_table +: p-value < 2.2e-16 +: alternative hypothesis: two.sided + + +\(p\)-value < 2.2e-16, so at level $\alpha = 0.05$, we reject the null hypothesis: the orientation of a pair of gene is significantly different between TAG and non TAG pairs. +More specifically, TAG genes tends to be more coherent than non TAG genes. +* Does /Glycine max/ have big TAGs and which function are the big TAG gene implied? +<2024-12-26 Thu> + +Uses Gprofiler2 + +#+begin_src R :session *R* :results silent +library(gprofiler2) +#+end_src + +First we want to extract the list of genes belonging to the largest TAG: + +We once again focus on the least stringent dataset, with the TAG₀ definition. +#+begin_src R :session *R* +tag_df_tag <- tag_df[tag_df["tag1"] != "-",] +tag1_count <- table(tag_df_tag["tag1"]) +largest_tag1 <- names(tag1_count[which(tag1_count == max(tag1_count))]) +#+end_src + +#+RESULTS: +: 525 + +#+begin_src R :session *R* +largest_tag1_genes <- tag_df_tag[tag_df_tag["tag1"] == largest_tag1, "gene"] +gostres <- gost(query=largest_tag1_genes, organism="gmax") +head(gostres$result) +#+end_src + +#+RESULTS: + +#+begin_src R :session *R* :results graphics file :file results/plots/gprofiler_tag0579_enrichment_plot.png :width 8 :height 8 :res 200 :units cm +p <- gostplot(gostres, capped=FALSE, interactive=FALSE) +p +#+end_src + +#+RESULTS: +[[file:results/plots/gprofiler_tag0579_enrichment_plot.png]] + +Our genes are mostly involved in auxin related signaling pathways, and response to hormones. + +#+begin_example + term_name +1 regulation of cellular respiration +2 regulation of generation of precursor metabolites and energy +3 cellular respiration +4 energy derivation by oxidation of organic compounds +5 generation of precursor metabolites and energy +6 regulation of cellular metabolic process +#+end_example + +* TAG size in function of TAG definition +<2024-12-27 Fri> +#+begin_src bash +./rust/tagfinder/target/release/tagfinder --families ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv --positions ./tmp/Glycine_max.gene_positions.tsv --definitions 0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,15,16,17,18,19,20,25,30,35,40,45,50 > "./tmp/Glycine_max_TAGs_coverage30_identity30_v2.tsv" +#+end_src + +#+RESULTS: + +#+begin_src R :session *R* :results graphics file :file results/plots/nb_TAG_against_definition.png :width 8 :height 8 :res 200 :units cm :exports both +library(ggplot2) +library(scales) + +data <- read.table("./tmp/Glycine_max_TAGs_coverage30_identity30_v2.tsv", na.strings=c("-"), header=TRUE) + +definitions <- c(0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,15,16,17,18,19,20,25,30,35,40,45,50) + +nb_TAGs <- sapply(definitions, function(definition) { + tags <- data[,paste0("tag", definition)] + tags <- tags[! is.na(tags)] + return(max(tags)) + }) + +data <- data.frame(list(definition=definitions, nb=nb_TAGs)) + +theme_set(theme_gray(base_size = 8)) +gg <- ggplot(data, aes(x=definition, y=nb)) +gg <- gg + geom_point() +gg <- gg + scale_x_continuous("Definition") +gg <- gg + scale_y_continuous("TAGs") +gg <- gg + ggtitle("Number of TAGs for a TAG definition") +gg <- gg + theme(plot.title = element_text(hjust = 0.5)) + gg +#+end_src + +#+RESULTS: +[[file:results/plots/nb_TAG_against_definition.pdf]] + +* Computing some statistics on a selection of plants + +<2024-12-29 Sun> + +We want to extract the following statistics for the whole set of plant on eCampus: +- number of families +- number of duplicate genes +- number of singletons +- number of TAG0 + + +We have to filter out some genes that are from Mitochondria or Chloroplasts (for /Oryza sativa ssp. Japonica/ for instance). + +First we extract the protein ID that corresponds to mitochondrial or chloroplastic genes. +#+begin_src bash +fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Oryza_sativa.IRGSP-1.0.pep.all.fa.gz" +zcat "${fasta}" | awk ' + /^>/ { + locus=$3 + split(":", arr, locus) + chromosome=arr[3] + if (chromosome == "Mt" || chromosome == "Pt" || chromosome ~ /^supercontig*/) { + protein=$1 + gsub(">", "", protein) + print protein + } + }' > ./tmp/Oryza_sativa_faulty_proteins.list +#+end_src + +#+RESULTS: + +#+begin_src bash +head ./tmp/Oryza_sativa_faulty_proteins.list +#+end_src + +#+RESULTS: + +Then, remove all BLASTp hits that matches a protein ID in either query or subject. + +We include this step in a Nextflow workflow. It does not compute the BLASTp all against all, It does rather use the TSV file provided on eCampus. diff --git a/rust/pairs/src/main.rs b/rust/pairs/src/main.rs index cc2d4ad..f6a3514 100644 --- a/rust/pairs/src/main.rs +++ b/rust/pairs/src/main.rs @@ -15,7 +15,6 @@ fn main() { let mut parts = line.split("\t"); let gene = parts.next().unwrap().to_string(); let family = parts.next().unwrap().to_string(); - println!("{}", family); if family != family_index { family_index = family; if family_genes.len() > 1 { @@ -23,7 +22,9 @@ fn main() { } family_genes.clear(); } - family_genes.push(gene); + if gene != "" { + family_genes.push(gene); + } } if family_genes.len() > 1 { print_gene_pairs(&family_genes); diff --git a/workflow/.gitignore b/workflow/.gitignore index de6c3af..0dd26b8 100644 --- a/workflow/.gitignore +++ b/workflow/.gitignore @@ -1,2 +1 @@ -.nextflow.log* - +.nextflow* diff --git a/workflow/KaKs/scripts/extract_fasta_records.sh b/workflow/KaKs/scripts/extract_fasta_records.sh index 1bbff6c..9eecd50 100755 --- a/workflow/KaKs/scripts/extract_fasta_records.sh +++ b/workflow/KaKs/scripts/extract_fasta_records.sh @@ -8,16 +8,22 @@ for gene_id in ${gene_id_1} ${gene_id_2}; do awk -v gene_id="${gene_id}" ' BEGIN { on_gene=0 + visited=0 } /^[^>]/ && on_gene == 1 { + print $0 } /^>/ { + if (visited == 1) { + exit + } gene = $4 gsub("gene:", "", gene) if (gene == gene_id) { on_gene=1 print $0 + visited=1 } else { on_gene=0 } diff --git a/workflow/KaKs/standalone.sh b/workflow/KaKs/standalone.sh index e1ff4e8..f73d629 100644 --- a/workflow/KaKs/standalone.sh +++ b/workflow/KaKs/standalone.sh @@ -1,35 +1,28 @@ #!/usr/bin/env bash - -# Bash version of the Nextflow script that should run faster -output_file="Glycine_max_duplicate_genes_KaKs.tsv" -proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa" -cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.cds.all.fa" - set -euo pipefail -mkdir -p tmp +NB_THREADS=16 + +output_file="Glycine_max_duplicate_genes_KaKs_v3.csv" +proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa" +cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/cds_filtered.fa" +input_file="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.csv" -echo "seqfile = ./tmp/cds.ali.phy -outfile = ./tmp/yn -verbose = 0 -icode = 0 -weighting = 0 -commonf3x4 = 0" > yn00.ctl compute_kaks() { local gene_id_1 local gene_id_2 - gene_id_1=$1 - gene_id_2=$2 + gene_id_1="${1}" + gene_id_2="${2}" # Extract the proteins - ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${proteome_fasta} "./tmp/prot.fst" + ../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${proteome_fasta}" "./prot.fst" # Extract the CDS - ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${cds_fasta} "./tmp/cds.fst" + ../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${cds_fasta}" "./cds.fst" # Run clustalw2 - clustalw2 -quiet -align -infile="./tmp/prot.fst" -outfile="./tmp/prot.ali.aln" + clustalw2 -quiet -align -infile="./prot.fst" -outfile="./prot.ali.aln" # Run Pal2Nal - pal2nal.pl "./tmp/prot.ali.aln" "./tmp/cds.fst" -output paml > "./tmp/cds.ali.phy" + pal2nal.pl "./prot.ali.aln" "./cds.fst" -output paml >"./cds.ali.phy" # Run yn00 yn00 @@ -55,22 +48,49 @@ compute_kaks() { skip -= 1 } } - ' "./tmp/yn" + ' "./yn" } -echo "gene_a,gene_b,ka,ks" > "${output_file}" -input="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.tsv" -n=$(wc -l "${input}") -i=0 -while IFS= read -r line -do - echo $i / $n - gene_a=$(echo $line | awk '{print $1}') - gene_b=$(echo $line | awk '{print $2}') - i=$(expr $i + 1) - kaks=$(compute_kaks $gene_a $gene_b | tail -1 | awk 'NF == 2') - if [[ ! -z $kaks ]] - then - arr=(${kaks//\t/ }) - echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >> "${output_file}" - fi -done < "${input}" + +thread() { + local thread_id="${1}" + local start="${2}" + local end="${3}" + + mkdir -p "tmp/${thread_id}" + pushd "tmp/${thread_id}" || return + + echo "seqfile = ./cds.ali.phy +outfile = ./yn +verbose = 0 +icode = 0 +weighting = 0 +commonf3x4 = 0" >yn00.ctl + echo "" >"part.csv" + for line_index in $(seq $start $end); do + line=$(head -n "${line_index}" "${input_file}" | tail -n 1) + gene_a=$(echo "${line}" | awk '{print $1}') + gene_b=$(echo "${line}" | awk '{print $2}') + kaks=$(compute_kaks "${gene_a}" "${gene_b}" | tail -1 | awk 'NF == 2') + if [[ ! -z $kaks ]]; then + arr=(${kaks//\t/ }) + echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >>"part.csv" + fi + done + popd || return +} + +main() { + part=$(($(cat "${input_file}" | wc -l) / ${NB_THREADS})) + for thread_id in $(seq 0 $NB_THREADS); do + start=$((thread_id * part)) + end=$(($((thread_id + 1)) * part)) + thread $thread_id $start $end & + done + wait + echo "gene_a,gene_b,ka,ks" >"${output_file}" + for thread_id in $(seq 0 $NB_THREADS); do + cat ./tmp/$thread_id/part.csv >>"${output_file}" + done +} + +main