Add report

This commit is contained in:
Samuel Ortion 2025-01-18 15:38:59 +01:00
parent 5bc8f82d48
commit f2e3b168be
Signed by: sortion
GPG Key ID: 9B02406F8C4FB765
26 changed files with 1072 additions and 0 deletions

View File

@ -0,0 +1,14 @@
Species,Genes,Duplicates,Singletons,Families,LargestFamily,TAG0,TAG1,TAG0 Genes,TAG1 Genes,Largest TAG0
Arabidopsis lyrata,32667,26320,6348,5012,494,1571,1834,3868,4739,14
Glycine max,55897,50258,5640,8421,437,2626,2922,6692,7890,32
Gossypium raimondii,38208,32975,5234,5519,333,1964,2163,5202,6155,23
Malus domestica Golden,40624,34948,5677,6912,858,2340,2707,5643,6988,15
Medicago truncatula,50444,39818,10627,5965,639,3264,3864,8195,10712,19
Musa acuminata,35012,28752,6261,4698,624,949,1069,2345,2745,21
Oryza sativa,35775,23377,12399,4605,787,1440,1835,3544,4906,19
Phaseolus vulgaris,28134,22763,5372,4126,686,1743,1888,4702,5383,21
Prunus persica,26873,20222,6652,3653,292,1758,1949,4962,5928,22
Solanum tuberosum,39021,31477,7545,4465,1044,2558,2891,6488,7903,16
Theobroma cacao,29188,21051,8138,3614,606,1593,1836,4041,5069,22
Vigna angularis,33860,26954,6907,4608,649,1622,1939,3820,4771,26
Vigna radiata,22368,17021,5348,3556,411,599,728,1332,1705,7
1 Species Genes Duplicates Singletons Families LargestFamily TAG0 TAG1 TAG0 Genes TAG1 Genes Largest TAG0
2 Arabidopsis lyrata 32667 26320 6348 5012 494 1571 1834 3868 4739 14
3 Glycine max 55897 50258 5640 8421 437 2626 2922 6692 7890 32
4 Gossypium raimondii 38208 32975 5234 5519 333 1964 2163 5202 6155 23
5 Malus domestica Golden 40624 34948 5677 6912 858 2340 2707 5643 6988 15
6 Medicago truncatula 50444 39818 10627 5965 639 3264 3864 8195 10712 19
7 Musa acuminata 35012 28752 6261 4698 624 949 1069 2345 2745 21
8 Oryza sativa 35775 23377 12399 4605 787 1440 1835 3544 4906 19
9 Phaseolus vulgaris 28134 22763 5372 4126 686 1743 1888 4702 5383 21
10 Prunus persica 26873 20222 6652 3653 292 1758 1949 4962 5928 22
11 Solanum tuberosum 39021 31477 7545 4465 1044 2558 2891 6488 7903 16
12 Theobroma cacao 29188 21051 8138 3614 606 1593 1836 4041 5069 22
13 Vigna angularis 33860 26954 6907 4608 649 1622 1939 3820 4771 26
14 Vigna radiata 22368 17021 5348 3556 411 599 728 1332 1705 7

View File

@ -0,0 +1,12 @@
Species,Genes,Duplicates,Singletons,Families,LargestFamily,TAG0,TAG1,TAG0 Genes,TAG1 Genes,Largest TAG0
Arabidopsis lyrata,32667,"80,57 %","19,43 %",5012,494,1571,1834,"14,70 %","18,01 %",14
Glycine max,55897,"89,91 %","10,09 %",8421,437,2626,2922,"13,32 %","15,70 %",32
Gossypium raimondii,38208,"86,30 %","13,70 %",5519,333,1964,2163,"15,78 %","18,67 %",23
Malus domestica Golden,40624,"86,03 %","13,97 %",6912,858,2340,2707,"16,15 %","20,00 %",15
Musa acuminata,35012,"82,12 %","17,88 %",4698,624,949,1069,"8,16 %","9,55 %",21
Oryza sativa,35775,"65,34 %","34,66 %",4605,787,1440,1835,"15,16 %","20,99 %",19
Prunus persica,26873,"75,25 %","24,75 %",3653,292,1758,1949,"24,54 %","29,31 %",22
Solanum tuberosum,39021,"80,67 %","19,34 %",4465,1044,2558,2891,"20,61 %","25,11 %",16
Theobroma cacao,29188,"72,12 %","27,88 %",3614,606,1593,1836,"19,20 %","24,08 %",22
Vigna angularis,33860,"79,60 %","20,40 %",4608,649,1622,1939,"14,17 %","17,70 %",26
Vigna radiata,22368,"76,10 %","23,91 %",3556,411,599,728,"7,83 %","10,02 %",7
1 Species Genes Duplicates Singletons Families LargestFamily TAG0 TAG1 TAG0 Genes TAG1 Genes Largest TAG0
2 Arabidopsis lyrata 32667 80,57 % 19,43 % 5012 494 1571 1834 14,70 % 18,01 % 14
3 Glycine max 55897 89,91 % 10,09 % 8421 437 2626 2922 13,32 % 15,70 % 32
4 Gossypium raimondii 38208 86,30 % 13,70 % 5519 333 1964 2163 15,78 % 18,67 % 23
5 Malus domestica Golden 40624 86,03 % 13,97 % 6912 858 2340 2707 16,15 % 20,00 % 15
6 Musa acuminata 35012 82,12 % 17,88 % 4698 624 949 1069 8,16 % 9,55 % 21
7 Oryza sativa 35775 65,34 % 34,66 % 4605 787 1440 1835 15,16 % 20,99 % 19
8 Prunus persica 26873 75,25 % 24,75 % 3653 292 1758 1949 24,54 % 29,31 % 22
9 Solanum tuberosum 39021 80,67 % 19,34 % 4465 1044 2558 2891 20,61 % 25,11 % 16
10 Theobroma cacao 29188 72,12 % 27,88 % 3614 606 1593 1836 19,20 % 24,08 % 22
11 Vigna angularis 33860 79,60 % 20,40 % 4608 649 1622 1939 14,17 % 17,70 % 26
12 Vigna radiata 22368 76,10 % 23,91 % 3556 411 599 728 7,83 % 10,02 % 7

BIN
docs/report/media/Evolution_fate_duplicate_genes.pdf (Stored with Git LFS) Normal file

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

BIN
docs/report/media/Soja.jpeg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 80 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 119 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 912 KiB

BIN
docs/report/media/gene_orientation_figure.pdf (Stored with Git LFS) Normal file

Binary file not shown.

BIN
docs/report/media/lallemand2020-fig1_copy.pdf (Stored with Git LFS) Normal file

Binary file not shown.

BIN
docs/report/media/lallemand2020_fig1_part1.pdf (Stored with Git LFS) Normal file

Binary file not shown.

BIN
docs/report/media/tag-definition.pdf (Stored with Git LFS) Normal file

Binary file not shown.

209
docs/report/references.bib Normal file
View File

@ -0,0 +1,209 @@
@article{altschulBasicLocalAlignment1990,
title = {Basic Local Alignment Search Tool},
author = {Altschul, Stephen F. and Gish, Warren and Miller, Webb and Myers, Eugene W. and Lipman, David J.},
date = {1990-10-05},
journaltitle = {Journal of Molecular Biology},
shortjournal = {Journal of Molecular Biology},
volume = {215},
number = {3},
pages = {403--410},
issn = {0022-2836},
doi = {10.1016/S0022-2836(05)80360-2},
url = {https://www.sciencedirect.com/science/article/pii/S0022283605803602},
urldate = {2023-04-30},
abstract = {A new approach to rapid sequence comparison, basic local alignment search tool (BLAST), directly approximates alignments that optimize a measure of local similarity, the maximal segment pair (MSP) score. Recent mathematical results on the stochastic properties of MSP scores allow an analysis of the performance of this method as well as the statistical significance of alignments it generates. The basic algorithm is simple and robust; it can be implemented in a number of ways and applied in a variety of contexts including straight-forward DNA and protein sequence database searches, motif searches, gene identification searches, and in the analysis of multiple regions of similarity in long DNA sequences. In addition to its flexibility and tractability to mathematical analysis, BLAST is an order of magnitude faster than existing sequence comparison tools of comparable sensitivity.},
langid = {english}
}
@article{blancWidespreadPaleopolyploidyModel2004,
title = {Widespread {{Paleopolyploidy}} in {{Model Plant Species Inferred}} from {{Age Distributions}} of {{Duplicate Genes}}},
author = {Blanc, Guillaume and Wolfe, Kenneth H.},
date = {2004-07-02},
journaltitle = {The Plant Cell},
shortjournal = {The Plant Cell},
volume = {16},
number = {7},
pages = {1667--1678},
issn = {1040-4651},
doi = {10.1105/tpc.021345},
url = {https://doi.org/10.1105/tpc.021345},
urldate = {2025-01-09},
abstract = {It is often anticipated that many of today's diploid plant species are in fact paleopolyploids. Given that an ancient large-scale duplication will result in an excess of relatively old duplicated genes with similar ages, we analyzed the timing of duplication of pairs of paralogous genes in 14 model plant species. Using EST contigs (unigenes), we identified pairs of paralogous genes in each species and used the level of synonymous nucleotide substitution to estimate the relative ages of gene duplication. For nine of the investigated species (wheat [Triticum aestivum], maize [Zea mays], tetraploid cotton [Gossypium hirsutum], diploid cotton [G. arboretum], tomato [Lycopersicon esculentum], potato [Solanum tuberosum], soybean [Glycine max], barrel medic [Medicago truncatula], and Arabidopsis thaliana), the age distributions of duplicated genes contain peaks corresponding to short evolutionary periods during which large numbers of duplicated genes were accumulated. Large-scale duplications (polyploidy or aneuploidy) are strongly suspected to be the cause of these temporal peaks of gene duplication. However, the unusual age profile of tandem gene duplications in Arabidopsis indicates that other scenarios, such as variation in the rate at which duplicated genes are deleted, must also be considered.}
}
@article{lariviereMethodesBioinformatiquesDanalyse,
title = {Méthodes bioinformatiques d'analyse de l'histoire évolutive des familles de gènes ˸ intégration de données, indices évolutifs, et analyses fonctionnelles appliquées aux familles de gènes impliquées dans la réponse des plantes aux stress environnementaux},
author = {Lariviere, Delphine},
langid = {french}
}
@report{le-hoangEtudeTranscriptomiqueGenes2017,
type = {Internship Report},
title = {Etude transcriptomique des gènes dupliqués en tandem (TAG) chez Arabidopsis thaliana},
author = {Lê-Hoang, Julie},
date = {2017},
institution = {Laboratoire de Mathématiques et Modélisation d'Évry},
langid = {french}
}
@article{liTranscriptionalAnalysisHighly2008,
title = {Transcriptional Analysis of Highly Syntenic Regions between {{Medicago}} Truncatula and {{Glycine}} Max Using Tiling Microarrays},
author = {Li, Lei and He, Hang and Zhang, Juan and Wang, Xiangfeng and Bai, Sulan and Stolc, Viktor and Tongprasit, Waraporn and Young, Nevin D. and Yu, Oliver and Deng, Xing-Wang},
date = {2008-03-19},
journaltitle = {Genome Biology},
shortjournal = {Genome Biol},
volume = {9},
number = {3},
pages = {R57},
issn = {1474-760X},
doi = {10.1186/gb-2008-9-3-r57},
url = {https://doi.org/10.1186/gb-2008-9-3-r57},
urldate = {2025-01-08},
abstract = {Legumes are the third largest family of flowering plants and are unique among crop species in their ability to fix atmospheric nitrogen. As a result of recent genome sequencing efforts, legumes are now one of a few plant families with extensive genomic and transcriptomic data available in multiple species. The unprecedented complexity and impending completeness of these data create opportunities for new approaches to discovery.},
langid = {english},
keywords = {Additional Data File,Medicago Truncatula,Organ Type,Syntenic Region,Tiling Array}
}
@book{ohnoEvolutionGeneDuplication1970,
title = {Evolution by {{Gene Duplication}}},
author = {Ohno, Susumu},
date = {1970},
publisher = {Springer Berlin Heidelberg},
location = {Berlin, Heidelberg},
doi = {10.1007/978-3-642-86659-3},
url = {http://link.springer.com/10.1007/978-3-642-86659-3},
urldate = {2024-03-21},
isbn = {978-3-642-86661-6},
langid = {english}
}
@article{pfeilPlacingPaleopolyploidyRelation2005,
title = {Placing {{Paleopolyploidy}} in {{Relation}} to {{Taxon Divergence}}: {{A Phylogenetic Analysis}} in {{Legumes Using}} 39 {{Gene Families}}},
shorttitle = {Placing {{Paleopolyploidy}} in {{Relation}} to {{Taxon Divergence}}},
author = {Pfeil, B E and Schlueter, J A and Shoemaker, R C and Doyle, J J},
date = {2005-06-01},
journaltitle = {Systematic Biology},
shortjournal = {Systematic Biology},
volume = {54},
number = {3},
pages = {441--454},
issn = {1063-5157},
doi = {10.1080/10635150590945359},
url = {https://doi.org/10.1080/10635150590945359},
urldate = {2025-01-15},
abstract = {Young polyploid events are easily diagnosed by various methods, but older polyploid events become increasingly difficult to identify as chromosomal rearrangements, tandem gene or partial chromosome duplications, changes in substitution rates among duplicated genes, pseudogenization or locus loss, and interlocus interactions complicate the means of inferring past genetic events. Genomic data have provided valuable information about the polyploid history of numerous species, but on their own fail to show whether related species, each with a polyploid past, share a particular polyploid event. A phylogenetic approach provides a powerful method to determine this but many processes may mislead investigators. These processes can affect individual gene trees, but most likely will not affect all genes, and almost certainly will not affect all genes in the same way. Thus, a multigene approach, which combines the large-scale aspect of genomics with the resolution of phylogenetics, has the power to overcome these difficulties and allow us to infer genomic events further into the past than would otherwise be possible. Previous work using synonymous distances among gene pairs within species has shown evidence for large-scale duplications in the legumes Glycine max and Medicago truncatula. We present a case study using 39 gene families, each with three or four members in G. max and the putative orthologues in M. truncatula, rooted using Arabidopsis thaliana. We tested whether the gene duplications in these legumes occurred separately in each lineage after their divergence (Hypothesis 1), or whether they share a round of gene duplications (Hypothesis 2). Many more gene family topologies supported Hypothesis 2 over Hypothesis 1 (11 and 2, respectively), even after synonymous distance analysis revealed that some topologies were providing misleading results. Only ca. 
33\% of genes examined support either hypothesis, which strongly suggests that single gene family approaches may be insufficient when studying ancient events with nuclear DNA. Our results suggest that G. max and M. truncatula, along with approximately 7000 other legume species from the same clade, share an ancient round of gene duplications, either due to polyploidy or to some other process.}
}
@article{reimandProfilerWebServer2016,
title = {G:{{Profiler}}—a Web Server for Functional Interpretation of Gene Lists (2016 Update)},
shorttitle = {G:{{Profiler}}},
author = {Reimand, Jüri and Arak, Tambet and Adler, Priit and Kolberg, Liis and Reisberg, Sulev and Peterson, Hedi and Vilo, Jaak},
date = {2016-07-08},
journaltitle = {Nucleic Acids Research},
shortjournal = {Nucleic Acids Research},
volume = {44},
number = {W1},
pages = {W83--W89},
issn = {0305-1048},
doi = {10.1093/nar/gkw199},
url = {https://doi.org/10.1093/nar/gkw199},
urldate = {2025-01-16},
abstract = {Functional enrichment analysis is a key step in interpreting gene lists discovered in diverse high-throughput experiments. g:Profiler studies flat and ranked gene lists and finds statistically significant Gene Ontology terms, pathways and other gene function related terms. Translation of hundreds of gene identifiers is another core feature of g:Profiler. Since its first publication in 2007, our web server has become a popular tool of choice among basic and translational researchers. Timeliness is a major advantage of g:Profiler as genome and pathway information is synchronized with the Ensembl database in quarterly updates. g:Profiler supports 213 species including mammals and other vertebrates, plants, insects and fungi. The 2016 update of g:Profiler introduces several novel features. We have added further functional datasets to interpret gene lists, including transcription factor binding site predictions, Mendelian disease annotations, information about protein expression and complexes and gene mappings of human genetic polymorphisms. Besides the interactive web interface, g:Profiler can be accessed in computational pipelines using our R package, Python interface and BioJS component. g:Profiler is freely available at http://biit.cs.ut.ee/gprofiler/.}
}
@article{schlueterFractionationSyntenyGenomic2008,
title = {Fractionation of {{Synteny}} in a {{Genomic Region Containing Tandemly Duplicated Genes}} across {{Glycine}} Max, {{Medicago}} Truncatula, and {{Arabidopsis}} Thaliana},
author = {Schlueter, Jessica A. and Scheffler, Brian E. and Jackson, Scott and Shoemaker, Randy C.},
date = {2008-07-01},
journaltitle = {Journal of Heredity},
shortjournal = {Journal of Heredity},
volume = {99},
number = {4},
pages = {390--395},
issn = {0022-1503},
doi = {10.1093/jhered/esn010},
url = {https://doi.org/10.1093/jhered/esn010},
urldate = {2025-01-08},
abstract = {Extended comparison of gene sequences found on homeologous soybean Bacterial Artificial Chromosomes to Medicago truncatula and Arabidopsis thaliana genomic sequences demonstrated a network of synteny within conserved regions interrupted by gene addition and/or deletions. Consolidation of gene order among all 3 species provides a picture of ancestral gene order. The observation supports a genome history of fractionation resulting from gene loss/addition and rearrangement. In all 3 species, clusters of N-hydroxycinnamoyl/benzoyltransferase genes were identified in tandemly duplicated clusters. Parsimony-based gene trees suggest that the genes within the arrays have independently undergone tandem duplication in each species.}
}
@article{schmutzGenomeSequencePalaeopolyploid2010,
title = {Genome Sequence of the Palaeopolyploid Soybean},
author = {Schmutz, Jeremy and Cannon, Steven B. and Schlueter, Jessica and Ma, Jianxin and Mitros, Therese and Nelson, William and Hyten, David L. and Song, Qijian and Thelen, Jay J. and Cheng, Jianlin and Xu, Dong and Hellsten, Uffe and May, Gregory D. and Yu, Yeisoo and Sakurai, Tetsuya and Umezawa, Taishi and Bhattacharyya, Madan K. and Sandhu, Devinder and Valliyodan, Babu and Lindquist, Erika and Peto, Myron and Grant, David and Shu, Shengqiang and Goodstein, David and Barry, Kerrie and Futrell-Griggs, Montona and Abernathy, Brian and Du, Jianchang and Tian, Zhixi and Zhu, Liucun and Gill, Navdeep and Joshi, Trupti and Libault, Marc and Sethuraman, Anand and Zhang, Xue-Cheng and Shinozaki, Kazuo and Nguyen, Henry T. and Wing, Rod A. and Cregan, Perry and Specht, James and Grimwood, Jane and Rokhsar, Dan and Stacey, Gary and Shoemaker, Randy C. and Jackson, Scott A.},
date = {2010-01},
journaltitle = {Nature},
volume = {463},
number = {7278},
pages = {178--183},
publisher = {Nature Publishing Group},
issn = {1476-4687},
doi = {10.1038/nature08670},
url = {https://www.nature.com/articles/nature08670},
urldate = {2025-01-15},
abstract = {Soybean (Glycine max) is one of the most important crop plants for seed protein and oil content, and for its capacity to fix atmospheric nitrogen through symbioses with soil-borne microorganisms. We sequenced the 1.1-gigabase genome by a whole-genome shotgun approach and integrated it with physical and high-density genetic maps to create a chromosome-scale draft sequence assembly. We predict 46,430 protein-coding genes, 70\% more than Arabidopsis and similar to the poplar genome which, like soybean, is an ancient polyploid (palaeopolyploid). About 78\% of the predicted genes occur in chromosome ends, which comprise less than one-half of the genome but account for nearly all of the genetic recombination. Genome duplications occurred at approximately 59 and 13 million years ago, resulting in a highly duplicated genome with nearly 75\% of the genes present in multiple copies. The two duplication events were followed by gene diversification and loss, and numerous chromosome rearrangements. An accurate soybean genome sequence will facilitate the identification of the genetic basis of many soybean traits, and accelerate the creation of improved soybean varieties.},
langid = {english},
keywords = {DNA sequencing,Plant genetics}
}
@incollection{stuparInsightsSoybeanGlycine2013,
title = {Insights from the {{Soybean}} ({{Glycine}} Max and {{Glycine}} Soja) {{Genome}}: {{Past}}, {{Present}}, and {{Future}}},
booktitle = {Advances in {{Agronomy}}},
author = {Stupar, Robert M. and Specht, James E.},
date = {2013},
volume = {118},
pages = {177--204},
url = {https://www.sciencedirect.com/science/article/abs/pii/B9780124059429000049},
urldate = {2025-01-08},
langid = {english}
}
@article{thomasPANTHERMakingGenomescale2022,
title = {{{PANTHER}}: {{Making}} Genome-Scale Phylogenetics Accessible to All},
shorttitle = {{{PANTHER}}},
author = {Thomas, Paul D. and Ebert, Dustin and Muruganujan, Anushya and Mushayahama, Tremayne and Albou, Laurent-Philippe and Mi, Huaiyu},
date = {2022},
journaltitle = {Protein Science},
volume = {31},
number = {1},
pages = {8--22},
issn = {1469-896X},
doi = {10.1002/pro.4218},
url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/pro.4218},
urldate = {2025-01-16},
abstract = {Phylogenetics is a powerful tool for analyzing protein sequences, by inferring their evolutionary relationships to other proteins. However, phylogenetics analyses can be challenging: they are computationally expensive and must be performed carefully in order to avoid systematic errors and artifacts. Protein Analysis THrough Evolutionary Relationships (PANTHER; http://pantherdb.org) is a publicly available, user-focused knowledgebase that stores the results of an extensive phylogenetic reconstruction pipeline that includes computational and manual processes and quality control steps. First, fully reconciled phylogenetic trees (including ancestral protein sequences) are reconstructed for a set of “reference” protein sequences obtained from fully sequenced genomes of organisms across the tree of life. Second, the resulting phylogenetic trees are manually reviewed and annotated with function evolution events: inferred gains and losses of protein function along branches of the phylogenetic tree. Here, we describe in detail the current contents of PANTHER, how those contents are generated, and how they can be used in a variety of applications. The PANTHER knowledgebase can be downloaded or accessed via an extensive API. In addition, PANTHER provides software tools to facilitate the application of the knowledgebase to common protein sequence analysis tasks: exploring an annotated genome by gene function; performing “enrichment analysis” of lists of genes; annotating a single sequence or large batch of sequences by homology; and assessing the likelihood that a genetic variant at a particular site in a protein will have deleterious effects.},
langid = {english},
keywords = {gene ontology,genome analysis,hidden Markov model,molecular evolution,omics data analysis,phylogenetic tree,protein function annotation,protein function evolution}
}
@incollection{vandongenUsingMCLExtract2012a,
title = {Using {{MCL}} to {{Extract Clusters}} from {{Networks}}},
booktitle = {Bacterial {{Molecular Networks}}},
author = {Van Dongen, Stijn and Abreu-Goodger, Cei},
editor = {Van Helden, Jacques and Toussaint, Ariane and Thieffry, Denis},
date = {2012},
volume = {804},
pages = {281--295},
publisher = {Springer New York},
location = {New York, NY},
doi = {10.1007/978-1-61779-361-5_15},
url = {http://link.springer.com/10.1007/978-1-61779-361-5_15},
urldate = {2024-04-11},
isbn = {978-1-61779-360-8 978-1-61779-361-5},
langid = {english}
}
@article{yangEstimatingSynonymousNonsynonymous2000a,
title = {Estimating {{Synonymous}} and {{Nonsynonymous Substitution Rates Under Realistic Evolutionary Models}}},
author = {Yang, Ziheng and Nielsen, Rasmus},
date = {2000-01-01},
journaltitle = {Molecular Biology and Evolution},
shortjournal = {Molecular Biology and Evolution},
volume = {17},
number = {1},
pages = {32--43},
issn = {0737-4038},
doi = {10.1093/oxfordjournals.molbev.a026236},
url = {https://doi.org/10.1093/oxfordjournals.molbev.a026236},
urldate = {2024-12-29},
abstract = {Approximate methods for estimating the numbers of synonymous and nonsynonymous substitutions between two DNA sequences involve three steps: counting of synonymous and nonsynonymous sites in the two sequences, counting of synonymous and nonsynonymous differences between the two sequences, and correcting for multiple substitutions at the same site. We examine complexities involved in those steps and propose a new approximate method that takes into account two major features of DNA sequence evolution: transition/transversion rate bias and base/codon frequency bias. We compare the new method with maximum likelihood, as well as several other approximate methods, by examining infinitely long sequences, performing computer simulations, and analyzing a real data set. The results suggest that when there are transition/transversion rate biases and base/codon frequency biases, previously described approximate methods for estimating the nonsynonymous/synonymous rate ratio may involve serious biases, and the bias can be both positive and negative. The new method is, in general, superior to earlier approximate methods and may be useful for analyzing large data sets, although maximum likelihood appears to always be the method of choice.}
}

3
docs/report/report.pdf Normal file
View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:031de836dbc48cc6694f81bbfc06cf5c8e0acedcc737066670af79360d76e531
size 1394512

366
docs/report/report.tex Normal file
View File

@ -0,0 +1,366 @@
\documentclass{scrreprt}
\usepackage{sty/style}
\title{Tandemly arrayed genes in \textit{Glycine max}}
\subtitle{Comparative genomics project}
\titlehead{M2 GENIOMHE}
\author{Naïa Périnelle \and Samuel Ortion}
\date{2024-12-21}
\makeglossaries
\makeindex
\addbibresource{references.bib}
\begin{document}
\maketitle
\tableofcontents
\clearpage
\begin{relaxclearpage}
\listoffigures
\listoftables
\end{relaxclearpage}
\chapter{Introduction}
The objective of our analysis is to determine the amount of duplicate genes in a plant species.
We chose to focus more specifically on soybean (\textit{Glycine max}).
We aimed to determine the amount of tandemly arrayed genes.
The main questions that marked our analysis progress were:
Is there a plant clade that is more loaded in duplicated genes?
Is the orientation of TAG gene pairs random?
Are there many duplicated operons in plants?
Is TAG gene duplication more recent than non-TAG duplicates?
\section{Soybean}
Soybean (\textit{Glycine max}) is a legume native to East Asia that is widely used in agriculture, mainly for cattle feed or human consumption.
Soybean has been domesticated from \textit{Glycine soja} multiple times in China \autocite{stuparInsightsSoybeanGlycine2013}. Documents dated from 1500 to 1100 BC relate its use.
Pictures of the general aspect of \textit{Glycine max}, its flowers, leaves and beans are reproduced in \cref{fig:soy-plant} and \cref{fig:soy-bean-varieties}.
Soybean karyotype comprises 20 chromosomes. Its genome is about 1~Gbp long. Soybean proteome counts 55,897 protein-coding genes.
General statistics on \textit{Glycine max} genome are summarized in \cref{tab:glycine-max-genome-statistics}.
%
\begin{figure}
\centering
\begin{subfigure}[b]{0.3\columnwidth}
\centering
\includegraphics[height=3.5cm]{media/Glycine_max_plant1_Carol_Rose_(10220578213).jpg}
\caption{}
\label{fig:soy-plant-global}
\end{subfigure}
\begin{subfigure}[b]{0.3\columnwidth}
\centering
\includegraphics[height=3.5cm]{media/Soybean_flowers.png}
\caption{}
\label{fig:soy-plant-flower}
\end{subfigure}
\begin{subfigure}[b]{0.3\columnwidth}
\centering
\includegraphics[height=3.5cm]{media/Plante_de_Soja_-_Feuilles_et_fruits.jpg}
\caption{}
\label{fig:soy-plant-leaves}
\end{subfigure}
\caption[Soy plant pictures]{Soy plant pictures. (a) General aspect of the plant (Harry Rose, \href{https://creativecommons.org/licenses/by/2.0}{CC BY 2.0}, via Wikimedia Commons). (b) Flowers (Huwmanbeing, Public Domain via Wikimedia Commons). (c) Leaves and fruits (Pancrat, \href{https://creativecommons.org/licenses/by-sa/3.0}{CC BY-SA 3.0}, via Wikimedia Commons).}
\label{fig:soy-plant}
\end{figure}
\begin{figure}
\centering
\includegraphics[width=0.5\linewidth]{media/Soybeanvarieties.jpeg}
\caption{Soy bean varieties}
\label{fig:soy-bean-varieties}
\end{figure}
\section{Duplicate genes and gene families}
During species evolution, different mechanisms may lead to a duplication of a gene.
Polyploidization may occur thanks to an abnormal meiosis, and lead to the duplication of the whole genome (Whole Genome Duplication, WGD) or a chromosome (as in the case of aneuploidy).
An unequal crossing-over may duplicate a segment of a chromosome, which may lead to a segmental duplication or to tandemly arrayed genes.
Transposable elements may also introduce new copies of genes.
A DNA transposon may incorporate a gene sequence, with its transposase enzyme, at a new place in the genome. The reverse transcriptase of retrotransposons may reverse-transcribe the mRNA of a gene into a cDNA that may be inserted back into the genome, resulting in a copy of the transcribed gene with its introns lost and with a poly-A tail appended.
\section{The fate of duplicate genes}
Once a gene is duplicated, one of the two copies may be lost in a process known as pseudogenization. They may both keep the same function, resulting in a functional redundancy. Otherwise, both of them may specialize in a subpart of the original function, being subject to subfunctionalization. Finally, one of the duplicate genes may acquire a new function, a process termed neofunctionalization.
\section{Tandemly arrayed genes}
Tandemly Arrayed Genes (TAGs) are duplicated genes located close to each other on the same chromosome.
Closeness on a chromosome is measured with the number of genes that do not belong to the duplicate gene family and that are located between two members of the TAG. Those genes are called ``spacers''. We denote by TAG\textsubscript{$d$} the TAG definition allowing at most $d$ spacers between TAG genes.
\section{Synonymous mutation rate as a proxy of duplication age}
Codons are units of three nucleotides encoding one amino acid. There exist $4^3 = 64$ codons, but only 20 proteinogenic amino acids. Thus, some amino acids may be encoded by several different codons.
A mutation in a coding sequence may not cause any change in the peptide sequence: this is what we call a synonymous mutation. The ratio of the number of synonymous mutations to the number of synonymous sites ($K_s$) can be used as a proxy for the age of the duplication, under the assumption that the mutation rate remains steady over time and is similar in every gene.
\chapter{Material and Methods}
\section{Data sources}
For our species \textit{Glycine max}, we used the following input data files from the \href{https://plants.ensembl.org/Glycine_max/Info/Index}{Ensembl Plant portal}:
\begin{itemize}
\item \verb|Glycine_max.Glycine_max_v2.1.pep.all.fa.gz| -- the proteome of \textit{Glycine max} in compressed FASTA format
\item \verb|Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz| -- the genome features table for \textit{Glycine max} in compressed GFF3 format
\item \verb|Glycine_max.Glycine_max_v2.1.cds.all.fa.gz| -- the coding sequences of \textit{Glycine max} genes in compressed FASTA format
\end{itemize}
In addition, we used the following input data files from eCampus:
\begin{itemize}
\item \verb|Glycine_max_list| -- the list of protein IDs and their respective gene ID
\item \verb|Glycine_max_Blastp_longIsoforme| -- the results of the BLASTp alignment already performed.
\end{itemize}
For our extended analysis on the other proposed plants (cf. \cref{tab:count-table-for-all-plants}), we used the same data sources.
\section{Identification of Duplicated Gene Families and Detection of Tandemly Arrayed Genes}
\paragraph{Mitochondria and Chloroplasts} We do not want mitochondrial nor chloroplast genes to appear in our dataset. In our case, none of the organelles were sequenced in \textit{Glycine max}, so we did not have to filter out these genes. We did however systematically filter these genes in our analyses on the whole set of plants.
\paragraph{Supercontigs} In the Ensembl Plant proteome file, the denomination ``supercontig'' corresponds to a contiguous sequence that the assembly could not locate in the right position on the genome. We filter out proteins whose sequence is located in a supercontig.
\paragraph{Isoform selection} We kept only the longest protein isoform of each gene.
\paragraph{BLASTp all-against-all} To estimate the homology links between each pair of genes, we ran a BLASTp \autocite{altschulBasicLocalAlignment1990} all-against-all on the filtered proteome.
\paragraph{Filter BLASTp hits} Resulting BLASTp hits were filtered based on the coverage of the local alignment on both query and subject sequences and identity percentage for two different datasets whose threshold values are reported in \cref{tab:dataset-threshold-values}.
\begin{table}
\centering
\begin{tabular}{ccc}
\toprule
& \bfseries Low stringency dataset & \bfseries High stringency dataset \\
\midrule
\bfseries Coverage & >30\% & >40\% \\
\bfseries Identity & >30\% & >50\% \\
\bottomrule
\end{tabular}
\caption{\label{tab:dataset-threshold-values} Threshold values for the high and low stringency datasets}
\end{table}
\paragraph{Clustering of the homology graph} From the BLASTp file, we extracted the inferred homology links between pairs of genes with the bitscore value of the local alignment. We kept the highest bitscore value in case there were several alignments for the same pair of genes. We clustered the homology graph with the Markov Clustering algorithm (\texttt{mcl} 22-28) to extract communities of genes that we consider being families of duplicate genes \autocite{vandongenUsingMCLExtract2012a}.
\paragraph{Detection of TAGs} A homemade Rust program was written to associate each gene with its TAG status: whether it belongs to a TAG, and which one, or whether it does not belong to any TAG. This program accepts as input a list of TAG definition numbers (maximum numbers of spacers). Another version of the code was developed in Python. Comparing the results of the two programs allowed us to correct errors in both until they produced the same result.
\paragraph{Workflow automation with Nextflow} The workflow has been ported on Nextflow to be run on the thirteen plant genomes proposed more easily. Each plant genome analysis takes about a minute to run, from the given BLASTP format 7 file to the summary statistics.
\section{Testing orientation concordance between TAG genes and non-TAG genes}
Based on the TAG data table, we generated all pairs of genes not separated by any spacer (definition TAG\textsubscript{0}). We associated each pair to whether they belong to the same TAG and to its orientation concordance status. If both genes are oriented in the same direction, their orientation is said to be \textit{coherent} ($\rightarrow \rightarrow$ or $\leftarrow\leftarrow$). If the genes head in opposite directions, they are said to be \textit{divergent} ($\leftarrow \rightarrow$). If they head towards each other ($\rightarrow \leftarrow$), their orientation is \textit{convergent}.
We ran a Fisher exact test on the resulting contingency table (TAG status against orientation concordance).
The tested hypotheses are
\[
\begin{cases}
(H_0) & \text{the gene pair orientation concordance is not associated with the TAG status} \\
(H_1) & \text{the gene pair orientation concordance is associated with the TAG status}
\end{cases}
\]
Given a $2 \times 3$ contingency table like \cref{tab:contingency-table}, the probability of observing this table under $H_0$, conditionally on its margins, is
\begin{equation}
\label{eqn:fisher-exact-test-2x3}
P = \frac{\binom{a + d}{a}\binom{b + e}{b}\binom{c + f}{c}}{\binom{n}{a + b + c}},
\end{equation}
where $n = a + b + c + d + e + f$ is the total count of gene pairs. The Fisher exact test $p$-value is the sum of these probabilities over all tables with the same margins that are at most as probable as the observed one. We used the R function \texttt{fisher.test} to run this test.
\begin{table}[H]
\centering
\begin{tblr}{
hline{1,2}={3-Z}{solid}, hline{3-Z}={solid},
vline{1,2}={3-Z}{solid}, vline{3-Z}={solid},
cell{1}{3} = {c = 3}{halign = c},
cell{3}{1} = {r = 2}{valign = m},
}
& & Category 1 & \\
& & Group 1 & Group 2 & Group 3 \\
Category 2 & Group 1 & $a$ & $b$ & $c$ \\
& Group 2 & $d$ & $e$ & $f$
\end{tblr}
\caption{$2 \times 3$ contingency table}
\label{tab:contingency-table}
\end{table}
\section{Testing age difference between TAG pairs and non-TAG duplicates}
Given the list of duplicate genes associated with their family identifiers, we built a list of duplicate gene pairs.
This list contains 1,029,762 pairs.
Then, we computed the $K_a$ and $K_s$ values using \texttt{PAML} v4.10.7.
To do so, we wrote a simple bash script that (i) extracts the protein sequence of each of the gene pair members, (ii) extracts the coding sequence of these proteins, (iii) aligns the proteins using \texttt{Clustalw2} v2.1, (iv) uses Pal2Nal v14 to obtain the corresponding nucleotide alignment, and finally (v) runs the PAML \texttt{yn00} tool, which uses the Yang and Nielsen model, to compute $K_a$ and $K_s$ values \autocite{yangEstimatingSynonymousNonsynonymous2000a}.
$K_s$ values above 5 have been filtered out, as the duplication events associated with such a high $K_s$ value are too ancient for the $K_s$ value to be reliable.
We did not filter the $K_s$ values based on the standard deviation.
To test whether the duplication age of TAG genes is more recent than that of non-TAG genes, we performed a Wilcoxon-Mann-Whitney U test.
For randomly selected values $X$ and $Y$ from `non-TAG' and `TAG' respectively, the tested hypotheses are
\[
\begin{cases}
(H_0) & \parbox[t]{.6\textwidth}{the probability of $X$ to be greater than $Y$ is equal to the probability of $Y$ being greater than $X$} \\
(H_1) & \parbox[t]{.6\textwidth}{the probability of $X$ to be greater than $Y$ is greater than the probability of $Y$ being greater than $X$}
\end{cases}
\]
To do so, we used the \texttt{wilcox.test} R function with the ``greater'' alternative.
\section{Identification of big TAG and functional analysis}
\paragraph{PANTHERdb annotation}
We counted the number of duplicated genes within each TAG to analyze the distribution of genes across TAGs and identify the TAG with the highest number of duplicated genes. We then retrieved the list of gene IDs contained in the largest TAG according to the definition ``1 spacer''. To perform a brief functional analysis of the genes in this largest TAG\textsubscript{1}, we used the online \href{https://pantherdb.org/}{PANTHERdb} (Protein ANalysis THrough Evolutionary Relationships database) \autocite{thomasPANTHERMakingGenomescale2022}. We uploaded our gene lists and selected \textit{Glycine max} as the organism. We ran a statistical over-representation test using PANTHER GO-Slim Biological Process as the annotation set. The analysis was conducted with default parameters.
We also tried other annotation data sets, and even changed the test type and the multiple-testing correction (respectively \texttt{Fisher exact} and \texttt{Calculate false discovery rate} initially, then \texttt{Binomial} and/or \texttt{Bonferroni correction for multiple testing}), but the results always remained the same.
\paragraph{g:Profiler annotation} Using the same set of TAG gene identifiers, we ran a functional annotation with g:Profiler R package \autocite{reimandProfilerWebServer2016}.
\chapter{Results}
\Cref{tab:statistics-results-glycine-max} reports the counts we obtained during our analysis. \Cref{tab:count-table-for-all-plants} reports our duplicate genes and TAGs counts for the thirteen plants selection.
In \cref{fig:family-sizes-distributions}, the distribution of the sizes of the duplicate gene families is represented for both datasets. As expected, this distribution follows an exponential decay.
\begin{table}
\centering
\begin{tabular}{ll}
\toprule
\bfseries Variable & \bfseries Value \\
\midrule
Chromosomes & 20 \\
Genome length & 978,491,270~bp \\
Protein coding genes (with supercontigs) & 55,897 \\
Protein coding genes (without supercontigs) & 55,589 \\
Protein isoforms & 88,412 \\
\bottomrule
\end{tabular}
\caption{General statistics on \textit{Glycine max} genome}
\label{tab:glycine-max-genome-statistics}
\end{table}
\begin{figure}
\centering
\begin{subfigure}[t]{0.45\columnwidth}
\centering
\includegraphics[width=\columnwidth]{results/Glycine_max_family_size_hist_coverage30_identity30.pdf}
\end{subfigure}
\begin{subfigure}[t]{0.45\columnwidth}
\centering
\includegraphics[width=\columnwidth]{results/Glycine_max_family_size_hist_coverage40_identity50.pdf}
\end{subfigure}
\caption{Gene family size distributions for the low and high stringency datasets in \textit{Glycine max}}
\label{fig:family-sizes-distributions}
\end{figure}
\begin{table}
\centering
\begin{tabular}{lll}
\toprule
\bfseries Dataset stringency & \bfseries low & \bfseries high \\
\midrule
Families & 8,426 & 11,997 \\
Duplicate genes & 50,254 (89.9\%) & 46,769 (83.7\%) \\
Singletons & 5,643 (10.1\%) & 9,128 (16.3\%) \\
TAG\textsubscript{0} & 2,620 & 2,157 \\
TAG\textsubscript{1} & 2,916 & 2,438 \\
Genes in TAG\textsubscript{0} & 6,652 (13\%) & 5,335 (11\%) \\
Genes in TAG\textsubscript{1} & 7,857 (16\%) & 6,420 (14\%) \\
Genes in biggest TAG\textsubscript{0} & 32 (0.48\%) & 32 (0.60\%) \\
Genes in biggest TAG\textsubscript{1} & 43 (0.55\%) & 43 (0.67\%) \\
\bottomrule
\end{tabular}
\caption{Number of families, duplicated genes, singletons, TAGs, duplicated genes belonging to a TAG, and maximum size of a TAG obtained in \textit{Glycine max} for the low and high stringency datasets. The percentages of genes in TAG\textsubscript{$d$} are calculated from the number of duplicated genes, not the total number of genes. Those for the largest TAG are calculated from the number of duplicated genes belonging to a TAG.}
\label{tab:statistics-results-glycine-max}
\end{table}
\begin{table}[]
\centering
\begin{adjustbox}{angle=90}
\csvreader[
head to column names,
tabular = lllllllllll,
table head = \toprule \bfseries Species & \bfseries Genes & \bfseries Duplicates & \bfseries Singletons & \bfseries Families & \bfseries \makecell{Largest\\Family} & \bfseries TAG\textsubscript{0} & \bfseries TAG\textsubscript{1} & \bfseries \makecell{TAG\textsubscript{0}\\Genes} & \bfseries \makecell{TAG\textsubscript{1}\\Genes} & \bfseries \makecell{Largest\\TAG\textsubscript{0}} \\\midrule,
table foot = \bottomrule,
]{data/concat.csv}{}{%
\slshape\Species & \num[group-separator={,}]{\Genes} & \num[group-separator={,}]{\Duplicates} & \num[group-separator={,}]{\Singletons} & \num[group-separator={,}]{\Families} & \num[group-separator={,}]{\LargestFamily} & \num[group-separator={,}]{\csvcolvii} & \num[group-separator={,}]{\csvcolviii} & \num[group-separator={,}]{\csvcolix} & \num[group-separator={,}]{\csvcolx} & \num[group-separator={,}]{\csvcolxi}
}
\end{adjustbox}
\caption{Count tables for duplicate genes in a selection of thirteen plants, using the low stringency filtering criteria.}
\label{tab:count-table-for-all-plants}
\end{table}
\section{TAG\textsubscript{0} gene pairs orientation concordance is different from the one of other gene pairs}
\Cref{tab:contingency-table-fisher-test-orientation-convergence} reports the orientation concordance counts of TAG and non-TAG gene pairs.
The Fisher exact test on this contingency table reports a $p$-value below $2.2\cdot 10^{-16}$, so at level $\alpha = 0.05$, we reject the null hypothesis: the orientation of gene pairs in TAGs is significantly different from that of non-TAG gene pairs. More specifically, TAG gene pairs are more often coherent than non-TAG gene pairs.
\begin{table}
\centering
\begin{tabular}{llll}
\toprule
& \bfseries Coherent & \bfseries Convergent & \bfseries Divergent \\
\midrule
\bfseries TAG & 3,353 & 342 & 361 \\
\bfseries non-TAG & 25,699 & 12,926 & 12,907 \\
\bottomrule
\end{tabular}
\caption{Contingency table, count of gene pair orientation concordance for TAG pairs or not TAG pairs}
\label{tab:contingency-table-fisher-test-orientation-convergence}
\end{table}
\section{TAG\textsubscript{0} gene pair duplication is more recent than non-TAG duplication}
In \cref{fig:ks-density-tag-and-not-tag}, the distribution of duplication age, as seen through the proxy of substitutions per synonymous site ($K_s$), is depicted. There is a large peak in the distribution of $K_s$ (mode $K_s \approx 0.1$). This corresponds to a secondary burst of gene duplication in the evolution of the species. This is consistent with what was found by Blanc and Wolfe, 2004 \autocite{blancWidespreadPaleopolyploidyModel2004}.
The Wilcoxon-Mann-Whitney U test statistic is $W = 116946624$, $p$-value $< 2.2\cdot 10^{-16} < 0.05$, so at level $\alpha = 5~\%$, we reject the null hypothesis. The TAG gene pair duplication that we still observe in the remaining genes tends to have a lower $K_s$ value than non-TAG duplicate pairs, which means that TAG pair duplication tends to have occurred more recently than non-TAG pairs.
\begin{figure}
\centering
\includegraphics{results/ks_density_tag_and_not_tag.png}
\caption{Distribution of substitutions per synonymous site ($K_s$) for \textit{Glycine max} duplicated gene pairs. The duplicate gene pairs are split into two categories based on whether they belong to a TAG\textsubscript{0} (in blue) or not (in red).}
\label{fig:ks-density-tag-and-not-tag}
\end{figure}
\section{The number of TAGs increases logarithmically with the number of allowed spacers}
\Cref{fig:number-of-tag-in-function-of-tag-definition} depicts the number of TAGs detected in the low stringency dataset for a varying number of allowed spacers.
We obtain a logarithmic growth of the number of TAGs as we increase the number of spacers.
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{results/nb_TAG_against_definition.png}
\caption{Number of TAGs as a function of the TAG definition}
\label{fig:number-of-tag-in-function-of-tag-definition}
\end{figure}
\section{Functional analysis}
On the one hand, with a reference list of 55,853 genes, only 27 of our 43 gene IDs map uniquely to the PANTHER database. However, these 27 genes are ``Unclassified'', like the majority of genes in the reference. 17 gene IDs mapped to multiple entries in the database. This result is the same for the largest TAG\textsubscript{1} of both the low and high stringency datasets.
% TODO : check these counts: 27 uniquely mapped + 17 multiply mapped = 44, which exceeds the 43 submitted gene IDs
On the other hand, with g:Profiler, the largest TAG's genes present an enrichment in function related to cellular respiration and energy metabolism.
\chapter{Discussion}
\section{Amount of paralogs}
We found a proportion of paralogs quite different from that of a previous study performed on \textit{Glycine max} \autocite{blancWidespreadPaleopolyploidyModel2004}. Blanc and Wolfe found that 32\% of genes were duplicated, whereas we estimate this proportion to be more than 80\%. This may be explained by the different criteria used to define homology between two genes. Indeed, Blanc and Wolfe defined duplicate genes based on nucleotide sequence alignments, which is far more stringent than our protein alignment approach.
\section{Orientation of TAG genes}
As previously shown for \textit{Arabidopsis thaliana} \autocite{le-hoangEtudeTranscriptomiqueGenes2017}, the orientation of tandemly arrayed genes is not random: two genes belonging to the same gene family and separated by no spacer are more likely to share the same orientation.
However, this result has to be qualified, because the TAG category only contains pairs of genes of the same family, whereas the non-TAG category contains any pair of adjacent genes. Thus, we could well be measuring an effect of gene family membership instead of TAG membership. In spite of this remark, we may interpret the orientation concordance as the effect of a gene regulation constraint on duplicate genes with no spacer: sharing a similar genetic environment (\textit{cis}-regulatory regions such as enhancers and silencers) might be advantageous to keep a coherent expression pattern for the duplicates. This could be important to keep the same stoichiometry in protein complexes, for instance.
% TODO : discuss all results
% TODO : discuss more specificaly Ks results
\section{Age of gene duplication}
Our analysis of the age of gene duplications in soybean has shown that TAG gene duplicates found in the currently sequenced soybean variety tend to be more recent than non-TAG gene duplicates. The distribution of synonymous mutations per synonymous site showed the presence of two more ancient bursts of gene duplication. These bursts of duplication could be due to ancient whole genome duplication events. It has indeed been shown that several whole genome duplication events occurred in the history of \textit{Glycine max} genome evolution \autocite{stuparInsightsSoybeanGlycine2013}.
% TODO : compare results to literature
% TODO : Compare to other plants
\section{Comparison with other plants}
In all the plants studied, it seems that the larger the genome (i.e., the more genes it contains), the greater the proportion of duplicated genes (and conversely, the smaller the proportion of singletons), although there are exceptions (such as \textit{Oryza sativa}). Our organism, \textit{Glycine max}, has a much larger number of genes in its genome (55,897 genes), while the other genomes have between 22,000 and 40,000 genes; in terms of genome size, \textit{Glycine max} is the most distant from the others. Its proportion of duplicated genes is the highest, around 90\%, while most other plants have between 72\% and 86\% duplicated genes. In terms of the proportion of duplicated genes, \textit{Glycine max} has indeed more duplicate genes than the other plants, but the proportions remain similar, whereas \textit{Oryza sativa} shows a much lower proportion of duplicated genes (65\%), despite being in the upper range for genome size (about 35,000 genes). The number of families is also higher when there are more genes. Thus, \textit{Glycine max} shows a significantly higher number of families. The same applies to the number of TAGs, for different definitions (comparison made for TAG\textsubscript{0} and TAG\textsubscript{1}). However, the size of the largest family in our organism is among the smallest when compared to the other organisms. Additionally, in terms of the proportion of duplicated genes associated with a TAG, \textit{Glycine max} shows lower proportions compared to the other genomes, with 13\% and 16\%, whereas the other genomes average between 14\%-21\% and 18\%-24\% (values for 7 of the other genomes). \textit{Prunus persica} exhibits the highest proportion of duplicated genes in TAGs, yet has one of the smallest numbers of families, duplicated genes, and total genes in the genome. 
To summarize, it seems that \textit{Glycine max} has a large genome, highly redundant (although normal compared to other plants), but with duplicated genes that are very dispersed.
\printbibliography
\end{document}

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 56 KiB

View File

@ -0,0 +1,195 @@
%% bioinfo-report.cls
%% Copyright 2023 Samuel ORTION <samuel+dev@ortion.fr>
%
% This work may be distributed and/or modified under the
% conditions of the LaTeX Project Public License, either version 1.3
% of this license or (at your option) any later version.
% The latest version of this license is in
% http://www.latex-project.org/lppl.txt
% and version 1.3 or later is part of all distributions of LaTeX
% version 2005/12/01 or later.
%
% This work has the LPPL maintenance status `maintained'.
%
% The Current Maintainer of this work is Samuel ORTION <samuel+dev@ortion.fr>.
%
% This work consists of the files bioinfo-report.cls and is part of the Chameleon Press project
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Chameleon Bioinformatics Report LuaLaTeX Class
%
% Author: Samuel ORTION
% Version: v0.0.1
% Created on: 2023-01-06
% Updated on: 2023-01-06
% License: LPPL
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% CLASS OPTIONS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\NeedsTeXFormat{LaTeX2e}
\ProvidesClass{sty/bioinfo-report}[2023-01-06 v0.0.2 Bioinformatics Report LuaLaTeX Class]
\LoadClass{report}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% MISC
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RequirePackage{hyperref}
\RequirePackage{etoolbox}
\RequirePackage{calc}
\RequirePackage{luatextra}
\RequirePackage{pgffor}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% TEXT & FONTS OPTIONS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RequirePackage{fontspec}
\setmainfont{Linux Libertine O}
\RequirePackage{ulem}
\RequirePackage{lettrine}
\RequirePackage{microtype}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COLOR OPTIONS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RequirePackage{xcolor}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% PAGE OPTIONS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RequirePackage{geometry}
\geometry{a4paper, margin=2cm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% SECTION OPTIONS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RequirePackage[explicit]{titlesec}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% FLOAT OPTIONS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\RequirePackage{graphicx}
\RequirePackage{caption}
\RequirePackage{subcaption}
\RequirePackage{float}
\RequirePackage{wrapfig}
\usepackage{url}
\usepackage{array}
\usepackage{xcolor}
\usepackage{hyperref}
\usepackage{afterpage}
\usepackage{lipsum}
\usepackage{sectsty}
\usepackage{tikz}
%
\RequirePackage{csquotes}
\RequirePackage{polyglossia}
\setmainlanguage{french}
\setotherlanguage{english}
\RequirePackage{mathtools}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% CUSTOM COMMANDS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%-------------------------------------------------------------------------
% FLOAT LABELS
%-------------------------------------------------------------------------
\definecolor{UoDBlue}{RGB}{67, 101, 226}
\definecolor{UoDDarkBlue}{RGB}{61, 88, 151}
\definecolor{UoDLightBlue}{RGB}{209,226,242}
\colorlet{colorprimary}{UoDBlue}
\partfont{\color{colorprimary}}
\sectionfont{\color{colorprimary}}
\subsectionfont{\color{colorprimary}}
\subsubsectionfont{\color{colorprimary}}
% Define a font for figure labels
\captionsetup{
figurename=Figure,
tablename=Tableau,
labelfont={bf,color=colorprimary},
}
% Left-align captions (ragged right), including single-line ones
\captionsetup{justification = raggedright,
singlelinecheck = false}
% Declare a command for figure source, in figure environment
\newcommand{\source}[1]{\vspace{-3pt} \caption*{ Source: {#1}} }
\RequirePackage{polyglossia}
\setmainlanguage{french}
\addto\captionsfrench{\renewcommand*{\partname}{Partie}}
\RequirePackage{titlesec}
% \titleclass{\part}{top}
% \titleformat{\part}[display]
% {\huge\bfseries\centering\color{colorprimary}}{Partie~\thepart}{0pt}{}
% \titlespacing*{\part}{0pt}{40pt}{40pt}
% % Set section formatting to "A." (Alph)
% \titleformat{\section}
% {\normalfont\large\bfseries\color{colorprimary}}{\Alph{section}.}{1em}{}
\RequirePackage{minted}
% Default options applied to every minted listing.
\setminted{
% bgcolor=mintedbackground,
  fontfamily=tt,
  linenos=true,
  numberblanklines=true,
  % numbersep was previously given twice (12pt, then 5pt); the last value
  % wins in a key-value list, so only the effective one is kept.
  numbersep=5pt,
  gobble=0,
  frame=leftline,
  framesep=2mm,
  funcnamehighlighting=true,
  tabsize=4,
  obeytabs=false,
  % The comma after mathescape=false was missing, which made
  % "samepage=false" part of the value of mathescape.
  mathescape=false,
  samepage=false,
  showspaces=false,
  showtabs=false,
  texcl=false,
  baselinestretch=1.2,
  breaklines=true,
}
\usemintedstyle{vs}
% Fix the red frame around lines with the Python lexer: inside minted output,
% redefine \fcolorbox so that it typesets only its content, without a frame.
% See https://tex.stackexchange.com/a/401250/
%
% \xpatchcmd comes from the xpatch package; etoolbox (loaded above) only
% provides \patchcmd, so xpatch has to be loaded explicitly.
\RequirePackage{xpatch}
% Locally turn \fcolorbox[<model>]{<frame>}{<background>}{<text>} into <text>.
\newcommand*{\dontdofcolorbox}{\renewcommand\fcolorbox[4][]{##4}}
% Apply the redefinition to the minted environment ...
\AtBeginEnvironment{minted}{\dontdofcolorbox}
% ... and to \inputminted and \mintinline, by hooking right after \minted@fvset.
% The last two (empty) arguments are the success/failure branches of the patch.
\xpatchcmd{\inputminted}{\minted@fvset}{\minted@fvset\dontdofcolorbox}{}{}
\xpatchcmd{\mintinline}{\minted@fvset}{\minted@fvset\dontdofcolorbox}{}{}
\RequirePackage{booktabs}
\hypersetup{
colorlinks,
linkcolor={red!50!black},
citecolor={blue!50!black},
urlcolor={blue!80!black},
backref=page
}
% \hfuzz=11pt
\usepackage[nameinlink]{cleveref}

View File

@ -0,0 +1,97 @@
\RequirePackage[manualmark]{scrlayer-scrpage}
\iffalse
\renewcommand*\chaptermark[1]{%
\markboth{\Ifnumbered{chapter}{\chaptermarkformat}{}}{#1}% <- outdated macro replaced
}
\AfterTOCHead[toc]{\markboth{}{\contentsname}}
\fi
\clearpairofpagestyles
\clubpenalty = 10000
\widowpenalty = 10000
\automark[section]{part}
\setlength{\footheight}{120pt} % avoids scrlayer-scrpage warning:
% "footheight too low"
\setlength{\footskip}{185pt} % BAD HACK that moves the foot downwards
\KOMAoption{footwidth}{foot:53pt} % BAD HACK that moves the foot towards
\setkomafont{pagefoot}{\normalfont\footnotesize}
\setkomafont{pagenumber}{\normalfont \fontfamily{\sfdefault}\selectfont \normalsize \bfseries\color{black}}
% Left running mark for parts: "<part name> <part number>" in bold sans-serif,
% coloured fgBlue, then a \quad and the part title (#1) set in a top-aligned
% minipage of .65\textwidth. The right mark is set to empty.
% NOTE(review): fgBlue is not defined in this file -- confirm it is defined
% elsewhere before this mark is ever typeset.
\renewcommand{\partmark}[1]{%
\markboth{%
% NOTE(review): inherited remark, kept for reference: "use \@chapapp instead
% of \chaptername to avoid 'Chapter A Appendix ...', thanks to @farbverlust
% (issue #47)". It does not describe the code below, which uses \partname.
\fontfamily{\sfdefault}\selectfont
{\color{fgBlue}\textbf{\partname\ \thepart}}%
\quad%
% \begin/\end are \protect-ed inside the mark argument.
\protect\begin{minipage}[t]{.65\textwidth}%
#1%
\protect\end{minipage}%
}{}%
}
% Scratch length: width of the bold section number followed by a \quad.
\newlength{\lensectionnumber}
% Right running mark for sections: bold sans-serif section number, a \quad
% and the section title (#1), set ragged-left in a top-aligned minipage of
% .72\textwidth.
\renewcommand{\sectionmark}[1]{%
\markright{%
\normalsize\fontfamily{\sfdefault}\selectfont\bfseries
% Reset the scratch length, then measure "<number>\quad"; the result is used
% below as the hanging indent.
\setlength{\lensectionnumber}{0em}
\settowidth{\lensectionnumber}{\textbf{\thesection}\quad}
\protect\begin{minipage}[t]{.72\textwidth}%
{\ }% bad hack to prevent a wrong baseline for the minipage
\protect\raggedleft%
\hangindent=\lensectionnumber%
{\color{black}\textbf{\fontfamily{\sfdefault}\selectfont\thesection}}%
\quad%
#1%
\protect\end{minipage}%
}%
}
% Black vertical rule placed next to the page number:
% 1.25pt wide, 100pt high, lowered by 90pt relative to the baseline.
\newcommand{\ctfooterline}{%
  \color{black}%
  \rule[-90pt]{1.25pt}{100pt}%
}
%% Odd (right-hand) pages: rule first, then a 10pt gap, then the page number
%% in a bottom-aligned box of 1.5cm.
\newcommand{\ctfooterrightpagenumber}{%
  \ctfooterline%
  \hspace*{10pt}%
  \begin{minipage}[b]{1.5cm}%
    \pagemark\ %
  \end{minipage}%
}
%% Even (left-hand) pages: mirror image of the odd-page layout, i.e. the
%% right-aligned page number in a 1.5cm box, a 10pt gap, then the rule.
\newcommand{\ctfooterleftpagenumber}{%
  \begin{minipage}[b]{1.5cm}%
    \raggedleft\pagemark%
  \end{minipage}%
  \hspace*{10pt}%
  \ctfooterline%
}
%% Content of the header and footer fields.
%% The header stays empty on even and odd pages.
\lehead{}\cehead{}\rehead{}
\lohead{}\cohead{}\rohead{}
%% Footer, even pages: page number block in the left field. The optional
%% argument is used for the plain page style, the mandatory one for
%% scrheadings (the \headmark that used to follow the spacer is disabled).
\lefoot[\ctfooterleftpagenumber]{%
  \ctfooterleftpagenumber%
  \hspace*{0.75cm}%
}
\cefoot{}
\refoot{}
%% Footer, odd pages: page number block in the right field, same split
%% between plain (optional) and scrheadings (mandatory, \headmark disabled).
\lofoot{}
\cofoot{}
\rofoot[\ctfooterrightpagenumber]{%
  \hspace*{0.75cm}%
  \ctfooterrightpagenumber%
}

View File

@ -0,0 +1,7 @@
% Colour shared by all heading numbers.
\colorlet{headingcolor}{black}
% Each heading number is set with \llap, i.e. it sticks out to the left of
% the heading text, separated from it by 1em.
\renewcommand*{\chapterformat}{%
  \llap{\textcolor{headingcolor}{\thechapter}\hspace{1em}}%
}
\renewcommand*{\sectionformat}{%
  \llap{\textcolor{headingcolor}{\thesection}\hspace{1em}}%
}
\renewcommand*{\subsectionformat}{%
  \llap{\textcolor{headingcolor}{\thesubsection}\hspace{1em}}%
}

148
docs/report/sty/style.sty Normal file
View File

@ -0,0 +1,148 @@
% LaTeX style for my internship report
% Graphics, colour, captions and float placement
\RequirePackage{graphicx}
\RequirePackage{xcolor}
\RequirePackage{subcaption}
\RequirePackage{caption}
\RequirePackage{float}
% Coloured tables
\RequirePackage{colortbl}
% Bibliography, with the heading renamed to "References"
\RequirePackage{biblatex}
\renewcommand{\bibname}{References}
% Colour: institutional blue, also available under the generic name maincolor
\definecolor{ueve-blue}{HTML}{005595}
\colorlet{maincolor}{ueve-blue}
% Font (fontspec: requires XeLaTeX or LuaLaTeX)
\RequirePackage{fontspec}
\setmainfont{TeX Gyre Termes} % Times New Roman alternative
% NOTE(review): these default features are declared after \setmainfont;
% confirm whether they are meant to apply to the main font as well.
\defaultfontfeatures[\rmfamily,\sffamily]{Ligatures=TeX}
\RequirePackage{setspace}
\singlespacing % single line spacing
\RequirePackage{epigraph}
\RequirePackage[toc]{appendix} % add the 'page' option if you want a separating page with 'Appendices' centered on it.
% Appendices are called "Annex" everywhere (heading, TOC entry, separator page)
\renewcommand{\appendixname}{Annex}
\renewcommand{\appendixtocname}{Annex}
\renewcommand{\appendixpagename}{Annex}
% Page footer and heading layout of this report
\RequirePackage{sty/cleanthesis-footer}
\RequirePackage{sty/scr-legrand-heading}
% Link colours derived from the "Prune" base colour
\definecolor{Prune}{RGB}{99,0,60}
\colorlet{PruneLink}{Prune!70!blue}
\colorlet{PruneCite}{Prune!60!red}
\RequirePackage{lettrine}
\RequirePackage{csquotes}
% Load order matters from here on:
%   1. amsmath (pulled in by mathtools) has to be loaded before hyperref and
%      cleveref, so it is loaded here rather than only further down;
%   2. glossaries(-extra) has to come after hyperref, otherwise glossary
%      entries are not turned into hyperlinks;
%   3. cleveref comes after hyperref.
\RequirePackage{mathtools}
\RequirePackage{hyperref}
\RequirePackage[
abbreviations, % create "abbreviations" glossary
%nomain, % don't create "main" glossary
stylemods=longbooktabs, % do the adjustments for the longbooktabs styles,
automake
]{glossaries-extra}
\setabbreviationstyle[acronym]{long-short}
\RequirePackage[noabbrev,nameinlink]{cleveref}
% PDF bookmarks and link appearance
\hypersetup{
bookmarksnumbered=true,
bookmarksopen=true,
unicode=true,
colorlinks=true,
linktoc=all, %linktoc=page
linkcolor=PruneLink,
citecolor=PruneCite,
filecolor=PruneLink,
urlcolor=PruneLink,
anchorcolor=PruneLink,
pdfstartview=FitH,
pdfencoding=auto % avoid encoding problems in PDF bookmarks (French)
}
\RequirePackage{minted}
% Global defaults for all minted listings.
% Fixes relative to the previous version:
%   - a comma was missing after mathescape=false, so the following
%     samepage=false was swallowed into the value of mathescape instead of
%     being processed as its own option;
%   - numbersep was given twice (12pt, then 5pt); only the later 5pt took
%     effect, so the dead 12pt entry is dropped.
\setminted{
  % bgcolor=mintedbackground,
  fontfamily=tt,
  linenos=true,
  numberblanklines=true,
  numbersep=5pt,
  gobble=0,
  frame=leftline,
  framesep=2mm,
  funcnamehighlighting=true,
  tabsize=4,
  obeytabs=false,
  mathescape=false,
  samepage=false,
  showspaces=false,
  showtabs=false,
  texcl=false,
  baselinestretch=1.2,
  breaklines=true,
}
\usemintedstyle{vs}
\RequirePackage{etoolbox,xpatch}
% Inside minted output, \fcolorbox is redefined to typeset only its content
% argument, so no coloured frame is drawn around highlighted tokens.
% (The hook registration and the helper definition used to be duplicated
% verbatim; one copy of each is enough.)
\def\dontdofcolorbox{\renewcommand\fcolorbox[4][]{##4}}
\AtBeginEnvironment{minted}{\dontdofcolorbox}
% \inputminted and \mintinline do not go through the minted environment, so
% the helper is patched in right after their \minted@fvset call.
\xpatchcmd{\inputminted}{\minted@fvset}{\minted@fvset\dontdofcolorbox}{}{}
\xpatchcmd{\mintinline}{\minted@fvset}{\minted@fvset\dontdofcolorbox}{}{} % see https://tex.stackexchange.com/a/401250/
\RequirePackage{verbments}
\RequirePackage{mathtools}
\RequirePackage{amsmath}
% Sectioning: parts are numbered with arabic digits, the part number is
% framed by two adforn ornaments, and part pages use the empty page style.
\RequirePackage{adforn}
\renewcommand*{\thepart}{\arabic{part}}
\renewcommand*{\partformat}{%
  \adforn{21}~\thepart~\adforn{49}%
}
\renewcommand*{\partpagestyle}{empty}
% \decoratedpage: ornamental separator built on \part.
% Within a group, \partformat is replaced by three ornaments (no number),
% an empty \part is issued, and the part counter is then decreased by one
% again so that the numbering of the real parts is left untouched.
\newcommand*{\decoratedpage}{%
  {%
    \renewcommand*{\partformat}{\adforn{21}~\adforn{11}~\adforn{49}}
    \part{}
    \addtocounter{part}{-1}%
  }%
}
% relaxclearpage: inside this environment \clearpage does nothing.
% The original meaning is saved in \LaTeXStandardClearpage at the start and
% restored at the end.
% Line ends inside the definition are protected with %, so that neither the
% begin code nor the end code inserts a stray space token when the
% environment is used in running text.
\newenvironment{relaxclearpage}{%
  \let\LaTeXStandardClearpage\clearpage
  \let\clearpage\relax % Do nothing when a \clearpage command appears
}{%
  \let\clearpage\LaTeXStandardClearpage % Return to the old definition
}
% Tables, CSV import and number formatting
\RequirePackage{tabularray}
\RequirePackage[l3]{csvsimple}
\RequirePackage[group-minimum-digits=4]{siunitx}
\RequirePackage{adjustbox}
\RequirePackage{makecell}
% Line-breaking parameters: \pretolerance applies to TeX's first pass
% (no hyphenation), \tolerance to the pass with hyphenation; no emergency
% stretch is added.
\pretolerance=5000
\tolerance=9000
\emergencystretch=0pt
% Landscape pages and multi-column text
\RequirePackage{pdflscape}
\RequirePackage{multicol}