Add a figure from Wikimedia that depicts duplicate genes fate

2024-04-18 18:22:59 +02:00 · 2024-04-18 18:22:59 +02:00 · 1d3803d6ea
parent b441d29ba8
commit 1d3803d6ea
6 changed files with 5404 additions and 20 deletions
--- a/2
+++ b/2
@ -1,4 +1,4 @@
-OPTIONS=-shell-escape -file-line-error -synctex=1 -interaction=batchmode
+OPTIONS=-shell-escape -file-line-error -synctex=1
 SOURCE=report
 all: latexmk
 debug: 
--- a/figures/Evolution_fate_duplicate_genes.pdf
+++ b/figures/Evolution_fate_duplicate_genes.pdf
--- a/figures/Evolution_fate_duplicate_genes.svg
+++ b/figures/Evolution_fate_duplicate_genes.svg
--- a/references.bib
+++ b/references.bib
@ -33,6 +33,19 @@
  langid = {english}
 }

+@article{assisModelsRetentionDuplicate2024,
+  author       = {Assis, Raquel and Conant, Gavin and Holland, Barbara and Liberles, David and O'Reilly, Małgorzata and Wilson, Amanda},
+  title        = {Models for the Retention of Duplicate Genes and Their Biological Underpinnings},
+  journaltitle = {F1000Research},
+  shortjournal = {F1000Research},
+  date         = {2024-02-12},
+  volume       = {12},
+  pages        = {1400},
+  doi          = {10.12688/f1000research.141786.2},
+  keywords     = {2read},
+  abstract     = {Gene content in genomes changes through several different processes, with gene duplication being an important contributor to such changes. Gene duplication occurs over a range of scales from individual genes to whole genomes, and the dynamics of this process can be context dependent. Still, there are rules by which genes are retained or lost from genomes after duplication, and probabilistic modeling has enabled characterization of these rules, including their context-dependence. Here, we describe the biology and corresponding mathematical models that are used to understand duplicate gene retention and its contribution to the set of biochemical functions encoded in a genome.}
+}
+
@article{beallIdentificationAnalysisHyperactive2002,
  title = {Identification and {{Analysis}} of a {{Hyperactive Mutant Form}} of {{Drosophila P-Element Transposase}}},
  author = {Beall, Eileen L and Mahoney, Matthew B and Rio, Donald C},
@ -49,6 +62,24 @@
  abstract = {Transposition in many organisms is regulated to control the frequency of DNA damage caused by the DNA breakage and joining reactions. However, genetic studies in prokaryotic systems have led to the isolation of mutant transposase proteins with higher or novel activities compared to those of the wild-type protein. In the course of our study of the effects of mutating potential ATM-family DNA damage checkpoint protein kinase sites in the Drosophila P-element transposase protein, we found one mutation, S129A, that resulted in an elevated level of transposase activity using in vivo recombination assays, including P-element-mediated germline transformation. In vitro assays for P-element transposase activity indicate that the S129A mutant exhibits elevated donor DNA cleavage activity when compared to the wild-type protein, whereas the strand-transfer activity is similar to that of wild type. This difference may reflect the nature of the in vitro assays and that normally in vivo the two reactions may proceed in concert. The P-element transposase protein contains 10 potential consensus phosphorylation sites for the ATM family of PI3-related protein kinases. Of these 10 sites, 8 affect transposase activity either positively or negatively when substituted individually with alanine and tested in vivo. A mutant transposase protein that contains all eight N-terminal serine and threonine residues substituted with alanine is inactive and can be restored to full activity by substitution of wild-type amino acids back at only 3 of the 8 positions. These data suggest that the activity of P-element transposase may be regulated by phosphorylation and demonstrate that one mutation, S129A, results in hyperactive transposition.}
 }

+@article{berthelotRainbowTroutGenome2014,
+  author       = {Berthelot, Camille and Brunet, Frédéric and Chalopin, Domitille and Juanchich, Amélie and Bernard, Maria and Noël, Benjamin and Bento, Pascal and Da Silva, Corinne and Labadie, Karine and Alberti, Adriana and Aury, Jean-Marc and Louis, Alexandra and Dehais, Patrice and Bardou, Philippe and Montfort, Jérôme and Klopp, Christophe and Cabau, Cédric and Gaspin, Christine and Thorgaard, Gary H. and Boussaha, Mekki and Quillet, Edwige and Guyomard, René and Galiana, Delphine and Bobe, Julien and Volff, Jean-Nicolas and Genêt, Carine and Wincker, Patrick and Jaillon, Olivier and Roest Crollius, Hugues and Guiguen, Yann},
+  title        = {The Rainbow Trout Genome Provides Novel Insights into Evolution after Whole-Genome Duplication in Vertebrates},
+  journaltitle = {Nature Communications},
+  shortjournal = {Nat Commun},
+  date         = {2014-04-22},
+  volume       = {5},
+  pages        = {3657},
+  issn         = {2041-1723},
+  doi          = {10.1038/ncomms4657},
+  eprint       = {24755649},
+  eprinttype   = {pmid},
+  pmcid        = {PMC4071752},
+  langid       = {english},
+  keywords     = {2read,Animals,Evolution Molecular,Gene Duplication,Oncorhynchus mykiss,Vertebrates},
+  abstract     = {Vertebrate evolution has been shaped by several rounds of whole-genome duplications (WGDs) that are often suggested to be associated with adaptive radiations and evolutionary innovations. Due to an additional round of WGD, the rainbow trout genome offers a unique opportunity to investigate the early evolutionary fate of a duplicated vertebrate genome. Here we show that after 100 million years of evolution the two ancestral subgenomes have remained extremely collinear, despite the loss of half of the duplicated protein-coding genes, mostly through pseudogenization. In striking contrast is the fate of miRNA genes that have almost all been retained as duplicated copies. The slow and stepwise rediploidization process characterized here challenges the current hypothesis that WGD is followed by massive and rapid genomic reorganizations and gene deletions.}
+}
+
@article{blankenbergGalaxyWebbasedGenome2010,
  title = {Galaxy: A Web-Based Genome Analysis Tool for Experimentalists},
  shorttitle = {Galaxy},
@ -93,6 +124,49 @@
  keywords = {Computational biology and bioinformatics,Genome informatics,Genomic analysis,Sequencing,Software}
 }

+@unpublished{caronCyberGalaxy2013,
+  title = {Towards a Cyber {{Galaxy}} ?},
+  author = {Caron, Christophe C. and Carré, Wilfrid and Cormier, Alexandre and Derozier, Sandra S. and Giacomoni, Franck and Inizan, Olivier and Le Corguillé, Gildas and Lermine, Alban and Maman Haddad, Sarah and Pericard, Pierre and Samson, Franck F.},
+  date = {2013-07},
+  series = {{{JOBIM TOULOUSE}} 2013 - {{RÉSUMÉS COURTS}} (Affiches)},
+  pages = {246},
+  url = {https://hal.inrae.fr/hal-02748994},
+  urldate = {2024-04-09},
+  abstract = {The success of the open web based platform “Galaxy” is growing among diverse scientific communities. The French Institute of Bioinformatics - IFB wish to initiate a collaborative work dedicated to scientific workflows and especially to the platform Galaxy. We report here the main items on which future collaborations could be build: (i) software and hardware architecture, (ii) tools integration and (iii) training.},
+  keywords = {formation,galaxy,intégration d'outils,NGS,partage de données,workflow},
+  annotation = {Published: JOBIM 2013}
+}
+
+@unpublished{caronFrenchCyberGalaxy2013,
+  author     = {Caron, Christophe C. and Carré, Wilfrid and Cormier, Alexandre and Derozier, Sandra S. and Giacomoni, Franck and Inizan, Olivier and Le Corguillé, Gildas and Lermine, Alban and Maman Haddad, Sarah and Pericard, Pierre and Samson, Franck F.},
+  title      = {Toward a {{French}} Cyber {{Galaxy}} ?},
+  series     = {Galaxy {{Community Conference}} 2013 : {{Posters}} / {{Abstracts}}},
+  date       = {2013-06},
+  pages      = {online},
+  url        = {https://hal.inrae.fr/hal-02748274},
+  urldate    = {2024-04-09},
+  keywords   = {data sharing,Galaxy,NGS,tools integration,training,workflow},
+  annotation = {Published: Galaxy Community Conference},
+  abstract   = {The success of the open web based platform “Galaxy” is growing among scientific communities. The French Institute of Bioinformatics (IFB) wishes to initiate a collaborative work dedicated to scientific workflows and especially to the Galaxy platform. We report here the main items on which future collaborations could be build: (i) software and hardware architecture, (ii) tools integration and (iii) training. High throughput technologies advent significantly alters analysis behaviour and strategy with mobilization of new infrastructure, new tools and new skills. IFB decided to conduct a cross action on "workflows" data analysis solutions, and especially on the Galaxy platform. The first item called "software and hardware architecture" addresses the operational issues in production environments, the potential for automating deployment tasks and the monitoring solutions for Galaxy servers. With the second one, "Tools integration" we aim to provide processes facilitating tool interfacing in a Galaxy instance. Priority will be the development of a good practice guide, as well as a technology watch around the methods proposed by the international community. We also want to promote the sharing of training activities at national level (such as the Aviesan Bioinformatics school, January 2013 - http://galaxy-ecole.sb-roscoff.fr/) and ensure a smooth transition to new uses, such as E-learning. A first working group is already effective. Previous items will be improved in the coming months thanks to a specific dedicated wiki and the first French Galaxy Workshop this autumn.}
+}
+
+@article{casneufNonrandomDivergenceGene2006,
+  title = {Nonrandom Divergence of Gene Expression Following Gene and Genome Duplications in the Flowering Plant {{Arabidopsis thaliana}}},
+  author = {Casneuf, Tineke and De Bodt, Stefanie and Raes, Jeroen and Maere, Steven and Van de Peer, Yves},
+  date = {2006-02-20},
+  journaltitle = {Genome Biology},
+  shortjournal = {Genome Biology},
+  volume = {7},
+  number = {2},
+  pages = {R13},
+  issn = {1474-760X},
+  doi = {10.1186/gb-2006-7-2-r13},
+  url = {https://doi.org/10.1186/gb-2006-7-2-r13},
+  urldate = {2024-04-13},
+  abstract = {Genome analyses have revealed that gene duplication in plants is rampant. Furthermore, many of the duplicated genes seem to have been created through ancient genome-wide duplication events. Recently, we have shown that gene loss is strikingly different for large- and small-scale duplication events and highly biased towards the functional class to which a gene belongs. Here, we study the expression divergence of genes that were created during large- and small-scale gene duplication events by means of microarray data and investigate both the influence of the origin (mode of duplication) and the function of the duplicated genes on expression divergence.},
+  keywords = {2read,Additional Data File,Anchor Point,Duplicate Gene,Duplication Event,Expression Divergence}
+}
+
@report{charlesFinalisationPipelineFTAG2023,
  type = {Internship Report},
  title = {Finalisation du pipeline FTAG (Families and TAG) Finder, un outil de détection des gènes dupliqués sous Galaxy},
@ -153,6 +227,30 @@
  langid = {english}
 }

+@article{denoeudAnalyseGenomesRecherche,
+  title = {Analyse des génomes à la recherche de répétitions en tandem polymorphes: outils d’épidémiologie bactérienne et locus hypermutables humains},
+  author = {Denoeud, France},
+  langid = {french}
+}
+
+@article{desponsTandemGeneArrays2011,
+  title = {Tandem Gene Arrays, Plastic Chromosomal Organizations},
+  author = {Despons, Laurence and Uzunov, Zlatyo and Leh Louis, Véronique},
+  date = {2011-08-01},
+  journaltitle = {Comptes Rendus Biologies},
+  shortjournal = {Comptes Rendus Biologies},
+  series = {Ten Years of Genomic Exploration in Eukaryotes : Strategy and Progress of {{Genolevures}}},
+  volume = {334},
+  number = {8},
+  pages = {639--646},
+  issn = {1631-0691},
+  doi = {10.1016/j.crvi.2011.05.012},
+  url = {https://www.sciencedirect.com/science/article/pii/S1631069111001454},
+  urldate = {2024-04-09},
+  abstract = {This short article presents an overview of tandem gene arrays (TGAs) in hemiascomycete yeasts. In silico and in vivo analyses are combined to address structural, functional and evolutionary aspects of these particular chromosomal structures. Genomic instability of TGAs is discussed. We conclude that TGAs are generally dynamic regions of the genome in that they are the seats of chromosomal rearrangement events. In addition, they are often breeding grounds of new genes for a rapid adaptation of cells to demands of the environment. Résumé Ce court article présente une vue d’ensemble des tandems de gènes chez les levures hémiascomycètes. Des analyses in silico et in vivo ont été combinées pour aborder les aspects structuraux, fonctionnels et évolutifs de ces structures chromosomiques particulières. L’instabilité génomique des tandems de gènes est discutée. Nous concluons que les tandems de gènes sont généralement des régions dynamiques du génome car ils sont le siège d’événements de réarrangements chromosomiques. De surcroît, ils sont souvent des zones de reproduction de nouveaux gènes pour une adaptation rapide des cellules aux demandes de l’environnement.},
+  keywords = {Chromosomal rearrangements,Duplication de gènes en tandem,Evolution,Évolution,Levure,Réarrangements chromosomiques,Tandem gene duplication,Yeast}
+}
+
@article{ditommasoNextflowEnablesReproducible2017,
  title = {Nextflow Enables Reproducible Computational Workflows},
  author = {Di Tommaso, Paolo and Chatzou, Maria and Floden, Evan W and Barja, Pablo Prieto and Palumbo, Emilio and Notredame, Cedric},
@ -204,14 +302,20 @@
  langid = {english}
 }

-@online{DupliquerPourAdapter2020,
-  title = {Dupliquer pour s’adapter ou comment accélérer l’évolution des plantes ? | CNRS Biologie},
-  shorttitle = {Dupliquer pour s’adapter ou comment accélérer l’évolution des plantes ?},
-  date = {2020-10-14},
-  url = {https://www.insb.cnrs.fr/fr/cnrsinfo/dupliquer-pour-sadapter-ou-comment-accelerer-levolution-des-plantes},
-  urldate = {2024-03-25},
-  abstract = {Les duplications de portions de chromosomes permettant aux organismes de dupliquer des gènes existants et d’en créer de nouveaux sont bien},
-  langid = {french}
+@article{duarteExpressionPatternShifts2006,
+  author       = {Duarte, Jill M. and Cui, Liying and Wall, P. Kerr and Zhang, Qing and Zhang, Xiaohong and Leebens-Mack, Jim and Ma, Hong and Altman, Naomi and {dePamphilis}, Claude W.},
+  title        = {Expression {{Pattern Shifts Following Duplication Indicative}} of {{Subfunctionalization}} and {{Neofunctionalization}} in {{Regulatory Genes}} of {{Arabidopsis}}},
+  journaltitle = {Molecular Biology and Evolution},
+  shortjournal = {Molecular Biology and Evolution},
+  date         = {2006-02-01},
+  volume       = {23},
+  number       = {2},
+  pages        = {469--478},
+  issn         = {0737-4038},
+  doi          = {10.1093/molbev/msj051},
+  url          = {https://doi.org/10.1093/molbev/msj051},
+  urldate      = {2024-04-14},
+  abstract     = {Gene duplication plays an important role in the evolution of diversity and novel function and is especially prevalent in the nuclear genomes of flowering plants. Duplicate genes may be maintained through subfunctionalization and neofunctionalization at the level of expression or coding sequence. In order to test the hypothesis that duplicated regulatory genes will be differentially expressed in a specific manner indicative of regulatory subfunctionalization and/or neofunctionalization, we examined expression pattern shifts in duplicated regulatory genes in Arabidopsis. A two-way analysis of variance was performed on expression data for 280 phylogenetically identified paralogous pairs. Expression data were extracted from global expression profiles for wild-type root, stem, leaf, developing inflorescence, nearly mature flower buds, and seedpod. Gene, organ, and gene by organ interaction (G × O) effects were examined. Results indicate that 85\% of the paralogous pairs exhibited a significant G × O effect indicative of regulatory subfunctionalization and/or neofunctionalization. A significant G × O effect was associated with complementary expression patterns in 45\% of pairwise comparisons. No association was detected between a G × O effect and a relaxed evolutionary constraint as detected by the ratio of nonsynonymous to synonymous substitutions. Ancestral gene expression patterns inferred across a Type II MADS-box gene phylogeny suggest several cases of regulatory neofunctionalization and organ-specific nonfunctionalization. Complete linkage clustering of gene expression levels across organs suggests that regulatory modules for each organ are independent or ancestral genes had limited expression. We propose a new classification, regulatory hypofunctionalization, for an overall decrease in expression level in one member of a paralogous pair while still having a significant G × O effect. 
We conclude that expression divergence specifically indicative of subfunctionalization and/or neofunctionalization contributes to the maintenance of most if not all duplicated regulatory genes in Arabidopsis and hypothesize that this results in increasing expression diversity or specificity of regulatory genes after each round of duplication.}
 }

@article{emmsOrthoFinderPhylogeneticOrthology2019,
@ -250,6 +354,17 @@
  keywords = {Blast Score,Gene Length,Phylogenetic Distance,Sequence Similarity Score,Transcription Factor Gene Family}
 }

+@video{evry-senartsciencesetinnovationCareneRizzonUEVE2014,
+  entrysubtype = {video},
+  title = {Carène {{Rizzon}} ({{UEVE}}) - {{Etude}} de l’évolution des gènes dupliqués},
+  editor = {{Evry-Sénart Sciences et Innovation}},
+  editortype = {director},
+  date = {2014},
+  url = {https://www.youtube.com/watch?v=ubiOE7w3374},
+  urldate = {2024-04-10},
+  abstract = {Colloque ESI 2014 "Evry Bio \& Evry STIC" organisé par Evry Sciences et Innovation le 30 avril 2014 à Evry.  Intervention de Carène Rizzon de l'Université d’Évry: "Étude de l’évolution des gènes dupliqués chez Arabidopsis thaliana via les réseaux biologiques."},
+  langid = {french}
+}
+
@article{gautRecombinationUnderappreciatedFactor2007,
  title = {Recombination: An Underappreciated Factor in the Evolution of Plant Genomes},
  shorttitle = {Recombination},
@ -322,12 +437,63 @@
  keywords = {Aegilops,molecular evolution,plasmon and B genome inheritance,Triticum,wheat}
 }

+@article{grahamTandemGenesClustered1995,
+  author       = {Graham, Geoffrey J.},
+  title        = {Tandem Genes and Clustered Genes},
+  journaltitle = {Journal of Theoretical Biology},
+  shortjournal = {Journal of Theoretical Biology},
+  date         = {1995-07-07},
+  volume       = {175},
+  number       = {1},
+  pages        = {71--87},
+  issn         = {0022-5193},
+  doi          = {10.1006/jtbi.1995.0122},
+  url          = {https://www.sciencedirect.com/science/article/pii/S0022519385701221},
+  urldate      = {2024-04-09},
+  abstract     = {Two patterns of gene repetition are described: tandem arraying and clustering. Tandemly arrayed genes reside within segments of DNA that are repeated head-to-tail a number of times. Clustered genes are linked but irregularly spaced, are often mutually inverted in an unpredictable pattern and are connected by non-conserved DNA. Tandem arrays are homogenized by both unequal recombination and gene conversion, are necessary for the maintenance of large gene families, can expand and contract rapidly in response to changing demand, can keep functionally related genes equal in number, and do not engender increased genetic complexity. Gene clusters are homogenized by conversion only, seldom if ever contain more than 50 members, are stable in number, and often engender increased genetic complexity. The interrelationships among these properties are discussed. Tandem gene arrays can evolve into gene clusters. It is suggested that this occurs when some change in the array inhibits unequal recombination but not gene conversion. The most common such change is inversion of part of the tandem array with respect to the rest; however, arrays can evolve into clusters without inversion. Clustered genes are sometimes re-amplified into new tandem arrays. Clustered genes are probably more durable than tandemly arrayed genes during periods of relaxed selection, and in the case of fish antifreeze protein genes, seem to behave as a genetic memory.}
+}
+
+@article{hanadaImportanceLineagespecificExpansion2008,
+  author       = {Hanada, Kousuke and Zou, Cheng and Lehti-Shiu, Melissa D. and Shinozaki, Kazuo and Shiu, Shin-Han},
+  title        = {Importance of Lineage-Specific Expansion of Plant Tandem Duplicates in the Adaptive Response to Environmental Stimuli},
+  journaltitle = {Plant Physiology},
+  shortjournal = {Plant Physiol},
+  date         = {2008-10},
+  volume       = {148},
+  number       = {2},
+  pages        = {993--1003},
+  issn         = {0032-0889},
+  doi          = {10.1104/pp.108.122457},
+  eprint       = {18715958},
+  eprinttype   = {pmid},
+  pmcid        = {PMC2556807},
+  langid       = {english},
+  keywords     = {Adaptation Biological,Arabidopsis,Evolution Molecular,Gene Duplication,Genes Duplicate,Genes Plant,Genome Plant,Multigene Family,Oligonucleotide Array Sequence Analysis,Phylogeny},
+  abstract     = {Plants have substantially higher gene duplication rates compared with most other eukaryotes. These plant gene duplicates are mostly derived from whole genome and/or tandem duplications. Earlier studies have shown that a large number of duplicate genes are retained over a long evolutionary time, and there is a clear functional bias in retention. However, the influence of duplication mechanism, particularly tandem duplication, on duplicate retention has not been thoroughly investigated. We have defined orthologous groups (OGs) between Arabidopsis (Arabidopsis thaliana) and three other land plants to examine the functional bias of retained duplicate genes during vascular plant evolution. Based on analysis of Gene Ontology categories, it is clear that genes in OGs that expanded via tandem duplication tend to be involved in responses to environmental stimuli, while those that expanded via nontandem mechanisms tend to have intracellular regulatory roles. Using Arabidopsis stress expression data, we further demonstrated that tandem duplicates in expanded OGs are significantly enriched in genes that are up-regulated by biotic stress conditions. In addition, tandem duplication of genes in an OG tends to be highly asymmetric. That is, expansion of OGs with tandem genes in one organismal lineage tends to be coupled with losses in the other. This is consistent with the notion that these tandem genes have experienced lineage-specific selection. In contrast, OGs with genes duplicated via nontandem mechanisms tend to experience convergent expansion, in which similar numbers of genes are gained in parallel. Our study demonstrates that the expansion of gene families and the retention of duplicates in plants exhibit substantial functional biases that are strongly influenced by the mechanism of duplication. 
In particular, genes involved in stress responses have an elevated probability of retention in a single-lineage fashion following tandem duplication, suggesting that these tandem duplicates are likely important for adaptive evolution to rapidly changing environments.}
+}
+
@online{HomeCromwell,
  title = {Home - {{Cromwell}}},
  url = {https://cromwell.readthedocs.io/en/stable/},
  urldate = {2024-03-27}
 }

+@online{HttpsMicansOrg,
+  title = {Using {{MCL}} to Extract Clusters from Networks},
+  author = {van Dongen, Stijn and Abreu-Goodger, Cei},
+  date = {2012},
+  url = {https://micans.org/mcl/lit/mimb.pdf},
+  urldate = {2024-04-11}
+}
+
+@report{jasminStudyTandemlyArrayed2016,
+  type = {Internship Report},
+  title = {Study of Tandemly Arrayed Genes Expression for {{Arabidopsis thaliana}}},
+  author = {Jasmin, Fabien},
+  date = {2016-06},
+  institution = {Laboratoire de Mathématiques et Modélisation d'Évry},
+  abstract = {Tandemly arrayed genes, also called TAGs, are duplicated genes which come from tandem arrayed duplication. They can be separated or not by few genes called spacers. Although duplicated genes are commonly studied, TAGs features remain little known. In this study, I performed a statistical analysis of Arabidopsis thaliana TAGs using genomic and transcriptomic data of high quality providing from TAIR database and CATdb. After merging the different data and assessing it, I observed the distribution of the different size of TAG and the behaviour of TAGs depending on the number of spacers that I made vary from 0 to 10 in my survey. I equally defined different list of gene pairs to easily compare TAGs to other type of genes. In all 5 lists have been defined during my investigation. The defined lists are random genes pairs list, duplicated genes pairs list, successive genes pairs list, local genes pairs list and TAGs pairs list. After creating all lists previously defined, I made gene pairs lists comparisons between TAGs pair list and the other type of gene pairs list according to different features such as the effect of abiotic or biotic stress conditions, the genes orientation, or the correlation of the expression profiles.},
+  langid = {english}
+}
+
@video{javiernovoDuplicationGenes2015,
  entrysubtype = {video},
  title = {Duplication of Genes},
@ -390,6 +556,22 @@
  pubstate = {preprint}
 }

+@inproceedings{lajoieEvolutionTandemlyArrayed2007,
+  author    = {Lajoie, Mathieu and Bertrand, Denis and El-Mabrouk, Nadia},
+  title     = {Evolution of {{Tandemly Arrayed Genes}} in {{Multiple Species}}},
+  booktitle = {Comparative {{Genomics}}},
+  editor    = {Tesler, Glenn and Durand, Dannie},
+  publisher = {Springer},
+  location  = {Berlin, Heidelberg},
+  date      = {2007},
+  pages     = {96--109},
+  isbn      = {978-3-540-74960-8},
+  doi       = {10.1007/978-3-540-74960-8_8},
+  langid    = {english},
+  keywords  = {Ancestral Genome,Gene Order,Gene Tree,Inversion Event,Tandem Duplication},
+  abstract  = {Tandemly arrayed genes (TAG) constitute a large fraction of most genomes and play important biological roles. They evolve through unequal recombination, which places duplicated genes next to the original ones (tandem duplications). Many algorithms have been proposed to infer a tandem duplication history for a TAG cluster in a single species. However, the presence of different transcriptional orientations in most TAG clusters highlight the fact that processes such as inversions also contribute to their evolution. This makes those algorithms unsuitable in many cases. To circumvent this limitation, we proposed in a previous work an extended evolutionary model which includes inversions and presented a branch-and-bound algorithm allowing to infer a most parsimonious scenario of evolution for a given TAG cluster. Here, we generalize this model to multiple species and present a general framework to infer ancestral gene orders that minimize the number of inversions in the whole evolutionary history. An application on a pair of human-rat TAG clusters is presented.}
+}
+
@thesis{lallemandEvolutionGenesDupliques2022,
  type = {phdthesis},
  title = {Évolution des gènes dupliqués chez le pommier : Identification et caractérisation de la dominance du sous-génome dans le génome de la pomme},
@ -438,6 +620,13 @@
  langid = {english}
 }

+@article{landes-devauchelleArtResumerPour,
+  author = {Landès-Devauchelle, Claudine},
+  title  = {De l’art de résumer pour tenter de comprendre en génomique évolutive},
+  langid = {french},
+  url    = {http://www.math-evry.cnrs.fr/_media/publications/devauchelle_hdr_2011.pdf}
+}
+
@article{lannesDoesPresenceTransposable2019,
  title = {Does the {{Presence}} of {{Transposable Elements Impact}} the {{Epigenetic Environment}} of {{Human Duplicated Genes}}?},
  author = {Lannes, Romain and Rizzon, Carène and Lerat, Emmanuelle},
@ -457,6 +646,12 @@
  pmcid = {PMC6470583}
 }

+@report{le-hoangEtudeTranscriptomiqueGenes,
+  author = {Lê-Hoang, Julie},
+  title  = {Etude transcriptomique des gènes dupliqués en tandem (TAG) chez Arabidopsis thaliana},
+  langid = {french}
+}
+
@thesis{leducEtudeEvolutionGenes,
  title = {Étude de l’évolution des gènes dupliqués chez les Rosaceae},
  author = {Leduc, Martin},
@ -529,6 +724,22 @@
  pmcid = {PMC6347962}
 }

+@online{moixPhylogeneticPlacementWhole2023,
+  author     = {Moix, Samuel and Glover, Natasha and Majidian, Sina},
+  title      = {Phylogenetic Placement of Whole Genome Duplications in Yeasts through Quantitative Analysis of Hierarchical Orthologous Groups},
+  date       = {2023-04-12},
+  number     = {12:382},
+  eprint     = {12:382},
+  eprinttype = {F1000Research},
+  doi        = {10.12688/f1000research.128656.1},
+  url        = {https://f1000research.com/articles/12-382},
+  urldate    = {2024-04-17},
+  pubstate   = {preprint},
+  langid     = {english},
+  keywords   = {comparative genomics,orthologous groups,whole genome duplications,yeast},
+  abstract   = {Background: Whole genome duplications (WGD) are genomic events leading to formation of polyploid organisms. Resulting duplicated genes play important roles in driving species evolution and diversification. After such events, the initial ploidy is usually restored, complicating their detection across evolution. With the advance of bioinformatics and the rising number of new well-assembled genomes, new detection methods are ongoingly being developed to overcome the weaknesses of different approaches. Results: Here we propose a novel method for detecting WGD in yeast lineages based on the quantitative and comparative analysis of hierarchical orthologous groups (HOGs) of duplicated genes for a given set of organisms. We reconstruct ancestral genomes to obtain evolutionary information for each phylogenetic branch. This reconstruction relies on the inference of HOGs from the selected species’ proteomes. To estimate WGD events, the number of HOGs of duplicated genes across all taxonomic ranges are adjusted according to the molecular clock hypothesis and by the average genome size. Branches with a significant increase in the adjusted number of duplicated gene families are kept as candidates for WGD placement. The developed method was tested on two real datasets and showed promising results in phylogenetic WGD placements on the yeast lineage.}
+}
+
@online{molderSustainableDataAnalysis2021a,
  title = {Sustainable Data Analysis with {{Snakemake}}},
  author = {Mölder, Felix and Jablonski, Kim Philipp and Letcher, Brice and Hall, Michael B. and Tomkins-Tinch, Christopher H. and Sochat, Vanessa and Forster, Jan and Lee, Soohyun and Twardziok, Sven O. and Kanitz, Alexander and Wilm, Andreas and Holtgrewe, Manuel and Rahmann, Sven and Nahnsen, Sven and Köster, Johannes},
@ -573,6 +784,40 @@
  langid = {english}
 }

+@article{ottoRecombinationSelectionEvolution2022,
+  title = {Recombination, Selection, and the Evolution of Tandem Gene Arrays},
+  author = {Otto, Moritz and Zheng, Yichen and Wiehe, Thomas},
+  date = {2022-07-01},
+  journaltitle = {Genetics},
+  shortjournal = {Genetics},
+  volume = {221},
+  number = {3},
+  pages = {iyac052},
+  issn = {1943-2631},
+  doi = {10.1093/genetics/iyac052},
+  url = {https://doi.org/10.1093/genetics/iyac052},
+  urldate = {2024-04-09},
+  abstract = {Multigene families—immunity genes or sensory receptors, for instance—are often subject to diversifying selection. Allelic diversity may be favored not only through balancing or frequency-dependent selection at individual loci but also by associating different alleles in multicopy gene families. Using a combination of analytical calculations and simulations, we explored a population genetic model of epistatic selection and unequal recombination, where a trade-off exists between the benefit of allelic diversity and the cost of copy abundance. Starting from the neutral case, where we showed that gene copy number is Gamma distributed at equilibrium, we derived also the mean and shape of the limiting distribution under selection. Considering a more general model, which includes variable population size and population substructure, we explored by simulations mean fitness and some summary statistics of the copy number distribution. We determined the relative effects of selection, recombination, and demographic parameters in maintaining allelic diversity and shaping the mean fitness of a population. One way to control the variance of copy number is by lowering the rate of unequal recombination. Indeed, when encoding recombination by a rate modifier locus, we observe exactly this prediction. Finally, we analyzed the empirical copy number distribution of 3 genes in human and estimated recombination and selection parameters of our model.}
+}
+
+@article{panTandemlyArrayedGenes2008,
+  title = {Tandemly {{Arrayed Genes}} in {{Vertebrate Genomes}}},
+  author = {Pan, Deng and Zhang, Liqing},
+  date = {2008},
+  journaltitle = {Comparative and Functional Genomics},
+  shortjournal = {Comp Funct Genomics},
+  volume = {2008},
+  eprint = {18815629},
+  eprinttype = {pmid},
+  pages = {545269},
+  issn = {1531-6912},
+  doi = {10.1155/2008/545269},
+  url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2547482/},
+  urldate = {2024-04-09},
+  abstract = {Tandemly arrayed genes (TAGs) are duplicated genes that are linked as neighbors on a chromosome, many of which have important physiological and biochemical functions. Here we performed a survey of these genes in 11 available vertebrate genomes. TAGs account for an average of about 14\% of all genes in these vertebrate genomes, and about 25\% of all duplications. The majority of TAGs (72–94\%) have parallel transcription orientation (i.e., they are encoded on the same strand) in contrast to the genome, which has about 50\% of its genes in parallel transcription orientation. The majority of tandem arrays have only two members. In all species, the proportion of genes that belong to TAGs tends to be higher in large gene families than in small ones; together with our recent finding that tandem duplication played a more important role than retroposition in large families, this fact suggests that among all types of duplication mechanisms, tandem duplication is the predominant mechanism of duplication, especially in large families. Finally, several species have a higher proportion of large tandem arrays that are species-specific than random expectation.},
+  pmcid = {PMC2547482}
+}
+
@online{PEPkitBioData,
  title = {{{PEPkit}}: The Bio Data Management Toolkit - {{PEPkit}}: The Bio Data Management Toolkit},
  url = {https://pep.databio.org/},
@ -596,6 +841,23 @@
  langid = {english}
 }

+@article{picart-picoloLargeTandemDuplications2020a,
+  title = {Large Tandem Duplications Affect Gene Expression, {{3D}} Organization, and Plant–Pathogen Response},
+  author = {Picart-Picolo, Ariadna and Grob, Stefan and Picault, Nathalie and Franek, Michal and Llauro, Christel and Halter, Thierry and Maier, Tom R. and Jobet, Edouard and Descombin, Julie and Zhang, Panpan and Paramasivan, Vijayapalani and Baum, Thomas J. and Navarro, Lionel and Dvořáčková, Martina and Mirouze, Marie and Pontvianne, Frédéric},
+  date = {2020-10-08},
+  journaltitle = {Genome Research},
+  shortjournal = {Genome Res.},
+  eprint = {33033057},
+  eprinttype = {pmid},
+  publisher = {Cold Spring Harbor Lab},
+  issn = {1088-9051, 1549-5469},
+  doi = {10.1101/gr.261586.120},
+  url = {https://genome.cshlp.org/content/early/2020/10/05/gr.261586.120},
+  urldate = {2024-04-09},
+  abstract = {Rapid plant genome evolution is crucial to adapt to environmental changes. Chromosomal rearrangements and gene copy number variation (CNV) are two important tools for genome evolution and sources for the creation of new genes. However, their emergence takes many generations. In this study, we show that in Arabidopsis thaliana, a significant loss of ribosomal RNA (rRNA) genes with a past history of a mutation for the chromatin assembly factor 1 (CAF1) complex causes rapid changes in the genome structure. Using long-read sequencing and microscopic approaches, we have identified up to 15 independent large tandem duplications in direct orientation (TDDOs) ranging from 60 kb to 1.44 Mb. Our data suggest that these TDDOs appeared within a few generations, leading to the duplication of hundreds of genes. By subsequently focusing on a line only containing 20\% of rRNA gene copies (20rDNA line), we investigated the impact of TDDOs on 3D genome organization, gene expression, and cytosine methylation. We found that duplicated genes often accumulate more transcripts. Among them, several are involved in plant–pathogen response, which could explain why the 20rDNA line is hyper-resistant to both bacterial and nematode infections. Finally, we show that the TDDOs create gene fusions and/or truncations and discuss their potential implications for the evolution of plant genomes.},
+  langid = {english}
+}
+
@online{ponsComputingCommunitiesLarge2005,
  title = {Computing Communities in Large Networks Using Random Walks (Long Version)},
  author = {Pons, Pascal and Latapy, Matthieu},
@ -605,11 +867,22 @@
  doi = {10.48550/arXiv.physics/0512106},
  url = {http://arxiv.org/abs/physics/0512106},
  urldate = {2024-03-30},
-  abstract = {Dense subgraphs of sparse graphs (communities), which appear in most real-world complex networks, play an important role in many contexts. Computing them however is generally expensive. We propose here a measure of similarities between vertices based on random walks which has several important advantages: it captures well the community structure in a network, it can be computed efficiently, and it can be used in an agglomerative algorithm to compute efficiently the community structure of a network. We propose such an algorithm, called Walktrap, which runs in time O(mn\^{}2) and space O(n\^{}2) in the worst case, and in time O(n\^{}2log n) and space O(n\^{}2) in most real-world cases (n and m are respectively the number of vertices and edges in the input graph). Extensive comparison tests show that our algorithm surpasses previously proposed ones concerning the quality of the obtained community structures and that it stands among the best ones concerning the running time.},
+  abstract = {Dense subgraphs of sparse graphs (communities), which appear in most real-world complex networks, play an important role in many contexts. Computing them however is generally expensive. We propose here a measure of similarities between vertices based on random walks which has several important advantages: it captures well the community structure in a network, it can be computed efficiently, and it can be used in an agglomerative algorithm to compute efficiently the community structure of a network. We propose such an algorithm, called Walktrap, which runs in time O(mn\textasciicircum 2) and space O(n\textasciicircum 2) in the worst case, and in time O(n\textasciicircum 2log n) and space O(n\textasciicircum 2) in most real-world cases (n and m are respectively the number of vertices and edges in the input graph). Extensive comparison tests show that our algorithm surpasses previously proposed ones concerning the quality of the obtained community structures and that it stands among the best ones concerning the running time.},
  pubstate = {preprint},
  keywords = {Condensed Matter - Disordered Systems and Neural Networks,Condensed Matter - Statistical Mechanics,Physics - Physics and Society}
 }

+@online{pontvianneDupliquerPourAdapter2020,
+  title = {Dupliquer pour s’adapter ou comment accélérer l’évolution des plantes ?},
+  organization = {CNRS Biologie},
+  author = {Pontvianne, Frédéric},
+  date = {2020-10-14},
+  url = {https://www.insb.cnrs.fr/fr/cnrsinfo/dupliquer-pour-sadapter-ou-comment-accelerer-levolution-des-plantes},
+  urldate = {2024-03-25},
+  abstract = {Les duplications de portions de chromosomes permettant aux organismes de dupliquer des gènes existants et d’en créer de nouveaux sont bien},
+  langid = {french}
+}
+
@article{reamsSelectionGeneClustering2004,
  title = {Selection for {{Gene Clustering}} by {{Tandem Duplication}}},
  author = {Reams, Andrew B. and Neidle, Ellen L.},
@ -640,6 +913,25 @@
  abstract = {We analyzed the distribution of 54 families of transposable elements (TEs; transposons, LTR retrotransposons, and non-LTR retrotransposons) in the chromosomes of Drosophila melanogaster, using data from the sequenced genome. The density of LTR and non-LTR retrotransposons (RNA-based elements) was high in regions with low recombination rates, but there was no clear tendency to parallel the recombination rate. However, the density of transposons (DNA-based elements) was significantly negatively correlated with recombination rate. The accumulation of TEs in regions of reduced recombination rate is compatible with selection acting against TEs, as selection is expected to be weaker in regions with lower recombination. The differences in the relationship between recombination rate and TE density that exist between chromosome arms suggest that TE distribution depends on specific characteristics of the chromosomes (chromatin structure, distribution of other sequences), the TEs themselves (transposition mechanism), and the species (reproductive system, effective population size, etc.), that have differing influences on the effect of natural selection acting against the TE insertions.}
 }

+@article{rizzonStrikingSimilaritiesGenomic2006,
+  title = {Striking {{Similarities}} in the {{Genomic Distribution}} of {{Tandemly Arrayed Genes}} in {{Arabidopsis}} and {{Rice}}},
+  author = {Rizzon, Carene and Ponger, Loic and Gaut, Brandon S},
+  date = {2006-09},
+  journaltitle = {PLoS Computational Biology},
+  shortjournal = {PLoS Comput Biol},
+  volume = {2},
+  number = {9},
+  eprint = {16948529},
+  eprinttype = {pmid},
+  pages = {e115},
+  issn = {1553-734X},
+  doi = {10.1371/journal.pcbi.0020115},
+  url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1557586/},
+  urldate = {2024-04-10},
+  abstract = {In Arabidopsis, tandemly arrayed genes (TAGs) comprise {$>$}10\% of the genes in the genome. These duplicated genes represent a rich template for genetic innovation, but little is known of the evolutionary forces governing their generation and maintenance. Here we compare the organization and evolution of TAGs between Arabidopsis and rice, two plant genomes that diverged \textasciitilde 150 million years ago. TAGs from the two genomes are similar in a number of respects, including the proportion of genes that are tandemly arrayed, the number of genes within an array, the number of tandem arrays, and the dearth of TAGs relative to single copy genes in centromeric regions. Analysis of recombination rates along rice chromosomes confirms a positive correlation between the occurrence of TAGs and recombination rate, as found in Arabidopsis. TAGs are also biased functionally relative to duplicated, nontandemly arrayed genes. In both genomes, TAGs are enriched for genes that encode membrane proteins and function in “abiotic and biotic stress” but underrepresented for genes involved in transcription and DNA or RNA binding functions. We speculate that these observations reflect an evolutionary trend in which successful tandem duplication involves genes either at the end of biochemical pathways or in flexible steps in a pathway, for which fluctuation in copy number is unlikely to affect downstream genes. Despite differences in the age distribution of tandem arrays, the striking similarities between rice and Arabidopsis indicate similar mechanisms of TAG generation and maintenance. The nuclear genomes of higher plants vary tremendously in size and gene content. Much of this variation is attributable to gene duplication. To date, most studies of plant gene duplication have focused on whole genome duplication events, which duplicate all genes simultaneously. 
Another prominent process is single gene duplication, which often results in duplicated genes arranged in a tandem array. Here Rizzon, Ponger, and Gaut identify tandem arrays in rice and their genome organization between Arabidopsis and rice, two plant species that diverged \textasciitilde 150 million years ago. The two genomes contain a similar proportion of genes that are tandemly arrayed, with a similar number of genes within an array. Moreover, tandemly arrayed genes are most common in genomic regions of high recombination in both species. This organization appears to be a general feature of eukaryotic genomes, perhaps because duplication rates are higher in high recombination regions. Tandemly arrayed genes of rice and Arabidopsis also represent a biased gene set with regard to function. In contrast to genes duplicated through whole genome events, tandemly arrayed genes are enriched for genes that encode membrane proteins and genes that function in response to environmental stresses. Taken together, these observations suggest that tandemly arrayed genes represent a rich and relatively fluid source for plant adaptation.},
+  pmcid = {PMC1557586}
+}
+
@article{rognesParAlignParallelSequence2001,
  title = {{{ParAlign}}: A Parallel Sequence Alignment Algorithm for Rapid and Sensitive Database Searches},
  shorttitle = {{{ParAlign}}},
@ -690,6 +982,24 @@
  issue = {suppl\_2}
 }

+@article{shojaRoadmapTandemlyArrayed2006,
+  title = {A Roadmap of Tandemly Arrayed Genes in the Genomes of Human, Mouse, and Rat},
+  author = {Shoja, Valia and Zhang, Liqing},
+  date = {2006-11},
+  journaltitle = {Molecular Biology and Evolution},
+  shortjournal = {Mol Biol Evol},
+  volume = {23},
+  number = {11},
+  eprint = {16901985},
+  eprinttype = {pmid},
+  pages = {2134--2141},
+  issn = {0737-4038},
+  doi = {10.1093/molbev/msl085},
+  abstract = {Tandemly arrayed genes (TAGs) play an important functional and physiological role in the genome. Most previous studies have focused on individual TAG families in a few species, yet a broad characterization of TAGs is not available. Here we identified all TAGs in the genomes of humans, mouse, and rat and performed a comprehensive analysis of TAG distribution, TAG sizes, TAG orientations and intergenic distances, and TAG functions. TAGs account for about 14-17\% of all genes in the genome and nearly one-third of all duplicated genes, highlighting the predominant role that tandem duplication plays in gene duplication. For all species, TAG distribution is highly heterogeneous along chromosomes and some chromosomes are enriched with TAG forests, whereas others are enriched with TAG deserts. The majority of TAGs are of size 2 for all genomes, similar to the previous findings in Caenorhabditis elegans, Arabidopsis thaliana, and Oryza sativa, suggesting that it is a rather general phenomenon in eukaryotes. The comparison with the genome patterns shows that TAG members have a significantly higher proportion of parallel gene orientation in all species, corroborating Graham's claim that parallel orientation is the preferred form of orientation in TAGs. Moreover, TAG members with parallel orientation tend to be closer to each other than all neighboring genes in the genome with parallel orientation. The analyses of Gene Ontology function indicate that genes with receptor or binding activities are significantly overrepresented by TAGs. Computer simulation reveals that random gene rearrangements have little effect on the statistics of TAGs for all genomes. Finally, the average proportion of TAGs shows a trend of increase with the increase of family sizes, although the correlation between TAG proportions in individual families and family sizes is not significant.},
+  langid = {english},
+  keywords = {Animals,Chromosome Mapping,Chromosomes,Gene Duplication,Genome,Humans,Mice,Rats,Recombination Genetic,Tandem Repeat Sequences}
+}
+
@article{smithIdentificationCommonMolecular1981,
  title = {Identification of Common Molecular Subsequences},
  author = {Smith, T. F. and Waterman, M. S.},
@ -759,6 +1069,41 @@
  keywords = {Algorithms,Base Sequence,Chromosome Mapping,Computer Graphics,DNA,DNA Mutational Analysis,Molecular Sequence Data,Sequence Alignment,Sequence Analysis DNA,Software,User-Computer Interface}
 }

+@article{thibaud-nissenIdentificationCharacterizationPseudogenes2009,
+  title = {Identification and Characterization of Pseudogenes in the Rice Gene Complement},
+  author = {Thibaud-Nissen, Françoise and Ouyang, Shu and Buell, C. Robin},
+  date = {2009-07-16},
+  journaltitle = {BMC Genomics},
+  shortjournal = {BMC Genomics},
+  volume = {10},
+  number = {1},
+  pages = {317},
+  issn = {1471-2164},
+  doi = {10.1186/1471-2164-10-317},
+  url = {https://doi.org/10.1186/1471-2164-10-317},
+  urldate = {2024-04-09},
+  abstract = {The Osa1 Genome Annotation of rice (Oryza sativa L. ssp. japonica cv. Nipponbare) is the product of a semi-automated pipeline that does not explicitly predict pseudogenes. As such, it is likely to mis-annotate pseudogenes as functional genes. A total of 22,033 gene models within the Osa1 Release 5 were investigated as potential pseudogenes as these genes exhibit at least one feature potentially indicative of pseudogenes: lack of transcript support, short coding region, long untranslated region, or, for genes residing within a segmentally duplicated region, lack of a paralog or significantly shorter corresponding paralog.},
+  langid = {english},
+  keywords = {Duplicate Region,GOSlim Term,Massively Parallel Signature Sequencing,Massively Parallel Signature Sequencing Data,Paralogous Family}
+}
+
+@article{tremblaysavardEvolutionOrthologousTandemly2011,
+  title = {Evolution of Orthologous Tandemly Arrayed Gene Clusters},
+  author = {Tremblay Savard, Olivier and Bertrand, Denis and El-Mabrouk, Nadia},
+  date = {2011-10-05},
+  journaltitle = {BMC Bioinformatics},
+  shortjournal = {BMC Bioinformatics},
+  volume = {12},
+  number = {9},
+  pages = {S2},
+  issn = {1471-2105},
+  doi = {10.1186/1471-2105-12-S9-S2},
+  url = {https://doi.org/10.1186/1471-2105-12-S9-S2},
+  urldate = {2024-04-09},
+  abstract = {Tandemly Arrayed Gene (TAG) clusters are groups of paralogous genes that are found adjacent on a chromosome. TAGs represent an important repertoire of genes in eukaryotes. In addition to tandem duplication events, TAG clusters are affected during their evolution by other mechanisms, such as inversion and deletion events, that affect the order and orientation of genes. The DILTAG algorithm developed in [1] makes it possible to infer a set of optimal evolutionary histories explaining the evolution of a single TAG cluster, from an ancestral single gene, through tandem duplications (simple or multiple, direct or inverted), deletions and inversion events.},
+  keywords = {Gene Order,Gene Tree,Internal Vertex,Inversion Event,Tandem Duplication}
+}
+
@article{vandongenGraphClusteringDiscrete2008a,
  title = {Graph {{Clustering Via}} a {{Discrete Uncoupling Process}}},
  author = {Van Dongen, Stijn},
@ -787,6 +1132,58 @@
  langid = {english}
 }

+@incollection{vandongenUsingMCLExtract2012,
+  title = {Using {{MCL}} to {{Extract Clusters}} from {{Networks}}},
+  booktitle = {Bacterial {{Molecular Networks}}},
+  author = {Van Dongen, Stijn and Abreu-Goodger, Cei},
+  editor = {Van Helden, Jacques and Toussaint, Ariane and Thieffry, Denis},
+  date = {2012},
+  volume = {804},
+  pages = {281--295},
+  publisher = {Springer New York},
+  location = {New York, NY},
+  doi = {10.1007/978-1-61779-361-5_15},
+  url = {http://link.springer.com/10.1007/978-1-61779-361-5_15},
+  urldate = {2024-04-11},
+  isbn = {978-1-61779-360-8 978-1-61779-361-5},
+  langid = {english}
+}
+
+@incollection{vandongenUsingMCLExtract2012a,
+  title = {Using {{MCL}} to {{Extract Clusters}} from {{Networks}}},
+  booktitle = {Bacterial {{Molecular Networks}}},
+  author = {Van Dongen, Stijn and Abreu-Goodger, Cei},
+  editor = {Van Helden, Jacques and Toussaint, Ariane and Thieffry, Denis},
+  date = {2012},
+  volume = {804},
+  pages = {281--295},
+  publisher = {Springer New York},
+  location = {New York, NY},
+  doi = {10.1007/978-1-61779-361-5_15},
+  url = {http://link.springer.com/10.1007/978-1-61779-361-5_15},
+  urldate = {2024-04-11},
+  isbn = {978-1-61779-360-8 978-1-61779-361-5},
+  langid = {english}
+}
+
+@article{vizuetaBitacoraComprehensiveTool2020,
+  title = {Bitacora: {{A}} Comprehensive Tool for the Identification and Annotation of Gene Families in Genome Assemblies},
+  shorttitle = {Bitacora},
+  author = {Vizueta, Joel and Sánchez-Gracia, Alejandro and Rozas, Julio},
+  date = {2020},
+  journaltitle = {Molecular Ecology Resources},
+  volume = {20},
+  number = {5},
+  pages = {1445--1452},
+  issn = {1755-0998},
+  doi = {10.1111/1755-0998.13202},
+  url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/1755-0998.13202},
+  urldate = {2024-04-09},
+  abstract = {Gene annotation is a critical bottleneck in genomic research, especially for the comprehensive study of very large gene families in the genomes of nonmodel organisms. Despite the recent progress in automatic methods, state-of-the-art tools used for this task often produce inaccurate annotations, such as fused, chimeric, partial or even completely absent gene models for many family copies, errors that require considerable extra efforts to be corrected. Here we present bitacora, a bioinformatics solution that integrates popular sequence similarity-based search tools and Perl scripts to facilitate both the curation of these inaccurate annotations and the identification of previously undetected gene family copies directly in genomic DNA sequences. We tested the performance of bitacora in annotating the members of two chemosensory gene families with different repertoire size in seven available genome sequences, and compared its performance with that of augustus-ppx, a tool also designed to improve automatic annotations using a sequence similarity-based approach. Despite the relatively high fragmentation of some of these drafts, bitacora was able to improve the annotation of many members of these families and detected thousands of new chemoreceptors encoded in genome sequences. The program creates general feature format (GFF) files, with both curated and newly identified gene models, and FASTA files with the predicted proteins. These outputs can be easily integrated in genomic annotation editors, greatly facilitating subsequent manual annotation and downstream evolutionary analyses.},
+  langid = {english},
+  keywords = {bioinfomatics/phyloinfomatics,gene families,gene structure and function,genomics,molecular evolution,structural annotation,transcriptomics}
+}
+
@article{wolfeRobustnessItNot2000,
  title = {Robustness—It's Not Where You Think It Is},
  author = {Wolfe, Ken},
--- a/report.org
+++ b/report.org
@ -123,9 +123,19 @@ A typical DNA transposon contains a transposase gene. This enzyme recognizes two
 Finally, glspl:segment_duplication, also called /low copy repeats/ are long stretches of DNA with high identity score ([[cref:fig:gene-duplication-mechanisms]] (F)). Their exact duplication mechanism remains unclear [cite:@lallemandOverviewDuplicatedGene2020]. They may come from an accidental replication, distinct from an uneven cross-over or a double stranded breakage.
 Transposable elements may well be involved in the mechanism, as a high enrichment of transposable elements is found next to duplicate segment extremities, in /Drosophila/ [cite:@lallemandOverviewDuplicatedGene2020].

+#+begin_export latex
+\fladdfig{
+	\includegraphics[width=.9\linewidth]{figures/Evolution_fate_duplicate_genes.pdf}
+	\caption[Fate of duplicate genes]{\label{fig:fate-duplicate-genes} Fate of duplicate genes. An original gene with four functions is duplicated. Its two copies may both keep the original functions (functional redundancy). The original functions may split between the different copies (subfunctionalization). One of the copies may acquire a new function (neofunctionalization). It may also degenerate and lose its original functions (pseudogenization).
+
+        Adapted from \href{https://commons.wikimedia.org/wiki/File:Evolution_fate_duplicate_genes_-_vector.svg}{Smedlib}, \href{https://creativecommons.org/licenses/by-sa/4.0}{CC BY-SA 4.0}, via Wikimedia Commons}
+}
+#+end_export
+
 ** Fate of duplicate genes in genome evolution
 In his book /Evolution by Gene Duplication/, Susumu [[latex:textsc][Ohno]] proposed that gene duplication plays a major role in species evolution [cite:@ohnoEvolutionGeneDuplication1970], because it provides new genetic materials to build on new phenotypes while keeping a backup gene for the previous function.
 Indeed, duplicate genes evolve after duplication: they may be inactivated, and become glspl:pseudogene; they may be deleted or conserved, and if conserved, they may or may not acquire a new function.
+[[Cref:fig:fate-duplicate-genes]] depicts the different possible fates of a duplicate gene.

 # *** Pseudogenization
 As the genome evolves, duplicate genes may be inactivated and become pseudogenes. These pseudogenes keep a gene-like structure which degrades as and when further genome modifications occur but they are no longer expressed.
@ -140,6 +150,12 @@ Two duplicate genes with the same original function may encounter a gls:subfunct
 # *** Functional redundancy
 Another possibility is that the two gene copies keep the ancestral function, resulting in functional redundancy. In this case the quantity of gene product may increase.
 ** Methods to identify duplicate genes
+#+begin_export latex
+\fladdfig{
+	\includegraphics[width=.9\linewidth]{./figures/tag-definition.pdf}
+	\caption[Schematic representation of TAG definitions]{\label{fig:tag-definitions} Schematic representation of TAG definitions. Several genes are represented on a linear chromosome. The red box represents a singleton gene. Orange boxes represent a TAG with two duplicate genes separated by 7 other genes ($\mathrm{TAG}_7$). Four green boxes constitute a TAG; the genes at the extremities are separated by three genes ($\mathrm{TAG}_3$). The two blue boxes represent a TAG with two genes next to each other ($\mathrm{TAG}_0$). The curved edges represent the homology links between each pair of genes of a TAG.}}
+#+end_export
+
 Different methods exist to detect duplicate genes. These methods depend on the type of duplicate genes they target and vary in computational burden as well as in ease of use (for a review, see [cite/t:@lallemandOverviewDuplicatedGene2020]).

 *** Paralog detection
@ -161,15 +177,10 @@ Several =BLAST= metrics can be used as an homology measure, such as bitscore, id
 **** Identification of gene families
 Based on the homology links between each pair of genes, we construct an undirected weighted graph whose vertices correspond to genes and edges to homology links between them.
 We apply a graph clustering algorithm on the graph in order to infer the gene families corresponding to densely connected communities of vertices.
-
-#+begin_export latex
-\fladdfig{
-	\includegraphics[width=.9\linewidth]{./figures/tag-definition.pdf}
-	\caption[Schematic representation of TAG definitions]{\label{fig:tag-definitions} Schematic representation of TAG definitions. Several genes are represented on a linear chromosome. The red box represent a singleton gene. Orange boxes represent a TAG with two duplicate genes seperated by 7 other genes ($\mathrm{TAG}_7$). Four green boxes constitute a TAG, the gene at the extremities are seperated by three genes ($\mathrm{TAG}_3$. The two blue boxes represents a TAG with two genes next to each other $\mathrm{TAG}_0$. The bended edges represents the homology links between each pair of genes of a TAG.}}
-#+end_export
-
-
 FTAG Finder proposes three clustering algorithm alternatives: single linkage, Markov Clustering [cite:@vandongenNewClusterAlgorithm1998] or Walktrap [cite:@ponsComputingCommunitiesLarge2005].
+
+
+
 **** Detection of TAGs
 The final step of FTAG Finder consists in the identification of gls:TAG from the gene families and the positions of genes.
 For a given chromosome, the tool seeks genes belonging to the same family and located close to each other. The tool allows a maximal number of genes between the homologous genes, with a parameter set by the user. Cref:fig:tag-definitions is a schematic representation of some possible gls:TAG positioning on a genome associated with their definition in FTAG Finder /Find Tags/ step.
@ -192,6 +203,9 @@ The two main options being Snakemake and Nextflow. Snakemake is a python powered
 \flstop
 #+end_export

+* Methodological approaches
+
+
 * References
 :PROPERTIES:
 :UNNUMBERED: t
--- a/report.pdf
+++ b/report.pdf