lamme2024-scientific-project/references.bib

@article{acharyaGlobalAnalysisHuman2016,
  title = {Global Analysis of Human Duplicated Genes Reveals the Relative Importance of Whole-Genome Duplicates Originated in the Early Vertebrate Evolution},
  author = {Acharya, Debarun and Ghosh, Tapash C.},
  date = {2016-01-22},
  journaltitle = {BMC Genomics},
  shortjournal = {BMC Genomics},
  volume = {17},
  eprint = {26801093},
  eprinttype = {pmid},
  pages = {71},
  issn = {1471-2164},
  doi = {10.1186/s12864-016-2392-0},
  url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4724117/},
  urldate = {2024-03-27},
  abstract = {Background Gene duplication is a genetic mutation that creates functionally redundant gene copies that are initially relieved from selective pressures and may adapt themselves to new functions with time. The levels of gene duplication may vary from small-scale duplication (SSD) to whole genome duplication (WGD). Studies with yeast revealed ample differences between these duplicates: Yeast WGD pairs were functionally more similar, less divergent in subcellular localization and contained a lesser proportion of essential genes. In this study, we explored the differences in evolutionary genomic properties of human SSD and WGD genes, with the identifiable human duplicates coming from the two rounds of whole genome duplication occurred early in vertebrate evolution. Results We observed that these two groups of duplicates were also dissimilar in terms of their evolutionary and genomic properties. But interestingly, this is not like the same observed in yeast. The human WGDs were found to be functionally less similar, diverge more in subcellular level and contain a higher proportion of essential genes than the SSDs, all of which are opposite from yeast. Additionally, we explored that human WGDs were more divergent in their gene expression profile, have higher multifunctionality and are more often associated with disease, and are evolutionarily more conserved than human SSDs. Conclusions Our study suggests that human WGD duplicates are more divergent and entails the adaptation of WGDs to novel and important functions that consequently lead to their evolutionary conservation in the course of evolution. Electronic supplementary material The online version of this article (doi:10.1186/s12864-016-2392-0) contains supplementary material, which is available to authorized users.},
  pmcid = {PMC4724117}
}

@article{altschulBasicLocalAlignment1990,
  title = {Basic Local Alignment Search Tool},
  author = {Altschul, Stephen F. and Gish, Warren and Miller, Webb and Myers, Eugene W. and Lipman, David J.},
  date = {1990-10-05},
  journaltitle = {Journal of Molecular Biology},
  shortjournal = {Journal of Molecular Biology},
  volume = {215},
  number = {3},
  pages = {403--410},
  issn = {0022-2836},
  doi = {10.1016/S0022-2836(05)80360-2},
  url = {https://www.sciencedirect.com/science/article/pii/S0022283605803602},
  urldate = {2023-04-30},
  abstract = {A new approach to rapid sequence comparison, basic local alignment search tool (BLAST), directly approximates alignments that optimize a measure of local similarity, the maximal segment pair (MSP) score. Recent mathematical results on the stochastic properties of MSP scores allow an analysis of the performance of this method as well as the statistical significance of alignments it generates. The basic algorithm is simple and robust; it can be implemented in a number of ways and applied in a variety of contexts including straight-forward DNA and protein sequence database searches, motif searches, gene identification searches, and in the analysis of multiple regions of similarity in long DNA sequences. In addition to its flexibility and tractability to mathematical analysis, BLAST is an order of magnitude faster than existing sequence comparison tools of comparable sensitivity.},
  langid = {english}
}

@article{assisModelsRetentionDuplicate2024,
  title = {Models for the Retention of Duplicate Genes and Their Biological Underpinnings},
  author = {Assis, Raquel and Conant, Gavin and Holland, Barbara and Liberles, David and O'Reilly, Małgorzata and Wilson, Amanda},
  date = {2024-02-12},
  journaltitle = {F1000Research},
  shortjournal = {F1000Research},
  volume = {12},
  pages = {1400},
  doi = {10.12688/f1000research.141786.2},
  abstract = {Gene content in genomes changes through several different processes, with gene duplication being an important contributor to such changes. Gene duplication occurs over a range of scales from individual genes to whole genomes, and the dynamics of this process can be context dependent. Still, there are rules by which genes are retained or lost from genomes after duplication, and probabilistic modeling has enabled characterization of these rules, including their context-dependence. Here, we describe the biology and corresponding mathematical models that are used to understand duplicate gene retention and its contribution to the set of biochemical functions encoded in a genome.},
  keywords = {2read}
}

@article{beallIdentificationAnalysisHyperactive2002,
  title = {Identification and Analysis of a Hyperactive Mutant Form of {Drosophila} {P-Element} Transposase},
  author = {Beall, Eileen L and Mahoney, Matthew B and Rio, Donald C},
  date = {2002-09-01},
  journaltitle = {Genetics},
  shortjournal = {Genetics},
  volume = {162},
  number = {1},
  pages = {217--227},
  issn = {1943-2631},
  doi = {10.1093/genetics/162.1.217},
  url = {https://doi.org/10.1093/genetics/162.1.217},
  urldate = {2024-03-25},
  abstract = {Transposition in many organisms is regulated to control the frequency of DNA damage caused by the DNA breakage and joining reactions. However, genetic studies in prokaryotic systems have led to the isolation of mutant transposase proteins with higher or novel activities compared to those of the wild-type protein. In the course of our study of the effects of mutating potential ATM-family DNA damage checkpoint protein kinase sites in the Drosophila P-element transposase protein, we found one mutation, S129A, that resulted in an elevated level of transposase activity using in vivo recombination assays, including P-element-mediated germline transformation. In vitro assays for P-element transposase activity indicate that the S129A mutant exhibits elevated donor DNA cleavage activity when compared to the wild-type protein, whereas the strand-transfer activity is similar to that of wild type. This difference may reflect the nature of the in vitro assays and that normally in vivo the two reactions may proceed in concert. The P-element transposase protein contains 10 potential consensus phosphorylation sites for the ATM family of PI3-related protein kinases. Of these 10 sites, 8 affect transposase activity either positively or negatively when substituted individually with alanine and tested in vivo. A mutant transposase protein that contains all eight N-terminal serine and threonine residues substituted with alanine is inactive and can be restored to full activity by substitution of wild-type amino acids back at only 3 of the 8 positions. These data suggest that the activity of P-element transposase may be regulated by phosphorylation and demonstrate that one mutation, S129A, results in hyperactive transposition.}
}

@article{berthelotRainbowTroutGenome2014,
  title = {The Rainbow Trout Genome Provides Novel Insights into Evolution after Whole-Genome Duplication in Vertebrates},
  author = {Berthelot, Camille and Brunet, Frédéric and Chalopin, Domitille and Juanchich, Amélie and Bernard, Maria and Noël, Benjamin and Bento, Pascal and Da Silva, Corinne and Labadie, Karine and Alberti, Adriana and Aury, Jean-Marc and Louis, Alexandra and Dehais, Patrice and Bardou, Philippe and Montfort, Jérôme and Klopp, Christophe and Cabau, Cédric and Gaspin, Christine and Thorgaard, Gary H. and Boussaha, Mekki and Quillet, Edwige and Guyomard, René and Galiana, Delphine and Bobe, Julien and Volff, Jean-Nicolas and Genêt, Carine and Wincker, Patrick and Jaillon, Olivier and Roest Crollius, Hugues and Guiguen, Yann},
  date = {2014-04-22},
  journaltitle = {Nature Communications},
  shortjournal = {Nat Commun},
  volume = {5},
  eprint = {24755649},
  eprinttype = {pmid},
  pages = {3657},
  issn = {2041-1723},
  doi = {10.1038/ncomms4657},
  abstract = {Vertebrate evolution has been shaped by several rounds of whole-genome duplications (WGDs) that are often suggested to be associated with adaptive radiations and evolutionary innovations. Due to an additional round of WGD, the rainbow trout genome offers a unique opportunity to investigate the early evolutionary fate of a duplicated vertebrate genome. Here we show that after 100 million years of evolution the two ancestral subgenomes have remained extremely collinear, despite the loss of half of the duplicated protein-coding genes, mostly through pseudogenization. In striking contrast is the fate of miRNA genes that have almost all been retained as duplicated copies. The slow and stepwise rediploidization process characterized here challenges the current hypothesis that WGD is followed by massive and rapid genomic reorganizations and gene deletions.},
  langid = {english},
  pmcid = {PMC4071752},
  keywords = {2read,Animals,Evolution Molecular,Gene Duplication,Oncorhynchus mykiss,Vertebrates}
}

@article{blankenbergGalaxyWebbasedGenome2010,
  title = {Galaxy: A Web-Based Genome Analysis Tool for Experimentalists},
  shorttitle = {Galaxy},
  author = {Blankenberg, Daniel and Von Kuster, Gregory and Coraor, Nathaniel and Ananda, Guruprasad and Lazarus, Ross and Mangan, Mary and Nekrutenko, Anton and Taylor, James},
  date = {2010-01},
  journaltitle = {Current Protocols in Molecular Biology},
  shortjournal = {Curr Protoc Mol Biol},
  volume = {89},
  eprint = {20069535},
  eprinttype = {pmid},
  pages = {19.10.1--19.10.21},
  issn = {1934-3647},
  doi = {10.1002/0471142727.mb1910s89},
  abstract = {High-throughput data production has revolutionized molecular biology. However, massive increases in data generation capacity require analysis approaches that are more sophisticated, and often very computationally intensive. Thus, making sense of high-throughput data requires informatics support. Galaxy (http://galaxyproject.org) is a software system that provides this support through a framework that gives experimentalists simple interfaces to powerful tools, while automatically managing the computational details. Galaxy is distributed both as a publicly available Web service, which provides tools for the analysis of genomic, comparative genomic, and functional genomic data, or a downloadable package that can be deployed in individual laboratories. Either way, it allows experimentalists without informatics or programming expertise to perform complex large-scale analysis with just a Web browser.},
  langid = {english},
  pmcid = {PMC4264107},
  keywords = {Animals,Computational Biology,Genetic Techniques,Genome,Humans,Internet,Software Design}
}

@misc{bouillonFTAGFinderOutil2016,
  title = {{FTAG Finder}: Un outil simple pour déterminer les familles de gènes et les gènes dupliqués en tandem sous {Galaxy}},
  author = {Bouillon, Bérengère and Samson, Franck and Birmelé, Etienne and Ponger, Loïc and Rizzon, Carène},
  date = {2016},
  langid = {french}
}

@article{buchfinkSensitiveProteinAlignments2021,
  title = {Sensitive Protein Alignments at Tree-of-Life Scale Using {{DIAMOND}}},
  author = {Buchfink, Benjamin and Reuter, Klaus and Drost, Hajk-Georg},
  date = {2021-04},
  journaltitle = {Nature Methods},
  shortjournal = {Nat Methods},
  volume = {18},
  number = {4},
  pages = {366--368},
  publisher = {Nature Publishing Group},
  issn = {1548-7105},
  doi = {10.1038/s41592-021-01101-x},
  url = {https://www.nature.com/articles/s41592-021-01101-x},
  urldate = {2024-03-28},
  abstract = {We are at the beginning of a genomic revolution in which all known species are planned to be sequenced. Accessing such data for comparative analyses is crucial in this new age of data-driven biology. Here, we introduce an improved version of DIAMOND that greatly exceeds previous search performances and harnesses supercomputing to perform tree-of-life scale protein alignments in hours, while matching the sensitivity of the gold standard BLASTP.},
  langid = {english},
  keywords = {Computational biology and bioinformatics,Genome informatics,Genomic analysis,Sequencing,Software}
}

@inproceedings{caronCyberGalaxy2013,
  title = {Towards a Cyber {Galaxy}?},
  author = {Caron, Christophe C. and Carré, Wilfrid and Cormier, Alexandre and Derozier, Sandra S. and Giacomoni, Franck and Inizan, Olivier and Le Corguillé, Gildas and Lermine, Alban and Maman Haddad, Sarah and Pericard, Pierre and Samson, Franck F.},
  date = {2013-07},
  booktitle = {{JOBIM} Toulouse 2013 -- Résumés courts (Affiches)},
  pages = {246},
  url = {https://hal.inrae.fr/hal-02748994},
  urldate = {2024-04-09},
  abstract = {The success of the open web based platform “Galaxy” is growing among diverse scientific communities. The French Institute of Bioinformatics - IFB wish to initiate a collaborative work dedicated to scientific workflows and especially to the platform Galaxy. We report here the main items on which future collaborations could be build: (i) software and hardware architecture, (ii) tools integration and (iii) training.},
  keywords = {formation,galaxy,intégration d'outils,NGS,partage de données,workflow},
  annotation = {Published: JOBIM 2013}
}

@inproceedings{caronFrenchCyberGalaxy2013,
  title = {Toward a {French} Cyber {Galaxy}?},
  author = {Caron, Christophe C. and Carré, Wilfrid and Cormier, Alexandre and Derozier, Sandra S. and Giacomoni, Franck and Inizan, Olivier and Le Corguillé, Gildas and Lermine, Alban and Maman Haddad, Sarah and Pericard, Pierre and Samson, Franck F.},
  date = {2013-06},
  booktitle = {{Galaxy Community Conference} 2013: Posters / Abstracts},
  url = {https://hal.inrae.fr/hal-02748274},
  urldate = {2024-04-09},
  abstract = {The success of the open web based platform “Galaxy” is growing among scientific communities. The French Institute of Bioinformatics (IFB) wishes to initiate a collaborative work dedicated to scientific workflows and especially to the Galaxy platform. We report here the main items on which future collaborations could be build: (i) software and hardware architecture, (ii) tools integration and (iii) training. High throughput technologies advent significantly alters analysis behaviour and strategy with mobilization of new infrastructure, new tools and new skills. IFB decided to conduct a cross action on "workflows" data analysis solutions, and especially on the Galaxy platform. The first item called "software and hardware architecture" addresses the operational issues in production environments, the potential for automating deployment tasks and the monitoring solutions for Galaxy servers. With the second one, "Tools integration" we aim to provide processes facilitating tool interfacing in a Galaxy instance. Priority will be the development of a good practice guide, as well as a technology watch around the methods proposed by the international community. We also want to promote the sharing of training activities at national level (such as the Aviesan Bioinformatics school, January 2013 - http://galaxy-ecole.sb-roscoff.fr/) and ensure a smooth transition to new uses, such as E-learning. A first working group is already effective. Previous items will be improved in the coming months thanks to a specific dedicated wiki and the first French Galaxy Workshop this autumn.},
  keywords = {data sharing,Galaxy,NGS,tools integration,training,workflow},
  annotation = {Published: Galaxy Community Conference}
}

@article{casneufNonrandomDivergenceGene2006,
  title = {Nonrandom Divergence of Gene Expression Following Gene and Genome Duplications in the Flowering Plant {{Arabidopsis}} Thaliana},
  author = {Casneuf, Tineke and De Bodt, Stefanie and Raes, Jeroen and Maere, Steven and Van de Peer, Yves},
  date = {2006-02-20},
  journaltitle = {Genome Biology},
  shortjournal = {Genome Biology},
  volume = {7},
  number = {2},
  pages = {R13},
  issn = {1474-760X},
  doi = {10.1186/gb-2006-7-2-r13},
  url = {https://doi.org/10.1186/gb-2006-7-2-r13},
  urldate = {2024-04-13},
  abstract = {Genome analyses have revealed that gene duplication in plants is rampant. Furthermore, many of the duplicated genes seem to have been created through ancient genome-wide duplication events. Recently, we have shown that gene loss is strikingly different for large- and small-scale duplication events and highly biased towards the functional class to which a gene belongs. Here, we study the expression divergence of genes that were created during large- and small-scale gene duplication events by means of microarray data and investigate both the influence of the origin (mode of duplication) and the function of the duplicated genes on expression divergence.},
  keywords = {2read,Additional Data File,Anchor Point,Duplicate Gene,Duplication Event,Expression Divergence}
}

@report{charlesFinalisationPipelineFTAG2023,
  type = {Internship Report},
  title = {Finalisation du pipeline FTAG (Families and TAG) Finder, un outil de détection des gènes dupliqués sous Galaxy},
  author = {Charles, Séanna},
  date = {2023},
  institution = {Laboratoire de Mathématiques et Modélisation d'Évry},
  langid = {french}
}

@article{conantTurningHobbyJob2008,
  title = {Turning a Hobby into a Job: {{How}} Duplicated Genes Find New Functions},
  shorttitle = {Turning a Hobby into a Job},
  author = {Conant, Gavin C. and Wolfe, Kenneth H.},
  date = {2008-12},
  journaltitle = {Nature Reviews Genetics},
  shortjournal = {Nat Rev Genet},
  volume = {9},
  number = {12},
  pages = {938--950},
  issn = {1471-0056},
  doi = {10.1038/nrg2482},
  url = {https://www.nature.com/articles/nrg2482},
  urldate = {2024-03-19},
  abstract = {Gene duplication provides raw material for functional innovation. Recent advances have shed light on two fundamental questions regarding gene duplication: which genes tend to undergo duplication? And how does natural selection subsequently act on them? Genomic data suggest that different gene classes tend to be retained after single-gene and whole-genome duplications. We also know that functional differences between duplicate genes can originate in several different ways, including mutations that directly impart new functions, subdivision of ancestral functions and selection for changes in gene dosage. Interestingly, in many cases the ‘new’ function of one copy is a secondary property that was always present, but that has been co-opted to a primary role after the duplication.},
  langid = {english}
}

@article{correaTransposableElementEnvironment2021,
  ids = {correaTransposableElementEnvironment2021a},
  title = {The Transposable Element Environment of Human Genes Differs According to Their Duplication Status and Essentiality},
  author = {Correa, Margot and Lerat, Emmanuelle and Birmelé, Etienne and Samson, Franck and Bouillon, Bérengère and Normand, Kévin and Rizzon, Carène},
  editor = {Pritham, Ellen},
  date = {2021-05-01},
  journaltitle = {Genome Biology and Evolution},
  shortjournal = {Genome Biology and Evolution},
  volume = {13},
  number = {5},
  pages = {evab062},
  issn = {1759-6653},
  doi = {10.1093/gbe/evab062},
  url = {https://doi.org/10.1093/gbe/evab062},
  urldate = {2024-03-19},
  abstract = {Transposable elements (TEs) are major components of eukaryotic genomes and represent approximately 45\% of the human genome. TEs can be important sources of novelty in genomes and there is increasing evidence that TEs contribute to the evolution of gene regulation in mammals. Gene duplication is an evolutionary mechanism that also provides new genetic material and opportunities to acquire new functions. To investigate how duplicated genes are maintained in genomes, here, we explored the TE environment of duplicated and singleton genes. We found that singleton genes have more short-interspersed nuclear elements and DNA transposons in their vicinity than duplicated genes, whereas long-interspersed nuclear elements and long-terminal repeat retrotransposons have accumulated more near duplicated genes. We also discovered that this result is highly associated with the degree of essentiality of the genes with an unexpected accumulation of short-interspersed nuclear elements and DNA transposons around the more-essential genes. Our results underline the importance of taking into account the TE environment of genes to better understand how duplicated genes are maintained in genomes.},
  langid = {english}
}

@article{denoeudAnalyseGenomesRecherche,
  title = {Analyse des génomes à la recherche de répétitions en tandem polymorphes: outils d'épidémiologie bactérienne et locus hypermutables humains},
  author = {Denoeud, France},
  langid = {french}
}

@article{desponsTandemGeneArrays2011,
  title = {Tandem Gene Arrays, Plastic Chromosomal Organizations},
  author = {Despons, Laurence and Uzunov, Zlatyo and Louis, Véronique Leh},
  date = {2011-08-01},
  journaltitle = {Comptes Rendus Biologies},
  shortjournal = {Comptes Rendus Biologies},
  issuetitle = {Ten Years of Genomic Exploration in Eukaryotes: Strategy and Progress of {Genolevures}},
  volume = {334},
  number = {8},
  pages = {639--646},
  issn = {1631-0691},
  doi = {10.1016/j.crvi.2011.05.012},
  url = {https://www.sciencedirect.com/science/article/pii/S1631069111001454},
  urldate = {2024-04-09},
  abstract = {This short article presents an overview of tandem gene arrays (TGAs) in hemiascomycete yeasts. In silico and in vivo analyses are combined to address structural, functional and evolutionary aspects of these particular chromosomal structures. Genomic instability of TGAs is discussed. We conclude that TGAs are generally dynamic regions of the genome in that they are the seats of chromosomal rearrangement events. In addition, they are often breeding grounds of new genes for a rapid adaptation of cells to demands of the environment. Résumé Ce court article présente une vue d’ensemble des tandems de gènes chez les levures hémiascomycètes. Des analyses in silico et in vivo ont été combinées pour aborder les aspects structuraux, fonctionnels et évolutifs de ces structures chromosomiques particulières. L’instabilité génomique des tandems de gènes est discutée. Nous concluons que les tandems de gènes sont généralement des régions dynamiques du génome car ils sont le siège d’événements de réarrangements chromosomiques. De surcroît, ils sont souvent des zones de reproduction de nouveaux gènes pour une adaptation rapide des cellules aux demandes de l’environnement.},
  keywords = {Chromosomal rearrangements,Duplication de gènes en tandem,Evolution,Évolution,Levure,Réarrangements chromosomiques,Tandem gene duplication,Yeast}
}

@article{ditommasoNextflowEnablesReproducible2017,
  ids = {ditommasoNextflowEnablesReproducible2017a},
  title = {Nextflow Enables Reproducible Computational Workflows},
  author = {Di Tommaso, Paolo and Chatzou, Maria and Floden, Evan W. and Barja, Pablo Prieto and Palumbo, Emilio and Notredame, Cedric},
  date = {2017-04},
  journaltitle = {Nature Biotechnology},
  shortjournal = {Nat Biotechnol},
  volume = {35},
  number = {4},
  pages = {316--319},
  publisher = {Nature Publishing Group},
  issn = {1087-0156},
  doi = {10.1038/nbt.3820},
  url = {https://www.nature.com/articles/nbt.3820},
  urldate = {2024-03-27},
  langid = {english},
  keywords = {Computational biology and bioinformatics,Data publication and archiving}
}

@article{djaffardjyDevelopingReusingBioinformatics2023,
  title = {Developing and Reusing Bioinformatics Data Analysis Pipelines Using Scientific Workflow Systems},
  author = {Djaffardjy, Marine and Marchment, George and Sebe, Clémence and Blanchet, Raphael and Belhajjame, Khalid and Gaignard, Alban and Lemoine, Frédéric and Cohen-Boulakia, Sarah},
  date = {2023},
  journaltitle = {Computational and Structural Biotechnology Journal},
  volume = {21},
  eprint = {36968012},
  eprinttype = {pmid},
  pages = {2075--2085},
  publisher = {{Research Network of Computational and Structural Biotechnology}},
  issn = {2001-0370},
  doi = {10.1016/j.csbj.2023.03.003},
  url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10030817/},
  urldate = {2024-03-26},
  abstract = {Data analysis pipelines are now established as an effective means for specifying and executing bioinformatics data analysis and experiments. While scripting languages, particularly Python, R and notebooks, are popular and sufficient for developing small-scale ...},
  langid = {english}
}

@article{duarteExpressionPatternShifts2006,
  title = {Expression Pattern Shifts Following Duplication Indicative of Subfunctionalization and Neofunctionalization in Regulatory Genes of {Arabidopsis}},
  author = {Duarte, Jill M. and Cui, Liying and Wall, P. Kerr and Zhang, Qing and Zhang, Xiaohong and Leebens-Mack, Jim and Ma, Hong and Altman, Naomi and {dePamphilis}, Claude W.},
  date = {2006-02-01},
  journaltitle = {Molecular Biology and Evolution},
  shortjournal = {Molecular Biology and Evolution},
  volume = {23},
  number = {2},
  pages = {469--478},
  issn = {0737-4038},
  doi = {10.1093/molbev/msj051},
  url = {https://doi.org/10.1093/molbev/msj051},
  urldate = {2024-04-14},
  abstract = {Gene duplication plays an important role in the evolution of diversity and novel function and is especially prevalent in the nuclear genomes of flowering plants. Duplicate genes may be maintained through subfunctionalization and neofunctionalization at the level of expression or coding sequence. In order to test the hypothesis that duplicated regulatory genes will be differentially expressed in a specific manner indicative of regulatory subfunctionalization and/or neofunctionalization, we examined expression pattern shifts in duplicated regulatory genes in Arabidopsis. A two-way analysis of variance was performed on expression data for 280 phylogenetically identified paralogous pairs. Expression data were extracted from global expression profiles for wild-type root, stem, leaf, developing inflorescence, nearly mature flower buds, and seedpod. Gene, organ, and gene by organ interaction (G × O) effects were examined. Results indicate that 85\% of the paralogous pairs exhibited a significant G × O effect indicative of regulatory subfunctionalization and/or neofunctionalization. A significant G × O effect was associated with complementary expression patterns in 45\% of pairwise comparisons. No association was detected between a G × O effect and a relaxed evolutionary constraint as detected by the ratio of nonsynonymous to synonymous substitutions. Ancestral gene expression patterns inferred across a Type II MADS-box gene phylogeny suggest several cases of regulatory neofunctionalization and organ-specific nonfunctionalization. Complete linkage clustering of gene expression levels across organs suggests that regulatory modules for each organ are independent or ancestral genes had limited expression. We propose a new classification, regulatory hypofunctionalization, for an overall decrease in expression level in one member of a paralogous pair while still having a significant G × O effect. 
We conclude that expression divergence specifically indicative of subfunctionalization and/or neofunctionalization contributes to the maintenance of most if not all duplicated regulatory genes in Arabidopsis and hypothesize that this results in increasing expression diversity or specificity of regulatory genes after each round of duplication.}
}

@article{emmsOrthoFinderPhylogeneticOrthology2019,
  title = {{{OrthoFinder}}: Phylogenetic Orthology Inference for Comparative Genomics},
  shorttitle = {{{OrthoFinder}}},
  author = {Emms, David M. and Kelly, Steven},
  date = {2019-11-14},
  journaltitle = {Genome Biology},
  shortjournal = {Genome Biology},
  volume = {20},
  number = {1},
  pages = {238},
  issn = {1474-760X},
  doi = {10.1186/s13059-019-1832-y},
  url = {https://doi.org/10.1186/s13059-019-1832-y},
  urldate = {2024-03-31},
  abstract = {Here, we present a major advance of the OrthoFinder method. This extends OrthoFinder’s high accuracy orthogroup inference to provide phylogenetic inference of orthologs, rooted gene trees, gene duplication events, the rooted species tree, and comparative genomics statistics. Each output is benchmarked on appropriate real or simulated datasets, and where comparable methods exist, OrthoFinder is equivalent to or outperforms these methods. Furthermore, OrthoFinder is the most accurate ortholog inference method on the Quest for Orthologs benchmark test. Finally, OrthoFinder’s comprehensive phylogenetic analysis is achieved with equivalent speed and scalability to the fastest, score-based heuristic methods. OrthoFinder is available at https://github.com/davidemms/OrthoFinder.},
  keywords = {Comparative genomics,Gene duplication,Gene tree inference,Ortholog inference}
}

@article{emmsOrthoFinderSolvingFundamental2015,
  title = {{{OrthoFinder}}: Solving Fundamental Biases in Whole Genome Comparisons Dramatically Improves Orthogroup Inference Accuracy},
  shorttitle = {{{OrthoFinder}}},
  author = {Emms, David M. and Kelly, Steven},
  date = {2015-08-06},
  journaltitle = {Genome Biology},
  shortjournal = {Genome Biology},
  volume = {16},
  number = {1},
  pages = {157},
  issn = {1474-760X},
  doi = {10.1186/s13059-015-0721-2},
  url = {https://doi.org/10.1186/s13059-015-0721-2},
  urldate = {2024-03-30},
  abstract = {Identifying homology relationships between sequences is fundamental to biological research. Here we provide a novel orthogroup inference algorithm called OrthoFinder that solves a previously undetected gene length bias in orthogroup inference, resulting in significant improvements in accuracy. Using real benchmark datasets we demonstrate that OrthoFinder is more accurate than other orthogroup inference methods by between 8 \% and 33 \%. Furthermore, we demonstrate the utility of OrthoFinder by providing a complete classification of transcription factor gene families in plants revealing 6.9 million previously unobserved relationships.},
  keywords = {Blast Score,Gene Length,Phylogenetic Distance,Sequence Similarity Score,Transcription Factor Gene Family}
}

@video{evry-senartsciencesetinnovationCareneRizzonUEVE2014,
  entrysubtype = {video},
  title = {Carène {{Rizzon}} ({{UEVE}}) - {{Etude}} de l’évolution des gènes dupliqués},
  editor = {{Evry-Sénart Sciences et Innovation}},
  editortype = {director},
  date = {2014},
  url = {https://www.youtube.com/watch?v=ubiOE7w3374},
  urldate = {2024-04-10},
  abstract = {Colloque ESI 2014 "Evry Bio \& Evry STIC" organisé par Evry Sciences et Innovation le 30 avril 2014 à Evry. Intervention de Carène Rizzon de l'Université d’Évry: "Étude de l’évolution des gènes dupliqués chez Arabidopsis thaliana via les réseaux biologiques".},
  langid = {french}
}

@article{gautRecombinationUnderappreciatedFactor2007,
  author = {Gaut, Brandon S. and Wright, Stephen I. and Rizzon, Carène and Dvorak, Jan and Anderson, Lorinda K.},
  title = {Recombination: An Underappreciated Factor in the Evolution of Plant Genomes},
  shorttitle = {Recombination},
  journaltitle = {Nature Reviews Genetics},
  shortjournal = {Nat Rev Genet},
  date = {2007-01},
  volume = {8},
  number = {1},
  pages = {77--84},
  issn = {1471-0056, 1471-0064},
  doi = {10.1038/nrg1970},
  url = {https://www.nature.com/articles/nrg1970},
  urldate = {2023-09-15},
  langid = {english}
}

@article{gibbonsEvaluationBLASTbasedEdgeweighting2015,
  author = {Gibbons, Theodore R. and Mount, Stephen M. and Cooper, Endymion D. and Delwiche, Charles F.},
  title = {Evaluation of {{BLAST-based}} Edge-Weighting Metrics Used for Homology Inference with the {{Markov Clustering}} Algorithm},
  journaltitle = {BMC Bioinformatics},
  shortjournal = {BMC Bioinformatics},
  date = {2015-12},
  volume = {16},
  number = {1},
  pages = {218},
  issn = {1471-2105},
  doi = {10.1186/s12859-015-0625-x},
  url = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0625-x},
  urldate = {2024-03-19},
  langid = {english},
  abstract = {Background: Clustering protein sequences according to inferred homology is a fundamental step in the analysis of many large data sets. Since the publication of the Markov Clustering (MCL) algorithm in 2002, it has been the centerpiece of several popular applications. Each of these approaches generates an undirected graph that represents sequences as nodes connected to each other by edges weighted with a BLAST-based metric. MCL is then used to infer clusters of homologous proteins by analyzing these graphs. The various approaches differ only by how they weight the edges, yet there has been very little direct examination of the relative performance of alternative edge-weighting metrics. This study compares the performance of four BLAST-based edge-weighting metrics: the bit score, bit score ratio (BSR), bit score over anchored length (BAL), and negative common log of the expectation value (NLE). Performance is tested using the Extended CEGMA KOGs (ECK) database, which we introduce here. Results: All metrics performed similarly when analyzing full-length sequences, but dramatic differences emerged as progressively larger fractions of the test sequences were split into fragments. The BSR and BAL successfully rescued subsets of clusters by strengthening certain types of alignments between fragmented sequences, but also shifted the largest correct scores down near the range of scores generated from spurious alignments. This penalty outweighed the benefits in most test cases, and was greatly exacerbated by increasing the MCL inflation parameter, making these metrics less robust than the bit score or the more popular NLE. Notably, the bit score performed as well or better than the other three metrics in all scenarios. Conclusions: The results provide a strong case for use of the bit score, which appears to offer equivalent or superior performance to the more popular NLE. 
The insight that MCL-based clustering methods can be improved using a more tractable edge-weighting metric will greatly simplify future implementations. We demonstrate this with our own minimalist Python implementation: Porthos, which uses only standard libraries and can process a graph with 25 m + edges connecting the 60 k + KOG sequences in half a minute using less than half a gigabyte of memory.}
}

@article{goecksGalaxyComprehensiveApproach2010,
  author = {Goecks, Jeremy and Nekrutenko, Anton and Taylor, James and {Galaxy Team}},
  title = {Galaxy: A Comprehensive Approach for Supporting Accessible, Reproducible, and Transparent Computational Research in the Life Sciences},
  shorttitle = {Galaxy},
  journaltitle = {Genome Biology},
  shortjournal = {Genome Biol},
  date = {2010},
  volume = {11},
  number = {8},
  pages = {R86},
  issn = {1474-760X},
  doi = {10.1186/gb-2010-11-8-r86},
  eprint = {20738864},
  eprinttype = {pmid},
  pmcid = {PMC2945788},
  langid = {english},
  keywords = {Algorithms,Animals,Computational Biology,Databases Nucleic Acid,Genomics,Humans,Internet},
  abstract = {Increased reliance on computational approaches in the life sciences has revealed grave concerns about how accessible and reproducible computation-reliant results truly are. Galaxy http://usegalaxy.org, an open web-based platform for genomic research, addresses these problems. Galaxy automatically tracks and manages data provenance and provides support for capturing the context and intent of computational methods. Galaxy Pages are interactive, web-based documents that provide users with a medium to communicate a complete computational analysis.}
}

@article{golovninaMolecularPhylogenyGenus2007a,
  title = {Molecular Phylogeny of the Genus {{Triticum}} {{L}}.},
  author = {Golovnina, K. A. and Glushkov, S. A. and Blinov, A. G. and Mayorov, V. I. and Adkison, L. R. and Goncharov, N. P.},
  date = {2007-04-01},
  journaltitle = {Plant Systematics and Evolution},
  shortjournal = {Plant Syst. Evol.},
  volume = {264},
  number = {3},
  pages = {195--216},
  issn = {1615-6110},
  doi = {10.1007/s00606-006-0478-x},
  url = {https://doi.org/10.1007/s00606-006-0478-x},
  urldate = {2024-03-27},
  abstract = {The genus Triticum L. includes the major cereal crop, common or bread wheat (hexaploid Triticum aestivum L.), and other important cultivated species. Here, we conducted a phylogenetic analysis of all known wheat species and the closely related Aegilops species. This analysis was based on chloroplast matK gene comparison along with trnL intron sequences of some species. Polyploid wheat species are successfully divided only into two groups – Emmer (sections Dicoccoides and Triticum) and Timopheevii (section Timopheevii). Results reveal strictly maternal plastid inheritance of synthetic wheat amphiploids included in the study. A concordance of chloroplast origin with the definite nuclear genomes of polyploid species that were inherited at the last hybridization events was found. Our analysis suggests that there were two ancestral representatives of Aegilops speltoides Tausch that participated in the speciation of polyploid wheats with B and G genome in their genome composition. However, G genome species are younger in evolution than ones with B genome. B genome-specific PCR primers were developed for amplification of Acc-1 gene.},
  langid = {english},
  keywords = {Aegilops,molecular evolution,plasmon and B genome inheritance,Triticum,wheat}
}

@article{grahamTandemGenesClustered1995,
  author = {Graham, Geoffrey J.},
  title = {Tandem Genes and Clustered Genes},
  journaltitle = {Journal of Theoretical Biology},
  shortjournal = {Journal of Theoretical Biology},
  date = {1995-07-07},
  volume = {175},
  number = {1},
  pages = {71--87},
  issn = {0022-5193},
  doi = {10.1006/jtbi.1995.0122},
  url = {https://www.sciencedirect.com/science/article/pii/S0022519385701221},
  urldate = {2024-04-09},
  abstract = {Two patterns of gene repetition are described: tandem arraying and clustering. Tandemly arrayed genes reside within segments of DNA that are repeated head-to-tail a number of times. Clustered genes are linked but irregularly spaced, are often mutually inverted in an unpredictable pattern and are connected by non-conserved DNA. Tandem arrays are homogenized by both unequal recombination and gene conversion, are necessary for the maintenance of large gene families, can expand and contract rapidly in response to changing demand, can keep functionally related genes equal in number, and do not engender increased genetic complexity. Gene clusters are homogenized by conversion only, seldom if ever contain more than 50 members, are stable in number, and often engender increased genetic complexity. The interrelationships among these properties are discussed. Tandem gene arrays can evolve into gene clusters. It is suggested that this occurs when some change in the array inhibits unequal recombination but not gene conversion. The most common such change is inversion of part of the tandem array with respect to the rest; however, arrays can evolve into clusters without inversion. Clustered genes are sometimes re-amplified into new tandem arrays. Clustered genes are probably more durable than tandemly arrayed genes during periods of relaxed selection, and in the case of fish antifreeze protein genes, seem to behave as a genetic memory.}
}

@article{hanadaImportanceLineagespecificExpansion2008,
  author = {Hanada, Kousuke and Zou, Cheng and Lehti-Shiu, Melissa D. and Shinozaki, Kazuo and Shiu, Shin-Han},
  title = {Importance of Lineage-Specific Expansion of Plant Tandem Duplicates in the Adaptive Response to Environmental Stimuli},
  journaltitle = {Plant Physiology},
  shortjournal = {Plant Physiol},
  date = {2008-10},
  volume = {148},
  number = {2},
  pages = {993--1003},
  issn = {0032-0889},
  doi = {10.1104/pp.108.122457},
  eprint = {18715958},
  eprinttype = {pmid},
  pmcid = {PMC2556807},
  langid = {english},
  keywords = {Adaptation Biological,Arabidopsis,Evolution Molecular,Gene Duplication,Genes Duplicate,Genes Plant,Genome Plant,Multigene Family,Oligonucleotide Array Sequence Analysis,Phylogeny},
  abstract = {Plants have substantially higher gene duplication rates compared with most other eukaryotes. These plant gene duplicates are mostly derived from whole genome and/or tandem duplications. Earlier studies have shown that a large number of duplicate genes are retained over a long evolutionary time, and there is a clear functional bias in retention. However, the influence of duplication mechanism, particularly tandem duplication, on duplicate retention has not been thoroughly investigated. We have defined orthologous groups (OGs) between Arabidopsis (Arabidopsis thaliana) and three other land plants to examine the functional bias of retained duplicate genes during vascular plant evolution. Based on analysis of Gene Ontology categories, it is clear that genes in OGs that expanded via tandem duplication tend to be involved in responses to environmental stimuli, while those that expanded via nontandem mechanisms tend to have intracellular regulatory roles. Using Arabidopsis stress expression data, we further demonstrated that tandem duplicates in expanded OGs are significantly enriched in genes that are up-regulated by biotic stress conditions. In addition, tandem duplication of genes in an OG tends to be highly asymmetric. That is, expansion of OGs with tandem genes in one organismal lineage tends to be coupled with losses in the other. This is consistent with the notion that these tandem genes have experienced lineage-specific selection. In contrast, OGs with genes duplicated via nontandem mechanisms tend to experience convergent expansion, in which similar numbers of genes are gained in parallel. Our study demonstrates that the expansion of gene families and the retention of duplicates in plants exhibit substantial functional biases that are strongly influenced by the mechanism of duplication. 
In particular, genes involved in stress responses have an elevated probability of retention in a single-lineage fashion following tandem duplication, suggesting that these tandem duplicates are likely important for adaptive evolution to rapidly changing environments.}
}

@comment{Review note for HomeCromwell: the author and the cleaned-up title were added by hand from the project's public documentation, not from the export; confirm them, and add a date if the site states one.}
@online{HomeCromwell,
  title = {{{Cromwell}} Documentation},
  author = {{Broad Institute}},
  url = {https://cromwell.readthedocs.io/en/stable/},
  urldate = {2024-03-27}
}

@comment{Review note for HttpsMicansOrg: the export only had the URL as its title. The bibliographic details below were reconstructed from the published citation of the linked PDF; confirm them against the publisher record before submission.}
@incollection{HttpsMicansOrg,
  title = {Using {{MCL}} to Extract Clusters from Networks},
  author = {van Dongen, Stijn and Abreu-Goodger, Cei},
  date = {2012},
  booktitle = {Bacterial {{Molecular Networks}}: {{Methods}} and {{Protocols}}},
  series = {Methods in {{Molecular Biology}}},
  volume = {804},
  pages = {281--295},
  eprint = {22144159},
  eprinttype = {pmid},
  url = {https://micans.org/mcl/lit/mimb.pdf},
  urldate = {2024-04-11},
  langid = {english}
}

@report{jasminStudyTandemlyArrayed2016,
  type = {Internship Report},
  title = {Study of Tandemly Arrayed Genes Expression for {{Arabidopsis}} thaliana},
  author = {Jasmin, Fabien},
  date = {2016-06},
  institution = {Laboratoire de Mathématiques et Modélisation d'Évry},
  abstract = {Tandemly arrayed genes, also called TAGs, are duplicated genes which come from tandem arrayed duplication. They can be separated or not by few genes called spacers. Although duplicated genes are commonly studied, TAGs features remain little known. In this study, I performed a statistical analysis of Arabidopsis thaliana TAGs using genomic and transcriptomic data of high quality providing from TAIR database and CATdb. After merging the different data and assessing it, I observed the distribution of the different size of TAG and the behaviour of TAGs depending on the number of spacers that I made vary from 0 to 10 in my survey. I equally defined different list of gene pairs to easily compare TAGs to other type of genes. In all 5 lists have been defined during my investigation. The defined lists are random genes pairs list, duplicated genes pairs list, successive genes pairs list, local genes pairs list and TAGs pairs list. After creating all lists previously defined, I made gene pairs lists comparisons between TAGs pair list and the other type of gene pairs list according to different features such as the effect of abiotic or biotic stress conditions, the genes orientation, or the correlation of the expression profiles.},
  langid = {english}
}

@video{javiernovoDuplicationGenes2015,
  entrysubtype = {video},
  title = {Duplication of Genes},
  editor = {Novo, Javier},
  editortype = {director},
  date = {2015},
  url = {https://www.youtube.com/watch?v=CW1tojSWPxA},
  urldate = {2024-03-27},
  abstract = {Video 4 of the third Unit of the MOOC on Genome Evolution. Paralogs and orthologs. Neo-functionalization and subfunctionalization.}
}

@article{johnsonHiddenMarkovModel2010,
  title = {Hidden {{Markov}} Model Speed Heuristic and Iterative {{HMM}} Search Procedure},
  author = {Johnson, L. Steven and Eddy, Sean R. and Portugaly, Elon},
  date = {2010-08-18},
  journaltitle = {BMC Bioinformatics},
  shortjournal = {BMC Bioinformatics},
  volume = {11},
  number = {1},
  pages = {431},
  issn = {1471-2105},
  doi = {10.1186/1471-2105-11-431},
  url = {https://doi.org/10.1186/1471-2105-11-431},
  urldate = {2024-04-09},
  abstract = {Profile hidden Markov models (profile-HMMs) are sensitive tools for remote protein homology detection, but the main scoring algorithms, Viterbi or Forward, require considerable time to search large sequence databases.},
  keywords = {Entropy Weighting,Iterative Search,Profile Hidden Markov Model,Search Time,Test Database}
}

@article{kosterSnakemakeScalableBioinformatics2012,
  title = {{{Snakemake}}---a Scalable Bioinformatics Workflow Engine},
  author = {Köster, Johannes and Rahmann, Sven},
  date = {2012-10-01},
  journaltitle = {Bioinformatics},
  shortjournal = {Bioinformatics},
  volume = {28},
  number = {19},
  eprint = {22908215},
  eprinttype = {pmid},
  pages = {2520--2522},
  issn = {1367-4811},
  doi = {10.1093/bioinformatics/bts480},
  abstract = {SUMMARY: Snakemake is a workflow engine that provides a readable Python-based workflow definition language and a powerful execution environment that scales from single-core workstations to compute clusters without modifying the workflow. It is the first system to support the use of automatically inferred multiple named wildcards (or variables) in input and output filenames. AVAILABILITY: http://snakemake.googlecode.com. CONTACT: johannes.koester@uni-due.de.},
  langid = {english},
  keywords = {Computational Biology,Electronic Data Processing,Programming Languages,Software,Workflow}
}

@online{kursNextflowWorkbenchReproducibleReusable2016,
  title = {{{NextflowWorkbench}}: {{Reproducible}} and {{Reusable Workflows}} for {{Beginners}} and {{Experts}}},
  shorttitle = {{{NextflowWorkbench}}},
  author = {Kurs, Jason P. and Simi, Manuele and Campagne, Fabien},
  date = {2016-03-28},
  eprint = {041236},
  eprinttype = {bioRxiv},
  eprintclass = {New Results},
  doi = {10.1101/041236},
  url = {https://www.biorxiv.org/content/10.1101/041236v2},
  urldate = {2024-03-26},
  abstract = {Computational workflows and pipelines are often created to automate series of processing steps. For instance, workflows enable one to standardize analysis for large projects or core facilities, but are also useful for individual biologists who need to perform repetitive data processing. Some workflow systems, designed for beginners, offer a graphical user interface and have been very popular with biologists. In practice, these tools are infrequently used by more experienced bioinformaticians, who may require more flexibility or performance than afforded by the user interfaces, and seem to prefer developing workflows with scripting or command line tools. Here, we present a workflow system, the NextflowWorkbench (NW), which was designed for both beginners and experts, and blends the distinction between user interface and scripting language. This system extends and reuses the popular Nextflow workflow description language and shares its advantages. In contrast to Nextflow, NextflowWorkbench offers an integrated development environment that helps complete beginners get started with workflow development. Auto-completion helps beginners who do not know the syntax of the Nextflow language. Reusable processes provide modular workflows. Programmers will benefit from unique interactive features that help users work more productively with docker containers. We illustrate this tool with a workflow to estimate RNA-Seq counts using Kallisto. We found that beginners can be taught how to assemble this workflow in a two hours training session. NW workflows are portable and can execute on laptop/desktop computers with docker, on a lab cluster, or in the cloud to facilitate training. NextflowWorkbench is open-source and available at http://workflow.campagnelab.org.},
  langid = {english},
  pubstate = {preprint}
}

@comment{Review note for lajoieEvolutionTandemlyArrayed2007: series and volume were added by hand for the LNCS proceedings; confirm the volume number against the publisher record.}
@inproceedings{lajoieEvolutionTandemlyArrayed2007,
  title = {Evolution of {{Tandemly Arrayed Genes}} in {{Multiple Species}}},
  booktitle = {Comparative {{Genomics}}},
  author = {Lajoie, Mathieu and Bertrand, Denis and El-Mabrouk, Nadia},
  editor = {Tesler, Glenn and Durand, Dannie},
  date = {2007},
  series = {Lecture {{Notes}} in {{Computer Science}}},
  volume = {4751},
  pages = {96--109},
  publisher = {Springer},
  location = {Berlin, Heidelberg},
  doi = {10.1007/978-3-540-74960-8_8},
  abstract = {Tandemly arrayed genes (TAG) constitute a large fraction of most genomes and play important biological roles. They evolve through unequal recombination, which places duplicated genes next to the original ones (tandem duplications). Many algorithms have been proposed to infer a tandem duplication history for a TAG cluster in a single species. However, the presence of different transcriptional orientations in most TAG clusters highlight the fact that processes such as inversions also contribute to their evolution. This makes those algorithms unsuitable in many cases. To circumvent this limitation, we proposed in a previous work an extended evolutionary model which includes inversions and presented a branch-and-bound algorithm allowing to infer a most parsimonious scenario of evolution for a given TAG cluster. Here, we generalize this model to multiple species and present a general framework to infer ancestral gene orders that minimize the number of inversions in the whole evolutionary history. An application on a pair of human-rat TAG clusters is presented.},
  isbn = {978-3-540-74960-8},
  langid = {english},
  keywords = {Ancestral Genome,Gene Order,Gene Tree,Inversion Event,Tandem Duplication}
}

@thesis{lallemandEvolutionGenesDupliques2022,
  author = {Lallemand, Tanguy},
  title = {Évolution des gènes dupliqués chez le pommier : Identification et caractérisation de la dominance du sous-génome dans le génome de la pomme},
  shorttitle = {Évolution des gènes dupliqués chez le pommier},
  type = {phdthesis},
  institution = {Université d'Angers},
  date = {2022-11-15},
  url = {https://theses.hal.science/tel-04081238},
  urldate = {2024-03-30},
  langid = {french},
  abstract = {Un événement de duplication du génome entier (WGD) s’est produit chez l’ancêtre du pommier (Malus domestica). Les événements de WGD ont un impact profond sur les génomes et sont connus pour être des moteurs majeurs de l’évolution. Cette WGD est relativement récente (27 Millions d’années) et fait du pommier un organisme de choix pour étudier le devenir des gènes dupliqués par autopolyploïdisation. Dans cette étude, nous avons examiné l’évolution des fragments chromosomiques dupliqués, sous le prisme d’analyses génomiques, transcriptomiques et épigénétiques. Nous avons identifié 16 779 paires de gènes dupliqués dans le génome du pommier, confirmant le caractère récent de la WGD. Les gènes au sein des paires ohnologues ne semblent pas soumis à des pressions de sélection différentes. Nous avons montré plusieurs déséquilibres dans la proportion de QTLs cartographiés entre fragments chromosomiques dupliqués, et caractérisé divers biais dans le fractionnement du génome, le niveau d’expression des gènes, la couverture en éléments transposables et la méthylation de l’ADN. Nos résultats suggèrent une dominance sous-chromosomique dans cet autopolyploïde, un phénomène proche de la sous dominance génomique décrite jusqu’à présent uniquement chez les allopolyploïdes.}
}

@article{lallemandInsightsEvolutionOhnologous2023,
  title = {Insights into the {{Evolution}} of {{Ohnologous Sequences}} and {{Their Epigenetic Marks Post-WGD}} in {{Malus}} {{domestica}}},
  author = {Lallemand, Tanguy and Leduc, Martin and Desmazières, Adèle and Aubourg, Sébastien and Rizzon, Carène and Landès, Claudine and Celton, Jean-Marc},
  date = {2023-10},
  journaltitle = {Genome Biology and Evolution},
  volume = {15},
  number = {10},
  eprint = {37847638},
  eprinttype = {pmid},
  pages = {evad178},
  issn = {1759-6653},
  publisher = {Oxford University Press},
  doi = {10.1093/gbe/evad178},
  url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10601995/},
  urldate = {2024-03-30},
  abstract = {A Whole Genome Duplication (WGD) event occurred several Ma in a Rosaceae ancestor, giving rise to the Maloideae subfamily which includes today many pome fruits such as pear (Pyrus communis) and apple (Malus domestica). This complete and well-conserved ...},
  langid = {english}
}

@article{lallemandOverviewDuplicatedGene2020,
  author = {Lallemand, Tanguy and Leduc, Martin and Landès, Claudine and Rizzon, Carène and Lerat, Emmanuelle},
  title = {An {{Overview}} of {{Duplicated Gene Detection Methods}}: {{Why}} the {{Duplication Mechanism Has}} to {{Be Accounted}} for in {{Their Choice}}},
  shorttitle = {An {{Overview}} of {{Duplicated Gene Detection Methods}}},
  journaltitle = {Genes},
  shortjournal = {Genes},
  date = {2020-09-04},
  volume = {11},
  number = {9},
  pages = {1046},
  issn = {2073-4425},
  doi = {10.3390/genes11091046},
  url = {https://www.mdpi.com/2073-4425/11/9/1046},
  urldate = {2024-03-19},
  langid = {english},
  abstract = {Gene duplication is an important evolutionary mechanism allowing to provide new genetic material and thus opportunities to acquire new gene functions for an organism, with major implications such as speciation events. Various processes are known to allow a gene to be duplicated and different models explain how duplicated genes can be maintained in genomes. Due to their particular importance, the identification of duplicated genes is essential when studying genome evolution but it can still be a challenge due to the various fates duplicated genes can encounter. In this review, we first describe the evolutionary processes allowing the formation of duplicated genes but also describe the various bioinformatic approaches that can be used to identify them in genome sequences. Indeed, these bioinformatic approaches differ according to the underlying duplication mechanism. Hence, understanding the specificity of the duplicated genes of interest is a great asset for tool selection and should be taken into account when exploring a biological question.}
}

@comment{Review note for landes-devauchelleArtResumerPour: the export had this as an article with no journal or date. The URL file name points to a 2011 HDR manuscript; the year comes from that file name and the institution is presumed from the hosting laboratory site. Confirm both against the document.}
@thesis{landes-devauchelleArtResumerPour,
  type = {Habilitation à diriger des recherches},
  title = {De l’art de résumer pour tenter de comprendre en génomique évolutive},
  author = {Landès-Devauchelle, Claudine},
  date = {2011},
  institution = {Université d'Évry-Val-d'Essonne},
  url = {http://www.math-evry.cnrs.fr/_media/publications/devauchelle_hdr_2011.pdf},
  langid = {french}
}

@article{lannesDoesPresenceTransposable2019,
  title = {Does the {{Presence}} of {{Transposable Elements Impact}} the {{Epigenetic Environment}} of {{Human Duplicated Genes}}?},
  author = {Lannes, Romain and Rizzon, Carène and Lerat, Emmanuelle},
  date = {2019-03-26},
  journaltitle = {Genes},
  shortjournal = {Genes},
  volume = {10},
  number = {3},
  eprint = {30917603},
  eprinttype = {pmid},
  pages = {249},
  issn = {2073-4425},
  doi = {10.3390/genes10030249},
  url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6470583/},
  urldate = {2023-09-15},
  abstract = {Epigenetic modifications have an important role to explain part of the intra- and inter-species variation in gene expression. They also have a role in the control of transposable elements (TEs) whose activity may have a significant impact on genome evolution by promoting various mutations, which are expected to be mostly deleterious. A change in the local epigenetic landscape associated with the presence of TEs is expected to affect the expression of neighboring genes since these modifications occurring at TE sequences can spread to neighboring sequences. In this work, we have studied how the epigenetic modifications of genes are conserved and what the role of TEs is in this conservation. For that, we have compared the conservation of the epigenome associated with human duplicated genes and the differential presence of TEs near these genes. Our results show higher epigenome conservation of duplicated genes from the same family when they share similar TE environment, suggesting a role for the differential presence of TEs in the evolutionary divergence of duplicates through variation in the epigenetic landscape.},
  langid = {english},
  pmcid = {PMC6470583}
}

@comment{Review note for le-hoangEtudeTranscriptomiqueGenes: this report entry carries only author, title and langid. Report type, institution and date are absent and should be filled in from the document itself.}
@report{le-hoangEtudeTranscriptomiqueGenes,
  author = {Lê-Hoang, Julie},
  title = {Etude transcriptomique des gènes dupliqués en tandem (TAG) chez Arabidopsis thaliana},
  langid = {french}
}

@comment{Review note for leducEtudeEvolutionGenes: this thesis entry carries only author, title and langid. Thesis type, institution and date are absent and should be filled in from the document itself.}
@thesis{leducEtudeEvolutionGenes,
  author = {Leduc, Martin},
  title = {Étude de l’évolution des gènes dupliqués chez les Rosaceae},
  langid = {french}
}

@article{leitchGenomicPlasticityDiversity2008,
  title = {Genomic Plasticity and the Diversity of Polyploid Plants},
  author = {Leitch, A. R. and Leitch, I. J.},
  date = {2008-04-25},
  journaltitle = {Science},
  shortjournal = {Science},
  volume = {320},
  number = {5875},
  eprint = {18436776},
  eprinttype = {pmid},
  pages = {481--483},
  issn = {1095-9203},
  doi = {10.1126/science.1153585},
  abstract = {Polyploidy, a change whereby the entire chromosome set is multiplied, arises through mitotic or meiotic misdivisions and frequently involves unreduced gametes and interspecific hybridization. The success of newly formed angiosperm polyploids is partly attributable to their highly plastic genome structure, as manifested by tolerance to changing chromosome numbers (aneuploidy and polyploidy), genome size, (retro)transposable element mobility, insertions, deletions, and epigenome restructuring. The ability to withstand large-scale changes, frequently within one or a few generations, is associated with a restructuring of the transcriptome, metabolome, and proteome and can result in an altered phenotype and ecology. Thus, polyploid-induced changes can generate individuals that are able to exploit new niches or to outcompete progenitor species. This process has been a major driving force behind the divergence of the angiosperms and their biodiversity.},
  langid = {english},
  keywords = {Biodiversity,Biological Evolution,Chromosomes Plant,Genetic Speciation,Genetic Variation,Genome Plant,Hybridization Genetic,Magnoliopsida,Nondisjunction Genetic,Plant Proteins,Polyploidy,Proteome,Transcription Genetic}
}

@article{longGeneDuplicationEvolution2001,
  title = {Gene {{Duplication}} and {{Evolution}}},
  author = {Long, Manyuan and Thornton, Kevin},
  date = {2001-08-31},
  journaltitle = {Science},
  volume = {293},
  number = {5535},
  pages = {1551},
  publisher = {American Association for the Advancement of Science},
  doi = {10.1126/science.293.5535.1551a},
  url = {https://www.science.org/doi/abs/10.1126/science.293.5535.1551a},
  urldate = {2024-03-28}
}

@article{lynchEvolutionaryFateConsequences2000,
  author = {Lynch, Michael and Conery, John S.},
  title = {The {{Evolutionary Fate}} and {{Consequences}} of {{Duplicate Genes}}},
  journaltitle = {Science},
  date = {2000-11-10},
  volume = {290},
  number = {5494},
  pages = {1151--1155},
  publisher = {American Association for the Advancement of Science},
  doi = {10.1126/science.290.5494.1151},
  url = {https://www.science.org/doi/abs/10.1126/science.290.5494.1151},
  urldate = {2024-03-28},
  abstract = {Gene duplication has generally been viewed as a necessary source of material for the origin of evolutionary novelties, but it is unclear how often gene duplicates arise and how frequently they evolve new functions. Observations from the genomic databases for several eukaryotic species suggest that duplicate genes arise at a very high rate, on average 0.01 per gene per million years. Most duplicated genes experience a brief period of relaxed selection early in their history, with a moderate fraction of them evolving in an effectively neutral manner during this period. However, the vast majority of gene duplicates are silenced within a few million years, with the few survivors subsequently experiencing strong purifying selection. Although duplicate genes may only rarely evolve new functions, the stochastic silencing of such genes may play a significant role in the passive origin of new species.}
}

@article{maoGenoDupPipelineTool2019,
  author = {Mao, Yafei},
  title = {{{GenoDup Pipeline}}: A Tool to Detect Genome Duplication Using the {{dS-based}} Method},
  shorttitle = {{{GenoDup Pipeline}}},
  journaltitle = {PeerJ},
  shortjournal = {PeerJ},
  date = {2019-01-23},
  volume = {7},
  pages = {e6303},
  issn = {2167-8359},
  doi = {10.7717/peerj.6303},
  eprint = {30697488},
  eprinttype = {pmid},
  pmcid = {PMC6347962},
  url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6347962/},
  urldate = {2024-03-21},
  abstract = {Understanding whole genome duplication (WGD), or polyploidy, is fundamental to investigating the origin and diversification of organisms in evolutionary biology. The wealth of genomic data generated by next generation sequencing (NGS) has resulted in an urgent need for handy and accurate tools to detect WGD. Here, I present a useful and user-friendly pipeline called GenoDup for inferring WGD using the dS-based method. I have successfully applied GenoDup to identify WGD in empirical data from both plants and animals. The GenoDup Pipeline provides a reliable and useful tool to infer WGD from NGS data.}
}

% Moix, Glover & Majidian 2023 (F1000Research 12:382, version 1): placing yeast WGDs using hierarchical orthologous groups.
@online{moixPhylogeneticPlacementWhole2023,
  author = {Moix, Samuel and Glover, Natasha and Majidian, Sina},
  title = {Phylogenetic Placement of Whole Genome Duplications in Yeasts through Quantitative Analysis of Hierarchical Orthologous Groups},
  date = {2023-04-12},
  eprinttype = {F1000Research},
  eprint = {12:382},
  number = {12:382},
  pubstate = {preprint},
  doi = {10.12688/f1000research.128656.1},
  url = {https://f1000research.com/articles/12-382},
  urldate = {2024-04-17},
  langid = {english},
  keywords = {comparative genomics,orthologous groups,whole genome duplications,yeast},
  abstract = {Background: Whole genome duplications (WGD) are genomic events leading to formation of polyploid organisms. Resulting duplicated genes play important roles in driving species evolution and diversification. After such events, the initial ploidy is usually restored, complicating their detection across evolution. With the advance of bioinformatics and the rising number of new well-assembled genomes, new detection methods are ongoingly being developed to overcome the weaknesses of different approaches. Results: Here we propose a novel method for detecting WGD in yeast lineages based on the quantitative and comparative analysis of hierarchical orthologous groups (HOGs) of duplicated genes for a given set of organisms. We reconstruct ancestral genomes to obtain evolutionary information for each phylogenetic branch. This reconstruction relies on the inference of HOGs from the selected species’ proteomes. To estimate WGD events, the number of HOGs of duplicated genes across all taxonomic ranges are adjusted according to the molecular clock hypothesis and by the average genome size. Branches with a significant increase in the adjusted number of duplicated gene families are kept as candidates for WGD placement. The developed method was tested on two real datasets and showed promising results in phylogenetic WGD placements on the yeast lineage.}
}

% Mölder et al. 2021 (F1000Research 10:33, version 2): reproducible, adaptable and transparent data analysis with Snakemake.
@online{molderSustainableDataAnalysis2021a,
  title = {Sustainable Data Analysis with {{Snakemake}}},
  author = {Mölder, Felix and Jablonski, Kim Philipp and Letcher, Brice and Hall, Michael B. and Tomkins-Tinch, Christopher H. and Sochat, Vanessa and Forster, Jan and Lee, Soohyun and Twardziok, Sven O. and Kanitz, Alexander and Wilm, Andreas and Holtgrewe, Manuel and Rahmann, Sven and Nahnsen, Sven and Köster, Johannes},
  date = {2021-04-19},
  number = {10:33},
  eprint = {10:33},
  eprinttype = {F1000Research},
  doi = {10.12688/f1000research.29032.2},
  url = {https://f1000research.com/articles/10-33},
  urldate = {2024-03-26},
  abstract = {Data analysis often entails a multitude of heterogeneous steps, from the application of various command line tools to the usage of scripting languages like R or Python for the generation of plots and tables. It is widely recognized that data analyses should ideally be conducted in a reproducible way. Reproducibility enables technical validation and regeneration of results on the original or even new data. However, reproducibility alone is by no means sufficient to deliver an analysis that is of lasting impact (i.e., sustainable) for the field, or even just one research group. We postulate that it is equally important to ensure adaptability and transparency. The former describes the ability to modify the analysis to answer extended or slightly different research questions. The latter describes the ability to understand the analysis in order to judge whether it is not only technically, but methodologically valid. Here, we analyze the properties needed for a data analysis to become reproducible, adaptable, and transparent. We show how the popular workflow management system Snakemake can be used to guarantee this, and how it enables an ergonomic, combined, unified representation of all steps involved in data analysis, ranging from raw data processing, to quality control and fine-grained, interactive exploration and plotting of final results.},
  langid = {english},
  pubstate = {preprint},
  keywords = {adaptability,data analysis,reproducibility,scalability,sustainability,transparency,workflow management}
}

% Nozawa & Nei 2007 (PNAS): gains and losses of olfactory receptor genes across 12 Drosophila species.
@article{nozawaEvolutionaryDynamicsOlfactory2007,
  author = {Nozawa, Masafumi and Nei, Masatoshi},
  title = {Evolutionary Dynamics of Olfactory Receptor Genes in {{Drosophila}} Species},
  journaltitle = {Proceedings of the National Academy of Sciences},
  date = {2007-04-24},
  volume = {104},
  number = {17},
  pages = {7122--7127},
  publisher = {Proceedings of the National Academy of Sciences},
  doi = {10.1073/pnas.0702133104},
  url = {https://www.pnas.org/doi/full/10.1073/pnas.0702133104},
  urldate = {2024-04-02},
  abstract = {Olfactory receptor (OR) genes are of vital importance for animals to find food, identify mates, and avoid dangers. In mammals, the number of OR genes is large and varies extensively among different orders, whereas, in insects, the extent of interspecific variation appears to be small, although only a few species have been studied. To understand the evolutionary changes of OR genes, we identified all OR genes from 12 Drosophila species, of which the evolutionary time is roughly equivalent to that of eutherian mammals. The results showed that all species examined have similar numbers (≈60) of functional OR genes. Phylogenetic analysis indicated that the ancestral species also had similar numbers of genes, but there were frequent gains and losses of genes that occurred in each evolutionary lineage. It appears that tandem duplication and random inactivation of duplicate genes are the major factors of gene number change. However, chromosomal rearrangements have contributed to the establishment of genome-wide distribution of OR genes. These results suggest that the repertoire of OR genes in Drosophila has been quite stable compared with the mammalian genes. The difference in evolutionary pattern between Drosophila and mammals can be explained partly by the differences of gene expression mechanisms and partly by the environmental and behavioral differences.}
}

% Ohno 1970 (Springer): the book "Evolution by Gene Duplication".
@book{ohnoEvolutionGeneDuplication1970,
  author = {Ohno, Susumu},
  title = {Evolution by {{Gene Duplication}}},
  date = {1970},
  location = {Berlin, Heidelberg},
  publisher = {Springer Berlin Heidelberg},
  isbn = {978-3-642-86661-6},
  doi = {10.1007/978-3-642-86659-3},
  url = {http://link.springer.com/10.1007/978-3-642-86659-3},
  urldate = {2024-03-21},
  langid = {english}
}

% Otto, Zheng & Wiehe 2022 (Genetics): population genetic model of unequal recombination and selection in tandem gene arrays.
@article{ottoRecombinationSelectionEvolution2022,
  author = {Otto, Moritz and Zheng, Yichen and Wiehe, Thomas},
  title = {Recombination, Selection, and the Evolution of Tandem Gene Arrays},
  journaltitle = {Genetics},
  shortjournal = {Genetics},
  date = {2022-07-01},
  volume = {221},
  number = {3},
  pages = {iyac052},
  issn = {1943-2631},
  doi = {10.1093/genetics/iyac052},
  url = {https://doi.org/10.1093/genetics/iyac052},
  urldate = {2024-04-09},
  abstract = {Multigene families—immunity genes or sensory receptors, for instance—are often subject to diversifying selection. Allelic diversity may be favored not only through balancing or frequency-dependent selection at individual loci but also by associating different alleles in multicopy gene families. Using a combination of analytical calculations and simulations, we explored a population genetic model of epistatic selection and unequal recombination, where a trade-off exists between the benefit of allelic diversity and the cost of copy abundance. Starting from the neutral case, where we showed that gene copy number is Gamma distributed at equilibrium, we derived also the mean and shape of the limiting distribution under selection. Considering a more general model, which includes variable population size and population substructure, we explored by simulations mean fitness and some summary statistics of the copy number distribution. We determined the relative effects of selection, recombination, and demographic parameters in maintaining allelic diversity and shaping the mean fitness of a population. One way to control the variance of copy number is by lowering the rate of unequal recombination. Indeed, when encoding recombination by a rate modifier locus, we observe exactly this prediction. Finally, we analyzed the empirical copy number distribution of 3 genes in human and estimated recombination and selection parameters of our model.}
}

% Pan & Zhang 2008 (Comparative and Functional Genomics): survey of tandemly arrayed genes in 11 vertebrate genomes.
@article{panTandemlyArrayedGenes2008,
  author = {Pan, Deng and Zhang, Liqing},
  title = {Tandemly {{Arrayed Genes}} in {{Vertebrate Genomes}}},
  journaltitle = {Comparative and Functional Genomics},
  shortjournal = {Comp Funct Genomics},
  date = {2008},
  volume = {2008},
  pages = {545269},
  issn = {1531-6912},
  doi = {10.1155/2008/545269},
  eprint = {18815629},
  eprinttype = {pmid},
  pmcid = {PMC2547482},
  url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2547482/},
  urldate = {2024-04-09},
  abstract = {Tandemly arrayed genes (TAGs) are duplicated genes that are linked as neighbors on a chromosome, many of which have important physiological and biochemical functions. Here we performed a survey of these genes in 11 available vertebrate genomes. TAGs account for an average of about 14\% of all genes in these vertebrate genomes, and about 25\% of all duplications. The majority of TAGs (72–94\%) have parallel transcription orientation (i.e., they are encoded on the same strand) in contrast to the genome, which has about 50\% of its genes in parallel transcription orientation. The majority of tandem arrays have only two members. In all species, the proportion of genes that belong to TAGs tends to be higher in large gene families than in small ones; together with our recent finding that tandem duplication played a more important role than retroposition in large families, this fact suggests that among all types of duplication mechanisms, tandem duplication is the predominant mechanism of duplication, especially in large families. Finally, several species have a higher proportion of large tandem arrays that are species-specific than random expectation.}
}

% PEPkit project website (pep.databio.org). No author or publication date is recorded for this page.
@online{PEPkitBioData,
  title = {{{PEPkit}}: The Bio Data Management Toolkit},
  url = {https://pep.databio.org/},
  urldate = {2024-03-27}
}

% Picart-Picolo et al. 2020 (Genome Research): large tandem duplications in Arabidopsis thaliana lines with reduced rRNA gene copies.
% This work was previously entered twice; `ids` keeps the former duplicate key
% picart-picoloLargeTandemDuplications2020a citable as an alias (Biber feature).
@article{picart-picoloLargeTandemDuplications2020,
  ids = {picart-picoloLargeTandemDuplications2020a},
  title = {Large Tandem Duplications Affect Gene Expression, {{3D}} Organization, and Plant–Pathogen Response},
  author = {Picart-Picolo, Ariadna and Grob, Stefan and Picault, Nathalie and Franek, Michal and Llauro, Christel and Halter, Thierry and Maier, Tom R. and Jobet, Edouard and Descombin, Julie and Zhang, Panpan and Paramasivan, Vijayapalani and Baum, Thomas J. and Navarro, Lionel and Dvořáčková, Martina and Mirouze, Marie and Pontvianne, Frédéric},
  date = {2020-10-08},
  journaltitle = {Genome Research},
  shortjournal = {Genome Res.},
  eprint = {33033057},
  eprinttype = {pmid},
  publisher = {Cold Spring Harbor Lab},
  issn = {1088-9051, 1549-5469},
  doi = {10.1101/gr.261586.120},
  url = {https://genome.cshlp.org/content/early/2020/10/05/gr.261586.120},
  urldate = {2024-03-25},
  abstract = {Rapid plant genome evolution is crucial to adapt to environmental changes. Chromosomal rearrangements and gene copy number variation (CNV) are two important tools for genome evolution and sources for the creation of new genes. However, their emergence takes many generations. In this study, we show that in Arabidopsis thaliana, a significant loss of ribosomal RNA (rRNA) genes with a past history of a mutation for the chromatin assembly factor 1 (CAF1) complex causes rapid changes in the genome structure. Using long-read sequencing and microscopic approaches, we have identified up to 15 independent large tandem duplications in direct orientation (TDDOs) ranging from 60 kb to 1.44 Mb. Our data suggest that these TDDOs appeared within a few generations, leading to the duplication of hundreds of genes. By subsequently focusing on a line only containing 20\% of rRNA gene copies (20rDNA line), we investigated the impact of TDDOs on 3D genome organization, gene expression, and cytosine methylation. We found that duplicated genes often accumulate more transcripts. Among them, several are involved in plant–pathogen response, which could explain why the 20rDNA line is hyper-resistant to both bacterial and nematode infections. Finally, we show that the TDDOs create gene fusions and/or truncations and discuss their potential implications for the evolution of plant genomes.},
  langid = {english}
}

% Pons & Latapy 2005 (arXiv physics/0512106): the Walktrap random-walk community detection algorithm, long version.
@online{ponsComputingCommunitiesLarge2005,
  author = {Pons, Pascal and Latapy, Matthieu},
  title = {Computing Communities in Large Networks Using Random Walks (Long Version)},
  date = {2005-12-12},
  eprinttype = {arxiv},
  eprint = {physics/0512106},
  pubstate = {preprint},
  doi = {10.48550/arXiv.physics/0512106},
  url = {http://arxiv.org/abs/physics/0512106},
  urldate = {2024-03-30},
  keywords = {Condensed Matter - Disordered Systems and Neural Networks,Condensed Matter - Statistical Mechanics,Physics - Physics and Society},
  abstract = {Dense subgraphs of sparse graphs (communities), which appear in most real-world complex networks, play an important role in many contexts. Computing them however is generally expensive. We propose here a measure of similarities between vertices based on random walks which has several important advantages: it captures well the community structure in a network, it can be computed efficiently, and it can be used in an agglomerative algorithm to compute efficiently the community structure of a network. We propose such an algorithm, called Walktrap, which runs in time O(mn\textasciicircum 2) and space O(n\textasciicircum 2) in the worst case, and in time O(n\textasciicircum 2log n) and space O(n\textasciicircum 2) in most real-world cases (n and m are respectively the number of vertices and edges in the input graph). Extensive comparison tests show that our algorithm surpasses previously proposed ones concerning the quality of the obtained community structures and that it stands among the best ones concerning the running time.}
}

% Pontvianne 2020: French-language CNRS Biologie news item on chromosome-segment duplications in plants.
% The site name is stored in `organization`, not appended to the title. The abstract is truncated at the source.
@online{pontvianneDupliquerPourAdapter2020,
  title = {Dupliquer pour s’adapter ou comment accélérer l’évolution des plantes ?},
  author = {Pontvianne, Frédéric},
  date = {2020-10-14},
  organization = {{CNRS Biologie}},
  url = {https://www.insb.cnrs.fr/fr/cnrsinfo/dupliquer-pour-sadapter-ou-comment-accelerer-levolution-des-plantes},
  urldate = {2024-03-25},
  abstract = {Les duplications de portions de chromosomes permettant aux organismes de dupliquer des gènes existants et d’en créer de nouveaux sont bien},
  langid = {french}
}

% Reams & Neidle 2004 (Annual Review of Microbiology): tandem duplication as a driver of gene clustering in prokaryotes.
@article{reamsSelectionGeneClustering2004,
  title = {Selection for {{Gene Clustering}} by {{Tandem Duplication}}},
  author = {Reams, Andrew B. and Neidle, Ellen L.},
  date = {2004-10-01},
  journaltitle = {Annual Review of Microbiology},
  shortjournal = {Annu. Rev. Microbiol.},
  volume = {58},
  number = {1},
  pages = {119--142},
  issn = {0066-4227, 1545-3251},
  doi = {10.1146/annurev.micro.58.030603.123806},
  url = {https://www.annualreviews.org/doi/10.1146/annurev.micro.58.030603.123806},
  urldate = {2024-03-28},
  abstract = {In prokaryotic genomes, related genes are frequently clustered in operons and higher-order arrangements that reflect functional context. Organization emerges despite rearrangements that constantly shuffle gene and operon order. Evidence is presented that the tandem duplication of related genes acts as a driving evolutionary force in the origin and maintenance of clusters. Gene amplification can be viewed as a dynamic and reversible regulatory mechanism that facilitates adaptation to variable environments. Clustered genes confer selective benefits via their ability to be coamplified. During evolution, rearrangements that bring together related genes can be selected if they increase the fitness of the organism in which they reside. Similarly, the benefits of gene amplification can prevent the dispersal of existing clusters. Examples of frequent and spontaneous amplification of large genomic fragments are provided. The possibility is raised that tandem gene duplication works in concert with horizontal gene transfer as interrelated evolutionary forces for gene clustering.},
  langid = {english}
}

% Rizzon, Marais, Gouy & Biémont 2002 (Genome Research 12: 400-407): transposable element density versus recombination rate in Drosophila melanogaster.
% The citation key is kept as generated so existing citations keep working.
@article{rizzonRizzonMaraisGouy2002,
  title = {Recombination Rate and the Distribution of Transposable Elements in the {{Drosophila melanogaster}} Genome},
  author = {Rizzon, Carène and Marais, Gabriel and Gouy, Manolo and Biémont, Christian},
  date = {2002-04-01},
  journaltitle = {Genome Research},
  shortjournal = {Genome Res.},
  volume = {12},
  pages = {400--407},
  doi = {10.1101/gr.210802},
  abstract = {We analyzed the distribution of 54 families of transposable elements (TEs; transposons, LTR retrotransposons, and non-LTR retrotransposons) in the chromosomes of Drosophila melanogaster, using data from the sequenced genome. The density of LTR and non-LTR retrotransposons (RNA-based elements) was high in regions with low recombination rates, but there was no clear tendency to parallel the recombination rate. However, the density of transposons (DNA-based elements) was significantly negatively correlated with recombination rate. The accumulation of TEs in regions of reduced recombination rate is compatible with selection acting against TEs, as selection is expected to be weaker in regions with lower recombination. The differences in the relationship between recombination rate and TE density that exist between chromosome arms suggest that TE distribution depends on specific characteristics of the chromosomes (chromatin structure, distribution of other sequences), the TEs themselves (transposition mechanism), and the species (reproductive system, effective population size, etc.), that have differing influences on the effect of natural selection acting against the TE insertions.}
}

% Rizzon, Ponger & Gaut 2006 (PLoS Computational Biology): tandemly arrayed genes compared between Arabidopsis and rice.
% The abstract field holds the abstract followed by a second, plain-language summary taken from the same record.
@article{rizzonStrikingSimilaritiesGenomic2006,
  title = {Striking {{Similarities}} in the {{Genomic Distribution}} of {{Tandemly Arrayed Genes}} in {{Arabidopsis}} and {{Rice}}},
  author = {Rizzon, Carene and Ponger, Loic and Gaut, Brandon S},
  date = {2006-09},
  journaltitle = {PLoS Computational Biology},
  shortjournal = {PLoS Comput Biol},
  volume = {2},
  number = {9},
  eprint = {16948529},
  eprinttype = {pmid},
  pages = {e115},
  issn = {1553-734X},
  doi = {10.1371/journal.pcbi.0020115},
  url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1557586/},
  urldate = {2024-04-10},
  abstract = {In Arabidopsis, tandemly arrayed genes (TAGs) comprise {$>$}10\% of the genes in the genome. These duplicated genes represent a rich template for genetic innovation, but little is known of the evolutionary forces governing their generation and maintenance. Here we compare the organization and evolution of TAGs between Arabidopsis and rice, two plant genomes that diverged \textasciitilde 150 million years ago. TAGs from the two genomes are similar in a number of respects, including the proportion of genes that are tandemly arrayed, the number of genes within an array, the number of tandem arrays, and the dearth of TAGs relative to single copy genes in centromeric regions. Analysis of recombination rates along rice chromosomes confirms a positive correlation between the occurrence of TAGs and recombination rate, as found in Arabidopsis. TAGs are also biased functionally relative to duplicated, nontandemly arrayed genes. In both genomes, TAGs are enriched for genes that encode membrane proteins and function in “abiotic and biotic stress” but underrepresented for genes involved in transcription and DNA or RNA binding functions. We speculate that these observations reflect an evolutionary trend in which successful tandem duplication involves genes either at the end of biochemical pathways or in flexible steps in a pathway, for which fluctuation in copy number is unlikely to affect downstream genes. Despite differences in the age distribution of tandem arrays, the striking similarities between rice and Arabidopsis indicate similar mechanisms of TAG generation and maintenance. The nuclear genomes of higher plants vary tremendously in size and gene content. Much of this variation is attributable to gene duplication. To date, most studies of plant gene duplication have focused on whole genome duplication events, which duplicate all genes simultaneously. Another prominent process is single gene duplication, which often results in duplicated genes arranged in a tandem array. Here Rizzon, Ponger, and Gaut identify tandem arrays in rice and their genome organization between Arabidopsis and rice, two plant species that diverged \textasciitilde 150 million years ago. The two genomes contain a similar proportion of genes that are tandemly arrayed, with a similar number of genes within an array. Moreover, tandemly arrayed genes are most common in genomic regions of high recombination in both species. This organization appears to be a general feature of eukaryotic genomes, perhaps because duplication rates are higher in high recombination regions. Tandemly arrayed genes of rice and Arabidopsis also represent a biased gene set with regard to function. In contrast to genes duplicated through whole genome events, tandemly arrayed genes are enriched for genes that encode membrane proteins and genes that function in response to environmental stresses. Taken together, these observations suggest that tandemly arrayed genes represent a rich and relatively fluid source for plant adaptation.},
  pmcid = {PMC1557586}
}

% Rognes 2001 (Nucleic Acids Research): ParAlign, a SIMD-based heuristic for sequence database searches.
@article{rognesParAlignParallelSequence2001,
  author = {Rognes, Torbjørn},
  title = {{{ParAlign}}: A Parallel Sequence Alignment Algorithm for Rapid and Sensitive Database Searches},
  shorttitle = {{{ParAlign}}},
  journaltitle = {Nucleic Acids Research},
  shortjournal = {Nucleic Acids Research},
  date = {2001-04-01},
  volume = {29},
  number = {7},
  pages = {1647--1652},
  issn = {0305-1048},
  doi = {10.1093/nar/29.7.1647},
  url = {https://doi.org/10.1093/nar/29.7.1647},
  urldate = {2024-04-09},
  abstract = {There is a need for faster and more sensitive algorithms for sequence similarity searching in view of the rapidly increasing amounts of genomic sequence data available. Parallel processing capabilities in the form of the single instruction, multiple data (SIMD) technology are now available in common microprocessors and enable a single microprocessor to perform many operations in parallel. The ParAlign algorithm has been specifically designed to take advantage of this technology. The new algorithm initially exploits parallelism to perform a very rapid computation of the exact optimal ungapped alignment score for all diagonals in the alignment matrix. Then, a novel heuristic is employed to compute an approximate score of a gapped alignment by combining the scores of several diagonals. This approximate score is used to select the most interesting database sequences for a subsequent Smith–Waterman alignment, which is also parallelised. The resulting method represents a substantial improvement compared to existing heuristics. The sensitivity and specificity of ParAlign was found to be as good as Smith–Waterman implementations when the same method for computing the statistical significance of the matches was used. In terms of speed, only the significantly less sensitive NCBI BLAST 2 program was found to outperform the new approach. Online searches are available at http://dna.uio.no/search/}
}

% Rognes & Seeberg 2000 (Bioinformatics): SIMD (MMX/SSE) implementation of Smith-Waterman database searching.
@article{rognesSixfoldSpeedupSmith2000,
  title = {Six-Fold Speed-up of {{Smith}}–{{Waterman}} Sequence Database Searches Using Parallel Processing on Common Microprocessors},
  author = {Rognes, Torbjørn and Seeberg, Erling},
  date = {2000-08-01},
  journaltitle = {Bioinformatics},
  shortjournal = {Bioinformatics},
  volume = {16},
  number = {8},
  pages = {699--706},
  issn = {1367-4803},
  doi = {10.1093/bioinformatics/16.8.699},
  url = {https://doi.org/10.1093/bioinformatics/16.8.699},
  urldate = {2024-04-09},
  abstract = {Motivation: Sequence database searching is among the most important and challenging tasks in bioinformatics. The ultimate choice of sequence-search algorithm is that of Smith–Waterman. However, because of the computationally demanding nature of this method, heuristic programs or special-purpose hardware alternatives have been developed. Increased speed has been obtained at the cost of reduced sensitivity or very expensive hardware. Results: A fast implementation of the Smith–Waterman sequence-alignment algorithm using Single-Instruction, Multiple-Data (SIMD) technology is presented. This implementation is based on the MultiMedia eXtensions (MMX) and Streaming SIMD Extensions (SSE) technology that is embedded in Intel’s latest microprocessors. Similar technology exists also in other modern microprocessors. Six-fold speed-up relative to the fastest previously known Smith–Waterman implementation on the same hardware was achieved by an optimized 8-way parallel processing approach. A speed of more than 150 million cell updates per second was obtained on a single Intel Pentium III 500 MHz microprocessor. This is probably the fastest implementation of this algorithm on a single general-purpose microprocessor described to date. Availability: Online searches with the software are available at http://dna.uio.no/search/ Contact: torbjorn.rognes@labmed.uio.no}
}

% Sæbø et al. 2005 (Nucleic Acids Research, web server supplement): the PARALIGN similarity search tool and public service.
@article{saeboPARALIGNRapidSensitive2005,
  title = {{{PARALIGN}}: Rapid and Sensitive Sequence Similarity Searches Powered by Parallel Computing Technology},
  shorttitle = {{{PARALIGN}}},
  author = {Sæbø, Per Eystein and Andersen, Sten Morten and Myrseth, Jon and Laerdahl, Jon K. and Rognes, Torbjørn},
  date = {2005-07-01},
  journaltitle = {Nucleic Acids Research},
  shortjournal = {Nucleic Acids Research},
  volume = {33},
  pages = {W535--W539},
  issn = {0305-1048},
  doi = {10.1093/nar/gki423},
  url = {https://doi.org/10.1093/nar/gki423},
  urldate = {2024-04-09},
  abstract = {PARALIGN is a rapid and sensitive similarity search tool for the identification of distantly related sequences in both nucleotide and amino acid sequence databases. Two algorithms are implemented, accelerated Smith–Waterman and ParAlign. The ParAlign algorithm is similar to Smith–Waterman in sensitivity, while as quick as BLAST for protein searches. A form of parallel computing technology known as multimedia technology that is available in modern processors, but rarely used by other bioinformatics software, has been exploited to achieve the high speed. The software is also designed to run efficiently on computer clusters using the message-passing interface standard. A public search service powered by a large computer cluster has been set-up and is freely available at www.paralign.org, where the major public databases can be searched. The software can also be downloaded free of charge for academic use.},
  issue = {suppl\_2}
}

% Shoja & Zhang 2006 (Molecular Biology and Evolution): genome-wide analysis of tandemly arrayed genes in human, mouse and rat.
@article{shojaRoadmapTandemlyArrayed2006,
  author = {Shoja, Valia and Zhang, Liqing},
  title = {A Roadmap of Tandemly Arrayed Genes in the Genomes of Human, Mouse, and Rat},
  journaltitle = {Molecular Biology and Evolution},
  shortjournal = {Mol Biol Evol},
  date = {2006-11},
  volume = {23},
  number = {11},
  pages = {2134--2141},
  issn = {0737-4038},
  doi = {10.1093/molbev/msl085},
  eprint = {16901985},
  eprinttype = {pmid},
  langid = {english},
  keywords = {Animals,Chromosome Mapping,Chromosomes,Gene Duplication,Genome,Humans,Mice,Rats,Recombination Genetic,Tandem Repeat Sequences},
  abstract = {Tandemly arrayed genes (TAGs) play an important functional and physiological role in the genome. Most previous studies have focused on individual TAG families in a few species, yet a broad characterization of TAGs is not available. Here we identified all TAGs in the genomes of humans, mouse, and rat and performed a comprehensive analysis of TAG distribution, TAG sizes, TAG orientations and intergenic distances, and TAG functions. TAGs account for about 14-17\% of all genes in the genome and nearly one-third of all duplicated genes, highlighting the predominant role that tandem duplication plays in gene duplication. For all species, TAG distribution is highly heterogeneous along chromosomes and some chromosomes are enriched with TAG forests, whereas others are enriched with TAG deserts. The majority of TAGs are of size 2 for all genomes, similar to the previous findings in Caenorhabditis elegans, Arabidopsis thaliana, and Oryza sativa, suggesting that it is a rather general phenomenon in eukaryotes. The comparison with the genome patterns shows that TAG members have a significantly higher proportion of parallel gene orientation in all species, corroborating Graham's claim that parallel orientation is the preferred form of orientation in TAGs. Moreover, TAG members with parallel orientation tend to be closer to each other than all neighboring genes in the genome with parallel orientation. The analyses of Gene Ontology function indicate that genes with receptor or binding activities are significantly overrepresented by TAGs. Computer simulation reveals that random gene rearrangements have little effect on the statistics of TAGs for all genomes. Finally, the average proportion of TAGs shows a trend of increase with the increase of family sizes, although the correlation between TAG proportions in individual families and family sizes is not significant.}
}

@article{smithIdentificationCommonMolecular1981,
  author       = {Smith, Temple F. and Waterman, Michael S.},
  title        = {Identification of Common Molecular Subsequences},
  journaltitle = {Journal of Molecular Biology},
  shortjournal = {Journal of Molecular Biology},
  date         = {1981-03-25},
  volume       = {147},
  number       = {1},
  pages        = {195--197},
  issn         = {0022-2836},
  doi          = {10.1016/0022-2836(81)90087-5},
  url          = {https://www.sciencedirect.com/science/article/pii/0022283681900875},
  urldate      = {2023-04-29},
  langid       = {english},
}

@article{suyamaPAL2NALRobustConversion2006,
  author       = {Suyama, Mikita and Torrents, David and Bork, Peer},
  title        = {{{PAL2NAL}}: Robust Conversion of Protein Sequence Alignments into the Corresponding Codon Alignments},
  shorttitle   = {{{PAL2NAL}}},
  journaltitle = {Nucleic Acids Research},
  shortjournal = {Nucleic Acids Research},
  date         = {2006-07-01},
  volume       = {34},
  issue        = {suppl\_2},
  pages        = {W609--W612},
  issn         = {0305-1048},
  doi          = {10.1093/nar/gkl315},
  url          = {https://doi.org/10.1093/nar/gkl315},
  urldate      = {2024-03-31},
  abstract     = {PAL2NAL is a web server that constructs a multiple codon alignment from the corresponding aligned protein sequences. Such codon alignments can be used to evaluate the type and rate of nucleotide substitutions in coding DNA for a wide range of evolutionary analyses, such as the identification of levels of selective constraint acting on genes, or to perform DNA-based phylogenetic studies. The server takes a protein sequence alignment and the corresponding DNA sequences as input. In contrast to other existing applications, this server is able to construct codon alignments even if the input DNA sequence has mismatches with the input protein sequence, or contains untranslated regions and polyA tails. The server can also deal with frame shifts and inframe stop codons in the input models, and is thus suitable for the analysis of pseudogenes. Another distinct feature is that the user can specify a subregion of the input alignment in order to specifically analyze functional domains or exons of interest. The PAL2NAL server is available at http://www.bork.embl.de/pal2nal .},
}

@article{taylorDuplicationDivergenceEvolution2004,
  author       = {Taylor, John S. and Raes, Jeroen},
  title        = {Duplication and {{Divergence}}: {{The Evolution}} of {{New Genes}} and {{Old Ideas}}},
  shorttitle   = {Duplication and {{Divergence}}},
  journaltitle = {Annual Review of Genetics},
  date         = {2004-12-15},
  volume       = {38},
  pages        = {615--643},
  publisher    = {Annual Reviews},
  issn         = {0066-4197, 1545-2948},
  doi          = {10.1146/annurev.genet.38.072902.092831},
  url          = {https://www.annualreviews.org/content/journals/10.1146/annurev.genet.38.072902.092831},
  urldate      = {2024-03-25},
  langid       = {english},
  abstract     = {Over 35 years ago, Susumu Ohno stated that gene duplication was the single most important factor in evolution (97). He reiterated this point a few years later in proposing that without duplicated genes the creation of metazoans, vertebrates, and mammals from unicellular organisms would have been impossible. Such big leaps in evolution, he argued, required the creation of new gene loci with previously nonexistent functions (98). Bold statements such as these, combined with his proposal that at least one whole-genome duplication event facilitated the evolution of vertebrates, have made Ohno an icon in the literature on genome evolution. However, discussion on the occurrence and consequences of gene and genome duplication events has a much longer, and often neglected, history. Here we review literature dealing with the occurence and consequences of gene duplication, begining in 1911. We document conceptual and technological advances in gene duplication research from this early research in comparative cytology up to recent research on whole genomes, “transcriptomes,” and “interactomes.” We have formerly seen that parts many times repeated are eminently liable to vary in number and structure; consequently it is quite probable that natural selection, during the long-continued course of modification, should have seized on a certain number of the primordially similar elements, many times repeated, and have adapted them to the most diverse purposes. Charles Darwin, 1859 (23)},
}

@article{taylorUsingGalaxyPerform2007,
  author       = {Taylor, James and Schenck, Ian and Blankenberg, Dan and Nekrutenko, Anton},
  title        = {Using {{Galaxy}} to Perform Large-Scale Interactive Data Analyses},
  journaltitle = {Current Protocols in Bioinformatics},
  shortjournal = {Curr Protoc Bioinformatics},
  date         = {2007-09},
  volume       = {Chapter 10},
  pages        = {Unit 10.5},
  issn         = {1934-340X},
  doi          = {10.1002/0471250953.bi1005s19},
  eprint       = {18428782},
  eprinttype   = {pmid},
  pmcid        = {PMC3418382},
  langid       = {english},
  keywords     = {Algorithms,Base Sequence,Chromosome Mapping,Computer Graphics,DNA,DNA Mutational Analysis,Molecular Sequence Data,Sequence Alignment,Sequence Analysis DNA,Software,User-Computer Interface},
  abstract     = {While most experimental biologists know where to download genomic data, few have a concrete plan on how to analyze it. This situation can be corrected by: (1) providing unified portals serving genomic data and (2) building Web applications to allow flexible retrieval and on-the-fly analyses of the data. Powerful resources, such as the UCSC Genome Browser already address the first issue. The second issue, however, remains open. For example, how to find human protein-coding exons with the highest density of single nucleotide polymorphisms (SNPs) and extract orthologous sequences from all sequenced mammals? Indeed, one can access all relevant data from the UCSC Genome Browser. But once the data is downloaded how would one deal with millions of SNPs and gigabytes of alignments? Galaxy (http://g2.bx.psu.edu) is designed specifically for that purpose. It amplifies the strengths of existing resources (such as UCSC Genome Browser) by allowing the user to access and, most importantly, analyze data within a single interface in an unprecedented number of ways.},
}

@article{thibaud-nissenIdentificationCharacterizationPseudogenes2009,
  author       = {Thibaud-Nissen, Françoise and Ouyang, Shu and Buell, C. Robin},
  title        = {Identification and Characterization of Pseudogenes in the Rice Gene Complement},
  journaltitle = {BMC Genomics},
  shortjournal = {BMC Genomics},
  date         = {2009-07-16},
  volume       = {10},
  number       = {1},
  pages        = {317},
  issn         = {1471-2164},
  doi          = {10.1186/1471-2164-10-317},
  url          = {https://doi.org/10.1186/1471-2164-10-317},
  urldate      = {2024-04-09},
  langid       = {english},
  keywords     = {Duplicate Region,GOSlim Term,Massively Parallel Signature Sequencing,Massively Parallel Signature Sequencing Data,Paralogous Family},
  abstract     = {The Osa1 Genome Annotation of rice (Oryza sativa L. ssp. japonica cv. Nipponbare) is the product of a semi-automated pipeline that does not explicitly predict pseudogenes. As such, it is likely to mis-annotate pseudogenes as functional genes. A total of 22,033 gene models within the Osa1 Release 5 were investigated as potential pseudogenes as these genes exhibit at least one feature potentially indicative of pseudogenes: lack of transcript support, short coding region, long untranslated region, or, for genes residing within a segmentally duplicated region, lack of a paralog or significantly shorter corresponding paralog.},
}

@article{tremblaysavardEvolutionOrthologousTandemly2011,
  author       = {Tremblay Savard, Olivier and Bertrand, Denis and El-Mabrouk, Nadia},
  title        = {Evolution of Orthologous Tandemly Arrayed Gene Clusters},
  journaltitle = {BMC Bioinformatics},
  shortjournal = {BMC Bioinformatics},
  date         = {2011-10-05},
  volume       = {12},
  number       = {9},
  pages        = {S2},
  issn         = {1471-2105},
  doi          = {10.1186/1471-2105-12-S9-S2},
  url          = {https://doi.org/10.1186/1471-2105-12-S9-S2},
  urldate      = {2024-04-09},
  keywords     = {Gene Order,Gene Tree,Internal Vertex,Inversion Event,Tandem Duplication},
  abstract     = {Tandemly Arrayed Gene (TAG) clusters are groups of paralogous genes that are found adjacent on a chromosome. TAGs represent an important repertoire of genes in eukaryotes. In addition to tandem duplication events, TAG clusters are affected during their evolution by other mechanisms, such as inversion and deletion events, that affect the order and orientation of genes. The DILTAG algorithm developed in [1] makes it possible to infer a set of optimal evolutionary histories explaining the evolution of a single TAG cluster, from an ancestral single gene, through tandem duplications (simple or multiple, direct or inverted), deletions and inversion events.},
}

% NOTE(review): the exported abstract described k-sign symmetric matrices, i.e. a
% different paper, and was dropped; re-import the abstract from the publisher if needed.
@article{vandongenGraphClusteringDiscrete2008a,
  author       = {Van Dongen, Stijn},
  title        = {Graph {{Clustering Via}} a {{Discrete Uncoupling Process}}},
  journaltitle = {SIAM Journal on Matrix Analysis and Applications},
  shortjournal = {SIAM J. Matrix Anal. Appl.},
  date         = {2008-01},
  volume       = {30},
  number       = {1},
  pages        = {121--141},
  publisher    = {{Society for Industrial and Applied Mathematics}},
  issn         = {0895-4798},
  doi          = {10.1137/040608635},
  url          = {https://epubs.siam.org/doi/10.1137/040608635},
  urldate      = {2024-03-22},
}

% Numbered institutional report (CWI repository), hence @report rather than @article.
@report{vandongenNewClusterAlgorithm1998,
  author       = {family=Dongen, given=S., prefix=van, useprefix=true},
  title        = {A New Cluster Algorithm for Graphs},
  type         = {techreport},
  institution  = {Centrum voor Wiskunde en Informatica},
  location     = {Amsterdam},
  number       = {R 9814},
  date         = {1998-01-01},
  url          = {https://ir.cwi.nl/pub/4604},
  urldate      = {2024-03-22},
  langid       = {english},
  abstract     = {A new cluster algorithm for graphs called the \emph{Markov Cluster algorithm} ($MCL$ algorithm) is introduced. The graphs may be both weighted (with nonnegative weight) and directed. Let~$G$~be such a graph. The $MCL$ algorithm simulates flow in $G$ by first identifying $G$ in a canonical way with a Markov graph $G_1$. Flow is then alternatingly expanded and contracted, leading to a row of Markov Graphs $G_{(i)}$. The expansion step is done by computing higher step transition probabilities ($TP$'s), the contraction step creates a new Markov graph by favouring high $TP$'s and demoting low $TP$'s in a specific way. The heuristic underlying this approach is the expectation that flow between dense regions which are sparsely connected will evaporate. The stable limits of the process are easily derived and in practice the algorithm converges very fast to such a limit, the structure of which has a generic interpretation as an overlapping clustering of the graph~$G$. Overlap is limited to cases where the input graph has a symmetric structure inducing it. The contraction and expansion parameters of the algorithm influence the granularity of the output. The algorithm is space and time efficient with a space$+$quality/time trade--off, works very well for a wide range of test cases, and lends itself to drastic scaling. Experiments with a scaled $C$--implementation have been conducted on graphs having several tens of thousands of nodes. This report describes the algorithm, its complexity, and experimental results. The algorithm is introduced by first considering a generalization of generic single link clustering for graphs called $k$--path clustering.},
}

% Single record for this chapter. The former byte-identical duplicate entry
% `vandongenUsingMCLExtract2012a` stays citable through the `ids` key alias.
@incollection{vandongenUsingMCLExtract2012,
  ids          = {vandongenUsingMCLExtract2012a},
  author       = {Van Dongen, Stijn and Abreu-Goodger, Cei},
  title        = {Using {{MCL}} to {{Extract Clusters}} from {{Networks}}},
  booktitle    = {Bacterial {{Molecular Networks}}},
  editor       = {Van Helden, Jacques and Toussaint, Ariane and Thieffry, Denis},
  series       = {Methods in {{Molecular Biology}}},
  volume       = {804},
  date         = {2012},
  pages        = {281--295},
  publisher    = {Springer New York},
  location     = {New York, NY},
  isbn         = {978-1-61779-360-8 978-1-61779-361-5},
  doi          = {10.1007/978-1-61779-361-5_15},
  url          = {http://link.springer.com/10.1007/978-1-61779-361-5_15},
  urldate      = {2024-04-11},
  langid       = {english},
}

@article{vizuetaBitacoraComprehensiveTool2020,
  author       = {Vizueta, Joel and Sánchez-Gracia, Alejandro and Rozas, Julio},
  title        = {Bitacora: {{A}} Comprehensive Tool for the Identification and Annotation of Gene Families in Genome Assemblies},
  shorttitle   = {Bitacora},
  journaltitle = {Molecular Ecology Resources},
  date         = {2020},
  volume       = {20},
  number       = {5},
  pages        = {1445--1452},
  issn         = {1755-0998},
  doi          = {10.1111/1755-0998.13202},
  url          = {https://onlinelibrary.wiley.com/doi/abs/10.1111/1755-0998.13202},
  urldate      = {2024-04-09},
  langid       = {english},
  keywords     = {bioinfomatics/phyloinfomatics,gene families,gene structure and function,genomics,molecular evolution,structural annotation,transcriptomics},
  abstract     = {Gene annotation is a critical bottleneck in genomic research, especially for the comprehensive study of very large gene families in the genomes of nonmodel organisms. Despite the recent progress in automatic methods, state-of-the-art tools used for this task often produce inaccurate annotations, such as fused, chimeric, partial or even completely absent gene models for many family copies, errors that require considerable extra efforts to be corrected. Here we present bitacora, a bioinformatics solution that integrates popular sequence similarity-based search tools and Perl scripts to facilitate both the curation of these inaccurate annotations and the identification of previously undetected gene family copies directly in genomic DNA sequences. We tested the performance of bitacora in annotating the members of two chemosensory gene families with different repertoire size in seven available genome sequences, and compared its performance with that of augustus-ppx, a tool also designed to improve automatic annotations using a sequence similarity-based approach. Despite the relatively high fragmentation of some of these drafts, bitacora was able to improve the annotation of many members of these families and detected thousands of new chemoreceptors encoded in genome sequences. The program creates general feature format (GFF) files, with both curated and newly identified gene models, and FASTA files with the predicted proteins. These outputs can be easily integrated in genomic annotation editors, greatly facilitating subsequent manual annotation and downstream evolutionary analyses.},
}

@article{wolfeRobustnessItNot2000,
  author       = {Wolfe, Ken},
  title        = {Robustness—It's Not Where You Think It Is},
  journaltitle = {Nature Genetics},
  shortjournal = {Nat Genet},
  date         = {2000-05},
  volume       = {25},
  number       = {1},
  pages        = {3--4},
  issn         = {1061-4036, 1546-1718},
  doi          = {10.1038/75560},
  url          = {https://www.nature.com/articles/ng0500_3},
  urldate      = {2024-03-28},
  langid       = {english},
}

@article{yangWGDdetectorPipelineDetecting2019,
  author       = {Yang, Yongzhi and Li, Ying and Chen, Qiao and Sun, Yongshuai and Lu, Zhiqiang},
  title        = {{{WGDdetector}}: A Pipeline for Detecting Whole Genome Duplication Events Using the Genome or Transcriptome Annotations},
  shorttitle   = {{{WGDdetector}}},
  journaltitle = {BMC Bioinformatics},
  shortjournal = {BMC Bioinformatics},
  date         = {2019-02-12},
  volume       = {20},
  number       = {1},
  pages        = {75},
  issn         = {1471-2105},
  doi          = {10.1186/s12859-019-2670-3},
  url          = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2670-3},
  urldate      = {2024-03-19},
  langid       = {english},
  abstract     = {Background: With the availability of well-assembled genomes of a growing number of organisms, identifying the bioinformatic basis of whole genome duplication (WGD) is a growing field of genomics. The most extant software for detecting footprints of WGDs has been restricted to a well-assembled genome. However, the massive poor quality genomes and the more accessible transcriptomes have been largely ignored, and in theoretically they are also likely to contribute to detect WGD using dS based method. Here, to resolve these problems, we have designed a universal and simple technical tool WGDdetector for detecting WGDs using either genome or transcriptome annotations in different organisms based on the widely used dS based method. Results: We have constructed WGDdetector pipeline that integrates all analyses including gene family constructing, dS estimating and phasing, and outputting the dS values of each paralogs pairs processed with only one command. We further chose four species (Arabidopsis thaliana, Juglans regia, Populus trichocarpa and Xenopus laevis) representing herb, wood and animal, to test its practicability. Our final results showed a high degree of accuracy with the previous studies using both genome and transcriptome data. Conclusion: WGDdetector is not only reliable and stable for genome data, but also a new way to using the transcriptome data to obtain the correct dS distribution for detecting WGD. The source code is freely available, and is implemented in Windows and Linux operation system.},
}

@article{zhangLandscapeVariationNovel2017,
  author       = {Zhang, Yan and Li, Shantao and Abyzov, Alexej and Gerstein, Mark B.},
  title        = {Landscape and Variation of Novel Retroduplications in 26 Human Populations},
  journaltitle = {PLOS Computational Biology},
  shortjournal = {PLOS Computational Biology},
  date         = {2017-06-29},
  volume       = {13},
  number       = {6},
  pages        = {e1005567},
  publisher    = {Public Library of Science},
  issn         = {1553-7358},
  doi          = {10.1371/journal.pcbi.1005567},
  url          = {https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005567},
  urldate      = {2024-03-27},
  langid       = {english},
  keywords     = {Functional genomics,Gene expression,Genomics,Human genomics,Nucleosomes,Phylogenetic analysis,Pseudogenes,Single nucleotide polymorphisms},
  abstract     = {Retroduplications come from reverse transcription of mRNAs and their insertion back into the genome. Here, we performed comprehensive discovery and analysis of retroduplications in a large cohort of 2,535 individuals from 26 human populations, as part of 1000 Genomes Phase 3. We developed an integrated approach to discover novel retroduplications combining high-coverage exome and low-coverage whole-genome sequencing data, utilizing information from both exon-exon junctions and discordant paired-end reads. We found 503 parent genes having novel retroduplications absent from the reference genome. Based solely on retroduplication variation, we built phylogenetic trees of human populations; these represent superpopulation structure well and indicate that variable retroduplications are effective population markers. We further identified 43 retroduplication parent genes differentiating superpopulations. This group contains several interesting insertion events, including a SLMO2 retroduplication and insertion into CAV3, which has a potential disease association. We also found retroduplications to be associated with a variety of genomic features: (1) Insertion sites were correlated with regular nucleosome positioning. (2) They, predictably, tend to avoid conserved functional regions, such as exons, but, somewhat surprisingly, also avoid introns. (3) Retroduplications tend to be co-inserted with young L1 elements, indicating recent retrotranspositional activity, and (4) they have a weak tendency to originate from highly expressed parent genes. Our investigation provides insight into the functional impact and association with genomic elements of retroduplications. We anticipate our approach and analytical methodology to have application in a more clinical context, where exome sequencing data is abundant and the discovery of retroduplications can potentially improve the accuracy of SNP calling.},
}