Compare commits

..

No commits in common. "30807f2bd00ba1950d142aced92fa9e91c06e342" and "03f3efd6680ecee3201dd3c6ec77960a63782c3e" have entirely different histories.

11 changed files with 175 additions and 5628 deletions

View File

@ -1,4 +1,4 @@
OPTIONS=-shell-escape -file-line-error -synctex=1 OPTIONS=-shell-escape -file-line-error -synctex=1 -interaction=batchmode
SOURCE=report SOURCE=report
all: latexmk all: latexmk
debug: debug:

BIN
figures/Evolution_fate_duplicate_genes.pdf (Stored with Git LFS)

Binary file not shown.

File diff suppressed because it is too large Load Diff

Before

Width:  |  Height:  |  Size: 201 KiB

BIN
media/dummy.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 546 B

View File

@ -33,19 +33,6 @@
langid = {english} langid = {english}
} }
@article{assisModelsRetentionDuplicate2024,
title = {Models for the Retention of Duplicate Genes and Their Biological Underpinnings},
author = {Assis, Raquel and Conant, Gavin and Holland, Barbara and Liberles, David and O'Reilly, Małgorzata and Wilson, Amanda},
date = {2024-02-12},
journaltitle = {F1000Research},
shortjournal = {F1000Research},
volume = {12},
pages = {1400},
doi = {10.12688/f1000research.141786.2},
abstract = {Gene content in genomes changes through several different processes, with gene duplication being an important contributor to such changes. Gene duplication occurs over a range of scales from individual genes to whole genomes, and the dynamics of this process can be context dependent. Still, there are rules by which genes are retained or lost from genomes after duplication, and probabilistic modeling has enabled characterization of these rules, including their context-dependence. Here, we describe the biology and corresponding mathematical models that are used to understand duplicate gene retention and its contribution to the set of biochemical functions encoded in a genome.},
keywords = {2read}
}
@article{beallIdentificationAnalysisHyperactive2002, @article{beallIdentificationAnalysisHyperactive2002,
title = {Identification and {{Analysis}} of a {{Hyperactive Mutant Form}} of {{Drosophila P-Element Transposase}}}, title = {Identification and {{Analysis}} of a {{Hyperactive Mutant Form}} of {{Drosophila P-Element Transposase}}},
author = {Beall, Eileen L and Mahoney, Matthew B and Rio, Donald C}, author = {Beall, Eileen L and Mahoney, Matthew B and Rio, Donald C},
@ -62,24 +49,6 @@
abstract = {Transposition in many organisms is regulated to control the frequency of DNA damage caused by the DNA breakage and joining reactions. However, genetic studies in prokaryotic systems have led to the isolation of mutant transposase proteins with higher or novel activities compared to those of the wild-type protein. In the course of our study of the effects of mutating potential ATM-family DNA damage checkpoint protein kinase sites in the Drosophila P-element transposase protein, we found one mutation, S129A, that resulted in an elevated level of transposase activity using in vivo recombination assays, including P-element-mediated germline transformation. In vitro assays for P-element transposase activity indicate that the S129A mutant exhibits elevated donor DNA cleavage activity when compared to the wild-type protein, whereas the strand-transfer activity is similar to that of wild type. This difference may reflect the nature of the in vitro assays and that normally in vivo the two reactions may proceed in concert. The P-element transposase protein contains 10 potential consensus phosphorylation sites for the ATM family of PI3-related protein kinases. Of these 10 sites, 8 affect transposase activity either positively or negatively when substituted individually with alanine and tested in vivo. A mutant transposase protein that contains all eight N-terminal serine and threonine residues substituted with alanine is inactive and can be restored to full activity by substitution of wild-type amino acids back at only 3 of the 8 positions. These data suggest that the activity of P-element transposase may be regulated by phosphorylation and demonstrate that one mutation, S129A, results in hyperactive transposition.} abstract = {Transposition in many organisms is regulated to control the frequency of DNA damage caused by the DNA breakage and joining reactions. 
However, genetic studies in prokaryotic systems have led to the isolation of mutant transposase proteins with higher or novel activities compared to those of the wild-type protein. In the course of our study of the effects of mutating potential ATM-family DNA damage checkpoint protein kinase sites in the Drosophila P-element transposase protein, we found one mutation, S129A, that resulted in an elevated level of transposase activity using in vivo recombination assays, including P-element-mediated germline transformation. In vitro assays for P-element transposase activity indicate that the S129A mutant exhibits elevated donor DNA cleavage activity when compared to the wild-type protein, whereas the strand-transfer activity is similar to that of wild type. This difference may reflect the nature of the in vitro assays and that normally in vivo the two reactions may proceed in concert. The P-element transposase protein contains 10 potential consensus phosphorylation sites for the ATM family of PI3-related protein kinases. Of these 10 sites, 8 affect transposase activity either positively or negatively when substituted individually with alanine and tested in vivo. A mutant transposase protein that contains all eight N-terminal serine and threonine residues substituted with alanine is inactive and can be restored to full activity by substitution of wild-type amino acids back at only 3 of the 8 positions. These data suggest that the activity of P-element transposase may be regulated by phosphorylation and demonstrate that one mutation, S129A, results in hyperactive transposition.}
} }
@article{berthelotRainbowTroutGenome2014,
title = {The Rainbow Trout Genome Provides Novel Insights into Evolution after Whole-Genome Duplication in Vertebrates},
author = {Berthelot, Camille and Brunet, Frédéric and Chalopin, Domitille and Juanchich, Amélie and Bernard, Maria and Noël, Benjamin and Bento, Pascal and Da Silva, Corinne and Labadie, Karine and Alberti, Adriana and Aury, Jean-Marc and Louis, Alexandra and Dehais, Patrice and Bardou, Philippe and Montfort, Jérôme and Klopp, Christophe and Cabau, Cédric and Gaspin, Christine and Thorgaard, Gary H. and Boussaha, Mekki and Quillet, Edwige and Guyomard, René and Galiana, Delphine and Bobe, Julien and Volff, Jean-Nicolas and Genêt, Carine and Wincker, Patrick and Jaillon, Olivier and Roest Crollius, Hugues and Guiguen, Yann},
date = {2014-04-22},
journaltitle = {Nature Communications},
shortjournal = {Nat Commun},
volume = {5},
eprint = {24755649},
eprinttype = {pmid},
pages = {3657},
issn = {2041-1723},
doi = {10.1038/ncomms4657},
abstract = {Vertebrate evolution has been shaped by several rounds of whole-genome duplications (WGDs) that are often suggested to be associated with adaptive radiations and evolutionary innovations. Due to an additional round of WGD, the rainbow trout genome offers a unique opportunity to investigate the early evolutionary fate of a duplicated vertebrate genome. Here we show that after 100 million years of evolution the two ancestral subgenomes have remained extremely collinear, despite the loss of half of the duplicated protein-coding genes, mostly through pseudogenization. In striking contrast is the fate of miRNA genes that have almost all been retained as duplicated copies. The slow and stepwise rediploidization process characterized here challenges the current hypothesis that WGD is followed by massive and rapid genomic reorganizations and gene deletions.},
langid = {english},
pmcid = {PMC4071752},
keywords = {2read,Animals,Evolution Molecular,Gene Duplication,Oncorhynchus mykiss,Vertebrates}
}
@article{blankenbergGalaxyWebbasedGenome2010, @article{blankenbergGalaxyWebbasedGenome2010,
title = {Galaxy: A Web-Based Genome Analysis Tool for Experimentalists}, title = {Galaxy: A Web-Based Genome Analysis Tool for Experimentalists},
shorttitle = {Galaxy}, shorttitle = {Galaxy},
@ -124,49 +93,6 @@
keywords = {Computational biology and bioinformatics,Genome informatics,Genomic analysis,Sequencing,Software} keywords = {Computational biology and bioinformatics,Genome informatics,Genomic analysis,Sequencing,Software}
} }
@unpublished{caronCyberGalaxy2013,
title = {Towards a Cyber {{Galaxy}} ?},
author = {Caron, Christophe C. and Carré, Wilfrid and Cormier, Alexandre and Derozier, Sandra S. and Giacomoni, Franck and Inizan, Olivier and Le Corguillé, Gildas and Lermine, Alban and Maman Haddad, Sarah and Pericard, Pierre and Samson, Franck F.},
date = {2013-07},
series = {{{JOBIM TOULOUSE}} 2013 - {{RÉSUMÉS COURTS}} (Affiches)},
pages = {246},
url = {https://hal.inrae.fr/hal-02748994},
urldate = {2024-04-09},
abstract = {The success of the open web based platform “Galaxy” is growing among diverse scientific communities. The French Institute of Bioinformatics - IFB wish to initiate a collaborative work dedicated to scientific workflows and especially to the platform Galaxy. We report here the main items on which future collaborations could be build: (i) software and hardware architecture, (ii) tools integration and (iii) training.},
keywords = {formation,galaxy,intégration d'outils,NGS,partage de données,workflow},
annotation = {Published: JOBIM 2013}
}
@unpublished{caronFrenchCyberGalaxy2013,
title = {Toward a {{French}} Cyber {{Galaxy}} ?},
author = {Caron, Christophe C. and Carré, Wilfrid and Cormier, Alexandre and Derozier, Sandra S. and Giacomoni, Franck and Inizan, Olivier and Le Corguillé, Gildas and Lermine, Alban and Maman Haddad, Sarah and Pericard, Pierre and Samson, Franck F.},
date = {2013-06},
series = {Galaxy {{Community Conference}} 2013 : {{Posters}} / {{Abstracts}}},
pages = {online},
url = {https://hal.inrae.fr/hal-02748274},
urldate = {2024-04-09},
abstract = {The success of the open web based platform “Galaxy” is growing among scientific communities. The French Institute of Bioinformatics (IFB) wishes to initiate a collaborative work dedicated to scientific workflows and especially to the Galaxy platform. We report here the main items on which future collaborations could be build: (i) software and hardware architecture, (ii) tools integration and (iii) training. High throughput technologies advent significantly alters analysis behaviour and strategy with mobilization of new infrastructure, new tools and new skills. IFB decided to conduct a cross action on "workflows" data analysis solutions, and especially on the Galaxy platform. The first item called "software and hardware architecture" addresses the operational issues in production environments, the potential for automating deployment tasks and the monitoring solutions for Galaxy servers. With the second one, "Tools integration" we aim to provide processes facilitating tool interfacing in a Galaxy instance. Priority will be the development of a good practice guide, as well as a technology watch around the methods proposed by the international community. We also want to promote the sharing of training activities at national level (such as the Aviesan Bioinformatics school, January 2013 - http://galaxy-ecole.sb-roscoff.fr/) and ensure a smooth transition to new uses, such as E-learning. A first working group is already effective. Previous items will be improved in the coming months thanks to a specific dedicated wiki and the first French Galaxy Workshop this autumn.},
keywords = {data sharing,Galaxy,NGS,tools integration,training,workflow},
annotation = {Published: Galaxy Community Conference}
}
@article{casneufNonrandomDivergenceGene2006,
title = {Nonrandom Divergence of Gene Expression Following Gene and Genome Duplications in the Flowering Plant {{Arabidopsis}} Thaliana},
author = {Casneuf, Tineke and De Bodt, Stefanie and Raes, Jeroen and Maere, Steven and Van de Peer, Yves},
date = {2006-02-20},
journaltitle = {Genome Biology},
shortjournal = {Genome Biology},
volume = {7},
number = {2},
pages = {R13},
issn = {1474-760X},
doi = {10.1186/gb-2006-7-2-r13},
url = {https://doi.org/10.1186/gb-2006-7-2-r13},
urldate = {2024-04-13},
abstract = {Genome analyses have revealed that gene duplication in plants is rampant. Furthermore, many of the duplicated genes seem to have been created through ancient genome-wide duplication events. Recently, we have shown that gene loss is strikingly different for large- and small-scale duplication events and highly biased towards the functional class to which a gene belongs. Here, we study the expression divergence of genes that were created during large- and small-scale gene duplication events by means of microarray data and investigate both the influence of the origin (mode of duplication) and the function of the duplicated genes on expression divergence.},
keywords = {2read,Additional Data File,Anchor Point,Duplicate Gene,Duplication Event,Expression Divergence}
}
@report{charlesFinalisationPipelineFTAG2023, @report{charlesFinalisationPipelineFTAG2023,
type = {Internship Report}, type = {Internship Report},
title = {Finalisation du pipeline FTAG (Families and TAG) Finder, un outil de détection des gènes dupliqués sous Galaxy}, title = {Finalisation du pipeline FTAG (Families and TAG) Finder, un outil de détection des gènes dupliqués sous Galaxy},
@ -227,30 +153,6 @@
langid = {english} langid = {english}
} }
@article{denoeudAnalyseGenomesRecherche,
title = {Analyse des génomes à la recherche de répétitions en tandem polymorphes: outils d'épidémiologie bactérienne et locus hypermutables humains},
author = {Denoeud, France},
langid = {french}
}
@article{desponsTandemGeneArrays2011,
title = {Tandem Gene Arrays, Plastic Chromosomal Organizations},
author = {Despons, Laurence and Uzunov, Zlatyo and Louis, Véronique Leh},
date = {2011-08-01},
journaltitle = {Comptes Rendus Biologies},
shortjournal = {Comptes Rendus Biologies},
series = {Ten Years of Genomic Exploration in Eukaryotes : Strategy and Progress of {{Genolevures}}},
volume = {334},
number = {8},
pages = {639--646},
issn = {1631-0691},
doi = {10.1016/j.crvi.2011.05.012},
url = {https://www.sciencedirect.com/science/article/pii/S1631069111001454},
urldate = {2024-04-09},
abstract = {This short article presents an overview of tandem gene arrays (TGAs) in hemiascomycete yeasts. In silico and in vivo analyses are combined to address structural, functional and evolutionary aspects of these particular chromosomal structures. Genomic instability of TGAs is discussed. We conclude that TGAs are generally dynamic regions of the genome in that they are the seats of chromosomal rearrangement events. In addition, they are often breeding grounds of new genes for a rapid adaptation of cells to demands of the environment. Résumé Ce court article présente une vue d'ensemble des tandems de gènes chez les levures hémiascomycètes. Des analyses in silico et in vivo ont été combinées pour aborder les aspects structuraux, fonctionnels et évolutifs de ces structures chromosomiques particulières. L'instabilité génomique des tandems de gènes est discutée. Nous concluons que les tandems de gènes sont généralement des régions dynamiques du génome car ils sont le siège d'événements de réarrangements chromosomiques. De surcroît, ils sont souvent des zones de reproduction de nouveaux gènes pour une adaptation rapide des cellules aux demandes de l'environnement.},
keywords = {Chromosomal rearrangements,Duplication de gènes en tandem,Evolution,Évolution,Levure,Réarrangements chromosomiques,Tandem gene duplication,Yeast}
}
@article{ditommasoNextflowEnablesReproducible2017, @article{ditommasoNextflowEnablesReproducible2017,
title = {Nextflow Enables Reproducible Computational Workflows}, title = {Nextflow Enables Reproducible Computational Workflows},
author = {Di Tommaso, Paolo and Chatzou, Maria and Floden, Evan W and Barja, Pablo Prieto and Palumbo, Emilio and Notredame, Cedric}, author = {Di Tommaso, Paolo and Chatzou, Maria and Floden, Evan W and Barja, Pablo Prieto and Palumbo, Emilio and Notredame, Cedric},
@ -302,20 +204,14 @@
langid = {english} langid = {english}
} }
@article{duarteExpressionPatternShifts2006, @online{DupliquerPourAdapter2020,
title = {Expression {{Pattern Shifts Following Duplication Indicative}} of {{Subfunctionalization}} and {{Neofunctionalization}} in {{Regulatory Genes}} of {{Arabidopsis}}}, title = {Dupliquer pour sadapter ou comment accélérer lévolution des plantes ? | CNRS Biologie},
author = {Duarte, Jill M. and Cui, Liying and Wall, P. Kerr and Zhang, Qing and Zhang, Xiaohong and Leebens-Mack, Jim and Ma, Hong and Altman, Naomi and {dePamphilis}, Claude W.}, shorttitle = {Dupliquer pour sadapter ou comment accélérer lévolution des plantes ?},
date = {2006-02-01}, date = {2020-10-14},
journaltitle = {Molecular Biology and Evolution}, url = {https://www.insb.cnrs.fr/fr/cnrsinfo/dupliquer-pour-sadapter-ou-comment-accelerer-levolution-des-plantes},
shortjournal = {Molecular Biology and Evolution}, urldate = {2024-03-25},
volume = {23}, abstract = {Les duplications de portions de chromosomes permettant aux organismes de dupliquer des gènes existants et den créer de nouveaux sont bien},
number = {2}, langid = {french}
pages = {469--478},
issn = {0737-4038},
doi = {10.1093/molbev/msj051},
url = {https://doi.org/10.1093/molbev/msj051},
urldate = {2024-04-14},
abstract = {Gene duplication plays an important role in the evolution of diversity and novel function and is especially prevalent in the nuclear genomes of flowering plants. Duplicate genes may be maintained through subfunctionalization and neofunctionalization at the level of expression or coding sequence. In order to test the hypothesis that duplicated regulatory genes will be differentially expressed in a specific manner indicative of regulatory subfunctionalization and/or neofunctionalization, we examined expression pattern shifts in duplicated regulatory genes in Arabidopsis. A two-way analysis of variance was performed on expression data for 280 phylogenetically identified paralogous pairs. Expression data were extracted from global expression profiles for wild-type root, stem, leaf, developing inflorescence, nearly mature flower buds, and seedpod. Gene, organ, and gene by organ interaction (G × O) effects were examined. Results indicate that 85\% of the paralogous pairs exhibited a significant G × O effect indicative of regulatory subfunctionalization and/or neofunctionalization. A significant G × O effect was associated with complementary expression patterns in 45\% of pairwise comparisons. No association was detected between a G × O effect and a relaxed evolutionary constraint as detected by the ratio of nonsynonymous to synonymous substitutions. Ancestral gene expression patterns inferred across a Type II MADS-box gene phylogeny suggest several cases of regulatory neofunctionalization and organ-specific nonfunctionalization. Complete linkage clustering of gene expression levels across organs suggests that regulatory modules for each organ are independent or ancestral genes had limited expression. We propose a new classification, regulatory hypofunctionalization, for an overall decrease in expression level in one member of a paralogous pair while still having a significant G × O effect. 
We conclude that expression divergence specifically indicative of subfunctionalization and/or neofunctionalization contributes to the maintenance of most if not all duplicated regulatory genes in Arabidopsis and hypothesize that this results in increasing expression diversity or specificity of regulatory genes after each round of duplication.}
} }
@article{emmsOrthoFinderPhylogeneticOrthology2019, @article{emmsOrthoFinderPhylogeneticOrthology2019,
@ -354,17 +250,6 @@
keywords = {Blast Score,Gene Length,Phylogenetic Distance,Sequence Similarity Score,Transcription Factor Gene Family} keywords = {Blast Score,Gene Length,Phylogenetic Distance,Sequence Similarity Score,Transcription Factor Gene Family}
} }
@video{evry-senartsciencesetinnovationCareneRizzonUEVE2014,
entrysubtype = {video},
title = {Carène {{Rizzon}} ({{UEVE}}) - {{Etude}} de l'évolution Des Gènes Dupliqués},
editor = {{Evry-Sénart Sciences et Innovation}},
editortype = {director},
date = {2014},
url = {https://www.youtube.com/watch?v=ubiOE7w3374},
urldate = {2024-04-10},
abstract = {Colloque ESI 2014 "Evry Bio \& Evry STIC" organisé par Evry Sciences et Innovation le 30 avril 2014 à Evry. Intervention de Carène Rizzon de l'Université d'Évry: "Étude de l'évolution des gènes dupliqués chez Arabidopsis thaliana via les réseaux biologiques".}
}
@article{gautRecombinationUnderappreciatedFactor2007, @article{gautRecombinationUnderappreciatedFactor2007,
title = {Recombination: An Underappreciated Factor in the Evolution of Plant Genomes}, title = {Recombination: An Underappreciated Factor in the Evolution of Plant Genomes},
shorttitle = {Recombination}, shorttitle = {Recombination},
@ -437,63 +322,12 @@
keywords = {Aegilops,molecular evolution,plasmon and B genome inheritance,Triticum,wheat} keywords = {Aegilops,molecular evolution,plasmon and B genome inheritance,Triticum,wheat}
} }
@article{grahamTandemGenesClustered1995,
title = {Tandem Genes and Clustered Genes},
author = {Graham, Geoffrey J.},
date = {1995-07-07},
journaltitle = {Journal of Theoretical Biology},
shortjournal = {Journal of Theoretical Biology},
volume = {175},
number = {1},
pages = {71--87},
issn = {0022-5193},
doi = {10.1006/jtbi.1995.0122},
url = {https://www.sciencedirect.com/science/article/pii/S0022519385701221},
urldate = {2024-04-09},
abstract = {Two patterns of gene repetition are described: tandem arraying and clustering. Tandemly arrayed genes reside within segments of DNA that are repeated head-to-tail a number of times. Clustered genes are linked but irregularly spaced, are often mutually inverted in an unpredictable pattern and are connected by non-conserved DNA. Tandem arrays are homogenized by both unequal recombination and gene conversion, are necessary for the maintenance of large gene families, can expand and contract rapidly in response to changing demand, can keep functionally related genes equal in number, and do not engender increased genetic complexity. Gene clusters are homogenized by conversion only, seldom if ever contain more than 50 members, are stable in number, and often engender increased genetic complexity. The interrelationships among these properties are discussed. Tandem gene arrays can evolve into gene clusters. It is suggested that this occurs when some change in the array inhibits unequal recombination but not gene conversion. The most common such change is inversion of part of the tandem array with respect to the rest; however, arrays can evolve into clusters without inversion. Clustered genes are sometimes re-amplified into new tandem arrays. Clustered genes are probably more durable than tandemly arrayed genes during periods of relaxed selection, and in the case of fish antifreeze protein genes, seem to behave as a genetic memory.}
}
@article{hanadaImportanceLineagespecificExpansion2008,
title = {Importance of Lineage-Specific Expansion of Plant Tandem Duplicates in the Adaptive Response to Environmental Stimuli},
author = {Hanada, Kousuke and Zou, Cheng and Lehti-Shiu, Melissa D. and Shinozaki, Kazuo and Shiu, Shin-Han},
date = {2008-10},
journaltitle = {Plant Physiology},
shortjournal = {Plant Physiol},
volume = {148},
number = {2},
eprint = {18715958},
eprinttype = {pmid},
pages = {993--1003},
issn = {0032-0889},
doi = {10.1104/pp.108.122457},
abstract = {Plants have substantially higher gene duplication rates compared with most other eukaryotes. These plant gene duplicates are mostly derived from whole genome and/or tandem duplications. Earlier studies have shown that a large number of duplicate genes are retained over a long evolutionary time, and there is a clear functional bias in retention. However, the influence of duplication mechanism, particularly tandem duplication, on duplicate retention has not been thoroughly investigated. We have defined orthologous groups (OGs) between Arabidopsis (Arabidopsis thaliana) and three other land plants to examine the functional bias of retained duplicate genes during vascular plant evolution. Based on analysis of Gene Ontology categories, it is clear that genes in OGs that expanded via tandem duplication tend to be involved in responses to environmental stimuli, while those that expanded via nontandem mechanisms tend to have intracellular regulatory roles. Using Arabidopsis stress expression data, we further demonstrated that tandem duplicates in expanded OGs are significantly enriched in genes that are up-regulated by biotic stress conditions. In addition, tandem duplication of genes in an OG tends to be highly asymmetric. That is, expansion of OGs with tandem genes in one organismal lineage tends to be coupled with losses in the other. This is consistent with the notion that these tandem genes have experienced lineage-specific selection. In contrast, OGs with genes duplicated via nontandem mechanisms tend to experience convergent expansion, in which similar numbers of genes are gained in parallel. Our study demonstrates that the expansion of gene families and the retention of duplicates in plants exhibit substantial functional biases that are strongly influenced by the mechanism of duplication. 
In particular, genes involved in stress responses have an elevated probability of retention in a single-lineage fashion following tandem duplication, suggesting that these tandem duplicates are likely important for adaptive evolution to rapidly changing environments.},
langid = {english},
pmcid = {PMC2556807},
keywords = {Adaptation Biological,Arabidopsis,Evolution Molecular,Gene Duplication,Genes Duplicate,Genes Plant,Genome Plant,Multigene Family,Oligonucleotide Array Sequence Analysis,Phylogeny}
}
@online{HomeCromwell, @online{HomeCromwell,
title = {Home - {{Cromwell}}}, title = {Home - {{Cromwell}}},
url = {https://cromwell.readthedocs.io/en/stable/}, url = {https://cromwell.readthedocs.io/en/stable/},
urldate = {2024-03-27} urldate = {2024-03-27}
} }
@online{HttpsMicansOrg,
title = {Using {{MCL}} to Extract Clusters from Networks},
author = {van Dongen, Stijn and Abreu-Goodger, Cei},
date = {2012},
url = {https://micans.org/mcl/lit/mimb.pdf},
urldate = {2024-04-11}
}
@report{jasminStudyTandemlyArrayed2016,
type = {Internship Report},
title = {Study of Tandemly Arrayed Genes Expression for {{Arabidopsis}} Thaliana},
author = {Jasmin, Fabien},
date = {2016-06},
institution = {Laboratoire de Mathématiques et Modélisation d'Évry},
abstract = {Tandemly arrayed genes, also called TAGs, are duplicated genes which come from tandem arrayed duplication. They can be separated or not by few genes called spacers. Although duplicated genes are commonly studied, TAGs features remain little known. In this study, I performed a statistical analysis of Arabidopsis thaliana TAGs using genomic and transcriptomic data of high quality providing from TAIR database and CATdb. After merging the different data and assessing it, I observed the distribution of the different size of TAG and the behaviour of TAGs depending on the number of spacers that I made vary from 0 to 10 in my survey. I equally defined different list of gene pairs to easily compare TAGs to other type of genes. In all 5 lists have been defined during my investigation. The defined lists are random genes pairs list, duplicated genes pairs list, successive genes pairs list, local genes pairs list and TAGs pairs list. After creating all lists previously defined, I made gene pairs lists comparisons between TAGs pair list and the other type of gene pairs list according to different features such as the effect of abiotic or biotic stress conditions, the genes orientation, or the correlation of the expression profiles.},
langid = {english}
}
@video{javiernovoDuplicationGenes2015, @video{javiernovoDuplicationGenes2015,
entrysubtype = {video}, entrysubtype = {video},
title = {Duplication of Genes}, title = {Duplication of Genes},
@ -556,22 +390,6 @@
pubstate = {preprint} pubstate = {preprint}
} }
@inproceedings{lajoieEvolutionTandemlyArrayed2007,
title = {Evolution of {{Tandemly Arrayed Genes}} in {{Multiple Species}}},
booktitle = {Comparative {{Genomics}}},
author = {Lajoie, Mathieu and Bertrand, Denis and El-Mabrouk, Nadia},
editor = {Tesler, Glenn and Durand, Dannie},
date = {2007},
pages = {96--109},
publisher = {Springer},
location = {Berlin, Heidelberg},
doi = {10.1007/978-3-540-74960-8_8},
abstract = {Tandemly arrayed genes (TAG) constitute a large fraction of most genomes and play important biological roles. They evolve through unequal recombination, which places duplicated genes next to the original ones (tandem duplications). Many algorithms have been proposed to infer a tandem duplication history for a TAG cluster in a single species. However, the presence of different transcriptional orientations in most TAG clusters highlight the fact that processes such as inversions also contribute to their evolution. This makes those algorithms unsuitable in many cases. To circumvent this limitation, we proposed in a previous work an extended evolutionary model which includes inversions and presented a branch-and-bound algorithm allowing to infer a most parsimonious scenario of evolution for a given TAG cluster. Here, we generalize this model to multiple species and present a general framework to infer ancestral gene orders that minimize the number of inversions in the whole evolutionary history. An application on a pair of human-rat TAG clusters is presented.},
isbn = {978-3-540-74960-8},
langid = {english},
keywords = {Ancestral Genome,Gene Order,Gene Tree,Inversion Event,Tandem Duplication}
}
@thesis{lallemandEvolutionGenesDupliques2022, @thesis{lallemandEvolutionGenesDupliques2022,
type = {phdthesis}, type = {phdthesis},
title = {Évolution des gènes dupliqués chez le pommier : Identification et caractérisation de la dominance du sous-génome dans le génome de la pomme}, title = {Évolution des gènes dupliqués chez le pommier : Identification et caractérisation de la dominance du sous-génome dans le génome de la pomme},
@ -620,13 +438,6 @@
langid = {english} langid = {english}
} }
@article{landes-devauchelleArtResumerPour,
title = {De l'art de résumer pour tenter de comprendre en génomique évolutive},
author = {Landès-Devauchelle, Claudine},
url = {http://www.math-evry.cnrs.fr/_media/publications/devauchelle_hdr_2011.pdf},
langid = {french}
}
@article{lannesDoesPresenceTransposable2019, @article{lannesDoesPresenceTransposable2019,
title = {Does the {{Presence}} of {{Transposable Elements Impact}} the {{Epigenetic Environment}} of {{Human Duplicated Genes}}?}, title = {Does the {{Presence}} of {{Transposable Elements Impact}} the {{Epigenetic Environment}} of {{Human Duplicated Genes}}?},
author = {Lannes, Romain and Rizzon, Carène and Lerat, Emmanuelle}, author = {Lannes, Romain and Rizzon, Carène and Lerat, Emmanuelle},
@ -646,16 +457,6 @@
pmcid = {PMC6470583} pmcid = {PMC6470583}
} }
@report{le-hoangEtudeTranscriptomiqueGenes2017,
type = {Internship Report},
title = {Etude transcriptomique des gènes dupliqués en tandem (TAG) chez Arabidopsis thaliana},
author = {Lê-Hoang, Julie},
date = {2017},
pages = {74},
institution = {Laboratoire de Mathématiques et Modélisation d'Évry},
langid = {french}
}
@thesis{leducEtudeEvolutionGenes, @thesis{leducEtudeEvolutionGenes,
title = {Étude de lévolution des gènes dupliqués chez les Rosaceae}, title = {Étude de lévolution des gènes dupliqués chez les Rosaceae},
author = {Leduc, Martin}, author = {Leduc, Martin},
@ -728,22 +529,6 @@
pmcid = {PMC6347962} pmcid = {PMC6347962}
} }
@online{moixPhylogeneticPlacementWhole2023,
title = {Phylogenetic Placement of Whole Genome Duplications in Yeasts through Quantitative Analysis of Hierarchical Orthologous Groups},
author = {Moix, Samuel and Glover, Natasha and Majidian, Sina},
date = {2023-04-12},
number = {12:382},
eprint = {12:382},
eprinttype = {F1000Research},
doi = {10.12688/f1000research.128656.1},
url = {https://f1000research.com/articles/12-382},
urldate = {2024-04-17},
abstract = {Background: Whole genome duplications (WGD) are genomic events leading to formation of polyploid organisms. Resulting duplicated genes play important roles in driving species evolution and diversification. After such events, the initial ploidy is usually restored, complicating their detection across evolution. With the advance of bioinformatics and the rising number of new well-assembled genomes, new detection methods are ongoingly being developed to overcome the weaknesses of different approaches. Results: Here we propose a novel method for detecting WGD in yeast lineages based on the quantitative and comparative analysis of hierarchical orthologous groups (HOGs) of duplicated genes for a given set of organisms. We reconstruct ancestral genomes to obtain evolutionary information for each phylogenetic branch. This reconstruction relies on the inference of HOGs from the selected species proteomes. To estimate WGD events, the number of HOGs of duplicated genes across all taxonomic ranges are adjusted according to the molecular clock hypothesis and by the average genome size. Branches with a significant increase in the adjusted number of duplicated gene families are kept as candidates for WGD placement. The developed method was tested on two real datasets and showed promising results in phylogenetic WGD placements on the yeast lineage.},
langid = {english},
pubstate = {preprint},
keywords = {comparative genomics,orthologous groups,whole genome duplications,yeast}
}
@online{molderSustainableDataAnalysis2021a, @online{molderSustainableDataAnalysis2021a,
title = {Sustainable Data Analysis with {{Snakemake}}}, title = {Sustainable Data Analysis with {{Snakemake}}},
author = {Mölder, Felix and Jablonski, Kim Philipp and Letcher, Brice and Hall, Michael B. and Tomkins-Tinch, Christopher H. and Sochat, Vanessa and Forster, Jan and Lee, Soohyun and Twardziok, Sven O. and Kanitz, Alexander and Wilm, Andreas and Holtgrewe, Manuel and Rahmann, Sven and Nahnsen, Sven and Köster, Johannes}, author = {Mölder, Felix and Jablonski, Kim Philipp and Letcher, Brice and Hall, Michael B. and Tomkins-Tinch, Christopher H. and Sochat, Vanessa and Forster, Jan and Lee, Soohyun and Twardziok, Sven O. and Kanitz, Alexander and Wilm, Andreas and Holtgrewe, Manuel and Rahmann, Sven and Nahnsen, Sven and Köster, Johannes},
@ -788,40 +573,6 @@
langid = {english} langid = {english}
} }
@article{ottoRecombinationSelectionEvolution2022,
title = {Recombination, Selection, and the Evolution of Tandem Gene Arrays},
author = {Otto, Moritz and Zheng, Yichen and Wiehe, Thomas},
date = {2022-07-01},
journaltitle = {Genetics},
shortjournal = {Genetics},
volume = {221},
number = {3},
pages = {iyac052},
issn = {1943-2631},
doi = {10.1093/genetics/iyac052},
url = {https://doi.org/10.1093/genetics/iyac052},
urldate = {2024-04-09},
abstract = {Multigene families—immunity genes or sensory receptors, for instance—are often subject to diversifying selection. Allelic diversity may be favored not only through balancing or frequency-dependent selection at individual loci but also by associating different alleles in multicopy gene families. Using a combination of analytical calculations and simulations, we explored a population genetic model of epistatic selection and unequal recombination, where a trade-off exists between the benefit of allelic diversity and the cost of copy abundance. Starting from the neutral case, where we showed that gene copy number is Gamma distributed at equilibrium, we derived also the mean and shape of the limiting distribution under selection. Considering a more general model, which includes variable population size and population substructure, we explored by simulations mean fitness and some summary statistics of the copy number distribution. We determined the relative effects of selection, recombination, and demographic parameters in maintaining allelic diversity and shaping the mean fitness of a population. One way to control the variance of copy number is by lowering the rate of unequal recombination. Indeed, when encoding recombination by a rate modifier locus, we observe exactly this prediction. Finally, we analyzed the empirical copy number distribution of 3 genes in human and estimated recombination and selection parameters of our model.}
}
@article{panTandemlyArrayedGenes2008,
title = {Tandemly {{Arrayed Genes}} in {{Vertebrate Genomes}}},
author = {Pan, Deng and Zhang, Liqing},
date = {2008},
journaltitle = {Comparative and Functional Genomics},
shortjournal = {Comp Funct Genomics},
volume = {2008},
eprint = {18815629},
eprinttype = {pmid},
pages = {545269},
issn = {1531-6912},
doi = {10.1155/2008/545269},
url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2547482/},
urldate = {2024-04-09},
abstract = {Tandemly arrayed genes (TAGs) are duplicated genes that are linked as neighbors on a chromosome, many of which have important physiological and biochemical functions. Here we performed a survey of these genes in 11 available vertebrate genomes. TAGs account for an average of about 14\% of all genes in these vertebrate genomes, and about 25\% of all duplications. The majority of TAGs (72--94\%) have parallel transcription orientation (i.e., they are encoded on the same strand) in contrast to the genome, which has about 50\% of its genes in parallel transcription orientation. The majority of tandem arrays have only two members. In all species, the proportion of genes that belong to TAGs tends to be higher in large gene families than in small ones; together with our recent finding that tandem duplication played a more important role than retroposition in large families, this fact suggests that among all types of duplication mechanisms, tandem duplication is the predominant mechanism of duplication, especially in large families. Finally, several species have a higher proportion of large tandem arrays that are species-specific than random expectation.},
pmcid = {PMC2547482}
}
@online{PEPkitBioData, @online{PEPkitBioData,
title = {{{PEPkit}}: The Bio Data Management Toolkit - {{PEPkit}}: The Bio Data Management Toolkit}, title = {{{PEPkit}}: The Bio Data Management Toolkit - {{PEPkit}}: The Bio Data Management Toolkit},
url = {https://pep.databio.org/}, url = {https://pep.databio.org/},
@ -845,23 +596,6 @@
langid = {english} langid = {english}
} }
@article{picart-picoloLargeTandemDuplications2020a,
title = {Large Tandem Duplications Affect Gene Expression, {{3D}} Organization, and Plant--Pathogen Response},
author = {Picart-Picolo, Ariadna and Grob, Stefan and Picault, Nathalie and Franek, Michal and Llauro, Christel and Halter, Thierry and Maier, Tom R. and Jobet, Edouard and Descombin, Julie and Zhang, Panpan and Paramasivan, Vijayapalani and Baum, Thomas J. and Navarro, Lionel and Dvořáčková, Martina and Mirouze, Marie and Pontvianne, Frédéric},
date = {2020-10-08},
journaltitle = {Genome Research},
shortjournal = {Genome Res.},
eprint = {33033057},
eprinttype = {pmid},
publisher = {Cold Spring Harbor Lab},
issn = {1088-9051, 1549-5469},
doi = {10.1101/gr.261586.120},
url = {https://genome.cshlp.org/content/early/2020/10/05/gr.261586.120},
urldate = {2024-04-09},
abstract = {Rapid plant genome evolution is crucial to adapt to environmental changes. Chromosomal rearrangements and gene copy number variation (CNV) are two important tools for genome evolution and sources for the creation of new genes. However, their emergence takes many generations. In this study, we show that in Arabidopsis thaliana, a significant loss of ribosomal RNA (rRNA) genes with a past history of a mutation for the chromatin assembly factor 1 (CAF1) complex causes rapid changes in the genome structure. Using long-read sequencing and microscopic approaches, we have identified up to 15 independent large tandem duplications in direct orientation (TDDOs) ranging from 60 kb to 1.44 Mb. Our data suggest that these TDDOs appeared within a few generations, leading to the duplication of hundreds of genes. By subsequently focusing on a line only containing 20\% of rRNA gene copies (20rDNA line), we investigated the impact of TDDOs on 3D genome organization, gene expression, and cytosine methylation. We found that duplicated genes often accumulate more transcripts. Among them, several are involved in plant--pathogen response, which could explain why the 20rDNA line is hyper-resistant to both bacterial and nematode infections. Finally, we show that the TDDOs create gene fusions and/or truncations and discuss their potential implications for the evolution of plant genomes.},
langid = {english}
}
@online{ponsComputingCommunitiesLarge2005, @online{ponsComputingCommunitiesLarge2005,
title = {Computing Communities in Large Networks Using Random Walks (Long Version)}, title = {Computing Communities in Large Networks Using Random Walks (Long Version)},
author = {Pons, Pascal and Latapy, Matthieu}, author = {Pons, Pascal and Latapy, Matthieu},
@ -871,22 +605,11 @@
doi = {10.48550/arXiv.physics/0512106}, doi = {10.48550/arXiv.physics/0512106},
url = {http://arxiv.org/abs/physics/0512106}, url = {http://arxiv.org/abs/physics/0512106},
urldate = {2024-03-30}, urldate = {2024-03-30},
abstract = {Dense subgraphs of sparse graphs (communities), which appear in most real-world complex networks, play an important role in many contexts. Computing them however is generally expensive. We propose here a measure of similarities between vertices based on random walks which has several important advantages: it captures well the community structure in a network, it can be computed efficiently, and it can be used in an agglomerative algorithm to compute efficiently the community structure of a network. We propose such an algorithm, called Walktrap, which runs in time O(mn\textasciicircum 2) and space O(n\textasciicircum 2) in the worst case, and in time O(n\textasciicircum 2log n) and space O(n\textasciicircum 2) in most real-world cases (n and m are respectively the number of vertices and edges in the input graph). Extensive comparison tests show that our algorithm surpasses previously proposed ones concerning the quality of the obtained community structures and that it stands among the best ones concerning the running time.}, abstract = {Dense subgraphs of sparse graphs (communities), which appear in most real-world complex networks, play an important role in many contexts. Computing them however is generally expensive. We propose here a measure of similarities between vertices based on random walks which has several important advantages: it captures well the community structure in a network, it can be computed efficiently, and it can be used in an agglomerative algorithm to compute efficiently the community structure of a network. We propose such an algorithm, called Walktrap, which runs in time O(mn\^{}2) and space O(n\^{}2) in the worst case, and in time O(n\^{}2log n) and space O(n\^{}2) in most real-world cases (n and m are respectively the number of vertices and edges in the input graph). 
Extensive comparison tests show that our algorithm surpasses previously proposed ones concerning the quality of the obtained community structures and that it stands among the best ones concerning the running time.},
pubstate = {preprint}, pubstate = {preprint},
keywords = {Condensed Matter - Disordered Systems and Neural Networks,Condensed Matter - Statistical Mechanics,Physics - Physics and Society} keywords = {Condensed Matter - Disordered Systems and Neural Networks,Condensed Matter - Statistical Mechanics,Physics - Physics and Society}
} }
@online{pontvianneDupliquerPourAdapter2020,
title = {Dupliquer pour s'adapter ou comment accélérer l'évolution des plantes ? | CNRS Biologie},
shorttitle = {Dupliquer pour s'adapter ou comment accélérer l'évolution des plantes ?},
author = {Pontvianne, Frédéric},
date = {2020-10-14},
url = {https://www.insb.cnrs.fr/fr/cnrsinfo/dupliquer-pour-sadapter-ou-comment-accelerer-levolution-des-plantes},
urldate = {2024-03-25},
abstract = {Les duplications de portions de chromosomes permettant aux organismes de dupliquer des gènes existants et d'en créer de nouveaux sont bien},
langid = {french}
}
@article{reamsSelectionGeneClustering2004, @article{reamsSelectionGeneClustering2004,
title = {Selection for {{Gene Clustering}} by {{Tandem Duplication}}}, title = {Selection for {{Gene Clustering}} by {{Tandem Duplication}}},
author = {Reams, Andrew B. and Neidle, Ellen L.}, author = {Reams, Andrew B. and Neidle, Ellen L.},
@ -917,25 +640,6 @@
abstract = {We analyzed the distribution of 54 families of transposable elements (TEs; transposons, LTR retrotransposons, and non-LTR retrotransposons) in the chromosomes of Drosophila melanogaster, using data from the sequenced genome. The density of LTR and non-LTR retrotransposons (RNA-based elements) was high in regions with low recombination rates, but there was no clear tendency to parallel the recombination rate. However, the density of transposons (DNA-based elements) was significantly negatively correlated with recombination rate. The accumulation of TEs in regions of reduced recombination rate is compatible with selection acting against TEs, as selection is expected to be weaker in regions with lower recombination. The differences in the relationship between recombination rate and TE density that exist between chromosome arms suggest that TE distribution depends on specific characteristics of the chromosomes (chromatin structure, distribution of other sequences), the TEs themselves (transposition mechanism), and the species (reproductive system, effective population size, etc.), that have differing influences on the effect of natural selection acting against the TE insertions.} abstract = {We analyzed the distribution of 54 families of transposable elements (TEs; transposons, LTR retrotransposons, and non-LTR retrotransposons) in the chromosomes of Drosophila melanogaster, using data from the sequenced genome. The density of LTR and non-LTR retrotransposons (RNA-based elements) was high in regions with low recombination rates, but there was no clear tendency to parallel the recombination rate. However, the density of transposons (DNA-based elements) was significantly negatively correlated with recombination rate. The accumulation of TEs in regions of reduced recombination rate is compatible with selection acting against TEs, as selection is expected to be weaker in regions with lower recombination. 
The differences in the relationship between recombination rate and TE density that exist between chromosome arms suggest that TE distribution depends on specific characteristics of the chromosomes (chromatin structure, distribution of other sequences), the TEs themselves (transposition mechanism), and the species (reproductive system, effective population size, etc.), that have differing influences on the effect of natural selection acting against the TE insertions.}
} }
@article{rizzonStrikingSimilaritiesGenomic2006,
title = {Striking {{Similarities}} in the {{Genomic Distribution}} of {{Tandemly Arrayed Genes}} in {{Arabidopsis}} and {{Rice}}},
author = {Rizzon, Carène and Ponger, Loïc and Gaut, Brandon S.},
date = {2006-09},
journaltitle = {PLoS Computational Biology},
shortjournal = {PLoS Comput Biol},
volume = {2},
number = {9},
eprint = {16948529},
eprinttype = {pmid},
pages = {e115},
issn = {1553-734X},
doi = {10.1371/journal.pcbi.0020115},
url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1557586/},
urldate = {2024-04-10},
abstract = {In Arabidopsis, tandemly arrayed genes (TAGs) comprise {$>$}10\% of the genes in the genome. These duplicated genes represent a rich template for genetic innovation, but little is known of the evolutionary forces governing their generation and maintenance. Here we compare the organization and evolution of TAGs between Arabidopsis and rice, two plant genomes that diverged \textasciitilde 150 million years ago. TAGs from the two genomes are similar in a number of respects, including the proportion of genes that are tandemly arrayed, the number of genes within an array, the number of tandem arrays, and the dearth of TAGs relative to single copy genes in centromeric regions. Analysis of recombination rates along rice chromosomes confirms a positive correlation between the occurrence of TAGs and recombination rate, as found in Arabidopsis. TAGs are also biased functionally relative to duplicated, nontandemly arrayed genes. In both genomes, TAGs are enriched for genes that encode membrane proteins and function in “abiotic and biotic stress” but underrepresented for genes involved in transcription and DNA or RNA binding functions. We speculate that these observations reflect an evolutionary trend in which successful tandem duplication involves genes either at the end of biochemical pathways or in flexible steps in a pathway, for which fluctuation in copy number is unlikely to affect downstream genes. Despite differences in the age distribution of tandem arrays, the striking similarities between rice and Arabidopsis indicate similar mechanisms of TAG generation and maintenance. The nuclear genomes of higher plants vary tremendously in size and gene content. Much of this variation is attributable to gene duplication. To date, most studies of plant gene duplication have focused on whole genome duplication events, which duplicate all genes simultaneously.
Another prominent process is single gene duplication, which often results in duplicated genes arranged in a tandem array. Here Rizzon, Ponger, and Gaut identify tandem arrays in rice and their genome organization between Arabidopsis and rice, two plant species that diverged \textasciitilde 150 million years ago. The two genomes contain a similar proportion of genes that are tandemly arrayed, with a similar number of genes within an array. Moreover, tandemly arrayed genes are most common in genomic regions of high recombination in both species. This organization appears to be a general feature of eukaryotic genomes, perhaps because duplication rates are higher in high recombination regions. Tandemly arrayed genes of rice and Arabidopsis also represent a biased gene set with regard to function. In contrast to genes duplicated through whole genome events, tandemly arrayed genes are enriched for genes that encode membrane proteins and genes that function in response to environmental stresses. Taken together, these observations suggest that tandemly arrayed genes represent a rich and relatively fluid source for plant adaptation.},
pmcid = {PMC1557586}
}
@article{rognesParAlignParallelSequence2001, @article{rognesParAlignParallelSequence2001,
title = {{{ParAlign}}: A Parallel Sequence Alignment Algorithm for Rapid and Sensitive Database Searches}, title = {{{ParAlign}}: A Parallel Sequence Alignment Algorithm for Rapid and Sensitive Database Searches},
shorttitle = {{{ParAlign}}}, shorttitle = {{{ParAlign}}},
@ -986,24 +690,6 @@
issue = {suppl\_2} issue = {suppl\_2}
} }
@article{shojaRoadmapTandemlyArrayed2006,
title = {A Roadmap of Tandemly Arrayed Genes in the Genomes of Human, Mouse, and Rat},
author = {Shoja, Valia and Zhang, Liqing},
date = {2006-11},
journaltitle = {Molecular Biology and Evolution},
shortjournal = {Mol Biol Evol},
volume = {23},
number = {11},
eprint = {16901985},
eprinttype = {pmid},
pages = {2134--2141},
issn = {0737-4038},
doi = {10.1093/molbev/msl085},
abstract = {Tandemly arrayed genes (TAGs) play an important functional and physiological role in the genome. Most previous studies have focused on individual TAG families in a few species, yet a broad characterization of TAGs is not available. Here we identified all TAGs in the genomes of humans, mouse, and rat and performed a comprehensive analysis of TAG distribution, TAG sizes, TAG orientations and intergenic distances, and TAG functions. TAGs account for about 14-17\% of all genes in the genome and nearly one-third of all duplicated genes, highlighting the predominant role that tandem duplication plays in gene duplication. For all species, TAG distribution is highly heterogeneous along chromosomes and some chromosomes are enriched with TAG forests, whereas others are enriched with TAG deserts. The majority of TAGs are of size 2 for all genomes, similar to the previous findings in Caenorhabditis elegans, Arabidopsis thaliana, and Oryza sativa, suggesting that it is a rather general phenomenon in eukaryotes. The comparison with the genome patterns shows that TAG members have a significantly higher proportion of parallel gene orientation in all species, corroborating Graham's claim that parallel orientation is the preferred form of orientation in TAGs. Moreover, TAG members with parallel orientation tend to be closer to each other than all neighboring genes in the genome with parallel orientation. The analyses of Gene Ontology function indicate that genes with receptor or binding activities are significantly overrepresented by TAGs. Computer simulation reveals that random gene rearrangements have little effect on the statistics of TAGs for all genomes. Finally, the average proportion of TAGs shows a trend of increase with the increase of family sizes, although the correlation between TAG proportions in individual families and family sizes is not significant.},
langid = {english},
keywords = {Animals,Chromosome Mapping,Chromosomes,Gene Duplication,Genome,Humans,Mice,Rats,Recombination Genetic,Tandem Repeat Sequences}
}
@article{smithIdentificationCommonMolecular1981, @article{smithIdentificationCommonMolecular1981,
title = {Identification of Common Molecular Subsequences}, title = {Identification of Common Molecular Subsequences},
author = {Smith, T. F. and Waterman, M. S.}, author = {Smith, T. F. and Waterman, M. S.},
@ -1073,41 +759,6 @@
keywords = {Algorithms,Base Sequence,Chromosome Mapping,Computer Graphics,DNA,DNA Mutational Analysis,Molecular Sequence Data,Sequence Alignment,Sequence Analysis DNA,Software,User-Computer Interface} keywords = {Algorithms,Base Sequence,Chromosome Mapping,Computer Graphics,DNA,DNA Mutational Analysis,Molecular Sequence Data,Sequence Alignment,Sequence Analysis DNA,Software,User-Computer Interface}
} }
@article{thibaud-nissenIdentificationCharacterizationPseudogenes2009,
title = {Identification and Characterization of Pseudogenes in the Rice Gene Complement},
author = {Thibaud-Nissen, Françoise and Ouyang, Shu and Buell, C. Robin},
date = {2009-07-16},
journaltitle = {BMC Genomics},
shortjournal = {BMC Genomics},
volume = {10},
number = {1},
pages = {317},
issn = {1471-2164},
doi = {10.1186/1471-2164-10-317},
url = {https://doi.org/10.1186/1471-2164-10-317},
urldate = {2024-04-09},
abstract = {The Osa1 Genome Annotation of rice (Oryza sativa L. ssp. japonica cv. Nipponbare) is the product of a semi-automated pipeline that does not explicitly predict pseudogenes. As such, it is likely to mis-annotate pseudogenes as functional genes. A total of 22,033 gene models within the Osa1 Release 5 were investigated as potential pseudogenes as these genes exhibit at least one feature potentially indicative of pseudogenes: lack of transcript support, short coding region, long untranslated region, or, for genes residing within a segmentally duplicated region, lack of a paralog or significantly shorter corresponding paralog.},
langid = {english},
keywords = {Duplicate Region,GOSlim Term,Massively Parallel Signature Sequencing,Massively Parallel Signature Sequencing Data,Paralogous Family}
}
@article{tremblaysavardEvolutionOrthologousTandemly2011,
title = {Evolution of Orthologous Tandemly Arrayed Gene Clusters},
author = {Tremblay Savard, Olivier and Bertrand, Denis and El-Mabrouk, Nadia},
date = {2011-10-05},
journaltitle = {BMC Bioinformatics},
shortjournal = {BMC Bioinformatics},
volume = {12},
number = {9},
pages = {S2},
issn = {1471-2105},
doi = {10.1186/1471-2105-12-S9-S2},
url = {https://doi.org/10.1186/1471-2105-12-S9-S2},
urldate = {2024-04-09},
abstract = {Tandemly Arrayed Gene (TAG) clusters are groups of paralogous genes that are found adjacent on a chromosome. TAGs represent an important repertoire of genes in eukaryotes. In addition to tandem duplication events, TAG clusters are affected during their evolution by other mechanisms, such as inversion and deletion events, that affect the order and orientation of genes. The DILTAG algorithm developed in [1] makes it possible to infer a set of optimal evolutionary histories explaining the evolution of a single TAG cluster, from an ancestral single gene, through tandem duplications (simple or multiple, direct or inverted), deletions and inversion events.},
keywords = {Gene Order,Gene Tree,Internal Vertex,Inversion Event,Tandem Duplication}
}
@article{vandongenGraphClusteringDiscrete2008a, @article{vandongenGraphClusteringDiscrete2008a,
title = {Graph {{Clustering Via}} a {{Discrete Uncoupling Process}}}, title = {Graph {{Clustering Via}} a {{Discrete Uncoupling Process}}},
author = {Van Dongen, Stijn}, author = {Van Dongen, Stijn},
@ -1136,58 +787,6 @@
langid = {english} langid = {english}
} }
@incollection{vandongenUsingMCLExtract2012,
ids = {vandongenUsingMCLExtract2012a},
title = {Using {{MCL}} to {{Extract Clusters}} from {{Networks}}},
booktitle = {Bacterial {{Molecular Networks}}},
author = {Van Dongen, Stijn and Abreu-Goodger, Cei},
editor = {Van Helden, Jacques and Toussaint, Ariane and Thieffry, Denis},
date = {2012},
volume = {804},
pages = {281--295},
publisher = {Springer New York},
location = {New York, NY},
doi = {10.1007/978-1-61779-361-5_15},
url = {http://link.springer.com/10.1007/978-1-61779-361-5_15},
urldate = {2024-04-11},
isbn = {978-1-61779-360-8 978-1-61779-361-5},
langid = {english}
}
@article{vizuetaBitacoraComprehensiveTool2020,
title = {Bitacora: {{A}} Comprehensive Tool for the Identification and Annotation of Gene Families in Genome Assemblies},
shorttitle = {Bitacora},
author = {Vizueta, Joel and Sánchez-Gracia, Alejandro and Rozas, Julio},
date = {2020},
journaltitle = {Molecular Ecology Resources},
volume = {20},
number = {5},
pages = {1445--1452},
issn = {1755-0998},
doi = {10.1111/1755-0998.13202},
url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/1755-0998.13202},
urldate = {2024-04-09},
abstract = {Gene annotation is a critical bottleneck in genomic research, especially for the comprehensive study of very large gene families in the genomes of nonmodel organisms. Despite the recent progress in automatic methods, state-of-the-art tools used for this task often produce inaccurate annotations, such as fused, chimeric, partial or even completely absent gene models for many family copies, errors that require considerable extra efforts to be corrected. Here we present bitacora, a bioinformatics solution that integrates popular sequence similarity-based search tools and Perl scripts to facilitate both the curation of these inaccurate annotations and the identification of previously undetected gene family copies directly in genomic DNA sequences. We tested the performance of bitacora in annotating the members of two chemosensory gene families with different repertoire size in seven available genome sequences, and compared its performance with that of augustus-ppx, a tool also designed to improve automatic annotations using a sequence similarity-based approach. Despite the relatively high fragmentation of some of these drafts, bitacora was able to improve the annotation of many members of these families and detected thousands of new chemoreceptors encoded in genome sequences. The program creates general feature format (GFF) files, with both curated and newly identified gene models, and FASTA files with the predicted proteins. These outputs can be easily integrated in genomic annotation editors, greatly facilitating subsequent manual annotation and downstream evolutionary analyses.},
langid = {english},
keywords = {bioinfomatics/phyloinfomatics,gene families,gene structure and function,genomics,molecular evolution,structural annotation,transcriptomics}
}
@article{wolfeRobustnessItNot2000, @article{wolfeRobustnessItNot2000,
title = {Robustness—It's Not Where You Think It Is}, title = {Robustness—It's Not Where You Think It Is},
author = {Wolfe, Ken}, author = {Wolfe, Ken},

View File

@ -18,8 +18,6 @@
| FTAGFinder | FTAG Finder | Families and Tandemly Arrayed Genes Finder | | FTAGFinder | FTAG Finder | Families and Tandemly Arrayed Genes Finder |
| WGD | WGD | Whole Genome Duplication | | WGD | WGD | Whole Genome Duplication |
| MCL | MCL | Markov Clustering | | MCL | MCL | Markov Clustering |
| BLAST | BLAST | Basic Local Alignment Search Tool |
| GO | GO | Gene Ontology |
#+name: glossary #+name: glossary
| label | name | description | | label | name | description |
@ -35,7 +33,6 @@
| segment_duplication | segment duplication | DNA sequences present in multiple locations within a genome that share a high level of sequence identity | | segment_duplication | segment duplication | DNA sequences present in multiple locations within a genome that share a high level of sequence identity |
| subfunctionalization | subfunctionalization | Fate of a duplicate gene which gets a part of the original gene function, the function being shared among multiple duplicates | | subfunctionalization | subfunctionalization | Fate of a duplicate gene which gets a part of the original gene function, the function being shared among multiple duplicates |
| orthologues | orthologues | Homologous genes whose divergence started at a speciation event | | orthologues | orthologues | Homologous genes whose divergence started at a speciation event |
| neofunctionalization | neofunctionalization | Acquisition of a new function by the duplicate gene |
#+begin_export latex #+begin_export latex
\makeatletter \makeatletter
@ -66,7 +63,7 @@
#+end_export #+end_export
* Scientific context * Scientific context
\lettrine{D}uplicate genes represent an important fraction of eukaryotic genes: it is estimated that between 46% and 65.5% of human genes could be considered as duplicate[fn:: The estimates vary strongly depending on the criteria in use, because ancient duplication events may be hard to detect.] [cite:@correaTransposableElementEnvironment2021]. It is estimated that between 46% and 65.5% of human genes could be considered as duplicate genes\footnote{The estimates vary strongly depending on the criteria in use} [cite:@correaTransposableElementEnvironment2021].
Duplicate genes offer a pool of genetic material available for further experimentation during species evolution. Duplicate genes offer a pool of genetic material available for further experimentation during species evolution.
** Gene duplication mechanisms ** Gene duplication mechanisms
@ -98,9 +95,9 @@ Multiple mechanisms may lead to a gene duplication. Their effect ranges from the
During an event of gls:WGD, the entire set of genes present on the chromosomes is duplicated ([[cref:fig:gene-duplication-mechanisms]] (A)). During an event of gls:WGD, the entire set of genes present on the chromosomes is duplicated ([[cref:fig:gene-duplication-mechanisms]] (A)).
gls:WGD can occur thanks to gls:polyspermy or in case of a non-reduced gamete. gls:WGD can occur thanks to gls:polyspermy or in case of a non-reduced gamete.
Gls:polyploidisation is a mechanism leading to a species with at least three copies of an initial genome. Gls:polyploidisation is a mechanism leading to a species with at least three copies of an initial genome.
A striking example is probably /Triticum aestivum/ (wheat) which is hexaploid due to hybridisation events [cite:@golovninaMolecularPhylogenyGenus2007a]. A striking example is probably /Triticum aestivum/ (wheat) which is hexaploid[fn:hexaploid: A hexaploid cell has six sets of chromosomes.] due to several hybridisation events [cite:@golovninaMolecularPhylogenyGenus2007a].
We distinguish two kinds of glspl:polyploidisation, based on the origin of the duplicate genome: (i) Gls:allopolyploidisation occurs when the supplementary chromosomes come from a divergent species. This is the case for the /Triticum aestivum/ hybridisation, which consisted in the union of the chromosome set of a /Triticum/ species with that of an /Aegilops/ species. (ii) Gls:autopolyploidisation consists in the hybridisation or duplication of the whole genome within the same species. We distinguish two kinds of glspl:polyploidisation, based on the origin of the duplicate genome: (i) Gls:allopolyploidisation occurs when the supplementary chromosomes come from a divergent species. This is the case for /Triticum aestivum/ hybridisation, which consisted in the union of the chromosome set of a /Triticum/ species with those of an /Aegilops/ species. (ii) Gls:autopolyploidisation consists in the hybridisation or duplication of the whole genome within the same species.
*** Unequal crossing-over *** Unequal crossing-over
Another source of gene duplication relies on unequal crossing-over. During cell division, a crossing-over occurs when two chromatids exchange fragments of chromosome. If the cleavage of the two chromatids occurs at different positions, the shared fragments may have different lengths. Homologous recombination of such uneven crossing-over leads to the incorporation of a duplicate region, as depicted in cref:fig:gene-duplication-mechanisms (B, C). Another source of gene duplication relies on unequal crossing-over. During cell division, a crossing-over occurs when two chromatids exchange fragments of chromosome. If the cleavage of the two chromatids occurs at different positions, the shared fragments may have different lengths. Homologous recombination of such uneven crossing-over leads to the incorporation of a duplicate region, as depicted in cref:fig:gene-duplication-mechanisms (B, C).
@ -123,48 +120,28 @@ A typical DNA transposon contains a transposase gene. This enzyme recognizes two
Finally, glspl:segment_duplication, also called /low copy repeats/ are long stretches of DNA with high identity score ([[cref:fig:gene-duplication-mechanisms]] (F)). Their exact duplication mechanism remains unclear [cite:@lallemandOverviewDuplicatedGene2020]. They may come from an accidental replication, distinct from an uneven cross-over or a double stranded breakage. Finally, glspl:segment_duplication, also called /low copy repeats/ are long stretches of DNA with high identity score ([[cref:fig:gene-duplication-mechanisms]] (F)). Their exact duplication mechanism remains unclear [cite:@lallemandOverviewDuplicatedGene2020]. They may come from an accidental replication, distinct from an uneven cross-over or a double stranded breakage.
Transposable elements may well be involved in the mechanism, as a high enrichment of transposable elements is found next to duplicate segment extremities, in /Drosophila/ [cite:@lallemandOverviewDuplicatedGene2020]. Transposable elements may well be involved in the mechanism, as a high enrichment of transposable elements is found next to duplicate segment extremities, in /Drosophila/ [cite:@lallemandOverviewDuplicatedGene2020].
#+begin_export latex
\fladdfig{
\includegraphics[width=.9\linewidth]{figures/Evolution_fate_duplicate_genes.pdf}
\caption[Fate of duplicate genes]{\label{fig:fate-duplicate-genes} Fate of duplicate genes. An original gene with four functions is duplicated. Its two copies may both keep the original functions (functional redundancy). The original functions may split between the different copies (subfunctionalization). One of the copies may acquire a new function (neofunctionalization). It may also degenerate and lose its original functions (pseudogenization).
Adapted from \href{https://commons.wikimedia.org/wiki/File:Evolution_fate_duplicate_genes_-_vector.svg}{Smedlib}, \href{https://creativecommons.org/licenses/by-sa/4.0}{CC BY-SA 4.0}, via Wikimedia Commons}
}
#+end_export
** Fate of duplicate genes in genome evolution ** Fate of duplicate genes in genome evolution
In his book /Evolution by Gene Duplication/, Susumu [[latex:textsc][Ohno]] proposed that gene duplication plays a major role in species evolution [cite:@ohnoEvolutionGeneDuplication1970], because it provides new genetic materials to build on new phenotypes while keeping a backup gene for the previous function. In his book /Evolution by Gene Duplication/, Susumu [[latex:textsc][Ohno]] proposed that gene duplication plays a major role in species evolution [cite:@ohnoEvolutionGeneDuplication1970], because it provides new genetic materials to build on new phenotypes while keeping a backup gene for the previous function.
Indeed, duplicate genes evolve after duplication: they may be inactivated, and become glspl:pseudogene; they may be deleted or conserved, and if conserved, they may or may not acquire a new function. Indeed, duplicate genes may evolve after duplication: they may be inactivated, becoming glspl:pseudogene; they may be deleted or conserved and so, they may acquire new functions.
[[Cref:fig:fate-duplicate-genes]] depicts the different possible fates of a duplicate gene.
# *** Pseudogenization *** Pseudogenization
As genome evolves, duplicate genes may be inactivated and become pseudogenes. These pseudogenes keep a gene-like structure which degrades as and when further genome modifications occur but they are no longer expressed. Duplicate genes may be inactivated and become pseudogenes. These pseudogenes keep a gene-like structure, which degrades as and when further genome modifications occur. However, they are no longer expressed.
*** Neofunctionalization
# *** Neofunctionalization Duplicate genes may be conserved and gain a new function.
After duplication, the new gene copy may gain a new function. We call this possible outcome gls:neofunctionalization. For instance, the current set of olfactory receptor genes results from several duplication and deletion events (in /Drosophila/: [cite/t:@nozawaEvolutionaryDynamicsOlfactory2007]), after which the duplicate olfactory genes specialized in the detection of particular chemical compounds.
For instance, the current set of olfactory receptor genes results from several duplication and deletion events (for /Drosophila/, see: [cite/t:@nozawaEvolutionaryDynamicsOlfactory2007]), after which each duplicate olfactory gene specialized in the detection of a particular chemical compound. *** Subfunctionalization
Two duplicate genes with the same original function may encounter a gls:subfunctionalization by which each gene conserves only one part of the function.
# *** Subfunctionalization *** Functional redundancy
Two duplicate genes with the same original function may encounter a gls:subfunctionalization: each gene conserves only one part of the function. The two gene copies may keep the ancestral function: in this case the quantity of gene product may increase.
# *** Functional redundancy
Another possibility is that the two gene copies keep the ancestral function, resulting in a functional redundancy. In this case the quantity of gene product may increase.
** Methods to identify duplicate genes ** Methods to identify duplicate genes
#+begin_export latex [[latex:textsc][Lallemand]] et al. review the different methods used to detect duplicate genes. These methods depend on the type of duplicate genes they target and vary on computation burden as well as ease of use [cite:@lallemandOverviewDuplicatedGene2020].
\fladdfig{
\includegraphics[width=.9\linewidth]{./figures/tag-definition.pdf}
\caption[Schematic representation of TAG definitions]{\label{fig:tag-definitions} Schematic representation of TAG definitions. Several genes are represented on a linear chromosome. The red box represents a singleton gene. Orange boxes represent a TAG with two duplicate genes separated by 7 other genes ($\mathrm{TAG}_7$). Four green boxes constitute a TAG, the genes at the extremities are separated by three genes ($\mathrm{TAG}_3$). The two blue boxes represent a TAG with two genes next to each other ($\mathrm{TAG}_0$). The curved edges represent the homology links between each pair of genes within a TAG.}}
#+end_export
Different methods exist to detect duplicate genes. These methods depend on the type of duplicate genes they target and vary in computational burden as well as in ease of use (for a review, see [cite/t:@lallemandOverviewDuplicatedGene2020]).
*** Paralog detection *** Paralog detection
Paralogs are homologous genes derived from a duplication event. We can identify them as homologous genes coming from the same genome, or as homologous genes between different species once we filtered out gls:orthologues (homologous genes derived from a speciation event). Paralogs are homologous genes derived from a duplication event. We can identify them as homologous genes coming from the same genome, or as homologous genes between different species once we filtered out gls:orthologues (homologous genes derived from a speciation event).
We can use two gene characteristics to assess the homology between two genes: gene structure or sequence similarity. We can use two gene characteristics to assess the homology between two genes: gene structure or sequence similarity.
The sequence similarity can be tested with a sequence alignment tool, such as =BLAST= [cite:@altschulBasicLocalAlignment1990], =Psi-BLAST=, and =HMMER3= [cite:@johnsonHiddenMarkovModel2010], or =diamond= [cite:@buchfinkSensitiveProteinAlignments2021], which are heuristic algorithms, which means they may not provide the best results, but do so way faster than exact algorithms, such as the classical Smith and Waterman algorithm [cite:@smithIdentificationCommonMolecular1981] or its optimized versions =PARALIGN= [cite:@rognesParAlignParallelSequence2001] or =SWIMM=. The sequence similarity can be tested with a sequence alignment tool, such as =BLAST= [cite:@altschulBasicLocalAlignment1990], =Psi-BLAST=, and =HMMER3= [cite:@johnsonHiddenMarkovModel2010], or =diamond= [cite:@buchfinkSensitiveProteinAlignments2021], which are heuristic algorithms, which means they may not provide the best results, but do so way faster than exact algorithms, such as the classical Smith and Waterman algorithm [cite:@smithIdentificationCommonMolecular1981] or its optimized versions =PARALIGN= [cite:@rognesParAlignParallelSequence2001] or =SWIMM=.
This is the case for Triticum aestivum hybridisation, which consisted in the union of the
chromosome set of a Triticum species with those of an Aegilops species
*** FTAG Finder *** FTAG Finder
Developed in the LaMME laboratory, the FTAG Finder (Families and Tandemly Arrayed Genes Finder) pipeline is a simple pipeline targeting the detection of gls:TAG from the proteome of single species [cite:@bouillonFTAGFinderOutil2016]. Developed in the LaMME laboratory, the FTAG Finder (Families and Tandemly Arrayed Genes Finder) pipeline is a simple pipeline targeting the detection of gls:TAG from the proteome of single species [cite:@bouillonFTAGFinderOutil2016].
@ -177,10 +154,15 @@ Several =BLAST= metrics can be used as an homology measure, such as bitscore, id
**** Identification of gene families **** Identification of gene families
Based on the homology links between each pair of genes, we construct an undirected weighted graph whose vertices correspond to genes and edges to homology links between them. Based on the homology links between each pair of genes, we construct an undirected weighted graph whose vertices correspond to genes and edges to homology links between them.
We apply a graph clustering algorithm on the graph in order to infer the gene families corresponding to densely connected communities of vertices. We apply a graph clustering algorithm on the graph in order to infer the gene families corresponding to densely connected communities of vertices.
#+begin_export latex
\fladdfig{
\includegraphics[width=.9\linewidth]{./figures/tag-definition.pdf}
\caption[Schematic representation of TAG definitions]{\label{fig:tag-definitions} Schematic representation of TAG definitions. Several genes are represented on a linear chromosome. The red box represents a singleton gene. Orange boxes represent a TAG with two duplicate genes separated by 7 other genes ($\mathrm{TAG}_7$). Four green boxes constitute a TAG, the genes at the extremities are separated by three genes ($\mathrm{TAG}_3$). The two blue boxes represent a TAG with two genes next to each other ($\mathrm{TAG}_0$). The curved edges represent the homology links between each pair of genes of a TAG.}}
#+end_export
FTAG Finder proposes three clustering algorithm alternatives: single linkage, Markov Clustering [cite:@vandongenNewClusterAlgorithm1998] or Walktrap [cite:@ponsComputingCommunitiesLarge2005]. FTAG Finder proposes three clustering algorithm alternatives: single linkage, Markov Clustering [cite:@vandongenNewClusterAlgorithm1998] or Walktrap [cite:@ponsComputingCommunitiesLarge2005].
**** Detection of TAGs **** Detection of TAGs
The final step of FTAG Finder consists in the identification of gls:TAG from the gene families and the positions of genes. The final step of FTAG Finder consists in the identification of gls:TAG from the gene families and the positions of genes.
For a given chromosome, the tool seeks genes belonging to the same family and located close to each other. The tool allows a maximal number of genes between the homologous genes, with a parameter set by the user. Cref:fig:tag-definitions is a schematic representation of some possible gls:TAG positioning on a genome associated with their definition in FTAG Finder /Find Tags/ step. For a given chromosome, the tool seeks genes belonging to the same family and located close to each other. The tool allows a maximal number of genes between the homologous genes, with a parameter set by the user. Cref:fig:tag-definitions is a schematic representation of some possible gls:TAG positioning on a genome associated with their definition in FTAG Finder /Find Tags/ step.
@ -188,55 +170,15 @@ For a given chromosome, the tool seeks genes belonging to the same family and lo
* Objectives for the internship * Objectives for the internship
** Scientific questions ** Scientific questions
The underlying question of FTAG Finder is the study of the evolutionary fate of duplicate genes in Eukaryotes. The underlying question of FTAG Finder is the study of the evolutionary fate of duplicate genes in Eukaryotes.
Duplicate genes are
** Extend the existing FTAG Finder Galaxy pipeline ** Extend the existing FTAG Finder Galaxy pipeline
Galaxy is a web-based platform for running accessible data analysis pipelines, first designed for use in genomics data analysis [cite:@goecksGalaxyComprehensiveApproach2010]. Galaxy is a web-based platform for running accessible data analysis pipelines, first designed for use in genomics data analysis [cite:@goecksGalaxyComprehensiveApproach2010].
Last year, Séanna [[latex:textsc][Charles]] worked on the Galaxy version of the FTAG Finder pipeline during her M1 internship [cite:@charlesFinalisationPipelineFTAG2023]. I will continue this work. Last year, Séanna [[latex:textsc][Charles]] worked on the Galaxy version of the FTAG Finder pipeline during her M1 internship [cite:@charlesFinalisationPipelineFTAG2023]. I will continue this work.
FTAG Finder is currently deployed on the server of the /Laboratoire d'Analyse et Modélisation d'Évry/[fn: [[http://stat.genopole.cnrs.fr/galaxy]] ].
** Port FTAG Finder pipeline on a workflow manager ** Port FTAG Finder pipeline on a workflow manager
Another objective of my internship will be to port FTAG Finder on a workflow manager better suited to larger and more reproducible analysis. Another objective of my internship will be to port FTAG Finder on a workflow manager better suited to larger and more reproducible analysis.
We will have to make a choice for the tool we will use. We will have to make a choice for the tool we will use.
The two main options are Snakemake and Nextflow. Snakemake is a python powered workflow manager based on rules /à la/ GNU Make [cite:@kosterSnakemakeScalableBioinformatics2012]. Nextflow is a groovy powered workflow manager, which relies on the data flows paradigm [cite:@ditommasoNextflowEnablesReproducible2017]. Both are widely used in the bioinformatics community. Their use has been on the rise since they came out in 2012 and 2013 respectively [cite:@djaffardjyDevelopingReusingBioinformatics2023]. The two main options are Snakemake and Nextflow. Snakemake is a python powered workflow manager based on rules /à la/ GNU Make [cite:@kosterSnakemakeScalableBioinformatics2012]. Nextflow is a groovy powered workflow manager, which relies on the data flows paradigm [cite:@ditommasoNextflowEnablesReproducible2017]. Both are widely used in the bioinformatics community, and their use has been on the rise since they came out in 2012 and 2013 respectively [cite:@djaffardjyDevelopingReusingBioinformatics2023].
#+begin_export latex
\fladdtab{
\begin{tabular}{ccc}
\toprule
& List ref & List $L$ \\
\midrule
related to $go$ & $a$ & $b$ \\
unrelated & $c$ & $d$ \\
\bottomrule
\end{tabular}
\caption{\label{tab:fisher-test-contigency-table}Contingency table for a Fisher exact test on gene lists}
}
#+end_export
* Methodological approaches
Based on the output of the FTAG Finder pipeline, which consists of lists of genes, researchers could perform bespoke subsequent analyses on TAGs.
** Analysis of over-represented gene functions among TAGs
The gls:GO describes biological concepts across three main classes: Cellular Component, Molecular Function and Biological Process. It describes a controlled vocabulary of concepts and the relationship between them. The genes with known functions can be associated with a particular GO term. We can perform a GO enrichment analysis to assess whether a particular GO term is over-represented in a particular gene list, compared to another. We can use a Fisher exact test, using the FDR (False Discovery Rate) control procedure of [[latex:textsc][Benjamini]] and [[latex:textsc][Hochberg]] to do so.
Let $go$ be a GO term. We construct a contingency matrix based on the count of genes associated with this GO term (or associated with one of its brother GO term) for the reference gene list and the list of interest (here, the list of genes in a TAG) (see cref:tab:fisher-test-contigency-table).
** Are TAG located preferentially on specific chromosome region?
** Are there chromosomes enriched or depleted in TAG?
** Do genes located next to each other in a TAG share the same orientation?
The concordance of two genes of a TAG falls in three possible cases: either both genes are on the same strand (\(\rightarrow \rightarrow\)), or they have a divergent orientation (\(\leftarrow \rightarrow\)), or a convergent one (\(\rightarrow \leftarrow\)). Graham conjectured that genes of a TAG that are close to each other would be more likely to share the same orientation, and this indeed seems to be the case [cite:@shojaRoadmapTandemlyArrayed2006].
# To test this, we can use a $\Chi^2$ test of goodness of fit or a Student $t$-test.
*** TODO write down the hypotheses
** What is the robustness and accuracy of the detection method?
[cite/t:@le-hoangEtudeTranscriptomiqueGenes2017] started analyses of the impact of parameter choice on FTAG Finder output lists. A more detailed benchmark of FTAG Finder in comparison with other methods on some known test dataset might be of particular interest.
#+begin_export latex #+begin_export latex
\flstop \flstop
@ -262,18 +204,6 @@ The concordance of two genes of a TAG falls in three possible cases: either both
:UNNUMBERED: t :UNNUMBERED: t
:END: :END:
Duplicate genes are an important feature of Eukaryotic genomes. They contribute to the plasticity of the genome, hence to the capacity of species to evolve.
Several mechanisms may lead to gene duplication. Among them, an unequal crossing-over leads to the formation of Tandemly Arrayed Genes (TAG) corresponding to homologous genes located one set after the other on the same chromosome.
There are multiple methods for detecting duplicate genes from sequences. These methods vary in terms of the particular gene duplication mechanism they target, computational efficiency and ease of use.
FTAG Finder is a simple Galaxy pipeline aiming at the detection of families of duplicate genes and the identification of TAG based on the proteome of a single species. FTAG Finder is developed in the /Laboratoire de Mathématiques et Modélisation d'Évry/, where I will do my internship.
On the one hand, the aim of my internship is to extend the current Galaxy implementation of FTAG Finder with new export lists best suited to the analysis requirements of the laboratory. On the other hand, the objective of my internship will be to port the Galaxy pipeline on another scientific workflow manager better suited to reproducible analyses such as Snakemake and Nextflow.
Then, the updated version of the FTAG Finder pipeline will be used to perform an analysis on the TAG of a model species, to assess its proper behavior. A benchmark of the pipeline will probably be run to compare the FTAG Finder with alternative published methods targeting duplicate genes and TAG in particular.
* Bean :noexport: * Bean :noexport:
** MCL ** MCL
MCL uses two operations on a stochastic matrix representation $M$ of the graph first derived from the adjacency matrix, namely /expansion/ and /inflation/. Expansion consists in elevating the matrix to a power $r$, and subsequently scaling its columns so that they sum to 1 again. The image of the inflation operator $\Gamma_r$ is defined as MCL uses two operations on a stochastic matrix representation $M$ of the graph first derived from the adjacency matrix, namely /expansion/ and /inflation/. Expansion consists in elevating the matrix to a power $r$, and subsequently scaling its columns so that they sum to 1 again. The image of the inflation operator $\Gamma_r$ is defined as
@ -282,12 +212,18 @@ MCL uses two operations on a stochastic matrix representation $M$ of the graph f
\] \]
where $m$ is the number of rows in the matrix, and $M_{pq}$ is the value in the $p, q$ cell of the matrix $M$. where $m$ is the number of rows in the matrix, and $M_{pq}$ is the value in the $p, q$ cell of the matrix $M$.
This operator strengthens the edges with higher weights and tends to annihilate edges with lower flow. This operator strengthens the edges with higher weights and tends to annihilate edges with lower flow.
Applying both operators iteratively eventually yields a partition of the initial graph's vertices into clusters of closely connected nodes (corresponding, in our case, to gene families). Applying both operators iteratively eventually yields a partition of the initial graph's vertices into clusters of closely connected nodes (corresponding, in our case, to gene families).
** Walktrap ** Walktrap
Principle: construct vertex communities based on where an agent would get stuck in a random walk. Principle: construct vertex communities based on where an agent would get stuck in a random walk.
# LocalWords: speciation Subfunctionalization Neofunctionalization
# LocalWords: Pseudogenization
# Local Variables:
# eval: (progn (org-babel-goto-named-src-block "startup") (org-babel-execute-src-block) (outline-hide-sublevels 1))
# End:
* Setup :noexport: * Setup :noexport:
#+name: startup #+name: startup
@ -297,12 +233,3 @@ Principle: construct vertex communities based on where an agent would get stuck
#+RESULTS: startup #+RESULTS: startup
: Loaded ./setup.el : Loaded ./setup.el
#+begin_example
# LocalWords: speciation subfunctionalization neofunctionalization
# LocalWords: pseudogenization bioinformatics
# Local Variables:
# eval: (progn (org-babel-goto-named-src-block "startup") (org-babel-execute-src-block) (outline-hide-sublevels 1))
# End:
#+end_example

BIN
report.pdf (Stored with Git LFS)

Binary file not shown.

View File

@ -1,97 +0,0 @@
% Page-style setup for the custom footer, built on KOMA-Script's
% scrlayer-scrpage package (loaded here with the `manualmark` option).
\RequirePackage[manualmark]{scrlayer-scrpage}
% Disabled block (\iffalse ... \fi): chapter-level running mark and the
% table-of-contents head mark. Nothing between \iffalse and \fi is executed.
\iffalse
\renewcommand*\chaptermark[1]{%
\markboth{\Ifnumbered{chapter}{\chaptermarkformat}{}}{#1}% <- outdated macro replaced
}
\AfterTOCHead[toc]{\markboth{}{\contentsname}}
\fi
% Empty every header/footer slot of the page-style pair before refilling them.
\clearpairofpagestyles
% Maximum penalties for club and widow lines.
\clubpenalty = 10000
\widowpenalty = 10000
% \automark[<right>]{<left>}: sections feed the right mark, parts the left mark.
\automark[section]{part}
\setlength{\footheight}{120pt} % avoids the scrlayer-scrpage
% "footheight too low" warning
\setlength{\footskip}{185pt} % BAD HACK that moves the foot downwards
\KOMAoption{footwidth}{foot:53pt} % BAD HACK that shifts the foot horizontally by 53pt (direction not stated here -- TODO confirm)
% Fonts: small upright text for the foot, bold black sans-serif for the page number.
\setkomafont{pagefoot}{\normalfont\footnotesize}
\setkomafont{pagenumber}{\normalfont \fontfamily{\sfdefault}\selectfont \normalsize \bfseries\color{black}}
% \partmark{<title>}: sets the left running mark to
%   "<\partname> <\thepart>" (bold, sans-serif, colour fgBlue), a \quad,
%   then the part title in a top-aligned minipage of width .65\textwidth.
% The right mark is set to empty (second argument of \markboth).
% The colour fgBlue is not defined in this file; it must be defined elsewhere.
\renewcommand{\partmark}[1]{%
\markboth{%
% NOTE(review): an inherited comment here mentioned \@chapapp/\chaptername
% ("Chapter A Appendix ...", issue #47); this code uses \partname, so that
% remark does not describe the current behaviour.
\fontfamily{\sfdefault}\selectfont
{\color{fgBlue}\textbf{\partname\ \thepart}}%
\quad%
\protect\begin{minipage}[t]{.65\textwidth}%
#1%
\protect\end{minipage}%
}{}%
}
% \lensectionnumber: scratch length holding the width of "<section number>\quad".
% \sectionmark{<title>}: sets the right running mark to the bold black
%   sans-serif section number, a \quad, and the section title, all inside a
%   top-aligned, ragged-left minipage of width .72\textwidth. \hangindent is
%   set to \lensectionnumber so wrapped title lines are indented by the
%   width of the number.
% NOTE(review): the \setlength and \settowidth lines do not end with `%`,
%   so their line ends are space tokens inside the mark -- confirm this is harmless.
\newlength{\lensectionnumber}
\renewcommand{\sectionmark}[1]{%
\markright{%
\normalsize\fontfamily{\sfdefault}\selectfont\bfseries
\setlength{\lensectionnumber}{0em}
\settowidth{\lensectionnumber}{\textbf{\thesection}\quad}
\protect\begin{minipage}[t]{.72\textwidth}%
{\ }% bad hack to prevent a wrong baseline for the minipage
\protect\raggedleft%
\hangindent=\lensectionnumber%
{\color{black}\textbf{\fontfamily{\sfdefault}\selectfont\thesection}}%
\quad%
#1%
\protect\end{minipage}%
}%
}
% \ctfooterline: black vertical rule placed next to the page number.
% The rule is 1.25pt wide and 100pt high, lowered by 90pt below the baseline.
% \color{black} is not enclosed in a group, so it stays in effect for
% whatever follows in the caller's current group.
\newcommand{\ctfooterline}{%
\color{black}\rule[-90pt]{1.25pt}{100pt}%
}
% Page number for odd (right) pages.
% Layout, left to right: footer rule, 10pt gap, then the page number
% (\pagemark followed by a control space) in a bottom-aligned 1.5cm minipage.
\newcommand{\ctfooterrightpagenumber}{%
\ctfooterline%
\hspace*{10pt}%
\begin{minipage}[b]{1.5cm}%
\pagemark\ %
\end{minipage}%
}
%% Page number for even (left) pages.
%% Mirror of \ctfooterrightpagenumber: a bottom-aligned 1.5cm minipage with
%% the page number set ragged-left, then a 10pt gap, then the footer rule.
\newcommand{\ctfooterleftpagenumber}{%
\begin{minipage}[b]{1.5cm}%
\raggedleft\pagemark%
\end{minipage}%
\hspace*{10pt}%
\ctfooterline%
}
%% Defines the content for header and footer.
%% Slot names: first letter l/c/r = left/centre/right, second letter
%% e/o = even/odd page. For the foot commands the optional argument is the
%% content for the `plain` page style and the mandatory argument the content
%% for `scrheadings`.
%% All six header slots are left empty.
\lehead{}
\cehead{}
\rehead{}
\lohead{}
\cohead{}
\rohead{}
%% Even pages: page number in the left foot. The running head (\headmark)
%% is commented out, so only the 0.75cm spacer follows the number.
\lefoot[% > plain
\ctfooterleftpagenumber%
]{% > scrheadings
\ctfooterleftpagenumber%
\hspace*{0.75cm}%
%\headmark%
}
\cefoot{}
\refoot{}
\lofoot{}
\cofoot{}
%% Odd pages: page number in the right foot, preceded by the 0.75cm spacer;
%% \headmark is likewise commented out.
\rofoot[% > plain
\ctfooterrightpagenumber%
]{% > scrheadings
%\headmark%
\hspace*{0.75cm}%
\ctfooterrightpagenumber%
}

View File

@ -15,13 +15,16 @@
\newcommand{\flstop}{ \newcommand{\flstop}{
\boolfalse{flcontinue} \boolfalse{flcontinue}
} }
\newcommand\@fladdfloat{\relax} \newcommand\@fladdfig{\relax}
\newcommand\fladdfloat[1]{\global\long\def\@fladdfloat{#1}} \newcommand\fladdfig[1]{\global\long\def\@fladdfig{#1}}
\newcommand\@flputfloat{\@fladdfloat\fladdfloat{\relax}} \newcommand\@flputfig{\@fladdfig\fladdfig{\relax}}
\newcommand\flblankpage{% \newcommand\flblankpage{%
\null \null
\vfill \vfill
\@flputfloat% \begin{figure}[H]
\centering
\@flputfig%
\end{figure}
\vfill \vfill
%\thispagestyle{empty}% %\thispagestyle{empty}%
\clearpage% \clearpage%
@ -31,39 +34,19 @@
} }
} }
\newcommand{\fladdfig}[1]{
\fladdfloat{
\begin{figure}[H]
\centering
#1
\end{figure}
}
}
\newcommand{\fladdtab}[1]{
\fladdfloat{
\begin{table}[H]
\centering
#1
\end{table}
}
}
\iffalse \iffalse
% Example % Example
% Add this at the beginning of the document (typically after the titlepage) % Add this at the beginning of the document (typically after the titlepage)
\flstart \flstart
% Then for each new figure you want to add, add the following % Then for each new figure you want to add, add the following
\fladdfloat{% \fladdfig{%
\begin{figure}[H] \includegraphics[scale=1]{Image1}
\includegraphics[scale=1]{Image1} \caption{Test}
\caption{Test} \label{Ima1}
\label{Ima1}
\end{figure}
} }
% At the end of the document, to avoid an unwanted late terminated loop of empty page creation, add the following % At the end of the document, to avoid an unwanted late terminated loop of empty page creation, add the following
\flstop \flstop
\fi \fi

View File

@ -1,4 +1,4 @@
\RequirePackage{lettrine}
% Font % Font
\usepackage{fontspec} \usepackage{fontspec}
@ -137,7 +137,122 @@
\fi \fi
} }
\usepackage{scrhack} \usepackage{scrhack}
% From S. Ivanov hdr preamble
\iffalse
\titleformat{\chapter}[frame]
{\itshape\color{primary}}
{\filright
\normalsize
\enspace Chapter \thechapter\enspace}
{10mm}
{\fontsize{35}{20}\selectfont\normalfont\bfseries\filright\hspace{1ex}}
\titleformat{\section}{\Large\normalfont\bfseries\color{primary}}{\thesection \hspace{1ex}}{1ex}{}
\titleformat{\subsection}{\large\normalfont\bfseries\color{primary}}{\thesubsection \hspace{1ex}}{1ex}{}
\titleformat{\subsubsection}{\normalsize\normalfont\bfseries\color{primary}}{}{1ex}{}
\usepackage{sty/cleanthesis-footer} \newcommand{\changelocaltocdepth}[1]{%
\usepackage{sty/scr-legrand-heading} \addtocontents{toc}{\protect\setcounter{tocdepth}{#1}}%
\setcounter{tocdepth}{#1}%
}
\fi
%
% \usepackage{sty/cleanthesis-extracts}
% Page-style setup for the custom footer, built on KOMA-Script's
% scrlayer-scrpage package (loaded here with the `manualmark` option).
\RequirePackage[manualmark]{scrlayer-scrpage}
% \chaptermark{<title>}: left mark is the chapter mark format when the
% chapter is numbered (empty otherwise); right mark is the chapter title.
\renewcommand*\chaptermark[1]{%
\markboth{\Ifnumbered{chapter}{\chaptermarkformat}{}}{#1}% <- outdated macro replaced
}
% After the table-of-contents heading: empty left mark, \contentsname as right mark.
\AfterTOCHead[toc]{\markboth{}{\contentsname}}
% Empty every header/footer slot of the page-style pair before refilling them.
\clearpairofpagestyles
% Maximum penalties for club and widow lines.
\clubpenalty = 10000
\widowpenalty = 10000
% \automark[<right>]{<left>}: sections feed the right mark, parts the left mark.
\automark[section]{part}
\setlength{\footheight}{120pt} % avoids the scrlayer-scrpage
% "footheight too low" warning
\setlength{\footskip}{185pt} % BAD HACK that moves the foot downwards
\KOMAoption{footwidth}{foot:53pt} % BAD HACK that shifts the foot horizontally by 53pt (direction not stated here -- TODO confirm)
% Fonts: small upright text for the foot, bold black sans-serif for the page number.
\setkomafont{pagefoot}{\normalfont\footnotesize}
\setkomafont{pagenumber}{\normalfont \fontfamily{\sfdefault}\selectfont \normalsize \bfseries\color{black}}
% \partmark{<title>}: sets the left running mark to
%   "<\partname> <\thepart>" (bold, sans-serif, colour fgBlue), a \quad,
%   then the part title in a top-aligned minipage of width .65\textwidth.
% The right mark is set to empty (second argument of \markboth).
% The colour fgBlue is not defined in this part of the file; it must be
% defined elsewhere.
\renewcommand{\partmark}[1]{%
\markboth{%
% NOTE(review): an inherited comment here mentioned \@chapapp/\chaptername
% ("Chapter A Appendix ...", issue #47); this code uses \partname, so that
% remark does not describe the current behaviour.
\fontfamily{\sfdefault}\selectfont
{\color{fgBlue}\textbf{\partname\ \thepart}}%
\quad%
\protect\begin{minipage}[t]{.65\textwidth}%
#1%
\protect\end{minipage}%
}{}%
}
% \lensectionnumber: scratch length holding the width of "<section number>\quad".
% \sectionmark{<title>}: sets the right running mark to the bold black
%   sans-serif section number, a \quad, and the section title, all inside a
%   top-aligned, ragged-left minipage of width .72\textwidth. \hangindent is
%   set to \lensectionnumber so wrapped title lines are indented by the
%   width of the number.
% NOTE(review): the \setlength and \settowidth lines do not end with `%`,
%   so their line ends are space tokens inside the mark -- confirm this is harmless.
\newlength{\lensectionnumber}
\renewcommand{\sectionmark}[1]{%
\markright{%
\normalsize\fontfamily{\sfdefault}\selectfont\bfseries
\setlength{\lensectionnumber}{0em}
\settowidth{\lensectionnumber}{\textbf{\thesection}\quad}
\protect\begin{minipage}[t]{.72\textwidth}%
{\ }% bad hack to prevent a wrong baseline for the minipage
\protect\raggedleft%
\hangindent=\lensectionnumber%
{\color{black}\textbf{\fontfamily{\sfdefault}\selectfont\thesection}}%
\quad%
#1%
\protect\end{minipage}%
}%
}
% \ctfooterline: black vertical rule placed next to the page number.
% The rule is 1.25pt wide and 100pt high, lowered by 90pt below the baseline.
% \color{black} is not enclosed in a group, so it stays in effect for
% whatever follows in the caller's current group.
\newcommand{\ctfooterline}{%
\color{black}\rule[-90pt]{1.25pt}{100pt}%
}
% Page number for odd (right) pages.
% Layout, left to right: footer rule, 10pt gap, then the page number
% (\pagemark followed by a control space) in a bottom-aligned 1.5cm minipage.
\newcommand{\ctfooterrightpagenumber}{%
\ctfooterline%
\hspace*{10pt}%
\begin{minipage}[b]{1.5cm}%
\pagemark\ %
\end{minipage}%
}
%% Page number for even (left) pages.
%% Mirror of \ctfooterrightpagenumber: a bottom-aligned 1.5cm minipage with
%% the page number set ragged-left, then a 10pt gap, then the footer rule.
\newcommand{\ctfooterleftpagenumber}{%
\begin{minipage}[b]{1.5cm}%
\raggedleft\pagemark%
\end{minipage}%
\hspace*{10pt}%
\ctfooterline%
}
%% Defines the content for header and footer.
%% Slot names: first letter l/c/r = left/centre/right, second letter
%% e/o = even/odd page. For the foot commands the optional argument is the
%% content for the `plain` page style and the mandatory argument the content
%% for `scrheadings`.
%% All six header slots are left empty.
\lehead{}
\cehead{}
\rehead{}
\lohead{}
\cohead{}
\rohead{}
%% Even pages: page number in the left foot. The running head (\headmark)
%% is commented out, so only the 0.75cm spacer follows the number.
\lefoot[% > plain
\ctfooterleftpagenumber%
]{% > scrheadings
\ctfooterleftpagenumber%
\hspace*{0.75cm}%
%\headmark%
}
\cefoot{}
\refoot{}
\lofoot{}
\cofoot{}
%% Odd pages: page number in the right foot, preceded by the 0.75cm spacer;
%% \headmark is likewise commented out.
\rofoot[% > plain
\ctfooterrightpagenumber%
]{% > scrheadings
%\headmark%
\hspace*{0.75cm}%
\ctfooterrightpagenumber%
}

View File

@ -1,7 +0,0 @@
% Colour used for heading numbers; currently an alias for black.
\colorlet{headingcolor}{black}
% Section, chapter and subsection numbers: each number is typeset in
% headingcolor inside \llap, i.e. sticking out to the left of the heading
% text, followed by a 1em gap.
\renewcommand*{\sectionformat}{\llap{\textcolor{headingcolor}{\thesection}\hspace{1em}}}
\renewcommand*{\chapterformat}{\llap{\textcolor{headingcolor}{\thechapter}\hspace{1em}}}
\renewcommand*{\subsectionformat}{\llap{\textcolor{headingcolor}{\thesubsection}\hspace{1em}}}